xref: /dpdk/drivers/common/mlx5/linux/mlx5_nl.c (revision daa02b5cddbb8e11b31d41e2bf7bb1ae64dcae2f)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright 2018 6WIND S.A.
3  * Copyright 2018 Mellanox Technologies, Ltd
4  */
5 
6 #include <errno.h>
7 #include <linux/if_link.h>
8 #include <linux/rtnetlink.h>
9 #include <linux/genetlink.h>
10 #include <net/if.h>
11 #include <rdma/rdma_netlink.h>
12 #include <stdbool.h>
13 #include <stdint.h>
14 #include <stdlib.h>
15 #include <stdalign.h>
16 #include <string.h>
17 #include <sys/socket.h>
18 #include <unistd.h>
19 
20 #include <rte_errno.h>
21 
22 #include "mlx5_nl.h"
23 #include "../mlx5_common_log.h"
24 #include "mlx5_malloc.h"
25 #ifdef HAVE_DEVLINK
26 #include <linux/devlink.h>
27 #endif
28 
29 
/*
 * Size of the buffer to receive kernel messages, in bytes.
 * NOTE(review): not referenced in the visible part of this file - confirm
 * it is still needed.
 */
#define MLX5_NL_BUF_SIZE (32 * 1024)
/*
 * Send buffer size for the Netlink socket, in bytes. mlx5_nl_init() compares
 * the size reported by the kernel (SO_SNDBUF) against this threshold.
 */
#define MLX5_SEND_BUF_SIZE 32768
/*
 * Receive buffer size for the Netlink socket, in bytes. mlx5_nl_init()
 * compares the size reported by the kernel (SO_RCVBUF) against this
 * threshold, and mlx5_nl_recv() never allocates a smaller message buffer.
 */
#define MLX5_RECV_BUF_SIZE 32768
/* Maximal physical port name length. */
#define MLX5_PHYS_PORT_NAME_MAX 128

/**
 * Parameters of VLAN devices created by driver.
 * NOTE(review): presumably the interface name prefix - confirm against users.
 */
#define MLX5_VMWA_VLAN_DEVICE_PFX "evmlx"
/*
 * Define NDA_RTA as defined in iproute2 sources.
 *
 * Yields a pointer to the first route attribute of a neighbour message:
 * the address of @p r advanced by the aligned size of struct ndmsg.
 *
 * see in iproute2 sources file include/libnetlink.h
 */
#ifndef MLX5_NDA_RTA
#define MLX5_NDA_RTA(r) \
	((struct rtattr *)(((char *)(r)) + NLMSG_ALIGN(sizeof(struct ndmsg))))
#endif
/*
 * Define NLMSG_TAIL as defined in iproute2 sources.
 *
 * Yields a pointer just past the current end of the message: the address of
 * @p nmsg advanced by its aligned nlmsg_len. Note that @p nmsg is evaluated
 * twice.
 *
 * see in iproute2 sources file include/libnetlink.h
 */
#ifndef NLMSG_TAIL
#define NLMSG_TAIL(nmsg) \
	((struct rtattr *)(((char *)(nmsg)) + NLMSG_ALIGN((nmsg)->nlmsg_len)))
#endif
59 /*
60  * The following definitions are normally found in rdma/rdma_netlink.h,
61  * however they are so recent that most systems do not expose them yet.
62  */
63 #ifndef HAVE_RDMA_NL_NLDEV
64 #define RDMA_NL_NLDEV 5
65 #endif
66 #ifndef HAVE_RDMA_NLDEV_CMD_GET
67 #define RDMA_NLDEV_CMD_GET 1
68 #endif
69 #ifndef HAVE_RDMA_NLDEV_CMD_PORT_GET
70 #define RDMA_NLDEV_CMD_PORT_GET 5
71 #endif
72 #ifndef HAVE_RDMA_NLDEV_ATTR_DEV_INDEX
73 #define RDMA_NLDEV_ATTR_DEV_INDEX 1
74 #endif
75 #ifndef HAVE_RDMA_NLDEV_ATTR_DEV_NAME
76 #define RDMA_NLDEV_ATTR_DEV_NAME 2
77 #endif
78 #ifndef HAVE_RDMA_NLDEV_ATTR_PORT_INDEX
79 #define RDMA_NLDEV_ATTR_PORT_INDEX 3
80 #endif
81 #ifndef HAVE_RDMA_NLDEV_ATTR_PORT_STATE
82 #define RDMA_NLDEV_ATTR_PORT_STATE 12
83 #endif
84 #ifndef HAVE_RDMA_NLDEV_ATTR_NDEV_INDEX
85 #define RDMA_NLDEV_ATTR_NDEV_INDEX 50
86 #endif
87 
88 /* These are normally found in linux/if_link.h. */
89 #ifndef HAVE_IFLA_NUM_VF
90 #define IFLA_NUM_VF 21
91 #endif
92 #ifndef HAVE_IFLA_EXT_MASK
93 #define IFLA_EXT_MASK 29
94 #endif
95 #ifndef HAVE_IFLA_PHYS_SWITCH_ID
96 #define IFLA_PHYS_SWITCH_ID 36
97 #endif
98 #ifndef HAVE_IFLA_PHYS_PORT_NAME
99 #define IFLA_PHYS_PORT_NAME 38
100 #endif
101 
/*
 * Some devlink definitions may be missing from older kernel headers;
 * provide fallback values for them.
 */
106 #ifndef DEVLINK_GENL_NAME
107 #define DEVLINK_GENL_NAME "devlink"
108 #endif
109 #ifndef DEVLINK_GENL_VERSION
110 #define DEVLINK_GENL_VERSION 1
111 #endif
112 #ifndef DEVLINK_ATTR_BUS_NAME
113 #define DEVLINK_ATTR_BUS_NAME 1
114 #endif
115 #ifndef DEVLINK_ATTR_DEV_NAME
116 #define DEVLINK_ATTR_DEV_NAME 2
117 #endif
118 #ifndef DEVLINK_ATTR_PARAM
119 #define DEVLINK_ATTR_PARAM 80
120 #endif
121 #ifndef DEVLINK_ATTR_PARAM_NAME
122 #define DEVLINK_ATTR_PARAM_NAME 81
123 #endif
124 #ifndef DEVLINK_ATTR_PARAM_TYPE
125 #define DEVLINK_ATTR_PARAM_TYPE 83
126 #endif
127 #ifndef DEVLINK_ATTR_PARAM_VALUES_LIST
128 #define DEVLINK_ATTR_PARAM_VALUES_LIST 84
129 #endif
130 #ifndef DEVLINK_ATTR_PARAM_VALUE
131 #define DEVLINK_ATTR_PARAM_VALUE 85
132 #endif
133 #ifndef DEVLINK_ATTR_PARAM_VALUE_DATA
134 #define DEVLINK_ATTR_PARAM_VALUE_DATA 86
135 #endif
136 #ifndef DEVLINK_ATTR_PARAM_VALUE_CMODE
137 #define DEVLINK_ATTR_PARAM_VALUE_CMODE 87
138 #endif
139 #ifndef DEVLINK_PARAM_CMODE_DRIVERINIT
140 #define DEVLINK_PARAM_CMODE_DRIVERINIT 1
141 #endif
142 #ifndef DEVLINK_CMD_RELOAD
143 #define DEVLINK_CMD_RELOAD 37
144 #endif
145 #ifndef DEVLINK_CMD_PARAM_GET
146 #define DEVLINK_CMD_PARAM_GET 38
147 #endif
148 #ifndef DEVLINK_CMD_PARAM_SET
149 #define DEVLINK_CMD_PARAM_SET 39
150 #endif
151 #ifndef NLA_FLAG
152 #define NLA_FLAG 6
153 #endif
154 
/*
 * Context handed to mlx5_nl_mac_addr_cb() while listing MAC addresses
 * through Netlink: the destination array and the number of entries stored
 * in it so far.
 */
struct mlx5_nl_mac_addr {
	struct rte_ether_addr (*mac)[];
	/**< MAC address handled by the device. */
	int mac_n; /**< Number of addresses in the array. */
};
161 
/*
 * Bits reported in mlx5_nl_port_info::flags; each one is raised by
 * mlx5_nl_cmdget_cb() when the matching attribute is found in a reply
 * (IB_NAME only when the reported name equals the requested one).
 */
#define MLX5_NL_CMD_GET_IB_NAME (1 << 0)
#define MLX5_NL_CMD_GET_IB_INDEX (1 << 1)
#define MLX5_NL_CMD_GET_NET_INDEX (1 << 2)
#define MLX5_NL_CMD_GET_PORT_INDEX (1 << 3)
#define MLX5_NL_CMD_GET_PORT_STATE (1 << 4)

/** Data structure used by mlx5_nl_cmdget_cb(). */
struct mlx5_nl_port_info {
	const char *name; /**< IB device name (in). */
	uint32_t flags; /**< found attribute flags (out). */
	uint32_t ibindex; /**< IB device index (out). */
	uint32_t ifindex; /**< Network interface index (out). */
	uint32_t portnum; /**< IB device max port number (out). */
	uint16_t state; /**< IB device port state (out). */
};
177 
/*
 * Counter backing the Netlink sequence numbers.
 * NOTE(review): has external linkage - confirm whether any other unit
 * references it before narrowing to static.
 */
uint32_t atomic_sn;

/*
 * Generate Netlink sequence number: atomically increment the counter
 * (relaxed ordering) and yield the incremented value.
 */
#define MLX5_NL_SN_GENERATE __atomic_add_fetch(&atomic_sn, 1, __ATOMIC_RELAXED)
182 
183 /**
184  * Opens a Netlink socket.
185  *
186  * @param protocol
187  *   Netlink protocol (e.g. NETLINK_ROUTE, NETLINK_RDMA).
188  *
189  * @return
190  *   A file descriptor on success, a negative errno value otherwise and
191  *   rte_errno is set.
192  */
193 int
194 mlx5_nl_init(int protocol)
195 {
196 	int fd;
197 	int buf_size;
198 	socklen_t opt_size;
199 	struct sockaddr_nl local = {
200 		.nl_family = AF_NETLINK,
201 	};
202 	int ret;
203 
204 	fd = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, protocol);
205 	if (fd == -1) {
206 		rte_errno = errno;
207 		return -rte_errno;
208 	}
209 	opt_size = sizeof(buf_size);
210 	ret = getsockopt(fd, SOL_SOCKET, SO_SNDBUF, &buf_size, &opt_size);
211 	if (ret == -1) {
212 		rte_errno = errno;
213 		goto error;
214 	}
215 	DRV_LOG(DEBUG, "Netlink socket send buffer: %d", buf_size);
216 	if (buf_size < MLX5_SEND_BUF_SIZE) {
217 		ret = setsockopt(fd, SOL_SOCKET, SO_SNDBUF,
218 				 &buf_size, sizeof(buf_size));
219 		if (ret == -1) {
220 			rte_errno = errno;
221 			goto error;
222 		}
223 	}
224 	opt_size = sizeof(buf_size);
225 	ret = getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &buf_size, &opt_size);
226 	if (ret == -1) {
227 		rte_errno = errno;
228 		goto error;
229 	}
230 	DRV_LOG(DEBUG, "Netlink socket recv buffer: %d", buf_size);
231 	if (buf_size < MLX5_RECV_BUF_SIZE) {
232 		ret = setsockopt(fd, SOL_SOCKET, SO_RCVBUF,
233 				 &buf_size, sizeof(buf_size));
234 		if (ret == -1) {
235 			rte_errno = errno;
236 			goto error;
237 		}
238 	}
239 	ret = bind(fd, (struct sockaddr *)&local, sizeof(local));
240 	if (ret == -1) {
241 		rte_errno = errno;
242 		goto error;
243 	}
244 	return fd;
245 error:
246 	close(fd);
247 	return -rte_errno;
248 }
249 
250 /**
251  * Send a request message to the kernel on the Netlink socket.
252  *
253  * @param[in] nlsk_fd
254  *   Netlink socket file descriptor.
255  * @param[in] nh
256  *   The Netlink message send to the kernel.
257  * @param[in] ssn
258  *   Sequence number.
259  * @param[in] req
260  *   Pointer to the request structure.
261  * @param[in] len
262  *   Length of the request in bytes.
263  *
264  * @return
265  *   The number of sent bytes on success, a negative errno value otherwise and
266  *   rte_errno is set.
267  */
268 static int
269 mlx5_nl_request(int nlsk_fd, struct nlmsghdr *nh, uint32_t sn, void *req,
270 		int len)
271 {
272 	struct sockaddr_nl sa = {
273 		.nl_family = AF_NETLINK,
274 	};
275 	struct iovec iov[2] = {
276 		{ .iov_base = nh, .iov_len = sizeof(*nh), },
277 		{ .iov_base = req, .iov_len = len, },
278 	};
279 	struct msghdr msg = {
280 		.msg_name = &sa,
281 		.msg_namelen = sizeof(sa),
282 		.msg_iov = iov,
283 		.msg_iovlen = 2,
284 	};
285 	int send_bytes;
286 
287 	nh->nlmsg_pid = 0; /* communication with the kernel uses pid 0 */
288 	nh->nlmsg_seq = sn;
289 	send_bytes = sendmsg(nlsk_fd, &msg, 0);
290 	if (send_bytes < 0) {
291 		rte_errno = errno;
292 		return -rte_errno;
293 	}
294 	return send_bytes;
295 }
296 
297 /**
298  * Send a message to the kernel on the Netlink socket.
299  *
300  * @param[in] nlsk_fd
301  *   The Netlink socket file descriptor used for communication.
302  * @param[in] nh
303  *   The Netlink message send to the kernel.
304  * @param[in] sn
305  *   Sequence number.
306  *
307  * @return
308  *   The number of sent bytes on success, a negative errno value otherwise and
309  *   rte_errno is set.
310  */
311 static int
312 mlx5_nl_send(int nlsk_fd, struct nlmsghdr *nh, uint32_t sn)
313 {
314 	struct sockaddr_nl sa = {
315 		.nl_family = AF_NETLINK,
316 	};
317 	struct iovec iov = {
318 		.iov_base = nh,
319 		.iov_len = nh->nlmsg_len,
320 	};
321 	struct msghdr msg = {
322 		.msg_name = &sa,
323 		.msg_namelen = sizeof(sa),
324 		.msg_iov = &iov,
325 		.msg_iovlen = 1,
326 	};
327 	int send_bytes;
328 
329 	nh->nlmsg_pid = 0; /* communication with the kernel uses pid 0 */
330 	nh->nlmsg_seq = sn;
331 	send_bytes = sendmsg(nlsk_fd, &msg, 0);
332 	if (send_bytes < 0) {
333 		rte_errno = errno;
334 		return -rte_errno;
335 	}
336 	return send_bytes;
337 }
338 
339 /**
340  * Receive a message from the kernel on the Netlink socket, following
341  * mlx5_nl_send().
342  *
343  * @param[in] nlsk_fd
344  *   The Netlink socket file descriptor used for communication.
345  * @param[in] sn
346  *   Sequence number.
347  * @param[in] cb
348  *   The callback function to call for each Netlink message received.
349  * @param[in, out] arg
350  *   Custom arguments for the callback.
351  *
352  * @return
353  *   0 on success, a negative errno value otherwise and rte_errno is set.
354  */
355 static int
356 mlx5_nl_recv(int nlsk_fd, uint32_t sn, int (*cb)(struct nlmsghdr *, void *arg),
357 	     void *arg)
358 {
359 	struct sockaddr_nl sa;
360 	struct iovec iov;
361 	struct msghdr msg = {
362 		.msg_name = &sa,
363 		.msg_namelen = sizeof(sa),
364 		.msg_iov = &iov,
365 		/* One message at a time */
366 		.msg_iovlen = 1,
367 	};
368 	void *buf = NULL;
369 	int multipart = 0;
370 	int ret = 0;
371 
372 	do {
373 		struct nlmsghdr *nh;
374 		int recv_bytes;
375 
376 		do {
377 			/* Query length of incoming message. */
378 			iov.iov_base = NULL;
379 			iov.iov_len = 0;
380 			recv_bytes = recvmsg(nlsk_fd, &msg,
381 					     MSG_PEEK | MSG_TRUNC);
382 			if (recv_bytes < 0) {
383 				rte_errno = errno;
384 				ret = -rte_errno;
385 				goto exit;
386 			}
387 			if (recv_bytes == 0) {
388 				rte_errno = ENODATA;
389 				ret = -rte_errno;
390 				goto exit;
391 			}
392 			/* Allocate buffer to fetch the message. */
393 			if (recv_bytes < MLX5_RECV_BUF_SIZE)
394 				recv_bytes = MLX5_RECV_BUF_SIZE;
395 			mlx5_free(buf);
396 			buf = mlx5_malloc(0, recv_bytes, 0, SOCKET_ID_ANY);
397 			if (!buf) {
398 				rte_errno = ENOMEM;
399 				ret = -rte_errno;
400 				goto exit;
401 			}
402 			/* Fetch the message. */
403 			iov.iov_base = buf;
404 			iov.iov_len = recv_bytes;
405 			recv_bytes = recvmsg(nlsk_fd, &msg, 0);
406 			if (recv_bytes == -1) {
407 				rte_errno = errno;
408 				ret = -rte_errno;
409 				goto exit;
410 			}
411 			nh = (struct nlmsghdr *)buf;
412 		} while (nh->nlmsg_seq != sn);
413 		for (;
414 		     NLMSG_OK(nh, (unsigned int)recv_bytes);
415 		     nh = NLMSG_NEXT(nh, recv_bytes)) {
416 			if (nh->nlmsg_type == NLMSG_ERROR) {
417 				struct nlmsgerr *err_data = NLMSG_DATA(nh);
418 
419 				if (err_data->error < 0) {
420 					rte_errno = -err_data->error;
421 					ret = -rte_errno;
422 					goto exit;
423 				}
424 				/* Ack message. */
425 				ret = 0;
426 				goto exit;
427 			}
428 			/* Multi-part msgs and their trailing DONE message. */
429 			if (nh->nlmsg_flags & NLM_F_MULTI) {
430 				if (nh->nlmsg_type == NLMSG_DONE) {
431 					ret =  0;
432 					goto exit;
433 				}
434 				multipart = 1;
435 			}
436 			if (cb) {
437 				ret = cb(nh, arg);
438 				if (ret < 0)
439 					goto exit;
440 			}
441 		}
442 	} while (multipart);
443 exit:
444 	mlx5_free(buf);
445 	return ret;
446 }
447 
448 /**
449  * Parse Netlink message to retrieve the bridge MAC address.
450  *
451  * @param nh
452  *   Pointer to Netlink Message Header.
453  * @param arg
454  *   PMD data register with this callback.
455  *
456  * @return
457  *   0 on success, a negative errno value otherwise and rte_errno is set.
458  */
459 static int
460 mlx5_nl_mac_addr_cb(struct nlmsghdr *nh, void *arg)
461 {
462 	struct mlx5_nl_mac_addr *data = arg;
463 	struct ndmsg *r = NLMSG_DATA(nh);
464 	struct rtattr *attribute;
465 	int len;
466 
467 	len = nh->nlmsg_len - NLMSG_LENGTH(sizeof(*r));
468 	for (attribute = MLX5_NDA_RTA(r);
469 	     RTA_OK(attribute, len);
470 	     attribute = RTA_NEXT(attribute, len)) {
471 		if (attribute->rta_type == NDA_LLADDR) {
472 			if (data->mac_n == MLX5_MAX_MAC_ADDRESSES) {
473 				DRV_LOG(WARNING,
474 					"not enough room to finalize the"
475 					" request");
476 				rte_errno = ENOMEM;
477 				return -rte_errno;
478 			}
479 #ifdef RTE_LIBRTE_MLX5_DEBUG
480 			char m[RTE_ETHER_ADDR_FMT_SIZE];
481 
482 			rte_ether_format_addr(m, RTE_ETHER_ADDR_FMT_SIZE,
483 					      RTA_DATA(attribute));
484 			DRV_LOG(DEBUG, "bridge MAC address %s", m);
485 #endif
486 			memcpy(&(*data->mac)[data->mac_n++],
487 			       RTA_DATA(attribute), RTE_ETHER_ADDR_LEN);
488 		}
489 	}
490 	return 0;
491 }
492 
493 /**
494  * Get bridge MAC addresses.
495  *
496  * @param[in] nlsk_fd
497  *   Netlink socket file descriptor.
498  * @param[in] iface_idx
499  *   Net device interface index.
500  * @param mac[out]
501  *   Pointer to the array table of MAC addresses to fill.
502  *   Its size should be of MLX5_MAX_MAC_ADDRESSES.
503  * @param mac_n[out]
504  *   Number of entries filled in MAC array.
505  *
506  * @return
507  *   0 on success, a negative errno value otherwise and rte_errno is set.
508  */
509 static int
510 mlx5_nl_mac_addr_list(int nlsk_fd, unsigned int iface_idx,
511 		      struct rte_ether_addr (*mac)[], int *mac_n)
512 {
513 	struct {
514 		struct nlmsghdr	hdr;
515 		struct ifinfomsg ifm;
516 	} req = {
517 		.hdr = {
518 			.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
519 			.nlmsg_type = RTM_GETNEIGH,
520 			.nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST,
521 		},
522 		.ifm = {
523 			.ifi_family = PF_BRIDGE,
524 			.ifi_index = iface_idx,
525 		},
526 	};
527 	struct mlx5_nl_mac_addr data = {
528 		.mac = mac,
529 		.mac_n = 0,
530 	};
531 	uint32_t sn = MLX5_NL_SN_GENERATE;
532 	int ret;
533 
534 	if (nlsk_fd == -1)
535 		return 0;
536 	ret = mlx5_nl_request(nlsk_fd, &req.hdr, sn, &req.ifm,
537 			      sizeof(struct ifinfomsg));
538 	if (ret < 0)
539 		goto error;
540 	ret = mlx5_nl_recv(nlsk_fd, sn, mlx5_nl_mac_addr_cb, &data);
541 	if (ret < 0)
542 		goto error;
543 	*mac_n = data.mac_n;
544 	return 0;
545 error:
546 	DRV_LOG(DEBUG, "Interface %u cannot retrieve MAC address list %s",
547 		iface_idx, strerror(rte_errno));
548 	return -rte_errno;
549 }
550 
551 /**
552  * Modify the MAC address neighbour table with Netlink.
553  *
554  * @param[in] nlsk_fd
555  *   Netlink socket file descriptor.
556  * @param[in] iface_idx
557  *   Net device interface index.
558  * @param mac
559  *   MAC address to consider.
560  * @param add
561  *   1 to add the MAC address, 0 to remove the MAC address.
562  *
563  * @return
564  *   0 on success, a negative errno value otherwise and rte_errno is set.
565  */
566 static int
567 mlx5_nl_mac_addr_modify(int nlsk_fd, unsigned int iface_idx,
568 			struct rte_ether_addr *mac, int add)
569 {
570 	struct {
571 		struct nlmsghdr hdr;
572 		struct ndmsg ndm;
573 		struct rtattr rta;
574 		uint8_t buffer[RTE_ETHER_ADDR_LEN];
575 	} req = {
576 		.hdr = {
577 			.nlmsg_len = NLMSG_LENGTH(sizeof(struct ndmsg)),
578 			.nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE |
579 				NLM_F_EXCL | NLM_F_ACK,
580 			.nlmsg_type = add ? RTM_NEWNEIGH : RTM_DELNEIGH,
581 		},
582 		.ndm = {
583 			.ndm_family = PF_BRIDGE,
584 			.ndm_state = NUD_NOARP | NUD_PERMANENT,
585 			.ndm_ifindex = iface_idx,
586 			.ndm_flags = NTF_SELF,
587 		},
588 		.rta = {
589 			.rta_type = NDA_LLADDR,
590 			.rta_len = RTA_LENGTH(RTE_ETHER_ADDR_LEN),
591 		},
592 	};
593 	uint32_t sn = MLX5_NL_SN_GENERATE;
594 	int ret;
595 
596 	if (nlsk_fd == -1)
597 		return 0;
598 	memcpy(RTA_DATA(&req.rta), mac, RTE_ETHER_ADDR_LEN);
599 	req.hdr.nlmsg_len = NLMSG_ALIGN(req.hdr.nlmsg_len) +
600 		RTA_ALIGN(req.rta.rta_len);
601 	ret = mlx5_nl_send(nlsk_fd, &req.hdr, sn);
602 	if (ret < 0)
603 		goto error;
604 	ret = mlx5_nl_recv(nlsk_fd, sn, NULL, NULL);
605 	if (ret < 0)
606 		goto error;
607 	return 0;
608 error:
609 #ifdef RTE_LIBRTE_MLX5_DEBUG
610 	{
611 		char m[RTE_ETHER_ADDR_FMT_SIZE];
612 
613 		rte_ether_format_addr(m, RTE_ETHER_ADDR_FMT_SIZE, mac);
614 		DRV_LOG(DEBUG,
615 			"Interface %u cannot %s MAC address %s %s",
616 			iface_idx,
617 			add ? "add" : "remove", m, strerror(rte_errno));
618 	}
619 #endif
620 	return -rte_errno;
621 }
622 
623 /**
624  * Modify the VF MAC address neighbour table with Netlink.
625  *
626  * @param[in] nlsk_fd
627  *   Netlink socket file descriptor.
628  * @param[in] iface_idx
629  *   Net device interface index.
630  * @param mac
631  *    MAC address to consider.
632  * @param vf_index
633  *    VF index.
634  *
635  * @return
636  *    0 on success, a negative errno value otherwise and rte_errno is set.
637  */
638 int
639 mlx5_nl_vf_mac_addr_modify(int nlsk_fd, unsigned int iface_idx,
640 			   struct rte_ether_addr *mac, int vf_index)
641 {
642 	int ret;
643 	struct {
644 		struct nlmsghdr hdr;
645 		struct ifinfomsg ifm;
646 		struct rtattr vf_list_rta;
647 		struct rtattr vf_info_rta;
648 		struct rtattr vf_mac_rta;
649 		struct ifla_vf_mac ivm;
650 	} req = {
651 		.hdr = {
652 			.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
653 			.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK,
654 			.nlmsg_type = RTM_BASE,
655 		},
656 		.ifm = {
657 			.ifi_index = iface_idx,
658 		},
659 		.vf_list_rta = {
660 			.rta_type = IFLA_VFINFO_LIST,
661 			.rta_len = RTA_ALIGN(RTA_LENGTH(0)),
662 		},
663 		.vf_info_rta = {
664 			.rta_type = IFLA_VF_INFO,
665 			.rta_len = RTA_ALIGN(RTA_LENGTH(0)),
666 		},
667 		.vf_mac_rta = {
668 			.rta_type = IFLA_VF_MAC,
669 		},
670 	};
671 	struct ifla_vf_mac ivm = {
672 		.vf = vf_index,
673 	};
674 	uint32_t sn = MLX5_NL_SN_GENERATE;
675 
676 	memcpy(&ivm.mac, mac, RTE_ETHER_ADDR_LEN);
677 	memcpy(RTA_DATA(&req.vf_mac_rta), &ivm, sizeof(ivm));
678 
679 	req.vf_mac_rta.rta_len = RTA_LENGTH(sizeof(ivm));
680 	req.hdr.nlmsg_len = NLMSG_ALIGN(req.hdr.nlmsg_len) +
681 		RTA_ALIGN(req.vf_list_rta.rta_len) +
682 		RTA_ALIGN(req.vf_info_rta.rta_len) +
683 		RTA_ALIGN(req.vf_mac_rta.rta_len);
684 	req.vf_list_rta.rta_len = RTE_PTR_DIFF(NLMSG_TAIL(&req.hdr),
685 					       &req.vf_list_rta);
686 	req.vf_info_rta.rta_len = RTE_PTR_DIFF(NLMSG_TAIL(&req.hdr),
687 					       &req.vf_info_rta);
688 
689 	if (nlsk_fd < 0)
690 		return -1;
691 	ret = mlx5_nl_send(nlsk_fd, &req.hdr, sn);
692 	if (ret < 0)
693 		goto error;
694 	ret = mlx5_nl_recv(nlsk_fd, sn, NULL, NULL);
695 	if (ret < 0)
696 		goto error;
697 	return 0;
698 error:
699 	DRV_LOG(ERR,
700 		"representor %u cannot set VF MAC address "
701 		RTE_ETHER_ADDR_PRT_FMT " : %s",
702 		vf_index,
703 		RTE_ETHER_ADDR_BYTES(mac),
704 		strerror(rte_errno));
705 	return -rte_errno;
706 }
707 
708 /**
709  * Add a MAC address.
710  *
711  * @param[in] nlsk_fd
712  *   Netlink socket file descriptor.
713  * @param[in] iface_idx
714  *   Net device interface index.
715  * @param mac_own
716  *   BITFIELD_DECLARE array to store the mac.
717  * @param mac
718  *   MAC address to register.
719  * @param index
720  *   MAC address index.
721  *
722  * @return
723  *   0 on success, a negative errno value otherwise and rte_errno is set.
724  */
725 int
726 mlx5_nl_mac_addr_add(int nlsk_fd, unsigned int iface_idx,
727 		     uint64_t *mac_own, struct rte_ether_addr *mac,
728 		     uint32_t index)
729 {
730 	int ret;
731 
732 	ret = mlx5_nl_mac_addr_modify(nlsk_fd, iface_idx, mac, 1);
733 	if (!ret) {
734 		MLX5_ASSERT(index < MLX5_MAX_MAC_ADDRESSES);
735 		if (index >= MLX5_MAX_MAC_ADDRESSES)
736 			return -EINVAL;
737 
738 		BITFIELD_SET(mac_own, index);
739 	}
740 	if (ret == -EEXIST)
741 		return 0;
742 	return ret;
743 }
744 
745 /**
746  * Remove a MAC address.
747  *
748  * @param[in] nlsk_fd
749  *   Netlink socket file descriptor.
750  * @param[in] iface_idx
751  *   Net device interface index.
752  * @param mac_own
753  *   BITFIELD_DECLARE array to store the mac.
754  * @param mac
755  *   MAC address to remove.
756  * @param index
757  *   MAC address index.
758  *
759  * @return
760  *   0 on success, a negative errno value otherwise and rte_errno is set.
761  */
762 int
763 mlx5_nl_mac_addr_remove(int nlsk_fd, unsigned int iface_idx, uint64_t *mac_own,
764 			struct rte_ether_addr *mac, uint32_t index)
765 {
766 	MLX5_ASSERT(index < MLX5_MAX_MAC_ADDRESSES);
767 	if (index >= MLX5_MAX_MAC_ADDRESSES)
768 		return -EINVAL;
769 
770 	BITFIELD_RESET(mac_own, index);
771 	return mlx5_nl_mac_addr_modify(nlsk_fd, iface_idx, mac, 0);
772 }
773 
774 /**
775  * Synchronize Netlink bridge table to the internal table.
776  *
777  * @param[in] nlsk_fd
778  *   Netlink socket file descriptor.
779  * @param[in] iface_idx
780  *   Net device interface index.
781  * @param mac_addrs
782  *   Mac addresses array to sync.
783  * @param n
784  *   @p mac_addrs array size.
785  */
786 void
787 mlx5_nl_mac_addr_sync(int nlsk_fd, unsigned int iface_idx,
788 		      struct rte_ether_addr *mac_addrs, int n)
789 {
790 	struct rte_ether_addr macs[n];
791 	int macs_n = 0;
792 	int i;
793 	int ret;
794 
795 	memset(macs, 0, n * sizeof(macs[0]));
796 	ret = mlx5_nl_mac_addr_list(nlsk_fd, iface_idx, &macs, &macs_n);
797 	if (ret)
798 		return;
799 	for (i = 0; i != macs_n; ++i) {
800 		int j;
801 
802 		/* Verify the address is not in the array yet. */
803 		for (j = 0; j != n; ++j)
804 			if (rte_is_same_ether_addr(&macs[i], &mac_addrs[j]))
805 				break;
806 		if (j != n)
807 			continue;
808 		if (rte_is_multicast_ether_addr(&macs[i])) {
809 			/* Find the first entry available. */
810 			for (j = MLX5_MAX_UC_MAC_ADDRESSES; j != n; ++j) {
811 				if (rte_is_zero_ether_addr(&mac_addrs[j])) {
812 					mac_addrs[j] = macs[i];
813 					break;
814 				}
815 			}
816 		} else {
817 			/* Find the first entry available. */
818 			for (j = 0; j != MLX5_MAX_UC_MAC_ADDRESSES; ++j) {
819 				if (rte_is_zero_ether_addr(&mac_addrs[j])) {
820 					mac_addrs[j] = macs[i];
821 					break;
822 				}
823 			}
824 		}
825 	}
826 }
827 
828 /**
829  * Flush all added MAC addresses.
830  *
831  * @param[in] nlsk_fd
832  *   Netlink socket file descriptor.
833  * @param[in] iface_idx
834  *   Net device interface index.
835  * @param[in] mac_addrs
836  *   Mac addresses array to flush.
837  * @param n
838  *   @p mac_addrs array size.
839  * @param mac_own
840  *   BITFIELD_DECLARE array to store the mac.
841  */
842 void
843 mlx5_nl_mac_addr_flush(int nlsk_fd, unsigned int iface_idx,
844 		       struct rte_ether_addr *mac_addrs, int n,
845 		       uint64_t *mac_own)
846 {
847 	int i;
848 
849 	if (n <= 0 || n > MLX5_MAX_MAC_ADDRESSES)
850 		return;
851 
852 	for (i = n - 1; i >= 0; --i) {
853 		struct rte_ether_addr *m = &mac_addrs[i];
854 
855 		if (BITFIELD_ISSET(mac_own, i))
856 			mlx5_nl_mac_addr_remove(nlsk_fd, iface_idx, mac_own, m,
857 						i);
858 	}
859 }
860 
861 /**
862  * Enable promiscuous / all multicast mode through Netlink.
863  *
864  * @param[in] nlsk_fd
865  *   Netlink socket file descriptor.
866  * @param[in] iface_idx
867  *   Net device interface index.
868  * @param flags
869  *   IFF_PROMISC for promiscuous, IFF_ALLMULTI for allmulti.
870  * @param enable
871  *   Nonzero to enable, disable otherwise.
872  *
873  * @return
874  *   0 on success, a negative errno value otherwise and rte_errno is set.
875  */
876 static int
877 mlx5_nl_device_flags(int nlsk_fd, unsigned int iface_idx, uint32_t flags,
878 		     int enable)
879 {
880 	struct {
881 		struct nlmsghdr hdr;
882 		struct ifinfomsg ifi;
883 	} req = {
884 		.hdr = {
885 			.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
886 			.nlmsg_type = RTM_NEWLINK,
887 			.nlmsg_flags = NLM_F_REQUEST,
888 		},
889 		.ifi = {
890 			.ifi_flags = enable ? flags : 0,
891 			.ifi_change = flags,
892 			.ifi_index = iface_idx,
893 		},
894 	};
895 	uint32_t sn = MLX5_NL_SN_GENERATE;
896 	int ret;
897 
898 	MLX5_ASSERT(!(flags & ~(IFF_PROMISC | IFF_ALLMULTI)));
899 	if (nlsk_fd < 0)
900 		return 0;
901 	ret = mlx5_nl_send(nlsk_fd, &req.hdr, sn);
902 	if (ret < 0)
903 		return ret;
904 	return 0;
905 }
906 
907 /**
908  * Enable promiscuous mode through Netlink.
909  *
910  * @param[in] nlsk_fd
911  *   Netlink socket file descriptor.
912  * @param[in] iface_idx
913  *   Net device interface index.
914  * @param enable
915  *   Nonzero to enable, disable otherwise.
916  *
917  * @return
918  *   0 on success, a negative errno value otherwise and rte_errno is set.
919  */
920 int
921 mlx5_nl_promisc(int nlsk_fd, unsigned int iface_idx, int enable)
922 {
923 	int ret = mlx5_nl_device_flags(nlsk_fd, iface_idx, IFF_PROMISC, enable);
924 
925 	if (ret)
926 		DRV_LOG(DEBUG,
927 			"Interface %u cannot %s promisc mode: Netlink error %s",
928 			iface_idx, enable ? "enable" : "disable",
929 			strerror(rte_errno));
930 	return ret;
931 }
932 
933 /**
934  * Enable all multicast mode through Netlink.
935  *
936  * @param[in] nlsk_fd
937  *   Netlink socket file descriptor.
938  * @param[in] iface_idx
939  *   Net device interface index.
940  * @param enable
941  *   Nonzero to enable, disable otherwise.
942  *
943  * @return
944  *   0 on success, a negative errno value otherwise and rte_errno is set.
945  */
946 int
947 mlx5_nl_allmulti(int nlsk_fd, unsigned int iface_idx, int enable)
948 {
949 	int ret = mlx5_nl_device_flags(nlsk_fd, iface_idx, IFF_ALLMULTI,
950 				       enable);
951 
952 	if (ret)
953 		DRV_LOG(DEBUG,
954 			"Interface %u cannot %s allmulti : Netlink error %s",
955 			iface_idx, enable ? "enable" : "disable",
956 			strerror(rte_errno));
957 	return ret;
958 }
959 
960 /**
961  * Process network interface information from Netlink message.
962  *
963  * @param nh
964  *   Pointer to Netlink message header.
965  * @param arg
966  *   Opaque data pointer for this callback.
967  *
968  * @return
969  *   0 on success, a negative errno value otherwise and rte_errno is set.
970  */
971 static int
972 mlx5_nl_cmdget_cb(struct nlmsghdr *nh, void *arg)
973 {
974 	struct mlx5_nl_port_info *data = arg;
975 	struct mlx5_nl_port_info local = {
976 		.flags = 0,
977 	};
978 	size_t off = NLMSG_HDRLEN;
979 
980 	if (nh->nlmsg_type !=
981 	    RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_GET) &&
982 	    nh->nlmsg_type !=
983 	    RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_PORT_GET))
984 		goto error;
985 	while (off < nh->nlmsg_len) {
986 		struct nlattr *na = (void *)((uintptr_t)nh + off);
987 		void *payload = (void *)((uintptr_t)na + NLA_HDRLEN);
988 
989 		if (na->nla_len > nh->nlmsg_len - off)
990 			goto error;
991 		switch (na->nla_type) {
992 		case RDMA_NLDEV_ATTR_DEV_INDEX:
993 			local.ibindex = *(uint32_t *)payload;
994 			local.flags |= MLX5_NL_CMD_GET_IB_INDEX;
995 			break;
996 		case RDMA_NLDEV_ATTR_DEV_NAME:
997 			if (!strcmp(payload, data->name))
998 				local.flags |= MLX5_NL_CMD_GET_IB_NAME;
999 			break;
1000 		case RDMA_NLDEV_ATTR_NDEV_INDEX:
1001 			local.ifindex = *(uint32_t *)payload;
1002 			local.flags |= MLX5_NL_CMD_GET_NET_INDEX;
1003 			break;
1004 		case RDMA_NLDEV_ATTR_PORT_INDEX:
1005 			local.portnum = *(uint32_t *)payload;
1006 			local.flags |= MLX5_NL_CMD_GET_PORT_INDEX;
1007 			break;
1008 		case RDMA_NLDEV_ATTR_PORT_STATE:
1009 			local.state = *(uint8_t *)payload;
1010 			local.flags |= MLX5_NL_CMD_GET_PORT_STATE;
1011 			break;
1012 		default:
1013 			break;
1014 		}
1015 		off += NLA_ALIGN(na->nla_len);
1016 	}
1017 	/*
1018 	 * It is possible to have multiple messages for all
1019 	 * Infiniband devices in the system with appropriate name.
1020 	 * So we should gather parameters locally and copy to
1021 	 * query context only in case of coinciding device name.
1022 	 */
1023 	if (local.flags & MLX5_NL_CMD_GET_IB_NAME) {
1024 		data->flags = local.flags;
1025 		data->ibindex = local.ibindex;
1026 		data->ifindex = local.ifindex;
1027 		data->portnum = local.portnum;
1028 		data->state = local.state;
1029 	}
1030 	return 0;
1031 error:
1032 	rte_errno = EINVAL;
1033 	return -rte_errno;
1034 }
1035 
1036 /**
1037  * Get port info of network interface associated with some IB device.
1038  *
1039  * This is the only somewhat safe method to avoid resorting to heuristics
1040  * when faced with port representors. Unfortunately it requires at least
1041  * Linux 4.17.
1042  *
1043  * @param nl
1044  *   Netlink socket of the RDMA kind (NETLINK_RDMA).
1045  * @param[in] pindex
1046  *   IB device port index, starting from 1
1047  * @param[out] data
1048  *   Pointer to port info.
1049  * @return
1050  *   0 on success, negative on error and rte_errno is set.
1051  */
1052 static int
1053 mlx5_nl_port_info(int nl, uint32_t pindex, struct mlx5_nl_port_info *data)
1054 {
1055 	union {
1056 		struct nlmsghdr nh;
1057 		uint8_t buf[NLMSG_HDRLEN +
1058 			    NLA_HDRLEN + NLA_ALIGN(sizeof(data->ibindex)) +
1059 			    NLA_HDRLEN + NLA_ALIGN(sizeof(pindex))];
1060 	} req = {
1061 		.nh = {
1062 			.nlmsg_len = NLMSG_LENGTH(0),
1063 			.nlmsg_type = RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
1064 						       RDMA_NLDEV_CMD_GET),
1065 			.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_DUMP,
1066 		},
1067 	};
1068 	struct nlattr *na;
1069 	uint32_t sn = MLX5_NL_SN_GENERATE;
1070 	int ret;
1071 
1072 	ret = mlx5_nl_send(nl, &req.nh, sn);
1073 	if (ret < 0)
1074 		return ret;
1075 	ret = mlx5_nl_recv(nl, sn, mlx5_nl_cmdget_cb, data);
1076 	if (ret < 0)
1077 		return ret;
1078 	if (!(data->flags & MLX5_NL_CMD_GET_IB_NAME) ||
1079 	    !(data->flags & MLX5_NL_CMD_GET_IB_INDEX))
1080 		goto error;
1081 	data->flags = 0;
1082 	sn = MLX5_NL_SN_GENERATE;
1083 	req.nh.nlmsg_type = RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
1084 					     RDMA_NLDEV_CMD_PORT_GET);
1085 	req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
1086 	req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(req.buf) - NLMSG_HDRLEN);
1087 	na = (void *)((uintptr_t)req.buf + NLMSG_HDRLEN);
1088 	na->nla_len = NLA_HDRLEN + sizeof(data->ibindex);
1089 	na->nla_type = RDMA_NLDEV_ATTR_DEV_INDEX;
1090 	memcpy((void *)((uintptr_t)na + NLA_HDRLEN),
1091 	       &data->ibindex, sizeof(data->ibindex));
1092 	na = (void *)((uintptr_t)na + NLA_ALIGN(na->nla_len));
1093 	na->nla_len = NLA_HDRLEN + sizeof(pindex);
1094 	na->nla_type = RDMA_NLDEV_ATTR_PORT_INDEX;
1095 	memcpy((void *)((uintptr_t)na + NLA_HDRLEN),
1096 	       &pindex, sizeof(pindex));
1097 	ret = mlx5_nl_send(nl, &req.nh, sn);
1098 	if (ret < 0)
1099 		return ret;
1100 	ret = mlx5_nl_recv(nl, sn, mlx5_nl_cmdget_cb, data);
1101 	if (ret < 0)
1102 		return ret;
1103 	if (!(data->flags & MLX5_NL_CMD_GET_IB_NAME) ||
1104 	    !(data->flags & MLX5_NL_CMD_GET_IB_INDEX) ||
1105 	    !(data->flags & MLX5_NL_CMD_GET_NET_INDEX) ||
1106 	    !data->ifindex)
1107 		goto error;
1108 	return 1;
1109 error:
1110 	rte_errno = ENODEV;
1111 	return -rte_errno;
1112 }
1113 
1114 /**
1115  * Get index of network interface associated with some IB device.
1116  *
1117  * This is the only somewhat safe method to avoid resorting to heuristics
1118  * when faced with port representors. Unfortunately it requires at least
1119  * Linux 4.17.
1120  *
1121  * @param nl
1122  *   Netlink socket of the RDMA kind (NETLINK_RDMA).
1123  * @param[in] name
1124  *   IB device name.
1125  * @param[in] pindex
1126  *   IB device port index, starting from 1
1127  * @return
1128  *   A valid (nonzero) interface index on success, 0 otherwise and rte_errno
1129  *   is set.
1130  */
1131 unsigned int
1132 mlx5_nl_ifindex(int nl, const char *name, uint32_t pindex)
1133 {
1134 	struct mlx5_nl_port_info data = {
1135 			.ifindex = 0,
1136 			.name = name,
1137 	};
1138 
1139 	if (mlx5_nl_port_info(nl, pindex, &data) < 0)
1140 		return 0;
1141 	return data.ifindex;
1142 }
1143 
1144 /**
1145  * Get IB device port state.
1146  *
1147  * This is the only somewhat safe method to get info for port number >= 255.
1148  * Unfortunately it requires at least Linux 4.17.
1149  *
1150  * @param nl
1151  *   Netlink socket of the RDMA kind (NETLINK_RDMA).
1152  * @param[in] name
1153  *   IB device name.
1154  * @param[in] pindex
1155  *   IB device port index, starting from 1
1156  * @return
1157  *   Port state (ibv_port_state) on success, negative on error
1158  *   and rte_errno is set.
1159  */
1160 int
1161 mlx5_nl_port_state(int nl, const char *name, uint32_t pindex)
1162 {
1163 	struct mlx5_nl_port_info data = {
1164 			.state = 0,
1165 			.name = name,
1166 	};
1167 
1168 	if (mlx5_nl_port_info(nl, pindex, &data) < 0)
1169 		return -rte_errno;
1170 	if ((data.flags & MLX5_NL_CMD_GET_PORT_STATE) == 0) {
1171 		rte_errno = ENOTSUP;
1172 		return -rte_errno;
1173 	}
1174 	return (int)data.state;
1175 }
1176 
1177 /**
1178  * Get the number of physical ports of given IB device.
1179  *
1180  * @param nl
1181  *   Netlink socket of the RDMA kind (NETLINK_RDMA).
1182  * @param[in] name
1183  *   IB device name.
1184  *
1185  * @return
1186  *   A valid (nonzero) number of ports on success, 0 otherwise
1187  *   and rte_errno is set.
1188  */
1189 unsigned int
1190 mlx5_nl_portnum(int nl, const char *name)
1191 {
1192 	struct mlx5_nl_port_info data = {
1193 		.flags = 0,
1194 		.name = name,
1195 		.ifindex = 0,
1196 		.portnum = 0,
1197 	};
1198 	struct nlmsghdr req = {
1199 		.nlmsg_len = NLMSG_LENGTH(0),
1200 		.nlmsg_type = RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
1201 					       RDMA_NLDEV_CMD_GET),
1202 		.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_DUMP,
1203 	};
1204 	uint32_t sn = MLX5_NL_SN_GENERATE;
1205 	int ret;
1206 
1207 	ret = mlx5_nl_send(nl, &req, sn);
1208 	if (ret < 0)
1209 		return 0;
1210 	ret = mlx5_nl_recv(nl, sn, mlx5_nl_cmdget_cb, &data);
1211 	if (ret < 0)
1212 		return 0;
1213 	if (!(data.flags & MLX5_NL_CMD_GET_IB_NAME) ||
1214 	    !(data.flags & MLX5_NL_CMD_GET_IB_INDEX) ||
1215 	    !(data.flags & MLX5_NL_CMD_GET_PORT_INDEX)) {
1216 		rte_errno = ENODEV;
1217 		return 0;
1218 	}
1219 	if (!data.portnum)
1220 		rte_errno = EINVAL;
1221 	return data.portnum;
1222 }
1223 
1224 /**
1225  * Analyze gathered port parameters via Netlink to recognize master
1226  * and representor devices for E-Switch configuration.
1227  *
1228  * @param[in] num_vf_set
1229  *   flag of presence of number of VFs port attribute.
1230  * @param[inout] switch_info
1231  *   Port information, including port name as a number and port name
1232  *   type if recognized
1233  *
1234  * @return
1235  *   master and representor flags are set in switch_info according to
1236  *   recognized parameters (if any).
1237  */
1238 static void
1239 mlx5_nl_check_switch_info(bool num_vf_set,
1240 			  struct mlx5_switch_info *switch_info)
1241 {
1242 	switch (switch_info->name_type) {
1243 	case MLX5_PHYS_PORT_NAME_TYPE_UNKNOWN:
1244 		/*
1245 		 * Name is not recognized, assume the master,
1246 		 * check the number of VFs key presence.
1247 		 */
1248 		switch_info->master = num_vf_set;
1249 		break;
1250 	case MLX5_PHYS_PORT_NAME_TYPE_NOTSET:
1251 		/*
1252 		 * Name is not set, this assumes the legacy naming
1253 		 * schema for master, just check if there is a
1254 		 * number of VFs key.
1255 		 */
1256 		switch_info->master = num_vf_set;
1257 		break;
1258 	case MLX5_PHYS_PORT_NAME_TYPE_UPLINK:
1259 		/* New uplink naming schema recognized. */
1260 		switch_info->master = 1;
1261 		break;
1262 	case MLX5_PHYS_PORT_NAME_TYPE_LEGACY:
1263 		/* Legacy representors naming schema. */
1264 		switch_info->representor = !num_vf_set;
1265 		break;
1266 	case MLX5_PHYS_PORT_NAME_TYPE_PFHPF:
1267 		/* Fallthrough */
1268 	case MLX5_PHYS_PORT_NAME_TYPE_PFVF:
1269 		/* Fallthrough */
1270 	case MLX5_PHYS_PORT_NAME_TYPE_PFSF:
1271 		/* New representors naming schema. */
1272 		switch_info->representor = 1;
1273 		break;
1274 	}
1275 }
1276 
1277 /**
1278  * Process switch information from Netlink message.
1279  *
1280  * @param nh
1281  *   Pointer to Netlink message header.
1282  * @param arg
1283  *   Opaque data pointer for this callback.
1284  *
1285  * @return
1286  *   0 on success, a negative errno value otherwise and rte_errno is set.
1287  */
1288 static int
1289 mlx5_nl_switch_info_cb(struct nlmsghdr *nh, void *arg)
1290 {
1291 	struct mlx5_switch_info info = {
1292 		.master = 0,
1293 		.representor = 0,
1294 		.name_type = MLX5_PHYS_PORT_NAME_TYPE_NOTSET,
1295 		.port_name = 0,
1296 		.switch_id = 0,
1297 	};
1298 	size_t off = NLMSG_LENGTH(sizeof(struct ifinfomsg));
1299 	bool switch_id_set = false;
1300 	bool num_vf_set = false;
1301 	int len;
1302 
1303 	if (nh->nlmsg_type != RTM_NEWLINK)
1304 		goto error;
1305 	while (off < nh->nlmsg_len) {
1306 		struct rtattr *ra = (void *)((uintptr_t)nh + off);
1307 		void *payload = RTA_DATA(ra);
1308 		unsigned int i;
1309 
1310 		if (ra->rta_len > nh->nlmsg_len - off)
1311 			goto error;
1312 		switch (ra->rta_type) {
1313 		case IFLA_NUM_VF:
1314 			num_vf_set = true;
1315 			break;
1316 		case IFLA_PHYS_PORT_NAME:
1317 			len = RTA_PAYLOAD(ra);
1318 			/* Some kernels do not pad attributes with zero. */
1319 			if (len > 0 && len < MLX5_PHYS_PORT_NAME_MAX) {
1320 				char name[MLX5_PHYS_PORT_NAME_MAX];
1321 
1322 				/*
1323 				 * We can't just patch the message with padding
1324 				 * zero - it might corrupt the following items
1325 				 * in the message, we have to copy the string
1326 				 * by attribute length and pad the copied one.
1327 				 */
1328 				memcpy(name, payload, len);
1329 				name[len] = 0;
1330 				mlx5_translate_port_name(name, &info);
1331 			} else {
1332 				info.name_type =
1333 					MLX5_PHYS_PORT_NAME_TYPE_UNKNOWN;
1334 			}
1335 			break;
1336 		case IFLA_PHYS_SWITCH_ID:
1337 			info.switch_id = 0;
1338 			for (i = 0; i < RTA_PAYLOAD(ra); ++i) {
1339 				info.switch_id <<= 8;
1340 				info.switch_id |= ((uint8_t *)payload)[i];
1341 			}
1342 			switch_id_set = true;
1343 			break;
1344 		}
1345 		off += RTA_ALIGN(ra->rta_len);
1346 	}
1347 	if (switch_id_set) {
1348 		/* We have some E-Switch configuration. */
1349 		mlx5_nl_check_switch_info(num_vf_set, &info);
1350 	}
1351 	MLX5_ASSERT(!(info.master && info.representor));
1352 	memcpy(arg, &info, sizeof(info));
1353 	return 0;
1354 error:
1355 	rte_errno = EINVAL;
1356 	return -rte_errno;
1357 }
1358 
1359 /**
1360  * Get switch information associated with network interface.
1361  *
1362  * @param nl
1363  *   Netlink socket of the ROUTE kind (NETLINK_ROUTE).
1364  * @param ifindex
1365  *   Network interface index.
1366  * @param[out] info
1367  *   Switch information object, populated in case of success.
1368  *
1369  * @return
1370  *   0 on success, a negative errno value otherwise and rte_errno is set.
1371  */
1372 int
1373 mlx5_nl_switch_info(int nl, unsigned int ifindex,
1374 		    struct mlx5_switch_info *info)
1375 {
1376 	struct {
1377 		struct nlmsghdr nh;
1378 		struct ifinfomsg info;
1379 		struct rtattr rta;
1380 		uint32_t extmask;
1381 	} req = {
1382 		.nh = {
1383 			.nlmsg_len = NLMSG_LENGTH
1384 					(sizeof(req.info) +
1385 					 RTA_LENGTH(sizeof(uint32_t))),
1386 			.nlmsg_type = RTM_GETLINK,
1387 			.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK,
1388 		},
1389 		.info = {
1390 			.ifi_family = AF_UNSPEC,
1391 			.ifi_index = ifindex,
1392 		},
1393 		.rta = {
1394 			.rta_type = IFLA_EXT_MASK,
1395 			.rta_len = RTA_LENGTH(sizeof(int32_t)),
1396 		},
1397 		.extmask = RTE_LE32(1),
1398 	};
1399 	uint32_t sn = MLX5_NL_SN_GENERATE;
1400 	int ret;
1401 
1402 	ret = mlx5_nl_send(nl, &req.nh, sn);
1403 	if (ret >= 0)
1404 		ret = mlx5_nl_recv(nl, sn, mlx5_nl_switch_info_cb, info);
1405 	if (info->master && info->representor) {
1406 		DRV_LOG(ERR, "ifindex %u device is recognized as master"
1407 			     " and as representor", ifindex);
1408 		rte_errno = ENODEV;
1409 		ret = -rte_errno;
1410 	}
1411 	return ret;
1412 }
1413 
1414 /*
1415  * Delete VLAN network device by ifindex.
1416  *
1417  * @param[in] tcf
1418  *   Context object initialized by mlx5_nl_vlan_vmwa_init().
1419  * @param[in] ifindex
1420  *   Interface index of network device to delete.
1421  */
1422 void
1423 mlx5_nl_vlan_vmwa_delete(struct mlx5_nl_vlan_vmwa_context *vmwa,
1424 		      uint32_t ifindex)
1425 {
1426 	uint32_t sn = MLX5_NL_SN_GENERATE;
1427 	int ret;
1428 	struct {
1429 		struct nlmsghdr nh;
1430 		struct ifinfomsg info;
1431 	} req = {
1432 		.nh = {
1433 			.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
1434 			.nlmsg_type = RTM_DELLINK,
1435 			.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK,
1436 		},
1437 		.info = {
1438 			.ifi_family = AF_UNSPEC,
1439 			.ifi_index = ifindex,
1440 		},
1441 	};
1442 
1443 	if (ifindex) {
1444 		ret = mlx5_nl_send(vmwa->nl_socket, &req.nh, sn);
1445 		if (ret >= 0)
1446 			ret = mlx5_nl_recv(vmwa->nl_socket, sn, NULL, NULL);
1447 		if (ret < 0)
1448 			DRV_LOG(WARNING, "netlink: error deleting VLAN WA"
1449 				" ifindex %u, %d", ifindex, ret);
1450 	}
1451 }
1452 
/* Set of subroutines to build Netlink message. */

/**
 * Locate the position where the next attribute of a message would start.
 *
 * @param nlh
 *   Netlink message header; its nlmsg_len field gives the current length.
 *
 * @return
 *   Pointer to the byte at offset NLMSG_ALIGN(nlmsg_len) from @p nlh.
 */
static struct nlattr *
nl_msg_tail(struct nlmsghdr *nlh)
{
	uint8_t *base = (uint8_t *)nlh;
	size_t used = NLMSG_ALIGN(nlh->nlmsg_len);

	return (struct nlattr *)(base + used);
}
1460 
/**
 * Append one attribute at the tail of a Netlink message.
 *
 * No bound checking is performed here: the caller must provide a buffer
 * large enough for the attribute header and payload.
 *
 * @param nlh
 *   Netlink message header; nlmsg_len is increased by the aligned
 *   attribute length.
 * @param type
 *   Attribute type stored in nla_type.
 * @param data
 *   Attribute payload, not read when @p alen is zero.
 * @param alen
 *   Payload length in bytes.
 */
static void
nl_attr_put(struct nlmsghdr *nlh, int type, const void *data, int alen)
{
	struct nlattr *attr = nl_msg_tail(nlh);
	uint8_t *payload = (uint8_t *)attr + sizeof(*attr);

	attr->nla_len = NLMSG_ALIGN(sizeof(*attr)) + alen;
	attr->nla_type = type;
	if (alen != 0)
		memcpy(payload, data, alen);
	/* Account for the attribute, padding included, in the message. */
	nlh->nlmsg_len += NLMSG_ALIGN(attr->nla_len);
}
1473 
/**
 * Open a nested attribute at the tail of a Netlink message.
 *
 * An empty attribute of the given type is appended; its length has to be
 * fixed up with nl_attr_nest_end() once the nested attributes are added.
 *
 * @param nlh
 *   Netlink message header.
 * @param type
 *   Type of the nesting attribute.
 *
 * @return
 *   Pointer to the nesting attribute just appended.
 */
static struct nlattr *
nl_attr_nest_start(struct nlmsghdr *nlh, int type)
{
	/* Remember where the container starts before the tail moves. */
	struct nlattr *container = nl_msg_tail(nlh);

	nl_attr_put(nlh, type, NULL, 0);
	return container;
}
1482 
/**
 * Close a nested attribute previously opened with nl_attr_nest_start().
 *
 * @param nlh
 *   Netlink message header.
 * @param nest
 *   Nesting attribute; its length is set to the distance between it and
 *   the current message tail.
 */
static void
nl_attr_nest_end(struct nlmsghdr *nlh, struct nlattr *nest)
{
	uint8_t *first = (uint8_t *)nest;
	uint8_t *past = (uint8_t *)nl_msg_tail(nlh);

	nest->nla_len = past - first;
}
1488 
1489 /*
1490  * Create network VLAN device with specified VLAN tag.
1491  *
1492  * @param[in] tcf
1493  *   Context object initialized by mlx5_nl_vlan_vmwa_init().
1494  * @param[in] ifindex
1495  *   Base network interface index.
1496  * @param[in] tag
1497  *   VLAN tag for VLAN network device to create.
1498  */
1499 uint32_t
1500 mlx5_nl_vlan_vmwa_create(struct mlx5_nl_vlan_vmwa_context *vmwa,
1501 			 uint32_t ifindex, uint16_t tag)
1502 {
1503 	struct nlmsghdr *nlh;
1504 	struct ifinfomsg *ifm;
1505 	char name[sizeof(MLX5_VMWA_VLAN_DEVICE_PFX) + 32];
1506 
1507 	__rte_cache_aligned
1508 	uint8_t buf[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
1509 		    NLMSG_ALIGN(sizeof(struct ifinfomsg)) +
1510 		    NLMSG_ALIGN(sizeof(struct nlattr)) * 8 +
1511 		    NLMSG_ALIGN(sizeof(uint32_t)) +
1512 		    NLMSG_ALIGN(sizeof(name)) +
1513 		    NLMSG_ALIGN(sizeof("vlan")) +
1514 		    NLMSG_ALIGN(sizeof(uint32_t)) +
1515 		    NLMSG_ALIGN(sizeof(uint16_t)) + 16];
1516 	struct nlattr *na_info;
1517 	struct nlattr *na_vlan;
1518 	uint32_t sn = MLX5_NL_SN_GENERATE;
1519 	int ret;
1520 
1521 	memset(buf, 0, sizeof(buf));
1522 	nlh = (struct nlmsghdr *)buf;
1523 	nlh->nlmsg_len = sizeof(struct nlmsghdr);
1524 	nlh->nlmsg_type = RTM_NEWLINK;
1525 	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE |
1526 			   NLM_F_EXCL | NLM_F_ACK;
1527 	ifm = (struct ifinfomsg *)nl_msg_tail(nlh);
1528 	nlh->nlmsg_len += sizeof(struct ifinfomsg);
1529 	ifm->ifi_family = AF_UNSPEC;
1530 	ifm->ifi_type = 0;
1531 	ifm->ifi_index = 0;
1532 	ifm->ifi_flags = IFF_UP;
1533 	ifm->ifi_change = 0xffffffff;
1534 	nl_attr_put(nlh, IFLA_LINK, &ifindex, sizeof(ifindex));
1535 	ret = snprintf(name, sizeof(name), "%s.%u.%u",
1536 		       MLX5_VMWA_VLAN_DEVICE_PFX, ifindex, tag);
1537 	nl_attr_put(nlh, IFLA_IFNAME, name, ret + 1);
1538 	na_info = nl_attr_nest_start(nlh, IFLA_LINKINFO);
1539 	nl_attr_put(nlh, IFLA_INFO_KIND, "vlan", sizeof("vlan"));
1540 	na_vlan = nl_attr_nest_start(nlh, IFLA_INFO_DATA);
1541 	nl_attr_put(nlh, IFLA_VLAN_ID, &tag, sizeof(tag));
1542 	nl_attr_nest_end(nlh, na_vlan);
1543 	nl_attr_nest_end(nlh, na_info);
1544 	MLX5_ASSERT(sizeof(buf) >= nlh->nlmsg_len);
1545 	ret = mlx5_nl_send(vmwa->nl_socket, nlh, sn);
1546 	if (ret >= 0)
1547 		ret = mlx5_nl_recv(vmwa->nl_socket, sn, NULL, NULL);
1548 	if (ret < 0) {
1549 		DRV_LOG(WARNING, "netlink: VLAN %s create failure (%d)", name,
1550 			ret);
1551 	}
1552 	/* Try to get ifindex of created or pre-existing device. */
1553 	ret = if_nametoindex(name);
1554 	if (!ret) {
1555 		DRV_LOG(WARNING, "VLAN %s failed to get index (%d)", name,
1556 			errno);
1557 		return 0;
1558 	}
1559 	return ret;
1560 }
1561 
/**
 * Parse Netlink message to retrieve the general family ID.
 *
 * Scans the attributes following the generic Netlink header and stops at
 * the first CTRL_ATTR_FAMILY_ID attribute, or at a zero-length attribute,
 * or at the end of the message.
 *
 * @param nh
 *   Pointer to Netlink Message Header.
 * @param arg
 *   PMD data register with this callback; must point to a uint16_t that
 *   receives the family ID.
 *
 * @return
 *   0 when the family ID was found and stored, -EINVAL otherwise.
 *   NOTE(review): unlike mlx5_nl_switch_info_cb(), rte_errno is not written
 *   here - confirm whether the caller relies on it.
 */
static int
mlx5_nl_family_id_cb(struct nlmsghdr *nh, void *arg)
{
	struct nlattr *tail = (void *)((uintptr_t)nh + nh->nlmsg_len);
	struct nlattr *nla = (void *)((uintptr_t)nh +
				      NLMSG_ALIGN(sizeof(*nh)) +
				      NLMSG_ALIGN(sizeof(struct genlmsghdr)));

	/*
	 * The bound must be tested before the attribute is dereferenced,
	 * otherwise nla_len is read past the end of the message.
	 */
	for (; nla < tail && nla->nla_len;
	     nla = (void *)((uintptr_t)nla + NLMSG_ALIGN(nla->nla_len))) {
		if (nla->nla_type == CTRL_ATTR_FAMILY_ID) {
			/* The payload immediately follows the header. */
			*(uint16_t *)arg = *(uint16_t *)(nla + 1);
			return 0;
		}
	}
	return -EINVAL;
}
1590 
1591 #define MLX5_NL_MAX_ATTR_SIZE 100
1592 /**
1593  * Get generic netlink family ID.
1594  *
1595  * @param[in] nlsk_fd
1596  *   Netlink socket file descriptor.
1597  * @param[in] name
1598  *   The family name.
1599  *
1600  * @return
1601  *   ID >= 0 on success and @p enable is updated, a negative errno value
1602  *   otherwise and rte_errno is set.
1603  */
1604 static int
1605 mlx5_nl_generic_family_id_get(int nlsk_fd, const char *name)
1606 {
1607 	struct nlmsghdr *nlh;
1608 	struct genlmsghdr *genl;
1609 	uint32_t sn = MLX5_NL_SN_GENERATE;
1610 	int name_size = strlen(name) + 1;
1611 	int ret;
1612 	uint16_t id = -1;
1613 	uint8_t buf[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
1614 		    NLMSG_ALIGN(sizeof(struct genlmsghdr)) +
1615 		    NLMSG_ALIGN(sizeof(struct nlattr)) +
1616 		    NLMSG_ALIGN(MLX5_NL_MAX_ATTR_SIZE)];
1617 
1618 	memset(buf, 0, sizeof(buf));
1619 	nlh = (struct nlmsghdr *)buf;
1620 	nlh->nlmsg_len = sizeof(struct nlmsghdr);
1621 	nlh->nlmsg_type = GENL_ID_CTRL;
1622 	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
1623 	genl = (struct genlmsghdr *)nl_msg_tail(nlh);
1624 	nlh->nlmsg_len += sizeof(struct genlmsghdr);
1625 	genl->cmd = CTRL_CMD_GETFAMILY;
1626 	genl->version = 1;
1627 	nl_attr_put(nlh, CTRL_ATTR_FAMILY_NAME, name, name_size);
1628 	ret = mlx5_nl_send(nlsk_fd, nlh, sn);
1629 	if (ret >= 0)
1630 		ret = mlx5_nl_recv(nlsk_fd, sn, mlx5_nl_family_id_cb, &id);
1631 	if (ret < 0) {
1632 		DRV_LOG(DEBUG, "Failed to get Netlink %s family ID: %d.", name,
1633 			ret);
1634 		return ret;
1635 	}
1636 	DRV_LOG(DEBUG, "Netlink \"%s\" family ID is %u.", name, id);
1637 	return (int)id;
1638 }
1639 
/**
 * Query the ID of the Devlink generic Netlink family.
 *
 * @param[in] nlsk_fd
 *   Netlink socket file descriptor.
 *
 * @return
 *   ID >= 0 on success, a negative errno value otherwise and rte_errno
 *   is set.
 */
int
mlx5_nl_devlink_family_id_get(int nlsk_fd)
{
	const char *family = DEVLINK_GENL_NAME;

	return mlx5_nl_generic_family_id_get(nlsk_fd, family);
}
1656 
/**
 * Parse Netlink message to retrieve the ROCE enable status.
 *
 * Walks the attributes following the generic Netlink header, descending
 * into the expected nested parameter attributes. The status is reported
 * as enabled as soon as a DEVLINK_ATTR_PARAM_VALUE_DATA attribute is met,
 * and as disabled when the walk ends without meeting one.
 *
 * @param nh
 *   Pointer to Netlink Message Header.
 * @param arg
 *   PMD data register with this callback; must point to an int that
 *   receives the enable status (1 or 0).
 *
 * @return
 *   0 when at least one expected parameter attribute was met, -EINVAL
 *   otherwise.
 *   NOTE(review): unlike mlx5_nl_switch_info_cb(), rte_errno is not written
 *   here - confirm whether the caller relies on it.
 */
static int
mlx5_nl_roce_cb(struct nlmsghdr *nh, void *arg)
{
	int ret = -EINVAL;
	int *enable = arg;
	struct nlattr *tail = (void *)((uintptr_t)nh + nh->nlmsg_len);
	struct nlattr *nla = (void *)((uintptr_t)nh +
				      NLMSG_ALIGN(sizeof(*nh)) +
				      NLMSG_ALIGN(sizeof(struct genlmsghdr)));

	/*
	 * The bound must be tested before the attribute is dereferenced,
	 * otherwise nla_len is read past the end of the message.
	 */
	while (nla < tail && nla->nla_len) {
		switch (nla->nla_type) {
		/* Expected nested attributes case. */
		case DEVLINK_ATTR_PARAM:
		case DEVLINK_ATTR_PARAM_VALUES_LIST:
		case DEVLINK_ATTR_PARAM_VALUE:
			ret = 0;
			/* Step over the header only, into the nested data. */
			nla += 1;
			break;
		case DEVLINK_ATTR_PARAM_VALUE_DATA:
			*enable = 1;
			return 0;
		default:
			/* Skip the whole attribute, padding included. */
			nla = (void *)((uintptr_t)nla +
				       NLMSG_ALIGN(nla->nla_len));
			break;
		}
	}
	*enable = 0;
	return ret;
}
1697 
1698 /**
1699  * Get ROCE enable status through Netlink.
1700  *
1701  * @param[in] nlsk_fd
1702  *   Netlink socket file descriptor.
1703  * @param[in] family_id
1704  *   the Devlink family ID.
1705  * @param pci_addr
1706  *   The device PCI address.
1707  * @param[out] enable
1708  *   Where to store the enable status.
1709  *
1710  * @return
1711  *   0 on success and @p enable is updated, a negative errno value otherwise
1712  *   and rte_errno is set.
1713  */
1714 int
1715 mlx5_nl_enable_roce_get(int nlsk_fd, int family_id, const char *pci_addr,
1716 			int *enable)
1717 {
1718 	struct nlmsghdr *nlh;
1719 	struct genlmsghdr *genl;
1720 	uint32_t sn = MLX5_NL_SN_GENERATE;
1721 	int ret;
1722 	int cur_en = 0;
1723 	uint8_t buf[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
1724 		    NLMSG_ALIGN(sizeof(struct genlmsghdr)) +
1725 		    NLMSG_ALIGN(sizeof(struct nlattr)) * 4 +
1726 		    NLMSG_ALIGN(MLX5_NL_MAX_ATTR_SIZE) * 4];
1727 
1728 	memset(buf, 0, sizeof(buf));
1729 	nlh = (struct nlmsghdr *)buf;
1730 	nlh->nlmsg_len = sizeof(struct nlmsghdr);
1731 	nlh->nlmsg_type = family_id;
1732 	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
1733 	genl = (struct genlmsghdr *)nl_msg_tail(nlh);
1734 	nlh->nlmsg_len += sizeof(struct genlmsghdr);
1735 	genl->cmd = DEVLINK_CMD_PARAM_GET;
1736 	genl->version = DEVLINK_GENL_VERSION;
1737 	nl_attr_put(nlh, DEVLINK_ATTR_BUS_NAME, "pci", 4);
1738 	nl_attr_put(nlh, DEVLINK_ATTR_DEV_NAME, pci_addr, strlen(pci_addr) + 1);
1739 	nl_attr_put(nlh, DEVLINK_ATTR_PARAM_NAME, "enable_roce", 12);
1740 	ret = mlx5_nl_send(nlsk_fd, nlh, sn);
1741 	if (ret >= 0)
1742 		ret = mlx5_nl_recv(nlsk_fd, sn, mlx5_nl_roce_cb, &cur_en);
1743 	if (ret < 0) {
1744 		DRV_LOG(DEBUG, "Failed to get ROCE enable on device %s: %d.",
1745 			pci_addr, ret);
1746 		return ret;
1747 	}
1748 	*enable = cur_en;
1749 	DRV_LOG(DEBUG, "ROCE is %sabled for device \"%s\".",
1750 		cur_en ? "en" : "dis", pci_addr);
1751 	return ret;
1752 }
1753 
1754 /**
1755  * Reload mlx5 device kernel driver through Netlink.
1756  *
1757  * @param[in] nlsk_fd
1758  *   Netlink socket file descriptor.
1759  * @param[in] family_id
1760  *   the Devlink family ID.
1761  * @param pci_addr
1762  *   The device PCI address.
1763  * @param[out] enable
1764  *   The enable status to set.
1765  *
1766  * @return
1767  *   0 on success, a negative errno value otherwise and rte_errno is set.
1768  */
1769 static int
1770 mlx5_nl_driver_reload(int nlsk_fd, int family_id, const char *pci_addr)
1771 {
1772 	struct nlmsghdr *nlh;
1773 	struct genlmsghdr *genl;
1774 	uint32_t sn = MLX5_NL_SN_GENERATE;
1775 	int ret;
1776 	uint8_t buf[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
1777 		    NLMSG_ALIGN(sizeof(struct genlmsghdr)) +
1778 		    NLMSG_ALIGN(sizeof(struct nlattr)) * 2 +
1779 		    NLMSG_ALIGN(MLX5_NL_MAX_ATTR_SIZE) * 2];
1780 
1781 	memset(buf, 0, sizeof(buf));
1782 	nlh = (struct nlmsghdr *)buf;
1783 	nlh->nlmsg_len = sizeof(struct nlmsghdr);
1784 	nlh->nlmsg_type = family_id;
1785 	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
1786 	genl = (struct genlmsghdr *)nl_msg_tail(nlh);
1787 	nlh->nlmsg_len += sizeof(struct genlmsghdr);
1788 	genl->cmd = DEVLINK_CMD_RELOAD;
1789 	genl->version = DEVLINK_GENL_VERSION;
1790 	nl_attr_put(nlh, DEVLINK_ATTR_BUS_NAME, "pci", 4);
1791 	nl_attr_put(nlh, DEVLINK_ATTR_DEV_NAME, pci_addr, strlen(pci_addr) + 1);
1792 	ret = mlx5_nl_send(nlsk_fd, nlh, sn);
1793 	if (ret >= 0)
1794 		ret = mlx5_nl_recv(nlsk_fd, sn, NULL, NULL);
1795 	if (ret < 0) {
1796 		DRV_LOG(DEBUG, "Failed to reload %s device by Netlink - %d",
1797 			pci_addr, ret);
1798 		return ret;
1799 	}
1800 	DRV_LOG(DEBUG, "Device \"%s\" was reloaded by Netlink successfully.",
1801 		pci_addr);
1802 	return 0;
1803 }
1804 
1805 /**
1806  * Set ROCE enable status through Netlink.
1807  *
1808  * @param[in] nlsk_fd
1809  *   Netlink socket file descriptor.
1810  * @param[in] family_id
1811  *   the Devlink family ID.
1812  * @param pci_addr
1813  *   The device PCI address.
1814  * @param[out] enable
1815  *   The enable status to set.
1816  *
1817  * @return
1818  *   0 on success, a negative errno value otherwise and rte_errno is set.
1819  */
1820 int
1821 mlx5_nl_enable_roce_set(int nlsk_fd, int family_id, const char *pci_addr,
1822 			int enable)
1823 {
1824 	struct nlmsghdr *nlh;
1825 	struct genlmsghdr *genl;
1826 	uint32_t sn = MLX5_NL_SN_GENERATE;
1827 	int ret;
1828 	uint8_t buf[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
1829 		    NLMSG_ALIGN(sizeof(struct genlmsghdr)) +
1830 		    NLMSG_ALIGN(sizeof(struct nlattr)) * 6 +
1831 		    NLMSG_ALIGN(MLX5_NL_MAX_ATTR_SIZE) * 6];
1832 	uint8_t cmode = DEVLINK_PARAM_CMODE_DRIVERINIT;
1833 	uint8_t ptype = NLA_FLAG;
1834 ;
1835 
1836 	memset(buf, 0, sizeof(buf));
1837 	nlh = (struct nlmsghdr *)buf;
1838 	nlh->nlmsg_len = sizeof(struct nlmsghdr);
1839 	nlh->nlmsg_type = family_id;
1840 	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
1841 	genl = (struct genlmsghdr *)nl_msg_tail(nlh);
1842 	nlh->nlmsg_len += sizeof(struct genlmsghdr);
1843 	genl->cmd = DEVLINK_CMD_PARAM_SET;
1844 	genl->version = DEVLINK_GENL_VERSION;
1845 	nl_attr_put(nlh, DEVLINK_ATTR_BUS_NAME, "pci", 4);
1846 	nl_attr_put(nlh, DEVLINK_ATTR_DEV_NAME, pci_addr, strlen(pci_addr) + 1);
1847 	nl_attr_put(nlh, DEVLINK_ATTR_PARAM_NAME, "enable_roce", 12);
1848 	nl_attr_put(nlh, DEVLINK_ATTR_PARAM_VALUE_CMODE, &cmode, sizeof(cmode));
1849 	nl_attr_put(nlh, DEVLINK_ATTR_PARAM_TYPE, &ptype, sizeof(ptype));
1850 	if (enable)
1851 		nl_attr_put(nlh, DEVLINK_ATTR_PARAM_VALUE_DATA, NULL, 0);
1852 	ret = mlx5_nl_send(nlsk_fd, nlh, sn);
1853 	if (ret >= 0)
1854 		ret = mlx5_nl_recv(nlsk_fd, sn, NULL, NULL);
1855 	if (ret < 0) {
1856 		DRV_LOG(DEBUG, "Failed to %sable ROCE for device %s by Netlink:"
1857 			" %d.", enable ? "en" : "dis", pci_addr, ret);
1858 		return ret;
1859 	}
1860 	DRV_LOG(DEBUG, "Device %s ROCE was %sabled by Netlink successfully.",
1861 		pci_addr, enable ? "en" : "dis");
1862 	/* Now, need to reload the driver. */
1863 	return mlx5_nl_driver_reload(nlsk_fd, family_id, pci_addr);
1864 }
1865