xref: /dpdk/drivers/common/mlx5/linux/mlx5_nl.c (revision e12a0166c80f65e35408f4715b2f3a60763c3741)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright 2018 6WIND S.A.
3  * Copyright 2018 Mellanox Technologies, Ltd
4  */
5 
6 #include <errno.h>
7 #include <linux/if_link.h>
8 #include <linux/rtnetlink.h>
9 #include <linux/genetlink.h>
10 #include <net/if.h>
11 #include <rdma/rdma_netlink.h>
12 #include <stdbool.h>
13 #include <stdint.h>
14 #include <stdlib.h>
15 #include <stdalign.h>
16 #include <string.h>
17 #include <sys/socket.h>
18 #include <unistd.h>
19 
20 #include <rte_errno.h>
21 
22 #include "mlx5_nl.h"
23 #include "../mlx5_common_log.h"
24 #include "mlx5_malloc.h"
25 #ifdef HAVE_DEVLINK
26 #include <linux/devlink.h>
27 #endif
28 
29 
/* Size of the buffer to receive kernel messages. */
#define MLX5_NL_BUF_SIZE (32 * 1024)
/* Send buffer size for the Netlink socket. */
#define MLX5_SEND_BUF_SIZE (32 * 1024)
/* Receive buffer size for the Netlink socket. */
#define MLX5_RECV_BUF_SIZE (32 * 1024)
/* Maximal physical port name length. */
#define MLX5_PHYS_PORT_NAME_MAX 128

/** Name prefix of the VLAN devices created by the driver. */
#define MLX5_VMWA_VLAN_DEVICE_PFX "evmlx"

/*
 * Pointer to the first route attribute located after a struct ndmsg.
 *
 * Same definition as NDA_RTA in the iproute2 sources,
 * see include/libnetlink.h there.
 */
#ifndef MLX5_NDA_RTA
#define MLX5_NDA_RTA(r) \
	((struct rtattr *)((char *)(r) + NLMSG_ALIGN(sizeof(struct ndmsg))))
#endif

/*
 * Pointer right past the (aligned) end of a Netlink message.
 *
 * Same definition as NLMSG_TAIL in the iproute2 sources,
 * see include/libnetlink.h there.
 */
#ifndef NLMSG_TAIL
#define NLMSG_TAIL(nmsg) \
	((struct rtattr *)((char *)(nmsg) + NLMSG_ALIGN((nmsg)->nlmsg_len)))
#endif
/*
 * Fallback values for definitions normally provided by
 * rdma/rdma_netlink.h; they are recent enough that many systems
 * do not expose them yet.
 */

/* Netlink client and commands. */
#ifndef HAVE_RDMA_NL_NLDEV
#define RDMA_NL_NLDEV 5
#endif
#ifndef HAVE_RDMA_NLDEV_CMD_GET
#define RDMA_NLDEV_CMD_GET 1
#endif
#ifndef HAVE_RDMA_NLDEV_CMD_PORT_GET
#define RDMA_NLDEV_CMD_PORT_GET 5
#endif

/* Attribute types. */
#ifndef HAVE_RDMA_NLDEV_ATTR_DEV_INDEX
#define RDMA_NLDEV_ATTR_DEV_INDEX 1
#endif
#ifndef HAVE_RDMA_NLDEV_ATTR_DEV_NAME
#define RDMA_NLDEV_ATTR_DEV_NAME 2
#endif
#ifndef HAVE_RDMA_NLDEV_ATTR_PORT_INDEX
#define RDMA_NLDEV_ATTR_PORT_INDEX 3
#endif
#ifndef HAVE_RDMA_NLDEV_ATTR_PORT_STATE
#define RDMA_NLDEV_ATTR_PORT_STATE 12
#endif
#ifndef HAVE_RDMA_NLDEV_ATTR_NDEV_INDEX
#define RDMA_NLDEV_ATTR_NDEV_INDEX 50
#endif
87 
/*
 * Fallback values for link attributes normally found
 * in linux/if_link.h.
 */
#ifndef HAVE_IFLA_NUM_VF
#define IFLA_NUM_VF 21
#endif
#ifndef HAVE_IFLA_EXT_MASK
#define IFLA_EXT_MASK 29
#endif
#ifndef HAVE_IFLA_PHYS_SWITCH_ID
#define IFLA_PHYS_SWITCH_ID 36
#endif
#ifndef HAVE_IFLA_PHYS_PORT_NAME
#define IFLA_PHYS_PORT_NAME 38
#endif
101 
/*
 * Some Devlink definitions may be missing from the headers of old
 * kernel versions; provide the values used by this file when absent.
 */

/* Generic Netlink family identification. */
#ifndef DEVLINK_GENL_NAME
#define DEVLINK_GENL_NAME "devlink"
#endif
#ifndef DEVLINK_GENL_VERSION
#define DEVLINK_GENL_VERSION 1
#endif

/* Attribute types. */
#ifndef DEVLINK_ATTR_BUS_NAME
#define DEVLINK_ATTR_BUS_NAME 1
#endif
#ifndef DEVLINK_ATTR_DEV_NAME
#define DEVLINK_ATTR_DEV_NAME 2
#endif
#ifndef DEVLINK_ATTR_PARAM
#define DEVLINK_ATTR_PARAM 80
#endif
#ifndef DEVLINK_ATTR_PARAM_NAME
#define DEVLINK_ATTR_PARAM_NAME 81
#endif
#ifndef DEVLINK_ATTR_PARAM_TYPE
#define DEVLINK_ATTR_PARAM_TYPE 83
#endif
#ifndef DEVLINK_ATTR_PARAM_VALUES_LIST
#define DEVLINK_ATTR_PARAM_VALUES_LIST 84
#endif
#ifndef DEVLINK_ATTR_PARAM_VALUE
#define DEVLINK_ATTR_PARAM_VALUE 85
#endif
#ifndef DEVLINK_ATTR_PARAM_VALUE_DATA
#define DEVLINK_ATTR_PARAM_VALUE_DATA 86
#endif
#ifndef DEVLINK_ATTR_PARAM_VALUE_CMODE
#define DEVLINK_ATTR_PARAM_VALUE_CMODE 87
#endif

/* Parameter configuration mode. */
#ifndef DEVLINK_PARAM_CMODE_DRIVERINIT
#define DEVLINK_PARAM_CMODE_DRIVERINIT 1
#endif

/* Commands. */
#ifndef DEVLINK_CMD_RELOAD
#define DEVLINK_CMD_RELOAD 37
#endif
#ifndef DEVLINK_CMD_PARAM_GET
#define DEVLINK_CMD_PARAM_GET 38
#endif
#ifndef DEVLINK_CMD_PARAM_SET
#define DEVLINK_CMD_PARAM_SET 39
#endif

/* Netlink attribute data type. */
#ifndef NLA_FLAG
#define NLA_FLAG 6
#endif
154 
155 /* Add/remove MAC address through Netlink */
156 struct mlx5_nl_mac_addr {
157 	struct rte_ether_addr (*mac)[];
158 	/**< MAC address handled by the device. */
159 	int mac_n; /**< Number of addresses in the array. */
160 };
161 
/* Attribute flags reported in the flags field of mlx5_nl_port_info. */
#define MLX5_NL_CMD_GET_IB_NAME (1 << 0)
#define MLX5_NL_CMD_GET_IB_INDEX (1 << 1)
#define MLX5_NL_CMD_GET_NET_INDEX (1 << 2)
#define MLX5_NL_CMD_GET_PORT_INDEX (1 << 3)
#define MLX5_NL_CMD_GET_PORT_STATE (1 << 4)

/** Data structure used by mlx5_nl_cmdget_cb(). */
struct mlx5_nl_port_info {
	/** IB device name (in). */
	const char *name;
	/** Found attribute flags, MLX5_NL_CMD_GET_* (out). */
	uint32_t flags;
	/** IB device index (out). */
	uint32_t ibindex;
	/** Network interface index (out). */
	uint32_t ifindex;
	/** IB device max port number (out). */
	uint32_t portnum;
	/** IB device port state (out). */
	uint16_t state;
};
177 
178 RTE_ATOMIC(uint32_t) atomic_sn;
179 
180 /* Generate Netlink sequence number. */
181 #define MLX5_NL_SN_GENERATE (rte_atomic_fetch_add_explicit(&atomic_sn, 1, \
182 	rte_memory_order_relaxed) + 1)
183 
184 /**
185  * Opens a Netlink socket.
186  *
187  * @param protocol
188  *   Netlink protocol (e.g. NETLINK_ROUTE, NETLINK_RDMA).
189  * @param groups
190  *   Groups to listen (e.g. RTMGRP_LINK), can be 0.
191  *
192  * @return
193  *   A file descriptor on success, a negative errno value otherwise and
194  *   rte_errno is set.
195  */
196 int
mlx5_nl_init(int protocol,int groups)197 mlx5_nl_init(int protocol, int groups)
198 {
199 	int fd;
200 	int buf_size;
201 	socklen_t opt_size;
202 	struct sockaddr_nl local = {
203 		.nl_family = AF_NETLINK,
204 		.nl_groups = groups,
205 	};
206 	int ret;
207 
208 	fd = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, protocol);
209 	if (fd == -1) {
210 		rte_errno = errno;
211 		return -rte_errno;
212 	}
213 	opt_size = sizeof(buf_size);
214 	ret = getsockopt(fd, SOL_SOCKET, SO_SNDBUF, &buf_size, &opt_size);
215 	if (ret == -1) {
216 		rte_errno = errno;
217 		goto error;
218 	}
219 	DRV_LOG(DEBUG, "Netlink socket send buffer: %d", buf_size);
220 	if (buf_size < MLX5_SEND_BUF_SIZE) {
221 		ret = setsockopt(fd, SOL_SOCKET, SO_SNDBUF,
222 				 &buf_size, sizeof(buf_size));
223 		if (ret == -1) {
224 			rte_errno = errno;
225 			goto error;
226 		}
227 	}
228 	opt_size = sizeof(buf_size);
229 	ret = getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &buf_size, &opt_size);
230 	if (ret == -1) {
231 		rte_errno = errno;
232 		goto error;
233 	}
234 	DRV_LOG(DEBUG, "Netlink socket recv buffer: %d", buf_size);
235 	if (buf_size < MLX5_RECV_BUF_SIZE) {
236 		ret = setsockopt(fd, SOL_SOCKET, SO_RCVBUF,
237 				 &buf_size, sizeof(buf_size));
238 		if (ret == -1) {
239 			rte_errno = errno;
240 			goto error;
241 		}
242 	}
243 	ret = bind(fd, (struct sockaddr *)&local, sizeof(local));
244 	if (ret == -1) {
245 		rte_errno = errno;
246 		goto error;
247 	}
248 	return fd;
249 error:
250 	close(fd);
251 	return -rte_errno;
252 }
253 
254 /**
255  * Send a request message to the kernel on the Netlink socket.
256  *
257  * @param[in] nlsk_fd
258  *   Netlink socket file descriptor.
259  * @param[in] nh
260  *   The Netlink message send to the kernel.
261  * @param[in] ssn
262  *   Sequence number.
263  * @param[in] req
264  *   Pointer to the request structure.
265  * @param[in] len
266  *   Length of the request in bytes.
267  *
268  * @return
269  *   The number of sent bytes on success, a negative errno value otherwise and
270  *   rte_errno is set.
271  */
272 static int
mlx5_nl_request(int nlsk_fd,struct nlmsghdr * nh,uint32_t sn,void * req,int len)273 mlx5_nl_request(int nlsk_fd, struct nlmsghdr *nh, uint32_t sn, void *req,
274 		int len)
275 {
276 	struct sockaddr_nl sa = {
277 		.nl_family = AF_NETLINK,
278 	};
279 	struct iovec iov[2] = {
280 		{ .iov_base = nh, .iov_len = sizeof(*nh), },
281 		{ .iov_base = req, .iov_len = len, },
282 	};
283 	struct msghdr msg = {
284 		.msg_name = &sa,
285 		.msg_namelen = sizeof(sa),
286 		.msg_iov = iov,
287 		.msg_iovlen = 2,
288 	};
289 	int send_bytes;
290 
291 	nh->nlmsg_pid = 0; /* communication with the kernel uses pid 0 */
292 	nh->nlmsg_seq = sn;
293 	send_bytes = sendmsg(nlsk_fd, &msg, 0);
294 	if (send_bytes < 0) {
295 		rte_errno = errno;
296 		return -rte_errno;
297 	}
298 	return send_bytes;
299 }
300 
301 /**
302  * Send a message to the kernel on the Netlink socket.
303  *
304  * @param[in] nlsk_fd
305  *   The Netlink socket file descriptor used for communication.
306  * @param[in] nh
307  *   The Netlink message send to the kernel.
308  * @param[in] sn
309  *   Sequence number.
310  *
311  * @return
312  *   The number of sent bytes on success, a negative errno value otherwise and
313  *   rte_errno is set.
314  */
315 static int
mlx5_nl_send(int nlsk_fd,struct nlmsghdr * nh,uint32_t sn)316 mlx5_nl_send(int nlsk_fd, struct nlmsghdr *nh, uint32_t sn)
317 {
318 	struct sockaddr_nl sa = {
319 		.nl_family = AF_NETLINK,
320 	};
321 	struct iovec iov = {
322 		.iov_base = nh,
323 		.iov_len = nh->nlmsg_len,
324 	};
325 	struct msghdr msg = {
326 		.msg_name = &sa,
327 		.msg_namelen = sizeof(sa),
328 		.msg_iov = &iov,
329 		.msg_iovlen = 1,
330 	};
331 	int send_bytes;
332 
333 	nh->nlmsg_pid = 0; /* communication with the kernel uses pid 0 */
334 	nh->nlmsg_seq = sn;
335 	send_bytes = sendmsg(nlsk_fd, &msg, 0);
336 	if (send_bytes < 0) {
337 		rte_errno = errno;
338 		return -rte_errno;
339 	}
340 	return send_bytes;
341 }
342 
343 /**
344  * Receive a message from the kernel on the Netlink socket, following
345  * mlx5_nl_send().
346  *
347  * @param[in] nlsk_fd
348  *   The Netlink socket file descriptor used for communication.
349  * @param[in] sn
350  *   Sequence number.
351  * @param[in] cb
352  *   The callback function to call for each Netlink message received.
353  * @param[in, out] arg
354  *   Custom arguments for the callback.
355  *
356  * @return
357  *   0 on success, a negative errno value otherwise and rte_errno is set.
358  */
359 static int
mlx5_nl_recv(int nlsk_fd,uint32_t sn,int (* cb)(struct nlmsghdr *,void * arg),void * arg)360 mlx5_nl_recv(int nlsk_fd, uint32_t sn, int (*cb)(struct nlmsghdr *, void *arg),
361 	     void *arg)
362 {
363 	struct sockaddr_nl sa;
364 	struct iovec iov;
365 	struct msghdr msg = {
366 		.msg_name = &sa,
367 		.msg_namelen = sizeof(sa),
368 		.msg_iov = &iov,
369 		/* One message at a time */
370 		.msg_iovlen = 1,
371 	};
372 	void *buf = NULL;
373 	int multipart = 0;
374 	int ret = 0;
375 
376 	do {
377 		struct nlmsghdr *nh;
378 		int recv_bytes;
379 
380 		do {
381 			/* Query length of incoming message. */
382 			iov.iov_base = NULL;
383 			iov.iov_len = 0;
384 			recv_bytes = recvmsg(nlsk_fd, &msg,
385 					     MSG_PEEK | MSG_TRUNC);
386 			if (recv_bytes < 0) {
387 				rte_errno = errno;
388 				ret = -rte_errno;
389 				goto exit;
390 			}
391 			if (recv_bytes == 0) {
392 				rte_errno = ENODATA;
393 				ret = -rte_errno;
394 				goto exit;
395 			}
396 			/* Allocate buffer to fetch the message. */
397 			if (recv_bytes < MLX5_RECV_BUF_SIZE)
398 				recv_bytes = MLX5_RECV_BUF_SIZE;
399 			mlx5_free(buf);
400 			buf = mlx5_malloc(0, recv_bytes, 0, SOCKET_ID_ANY);
401 			if (!buf) {
402 				rte_errno = ENOMEM;
403 				ret = -rte_errno;
404 				goto exit;
405 			}
406 			/* Fetch the message. */
407 			iov.iov_base = buf;
408 			iov.iov_len = recv_bytes;
409 			recv_bytes = recvmsg(nlsk_fd, &msg, 0);
410 			if (recv_bytes == -1) {
411 				rte_errno = errno;
412 				ret = -rte_errno;
413 				goto exit;
414 			}
415 			nh = (struct nlmsghdr *)buf;
416 		} while (nh->nlmsg_seq != sn);
417 		for (;
418 		     NLMSG_OK(nh, (unsigned int)recv_bytes);
419 		     nh = NLMSG_NEXT(nh, recv_bytes)) {
420 			if (nh->nlmsg_type == NLMSG_ERROR) {
421 				struct nlmsgerr *err_data = NLMSG_DATA(nh);
422 
423 				if (err_data->error < 0) {
424 					rte_errno = -err_data->error;
425 					ret = -rte_errno;
426 					goto exit;
427 				}
428 				/* Ack message. */
429 				ret = 0;
430 				goto exit;
431 			}
432 			/* Multi-part msgs and their trailing DONE message. */
433 			if (nh->nlmsg_flags & NLM_F_MULTI) {
434 				if (nh->nlmsg_type == NLMSG_DONE) {
435 					ret =  0;
436 					goto exit;
437 				}
438 				multipart = 1;
439 			}
440 			if (cb) {
441 				ret = cb(nh, arg);
442 				if (ret < 0)
443 					goto exit;
444 			}
445 		}
446 	} while (multipart);
447 exit:
448 	mlx5_free(buf);
449 	return ret;
450 }
451 
452 /**
453  * Parse Netlink message to retrieve the bridge MAC address.
454  *
455  * @param nh
456  *   Pointer to Netlink Message Header.
457  * @param arg
458  *   PMD data register with this callback.
459  *
460  * @return
461  *   0 on success, a negative errno value otherwise and rte_errno is set.
462  */
463 static int
mlx5_nl_mac_addr_cb(struct nlmsghdr * nh,void * arg)464 mlx5_nl_mac_addr_cb(struct nlmsghdr *nh, void *arg)
465 {
466 	struct mlx5_nl_mac_addr *data = arg;
467 	struct ndmsg *r = NLMSG_DATA(nh);
468 	struct rtattr *attribute;
469 	int len;
470 
471 	len = nh->nlmsg_len - NLMSG_LENGTH(sizeof(*r));
472 	for (attribute = MLX5_NDA_RTA(r);
473 	     RTA_OK(attribute, len);
474 	     attribute = RTA_NEXT(attribute, len)) {
475 		if (attribute->rta_type == NDA_LLADDR) {
476 			if (data->mac_n == MLX5_MAX_MAC_ADDRESSES) {
477 				DRV_LOG(WARNING,
478 					"not enough room to finalize the"
479 					" request");
480 				rte_errno = ENOMEM;
481 				return -rte_errno;
482 			}
483 #ifdef RTE_LIBRTE_MLX5_DEBUG
484 			char m[RTE_ETHER_ADDR_FMT_SIZE];
485 
486 			rte_ether_format_addr(m, RTE_ETHER_ADDR_FMT_SIZE,
487 					      RTA_DATA(attribute));
488 			DRV_LOG(DEBUG, "bridge MAC address %s", m);
489 #endif
490 			memcpy(&(*data->mac)[data->mac_n++],
491 			       RTA_DATA(attribute), RTE_ETHER_ADDR_LEN);
492 		}
493 	}
494 	return 0;
495 }
496 
497 /**
498  * Get bridge MAC addresses.
499  *
500  * @param[in] nlsk_fd
501  *   Netlink socket file descriptor.
502  * @param[in] iface_idx
503  *   Net device interface index.
504  * @param mac[out]
505  *   Pointer to the array table of MAC addresses to fill.
506  *   Its size should be of MLX5_MAX_MAC_ADDRESSES.
507  * @param mac_n[out]
508  *   Number of entries filled in MAC array.
509  *
510  * @return
511  *   0 on success, a negative errno value otherwise and rte_errno is set.
512  */
513 static int
mlx5_nl_mac_addr_list(int nlsk_fd,unsigned int iface_idx,struct rte_ether_addr (* mac)[],int * mac_n)514 mlx5_nl_mac_addr_list(int nlsk_fd, unsigned int iface_idx,
515 		      struct rte_ether_addr (*mac)[], int *mac_n)
516 {
517 	struct {
518 		struct nlmsghdr	hdr;
519 		struct ifinfomsg ifm;
520 	} req = {
521 		.hdr = {
522 			.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
523 			.nlmsg_type = RTM_GETNEIGH,
524 			.nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST,
525 		},
526 		.ifm = {
527 			.ifi_family = PF_BRIDGE,
528 			.ifi_index = iface_idx,
529 		},
530 	};
531 	struct mlx5_nl_mac_addr data = {
532 		.mac = mac,
533 		.mac_n = 0,
534 	};
535 	uint32_t sn = MLX5_NL_SN_GENERATE;
536 	int ret;
537 
538 	if (nlsk_fd == -1)
539 		return 0;
540 	ret = mlx5_nl_request(nlsk_fd, &req.hdr, sn, &req.ifm,
541 			      sizeof(struct ifinfomsg));
542 	if (ret < 0)
543 		goto error;
544 	ret = mlx5_nl_recv(nlsk_fd, sn, mlx5_nl_mac_addr_cb, &data);
545 	if (ret < 0)
546 		goto error;
547 	*mac_n = data.mac_n;
548 	return 0;
549 error:
550 	DRV_LOG(DEBUG, "Interface %u cannot retrieve MAC address list %s",
551 		iface_idx, strerror(rte_errno));
552 	return -rte_errno;
553 }
554 
555 /**
556  * Modify the MAC address neighbour table with Netlink.
557  *
558  * @param[in] nlsk_fd
559  *   Netlink socket file descriptor.
560  * @param[in] iface_idx
561  *   Net device interface index.
562  * @param mac
563  *   MAC address to consider.
564  * @param add
565  *   1 to add the MAC address, 0 to remove the MAC address.
566  *
567  * @return
568  *   0 on success, a negative errno value otherwise and rte_errno is set.
569  */
570 static int
mlx5_nl_mac_addr_modify(int nlsk_fd,unsigned int iface_idx,struct rte_ether_addr * mac,int add)571 mlx5_nl_mac_addr_modify(int nlsk_fd, unsigned int iface_idx,
572 			struct rte_ether_addr *mac, int add)
573 {
574 	struct {
575 		struct nlmsghdr hdr;
576 		struct ndmsg ndm;
577 		struct rtattr rta;
578 		uint8_t buffer[RTE_ETHER_ADDR_LEN];
579 	} req = {
580 		.hdr = {
581 			.nlmsg_len = NLMSG_LENGTH(sizeof(struct ndmsg)),
582 			.nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE |
583 				NLM_F_EXCL | NLM_F_ACK,
584 			.nlmsg_type = add ? RTM_NEWNEIGH : RTM_DELNEIGH,
585 		},
586 		.ndm = {
587 			.ndm_family = PF_BRIDGE,
588 			.ndm_state = NUD_NOARP | NUD_PERMANENT,
589 			.ndm_ifindex = iface_idx,
590 			.ndm_flags = NTF_SELF,
591 		},
592 		.rta = {
593 			.rta_type = NDA_LLADDR,
594 			.rta_len = RTA_LENGTH(RTE_ETHER_ADDR_LEN),
595 		},
596 	};
597 	uint32_t sn = MLX5_NL_SN_GENERATE;
598 	int ret;
599 
600 	if (nlsk_fd == -1)
601 		return 0;
602 	memcpy(RTA_DATA(&req.rta), mac, RTE_ETHER_ADDR_LEN);
603 	req.hdr.nlmsg_len = NLMSG_ALIGN(req.hdr.nlmsg_len) +
604 		RTA_ALIGN(req.rta.rta_len);
605 	ret = mlx5_nl_send(nlsk_fd, &req.hdr, sn);
606 	if (ret < 0)
607 		goto error;
608 	ret = mlx5_nl_recv(nlsk_fd, sn, NULL, NULL);
609 	if (ret < 0)
610 		goto error;
611 	return 0;
612 error:
613 #ifdef RTE_LIBRTE_MLX5_DEBUG
614 	{
615 		char m[RTE_ETHER_ADDR_FMT_SIZE];
616 
617 		rte_ether_format_addr(m, RTE_ETHER_ADDR_FMT_SIZE, mac);
618 		DRV_LOG(DEBUG,
619 			"Interface %u cannot %s MAC address %s %s",
620 			iface_idx,
621 			add ? "add" : "remove", m, strerror(rte_errno));
622 	}
623 #endif
624 	return -rte_errno;
625 }
626 
627 /**
628  * Modify the VF MAC address neighbour table with Netlink.
629  *
630  * @param[in] nlsk_fd
631  *   Netlink socket file descriptor.
632  * @param[in] iface_idx
633  *   Net device interface index.
634  * @param mac
635  *    MAC address to consider.
636  * @param vf_index
637  *    VF index.
638  *
639  * @return
640  *    0 on success, a negative errno value otherwise and rte_errno is set.
641  */
642 int
mlx5_nl_vf_mac_addr_modify(int nlsk_fd,unsigned int iface_idx,struct rte_ether_addr * mac,int vf_index)643 mlx5_nl_vf_mac_addr_modify(int nlsk_fd, unsigned int iface_idx,
644 			   struct rte_ether_addr *mac, int vf_index)
645 {
646 	int ret;
647 	struct {
648 		struct nlmsghdr hdr;
649 		struct ifinfomsg ifm;
650 		struct rtattr vf_list_rta;
651 		struct rtattr vf_info_rta;
652 		struct rtattr vf_mac_rta;
653 		struct ifla_vf_mac ivm;
654 	} req = {
655 		.hdr = {
656 			.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
657 			.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK,
658 			.nlmsg_type = RTM_BASE,
659 		},
660 		.ifm = {
661 			.ifi_index = iface_idx,
662 		},
663 		.vf_list_rta = {
664 			.rta_type = IFLA_VFINFO_LIST,
665 			.rta_len = RTA_ALIGN(RTA_LENGTH(0)),
666 		},
667 		.vf_info_rta = {
668 			.rta_type = IFLA_VF_INFO,
669 			.rta_len = RTA_ALIGN(RTA_LENGTH(0)),
670 		},
671 		.vf_mac_rta = {
672 			.rta_type = IFLA_VF_MAC,
673 		},
674 	};
675 	struct ifla_vf_mac ivm = {
676 		.vf = vf_index,
677 	};
678 	uint32_t sn = MLX5_NL_SN_GENERATE;
679 
680 	memcpy(&ivm.mac, mac, RTE_ETHER_ADDR_LEN);
681 	memcpy(RTA_DATA(&req.vf_mac_rta), &ivm, sizeof(ivm));
682 
683 	req.vf_mac_rta.rta_len = RTA_LENGTH(sizeof(ivm));
684 	req.hdr.nlmsg_len = NLMSG_ALIGN(req.hdr.nlmsg_len) +
685 		RTA_ALIGN(req.vf_list_rta.rta_len) +
686 		RTA_ALIGN(req.vf_info_rta.rta_len) +
687 		RTA_ALIGN(req.vf_mac_rta.rta_len);
688 	req.vf_list_rta.rta_len = RTE_PTR_DIFF(NLMSG_TAIL(&req.hdr),
689 					       &req.vf_list_rta);
690 	req.vf_info_rta.rta_len = RTE_PTR_DIFF(NLMSG_TAIL(&req.hdr),
691 					       &req.vf_info_rta);
692 
693 	if (nlsk_fd < 0)
694 		return -1;
695 	ret = mlx5_nl_send(nlsk_fd, &req.hdr, sn);
696 	if (ret < 0)
697 		goto error;
698 	ret = mlx5_nl_recv(nlsk_fd, sn, NULL, NULL);
699 	if (ret < 0)
700 		goto error;
701 	return 0;
702 error:
703 	DRV_LOG(ERR,
704 		"representor %u cannot set VF MAC address "
705 		RTE_ETHER_ADDR_PRT_FMT " : %s",
706 		vf_index,
707 		RTE_ETHER_ADDR_BYTES(mac),
708 		strerror(rte_errno));
709 	return -rte_errno;
710 }
711 
712 /**
713  * Add a MAC address.
714  *
715  * @param[in] nlsk_fd
716  *   Netlink socket file descriptor.
717  * @param[in] iface_idx
718  *   Net device interface index.
719  * @param mac_own
720  *   BITFIELD_DECLARE array to store the mac.
721  * @param mac
722  *   MAC address to register.
723  * @param index
724  *   MAC address index.
725  *
726  * @return
727  *   0 on success, a negative errno value otherwise and rte_errno is set.
728  */
729 int
mlx5_nl_mac_addr_add(int nlsk_fd,unsigned int iface_idx,uint64_t * mac_own,struct rte_ether_addr * mac,uint32_t index)730 mlx5_nl_mac_addr_add(int nlsk_fd, unsigned int iface_idx,
731 		     uint64_t *mac_own, struct rte_ether_addr *mac,
732 		     uint32_t index)
733 {
734 	int ret;
735 
736 	ret = mlx5_nl_mac_addr_modify(nlsk_fd, iface_idx, mac, 1);
737 	if (!ret) {
738 		MLX5_ASSERT(index < MLX5_MAX_MAC_ADDRESSES);
739 		if (index >= MLX5_MAX_MAC_ADDRESSES)
740 			return -EINVAL;
741 
742 		BITFIELD_SET(mac_own, index);
743 	}
744 	if (ret == -EEXIST)
745 		return 0;
746 	return ret;
747 }
748 
749 /**
750  * Remove a MAC address.
751  *
752  * @param[in] nlsk_fd
753  *   Netlink socket file descriptor.
754  * @param[in] iface_idx
755  *   Net device interface index.
756  * @param mac_own
757  *   BITFIELD_DECLARE array to store the mac.
758  * @param mac
759  *   MAC address to remove.
760  * @param index
761  *   MAC address index.
762  *
763  * @return
764  *   0 on success, a negative errno value otherwise and rte_errno is set.
765  */
766 int
mlx5_nl_mac_addr_remove(int nlsk_fd,unsigned int iface_idx,uint64_t * mac_own,struct rte_ether_addr * mac,uint32_t index)767 mlx5_nl_mac_addr_remove(int nlsk_fd, unsigned int iface_idx, uint64_t *mac_own,
768 			struct rte_ether_addr *mac, uint32_t index)
769 {
770 	MLX5_ASSERT(index < MLX5_MAX_MAC_ADDRESSES);
771 	if (index >= MLX5_MAX_MAC_ADDRESSES)
772 		return -EINVAL;
773 
774 	BITFIELD_RESET(mac_own, index);
775 	return mlx5_nl_mac_addr_modify(nlsk_fd, iface_idx, mac, 0);
776 }
777 
778 /**
779  * Synchronize Netlink bridge table to the internal table.
780  *
781  * @param[in] nlsk_fd
782  *   Netlink socket file descriptor.
783  * @param[in] iface_idx
784  *   Net device interface index.
785  * @param mac_addrs
786  *   Mac addresses array to sync.
787  * @param n
788  *   @p mac_addrs array size.
789  */
790 void
mlx5_nl_mac_addr_sync(int nlsk_fd,unsigned int iface_idx,struct rte_ether_addr * mac_addrs,int n)791 mlx5_nl_mac_addr_sync(int nlsk_fd, unsigned int iface_idx,
792 		      struct rte_ether_addr *mac_addrs, int n)
793 {
794 	struct rte_ether_addr macs[n];
795 	int macs_n = 0;
796 	int i;
797 	int ret;
798 
799 	memset(macs, 0, n * sizeof(macs[0]));
800 	ret = mlx5_nl_mac_addr_list(nlsk_fd, iface_idx, &macs, &macs_n);
801 	if (ret)
802 		return;
803 	for (i = 0; i != macs_n; ++i) {
804 		int j;
805 
806 		/* Verify the address is not in the array yet. */
807 		for (j = 0; j != n; ++j)
808 			if (rte_is_same_ether_addr(&macs[i], &mac_addrs[j]))
809 				break;
810 		if (j != n)
811 			continue;
812 		if (rte_is_multicast_ether_addr(&macs[i])) {
813 			/* Find the first entry available. */
814 			for (j = MLX5_MAX_UC_MAC_ADDRESSES; j != n; ++j) {
815 				if (rte_is_zero_ether_addr(&mac_addrs[j])) {
816 					mac_addrs[j] = macs[i];
817 					break;
818 				}
819 			}
820 		} else {
821 			/* Find the first entry available. */
822 			for (j = 0; j != MLX5_MAX_UC_MAC_ADDRESSES; ++j) {
823 				if (rte_is_zero_ether_addr(&mac_addrs[j])) {
824 					mac_addrs[j] = macs[i];
825 					break;
826 				}
827 			}
828 		}
829 	}
830 }
831 
832 /**
833  * Flush all added MAC addresses.
834  *
835  * @param[in] nlsk_fd
836  *   Netlink socket file descriptor.
837  * @param[in] iface_idx
838  *   Net device interface index.
839  * @param[in] mac_addrs
840  *   Mac addresses array to flush.
841  * @param n
842  *   @p mac_addrs array size.
843  * @param mac_own
844  *   BITFIELD_DECLARE array to store the mac.
845  */
846 void
mlx5_nl_mac_addr_flush(int nlsk_fd,unsigned int iface_idx,struct rte_ether_addr * mac_addrs,int n,uint64_t * mac_own)847 mlx5_nl_mac_addr_flush(int nlsk_fd, unsigned int iface_idx,
848 		       struct rte_ether_addr *mac_addrs, int n,
849 		       uint64_t *mac_own)
850 {
851 	int i;
852 
853 	if (n <= 0 || n > MLX5_MAX_MAC_ADDRESSES)
854 		return;
855 
856 	for (i = n - 1; i >= 0; --i) {
857 		struct rte_ether_addr *m = &mac_addrs[i];
858 
859 		if (BITFIELD_ISSET(mac_own, i))
860 			mlx5_nl_mac_addr_remove(nlsk_fd, iface_idx, mac_own, m,
861 						i);
862 	}
863 }
864 
865 /**
866  * Enable promiscuous / all multicast mode through Netlink.
867  *
868  * @param[in] nlsk_fd
869  *   Netlink socket file descriptor.
870  * @param[in] iface_idx
871  *   Net device interface index.
872  * @param flags
873  *   IFF_PROMISC for promiscuous, IFF_ALLMULTI for allmulti.
874  * @param enable
875  *   Nonzero to enable, disable otherwise.
876  *
877  * @return
878  *   0 on success, a negative errno value otherwise and rte_errno is set.
879  */
880 static int
mlx5_nl_device_flags(int nlsk_fd,unsigned int iface_idx,uint32_t flags,int enable)881 mlx5_nl_device_flags(int nlsk_fd, unsigned int iface_idx, uint32_t flags,
882 		     int enable)
883 {
884 	struct {
885 		struct nlmsghdr hdr;
886 		struct ifinfomsg ifi;
887 	} req = {
888 		.hdr = {
889 			.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
890 			.nlmsg_type = RTM_NEWLINK,
891 			.nlmsg_flags = NLM_F_REQUEST,
892 		},
893 		.ifi = {
894 			.ifi_flags = enable ? flags : 0,
895 			.ifi_change = flags,
896 			.ifi_index = iface_idx,
897 		},
898 	};
899 	uint32_t sn = MLX5_NL_SN_GENERATE;
900 	int ret;
901 
902 	MLX5_ASSERT(!(flags & ~(IFF_PROMISC | IFF_ALLMULTI)));
903 	if (nlsk_fd < 0)
904 		return 0;
905 	ret = mlx5_nl_send(nlsk_fd, &req.hdr, sn);
906 	if (ret < 0)
907 		return ret;
908 	return 0;
909 }
910 
911 /**
912  * Enable promiscuous mode through Netlink.
913  *
914  * @param[in] nlsk_fd
915  *   Netlink socket file descriptor.
916  * @param[in] iface_idx
917  *   Net device interface index.
918  * @param enable
919  *   Nonzero to enable, disable otherwise.
920  *
921  * @return
922  *   0 on success, a negative errno value otherwise and rte_errno is set.
923  */
924 int
mlx5_nl_promisc(int nlsk_fd,unsigned int iface_idx,int enable)925 mlx5_nl_promisc(int nlsk_fd, unsigned int iface_idx, int enable)
926 {
927 	int ret = mlx5_nl_device_flags(nlsk_fd, iface_idx, IFF_PROMISC, enable);
928 
929 	if (ret)
930 		DRV_LOG(DEBUG,
931 			"Interface %u cannot %s promisc mode: Netlink error %s",
932 			iface_idx, enable ? "enable" : "disable",
933 			strerror(rte_errno));
934 	return ret;
935 }
936 
937 /**
938  * Enable all multicast mode through Netlink.
939  *
940  * @param[in] nlsk_fd
941  *   Netlink socket file descriptor.
942  * @param[in] iface_idx
943  *   Net device interface index.
944  * @param enable
945  *   Nonzero to enable, disable otherwise.
946  *
947  * @return
948  *   0 on success, a negative errno value otherwise and rte_errno is set.
949  */
950 int
mlx5_nl_allmulti(int nlsk_fd,unsigned int iface_idx,int enable)951 mlx5_nl_allmulti(int nlsk_fd, unsigned int iface_idx, int enable)
952 {
953 	int ret = mlx5_nl_device_flags(nlsk_fd, iface_idx, IFF_ALLMULTI,
954 				       enable);
955 
956 	if (ret)
957 		DRV_LOG(DEBUG,
958 			"Interface %u cannot %s allmulti : Netlink error %s",
959 			iface_idx, enable ? "enable" : "disable",
960 			strerror(rte_errno));
961 	return ret;
962 }
963 
964 /**
965  * Process network interface information from Netlink message.
966  *
967  * @param nh
968  *   Pointer to Netlink message header.
969  * @param arg
970  *   Opaque data pointer for this callback.
971  *
972  * @return
973  *   0 on success, a negative errno value otherwise and rte_errno is set.
974  */
975 static int
mlx5_nl_cmdget_cb(struct nlmsghdr * nh,void * arg)976 mlx5_nl_cmdget_cb(struct nlmsghdr *nh, void *arg)
977 {
978 	struct mlx5_nl_port_info *data = arg;
979 	struct mlx5_nl_port_info local = {
980 		.flags = 0,
981 	};
982 	size_t off = NLMSG_HDRLEN;
983 
984 	if (nh->nlmsg_type !=
985 	    RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_GET) &&
986 	    nh->nlmsg_type !=
987 	    RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_PORT_GET))
988 		goto error;
989 	while (off < nh->nlmsg_len) {
990 		struct nlattr *na = (void *)((uintptr_t)nh + off);
991 		void *payload = (void *)((uintptr_t)na + NLA_HDRLEN);
992 
993 		if (na->nla_len > nh->nlmsg_len - off)
994 			goto error;
995 		switch (na->nla_type) {
996 		case RDMA_NLDEV_ATTR_DEV_INDEX:
997 			local.ibindex = *(uint32_t *)payload;
998 			local.flags |= MLX5_NL_CMD_GET_IB_INDEX;
999 			break;
1000 		case RDMA_NLDEV_ATTR_DEV_NAME:
1001 			if (!strcmp(payload, data->name))
1002 				local.flags |= MLX5_NL_CMD_GET_IB_NAME;
1003 			break;
1004 		case RDMA_NLDEV_ATTR_NDEV_INDEX:
1005 			local.ifindex = *(uint32_t *)payload;
1006 			local.flags |= MLX5_NL_CMD_GET_NET_INDEX;
1007 			break;
1008 		case RDMA_NLDEV_ATTR_PORT_INDEX:
1009 			local.portnum = *(uint32_t *)payload;
1010 			local.flags |= MLX5_NL_CMD_GET_PORT_INDEX;
1011 			break;
1012 		case RDMA_NLDEV_ATTR_PORT_STATE:
1013 			local.state = *(uint8_t *)payload;
1014 			local.flags |= MLX5_NL_CMD_GET_PORT_STATE;
1015 			break;
1016 		default:
1017 			break;
1018 		}
1019 		off += NLA_ALIGN(na->nla_len);
1020 	}
1021 	/*
1022 	 * It is possible to have multiple messages for all
1023 	 * Infiniband devices in the system with appropriate name.
1024 	 * So we should gather parameters locally and copy to
1025 	 * query context only in case of coinciding device name.
1026 	 */
1027 	if (local.flags & MLX5_NL_CMD_GET_IB_NAME) {
1028 		data->flags = local.flags;
1029 		data->ibindex = local.ibindex;
1030 		data->ifindex = local.ifindex;
1031 		data->portnum = local.portnum;
1032 		data->state = local.state;
1033 	}
1034 	return 0;
1035 error:
1036 	rte_errno = EINVAL;
1037 	return -rte_errno;
1038 }
1039 
1040 /**
1041  * Get port info of network interface associated with some IB device.
1042  *
1043  * This is the only somewhat safe method to avoid resorting to heuristics
1044  * when faced with port representors. Unfortunately it requires at least
1045  * Linux 4.17.
1046  *
1047  * @param nl
1048  *   Netlink socket of the RDMA kind (NETLINK_RDMA).
1049  * @param[in] pindex
1050  *   IB device port index, starting from 1
1051  * @param[out] data
1052  *   Pointer to port info.
1053  * @return
1054  *   0 on success, negative on error and rte_errno is set.
1055  */
1056 static int
mlx5_nl_port_info(int nl,uint32_t pindex,struct mlx5_nl_port_info * data)1057 mlx5_nl_port_info(int nl, uint32_t pindex, struct mlx5_nl_port_info *data)
1058 {
1059 	union {
1060 		struct nlmsghdr nh;
1061 		uint8_t buf[NLMSG_HDRLEN +
1062 			    NLA_HDRLEN + NLA_ALIGN(sizeof(data->ibindex)) +
1063 			    NLA_HDRLEN + NLA_ALIGN(sizeof(pindex))];
1064 	} req = {
1065 		.nh = {
1066 			.nlmsg_len = NLMSG_LENGTH(0),
1067 			.nlmsg_type = RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
1068 						       RDMA_NLDEV_CMD_GET),
1069 			.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_DUMP,
1070 		},
1071 	};
1072 	struct nlattr *na;
1073 	uint32_t sn = MLX5_NL_SN_GENERATE;
1074 	int ret;
1075 
1076 	ret = mlx5_nl_send(nl, &req.nh, sn);
1077 	if (ret < 0)
1078 		return ret;
1079 	ret = mlx5_nl_recv(nl, sn, mlx5_nl_cmdget_cb, data);
1080 	if (ret < 0)
1081 		return ret;
1082 	if (!(data->flags & MLX5_NL_CMD_GET_IB_NAME) ||
1083 	    !(data->flags & MLX5_NL_CMD_GET_IB_INDEX))
1084 		goto error;
1085 	data->flags = 0;
1086 	sn = MLX5_NL_SN_GENERATE;
1087 	req.nh.nlmsg_type = RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
1088 					     RDMA_NLDEV_CMD_PORT_GET);
1089 	req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
1090 	req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(req.buf) - NLMSG_HDRLEN);
1091 	na = (void *)((uintptr_t)req.buf + NLMSG_HDRLEN);
1092 	na->nla_len = NLA_HDRLEN + sizeof(data->ibindex);
1093 	na->nla_type = RDMA_NLDEV_ATTR_DEV_INDEX;
1094 	memcpy((void *)((uintptr_t)na + NLA_HDRLEN),
1095 	       &data->ibindex, sizeof(data->ibindex));
1096 	na = (void *)((uintptr_t)na + NLA_ALIGN(na->nla_len));
1097 	na->nla_len = NLA_HDRLEN + sizeof(pindex);
1098 	na->nla_type = RDMA_NLDEV_ATTR_PORT_INDEX;
1099 	memcpy((void *)((uintptr_t)na + NLA_HDRLEN),
1100 	       &pindex, sizeof(pindex));
1101 	ret = mlx5_nl_send(nl, &req.nh, sn);
1102 	if (ret < 0)
1103 		return ret;
1104 	ret = mlx5_nl_recv(nl, sn, mlx5_nl_cmdget_cb, data);
1105 	if (ret < 0)
1106 		return ret;
1107 	if (!(data->flags & MLX5_NL_CMD_GET_IB_NAME) ||
1108 	    !(data->flags & MLX5_NL_CMD_GET_IB_INDEX) ||
1109 	    !(data->flags & MLX5_NL_CMD_GET_NET_INDEX) ||
1110 	    !data->ifindex)
1111 		goto error;
1112 	return 1;
1113 error:
1114 	rte_errno = ENODEV;
1115 	return -rte_errno;
1116 }
1117 
1118 /**
1119  * Get index of network interface associated with some IB device.
1120  *
1121  * This is the only somewhat safe method to avoid resorting to heuristics
1122  * when faced with port representors. Unfortunately it requires at least
1123  * Linux 4.17.
1124  *
1125  * @param nl
1126  *   Netlink socket of the RDMA kind (NETLINK_RDMA).
1127  * @param[in] name
1128  *   IB device name.
1129  * @param[in] pindex
1130  *   IB device port index, starting from 1
1131  * @return
1132  *   A valid (nonzero) interface index on success, 0 otherwise and rte_errno
1133  *   is set.
1134  */
1135 unsigned int
mlx5_nl_ifindex(int nl,const char * name,uint32_t pindex)1136 mlx5_nl_ifindex(int nl, const char *name, uint32_t pindex)
1137 {
1138 	struct mlx5_nl_port_info data = {
1139 			.ifindex = 0,
1140 			.name = name,
1141 	};
1142 
1143 	if (mlx5_nl_port_info(nl, pindex, &data) < 0)
1144 		return 0;
1145 	return data.ifindex;
1146 }
1147 
1148 /**
1149  * Get IB device port state.
1150  *
1151  * This is the only somewhat safe method to get info for port number >= 255.
1152  * Unfortunately it requires at least Linux 4.17.
1153  *
1154  * @param nl
1155  *   Netlink socket of the RDMA kind (NETLINK_RDMA).
1156  * @param[in] name
1157  *   IB device name.
1158  * @param[in] pindex
1159  *   IB device port index, starting from 1
1160  * @return
1161  *   Port state (ibv_port_state) on success, negative on error
1162  *   and rte_errno is set.
1163  */
1164 int
mlx5_nl_port_state(int nl,const char * name,uint32_t pindex)1165 mlx5_nl_port_state(int nl, const char *name, uint32_t pindex)
1166 {
1167 	struct mlx5_nl_port_info data = {
1168 			.state = 0,
1169 			.name = name,
1170 	};
1171 
1172 	if (mlx5_nl_port_info(nl, pindex, &data) < 0)
1173 		return -rte_errno;
1174 	if ((data.flags & MLX5_NL_CMD_GET_PORT_STATE) == 0) {
1175 		rte_errno = ENOTSUP;
1176 		return -rte_errno;
1177 	}
1178 	return (int)data.state;
1179 }
1180 
1181 /**
1182  * Get the number of physical ports of given IB device.
1183  *
1184  * @param nl
1185  *   Netlink socket of the RDMA kind (NETLINK_RDMA).
1186  * @param[in] name
1187  *   IB device name.
1188  *
1189  * @return
1190  *   A valid (nonzero) number of ports on success, 0 otherwise
1191  *   and rte_errno is set.
1192  */
1193 unsigned int
mlx5_nl_portnum(int nl,const char * name)1194 mlx5_nl_portnum(int nl, const char *name)
1195 {
1196 	struct mlx5_nl_port_info data = {
1197 		.flags = 0,
1198 		.name = name,
1199 		.ifindex = 0,
1200 		.portnum = 0,
1201 	};
1202 	struct nlmsghdr req = {
1203 		.nlmsg_len = NLMSG_LENGTH(0),
1204 		.nlmsg_type = RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
1205 					       RDMA_NLDEV_CMD_GET),
1206 		.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_DUMP,
1207 	};
1208 	uint32_t sn = MLX5_NL_SN_GENERATE;
1209 	int ret;
1210 
1211 	ret = mlx5_nl_send(nl, &req, sn);
1212 	if (ret < 0)
1213 		return 0;
1214 	ret = mlx5_nl_recv(nl, sn, mlx5_nl_cmdget_cb, &data);
1215 	if (ret < 0)
1216 		return 0;
1217 	if (!(data.flags & MLX5_NL_CMD_GET_IB_NAME) ||
1218 	    !(data.flags & MLX5_NL_CMD_GET_IB_INDEX) ||
1219 	    !(data.flags & MLX5_NL_CMD_GET_PORT_INDEX)) {
1220 		rte_errno = ENODEV;
1221 		return 0;
1222 	}
1223 	if (!data.portnum)
1224 		rte_errno = EINVAL;
1225 	return data.portnum;
1226 }
1227 
1228 /**
1229  * Analyze gathered port parameters via Netlink to recognize master
1230  * and representor devices for E-Switch configuration.
1231  *
1232  * @param[in] num_vf_set
1233  *   flag of presence of number of VFs port attribute.
1234  * @param[inout] switch_info
1235  *   Port information, including port name as a number and port name
1236  *   type if recognized
1237  *
1238  * @return
1239  *   master and representor flags are set in switch_info according to
1240  *   recognized parameters (if any).
1241  */
1242 static void
mlx5_nl_check_switch_info(bool num_vf_set,struct mlx5_switch_info * switch_info)1243 mlx5_nl_check_switch_info(bool num_vf_set,
1244 			  struct mlx5_switch_info *switch_info)
1245 {
1246 	switch (switch_info->name_type) {
1247 	case MLX5_PHYS_PORT_NAME_TYPE_UNKNOWN:
1248 		/*
1249 		 * Name is not recognized, assume the master,
1250 		 * check the number of VFs key presence.
1251 		 */
1252 		switch_info->master = num_vf_set;
1253 		break;
1254 	case MLX5_PHYS_PORT_NAME_TYPE_NOTSET:
1255 		/*
1256 		 * Name is not set, this assumes the legacy naming
1257 		 * schema for master, just check if there is a
1258 		 * number of VFs key.
1259 		 */
1260 		switch_info->master = num_vf_set;
1261 		break;
1262 	case MLX5_PHYS_PORT_NAME_TYPE_UPLINK:
1263 		/* New uplink naming schema recognized. */
1264 		switch_info->master = 1;
1265 		break;
1266 	case MLX5_PHYS_PORT_NAME_TYPE_LEGACY:
1267 		/* Legacy representors naming schema. */
1268 		switch_info->representor = !num_vf_set;
1269 		break;
1270 	case MLX5_PHYS_PORT_NAME_TYPE_PFHPF:
1271 		/* Fallthrough */
1272 	case MLX5_PHYS_PORT_NAME_TYPE_PFVF:
1273 		/* Fallthrough */
1274 	case MLX5_PHYS_PORT_NAME_TYPE_PFSF:
1275 		/* New representors naming schema. */
1276 		switch_info->representor = 1;
1277 		break;
1278 	}
1279 }
1280 
1281 /**
1282  * Process switch information from Netlink message.
1283  *
1284  * @param nh
1285  *   Pointer to Netlink message header.
1286  * @param arg
1287  *   Opaque data pointer for this callback.
1288  *
1289  * @return
1290  *   0 on success, a negative errno value otherwise and rte_errno is set.
1291  */
1292 static int
mlx5_nl_switch_info_cb(struct nlmsghdr * nh,void * arg)1293 mlx5_nl_switch_info_cb(struct nlmsghdr *nh, void *arg)
1294 {
1295 	struct mlx5_switch_info info = {
1296 		.master = 0,
1297 		.representor = 0,
1298 		.name_type = MLX5_PHYS_PORT_NAME_TYPE_NOTSET,
1299 		.port_name = 0,
1300 		.switch_id = 0,
1301 	};
1302 	size_t off = NLMSG_LENGTH(sizeof(struct ifinfomsg));
1303 	bool switch_id_set = false;
1304 	bool num_vf_set = false;
1305 	int len;
1306 
1307 	if (nh->nlmsg_type != RTM_NEWLINK)
1308 		goto error;
1309 	while (off < nh->nlmsg_len) {
1310 		struct rtattr *ra = (void *)((uintptr_t)nh + off);
1311 		void *payload = RTA_DATA(ra);
1312 		unsigned int i;
1313 
1314 		if (ra->rta_len > nh->nlmsg_len - off)
1315 			goto error;
1316 		switch (ra->rta_type) {
1317 		case IFLA_NUM_VF:
1318 			num_vf_set = true;
1319 			break;
1320 		case IFLA_PHYS_PORT_NAME:
1321 			len = RTA_PAYLOAD(ra);
1322 			/* Some kernels do not pad attributes with zero. */
1323 			if (len > 0 && len < MLX5_PHYS_PORT_NAME_MAX) {
1324 				char name[MLX5_PHYS_PORT_NAME_MAX];
1325 
1326 				/*
1327 				 * We can't just patch the message with padding
1328 				 * zero - it might corrupt the following items
1329 				 * in the message, we have to copy the string
1330 				 * by attribute length and pad the copied one.
1331 				 */
1332 				memcpy(name, payload, len);
1333 				name[len] = 0;
1334 				mlx5_translate_port_name(name, &info);
1335 			} else {
1336 				info.name_type =
1337 					MLX5_PHYS_PORT_NAME_TYPE_UNKNOWN;
1338 			}
1339 			break;
1340 		case IFLA_PHYS_SWITCH_ID:
1341 			info.switch_id = 0;
1342 			for (i = 0; i < RTA_PAYLOAD(ra); ++i) {
1343 				info.switch_id <<= 8;
1344 				info.switch_id |= ((uint8_t *)payload)[i];
1345 			}
1346 			switch_id_set = true;
1347 			break;
1348 		}
1349 		off += RTA_ALIGN(ra->rta_len);
1350 	}
1351 	if (switch_id_set) {
1352 		/* We have some E-Switch configuration. */
1353 		mlx5_nl_check_switch_info(num_vf_set, &info);
1354 	}
1355 	MLX5_ASSERT(!(info.master && info.representor));
1356 	memcpy(arg, &info, sizeof(info));
1357 	return 0;
1358 error:
1359 	rte_errno = EINVAL;
1360 	return -rte_errno;
1361 }
1362 
1363 /**
1364  * Get switch information associated with network interface.
1365  *
1366  * @param nl
1367  *   Netlink socket of the ROUTE kind (NETLINK_ROUTE).
1368  * @param ifindex
1369  *   Network interface index.
1370  * @param[out] info
1371  *   Switch information object, populated in case of success.
1372  *
1373  * @return
1374  *   0 on success, a negative errno value otherwise and rte_errno is set.
1375  */
1376 int
mlx5_nl_switch_info(int nl,unsigned int ifindex,struct mlx5_switch_info * info)1377 mlx5_nl_switch_info(int nl, unsigned int ifindex,
1378 		    struct mlx5_switch_info *info)
1379 {
1380 	struct {
1381 		struct nlmsghdr nh;
1382 		struct ifinfomsg info;
1383 		struct rtattr rta;
1384 		uint32_t extmask;
1385 	} req = {
1386 		.nh = {
1387 			.nlmsg_len = NLMSG_LENGTH
1388 					(sizeof(req.info) +
1389 					 RTA_LENGTH(sizeof(uint32_t))),
1390 			.nlmsg_type = RTM_GETLINK,
1391 			.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK,
1392 		},
1393 		.info = {
1394 			.ifi_family = AF_UNSPEC,
1395 			.ifi_index = ifindex,
1396 		},
1397 		.rta = {
1398 			.rta_type = IFLA_EXT_MASK,
1399 			.rta_len = RTA_LENGTH(sizeof(int32_t)),
1400 		},
1401 		.extmask = RTE_LE32(1),
1402 	};
1403 	uint32_t sn = MLX5_NL_SN_GENERATE;
1404 	int ret;
1405 
1406 	ret = mlx5_nl_send(nl, &req.nh, sn);
1407 	if (ret >= 0)
1408 		ret = mlx5_nl_recv(nl, sn, mlx5_nl_switch_info_cb, info);
1409 	if (info->master && info->representor) {
1410 		DRV_LOG(ERR, "ifindex %u device is recognized as master"
1411 			     " and as representor", ifindex);
1412 		rte_errno = ENODEV;
1413 		ret = -rte_errno;
1414 	}
1415 	return ret;
1416 }
1417 
1418 /*
1419  * Delete VLAN network device by ifindex.
1420  *
1421  * @param[in] tcf
1422  *   Context object initialized by mlx5_nl_vlan_vmwa_init().
1423  * @param[in] ifindex
1424  *   Interface index of network device to delete.
1425  */
1426 void
mlx5_nl_vlan_vmwa_delete(struct mlx5_nl_vlan_vmwa_context * vmwa,uint32_t ifindex)1427 mlx5_nl_vlan_vmwa_delete(struct mlx5_nl_vlan_vmwa_context *vmwa,
1428 		      uint32_t ifindex)
1429 {
1430 	uint32_t sn = MLX5_NL_SN_GENERATE;
1431 	int ret;
1432 	struct {
1433 		struct nlmsghdr nh;
1434 		struct ifinfomsg info;
1435 	} req = {
1436 		.nh = {
1437 			.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
1438 			.nlmsg_type = RTM_DELLINK,
1439 			.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK,
1440 		},
1441 		.info = {
1442 			.ifi_family = AF_UNSPEC,
1443 			.ifi_index = ifindex,
1444 		},
1445 	};
1446 
1447 	if (ifindex) {
1448 		ret = mlx5_nl_send(vmwa->nl_socket, &req.nh, sn);
1449 		if (ret >= 0)
1450 			ret = mlx5_nl_recv(vmwa->nl_socket, sn, NULL, NULL);
1451 		if (ret < 0)
1452 			DRV_LOG(WARNING, "netlink: error deleting VLAN WA"
1453 				" ifindex %u, %d", ifindex, ret);
1454 	}
1455 }
1456 
/* Set of subroutines to build Netlink message. */

/* Return the aligned position right past the current end of the message. */
static struct nlattr *
nl_msg_tail(struct nlmsghdr *nlh)
{
	uint8_t *base = (uint8_t *)nlh;

	return (struct nlattr *)(base + NLMSG_ALIGN(nlh->nlmsg_len));
}

/*
 * Append an attribute of the given type with alen bytes of payload taken
 * from data, and grow the message length by the aligned attribute size.
 * The caller provides a buffer large enough for the resulting message.
 */
static void
nl_attr_put(struct nlmsghdr *nlh, int type, const void *data, int alen)
{
	struct nlattr *attr = nl_msg_tail(nlh);

	attr->nla_len = NLMSG_ALIGN(sizeof(*attr)) + alen;
	attr->nla_type = type;
	if (alen != 0)
		memcpy(attr + 1, data, alen);
	nlh->nlmsg_len += NLMSG_ALIGN(attr->nla_len);
}

/*
 * Open a nested attribute: an empty attribute header is appended and its
 * address is returned so that nl_attr_nest_end() can patch the length.
 */
static struct nlattr *
nl_attr_nest_start(struct nlmsghdr *nlh, int type)
{
	struct nlattr *start = nl_msg_tail(nlh);

	nl_attr_put(nlh, type, NULL, 0);
	return start;
}

/*
 * Close a nested attribute: its length becomes the distance from its
 * header to the current end of the message.
 */
static void
nl_attr_nest_end(struct nlmsghdr *nlh, struct nlattr *nest)
{
	const uint8_t *end = (const uint8_t *)nl_msg_tail(nlh);

	nest->nla_len = end - (const uint8_t *)nest;
}
1492 
1493 /*
1494  * Create network VLAN device with specified VLAN tag.
1495  *
1496  * @param[in] tcf
1497  *   Context object initialized by mlx5_nl_vlan_vmwa_init().
1498  * @param[in] ifindex
1499  *   Base network interface index.
1500  * @param[in] tag
1501  *   VLAN tag for VLAN network device to create.
1502  */
1503 uint32_t
mlx5_nl_vlan_vmwa_create(struct mlx5_nl_vlan_vmwa_context * vmwa,uint32_t ifindex,uint16_t tag)1504 mlx5_nl_vlan_vmwa_create(struct mlx5_nl_vlan_vmwa_context *vmwa,
1505 			 uint32_t ifindex, uint16_t tag)
1506 {
1507 	struct nlmsghdr *nlh;
1508 	struct ifinfomsg *ifm;
1509 	char name[sizeof(MLX5_VMWA_VLAN_DEVICE_PFX) + 32];
1510 
1511 	alignas(RTE_CACHE_LINE_SIZE)
1512 	uint8_t buf[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
1513 		    NLMSG_ALIGN(sizeof(struct ifinfomsg)) +
1514 		    NLMSG_ALIGN(sizeof(struct nlattr)) * 8 +
1515 		    NLMSG_ALIGN(sizeof(uint32_t)) +
1516 		    NLMSG_ALIGN(sizeof(name)) +
1517 		    NLMSG_ALIGN(sizeof("vlan")) +
1518 		    NLMSG_ALIGN(sizeof(uint32_t)) +
1519 		    NLMSG_ALIGN(sizeof(uint16_t)) + 16];
1520 	struct nlattr *na_info;
1521 	struct nlattr *na_vlan;
1522 	uint32_t sn = MLX5_NL_SN_GENERATE;
1523 	int ret;
1524 
1525 	memset(buf, 0, sizeof(buf));
1526 	nlh = (struct nlmsghdr *)buf;
1527 	nlh->nlmsg_len = sizeof(struct nlmsghdr);
1528 	nlh->nlmsg_type = RTM_NEWLINK;
1529 	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE |
1530 			   NLM_F_EXCL | NLM_F_ACK;
1531 	ifm = (struct ifinfomsg *)nl_msg_tail(nlh);
1532 	nlh->nlmsg_len += sizeof(struct ifinfomsg);
1533 	ifm->ifi_family = AF_UNSPEC;
1534 	ifm->ifi_type = 0;
1535 	ifm->ifi_index = 0;
1536 	ifm->ifi_flags = IFF_UP;
1537 	ifm->ifi_change = 0xffffffff;
1538 	nl_attr_put(nlh, IFLA_LINK, &ifindex, sizeof(ifindex));
1539 	ret = snprintf(name, sizeof(name), "%s.%u.%u",
1540 		       MLX5_VMWA_VLAN_DEVICE_PFX, ifindex, tag);
1541 	nl_attr_put(nlh, IFLA_IFNAME, name, ret + 1);
1542 	na_info = nl_attr_nest_start(nlh, IFLA_LINKINFO);
1543 	nl_attr_put(nlh, IFLA_INFO_KIND, "vlan", sizeof("vlan"));
1544 	na_vlan = nl_attr_nest_start(nlh, IFLA_INFO_DATA);
1545 	nl_attr_put(nlh, IFLA_VLAN_ID, &tag, sizeof(tag));
1546 	nl_attr_nest_end(nlh, na_vlan);
1547 	nl_attr_nest_end(nlh, na_info);
1548 	MLX5_ASSERT(sizeof(buf) >= nlh->nlmsg_len);
1549 	ret = mlx5_nl_send(vmwa->nl_socket, nlh, sn);
1550 	if (ret >= 0)
1551 		ret = mlx5_nl_recv(vmwa->nl_socket, sn, NULL, NULL);
1552 	if (ret < 0) {
1553 		DRV_LOG(WARNING, "netlink: VLAN %s create failure (%d)", name,
1554 			ret);
1555 	}
1556 	/* Try to get ifindex of created or pre-existing device. */
1557 	ret = if_nametoindex(name);
1558 	if (!ret) {
1559 		DRV_LOG(WARNING, "VLAN %s failed to get index (%d)", name,
1560 			errno);
1561 		return 0;
1562 	}
1563 	return ret;
1564 }
1565 
/**
 * Parse Netlink message to retrieve the general family ID.
 *
 * The attributes following the generic Netlink header are walked until a
 * CTRL_ATTR_FAMILY_ID one is found. Each attribute header is checked to
 * lie inside the message before it is read, and the family ID payload is
 * checked to be large enough before it is copied.
 *
 * @param nh
 *   Pointer to Netlink Message Header.
 * @param arg
 *   PMD data register with this callback; must point to a uint16_t that
 *   receives the family ID.
 *
 * @return
 *   0 on success, -EINVAL if no valid family ID attribute is present.
 *   rte_errno is not modified by this callback.
 */
static int
mlx5_nl_family_id_cb(struct nlmsghdr *nh, void *arg)
{
	const uintptr_t end = (uintptr_t)nh + nh->nlmsg_len;
	uintptr_t pos = (uintptr_t)nh + NLMSG_ALIGN(sizeof(*nh)) +
			NLMSG_ALIGN(sizeof(struct genlmsghdr));

	/* Validate the bounds before touching the attribute header. */
	while (pos < end && end - pos >= sizeof(struct nlattr)) {
		const struct nlattr *nla = (const struct nlattr *)pos;

		/* A zero length attribute would never advance. */
		if (nla->nla_len == 0)
			break;
		if (nla->nla_type == CTRL_ATTR_FAMILY_ID) {
			/* The 16-bit payload must be inside the message. */
			if (nla->nla_len < NLA_HDRLEN + sizeof(uint16_t) ||
			    end - pos < NLA_HDRLEN + sizeof(uint16_t))
				break;
			*(uint16_t *)arg = *(const uint16_t *)(nla + 1);
			return 0;
		}
		pos += NLMSG_ALIGN(nla->nla_len);
	}
	return -EINVAL;
}
1594 
1595 #define MLX5_NL_MAX_ATTR_SIZE 100
1596 /**
1597  * Get generic netlink family ID.
1598  *
1599  * @param[in] nlsk_fd
1600  *   Netlink socket file descriptor.
1601  * @param[in] name
1602  *   The family name.
1603  *
1604  * @return
1605  *   ID >= 0 on success and @p enable is updated, a negative errno value
1606  *   otherwise and rte_errno is set.
1607  */
1608 static int
mlx5_nl_generic_family_id_get(int nlsk_fd,const char * name)1609 mlx5_nl_generic_family_id_get(int nlsk_fd, const char *name)
1610 {
1611 	struct nlmsghdr *nlh;
1612 	struct genlmsghdr *genl;
1613 	uint32_t sn = MLX5_NL_SN_GENERATE;
1614 	int name_size = strlen(name) + 1;
1615 	int ret;
1616 	uint16_t id = -1;
1617 	uint8_t buf[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
1618 		    NLMSG_ALIGN(sizeof(struct genlmsghdr)) +
1619 		    NLMSG_ALIGN(sizeof(struct nlattr)) +
1620 		    NLMSG_ALIGN(MLX5_NL_MAX_ATTR_SIZE)];
1621 
1622 	memset(buf, 0, sizeof(buf));
1623 	nlh = (struct nlmsghdr *)buf;
1624 	nlh->nlmsg_len = sizeof(struct nlmsghdr);
1625 	nlh->nlmsg_type = GENL_ID_CTRL;
1626 	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
1627 	genl = (struct genlmsghdr *)nl_msg_tail(nlh);
1628 	nlh->nlmsg_len += sizeof(struct genlmsghdr);
1629 	genl->cmd = CTRL_CMD_GETFAMILY;
1630 	genl->version = 1;
1631 	nl_attr_put(nlh, CTRL_ATTR_FAMILY_NAME, name, name_size);
1632 	ret = mlx5_nl_send(nlsk_fd, nlh, sn);
1633 	if (ret >= 0)
1634 		ret = mlx5_nl_recv(nlsk_fd, sn, mlx5_nl_family_id_cb, &id);
1635 	if (ret < 0) {
1636 		DRV_LOG(DEBUG, "Failed to get Netlink %s family ID: %d.", name,
1637 			ret);
1638 		return ret;
1639 	}
1640 	DRV_LOG(DEBUG, "Netlink \"%s\" family ID is %u.", name, id);
1641 	return (int)id;
1642 }
1643 
/**
 * Get Devlink family ID.
 *
 * Wrapper resolving the DEVLINK_GENL_NAME family through
 * mlx5_nl_generic_family_id_get().
 *
 * @param[in] nlsk_fd
 *   Netlink socket file descriptor.
 *
 * @return
 *   ID >= 0 on success, a negative value otherwise.
 */

int
mlx5_nl_devlink_family_id_get(int nlsk_fd)
{
	const int family_id =
		mlx5_nl_generic_family_id_get(nlsk_fd, DEVLINK_GENL_NAME);

	return family_id;
}
1660 
/**
 * Parse Netlink message to retrieve the ROCE enable status.
 *
 * The attributes following the generic Netlink header are walked. The
 * nested containers DEVLINK_ATTR_PARAM, DEVLINK_ATTR_PARAM_VALUES_LIST and
 * DEVLINK_ATTR_PARAM_VALUE are entered by stepping past their header, any
 * other attribute is skipped. The presence of a
 * DEVLINK_ATTR_PARAM_VALUE_DATA attribute means "enabled". Each attribute
 * header is checked to lie inside the message before it is read.
 *
 * @param nh
 *   Pointer to Netlink Message Header.
 * @param arg
 *   PMD data register with this callback; must point to an int that
 *   receives 1 if the value data attribute is found, 0 otherwise.
 *
 * @return
 *   0 if the value data attribute or at least one of the nested containers
 *   was found, -EINVAL otherwise. rte_errno is not modified by this
 *   callback.
 */
static int
mlx5_nl_roce_cb(struct nlmsghdr *nh, void *arg)
{
	int ret = -EINVAL;
	int *enable = arg;
	const uintptr_t end = (uintptr_t)nh + nh->nlmsg_len;
	uintptr_t pos = (uintptr_t)nh + NLMSG_ALIGN(sizeof(*nh)) +
			NLMSG_ALIGN(sizeof(struct genlmsghdr));

	/* Validate the bounds before touching the attribute header. */
	while (pos < end && end - pos >= sizeof(struct nlattr)) {
		const struct nlattr *nla = (const struct nlattr *)pos;

		/* A zero length attribute terminates the walk. */
		if (nla->nla_len == 0)
			break;
		switch (nla->nla_type) {
		/* Expected nested attributes case. */
		case DEVLINK_ATTR_PARAM:
		case DEVLINK_ATTR_PARAM_VALUES_LIST:
		case DEVLINK_ATTR_PARAM_VALUE:
			ret = 0;
			/* Descend: the first child follows the header. */
			pos += sizeof(struct nlattr);
			break;
		case DEVLINK_ATTR_PARAM_VALUE_DATA:
			*enable = 1;
			return 0;
		default:
			pos += NLMSG_ALIGN(nla->nla_len);
			break;
		}
	}
	*enable = 0;
	return ret;
}
1701 
1702 /**
1703  * Get ROCE enable status through Netlink.
1704  *
1705  * @param[in] nlsk_fd
1706  *   Netlink socket file descriptor.
1707  * @param[in] family_id
1708  *   the Devlink family ID.
1709  * @param pci_addr
1710  *   The device PCI address.
1711  * @param[out] enable
1712  *   Where to store the enable status.
1713  *
1714  * @return
1715  *   0 on success and @p enable is updated, a negative errno value otherwise
1716  *   and rte_errno is set.
1717  */
1718 int
mlx5_nl_enable_roce_get(int nlsk_fd,int family_id,const char * pci_addr,int * enable)1719 mlx5_nl_enable_roce_get(int nlsk_fd, int family_id, const char *pci_addr,
1720 			int *enable)
1721 {
1722 	struct nlmsghdr *nlh;
1723 	struct genlmsghdr *genl;
1724 	uint32_t sn = MLX5_NL_SN_GENERATE;
1725 	int ret;
1726 	int cur_en = 0;
1727 	uint8_t buf[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
1728 		    NLMSG_ALIGN(sizeof(struct genlmsghdr)) +
1729 		    NLMSG_ALIGN(sizeof(struct nlattr)) * 4 +
1730 		    NLMSG_ALIGN(MLX5_NL_MAX_ATTR_SIZE) * 4];
1731 
1732 	memset(buf, 0, sizeof(buf));
1733 	nlh = (struct nlmsghdr *)buf;
1734 	nlh->nlmsg_len = sizeof(struct nlmsghdr);
1735 	nlh->nlmsg_type = family_id;
1736 	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
1737 	genl = (struct genlmsghdr *)nl_msg_tail(nlh);
1738 	nlh->nlmsg_len += sizeof(struct genlmsghdr);
1739 	genl->cmd = DEVLINK_CMD_PARAM_GET;
1740 	genl->version = DEVLINK_GENL_VERSION;
1741 	nl_attr_put(nlh, DEVLINK_ATTR_BUS_NAME, "pci", 4);
1742 	nl_attr_put(nlh, DEVLINK_ATTR_DEV_NAME, pci_addr, strlen(pci_addr) + 1);
1743 	nl_attr_put(nlh, DEVLINK_ATTR_PARAM_NAME, "enable_roce", 12);
1744 	ret = mlx5_nl_send(nlsk_fd, nlh, sn);
1745 	if (ret >= 0)
1746 		ret = mlx5_nl_recv(nlsk_fd, sn, mlx5_nl_roce_cb, &cur_en);
1747 	if (ret < 0) {
1748 		DRV_LOG(DEBUG, "Failed to get ROCE enable on device %s: %d.",
1749 			pci_addr, ret);
1750 		return ret;
1751 	}
1752 	*enable = cur_en;
1753 	DRV_LOG(DEBUG, "ROCE is %sabled for device \"%s\".",
1754 		cur_en ? "en" : "dis", pci_addr);
1755 	return ret;
1756 }
1757 
1758 /**
1759  * Reload mlx5 device kernel driver through Netlink.
1760  *
1761  * @param[in] nlsk_fd
1762  *   Netlink socket file descriptor.
1763  * @param[in] family_id
1764  *   the Devlink family ID.
1765  * @param pci_addr
1766  *   The device PCI address.
1767  * @param[out] enable
1768  *   The enable status to set.
1769  *
1770  * @return
1771  *   0 on success, a negative errno value otherwise and rte_errno is set.
1772  */
1773 static int
mlx5_nl_driver_reload(int nlsk_fd,int family_id,const char * pci_addr)1774 mlx5_nl_driver_reload(int nlsk_fd, int family_id, const char *pci_addr)
1775 {
1776 	struct nlmsghdr *nlh;
1777 	struct genlmsghdr *genl;
1778 	uint32_t sn = MLX5_NL_SN_GENERATE;
1779 	int ret;
1780 	uint8_t buf[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
1781 		    NLMSG_ALIGN(sizeof(struct genlmsghdr)) +
1782 		    NLMSG_ALIGN(sizeof(struct nlattr)) * 2 +
1783 		    NLMSG_ALIGN(MLX5_NL_MAX_ATTR_SIZE) * 2];
1784 
1785 	memset(buf, 0, sizeof(buf));
1786 	nlh = (struct nlmsghdr *)buf;
1787 	nlh->nlmsg_len = sizeof(struct nlmsghdr);
1788 	nlh->nlmsg_type = family_id;
1789 	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
1790 	genl = (struct genlmsghdr *)nl_msg_tail(nlh);
1791 	nlh->nlmsg_len += sizeof(struct genlmsghdr);
1792 	genl->cmd = DEVLINK_CMD_RELOAD;
1793 	genl->version = DEVLINK_GENL_VERSION;
1794 	nl_attr_put(nlh, DEVLINK_ATTR_BUS_NAME, "pci", 4);
1795 	nl_attr_put(nlh, DEVLINK_ATTR_DEV_NAME, pci_addr, strlen(pci_addr) + 1);
1796 	ret = mlx5_nl_send(nlsk_fd, nlh, sn);
1797 	if (ret >= 0)
1798 		ret = mlx5_nl_recv(nlsk_fd, sn, NULL, NULL);
1799 	if (ret < 0) {
1800 		DRV_LOG(DEBUG, "Failed to reload %s device by Netlink - %d",
1801 			pci_addr, ret);
1802 		return ret;
1803 	}
1804 	DRV_LOG(DEBUG, "Device \"%s\" was reloaded by Netlink successfully.",
1805 		pci_addr);
1806 	return 0;
1807 }
1808 
1809 /**
1810  * Set ROCE enable status through Netlink.
1811  *
1812  * @param[in] nlsk_fd
1813  *   Netlink socket file descriptor.
1814  * @param[in] family_id
1815  *   the Devlink family ID.
1816  * @param pci_addr
1817  *   The device PCI address.
1818  * @param[out] enable
1819  *   The enable status to set.
1820  *
1821  * @return
1822  *   0 on success, a negative errno value otherwise and rte_errno is set.
1823  */
1824 int
mlx5_nl_enable_roce_set(int nlsk_fd,int family_id,const char * pci_addr,int enable)1825 mlx5_nl_enable_roce_set(int nlsk_fd, int family_id, const char *pci_addr,
1826 			int enable)
1827 {
1828 	struct nlmsghdr *nlh;
1829 	struct genlmsghdr *genl;
1830 	uint32_t sn = MLX5_NL_SN_GENERATE;
1831 	int ret;
1832 	uint8_t buf[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
1833 		    NLMSG_ALIGN(sizeof(struct genlmsghdr)) +
1834 		    NLMSG_ALIGN(sizeof(struct nlattr)) * 6 +
1835 		    NLMSG_ALIGN(MLX5_NL_MAX_ATTR_SIZE) * 6];
1836 	uint8_t cmode = DEVLINK_PARAM_CMODE_DRIVERINIT;
1837 	uint8_t ptype = NLA_FLAG;
1838 ;
1839 
1840 	memset(buf, 0, sizeof(buf));
1841 	nlh = (struct nlmsghdr *)buf;
1842 	nlh->nlmsg_len = sizeof(struct nlmsghdr);
1843 	nlh->nlmsg_type = family_id;
1844 	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
1845 	genl = (struct genlmsghdr *)nl_msg_tail(nlh);
1846 	nlh->nlmsg_len += sizeof(struct genlmsghdr);
1847 	genl->cmd = DEVLINK_CMD_PARAM_SET;
1848 	genl->version = DEVLINK_GENL_VERSION;
1849 	nl_attr_put(nlh, DEVLINK_ATTR_BUS_NAME, "pci", 4);
1850 	nl_attr_put(nlh, DEVLINK_ATTR_DEV_NAME, pci_addr, strlen(pci_addr) + 1);
1851 	nl_attr_put(nlh, DEVLINK_ATTR_PARAM_NAME, "enable_roce", 12);
1852 	nl_attr_put(nlh, DEVLINK_ATTR_PARAM_VALUE_CMODE, &cmode, sizeof(cmode));
1853 	nl_attr_put(nlh, DEVLINK_ATTR_PARAM_TYPE, &ptype, sizeof(ptype));
1854 	if (enable)
1855 		nl_attr_put(nlh, DEVLINK_ATTR_PARAM_VALUE_DATA, NULL, 0);
1856 	ret = mlx5_nl_send(nlsk_fd, nlh, sn);
1857 	if (ret >= 0)
1858 		ret = mlx5_nl_recv(nlsk_fd, sn, NULL, NULL);
1859 	if (ret < 0) {
1860 		DRV_LOG(DEBUG, "Failed to %sable ROCE for device %s by Netlink:"
1861 			" %d.", enable ? "en" : "dis", pci_addr, ret);
1862 		return ret;
1863 	}
1864 	DRV_LOG(DEBUG, "Device %s ROCE was %sabled by Netlink successfully.",
1865 		pci_addr, enable ? "en" : "dis");
1866 	/* Now, need to reload the driver. */
1867 	return mlx5_nl_driver_reload(nlsk_fd, family_id, pci_addr);
1868 }
1869 
/**
 * Try to parse a Netlink message as a link status update.
 *
 * Only RTM_NEWLINK, RTM_DELLINK, RTM_GETLINK and RTM_SETLINK messages are
 * accepted, and only if the message is long enough to hold the ifinfomsg
 * payload the interface index is read from.
 *
 * @param hdr
 *  Netlink message header.
 * @param[out] ifindex
 *  Index of the updated interface. Left untouched on failure.
 *
 * @return
 *  0 on success, negative on failure (other message type or truncated
 *  message).
 */
int
mlx5_nl_parse_link_status_update(struct nlmsghdr *hdr, uint32_t *ifindex)
{
	const struct ifinfomsg *info;

	switch (hdr->nlmsg_type) {
	case RTM_NEWLINK:
	case RTM_DELLINK:
	case RTM_GETLINK:
	case RTM_SETLINK:
		/* Reject messages too short to carry the payload. */
		if (hdr->nlmsg_len < NLMSG_LENGTH(sizeof(*info)))
			return -1;
		info = NLMSG_DATA(hdr);
		*ifindex = info->ifi_index;
		return 0;
	default:
		return -1;
	}
}
1897 
1898 /**
1899  * Read pending events from a Netlink socket.
1900  *
1901  * @param nlsk_fd
1902  *  Netlink socket.
1903  * @param cb
1904  *  Callback invoked for each of the events.
1905  * @param cb_arg
1906  *  User data for the callback.
1907  *
1908  * @return
1909  *  0 on success, including the case when there are no events.
1910  *  Negative on failure and rte_errno is set.
1911  */
1912 int
mlx5_nl_read_events(int nlsk_fd,mlx5_nl_event_cb * cb,void * cb_arg)1913 mlx5_nl_read_events(int nlsk_fd, mlx5_nl_event_cb *cb, void *cb_arg)
1914 {
1915 	char buf[8192];
1916 	struct sockaddr_nl addr;
1917 	struct iovec iov = {
1918 		.iov_base = buf,
1919 		.iov_len = sizeof(buf),
1920 	};
1921 	struct msghdr msg = {
1922 		.msg_name = &addr,
1923 		.msg_namelen = sizeof(addr),
1924 		.msg_iov = &iov,
1925 		.msg_iovlen = 1,
1926 	};
1927 	struct nlmsghdr *hdr;
1928 	ssize_t size;
1929 
1930 	while (1) {
1931 		size = recvmsg(nlsk_fd, &msg, MSG_DONTWAIT);
1932 		if (size < 0) {
1933 			if (errno == EAGAIN)
1934 				return 0;
1935 			if (errno == EINTR)
1936 				continue;
1937 			DRV_LOG(DEBUG, "Failed to receive netlink message: %s",
1938 				strerror(errno));
1939 			rte_errno = errno;
1940 			return -rte_errno;
1941 		}
1942 		hdr = (struct nlmsghdr *)buf;
1943 		while (size >= (ssize_t)sizeof(*hdr)) {
1944 			ssize_t msg_len = hdr->nlmsg_len;
1945 			ssize_t data_len = msg_len - sizeof(*hdr);
1946 			ssize_t aligned_len;
1947 
1948 			if (data_len < 0) {
1949 				DRV_LOG(DEBUG, "Netlink message too short");
1950 				rte_errno = EINVAL;
1951 				return -rte_errno;
1952 			}
1953 			aligned_len = NLMSG_ALIGN(msg_len);
1954 			if (aligned_len > size) {
1955 				DRV_LOG(DEBUG, "Netlink message too long");
1956 				rte_errno = EINVAL;
1957 				return -rte_errno;
1958 			}
1959 			cb(hdr, cb_arg);
1960 			hdr = RTE_PTR_ADD(hdr, aligned_len);
1961 			size -= aligned_len;
1962 		}
1963 	}
1964 	return 0;
1965 }
1966 
/**
 * Parse Netlink message to retrieve the Multiport E-Switch enable status.
 *
 * The attributes following the generic Netlink header are walked. The
 * nested containers DEVLINK_ATTR_PARAM, DEVLINK_ATTR_PARAM_VALUES_LIST and
 * DEVLINK_ATTR_PARAM_VALUE are entered by stepping past their header, any
 * other attribute is skipped. The presence of a
 * DEVLINK_ATTR_PARAM_VALUE_DATA attribute means "enabled". Each attribute
 * header is checked to lie inside the message before it is read.
 *
 * @param nh
 *   Pointer to Netlink Message Header.
 * @param arg
 *   Must point to an int that receives 1 if the value data attribute is
 *   found, 0 otherwise.
 *
 * @return
 *   0 if the value data attribute or at least one of the nested containers
 *   was found, -EINVAL otherwise. rte_errno is not modified by this
 *   callback.
 */
static int
mlx5_nl_esw_multiport_cb(struct nlmsghdr *nh, void *arg)
{
	int ret = -EINVAL;
	int *enable = arg;
	const uintptr_t end = (uintptr_t)nh + nh->nlmsg_len;
	uintptr_t pos = (uintptr_t)nh + NLMSG_ALIGN(sizeof(*nh)) +
			NLMSG_ALIGN(sizeof(struct genlmsghdr));

	/* Validate the bounds before touching the attribute header. */
	while (pos < end && end - pos >= sizeof(struct nlattr)) {
		const struct nlattr *nla = (const struct nlattr *)pos;

		/* A zero length attribute terminates the walk. */
		if (nla->nla_len == 0)
			break;
		switch (nla->nla_type) {
		/* Expected nested attributes case. */
		case DEVLINK_ATTR_PARAM:
		case DEVLINK_ATTR_PARAM_VALUES_LIST:
		case DEVLINK_ATTR_PARAM_VALUE:
			ret = 0;
			/* Descend: the first child follows the header. */
			pos += sizeof(struct nlattr);
			break;
		case DEVLINK_ATTR_PARAM_VALUE_DATA:
			*enable = 1;
			return 0;
		default:
			pos += NLMSG_ALIGN(nla->nla_len);
			break;
		}
	}
	*enable = 0;
	return ret;
}
1996 
1997 #define NL_ESW_MULTIPORT_PARAM "esw_multiport"
1998 
1999 int
mlx5_nl_devlink_esw_multiport_get(int nlsk_fd,int family_id,const char * pci_addr,int * enable)2000 mlx5_nl_devlink_esw_multiport_get(int nlsk_fd, int family_id, const char *pci_addr, int *enable)
2001 {
2002 	struct nlmsghdr *nlh;
2003 	struct genlmsghdr *genl;
2004 	uint32_t sn = MLX5_NL_SN_GENERATE;
2005 	int ret;
2006 	uint8_t buf[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
2007 		    NLMSG_ALIGN(sizeof(struct genlmsghdr)) +
2008 		    NLMSG_ALIGN(sizeof(struct nlattr)) * 4 +
2009 		    NLMSG_ALIGN(MLX5_NL_MAX_ATTR_SIZE) * 4];
2010 
2011 	memset(buf, 0, sizeof(buf));
2012 	nlh = (struct nlmsghdr *)buf;
2013 	nlh->nlmsg_len = sizeof(struct nlmsghdr);
2014 	nlh->nlmsg_type = family_id;
2015 	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
2016 	genl = (struct genlmsghdr *)nl_msg_tail(nlh);
2017 	nlh->nlmsg_len += sizeof(struct genlmsghdr);
2018 	genl->cmd = DEVLINK_CMD_PARAM_GET;
2019 	genl->version = DEVLINK_GENL_VERSION;
2020 	nl_attr_put(nlh, DEVLINK_ATTR_BUS_NAME, "pci", 4);
2021 	nl_attr_put(nlh, DEVLINK_ATTR_DEV_NAME, pci_addr, strlen(pci_addr) + 1);
2022 	nl_attr_put(nlh, DEVLINK_ATTR_PARAM_NAME,
2023 		    NL_ESW_MULTIPORT_PARAM, sizeof(NL_ESW_MULTIPORT_PARAM));
2024 	ret = mlx5_nl_send(nlsk_fd, nlh, sn);
2025 	if (ret >= 0)
2026 		ret = mlx5_nl_recv(nlsk_fd, sn, mlx5_nl_esw_multiport_cb, enable);
2027 	if (ret < 0) {
2028 		DRV_LOG(DEBUG, "Failed to get Multiport E-Switch enable on device %s: %d.",
2029 			pci_addr, ret);
2030 		return ret;
2031 	}
2032 	DRV_LOG(DEBUG, "Multiport E-Switch is %sabled for device \"%s\".",
2033 		*enable ? "en" : "dis", pci_addr);
2034 	return ret;
2035 }
2036