1 /* SPDX-License-Identifier: BSD-3-Clause
2 * Copyright 2018 6WIND S.A.
3 * Copyright 2018 Mellanox Technologies, Ltd
4 */
5
6 #include <errno.h>
7 #include <linux/if_link.h>
8 #include <linux/rtnetlink.h>
9 #include <linux/genetlink.h>
10 #include <net/if.h>
11 #include <rdma/rdma_netlink.h>
12 #include <stdbool.h>
13 #include <stdint.h>
14 #include <stdlib.h>
15 #include <stdalign.h>
16 #include <string.h>
17 #include <sys/socket.h>
18 #include <unistd.h>
19
20 #include <rte_errno.h>
21
22 #include "mlx5_nl.h"
23 #include "../mlx5_common_log.h"
24 #include "mlx5_malloc.h"
25 #ifdef HAVE_DEVLINK
26 #include <linux/devlink.h>
27 #endif
28
29
/* Size of the buffer to receive kernel messages */
#define MLX5_NL_BUF_SIZE (32 * 1024)
/*
 * Send buffer size for the Netlink socket, in bytes.
 * mlx5_nl_init() compares the socket's SO_SNDBUF value against it.
 */
#define MLX5_SEND_BUF_SIZE 32768
/*
 * Receive buffer size for the Netlink socket, in bytes.
 * mlx5_nl_init() compares the socket's SO_RCVBUF value against it and
 * mlx5_nl_recv() never allocates a smaller receive buffer.
 */
#define MLX5_RECV_BUF_SIZE 32768
/* Maximal physical port name length. */
#define MLX5_PHYS_PORT_NAME_MAX 128

/** Parameters of VLAN devices created by driver. */
#define MLX5_VMWA_VLAN_DEVICE_PFX "evmlx"
/*
 * Define NDA_RTA as defined in iproute2 sources.
 *
 * Evaluates to the first routing attribute located right after the
 * (aligned) struct ndmsg that @p r points to.
 *
 * see in iproute2 sources file include/libnetlink.h
 */
#ifndef MLX5_NDA_RTA
#define MLX5_NDA_RTA(r) \
	((struct rtattr *)(((char *)(r)) + NLMSG_ALIGN(sizeof(struct ndmsg))))
#endif
/*
 * Define NLMSG_TAIL as defined in iproute2 sources.
 *
 * Evaluates to the address located nlmsg_len bytes (rounded up to the
 * Netlink alignment) after the message header, i.e. where the next
 * attribute would be appended.
 *
 * see in iproute2 sources file include/libnetlink.h
 */
#ifndef NLMSG_TAIL
#define NLMSG_TAIL(nmsg) \
	((struct rtattr *)(((char *)(nmsg)) + NLMSG_ALIGN((nmsg)->nlmsg_len)))
#endif
/*
 * The following definitions are normally found in rdma/rdma_netlink.h,
 * however they are so recent that most systems do not expose them yet.
 *
 * Each fallback value is only defined when the matching HAVE_* macro is
 * not set (presumably by the build system probing the installed headers).
 */
#ifndef HAVE_RDMA_NL_NLDEV
#define RDMA_NL_NLDEV 5
#endif
#ifndef HAVE_RDMA_NLDEV_CMD_GET
#define RDMA_NLDEV_CMD_GET 1
#endif
#ifndef HAVE_RDMA_NLDEV_CMD_PORT_GET
#define RDMA_NLDEV_CMD_PORT_GET 5
#endif
#ifndef HAVE_RDMA_NLDEV_ATTR_DEV_INDEX
#define RDMA_NLDEV_ATTR_DEV_INDEX 1
#endif
#ifndef HAVE_RDMA_NLDEV_ATTR_DEV_NAME
#define RDMA_NLDEV_ATTR_DEV_NAME 2
#endif
#ifndef HAVE_RDMA_NLDEV_ATTR_PORT_INDEX
#define RDMA_NLDEV_ATTR_PORT_INDEX 3
#endif
#ifndef HAVE_RDMA_NLDEV_ATTR_PORT_STATE
#define RDMA_NLDEV_ATTR_PORT_STATE 12
#endif
#ifndef HAVE_RDMA_NLDEV_ATTR_NDEV_INDEX
#define RDMA_NLDEV_ATTR_NDEV_INDEX 50
#endif

/*
 * These are normally found in linux/if_link.h.
 * As above, a fallback is provided only when the HAVE_* macro is unset.
 */
#ifndef HAVE_IFLA_NUM_VF
#define IFLA_NUM_VF 21
#endif
#ifndef HAVE_IFLA_EXT_MASK
#define IFLA_EXT_MASK 29
#endif
#ifndef HAVE_IFLA_PHYS_SWITCH_ID
#define IFLA_PHYS_SWITCH_ID 36
#endif
#ifndef HAVE_IFLA_PHYS_PORT_NAME
#define IFLA_PHYS_PORT_NAME 38
#endif
101
/*
 * Some Devlink definitions may be missing from the headers of old kernel
 * versions; provide a fallback for every definition that is not already
 * available as a preprocessor macro.
 */
#ifndef DEVLINK_GENL_NAME
#define DEVLINK_GENL_NAME "devlink"
#endif
#ifndef DEVLINK_GENL_VERSION
#define DEVLINK_GENL_VERSION 1
#endif
#ifndef DEVLINK_ATTR_BUS_NAME
#define DEVLINK_ATTR_BUS_NAME 1
#endif
#ifndef DEVLINK_ATTR_DEV_NAME
#define DEVLINK_ATTR_DEV_NAME 2
#endif
#ifndef DEVLINK_ATTR_PARAM
#define DEVLINK_ATTR_PARAM 80
#endif
#ifndef DEVLINK_ATTR_PARAM_NAME
#define DEVLINK_ATTR_PARAM_NAME 81
#endif
#ifndef DEVLINK_ATTR_PARAM_TYPE
#define DEVLINK_ATTR_PARAM_TYPE 83
#endif
#ifndef DEVLINK_ATTR_PARAM_VALUES_LIST
#define DEVLINK_ATTR_PARAM_VALUES_LIST 84
#endif
#ifndef DEVLINK_ATTR_PARAM_VALUE
#define DEVLINK_ATTR_PARAM_VALUE 85
#endif
#ifndef DEVLINK_ATTR_PARAM_VALUE_DATA
#define DEVLINK_ATTR_PARAM_VALUE_DATA 86
#endif
#ifndef DEVLINK_ATTR_PARAM_VALUE_CMODE
#define DEVLINK_ATTR_PARAM_VALUE_CMODE 87
#endif
#ifndef DEVLINK_PARAM_CMODE_DRIVERINIT
#define DEVLINK_PARAM_CMODE_DRIVERINIT 1
#endif
#ifndef DEVLINK_CMD_RELOAD
#define DEVLINK_CMD_RELOAD 37
#endif
#ifndef DEVLINK_CMD_PARAM_GET
#define DEVLINK_CMD_PARAM_GET 38
#endif
#ifndef DEVLINK_CMD_PARAM_SET
#define DEVLINK_CMD_PARAM_SET 39
#endif
/* Netlink attribute type code for flag attributes, when not provided. */
#ifndef NLA_FLAG
#define NLA_FLAG 6
#endif
154
/*
 * Add/remove MAC address through Netlink.
 *
 * Context handed to mlx5_nl_mac_addr_cb(): the callback appends every
 * link-layer address it finds to @p mac and counts them in @p mac_n.
 */
struct mlx5_nl_mac_addr {
	struct rte_ether_addr (*mac)[];
	/**< MAC address handled by the device. */
	int mac_n; /**< Number of addresses in the array. */
};
161
/*
 * Bits of mlx5_nl_port_info::flags. mlx5_nl_cmdget_cb() sets one bit per
 * attribute it found in a message; MLX5_NL_CMD_GET_IB_NAME is set only
 * when the reported device name equals the requested one.
 */
#define MLX5_NL_CMD_GET_IB_NAME (1 << 0)
#define MLX5_NL_CMD_GET_IB_INDEX (1 << 1)
#define MLX5_NL_CMD_GET_NET_INDEX (1 << 2)
#define MLX5_NL_CMD_GET_PORT_INDEX (1 << 3)
#define MLX5_NL_CMD_GET_PORT_STATE (1 << 4)

/**
 * Data structure used by mlx5_nl_cmdget_cb().
 *
 * The caller fills @p name; the output fields are only updated by the
 * callback for a message whose device name matches it.
 */
struct mlx5_nl_port_info {
	const char *name; /**< IB device name (in). */
	uint32_t flags; /**< found attribute flags (out). */
	uint32_t ibindex; /**< IB device index (out). */
	uint32_t ifindex; /**< Network interface index (out). */
	uint32_t portnum; /**< IB device max port number (out). */
	uint16_t state; /**< IB device port state (out). */
};
177
/*
 * Counter the Netlink sequence numbers are drawn from.
 * NOTE(review): it has external linkage; confirm nothing outside this file
 * relies on the symbol before considering making it static.
 */
RTE_ATOMIC(uint32_t) atomic_sn;

/*
 * Generate Netlink sequence number.
 *
 * Atomically increments atomic_sn with relaxed memory ordering. Assuming
 * rte_atomic_fetch_add_explicit() returns the previous value like its C11
 * counterpart, the expression evaluates to the incremented value.
 */
#define MLX5_NL_SN_GENERATE (rte_atomic_fetch_add_explicit(&atomic_sn, 1, \
					rte_memory_order_relaxed) + 1)
183
184 /**
185 * Opens a Netlink socket.
186 *
187 * @param protocol
188 * Netlink protocol (e.g. NETLINK_ROUTE, NETLINK_RDMA).
189 * @param groups
190 * Groups to listen (e.g. RTMGRP_LINK), can be 0.
191 *
192 * @return
193 * A file descriptor on success, a negative errno value otherwise and
194 * rte_errno is set.
195 */
196 int
mlx5_nl_init(int protocol,int groups)197 mlx5_nl_init(int protocol, int groups)
198 {
199 int fd;
200 int buf_size;
201 socklen_t opt_size;
202 struct sockaddr_nl local = {
203 .nl_family = AF_NETLINK,
204 .nl_groups = groups,
205 };
206 int ret;
207
208 fd = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, protocol);
209 if (fd == -1) {
210 rte_errno = errno;
211 return -rte_errno;
212 }
213 opt_size = sizeof(buf_size);
214 ret = getsockopt(fd, SOL_SOCKET, SO_SNDBUF, &buf_size, &opt_size);
215 if (ret == -1) {
216 rte_errno = errno;
217 goto error;
218 }
219 DRV_LOG(DEBUG, "Netlink socket send buffer: %d", buf_size);
220 if (buf_size < MLX5_SEND_BUF_SIZE) {
221 ret = setsockopt(fd, SOL_SOCKET, SO_SNDBUF,
222 &buf_size, sizeof(buf_size));
223 if (ret == -1) {
224 rte_errno = errno;
225 goto error;
226 }
227 }
228 opt_size = sizeof(buf_size);
229 ret = getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &buf_size, &opt_size);
230 if (ret == -1) {
231 rte_errno = errno;
232 goto error;
233 }
234 DRV_LOG(DEBUG, "Netlink socket recv buffer: %d", buf_size);
235 if (buf_size < MLX5_RECV_BUF_SIZE) {
236 ret = setsockopt(fd, SOL_SOCKET, SO_RCVBUF,
237 &buf_size, sizeof(buf_size));
238 if (ret == -1) {
239 rte_errno = errno;
240 goto error;
241 }
242 }
243 ret = bind(fd, (struct sockaddr *)&local, sizeof(local));
244 if (ret == -1) {
245 rte_errno = errno;
246 goto error;
247 }
248 return fd;
249 error:
250 close(fd);
251 return -rte_errno;
252 }
253
254 /**
255 * Send a request message to the kernel on the Netlink socket.
256 *
257 * @param[in] nlsk_fd
258 * Netlink socket file descriptor.
259 * @param[in] nh
260 * The Netlink message send to the kernel.
261 * @param[in] ssn
262 * Sequence number.
263 * @param[in] req
264 * Pointer to the request structure.
265 * @param[in] len
266 * Length of the request in bytes.
267 *
268 * @return
269 * The number of sent bytes on success, a negative errno value otherwise and
270 * rte_errno is set.
271 */
272 static int
mlx5_nl_request(int nlsk_fd,struct nlmsghdr * nh,uint32_t sn,void * req,int len)273 mlx5_nl_request(int nlsk_fd, struct nlmsghdr *nh, uint32_t sn, void *req,
274 int len)
275 {
276 struct sockaddr_nl sa = {
277 .nl_family = AF_NETLINK,
278 };
279 struct iovec iov[2] = {
280 { .iov_base = nh, .iov_len = sizeof(*nh), },
281 { .iov_base = req, .iov_len = len, },
282 };
283 struct msghdr msg = {
284 .msg_name = &sa,
285 .msg_namelen = sizeof(sa),
286 .msg_iov = iov,
287 .msg_iovlen = 2,
288 };
289 int send_bytes;
290
291 nh->nlmsg_pid = 0; /* communication with the kernel uses pid 0 */
292 nh->nlmsg_seq = sn;
293 send_bytes = sendmsg(nlsk_fd, &msg, 0);
294 if (send_bytes < 0) {
295 rte_errno = errno;
296 return -rte_errno;
297 }
298 return send_bytes;
299 }
300
301 /**
302 * Send a message to the kernel on the Netlink socket.
303 *
304 * @param[in] nlsk_fd
305 * The Netlink socket file descriptor used for communication.
306 * @param[in] nh
307 * The Netlink message send to the kernel.
308 * @param[in] sn
309 * Sequence number.
310 *
311 * @return
312 * The number of sent bytes on success, a negative errno value otherwise and
313 * rte_errno is set.
314 */
315 static int
mlx5_nl_send(int nlsk_fd,struct nlmsghdr * nh,uint32_t sn)316 mlx5_nl_send(int nlsk_fd, struct nlmsghdr *nh, uint32_t sn)
317 {
318 struct sockaddr_nl sa = {
319 .nl_family = AF_NETLINK,
320 };
321 struct iovec iov = {
322 .iov_base = nh,
323 .iov_len = nh->nlmsg_len,
324 };
325 struct msghdr msg = {
326 .msg_name = &sa,
327 .msg_namelen = sizeof(sa),
328 .msg_iov = &iov,
329 .msg_iovlen = 1,
330 };
331 int send_bytes;
332
333 nh->nlmsg_pid = 0; /* communication with the kernel uses pid 0 */
334 nh->nlmsg_seq = sn;
335 send_bytes = sendmsg(nlsk_fd, &msg, 0);
336 if (send_bytes < 0) {
337 rte_errno = errno;
338 return -rte_errno;
339 }
340 return send_bytes;
341 }
342
343 /**
344 * Receive a message from the kernel on the Netlink socket, following
345 * mlx5_nl_send().
346 *
347 * @param[in] nlsk_fd
348 * The Netlink socket file descriptor used for communication.
349 * @param[in] sn
350 * Sequence number.
351 * @param[in] cb
352 * The callback function to call for each Netlink message received.
353 * @param[in, out] arg
354 * Custom arguments for the callback.
355 *
356 * @return
357 * 0 on success, a negative errno value otherwise and rte_errno is set.
358 */
359 static int
mlx5_nl_recv(int nlsk_fd,uint32_t sn,int (* cb)(struct nlmsghdr *,void * arg),void * arg)360 mlx5_nl_recv(int nlsk_fd, uint32_t sn, int (*cb)(struct nlmsghdr *, void *arg),
361 void *arg)
362 {
363 struct sockaddr_nl sa;
364 struct iovec iov;
365 struct msghdr msg = {
366 .msg_name = &sa,
367 .msg_namelen = sizeof(sa),
368 .msg_iov = &iov,
369 /* One message at a time */
370 .msg_iovlen = 1,
371 };
372 void *buf = NULL;
373 int multipart = 0;
374 int ret = 0;
375
376 do {
377 struct nlmsghdr *nh;
378 int recv_bytes;
379
380 do {
381 /* Query length of incoming message. */
382 iov.iov_base = NULL;
383 iov.iov_len = 0;
384 recv_bytes = recvmsg(nlsk_fd, &msg,
385 MSG_PEEK | MSG_TRUNC);
386 if (recv_bytes < 0) {
387 rte_errno = errno;
388 ret = -rte_errno;
389 goto exit;
390 }
391 if (recv_bytes == 0) {
392 rte_errno = ENODATA;
393 ret = -rte_errno;
394 goto exit;
395 }
396 /* Allocate buffer to fetch the message. */
397 if (recv_bytes < MLX5_RECV_BUF_SIZE)
398 recv_bytes = MLX5_RECV_BUF_SIZE;
399 mlx5_free(buf);
400 buf = mlx5_malloc(0, recv_bytes, 0, SOCKET_ID_ANY);
401 if (!buf) {
402 rte_errno = ENOMEM;
403 ret = -rte_errno;
404 goto exit;
405 }
406 /* Fetch the message. */
407 iov.iov_base = buf;
408 iov.iov_len = recv_bytes;
409 recv_bytes = recvmsg(nlsk_fd, &msg, 0);
410 if (recv_bytes == -1) {
411 rte_errno = errno;
412 ret = -rte_errno;
413 goto exit;
414 }
415 nh = (struct nlmsghdr *)buf;
416 } while (nh->nlmsg_seq != sn);
417 for (;
418 NLMSG_OK(nh, (unsigned int)recv_bytes);
419 nh = NLMSG_NEXT(nh, recv_bytes)) {
420 if (nh->nlmsg_type == NLMSG_ERROR) {
421 struct nlmsgerr *err_data = NLMSG_DATA(nh);
422
423 if (err_data->error < 0) {
424 rte_errno = -err_data->error;
425 ret = -rte_errno;
426 goto exit;
427 }
428 /* Ack message. */
429 ret = 0;
430 goto exit;
431 }
432 /* Multi-part msgs and their trailing DONE message. */
433 if (nh->nlmsg_flags & NLM_F_MULTI) {
434 if (nh->nlmsg_type == NLMSG_DONE) {
435 ret = 0;
436 goto exit;
437 }
438 multipart = 1;
439 }
440 if (cb) {
441 ret = cb(nh, arg);
442 if (ret < 0)
443 goto exit;
444 }
445 }
446 } while (multipart);
447 exit:
448 mlx5_free(buf);
449 return ret;
450 }
451
452 /**
453 * Parse Netlink message to retrieve the bridge MAC address.
454 *
455 * @param nh
456 * Pointer to Netlink Message Header.
457 * @param arg
458 * PMD data register with this callback.
459 *
460 * @return
461 * 0 on success, a negative errno value otherwise and rte_errno is set.
462 */
463 static int
mlx5_nl_mac_addr_cb(struct nlmsghdr * nh,void * arg)464 mlx5_nl_mac_addr_cb(struct nlmsghdr *nh, void *arg)
465 {
466 struct mlx5_nl_mac_addr *data = arg;
467 struct ndmsg *r = NLMSG_DATA(nh);
468 struct rtattr *attribute;
469 int len;
470
471 len = nh->nlmsg_len - NLMSG_LENGTH(sizeof(*r));
472 for (attribute = MLX5_NDA_RTA(r);
473 RTA_OK(attribute, len);
474 attribute = RTA_NEXT(attribute, len)) {
475 if (attribute->rta_type == NDA_LLADDR) {
476 if (data->mac_n == MLX5_MAX_MAC_ADDRESSES) {
477 DRV_LOG(WARNING,
478 "not enough room to finalize the"
479 " request");
480 rte_errno = ENOMEM;
481 return -rte_errno;
482 }
483 #ifdef RTE_LIBRTE_MLX5_DEBUG
484 char m[RTE_ETHER_ADDR_FMT_SIZE];
485
486 rte_ether_format_addr(m, RTE_ETHER_ADDR_FMT_SIZE,
487 RTA_DATA(attribute));
488 DRV_LOG(DEBUG, "bridge MAC address %s", m);
489 #endif
490 memcpy(&(*data->mac)[data->mac_n++],
491 RTA_DATA(attribute), RTE_ETHER_ADDR_LEN);
492 }
493 }
494 return 0;
495 }
496
497 /**
498 * Get bridge MAC addresses.
499 *
500 * @param[in] nlsk_fd
501 * Netlink socket file descriptor.
502 * @param[in] iface_idx
503 * Net device interface index.
504 * @param mac[out]
505 * Pointer to the array table of MAC addresses to fill.
506 * Its size should be of MLX5_MAX_MAC_ADDRESSES.
507 * @param mac_n[out]
508 * Number of entries filled in MAC array.
509 *
510 * @return
511 * 0 on success, a negative errno value otherwise and rte_errno is set.
512 */
513 static int
mlx5_nl_mac_addr_list(int nlsk_fd,unsigned int iface_idx,struct rte_ether_addr (* mac)[],int * mac_n)514 mlx5_nl_mac_addr_list(int nlsk_fd, unsigned int iface_idx,
515 struct rte_ether_addr (*mac)[], int *mac_n)
516 {
517 struct {
518 struct nlmsghdr hdr;
519 struct ifinfomsg ifm;
520 } req = {
521 .hdr = {
522 .nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
523 .nlmsg_type = RTM_GETNEIGH,
524 .nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST,
525 },
526 .ifm = {
527 .ifi_family = PF_BRIDGE,
528 .ifi_index = iface_idx,
529 },
530 };
531 struct mlx5_nl_mac_addr data = {
532 .mac = mac,
533 .mac_n = 0,
534 };
535 uint32_t sn = MLX5_NL_SN_GENERATE;
536 int ret;
537
538 if (nlsk_fd == -1)
539 return 0;
540 ret = mlx5_nl_request(nlsk_fd, &req.hdr, sn, &req.ifm,
541 sizeof(struct ifinfomsg));
542 if (ret < 0)
543 goto error;
544 ret = mlx5_nl_recv(nlsk_fd, sn, mlx5_nl_mac_addr_cb, &data);
545 if (ret < 0)
546 goto error;
547 *mac_n = data.mac_n;
548 return 0;
549 error:
550 DRV_LOG(DEBUG, "Interface %u cannot retrieve MAC address list %s",
551 iface_idx, strerror(rte_errno));
552 return -rte_errno;
553 }
554
555 /**
556 * Modify the MAC address neighbour table with Netlink.
557 *
558 * @param[in] nlsk_fd
559 * Netlink socket file descriptor.
560 * @param[in] iface_idx
561 * Net device interface index.
562 * @param mac
563 * MAC address to consider.
564 * @param add
565 * 1 to add the MAC address, 0 to remove the MAC address.
566 *
567 * @return
568 * 0 on success, a negative errno value otherwise and rte_errno is set.
569 */
570 static int
mlx5_nl_mac_addr_modify(int nlsk_fd,unsigned int iface_idx,struct rte_ether_addr * mac,int add)571 mlx5_nl_mac_addr_modify(int nlsk_fd, unsigned int iface_idx,
572 struct rte_ether_addr *mac, int add)
573 {
574 struct {
575 struct nlmsghdr hdr;
576 struct ndmsg ndm;
577 struct rtattr rta;
578 uint8_t buffer[RTE_ETHER_ADDR_LEN];
579 } req = {
580 .hdr = {
581 .nlmsg_len = NLMSG_LENGTH(sizeof(struct ndmsg)),
582 .nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE |
583 NLM_F_EXCL | NLM_F_ACK,
584 .nlmsg_type = add ? RTM_NEWNEIGH : RTM_DELNEIGH,
585 },
586 .ndm = {
587 .ndm_family = PF_BRIDGE,
588 .ndm_state = NUD_NOARP | NUD_PERMANENT,
589 .ndm_ifindex = iface_idx,
590 .ndm_flags = NTF_SELF,
591 },
592 .rta = {
593 .rta_type = NDA_LLADDR,
594 .rta_len = RTA_LENGTH(RTE_ETHER_ADDR_LEN),
595 },
596 };
597 uint32_t sn = MLX5_NL_SN_GENERATE;
598 int ret;
599
600 if (nlsk_fd == -1)
601 return 0;
602 memcpy(RTA_DATA(&req.rta), mac, RTE_ETHER_ADDR_LEN);
603 req.hdr.nlmsg_len = NLMSG_ALIGN(req.hdr.nlmsg_len) +
604 RTA_ALIGN(req.rta.rta_len);
605 ret = mlx5_nl_send(nlsk_fd, &req.hdr, sn);
606 if (ret < 0)
607 goto error;
608 ret = mlx5_nl_recv(nlsk_fd, sn, NULL, NULL);
609 if (ret < 0)
610 goto error;
611 return 0;
612 error:
613 #ifdef RTE_LIBRTE_MLX5_DEBUG
614 {
615 char m[RTE_ETHER_ADDR_FMT_SIZE];
616
617 rte_ether_format_addr(m, RTE_ETHER_ADDR_FMT_SIZE, mac);
618 DRV_LOG(DEBUG,
619 "Interface %u cannot %s MAC address %s %s",
620 iface_idx,
621 add ? "add" : "remove", m, strerror(rte_errno));
622 }
623 #endif
624 return -rte_errno;
625 }
626
627 /**
628 * Modify the VF MAC address neighbour table with Netlink.
629 *
630 * @param[in] nlsk_fd
631 * Netlink socket file descriptor.
632 * @param[in] iface_idx
633 * Net device interface index.
634 * @param mac
635 * MAC address to consider.
636 * @param vf_index
637 * VF index.
638 *
639 * @return
640 * 0 on success, a negative errno value otherwise and rte_errno is set.
641 */
642 int
mlx5_nl_vf_mac_addr_modify(int nlsk_fd,unsigned int iface_idx,struct rte_ether_addr * mac,int vf_index)643 mlx5_nl_vf_mac_addr_modify(int nlsk_fd, unsigned int iface_idx,
644 struct rte_ether_addr *mac, int vf_index)
645 {
646 int ret;
647 struct {
648 struct nlmsghdr hdr;
649 struct ifinfomsg ifm;
650 struct rtattr vf_list_rta;
651 struct rtattr vf_info_rta;
652 struct rtattr vf_mac_rta;
653 struct ifla_vf_mac ivm;
654 } req = {
655 .hdr = {
656 .nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
657 .nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK,
658 .nlmsg_type = RTM_BASE,
659 },
660 .ifm = {
661 .ifi_index = iface_idx,
662 },
663 .vf_list_rta = {
664 .rta_type = IFLA_VFINFO_LIST,
665 .rta_len = RTA_ALIGN(RTA_LENGTH(0)),
666 },
667 .vf_info_rta = {
668 .rta_type = IFLA_VF_INFO,
669 .rta_len = RTA_ALIGN(RTA_LENGTH(0)),
670 },
671 .vf_mac_rta = {
672 .rta_type = IFLA_VF_MAC,
673 },
674 };
675 struct ifla_vf_mac ivm = {
676 .vf = vf_index,
677 };
678 uint32_t sn = MLX5_NL_SN_GENERATE;
679
680 memcpy(&ivm.mac, mac, RTE_ETHER_ADDR_LEN);
681 memcpy(RTA_DATA(&req.vf_mac_rta), &ivm, sizeof(ivm));
682
683 req.vf_mac_rta.rta_len = RTA_LENGTH(sizeof(ivm));
684 req.hdr.nlmsg_len = NLMSG_ALIGN(req.hdr.nlmsg_len) +
685 RTA_ALIGN(req.vf_list_rta.rta_len) +
686 RTA_ALIGN(req.vf_info_rta.rta_len) +
687 RTA_ALIGN(req.vf_mac_rta.rta_len);
688 req.vf_list_rta.rta_len = RTE_PTR_DIFF(NLMSG_TAIL(&req.hdr),
689 &req.vf_list_rta);
690 req.vf_info_rta.rta_len = RTE_PTR_DIFF(NLMSG_TAIL(&req.hdr),
691 &req.vf_info_rta);
692
693 if (nlsk_fd < 0)
694 return -1;
695 ret = mlx5_nl_send(nlsk_fd, &req.hdr, sn);
696 if (ret < 0)
697 goto error;
698 ret = mlx5_nl_recv(nlsk_fd, sn, NULL, NULL);
699 if (ret < 0)
700 goto error;
701 return 0;
702 error:
703 DRV_LOG(ERR,
704 "representor %u cannot set VF MAC address "
705 RTE_ETHER_ADDR_PRT_FMT " : %s",
706 vf_index,
707 RTE_ETHER_ADDR_BYTES(mac),
708 strerror(rte_errno));
709 return -rte_errno;
710 }
711
712 /**
713 * Add a MAC address.
714 *
715 * @param[in] nlsk_fd
716 * Netlink socket file descriptor.
717 * @param[in] iface_idx
718 * Net device interface index.
719 * @param mac_own
720 * BITFIELD_DECLARE array to store the mac.
721 * @param mac
722 * MAC address to register.
723 * @param index
724 * MAC address index.
725 *
726 * @return
727 * 0 on success, a negative errno value otherwise and rte_errno is set.
728 */
729 int
mlx5_nl_mac_addr_add(int nlsk_fd,unsigned int iface_idx,uint64_t * mac_own,struct rte_ether_addr * mac,uint32_t index)730 mlx5_nl_mac_addr_add(int nlsk_fd, unsigned int iface_idx,
731 uint64_t *mac_own, struct rte_ether_addr *mac,
732 uint32_t index)
733 {
734 int ret;
735
736 ret = mlx5_nl_mac_addr_modify(nlsk_fd, iface_idx, mac, 1);
737 if (!ret) {
738 MLX5_ASSERT(index < MLX5_MAX_MAC_ADDRESSES);
739 if (index >= MLX5_MAX_MAC_ADDRESSES)
740 return -EINVAL;
741
742 BITFIELD_SET(mac_own, index);
743 }
744 if (ret == -EEXIST)
745 return 0;
746 return ret;
747 }
748
749 /**
750 * Remove a MAC address.
751 *
752 * @param[in] nlsk_fd
753 * Netlink socket file descriptor.
754 * @param[in] iface_idx
755 * Net device interface index.
756 * @param mac_own
757 * BITFIELD_DECLARE array to store the mac.
758 * @param mac
759 * MAC address to remove.
760 * @param index
761 * MAC address index.
762 *
763 * @return
764 * 0 on success, a negative errno value otherwise and rte_errno is set.
765 */
766 int
mlx5_nl_mac_addr_remove(int nlsk_fd,unsigned int iface_idx,uint64_t * mac_own,struct rte_ether_addr * mac,uint32_t index)767 mlx5_nl_mac_addr_remove(int nlsk_fd, unsigned int iface_idx, uint64_t *mac_own,
768 struct rte_ether_addr *mac, uint32_t index)
769 {
770 MLX5_ASSERT(index < MLX5_MAX_MAC_ADDRESSES);
771 if (index >= MLX5_MAX_MAC_ADDRESSES)
772 return -EINVAL;
773
774 BITFIELD_RESET(mac_own, index);
775 return mlx5_nl_mac_addr_modify(nlsk_fd, iface_idx, mac, 0);
776 }
777
778 /**
779 * Synchronize Netlink bridge table to the internal table.
780 *
781 * @param[in] nlsk_fd
782 * Netlink socket file descriptor.
783 * @param[in] iface_idx
784 * Net device interface index.
785 * @param mac_addrs
786 * Mac addresses array to sync.
787 * @param n
788 * @p mac_addrs array size.
789 */
790 void
mlx5_nl_mac_addr_sync(int nlsk_fd,unsigned int iface_idx,struct rte_ether_addr * mac_addrs,int n)791 mlx5_nl_mac_addr_sync(int nlsk_fd, unsigned int iface_idx,
792 struct rte_ether_addr *mac_addrs, int n)
793 {
794 struct rte_ether_addr macs[n];
795 int macs_n = 0;
796 int i;
797 int ret;
798
799 memset(macs, 0, n * sizeof(macs[0]));
800 ret = mlx5_nl_mac_addr_list(nlsk_fd, iface_idx, &macs, &macs_n);
801 if (ret)
802 return;
803 for (i = 0; i != macs_n; ++i) {
804 int j;
805
806 /* Verify the address is not in the array yet. */
807 for (j = 0; j != n; ++j)
808 if (rte_is_same_ether_addr(&macs[i], &mac_addrs[j]))
809 break;
810 if (j != n)
811 continue;
812 if (rte_is_multicast_ether_addr(&macs[i])) {
813 /* Find the first entry available. */
814 for (j = MLX5_MAX_UC_MAC_ADDRESSES; j != n; ++j) {
815 if (rte_is_zero_ether_addr(&mac_addrs[j])) {
816 mac_addrs[j] = macs[i];
817 break;
818 }
819 }
820 } else {
821 /* Find the first entry available. */
822 for (j = 0; j != MLX5_MAX_UC_MAC_ADDRESSES; ++j) {
823 if (rte_is_zero_ether_addr(&mac_addrs[j])) {
824 mac_addrs[j] = macs[i];
825 break;
826 }
827 }
828 }
829 }
830 }
831
832 /**
833 * Flush all added MAC addresses.
834 *
835 * @param[in] nlsk_fd
836 * Netlink socket file descriptor.
837 * @param[in] iface_idx
838 * Net device interface index.
839 * @param[in] mac_addrs
840 * Mac addresses array to flush.
841 * @param n
842 * @p mac_addrs array size.
843 * @param mac_own
844 * BITFIELD_DECLARE array to store the mac.
845 */
846 void
mlx5_nl_mac_addr_flush(int nlsk_fd,unsigned int iface_idx,struct rte_ether_addr * mac_addrs,int n,uint64_t * mac_own)847 mlx5_nl_mac_addr_flush(int nlsk_fd, unsigned int iface_idx,
848 struct rte_ether_addr *mac_addrs, int n,
849 uint64_t *mac_own)
850 {
851 int i;
852
853 if (n <= 0 || n > MLX5_MAX_MAC_ADDRESSES)
854 return;
855
856 for (i = n - 1; i >= 0; --i) {
857 struct rte_ether_addr *m = &mac_addrs[i];
858
859 if (BITFIELD_ISSET(mac_own, i))
860 mlx5_nl_mac_addr_remove(nlsk_fd, iface_idx, mac_own, m,
861 i);
862 }
863 }
864
865 /**
866 * Enable promiscuous / all multicast mode through Netlink.
867 *
868 * @param[in] nlsk_fd
869 * Netlink socket file descriptor.
870 * @param[in] iface_idx
871 * Net device interface index.
872 * @param flags
873 * IFF_PROMISC for promiscuous, IFF_ALLMULTI for allmulti.
874 * @param enable
875 * Nonzero to enable, disable otherwise.
876 *
877 * @return
878 * 0 on success, a negative errno value otherwise and rte_errno is set.
879 */
880 static int
mlx5_nl_device_flags(int nlsk_fd,unsigned int iface_idx,uint32_t flags,int enable)881 mlx5_nl_device_flags(int nlsk_fd, unsigned int iface_idx, uint32_t flags,
882 int enable)
883 {
884 struct {
885 struct nlmsghdr hdr;
886 struct ifinfomsg ifi;
887 } req = {
888 .hdr = {
889 .nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
890 .nlmsg_type = RTM_NEWLINK,
891 .nlmsg_flags = NLM_F_REQUEST,
892 },
893 .ifi = {
894 .ifi_flags = enable ? flags : 0,
895 .ifi_change = flags,
896 .ifi_index = iface_idx,
897 },
898 };
899 uint32_t sn = MLX5_NL_SN_GENERATE;
900 int ret;
901
902 MLX5_ASSERT(!(flags & ~(IFF_PROMISC | IFF_ALLMULTI)));
903 if (nlsk_fd < 0)
904 return 0;
905 ret = mlx5_nl_send(nlsk_fd, &req.hdr, sn);
906 if (ret < 0)
907 return ret;
908 return 0;
909 }
910
911 /**
912 * Enable promiscuous mode through Netlink.
913 *
914 * @param[in] nlsk_fd
915 * Netlink socket file descriptor.
916 * @param[in] iface_idx
917 * Net device interface index.
918 * @param enable
919 * Nonzero to enable, disable otherwise.
920 *
921 * @return
922 * 0 on success, a negative errno value otherwise and rte_errno is set.
923 */
924 int
mlx5_nl_promisc(int nlsk_fd,unsigned int iface_idx,int enable)925 mlx5_nl_promisc(int nlsk_fd, unsigned int iface_idx, int enable)
926 {
927 int ret = mlx5_nl_device_flags(nlsk_fd, iface_idx, IFF_PROMISC, enable);
928
929 if (ret)
930 DRV_LOG(DEBUG,
931 "Interface %u cannot %s promisc mode: Netlink error %s",
932 iface_idx, enable ? "enable" : "disable",
933 strerror(rte_errno));
934 return ret;
935 }
936
937 /**
938 * Enable all multicast mode through Netlink.
939 *
940 * @param[in] nlsk_fd
941 * Netlink socket file descriptor.
942 * @param[in] iface_idx
943 * Net device interface index.
944 * @param enable
945 * Nonzero to enable, disable otherwise.
946 *
947 * @return
948 * 0 on success, a negative errno value otherwise and rte_errno is set.
949 */
950 int
mlx5_nl_allmulti(int nlsk_fd,unsigned int iface_idx,int enable)951 mlx5_nl_allmulti(int nlsk_fd, unsigned int iface_idx, int enable)
952 {
953 int ret = mlx5_nl_device_flags(nlsk_fd, iface_idx, IFF_ALLMULTI,
954 enable);
955
956 if (ret)
957 DRV_LOG(DEBUG,
958 "Interface %u cannot %s allmulti : Netlink error %s",
959 iface_idx, enable ? "enable" : "disable",
960 strerror(rte_errno));
961 return ret;
962 }
963
964 /**
965 * Process network interface information from Netlink message.
966 *
967 * @param nh
968 * Pointer to Netlink message header.
969 * @param arg
970 * Opaque data pointer for this callback.
971 *
972 * @return
973 * 0 on success, a negative errno value otherwise and rte_errno is set.
974 */
975 static int
mlx5_nl_cmdget_cb(struct nlmsghdr * nh,void * arg)976 mlx5_nl_cmdget_cb(struct nlmsghdr *nh, void *arg)
977 {
978 struct mlx5_nl_port_info *data = arg;
979 struct mlx5_nl_port_info local = {
980 .flags = 0,
981 };
982 size_t off = NLMSG_HDRLEN;
983
984 if (nh->nlmsg_type !=
985 RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_GET) &&
986 nh->nlmsg_type !=
987 RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_PORT_GET))
988 goto error;
989 while (off < nh->nlmsg_len) {
990 struct nlattr *na = (void *)((uintptr_t)nh + off);
991 void *payload = (void *)((uintptr_t)na + NLA_HDRLEN);
992
993 if (na->nla_len > nh->nlmsg_len - off)
994 goto error;
995 switch (na->nla_type) {
996 case RDMA_NLDEV_ATTR_DEV_INDEX:
997 local.ibindex = *(uint32_t *)payload;
998 local.flags |= MLX5_NL_CMD_GET_IB_INDEX;
999 break;
1000 case RDMA_NLDEV_ATTR_DEV_NAME:
1001 if (!strcmp(payload, data->name))
1002 local.flags |= MLX5_NL_CMD_GET_IB_NAME;
1003 break;
1004 case RDMA_NLDEV_ATTR_NDEV_INDEX:
1005 local.ifindex = *(uint32_t *)payload;
1006 local.flags |= MLX5_NL_CMD_GET_NET_INDEX;
1007 break;
1008 case RDMA_NLDEV_ATTR_PORT_INDEX:
1009 local.portnum = *(uint32_t *)payload;
1010 local.flags |= MLX5_NL_CMD_GET_PORT_INDEX;
1011 break;
1012 case RDMA_NLDEV_ATTR_PORT_STATE:
1013 local.state = *(uint8_t *)payload;
1014 local.flags |= MLX5_NL_CMD_GET_PORT_STATE;
1015 break;
1016 default:
1017 break;
1018 }
1019 off += NLA_ALIGN(na->nla_len);
1020 }
1021 /*
1022 * It is possible to have multiple messages for all
1023 * Infiniband devices in the system with appropriate name.
1024 * So we should gather parameters locally and copy to
1025 * query context only in case of coinciding device name.
1026 */
1027 if (local.flags & MLX5_NL_CMD_GET_IB_NAME) {
1028 data->flags = local.flags;
1029 data->ibindex = local.ibindex;
1030 data->ifindex = local.ifindex;
1031 data->portnum = local.portnum;
1032 data->state = local.state;
1033 }
1034 return 0;
1035 error:
1036 rte_errno = EINVAL;
1037 return -rte_errno;
1038 }
1039
1040 /**
1041 * Get port info of network interface associated with some IB device.
1042 *
1043 * This is the only somewhat safe method to avoid resorting to heuristics
1044 * when faced with port representors. Unfortunately it requires at least
1045 * Linux 4.17.
1046 *
1047 * @param nl
1048 * Netlink socket of the RDMA kind (NETLINK_RDMA).
1049 * @param[in] pindex
1050 * IB device port index, starting from 1
1051 * @param[out] data
1052 * Pointer to port info.
1053 * @return
1054 * 0 on success, negative on error and rte_errno is set.
1055 */
1056 static int
mlx5_nl_port_info(int nl,uint32_t pindex,struct mlx5_nl_port_info * data)1057 mlx5_nl_port_info(int nl, uint32_t pindex, struct mlx5_nl_port_info *data)
1058 {
1059 union {
1060 struct nlmsghdr nh;
1061 uint8_t buf[NLMSG_HDRLEN +
1062 NLA_HDRLEN + NLA_ALIGN(sizeof(data->ibindex)) +
1063 NLA_HDRLEN + NLA_ALIGN(sizeof(pindex))];
1064 } req = {
1065 .nh = {
1066 .nlmsg_len = NLMSG_LENGTH(0),
1067 .nlmsg_type = RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
1068 RDMA_NLDEV_CMD_GET),
1069 .nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_DUMP,
1070 },
1071 };
1072 struct nlattr *na;
1073 uint32_t sn = MLX5_NL_SN_GENERATE;
1074 int ret;
1075
1076 ret = mlx5_nl_send(nl, &req.nh, sn);
1077 if (ret < 0)
1078 return ret;
1079 ret = mlx5_nl_recv(nl, sn, mlx5_nl_cmdget_cb, data);
1080 if (ret < 0)
1081 return ret;
1082 if (!(data->flags & MLX5_NL_CMD_GET_IB_NAME) ||
1083 !(data->flags & MLX5_NL_CMD_GET_IB_INDEX))
1084 goto error;
1085 data->flags = 0;
1086 sn = MLX5_NL_SN_GENERATE;
1087 req.nh.nlmsg_type = RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
1088 RDMA_NLDEV_CMD_PORT_GET);
1089 req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
1090 req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(req.buf) - NLMSG_HDRLEN);
1091 na = (void *)((uintptr_t)req.buf + NLMSG_HDRLEN);
1092 na->nla_len = NLA_HDRLEN + sizeof(data->ibindex);
1093 na->nla_type = RDMA_NLDEV_ATTR_DEV_INDEX;
1094 memcpy((void *)((uintptr_t)na + NLA_HDRLEN),
1095 &data->ibindex, sizeof(data->ibindex));
1096 na = (void *)((uintptr_t)na + NLA_ALIGN(na->nla_len));
1097 na->nla_len = NLA_HDRLEN + sizeof(pindex);
1098 na->nla_type = RDMA_NLDEV_ATTR_PORT_INDEX;
1099 memcpy((void *)((uintptr_t)na + NLA_HDRLEN),
1100 &pindex, sizeof(pindex));
1101 ret = mlx5_nl_send(nl, &req.nh, sn);
1102 if (ret < 0)
1103 return ret;
1104 ret = mlx5_nl_recv(nl, sn, mlx5_nl_cmdget_cb, data);
1105 if (ret < 0)
1106 return ret;
1107 if (!(data->flags & MLX5_NL_CMD_GET_IB_NAME) ||
1108 !(data->flags & MLX5_NL_CMD_GET_IB_INDEX) ||
1109 !(data->flags & MLX5_NL_CMD_GET_NET_INDEX) ||
1110 !data->ifindex)
1111 goto error;
1112 return 1;
1113 error:
1114 rte_errno = ENODEV;
1115 return -rte_errno;
1116 }
1117
1118 /**
1119 * Get index of network interface associated with some IB device.
1120 *
1121 * This is the only somewhat safe method to avoid resorting to heuristics
1122 * when faced with port representors. Unfortunately it requires at least
1123 * Linux 4.17.
1124 *
1125 * @param nl
1126 * Netlink socket of the RDMA kind (NETLINK_RDMA).
1127 * @param[in] name
1128 * IB device name.
1129 * @param[in] pindex
1130 * IB device port index, starting from 1
1131 * @return
1132 * A valid (nonzero) interface index on success, 0 otherwise and rte_errno
1133 * is set.
1134 */
1135 unsigned int
mlx5_nl_ifindex(int nl,const char * name,uint32_t pindex)1136 mlx5_nl_ifindex(int nl, const char *name, uint32_t pindex)
1137 {
1138 struct mlx5_nl_port_info data = {
1139 .ifindex = 0,
1140 .name = name,
1141 };
1142
1143 if (mlx5_nl_port_info(nl, pindex, &data) < 0)
1144 return 0;
1145 return data.ifindex;
1146 }
1147
1148 /**
1149 * Get IB device port state.
1150 *
1151 * This is the only somewhat safe method to get info for port number >= 255.
1152 * Unfortunately it requires at least Linux 4.17.
1153 *
1154 * @param nl
1155 * Netlink socket of the RDMA kind (NETLINK_RDMA).
1156 * @param[in] name
1157 * IB device name.
1158 * @param[in] pindex
1159 * IB device port index, starting from 1
1160 * @return
1161 * Port state (ibv_port_state) on success, negative on error
1162 * and rte_errno is set.
1163 */
1164 int
mlx5_nl_port_state(int nl,const char * name,uint32_t pindex)1165 mlx5_nl_port_state(int nl, const char *name, uint32_t pindex)
1166 {
1167 struct mlx5_nl_port_info data = {
1168 .state = 0,
1169 .name = name,
1170 };
1171
1172 if (mlx5_nl_port_info(nl, pindex, &data) < 0)
1173 return -rte_errno;
1174 if ((data.flags & MLX5_NL_CMD_GET_PORT_STATE) == 0) {
1175 rte_errno = ENOTSUP;
1176 return -rte_errno;
1177 }
1178 return (int)data.state;
1179 }
1180
1181 /**
1182 * Get the number of physical ports of given IB device.
1183 *
1184 * @param nl
1185 * Netlink socket of the RDMA kind (NETLINK_RDMA).
1186 * @param[in] name
1187 * IB device name.
1188 *
1189 * @return
1190 * A valid (nonzero) number of ports on success, 0 otherwise
1191 * and rte_errno is set.
1192 */
1193 unsigned int
mlx5_nl_portnum(int nl,const char * name)1194 mlx5_nl_portnum(int nl, const char *name)
1195 {
1196 struct mlx5_nl_port_info data = {
1197 .flags = 0,
1198 .name = name,
1199 .ifindex = 0,
1200 .portnum = 0,
1201 };
1202 struct nlmsghdr req = {
1203 .nlmsg_len = NLMSG_LENGTH(0),
1204 .nlmsg_type = RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
1205 RDMA_NLDEV_CMD_GET),
1206 .nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_DUMP,
1207 };
1208 uint32_t sn = MLX5_NL_SN_GENERATE;
1209 int ret;
1210
1211 ret = mlx5_nl_send(nl, &req, sn);
1212 if (ret < 0)
1213 return 0;
1214 ret = mlx5_nl_recv(nl, sn, mlx5_nl_cmdget_cb, &data);
1215 if (ret < 0)
1216 return 0;
1217 if (!(data.flags & MLX5_NL_CMD_GET_IB_NAME) ||
1218 !(data.flags & MLX5_NL_CMD_GET_IB_INDEX) ||
1219 !(data.flags & MLX5_NL_CMD_GET_PORT_INDEX)) {
1220 rte_errno = ENODEV;
1221 return 0;
1222 }
1223 if (!data.portnum)
1224 rte_errno = EINVAL;
1225 return data.portnum;
1226 }
1227
1228 /**
1229 * Analyze gathered port parameters via Netlink to recognize master
1230 * and representor devices for E-Switch configuration.
1231 *
1232 * @param[in] num_vf_set
1233 * flag of presence of number of VFs port attribute.
1234 * @param[inout] switch_info
1235 * Port information, including port name as a number and port name
1236 * type if recognized
1237 *
1238 * @return
1239 * master and representor flags are set in switch_info according to
1240 * recognized parameters (if any).
1241 */
1242 static void
mlx5_nl_check_switch_info(bool num_vf_set,struct mlx5_switch_info * switch_info)1243 mlx5_nl_check_switch_info(bool num_vf_set,
1244 struct mlx5_switch_info *switch_info)
1245 {
1246 switch (switch_info->name_type) {
1247 case MLX5_PHYS_PORT_NAME_TYPE_UNKNOWN:
1248 /*
1249 * Name is not recognized, assume the master,
1250 * check the number of VFs key presence.
1251 */
1252 switch_info->master = num_vf_set;
1253 break;
1254 case MLX5_PHYS_PORT_NAME_TYPE_NOTSET:
1255 /*
1256 * Name is not set, this assumes the legacy naming
1257 * schema for master, just check if there is a
1258 * number of VFs key.
1259 */
1260 switch_info->master = num_vf_set;
1261 break;
1262 case MLX5_PHYS_PORT_NAME_TYPE_UPLINK:
1263 /* New uplink naming schema recognized. */
1264 switch_info->master = 1;
1265 break;
1266 case MLX5_PHYS_PORT_NAME_TYPE_LEGACY:
1267 /* Legacy representors naming schema. */
1268 switch_info->representor = !num_vf_set;
1269 break;
1270 case MLX5_PHYS_PORT_NAME_TYPE_PFHPF:
1271 /* Fallthrough */
1272 case MLX5_PHYS_PORT_NAME_TYPE_PFVF:
1273 /* Fallthrough */
1274 case MLX5_PHYS_PORT_NAME_TYPE_PFSF:
1275 /* New representors naming schema. */
1276 switch_info->representor = 1;
1277 break;
1278 }
1279 }
1280
1281 /**
1282 * Process switch information from Netlink message.
1283 *
1284 * @param nh
1285 * Pointer to Netlink message header.
1286 * @param arg
1287 * Opaque data pointer for this callback.
1288 *
1289 * @return
1290 * 0 on success, a negative errno value otherwise and rte_errno is set.
1291 */
1292 static int
mlx5_nl_switch_info_cb(struct nlmsghdr * nh,void * arg)1293 mlx5_nl_switch_info_cb(struct nlmsghdr *nh, void *arg)
1294 {
1295 struct mlx5_switch_info info = {
1296 .master = 0,
1297 .representor = 0,
1298 .name_type = MLX5_PHYS_PORT_NAME_TYPE_NOTSET,
1299 .port_name = 0,
1300 .switch_id = 0,
1301 };
1302 size_t off = NLMSG_LENGTH(sizeof(struct ifinfomsg));
1303 bool switch_id_set = false;
1304 bool num_vf_set = false;
1305 int len;
1306
1307 if (nh->nlmsg_type != RTM_NEWLINK)
1308 goto error;
1309 while (off < nh->nlmsg_len) {
1310 struct rtattr *ra = (void *)((uintptr_t)nh + off);
1311 void *payload = RTA_DATA(ra);
1312 unsigned int i;
1313
1314 if (ra->rta_len > nh->nlmsg_len - off)
1315 goto error;
1316 switch (ra->rta_type) {
1317 case IFLA_NUM_VF:
1318 num_vf_set = true;
1319 break;
1320 case IFLA_PHYS_PORT_NAME:
1321 len = RTA_PAYLOAD(ra);
1322 /* Some kernels do not pad attributes with zero. */
1323 if (len > 0 && len < MLX5_PHYS_PORT_NAME_MAX) {
1324 char name[MLX5_PHYS_PORT_NAME_MAX];
1325
1326 /*
1327 * We can't just patch the message with padding
1328 * zero - it might corrupt the following items
1329 * in the message, we have to copy the string
1330 * by attribute length and pad the copied one.
1331 */
1332 memcpy(name, payload, len);
1333 name[len] = 0;
1334 mlx5_translate_port_name(name, &info);
1335 } else {
1336 info.name_type =
1337 MLX5_PHYS_PORT_NAME_TYPE_UNKNOWN;
1338 }
1339 break;
1340 case IFLA_PHYS_SWITCH_ID:
1341 info.switch_id = 0;
1342 for (i = 0; i < RTA_PAYLOAD(ra); ++i) {
1343 info.switch_id <<= 8;
1344 info.switch_id |= ((uint8_t *)payload)[i];
1345 }
1346 switch_id_set = true;
1347 break;
1348 }
1349 off += RTA_ALIGN(ra->rta_len);
1350 }
1351 if (switch_id_set) {
1352 /* We have some E-Switch configuration. */
1353 mlx5_nl_check_switch_info(num_vf_set, &info);
1354 }
1355 MLX5_ASSERT(!(info.master && info.representor));
1356 memcpy(arg, &info, sizeof(info));
1357 return 0;
1358 error:
1359 rte_errno = EINVAL;
1360 return -rte_errno;
1361 }
1362
1363 /**
1364 * Get switch information associated with network interface.
1365 *
1366 * @param nl
1367 * Netlink socket of the ROUTE kind (NETLINK_ROUTE).
1368 * @param ifindex
1369 * Network interface index.
1370 * @param[out] info
1371 * Switch information object, populated in case of success.
1372 *
1373 * @return
1374 * 0 on success, a negative errno value otherwise and rte_errno is set.
1375 */
1376 int
mlx5_nl_switch_info(int nl,unsigned int ifindex,struct mlx5_switch_info * info)1377 mlx5_nl_switch_info(int nl, unsigned int ifindex,
1378 struct mlx5_switch_info *info)
1379 {
1380 struct {
1381 struct nlmsghdr nh;
1382 struct ifinfomsg info;
1383 struct rtattr rta;
1384 uint32_t extmask;
1385 } req = {
1386 .nh = {
1387 .nlmsg_len = NLMSG_LENGTH
1388 (sizeof(req.info) +
1389 RTA_LENGTH(sizeof(uint32_t))),
1390 .nlmsg_type = RTM_GETLINK,
1391 .nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK,
1392 },
1393 .info = {
1394 .ifi_family = AF_UNSPEC,
1395 .ifi_index = ifindex,
1396 },
1397 .rta = {
1398 .rta_type = IFLA_EXT_MASK,
1399 .rta_len = RTA_LENGTH(sizeof(int32_t)),
1400 },
1401 .extmask = RTE_LE32(1),
1402 };
1403 uint32_t sn = MLX5_NL_SN_GENERATE;
1404 int ret;
1405
1406 ret = mlx5_nl_send(nl, &req.nh, sn);
1407 if (ret >= 0)
1408 ret = mlx5_nl_recv(nl, sn, mlx5_nl_switch_info_cb, info);
1409 if (info->master && info->representor) {
1410 DRV_LOG(ERR, "ifindex %u device is recognized as master"
1411 " and as representor", ifindex);
1412 rte_errno = ENODEV;
1413 ret = -rte_errno;
1414 }
1415 return ret;
1416 }
1417
1418 /*
1419 * Delete VLAN network device by ifindex.
1420 *
1421 * @param[in] tcf
1422 * Context object initialized by mlx5_nl_vlan_vmwa_init().
1423 * @param[in] ifindex
1424 * Interface index of network device to delete.
1425 */
1426 void
mlx5_nl_vlan_vmwa_delete(struct mlx5_nl_vlan_vmwa_context * vmwa,uint32_t ifindex)1427 mlx5_nl_vlan_vmwa_delete(struct mlx5_nl_vlan_vmwa_context *vmwa,
1428 uint32_t ifindex)
1429 {
1430 uint32_t sn = MLX5_NL_SN_GENERATE;
1431 int ret;
1432 struct {
1433 struct nlmsghdr nh;
1434 struct ifinfomsg info;
1435 } req = {
1436 .nh = {
1437 .nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
1438 .nlmsg_type = RTM_DELLINK,
1439 .nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK,
1440 },
1441 .info = {
1442 .ifi_family = AF_UNSPEC,
1443 .ifi_index = ifindex,
1444 },
1445 };
1446
1447 if (ifindex) {
1448 ret = mlx5_nl_send(vmwa->nl_socket, &req.nh, sn);
1449 if (ret >= 0)
1450 ret = mlx5_nl_recv(vmwa->nl_socket, sn, NULL, NULL);
1451 if (ret < 0)
1452 DRV_LOG(WARNING, "netlink: error deleting VLAN WA"
1453 " ifindex %u, %d", ifindex, ret);
1454 }
1455 }
1456
/* Set of subroutines to build Netlink message. */

/*
 * Return the address located at the aligned end of the message, i.e.
 * where the next item appended to the message has to be written.
 */
static struct nlattr *
nl_msg_tail(struct nlmsghdr *nlh)
{
	uint8_t *base = (uint8_t *)nlh;
	uint32_t used = NLMSG_ALIGN(nlh->nlmsg_len);

	return (struct nlattr *)(base + used);
}
1464
/*
 * Append an attribute of the given type with alen bytes of payload to
 * the message and grow the message length accordingly. The caller is
 * responsible for providing a buffer large enough for the attribute.
 */
static void
nl_attr_put(struct nlmsghdr *nlh, int type, const void *data, int alen)
{
	struct nlattr *attr = nl_msg_tail(nlh);
	uint8_t *payload = (uint8_t *)attr + sizeof(*attr);

	attr->nla_type = type;
	attr->nla_len = NLMSG_ALIGN(sizeof(*attr)) + alen;
	/* The message grows by the padded attribute size. */
	nlh->nlmsg_len += NLMSG_ALIGN(attr->nla_len);

	if (alen != 0)
		memcpy(payload, data, alen);
}
1477
/*
 * Open a nested attribute: append an empty attribute of the given type
 * and return its address so that nl_attr_nest_end() can fix its length.
 */
static struct nlattr *
nl_attr_nest_start(struct nlmsghdr *nlh, int type)
{
	/* Remember where the nest header lands before appending it. */
	struct nlattr *start = nl_msg_tail(nlh);

	nl_attr_put(nlh, type, NULL, 0);
	return start;
}
1486
/*
 * Close a nested attribute: its length becomes the distance from the
 * nest header to the current tail of the message.
 */
static void
nl_attr_nest_end(struct nlmsghdr *nlh, struct nlattr *nest)
{
	const uint8_t *end = (const uint8_t *)nl_msg_tail(nlh);
	const uint8_t *begin = (const uint8_t *)nest;

	nest->nla_len = end - begin;
}
1492
1493 /*
1494 * Create network VLAN device with specified VLAN tag.
1495 *
1496 * @param[in] tcf
1497 * Context object initialized by mlx5_nl_vlan_vmwa_init().
1498 * @param[in] ifindex
1499 * Base network interface index.
1500 * @param[in] tag
1501 * VLAN tag for VLAN network device to create.
1502 */
1503 uint32_t
mlx5_nl_vlan_vmwa_create(struct mlx5_nl_vlan_vmwa_context * vmwa,uint32_t ifindex,uint16_t tag)1504 mlx5_nl_vlan_vmwa_create(struct mlx5_nl_vlan_vmwa_context *vmwa,
1505 uint32_t ifindex, uint16_t tag)
1506 {
1507 struct nlmsghdr *nlh;
1508 struct ifinfomsg *ifm;
1509 char name[sizeof(MLX5_VMWA_VLAN_DEVICE_PFX) + 32];
1510
1511 alignas(RTE_CACHE_LINE_SIZE)
1512 uint8_t buf[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
1513 NLMSG_ALIGN(sizeof(struct ifinfomsg)) +
1514 NLMSG_ALIGN(sizeof(struct nlattr)) * 8 +
1515 NLMSG_ALIGN(sizeof(uint32_t)) +
1516 NLMSG_ALIGN(sizeof(name)) +
1517 NLMSG_ALIGN(sizeof("vlan")) +
1518 NLMSG_ALIGN(sizeof(uint32_t)) +
1519 NLMSG_ALIGN(sizeof(uint16_t)) + 16];
1520 struct nlattr *na_info;
1521 struct nlattr *na_vlan;
1522 uint32_t sn = MLX5_NL_SN_GENERATE;
1523 int ret;
1524
1525 memset(buf, 0, sizeof(buf));
1526 nlh = (struct nlmsghdr *)buf;
1527 nlh->nlmsg_len = sizeof(struct nlmsghdr);
1528 nlh->nlmsg_type = RTM_NEWLINK;
1529 nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE |
1530 NLM_F_EXCL | NLM_F_ACK;
1531 ifm = (struct ifinfomsg *)nl_msg_tail(nlh);
1532 nlh->nlmsg_len += sizeof(struct ifinfomsg);
1533 ifm->ifi_family = AF_UNSPEC;
1534 ifm->ifi_type = 0;
1535 ifm->ifi_index = 0;
1536 ifm->ifi_flags = IFF_UP;
1537 ifm->ifi_change = 0xffffffff;
1538 nl_attr_put(nlh, IFLA_LINK, &ifindex, sizeof(ifindex));
1539 ret = snprintf(name, sizeof(name), "%s.%u.%u",
1540 MLX5_VMWA_VLAN_DEVICE_PFX, ifindex, tag);
1541 nl_attr_put(nlh, IFLA_IFNAME, name, ret + 1);
1542 na_info = nl_attr_nest_start(nlh, IFLA_LINKINFO);
1543 nl_attr_put(nlh, IFLA_INFO_KIND, "vlan", sizeof("vlan"));
1544 na_vlan = nl_attr_nest_start(nlh, IFLA_INFO_DATA);
1545 nl_attr_put(nlh, IFLA_VLAN_ID, &tag, sizeof(tag));
1546 nl_attr_nest_end(nlh, na_vlan);
1547 nl_attr_nest_end(nlh, na_info);
1548 MLX5_ASSERT(sizeof(buf) >= nlh->nlmsg_len);
1549 ret = mlx5_nl_send(vmwa->nl_socket, nlh, sn);
1550 if (ret >= 0)
1551 ret = mlx5_nl_recv(vmwa->nl_socket, sn, NULL, NULL);
1552 if (ret < 0) {
1553 DRV_LOG(WARNING, "netlink: VLAN %s create failure (%d)", name,
1554 ret);
1555 }
1556 /* Try to get ifindex of created or pre-existing device. */
1557 ret = if_nametoindex(name);
1558 if (!ret) {
1559 DRV_LOG(WARNING, "VLAN %s failed to get index (%d)", name,
1560 errno);
1561 return 0;
1562 }
1563 return ret;
1564 }
1565
1566 /**
1567 * Parse Netlink message to retrieve the general family ID.
1568 *
1569 * @param nh
1570 * Pointer to Netlink Message Header.
1571 * @param arg
1572 * PMD data register with this callback.
1573 *
1574 * @return
1575 * 0 on success, a negative errno value otherwise and rte_errno is set.
1576 */
1577 static int
mlx5_nl_family_id_cb(struct nlmsghdr * nh,void * arg)1578 mlx5_nl_family_id_cb(struct nlmsghdr *nh, void *arg)
1579 {
1580
1581 struct nlattr *tail = RTE_PTR_ADD(nh, nh->nlmsg_len);
1582 struct nlattr *nla = RTE_PTR_ADD(nh, NLMSG_ALIGN(sizeof(*nh)) +
1583 NLMSG_ALIGN(sizeof(struct genlmsghdr)));
1584
1585 for (; nla->nla_len && nla < tail;
1586 nla = RTE_PTR_ADD(nla, NLMSG_ALIGN(nla->nla_len))) {
1587 if (nla->nla_type == CTRL_ATTR_FAMILY_ID) {
1588 *(uint16_t *)arg = *(uint16_t *)(nla + 1);
1589 return 0;
1590 }
1591 }
1592 return -EINVAL;
1593 }
1594
1595 #define MLX5_NL_MAX_ATTR_SIZE 100
1596 /**
1597 * Get generic netlink family ID.
1598 *
1599 * @param[in] nlsk_fd
1600 * Netlink socket file descriptor.
1601 * @param[in] name
1602 * The family name.
1603 *
1604 * @return
1605 * ID >= 0 on success and @p enable is updated, a negative errno value
1606 * otherwise and rte_errno is set.
1607 */
1608 static int
mlx5_nl_generic_family_id_get(int nlsk_fd,const char * name)1609 mlx5_nl_generic_family_id_get(int nlsk_fd, const char *name)
1610 {
1611 struct nlmsghdr *nlh;
1612 struct genlmsghdr *genl;
1613 uint32_t sn = MLX5_NL_SN_GENERATE;
1614 int name_size = strlen(name) + 1;
1615 int ret;
1616 uint16_t id = -1;
1617 uint8_t buf[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
1618 NLMSG_ALIGN(sizeof(struct genlmsghdr)) +
1619 NLMSG_ALIGN(sizeof(struct nlattr)) +
1620 NLMSG_ALIGN(MLX5_NL_MAX_ATTR_SIZE)];
1621
1622 memset(buf, 0, sizeof(buf));
1623 nlh = (struct nlmsghdr *)buf;
1624 nlh->nlmsg_len = sizeof(struct nlmsghdr);
1625 nlh->nlmsg_type = GENL_ID_CTRL;
1626 nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
1627 genl = (struct genlmsghdr *)nl_msg_tail(nlh);
1628 nlh->nlmsg_len += sizeof(struct genlmsghdr);
1629 genl->cmd = CTRL_CMD_GETFAMILY;
1630 genl->version = 1;
1631 nl_attr_put(nlh, CTRL_ATTR_FAMILY_NAME, name, name_size);
1632 ret = mlx5_nl_send(nlsk_fd, nlh, sn);
1633 if (ret >= 0)
1634 ret = mlx5_nl_recv(nlsk_fd, sn, mlx5_nl_family_id_cb, &id);
1635 if (ret < 0) {
1636 DRV_LOG(DEBUG, "Failed to get Netlink %s family ID: %d.", name,
1637 ret);
1638 return ret;
1639 }
1640 DRV_LOG(DEBUG, "Netlink \"%s\" family ID is %u.", name, id);
1641 return (int)id;
1642 }
1643
/**
 * Get Devlink family ID.
 *
 * Thin wrapper resolving the generic Netlink family registered under
 * DEVLINK_GENL_NAME.
 *
 * @param[in] nlsk_fd
 *   Netlink socket file descriptor.
 *
 * @return
 *   ID >= 0 on success, a negative errno value otherwise and rte_errno
 *   is set.
 */

int
mlx5_nl_devlink_family_id_get(int nlsk_fd)
{
	const char *family = DEVLINK_GENL_NAME;

	return mlx5_nl_generic_family_id_get(nlsk_fd, family);
}
1660
1661 /**
1662 * Parse Netlink message to retrieve the ROCE enable status.
1663 *
1664 * @param nh
1665 * Pointer to Netlink Message Header.
1666 * @param arg
1667 * PMD data register with this callback.
1668 *
1669 * @return
1670 * 0 on success, a negative errno value otherwise and rte_errno is set.
1671 */
1672 static int
mlx5_nl_roce_cb(struct nlmsghdr * nh,void * arg)1673 mlx5_nl_roce_cb(struct nlmsghdr *nh, void *arg)
1674 {
1675
1676 int ret = -EINVAL;
1677 int *enable = arg;
1678 struct nlattr *tail = RTE_PTR_ADD(nh, nh->nlmsg_len);
1679 struct nlattr *nla = RTE_PTR_ADD(nh, NLMSG_ALIGN(sizeof(*nh)) +
1680 NLMSG_ALIGN(sizeof(struct genlmsghdr)));
1681
1682 while (nla->nla_len && nla < tail) {
1683 switch (nla->nla_type) {
1684 /* Expected nested attributes case. */
1685 case DEVLINK_ATTR_PARAM:
1686 case DEVLINK_ATTR_PARAM_VALUES_LIST:
1687 case DEVLINK_ATTR_PARAM_VALUE:
1688 ret = 0;
1689 nla += 1;
1690 break;
1691 case DEVLINK_ATTR_PARAM_VALUE_DATA:
1692 *enable = 1;
1693 return 0;
1694 default:
1695 nla = RTE_PTR_ADD(nla, NLMSG_ALIGN(nla->nla_len));
1696 }
1697 }
1698 *enable = 0;
1699 return ret;
1700 }
1701
1702 /**
1703 * Get ROCE enable status through Netlink.
1704 *
1705 * @param[in] nlsk_fd
1706 * Netlink socket file descriptor.
1707 * @param[in] family_id
1708 * the Devlink family ID.
1709 * @param pci_addr
1710 * The device PCI address.
1711 * @param[out] enable
1712 * Where to store the enable status.
1713 *
1714 * @return
1715 * 0 on success and @p enable is updated, a negative errno value otherwise
1716 * and rte_errno is set.
1717 */
1718 int
mlx5_nl_enable_roce_get(int nlsk_fd,int family_id,const char * pci_addr,int * enable)1719 mlx5_nl_enable_roce_get(int nlsk_fd, int family_id, const char *pci_addr,
1720 int *enable)
1721 {
1722 struct nlmsghdr *nlh;
1723 struct genlmsghdr *genl;
1724 uint32_t sn = MLX5_NL_SN_GENERATE;
1725 int ret;
1726 int cur_en = 0;
1727 uint8_t buf[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
1728 NLMSG_ALIGN(sizeof(struct genlmsghdr)) +
1729 NLMSG_ALIGN(sizeof(struct nlattr)) * 4 +
1730 NLMSG_ALIGN(MLX5_NL_MAX_ATTR_SIZE) * 4];
1731
1732 memset(buf, 0, sizeof(buf));
1733 nlh = (struct nlmsghdr *)buf;
1734 nlh->nlmsg_len = sizeof(struct nlmsghdr);
1735 nlh->nlmsg_type = family_id;
1736 nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
1737 genl = (struct genlmsghdr *)nl_msg_tail(nlh);
1738 nlh->nlmsg_len += sizeof(struct genlmsghdr);
1739 genl->cmd = DEVLINK_CMD_PARAM_GET;
1740 genl->version = DEVLINK_GENL_VERSION;
1741 nl_attr_put(nlh, DEVLINK_ATTR_BUS_NAME, "pci", 4);
1742 nl_attr_put(nlh, DEVLINK_ATTR_DEV_NAME, pci_addr, strlen(pci_addr) + 1);
1743 nl_attr_put(nlh, DEVLINK_ATTR_PARAM_NAME, "enable_roce", 12);
1744 ret = mlx5_nl_send(nlsk_fd, nlh, sn);
1745 if (ret >= 0)
1746 ret = mlx5_nl_recv(nlsk_fd, sn, mlx5_nl_roce_cb, &cur_en);
1747 if (ret < 0) {
1748 DRV_LOG(DEBUG, "Failed to get ROCE enable on device %s: %d.",
1749 pci_addr, ret);
1750 return ret;
1751 }
1752 *enable = cur_en;
1753 DRV_LOG(DEBUG, "ROCE is %sabled for device \"%s\".",
1754 cur_en ? "en" : "dis", pci_addr);
1755 return ret;
1756 }
1757
1758 /**
1759 * Reload mlx5 device kernel driver through Netlink.
1760 *
1761 * @param[in] nlsk_fd
1762 * Netlink socket file descriptor.
1763 * @param[in] family_id
1764 * the Devlink family ID.
1765 * @param pci_addr
1766 * The device PCI address.
1767 * @param[out] enable
1768 * The enable status to set.
1769 *
1770 * @return
1771 * 0 on success, a negative errno value otherwise and rte_errno is set.
1772 */
1773 static int
mlx5_nl_driver_reload(int nlsk_fd,int family_id,const char * pci_addr)1774 mlx5_nl_driver_reload(int nlsk_fd, int family_id, const char *pci_addr)
1775 {
1776 struct nlmsghdr *nlh;
1777 struct genlmsghdr *genl;
1778 uint32_t sn = MLX5_NL_SN_GENERATE;
1779 int ret;
1780 uint8_t buf[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
1781 NLMSG_ALIGN(sizeof(struct genlmsghdr)) +
1782 NLMSG_ALIGN(sizeof(struct nlattr)) * 2 +
1783 NLMSG_ALIGN(MLX5_NL_MAX_ATTR_SIZE) * 2];
1784
1785 memset(buf, 0, sizeof(buf));
1786 nlh = (struct nlmsghdr *)buf;
1787 nlh->nlmsg_len = sizeof(struct nlmsghdr);
1788 nlh->nlmsg_type = family_id;
1789 nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
1790 genl = (struct genlmsghdr *)nl_msg_tail(nlh);
1791 nlh->nlmsg_len += sizeof(struct genlmsghdr);
1792 genl->cmd = DEVLINK_CMD_RELOAD;
1793 genl->version = DEVLINK_GENL_VERSION;
1794 nl_attr_put(nlh, DEVLINK_ATTR_BUS_NAME, "pci", 4);
1795 nl_attr_put(nlh, DEVLINK_ATTR_DEV_NAME, pci_addr, strlen(pci_addr) + 1);
1796 ret = mlx5_nl_send(nlsk_fd, nlh, sn);
1797 if (ret >= 0)
1798 ret = mlx5_nl_recv(nlsk_fd, sn, NULL, NULL);
1799 if (ret < 0) {
1800 DRV_LOG(DEBUG, "Failed to reload %s device by Netlink - %d",
1801 pci_addr, ret);
1802 return ret;
1803 }
1804 DRV_LOG(DEBUG, "Device \"%s\" was reloaded by Netlink successfully.",
1805 pci_addr);
1806 return 0;
1807 }
1808
1809 /**
1810 * Set ROCE enable status through Netlink.
1811 *
1812 * @param[in] nlsk_fd
1813 * Netlink socket file descriptor.
1814 * @param[in] family_id
1815 * the Devlink family ID.
1816 * @param pci_addr
1817 * The device PCI address.
1818 * @param[out] enable
1819 * The enable status to set.
1820 *
1821 * @return
1822 * 0 on success, a negative errno value otherwise and rte_errno is set.
1823 */
1824 int
mlx5_nl_enable_roce_set(int nlsk_fd,int family_id,const char * pci_addr,int enable)1825 mlx5_nl_enable_roce_set(int nlsk_fd, int family_id, const char *pci_addr,
1826 int enable)
1827 {
1828 struct nlmsghdr *nlh;
1829 struct genlmsghdr *genl;
1830 uint32_t sn = MLX5_NL_SN_GENERATE;
1831 int ret;
1832 uint8_t buf[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
1833 NLMSG_ALIGN(sizeof(struct genlmsghdr)) +
1834 NLMSG_ALIGN(sizeof(struct nlattr)) * 6 +
1835 NLMSG_ALIGN(MLX5_NL_MAX_ATTR_SIZE) * 6];
1836 uint8_t cmode = DEVLINK_PARAM_CMODE_DRIVERINIT;
1837 uint8_t ptype = NLA_FLAG;
1838 ;
1839
1840 memset(buf, 0, sizeof(buf));
1841 nlh = (struct nlmsghdr *)buf;
1842 nlh->nlmsg_len = sizeof(struct nlmsghdr);
1843 nlh->nlmsg_type = family_id;
1844 nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
1845 genl = (struct genlmsghdr *)nl_msg_tail(nlh);
1846 nlh->nlmsg_len += sizeof(struct genlmsghdr);
1847 genl->cmd = DEVLINK_CMD_PARAM_SET;
1848 genl->version = DEVLINK_GENL_VERSION;
1849 nl_attr_put(nlh, DEVLINK_ATTR_BUS_NAME, "pci", 4);
1850 nl_attr_put(nlh, DEVLINK_ATTR_DEV_NAME, pci_addr, strlen(pci_addr) + 1);
1851 nl_attr_put(nlh, DEVLINK_ATTR_PARAM_NAME, "enable_roce", 12);
1852 nl_attr_put(nlh, DEVLINK_ATTR_PARAM_VALUE_CMODE, &cmode, sizeof(cmode));
1853 nl_attr_put(nlh, DEVLINK_ATTR_PARAM_TYPE, &ptype, sizeof(ptype));
1854 if (enable)
1855 nl_attr_put(nlh, DEVLINK_ATTR_PARAM_VALUE_DATA, NULL, 0);
1856 ret = mlx5_nl_send(nlsk_fd, nlh, sn);
1857 if (ret >= 0)
1858 ret = mlx5_nl_recv(nlsk_fd, sn, NULL, NULL);
1859 if (ret < 0) {
1860 DRV_LOG(DEBUG, "Failed to %sable ROCE for device %s by Netlink:"
1861 " %d.", enable ? "en" : "dis", pci_addr, ret);
1862 return ret;
1863 }
1864 DRV_LOG(DEBUG, "Device %s ROCE was %sabled by Netlink successfully.",
1865 pci_addr, enable ? "en" : "dis");
1866 /* Now, need to reload the driver. */
1867 return mlx5_nl_driver_reload(nlsk_fd, family_id, pci_addr);
1868 }
1869
/**
 * Try to parse a Netlink message as a link status update.
 *
 * Only RTM_NEWLINK, RTM_DELLINK, RTM_GETLINK and RTM_SETLINK messages are
 * accepted, and only when they are long enough to carry a complete
 * ifinfomsg payload.
 *
 * @param hdr
 *   Netlink message header.
 * @param[out] ifindex
 *   Index of the updated interface, written on success only.
 *
 * @return
 *   0 on success, negative on failure.
 */
int
mlx5_nl_parse_link_status_update(struct nlmsghdr *hdr, uint32_t *ifindex)
{
	const struct ifinfomsg *info;

	switch (hdr->nlmsg_type) {
	case RTM_NEWLINK:
	case RTM_DELLINK:
	case RTM_GETLINK:
	case RTM_SETLINK:
		/* Do not read the payload of a truncated message. */
		if (hdr->nlmsg_len < NLMSG_LENGTH(sizeof(*info)))
			return -1;
		info = NLMSG_DATA(hdr);
		*ifindex = info->ifi_index;
		return 0;
	default:
		return -1;
	}
}
1897
1898 /**
1899 * Read pending events from a Netlink socket.
1900 *
1901 * @param nlsk_fd
1902 * Netlink socket.
1903 * @param cb
1904 * Callback invoked for each of the events.
1905 * @param cb_arg
1906 * User data for the callback.
1907 *
1908 * @return
1909 * 0 on success, including the case when there are no events.
1910 * Negative on failure and rte_errno is set.
1911 */
1912 int
mlx5_nl_read_events(int nlsk_fd,mlx5_nl_event_cb * cb,void * cb_arg)1913 mlx5_nl_read_events(int nlsk_fd, mlx5_nl_event_cb *cb, void *cb_arg)
1914 {
1915 char buf[8192];
1916 struct sockaddr_nl addr;
1917 struct iovec iov = {
1918 .iov_base = buf,
1919 .iov_len = sizeof(buf),
1920 };
1921 struct msghdr msg = {
1922 .msg_name = &addr,
1923 .msg_namelen = sizeof(addr),
1924 .msg_iov = &iov,
1925 .msg_iovlen = 1,
1926 };
1927 struct nlmsghdr *hdr;
1928 ssize_t size;
1929
1930 while (1) {
1931 size = recvmsg(nlsk_fd, &msg, MSG_DONTWAIT);
1932 if (size < 0) {
1933 if (errno == EAGAIN)
1934 return 0;
1935 if (errno == EINTR)
1936 continue;
1937 DRV_LOG(DEBUG, "Failed to receive netlink message: %s",
1938 strerror(errno));
1939 rte_errno = errno;
1940 return -rte_errno;
1941 }
1942 hdr = (struct nlmsghdr *)buf;
1943 while (size >= (ssize_t)sizeof(*hdr)) {
1944 ssize_t msg_len = hdr->nlmsg_len;
1945 ssize_t data_len = msg_len - sizeof(*hdr);
1946 ssize_t aligned_len;
1947
1948 if (data_len < 0) {
1949 DRV_LOG(DEBUG, "Netlink message too short");
1950 rte_errno = EINVAL;
1951 return -rte_errno;
1952 }
1953 aligned_len = NLMSG_ALIGN(msg_len);
1954 if (aligned_len > size) {
1955 DRV_LOG(DEBUG, "Netlink message too long");
1956 rte_errno = EINVAL;
1957 return -rte_errno;
1958 }
1959 cb(hdr, cb_arg);
1960 hdr = RTE_PTR_ADD(hdr, aligned_len);
1961 size -= aligned_len;
1962 }
1963 }
1964 return 0;
1965 }
1966
1967 static int
mlx5_nl_esw_multiport_cb(struct nlmsghdr * nh,void * arg)1968 mlx5_nl_esw_multiport_cb(struct nlmsghdr *nh, void *arg)
1969 {
1970
1971 int ret = -EINVAL;
1972 int *enable = arg;
1973 struct nlattr *tail = RTE_PTR_ADD(nh, nh->nlmsg_len);
1974 struct nlattr *nla = RTE_PTR_ADD(nh, NLMSG_ALIGN(sizeof(*nh)) +
1975 NLMSG_ALIGN(sizeof(struct genlmsghdr)));
1976
1977 while (nla->nla_len && nla < tail) {
1978 switch (nla->nla_type) {
1979 /* Expected nested attributes case. */
1980 case DEVLINK_ATTR_PARAM:
1981 case DEVLINK_ATTR_PARAM_VALUES_LIST:
1982 case DEVLINK_ATTR_PARAM_VALUE:
1983 ret = 0;
1984 nla += 1;
1985 break;
1986 case DEVLINK_ATTR_PARAM_VALUE_DATA:
1987 *enable = 1;
1988 return 0;
1989 default:
1990 nla = RTE_PTR_ADD(nla, NLMSG_ALIGN(nla->nla_len));
1991 }
1992 }
1993 *enable = 0;
1994 return ret;
1995 }
1996
1997 #define NL_ESW_MULTIPORT_PARAM "esw_multiport"
1998
1999 int
mlx5_nl_devlink_esw_multiport_get(int nlsk_fd,int family_id,const char * pci_addr,int * enable)2000 mlx5_nl_devlink_esw_multiport_get(int nlsk_fd, int family_id, const char *pci_addr, int *enable)
2001 {
2002 struct nlmsghdr *nlh;
2003 struct genlmsghdr *genl;
2004 uint32_t sn = MLX5_NL_SN_GENERATE;
2005 int ret;
2006 uint8_t buf[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
2007 NLMSG_ALIGN(sizeof(struct genlmsghdr)) +
2008 NLMSG_ALIGN(sizeof(struct nlattr)) * 4 +
2009 NLMSG_ALIGN(MLX5_NL_MAX_ATTR_SIZE) * 4];
2010
2011 memset(buf, 0, sizeof(buf));
2012 nlh = (struct nlmsghdr *)buf;
2013 nlh->nlmsg_len = sizeof(struct nlmsghdr);
2014 nlh->nlmsg_type = family_id;
2015 nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
2016 genl = (struct genlmsghdr *)nl_msg_tail(nlh);
2017 nlh->nlmsg_len += sizeof(struct genlmsghdr);
2018 genl->cmd = DEVLINK_CMD_PARAM_GET;
2019 genl->version = DEVLINK_GENL_VERSION;
2020 nl_attr_put(nlh, DEVLINK_ATTR_BUS_NAME, "pci", 4);
2021 nl_attr_put(nlh, DEVLINK_ATTR_DEV_NAME, pci_addr, strlen(pci_addr) + 1);
2022 nl_attr_put(nlh, DEVLINK_ATTR_PARAM_NAME,
2023 NL_ESW_MULTIPORT_PARAM, sizeof(NL_ESW_MULTIPORT_PARAM));
2024 ret = mlx5_nl_send(nlsk_fd, nlh, sn);
2025 if (ret >= 0)
2026 ret = mlx5_nl_recv(nlsk_fd, sn, mlx5_nl_esw_multiport_cb, enable);
2027 if (ret < 0) {
2028 DRV_LOG(DEBUG, "Failed to get Multiport E-Switch enable on device %s: %d.",
2029 pci_addr, ret);
2030 return ret;
2031 }
2032 DRV_LOG(DEBUG, "Multiport E-Switch is %sabled for device \"%s\".",
2033 *enable ? "en" : "dis", pci_addr);
2034 return ret;
2035 }
2036