1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright 2018 6WIND S.A. 3 * Copyright 2018 Mellanox Technologies, Ltd 4 */ 5 6 #include <errno.h> 7 #include <linux/if_link.h> 8 #include <linux/rtnetlink.h> 9 #include <linux/genetlink.h> 10 #include <net/if.h> 11 #include <rdma/rdma_netlink.h> 12 #include <stdbool.h> 13 #include <stdint.h> 14 #include <stdlib.h> 15 #include <stdalign.h> 16 #include <string.h> 17 #include <sys/socket.h> 18 #include <unistd.h> 19 20 #include <rte_errno.h> 21 22 #include "mlx5_nl.h" 23 #include "../mlx5_common_log.h" 24 #include "mlx5_malloc.h" 25 #ifdef HAVE_DEVLINK 26 #include <linux/devlink.h> 27 #endif 28 29 30 /* Size of the buffer to receive kernel messages */ 31 #define MLX5_NL_BUF_SIZE (32 * 1024) 32 /* Send buffer size for the Netlink socket */ 33 #define MLX5_SEND_BUF_SIZE 32768 34 /* Receive buffer size for the Netlink socket */ 35 #define MLX5_RECV_BUF_SIZE 32768 36 /* Maximal physical port name length. */ 37 #define MLX5_PHYS_PORT_NAME_MAX 128 38 39 /** Parameters of VLAN devices created by driver. */ 40 #define MLX5_VMWA_VLAN_DEVICE_PFX "evmlx" 41 /* 42 * Define NDA_RTA as defined in iproute2 sources. 43 * 44 * see in iproute2 sources file include/libnetlink.h 45 */ 46 #ifndef MLX5_NDA_RTA 47 #define MLX5_NDA_RTA(r) \ 48 ((struct rtattr *)(((char *)(r)) + NLMSG_ALIGN(sizeof(struct ndmsg)))) 49 #endif 50 /* 51 * Define NLMSG_TAIL as defined in iproute2 sources. 52 * 53 * see in iproute2 sources file include/libnetlink.h 54 */ 55 #ifndef NLMSG_TAIL 56 #define NLMSG_TAIL(nmsg) \ 57 ((struct rtattr *)(((char *)(nmsg)) + NLMSG_ALIGN((nmsg)->nlmsg_len))) 58 #endif 59 /* 60 * The following definitions are normally found in rdma/rdma_netlink.h, 61 * however they are so recent that most systems do not expose them yet. 62 */ 63 #ifndef HAVE_RDMA_NL_NLDEV 64 #define RDMA_NL_NLDEV 5 65 #endif 66 #ifndef HAVE_RDMA_NLDEV_CMD_GET 67 #define RDMA_NLDEV_CMD_GET 1 68 #endif 69 #ifndef HAVE_RDMA_NLDEV_CMD_PORT_GET 70 #define RDMA_NLDEV_CMD_PORT_GET 5 71 #endif 72 #ifndef HAVE_RDMA_NLDEV_ATTR_DEV_INDEX 73 #define RDMA_NLDEV_ATTR_DEV_INDEX 1 74 #endif 75 #ifndef HAVE_RDMA_NLDEV_ATTR_DEV_NAME 76 #define RDMA_NLDEV_ATTR_DEV_NAME 2 77 #endif 78 #ifndef HAVE_RDMA_NLDEV_ATTR_PORT_INDEX 79 #define RDMA_NLDEV_ATTR_PORT_INDEX 3 80 #endif 81 #ifndef HAVE_RDMA_NLDEV_ATTR_NDEV_INDEX 82 #define RDMA_NLDEV_ATTR_NDEV_INDEX 50 83 #endif 84 85 /* These are normally found in linux/if_link.h. */ 86 #ifndef HAVE_IFLA_NUM_VF 87 #define IFLA_NUM_VF 21 88 #endif 89 #ifndef HAVE_IFLA_EXT_MASK 90 #define IFLA_EXT_MASK 29 91 #endif 92 #ifndef HAVE_IFLA_PHYS_SWITCH_ID 93 #define IFLA_PHYS_SWITCH_ID 36 94 #endif 95 #ifndef HAVE_IFLA_PHYS_PORT_NAME 96 #define IFLA_PHYS_PORT_NAME 38 97 #endif 98 99 /* 100 * Some Devlink defines may be missed in old kernel versions, 101 * adjust used defines. 102 */ 103 #ifndef DEVLINK_GENL_NAME 104 #define DEVLINK_GENL_NAME "devlink" 105 #endif 106 #ifndef DEVLINK_GENL_VERSION 107 #define DEVLINK_GENL_VERSION 1 108 #endif 109 #ifndef DEVLINK_ATTR_BUS_NAME 110 #define DEVLINK_ATTR_BUS_NAME 1 111 #endif 112 #ifndef DEVLINK_ATTR_DEV_NAME 113 #define DEVLINK_ATTR_DEV_NAME 2 114 #endif 115 #ifndef DEVLINK_ATTR_PARAM 116 #define DEVLINK_ATTR_PARAM 80 117 #endif 118 #ifndef DEVLINK_ATTR_PARAM_NAME 119 #define DEVLINK_ATTR_PARAM_NAME 81 120 #endif 121 #ifndef DEVLINK_ATTR_PARAM_TYPE 122 #define DEVLINK_ATTR_PARAM_TYPE 83 123 #endif 124 #ifndef DEVLINK_ATTR_PARAM_VALUES_LIST 125 #define DEVLINK_ATTR_PARAM_VALUES_LIST 84 126 #endif 127 #ifndef DEVLINK_ATTR_PARAM_VALUE 128 #define DEVLINK_ATTR_PARAM_VALUE 85 129 #endif 130 #ifndef DEVLINK_ATTR_PARAM_VALUE_DATA 131 #define DEVLINK_ATTR_PARAM_VALUE_DATA 86 132 #endif 133 #ifndef DEVLINK_ATTR_PARAM_VALUE_CMODE 134 #define DEVLINK_ATTR_PARAM_VALUE_CMODE 87 135 #endif 136 #ifndef DEVLINK_PARAM_CMODE_DRIVERINIT 137 #define DEVLINK_PARAM_CMODE_DRIVERINIT 1 138 #endif 139 #ifndef DEVLINK_CMD_RELOAD 140 #define DEVLINK_CMD_RELOAD 37 141 #endif 142 #ifndef DEVLINK_CMD_PARAM_GET 143 #define DEVLINK_CMD_PARAM_GET 38 144 #endif 145 #ifndef DEVLINK_CMD_PARAM_SET 146 #define DEVLINK_CMD_PARAM_SET 39 147 #endif 148 #ifndef NLA_FLAG 149 #define NLA_FLAG 6 150 #endif 151 152 /* Add/remove MAC address through Netlink */ 153 struct mlx5_nl_mac_addr { 154 struct rte_ether_addr (*mac)[]; 155 /**< MAC address handled by the device. */ 156 int mac_n; /**< Number of addresses in the array. */ 157 }; 158 159 #define MLX5_NL_CMD_GET_IB_NAME (1 << 0) 160 #define MLX5_NL_CMD_GET_IB_INDEX (1 << 1) 161 #define MLX5_NL_CMD_GET_NET_INDEX (1 << 2) 162 #define MLX5_NL_CMD_GET_PORT_INDEX (1 << 3) 163 164 /** Data structure used by mlx5_nl_cmdget_cb(). */ 165 struct mlx5_nl_ifindex_data { 166 const char *name; /**< IB device name (in). */ 167 uint32_t flags; /**< found attribute flags (out). */ 168 uint32_t ibindex; /**< IB device index (out). */ 169 uint32_t ifindex; /**< Network interface index (out). */ 170 uint32_t portnum; /**< IB device max port number (out). */ 171 }; 172 173 uint32_t atomic_sn; 174 175 /* Generate Netlink sequence number. */ 176 #define MLX5_NL_SN_GENERATE __atomic_add_fetch(&atomic_sn, 1, __ATOMIC_RELAXED) 177 178 /** 179 * Opens a Netlink socket. 180 * 181 * @param protocol 182 * Netlink protocol (e.g. NETLINK_ROUTE, NETLINK_RDMA). 183 * 184 * @return 185 * A file descriptor on success, a negative errno value otherwise and 186 * rte_errno is set. 187 */ 188 int 189 mlx5_nl_init(int protocol) 190 { 191 int fd; 192 int buf_size; 193 socklen_t opt_size; 194 struct sockaddr_nl local = { 195 .nl_family = AF_NETLINK, 196 }; 197 int ret; 198 199 fd = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, protocol); 200 if (fd == -1) { 201 rte_errno = errno; 202 return -rte_errno; 203 } 204 opt_size = sizeof(buf_size); 205 ret = getsockopt(fd, SOL_SOCKET, SO_SNDBUF, &buf_size, &opt_size); 206 if (ret == -1) { 207 rte_errno = errno; 208 goto error; 209 } 210 DRV_LOG(DEBUG, "Netlink socket send buffer: %d", buf_size); 211 if (buf_size < MLX5_SEND_BUF_SIZE) { 212 ret = setsockopt(fd, SOL_SOCKET, SO_SNDBUF, 213 &buf_size, sizeof(buf_size)); 214 if (ret == -1) { 215 rte_errno = errno; 216 goto error; 217 } 218 } 219 opt_size = sizeof(buf_size); 220 ret = getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &buf_size, &opt_size); 221 if (ret == -1) { 222 rte_errno = errno; 223 goto error; 224 } 225 DRV_LOG(DEBUG, "Netlink socket recv buffer: %d", buf_size); 226 if (buf_size < MLX5_RECV_BUF_SIZE) { 227 ret = setsockopt(fd, SOL_SOCKET, SO_RCVBUF, 228 &buf_size, sizeof(buf_size)); 229 if (ret == -1) { 230 rte_errno = errno; 231 goto error; 232 } 233 } 234 ret = bind(fd, (struct sockaddr *)&local, sizeof(local)); 235 if (ret == -1) { 236 rte_errno = errno; 237 goto error; 238 } 239 return fd; 240 error: 241 close(fd); 242 return -rte_errno; 243 } 244 245 /** 246 * Send a request message to the kernel on the Netlink socket. 247 * 248 * @param[in] nlsk_fd 249 * Netlink socket file descriptor. 250 * @param[in] nh 251 * The Netlink message send to the kernel. 252 * @param[in] ssn 253 * Sequence number. 254 * @param[in] req 255 * Pointer to the request structure. 256 * @param[in] len 257 * Length of the request in bytes. 258 * 259 * @return 260 * The number of sent bytes on success, a negative errno value otherwise and 261 * rte_errno is set. 262 */ 263 static int 264 mlx5_nl_request(int nlsk_fd, struct nlmsghdr *nh, uint32_t sn, void *req, 265 int len) 266 { 267 struct sockaddr_nl sa = { 268 .nl_family = AF_NETLINK, 269 }; 270 struct iovec iov[2] = { 271 { .iov_base = nh, .iov_len = sizeof(*nh), }, 272 { .iov_base = req, .iov_len = len, }, 273 }; 274 struct msghdr msg = { 275 .msg_name = &sa, 276 .msg_namelen = sizeof(sa), 277 .msg_iov = iov, 278 .msg_iovlen = 2, 279 }; 280 int send_bytes; 281 282 nh->nlmsg_pid = 0; /* communication with the kernel uses pid 0 */ 283 nh->nlmsg_seq = sn; 284 send_bytes = sendmsg(nlsk_fd, &msg, 0); 285 if (send_bytes < 0) { 286 rte_errno = errno; 287 return -rte_errno; 288 } 289 return send_bytes; 290 } 291 292 /** 293 * Send a message to the kernel on the Netlink socket. 294 * 295 * @param[in] nlsk_fd 296 * The Netlink socket file descriptor used for communication. 297 * @param[in] nh 298 * The Netlink message send to the kernel. 299 * @param[in] sn 300 * Sequence number. 301 * 302 * @return 303 * The number of sent bytes on success, a negative errno value otherwise and 304 * rte_errno is set. 305 */ 306 static int 307 mlx5_nl_send(int nlsk_fd, struct nlmsghdr *nh, uint32_t sn) 308 { 309 struct sockaddr_nl sa = { 310 .nl_family = AF_NETLINK, 311 }; 312 struct iovec iov = { 313 .iov_base = nh, 314 .iov_len = nh->nlmsg_len, 315 }; 316 struct msghdr msg = { 317 .msg_name = &sa, 318 .msg_namelen = sizeof(sa), 319 .msg_iov = &iov, 320 .msg_iovlen = 1, 321 }; 322 int send_bytes; 323 324 nh->nlmsg_pid = 0; /* communication with the kernel uses pid 0 */ 325 nh->nlmsg_seq = sn; 326 send_bytes = sendmsg(nlsk_fd, &msg, 0); 327 if (send_bytes < 0) { 328 rte_errno = errno; 329 return -rte_errno; 330 } 331 return send_bytes; 332 } 333 334 /** 335 * Receive a message from the kernel on the Netlink socket, following 336 * mlx5_nl_send(). 337 * 338 * @param[in] nlsk_fd 339 * The Netlink socket file descriptor used for communication. 340 * @param[in] sn 341 * Sequence number. 342 * @param[in] cb 343 * The callback function to call for each Netlink message received. 344 * @param[in, out] arg 345 * Custom arguments for the callback. 346 * 347 * @return 348 * 0 on success, a negative errno value otherwise and rte_errno is set. 349 */ 350 static int 351 mlx5_nl_recv(int nlsk_fd, uint32_t sn, int (*cb)(struct nlmsghdr *, void *arg), 352 void *arg) 353 { 354 struct sockaddr_nl sa; 355 struct iovec iov; 356 struct msghdr msg = { 357 .msg_name = &sa, 358 .msg_namelen = sizeof(sa), 359 .msg_iov = &iov, 360 /* One message at a time */ 361 .msg_iovlen = 1, 362 }; 363 void *buf = NULL; 364 int multipart = 0; 365 int ret = 0; 366 367 do { 368 struct nlmsghdr *nh; 369 int recv_bytes; 370 371 do { 372 /* Query length of incoming message. */ 373 iov.iov_base = NULL; 374 iov.iov_len = 0; 375 recv_bytes = recvmsg(nlsk_fd, &msg, 376 MSG_PEEK | MSG_TRUNC); 377 if (recv_bytes < 0) { 378 rte_errno = errno; 379 ret = -rte_errno; 380 goto exit; 381 } 382 if (recv_bytes == 0) { 383 rte_errno = ENODATA; 384 ret = -rte_errno; 385 goto exit; 386 } 387 /* Allocate buffer to fetch the message. */ 388 if (recv_bytes < MLX5_RECV_BUF_SIZE) 389 recv_bytes = MLX5_RECV_BUF_SIZE; 390 mlx5_free(buf); 391 buf = mlx5_malloc(0, recv_bytes, 0, SOCKET_ID_ANY); 392 if (!buf) { 393 rte_errno = ENOMEM; 394 ret = -rte_errno; 395 goto exit; 396 } 397 /* Fetch the message. */ 398 iov.iov_base = buf; 399 iov.iov_len = recv_bytes; 400 recv_bytes = recvmsg(nlsk_fd, &msg, 0); 401 if (recv_bytes == -1) { 402 rte_errno = errno; 403 ret = -rte_errno; 404 goto exit; 405 } 406 nh = (struct nlmsghdr *)buf; 407 } while (nh->nlmsg_seq != sn); 408 for (; 409 NLMSG_OK(nh, (unsigned int)recv_bytes); 410 nh = NLMSG_NEXT(nh, recv_bytes)) { 411 if (nh->nlmsg_type == NLMSG_ERROR) { 412 struct nlmsgerr *err_data = NLMSG_DATA(nh); 413 414 if (err_data->error < 0) { 415 rte_errno = -err_data->error; 416 ret = -rte_errno; 417 goto exit; 418 } 419 /* Ack message. */ 420 ret = 0; 421 goto exit; 422 } 423 /* Multi-part msgs and their trailing DONE message. */ 424 if (nh->nlmsg_flags & NLM_F_MULTI) { 425 if (nh->nlmsg_type == NLMSG_DONE) { 426 ret = 0; 427 goto exit; 428 } 429 multipart = 1; 430 } 431 if (cb) { 432 ret = cb(nh, arg); 433 if (ret < 0) 434 goto exit; 435 } 436 } 437 } while (multipart); 438 exit: 439 mlx5_free(buf); 440 return ret; 441 } 442 443 /** 444 * Parse Netlink message to retrieve the bridge MAC address. 445 * 446 * @param nh 447 * Pointer to Netlink Message Header. 448 * @param arg 449 * PMD data register with this callback. 450 * 451 * @return 452 * 0 on success, a negative errno value otherwise and rte_errno is set. 453 */ 454 static int 455 mlx5_nl_mac_addr_cb(struct nlmsghdr *nh, void *arg) 456 { 457 struct mlx5_nl_mac_addr *data = arg; 458 struct ndmsg *r = NLMSG_DATA(nh); 459 struct rtattr *attribute; 460 int len; 461 462 len = nh->nlmsg_len - NLMSG_LENGTH(sizeof(*r)); 463 for (attribute = MLX5_NDA_RTA(r); 464 RTA_OK(attribute, len); 465 attribute = RTA_NEXT(attribute, len)) { 466 if (attribute->rta_type == NDA_LLADDR) { 467 if (data->mac_n == MLX5_MAX_MAC_ADDRESSES) { 468 DRV_LOG(WARNING, 469 "not enough room to finalize the" 470 " request"); 471 rte_errno = ENOMEM; 472 return -rte_errno; 473 } 474 #ifdef RTE_LIBRTE_MLX5_DEBUG 475 char m[RTE_ETHER_ADDR_FMT_SIZE]; 476 477 rte_ether_format_addr(m, RTE_ETHER_ADDR_FMT_SIZE, 478 RTA_DATA(attribute)); 479 DRV_LOG(DEBUG, "bridge MAC address %s", m); 480 #endif 481 memcpy(&(*data->mac)[data->mac_n++], 482 RTA_DATA(attribute), RTE_ETHER_ADDR_LEN); 483 } 484 } 485 return 0; 486 } 487 488 /** 489 * Get bridge MAC addresses. 490 * 491 * @param[in] nlsk_fd 492 * Netlink socket file descriptor. 493 * @param[in] iface_idx 494 * Net device interface index. 495 * @param mac[out] 496 * Pointer to the array table of MAC addresses to fill. 497 * Its size should be of MLX5_MAX_MAC_ADDRESSES. 498 * @param mac_n[out] 499 * Number of entries filled in MAC array. 500 * 501 * @return 502 * 0 on success, a negative errno value otherwise and rte_errno is set. 503 */ 504 static int 505 mlx5_nl_mac_addr_list(int nlsk_fd, unsigned int iface_idx, 506 struct rte_ether_addr (*mac)[], int *mac_n) 507 { 508 struct { 509 struct nlmsghdr hdr; 510 struct ifinfomsg ifm; 511 } req = { 512 .hdr = { 513 .nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)), 514 .nlmsg_type = RTM_GETNEIGH, 515 .nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST, 516 }, 517 .ifm = { 518 .ifi_family = PF_BRIDGE, 519 .ifi_index = iface_idx, 520 }, 521 }; 522 struct mlx5_nl_mac_addr data = { 523 .mac = mac, 524 .mac_n = 0, 525 }; 526 uint32_t sn = MLX5_NL_SN_GENERATE; 527 int ret; 528 529 if (nlsk_fd == -1) 530 return 0; 531 ret = mlx5_nl_request(nlsk_fd, &req.hdr, sn, &req.ifm, 532 sizeof(struct ifinfomsg)); 533 if (ret < 0) 534 goto error; 535 ret = mlx5_nl_recv(nlsk_fd, sn, mlx5_nl_mac_addr_cb, &data); 536 if (ret < 0) 537 goto error; 538 *mac_n = data.mac_n; 539 return 0; 540 error: 541 DRV_LOG(DEBUG, "Interface %u cannot retrieve MAC address list %s", 542 iface_idx, strerror(rte_errno)); 543 return -rte_errno; 544 } 545 546 /** 547 * Modify the MAC address neighbour table with Netlink. 548 * 549 * @param[in] nlsk_fd 550 * Netlink socket file descriptor. 551 * @param[in] iface_idx 552 * Net device interface index. 553 * @param mac 554 * MAC address to consider. 555 * @param add 556 * 1 to add the MAC address, 0 to remove the MAC address. 557 * 558 * @return 559 * 0 on success, a negative errno value otherwise and rte_errno is set. 560 */ 561 static int 562 mlx5_nl_mac_addr_modify(int nlsk_fd, unsigned int iface_idx, 563 struct rte_ether_addr *mac, int add) 564 { 565 struct { 566 struct nlmsghdr hdr; 567 struct ndmsg ndm; 568 struct rtattr rta; 569 uint8_t buffer[RTE_ETHER_ADDR_LEN]; 570 } req = { 571 .hdr = { 572 .nlmsg_len = NLMSG_LENGTH(sizeof(struct ndmsg)), 573 .nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | 574 NLM_F_EXCL | NLM_F_ACK, 575 .nlmsg_type = add ? RTM_NEWNEIGH : RTM_DELNEIGH, 576 }, 577 .ndm = { 578 .ndm_family = PF_BRIDGE, 579 .ndm_state = NUD_NOARP | NUD_PERMANENT, 580 .ndm_ifindex = iface_idx, 581 .ndm_flags = NTF_SELF, 582 }, 583 .rta = { 584 .rta_type = NDA_LLADDR, 585 .rta_len = RTA_LENGTH(RTE_ETHER_ADDR_LEN), 586 }, 587 }; 588 uint32_t sn = MLX5_NL_SN_GENERATE; 589 int ret; 590 591 if (nlsk_fd == -1) 592 return 0; 593 memcpy(RTA_DATA(&req.rta), mac, RTE_ETHER_ADDR_LEN); 594 req.hdr.nlmsg_len = NLMSG_ALIGN(req.hdr.nlmsg_len) + 595 RTA_ALIGN(req.rta.rta_len); 596 ret = mlx5_nl_send(nlsk_fd, &req.hdr, sn); 597 if (ret < 0) 598 goto error; 599 ret = mlx5_nl_recv(nlsk_fd, sn, NULL, NULL); 600 if (ret < 0) 601 goto error; 602 return 0; 603 error: 604 #ifdef RTE_LIBRTE_MLX5_DEBUG 605 { 606 char m[RTE_ETHER_ADDR_FMT_SIZE]; 607 608 rte_ether_format_addr(m, RTE_ETHER_ADDR_FMT_SIZE, mac); 609 DRV_LOG(DEBUG, 610 "Interface %u cannot %s MAC address %s %s", 611 iface_idx, 612 add ? "add" : "remove", m, strerror(rte_errno)); 613 } 614 #endif 615 return -rte_errno; 616 } 617 618 /** 619 * Modify the VF MAC address neighbour table with Netlink. 620 * 621 * @param[in] nlsk_fd 622 * Netlink socket file descriptor. 623 * @param[in] iface_idx 624 * Net device interface index. 625 * @param mac 626 * MAC address to consider. 627 * @param vf_index 628 * VF index. 629 * 630 * @return 631 * 0 on success, a negative errno value otherwise and rte_errno is set. 632 */ 633 int 634 mlx5_nl_vf_mac_addr_modify(int nlsk_fd, unsigned int iface_idx, 635 struct rte_ether_addr *mac, int vf_index) 636 { 637 int ret; 638 struct { 639 struct nlmsghdr hdr; 640 struct ifinfomsg ifm; 641 struct rtattr vf_list_rta; 642 struct rtattr vf_info_rta; 643 struct rtattr vf_mac_rta; 644 struct ifla_vf_mac ivm; 645 } req = { 646 .hdr = { 647 .nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)), 648 .nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK, 649 .nlmsg_type = RTM_BASE, 650 }, 651 .ifm = { 652 .ifi_index = iface_idx, 653 }, 654 .vf_list_rta = { 655 .rta_type = IFLA_VFINFO_LIST, 656 .rta_len = RTA_ALIGN(RTA_LENGTH(0)), 657 }, 658 .vf_info_rta = { 659 .rta_type = IFLA_VF_INFO, 660 .rta_len = RTA_ALIGN(RTA_LENGTH(0)), 661 }, 662 .vf_mac_rta = { 663 .rta_type = IFLA_VF_MAC, 664 }, 665 }; 666 struct ifla_vf_mac ivm = { 667 .vf = vf_index, 668 }; 669 uint32_t sn = MLX5_NL_SN_GENERATE; 670 671 memcpy(&ivm.mac, mac, RTE_ETHER_ADDR_LEN); 672 memcpy(RTA_DATA(&req.vf_mac_rta), &ivm, sizeof(ivm)); 673 674 req.vf_mac_rta.rta_len = RTA_LENGTH(sizeof(ivm)); 675 req.hdr.nlmsg_len = NLMSG_ALIGN(req.hdr.nlmsg_len) + 676 RTA_ALIGN(req.vf_list_rta.rta_len) + 677 RTA_ALIGN(req.vf_info_rta.rta_len) + 678 RTA_ALIGN(req.vf_mac_rta.rta_len); 679 req.vf_list_rta.rta_len = RTE_PTR_DIFF(NLMSG_TAIL(&req.hdr), 680 &req.vf_list_rta); 681 req.vf_info_rta.rta_len = RTE_PTR_DIFF(NLMSG_TAIL(&req.hdr), 682 &req.vf_info_rta); 683 684 if (nlsk_fd < 0) 685 return -1; 686 ret = mlx5_nl_send(nlsk_fd, &req.hdr, sn); 687 if (ret < 0) 688 goto error; 689 ret = mlx5_nl_recv(nlsk_fd, sn, NULL, NULL); 690 if (ret < 0) 691 goto error; 692 return 0; 693 error: 694 DRV_LOG(ERR, 695 "representor %u cannot set VF MAC address " 696 RTE_ETHER_ADDR_PRT_FMT " : %s", 697 vf_index, 698 RTE_ETHER_ADDR_BYTES(mac), 699 strerror(rte_errno)); 700 return -rte_errno; 701 } 702 703 /** 704 * Add a MAC address. 705 * 706 * @param[in] nlsk_fd 707 * Netlink socket file descriptor. 708 * @param[in] iface_idx 709 * Net device interface index. 710 * @param mac_own 711 * BITFIELD_DECLARE array to store the mac. 712 * @param mac 713 * MAC address to register. 714 * @param index 715 * MAC address index. 716 * 717 * @return 718 * 0 on success, a negative errno value otherwise and rte_errno is set. 719 */ 720 int 721 mlx5_nl_mac_addr_add(int nlsk_fd, unsigned int iface_idx, 722 uint64_t *mac_own, struct rte_ether_addr *mac, 723 uint32_t index) 724 { 725 int ret; 726 727 ret = mlx5_nl_mac_addr_modify(nlsk_fd, iface_idx, mac, 1); 728 if (!ret) { 729 MLX5_ASSERT(index < MLX5_MAX_MAC_ADDRESSES); 730 if (index >= MLX5_MAX_MAC_ADDRESSES) 731 return -EINVAL; 732 733 BITFIELD_SET(mac_own, index); 734 } 735 if (ret == -EEXIST) 736 return 0; 737 return ret; 738 } 739 740 /** 741 * Remove a MAC address. 742 * 743 * @param[in] nlsk_fd 744 * Netlink socket file descriptor. 745 * @param[in] iface_idx 746 * Net device interface index. 747 * @param mac_own 748 * BITFIELD_DECLARE array to store the mac. 749 * @param mac 750 * MAC address to remove. 751 * @param index 752 * MAC address index. 753 * 754 * @return 755 * 0 on success, a negative errno value otherwise and rte_errno is set. 756 */ 757 int 758 mlx5_nl_mac_addr_remove(int nlsk_fd, unsigned int iface_idx, uint64_t *mac_own, 759 struct rte_ether_addr *mac, uint32_t index) 760 { 761 MLX5_ASSERT(index < MLX5_MAX_MAC_ADDRESSES); 762 if (index >= MLX5_MAX_MAC_ADDRESSES) 763 return -EINVAL; 764 765 BITFIELD_RESET(mac_own, index); 766 return mlx5_nl_mac_addr_modify(nlsk_fd, iface_idx, mac, 0); 767 } 768 769 /** 770 * Synchronize Netlink bridge table to the internal table. 771 * 772 * @param[in] nlsk_fd 773 * Netlink socket file descriptor. 774 * @param[in] iface_idx 775 * Net device interface index. 776 * @param mac_addrs 777 * Mac addresses array to sync. 778 * @param n 779 * @p mac_addrs array size. 780 */ 781 void 782 mlx5_nl_mac_addr_sync(int nlsk_fd, unsigned int iface_idx, 783 struct rte_ether_addr *mac_addrs, int n) 784 { 785 struct rte_ether_addr macs[n]; 786 int macs_n = 0; 787 int i; 788 int ret; 789 790 memset(macs, 0, n * sizeof(macs[0])); 791 ret = mlx5_nl_mac_addr_list(nlsk_fd, iface_idx, &macs, &macs_n); 792 if (ret) 793 return; 794 for (i = 0; i != macs_n; ++i) { 795 int j; 796 797 /* Verify the address is not in the array yet. */ 798 for (j = 0; j != n; ++j) 799 if (rte_is_same_ether_addr(&macs[i], &mac_addrs[j])) 800 break; 801 if (j != n) 802 continue; 803 if (rte_is_multicast_ether_addr(&macs[i])) { 804 /* Find the first entry available. */ 805 for (j = MLX5_MAX_UC_MAC_ADDRESSES; j != n; ++j) { 806 if (rte_is_zero_ether_addr(&mac_addrs[j])) { 807 mac_addrs[j] = macs[i]; 808 break; 809 } 810 } 811 } else { 812 /* Find the first entry available. */ 813 for (j = 0; j != MLX5_MAX_UC_MAC_ADDRESSES; ++j) { 814 if (rte_is_zero_ether_addr(&mac_addrs[j])) { 815 mac_addrs[j] = macs[i]; 816 break; 817 } 818 } 819 } 820 } 821 } 822 823 /** 824 * Flush all added MAC addresses. 825 * 826 * @param[in] nlsk_fd 827 * Netlink socket file descriptor. 828 * @param[in] iface_idx 829 * Net device interface index. 830 * @param[in] mac_addrs 831 * Mac addresses array to flush. 832 * @param n 833 * @p mac_addrs array size. 834 * @param mac_own 835 * BITFIELD_DECLARE array to store the mac. 836 */ 837 void 838 mlx5_nl_mac_addr_flush(int nlsk_fd, unsigned int iface_idx, 839 struct rte_ether_addr *mac_addrs, int n, 840 uint64_t *mac_own) 841 { 842 int i; 843 844 if (n <= 0 || n > MLX5_MAX_MAC_ADDRESSES) 845 return; 846 847 for (i = n - 1; i >= 0; --i) { 848 struct rte_ether_addr *m = &mac_addrs[i]; 849 850 if (BITFIELD_ISSET(mac_own, i)) 851 mlx5_nl_mac_addr_remove(nlsk_fd, iface_idx, mac_own, m, 852 i); 853 } 854 } 855 856 /** 857 * Enable promiscuous / all multicast mode through Netlink. 858 * 859 * @param[in] nlsk_fd 860 * Netlink socket file descriptor. 861 * @param[in] iface_idx 862 * Net device interface index. 863 * @param flags 864 * IFF_PROMISC for promiscuous, IFF_ALLMULTI for allmulti. 865 * @param enable 866 * Nonzero to enable, disable otherwise. 867 * 868 * @return 869 * 0 on success, a negative errno value otherwise and rte_errno is set. 870 */ 871 static int 872 mlx5_nl_device_flags(int nlsk_fd, unsigned int iface_idx, uint32_t flags, 873 int enable) 874 { 875 struct { 876 struct nlmsghdr hdr; 877 struct ifinfomsg ifi; 878 } req = { 879 .hdr = { 880 .nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)), 881 .nlmsg_type = RTM_NEWLINK, 882 .nlmsg_flags = NLM_F_REQUEST, 883 }, 884 .ifi = { 885 .ifi_flags = enable ? flags : 0, 886 .ifi_change = flags, 887 .ifi_index = iface_idx, 888 }, 889 }; 890 uint32_t sn = MLX5_NL_SN_GENERATE; 891 int ret; 892 893 MLX5_ASSERT(!(flags & ~(IFF_PROMISC | IFF_ALLMULTI))); 894 if (nlsk_fd < 0) 895 return 0; 896 ret = mlx5_nl_send(nlsk_fd, &req.hdr, sn); 897 if (ret < 0) 898 return ret; 899 return 0; 900 } 901 902 /** 903 * Enable promiscuous mode through Netlink. 904 * 905 * @param[in] nlsk_fd 906 * Netlink socket file descriptor. 907 * @param[in] iface_idx 908 * Net device interface index. 909 * @param enable 910 * Nonzero to enable, disable otherwise. 911 * 912 * @return 913 * 0 on success, a negative errno value otherwise and rte_errno is set. 914 */ 915 int 916 mlx5_nl_promisc(int nlsk_fd, unsigned int iface_idx, int enable) 917 { 918 int ret = mlx5_nl_device_flags(nlsk_fd, iface_idx, IFF_PROMISC, enable); 919 920 if (ret) 921 DRV_LOG(DEBUG, 922 "Interface %u cannot %s promisc mode: Netlink error %s", 923 iface_idx, enable ? "enable" : "disable", 924 strerror(rte_errno)); 925 return ret; 926 } 927 928 /** 929 * Enable all multicast mode through Netlink. 930 * 931 * @param[in] nlsk_fd 932 * Netlink socket file descriptor. 933 * @param[in] iface_idx 934 * Net device interface index. 935 * @param enable 936 * Nonzero to enable, disable otherwise. 937 * 938 * @return 939 * 0 on success, a negative errno value otherwise and rte_errno is set. 940 */ 941 int 942 mlx5_nl_allmulti(int nlsk_fd, unsigned int iface_idx, int enable) 943 { 944 int ret = mlx5_nl_device_flags(nlsk_fd, iface_idx, IFF_ALLMULTI, 945 enable); 946 947 if (ret) 948 DRV_LOG(DEBUG, 949 "Interface %u cannot %s allmulti : Netlink error %s", 950 iface_idx, enable ? "enable" : "disable", 951 strerror(rte_errno)); 952 return ret; 953 } 954 955 /** 956 * Process network interface information from Netlink message. 957 * 958 * @param nh 959 * Pointer to Netlink message header. 960 * @param arg 961 * Opaque data pointer for this callback. 962 * 963 * @return 964 * 0 on success, a negative errno value otherwise and rte_errno is set. 965 */ 966 static int 967 mlx5_nl_cmdget_cb(struct nlmsghdr *nh, void *arg) 968 { 969 struct mlx5_nl_ifindex_data *data = arg; 970 struct mlx5_nl_ifindex_data local = { 971 .flags = 0, 972 }; 973 size_t off = NLMSG_HDRLEN; 974 975 if (nh->nlmsg_type != 976 RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_GET) && 977 nh->nlmsg_type != 978 RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_PORT_GET)) 979 goto error; 980 while (off < nh->nlmsg_len) { 981 struct nlattr *na = (void *)((uintptr_t)nh + off); 982 void *payload = (void *)((uintptr_t)na + NLA_HDRLEN); 983 984 if (na->nla_len > nh->nlmsg_len - off) 985 goto error; 986 switch (na->nla_type) { 987 case RDMA_NLDEV_ATTR_DEV_INDEX: 988 local.ibindex = *(uint32_t *)payload; 989 local.flags |= MLX5_NL_CMD_GET_IB_INDEX; 990 break; 991 case RDMA_NLDEV_ATTR_DEV_NAME: 992 if (!strcmp(payload, data->name)) 993 local.flags |= MLX5_NL_CMD_GET_IB_NAME; 994 break; 995 case RDMA_NLDEV_ATTR_NDEV_INDEX: 996 local.ifindex = *(uint32_t *)payload; 997 local.flags |= MLX5_NL_CMD_GET_NET_INDEX; 998 break; 999 case RDMA_NLDEV_ATTR_PORT_INDEX: 1000 local.portnum = *(uint32_t *)payload; 1001 local.flags |= MLX5_NL_CMD_GET_PORT_INDEX; 1002 break; 1003 default: 1004 break; 1005 } 1006 off += NLA_ALIGN(na->nla_len); 1007 } 1008 /* 1009 * It is possible to have multiple messages for all 1010 * Infiniband devices in the system with appropriate name. 1011 * So we should gather parameters locally and copy to 1012 * query context only in case of coinciding device name. 1013 */ 1014 if (local.flags & MLX5_NL_CMD_GET_IB_NAME) { 1015 data->flags = local.flags; 1016 data->ibindex = local.ibindex; 1017 data->ifindex = local.ifindex; 1018 data->portnum = local.portnum; 1019 } 1020 return 0; 1021 error: 1022 rte_errno = EINVAL; 1023 return -rte_errno; 1024 } 1025 1026 /** 1027 * Get index of network interface associated with some IB device. 1028 * 1029 * This is the only somewhat safe method to avoid resorting to heuristics 1030 * when faced with port representors. Unfortunately it requires at least 1031 * Linux 4.17. 1032 * 1033 * @param nl 1034 * Netlink socket of the RDMA kind (NETLINK_RDMA). 1035 * @param[in] name 1036 * IB device name. 1037 * @param[in] pindex 1038 * IB device port index, starting from 1 1039 * @return 1040 * A valid (nonzero) interface index on success, 0 otherwise and rte_errno 1041 * is set. 1042 */ 1043 unsigned int 1044 mlx5_nl_ifindex(int nl, const char *name, uint32_t pindex) 1045 { 1046 struct mlx5_nl_ifindex_data data = { 1047 .name = name, 1048 .flags = 0, 1049 .ibindex = 0, /* Determined during first pass. */ 1050 .ifindex = 0, /* Determined during second pass. */ 1051 }; 1052 union { 1053 struct nlmsghdr nh; 1054 uint8_t buf[NLMSG_HDRLEN + 1055 NLA_HDRLEN + NLA_ALIGN(sizeof(data.ibindex)) + 1056 NLA_HDRLEN + NLA_ALIGN(sizeof(pindex))]; 1057 } req = { 1058 .nh = { 1059 .nlmsg_len = NLMSG_LENGTH(0), 1060 .nlmsg_type = RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, 1061 RDMA_NLDEV_CMD_GET), 1062 .nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_DUMP, 1063 }, 1064 }; 1065 struct nlattr *na; 1066 uint32_t sn = MLX5_NL_SN_GENERATE; 1067 int ret; 1068 1069 ret = mlx5_nl_send(nl, &req.nh, sn); 1070 if (ret < 0) 1071 return 0; 1072 ret = mlx5_nl_recv(nl, sn, mlx5_nl_cmdget_cb, &data); 1073 if (ret < 0) 1074 return 0; 1075 if (!(data.flags & MLX5_NL_CMD_GET_IB_NAME) || 1076 !(data.flags & MLX5_NL_CMD_GET_IB_INDEX)) 1077 goto error; 1078 data.flags = 0; 1079 sn = MLX5_NL_SN_GENERATE; 1080 req.nh.nlmsg_type = RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, 1081 RDMA_NLDEV_CMD_PORT_GET); 1082 req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK; 1083 req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(req.buf) - NLMSG_HDRLEN); 1084 na = (void *)((uintptr_t)req.buf + NLMSG_HDRLEN); 1085 na->nla_len = NLA_HDRLEN + sizeof(data.ibindex); 1086 na->nla_type = RDMA_NLDEV_ATTR_DEV_INDEX; 1087 memcpy((void *)((uintptr_t)na + NLA_HDRLEN), 1088 &data.ibindex, sizeof(data.ibindex)); 1089 na = (void *)((uintptr_t)na + NLA_ALIGN(na->nla_len)); 1090 na->nla_len = NLA_HDRLEN + sizeof(pindex); 1091 na->nla_type = RDMA_NLDEV_ATTR_PORT_INDEX; 1092 memcpy((void *)((uintptr_t)na + NLA_HDRLEN), 1093 &pindex, sizeof(pindex)); 1094 ret = mlx5_nl_send(nl, &req.nh, sn); 1095 if (ret < 0) 1096 return 0; 1097 ret = mlx5_nl_recv(nl, sn, mlx5_nl_cmdget_cb, &data); 1098 if (ret < 0) 1099 return 0; 1100 if (!(data.flags & MLX5_NL_CMD_GET_IB_NAME) || 1101 !(data.flags & MLX5_NL_CMD_GET_IB_INDEX) || 1102 !(data.flags & MLX5_NL_CMD_GET_NET_INDEX) || 1103 !data.ifindex) 1104 goto error; 1105 return data.ifindex; 1106 error: 1107 rte_errno = ENODEV; 1108 return 0; 1109 } 1110 1111 /** 1112 * Get the number of physical ports of given IB device. 1113 * 1114 * @param nl 1115 * Netlink socket of the RDMA kind (NETLINK_RDMA). 1116 * @param[in] name 1117 * IB device name. 1118 * 1119 * @return 1120 * A valid (nonzero) number of ports on success, 0 otherwise 1121 * and rte_errno is set. 1122 */ 1123 unsigned int 1124 mlx5_nl_portnum(int nl, const char *name) 1125 { 1126 struct mlx5_nl_ifindex_data data = { 1127 .flags = 0, 1128 .name = name, 1129 .ifindex = 0, 1130 .portnum = 0, 1131 }; 1132 struct nlmsghdr req = { 1133 .nlmsg_len = NLMSG_LENGTH(0), 1134 .nlmsg_type = RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, 1135 RDMA_NLDEV_CMD_GET), 1136 .nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_DUMP, 1137 }; 1138 uint32_t sn = MLX5_NL_SN_GENERATE; 1139 int ret; 1140 1141 ret = mlx5_nl_send(nl, &req, sn); 1142 if (ret < 0) 1143 return 0; 1144 ret = mlx5_nl_recv(nl, sn, mlx5_nl_cmdget_cb, &data); 1145 if (ret < 0) 1146 return 0; 1147 if (!(data.flags & MLX5_NL_CMD_GET_IB_NAME) || 1148 !(data.flags & MLX5_NL_CMD_GET_IB_INDEX) || 1149 !(data.flags & MLX5_NL_CMD_GET_PORT_INDEX)) { 1150 rte_errno = ENODEV; 1151 return 0; 1152 } 1153 if (!data.portnum) 1154 rte_errno = EINVAL; 1155 return data.portnum; 1156 } 1157 1158 /** 1159 * Analyze gathered port parameters via Netlink to recognize master 1160 * and representor devices for E-Switch configuration. 1161 * 1162 * @param[in] num_vf_set 1163 * flag of presence of number of VFs port attribute. 1164 * @param[inout] switch_info 1165 * Port information, including port name as a number and port name 1166 * type if recognized 1167 * 1168 * @return 1169 * master and representor flags are set in switch_info according to 1170 * recognized parameters (if any). 1171 */ 1172 static void 1173 mlx5_nl_check_switch_info(bool num_vf_set, 1174 struct mlx5_switch_info *switch_info) 1175 { 1176 switch (switch_info->name_type) { 1177 case MLX5_PHYS_PORT_NAME_TYPE_UNKNOWN: 1178 /* 1179 * Name is not recognized, assume the master, 1180 * check the number of VFs key presence. 1181 */ 1182 switch_info->master = num_vf_set; 1183 break; 1184 case MLX5_PHYS_PORT_NAME_TYPE_NOTSET: 1185 /* 1186 * Name is not set, this assumes the legacy naming 1187 * schema for master, just check if there is a 1188 * number of VFs key. 1189 */ 1190 switch_info->master = num_vf_set; 1191 break; 1192 case MLX5_PHYS_PORT_NAME_TYPE_UPLINK: 1193 /* New uplink naming schema recognized. */ 1194 switch_info->master = 1; 1195 break; 1196 case MLX5_PHYS_PORT_NAME_TYPE_LEGACY: 1197 /* Legacy representors naming schema. */ 1198 switch_info->representor = !num_vf_set; 1199 break; 1200 case MLX5_PHYS_PORT_NAME_TYPE_PFHPF: 1201 /* Fallthrough */ 1202 case MLX5_PHYS_PORT_NAME_TYPE_PFVF: 1203 /* Fallthrough */ 1204 case MLX5_PHYS_PORT_NAME_TYPE_PFSF: 1205 /* New representors naming schema. */ 1206 switch_info->representor = 1; 1207 break; 1208 } 1209 } 1210 1211 /** 1212 * Process switch information from Netlink message. 1213 * 1214 * @param nh 1215 * Pointer to Netlink message header. 1216 * @param arg 1217 * Opaque data pointer for this callback. 1218 * 1219 * @return 1220 * 0 on success, a negative errno value otherwise and rte_errno is set. 1221 */ 1222 static int 1223 mlx5_nl_switch_info_cb(struct nlmsghdr *nh, void *arg) 1224 { 1225 struct mlx5_switch_info info = { 1226 .master = 0, 1227 .representor = 0, 1228 .name_type = MLX5_PHYS_PORT_NAME_TYPE_NOTSET, 1229 .port_name = 0, 1230 .switch_id = 0, 1231 }; 1232 size_t off = NLMSG_LENGTH(sizeof(struct ifinfomsg)); 1233 bool switch_id_set = false; 1234 bool num_vf_set = false; 1235 int len; 1236 1237 if (nh->nlmsg_type != RTM_NEWLINK) 1238 goto error; 1239 while (off < nh->nlmsg_len) { 1240 struct rtattr *ra = (void *)((uintptr_t)nh + off); 1241 void *payload = RTA_DATA(ra); 1242 unsigned int i; 1243 1244 if (ra->rta_len > nh->nlmsg_len - off) 1245 goto error; 1246 switch (ra->rta_type) { 1247 case IFLA_NUM_VF: 1248 num_vf_set = true; 1249 break; 1250 case IFLA_PHYS_PORT_NAME: 1251 len = RTA_PAYLOAD(ra); 1252 /* Some kernels do not pad attributes with zero. */ 1253 if (len > 0 && len < MLX5_PHYS_PORT_NAME_MAX) { 1254 char name[MLX5_PHYS_PORT_NAME_MAX]; 1255 1256 /* 1257 * We can't just patch the message with padding 1258 * zero - it might corrupt the following items 1259 * in the message, we have to copy the string 1260 * by attribute length and pad the copied one. 1261 */ 1262 memcpy(name, payload, len); 1263 name[len] = 0; 1264 mlx5_translate_port_name(name, &info); 1265 } else { 1266 info.name_type = 1267 MLX5_PHYS_PORT_NAME_TYPE_UNKNOWN; 1268 } 1269 break; 1270 case IFLA_PHYS_SWITCH_ID: 1271 info.switch_id = 0; 1272 for (i = 0; i < RTA_PAYLOAD(ra); ++i) { 1273 info.switch_id <<= 8; 1274 info.switch_id |= ((uint8_t *)payload)[i]; 1275 } 1276 switch_id_set = true; 1277 break; 1278 } 1279 off += RTA_ALIGN(ra->rta_len); 1280 } 1281 if (switch_id_set) { 1282 /* We have some E-Switch configuration. */ 1283 mlx5_nl_check_switch_info(num_vf_set, &info); 1284 } 1285 MLX5_ASSERT(!(info.master && info.representor)); 1286 memcpy(arg, &info, sizeof(info)); 1287 return 0; 1288 error: 1289 rte_errno = EINVAL; 1290 return -rte_errno; 1291 } 1292 1293 /** 1294 * Get switch information associated with network interface. 1295 * 1296 * @param nl 1297 * Netlink socket of the ROUTE kind (NETLINK_ROUTE). 1298 * @param ifindex 1299 * Network interface index. 1300 * @param[out] info 1301 * Switch information object, populated in case of success. 1302 * 1303 * @return 1304 * 0 on success, a negative errno value otherwise and rte_errno is set. 1305 */ 1306 int 1307 mlx5_nl_switch_info(int nl, unsigned int ifindex, 1308 struct mlx5_switch_info *info) 1309 { 1310 struct { 1311 struct nlmsghdr nh; 1312 struct ifinfomsg info; 1313 struct rtattr rta; 1314 uint32_t extmask; 1315 } req = { 1316 .nh = { 1317 .nlmsg_len = NLMSG_LENGTH 1318 (sizeof(req.info) + 1319 RTA_LENGTH(sizeof(uint32_t))), 1320 .nlmsg_type = RTM_GETLINK, 1321 .nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK, 1322 }, 1323 .info = { 1324 .ifi_family = AF_UNSPEC, 1325 .ifi_index = ifindex, 1326 }, 1327 .rta = { 1328 .rta_type = IFLA_EXT_MASK, 1329 .rta_len = RTA_LENGTH(sizeof(int32_t)), 1330 }, 1331 .extmask = RTE_LE32(1), 1332 }; 1333 uint32_t sn = MLX5_NL_SN_GENERATE; 1334 int ret; 1335 1336 ret = mlx5_nl_send(nl, &req.nh, sn); 1337 if (ret >= 0) 1338 ret = mlx5_nl_recv(nl, sn, mlx5_nl_switch_info_cb, info); 1339 if (info->master && info->representor) { 1340 DRV_LOG(ERR, "ifindex %u device is recognized as master" 1341 " and as representor", ifindex); 1342 rte_errno = ENODEV; 1343 ret = -rte_errno; 1344 } 1345 return ret; 1346 } 1347 1348 /* 1349 * Delete VLAN network device by ifindex. 1350 * 1351 * @param[in] tcf 1352 * Context object initialized by mlx5_nl_vlan_vmwa_init(). 1353 * @param[in] ifindex 1354 * Interface index of network device to delete. 1355 */ 1356 void 1357 mlx5_nl_vlan_vmwa_delete(struct mlx5_nl_vlan_vmwa_context *vmwa, 1358 uint32_t ifindex) 1359 { 1360 uint32_t sn = MLX5_NL_SN_GENERATE; 1361 int ret; 1362 struct { 1363 struct nlmsghdr nh; 1364 struct ifinfomsg info; 1365 } req = { 1366 .nh = { 1367 .nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)), 1368 .nlmsg_type = RTM_DELLINK, 1369 .nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK, 1370 }, 1371 .info = { 1372 .ifi_family = AF_UNSPEC, 1373 .ifi_index = ifindex, 1374 }, 1375 }; 1376 1377 if (ifindex) { 1378 ret = mlx5_nl_send(vmwa->nl_socket, &req.nh, sn); 1379 if (ret >= 0) 1380 ret = mlx5_nl_recv(vmwa->nl_socket, sn, NULL, NULL); 1381 if (ret < 0) 1382 DRV_LOG(WARNING, "netlink: error deleting VLAN WA" 1383 " ifindex %u, %d", ifindex, ret); 1384 } 1385 } 1386 1387 /* Set of subroutines to build Netlink message. */ 1388 static struct nlattr * 1389 nl_msg_tail(struct nlmsghdr *nlh) 1390 { 1391 return (struct nlattr *) 1392 (((uint8_t *)nlh) + NLMSG_ALIGN(nlh->nlmsg_len)); 1393 } 1394 1395 static void 1396 nl_attr_put(struct nlmsghdr *nlh, int type, const void *data, int alen) 1397 { 1398 struct nlattr *nla = nl_msg_tail(nlh); 1399 1400 nla->nla_type = type; 1401 nla->nla_len = NLMSG_ALIGN(sizeof(struct nlattr)) + alen; 1402 nlh->nlmsg_len += NLMSG_ALIGN(nla->nla_len); 1403 1404 if (alen) 1405 memcpy((uint8_t *)nla + sizeof(struct nlattr), data, alen); 1406 } 1407 1408 static struct nlattr * 1409 nl_attr_nest_start(struct nlmsghdr *nlh, int type) 1410 { 1411 struct nlattr *nest = (struct nlattr *)nl_msg_tail(nlh); 1412 1413 nl_attr_put(nlh, type, NULL, 0); 1414 return nest; 1415 } 1416 1417 static void 1418 nl_attr_nest_end(struct nlmsghdr *nlh, struct nlattr *nest) 1419 { 1420 nest->nla_len = (uint8_t *)nl_msg_tail(nlh) - (uint8_t *)nest; 1421 } 1422 1423 /* 1424 * Create network VLAN device with specified VLAN tag. 1425 * 1426 * @param[in] tcf 1427 * Context object initialized by mlx5_nl_vlan_vmwa_init(). 1428 * @param[in] ifindex 1429 * Base network interface index. 1430 * @param[in] tag 1431 * VLAN tag for VLAN network device to create. 1432 */ 1433 uint32_t 1434 mlx5_nl_vlan_vmwa_create(struct mlx5_nl_vlan_vmwa_context *vmwa, 1435 uint32_t ifindex, uint16_t tag) 1436 { 1437 struct nlmsghdr *nlh; 1438 struct ifinfomsg *ifm; 1439 char name[sizeof(MLX5_VMWA_VLAN_DEVICE_PFX) + 32]; 1440 1441 __rte_cache_aligned 1442 uint8_t buf[NLMSG_ALIGN(sizeof(struct nlmsghdr)) + 1443 NLMSG_ALIGN(sizeof(struct ifinfomsg)) + 1444 NLMSG_ALIGN(sizeof(struct nlattr)) * 8 + 1445 NLMSG_ALIGN(sizeof(uint32_t)) + 1446 NLMSG_ALIGN(sizeof(name)) + 1447 NLMSG_ALIGN(sizeof("vlan")) + 1448 NLMSG_ALIGN(sizeof(uint32_t)) + 1449 NLMSG_ALIGN(sizeof(uint16_t)) + 16]; 1450 struct nlattr *na_info; 1451 struct nlattr *na_vlan; 1452 uint32_t sn = MLX5_NL_SN_GENERATE; 1453 int ret; 1454 1455 memset(buf, 0, sizeof(buf)); 1456 nlh = (struct nlmsghdr *)buf; 1457 nlh->nlmsg_len = sizeof(struct nlmsghdr); 1458 nlh->nlmsg_type = RTM_NEWLINK; 1459 nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | 1460 NLM_F_EXCL | NLM_F_ACK; 1461 ifm = (struct ifinfomsg *)nl_msg_tail(nlh); 1462 nlh->nlmsg_len += sizeof(struct ifinfomsg); 1463 ifm->ifi_family = AF_UNSPEC; 1464 ifm->ifi_type = 0; 1465 ifm->ifi_index = 0; 1466 ifm->ifi_flags = IFF_UP; 1467 ifm->ifi_change = 0xffffffff; 1468 nl_attr_put(nlh, IFLA_LINK, &ifindex, sizeof(ifindex)); 1469 ret = snprintf(name, sizeof(name), "%s.%u.%u", 1470 MLX5_VMWA_VLAN_DEVICE_PFX, ifindex, tag); 1471 nl_attr_put(nlh, IFLA_IFNAME, name, ret + 1); 1472 na_info = nl_attr_nest_start(nlh, IFLA_LINKINFO); 1473 nl_attr_put(nlh, IFLA_INFO_KIND, "vlan", sizeof("vlan")); 1474 na_vlan = nl_attr_nest_start(nlh, IFLA_INFO_DATA); 1475 nl_attr_put(nlh, IFLA_VLAN_ID, &tag, sizeof(tag)); 1476 nl_attr_nest_end(nlh, na_vlan); 1477 nl_attr_nest_end(nlh, na_info); 1478 MLX5_ASSERT(sizeof(buf) >= nlh->nlmsg_len); 1479 ret = mlx5_nl_send(vmwa->nl_socket, nlh, sn); 1480 if (ret >= 0) 1481 ret = mlx5_nl_recv(vmwa->nl_socket, sn, NULL, NULL); 1482 if (ret < 0) { 1483 DRV_LOG(WARNING, "netlink: VLAN %s create failure (%d)", name, 1484 ret); 1485 } 1486 /* Try to get ifindex of created or pre-existing device. */ 1487 ret = if_nametoindex(name); 1488 if (!ret) { 1489 DRV_LOG(WARNING, "VLAN %s failed to get index (%d)", name, 1490 errno); 1491 return 0; 1492 } 1493 return ret; 1494 } 1495 1496 /** 1497 * Parse Netlink message to retrieve the general family ID. 1498 * 1499 * @param nh 1500 * Pointer to Netlink Message Header. 1501 * @param arg 1502 * PMD data register with this callback. 1503 * 1504 * @return 1505 * 0 on success, a negative errno value otherwise and rte_errno is set. 1506 */ 1507 static int 1508 mlx5_nl_family_id_cb(struct nlmsghdr *nh, void *arg) 1509 { 1510 1511 struct nlattr *tail = RTE_PTR_ADD(nh, nh->nlmsg_len); 1512 struct nlattr *nla = RTE_PTR_ADD(nh, NLMSG_ALIGN(sizeof(*nh)) + 1513 NLMSG_ALIGN(sizeof(struct genlmsghdr))); 1514 1515 for (; nla->nla_len && nla < tail; 1516 nla = RTE_PTR_ADD(nla, NLMSG_ALIGN(nla->nla_len))) { 1517 if (nla->nla_type == CTRL_ATTR_FAMILY_ID) { 1518 *(uint16_t *)arg = *(uint16_t *)(nla + 1); 1519 return 0; 1520 } 1521 } 1522 return -EINVAL; 1523 } 1524 1525 #define MLX5_NL_MAX_ATTR_SIZE 100 1526 /** 1527 * Get generic netlink family ID. 1528 * 1529 * @param[in] nlsk_fd 1530 * Netlink socket file descriptor. 1531 * @param[in] name 1532 * The family name. 1533 * 1534 * @return 1535 * ID >= 0 on success and @p enable is updated, a negative errno value 1536 * otherwise and rte_errno is set. 1537 */ 1538 static int 1539 mlx5_nl_generic_family_id_get(int nlsk_fd, const char *name) 1540 { 1541 struct nlmsghdr *nlh; 1542 struct genlmsghdr *genl; 1543 uint32_t sn = MLX5_NL_SN_GENERATE; 1544 int name_size = strlen(name) + 1; 1545 int ret; 1546 uint16_t id = -1; 1547 uint8_t buf[NLMSG_ALIGN(sizeof(struct nlmsghdr)) + 1548 NLMSG_ALIGN(sizeof(struct genlmsghdr)) + 1549 NLMSG_ALIGN(sizeof(struct nlattr)) + 1550 NLMSG_ALIGN(MLX5_NL_MAX_ATTR_SIZE)]; 1551 1552 memset(buf, 0, sizeof(buf)); 1553 nlh = (struct nlmsghdr *)buf; 1554 nlh->nlmsg_len = sizeof(struct nlmsghdr); 1555 nlh->nlmsg_type = GENL_ID_CTRL; 1556 nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK; 1557 genl = (struct genlmsghdr *)nl_msg_tail(nlh); 1558 nlh->nlmsg_len += sizeof(struct genlmsghdr); 1559 genl->cmd = CTRL_CMD_GETFAMILY; 1560 genl->version = 1; 1561 nl_attr_put(nlh, CTRL_ATTR_FAMILY_NAME, name, name_size); 1562 ret = mlx5_nl_send(nlsk_fd, nlh, sn); 1563 if (ret >= 0) 1564 ret = mlx5_nl_recv(nlsk_fd, sn, mlx5_nl_family_id_cb, &id); 1565 if (ret < 0) { 1566 DRV_LOG(DEBUG, "Failed to get Netlink %s family ID: %d.", name, 1567 ret); 1568 return ret; 1569 } 1570 DRV_LOG(DEBUG, "Netlink \"%s\" family ID is %u.", name, id); 1571 return (int)id; 1572 } 1573 1574 /** 1575 * Get Devlink family ID. 1576 * 1577 * @param[in] nlsk_fd 1578 * Netlink socket file descriptor. 1579 * 1580 * @return 1581 * ID >= 0 on success and @p enable is updated, a negative errno value 1582 * otherwise and rte_errno is set. 1583 */ 1584 1585 int 1586 mlx5_nl_devlink_family_id_get(int nlsk_fd) 1587 { 1588 return mlx5_nl_generic_family_id_get(nlsk_fd, DEVLINK_GENL_NAME); 1589 } 1590 1591 /** 1592 * Parse Netlink message to retrieve the ROCE enable status. 1593 * 1594 * @param nh 1595 * Pointer to Netlink Message Header. 1596 * @param arg 1597 * PMD data register with this callback. 1598 * 1599 * @return 1600 * 0 on success, a negative errno value otherwise and rte_errno is set. 1601 */ 1602 static int 1603 mlx5_nl_roce_cb(struct nlmsghdr *nh, void *arg) 1604 { 1605 1606 int ret = -EINVAL; 1607 int *enable = arg; 1608 struct nlattr *tail = RTE_PTR_ADD(nh, nh->nlmsg_len); 1609 struct nlattr *nla = RTE_PTR_ADD(nh, NLMSG_ALIGN(sizeof(*nh)) + 1610 NLMSG_ALIGN(sizeof(struct genlmsghdr))); 1611 1612 while (nla->nla_len && nla < tail) { 1613 switch (nla->nla_type) { 1614 /* Expected nested attributes case. */ 1615 case DEVLINK_ATTR_PARAM: 1616 case DEVLINK_ATTR_PARAM_VALUES_LIST: 1617 case DEVLINK_ATTR_PARAM_VALUE: 1618 ret = 0; 1619 nla += 1; 1620 break; 1621 case DEVLINK_ATTR_PARAM_VALUE_DATA: 1622 *enable = 1; 1623 return 0; 1624 default: 1625 nla = RTE_PTR_ADD(nla, NLMSG_ALIGN(nla->nla_len)); 1626 } 1627 } 1628 *enable = 0; 1629 return ret; 1630 } 1631 1632 /** 1633 * Get ROCE enable status through Netlink. 1634 * 1635 * @param[in] nlsk_fd 1636 * Netlink socket file descriptor. 1637 * @param[in] family_id 1638 * the Devlink family ID. 1639 * @param pci_addr 1640 * The device PCI address. 1641 * @param[out] enable 1642 * Where to store the enable status. 1643 * 1644 * @return 1645 * 0 on success and @p enable is updated, a negative errno value otherwise 1646 * and rte_errno is set. 1647 */ 1648 int 1649 mlx5_nl_enable_roce_get(int nlsk_fd, int family_id, const char *pci_addr, 1650 int *enable) 1651 { 1652 struct nlmsghdr *nlh; 1653 struct genlmsghdr *genl; 1654 uint32_t sn = MLX5_NL_SN_GENERATE; 1655 int ret; 1656 int cur_en = 0; 1657 uint8_t buf[NLMSG_ALIGN(sizeof(struct nlmsghdr)) + 1658 NLMSG_ALIGN(sizeof(struct genlmsghdr)) + 1659 NLMSG_ALIGN(sizeof(struct nlattr)) * 4 + 1660 NLMSG_ALIGN(MLX5_NL_MAX_ATTR_SIZE) * 4]; 1661 1662 memset(buf, 0, sizeof(buf)); 1663 nlh = (struct nlmsghdr *)buf; 1664 nlh->nlmsg_len = sizeof(struct nlmsghdr); 1665 nlh->nlmsg_type = family_id; 1666 nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK; 1667 genl = (struct genlmsghdr *)nl_msg_tail(nlh); 1668 nlh->nlmsg_len += sizeof(struct genlmsghdr); 1669 genl->cmd = DEVLINK_CMD_PARAM_GET; 1670 genl->version = DEVLINK_GENL_VERSION; 1671 nl_attr_put(nlh, DEVLINK_ATTR_BUS_NAME, "pci", 4); 1672 nl_attr_put(nlh, DEVLINK_ATTR_DEV_NAME, pci_addr, strlen(pci_addr) + 1); 1673 nl_attr_put(nlh, DEVLINK_ATTR_PARAM_NAME, "enable_roce", 12); 1674 ret = mlx5_nl_send(nlsk_fd, nlh, sn); 1675 if (ret >= 0) 1676 ret = mlx5_nl_recv(nlsk_fd, sn, mlx5_nl_roce_cb, &cur_en); 1677 if (ret < 0) { 1678 DRV_LOG(DEBUG, "Failed to get ROCE enable on device %s: %d.", 1679 pci_addr, ret); 1680 return ret; 1681 } 1682 *enable = cur_en; 1683 DRV_LOG(DEBUG, "ROCE is %sabled for device \"%s\".", 1684 cur_en ? "en" : "dis", pci_addr); 1685 return ret; 1686 } 1687 1688 /** 1689 * Reload mlx5 device kernel driver through Netlink. 1690 * 1691 * @param[in] nlsk_fd 1692 * Netlink socket file descriptor. 1693 * @param[in] family_id 1694 * the Devlink family ID. 1695 * @param pci_addr 1696 * The device PCI address. 1697 * @param[out] enable 1698 * The enable status to set. 1699 * 1700 * @return 1701 * 0 on success, a negative errno value otherwise and rte_errno is set. 1702 */ 1703 int 1704 mlx5_nl_driver_reload(int nlsk_fd, int family_id, const char *pci_addr) 1705 { 1706 struct nlmsghdr *nlh; 1707 struct genlmsghdr *genl; 1708 uint32_t sn = MLX5_NL_SN_GENERATE; 1709 int ret; 1710 uint8_t buf[NLMSG_ALIGN(sizeof(struct nlmsghdr)) + 1711 NLMSG_ALIGN(sizeof(struct genlmsghdr)) + 1712 NLMSG_ALIGN(sizeof(struct nlattr)) * 2 + 1713 NLMSG_ALIGN(MLX5_NL_MAX_ATTR_SIZE) * 2]; 1714 1715 memset(buf, 0, sizeof(buf)); 1716 nlh = (struct nlmsghdr *)buf; 1717 nlh->nlmsg_len = sizeof(struct nlmsghdr); 1718 nlh->nlmsg_type = family_id; 1719 nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK; 1720 genl = (struct genlmsghdr *)nl_msg_tail(nlh); 1721 nlh->nlmsg_len += sizeof(struct genlmsghdr); 1722 genl->cmd = DEVLINK_CMD_RELOAD; 1723 genl->version = DEVLINK_GENL_VERSION; 1724 nl_attr_put(nlh, DEVLINK_ATTR_BUS_NAME, "pci", 4); 1725 nl_attr_put(nlh, DEVLINK_ATTR_DEV_NAME, pci_addr, strlen(pci_addr) + 1); 1726 ret = mlx5_nl_send(nlsk_fd, nlh, sn); 1727 if (ret >= 0) 1728 ret = mlx5_nl_recv(nlsk_fd, sn, NULL, NULL); 1729 if (ret < 0) { 1730 DRV_LOG(DEBUG, "Failed to reload %s device by Netlink - %d", 1731 pci_addr, ret); 1732 return ret; 1733 } 1734 DRV_LOG(DEBUG, "Device \"%s\" was reloaded by Netlink successfully.", 1735 pci_addr); 1736 return 0; 1737 } 1738 1739 /** 1740 * Set ROCE enable status through Netlink. 1741 * 1742 * @param[in] nlsk_fd 1743 * Netlink socket file descriptor. 1744 * @param[in] family_id 1745 * the Devlink family ID. 1746 * @param pci_addr 1747 * The device PCI address. 1748 * @param[out] enable 1749 * The enable status to set. 1750 * 1751 * @return 1752 * 0 on success, a negative errno value otherwise and rte_errno is set. 1753 */ 1754 int 1755 mlx5_nl_enable_roce_set(int nlsk_fd, int family_id, const char *pci_addr, 1756 int enable) 1757 { 1758 struct nlmsghdr *nlh; 1759 struct genlmsghdr *genl; 1760 uint32_t sn = MLX5_NL_SN_GENERATE; 1761 int ret; 1762 uint8_t buf[NLMSG_ALIGN(sizeof(struct nlmsghdr)) + 1763 NLMSG_ALIGN(sizeof(struct genlmsghdr)) + 1764 NLMSG_ALIGN(sizeof(struct nlattr)) * 6 + 1765 NLMSG_ALIGN(MLX5_NL_MAX_ATTR_SIZE) * 6]; 1766 uint8_t cmode = DEVLINK_PARAM_CMODE_DRIVERINIT; 1767 uint8_t ptype = NLA_FLAG; 1768 ; 1769 1770 memset(buf, 0, sizeof(buf)); 1771 nlh = (struct nlmsghdr *)buf; 1772 nlh->nlmsg_len = sizeof(struct nlmsghdr); 1773 nlh->nlmsg_type = family_id; 1774 nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK; 1775 genl = (struct genlmsghdr *)nl_msg_tail(nlh); 1776 nlh->nlmsg_len += sizeof(struct genlmsghdr); 1777 genl->cmd = DEVLINK_CMD_PARAM_SET; 1778 genl->version = DEVLINK_GENL_VERSION; 1779 nl_attr_put(nlh, DEVLINK_ATTR_BUS_NAME, "pci", 4); 1780 nl_attr_put(nlh, DEVLINK_ATTR_DEV_NAME, pci_addr, strlen(pci_addr) + 1); 1781 nl_attr_put(nlh, DEVLINK_ATTR_PARAM_NAME, "enable_roce", 12); 1782 nl_attr_put(nlh, DEVLINK_ATTR_PARAM_VALUE_CMODE, &cmode, sizeof(cmode)); 1783 nl_attr_put(nlh, DEVLINK_ATTR_PARAM_TYPE, &ptype, sizeof(ptype)); 1784 if (enable) 1785 nl_attr_put(nlh, DEVLINK_ATTR_PARAM_VALUE_DATA, NULL, 0); 1786 ret = mlx5_nl_send(nlsk_fd, nlh, sn); 1787 if (ret >= 0) 1788 ret = mlx5_nl_recv(nlsk_fd, sn, NULL, NULL); 1789 if (ret < 0) { 1790 DRV_LOG(DEBUG, "Failed to %sable ROCE for device %s by Netlink:" 1791 " %d.", enable ? "en" : "dis", pci_addr, ret); 1792 return ret; 1793 } 1794 DRV_LOG(DEBUG, "Device %s ROCE was %sabled by Netlink successfully.", 1795 pci_addr, enable ? "en" : "dis"); 1796 /* Now, need to reload the driver. */ 1797 return mlx5_nl_driver_reload(nlsk_fd, family_id, pci_addr); 1798 } 1799