1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright 2018 6WIND S.A. 3 * Copyright 2018 Mellanox Technologies, Ltd 4 */ 5 6 #include <errno.h> 7 #include <linux/if_link.h> 8 #include <linux/rtnetlink.h> 9 #include <linux/genetlink.h> 10 #include <net/if.h> 11 #include <rdma/rdma_netlink.h> 12 #include <stdbool.h> 13 #include <stdint.h> 14 #include <stdlib.h> 15 #include <stdalign.h> 16 #include <string.h> 17 #include <sys/socket.h> 18 #include <unistd.h> 19 20 #include <rte_errno.h> 21 22 #include "mlx5_nl.h" 23 #include "../mlx5_common_log.h" 24 #include "mlx5_malloc.h" 25 #ifdef HAVE_DEVLINK 26 #include <linux/devlink.h> 27 #endif 28 29 30 /* Size of the buffer to receive kernel messages */ 31 #define MLX5_NL_BUF_SIZE (32 * 1024) 32 /* Send buffer size for the Netlink socket */ 33 #define MLX5_SEND_BUF_SIZE 32768 34 /* Receive buffer size for the Netlink socket */ 35 #define MLX5_RECV_BUF_SIZE 32768 36 /* Maximal physical port name length. */ 37 #define MLX5_PHYS_PORT_NAME_MAX 128 38 39 /** Parameters of VLAN devices created by driver. */ 40 #define MLX5_VMWA_VLAN_DEVICE_PFX "evmlx" 41 /* 42 * Define NDA_RTA as defined in iproute2 sources. 43 * 44 * see in iproute2 sources file include/libnetlink.h 45 */ 46 #ifndef MLX5_NDA_RTA 47 #define MLX5_NDA_RTA(r) \ 48 ((struct rtattr *)(((char *)(r)) + NLMSG_ALIGN(sizeof(struct ndmsg)))) 49 #endif 50 /* 51 * Define NLMSG_TAIL as defined in iproute2 sources. 52 * 53 * see in iproute2 sources file include/libnetlink.h 54 */ 55 #ifndef NLMSG_TAIL 56 #define NLMSG_TAIL(nmsg) \ 57 ((struct rtattr *)(((char *)(nmsg)) + NLMSG_ALIGN((nmsg)->nlmsg_len))) 58 #endif 59 /* 60 * The following definitions are normally found in rdma/rdma_netlink.h, 61 * however they are so recent that most systems do not expose them yet. 62 */ 63 #ifndef HAVE_RDMA_NL_NLDEV 64 #define RDMA_NL_NLDEV 5 65 #endif 66 #ifndef HAVE_RDMA_NLDEV_CMD_GET 67 #define RDMA_NLDEV_CMD_GET 1 68 #endif 69 #ifndef HAVE_RDMA_NLDEV_CMD_PORT_GET 70 #define RDMA_NLDEV_CMD_PORT_GET 5 71 #endif 72 #ifndef HAVE_RDMA_NLDEV_ATTR_DEV_INDEX 73 #define RDMA_NLDEV_ATTR_DEV_INDEX 1 74 #endif 75 #ifndef HAVE_RDMA_NLDEV_ATTR_DEV_NAME 76 #define RDMA_NLDEV_ATTR_DEV_NAME 2 77 #endif 78 #ifndef HAVE_RDMA_NLDEV_ATTR_PORT_INDEX 79 #define RDMA_NLDEV_ATTR_PORT_INDEX 3 80 #endif 81 #ifndef HAVE_RDMA_NLDEV_ATTR_NDEV_INDEX 82 #define RDMA_NLDEV_ATTR_NDEV_INDEX 50 83 #endif 84 85 /* These are normally found in linux/if_link.h. */ 86 #ifndef HAVE_IFLA_NUM_VF 87 #define IFLA_NUM_VF 21 88 #endif 89 #ifndef HAVE_IFLA_EXT_MASK 90 #define IFLA_EXT_MASK 29 91 #endif 92 #ifndef HAVE_IFLA_PHYS_SWITCH_ID 93 #define IFLA_PHYS_SWITCH_ID 36 94 #endif 95 #ifndef HAVE_IFLA_PHYS_PORT_NAME 96 #define IFLA_PHYS_PORT_NAME 38 97 #endif 98 99 /* 100 * Some Devlink defines may be missed in old kernel versions, 101 * adjust used defines. 102 */ 103 #ifndef DEVLINK_GENL_NAME 104 #define DEVLINK_GENL_NAME "devlink" 105 #endif 106 #ifndef DEVLINK_GENL_VERSION 107 #define DEVLINK_GENL_VERSION 1 108 #endif 109 #ifndef DEVLINK_ATTR_BUS_NAME 110 #define DEVLINK_ATTR_BUS_NAME 1 111 #endif 112 #ifndef DEVLINK_ATTR_DEV_NAME 113 #define DEVLINK_ATTR_DEV_NAME 2 114 #endif 115 #ifndef DEVLINK_ATTR_PARAM 116 #define DEVLINK_ATTR_PARAM 80 117 #endif 118 #ifndef DEVLINK_ATTR_PARAM_NAME 119 #define DEVLINK_ATTR_PARAM_NAME 81 120 #endif 121 #ifndef DEVLINK_ATTR_PARAM_TYPE 122 #define DEVLINK_ATTR_PARAM_TYPE 83 123 #endif 124 #ifndef DEVLINK_ATTR_PARAM_VALUES_LIST 125 #define DEVLINK_ATTR_PARAM_VALUES_LIST 84 126 #endif 127 #ifndef DEVLINK_ATTR_PARAM_VALUE 128 #define DEVLINK_ATTR_PARAM_VALUE 85 129 #endif 130 #ifndef DEVLINK_ATTR_PARAM_VALUE_DATA 131 #define DEVLINK_ATTR_PARAM_VALUE_DATA 86 132 #endif 133 #ifndef DEVLINK_ATTR_PARAM_VALUE_CMODE 134 #define DEVLINK_ATTR_PARAM_VALUE_CMODE 87 135 #endif 136 #ifndef DEVLINK_PARAM_CMODE_DRIVERINIT 137 #define DEVLINK_PARAM_CMODE_DRIVERINIT 1 138 #endif 139 #ifndef DEVLINK_CMD_RELOAD 140 #define DEVLINK_CMD_RELOAD 37 141 #endif 142 #ifndef DEVLINK_CMD_PARAM_GET 143 #define DEVLINK_CMD_PARAM_GET 38 144 #endif 145 #ifndef DEVLINK_CMD_PARAM_SET 146 #define DEVLINK_CMD_PARAM_SET 39 147 #endif 148 #ifndef NLA_FLAG 149 #define NLA_FLAG 6 150 #endif 151 152 /* Add/remove MAC address through Netlink */ 153 struct mlx5_nl_mac_addr { 154 struct rte_ether_addr (*mac)[]; 155 /**< MAC address handled by the device. */ 156 int mac_n; /**< Number of addresses in the array. */ 157 }; 158 159 #define MLX5_NL_CMD_GET_IB_NAME (1 << 0) 160 #define MLX5_NL_CMD_GET_IB_INDEX (1 << 1) 161 #define MLX5_NL_CMD_GET_NET_INDEX (1 << 2) 162 #define MLX5_NL_CMD_GET_PORT_INDEX (1 << 3) 163 164 /** Data structure used by mlx5_nl_cmdget_cb(). */ 165 struct mlx5_nl_ifindex_data { 166 const char *name; /**< IB device name (in). */ 167 uint32_t flags; /**< found attribute flags (out). */ 168 uint32_t ibindex; /**< IB device index (out). */ 169 uint32_t ifindex; /**< Network interface index (out). */ 170 uint32_t portnum; /**< IB device max port number (out). */ 171 }; 172 173 uint32_t atomic_sn; 174 175 /* Generate Netlink sequence number. */ 176 #define MLX5_NL_SN_GENERATE __atomic_add_fetch(&atomic_sn, 1, __ATOMIC_RELAXED) 177 178 /** 179 * Opens a Netlink socket. 180 * 181 * @param protocol 182 * Netlink protocol (e.g. NETLINK_ROUTE, NETLINK_RDMA). 183 * 184 * @return 185 * A file descriptor on success, a negative errno value otherwise and 186 * rte_errno is set. 187 */ 188 int 189 mlx5_nl_init(int protocol) 190 { 191 int fd; 192 int buf_size; 193 socklen_t opt_size; 194 struct sockaddr_nl local = { 195 .nl_family = AF_NETLINK, 196 }; 197 int ret; 198 199 fd = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, protocol); 200 if (fd == -1) { 201 rte_errno = errno; 202 return -rte_errno; 203 } 204 opt_size = sizeof(buf_size); 205 ret = getsockopt(fd, SOL_SOCKET, SO_SNDBUF, &buf_size, &opt_size); 206 if (ret == -1) { 207 rte_errno = errno; 208 goto error; 209 } 210 DRV_LOG(DEBUG, "Netlink socket send buffer: %d", buf_size); 211 if (buf_size < MLX5_SEND_BUF_SIZE) { 212 ret = setsockopt(fd, SOL_SOCKET, SO_SNDBUF, 213 &buf_size, sizeof(buf_size)); 214 if (ret == -1) { 215 rte_errno = errno; 216 goto error; 217 } 218 } 219 opt_size = sizeof(buf_size); 220 ret = getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &buf_size, &opt_size); 221 if (ret == -1) { 222 rte_errno = errno; 223 goto error; 224 } 225 DRV_LOG(DEBUG, "Netlink socket recv buffer: %d", buf_size); 226 if (buf_size < MLX5_RECV_BUF_SIZE) { 227 ret = setsockopt(fd, SOL_SOCKET, SO_RCVBUF, 228 &buf_size, sizeof(buf_size)); 229 if (ret == -1) { 230 rte_errno = errno; 231 goto error; 232 } 233 } 234 ret = bind(fd, (struct sockaddr *)&local, sizeof(local)); 235 if (ret == -1) { 236 rte_errno = errno; 237 goto error; 238 } 239 return fd; 240 error: 241 close(fd); 242 return -rte_errno; 243 } 244 245 /** 246 * Send a request message to the kernel on the Netlink socket. 247 * 248 * @param[in] nlsk_fd 249 * Netlink socket file descriptor. 250 * @param[in] nh 251 * The Netlink message send to the kernel. 252 * @param[in] ssn 253 * Sequence number. 254 * @param[in] req 255 * Pointer to the request structure. 256 * @param[in] len 257 * Length of the request in bytes. 258 * 259 * @return 260 * The number of sent bytes on success, a negative errno value otherwise and 261 * rte_errno is set. 262 */ 263 static int 264 mlx5_nl_request(int nlsk_fd, struct nlmsghdr *nh, uint32_t sn, void *req, 265 int len) 266 { 267 struct sockaddr_nl sa = { 268 .nl_family = AF_NETLINK, 269 }; 270 struct iovec iov[2] = { 271 { .iov_base = nh, .iov_len = sizeof(*nh), }, 272 { .iov_base = req, .iov_len = len, }, 273 }; 274 struct msghdr msg = { 275 .msg_name = &sa, 276 .msg_namelen = sizeof(sa), 277 .msg_iov = iov, 278 .msg_iovlen = 2, 279 }; 280 int send_bytes; 281 282 nh->nlmsg_pid = 0; /* communication with the kernel uses pid 0 */ 283 nh->nlmsg_seq = sn; 284 send_bytes = sendmsg(nlsk_fd, &msg, 0); 285 if (send_bytes < 0) { 286 rte_errno = errno; 287 return -rte_errno; 288 } 289 return send_bytes; 290 } 291 292 /** 293 * Send a message to the kernel on the Netlink socket. 294 * 295 * @param[in] nlsk_fd 296 * The Netlink socket file descriptor used for communication. 297 * @param[in] nh 298 * The Netlink message send to the kernel. 299 * @param[in] sn 300 * Sequence number. 301 * 302 * @return 303 * The number of sent bytes on success, a negative errno value otherwise and 304 * rte_errno is set. 305 */ 306 static int 307 mlx5_nl_send(int nlsk_fd, struct nlmsghdr *nh, uint32_t sn) 308 { 309 struct sockaddr_nl sa = { 310 .nl_family = AF_NETLINK, 311 }; 312 struct iovec iov = { 313 .iov_base = nh, 314 .iov_len = nh->nlmsg_len, 315 }; 316 struct msghdr msg = { 317 .msg_name = &sa, 318 .msg_namelen = sizeof(sa), 319 .msg_iov = &iov, 320 .msg_iovlen = 1, 321 }; 322 int send_bytes; 323 324 nh->nlmsg_pid = 0; /* communication with the kernel uses pid 0 */ 325 nh->nlmsg_seq = sn; 326 send_bytes = sendmsg(nlsk_fd, &msg, 0); 327 if (send_bytes < 0) { 328 rte_errno = errno; 329 return -rte_errno; 330 } 331 return send_bytes; 332 } 333 334 /** 335 * Receive a message from the kernel on the Netlink socket, following 336 * mlx5_nl_send(). 337 * 338 * @param[in] nlsk_fd 339 * The Netlink socket file descriptor used for communication. 340 * @param[in] sn 341 * Sequence number. 342 * @param[in] cb 343 * The callback function to call for each Netlink message received. 344 * @param[in, out] arg 345 * Custom arguments for the callback. 346 * 347 * @return 348 * 0 on success, a negative errno value otherwise and rte_errno is set. 349 */ 350 static int 351 mlx5_nl_recv(int nlsk_fd, uint32_t sn, int (*cb)(struct nlmsghdr *, void *arg), 352 void *arg) 353 { 354 struct sockaddr_nl sa; 355 struct iovec iov; 356 struct msghdr msg = { 357 .msg_name = &sa, 358 .msg_namelen = sizeof(sa), 359 .msg_iov = &iov, 360 /* One message at a time */ 361 .msg_iovlen = 1, 362 }; 363 void *buf = NULL; 364 int multipart = 0; 365 int ret = 0; 366 367 do { 368 struct nlmsghdr *nh; 369 int recv_bytes; 370 371 do { 372 /* Query length of incoming message. */ 373 iov.iov_base = NULL; 374 iov.iov_len = 0; 375 recv_bytes = recvmsg(nlsk_fd, &msg, 376 MSG_PEEK | MSG_TRUNC); 377 if (recv_bytes < 0) { 378 rte_errno = errno; 379 ret = -rte_errno; 380 goto exit; 381 } 382 if (recv_bytes == 0) { 383 rte_errno = ENODATA; 384 ret = -rte_errno; 385 goto exit; 386 } 387 /* Allocate buffer to fetch the message. */ 388 if (recv_bytes < MLX5_RECV_BUF_SIZE) 389 recv_bytes = MLX5_RECV_BUF_SIZE; 390 mlx5_free(buf); 391 buf = mlx5_malloc(0, recv_bytes, 0, SOCKET_ID_ANY); 392 if (!buf) { 393 rte_errno = ENOMEM; 394 ret = -rte_errno; 395 goto exit; 396 } 397 /* Fetch the message. */ 398 iov.iov_base = buf; 399 iov.iov_len = recv_bytes; 400 recv_bytes = recvmsg(nlsk_fd, &msg, 0); 401 if (recv_bytes == -1) { 402 rte_errno = errno; 403 ret = -rte_errno; 404 goto exit; 405 } 406 nh = (struct nlmsghdr *)buf; 407 } while (nh->nlmsg_seq != sn); 408 for (; 409 NLMSG_OK(nh, (unsigned int)recv_bytes); 410 nh = NLMSG_NEXT(nh, recv_bytes)) { 411 if (nh->nlmsg_type == NLMSG_ERROR) { 412 struct nlmsgerr *err_data = NLMSG_DATA(nh); 413 414 if (err_data->error < 0) { 415 rte_errno = -err_data->error; 416 ret = -rte_errno; 417 goto exit; 418 } 419 /* Ack message. */ 420 ret = 0; 421 goto exit; 422 } 423 /* Multi-part msgs and their trailing DONE message. */ 424 if (nh->nlmsg_flags & NLM_F_MULTI) { 425 if (nh->nlmsg_type == NLMSG_DONE) { 426 ret = 0; 427 goto exit; 428 } 429 multipart = 1; 430 } 431 if (cb) { 432 ret = cb(nh, arg); 433 if (ret < 0) 434 goto exit; 435 } 436 } 437 } while (multipart); 438 exit: 439 mlx5_free(buf); 440 return ret; 441 } 442 443 /** 444 * Parse Netlink message to retrieve the bridge MAC address. 445 * 446 * @param nh 447 * Pointer to Netlink Message Header. 448 * @param arg 449 * PMD data register with this callback. 450 * 451 * @return 452 * 0 on success, a negative errno value otherwise and rte_errno is set. 453 */ 454 static int 455 mlx5_nl_mac_addr_cb(struct nlmsghdr *nh, void *arg) 456 { 457 struct mlx5_nl_mac_addr *data = arg; 458 struct ndmsg *r = NLMSG_DATA(nh); 459 struct rtattr *attribute; 460 int len; 461 462 len = nh->nlmsg_len - NLMSG_LENGTH(sizeof(*r)); 463 for (attribute = MLX5_NDA_RTA(r); 464 RTA_OK(attribute, len); 465 attribute = RTA_NEXT(attribute, len)) { 466 if (attribute->rta_type == NDA_LLADDR) { 467 if (data->mac_n == MLX5_MAX_MAC_ADDRESSES) { 468 DRV_LOG(WARNING, 469 "not enough room to finalize the" 470 " request"); 471 rte_errno = ENOMEM; 472 return -rte_errno; 473 } 474 #ifdef RTE_LIBRTE_MLX5_DEBUG 475 char m[RTE_ETHER_ADDR_FMT_SIZE]; 476 477 rte_ether_format_addr(m, RTE_ETHER_ADDR_FMT_SIZE, 478 RTA_DATA(attribute)); 479 DRV_LOG(DEBUG, "bridge MAC address %s", m); 480 #endif 481 memcpy(&(*data->mac)[data->mac_n++], 482 RTA_DATA(attribute), RTE_ETHER_ADDR_LEN); 483 } 484 } 485 return 0; 486 } 487 488 /** 489 * Get bridge MAC addresses. 490 * 491 * @param[in] nlsk_fd 492 * Netlink socket file descriptor. 493 * @param[in] iface_idx 494 * Net device interface index. 495 * @param mac[out] 496 * Pointer to the array table of MAC addresses to fill. 497 * Its size should be of MLX5_MAX_MAC_ADDRESSES. 498 * @param mac_n[out] 499 * Number of entries filled in MAC array. 500 * 501 * @return 502 * 0 on success, a negative errno value otherwise and rte_errno is set. 503 */ 504 static int 505 mlx5_nl_mac_addr_list(int nlsk_fd, unsigned int iface_idx, 506 struct rte_ether_addr (*mac)[], int *mac_n) 507 { 508 struct { 509 struct nlmsghdr hdr; 510 struct ifinfomsg ifm; 511 } req = { 512 .hdr = { 513 .nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)), 514 .nlmsg_type = RTM_GETNEIGH, 515 .nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST, 516 }, 517 .ifm = { 518 .ifi_family = PF_BRIDGE, 519 .ifi_index = iface_idx, 520 }, 521 }; 522 struct mlx5_nl_mac_addr data = { 523 .mac = mac, 524 .mac_n = 0, 525 }; 526 uint32_t sn = MLX5_NL_SN_GENERATE; 527 int ret; 528 529 if (nlsk_fd == -1) 530 return 0; 531 ret = mlx5_nl_request(nlsk_fd, &req.hdr, sn, &req.ifm, 532 sizeof(struct ifinfomsg)); 533 if (ret < 0) 534 goto error; 535 ret = mlx5_nl_recv(nlsk_fd, sn, mlx5_nl_mac_addr_cb, &data); 536 if (ret < 0) 537 goto error; 538 *mac_n = data.mac_n; 539 return 0; 540 error: 541 DRV_LOG(DEBUG, "Interface %u cannot retrieve MAC address list %s", 542 iface_idx, strerror(rte_errno)); 543 return -rte_errno; 544 } 545 546 /** 547 * Modify the MAC address neighbour table with Netlink. 548 * 549 * @param[in] nlsk_fd 550 * Netlink socket file descriptor. 551 * @param[in] iface_idx 552 * Net device interface index. 553 * @param mac 554 * MAC address to consider. 555 * @param add 556 * 1 to add the MAC address, 0 to remove the MAC address. 557 * 558 * @return 559 * 0 on success, a negative errno value otherwise and rte_errno is set. 560 */ 561 static int 562 mlx5_nl_mac_addr_modify(int nlsk_fd, unsigned int iface_idx, 563 struct rte_ether_addr *mac, int add) 564 { 565 struct { 566 struct nlmsghdr hdr; 567 struct ndmsg ndm; 568 struct rtattr rta; 569 uint8_t buffer[RTE_ETHER_ADDR_LEN]; 570 } req = { 571 .hdr = { 572 .nlmsg_len = NLMSG_LENGTH(sizeof(struct ndmsg)), 573 .nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | 574 NLM_F_EXCL | NLM_F_ACK, 575 .nlmsg_type = add ? RTM_NEWNEIGH : RTM_DELNEIGH, 576 }, 577 .ndm = { 578 .ndm_family = PF_BRIDGE, 579 .ndm_state = NUD_NOARP | NUD_PERMANENT, 580 .ndm_ifindex = iface_idx, 581 .ndm_flags = NTF_SELF, 582 }, 583 .rta = { 584 .rta_type = NDA_LLADDR, 585 .rta_len = RTA_LENGTH(RTE_ETHER_ADDR_LEN), 586 }, 587 }; 588 uint32_t sn = MLX5_NL_SN_GENERATE; 589 int ret; 590 591 if (nlsk_fd == -1) 592 return 0; 593 memcpy(RTA_DATA(&req.rta), mac, RTE_ETHER_ADDR_LEN); 594 req.hdr.nlmsg_len = NLMSG_ALIGN(req.hdr.nlmsg_len) + 595 RTA_ALIGN(req.rta.rta_len); 596 ret = mlx5_nl_send(nlsk_fd, &req.hdr, sn); 597 if (ret < 0) 598 goto error; 599 ret = mlx5_nl_recv(nlsk_fd, sn, NULL, NULL); 600 if (ret < 0) 601 goto error; 602 return 0; 603 error: 604 #ifdef RTE_LIBRTE_MLX5_DEBUG 605 { 606 char m[RTE_ETHER_ADDR_FMT_SIZE]; 607 608 rte_ether_format_addr(m, RTE_ETHER_ADDR_FMT_SIZE, mac); 609 DRV_LOG(DEBUG, 610 "Interface %u cannot %s MAC address %s %s", 611 iface_idx, 612 add ? "add" : "remove", m, strerror(rte_errno)); 613 } 614 #endif 615 return -rte_errno; 616 } 617 618 /** 619 * Modify the VF MAC address neighbour table with Netlink. 620 * 621 * @param[in] nlsk_fd 622 * Netlink socket file descriptor. 623 * @param[in] iface_idx 624 * Net device interface index. 625 * @param mac 626 * MAC address to consider. 627 * @param vf_index 628 * VF index. 629 * 630 * @return 631 * 0 on success, a negative errno value otherwise and rte_errno is set. 632 */ 633 int 634 mlx5_nl_vf_mac_addr_modify(int nlsk_fd, unsigned int iface_idx, 635 struct rte_ether_addr *mac, int vf_index) 636 { 637 int ret; 638 struct { 639 struct nlmsghdr hdr; 640 struct ifinfomsg ifm; 641 struct rtattr vf_list_rta; 642 struct rtattr vf_info_rta; 643 struct rtattr vf_mac_rta; 644 struct ifla_vf_mac ivm; 645 } req = { 646 .hdr = { 647 .nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)), 648 .nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK, 649 .nlmsg_type = RTM_BASE, 650 }, 651 .ifm = { 652 .ifi_index = iface_idx, 653 }, 654 .vf_list_rta = { 655 .rta_type = IFLA_VFINFO_LIST, 656 .rta_len = RTA_ALIGN(RTA_LENGTH(0)), 657 }, 658 .vf_info_rta = { 659 .rta_type = IFLA_VF_INFO, 660 .rta_len = RTA_ALIGN(RTA_LENGTH(0)), 661 }, 662 .vf_mac_rta = { 663 .rta_type = IFLA_VF_MAC, 664 }, 665 }; 666 struct ifla_vf_mac ivm = { 667 .vf = vf_index, 668 }; 669 uint32_t sn = MLX5_NL_SN_GENERATE; 670 671 memcpy(&ivm.mac, mac, RTE_ETHER_ADDR_LEN); 672 memcpy(RTA_DATA(&req.vf_mac_rta), &ivm, sizeof(ivm)); 673 674 req.vf_mac_rta.rta_len = RTA_LENGTH(sizeof(ivm)); 675 req.hdr.nlmsg_len = NLMSG_ALIGN(req.hdr.nlmsg_len) + 676 RTA_ALIGN(req.vf_list_rta.rta_len) + 677 RTA_ALIGN(req.vf_info_rta.rta_len) + 678 RTA_ALIGN(req.vf_mac_rta.rta_len); 679 req.vf_list_rta.rta_len = RTE_PTR_DIFF(NLMSG_TAIL(&req.hdr), 680 &req.vf_list_rta); 681 req.vf_info_rta.rta_len = RTE_PTR_DIFF(NLMSG_TAIL(&req.hdr), 682 &req.vf_info_rta); 683 684 if (nlsk_fd < 0) 685 return -1; 686 ret = mlx5_nl_send(nlsk_fd, &req.hdr, sn); 687 if (ret < 0) 688 goto error; 689 ret = mlx5_nl_recv(nlsk_fd, sn, NULL, NULL); 690 if (ret < 0) 691 goto error; 692 return 0; 693 error: 694 DRV_LOG(ERR, 695 "representor %u cannot set VF MAC address " 696 "%02X:%02X:%02X:%02X:%02X:%02X : %s", 697 vf_index, 698 mac->addr_bytes[0], mac->addr_bytes[1], 699 mac->addr_bytes[2], mac->addr_bytes[3], 700 mac->addr_bytes[4], mac->addr_bytes[5], 701 strerror(rte_errno)); 702 return -rte_errno; 703 } 704 705 /** 706 * Add a MAC address. 707 * 708 * @param[in] nlsk_fd 709 * Netlink socket file descriptor. 710 * @param[in] iface_idx 711 * Net device interface index. 712 * @param mac_own 713 * BITFIELD_DECLARE array to store the mac. 714 * @param mac 715 * MAC address to register. 716 * @param index 717 * MAC address index. 718 * 719 * @return 720 * 0 on success, a negative errno value otherwise and rte_errno is set. 721 */ 722 int 723 mlx5_nl_mac_addr_add(int nlsk_fd, unsigned int iface_idx, 724 uint64_t *mac_own, struct rte_ether_addr *mac, 725 uint32_t index) 726 { 727 int ret; 728 729 ret = mlx5_nl_mac_addr_modify(nlsk_fd, iface_idx, mac, 1); 730 if (!ret) { 731 MLX5_ASSERT(index < MLX5_MAX_MAC_ADDRESSES); 732 if (index >= MLX5_MAX_MAC_ADDRESSES) 733 return -EINVAL; 734 735 BITFIELD_SET(mac_own, index); 736 } 737 if (ret == -EEXIST) 738 return 0; 739 return ret; 740 } 741 742 /** 743 * Remove a MAC address. 744 * 745 * @param[in] nlsk_fd 746 * Netlink socket file descriptor. 747 * @param[in] iface_idx 748 * Net device interface index. 749 * @param mac_own 750 * BITFIELD_DECLARE array to store the mac. 751 * @param mac 752 * MAC address to remove. 753 * @param index 754 * MAC address index. 755 * 756 * @return 757 * 0 on success, a negative errno value otherwise and rte_errno is set. 758 */ 759 int 760 mlx5_nl_mac_addr_remove(int nlsk_fd, unsigned int iface_idx, uint64_t *mac_own, 761 struct rte_ether_addr *mac, uint32_t index) 762 { 763 MLX5_ASSERT(index < MLX5_MAX_MAC_ADDRESSES); 764 if (index >= MLX5_MAX_MAC_ADDRESSES) 765 return -EINVAL; 766 767 BITFIELD_RESET(mac_own, index); 768 return mlx5_nl_mac_addr_modify(nlsk_fd, iface_idx, mac, 0); 769 } 770 771 /** 772 * Synchronize Netlink bridge table to the internal table. 773 * 774 * @param[in] nlsk_fd 775 * Netlink socket file descriptor. 776 * @param[in] iface_idx 777 * Net device interface index. 778 * @param mac_addrs 779 * Mac addresses array to sync. 780 * @param n 781 * @p mac_addrs array size. 782 */ 783 void 784 mlx5_nl_mac_addr_sync(int nlsk_fd, unsigned int iface_idx, 785 struct rte_ether_addr *mac_addrs, int n) 786 { 787 struct rte_ether_addr macs[n]; 788 int macs_n = 0; 789 int i; 790 int ret; 791 792 memset(macs, 0, n * sizeof(macs[0])); 793 ret = mlx5_nl_mac_addr_list(nlsk_fd, iface_idx, &macs, &macs_n); 794 if (ret) 795 return; 796 for (i = 0; i != macs_n; ++i) { 797 int j; 798 799 /* Verify the address is not in the array yet. */ 800 for (j = 0; j != n; ++j) 801 if (rte_is_same_ether_addr(&macs[i], &mac_addrs[j])) 802 break; 803 if (j != n) 804 continue; 805 if (rte_is_multicast_ether_addr(&macs[i])) { 806 /* Find the first entry available. */ 807 for (j = MLX5_MAX_UC_MAC_ADDRESSES; j != n; ++j) { 808 if (rte_is_zero_ether_addr(&mac_addrs[j])) { 809 mac_addrs[j] = macs[i]; 810 break; 811 } 812 } 813 } else { 814 /* Find the first entry available. */ 815 for (j = 0; j != MLX5_MAX_UC_MAC_ADDRESSES; ++j) { 816 if (rte_is_zero_ether_addr(&mac_addrs[j])) { 817 mac_addrs[j] = macs[i]; 818 break; 819 } 820 } 821 } 822 } 823 } 824 825 /** 826 * Flush all added MAC addresses. 827 * 828 * @param[in] nlsk_fd 829 * Netlink socket file descriptor. 830 * @param[in] iface_idx 831 * Net device interface index. 832 * @param[in] mac_addrs 833 * Mac addresses array to flush. 834 * @param n 835 * @p mac_addrs array size. 836 * @param mac_own 837 * BITFIELD_DECLARE array to store the mac. 838 */ 839 void 840 mlx5_nl_mac_addr_flush(int nlsk_fd, unsigned int iface_idx, 841 struct rte_ether_addr *mac_addrs, int n, 842 uint64_t *mac_own) 843 { 844 int i; 845 846 if (n <= 0 || n > MLX5_MAX_MAC_ADDRESSES) 847 return; 848 849 for (i = n - 1; i >= 0; --i) { 850 struct rte_ether_addr *m = &mac_addrs[i]; 851 852 if (BITFIELD_ISSET(mac_own, i)) 853 mlx5_nl_mac_addr_remove(nlsk_fd, iface_idx, mac_own, m, 854 i); 855 } 856 } 857 858 /** 859 * Enable promiscuous / all multicast mode through Netlink. 860 * 861 * @param[in] nlsk_fd 862 * Netlink socket file descriptor. 863 * @param[in] iface_idx 864 * Net device interface index. 865 * @param flags 866 * IFF_PROMISC for promiscuous, IFF_ALLMULTI for allmulti. 867 * @param enable 868 * Nonzero to enable, disable otherwise. 869 * 870 * @return 871 * 0 on success, a negative errno value otherwise and rte_errno is set. 872 */ 873 static int 874 mlx5_nl_device_flags(int nlsk_fd, unsigned int iface_idx, uint32_t flags, 875 int enable) 876 { 877 struct { 878 struct nlmsghdr hdr; 879 struct ifinfomsg ifi; 880 } req = { 881 .hdr = { 882 .nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)), 883 .nlmsg_type = RTM_NEWLINK, 884 .nlmsg_flags = NLM_F_REQUEST, 885 }, 886 .ifi = { 887 .ifi_flags = enable ? flags : 0, 888 .ifi_change = flags, 889 .ifi_index = iface_idx, 890 }, 891 }; 892 uint32_t sn = MLX5_NL_SN_GENERATE; 893 int ret; 894 895 MLX5_ASSERT(!(flags & ~(IFF_PROMISC | IFF_ALLMULTI))); 896 if (nlsk_fd < 0) 897 return 0; 898 ret = mlx5_nl_send(nlsk_fd, &req.hdr, sn); 899 if (ret < 0) 900 return ret; 901 return 0; 902 } 903 904 /** 905 * Enable promiscuous mode through Netlink. 906 * 907 * @param[in] nlsk_fd 908 * Netlink socket file descriptor. 909 * @param[in] iface_idx 910 * Net device interface index. 911 * @param enable 912 * Nonzero to enable, disable otherwise. 913 * 914 * @return 915 * 0 on success, a negative errno value otherwise and rte_errno is set. 916 */ 917 int 918 mlx5_nl_promisc(int nlsk_fd, unsigned int iface_idx, int enable) 919 { 920 int ret = mlx5_nl_device_flags(nlsk_fd, iface_idx, IFF_PROMISC, enable); 921 922 if (ret) 923 DRV_LOG(DEBUG, 924 "Interface %u cannot %s promisc mode: Netlink error %s", 925 iface_idx, enable ? "enable" : "disable", 926 strerror(rte_errno)); 927 return ret; 928 } 929 930 /** 931 * Enable all multicast mode through Netlink. 932 * 933 * @param[in] nlsk_fd 934 * Netlink socket file descriptor. 935 * @param[in] iface_idx 936 * Net device interface index. 937 * @param enable 938 * Nonzero to enable, disable otherwise. 939 * 940 * @return 941 * 0 on success, a negative errno value otherwise and rte_errno is set. 942 */ 943 int 944 mlx5_nl_allmulti(int nlsk_fd, unsigned int iface_idx, int enable) 945 { 946 int ret = mlx5_nl_device_flags(nlsk_fd, iface_idx, IFF_ALLMULTI, 947 enable); 948 949 if (ret) 950 DRV_LOG(DEBUG, 951 "Interface %u cannot %s allmulti : Netlink error %s", 952 iface_idx, enable ? "enable" : "disable", 953 strerror(rte_errno)); 954 return ret; 955 } 956 957 /** 958 * Process network interface information from Netlink message. 959 * 960 * @param nh 961 * Pointer to Netlink message header. 962 * @param arg 963 * Opaque data pointer for this callback. 964 * 965 * @return 966 * 0 on success, a negative errno value otherwise and rte_errno is set. 967 */ 968 static int 969 mlx5_nl_cmdget_cb(struct nlmsghdr *nh, void *arg) 970 { 971 struct mlx5_nl_ifindex_data *data = arg; 972 struct mlx5_nl_ifindex_data local = { 973 .flags = 0, 974 }; 975 size_t off = NLMSG_HDRLEN; 976 977 if (nh->nlmsg_type != 978 RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_GET) && 979 nh->nlmsg_type != 980 RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_PORT_GET)) 981 goto error; 982 while (off < nh->nlmsg_len) { 983 struct nlattr *na = (void *)((uintptr_t)nh + off); 984 void *payload = (void *)((uintptr_t)na + NLA_HDRLEN); 985 986 if (na->nla_len > nh->nlmsg_len - off) 987 goto error; 988 switch (na->nla_type) { 989 case RDMA_NLDEV_ATTR_DEV_INDEX: 990 local.ibindex = *(uint32_t *)payload; 991 local.flags |= MLX5_NL_CMD_GET_IB_INDEX; 992 break; 993 case RDMA_NLDEV_ATTR_DEV_NAME: 994 if (!strcmp(payload, data->name)) 995 local.flags |= MLX5_NL_CMD_GET_IB_NAME; 996 break; 997 case RDMA_NLDEV_ATTR_NDEV_INDEX: 998 local.ifindex = *(uint32_t *)payload; 999 local.flags |= MLX5_NL_CMD_GET_NET_INDEX; 1000 break; 1001 case RDMA_NLDEV_ATTR_PORT_INDEX: 1002 local.portnum = *(uint32_t *)payload; 1003 local.flags |= MLX5_NL_CMD_GET_PORT_INDEX; 1004 break; 1005 default: 1006 break; 1007 } 1008 off += NLA_ALIGN(na->nla_len); 1009 } 1010 /* 1011 * It is possible to have multiple messages for all 1012 * Infiniband devices in the system with appropriate name. 1013 * So we should gather parameters locally and copy to 1014 * query context only in case of coinciding device name. 1015 */ 1016 if (local.flags & MLX5_NL_CMD_GET_IB_NAME) { 1017 data->flags = local.flags; 1018 data->ibindex = local.ibindex; 1019 data->ifindex = local.ifindex; 1020 data->portnum = local.portnum; 1021 } 1022 return 0; 1023 error: 1024 rte_errno = EINVAL; 1025 return -rte_errno; 1026 } 1027 1028 /** 1029 * Get index of network interface associated with some IB device. 1030 * 1031 * This is the only somewhat safe method to avoid resorting to heuristics 1032 * when faced with port representors. Unfortunately it requires at least 1033 * Linux 4.17. 1034 * 1035 * @param nl 1036 * Netlink socket of the RDMA kind (NETLINK_RDMA). 1037 * @param[in] name 1038 * IB device name. 1039 * @param[in] pindex 1040 * IB device port index, starting from 1 1041 * @return 1042 * A valid (nonzero) interface index on success, 0 otherwise and rte_errno 1043 * is set. 1044 */ 1045 unsigned int 1046 mlx5_nl_ifindex(int nl, const char *name, uint32_t pindex) 1047 { 1048 struct mlx5_nl_ifindex_data data = { 1049 .name = name, 1050 .flags = 0, 1051 .ibindex = 0, /* Determined during first pass. */ 1052 .ifindex = 0, /* Determined during second pass. */ 1053 }; 1054 union { 1055 struct nlmsghdr nh; 1056 uint8_t buf[NLMSG_HDRLEN + 1057 NLA_HDRLEN + NLA_ALIGN(sizeof(data.ibindex)) + 1058 NLA_HDRLEN + NLA_ALIGN(sizeof(pindex))]; 1059 } req = { 1060 .nh = { 1061 .nlmsg_len = NLMSG_LENGTH(0), 1062 .nlmsg_type = RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, 1063 RDMA_NLDEV_CMD_GET), 1064 .nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_DUMP, 1065 }, 1066 }; 1067 struct nlattr *na; 1068 uint32_t sn = MLX5_NL_SN_GENERATE; 1069 int ret; 1070 1071 ret = mlx5_nl_send(nl, &req.nh, sn); 1072 if (ret < 0) 1073 return 0; 1074 ret = mlx5_nl_recv(nl, sn, mlx5_nl_cmdget_cb, &data); 1075 if (ret < 0) 1076 return 0; 1077 if (!(data.flags & MLX5_NL_CMD_GET_IB_NAME) || 1078 !(data.flags & MLX5_NL_CMD_GET_IB_INDEX)) 1079 goto error; 1080 data.flags = 0; 1081 sn = MLX5_NL_SN_GENERATE; 1082 req.nh.nlmsg_type = RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, 1083 RDMA_NLDEV_CMD_PORT_GET); 1084 req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK; 1085 req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(req.buf) - NLMSG_HDRLEN); 1086 na = (void *)((uintptr_t)req.buf + NLMSG_HDRLEN); 1087 na->nla_len = NLA_HDRLEN + sizeof(data.ibindex); 1088 na->nla_type = RDMA_NLDEV_ATTR_DEV_INDEX; 1089 memcpy((void *)((uintptr_t)na + NLA_HDRLEN), 1090 &data.ibindex, sizeof(data.ibindex)); 1091 na = (void *)((uintptr_t)na + NLA_ALIGN(na->nla_len)); 1092 na->nla_len = NLA_HDRLEN + sizeof(pindex); 1093 na->nla_type = RDMA_NLDEV_ATTR_PORT_INDEX; 1094 memcpy((void *)((uintptr_t)na + NLA_HDRLEN), 1095 &pindex, sizeof(pindex)); 1096 ret = mlx5_nl_send(nl, &req.nh, sn); 1097 if (ret < 0) 1098 return 0; 1099 ret = mlx5_nl_recv(nl, sn, mlx5_nl_cmdget_cb, &data); 1100 if (ret < 0) 1101 return 0; 1102 if (!(data.flags & MLX5_NL_CMD_GET_IB_NAME) || 1103 !(data.flags & MLX5_NL_CMD_GET_IB_INDEX) || 1104 !(data.flags & MLX5_NL_CMD_GET_NET_INDEX) || 1105 !data.ifindex) 1106 goto error; 1107 return data.ifindex; 1108 error: 1109 rte_errno = ENODEV; 1110 return 0; 1111 } 1112 1113 /** 1114 * Get the number of physical ports of given IB device. 1115 * 1116 * @param nl 1117 * Netlink socket of the RDMA kind (NETLINK_RDMA). 1118 * @param[in] name 1119 * IB device name. 1120 * 1121 * @return 1122 * A valid (nonzero) number of ports on success, 0 otherwise 1123 * and rte_errno is set. 1124 */ 1125 unsigned int 1126 mlx5_nl_portnum(int nl, const char *name) 1127 { 1128 struct mlx5_nl_ifindex_data data = { 1129 .flags = 0, 1130 .name = name, 1131 .ifindex = 0, 1132 .portnum = 0, 1133 }; 1134 struct nlmsghdr req = { 1135 .nlmsg_len = NLMSG_LENGTH(0), 1136 .nlmsg_type = RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, 1137 RDMA_NLDEV_CMD_GET), 1138 .nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_DUMP, 1139 }; 1140 uint32_t sn = MLX5_NL_SN_GENERATE; 1141 int ret; 1142 1143 ret = mlx5_nl_send(nl, &req, sn); 1144 if (ret < 0) 1145 return 0; 1146 ret = mlx5_nl_recv(nl, sn, mlx5_nl_cmdget_cb, &data); 1147 if (ret < 0) 1148 return 0; 1149 if (!(data.flags & MLX5_NL_CMD_GET_IB_NAME) || 1150 !(data.flags & MLX5_NL_CMD_GET_IB_INDEX) || 1151 !(data.flags & MLX5_NL_CMD_GET_PORT_INDEX)) { 1152 rte_errno = ENODEV; 1153 return 0; 1154 } 1155 if (!data.portnum) 1156 rte_errno = EINVAL; 1157 return data.portnum; 1158 } 1159 1160 /** 1161 * Analyze gathered port parameters via Netlink to recognize master 1162 * and representor devices for E-Switch configuration. 1163 * 1164 * @param[in] num_vf_set 1165 * flag of presence of number of VFs port attribute. 1166 * @param[inout] switch_info 1167 * Port information, including port name as a number and port name 1168 * type if recognized 1169 * 1170 * @return 1171 * master and representor flags are set in switch_info according to 1172 * recognized parameters (if any). 1173 */ 1174 static void 1175 mlx5_nl_check_switch_info(bool num_vf_set, 1176 struct mlx5_switch_info *switch_info) 1177 { 1178 switch (switch_info->name_type) { 1179 case MLX5_PHYS_PORT_NAME_TYPE_UNKNOWN: 1180 /* 1181 * Name is not recognized, assume the master, 1182 * check the number of VFs key presence. 1183 */ 1184 switch_info->master = num_vf_set; 1185 break; 1186 case MLX5_PHYS_PORT_NAME_TYPE_NOTSET: 1187 /* 1188 * Name is not set, this assumes the legacy naming 1189 * schema for master, just check if there is a 1190 * number of VFs key. 1191 */ 1192 switch_info->master = num_vf_set; 1193 break; 1194 case MLX5_PHYS_PORT_NAME_TYPE_UPLINK: 1195 /* New uplink naming schema recognized. */ 1196 switch_info->master = 1; 1197 break; 1198 case MLX5_PHYS_PORT_NAME_TYPE_LEGACY: 1199 /* Legacy representors naming schema. */ 1200 switch_info->representor = !num_vf_set; 1201 break; 1202 case MLX5_PHYS_PORT_NAME_TYPE_PFHPF: 1203 /* Fallthrough */ 1204 case MLX5_PHYS_PORT_NAME_TYPE_PFVF: 1205 /* Fallthrough */ 1206 case MLX5_PHYS_PORT_NAME_TYPE_PFSF: 1207 /* New representors naming schema. */ 1208 switch_info->representor = 1; 1209 break; 1210 } 1211 } 1212 1213 /** 1214 * Process switch information from Netlink message. 1215 * 1216 * @param nh 1217 * Pointer to Netlink message header. 1218 * @param arg 1219 * Opaque data pointer for this callback. 1220 * 1221 * @return 1222 * 0 on success, a negative errno value otherwise and rte_errno is set. 1223 */ 1224 static int 1225 mlx5_nl_switch_info_cb(struct nlmsghdr *nh, void *arg) 1226 { 1227 struct mlx5_switch_info info = { 1228 .master = 0, 1229 .representor = 0, 1230 .name_type = MLX5_PHYS_PORT_NAME_TYPE_NOTSET, 1231 .port_name = 0, 1232 .switch_id = 0, 1233 }; 1234 size_t off = NLMSG_LENGTH(sizeof(struct ifinfomsg)); 1235 bool switch_id_set = false; 1236 bool num_vf_set = false; 1237 int len; 1238 1239 if (nh->nlmsg_type != RTM_NEWLINK) 1240 goto error; 1241 while (off < nh->nlmsg_len) { 1242 struct rtattr *ra = (void *)((uintptr_t)nh + off); 1243 void *payload = RTA_DATA(ra); 1244 unsigned int i; 1245 1246 if (ra->rta_len > nh->nlmsg_len - off) 1247 goto error; 1248 switch (ra->rta_type) { 1249 case IFLA_NUM_VF: 1250 num_vf_set = true; 1251 break; 1252 case IFLA_PHYS_PORT_NAME: 1253 len = RTA_PAYLOAD(ra); 1254 /* Some kernels do not pad attributes with zero. */ 1255 if (len > 0 && len < MLX5_PHYS_PORT_NAME_MAX) { 1256 char name[MLX5_PHYS_PORT_NAME_MAX]; 1257 1258 /* 1259 * We can't just patch the message with padding 1260 * zero - it might corrupt the following items 1261 * in the message, we have to copy the string 1262 * by attribute length and pad the copied one. 1263 */ 1264 memcpy(name, payload, len); 1265 name[len] = 0; 1266 mlx5_translate_port_name(name, &info); 1267 } else { 1268 info.name_type = 1269 MLX5_PHYS_PORT_NAME_TYPE_UNKNOWN; 1270 } 1271 break; 1272 case IFLA_PHYS_SWITCH_ID: 1273 info.switch_id = 0; 1274 for (i = 0; i < RTA_PAYLOAD(ra); ++i) { 1275 info.switch_id <<= 8; 1276 info.switch_id |= ((uint8_t *)payload)[i]; 1277 } 1278 switch_id_set = true; 1279 break; 1280 } 1281 off += RTA_ALIGN(ra->rta_len); 1282 } 1283 if (switch_id_set) { 1284 /* We have some E-Switch configuration. */ 1285 mlx5_nl_check_switch_info(num_vf_set, &info); 1286 } 1287 MLX5_ASSERT(!(info.master && info.representor)); 1288 memcpy(arg, &info, sizeof(info)); 1289 return 0; 1290 error: 1291 rte_errno = EINVAL; 1292 return -rte_errno; 1293 } 1294 1295 /** 1296 * Get switch information associated with network interface. 1297 * 1298 * @param nl 1299 * Netlink socket of the ROUTE kind (NETLINK_ROUTE). 1300 * @param ifindex 1301 * Network interface index. 1302 * @param[out] info 1303 * Switch information object, populated in case of success. 1304 * 1305 * @return 1306 * 0 on success, a negative errno value otherwise and rte_errno is set. 1307 */ 1308 int 1309 mlx5_nl_switch_info(int nl, unsigned int ifindex, 1310 struct mlx5_switch_info *info) 1311 { 1312 struct { 1313 struct nlmsghdr nh; 1314 struct ifinfomsg info; 1315 struct rtattr rta; 1316 uint32_t extmask; 1317 } req = { 1318 .nh = { 1319 .nlmsg_len = NLMSG_LENGTH 1320 (sizeof(req.info) + 1321 RTA_LENGTH(sizeof(uint32_t))), 1322 .nlmsg_type = RTM_GETLINK, 1323 .nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK, 1324 }, 1325 .info = { 1326 .ifi_family = AF_UNSPEC, 1327 .ifi_index = ifindex, 1328 }, 1329 .rta = { 1330 .rta_type = IFLA_EXT_MASK, 1331 .rta_len = RTA_LENGTH(sizeof(int32_t)), 1332 }, 1333 .extmask = RTE_LE32(1), 1334 }; 1335 uint32_t sn = MLX5_NL_SN_GENERATE; 1336 int ret; 1337 1338 ret = mlx5_nl_send(nl, &req.nh, sn); 1339 if (ret >= 0) 1340 ret = mlx5_nl_recv(nl, sn, mlx5_nl_switch_info_cb, info); 1341 if (info->master && info->representor) { 1342 DRV_LOG(ERR, "ifindex %u device is recognized as master" 1343 " and as representor", ifindex); 1344 rte_errno = ENODEV; 1345 ret = -rte_errno; 1346 } 1347 return ret; 1348 } 1349 1350 /* 1351 * Delete VLAN network device by ifindex. 1352 * 1353 * @param[in] tcf 1354 * Context object initialized by mlx5_nl_vlan_vmwa_init(). 1355 * @param[in] ifindex 1356 * Interface index of network device to delete. 1357 */ 1358 void 1359 mlx5_nl_vlan_vmwa_delete(struct mlx5_nl_vlan_vmwa_context *vmwa, 1360 uint32_t ifindex) 1361 { 1362 uint32_t sn = MLX5_NL_SN_GENERATE; 1363 int ret; 1364 struct { 1365 struct nlmsghdr nh; 1366 struct ifinfomsg info; 1367 } req = { 1368 .nh = { 1369 .nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)), 1370 .nlmsg_type = RTM_DELLINK, 1371 .nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK, 1372 }, 1373 .info = { 1374 .ifi_family = AF_UNSPEC, 1375 .ifi_index = ifindex, 1376 }, 1377 }; 1378 1379 if (ifindex) { 1380 ret = mlx5_nl_send(vmwa->nl_socket, &req.nh, sn); 1381 if (ret >= 0) 1382 ret = mlx5_nl_recv(vmwa->nl_socket, sn, NULL, NULL); 1383 if (ret < 0) 1384 DRV_LOG(WARNING, "netlink: error deleting VLAN WA" 1385 " ifindex %u, %d", ifindex, ret); 1386 } 1387 } 1388 1389 /* Set of subroutines to build Netlink message. */ 1390 static struct nlattr * 1391 nl_msg_tail(struct nlmsghdr *nlh) 1392 { 1393 return (struct nlattr *) 1394 (((uint8_t *)nlh) + NLMSG_ALIGN(nlh->nlmsg_len)); 1395 } 1396 1397 static void 1398 nl_attr_put(struct nlmsghdr *nlh, int type, const void *data, int alen) 1399 { 1400 struct nlattr *nla = nl_msg_tail(nlh); 1401 1402 nla->nla_type = type; 1403 nla->nla_len = NLMSG_ALIGN(sizeof(struct nlattr)) + alen; 1404 nlh->nlmsg_len += NLMSG_ALIGN(nla->nla_len); 1405 1406 if (alen) 1407 memcpy((uint8_t *)nla + sizeof(struct nlattr), data, alen); 1408 } 1409 1410 static struct nlattr * 1411 nl_attr_nest_start(struct nlmsghdr *nlh, int type) 1412 { 1413 struct nlattr *nest = (struct nlattr *)nl_msg_tail(nlh); 1414 1415 nl_attr_put(nlh, type, NULL, 0); 1416 return nest; 1417 } 1418 1419 static void 1420 nl_attr_nest_end(struct nlmsghdr *nlh, struct nlattr *nest) 1421 { 1422 nest->nla_len = (uint8_t *)nl_msg_tail(nlh) - (uint8_t *)nest; 1423 } 1424 1425 /* 1426 * Create network VLAN device with specified VLAN tag. 1427 * 1428 * @param[in] tcf 1429 * Context object initialized by mlx5_nl_vlan_vmwa_init(). 1430 * @param[in] ifindex 1431 * Base network interface index. 1432 * @param[in] tag 1433 * VLAN tag for VLAN network device to create. 1434 */ 1435 uint32_t 1436 mlx5_nl_vlan_vmwa_create(struct mlx5_nl_vlan_vmwa_context *vmwa, 1437 uint32_t ifindex, uint16_t tag) 1438 { 1439 struct nlmsghdr *nlh; 1440 struct ifinfomsg *ifm; 1441 char name[sizeof(MLX5_VMWA_VLAN_DEVICE_PFX) + 32]; 1442 1443 __rte_cache_aligned 1444 uint8_t buf[NLMSG_ALIGN(sizeof(struct nlmsghdr)) + 1445 NLMSG_ALIGN(sizeof(struct ifinfomsg)) + 1446 NLMSG_ALIGN(sizeof(struct nlattr)) * 8 + 1447 NLMSG_ALIGN(sizeof(uint32_t)) + 1448 NLMSG_ALIGN(sizeof(name)) + 1449 NLMSG_ALIGN(sizeof("vlan")) + 1450 NLMSG_ALIGN(sizeof(uint32_t)) + 1451 NLMSG_ALIGN(sizeof(uint16_t)) + 16]; 1452 struct nlattr *na_info; 1453 struct nlattr *na_vlan; 1454 uint32_t sn = MLX5_NL_SN_GENERATE; 1455 int ret; 1456 1457 memset(buf, 0, sizeof(buf)); 1458 nlh = (struct nlmsghdr *)buf; 1459 nlh->nlmsg_len = sizeof(struct nlmsghdr); 1460 nlh->nlmsg_type = RTM_NEWLINK; 1461 nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | 1462 NLM_F_EXCL | NLM_F_ACK; 1463 ifm = (struct ifinfomsg *)nl_msg_tail(nlh); 1464 nlh->nlmsg_len += sizeof(struct ifinfomsg); 1465 ifm->ifi_family = AF_UNSPEC; 1466 ifm->ifi_type = 0; 1467 ifm->ifi_index = 0; 1468 ifm->ifi_flags = IFF_UP; 1469 ifm->ifi_change = 0xffffffff; 1470 nl_attr_put(nlh, IFLA_LINK, &ifindex, sizeof(ifindex)); 1471 ret = snprintf(name, sizeof(name), "%s.%u.%u", 1472 MLX5_VMWA_VLAN_DEVICE_PFX, ifindex, tag); 1473 nl_attr_put(nlh, IFLA_IFNAME, name, ret + 1); 1474 na_info = nl_attr_nest_start(nlh, IFLA_LINKINFO); 1475 nl_attr_put(nlh, IFLA_INFO_KIND, "vlan", sizeof("vlan")); 1476 na_vlan = nl_attr_nest_start(nlh, IFLA_INFO_DATA); 1477 nl_attr_put(nlh, IFLA_VLAN_ID, &tag, sizeof(tag)); 1478 nl_attr_nest_end(nlh, na_vlan); 1479 nl_attr_nest_end(nlh, na_info); 1480 MLX5_ASSERT(sizeof(buf) >= nlh->nlmsg_len); 1481 ret = mlx5_nl_send(vmwa->nl_socket, nlh, sn); 1482 if (ret >= 0) 1483 ret = mlx5_nl_recv(vmwa->nl_socket, sn, NULL, NULL); 1484 if (ret < 0) { 1485 DRV_LOG(WARNING, "netlink: VLAN %s create failure (%d)", name, 1486 ret); 1487 } 1488 /* Try to get ifindex of created or pre-existing device. */ 1489 ret = if_nametoindex(name); 1490 if (!ret) { 1491 DRV_LOG(WARNING, "VLAN %s failed to get index (%d)", name, 1492 errno); 1493 return 0; 1494 } 1495 return ret; 1496 } 1497 1498 /** 1499 * Parse Netlink message to retrieve the general family ID. 1500 * 1501 * @param nh 1502 * Pointer to Netlink Message Header. 1503 * @param arg 1504 * PMD data register with this callback. 1505 * 1506 * @return 1507 * 0 on success, a negative errno value otherwise and rte_errno is set. 1508 */ 1509 static int 1510 mlx5_nl_family_id_cb(struct nlmsghdr *nh, void *arg) 1511 { 1512 1513 struct nlattr *tail = RTE_PTR_ADD(nh, nh->nlmsg_len); 1514 struct nlattr *nla = RTE_PTR_ADD(nh, NLMSG_ALIGN(sizeof(*nh)) + 1515 NLMSG_ALIGN(sizeof(struct genlmsghdr))); 1516 1517 for (; nla->nla_len && nla < tail; 1518 nla = RTE_PTR_ADD(nla, NLMSG_ALIGN(nla->nla_len))) { 1519 if (nla->nla_type == CTRL_ATTR_FAMILY_ID) { 1520 *(uint16_t *)arg = *(uint16_t *)(nla + 1); 1521 return 0; 1522 } 1523 } 1524 return -EINVAL; 1525 } 1526 1527 #define MLX5_NL_MAX_ATTR_SIZE 100 1528 /** 1529 * Get generic netlink family ID. 1530 * 1531 * @param[in] nlsk_fd 1532 * Netlink socket file descriptor. 1533 * @param[in] name 1534 * The family name. 1535 * 1536 * @return 1537 * ID >= 0 on success and @p enable is updated, a negative errno value 1538 * otherwise and rte_errno is set. 1539 */ 1540 static int 1541 mlx5_nl_generic_family_id_get(int nlsk_fd, const char *name) 1542 { 1543 struct nlmsghdr *nlh; 1544 struct genlmsghdr *genl; 1545 uint32_t sn = MLX5_NL_SN_GENERATE; 1546 int name_size = strlen(name) + 1; 1547 int ret; 1548 uint16_t id = -1; 1549 uint8_t buf[NLMSG_ALIGN(sizeof(struct nlmsghdr)) + 1550 NLMSG_ALIGN(sizeof(struct genlmsghdr)) + 1551 NLMSG_ALIGN(sizeof(struct nlattr)) + 1552 NLMSG_ALIGN(MLX5_NL_MAX_ATTR_SIZE)]; 1553 1554 memset(buf, 0, sizeof(buf)); 1555 nlh = (struct nlmsghdr *)buf; 1556 nlh->nlmsg_len = sizeof(struct nlmsghdr); 1557 nlh->nlmsg_type = GENL_ID_CTRL; 1558 nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK; 1559 genl = (struct genlmsghdr *)nl_msg_tail(nlh); 1560 nlh->nlmsg_len += sizeof(struct genlmsghdr); 1561 genl->cmd = CTRL_CMD_GETFAMILY; 1562 genl->version = 1; 1563 nl_attr_put(nlh, CTRL_ATTR_FAMILY_NAME, name, name_size); 1564 ret = mlx5_nl_send(nlsk_fd, nlh, sn); 1565 if (ret >= 0) 1566 ret = mlx5_nl_recv(nlsk_fd, sn, mlx5_nl_family_id_cb, &id); 1567 if (ret < 0) { 1568 DRV_LOG(DEBUG, "Failed to get Netlink %s family ID: %d.", name, 1569 ret); 1570 return ret; 1571 } 1572 DRV_LOG(DEBUG, "Netlink \"%s\" family ID is %u.", name, id); 1573 return (int)id; 1574 } 1575 1576 /** 1577 * Get Devlink family ID. 1578 * 1579 * @param[in] nlsk_fd 1580 * Netlink socket file descriptor. 1581 * 1582 * @return 1583 * ID >= 0 on success and @p enable is updated, a negative errno value 1584 * otherwise and rte_errno is set. 1585 */ 1586 1587 int 1588 mlx5_nl_devlink_family_id_get(int nlsk_fd) 1589 { 1590 return mlx5_nl_generic_family_id_get(nlsk_fd, DEVLINK_GENL_NAME); 1591 } 1592 1593 /** 1594 * Parse Netlink message to retrieve the ROCE enable status. 1595 * 1596 * @param nh 1597 * Pointer to Netlink Message Header. 1598 * @param arg 1599 * PMD data register with this callback. 1600 * 1601 * @return 1602 * 0 on success, a negative errno value otherwise and rte_errno is set. 1603 */ 1604 static int 1605 mlx5_nl_roce_cb(struct nlmsghdr *nh, void *arg) 1606 { 1607 1608 int ret = -EINVAL; 1609 int *enable = arg; 1610 struct nlattr *tail = RTE_PTR_ADD(nh, nh->nlmsg_len); 1611 struct nlattr *nla = RTE_PTR_ADD(nh, NLMSG_ALIGN(sizeof(*nh)) + 1612 NLMSG_ALIGN(sizeof(struct genlmsghdr))); 1613 1614 while (nla->nla_len && nla < tail) { 1615 switch (nla->nla_type) { 1616 /* Expected nested attributes case. */ 1617 case DEVLINK_ATTR_PARAM: 1618 case DEVLINK_ATTR_PARAM_VALUES_LIST: 1619 case DEVLINK_ATTR_PARAM_VALUE: 1620 ret = 0; 1621 nla += 1; 1622 break; 1623 case DEVLINK_ATTR_PARAM_VALUE_DATA: 1624 *enable = 1; 1625 return 0; 1626 default: 1627 nla = RTE_PTR_ADD(nla, NLMSG_ALIGN(nla->nla_len)); 1628 } 1629 } 1630 *enable = 0; 1631 return ret; 1632 } 1633 1634 /** 1635 * Get ROCE enable status through Netlink. 1636 * 1637 * @param[in] nlsk_fd 1638 * Netlink socket file descriptor. 1639 * @param[in] family_id 1640 * the Devlink family ID. 1641 * @param pci_addr 1642 * The device PCI address. 1643 * @param[out] enable 1644 * Where to store the enable status. 1645 * 1646 * @return 1647 * 0 on success and @p enable is updated, a negative errno value otherwise 1648 * and rte_errno is set. 1649 */ 1650 int 1651 mlx5_nl_enable_roce_get(int nlsk_fd, int family_id, const char *pci_addr, 1652 int *enable) 1653 { 1654 struct nlmsghdr *nlh; 1655 struct genlmsghdr *genl; 1656 uint32_t sn = MLX5_NL_SN_GENERATE; 1657 int ret; 1658 int cur_en = 0; 1659 uint8_t buf[NLMSG_ALIGN(sizeof(struct nlmsghdr)) + 1660 NLMSG_ALIGN(sizeof(struct genlmsghdr)) + 1661 NLMSG_ALIGN(sizeof(struct nlattr)) * 4 + 1662 NLMSG_ALIGN(MLX5_NL_MAX_ATTR_SIZE) * 4]; 1663 1664 memset(buf, 0, sizeof(buf)); 1665 nlh = (struct nlmsghdr *)buf; 1666 nlh->nlmsg_len = sizeof(struct nlmsghdr); 1667 nlh->nlmsg_type = family_id; 1668 nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK; 1669 genl = (struct genlmsghdr *)nl_msg_tail(nlh); 1670 nlh->nlmsg_len += sizeof(struct genlmsghdr); 1671 genl->cmd = DEVLINK_CMD_PARAM_GET; 1672 genl->version = DEVLINK_GENL_VERSION; 1673 nl_attr_put(nlh, DEVLINK_ATTR_BUS_NAME, "pci", 4); 1674 nl_attr_put(nlh, DEVLINK_ATTR_DEV_NAME, pci_addr, strlen(pci_addr) + 1); 1675 nl_attr_put(nlh, DEVLINK_ATTR_PARAM_NAME, "enable_roce", 12); 1676 ret = mlx5_nl_send(nlsk_fd, nlh, sn); 1677 if (ret >= 0) 1678 ret = mlx5_nl_recv(nlsk_fd, sn, mlx5_nl_roce_cb, &cur_en); 1679 if (ret < 0) { 1680 DRV_LOG(DEBUG, "Failed to get ROCE enable on device %s: %d.", 1681 pci_addr, ret); 1682 return ret; 1683 } 1684 *enable = cur_en; 1685 DRV_LOG(DEBUG, "ROCE is %sabled for device \"%s\".", 1686 cur_en ? "en" : "dis", pci_addr); 1687 return ret; 1688 } 1689 1690 /** 1691 * Reload mlx5 device kernel driver through Netlink. 1692 * 1693 * @param[in] nlsk_fd 1694 * Netlink socket file descriptor. 1695 * @param[in] family_id 1696 * the Devlink family ID. 1697 * @param pci_addr 1698 * The device PCI address. 1699 * @param[out] enable 1700 * The enable status to set. 1701 * 1702 * @return 1703 * 0 on success, a negative errno value otherwise and rte_errno is set. 1704 */ 1705 int 1706 mlx5_nl_driver_reload(int nlsk_fd, int family_id, const char *pci_addr) 1707 { 1708 struct nlmsghdr *nlh; 1709 struct genlmsghdr *genl; 1710 uint32_t sn = MLX5_NL_SN_GENERATE; 1711 int ret; 1712 uint8_t buf[NLMSG_ALIGN(sizeof(struct nlmsghdr)) + 1713 NLMSG_ALIGN(sizeof(struct genlmsghdr)) + 1714 NLMSG_ALIGN(sizeof(struct nlattr)) * 2 + 1715 NLMSG_ALIGN(MLX5_NL_MAX_ATTR_SIZE) * 2]; 1716 1717 memset(buf, 0, sizeof(buf)); 1718 nlh = (struct nlmsghdr *)buf; 1719 nlh->nlmsg_len = sizeof(struct nlmsghdr); 1720 nlh->nlmsg_type = family_id; 1721 nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK; 1722 genl = (struct genlmsghdr *)nl_msg_tail(nlh); 1723 nlh->nlmsg_len += sizeof(struct genlmsghdr); 1724 genl->cmd = DEVLINK_CMD_RELOAD; 1725 genl->version = DEVLINK_GENL_VERSION; 1726 nl_attr_put(nlh, DEVLINK_ATTR_BUS_NAME, "pci", 4); 1727 nl_attr_put(nlh, DEVLINK_ATTR_DEV_NAME, pci_addr, strlen(pci_addr) + 1); 1728 ret = mlx5_nl_send(nlsk_fd, nlh, sn); 1729 if (ret >= 0) 1730 ret = mlx5_nl_recv(nlsk_fd, sn, NULL, NULL); 1731 if (ret < 0) { 1732 DRV_LOG(DEBUG, "Failed to reload %s device by Netlink - %d", 1733 pci_addr, ret); 1734 return ret; 1735 } 1736 DRV_LOG(DEBUG, "Device \"%s\" was reloaded by Netlink successfully.", 1737 pci_addr); 1738 return 0; 1739 } 1740 1741 /** 1742 * Set ROCE enable status through Netlink. 1743 * 1744 * @param[in] nlsk_fd 1745 * Netlink socket file descriptor. 1746 * @param[in] family_id 1747 * the Devlink family ID. 1748 * @param pci_addr 1749 * The device PCI address. 1750 * @param[out] enable 1751 * The enable status to set. 1752 * 1753 * @return 1754 * 0 on success, a negative errno value otherwise and rte_errno is set. 1755 */ 1756 int 1757 mlx5_nl_enable_roce_set(int nlsk_fd, int family_id, const char *pci_addr, 1758 int enable) 1759 { 1760 struct nlmsghdr *nlh; 1761 struct genlmsghdr *genl; 1762 uint32_t sn = MLX5_NL_SN_GENERATE; 1763 int ret; 1764 uint8_t buf[NLMSG_ALIGN(sizeof(struct nlmsghdr)) + 1765 NLMSG_ALIGN(sizeof(struct genlmsghdr)) + 1766 NLMSG_ALIGN(sizeof(struct nlattr)) * 6 + 1767 NLMSG_ALIGN(MLX5_NL_MAX_ATTR_SIZE) * 6]; 1768 uint8_t cmode = DEVLINK_PARAM_CMODE_DRIVERINIT; 1769 uint8_t ptype = NLA_FLAG; 1770 ; 1771 1772 memset(buf, 0, sizeof(buf)); 1773 nlh = (struct nlmsghdr *)buf; 1774 nlh->nlmsg_len = sizeof(struct nlmsghdr); 1775 nlh->nlmsg_type = family_id; 1776 nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK; 1777 genl = (struct genlmsghdr *)nl_msg_tail(nlh); 1778 nlh->nlmsg_len += sizeof(struct genlmsghdr); 1779 genl->cmd = DEVLINK_CMD_PARAM_SET; 1780 genl->version = DEVLINK_GENL_VERSION; 1781 nl_attr_put(nlh, DEVLINK_ATTR_BUS_NAME, "pci", 4); 1782 nl_attr_put(nlh, DEVLINK_ATTR_DEV_NAME, pci_addr, strlen(pci_addr) + 1); 1783 nl_attr_put(nlh, DEVLINK_ATTR_PARAM_NAME, "enable_roce", 12); 1784 nl_attr_put(nlh, DEVLINK_ATTR_PARAM_VALUE_CMODE, &cmode, sizeof(cmode)); 1785 nl_attr_put(nlh, DEVLINK_ATTR_PARAM_TYPE, &ptype, sizeof(ptype)); 1786 if (enable) 1787 nl_attr_put(nlh, DEVLINK_ATTR_PARAM_VALUE_DATA, NULL, 0); 1788 ret = mlx5_nl_send(nlsk_fd, nlh, sn); 1789 if (ret >= 0) 1790 ret = mlx5_nl_recv(nlsk_fd, sn, NULL, NULL); 1791 if (ret < 0) { 1792 DRV_LOG(DEBUG, "Failed to %sable ROCE for device %s by Netlink:" 1793 " %d.", enable ? "en" : "dis", pci_addr, ret); 1794 return ret; 1795 } 1796 DRV_LOG(DEBUG, "Device %s ROCE was %sabled by Netlink successfully.", 1797 pci_addr, enable ? "en" : "dis"); 1798 /* Now, need to reload the driver. */ 1799 return mlx5_nl_driver_reload(nlsk_fd, family_id, pci_addr); 1800 } 1801