1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright 2015 6WIND S.A. 3 * Copyright 2015 Mellanox Technologies, Ltd 4 */ 5 6 #include <stddef.h> 7 #include <assert.h> 8 #include <inttypes.h> 9 #include <unistd.h> 10 #include <stdint.h> 11 #include <stdio.h> 12 #include <string.h> 13 #include <stdlib.h> 14 #include <errno.h> 15 #include <dirent.h> 16 #include <net/if.h> 17 #include <sys/ioctl.h> 18 #include <sys/socket.h> 19 #include <netinet/in.h> 20 #include <linux/ethtool.h> 21 #include <linux/sockios.h> 22 #include <fcntl.h> 23 #include <stdalign.h> 24 #include <sys/un.h> 25 #include <time.h> 26 27 #include <rte_atomic.h> 28 #include <rte_ethdev_driver.h> 29 #include <rte_bus_pci.h> 30 #include <rte_mbuf.h> 31 #include <rte_common.h> 32 #include <rte_interrupts.h> 33 #include <rte_malloc.h> 34 #include <rte_string_fns.h> 35 #include <rte_rwlock.h> 36 37 #include "mlx5.h" 38 #include "mlx5_glue.h" 39 #include "mlx5_rxtx.h" 40 #include "mlx5_utils.h" 41 42 /* Supported speed values found in /usr/include/linux/ethtool.h */ 43 #ifndef HAVE_SUPPORTED_40000baseKR4_Full 44 #define SUPPORTED_40000baseKR4_Full (1 << 23) 45 #endif 46 #ifndef HAVE_SUPPORTED_40000baseCR4_Full 47 #define SUPPORTED_40000baseCR4_Full (1 << 24) 48 #endif 49 #ifndef HAVE_SUPPORTED_40000baseSR4_Full 50 #define SUPPORTED_40000baseSR4_Full (1 << 25) 51 #endif 52 #ifndef HAVE_SUPPORTED_40000baseLR4_Full 53 #define SUPPORTED_40000baseLR4_Full (1 << 26) 54 #endif 55 #ifndef HAVE_SUPPORTED_56000baseKR4_Full 56 #define SUPPORTED_56000baseKR4_Full (1 << 27) 57 #endif 58 #ifndef HAVE_SUPPORTED_56000baseCR4_Full 59 #define SUPPORTED_56000baseCR4_Full (1 << 28) 60 #endif 61 #ifndef HAVE_SUPPORTED_56000baseSR4_Full 62 #define SUPPORTED_56000baseSR4_Full (1 << 29) 63 #endif 64 #ifndef HAVE_SUPPORTED_56000baseLR4_Full 65 #define SUPPORTED_56000baseLR4_Full (1 << 30) 66 #endif 67 68 /* Add defines in case the running kernel is not the same as user headers. */ 69 #ifndef ETHTOOL_GLINKSETTINGS 70 struct ethtool_link_settings { 71 uint32_t cmd; 72 uint32_t speed; 73 uint8_t duplex; 74 uint8_t port; 75 uint8_t phy_address; 76 uint8_t autoneg; 77 uint8_t mdio_support; 78 uint8_t eth_to_mdix; 79 uint8_t eth_tp_mdix_ctrl; 80 int8_t link_mode_masks_nwords; 81 uint32_t reserved[8]; 82 uint32_t link_mode_masks[]; 83 }; 84 85 #define ETHTOOL_GLINKSETTINGS 0x0000004c 86 #define ETHTOOL_LINK_MODE_1000baseT_Full_BIT 5 87 #define ETHTOOL_LINK_MODE_Autoneg_BIT 6 88 #define ETHTOOL_LINK_MODE_1000baseKX_Full_BIT 17 89 #define ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT 18 90 #define ETHTOOL_LINK_MODE_10000baseKR_Full_BIT 19 91 #define ETHTOOL_LINK_MODE_10000baseR_FEC_BIT 20 92 #define ETHTOOL_LINK_MODE_20000baseMLD2_Full_BIT 21 93 #define ETHTOOL_LINK_MODE_20000baseKR2_Full_BIT 22 94 #define ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT 23 95 #define ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT 24 96 #define ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT 25 97 #define ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT 26 98 #define ETHTOOL_LINK_MODE_56000baseKR4_Full_BIT 27 99 #define ETHTOOL_LINK_MODE_56000baseCR4_Full_BIT 28 100 #define ETHTOOL_LINK_MODE_56000baseSR4_Full_BIT 29 101 #define ETHTOOL_LINK_MODE_56000baseLR4_Full_BIT 30 102 #endif 103 #ifndef HAVE_ETHTOOL_LINK_MODE_25G 104 #define ETHTOOL_LINK_MODE_25000baseCR_Full_BIT 31 105 #define ETHTOOL_LINK_MODE_25000baseKR_Full_BIT 32 106 #define ETHTOOL_LINK_MODE_25000baseSR_Full_BIT 33 107 #endif 108 #ifndef HAVE_ETHTOOL_LINK_MODE_50G 109 #define ETHTOOL_LINK_MODE_50000baseCR2_Full_BIT 34 110 #define ETHTOOL_LINK_MODE_50000baseKR2_Full_BIT 35 111 #endif 112 #ifndef HAVE_ETHTOOL_LINK_MODE_100G 113 #define ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT 36 114 #define ETHTOOL_LINK_MODE_100000baseSR4_Full_BIT 37 115 #define ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT 38 116 #define ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT 39 117 #endif 118 119 /** 120 * Get master interface name from private structure. 121 * 122 * @param[in] dev 123 * Pointer to Ethernet device. 124 * @param[out] ifname 125 * Interface name output buffer. 126 * 127 * @return 128 * 0 on success, a negative errno value otherwise and rte_errno is set. 129 */ 130 int 131 mlx5_get_master_ifname(const char *ibdev_path, char (*ifname)[IF_NAMESIZE]) 132 { 133 DIR *dir; 134 struct dirent *dent; 135 unsigned int dev_type = 0; 136 unsigned int dev_port_prev = ~0u; 137 char match[IF_NAMESIZE] = ""; 138 139 assert(ibdev_path); 140 { 141 MKSTR(path, "%s/device/net", ibdev_path); 142 143 dir = opendir(path); 144 if (dir == NULL) { 145 rte_errno = errno; 146 return -rte_errno; 147 } 148 } 149 while ((dent = readdir(dir)) != NULL) { 150 char *name = dent->d_name; 151 FILE *file; 152 unsigned int dev_port; 153 int r; 154 155 if ((name[0] == '.') && 156 ((name[1] == '\0') || 157 ((name[1] == '.') && (name[2] == '\0')))) 158 continue; 159 160 MKSTR(path, "%s/device/net/%s/%s", 161 ibdev_path, name, 162 (dev_type ? "dev_id" : "dev_port")); 163 164 file = fopen(path, "rb"); 165 if (file == NULL) { 166 if (errno != ENOENT) 167 continue; 168 /* 169 * Switch to dev_id when dev_port does not exist as 170 * is the case with Linux kernel versions < 3.15. 171 */ 172 try_dev_id: 173 match[0] = '\0'; 174 if (dev_type) 175 break; 176 dev_type = 1; 177 dev_port_prev = ~0u; 178 rewinddir(dir); 179 continue; 180 } 181 r = fscanf(file, (dev_type ? "%x" : "%u"), &dev_port); 182 fclose(file); 183 if (r != 1) 184 continue; 185 /* 186 * Switch to dev_id when dev_port returns the same value for 187 * all ports. May happen when using a MOFED release older than 188 * 3.0 with a Linux kernel >= 3.15. 189 */ 190 if (dev_port == dev_port_prev) 191 goto try_dev_id; 192 dev_port_prev = dev_port; 193 if (dev_port == 0) 194 strlcpy(match, name, sizeof(match)); 195 } 196 closedir(dir); 197 if (match[0] == '\0') { 198 rte_errno = ENOENT; 199 return -rte_errno; 200 } 201 strncpy(*ifname, match, sizeof(*ifname)); 202 return 0; 203 } 204 205 /** 206 * Get interface name from private structure. 207 * 208 * This is a port representor-aware version of mlx5_get_master_ifname(). 209 * 210 * @param[in] dev 211 * Pointer to Ethernet device. 212 * @param[out] ifname 213 * Interface name output buffer. 214 * 215 * @return 216 * 0 on success, a negative errno value otherwise and rte_errno is set. 217 */ 218 int 219 mlx5_get_ifname(const struct rte_eth_dev *dev, char (*ifname)[IF_NAMESIZE]) 220 { 221 struct mlx5_priv *priv = dev->data->dev_private; 222 unsigned int ifindex; 223 224 assert(priv); 225 assert(priv->sh); 226 ifindex = priv->nl_socket_rdma >= 0 ? 227 mlx5_nl_ifindex(priv->nl_socket_rdma, 228 priv->sh->ibdev_name, 229 priv->ibv_port) : 0; 230 if (!ifindex) { 231 if (!priv->representor) 232 return mlx5_get_master_ifname(priv->sh->ibdev_path, 233 ifname); 234 rte_errno = ENXIO; 235 return -rte_errno; 236 } 237 if (if_indextoname(ifindex, &(*ifname)[0])) 238 return 0; 239 rte_errno = errno; 240 return -rte_errno; 241 } 242 243 /** 244 * Get the interface index from device name. 245 * 246 * @param[in] dev 247 * Pointer to Ethernet device. 248 * 249 * @return 250 * Nonzero interface index on success, zero otherwise and rte_errno is set. 251 */ 252 unsigned int 253 mlx5_ifindex(const struct rte_eth_dev *dev) 254 { 255 char ifname[IF_NAMESIZE]; 256 unsigned int ifindex; 257 258 if (mlx5_get_ifname(dev, &ifname)) 259 return 0; 260 ifindex = if_nametoindex(ifname); 261 if (!ifindex) 262 rte_errno = errno; 263 return ifindex; 264 } 265 266 /** 267 * Perform ifreq ioctl() on associated Ethernet device. 268 * 269 * @param[in] dev 270 * Pointer to Ethernet device. 271 * @param req 272 * Request number to pass to ioctl(). 273 * @param[out] ifr 274 * Interface request structure output buffer. 275 * 276 * @return 277 * 0 on success, a negative errno value otherwise and rte_errno is set. 278 */ 279 int 280 mlx5_ifreq(const struct rte_eth_dev *dev, int req, struct ifreq *ifr) 281 { 282 int sock = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP); 283 int ret = 0; 284 285 if (sock == -1) { 286 rte_errno = errno; 287 return -rte_errno; 288 } 289 ret = mlx5_get_ifname(dev, &ifr->ifr_name); 290 if (ret) 291 goto error; 292 ret = ioctl(sock, req, ifr); 293 if (ret == -1) { 294 rte_errno = errno; 295 goto error; 296 } 297 close(sock); 298 return 0; 299 error: 300 close(sock); 301 return -rte_errno; 302 } 303 304 /** 305 * Get device MTU. 306 * 307 * @param dev 308 * Pointer to Ethernet device. 309 * @param[out] mtu 310 * MTU value output buffer. 311 * 312 * @return 313 * 0 on success, a negative errno value otherwise and rte_errno is set. 314 */ 315 int 316 mlx5_get_mtu(struct rte_eth_dev *dev, uint16_t *mtu) 317 { 318 struct ifreq request; 319 int ret = mlx5_ifreq(dev, SIOCGIFMTU, &request); 320 321 if (ret) 322 return ret; 323 *mtu = request.ifr_mtu; 324 return 0; 325 } 326 327 /** 328 * Set device MTU. 329 * 330 * @param dev 331 * Pointer to Ethernet device. 332 * @param mtu 333 * MTU value to set. 334 * 335 * @return 336 * 0 on success, a negative errno value otherwise and rte_errno is set. 337 */ 338 static int 339 mlx5_set_mtu(struct rte_eth_dev *dev, uint16_t mtu) 340 { 341 struct ifreq request = { .ifr_mtu = mtu, }; 342 343 return mlx5_ifreq(dev, SIOCSIFMTU, &request); 344 } 345 346 /** 347 * Set device flags. 348 * 349 * @param dev 350 * Pointer to Ethernet device. 351 * @param keep 352 * Bitmask for flags that must remain untouched. 353 * @param flags 354 * Bitmask for flags to modify. 355 * 356 * @return 357 * 0 on success, a negative errno value otherwise and rte_errno is set. 358 */ 359 int 360 mlx5_set_flags(struct rte_eth_dev *dev, unsigned int keep, unsigned int flags) 361 { 362 struct ifreq request; 363 int ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &request); 364 365 if (ret) 366 return ret; 367 request.ifr_flags &= keep; 368 request.ifr_flags |= flags & ~keep; 369 return mlx5_ifreq(dev, SIOCSIFFLAGS, &request); 370 } 371 372 /** 373 * DPDK callback for Ethernet device configuration. 374 * 375 * @param dev 376 * Pointer to Ethernet device structure. 377 * 378 * @return 379 * 0 on success, a negative errno value otherwise and rte_errno is set. 380 */ 381 int 382 mlx5_dev_configure(struct rte_eth_dev *dev) 383 { 384 struct mlx5_priv *priv = dev->data->dev_private; 385 unsigned int rxqs_n = dev->data->nb_rx_queues; 386 unsigned int txqs_n = dev->data->nb_tx_queues; 387 unsigned int i; 388 unsigned int j; 389 unsigned int reta_idx_n; 390 const uint8_t use_app_rss_key = 391 !!dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key; 392 int ret = 0; 393 394 if (use_app_rss_key && 395 (dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key_len != 396 MLX5_RSS_HASH_KEY_LEN)) { 397 DRV_LOG(ERR, "port %u RSS key len must be %s Bytes long", 398 dev->data->port_id, RTE_STR(MLX5_RSS_HASH_KEY_LEN)); 399 rte_errno = EINVAL; 400 return -rte_errno; 401 } 402 priv->rss_conf.rss_key = 403 rte_realloc(priv->rss_conf.rss_key, 404 MLX5_RSS_HASH_KEY_LEN, 0); 405 if (!priv->rss_conf.rss_key) { 406 DRV_LOG(ERR, "port %u cannot allocate RSS hash key memory (%u)", 407 dev->data->port_id, rxqs_n); 408 rte_errno = ENOMEM; 409 return -rte_errno; 410 } 411 memcpy(priv->rss_conf.rss_key, 412 use_app_rss_key ? 413 dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key : 414 rss_hash_default_key, 415 MLX5_RSS_HASH_KEY_LEN); 416 priv->rss_conf.rss_key_len = MLX5_RSS_HASH_KEY_LEN; 417 priv->rss_conf.rss_hf = dev->data->dev_conf.rx_adv_conf.rss_conf.rss_hf; 418 priv->rxqs = (void *)dev->data->rx_queues; 419 priv->txqs = (void *)dev->data->tx_queues; 420 if (txqs_n != priv->txqs_n) { 421 DRV_LOG(INFO, "port %u Tx queues number update: %u -> %u", 422 dev->data->port_id, priv->txqs_n, txqs_n); 423 priv->txqs_n = txqs_n; 424 } 425 if (rxqs_n > priv->config.ind_table_max_size) { 426 DRV_LOG(ERR, "port %u cannot handle this many Rx queues (%u)", 427 dev->data->port_id, rxqs_n); 428 rte_errno = EINVAL; 429 return -rte_errno; 430 } 431 if (rxqs_n != priv->rxqs_n) { 432 DRV_LOG(INFO, "port %u Rx queues number update: %u -> %u", 433 dev->data->port_id, priv->rxqs_n, rxqs_n); 434 priv->rxqs_n = rxqs_n; 435 /* 436 * If the requested number of RX queues is not a power of two, 437 * use the maximum indirection table size for better balancing. 438 * The result is always rounded to the next power of two. 439 */ 440 reta_idx_n = (1 << log2above((rxqs_n & (rxqs_n - 1)) ? 441 priv->config.ind_table_max_size : 442 rxqs_n)); 443 ret = mlx5_rss_reta_index_resize(dev, reta_idx_n); 444 if (ret) 445 return ret; 446 /* 447 * When the number of RX queues is not a power of two, 448 * the remaining table entries are padded with reused WQs 449 * and hashes are not spread uniformly. 450 */ 451 for (i = 0, j = 0; (i != reta_idx_n); ++i) { 452 (*priv->reta_idx)[i] = j; 453 if (++j == rxqs_n) 454 j = 0; 455 } 456 } 457 ret = mlx5_proc_priv_init(dev); 458 if (ret) 459 return ret; 460 return 0; 461 } 462 463 /** 464 * Sets default tuning parameters. 465 * 466 * @param dev 467 * Pointer to Ethernet device. 468 * @param[out] info 469 * Info structure output buffer. 470 */ 471 static void 472 mlx5_set_default_params(struct rte_eth_dev *dev, struct rte_eth_dev_info *info) 473 { 474 struct mlx5_priv *priv = dev->data->dev_private; 475 476 /* Minimum CPU utilization. */ 477 info->default_rxportconf.ring_size = 256; 478 info->default_txportconf.ring_size = 256; 479 info->default_rxportconf.burst_size = 64; 480 info->default_txportconf.burst_size = 64; 481 if (priv->link_speed_capa & ETH_LINK_SPEED_100G) { 482 info->default_rxportconf.nb_queues = 16; 483 info->default_txportconf.nb_queues = 16; 484 if (dev->data->nb_rx_queues > 2 || 485 dev->data->nb_tx_queues > 2) { 486 /* Max Throughput. */ 487 info->default_rxportconf.ring_size = 2048; 488 info->default_txportconf.ring_size = 2048; 489 } 490 } else { 491 info->default_rxportconf.nb_queues = 8; 492 info->default_txportconf.nb_queues = 8; 493 if (dev->data->nb_rx_queues > 2 || 494 dev->data->nb_tx_queues > 2) { 495 /* Max Throughput. */ 496 info->default_rxportconf.ring_size = 4096; 497 info->default_txportconf.ring_size = 4096; 498 } 499 } 500 } 501 502 /** 503 * DPDK callback to get information about the device. 504 * 505 * @param dev 506 * Pointer to Ethernet device structure. 507 * @param[out] info 508 * Info structure output buffer. 509 */ 510 void 511 mlx5_dev_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *info) 512 { 513 struct mlx5_priv *priv = dev->data->dev_private; 514 struct mlx5_dev_config *config = &priv->config; 515 unsigned int max; 516 char ifname[IF_NAMESIZE]; 517 518 /* FIXME: we should ask the device for these values. */ 519 info->min_rx_bufsize = 32; 520 info->max_rx_pktlen = 65536; 521 /* 522 * Since we need one CQ per QP, the limit is the minimum number 523 * between the two values. 524 */ 525 max = RTE_MIN(priv->sh->device_attr.orig_attr.max_cq, 526 priv->sh->device_attr.orig_attr.max_qp); 527 /* If max >= 65535 then max = 0, max_rx_queues is uint16_t. */ 528 if (max >= 65535) 529 max = 65535; 530 info->max_rx_queues = max; 531 info->max_tx_queues = max; 532 info->max_mac_addrs = MLX5_MAX_UC_MAC_ADDRESSES; 533 info->rx_queue_offload_capa = mlx5_get_rx_queue_offloads(dev); 534 info->rx_offload_capa = (mlx5_get_rx_port_offloads() | 535 info->rx_queue_offload_capa); 536 info->tx_offload_capa = mlx5_get_tx_port_offloads(dev); 537 if (mlx5_get_ifname(dev, &ifname) == 0) 538 info->if_index = if_nametoindex(ifname); 539 info->reta_size = priv->reta_idx_n ? 540 priv->reta_idx_n : config->ind_table_max_size; 541 info->hash_key_size = MLX5_RSS_HASH_KEY_LEN; 542 info->speed_capa = priv->link_speed_capa; 543 info->flow_type_rss_offloads = ~MLX5_RSS_HF_MASK; 544 mlx5_set_default_params(dev, info); 545 info->switch_info.name = dev->data->name; 546 info->switch_info.domain_id = priv->domain_id; 547 info->switch_info.port_id = priv->representor_id; 548 if (priv->representor) { 549 unsigned int i = mlx5_dev_to_port_id(dev->device, NULL, 0); 550 uint16_t port_id[i]; 551 552 i = RTE_MIN(mlx5_dev_to_port_id(dev->device, port_id, i), i); 553 while (i--) { 554 struct mlx5_priv *opriv = 555 rte_eth_devices[port_id[i]].data->dev_private; 556 557 if (!opriv || 558 opriv->representor || 559 opriv->domain_id != priv->domain_id) 560 continue; 561 /* 562 * Override switch name with that of the master 563 * device. 564 */ 565 info->switch_info.name = opriv->dev_data->name; 566 break; 567 } 568 } 569 } 570 571 /** 572 * Get firmware version of a device. 573 * 574 * @param dev 575 * Ethernet device port. 576 * @param fw_ver 577 * String output allocated by caller. 578 * @param fw_size 579 * Size of the output string, including terminating null byte. 580 * 581 * @return 582 * 0 on success, or the size of the non truncated string if too big. 583 */ 584 int mlx5_fw_version_get(struct rte_eth_dev *dev, char *fw_ver, size_t fw_size) 585 { 586 struct mlx5_priv *priv = dev->data->dev_private; 587 struct ibv_device_attr *attr = &priv->sh->device_attr.orig_attr; 588 size_t size = strnlen(attr->fw_ver, sizeof(attr->fw_ver)) + 1; 589 590 if (fw_size < size) 591 return size; 592 if (fw_ver != NULL) 593 strlcpy(fw_ver, attr->fw_ver, fw_size); 594 return 0; 595 } 596 597 /** 598 * Get supported packet types. 599 * 600 * @param dev 601 * Pointer to Ethernet device structure. 602 * 603 * @return 604 * A pointer to the supported Packet types array. 605 */ 606 const uint32_t * 607 mlx5_dev_supported_ptypes_get(struct rte_eth_dev *dev) 608 { 609 static const uint32_t ptypes[] = { 610 /* refers to rxq_cq_to_pkt_type() */ 611 RTE_PTYPE_L2_ETHER, 612 RTE_PTYPE_L3_IPV4_EXT_UNKNOWN, 613 RTE_PTYPE_L3_IPV6_EXT_UNKNOWN, 614 RTE_PTYPE_L4_NONFRAG, 615 RTE_PTYPE_L4_FRAG, 616 RTE_PTYPE_L4_TCP, 617 RTE_PTYPE_L4_UDP, 618 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN, 619 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN, 620 RTE_PTYPE_INNER_L4_NONFRAG, 621 RTE_PTYPE_INNER_L4_FRAG, 622 RTE_PTYPE_INNER_L4_TCP, 623 RTE_PTYPE_INNER_L4_UDP, 624 RTE_PTYPE_UNKNOWN 625 }; 626 627 if (dev->rx_pkt_burst == mlx5_rx_burst || 628 dev->rx_pkt_burst == mlx5_rx_burst_mprq || 629 dev->rx_pkt_burst == mlx5_rx_burst_vec) 630 return ptypes; 631 return NULL; 632 } 633 634 /** 635 * Retrieve the master device for representor in the same switch domain. 636 * 637 * @param dev 638 * Pointer to representor Ethernet device structure. 639 * 640 * @return 641 * Master device structure on success, NULL otherwise. 642 */ 643 644 static struct rte_eth_dev * 645 mlx5_find_master_dev(struct rte_eth_dev *dev) 646 { 647 struct mlx5_priv *priv; 648 uint16_t port_id; 649 uint16_t domain_id; 650 651 priv = dev->data->dev_private; 652 domain_id = priv->domain_id; 653 assert(priv->representor); 654 RTE_ETH_FOREACH_DEV_OF(port_id, dev->device) { 655 priv = rte_eth_devices[port_id].data->dev_private; 656 if (priv && 657 priv->master && 658 priv->domain_id == domain_id) 659 return &rte_eth_devices[port_id]; 660 } 661 return NULL; 662 } 663 664 /** 665 * DPDK callback to retrieve physical link information. 666 * 667 * @param dev 668 * Pointer to Ethernet device structure. 669 * @param[out] link 670 * Storage for current link status. 671 * 672 * @return 673 * 0 on success, a negative errno value otherwise and rte_errno is set. 674 */ 675 static int 676 mlx5_link_update_unlocked_gset(struct rte_eth_dev *dev, 677 struct rte_eth_link *link) 678 { 679 struct mlx5_priv *priv = dev->data->dev_private; 680 struct ethtool_cmd edata = { 681 .cmd = ETHTOOL_GSET /* Deprecated since Linux v4.5. */ 682 }; 683 struct ifreq ifr; 684 struct rte_eth_link dev_link; 685 int link_speed = 0; 686 int ret; 687 688 ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr); 689 if (ret) { 690 DRV_LOG(WARNING, "port %u ioctl(SIOCGIFFLAGS) failed: %s", 691 dev->data->port_id, strerror(rte_errno)); 692 return ret; 693 } 694 dev_link = (struct rte_eth_link) { 695 .link_status = ((ifr.ifr_flags & IFF_UP) && 696 (ifr.ifr_flags & IFF_RUNNING)), 697 }; 698 ifr = (struct ifreq) { 699 .ifr_data = (void *)&edata, 700 }; 701 ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr); 702 if (ret) { 703 if (ret == -ENOTSUP && priv->representor) { 704 struct rte_eth_dev *master; 705 706 /* 707 * For representors we can try to inherit link 708 * settings from the master device. Actually 709 * link settings do not make a lot of sense 710 * for representors due to missing physical 711 * link. The old kernel drivers supported 712 * emulated settings query for representors, 713 * the new ones do not, so we have to add 714 * this code for compatibility issues. 715 */ 716 master = mlx5_find_master_dev(dev); 717 if (master) { 718 ifr = (struct ifreq) { 719 .ifr_data = (void *)&edata, 720 }; 721 ret = mlx5_ifreq(master, SIOCETHTOOL, &ifr); 722 } 723 } 724 if (ret) { 725 DRV_LOG(WARNING, 726 "port %u ioctl(SIOCETHTOOL," 727 " ETHTOOL_GSET) failed: %s", 728 dev->data->port_id, strerror(rte_errno)); 729 return ret; 730 } 731 } 732 link_speed = ethtool_cmd_speed(&edata); 733 if (link_speed == -1) 734 dev_link.link_speed = ETH_SPEED_NUM_NONE; 735 else 736 dev_link.link_speed = link_speed; 737 priv->link_speed_capa = 0; 738 if (edata.supported & SUPPORTED_Autoneg) 739 priv->link_speed_capa |= ETH_LINK_SPEED_AUTONEG; 740 if (edata.supported & (SUPPORTED_1000baseT_Full | 741 SUPPORTED_1000baseKX_Full)) 742 priv->link_speed_capa |= ETH_LINK_SPEED_1G; 743 if (edata.supported & SUPPORTED_10000baseKR_Full) 744 priv->link_speed_capa |= ETH_LINK_SPEED_10G; 745 if (edata.supported & (SUPPORTED_40000baseKR4_Full | 746 SUPPORTED_40000baseCR4_Full | 747 SUPPORTED_40000baseSR4_Full | 748 SUPPORTED_40000baseLR4_Full)) 749 priv->link_speed_capa |= ETH_LINK_SPEED_40G; 750 dev_link.link_duplex = ((edata.duplex == DUPLEX_HALF) ? 751 ETH_LINK_HALF_DUPLEX : ETH_LINK_FULL_DUPLEX); 752 dev_link.link_autoneg = !(dev->data->dev_conf.link_speeds & 753 ETH_LINK_SPEED_FIXED); 754 if (((dev_link.link_speed && !dev_link.link_status) || 755 (!dev_link.link_speed && dev_link.link_status))) { 756 rte_errno = EAGAIN; 757 return -rte_errno; 758 } 759 *link = dev_link; 760 return 0; 761 } 762 763 /** 764 * Retrieve physical link information (unlocked version using new ioctl). 765 * 766 * @param dev 767 * Pointer to Ethernet device structure. 768 * @param[out] link 769 * Storage for current link status. 770 * 771 * @return 772 * 0 on success, a negative errno value otherwise and rte_errno is set. 773 */ 774 static int 775 mlx5_link_update_unlocked_gs(struct rte_eth_dev *dev, 776 struct rte_eth_link *link) 777 778 { 779 struct mlx5_priv *priv = dev->data->dev_private; 780 struct ethtool_link_settings gcmd = { .cmd = ETHTOOL_GLINKSETTINGS }; 781 struct ifreq ifr; 782 struct rte_eth_link dev_link; 783 struct rte_eth_dev *master = NULL; 784 uint64_t sc; 785 int ret; 786 787 ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr); 788 if (ret) { 789 DRV_LOG(WARNING, "port %u ioctl(SIOCGIFFLAGS) failed: %s", 790 dev->data->port_id, strerror(rte_errno)); 791 return ret; 792 } 793 dev_link = (struct rte_eth_link) { 794 .link_status = ((ifr.ifr_flags & IFF_UP) && 795 (ifr.ifr_flags & IFF_RUNNING)), 796 }; 797 ifr = (struct ifreq) { 798 .ifr_data = (void *)&gcmd, 799 }; 800 ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr); 801 if (ret) { 802 if (ret == -ENOTSUP && priv->representor) { 803 /* 804 * For representors we can try to inherit link 805 * settings from the master device. Actually 806 * link settings do not make a lot of sense 807 * for representors due to missing physical 808 * link. The old kernel drivers supported 809 * emulated settings query for representors, 810 * the new ones do not, so we have to add 811 * this code for compatibility issues. 812 */ 813 master = mlx5_find_master_dev(dev); 814 if (master) { 815 ifr = (struct ifreq) { 816 .ifr_data = (void *)&gcmd, 817 }; 818 ret = mlx5_ifreq(master, SIOCETHTOOL, &ifr); 819 } 820 } 821 if (ret) { 822 DRV_LOG(DEBUG, 823 "port %u ioctl(SIOCETHTOOL," 824 " ETHTOOL_GLINKSETTINGS) failed: %s", 825 dev->data->port_id, strerror(rte_errno)); 826 return ret; 827 } 828 829 } 830 gcmd.link_mode_masks_nwords = -gcmd.link_mode_masks_nwords; 831 832 alignas(struct ethtool_link_settings) 833 uint8_t data[offsetof(struct ethtool_link_settings, link_mode_masks) + 834 sizeof(uint32_t) * gcmd.link_mode_masks_nwords * 3]; 835 struct ethtool_link_settings *ecmd = (void *)data; 836 837 *ecmd = gcmd; 838 ifr.ifr_data = (void *)ecmd; 839 ret = mlx5_ifreq(master ? master : dev, SIOCETHTOOL, &ifr); 840 if (ret) { 841 DRV_LOG(DEBUG, 842 "port %u ioctl(SIOCETHTOOL," 843 "ETHTOOL_GLINKSETTINGS) failed: %s", 844 dev->data->port_id, strerror(rte_errno)); 845 return ret; 846 } 847 dev_link.link_speed = ecmd->speed; 848 sc = ecmd->link_mode_masks[0] | 849 ((uint64_t)ecmd->link_mode_masks[1] << 32); 850 priv->link_speed_capa = 0; 851 if (sc & MLX5_BITSHIFT(ETHTOOL_LINK_MODE_Autoneg_BIT)) 852 priv->link_speed_capa |= ETH_LINK_SPEED_AUTONEG; 853 if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_1000baseT_Full_BIT) | 854 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_1000baseKX_Full_BIT))) 855 priv->link_speed_capa |= ETH_LINK_SPEED_1G; 856 if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT) | 857 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_10000baseKR_Full_BIT) | 858 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_10000baseR_FEC_BIT))) 859 priv->link_speed_capa |= ETH_LINK_SPEED_10G; 860 if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_20000baseMLD2_Full_BIT) | 861 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_20000baseKR2_Full_BIT))) 862 priv->link_speed_capa |= ETH_LINK_SPEED_20G; 863 if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT) | 864 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT) | 865 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT) | 866 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT))) 867 priv->link_speed_capa |= ETH_LINK_SPEED_40G; 868 if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseKR4_Full_BIT) | 869 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseCR4_Full_BIT) | 870 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseSR4_Full_BIT) | 871 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseLR4_Full_BIT))) 872 priv->link_speed_capa |= ETH_LINK_SPEED_56G; 873 if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_25000baseCR_Full_BIT) | 874 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_25000baseKR_Full_BIT) | 875 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_25000baseSR_Full_BIT))) 876 priv->link_speed_capa |= ETH_LINK_SPEED_25G; 877 if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_50000baseCR2_Full_BIT) | 878 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_50000baseKR2_Full_BIT))) 879 priv->link_speed_capa |= ETH_LINK_SPEED_50G; 880 if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT) | 881 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseSR4_Full_BIT) | 882 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT) | 883 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT))) 884 priv->link_speed_capa |= ETH_LINK_SPEED_100G; 885 dev_link.link_duplex = ((ecmd->duplex == DUPLEX_HALF) ? 886 ETH_LINK_HALF_DUPLEX : ETH_LINK_FULL_DUPLEX); 887 dev_link.link_autoneg = !(dev->data->dev_conf.link_speeds & 888 ETH_LINK_SPEED_FIXED); 889 if (((dev_link.link_speed && !dev_link.link_status) || 890 (!dev_link.link_speed && dev_link.link_status))) { 891 rte_errno = EAGAIN; 892 return -rte_errno; 893 } 894 *link = dev_link; 895 return 0; 896 } 897 898 /** 899 * DPDK callback to retrieve physical link information. 900 * 901 * @param dev 902 * Pointer to Ethernet device structure. 903 * @param wait_to_complete 904 * Wait for request completion. 905 * 906 * @return 907 * 0 if link status was not updated, positive if it was, a negative errno 908 * value otherwise and rte_errno is set. 909 */ 910 int 911 mlx5_link_update(struct rte_eth_dev *dev, int wait_to_complete) 912 { 913 int ret; 914 struct rte_eth_link dev_link; 915 time_t start_time = time(NULL); 916 917 do { 918 ret = mlx5_link_update_unlocked_gs(dev, &dev_link); 919 if (ret) 920 ret = mlx5_link_update_unlocked_gset(dev, &dev_link); 921 if (ret == 0) 922 break; 923 /* Handle wait to complete situation. */ 924 if (wait_to_complete && ret == -EAGAIN) { 925 if (abs((int)difftime(time(NULL), start_time)) < 926 MLX5_LINK_STATUS_TIMEOUT) { 927 usleep(0); 928 continue; 929 } else { 930 rte_errno = EBUSY; 931 return -rte_errno; 932 } 933 } else if (ret < 0) { 934 return ret; 935 } 936 } while (wait_to_complete); 937 ret = !!memcmp(&dev->data->dev_link, &dev_link, 938 sizeof(struct rte_eth_link)); 939 dev->data->dev_link = dev_link; 940 return ret; 941 } 942 943 /** 944 * DPDK callback to change the MTU. 945 * 946 * @param dev 947 * Pointer to Ethernet device structure. 948 * @param in_mtu 949 * New MTU. 950 * 951 * @return 952 * 0 on success, a negative errno value otherwise and rte_errno is set. 953 */ 954 int 955 mlx5_dev_set_mtu(struct rte_eth_dev *dev, uint16_t mtu) 956 { 957 struct mlx5_priv *priv = dev->data->dev_private; 958 uint16_t kern_mtu = 0; 959 int ret; 960 961 ret = mlx5_get_mtu(dev, &kern_mtu); 962 if (ret) 963 return ret; 964 /* Set kernel interface MTU first. */ 965 ret = mlx5_set_mtu(dev, mtu); 966 if (ret) 967 return ret; 968 ret = mlx5_get_mtu(dev, &kern_mtu); 969 if (ret) 970 return ret; 971 if (kern_mtu == mtu) { 972 priv->mtu = mtu; 973 DRV_LOG(DEBUG, "port %u adapter MTU set to %u", 974 dev->data->port_id, mtu); 975 return 0; 976 } 977 rte_errno = EAGAIN; 978 return -rte_errno; 979 } 980 981 /** 982 * DPDK callback to get flow control status. 983 * 984 * @param dev 985 * Pointer to Ethernet device structure. 986 * @param[out] fc_conf 987 * Flow control output buffer. 988 * 989 * @return 990 * 0 on success, a negative errno value otherwise and rte_errno is set. 991 */ 992 int 993 mlx5_dev_get_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf) 994 { 995 struct ifreq ifr; 996 struct ethtool_pauseparam ethpause = { 997 .cmd = ETHTOOL_GPAUSEPARAM 998 }; 999 int ret; 1000 1001 ifr.ifr_data = (void *)ðpause; 1002 ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr); 1003 if (ret) { 1004 DRV_LOG(WARNING, 1005 "port %u ioctl(SIOCETHTOOL, ETHTOOL_GPAUSEPARAM) failed:" 1006 " %s", 1007 dev->data->port_id, strerror(rte_errno)); 1008 return ret; 1009 } 1010 fc_conf->autoneg = ethpause.autoneg; 1011 if (ethpause.rx_pause && ethpause.tx_pause) 1012 fc_conf->mode = RTE_FC_FULL; 1013 else if (ethpause.rx_pause) 1014 fc_conf->mode = RTE_FC_RX_PAUSE; 1015 else if (ethpause.tx_pause) 1016 fc_conf->mode = RTE_FC_TX_PAUSE; 1017 else 1018 fc_conf->mode = RTE_FC_NONE; 1019 return 0; 1020 } 1021 1022 /** 1023 * DPDK callback to modify flow control parameters. 1024 * 1025 * @param dev 1026 * Pointer to Ethernet device structure. 1027 * @param[in] fc_conf 1028 * Flow control parameters. 1029 * 1030 * @return 1031 * 0 on success, a negative errno value otherwise and rte_errno is set. 1032 */ 1033 int 1034 mlx5_dev_set_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf) 1035 { 1036 struct ifreq ifr; 1037 struct ethtool_pauseparam ethpause = { 1038 .cmd = ETHTOOL_SPAUSEPARAM 1039 }; 1040 int ret; 1041 1042 ifr.ifr_data = (void *)ðpause; 1043 ethpause.autoneg = fc_conf->autoneg; 1044 if (((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL) || 1045 (fc_conf->mode & RTE_FC_RX_PAUSE)) 1046 ethpause.rx_pause = 1; 1047 else 1048 ethpause.rx_pause = 0; 1049 1050 if (((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL) || 1051 (fc_conf->mode & RTE_FC_TX_PAUSE)) 1052 ethpause.tx_pause = 1; 1053 else 1054 ethpause.tx_pause = 0; 1055 ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr); 1056 if (ret) { 1057 DRV_LOG(WARNING, 1058 "port %u ioctl(SIOCETHTOOL, ETHTOOL_SPAUSEPARAM)" 1059 " failed: %s", 1060 dev->data->port_id, strerror(rte_errno)); 1061 return ret; 1062 } 1063 return 0; 1064 } 1065 1066 /** 1067 * Get PCI information from struct ibv_device. 1068 * 1069 * @param device 1070 * Pointer to Ethernet device structure. 1071 * @param[out] pci_addr 1072 * PCI bus address output buffer. 1073 * 1074 * @return 1075 * 0 on success, a negative errno value otherwise and rte_errno is set. 1076 */ 1077 int 1078 mlx5_ibv_device_to_pci_addr(const struct ibv_device *device, 1079 struct rte_pci_addr *pci_addr) 1080 { 1081 FILE *file; 1082 char line[32]; 1083 MKSTR(path, "%s/device/uevent", device->ibdev_path); 1084 1085 file = fopen(path, "rb"); 1086 if (file == NULL) { 1087 rte_errno = errno; 1088 return -rte_errno; 1089 } 1090 while (fgets(line, sizeof(line), file) == line) { 1091 size_t len = strlen(line); 1092 int ret; 1093 1094 /* Truncate long lines. */ 1095 if (len == (sizeof(line) - 1)) 1096 while (line[(len - 1)] != '\n') { 1097 ret = fgetc(file); 1098 if (ret == EOF) 1099 break; 1100 line[(len - 1)] = ret; 1101 } 1102 /* Extract information. */ 1103 if (sscanf(line, 1104 "PCI_SLOT_NAME=" 1105 "%" SCNx32 ":%" SCNx8 ":%" SCNx8 ".%" SCNx8 "\n", 1106 &pci_addr->domain, 1107 &pci_addr->bus, 1108 &pci_addr->devid, 1109 &pci_addr->function) == 4) { 1110 ret = 0; 1111 break; 1112 } 1113 } 1114 fclose(file); 1115 return 0; 1116 } 1117 1118 /** 1119 * Handle asynchronous removal event for entire multiport device. 1120 * 1121 * @param sh 1122 * Infiniband device shared context. 1123 */ 1124 static void 1125 mlx5_dev_interrupt_device_fatal(struct mlx5_ibv_shared *sh) 1126 { 1127 uint32_t i; 1128 1129 for (i = 0; i < sh->max_port; ++i) { 1130 struct rte_eth_dev *dev; 1131 1132 if (sh->port[i].ih_port_id >= RTE_MAX_ETHPORTS) { 1133 /* 1134 * Or not existing port either no 1135 * handler installed for this port. 1136 */ 1137 continue; 1138 } 1139 dev = &rte_eth_devices[sh->port[i].ih_port_id]; 1140 assert(dev); 1141 if (dev->data->dev_conf.intr_conf.rmv) 1142 _rte_eth_dev_callback_process 1143 (dev, RTE_ETH_EVENT_INTR_RMV, NULL); 1144 } 1145 } 1146 1147 /** 1148 * Handle shared asynchronous events the NIC (removal event 1149 * and link status change). Supports multiport IB device. 1150 * 1151 * @param cb_arg 1152 * Callback argument. 1153 */ 1154 void 1155 mlx5_dev_interrupt_handler(void *cb_arg) 1156 { 1157 struct mlx5_ibv_shared *sh = cb_arg; 1158 struct ibv_async_event event; 1159 1160 /* Read all message from the IB device and acknowledge them. */ 1161 for (;;) { 1162 struct rte_eth_dev *dev; 1163 uint32_t tmp; 1164 1165 if (mlx5_glue->get_async_event(sh->ctx, &event)) 1166 break; 1167 /* Retrieve and check IB port index. */ 1168 tmp = (uint32_t)event.element.port_num; 1169 if (!tmp && event.event_type == IBV_EVENT_DEVICE_FATAL) { 1170 /* 1171 * The DEVICE_FATAL event is called once for 1172 * entire device without port specifying. 1173 * We should notify all existing ports. 1174 */ 1175 mlx5_glue->ack_async_event(&event); 1176 mlx5_dev_interrupt_device_fatal(sh); 1177 continue; 1178 } 1179 assert(tmp && (tmp <= sh->max_port)); 1180 if (!tmp) { 1181 /* Unsupported devive level event. */ 1182 mlx5_glue->ack_async_event(&event); 1183 DRV_LOG(DEBUG, 1184 "unsupported common event (type %d)", 1185 event.event_type); 1186 continue; 1187 } 1188 if (tmp > sh->max_port) { 1189 /* Invalid IB port index. */ 1190 mlx5_glue->ack_async_event(&event); 1191 DRV_LOG(DEBUG, 1192 "cannot handle an event (type %d)" 1193 "due to invalid IB port index (%u)", 1194 event.event_type, tmp); 1195 continue; 1196 } 1197 if (sh->port[tmp - 1].ih_port_id >= RTE_MAX_ETHPORTS) { 1198 /* No handler installed. */ 1199 mlx5_glue->ack_async_event(&event); 1200 DRV_LOG(DEBUG, 1201 "cannot handle an event (type %d)" 1202 "due to no handler installed for port %u", 1203 event.event_type, tmp); 1204 continue; 1205 } 1206 /* Retrieve ethernet device descriptor. */ 1207 tmp = sh->port[tmp - 1].ih_port_id; 1208 dev = &rte_eth_devices[tmp]; 1209 assert(dev); 1210 if ((event.event_type == IBV_EVENT_PORT_ACTIVE || 1211 event.event_type == IBV_EVENT_PORT_ERR) && 1212 dev->data->dev_conf.intr_conf.lsc) { 1213 mlx5_glue->ack_async_event(&event); 1214 if (mlx5_link_update(dev, 0) == -EAGAIN) { 1215 usleep(0); 1216 continue; 1217 } 1218 _rte_eth_dev_callback_process 1219 (dev, RTE_ETH_EVENT_INTR_LSC, NULL); 1220 continue; 1221 } 1222 DRV_LOG(DEBUG, 1223 "port %u cannot handle an unknown event (type %d)", 1224 dev->data->port_id, event.event_type); 1225 mlx5_glue->ack_async_event(&event); 1226 } 1227 } 1228 1229 /** 1230 * Uninstall shared asynchronous device events handler. 1231 * This function is implemeted to support event sharing 1232 * between multiple ports of single IB device. 1233 * 1234 * @param dev 1235 * Pointer to Ethernet device. 1236 */ 1237 static void 1238 mlx5_dev_shared_handler_uninstall(struct rte_eth_dev *dev) 1239 { 1240 struct mlx5_priv *priv = dev->data->dev_private; 1241 struct mlx5_ibv_shared *sh = priv->sh; 1242 1243 if (rte_eal_process_type() != RTE_PROC_PRIMARY) 1244 return; 1245 pthread_mutex_lock(&sh->intr_mutex); 1246 assert(priv->ibv_port); 1247 assert(priv->ibv_port <= sh->max_port); 1248 assert(dev->data->port_id < RTE_MAX_ETHPORTS); 1249 if (sh->port[priv->ibv_port - 1].ih_port_id >= RTE_MAX_ETHPORTS) 1250 goto exit; 1251 assert(sh->port[priv->ibv_port - 1].ih_port_id == 1252 (uint32_t)dev->data->port_id); 1253 assert(sh->intr_cnt); 1254 sh->port[priv->ibv_port - 1].ih_port_id = RTE_MAX_ETHPORTS; 1255 if (!sh->intr_cnt || --sh->intr_cnt) 1256 goto exit; 1257 rte_intr_callback_unregister(&sh->intr_handle, 1258 mlx5_dev_interrupt_handler, sh); 1259 sh->intr_handle.fd = 0; 1260 sh->intr_handle.type = RTE_INTR_HANDLE_UNKNOWN; 1261 exit: 1262 pthread_mutex_unlock(&sh->intr_mutex); 1263 } 1264 1265 /** 1266 * Install shared asyncronous device events handler. 1267 * This function is implemeted to support event sharing 1268 * between multiple ports of single IB device. 1269 * 1270 * @param dev 1271 * Pointer to Ethernet device. 1272 */ 1273 static void 1274 mlx5_dev_shared_handler_install(struct rte_eth_dev *dev) 1275 { 1276 struct mlx5_priv *priv = dev->data->dev_private; 1277 struct mlx5_ibv_shared *sh = priv->sh; 1278 int ret; 1279 int flags; 1280 1281 if (rte_eal_process_type() != RTE_PROC_PRIMARY) 1282 return; 1283 pthread_mutex_lock(&sh->intr_mutex); 1284 assert(priv->ibv_port); 1285 assert(priv->ibv_port <= sh->max_port); 1286 assert(dev->data->port_id < RTE_MAX_ETHPORTS); 1287 if (sh->port[priv->ibv_port - 1].ih_port_id < RTE_MAX_ETHPORTS) { 1288 /* The handler is already installed for this port. */ 1289 assert(sh->intr_cnt); 1290 goto exit; 1291 } 1292 sh->port[priv->ibv_port - 1].ih_port_id = (uint32_t)dev->data->port_id; 1293 if (sh->intr_cnt) { 1294 sh->intr_cnt++; 1295 goto exit; 1296 } 1297 /* No shared handler installed. */ 1298 assert(sh->ctx->async_fd > 0); 1299 flags = fcntl(sh->ctx->async_fd, F_GETFL); 1300 ret = fcntl(sh->ctx->async_fd, F_SETFL, flags | O_NONBLOCK); 1301 if (ret) { 1302 DRV_LOG(INFO, "failed to change file descriptor" 1303 " async event queue"); 1304 /* Indicate there will be no interrupts. */ 1305 dev->data->dev_conf.intr_conf.lsc = 0; 1306 dev->data->dev_conf.intr_conf.rmv = 0; 1307 sh->port[priv->ibv_port - 1].ih_port_id = RTE_MAX_ETHPORTS; 1308 goto exit; 1309 } 1310 sh->intr_handle.fd = sh->ctx->async_fd; 1311 sh->intr_handle.type = RTE_INTR_HANDLE_EXT; 1312 rte_intr_callback_register(&sh->intr_handle, 1313 mlx5_dev_interrupt_handler, sh); 1314 sh->intr_cnt++; 1315 exit: 1316 pthread_mutex_unlock(&sh->intr_mutex); 1317 } 1318 1319 /** 1320 * Uninstall interrupt handler. 1321 * 1322 * @param dev 1323 * Pointer to Ethernet device. 1324 */ 1325 void 1326 mlx5_dev_interrupt_handler_uninstall(struct rte_eth_dev *dev) 1327 { 1328 mlx5_dev_shared_handler_uninstall(dev); 1329 } 1330 1331 /** 1332 * Install interrupt handler. 1333 * 1334 * @param dev 1335 * Pointer to Ethernet device. 1336 */ 1337 void 1338 mlx5_dev_interrupt_handler_install(struct rte_eth_dev *dev) 1339 { 1340 mlx5_dev_shared_handler_install(dev); 1341 } 1342 1343 /** 1344 * DPDK callback to bring the link DOWN. 1345 * 1346 * @param dev 1347 * Pointer to Ethernet device structure. 1348 * 1349 * @return 1350 * 0 on success, a negative errno value otherwise and rte_errno is set. 1351 */ 1352 int 1353 mlx5_set_link_down(struct rte_eth_dev *dev) 1354 { 1355 return mlx5_set_flags(dev, ~IFF_UP, ~IFF_UP); 1356 } 1357 1358 /** 1359 * DPDK callback to bring the link UP. 1360 * 1361 * @param dev 1362 * Pointer to Ethernet device structure. 1363 * 1364 * @return 1365 * 0 on success, a negative errno value otherwise and rte_errno is set. 1366 */ 1367 int 1368 mlx5_set_link_up(struct rte_eth_dev *dev) 1369 { 1370 return mlx5_set_flags(dev, ~IFF_UP, IFF_UP); 1371 } 1372 1373 /** 1374 * Configure the TX function to use. 1375 * 1376 * @param dev 1377 * Pointer to private data structure. 1378 * 1379 * @return 1380 * Pointer to selected Tx burst function. 1381 */ 1382 eth_tx_burst_t 1383 mlx5_select_tx_function(struct rte_eth_dev *dev) 1384 { 1385 struct mlx5_priv *priv = dev->data->dev_private; 1386 eth_tx_burst_t tx_pkt_burst = mlx5_tx_burst; 1387 struct mlx5_dev_config *config = &priv->config; 1388 uint64_t tx_offloads = dev->data->dev_conf.txmode.offloads; 1389 int tso = !!(tx_offloads & (DEV_TX_OFFLOAD_TCP_TSO | 1390 DEV_TX_OFFLOAD_VXLAN_TNL_TSO | 1391 DEV_TX_OFFLOAD_GRE_TNL_TSO | 1392 DEV_TX_OFFLOAD_IP_TNL_TSO | 1393 DEV_TX_OFFLOAD_UDP_TNL_TSO)); 1394 int swp = !!(tx_offloads & (DEV_TX_OFFLOAD_IP_TNL_TSO | 1395 DEV_TX_OFFLOAD_UDP_TNL_TSO | 1396 DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM)); 1397 int vlan_insert = !!(tx_offloads & DEV_TX_OFFLOAD_VLAN_INSERT); 1398 1399 assert(priv != NULL); 1400 /* Select appropriate TX function. */ 1401 if (vlan_insert || tso || swp) 1402 return tx_pkt_burst; 1403 if (config->mps == MLX5_MPW_ENHANCED) { 1404 if (mlx5_check_vec_tx_support(dev) > 0) { 1405 if (mlx5_check_raw_vec_tx_support(dev) > 0) 1406 tx_pkt_burst = mlx5_tx_burst_raw_vec; 1407 else 1408 tx_pkt_burst = mlx5_tx_burst_vec; 1409 DRV_LOG(DEBUG, 1410 "port %u selected enhanced MPW Tx vectorized" 1411 " function", 1412 dev->data->port_id); 1413 } else { 1414 tx_pkt_burst = mlx5_tx_burst_empw; 1415 DRV_LOG(DEBUG, 1416 "port %u selected enhanced MPW Tx function", 1417 dev->data->port_id); 1418 } 1419 } else if (config->mps && (config->txq_inline > 0)) { 1420 tx_pkt_burst = mlx5_tx_burst_mpw_inline; 1421 DRV_LOG(DEBUG, "port %u selected MPW inline Tx function", 1422 dev->data->port_id); 1423 } else if (config->mps) { 1424 tx_pkt_burst = mlx5_tx_burst_mpw; 1425 DRV_LOG(DEBUG, "port %u selected MPW Tx function", 1426 dev->data->port_id); 1427 } 1428 return tx_pkt_burst; 1429 } 1430 1431 /** 1432 * Configure the RX function to use. 1433 * 1434 * @param dev 1435 * Pointer to private data structure. 1436 * 1437 * @return 1438 * Pointer to selected Rx burst function. 1439 */ 1440 eth_rx_burst_t 1441 mlx5_select_rx_function(struct rte_eth_dev *dev) 1442 { 1443 eth_rx_burst_t rx_pkt_burst = mlx5_rx_burst; 1444 1445 assert(dev != NULL); 1446 if (mlx5_check_vec_rx_support(dev) > 0) { 1447 rx_pkt_burst = mlx5_rx_burst_vec; 1448 DRV_LOG(DEBUG, "port %u selected Rx vectorized function", 1449 dev->data->port_id); 1450 } else if (mlx5_mprq_enabled(dev)) { 1451 rx_pkt_burst = mlx5_rx_burst_mprq; 1452 } 1453 return rx_pkt_burst; 1454 } 1455 1456 /** 1457 * Check if mlx5 device was removed. 1458 * 1459 * @param dev 1460 * Pointer to Ethernet device structure. 1461 * 1462 * @return 1463 * 1 when device is removed, otherwise 0. 1464 */ 1465 int 1466 mlx5_is_removed(struct rte_eth_dev *dev) 1467 { 1468 struct ibv_device_attr device_attr; 1469 struct mlx5_priv *priv = dev->data->dev_private; 1470 1471 if (mlx5_glue->query_device(priv->sh->ctx, &device_attr) == EIO) 1472 return 1; 1473 return 0; 1474 } 1475 1476 /** 1477 * Get port ID list of mlx5 instances sharing a common device. 1478 * 1479 * @param[in] dev 1480 * Device to look for. 1481 * @param[out] port_list 1482 * Result buffer for collected port IDs. 1483 * @param port_list_n 1484 * Maximum number of entries in result buffer. If 0, @p port_list can be 1485 * NULL. 1486 * 1487 * @return 1488 * Number of matching instances regardless of the @p port_list_n 1489 * parameter, 0 if none were found. 1490 */ 1491 unsigned int 1492 mlx5_dev_to_port_id(const struct rte_device *dev, uint16_t *port_list, 1493 unsigned int port_list_n) 1494 { 1495 uint16_t id; 1496 unsigned int n = 0; 1497 1498 RTE_ETH_FOREACH_DEV_OF(id, dev) { 1499 if (n < port_list_n) 1500 port_list[n] = id; 1501 n++; 1502 } 1503 return n; 1504 } 1505 1506 /** 1507 * Get the E-Switch domain id this port belongs to. 1508 * 1509 * @param[in] port 1510 * Device port id. 1511 * @param[out] es_domain_id 1512 * E-Switch domain id. 1513 * @param[out] es_port_id 1514 * The port id of the port in the E-Switch. 1515 * 1516 * @return 1517 * 0 on success, a negative errno value otherwise and rte_errno is set. 1518 */ 1519 int 1520 mlx5_port_to_eswitch_info(uint16_t port, 1521 uint16_t *es_domain_id, uint16_t *es_port_id) 1522 { 1523 struct rte_eth_dev *dev; 1524 struct mlx5_priv *priv; 1525 1526 if (port >= RTE_MAX_ETHPORTS) { 1527 rte_errno = EINVAL; 1528 return -rte_errno; 1529 } 1530 if (!rte_eth_dev_is_valid_port(port)) { 1531 rte_errno = ENODEV; 1532 return -rte_errno; 1533 } 1534 dev = &rte_eth_devices[port]; 1535 priv = dev->data->dev_private; 1536 if (!(priv->representor || priv->master)) { 1537 rte_errno = EINVAL; 1538 return -rte_errno; 1539 } 1540 if (es_domain_id) 1541 *es_domain_id = priv->domain_id; 1542 if (es_port_id) 1543 *es_port_id = priv->vport_id; 1544 return 0; 1545 } 1546 1547 /** 1548 * Get switch information associated with network interface. 1549 * 1550 * @param ifindex 1551 * Network interface index. 1552 * @param[out] info 1553 * Switch information object, populated in case of success. 1554 * 1555 * @return 1556 * 0 on success, a negative errno value otherwise and rte_errno is set. 1557 */ 1558 int 1559 mlx5_sysfs_switch_info(unsigned int ifindex, struct mlx5_switch_info *info) 1560 { 1561 char ifname[IF_NAMESIZE]; 1562 char port_name[IF_NAMESIZE]; 1563 FILE *file; 1564 struct mlx5_switch_info data = { 1565 .master = 0, 1566 .representor = 0, 1567 .name_type = MLX5_PHYS_PORT_NAME_TYPE_NOTSET, 1568 .port_name = 0, 1569 .switch_id = 0, 1570 }; 1571 DIR *dir; 1572 bool port_switch_id_set = false; 1573 bool device_dir = false; 1574 char c; 1575 int ret; 1576 1577 if (!if_indextoname(ifindex, ifname)) { 1578 rte_errno = errno; 1579 return -rte_errno; 1580 } 1581 1582 MKSTR(phys_port_name, "/sys/class/net/%s/phys_port_name", 1583 ifname); 1584 MKSTR(phys_switch_id, "/sys/class/net/%s/phys_switch_id", 1585 ifname); 1586 MKSTR(pci_device, "/sys/class/net/%s/device", 1587 ifname); 1588 1589 file = fopen(phys_port_name, "rb"); 1590 if (file != NULL) { 1591 ret = fscanf(file, "%s", port_name); 1592 fclose(file); 1593 if (ret == 1) 1594 mlx5_translate_port_name(port_name, &data); 1595 } 1596 file = fopen(phys_switch_id, "rb"); 1597 if (file == NULL) { 1598 rte_errno = errno; 1599 return -rte_errno; 1600 } 1601 port_switch_id_set = 1602 fscanf(file, "%" SCNx64 "%c", &data.switch_id, &c) == 2 && 1603 c == '\n'; 1604 fclose(file); 1605 dir = opendir(pci_device); 1606 if (dir != NULL) { 1607 closedir(dir); 1608 device_dir = true; 1609 } 1610 if (port_switch_id_set) { 1611 /* We have some E-Switch configuration. */ 1612 mlx5_sysfs_check_switch_info(device_dir, &data); 1613 } 1614 *info = data; 1615 assert(!(data.master && data.representor)); 1616 if (data.master && data.representor) { 1617 DRV_LOG(ERR, "ifindex %u device is recognized as master" 1618 " and as representor", ifindex); 1619 rte_errno = ENODEV; 1620 return -rte_errno; 1621 } 1622 return 0; 1623 } 1624 1625 /** 1626 * Analyze gathered port parameters via Netlink to recognize master 1627 * and representor devices for E-Switch configuration. 1628 * 1629 * @param[in] num_vf_set 1630 * flag of presence of number of VFs port attribute. 1631 * @param[inout] switch_info 1632 * Port information, including port name as a number and port name 1633 * type if recognized 1634 * 1635 * @return 1636 * master and representor flags are set in switch_info according to 1637 * recognized parameters (if any). 1638 */ 1639 void 1640 mlx5_nl_check_switch_info(bool num_vf_set, 1641 struct mlx5_switch_info *switch_info) 1642 { 1643 switch (switch_info->name_type) { 1644 case MLX5_PHYS_PORT_NAME_TYPE_UNKNOWN: 1645 /* 1646 * Name is not recognized, assume the master, 1647 * check the number of VFs key presence. 1648 */ 1649 switch_info->master = num_vf_set; 1650 break; 1651 case MLX5_PHYS_PORT_NAME_TYPE_NOTSET: 1652 /* 1653 * Name is not set, this assumes the legacy naming 1654 * schema for master, just check if there is a 1655 * number of VFs key. 1656 */ 1657 switch_info->master = num_vf_set; 1658 break; 1659 case MLX5_PHYS_PORT_NAME_TYPE_UPLINK: 1660 /* New uplink naming schema recognized. */ 1661 switch_info->master = 1; 1662 break; 1663 case MLX5_PHYS_PORT_NAME_TYPE_LEGACY: 1664 /* Legacy representors naming schema. */ 1665 switch_info->representor = !num_vf_set; 1666 break; 1667 case MLX5_PHYS_PORT_NAME_TYPE_PFVF: 1668 /* New representors naming schema. */ 1669 switch_info->representor = 1; 1670 break; 1671 } 1672 } 1673 1674 /** 1675 * Analyze gathered port parameters via sysfs to recognize master 1676 * and representor devices for E-Switch configuration. 1677 * 1678 * @param[in] device_dir 1679 * flag of presence of "device" directory under port device key. 1680 * @param[inout] switch_info 1681 * Port information, including port name as a number and port name 1682 * type if recognized 1683 * 1684 * @return 1685 * master and representor flags are set in switch_info according to 1686 * recognized parameters (if any). 1687 */ 1688 void 1689 mlx5_sysfs_check_switch_info(bool device_dir, 1690 struct mlx5_switch_info *switch_info) 1691 { 1692 switch (switch_info->name_type) { 1693 case MLX5_PHYS_PORT_NAME_TYPE_UNKNOWN: 1694 /* 1695 * Name is not recognized, assume the master, 1696 * check the device directory presence. 1697 */ 1698 switch_info->master = device_dir; 1699 break; 1700 case MLX5_PHYS_PORT_NAME_TYPE_NOTSET: 1701 /* 1702 * Name is not set, this assumes the legacy naming 1703 * schema for master, just check if there is 1704 * a device directory. 1705 */ 1706 switch_info->master = device_dir; 1707 break; 1708 case MLX5_PHYS_PORT_NAME_TYPE_UPLINK: 1709 /* New uplink naming schema recognized. */ 1710 switch_info->master = 1; 1711 break; 1712 case MLX5_PHYS_PORT_NAME_TYPE_LEGACY: 1713 /* Legacy representors naming schema. */ 1714 switch_info->representor = !device_dir; 1715 break; 1716 case MLX5_PHYS_PORT_NAME_TYPE_PFVF: 1717 /* New representors naming schema. */ 1718 switch_info->representor = 1; 1719 break; 1720 } 1721 } 1722 1723 /** 1724 * Extract port name, as a number, from sysfs or netlink information. 1725 * 1726 * @param[in] port_name_in 1727 * String representing the port name. 1728 * @param[out] port_info_out 1729 * Port information, including port name as a number and port name 1730 * type if recognized 1731 * 1732 * @return 1733 * port_name field set according to recognized name format. 1734 */ 1735 void 1736 mlx5_translate_port_name(const char *port_name_in, 1737 struct mlx5_switch_info *port_info_out) 1738 { 1739 char pf_c1, pf_c2, vf_c1, vf_c2; 1740 char *end; 1741 int sc_items; 1742 1743 /* 1744 * Check for port-name as a string of the form pf0vf0 1745 * (support kernel ver >= 5.0 or OFED ver >= 4.6). 1746 */ 1747 sc_items = sscanf(port_name_in, "%c%c%d%c%c%d", 1748 &pf_c1, &pf_c2, &port_info_out->pf_num, 1749 &vf_c1, &vf_c2, &port_info_out->port_name); 1750 if (sc_items == 6 && 1751 pf_c1 == 'p' && pf_c2 == 'f' && 1752 vf_c1 == 'v' && vf_c2 == 'f') { 1753 port_info_out->name_type = MLX5_PHYS_PORT_NAME_TYPE_PFVF; 1754 return; 1755 } 1756 /* 1757 * Check for port-name as a string of the form p0 1758 * (support kernel ver >= 5.0, or OFED ver >= 4.6). 1759 */ 1760 sc_items = sscanf(port_name_in, "%c%d", 1761 &pf_c1, &port_info_out->port_name); 1762 if (sc_items == 2 && pf_c1 == 'p') { 1763 port_info_out->name_type = MLX5_PHYS_PORT_NAME_TYPE_UPLINK; 1764 return; 1765 } 1766 /* Check for port-name as a number (support kernel ver < 5.0 */ 1767 errno = 0; 1768 port_info_out->port_name = strtol(port_name_in, &end, 0); 1769 if (!errno && 1770 (size_t)(end - port_name_in) == strlen(port_name_in)) { 1771 port_info_out->name_type = MLX5_PHYS_PORT_NAME_TYPE_LEGACY; 1772 return; 1773 } 1774 port_info_out->name_type = MLX5_PHYS_PORT_NAME_TYPE_UNKNOWN; 1775 return; 1776 } 1777