1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright 2015 6WIND S.A. 3 * Copyright 2015 Mellanox Technologies, Ltd 4 */ 5 6 #define _GNU_SOURCE 7 8 #include <stddef.h> 9 #include <assert.h> 10 #include <inttypes.h> 11 #include <unistd.h> 12 #include <stdint.h> 13 #include <stdio.h> 14 #include <string.h> 15 #include <stdlib.h> 16 #include <errno.h> 17 #include <dirent.h> 18 #include <net/if.h> 19 #include <sys/ioctl.h> 20 #include <sys/socket.h> 21 #include <netinet/in.h> 22 #include <linux/ethtool.h> 23 #include <linux/sockios.h> 24 #include <fcntl.h> 25 #include <stdalign.h> 26 #include <sys/un.h> 27 #include <time.h> 28 29 #include <rte_atomic.h> 30 #include <rte_ethdev_driver.h> 31 #include <rte_bus_pci.h> 32 #include <rte_mbuf.h> 33 #include <rte_common.h> 34 #include <rte_interrupts.h> 35 #include <rte_malloc.h> 36 #include <rte_string_fns.h> 37 38 #include "mlx5.h" 39 #include "mlx5_glue.h" 40 #include "mlx5_rxtx.h" 41 #include "mlx5_utils.h" 42 43 /* Add defines in case the running kernel is not the same as user headers. */ 44 #ifndef ETHTOOL_GLINKSETTINGS 45 struct ethtool_link_settings { 46 uint32_t cmd; 47 uint32_t speed; 48 uint8_t duplex; 49 uint8_t port; 50 uint8_t phy_address; 51 uint8_t autoneg; 52 uint8_t mdio_support; 53 uint8_t eth_to_mdix; 54 uint8_t eth_tp_mdix_ctrl; 55 int8_t link_mode_masks_nwords; 56 uint32_t reserved[8]; 57 uint32_t link_mode_masks[]; 58 }; 59 60 #define ETHTOOL_GLINKSETTINGS 0x0000004c 61 #define ETHTOOL_LINK_MODE_1000baseT_Full_BIT 5 62 #define ETHTOOL_LINK_MODE_Autoneg_BIT 6 63 #define ETHTOOL_LINK_MODE_1000baseKX_Full_BIT 17 64 #define ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT 18 65 #define ETHTOOL_LINK_MODE_10000baseKR_Full_BIT 19 66 #define ETHTOOL_LINK_MODE_10000baseR_FEC_BIT 20 67 #define ETHTOOL_LINK_MODE_20000baseMLD2_Full_BIT 21 68 #define ETHTOOL_LINK_MODE_20000baseKR2_Full_BIT 22 69 #define ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT 23 70 #define ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT 24 71 #define ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT 25 72 #define ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT 26 73 #define ETHTOOL_LINK_MODE_56000baseKR4_Full_BIT 27 74 #define ETHTOOL_LINK_MODE_56000baseCR4_Full_BIT 28 75 #define ETHTOOL_LINK_MODE_56000baseSR4_Full_BIT 29 76 #define ETHTOOL_LINK_MODE_56000baseLR4_Full_BIT 30 77 #endif 78 #ifndef HAVE_ETHTOOL_LINK_MODE_25G 79 #define ETHTOOL_LINK_MODE_25000baseCR_Full_BIT 31 80 #define ETHTOOL_LINK_MODE_25000baseKR_Full_BIT 32 81 #define ETHTOOL_LINK_MODE_25000baseSR_Full_BIT 33 82 #endif 83 #ifndef HAVE_ETHTOOL_LINK_MODE_50G 84 #define ETHTOOL_LINK_MODE_50000baseCR2_Full_BIT 34 85 #define ETHTOOL_LINK_MODE_50000baseKR2_Full_BIT 35 86 #endif 87 #ifndef HAVE_ETHTOOL_LINK_MODE_100G 88 #define ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT 36 89 #define ETHTOOL_LINK_MODE_100000baseSR4_Full_BIT 37 90 #define ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT 38 91 #define ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT 39 92 #endif 93 94 /** 95 * Get interface name from private structure. 96 * 97 * @param[in] dev 98 * Pointer to Ethernet device. 99 * @param[out] ifname 100 * Interface name output buffer. 101 * 102 * @return 103 * 0 on success, a negative errno value otherwise and rte_errno is set. 104 */ 105 int 106 mlx5_get_ifname(const struct rte_eth_dev *dev, char (*ifname)[IF_NAMESIZE]) 107 { 108 struct priv *priv = dev->data->dev_private; 109 DIR *dir; 110 struct dirent *dent; 111 unsigned int dev_type = 0; 112 unsigned int dev_port_prev = ~0u; 113 char match[IF_NAMESIZE] = ""; 114 115 { 116 MKSTR(path, "%s/device/net", priv->ibdev_path); 117 118 dir = opendir(path); 119 if (dir == NULL) { 120 rte_errno = errno; 121 return -rte_errno; 122 } 123 } 124 while ((dent = readdir(dir)) != NULL) { 125 char *name = dent->d_name; 126 FILE *file; 127 unsigned int dev_port; 128 int r; 129 130 if ((name[0] == '.') && 131 ((name[1] == '\0') || 132 ((name[1] == '.') && (name[2] == '\0')))) 133 continue; 134 135 MKSTR(path, "%s/device/net/%s/%s", 136 priv->ibdev_path, name, 137 (dev_type ? "dev_id" : "dev_port")); 138 139 file = fopen(path, "rb"); 140 if (file == NULL) { 141 if (errno != ENOENT) 142 continue; 143 /* 144 * Switch to dev_id when dev_port does not exist as 145 * is the case with Linux kernel versions < 3.15. 146 */ 147 try_dev_id: 148 match[0] = '\0'; 149 if (dev_type) 150 break; 151 dev_type = 1; 152 dev_port_prev = ~0u; 153 rewinddir(dir); 154 continue; 155 } 156 r = fscanf(file, (dev_type ? "%x" : "%u"), &dev_port); 157 fclose(file); 158 if (r != 1) 159 continue; 160 /* 161 * Switch to dev_id when dev_port returns the same value for 162 * all ports. May happen when using a MOFED release older than 163 * 3.0 with a Linux kernel >= 3.15. 164 */ 165 if (dev_port == dev_port_prev) 166 goto try_dev_id; 167 dev_port_prev = dev_port; 168 if (dev_port == (priv->port - 1u)) 169 strlcpy(match, name, sizeof(match)); 170 } 171 closedir(dir); 172 if (match[0] == '\0') { 173 rte_errno = ENOENT; 174 return -rte_errno; 175 } 176 strncpy(*ifname, match, sizeof(*ifname)); 177 return 0; 178 } 179 180 /** 181 * Get the interface index from device name. 182 * 183 * @param[in] dev 184 * Pointer to Ethernet device. 185 * 186 * @return 187 * Interface index on success, a negative errno value otherwise and 188 * rte_errno is set. 189 */ 190 int 191 mlx5_ifindex(const struct rte_eth_dev *dev) 192 { 193 char ifname[IF_NAMESIZE]; 194 int ret; 195 196 ret = mlx5_get_ifname(dev, &ifname); 197 if (ret) 198 return ret; 199 ret = if_nametoindex(ifname); 200 if (ret == -1) { 201 rte_errno = errno; 202 return -rte_errno; 203 } 204 return ret; 205 } 206 207 /** 208 * Perform ifreq ioctl() on associated Ethernet device. 209 * 210 * @param[in] dev 211 * Pointer to Ethernet device. 212 * @param req 213 * Request number to pass to ioctl(). 214 * @param[out] ifr 215 * Interface request structure output buffer. 216 * 217 * @return 218 * 0 on success, a negative errno value otherwise and rte_errno is set. 219 */ 220 int 221 mlx5_ifreq(const struct rte_eth_dev *dev, int req, struct ifreq *ifr) 222 { 223 int sock = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP); 224 int ret = 0; 225 226 if (sock == -1) { 227 rte_errno = errno; 228 return -rte_errno; 229 } 230 ret = mlx5_get_ifname(dev, &ifr->ifr_name); 231 if (ret) 232 goto error; 233 ret = ioctl(sock, req, ifr); 234 if (ret == -1) { 235 rte_errno = errno; 236 goto error; 237 } 238 close(sock); 239 return 0; 240 error: 241 close(sock); 242 return -rte_errno; 243 } 244 245 /** 246 * Get device MTU. 247 * 248 * @param dev 249 * Pointer to Ethernet device. 250 * @param[out] mtu 251 * MTU value output buffer. 252 * 253 * @return 254 * 0 on success, a negative errno value otherwise and rte_errno is set. 255 */ 256 int 257 mlx5_get_mtu(struct rte_eth_dev *dev, uint16_t *mtu) 258 { 259 struct ifreq request; 260 int ret = mlx5_ifreq(dev, SIOCGIFMTU, &request); 261 262 if (ret) 263 return ret; 264 *mtu = request.ifr_mtu; 265 return 0; 266 } 267 268 /** 269 * Set device MTU. 270 * 271 * @param dev 272 * Pointer to Ethernet device. 273 * @param mtu 274 * MTU value to set. 275 * 276 * @return 277 * 0 on success, a negative errno value otherwise and rte_errno is set. 278 */ 279 static int 280 mlx5_set_mtu(struct rte_eth_dev *dev, uint16_t mtu) 281 { 282 struct ifreq request = { .ifr_mtu = mtu, }; 283 284 return mlx5_ifreq(dev, SIOCSIFMTU, &request); 285 } 286 287 /** 288 * Set device flags. 289 * 290 * @param dev 291 * Pointer to Ethernet device. 292 * @param keep 293 * Bitmask for flags that must remain untouched. 294 * @param flags 295 * Bitmask for flags to modify. 296 * 297 * @return 298 * 0 on success, a negative errno value otherwise and rte_errno is set. 299 */ 300 int 301 mlx5_set_flags(struct rte_eth_dev *dev, unsigned int keep, unsigned int flags) 302 { 303 struct ifreq request; 304 int ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &request); 305 306 if (ret) 307 return ret; 308 request.ifr_flags &= keep; 309 request.ifr_flags |= flags & ~keep; 310 return mlx5_ifreq(dev, SIOCSIFFLAGS, &request); 311 } 312 313 /** 314 * DPDK callback for Ethernet device configuration. 315 * 316 * @param dev 317 * Pointer to Ethernet device structure. 318 * 319 * @return 320 * 0 on success, a negative errno value otherwise and rte_errno is set. 321 */ 322 int 323 mlx5_dev_configure(struct rte_eth_dev *dev) 324 { 325 struct priv *priv = dev->data->dev_private; 326 unsigned int rxqs_n = dev->data->nb_rx_queues; 327 unsigned int txqs_n = dev->data->nb_tx_queues; 328 unsigned int i; 329 unsigned int j; 330 unsigned int reta_idx_n; 331 const uint8_t use_app_rss_key = 332 !!dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key; 333 uint64_t supp_tx_offloads = mlx5_get_tx_port_offloads(dev); 334 uint64_t tx_offloads = dev->data->dev_conf.txmode.offloads; 335 uint64_t supp_rx_offloads = 336 (mlx5_get_rx_port_offloads() | 337 mlx5_get_rx_queue_offloads(dev)); 338 uint64_t rx_offloads = dev->data->dev_conf.rxmode.offloads; 339 int ret = 0; 340 341 if ((tx_offloads & supp_tx_offloads) != tx_offloads) { 342 DRV_LOG(ERR, 343 "port %u some Tx offloads are not supported requested" 344 " 0x%" PRIx64 " supported 0x%" PRIx64, 345 dev->data->port_id, tx_offloads, supp_tx_offloads); 346 rte_errno = ENOTSUP; 347 return -rte_errno; 348 } 349 if ((rx_offloads & supp_rx_offloads) != rx_offloads) { 350 DRV_LOG(ERR, 351 "port %u some Rx offloads are not supported requested" 352 " 0x%" PRIx64 " supported 0x%" PRIx64, 353 dev->data->port_id, rx_offloads, supp_rx_offloads); 354 rte_errno = ENOTSUP; 355 return -rte_errno; 356 } 357 if (use_app_rss_key && 358 (dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key_len != 359 rss_hash_default_key_len)) { 360 DRV_LOG(ERR, "port %u RSS key len must be %zu Bytes long", 361 dev->data->port_id, rss_hash_default_key_len); 362 rte_errno = EINVAL; 363 return -rte_errno; 364 } 365 priv->rss_conf.rss_key = 366 rte_realloc(priv->rss_conf.rss_key, 367 rss_hash_default_key_len, 0); 368 if (!priv->rss_conf.rss_key) { 369 DRV_LOG(ERR, "port %u cannot allocate RSS hash key memory (%u)", 370 dev->data->port_id, rxqs_n); 371 rte_errno = ENOMEM; 372 return -rte_errno; 373 } 374 memcpy(priv->rss_conf.rss_key, 375 use_app_rss_key ? 376 dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key : 377 rss_hash_default_key, 378 rss_hash_default_key_len); 379 priv->rss_conf.rss_key_len = rss_hash_default_key_len; 380 priv->rss_conf.rss_hf = dev->data->dev_conf.rx_adv_conf.rss_conf.rss_hf; 381 priv->rxqs = (void *)dev->data->rx_queues; 382 priv->txqs = (void *)dev->data->tx_queues; 383 if (txqs_n != priv->txqs_n) { 384 DRV_LOG(INFO, "port %u Tx queues number update: %u -> %u", 385 dev->data->port_id, priv->txqs_n, txqs_n); 386 priv->txqs_n = txqs_n; 387 } 388 if (rxqs_n > priv->config.ind_table_max_size) { 389 DRV_LOG(ERR, "port %u cannot handle this many Rx queues (%u)", 390 dev->data->port_id, rxqs_n); 391 rte_errno = EINVAL; 392 return -rte_errno; 393 } 394 if (rxqs_n == priv->rxqs_n) 395 return 0; 396 DRV_LOG(INFO, "port %u Rx queues number update: %u -> %u", 397 dev->data->port_id, priv->rxqs_n, rxqs_n); 398 priv->rxqs_n = rxqs_n; 399 /* If the requested number of RX queues is not a power of two, use the 400 * maximum indirection table size for better balancing. 401 * The result is always rounded to the next power of two. */ 402 reta_idx_n = (1 << log2above((rxqs_n & (rxqs_n - 1)) ? 403 priv->config.ind_table_max_size : 404 rxqs_n)); 405 ret = mlx5_rss_reta_index_resize(dev, reta_idx_n); 406 if (ret) 407 return ret; 408 /* When the number of RX queues is not a power of two, the remaining 409 * table entries are padded with reused WQs and hashes are not spread 410 * uniformly. */ 411 for (i = 0, j = 0; (i != reta_idx_n); ++i) { 412 (*priv->reta_idx)[i] = j; 413 if (++j == rxqs_n) 414 j = 0; 415 } 416 return 0; 417 } 418 419 /** 420 * Sets default tuning parameters. 421 * 422 * @param dev 423 * Pointer to Ethernet device. 424 * @param[out] info 425 * Info structure output buffer. 426 */ 427 static void 428 mlx5_set_default_params(struct rte_eth_dev *dev, struct rte_eth_dev_info *info) 429 { 430 struct priv *priv = dev->data->dev_private; 431 432 /* Minimum CPU utilization. */ 433 info->default_rxportconf.ring_size = 256; 434 info->default_txportconf.ring_size = 256; 435 info->default_rxportconf.burst_size = 64; 436 info->default_txportconf.burst_size = 64; 437 if (priv->link_speed_capa & ETH_LINK_SPEED_100G) { 438 info->default_rxportconf.nb_queues = 16; 439 info->default_txportconf.nb_queues = 16; 440 if (dev->data->nb_rx_queues > 2 || 441 dev->data->nb_tx_queues > 2) { 442 /* Max Throughput. */ 443 info->default_rxportconf.ring_size = 2048; 444 info->default_txportconf.ring_size = 2048; 445 } 446 } else { 447 info->default_rxportconf.nb_queues = 8; 448 info->default_txportconf.nb_queues = 8; 449 if (dev->data->nb_rx_queues > 2 || 450 dev->data->nb_tx_queues > 2) { 451 /* Max Throughput. */ 452 info->default_rxportconf.ring_size = 4096; 453 info->default_txportconf.ring_size = 4096; 454 } 455 } 456 } 457 458 /** 459 * DPDK callback to get information about the device. 460 * 461 * @param dev 462 * Pointer to Ethernet device structure. 463 * @param[out] info 464 * Info structure output buffer. 465 */ 466 void 467 mlx5_dev_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *info) 468 { 469 struct priv *priv = dev->data->dev_private; 470 struct mlx5_dev_config *config = &priv->config; 471 unsigned int max; 472 char ifname[IF_NAMESIZE]; 473 474 /* FIXME: we should ask the device for these values. */ 475 info->min_rx_bufsize = 32; 476 info->max_rx_pktlen = 65536; 477 /* 478 * Since we need one CQ per QP, the limit is the minimum number 479 * between the two values. 480 */ 481 max = RTE_MIN(priv->device_attr.orig_attr.max_cq, 482 priv->device_attr.orig_attr.max_qp); 483 /* If max >= 65535 then max = 0, max_rx_queues is uint16_t. */ 484 if (max >= 65535) 485 max = 65535; 486 info->max_rx_queues = max; 487 info->max_tx_queues = max; 488 info->max_mac_addrs = MLX5_MAX_UC_MAC_ADDRESSES; 489 info->rx_queue_offload_capa = mlx5_get_rx_queue_offloads(dev); 490 info->rx_offload_capa = (mlx5_get_rx_port_offloads() | 491 info->rx_queue_offload_capa); 492 info->tx_offload_capa = mlx5_get_tx_port_offloads(dev); 493 if (mlx5_get_ifname(dev, &ifname) == 0) 494 info->if_index = if_nametoindex(ifname); 495 info->reta_size = priv->reta_idx_n ? 496 priv->reta_idx_n : config->ind_table_max_size; 497 info->hash_key_size = rss_hash_default_key_len; 498 info->speed_capa = priv->link_speed_capa; 499 info->flow_type_rss_offloads = ~MLX5_RSS_HF_MASK; 500 mlx5_set_default_params(dev, info); 501 } 502 503 /** 504 * Get supported packet types. 505 * 506 * @param dev 507 * Pointer to Ethernet device structure. 508 * 509 * @return 510 * A pointer to the supported Packet types array. 511 */ 512 const uint32_t * 513 mlx5_dev_supported_ptypes_get(struct rte_eth_dev *dev) 514 { 515 static const uint32_t ptypes[] = { 516 /* refers to rxq_cq_to_pkt_type() */ 517 RTE_PTYPE_L2_ETHER, 518 RTE_PTYPE_L3_IPV4_EXT_UNKNOWN, 519 RTE_PTYPE_L3_IPV6_EXT_UNKNOWN, 520 RTE_PTYPE_L4_NONFRAG, 521 RTE_PTYPE_L4_FRAG, 522 RTE_PTYPE_L4_TCP, 523 RTE_PTYPE_L4_UDP, 524 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN, 525 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN, 526 RTE_PTYPE_INNER_L4_NONFRAG, 527 RTE_PTYPE_INNER_L4_FRAG, 528 RTE_PTYPE_INNER_L4_TCP, 529 RTE_PTYPE_INNER_L4_UDP, 530 RTE_PTYPE_UNKNOWN 531 }; 532 533 if (dev->rx_pkt_burst == mlx5_rx_burst || 534 dev->rx_pkt_burst == mlx5_rx_burst_vec) 535 return ptypes; 536 return NULL; 537 } 538 539 /** 540 * DPDK callback to retrieve physical link information. 541 * 542 * @param dev 543 * Pointer to Ethernet device structure. 544 * @param[out] link 545 * Storage for current link status. 546 * 547 * @return 548 * 0 on success, a negative errno value otherwise and rte_errno is set. 549 */ 550 static int 551 mlx5_link_update_unlocked_gset(struct rte_eth_dev *dev, 552 struct rte_eth_link *link) 553 { 554 struct priv *priv = dev->data->dev_private; 555 struct ethtool_cmd edata = { 556 .cmd = ETHTOOL_GSET /* Deprecated since Linux v4.5. */ 557 }; 558 struct ifreq ifr; 559 struct rte_eth_link dev_link; 560 int link_speed = 0; 561 int ret; 562 563 ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr); 564 if (ret) { 565 DRV_LOG(WARNING, "port %u ioctl(SIOCGIFFLAGS) failed: %s", 566 dev->data->port_id, strerror(rte_errno)); 567 return ret; 568 } 569 memset(&dev_link, 0, sizeof(dev_link)); 570 dev_link.link_status = ((ifr.ifr_flags & IFF_UP) && 571 (ifr.ifr_flags & IFF_RUNNING)); 572 ifr.ifr_data = (void *)&edata; 573 ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr); 574 if (ret) { 575 DRV_LOG(WARNING, 576 "port %u ioctl(SIOCETHTOOL, ETHTOOL_GSET) failed: %s", 577 dev->data->port_id, strerror(rte_errno)); 578 return ret; 579 } 580 link_speed = ethtool_cmd_speed(&edata); 581 if (link_speed == -1) 582 dev_link.link_speed = ETH_SPEED_NUM_NONE; 583 else 584 dev_link.link_speed = link_speed; 585 priv->link_speed_capa = 0; 586 if (edata.supported & SUPPORTED_Autoneg) 587 priv->link_speed_capa |= ETH_LINK_SPEED_AUTONEG; 588 if (edata.supported & (SUPPORTED_1000baseT_Full | 589 SUPPORTED_1000baseKX_Full)) 590 priv->link_speed_capa |= ETH_LINK_SPEED_1G; 591 if (edata.supported & SUPPORTED_10000baseKR_Full) 592 priv->link_speed_capa |= ETH_LINK_SPEED_10G; 593 if (edata.supported & (SUPPORTED_40000baseKR4_Full | 594 SUPPORTED_40000baseCR4_Full | 595 SUPPORTED_40000baseSR4_Full | 596 SUPPORTED_40000baseLR4_Full)) 597 priv->link_speed_capa |= ETH_LINK_SPEED_40G; 598 dev_link.link_duplex = ((edata.duplex == DUPLEX_HALF) ? 599 ETH_LINK_HALF_DUPLEX : ETH_LINK_FULL_DUPLEX); 600 dev_link.link_autoneg = !(dev->data->dev_conf.link_speeds & 601 ETH_LINK_SPEED_FIXED); 602 if ((dev_link.link_speed && !dev_link.link_status) || 603 (!dev_link.link_speed && dev_link.link_status)) { 604 rte_errno = EAGAIN; 605 return -rte_errno; 606 } 607 *link = dev_link; 608 return 0; 609 } 610 611 /** 612 * Retrieve physical link information (unlocked version using new ioctl). 613 * 614 * @param dev 615 * Pointer to Ethernet device structure. 616 * @param[out] link 617 * Storage for current link status. 618 * 619 * @return 620 * 0 on success, a negative errno value otherwise and rte_errno is set. 621 */ 622 static int 623 mlx5_link_update_unlocked_gs(struct rte_eth_dev *dev, 624 struct rte_eth_link *link) 625 626 { 627 struct priv *priv = dev->data->dev_private; 628 struct ethtool_link_settings gcmd = { .cmd = ETHTOOL_GLINKSETTINGS }; 629 struct ifreq ifr; 630 struct rte_eth_link dev_link; 631 uint64_t sc; 632 int ret; 633 634 ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr); 635 if (ret) { 636 DRV_LOG(WARNING, "port %u ioctl(SIOCGIFFLAGS) failed: %s", 637 dev->data->port_id, strerror(rte_errno)); 638 return ret; 639 } 640 memset(&dev_link, 0, sizeof(dev_link)); 641 dev_link.link_status = ((ifr.ifr_flags & IFF_UP) && 642 (ifr.ifr_flags & IFF_RUNNING)); 643 ifr.ifr_data = (void *)&gcmd; 644 ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr); 645 if (ret) { 646 DRV_LOG(DEBUG, 647 "port %u ioctl(SIOCETHTOOL, ETHTOOL_GLINKSETTINGS)" 648 " failed: %s", 649 dev->data->port_id, strerror(rte_errno)); 650 return ret; 651 } 652 gcmd.link_mode_masks_nwords = -gcmd.link_mode_masks_nwords; 653 654 alignas(struct ethtool_link_settings) 655 uint8_t data[offsetof(struct ethtool_link_settings, link_mode_masks) + 656 sizeof(uint32_t) * gcmd.link_mode_masks_nwords * 3]; 657 struct ethtool_link_settings *ecmd = (void *)data; 658 659 *ecmd = gcmd; 660 ifr.ifr_data = (void *)ecmd; 661 ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr); 662 if (ret) { 663 DRV_LOG(DEBUG, 664 "port %u ioctl(SIOCETHTOOL, ETHTOOL_GLINKSETTINGS)" 665 " failed: %s", 666 dev->data->port_id, strerror(rte_errno)); 667 return ret; 668 } 669 dev_link.link_speed = ecmd->speed; 670 sc = ecmd->link_mode_masks[0] | 671 ((uint64_t)ecmd->link_mode_masks[1] << 32); 672 priv->link_speed_capa = 0; 673 if (sc & MLX5_BITSHIFT(ETHTOOL_LINK_MODE_Autoneg_BIT)) 674 priv->link_speed_capa |= ETH_LINK_SPEED_AUTONEG; 675 if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_1000baseT_Full_BIT) | 676 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_1000baseKX_Full_BIT))) 677 priv->link_speed_capa |= ETH_LINK_SPEED_1G; 678 if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT) | 679 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_10000baseKR_Full_BIT) | 680 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_10000baseR_FEC_BIT))) 681 priv->link_speed_capa |= ETH_LINK_SPEED_10G; 682 if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_20000baseMLD2_Full_BIT) | 683 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_20000baseKR2_Full_BIT))) 684 priv->link_speed_capa |= ETH_LINK_SPEED_20G; 685 if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT) | 686 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT) | 687 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT) | 688 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT))) 689 priv->link_speed_capa |= ETH_LINK_SPEED_40G; 690 if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseKR4_Full_BIT) | 691 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseCR4_Full_BIT) | 692 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseSR4_Full_BIT) | 693 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseLR4_Full_BIT))) 694 priv->link_speed_capa |= ETH_LINK_SPEED_56G; 695 if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_25000baseCR_Full_BIT) | 696 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_25000baseKR_Full_BIT) | 697 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_25000baseSR_Full_BIT))) 698 priv->link_speed_capa |= ETH_LINK_SPEED_25G; 699 if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_50000baseCR2_Full_BIT) | 700 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_50000baseKR2_Full_BIT))) 701 priv->link_speed_capa |= ETH_LINK_SPEED_50G; 702 if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT) | 703 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseSR4_Full_BIT) | 704 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT) | 705 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT))) 706 priv->link_speed_capa |= ETH_LINK_SPEED_100G; 707 dev_link.link_duplex = ((ecmd->duplex == DUPLEX_HALF) ? 708 ETH_LINK_HALF_DUPLEX : ETH_LINK_FULL_DUPLEX); 709 dev_link.link_autoneg = !(dev->data->dev_conf.link_speeds & 710 ETH_LINK_SPEED_FIXED); 711 if ((dev_link.link_speed && !dev_link.link_status) || 712 (!dev_link.link_speed && dev_link.link_status)) { 713 rte_errno = EAGAIN; 714 return -rte_errno; 715 } 716 *link = dev_link; 717 return 0; 718 } 719 720 /** 721 * DPDK callback to retrieve physical link information. 722 * 723 * @param dev 724 * Pointer to Ethernet device structure. 725 * @param wait_to_complete 726 * Wait for request completion. 727 * 728 * @return 729 * 0 if link status was not updated, positive if it was, a negative errno 730 * value otherwise and rte_errno is set. 731 */ 732 int 733 mlx5_link_update(struct rte_eth_dev *dev, int wait_to_complete) 734 { 735 int ret; 736 struct rte_eth_link dev_link; 737 time_t start_time = time(NULL); 738 739 do { 740 ret = mlx5_link_update_unlocked_gs(dev, &dev_link); 741 if (ret) 742 ret = mlx5_link_update_unlocked_gset(dev, &dev_link); 743 if (ret == 0) 744 break; 745 /* Handle wait to complete situation. */ 746 if (wait_to_complete && ret == -EAGAIN) { 747 if (abs((int)difftime(time(NULL), start_time)) < 748 MLX5_LINK_STATUS_TIMEOUT) { 749 usleep(0); 750 continue; 751 } else { 752 rte_errno = EBUSY; 753 return -rte_errno; 754 } 755 } else if (ret < 0) { 756 return ret; 757 } 758 } while (wait_to_complete); 759 ret = !!memcmp(&dev->data->dev_link, &dev_link, 760 sizeof(struct rte_eth_link)); 761 dev->data->dev_link = dev_link; 762 return ret; 763 } 764 765 /** 766 * DPDK callback to change the MTU. 767 * 768 * @param dev 769 * Pointer to Ethernet device structure. 770 * @param in_mtu 771 * New MTU. 772 * 773 * @return 774 * 0 on success, a negative errno value otherwise and rte_errno is set. 775 */ 776 int 777 mlx5_dev_set_mtu(struct rte_eth_dev *dev, uint16_t mtu) 778 { 779 struct priv *priv = dev->data->dev_private; 780 uint16_t kern_mtu = 0; 781 int ret; 782 783 ret = mlx5_get_mtu(dev, &kern_mtu); 784 if (ret) 785 return ret; 786 /* Set kernel interface MTU first. */ 787 ret = mlx5_set_mtu(dev, mtu); 788 if (ret) 789 return ret; 790 ret = mlx5_get_mtu(dev, &kern_mtu); 791 if (ret) 792 return ret; 793 if (kern_mtu == mtu) { 794 priv->mtu = mtu; 795 DRV_LOG(DEBUG, "port %u adapter MTU set to %u", 796 dev->data->port_id, mtu); 797 return 0; 798 } 799 rte_errno = EAGAIN; 800 return -rte_errno; 801 } 802 803 /** 804 * DPDK callback to get flow control status. 805 * 806 * @param dev 807 * Pointer to Ethernet device structure. 808 * @param[out] fc_conf 809 * Flow control output buffer. 810 * 811 * @return 812 * 0 on success, a negative errno value otherwise and rte_errno is set. 813 */ 814 int 815 mlx5_dev_get_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf) 816 { 817 struct ifreq ifr; 818 struct ethtool_pauseparam ethpause = { 819 .cmd = ETHTOOL_GPAUSEPARAM 820 }; 821 int ret; 822 823 ifr.ifr_data = (void *)ðpause; 824 ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr); 825 if (ret) { 826 DRV_LOG(WARNING, 827 "port %u ioctl(SIOCETHTOOL, ETHTOOL_GPAUSEPARAM) failed:" 828 " %s", 829 dev->data->port_id, strerror(rte_errno)); 830 return ret; 831 } 832 fc_conf->autoneg = ethpause.autoneg; 833 if (ethpause.rx_pause && ethpause.tx_pause) 834 fc_conf->mode = RTE_FC_FULL; 835 else if (ethpause.rx_pause) 836 fc_conf->mode = RTE_FC_RX_PAUSE; 837 else if (ethpause.tx_pause) 838 fc_conf->mode = RTE_FC_TX_PAUSE; 839 else 840 fc_conf->mode = RTE_FC_NONE; 841 return 0; 842 } 843 844 /** 845 * DPDK callback to modify flow control parameters. 846 * 847 * @param dev 848 * Pointer to Ethernet device structure. 849 * @param[in] fc_conf 850 * Flow control parameters. 851 * 852 * @return 853 * 0 on success, a negative errno value otherwise and rte_errno is set. 854 */ 855 int 856 mlx5_dev_set_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf) 857 { 858 struct ifreq ifr; 859 struct ethtool_pauseparam ethpause = { 860 .cmd = ETHTOOL_SPAUSEPARAM 861 }; 862 int ret; 863 864 ifr.ifr_data = (void *)ðpause; 865 ethpause.autoneg = fc_conf->autoneg; 866 if (((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL) || 867 (fc_conf->mode & RTE_FC_RX_PAUSE)) 868 ethpause.rx_pause = 1; 869 else 870 ethpause.rx_pause = 0; 871 872 if (((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL) || 873 (fc_conf->mode & RTE_FC_TX_PAUSE)) 874 ethpause.tx_pause = 1; 875 else 876 ethpause.tx_pause = 0; 877 ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr); 878 if (ret) { 879 DRV_LOG(WARNING, 880 "port %u ioctl(SIOCETHTOOL, ETHTOOL_SPAUSEPARAM)" 881 " failed: %s", 882 dev->data->port_id, strerror(rte_errno)); 883 return ret; 884 } 885 return 0; 886 } 887 888 /** 889 * Get PCI information from struct ibv_device. 890 * 891 * @param device 892 * Pointer to Ethernet device structure. 893 * @param[out] pci_addr 894 * PCI bus address output buffer. 895 * 896 * @return 897 * 0 on success, a negative errno value otherwise and rte_errno is set. 898 */ 899 int 900 mlx5_ibv_device_to_pci_addr(const struct ibv_device *device, 901 struct rte_pci_addr *pci_addr) 902 { 903 FILE *file; 904 char line[32]; 905 MKSTR(path, "%s/device/uevent", device->ibdev_path); 906 907 file = fopen(path, "rb"); 908 if (file == NULL) { 909 rte_errno = errno; 910 return -rte_errno; 911 } 912 while (fgets(line, sizeof(line), file) == line) { 913 size_t len = strlen(line); 914 int ret; 915 916 /* Truncate long lines. */ 917 if (len == (sizeof(line) - 1)) 918 while (line[(len - 1)] != '\n') { 919 ret = fgetc(file); 920 if (ret == EOF) 921 break; 922 line[(len - 1)] = ret; 923 } 924 /* Extract information. */ 925 if (sscanf(line, 926 "PCI_SLOT_NAME=" 927 "%" SCNx32 ":%" SCNx8 ":%" SCNx8 ".%" SCNx8 "\n", 928 &pci_addr->domain, 929 &pci_addr->bus, 930 &pci_addr->devid, 931 &pci_addr->function) == 4) { 932 ret = 0; 933 break; 934 } 935 } 936 fclose(file); 937 return 0; 938 } 939 940 /** 941 * Device status handler. 942 * 943 * @param dev 944 * Pointer to Ethernet device. 945 * @param events 946 * Pointer to event flags holder. 947 * 948 * @return 949 * Events bitmap of callback process which can be called immediately. 950 */ 951 static uint32_t 952 mlx5_dev_status_handler(struct rte_eth_dev *dev) 953 { 954 struct priv *priv = dev->data->dev_private; 955 struct ibv_async_event event; 956 uint32_t ret = 0; 957 958 if (mlx5_link_update(dev, 0) == -EAGAIN) { 959 usleep(0); 960 return 0; 961 } 962 /* Read all message and acknowledge them. */ 963 for (;;) { 964 if (mlx5_glue->get_async_event(priv->ctx, &event)) 965 break; 966 if ((event.event_type == IBV_EVENT_PORT_ACTIVE || 967 event.event_type == IBV_EVENT_PORT_ERR) && 968 (dev->data->dev_conf.intr_conf.lsc == 1)) 969 ret |= (1 << RTE_ETH_EVENT_INTR_LSC); 970 else if (event.event_type == IBV_EVENT_DEVICE_FATAL && 971 dev->data->dev_conf.intr_conf.rmv == 1) 972 ret |= (1 << RTE_ETH_EVENT_INTR_RMV); 973 else 974 DRV_LOG(DEBUG, 975 "port %u event type %d on not handled", 976 dev->data->port_id, event.event_type); 977 mlx5_glue->ack_async_event(&event); 978 } 979 return ret; 980 } 981 982 /** 983 * Handle interrupts from the NIC. 984 * 985 * @param[in] intr_handle 986 * Interrupt handler. 987 * @param cb_arg 988 * Callback argument. 989 */ 990 void 991 mlx5_dev_interrupt_handler(void *cb_arg) 992 { 993 struct rte_eth_dev *dev = cb_arg; 994 uint32_t events; 995 996 events = mlx5_dev_status_handler(dev); 997 if (events & (1 << RTE_ETH_EVENT_INTR_LSC)) 998 _rte_eth_dev_callback_process(dev, RTE_ETH_EVENT_INTR_LSC, NULL); 999 if (events & (1 << RTE_ETH_EVENT_INTR_RMV)) 1000 _rte_eth_dev_callback_process(dev, RTE_ETH_EVENT_INTR_RMV, NULL); 1001 } 1002 1003 /** 1004 * Handle interrupts from the socket. 1005 * 1006 * @param cb_arg 1007 * Callback argument. 1008 */ 1009 static void 1010 mlx5_dev_handler_socket(void *cb_arg) 1011 { 1012 struct rte_eth_dev *dev = cb_arg; 1013 1014 mlx5_socket_handle(dev); 1015 } 1016 1017 /** 1018 * Uninstall interrupt handler. 1019 * 1020 * @param dev 1021 * Pointer to Ethernet device. 1022 */ 1023 void 1024 mlx5_dev_interrupt_handler_uninstall(struct rte_eth_dev *dev) 1025 { 1026 struct priv *priv = dev->data->dev_private; 1027 1028 if (dev->data->dev_conf.intr_conf.lsc || 1029 dev->data->dev_conf.intr_conf.rmv) 1030 rte_intr_callback_unregister(&priv->intr_handle, 1031 mlx5_dev_interrupt_handler, dev); 1032 if (priv->primary_socket) 1033 rte_intr_callback_unregister(&priv->intr_handle_socket, 1034 mlx5_dev_handler_socket, dev); 1035 priv->intr_handle.fd = 0; 1036 priv->intr_handle.type = RTE_INTR_HANDLE_UNKNOWN; 1037 priv->intr_handle_socket.fd = 0; 1038 priv->intr_handle_socket.type = RTE_INTR_HANDLE_UNKNOWN; 1039 } 1040 1041 /** 1042 * Install interrupt handler. 1043 * 1044 * @param dev 1045 * Pointer to Ethernet device. 1046 */ 1047 void 1048 mlx5_dev_interrupt_handler_install(struct rte_eth_dev *dev) 1049 { 1050 struct priv *priv = dev->data->dev_private; 1051 int ret; 1052 int flags; 1053 1054 assert(priv->ctx->async_fd > 0); 1055 flags = fcntl(priv->ctx->async_fd, F_GETFL); 1056 ret = fcntl(priv->ctx->async_fd, F_SETFL, flags | O_NONBLOCK); 1057 if (ret) { 1058 DRV_LOG(INFO, 1059 "port %u failed to change file descriptor async event" 1060 " queue", 1061 dev->data->port_id); 1062 dev->data->dev_conf.intr_conf.lsc = 0; 1063 dev->data->dev_conf.intr_conf.rmv = 0; 1064 } 1065 if (dev->data->dev_conf.intr_conf.lsc || 1066 dev->data->dev_conf.intr_conf.rmv) { 1067 priv->intr_handle.fd = priv->ctx->async_fd; 1068 priv->intr_handle.type = RTE_INTR_HANDLE_EXT; 1069 rte_intr_callback_register(&priv->intr_handle, 1070 mlx5_dev_interrupt_handler, dev); 1071 } 1072 ret = mlx5_socket_init(dev); 1073 if (ret) 1074 DRV_LOG(ERR, "port %u cannot initialise socket: %s", 1075 dev->data->port_id, strerror(rte_errno)); 1076 else if (priv->primary_socket) { 1077 priv->intr_handle_socket.fd = priv->primary_socket; 1078 priv->intr_handle_socket.type = RTE_INTR_HANDLE_EXT; 1079 rte_intr_callback_register(&priv->intr_handle_socket, 1080 mlx5_dev_handler_socket, dev); 1081 } 1082 } 1083 1084 /** 1085 * DPDK callback to bring the link DOWN. 1086 * 1087 * @param dev 1088 * Pointer to Ethernet device structure. 1089 * 1090 * @return 1091 * 0 on success, a negative errno value otherwise and rte_errno is set. 1092 */ 1093 int 1094 mlx5_set_link_down(struct rte_eth_dev *dev) 1095 { 1096 return mlx5_set_flags(dev, ~IFF_UP, ~IFF_UP); 1097 } 1098 1099 /** 1100 * DPDK callback to bring the link UP. 1101 * 1102 * @param dev 1103 * Pointer to Ethernet device structure. 1104 * 1105 * @return 1106 * 0 on success, a negative errno value otherwise and rte_errno is set. 1107 */ 1108 int 1109 mlx5_set_link_up(struct rte_eth_dev *dev) 1110 { 1111 return mlx5_set_flags(dev, ~IFF_UP, IFF_UP); 1112 } 1113 1114 /** 1115 * Configure the TX function to use. 1116 * 1117 * @param dev 1118 * Pointer to private data structure. 1119 * 1120 * @return 1121 * Pointer to selected Tx burst function. 1122 */ 1123 eth_tx_burst_t 1124 mlx5_select_tx_function(struct rte_eth_dev *dev) 1125 { 1126 struct priv *priv = dev->data->dev_private; 1127 eth_tx_burst_t tx_pkt_burst = mlx5_tx_burst; 1128 struct mlx5_dev_config *config = &priv->config; 1129 uint64_t tx_offloads = dev->data->dev_conf.txmode.offloads; 1130 int tso = !!(tx_offloads & (DEV_TX_OFFLOAD_TCP_TSO | 1131 DEV_TX_OFFLOAD_VXLAN_TNL_TSO | 1132 DEV_TX_OFFLOAD_GRE_TNL_TSO)); 1133 int swp = !!(tx_offloads & (DEV_TX_OFFLOAD_IP_TNL_TSO | 1134 DEV_TX_OFFLOAD_UDP_TNL_TSO | 1135 DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM)); 1136 int vlan_insert = !!(tx_offloads & DEV_TX_OFFLOAD_VLAN_INSERT); 1137 1138 assert(priv != NULL); 1139 /* Select appropriate TX function. */ 1140 if (vlan_insert || tso || swp) 1141 return tx_pkt_burst; 1142 if (config->mps == MLX5_MPW_ENHANCED) { 1143 if (mlx5_check_vec_tx_support(dev) > 0) { 1144 if (mlx5_check_raw_vec_tx_support(dev) > 0) 1145 tx_pkt_burst = mlx5_tx_burst_raw_vec; 1146 else 1147 tx_pkt_burst = mlx5_tx_burst_vec; 1148 DRV_LOG(DEBUG, 1149 "port %u selected enhanced MPW Tx vectorized" 1150 " function", 1151 dev->data->port_id); 1152 } else { 1153 tx_pkt_burst = mlx5_tx_burst_empw; 1154 DRV_LOG(DEBUG, 1155 "port %u selected enhanced MPW Tx function", 1156 dev->data->port_id); 1157 } 1158 } else if (config->mps && (config->txq_inline > 0)) { 1159 tx_pkt_burst = mlx5_tx_burst_mpw_inline; 1160 DRV_LOG(DEBUG, "port %u selected MPW inline Tx function", 1161 dev->data->port_id); 1162 } else if (config->mps) { 1163 tx_pkt_burst = mlx5_tx_burst_mpw; 1164 DRV_LOG(DEBUG, "port %u selected MPW Tx function", 1165 dev->data->port_id); 1166 } 1167 return tx_pkt_burst; 1168 } 1169 1170 /** 1171 * Configure the RX function to use. 1172 * 1173 * @param dev 1174 * Pointer to private data structure. 1175 * 1176 * @return 1177 * Pointer to selected Rx burst function. 1178 */ 1179 eth_rx_burst_t 1180 mlx5_select_rx_function(struct rte_eth_dev *dev) 1181 { 1182 eth_rx_burst_t rx_pkt_burst = mlx5_rx_burst; 1183 1184 assert(dev != NULL); 1185 if (mlx5_check_vec_rx_support(dev) > 0) { 1186 rx_pkt_burst = mlx5_rx_burst_vec; 1187 DRV_LOG(DEBUG, "port %u selected Rx vectorized function", 1188 dev->data->port_id); 1189 } 1190 return rx_pkt_burst; 1191 } 1192 1193 /** 1194 * Check if mlx5 device was removed. 1195 * 1196 * @param dev 1197 * Pointer to Ethernet device structure. 1198 * 1199 * @return 1200 * 1 when device is removed, otherwise 0. 1201 */ 1202 int 1203 mlx5_is_removed(struct rte_eth_dev *dev) 1204 { 1205 struct ibv_device_attr device_attr; 1206 struct priv *priv = dev->data->dev_private; 1207 1208 if (mlx5_glue->query_device(priv->ctx, &device_attr) == EIO) 1209 return 1; 1210 return 0; 1211 } 1212