1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright 2015 6WIND S.A. 3 * Copyright 2015 Mellanox Technologies, Ltd 4 */ 5 6 #define _GNU_SOURCE 7 8 #include <stddef.h> 9 #include <assert.h> 10 #include <inttypes.h> 11 #include <unistd.h> 12 #include <stdint.h> 13 #include <stdio.h> 14 #include <string.h> 15 #include <stdlib.h> 16 #include <errno.h> 17 #include <dirent.h> 18 #include <net/if.h> 19 #include <sys/ioctl.h> 20 #include <sys/socket.h> 21 #include <netinet/in.h> 22 #include <linux/ethtool.h> 23 #include <linux/sockios.h> 24 #include <fcntl.h> 25 #include <stdalign.h> 26 #include <sys/un.h> 27 #include <time.h> 28 29 #include <rte_atomic.h> 30 #include <rte_ethdev_driver.h> 31 #include <rte_bus_pci.h> 32 #include <rte_mbuf.h> 33 #include <rte_common.h> 34 #include <rte_interrupts.h> 35 #include <rte_malloc.h> 36 #include <rte_string_fns.h> 37 38 #include "mlx5.h" 39 #include "mlx5_glue.h" 40 #include "mlx5_rxtx.h" 41 #include "mlx5_utils.h" 42 43 /* Add defines in case the running kernel is not the same as user headers. */ 44 #ifndef ETHTOOL_GLINKSETTINGS 45 struct ethtool_link_settings { 46 uint32_t cmd; 47 uint32_t speed; 48 uint8_t duplex; 49 uint8_t port; 50 uint8_t phy_address; 51 uint8_t autoneg; 52 uint8_t mdio_support; 53 uint8_t eth_to_mdix; 54 uint8_t eth_tp_mdix_ctrl; 55 int8_t link_mode_masks_nwords; 56 uint32_t reserved[8]; 57 uint32_t link_mode_masks[]; 58 }; 59 60 #define ETHTOOL_GLINKSETTINGS 0x0000004c 61 #define ETHTOOL_LINK_MODE_1000baseT_Full_BIT 5 62 #define ETHTOOL_LINK_MODE_Autoneg_BIT 6 63 #define ETHTOOL_LINK_MODE_1000baseKX_Full_BIT 17 64 #define ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT 18 65 #define ETHTOOL_LINK_MODE_10000baseKR_Full_BIT 19 66 #define ETHTOOL_LINK_MODE_10000baseR_FEC_BIT 20 67 #define ETHTOOL_LINK_MODE_20000baseMLD2_Full_BIT 21 68 #define ETHTOOL_LINK_MODE_20000baseKR2_Full_BIT 22 69 #define ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT 23 70 #define ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT 24 71 #define ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT 25 72 #define ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT 26 73 #define ETHTOOL_LINK_MODE_56000baseKR4_Full_BIT 27 74 #define ETHTOOL_LINK_MODE_56000baseCR4_Full_BIT 28 75 #define ETHTOOL_LINK_MODE_56000baseSR4_Full_BIT 29 76 #define ETHTOOL_LINK_MODE_56000baseLR4_Full_BIT 30 77 #endif 78 #ifndef HAVE_ETHTOOL_LINK_MODE_25G 79 #define ETHTOOL_LINK_MODE_25000baseCR_Full_BIT 31 80 #define ETHTOOL_LINK_MODE_25000baseKR_Full_BIT 32 81 #define ETHTOOL_LINK_MODE_25000baseSR_Full_BIT 33 82 #endif 83 #ifndef HAVE_ETHTOOL_LINK_MODE_50G 84 #define ETHTOOL_LINK_MODE_50000baseCR2_Full_BIT 34 85 #define ETHTOOL_LINK_MODE_50000baseKR2_Full_BIT 35 86 #endif 87 #ifndef HAVE_ETHTOOL_LINK_MODE_100G 88 #define ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT 36 89 #define ETHTOOL_LINK_MODE_100000baseSR4_Full_BIT 37 90 #define ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT 38 91 #define ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT 39 92 #endif 93 94 /** 95 * Get interface name from private structure. 96 * 97 * @param[in] dev 98 * Pointer to Ethernet device. 99 * @param[out] ifname 100 * Interface name output buffer. 101 * 102 * @return 103 * 0 on success, a negative errno value otherwise and rte_errno is set. 104 */ 105 int 106 mlx5_get_ifname(const struct rte_eth_dev *dev, char (*ifname)[IF_NAMESIZE]) 107 { 108 struct priv *priv = dev->data->dev_private; 109 DIR *dir; 110 struct dirent *dent; 111 unsigned int dev_type = 0; 112 unsigned int dev_port_prev = ~0u; 113 char match[IF_NAMESIZE] = ""; 114 115 { 116 MKSTR(path, "%s/device/net", priv->ibdev_path); 117 118 dir = opendir(path); 119 if (dir == NULL) { 120 rte_errno = errno; 121 return -rte_errno; 122 } 123 } 124 while ((dent = readdir(dir)) != NULL) { 125 char *name = dent->d_name; 126 FILE *file; 127 unsigned int dev_port; 128 int r; 129 130 if ((name[0] == '.') && 131 ((name[1] == '\0') || 132 ((name[1] == '.') && (name[2] == '\0')))) 133 continue; 134 135 MKSTR(path, "%s/device/net/%s/%s", 136 priv->ibdev_path, name, 137 (dev_type ? "dev_id" : "dev_port")); 138 139 file = fopen(path, "rb"); 140 if (file == NULL) { 141 if (errno != ENOENT) 142 continue; 143 /* 144 * Switch to dev_id when dev_port does not exist as 145 * is the case with Linux kernel versions < 3.15. 146 */ 147 try_dev_id: 148 match[0] = '\0'; 149 if (dev_type) 150 break; 151 dev_type = 1; 152 dev_port_prev = ~0u; 153 rewinddir(dir); 154 continue; 155 } 156 r = fscanf(file, (dev_type ? "%x" : "%u"), &dev_port); 157 fclose(file); 158 if (r != 1) 159 continue; 160 /* 161 * Switch to dev_id when dev_port returns the same value for 162 * all ports. May happen when using a MOFED release older than 163 * 3.0 with a Linux kernel >= 3.15. 164 */ 165 if (dev_port == dev_port_prev) 166 goto try_dev_id; 167 dev_port_prev = dev_port; 168 if (dev_port == (priv->port - 1u)) 169 strlcpy(match, name, sizeof(match)); 170 } 171 closedir(dir); 172 if (match[0] == '\0') { 173 rte_errno = ENOENT; 174 return -rte_errno; 175 } 176 strncpy(*ifname, match, sizeof(*ifname)); 177 return 0; 178 } 179 180 /** 181 * Get the interface index from device name. 182 * 183 * @param[in] dev 184 * Pointer to Ethernet device. 185 * 186 * @return 187 * Interface index on success, a negative errno value otherwise and 188 * rte_errno is set. 189 */ 190 int 191 mlx5_ifindex(const struct rte_eth_dev *dev) 192 { 193 char ifname[IF_NAMESIZE]; 194 int ret; 195 196 ret = mlx5_get_ifname(dev, &ifname); 197 if (ret) 198 return ret; 199 ret = if_nametoindex(ifname); 200 if (ret == -1) { 201 rte_errno = errno; 202 return -rte_errno; 203 } 204 return ret; 205 } 206 207 /** 208 * Perform ifreq ioctl() on associated Ethernet device. 209 * 210 * @param[in] dev 211 * Pointer to Ethernet device. 212 * @param req 213 * Request number to pass to ioctl(). 214 * @param[out] ifr 215 * Interface request structure output buffer. 216 * 217 * @return 218 * 0 on success, a negative errno value otherwise and rte_errno is set. 219 */ 220 int 221 mlx5_ifreq(const struct rte_eth_dev *dev, int req, struct ifreq *ifr) 222 { 223 int sock = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP); 224 int ret = 0; 225 226 if (sock == -1) { 227 rte_errno = errno; 228 return -rte_errno; 229 } 230 ret = mlx5_get_ifname(dev, &ifr->ifr_name); 231 if (ret) 232 goto error; 233 ret = ioctl(sock, req, ifr); 234 if (ret == -1) { 235 rte_errno = errno; 236 goto error; 237 } 238 close(sock); 239 return 0; 240 error: 241 close(sock); 242 return -rte_errno; 243 } 244 245 /** 246 * Get device MTU. 247 * 248 * @param dev 249 * Pointer to Ethernet device. 250 * @param[out] mtu 251 * MTU value output buffer. 252 * 253 * @return 254 * 0 on success, a negative errno value otherwise and rte_errno is set. 255 */ 256 int 257 mlx5_get_mtu(struct rte_eth_dev *dev, uint16_t *mtu) 258 { 259 struct ifreq request; 260 int ret = mlx5_ifreq(dev, SIOCGIFMTU, &request); 261 262 if (ret) 263 return ret; 264 *mtu = request.ifr_mtu; 265 return 0; 266 } 267 268 /** 269 * Set device MTU. 270 * 271 * @param dev 272 * Pointer to Ethernet device. 273 * @param mtu 274 * MTU value to set. 275 * 276 * @return 277 * 0 on success, a negative errno value otherwise and rte_errno is set. 278 */ 279 static int 280 mlx5_set_mtu(struct rte_eth_dev *dev, uint16_t mtu) 281 { 282 struct ifreq request = { .ifr_mtu = mtu, }; 283 284 return mlx5_ifreq(dev, SIOCSIFMTU, &request); 285 } 286 287 /** 288 * Set device flags. 289 * 290 * @param dev 291 * Pointer to Ethernet device. 292 * @param keep 293 * Bitmask for flags that must remain untouched. 294 * @param flags 295 * Bitmask for flags to modify. 296 * 297 * @return 298 * 0 on success, a negative errno value otherwise and rte_errno is set. 299 */ 300 int 301 mlx5_set_flags(struct rte_eth_dev *dev, unsigned int keep, unsigned int flags) 302 { 303 struct ifreq request; 304 int ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &request); 305 306 if (ret) 307 return ret; 308 request.ifr_flags &= keep; 309 request.ifr_flags |= flags & ~keep; 310 return mlx5_ifreq(dev, SIOCSIFFLAGS, &request); 311 } 312 313 /** 314 * DPDK callback for Ethernet device configuration. 315 * 316 * @param dev 317 * Pointer to Ethernet device structure. 318 * 319 * @return 320 * 0 on success, a negative errno value otherwise and rte_errno is set. 321 */ 322 int 323 mlx5_dev_configure(struct rte_eth_dev *dev) 324 { 325 struct priv *priv = dev->data->dev_private; 326 unsigned int rxqs_n = dev->data->nb_rx_queues; 327 unsigned int txqs_n = dev->data->nb_tx_queues; 328 unsigned int i; 329 unsigned int j; 330 unsigned int reta_idx_n; 331 const uint8_t use_app_rss_key = 332 !!dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key; 333 int ret = 0; 334 335 if (use_app_rss_key && 336 (dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key_len != 337 rss_hash_default_key_len)) { 338 DRV_LOG(ERR, "port %u RSS key len must be %zu Bytes long", 339 dev->data->port_id, rss_hash_default_key_len); 340 rte_errno = EINVAL; 341 return -rte_errno; 342 } 343 priv->rss_conf.rss_key = 344 rte_realloc(priv->rss_conf.rss_key, 345 rss_hash_default_key_len, 0); 346 if (!priv->rss_conf.rss_key) { 347 DRV_LOG(ERR, "port %u cannot allocate RSS hash key memory (%u)", 348 dev->data->port_id, rxqs_n); 349 rte_errno = ENOMEM; 350 return -rte_errno; 351 } 352 memcpy(priv->rss_conf.rss_key, 353 use_app_rss_key ? 354 dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key : 355 rss_hash_default_key, 356 rss_hash_default_key_len); 357 priv->rss_conf.rss_key_len = rss_hash_default_key_len; 358 priv->rss_conf.rss_hf = dev->data->dev_conf.rx_adv_conf.rss_conf.rss_hf; 359 priv->rxqs = (void *)dev->data->rx_queues; 360 priv->txqs = (void *)dev->data->tx_queues; 361 if (txqs_n != priv->txqs_n) { 362 DRV_LOG(INFO, "port %u Tx queues number update: %u -> %u", 363 dev->data->port_id, priv->txqs_n, txqs_n); 364 priv->txqs_n = txqs_n; 365 } 366 if (rxqs_n > priv->config.ind_table_max_size) { 367 DRV_LOG(ERR, "port %u cannot handle this many Rx queues (%u)", 368 dev->data->port_id, rxqs_n); 369 rte_errno = EINVAL; 370 return -rte_errno; 371 } 372 if (rxqs_n == priv->rxqs_n) 373 return 0; 374 DRV_LOG(INFO, "port %u Rx queues number update: %u -> %u", 375 dev->data->port_id, priv->rxqs_n, rxqs_n); 376 priv->rxqs_n = rxqs_n; 377 /* If the requested number of RX queues is not a power of two, use the 378 * maximum indirection table size for better balancing. 379 * The result is always rounded to the next power of two. */ 380 reta_idx_n = (1 << log2above((rxqs_n & (rxqs_n - 1)) ? 381 priv->config.ind_table_max_size : 382 rxqs_n)); 383 ret = mlx5_rss_reta_index_resize(dev, reta_idx_n); 384 if (ret) 385 return ret; 386 /* When the number of RX queues is not a power of two, the remaining 387 * table entries are padded with reused WQs and hashes are not spread 388 * uniformly. */ 389 for (i = 0, j = 0; (i != reta_idx_n); ++i) { 390 (*priv->reta_idx)[i] = j; 391 if (++j == rxqs_n) 392 j = 0; 393 } 394 return 0; 395 } 396 397 /** 398 * Sets default tuning parameters. 399 * 400 * @param dev 401 * Pointer to Ethernet device. 402 * @param[out] info 403 * Info structure output buffer. 404 */ 405 static void 406 mlx5_set_default_params(struct rte_eth_dev *dev, struct rte_eth_dev_info *info) 407 { 408 struct priv *priv = dev->data->dev_private; 409 410 /* Minimum CPU utilization. */ 411 info->default_rxportconf.ring_size = 256; 412 info->default_txportconf.ring_size = 256; 413 info->default_rxportconf.burst_size = 64; 414 info->default_txportconf.burst_size = 64; 415 if (priv->link_speed_capa & ETH_LINK_SPEED_100G) { 416 info->default_rxportconf.nb_queues = 16; 417 info->default_txportconf.nb_queues = 16; 418 if (dev->data->nb_rx_queues > 2 || 419 dev->data->nb_tx_queues > 2) { 420 /* Max Throughput. */ 421 info->default_rxportconf.ring_size = 2048; 422 info->default_txportconf.ring_size = 2048; 423 } 424 } else { 425 info->default_rxportconf.nb_queues = 8; 426 info->default_txportconf.nb_queues = 8; 427 if (dev->data->nb_rx_queues > 2 || 428 dev->data->nb_tx_queues > 2) { 429 /* Max Throughput. */ 430 info->default_rxportconf.ring_size = 4096; 431 info->default_txportconf.ring_size = 4096; 432 } 433 } 434 } 435 436 /** 437 * DPDK callback to get information about the device. 438 * 439 * @param dev 440 * Pointer to Ethernet device structure. 441 * @param[out] info 442 * Info structure output buffer. 443 */ 444 void 445 mlx5_dev_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *info) 446 { 447 struct priv *priv = dev->data->dev_private; 448 struct mlx5_dev_config *config = &priv->config; 449 unsigned int max; 450 char ifname[IF_NAMESIZE]; 451 452 /* FIXME: we should ask the device for these values. */ 453 info->min_rx_bufsize = 32; 454 info->max_rx_pktlen = 65536; 455 /* 456 * Since we need one CQ per QP, the limit is the minimum number 457 * between the two values. 458 */ 459 max = RTE_MIN(priv->device_attr.orig_attr.max_cq, 460 priv->device_attr.orig_attr.max_qp); 461 /* If max >= 65535 then max = 0, max_rx_queues is uint16_t. */ 462 if (max >= 65535) 463 max = 65535; 464 info->max_rx_queues = max; 465 info->max_tx_queues = max; 466 info->max_mac_addrs = MLX5_MAX_UC_MAC_ADDRESSES; 467 info->rx_queue_offload_capa = mlx5_get_rx_queue_offloads(dev); 468 info->rx_offload_capa = (mlx5_get_rx_port_offloads() | 469 info->rx_queue_offload_capa); 470 info->tx_offload_capa = mlx5_get_tx_port_offloads(dev); 471 if (mlx5_get_ifname(dev, &ifname) == 0) 472 info->if_index = if_nametoindex(ifname); 473 info->reta_size = priv->reta_idx_n ? 474 priv->reta_idx_n : config->ind_table_max_size; 475 info->hash_key_size = rss_hash_default_key_len; 476 info->speed_capa = priv->link_speed_capa; 477 info->flow_type_rss_offloads = ~MLX5_RSS_HF_MASK; 478 mlx5_set_default_params(dev, info); 479 } 480 481 /** 482 * Get supported packet types. 483 * 484 * @param dev 485 * Pointer to Ethernet device structure. 486 * 487 * @return 488 * A pointer to the supported Packet types array. 489 */ 490 const uint32_t * 491 mlx5_dev_supported_ptypes_get(struct rte_eth_dev *dev) 492 { 493 static const uint32_t ptypes[] = { 494 /* refers to rxq_cq_to_pkt_type() */ 495 RTE_PTYPE_L2_ETHER, 496 RTE_PTYPE_L3_IPV4_EXT_UNKNOWN, 497 RTE_PTYPE_L3_IPV6_EXT_UNKNOWN, 498 RTE_PTYPE_L4_NONFRAG, 499 RTE_PTYPE_L4_FRAG, 500 RTE_PTYPE_L4_TCP, 501 RTE_PTYPE_L4_UDP, 502 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN, 503 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN, 504 RTE_PTYPE_INNER_L4_NONFRAG, 505 RTE_PTYPE_INNER_L4_FRAG, 506 RTE_PTYPE_INNER_L4_TCP, 507 RTE_PTYPE_INNER_L4_UDP, 508 RTE_PTYPE_UNKNOWN 509 }; 510 511 if (dev->rx_pkt_burst == mlx5_rx_burst || 512 dev->rx_pkt_burst == mlx5_rx_burst_vec) 513 return ptypes; 514 return NULL; 515 } 516 517 /** 518 * DPDK callback to retrieve physical link information. 519 * 520 * @param dev 521 * Pointer to Ethernet device structure. 522 * @param[out] link 523 * Storage for current link status. 524 * 525 * @return 526 * 0 on success, a negative errno value otherwise and rte_errno is set. 527 */ 528 static int 529 mlx5_link_update_unlocked_gset(struct rte_eth_dev *dev, 530 struct rte_eth_link *link) 531 { 532 struct priv *priv = dev->data->dev_private; 533 struct ethtool_cmd edata = { 534 .cmd = ETHTOOL_GSET /* Deprecated since Linux v4.5. */ 535 }; 536 struct ifreq ifr; 537 struct rte_eth_link dev_link; 538 int link_speed = 0; 539 int ret; 540 541 ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr); 542 if (ret) { 543 DRV_LOG(WARNING, "port %u ioctl(SIOCGIFFLAGS) failed: %s", 544 dev->data->port_id, strerror(rte_errno)); 545 return ret; 546 } 547 memset(&dev_link, 0, sizeof(dev_link)); 548 dev_link.link_status = ((ifr.ifr_flags & IFF_UP) && 549 (ifr.ifr_flags & IFF_RUNNING)); 550 ifr.ifr_data = (void *)&edata; 551 ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr); 552 if (ret) { 553 DRV_LOG(WARNING, 554 "port %u ioctl(SIOCETHTOOL, ETHTOOL_GSET) failed: %s", 555 dev->data->port_id, strerror(rte_errno)); 556 return ret; 557 } 558 link_speed = ethtool_cmd_speed(&edata); 559 if (link_speed == -1) 560 dev_link.link_speed = ETH_SPEED_NUM_NONE; 561 else 562 dev_link.link_speed = link_speed; 563 priv->link_speed_capa = 0; 564 if (edata.supported & SUPPORTED_Autoneg) 565 priv->link_speed_capa |= ETH_LINK_SPEED_AUTONEG; 566 if (edata.supported & (SUPPORTED_1000baseT_Full | 567 SUPPORTED_1000baseKX_Full)) 568 priv->link_speed_capa |= ETH_LINK_SPEED_1G; 569 if (edata.supported & SUPPORTED_10000baseKR_Full) 570 priv->link_speed_capa |= ETH_LINK_SPEED_10G; 571 if (edata.supported & (SUPPORTED_40000baseKR4_Full | 572 SUPPORTED_40000baseCR4_Full | 573 SUPPORTED_40000baseSR4_Full | 574 SUPPORTED_40000baseLR4_Full)) 575 priv->link_speed_capa |= ETH_LINK_SPEED_40G; 576 dev_link.link_duplex = ((edata.duplex == DUPLEX_HALF) ? 577 ETH_LINK_HALF_DUPLEX : ETH_LINK_FULL_DUPLEX); 578 dev_link.link_autoneg = !(dev->data->dev_conf.link_speeds & 579 ETH_LINK_SPEED_FIXED); 580 if ((dev_link.link_speed && !dev_link.link_status) || 581 (!dev_link.link_speed && dev_link.link_status)) { 582 rte_errno = EAGAIN; 583 return -rte_errno; 584 } 585 *link = dev_link; 586 return 0; 587 } 588 589 /** 590 * Retrieve physical link information (unlocked version using new ioctl). 591 * 592 * @param dev 593 * Pointer to Ethernet device structure. 594 * @param[out] link 595 * Storage for current link status. 596 * 597 * @return 598 * 0 on success, a negative errno value otherwise and rte_errno is set. 599 */ 600 static int 601 mlx5_link_update_unlocked_gs(struct rte_eth_dev *dev, 602 struct rte_eth_link *link) 603 604 { 605 struct priv *priv = dev->data->dev_private; 606 struct ethtool_link_settings gcmd = { .cmd = ETHTOOL_GLINKSETTINGS }; 607 struct ifreq ifr; 608 struct rte_eth_link dev_link; 609 uint64_t sc; 610 int ret; 611 612 ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr); 613 if (ret) { 614 DRV_LOG(WARNING, "port %u ioctl(SIOCGIFFLAGS) failed: %s", 615 dev->data->port_id, strerror(rte_errno)); 616 return ret; 617 } 618 memset(&dev_link, 0, sizeof(dev_link)); 619 dev_link.link_status = ((ifr.ifr_flags & IFF_UP) && 620 (ifr.ifr_flags & IFF_RUNNING)); 621 ifr.ifr_data = (void *)&gcmd; 622 ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr); 623 if (ret) { 624 DRV_LOG(DEBUG, 625 "port %u ioctl(SIOCETHTOOL, ETHTOOL_GLINKSETTINGS)" 626 " failed: %s", 627 dev->data->port_id, strerror(rte_errno)); 628 return ret; 629 } 630 gcmd.link_mode_masks_nwords = -gcmd.link_mode_masks_nwords; 631 632 alignas(struct ethtool_link_settings) 633 uint8_t data[offsetof(struct ethtool_link_settings, link_mode_masks) + 634 sizeof(uint32_t) * gcmd.link_mode_masks_nwords * 3]; 635 struct ethtool_link_settings *ecmd = (void *)data; 636 637 *ecmd = gcmd; 638 ifr.ifr_data = (void *)ecmd; 639 ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr); 640 if (ret) { 641 DRV_LOG(DEBUG, 642 "port %u ioctl(SIOCETHTOOL, ETHTOOL_GLINKSETTINGS)" 643 " failed: %s", 644 dev->data->port_id, strerror(rte_errno)); 645 return ret; 646 } 647 dev_link.link_speed = ecmd->speed; 648 sc = ecmd->link_mode_masks[0] | 649 ((uint64_t)ecmd->link_mode_masks[1] << 32); 650 priv->link_speed_capa = 0; 651 if (sc & MLX5_BITSHIFT(ETHTOOL_LINK_MODE_Autoneg_BIT)) 652 priv->link_speed_capa |= ETH_LINK_SPEED_AUTONEG; 653 if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_1000baseT_Full_BIT) | 654 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_1000baseKX_Full_BIT))) 655 priv->link_speed_capa |= ETH_LINK_SPEED_1G; 656 if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT) | 657 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_10000baseKR_Full_BIT) | 658 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_10000baseR_FEC_BIT))) 659 priv->link_speed_capa |= ETH_LINK_SPEED_10G; 660 if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_20000baseMLD2_Full_BIT) | 661 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_20000baseKR2_Full_BIT))) 662 priv->link_speed_capa |= ETH_LINK_SPEED_20G; 663 if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT) | 664 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT) | 665 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT) | 666 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT))) 667 priv->link_speed_capa |= ETH_LINK_SPEED_40G; 668 if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseKR4_Full_BIT) | 669 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseCR4_Full_BIT) | 670 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseSR4_Full_BIT) | 671 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseLR4_Full_BIT))) 672 priv->link_speed_capa |= ETH_LINK_SPEED_56G; 673 if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_25000baseCR_Full_BIT) | 674 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_25000baseKR_Full_BIT) | 675 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_25000baseSR_Full_BIT))) 676 priv->link_speed_capa |= ETH_LINK_SPEED_25G; 677 if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_50000baseCR2_Full_BIT) | 678 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_50000baseKR2_Full_BIT))) 679 priv->link_speed_capa |= ETH_LINK_SPEED_50G; 680 if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT) | 681 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseSR4_Full_BIT) | 682 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT) | 683 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT))) 684 priv->link_speed_capa |= ETH_LINK_SPEED_100G; 685 dev_link.link_duplex = ((ecmd->duplex == DUPLEX_HALF) ? 686 ETH_LINK_HALF_DUPLEX : ETH_LINK_FULL_DUPLEX); 687 dev_link.link_autoneg = !(dev->data->dev_conf.link_speeds & 688 ETH_LINK_SPEED_FIXED); 689 if ((dev_link.link_speed && !dev_link.link_status) || 690 (!dev_link.link_speed && dev_link.link_status)) { 691 rte_errno = EAGAIN; 692 return -rte_errno; 693 } 694 *link = dev_link; 695 return 0; 696 } 697 698 /** 699 * DPDK callback to retrieve physical link information. 700 * 701 * @param dev 702 * Pointer to Ethernet device structure. 703 * @param wait_to_complete 704 * Wait for request completion. 705 * 706 * @return 707 * 0 if link status was not updated, positive if it was, a negative errno 708 * value otherwise and rte_errno is set. 709 */ 710 int 711 mlx5_link_update(struct rte_eth_dev *dev, int wait_to_complete) 712 { 713 int ret; 714 struct rte_eth_link dev_link; 715 time_t start_time = time(NULL); 716 717 do { 718 ret = mlx5_link_update_unlocked_gs(dev, &dev_link); 719 if (ret) 720 ret = mlx5_link_update_unlocked_gset(dev, &dev_link); 721 if (ret == 0) 722 break; 723 /* Handle wait to complete situation. */ 724 if (wait_to_complete && ret == -EAGAIN) { 725 if (abs((int)difftime(time(NULL), start_time)) < 726 MLX5_LINK_STATUS_TIMEOUT) { 727 usleep(0); 728 continue; 729 } else { 730 rte_errno = EBUSY; 731 return -rte_errno; 732 } 733 } else if (ret < 0) { 734 return ret; 735 } 736 } while (wait_to_complete); 737 ret = !!memcmp(&dev->data->dev_link, &dev_link, 738 sizeof(struct rte_eth_link)); 739 dev->data->dev_link = dev_link; 740 return ret; 741 } 742 743 /** 744 * DPDK callback to change the MTU. 745 * 746 * @param dev 747 * Pointer to Ethernet device structure. 748 * @param in_mtu 749 * New MTU. 750 * 751 * @return 752 * 0 on success, a negative errno value otherwise and rte_errno is set. 753 */ 754 int 755 mlx5_dev_set_mtu(struct rte_eth_dev *dev, uint16_t mtu) 756 { 757 struct priv *priv = dev->data->dev_private; 758 uint16_t kern_mtu = 0; 759 int ret; 760 761 ret = mlx5_get_mtu(dev, &kern_mtu); 762 if (ret) 763 return ret; 764 /* Set kernel interface MTU first. */ 765 ret = mlx5_set_mtu(dev, mtu); 766 if (ret) 767 return ret; 768 ret = mlx5_get_mtu(dev, &kern_mtu); 769 if (ret) 770 return ret; 771 if (kern_mtu == mtu) { 772 priv->mtu = mtu; 773 DRV_LOG(DEBUG, "port %u adapter MTU set to %u", 774 dev->data->port_id, mtu); 775 return 0; 776 } 777 rte_errno = EAGAIN; 778 return -rte_errno; 779 } 780 781 /** 782 * DPDK callback to get flow control status. 783 * 784 * @param dev 785 * Pointer to Ethernet device structure. 786 * @param[out] fc_conf 787 * Flow control output buffer. 788 * 789 * @return 790 * 0 on success, a negative errno value otherwise and rte_errno is set. 791 */ 792 int 793 mlx5_dev_get_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf) 794 { 795 struct ifreq ifr; 796 struct ethtool_pauseparam ethpause = { 797 .cmd = ETHTOOL_GPAUSEPARAM 798 }; 799 int ret; 800 801 ifr.ifr_data = (void *)ðpause; 802 ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr); 803 if (ret) { 804 DRV_LOG(WARNING, 805 "port %u ioctl(SIOCETHTOOL, ETHTOOL_GPAUSEPARAM) failed:" 806 " %s", 807 dev->data->port_id, strerror(rte_errno)); 808 return ret; 809 } 810 fc_conf->autoneg = ethpause.autoneg; 811 if (ethpause.rx_pause && ethpause.tx_pause) 812 fc_conf->mode = RTE_FC_FULL; 813 else if (ethpause.rx_pause) 814 fc_conf->mode = RTE_FC_RX_PAUSE; 815 else if (ethpause.tx_pause) 816 fc_conf->mode = RTE_FC_TX_PAUSE; 817 else 818 fc_conf->mode = RTE_FC_NONE; 819 return 0; 820 } 821 822 /** 823 * DPDK callback to modify flow control parameters. 824 * 825 * @param dev 826 * Pointer to Ethernet device structure. 827 * @param[in] fc_conf 828 * Flow control parameters. 829 * 830 * @return 831 * 0 on success, a negative errno value otherwise and rte_errno is set. 832 */ 833 int 834 mlx5_dev_set_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf) 835 { 836 struct ifreq ifr; 837 struct ethtool_pauseparam ethpause = { 838 .cmd = ETHTOOL_SPAUSEPARAM 839 }; 840 int ret; 841 842 ifr.ifr_data = (void *)ðpause; 843 ethpause.autoneg = fc_conf->autoneg; 844 if (((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL) || 845 (fc_conf->mode & RTE_FC_RX_PAUSE)) 846 ethpause.rx_pause = 1; 847 else 848 ethpause.rx_pause = 0; 849 850 if (((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL) || 851 (fc_conf->mode & RTE_FC_TX_PAUSE)) 852 ethpause.tx_pause = 1; 853 else 854 ethpause.tx_pause = 0; 855 ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr); 856 if (ret) { 857 DRV_LOG(WARNING, 858 "port %u ioctl(SIOCETHTOOL, ETHTOOL_SPAUSEPARAM)" 859 " failed: %s", 860 dev->data->port_id, strerror(rte_errno)); 861 return ret; 862 } 863 return 0; 864 } 865 866 /** 867 * Get PCI information from struct ibv_device. 868 * 869 * @param device 870 * Pointer to Ethernet device structure. 871 * @param[out] pci_addr 872 * PCI bus address output buffer. 873 * 874 * @return 875 * 0 on success, a negative errno value otherwise and rte_errno is set. 876 */ 877 int 878 mlx5_ibv_device_to_pci_addr(const struct ibv_device *device, 879 struct rte_pci_addr *pci_addr) 880 { 881 FILE *file; 882 char line[32]; 883 MKSTR(path, "%s/device/uevent", device->ibdev_path); 884 885 file = fopen(path, "rb"); 886 if (file == NULL) { 887 rte_errno = errno; 888 return -rte_errno; 889 } 890 while (fgets(line, sizeof(line), file) == line) { 891 size_t len = strlen(line); 892 int ret; 893 894 /* Truncate long lines. */ 895 if (len == (sizeof(line) - 1)) 896 while (line[(len - 1)] != '\n') { 897 ret = fgetc(file); 898 if (ret == EOF) 899 break; 900 line[(len - 1)] = ret; 901 } 902 /* Extract information. */ 903 if (sscanf(line, 904 "PCI_SLOT_NAME=" 905 "%" SCNx32 ":%" SCNx8 ":%" SCNx8 ".%" SCNx8 "\n", 906 &pci_addr->domain, 907 &pci_addr->bus, 908 &pci_addr->devid, 909 &pci_addr->function) == 4) { 910 ret = 0; 911 break; 912 } 913 } 914 fclose(file); 915 return 0; 916 } 917 918 /** 919 * Device status handler. 920 * 921 * @param dev 922 * Pointer to Ethernet device. 923 * @param events 924 * Pointer to event flags holder. 925 * 926 * @return 927 * Events bitmap of callback process which can be called immediately. 928 */ 929 static uint32_t 930 mlx5_dev_status_handler(struct rte_eth_dev *dev) 931 { 932 struct priv *priv = dev->data->dev_private; 933 struct ibv_async_event event; 934 uint32_t ret = 0; 935 936 if (mlx5_link_update(dev, 0) == -EAGAIN) { 937 usleep(0); 938 return 0; 939 } 940 /* Read all message and acknowledge them. */ 941 for (;;) { 942 if (mlx5_glue->get_async_event(priv->ctx, &event)) 943 break; 944 if ((event.event_type == IBV_EVENT_PORT_ACTIVE || 945 event.event_type == IBV_EVENT_PORT_ERR) && 946 (dev->data->dev_conf.intr_conf.lsc == 1)) 947 ret |= (1 << RTE_ETH_EVENT_INTR_LSC); 948 else if (event.event_type == IBV_EVENT_DEVICE_FATAL && 949 dev->data->dev_conf.intr_conf.rmv == 1) 950 ret |= (1 << RTE_ETH_EVENT_INTR_RMV); 951 else 952 DRV_LOG(DEBUG, 953 "port %u event type %d on not handled", 954 dev->data->port_id, event.event_type); 955 mlx5_glue->ack_async_event(&event); 956 } 957 return ret; 958 } 959 960 /** 961 * Handle interrupts from the NIC. 962 * 963 * @param[in] intr_handle 964 * Interrupt handler. 965 * @param cb_arg 966 * Callback argument. 967 */ 968 void 969 mlx5_dev_interrupt_handler(void *cb_arg) 970 { 971 struct rte_eth_dev *dev = cb_arg; 972 uint32_t events; 973 974 events = mlx5_dev_status_handler(dev); 975 if (events & (1 << RTE_ETH_EVENT_INTR_LSC)) 976 _rte_eth_dev_callback_process(dev, RTE_ETH_EVENT_INTR_LSC, NULL); 977 if (events & (1 << RTE_ETH_EVENT_INTR_RMV)) 978 _rte_eth_dev_callback_process(dev, RTE_ETH_EVENT_INTR_RMV, NULL); 979 } 980 981 /** 982 * Handle interrupts from the socket. 983 * 984 * @param cb_arg 985 * Callback argument. 986 */ 987 static void 988 mlx5_dev_handler_socket(void *cb_arg) 989 { 990 struct rte_eth_dev *dev = cb_arg; 991 992 mlx5_socket_handle(dev); 993 } 994 995 /** 996 * Uninstall interrupt handler. 997 * 998 * @param dev 999 * Pointer to Ethernet device. 1000 */ 1001 void 1002 mlx5_dev_interrupt_handler_uninstall(struct rte_eth_dev *dev) 1003 { 1004 struct priv *priv = dev->data->dev_private; 1005 1006 if (dev->data->dev_conf.intr_conf.lsc || 1007 dev->data->dev_conf.intr_conf.rmv) 1008 rte_intr_callback_unregister(&priv->intr_handle, 1009 mlx5_dev_interrupt_handler, dev); 1010 if (priv->primary_socket) 1011 rte_intr_callback_unregister(&priv->intr_handle_socket, 1012 mlx5_dev_handler_socket, dev); 1013 priv->intr_handle.fd = 0; 1014 priv->intr_handle.type = RTE_INTR_HANDLE_UNKNOWN; 1015 priv->intr_handle_socket.fd = 0; 1016 priv->intr_handle_socket.type = RTE_INTR_HANDLE_UNKNOWN; 1017 } 1018 1019 /** 1020 * Install interrupt handler. 1021 * 1022 * @param dev 1023 * Pointer to Ethernet device. 1024 */ 1025 void 1026 mlx5_dev_interrupt_handler_install(struct rte_eth_dev *dev) 1027 { 1028 struct priv *priv = dev->data->dev_private; 1029 int ret; 1030 int flags; 1031 1032 assert(priv->ctx->async_fd > 0); 1033 flags = fcntl(priv->ctx->async_fd, F_GETFL); 1034 ret = fcntl(priv->ctx->async_fd, F_SETFL, flags | O_NONBLOCK); 1035 if (ret) { 1036 DRV_LOG(INFO, 1037 "port %u failed to change file descriptor async event" 1038 " queue", 1039 dev->data->port_id); 1040 dev->data->dev_conf.intr_conf.lsc = 0; 1041 dev->data->dev_conf.intr_conf.rmv = 0; 1042 } 1043 if (dev->data->dev_conf.intr_conf.lsc || 1044 dev->data->dev_conf.intr_conf.rmv) { 1045 priv->intr_handle.fd = priv->ctx->async_fd; 1046 priv->intr_handle.type = RTE_INTR_HANDLE_EXT; 1047 rte_intr_callback_register(&priv->intr_handle, 1048 mlx5_dev_interrupt_handler, dev); 1049 } 1050 ret = mlx5_socket_init(dev); 1051 if (ret) 1052 DRV_LOG(ERR, "port %u cannot initialise socket: %s", 1053 dev->data->port_id, strerror(rte_errno)); 1054 else if (priv->primary_socket) { 1055 priv->intr_handle_socket.fd = priv->primary_socket; 1056 priv->intr_handle_socket.type = RTE_INTR_HANDLE_EXT; 1057 rte_intr_callback_register(&priv->intr_handle_socket, 1058 mlx5_dev_handler_socket, dev); 1059 } 1060 } 1061 1062 /** 1063 * DPDK callback to bring the link DOWN. 1064 * 1065 * @param dev 1066 * Pointer to Ethernet device structure. 1067 * 1068 * @return 1069 * 0 on success, a negative errno value otherwise and rte_errno is set. 1070 */ 1071 int 1072 mlx5_set_link_down(struct rte_eth_dev *dev) 1073 { 1074 return mlx5_set_flags(dev, ~IFF_UP, ~IFF_UP); 1075 } 1076 1077 /** 1078 * DPDK callback to bring the link UP. 1079 * 1080 * @param dev 1081 * Pointer to Ethernet device structure. 1082 * 1083 * @return 1084 * 0 on success, a negative errno value otherwise and rte_errno is set. 1085 */ 1086 int 1087 mlx5_set_link_up(struct rte_eth_dev *dev) 1088 { 1089 return mlx5_set_flags(dev, ~IFF_UP, IFF_UP); 1090 } 1091 1092 /** 1093 * Configure the TX function to use. 1094 * 1095 * @param dev 1096 * Pointer to private data structure. 1097 * 1098 * @return 1099 * Pointer to selected Tx burst function. 1100 */ 1101 eth_tx_burst_t 1102 mlx5_select_tx_function(struct rte_eth_dev *dev) 1103 { 1104 struct priv *priv = dev->data->dev_private; 1105 eth_tx_burst_t tx_pkt_burst = mlx5_tx_burst; 1106 struct mlx5_dev_config *config = &priv->config; 1107 uint64_t tx_offloads = dev->data->dev_conf.txmode.offloads; 1108 int tso = !!(tx_offloads & (DEV_TX_OFFLOAD_TCP_TSO | 1109 DEV_TX_OFFLOAD_VXLAN_TNL_TSO | 1110 DEV_TX_OFFLOAD_GRE_TNL_TSO | 1111 DEV_TX_OFFLOAD_IP_TNL_TSO | 1112 DEV_TX_OFFLOAD_UDP_TNL_TSO)); 1113 int swp = !!(tx_offloads & (DEV_TX_OFFLOAD_IP_TNL_TSO | 1114 DEV_TX_OFFLOAD_UDP_TNL_TSO | 1115 DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM)); 1116 int vlan_insert = !!(tx_offloads & DEV_TX_OFFLOAD_VLAN_INSERT); 1117 1118 assert(priv != NULL); 1119 /* Select appropriate TX function. */ 1120 if (vlan_insert || tso || swp) 1121 return tx_pkt_burst; 1122 if (config->mps == MLX5_MPW_ENHANCED) { 1123 if (mlx5_check_vec_tx_support(dev) > 0) { 1124 if (mlx5_check_raw_vec_tx_support(dev) > 0) 1125 tx_pkt_burst = mlx5_tx_burst_raw_vec; 1126 else 1127 tx_pkt_burst = mlx5_tx_burst_vec; 1128 DRV_LOG(DEBUG, 1129 "port %u selected enhanced MPW Tx vectorized" 1130 " function", 1131 dev->data->port_id); 1132 } else { 1133 tx_pkt_burst = mlx5_tx_burst_empw; 1134 DRV_LOG(DEBUG, 1135 "port %u selected enhanced MPW Tx function", 1136 dev->data->port_id); 1137 } 1138 } else if (config->mps && (config->txq_inline > 0)) { 1139 tx_pkt_burst = mlx5_tx_burst_mpw_inline; 1140 DRV_LOG(DEBUG, "port %u selected MPW inline Tx function", 1141 dev->data->port_id); 1142 } else if (config->mps) { 1143 tx_pkt_burst = mlx5_tx_burst_mpw; 1144 DRV_LOG(DEBUG, "port %u selected MPW Tx function", 1145 dev->data->port_id); 1146 } 1147 return tx_pkt_burst; 1148 } 1149 1150 /** 1151 * Configure the RX function to use. 1152 * 1153 * @param dev 1154 * Pointer to private data structure. 1155 * 1156 * @return 1157 * Pointer to selected Rx burst function. 1158 */ 1159 eth_rx_burst_t 1160 mlx5_select_rx_function(struct rte_eth_dev *dev) 1161 { 1162 eth_rx_burst_t rx_pkt_burst = mlx5_rx_burst; 1163 1164 assert(dev != NULL); 1165 if (mlx5_check_vec_rx_support(dev) > 0) { 1166 rx_pkt_burst = mlx5_rx_burst_vec; 1167 DRV_LOG(DEBUG, "port %u selected Rx vectorized function", 1168 dev->data->port_id); 1169 } 1170 return rx_pkt_burst; 1171 } 1172 1173 /** 1174 * Check if mlx5 device was removed. 1175 * 1176 * @param dev 1177 * Pointer to Ethernet device structure. 1178 * 1179 * @return 1180 * 1 when device is removed, otherwise 0. 1181 */ 1182 int 1183 mlx5_is_removed(struct rte_eth_dev *dev) 1184 { 1185 struct ibv_device_attr device_attr; 1186 struct priv *priv = dev->data->dev_private; 1187 1188 if (mlx5_glue->query_device(priv->ctx, &device_attr) == EIO) 1189 return 1; 1190 return 0; 1191 } 1192