1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright 2015 6WIND S.A. 3 * Copyright 2015 Mellanox Technologies, Ltd 4 */ 5 6 #include <stddef.h> 7 #include <inttypes.h> 8 #include <unistd.h> 9 #include <stdbool.h> 10 #include <stdint.h> 11 #include <stdio.h> 12 #include <string.h> 13 #include <stdlib.h> 14 #include <errno.h> 15 #include <dirent.h> 16 #include <net/if.h> 17 #include <sys/ioctl.h> 18 #include <sys/socket.h> 19 #include <netinet/in.h> 20 #include <linux/ethtool.h> 21 #include <linux/sockios.h> 22 #include <fcntl.h> 23 #include <stdalign.h> 24 #include <sys/un.h> 25 #include <time.h> 26 27 #include <rte_atomic.h> 28 #include <rte_ethdev_driver.h> 29 #include <rte_bus_pci.h> 30 #include <rte_mbuf.h> 31 #include <rte_common.h> 32 #include <rte_interrupts.h> 33 #include <rte_malloc.h> 34 #include <rte_string_fns.h> 35 #include <rte_rwlock.h> 36 #include <rte_cycles.h> 37 38 #include <mlx5_glue.h> 39 #include <mlx5_devx_cmds.h> 40 #include <mlx5_common.h> 41 42 #include "mlx5.h" 43 #include "mlx5_rxtx.h" 44 #include "mlx5_utils.h" 45 46 /* Supported speed values found in /usr/include/linux/ethtool.h */ 47 #ifndef HAVE_SUPPORTED_40000baseKR4_Full 48 #define SUPPORTED_40000baseKR4_Full (1 << 23) 49 #endif 50 #ifndef HAVE_SUPPORTED_40000baseCR4_Full 51 #define SUPPORTED_40000baseCR4_Full (1 << 24) 52 #endif 53 #ifndef HAVE_SUPPORTED_40000baseSR4_Full 54 #define SUPPORTED_40000baseSR4_Full (1 << 25) 55 #endif 56 #ifndef HAVE_SUPPORTED_40000baseLR4_Full 57 #define SUPPORTED_40000baseLR4_Full (1 << 26) 58 #endif 59 #ifndef HAVE_SUPPORTED_56000baseKR4_Full 60 #define SUPPORTED_56000baseKR4_Full (1 << 27) 61 #endif 62 #ifndef HAVE_SUPPORTED_56000baseCR4_Full 63 #define SUPPORTED_56000baseCR4_Full (1 << 28) 64 #endif 65 #ifndef HAVE_SUPPORTED_56000baseSR4_Full 66 #define SUPPORTED_56000baseSR4_Full (1 << 29) 67 #endif 68 #ifndef HAVE_SUPPORTED_56000baseLR4_Full 69 #define SUPPORTED_56000baseLR4_Full (1 << 30) 70 #endif 71 72 /* Add defines in case the running kernel is not the same as user headers. */ 73 #ifndef ETHTOOL_GLINKSETTINGS 74 struct ethtool_link_settings { 75 uint32_t cmd; 76 uint32_t speed; 77 uint8_t duplex; 78 uint8_t port; 79 uint8_t phy_address; 80 uint8_t autoneg; 81 uint8_t mdio_support; 82 uint8_t eth_to_mdix; 83 uint8_t eth_tp_mdix_ctrl; 84 int8_t link_mode_masks_nwords; 85 uint32_t reserved[8]; 86 uint32_t link_mode_masks[]; 87 }; 88 89 /* The kernel values can be found in /include/uapi/linux/ethtool.h */ 90 #define ETHTOOL_GLINKSETTINGS 0x0000004c 91 #define ETHTOOL_LINK_MODE_1000baseT_Full_BIT 5 92 #define ETHTOOL_LINK_MODE_Autoneg_BIT 6 93 #define ETHTOOL_LINK_MODE_1000baseKX_Full_BIT 17 94 #define ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT 18 95 #define ETHTOOL_LINK_MODE_10000baseKR_Full_BIT 19 96 #define ETHTOOL_LINK_MODE_10000baseR_FEC_BIT 20 97 #define ETHTOOL_LINK_MODE_20000baseMLD2_Full_BIT 21 98 #define ETHTOOL_LINK_MODE_20000baseKR2_Full_BIT 22 99 #define ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT 23 100 #define ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT 24 101 #define ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT 25 102 #define ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT 26 103 #define ETHTOOL_LINK_MODE_56000baseKR4_Full_BIT 27 104 #define ETHTOOL_LINK_MODE_56000baseCR4_Full_BIT 28 105 #define ETHTOOL_LINK_MODE_56000baseSR4_Full_BIT 29 106 #define ETHTOOL_LINK_MODE_56000baseLR4_Full_BIT 30 107 #endif 108 #ifndef HAVE_ETHTOOL_LINK_MODE_25G 109 #define ETHTOOL_LINK_MODE_25000baseCR_Full_BIT 31 110 #define ETHTOOL_LINK_MODE_25000baseKR_Full_BIT 32 111 #define ETHTOOL_LINK_MODE_25000baseSR_Full_BIT 33 112 #endif 113 #ifndef HAVE_ETHTOOL_LINK_MODE_50G 114 #define ETHTOOL_LINK_MODE_50000baseCR2_Full_BIT 34 115 #define ETHTOOL_LINK_MODE_50000baseKR2_Full_BIT 35 116 #endif 117 #ifndef HAVE_ETHTOOL_LINK_MODE_100G 118 #define ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT 36 119 #define ETHTOOL_LINK_MODE_100000baseSR4_Full_BIT 37 120 #define ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT 38 121 #define ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT 39 122 #endif 123 #ifndef HAVE_ETHTOOL_LINK_MODE_200G 124 #define ETHTOOL_LINK_MODE_200000baseKR4_Full_BIT 62 125 #define ETHTOOL_LINK_MODE_200000baseSR4_Full_BIT 63 126 #define ETHTOOL_LINK_MODE_200000baseLR4_ER4_FR4_Full_BIT 0 /* 64 - 64 */ 127 #define ETHTOOL_LINK_MODE_200000baseDR4_Full_BIT 1 /* 65 - 64 */ 128 #define ETHTOOL_LINK_MODE_200000baseCR4_Full_BIT 2 /* 66 - 64 */ 129 #endif 130 131 /** 132 * Get master interface name from private structure. 133 * 134 * @param[in] dev 135 * Pointer to Ethernet device. 136 * @param[out] ifname 137 * Interface name output buffer. 138 * 139 * @return 140 * 0 on success, a negative errno value otherwise and rte_errno is set. 141 */ 142 int 143 mlx5_get_master_ifname(const char *ibdev_path, char (*ifname)[IF_NAMESIZE]) 144 { 145 DIR *dir; 146 struct dirent *dent; 147 unsigned int dev_type = 0; 148 unsigned int dev_port_prev = ~0u; 149 char match[IF_NAMESIZE] = ""; 150 151 MLX5_ASSERT(ibdev_path); 152 { 153 MKSTR(path, "%s/device/net", ibdev_path); 154 155 dir = opendir(path); 156 if (dir == NULL) { 157 rte_errno = errno; 158 return -rte_errno; 159 } 160 } 161 while ((dent = readdir(dir)) != NULL) { 162 char *name = dent->d_name; 163 FILE *file; 164 unsigned int dev_port; 165 int r; 166 167 if ((name[0] == '.') && 168 ((name[1] == '\0') || 169 ((name[1] == '.') && (name[2] == '\0')))) 170 continue; 171 172 MKSTR(path, "%s/device/net/%s/%s", 173 ibdev_path, name, 174 (dev_type ? "dev_id" : "dev_port")); 175 176 file = fopen(path, "rb"); 177 if (file == NULL) { 178 if (errno != ENOENT) 179 continue; 180 /* 181 * Switch to dev_id when dev_port does not exist as 182 * is the case with Linux kernel versions < 3.15. 183 */ 184 try_dev_id: 185 match[0] = '\0'; 186 if (dev_type) 187 break; 188 dev_type = 1; 189 dev_port_prev = ~0u; 190 rewinddir(dir); 191 continue; 192 } 193 r = fscanf(file, (dev_type ? "%x" : "%u"), &dev_port); 194 fclose(file); 195 if (r != 1) 196 continue; 197 /* 198 * Switch to dev_id when dev_port returns the same value for 199 * all ports. May happen when using a MOFED release older than 200 * 3.0 with a Linux kernel >= 3.15. 201 */ 202 if (dev_port == dev_port_prev) 203 goto try_dev_id; 204 dev_port_prev = dev_port; 205 if (dev_port == 0) 206 strlcpy(match, name, sizeof(match)); 207 } 208 closedir(dir); 209 if (match[0] == '\0') { 210 rte_errno = ENOENT; 211 return -rte_errno; 212 } 213 strncpy(*ifname, match, sizeof(*ifname)); 214 return 0; 215 } 216 217 /** 218 * Get interface name from private structure. 219 * 220 * This is a port representor-aware version of mlx5_get_master_ifname(). 221 * 222 * @param[in] dev 223 * Pointer to Ethernet device. 224 * @param[out] ifname 225 * Interface name output buffer. 226 * 227 * @return 228 * 0 on success, a negative errno value otherwise and rte_errno is set. 229 */ 230 int 231 mlx5_get_ifname(const struct rte_eth_dev *dev, char (*ifname)[IF_NAMESIZE]) 232 { 233 struct mlx5_priv *priv = dev->data->dev_private; 234 unsigned int ifindex; 235 236 MLX5_ASSERT(priv); 237 MLX5_ASSERT(priv->sh); 238 ifindex = mlx5_ifindex(dev); 239 if (!ifindex) { 240 if (!priv->representor) 241 return mlx5_get_master_ifname(priv->sh->ibdev_path, 242 ifname); 243 rte_errno = ENXIO; 244 return -rte_errno; 245 } 246 if (if_indextoname(ifindex, &(*ifname)[0])) 247 return 0; 248 rte_errno = errno; 249 return -rte_errno; 250 } 251 252 /** 253 * Get the interface index from device name. 254 * 255 * @param[in] dev 256 * Pointer to Ethernet device. 257 * 258 * @return 259 * Nonzero interface index on success, zero otherwise and rte_errno is set. 260 */ 261 unsigned int 262 mlx5_ifindex(const struct rte_eth_dev *dev) 263 { 264 struct mlx5_priv *priv = dev->data->dev_private; 265 unsigned int ifindex; 266 267 MLX5_ASSERT(priv); 268 MLX5_ASSERT(priv->if_index); 269 ifindex = priv->if_index; 270 if (!ifindex) 271 rte_errno = ENXIO; 272 return ifindex; 273 } 274 275 /** 276 * Perform ifreq ioctl() on associated Ethernet device. 277 * 278 * @param[in] dev 279 * Pointer to Ethernet device. 280 * @param req 281 * Request number to pass to ioctl(). 282 * @param[out] ifr 283 * Interface request structure output buffer. 284 * 285 * @return 286 * 0 on success, a negative errno value otherwise and rte_errno is set. 287 */ 288 int 289 mlx5_ifreq(const struct rte_eth_dev *dev, int req, struct ifreq *ifr) 290 { 291 int sock = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP); 292 int ret = 0; 293 294 if (sock == -1) { 295 rte_errno = errno; 296 return -rte_errno; 297 } 298 ret = mlx5_get_ifname(dev, &ifr->ifr_name); 299 if (ret) 300 goto error; 301 ret = ioctl(sock, req, ifr); 302 if (ret == -1) { 303 rte_errno = errno; 304 goto error; 305 } 306 close(sock); 307 return 0; 308 error: 309 close(sock); 310 return -rte_errno; 311 } 312 313 /** 314 * Get device MTU. 315 * 316 * @param dev 317 * Pointer to Ethernet device. 318 * @param[out] mtu 319 * MTU value output buffer. 320 * 321 * @return 322 * 0 on success, a negative errno value otherwise and rte_errno is set. 323 */ 324 int 325 mlx5_get_mtu(struct rte_eth_dev *dev, uint16_t *mtu) 326 { 327 struct ifreq request; 328 int ret = mlx5_ifreq(dev, SIOCGIFMTU, &request); 329 330 if (ret) 331 return ret; 332 *mtu = request.ifr_mtu; 333 return 0; 334 } 335 336 /** 337 * Set device MTU. 338 * 339 * @param dev 340 * Pointer to Ethernet device. 341 * @param mtu 342 * MTU value to set. 343 * 344 * @return 345 * 0 on success, a negative errno value otherwise and rte_errno is set. 346 */ 347 static int 348 mlx5_set_mtu(struct rte_eth_dev *dev, uint16_t mtu) 349 { 350 struct ifreq request = { .ifr_mtu = mtu, }; 351 352 return mlx5_ifreq(dev, SIOCSIFMTU, &request); 353 } 354 355 /** 356 * Set device flags. 357 * 358 * @param dev 359 * Pointer to Ethernet device. 360 * @param keep 361 * Bitmask for flags that must remain untouched. 362 * @param flags 363 * Bitmask for flags to modify. 364 * 365 * @return 366 * 0 on success, a negative errno value otherwise and rte_errno is set. 367 */ 368 int 369 mlx5_set_flags(struct rte_eth_dev *dev, unsigned int keep, unsigned int flags) 370 { 371 struct ifreq request; 372 int ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &request); 373 374 if (ret) 375 return ret; 376 request.ifr_flags &= keep; 377 request.ifr_flags |= flags & ~keep; 378 return mlx5_ifreq(dev, SIOCSIFFLAGS, &request); 379 } 380 381 /** 382 * DPDK callback for Ethernet device configuration. 383 * 384 * @param dev 385 * Pointer to Ethernet device structure. 386 * 387 * @return 388 * 0 on success, a negative errno value otherwise and rte_errno is set. 389 */ 390 int 391 mlx5_dev_configure(struct rte_eth_dev *dev) 392 { 393 struct mlx5_priv *priv = dev->data->dev_private; 394 unsigned int rxqs_n = dev->data->nb_rx_queues; 395 unsigned int txqs_n = dev->data->nb_tx_queues; 396 const uint8_t use_app_rss_key = 397 !!dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key; 398 int ret = 0; 399 400 if (use_app_rss_key && 401 (dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key_len != 402 MLX5_RSS_HASH_KEY_LEN)) { 403 DRV_LOG(ERR, "port %u RSS key len must be %s Bytes long", 404 dev->data->port_id, RTE_STR(MLX5_RSS_HASH_KEY_LEN)); 405 rte_errno = EINVAL; 406 return -rte_errno; 407 } 408 priv->rss_conf.rss_key = 409 rte_realloc(priv->rss_conf.rss_key, 410 MLX5_RSS_HASH_KEY_LEN, 0); 411 if (!priv->rss_conf.rss_key) { 412 DRV_LOG(ERR, "port %u cannot allocate RSS hash key memory (%u)", 413 dev->data->port_id, rxqs_n); 414 rte_errno = ENOMEM; 415 return -rte_errno; 416 } 417 418 if (dev->data->dev_conf.rxmode.mq_mode & ETH_MQ_RX_RSS_FLAG) 419 dev->data->dev_conf.rxmode.offloads |= DEV_RX_OFFLOAD_RSS_HASH; 420 421 memcpy(priv->rss_conf.rss_key, 422 use_app_rss_key ? 423 dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key : 424 rss_hash_default_key, 425 MLX5_RSS_HASH_KEY_LEN); 426 priv->rss_conf.rss_key_len = MLX5_RSS_HASH_KEY_LEN; 427 priv->rss_conf.rss_hf = dev->data->dev_conf.rx_adv_conf.rss_conf.rss_hf; 428 priv->rxqs = (void *)dev->data->rx_queues; 429 priv->txqs = (void *)dev->data->tx_queues; 430 if (txqs_n != priv->txqs_n) { 431 DRV_LOG(INFO, "port %u Tx queues number update: %u -> %u", 432 dev->data->port_id, priv->txqs_n, txqs_n); 433 priv->txqs_n = txqs_n; 434 } 435 if (rxqs_n > priv->config.ind_table_max_size) { 436 DRV_LOG(ERR, "port %u cannot handle this many Rx queues (%u)", 437 dev->data->port_id, rxqs_n); 438 rte_errno = EINVAL; 439 return -rte_errno; 440 } 441 if (rxqs_n != priv->rxqs_n) { 442 DRV_LOG(INFO, "port %u Rx queues number update: %u -> %u", 443 dev->data->port_id, priv->rxqs_n, rxqs_n); 444 priv->rxqs_n = rxqs_n; 445 } 446 priv->skip_default_rss_reta = 0; 447 ret = mlx5_proc_priv_init(dev); 448 if (ret) 449 return ret; 450 return 0; 451 } 452 453 /** 454 * Configure default RSS reta. 455 * 456 * @param dev 457 * Pointer to Ethernet device structure. 458 * 459 * @return 460 * 0 on success, a negative errno value otherwise and rte_errno is set. 461 */ 462 int 463 mlx5_dev_configure_rss_reta(struct rte_eth_dev *dev) 464 { 465 struct mlx5_priv *priv = dev->data->dev_private; 466 unsigned int rxqs_n = dev->data->nb_rx_queues; 467 unsigned int i; 468 unsigned int j; 469 unsigned int reta_idx_n; 470 int ret = 0; 471 unsigned int *rss_queue_arr = NULL; 472 unsigned int rss_queue_n = 0; 473 474 if (priv->skip_default_rss_reta) 475 return ret; 476 rss_queue_arr = rte_malloc("", rxqs_n * sizeof(unsigned int), 0); 477 if (!rss_queue_arr) { 478 DRV_LOG(ERR, "port %u cannot allocate RSS queue list (%u)", 479 dev->data->port_id, rxqs_n); 480 rte_errno = ENOMEM; 481 return -rte_errno; 482 } 483 for (i = 0, j = 0; i < rxqs_n; i++) { 484 struct mlx5_rxq_data *rxq_data; 485 struct mlx5_rxq_ctrl *rxq_ctrl; 486 487 rxq_data = (*priv->rxqs)[i]; 488 rxq_ctrl = container_of(rxq_data, struct mlx5_rxq_ctrl, rxq); 489 if (rxq_ctrl && rxq_ctrl->type == MLX5_RXQ_TYPE_STANDARD) 490 rss_queue_arr[j++] = i; 491 } 492 rss_queue_n = j; 493 if (rss_queue_n > priv->config.ind_table_max_size) { 494 DRV_LOG(ERR, "port %u cannot handle this many Rx queues (%u)", 495 dev->data->port_id, rss_queue_n); 496 rte_errno = EINVAL; 497 rte_free(rss_queue_arr); 498 return -rte_errno; 499 } 500 DRV_LOG(INFO, "port %u Rx queues number update: %u -> %u", 501 dev->data->port_id, priv->rxqs_n, rxqs_n); 502 priv->rxqs_n = rxqs_n; 503 /* 504 * If the requested number of RX queues is not a power of two, 505 * use the maximum indirection table size for better balancing. 506 * The result is always rounded to the next power of two. 507 */ 508 reta_idx_n = (1 << log2above((rss_queue_n & (rss_queue_n - 1)) ? 509 priv->config.ind_table_max_size : 510 rss_queue_n)); 511 ret = mlx5_rss_reta_index_resize(dev, reta_idx_n); 512 if (ret) { 513 rte_free(rss_queue_arr); 514 return ret; 515 } 516 /* 517 * When the number of RX queues is not a power of two, 518 * the remaining table entries are padded with reused WQs 519 * and hashes are not spread uniformly. 520 */ 521 for (i = 0, j = 0; (i != reta_idx_n); ++i) { 522 (*priv->reta_idx)[i] = rss_queue_arr[j]; 523 if (++j == rss_queue_n) 524 j = 0; 525 } 526 rte_free(rss_queue_arr); 527 return ret; 528 } 529 530 /** 531 * Sets default tuning parameters. 532 * 533 * @param dev 534 * Pointer to Ethernet device. 535 * @param[out] info 536 * Info structure output buffer. 537 */ 538 static void 539 mlx5_set_default_params(struct rte_eth_dev *dev, struct rte_eth_dev_info *info) 540 { 541 struct mlx5_priv *priv = dev->data->dev_private; 542 543 /* Minimum CPU utilization. */ 544 info->default_rxportconf.ring_size = 256; 545 info->default_txportconf.ring_size = 256; 546 info->default_rxportconf.burst_size = MLX5_RX_DEFAULT_BURST; 547 info->default_txportconf.burst_size = MLX5_TX_DEFAULT_BURST; 548 if ((priv->link_speed_capa & ETH_LINK_SPEED_200G) | 549 (priv->link_speed_capa & ETH_LINK_SPEED_100G)) { 550 info->default_rxportconf.nb_queues = 16; 551 info->default_txportconf.nb_queues = 16; 552 if (dev->data->nb_rx_queues > 2 || 553 dev->data->nb_tx_queues > 2) { 554 /* Max Throughput. */ 555 info->default_rxportconf.ring_size = 2048; 556 info->default_txportconf.ring_size = 2048; 557 } 558 } else { 559 info->default_rxportconf.nb_queues = 8; 560 info->default_txportconf.nb_queues = 8; 561 if (dev->data->nb_rx_queues > 2 || 562 dev->data->nb_tx_queues > 2) { 563 /* Max Throughput. */ 564 info->default_rxportconf.ring_size = 4096; 565 info->default_txportconf.ring_size = 4096; 566 } 567 } 568 } 569 570 /** 571 * Sets tx mbuf limiting parameters. 572 * 573 * @param dev 574 * Pointer to Ethernet device. 575 * @param[out] info 576 * Info structure output buffer. 577 */ 578 static void 579 mlx5_set_txlimit_params(struct rte_eth_dev *dev, struct rte_eth_dev_info *info) 580 { 581 struct mlx5_priv *priv = dev->data->dev_private; 582 struct mlx5_dev_config *config = &priv->config; 583 unsigned int inlen; 584 uint16_t nb_max; 585 586 inlen = (config->txq_inline_max == MLX5_ARG_UNSET) ? 587 MLX5_SEND_DEF_INLINE_LEN : 588 (unsigned int)config->txq_inline_max; 589 MLX5_ASSERT(config->txq_inline_min >= 0); 590 inlen = RTE_MAX(inlen, (unsigned int)config->txq_inline_min); 591 inlen = RTE_MIN(inlen, MLX5_WQE_SIZE_MAX + 592 MLX5_ESEG_MIN_INLINE_SIZE - 593 MLX5_WQE_CSEG_SIZE - 594 MLX5_WQE_ESEG_SIZE - 595 MLX5_WQE_DSEG_SIZE * 2); 596 nb_max = (MLX5_WQE_SIZE_MAX + 597 MLX5_ESEG_MIN_INLINE_SIZE - 598 MLX5_WQE_CSEG_SIZE - 599 MLX5_WQE_ESEG_SIZE - 600 MLX5_WQE_DSEG_SIZE - 601 inlen) / MLX5_WSEG_SIZE; 602 info->tx_desc_lim.nb_seg_max = nb_max; 603 info->tx_desc_lim.nb_mtu_seg_max = nb_max; 604 } 605 606 /** 607 * DPDK callback to get information about the device. 608 * 609 * @param dev 610 * Pointer to Ethernet device structure. 611 * @param[out] info 612 * Info structure output buffer. 613 */ 614 int 615 mlx5_dev_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *info) 616 { 617 struct mlx5_priv *priv = dev->data->dev_private; 618 struct mlx5_dev_config *config = &priv->config; 619 unsigned int max; 620 621 /* FIXME: we should ask the device for these values. */ 622 info->min_rx_bufsize = 32; 623 info->max_rx_pktlen = 65536; 624 info->max_lro_pkt_size = MLX5_MAX_LRO_SIZE; 625 /* 626 * Since we need one CQ per QP, the limit is the minimum number 627 * between the two values. 628 */ 629 max = RTE_MIN(priv->sh->device_attr.orig_attr.max_cq, 630 priv->sh->device_attr.orig_attr.max_qp); 631 /* max_rx_queues is uint16_t. */ 632 max = RTE_MIN(max, (unsigned int)UINT16_MAX); 633 info->max_rx_queues = max; 634 info->max_tx_queues = max; 635 info->max_mac_addrs = MLX5_MAX_UC_MAC_ADDRESSES; 636 info->rx_queue_offload_capa = mlx5_get_rx_queue_offloads(dev); 637 info->rx_offload_capa = (mlx5_get_rx_port_offloads() | 638 info->rx_queue_offload_capa); 639 info->tx_offload_capa = mlx5_get_tx_port_offloads(dev); 640 info->if_index = mlx5_ifindex(dev); 641 info->reta_size = priv->reta_idx_n ? 642 priv->reta_idx_n : config->ind_table_max_size; 643 info->hash_key_size = MLX5_RSS_HASH_KEY_LEN; 644 info->speed_capa = priv->link_speed_capa; 645 info->flow_type_rss_offloads = ~MLX5_RSS_HF_MASK; 646 mlx5_set_default_params(dev, info); 647 mlx5_set_txlimit_params(dev, info); 648 info->switch_info.name = dev->data->name; 649 info->switch_info.domain_id = priv->domain_id; 650 info->switch_info.port_id = priv->representor_id; 651 if (priv->representor) { 652 uint16_t port_id; 653 654 if (priv->pf_bond >= 0) { 655 /* 656 * Switch port ID is opaque value with driver defined 657 * format. Push the PF index in bonding configurations 658 * in upper four bits of port ID. If we get too many 659 * representors (more than 4K) or PFs (more than 15) 660 * this approach must be reconsidered. 661 */ 662 if ((info->switch_info.port_id >> 663 MLX5_PORT_ID_BONDING_PF_SHIFT) || 664 priv->pf_bond > MLX5_PORT_ID_BONDING_PF_MASK) { 665 DRV_LOG(ERR, "can't update switch port ID" 666 " for bonding device"); 667 MLX5_ASSERT(false); 668 return -ENODEV; 669 } 670 info->switch_info.port_id |= 671 priv->pf_bond << MLX5_PORT_ID_BONDING_PF_SHIFT; 672 } 673 MLX5_ETH_FOREACH_DEV(port_id, priv->pci_dev) { 674 struct mlx5_priv *opriv = 675 rte_eth_devices[port_id].data->dev_private; 676 677 if (!opriv || 678 opriv->representor || 679 opriv->sh != priv->sh || 680 opriv->domain_id != priv->domain_id) 681 continue; 682 /* 683 * Override switch name with that of the master 684 * device. 685 */ 686 info->switch_info.name = opriv->dev_data->name; 687 break; 688 } 689 } 690 return 0; 691 } 692 693 /** 694 * Get device current raw clock counter 695 * 696 * @param dev 697 * Pointer to Ethernet device structure. 698 * @param[out] time 699 * Current raw clock counter of the device. 700 * 701 * @return 702 * 0 if the clock has correctly been read 703 * The value of errno in case of error 704 */ 705 int 706 mlx5_read_clock(struct rte_eth_dev *dev, uint64_t *clock) 707 { 708 struct mlx5_priv *priv = dev->data->dev_private; 709 struct ibv_context *ctx = priv->sh->ctx; 710 struct ibv_values_ex values; 711 int err = 0; 712 713 values.comp_mask = IBV_VALUES_MASK_RAW_CLOCK; 714 err = mlx5_glue->query_rt_values_ex(ctx, &values); 715 if (err != 0) { 716 DRV_LOG(WARNING, "Could not query the clock !"); 717 return err; 718 } 719 *clock = values.raw_clock.tv_nsec; 720 return 0; 721 } 722 723 /** 724 * Get firmware version of a device. 725 * 726 * @param dev 727 * Ethernet device port. 728 * @param fw_ver 729 * String output allocated by caller. 730 * @param fw_size 731 * Size of the output string, including terminating null byte. 732 * 733 * @return 734 * 0 on success, or the size of the non truncated string if too big. 735 */ 736 int mlx5_fw_version_get(struct rte_eth_dev *dev, char *fw_ver, size_t fw_size) 737 { 738 struct mlx5_priv *priv = dev->data->dev_private; 739 struct ibv_device_attr *attr = &priv->sh->device_attr.orig_attr; 740 size_t size = strnlen(attr->fw_ver, sizeof(attr->fw_ver)) + 1; 741 742 if (fw_size < size) 743 return size; 744 if (fw_ver != NULL) 745 strlcpy(fw_ver, attr->fw_ver, fw_size); 746 return 0; 747 } 748 749 /** 750 * Get supported packet types. 751 * 752 * @param dev 753 * Pointer to Ethernet device structure. 754 * 755 * @return 756 * A pointer to the supported Packet types array. 757 */ 758 const uint32_t * 759 mlx5_dev_supported_ptypes_get(struct rte_eth_dev *dev) 760 { 761 static const uint32_t ptypes[] = { 762 /* refers to rxq_cq_to_pkt_type() */ 763 RTE_PTYPE_L2_ETHER, 764 RTE_PTYPE_L3_IPV4_EXT_UNKNOWN, 765 RTE_PTYPE_L3_IPV6_EXT_UNKNOWN, 766 RTE_PTYPE_L4_NONFRAG, 767 RTE_PTYPE_L4_FRAG, 768 RTE_PTYPE_L4_TCP, 769 RTE_PTYPE_L4_UDP, 770 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN, 771 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN, 772 RTE_PTYPE_INNER_L4_NONFRAG, 773 RTE_PTYPE_INNER_L4_FRAG, 774 RTE_PTYPE_INNER_L4_TCP, 775 RTE_PTYPE_INNER_L4_UDP, 776 RTE_PTYPE_UNKNOWN 777 }; 778 779 if (dev->rx_pkt_burst == mlx5_rx_burst || 780 dev->rx_pkt_burst == mlx5_rx_burst_mprq || 781 dev->rx_pkt_burst == mlx5_rx_burst_vec) 782 return ptypes; 783 return NULL; 784 } 785 786 /** 787 * Retrieve the master device for representor in the same switch domain. 788 * 789 * @param dev 790 * Pointer to representor Ethernet device structure. 791 * 792 * @return 793 * Master device structure on success, NULL otherwise. 794 */ 795 796 static struct rte_eth_dev * 797 mlx5_find_master_dev(struct rte_eth_dev *dev) 798 { 799 struct mlx5_priv *priv; 800 uint16_t port_id; 801 uint16_t domain_id; 802 803 priv = dev->data->dev_private; 804 domain_id = priv->domain_id; 805 MLX5_ASSERT(priv->representor); 806 MLX5_ETH_FOREACH_DEV(port_id, priv->pci_dev) { 807 struct mlx5_priv *opriv = 808 rte_eth_devices[port_id].data->dev_private; 809 if (opriv && 810 opriv->master && 811 opriv->domain_id == domain_id && 812 opriv->sh == priv->sh) 813 return &rte_eth_devices[port_id]; 814 } 815 return NULL; 816 } 817 818 /** 819 * DPDK callback to retrieve physical link information. 820 * 821 * @param dev 822 * Pointer to Ethernet device structure. 823 * @param[out] link 824 * Storage for current link status. 825 * 826 * @return 827 * 0 on success, a negative errno value otherwise and rte_errno is set. 828 */ 829 static int 830 mlx5_link_update_unlocked_gset(struct rte_eth_dev *dev, 831 struct rte_eth_link *link) 832 { 833 struct mlx5_priv *priv = dev->data->dev_private; 834 struct ethtool_cmd edata = { 835 .cmd = ETHTOOL_GSET /* Deprecated since Linux v4.5. */ 836 }; 837 struct ifreq ifr; 838 struct rte_eth_link dev_link; 839 int link_speed = 0; 840 int ret; 841 842 ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr); 843 if (ret) { 844 DRV_LOG(WARNING, "port %u ioctl(SIOCGIFFLAGS) failed: %s", 845 dev->data->port_id, strerror(rte_errno)); 846 return ret; 847 } 848 dev_link = (struct rte_eth_link) { 849 .link_status = ((ifr.ifr_flags & IFF_UP) && 850 (ifr.ifr_flags & IFF_RUNNING)), 851 }; 852 ifr = (struct ifreq) { 853 .ifr_data = (void *)&edata, 854 }; 855 ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr); 856 if (ret) { 857 if (ret == -ENOTSUP && priv->representor) { 858 struct rte_eth_dev *master; 859 860 /* 861 * For representors we can try to inherit link 862 * settings from the master device. Actually 863 * link settings do not make a lot of sense 864 * for representors due to missing physical 865 * link. The old kernel drivers supported 866 * emulated settings query for representors, 867 * the new ones do not, so we have to add 868 * this code for compatibility issues. 869 */ 870 master = mlx5_find_master_dev(dev); 871 if (master) { 872 ifr = (struct ifreq) { 873 .ifr_data = (void *)&edata, 874 }; 875 ret = mlx5_ifreq(master, SIOCETHTOOL, &ifr); 876 } 877 } 878 if (ret) { 879 DRV_LOG(WARNING, 880 "port %u ioctl(SIOCETHTOOL," 881 " ETHTOOL_GSET) failed: %s", 882 dev->data->port_id, strerror(rte_errno)); 883 return ret; 884 } 885 } 886 link_speed = ethtool_cmd_speed(&edata); 887 if (link_speed == -1) 888 dev_link.link_speed = ETH_SPEED_NUM_NONE; 889 else 890 dev_link.link_speed = link_speed; 891 priv->link_speed_capa = 0; 892 if (edata.supported & SUPPORTED_Autoneg) 893 priv->link_speed_capa |= ETH_LINK_SPEED_AUTONEG; 894 if (edata.supported & (SUPPORTED_1000baseT_Full | 895 SUPPORTED_1000baseKX_Full)) 896 priv->link_speed_capa |= ETH_LINK_SPEED_1G; 897 if (edata.supported & SUPPORTED_10000baseKR_Full) 898 priv->link_speed_capa |= ETH_LINK_SPEED_10G; 899 if (edata.supported & (SUPPORTED_40000baseKR4_Full | 900 SUPPORTED_40000baseCR4_Full | 901 SUPPORTED_40000baseSR4_Full | 902 SUPPORTED_40000baseLR4_Full)) 903 priv->link_speed_capa |= ETH_LINK_SPEED_40G; 904 dev_link.link_duplex = ((edata.duplex == DUPLEX_HALF) ? 905 ETH_LINK_HALF_DUPLEX : ETH_LINK_FULL_DUPLEX); 906 dev_link.link_autoneg = !(dev->data->dev_conf.link_speeds & 907 ETH_LINK_SPEED_FIXED); 908 if (((dev_link.link_speed && !dev_link.link_status) || 909 (!dev_link.link_speed && dev_link.link_status))) { 910 rte_errno = EAGAIN; 911 return -rte_errno; 912 } 913 *link = dev_link; 914 return 0; 915 } 916 917 /** 918 * Retrieve physical link information (unlocked version using new ioctl). 919 * 920 * @param dev 921 * Pointer to Ethernet device structure. 922 * @param[out] link 923 * Storage for current link status. 924 * 925 * @return 926 * 0 on success, a negative errno value otherwise and rte_errno is set. 927 */ 928 static int 929 mlx5_link_update_unlocked_gs(struct rte_eth_dev *dev, 930 struct rte_eth_link *link) 931 932 { 933 struct mlx5_priv *priv = dev->data->dev_private; 934 struct ethtool_link_settings gcmd = { .cmd = ETHTOOL_GLINKSETTINGS }; 935 struct ifreq ifr; 936 struct rte_eth_link dev_link; 937 struct rte_eth_dev *master = NULL; 938 uint64_t sc; 939 int ret; 940 941 ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr); 942 if (ret) { 943 DRV_LOG(WARNING, "port %u ioctl(SIOCGIFFLAGS) failed: %s", 944 dev->data->port_id, strerror(rte_errno)); 945 return ret; 946 } 947 dev_link = (struct rte_eth_link) { 948 .link_status = ((ifr.ifr_flags & IFF_UP) && 949 (ifr.ifr_flags & IFF_RUNNING)), 950 }; 951 ifr = (struct ifreq) { 952 .ifr_data = (void *)&gcmd, 953 }; 954 ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr); 955 if (ret) { 956 if (ret == -ENOTSUP && priv->representor) { 957 /* 958 * For representors we can try to inherit link 959 * settings from the master device. Actually 960 * link settings do not make a lot of sense 961 * for representors due to missing physical 962 * link. The old kernel drivers supported 963 * emulated settings query for representors, 964 * the new ones do not, so we have to add 965 * this code for compatibility issues. 966 */ 967 master = mlx5_find_master_dev(dev); 968 if (master) { 969 ifr = (struct ifreq) { 970 .ifr_data = (void *)&gcmd, 971 }; 972 ret = mlx5_ifreq(master, SIOCETHTOOL, &ifr); 973 } 974 } 975 if (ret) { 976 DRV_LOG(DEBUG, 977 "port %u ioctl(SIOCETHTOOL," 978 " ETHTOOL_GLINKSETTINGS) failed: %s", 979 dev->data->port_id, strerror(rte_errno)); 980 return ret; 981 } 982 983 } 984 gcmd.link_mode_masks_nwords = -gcmd.link_mode_masks_nwords; 985 986 alignas(struct ethtool_link_settings) 987 uint8_t data[offsetof(struct ethtool_link_settings, link_mode_masks) + 988 sizeof(uint32_t) * gcmd.link_mode_masks_nwords * 3]; 989 struct ethtool_link_settings *ecmd = (void *)data; 990 991 *ecmd = gcmd; 992 ifr.ifr_data = (void *)ecmd; 993 ret = mlx5_ifreq(master ? master : dev, SIOCETHTOOL, &ifr); 994 if (ret) { 995 DRV_LOG(DEBUG, 996 "port %u ioctl(SIOCETHTOOL," 997 "ETHTOOL_GLINKSETTINGS) failed: %s", 998 dev->data->port_id, strerror(rte_errno)); 999 return ret; 1000 } 1001 dev_link.link_speed = (ecmd->speed == UINT32_MAX) ? ETH_SPEED_NUM_NONE : 1002 ecmd->speed; 1003 sc = ecmd->link_mode_masks[0] | 1004 ((uint64_t)ecmd->link_mode_masks[1] << 32); 1005 priv->link_speed_capa = 0; 1006 if (sc & MLX5_BITSHIFT(ETHTOOL_LINK_MODE_Autoneg_BIT)) 1007 priv->link_speed_capa |= ETH_LINK_SPEED_AUTONEG; 1008 if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_1000baseT_Full_BIT) | 1009 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_1000baseKX_Full_BIT))) 1010 priv->link_speed_capa |= ETH_LINK_SPEED_1G; 1011 if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT) | 1012 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_10000baseKR_Full_BIT) | 1013 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_10000baseR_FEC_BIT))) 1014 priv->link_speed_capa |= ETH_LINK_SPEED_10G; 1015 if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_20000baseMLD2_Full_BIT) | 1016 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_20000baseKR2_Full_BIT))) 1017 priv->link_speed_capa |= ETH_LINK_SPEED_20G; 1018 if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT) | 1019 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT) | 1020 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT) | 1021 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT))) 1022 priv->link_speed_capa |= ETH_LINK_SPEED_40G; 1023 if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseKR4_Full_BIT) | 1024 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseCR4_Full_BIT) | 1025 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseSR4_Full_BIT) | 1026 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseLR4_Full_BIT))) 1027 priv->link_speed_capa |= ETH_LINK_SPEED_56G; 1028 if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_25000baseCR_Full_BIT) | 1029 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_25000baseKR_Full_BIT) | 1030 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_25000baseSR_Full_BIT))) 1031 priv->link_speed_capa |= ETH_LINK_SPEED_25G; 1032 if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_50000baseCR2_Full_BIT) | 1033 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_50000baseKR2_Full_BIT))) 1034 priv->link_speed_capa |= ETH_LINK_SPEED_50G; 1035 if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT) | 1036 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseSR4_Full_BIT) | 1037 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT) | 1038 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT))) 1039 priv->link_speed_capa |= ETH_LINK_SPEED_100G; 1040 if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_200000baseKR4_Full_BIT) | 1041 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_200000baseSR4_Full_BIT))) 1042 priv->link_speed_capa |= ETH_LINK_SPEED_200G; 1043 1044 sc = ecmd->link_mode_masks[2] | 1045 ((uint64_t)ecmd->link_mode_masks[3] << 32); 1046 if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_200000baseCR4_Full_BIT) | 1047 MLX5_BITSHIFT( 1048 ETHTOOL_LINK_MODE_200000baseLR4_ER4_FR4_Full_BIT) | 1049 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_200000baseDR4_Full_BIT))) 1050 priv->link_speed_capa |= ETH_LINK_SPEED_200G; 1051 dev_link.link_duplex = ((ecmd->duplex == DUPLEX_HALF) ? 1052 ETH_LINK_HALF_DUPLEX : ETH_LINK_FULL_DUPLEX); 1053 dev_link.link_autoneg = !(dev->data->dev_conf.link_speeds & 1054 ETH_LINK_SPEED_FIXED); 1055 if (((dev_link.link_speed && !dev_link.link_status) || 1056 (!dev_link.link_speed && dev_link.link_status))) { 1057 rte_errno = EAGAIN; 1058 return -rte_errno; 1059 } 1060 *link = dev_link; 1061 return 0; 1062 } 1063 1064 /** 1065 * DPDK callback to retrieve physical link information. 1066 * 1067 * @param dev 1068 * Pointer to Ethernet device structure. 1069 * @param wait_to_complete 1070 * Wait for request completion. 1071 * 1072 * @return 1073 * 0 if link status was not updated, positive if it was, a negative errno 1074 * value otherwise and rte_errno is set. 1075 */ 1076 int 1077 mlx5_link_update(struct rte_eth_dev *dev, int wait_to_complete) 1078 { 1079 int ret; 1080 struct rte_eth_link dev_link; 1081 time_t start_time = time(NULL); 1082 int retry = MLX5_GET_LINK_STATUS_RETRY_COUNT; 1083 1084 do { 1085 ret = mlx5_link_update_unlocked_gs(dev, &dev_link); 1086 if (ret == -ENOTSUP) 1087 ret = mlx5_link_update_unlocked_gset(dev, &dev_link); 1088 if (ret == 0) 1089 break; 1090 /* Handle wait to complete situation. */ 1091 if ((wait_to_complete || retry) && ret == -EAGAIN) { 1092 if (abs((int)difftime(time(NULL), start_time)) < 1093 MLX5_LINK_STATUS_TIMEOUT) { 1094 usleep(0); 1095 continue; 1096 } else { 1097 rte_errno = EBUSY; 1098 return -rte_errno; 1099 } 1100 } else if (ret < 0) { 1101 return ret; 1102 } 1103 } while (wait_to_complete || retry-- > 0); 1104 ret = !!memcmp(&dev->data->dev_link, &dev_link, 1105 sizeof(struct rte_eth_link)); 1106 dev->data->dev_link = dev_link; 1107 return ret; 1108 } 1109 1110 /** 1111 * DPDK callback to change the MTU. 1112 * 1113 * @param dev 1114 * Pointer to Ethernet device structure. 1115 * @param in_mtu 1116 * New MTU. 1117 * 1118 * @return 1119 * 0 on success, a negative errno value otherwise and rte_errno is set. 1120 */ 1121 int 1122 mlx5_dev_set_mtu(struct rte_eth_dev *dev, uint16_t mtu) 1123 { 1124 struct mlx5_priv *priv = dev->data->dev_private; 1125 uint16_t kern_mtu = 0; 1126 int ret; 1127 1128 ret = mlx5_get_mtu(dev, &kern_mtu); 1129 if (ret) 1130 return ret; 1131 /* Set kernel interface MTU first. */ 1132 ret = mlx5_set_mtu(dev, mtu); 1133 if (ret) 1134 return ret; 1135 ret = mlx5_get_mtu(dev, &kern_mtu); 1136 if (ret) 1137 return ret; 1138 if (kern_mtu == mtu) { 1139 priv->mtu = mtu; 1140 DRV_LOG(DEBUG, "port %u adapter MTU set to %u", 1141 dev->data->port_id, mtu); 1142 return 0; 1143 } 1144 rte_errno = EAGAIN; 1145 return -rte_errno; 1146 } 1147 1148 /** 1149 * DPDK callback to get flow control status. 1150 * 1151 * @param dev 1152 * Pointer to Ethernet device structure. 1153 * @param[out] fc_conf 1154 * Flow control output buffer. 1155 * 1156 * @return 1157 * 0 on success, a negative errno value otherwise and rte_errno is set. 1158 */ 1159 int 1160 mlx5_dev_get_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf) 1161 { 1162 struct ifreq ifr; 1163 struct ethtool_pauseparam ethpause = { 1164 .cmd = ETHTOOL_GPAUSEPARAM 1165 }; 1166 int ret; 1167 1168 ifr.ifr_data = (void *)ðpause; 1169 ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr); 1170 if (ret) { 1171 DRV_LOG(WARNING, 1172 "port %u ioctl(SIOCETHTOOL, ETHTOOL_GPAUSEPARAM) failed:" 1173 " %s", 1174 dev->data->port_id, strerror(rte_errno)); 1175 return ret; 1176 } 1177 fc_conf->autoneg = ethpause.autoneg; 1178 if (ethpause.rx_pause && ethpause.tx_pause) 1179 fc_conf->mode = RTE_FC_FULL; 1180 else if (ethpause.rx_pause) 1181 fc_conf->mode = RTE_FC_RX_PAUSE; 1182 else if (ethpause.tx_pause) 1183 fc_conf->mode = RTE_FC_TX_PAUSE; 1184 else 1185 fc_conf->mode = RTE_FC_NONE; 1186 return 0; 1187 } 1188 1189 /** 1190 * DPDK callback to modify flow control parameters. 1191 * 1192 * @param dev 1193 * Pointer to Ethernet device structure. 1194 * @param[in] fc_conf 1195 * Flow control parameters. 1196 * 1197 * @return 1198 * 0 on success, a negative errno value otherwise and rte_errno is set. 1199 */ 1200 int 1201 mlx5_dev_set_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf) 1202 { 1203 struct ifreq ifr; 1204 struct ethtool_pauseparam ethpause = { 1205 .cmd = ETHTOOL_SPAUSEPARAM 1206 }; 1207 int ret; 1208 1209 ifr.ifr_data = (void *)ðpause; 1210 ethpause.autoneg = fc_conf->autoneg; 1211 if (((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL) || 1212 (fc_conf->mode & RTE_FC_RX_PAUSE)) 1213 ethpause.rx_pause = 1; 1214 else 1215 ethpause.rx_pause = 0; 1216 1217 if (((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL) || 1218 (fc_conf->mode & RTE_FC_TX_PAUSE)) 1219 ethpause.tx_pause = 1; 1220 else 1221 ethpause.tx_pause = 0; 1222 ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr); 1223 if (ret) { 1224 DRV_LOG(WARNING, 1225 "port %u ioctl(SIOCETHTOOL, ETHTOOL_SPAUSEPARAM)" 1226 " failed: %s", 1227 dev->data->port_id, strerror(rte_errno)); 1228 return ret; 1229 } 1230 return 0; 1231 } 1232 1233 /** 1234 * Handle asynchronous removal event for entire multiport device. 1235 * 1236 * @param sh 1237 * Infiniband device shared context. 1238 */ 1239 static void 1240 mlx5_dev_interrupt_device_fatal(struct mlx5_ibv_shared *sh) 1241 { 1242 uint32_t i; 1243 1244 for (i = 0; i < sh->max_port; ++i) { 1245 struct rte_eth_dev *dev; 1246 1247 if (sh->port[i].ih_port_id >= RTE_MAX_ETHPORTS) { 1248 /* 1249 * Or not existing port either no 1250 * handler installed for this port. 1251 */ 1252 continue; 1253 } 1254 dev = &rte_eth_devices[sh->port[i].ih_port_id]; 1255 MLX5_ASSERT(dev); 1256 if (dev->data->dev_conf.intr_conf.rmv) 1257 _rte_eth_dev_callback_process 1258 (dev, RTE_ETH_EVENT_INTR_RMV, NULL); 1259 } 1260 } 1261 1262 /** 1263 * Handle shared asynchronous events the NIC (removal event 1264 * and link status change). Supports multiport IB device. 1265 * 1266 * @param cb_arg 1267 * Callback argument. 1268 */ 1269 void 1270 mlx5_dev_interrupt_handler(void *cb_arg) 1271 { 1272 struct mlx5_ibv_shared *sh = cb_arg; 1273 struct ibv_async_event event; 1274 1275 /* Read all message from the IB device and acknowledge them. */ 1276 for (;;) { 1277 struct rte_eth_dev *dev; 1278 uint32_t tmp; 1279 1280 if (mlx5_glue->get_async_event(sh->ctx, &event)) 1281 break; 1282 /* Retrieve and check IB port index. */ 1283 tmp = (uint32_t)event.element.port_num; 1284 if (!tmp && event.event_type == IBV_EVENT_DEVICE_FATAL) { 1285 /* 1286 * The DEVICE_FATAL event is called once for 1287 * entire device without port specifying. 1288 * We should notify all existing ports. 1289 */ 1290 mlx5_glue->ack_async_event(&event); 1291 mlx5_dev_interrupt_device_fatal(sh); 1292 continue; 1293 } 1294 MLX5_ASSERT(tmp && (tmp <= sh->max_port)); 1295 if (!tmp) { 1296 /* Unsupported devive level event. */ 1297 mlx5_glue->ack_async_event(&event); 1298 DRV_LOG(DEBUG, 1299 "unsupported common event (type %d)", 1300 event.event_type); 1301 continue; 1302 } 1303 if (tmp > sh->max_port) { 1304 /* Invalid IB port index. */ 1305 mlx5_glue->ack_async_event(&event); 1306 DRV_LOG(DEBUG, 1307 "cannot handle an event (type %d)" 1308 "due to invalid IB port index (%u)", 1309 event.event_type, tmp); 1310 continue; 1311 } 1312 if (sh->port[tmp - 1].ih_port_id >= RTE_MAX_ETHPORTS) { 1313 /* No handler installed. */ 1314 mlx5_glue->ack_async_event(&event); 1315 DRV_LOG(DEBUG, 1316 "cannot handle an event (type %d)" 1317 "due to no handler installed for port %u", 1318 event.event_type, tmp); 1319 continue; 1320 } 1321 /* Retrieve ethernet device descriptor. */ 1322 tmp = sh->port[tmp - 1].ih_port_id; 1323 dev = &rte_eth_devices[tmp]; 1324 MLX5_ASSERT(dev); 1325 if ((event.event_type == IBV_EVENT_PORT_ACTIVE || 1326 event.event_type == IBV_EVENT_PORT_ERR) && 1327 dev->data->dev_conf.intr_conf.lsc) { 1328 mlx5_glue->ack_async_event(&event); 1329 if (mlx5_link_update(dev, 0) == -EAGAIN) { 1330 usleep(0); 1331 continue; 1332 } 1333 _rte_eth_dev_callback_process 1334 (dev, RTE_ETH_EVENT_INTR_LSC, NULL); 1335 continue; 1336 } 1337 DRV_LOG(DEBUG, 1338 "port %u cannot handle an unknown event (type %d)", 1339 dev->data->port_id, event.event_type); 1340 mlx5_glue->ack_async_event(&event); 1341 } 1342 } 1343 1344 /* 1345 * Unregister callback handler safely. The handler may be active 1346 * while we are trying to unregister it, in this case code -EAGAIN 1347 * is returned by rte_intr_callback_unregister(). This routine checks 1348 * the return code and tries to unregister handler again. 1349 * 1350 * @param handle 1351 * interrupt handle 1352 * @param cb_fn 1353 * pointer to callback routine 1354 * @cb_arg 1355 * opaque callback parameter 1356 */ 1357 void 1358 mlx5_intr_callback_unregister(const struct rte_intr_handle *handle, 1359 rte_intr_callback_fn cb_fn, void *cb_arg) 1360 { 1361 /* 1362 * Try to reduce timeout management overhead by not calling 1363 * the timer related routines on the first iteration. If the 1364 * unregistering succeeds on first call there will be no 1365 * timer calls at all. 1366 */ 1367 uint64_t twait = 0; 1368 uint64_t start = 0; 1369 1370 do { 1371 int ret; 1372 1373 ret = rte_intr_callback_unregister(handle, cb_fn, cb_arg); 1374 if (ret >= 0) 1375 return; 1376 if (ret != -EAGAIN) { 1377 DRV_LOG(INFO, "failed to unregister interrupt" 1378 " handler (error: %d)", ret); 1379 MLX5_ASSERT(false); 1380 return; 1381 } 1382 if (twait) { 1383 struct timespec onems; 1384 1385 /* Wait one millisecond and try again. */ 1386 onems.tv_sec = 0; 1387 onems.tv_nsec = NS_PER_S / MS_PER_S; 1388 nanosleep(&onems, 0); 1389 /* Check whether one second elapsed. */ 1390 if ((rte_get_timer_cycles() - start) <= twait) 1391 continue; 1392 } else { 1393 /* 1394 * We get the amount of timer ticks for one second. 1395 * If this amount elapsed it means we spent one 1396 * second in waiting. This branch is executed once 1397 * on first iteration. 1398 */ 1399 twait = rte_get_timer_hz(); 1400 MLX5_ASSERT(twait); 1401 } 1402 /* 1403 * Timeout elapsed, show message (once a second) and retry. 1404 * We have no other acceptable option here, if we ignore 1405 * the unregistering return code the handler will not 1406 * be unregistered, fd will be closed and we may get the 1407 * crush. Hanging and messaging in the loop seems not to be 1408 * the worst choice. 1409 */ 1410 DRV_LOG(INFO, "Retrying to unregister interrupt handler"); 1411 start = rte_get_timer_cycles(); 1412 } while (true); 1413 } 1414 1415 /** 1416 * Handle DEVX interrupts from the NIC. 1417 * This function is probably called from the DPDK host thread. 1418 * 1419 * @param cb_arg 1420 * Callback argument. 1421 */ 1422 void 1423 mlx5_dev_interrupt_handler_devx(void *cb_arg) 1424 { 1425 #ifndef HAVE_IBV_DEVX_ASYNC 1426 (void)cb_arg; 1427 return; 1428 #else 1429 struct mlx5_ibv_shared *sh = cb_arg; 1430 union { 1431 struct mlx5dv_devx_async_cmd_hdr cmd_resp; 1432 uint8_t buf[MLX5_ST_SZ_BYTES(query_flow_counter_out) + 1433 MLX5_ST_SZ_BYTES(traffic_counter) + 1434 sizeof(struct mlx5dv_devx_async_cmd_hdr)]; 1435 } out; 1436 uint8_t *buf = out.buf + sizeof(out.cmd_resp); 1437 1438 while (!mlx5_glue->devx_get_async_cmd_comp(sh->devx_comp, 1439 &out.cmd_resp, 1440 sizeof(out.buf))) 1441 mlx5_flow_async_pool_query_handle 1442 (sh, (uint64_t)out.cmd_resp.wr_id, 1443 mlx5_devx_get_out_command_status(buf)); 1444 #endif /* HAVE_IBV_DEVX_ASYNC */ 1445 } 1446 1447 /** 1448 * DPDK callback to bring the link DOWN. 1449 * 1450 * @param dev 1451 * Pointer to Ethernet device structure. 1452 * 1453 * @return 1454 * 0 on success, a negative errno value otherwise and rte_errno is set. 1455 */ 1456 int 1457 mlx5_set_link_down(struct rte_eth_dev *dev) 1458 { 1459 return mlx5_set_flags(dev, ~IFF_UP, ~IFF_UP); 1460 } 1461 1462 /** 1463 * DPDK callback to bring the link UP. 1464 * 1465 * @param dev 1466 * Pointer to Ethernet device structure. 1467 * 1468 * @return 1469 * 0 on success, a negative errno value otherwise and rte_errno is set. 1470 */ 1471 int 1472 mlx5_set_link_up(struct rte_eth_dev *dev) 1473 { 1474 return mlx5_set_flags(dev, ~IFF_UP, IFF_UP); 1475 } 1476 1477 /** 1478 * Configure the RX function to use. 1479 * 1480 * @param dev 1481 * Pointer to private data structure. 1482 * 1483 * @return 1484 * Pointer to selected Rx burst function. 1485 */ 1486 eth_rx_burst_t 1487 mlx5_select_rx_function(struct rte_eth_dev *dev) 1488 { 1489 eth_rx_burst_t rx_pkt_burst = mlx5_rx_burst; 1490 1491 MLX5_ASSERT(dev != NULL); 1492 if (mlx5_check_vec_rx_support(dev) > 0) { 1493 rx_pkt_burst = mlx5_rx_burst_vec; 1494 DRV_LOG(DEBUG, "port %u selected Rx vectorized function", 1495 dev->data->port_id); 1496 } else if (mlx5_mprq_enabled(dev)) { 1497 rx_pkt_burst = mlx5_rx_burst_mprq; 1498 } 1499 return rx_pkt_burst; 1500 } 1501 1502 /** 1503 * Check if mlx5 device was removed. 1504 * 1505 * @param dev 1506 * Pointer to Ethernet device structure. 1507 * 1508 * @return 1509 * 1 when device is removed, otherwise 0. 1510 */ 1511 int 1512 mlx5_is_removed(struct rte_eth_dev *dev) 1513 { 1514 struct ibv_device_attr device_attr; 1515 struct mlx5_priv *priv = dev->data->dev_private; 1516 1517 if (mlx5_glue->query_device(priv->sh->ctx, &device_attr) == EIO) 1518 return 1; 1519 return 0; 1520 } 1521 1522 /** 1523 * Get the E-Switch parameters by port id. 1524 * 1525 * @param[in] port 1526 * Device port id. 1527 * @param[in] valid 1528 * Device port id is valid, skip check. This flag is useful 1529 * when trials are performed from probing and device is not 1530 * flagged as valid yet (in attaching process). 1531 * @param[out] es_domain_id 1532 * E-Switch domain id. 1533 * @param[out] es_port_id 1534 * The port id of the port in the E-Switch. 1535 * 1536 * @return 1537 * pointer to device private data structure containing data needed 1538 * on success, NULL otherwise and rte_errno is set. 1539 */ 1540 struct mlx5_priv * 1541 mlx5_port_to_eswitch_info(uint16_t port, bool valid) 1542 { 1543 struct rte_eth_dev *dev; 1544 struct mlx5_priv *priv; 1545 1546 if (port >= RTE_MAX_ETHPORTS) { 1547 rte_errno = EINVAL; 1548 return NULL; 1549 } 1550 if (!valid && !rte_eth_dev_is_valid_port(port)) { 1551 rte_errno = ENODEV; 1552 return NULL; 1553 } 1554 dev = &rte_eth_devices[port]; 1555 priv = dev->data->dev_private; 1556 if (!(priv->representor || priv->master)) { 1557 rte_errno = EINVAL; 1558 return NULL; 1559 } 1560 return priv; 1561 } 1562 1563 /** 1564 * Get the E-Switch parameters by device instance. 1565 * 1566 * @param[in] port 1567 * Device port id. 1568 * @param[out] es_domain_id 1569 * E-Switch domain id. 1570 * @param[out] es_port_id 1571 * The port id of the port in the E-Switch. 1572 * 1573 * @return 1574 * pointer to device private data structure containing data needed 1575 * on success, NULL otherwise and rte_errno is set. 1576 */ 1577 struct mlx5_priv * 1578 mlx5_dev_to_eswitch_info(struct rte_eth_dev *dev) 1579 { 1580 struct mlx5_priv *priv; 1581 1582 priv = dev->data->dev_private; 1583 if (!(priv->representor || priv->master)) { 1584 rte_errno = EINVAL; 1585 return NULL; 1586 } 1587 return priv; 1588 } 1589 1590 /** 1591 * Get switch information associated with network interface. 1592 * 1593 * @param ifindex 1594 * Network interface index. 1595 * @param[out] info 1596 * Switch information object, populated in case of success. 1597 * 1598 * @return 1599 * 0 on success, a negative errno value otherwise and rte_errno is set. 1600 */ 1601 int 1602 mlx5_sysfs_switch_info(unsigned int ifindex, struct mlx5_switch_info *info) 1603 { 1604 char ifname[IF_NAMESIZE]; 1605 char port_name[IF_NAMESIZE]; 1606 FILE *file; 1607 struct mlx5_switch_info data = { 1608 .master = 0, 1609 .representor = 0, 1610 .name_type = MLX5_PHYS_PORT_NAME_TYPE_NOTSET, 1611 .port_name = 0, 1612 .switch_id = 0, 1613 }; 1614 DIR *dir; 1615 bool port_switch_id_set = false; 1616 bool device_dir = false; 1617 char c; 1618 int ret; 1619 1620 if (!if_indextoname(ifindex, ifname)) { 1621 rte_errno = errno; 1622 return -rte_errno; 1623 } 1624 1625 MKSTR(phys_port_name, "/sys/class/net/%s/phys_port_name", 1626 ifname); 1627 MKSTR(phys_switch_id, "/sys/class/net/%s/phys_switch_id", 1628 ifname); 1629 MKSTR(pci_device, "/sys/class/net/%s/device", 1630 ifname); 1631 1632 file = fopen(phys_port_name, "rb"); 1633 if (file != NULL) { 1634 ret = fscanf(file, "%s", port_name); 1635 fclose(file); 1636 if (ret == 1) 1637 mlx5_translate_port_name(port_name, &data); 1638 } 1639 file = fopen(phys_switch_id, "rb"); 1640 if (file == NULL) { 1641 rte_errno = errno; 1642 return -rte_errno; 1643 } 1644 port_switch_id_set = 1645 fscanf(file, "%" SCNx64 "%c", &data.switch_id, &c) == 2 && 1646 c == '\n'; 1647 fclose(file); 1648 dir = opendir(pci_device); 1649 if (dir != NULL) { 1650 closedir(dir); 1651 device_dir = true; 1652 } 1653 if (port_switch_id_set) { 1654 /* We have some E-Switch configuration. */ 1655 mlx5_sysfs_check_switch_info(device_dir, &data); 1656 } 1657 *info = data; 1658 MLX5_ASSERT(!(data.master && data.representor)); 1659 if (data.master && data.representor) { 1660 DRV_LOG(ERR, "ifindex %u device is recognized as master" 1661 " and as representor", ifindex); 1662 rte_errno = ENODEV; 1663 return -rte_errno; 1664 } 1665 return 0; 1666 } 1667 1668 /** 1669 * Analyze gathered port parameters via sysfs to recognize master 1670 * and representor devices for E-Switch configuration. 1671 * 1672 * @param[in] device_dir 1673 * flag of presence of "device" directory under port device key. 1674 * @param[inout] switch_info 1675 * Port information, including port name as a number and port name 1676 * type if recognized 1677 * 1678 * @return 1679 * master and representor flags are set in switch_info according to 1680 * recognized parameters (if any). 1681 */ 1682 void 1683 mlx5_sysfs_check_switch_info(bool device_dir, 1684 struct mlx5_switch_info *switch_info) 1685 { 1686 switch (switch_info->name_type) { 1687 case MLX5_PHYS_PORT_NAME_TYPE_UNKNOWN: 1688 /* 1689 * Name is not recognized, assume the master, 1690 * check the device directory presence. 1691 */ 1692 switch_info->master = device_dir; 1693 break; 1694 case MLX5_PHYS_PORT_NAME_TYPE_NOTSET: 1695 /* 1696 * Name is not set, this assumes the legacy naming 1697 * schema for master, just check if there is 1698 * a device directory. 1699 */ 1700 switch_info->master = device_dir; 1701 break; 1702 case MLX5_PHYS_PORT_NAME_TYPE_UPLINK: 1703 /* New uplink naming schema recognized. */ 1704 switch_info->master = 1; 1705 break; 1706 case MLX5_PHYS_PORT_NAME_TYPE_LEGACY: 1707 /* Legacy representors naming schema. */ 1708 switch_info->representor = !device_dir; 1709 break; 1710 case MLX5_PHYS_PORT_NAME_TYPE_PFVF: 1711 /* New representors naming schema. */ 1712 switch_info->representor = 1; 1713 break; 1714 } 1715 } 1716 1717 /** 1718 * DPDK callback to retrieve plug-in module EEPROM information (type and size). 1719 * 1720 * @param dev 1721 * Pointer to Ethernet device structure. 1722 * @param[out] modinfo 1723 * Storage for plug-in module EEPROM information. 1724 * 1725 * @return 1726 * 0 on success, a negative errno value otherwise and rte_errno is set. 1727 */ 1728 int 1729 mlx5_get_module_info(struct rte_eth_dev *dev, 1730 struct rte_eth_dev_module_info *modinfo) 1731 { 1732 struct ethtool_modinfo info = { 1733 .cmd = ETHTOOL_GMODULEINFO, 1734 }; 1735 struct ifreq ifr = (struct ifreq) { 1736 .ifr_data = (void *)&info, 1737 }; 1738 int ret = 0; 1739 1740 if (!dev || !modinfo) { 1741 DRV_LOG(WARNING, "missing argument, cannot get module info"); 1742 rte_errno = EINVAL; 1743 return -rte_errno; 1744 } 1745 ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr); 1746 if (ret) { 1747 DRV_LOG(WARNING, "port %u ioctl(SIOCETHTOOL) failed: %s", 1748 dev->data->port_id, strerror(rte_errno)); 1749 return ret; 1750 } 1751 modinfo->type = info.type; 1752 modinfo->eeprom_len = info.eeprom_len; 1753 return ret; 1754 } 1755 1756 /** 1757 * DPDK callback to retrieve plug-in module EEPROM data. 1758 * 1759 * @param dev 1760 * Pointer to Ethernet device structure. 1761 * @param[out] info 1762 * Storage for plug-in module EEPROM data. 1763 * 1764 * @return 1765 * 0 on success, a negative errno value otherwise and rte_errno is set. 1766 */ 1767 int mlx5_get_module_eeprom(struct rte_eth_dev *dev, 1768 struct rte_dev_eeprom_info *info) 1769 { 1770 struct ethtool_eeprom *eeprom; 1771 struct ifreq ifr; 1772 int ret = 0; 1773 1774 if (!dev || !info) { 1775 DRV_LOG(WARNING, "missing argument, cannot get module eeprom"); 1776 rte_errno = EINVAL; 1777 return -rte_errno; 1778 } 1779 eeprom = rte_calloc(__func__, 1, 1780 (sizeof(struct ethtool_eeprom) + info->length), 0); 1781 if (!eeprom) { 1782 DRV_LOG(WARNING, "port %u cannot allocate memory for " 1783 "eeprom data", dev->data->port_id); 1784 rte_errno = ENOMEM; 1785 return -rte_errno; 1786 } 1787 eeprom->cmd = ETHTOOL_GMODULEEEPROM; 1788 eeprom->offset = info->offset; 1789 eeprom->len = info->length; 1790 ifr = (struct ifreq) { 1791 .ifr_data = (void *)eeprom, 1792 }; 1793 ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr); 1794 if (ret) 1795 DRV_LOG(WARNING, "port %u ioctl(SIOCETHTOOL) failed: %s", 1796 dev->data->port_id, strerror(rte_errno)); 1797 else 1798 rte_memcpy(info->data, eeprom->data, info->length); 1799 rte_free(eeprom); 1800 return ret; 1801 } 1802 1803 /** 1804 * DPDK callback to retrieve hairpin capabilities. 1805 * 1806 * @param dev 1807 * Pointer to Ethernet device structure. 1808 * @param[out] cap 1809 * Storage for hairpin capability data. 1810 * 1811 * @return 1812 * 0 on success, a negative errno value otherwise and rte_errno is set. 1813 */ 1814 int mlx5_hairpin_cap_get(struct rte_eth_dev *dev, 1815 struct rte_eth_hairpin_cap *cap) 1816 { 1817 struct mlx5_priv *priv = dev->data->dev_private; 1818 1819 if (priv->sh->devx == 0) { 1820 rte_errno = ENOTSUP; 1821 return -rte_errno; 1822 } 1823 cap->max_nb_queues = UINT16_MAX; 1824 cap->max_rx_2_tx = 1; 1825 cap->max_tx_2_rx = 1; 1826 cap->max_nb_desc = 8192; 1827 return 0; 1828 } 1829