/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright 2015 6WIND S.A.
 * Copyright 2015 Mellanox Technologies, Ltd
 */

#include <stddef.h>
#include <assert.h>
#include <inttypes.h>
#include <unistd.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <errno.h>
#include <dirent.h>
#include <net/if.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/ethtool.h>
#include <linux/sockios.h>
#include <fcntl.h>
#include <stdalign.h>
#include <sys/un.h>
#include <time.h>

#include <rte_atomic.h>
#include <rte_ethdev_driver.h>
#include <rte_bus_pci.h>
#include <rte_mbuf.h>
#include <rte_common.h>
#include <rte_interrupts.h>
#include <rte_malloc.h>
#include <rte_string_fns.h>
#include <rte_rwlock.h>

#include "mlx5.h"
#include "mlx5_glue.h"
#include "mlx5_rxtx.h"
#include "mlx5_utils.h"

/* Supported speed values found in /usr/include/linux/ethtool.h */
/*
 * Each mask below is defined locally only when the matching HAVE_SUPPORTED_*
 * macro is not defined. They are tested against the "supported" field of the
 * ETHTOOL_GSET reply in mlx5_link_update_unlocked_gset().
 */
#ifndef HAVE_SUPPORTED_40000baseKR4_Full
#define SUPPORTED_40000baseKR4_Full (1 << 23)
#endif
#ifndef HAVE_SUPPORTED_40000baseCR4_Full
#define SUPPORTED_40000baseCR4_Full (1 << 24)
#endif
#ifndef HAVE_SUPPORTED_40000baseSR4_Full
#define SUPPORTED_40000baseSR4_Full (1 << 25)
#endif
#ifndef HAVE_SUPPORTED_40000baseLR4_Full
#define SUPPORTED_40000baseLR4_Full (1 << 26)
#endif
#ifndef HAVE_SUPPORTED_56000baseKR4_Full
#define SUPPORTED_56000baseKR4_Full (1 << 27)
#endif
#ifndef HAVE_SUPPORTED_56000baseCR4_Full
#define SUPPORTED_56000baseCR4_Full (1 << 28)
#endif
#ifndef HAVE_SUPPORTED_56000baseSR4_Full
#define SUPPORTED_56000baseSR4_Full (1 << 29)
#endif
#ifndef HAVE_SUPPORTED_56000baseLR4_Full
#define SUPPORTED_56000baseLR4_Full (1 << 30)
#endif

/* Add defines in case the running kernel is not the same as user headers. */
#ifndef ETHTOOL_GLINKSETTINGS
/*
 * Local declaration of the ETHTOOL_GLINKSETTINGS request layout, compiled only
 * when the included headers do not define ETHTOOL_GLINKSETTINGS.
 *
 * link_mode_masks_nwords is signed: mlx5_link_update_unlocked_gs() negates the
 * value found after a first query and sizes link_mode_masks[] as three groups
 * of that many 32-bit words for the second query.
 */
struct ethtool_link_settings {
	uint32_t cmd;
	uint32_t speed;
	uint8_t duplex;
	uint8_t port;
	uint8_t phy_address;
	uint8_t autoneg;
	uint8_t mdio_support;
	uint8_t eth_to_mdix;
	uint8_t eth_tp_mdix_ctrl;
	int8_t link_mode_masks_nwords;
	uint32_t reserved[8];
	uint32_t link_mode_masks[];
};

#define ETHTOOL_GLINKSETTINGS 0x0000004c
/*
 * The *_BIT values are bit indexes, not masks; this file converts them with
 * MLX5_BITSHIFT() before testing them against the 64-bit supported mask.
 */
#define ETHTOOL_LINK_MODE_1000baseT_Full_BIT 5
#define ETHTOOL_LINK_MODE_Autoneg_BIT 6
#define ETHTOOL_LINK_MODE_1000baseKX_Full_BIT 17
#define ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT 18
#define ETHTOOL_LINK_MODE_10000baseKR_Full_BIT 19
#define ETHTOOL_LINK_MODE_10000baseR_FEC_BIT 20
#define ETHTOOL_LINK_MODE_20000baseMLD2_Full_BIT 21
#define ETHTOOL_LINK_MODE_20000baseKR2_Full_BIT 22
#define ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT 23
#define ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT 24
#define ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT 25
#define ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT 26
#define ETHTOOL_LINK_MODE_56000baseKR4_Full_BIT 27
#define ETHTOOL_LINK_MODE_56000baseCR4_Full_BIT 28
#define ETHTOOL_LINK_MODE_56000baseSR4_Full_BIT 29
#define ETHTOOL_LINK_MODE_56000baseLR4_Full_BIT 30
#endif
/* 25G link mode bit indexes, defined when HAVE_ETHTOOL_LINK_MODE_25G is not. */
#ifndef HAVE_ETHTOOL_LINK_MODE_25G
#define ETHTOOL_LINK_MODE_25000baseCR_Full_BIT 31
#define ETHTOOL_LINK_MODE_25000baseKR_Full_BIT 32
#define ETHTOOL_LINK_MODE_25000baseSR_Full_BIT 33
#endif
/* 50G link mode bit indexes, defined when HAVE_ETHTOOL_LINK_MODE_50G is not. */
#ifndef HAVE_ETHTOOL_LINK_MODE_50G
#define ETHTOOL_LINK_MODE_50000baseCR2_Full_BIT 34
#define ETHTOOL_LINK_MODE_50000baseKR2_Full_BIT 35
#endif
/* 100G link mode bit indexes, defined when HAVE_ETHTOOL_LINK_MODE_100G is not. */
#ifndef HAVE_ETHTOOL_LINK_MODE_100G
#define ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT 36
#define ETHTOOL_LINK_MODE_100000baseSR4_Full_BIT 37
#define ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT 38
#define ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT 39
#endif

/**
 * Get master interface name from private structure.
121 * 122 * @param[in] dev 123 * Pointer to Ethernet device. 124 * @param[out] ifname 125 * Interface name output buffer. 126 * 127 * @return 128 * 0 on success, a negative errno value otherwise and rte_errno is set. 129 */ 130 int 131 mlx5_get_master_ifname(const char *ibdev_path, char (*ifname)[IF_NAMESIZE]) 132 { 133 DIR *dir; 134 struct dirent *dent; 135 unsigned int dev_type = 0; 136 unsigned int dev_port_prev = ~0u; 137 char match[IF_NAMESIZE] = ""; 138 139 assert(ibdev_path); 140 { 141 MKSTR(path, "%s/device/net", ibdev_path); 142 143 dir = opendir(path); 144 if (dir == NULL) { 145 rte_errno = errno; 146 return -rte_errno; 147 } 148 } 149 while ((dent = readdir(dir)) != NULL) { 150 char *name = dent->d_name; 151 FILE *file; 152 unsigned int dev_port; 153 int r; 154 155 if ((name[0] == '.') && 156 ((name[1] == '\0') || 157 ((name[1] == '.') && (name[2] == '\0')))) 158 continue; 159 160 MKSTR(path, "%s/device/net/%s/%s", 161 ibdev_path, name, 162 (dev_type ? "dev_id" : "dev_port")); 163 164 file = fopen(path, "rb"); 165 if (file == NULL) { 166 if (errno != ENOENT) 167 continue; 168 /* 169 * Switch to dev_id when dev_port does not exist as 170 * is the case with Linux kernel versions < 3.15. 171 */ 172 try_dev_id: 173 match[0] = '\0'; 174 if (dev_type) 175 break; 176 dev_type = 1; 177 dev_port_prev = ~0u; 178 rewinddir(dir); 179 continue; 180 } 181 r = fscanf(file, (dev_type ? "%x" : "%u"), &dev_port); 182 fclose(file); 183 if (r != 1) 184 continue; 185 /* 186 * Switch to dev_id when dev_port returns the same value for 187 * all ports. May happen when using a MOFED release older than 188 * 3.0 with a Linux kernel >= 3.15. 
189 */ 190 if (dev_port == dev_port_prev) 191 goto try_dev_id; 192 dev_port_prev = dev_port; 193 if (dev_port == 0) 194 strlcpy(match, name, sizeof(match)); 195 } 196 closedir(dir); 197 if (match[0] == '\0') { 198 rte_errno = ENOENT; 199 return -rte_errno; 200 } 201 strncpy(*ifname, match, sizeof(*ifname)); 202 return 0; 203 } 204 205 /** 206 * Get interface name from private structure. 207 * 208 * This is a port representor-aware version of mlx5_get_master_ifname(). 209 * 210 * @param[in] dev 211 * Pointer to Ethernet device. 212 * @param[out] ifname 213 * Interface name output buffer. 214 * 215 * @return 216 * 0 on success, a negative errno value otherwise and rte_errno is set. 217 */ 218 int 219 mlx5_get_ifname(const struct rte_eth_dev *dev, char (*ifname)[IF_NAMESIZE]) 220 { 221 struct mlx5_priv *priv = dev->data->dev_private; 222 unsigned int ifindex; 223 224 assert(priv); 225 assert(priv->sh); 226 ifindex = priv->nl_socket_rdma >= 0 ? 227 mlx5_nl_ifindex(priv->nl_socket_rdma, 228 priv->sh->ibdev_name, 229 priv->ibv_port) : 0; 230 if (!ifindex) { 231 if (!priv->representor) 232 return mlx5_get_master_ifname(priv->sh->ibdev_path, 233 ifname); 234 rte_errno = ENXIO; 235 return -rte_errno; 236 } 237 if (if_indextoname(ifindex, &(*ifname)[0])) 238 return 0; 239 rte_errno = errno; 240 return -rte_errno; 241 } 242 243 /** 244 * Get the interface index from device name. 245 * 246 * @param[in] dev 247 * Pointer to Ethernet device. 248 * 249 * @return 250 * Nonzero interface index on success, zero otherwise and rte_errno is set. 251 */ 252 unsigned int 253 mlx5_ifindex(const struct rte_eth_dev *dev) 254 { 255 char ifname[IF_NAMESIZE]; 256 unsigned int ifindex; 257 258 if (mlx5_get_ifname(dev, &ifname)) 259 return 0; 260 ifindex = if_nametoindex(ifname); 261 if (!ifindex) 262 rte_errno = errno; 263 return ifindex; 264 } 265 266 /** 267 * Perform ifreq ioctl() on associated Ethernet device. 268 * 269 * @param[in] dev 270 * Pointer to Ethernet device. 
271 * @param req 272 * Request number to pass to ioctl(). 273 * @param[out] ifr 274 * Interface request structure output buffer. 275 * 276 * @return 277 * 0 on success, a negative errno value otherwise and rte_errno is set. 278 */ 279 int 280 mlx5_ifreq(const struct rte_eth_dev *dev, int req, struct ifreq *ifr) 281 { 282 int sock = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP); 283 int ret = 0; 284 285 if (sock == -1) { 286 rte_errno = errno; 287 return -rte_errno; 288 } 289 ret = mlx5_get_ifname(dev, &ifr->ifr_name); 290 if (ret) 291 goto error; 292 ret = ioctl(sock, req, ifr); 293 if (ret == -1) { 294 rte_errno = errno; 295 goto error; 296 } 297 close(sock); 298 return 0; 299 error: 300 close(sock); 301 return -rte_errno; 302 } 303 304 /** 305 * Get device MTU. 306 * 307 * @param dev 308 * Pointer to Ethernet device. 309 * @param[out] mtu 310 * MTU value output buffer. 311 * 312 * @return 313 * 0 on success, a negative errno value otherwise and rte_errno is set. 314 */ 315 int 316 mlx5_get_mtu(struct rte_eth_dev *dev, uint16_t *mtu) 317 { 318 struct ifreq request; 319 int ret = mlx5_ifreq(dev, SIOCGIFMTU, &request); 320 321 if (ret) 322 return ret; 323 *mtu = request.ifr_mtu; 324 return 0; 325 } 326 327 /** 328 * Set device MTU. 329 * 330 * @param dev 331 * Pointer to Ethernet device. 332 * @param mtu 333 * MTU value to set. 334 * 335 * @return 336 * 0 on success, a negative errno value otherwise and rte_errno is set. 337 */ 338 static int 339 mlx5_set_mtu(struct rte_eth_dev *dev, uint16_t mtu) 340 { 341 struct ifreq request = { .ifr_mtu = mtu, }; 342 343 return mlx5_ifreq(dev, SIOCSIFMTU, &request); 344 } 345 346 /** 347 * Set device flags. 348 * 349 * @param dev 350 * Pointer to Ethernet device. 351 * @param keep 352 * Bitmask for flags that must remain untouched. 353 * @param flags 354 * Bitmask for flags to modify. 355 * 356 * @return 357 * 0 on success, a negative errno value otherwise and rte_errno is set. 
358 */ 359 int 360 mlx5_set_flags(struct rte_eth_dev *dev, unsigned int keep, unsigned int flags) 361 { 362 struct ifreq request; 363 int ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &request); 364 365 if (ret) 366 return ret; 367 request.ifr_flags &= keep; 368 request.ifr_flags |= flags & ~keep; 369 return mlx5_ifreq(dev, SIOCSIFFLAGS, &request); 370 } 371 372 /** 373 * DPDK callback for Ethernet device configuration. 374 * 375 * @param dev 376 * Pointer to Ethernet device structure. 377 * 378 * @return 379 * 0 on success, a negative errno value otherwise and rte_errno is set. 380 */ 381 int 382 mlx5_dev_configure(struct rte_eth_dev *dev) 383 { 384 struct mlx5_priv *priv = dev->data->dev_private; 385 unsigned int rxqs_n = dev->data->nb_rx_queues; 386 unsigned int txqs_n = dev->data->nb_tx_queues; 387 unsigned int i; 388 unsigned int j; 389 unsigned int reta_idx_n; 390 const uint8_t use_app_rss_key = 391 !!dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key; 392 int ret = 0; 393 394 if (use_app_rss_key && 395 (dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key_len != 396 MLX5_RSS_HASH_KEY_LEN)) { 397 DRV_LOG(ERR, "port %u RSS key len must be %s Bytes long", 398 dev->data->port_id, RTE_STR(MLX5_RSS_HASH_KEY_LEN)); 399 rte_errno = EINVAL; 400 return -rte_errno; 401 } 402 priv->rss_conf.rss_key = 403 rte_realloc(priv->rss_conf.rss_key, 404 MLX5_RSS_HASH_KEY_LEN, 0); 405 if (!priv->rss_conf.rss_key) { 406 DRV_LOG(ERR, "port %u cannot allocate RSS hash key memory (%u)", 407 dev->data->port_id, rxqs_n); 408 rte_errno = ENOMEM; 409 return -rte_errno; 410 } 411 memcpy(priv->rss_conf.rss_key, 412 use_app_rss_key ? 
413 dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key : 414 rss_hash_default_key, 415 MLX5_RSS_HASH_KEY_LEN); 416 priv->rss_conf.rss_key_len = MLX5_RSS_HASH_KEY_LEN; 417 priv->rss_conf.rss_hf = dev->data->dev_conf.rx_adv_conf.rss_conf.rss_hf; 418 priv->rxqs = (void *)dev->data->rx_queues; 419 priv->txqs = (void *)dev->data->tx_queues; 420 if (txqs_n != priv->txqs_n) { 421 DRV_LOG(INFO, "port %u Tx queues number update: %u -> %u", 422 dev->data->port_id, priv->txqs_n, txqs_n); 423 priv->txqs_n = txqs_n; 424 } 425 if (rxqs_n > priv->config.ind_table_max_size) { 426 DRV_LOG(ERR, "port %u cannot handle this many Rx queues (%u)", 427 dev->data->port_id, rxqs_n); 428 rte_errno = EINVAL; 429 return -rte_errno; 430 } 431 if (rxqs_n != priv->rxqs_n) { 432 DRV_LOG(INFO, "port %u Rx queues number update: %u -> %u", 433 dev->data->port_id, priv->rxqs_n, rxqs_n); 434 priv->rxqs_n = rxqs_n; 435 /* 436 * If the requested number of RX queues is not a power of two, 437 * use the maximum indirection table size for better balancing. 438 * The result is always rounded to the next power of two. 439 */ 440 reta_idx_n = (1 << log2above((rxqs_n & (rxqs_n - 1)) ? 441 priv->config.ind_table_max_size : 442 rxqs_n)); 443 ret = mlx5_rss_reta_index_resize(dev, reta_idx_n); 444 if (ret) 445 return ret; 446 /* 447 * When the number of RX queues is not a power of two, 448 * the remaining table entries are padded with reused WQs 449 * and hashes are not spread uniformly. 450 */ 451 for (i = 0, j = 0; (i != reta_idx_n); ++i) { 452 (*priv->reta_idx)[i] = j; 453 if (++j == rxqs_n) 454 j = 0; 455 } 456 } 457 ret = mlx5_proc_priv_init(dev); 458 if (ret) 459 return ret; 460 return 0; 461 } 462 463 /** 464 * Sets default tuning parameters. 465 * 466 * @param dev 467 * Pointer to Ethernet device. 468 * @param[out] info 469 * Info structure output buffer. 
470 */ 471 static void 472 mlx5_set_default_params(struct rte_eth_dev *dev, struct rte_eth_dev_info *info) 473 { 474 struct mlx5_priv *priv = dev->data->dev_private; 475 476 /* Minimum CPU utilization. */ 477 info->default_rxportconf.ring_size = 256; 478 info->default_txportconf.ring_size = 256; 479 info->default_rxportconf.burst_size = 64; 480 info->default_txportconf.burst_size = 64; 481 if (priv->link_speed_capa & ETH_LINK_SPEED_100G) { 482 info->default_rxportconf.nb_queues = 16; 483 info->default_txportconf.nb_queues = 16; 484 if (dev->data->nb_rx_queues > 2 || 485 dev->data->nb_tx_queues > 2) { 486 /* Max Throughput. */ 487 info->default_rxportconf.ring_size = 2048; 488 info->default_txportconf.ring_size = 2048; 489 } 490 } else { 491 info->default_rxportconf.nb_queues = 8; 492 info->default_txportconf.nb_queues = 8; 493 if (dev->data->nb_rx_queues > 2 || 494 dev->data->nb_tx_queues > 2) { 495 /* Max Throughput. */ 496 info->default_rxportconf.ring_size = 4096; 497 info->default_txportconf.ring_size = 4096; 498 } 499 } 500 } 501 502 /** 503 * DPDK callback to get information about the device. 504 * 505 * @param dev 506 * Pointer to Ethernet device structure. 507 * @param[out] info 508 * Info structure output buffer. 509 */ 510 void 511 mlx5_dev_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *info) 512 { 513 struct mlx5_priv *priv = dev->data->dev_private; 514 struct mlx5_dev_config *config = &priv->config; 515 unsigned int max; 516 char ifname[IF_NAMESIZE]; 517 518 /* FIXME: we should ask the device for these values. */ 519 info->min_rx_bufsize = 32; 520 info->max_rx_pktlen = 65536; 521 /* 522 * Since we need one CQ per QP, the limit is the minimum number 523 * between the two values. 524 */ 525 max = RTE_MIN(priv->sh->device_attr.orig_attr.max_cq, 526 priv->sh->device_attr.orig_attr.max_qp); 527 /* If max >= 65535 then max = 0, max_rx_queues is uint16_t. 
*/ 528 if (max >= 65535) 529 max = 65535; 530 info->max_rx_queues = max; 531 info->max_tx_queues = max; 532 info->max_mac_addrs = MLX5_MAX_UC_MAC_ADDRESSES; 533 info->rx_queue_offload_capa = mlx5_get_rx_queue_offloads(dev); 534 info->rx_offload_capa = (mlx5_get_rx_port_offloads() | 535 info->rx_queue_offload_capa); 536 info->tx_offload_capa = mlx5_get_tx_port_offloads(dev); 537 if (mlx5_get_ifname(dev, &ifname) == 0) 538 info->if_index = if_nametoindex(ifname); 539 info->reta_size = priv->reta_idx_n ? 540 priv->reta_idx_n : config->ind_table_max_size; 541 info->hash_key_size = MLX5_RSS_HASH_KEY_LEN; 542 info->speed_capa = priv->link_speed_capa; 543 info->flow_type_rss_offloads = ~MLX5_RSS_HF_MASK; 544 mlx5_set_default_params(dev, info); 545 info->switch_info.name = dev->data->name; 546 info->switch_info.domain_id = priv->domain_id; 547 info->switch_info.port_id = priv->representor_id; 548 if (priv->representor) { 549 unsigned int i = mlx5_dev_to_port_id(dev->device, NULL, 0); 550 uint16_t port_id[i]; 551 552 i = RTE_MIN(mlx5_dev_to_port_id(dev->device, port_id, i), i); 553 while (i--) { 554 struct mlx5_priv *opriv = 555 rte_eth_devices[port_id[i]].data->dev_private; 556 557 if (!opriv || 558 opriv->representor || 559 opriv->domain_id != priv->domain_id) 560 continue; 561 /* 562 * Override switch name with that of the master 563 * device. 564 */ 565 info->switch_info.name = opriv->dev_data->name; 566 break; 567 } 568 } 569 } 570 571 /** 572 * Get device current raw clock counter 573 * 574 * @param dev 575 * Pointer to Ethernet device structure. 576 * @param[out] time 577 * Current raw clock counter of the device. 
578 * 579 * @return 580 * 0 if the clock has correctly been read 581 * The value of errno in case of error 582 */ 583 int 584 mlx5_read_clock(struct rte_eth_dev *dev, uint64_t *clock) 585 { 586 struct mlx5_priv *priv = dev->data->dev_private; 587 struct ibv_context *ctx = priv->sh->ctx; 588 struct ibv_values_ex values; 589 int err = 0; 590 591 values.comp_mask = IBV_VALUES_MASK_RAW_CLOCK; 592 err = mlx5_glue->query_rt_values_ex(ctx, &values); 593 if (err != 0) { 594 DRV_LOG(WARNING, "Could not query the clock !"); 595 return err; 596 } 597 *clock = values.raw_clock.tv_nsec; 598 return 0; 599 } 600 601 /** 602 * Get firmware version of a device. 603 * 604 * @param dev 605 * Ethernet device port. 606 * @param fw_ver 607 * String output allocated by caller. 608 * @param fw_size 609 * Size of the output string, including terminating null byte. 610 * 611 * @return 612 * 0 on success, or the size of the non truncated string if too big. 613 */ 614 int mlx5_fw_version_get(struct rte_eth_dev *dev, char *fw_ver, size_t fw_size) 615 { 616 struct mlx5_priv *priv = dev->data->dev_private; 617 struct ibv_device_attr *attr = &priv->sh->device_attr.orig_attr; 618 size_t size = strnlen(attr->fw_ver, sizeof(attr->fw_ver)) + 1; 619 620 if (fw_size < size) 621 return size; 622 if (fw_ver != NULL) 623 strlcpy(fw_ver, attr->fw_ver, fw_size); 624 return 0; 625 } 626 627 /** 628 * Get supported packet types. 629 * 630 * @param dev 631 * Pointer to Ethernet device structure. 632 * 633 * @return 634 * A pointer to the supported Packet types array. 
635 */ 636 const uint32_t * 637 mlx5_dev_supported_ptypes_get(struct rte_eth_dev *dev) 638 { 639 static const uint32_t ptypes[] = { 640 /* refers to rxq_cq_to_pkt_type() */ 641 RTE_PTYPE_L2_ETHER, 642 RTE_PTYPE_L3_IPV4_EXT_UNKNOWN, 643 RTE_PTYPE_L3_IPV6_EXT_UNKNOWN, 644 RTE_PTYPE_L4_NONFRAG, 645 RTE_PTYPE_L4_FRAG, 646 RTE_PTYPE_L4_TCP, 647 RTE_PTYPE_L4_UDP, 648 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN, 649 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN, 650 RTE_PTYPE_INNER_L4_NONFRAG, 651 RTE_PTYPE_INNER_L4_FRAG, 652 RTE_PTYPE_INNER_L4_TCP, 653 RTE_PTYPE_INNER_L4_UDP, 654 RTE_PTYPE_UNKNOWN 655 }; 656 657 if (dev->rx_pkt_burst == mlx5_rx_burst || 658 dev->rx_pkt_burst == mlx5_rx_burst_mprq || 659 dev->rx_pkt_burst == mlx5_rx_burst_vec) 660 return ptypes; 661 return NULL; 662 } 663 664 /** 665 * Retrieve the master device for representor in the same switch domain. 666 * 667 * @param dev 668 * Pointer to representor Ethernet device structure. 669 * 670 * @return 671 * Master device structure on success, NULL otherwise. 672 */ 673 674 static struct rte_eth_dev * 675 mlx5_find_master_dev(struct rte_eth_dev *dev) 676 { 677 struct mlx5_priv *priv; 678 uint16_t port_id; 679 uint16_t domain_id; 680 681 priv = dev->data->dev_private; 682 domain_id = priv->domain_id; 683 assert(priv->representor); 684 RTE_ETH_FOREACH_DEV_OF(port_id, dev->device) { 685 priv = rte_eth_devices[port_id].data->dev_private; 686 if (priv && 687 priv->master && 688 priv->domain_id == domain_id) 689 return &rte_eth_devices[port_id]; 690 } 691 return NULL; 692 } 693 694 /** 695 * DPDK callback to retrieve physical link information. 696 * 697 * @param dev 698 * Pointer to Ethernet device structure. 699 * @param[out] link 700 * Storage for current link status. 701 * 702 * @return 703 * 0 on success, a negative errno value otherwise and rte_errno is set. 
704 */ 705 static int 706 mlx5_link_update_unlocked_gset(struct rte_eth_dev *dev, 707 struct rte_eth_link *link) 708 { 709 struct mlx5_priv *priv = dev->data->dev_private; 710 struct ethtool_cmd edata = { 711 .cmd = ETHTOOL_GSET /* Deprecated since Linux v4.5. */ 712 }; 713 struct ifreq ifr; 714 struct rte_eth_link dev_link; 715 int link_speed = 0; 716 int ret; 717 718 ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr); 719 if (ret) { 720 DRV_LOG(WARNING, "port %u ioctl(SIOCGIFFLAGS) failed: %s", 721 dev->data->port_id, strerror(rte_errno)); 722 return ret; 723 } 724 dev_link = (struct rte_eth_link) { 725 .link_status = ((ifr.ifr_flags & IFF_UP) && 726 (ifr.ifr_flags & IFF_RUNNING)), 727 }; 728 ifr = (struct ifreq) { 729 .ifr_data = (void *)&edata, 730 }; 731 ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr); 732 if (ret) { 733 if (ret == -ENOTSUP && priv->representor) { 734 struct rte_eth_dev *master; 735 736 /* 737 * For representors we can try to inherit link 738 * settings from the master device. Actually 739 * link settings do not make a lot of sense 740 * for representors due to missing physical 741 * link. The old kernel drivers supported 742 * emulated settings query for representors, 743 * the new ones do not, so we have to add 744 * this code for compatibility issues. 
745 */ 746 master = mlx5_find_master_dev(dev); 747 if (master) { 748 ifr = (struct ifreq) { 749 .ifr_data = (void *)&edata, 750 }; 751 ret = mlx5_ifreq(master, SIOCETHTOOL, &ifr); 752 } 753 } 754 if (ret) { 755 DRV_LOG(WARNING, 756 "port %u ioctl(SIOCETHTOOL," 757 " ETHTOOL_GSET) failed: %s", 758 dev->data->port_id, strerror(rte_errno)); 759 return ret; 760 } 761 } 762 link_speed = ethtool_cmd_speed(&edata); 763 if (link_speed == -1) 764 dev_link.link_speed = ETH_SPEED_NUM_NONE; 765 else 766 dev_link.link_speed = link_speed; 767 priv->link_speed_capa = 0; 768 if (edata.supported & SUPPORTED_Autoneg) 769 priv->link_speed_capa |= ETH_LINK_SPEED_AUTONEG; 770 if (edata.supported & (SUPPORTED_1000baseT_Full | 771 SUPPORTED_1000baseKX_Full)) 772 priv->link_speed_capa |= ETH_LINK_SPEED_1G; 773 if (edata.supported & SUPPORTED_10000baseKR_Full) 774 priv->link_speed_capa |= ETH_LINK_SPEED_10G; 775 if (edata.supported & (SUPPORTED_40000baseKR4_Full | 776 SUPPORTED_40000baseCR4_Full | 777 SUPPORTED_40000baseSR4_Full | 778 SUPPORTED_40000baseLR4_Full)) 779 priv->link_speed_capa |= ETH_LINK_SPEED_40G; 780 dev_link.link_duplex = ((edata.duplex == DUPLEX_HALF) ? 781 ETH_LINK_HALF_DUPLEX : ETH_LINK_FULL_DUPLEX); 782 dev_link.link_autoneg = !(dev->data->dev_conf.link_speeds & 783 ETH_LINK_SPEED_FIXED); 784 if (((dev_link.link_speed && !dev_link.link_status) || 785 (!dev_link.link_speed && dev_link.link_status))) { 786 rte_errno = EAGAIN; 787 return -rte_errno; 788 } 789 *link = dev_link; 790 return 0; 791 } 792 793 /** 794 * Retrieve physical link information (unlocked version using new ioctl). 795 * 796 * @param dev 797 * Pointer to Ethernet device structure. 798 * @param[out] link 799 * Storage for current link status. 800 * 801 * @return 802 * 0 on success, a negative errno value otherwise and rte_errno is set. 
803 */ 804 static int 805 mlx5_link_update_unlocked_gs(struct rte_eth_dev *dev, 806 struct rte_eth_link *link) 807 808 { 809 struct mlx5_priv *priv = dev->data->dev_private; 810 struct ethtool_link_settings gcmd = { .cmd = ETHTOOL_GLINKSETTINGS }; 811 struct ifreq ifr; 812 struct rte_eth_link dev_link; 813 struct rte_eth_dev *master = NULL; 814 uint64_t sc; 815 int ret; 816 817 ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr); 818 if (ret) { 819 DRV_LOG(WARNING, "port %u ioctl(SIOCGIFFLAGS) failed: %s", 820 dev->data->port_id, strerror(rte_errno)); 821 return ret; 822 } 823 dev_link = (struct rte_eth_link) { 824 .link_status = ((ifr.ifr_flags & IFF_UP) && 825 (ifr.ifr_flags & IFF_RUNNING)), 826 }; 827 ifr = (struct ifreq) { 828 .ifr_data = (void *)&gcmd, 829 }; 830 ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr); 831 if (ret) { 832 if (ret == -ENOTSUP && priv->representor) { 833 /* 834 * For representors we can try to inherit link 835 * settings from the master device. Actually 836 * link settings do not make a lot of sense 837 * for representors due to missing physical 838 * link. The old kernel drivers supported 839 * emulated settings query for representors, 840 * the new ones do not, so we have to add 841 * this code for compatibility issues. 
842 */ 843 master = mlx5_find_master_dev(dev); 844 if (master) { 845 ifr = (struct ifreq) { 846 .ifr_data = (void *)&gcmd, 847 }; 848 ret = mlx5_ifreq(master, SIOCETHTOOL, &ifr); 849 } 850 } 851 if (ret) { 852 DRV_LOG(DEBUG, 853 "port %u ioctl(SIOCETHTOOL," 854 " ETHTOOL_GLINKSETTINGS) failed: %s", 855 dev->data->port_id, strerror(rte_errno)); 856 return ret; 857 } 858 859 } 860 gcmd.link_mode_masks_nwords = -gcmd.link_mode_masks_nwords; 861 862 alignas(struct ethtool_link_settings) 863 uint8_t data[offsetof(struct ethtool_link_settings, link_mode_masks) + 864 sizeof(uint32_t) * gcmd.link_mode_masks_nwords * 3]; 865 struct ethtool_link_settings *ecmd = (void *)data; 866 867 *ecmd = gcmd; 868 ifr.ifr_data = (void *)ecmd; 869 ret = mlx5_ifreq(master ? master : dev, SIOCETHTOOL, &ifr); 870 if (ret) { 871 DRV_LOG(DEBUG, 872 "port %u ioctl(SIOCETHTOOL," 873 "ETHTOOL_GLINKSETTINGS) failed: %s", 874 dev->data->port_id, strerror(rte_errno)); 875 return ret; 876 } 877 dev_link.link_speed = ecmd->speed; 878 sc = ecmd->link_mode_masks[0] | 879 ((uint64_t)ecmd->link_mode_masks[1] << 32); 880 priv->link_speed_capa = 0; 881 if (sc & MLX5_BITSHIFT(ETHTOOL_LINK_MODE_Autoneg_BIT)) 882 priv->link_speed_capa |= ETH_LINK_SPEED_AUTONEG; 883 if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_1000baseT_Full_BIT) | 884 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_1000baseKX_Full_BIT))) 885 priv->link_speed_capa |= ETH_LINK_SPEED_1G; 886 if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT) | 887 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_10000baseKR_Full_BIT) | 888 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_10000baseR_FEC_BIT))) 889 priv->link_speed_capa |= ETH_LINK_SPEED_10G; 890 if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_20000baseMLD2_Full_BIT) | 891 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_20000baseKR2_Full_BIT))) 892 priv->link_speed_capa |= ETH_LINK_SPEED_20G; 893 if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT) | 894 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT) | 895 
MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT) | 896 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT))) 897 priv->link_speed_capa |= ETH_LINK_SPEED_40G; 898 if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseKR4_Full_BIT) | 899 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseCR4_Full_BIT) | 900 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseSR4_Full_BIT) | 901 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseLR4_Full_BIT))) 902 priv->link_speed_capa |= ETH_LINK_SPEED_56G; 903 if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_25000baseCR_Full_BIT) | 904 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_25000baseKR_Full_BIT) | 905 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_25000baseSR_Full_BIT))) 906 priv->link_speed_capa |= ETH_LINK_SPEED_25G; 907 if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_50000baseCR2_Full_BIT) | 908 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_50000baseKR2_Full_BIT))) 909 priv->link_speed_capa |= ETH_LINK_SPEED_50G; 910 if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT) | 911 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseSR4_Full_BIT) | 912 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT) | 913 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT))) 914 priv->link_speed_capa |= ETH_LINK_SPEED_100G; 915 dev_link.link_duplex = ((ecmd->duplex == DUPLEX_HALF) ? 916 ETH_LINK_HALF_DUPLEX : ETH_LINK_FULL_DUPLEX); 917 dev_link.link_autoneg = !(dev->data->dev_conf.link_speeds & 918 ETH_LINK_SPEED_FIXED); 919 if (((dev_link.link_speed && !dev_link.link_status) || 920 (!dev_link.link_speed && dev_link.link_status))) { 921 rte_errno = EAGAIN; 922 return -rte_errno; 923 } 924 *link = dev_link; 925 return 0; 926 } 927 928 /** 929 * DPDK callback to retrieve physical link information. 930 * 931 * @param dev 932 * Pointer to Ethernet device structure. 933 * @param wait_to_complete 934 * Wait for request completion. 935 * 936 * @return 937 * 0 if link status was not updated, positive if it was, a negative errno 938 * value otherwise and rte_errno is set. 
939 */ 940 int 941 mlx5_link_update(struct rte_eth_dev *dev, int wait_to_complete) 942 { 943 int ret; 944 struct rte_eth_link dev_link; 945 time_t start_time = time(NULL); 946 947 do { 948 ret = mlx5_link_update_unlocked_gs(dev, &dev_link); 949 if (ret) 950 ret = mlx5_link_update_unlocked_gset(dev, &dev_link); 951 if (ret == 0) 952 break; 953 /* Handle wait to complete situation. */ 954 if (wait_to_complete && ret == -EAGAIN) { 955 if (abs((int)difftime(time(NULL), start_time)) < 956 MLX5_LINK_STATUS_TIMEOUT) { 957 usleep(0); 958 continue; 959 } else { 960 rte_errno = EBUSY; 961 return -rte_errno; 962 } 963 } else if (ret < 0) { 964 return ret; 965 } 966 } while (wait_to_complete); 967 ret = !!memcmp(&dev->data->dev_link, &dev_link, 968 sizeof(struct rte_eth_link)); 969 dev->data->dev_link = dev_link; 970 return ret; 971 } 972 973 /** 974 * DPDK callback to change the MTU. 975 * 976 * @param dev 977 * Pointer to Ethernet device structure. 978 * @param in_mtu 979 * New MTU. 980 * 981 * @return 982 * 0 on success, a negative errno value otherwise and rte_errno is set. 983 */ 984 int 985 mlx5_dev_set_mtu(struct rte_eth_dev *dev, uint16_t mtu) 986 { 987 struct mlx5_priv *priv = dev->data->dev_private; 988 uint16_t kern_mtu = 0; 989 int ret; 990 991 ret = mlx5_get_mtu(dev, &kern_mtu); 992 if (ret) 993 return ret; 994 /* Set kernel interface MTU first. */ 995 ret = mlx5_set_mtu(dev, mtu); 996 if (ret) 997 return ret; 998 ret = mlx5_get_mtu(dev, &kern_mtu); 999 if (ret) 1000 return ret; 1001 if (kern_mtu == mtu) { 1002 priv->mtu = mtu; 1003 DRV_LOG(DEBUG, "port %u adapter MTU set to %u", 1004 dev->data->port_id, mtu); 1005 return 0; 1006 } 1007 rte_errno = EAGAIN; 1008 return -rte_errno; 1009 } 1010 1011 /** 1012 * DPDK callback to get flow control status. 1013 * 1014 * @param dev 1015 * Pointer to Ethernet device structure. 1016 * @param[out] fc_conf 1017 * Flow control output buffer. 
1018 * 1019 * @return 1020 * 0 on success, a negative errno value otherwise and rte_errno is set. 1021 */ 1022 int 1023 mlx5_dev_get_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf) 1024 { 1025 struct ifreq ifr; 1026 struct ethtool_pauseparam ethpause = { 1027 .cmd = ETHTOOL_GPAUSEPARAM 1028 }; 1029 int ret; 1030 1031 ifr.ifr_data = (void *)ðpause; 1032 ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr); 1033 if (ret) { 1034 DRV_LOG(WARNING, 1035 "port %u ioctl(SIOCETHTOOL, ETHTOOL_GPAUSEPARAM) failed:" 1036 " %s", 1037 dev->data->port_id, strerror(rte_errno)); 1038 return ret; 1039 } 1040 fc_conf->autoneg = ethpause.autoneg; 1041 if (ethpause.rx_pause && ethpause.tx_pause) 1042 fc_conf->mode = RTE_FC_FULL; 1043 else if (ethpause.rx_pause) 1044 fc_conf->mode = RTE_FC_RX_PAUSE; 1045 else if (ethpause.tx_pause) 1046 fc_conf->mode = RTE_FC_TX_PAUSE; 1047 else 1048 fc_conf->mode = RTE_FC_NONE; 1049 return 0; 1050 } 1051 1052 /** 1053 * DPDK callback to modify flow control parameters. 1054 * 1055 * @param dev 1056 * Pointer to Ethernet device structure. 1057 * @param[in] fc_conf 1058 * Flow control parameters. 1059 * 1060 * @return 1061 * 0 on success, a negative errno value otherwise and rte_errno is set. 
1062 */ 1063 int 1064 mlx5_dev_set_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf) 1065 { 1066 struct ifreq ifr; 1067 struct ethtool_pauseparam ethpause = { 1068 .cmd = ETHTOOL_SPAUSEPARAM 1069 }; 1070 int ret; 1071 1072 ifr.ifr_data = (void *)ðpause; 1073 ethpause.autoneg = fc_conf->autoneg; 1074 if (((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL) || 1075 (fc_conf->mode & RTE_FC_RX_PAUSE)) 1076 ethpause.rx_pause = 1; 1077 else 1078 ethpause.rx_pause = 0; 1079 1080 if (((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL) || 1081 (fc_conf->mode & RTE_FC_TX_PAUSE)) 1082 ethpause.tx_pause = 1; 1083 else 1084 ethpause.tx_pause = 0; 1085 ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr); 1086 if (ret) { 1087 DRV_LOG(WARNING, 1088 "port %u ioctl(SIOCETHTOOL, ETHTOOL_SPAUSEPARAM)" 1089 " failed: %s", 1090 dev->data->port_id, strerror(rte_errno)); 1091 return ret; 1092 } 1093 return 0; 1094 } 1095 1096 /** 1097 * Get PCI information from struct ibv_device. 1098 * 1099 * @param device 1100 * Pointer to Ethernet device structure. 1101 * @param[out] pci_addr 1102 * PCI bus address output buffer. 1103 * 1104 * @return 1105 * 0 on success, a negative errno value otherwise and rte_errno is set. 1106 */ 1107 int 1108 mlx5_ibv_device_to_pci_addr(const struct ibv_device *device, 1109 struct rte_pci_addr *pci_addr) 1110 { 1111 FILE *file; 1112 char line[32]; 1113 MKSTR(path, "%s/device/uevent", device->ibdev_path); 1114 1115 file = fopen(path, "rb"); 1116 if (file == NULL) { 1117 rte_errno = errno; 1118 return -rte_errno; 1119 } 1120 while (fgets(line, sizeof(line), file) == line) { 1121 size_t len = strlen(line); 1122 int ret; 1123 1124 /* Truncate long lines. */ 1125 if (len == (sizeof(line) - 1)) 1126 while (line[(len - 1)] != '\n') { 1127 ret = fgetc(file); 1128 if (ret == EOF) 1129 break; 1130 line[(len - 1)] = ret; 1131 } 1132 /* Extract information. 
*/ 1133 if (sscanf(line, 1134 "PCI_SLOT_NAME=" 1135 "%" SCNx32 ":%" SCNx8 ":%" SCNx8 ".%" SCNx8 "\n", 1136 &pci_addr->domain, 1137 &pci_addr->bus, 1138 &pci_addr->devid, 1139 &pci_addr->function) == 4) { 1140 ret = 0; 1141 break; 1142 } 1143 } 1144 fclose(file); 1145 return 0; 1146 } 1147 1148 /** 1149 * Handle asynchronous removal event for entire multiport device. 1150 * 1151 * @param sh 1152 * Infiniband device shared context. 1153 */ 1154 static void 1155 mlx5_dev_interrupt_device_fatal(struct mlx5_ibv_shared *sh) 1156 { 1157 uint32_t i; 1158 1159 for (i = 0; i < sh->max_port; ++i) { 1160 struct rte_eth_dev *dev; 1161 1162 if (sh->port[i].ih_port_id >= RTE_MAX_ETHPORTS) { 1163 /* 1164 * Or not existing port either no 1165 * handler installed for this port. 1166 */ 1167 continue; 1168 } 1169 dev = &rte_eth_devices[sh->port[i].ih_port_id]; 1170 assert(dev); 1171 if (dev->data->dev_conf.intr_conf.rmv) 1172 _rte_eth_dev_callback_process 1173 (dev, RTE_ETH_EVENT_INTR_RMV, NULL); 1174 } 1175 } 1176 1177 /** 1178 * Handle shared asynchronous events the NIC (removal event 1179 * and link status change). Supports multiport IB device. 1180 * 1181 * @param cb_arg 1182 * Callback argument. 1183 */ 1184 void 1185 mlx5_dev_interrupt_handler(void *cb_arg) 1186 { 1187 struct mlx5_ibv_shared *sh = cb_arg; 1188 struct ibv_async_event event; 1189 1190 /* Read all message from the IB device and acknowledge them. */ 1191 for (;;) { 1192 struct rte_eth_dev *dev; 1193 uint32_t tmp; 1194 1195 if (mlx5_glue->get_async_event(sh->ctx, &event)) 1196 break; 1197 /* Retrieve and check IB port index. */ 1198 tmp = (uint32_t)event.element.port_num; 1199 if (!tmp && event.event_type == IBV_EVENT_DEVICE_FATAL) { 1200 /* 1201 * The DEVICE_FATAL event is called once for 1202 * entire device without port specifying. 1203 * We should notify all existing ports. 
1204 */ 1205 mlx5_glue->ack_async_event(&event); 1206 mlx5_dev_interrupt_device_fatal(sh); 1207 continue; 1208 } 1209 assert(tmp && (tmp <= sh->max_port)); 1210 if (!tmp) { 1211 /* Unsupported devive level event. */ 1212 mlx5_glue->ack_async_event(&event); 1213 DRV_LOG(DEBUG, 1214 "unsupported common event (type %d)", 1215 event.event_type); 1216 continue; 1217 } 1218 if (tmp > sh->max_port) { 1219 /* Invalid IB port index. */ 1220 mlx5_glue->ack_async_event(&event); 1221 DRV_LOG(DEBUG, 1222 "cannot handle an event (type %d)" 1223 "due to invalid IB port index (%u)", 1224 event.event_type, tmp); 1225 continue; 1226 } 1227 if (sh->port[tmp - 1].ih_port_id >= RTE_MAX_ETHPORTS) { 1228 /* No handler installed. */ 1229 mlx5_glue->ack_async_event(&event); 1230 DRV_LOG(DEBUG, 1231 "cannot handle an event (type %d)" 1232 "due to no handler installed for port %u", 1233 event.event_type, tmp); 1234 continue; 1235 } 1236 /* Retrieve ethernet device descriptor. */ 1237 tmp = sh->port[tmp - 1].ih_port_id; 1238 dev = &rte_eth_devices[tmp]; 1239 assert(dev); 1240 if ((event.event_type == IBV_EVENT_PORT_ACTIVE || 1241 event.event_type == IBV_EVENT_PORT_ERR) && 1242 dev->data->dev_conf.intr_conf.lsc) { 1243 mlx5_glue->ack_async_event(&event); 1244 if (mlx5_link_update(dev, 0) == -EAGAIN) { 1245 usleep(0); 1246 continue; 1247 } 1248 _rte_eth_dev_callback_process 1249 (dev, RTE_ETH_EVENT_INTR_LSC, NULL); 1250 continue; 1251 } 1252 DRV_LOG(DEBUG, 1253 "port %u cannot handle an unknown event (type %d)", 1254 dev->data->port_id, event.event_type); 1255 mlx5_glue->ack_async_event(&event); 1256 } 1257 } 1258 1259 /** 1260 * Uninstall shared asynchronous device events handler. 1261 * This function is implemeted to support event sharing 1262 * between multiple ports of single IB device. 1263 * 1264 * @param dev 1265 * Pointer to Ethernet device. 
1266 */ 1267 static void 1268 mlx5_dev_shared_handler_uninstall(struct rte_eth_dev *dev) 1269 { 1270 struct mlx5_priv *priv = dev->data->dev_private; 1271 struct mlx5_ibv_shared *sh = priv->sh; 1272 1273 if (rte_eal_process_type() != RTE_PROC_PRIMARY) 1274 return; 1275 pthread_mutex_lock(&sh->intr_mutex); 1276 assert(priv->ibv_port); 1277 assert(priv->ibv_port <= sh->max_port); 1278 assert(dev->data->port_id < RTE_MAX_ETHPORTS); 1279 if (sh->port[priv->ibv_port - 1].ih_port_id >= RTE_MAX_ETHPORTS) 1280 goto exit; 1281 assert(sh->port[priv->ibv_port - 1].ih_port_id == 1282 (uint32_t)dev->data->port_id); 1283 assert(sh->intr_cnt); 1284 sh->port[priv->ibv_port - 1].ih_port_id = RTE_MAX_ETHPORTS; 1285 if (!sh->intr_cnt || --sh->intr_cnt) 1286 goto exit; 1287 rte_intr_callback_unregister(&sh->intr_handle, 1288 mlx5_dev_interrupt_handler, sh); 1289 sh->intr_handle.fd = 0; 1290 sh->intr_handle.type = RTE_INTR_HANDLE_UNKNOWN; 1291 exit: 1292 pthread_mutex_unlock(&sh->intr_mutex); 1293 } 1294 1295 /** 1296 * Install shared asyncronous device events handler. 1297 * This function is implemeted to support event sharing 1298 * between multiple ports of single IB device. 1299 * 1300 * @param dev 1301 * Pointer to Ethernet device. 1302 */ 1303 static void 1304 mlx5_dev_shared_handler_install(struct rte_eth_dev *dev) 1305 { 1306 struct mlx5_priv *priv = dev->data->dev_private; 1307 struct mlx5_ibv_shared *sh = priv->sh; 1308 int ret; 1309 int flags; 1310 1311 if (rte_eal_process_type() != RTE_PROC_PRIMARY) 1312 return; 1313 pthread_mutex_lock(&sh->intr_mutex); 1314 assert(priv->ibv_port); 1315 assert(priv->ibv_port <= sh->max_port); 1316 assert(dev->data->port_id < RTE_MAX_ETHPORTS); 1317 if (sh->port[priv->ibv_port - 1].ih_port_id < RTE_MAX_ETHPORTS) { 1318 /* The handler is already installed for this port. 
*/ 1319 assert(sh->intr_cnt); 1320 goto exit; 1321 } 1322 sh->port[priv->ibv_port - 1].ih_port_id = (uint32_t)dev->data->port_id; 1323 if (sh->intr_cnt) { 1324 sh->intr_cnt++; 1325 goto exit; 1326 } 1327 /* No shared handler installed. */ 1328 assert(sh->ctx->async_fd > 0); 1329 flags = fcntl(sh->ctx->async_fd, F_GETFL); 1330 ret = fcntl(sh->ctx->async_fd, F_SETFL, flags | O_NONBLOCK); 1331 if (ret) { 1332 DRV_LOG(INFO, "failed to change file descriptor" 1333 " async event queue"); 1334 /* Indicate there will be no interrupts. */ 1335 dev->data->dev_conf.intr_conf.lsc = 0; 1336 dev->data->dev_conf.intr_conf.rmv = 0; 1337 sh->port[priv->ibv_port - 1].ih_port_id = RTE_MAX_ETHPORTS; 1338 goto exit; 1339 } 1340 sh->intr_handle.fd = sh->ctx->async_fd; 1341 sh->intr_handle.type = RTE_INTR_HANDLE_EXT; 1342 rte_intr_callback_register(&sh->intr_handle, 1343 mlx5_dev_interrupt_handler, sh); 1344 sh->intr_cnt++; 1345 exit: 1346 pthread_mutex_unlock(&sh->intr_mutex); 1347 } 1348 1349 /** 1350 * Uninstall interrupt handler. 1351 * 1352 * @param dev 1353 * Pointer to Ethernet device. 1354 */ 1355 void 1356 mlx5_dev_interrupt_handler_uninstall(struct rte_eth_dev *dev) 1357 { 1358 mlx5_dev_shared_handler_uninstall(dev); 1359 } 1360 1361 /** 1362 * Install interrupt handler. 1363 * 1364 * @param dev 1365 * Pointer to Ethernet device. 1366 */ 1367 void 1368 mlx5_dev_interrupt_handler_install(struct rte_eth_dev *dev) 1369 { 1370 mlx5_dev_shared_handler_install(dev); 1371 } 1372 1373 /** 1374 * DPDK callback to bring the link DOWN. 1375 * 1376 * @param dev 1377 * Pointer to Ethernet device structure. 1378 * 1379 * @return 1380 * 0 on success, a negative errno value otherwise and rte_errno is set. 1381 */ 1382 int 1383 mlx5_set_link_down(struct rte_eth_dev *dev) 1384 { 1385 return mlx5_set_flags(dev, ~IFF_UP, ~IFF_UP); 1386 } 1387 1388 /** 1389 * DPDK callback to bring the link UP. 1390 * 1391 * @param dev 1392 * Pointer to Ethernet device structure. 
1393 * 1394 * @return 1395 * 0 on success, a negative errno value otherwise and rte_errno is set. 1396 */ 1397 int 1398 mlx5_set_link_up(struct rte_eth_dev *dev) 1399 { 1400 return mlx5_set_flags(dev, ~IFF_UP, IFF_UP); 1401 } 1402 1403 /** 1404 * Configure the TX function to use. 1405 * 1406 * @param dev 1407 * Pointer to private data structure. 1408 * 1409 * @return 1410 * Pointer to selected Tx burst function. 1411 */ 1412 eth_tx_burst_t 1413 mlx5_select_tx_function(struct rte_eth_dev *dev) 1414 { 1415 struct mlx5_priv *priv = dev->data->dev_private; 1416 eth_tx_burst_t tx_pkt_burst = mlx5_tx_burst; 1417 struct mlx5_dev_config *config = &priv->config; 1418 uint64_t tx_offloads = dev->data->dev_conf.txmode.offloads; 1419 int tso = !!(tx_offloads & (DEV_TX_OFFLOAD_TCP_TSO | 1420 DEV_TX_OFFLOAD_VXLAN_TNL_TSO | 1421 DEV_TX_OFFLOAD_GRE_TNL_TSO | 1422 DEV_TX_OFFLOAD_IP_TNL_TSO | 1423 DEV_TX_OFFLOAD_UDP_TNL_TSO)); 1424 int swp = !!(tx_offloads & (DEV_TX_OFFLOAD_IP_TNL_TSO | 1425 DEV_TX_OFFLOAD_UDP_TNL_TSO | 1426 DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM)); 1427 int vlan_insert = !!(tx_offloads & DEV_TX_OFFLOAD_VLAN_INSERT); 1428 1429 assert(priv != NULL); 1430 /* Select appropriate TX function. 
*/ 1431 if (vlan_insert || tso || swp) 1432 return tx_pkt_burst; 1433 if (config->mps == MLX5_MPW_ENHANCED) { 1434 if (mlx5_check_vec_tx_support(dev) > 0) { 1435 if (mlx5_check_raw_vec_tx_support(dev) > 0) 1436 tx_pkt_burst = mlx5_tx_burst_raw_vec; 1437 else 1438 tx_pkt_burst = mlx5_tx_burst_vec; 1439 DRV_LOG(DEBUG, 1440 "port %u selected enhanced MPW Tx vectorized" 1441 " function", 1442 dev->data->port_id); 1443 } else { 1444 tx_pkt_burst = mlx5_tx_burst_empw; 1445 DRV_LOG(DEBUG, 1446 "port %u selected enhanced MPW Tx function", 1447 dev->data->port_id); 1448 } 1449 } else if (config->mps && (config->txq_inline > 0)) { 1450 tx_pkt_burst = mlx5_tx_burst_mpw_inline; 1451 DRV_LOG(DEBUG, "port %u selected MPW inline Tx function", 1452 dev->data->port_id); 1453 } else if (config->mps) { 1454 tx_pkt_burst = mlx5_tx_burst_mpw; 1455 DRV_LOG(DEBUG, "port %u selected MPW Tx function", 1456 dev->data->port_id); 1457 } 1458 return tx_pkt_burst; 1459 } 1460 1461 /** 1462 * Configure the RX function to use. 1463 * 1464 * @param dev 1465 * Pointer to private data structure. 1466 * 1467 * @return 1468 * Pointer to selected Rx burst function. 1469 */ 1470 eth_rx_burst_t 1471 mlx5_select_rx_function(struct rte_eth_dev *dev) 1472 { 1473 eth_rx_burst_t rx_pkt_burst = mlx5_rx_burst; 1474 1475 assert(dev != NULL); 1476 if (mlx5_check_vec_rx_support(dev) > 0) { 1477 rx_pkt_burst = mlx5_rx_burst_vec; 1478 DRV_LOG(DEBUG, "port %u selected Rx vectorized function", 1479 dev->data->port_id); 1480 } else if (mlx5_mprq_enabled(dev)) { 1481 rx_pkt_burst = mlx5_rx_burst_mprq; 1482 } 1483 return rx_pkt_burst; 1484 } 1485 1486 /** 1487 * Check if mlx5 device was removed. 1488 * 1489 * @param dev 1490 * Pointer to Ethernet device structure. 1491 * 1492 * @return 1493 * 1 when device is removed, otherwise 0. 
1494 */ 1495 int 1496 mlx5_is_removed(struct rte_eth_dev *dev) 1497 { 1498 struct ibv_device_attr device_attr; 1499 struct mlx5_priv *priv = dev->data->dev_private; 1500 1501 if (mlx5_glue->query_device(priv->sh->ctx, &device_attr) == EIO) 1502 return 1; 1503 return 0; 1504 } 1505 1506 /** 1507 * Get port ID list of mlx5 instances sharing a common device. 1508 * 1509 * @param[in] dev 1510 * Device to look for. 1511 * @param[out] port_list 1512 * Result buffer for collected port IDs. 1513 * @param port_list_n 1514 * Maximum number of entries in result buffer. If 0, @p port_list can be 1515 * NULL. 1516 * 1517 * @return 1518 * Number of matching instances regardless of the @p port_list_n 1519 * parameter, 0 if none were found. 1520 */ 1521 unsigned int 1522 mlx5_dev_to_port_id(const struct rte_device *dev, uint16_t *port_list, 1523 unsigned int port_list_n) 1524 { 1525 uint16_t id; 1526 unsigned int n = 0; 1527 1528 RTE_ETH_FOREACH_DEV_OF(id, dev) { 1529 if (n < port_list_n) 1530 port_list[n] = id; 1531 n++; 1532 } 1533 return n; 1534 } 1535 1536 /** 1537 * Get the E-Switch domain id this port belongs to. 1538 * 1539 * @param[in] port 1540 * Device port id. 1541 * @param[out] es_domain_id 1542 * E-Switch domain id. 1543 * @param[out] es_port_id 1544 * The port id of the port in the E-Switch. 1545 * 1546 * @return 1547 * 0 on success, a negative errno value otherwise and rte_errno is set. 
1548 */ 1549 int 1550 mlx5_port_to_eswitch_info(uint16_t port, 1551 uint16_t *es_domain_id, uint16_t *es_port_id) 1552 { 1553 struct rte_eth_dev *dev; 1554 struct mlx5_priv *priv; 1555 1556 if (port >= RTE_MAX_ETHPORTS) { 1557 rte_errno = EINVAL; 1558 return -rte_errno; 1559 } 1560 if (!rte_eth_dev_is_valid_port(port)) { 1561 rte_errno = ENODEV; 1562 return -rte_errno; 1563 } 1564 dev = &rte_eth_devices[port]; 1565 priv = dev->data->dev_private; 1566 if (!(priv->representor || priv->master)) { 1567 rte_errno = EINVAL; 1568 return -rte_errno; 1569 } 1570 if (es_domain_id) 1571 *es_domain_id = priv->domain_id; 1572 if (es_port_id) 1573 *es_port_id = priv->vport_id; 1574 return 0; 1575 } 1576 1577 /** 1578 * Get switch information associated with network interface. 1579 * 1580 * @param ifindex 1581 * Network interface index. 1582 * @param[out] info 1583 * Switch information object, populated in case of success. 1584 * 1585 * @return 1586 * 0 on success, a negative errno value otherwise and rte_errno is set. 
1587 */ 1588 int 1589 mlx5_sysfs_switch_info(unsigned int ifindex, struct mlx5_switch_info *info) 1590 { 1591 char ifname[IF_NAMESIZE]; 1592 char port_name[IF_NAMESIZE]; 1593 FILE *file; 1594 struct mlx5_switch_info data = { 1595 .master = 0, 1596 .representor = 0, 1597 .name_type = MLX5_PHYS_PORT_NAME_TYPE_NOTSET, 1598 .port_name = 0, 1599 .switch_id = 0, 1600 }; 1601 DIR *dir; 1602 bool port_switch_id_set = false; 1603 bool device_dir = false; 1604 char c; 1605 int ret; 1606 1607 if (!if_indextoname(ifindex, ifname)) { 1608 rte_errno = errno; 1609 return -rte_errno; 1610 } 1611 1612 MKSTR(phys_port_name, "/sys/class/net/%s/phys_port_name", 1613 ifname); 1614 MKSTR(phys_switch_id, "/sys/class/net/%s/phys_switch_id", 1615 ifname); 1616 MKSTR(pci_device, "/sys/class/net/%s/device", 1617 ifname); 1618 1619 file = fopen(phys_port_name, "rb"); 1620 if (file != NULL) { 1621 ret = fscanf(file, "%s", port_name); 1622 fclose(file); 1623 if (ret == 1) 1624 mlx5_translate_port_name(port_name, &data); 1625 } 1626 file = fopen(phys_switch_id, "rb"); 1627 if (file == NULL) { 1628 rte_errno = errno; 1629 return -rte_errno; 1630 } 1631 port_switch_id_set = 1632 fscanf(file, "%" SCNx64 "%c", &data.switch_id, &c) == 2 && 1633 c == '\n'; 1634 fclose(file); 1635 dir = opendir(pci_device); 1636 if (dir != NULL) { 1637 closedir(dir); 1638 device_dir = true; 1639 } 1640 if (port_switch_id_set) { 1641 /* We have some E-Switch configuration. */ 1642 mlx5_sysfs_check_switch_info(device_dir, &data); 1643 } 1644 *info = data; 1645 assert(!(data.master && data.representor)); 1646 if (data.master && data.representor) { 1647 DRV_LOG(ERR, "ifindex %u device is recognized as master" 1648 " and as representor", ifindex); 1649 rte_errno = ENODEV; 1650 return -rte_errno; 1651 } 1652 return 0; 1653 } 1654 1655 /** 1656 * Analyze gathered port parameters via Netlink to recognize master 1657 * and representor devices for E-Switch configuration. 
1658 * 1659 * @param[in] num_vf_set 1660 * flag of presence of number of VFs port attribute. 1661 * @param[inout] switch_info 1662 * Port information, including port name as a number and port name 1663 * type if recognized 1664 * 1665 * @return 1666 * master and representor flags are set in switch_info according to 1667 * recognized parameters (if any). 1668 */ 1669 void 1670 mlx5_nl_check_switch_info(bool num_vf_set, 1671 struct mlx5_switch_info *switch_info) 1672 { 1673 switch (switch_info->name_type) { 1674 case MLX5_PHYS_PORT_NAME_TYPE_UNKNOWN: 1675 /* 1676 * Name is not recognized, assume the master, 1677 * check the number of VFs key presence. 1678 */ 1679 switch_info->master = num_vf_set; 1680 break; 1681 case MLX5_PHYS_PORT_NAME_TYPE_NOTSET: 1682 /* 1683 * Name is not set, this assumes the legacy naming 1684 * schema for master, just check if there is a 1685 * number of VFs key. 1686 */ 1687 switch_info->master = num_vf_set; 1688 break; 1689 case MLX5_PHYS_PORT_NAME_TYPE_UPLINK: 1690 /* New uplink naming schema recognized. */ 1691 switch_info->master = 1; 1692 break; 1693 case MLX5_PHYS_PORT_NAME_TYPE_LEGACY: 1694 /* Legacy representors naming schema. */ 1695 switch_info->representor = !num_vf_set; 1696 break; 1697 case MLX5_PHYS_PORT_NAME_TYPE_PFVF: 1698 /* New representors naming schema. */ 1699 switch_info->representor = 1; 1700 break; 1701 } 1702 } 1703 1704 /** 1705 * Analyze gathered port parameters via sysfs to recognize master 1706 * and representor devices for E-Switch configuration. 1707 * 1708 * @param[in] device_dir 1709 * flag of presence of "device" directory under port device key. 1710 * @param[inout] switch_info 1711 * Port information, including port name as a number and port name 1712 * type if recognized 1713 * 1714 * @return 1715 * master and representor flags are set in switch_info according to 1716 * recognized parameters (if any). 
1717 */ 1718 void 1719 mlx5_sysfs_check_switch_info(bool device_dir, 1720 struct mlx5_switch_info *switch_info) 1721 { 1722 switch (switch_info->name_type) { 1723 case MLX5_PHYS_PORT_NAME_TYPE_UNKNOWN: 1724 /* 1725 * Name is not recognized, assume the master, 1726 * check the device directory presence. 1727 */ 1728 switch_info->master = device_dir; 1729 break; 1730 case MLX5_PHYS_PORT_NAME_TYPE_NOTSET: 1731 /* 1732 * Name is not set, this assumes the legacy naming 1733 * schema for master, just check if there is 1734 * a device directory. 1735 */ 1736 switch_info->master = device_dir; 1737 break; 1738 case MLX5_PHYS_PORT_NAME_TYPE_UPLINK: 1739 /* New uplink naming schema recognized. */ 1740 switch_info->master = 1; 1741 break; 1742 case MLX5_PHYS_PORT_NAME_TYPE_LEGACY: 1743 /* Legacy representors naming schema. */ 1744 switch_info->representor = !device_dir; 1745 break; 1746 case MLX5_PHYS_PORT_NAME_TYPE_PFVF: 1747 /* New representors naming schema. */ 1748 switch_info->representor = 1; 1749 break; 1750 } 1751 } 1752 1753 /** 1754 * Extract port name, as a number, from sysfs or netlink information. 1755 * 1756 * @param[in] port_name_in 1757 * String representing the port name. 1758 * @param[out] port_info_out 1759 * Port information, including port name as a number and port name 1760 * type if recognized 1761 * 1762 * @return 1763 * port_name field set according to recognized name format. 1764 */ 1765 void 1766 mlx5_translate_port_name(const char *port_name_in, 1767 struct mlx5_switch_info *port_info_out) 1768 { 1769 char pf_c1, pf_c2, vf_c1, vf_c2; 1770 char *end; 1771 int sc_items; 1772 1773 /* 1774 * Check for port-name as a string of the form pf0vf0 1775 * (support kernel ver >= 5.0 or OFED ver >= 4.6). 
1776 */ 1777 sc_items = sscanf(port_name_in, "%c%c%d%c%c%d", 1778 &pf_c1, &pf_c2, &port_info_out->pf_num, 1779 &vf_c1, &vf_c2, &port_info_out->port_name); 1780 if (sc_items == 6 && 1781 pf_c1 == 'p' && pf_c2 == 'f' && 1782 vf_c1 == 'v' && vf_c2 == 'f') { 1783 port_info_out->name_type = MLX5_PHYS_PORT_NAME_TYPE_PFVF; 1784 return; 1785 } 1786 /* 1787 * Check for port-name as a string of the form p0 1788 * (support kernel ver >= 5.0, or OFED ver >= 4.6). 1789 */ 1790 sc_items = sscanf(port_name_in, "%c%d", 1791 &pf_c1, &port_info_out->port_name); 1792 if (sc_items == 2 && pf_c1 == 'p') { 1793 port_info_out->name_type = MLX5_PHYS_PORT_NAME_TYPE_UPLINK; 1794 return; 1795 } 1796 /* Check for port-name as a number (support kernel ver < 5.0 */ 1797 errno = 0; 1798 port_info_out->port_name = strtol(port_name_in, &end, 0); 1799 if (!errno && 1800 (size_t)(end - port_name_in) == strlen(port_name_in)) { 1801 port_info_out->name_type = MLX5_PHYS_PORT_NAME_TYPE_LEGACY; 1802 return; 1803 } 1804 port_info_out->name_type = MLX5_PHYS_PORT_NAME_TYPE_UNKNOWN; 1805 return; 1806 } 1807