1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright 2015 6WIND S.A. 3 * Copyright 2015 Mellanox Technologies, Ltd 4 */ 5 6 #include <stddef.h> 7 #include <inttypes.h> 8 #include <unistd.h> 9 #include <stdbool.h> 10 #include <stdint.h> 11 #include <stdio.h> 12 #include <string.h> 13 #include <stdlib.h> 14 #include <errno.h> 15 #include <dirent.h> 16 #include <net/if.h> 17 #include <sys/ioctl.h> 18 #include <sys/socket.h> 19 #include <netinet/in.h> 20 #include <linux/ethtool.h> 21 #include <linux/sockios.h> 22 #include <fcntl.h> 23 #include <stdalign.h> 24 #include <sys/un.h> 25 #include <time.h> 26 27 #include <rte_atomic.h> 28 #include <rte_ethdev_driver.h> 29 #include <rte_bus_pci.h> 30 #include <rte_mbuf.h> 31 #include <rte_common.h> 32 #include <rte_interrupts.h> 33 #include <rte_malloc.h> 34 #include <rte_string_fns.h> 35 #include <rte_rwlock.h> 36 #include <rte_cycles.h> 37 38 #include <mlx5_glue.h> 39 #include <mlx5_devx_cmds.h> 40 #include <mlx5_common.h> 41 42 #include "mlx5.h" 43 #include "mlx5_rxtx.h" 44 #include "mlx5_utils.h" 45 46 /* Supported speed values found in /usr/include/linux/ethtool.h */ 47 #ifndef HAVE_SUPPORTED_40000baseKR4_Full 48 #define SUPPORTED_40000baseKR4_Full (1 << 23) 49 #endif 50 #ifndef HAVE_SUPPORTED_40000baseCR4_Full 51 #define SUPPORTED_40000baseCR4_Full (1 << 24) 52 #endif 53 #ifndef HAVE_SUPPORTED_40000baseSR4_Full 54 #define SUPPORTED_40000baseSR4_Full (1 << 25) 55 #endif 56 #ifndef HAVE_SUPPORTED_40000baseLR4_Full 57 #define SUPPORTED_40000baseLR4_Full (1 << 26) 58 #endif 59 #ifndef HAVE_SUPPORTED_56000baseKR4_Full 60 #define SUPPORTED_56000baseKR4_Full (1 << 27) 61 #endif 62 #ifndef HAVE_SUPPORTED_56000baseCR4_Full 63 #define SUPPORTED_56000baseCR4_Full (1 << 28) 64 #endif 65 #ifndef HAVE_SUPPORTED_56000baseSR4_Full 66 #define SUPPORTED_56000baseSR4_Full (1 << 29) 67 #endif 68 #ifndef HAVE_SUPPORTED_56000baseLR4_Full 69 #define SUPPORTED_56000baseLR4_Full (1 << 30) 70 #endif 71 72 /* Add defines in case the running kernel is not the same as user headers. */ 73 #ifndef ETHTOOL_GLINKSETTINGS 74 struct ethtool_link_settings { 75 uint32_t cmd; 76 uint32_t speed; 77 uint8_t duplex; 78 uint8_t port; 79 uint8_t phy_address; 80 uint8_t autoneg; 81 uint8_t mdio_support; 82 uint8_t eth_to_mdix; 83 uint8_t eth_tp_mdix_ctrl; 84 int8_t link_mode_masks_nwords; 85 uint32_t reserved[8]; 86 uint32_t link_mode_masks[]; 87 }; 88 89 /* The kernel values can be found in /include/uapi/linux/ethtool.h */ 90 #define ETHTOOL_GLINKSETTINGS 0x0000004c 91 #define ETHTOOL_LINK_MODE_1000baseT_Full_BIT 5 92 #define ETHTOOL_LINK_MODE_Autoneg_BIT 6 93 #define ETHTOOL_LINK_MODE_1000baseKX_Full_BIT 17 94 #define ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT 18 95 #define ETHTOOL_LINK_MODE_10000baseKR_Full_BIT 19 96 #define ETHTOOL_LINK_MODE_10000baseR_FEC_BIT 20 97 #define ETHTOOL_LINK_MODE_20000baseMLD2_Full_BIT 21 98 #define ETHTOOL_LINK_MODE_20000baseKR2_Full_BIT 22 99 #define ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT 23 100 #define ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT 24 101 #define ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT 25 102 #define ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT 26 103 #define ETHTOOL_LINK_MODE_56000baseKR4_Full_BIT 27 104 #define ETHTOOL_LINK_MODE_56000baseCR4_Full_BIT 28 105 #define ETHTOOL_LINK_MODE_56000baseSR4_Full_BIT 29 106 #define ETHTOOL_LINK_MODE_56000baseLR4_Full_BIT 30 107 #endif 108 #ifndef HAVE_ETHTOOL_LINK_MODE_25G 109 #define ETHTOOL_LINK_MODE_25000baseCR_Full_BIT 31 110 #define ETHTOOL_LINK_MODE_25000baseKR_Full_BIT 32 111 #define ETHTOOL_LINK_MODE_25000baseSR_Full_BIT 33 112 #endif 113 #ifndef HAVE_ETHTOOL_LINK_MODE_50G 114 #define ETHTOOL_LINK_MODE_50000baseCR2_Full_BIT 34 115 #define ETHTOOL_LINK_MODE_50000baseKR2_Full_BIT 35 116 #endif 117 #ifndef HAVE_ETHTOOL_LINK_MODE_100G 118 #define ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT 36 119 #define ETHTOOL_LINK_MODE_100000baseSR4_Full_BIT 37 120 #define ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT 38 121 #define ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT 39 122 #endif 123 #ifndef HAVE_ETHTOOL_LINK_MODE_200G 124 #define ETHTOOL_LINK_MODE_200000baseKR4_Full_BIT 62 125 #define ETHTOOL_LINK_MODE_200000baseSR4_Full_BIT 63 126 #define ETHTOOL_LINK_MODE_200000baseLR4_ER4_FR4_Full_BIT 0 /* 64 - 64 */ 127 #define ETHTOOL_LINK_MODE_200000baseDR4_Full_BIT 1 /* 65 - 64 */ 128 #define ETHTOOL_LINK_MODE_200000baseCR4_Full_BIT 2 /* 66 - 64 */ 129 #endif 130 131 132 /** 133 * Get interface name from private structure. 134 * 135 * This is a port representor-aware version of mlx5_get_ifname_sysfs(). 136 * 137 * @param[in] dev 138 * Pointer to Ethernet device. 139 * @param[out] ifname 140 * Interface name output buffer. 141 * 142 * @return 143 * 0 on success, a negative errno value otherwise and rte_errno is set. 144 */ 145 int 146 mlx5_get_ifname(const struct rte_eth_dev *dev, char (*ifname)[IF_NAMESIZE]) 147 { 148 struct mlx5_priv *priv = dev->data->dev_private; 149 unsigned int ifindex; 150 151 MLX5_ASSERT(priv); 152 MLX5_ASSERT(priv->sh); 153 ifindex = mlx5_ifindex(dev); 154 if (!ifindex) { 155 if (!priv->representor) 156 return mlx5_get_ifname_sysfs(priv->sh->ibdev_path, 157 *ifname); 158 rte_errno = ENXIO; 159 return -rte_errno; 160 } 161 if (if_indextoname(ifindex, &(*ifname)[0])) 162 return 0; 163 rte_errno = errno; 164 return -rte_errno; 165 } 166 167 /** 168 * Perform ifreq ioctl() on associated Ethernet device. 169 * 170 * @param[in] dev 171 * Pointer to Ethernet device. 172 * @param req 173 * Request number to pass to ioctl(). 174 * @param[out] ifr 175 * Interface request structure output buffer. 176 * 177 * @return 178 * 0 on success, a negative errno value otherwise and rte_errno is set. 179 */ 180 int 181 mlx5_ifreq(const struct rte_eth_dev *dev, int req, struct ifreq *ifr) 182 { 183 int sock = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP); 184 int ret = 0; 185 186 if (sock == -1) { 187 rte_errno = errno; 188 return -rte_errno; 189 } 190 ret = mlx5_get_ifname(dev, &ifr->ifr_name); 191 if (ret) 192 goto error; 193 ret = ioctl(sock, req, ifr); 194 if (ret == -1) { 195 rte_errno = errno; 196 goto error; 197 } 198 close(sock); 199 return 0; 200 error: 201 close(sock); 202 return -rte_errno; 203 } 204 205 /** 206 * Get device MTU. 207 * 208 * @param dev 209 * Pointer to Ethernet device. 210 * @param[out] mtu 211 * MTU value output buffer. 212 * 213 * @return 214 * 0 on success, a negative errno value otherwise and rte_errno is set. 215 */ 216 int 217 mlx5_get_mtu(struct rte_eth_dev *dev, uint16_t *mtu) 218 { 219 struct ifreq request; 220 int ret = mlx5_ifreq(dev, SIOCGIFMTU, &request); 221 222 if (ret) 223 return ret; 224 *mtu = request.ifr_mtu; 225 return 0; 226 } 227 228 /** 229 * Set device MTU. 230 * 231 * @param dev 232 * Pointer to Ethernet device. 233 * @param mtu 234 * MTU value to set. 235 * 236 * @return 237 * 0 on success, a negative errno value otherwise and rte_errno is set. 238 */ 239 int 240 mlx5_set_mtu(struct rte_eth_dev *dev, uint16_t mtu) 241 { 242 struct ifreq request = { .ifr_mtu = mtu, }; 243 244 return mlx5_ifreq(dev, SIOCSIFMTU, &request); 245 } 246 247 /** 248 * Set device flags. 249 * 250 * @param dev 251 * Pointer to Ethernet device. 252 * @param keep 253 * Bitmask for flags that must remain untouched. 254 * @param flags 255 * Bitmask for flags to modify. 256 * 257 * @return 258 * 0 on success, a negative errno value otherwise and rte_errno is set. 259 */ 260 int 261 mlx5_set_flags(struct rte_eth_dev *dev, unsigned int keep, unsigned int flags) 262 { 263 struct ifreq request; 264 int ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &request); 265 266 if (ret) 267 return ret; 268 request.ifr_flags &= keep; 269 request.ifr_flags |= flags & ~keep; 270 return mlx5_ifreq(dev, SIOCSIFFLAGS, &request); 271 } 272 273 /** 274 * Get device current raw clock counter 275 * 276 * @param dev 277 * Pointer to Ethernet device structure. 278 * @param[out] time 279 * Current raw clock counter of the device. 280 * 281 * @return 282 * 0 if the clock has correctly been read 283 * The value of errno in case of error 284 */ 285 int 286 mlx5_read_clock(struct rte_eth_dev *dev, uint64_t *clock) 287 { 288 struct mlx5_priv *priv = dev->data->dev_private; 289 struct ibv_context *ctx = priv->sh->ctx; 290 struct ibv_values_ex values; 291 int err = 0; 292 293 values.comp_mask = IBV_VALUES_MASK_RAW_CLOCK; 294 err = mlx5_glue->query_rt_values_ex(ctx, &values); 295 if (err != 0) { 296 DRV_LOG(WARNING, "Could not query the clock !"); 297 return err; 298 } 299 *clock = values.raw_clock.tv_nsec; 300 return 0; 301 } 302 303 /** 304 * Retrieve the master device for representor in the same switch domain. 305 * 306 * @param dev 307 * Pointer to representor Ethernet device structure. 308 * 309 * @return 310 * Master device structure on success, NULL otherwise. 311 */ 312 static struct rte_eth_dev * 313 mlx5_find_master_dev(struct rte_eth_dev *dev) 314 { 315 struct mlx5_priv *priv; 316 uint16_t port_id; 317 uint16_t domain_id; 318 319 priv = dev->data->dev_private; 320 domain_id = priv->domain_id; 321 MLX5_ASSERT(priv->representor); 322 MLX5_ETH_FOREACH_DEV(port_id, priv->pci_dev) { 323 struct mlx5_priv *opriv = 324 rte_eth_devices[port_id].data->dev_private; 325 if (opriv && 326 opriv->master && 327 opriv->domain_id == domain_id && 328 opriv->sh == priv->sh) 329 return &rte_eth_devices[port_id]; 330 } 331 return NULL; 332 } 333 334 /** 335 * DPDK callback to retrieve physical link information. 336 * 337 * @param dev 338 * Pointer to Ethernet device structure. 339 * @param[out] link 340 * Storage for current link status. 341 * 342 * @return 343 * 0 on success, a negative errno value otherwise and rte_errno is set. 344 */ 345 static int 346 mlx5_link_update_unlocked_gset(struct rte_eth_dev *dev, 347 struct rte_eth_link *link) 348 { 349 struct mlx5_priv *priv = dev->data->dev_private; 350 struct ethtool_cmd edata = { 351 .cmd = ETHTOOL_GSET /* Deprecated since Linux v4.5. */ 352 }; 353 struct ifreq ifr; 354 struct rte_eth_link dev_link; 355 int link_speed = 0; 356 int ret; 357 358 ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr); 359 if (ret) { 360 DRV_LOG(WARNING, "port %u ioctl(SIOCGIFFLAGS) failed: %s", 361 dev->data->port_id, strerror(rte_errno)); 362 return ret; 363 } 364 dev_link = (struct rte_eth_link) { 365 .link_status = ((ifr.ifr_flags & IFF_UP) && 366 (ifr.ifr_flags & IFF_RUNNING)), 367 }; 368 ifr = (struct ifreq) { 369 .ifr_data = (void *)&edata, 370 }; 371 ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr); 372 if (ret) { 373 if (ret == -ENOTSUP && priv->representor) { 374 struct rte_eth_dev *master; 375 376 /* 377 * For representors we can try to inherit link 378 * settings from the master device. Actually 379 * link settings do not make a lot of sense 380 * for representors due to missing physical 381 * link. The old kernel drivers supported 382 * emulated settings query for representors, 383 * the new ones do not, so we have to add 384 * this code for compatibility issues. 385 */ 386 master = mlx5_find_master_dev(dev); 387 if (master) { 388 ifr = (struct ifreq) { 389 .ifr_data = (void *)&edata, 390 }; 391 ret = mlx5_ifreq(master, SIOCETHTOOL, &ifr); 392 } 393 } 394 if (ret) { 395 DRV_LOG(WARNING, 396 "port %u ioctl(SIOCETHTOOL," 397 " ETHTOOL_GSET) failed: %s", 398 dev->data->port_id, strerror(rte_errno)); 399 return ret; 400 } 401 } 402 link_speed = ethtool_cmd_speed(&edata); 403 if (link_speed == -1) 404 dev_link.link_speed = ETH_SPEED_NUM_NONE; 405 else 406 dev_link.link_speed = link_speed; 407 priv->link_speed_capa = 0; 408 if (edata.supported & SUPPORTED_Autoneg) 409 priv->link_speed_capa |= ETH_LINK_SPEED_AUTONEG; 410 if (edata.supported & (SUPPORTED_1000baseT_Full | 411 SUPPORTED_1000baseKX_Full)) 412 priv->link_speed_capa |= ETH_LINK_SPEED_1G; 413 if (edata.supported & SUPPORTED_10000baseKR_Full) 414 priv->link_speed_capa |= ETH_LINK_SPEED_10G; 415 if (edata.supported & (SUPPORTED_40000baseKR4_Full | 416 SUPPORTED_40000baseCR4_Full | 417 SUPPORTED_40000baseSR4_Full | 418 SUPPORTED_40000baseLR4_Full)) 419 priv->link_speed_capa |= ETH_LINK_SPEED_40G; 420 dev_link.link_duplex = ((edata.duplex == DUPLEX_HALF) ? 421 ETH_LINK_HALF_DUPLEX : ETH_LINK_FULL_DUPLEX); 422 dev_link.link_autoneg = !(dev->data->dev_conf.link_speeds & 423 ETH_LINK_SPEED_FIXED); 424 if (((dev_link.link_speed && !dev_link.link_status) || 425 (!dev_link.link_speed && dev_link.link_status))) { 426 rte_errno = EAGAIN; 427 return -rte_errno; 428 } 429 *link = dev_link; 430 return 0; 431 } 432 433 /** 434 * Retrieve physical link information (unlocked version using new ioctl). 435 * 436 * @param dev 437 * Pointer to Ethernet device structure. 438 * @param[out] link 439 * Storage for current link status. 440 * 441 * @return 442 * 0 on success, a negative errno value otherwise and rte_errno is set. 443 */ 444 static int 445 mlx5_link_update_unlocked_gs(struct rte_eth_dev *dev, 446 struct rte_eth_link *link) 447 448 { 449 struct mlx5_priv *priv = dev->data->dev_private; 450 struct ethtool_link_settings gcmd = { .cmd = ETHTOOL_GLINKSETTINGS }; 451 struct ifreq ifr; 452 struct rte_eth_link dev_link; 453 struct rte_eth_dev *master = NULL; 454 uint64_t sc; 455 int ret; 456 457 ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr); 458 if (ret) { 459 DRV_LOG(WARNING, "port %u ioctl(SIOCGIFFLAGS) failed: %s", 460 dev->data->port_id, strerror(rte_errno)); 461 return ret; 462 } 463 dev_link = (struct rte_eth_link) { 464 .link_status = ((ifr.ifr_flags & IFF_UP) && 465 (ifr.ifr_flags & IFF_RUNNING)), 466 }; 467 ifr = (struct ifreq) { 468 .ifr_data = (void *)&gcmd, 469 }; 470 ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr); 471 if (ret) { 472 if (ret == -ENOTSUP && priv->representor) { 473 /* 474 * For representors we can try to inherit link 475 * settings from the master device. Actually 476 * link settings do not make a lot of sense 477 * for representors due to missing physical 478 * link. The old kernel drivers supported 479 * emulated settings query for representors, 480 * the new ones do not, so we have to add 481 * this code for compatibility issues. 482 */ 483 master = mlx5_find_master_dev(dev); 484 if (master) { 485 ifr = (struct ifreq) { 486 .ifr_data = (void *)&gcmd, 487 }; 488 ret = mlx5_ifreq(master, SIOCETHTOOL, &ifr); 489 } 490 } 491 if (ret) { 492 DRV_LOG(DEBUG, 493 "port %u ioctl(SIOCETHTOOL," 494 " ETHTOOL_GLINKSETTINGS) failed: %s", 495 dev->data->port_id, strerror(rte_errno)); 496 return ret; 497 } 498 } 499 gcmd.link_mode_masks_nwords = -gcmd.link_mode_masks_nwords; 500 501 alignas(struct ethtool_link_settings) 502 uint8_t data[offsetof(struct ethtool_link_settings, link_mode_masks) + 503 sizeof(uint32_t) * gcmd.link_mode_masks_nwords * 3]; 504 struct ethtool_link_settings *ecmd = (void *)data; 505 506 *ecmd = gcmd; 507 ifr.ifr_data = (void *)ecmd; 508 ret = mlx5_ifreq(master ? master : dev, SIOCETHTOOL, &ifr); 509 if (ret) { 510 DRV_LOG(DEBUG, 511 "port %u ioctl(SIOCETHTOOL," 512 "ETHTOOL_GLINKSETTINGS) failed: %s", 513 dev->data->port_id, strerror(rte_errno)); 514 return ret; 515 } 516 dev_link.link_speed = (ecmd->speed == UINT32_MAX) ? ETH_SPEED_NUM_NONE : 517 ecmd->speed; 518 sc = ecmd->link_mode_masks[0] | 519 ((uint64_t)ecmd->link_mode_masks[1] << 32); 520 priv->link_speed_capa = 0; 521 if (sc & MLX5_BITSHIFT(ETHTOOL_LINK_MODE_Autoneg_BIT)) 522 priv->link_speed_capa |= ETH_LINK_SPEED_AUTONEG; 523 if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_1000baseT_Full_BIT) | 524 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_1000baseKX_Full_BIT))) 525 priv->link_speed_capa |= ETH_LINK_SPEED_1G; 526 if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT) | 527 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_10000baseKR_Full_BIT) | 528 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_10000baseR_FEC_BIT))) 529 priv->link_speed_capa |= ETH_LINK_SPEED_10G; 530 if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_20000baseMLD2_Full_BIT) | 531 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_20000baseKR2_Full_BIT))) 532 priv->link_speed_capa |= ETH_LINK_SPEED_20G; 533 if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT) | 534 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT) | 535 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT) | 536 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT))) 537 priv->link_speed_capa |= ETH_LINK_SPEED_40G; 538 if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseKR4_Full_BIT) | 539 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseCR4_Full_BIT) | 540 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseSR4_Full_BIT) | 541 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseLR4_Full_BIT))) 542 priv->link_speed_capa |= ETH_LINK_SPEED_56G; 543 if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_25000baseCR_Full_BIT) | 544 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_25000baseKR_Full_BIT) | 545 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_25000baseSR_Full_BIT))) 546 priv->link_speed_capa |= ETH_LINK_SPEED_25G; 547 if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_50000baseCR2_Full_BIT) | 548 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_50000baseKR2_Full_BIT))) 549 priv->link_speed_capa |= ETH_LINK_SPEED_50G; 550 if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT) | 551 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseSR4_Full_BIT) | 552 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT) | 553 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT))) 554 priv->link_speed_capa |= ETH_LINK_SPEED_100G; 555 if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_200000baseKR4_Full_BIT) | 556 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_200000baseSR4_Full_BIT))) 557 priv->link_speed_capa |= ETH_LINK_SPEED_200G; 558 559 sc = ecmd->link_mode_masks[2] | 560 ((uint64_t)ecmd->link_mode_masks[3] << 32); 561 if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_200000baseCR4_Full_BIT) | 562 MLX5_BITSHIFT 563 (ETHTOOL_LINK_MODE_200000baseLR4_ER4_FR4_Full_BIT) | 564 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_200000baseDR4_Full_BIT))) 565 priv->link_speed_capa |= ETH_LINK_SPEED_200G; 566 dev_link.link_duplex = ((ecmd->duplex == DUPLEX_HALF) ? 567 ETH_LINK_HALF_DUPLEX : ETH_LINK_FULL_DUPLEX); 568 dev_link.link_autoneg = !(dev->data->dev_conf.link_speeds & 569 ETH_LINK_SPEED_FIXED); 570 if (((dev_link.link_speed && !dev_link.link_status) || 571 (!dev_link.link_speed && dev_link.link_status))) { 572 rte_errno = EAGAIN; 573 return -rte_errno; 574 } 575 *link = dev_link; 576 return 0; 577 } 578 579 /** 580 * DPDK callback to retrieve physical link information. 581 * 582 * @param dev 583 * Pointer to Ethernet device structure. 584 * @param wait_to_complete 585 * Wait for request completion. 586 * 587 * @return 588 * 0 if link status was not updated, positive if it was, a negative errno 589 * value otherwise and rte_errno is set. 590 */ 591 int 592 mlx5_link_update(struct rte_eth_dev *dev, int wait_to_complete) 593 { 594 int ret; 595 struct rte_eth_link dev_link; 596 time_t start_time = time(NULL); 597 int retry = MLX5_GET_LINK_STATUS_RETRY_COUNT; 598 599 do { 600 ret = mlx5_link_update_unlocked_gs(dev, &dev_link); 601 if (ret == -ENOTSUP) 602 ret = mlx5_link_update_unlocked_gset(dev, &dev_link); 603 if (ret == 0) 604 break; 605 /* Handle wait to complete situation. */ 606 if ((wait_to_complete || retry) && ret == -EAGAIN) { 607 if (abs((int)difftime(time(NULL), start_time)) < 608 MLX5_LINK_STATUS_TIMEOUT) { 609 usleep(0); 610 continue; 611 } else { 612 rte_errno = EBUSY; 613 return -rte_errno; 614 } 615 } else if (ret < 0) { 616 return ret; 617 } 618 } while (wait_to_complete || retry-- > 0); 619 ret = !!memcmp(&dev->data->dev_link, &dev_link, 620 sizeof(struct rte_eth_link)); 621 dev->data->dev_link = dev_link; 622 return ret; 623 } 624 625 /** 626 * DPDK callback to get flow control status. 627 * 628 * @param dev 629 * Pointer to Ethernet device structure. 630 * @param[out] fc_conf 631 * Flow control output buffer. 632 * 633 * @return 634 * 0 on success, a negative errno value otherwise and rte_errno is set. 635 */ 636 int 637 mlx5_dev_get_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf) 638 { 639 struct ifreq ifr; 640 struct ethtool_pauseparam ethpause = { 641 .cmd = ETHTOOL_GPAUSEPARAM 642 }; 643 int ret; 644 645 ifr.ifr_data = (void *)ðpause; 646 ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr); 647 if (ret) { 648 DRV_LOG(WARNING, 649 "port %u ioctl(SIOCETHTOOL, ETHTOOL_GPAUSEPARAM) failed:" 650 " %s", 651 dev->data->port_id, strerror(rte_errno)); 652 return ret; 653 } 654 fc_conf->autoneg = ethpause.autoneg; 655 if (ethpause.rx_pause && ethpause.tx_pause) 656 fc_conf->mode = RTE_FC_FULL; 657 else if (ethpause.rx_pause) 658 fc_conf->mode = RTE_FC_RX_PAUSE; 659 else if (ethpause.tx_pause) 660 fc_conf->mode = RTE_FC_TX_PAUSE; 661 else 662 fc_conf->mode = RTE_FC_NONE; 663 return 0; 664 } 665 666 /** 667 * DPDK callback to modify flow control parameters. 668 * 669 * @param dev 670 * Pointer to Ethernet device structure. 671 * @param[in] fc_conf 672 * Flow control parameters. 673 * 674 * @return 675 * 0 on success, a negative errno value otherwise and rte_errno is set. 676 */ 677 int 678 mlx5_dev_set_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf) 679 { 680 struct ifreq ifr; 681 struct ethtool_pauseparam ethpause = { 682 .cmd = ETHTOOL_SPAUSEPARAM 683 }; 684 int ret; 685 686 ifr.ifr_data = (void *)ðpause; 687 ethpause.autoneg = fc_conf->autoneg; 688 if (((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL) || 689 (fc_conf->mode & RTE_FC_RX_PAUSE)) 690 ethpause.rx_pause = 1; 691 else 692 ethpause.rx_pause = 0; 693 694 if (((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL) || 695 (fc_conf->mode & RTE_FC_TX_PAUSE)) 696 ethpause.tx_pause = 1; 697 else 698 ethpause.tx_pause = 0; 699 ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr); 700 if (ret) { 701 DRV_LOG(WARNING, 702 "port %u ioctl(SIOCETHTOOL, ETHTOOL_SPAUSEPARAM)" 703 " failed: %s", 704 dev->data->port_id, strerror(rte_errno)); 705 return ret; 706 } 707 return 0; 708 } 709 710 /** 711 * Handle asynchronous removal event for entire multiport device. 712 * 713 * @param sh 714 * Infiniband device shared context. 715 */ 716 static void 717 mlx5_dev_interrupt_device_fatal(struct mlx5_dev_ctx_shared *sh) 718 { 719 uint32_t i; 720 721 for (i = 0; i < sh->max_port; ++i) { 722 struct rte_eth_dev *dev; 723 724 if (sh->port[i].ih_port_id >= RTE_MAX_ETHPORTS) { 725 /* 726 * Or not existing port either no 727 * handler installed for this port. 728 */ 729 continue; 730 } 731 dev = &rte_eth_devices[sh->port[i].ih_port_id]; 732 MLX5_ASSERT(dev); 733 if (dev->data->dev_conf.intr_conf.rmv) 734 _rte_eth_dev_callback_process 735 (dev, RTE_ETH_EVENT_INTR_RMV, NULL); 736 } 737 } 738 739 /** 740 * Handle shared asynchronous events the NIC (removal event 741 * and link status change). Supports multiport IB device. 742 * 743 * @param cb_arg 744 * Callback argument. 745 */ 746 void 747 mlx5_dev_interrupt_handler(void *cb_arg) 748 { 749 struct mlx5_dev_ctx_shared *sh = cb_arg; 750 struct ibv_async_event event; 751 752 /* Read all message from the IB device and acknowledge them. */ 753 for (;;) { 754 struct rte_eth_dev *dev; 755 uint32_t tmp; 756 757 if (mlx5_glue->get_async_event(sh->ctx, &event)) 758 break; 759 /* Retrieve and check IB port index. */ 760 tmp = (uint32_t)event.element.port_num; 761 if (!tmp && event.event_type == IBV_EVENT_DEVICE_FATAL) { 762 /* 763 * The DEVICE_FATAL event is called once for 764 * entire device without port specifying. 765 * We should notify all existing ports. 766 */ 767 mlx5_glue->ack_async_event(&event); 768 mlx5_dev_interrupt_device_fatal(sh); 769 continue; 770 } 771 MLX5_ASSERT(tmp && (tmp <= sh->max_port)); 772 if (!tmp) { 773 /* Unsupported device level event. */ 774 mlx5_glue->ack_async_event(&event); 775 DRV_LOG(DEBUG, 776 "unsupported common event (type %d)", 777 event.event_type); 778 continue; 779 } 780 if (tmp > sh->max_port) { 781 /* Invalid IB port index. */ 782 mlx5_glue->ack_async_event(&event); 783 DRV_LOG(DEBUG, 784 "cannot handle an event (type %d)" 785 "due to invalid IB port index (%u)", 786 event.event_type, tmp); 787 continue; 788 } 789 if (sh->port[tmp - 1].ih_port_id >= RTE_MAX_ETHPORTS) { 790 /* No handler installed. */ 791 mlx5_glue->ack_async_event(&event); 792 DRV_LOG(DEBUG, 793 "cannot handle an event (type %d)" 794 "due to no handler installed for port %u", 795 event.event_type, tmp); 796 continue; 797 } 798 /* Retrieve ethernet device descriptor. */ 799 tmp = sh->port[tmp - 1].ih_port_id; 800 dev = &rte_eth_devices[tmp]; 801 MLX5_ASSERT(dev); 802 if ((event.event_type == IBV_EVENT_PORT_ACTIVE || 803 event.event_type == IBV_EVENT_PORT_ERR) && 804 dev->data->dev_conf.intr_conf.lsc) { 805 mlx5_glue->ack_async_event(&event); 806 if (mlx5_link_update(dev, 0) == -EAGAIN) { 807 usleep(0); 808 continue; 809 } 810 _rte_eth_dev_callback_process 811 (dev, RTE_ETH_EVENT_INTR_LSC, NULL); 812 continue; 813 } 814 DRV_LOG(DEBUG, 815 "port %u cannot handle an unknown event (type %d)", 816 dev->data->port_id, event.event_type); 817 mlx5_glue->ack_async_event(&event); 818 } 819 } 820 821 /* 822 * Unregister callback handler safely. The handler may be active 823 * while we are trying to unregister it, in this case code -EAGAIN 824 * is returned by rte_intr_callback_unregister(). This routine checks 825 * the return code and tries to unregister handler again. 826 * 827 * @param handle 828 * interrupt handle 829 * @param cb_fn 830 * pointer to callback routine 831 * @cb_arg 832 * opaque callback parameter 833 */ 834 void 835 mlx5_intr_callback_unregister(const struct rte_intr_handle *handle, 836 rte_intr_callback_fn cb_fn, void *cb_arg) 837 { 838 /* 839 * Try to reduce timeout management overhead by not calling 840 * the timer related routines on the first iteration. If the 841 * unregistering succeeds on first call there will be no 842 * timer calls at all. 843 */ 844 uint64_t twait = 0; 845 uint64_t start = 0; 846 847 do { 848 int ret; 849 850 ret = rte_intr_callback_unregister(handle, cb_fn, cb_arg); 851 if (ret >= 0) 852 return; 853 if (ret != -EAGAIN) { 854 DRV_LOG(INFO, "failed to unregister interrupt" 855 " handler (error: %d)", ret); 856 MLX5_ASSERT(false); 857 return; 858 } 859 if (twait) { 860 struct timespec onems; 861 862 /* Wait one millisecond and try again. */ 863 onems.tv_sec = 0; 864 onems.tv_nsec = NS_PER_S / MS_PER_S; 865 nanosleep(&onems, 0); 866 /* Check whether one second elapsed. */ 867 if ((rte_get_timer_cycles() - start) <= twait) 868 continue; 869 } else { 870 /* 871 * We get the amount of timer ticks for one second. 872 * If this amount elapsed it means we spent one 873 * second in waiting. This branch is executed once 874 * on first iteration. 875 */ 876 twait = rte_get_timer_hz(); 877 MLX5_ASSERT(twait); 878 } 879 /* 880 * Timeout elapsed, show message (once a second) and retry. 881 * We have no other acceptable option here, if we ignore 882 * the unregistering return code the handler will not 883 * be unregistered, fd will be closed and we may get the 884 * crush. Hanging and messaging in the loop seems not to be 885 * the worst choice. 886 */ 887 DRV_LOG(INFO, "Retrying to unregister interrupt handler"); 888 start = rte_get_timer_cycles(); 889 } while (true); 890 } 891 892 /** 893 * Handle DEVX interrupts from the NIC. 894 * This function is probably called from the DPDK host thread. 895 * 896 * @param cb_arg 897 * Callback argument. 898 */ 899 void 900 mlx5_dev_interrupt_handler_devx(void *cb_arg) 901 { 902 #ifndef HAVE_IBV_DEVX_ASYNC 903 (void)cb_arg; 904 return; 905 #else 906 struct mlx5_dev_ctx_shared *sh = cb_arg; 907 union { 908 struct mlx5dv_devx_async_cmd_hdr cmd_resp; 909 uint8_t buf[MLX5_ST_SZ_BYTES(query_flow_counter_out) + 910 MLX5_ST_SZ_BYTES(traffic_counter) + 911 sizeof(struct mlx5dv_devx_async_cmd_hdr)]; 912 } out; 913 uint8_t *buf = out.buf + sizeof(out.cmd_resp); 914 915 while (!mlx5_glue->devx_get_async_cmd_comp(sh->devx_comp, 916 &out.cmd_resp, 917 sizeof(out.buf))) 918 mlx5_flow_async_pool_query_handle 919 (sh, (uint64_t)out.cmd_resp.wr_id, 920 mlx5_devx_get_out_command_status(buf)); 921 #endif /* HAVE_IBV_DEVX_ASYNC */ 922 } 923 924 /** 925 * DPDK callback to bring the link DOWN. 926 * 927 * @param dev 928 * Pointer to Ethernet device structure. 929 * 930 * @return 931 * 0 on success, a negative errno value otherwise and rte_errno is set. 932 */ 933 int 934 mlx5_set_link_down(struct rte_eth_dev *dev) 935 { 936 return mlx5_set_flags(dev, ~IFF_UP, ~IFF_UP); 937 } 938 939 /** 940 * DPDK callback to bring the link UP. 941 * 942 * @param dev 943 * Pointer to Ethernet device structure. 944 * 945 * @return 946 * 0 on success, a negative errno value otherwise and rte_errno is set. 947 */ 948 int 949 mlx5_set_link_up(struct rte_eth_dev *dev) 950 { 951 return mlx5_set_flags(dev, ~IFF_UP, IFF_UP); 952 } 953 954 /** 955 * Check if mlx5 device was removed. 956 * 957 * @param dev 958 * Pointer to Ethernet device structure. 959 * 960 * @return 961 * 1 when device is removed, otherwise 0. 962 */ 963 int 964 mlx5_is_removed(struct rte_eth_dev *dev) 965 { 966 struct ibv_device_attr device_attr; 967 struct mlx5_priv *priv = dev->data->dev_private; 968 969 if (mlx5_glue->query_device(priv->sh->ctx, &device_attr) == EIO) 970 return 1; 971 return 0; 972 } 973 974 /** 975 * Get switch information associated with network interface. 976 * 977 * @param ifindex 978 * Network interface index. 979 * @param[out] info 980 * Switch information object, populated in case of success. 981 * 982 * @return 983 * 0 on success, a negative errno value otherwise and rte_errno is set. 984 */ 985 int 986 mlx5_sysfs_switch_info(unsigned int ifindex, struct mlx5_switch_info *info) 987 { 988 char ifname[IF_NAMESIZE]; 989 char port_name[IF_NAMESIZE]; 990 FILE *file; 991 struct mlx5_switch_info data = { 992 .master = 0, 993 .representor = 0, 994 .name_type = MLX5_PHYS_PORT_NAME_TYPE_NOTSET, 995 .port_name = 0, 996 .switch_id = 0, 997 }; 998 DIR *dir; 999 bool port_switch_id_set = false; 1000 bool device_dir = false; 1001 char c; 1002 int ret; 1003 1004 if (!if_indextoname(ifindex, ifname)) { 1005 rte_errno = errno; 1006 return -rte_errno; 1007 } 1008 1009 MKSTR(phys_port_name, "/sys/class/net/%s/phys_port_name", 1010 ifname); 1011 MKSTR(phys_switch_id, "/sys/class/net/%s/phys_switch_id", 1012 ifname); 1013 MKSTR(pci_device, "/sys/class/net/%s/device", 1014 ifname); 1015 1016 file = fopen(phys_port_name, "rb"); 1017 if (file != NULL) { 1018 ret = fscanf(file, "%s", port_name); 1019 fclose(file); 1020 if (ret == 1) 1021 mlx5_translate_port_name(port_name, &data); 1022 } 1023 file = fopen(phys_switch_id, "rb"); 1024 if (file == NULL) { 1025 rte_errno = errno; 1026 return -rte_errno; 1027 } 1028 port_switch_id_set = 1029 fscanf(file, "%" SCNx64 "%c", &data.switch_id, &c) == 2 && 1030 c == '\n'; 1031 fclose(file); 1032 dir = opendir(pci_device); 1033 if (dir != NULL) { 1034 closedir(dir); 1035 device_dir = true; 1036 } 1037 if (port_switch_id_set) { 1038 /* We have some E-Switch configuration. */ 1039 mlx5_sysfs_check_switch_info(device_dir, &data); 1040 } 1041 *info = data; 1042 MLX5_ASSERT(!(data.master && data.representor)); 1043 if (data.master && data.representor) { 1044 DRV_LOG(ERR, "ifindex %u device is recognized as master" 1045 " and as representor", ifindex); 1046 rte_errno = ENODEV; 1047 return -rte_errno; 1048 } 1049 return 0; 1050 } 1051 1052 /** 1053 * Analyze gathered port parameters via sysfs to recognize master 1054 * and representor devices for E-Switch configuration. 1055 * 1056 * @param[in] device_dir 1057 * flag of presence of "device" directory under port device key. 1058 * @param[inout] switch_info 1059 * Port information, including port name as a number and port name 1060 * type if recognized 1061 * 1062 * @return 1063 * master and representor flags are set in switch_info according to 1064 * recognized parameters (if any). 1065 */ 1066 void 1067 mlx5_sysfs_check_switch_info(bool device_dir, 1068 struct mlx5_switch_info *switch_info) 1069 { 1070 switch (switch_info->name_type) { 1071 case MLX5_PHYS_PORT_NAME_TYPE_UNKNOWN: 1072 /* 1073 * Name is not recognized, assume the master, 1074 * check the device directory presence. 1075 */ 1076 switch_info->master = device_dir; 1077 break; 1078 case MLX5_PHYS_PORT_NAME_TYPE_NOTSET: 1079 /* 1080 * Name is not set, this assumes the legacy naming 1081 * schema for master, just check if there is 1082 * a device directory. 1083 */ 1084 switch_info->master = device_dir; 1085 break; 1086 case MLX5_PHYS_PORT_NAME_TYPE_UPLINK: 1087 /* New uplink naming schema recognized. */ 1088 switch_info->master = 1; 1089 break; 1090 case MLX5_PHYS_PORT_NAME_TYPE_LEGACY: 1091 /* Legacy representors naming schema. */ 1092 switch_info->representor = !device_dir; 1093 break; 1094 case MLX5_PHYS_PORT_NAME_TYPE_PFHPF: 1095 /* Fallthrough */ 1096 case MLX5_PHYS_PORT_NAME_TYPE_PFVF: 1097 /* New representors naming schema. */ 1098 switch_info->representor = 1; 1099 break; 1100 } 1101 } 1102 1103 /** 1104 * DPDK callback to retrieve plug-in module EEPROM information (type and size). 1105 * 1106 * @param dev 1107 * Pointer to Ethernet device structure. 1108 * @param[out] modinfo 1109 * Storage for plug-in module EEPROM information. 1110 * 1111 * @return 1112 * 0 on success, a negative errno value otherwise and rte_errno is set. 1113 */ 1114 int 1115 mlx5_get_module_info(struct rte_eth_dev *dev, 1116 struct rte_eth_dev_module_info *modinfo) 1117 { 1118 struct ethtool_modinfo info = { 1119 .cmd = ETHTOOL_GMODULEINFO, 1120 }; 1121 struct ifreq ifr = (struct ifreq) { 1122 .ifr_data = (void *)&info, 1123 }; 1124 int ret = 0; 1125 1126 if (!dev || !modinfo) { 1127 DRV_LOG(WARNING, "missing argument, cannot get module info"); 1128 rte_errno = EINVAL; 1129 return -rte_errno; 1130 } 1131 ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr); 1132 if (ret) { 1133 DRV_LOG(WARNING, "port %u ioctl(SIOCETHTOOL) failed: %s", 1134 dev->data->port_id, strerror(rte_errno)); 1135 return ret; 1136 } 1137 modinfo->type = info.type; 1138 modinfo->eeprom_len = info.eeprom_len; 1139 return ret; 1140 } 1141 1142 /** 1143 * DPDK callback to retrieve plug-in module EEPROM data. 1144 * 1145 * @param dev 1146 * Pointer to Ethernet device structure. 1147 * @param[out] info 1148 * Storage for plug-in module EEPROM data. 1149 * 1150 * @return 1151 * 0 on success, a negative errno value otherwise and rte_errno is set. 1152 */ 1153 int mlx5_get_module_eeprom(struct rte_eth_dev *dev, 1154 struct rte_dev_eeprom_info *info) 1155 { 1156 struct ethtool_eeprom *eeprom; 1157 struct ifreq ifr; 1158 int ret = 0; 1159 1160 if (!dev || !info) { 1161 DRV_LOG(WARNING, "missing argument, cannot get module eeprom"); 1162 rte_errno = EINVAL; 1163 return -rte_errno; 1164 } 1165 eeprom = rte_calloc(__func__, 1, 1166 (sizeof(struct ethtool_eeprom) + info->length), 0); 1167 if (!eeprom) { 1168 DRV_LOG(WARNING, "port %u cannot allocate memory for " 1169 "eeprom data", dev->data->port_id); 1170 rte_errno = ENOMEM; 1171 return -rte_errno; 1172 } 1173 eeprom->cmd = ETHTOOL_GMODULEEEPROM; 1174 eeprom->offset = info->offset; 1175 eeprom->len = info->length; 1176 ifr = (struct ifreq) { 1177 .ifr_data = (void *)eeprom, 1178 }; 1179 ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr); 1180 if (ret) 1181 DRV_LOG(WARNING, "port %u ioctl(SIOCETHTOOL) failed: %s", 1182 dev->data->port_id, strerror(rte_errno)); 1183 else 1184 rte_memcpy(info->data, eeprom->data, info->length); 1185 rte_free(eeprom); 1186 return ret; 1187 } 1188