1 /*- 2 * BSD LICENSE 3 * 4 * Copyright 2015 6WIND S.A. 5 * Copyright 2015 Mellanox. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * * Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * * Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * * Neither the name of 6WIND S.A. nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 */ 33 34 #include <stddef.h> 35 #include <assert.h> 36 #include <unistd.h> 37 #include <stdint.h> 38 #include <stdio.h> 39 #include <string.h> 40 #include <stdlib.h> 41 #include <errno.h> 42 #include <dirent.h> 43 #include <net/if.h> 44 #include <sys/ioctl.h> 45 #include <sys/socket.h> 46 #include <sys/utsname.h> 47 #include <netinet/in.h> 48 #include <linux/ethtool.h> 49 #include <linux/sockios.h> 50 #include <linux/version.h> 51 #include <fcntl.h> 52 53 /* DPDK headers don't like -pedantic. */ 54 #ifdef PEDANTIC 55 #pragma GCC diagnostic ignored "-Wpedantic" 56 #endif 57 #include <rte_atomic.h> 58 #include <rte_ethdev.h> 59 #include <rte_mbuf.h> 60 #include <rte_common.h> 61 #include <rte_interrupts.h> 62 #include <rte_alarm.h> 63 #include <rte_malloc.h> 64 #ifdef PEDANTIC 65 #pragma GCC diagnostic error "-Wpedantic" 66 #endif 67 68 #include "mlx5.h" 69 #include "mlx5_rxtx.h" 70 #include "mlx5_utils.h" 71 72 /* Add defines in case the running kernel is not the same as user headers. */ 73 #ifndef ETHTOOL_GLINKSETTINGS 74 struct ethtool_link_settings { 75 uint32_t cmd; 76 uint32_t speed; 77 uint8_t duplex; 78 uint8_t port; 79 uint8_t phy_address; 80 uint8_t autoneg; 81 uint8_t mdio_support; 82 uint8_t eth_to_mdix; 83 uint8_t eth_tp_mdix_ctrl; 84 int8_t link_mode_masks_nwords; 85 uint32_t reserved[8]; 86 uint32_t link_mode_masks[]; 87 }; 88 89 #define ETHTOOL_GLINKSETTINGS 0x0000004c 90 #define ETHTOOL_LINK_MODE_1000baseT_Full_BIT 5 91 #define ETHTOOL_LINK_MODE_Autoneg_BIT 6 92 #define ETHTOOL_LINK_MODE_1000baseKX_Full_BIT 17 93 #define ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT 18 94 #define ETHTOOL_LINK_MODE_10000baseKR_Full_BIT 19 95 #define ETHTOOL_LINK_MODE_10000baseR_FEC_BIT 20 96 #define ETHTOOL_LINK_MODE_20000baseMLD2_Full_BIT 21 97 #define ETHTOOL_LINK_MODE_20000baseKR2_Full_BIT 22 98 #define ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT 23 99 #define ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT 24 100 #define ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT 25 101 #define ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT 26 102 #define ETHTOOL_LINK_MODE_56000baseKR4_Full_BIT 27 103 #define ETHTOOL_LINK_MODE_56000baseCR4_Full_BIT 28 104 #define ETHTOOL_LINK_MODE_56000baseSR4_Full_BIT 29 105 #define ETHTOOL_LINK_MODE_56000baseLR4_Full_BIT 30 106 #endif 107 #ifndef HAVE_ETHTOOL_LINK_MODE_25G 108 #define ETHTOOL_LINK_MODE_25000baseCR_Full_BIT 31 109 #define ETHTOOL_LINK_MODE_25000baseKR_Full_BIT 32 110 #define ETHTOOL_LINK_MODE_25000baseSR_Full_BIT 33 111 #endif 112 #ifndef HAVE_ETHTOOL_LINK_MODE_50G 113 #define ETHTOOL_LINK_MODE_50000baseCR2_Full_BIT 34 114 #define ETHTOOL_LINK_MODE_50000baseKR2_Full_BIT 35 115 #endif 116 #ifndef HAVE_ETHTOOL_LINK_MODE_100G 117 #define ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT 36 118 #define ETHTOOL_LINK_MODE_100000baseSR4_Full_BIT 37 119 #define ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT 38 120 #define ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT 39 121 #endif 122 #define ETHTOOL_LINK_MODE_MASK_MAX_KERNEL_NU32 (SCHAR_MAX) 123 124 /** 125 * Return private structure associated with an Ethernet device. 126 * 127 * @param dev 128 * Pointer to Ethernet device structure. 129 * 130 * @return 131 * Pointer to private structure. 132 */ 133 struct priv * 134 mlx5_get_priv(struct rte_eth_dev *dev) 135 { 136 struct mlx5_secondary_data *sd; 137 138 if (!mlx5_is_secondary()) 139 return dev->data->dev_private; 140 sd = &mlx5_secondary_data[dev->data->port_id]; 141 return sd->data.dev_private; 142 } 143 144 /** 145 * Check if running as a secondary process. 146 * 147 * @return 148 * Nonzero if running as a secondary process. 149 */ 150 inline int 151 mlx5_is_secondary(void) 152 { 153 return rte_eal_process_type() != RTE_PROC_PRIMARY; 154 } 155 156 /** 157 * Get interface name from private structure. 158 * 159 * @param[in] priv 160 * Pointer to private structure. 161 * @param[out] ifname 162 * Interface name output buffer. 163 * 164 * @return 165 * 0 on success, -1 on failure and errno is set. 166 */ 167 int 168 priv_get_ifname(const struct priv *priv, char (*ifname)[IF_NAMESIZE]) 169 { 170 DIR *dir; 171 struct dirent *dent; 172 unsigned int dev_type = 0; 173 unsigned int dev_port_prev = ~0u; 174 char match[IF_NAMESIZE] = ""; 175 176 { 177 MKSTR(path, "%s/device/net", priv->ctx->device->ibdev_path); 178 179 dir = opendir(path); 180 if (dir == NULL) 181 return -1; 182 } 183 while ((dent = readdir(dir)) != NULL) { 184 char *name = dent->d_name; 185 FILE *file; 186 unsigned int dev_port; 187 int r; 188 189 if ((name[0] == '.') && 190 ((name[1] == '\0') || 191 ((name[1] == '.') && (name[2] == '\0')))) 192 continue; 193 194 MKSTR(path, "%s/device/net/%s/%s", 195 priv->ctx->device->ibdev_path, name, 196 (dev_type ? "dev_id" : "dev_port")); 197 198 file = fopen(path, "rb"); 199 if (file == NULL) { 200 if (errno != ENOENT) 201 continue; 202 /* 203 * Switch to dev_id when dev_port does not exist as 204 * is the case with Linux kernel versions < 3.15. 205 */ 206 try_dev_id: 207 match[0] = '\0'; 208 if (dev_type) 209 break; 210 dev_type = 1; 211 dev_port_prev = ~0u; 212 rewinddir(dir); 213 continue; 214 } 215 r = fscanf(file, (dev_type ? "%x" : "%u"), &dev_port); 216 fclose(file); 217 if (r != 1) 218 continue; 219 /* 220 * Switch to dev_id when dev_port returns the same value for 221 * all ports. May happen when using a MOFED release older than 222 * 3.0 with a Linux kernel >= 3.15. 223 */ 224 if (dev_port == dev_port_prev) 225 goto try_dev_id; 226 dev_port_prev = dev_port; 227 if (dev_port == (priv->port - 1u)) 228 snprintf(match, sizeof(match), "%s", name); 229 } 230 closedir(dir); 231 if (match[0] == '\0') 232 return -1; 233 strncpy(*ifname, match, sizeof(*ifname)); 234 return 0; 235 } 236 237 /** 238 * Check if the counter is located on ib counters file. 239 * 240 * @param[in] cntr 241 * Counter name. 242 * 243 * @return 244 * 1 if counter is located on ib counters file , 0 otherwise. 245 */ 246 int 247 priv_is_ib_cntr(const char *cntr) 248 { 249 if (!strcmp(cntr, "out_of_buffer")) 250 return 1; 251 return 0; 252 } 253 254 /** 255 * Read from sysfs entry. 256 * 257 * @param[in] priv 258 * Pointer to private structure. 259 * @param[in] entry 260 * Entry name relative to sysfs path. 261 * @param[out] buf 262 * Data output buffer. 263 * @param size 264 * Buffer size. 265 * 266 * @return 267 * 0 on success, -1 on failure and errno is set. 268 */ 269 static int 270 priv_sysfs_read(const struct priv *priv, const char *entry, 271 char *buf, size_t size) 272 { 273 char ifname[IF_NAMESIZE]; 274 FILE *file; 275 int ret; 276 int err; 277 278 if (priv_get_ifname(priv, &ifname)) 279 return -1; 280 281 if (priv_is_ib_cntr(entry)) { 282 MKSTR(path, "%s/ports/1/hw_counters/%s", 283 priv->ctx->device->ibdev_path, entry); 284 file = fopen(path, "rb"); 285 } else { 286 MKSTR(path, "%s/device/net/%s/%s", 287 priv->ctx->device->ibdev_path, ifname, entry); 288 file = fopen(path, "rb"); 289 } 290 if (file == NULL) 291 return -1; 292 ret = fread(buf, 1, size, file); 293 err = errno; 294 if (((size_t)ret < size) && (ferror(file))) 295 ret = -1; 296 else 297 ret = size; 298 fclose(file); 299 errno = err; 300 return ret; 301 } 302 303 /** 304 * Write to sysfs entry. 305 * 306 * @param[in] priv 307 * Pointer to private structure. 308 * @param[in] entry 309 * Entry name relative to sysfs path. 310 * @param[in] buf 311 * Data buffer. 312 * @param size 313 * Buffer size. 314 * 315 * @return 316 * 0 on success, -1 on failure and errno is set. 317 */ 318 static int 319 priv_sysfs_write(const struct priv *priv, const char *entry, 320 char *buf, size_t size) 321 { 322 char ifname[IF_NAMESIZE]; 323 FILE *file; 324 int ret; 325 int err; 326 327 if (priv_get_ifname(priv, &ifname)) 328 return -1; 329 330 MKSTR(path, "%s/device/net/%s/%s", priv->ctx->device->ibdev_path, 331 ifname, entry); 332 333 file = fopen(path, "wb"); 334 if (file == NULL) 335 return -1; 336 ret = fwrite(buf, 1, size, file); 337 err = errno; 338 if (((size_t)ret < size) || (ferror(file))) 339 ret = -1; 340 else 341 ret = size; 342 fclose(file); 343 errno = err; 344 return ret; 345 } 346 347 /** 348 * Get unsigned long sysfs property. 349 * 350 * @param priv 351 * Pointer to private structure. 352 * @param[in] name 353 * Entry name relative to sysfs path. 354 * @param[out] value 355 * Value output buffer. 356 * 357 * @return 358 * 0 on success, -1 on failure and errno is set. 359 */ 360 static int 361 priv_get_sysfs_ulong(struct priv *priv, const char *name, unsigned long *value) 362 { 363 int ret; 364 unsigned long value_ret; 365 char value_str[32]; 366 367 ret = priv_sysfs_read(priv, name, value_str, (sizeof(value_str) - 1)); 368 if (ret == -1) { 369 DEBUG("cannot read %s value from sysfs: %s", 370 name, strerror(errno)); 371 return -1; 372 } 373 value_str[ret] = '\0'; 374 errno = 0; 375 value_ret = strtoul(value_str, NULL, 0); 376 if (errno) { 377 DEBUG("invalid %s value `%s': %s", name, value_str, 378 strerror(errno)); 379 return -1; 380 } 381 *value = value_ret; 382 return 0; 383 } 384 385 /** 386 * Set unsigned long sysfs property. 387 * 388 * @param priv 389 * Pointer to private structure. 390 * @param[in] name 391 * Entry name relative to sysfs path. 392 * @param value 393 * Value to set. 394 * 395 * @return 396 * 0 on success, -1 on failure and errno is set. 397 */ 398 static int 399 priv_set_sysfs_ulong(struct priv *priv, const char *name, unsigned long value) 400 { 401 int ret; 402 MKSTR(value_str, "%lu", value); 403 404 ret = priv_sysfs_write(priv, name, value_str, (sizeof(value_str) - 1)); 405 if (ret == -1) { 406 DEBUG("cannot write %s `%s' (%lu) to sysfs: %s", 407 name, value_str, value, strerror(errno)); 408 return -1; 409 } 410 return 0; 411 } 412 413 /** 414 * Perform ifreq ioctl() on associated Ethernet device. 415 * 416 * @param[in] priv 417 * Pointer to private structure. 418 * @param req 419 * Request number to pass to ioctl(). 420 * @param[out] ifr 421 * Interface request structure output buffer. 422 * 423 * @return 424 * 0 on success, -1 on failure and errno is set. 425 */ 426 int 427 priv_ifreq(const struct priv *priv, int req, struct ifreq *ifr) 428 { 429 int sock = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP); 430 int ret = -1; 431 432 if (sock == -1) 433 return ret; 434 if (priv_get_ifname(priv, &ifr->ifr_name) == 0) 435 ret = ioctl(sock, req, ifr); 436 close(sock); 437 return ret; 438 } 439 440 /** 441 * Return the number of active VFs for the current device. 442 * 443 * @param[in] priv 444 * Pointer to private structure. 445 * @param[out] num_vfs 446 * Number of active VFs. 447 * 448 * @return 449 * 0 on success, -1 on failure and errno is set. 450 */ 451 int 452 priv_get_num_vfs(struct priv *priv, uint16_t *num_vfs) 453 { 454 /* The sysfs entry name depends on the operating system. */ 455 const char **name = (const char *[]){ 456 "device/sriov_numvfs", 457 "device/mlx5_num_vfs", 458 NULL, 459 }; 460 int ret; 461 462 do { 463 unsigned long ulong_num_vfs; 464 465 ret = priv_get_sysfs_ulong(priv, *name, &ulong_num_vfs); 466 if (!ret) 467 *num_vfs = ulong_num_vfs; 468 } while (*(++name) && ret); 469 return ret; 470 } 471 472 /** 473 * Get device MTU. 474 * 475 * @param priv 476 * Pointer to private structure. 477 * @param[out] mtu 478 * MTU value output buffer. 479 * 480 * @return 481 * 0 on success, -1 on failure and errno is set. 482 */ 483 int 484 priv_get_mtu(struct priv *priv, uint16_t *mtu) 485 { 486 unsigned long ulong_mtu; 487 488 if (priv_get_sysfs_ulong(priv, "mtu", &ulong_mtu) == -1) 489 return -1; 490 *mtu = ulong_mtu; 491 return 0; 492 } 493 494 /** 495 * Read device counter from sysfs. 496 * 497 * @param priv 498 * Pointer to private structure. 499 * @param name 500 * Counter name. 501 * @param[out] cntr 502 * Counter output buffer. 503 * 504 * @return 505 * 0 on success, -1 on failure and errno is set. 506 */ 507 int 508 priv_get_cntr_sysfs(struct priv *priv, const char *name, uint64_t *cntr) 509 { 510 unsigned long ulong_ctr; 511 512 if (priv_get_sysfs_ulong(priv, name, &ulong_ctr) == -1) 513 return -1; 514 *cntr = ulong_ctr; 515 return 0; 516 } 517 518 /** 519 * Set device MTU. 520 * 521 * @param priv 522 * Pointer to private structure. 523 * @param mtu 524 * MTU value to set. 525 * 526 * @return 527 * 0 on success, -1 on failure and errno is set. 528 */ 529 static int 530 priv_set_mtu(struct priv *priv, uint16_t mtu) 531 { 532 uint16_t new_mtu; 533 534 if (priv_set_sysfs_ulong(priv, "mtu", mtu) || 535 priv_get_mtu(priv, &new_mtu)) 536 return -1; 537 if (new_mtu == mtu) 538 return 0; 539 errno = EINVAL; 540 return -1; 541 } 542 543 /** 544 * Set device flags. 545 * 546 * @param priv 547 * Pointer to private structure. 548 * @param keep 549 * Bitmask for flags that must remain untouched. 550 * @param flags 551 * Bitmask for flags to modify. 552 * 553 * @return 554 * 0 on success, -1 on failure and errno is set. 555 */ 556 int 557 priv_set_flags(struct priv *priv, unsigned int keep, unsigned int flags) 558 { 559 unsigned long tmp; 560 561 if (priv_get_sysfs_ulong(priv, "flags", &tmp) == -1) 562 return -1; 563 tmp &= keep; 564 tmp |= (flags & (~keep)); 565 return priv_set_sysfs_ulong(priv, "flags", tmp); 566 } 567 568 /** 569 * Ethernet device configuration. 570 * 571 * Prepare the driver for a given number of TX and RX queues. 572 * 573 * @param dev 574 * Pointer to Ethernet device structure. 575 * 576 * @return 577 * 0 on success, errno value on failure. 578 */ 579 static int 580 dev_configure(struct rte_eth_dev *dev) 581 { 582 struct priv *priv = dev->data->dev_private; 583 unsigned int rxqs_n = dev->data->nb_rx_queues; 584 unsigned int txqs_n = dev->data->nb_tx_queues; 585 unsigned int i; 586 unsigned int j; 587 unsigned int reta_idx_n; 588 589 priv->rss_hf = dev->data->dev_conf.rx_adv_conf.rss_conf.rss_hf; 590 priv->rxqs = (void *)dev->data->rx_queues; 591 priv->txqs = (void *)dev->data->tx_queues; 592 if (txqs_n != priv->txqs_n) { 593 INFO("%p: TX queues number update: %u -> %u", 594 (void *)dev, priv->txqs_n, txqs_n); 595 priv->txqs_n = txqs_n; 596 } 597 if (rxqs_n > priv->ind_table_max_size) { 598 ERROR("cannot handle this many RX queues (%u)", rxqs_n); 599 return EINVAL; 600 } 601 if (rxqs_n == priv->rxqs_n) 602 return 0; 603 INFO("%p: RX queues number update: %u -> %u", 604 (void *)dev, priv->rxqs_n, rxqs_n); 605 priv->rxqs_n = rxqs_n; 606 /* If the requested number of RX queues is not a power of two, use the 607 * maximum indirection table size for better balancing. 608 * The result is always rounded to the next power of two. */ 609 reta_idx_n = (1 << log2above((rxqs_n & (rxqs_n - 1)) ? 610 priv->ind_table_max_size : 611 rxqs_n)); 612 if (priv_rss_reta_index_resize(priv, reta_idx_n)) 613 return ENOMEM; 614 /* When the number of RX queues is not a power of two, the remaining 615 * table entries are padded with reused WQs and hashes are not spread 616 * uniformly. */ 617 for (i = 0, j = 0; (i != reta_idx_n); ++i) { 618 (*priv->reta_idx)[i] = j; 619 if (++j == rxqs_n) 620 j = 0; 621 } 622 return 0; 623 } 624 625 /** 626 * DPDK callback for Ethernet device configuration. 627 * 628 * @param dev 629 * Pointer to Ethernet device structure. 630 * 631 * @return 632 * 0 on success, negative errno value on failure. 633 */ 634 int 635 mlx5_dev_configure(struct rte_eth_dev *dev) 636 { 637 struct priv *priv = dev->data->dev_private; 638 int ret; 639 640 if (mlx5_is_secondary()) 641 return -E_RTE_SECONDARY; 642 643 priv_lock(priv); 644 ret = dev_configure(dev); 645 assert(ret >= 0); 646 priv_unlock(priv); 647 return -ret; 648 } 649 650 /** 651 * DPDK callback to get information about the device. 652 * 653 * @param dev 654 * Pointer to Ethernet device structure. 655 * @param[out] info 656 * Info structure output buffer. 657 */ 658 void 659 mlx5_dev_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *info) 660 { 661 struct priv *priv = mlx5_get_priv(dev); 662 unsigned int max; 663 char ifname[IF_NAMESIZE]; 664 665 info->pci_dev = RTE_ETH_DEV_TO_PCI(dev); 666 667 priv_lock(priv); 668 /* FIXME: we should ask the device for these values. */ 669 info->min_rx_bufsize = 32; 670 info->max_rx_pktlen = 65536; 671 /* 672 * Since we need one CQ per QP, the limit is the minimum number 673 * between the two values. 674 */ 675 max = ((priv->device_attr.max_cq > priv->device_attr.max_qp) ? 676 priv->device_attr.max_qp : priv->device_attr.max_cq); 677 /* If max >= 65535 then max = 0, max_rx_queues is uint16_t. */ 678 if (max >= 65535) 679 max = 65535; 680 info->max_rx_queues = max; 681 info->max_tx_queues = max; 682 info->max_mac_addrs = RTE_DIM(priv->mac); 683 info->rx_offload_capa = 684 (priv->hw_csum ? 685 (DEV_RX_OFFLOAD_IPV4_CKSUM | 686 DEV_RX_OFFLOAD_UDP_CKSUM | 687 DEV_RX_OFFLOAD_TCP_CKSUM) : 688 0) | 689 (priv->hw_vlan_strip ? DEV_RX_OFFLOAD_VLAN_STRIP : 0); 690 if (!priv->mps) 691 info->tx_offload_capa = DEV_TX_OFFLOAD_VLAN_INSERT; 692 if (priv->hw_csum) 693 info->tx_offload_capa |= 694 (DEV_TX_OFFLOAD_IPV4_CKSUM | 695 DEV_TX_OFFLOAD_UDP_CKSUM | 696 DEV_TX_OFFLOAD_TCP_CKSUM); 697 if (priv->tso) 698 info->tx_offload_capa |= DEV_TX_OFFLOAD_TCP_TSO; 699 if (priv->tunnel_en) 700 info->tx_offload_capa |= (DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM | 701 DEV_TX_OFFLOAD_VXLAN_TNL_TSO | 702 DEV_TX_OFFLOAD_GRE_TNL_TSO); 703 if (priv_get_ifname(priv, &ifname) == 0) 704 info->if_index = if_nametoindex(ifname); 705 info->reta_size = priv->reta_idx_n ? 706 priv->reta_idx_n : priv->ind_table_max_size; 707 info->hash_key_size = ((*priv->rss_conf) ? 708 (*priv->rss_conf)[0]->rss_key_len : 709 0); 710 info->speed_capa = priv->link_speed_capa; 711 priv_unlock(priv); 712 } 713 714 const uint32_t * 715 mlx5_dev_supported_ptypes_get(struct rte_eth_dev *dev) 716 { 717 static const uint32_t ptypes[] = { 718 /* refers to rxq_cq_to_pkt_type() */ 719 RTE_PTYPE_L3_IPV4_EXT_UNKNOWN, 720 RTE_PTYPE_L3_IPV6_EXT_UNKNOWN, 721 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN, 722 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN, 723 RTE_PTYPE_UNKNOWN 724 725 }; 726 727 if (dev->rx_pkt_burst == mlx5_rx_burst || 728 dev->rx_pkt_burst == mlx5_rx_burst_vec) 729 return ptypes; 730 return NULL; 731 } 732 733 /** 734 * DPDK callback to retrieve physical link information. 735 * 736 * @param dev 737 * Pointer to Ethernet device structure. 738 * @param wait_to_complete 739 * Wait for request completion (ignored). 740 */ 741 static int 742 mlx5_link_update_unlocked_gset(struct rte_eth_dev *dev, int wait_to_complete) 743 { 744 struct priv *priv = mlx5_get_priv(dev); 745 struct ethtool_cmd edata = { 746 .cmd = ETHTOOL_GSET /* Deprecated since Linux v4.5. */ 747 }; 748 struct ifreq ifr; 749 struct rte_eth_link dev_link; 750 int link_speed = 0; 751 752 /* priv_lock() is not taken to allow concurrent calls. */ 753 754 (void)wait_to_complete; 755 if (priv_ifreq(priv, SIOCGIFFLAGS, &ifr)) { 756 WARN("ioctl(SIOCGIFFLAGS) failed: %s", strerror(errno)); 757 return -1; 758 } 759 memset(&dev_link, 0, sizeof(dev_link)); 760 dev_link.link_status = ((ifr.ifr_flags & IFF_UP) && 761 (ifr.ifr_flags & IFF_RUNNING)); 762 ifr.ifr_data = (void *)&edata; 763 if (priv_ifreq(priv, SIOCETHTOOL, &ifr)) { 764 WARN("ioctl(SIOCETHTOOL, ETHTOOL_GSET) failed: %s", 765 strerror(errno)); 766 return -1; 767 } 768 link_speed = ethtool_cmd_speed(&edata); 769 if (link_speed == -1) 770 dev_link.link_speed = 0; 771 else 772 dev_link.link_speed = link_speed; 773 priv->link_speed_capa = 0; 774 if (edata.supported & SUPPORTED_Autoneg) 775 priv->link_speed_capa |= ETH_LINK_SPEED_AUTONEG; 776 if (edata.supported & (SUPPORTED_1000baseT_Full | 777 SUPPORTED_1000baseKX_Full)) 778 priv->link_speed_capa |= ETH_LINK_SPEED_1G; 779 if (edata.supported & SUPPORTED_10000baseKR_Full) 780 priv->link_speed_capa |= ETH_LINK_SPEED_10G; 781 if (edata.supported & (SUPPORTED_40000baseKR4_Full | 782 SUPPORTED_40000baseCR4_Full | 783 SUPPORTED_40000baseSR4_Full | 784 SUPPORTED_40000baseLR4_Full)) 785 priv->link_speed_capa |= ETH_LINK_SPEED_40G; 786 dev_link.link_duplex = ((edata.duplex == DUPLEX_HALF) ? 787 ETH_LINK_HALF_DUPLEX : ETH_LINK_FULL_DUPLEX); 788 dev_link.link_autoneg = !(dev->data->dev_conf.link_speeds & 789 ETH_LINK_SPEED_FIXED); 790 if (memcmp(&dev_link, &dev->data->dev_link, sizeof(dev_link))) { 791 /* Link status changed. */ 792 dev->data->dev_link = dev_link; 793 return 0; 794 } 795 /* Link status is still the same. */ 796 return -1; 797 } 798 799 /** 800 * Retrieve physical link information (unlocked version using new ioctl). 801 * 802 * @param dev 803 * Pointer to Ethernet device structure. 804 * @param wait_to_complete 805 * Wait for request completion (ignored). 806 */ 807 static int 808 mlx5_link_update_unlocked_gs(struct rte_eth_dev *dev, int wait_to_complete) 809 { 810 struct priv *priv = mlx5_get_priv(dev); 811 __extension__ struct { 812 struct ethtool_link_settings edata; 813 uint32_t link_mode_data[3 * 814 ETHTOOL_LINK_MODE_MASK_MAX_KERNEL_NU32]; 815 } ecmd; 816 817 struct ifreq ifr; 818 struct rte_eth_link dev_link; 819 uint64_t sc; 820 821 (void)wait_to_complete; 822 if (priv_ifreq(priv, SIOCGIFFLAGS, &ifr)) { 823 WARN("ioctl(SIOCGIFFLAGS) failed: %s", strerror(errno)); 824 return -1; 825 } 826 memset(&dev_link, 0, sizeof(dev_link)); 827 dev_link.link_status = ((ifr.ifr_flags & IFF_UP) && 828 (ifr.ifr_flags & IFF_RUNNING)); 829 memset(&ecmd, 0, sizeof(ecmd)); 830 ecmd.edata.cmd = ETHTOOL_GLINKSETTINGS; 831 ifr.ifr_data = (void *)&ecmd; 832 if (priv_ifreq(priv, SIOCETHTOOL, &ifr)) { 833 DEBUG("ioctl(SIOCETHTOOL, ETHTOOL_GLINKSETTINGS) failed: %s", 834 strerror(errno)); 835 return -1; 836 } 837 ecmd.edata.link_mode_masks_nwords = -ecmd.edata.link_mode_masks_nwords; 838 if (priv_ifreq(priv, SIOCETHTOOL, &ifr)) { 839 DEBUG("ioctl(SIOCETHTOOL, ETHTOOL_GLINKSETTINGS) failed: %s", 840 strerror(errno)); 841 return -1; 842 } 843 dev_link.link_speed = ecmd.edata.speed; 844 sc = ecmd.edata.link_mode_masks[0] | 845 ((uint64_t)ecmd.edata.link_mode_masks[1] << 32); 846 priv->link_speed_capa = 0; 847 if (sc & ETHTOOL_LINK_MODE_Autoneg_BIT) 848 priv->link_speed_capa |= ETH_LINK_SPEED_AUTONEG; 849 if (sc & (ETHTOOL_LINK_MODE_1000baseT_Full_BIT | 850 ETHTOOL_LINK_MODE_1000baseKX_Full_BIT)) 851 priv->link_speed_capa |= ETH_LINK_SPEED_1G; 852 if (sc & (ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT | 853 ETHTOOL_LINK_MODE_10000baseKR_Full_BIT | 854 ETHTOOL_LINK_MODE_10000baseR_FEC_BIT)) 855 priv->link_speed_capa |= ETH_LINK_SPEED_10G; 856 if (sc & (ETHTOOL_LINK_MODE_20000baseMLD2_Full_BIT | 857 ETHTOOL_LINK_MODE_20000baseKR2_Full_BIT)) 858 priv->link_speed_capa |= ETH_LINK_SPEED_20G; 859 if (sc & (ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT | 860 ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT | 861 ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT | 862 ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT)) 863 priv->link_speed_capa |= ETH_LINK_SPEED_40G; 864 if (sc & (ETHTOOL_LINK_MODE_56000baseKR4_Full_BIT | 865 ETHTOOL_LINK_MODE_56000baseCR4_Full_BIT | 866 ETHTOOL_LINK_MODE_56000baseSR4_Full_BIT | 867 ETHTOOL_LINK_MODE_56000baseLR4_Full_BIT)) 868 priv->link_speed_capa |= ETH_LINK_SPEED_56G; 869 if (sc & (ETHTOOL_LINK_MODE_25000baseCR_Full_BIT | 870 ETHTOOL_LINK_MODE_25000baseKR_Full_BIT | 871 ETHTOOL_LINK_MODE_25000baseSR_Full_BIT)) 872 priv->link_speed_capa |= ETH_LINK_SPEED_25G; 873 if (sc & (ETHTOOL_LINK_MODE_50000baseCR2_Full_BIT | 874 ETHTOOL_LINK_MODE_50000baseKR2_Full_BIT)) 875 priv->link_speed_capa |= ETH_LINK_SPEED_50G; 876 if (sc & (ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT | 877 ETHTOOL_LINK_MODE_100000baseSR4_Full_BIT | 878 ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT | 879 ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT)) 880 priv->link_speed_capa |= ETH_LINK_SPEED_100G; 881 dev_link.link_duplex = ((ecmd.edata.duplex == DUPLEX_HALF) ? 882 ETH_LINK_HALF_DUPLEX : ETH_LINK_FULL_DUPLEX); 883 dev_link.link_autoneg = !(dev->data->dev_conf.link_speeds & 884 ETH_LINK_SPEED_FIXED); 885 if (memcmp(&dev_link, &dev->data->dev_link, sizeof(dev_link))) { 886 /* Link status changed. */ 887 dev->data->dev_link = dev_link; 888 return 0; 889 } 890 /* Link status is still the same. */ 891 return -1; 892 } 893 894 /** 895 * DPDK callback to retrieve physical link information. 896 * 897 * @param dev 898 * Pointer to Ethernet device structure. 899 * @param wait_to_complete 900 * Wait for request completion (ignored). 901 */ 902 int 903 mlx5_link_update(struct rte_eth_dev *dev, int wait_to_complete) 904 { 905 struct utsname utsname; 906 int ver[3]; 907 908 if (uname(&utsname) == -1 || 909 sscanf(utsname.release, "%d.%d.%d", 910 &ver[0], &ver[1], &ver[2]) != 3 || 911 KERNEL_VERSION(ver[0], ver[1], ver[2]) < KERNEL_VERSION(4, 9, 0)) 912 return mlx5_link_update_unlocked_gset(dev, wait_to_complete); 913 return mlx5_link_update_unlocked_gs(dev, wait_to_complete); 914 } 915 916 /** 917 * DPDK callback to change the MTU. 918 * 919 * Setting the MTU affects hardware MRU (packets larger than the MTU cannot be 920 * received). Use this as a hint to enable/disable scattered packets support 921 * and improve performance when not needed. 922 * Since failure is not an option, reconfiguring queues on the fly is not 923 * recommended. 924 * 925 * @param dev 926 * Pointer to Ethernet device structure. 927 * @param in_mtu 928 * New MTU. 929 * 930 * @return 931 * 0 on success, negative errno value on failure. 932 */ 933 int 934 mlx5_dev_set_mtu(struct rte_eth_dev *dev, uint16_t mtu) 935 { 936 struct priv *priv = dev->data->dev_private; 937 int ret = 0; 938 unsigned int i; 939 unsigned int max_frame_len; 940 int rehash; 941 int restart = priv->started; 942 943 if (mlx5_is_secondary()) 944 return -E_RTE_SECONDARY; 945 946 priv_lock(priv); 947 /* Set kernel interface MTU first. */ 948 if (priv_set_mtu(priv, mtu)) { 949 ret = errno; 950 WARN("cannot set port %u MTU to %u: %s", priv->port, mtu, 951 strerror(ret)); 952 goto out; 953 } else 954 DEBUG("adapter port %u MTU set to %u", priv->port, mtu); 955 /* Temporarily replace RX handler with a fake one, assuming it has not 956 * been copied elsewhere. */ 957 dev->rx_pkt_burst = removed_rx_burst; 958 /* Make sure everyone has left dev->rx_pkt_burst() and uses 959 * removed_rx_burst() instead. */ 960 rte_wmb(); 961 usleep(1000); 962 /* MTU does not include header and CRC. */ 963 max_frame_len = ETHER_HDR_LEN + mtu + ETHER_CRC_LEN; 964 /* Check if at least one queue is going to need a SGE update. */ 965 for (i = 0; i != priv->rxqs_n; ++i) { 966 struct rxq *rxq = (*priv->rxqs)[i]; 967 unsigned int mb_len; 968 unsigned int size = RTE_PKTMBUF_HEADROOM + max_frame_len; 969 unsigned int sges_n; 970 971 if (rxq == NULL) 972 continue; 973 mb_len = rte_pktmbuf_data_room_size(rxq->mp); 974 assert(mb_len >= RTE_PKTMBUF_HEADROOM); 975 /* 976 * Determine the number of SGEs needed for a full packet 977 * and round it to the next power of two. 978 */ 979 sges_n = log2above((size / mb_len) + !!(size % mb_len)); 980 if (sges_n != rxq->sges_n) 981 break; 982 } 983 /* 984 * If all queues have the right number of SGEs, a simple rehash 985 * of their buffers is enough, otherwise SGE information can only 986 * be updated in a queue by recreating it. All resources that depend 987 * on queues (flows, indirection tables) must be recreated as well in 988 * that case. 989 */ 990 rehash = (i == priv->rxqs_n); 991 if (!rehash) { 992 /* Clean up everything as with mlx5_dev_stop(). */ 993 priv_special_flow_disable_all(priv); 994 priv_mac_addrs_disable(priv); 995 priv_destroy_hash_rxqs(priv); 996 priv_fdir_disable(priv); 997 priv_dev_interrupt_handler_uninstall(priv, dev); 998 } 999 recover: 1000 /* Reconfigure each RX queue. */ 1001 for (i = 0; (i != priv->rxqs_n); ++i) { 1002 struct rxq *rxq = (*priv->rxqs)[i]; 1003 struct rxq_ctrl *rxq_ctrl = 1004 container_of(rxq, struct rxq_ctrl, rxq); 1005 unsigned int mb_len; 1006 unsigned int tmp; 1007 1008 if (rxq == NULL) 1009 continue; 1010 mb_len = rte_pktmbuf_data_room_size(rxq->mp); 1011 assert(mb_len >= RTE_PKTMBUF_HEADROOM); 1012 /* Provide new values to rxq_setup(). */ 1013 dev->data->dev_conf.rxmode.jumbo_frame = 1014 (max_frame_len > ETHER_MAX_LEN); 1015 dev->data->dev_conf.rxmode.max_rx_pkt_len = max_frame_len; 1016 if (rehash) 1017 ret = rxq_rehash(dev, rxq_ctrl); 1018 else 1019 ret = rxq_ctrl_setup(dev, rxq_ctrl, 1 << rxq->elts_n, 1020 rxq_ctrl->socket, NULL, rxq->mp); 1021 if (!ret) 1022 continue; 1023 /* Attempt to roll back in case of error. */ 1024 tmp = (mb_len << rxq->sges_n) - RTE_PKTMBUF_HEADROOM; 1025 if (max_frame_len != tmp) { 1026 max_frame_len = tmp; 1027 goto recover; 1028 } 1029 /* Double fault, disable RX. */ 1030 break; 1031 } 1032 /* Mimic mlx5_dev_start(). */ 1033 if (ret) { 1034 ERROR("unable to reconfigure RX queues, RX disabled"); 1035 } else if (restart && 1036 !rehash && 1037 !priv_create_hash_rxqs(priv) && 1038 !priv_rehash_flows(priv)) { 1039 if (dev->data->dev_conf.fdir_conf.mode == RTE_FDIR_MODE_NONE) 1040 priv_fdir_enable(priv); 1041 priv_dev_interrupt_handler_install(priv, dev); 1042 } 1043 priv->mtu = mtu; 1044 /* Burst functions can now be called again. */ 1045 rte_wmb(); 1046 /* 1047 * Use a safe RX burst function in case of error, otherwise select RX 1048 * burst function again. 1049 */ 1050 if (!ret) 1051 priv_select_rx_function(priv); 1052 out: 1053 priv_unlock(priv); 1054 assert(ret >= 0); 1055 return -ret; 1056 } 1057 1058 /** 1059 * DPDK callback to get flow control status. 1060 * 1061 * @param dev 1062 * Pointer to Ethernet device structure. 1063 * @param[out] fc_conf 1064 * Flow control output buffer. 1065 * 1066 * @return 1067 * 0 on success, negative errno value on failure. 1068 */ 1069 int 1070 mlx5_dev_get_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf) 1071 { 1072 struct priv *priv = dev->data->dev_private; 1073 struct ifreq ifr; 1074 struct ethtool_pauseparam ethpause = { 1075 .cmd = ETHTOOL_GPAUSEPARAM 1076 }; 1077 int ret; 1078 1079 if (mlx5_is_secondary()) 1080 return -E_RTE_SECONDARY; 1081 1082 ifr.ifr_data = (void *)ðpause; 1083 priv_lock(priv); 1084 if (priv_ifreq(priv, SIOCETHTOOL, &ifr)) { 1085 ret = errno; 1086 WARN("ioctl(SIOCETHTOOL, ETHTOOL_GPAUSEPARAM)" 1087 " failed: %s", 1088 strerror(ret)); 1089 goto out; 1090 } 1091 1092 fc_conf->autoneg = ethpause.autoneg; 1093 if (ethpause.rx_pause && ethpause.tx_pause) 1094 fc_conf->mode = RTE_FC_FULL; 1095 else if (ethpause.rx_pause) 1096 fc_conf->mode = RTE_FC_RX_PAUSE; 1097 else if (ethpause.tx_pause) 1098 fc_conf->mode = RTE_FC_TX_PAUSE; 1099 else 1100 fc_conf->mode = RTE_FC_NONE; 1101 ret = 0; 1102 1103 out: 1104 priv_unlock(priv); 1105 assert(ret >= 0); 1106 return -ret; 1107 } 1108 1109 /** 1110 * DPDK callback to modify flow control parameters. 1111 * 1112 * @param dev 1113 * Pointer to Ethernet device structure. 1114 * @param[in] fc_conf 1115 * Flow control parameters. 1116 * 1117 * @return 1118 * 0 on success, negative errno value on failure. 1119 */ 1120 int 1121 mlx5_dev_set_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf) 1122 { 1123 struct priv *priv = dev->data->dev_private; 1124 struct ifreq ifr; 1125 struct ethtool_pauseparam ethpause = { 1126 .cmd = ETHTOOL_SPAUSEPARAM 1127 }; 1128 int ret; 1129 1130 if (mlx5_is_secondary()) 1131 return -E_RTE_SECONDARY; 1132 1133 ifr.ifr_data = (void *)ðpause; 1134 ethpause.autoneg = fc_conf->autoneg; 1135 if (((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL) || 1136 (fc_conf->mode & RTE_FC_RX_PAUSE)) 1137 ethpause.rx_pause = 1; 1138 else 1139 ethpause.rx_pause = 0; 1140 1141 if (((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL) || 1142 (fc_conf->mode & RTE_FC_TX_PAUSE)) 1143 ethpause.tx_pause = 1; 1144 else 1145 ethpause.tx_pause = 0; 1146 1147 priv_lock(priv); 1148 if (priv_ifreq(priv, SIOCETHTOOL, &ifr)) { 1149 ret = errno; 1150 WARN("ioctl(SIOCETHTOOL, ETHTOOL_SPAUSEPARAM)" 1151 " failed: %s", 1152 strerror(ret)); 1153 goto out; 1154 } 1155 ret = 0; 1156 1157 out: 1158 priv_unlock(priv); 1159 assert(ret >= 0); 1160 return -ret; 1161 } 1162 1163 /** 1164 * Get PCI information from struct ibv_device. 1165 * 1166 * @param device 1167 * Pointer to Ethernet device structure. 1168 * @param[out] pci_addr 1169 * PCI bus address output buffer. 1170 * 1171 * @return 1172 * 0 on success, -1 on failure and errno is set. 1173 */ 1174 int 1175 mlx5_ibv_device_to_pci_addr(const struct ibv_device *device, 1176 struct rte_pci_addr *pci_addr) 1177 { 1178 FILE *file; 1179 char line[32]; 1180 MKSTR(path, "%s/device/uevent", device->ibdev_path); 1181 1182 file = fopen(path, "rb"); 1183 if (file == NULL) 1184 return -1; 1185 while (fgets(line, sizeof(line), file) == line) { 1186 size_t len = strlen(line); 1187 int ret; 1188 1189 /* Truncate long lines. */ 1190 if (len == (sizeof(line) - 1)) 1191 while (line[(len - 1)] != '\n') { 1192 ret = fgetc(file); 1193 if (ret == EOF) 1194 break; 1195 line[(len - 1)] = ret; 1196 } 1197 /* Extract information. */ 1198 if (sscanf(line, 1199 "PCI_SLOT_NAME=" 1200 "%" SCNx32 ":%" SCNx8 ":%" SCNx8 ".%" SCNx8 "\n", 1201 &pci_addr->domain, 1202 &pci_addr->bus, 1203 &pci_addr->devid, 1204 &pci_addr->function) == 4) { 1205 ret = 0; 1206 break; 1207 } 1208 } 1209 fclose(file); 1210 return 0; 1211 } 1212 1213 /** 1214 * Link status handler. 1215 * 1216 * @param priv 1217 * Pointer to private structure. 1218 * @param dev 1219 * Pointer to the rte_eth_dev structure. 1220 * 1221 * @return 1222 * Nonzero if the callback process can be called immediately. 1223 */ 1224 static int 1225 priv_dev_link_status_handler(struct priv *priv, struct rte_eth_dev *dev) 1226 { 1227 struct ibv_async_event event; 1228 struct rte_eth_link *link = &dev->data->dev_link; 1229 int ret = 0; 1230 1231 /* Read all message and acknowledge them. */ 1232 for (;;) { 1233 if (ibv_get_async_event(priv->ctx, &event)) 1234 break; 1235 1236 if (event.event_type != IBV_EVENT_PORT_ACTIVE && 1237 event.event_type != IBV_EVENT_PORT_ERR) 1238 DEBUG("event type %d on port %d not handled", 1239 event.event_type, event.element.port_num); 1240 ibv_ack_async_event(&event); 1241 } 1242 mlx5_link_update(dev, 0); 1243 if (((link->link_speed == 0) && link->link_status) || 1244 ((link->link_speed != 0) && !link->link_status)) { 1245 if (!priv->pending_alarm) { 1246 /* Inconsistent status, check again later. */ 1247 priv->pending_alarm = 1; 1248 rte_eal_alarm_set(MLX5_ALARM_TIMEOUT_US, 1249 mlx5_dev_link_status_handler, 1250 dev); 1251 } 1252 } else { 1253 ret = 1; 1254 } 1255 return ret; 1256 } 1257 1258 /** 1259 * Handle delayed link status event. 1260 * 1261 * @param arg 1262 * Registered argument. 1263 */ 1264 void 1265 mlx5_dev_link_status_handler(void *arg) 1266 { 1267 struct rte_eth_dev *dev = arg; 1268 struct priv *priv = dev->data->dev_private; 1269 int ret; 1270 1271 priv_lock(priv); 1272 assert(priv->pending_alarm == 1); 1273 priv->pending_alarm = 0; 1274 ret = priv_dev_link_status_handler(priv, dev); 1275 priv_unlock(priv); 1276 if (ret) 1277 _rte_eth_dev_callback_process(dev, RTE_ETH_EVENT_INTR_LSC, NULL, 1278 NULL); 1279 } 1280 1281 /** 1282 * Handle interrupts from the NIC. 1283 * 1284 * @param[in] intr_handle 1285 * Interrupt handler. 1286 * @param cb_arg 1287 * Callback argument. 1288 */ 1289 void 1290 mlx5_dev_interrupt_handler(void *cb_arg) 1291 { 1292 struct rte_eth_dev *dev = cb_arg; 1293 struct priv *priv = dev->data->dev_private; 1294 int ret; 1295 1296 priv_lock(priv); 1297 ret = priv_dev_link_status_handler(priv, dev); 1298 priv_unlock(priv); 1299 if (ret) 1300 _rte_eth_dev_callback_process(dev, RTE_ETH_EVENT_INTR_LSC, NULL, 1301 NULL); 1302 } 1303 1304 /** 1305 * Uninstall interrupt handler. 1306 * 1307 * @param priv 1308 * Pointer to private structure. 1309 * @param dev 1310 * Pointer to the rte_eth_dev structure. 1311 */ 1312 void 1313 priv_dev_interrupt_handler_uninstall(struct priv *priv, struct rte_eth_dev *dev) 1314 { 1315 if (!dev->data->dev_conf.intr_conf.lsc) 1316 return; 1317 rte_intr_callback_unregister(&priv->intr_handle, 1318 mlx5_dev_interrupt_handler, 1319 dev); 1320 if (priv->pending_alarm) 1321 rte_eal_alarm_cancel(mlx5_dev_link_status_handler, dev); 1322 priv->pending_alarm = 0; 1323 priv->intr_handle.fd = 0; 1324 priv->intr_handle.type = RTE_INTR_HANDLE_UNKNOWN; 1325 } 1326 1327 /** 1328 * Install interrupt handler. 1329 * 1330 * @param priv 1331 * Pointer to private structure. 1332 * @param dev 1333 * Pointer to the rte_eth_dev structure. 1334 */ 1335 void 1336 priv_dev_interrupt_handler_install(struct priv *priv, struct rte_eth_dev *dev) 1337 { 1338 int rc, flags; 1339 1340 if (!dev->data->dev_conf.intr_conf.lsc) 1341 return; 1342 assert(priv->ctx->async_fd > 0); 1343 flags = fcntl(priv->ctx->async_fd, F_GETFL); 1344 rc = fcntl(priv->ctx->async_fd, F_SETFL, flags | O_NONBLOCK); 1345 if (rc < 0) { 1346 INFO("failed to change file descriptor async event queue"); 1347 dev->data->dev_conf.intr_conf.lsc = 0; 1348 } else { 1349 priv->intr_handle.fd = priv->ctx->async_fd; 1350 priv->intr_handle.type = RTE_INTR_HANDLE_EXT; 1351 rte_intr_callback_register(&priv->intr_handle, 1352 mlx5_dev_interrupt_handler, 1353 dev); 1354 } 1355 } 1356 1357 /** 1358 * Change the link state (UP / DOWN). 1359 * 1360 * @param priv 1361 * Pointer to Ethernet device structure. 1362 * @param up 1363 * Nonzero for link up, otherwise link down. 1364 * 1365 * @return 1366 * 0 on success, errno value on failure. 1367 */ 1368 static int 1369 priv_set_link(struct priv *priv, int up) 1370 { 1371 struct rte_eth_dev *dev = priv->dev; 1372 int err; 1373 1374 if (up) { 1375 err = priv_set_flags(priv, ~IFF_UP, IFF_UP); 1376 if (err) 1377 return err; 1378 priv_select_tx_function(priv); 1379 priv_select_rx_function(priv); 1380 } else { 1381 err = priv_set_flags(priv, ~IFF_UP, ~IFF_UP); 1382 if (err) 1383 return err; 1384 dev->rx_pkt_burst = removed_rx_burst; 1385 dev->tx_pkt_burst = removed_tx_burst; 1386 } 1387 return 0; 1388 } 1389 1390 /** 1391 * DPDK callback to bring the link DOWN. 1392 * 1393 * @param dev 1394 * Pointer to Ethernet device structure. 1395 * 1396 * @return 1397 * 0 on success, errno value on failure. 1398 */ 1399 int 1400 mlx5_set_link_down(struct rte_eth_dev *dev) 1401 { 1402 struct priv *priv = dev->data->dev_private; 1403 int err; 1404 1405 priv_lock(priv); 1406 err = priv_set_link(priv, 0); 1407 priv_unlock(priv); 1408 return err; 1409 } 1410 1411 /** 1412 * DPDK callback to bring the link UP. 1413 * 1414 * @param dev 1415 * Pointer to Ethernet device structure. 1416 * 1417 * @return 1418 * 0 on success, errno value on failure. 1419 */ 1420 int 1421 mlx5_set_link_up(struct rte_eth_dev *dev) 1422 { 1423 struct priv *priv = dev->data->dev_private; 1424 int err; 1425 1426 priv_lock(priv); 1427 err = priv_set_link(priv, 1); 1428 priv_unlock(priv); 1429 return err; 1430 } 1431 1432 /** 1433 * Configure secondary process queues from a private data pointer (primary 1434 * or secondary) and update burst callbacks. Can take place only once. 1435 * 1436 * All queues must have been previously created by the primary process to 1437 * avoid undefined behavior. 1438 * 1439 * @param priv 1440 * Private data pointer from either primary or secondary process. 1441 * 1442 * @return 1443 * Private data pointer from secondary process, NULL in case of error. 1444 */ 1445 struct priv * 1446 mlx5_secondary_data_setup(struct priv *priv) 1447 { 1448 unsigned int port_id = 0; 1449 struct mlx5_secondary_data *sd; 1450 void **tx_queues; 1451 void **rx_queues; 1452 unsigned int nb_tx_queues; 1453 unsigned int nb_rx_queues; 1454 unsigned int i; 1455 1456 /* priv must be valid at this point. */ 1457 assert(priv != NULL); 1458 /* priv->dev must also be valid but may point to local memory from 1459 * another process, possibly with the same address and must not 1460 * be dereferenced yet. */ 1461 assert(priv->dev != NULL); 1462 /* Determine port ID by finding out where priv comes from. */ 1463 while (1) { 1464 sd = &mlx5_secondary_data[port_id]; 1465 rte_spinlock_lock(&sd->lock); 1466 /* Primary process? */ 1467 if (sd->primary_priv == priv) 1468 break; 1469 /* Secondary process? */ 1470 if (sd->data.dev_private == priv) 1471 break; 1472 rte_spinlock_unlock(&sd->lock); 1473 if (++port_id == RTE_DIM(mlx5_secondary_data)) 1474 port_id = 0; 1475 } 1476 /* Switch to secondary private structure. If private data has already 1477 * been updated by another thread, there is nothing else to do. */ 1478 priv = sd->data.dev_private; 1479 if (priv->dev->data == &sd->data) 1480 goto end; 1481 /* Sanity checks. Secondary private structure is supposed to point 1482 * to local eth_dev, itself still pointing to the shared device data 1483 * structure allocated by the primary process. */ 1484 assert(sd->shared_dev_data != &sd->data); 1485 assert(sd->data.nb_tx_queues == 0); 1486 assert(sd->data.tx_queues == NULL); 1487 assert(sd->data.nb_rx_queues == 0); 1488 assert(sd->data.rx_queues == NULL); 1489 assert(priv != sd->primary_priv); 1490 assert(priv->dev->data == sd->shared_dev_data); 1491 assert(priv->txqs_n == 0); 1492 assert(priv->txqs == NULL); 1493 assert(priv->rxqs_n == 0); 1494 assert(priv->rxqs == NULL); 1495 nb_tx_queues = sd->shared_dev_data->nb_tx_queues; 1496 nb_rx_queues = sd->shared_dev_data->nb_rx_queues; 1497 /* Allocate local storage for queues. */ 1498 tx_queues = rte_zmalloc("secondary ethdev->tx_queues", 1499 sizeof(sd->data.tx_queues[0]) * nb_tx_queues, 1500 RTE_CACHE_LINE_SIZE); 1501 rx_queues = rte_zmalloc("secondary ethdev->rx_queues", 1502 sizeof(sd->data.rx_queues[0]) * nb_rx_queues, 1503 RTE_CACHE_LINE_SIZE); 1504 if (tx_queues == NULL || rx_queues == NULL) 1505 goto error; 1506 /* Lock to prevent control operations during setup. */ 1507 priv_lock(priv); 1508 /* TX queues. */ 1509 for (i = 0; i != nb_tx_queues; ++i) { 1510 struct txq *primary_txq = (*sd->primary_priv->txqs)[i]; 1511 struct txq_ctrl *primary_txq_ctrl; 1512 struct txq_ctrl *txq_ctrl; 1513 1514 if (primary_txq == NULL) 1515 continue; 1516 primary_txq_ctrl = container_of(primary_txq, 1517 struct txq_ctrl, txq); 1518 txq_ctrl = rte_calloc_socket("TXQ", 1, sizeof(*txq_ctrl) + 1519 (1 << primary_txq->elts_n) * 1520 sizeof(struct rte_mbuf *), 0, 1521 primary_txq_ctrl->socket); 1522 if (txq_ctrl != NULL) { 1523 if (txq_ctrl_setup(priv->dev, 1524 txq_ctrl, 1525 1 << primary_txq->elts_n, 1526 primary_txq_ctrl->socket, 1527 NULL) == 0) { 1528 txq_ctrl->txq.stats.idx = 1529 primary_txq->stats.idx; 1530 tx_queues[i] = &txq_ctrl->txq; 1531 continue; 1532 } 1533 rte_free(txq_ctrl); 1534 } 1535 while (i) { 1536 txq_ctrl = tx_queues[--i]; 1537 txq_cleanup(txq_ctrl); 1538 rte_free(txq_ctrl); 1539 } 1540 goto error; 1541 } 1542 /* RX queues. */ 1543 for (i = 0; i != nb_rx_queues; ++i) { 1544 struct rxq_ctrl *primary_rxq = 1545 container_of((*sd->primary_priv->rxqs)[i], 1546 struct rxq_ctrl, rxq); 1547 1548 if (primary_rxq == NULL) 1549 continue; 1550 /* Not supported yet. */ 1551 rx_queues[i] = NULL; 1552 } 1553 /* Update everything. */ 1554 priv->txqs = (void *)tx_queues; 1555 priv->txqs_n = nb_tx_queues; 1556 priv->rxqs = (void *)rx_queues; 1557 priv->rxqs_n = nb_rx_queues; 1558 sd->data.rx_queues = rx_queues; 1559 sd->data.tx_queues = tx_queues; 1560 sd->data.nb_rx_queues = nb_rx_queues; 1561 sd->data.nb_tx_queues = nb_tx_queues; 1562 sd->data.dev_link = sd->shared_dev_data->dev_link; 1563 sd->data.mtu = sd->shared_dev_data->mtu; 1564 memcpy(sd->data.rx_queue_state, sd->shared_dev_data->rx_queue_state, 1565 sizeof(sd->data.rx_queue_state)); 1566 memcpy(sd->data.tx_queue_state, sd->shared_dev_data->tx_queue_state, 1567 sizeof(sd->data.tx_queue_state)); 1568 sd->data.dev_flags = sd->shared_dev_data->dev_flags; 1569 /* Use local data from now on. */ 1570 rte_mb(); 1571 priv->dev->data = &sd->data; 1572 rte_mb(); 1573 priv_select_tx_function(priv); 1574 priv_select_rx_function(priv); 1575 priv_unlock(priv); 1576 end: 1577 /* More sanity checks. */ 1578 assert(priv->dev->data == &sd->data); 1579 rte_spinlock_unlock(&sd->lock); 1580 return priv; 1581 error: 1582 priv_unlock(priv); 1583 rte_free(tx_queues); 1584 rte_free(rx_queues); 1585 rte_spinlock_unlock(&sd->lock); 1586 return NULL; 1587 } 1588 1589 /** 1590 * Configure the TX function to use. 1591 * 1592 * @param priv 1593 * Pointer to private structure. 1594 */ 1595 void 1596 priv_select_tx_function(struct priv *priv) 1597 { 1598 priv->dev->tx_pkt_burst = mlx5_tx_burst; 1599 /* Select appropriate TX function. */ 1600 if (priv->mps == MLX5_MPW_ENHANCED) { 1601 if (priv_check_vec_tx_support(priv) > 0) { 1602 if (priv_check_raw_vec_tx_support(priv) > 0) 1603 priv->dev->tx_pkt_burst = mlx5_tx_burst_raw_vec; 1604 else 1605 priv->dev->tx_pkt_burst = mlx5_tx_burst_vec; 1606 DEBUG("selected Enhanced MPW TX vectorized function"); 1607 } else { 1608 priv->dev->tx_pkt_burst = mlx5_tx_burst_empw; 1609 DEBUG("selected Enhanced MPW TX function"); 1610 } 1611 } else if (priv->mps && priv->txq_inline) { 1612 priv->dev->tx_pkt_burst = mlx5_tx_burst_mpw_inline; 1613 DEBUG("selected MPW inline TX function"); 1614 } else if (priv->mps) { 1615 priv->dev->tx_pkt_burst = mlx5_tx_burst_mpw; 1616 DEBUG("selected MPW TX function"); 1617 } 1618 } 1619 1620 /** 1621 * Configure the RX function to use. 1622 * 1623 * @param priv 1624 * Pointer to private structure. 1625 */ 1626 void 1627 priv_select_rx_function(struct priv *priv) 1628 { 1629 if (priv_check_vec_rx_support(priv) > 0) { 1630 priv_prep_vec_rx_function(priv); 1631 priv->dev->rx_pkt_burst = mlx5_rx_burst_vec; 1632 DEBUG("selected RX vectorized function"); 1633 } else { 1634 priv->dev->rx_pkt_burst = mlx5_rx_burst; 1635 } 1636 } 1637