1 /*- 2 * BSD LICENSE 3 * 4 * Copyright 2015 6WIND S.A. 5 * Copyright 2015 Mellanox. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * * Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * * Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * * Neither the name of 6WIND S.A. nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
32 */ 33 34 #include <stddef.h> 35 #include <assert.h> 36 #include <unistd.h> 37 #include <stdint.h> 38 #include <stdio.h> 39 #include <string.h> 40 #include <stdlib.h> 41 #include <errno.h> 42 #include <dirent.h> 43 #include <net/if.h> 44 #include <sys/ioctl.h> 45 #include <sys/socket.h> 46 #include <sys/utsname.h> 47 #include <netinet/in.h> 48 #include <linux/ethtool.h> 49 #include <linux/sockios.h> 50 #include <linux/version.h> 51 #include <fcntl.h> 52 53 /* DPDK headers don't like -pedantic. */ 54 #ifdef PEDANTIC 55 #pragma GCC diagnostic ignored "-Wpedantic" 56 #endif 57 #include <rte_atomic.h> 58 #include <rte_ethdev.h> 59 #include <rte_mbuf.h> 60 #include <rte_common.h> 61 #include <rte_interrupts.h> 62 #include <rte_alarm.h> 63 #include <rte_malloc.h> 64 #ifdef PEDANTIC 65 #pragma GCC diagnostic error "-Wpedantic" 66 #endif 67 68 #include "mlx5.h" 69 #include "mlx5_rxtx.h" 70 #include "mlx5_utils.h" 71 72 /* Add defines in case the running kernel is not the same as user headers. 
 */
#ifndef ETHTOOL_GLINKSETTINGS
/*
 * Compatibility definition of struct ethtool_link_settings for build
 * environments whose kernel headers predate the ETHTOOL_GLINKSETTINGS ioctl.
 */
struct ethtool_link_settings {
	uint32_t cmd;
	uint32_t speed;
	uint8_t duplex;
	uint8_t port;
	uint8_t phy_address;
	uint8_t autoneg;
	uint8_t mdio_support;
	uint8_t eth_to_mdix;
	uint8_t eth_tp_mdix_ctrl;
	int8_t link_mode_masks_nwords;
	uint32_t reserved[8];
	uint32_t link_mode_masks[]; /* Flexible array sized by nwords. */
};

#define ETHTOOL_GLINKSETTINGS 0x0000004c
/* NOTE: the ETHTOOL_LINK_MODE_* values below are bit positions, not masks. */
#define ETHTOOL_LINK_MODE_1000baseT_Full_BIT 5
#define ETHTOOL_LINK_MODE_Autoneg_BIT 6
#define ETHTOOL_LINK_MODE_1000baseKX_Full_BIT 17
#define ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT 18
#define ETHTOOL_LINK_MODE_10000baseKR_Full_BIT 19
#define ETHTOOL_LINK_MODE_10000baseR_FEC_BIT 20
#define ETHTOOL_LINK_MODE_20000baseMLD2_Full_BIT 21
#define ETHTOOL_LINK_MODE_20000baseKR2_Full_BIT 22
#define ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT 23
#define ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT 24
#define ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT 25
#define ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT 26
#define ETHTOOL_LINK_MODE_56000baseKR4_Full_BIT 27
#define ETHTOOL_LINK_MODE_56000baseCR4_Full_BIT 28
#define ETHTOOL_LINK_MODE_56000baseSR4_Full_BIT 29
#define ETHTOOL_LINK_MODE_56000baseLR4_Full_BIT 30
#endif
#ifndef HAVE_ETHTOOL_LINK_MODE_25G
#define ETHTOOL_LINK_MODE_25000baseCR_Full_BIT 31
#define ETHTOOL_LINK_MODE_25000baseKR_Full_BIT 32
#define ETHTOOL_LINK_MODE_25000baseSR_Full_BIT 33
#endif
#ifndef HAVE_ETHTOOL_LINK_MODE_50G
#define ETHTOOL_LINK_MODE_50000baseCR2_Full_BIT 34
#define ETHTOOL_LINK_MODE_50000baseKR2_Full_BIT 35
#endif
#ifndef HAVE_ETHTOOL_LINK_MODE_100G
#define ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT 36
#define ETHTOOL_LINK_MODE_100000baseSR4_Full_BIT 37
#define ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT 38
#define ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT 39
#endif
/* Upper bound (in 32-bit words) for one kernel link mode mask. */
#define ETHTOOL_LINK_MODE_MASK_MAX_KERNEL_NU32 (SCHAR_MAX)

/**
 *
 * Return private structure associated with an Ethernet device.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 *
 * @return
 *   Pointer to private structure.
 */
struct priv *
mlx5_get_priv(struct rte_eth_dev *dev)
{
	struct mlx5_secondary_data *sd;

	/* The primary process owns dev_private directly. */
	if (!mlx5_is_secondary())
		return dev->data->dev_private;
	/* Secondary processes use the per-port shadow data instead. */
	sd = &mlx5_secondary_data[dev->data->port_id];
	return sd->data.dev_private;
}

/**
 * Check if running as a secondary process.
 *
 * @return
 *   Nonzero if running as a secondary process.
 */
inline int
mlx5_is_secondary(void)
{
	return rte_eal_process_type() != RTE_PROC_PRIMARY;
}

/**
 * Get interface name from private structure.
 *
 * Scans the device's sysfs "net" directory for the netdev whose
 * "dev_port" (or "dev_id" on older kernels) matches this port.
 *
 * @param[in] priv
 *   Pointer to private structure.
 * @param[out] ifname
 *   Interface name output buffer.
 *
 * @return
 *   0 on success, -1 on failure and errno is set.
 */
int
priv_get_ifname(const struct priv *priv, char (*ifname)[IF_NAMESIZE])
{
	DIR *dir;
	struct dirent *dent;
	unsigned int dev_type = 0; /* 0: use dev_port, 1: use dev_id. */
	unsigned int dev_port_prev = ~0u;
	char match[IF_NAMESIZE] = "";

	{
		MKSTR(path, "%s/device/net", priv->ctx->device->ibdev_path);

		dir = opendir(path);
		if (dir == NULL)
			return -1;
	}
	while ((dent = readdir(dir)) != NULL) {
		char *name = dent->d_name;
		FILE *file;
		unsigned int dev_port;
		int r;

		/* Skip "." and ".." entries. */
		if ((name[0] == '.') &&
		    ((name[1] == '\0') ||
		     ((name[1] == '.') && (name[2] == '\0'))))
			continue;

		MKSTR(path, "%s/device/net/%s/%s",
		      priv->ctx->device->ibdev_path, name,
		      (dev_type ? "dev_id" : "dev_port"));

		file = fopen(path, "rb");
		if (file == NULL) {
			if (errno != ENOENT)
				continue;
			/*
			 * Switch to dev_id when dev_port does not exist as
			 * is the case with Linux kernel versions < 3.15.
			 */
try_dev_id:
			match[0] = '\0';
			if (dev_type)
				break;
			dev_type = 1;
			dev_port_prev = ~0u;
			/* Restart the scan with the dev_id attribute. */
			rewinddir(dir);
			continue;
		}
		/* dev_id is hexadecimal, dev_port is decimal. */
		r = fscanf(file, (dev_type ? "%x" : "%u"), &dev_port);
		fclose(file);
		if (r != 1)
			continue;
		/*
		 * Switch to dev_id when dev_port returns the same value for
		 * all ports. May happen when using a MOFED release older than
		 * 3.0 with a Linux kernel >= 3.15.
		 */
		if (dev_port == dev_port_prev)
			goto try_dev_id;
		dev_port_prev = dev_port;
		if (dev_port == (priv->port - 1u))
			snprintf(match, sizeof(match), "%s", name);
	}
	closedir(dir);
	if (match[0] == '\0')
		return -1;
	/* "match" is NUL-terminated and no longer than *ifname. */
	strncpy(*ifname, match, sizeof(*ifname));
	return 0;
}

/**
 * Check if the counter is located on ib counters file.
 *
 * @param[in] cntr
 *   Counter name.
 *
 * @return
 *   1 if counter is located on ib counters file, 0 otherwise.
 */
int
priv_is_ib_cntr(const char *cntr)
{
	if (!strcmp(cntr, "out_of_buffer"))
		return 1;
	return 0;
}

/**
 * Read from sysfs entry.
 *
 * @param[in] priv
 *   Pointer to private structure.
 * @param[in] entry
 *   Entry name relative to sysfs path.
 * @param[out] buf
 *   Data output buffer.
 * @param size
 *   Buffer size.
 *
 * @return
 *   The requested size on success, -1 on failure and errno is set.
 */
static int
priv_sysfs_read(const struct priv *priv, const char *entry,
		char *buf, size_t size)
{
	char ifname[IF_NAMESIZE];
	FILE *file;
	int ret;
	int err;

	if (priv_get_ifname(priv, &ifname))
		return -1;

	/* IB counters live under the device's hw_counters directory,
	 * everything else under the netdev's sysfs entry. */
	if (priv_is_ib_cntr(entry)) {
		MKSTR(path, "%s/ports/1/hw_counters/%s",
		      priv->ctx->device->ibdev_path, entry);
		file = fopen(path, "rb");
	} else {
		MKSTR(path, "%s/device/net/%s/%s",
		      priv->ctx->device->ibdev_path, ifname, entry);
		file = fopen(path, "rb");
	}
	if (file == NULL)
		return -1;
	ret = fread(buf, 1, size, file);
	/* Save errno: fclose() below may overwrite it. */
	err = errno;
	if (((size_t)ret < size) && (ferror(file)))
		ret = -1;
	else
		ret = size;
	fclose(file);
	errno = err;
	return ret;
}

/**
 * Write to sysfs entry.
 *
 * @param[in] priv
 *   Pointer to private structure.
 * @param[in] entry
 *   Entry name relative to sysfs path.
 * @param[in] buf
 *   Data buffer.
 * @param size
 *   Buffer size.
 *
 * @return
 *   The requested size on success, -1 on failure and errno is set.
 */
static int
priv_sysfs_write(const struct priv *priv, const char *entry,
		 char *buf, size_t size)
{
	char ifname[IF_NAMESIZE];
	FILE *file;
	int ret;
	int err;

	if (priv_get_ifname(priv, &ifname))
		return -1;

	MKSTR(path, "%s/device/net/%s/%s", priv->ctx->device->ibdev_path,
	      ifname, entry);

	file = fopen(path, "wb");
	if (file == NULL)
		return -1;
	ret = fwrite(buf, 1, size, file);
	/* Save errno: fclose() below may overwrite it. */
	err = errno;
	if (((size_t)ret < size) || (ferror(file)))
		ret = -1;
	else
		ret = size;
	fclose(file);
	errno = err;
	return ret;
}

/**
 * Get unsigned long sysfs property.
 *
 * @param priv
 *   Pointer to private structure.
 * @param[in] name
 *   Entry name relative to sysfs path.
 * @param[out] value
 *   Value output buffer.
 *
 * @return
 *   0 on success, -1 on failure and errno is set.
 */
static int
priv_get_sysfs_ulong(struct priv *priv, const char *name, unsigned long *value)
{
	int ret;
	unsigned long value_ret;
	char value_str[32];

	/* Leave one byte for the terminating NUL added below. */
	ret = priv_sysfs_read(priv, name, value_str, (sizeof(value_str) - 1));
	if (ret == -1) {
		DEBUG("cannot read %s value from sysfs: %s",
		      name, strerror(errno));
		return -1;
	}
	value_str[ret] = '\0';
	/* Reset errno so strtoul() failures can be detected. */
	errno = 0;
	value_ret = strtoul(value_str, NULL, 0);
	if (errno) {
		DEBUG("invalid %s value `%s': %s", name, value_str,
		      strerror(errno));
		return -1;
	}
	*value = value_ret;
	return 0;
}

/**
 * Set unsigned long sysfs property.
 *
 * @param priv
 *   Pointer to private structure.
 * @param[in] name
 *   Entry name relative to sysfs path.
 * @param value
 *   Value to set.
 *
 * @return
 *   0 on success, -1 on failure and errno is set.
 */
static int
priv_set_sysfs_ulong(struct priv *priv, const char *name, unsigned long value)
{
	int ret;
	MKSTR(value_str, "%lu", value);

	ret = priv_sysfs_write(priv, name, value_str, (sizeof(value_str) - 1));
	if (ret == -1) {
		DEBUG("cannot write %s `%s' (%lu) to sysfs: %s",
		      name, value_str, value, strerror(errno));
		return -1;
	}
	return 0;
}

/**
 * Perform ifreq ioctl() on associated Ethernet device.
 *
 * @param[in] priv
 *   Pointer to private structure.
 * @param req
 *   Request number to pass to ioctl().
 * @param[out] ifr
 *   Interface request structure output buffer.
 *
 * @return
 *   0 on success, -1 on failure and errno is set.
 */
int
priv_ifreq(const struct priv *priv, int req, struct ifreq *ifr)
{
	/* A throwaway datagram socket is enough to issue netdev ioctls. */
	int sock = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);
	int ret = -1;

	if (sock == -1)
		return ret;
	if (priv_get_ifname(priv, &ifr->ifr_name) == 0)
		ret = ioctl(sock, req, ifr);
	close(sock);
	return ret;
}

/**
 * Return the number of active VFs for the current device.
 *
 * @param[in] priv
 *   Pointer to private structure.
 * @param[out] num_vfs
 *   Number of active VFs.
 *
 * @return
 *   0 on success, -1 on failure and errno is set.
 */
int
priv_get_num_vfs(struct priv *priv, uint16_t *num_vfs)
{
	/* The sysfs entry name depends on the operating system. */
	const char **name = (const char *[]){
		"device/sriov_numvfs",
		"device/mlx5_num_vfs",
		NULL,
	};
	int ret;

	/* Try each candidate entry until one of them can be read. */
	do {
		unsigned long ulong_num_vfs;

		ret = priv_get_sysfs_ulong(priv, *name, &ulong_num_vfs);
		if (!ret)
			*num_vfs = ulong_num_vfs;
	} while (*(++name) && ret);
	return ret;
}

/**
 * Get device MTU.
 *
 * @param priv
 *   Pointer to private structure.
 * @param[out] mtu
 *   MTU value output buffer.
 *
 * @return
 *   0 on success, -1 on failure and errno is set.
 */
int
priv_get_mtu(struct priv *priv, uint16_t *mtu)
{
	unsigned long ulong_mtu;

	if (priv_get_sysfs_ulong(priv, "mtu", &ulong_mtu) == -1)
		return -1;
	*mtu = ulong_mtu;
	return 0;
}

/**
 * Read device counter from sysfs.
 *
 * @param priv
 *   Pointer to private structure.
 * @param name
 *   Counter name.
 * @param[out] cntr
 *   Counter output buffer.
 *
 * @return
 *   0 on success, -1 on failure and errno is set.
 */
int
priv_get_cntr_sysfs(struct priv *priv, const char *name, uint64_t *cntr)
{
	unsigned long ulong_ctr;

	if (priv_get_sysfs_ulong(priv, name, &ulong_ctr) == -1)
		return -1;
	*cntr = ulong_ctr;
	return 0;
}

/**
 * Set device MTU.
 *
 * @param priv
 *   Pointer to private structure.
 * @param mtu
 *   MTU value to set.
 *
 * @return
 *   0 on success, -1 on failure and errno is set.
 */
static int
priv_set_mtu(struct priv *priv, uint16_t mtu)
{
	uint16_t new_mtu;

	if (priv_set_sysfs_ulong(priv, "mtu", mtu) ||
	    priv_get_mtu(priv, &new_mtu))
		return -1;
	/* Read the value back to make sure the kernel accepted it. */
	if (new_mtu == mtu)
		return 0;
	errno = EINVAL;
	return -1;
}

/**
 * Set device flags.
 *
 * @param priv
 *   Pointer to private structure.
 * @param keep
 *   Bitmask for flags that must remain untouched.
 * @param flags
 *   Bitmask for flags to modify.
 *
 * @return
 *   0 on success, -1 on failure and errno is set.
 */
int
priv_set_flags(struct priv *priv, unsigned int keep, unsigned int flags)
{
	unsigned long tmp;

	if (priv_get_sysfs_ulong(priv, "flags", &tmp) == -1)
		return -1;
	/* Preserve the "keep" bits, replace everything else with "flags". */
	tmp &= keep;
	tmp |= (flags & (~keep));
	return priv_set_sysfs_ulong(priv, "flags", tmp);
}

/**
 * Ethernet device configuration.
 *
 * Prepare the driver for a given number of TX and RX queues.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 *
 * @return
 *   0 on success, errno value on failure.
 */
static int
dev_configure(struct rte_eth_dev *dev)
{
	struct priv *priv = dev->data->dev_private;
	unsigned int rxqs_n = dev->data->nb_rx_queues;
	unsigned int txqs_n = dev->data->nb_tx_queues;
	unsigned int i;
	unsigned int j;
	unsigned int reta_idx_n;

	priv->rss_hf = dev->data->dev_conf.rx_adv_conf.rss_conf.rss_hf;
	priv->rxqs = (void *)dev->data->rx_queues;
	priv->txqs = (void *)dev->data->tx_queues;
	if (txqs_n != priv->txqs_n) {
		INFO("%p: TX queues number update: %u -> %u",
		     (void *)dev, priv->txqs_n, txqs_n);
		priv->txqs_n = txqs_n;
	}
	/* RX queues are bounded by the indirection table size. */
	if (rxqs_n > priv->ind_table_max_size) {
		ERROR("cannot handle this many RX queues (%u)", rxqs_n);
		return EINVAL;
	}
	if (rxqs_n == priv->rxqs_n)
		return 0;
	INFO("%p: RX queues number update: %u -> %u",
	     (void *)dev, priv->rxqs_n, rxqs_n);
	priv->rxqs_n = rxqs_n;
	/* If the requested number of RX queues is not a power of two, use the
	 * maximum indirection table size for better balancing.
	 * The result is always rounded to the next power of two. */
	reta_idx_n = (1 << log2above((rxqs_n & (rxqs_n - 1)) ?
				     priv->ind_table_max_size :
				     rxqs_n));
	if (priv_rss_reta_index_resize(priv, reta_idx_n))
		return ENOMEM;
	/* When the number of RX queues is not a power of two, the remaining
	 * table entries are padded with reused WQs and hashes are not spread
	 * uniformly. */
	for (i = 0, j = 0; (i != reta_idx_n); ++i) {
		(*priv->reta_idx)[i] = j;
		if (++j == rxqs_n)
			j = 0;
	}
	return 0;
}

/**
 * DPDK callback for Ethernet device configuration.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 *
 * @return
 *   0 on success, negative errno value on failure.
 */
int
mlx5_dev_configure(struct rte_eth_dev *dev)
{
	struct priv *priv = dev->data->dev_private;
	int ret;

	/* Configuration is only allowed from the primary process. */
	if (mlx5_is_secondary())
		return -E_RTE_SECONDARY;

	priv_lock(priv);
	ret = dev_configure(dev);
	assert(ret >= 0);
	priv_unlock(priv);
	/* dev_configure() returns a positive errno value; negate it. */
	return -ret;
}

/**
 * DPDK callback to get information about the device.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param[out] info
 *   Info structure output buffer.
 */
void
mlx5_dev_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *info)
{
	struct priv *priv = mlx5_get_priv(dev);
	unsigned int max;
	char ifname[IF_NAMESIZE];

	info->pci_dev = RTE_ETH_DEV_TO_PCI(dev);

	priv_lock(priv);
	/* FIXME: we should ask the device for these values. */
	info->min_rx_bufsize = 32;
	info->max_rx_pktlen = 65536;
	/*
	 * Since we need one CQ per QP, the limit is the minimum number
	 * between the two values.
	 */
	max = ((priv->device_attr.max_cq > priv->device_attr.max_qp) ?
	       priv->device_attr.max_qp : priv->device_attr.max_cq);
	/* If max >= 65535 then max = 0, max_rx_queues is uint16_t. */
	if (max >= 65535)
		max = 65535;
	info->max_rx_queues = max;
	info->max_tx_queues = max;
	info->max_mac_addrs = RTE_DIM(priv->mac);
	info->rx_offload_capa =
		(priv->hw_csum ?
		 (DEV_RX_OFFLOAD_IPV4_CKSUM |
		  DEV_RX_OFFLOAD_UDP_CKSUM |
		  DEV_RX_OFFLOAD_TCP_CKSUM) :
		 0) |
		(priv->hw_vlan_strip ?
		 DEV_RX_OFFLOAD_VLAN_STRIP : 0);
	if (!priv->mps)
		info->tx_offload_capa = DEV_TX_OFFLOAD_VLAN_INSERT;
	if (priv->hw_csum)
		info->tx_offload_capa |=
			(DEV_TX_OFFLOAD_IPV4_CKSUM |
			 DEV_TX_OFFLOAD_UDP_CKSUM |
			 DEV_TX_OFFLOAD_TCP_CKSUM);
	if (priv->tso)
		info->tx_offload_capa |= DEV_TX_OFFLOAD_TCP_TSO;
	if (priv->tunnel_en)
		info->tx_offload_capa |= (DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM |
					  DEV_TX_OFFLOAD_VXLAN_TNL_TSO |
					  DEV_TX_OFFLOAD_GRE_TNL_TSO);
	if (priv_get_ifname(priv, &ifname) == 0)
		info->if_index = if_nametoindex(ifname);
	/* Before configuration, fall back to the full table size. */
	info->reta_size = priv->reta_idx_n ?
		priv->reta_idx_n : priv->ind_table_max_size;
	info->hash_key_size = ((*priv->rss_conf) ?
			       (*priv->rss_conf)[0]->rss_key_len :
			       0);
	info->speed_capa = priv->link_speed_capa;
	priv_unlock(priv);
}

/**
 * DPDK callback to get the list of supported packet types.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 *
 * @return
 *   Static packet type array when a known RX burst function is in use,
 *   NULL otherwise.
 */
const uint32_t *
mlx5_dev_supported_ptypes_get(struct rte_eth_dev *dev)
{
	static const uint32_t ptypes[] = {
		/* refers to rxq_cq_to_pkt_type() */
		RTE_PTYPE_L2_ETHER,
		RTE_PTYPE_L3_IPV4_EXT_UNKNOWN,
		RTE_PTYPE_L3_IPV6_EXT_UNKNOWN,
		RTE_PTYPE_L4_NONFRAG,
		RTE_PTYPE_L4_FRAG,
		RTE_PTYPE_L4_TCP,
		RTE_PTYPE_L4_UDP,
		RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN,
		RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN,
		RTE_PTYPE_INNER_L4_NONFRAG,
		RTE_PTYPE_INNER_L4_FRAG,
		RTE_PTYPE_INNER_L4_TCP,
		RTE_PTYPE_INNER_L4_UDP,
		RTE_PTYPE_UNKNOWN
	};

	if (dev->rx_pkt_burst == mlx5_rx_burst ||
	    dev->rx_pkt_burst == mlx5_rx_burst_vec)
		return ptypes;
	return NULL;
}

/**
 * Retrieve physical link information (unlocked version using the legacy
 * ETHTOOL_GSET ioctl).
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param wait_to_complete
 *   Wait for request completion (ignored).
 *
 * @return
 *   0 if the link status changed, -1 otherwise.
 */
static int
mlx5_link_update_unlocked_gset(struct rte_eth_dev *dev, int wait_to_complete)
{
	struct priv *priv = mlx5_get_priv(dev);
	struct ethtool_cmd edata = {
		.cmd = ETHTOOL_GSET /* Deprecated since Linux v4.5. */
	};
	struct ifreq ifr;
	struct rte_eth_link dev_link;
	int link_speed = 0;

	/* priv_lock() is not taken to allow concurrent calls. */

	(void)wait_to_complete;
	if (priv_ifreq(priv, SIOCGIFFLAGS, &ifr)) {
		WARN("ioctl(SIOCGIFFLAGS) failed: %s", strerror(errno));
		return -1;
	}
	memset(&dev_link, 0, sizeof(dev_link));
	/* Link is up only when the interface is both up and running. */
	dev_link.link_status = ((ifr.ifr_flags & IFF_UP) &&
				(ifr.ifr_flags & IFF_RUNNING));
	ifr.ifr_data = (void *)&edata;
	if (priv_ifreq(priv, SIOCETHTOOL, &ifr)) {
		WARN("ioctl(SIOCETHTOOL, ETHTOOL_GSET) failed: %s",
		     strerror(errno));
		return -1;
	}
	link_speed = ethtool_cmd_speed(&edata);
	if (link_speed == -1)
		dev_link.link_speed = 0;
	else
		dev_link.link_speed = link_speed;
	/* SUPPORTED_* constants are bitmasks; test them directly. */
	priv->link_speed_capa = 0;
	if (edata.supported & SUPPORTED_Autoneg)
		priv->link_speed_capa |= ETH_LINK_SPEED_AUTONEG;
	if (edata.supported & (SUPPORTED_1000baseT_Full |
			       SUPPORTED_1000baseKX_Full))
		priv->link_speed_capa |= ETH_LINK_SPEED_1G;
	if (edata.supported & SUPPORTED_10000baseKR_Full)
		priv->link_speed_capa |= ETH_LINK_SPEED_10G;
	if (edata.supported & (SUPPORTED_40000baseKR4_Full |
			       SUPPORTED_40000baseCR4_Full |
			       SUPPORTED_40000baseSR4_Full |
			       SUPPORTED_40000baseLR4_Full))
		priv->link_speed_capa |= ETH_LINK_SPEED_40G;
	dev_link.link_duplex = ((edata.duplex == DUPLEX_HALF) ?
				ETH_LINK_HALF_DUPLEX : ETH_LINK_FULL_DUPLEX);
	dev_link.link_autoneg = !(dev->data->dev_conf.link_speeds &
				  ETH_LINK_SPEED_FIXED);
	if (memcmp(&dev_link, &dev->data->dev_link, sizeof(dev_link))) {
		/* Link status changed. */
		dev->data->dev_link = dev_link;
		return 0;
	}
	/* Link status is still the same.
*/ 804 return -1; 805 } 806 807 /** 808 * Retrieve physical link information (unlocked version using new ioctl). 809 * 810 * @param dev 811 * Pointer to Ethernet device structure. 812 * @param wait_to_complete 813 * Wait for request completion (ignored). 814 */ 815 static int 816 mlx5_link_update_unlocked_gs(struct rte_eth_dev *dev, int wait_to_complete) 817 { 818 struct priv *priv = mlx5_get_priv(dev); 819 __extension__ struct { 820 struct ethtool_link_settings edata; 821 uint32_t link_mode_data[3 * 822 ETHTOOL_LINK_MODE_MASK_MAX_KERNEL_NU32]; 823 } ecmd; 824 825 struct ifreq ifr; 826 struct rte_eth_link dev_link; 827 uint64_t sc; 828 829 (void)wait_to_complete; 830 if (priv_ifreq(priv, SIOCGIFFLAGS, &ifr)) { 831 WARN("ioctl(SIOCGIFFLAGS) failed: %s", strerror(errno)); 832 return -1; 833 } 834 memset(&dev_link, 0, sizeof(dev_link)); 835 dev_link.link_status = ((ifr.ifr_flags & IFF_UP) && 836 (ifr.ifr_flags & IFF_RUNNING)); 837 memset(&ecmd, 0, sizeof(ecmd)); 838 ecmd.edata.cmd = ETHTOOL_GLINKSETTINGS; 839 ifr.ifr_data = (void *)&ecmd; 840 if (priv_ifreq(priv, SIOCETHTOOL, &ifr)) { 841 DEBUG("ioctl(SIOCETHTOOL, ETHTOOL_GLINKSETTINGS) failed: %s", 842 strerror(errno)); 843 return -1; 844 } 845 ecmd.edata.link_mode_masks_nwords = -ecmd.edata.link_mode_masks_nwords; 846 if (priv_ifreq(priv, SIOCETHTOOL, &ifr)) { 847 DEBUG("ioctl(SIOCETHTOOL, ETHTOOL_GLINKSETTINGS) failed: %s", 848 strerror(errno)); 849 return -1; 850 } 851 dev_link.link_speed = ecmd.edata.speed; 852 sc = ecmd.edata.link_mode_masks[0] | 853 ((uint64_t)ecmd.edata.link_mode_masks[1] << 32); 854 priv->link_speed_capa = 0; 855 if (sc & ETHTOOL_LINK_MODE_Autoneg_BIT) 856 priv->link_speed_capa |= ETH_LINK_SPEED_AUTONEG; 857 if (sc & (ETHTOOL_LINK_MODE_1000baseT_Full_BIT | 858 ETHTOOL_LINK_MODE_1000baseKX_Full_BIT)) 859 priv->link_speed_capa |= ETH_LINK_SPEED_1G; 860 if (sc & (ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT | 861 ETHTOOL_LINK_MODE_10000baseKR_Full_BIT | 862 ETHTOOL_LINK_MODE_10000baseR_FEC_BIT)) 863 
priv->link_speed_capa |= ETH_LINK_SPEED_10G; 864 if (sc & (ETHTOOL_LINK_MODE_20000baseMLD2_Full_BIT | 865 ETHTOOL_LINK_MODE_20000baseKR2_Full_BIT)) 866 priv->link_speed_capa |= ETH_LINK_SPEED_20G; 867 if (sc & (ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT | 868 ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT | 869 ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT | 870 ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT)) 871 priv->link_speed_capa |= ETH_LINK_SPEED_40G; 872 if (sc & (ETHTOOL_LINK_MODE_56000baseKR4_Full_BIT | 873 ETHTOOL_LINK_MODE_56000baseCR4_Full_BIT | 874 ETHTOOL_LINK_MODE_56000baseSR4_Full_BIT | 875 ETHTOOL_LINK_MODE_56000baseLR4_Full_BIT)) 876 priv->link_speed_capa |= ETH_LINK_SPEED_56G; 877 if (sc & (ETHTOOL_LINK_MODE_25000baseCR_Full_BIT | 878 ETHTOOL_LINK_MODE_25000baseKR_Full_BIT | 879 ETHTOOL_LINK_MODE_25000baseSR_Full_BIT)) 880 priv->link_speed_capa |= ETH_LINK_SPEED_25G; 881 if (sc & (ETHTOOL_LINK_MODE_50000baseCR2_Full_BIT | 882 ETHTOOL_LINK_MODE_50000baseKR2_Full_BIT)) 883 priv->link_speed_capa |= ETH_LINK_SPEED_50G; 884 if (sc & (ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT | 885 ETHTOOL_LINK_MODE_100000baseSR4_Full_BIT | 886 ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT | 887 ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT)) 888 priv->link_speed_capa |= ETH_LINK_SPEED_100G; 889 dev_link.link_duplex = ((ecmd.edata.duplex == DUPLEX_HALF) ? 890 ETH_LINK_HALF_DUPLEX : ETH_LINK_FULL_DUPLEX); 891 dev_link.link_autoneg = !(dev->data->dev_conf.link_speeds & 892 ETH_LINK_SPEED_FIXED); 893 if (memcmp(&dev_link, &dev->data->dev_link, sizeof(dev_link))) { 894 /* Link status changed. */ 895 dev->data->dev_link = dev_link; 896 return 0; 897 } 898 /* Link status is still the same. */ 899 return -1; 900 } 901 902 /** 903 * DPDK callback to retrieve physical link information. 904 * 905 * @param dev 906 * Pointer to Ethernet device structure. 907 * @param wait_to_complete 908 * Wait for request completion (ignored). 
 *
 * @return
 *   0 if the link status changed, -1 otherwise.
 */
int
mlx5_link_update(struct rte_eth_dev *dev, int wait_to_complete)
{
	struct utsname utsname;
	int ver[3];

	/* ETHTOOL_GLINKSETTINGS is only reliable on kernels >= 4.9;
	 * otherwise fall back to the deprecated ETHTOOL_GSET ioctl. */
	if (uname(&utsname) == -1 ||
	    sscanf(utsname.release, "%d.%d.%d",
		   &ver[0], &ver[1], &ver[2]) != 3 ||
	    KERNEL_VERSION(ver[0], ver[1], ver[2]) < KERNEL_VERSION(4, 9, 0))
		return mlx5_link_update_unlocked_gset(dev, wait_to_complete);
	return mlx5_link_update_unlocked_gs(dev, wait_to_complete);
}

/**
 * DPDK callback to change the MTU.
 *
 * Setting the MTU affects hardware MRU (packets larger than the MTU cannot be
 * received). Use this as a hint to enable/disable scattered packets support
 * and improve performance when not needed.
 * Since failure is not an option, reconfiguring queues on the fly is not
 * recommended.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param mtu
 *   New MTU.
 *
 * @return
 *   0 on success, negative errno value on failure.
 */
int
mlx5_dev_set_mtu(struct rte_eth_dev *dev, uint16_t mtu)
{
	struct priv *priv = dev->data->dev_private;
	int ret = 0;
	unsigned int i;
	unsigned int max_frame_len;
	int rehash;
	int restart = priv->started;

	if (mlx5_is_secondary())
		return -E_RTE_SECONDARY;

	priv_lock(priv);
	/* Set kernel interface MTU first. */
	if (priv_set_mtu(priv, mtu)) {
		ret = errno;
		WARN("cannot set port %u MTU to %u: %s", priv->port, mtu,
		     strerror(ret));
		goto out;
	} else
		DEBUG("adapter port %u MTU set to %u", priv->port, mtu);
	/* Temporarily replace RX handler with a fake one, assuming it has not
	 * been copied elsewhere. */
	dev->rx_pkt_burst = removed_rx_burst;
	/* Make sure everyone has left dev->rx_pkt_burst() and uses
	 * removed_rx_burst() instead. */
	rte_wmb();
	usleep(1000);
	/* MTU does not include header and CRC.
	 */
	max_frame_len = ETHER_HDR_LEN + mtu + ETHER_CRC_LEN;
	/* Check if at least one queue is going to need a SGE update. */
	for (i = 0; i != priv->rxqs_n; ++i) {
		struct rxq *rxq = (*priv->rxqs)[i];
		unsigned int mb_len;
		unsigned int size = RTE_PKTMBUF_HEADROOM + max_frame_len;
		unsigned int sges_n;

		if (rxq == NULL)
			continue;
		mb_len = rte_pktmbuf_data_room_size(rxq->mp);
		assert(mb_len >= RTE_PKTMBUF_HEADROOM);
		/*
		 * Determine the number of SGEs needed for a full packet
		 * and round it to the next power of two.
		 */
		sges_n = log2above((size / mb_len) + !!(size % mb_len));
		if (sges_n != rxq->sges_n)
			break;
	}
	/*
	 * If all queues have the right number of SGEs, a simple rehash
	 * of their buffers is enough, otherwise SGE information can only
	 * be updated in a queue by recreating it. All resources that depend
	 * on queues (flows, indirection tables) must be recreated as well in
	 * that case.
	 */
	rehash = (i == priv->rxqs_n);
	if (!rehash) {
		/* Clean up everything as with mlx5_dev_stop(). */
		priv_special_flow_disable_all(priv);
		priv_mac_addrs_disable(priv);
		priv_destroy_hash_rxqs(priv);
		priv_fdir_disable(priv);
		priv_dev_interrupt_handler_uninstall(priv, dev);
	}
recover:
	/* Reconfigure each RX queue. */
	for (i = 0; (i != priv->rxqs_n); ++i) {
		struct rxq *rxq = (*priv->rxqs)[i];
		struct rxq_ctrl *rxq_ctrl =
			container_of(rxq, struct rxq_ctrl, rxq);
		unsigned int mb_len;
		unsigned int tmp;

		if (rxq == NULL)
			continue;
		mb_len = rte_pktmbuf_data_room_size(rxq->mp);
		assert(mb_len >= RTE_PKTMBUF_HEADROOM);
		/* Provide new values to rxq_setup().
		 */
		dev->data->dev_conf.rxmode.jumbo_frame =
			(max_frame_len > ETHER_MAX_LEN);
		dev->data->dev_conf.rxmode.max_rx_pkt_len = max_frame_len;
		if (rehash)
			ret = rxq_rehash(dev, rxq_ctrl);
		else
			ret = rxq_ctrl_setup(dev, rxq_ctrl, 1 << rxq->elts_n,
					     rxq_ctrl->socket, NULL, rxq->mp);
		if (!ret)
			continue;
		/* Attempt to roll back in case of error. */
		tmp = (mb_len << rxq->sges_n) - RTE_PKTMBUF_HEADROOM;
		if (max_frame_len != tmp) {
			max_frame_len = tmp;
			goto recover;
		}
		/* Double fault, disable RX. */
		break;
	}
	/* Mimic mlx5_dev_start(). */
	if (ret) {
		ERROR("unable to reconfigure RX queues, RX disabled");
	} else if (restart &&
		   !rehash &&
		   !priv_create_hash_rxqs(priv) &&
		   !priv_rehash_flows(priv)) {
		if (dev->data->dev_conf.fdir_conf.mode == RTE_FDIR_MODE_NONE)
			priv_fdir_enable(priv);
		priv_dev_interrupt_handler_install(priv, dev);
	}
	priv->mtu = mtu;
	/* Burst functions can now be called again. */
	rte_wmb();
	/*
	 * Use a safe RX burst function in case of error, otherwise select RX
	 * burst function again.
	 */
	if (!ret)
		priv_select_rx_function(priv);
out:
	priv_unlock(priv);
	assert(ret >= 0);
	return -ret;
}

/**
 * DPDK callback to get flow control status.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param[out] fc_conf
 *   Flow control output buffer.
 *
 * @return
 *   0 on success, negative errno value on failure.
1076 */ 1077 int 1078 mlx5_dev_get_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf) 1079 { 1080 struct priv *priv = dev->data->dev_private; 1081 struct ifreq ifr; 1082 struct ethtool_pauseparam ethpause = { 1083 .cmd = ETHTOOL_GPAUSEPARAM 1084 }; 1085 int ret; 1086 1087 if (mlx5_is_secondary()) 1088 return -E_RTE_SECONDARY; 1089 1090 ifr.ifr_data = (void *)ðpause; 1091 priv_lock(priv); 1092 if (priv_ifreq(priv, SIOCETHTOOL, &ifr)) { 1093 ret = errno; 1094 WARN("ioctl(SIOCETHTOOL, ETHTOOL_GPAUSEPARAM)" 1095 " failed: %s", 1096 strerror(ret)); 1097 goto out; 1098 } 1099 1100 fc_conf->autoneg = ethpause.autoneg; 1101 if (ethpause.rx_pause && ethpause.tx_pause) 1102 fc_conf->mode = RTE_FC_FULL; 1103 else if (ethpause.rx_pause) 1104 fc_conf->mode = RTE_FC_RX_PAUSE; 1105 else if (ethpause.tx_pause) 1106 fc_conf->mode = RTE_FC_TX_PAUSE; 1107 else 1108 fc_conf->mode = RTE_FC_NONE; 1109 ret = 0; 1110 1111 out: 1112 priv_unlock(priv); 1113 assert(ret >= 0); 1114 return -ret; 1115 } 1116 1117 /** 1118 * DPDK callback to modify flow control parameters. 1119 * 1120 * @param dev 1121 * Pointer to Ethernet device structure. 1122 * @param[in] fc_conf 1123 * Flow control parameters. 1124 * 1125 * @return 1126 * 0 on success, negative errno value on failure. 
1127 */ 1128 int 1129 mlx5_dev_set_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf) 1130 { 1131 struct priv *priv = dev->data->dev_private; 1132 struct ifreq ifr; 1133 struct ethtool_pauseparam ethpause = { 1134 .cmd = ETHTOOL_SPAUSEPARAM 1135 }; 1136 int ret; 1137 1138 if (mlx5_is_secondary()) 1139 return -E_RTE_SECONDARY; 1140 1141 ifr.ifr_data = (void *)ðpause; 1142 ethpause.autoneg = fc_conf->autoneg; 1143 if (((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL) || 1144 (fc_conf->mode & RTE_FC_RX_PAUSE)) 1145 ethpause.rx_pause = 1; 1146 else 1147 ethpause.rx_pause = 0; 1148 1149 if (((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL) || 1150 (fc_conf->mode & RTE_FC_TX_PAUSE)) 1151 ethpause.tx_pause = 1; 1152 else 1153 ethpause.tx_pause = 0; 1154 1155 priv_lock(priv); 1156 if (priv_ifreq(priv, SIOCETHTOOL, &ifr)) { 1157 ret = errno; 1158 WARN("ioctl(SIOCETHTOOL, ETHTOOL_SPAUSEPARAM)" 1159 " failed: %s", 1160 strerror(ret)); 1161 goto out; 1162 } 1163 ret = 0; 1164 1165 out: 1166 priv_unlock(priv); 1167 assert(ret >= 0); 1168 return -ret; 1169 } 1170 1171 /** 1172 * Get PCI information from struct ibv_device. 1173 * 1174 * @param device 1175 * Pointer to Ethernet device structure. 1176 * @param[out] pci_addr 1177 * PCI bus address output buffer. 1178 * 1179 * @return 1180 * 0 on success, -1 on failure and errno is set. 1181 */ 1182 int 1183 mlx5_ibv_device_to_pci_addr(const struct ibv_device *device, 1184 struct rte_pci_addr *pci_addr) 1185 { 1186 FILE *file; 1187 char line[32]; 1188 MKSTR(path, "%s/device/uevent", device->ibdev_path); 1189 1190 file = fopen(path, "rb"); 1191 if (file == NULL) 1192 return -1; 1193 while (fgets(line, sizeof(line), file) == line) { 1194 size_t len = strlen(line); 1195 int ret; 1196 1197 /* Truncate long lines. */ 1198 if (len == (sizeof(line) - 1)) 1199 while (line[(len - 1)] != '\n') { 1200 ret = fgetc(file); 1201 if (ret == EOF) 1202 break; 1203 line[(len - 1)] = ret; 1204 } 1205 /* Extract information. 
*/ 1206 if (sscanf(line, 1207 "PCI_SLOT_NAME=" 1208 "%" SCNx32 ":%" SCNx8 ":%" SCNx8 ".%" SCNx8 "\n", 1209 &pci_addr->domain, 1210 &pci_addr->bus, 1211 &pci_addr->devid, 1212 &pci_addr->function) == 4) { 1213 ret = 0; 1214 break; 1215 } 1216 } 1217 fclose(file); 1218 return 0; 1219 } 1220 1221 /** 1222 * Link status handler. 1223 * 1224 * @param priv 1225 * Pointer to private structure. 1226 * @param dev 1227 * Pointer to the rte_eth_dev structure. 1228 * 1229 * @return 1230 * Nonzero if the callback process can be called immediately. 1231 */ 1232 static int 1233 priv_dev_link_status_handler(struct priv *priv, struct rte_eth_dev *dev) 1234 { 1235 struct ibv_async_event event; 1236 struct rte_eth_link *link = &dev->data->dev_link; 1237 int ret = 0; 1238 1239 /* Read all message and acknowledge them. */ 1240 for (;;) { 1241 if (ibv_get_async_event(priv->ctx, &event)) 1242 break; 1243 1244 if (event.event_type != IBV_EVENT_PORT_ACTIVE && 1245 event.event_type != IBV_EVENT_PORT_ERR) 1246 DEBUG("event type %d on port %d not handled", 1247 event.event_type, event.element.port_num); 1248 ibv_ack_async_event(&event); 1249 } 1250 mlx5_link_update(dev, 0); 1251 if (((link->link_speed == 0) && link->link_status) || 1252 ((link->link_speed != 0) && !link->link_status)) { 1253 if (!priv->pending_alarm) { 1254 /* Inconsistent status, check again later. */ 1255 priv->pending_alarm = 1; 1256 rte_eal_alarm_set(MLX5_ALARM_TIMEOUT_US, 1257 mlx5_dev_link_status_handler, 1258 dev); 1259 } 1260 } else { 1261 ret = 1; 1262 } 1263 return ret; 1264 } 1265 1266 /** 1267 * Handle delayed link status event. 1268 * 1269 * @param arg 1270 * Registered argument. 
 */
void
mlx5_dev_link_status_handler(void *arg)
{
	struct rte_eth_dev *dev = arg;
	struct priv *priv = dev->data->dev_private;
	int ret;

	priv_lock(priv);
	/* This callback only runs when the alarm armed by
	 * priv_dev_link_status_handler() fires; clear the flag so a new
	 * delayed check can be scheduled if the state is still unstable. */
	assert(priv->pending_alarm == 1);
	priv->pending_alarm = 0;
	ret = priv_dev_link_status_handler(priv, dev);
	priv_unlock(priv);
	/* Notify the application only once the link state is consistent. */
	if (ret)
		_rte_eth_dev_callback_process(dev, RTE_ETH_EVENT_INTR_LSC, NULL,
					      NULL);
}

/**
 * Handle interrupts from the NIC.
 *
 * Registered with the EAL interrupt framework on the device async fd.
 *
 * @param cb_arg
 *   Callback argument, the rte_eth_dev pointer.
 */
void
mlx5_dev_interrupt_handler(void *cb_arg)
{
	struct rte_eth_dev *dev = cb_arg;
	struct priv *priv = dev->data->dev_private;
	int ret;

	priv_lock(priv);
	ret = priv_dev_link_status_handler(priv, dev);
	priv_unlock(priv);
	/* Forward the LSC event unless a delayed re-check was scheduled. */
	if (ret)
		_rte_eth_dev_callback_process(dev, RTE_ETH_EVENT_INTR_LSC, NULL,
					      NULL);
}

/**
 * Uninstall interrupt handler.
 *
 * Unregisters the async event callback, cancels any pending delayed link
 * status alarm and resets the interrupt handle.
 *
 * @param priv
 *   Pointer to private structure.
 * @param dev
 *   Pointer to the rte_eth_dev structure.
 */
void
priv_dev_interrupt_handler_uninstall(struct priv *priv, struct rte_eth_dev *dev)
{
	/* Nothing was installed unless LSC interrupts were requested. */
	if (!dev->data->dev_conf.intr_conf.lsc)
		return;
	rte_intr_callback_unregister(&priv->intr_handle,
				     mlx5_dev_interrupt_handler,
				     dev);
	/* Cancel any pending delayed link status check. */
	if (priv->pending_alarm)
		rte_eal_alarm_cancel(mlx5_dev_link_status_handler, dev);
	priv->pending_alarm = 0;
	priv->intr_handle.fd = 0;
	priv->intr_handle.type = RTE_INTR_HANDLE_UNKNOWN;
}

/**
 * Install interrupt handler.
 *
 * @param priv
 *   Pointer to private structure.
 * @param dev
 *   Pointer to the rte_eth_dev structure.
1342 */ 1343 void 1344 priv_dev_interrupt_handler_install(struct priv *priv, struct rte_eth_dev *dev) 1345 { 1346 int rc, flags; 1347 1348 if (!dev->data->dev_conf.intr_conf.lsc) 1349 return; 1350 assert(priv->ctx->async_fd > 0); 1351 flags = fcntl(priv->ctx->async_fd, F_GETFL); 1352 rc = fcntl(priv->ctx->async_fd, F_SETFL, flags | O_NONBLOCK); 1353 if (rc < 0) { 1354 INFO("failed to change file descriptor async event queue"); 1355 dev->data->dev_conf.intr_conf.lsc = 0; 1356 } else { 1357 priv->intr_handle.fd = priv->ctx->async_fd; 1358 priv->intr_handle.type = RTE_INTR_HANDLE_EXT; 1359 rte_intr_callback_register(&priv->intr_handle, 1360 mlx5_dev_interrupt_handler, 1361 dev); 1362 } 1363 } 1364 1365 /** 1366 * Change the link state (UP / DOWN). 1367 * 1368 * @param priv 1369 * Pointer to Ethernet device structure. 1370 * @param up 1371 * Nonzero for link up, otherwise link down. 1372 * 1373 * @return 1374 * 0 on success, errno value on failure. 1375 */ 1376 static int 1377 priv_set_link(struct priv *priv, int up) 1378 { 1379 struct rte_eth_dev *dev = priv->dev; 1380 int err; 1381 1382 if (up) { 1383 err = priv_set_flags(priv, ~IFF_UP, IFF_UP); 1384 if (err) 1385 return err; 1386 priv_select_tx_function(priv); 1387 priv_select_rx_function(priv); 1388 } else { 1389 err = priv_set_flags(priv, ~IFF_UP, ~IFF_UP); 1390 if (err) 1391 return err; 1392 dev->rx_pkt_burst = removed_rx_burst; 1393 dev->tx_pkt_burst = removed_tx_burst; 1394 } 1395 return 0; 1396 } 1397 1398 /** 1399 * DPDK callback to bring the link DOWN. 1400 * 1401 * @param dev 1402 * Pointer to Ethernet device structure. 1403 * 1404 * @return 1405 * 0 on success, errno value on failure. 1406 */ 1407 int 1408 mlx5_set_link_down(struct rte_eth_dev *dev) 1409 { 1410 struct priv *priv = dev->data->dev_private; 1411 int err; 1412 1413 priv_lock(priv); 1414 err = priv_set_link(priv, 0); 1415 priv_unlock(priv); 1416 return err; 1417 } 1418 1419 /** 1420 * DPDK callback to bring the link UP. 
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 *
 * @return
 *   0 on success, errno value on failure.
 */
int
mlx5_set_link_up(struct rte_eth_dev *dev)
{
	struct priv *priv = dev->data->dev_private;
	int err;

	priv_lock(priv);
	err = priv_set_link(priv, 1);
	priv_unlock(priv);
	return err;
}

/**
 * Configure secondary process queues from a private data pointer (primary
 * or secondary) and update burst callbacks. Can take place only once.
 *
 * All queues must have been previously created by the primary process to
 * avoid undefined behavior.
 *
 * @param priv
 *   Private data pointer from either primary or secondary process.
 *
 * @return
 *   Private data pointer from secondary process, NULL in case of error.
 */
struct priv *
mlx5_secondary_data_setup(struct priv *priv)
{
	unsigned int port_id = 0;
	struct mlx5_secondary_data *sd;
	void **tx_queues;
	void **rx_queues;
	unsigned int nb_tx_queues;
	unsigned int nb_rx_queues;
	unsigned int i;

	/* priv must be valid at this point. */
	assert(priv != NULL);
	/* priv->dev must also be valid but may point to local memory from
	 * another process, possibly with the same address and must not
	 * be dereferenced yet. */
	assert(priv->dev != NULL);
	/* Determine port ID by finding out where priv comes from.
	 * Scans the per-port table (wrapping around) until an entry whose
	 * primary or secondary private pointer matches is found; that
	 * entry's spinlock is kept held until "end"/"error" below. */
	while (1) {
		sd = &mlx5_secondary_data[port_id];
		rte_spinlock_lock(&sd->lock);
		/* Primary process? */
		if (sd->primary_priv == priv)
			break;
		/* Secondary process? */
		if (sd->data.dev_private == priv)
			break;
		rte_spinlock_unlock(&sd->lock);
		if (++port_id == RTE_DIM(mlx5_secondary_data))
			port_id = 0;
	}
	/* Switch to secondary private structure. If private data has already
	 * been updated by another thread, there is nothing else to do. */
	priv = sd->data.dev_private;
	if (priv->dev->data == &sd->data)
		goto end;
	/* Sanity checks. Secondary private structure is supposed to point
	 * to local eth_dev, itself still pointing to the shared device data
	 * structure allocated by the primary process. */
	assert(sd->shared_dev_data != &sd->data);
	assert(sd->data.nb_tx_queues == 0);
	assert(sd->data.tx_queues == NULL);
	assert(sd->data.nb_rx_queues == 0);
	assert(sd->data.rx_queues == NULL);
	assert(priv != sd->primary_priv);
	assert(priv->dev->data == sd->shared_dev_data);
	assert(priv->txqs_n == 0);
	assert(priv->txqs == NULL);
	assert(priv->rxqs_n == 0);
	assert(priv->rxqs == NULL);
	nb_tx_queues = sd->shared_dev_data->nb_tx_queues;
	nb_rx_queues = sd->shared_dev_data->nb_rx_queues;
	/* Allocate local storage for queues. */
	tx_queues = rte_zmalloc("secondary ethdev->tx_queues",
				sizeof(sd->data.tx_queues[0]) * nb_tx_queues,
				RTE_CACHE_LINE_SIZE);
	rx_queues = rte_zmalloc("secondary ethdev->rx_queues",
				sizeof(sd->data.rx_queues[0]) * nb_rx_queues,
				RTE_CACHE_LINE_SIZE);
	/* NOTE(review): this failure path jumps to "error" below, which
	 * calls priv_unlock() before priv_lock() has been taken here —
	 * verify this is intended/safe. */
	if (tx_queues == NULL || rx_queues == NULL)
		goto error;
	/* Lock to prevent control operations during setup. */
	priv_lock(priv);
	/* TX queues. Mirror each primary TX queue with a locally-allocated
	 * one; on any setup failure, tear down the queues built so far. */
	for (i = 0; i != nb_tx_queues; ++i) {
		struct txq *primary_txq = (*sd->primary_priv->txqs)[i];
		struct txq_ctrl *primary_txq_ctrl;
		struct txq_ctrl *txq_ctrl;

		if (primary_txq == NULL)
			continue;
		primary_txq_ctrl = container_of(primary_txq,
						struct txq_ctrl, txq);
		txq_ctrl = rte_calloc_socket("TXQ", 1, sizeof(*txq_ctrl) +
					     (1 << primary_txq->elts_n) *
					     sizeof(struct rte_mbuf *), 0,
					     primary_txq_ctrl->socket);
		if (txq_ctrl != NULL) {
			if (txq_ctrl_setup(priv->dev,
					   txq_ctrl,
					   1 << primary_txq->elts_n,
					   primary_txq_ctrl->socket,
					   NULL) == 0) {
				txq_ctrl->txq.stats.idx =
					primary_txq->stats.idx;
				tx_queues[i] = &txq_ctrl->txq;
				continue;
			}
			rte_free(txq_ctrl);
		}
		/* Roll back queues already configured in this loop. */
		while (i) {
			txq_ctrl = tx_queues[--i];
			txq_cleanup(txq_ctrl);
			rte_free(txq_ctrl);
		}
		goto error;
	}
	/* RX queues. */
	for (i = 0; i != nb_rx_queues; ++i) {
		struct rxq_ctrl *primary_rxq =
			container_of((*sd->primary_priv->rxqs)[i],
				     struct rxq_ctrl, rxq);

		/* NOTE(review): container_of() of a NULL element yields a
		 * non-NULL pointer (NULL minus the member offset), so this
		 * check can never trigger; harmless here since the loop
		 * only stores NULL, but verify the intent. */
		if (primary_rxq == NULL)
			continue;
		/* Not supported yet. */
		rx_queues[i] = NULL;
	}
	/* Update everything. */
	priv->txqs = (void *)tx_queues;
	priv->txqs_n = nb_tx_queues;
	priv->rxqs = (void *)rx_queues;
	priv->rxqs_n = nb_rx_queues;
	sd->data.rx_queues = rx_queues;
	sd->data.tx_queues = tx_queues;
	sd->data.nb_rx_queues = nb_rx_queues;
	sd->data.nb_tx_queues = nb_tx_queues;
	sd->data.dev_link = sd->shared_dev_data->dev_link;
	sd->data.mtu = sd->shared_dev_data->mtu;
	memcpy(sd->data.rx_queue_state, sd->shared_dev_data->rx_queue_state,
	       sizeof(sd->data.rx_queue_state));
	memcpy(sd->data.tx_queue_state, sd->shared_dev_data->tx_queue_state,
	       sizeof(sd->data.tx_queue_state));
	sd->data.dev_flags = sd->shared_dev_data->dev_flags;
	/* Use local data from now on. The barriers order the data-pointer
	 * switch with respect to the stores above and the burst-function
	 * selection below; do not reorder these statements. */
	rte_mb();
	priv->dev->data = &sd->data;
	rte_mb();
	priv_select_tx_function(priv);
	priv_select_rx_function(priv);
	priv_unlock(priv);
end:
	/* More sanity checks. */
	assert(priv->dev->data == &sd->data);
	rte_spinlock_unlock(&sd->lock);
	return priv;
error:
	priv_unlock(priv);
	rte_free(tx_queues);
	rte_free(rx_queues);
	rte_spinlock_unlock(&sd->lock);
	return NULL;
}

/**
 * Configure the TX function to use.
 *
 * Defaults to mlx5_tx_burst and upgrades to a multi-packet-write (MPW)
 * variant depending on device capabilities and configuration.
 *
 * @param priv
 *   Pointer to private structure.
 */
void
priv_select_tx_function(struct priv *priv)
{
	priv->dev->tx_pkt_burst = mlx5_tx_burst;
	/* Select appropriate TX function. */
	if (priv->mps == MLX5_MPW_ENHANCED) {
		if (priv_check_vec_tx_support(priv) > 0) {
			if (priv_check_raw_vec_tx_support(priv) > 0)
				priv->dev->tx_pkt_burst = mlx5_tx_burst_raw_vec;
			else
				priv->dev->tx_pkt_burst = mlx5_tx_burst_vec;
			DEBUG("selected Enhanced MPW TX vectorized function");
		} else {
			priv->dev->tx_pkt_burst = mlx5_tx_burst_empw;
			DEBUG("selected Enhanced MPW TX function");
		}
	} else if (priv->mps && priv->txq_inline) {
		priv->dev->tx_pkt_burst = mlx5_tx_burst_mpw_inline;
		DEBUG("selected MPW inline TX function");
	} else if (priv->mps) {
		priv->dev->tx_pkt_burst = mlx5_tx_burst_mpw;
		DEBUG("selected MPW TX function");
	}
}

/**
 * Configure the RX function to use.
 *
 * Uses the vectorized RX burst when supported, plain mlx5_rx_burst
 * otherwise.
 *
 * @param priv
 *   Pointer to private structure.
 */
void
priv_select_rx_function(struct priv *priv)
{
	if (priv_check_vec_rx_support(priv) > 0) {
		priv_prep_vec_rx_function(priv);
		priv->dev->rx_pkt_burst = mlx5_rx_burst_vec;
		DEBUG("selected RX vectorized function");
	} else {
		priv->dev->rx_pkt_burst = mlx5_rx_burst;
	}
}