1 /*- 2 * BSD LICENSE 3 * 4 * Copyright 2015 6WIND S.A. 5 * Copyright 2015 Mellanox. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * * Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * * Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * * Neither the name of 6WIND S.A. nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 */ 33 34 #include <stddef.h> 35 #include <assert.h> 36 #include <unistd.h> 37 #include <stdint.h> 38 #include <stdio.h> 39 #include <string.h> 40 #include <stdlib.h> 41 #include <errno.h> 42 #include <dirent.h> 43 #include <net/if.h> 44 #include <sys/ioctl.h> 45 #include <sys/socket.h> 46 #include <netinet/in.h> 47 #include <linux/if.h> 48 #include <linux/ethtool.h> 49 #include <linux/sockios.h> 50 #include <fcntl.h> 51 52 /* DPDK headers don't like -pedantic. */ 53 #ifdef PEDANTIC 54 #pragma GCC diagnostic ignored "-pedantic" 55 #endif 56 #include <rte_atomic.h> 57 #include <rte_ethdev.h> 58 #include <rte_mbuf.h> 59 #include <rte_common.h> 60 #include <rte_interrupts.h> 61 #include <rte_alarm.h> 62 #include <rte_malloc.h> 63 #ifdef PEDANTIC 64 #pragma GCC diagnostic error "-pedantic" 65 #endif 66 67 #include "mlx5.h" 68 #include "mlx5_rxtx.h" 69 #include "mlx5_utils.h" 70 71 /** 72 * Return private structure associated with an Ethernet device. 73 * 74 * @param dev 75 * Pointer to Ethernet device structure. 76 * 77 * @return 78 * Pointer to private structure. 79 */ 80 struct priv * 81 mlx5_get_priv(struct rte_eth_dev *dev) 82 { 83 struct mlx5_secondary_data *sd; 84 85 if (!mlx5_is_secondary()) 86 return dev->data->dev_private; 87 sd = &mlx5_secondary_data[dev->data->port_id]; 88 return sd->data.dev_private; 89 } 90 91 /** 92 * Check if running as a secondary process. 93 * 94 * @return 95 * Nonzero if running as a secondary process. 96 */ 97 inline int 98 mlx5_is_secondary(void) 99 { 100 return rte_eal_process_type() != RTE_PROC_PRIMARY; 101 } 102 103 /** 104 * Get interface name from private structure. 105 * 106 * @param[in] priv 107 * Pointer to private structure. 108 * @param[out] ifname 109 * Interface name output buffer. 110 * 111 * @return 112 * 0 on success, -1 on failure and errno is set. 113 */ 114 int 115 priv_get_ifname(const struct priv *priv, char (*ifname)[IF_NAMESIZE]) 116 { 117 DIR *dir; 118 struct dirent *dent; 119 unsigned int dev_type = 0; 120 unsigned int dev_port_prev = ~0u; 121 char match[IF_NAMESIZE] = ""; 122 123 { 124 MKSTR(path, "%s/device/net", priv->ctx->device->ibdev_path); 125 126 dir = opendir(path); 127 if (dir == NULL) 128 return -1; 129 } 130 while ((dent = readdir(dir)) != NULL) { 131 char *name = dent->d_name; 132 FILE *file; 133 unsigned int dev_port; 134 int r; 135 136 if ((name[0] == '.') && 137 ((name[1] == '\0') || 138 ((name[1] == '.') && (name[2] == '\0')))) 139 continue; 140 141 MKSTR(path, "%s/device/net/%s/%s", 142 priv->ctx->device->ibdev_path, name, 143 (dev_type ? "dev_id" : "dev_port")); 144 145 file = fopen(path, "rb"); 146 if (file == NULL) { 147 if (errno != ENOENT) 148 continue; 149 /* 150 * Switch to dev_id when dev_port does not exist as 151 * is the case with Linux kernel versions < 3.15. 152 */ 153 try_dev_id: 154 match[0] = '\0'; 155 if (dev_type) 156 break; 157 dev_type = 1; 158 dev_port_prev = ~0u; 159 rewinddir(dir); 160 continue; 161 } 162 r = fscanf(file, (dev_type ? "%x" : "%u"), &dev_port); 163 fclose(file); 164 if (r != 1) 165 continue; 166 /* 167 * Switch to dev_id when dev_port returns the same value for 168 * all ports. May happen when using a MOFED release older than 169 * 3.0 with a Linux kernel >= 3.15. 170 */ 171 if (dev_port == dev_port_prev) 172 goto try_dev_id; 173 dev_port_prev = dev_port; 174 if (dev_port == (priv->port - 1u)) 175 snprintf(match, sizeof(match), "%s", name); 176 } 177 closedir(dir); 178 if (match[0] == '\0') 179 return -1; 180 strncpy(*ifname, match, sizeof(*ifname)); 181 return 0; 182 } 183 184 /** 185 * Read from sysfs entry. 186 * 187 * @param[in] priv 188 * Pointer to private structure. 189 * @param[in] entry 190 * Entry name relative to sysfs path. 191 * @param[out] buf 192 * Data output buffer. 193 * @param size 194 * Buffer size. 195 * 196 * @return 197 * 0 on success, -1 on failure and errno is set. 198 */ 199 static int 200 priv_sysfs_read(const struct priv *priv, const char *entry, 201 char *buf, size_t size) 202 { 203 char ifname[IF_NAMESIZE]; 204 FILE *file; 205 int ret; 206 int err; 207 208 if (priv_get_ifname(priv, &ifname)) 209 return -1; 210 211 MKSTR(path, "%s/device/net/%s/%s", priv->ctx->device->ibdev_path, 212 ifname, entry); 213 214 file = fopen(path, "rb"); 215 if (file == NULL) 216 return -1; 217 ret = fread(buf, 1, size, file); 218 err = errno; 219 if (((size_t)ret < size) && (ferror(file))) 220 ret = -1; 221 else 222 ret = size; 223 fclose(file); 224 errno = err; 225 return ret; 226 } 227 228 /** 229 * Write to sysfs entry. 230 * 231 * @param[in] priv 232 * Pointer to private structure. 233 * @param[in] entry 234 * Entry name relative to sysfs path. 235 * @param[in] buf 236 * Data buffer. 237 * @param size 238 * Buffer size. 239 * 240 * @return 241 * 0 on success, -1 on failure and errno is set. 242 */ 243 static int 244 priv_sysfs_write(const struct priv *priv, const char *entry, 245 char *buf, size_t size) 246 { 247 char ifname[IF_NAMESIZE]; 248 FILE *file; 249 int ret; 250 int err; 251 252 if (priv_get_ifname(priv, &ifname)) 253 return -1; 254 255 MKSTR(path, "%s/device/net/%s/%s", priv->ctx->device->ibdev_path, 256 ifname, entry); 257 258 file = fopen(path, "wb"); 259 if (file == NULL) 260 return -1; 261 ret = fwrite(buf, 1, size, file); 262 err = errno; 263 if (((size_t)ret < size) || (ferror(file))) 264 ret = -1; 265 else 266 ret = size; 267 fclose(file); 268 errno = err; 269 return ret; 270 } 271 272 /** 273 * Get unsigned long sysfs property. 274 * 275 * @param priv 276 * Pointer to private structure. 277 * @param[in] name 278 * Entry name relative to sysfs path. 279 * @param[out] value 280 * Value output buffer. 281 * 282 * @return 283 * 0 on success, -1 on failure and errno is set. 284 */ 285 static int 286 priv_get_sysfs_ulong(struct priv *priv, const char *name, unsigned long *value) 287 { 288 int ret; 289 unsigned long value_ret; 290 char value_str[32]; 291 292 ret = priv_sysfs_read(priv, name, value_str, (sizeof(value_str) - 1)); 293 if (ret == -1) { 294 DEBUG("cannot read %s value from sysfs: %s", 295 name, strerror(errno)); 296 return -1; 297 } 298 value_str[ret] = '\0'; 299 errno = 0; 300 value_ret = strtoul(value_str, NULL, 0); 301 if (errno) { 302 DEBUG("invalid %s value `%s': %s", name, value_str, 303 strerror(errno)); 304 return -1; 305 } 306 *value = value_ret; 307 return 0; 308 } 309 310 /** 311 * Set unsigned long sysfs property. 312 * 313 * @param priv 314 * Pointer to private structure. 315 * @param[in] name 316 * Entry name relative to sysfs path. 317 * @param value 318 * Value to set. 319 * 320 * @return 321 * 0 on success, -1 on failure and errno is set. 322 */ 323 static int 324 priv_set_sysfs_ulong(struct priv *priv, const char *name, unsigned long value) 325 { 326 int ret; 327 MKSTR(value_str, "%lu", value); 328 329 ret = priv_sysfs_write(priv, name, value_str, (sizeof(value_str) - 1)); 330 if (ret == -1) { 331 DEBUG("cannot write %s `%s' (%lu) to sysfs: %s", 332 name, value_str, value, strerror(errno)); 333 return -1; 334 } 335 return 0; 336 } 337 338 /** 339 * Perform ifreq ioctl() on associated Ethernet device. 340 * 341 * @param[in] priv 342 * Pointer to private structure. 343 * @param req 344 * Request number to pass to ioctl(). 345 * @param[out] ifr 346 * Interface request structure output buffer. 347 * 348 * @return 349 * 0 on success, -1 on failure and errno is set. 350 */ 351 int 352 priv_ifreq(const struct priv *priv, int req, struct ifreq *ifr) 353 { 354 int sock = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP); 355 int ret = -1; 356 357 if (sock == -1) 358 return ret; 359 if (priv_get_ifname(priv, &ifr->ifr_name) == 0) 360 ret = ioctl(sock, req, ifr); 361 close(sock); 362 return ret; 363 } 364 365 /** 366 * Get device MTU. 367 * 368 * @param priv 369 * Pointer to private structure. 370 * @param[out] mtu 371 * MTU value output buffer. 372 * 373 * @return 374 * 0 on success, -1 on failure and errno is set. 375 */ 376 int 377 priv_get_mtu(struct priv *priv, uint16_t *mtu) 378 { 379 unsigned long ulong_mtu; 380 381 if (priv_get_sysfs_ulong(priv, "mtu", &ulong_mtu) == -1) 382 return -1; 383 *mtu = ulong_mtu; 384 return 0; 385 } 386 387 /** 388 * Set device MTU. 389 * 390 * @param priv 391 * Pointer to private structure. 392 * @param mtu 393 * MTU value to set. 394 * 395 * @return 396 * 0 on success, -1 on failure and errno is set. 397 */ 398 static int 399 priv_set_mtu(struct priv *priv, uint16_t mtu) 400 { 401 return priv_set_sysfs_ulong(priv, "mtu", mtu); 402 } 403 404 /** 405 * Set device flags. 406 * 407 * @param priv 408 * Pointer to private structure. 409 * @param keep 410 * Bitmask for flags that must remain untouched. 411 * @param flags 412 * Bitmask for flags to modify. 413 * 414 * @return 415 * 0 on success, -1 on failure and errno is set. 416 */ 417 int 418 priv_set_flags(struct priv *priv, unsigned int keep, unsigned int flags) 419 { 420 unsigned long tmp; 421 422 if (priv_get_sysfs_ulong(priv, "flags", &tmp) == -1) 423 return -1; 424 tmp &= keep; 425 tmp |= flags; 426 return priv_set_sysfs_ulong(priv, "flags", tmp); 427 } 428 429 /** 430 * Ethernet device configuration. 431 * 432 * Prepare the driver for a given number of TX and RX queues. 433 * 434 * @param dev 435 * Pointer to Ethernet device structure. 436 * 437 * @return 438 * 0 on success, errno value on failure. 439 */ 440 static int 441 dev_configure(struct rte_eth_dev *dev) 442 { 443 struct priv *priv = dev->data->dev_private; 444 unsigned int rxqs_n = dev->data->nb_rx_queues; 445 unsigned int txqs_n = dev->data->nb_tx_queues; 446 unsigned int i; 447 unsigned int j; 448 unsigned int reta_idx_n; 449 450 priv->rss_hf = dev->data->dev_conf.rx_adv_conf.rss_conf.rss_hf; 451 priv->rxqs = (void *)dev->data->rx_queues; 452 priv->txqs = (void *)dev->data->tx_queues; 453 if (txqs_n != priv->txqs_n) { 454 INFO("%p: TX queues number update: %u -> %u", 455 (void *)dev, priv->txqs_n, txqs_n); 456 priv->txqs_n = txqs_n; 457 } 458 if (rxqs_n > priv->ind_table_max_size) { 459 ERROR("cannot handle this many RX queues (%u)", rxqs_n); 460 return EINVAL; 461 } 462 if (rxqs_n == priv->rxqs_n) 463 return 0; 464 INFO("%p: RX queues number update: %u -> %u", 465 (void *)dev, priv->rxqs_n, rxqs_n); 466 priv->rxqs_n = rxqs_n; 467 /* If the requested number of RX queues is not a power of two, use the 468 * maximum indirection table size for better balancing. 469 * The result is always rounded to the next power of two. */ 470 reta_idx_n = (1 << log2above((rxqs_n & (rxqs_n - 1)) ? 471 priv->ind_table_max_size : 472 rxqs_n)); 473 if (priv_rss_reta_index_resize(priv, reta_idx_n)) 474 return ENOMEM; 475 /* When the number of RX queues is not a power of two, the remaining 476 * table entries are padded with reused WQs and hashes are not spread 477 * uniformly. */ 478 for (i = 0, j = 0; (i != reta_idx_n); ++i) { 479 (*priv->reta_idx)[i] = j; 480 if (++j == rxqs_n) 481 j = 0; 482 } 483 return 0; 484 } 485 486 /** 487 * DPDK callback for Ethernet device configuration. 488 * 489 * @param dev 490 * Pointer to Ethernet device structure. 491 * 492 * @return 493 * 0 on success, negative errno value on failure. 494 */ 495 int 496 mlx5_dev_configure(struct rte_eth_dev *dev) 497 { 498 struct priv *priv = dev->data->dev_private; 499 int ret; 500 501 if (mlx5_is_secondary()) 502 return -E_RTE_SECONDARY; 503 504 priv_lock(priv); 505 ret = dev_configure(dev); 506 assert(ret >= 0); 507 priv_unlock(priv); 508 return -ret; 509 } 510 511 /** 512 * DPDK callback to get information about the device. 513 * 514 * @param dev 515 * Pointer to Ethernet device structure. 516 * @param[out] info 517 * Info structure output buffer. 518 */ 519 void 520 mlx5_dev_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *info) 521 { 522 struct priv *priv = mlx5_get_priv(dev); 523 unsigned int max; 524 char ifname[IF_NAMESIZE]; 525 526 priv_lock(priv); 527 /* FIXME: we should ask the device for these values. */ 528 info->min_rx_bufsize = 32; 529 info->max_rx_pktlen = 65536; 530 /* 531 * Since we need one CQ per QP, the limit is the minimum number 532 * between the two values. 533 */ 534 max = ((priv->device_attr.max_cq > priv->device_attr.max_qp) ? 535 priv->device_attr.max_qp : priv->device_attr.max_cq); 536 /* If max >= 65535 then max = 0, max_rx_queues is uint16_t. */ 537 if (max >= 65535) 538 max = 65535; 539 info->max_rx_queues = max; 540 info->max_tx_queues = max; 541 info->max_mac_addrs = RTE_DIM(priv->mac); 542 info->rx_offload_capa = 543 (priv->hw_csum ? 544 (DEV_RX_OFFLOAD_IPV4_CKSUM | 545 DEV_RX_OFFLOAD_UDP_CKSUM | 546 DEV_RX_OFFLOAD_TCP_CKSUM) : 547 0); 548 info->tx_offload_capa = DEV_TX_OFFLOAD_VLAN_INSERT; 549 if (priv->hw_csum) 550 info->tx_offload_capa |= 551 (DEV_TX_OFFLOAD_IPV4_CKSUM | 552 DEV_TX_OFFLOAD_UDP_CKSUM | 553 DEV_TX_OFFLOAD_TCP_CKSUM); 554 if (priv_get_ifname(priv, &ifname) == 0) 555 info->if_index = if_nametoindex(ifname); 556 /* FIXME: RETA update/query API expects the callee to know the size of 557 * the indirection table, for this PMD the size varies depending on 558 * the number of RX queues, it becomes impossible to find the correct 559 * size if it is not fixed. 560 * The API should be updated to solve this problem. */ 561 info->reta_size = priv->ind_table_max_size; 562 info->speed_capa = 563 ETH_LINK_SPEED_1G | 564 ETH_LINK_SPEED_10G | 565 ETH_LINK_SPEED_20G | 566 ETH_LINK_SPEED_25G | 567 ETH_LINK_SPEED_40G | 568 ETH_LINK_SPEED_50G | 569 ETH_LINK_SPEED_56G; 570 priv_unlock(priv); 571 } 572 573 const uint32_t * 574 mlx5_dev_supported_ptypes_get(struct rte_eth_dev *dev) 575 { 576 static const uint32_t ptypes[] = { 577 /* refers to rxq_cq_to_pkt_type() */ 578 RTE_PTYPE_L3_IPV4, 579 RTE_PTYPE_L3_IPV6, 580 RTE_PTYPE_INNER_L3_IPV4, 581 RTE_PTYPE_INNER_L3_IPV6, 582 RTE_PTYPE_UNKNOWN 583 584 }; 585 586 if (dev->rx_pkt_burst == mlx5_rx_burst || 587 dev->rx_pkt_burst == mlx5_rx_burst_sp) 588 return ptypes; 589 return NULL; 590 } 591 592 /** 593 * DPDK callback to retrieve physical link information (unlocked version). 594 * 595 * @param dev 596 * Pointer to Ethernet device structure. 597 * @param wait_to_complete 598 * Wait for request completion (ignored). 599 */ 600 static int 601 mlx5_link_update_unlocked(struct rte_eth_dev *dev, int wait_to_complete) 602 { 603 struct priv *priv = mlx5_get_priv(dev); 604 struct ethtool_cmd edata = { 605 .cmd = ETHTOOL_GSET 606 }; 607 struct ifreq ifr; 608 struct rte_eth_link dev_link; 609 int link_speed = 0; 610 611 (void)wait_to_complete; 612 if (priv_ifreq(priv, SIOCGIFFLAGS, &ifr)) { 613 WARN("ioctl(SIOCGIFFLAGS) failed: %s", strerror(errno)); 614 return -1; 615 } 616 memset(&dev_link, 0, sizeof(dev_link)); 617 dev_link.link_status = ((ifr.ifr_flags & IFF_UP) && 618 (ifr.ifr_flags & IFF_RUNNING)); 619 ifr.ifr_data = &edata; 620 if (priv_ifreq(priv, SIOCETHTOOL, &ifr)) { 621 WARN("ioctl(SIOCETHTOOL, ETHTOOL_GSET) failed: %s", 622 strerror(errno)); 623 return -1; 624 } 625 link_speed = ethtool_cmd_speed(&edata); 626 if (link_speed == -1) 627 dev_link.link_speed = 0; 628 else 629 dev_link.link_speed = link_speed; 630 dev_link.link_duplex = ((edata.duplex == DUPLEX_HALF) ? 631 ETH_LINK_HALF_DUPLEX : ETH_LINK_FULL_DUPLEX); 632 dev_link.link_autoneg = !(dev->data->dev_conf.link_speeds & 633 ETH_LINK_SPEED_FIXED); 634 if (memcmp(&dev_link, &dev->data->dev_link, sizeof(dev_link))) { 635 /* Link status changed. */ 636 dev->data->dev_link = dev_link; 637 return 0; 638 } 639 /* Link status is still the same. */ 640 return -1; 641 } 642 643 /** 644 * DPDK callback to retrieve physical link information. 645 * 646 * @param dev 647 * Pointer to Ethernet device structure. 648 * @param wait_to_complete 649 * Wait for request completion (ignored). 650 */ 651 int 652 mlx5_link_update(struct rte_eth_dev *dev, int wait_to_complete) 653 { 654 struct priv *priv = mlx5_get_priv(dev); 655 int ret; 656 657 priv_lock(priv); 658 ret = mlx5_link_update_unlocked(dev, wait_to_complete); 659 priv_unlock(priv); 660 return ret; 661 } 662 663 /** 664 * DPDK callback to change the MTU. 665 * 666 * Setting the MTU affects hardware MRU (packets larger than the MTU cannot be 667 * received). Use this as a hint to enable/disable scattered packets support 668 * and improve performance when not needed. 669 * Since failure is not an option, reconfiguring queues on the fly is not 670 * recommended. 671 * 672 * @param dev 673 * Pointer to Ethernet device structure. 674 * @param in_mtu 675 * New MTU. 676 * 677 * @return 678 * 0 on success, negative errno value on failure. 679 */ 680 int 681 mlx5_dev_set_mtu(struct rte_eth_dev *dev, uint16_t mtu) 682 { 683 struct priv *priv = dev->data->dev_private; 684 int ret = 0; 685 unsigned int i; 686 uint16_t (*rx_func)(void *, struct rte_mbuf **, uint16_t) = 687 mlx5_rx_burst; 688 689 if (mlx5_is_secondary()) 690 return -E_RTE_SECONDARY; 691 692 priv_lock(priv); 693 /* Set kernel interface MTU first. */ 694 if (priv_set_mtu(priv, mtu)) { 695 ret = errno; 696 WARN("cannot set port %u MTU to %u: %s", priv->port, mtu, 697 strerror(ret)); 698 goto out; 699 } else 700 DEBUG("adapter port %u MTU set to %u", priv->port, mtu); 701 priv->mtu = mtu; 702 /* Temporarily replace RX handler with a fake one, assuming it has not 703 * been copied elsewhere. */ 704 dev->rx_pkt_burst = removed_rx_burst; 705 /* Make sure everyone has left mlx5_rx_burst() and uses 706 * removed_rx_burst() instead. */ 707 rte_wmb(); 708 usleep(1000); 709 /* Reconfigure each RX queue. */ 710 for (i = 0; (i != priv->rxqs_n); ++i) { 711 struct rxq *rxq = (*priv->rxqs)[i]; 712 unsigned int max_frame_len; 713 int sp; 714 715 if (rxq == NULL) 716 continue; 717 /* Calculate new maximum frame length according to MTU and 718 * toggle scattered support (sp) if necessary. */ 719 max_frame_len = (priv->mtu + ETHER_HDR_LEN + 720 (ETHER_MAX_VLAN_FRAME_LEN - ETHER_MAX_LEN)); 721 sp = (max_frame_len > (rxq->mb_len - RTE_PKTMBUF_HEADROOM)); 722 /* Provide new values to rxq_setup(). */ 723 dev->data->dev_conf.rxmode.jumbo_frame = sp; 724 dev->data->dev_conf.rxmode.max_rx_pkt_len = max_frame_len; 725 ret = rxq_rehash(dev, rxq); 726 if (ret) { 727 /* Force SP RX if that queue requires it and abort. */ 728 if (rxq->sp) 729 rx_func = mlx5_rx_burst_sp; 730 break; 731 } 732 /* Scattered burst function takes priority. */ 733 if (rxq->sp) 734 rx_func = mlx5_rx_burst_sp; 735 } 736 /* Burst functions can now be called again. */ 737 rte_wmb(); 738 dev->rx_pkt_burst = rx_func; 739 out: 740 priv_unlock(priv); 741 assert(ret >= 0); 742 return -ret; 743 } 744 745 /** 746 * DPDK callback to get flow control status. 747 * 748 * @param dev 749 * Pointer to Ethernet device structure. 750 * @param[out] fc_conf 751 * Flow control output buffer. 752 * 753 * @return 754 * 0 on success, negative errno value on failure. 755 */ 756 int 757 mlx5_dev_get_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf) 758 { 759 struct priv *priv = dev->data->dev_private; 760 struct ifreq ifr; 761 struct ethtool_pauseparam ethpause = { 762 .cmd = ETHTOOL_GPAUSEPARAM 763 }; 764 int ret; 765 766 if (mlx5_is_secondary()) 767 return -E_RTE_SECONDARY; 768 769 ifr.ifr_data = ðpause; 770 priv_lock(priv); 771 if (priv_ifreq(priv, SIOCETHTOOL, &ifr)) { 772 ret = errno; 773 WARN("ioctl(SIOCETHTOOL, ETHTOOL_GPAUSEPARAM)" 774 " failed: %s", 775 strerror(ret)); 776 goto out; 777 } 778 779 fc_conf->autoneg = ethpause.autoneg; 780 if (ethpause.rx_pause && ethpause.tx_pause) 781 fc_conf->mode = RTE_FC_FULL; 782 else if (ethpause.rx_pause) 783 fc_conf->mode = RTE_FC_RX_PAUSE; 784 else if (ethpause.tx_pause) 785 fc_conf->mode = RTE_FC_TX_PAUSE; 786 else 787 fc_conf->mode = RTE_FC_NONE; 788 ret = 0; 789 790 out: 791 priv_unlock(priv); 792 assert(ret >= 0); 793 return -ret; 794 } 795 796 /** 797 * DPDK callback to modify flow control parameters. 798 * 799 * @param dev 800 * Pointer to Ethernet device structure. 801 * @param[in] fc_conf 802 * Flow control parameters. 803 * 804 * @return 805 * 0 on success, negative errno value on failure. 806 */ 807 int 808 mlx5_dev_set_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf) 809 { 810 struct priv *priv = dev->data->dev_private; 811 struct ifreq ifr; 812 struct ethtool_pauseparam ethpause = { 813 .cmd = ETHTOOL_SPAUSEPARAM 814 }; 815 int ret; 816 817 if (mlx5_is_secondary()) 818 return -E_RTE_SECONDARY; 819 820 ifr.ifr_data = ðpause; 821 ethpause.autoneg = fc_conf->autoneg; 822 if (((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL) || 823 (fc_conf->mode & RTE_FC_RX_PAUSE)) 824 ethpause.rx_pause = 1; 825 else 826 ethpause.rx_pause = 0; 827 828 if (((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL) || 829 (fc_conf->mode & RTE_FC_TX_PAUSE)) 830 ethpause.tx_pause = 1; 831 else 832 ethpause.tx_pause = 0; 833 834 priv_lock(priv); 835 if (priv_ifreq(priv, SIOCETHTOOL, &ifr)) { 836 ret = errno; 837 WARN("ioctl(SIOCETHTOOL, ETHTOOL_SPAUSEPARAM)" 838 " failed: %s", 839 strerror(ret)); 840 goto out; 841 } 842 ret = 0; 843 844 out: 845 priv_unlock(priv); 846 assert(ret >= 0); 847 return -ret; 848 } 849 850 /** 851 * Get PCI information from struct ibv_device. 852 * 853 * @param device 854 * Pointer to Ethernet device structure. 855 * @param[out] pci_addr 856 * PCI bus address output buffer. 857 * 858 * @return 859 * 0 on success, -1 on failure and errno is set. 860 */ 861 int 862 mlx5_ibv_device_to_pci_addr(const struct ibv_device *device, 863 struct rte_pci_addr *pci_addr) 864 { 865 FILE *file; 866 char line[32]; 867 MKSTR(path, "%s/device/uevent", device->ibdev_path); 868 869 file = fopen(path, "rb"); 870 if (file == NULL) 871 return -1; 872 while (fgets(line, sizeof(line), file) == line) { 873 size_t len = strlen(line); 874 int ret; 875 876 /* Truncate long lines. */ 877 if (len == (sizeof(line) - 1)) 878 while (line[(len - 1)] != '\n') { 879 ret = fgetc(file); 880 if (ret == EOF) 881 break; 882 line[(len - 1)] = ret; 883 } 884 /* Extract information. */ 885 if (sscanf(line, 886 "PCI_SLOT_NAME=" 887 "%" SCNx16 ":%" SCNx8 ":%" SCNx8 ".%" SCNx8 "\n", 888 &pci_addr->domain, 889 &pci_addr->bus, 890 &pci_addr->devid, 891 &pci_addr->function) == 4) { 892 ret = 0; 893 break; 894 } 895 } 896 fclose(file); 897 return 0; 898 } 899 900 /** 901 * Link status handler. 902 * 903 * @param priv 904 * Pointer to private structure. 905 * @param dev 906 * Pointer to the rte_eth_dev structure. 907 * 908 * @return 909 * Nonzero if the callback process can be called immediately. 910 */ 911 static int 912 priv_dev_link_status_handler(struct priv *priv, struct rte_eth_dev *dev) 913 { 914 struct ibv_async_event event; 915 int port_change = 0; 916 int ret = 0; 917 918 /* Read all message and acknowledge them. */ 919 for (;;) { 920 if (ibv_get_async_event(priv->ctx, &event)) 921 break; 922 923 if (event.event_type == IBV_EVENT_PORT_ACTIVE || 924 event.event_type == IBV_EVENT_PORT_ERR) 925 port_change = 1; 926 else 927 DEBUG("event type %d on port %d not handled", 928 event.event_type, event.element.port_num); 929 ibv_ack_async_event(&event); 930 } 931 932 if (port_change ^ priv->pending_alarm) { 933 struct rte_eth_link *link = &dev->data->dev_link; 934 935 priv->pending_alarm = 0; 936 mlx5_link_update_unlocked(dev, 0); 937 if (((link->link_speed == 0) && link->link_status) || 938 ((link->link_speed != 0) && !link->link_status)) { 939 /* Inconsistent status, check again later. */ 940 priv->pending_alarm = 1; 941 rte_eal_alarm_set(MLX5_ALARM_TIMEOUT_US, 942 mlx5_dev_link_status_handler, 943 dev); 944 } else 945 ret = 1; 946 } 947 return ret; 948 } 949 950 /** 951 * Handle delayed link status event. 952 * 953 * @param arg 954 * Registered argument. 955 */ 956 void 957 mlx5_dev_link_status_handler(void *arg) 958 { 959 struct rte_eth_dev *dev = arg; 960 struct priv *priv = dev->data->dev_private; 961 int ret; 962 963 priv_lock(priv); 964 assert(priv->pending_alarm == 1); 965 ret = priv_dev_link_status_handler(priv, dev); 966 priv_unlock(priv); 967 if (ret) 968 _rte_eth_dev_callback_process(dev, RTE_ETH_EVENT_INTR_LSC); 969 } 970 971 /** 972 * Handle interrupts from the NIC. 973 * 974 * @param[in] intr_handle 975 * Interrupt handler. 976 * @param cb_arg 977 * Callback argument. 978 */ 979 void 980 mlx5_dev_interrupt_handler(struct rte_intr_handle *intr_handle, void *cb_arg) 981 { 982 struct rte_eth_dev *dev = cb_arg; 983 struct priv *priv = dev->data->dev_private; 984 int ret; 985 986 (void)intr_handle; 987 priv_lock(priv); 988 ret = priv_dev_link_status_handler(priv, dev); 989 priv_unlock(priv); 990 if (ret) 991 _rte_eth_dev_callback_process(dev, RTE_ETH_EVENT_INTR_LSC); 992 } 993 994 /** 995 * Uninstall interrupt handler. 996 * 997 * @param priv 998 * Pointer to private structure. 999 * @param dev 1000 * Pointer to the rte_eth_dev structure. 1001 */ 1002 void 1003 priv_dev_interrupt_handler_uninstall(struct priv *priv, struct rte_eth_dev *dev) 1004 { 1005 if (!dev->data->dev_conf.intr_conf.lsc) 1006 return; 1007 rte_intr_callback_unregister(&priv->intr_handle, 1008 mlx5_dev_interrupt_handler, 1009 dev); 1010 if (priv->pending_alarm) 1011 rte_eal_alarm_cancel(mlx5_dev_link_status_handler, dev); 1012 priv->pending_alarm = 0; 1013 priv->intr_handle.fd = 0; 1014 priv->intr_handle.type = 0; 1015 } 1016 1017 /** 1018 * Install interrupt handler. 1019 * 1020 * @param priv 1021 * Pointer to private structure. 1022 * @param dev 1023 * Pointer to the rte_eth_dev structure. 1024 */ 1025 void 1026 priv_dev_interrupt_handler_install(struct priv *priv, struct rte_eth_dev *dev) 1027 { 1028 int rc, flags; 1029 1030 if (!dev->data->dev_conf.intr_conf.lsc) 1031 return; 1032 assert(priv->ctx->async_fd > 0); 1033 flags = fcntl(priv->ctx->async_fd, F_GETFL); 1034 rc = fcntl(priv->ctx->async_fd, F_SETFL, flags | O_NONBLOCK); 1035 if (rc < 0) { 1036 INFO("failed to change file descriptor async event queue"); 1037 dev->data->dev_conf.intr_conf.lsc = 0; 1038 } else { 1039 priv->intr_handle.fd = priv->ctx->async_fd; 1040 priv->intr_handle.type = RTE_INTR_HANDLE_EXT; 1041 rte_intr_callback_register(&priv->intr_handle, 1042 mlx5_dev_interrupt_handler, 1043 dev); 1044 } 1045 } 1046 1047 /** 1048 * Change the link state (UP / DOWN). 1049 * 1050 * @param dev 1051 * Pointer to Ethernet device structure. 1052 * @param up 1053 * Nonzero for link up, otherwise link down. 1054 * 1055 * @return 1056 * 0 on success, errno value on failure. 1057 */ 1058 static int 1059 priv_set_link(struct priv *priv, int up) 1060 { 1061 struct rte_eth_dev *dev = priv->dev; 1062 int err; 1063 unsigned int i; 1064 1065 if (up) { 1066 err = priv_set_flags(priv, ~IFF_UP, IFF_UP); 1067 if (err) 1068 return err; 1069 for (i = 0; i < priv->rxqs_n; i++) 1070 if ((*priv->rxqs)[i]->sp) 1071 break; 1072 /* Check if an sp queue exists. 1073 * Note: Some old frames might be received. 1074 */ 1075 if (i == priv->rxqs_n) 1076 dev->rx_pkt_burst = mlx5_rx_burst; 1077 else 1078 dev->rx_pkt_burst = mlx5_rx_burst_sp; 1079 dev->tx_pkt_burst = mlx5_tx_burst; 1080 } else { 1081 err = priv_set_flags(priv, ~IFF_UP, ~IFF_UP); 1082 if (err) 1083 return err; 1084 dev->rx_pkt_burst = removed_rx_burst; 1085 dev->tx_pkt_burst = removed_tx_burst; 1086 } 1087 return 0; 1088 } 1089 1090 /** 1091 * DPDK callback to bring the link DOWN. 1092 * 1093 * @param dev 1094 * Pointer to Ethernet device structure. 1095 * 1096 * @return 1097 * 0 on success, errno value on failure. 1098 */ 1099 int 1100 mlx5_set_link_down(struct rte_eth_dev *dev) 1101 { 1102 struct priv *priv = dev->data->dev_private; 1103 int err; 1104 1105 priv_lock(priv); 1106 err = priv_set_link(priv, 0); 1107 priv_unlock(priv); 1108 return err; 1109 } 1110 1111 /** 1112 * DPDK callback to bring the link UP. 1113 * 1114 * @param dev 1115 * Pointer to Ethernet device structure. 1116 * 1117 * @return 1118 * 0 on success, errno value on failure. 1119 */ 1120 int 1121 mlx5_set_link_up(struct rte_eth_dev *dev) 1122 { 1123 struct priv *priv = dev->data->dev_private; 1124 int err; 1125 1126 priv_lock(priv); 1127 err = priv_set_link(priv, 1); 1128 priv_unlock(priv); 1129 return err; 1130 } 1131 1132 /** 1133 * Configure secondary process queues from a private data pointer (primary 1134 * or secondary) and update burst callbacks. Can take place only once. 1135 * 1136 * All queues must have been previously created by the primary process to 1137 * avoid undefined behavior. 1138 * 1139 * @param priv 1140 * Private data pointer from either primary or secondary process. 1141 * 1142 * @return 1143 * Private data pointer from secondary process, NULL in case of error. 1144 */ 1145 struct priv * 1146 mlx5_secondary_data_setup(struct priv *priv) 1147 { 1148 unsigned int port_id = 0; 1149 struct mlx5_secondary_data *sd; 1150 void **tx_queues; 1151 void **rx_queues; 1152 unsigned int nb_tx_queues; 1153 unsigned int nb_rx_queues; 1154 unsigned int i; 1155 1156 /* priv must be valid at this point. */ 1157 assert(priv != NULL); 1158 /* priv->dev must also be valid but may point to local memory from 1159 * another process, possibly with the same address and must not 1160 * be dereferenced yet. */ 1161 assert(priv->dev != NULL); 1162 /* Determine port ID by finding out where priv comes from. */ 1163 while (1) { 1164 sd = &mlx5_secondary_data[port_id]; 1165 rte_spinlock_lock(&sd->lock); 1166 /* Primary process? */ 1167 if (sd->primary_priv == priv) 1168 break; 1169 /* Secondary process? */ 1170 if (sd->data.dev_private == priv) 1171 break; 1172 rte_spinlock_unlock(&sd->lock); 1173 if (++port_id == RTE_DIM(mlx5_secondary_data)) 1174 port_id = 0; 1175 } 1176 /* Switch to secondary private structure. If private data has already 1177 * been updated by another thread, there is nothing else to do. */ 1178 priv = sd->data.dev_private; 1179 if (priv->dev->data == &sd->data) 1180 goto end; 1181 /* Sanity checks. Secondary private structure is supposed to point 1182 * to local eth_dev, itself still pointing to the shared device data 1183 * structure allocated by the primary process. */ 1184 assert(sd->shared_dev_data != &sd->data); 1185 assert(sd->data.nb_tx_queues == 0); 1186 assert(sd->data.tx_queues == NULL); 1187 assert(sd->data.nb_rx_queues == 0); 1188 assert(sd->data.rx_queues == NULL); 1189 assert(priv != sd->primary_priv); 1190 assert(priv->dev->data == sd->shared_dev_data); 1191 assert(priv->txqs_n == 0); 1192 assert(priv->txqs == NULL); 1193 assert(priv->rxqs_n == 0); 1194 assert(priv->rxqs == NULL); 1195 nb_tx_queues = sd->shared_dev_data->nb_tx_queues; 1196 nb_rx_queues = sd->shared_dev_data->nb_rx_queues; 1197 /* Allocate local storage for queues. */ 1198 tx_queues = rte_zmalloc("secondary ethdev->tx_queues", 1199 sizeof(sd->data.tx_queues[0]) * nb_tx_queues, 1200 RTE_CACHE_LINE_SIZE); 1201 rx_queues = rte_zmalloc("secondary ethdev->rx_queues", 1202 sizeof(sd->data.rx_queues[0]) * nb_rx_queues, 1203 RTE_CACHE_LINE_SIZE); 1204 if (tx_queues == NULL || rx_queues == NULL) 1205 goto error; 1206 /* Lock to prevent control operations during setup. */ 1207 priv_lock(priv); 1208 /* TX queues. */ 1209 for (i = 0; i != nb_tx_queues; ++i) { 1210 struct txq *primary_txq = (*sd->primary_priv->txqs)[i]; 1211 struct txq *txq; 1212 1213 if (primary_txq == NULL) 1214 continue; 1215 txq = rte_calloc_socket("TXQ", 1, sizeof(*txq), 0, 1216 primary_txq->socket); 1217 if (txq != NULL) { 1218 if (txq_setup(priv->dev, 1219 txq, 1220 primary_txq->elts_n * MLX5_PMD_SGE_WR_N, 1221 primary_txq->socket, 1222 NULL) == 0) { 1223 txq->stats.idx = primary_txq->stats.idx; 1224 tx_queues[i] = txq; 1225 continue; 1226 } 1227 rte_free(txq); 1228 } 1229 while (i) { 1230 txq = tx_queues[--i]; 1231 txq_cleanup(txq); 1232 rte_free(txq); 1233 } 1234 goto error; 1235 } 1236 /* RX queues. */ 1237 for (i = 0; i != nb_rx_queues; ++i) { 1238 struct rxq *primary_rxq = (*sd->primary_priv->rxqs)[i]; 1239 1240 if (primary_rxq == NULL) 1241 continue; 1242 /* Not supported yet. */ 1243 rx_queues[i] = NULL; 1244 } 1245 /* Update everything. */ 1246 priv->txqs = (void *)tx_queues; 1247 priv->txqs_n = nb_tx_queues; 1248 priv->rxqs = (void *)rx_queues; 1249 priv->rxqs_n = nb_rx_queues; 1250 sd->data.rx_queues = rx_queues; 1251 sd->data.tx_queues = tx_queues; 1252 sd->data.nb_rx_queues = nb_rx_queues; 1253 sd->data.nb_tx_queues = nb_tx_queues; 1254 sd->data.dev_link = sd->shared_dev_data->dev_link; 1255 sd->data.mtu = sd->shared_dev_data->mtu; 1256 memcpy(sd->data.rx_queue_state, sd->shared_dev_data->rx_queue_state, 1257 sizeof(sd->data.rx_queue_state)); 1258 memcpy(sd->data.tx_queue_state, sd->shared_dev_data->tx_queue_state, 1259 sizeof(sd->data.tx_queue_state)); 1260 sd->data.dev_flags = sd->shared_dev_data->dev_flags; 1261 /* Use local data from now on. */ 1262 rte_mb(); 1263 priv->dev->data = &sd->data; 1264 rte_mb(); 1265 priv->dev->tx_pkt_burst = mlx5_tx_burst; 1266 priv->dev->rx_pkt_burst = removed_rx_burst; 1267 priv_unlock(priv); 1268 end: 1269 /* More sanity checks. */ 1270 assert(priv->dev->tx_pkt_burst == mlx5_tx_burst); 1271 assert(priv->dev->rx_pkt_burst == removed_rx_burst); 1272 assert(priv->dev->data == &sd->data); 1273 rte_spinlock_unlock(&sd->lock); 1274 return priv; 1275 error: 1276 priv_unlock(priv); 1277 rte_free(tx_queues); 1278 rte_free(rx_queues); 1279 rte_spinlock_unlock(&sd->lock); 1280 return NULL; 1281 } 1282