1 /*- 2 * BSD LICENSE 3 * 4 * Copyright 2015 6WIND S.A. 5 * Copyright 2015 Mellanox. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * * Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * * Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * * Neither the name of 6WIND S.A. nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 */ 33 34 #include <stddef.h> 35 #include <assert.h> 36 #include <unistd.h> 37 #include <stdint.h> 38 #include <stdio.h> 39 #include <string.h> 40 #include <stdlib.h> 41 #include <errno.h> 42 #include <dirent.h> 43 #include <net/if.h> 44 #include <sys/ioctl.h> 45 #include <sys/socket.h> 46 #include <netinet/in.h> 47 #include <linux/if.h> 48 #include <linux/ethtool.h> 49 #include <linux/sockios.h> 50 #include <fcntl.h> 51 52 /* DPDK headers don't like -pedantic. */ 53 #ifdef PEDANTIC 54 #pragma GCC diagnostic ignored "-pedantic" 55 #endif 56 #include <rte_atomic.h> 57 #include <rte_ethdev.h> 58 #include <rte_mbuf.h> 59 #include <rte_common.h> 60 #include <rte_interrupts.h> 61 #include <rte_alarm.h> 62 #include <rte_malloc.h> 63 #ifdef PEDANTIC 64 #pragma GCC diagnostic error "-pedantic" 65 #endif 66 67 #include "mlx5.h" 68 #include "mlx5_rxtx.h" 69 #include "mlx5_utils.h" 70 71 /** 72 * Return private structure associated with an Ethernet device. 73 * 74 * @param dev 75 * Pointer to Ethernet device structure. 76 * 77 * @return 78 * Pointer to private structure. 79 */ 80 struct priv * 81 mlx5_get_priv(struct rte_eth_dev *dev) 82 { 83 struct mlx5_secondary_data *sd; 84 85 if (!mlx5_is_secondary()) 86 return dev->data->dev_private; 87 sd = &mlx5_secondary_data[dev->data->port_id]; 88 return sd->data.dev_private; 89 } 90 91 /** 92 * Check if running as a secondary process. 93 * 94 * @return 95 * Nonzero if running as a secondary process. 96 */ 97 inline int 98 mlx5_is_secondary(void) 99 { 100 return rte_eal_process_type() != RTE_PROC_PRIMARY; 101 } 102 103 /** 104 * Get interface name from private structure. 105 * 106 * @param[in] priv 107 * Pointer to private structure. 108 * @param[out] ifname 109 * Interface name output buffer. 110 * 111 * @return 112 * 0 on success, -1 on failure and errno is set. 113 */ 114 int 115 priv_get_ifname(const struct priv *priv, char (*ifname)[IF_NAMESIZE]) 116 { 117 DIR *dir; 118 struct dirent *dent; 119 unsigned int dev_type = 0; 120 unsigned int dev_port_prev = ~0u; 121 char match[IF_NAMESIZE] = ""; 122 123 { 124 MKSTR(path, "%s/device/net", priv->ctx->device->ibdev_path); 125 126 dir = opendir(path); 127 if (dir == NULL) 128 return -1; 129 } 130 while ((dent = readdir(dir)) != NULL) { 131 char *name = dent->d_name; 132 FILE *file; 133 unsigned int dev_port; 134 int r; 135 136 if ((name[0] == '.') && 137 ((name[1] == '\0') || 138 ((name[1] == '.') && (name[2] == '\0')))) 139 continue; 140 141 MKSTR(path, "%s/device/net/%s/%s", 142 priv->ctx->device->ibdev_path, name, 143 (dev_type ? "dev_id" : "dev_port")); 144 145 file = fopen(path, "rb"); 146 if (file == NULL) { 147 if (errno != ENOENT) 148 continue; 149 /* 150 * Switch to dev_id when dev_port does not exist as 151 * is the case with Linux kernel versions < 3.15. 152 */ 153 try_dev_id: 154 match[0] = '\0'; 155 if (dev_type) 156 break; 157 dev_type = 1; 158 dev_port_prev = ~0u; 159 rewinddir(dir); 160 continue; 161 } 162 r = fscanf(file, (dev_type ? "%x" : "%u"), &dev_port); 163 fclose(file); 164 if (r != 1) 165 continue; 166 /* 167 * Switch to dev_id when dev_port returns the same value for 168 * all ports. May happen when using a MOFED release older than 169 * 3.0 with a Linux kernel >= 3.15. 170 */ 171 if (dev_port == dev_port_prev) 172 goto try_dev_id; 173 dev_port_prev = dev_port; 174 if (dev_port == (priv->port - 1u)) 175 snprintf(match, sizeof(match), "%s", name); 176 } 177 closedir(dir); 178 if (match[0] == '\0') 179 return -1; 180 strncpy(*ifname, match, sizeof(*ifname)); 181 return 0; 182 } 183 184 /** 185 * Read from sysfs entry. 186 * 187 * @param[in] priv 188 * Pointer to private structure. 189 * @param[in] entry 190 * Entry name relative to sysfs path. 191 * @param[out] buf 192 * Data output buffer. 193 * @param size 194 * Buffer size. 195 * 196 * @return 197 * 0 on success, -1 on failure and errno is set. 198 */ 199 static int 200 priv_sysfs_read(const struct priv *priv, const char *entry, 201 char *buf, size_t size) 202 { 203 char ifname[IF_NAMESIZE]; 204 FILE *file; 205 int ret; 206 int err; 207 208 if (priv_get_ifname(priv, &ifname)) 209 return -1; 210 211 MKSTR(path, "%s/device/net/%s/%s", priv->ctx->device->ibdev_path, 212 ifname, entry); 213 214 file = fopen(path, "rb"); 215 if (file == NULL) 216 return -1; 217 ret = fread(buf, 1, size, file); 218 err = errno; 219 if (((size_t)ret < size) && (ferror(file))) 220 ret = -1; 221 else 222 ret = size; 223 fclose(file); 224 errno = err; 225 return ret; 226 } 227 228 /** 229 * Write to sysfs entry. 230 * 231 * @param[in] priv 232 * Pointer to private structure. 233 * @param[in] entry 234 * Entry name relative to sysfs path. 235 * @param[in] buf 236 * Data buffer. 237 * @param size 238 * Buffer size. 239 * 240 * @return 241 * 0 on success, -1 on failure and errno is set. 242 */ 243 static int 244 priv_sysfs_write(const struct priv *priv, const char *entry, 245 char *buf, size_t size) 246 { 247 char ifname[IF_NAMESIZE]; 248 FILE *file; 249 int ret; 250 int err; 251 252 if (priv_get_ifname(priv, &ifname)) 253 return -1; 254 255 MKSTR(path, "%s/device/net/%s/%s", priv->ctx->device->ibdev_path, 256 ifname, entry); 257 258 file = fopen(path, "wb"); 259 if (file == NULL) 260 return -1; 261 ret = fwrite(buf, 1, size, file); 262 err = errno; 263 if (((size_t)ret < size) || (ferror(file))) 264 ret = -1; 265 else 266 ret = size; 267 fclose(file); 268 errno = err; 269 return ret; 270 } 271 272 /** 273 * Get unsigned long sysfs property. 274 * 275 * @param priv 276 * Pointer to private structure. 277 * @param[in] name 278 * Entry name relative to sysfs path. 279 * @param[out] value 280 * Value output buffer. 281 * 282 * @return 283 * 0 on success, -1 on failure and errno is set. 284 */ 285 static int 286 priv_get_sysfs_ulong(struct priv *priv, const char *name, unsigned long *value) 287 { 288 int ret; 289 unsigned long value_ret; 290 char value_str[32]; 291 292 ret = priv_sysfs_read(priv, name, value_str, (sizeof(value_str) - 1)); 293 if (ret == -1) { 294 DEBUG("cannot read %s value from sysfs: %s", 295 name, strerror(errno)); 296 return -1; 297 } 298 value_str[ret] = '\0'; 299 errno = 0; 300 value_ret = strtoul(value_str, NULL, 0); 301 if (errno) { 302 DEBUG("invalid %s value `%s': %s", name, value_str, 303 strerror(errno)); 304 return -1; 305 } 306 *value = value_ret; 307 return 0; 308 } 309 310 /** 311 * Set unsigned long sysfs property. 312 * 313 * @param priv 314 * Pointer to private structure. 315 * @param[in] name 316 * Entry name relative to sysfs path. 317 * @param value 318 * Value to set. 319 * 320 * @return 321 * 0 on success, -1 on failure and errno is set. 322 */ 323 static int 324 priv_set_sysfs_ulong(struct priv *priv, const char *name, unsigned long value) 325 { 326 int ret; 327 MKSTR(value_str, "%lu", value); 328 329 ret = priv_sysfs_write(priv, name, value_str, (sizeof(value_str) - 1)); 330 if (ret == -1) { 331 DEBUG("cannot write %s `%s' (%lu) to sysfs: %s", 332 name, value_str, value, strerror(errno)); 333 return -1; 334 } 335 return 0; 336 } 337 338 /** 339 * Perform ifreq ioctl() on associated Ethernet device. 340 * 341 * @param[in] priv 342 * Pointer to private structure. 343 * @param req 344 * Request number to pass to ioctl(). 345 * @param[out] ifr 346 * Interface request structure output buffer. 347 * 348 * @return 349 * 0 on success, -1 on failure and errno is set. 350 */ 351 int 352 priv_ifreq(const struct priv *priv, int req, struct ifreq *ifr) 353 { 354 int sock = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP); 355 int ret = -1; 356 357 if (sock == -1) 358 return ret; 359 if (priv_get_ifname(priv, &ifr->ifr_name) == 0) 360 ret = ioctl(sock, req, ifr); 361 close(sock); 362 return ret; 363 } 364 365 /** 366 * Get device MTU. 367 * 368 * @param priv 369 * Pointer to private structure. 370 * @param[out] mtu 371 * MTU value output buffer. 372 * 373 * @return 374 * 0 on success, -1 on failure and errno is set. 375 */ 376 int 377 priv_get_mtu(struct priv *priv, uint16_t *mtu) 378 { 379 unsigned long ulong_mtu; 380 381 if (priv_get_sysfs_ulong(priv, "mtu", &ulong_mtu) == -1) 382 return -1; 383 *mtu = ulong_mtu; 384 return 0; 385 } 386 387 /** 388 * Set device MTU. 389 * 390 * @param priv 391 * Pointer to private structure. 392 * @param mtu 393 * MTU value to set. 394 * 395 * @return 396 * 0 on success, -1 on failure and errno is set. 397 */ 398 static int 399 priv_set_mtu(struct priv *priv, uint16_t mtu) 400 { 401 return priv_set_sysfs_ulong(priv, "mtu", mtu); 402 } 403 404 /** 405 * Set device flags. 406 * 407 * @param priv 408 * Pointer to private structure. 409 * @param keep 410 * Bitmask for flags that must remain untouched. 411 * @param flags 412 * Bitmask for flags to modify. 413 * 414 * @return 415 * 0 on success, -1 on failure and errno is set. 416 */ 417 int 418 priv_set_flags(struct priv *priv, unsigned int keep, unsigned int flags) 419 { 420 unsigned long tmp; 421 422 if (priv_get_sysfs_ulong(priv, "flags", &tmp) == -1) 423 return -1; 424 tmp &= keep; 425 tmp |= flags; 426 return priv_set_sysfs_ulong(priv, "flags", tmp); 427 } 428 429 /** 430 * Ethernet device configuration. 431 * 432 * Prepare the driver for a given number of TX and RX queues. 433 * 434 * @param dev 435 * Pointer to Ethernet device structure. 436 * 437 * @return 438 * 0 on success, errno value on failure. 439 */ 440 static int 441 dev_configure(struct rte_eth_dev *dev) 442 { 443 struct priv *priv = dev->data->dev_private; 444 unsigned int rxqs_n = dev->data->nb_rx_queues; 445 unsigned int txqs_n = dev->data->nb_tx_queues; 446 unsigned int i; 447 unsigned int j; 448 unsigned int reta_idx_n; 449 450 priv->rss_hf = dev->data->dev_conf.rx_adv_conf.rss_conf.rss_hf; 451 priv->rxqs = (void *)dev->data->rx_queues; 452 priv->txqs = (void *)dev->data->tx_queues; 453 if (txqs_n != priv->txqs_n) { 454 INFO("%p: TX queues number update: %u -> %u", 455 (void *)dev, priv->txqs_n, txqs_n); 456 priv->txqs_n = txqs_n; 457 } 458 if (rxqs_n > priv->ind_table_max_size) { 459 ERROR("cannot handle this many RX queues (%u)", rxqs_n); 460 return EINVAL; 461 } 462 if (rxqs_n == priv->rxqs_n) 463 return 0; 464 INFO("%p: RX queues number update: %u -> %u", 465 (void *)dev, priv->rxqs_n, rxqs_n); 466 priv->rxqs_n = rxqs_n; 467 /* If the requested number of RX queues is not a power of two, use the 468 * maximum indirection table size for better balancing. 469 * The result is always rounded to the next power of two. */ 470 reta_idx_n = (1 << log2above((rxqs_n & (rxqs_n - 1)) ? 471 priv->ind_table_max_size : 472 rxqs_n)); 473 if (priv_rss_reta_index_resize(priv, reta_idx_n)) 474 return ENOMEM; 475 /* When the number of RX queues is not a power of two, the remaining 476 * table entries are padded with reused WQs and hashes are not spread 477 * uniformly. */ 478 for (i = 0, j = 0; (i != reta_idx_n); ++i) { 479 (*priv->reta_idx)[i] = j; 480 if (++j == rxqs_n) 481 j = 0; 482 } 483 return 0; 484 } 485 486 /** 487 * DPDK callback for Ethernet device configuration. 488 * 489 * @param dev 490 * Pointer to Ethernet device structure. 491 * 492 * @return 493 * 0 on success, negative errno value on failure. 494 */ 495 int 496 mlx5_dev_configure(struct rte_eth_dev *dev) 497 { 498 struct priv *priv = dev->data->dev_private; 499 int ret; 500 501 if (mlx5_is_secondary()) 502 return -E_RTE_SECONDARY; 503 504 priv_lock(priv); 505 ret = dev_configure(dev); 506 assert(ret >= 0); 507 priv_unlock(priv); 508 return -ret; 509 } 510 511 /** 512 * DPDK callback to get information about the device. 513 * 514 * @param dev 515 * Pointer to Ethernet device structure. 516 * @param[out] info 517 * Info structure output buffer. 518 */ 519 void 520 mlx5_dev_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *info) 521 { 522 struct priv *priv = mlx5_get_priv(dev); 523 unsigned int max; 524 char ifname[IF_NAMESIZE]; 525 526 priv_lock(priv); 527 /* FIXME: we should ask the device for these values. */ 528 info->min_rx_bufsize = 32; 529 info->max_rx_pktlen = 65536; 530 /* 531 * Since we need one CQ per QP, the limit is the minimum number 532 * between the two values. 533 */ 534 max = ((priv->device_attr.max_cq > priv->device_attr.max_qp) ? 535 priv->device_attr.max_qp : priv->device_attr.max_cq); 536 /* If max >= 65535 then max = 0, max_rx_queues is uint16_t. */ 537 if (max >= 65535) 538 max = 65535; 539 info->max_rx_queues = max; 540 info->max_tx_queues = max; 541 info->max_mac_addrs = RTE_DIM(priv->mac); 542 info->rx_offload_capa = 543 (priv->hw_csum ? 544 (DEV_RX_OFFLOAD_IPV4_CKSUM | 545 DEV_RX_OFFLOAD_UDP_CKSUM | 546 DEV_RX_OFFLOAD_TCP_CKSUM) : 547 0); 548 info->tx_offload_capa = DEV_TX_OFFLOAD_VLAN_INSERT; 549 if (priv->hw_csum) 550 info->tx_offload_capa |= 551 (DEV_TX_OFFLOAD_IPV4_CKSUM | 552 DEV_TX_OFFLOAD_UDP_CKSUM | 553 DEV_TX_OFFLOAD_TCP_CKSUM); 554 if (priv_get_ifname(priv, &ifname) == 0) 555 info->if_index = if_nametoindex(ifname); 556 /* FIXME: RETA update/query API expects the callee to know the size of 557 * the indirection table, for this PMD the size varies depending on 558 * the number of RX queues, it becomes impossible to find the correct 559 * size if it is not fixed. 560 * The API should be updated to solve this problem. */ 561 info->reta_size = priv->ind_table_max_size; 562 info->speed_capa = 563 ETH_LINK_SPEED_1G | 564 ETH_LINK_SPEED_10G | 565 ETH_LINK_SPEED_20G | 566 ETH_LINK_SPEED_25G | 567 ETH_LINK_SPEED_40G | 568 ETH_LINK_SPEED_50G | 569 ETH_LINK_SPEED_56G | 570 ETH_LINK_SPEED_100G; 571 priv_unlock(priv); 572 } 573 574 const uint32_t * 575 mlx5_dev_supported_ptypes_get(struct rte_eth_dev *dev) 576 { 577 static const uint32_t ptypes[] = { 578 /* refers to rxq_cq_to_pkt_type() */ 579 RTE_PTYPE_L3_IPV4, 580 RTE_PTYPE_L3_IPV6, 581 RTE_PTYPE_INNER_L3_IPV4, 582 RTE_PTYPE_INNER_L3_IPV6, 583 RTE_PTYPE_UNKNOWN 584 585 }; 586 587 if (dev->rx_pkt_burst == mlx5_rx_burst || 588 dev->rx_pkt_burst == mlx5_rx_burst_sp) 589 return ptypes; 590 return NULL; 591 } 592 593 /** 594 * DPDK callback to retrieve physical link information (unlocked version). 595 * 596 * @param dev 597 * Pointer to Ethernet device structure. 598 * @param wait_to_complete 599 * Wait for request completion (ignored). 600 */ 601 static int 602 mlx5_link_update_unlocked(struct rte_eth_dev *dev, int wait_to_complete) 603 { 604 struct priv *priv = mlx5_get_priv(dev); 605 struct ethtool_cmd edata = { 606 .cmd = ETHTOOL_GSET 607 }; 608 struct ifreq ifr; 609 struct rte_eth_link dev_link; 610 int link_speed = 0; 611 612 (void)wait_to_complete; 613 if (priv_ifreq(priv, SIOCGIFFLAGS, &ifr)) { 614 WARN("ioctl(SIOCGIFFLAGS) failed: %s", strerror(errno)); 615 return -1; 616 } 617 memset(&dev_link, 0, sizeof(dev_link)); 618 dev_link.link_status = ((ifr.ifr_flags & IFF_UP) && 619 (ifr.ifr_flags & IFF_RUNNING)); 620 ifr.ifr_data = &edata; 621 if (priv_ifreq(priv, SIOCETHTOOL, &ifr)) { 622 WARN("ioctl(SIOCETHTOOL, ETHTOOL_GSET) failed: %s", 623 strerror(errno)); 624 return -1; 625 } 626 link_speed = ethtool_cmd_speed(&edata); 627 if (link_speed == -1) 628 dev_link.link_speed = 0; 629 else 630 dev_link.link_speed = link_speed; 631 dev_link.link_duplex = ((edata.duplex == DUPLEX_HALF) ? 632 ETH_LINK_HALF_DUPLEX : ETH_LINK_FULL_DUPLEX); 633 dev_link.link_autoneg = !(dev->data->dev_conf.link_speeds & 634 ETH_LINK_SPEED_FIXED); 635 if (memcmp(&dev_link, &dev->data->dev_link, sizeof(dev_link))) { 636 /* Link status changed. */ 637 dev->data->dev_link = dev_link; 638 return 0; 639 } 640 /* Link status is still the same. */ 641 return -1; 642 } 643 644 /** 645 * DPDK callback to retrieve physical link information. 646 * 647 * @param dev 648 * Pointer to Ethernet device structure. 649 * @param wait_to_complete 650 * Wait for request completion (ignored). 651 */ 652 int 653 mlx5_link_update(struct rte_eth_dev *dev, int wait_to_complete) 654 { 655 struct priv *priv = mlx5_get_priv(dev); 656 int ret; 657 658 priv_lock(priv); 659 ret = mlx5_link_update_unlocked(dev, wait_to_complete); 660 priv_unlock(priv); 661 return ret; 662 } 663 664 /** 665 * DPDK callback to change the MTU. 666 * 667 * Setting the MTU affects hardware MRU (packets larger than the MTU cannot be 668 * received). Use this as a hint to enable/disable scattered packets support 669 * and improve performance when not needed. 670 * Since failure is not an option, reconfiguring queues on the fly is not 671 * recommended. 672 * 673 * @param dev 674 * Pointer to Ethernet device structure. 675 * @param in_mtu 676 * New MTU. 677 * 678 * @return 679 * 0 on success, negative errno value on failure. 680 */ 681 int 682 mlx5_dev_set_mtu(struct rte_eth_dev *dev, uint16_t mtu) 683 { 684 struct priv *priv = dev->data->dev_private; 685 int ret = 0; 686 unsigned int i; 687 uint16_t (*rx_func)(void *, struct rte_mbuf **, uint16_t) = 688 mlx5_rx_burst; 689 690 if (mlx5_is_secondary()) 691 return -E_RTE_SECONDARY; 692 693 priv_lock(priv); 694 /* Set kernel interface MTU first. */ 695 if (priv_set_mtu(priv, mtu)) { 696 ret = errno; 697 WARN("cannot set port %u MTU to %u: %s", priv->port, mtu, 698 strerror(ret)); 699 goto out; 700 } else 701 DEBUG("adapter port %u MTU set to %u", priv->port, mtu); 702 priv->mtu = mtu; 703 /* Temporarily replace RX handler with a fake one, assuming it has not 704 * been copied elsewhere. */ 705 dev->rx_pkt_burst = removed_rx_burst; 706 /* Make sure everyone has left mlx5_rx_burst() and uses 707 * removed_rx_burst() instead. */ 708 rte_wmb(); 709 usleep(1000); 710 /* Reconfigure each RX queue. */ 711 for (i = 0; (i != priv->rxqs_n); ++i) { 712 struct rxq *rxq = (*priv->rxqs)[i]; 713 unsigned int max_frame_len; 714 int sp; 715 716 if (rxq == NULL) 717 continue; 718 /* Calculate new maximum frame length according to MTU and 719 * toggle scattered support (sp) if necessary. */ 720 max_frame_len = (priv->mtu + ETHER_HDR_LEN + 721 (ETHER_MAX_VLAN_FRAME_LEN - ETHER_MAX_LEN)); 722 sp = (max_frame_len > (rxq->mb_len - RTE_PKTMBUF_HEADROOM)); 723 /* Provide new values to rxq_setup(). */ 724 dev->data->dev_conf.rxmode.jumbo_frame = sp; 725 dev->data->dev_conf.rxmode.max_rx_pkt_len = max_frame_len; 726 ret = rxq_rehash(dev, rxq); 727 if (ret) { 728 /* Force SP RX if that queue requires it and abort. */ 729 if (rxq->sp) 730 rx_func = mlx5_rx_burst_sp; 731 break; 732 } 733 /* Scattered burst function takes priority. */ 734 if (rxq->sp) 735 rx_func = mlx5_rx_burst_sp; 736 } 737 /* Burst functions can now be called again. */ 738 rte_wmb(); 739 dev->rx_pkt_burst = rx_func; 740 out: 741 priv_unlock(priv); 742 assert(ret >= 0); 743 return -ret; 744 } 745 746 /** 747 * DPDK callback to get flow control status. 748 * 749 * @param dev 750 * Pointer to Ethernet device structure. 751 * @param[out] fc_conf 752 * Flow control output buffer. 753 * 754 * @return 755 * 0 on success, negative errno value on failure. 756 */ 757 int 758 mlx5_dev_get_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf) 759 { 760 struct priv *priv = dev->data->dev_private; 761 struct ifreq ifr; 762 struct ethtool_pauseparam ethpause = { 763 .cmd = ETHTOOL_GPAUSEPARAM 764 }; 765 int ret; 766 767 if (mlx5_is_secondary()) 768 return -E_RTE_SECONDARY; 769 770 ifr.ifr_data = ðpause; 771 priv_lock(priv); 772 if (priv_ifreq(priv, SIOCETHTOOL, &ifr)) { 773 ret = errno; 774 WARN("ioctl(SIOCETHTOOL, ETHTOOL_GPAUSEPARAM)" 775 " failed: %s", 776 strerror(ret)); 777 goto out; 778 } 779 780 fc_conf->autoneg = ethpause.autoneg; 781 if (ethpause.rx_pause && ethpause.tx_pause) 782 fc_conf->mode = RTE_FC_FULL; 783 else if (ethpause.rx_pause) 784 fc_conf->mode = RTE_FC_RX_PAUSE; 785 else if (ethpause.tx_pause) 786 fc_conf->mode = RTE_FC_TX_PAUSE; 787 else 788 fc_conf->mode = RTE_FC_NONE; 789 ret = 0; 790 791 out: 792 priv_unlock(priv); 793 assert(ret >= 0); 794 return -ret; 795 } 796 797 /** 798 * DPDK callback to modify flow control parameters. 799 * 800 * @param dev 801 * Pointer to Ethernet device structure. 802 * @param[in] fc_conf 803 * Flow control parameters. 804 * 805 * @return 806 * 0 on success, negative errno value on failure. 807 */ 808 int 809 mlx5_dev_set_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf) 810 { 811 struct priv *priv = dev->data->dev_private; 812 struct ifreq ifr; 813 struct ethtool_pauseparam ethpause = { 814 .cmd = ETHTOOL_SPAUSEPARAM 815 }; 816 int ret; 817 818 if (mlx5_is_secondary()) 819 return -E_RTE_SECONDARY; 820 821 ifr.ifr_data = ðpause; 822 ethpause.autoneg = fc_conf->autoneg; 823 if (((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL) || 824 (fc_conf->mode & RTE_FC_RX_PAUSE)) 825 ethpause.rx_pause = 1; 826 else 827 ethpause.rx_pause = 0; 828 829 if (((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL) || 830 (fc_conf->mode & RTE_FC_TX_PAUSE)) 831 ethpause.tx_pause = 1; 832 else 833 ethpause.tx_pause = 0; 834 835 priv_lock(priv); 836 if (priv_ifreq(priv, SIOCETHTOOL, &ifr)) { 837 ret = errno; 838 WARN("ioctl(SIOCETHTOOL, ETHTOOL_SPAUSEPARAM)" 839 " failed: %s", 840 strerror(ret)); 841 goto out; 842 } 843 ret = 0; 844 845 out: 846 priv_unlock(priv); 847 assert(ret >= 0); 848 return -ret; 849 } 850 851 /** 852 * Get PCI information from struct ibv_device. 853 * 854 * @param device 855 * Pointer to Ethernet device structure. 856 * @param[out] pci_addr 857 * PCI bus address output buffer. 858 * 859 * @return 860 * 0 on success, -1 on failure and errno is set. 861 */ 862 int 863 mlx5_ibv_device_to_pci_addr(const struct ibv_device *device, 864 struct rte_pci_addr *pci_addr) 865 { 866 FILE *file; 867 char line[32]; 868 MKSTR(path, "%s/device/uevent", device->ibdev_path); 869 870 file = fopen(path, "rb"); 871 if (file == NULL) 872 return -1; 873 while (fgets(line, sizeof(line), file) == line) { 874 size_t len = strlen(line); 875 int ret; 876 877 /* Truncate long lines. */ 878 if (len == (sizeof(line) - 1)) 879 while (line[(len - 1)] != '\n') { 880 ret = fgetc(file); 881 if (ret == EOF) 882 break; 883 line[(len - 1)] = ret; 884 } 885 /* Extract information. */ 886 if (sscanf(line, 887 "PCI_SLOT_NAME=" 888 "%" SCNx16 ":%" SCNx8 ":%" SCNx8 ".%" SCNx8 "\n", 889 &pci_addr->domain, 890 &pci_addr->bus, 891 &pci_addr->devid, 892 &pci_addr->function) == 4) { 893 ret = 0; 894 break; 895 } 896 } 897 fclose(file); 898 return 0; 899 } 900 901 /** 902 * Link status handler. 903 * 904 * @param priv 905 * Pointer to private structure. 906 * @param dev 907 * Pointer to the rte_eth_dev structure. 908 * 909 * @return 910 * Nonzero if the callback process can be called immediately. 911 */ 912 static int 913 priv_dev_link_status_handler(struct priv *priv, struct rte_eth_dev *dev) 914 { 915 struct ibv_async_event event; 916 int port_change = 0; 917 int ret = 0; 918 919 /* Read all message and acknowledge them. */ 920 for (;;) { 921 if (ibv_get_async_event(priv->ctx, &event)) 922 break; 923 924 if (event.event_type == IBV_EVENT_PORT_ACTIVE || 925 event.event_type == IBV_EVENT_PORT_ERR) 926 port_change = 1; 927 else 928 DEBUG("event type %d on port %d not handled", 929 event.event_type, event.element.port_num); 930 ibv_ack_async_event(&event); 931 } 932 933 if (port_change ^ priv->pending_alarm) { 934 struct rte_eth_link *link = &dev->data->dev_link; 935 936 priv->pending_alarm = 0; 937 mlx5_link_update_unlocked(dev, 0); 938 if (((link->link_speed == 0) && link->link_status) || 939 ((link->link_speed != 0) && !link->link_status)) { 940 /* Inconsistent status, check again later. */ 941 priv->pending_alarm = 1; 942 rte_eal_alarm_set(MLX5_ALARM_TIMEOUT_US, 943 mlx5_dev_link_status_handler, 944 dev); 945 } else 946 ret = 1; 947 } 948 return ret; 949 } 950 951 /** 952 * Handle delayed link status event. 953 * 954 * @param arg 955 * Registered argument. 956 */ 957 void 958 mlx5_dev_link_status_handler(void *arg) 959 { 960 struct rte_eth_dev *dev = arg; 961 struct priv *priv = dev->data->dev_private; 962 int ret; 963 964 priv_lock(priv); 965 assert(priv->pending_alarm == 1); 966 ret = priv_dev_link_status_handler(priv, dev); 967 priv_unlock(priv); 968 if (ret) 969 _rte_eth_dev_callback_process(dev, RTE_ETH_EVENT_INTR_LSC); 970 } 971 972 /** 973 * Handle interrupts from the NIC. 974 * 975 * @param[in] intr_handle 976 * Interrupt handler. 977 * @param cb_arg 978 * Callback argument. 979 */ 980 void 981 mlx5_dev_interrupt_handler(struct rte_intr_handle *intr_handle, void *cb_arg) 982 { 983 struct rte_eth_dev *dev = cb_arg; 984 struct priv *priv = dev->data->dev_private; 985 int ret; 986 987 (void)intr_handle; 988 priv_lock(priv); 989 ret = priv_dev_link_status_handler(priv, dev); 990 priv_unlock(priv); 991 if (ret) 992 _rte_eth_dev_callback_process(dev, RTE_ETH_EVENT_INTR_LSC); 993 } 994 995 /** 996 * Uninstall interrupt handler. 997 * 998 * @param priv 999 * Pointer to private structure. 1000 * @param dev 1001 * Pointer to the rte_eth_dev structure. 1002 */ 1003 void 1004 priv_dev_interrupt_handler_uninstall(struct priv *priv, struct rte_eth_dev *dev) 1005 { 1006 if (!dev->data->dev_conf.intr_conf.lsc) 1007 return; 1008 rte_intr_callback_unregister(&priv->intr_handle, 1009 mlx5_dev_interrupt_handler, 1010 dev); 1011 if (priv->pending_alarm) 1012 rte_eal_alarm_cancel(mlx5_dev_link_status_handler, dev); 1013 priv->pending_alarm = 0; 1014 priv->intr_handle.fd = 0; 1015 priv->intr_handle.type = 0; 1016 } 1017 1018 /** 1019 * Install interrupt handler. 1020 * 1021 * @param priv 1022 * Pointer to private structure. 1023 * @param dev 1024 * Pointer to the rte_eth_dev structure. 1025 */ 1026 void 1027 priv_dev_interrupt_handler_install(struct priv *priv, struct rte_eth_dev *dev) 1028 { 1029 int rc, flags; 1030 1031 if (!dev->data->dev_conf.intr_conf.lsc) 1032 return; 1033 assert(priv->ctx->async_fd > 0); 1034 flags = fcntl(priv->ctx->async_fd, F_GETFL); 1035 rc = fcntl(priv->ctx->async_fd, F_SETFL, flags | O_NONBLOCK); 1036 if (rc < 0) { 1037 INFO("failed to change file descriptor async event queue"); 1038 dev->data->dev_conf.intr_conf.lsc = 0; 1039 } else { 1040 priv->intr_handle.fd = priv->ctx->async_fd; 1041 priv->intr_handle.type = RTE_INTR_HANDLE_EXT; 1042 rte_intr_callback_register(&priv->intr_handle, 1043 mlx5_dev_interrupt_handler, 1044 dev); 1045 } 1046 } 1047 1048 /** 1049 * Change the link state (UP / DOWN). 1050 * 1051 * @param dev 1052 * Pointer to Ethernet device structure. 1053 * @param up 1054 * Nonzero for link up, otherwise link down. 1055 * 1056 * @return 1057 * 0 on success, errno value on failure. 1058 */ 1059 static int 1060 priv_set_link(struct priv *priv, int up) 1061 { 1062 struct rte_eth_dev *dev = priv->dev; 1063 int err; 1064 unsigned int i; 1065 1066 if (up) { 1067 err = priv_set_flags(priv, ~IFF_UP, IFF_UP); 1068 if (err) 1069 return err; 1070 for (i = 0; i < priv->rxqs_n; i++) 1071 if ((*priv->rxqs)[i]->sp) 1072 break; 1073 /* Check if an sp queue exists. 1074 * Note: Some old frames might be received. 1075 */ 1076 if (i == priv->rxqs_n) 1077 dev->rx_pkt_burst = mlx5_rx_burst; 1078 else 1079 dev->rx_pkt_burst = mlx5_rx_burst_sp; 1080 dev->tx_pkt_burst = mlx5_tx_burst; 1081 } else { 1082 err = priv_set_flags(priv, ~IFF_UP, ~IFF_UP); 1083 if (err) 1084 return err; 1085 dev->rx_pkt_burst = removed_rx_burst; 1086 dev->tx_pkt_burst = removed_tx_burst; 1087 } 1088 return 0; 1089 } 1090 1091 /** 1092 * DPDK callback to bring the link DOWN. 1093 * 1094 * @param dev 1095 * Pointer to Ethernet device structure. 1096 * 1097 * @return 1098 * 0 on success, errno value on failure. 1099 */ 1100 int 1101 mlx5_set_link_down(struct rte_eth_dev *dev) 1102 { 1103 struct priv *priv = dev->data->dev_private; 1104 int err; 1105 1106 priv_lock(priv); 1107 err = priv_set_link(priv, 0); 1108 priv_unlock(priv); 1109 return err; 1110 } 1111 1112 /** 1113 * DPDK callback to bring the link UP. 1114 * 1115 * @param dev 1116 * Pointer to Ethernet device structure. 1117 * 1118 * @return 1119 * 0 on success, errno value on failure. 1120 */ 1121 int 1122 mlx5_set_link_up(struct rte_eth_dev *dev) 1123 { 1124 struct priv *priv = dev->data->dev_private; 1125 int err; 1126 1127 priv_lock(priv); 1128 err = priv_set_link(priv, 1); 1129 priv_unlock(priv); 1130 return err; 1131 } 1132 1133 /** 1134 * Configure secondary process queues from a private data pointer (primary 1135 * or secondary) and update burst callbacks. Can take place only once. 1136 * 1137 * All queues must have been previously created by the primary process to 1138 * avoid undefined behavior. 1139 * 1140 * @param priv 1141 * Private data pointer from either primary or secondary process. 1142 * 1143 * @return 1144 * Private data pointer from secondary process, NULL in case of error. 1145 */ 1146 struct priv * 1147 mlx5_secondary_data_setup(struct priv *priv) 1148 { 1149 unsigned int port_id = 0; 1150 struct mlx5_secondary_data *sd; 1151 void **tx_queues; 1152 void **rx_queues; 1153 unsigned int nb_tx_queues; 1154 unsigned int nb_rx_queues; 1155 unsigned int i; 1156 1157 /* priv must be valid at this point. */ 1158 assert(priv != NULL); 1159 /* priv->dev must also be valid but may point to local memory from 1160 * another process, possibly with the same address and must not 1161 * be dereferenced yet. */ 1162 assert(priv->dev != NULL); 1163 /* Determine port ID by finding out where priv comes from. */ 1164 while (1) { 1165 sd = &mlx5_secondary_data[port_id]; 1166 rte_spinlock_lock(&sd->lock); 1167 /* Primary process? */ 1168 if (sd->primary_priv == priv) 1169 break; 1170 /* Secondary process? */ 1171 if (sd->data.dev_private == priv) 1172 break; 1173 rte_spinlock_unlock(&sd->lock); 1174 if (++port_id == RTE_DIM(mlx5_secondary_data)) 1175 port_id = 0; 1176 } 1177 /* Switch to secondary private structure. If private data has already 1178 * been updated by another thread, there is nothing else to do. */ 1179 priv = sd->data.dev_private; 1180 if (priv->dev->data == &sd->data) 1181 goto end; 1182 /* Sanity checks. Secondary private structure is supposed to point 1183 * to local eth_dev, itself still pointing to the shared device data 1184 * structure allocated by the primary process. */ 1185 assert(sd->shared_dev_data != &sd->data); 1186 assert(sd->data.nb_tx_queues == 0); 1187 assert(sd->data.tx_queues == NULL); 1188 assert(sd->data.nb_rx_queues == 0); 1189 assert(sd->data.rx_queues == NULL); 1190 assert(priv != sd->primary_priv); 1191 assert(priv->dev->data == sd->shared_dev_data); 1192 assert(priv->txqs_n == 0); 1193 assert(priv->txqs == NULL); 1194 assert(priv->rxqs_n == 0); 1195 assert(priv->rxqs == NULL); 1196 nb_tx_queues = sd->shared_dev_data->nb_tx_queues; 1197 nb_rx_queues = sd->shared_dev_data->nb_rx_queues; 1198 /* Allocate local storage for queues. */ 1199 tx_queues = rte_zmalloc("secondary ethdev->tx_queues", 1200 sizeof(sd->data.tx_queues[0]) * nb_tx_queues, 1201 RTE_CACHE_LINE_SIZE); 1202 rx_queues = rte_zmalloc("secondary ethdev->rx_queues", 1203 sizeof(sd->data.rx_queues[0]) * nb_rx_queues, 1204 RTE_CACHE_LINE_SIZE); 1205 if (tx_queues == NULL || rx_queues == NULL) 1206 goto error; 1207 /* Lock to prevent control operations during setup. */ 1208 priv_lock(priv); 1209 /* TX queues. */ 1210 for (i = 0; i != nb_tx_queues; ++i) { 1211 struct txq *primary_txq = (*sd->primary_priv->txqs)[i]; 1212 struct txq *txq; 1213 1214 if (primary_txq == NULL) 1215 continue; 1216 txq = rte_calloc_socket("TXQ", 1, sizeof(*txq), 0, 1217 primary_txq->socket); 1218 if (txq != NULL) { 1219 if (txq_setup(priv->dev, 1220 txq, 1221 primary_txq->elts_n * MLX5_PMD_SGE_WR_N, 1222 primary_txq->socket, 1223 NULL) == 0) { 1224 txq->stats.idx = primary_txq->stats.idx; 1225 tx_queues[i] = txq; 1226 continue; 1227 } 1228 rte_free(txq); 1229 } 1230 while (i) { 1231 txq = tx_queues[--i]; 1232 txq_cleanup(txq); 1233 rte_free(txq); 1234 } 1235 goto error; 1236 } 1237 /* RX queues. */ 1238 for (i = 0; i != nb_rx_queues; ++i) { 1239 struct rxq *primary_rxq = (*sd->primary_priv->rxqs)[i]; 1240 1241 if (primary_rxq == NULL) 1242 continue; 1243 /* Not supported yet. */ 1244 rx_queues[i] = NULL; 1245 } 1246 /* Update everything. */ 1247 priv->txqs = (void *)tx_queues; 1248 priv->txqs_n = nb_tx_queues; 1249 priv->rxqs = (void *)rx_queues; 1250 priv->rxqs_n = nb_rx_queues; 1251 sd->data.rx_queues = rx_queues; 1252 sd->data.tx_queues = tx_queues; 1253 sd->data.nb_rx_queues = nb_rx_queues; 1254 sd->data.nb_tx_queues = nb_tx_queues; 1255 sd->data.dev_link = sd->shared_dev_data->dev_link; 1256 sd->data.mtu = sd->shared_dev_data->mtu; 1257 memcpy(sd->data.rx_queue_state, sd->shared_dev_data->rx_queue_state, 1258 sizeof(sd->data.rx_queue_state)); 1259 memcpy(sd->data.tx_queue_state, sd->shared_dev_data->tx_queue_state, 1260 sizeof(sd->data.tx_queue_state)); 1261 sd->data.dev_flags = sd->shared_dev_data->dev_flags; 1262 /* Use local data from now on. */ 1263 rte_mb(); 1264 priv->dev->data = &sd->data; 1265 rte_mb(); 1266 priv->dev->tx_pkt_burst = mlx5_tx_burst; 1267 priv->dev->rx_pkt_burst = removed_rx_burst; 1268 priv_unlock(priv); 1269 end: 1270 /* More sanity checks. */ 1271 assert(priv->dev->tx_pkt_burst == mlx5_tx_burst); 1272 assert(priv->dev->rx_pkt_burst == removed_rx_burst); 1273 assert(priv->dev->data == &sd->data); 1274 rte_spinlock_unlock(&sd->lock); 1275 return priv; 1276 error: 1277 priv_unlock(priv); 1278 rte_free(tx_queues); 1279 rte_free(rx_queues); 1280 rte_spinlock_unlock(&sd->lock); 1281 return NULL; 1282 } 1283