1 /*- 2 * BSD LICENSE 3 * 4 * Copyright 2015 6WIND S.A. 5 * Copyright 2015 Mellanox. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * * Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * * Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * * Neither the name of 6WIND S.A. nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 */ 33 34 #include <stddef.h> 35 #include <assert.h> 36 #include <unistd.h> 37 #include <stdint.h> 38 #include <stdio.h> 39 #include <string.h> 40 #include <stdlib.h> 41 #include <errno.h> 42 #include <dirent.h> 43 #include <net/if.h> 44 #include <sys/ioctl.h> 45 #include <sys/socket.h> 46 #include <netinet/in.h> 47 #include <linux/if.h> 48 #include <linux/ethtool.h> 49 #include <linux/sockios.h> 50 #include <fcntl.h> 51 52 /* DPDK headers don't like -pedantic. */ 53 #ifdef PEDANTIC 54 #pragma GCC diagnostic ignored "-pedantic" 55 #endif 56 #include <rte_atomic.h> 57 #include <rte_ethdev.h> 58 #include <rte_mbuf.h> 59 #include <rte_common.h> 60 #include <rte_interrupts.h> 61 #include <rte_alarm.h> 62 #include <rte_malloc.h> 63 #ifdef PEDANTIC 64 #pragma GCC diagnostic error "-pedantic" 65 #endif 66 67 #include "mlx5.h" 68 #include "mlx5_rxtx.h" 69 #include "mlx5_utils.h" 70 71 /** 72 * Return private structure associated with an Ethernet device. 73 * 74 * @param dev 75 * Pointer to Ethernet device structure. 76 * 77 * @return 78 * Pointer to private structure. 79 */ 80 struct priv * 81 mlx5_get_priv(struct rte_eth_dev *dev) 82 { 83 struct mlx5_secondary_data *sd; 84 85 if (!mlx5_is_secondary()) 86 return dev->data->dev_private; 87 sd = &mlx5_secondary_data[dev->data->port_id]; 88 return sd->data.dev_private; 89 } 90 91 /** 92 * Check if running as a secondary process. 93 * 94 * @return 95 * Nonzero if running as a secondary process. 96 */ 97 inline int 98 mlx5_is_secondary(void) 99 { 100 return rte_eal_process_type() != RTE_PROC_PRIMARY; 101 } 102 103 /** 104 * Get interface name from private structure. 105 * 106 * @param[in] priv 107 * Pointer to private structure. 108 * @param[out] ifname 109 * Interface name output buffer. 110 * 111 * @return 112 * 0 on success, -1 on failure and errno is set. 113 */ 114 int 115 priv_get_ifname(const struct priv *priv, char (*ifname)[IF_NAMESIZE]) 116 { 117 DIR *dir; 118 struct dirent *dent; 119 unsigned int dev_type = 0; 120 unsigned int dev_port_prev = ~0u; 121 char match[IF_NAMESIZE] = ""; 122 123 { 124 MKSTR(path, "%s/device/net", priv->ctx->device->ibdev_path); 125 126 dir = opendir(path); 127 if (dir == NULL) 128 return -1; 129 } 130 while ((dent = readdir(dir)) != NULL) { 131 char *name = dent->d_name; 132 FILE *file; 133 unsigned int dev_port; 134 int r; 135 136 if ((name[0] == '.') && 137 ((name[1] == '\0') || 138 ((name[1] == '.') && (name[2] == '\0')))) 139 continue; 140 141 MKSTR(path, "%s/device/net/%s/%s", 142 priv->ctx->device->ibdev_path, name, 143 (dev_type ? "dev_id" : "dev_port")); 144 145 file = fopen(path, "rb"); 146 if (file == NULL) { 147 if (errno != ENOENT) 148 continue; 149 /* 150 * Switch to dev_id when dev_port does not exist as 151 * is the case with Linux kernel versions < 3.15. 152 */ 153 try_dev_id: 154 match[0] = '\0'; 155 if (dev_type) 156 break; 157 dev_type = 1; 158 dev_port_prev = ~0u; 159 rewinddir(dir); 160 continue; 161 } 162 r = fscanf(file, (dev_type ? "%x" : "%u"), &dev_port); 163 fclose(file); 164 if (r != 1) 165 continue; 166 /* 167 * Switch to dev_id when dev_port returns the same value for 168 * all ports. May happen when using a MOFED release older than 169 * 3.0 with a Linux kernel >= 3.15. 170 */ 171 if (dev_port == dev_port_prev) 172 goto try_dev_id; 173 dev_port_prev = dev_port; 174 if (dev_port == (priv->port - 1u)) 175 snprintf(match, sizeof(match), "%s", name); 176 } 177 closedir(dir); 178 if (match[0] == '\0') 179 return -1; 180 strncpy(*ifname, match, sizeof(*ifname)); 181 return 0; 182 } 183 184 /** 185 * Read from sysfs entry. 186 * 187 * @param[in] priv 188 * Pointer to private structure. 189 * @param[in] entry 190 * Entry name relative to sysfs path. 191 * @param[out] buf 192 * Data output buffer. 193 * @param size 194 * Buffer size. 195 * 196 * @return 197 * 0 on success, -1 on failure and errno is set. 198 */ 199 static int 200 priv_sysfs_read(const struct priv *priv, const char *entry, 201 char *buf, size_t size) 202 { 203 char ifname[IF_NAMESIZE]; 204 FILE *file; 205 int ret; 206 int err; 207 208 if (priv_get_ifname(priv, &ifname)) 209 return -1; 210 211 MKSTR(path, "%s/device/net/%s/%s", priv->ctx->device->ibdev_path, 212 ifname, entry); 213 214 file = fopen(path, "rb"); 215 if (file == NULL) 216 return -1; 217 ret = fread(buf, 1, size, file); 218 err = errno; 219 if (((size_t)ret < size) && (ferror(file))) 220 ret = -1; 221 else 222 ret = size; 223 fclose(file); 224 errno = err; 225 return ret; 226 } 227 228 /** 229 * Write to sysfs entry. 230 * 231 * @param[in] priv 232 * Pointer to private structure. 233 * @param[in] entry 234 * Entry name relative to sysfs path. 235 * @param[in] buf 236 * Data buffer. 237 * @param size 238 * Buffer size. 239 * 240 * @return 241 * 0 on success, -1 on failure and errno is set. 242 */ 243 static int 244 priv_sysfs_write(const struct priv *priv, const char *entry, 245 char *buf, size_t size) 246 { 247 char ifname[IF_NAMESIZE]; 248 FILE *file; 249 int ret; 250 int err; 251 252 if (priv_get_ifname(priv, &ifname)) 253 return -1; 254 255 MKSTR(path, "%s/device/net/%s/%s", priv->ctx->device->ibdev_path, 256 ifname, entry); 257 258 file = fopen(path, "wb"); 259 if (file == NULL) 260 return -1; 261 ret = fwrite(buf, 1, size, file); 262 err = errno; 263 if (((size_t)ret < size) || (ferror(file))) 264 ret = -1; 265 else 266 ret = size; 267 fclose(file); 268 errno = err; 269 return ret; 270 } 271 272 /** 273 * Get unsigned long sysfs property. 274 * 275 * @param priv 276 * Pointer to private structure. 277 * @param[in] name 278 * Entry name relative to sysfs path. 279 * @param[out] value 280 * Value output buffer. 281 * 282 * @return 283 * 0 on success, -1 on failure and errno is set. 284 */ 285 static int 286 priv_get_sysfs_ulong(struct priv *priv, const char *name, unsigned long *value) 287 { 288 int ret; 289 unsigned long value_ret; 290 char value_str[32]; 291 292 ret = priv_sysfs_read(priv, name, value_str, (sizeof(value_str) - 1)); 293 if (ret == -1) { 294 DEBUG("cannot read %s value from sysfs: %s", 295 name, strerror(errno)); 296 return -1; 297 } 298 value_str[ret] = '\0'; 299 errno = 0; 300 value_ret = strtoul(value_str, NULL, 0); 301 if (errno) { 302 DEBUG("invalid %s value `%s': %s", name, value_str, 303 strerror(errno)); 304 return -1; 305 } 306 *value = value_ret; 307 return 0; 308 } 309 310 /** 311 * Set unsigned long sysfs property. 312 * 313 * @param priv 314 * Pointer to private structure. 315 * @param[in] name 316 * Entry name relative to sysfs path. 317 * @param value 318 * Value to set. 319 * 320 * @return 321 * 0 on success, -1 on failure and errno is set. 322 */ 323 static int 324 priv_set_sysfs_ulong(struct priv *priv, const char *name, unsigned long value) 325 { 326 int ret; 327 MKSTR(value_str, "%lu", value); 328 329 ret = priv_sysfs_write(priv, name, value_str, (sizeof(value_str) - 1)); 330 if (ret == -1) { 331 DEBUG("cannot write %s `%s' (%lu) to sysfs: %s", 332 name, value_str, value, strerror(errno)); 333 return -1; 334 } 335 return 0; 336 } 337 338 /** 339 * Perform ifreq ioctl() on associated Ethernet device. 340 * 341 * @param[in] priv 342 * Pointer to private structure. 343 * @param req 344 * Request number to pass to ioctl(). 345 * @param[out] ifr 346 * Interface request structure output buffer. 347 * 348 * @return 349 * 0 on success, -1 on failure and errno is set. 350 */ 351 int 352 priv_ifreq(const struct priv *priv, int req, struct ifreq *ifr) 353 { 354 int sock = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP); 355 int ret = -1; 356 357 if (sock == -1) 358 return ret; 359 if (priv_get_ifname(priv, &ifr->ifr_name) == 0) 360 ret = ioctl(sock, req, ifr); 361 close(sock); 362 return ret; 363 } 364 365 /** 366 * Get device MTU. 367 * 368 * @param priv 369 * Pointer to private structure. 370 * @param[out] mtu 371 * MTU value output buffer. 372 * 373 * @return 374 * 0 on success, -1 on failure and errno is set. 375 */ 376 int 377 priv_get_mtu(struct priv *priv, uint16_t *mtu) 378 { 379 unsigned long ulong_mtu; 380 381 if (priv_get_sysfs_ulong(priv, "mtu", &ulong_mtu) == -1) 382 return -1; 383 *mtu = ulong_mtu; 384 return 0; 385 } 386 387 /** 388 * Set device MTU. 389 * 390 * @param priv 391 * Pointer to private structure. 392 * @param mtu 393 * MTU value to set. 394 * 395 * @return 396 * 0 on success, -1 on failure and errno is set. 397 */ 398 static int 399 priv_set_mtu(struct priv *priv, uint16_t mtu) 400 { 401 return priv_set_sysfs_ulong(priv, "mtu", mtu); 402 } 403 404 /** 405 * Set device flags. 406 * 407 * @param priv 408 * Pointer to private structure. 409 * @param keep 410 * Bitmask for flags that must remain untouched. 411 * @param flags 412 * Bitmask for flags to modify. 413 * 414 * @return 415 * 0 on success, -1 on failure and errno is set. 416 */ 417 int 418 priv_set_flags(struct priv *priv, unsigned int keep, unsigned int flags) 419 { 420 unsigned long tmp; 421 422 if (priv_get_sysfs_ulong(priv, "flags", &tmp) == -1) 423 return -1; 424 tmp &= keep; 425 tmp |= flags; 426 return priv_set_sysfs_ulong(priv, "flags", tmp); 427 } 428 429 /** 430 * Ethernet device configuration. 431 * 432 * Prepare the driver for a given number of TX and RX queues. 433 * 434 * @param dev 435 * Pointer to Ethernet device structure. 436 * 437 * @return 438 * 0 on success, errno value on failure. 439 */ 440 static int 441 dev_configure(struct rte_eth_dev *dev) 442 { 443 struct priv *priv = dev->data->dev_private; 444 unsigned int rxqs_n = dev->data->nb_rx_queues; 445 unsigned int txqs_n = dev->data->nb_tx_queues; 446 unsigned int i; 447 unsigned int j; 448 unsigned int reta_idx_n; 449 450 priv->rxqs = (void *)dev->data->rx_queues; 451 priv->txqs = (void *)dev->data->tx_queues; 452 if (txqs_n != priv->txqs_n) { 453 INFO("%p: TX queues number update: %u -> %u", 454 (void *)dev, priv->txqs_n, txqs_n); 455 priv->txqs_n = txqs_n; 456 } 457 if (rxqs_n > priv->ind_table_max_size) { 458 ERROR("cannot handle this many RX queues (%u)", rxqs_n); 459 return EINVAL; 460 } 461 if (rxqs_n == priv->rxqs_n) 462 return 0; 463 INFO("%p: RX queues number update: %u -> %u", 464 (void *)dev, priv->rxqs_n, rxqs_n); 465 priv->rxqs_n = rxqs_n; 466 /* If the requested number of RX queues is not a power of two, use the 467 * maximum indirection table size for better balancing. 468 * The result is always rounded to the next power of two. */ 469 reta_idx_n = (1 << log2above((rxqs_n & (rxqs_n - 1)) ? 470 priv->ind_table_max_size : 471 rxqs_n)); 472 if (priv_rss_reta_index_resize(priv, reta_idx_n)) 473 return ENOMEM; 474 /* When the number of RX queues is not a power of two, the remaining 475 * table entries are padded with reused WQs and hashes are not spread 476 * uniformly. */ 477 for (i = 0, j = 0; (i != reta_idx_n); ++i) { 478 (*priv->reta_idx)[i] = j; 479 if (++j == rxqs_n) 480 j = 0; 481 } 482 return 0; 483 } 484 485 /** 486 * DPDK callback for Ethernet device configuration. 487 * 488 * @param dev 489 * Pointer to Ethernet device structure. 490 * 491 * @return 492 * 0 on success, negative errno value on failure. 493 */ 494 int 495 mlx5_dev_configure(struct rte_eth_dev *dev) 496 { 497 struct priv *priv = dev->data->dev_private; 498 int ret; 499 500 if (mlx5_is_secondary()) 501 return -E_RTE_SECONDARY; 502 503 priv_lock(priv); 504 ret = dev_configure(dev); 505 assert(ret >= 0); 506 priv_unlock(priv); 507 return -ret; 508 } 509 510 /** 511 * DPDK callback to get information about the device. 512 * 513 * @param dev 514 * Pointer to Ethernet device structure. 515 * @param[out] info 516 * Info structure output buffer. 517 */ 518 void 519 mlx5_dev_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *info) 520 { 521 struct priv *priv = mlx5_get_priv(dev); 522 unsigned int max; 523 char ifname[IF_NAMESIZE]; 524 525 priv_lock(priv); 526 /* FIXME: we should ask the device for these values. */ 527 info->min_rx_bufsize = 32; 528 info->max_rx_pktlen = 65536; 529 /* 530 * Since we need one CQ per QP, the limit is the minimum number 531 * between the two values. 532 */ 533 max = ((priv->device_attr.max_cq > priv->device_attr.max_qp) ? 534 priv->device_attr.max_qp : priv->device_attr.max_cq); 535 /* If max >= 65535 then max = 0, max_rx_queues is uint16_t. */ 536 if (max >= 65535) 537 max = 65535; 538 info->max_rx_queues = max; 539 info->max_tx_queues = max; 540 info->max_mac_addrs = RTE_DIM(priv->mac); 541 info->rx_offload_capa = 542 (priv->hw_csum ? 543 (DEV_RX_OFFLOAD_IPV4_CKSUM | 544 DEV_RX_OFFLOAD_UDP_CKSUM | 545 DEV_RX_OFFLOAD_TCP_CKSUM) : 546 0); 547 info->tx_offload_capa = DEV_TX_OFFLOAD_VLAN_INSERT; 548 if (priv->hw_csum) 549 info->tx_offload_capa |= 550 (DEV_TX_OFFLOAD_IPV4_CKSUM | 551 DEV_TX_OFFLOAD_UDP_CKSUM | 552 DEV_TX_OFFLOAD_TCP_CKSUM); 553 if (priv_get_ifname(priv, &ifname) == 0) 554 info->if_index = if_nametoindex(ifname); 555 /* FIXME: RETA update/query API expects the callee to know the size of 556 * the indirection table, for this PMD the size varies depending on 557 * the number of RX queues, it becomes impossible to find the correct 558 * size if it is not fixed. 559 * The API should be updated to solve this problem. */ 560 info->reta_size = priv->ind_table_max_size; 561 priv_unlock(priv); 562 } 563 564 const uint32_t * 565 mlx5_dev_supported_ptypes_get(struct rte_eth_dev *dev) 566 { 567 static const uint32_t ptypes[] = { 568 /* refers to rxq_cq_to_pkt_type() */ 569 RTE_PTYPE_L3_IPV4, 570 RTE_PTYPE_L3_IPV6, 571 RTE_PTYPE_INNER_L3_IPV4, 572 RTE_PTYPE_INNER_L3_IPV6, 573 RTE_PTYPE_UNKNOWN 574 575 }; 576 577 if (dev->rx_pkt_burst == mlx5_rx_burst || 578 dev->rx_pkt_burst == mlx5_rx_burst_sp) 579 return ptypes; 580 return NULL; 581 } 582 583 /** 584 * DPDK callback to retrieve physical link information (unlocked version). 585 * 586 * @param dev 587 * Pointer to Ethernet device structure. 588 * @param wait_to_complete 589 * Wait for request completion (ignored). 590 */ 591 static int 592 mlx5_link_update_unlocked(struct rte_eth_dev *dev, int wait_to_complete) 593 { 594 struct priv *priv = mlx5_get_priv(dev); 595 struct ethtool_cmd edata = { 596 .cmd = ETHTOOL_GSET 597 }; 598 struct ifreq ifr; 599 struct rte_eth_link dev_link; 600 int link_speed = 0; 601 602 (void)wait_to_complete; 603 if (priv_ifreq(priv, SIOCGIFFLAGS, &ifr)) { 604 WARN("ioctl(SIOCGIFFLAGS) failed: %s", strerror(errno)); 605 return -1; 606 } 607 memset(&dev_link, 0, sizeof(dev_link)); 608 dev_link.link_status = ((ifr.ifr_flags & IFF_UP) && 609 (ifr.ifr_flags & IFF_RUNNING)); 610 ifr.ifr_data = &edata; 611 if (priv_ifreq(priv, SIOCETHTOOL, &ifr)) { 612 WARN("ioctl(SIOCETHTOOL, ETHTOOL_GSET) failed: %s", 613 strerror(errno)); 614 return -1; 615 } 616 link_speed = ethtool_cmd_speed(&edata); 617 if (link_speed == -1) 618 dev_link.link_speed = 0; 619 else 620 dev_link.link_speed = link_speed; 621 dev_link.link_duplex = ((edata.duplex == DUPLEX_HALF) ? 622 ETH_LINK_HALF_DUPLEX : ETH_LINK_FULL_DUPLEX); 623 if (memcmp(&dev_link, &dev->data->dev_link, sizeof(dev_link))) { 624 /* Link status changed. */ 625 dev->data->dev_link = dev_link; 626 return 0; 627 } 628 /* Link status is still the same. */ 629 return -1; 630 } 631 632 /** 633 * DPDK callback to retrieve physical link information. 634 * 635 * @param dev 636 * Pointer to Ethernet device structure. 637 * @param wait_to_complete 638 * Wait for request completion (ignored). 639 */ 640 int 641 mlx5_link_update(struct rte_eth_dev *dev, int wait_to_complete) 642 { 643 struct priv *priv = mlx5_get_priv(dev); 644 int ret; 645 646 priv_lock(priv); 647 ret = mlx5_link_update_unlocked(dev, wait_to_complete); 648 priv_unlock(priv); 649 return ret; 650 } 651 652 /** 653 * DPDK callback to change the MTU. 654 * 655 * Setting the MTU affects hardware MRU (packets larger than the MTU cannot be 656 * received). Use this as a hint to enable/disable scattered packets support 657 * and improve performance when not needed. 658 * Since failure is not an option, reconfiguring queues on the fly is not 659 * recommended. 660 * 661 * @param dev 662 * Pointer to Ethernet device structure. 663 * @param in_mtu 664 * New MTU. 665 * 666 * @return 667 * 0 on success, negative errno value on failure. 668 */ 669 int 670 mlx5_dev_set_mtu(struct rte_eth_dev *dev, uint16_t mtu) 671 { 672 struct priv *priv = dev->data->dev_private; 673 int ret = 0; 674 unsigned int i; 675 uint16_t (*rx_func)(void *, struct rte_mbuf **, uint16_t) = 676 mlx5_rx_burst; 677 678 if (mlx5_is_secondary()) 679 return -E_RTE_SECONDARY; 680 681 priv_lock(priv); 682 /* Set kernel interface MTU first. */ 683 if (priv_set_mtu(priv, mtu)) { 684 ret = errno; 685 WARN("cannot set port %u MTU to %u: %s", priv->port, mtu, 686 strerror(ret)); 687 goto out; 688 } else 689 DEBUG("adapter port %u MTU set to %u", priv->port, mtu); 690 priv->mtu = mtu; 691 /* Temporarily replace RX handler with a fake one, assuming it has not 692 * been copied elsewhere. */ 693 dev->rx_pkt_burst = removed_rx_burst; 694 /* Make sure everyone has left mlx5_rx_burst() and uses 695 * removed_rx_burst() instead. */ 696 rte_wmb(); 697 usleep(1000); 698 /* Reconfigure each RX queue. */ 699 for (i = 0; (i != priv->rxqs_n); ++i) { 700 struct rxq *rxq = (*priv->rxqs)[i]; 701 unsigned int max_frame_len; 702 int sp; 703 704 if (rxq == NULL) 705 continue; 706 /* Calculate new maximum frame length according to MTU and 707 * toggle scattered support (sp) if necessary. */ 708 max_frame_len = (priv->mtu + ETHER_HDR_LEN + 709 (ETHER_MAX_VLAN_FRAME_LEN - ETHER_MAX_LEN)); 710 sp = (max_frame_len > (rxq->mb_len - RTE_PKTMBUF_HEADROOM)); 711 /* Provide new values to rxq_setup(). */ 712 dev->data->dev_conf.rxmode.jumbo_frame = sp; 713 dev->data->dev_conf.rxmode.max_rx_pkt_len = max_frame_len; 714 ret = rxq_rehash(dev, rxq); 715 if (ret) { 716 /* Force SP RX if that queue requires it and abort. */ 717 if (rxq->sp) 718 rx_func = mlx5_rx_burst_sp; 719 break; 720 } 721 /* Scattered burst function takes priority. */ 722 if (rxq->sp) 723 rx_func = mlx5_rx_burst_sp; 724 } 725 /* Burst functions can now be called again. */ 726 rte_wmb(); 727 dev->rx_pkt_burst = rx_func; 728 out: 729 priv_unlock(priv); 730 assert(ret >= 0); 731 return -ret; 732 } 733 734 /** 735 * DPDK callback to get flow control status. 736 * 737 * @param dev 738 * Pointer to Ethernet device structure. 739 * @param[out] fc_conf 740 * Flow control output buffer. 741 * 742 * @return 743 * 0 on success, negative errno value on failure. 744 */ 745 int 746 mlx5_dev_get_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf) 747 { 748 struct priv *priv = dev->data->dev_private; 749 struct ifreq ifr; 750 struct ethtool_pauseparam ethpause = { 751 .cmd = ETHTOOL_GPAUSEPARAM 752 }; 753 int ret; 754 755 if (mlx5_is_secondary()) 756 return -E_RTE_SECONDARY; 757 758 ifr.ifr_data = ðpause; 759 priv_lock(priv); 760 if (priv_ifreq(priv, SIOCETHTOOL, &ifr)) { 761 ret = errno; 762 WARN("ioctl(SIOCETHTOOL, ETHTOOL_GPAUSEPARAM)" 763 " failed: %s", 764 strerror(ret)); 765 goto out; 766 } 767 768 fc_conf->autoneg = ethpause.autoneg; 769 if (ethpause.rx_pause && ethpause.tx_pause) 770 fc_conf->mode = RTE_FC_FULL; 771 else if (ethpause.rx_pause) 772 fc_conf->mode = RTE_FC_RX_PAUSE; 773 else if (ethpause.tx_pause) 774 fc_conf->mode = RTE_FC_TX_PAUSE; 775 else 776 fc_conf->mode = RTE_FC_NONE; 777 ret = 0; 778 779 out: 780 priv_unlock(priv); 781 assert(ret >= 0); 782 return -ret; 783 } 784 785 /** 786 * DPDK callback to modify flow control parameters. 787 * 788 * @param dev 789 * Pointer to Ethernet device structure. 790 * @param[in] fc_conf 791 * Flow control parameters. 792 * 793 * @return 794 * 0 on success, negative errno value on failure. 795 */ 796 int 797 mlx5_dev_set_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf) 798 { 799 struct priv *priv = dev->data->dev_private; 800 struct ifreq ifr; 801 struct ethtool_pauseparam ethpause = { 802 .cmd = ETHTOOL_SPAUSEPARAM 803 }; 804 int ret; 805 806 if (mlx5_is_secondary()) 807 return -E_RTE_SECONDARY; 808 809 ifr.ifr_data = ðpause; 810 ethpause.autoneg = fc_conf->autoneg; 811 if (((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL) || 812 (fc_conf->mode & RTE_FC_RX_PAUSE)) 813 ethpause.rx_pause = 1; 814 else 815 ethpause.rx_pause = 0; 816 817 if (((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL) || 818 (fc_conf->mode & RTE_FC_TX_PAUSE)) 819 ethpause.tx_pause = 1; 820 else 821 ethpause.tx_pause = 0; 822 823 priv_lock(priv); 824 if (priv_ifreq(priv, SIOCETHTOOL, &ifr)) { 825 ret = errno; 826 WARN("ioctl(SIOCETHTOOL, ETHTOOL_SPAUSEPARAM)" 827 " failed: %s", 828 strerror(ret)); 829 goto out; 830 } 831 ret = 0; 832 833 out: 834 priv_unlock(priv); 835 assert(ret >= 0); 836 return -ret; 837 } 838 839 /** 840 * Get PCI information from struct ibv_device. 841 * 842 * @param device 843 * Pointer to Ethernet device structure. 844 * @param[out] pci_addr 845 * PCI bus address output buffer. 846 * 847 * @return 848 * 0 on success, -1 on failure and errno is set. 849 */ 850 int 851 mlx5_ibv_device_to_pci_addr(const struct ibv_device *device, 852 struct rte_pci_addr *pci_addr) 853 { 854 FILE *file; 855 char line[32]; 856 MKSTR(path, "%s/device/uevent", device->ibdev_path); 857 858 file = fopen(path, "rb"); 859 if (file == NULL) 860 return -1; 861 while (fgets(line, sizeof(line), file) == line) { 862 size_t len = strlen(line); 863 int ret; 864 865 /* Truncate long lines. */ 866 if (len == (sizeof(line) - 1)) 867 while (line[(len - 1)] != '\n') { 868 ret = fgetc(file); 869 if (ret == EOF) 870 break; 871 line[(len - 1)] = ret; 872 } 873 /* Extract information. */ 874 if (sscanf(line, 875 "PCI_SLOT_NAME=" 876 "%" SCNx16 ":%" SCNx8 ":%" SCNx8 ".%" SCNx8 "\n", 877 &pci_addr->domain, 878 &pci_addr->bus, 879 &pci_addr->devid, 880 &pci_addr->function) == 4) { 881 ret = 0; 882 break; 883 } 884 } 885 fclose(file); 886 return 0; 887 } 888 889 /** 890 * Link status handler. 891 * 892 * @param priv 893 * Pointer to private structure. 894 * @param dev 895 * Pointer to the rte_eth_dev structure. 896 * 897 * @return 898 * Nonzero if the callback process can be called immediately. 899 */ 900 static int 901 priv_dev_link_status_handler(struct priv *priv, struct rte_eth_dev *dev) 902 { 903 struct ibv_async_event event; 904 int port_change = 0; 905 int ret = 0; 906 907 /* Read all message and acknowledge them. */ 908 for (;;) { 909 if (ibv_get_async_event(priv->ctx, &event)) 910 break; 911 912 if (event.event_type == IBV_EVENT_PORT_ACTIVE || 913 event.event_type == IBV_EVENT_PORT_ERR) 914 port_change = 1; 915 else 916 DEBUG("event type %d on port %d not handled", 917 event.event_type, event.element.port_num); 918 ibv_ack_async_event(&event); 919 } 920 921 if (port_change ^ priv->pending_alarm) { 922 struct rte_eth_link *link = &dev->data->dev_link; 923 924 priv->pending_alarm = 0; 925 mlx5_link_update_unlocked(dev, 0); 926 if (((link->link_speed == 0) && link->link_status) || 927 ((link->link_speed != 0) && !link->link_status)) { 928 /* Inconsistent status, check again later. */ 929 priv->pending_alarm = 1; 930 rte_eal_alarm_set(MLX5_ALARM_TIMEOUT_US, 931 mlx5_dev_link_status_handler, 932 dev); 933 } else 934 ret = 1; 935 } 936 return ret; 937 } 938 939 /** 940 * Handle delayed link status event. 941 * 942 * @param arg 943 * Registered argument. 944 */ 945 void 946 mlx5_dev_link_status_handler(void *arg) 947 { 948 struct rte_eth_dev *dev = arg; 949 struct priv *priv = dev->data->dev_private; 950 int ret; 951 952 priv_lock(priv); 953 assert(priv->pending_alarm == 1); 954 ret = priv_dev_link_status_handler(priv, dev); 955 priv_unlock(priv); 956 if (ret) 957 _rte_eth_dev_callback_process(dev, RTE_ETH_EVENT_INTR_LSC); 958 } 959 960 /** 961 * Handle interrupts from the NIC. 962 * 963 * @param[in] intr_handle 964 * Interrupt handler. 965 * @param cb_arg 966 * Callback argument. 967 */ 968 void 969 mlx5_dev_interrupt_handler(struct rte_intr_handle *intr_handle, void *cb_arg) 970 { 971 struct rte_eth_dev *dev = cb_arg; 972 struct priv *priv = dev->data->dev_private; 973 int ret; 974 975 (void)intr_handle; 976 priv_lock(priv); 977 ret = priv_dev_link_status_handler(priv, dev); 978 priv_unlock(priv); 979 if (ret) 980 _rte_eth_dev_callback_process(dev, RTE_ETH_EVENT_INTR_LSC); 981 } 982 983 /** 984 * Uninstall interrupt handler. 985 * 986 * @param priv 987 * Pointer to private structure. 988 * @param dev 989 * Pointer to the rte_eth_dev structure. 990 */ 991 void 992 priv_dev_interrupt_handler_uninstall(struct priv *priv, struct rte_eth_dev *dev) 993 { 994 if (!dev->data->dev_conf.intr_conf.lsc) 995 return; 996 rte_intr_callback_unregister(&priv->intr_handle, 997 mlx5_dev_interrupt_handler, 998 dev); 999 if (priv->pending_alarm) 1000 rte_eal_alarm_cancel(mlx5_dev_link_status_handler, dev); 1001 priv->pending_alarm = 0; 1002 priv->intr_handle.fd = 0; 1003 priv->intr_handle.type = 0; 1004 } 1005 1006 /** 1007 * Install interrupt handler. 1008 * 1009 * @param priv 1010 * Pointer to private structure. 1011 * @param dev 1012 * Pointer to the rte_eth_dev structure. 1013 */ 1014 void 1015 priv_dev_interrupt_handler_install(struct priv *priv, struct rte_eth_dev *dev) 1016 { 1017 int rc, flags; 1018 1019 if (!dev->data->dev_conf.intr_conf.lsc) 1020 return; 1021 assert(priv->ctx->async_fd > 0); 1022 flags = fcntl(priv->ctx->async_fd, F_GETFL); 1023 rc = fcntl(priv->ctx->async_fd, F_SETFL, flags | O_NONBLOCK); 1024 if (rc < 0) { 1025 INFO("failed to change file descriptor async event queue"); 1026 dev->data->dev_conf.intr_conf.lsc = 0; 1027 } else { 1028 priv->intr_handle.fd = priv->ctx->async_fd; 1029 priv->intr_handle.type = RTE_INTR_HANDLE_EXT; 1030 rte_intr_callback_register(&priv->intr_handle, 1031 mlx5_dev_interrupt_handler, 1032 dev); 1033 } 1034 } 1035 1036 /** 1037 * Change the link state (UP / DOWN). 1038 * 1039 * @param dev 1040 * Pointer to Ethernet device structure. 1041 * @param up 1042 * Nonzero for link up, otherwise link down. 1043 * 1044 * @return 1045 * 0 on success, errno value on failure. 1046 */ 1047 static int 1048 priv_set_link(struct priv *priv, int up) 1049 { 1050 struct rte_eth_dev *dev = priv->dev; 1051 int err; 1052 unsigned int i; 1053 1054 if (up) { 1055 err = priv_set_flags(priv, ~IFF_UP, IFF_UP); 1056 if (err) 1057 return err; 1058 for (i = 0; i < priv->rxqs_n; i++) 1059 if ((*priv->rxqs)[i]->sp) 1060 break; 1061 /* Check if an sp queue exists. 1062 * Note: Some old frames might be received. 1063 */ 1064 if (i == priv->rxqs_n) 1065 dev->rx_pkt_burst = mlx5_rx_burst; 1066 else 1067 dev->rx_pkt_burst = mlx5_rx_burst_sp; 1068 dev->tx_pkt_burst = mlx5_tx_burst; 1069 } else { 1070 err = priv_set_flags(priv, ~IFF_UP, ~IFF_UP); 1071 if (err) 1072 return err; 1073 dev->rx_pkt_burst = removed_rx_burst; 1074 dev->tx_pkt_burst = removed_tx_burst; 1075 } 1076 return 0; 1077 } 1078 1079 /** 1080 * DPDK callback to bring the link DOWN. 1081 * 1082 * @param dev 1083 * Pointer to Ethernet device structure. 1084 * 1085 * @return 1086 * 0 on success, errno value on failure. 1087 */ 1088 int 1089 mlx5_set_link_down(struct rte_eth_dev *dev) 1090 { 1091 struct priv *priv = dev->data->dev_private; 1092 int err; 1093 1094 priv_lock(priv); 1095 err = priv_set_link(priv, 0); 1096 priv_unlock(priv); 1097 return err; 1098 } 1099 1100 /** 1101 * DPDK callback to bring the link UP. 1102 * 1103 * @param dev 1104 * Pointer to Ethernet device structure. 1105 * 1106 * @return 1107 * 0 on success, errno value on failure. 1108 */ 1109 int 1110 mlx5_set_link_up(struct rte_eth_dev *dev) 1111 { 1112 struct priv *priv = dev->data->dev_private; 1113 int err; 1114 1115 priv_lock(priv); 1116 err = priv_set_link(priv, 1); 1117 priv_unlock(priv); 1118 return err; 1119 } 1120 1121 /** 1122 * Configure secondary process queues from a private data pointer (primary 1123 * or secondary) and update burst callbacks. Can take place only once. 1124 * 1125 * All queues must have been previously created by the primary process to 1126 * avoid undefined behavior. 1127 * 1128 * @param priv 1129 * Private data pointer from either primary or secondary process. 1130 * 1131 * @return 1132 * Private data pointer from secondary process, NULL in case of error. 1133 */ 1134 struct priv * 1135 mlx5_secondary_data_setup(struct priv *priv) 1136 { 1137 unsigned int port_id = 0; 1138 struct mlx5_secondary_data *sd; 1139 void **tx_queues; 1140 void **rx_queues; 1141 unsigned int nb_tx_queues; 1142 unsigned int nb_rx_queues; 1143 unsigned int i; 1144 1145 /* priv must be valid at this point. */ 1146 assert(priv != NULL); 1147 /* priv->dev must also be valid but may point to local memory from 1148 * another process, possibly with the same address and must not 1149 * be dereferenced yet. */ 1150 assert(priv->dev != NULL); 1151 /* Determine port ID by finding out where priv comes from. */ 1152 while (1) { 1153 sd = &mlx5_secondary_data[port_id]; 1154 rte_spinlock_lock(&sd->lock); 1155 /* Primary process? */ 1156 if (sd->primary_priv == priv) 1157 break; 1158 /* Secondary process? */ 1159 if (sd->data.dev_private == priv) 1160 break; 1161 rte_spinlock_unlock(&sd->lock); 1162 if (++port_id == RTE_DIM(mlx5_secondary_data)) 1163 port_id = 0; 1164 } 1165 /* Switch to secondary private structure. If private data has already 1166 * been updated by another thread, there is nothing else to do. */ 1167 priv = sd->data.dev_private; 1168 if (priv->dev->data == &sd->data) 1169 goto end; 1170 /* Sanity checks. Secondary private structure is supposed to point 1171 * to local eth_dev, itself still pointing to the shared device data 1172 * structure allocated by the primary process. */ 1173 assert(sd->shared_dev_data != &sd->data); 1174 assert(sd->data.nb_tx_queues == 0); 1175 assert(sd->data.tx_queues == NULL); 1176 assert(sd->data.nb_rx_queues == 0); 1177 assert(sd->data.rx_queues == NULL); 1178 assert(priv != sd->primary_priv); 1179 assert(priv->dev->data == sd->shared_dev_data); 1180 assert(priv->txqs_n == 0); 1181 assert(priv->txqs == NULL); 1182 assert(priv->rxqs_n == 0); 1183 assert(priv->rxqs == NULL); 1184 nb_tx_queues = sd->shared_dev_data->nb_tx_queues; 1185 nb_rx_queues = sd->shared_dev_data->nb_rx_queues; 1186 /* Allocate local storage for queues. */ 1187 tx_queues = rte_zmalloc("secondary ethdev->tx_queues", 1188 sizeof(sd->data.tx_queues[0]) * nb_tx_queues, 1189 RTE_CACHE_LINE_SIZE); 1190 rx_queues = rte_zmalloc("secondary ethdev->rx_queues", 1191 sizeof(sd->data.rx_queues[0]) * nb_rx_queues, 1192 RTE_CACHE_LINE_SIZE); 1193 if (tx_queues == NULL || rx_queues == NULL) 1194 goto error; 1195 /* Lock to prevent control operations during setup. */ 1196 priv_lock(priv); 1197 /* TX queues. */ 1198 for (i = 0; i != nb_tx_queues; ++i) { 1199 struct txq *primary_txq = (*sd->primary_priv->txqs)[i]; 1200 struct txq *txq; 1201 1202 if (primary_txq == NULL) 1203 continue; 1204 txq = rte_calloc_socket("TXQ", 1, sizeof(*txq), 0, 1205 primary_txq->socket); 1206 if (txq != NULL) { 1207 if (txq_setup(priv->dev, 1208 txq, 1209 primary_txq->elts_n * MLX5_PMD_SGE_WR_N, 1210 primary_txq->socket, 1211 NULL) == 0) { 1212 txq->stats.idx = primary_txq->stats.idx; 1213 tx_queues[i] = txq; 1214 continue; 1215 } 1216 rte_free(txq); 1217 } 1218 while (i) { 1219 txq = tx_queues[--i]; 1220 txq_cleanup(txq); 1221 rte_free(txq); 1222 } 1223 goto error; 1224 } 1225 /* RX queues. */ 1226 for (i = 0; i != nb_rx_queues; ++i) { 1227 struct rxq *primary_rxq = (*sd->primary_priv->rxqs)[i]; 1228 1229 if (primary_rxq == NULL) 1230 continue; 1231 /* Not supported yet. */ 1232 rx_queues[i] = NULL; 1233 } 1234 /* Update everything. */ 1235 priv->txqs = (void *)tx_queues; 1236 priv->txqs_n = nb_tx_queues; 1237 priv->rxqs = (void *)rx_queues; 1238 priv->rxqs_n = nb_rx_queues; 1239 sd->data.rx_queues = rx_queues; 1240 sd->data.tx_queues = tx_queues; 1241 sd->data.nb_rx_queues = nb_rx_queues; 1242 sd->data.nb_tx_queues = nb_tx_queues; 1243 sd->data.dev_link = sd->shared_dev_data->dev_link; 1244 sd->data.mtu = sd->shared_dev_data->mtu; 1245 memcpy(sd->data.rx_queue_state, sd->shared_dev_data->rx_queue_state, 1246 sizeof(sd->data.rx_queue_state)); 1247 memcpy(sd->data.tx_queue_state, sd->shared_dev_data->tx_queue_state, 1248 sizeof(sd->data.tx_queue_state)); 1249 sd->data.dev_flags = sd->shared_dev_data->dev_flags; 1250 /* Use local data from now on. */ 1251 rte_mb(); 1252 priv->dev->data = &sd->data; 1253 rte_mb(); 1254 priv->dev->tx_pkt_burst = mlx5_tx_burst; 1255 priv->dev->rx_pkt_burst = removed_rx_burst; 1256 priv_unlock(priv); 1257 end: 1258 /* More sanity checks. */ 1259 assert(priv->dev->tx_pkt_burst == mlx5_tx_burst); 1260 assert(priv->dev->rx_pkt_burst == removed_rx_burst); 1261 assert(priv->dev->data == &sd->data); 1262 rte_spinlock_unlock(&sd->lock); 1263 return priv; 1264 error: 1265 priv_unlock(priv); 1266 rte_free(tx_queues); 1267 rte_free(rx_queues); 1268 rte_spinlock_unlock(&sd->lock); 1269 return NULL; 1270 } 1271