1 /*- 2 * BSD LICENSE 3 * 4 * Copyright 2015 6WIND S.A. 5 * Copyright 2015 Mellanox. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * * Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * * Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * * Neither the name of 6WIND S.A. nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
 */

#include <stddef.h>
#include <assert.h>
#include <unistd.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <errno.h>
#include <dirent.h>
#include <net/if.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/if.h>
#include <linux/ethtool.h>
#include <linux/sockios.h>
#include <fcntl.h>

/* DPDK headers don't like -pedantic. */
#ifdef PEDANTIC
#pragma GCC diagnostic ignored "-pedantic"
#endif
#include <rte_atomic.h>
#include <rte_ethdev.h>
#include <rte_mbuf.h>
#include <rte_common.h>
#include <rte_interrupts.h>
#include <rte_alarm.h>
#include <rte_malloc.h>
#ifdef PEDANTIC
#pragma GCC diagnostic error "-pedantic"
#endif

#include "mlx5.h"
#include "mlx5_rxtx.h"
#include "mlx5_utils.h"

/**
 * Return private structure associated with an Ethernet device.
 *
 * In a secondary process, dev->data belongs to the primary process; the
 * process-local private structure is kept in mlx5_secondary_data[] indexed
 * by port ID, so look it up there instead.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 *
 * @return
 *   Pointer to private structure.
 */
struct priv *
mlx5_get_priv(struct rte_eth_dev *dev)
{
	struct mlx5_secondary_data *sd;

	if (!mlx5_is_secondary())
		return dev->data->dev_private;
	/* Secondary process: use the per-port shadow data. */
	sd = &mlx5_secondary_data[dev->data->port_id];
	return sd->data.dev_private;
}

/**
 * Check if running as a secondary process.
 *
 * @return
 *   Nonzero if running as a secondary process.
 */
inline int
mlx5_is_secondary(void)
{
	return rte_eal_process_type() != RTE_PROC_PRIMARY;
}

/**
 * Get interface name from private structure.
 *
 * Scans the netdev entries under the device's sysfs path and matches them
 * against this port using either the "dev_port" or "dev_id" attribute.
 *
 * @param[in] priv
 *   Pointer to private structure.
 * @param[out] ifname
 *   Interface name output buffer.
 *
 * @return
 *   0 on success, -1 on failure and errno is set.
 */
int
priv_get_ifname(const struct priv *priv, char (*ifname)[IF_NAMESIZE])
{
	DIR *dir;
	struct dirent *dent;
	/* dev_type selects the sysfs attribute: 0 = "dev_port", 1 = "dev_id". */
	unsigned int dev_type = 0;
	unsigned int dev_port_prev = ~0u;
	char match[IF_NAMESIZE] = "";

	{
		MKSTR(path, "%s/device/net", priv->ctx->device->ibdev_path);

		dir = opendir(path);
		if (dir == NULL)
			return -1;
	}
	while ((dent = readdir(dir)) != NULL) {
		char *name = dent->d_name;
		FILE *file;
		unsigned int dev_port;
		int r;

		/* Skip "." and ".." entries. */
		if ((name[0] == '.') &&
		    ((name[1] == '\0') ||
		     ((name[1] == '.') && (name[2] == '\0'))))
			continue;

		MKSTR(path, "%s/device/net/%s/%s",
		      priv->ctx->device->ibdev_path, name,
		      (dev_type ? "dev_id" : "dev_port"));

		file = fopen(path, "rb");
		if (file == NULL) {
			/* Only a missing attribute triggers the fallback;
			 * any other error skips this entry. */
			if (errno != ENOENT)
				continue;
			/*
			 * Switch to dev_id when dev_port does not exist as
			 * is the case with Linux kernel versions < 3.15.
			 */
try_dev_id:
			match[0] = '\0';
			if (dev_type)
				break;
			dev_type = 1;
			dev_port_prev = ~0u;
			/* Restart the scan with the other attribute. */
			rewinddir(dir);
			continue;
		}
		/* dev_id is hexadecimal, dev_port is decimal. */
		r = fscanf(file, (dev_type ? "%x" : "%u"), &dev_port);
		fclose(file);
		if (r != 1)
			continue;
		/*
		 * Switch to dev_id when dev_port returns the same value for
		 * all ports. May happen when using a MOFED release older than
		 * 3.0 with a Linux kernel >= 3.15.
		 */
		if (dev_port == dev_port_prev)
			goto try_dev_id;
		dev_port_prev = dev_port;
		/* Ports are 1-based in priv, 0-based in sysfs. */
		if (dev_port == (priv->port - 1u))
			snprintf(match, sizeof(match), "%s", name);
	}
	closedir(dir);
	if (match[0] == '\0')
		return -1;
	/* match always fits and is NUL-terminated (same size as *ifname). */
	strncpy(*ifname, match, sizeof(*ifname));
	return 0;
}

/**
 * Read from sysfs entry.
 *
 * @param[in] priv
 *   Pointer to private structure.
 * @param[in] entry
 *   Entry name relative to sysfs path.
 * @param[out] buf
 *   Data output buffer.
 * @param size
 *   Buffer size.
 *
 * @return
 *   size on success, -1 on failure and errno is set.
 */
static int
priv_sysfs_read(const struct priv *priv, const char *entry,
		char *buf, size_t size)
{
	char ifname[IF_NAMESIZE];
	FILE *file;
	int ret;
	int err;

	if (priv_get_ifname(priv, &ifname))
		return -1;

	MKSTR(path, "%s/device/net/%s/%s", priv->ctx->device->ibdev_path,
	      ifname, entry);

	file = fopen(path, "rb");
	if (file == NULL)
		return -1;
	ret = fread(buf, 1, size, file);
	/* Save errno before fclose() can clobber it. */
	err = errno;
	/* A short read is only an error if the stream reports one;
	 * EOF is normal for sysfs attributes shorter than the buffer. */
	if (((size_t)ret < size) && (ferror(file)))
		ret = -1;
	else
		ret = size;
	fclose(file);
	errno = err;
	return ret;
}

/**
 * Write to sysfs entry.
 *
 * @param[in] priv
 *   Pointer to private structure.
 * @param[in] entry
 *   Entry name relative to sysfs path.
 * @param[in] buf
 *   Data buffer.
 * @param size
 *   Buffer size.
 *
 * @return
 *   size on success, -1 on failure and errno is set.
 */
static int
priv_sysfs_write(const struct priv *priv, const char *entry,
		 char *buf, size_t size)
{
	char ifname[IF_NAMESIZE];
	FILE *file;
	int ret;
	int err;

	if (priv_get_ifname(priv, &ifname))
		return -1;

	MKSTR(path, "%s/device/net/%s/%s", priv->ctx->device->ibdev_path,
	      ifname, entry);

	file = fopen(path, "wb");
	if (file == NULL)
		return -1;
	ret = fwrite(buf, 1, size, file);
	/* Save errno before fclose() can clobber it. */
	err = errno;
	/* Unlike reads, any short write is an error. */
	if (((size_t)ret < size) || (ferror(file)))
		ret = -1;
	else
		ret = size;
	fclose(file);
	errno = err;
	return ret;
}

/**
 * Get unsigned long sysfs property.
 *
 * @param priv
 *   Pointer to private structure.
 * @param[in] name
 *   Entry name relative to sysfs path.
 * @param[out] value
 *   Value output buffer.
 *
 * @return
 *   0 on success, -1 on failure and errno is set.
 */
static int
priv_get_sysfs_ulong(struct priv *priv, const char *name, unsigned long *value)
{
	int ret;
	unsigned long value_ret;
	char value_str[32];

	/* Leave room for the terminating '\0'. */
	ret = priv_sysfs_read(priv, name, value_str, (sizeof(value_str) - 1));
	if (ret == -1) {
		DEBUG("cannot read %s value from sysfs: %s",
		      name, strerror(errno));
		return -1;
	}
	value_str[ret] = '\0';
	/* strtoul() reports range errors through errno only. */
	errno = 0;
	value_ret = strtoul(value_str, NULL, 0);
	if (errno) {
		DEBUG("invalid %s value `%s': %s", name, value_str,
		      strerror(errno));
		return -1;
	}
	*value = value_ret;
	return 0;
}

/**
 * Set unsigned long sysfs property.
 *
 * @param priv
 *   Pointer to private structure.
 * @param[in] name
 *   Entry name relative to sysfs path.
 * @param value
 *   Value to set.
 *
 * @return
 *   0 on success, -1 on failure and errno is set.
 */
static int
priv_set_sysfs_ulong(struct priv *priv, const char *name, unsigned long value)
{
	int ret;
	/* MKSTR() sizes value_str to exactly fit the formatted value. */
	MKSTR(value_str, "%lu", value);

	ret = priv_sysfs_write(priv, name, value_str, (sizeof(value_str) - 1));
	if (ret == -1) {
		DEBUG("cannot write %s `%s' (%lu) to sysfs: %s",
		      name, value_str, value, strerror(errno));
		return -1;
	}
	return 0;
}

/**
 * Perform ifreq ioctl() on associated Ethernet device.
 *
 * @param[in] priv
 *   Pointer to private structure.
 * @param req
 *   Request number to pass to ioctl().
 * @param[out] ifr
 *   Interface request structure output buffer.
 *
 * @return
 *   0 on success, -1 on failure and errno is set.
350 */ 351 int 352 priv_ifreq(const struct priv *priv, int req, struct ifreq *ifr) 353 { 354 int sock = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP); 355 int ret = -1; 356 357 if (sock == -1) 358 return ret; 359 if (priv_get_ifname(priv, &ifr->ifr_name) == 0) 360 ret = ioctl(sock, req, ifr); 361 close(sock); 362 return ret; 363 } 364 365 /** 366 * Get device MTU. 367 * 368 * @param priv 369 * Pointer to private structure. 370 * @param[out] mtu 371 * MTU value output buffer. 372 * 373 * @return 374 * 0 on success, -1 on failure and errno is set. 375 */ 376 int 377 priv_get_mtu(struct priv *priv, uint16_t *mtu) 378 { 379 unsigned long ulong_mtu; 380 381 if (priv_get_sysfs_ulong(priv, "mtu", &ulong_mtu) == -1) 382 return -1; 383 *mtu = ulong_mtu; 384 return 0; 385 } 386 387 /** 388 * Set device MTU. 389 * 390 * @param priv 391 * Pointer to private structure. 392 * @param mtu 393 * MTU value to set. 394 * 395 * @return 396 * 0 on success, -1 on failure and errno is set. 397 */ 398 static int 399 priv_set_mtu(struct priv *priv, uint16_t mtu) 400 { 401 return priv_set_sysfs_ulong(priv, "mtu", mtu); 402 } 403 404 /** 405 * Set device flags. 406 * 407 * @param priv 408 * Pointer to private structure. 409 * @param keep 410 * Bitmask for flags that must remain untouched. 411 * @param flags 412 * Bitmask for flags to modify. 413 * 414 * @return 415 * 0 on success, -1 on failure and errno is set. 416 */ 417 int 418 priv_set_flags(struct priv *priv, unsigned int keep, unsigned int flags) 419 { 420 unsigned long tmp; 421 422 if (priv_get_sysfs_ulong(priv, "flags", &tmp) == -1) 423 return -1; 424 tmp &= keep; 425 tmp |= flags; 426 return priv_set_sysfs_ulong(priv, "flags", tmp); 427 } 428 429 /** 430 * Ethernet device configuration. 431 * 432 * Prepare the driver for a given number of TX and RX queues. 433 * 434 * @param dev 435 * Pointer to Ethernet device structure. 436 * 437 * @return 438 * 0 on success, errno value on failure. 
 */
static int
dev_configure(struct rte_eth_dev *dev)
{
	struct priv *priv = dev->data->dev_private;
	unsigned int rxqs_n = dev->data->nb_rx_queues;
	unsigned int txqs_n = dev->data->nb_tx_queues;
	unsigned int i;
	unsigned int j;
	unsigned int reta_idx_n;

	priv->rss_hf = dev->data->dev_conf.rx_adv_conf.rss_conf.rss_hf;
	priv->rxqs = (void *)dev->data->rx_queues;
	priv->txqs = (void *)dev->data->tx_queues;
	if (txqs_n != priv->txqs_n) {
		INFO("%p: TX queues number update: %u -> %u",
		     (void *)dev, priv->txqs_n, txqs_n);
		priv->txqs_n = txqs_n;
	}
	/* The indirection table cannot address more queues than its size. */
	if (rxqs_n > priv->ind_table_max_size) {
		ERROR("cannot handle this many RX queues (%u)", rxqs_n);
		return EINVAL;
	}
	if (rxqs_n == priv->rxqs_n)
		return 0;
	INFO("%p: RX queues number update: %u -> %u",
	     (void *)dev, priv->rxqs_n, rxqs_n);
	priv->rxqs_n = rxqs_n;
	/* If the requested number of RX queues is not a power of two, use the
	 * maximum indirection table size for better balancing.
	 * The result is always rounded to the next power of two.
	 * (rxqs_n & (rxqs_n - 1)) is nonzero iff rxqs_n is not a power of
	 * two. */
	reta_idx_n = (1 << log2above((rxqs_n & (rxqs_n - 1)) ?
				     priv->ind_table_max_size :
				     rxqs_n));
	if (priv_rss_reta_index_resize(priv, reta_idx_n))
		return ENOMEM;
	/* When the number of RX queues is not a power of two, the remaining
	 * table entries are padded with reused WQs and hashes are not spread
	 * uniformly. */
	for (i = 0, j = 0; (i != reta_idx_n); ++i) {
		(*priv->reta_idx)[i] = j;
		if (++j == rxqs_n)
			j = 0;
	}
	return 0;
}

/**
 * DPDK callback for Ethernet device configuration.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 *
 * @return
 *   0 on success, negative errno value on failure.
 */
int
mlx5_dev_configure(struct rte_eth_dev *dev)
{
	struct priv *priv = dev->data->dev_private;
	int ret;

	if (mlx5_is_secondary())
		return -E_RTE_SECONDARY;

	priv_lock(priv);
	ret = dev_configure(dev);
	assert(ret >= 0);
	priv_unlock(priv);
	/* dev_configure() returns a positive errno value; negate it. */
	return -ret;
}

/**
 * DPDK callback to get information about the device.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param[out] info
 *   Info structure output buffer.
 */
void
mlx5_dev_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *info)
{
	struct priv *priv = mlx5_get_priv(dev);
	unsigned int max;
	char ifname[IF_NAMESIZE];

	priv_lock(priv);
	/* FIXME: we should ask the device for these values. */
	info->min_rx_bufsize = 32;
	info->max_rx_pktlen = 65536;
	/*
	 * Since we need one CQ per QP, the limit is the minimum number
	 * between the two values.
	 */
	max = ((priv->device_attr.max_cq > priv->device_attr.max_qp) ?
	       priv->device_attr.max_qp : priv->device_attr.max_cq);
	/* Cap to 65535 since max_rx_queues is uint16_t. */
	if (max >= 65535)
		max = 65535;
	info->max_rx_queues = max;
	info->max_tx_queues = max;
	info->max_mac_addrs = RTE_DIM(priv->mac);
	/* RX checksum offloads are only advertised when supported by HW. */
	info->rx_offload_capa =
		(priv->hw_csum ?
		 (DEV_RX_OFFLOAD_IPV4_CKSUM |
		  DEV_RX_OFFLOAD_UDP_CKSUM |
		  DEV_RX_OFFLOAD_TCP_CKSUM) :
		 0);
	info->tx_offload_capa = DEV_TX_OFFLOAD_VLAN_INSERT;
	if (priv->hw_csum)
		info->tx_offload_capa |=
			(DEV_TX_OFFLOAD_IPV4_CKSUM |
			 DEV_TX_OFFLOAD_UDP_CKSUM |
			 DEV_TX_OFFLOAD_TCP_CKSUM);
	if (priv_get_ifname(priv, &ifname) == 0)
		info->if_index = if_nametoindex(ifname);
	/* FIXME: RETA update/query API expects the callee to know the size of
	 * the indirection table, for this PMD the size varies depending on
	 * the number of RX queues, it becomes impossible to find the correct
	 * size if it is not fixed.
	 * The API should be updated to solve this problem. */
	info->reta_size = priv->ind_table_max_size;
	priv_unlock(priv);
}

/**
 * DPDK callback to get the list of packet types supported by the RX burst
 * function currently in use.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 *
 * @return
 *   Static array of supported packet types, NULL when the RX burst function
 *   is not one of this PMD's.
 */
const uint32_t *
mlx5_dev_supported_ptypes_get(struct rte_eth_dev *dev)
{
	static const uint32_t ptypes[] = {
		/* refers to rxq_cq_to_pkt_type() */
		RTE_PTYPE_L3_IPV4,
		RTE_PTYPE_L3_IPV6,
		RTE_PTYPE_INNER_L3_IPV4,
		RTE_PTYPE_INNER_L3_IPV6,
		RTE_PTYPE_UNKNOWN

	};

	if (dev->rx_pkt_burst == mlx5_rx_burst ||
	    dev->rx_pkt_burst == mlx5_rx_burst_sp)
		return ptypes;
	return NULL;
}

/**
 * DPDK callback to retrieve physical link information (unlocked version).
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param wait_to_complete
 *   Wait for request completion (ignored).
 *
 * @return
 *   0 when the link status changed, -1 otherwise (or on ioctl failure).
 */
static int
mlx5_link_update_unlocked(struct rte_eth_dev *dev, int wait_to_complete)
{
	struct priv *priv = mlx5_get_priv(dev);
	struct ethtool_cmd edata = {
		.cmd = ETHTOOL_GSET
	};
	struct ifreq ifr;
	struct rte_eth_link dev_link;
	int link_speed = 0;

	(void)wait_to_complete;
	if (priv_ifreq(priv, SIOCGIFFLAGS, &ifr)) {
		WARN("ioctl(SIOCGIFFLAGS) failed: %s", strerror(errno));
		return -1;
	}
	memset(&dev_link, 0, sizeof(dev_link));
	/* Link is up only if administratively up AND carrier present. */
	dev_link.link_status = ((ifr.ifr_flags & IFF_UP) &&
				(ifr.ifr_flags & IFF_RUNNING));
	ifr.ifr_data = &edata;
	if (priv_ifreq(priv, SIOCETHTOOL, &ifr)) {
		WARN("ioctl(SIOCETHTOOL, ETHTOOL_GSET) failed: %s",
		     strerror(errno));
		return -1;
	}
	link_speed = ethtool_cmd_speed(&edata);
	/* ethtool reports -1 (SPEED_UNKNOWN) when the speed is unknown. */
	if (link_speed == -1)
		dev_link.link_speed = 0;
	else
		dev_link.link_speed = link_speed;
	dev_link.link_duplex = ((edata.duplex == DUPLEX_HALF) ?
				ETH_LINK_HALF_DUPLEX : ETH_LINK_FULL_DUPLEX);
	if (memcmp(&dev_link, &dev->data->dev_link, sizeof(dev_link))) {
		/* Link status changed. */
		dev->data->dev_link = dev_link;
		return 0;
	}
	/* Link status is still the same. */
	return -1;
}

/**
 * DPDK callback to retrieve physical link information.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param wait_to_complete
 *   Wait for request completion (ignored).
 *
 * @return
 *   Same as mlx5_link_update_unlocked().
 */
int
mlx5_link_update(struct rte_eth_dev *dev, int wait_to_complete)
{
	struct priv *priv = mlx5_get_priv(dev);
	int ret;

	priv_lock(priv);
	ret = mlx5_link_update_unlocked(dev, wait_to_complete);
	priv_unlock(priv);
	return ret;
}

/**
 * DPDK callback to change the MTU.
 *
 * Setting the MTU affects hardware MRU (packets larger than the MTU cannot be
 * received). Use this as a hint to enable/disable scattered packets support
 * and improve performance when not needed.
 * Since failure is not an option, reconfiguring queues on the fly is not
 * recommended.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param in_mtu
 *   New MTU.
 *
 * @return
 *   0 on success, negative errno value on failure.
 */
int
mlx5_dev_set_mtu(struct rte_eth_dev *dev, uint16_t mtu)
{
	struct priv *priv = dev->data->dev_private;
	int ret = 0;
	unsigned int i;
	/* RX burst function to restore once queues are reconfigured. */
	uint16_t (*rx_func)(void *, struct rte_mbuf **, uint16_t) =
		mlx5_rx_burst;

	if (mlx5_is_secondary())
		return -E_RTE_SECONDARY;

	priv_lock(priv);
	/* Set kernel interface MTU first. */
	if (priv_set_mtu(priv, mtu)) {
		ret = errno;
		WARN("cannot set port %u MTU to %u: %s", priv->port, mtu,
		     strerror(ret));
		goto out;
	} else
		DEBUG("adapter port %u MTU set to %u", priv->port, mtu);
	priv->mtu = mtu;
	/* Temporarily replace RX handler with a fake one, assuming it has not
	 * been copied elsewhere. */
	dev->rx_pkt_burst = removed_rx_burst;
	/* Make sure everyone has left mlx5_rx_burst() and uses
	 * removed_rx_burst() instead. */
	rte_wmb();
	usleep(1000);
	/* Reconfigure each RX queue. */
	for (i = 0; (i != priv->rxqs_n); ++i) {
		struct rxq *rxq = (*priv->rxqs)[i];
		unsigned int max_frame_len;
		int sp;

		if (rxq == NULL)
			continue;
		/* Calculate new maximum frame length according to MTU and
		 * toggle scattered support (sp) if necessary. */
		max_frame_len = (priv->mtu + ETHER_HDR_LEN +
				 (ETHER_MAX_VLAN_FRAME_LEN - ETHER_MAX_LEN));
		sp = (max_frame_len > (rxq->mb_len - RTE_PKTMBUF_HEADROOM));
		/* Provide new values to rxq_setup(). */
		dev->data->dev_conf.rxmode.jumbo_frame = sp;
		dev->data->dev_conf.rxmode.max_rx_pkt_len = max_frame_len;
		ret = rxq_rehash(dev, rxq);
		if (ret) {
			/* Force SP RX if that queue requires it and abort. */
			if (rxq->sp)
				rx_func = mlx5_rx_burst_sp;
			break;
		}
		/* Scattered burst function takes priority. */
		if (rxq->sp)
			rx_func = mlx5_rx_burst_sp;
	}
	/* Burst functions can now be called again. */
	rte_wmb();
	dev->rx_pkt_burst = rx_func;
out:
	priv_unlock(priv);
	assert(ret >= 0);
	return -ret;
}

/**
 * DPDK callback to get flow control status.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param[out] fc_conf
 *   Flow control output buffer.
 *
 * @return
 *   0 on success, negative errno value on failure.
 */
int
mlx5_dev_get_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf)
{
	struct priv *priv = dev->data->dev_private;
	struct ifreq ifr;
	struct ethtool_pauseparam ethpause = {
		.cmd = ETHTOOL_GPAUSEPARAM
	};
	int ret;

	if (mlx5_is_secondary())
		return -E_RTE_SECONDARY;

	ifr.ifr_data = &ethpause;
	priv_lock(priv);
	if (priv_ifreq(priv, SIOCETHTOOL, &ifr)) {
		ret = errno;
		WARN("ioctl(SIOCETHTOOL, ETHTOOL_GPAUSEPARAM)"
		     " failed: %s",
		     strerror(ret));
		goto out;
	}

	fc_conf->autoneg = ethpause.autoneg;
	/* Translate the rx/tx pause pair into the DPDK mode enum. */
	if (ethpause.rx_pause && ethpause.tx_pause)
		fc_conf->mode = RTE_FC_FULL;
	else if (ethpause.rx_pause)
		fc_conf->mode = RTE_FC_RX_PAUSE;
	else if (ethpause.tx_pause)
		fc_conf->mode = RTE_FC_TX_PAUSE;
	else
		fc_conf->mode = RTE_FC_NONE;
	ret = 0;

out:
	priv_unlock(priv);
	assert(ret >= 0);
	return -ret;
}

/**
 * DPDK callback to modify flow control parameters.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param[in] fc_conf
 *   Flow control parameters.
 *
 * @return
 *   0 on success, negative errno value on failure.
796 */ 797 int 798 mlx5_dev_set_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf) 799 { 800 struct priv *priv = dev->data->dev_private; 801 struct ifreq ifr; 802 struct ethtool_pauseparam ethpause = { 803 .cmd = ETHTOOL_SPAUSEPARAM 804 }; 805 int ret; 806 807 if (mlx5_is_secondary()) 808 return -E_RTE_SECONDARY; 809 810 ifr.ifr_data = ðpause; 811 ethpause.autoneg = fc_conf->autoneg; 812 if (((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL) || 813 (fc_conf->mode & RTE_FC_RX_PAUSE)) 814 ethpause.rx_pause = 1; 815 else 816 ethpause.rx_pause = 0; 817 818 if (((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL) || 819 (fc_conf->mode & RTE_FC_TX_PAUSE)) 820 ethpause.tx_pause = 1; 821 else 822 ethpause.tx_pause = 0; 823 824 priv_lock(priv); 825 if (priv_ifreq(priv, SIOCETHTOOL, &ifr)) { 826 ret = errno; 827 WARN("ioctl(SIOCETHTOOL, ETHTOOL_SPAUSEPARAM)" 828 " failed: %s", 829 strerror(ret)); 830 goto out; 831 } 832 ret = 0; 833 834 out: 835 priv_unlock(priv); 836 assert(ret >= 0); 837 return -ret; 838 } 839 840 /** 841 * Get PCI information from struct ibv_device. 842 * 843 * @param device 844 * Pointer to Ethernet device structure. 845 * @param[out] pci_addr 846 * PCI bus address output buffer. 847 * 848 * @return 849 * 0 on success, -1 on failure and errno is set. 850 */ 851 int 852 mlx5_ibv_device_to_pci_addr(const struct ibv_device *device, 853 struct rte_pci_addr *pci_addr) 854 { 855 FILE *file; 856 char line[32]; 857 MKSTR(path, "%s/device/uevent", device->ibdev_path); 858 859 file = fopen(path, "rb"); 860 if (file == NULL) 861 return -1; 862 while (fgets(line, sizeof(line), file) == line) { 863 size_t len = strlen(line); 864 int ret; 865 866 /* Truncate long lines. */ 867 if (len == (sizeof(line) - 1)) 868 while (line[(len - 1)] != '\n') { 869 ret = fgetc(file); 870 if (ret == EOF) 871 break; 872 line[(len - 1)] = ret; 873 } 874 /* Extract information. 
*/ 875 if (sscanf(line, 876 "PCI_SLOT_NAME=" 877 "%" SCNx16 ":%" SCNx8 ":%" SCNx8 ".%" SCNx8 "\n", 878 &pci_addr->domain, 879 &pci_addr->bus, 880 &pci_addr->devid, 881 &pci_addr->function) == 4) { 882 ret = 0; 883 break; 884 } 885 } 886 fclose(file); 887 return 0; 888 } 889 890 /** 891 * Link status handler. 892 * 893 * @param priv 894 * Pointer to private structure. 895 * @param dev 896 * Pointer to the rte_eth_dev structure. 897 * 898 * @return 899 * Nonzero if the callback process can be called immediately. 900 */ 901 static int 902 priv_dev_link_status_handler(struct priv *priv, struct rte_eth_dev *dev) 903 { 904 struct ibv_async_event event; 905 int port_change = 0; 906 int ret = 0; 907 908 /* Read all message and acknowledge them. */ 909 for (;;) { 910 if (ibv_get_async_event(priv->ctx, &event)) 911 break; 912 913 if (event.event_type == IBV_EVENT_PORT_ACTIVE || 914 event.event_type == IBV_EVENT_PORT_ERR) 915 port_change = 1; 916 else 917 DEBUG("event type %d on port %d not handled", 918 event.event_type, event.element.port_num); 919 ibv_ack_async_event(&event); 920 } 921 922 if (port_change ^ priv->pending_alarm) { 923 struct rte_eth_link *link = &dev->data->dev_link; 924 925 priv->pending_alarm = 0; 926 mlx5_link_update_unlocked(dev, 0); 927 if (((link->link_speed == 0) && link->link_status) || 928 ((link->link_speed != 0) && !link->link_status)) { 929 /* Inconsistent status, check again later. */ 930 priv->pending_alarm = 1; 931 rte_eal_alarm_set(MLX5_ALARM_TIMEOUT_US, 932 mlx5_dev_link_status_handler, 933 dev); 934 } else 935 ret = 1; 936 } 937 return ret; 938 } 939 940 /** 941 * Handle delayed link status event. 942 * 943 * @param arg 944 * Registered argument. 
 */
void
mlx5_dev_link_status_handler(void *arg)
{
	struct rte_eth_dev *dev = arg;
	struct priv *priv = dev->data->dev_private;
	int ret;

	priv_lock(priv);
	/* Only ever scheduled by priv_dev_link_status_handler(). */
	assert(priv->pending_alarm == 1);
	ret = priv_dev_link_status_handler(priv, dev);
	priv_unlock(priv);
	if (ret)
		_rte_eth_dev_callback_process(dev, RTE_ETH_EVENT_INTR_LSC);
}

/**
 * Handle interrupts from the NIC.
 *
 * @param[in] intr_handle
 *   Interrupt handler.
 * @param cb_arg
 *   Callback argument.
 */
void
mlx5_dev_interrupt_handler(struct rte_intr_handle *intr_handle, void *cb_arg)
{
	struct rte_eth_dev *dev = cb_arg;
	struct priv *priv = dev->data->dev_private;
	int ret;

	(void)intr_handle;
	priv_lock(priv);
	ret = priv_dev_link_status_handler(priv, dev);
	priv_unlock(priv);
	if (ret)
		_rte_eth_dev_callback_process(dev, RTE_ETH_EVENT_INTR_LSC);
}

/**
 * Uninstall interrupt handler.
 *
 * @param priv
 *   Pointer to private structure.
 * @param dev
 *   Pointer to the rte_eth_dev structure.
 */
void
priv_dev_interrupt_handler_uninstall(struct priv *priv, struct rte_eth_dev *dev)
{
	if (!dev->data->dev_conf.intr_conf.lsc)
		return;
	rte_intr_callback_unregister(&priv->intr_handle,
				     mlx5_dev_interrupt_handler,
				     dev);
	/* Cancel any delayed link status check still scheduled. */
	if (priv->pending_alarm)
		rte_eal_alarm_cancel(mlx5_dev_link_status_handler, dev);
	priv->pending_alarm = 0;
	priv->intr_handle.fd = 0;
	priv->intr_handle.type = 0;
}

/**
 * Install interrupt handler.
 *
 * @param priv
 *   Pointer to private structure.
 * @param dev
 *   Pointer to the rte_eth_dev structure.
 */
void
priv_dev_interrupt_handler_install(struct priv *priv, struct rte_eth_dev *dev)
{
	int rc, flags;

	if (!dev->data->dev_conf.intr_conf.lsc)
		return;
	assert(priv->ctx->async_fd > 0);
	flags = fcntl(priv->ctx->async_fd, F_GETFL);
	/* Async events must not block the interrupt thread. */
	rc = fcntl(priv->ctx->async_fd, F_SETFL, flags | O_NONBLOCK);
	if (rc < 0) {
		INFO("failed to change file descriptor async event queue");
		/* Disable LSC since the handler cannot work without it. */
		dev->data->dev_conf.intr_conf.lsc = 0;
	} else {
		priv->intr_handle.fd = priv->ctx->async_fd;
		priv->intr_handle.type = RTE_INTR_HANDLE_EXT;
		rte_intr_callback_register(&priv->intr_handle,
					   mlx5_dev_interrupt_handler,
					   dev);
	}
}

/**
 * Change the link state (UP / DOWN).
 *
 * @param priv
 *   Pointer to private structure.
 * @param up
 *   Nonzero for link up, otherwise link down.
 *
 * @return
 *   0 on success, errno value on failure.
 */
static int
priv_set_link(struct priv *priv, int up)
{
	struct rte_eth_dev *dev = priv->dev;
	int err;
	unsigned int i;

	if (up) {
		err = priv_set_flags(priv, ~IFF_UP, IFF_UP);
		if (err)
			return err;
		for (i = 0; i < priv->rxqs_n; i++)
			if ((*priv->rxqs)[i]->sp)
				break;
		/* Check if an sp queue exists.
		 * Note: Some old frames might be received.
		 */
		if (i == priv->rxqs_n)
			dev->rx_pkt_burst = mlx5_rx_burst;
		else
			dev->rx_pkt_burst = mlx5_rx_burst_sp;
		dev->tx_pkt_burst = mlx5_tx_burst;
	} else {
		/* NOTE(review): flags=~IFF_UP asks priv_set_flags() to set
		 * every interface flag bit except IFF_UP, not just clear
		 * IFF_UP — presumably only the clearing is intended; verify
		 * against priv_set_flags() semantics before changing. */
		err = priv_set_flags(priv, ~IFF_UP, ~IFF_UP);
		if (err)
			return err;
		dev->rx_pkt_burst = removed_rx_burst;
		dev->tx_pkt_burst = removed_tx_burst;
	}
	return 0;
}

/**
 * DPDK callback to bring the link DOWN.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 *
 * @return
 *   0 on success, errno value on failure.
1088 */ 1089 int 1090 mlx5_set_link_down(struct rte_eth_dev *dev) 1091 { 1092 struct priv *priv = dev->data->dev_private; 1093 int err; 1094 1095 priv_lock(priv); 1096 err = priv_set_link(priv, 0); 1097 priv_unlock(priv); 1098 return err; 1099 } 1100 1101 /** 1102 * DPDK callback to bring the link UP. 1103 * 1104 * @param dev 1105 * Pointer to Ethernet device structure. 1106 * 1107 * @return 1108 * 0 on success, errno value on failure. 1109 */ 1110 int 1111 mlx5_set_link_up(struct rte_eth_dev *dev) 1112 { 1113 struct priv *priv = dev->data->dev_private; 1114 int err; 1115 1116 priv_lock(priv); 1117 err = priv_set_link(priv, 1); 1118 priv_unlock(priv); 1119 return err; 1120 } 1121 1122 /** 1123 * Configure secondary process queues from a private data pointer (primary 1124 * or secondary) and update burst callbacks. Can take place only once. 1125 * 1126 * All queues must have been previously created by the primary process to 1127 * avoid undefined behavior. 1128 * 1129 * @param priv 1130 * Private data pointer from either primary or secondary process. 1131 * 1132 * @return 1133 * Private data pointer from secondary process, NULL in case of error. 1134 */ 1135 struct priv * 1136 mlx5_secondary_data_setup(struct priv *priv) 1137 { 1138 unsigned int port_id = 0; 1139 struct mlx5_secondary_data *sd; 1140 void **tx_queues; 1141 void **rx_queues; 1142 unsigned int nb_tx_queues; 1143 unsigned int nb_rx_queues; 1144 unsigned int i; 1145 1146 /* priv must be valid at this point. */ 1147 assert(priv != NULL); 1148 /* priv->dev must also be valid but may point to local memory from 1149 * another process, possibly with the same address and must not 1150 * be dereferenced yet. */ 1151 assert(priv->dev != NULL); 1152 /* Determine port ID by finding out where priv comes from. */ 1153 while (1) { 1154 sd = &mlx5_secondary_data[port_id]; 1155 rte_spinlock_lock(&sd->lock); 1156 /* Primary process? */ 1157 if (sd->primary_priv == priv) 1158 break; 1159 /* Secondary process? 
*/ 1160 if (sd->data.dev_private == priv) 1161 break; 1162 rte_spinlock_unlock(&sd->lock); 1163 if (++port_id == RTE_DIM(mlx5_secondary_data)) 1164 port_id = 0; 1165 } 1166 /* Switch to secondary private structure. If private data has already 1167 * been updated by another thread, there is nothing else to do. */ 1168 priv = sd->data.dev_private; 1169 if (priv->dev->data == &sd->data) 1170 goto end; 1171 /* Sanity checks. Secondary private structure is supposed to point 1172 * to local eth_dev, itself still pointing to the shared device data 1173 * structure allocated by the primary process. */ 1174 assert(sd->shared_dev_data != &sd->data); 1175 assert(sd->data.nb_tx_queues == 0); 1176 assert(sd->data.tx_queues == NULL); 1177 assert(sd->data.nb_rx_queues == 0); 1178 assert(sd->data.rx_queues == NULL); 1179 assert(priv != sd->primary_priv); 1180 assert(priv->dev->data == sd->shared_dev_data); 1181 assert(priv->txqs_n == 0); 1182 assert(priv->txqs == NULL); 1183 assert(priv->rxqs_n == 0); 1184 assert(priv->rxqs == NULL); 1185 nb_tx_queues = sd->shared_dev_data->nb_tx_queues; 1186 nb_rx_queues = sd->shared_dev_data->nb_rx_queues; 1187 /* Allocate local storage for queues. */ 1188 tx_queues = rte_zmalloc("secondary ethdev->tx_queues", 1189 sizeof(sd->data.tx_queues[0]) * nb_tx_queues, 1190 RTE_CACHE_LINE_SIZE); 1191 rx_queues = rte_zmalloc("secondary ethdev->rx_queues", 1192 sizeof(sd->data.rx_queues[0]) * nb_rx_queues, 1193 RTE_CACHE_LINE_SIZE); 1194 if (tx_queues == NULL || rx_queues == NULL) 1195 goto error; 1196 /* Lock to prevent control operations during setup. */ 1197 priv_lock(priv); 1198 /* TX queues. 
*/ 1199 for (i = 0; i != nb_tx_queues; ++i) { 1200 struct txq *primary_txq = (*sd->primary_priv->txqs)[i]; 1201 struct txq *txq; 1202 1203 if (primary_txq == NULL) 1204 continue; 1205 txq = rte_calloc_socket("TXQ", 1, sizeof(*txq), 0, 1206 primary_txq->socket); 1207 if (txq != NULL) { 1208 if (txq_setup(priv->dev, 1209 txq, 1210 primary_txq->elts_n * MLX5_PMD_SGE_WR_N, 1211 primary_txq->socket, 1212 NULL) == 0) { 1213 txq->stats.idx = primary_txq->stats.idx; 1214 tx_queues[i] = txq; 1215 continue; 1216 } 1217 rte_free(txq); 1218 } 1219 while (i) { 1220 txq = tx_queues[--i]; 1221 txq_cleanup(txq); 1222 rte_free(txq); 1223 } 1224 goto error; 1225 } 1226 /* RX queues. */ 1227 for (i = 0; i != nb_rx_queues; ++i) { 1228 struct rxq *primary_rxq = (*sd->primary_priv->rxqs)[i]; 1229 1230 if (primary_rxq == NULL) 1231 continue; 1232 /* Not supported yet. */ 1233 rx_queues[i] = NULL; 1234 } 1235 /* Update everything. */ 1236 priv->txqs = (void *)tx_queues; 1237 priv->txqs_n = nb_tx_queues; 1238 priv->rxqs = (void *)rx_queues; 1239 priv->rxqs_n = nb_rx_queues; 1240 sd->data.rx_queues = rx_queues; 1241 sd->data.tx_queues = tx_queues; 1242 sd->data.nb_rx_queues = nb_rx_queues; 1243 sd->data.nb_tx_queues = nb_tx_queues; 1244 sd->data.dev_link = sd->shared_dev_data->dev_link; 1245 sd->data.mtu = sd->shared_dev_data->mtu; 1246 memcpy(sd->data.rx_queue_state, sd->shared_dev_data->rx_queue_state, 1247 sizeof(sd->data.rx_queue_state)); 1248 memcpy(sd->data.tx_queue_state, sd->shared_dev_data->tx_queue_state, 1249 sizeof(sd->data.tx_queue_state)); 1250 sd->data.dev_flags = sd->shared_dev_data->dev_flags; 1251 /* Use local data from now on. */ 1252 rte_mb(); 1253 priv->dev->data = &sd->data; 1254 rte_mb(); 1255 priv->dev->tx_pkt_burst = mlx5_tx_burst; 1256 priv->dev->rx_pkt_burst = removed_rx_burst; 1257 priv_unlock(priv); 1258 end: 1259 /* More sanity checks. 
*/ 1260 assert(priv->dev->tx_pkt_burst == mlx5_tx_burst); 1261 assert(priv->dev->rx_pkt_burst == removed_rx_burst); 1262 assert(priv->dev->data == &sd->data); 1263 rte_spinlock_unlock(&sd->lock); 1264 return priv; 1265 error: 1266 priv_unlock(priv); 1267 rte_free(tx_queues); 1268 rte_free(rx_queues); 1269 rte_spinlock_unlock(&sd->lock); 1270 return NULL; 1271 } 1272