1 /*- 2 * BSD LICENSE 3 * 4 * Copyright 2015 6WIND S.A. 5 * Copyright 2015 Mellanox. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * * Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * * Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * * Neither the name of 6WIND S.A. nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 */ 33 34 #include <stddef.h> 35 #include <assert.h> 36 #include <unistd.h> 37 #include <stdint.h> 38 #include <stdio.h> 39 #include <string.h> 40 #include <stdlib.h> 41 #include <errno.h> 42 #include <dirent.h> 43 #include <net/if.h> 44 #include <sys/ioctl.h> 45 #include <sys/socket.h> 46 #include <netinet/in.h> 47 #include <linux/if.h> 48 #include <linux/ethtool.h> 49 #include <linux/sockios.h> 50 #include <fcntl.h> 51 52 /* DPDK headers don't like -pedantic. */ 53 #ifdef PEDANTIC 54 #pragma GCC diagnostic ignored "-pedantic" 55 #endif 56 #include <rte_atomic.h> 57 #include <rte_ethdev.h> 58 #include <rte_mbuf.h> 59 #include <rte_common.h> 60 #include <rte_interrupts.h> 61 #include <rte_alarm.h> 62 #include <rte_malloc.h> 63 #ifdef PEDANTIC 64 #pragma GCC diagnostic error "-pedantic" 65 #endif 66 67 #include "mlx5.h" 68 #include "mlx5_rxtx.h" 69 #include "mlx5_utils.h" 70 71 /** 72 * Return private structure associated with an Ethernet device. 73 * 74 * @param dev 75 * Pointer to Ethernet device structure. 76 * 77 * @return 78 * Pointer to private structure. 79 */ 80 struct priv * 81 mlx5_get_priv(struct rte_eth_dev *dev) 82 { 83 struct mlx5_secondary_data *sd; 84 85 if (!mlx5_is_secondary()) 86 return dev->data->dev_private; 87 sd = &mlx5_secondary_data[dev->data->port_id]; 88 return sd->data.dev_private; 89 } 90 91 /** 92 * Check if running as a secondary process. 93 * 94 * @return 95 * Nonzero if running as a secondary process. 96 */ 97 inline int 98 mlx5_is_secondary(void) 99 { 100 return rte_eal_process_type() != RTE_PROC_PRIMARY; 101 } 102 103 /** 104 * Get interface name from private structure. 105 * 106 * @param[in] priv 107 * Pointer to private structure. 108 * @param[out] ifname 109 * Interface name output buffer. 110 * 111 * @return 112 * 0 on success, -1 on failure and errno is set. 113 */ 114 int 115 priv_get_ifname(const struct priv *priv, char (*ifname)[IF_NAMESIZE]) 116 { 117 DIR *dir; 118 struct dirent *dent; 119 unsigned int dev_type = 0; 120 unsigned int dev_port_prev = ~0u; 121 char match[IF_NAMESIZE] = ""; 122 123 { 124 MKSTR(path, "%s/device/net", priv->ctx->device->ibdev_path); 125 126 dir = opendir(path); 127 if (dir == NULL) 128 return -1; 129 } 130 while ((dent = readdir(dir)) != NULL) { 131 char *name = dent->d_name; 132 FILE *file; 133 unsigned int dev_port; 134 int r; 135 136 if ((name[0] == '.') && 137 ((name[1] == '\0') || 138 ((name[1] == '.') && (name[2] == '\0')))) 139 continue; 140 141 MKSTR(path, "%s/device/net/%s/%s", 142 priv->ctx->device->ibdev_path, name, 143 (dev_type ? "dev_id" : "dev_port")); 144 145 file = fopen(path, "rb"); 146 if (file == NULL) { 147 if (errno != ENOENT) 148 continue; 149 /* 150 * Switch to dev_id when dev_port does not exist as 151 * is the case with Linux kernel versions < 3.15. 152 */ 153 try_dev_id: 154 match[0] = '\0'; 155 if (dev_type) 156 break; 157 dev_type = 1; 158 dev_port_prev = ~0u; 159 rewinddir(dir); 160 continue; 161 } 162 r = fscanf(file, (dev_type ? "%x" : "%u"), &dev_port); 163 fclose(file); 164 if (r != 1) 165 continue; 166 /* 167 * Switch to dev_id when dev_port returns the same value for 168 * all ports. May happen when using a MOFED release older than 169 * 3.0 with a Linux kernel >= 3.15. 170 */ 171 if (dev_port == dev_port_prev) 172 goto try_dev_id; 173 dev_port_prev = dev_port; 174 if (dev_port == (priv->port - 1u)) 175 snprintf(match, sizeof(match), "%s", name); 176 } 177 closedir(dir); 178 if (match[0] == '\0') 179 return -1; 180 strncpy(*ifname, match, sizeof(*ifname)); 181 return 0; 182 } 183 184 /** 185 * Read from sysfs entry. 186 * 187 * @param[in] priv 188 * Pointer to private structure. 189 * @param[in] entry 190 * Entry name relative to sysfs path. 191 * @param[out] buf 192 * Data output buffer. 193 * @param size 194 * Buffer size. 195 * 196 * @return 197 * 0 on success, -1 on failure and errno is set. 198 */ 199 static int 200 priv_sysfs_read(const struct priv *priv, const char *entry, 201 char *buf, size_t size) 202 { 203 char ifname[IF_NAMESIZE]; 204 FILE *file; 205 int ret; 206 int err; 207 208 if (priv_get_ifname(priv, &ifname)) 209 return -1; 210 211 MKSTR(path, "%s/device/net/%s/%s", priv->ctx->device->ibdev_path, 212 ifname, entry); 213 214 file = fopen(path, "rb"); 215 if (file == NULL) 216 return -1; 217 ret = fread(buf, 1, size, file); 218 err = errno; 219 if (((size_t)ret < size) && (ferror(file))) 220 ret = -1; 221 else 222 ret = size; 223 fclose(file); 224 errno = err; 225 return ret; 226 } 227 228 /** 229 * Write to sysfs entry. 230 * 231 * @param[in] priv 232 * Pointer to private structure. 233 * @param[in] entry 234 * Entry name relative to sysfs path. 235 * @param[in] buf 236 * Data buffer. 237 * @param size 238 * Buffer size. 239 * 240 * @return 241 * 0 on success, -1 on failure and errno is set. 242 */ 243 static int 244 priv_sysfs_write(const struct priv *priv, const char *entry, 245 char *buf, size_t size) 246 { 247 char ifname[IF_NAMESIZE]; 248 FILE *file; 249 int ret; 250 int err; 251 252 if (priv_get_ifname(priv, &ifname)) 253 return -1; 254 255 MKSTR(path, "%s/device/net/%s/%s", priv->ctx->device->ibdev_path, 256 ifname, entry); 257 258 file = fopen(path, "wb"); 259 if (file == NULL) 260 return -1; 261 ret = fwrite(buf, 1, size, file); 262 err = errno; 263 if (((size_t)ret < size) || (ferror(file))) 264 ret = -1; 265 else 266 ret = size; 267 fclose(file); 268 errno = err; 269 return ret; 270 } 271 272 /** 273 * Get unsigned long sysfs property. 274 * 275 * @param priv 276 * Pointer to private structure. 277 * @param[in] name 278 * Entry name relative to sysfs path. 279 * @param[out] value 280 * Value output buffer. 281 * 282 * @return 283 * 0 on success, -1 on failure and errno is set. 284 */ 285 static int 286 priv_get_sysfs_ulong(struct priv *priv, const char *name, unsigned long *value) 287 { 288 int ret; 289 unsigned long value_ret; 290 char value_str[32]; 291 292 ret = priv_sysfs_read(priv, name, value_str, (sizeof(value_str) - 1)); 293 if (ret == -1) { 294 DEBUG("cannot read %s value from sysfs: %s", 295 name, strerror(errno)); 296 return -1; 297 } 298 value_str[ret] = '\0'; 299 errno = 0; 300 value_ret = strtoul(value_str, NULL, 0); 301 if (errno) { 302 DEBUG("invalid %s value `%s': %s", name, value_str, 303 strerror(errno)); 304 return -1; 305 } 306 *value = value_ret; 307 return 0; 308 } 309 310 /** 311 * Set unsigned long sysfs property. 312 * 313 * @param priv 314 * Pointer to private structure. 315 * @param[in] name 316 * Entry name relative to sysfs path. 317 * @param value 318 * Value to set. 319 * 320 * @return 321 * 0 on success, -1 on failure and errno is set. 322 */ 323 static int 324 priv_set_sysfs_ulong(struct priv *priv, const char *name, unsigned long value) 325 { 326 int ret; 327 MKSTR(value_str, "%lu", value); 328 329 ret = priv_sysfs_write(priv, name, value_str, (sizeof(value_str) - 1)); 330 if (ret == -1) { 331 DEBUG("cannot write %s `%s' (%lu) to sysfs: %s", 332 name, value_str, value, strerror(errno)); 333 return -1; 334 } 335 return 0; 336 } 337 338 /** 339 * Perform ifreq ioctl() on associated Ethernet device. 340 * 341 * @param[in] priv 342 * Pointer to private structure. 343 * @param req 344 * Request number to pass to ioctl(). 345 * @param[out] ifr 346 * Interface request structure output buffer. 347 * 348 * @return 349 * 0 on success, -1 on failure and errno is set. 350 */ 351 int 352 priv_ifreq(const struct priv *priv, int req, struct ifreq *ifr) 353 { 354 int sock = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP); 355 int ret = -1; 356 357 if (sock == -1) 358 return ret; 359 if (priv_get_ifname(priv, &ifr->ifr_name) == 0) 360 ret = ioctl(sock, req, ifr); 361 close(sock); 362 return ret; 363 } 364 365 /** 366 * Get device MTU. 367 * 368 * @param priv 369 * Pointer to private structure. 370 * @param[out] mtu 371 * MTU value output buffer. 372 * 373 * @return 374 * 0 on success, -1 on failure and errno is set. 375 */ 376 int 377 priv_get_mtu(struct priv *priv, uint16_t *mtu) 378 { 379 unsigned long ulong_mtu; 380 381 if (priv_get_sysfs_ulong(priv, "mtu", &ulong_mtu) == -1) 382 return -1; 383 *mtu = ulong_mtu; 384 return 0; 385 } 386 387 /** 388 * Set device MTU. 389 * 390 * @param priv 391 * Pointer to private structure. 392 * @param mtu 393 * MTU value to set. 394 * 395 * @return 396 * 0 on success, -1 on failure and errno is set. 397 */ 398 static int 399 priv_set_mtu(struct priv *priv, uint16_t mtu) 400 { 401 return priv_set_sysfs_ulong(priv, "mtu", mtu); 402 } 403 404 /** 405 * Set device flags. 406 * 407 * @param priv 408 * Pointer to private structure. 409 * @param keep 410 * Bitmask for flags that must remain untouched. 411 * @param flags 412 * Bitmask for flags to modify. 413 * 414 * @return 415 * 0 on success, -1 on failure and errno is set. 416 */ 417 int 418 priv_set_flags(struct priv *priv, unsigned int keep, unsigned int flags) 419 { 420 unsigned long tmp; 421 422 if (priv_get_sysfs_ulong(priv, "flags", &tmp) == -1) 423 return -1; 424 tmp &= keep; 425 tmp |= flags; 426 return priv_set_sysfs_ulong(priv, "flags", tmp); 427 } 428 429 /** 430 * Ethernet device configuration. 431 * 432 * Prepare the driver for a given number of TX and RX queues. 433 * 434 * @param dev 435 * Pointer to Ethernet device structure. 436 * 437 * @return 438 * 0 on success, errno value on failure. 439 */ 440 static int 441 dev_configure(struct rte_eth_dev *dev) 442 { 443 struct priv *priv = dev->data->dev_private; 444 unsigned int rxqs_n = dev->data->nb_rx_queues; 445 unsigned int txqs_n = dev->data->nb_tx_queues; 446 unsigned int i; 447 unsigned int j; 448 unsigned int reta_idx_n; 449 450 priv->rss_hf = dev->data->dev_conf.rx_adv_conf.rss_conf.rss_hf; 451 priv->rxqs = (void *)dev->data->rx_queues; 452 priv->txqs = (void *)dev->data->tx_queues; 453 if (txqs_n != priv->txqs_n) { 454 INFO("%p: TX queues number update: %u -> %u", 455 (void *)dev, priv->txqs_n, txqs_n); 456 priv->txqs_n = txqs_n; 457 } 458 if (rxqs_n > priv->ind_table_max_size) { 459 ERROR("cannot handle this many RX queues (%u)", rxqs_n); 460 return EINVAL; 461 } 462 if (rxqs_n == priv->rxqs_n) 463 return 0; 464 INFO("%p: RX queues number update: %u -> %u", 465 (void *)dev, priv->rxqs_n, rxqs_n); 466 priv->rxqs_n = rxqs_n; 467 /* If the requested number of RX queues is not a power of two, use the 468 * maximum indirection table size for better balancing. 469 * The result is always rounded to the next power of two. */ 470 reta_idx_n = (1 << log2above((rxqs_n & (rxqs_n - 1)) ? 471 priv->ind_table_max_size : 472 rxqs_n)); 473 if (priv_rss_reta_index_resize(priv, reta_idx_n)) 474 return ENOMEM; 475 /* When the number of RX queues is not a power of two, the remaining 476 * table entries are padded with reused WQs and hashes are not spread 477 * uniformly. */ 478 for (i = 0, j = 0; (i != reta_idx_n); ++i) { 479 (*priv->reta_idx)[i] = j; 480 if (++j == rxqs_n) 481 j = 0; 482 } 483 return 0; 484 } 485 486 /** 487 * DPDK callback for Ethernet device configuration. 488 * 489 * @param dev 490 * Pointer to Ethernet device structure. 491 * 492 * @return 493 * 0 on success, negative errno value on failure. 494 */ 495 int 496 mlx5_dev_configure(struct rte_eth_dev *dev) 497 { 498 struct priv *priv = dev->data->dev_private; 499 int ret; 500 501 if (mlx5_is_secondary()) 502 return -E_RTE_SECONDARY; 503 504 priv_lock(priv); 505 ret = dev_configure(dev); 506 assert(ret >= 0); 507 priv_unlock(priv); 508 return -ret; 509 } 510 511 /** 512 * DPDK callback to get information about the device. 513 * 514 * @param dev 515 * Pointer to Ethernet device structure. 516 * @param[out] info 517 * Info structure output buffer. 518 */ 519 void 520 mlx5_dev_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *info) 521 { 522 struct priv *priv = mlx5_get_priv(dev); 523 unsigned int max; 524 char ifname[IF_NAMESIZE]; 525 526 priv_lock(priv); 527 /* FIXME: we should ask the device for these values. */ 528 info->min_rx_bufsize = 32; 529 info->max_rx_pktlen = 65536; 530 /* 531 * Since we need one CQ per QP, the limit is the minimum number 532 * between the two values. 533 */ 534 max = ((priv->device_attr.max_cq > priv->device_attr.max_qp) ? 535 priv->device_attr.max_qp : priv->device_attr.max_cq); 536 /* If max >= 65535 then max = 0, max_rx_queues is uint16_t. */ 537 if (max >= 65535) 538 max = 65535; 539 info->max_rx_queues = max; 540 info->max_tx_queues = max; 541 info->max_mac_addrs = RTE_DIM(priv->mac); 542 info->rx_offload_capa = 543 (priv->hw_csum ? 544 (DEV_RX_OFFLOAD_IPV4_CKSUM | 545 DEV_RX_OFFLOAD_UDP_CKSUM | 546 DEV_RX_OFFLOAD_TCP_CKSUM) : 547 0); 548 info->tx_offload_capa = DEV_TX_OFFLOAD_VLAN_INSERT; 549 if (priv->hw_csum) 550 info->tx_offload_capa |= 551 (DEV_TX_OFFLOAD_IPV4_CKSUM | 552 DEV_TX_OFFLOAD_UDP_CKSUM | 553 DEV_TX_OFFLOAD_TCP_CKSUM); 554 if (priv_get_ifname(priv, &ifname) == 0) 555 info->if_index = if_nametoindex(ifname); 556 /* FIXME: RETA update/query API expects the callee to know the size of 557 * the indirection table, for this PMD the size varies depending on 558 * the number of RX queues, it becomes impossible to find the correct 559 * size if it is not fixed. 560 * The API should be updated to solve this problem. */ 561 info->reta_size = priv->ind_table_max_size; 562 info->speed_capa = 563 ETH_LINK_SPEED_1G | 564 ETH_LINK_SPEED_10G | 565 ETH_LINK_SPEED_20G | 566 ETH_LINK_SPEED_25G | 567 ETH_LINK_SPEED_40G | 568 ETH_LINK_SPEED_50G | 569 ETH_LINK_SPEED_56G; 570 priv_unlock(priv); 571 } 572 573 const uint32_t * 574 mlx5_dev_supported_ptypes_get(struct rte_eth_dev *dev) 575 { 576 static const uint32_t ptypes[] = { 577 /* refers to rxq_cq_to_pkt_type() */ 578 RTE_PTYPE_L3_IPV4, 579 RTE_PTYPE_L3_IPV6, 580 RTE_PTYPE_INNER_L3_IPV4, 581 RTE_PTYPE_INNER_L3_IPV6, 582 RTE_PTYPE_UNKNOWN 583 584 }; 585 586 if (dev->rx_pkt_burst == mlx5_rx_burst || 587 dev->rx_pkt_burst == mlx5_rx_burst_sp) 588 return ptypes; 589 return NULL; 590 } 591 592 /** 593 * DPDK callback to retrieve physical link information (unlocked version). 594 * 595 * @param dev 596 * Pointer to Ethernet device structure. 597 * @param wait_to_complete 598 * Wait for request completion (ignored). 599 */ 600 static int 601 mlx5_link_update_unlocked(struct rte_eth_dev *dev, int wait_to_complete) 602 { 603 struct priv *priv = mlx5_get_priv(dev); 604 struct ethtool_cmd edata = { 605 .cmd = ETHTOOL_GSET 606 }; 607 struct ifreq ifr; 608 struct rte_eth_link dev_link; 609 int link_speed = 0; 610 611 (void)wait_to_complete; 612 if (priv_ifreq(priv, SIOCGIFFLAGS, &ifr)) { 613 WARN("ioctl(SIOCGIFFLAGS) failed: %s", strerror(errno)); 614 return -1; 615 } 616 memset(&dev_link, 0, sizeof(dev_link)); 617 dev_link.link_status = ((ifr.ifr_flags & IFF_UP) && 618 (ifr.ifr_flags & IFF_RUNNING)); 619 ifr.ifr_data = &edata; 620 if (priv_ifreq(priv, SIOCETHTOOL, &ifr)) { 621 WARN("ioctl(SIOCETHTOOL, ETHTOOL_GSET) failed: %s", 622 strerror(errno)); 623 return -1; 624 } 625 link_speed = ethtool_cmd_speed(&edata); 626 if (link_speed == -1) 627 dev_link.link_speed = 0; 628 else 629 dev_link.link_speed = link_speed; 630 dev_link.link_duplex = ((edata.duplex == DUPLEX_HALF) ? 631 ETH_LINK_HALF_DUPLEX : ETH_LINK_FULL_DUPLEX); 632 if (memcmp(&dev_link, &dev->data->dev_link, sizeof(dev_link))) { 633 /* Link status changed. */ 634 dev->data->dev_link = dev_link; 635 return 0; 636 } 637 /* Link status is still the same. */ 638 return -1; 639 } 640 641 /** 642 * DPDK callback to retrieve physical link information. 643 * 644 * @param dev 645 * Pointer to Ethernet device structure. 646 * @param wait_to_complete 647 * Wait for request completion (ignored). 648 */ 649 int 650 mlx5_link_update(struct rte_eth_dev *dev, int wait_to_complete) 651 { 652 struct priv *priv = mlx5_get_priv(dev); 653 int ret; 654 655 priv_lock(priv); 656 ret = mlx5_link_update_unlocked(dev, wait_to_complete); 657 priv_unlock(priv); 658 return ret; 659 } 660 661 /** 662 * DPDK callback to change the MTU. 663 * 664 * Setting the MTU affects hardware MRU (packets larger than the MTU cannot be 665 * received). Use this as a hint to enable/disable scattered packets support 666 * and improve performance when not needed. 667 * Since failure is not an option, reconfiguring queues on the fly is not 668 * recommended. 669 * 670 * @param dev 671 * Pointer to Ethernet device structure. 672 * @param in_mtu 673 * New MTU. 674 * 675 * @return 676 * 0 on success, negative errno value on failure. 677 */ 678 int 679 mlx5_dev_set_mtu(struct rte_eth_dev *dev, uint16_t mtu) 680 { 681 struct priv *priv = dev->data->dev_private; 682 int ret = 0; 683 unsigned int i; 684 uint16_t (*rx_func)(void *, struct rte_mbuf **, uint16_t) = 685 mlx5_rx_burst; 686 687 if (mlx5_is_secondary()) 688 return -E_RTE_SECONDARY; 689 690 priv_lock(priv); 691 /* Set kernel interface MTU first. */ 692 if (priv_set_mtu(priv, mtu)) { 693 ret = errno; 694 WARN("cannot set port %u MTU to %u: %s", priv->port, mtu, 695 strerror(ret)); 696 goto out; 697 } else 698 DEBUG("adapter port %u MTU set to %u", priv->port, mtu); 699 priv->mtu = mtu; 700 /* Temporarily replace RX handler with a fake one, assuming it has not 701 * been copied elsewhere. */ 702 dev->rx_pkt_burst = removed_rx_burst; 703 /* Make sure everyone has left mlx5_rx_burst() and uses 704 * removed_rx_burst() instead. */ 705 rte_wmb(); 706 usleep(1000); 707 /* Reconfigure each RX queue. */ 708 for (i = 0; (i != priv->rxqs_n); ++i) { 709 struct rxq *rxq = (*priv->rxqs)[i]; 710 unsigned int max_frame_len; 711 int sp; 712 713 if (rxq == NULL) 714 continue; 715 /* Calculate new maximum frame length according to MTU and 716 * toggle scattered support (sp) if necessary. */ 717 max_frame_len = (priv->mtu + ETHER_HDR_LEN + 718 (ETHER_MAX_VLAN_FRAME_LEN - ETHER_MAX_LEN)); 719 sp = (max_frame_len > (rxq->mb_len - RTE_PKTMBUF_HEADROOM)); 720 /* Provide new values to rxq_setup(). */ 721 dev->data->dev_conf.rxmode.jumbo_frame = sp; 722 dev->data->dev_conf.rxmode.max_rx_pkt_len = max_frame_len; 723 ret = rxq_rehash(dev, rxq); 724 if (ret) { 725 /* Force SP RX if that queue requires it and abort. */ 726 if (rxq->sp) 727 rx_func = mlx5_rx_burst_sp; 728 break; 729 } 730 /* Scattered burst function takes priority. */ 731 if (rxq->sp) 732 rx_func = mlx5_rx_burst_sp; 733 } 734 /* Burst functions can now be called again. */ 735 rte_wmb(); 736 dev->rx_pkt_burst = rx_func; 737 out: 738 priv_unlock(priv); 739 assert(ret >= 0); 740 return -ret; 741 } 742 743 /** 744 * DPDK callback to get flow control status. 745 * 746 * @param dev 747 * Pointer to Ethernet device structure. 748 * @param[out] fc_conf 749 * Flow control output buffer. 750 * 751 * @return 752 * 0 on success, negative errno value on failure. 753 */ 754 int 755 mlx5_dev_get_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf) 756 { 757 struct priv *priv = dev->data->dev_private; 758 struct ifreq ifr; 759 struct ethtool_pauseparam ethpause = { 760 .cmd = ETHTOOL_GPAUSEPARAM 761 }; 762 int ret; 763 764 if (mlx5_is_secondary()) 765 return -E_RTE_SECONDARY; 766 767 ifr.ifr_data = ðpause; 768 priv_lock(priv); 769 if (priv_ifreq(priv, SIOCETHTOOL, &ifr)) { 770 ret = errno; 771 WARN("ioctl(SIOCETHTOOL, ETHTOOL_GPAUSEPARAM)" 772 " failed: %s", 773 strerror(ret)); 774 goto out; 775 } 776 777 fc_conf->autoneg = ethpause.autoneg; 778 if (ethpause.rx_pause && ethpause.tx_pause) 779 fc_conf->mode = RTE_FC_FULL; 780 else if (ethpause.rx_pause) 781 fc_conf->mode = RTE_FC_RX_PAUSE; 782 else if (ethpause.tx_pause) 783 fc_conf->mode = RTE_FC_TX_PAUSE; 784 else 785 fc_conf->mode = RTE_FC_NONE; 786 ret = 0; 787 788 out: 789 priv_unlock(priv); 790 assert(ret >= 0); 791 return -ret; 792 } 793 794 /** 795 * DPDK callback to modify flow control parameters. 796 * 797 * @param dev 798 * Pointer to Ethernet device structure. 799 * @param[in] fc_conf 800 * Flow control parameters. 801 * 802 * @return 803 * 0 on success, negative errno value on failure. 804 */ 805 int 806 mlx5_dev_set_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf) 807 { 808 struct priv *priv = dev->data->dev_private; 809 struct ifreq ifr; 810 struct ethtool_pauseparam ethpause = { 811 .cmd = ETHTOOL_SPAUSEPARAM 812 }; 813 int ret; 814 815 if (mlx5_is_secondary()) 816 return -E_RTE_SECONDARY; 817 818 ifr.ifr_data = ðpause; 819 ethpause.autoneg = fc_conf->autoneg; 820 if (((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL) || 821 (fc_conf->mode & RTE_FC_RX_PAUSE)) 822 ethpause.rx_pause = 1; 823 else 824 ethpause.rx_pause = 0; 825 826 if (((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL) || 827 (fc_conf->mode & RTE_FC_TX_PAUSE)) 828 ethpause.tx_pause = 1; 829 else 830 ethpause.tx_pause = 0; 831 832 priv_lock(priv); 833 if (priv_ifreq(priv, SIOCETHTOOL, &ifr)) { 834 ret = errno; 835 WARN("ioctl(SIOCETHTOOL, ETHTOOL_SPAUSEPARAM)" 836 " failed: %s", 837 strerror(ret)); 838 goto out; 839 } 840 ret = 0; 841 842 out: 843 priv_unlock(priv); 844 assert(ret >= 0); 845 return -ret; 846 } 847 848 /** 849 * Get PCI information from struct ibv_device. 850 * 851 * @param device 852 * Pointer to Ethernet device structure. 853 * @param[out] pci_addr 854 * PCI bus address output buffer. 855 * 856 * @return 857 * 0 on success, -1 on failure and errno is set. 858 */ 859 int 860 mlx5_ibv_device_to_pci_addr(const struct ibv_device *device, 861 struct rte_pci_addr *pci_addr) 862 { 863 FILE *file; 864 char line[32]; 865 MKSTR(path, "%s/device/uevent", device->ibdev_path); 866 867 file = fopen(path, "rb"); 868 if (file == NULL) 869 return -1; 870 while (fgets(line, sizeof(line), file) == line) { 871 size_t len = strlen(line); 872 int ret; 873 874 /* Truncate long lines. */ 875 if (len == (sizeof(line) - 1)) 876 while (line[(len - 1)] != '\n') { 877 ret = fgetc(file); 878 if (ret == EOF) 879 break; 880 line[(len - 1)] = ret; 881 } 882 /* Extract information. */ 883 if (sscanf(line, 884 "PCI_SLOT_NAME=" 885 "%" SCNx16 ":%" SCNx8 ":%" SCNx8 ".%" SCNx8 "\n", 886 &pci_addr->domain, 887 &pci_addr->bus, 888 &pci_addr->devid, 889 &pci_addr->function) == 4) { 890 ret = 0; 891 break; 892 } 893 } 894 fclose(file); 895 return 0; 896 } 897 898 /** 899 * Link status handler. 900 * 901 * @param priv 902 * Pointer to private structure. 903 * @param dev 904 * Pointer to the rte_eth_dev structure. 905 * 906 * @return 907 * Nonzero if the callback process can be called immediately. 908 */ 909 static int 910 priv_dev_link_status_handler(struct priv *priv, struct rte_eth_dev *dev) 911 { 912 struct ibv_async_event event; 913 int port_change = 0; 914 int ret = 0; 915 916 /* Read all message and acknowledge them. */ 917 for (;;) { 918 if (ibv_get_async_event(priv->ctx, &event)) 919 break; 920 921 if (event.event_type == IBV_EVENT_PORT_ACTIVE || 922 event.event_type == IBV_EVENT_PORT_ERR) 923 port_change = 1; 924 else 925 DEBUG("event type %d on port %d not handled", 926 event.event_type, event.element.port_num); 927 ibv_ack_async_event(&event); 928 } 929 930 if (port_change ^ priv->pending_alarm) { 931 struct rte_eth_link *link = &dev->data->dev_link; 932 933 priv->pending_alarm = 0; 934 mlx5_link_update_unlocked(dev, 0); 935 if (((link->link_speed == 0) && link->link_status) || 936 ((link->link_speed != 0) && !link->link_status)) { 937 /* Inconsistent status, check again later. */ 938 priv->pending_alarm = 1; 939 rte_eal_alarm_set(MLX5_ALARM_TIMEOUT_US, 940 mlx5_dev_link_status_handler, 941 dev); 942 } else 943 ret = 1; 944 } 945 return ret; 946 } 947 948 /** 949 * Handle delayed link status event. 950 * 951 * @param arg 952 * Registered argument. 953 */ 954 void 955 mlx5_dev_link_status_handler(void *arg) 956 { 957 struct rte_eth_dev *dev = arg; 958 struct priv *priv = dev->data->dev_private; 959 int ret; 960 961 priv_lock(priv); 962 assert(priv->pending_alarm == 1); 963 ret = priv_dev_link_status_handler(priv, dev); 964 priv_unlock(priv); 965 if (ret) 966 _rte_eth_dev_callback_process(dev, RTE_ETH_EVENT_INTR_LSC); 967 } 968 969 /** 970 * Handle interrupts from the NIC. 971 * 972 * @param[in] intr_handle 973 * Interrupt handler. 974 * @param cb_arg 975 * Callback argument. 976 */ 977 void 978 mlx5_dev_interrupt_handler(struct rte_intr_handle *intr_handle, void *cb_arg) 979 { 980 struct rte_eth_dev *dev = cb_arg; 981 struct priv *priv = dev->data->dev_private; 982 int ret; 983 984 (void)intr_handle; 985 priv_lock(priv); 986 ret = priv_dev_link_status_handler(priv, dev); 987 priv_unlock(priv); 988 if (ret) 989 _rte_eth_dev_callback_process(dev, RTE_ETH_EVENT_INTR_LSC); 990 } 991 992 /** 993 * Uninstall interrupt handler. 994 * 995 * @param priv 996 * Pointer to private structure. 997 * @param dev 998 * Pointer to the rte_eth_dev structure. 999 */ 1000 void 1001 priv_dev_interrupt_handler_uninstall(struct priv *priv, struct rte_eth_dev *dev) 1002 { 1003 if (!dev->data->dev_conf.intr_conf.lsc) 1004 return; 1005 rte_intr_callback_unregister(&priv->intr_handle, 1006 mlx5_dev_interrupt_handler, 1007 dev); 1008 if (priv->pending_alarm) 1009 rte_eal_alarm_cancel(mlx5_dev_link_status_handler, dev); 1010 priv->pending_alarm = 0; 1011 priv->intr_handle.fd = 0; 1012 priv->intr_handle.type = 0; 1013 } 1014 1015 /** 1016 * Install interrupt handler. 1017 * 1018 * @param priv 1019 * Pointer to private structure. 1020 * @param dev 1021 * Pointer to the rte_eth_dev structure. 1022 */ 1023 void 1024 priv_dev_interrupt_handler_install(struct priv *priv, struct rte_eth_dev *dev) 1025 { 1026 int rc, flags; 1027 1028 if (!dev->data->dev_conf.intr_conf.lsc) 1029 return; 1030 assert(priv->ctx->async_fd > 0); 1031 flags = fcntl(priv->ctx->async_fd, F_GETFL); 1032 rc = fcntl(priv->ctx->async_fd, F_SETFL, flags | O_NONBLOCK); 1033 if (rc < 0) { 1034 INFO("failed to change file descriptor async event queue"); 1035 dev->data->dev_conf.intr_conf.lsc = 0; 1036 } else { 1037 priv->intr_handle.fd = priv->ctx->async_fd; 1038 priv->intr_handle.type = RTE_INTR_HANDLE_EXT; 1039 rte_intr_callback_register(&priv->intr_handle, 1040 mlx5_dev_interrupt_handler, 1041 dev); 1042 } 1043 } 1044 1045 /** 1046 * Change the link state (UP / DOWN). 1047 * 1048 * @param dev 1049 * Pointer to Ethernet device structure. 1050 * @param up 1051 * Nonzero for link up, otherwise link down. 1052 * 1053 * @return 1054 * 0 on success, errno value on failure. 1055 */ 1056 static int 1057 priv_set_link(struct priv *priv, int up) 1058 { 1059 struct rte_eth_dev *dev = priv->dev; 1060 int err; 1061 unsigned int i; 1062 1063 if (up) { 1064 err = priv_set_flags(priv, ~IFF_UP, IFF_UP); 1065 if (err) 1066 return err; 1067 for (i = 0; i < priv->rxqs_n; i++) 1068 if ((*priv->rxqs)[i]->sp) 1069 break; 1070 /* Check if an sp queue exists. 1071 * Note: Some old frames might be received. 1072 */ 1073 if (i == priv->rxqs_n) 1074 dev->rx_pkt_burst = mlx5_rx_burst; 1075 else 1076 dev->rx_pkt_burst = mlx5_rx_burst_sp; 1077 dev->tx_pkt_burst = mlx5_tx_burst; 1078 } else { 1079 err = priv_set_flags(priv, ~IFF_UP, ~IFF_UP); 1080 if (err) 1081 return err; 1082 dev->rx_pkt_burst = removed_rx_burst; 1083 dev->tx_pkt_burst = removed_tx_burst; 1084 } 1085 return 0; 1086 } 1087 1088 /** 1089 * DPDK callback to bring the link DOWN. 1090 * 1091 * @param dev 1092 * Pointer to Ethernet device structure. 1093 * 1094 * @return 1095 * 0 on success, errno value on failure. 1096 */ 1097 int 1098 mlx5_set_link_down(struct rte_eth_dev *dev) 1099 { 1100 struct priv *priv = dev->data->dev_private; 1101 int err; 1102 1103 priv_lock(priv); 1104 err = priv_set_link(priv, 0); 1105 priv_unlock(priv); 1106 return err; 1107 } 1108 1109 /** 1110 * DPDK callback to bring the link UP. 1111 * 1112 * @param dev 1113 * Pointer to Ethernet device structure. 1114 * 1115 * @return 1116 * 0 on success, errno value on failure. 1117 */ 1118 int 1119 mlx5_set_link_up(struct rte_eth_dev *dev) 1120 { 1121 struct priv *priv = dev->data->dev_private; 1122 int err; 1123 1124 priv_lock(priv); 1125 err = priv_set_link(priv, 1); 1126 priv_unlock(priv); 1127 return err; 1128 } 1129 1130 /** 1131 * Configure secondary process queues from a private data pointer (primary 1132 * or secondary) and update burst callbacks. Can take place only once. 1133 * 1134 * All queues must have been previously created by the primary process to 1135 * avoid undefined behavior. 1136 * 1137 * @param priv 1138 * Private data pointer from either primary or secondary process. 1139 * 1140 * @return 1141 * Private data pointer from secondary process, NULL in case of error. 1142 */ 1143 struct priv * 1144 mlx5_secondary_data_setup(struct priv *priv) 1145 { 1146 unsigned int port_id = 0; 1147 struct mlx5_secondary_data *sd; 1148 void **tx_queues; 1149 void **rx_queues; 1150 unsigned int nb_tx_queues; 1151 unsigned int nb_rx_queues; 1152 unsigned int i; 1153 1154 /* priv must be valid at this point. */ 1155 assert(priv != NULL); 1156 /* priv->dev must also be valid but may point to local memory from 1157 * another process, possibly with the same address and must not 1158 * be dereferenced yet. */ 1159 assert(priv->dev != NULL); 1160 /* Determine port ID by finding out where priv comes from. */ 1161 while (1) { 1162 sd = &mlx5_secondary_data[port_id]; 1163 rte_spinlock_lock(&sd->lock); 1164 /* Primary process? */ 1165 if (sd->primary_priv == priv) 1166 break; 1167 /* Secondary process? */ 1168 if (sd->data.dev_private == priv) 1169 break; 1170 rte_spinlock_unlock(&sd->lock); 1171 if (++port_id == RTE_DIM(mlx5_secondary_data)) 1172 port_id = 0; 1173 } 1174 /* Switch to secondary private structure. If private data has already 1175 * been updated by another thread, there is nothing else to do. */ 1176 priv = sd->data.dev_private; 1177 if (priv->dev->data == &sd->data) 1178 goto end; 1179 /* Sanity checks. Secondary private structure is supposed to point 1180 * to local eth_dev, itself still pointing to the shared device data 1181 * structure allocated by the primary process. */ 1182 assert(sd->shared_dev_data != &sd->data); 1183 assert(sd->data.nb_tx_queues == 0); 1184 assert(sd->data.tx_queues == NULL); 1185 assert(sd->data.nb_rx_queues == 0); 1186 assert(sd->data.rx_queues == NULL); 1187 assert(priv != sd->primary_priv); 1188 assert(priv->dev->data == sd->shared_dev_data); 1189 assert(priv->txqs_n == 0); 1190 assert(priv->txqs == NULL); 1191 assert(priv->rxqs_n == 0); 1192 assert(priv->rxqs == NULL); 1193 nb_tx_queues = sd->shared_dev_data->nb_tx_queues; 1194 nb_rx_queues = sd->shared_dev_data->nb_rx_queues; 1195 /* Allocate local storage for queues. */ 1196 tx_queues = rte_zmalloc("secondary ethdev->tx_queues", 1197 sizeof(sd->data.tx_queues[0]) * nb_tx_queues, 1198 RTE_CACHE_LINE_SIZE); 1199 rx_queues = rte_zmalloc("secondary ethdev->rx_queues", 1200 sizeof(sd->data.rx_queues[0]) * nb_rx_queues, 1201 RTE_CACHE_LINE_SIZE); 1202 if (tx_queues == NULL || rx_queues == NULL) 1203 goto error; 1204 /* Lock to prevent control operations during setup. */ 1205 priv_lock(priv); 1206 /* TX queues. */ 1207 for (i = 0; i != nb_tx_queues; ++i) { 1208 struct txq *primary_txq = (*sd->primary_priv->txqs)[i]; 1209 struct txq *txq; 1210 1211 if (primary_txq == NULL) 1212 continue; 1213 txq = rte_calloc_socket("TXQ", 1, sizeof(*txq), 0, 1214 primary_txq->socket); 1215 if (txq != NULL) { 1216 if (txq_setup(priv->dev, 1217 txq, 1218 primary_txq->elts_n * MLX5_PMD_SGE_WR_N, 1219 primary_txq->socket, 1220 NULL) == 0) { 1221 txq->stats.idx = primary_txq->stats.idx; 1222 tx_queues[i] = txq; 1223 continue; 1224 } 1225 rte_free(txq); 1226 } 1227 while (i) { 1228 txq = tx_queues[--i]; 1229 txq_cleanup(txq); 1230 rte_free(txq); 1231 } 1232 goto error; 1233 } 1234 /* RX queues. */ 1235 for (i = 0; i != nb_rx_queues; ++i) { 1236 struct rxq *primary_rxq = (*sd->primary_priv->rxqs)[i]; 1237 1238 if (primary_rxq == NULL) 1239 continue; 1240 /* Not supported yet. */ 1241 rx_queues[i] = NULL; 1242 } 1243 /* Update everything. */ 1244 priv->txqs = (void *)tx_queues; 1245 priv->txqs_n = nb_tx_queues; 1246 priv->rxqs = (void *)rx_queues; 1247 priv->rxqs_n = nb_rx_queues; 1248 sd->data.rx_queues = rx_queues; 1249 sd->data.tx_queues = tx_queues; 1250 sd->data.nb_rx_queues = nb_rx_queues; 1251 sd->data.nb_tx_queues = nb_tx_queues; 1252 sd->data.dev_link = sd->shared_dev_data->dev_link; 1253 sd->data.mtu = sd->shared_dev_data->mtu; 1254 memcpy(sd->data.rx_queue_state, sd->shared_dev_data->rx_queue_state, 1255 sizeof(sd->data.rx_queue_state)); 1256 memcpy(sd->data.tx_queue_state, sd->shared_dev_data->tx_queue_state, 1257 sizeof(sd->data.tx_queue_state)); 1258 sd->data.dev_flags = sd->shared_dev_data->dev_flags; 1259 /* Use local data from now on. */ 1260 rte_mb(); 1261 priv->dev->data = &sd->data; 1262 rte_mb(); 1263 priv->dev->tx_pkt_burst = mlx5_tx_burst; 1264 priv->dev->rx_pkt_burst = removed_rx_burst; 1265 priv_unlock(priv); 1266 end: 1267 /* More sanity checks. */ 1268 assert(priv->dev->tx_pkt_burst == mlx5_tx_burst); 1269 assert(priv->dev->rx_pkt_burst == removed_rx_burst); 1270 assert(priv->dev->data == &sd->data); 1271 rte_spinlock_unlock(&sd->lock); 1272 return priv; 1273 error: 1274 priv_unlock(priv); 1275 rte_free(tx_queues); 1276 rte_free(rx_queues); 1277 rte_spinlock_unlock(&sd->lock); 1278 return NULL; 1279 } 1280