1 /*- 2 * BSD LICENSE 3 * 4 * Copyright 2015 6WIND S.A. 5 * Copyright 2015 Mellanox. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * * Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * * Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * * Neither the name of 6WIND S.A. nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 */ 33 34 #include <stddef.h> 35 #include <assert.h> 36 #include <unistd.h> 37 #include <stdint.h> 38 #include <stdio.h> 39 #include <string.h> 40 #include <stdlib.h> 41 #include <errno.h> 42 #include <dirent.h> 43 #include <net/if.h> 44 #include <sys/ioctl.h> 45 #include <sys/socket.h> 46 #include <netinet/in.h> 47 #include <linux/if.h> 48 49 /* DPDK headers don't like -pedantic. */ 50 #ifdef PEDANTIC 51 #pragma GCC diagnostic ignored "-pedantic" 52 #endif 53 #include <rte_atomic.h> 54 #include <rte_ethdev.h> 55 #include <rte_mbuf.h> 56 #include <rte_common.h> 57 #ifdef PEDANTIC 58 #pragma GCC diagnostic error "-pedantic" 59 #endif 60 61 #include "mlx5.h" 62 #include "mlx5_rxtx.h" 63 #include "mlx5_utils.h" 64 65 /** 66 * Get interface name from private structure. 67 * 68 * @param[in] priv 69 * Pointer to private structure. 70 * @param[out] ifname 71 * Interface name output buffer. 72 * 73 * @return 74 * 0 on success, -1 on failure and errno is set. 75 */ 76 int 77 priv_get_ifname(const struct priv *priv, char (*ifname)[IF_NAMESIZE]) 78 { 79 DIR *dir; 80 struct dirent *dent; 81 unsigned int dev_type = 0; 82 unsigned int dev_port_prev = ~0u; 83 char match[IF_NAMESIZE] = ""; 84 85 { 86 MKSTR(path, "%s/device/net", priv->ctx->device->ibdev_path); 87 88 dir = opendir(path); 89 if (dir == NULL) 90 return -1; 91 } 92 while ((dent = readdir(dir)) != NULL) { 93 char *name = dent->d_name; 94 FILE *file; 95 unsigned int dev_port; 96 int r; 97 98 if ((name[0] == '.') && 99 ((name[1] == '\0') || 100 ((name[1] == '.') && (name[2] == '\0')))) 101 continue; 102 103 MKSTR(path, "%s/device/net/%s/%s", 104 priv->ctx->device->ibdev_path, name, 105 (dev_type ? "dev_id" : "dev_port")); 106 107 file = fopen(path, "rb"); 108 if (file == NULL) { 109 if (errno != ENOENT) 110 continue; 111 /* 112 * Switch to dev_id when dev_port does not exist as 113 * is the case with Linux kernel versions < 3.15. 114 */ 115 try_dev_id: 116 match[0] = '\0'; 117 if (dev_type) 118 break; 119 dev_type = 1; 120 dev_port_prev = ~0u; 121 rewinddir(dir); 122 continue; 123 } 124 r = fscanf(file, (dev_type ? "%x" : "%u"), &dev_port); 125 fclose(file); 126 if (r != 1) 127 continue; 128 /* 129 * Switch to dev_id when dev_port returns the same value for 130 * all ports. May happen when using a MOFED release older than 131 * 3.0 with a Linux kernel >= 3.15. 132 */ 133 if (dev_port == dev_port_prev) 134 goto try_dev_id; 135 dev_port_prev = dev_port; 136 if (dev_port == (priv->port - 1u)) 137 snprintf(match, sizeof(match), "%s", name); 138 } 139 closedir(dir); 140 if (match[0] == '\0') 141 return -1; 142 strncpy(*ifname, match, sizeof(*ifname)); 143 return 0; 144 } 145 146 /** 147 * Read from sysfs entry. 148 * 149 * @param[in] priv 150 * Pointer to private structure. 151 * @param[in] entry 152 * Entry name relative to sysfs path. 153 * @param[out] buf 154 * Data output buffer. 155 * @param size 156 * Buffer size. 157 * 158 * @return 159 * 0 on success, -1 on failure and errno is set. 160 */ 161 static int 162 priv_sysfs_read(const struct priv *priv, const char *entry, 163 char *buf, size_t size) 164 { 165 char ifname[IF_NAMESIZE]; 166 FILE *file; 167 int ret; 168 int err; 169 170 if (priv_get_ifname(priv, &ifname)) 171 return -1; 172 173 MKSTR(path, "%s/device/net/%s/%s", priv->ctx->device->ibdev_path, 174 ifname, entry); 175 176 file = fopen(path, "rb"); 177 if (file == NULL) 178 return -1; 179 ret = fread(buf, 1, size, file); 180 err = errno; 181 if (((size_t)ret < size) && (ferror(file))) 182 ret = -1; 183 else 184 ret = size; 185 fclose(file); 186 errno = err; 187 return ret; 188 } 189 190 /** 191 * Write to sysfs entry. 192 * 193 * @param[in] priv 194 * Pointer to private structure. 195 * @param[in] entry 196 * Entry name relative to sysfs path. 197 * @param[in] buf 198 * Data buffer. 199 * @param size 200 * Buffer size. 201 * 202 * @return 203 * 0 on success, -1 on failure and errno is set. 204 */ 205 static int 206 priv_sysfs_write(const struct priv *priv, const char *entry, 207 char *buf, size_t size) 208 { 209 char ifname[IF_NAMESIZE]; 210 FILE *file; 211 int ret; 212 int err; 213 214 if (priv_get_ifname(priv, &ifname)) 215 return -1; 216 217 MKSTR(path, "%s/device/net/%s/%s", priv->ctx->device->ibdev_path, 218 ifname, entry); 219 220 file = fopen(path, "wb"); 221 if (file == NULL) 222 return -1; 223 ret = fwrite(buf, 1, size, file); 224 err = errno; 225 if (((size_t)ret < size) || (ferror(file))) 226 ret = -1; 227 else 228 ret = size; 229 fclose(file); 230 errno = err; 231 return ret; 232 } 233 234 /** 235 * Get unsigned long sysfs property. 236 * 237 * @param priv 238 * Pointer to private structure. 239 * @param[in] name 240 * Entry name relative to sysfs path. 241 * @param[out] value 242 * Value output buffer. 243 * 244 * @return 245 * 0 on success, -1 on failure and errno is set. 246 */ 247 static int 248 priv_get_sysfs_ulong(struct priv *priv, const char *name, unsigned long *value) 249 { 250 int ret; 251 unsigned long value_ret; 252 char value_str[32]; 253 254 ret = priv_sysfs_read(priv, name, value_str, (sizeof(value_str) - 1)); 255 if (ret == -1) { 256 DEBUG("cannot read %s value from sysfs: %s", 257 name, strerror(errno)); 258 return -1; 259 } 260 value_str[ret] = '\0'; 261 errno = 0; 262 value_ret = strtoul(value_str, NULL, 0); 263 if (errno) { 264 DEBUG("invalid %s value `%s': %s", name, value_str, 265 strerror(errno)); 266 return -1; 267 } 268 *value = value_ret; 269 return 0; 270 } 271 272 /** 273 * Set unsigned long sysfs property. 274 * 275 * @param priv 276 * Pointer to private structure. 277 * @param[in] name 278 * Entry name relative to sysfs path. 279 * @param value 280 * Value to set. 281 * 282 * @return 283 * 0 on success, -1 on failure and errno is set. 284 */ 285 static int 286 priv_set_sysfs_ulong(struct priv *priv, const char *name, unsigned long value) 287 { 288 int ret; 289 MKSTR(value_str, "%lu", value); 290 291 ret = priv_sysfs_write(priv, name, value_str, (sizeof(value_str) - 1)); 292 if (ret == -1) { 293 DEBUG("cannot write %s `%s' (%lu) to sysfs: %s", 294 name, value_str, value, strerror(errno)); 295 return -1; 296 } 297 return 0; 298 } 299 300 /** 301 * Perform ifreq ioctl() on associated Ethernet device. 302 * 303 * @param[in] priv 304 * Pointer to private structure. 305 * @param req 306 * Request number to pass to ioctl(). 307 * @param[out] ifr 308 * Interface request structure output buffer. 309 * 310 * @return 311 * 0 on success, -1 on failure and errno is set. 312 */ 313 int 314 priv_ifreq(const struct priv *priv, int req, struct ifreq *ifr) 315 { 316 int sock = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP); 317 int ret = -1; 318 319 if (sock == -1) 320 return ret; 321 if (priv_get_ifname(priv, &ifr->ifr_name) == 0) 322 ret = ioctl(sock, req, ifr); 323 close(sock); 324 return ret; 325 } 326 327 /** 328 * Get device MTU. 329 * 330 * @param priv 331 * Pointer to private structure. 332 * @param[out] mtu 333 * MTU value output buffer. 334 * 335 * @return 336 * 0 on success, -1 on failure and errno is set. 337 */ 338 int 339 priv_get_mtu(struct priv *priv, uint16_t *mtu) 340 { 341 unsigned long ulong_mtu; 342 343 if (priv_get_sysfs_ulong(priv, "mtu", &ulong_mtu) == -1) 344 return -1; 345 *mtu = ulong_mtu; 346 return 0; 347 } 348 349 /** 350 * Set device MTU. 351 * 352 * @param priv 353 * Pointer to private structure. 354 * @param mtu 355 * MTU value to set. 356 * 357 * @return 358 * 0 on success, -1 on failure and errno is set. 359 */ 360 static int 361 priv_set_mtu(struct priv *priv, uint16_t mtu) 362 { 363 return priv_set_sysfs_ulong(priv, "mtu", mtu); 364 } 365 366 /** 367 * Set device flags. 368 * 369 * @param priv 370 * Pointer to private structure. 371 * @param keep 372 * Bitmask for flags that must remain untouched. 373 * @param flags 374 * Bitmask for flags to modify. 375 * 376 * @return 377 * 0 on success, -1 on failure and errno is set. 378 */ 379 int 380 priv_set_flags(struct priv *priv, unsigned int keep, unsigned int flags) 381 { 382 unsigned long tmp; 383 384 if (priv_get_sysfs_ulong(priv, "flags", &tmp) == -1) 385 return -1; 386 tmp &= keep; 387 tmp |= flags; 388 return priv_set_sysfs_ulong(priv, "flags", tmp); 389 } 390 391 /** 392 * Ethernet device configuration. 393 * 394 * Prepare the driver for a given number of TX and RX queues. 395 * Allocate parent RSS queue when several RX queues are requested. 396 * 397 * @param dev 398 * Pointer to Ethernet device structure. 399 * 400 * @return 401 * 0 on success, errno value on failure. 402 */ 403 static int 404 dev_configure(struct rte_eth_dev *dev) 405 { 406 struct priv *priv = dev->data->dev_private; 407 unsigned int rxqs_n = dev->data->nb_rx_queues; 408 unsigned int txqs_n = dev->data->nb_tx_queues; 409 unsigned int tmp; 410 int ret; 411 412 priv->rxqs = (void *)dev->data->rx_queues; 413 priv->txqs = (void *)dev->data->tx_queues; 414 if (txqs_n != priv->txqs_n) { 415 INFO("%p: TX queues number update: %u -> %u", 416 (void *)dev, priv->txqs_n, txqs_n); 417 priv->txqs_n = txqs_n; 418 } 419 if (rxqs_n == priv->rxqs_n) 420 return 0; 421 INFO("%p: RX queues number update: %u -> %u", 422 (void *)dev, priv->rxqs_n, rxqs_n); 423 /* If RSS is enabled, disable it first. */ 424 if (priv->rss) { 425 unsigned int i; 426 427 /* Only if there are no remaining child RX queues. */ 428 for (i = 0; (i != priv->rxqs_n); ++i) 429 if ((*priv->rxqs)[i] != NULL) 430 return EINVAL; 431 rxq_cleanup(&priv->rxq_parent); 432 priv->rss = 0; 433 priv->rxqs_n = 0; 434 } 435 if (rxqs_n <= 1) { 436 /* Nothing else to do. */ 437 priv->rxqs_n = rxqs_n; 438 return 0; 439 } 440 /* Allocate a new RSS parent queue if supported by hardware. */ 441 if (!priv->hw_rss) { 442 ERROR("%p: only a single RX queue can be configured when" 443 " hardware doesn't support RSS", 444 (void *)dev); 445 return EINVAL; 446 } 447 /* Fail if hardware doesn't support that many RSS queues. */ 448 if (rxqs_n >= priv->max_rss_tbl_sz) { 449 ERROR("%p: only %u RX queues can be configured for RSS", 450 (void *)dev, priv->max_rss_tbl_sz); 451 return EINVAL; 452 } 453 priv->rss = 1; 454 tmp = priv->rxqs_n; 455 priv->rxqs_n = rxqs_n; 456 ret = rxq_setup(dev, &priv->rxq_parent, 0, 0, NULL, NULL); 457 if (!ret) 458 return 0; 459 /* Failure, rollback. */ 460 priv->rss = 0; 461 priv->rxqs_n = tmp; 462 assert(ret > 0); 463 return ret; 464 } 465 466 /** 467 * DPDK callback for Ethernet device configuration. 468 * 469 * @param dev 470 * Pointer to Ethernet device structure. 471 * 472 * @return 473 * 0 on success, negative errno value on failure. 474 */ 475 int 476 mlx5_dev_configure(struct rte_eth_dev *dev) 477 { 478 struct priv *priv = dev->data->dev_private; 479 int ret; 480 481 priv_lock(priv); 482 ret = dev_configure(dev); 483 assert(ret >= 0); 484 priv_unlock(priv); 485 return -ret; 486 } 487 488 /** 489 * DPDK callback to get information about the device. 490 * 491 * @param dev 492 * Pointer to Ethernet device structure. 493 * @param[out] info 494 * Info structure output buffer. 495 */ 496 void 497 mlx5_dev_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *info) 498 { 499 struct priv *priv = dev->data->dev_private; 500 unsigned int max; 501 char ifname[IF_NAMESIZE]; 502 503 priv_lock(priv); 504 /* FIXME: we should ask the device for these values. */ 505 info->min_rx_bufsize = 32; 506 info->max_rx_pktlen = 65536; 507 /* 508 * Since we need one CQ per QP, the limit is the minimum number 509 * between the two values. 510 */ 511 max = ((priv->device_attr.max_cq > priv->device_attr.max_qp) ? 512 priv->device_attr.max_qp : priv->device_attr.max_cq); 513 /* If max >= 65535 then max = 0, max_rx_queues is uint16_t. */ 514 if (max >= 65535) 515 max = 65535; 516 info->max_rx_queues = max; 517 info->max_tx_queues = max; 518 /* Last array entry is reserved for broadcast. */ 519 info->max_mac_addrs = (RTE_DIM(priv->mac) - 1); 520 info->rx_offload_capa = 521 (priv->hw_csum ? 522 (DEV_RX_OFFLOAD_IPV4_CKSUM | 523 DEV_RX_OFFLOAD_UDP_CKSUM | 524 DEV_RX_OFFLOAD_TCP_CKSUM) : 525 0); 526 info->tx_offload_capa = 527 (priv->hw_csum ? 528 (DEV_TX_OFFLOAD_IPV4_CKSUM | 529 DEV_TX_OFFLOAD_UDP_CKSUM | 530 DEV_TX_OFFLOAD_TCP_CKSUM) : 531 0); 532 if (priv_get_ifname(priv, &ifname) == 0) 533 info->if_index = if_nametoindex(ifname); 534 priv_unlock(priv); 535 } 536 537 /** 538 * DPDK callback to change the MTU. 539 * 540 * Setting the MTU affects hardware MRU (packets larger than the MTU cannot be 541 * received). Use this as a hint to enable/disable scattered packets support 542 * and improve performance when not needed. 543 * Since failure is not an option, reconfiguring queues on the fly is not 544 * recommended. 545 * 546 * @param dev 547 * Pointer to Ethernet device structure. 548 * @param in_mtu 549 * New MTU. 550 * 551 * @return 552 * 0 on success, negative errno value on failure. 553 */ 554 int 555 mlx5_dev_set_mtu(struct rte_eth_dev *dev, uint16_t mtu) 556 { 557 struct priv *priv = dev->data->dev_private; 558 int ret = 0; 559 unsigned int i; 560 uint16_t (*rx_func)(void *, struct rte_mbuf **, uint16_t) = 561 mlx5_rx_burst; 562 563 priv_lock(priv); 564 /* Set kernel interface MTU first. */ 565 if (priv_set_mtu(priv, mtu)) { 566 ret = errno; 567 WARN("cannot set port %u MTU to %u: %s", priv->port, mtu, 568 strerror(ret)); 569 goto out; 570 } else 571 DEBUG("adapter port %u MTU set to %u", priv->port, mtu); 572 priv->mtu = mtu; 573 /* Temporarily replace RX handler with a fake one, assuming it has not 574 * been copied elsewhere. */ 575 dev->rx_pkt_burst = removed_rx_burst; 576 /* Make sure everyone has left mlx5_rx_burst() and uses 577 * removed_rx_burst() instead. */ 578 rte_wmb(); 579 usleep(1000); 580 /* Reconfigure each RX queue. */ 581 for (i = 0; (i != priv->rxqs_n); ++i) { 582 struct rxq *rxq = (*priv->rxqs)[i]; 583 unsigned int max_frame_len; 584 int sp; 585 586 if (rxq == NULL) 587 continue; 588 /* Calculate new maximum frame length according to MTU and 589 * toggle scattered support (sp) if necessary. */ 590 max_frame_len = (priv->mtu + ETHER_HDR_LEN + 591 (ETHER_MAX_VLAN_FRAME_LEN - ETHER_MAX_LEN)); 592 sp = (max_frame_len > (rxq->mb_len - RTE_PKTMBUF_HEADROOM)); 593 /* Provide new values to rxq_setup(). */ 594 dev->data->dev_conf.rxmode.jumbo_frame = sp; 595 dev->data->dev_conf.rxmode.max_rx_pkt_len = max_frame_len; 596 ret = rxq_rehash(dev, rxq); 597 if (ret) { 598 /* Force SP RX if that queue requires it and abort. */ 599 if (rxq->sp) 600 rx_func = mlx5_rx_burst_sp; 601 break; 602 } 603 /* Reenable non-RSS queue attributes. No need to check 604 * for errors at this stage. */ 605 if (!priv->rss) { 606 if (priv->started) 607 rxq_mac_addrs_add(rxq); 608 } 609 /* Scattered burst function takes priority. */ 610 if (rxq->sp) 611 rx_func = mlx5_rx_burst_sp; 612 } 613 /* Burst functions can now be called again. */ 614 rte_wmb(); 615 dev->rx_pkt_burst = rx_func; 616 out: 617 priv_unlock(priv); 618 assert(ret >= 0); 619 return -ret; 620 } 621 622 /** 623 * Get PCI information from struct ibv_device. 624 * 625 * @param device 626 * Pointer to Ethernet device structure. 627 * @param[out] pci_addr 628 * PCI bus address output buffer. 629 * 630 * @return 631 * 0 on success, -1 on failure and errno is set. 632 */ 633 int 634 mlx5_ibv_device_to_pci_addr(const struct ibv_device *device, 635 struct rte_pci_addr *pci_addr) 636 { 637 FILE *file; 638 char line[32]; 639 MKSTR(path, "%s/device/uevent", device->ibdev_path); 640 641 file = fopen(path, "rb"); 642 if (file == NULL) 643 return -1; 644 while (fgets(line, sizeof(line), file) == line) { 645 size_t len = strlen(line); 646 int ret; 647 648 /* Truncate long lines. */ 649 if (len == (sizeof(line) - 1)) 650 while (line[(len - 1)] != '\n') { 651 ret = fgetc(file); 652 if (ret == EOF) 653 break; 654 line[(len - 1)] = ret; 655 } 656 /* Extract information. */ 657 if (sscanf(line, 658 "PCI_SLOT_NAME=" 659 "%" SCNx16 ":%" SCNx8 ":%" SCNx8 ".%" SCNx8 "\n", 660 &pci_addr->domain, 661 &pci_addr->bus, 662 &pci_addr->devid, 663 &pci_addr->function) == 4) { 664 ret = 0; 665 break; 666 } 667 } 668 fclose(file); 669 return 0; 670 } 671