/*-
 *   BSD LICENSE
 *
 *   Copyright 2012-2015 6WIND S.A.
 *   Copyright 2012 Mellanox.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of 6WIND S.A. nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Known limitations:
 * - RSS hash key and options cannot be modified.
 * - Hardware counters aren't implemented.
 */

/* System headers. */
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <inttypes.h>
#include <string.h>
#include <errno.h>
#include <unistd.h>
#include <limits.h>
#include <assert.h>
#include <arpa/inet.h>
#include <net/if.h>
#include <dirent.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/if.h>
#include <linux/ethtool.h>
#include <linux/sockios.h>
#include <fcntl.h>

/* Verbs header. */
/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
#ifdef PEDANTIC
#pragma GCC diagnostic ignored "-pedantic"
#endif
#include <infiniband/verbs.h>
#ifdef PEDANTIC
#pragma GCC diagnostic error "-pedantic"
#endif

/* DPDK headers don't like -pedantic. */
#ifdef PEDANTIC
#pragma GCC diagnostic ignored "-pedantic"
#endif
#include <rte_ether.h>
#include <rte_ethdev.h>
#include <rte_dev.h>
#include <rte_mbuf.h>
#include <rte_errno.h>
#include <rte_mempool.h>
#include <rte_prefetch.h>
#include <rte_malloc.h>
#include <rte_spinlock.h>
#include <rte_atomic.h>
#include <rte_version.h>
#include <rte_log.h>
#include <rte_alarm.h>
#ifdef PEDANTIC
#pragma GCC diagnostic error "-pedantic"
#endif

/* Generated configuration header. */
#include "mlx4_autoconf.h"

/* PMD header. */
#include "mlx4.h"

/* Runtime logging through RTE_LOG() is enabled when not in debugging mode.
 * Intermediate LOG_*() macros add the required end-of-line characters. */
#ifndef NDEBUG
#define INFO(...) DEBUG(__VA_ARGS__)
#define WARN(...) DEBUG(__VA_ARGS__)
#define ERROR(...) DEBUG(__VA_ARGS__)
#else
#define LOG__(level, m, ...) \
	RTE_LOG(level, PMD, MLX4_DRIVER_NAME ": " m "%c", __VA_ARGS__)
#define LOG_(level, ...) LOG__(level, __VA_ARGS__, '\n')
#define INFO(...) LOG_(INFO, __VA_ARGS__)
#define WARN(...) LOG_(WARNING, __VA_ARGS__)
#define ERROR(...) LOG_(ERR, __VA_ARGS__)
#endif

/* Convenience macros for accessing mbuf fields. */
#define NEXT(m) ((m)->next)
#define DATA_LEN(m) ((m)->data_len)
#define PKT_LEN(m) ((m)->pkt_len)
#define DATA_OFF(m) ((m)->data_off)
#define SET_DATA_OFF(m, o) ((m)->data_off = (o))
#define NB_SEGS(m) ((m)->nb_segs)
#define PORT(m) ((m)->port)

/* Work Request ID data type (64 bit). */
typedef union {
	struct {
		uint32_t id;
		uint16_t offset;
	} data;
	uint64_t raw;
} wr_id_t;

#define WR_ID(o) (((wr_id_t *)&(o))->data)

/* Compile-time check. */
static inline void wr_id_t_check(void)
{
	wr_id_t check[1 + (2 * -!(sizeof(wr_id_t) == sizeof(uint64_t)))];

	(void)check;
	(void)wr_id_t_check;
}

/* Transpose flags. Useful to convert IBV to DPDK flags. */
#define TRANSPOSE(val, from, to) \
	(((from) >= (to)) ? \
	 (((val) & (from)) / ((from) / (to))) : \
	 (((val) & (from)) * ((to) / (from))))

struct mlx4_rxq_stats {
	unsigned int idx; /**< Mapping index. */
#ifdef MLX4_PMD_SOFT_COUNTERS
	uint64_t ipackets; /**< Total of successfully received packets. */
	uint64_t ibytes; /**< Total of successfully received bytes. */
#endif
	uint64_t idropped; /**< Total of packets dropped when RX ring full. */
	uint64_t rx_nombuf; /**< Total of RX mbuf allocation failures. */
};

struct mlx4_txq_stats {
	unsigned int idx; /**< Mapping index. */
#ifdef MLX4_PMD_SOFT_COUNTERS
	uint64_t opackets; /**< Total of successfully sent packets. */
	uint64_t obytes; /**< Total of successfully sent bytes. */
#endif
	uint64_t odropped; /**< Total of packets not sent when TX ring full. */
};

/* RX element (scattered packets). */
struct rxq_elt_sp {
	struct ibv_recv_wr wr; /* Work Request. */
	struct ibv_sge sges[MLX4_PMD_SGE_WR_N]; /* Scatter/Gather Elements. */
	struct rte_mbuf *bufs[MLX4_PMD_SGE_WR_N]; /* SGEs buffers. */
};

/* RX element. */
struct rxq_elt {
	struct ibv_recv_wr wr; /* Work Request. */
	struct ibv_sge sge; /* Scatter/Gather Element. */
	/* mbuf pointer is derived from WR_ID(wr.wr_id).offset. */
};

/* RX queue descriptor. */
struct rxq {
	struct priv *priv; /* Back pointer to private data. */
	struct rte_mempool *mp; /* Memory Pool for allocations. */
	struct ibv_mr *mr; /* Memory Region (for mp). */
	struct ibv_cq *cq; /* Completion Queue. */
	struct ibv_qp *qp; /* Queue Pair. */
	struct ibv_exp_qp_burst_family *if_qp; /* QP burst interface. */
	struct ibv_exp_cq_family *if_cq; /* CQ interface. */
	/*
	 * Each VLAN ID requires a separate flow steering rule.
	 */
	BITFIELD_DECLARE(mac_configured, uint32_t, MLX4_MAX_MAC_ADDRESSES);
	struct ibv_flow *mac_flow[MLX4_MAX_MAC_ADDRESSES][MLX4_MAX_VLAN_IDS];
	struct ibv_flow *promisc_flow; /* Promiscuous flow. */
	struct ibv_flow *allmulti_flow; /* Multicast flow. */
	unsigned int port_id; /* Port ID for incoming packets. */
	unsigned int elts_n; /* (*elts)[] length. */
	unsigned int elts_head; /* Current index in (*elts)[]. */
	union {
		struct rxq_elt_sp (*sp)[]; /* Scattered RX elements. */
		struct rxq_elt (*no_sp)[]; /* RX elements. */
	} elts;
	unsigned int sp:1; /* Use scattered RX elements. */
	unsigned int csum:1; /* Enable checksum offloading. */
	unsigned int csum_l2tun:1; /* Same for L2 tunnels. */
	uint32_t mb_len; /* Length of a mp-issued mbuf. */
	struct mlx4_rxq_stats stats; /* RX queue counters. */
	unsigned int socket; /* CPU socket ID for allocations. */
	struct ibv_exp_res_domain *rd; /* Resource Domain. */
};

/* TX element. */
struct txq_elt {
	struct rte_mbuf *buf;
};

/* Linear buffer type. It is used when transmitting buffers with too many
 * segments that do not fit the hardware queue (see max_send_sge).
 * Extra segments are copied (linearized) in such buffers, replacing the
 * last SGE during TX.
 * The size is arbitrary but large enough to hold a jumbo frame with
 * 8 segments considering mbuf.buf_len is about 2048 bytes. */
typedef uint8_t linear_t[16384];

/* TX queue descriptor. */
struct txq {
	struct priv *priv; /* Back pointer to private data. */
	struct {
		const struct rte_mempool *mp; /* Cached Memory Pool. */
		struct ibv_mr *mr; /* Memory Region (for mp). */
		uint32_t lkey; /* mr->lkey */
	} mp2mr[MLX4_PMD_TX_MP_CACHE]; /* MP to MR translation table. */
	struct ibv_cq *cq; /* Completion Queue. */
	struct ibv_qp *qp; /* Queue Pair. */
	struct ibv_exp_qp_burst_family *if_qp; /* QP burst interface. */
	struct ibv_exp_cq_family *if_cq; /* CQ interface. */
#if MLX4_PMD_MAX_INLINE > 0
	uint32_t max_inline; /* Max inline send size <= MLX4_PMD_MAX_INLINE. */
#endif
	unsigned int elts_n; /* (*elts)[] length. */
	struct txq_elt (*elts)[]; /* TX elements. */
	unsigned int elts_head; /* Current index in (*elts)[]. */
	unsigned int elts_tail; /* First element awaiting completion. */
	unsigned int elts_comp; /* Number of completion requests. */
	unsigned int elts_comp_cd; /* Countdown for next completion request. */
	unsigned int elts_comp_cd_init; /* Initial value for countdown. */
	struct mlx4_txq_stats stats; /* TX queue counters. */
	linear_t (*elts_linear)[]; /* Linearized buffers. */
	struct ibv_mr *mr_linear; /* Memory Region for linearized buffers. */
	unsigned int socket; /* CPU socket ID for allocations. */
	struct ibv_exp_res_domain *rd; /* Resource Domain. */
};

struct priv {
	struct rte_eth_dev *dev; /* Ethernet device. */
	struct ibv_context *ctx; /* Verbs context. */
	struct ibv_device_attr device_attr; /* Device properties. */
	struct ibv_pd *pd; /* Protection Domain. */
	/*
	 * MAC addresses array and configuration bit-field.
	 * An extra entry that cannot be modified by the DPDK is reserved
	 * for broadcast frames (destination MAC address ff:ff:ff:ff:ff:ff).
	 */
	struct ether_addr mac[MLX4_MAX_MAC_ADDRESSES];
	BITFIELD_DECLARE(mac_configured, uint32_t, MLX4_MAX_MAC_ADDRESSES);
	/* VLAN filters. */
	struct {
		unsigned int enabled:1; /* If enabled. */
		unsigned int id:12; /* VLAN ID (0-4095). */
	} vlan_filter[MLX4_MAX_VLAN_IDS]; /* VLAN filters table. */
	/* Device properties. */
	uint16_t mtu; /* Configured MTU. */
	uint8_t port; /* Physical port number. */
	unsigned int started:1; /* Device started, flows enabled. */
	unsigned int promisc:1; /* Device in promiscuous mode.
*/ 278 unsigned int allmulti:1; /* Device receives all multicast packets. */ 279 unsigned int hw_qpg:1; /* QP groups are supported. */ 280 unsigned int hw_tss:1; /* TSS is supported. */ 281 unsigned int hw_rss:1; /* RSS is supported. */ 282 unsigned int hw_csum:1; /* Checksum offload is supported. */ 283 unsigned int hw_csum_l2tun:1; /* Same for L2 tunnels. */ 284 unsigned int rss:1; /* RSS is enabled. */ 285 unsigned int vf:1; /* This is a VF device. */ 286 unsigned int pending_alarm:1; /* An alarm is pending. */ 287 #ifdef INLINE_RECV 288 unsigned int inl_recv_size; /* Inline recv size */ 289 #endif 290 unsigned int max_rss_tbl_sz; /* Maximum number of RSS queues. */ 291 /* RX/TX queues. */ 292 struct rxq rxq_parent; /* Parent queue when RSS is enabled. */ 293 unsigned int rxqs_n; /* RX queues array size. */ 294 unsigned int txqs_n; /* TX queues array size. */ 295 struct rxq *(*rxqs)[]; /* RX queues. */ 296 struct txq *(*txqs)[]; /* TX queues. */ 297 struct rte_intr_handle intr_handle; /* Interrupt handler. */ 298 rte_spinlock_t lock; /* Lock for control functions. */ 299 }; 300 301 /* Local storage for secondary process data. */ 302 struct mlx4_secondary_data { 303 struct rte_eth_dev_data data; /* Local device data. */ 304 struct priv *primary_priv; /* Private structure from primary. */ 305 struct rte_eth_dev_data *shared_dev_data; /* Shared device data. */ 306 rte_spinlock_t lock; /* Port configuration lock. */ 307 } mlx4_secondary_data[RTE_MAX_ETHPORTS]; 308 309 /** 310 * Check if running as a secondary process. 311 * 312 * @return 313 * Nonzero if running as a secondary process. 314 */ 315 static inline int 316 mlx4_is_secondary(void) 317 { 318 return rte_eal_process_type() != RTE_PROC_PRIMARY; 319 } 320 321 /** 322 * Return private structure associated with an Ethernet device. 323 * 324 * @param dev 325 * Pointer to Ethernet device structure. 326 * 327 * @return 328 * Pointer to private structure. 329 */ 330 static struct priv * 331 mlx4_get_priv(struct rte_eth_dev *dev) 332 { 333 struct mlx4_secondary_data *sd; 334 335 if (!mlx4_is_secondary()) 336 return dev->data->dev_private; 337 sd = &mlx4_secondary_data[dev->data->port_id]; 338 return sd->data.dev_private; 339 } 340 341 /** 342 * Lock private structure to protect it from concurrent access in the 343 * control path. 344 * 345 * @param priv 346 * Pointer to private structure. 347 */ 348 static void 349 priv_lock(struct priv *priv) 350 { 351 rte_spinlock_lock(&priv->lock); 352 } 353 354 /** 355 * Unlock private structure. 356 * 357 * @param priv 358 * Pointer to private structure. 359 */ 360 static void 361 priv_unlock(struct priv *priv) 362 { 363 rte_spinlock_unlock(&priv->lock); 364 } 365 366 /* Allocate a buffer on the stack and fill it with a printf format string. */ 367 #define MKSTR(name, ...) \ 368 char name[snprintf(NULL, 0, __VA_ARGS__) + 1]; \ 369 \ 370 snprintf(name, sizeof(name), __VA_ARGS__) 371 372 /** 373 * Get interface name from private structure. 374 * 375 * @param[in] priv 376 * Pointer to private structure. 377 * @param[out] ifname 378 * Interface name output buffer. 379 * 380 * @return 381 * 0 on success, -1 on failure and errno is set. 
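 *
 * Note: the lookup walks "<ibdev_path>/device/net/" and selects the netdev
 * whose "dev_port" entry (or "dev_id" on Linux kernels older than 3.15)
 * equals the physical port number minus one. For example, assuming a
 * hypothetical device named "mlx4_0" whose ibdev_path is typically
 * /sys/class/infiniband/mlx4_0, port 2 resolves to the interface under
 * device/net/ whose dev_port file contains 1.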
382 */ 383 static int 384 priv_get_ifname(const struct priv *priv, char (*ifname)[IF_NAMESIZE]) 385 { 386 DIR *dir; 387 struct dirent *dent; 388 unsigned int dev_type = 0; 389 unsigned int dev_port_prev = ~0u; 390 char match[IF_NAMESIZE] = ""; 391 392 { 393 MKSTR(path, "%s/device/net", priv->ctx->device->ibdev_path); 394 395 dir = opendir(path); 396 if (dir == NULL) 397 return -1; 398 } 399 while ((dent = readdir(dir)) != NULL) { 400 char *name = dent->d_name; 401 FILE *file; 402 unsigned int dev_port; 403 int r; 404 405 if ((name[0] == '.') && 406 ((name[1] == '\0') || 407 ((name[1] == '.') && (name[2] == '\0')))) 408 continue; 409 410 MKSTR(path, "%s/device/net/%s/%s", 411 priv->ctx->device->ibdev_path, name, 412 (dev_type ? "dev_id" : "dev_port")); 413 414 file = fopen(path, "rb"); 415 if (file == NULL) { 416 if (errno != ENOENT) 417 continue; 418 /* 419 * Switch to dev_id when dev_port does not exist as 420 * is the case with Linux kernel versions < 3.15. 421 */ 422 try_dev_id: 423 match[0] = '\0'; 424 if (dev_type) 425 break; 426 dev_type = 1; 427 dev_port_prev = ~0u; 428 rewinddir(dir); 429 continue; 430 } 431 r = fscanf(file, (dev_type ? "%x" : "%u"), &dev_port); 432 fclose(file); 433 if (r != 1) 434 continue; 435 /* 436 * Switch to dev_id when dev_port returns the same value for 437 * all ports. May happen when using a MOFED release older than 438 * 3.0 with a Linux kernel >= 3.15. 439 */ 440 if (dev_port == dev_port_prev) 441 goto try_dev_id; 442 dev_port_prev = dev_port; 443 if (dev_port == (priv->port - 1u)) 444 snprintf(match, sizeof(match), "%s", name); 445 } 446 closedir(dir); 447 if (match[0] == '\0') 448 return -1; 449 strncpy(*ifname, match, sizeof(*ifname)); 450 return 0; 451 } 452 453 /** 454 * Read from sysfs entry. 455 * 456 * @param[in] priv 457 * Pointer to private structure. 458 * @param[in] entry 459 * Entry name relative to sysfs path. 460 * @param[out] buf 461 * Data output buffer. 462 * @param size 463 * Buffer size. 464 * 465 * @return 466 * 0 on success, -1 on failure and errno is set. 467 */ 468 static int 469 priv_sysfs_read(const struct priv *priv, const char *entry, 470 char *buf, size_t size) 471 { 472 char ifname[IF_NAMESIZE]; 473 FILE *file; 474 int ret; 475 int err; 476 477 if (priv_get_ifname(priv, &ifname)) 478 return -1; 479 480 MKSTR(path, "%s/device/net/%s/%s", priv->ctx->device->ibdev_path, 481 ifname, entry); 482 483 file = fopen(path, "rb"); 484 if (file == NULL) 485 return -1; 486 ret = fread(buf, 1, size, file); 487 err = errno; 488 if (((size_t)ret < size) && (ferror(file))) 489 ret = -1; 490 else 491 ret = size; 492 fclose(file); 493 errno = err; 494 return ret; 495 } 496 497 /** 498 * Write to sysfs entry. 499 * 500 * @param[in] priv 501 * Pointer to private structure. 502 * @param[in] entry 503 * Entry name relative to sysfs path. 504 * @param[in] buf 505 * Data buffer. 506 * @param size 507 * Buffer size. 508 * 509 * @return 510 * 0 on success, -1 on failure and errno is set. 
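 *
 * A minimal usage sketch (illustrative only; the "mtu" entry and the error
 * handling mirror priv_set_sysfs_ulong() and priv_set_mtu() below):
 *
 *   char buf[] = "1500";
 *
 *   if (priv_sysfs_write(priv, "mtu", buf, sizeof(buf) - 1) == -1)
 *       ERROR("cannot write mtu: %s", strerror(errno));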
511 */ 512 static int 513 priv_sysfs_write(const struct priv *priv, const char *entry, 514 char *buf, size_t size) 515 { 516 char ifname[IF_NAMESIZE]; 517 FILE *file; 518 int ret; 519 int err; 520 521 if (priv_get_ifname(priv, &ifname)) 522 return -1; 523 524 MKSTR(path, "%s/device/net/%s/%s", priv->ctx->device->ibdev_path, 525 ifname, entry); 526 527 file = fopen(path, "wb"); 528 if (file == NULL) 529 return -1; 530 ret = fwrite(buf, 1, size, file); 531 err = errno; 532 if (((size_t)ret < size) || (ferror(file))) 533 ret = -1; 534 else 535 ret = size; 536 fclose(file); 537 errno = err; 538 return ret; 539 } 540 541 /** 542 * Get unsigned long sysfs property. 543 * 544 * @param priv 545 * Pointer to private structure. 546 * @param[in] name 547 * Entry name relative to sysfs path. 548 * @param[out] value 549 * Value output buffer. 550 * 551 * @return 552 * 0 on success, -1 on failure and errno is set. 553 */ 554 static int 555 priv_get_sysfs_ulong(struct priv *priv, const char *name, unsigned long *value) 556 { 557 int ret; 558 unsigned long value_ret; 559 char value_str[32]; 560 561 ret = priv_sysfs_read(priv, name, value_str, (sizeof(value_str) - 1)); 562 if (ret == -1) { 563 DEBUG("cannot read %s value from sysfs: %s", 564 name, strerror(errno)); 565 return -1; 566 } 567 value_str[ret] = '\0'; 568 errno = 0; 569 value_ret = strtoul(value_str, NULL, 0); 570 if (errno) { 571 DEBUG("invalid %s value `%s': %s", name, value_str, 572 strerror(errno)); 573 return -1; 574 } 575 *value = value_ret; 576 return 0; 577 } 578 579 /** 580 * Set unsigned long sysfs property. 581 * 582 * @param priv 583 * Pointer to private structure. 584 * @param[in] name 585 * Entry name relative to sysfs path. 586 * @param value 587 * Value to set. 588 * 589 * @return 590 * 0 on success, -1 on failure and errno is set. 591 */ 592 static int 593 priv_set_sysfs_ulong(struct priv *priv, const char *name, unsigned long value) 594 { 595 int ret; 596 MKSTR(value_str, "%lu", value); 597 598 ret = priv_sysfs_write(priv, name, value_str, (sizeof(value_str) - 1)); 599 if (ret == -1) { 600 DEBUG("cannot write %s `%s' (%lu) to sysfs: %s", 601 name, value_str, value, strerror(errno)); 602 return -1; 603 } 604 return 0; 605 } 606 607 /** 608 * Perform ifreq ioctl() on associated Ethernet device. 609 * 610 * @param[in] priv 611 * Pointer to private structure. 612 * @param req 613 * Request number to pass to ioctl(). 614 * @param[out] ifr 615 * Interface request structure output buffer. 616 * 617 * @return 618 * 0 on success, -1 on failure and errno is set. 619 */ 620 static int 621 priv_ifreq(const struct priv *priv, int req, struct ifreq *ifr) 622 { 623 int sock = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP); 624 int ret = -1; 625 626 if (sock == -1) 627 return ret; 628 if (priv_get_ifname(priv, &ifr->ifr_name) == 0) 629 ret = ioctl(sock, req, ifr); 630 close(sock); 631 return ret; 632 } 633 634 /** 635 * Get device MTU. 636 * 637 * @param priv 638 * Pointer to private structure. 639 * @param[out] mtu 640 * MTU value output buffer. 641 * 642 * @return 643 * 0 on success, -1 on failure and errno is set. 644 */ 645 static int 646 priv_get_mtu(struct priv *priv, uint16_t *mtu) 647 { 648 unsigned long ulong_mtu; 649 650 if (priv_get_sysfs_ulong(priv, "mtu", &ulong_mtu) == -1) 651 return -1; 652 *mtu = ulong_mtu; 653 return 0; 654 } 655 656 /** 657 * Set device MTU. 658 * 659 * @param priv 660 * Pointer to private structure. 661 * @param mtu 662 * MTU value to set. 663 * 664 * @return 665 * 0 on success, -1 on failure and errno is set. 
666 */ 667 static int 668 priv_set_mtu(struct priv *priv, uint16_t mtu) 669 { 670 return priv_set_sysfs_ulong(priv, "mtu", mtu); 671 } 672 673 /** 674 * Set device flags. 675 * 676 * @param priv 677 * Pointer to private structure. 678 * @param keep 679 * Bitmask for flags that must remain untouched. 680 * @param flags 681 * Bitmask for flags to modify. 682 * 683 * @return 684 * 0 on success, -1 on failure and errno is set. 685 */ 686 static int 687 priv_set_flags(struct priv *priv, unsigned int keep, unsigned int flags) 688 { 689 unsigned long tmp; 690 691 if (priv_get_sysfs_ulong(priv, "flags", &tmp) == -1) 692 return -1; 693 tmp &= keep; 694 tmp |= flags; 695 return priv_set_sysfs_ulong(priv, "flags", tmp); 696 } 697 698 /* Device configuration. */ 699 700 static int 701 txq_setup(struct rte_eth_dev *dev, struct txq *txq, uint16_t desc, 702 unsigned int socket, const struct rte_eth_txconf *conf); 703 704 static void 705 txq_cleanup(struct txq *txq); 706 707 static int 708 rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc, 709 unsigned int socket, const struct rte_eth_rxconf *conf, 710 struct rte_mempool *mp); 711 712 static void 713 rxq_cleanup(struct rxq *rxq); 714 715 /** 716 * Ethernet device configuration. 717 * 718 * Prepare the driver for a given number of TX and RX queues. 719 * Allocate parent RSS queue when several RX queues are requested. 720 * 721 * @param dev 722 * Pointer to Ethernet device structure. 723 * 724 * @return 725 * 0 on success, errno value on failure. 726 */ 727 static int 728 dev_configure(struct rte_eth_dev *dev) 729 { 730 struct priv *priv = dev->data->dev_private; 731 unsigned int rxqs_n = dev->data->nb_rx_queues; 732 unsigned int txqs_n = dev->data->nb_tx_queues; 733 unsigned int tmp; 734 int ret; 735 736 priv->rxqs = (void *)dev->data->rx_queues; 737 priv->txqs = (void *)dev->data->tx_queues; 738 if (txqs_n != priv->txqs_n) { 739 INFO("%p: TX queues number update: %u -> %u", 740 (void *)dev, priv->txqs_n, txqs_n); 741 priv->txqs_n = txqs_n; 742 } 743 if (rxqs_n == priv->rxqs_n) 744 return 0; 745 INFO("%p: RX queues number update: %u -> %u", 746 (void *)dev, priv->rxqs_n, rxqs_n); 747 /* If RSS is enabled, disable it first. */ 748 if (priv->rss) { 749 unsigned int i; 750 751 /* Only if there are no remaining child RX queues. */ 752 for (i = 0; (i != priv->rxqs_n); ++i) 753 if ((*priv->rxqs)[i] != NULL) 754 return EINVAL; 755 rxq_cleanup(&priv->rxq_parent); 756 priv->rss = 0; 757 priv->rxqs_n = 0; 758 } 759 if (rxqs_n <= 1) { 760 /* Nothing else to do. */ 761 priv->rxqs_n = rxqs_n; 762 return 0; 763 } 764 /* Allocate a new RSS parent queue if supported by hardware. */ 765 if (!priv->hw_rss) { 766 ERROR("%p: only a single RX queue can be configured when" 767 " hardware doesn't support RSS", 768 (void *)dev); 769 return EINVAL; 770 } 771 /* Fail if hardware doesn't support that many RSS queues. */ 772 if (rxqs_n >= priv->max_rss_tbl_sz) { 773 ERROR("%p: only %u RX queues can be configured for RSS", 774 (void *)dev, priv->max_rss_tbl_sz); 775 return EINVAL; 776 } 777 priv->rss = 1; 778 tmp = priv->rxqs_n; 779 priv->rxqs_n = rxqs_n; 780 ret = rxq_setup(dev, &priv->rxq_parent, 0, 0, NULL, NULL); 781 if (!ret) 782 return 0; 783 /* Failure, rollback. */ 784 priv->rss = 0; 785 priv->rxqs_n = tmp; 786 assert(ret > 0); 787 return ret; 788 } 789 790 /** 791 * DPDK callback for Ethernet device configuration. 792 * 793 * @param dev 794 * Pointer to Ethernet device structure. 795 * 796 * @return 797 * 0 on success, negative errno value on failure. 
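 *
 * This is a thin wrapper that takes the private structure lock and negates
 * the positive errno value returned by dev_configure(), matching the
 * negative errno convention of the ethdev API.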
798 */ 799 static int 800 mlx4_dev_configure(struct rte_eth_dev *dev) 801 { 802 struct priv *priv = dev->data->dev_private; 803 int ret; 804 805 if (mlx4_is_secondary()) 806 return -E_RTE_SECONDARY; 807 priv_lock(priv); 808 ret = dev_configure(dev); 809 assert(ret >= 0); 810 priv_unlock(priv); 811 return -ret; 812 } 813 814 static uint16_t mlx4_tx_burst(void *, struct rte_mbuf **, uint16_t); 815 static uint16_t removed_rx_burst(void *, struct rte_mbuf **, uint16_t); 816 817 /** 818 * Configure secondary process queues from a private data pointer (primary 819 * or secondary) and update burst callbacks. Can take place only once. 820 * 821 * All queues must have been previously created by the primary process to 822 * avoid undefined behavior. 823 * 824 * @param priv 825 * Private data pointer from either primary or secondary process. 826 * 827 * @return 828 * Private data pointer from secondary process, NULL in case of error. 829 */ 830 static struct priv * 831 mlx4_secondary_data_setup(struct priv *priv) 832 { 833 unsigned int port_id = 0; 834 struct mlx4_secondary_data *sd; 835 void **tx_queues; 836 void **rx_queues; 837 unsigned int nb_tx_queues; 838 unsigned int nb_rx_queues; 839 unsigned int i; 840 841 /* priv must be valid at this point. */ 842 assert(priv != NULL); 843 /* priv->dev must also be valid but may point to local memory from 844 * another process, possibly with the same address and must not 845 * be dereferenced yet. */ 846 assert(priv->dev != NULL); 847 /* Determine port ID by finding out where priv comes from. */ 848 while (1) { 849 sd = &mlx4_secondary_data[port_id]; 850 rte_spinlock_lock(&sd->lock); 851 /* Primary process? */ 852 if (sd->primary_priv == priv) 853 break; 854 /* Secondary process? */ 855 if (sd->data.dev_private == priv) 856 break; 857 rte_spinlock_unlock(&sd->lock); 858 if (++port_id == RTE_DIM(mlx4_secondary_data)) 859 port_id = 0; 860 } 861 /* Switch to secondary private structure. If private data has already 862 * been updated by another thread, there is nothing else to do. */ 863 priv = sd->data.dev_private; 864 if (priv->dev->data == &sd->data) 865 goto end; 866 /* Sanity checks. Secondary private structure is supposed to point 867 * to local eth_dev, itself still pointing to the shared device data 868 * structure allocated by the primary process. */ 869 assert(sd->shared_dev_data != &sd->data); 870 assert(sd->data.nb_tx_queues == 0); 871 assert(sd->data.tx_queues == NULL); 872 assert(sd->data.nb_rx_queues == 0); 873 assert(sd->data.rx_queues == NULL); 874 assert(priv != sd->primary_priv); 875 assert(priv->dev->data == sd->shared_dev_data); 876 assert(priv->txqs_n == 0); 877 assert(priv->txqs == NULL); 878 assert(priv->rxqs_n == 0); 879 assert(priv->rxqs == NULL); 880 nb_tx_queues = sd->shared_dev_data->nb_tx_queues; 881 nb_rx_queues = sd->shared_dev_data->nb_rx_queues; 882 /* Allocate local storage for queues. */ 883 tx_queues = rte_zmalloc("secondary ethdev->tx_queues", 884 sizeof(sd->data.tx_queues[0]) * nb_tx_queues, 885 RTE_CACHE_LINE_SIZE); 886 rx_queues = rte_zmalloc("secondary ethdev->rx_queues", 887 sizeof(sd->data.rx_queues[0]) * nb_rx_queues, 888 RTE_CACHE_LINE_SIZE); 889 if (tx_queues == NULL || rx_queues == NULL) 890 goto error; 891 /* Lock to prevent control operations during setup. */ 892 priv_lock(priv); 893 /* TX queues. 
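	 * Each queue is re-created locally through txq_setup() with the
	 * corresponding primary queue parameters; RX queues are skipped
	 * further down since they are not supported from a secondary
	 * process yet.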
*/ 894 for (i = 0; i != nb_tx_queues; ++i) { 895 struct txq *primary_txq = (*sd->primary_priv->txqs)[i]; 896 struct txq *txq; 897 898 if (primary_txq == NULL) 899 continue; 900 txq = rte_calloc_socket("TXQ", 1, sizeof(*txq), 0, 901 primary_txq->socket); 902 if (txq != NULL) { 903 if (txq_setup(priv->dev, 904 txq, 905 primary_txq->elts_n * MLX4_PMD_SGE_WR_N, 906 primary_txq->socket, 907 NULL) == 0) { 908 txq->stats.idx = primary_txq->stats.idx; 909 tx_queues[i] = txq; 910 continue; 911 } 912 rte_free(txq); 913 } 914 while (i) { 915 txq = tx_queues[--i]; 916 txq_cleanup(txq); 917 rte_free(txq); 918 } 919 goto error; 920 } 921 /* RX queues. */ 922 for (i = 0; i != nb_rx_queues; ++i) { 923 struct rxq *primary_rxq = (*sd->primary_priv->rxqs)[i]; 924 925 if (primary_rxq == NULL) 926 continue; 927 /* Not supported yet. */ 928 rx_queues[i] = NULL; 929 } 930 /* Update everything. */ 931 priv->txqs = (void *)tx_queues; 932 priv->txqs_n = nb_tx_queues; 933 priv->rxqs = (void *)rx_queues; 934 priv->rxqs_n = nb_rx_queues; 935 sd->data.rx_queues = rx_queues; 936 sd->data.tx_queues = tx_queues; 937 sd->data.nb_rx_queues = nb_rx_queues; 938 sd->data.nb_tx_queues = nb_tx_queues; 939 sd->data.dev_link = sd->shared_dev_data->dev_link; 940 sd->data.mtu = sd->shared_dev_data->mtu; 941 memcpy(sd->data.rx_queue_state, sd->shared_dev_data->rx_queue_state, 942 sizeof(sd->data.rx_queue_state)); 943 memcpy(sd->data.tx_queue_state, sd->shared_dev_data->tx_queue_state, 944 sizeof(sd->data.tx_queue_state)); 945 sd->data.dev_flags = sd->shared_dev_data->dev_flags; 946 /* Use local data from now on. */ 947 rte_mb(); 948 priv->dev->data = &sd->data; 949 rte_mb(); 950 priv->dev->tx_pkt_burst = mlx4_tx_burst; 951 priv->dev->rx_pkt_burst = removed_rx_burst; 952 priv_unlock(priv); 953 end: 954 /* More sanity checks. */ 955 assert(priv->dev->tx_pkt_burst == mlx4_tx_burst); 956 assert(priv->dev->rx_pkt_burst == removed_rx_burst); 957 assert(priv->dev->data == &sd->data); 958 rte_spinlock_unlock(&sd->lock); 959 return priv; 960 error: 961 priv_unlock(priv); 962 rte_free(tx_queues); 963 rte_free(rx_queues); 964 rte_spinlock_unlock(&sd->lock); 965 return NULL; 966 } 967 968 /* TX queues handling. */ 969 970 /** 971 * Allocate TX queue elements. 972 * 973 * @param txq 974 * Pointer to TX queue structure. 975 * @param elts_n 976 * Number of elements to allocate. 977 * 978 * @return 979 * 0 on success, errno value on failure. 
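 *
 * Completion requests are spaced so that one is issued every
 * MLX4_PMD_TX_PER_COMP_REQ packets, or at least four times per ring,
 * whichever is more frequent. For example, assuming
 * MLX4_PMD_TX_PER_COMP_REQ is 64, a 512-element ring requests a completion
 * every 64 packets, while a 128-element ring falls back to one every
 * 32 packets (128 / 4).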
980 */ 981 static int 982 txq_alloc_elts(struct txq *txq, unsigned int elts_n) 983 { 984 unsigned int i; 985 struct txq_elt (*elts)[elts_n] = 986 rte_calloc_socket("TXQ", 1, sizeof(*elts), 0, txq->socket); 987 linear_t (*elts_linear)[elts_n] = 988 rte_calloc_socket("TXQ", 1, sizeof(*elts_linear), 0, 989 txq->socket); 990 struct ibv_mr *mr_linear = NULL; 991 int ret = 0; 992 993 if ((elts == NULL) || (elts_linear == NULL)) { 994 ERROR("%p: can't allocate packets array", (void *)txq); 995 ret = ENOMEM; 996 goto error; 997 } 998 mr_linear = 999 ibv_reg_mr(txq->priv->pd, elts_linear, sizeof(*elts_linear), 1000 (IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE)); 1001 if (mr_linear == NULL) { 1002 ERROR("%p: unable to configure MR, ibv_reg_mr() failed", 1003 (void *)txq); 1004 ret = EINVAL; 1005 goto error; 1006 } 1007 for (i = 0; (i != elts_n); ++i) { 1008 struct txq_elt *elt = &(*elts)[i]; 1009 1010 elt->buf = NULL; 1011 } 1012 DEBUG("%p: allocated and configured %u WRs", (void *)txq, elts_n); 1013 txq->elts_n = elts_n; 1014 txq->elts = elts; 1015 txq->elts_head = 0; 1016 txq->elts_tail = 0; 1017 txq->elts_comp = 0; 1018 /* Request send completion every MLX4_PMD_TX_PER_COMP_REQ packets or 1019 * at least 4 times per ring. */ 1020 txq->elts_comp_cd_init = 1021 ((MLX4_PMD_TX_PER_COMP_REQ < (elts_n / 4)) ? 1022 MLX4_PMD_TX_PER_COMP_REQ : (elts_n / 4)); 1023 txq->elts_comp_cd = txq->elts_comp_cd_init; 1024 txq->elts_linear = elts_linear; 1025 txq->mr_linear = mr_linear; 1026 assert(ret == 0); 1027 return 0; 1028 error: 1029 if (mr_linear != NULL) 1030 claim_zero(ibv_dereg_mr(mr_linear)); 1031 1032 rte_free(elts_linear); 1033 rte_free(elts); 1034 1035 DEBUG("%p: failed, freed everything", (void *)txq); 1036 assert(ret > 0); 1037 return ret; 1038 } 1039 1040 /** 1041 * Free TX queue elements. 1042 * 1043 * @param txq 1044 * Pointer to TX queue structure. 1045 */ 1046 static void 1047 txq_free_elts(struct txq *txq) 1048 { 1049 unsigned int i; 1050 unsigned int elts_n = txq->elts_n; 1051 struct txq_elt (*elts)[elts_n] = txq->elts; 1052 linear_t (*elts_linear)[elts_n] = txq->elts_linear; 1053 struct ibv_mr *mr_linear = txq->mr_linear; 1054 1055 DEBUG("%p: freeing WRs", (void *)txq); 1056 txq->elts_n = 0; 1057 txq->elts = NULL; 1058 txq->elts_linear = NULL; 1059 txq->mr_linear = NULL; 1060 if (mr_linear != NULL) 1061 claim_zero(ibv_dereg_mr(mr_linear)); 1062 1063 rte_free(elts_linear); 1064 if (elts == NULL) 1065 return; 1066 for (i = 0; (i != elemof(*elts)); ++i) { 1067 struct txq_elt *elt = &(*elts)[i]; 1068 1069 if (elt->buf == NULL) 1070 continue; 1071 rte_pktmbuf_free(elt->buf); 1072 } 1073 rte_free(elts); 1074 } 1075 1076 1077 /** 1078 * Clean up a TX queue. 1079 * 1080 * Destroy objects, free allocated memory and reset the structure for reuse. 1081 * 1082 * @param txq 1083 * Pointer to TX queue structure. 
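 *
 * Teardown order matters: the QP and CQ interface families are released
 * first, then the QP and CQ themselves are destroyed, the resource domain
 * is freed and every Memory Region cached in mp2mr[] is deregistered
 * before the structure is zeroed.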
1084 */ 1085 static void 1086 txq_cleanup(struct txq *txq) 1087 { 1088 struct ibv_exp_release_intf_params params; 1089 size_t i; 1090 1091 DEBUG("cleaning up %p", (void *)txq); 1092 txq_free_elts(txq); 1093 if (txq->if_qp != NULL) { 1094 assert(txq->priv != NULL); 1095 assert(txq->priv->ctx != NULL); 1096 assert(txq->qp != NULL); 1097 params = (struct ibv_exp_release_intf_params){ 1098 .comp_mask = 0, 1099 }; 1100 claim_zero(ibv_exp_release_intf(txq->priv->ctx, 1101 txq->if_qp, 1102 ¶ms)); 1103 } 1104 if (txq->if_cq != NULL) { 1105 assert(txq->priv != NULL); 1106 assert(txq->priv->ctx != NULL); 1107 assert(txq->cq != NULL); 1108 params = (struct ibv_exp_release_intf_params){ 1109 .comp_mask = 0, 1110 }; 1111 claim_zero(ibv_exp_release_intf(txq->priv->ctx, 1112 txq->if_cq, 1113 ¶ms)); 1114 } 1115 if (txq->qp != NULL) 1116 claim_zero(ibv_destroy_qp(txq->qp)); 1117 if (txq->cq != NULL) 1118 claim_zero(ibv_destroy_cq(txq->cq)); 1119 if (txq->rd != NULL) { 1120 struct ibv_exp_destroy_res_domain_attr attr = { 1121 .comp_mask = 0, 1122 }; 1123 1124 assert(txq->priv != NULL); 1125 assert(txq->priv->ctx != NULL); 1126 claim_zero(ibv_exp_destroy_res_domain(txq->priv->ctx, 1127 txq->rd, 1128 &attr)); 1129 } 1130 for (i = 0; (i != elemof(txq->mp2mr)); ++i) { 1131 if (txq->mp2mr[i].mp == NULL) 1132 break; 1133 assert(txq->mp2mr[i].mr != NULL); 1134 claim_zero(ibv_dereg_mr(txq->mp2mr[i].mr)); 1135 } 1136 memset(txq, 0, sizeof(*txq)); 1137 } 1138 1139 /** 1140 * Manage TX completions. 1141 * 1142 * When sending a burst, mlx4_tx_burst() posts several WRs. 1143 * To improve performance, a completion event is only required once every 1144 * MLX4_PMD_TX_PER_COMP_REQ sends. Doing so discards completion information 1145 * for other WRs, but this information would not be used anyway. 1146 * 1147 * @param txq 1148 * Pointer to TX queue structure. 1149 * 1150 * @return 1151 * 0 on success, -1 on failure. 1152 */ 1153 static int 1154 txq_complete(struct txq *txq) 1155 { 1156 unsigned int elts_comp = txq->elts_comp; 1157 unsigned int elts_tail = txq->elts_tail; 1158 const unsigned int elts_n = txq->elts_n; 1159 int wcs_n; 1160 1161 if (unlikely(elts_comp == 0)) 1162 return 0; 1163 #ifdef DEBUG_SEND 1164 DEBUG("%p: processing %u work requests completions", 1165 (void *)txq, elts_comp); 1166 #endif 1167 wcs_n = txq->if_cq->poll_cnt(txq->cq, elts_comp); 1168 if (unlikely(wcs_n == 0)) 1169 return 0; 1170 if (unlikely(wcs_n < 0)) { 1171 DEBUG("%p: ibv_poll_cq() failed (wcs_n=%d)", 1172 (void *)txq, wcs_n); 1173 return -1; 1174 } 1175 elts_comp -= wcs_n; 1176 assert(elts_comp <= txq->elts_comp); 1177 /* 1178 * Assume WC status is successful as nothing can be done about it 1179 * anyway. 1180 */ 1181 elts_tail += wcs_n * txq->elts_comp_cd_init; 1182 if (elts_tail >= elts_n) 1183 elts_tail -= elts_n; 1184 txq->elts_tail = elts_tail; 1185 txq->elts_comp = elts_comp; 1186 return 0; 1187 } 1188 1189 /** 1190 * Get Memory Pool (MP) from mbuf. If mbuf is indirect, the pool from which 1191 * the cloned mbuf is allocated is returned instead. 1192 * 1193 * @param buf 1194 * Pointer to mbuf. 1195 * 1196 * @return 1197 * Memory pool where data is located for given mbuf. 1198 */ 1199 static struct rte_mempool * 1200 txq_mb2mp(struct rte_mbuf *buf) 1201 { 1202 if (unlikely(RTE_MBUF_INDIRECT(buf))) 1203 return rte_mbuf_from_indirect(buf)->pool; 1204 return buf->pool; 1205 } 1206 1207 /** 1208 * Get Memory Region (MR) <-> Memory Pool (MP) association from txq->mp2mr[]. 1209 * Add MP to txq->mp2mr[] if it's not registered yet. 
If mp2mr[] is full, 1210 * remove an entry first. 1211 * 1212 * @param txq 1213 * Pointer to TX queue structure. 1214 * @param[in] mp 1215 * Memory Pool for which a Memory Region lkey must be returned. 1216 * 1217 * @return 1218 * mr->lkey on success, (uint32_t)-1 on failure. 1219 */ 1220 static uint32_t 1221 txq_mp2mr(struct txq *txq, const struct rte_mempool *mp) 1222 { 1223 unsigned int i; 1224 struct ibv_mr *mr; 1225 1226 for (i = 0; (i != elemof(txq->mp2mr)); ++i) { 1227 if (unlikely(txq->mp2mr[i].mp == NULL)) { 1228 /* Unknown MP, add a new MR for it. */ 1229 break; 1230 } 1231 if (txq->mp2mr[i].mp == mp) { 1232 assert(txq->mp2mr[i].lkey != (uint32_t)-1); 1233 assert(txq->mp2mr[i].mr->lkey == txq->mp2mr[i].lkey); 1234 return txq->mp2mr[i].lkey; 1235 } 1236 } 1237 /* Add a new entry, register MR first. */ 1238 DEBUG("%p: discovered new memory pool \"%s\" (%p)", 1239 (void *)txq, mp->name, (const void *)mp); 1240 mr = ibv_reg_mr(txq->priv->pd, 1241 (void *)mp->elt_va_start, 1242 (mp->elt_va_end - mp->elt_va_start), 1243 (IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE)); 1244 if (unlikely(mr == NULL)) { 1245 DEBUG("%p: unable to configure MR, ibv_reg_mr() failed.", 1246 (void *)txq); 1247 return (uint32_t)-1; 1248 } 1249 if (unlikely(i == elemof(txq->mp2mr))) { 1250 /* Table is full, remove oldest entry. */ 1251 DEBUG("%p: MR <-> MP table full, dropping oldest entry.", 1252 (void *)txq); 1253 --i; 1254 claim_zero(ibv_dereg_mr(txq->mp2mr[0].mr)); 1255 memmove(&txq->mp2mr[0], &txq->mp2mr[1], 1256 (sizeof(txq->mp2mr) - sizeof(txq->mp2mr[0]))); 1257 } 1258 /* Store the new entry. */ 1259 txq->mp2mr[i].mp = mp; 1260 txq->mp2mr[i].mr = mr; 1261 txq->mp2mr[i].lkey = mr->lkey; 1262 DEBUG("%p: new MR lkey for MP \"%s\" (%p): 0x%08" PRIu32, 1263 (void *)txq, mp->name, (const void *)mp, txq->mp2mr[i].lkey); 1264 return txq->mp2mr[i].lkey; 1265 } 1266 1267 struct txq_mp2mr_mbuf_check_data { 1268 const struct rte_mempool *mp; 1269 int ret; 1270 }; 1271 1272 /** 1273 * Callback function for rte_mempool_obj_iter() to check whether a given 1274 * mempool object looks like a mbuf. 1275 * 1276 * @param[in, out] arg 1277 * Context data (struct txq_mp2mr_mbuf_check_data). Contains mempool pointer 1278 * and return value. 1279 * @param[in] start 1280 * Object start address. 1281 * @param[in] end 1282 * Object end address. 1283 * @param index 1284 * Unused. 1285 * 1286 * @return 1287 * Nonzero value when object is not a mbuf. 1288 */ 1289 static void 1290 txq_mp2mr_mbuf_check(void *arg, void *start, void *end, 1291 uint32_t index __rte_unused) 1292 { 1293 struct txq_mp2mr_mbuf_check_data *data = arg; 1294 struct rte_mbuf *buf = 1295 (void *)((uintptr_t)start + data->mp->header_size); 1296 1297 (void)index; 1298 /* Check whether mbuf structure fits element size and whether mempool 1299 * pointer is valid. */ 1300 if (((uintptr_t)end >= (uintptr_t)(buf + 1)) && 1301 (buf->pool == data->mp)) 1302 data->ret = 0; 1303 else 1304 data->ret = -1; 1305 } 1306 1307 /** 1308 * Iterator function for rte_mempool_walk() to register existing mempools and 1309 * fill the MP to MR cache of a TX queue. 1310 * 1311 * @param[in] mp 1312 * Memory Pool to register. 1313 * @param *arg 1314 * Pointer to TX queue structure. 1315 */ 1316 static void 1317 txq_mp2mr_iter(const struct rte_mempool *mp, void *arg) 1318 { 1319 struct txq *txq = arg; 1320 struct txq_mp2mr_mbuf_check_data data = { 1321 .mp = mp, 1322 .ret = -1, 1323 }; 1324 1325 /* Discard empty mempools. 
*/ 1326 if (mp->size == 0) 1327 return; 1328 /* Register mempool only if the first element looks like a mbuf. */ 1329 rte_mempool_obj_iter((void *)mp->elt_va_start, 1330 1, 1331 mp->header_size + mp->elt_size + mp->trailer_size, 1332 1, 1333 mp->elt_pa, 1334 mp->pg_num, 1335 mp->pg_shift, 1336 txq_mp2mr_mbuf_check, 1337 &data); 1338 if (data.ret) 1339 return; 1340 txq_mp2mr(txq, mp); 1341 } 1342 1343 #if MLX4_PMD_SGE_WR_N > 1 1344 1345 /** 1346 * Copy scattered mbuf contents to a single linear buffer. 1347 * 1348 * @param[out] linear 1349 * Linear output buffer. 1350 * @param[in] buf 1351 * Scattered input buffer. 1352 * 1353 * @return 1354 * Number of bytes copied to the output buffer or 0 if not large enough. 1355 */ 1356 static unsigned int 1357 linearize_mbuf(linear_t *linear, struct rte_mbuf *buf) 1358 { 1359 unsigned int size = 0; 1360 unsigned int offset; 1361 1362 do { 1363 unsigned int len = DATA_LEN(buf); 1364 1365 offset = size; 1366 size += len; 1367 if (unlikely(size > sizeof(*linear))) 1368 return 0; 1369 memcpy(&(*linear)[offset], 1370 rte_pktmbuf_mtod(buf, uint8_t *), 1371 len); 1372 buf = NEXT(buf); 1373 } while (buf != NULL); 1374 return size; 1375 } 1376 1377 /** 1378 * Handle scattered buffers for mlx4_tx_burst(). 1379 * 1380 * @param txq 1381 * TX queue structure. 1382 * @param segs 1383 * Number of segments in buf. 1384 * @param elt 1385 * TX queue element to fill. 1386 * @param[in] buf 1387 * Buffer to process. 1388 * @param elts_head 1389 * Index of the linear buffer to use if necessary (normally txq->elts_head). 1390 * @param[out] sges 1391 * Array filled with SGEs on success. 1392 * 1393 * @return 1394 * A structure containing the processed packet size in bytes and the 1395 * number of SGEs. Both fields are set to (unsigned int)-1 in case of 1396 * failure. 1397 */ 1398 static struct tx_burst_sg_ret { 1399 unsigned int length; 1400 unsigned int num; 1401 } 1402 tx_burst_sg(struct txq *txq, unsigned int segs, struct txq_elt *elt, 1403 struct rte_mbuf *buf, unsigned int elts_head, 1404 struct ibv_sge (*sges)[MLX4_PMD_SGE_WR_N]) 1405 { 1406 unsigned int sent_size = 0; 1407 unsigned int j; 1408 int linearize = 0; 1409 1410 /* When there are too many segments, extra segments are 1411 * linearized in the last SGE. */ 1412 if (unlikely(segs > elemof(*sges))) { 1413 segs = (elemof(*sges) - 1); 1414 linearize = 1; 1415 } 1416 /* Update element. */ 1417 elt->buf = buf; 1418 /* Register segments as SGEs. */ 1419 for (j = 0; (j != segs); ++j) { 1420 struct ibv_sge *sge = &(*sges)[j]; 1421 uint32_t lkey; 1422 1423 /* Retrieve Memory Region key for this memory pool. */ 1424 lkey = txq_mp2mr(txq, txq_mb2mp(buf)); 1425 if (unlikely(lkey == (uint32_t)-1)) { 1426 /* MR does not exist. */ 1427 DEBUG("%p: unable to get MP <-> MR association", 1428 (void *)txq); 1429 /* Clean up TX element. */ 1430 elt->buf = NULL; 1431 goto stop; 1432 } 1433 /* Update SGE. */ 1434 sge->addr = rte_pktmbuf_mtod(buf, uintptr_t); 1435 if (txq->priv->vf) 1436 rte_prefetch0((volatile void *) 1437 (uintptr_t)sge->addr); 1438 sge->length = DATA_LEN(buf); 1439 sge->lkey = lkey; 1440 sent_size += sge->length; 1441 buf = NEXT(buf); 1442 } 1443 /* If buf is not NULL here and is not going to be linearized, 1444 * nb_segs is not valid. */ 1445 assert(j == segs); 1446 assert((buf == NULL) || (linearize)); 1447 /* Linearize extra segments. 
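	 * Remaining chained segments are copied into the per-element
	 * buffer taken from txq->elts_linear (registered as
	 * txq->mr_linear) and posted as the last SGE of the work request.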
*/ 1448 if (linearize) { 1449 struct ibv_sge *sge = &(*sges)[segs]; 1450 linear_t *linear = &(*txq->elts_linear)[elts_head]; 1451 unsigned int size = linearize_mbuf(linear, buf); 1452 1453 assert(segs == (elemof(*sges) - 1)); 1454 if (size == 0) { 1455 /* Invalid packet. */ 1456 DEBUG("%p: packet too large to be linearized.", 1457 (void *)txq); 1458 /* Clean up TX element. */ 1459 elt->buf = NULL; 1460 goto stop; 1461 } 1462 /* If MLX4_PMD_SGE_WR_N is 1, free mbuf immediately. */ 1463 if (elemof(*sges) == 1) { 1464 do { 1465 struct rte_mbuf *next = NEXT(buf); 1466 1467 rte_pktmbuf_free_seg(buf); 1468 buf = next; 1469 } while (buf != NULL); 1470 elt->buf = NULL; 1471 } 1472 /* Update SGE. */ 1473 sge->addr = (uintptr_t)&(*linear)[0]; 1474 sge->length = size; 1475 sge->lkey = txq->mr_linear->lkey; 1476 sent_size += size; 1477 /* Include last segment. */ 1478 segs++; 1479 } 1480 return (struct tx_burst_sg_ret){ 1481 .length = sent_size, 1482 .num = segs, 1483 }; 1484 stop: 1485 return (struct tx_burst_sg_ret){ 1486 .length = -1, 1487 .num = -1, 1488 }; 1489 } 1490 1491 #endif /* MLX4_PMD_SGE_WR_N > 1 */ 1492 1493 /** 1494 * DPDK callback for TX. 1495 * 1496 * @param dpdk_txq 1497 * Generic pointer to TX queue structure. 1498 * @param[in] pkts 1499 * Packets to transmit. 1500 * @param pkts_n 1501 * Number of packets in array. 1502 * 1503 * @return 1504 * Number of packets successfully transmitted (<= pkts_n). 1505 */ 1506 static uint16_t 1507 mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) 1508 { 1509 struct txq *txq = (struct txq *)dpdk_txq; 1510 unsigned int elts_head = txq->elts_head; 1511 const unsigned int elts_n = txq->elts_n; 1512 unsigned int elts_comp_cd = txq->elts_comp_cd; 1513 unsigned int elts_comp = 0; 1514 unsigned int i; 1515 unsigned int max; 1516 int err; 1517 1518 assert(elts_comp_cd != 0); 1519 txq_complete(txq); 1520 max = (elts_n - (elts_head - txq->elts_tail)); 1521 if (max > elts_n) 1522 max -= elts_n; 1523 assert(max >= 1); 1524 assert(max <= elts_n); 1525 /* Always leave one free entry in the ring. */ 1526 --max; 1527 if (max == 0) 1528 return 0; 1529 if (max > pkts_n) 1530 max = pkts_n; 1531 for (i = 0; (i != max); ++i) { 1532 struct rte_mbuf *buf = pkts[i]; 1533 unsigned int elts_head_next = 1534 (((elts_head + 1) == elts_n) ? 0 : elts_head + 1); 1535 struct txq_elt *elt_next = &(*txq->elts)[elts_head_next]; 1536 struct txq_elt *elt = &(*txq->elts)[elts_head]; 1537 unsigned int segs = NB_SEGS(buf); 1538 #ifdef MLX4_PMD_SOFT_COUNTERS 1539 unsigned int sent_size = 0; 1540 #endif 1541 uint32_t send_flags = 0; 1542 1543 /* Clean up old buffer. */ 1544 if (likely(elt->buf != NULL)) { 1545 struct rte_mbuf *tmp = elt->buf; 1546 1547 /* Faster than rte_pktmbuf_free(). */ 1548 do { 1549 struct rte_mbuf *next = NEXT(tmp); 1550 1551 rte_pktmbuf_free_seg(tmp); 1552 tmp = next; 1553 } while (tmp != NULL); 1554 } 1555 /* Request TX completion. */ 1556 if (unlikely(--elts_comp_cd == 0)) { 1557 elts_comp_cd = txq->elts_comp_cd_init; 1558 ++elts_comp; 1559 send_flags |= IBV_EXP_QP_BURST_SIGNALED; 1560 } 1561 /* Should we enable HW CKSUM offload */ 1562 if (buf->ol_flags & 1563 (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM)) { 1564 send_flags |= IBV_EXP_QP_BURST_IP_CSUM; 1565 /* HW does not support checksum offloads at arbitrary 1566 * offsets but automatically recognizes the packet 1567 * type. For inner L3/L4 checksums, only VXLAN (UDP) 1568 * tunnels are currently supported. 
*/ 1569 if (RTE_ETH_IS_TUNNEL_PKT(buf->packet_type)) 1570 send_flags |= IBV_EXP_QP_BURST_TUNNEL; 1571 } 1572 if (likely(segs == 1)) { 1573 uintptr_t addr; 1574 uint32_t length; 1575 uint32_t lkey; 1576 1577 /* Retrieve buffer information. */ 1578 addr = rte_pktmbuf_mtod(buf, uintptr_t); 1579 length = DATA_LEN(buf); 1580 /* Retrieve Memory Region key for this memory pool. */ 1581 lkey = txq_mp2mr(txq, txq_mb2mp(buf)); 1582 if (unlikely(lkey == (uint32_t)-1)) { 1583 /* MR does not exist. */ 1584 DEBUG("%p: unable to get MP <-> MR" 1585 " association", (void *)txq); 1586 /* Clean up TX element. */ 1587 elt->buf = NULL; 1588 goto stop; 1589 } 1590 /* Update element. */ 1591 elt->buf = buf; 1592 if (txq->priv->vf) 1593 rte_prefetch0((volatile void *) 1594 (uintptr_t)addr); 1595 RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf); 1596 /* Put packet into send queue. */ 1597 #if MLX4_PMD_MAX_INLINE > 0 1598 if (length <= txq->max_inline) 1599 err = txq->if_qp->send_pending_inline 1600 (txq->qp, 1601 (void *)addr, 1602 length, 1603 send_flags); 1604 else 1605 #endif 1606 err = txq->if_qp->send_pending 1607 (txq->qp, 1608 addr, 1609 length, 1610 lkey, 1611 send_flags); 1612 if (unlikely(err)) 1613 goto stop; 1614 #ifdef MLX4_PMD_SOFT_COUNTERS 1615 sent_size += length; 1616 #endif 1617 } else { 1618 #if MLX4_PMD_SGE_WR_N > 1 1619 struct ibv_sge sges[MLX4_PMD_SGE_WR_N]; 1620 struct tx_burst_sg_ret ret; 1621 1622 ret = tx_burst_sg(txq, segs, elt, buf, elts_head, 1623 &sges); 1624 if (ret.length == (unsigned int)-1) 1625 goto stop; 1626 RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf); 1627 /* Put SG list into send queue. */ 1628 err = txq->if_qp->send_pending_sg_list 1629 (txq->qp, 1630 sges, 1631 ret.num, 1632 send_flags); 1633 if (unlikely(err)) 1634 goto stop; 1635 #ifdef MLX4_PMD_SOFT_COUNTERS 1636 sent_size += ret.length; 1637 #endif 1638 #else /* MLX4_PMD_SGE_WR_N > 1 */ 1639 DEBUG("%p: TX scattered buffers support not" 1640 " compiled in", (void *)txq); 1641 goto stop; 1642 #endif /* MLX4_PMD_SGE_WR_N > 1 */ 1643 } 1644 elts_head = elts_head_next; 1645 #ifdef MLX4_PMD_SOFT_COUNTERS 1646 /* Increment sent bytes counter. */ 1647 txq->stats.obytes += sent_size; 1648 #endif 1649 } 1650 stop: 1651 /* Take a shortcut if nothing must be sent. */ 1652 if (unlikely(i == 0)) 1653 return 0; 1654 #ifdef MLX4_PMD_SOFT_COUNTERS 1655 /* Increment sent packets counter. */ 1656 txq->stats.opackets += i; 1657 #endif 1658 /* Ring QP doorbell. */ 1659 err = txq->if_qp->send_flush(txq->qp); 1660 if (unlikely(err)) { 1661 /* A nonzero value is not supposed to be returned. 1662 * Nothing can be done about it. */ 1663 DEBUG("%p: send_flush() failed with error %d", 1664 (void *)txq, err); 1665 } 1666 txq->elts_head = elts_head; 1667 txq->elts_comp += elts_comp; 1668 txq->elts_comp_cd = elts_comp_cd; 1669 return i; 1670 } 1671 1672 /** 1673 * DPDK callback for TX in secondary processes. 1674 * 1675 * This function configures all queues from primary process information 1676 * if necessary before reverting to the normal TX burst callback. 1677 * 1678 * @param dpdk_txq 1679 * Generic pointer to TX queue structure. 1680 * @param[in] pkts 1681 * Packets to transmit. 1682 * @param pkts_n 1683 * Number of packets in array. 1684 * 1685 * @return 1686 * Number of packets successfully transmitted (<= pkts_n). 
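 *
 * Once mlx4_secondary_data_setup() has completed, dev->tx_pkt_burst points
 * directly to mlx4_tx_burst(), so this setup path is only taken on the
 * first burst issued by a secondary process.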
1687 */ 1688 static uint16_t 1689 mlx4_tx_burst_secondary_setup(void *dpdk_txq, struct rte_mbuf **pkts, 1690 uint16_t pkts_n) 1691 { 1692 struct txq *txq = dpdk_txq; 1693 struct priv *priv = mlx4_secondary_data_setup(txq->priv); 1694 struct priv *primary_priv; 1695 unsigned int index; 1696 1697 if (priv == NULL) 1698 return 0; 1699 primary_priv = 1700 mlx4_secondary_data[priv->dev->data->port_id].primary_priv; 1701 /* Look for queue index in both private structures. */ 1702 for (index = 0; index != priv->txqs_n; ++index) 1703 if (((*primary_priv->txqs)[index] == txq) || 1704 ((*priv->txqs)[index] == txq)) 1705 break; 1706 if (index == priv->txqs_n) 1707 return 0; 1708 txq = (*priv->txqs)[index]; 1709 return priv->dev->tx_pkt_burst(txq, pkts, pkts_n); 1710 } 1711 1712 /** 1713 * Configure a TX queue. 1714 * 1715 * @param dev 1716 * Pointer to Ethernet device structure. 1717 * @param txq 1718 * Pointer to TX queue structure. 1719 * @param desc 1720 * Number of descriptors to configure in queue. 1721 * @param socket 1722 * NUMA socket on which memory must be allocated. 1723 * @param[in] conf 1724 * Thresholds parameters. 1725 * 1726 * @return 1727 * 0 on success, errno value on failure. 1728 */ 1729 static int 1730 txq_setup(struct rte_eth_dev *dev, struct txq *txq, uint16_t desc, 1731 unsigned int socket, const struct rte_eth_txconf *conf) 1732 { 1733 struct priv *priv = mlx4_get_priv(dev); 1734 struct txq tmpl = { 1735 .priv = priv, 1736 .socket = socket 1737 }; 1738 union { 1739 struct ibv_exp_query_intf_params params; 1740 struct ibv_exp_qp_init_attr init; 1741 struct ibv_exp_res_domain_init_attr rd; 1742 struct ibv_exp_cq_init_attr cq; 1743 struct ibv_exp_qp_attr mod; 1744 } attr; 1745 enum ibv_exp_query_intf_status status; 1746 int ret = 0; 1747 1748 (void)conf; /* Thresholds configuration (ignored). */ 1749 if (priv == NULL) 1750 return EINVAL; 1751 if ((desc == 0) || (desc % MLX4_PMD_SGE_WR_N)) { 1752 ERROR("%p: invalid number of TX descriptors (must be a" 1753 " multiple of %d)", (void *)dev, MLX4_PMD_SGE_WR_N); 1754 return EINVAL; 1755 } 1756 desc /= MLX4_PMD_SGE_WR_N; 1757 /* MRs will be registered in mp2mr[] later. */ 1758 attr.rd = (struct ibv_exp_res_domain_init_attr){ 1759 .comp_mask = (IBV_EXP_RES_DOMAIN_THREAD_MODEL | 1760 IBV_EXP_RES_DOMAIN_MSG_MODEL), 1761 .thread_model = IBV_EXP_THREAD_SINGLE, 1762 .msg_model = IBV_EXP_MSG_HIGH_BW, 1763 }; 1764 tmpl.rd = ibv_exp_create_res_domain(priv->ctx, &attr.rd); 1765 if (tmpl.rd == NULL) { 1766 ret = ENOMEM; 1767 ERROR("%p: RD creation failure: %s", 1768 (void *)dev, strerror(ret)); 1769 goto error; 1770 } 1771 attr.cq = (struct ibv_exp_cq_init_attr){ 1772 .comp_mask = IBV_EXP_CQ_INIT_ATTR_RES_DOMAIN, 1773 .res_domain = tmpl.rd, 1774 }; 1775 tmpl.cq = ibv_exp_create_cq(priv->ctx, desc, NULL, NULL, 0, &attr.cq); 1776 if (tmpl.cq == NULL) { 1777 ret = ENOMEM; 1778 ERROR("%p: CQ creation failure: %s", 1779 (void *)dev, strerror(ret)); 1780 goto error; 1781 } 1782 DEBUG("priv->device_attr.max_qp_wr is %d", 1783 priv->device_attr.max_qp_wr); 1784 DEBUG("priv->device_attr.max_sge is %d", 1785 priv->device_attr.max_sge); 1786 attr.init = (struct ibv_exp_qp_init_attr){ 1787 /* CQ to be associated with the send queue. */ 1788 .send_cq = tmpl.cq, 1789 /* CQ to be associated with the receive queue. */ 1790 .recv_cq = tmpl.cq, 1791 .cap = { 1792 /* Max number of outstanding WRs. */ 1793 .max_send_wr = ((priv->device_attr.max_qp_wr < desc) ? 1794 priv->device_attr.max_qp_wr : 1795 desc), 1796 /* Max number of scatter/gather elements in a WR. 
*/ 1797 .max_send_sge = ((priv->device_attr.max_sge < 1798 MLX4_PMD_SGE_WR_N) ? 1799 priv->device_attr.max_sge : 1800 MLX4_PMD_SGE_WR_N), 1801 #if MLX4_PMD_MAX_INLINE > 0 1802 .max_inline_data = MLX4_PMD_MAX_INLINE, 1803 #endif 1804 }, 1805 .qp_type = IBV_QPT_RAW_PACKET, 1806 /* Do *NOT* enable this, completions events are managed per 1807 * TX burst. */ 1808 .sq_sig_all = 0, 1809 .pd = priv->pd, 1810 .res_domain = tmpl.rd, 1811 .comp_mask = (IBV_EXP_QP_INIT_ATTR_PD | 1812 IBV_EXP_QP_INIT_ATTR_RES_DOMAIN), 1813 }; 1814 tmpl.qp = ibv_exp_create_qp(priv->ctx, &attr.init); 1815 if (tmpl.qp == NULL) { 1816 ret = (errno ? errno : EINVAL); 1817 ERROR("%p: QP creation failure: %s", 1818 (void *)dev, strerror(ret)); 1819 goto error; 1820 } 1821 #if MLX4_PMD_MAX_INLINE > 0 1822 /* ibv_create_qp() updates this value. */ 1823 tmpl.max_inline = attr.init.cap.max_inline_data; 1824 #endif 1825 attr.mod = (struct ibv_exp_qp_attr){ 1826 /* Move the QP to this state. */ 1827 .qp_state = IBV_QPS_INIT, 1828 /* Primary port number. */ 1829 .port_num = priv->port 1830 }; 1831 ret = ibv_exp_modify_qp(tmpl.qp, &attr.mod, 1832 (IBV_EXP_QP_STATE | IBV_EXP_QP_PORT)); 1833 if (ret) { 1834 ERROR("%p: QP state to IBV_QPS_INIT failed: %s", 1835 (void *)dev, strerror(ret)); 1836 goto error; 1837 } 1838 ret = txq_alloc_elts(&tmpl, desc); 1839 if (ret) { 1840 ERROR("%p: TXQ allocation failed: %s", 1841 (void *)dev, strerror(ret)); 1842 goto error; 1843 } 1844 attr.mod = (struct ibv_exp_qp_attr){ 1845 .qp_state = IBV_QPS_RTR 1846 }; 1847 ret = ibv_exp_modify_qp(tmpl.qp, &attr.mod, IBV_EXP_QP_STATE); 1848 if (ret) { 1849 ERROR("%p: QP state to IBV_QPS_RTR failed: %s", 1850 (void *)dev, strerror(ret)); 1851 goto error; 1852 } 1853 attr.mod.qp_state = IBV_QPS_RTS; 1854 ret = ibv_exp_modify_qp(tmpl.qp, &attr.mod, IBV_EXP_QP_STATE); 1855 if (ret) { 1856 ERROR("%p: QP state to IBV_QPS_RTS failed: %s", 1857 (void *)dev, strerror(ret)); 1858 goto error; 1859 } 1860 attr.params = (struct ibv_exp_query_intf_params){ 1861 .intf_scope = IBV_EXP_INTF_GLOBAL, 1862 .intf = IBV_EXP_INTF_CQ, 1863 .obj = tmpl.cq, 1864 }; 1865 tmpl.if_cq = ibv_exp_query_intf(priv->ctx, &attr.params, &status); 1866 if (tmpl.if_cq == NULL) { 1867 ERROR("%p: CQ interface family query failed with status %d", 1868 (void *)dev, status); 1869 goto error; 1870 } 1871 attr.params = (struct ibv_exp_query_intf_params){ 1872 .intf_scope = IBV_EXP_INTF_GLOBAL, 1873 .intf = IBV_EXP_INTF_QP_BURST, 1874 .obj = tmpl.qp, 1875 #ifdef HAVE_EXP_QP_BURST_CREATE_DISABLE_ETH_LOOPBACK 1876 /* MC loopback must be disabled when not using a VF. */ 1877 .family_flags = 1878 (!priv->vf ? 1879 IBV_EXP_QP_BURST_CREATE_DISABLE_ETH_LOOPBACK : 1880 0), 1881 #endif 1882 }; 1883 tmpl.if_qp = ibv_exp_query_intf(priv->ctx, &attr.params, &status); 1884 if (tmpl.if_qp == NULL) { 1885 ERROR("%p: QP interface family query failed with status %d", 1886 (void *)dev, status); 1887 goto error; 1888 } 1889 /* Clean up txq in case we're reinitializing it. */ 1890 DEBUG("%p: cleaning-up old txq just in case", (void *)txq); 1891 txq_cleanup(txq); 1892 *txq = tmpl; 1893 DEBUG("%p: txq updated with %p", (void *)txq, (void *)&tmpl); 1894 /* Pre-register known mempools. */ 1895 rte_mempool_walk(txq_mp2mr_iter, txq); 1896 assert(ret == 0); 1897 return 0; 1898 error: 1899 txq_cleanup(&tmpl); 1900 assert(ret > 0); 1901 return ret; 1902 } 1903 1904 /** 1905 * DPDK callback to configure a TX queue. 1906 * 1907 * @param dev 1908 * Pointer to Ethernet device structure. 1909 * @param idx 1910 * TX queue index. 
1911 * @param desc 1912 * Number of descriptors to configure in queue. 1913 * @param socket 1914 * NUMA socket on which memory must be allocated. 1915 * @param[in] conf 1916 * Thresholds parameters. 1917 * 1918 * @return 1919 * 0 on success, negative errno value on failure. 1920 */ 1921 static int 1922 mlx4_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc, 1923 unsigned int socket, const struct rte_eth_txconf *conf) 1924 { 1925 struct priv *priv = dev->data->dev_private; 1926 struct txq *txq = (*priv->txqs)[idx]; 1927 int ret; 1928 1929 if (mlx4_is_secondary()) 1930 return -E_RTE_SECONDARY; 1931 priv_lock(priv); 1932 DEBUG("%p: configuring queue %u for %u descriptors", 1933 (void *)dev, idx, desc); 1934 if (idx >= priv->txqs_n) { 1935 ERROR("%p: queue index out of range (%u >= %u)", 1936 (void *)dev, idx, priv->txqs_n); 1937 priv_unlock(priv); 1938 return -EOVERFLOW; 1939 } 1940 if (txq != NULL) { 1941 DEBUG("%p: reusing already allocated queue index %u (%p)", 1942 (void *)dev, idx, (void *)txq); 1943 if (priv->started) { 1944 priv_unlock(priv); 1945 return -EEXIST; 1946 } 1947 (*priv->txqs)[idx] = NULL; 1948 txq_cleanup(txq); 1949 } else { 1950 txq = rte_calloc_socket("TXQ", 1, sizeof(*txq), 0, socket); 1951 if (txq == NULL) { 1952 ERROR("%p: unable to allocate queue index %u", 1953 (void *)dev, idx); 1954 priv_unlock(priv); 1955 return -ENOMEM; 1956 } 1957 } 1958 ret = txq_setup(dev, txq, desc, socket, conf); 1959 if (ret) 1960 rte_free(txq); 1961 else { 1962 txq->stats.idx = idx; 1963 DEBUG("%p: adding TX queue %p to list", 1964 (void *)dev, (void *)txq); 1965 (*priv->txqs)[idx] = txq; 1966 /* Update send callback. */ 1967 dev->tx_pkt_burst = mlx4_tx_burst; 1968 } 1969 priv_unlock(priv); 1970 return -ret; 1971 } 1972 1973 /** 1974 * DPDK callback to release a TX queue. 1975 * 1976 * @param dpdk_txq 1977 * Generic TX queue pointer. 1978 */ 1979 static void 1980 mlx4_tx_queue_release(void *dpdk_txq) 1981 { 1982 struct txq *txq = (struct txq *)dpdk_txq; 1983 struct priv *priv; 1984 unsigned int i; 1985 1986 if (mlx4_is_secondary()) 1987 return; 1988 if (txq == NULL) 1989 return; 1990 priv = txq->priv; 1991 priv_lock(priv); 1992 for (i = 0; (i != priv->txqs_n); ++i) 1993 if ((*priv->txqs)[i] == txq) { 1994 DEBUG("%p: removing TX queue %p from list", 1995 (void *)priv->dev, (void *)txq); 1996 (*priv->txqs)[i] = NULL; 1997 break; 1998 } 1999 txq_cleanup(txq); 2000 rte_free(txq); 2001 priv_unlock(priv); 2002 } 2003 2004 /* RX queues handling. */ 2005 2006 /** 2007 * Allocate RX queue elements with scattered packets support. 2008 * 2009 * @param rxq 2010 * Pointer to RX queue structure. 2011 * @param elts_n 2012 * Number of elements to allocate. 2013 * @param[in] pool 2014 * If not NULL, fetch buffers from this array instead of allocating them 2015 * with rte_pktmbuf_alloc(). 2016 * 2017 * @return 2018 * 0 on success, errno value on failure. 2019 */ 2020 static int 2021 rxq_alloc_elts_sp(struct rxq *rxq, unsigned int elts_n, 2022 struct rte_mbuf **pool) 2023 { 2024 unsigned int i; 2025 struct rxq_elt_sp (*elts)[elts_n] = 2026 rte_calloc_socket("RXQ elements", 1, sizeof(*elts), 0, 2027 rxq->socket); 2028 int ret = 0; 2029 2030 if (elts == NULL) { 2031 ERROR("%p: can't allocate packets array", (void *)rxq); 2032 ret = ENOMEM; 2033 goto error; 2034 } 2035 /* For each WR (packet). 
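	 * Each WR is chained to the next one and carries one SGE per
	 * segment; only the first SGE keeps the mbuf headroom (see below).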
*/ 2036 for (i = 0; (i != elts_n); ++i) { 2037 unsigned int j; 2038 struct rxq_elt_sp *elt = &(*elts)[i]; 2039 struct ibv_recv_wr *wr = &elt->wr; 2040 struct ibv_sge (*sges)[(elemof(elt->sges))] = &elt->sges; 2041 2042 /* These two arrays must have the same size. */ 2043 assert(elemof(elt->sges) == elemof(elt->bufs)); 2044 /* Configure WR. */ 2045 wr->wr_id = i; 2046 wr->next = &(*elts)[(i + 1)].wr; 2047 wr->sg_list = &(*sges)[0]; 2048 wr->num_sge = elemof(*sges); 2049 /* For each SGE (segment). */ 2050 for (j = 0; (j != elemof(elt->bufs)); ++j) { 2051 struct ibv_sge *sge = &(*sges)[j]; 2052 struct rte_mbuf *buf; 2053 2054 if (pool != NULL) { 2055 buf = *(pool++); 2056 assert(buf != NULL); 2057 rte_pktmbuf_reset(buf); 2058 } else 2059 buf = rte_pktmbuf_alloc(rxq->mp); 2060 if (buf == NULL) { 2061 assert(pool == NULL); 2062 ERROR("%p: empty mbuf pool", (void *)rxq); 2063 ret = ENOMEM; 2064 goto error; 2065 } 2066 elt->bufs[j] = buf; 2067 /* Headroom is reserved by rte_pktmbuf_alloc(). */ 2068 assert(DATA_OFF(buf) == RTE_PKTMBUF_HEADROOM); 2069 /* Buffer is supposed to be empty. */ 2070 assert(rte_pktmbuf_data_len(buf) == 0); 2071 assert(rte_pktmbuf_pkt_len(buf) == 0); 2072 /* sge->addr must be able to store a pointer. */ 2073 assert(sizeof(sge->addr) >= sizeof(uintptr_t)); 2074 if (j == 0) { 2075 /* The first SGE keeps its headroom. */ 2076 sge->addr = rte_pktmbuf_mtod(buf, uintptr_t); 2077 sge->length = (buf->buf_len - 2078 RTE_PKTMBUF_HEADROOM); 2079 } else { 2080 /* Subsequent SGEs lose theirs. */ 2081 assert(DATA_OFF(buf) == RTE_PKTMBUF_HEADROOM); 2082 SET_DATA_OFF(buf, 0); 2083 sge->addr = (uintptr_t)buf->buf_addr; 2084 sge->length = buf->buf_len; 2085 } 2086 sge->lkey = rxq->mr->lkey; 2087 /* Redundant check for tailroom. */ 2088 assert(sge->length == rte_pktmbuf_tailroom(buf)); 2089 } 2090 } 2091 /* The last WR pointer must be NULL. */ 2092 (*elts)[(i - 1)].wr.next = NULL; 2093 DEBUG("%p: allocated and configured %u WRs (%zu segments)", 2094 (void *)rxq, elts_n, (elts_n * elemof((*elts)[0].sges))); 2095 rxq->elts_n = elts_n; 2096 rxq->elts_head = 0; 2097 rxq->elts.sp = elts; 2098 assert(ret == 0); 2099 return 0; 2100 error: 2101 if (elts != NULL) { 2102 assert(pool == NULL); 2103 for (i = 0; (i != elemof(*elts)); ++i) { 2104 unsigned int j; 2105 struct rxq_elt_sp *elt = &(*elts)[i]; 2106 2107 for (j = 0; (j != elemof(elt->bufs)); ++j) { 2108 struct rte_mbuf *buf = elt->bufs[j]; 2109 2110 if (buf != NULL) 2111 rte_pktmbuf_free_seg(buf); 2112 } 2113 } 2114 rte_free(elts); 2115 } 2116 DEBUG("%p: failed, freed everything", (void *)rxq); 2117 assert(ret > 0); 2118 return ret; 2119 } 2120 2121 /** 2122 * Free RX queue elements with scattered packets support. 2123 * 2124 * @param rxq 2125 * Pointer to RX queue structure. 2126 */ 2127 static void 2128 rxq_free_elts_sp(struct rxq *rxq) 2129 { 2130 unsigned int i; 2131 unsigned int elts_n = rxq->elts_n; 2132 struct rxq_elt_sp (*elts)[elts_n] = rxq->elts.sp; 2133 2134 DEBUG("%p: freeing WRs", (void *)rxq); 2135 rxq->elts_n = 0; 2136 rxq->elts.sp = NULL; 2137 if (elts == NULL) 2138 return; 2139 for (i = 0; (i != elemof(*elts)); ++i) { 2140 unsigned int j; 2141 struct rxq_elt_sp *elt = &(*elts)[i]; 2142 2143 for (j = 0; (j != elemof(elt->bufs)); ++j) { 2144 struct rte_mbuf *buf = elt->bufs[j]; 2145 2146 if (buf != NULL) 2147 rte_pktmbuf_free_seg(buf); 2148 } 2149 } 2150 rte_free(elts); 2151 } 2152 2153 /** 2154 * Allocate RX queue elements. 2155 * 2156 * @param rxq 2157 * Pointer to RX queue structure. 
2158 * @param elts_n 2159 * Number of elements to allocate. 2160 * @param[in] pool 2161 * If not NULL, fetch buffers from this array instead of allocating them 2162 * with rte_pktmbuf_alloc(). 2163 * 2164 * @return 2165 * 0 on success, errno value on failure. 2166 */ 2167 static int 2168 rxq_alloc_elts(struct rxq *rxq, unsigned int elts_n, struct rte_mbuf **pool) 2169 { 2170 unsigned int i; 2171 struct rxq_elt (*elts)[elts_n] = 2172 rte_calloc_socket("RXQ elements", 1, sizeof(*elts), 0, 2173 rxq->socket); 2174 int ret = 0; 2175 2176 if (elts == NULL) { 2177 ERROR("%p: can't allocate packets array", (void *)rxq); 2178 ret = ENOMEM; 2179 goto error; 2180 } 2181 /* For each WR (packet). */ 2182 for (i = 0; (i != elts_n); ++i) { 2183 struct rxq_elt *elt = &(*elts)[i]; 2184 struct ibv_recv_wr *wr = &elt->wr; 2185 struct ibv_sge *sge = &(*elts)[i].sge; 2186 struct rte_mbuf *buf; 2187 2188 if (pool != NULL) { 2189 buf = *(pool++); 2190 assert(buf != NULL); 2191 rte_pktmbuf_reset(buf); 2192 } else 2193 buf = rte_pktmbuf_alloc(rxq->mp); 2194 if (buf == NULL) { 2195 assert(pool == NULL); 2196 ERROR("%p: empty mbuf pool", (void *)rxq); 2197 ret = ENOMEM; 2198 goto error; 2199 } 2200 /* Configure WR. Work request ID contains its own index in 2201 * the elts array and the offset between SGE buffer header and 2202 * its data. */ 2203 WR_ID(wr->wr_id).id = i; 2204 WR_ID(wr->wr_id).offset = 2205 (((uintptr_t)buf->buf_addr + RTE_PKTMBUF_HEADROOM) - 2206 (uintptr_t)buf); 2207 wr->next = &(*elts)[(i + 1)].wr; 2208 wr->sg_list = sge; 2209 wr->num_sge = 1; 2210 /* Headroom is reserved by rte_pktmbuf_alloc(). */ 2211 assert(DATA_OFF(buf) == RTE_PKTMBUF_HEADROOM); 2212 /* Buffer is supposed to be empty. */ 2213 assert(rte_pktmbuf_data_len(buf) == 0); 2214 assert(rte_pktmbuf_pkt_len(buf) == 0); 2215 /* sge->addr must be able to store a pointer. */ 2216 assert(sizeof(sge->addr) >= sizeof(uintptr_t)); 2217 /* SGE keeps its headroom. */ 2218 sge->addr = (uintptr_t) 2219 ((uint8_t *)buf->buf_addr + RTE_PKTMBUF_HEADROOM); 2220 sge->length = (buf->buf_len - RTE_PKTMBUF_HEADROOM); 2221 sge->lkey = rxq->mr->lkey; 2222 /* Redundant check for tailroom. */ 2223 assert(sge->length == rte_pktmbuf_tailroom(buf)); 2224 /* Make sure elts index and SGE mbuf pointer can be deduced 2225 * from WR ID. */ 2226 if ((WR_ID(wr->wr_id).id != i) || 2227 ((void *)((uintptr_t)sge->addr - 2228 WR_ID(wr->wr_id).offset) != buf)) { 2229 ERROR("%p: cannot store index and offset in WR ID", 2230 (void *)rxq); 2231 sge->addr = 0; 2232 rte_pktmbuf_free(buf); 2233 ret = EOVERFLOW; 2234 goto error; 2235 } 2236 } 2237 /* The last WR pointer must be NULL. */ 2238 (*elts)[(i - 1)].wr.next = NULL; 2239 DEBUG("%p: allocated and configured %u single-segment WRs", 2240 (void *)rxq, elts_n); 2241 rxq->elts_n = elts_n; 2242 rxq->elts_head = 0; 2243 rxq->elts.no_sp = elts; 2244 assert(ret == 0); 2245 return 0; 2246 error: 2247 if (elts != NULL) { 2248 assert(pool == NULL); 2249 for (i = 0; (i != elemof(*elts)); ++i) { 2250 struct rxq_elt *elt = &(*elts)[i]; 2251 struct rte_mbuf *buf; 2252 2253 if (elt->sge.addr == 0) 2254 continue; 2255 assert(WR_ID(elt->wr.wr_id).id == i); 2256 buf = (void *)((uintptr_t)elt->sge.addr - 2257 WR_ID(elt->wr.wr_id).offset); 2258 rte_pktmbuf_free_seg(buf); 2259 } 2260 rte_free(elts); 2261 } 2262 DEBUG("%p: failed, freed everything", (void *)rxq); 2263 assert(ret > 0); 2264 return ret; 2265 } 2266 2267 /** 2268 * Free RX queue elements. 2269 * 2270 * @param rxq 2271 * Pointer to RX queue structure. 
2272 */ 2273 static void 2274 rxq_free_elts(struct rxq *rxq) 2275 { 2276 unsigned int i; 2277 unsigned int elts_n = rxq->elts_n; 2278 struct rxq_elt (*elts)[elts_n] = rxq->elts.no_sp; 2279 2280 DEBUG("%p: freeing WRs", (void *)rxq); 2281 rxq->elts_n = 0; 2282 rxq->elts.no_sp = NULL; 2283 if (elts == NULL) 2284 return; 2285 for (i = 0; (i != elemof(*elts)); ++i) { 2286 struct rxq_elt *elt = &(*elts)[i]; 2287 struct rte_mbuf *buf; 2288 2289 if (elt->sge.addr == 0) 2290 continue; 2291 assert(WR_ID(elt->wr.wr_id).id == i); 2292 buf = (void *)((uintptr_t)elt->sge.addr - 2293 WR_ID(elt->wr.wr_id).offset); 2294 rte_pktmbuf_free_seg(buf); 2295 } 2296 rte_free(elts); 2297 } 2298 2299 /** 2300 * Delete flow steering rule. 2301 * 2302 * @param rxq 2303 * Pointer to RX queue structure. 2304 * @param mac_index 2305 * MAC address index. 2306 * @param vlan_index 2307 * VLAN index. 2308 */ 2309 static void 2310 rxq_del_flow(struct rxq *rxq, unsigned int mac_index, unsigned int vlan_index) 2311 { 2312 #ifndef NDEBUG 2313 struct priv *priv = rxq->priv; 2314 const uint8_t (*mac)[ETHER_ADDR_LEN] = 2315 (const uint8_t (*)[ETHER_ADDR_LEN]) 2316 priv->mac[mac_index].addr_bytes; 2317 #endif 2318 assert(rxq->mac_flow[mac_index][vlan_index] != NULL); 2319 DEBUG("%p: removing MAC address %02x:%02x:%02x:%02x:%02x:%02x index %u" 2320 " (VLAN ID %" PRIu16 ")", 2321 (void *)rxq, 2322 (*mac)[0], (*mac)[1], (*mac)[2], (*mac)[3], (*mac)[4], (*mac)[5], 2323 mac_index, priv->vlan_filter[vlan_index].id); 2324 claim_zero(ibv_destroy_flow(rxq->mac_flow[mac_index][vlan_index])); 2325 rxq->mac_flow[mac_index][vlan_index] = NULL; 2326 } 2327 2328 /** 2329 * Unregister a MAC address from a RX queue. 2330 * 2331 * @param rxq 2332 * Pointer to RX queue structure. 2333 * @param mac_index 2334 * MAC address index. 2335 */ 2336 static void 2337 rxq_mac_addr_del(struct rxq *rxq, unsigned int mac_index) 2338 { 2339 struct priv *priv = rxq->priv; 2340 unsigned int i; 2341 unsigned int vlans = 0; 2342 2343 assert(mac_index < elemof(priv->mac)); 2344 if (!BITFIELD_ISSET(rxq->mac_configured, mac_index)) 2345 return; 2346 for (i = 0; (i != elemof(priv->vlan_filter)); ++i) { 2347 if (!priv->vlan_filter[i].enabled) 2348 continue; 2349 rxq_del_flow(rxq, mac_index, i); 2350 vlans++; 2351 } 2352 if (!vlans) { 2353 rxq_del_flow(rxq, mac_index, 0); 2354 } 2355 BITFIELD_RESET(rxq->mac_configured, mac_index); 2356 } 2357 2358 /** 2359 * Unregister all MAC addresses from a RX queue. 2360 * 2361 * @param rxq 2362 * Pointer to RX queue structure. 2363 */ 2364 static void 2365 rxq_mac_addrs_del(struct rxq *rxq) 2366 { 2367 struct priv *priv = rxq->priv; 2368 unsigned int i; 2369 2370 for (i = 0; (i != elemof(priv->mac)); ++i) 2371 rxq_mac_addr_del(rxq, i); 2372 } 2373 2374 static int rxq_promiscuous_enable(struct rxq *); 2375 static void rxq_promiscuous_disable(struct rxq *); 2376 2377 /** 2378 * Add single flow steering rule. 2379 * 2380 * @param rxq 2381 * Pointer to RX queue structure. 2382 * @param mac_index 2383 * MAC address index to register. 2384 * @param vlan_index 2385 * VLAN index. Use -1 for a flow without VLAN. 2386 * 2387 * @return 2388 * 0 on success, errno value on failure. 2389 */ 2390 static int 2391 rxq_add_flow(struct rxq *rxq, unsigned int mac_index, unsigned int vlan_index) 2392 { 2393 struct ibv_flow *flow; 2394 struct priv *priv = rxq->priv; 2395 const uint8_t (*mac)[ETHER_ADDR_LEN] = 2396 (const uint8_t (*)[ETHER_ADDR_LEN]) 2397 priv->mac[mac_index].addr_bytes; 2398 2399 /* Allocate flow specification on the stack. 
*/ 2400 struct __attribute__((packed)) { 2401 struct ibv_flow_attr attr; 2402 struct ibv_flow_spec_eth spec; 2403 } data; 2404 struct ibv_flow_attr *attr = &data.attr; 2405 struct ibv_flow_spec_eth *spec = &data.spec; 2406 2407 assert(mac_index < elemof(priv->mac)); 2408 assert((vlan_index < elemof(priv->vlan_filter)) || (vlan_index == -1u)); 2409 /* 2410 * No padding must be inserted by the compiler between attr and spec. 2411 * This layout is expected by libibverbs. 2412 */ 2413 assert(((uint8_t *)attr + sizeof(*attr)) == (uint8_t *)spec); 2414 *attr = (struct ibv_flow_attr){ 2415 .type = IBV_FLOW_ATTR_NORMAL, 2416 .num_of_specs = 1, 2417 .port = priv->port, 2418 .flags = 0 2419 }; 2420 *spec = (struct ibv_flow_spec_eth){ 2421 .type = IBV_FLOW_SPEC_ETH, 2422 .size = sizeof(*spec), 2423 .val = { 2424 .dst_mac = { 2425 (*mac)[0], (*mac)[1], (*mac)[2], 2426 (*mac)[3], (*mac)[4], (*mac)[5] 2427 }, 2428 .vlan_tag = ((vlan_index != -1u) ? 2429 htons(priv->vlan_filter[vlan_index].id) : 2430 0), 2431 }, 2432 .mask = { 2433 .dst_mac = "\xff\xff\xff\xff\xff\xff", 2434 .vlan_tag = ((vlan_index != -1u) ? htons(0xfff) : 0), 2435 } 2436 }; 2437 DEBUG("%p: adding MAC address %02x:%02x:%02x:%02x:%02x:%02x index %u" 2438 " (VLAN %s %" PRIu16 ")", 2439 (void *)rxq, 2440 (*mac)[0], (*mac)[1], (*mac)[2], (*mac)[3], (*mac)[4], (*mac)[5], 2441 mac_index, 2442 ((vlan_index != -1u) ? "ID" : "index"), 2443 ((vlan_index != -1u) ? priv->vlan_filter[vlan_index].id : -1u)); 2444 /* Create related flow. */ 2445 errno = 0; 2446 flow = ibv_create_flow(rxq->qp, attr); 2447 if (flow == NULL) { 2448 /* It's not clear whether errno is always set in this case. */ 2449 ERROR("%p: flow configuration failed, errno=%d: %s", 2450 (void *)rxq, errno, 2451 (errno ? strerror(errno) : "Unknown error")); 2452 if (errno) 2453 return errno; 2454 return EINVAL; 2455 } 2456 if (vlan_index == -1u) 2457 vlan_index = 0; 2458 assert(rxq->mac_flow[mac_index][vlan_index] == NULL); 2459 rxq->mac_flow[mac_index][vlan_index] = flow; 2460 return 0; 2461 } 2462 2463 /** 2464 * Register a MAC address in a RX queue. 2465 * 2466 * @param rxq 2467 * Pointer to RX queue structure. 2468 * @param mac_index 2469 * MAC address index to register. 2470 * 2471 * @return 2472 * 0 on success, errno value on failure. 2473 */ 2474 static int 2475 rxq_mac_addr_add(struct rxq *rxq, unsigned int mac_index) 2476 { 2477 struct priv *priv = rxq->priv; 2478 unsigned int i; 2479 unsigned int vlans = 0; 2480 int ret; 2481 2482 assert(mac_index < elemof(priv->mac)); 2483 if (BITFIELD_ISSET(rxq->mac_configured, mac_index)) 2484 rxq_mac_addr_del(rxq, mac_index); 2485 /* Fill VLAN specifications. */ 2486 for (i = 0; (i != elemof(priv->vlan_filter)); ++i) { 2487 if (!priv->vlan_filter[i].enabled) 2488 continue; 2489 /* Create related flow. */ 2490 ret = rxq_add_flow(rxq, mac_index, i); 2491 if (!ret) { 2492 vlans++; 2493 continue; 2494 } 2495 /* Failure, rollback. */ 2496 while (i != 0) 2497 if (priv->vlan_filter[--i].enabled) 2498 rxq_del_flow(rxq, mac_index, i); 2499 assert(ret > 0); 2500 return ret; 2501 } 2502 /* In case there is no VLAN filter. */ 2503 if (!vlans) { 2504 ret = rxq_add_flow(rxq, mac_index, -1); 2505 if (ret) 2506 return ret; 2507 } 2508 BITFIELD_SET(rxq->mac_configured, mac_index); 2509 return 0; 2510 } 2511 2512 /** 2513 * Register all MAC addresses in a RX queue. 2514 * 2515 * @param rxq 2516 * Pointer to RX queue structure. 2517 * 2518 * @return 2519 * 0 on success, errno value on failure. 
2520 */ 2521 static int 2522 rxq_mac_addrs_add(struct rxq *rxq) 2523 { 2524 struct priv *priv = rxq->priv; 2525 unsigned int i; 2526 int ret; 2527 2528 for (i = 0; (i != elemof(priv->mac)); ++i) { 2529 if (!BITFIELD_ISSET(priv->mac_configured, i)) 2530 continue; 2531 ret = rxq_mac_addr_add(rxq, i); 2532 if (!ret) 2533 continue; 2534 /* Failure, rollback. */ 2535 while (i != 0) 2536 rxq_mac_addr_del(rxq, --i); 2537 assert(ret > 0); 2538 return ret; 2539 } 2540 return 0; 2541 } 2542 2543 /** 2544 * Unregister a MAC address. 2545 * 2546 * In RSS mode, the MAC address is unregistered from the parent queue, 2547 * otherwise it is unregistered from each queue directly. 2548 * 2549 * @param priv 2550 * Pointer to private structure. 2551 * @param mac_index 2552 * MAC address index. 2553 */ 2554 static void 2555 priv_mac_addr_del(struct priv *priv, unsigned int mac_index) 2556 { 2557 unsigned int i; 2558 2559 assert(mac_index < elemof(priv->mac)); 2560 if (!BITFIELD_ISSET(priv->mac_configured, mac_index)) 2561 return; 2562 if (priv->rss) { 2563 rxq_mac_addr_del(&priv->rxq_parent, mac_index); 2564 goto end; 2565 } 2566 for (i = 0; (i != priv->dev->data->nb_rx_queues); ++i) 2567 rxq_mac_addr_del((*priv->rxqs)[i], mac_index); 2568 end: 2569 BITFIELD_RESET(priv->mac_configured, mac_index); 2570 } 2571 2572 /** 2573 * Register a MAC address. 2574 * 2575 * In RSS mode, the MAC address is registered in the parent queue, 2576 * otherwise it is registered in each queue directly. 2577 * 2578 * @param priv 2579 * Pointer to private structure. 2580 * @param mac_index 2581 * MAC address index to use. 2582 * @param mac 2583 * MAC address to register. 2584 * 2585 * @return 2586 * 0 on success, errno value on failure. 2587 */ 2588 static int 2589 priv_mac_addr_add(struct priv *priv, unsigned int mac_index, 2590 const uint8_t (*mac)[ETHER_ADDR_LEN]) 2591 { 2592 unsigned int i; 2593 int ret; 2594 2595 assert(mac_index < elemof(priv->mac)); 2596 /* First, make sure this address isn't already configured. */ 2597 for (i = 0; (i != elemof(priv->mac)); ++i) { 2598 /* Skip this index, it's going to be reconfigured. */ 2599 if (i == mac_index) 2600 continue; 2601 if (!BITFIELD_ISSET(priv->mac_configured, i)) 2602 continue; 2603 if (memcmp(priv->mac[i].addr_bytes, *mac, sizeof(*mac))) 2604 continue; 2605 /* Address already configured elsewhere, return with error. */ 2606 return EADDRINUSE; 2607 } 2608 if (BITFIELD_ISSET(priv->mac_configured, mac_index)) 2609 priv_mac_addr_del(priv, mac_index); 2610 priv->mac[mac_index] = (struct ether_addr){ 2611 { 2612 (*mac)[0], (*mac)[1], (*mac)[2], 2613 (*mac)[3], (*mac)[4], (*mac)[5] 2614 } 2615 }; 2616 /* If device isn't started, this is all we need to do. */ 2617 if (!priv->started) { 2618 #ifndef NDEBUG 2619 /* Verify that all queues have this index disabled. */ 2620 for (i = 0; (i != priv->rxqs_n); ++i) { 2621 if ((*priv->rxqs)[i] == NULL) 2622 continue; 2623 assert(!BITFIELD_ISSET 2624 ((*priv->rxqs)[i]->mac_configured, mac_index)); 2625 } 2626 #endif 2627 goto end; 2628 } 2629 if (priv->rss) { 2630 ret = rxq_mac_addr_add(&priv->rxq_parent, mac_index); 2631 if (ret) 2632 return ret; 2633 goto end; 2634 } 2635 for (i = 0; (i != priv->rxqs_n); ++i) { 2636 if ((*priv->rxqs)[i] == NULL) 2637 continue; 2638 ret = rxq_mac_addr_add((*priv->rxqs)[i], mac_index); 2639 if (!ret) 2640 continue; 2641 /* Failure, rollback. 
*/ 2642 while (i != 0) 2643 if ((*priv->rxqs)[(--i)] != NULL) 2644 rxq_mac_addr_del((*priv->rxqs)[i], mac_index); 2645 return ret; 2646 } 2647 end: 2648 BITFIELD_SET(priv->mac_configured, mac_index); 2649 return 0; 2650 } 2651 2652 /** 2653 * Enable allmulti mode in a RX queue. 2654 * 2655 * @param rxq 2656 * Pointer to RX queue structure. 2657 * 2658 * @return 2659 * 0 on success, errno value on failure. 2660 */ 2661 static int 2662 rxq_allmulticast_enable(struct rxq *rxq) 2663 { 2664 struct ibv_flow *flow; 2665 struct ibv_flow_attr attr = { 2666 .type = IBV_FLOW_ATTR_MC_DEFAULT, 2667 .num_of_specs = 0, 2668 .port = rxq->priv->port, 2669 .flags = 0 2670 }; 2671 2672 DEBUG("%p: enabling allmulticast mode", (void *)rxq); 2673 if (rxq->allmulti_flow != NULL) 2674 return EBUSY; 2675 errno = 0; 2676 flow = ibv_create_flow(rxq->qp, &attr); 2677 if (flow == NULL) { 2678 /* It's not clear whether errno is always set in this case. */ 2679 ERROR("%p: flow configuration failed, errno=%d: %s", 2680 (void *)rxq, errno, 2681 (errno ? strerror(errno) : "Unknown error")); 2682 if (errno) 2683 return errno; 2684 return EINVAL; 2685 } 2686 rxq->allmulti_flow = flow; 2687 DEBUG("%p: allmulticast mode enabled", (void *)rxq); 2688 return 0; 2689 } 2690 2691 /** 2692 * Disable allmulti mode in a RX queue. 2693 * 2694 * @param rxq 2695 * Pointer to RX queue structure. 2696 */ 2697 static void 2698 rxq_allmulticast_disable(struct rxq *rxq) 2699 { 2700 DEBUG("%p: disabling allmulticast mode", (void *)rxq); 2701 if (rxq->allmulti_flow == NULL) 2702 return; 2703 claim_zero(ibv_destroy_flow(rxq->allmulti_flow)); 2704 rxq->allmulti_flow = NULL; 2705 DEBUG("%p: allmulticast mode disabled", (void *)rxq); 2706 } 2707 2708 /** 2709 * Enable promiscuous mode in a RX queue. 2710 * 2711 * @param rxq 2712 * Pointer to RX queue structure. 2713 * 2714 * @return 2715 * 0 on success, errno value on failure. 2716 */ 2717 static int 2718 rxq_promiscuous_enable(struct rxq *rxq) 2719 { 2720 struct ibv_flow *flow; 2721 struct ibv_flow_attr attr = { 2722 .type = IBV_FLOW_ATTR_ALL_DEFAULT, 2723 .num_of_specs = 0, 2724 .port = rxq->priv->port, 2725 .flags = 0 2726 }; 2727 2728 if (rxq->priv->vf) 2729 return 0; 2730 DEBUG("%p: enabling promiscuous mode", (void *)rxq); 2731 if (rxq->promisc_flow != NULL) 2732 return EBUSY; 2733 errno = 0; 2734 flow = ibv_create_flow(rxq->qp, &attr); 2735 if (flow == NULL) { 2736 /* It's not clear whether errno is always set in this case. */ 2737 ERROR("%p: flow configuration failed, errno=%d: %s", 2738 (void *)rxq, errno, 2739 (errno ? strerror(errno) : "Unknown error")); 2740 if (errno) 2741 return errno; 2742 return EINVAL; 2743 } 2744 rxq->promisc_flow = flow; 2745 DEBUG("%p: promiscuous mode enabled", (void *)rxq); 2746 return 0; 2747 } 2748 2749 /** 2750 * Disable promiscuous mode in a RX queue. 2751 * 2752 * @param rxq 2753 * Pointer to RX queue structure. 2754 */ 2755 static void 2756 rxq_promiscuous_disable(struct rxq *rxq) 2757 { 2758 if (rxq->priv->vf) 2759 return; 2760 DEBUG("%p: disabling promiscuous mode", (void *)rxq); 2761 if (rxq->promisc_flow == NULL) 2762 return; 2763 claim_zero(ibv_destroy_flow(rxq->promisc_flow)); 2764 rxq->promisc_flow = NULL; 2765 DEBUG("%p: promiscuous mode disabled", (void *)rxq); 2766 } 2767 2768 /** 2769 * Clean up a RX queue. 2770 * 2771 * Destroy objects, free allocated memory and reset the structure for reuse. 2772 * 2773 * @param rxq 2774 * Pointer to RX queue structure. 
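* Resources are released in reverse order of creation: burst interfaces first, then the QP (with its attached flows), the CQ, the resource domain and finally the memory region.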
2775 */ 2776 static void 2777 rxq_cleanup(struct rxq *rxq) 2778 { 2779 struct ibv_exp_release_intf_params params; 2780 2781 DEBUG("cleaning up %p", (void *)rxq); 2782 if (rxq->sp) 2783 rxq_free_elts_sp(rxq); 2784 else 2785 rxq_free_elts(rxq); 2786 if (rxq->if_qp != NULL) { 2787 assert(rxq->priv != NULL); 2788 assert(rxq->priv->ctx != NULL); 2789 assert(rxq->qp != NULL); 2790 params = (struct ibv_exp_release_intf_params){ 2791 .comp_mask = 0, 2792 }; 2793 claim_zero(ibv_exp_release_intf(rxq->priv->ctx, 2794 rxq->if_qp, 2795 ¶ms)); 2796 } 2797 if (rxq->if_cq != NULL) { 2798 assert(rxq->priv != NULL); 2799 assert(rxq->priv->ctx != NULL); 2800 assert(rxq->cq != NULL); 2801 params = (struct ibv_exp_release_intf_params){ 2802 .comp_mask = 0, 2803 }; 2804 claim_zero(ibv_exp_release_intf(rxq->priv->ctx, 2805 rxq->if_cq, 2806 ¶ms)); 2807 } 2808 if (rxq->qp != NULL) { 2809 rxq_promiscuous_disable(rxq); 2810 rxq_allmulticast_disable(rxq); 2811 rxq_mac_addrs_del(rxq); 2812 claim_zero(ibv_destroy_qp(rxq->qp)); 2813 } 2814 if (rxq->cq != NULL) 2815 claim_zero(ibv_destroy_cq(rxq->cq)); 2816 if (rxq->rd != NULL) { 2817 struct ibv_exp_destroy_res_domain_attr attr = { 2818 .comp_mask = 0, 2819 }; 2820 2821 assert(rxq->priv != NULL); 2822 assert(rxq->priv->ctx != NULL); 2823 claim_zero(ibv_exp_destroy_res_domain(rxq->priv->ctx, 2824 rxq->rd, 2825 &attr)); 2826 } 2827 if (rxq->mr != NULL) 2828 claim_zero(ibv_dereg_mr(rxq->mr)); 2829 memset(rxq, 0, sizeof(*rxq)); 2830 } 2831 2832 /** 2833 * Translate RX completion flags to packet type. 2834 * 2835 * @param flags 2836 * RX completion flags returned by poll_length_flags(). 2837 * 2838 * @return 2839 * Packet type for struct rte_mbuf. 2840 */ 2841 static inline uint32_t 2842 rxq_cq_to_pkt_type(uint32_t flags) 2843 { 2844 uint32_t pkt_type; 2845 2846 if (flags & IBV_EXP_CQ_RX_TUNNEL_PACKET) 2847 pkt_type = 2848 TRANSPOSE(flags, 2849 IBV_EXP_CQ_RX_OUTER_IPV4_PACKET, RTE_PTYPE_L3_IPV4) | 2850 TRANSPOSE(flags, 2851 IBV_EXP_CQ_RX_OUTER_IPV6_PACKET, RTE_PTYPE_L3_IPV6) | 2852 TRANSPOSE(flags, 2853 IBV_EXP_CQ_RX_IPV4_PACKET, RTE_PTYPE_INNER_L3_IPV4) | 2854 TRANSPOSE(flags, 2855 IBV_EXP_CQ_RX_IPV6_PACKET, RTE_PTYPE_INNER_L3_IPV6); 2856 else 2857 pkt_type = 2858 TRANSPOSE(flags, 2859 IBV_EXP_CQ_RX_IPV4_PACKET, RTE_PTYPE_L3_IPV4) | 2860 TRANSPOSE(flags, 2861 IBV_EXP_CQ_RX_IPV6_PACKET, RTE_PTYPE_L3_IPV6); 2862 return pkt_type; 2863 } 2864 2865 /** 2866 * Translate RX completion flags to offload flags. 2867 * 2868 * @param[in] rxq 2869 * Pointer to RX queue structure. 2870 * @param flags 2871 * RX completion flags returned by poll_length_flags(). 2872 * 2873 * @return 2874 * Offload flags (ol_flags) for struct rte_mbuf. 2875 */ 2876 static inline uint32_t 2877 rxq_cq_to_ol_flags(const struct rxq *rxq, uint32_t flags) 2878 { 2879 uint32_t ol_flags = 0; 2880 2881 if (rxq->csum) 2882 ol_flags |= 2883 TRANSPOSE(~flags, 2884 IBV_EXP_CQ_RX_IP_CSUM_OK, 2885 PKT_RX_IP_CKSUM_BAD) | 2886 TRANSPOSE(~flags, 2887 IBV_EXP_CQ_RX_TCP_UDP_CSUM_OK, 2888 PKT_RX_L4_CKSUM_BAD); 2889 /* 2890 * PKT_RX_IP_CKSUM_BAD and PKT_RX_L4_CKSUM_BAD are used in place 2891 * of PKT_RX_EIP_CKSUM_BAD because the latter is not functional 2892 * (its value is 0). 
2893 */ 2894 if ((flags & IBV_EXP_CQ_RX_TUNNEL_PACKET) && (rxq->csum_l2tun)) 2895 ol_flags |= 2896 TRANSPOSE(~flags, 2897 IBV_EXP_CQ_RX_OUTER_IP_CSUM_OK, 2898 PKT_RX_IP_CKSUM_BAD) | 2899 TRANSPOSE(~flags, 2900 IBV_EXP_CQ_RX_OUTER_TCP_UDP_CSUM_OK, 2901 PKT_RX_L4_CKSUM_BAD); 2902 return ol_flags; 2903 } 2904 2905 static uint16_t 2906 mlx4_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n); 2907 2908 /** 2909 * DPDK callback for RX with scattered packets support. 2910 * 2911 * @param dpdk_rxq 2912 * Generic pointer to RX queue structure. 2913 * @param[out] pkts 2914 * Array to store received packets. 2915 * @param pkts_n 2916 * Maximum number of packets in array. 2917 * 2918 * @return 2919 * Number of packets successfully received (<= pkts_n). 2920 */ 2921 static uint16_t 2922 mlx4_rx_burst_sp(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) 2923 { 2924 struct rxq *rxq = (struct rxq *)dpdk_rxq; 2925 struct rxq_elt_sp (*elts)[rxq->elts_n] = rxq->elts.sp; 2926 const unsigned int elts_n = rxq->elts_n; 2927 unsigned int elts_head = rxq->elts_head; 2928 struct ibv_recv_wr head; 2929 struct ibv_recv_wr **next = &head.next; 2930 struct ibv_recv_wr *bad_wr; 2931 unsigned int i; 2932 unsigned int pkts_ret = 0; 2933 int ret; 2934 2935 if (unlikely(!rxq->sp)) 2936 return mlx4_rx_burst(dpdk_rxq, pkts, pkts_n); 2937 if (unlikely(elts == NULL)) /* See RTE_DEV_CMD_SET_MTU. */ 2938 return 0; 2939 for (i = 0; (i != pkts_n); ++i) { 2940 struct rxq_elt_sp *elt = &(*elts)[elts_head]; 2941 struct ibv_recv_wr *wr = &elt->wr; 2942 uint64_t wr_id = wr->wr_id; 2943 unsigned int len; 2944 unsigned int pkt_buf_len; 2945 struct rte_mbuf *pkt_buf = NULL; /* Buffer returned in pkts. */ 2946 struct rte_mbuf **pkt_buf_next = &pkt_buf; 2947 unsigned int seg_headroom = RTE_PKTMBUF_HEADROOM; 2948 unsigned int j = 0; 2949 uint32_t flags; 2950 2951 /* Sanity checks. */ 2952 #ifdef NDEBUG 2953 (void)wr_id; 2954 #endif 2955 assert(wr_id < rxq->elts_n); 2956 assert(wr->sg_list == elt->sges); 2957 assert(wr->num_sge == elemof(elt->sges)); 2958 assert(elts_head < rxq->elts_n); 2959 assert(rxq->elts_head < rxq->elts_n); 2960 ret = rxq->if_cq->poll_length_flags(rxq->cq, NULL, NULL, 2961 &flags); 2962 if (unlikely(ret < 0)) { 2963 struct ibv_wc wc; 2964 int wcs_n; 2965 2966 DEBUG("rxq=%p, poll_length() failed (ret=%d)", 2967 (void *)rxq, ret); 2968 /* ibv_poll_cq() must be used in case of failure. */ 2969 wcs_n = ibv_poll_cq(rxq->cq, 1, &wc); 2970 if (unlikely(wcs_n == 0)) 2971 break; 2972 if (unlikely(wcs_n < 0)) { 2973 DEBUG("rxq=%p, ibv_poll_cq() failed (wcs_n=%d)", 2974 (void *)rxq, wcs_n); 2975 break; 2976 } 2977 assert(wcs_n == 1); 2978 if (unlikely(wc.status != IBV_WC_SUCCESS)) { 2979 /* Whatever, just repost the offending WR. */ 2980 DEBUG("rxq=%p, wr_id=%" PRIu64 ": bad work" 2981 " completion status (%d): %s", 2982 (void *)rxq, wc.wr_id, wc.status, 2983 ibv_wc_status_str(wc.status)); 2984 #ifdef MLX4_PMD_SOFT_COUNTERS 2985 /* Increment dropped packets counter. */ 2986 ++rxq->stats.idropped; 2987 #endif 2988 /* Link completed WRs together for repost. */ 2989 *next = wr; 2990 next = &wr->next; 2991 goto repost; 2992 } 2993 ret = wc.byte_len; 2994 } 2995 if (ret == 0) 2996 break; 2997 len = ret; 2998 pkt_buf_len = len; 2999 /* Link completed WRs together for repost. */ 3000 *next = wr; 3001 next = &wr->next; 3002 /* 3003 * Replace spent segments with new ones, concatenate and 3004 * return them as pkt_buf. 
3005 */ 3006 while (1) { 3007 struct ibv_sge *sge = &elt->sges[j]; 3008 struct rte_mbuf *seg = elt->bufs[j]; 3009 struct rte_mbuf *rep; 3010 unsigned int seg_tailroom; 3011 3012 /* 3013 * Fetch initial bytes of packet descriptor into a 3014 * cacheline while allocating rep. 3015 */ 3016 rte_prefetch0(seg); 3017 rep = __rte_mbuf_raw_alloc(rxq->mp); 3018 if (unlikely(rep == NULL)) { 3019 /* 3020 * Unable to allocate a replacement mbuf, 3021 * repost WR. 3022 */ 3023 DEBUG("rxq=%p, wr_id=%" PRIu64 ":" 3024 " can't allocate a new mbuf", 3025 (void *)rxq, wr_id); 3026 if (pkt_buf != NULL) { 3027 *pkt_buf_next = NULL; 3028 rte_pktmbuf_free(pkt_buf); 3029 } 3030 /* Increase out of memory counters. */ 3031 ++rxq->stats.rx_nombuf; 3032 ++rxq->priv->dev->data->rx_mbuf_alloc_failed; 3033 goto repost; 3034 } 3035 #ifndef NDEBUG 3036 /* Poison user-modifiable fields in rep. */ 3037 NEXT(rep) = (void *)((uintptr_t)-1); 3038 SET_DATA_OFF(rep, 0xdead); 3039 DATA_LEN(rep) = 0xd00d; 3040 PKT_LEN(rep) = 0xdeadd00d; 3041 NB_SEGS(rep) = 0x2a; 3042 PORT(rep) = 0x2a; 3043 rep->ol_flags = -1; 3044 #endif 3045 assert(rep->buf_len == seg->buf_len); 3046 assert(rep->buf_len == rxq->mb_len); 3047 /* Reconfigure sge to use rep instead of seg. */ 3048 assert(sge->lkey == rxq->mr->lkey); 3049 sge->addr = ((uintptr_t)rep->buf_addr + seg_headroom); 3050 elt->bufs[j] = rep; 3051 ++j; 3052 /* Update pkt_buf if it's the first segment, or link 3053 * seg to the previous one and update pkt_buf_next. */ 3054 *pkt_buf_next = seg; 3055 pkt_buf_next = &NEXT(seg); 3056 /* Update seg information. */ 3057 seg_tailroom = (seg->buf_len - seg_headroom); 3058 assert(sge->length == seg_tailroom); 3059 SET_DATA_OFF(seg, seg_headroom); 3060 if (likely(len <= seg_tailroom)) { 3061 /* Last segment. */ 3062 DATA_LEN(seg) = len; 3063 PKT_LEN(seg) = len; 3064 /* Sanity check. */ 3065 assert(rte_pktmbuf_headroom(seg) == 3066 seg_headroom); 3067 assert(rte_pktmbuf_tailroom(seg) == 3068 (seg_tailroom - len)); 3069 break; 3070 } 3071 DATA_LEN(seg) = seg_tailroom; 3072 PKT_LEN(seg) = seg_tailroom; 3073 /* Sanity check. */ 3074 assert(rte_pktmbuf_headroom(seg) == seg_headroom); 3075 assert(rte_pktmbuf_tailroom(seg) == 0); 3076 /* Fix len and clear headroom for next segments. */ 3077 len -= seg_tailroom; 3078 seg_headroom = 0; 3079 } 3080 /* Update head and tail segments. */ 3081 *pkt_buf_next = NULL; 3082 assert(pkt_buf != NULL); 3083 assert(j != 0); 3084 NB_SEGS(pkt_buf) = j; 3085 PORT(pkt_buf) = rxq->port_id; 3086 PKT_LEN(pkt_buf) = pkt_buf_len; 3087 pkt_buf->packet_type = rxq_cq_to_pkt_type(flags); 3088 pkt_buf->ol_flags = rxq_cq_to_ol_flags(rxq, flags); 3089 3090 /* Return packet. */ 3091 *(pkts++) = pkt_buf; 3092 ++pkts_ret; 3093 #ifdef MLX4_PMD_SOFT_COUNTERS 3094 /* Increase bytes counter. */ 3095 rxq->stats.ibytes += pkt_buf_len; 3096 #endif 3097 repost: 3098 if (++elts_head >= elts_n) 3099 elts_head = 0; 3100 continue; 3101 } 3102 if (unlikely(i == 0)) 3103 return 0; 3104 *next = NULL; 3105 /* Repost WRs. */ 3106 #ifdef DEBUG_RECV 3107 DEBUG("%p: reposting %d WRs", (void *)rxq, i); 3108 #endif 3109 ret = ibv_post_recv(rxq->qp, head.next, &bad_wr); 3110 if (unlikely(ret)) { 3111 /* Inability to repost WRs is fatal. */ 3112 DEBUG("%p: ibv_post_recv(): failed for WR %p: %s", 3113 (void *)rxq->priv, 3114 (void *)bad_wr, 3115 strerror(ret)); 3116 abort(); 3117 } 3118 rxq->elts_head = elts_head; 3119 #ifdef MLX4_PMD_SOFT_COUNTERS 3120 /* Increase packets counter. 
*/ 3121 rxq->stats.ipackets += pkts_ret; 3122 #endif 3123 return pkts_ret; 3124 } 3125 3126 /** 3127 * DPDK callback for RX. 3128 * 3129 * The following function is the same as mlx4_rx_burst_sp(), except it doesn't 3130 * manage scattered packets. Improves performance when MRU is lower than the 3131 * size of the first segment. 3132 * 3133 * @param dpdk_rxq 3134 * Generic pointer to RX queue structure. 3135 * @param[out] pkts 3136 * Array to store received packets. 3137 * @param pkts_n 3138 * Maximum number of packets in array. 3139 * 3140 * @return 3141 * Number of packets successfully received (<= pkts_n). 3142 */ 3143 static uint16_t 3144 mlx4_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) 3145 { 3146 struct rxq *rxq = (struct rxq *)dpdk_rxq; 3147 struct rxq_elt (*elts)[rxq->elts_n] = rxq->elts.no_sp; 3148 const unsigned int elts_n = rxq->elts_n; 3149 unsigned int elts_head = rxq->elts_head; 3150 struct ibv_sge sges[pkts_n]; 3151 unsigned int i; 3152 unsigned int pkts_ret = 0; 3153 int ret; 3154 3155 if (unlikely(rxq->sp)) 3156 return mlx4_rx_burst_sp(dpdk_rxq, pkts, pkts_n); 3157 for (i = 0; (i != pkts_n); ++i) { 3158 struct rxq_elt *elt = &(*elts)[elts_head]; 3159 struct ibv_recv_wr *wr = &elt->wr; 3160 uint64_t wr_id = wr->wr_id; 3161 unsigned int len; 3162 struct rte_mbuf *seg = (void *)((uintptr_t)elt->sge.addr - 3163 WR_ID(wr_id).offset); 3164 struct rte_mbuf *rep; 3165 uint32_t flags; 3166 3167 /* Sanity checks. */ 3168 assert(WR_ID(wr_id).id < rxq->elts_n); 3169 assert(wr->sg_list == &elt->sge); 3170 assert(wr->num_sge == 1); 3171 assert(elts_head < rxq->elts_n); 3172 assert(rxq->elts_head < rxq->elts_n); 3173 /* 3174 * Fetch initial bytes of packet descriptor into a 3175 * cacheline while allocating rep. 3176 */ 3177 rte_prefetch0(seg); 3178 rte_prefetch0(&seg->cacheline1); 3179 ret = rxq->if_cq->poll_length_flags(rxq->cq, NULL, NULL, 3180 &flags); 3181 if (unlikely(ret < 0)) { 3182 struct ibv_wc wc; 3183 int wcs_n; 3184 3185 DEBUG("rxq=%p, poll_length() failed (ret=%d)", 3186 (void *)rxq, ret); 3187 /* ibv_poll_cq() must be used in case of failure. */ 3188 wcs_n = ibv_poll_cq(rxq->cq, 1, &wc); 3189 if (unlikely(wcs_n == 0)) 3190 break; 3191 if (unlikely(wcs_n < 0)) { 3192 DEBUG("rxq=%p, ibv_poll_cq() failed (wcs_n=%d)", 3193 (void *)rxq, wcs_n); 3194 break; 3195 } 3196 assert(wcs_n == 1); 3197 if (unlikely(wc.status != IBV_WC_SUCCESS)) { 3198 /* Whatever, just repost the offending WR. */ 3199 DEBUG("rxq=%p, wr_id=%" PRIu64 ": bad work" 3200 " completion status (%d): %s", 3201 (void *)rxq, wc.wr_id, wc.status, 3202 ibv_wc_status_str(wc.status)); 3203 #ifdef MLX4_PMD_SOFT_COUNTERS 3204 /* Increment dropped packets counter. */ 3205 ++rxq->stats.idropped; 3206 #endif 3207 /* Add SGE to array for repost. */ 3208 sges[i] = elt->sge; 3209 goto repost; 3210 } 3211 ret = wc.byte_len; 3212 } 3213 if (ret == 0) 3214 break; 3215 len = ret; 3216 rep = __rte_mbuf_raw_alloc(rxq->mp); 3217 if (unlikely(rep == NULL)) { 3218 /* 3219 * Unable to allocate a replacement mbuf, 3220 * repost WR. 3221 */ 3222 DEBUG("rxq=%p, wr_id=%" PRIu32 ":" 3223 " can't allocate a new mbuf", 3224 (void *)rxq, WR_ID(wr_id).id); 3225 /* Increase out of memory counters. */ 3226 ++rxq->stats.rx_nombuf; 3227 ++rxq->priv->dev->data->rx_mbuf_alloc_failed; 3228 goto repost; 3229 } 3230 3231 /* Reconfigure sge to use rep instead of seg. 
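* The offset stored in the WR ID is recomputed as well since the replacement mbuf has a different buffer address than the one returned to the application.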
*/ 3232 elt->sge.addr = (uintptr_t)rep->buf_addr + RTE_PKTMBUF_HEADROOM; 3233 assert(elt->sge.lkey == rxq->mr->lkey); 3234 WR_ID(wr->wr_id).offset = 3235 (((uintptr_t)rep->buf_addr + RTE_PKTMBUF_HEADROOM) - 3236 (uintptr_t)rep); 3237 assert(WR_ID(wr->wr_id).id == WR_ID(wr_id).id); 3238 3239 /* Add SGE to array for repost. */ 3240 sges[i] = elt->sge; 3241 3242 /* Update seg information. */ 3243 SET_DATA_OFF(seg, RTE_PKTMBUF_HEADROOM); 3244 NB_SEGS(seg) = 1; 3245 PORT(seg) = rxq->port_id; 3246 NEXT(seg) = NULL; 3247 PKT_LEN(seg) = len; 3248 DATA_LEN(seg) = len; 3249 seg->packet_type = rxq_cq_to_pkt_type(flags); 3250 seg->ol_flags = rxq_cq_to_ol_flags(rxq, flags); 3251 3252 /* Return packet. */ 3253 *(pkts++) = seg; 3254 ++pkts_ret; 3255 #ifdef MLX4_PMD_SOFT_COUNTERS 3256 /* Increase bytes counter. */ 3257 rxq->stats.ibytes += len; 3258 #endif 3259 repost: 3260 if (++elts_head >= elts_n) 3261 elts_head = 0; 3262 continue; 3263 } 3264 if (unlikely(i == 0)) 3265 return 0; 3266 /* Repost WRs. */ 3267 #ifdef DEBUG_RECV 3268 DEBUG("%p: reposting %u WRs", (void *)rxq, i); 3269 #endif 3270 ret = rxq->if_qp->recv_burst(rxq->qp, sges, i); 3271 if (unlikely(ret)) { 3272 /* Inability to repost WRs is fatal. */ 3273 DEBUG("%p: recv_burst(): failed (ret=%d)", 3274 (void *)rxq->priv, 3275 ret); 3276 abort(); 3277 } 3278 rxq->elts_head = elts_head; 3279 #ifdef MLX4_PMD_SOFT_COUNTERS 3280 /* Increase packets counter. */ 3281 rxq->stats.ipackets += pkts_ret; 3282 #endif 3283 return pkts_ret; 3284 } 3285 3286 /** 3287 * DPDK callback for RX in secondary processes. 3288 * 3289 * This function configures all queues from primary process information 3290 * if necessary before reverting to the normal RX burst callback. 3291 * 3292 * @param dpdk_rxq 3293 * Generic pointer to RX queue structure. 3294 * @param[out] pkts 3295 * Array to store received packets. 3296 * @param pkts_n 3297 * Maximum number of packets in array. 3298 * 3299 * @return 3300 * Number of packets successfully received (<= pkts_n). 3301 */ 3302 static uint16_t 3303 mlx4_rx_burst_secondary_setup(void *dpdk_rxq, struct rte_mbuf **pkts, 3304 uint16_t pkts_n) 3305 { 3306 struct rxq *rxq = dpdk_rxq; 3307 struct priv *priv = mlx4_secondary_data_setup(rxq->priv); 3308 struct priv *primary_priv; 3309 unsigned int index; 3310 3311 if (priv == NULL) 3312 return 0; 3313 primary_priv = 3314 mlx4_secondary_data[priv->dev->data->port_id].primary_priv; 3315 /* Look for queue index in both private structures. */ 3316 for (index = 0; index != priv->rxqs_n; ++index) 3317 if (((*primary_priv->rxqs)[index] == rxq) || 3318 ((*priv->rxqs)[index] == rxq)) 3319 break; 3320 if (index == priv->rxqs_n) 3321 return 0; 3322 rxq = (*priv->rxqs)[index]; 3323 return priv->dev->rx_pkt_burst(rxq, pkts, pkts_n); 3324 } 3325 3326 /** 3327 * Allocate a Queue Pair. 3328 * Optionally setup inline receive if supported. 3329 * 3330 * @param priv 3331 * Pointer to private structure. 3332 * @param cq 3333 * Completion queue to associate with QP. 3334 * @param desc 3335 * Number of descriptors in QP (hint only). 3336 * 3337 * @return 3338 * QP pointer or NULL in case of error. 3339 */ 3340 static struct ibv_qp * 3341 rxq_setup_qp(struct priv *priv, struct ibv_cq *cq, uint16_t desc, 3342 struct ibv_exp_res_domain *rd) 3343 { 3344 struct ibv_exp_qp_init_attr attr = { 3345 /* CQ to be associated with the send queue. */ 3346 .send_cq = cq, 3347 /* CQ to be associated with the receive queue. */ 3348 .recv_cq = cq, 3349 .cap = { 3350 /* Max number of outstanding WRs. 
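* (clamped to the limit reported by the device in device_attr).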
*/ 3351 .max_recv_wr = ((priv->device_attr.max_qp_wr < desc) ? 3352 priv->device_attr.max_qp_wr : 3353 desc), 3354 /* Max number of scatter/gather elements in a WR. */ 3355 .max_recv_sge = ((priv->device_attr.max_sge < 3356 MLX4_PMD_SGE_WR_N) ? 3357 priv->device_attr.max_sge : 3358 MLX4_PMD_SGE_WR_N), 3359 }, 3360 .qp_type = IBV_QPT_RAW_PACKET, 3361 .comp_mask = (IBV_EXP_QP_INIT_ATTR_PD | 3362 IBV_EXP_QP_INIT_ATTR_RES_DOMAIN), 3363 .pd = priv->pd, 3364 .res_domain = rd, 3365 }; 3366 3367 #ifdef INLINE_RECV 3368 attr.max_inl_recv = priv->inl_recv_size; 3369 attr.comp_mask |= IBV_EXP_QP_INIT_ATTR_INL_RECV; 3370 #endif 3371 return ibv_exp_create_qp(priv->ctx, &attr); 3372 } 3373 3374 #ifdef RSS_SUPPORT 3375 3376 /** 3377 * Allocate a RSS Queue Pair. 3378 * Optionally setup inline receive if supported. 3379 * 3380 * @param priv 3381 * Pointer to private structure. 3382 * @param cq 3383 * Completion queue to associate with QP. 3384 * @param desc 3385 * Number of descriptors in QP (hint only). 3386 * @param parent 3387 * If nonzero, create a parent QP, otherwise a child. 3388 * 3389 * @return 3390 * QP pointer or NULL in case of error. 3391 */ 3392 static struct ibv_qp * 3393 rxq_setup_qp_rss(struct priv *priv, struct ibv_cq *cq, uint16_t desc, 3394 int parent, struct ibv_exp_res_domain *rd) 3395 { 3396 struct ibv_exp_qp_init_attr attr = { 3397 /* CQ to be associated with the send queue. */ 3398 .send_cq = cq, 3399 /* CQ to be associated with the receive queue. */ 3400 .recv_cq = cq, 3401 .cap = { 3402 /* Max number of outstanding WRs. */ 3403 .max_recv_wr = ((priv->device_attr.max_qp_wr < desc) ? 3404 priv->device_attr.max_qp_wr : 3405 desc), 3406 /* Max number of scatter/gather elements in a WR. */ 3407 .max_recv_sge = ((priv->device_attr.max_sge < 3408 MLX4_PMD_SGE_WR_N) ? 3409 priv->device_attr.max_sge : 3410 MLX4_PMD_SGE_WR_N), 3411 }, 3412 .qp_type = IBV_QPT_RAW_PACKET, 3413 .comp_mask = (IBV_EXP_QP_INIT_ATTR_PD | 3414 IBV_EXP_QP_INIT_ATTR_RES_DOMAIN | 3415 IBV_EXP_QP_INIT_ATTR_QPG), 3416 .pd = priv->pd, 3417 .res_domain = rd, 3418 }; 3419 3420 #ifdef INLINE_RECV 3421 attr.max_inl_recv = priv->inl_recv_size, 3422 attr.comp_mask |= IBV_EXP_QP_INIT_ATTR_INL_RECV; 3423 #endif 3424 if (parent) { 3425 attr.qpg.qpg_type = IBV_EXP_QPG_PARENT; 3426 /* TSS isn't necessary. */ 3427 attr.qpg.parent_attrib.tss_child_count = 0; 3428 attr.qpg.parent_attrib.rss_child_count = priv->rxqs_n; 3429 DEBUG("initializing parent RSS queue"); 3430 } else { 3431 attr.qpg.qpg_type = IBV_EXP_QPG_CHILD_RX; 3432 attr.qpg.qpg_parent = priv->rxq_parent.qp; 3433 DEBUG("initializing child RSS queue"); 3434 } 3435 return ibv_exp_create_qp(priv->ctx, &attr); 3436 } 3437 3438 #endif /* RSS_SUPPORT */ 3439 3440 /** 3441 * Reconfigure a RX queue with new parameters. 3442 * 3443 * rxq_rehash() does not allocate mbufs, which, if not done from the right 3444 * thread (such as a control thread), may corrupt the pool. 3445 * In case of failure, the queue is left untouched. 3446 * 3447 * @param dev 3448 * Pointer to Ethernet device structure. 3449 * @param rxq 3450 * RX queue pointer. 3451 * 3452 * @return 3453 * 0 on success, errno value on failure. 
3454 */ 3455 static int 3456 rxq_rehash(struct rte_eth_dev *dev, struct rxq *rxq) 3457 { 3458 struct priv *priv = rxq->priv; 3459 struct rxq tmpl = *rxq; 3460 unsigned int mbuf_n; 3461 unsigned int desc_n; 3462 struct rte_mbuf **pool; 3463 unsigned int i, k; 3464 struct ibv_exp_qp_attr mod; 3465 struct ibv_recv_wr *bad_wr; 3466 int err; 3467 int parent = (rxq == &priv->rxq_parent); 3468 3469 if (parent) { 3470 ERROR("%p: cannot rehash parent queue %p", 3471 (void *)dev, (void *)rxq); 3472 return EINVAL; 3473 } 3474 DEBUG("%p: rehashing queue %p", (void *)dev, (void *)rxq); 3475 /* Number of descriptors and mbufs currently allocated. */ 3476 desc_n = (tmpl.elts_n * (tmpl.sp ? MLX4_PMD_SGE_WR_N : 1)); 3477 mbuf_n = desc_n; 3478 /* Toggle RX checksum offload if hardware supports it. */ 3479 if (priv->hw_csum) { 3480 tmpl.csum = !!dev->data->dev_conf.rxmode.hw_ip_checksum; 3481 rxq->csum = tmpl.csum; 3482 } 3483 if (priv->hw_csum_l2tun) { 3484 tmpl.csum_l2tun = !!dev->data->dev_conf.rxmode.hw_ip_checksum; 3485 rxq->csum_l2tun = tmpl.csum_l2tun; 3486 } 3487 /* Enable scattered packets support for this queue if necessary. */ 3488 if ((dev->data->dev_conf.rxmode.jumbo_frame) && 3489 (dev->data->dev_conf.rxmode.max_rx_pkt_len > 3490 (tmpl.mb_len - RTE_PKTMBUF_HEADROOM))) { 3491 tmpl.sp = 1; 3492 desc_n /= MLX4_PMD_SGE_WR_N; 3493 } else 3494 tmpl.sp = 0; 3495 DEBUG("%p: %s scattered packets support (%u WRs)", 3496 (void *)dev, (tmpl.sp ? "enabling" : "disabling"), desc_n); 3497 /* If scatter mode is the same as before, nothing to do. */ 3498 if (tmpl.sp == rxq->sp) { 3499 DEBUG("%p: nothing to do", (void *)dev); 3500 return 0; 3501 } 3502 /* Remove attached flows if RSS is disabled (no parent queue). */ 3503 if (!priv->rss) { 3504 rxq_allmulticast_disable(&tmpl); 3505 rxq_promiscuous_disable(&tmpl); 3506 rxq_mac_addrs_del(&tmpl); 3507 /* Update original queue in case of failure. */ 3508 rxq->allmulti_flow = tmpl.allmulti_flow; 3509 rxq->promisc_flow = tmpl.promisc_flow; 3510 memcpy(rxq->mac_configured, tmpl.mac_configured, 3511 sizeof(rxq->mac_configured)); 3512 memcpy(rxq->mac_flow, tmpl.mac_flow, sizeof(rxq->mac_flow)); 3513 } 3514 /* From now on, any failure will render the queue unusable. 3515 * Reinitialize QP. */ 3516 mod = (struct ibv_exp_qp_attr){ .qp_state = IBV_QPS_RESET }; 3517 err = ibv_exp_modify_qp(tmpl.qp, &mod, IBV_EXP_QP_STATE); 3518 if (err) { 3519 ERROR("%p: cannot reset QP: %s", (void *)dev, strerror(err)); 3520 assert(err > 0); 3521 return err; 3522 } 3523 err = ibv_resize_cq(tmpl.cq, desc_n); 3524 if (err) { 3525 ERROR("%p: cannot resize CQ: %s", (void *)dev, strerror(err)); 3526 assert(err > 0); 3527 return err; 3528 } 3529 mod = (struct ibv_exp_qp_attr){ 3530 /* Move the QP to this state. */ 3531 .qp_state = IBV_QPS_INIT, 3532 /* Primary port number. */ 3533 .port_num = priv->port 3534 }; 3535 err = ibv_exp_modify_qp(tmpl.qp, &mod, 3536 (IBV_EXP_QP_STATE | 3537 #ifdef RSS_SUPPORT 3538 (parent ? IBV_EXP_QP_GROUP_RSS : 0) | 3539 #endif /* RSS_SUPPORT */ 3540 IBV_EXP_QP_PORT)); 3541 if (err) { 3542 ERROR("%p: QP state to IBV_QPS_INIT failed: %s", 3543 (void *)dev, strerror(err)); 3544 assert(err > 0); 3545 return err; 3546 }; 3547 /* Reconfigure flows. Do not care for errors. */ 3548 if (!priv->rss) { 3549 rxq_mac_addrs_add(&tmpl); 3550 if (priv->promisc) 3551 rxq_promiscuous_enable(&tmpl); 3552 if (priv->allmulti) 3553 rxq_allmulticast_enable(&tmpl); 3554 /* Update original queue in case of failure. 
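* This keeps the original descriptor aware of the flows currently attached to the shared QP should a later step fail.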
*/ 3555 rxq->allmulti_flow = tmpl.allmulti_flow; 3556 rxq->promisc_flow = tmpl.promisc_flow; 3557 memcpy(rxq->mac_configured, tmpl.mac_configured, 3558 sizeof(rxq->mac_configured)); 3559 memcpy(rxq->mac_flow, tmpl.mac_flow, sizeof(rxq->mac_flow)); 3560 } 3561 /* Allocate pool. */ 3562 pool = rte_malloc(__func__, (mbuf_n * sizeof(*pool)), 0); 3563 if (pool == NULL) { 3564 ERROR("%p: cannot allocate memory", (void *)dev); 3565 return ENOBUFS; 3566 } 3567 /* Snatch mbufs from original queue. */ 3568 k = 0; 3569 if (rxq->sp) { 3570 struct rxq_elt_sp (*elts)[rxq->elts_n] = rxq->elts.sp; 3571 3572 for (i = 0; (i != elemof(*elts)); ++i) { 3573 struct rxq_elt_sp *elt = &(*elts)[i]; 3574 unsigned int j; 3575 3576 for (j = 0; (j != elemof(elt->bufs)); ++j) { 3577 assert(elt->bufs[j] != NULL); 3578 pool[k++] = elt->bufs[j]; 3579 } 3580 } 3581 } else { 3582 struct rxq_elt (*elts)[rxq->elts_n] = rxq->elts.no_sp; 3583 3584 for (i = 0; (i != elemof(*elts)); ++i) { 3585 struct rxq_elt *elt = &(*elts)[i]; 3586 struct rte_mbuf *buf = (void *) 3587 ((uintptr_t)elt->sge.addr - 3588 WR_ID(elt->wr.wr_id).offset); 3589 3590 assert(WR_ID(elt->wr.wr_id).id == i); 3591 pool[k++] = buf; 3592 } 3593 } 3594 assert(k == mbuf_n); 3595 tmpl.elts_n = 0; 3596 tmpl.elts.sp = NULL; 3597 assert((void *)&tmpl.elts.sp == (void *)&tmpl.elts.no_sp); 3598 err = ((tmpl.sp) ? 3599 rxq_alloc_elts_sp(&tmpl, desc_n, pool) : 3600 rxq_alloc_elts(&tmpl, desc_n, pool)); 3601 if (err) { 3602 ERROR("%p: cannot reallocate WRs, aborting", (void *)dev); 3603 rte_free(pool); 3604 assert(err > 0); 3605 return err; 3606 } 3607 assert(tmpl.elts_n == desc_n); 3608 assert(tmpl.elts.sp != NULL); 3609 rte_free(pool); 3610 /* Clean up original data. */ 3611 rxq->elts_n = 0; 3612 rte_free(rxq->elts.sp); 3613 rxq->elts.sp = NULL; 3614 /* Post WRs. */ 3615 err = ibv_post_recv(tmpl.qp, 3616 (tmpl.sp ? 3617 &(*tmpl.elts.sp)[0].wr : 3618 &(*tmpl.elts.no_sp)[0].wr), 3619 &bad_wr); 3620 if (err) { 3621 ERROR("%p: ibv_post_recv() failed for WR %p: %s", 3622 (void *)dev, 3623 (void *)bad_wr, 3624 strerror(err)); 3625 goto skip_rtr; 3626 } 3627 mod = (struct ibv_exp_qp_attr){ 3628 .qp_state = IBV_QPS_RTR 3629 }; 3630 err = ibv_exp_modify_qp(tmpl.qp, &mod, IBV_EXP_QP_STATE); 3631 if (err) 3632 ERROR("%p: QP state to IBV_QPS_RTR failed: %s", 3633 (void *)dev, strerror(err)); 3634 skip_rtr: 3635 *rxq = tmpl; 3636 assert(err >= 0); 3637 return err; 3638 } 3639 3640 /** 3641 * Configure a RX queue. 3642 * 3643 * @param dev 3644 * Pointer to Ethernet device structure. 3645 * @param rxq 3646 * Pointer to RX queue structure. 3647 * @param desc 3648 * Number of descriptors to configure in queue. 3649 * @param socket 3650 * NUMA socket on which memory must be allocated. 3651 * @param[in] conf 3652 * Thresholds parameters. 3653 * @param mp 3654 * Memory pool for buffer allocations. 3655 * 3656 * @return 3657 * 0 on success, errno value on failure. 
3658 */ 3659 static int 3660 rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc, 3661 unsigned int socket, const struct rte_eth_rxconf *conf, 3662 struct rte_mempool *mp) 3663 { 3664 struct priv *priv = dev->data->dev_private; 3665 struct rxq tmpl = { 3666 .priv = priv, 3667 .mp = mp, 3668 .socket = socket 3669 }; 3670 struct ibv_exp_qp_attr mod; 3671 union { 3672 struct ibv_exp_query_intf_params params; 3673 struct ibv_exp_cq_init_attr cq; 3674 struct ibv_exp_res_domain_init_attr rd; 3675 } attr; 3676 enum ibv_exp_query_intf_status status; 3677 struct ibv_recv_wr *bad_wr; 3678 struct rte_mbuf *buf; 3679 int ret = 0; 3680 int parent = (rxq == &priv->rxq_parent); 3681 3682 (void)conf; /* Thresholds configuration (ignored). */ 3683 /* 3684 * If this is a parent queue, hardware must support RSS and 3685 * RSS must be enabled. 3686 */ 3687 assert((!parent) || ((priv->hw_rss) && (priv->rss))); 3688 if (parent) { 3689 /* Even if unused, ibv_create_cq() requires at least one 3690 * descriptor. */ 3691 desc = 1; 3692 goto skip_mr; 3693 } 3694 if ((desc == 0) || (desc % MLX4_PMD_SGE_WR_N)) { 3695 ERROR("%p: invalid number of RX descriptors (must be a" 3696 " multiple of %d)", (void *)dev, MLX4_PMD_SGE_WR_N); 3697 return EINVAL; 3698 } 3699 /* Get mbuf length. */ 3700 buf = rte_pktmbuf_alloc(mp); 3701 if (buf == NULL) { 3702 ERROR("%p: unable to allocate mbuf", (void *)dev); 3703 return ENOMEM; 3704 } 3705 tmpl.mb_len = buf->buf_len; 3706 assert((rte_pktmbuf_headroom(buf) + 3707 rte_pktmbuf_tailroom(buf)) == tmpl.mb_len); 3708 assert(rte_pktmbuf_headroom(buf) == RTE_PKTMBUF_HEADROOM); 3709 rte_pktmbuf_free(buf); 3710 /* Toggle RX checksum offload if hardware supports it. */ 3711 if (priv->hw_csum) 3712 tmpl.csum = !!dev->data->dev_conf.rxmode.hw_ip_checksum; 3713 if (priv->hw_csum_l2tun) 3714 tmpl.csum_l2tun = !!dev->data->dev_conf.rxmode.hw_ip_checksum; 3715 /* Enable scattered packets support for this queue if necessary. */ 3716 if ((dev->data->dev_conf.rxmode.jumbo_frame) && 3717 (dev->data->dev_conf.rxmode.max_rx_pkt_len > 3718 (tmpl.mb_len - RTE_PKTMBUF_HEADROOM))) { 3719 tmpl.sp = 1; 3720 desc /= MLX4_PMD_SGE_WR_N; 3721 } 3722 DEBUG("%p: %s scattered packets support (%u WRs)", 3723 (void *)dev, (tmpl.sp ? "enabling" : "disabling"), desc); 3724 /* Use the entire RX mempool as the memory region. 
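* A single registration covering the whole pool gives every mbuf allocated from it the same lkey, reused for all SGEs.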
*/ 3725 tmpl.mr = ibv_reg_mr(priv->pd, 3726 (void *)mp->elt_va_start, 3727 (mp->elt_va_end - mp->elt_va_start), 3728 (IBV_ACCESS_LOCAL_WRITE | 3729 IBV_ACCESS_REMOTE_WRITE)); 3730 if (tmpl.mr == NULL) { 3731 ret = EINVAL; 3732 ERROR("%p: MR creation failure: %s", 3733 (void *)dev, strerror(ret)); 3734 goto error; 3735 } 3736 skip_mr: 3737 attr.rd = (struct ibv_exp_res_domain_init_attr){ 3738 .comp_mask = (IBV_EXP_RES_DOMAIN_THREAD_MODEL | 3739 IBV_EXP_RES_DOMAIN_MSG_MODEL), 3740 .thread_model = IBV_EXP_THREAD_SINGLE, 3741 .msg_model = IBV_EXP_MSG_HIGH_BW, 3742 }; 3743 tmpl.rd = ibv_exp_create_res_domain(priv->ctx, &attr.rd); 3744 if (tmpl.rd == NULL) { 3745 ret = ENOMEM; 3746 ERROR("%p: RD creation failure: %s", 3747 (void *)dev, strerror(ret)); 3748 goto error; 3749 } 3750 attr.cq = (struct ibv_exp_cq_init_attr){ 3751 .comp_mask = IBV_EXP_CQ_INIT_ATTR_RES_DOMAIN, 3752 .res_domain = tmpl.rd, 3753 }; 3754 tmpl.cq = ibv_exp_create_cq(priv->ctx, desc, NULL, NULL, 0, &attr.cq); 3755 if (tmpl.cq == NULL) { 3756 ret = ENOMEM; 3757 ERROR("%p: CQ creation failure: %s", 3758 (void *)dev, strerror(ret)); 3759 goto error; 3760 } 3761 DEBUG("priv->device_attr.max_qp_wr is %d", 3762 priv->device_attr.max_qp_wr); 3763 DEBUG("priv->device_attr.max_sge is %d", 3764 priv->device_attr.max_sge); 3765 #ifdef RSS_SUPPORT 3766 if (priv->rss) 3767 tmpl.qp = rxq_setup_qp_rss(priv, tmpl.cq, desc, parent, 3768 tmpl.rd); 3769 else 3770 #endif /* RSS_SUPPORT */ 3771 tmpl.qp = rxq_setup_qp(priv, tmpl.cq, desc, tmpl.rd); 3772 if (tmpl.qp == NULL) { 3773 ret = (errno ? errno : EINVAL); 3774 ERROR("%p: QP creation failure: %s", 3775 (void *)dev, strerror(ret)); 3776 goto error; 3777 } 3778 mod = (struct ibv_exp_qp_attr){ 3779 /* Move the QP to this state. */ 3780 .qp_state = IBV_QPS_INIT, 3781 /* Primary port number. */ 3782 .port_num = priv->port 3783 }; 3784 ret = ibv_exp_modify_qp(tmpl.qp, &mod, 3785 (IBV_EXP_QP_STATE | 3786 #ifdef RSS_SUPPORT 3787 (parent ? IBV_EXP_QP_GROUP_RSS : 0) | 3788 #endif /* RSS_SUPPORT */ 3789 IBV_EXP_QP_PORT)); 3790 if (ret) { 3791 ERROR("%p: QP state to IBV_QPS_INIT failed: %s", 3792 (void *)dev, strerror(ret)); 3793 goto error; 3794 } 3795 if ((parent) || (!priv->rss)) { 3796 /* Configure MAC and broadcast addresses. */ 3797 ret = rxq_mac_addrs_add(&tmpl); 3798 if (ret) { 3799 ERROR("%p: QP flow attachment failed: %s", 3800 (void *)dev, strerror(ret)); 3801 goto error; 3802 } 3803 } 3804 /* Allocate descriptors for RX queues, except for the RSS parent. */ 3805 if (parent) 3806 goto skip_alloc; 3807 if (tmpl.sp) 3808 ret = rxq_alloc_elts_sp(&tmpl, desc, NULL); 3809 else 3810 ret = rxq_alloc_elts(&tmpl, desc, NULL); 3811 if (ret) { 3812 ERROR("%p: RXQ allocation failed: %s", 3813 (void *)dev, strerror(ret)); 3814 goto error; 3815 } 3816 ret = ibv_post_recv(tmpl.qp, 3817 (tmpl.sp ? 3818 &(*tmpl.elts.sp)[0].wr : 3819 &(*tmpl.elts.no_sp)[0].wr), 3820 &bad_wr); 3821 if (ret) { 3822 ERROR("%p: ibv_post_recv() failed for WR %p: %s", 3823 (void *)dev, 3824 (void *)bad_wr, 3825 strerror(ret)); 3826 goto error; 3827 } 3828 skip_alloc: 3829 mod = (struct ibv_exp_qp_attr){ 3830 .qp_state = IBV_QPS_RTR 3831 }; 3832 ret = ibv_exp_modify_qp(tmpl.qp, &mod, IBV_EXP_QP_STATE); 3833 if (ret) { 3834 ERROR("%p: QP state to IBV_QPS_RTR failed: %s", 3835 (void *)dev, strerror(ret)); 3836 goto error; 3837 } 3838 /* Save port ID. 
*/ 3839 tmpl.port_id = dev->data->port_id; 3840 DEBUG("%p: RTE port ID: %u", (void *)rxq, tmpl.port_id); 3841 attr.params = (struct ibv_exp_query_intf_params){ 3842 .intf_scope = IBV_EXP_INTF_GLOBAL, 3843 .intf = IBV_EXP_INTF_CQ, 3844 .obj = tmpl.cq, 3845 }; 3846 tmpl.if_cq = ibv_exp_query_intf(priv->ctx, &attr.params, &status); 3847 if (tmpl.if_cq == NULL) { 3848 ERROR("%p: CQ interface family query failed with status %d", 3849 (void *)dev, status); 3850 goto error; 3851 } 3852 attr.params = (struct ibv_exp_query_intf_params){ 3853 .intf_scope = IBV_EXP_INTF_GLOBAL, 3854 .intf = IBV_EXP_INTF_QP_BURST, 3855 .obj = tmpl.qp, 3856 }; 3857 tmpl.if_qp = ibv_exp_query_intf(priv->ctx, &attr.params, &status); 3858 if (tmpl.if_qp == NULL) { 3859 ERROR("%p: QP interface family query failed with status %d", 3860 (void *)dev, status); 3861 goto error; 3862 } 3863 /* Clean up rxq in case we're reinitializing it. */ 3864 DEBUG("%p: cleaning-up old rxq just in case", (void *)rxq); 3865 rxq_cleanup(rxq); 3866 *rxq = tmpl; 3867 DEBUG("%p: rxq updated with %p", (void *)rxq, (void *)&tmpl); 3868 assert(ret == 0); 3869 return 0; 3870 error: 3871 rxq_cleanup(&tmpl); 3872 assert(ret > 0); 3873 return ret; 3874 } 3875 3876 /** 3877 * DPDK callback to configure a RX queue. 3878 * 3879 * @param dev 3880 * Pointer to Ethernet device structure. 3881 * @param idx 3882 * RX queue index. 3883 * @param desc 3884 * Number of descriptors to configure in queue. 3885 * @param socket 3886 * NUMA socket on which memory must be allocated. 3887 * @param[in] conf 3888 * Thresholds parameters. 3889 * @param mp 3890 * Memory pool for buffer allocations. 3891 * 3892 * @return 3893 * 0 on success, negative errno value on failure. 3894 */ 3895 static int 3896 mlx4_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc, 3897 unsigned int socket, const struct rte_eth_rxconf *conf, 3898 struct rte_mempool *mp) 3899 { 3900 struct priv *priv = dev->data->dev_private; 3901 struct rxq *rxq = (*priv->rxqs)[idx]; 3902 int ret; 3903 3904 if (mlx4_is_secondary()) 3905 return -E_RTE_SECONDARY; 3906 priv_lock(priv); 3907 DEBUG("%p: configuring queue %u for %u descriptors", 3908 (void *)dev, idx, desc); 3909 if (idx >= priv->rxqs_n) { 3910 ERROR("%p: queue index out of range (%u >= %u)", 3911 (void *)dev, idx, priv->rxqs_n); 3912 priv_unlock(priv); 3913 return -EOVERFLOW; 3914 } 3915 if (rxq != NULL) { 3916 DEBUG("%p: reusing already allocated queue index %u (%p)", 3917 (void *)dev, idx, (void *)rxq); 3918 if (priv->started) { 3919 priv_unlock(priv); 3920 return -EEXIST; 3921 } 3922 (*priv->rxqs)[idx] = NULL; 3923 rxq_cleanup(rxq); 3924 } else { 3925 rxq = rte_calloc_socket("RXQ", 1, sizeof(*rxq), 0, socket); 3926 if (rxq == NULL) { 3927 ERROR("%p: unable to allocate queue index %u", 3928 (void *)dev, idx); 3929 priv_unlock(priv); 3930 return -ENOMEM; 3931 } 3932 } 3933 ret = rxq_setup(dev, rxq, desc, socket, conf, mp); 3934 if (ret) 3935 rte_free(rxq); 3936 else { 3937 rxq->stats.idx = idx; 3938 DEBUG("%p: adding RX queue %p to list", 3939 (void *)dev, (void *)rxq); 3940 (*priv->rxqs)[idx] = rxq; 3941 /* Update receive callback. */ 3942 if (rxq->sp) 3943 dev->rx_pkt_burst = mlx4_rx_burst_sp; 3944 else 3945 dev->rx_pkt_burst = mlx4_rx_burst; 3946 } 3947 priv_unlock(priv); 3948 return -ret; 3949 } 3950 3951 /** 3952 * DPDK callback to release a RX queue. 3953 * 3954 * @param dpdk_rxq 3955 * Generic RX queue pointer. 
3956 */ 3957 static void 3958 mlx4_rx_queue_release(void *dpdk_rxq) 3959 { 3960 struct rxq *rxq = (struct rxq *)dpdk_rxq; 3961 struct priv *priv; 3962 unsigned int i; 3963 3964 if (mlx4_is_secondary()) 3965 return; 3966 if (rxq == NULL) 3967 return; 3968 priv = rxq->priv; 3969 priv_lock(priv); 3970 assert(rxq != &priv->rxq_parent); 3971 for (i = 0; (i != priv->rxqs_n); ++i) 3972 if ((*priv->rxqs)[i] == rxq) { 3973 DEBUG("%p: removing RX queue %p from list", 3974 (void *)priv->dev, (void *)rxq); 3975 (*priv->rxqs)[i] = NULL; 3976 break; 3977 } 3978 rxq_cleanup(rxq); 3979 rte_free(rxq); 3980 priv_unlock(priv); 3981 } 3982 3983 static void 3984 priv_dev_interrupt_handler_install(struct priv *, struct rte_eth_dev *); 3985 3986 /** 3987 * DPDK callback to start the device. 3988 * 3989 * Simulate device start by attaching all configured flows. 3990 * 3991 * @param dev 3992 * Pointer to Ethernet device structure. 3993 * 3994 * @return 3995 * 0 on success, negative errno value on failure. 3996 */ 3997 static int 3998 mlx4_dev_start(struct rte_eth_dev *dev) 3999 { 4000 struct priv *priv = dev->data->dev_private; 4001 unsigned int i = 0; 4002 unsigned int r; 4003 struct rxq *rxq; 4004 4005 if (mlx4_is_secondary()) 4006 return -E_RTE_SECONDARY; 4007 priv_lock(priv); 4008 if (priv->started) { 4009 priv_unlock(priv); 4010 return 0; 4011 } 4012 DEBUG("%p: attaching configured flows to all RX queues", (void *)dev); 4013 priv->started = 1; 4014 if (priv->rss) { 4015 rxq = &priv->rxq_parent; 4016 r = 1; 4017 } else { 4018 rxq = (*priv->rxqs)[0]; 4019 r = priv->rxqs_n; 4020 } 4021 /* Iterate only once when RSS is enabled. */ 4022 do { 4023 int ret; 4024 4025 /* Ignore nonexistent RX queues. */ 4026 if (rxq == NULL) 4027 continue; 4028 ret = rxq_mac_addrs_add(rxq); 4029 if (!ret && priv->promisc) 4030 ret = rxq_promiscuous_enable(rxq); 4031 if (!ret && priv->allmulti) 4032 ret = rxq_allmulticast_enable(rxq); 4033 if (!ret) 4034 continue; 4035 WARN("%p: QP flow attachment failed: %s", 4036 (void *)dev, strerror(ret)); 4037 /* Rollback. */ 4038 while (i != 0) { 4039 rxq = (*priv->rxqs)[--i]; 4040 if (rxq != NULL) { 4041 rxq_allmulticast_disable(rxq); 4042 rxq_promiscuous_disable(rxq); 4043 rxq_mac_addrs_del(rxq); 4044 } 4045 } 4046 priv->started = 0; 4047 priv_unlock(priv); 4048 return -ret; 4049 } while ((--r) && ((rxq = (*priv->rxqs)[++i]), i)); 4050 priv_dev_interrupt_handler_install(priv, dev); 4051 priv_unlock(priv); 4052 return 0; 4053 } 4054 4055 /** 4056 * DPDK callback to stop the device. 4057 * 4058 * Simulate device stop by detaching all configured flows. 4059 * 4060 * @param dev 4061 * Pointer to Ethernet device structure. 4062 */ 4063 static void 4064 mlx4_dev_stop(struct rte_eth_dev *dev) 4065 { 4066 struct priv *priv = dev->data->dev_private; 4067 unsigned int i = 0; 4068 unsigned int r; 4069 struct rxq *rxq; 4070 4071 if (mlx4_is_secondary()) 4072 return; 4073 priv_lock(priv); 4074 if (!priv->started) { 4075 priv_unlock(priv); 4076 return; 4077 } 4078 DEBUG("%p: detaching flows from all RX queues", (void *)dev); 4079 priv->started = 0; 4080 if (priv->rss) { 4081 rxq = &priv->rxq_parent; 4082 r = 1; 4083 } else { 4084 rxq = (*priv->rxqs)[0]; 4085 r = priv->rxqs_n; 4086 } 4087 /* Iterate only once when RSS is enabled. */ 4088 do { 4089 /* Ignore nonexistent RX queues. 
*/ 4090 if (rxq == NULL) 4091 continue; 4092 rxq_allmulticast_disable(rxq); 4093 rxq_promiscuous_disable(rxq); 4094 rxq_mac_addrs_del(rxq); 4095 } while ((--r) && ((rxq = (*priv->rxqs)[++i]), i)); 4096 priv_unlock(priv); 4097 } 4098 4099 /** 4100 * Dummy DPDK callback for TX. 4101 * 4102 * This function is used to temporarily replace the real callback during 4103 * unsafe control operations on the queue, or in case of error. 4104 * 4105 * @param dpdk_txq 4106 * Generic pointer to TX queue structure. 4107 * @param[in] pkts 4108 * Packets to transmit. 4109 * @param pkts_n 4110 * Number of packets in array. 4111 * 4112 * @return 4113 * Number of packets successfully transmitted (<= pkts_n). 4114 */ 4115 static uint16_t 4116 removed_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) 4117 { 4118 (void)dpdk_txq; 4119 (void)pkts; 4120 (void)pkts_n; 4121 return 0; 4122 } 4123 4124 /** 4125 * Dummy DPDK callback for RX. 4126 * 4127 * This function is used to temporarily replace the real callback during 4128 * unsafe control operations on the queue, or in case of error. 4129 * 4130 * @param dpdk_rxq 4131 * Generic pointer to RX queue structure. 4132 * @param[out] pkts 4133 * Array to store received packets. 4134 * @param pkts_n 4135 * Maximum number of packets in array. 4136 * 4137 * @return 4138 * Number of packets successfully received (<= pkts_n). 4139 */ 4140 static uint16_t 4141 removed_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) 4142 { 4143 (void)dpdk_rxq; 4144 (void)pkts; 4145 (void)pkts_n; 4146 return 0; 4147 } 4148 4149 static void 4150 priv_dev_interrupt_handler_uninstall(struct priv *, struct rte_eth_dev *); 4151 4152 /** 4153 * DPDK callback to close the device. 4154 * 4155 * Destroy all queues and objects, free memory. 4156 * 4157 * @param dev 4158 * Pointer to Ethernet device structure. 4159 */ 4160 static void 4161 mlx4_dev_close(struct rte_eth_dev *dev) 4162 { 4163 struct priv *priv = mlx4_get_priv(dev); 4164 void *tmp; 4165 unsigned int i; 4166 4167 if (priv == NULL) 4168 return; 4169 priv_lock(priv); 4170 DEBUG("%p: closing device \"%s\"", 4171 (void *)dev, 4172 ((priv->ctx != NULL) ? priv->ctx->device->name : "")); 4173 /* Prevent crashes when queues are still in use. This is unfortunately 4174 * still required for DPDK 1.3 because some programs (such as testpmd) 4175 * never release them before closing the device. */ 4176 dev->rx_pkt_burst = removed_rx_burst; 4177 dev->tx_pkt_burst = removed_tx_burst; 4178 if (priv->rxqs != NULL) { 4179 /* XXX race condition if mlx4_rx_burst() is still running. */ 4180 usleep(1000); 4181 for (i = 0; (i != priv->rxqs_n); ++i) { 4182 tmp = (*priv->rxqs)[i]; 4183 if (tmp == NULL) 4184 continue; 4185 (*priv->rxqs)[i] = NULL; 4186 rxq_cleanup(tmp); 4187 rte_free(tmp); 4188 } 4189 priv->rxqs_n = 0; 4190 priv->rxqs = NULL; 4191 } 4192 if (priv->txqs != NULL) { 4193 /* XXX race condition if mlx4_tx_burst() is still running. 
*/ 4194 usleep(1000); 4195 for (i = 0; (i != priv->txqs_n); ++i) { 4196 tmp = (*priv->txqs)[i]; 4197 if (tmp == NULL) 4198 continue; 4199 (*priv->txqs)[i] = NULL; 4200 txq_cleanup(tmp); 4201 rte_free(tmp); 4202 } 4203 priv->txqs_n = 0; 4204 priv->txqs = NULL; 4205 } 4206 if (priv->rss) 4207 rxq_cleanup(&priv->rxq_parent); 4208 if (priv->pd != NULL) { 4209 assert(priv->ctx != NULL); 4210 claim_zero(ibv_dealloc_pd(priv->pd)); 4211 claim_zero(ibv_close_device(priv->ctx)); 4212 } else 4213 assert(priv->ctx == NULL); 4214 priv_dev_interrupt_handler_uninstall(priv, dev); 4215 priv_unlock(priv); 4216 memset(priv, 0, sizeof(*priv)); 4217 } 4218 4219 /** 4220 * DPDK callback to get information about the device. 4221 * 4222 * @param dev 4223 * Pointer to Ethernet device structure. 4224 * @param[out] info 4225 * Info structure output buffer. 4226 */ 4227 static void 4228 mlx4_dev_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *info) 4229 { 4230 struct priv *priv = mlx4_get_priv(dev); 4231 unsigned int max; 4232 char ifname[IF_NAMESIZE]; 4233 4234 if (priv == NULL) 4235 return; 4236 priv_lock(priv); 4237 /* FIXME: we should ask the device for these values. */ 4238 info->min_rx_bufsize = 32; 4239 info->max_rx_pktlen = 65536; 4240 /* 4241 * Since we need one CQ per QP, the limit is the minimum number 4242 * between the two values. 4243 */ 4244 max = ((priv->device_attr.max_cq > priv->device_attr.max_qp) ? 4245 priv->device_attr.max_qp : priv->device_attr.max_cq); 4246 /* If max >= 65535 then max = 0, max_rx_queues is uint16_t. */ 4247 if (max >= 65535) 4248 max = 65535; 4249 info->max_rx_queues = max; 4250 info->max_tx_queues = max; 4251 /* Last array entry is reserved for broadcast. */ 4252 info->max_mac_addrs = (elemof(priv->mac) - 1); 4253 info->rx_offload_capa = 4254 (priv->hw_csum ? 4255 (DEV_RX_OFFLOAD_IPV4_CKSUM | 4256 DEV_RX_OFFLOAD_UDP_CKSUM | 4257 DEV_RX_OFFLOAD_TCP_CKSUM) : 4258 0); 4259 info->tx_offload_capa = 4260 (priv->hw_csum ? 4261 (DEV_TX_OFFLOAD_IPV4_CKSUM | 4262 DEV_TX_OFFLOAD_UDP_CKSUM | 4263 DEV_TX_OFFLOAD_TCP_CKSUM) : 4264 0); 4265 if (priv_get_ifname(priv, &ifname) == 0) 4266 info->if_index = if_nametoindex(ifname); 4267 priv_unlock(priv); 4268 } 4269 4270 /** 4271 * DPDK callback to get device statistics. 4272 * 4273 * @param dev 4274 * Pointer to Ethernet device structure. 4275 * @param[out] stats 4276 * Stats structure output buffer. 4277 */ 4278 static void 4279 mlx4_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats) 4280 { 4281 struct priv *priv = mlx4_get_priv(dev); 4282 struct rte_eth_stats tmp = {0}; 4283 unsigned int i; 4284 unsigned int idx; 4285 4286 if (priv == NULL) 4287 return; 4288 priv_lock(priv); 4289 /* Add software counters. 
*/ 4290 for (i = 0; (i != priv->rxqs_n); ++i) { 4291 struct rxq *rxq = (*priv->rxqs)[i]; 4292 4293 if (rxq == NULL) 4294 continue; 4295 idx = rxq->stats.idx; 4296 if (idx < RTE_ETHDEV_QUEUE_STAT_CNTRS) { 4297 #ifdef MLX4_PMD_SOFT_COUNTERS 4298 tmp.q_ipackets[idx] += rxq->stats.ipackets; 4299 tmp.q_ibytes[idx] += rxq->stats.ibytes; 4300 #endif 4301 tmp.q_errors[idx] += (rxq->stats.idropped + 4302 rxq->stats.rx_nombuf); 4303 } 4304 #ifdef MLX4_PMD_SOFT_COUNTERS 4305 tmp.ipackets += rxq->stats.ipackets; 4306 tmp.ibytes += rxq->stats.ibytes; 4307 #endif 4308 tmp.ierrors += rxq->stats.idropped; 4309 tmp.rx_nombuf += rxq->stats.rx_nombuf; 4310 } 4311 for (i = 0; (i != priv->txqs_n); ++i) { 4312 struct txq *txq = (*priv->txqs)[i]; 4313 4314 if (txq == NULL) 4315 continue; 4316 idx = txq->stats.idx; 4317 if (idx < RTE_ETHDEV_QUEUE_STAT_CNTRS) { 4318 #ifdef MLX4_PMD_SOFT_COUNTERS 4319 tmp.q_opackets[idx] += txq->stats.opackets; 4320 tmp.q_obytes[idx] += txq->stats.obytes; 4321 #endif 4322 tmp.q_errors[idx] += txq->stats.odropped; 4323 } 4324 #ifdef MLX4_PMD_SOFT_COUNTERS 4325 tmp.opackets += txq->stats.opackets; 4326 tmp.obytes += txq->stats.obytes; 4327 #endif 4328 tmp.oerrors += txq->stats.odropped; 4329 } 4330 #ifndef MLX4_PMD_SOFT_COUNTERS 4331 /* FIXME: retrieve and add hardware counters. */ 4332 #endif 4333 *stats = tmp; 4334 priv_unlock(priv); 4335 } 4336 4337 /** 4338 * DPDK callback to clear device statistics. 4339 * 4340 * @param dev 4341 * Pointer to Ethernet device structure. 4342 */ 4343 static void 4344 mlx4_stats_reset(struct rte_eth_dev *dev) 4345 { 4346 struct priv *priv = mlx4_get_priv(dev); 4347 unsigned int i; 4348 unsigned int idx; 4349 4350 if (priv == NULL) 4351 return; 4352 priv_lock(priv); 4353 for (i = 0; (i != priv->rxqs_n); ++i) { 4354 if ((*priv->rxqs)[i] == NULL) 4355 continue; 4356 idx = (*priv->rxqs)[i]->stats.idx; 4357 (*priv->rxqs)[i]->stats = 4358 (struct mlx4_rxq_stats){ .idx = idx }; 4359 } 4360 for (i = 0; (i != priv->txqs_n); ++i) { 4361 if ((*priv->txqs)[i] == NULL) 4362 continue; 4363 idx = (*priv->txqs)[i]->stats.idx; 4364 (*priv->txqs)[i]->stats = 4365 (struct mlx4_txq_stats){ .idx = idx }; 4366 } 4367 #ifndef MLX4_PMD_SOFT_COUNTERS 4368 /* FIXME: reset hardware counters. */ 4369 #endif 4370 priv_unlock(priv); 4371 } 4372 4373 /** 4374 * DPDK callback to remove a MAC address. 4375 * 4376 * @param dev 4377 * Pointer to Ethernet device structure. 4378 * @param index 4379 * MAC address index. 4380 */ 4381 static void 4382 mlx4_mac_addr_remove(struct rte_eth_dev *dev, uint32_t index) 4383 { 4384 struct priv *priv = dev->data->dev_private; 4385 4386 if (mlx4_is_secondary()) 4387 return; 4388 priv_lock(priv); 4389 DEBUG("%p: removing MAC address from index %" PRIu32, 4390 (void *)dev, index); 4391 /* Last array entry is reserved for broadcast. */ 4392 if (index >= (elemof(priv->mac) - 1)) 4393 goto end; 4394 priv_mac_addr_del(priv, index); 4395 end: 4396 priv_unlock(priv); 4397 } 4398 4399 /** 4400 * DPDK callback to add a MAC address. 4401 * 4402 * @param dev 4403 * Pointer to Ethernet device structure. 4404 * @param mac_addr 4405 * MAC address to register. 4406 * @param index 4407 * MAC address index. 4408 * @param vmdq 4409 * VMDq pool index to associate address with (ignored). 
4410 */ 4411 static void 4412 mlx4_mac_addr_add(struct rte_eth_dev *dev, struct ether_addr *mac_addr, 4413 uint32_t index, uint32_t vmdq) 4414 { 4415 struct priv *priv = dev->data->dev_private; 4416 4417 if (mlx4_is_secondary()) 4418 return; 4419 (void)vmdq; 4420 priv_lock(priv); 4421 DEBUG("%p: adding MAC address at index %" PRIu32, 4422 (void *)dev, index); 4423 /* Last array entry is reserved for broadcast. */ 4424 if (index >= (elemof(priv->mac) - 1)) 4425 goto end; 4426 priv_mac_addr_add(priv, index, 4427 (const uint8_t (*)[ETHER_ADDR_LEN]) 4428 mac_addr->addr_bytes); 4429 end: 4430 priv_unlock(priv); 4431 } 4432 4433 /** 4434 * DPDK callback to enable promiscuous mode. 4435 * 4436 * @param dev 4437 * Pointer to Ethernet device structure. 4438 */ 4439 static void 4440 mlx4_promiscuous_enable(struct rte_eth_dev *dev) 4441 { 4442 struct priv *priv = dev->data->dev_private; 4443 unsigned int i; 4444 int ret; 4445 4446 if (mlx4_is_secondary()) 4447 return; 4448 priv_lock(priv); 4449 if (priv->promisc) { 4450 priv_unlock(priv); 4451 return; 4452 } 4453 /* If device isn't started, this is all we need to do. */ 4454 if (!priv->started) 4455 goto end; 4456 if (priv->rss) { 4457 ret = rxq_promiscuous_enable(&priv->rxq_parent); 4458 if (ret) { 4459 priv_unlock(priv); 4460 return; 4461 } 4462 goto end; 4463 } 4464 for (i = 0; (i != priv->rxqs_n); ++i) { 4465 if ((*priv->rxqs)[i] == NULL) 4466 continue; 4467 ret = rxq_promiscuous_enable((*priv->rxqs)[i]); 4468 if (!ret) 4469 continue; 4470 /* Failure, rollback. */ 4471 while (i != 0) 4472 if ((*priv->rxqs)[--i] != NULL) 4473 rxq_promiscuous_disable((*priv->rxqs)[i]); 4474 priv_unlock(priv); 4475 return; 4476 } 4477 end: 4478 priv->promisc = 1; 4479 priv_unlock(priv); 4480 } 4481 4482 /** 4483 * DPDK callback to disable promiscuous mode. 4484 * 4485 * @param dev 4486 * Pointer to Ethernet device structure. 4487 */ 4488 static void 4489 mlx4_promiscuous_disable(struct rte_eth_dev *dev) 4490 { 4491 struct priv *priv = dev->data->dev_private; 4492 unsigned int i; 4493 4494 if (mlx4_is_secondary()) 4495 return; 4496 priv_lock(priv); 4497 if (!priv->promisc) { 4498 priv_unlock(priv); 4499 return; 4500 } 4501 if (priv->rss) { 4502 rxq_promiscuous_disable(&priv->rxq_parent); 4503 goto end; 4504 } 4505 for (i = 0; (i != priv->rxqs_n); ++i) 4506 if ((*priv->rxqs)[i] != NULL) 4507 rxq_promiscuous_disable((*priv->rxqs)[i]); 4508 end: 4509 priv->promisc = 0; 4510 priv_unlock(priv); 4511 } 4512 4513 /** 4514 * DPDK callback to enable allmulti mode. 4515 * 4516 * @param dev 4517 * Pointer to Ethernet device structure. 4518 */ 4519 static void 4520 mlx4_allmulticast_enable(struct rte_eth_dev *dev) 4521 { 4522 struct priv *priv = dev->data->dev_private; 4523 unsigned int i; 4524 int ret; 4525 4526 if (mlx4_is_secondary()) 4527 return; 4528 priv_lock(priv); 4529 if (priv->allmulti) { 4530 priv_unlock(priv); 4531 return; 4532 } 4533 /* If device isn't started, this is all we need to do. */ 4534 if (!priv->started) 4535 goto end; 4536 if (priv->rss) { 4537 ret = rxq_allmulticast_enable(&priv->rxq_parent); 4538 if (ret) { 4539 priv_unlock(priv); 4540 return; 4541 } 4542 goto end; 4543 } 4544 for (i = 0; (i != priv->rxqs_n); ++i) { 4545 if ((*priv->rxqs)[i] == NULL) 4546 continue; 4547 ret = rxq_allmulticast_enable((*priv->rxqs)[i]); 4548 if (!ret) 4549 continue; 4550 /* Failure, rollback. 
*/ 4551 while (i != 0) 4552 if ((*priv->rxqs)[--i] != NULL) 4553 rxq_allmulticast_disable((*priv->rxqs)[i]); 4554 priv_unlock(priv); 4555 return; 4556 } 4557 end: 4558 priv->allmulti = 1; 4559 priv_unlock(priv); 4560 } 4561 4562 /** 4563 * DPDK callback to disable allmulti mode. 4564 * 4565 * @param dev 4566 * Pointer to Ethernet device structure. 4567 */ 4568 static void 4569 mlx4_allmulticast_disable(struct rte_eth_dev *dev) 4570 { 4571 struct priv *priv = dev->data->dev_private; 4572 unsigned int i; 4573 4574 if (mlx4_is_secondary()) 4575 return; 4576 priv_lock(priv); 4577 if (!priv->allmulti) { 4578 priv_unlock(priv); 4579 return; 4580 } 4581 if (priv->rss) { 4582 rxq_allmulticast_disable(&priv->rxq_parent); 4583 goto end; 4584 } 4585 for (i = 0; (i != priv->rxqs_n); ++i) 4586 if ((*priv->rxqs)[i] != NULL) 4587 rxq_allmulticast_disable((*priv->rxqs)[i]); 4588 end: 4589 priv->allmulti = 0; 4590 priv_unlock(priv); 4591 } 4592 4593 /** 4594 * DPDK callback to retrieve physical link information (unlocked version). 4595 * 4596 * @param dev 4597 * Pointer to Ethernet device structure. 4598 * @param wait_to_complete 4599 * Wait for request completion (ignored). 4600 */ 4601 static int 4602 mlx4_link_update_unlocked(struct rte_eth_dev *dev, int wait_to_complete) 4603 { 4604 struct priv *priv = mlx4_get_priv(dev); 4605 struct ethtool_cmd edata = { 4606 .cmd = ETHTOOL_GSET 4607 }; 4608 struct ifreq ifr; 4609 struct rte_eth_link dev_link; 4610 int link_speed = 0; 4611 4612 if (priv == NULL) 4613 return -EINVAL; 4614 (void)wait_to_complete; 4615 if (priv_ifreq(priv, SIOCGIFFLAGS, &ifr)) { 4616 WARN("ioctl(SIOCGIFFLAGS) failed: %s", strerror(errno)); 4617 return -1; 4618 } 4619 memset(&dev_link, 0, sizeof(dev_link)); 4620 dev_link.link_status = ((ifr.ifr_flags & IFF_UP) && 4621 (ifr.ifr_flags & IFF_RUNNING)); 4622 ifr.ifr_data = &edata; 4623 if (priv_ifreq(priv, SIOCETHTOOL, &ifr)) { 4624 WARN("ioctl(SIOCETHTOOL, ETHTOOL_GSET) failed: %s", 4625 strerror(errno)); 4626 return -1; 4627 } 4628 link_speed = ethtool_cmd_speed(&edata); 4629 if (link_speed == -1) 4630 dev_link.link_speed = 0; 4631 else 4632 dev_link.link_speed = link_speed; 4633 dev_link.link_duplex = ((edata.duplex == DUPLEX_HALF) ? 4634 ETH_LINK_HALF_DUPLEX : ETH_LINK_FULL_DUPLEX); 4635 if (memcmp(&dev_link, &dev->data->dev_link, sizeof(dev_link))) { 4636 /* Link status changed. */ 4637 dev->data->dev_link = dev_link; 4638 return 0; 4639 } 4640 /* Link status is still the same. */ 4641 return -1; 4642 } 4643 4644 /** 4645 * DPDK callback to retrieve physical link information. 4646 * 4647 * @param dev 4648 * Pointer to Ethernet device structure. 4649 * @param wait_to_complete 4650 * Wait for request completion (ignored). 4651 */ 4652 static int 4653 mlx4_link_update(struct rte_eth_dev *dev, int wait_to_complete) 4654 { 4655 struct priv *priv = mlx4_get_priv(dev); 4656 int ret; 4657 4658 if (priv == NULL) 4659 return -EINVAL; 4660 priv_lock(priv); 4661 ret = mlx4_link_update_unlocked(dev, wait_to_complete); 4662 priv_unlock(priv); 4663 return ret; 4664 } 4665 4666 /** 4667 * DPDK callback to change the MTU. 4668 * 4669 * Setting the MTU affects hardware MRU (packets larger than the MTU cannot be 4670 * received). Use this as a hint to enable/disable scattered packets support 4671 * and improve performance when not needed. 4672 * Since failure is not an option, reconfiguring queues on the fly is not 4673 * recommended. 4674 * 4675 * @param dev 4676 * Pointer to Ethernet device structure. 4677 * @param in_mtu 4678 * New MTU. 
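 *
 *   For instance, assuming mbufs whose data room (rxq->mb_len) is 2048
 *   bytes with the default 128-byte RTE_PKTMBUF_HEADROOM, the maximum
 *   frame length computed below is MTU + 18 (Ethernet header plus VLAN
 *   tag room), so any MTU above 1902 bytes switches the affected queues
 *   to the scattered mlx4_rx_burst_sp() handler.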
4679 * 4680 * @return 4681 * 0 on success, negative errno value on failure. 4682 */ 4683 static int 4684 mlx4_dev_set_mtu(struct rte_eth_dev *dev, uint16_t mtu) 4685 { 4686 struct priv *priv = dev->data->dev_private; 4687 int ret = 0; 4688 unsigned int i; 4689 uint16_t (*rx_func)(void *, struct rte_mbuf **, uint16_t) = 4690 mlx4_rx_burst; 4691 4692 if (mlx4_is_secondary()) 4693 return -E_RTE_SECONDARY; 4694 priv_lock(priv); 4695 /* Set kernel interface MTU first. */ 4696 if (priv_set_mtu(priv, mtu)) { 4697 ret = errno; 4698 WARN("cannot set port %u MTU to %u: %s", priv->port, mtu, 4699 strerror(ret)); 4700 goto out; 4701 } else 4702 DEBUG("adapter port %u MTU set to %u", priv->port, mtu); 4703 priv->mtu = mtu; 4704 /* Temporarily replace RX handler with a fake one, assuming it has not 4705 * been copied elsewhere. */ 4706 dev->rx_pkt_burst = removed_rx_burst; 4707 /* Make sure everyone has left mlx4_rx_burst() and uses 4708 * removed_rx_burst() instead. */ 4709 rte_wmb(); 4710 usleep(1000); 4711 /* Reconfigure each RX queue. */ 4712 for (i = 0; (i != priv->rxqs_n); ++i) { 4713 struct rxq *rxq = (*priv->rxqs)[i]; 4714 unsigned int max_frame_len; 4715 int sp; 4716 4717 if (rxq == NULL) 4718 continue; 4719 /* Calculate new maximum frame length according to MTU and 4720 * toggle scattered support (sp) if necessary. */ 4721 max_frame_len = (priv->mtu + ETHER_HDR_LEN + 4722 (ETHER_MAX_VLAN_FRAME_LEN - ETHER_MAX_LEN)); 4723 sp = (max_frame_len > (rxq->mb_len - RTE_PKTMBUF_HEADROOM)); 4724 /* Provide new values to rxq_setup(). */ 4725 dev->data->dev_conf.rxmode.jumbo_frame = sp; 4726 dev->data->dev_conf.rxmode.max_rx_pkt_len = max_frame_len; 4727 ret = rxq_rehash(dev, rxq); 4728 if (ret) { 4729 /* Force SP RX if that queue requires it and abort. */ 4730 if (rxq->sp) 4731 rx_func = mlx4_rx_burst_sp; 4732 break; 4733 } 4734 /* Reenable non-RSS queue attributes. No need to check 4735 * for errors at this stage. */ 4736 if (!priv->rss) { 4737 rxq_mac_addrs_add(rxq); 4738 if (priv->promisc) 4739 rxq_promiscuous_enable(rxq); 4740 if (priv->allmulti) 4741 rxq_allmulticast_enable(rxq); 4742 } 4743 /* Scattered burst function takes priority. */ 4744 if (rxq->sp) 4745 rx_func = mlx4_rx_burst_sp; 4746 } 4747 /* Burst functions can now be called again. */ 4748 rte_wmb(); 4749 dev->rx_pkt_burst = rx_func; 4750 out: 4751 priv_unlock(priv); 4752 assert(ret >= 0); 4753 return -ret; 4754 } 4755 4756 /** 4757 * DPDK callback to get flow control status. 4758 * 4759 * @param dev 4760 * Pointer to Ethernet device structure. 4761 * @param[out] fc_conf 4762 * Flow control output buffer. 4763 * 4764 * @return 4765 * 0 on success, negative errno value on failure. 
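 *
 * Pause settings are read from the kernel netdevice through
 * ioctl(SIOCETHTOOL, ETHTOOL_GPAUSEPARAM) and mapped to an RTE_FC_* mode:
 * both directions to RTE_FC_FULL, RX only to RTE_FC_RX_PAUSE, TX only to
 * RTE_FC_TX_PAUSE, none to RTE_FC_NONE.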
 */
static int
mlx4_dev_get_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf)
{
	struct priv *priv = dev->data->dev_private;
	struct ifreq ifr;
	struct ethtool_pauseparam ethpause = {
		.cmd = ETHTOOL_GPAUSEPARAM
	};
	int ret;

	if (mlx4_is_secondary())
		return -E_RTE_SECONDARY;
	ifr.ifr_data = &ethpause;
	priv_lock(priv);
	if (priv_ifreq(priv, SIOCETHTOOL, &ifr)) {
		ret = errno;
		WARN("ioctl(SIOCETHTOOL, ETHTOOL_GPAUSEPARAM)"
		     " failed: %s",
		     strerror(ret));
		goto out;
	}

	fc_conf->autoneg = ethpause.autoneg;
	if (ethpause.rx_pause && ethpause.tx_pause)
		fc_conf->mode = RTE_FC_FULL;
	else if (ethpause.rx_pause)
		fc_conf->mode = RTE_FC_RX_PAUSE;
	else if (ethpause.tx_pause)
		fc_conf->mode = RTE_FC_TX_PAUSE;
	else
		fc_conf->mode = RTE_FC_NONE;
	ret = 0;

out:
	priv_unlock(priv);
	assert(ret >= 0);
	return -ret;
}

/**
 * DPDK callback to modify flow control parameters.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param[in] fc_conf
 *   Flow control parameters.
 *
 * @return
 *   0 on success, negative errno value on failure.
 */
static int
mlx4_dev_set_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf)
{
	struct priv *priv = dev->data->dev_private;
	struct ifreq ifr;
	struct ethtool_pauseparam ethpause = {
		.cmd = ETHTOOL_SPAUSEPARAM
	};
	int ret;

	if (mlx4_is_secondary())
		return -E_RTE_SECONDARY;
	ifr.ifr_data = &ethpause;
	ethpause.autoneg = fc_conf->autoneg;
	if (((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL) ||
	    (fc_conf->mode & RTE_FC_RX_PAUSE))
		ethpause.rx_pause = 1;
	else
		ethpause.rx_pause = 0;

	if (((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL) ||
	    (fc_conf->mode & RTE_FC_TX_PAUSE))
		ethpause.tx_pause = 1;
	else
		ethpause.tx_pause = 0;

	priv_lock(priv);
	if (priv_ifreq(priv, SIOCETHTOOL, &ifr)) {
		ret = errno;
		WARN("ioctl(SIOCETHTOOL, ETHTOOL_SPAUSEPARAM)"
		     " failed: %s",
		     strerror(ret));
		goto out;
	}
	ret = 0;

out:
	priv_unlock(priv);
	assert(ret >= 0);
	return -ret;
}

/**
 * Configure a VLAN filter.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param vlan_id
 *   VLAN ID to filter.
 * @param on
 *   Toggle filter.
 *
 * @return
 *   0 on success, errno value on failure.
 */
static int
vlan_filter_set(struct rte_eth_dev *dev, uint16_t vlan_id, int on)
{
	struct priv *priv = dev->data->dev_private;
	unsigned int i;
	unsigned int j = -1;

	DEBUG("%p: %s VLAN filter ID %" PRIu16,
	      (void *)dev, (on ? "enable" : "disable"), vlan_id);
	for (i = 0; (i != elemof(priv->vlan_filter)); ++i) {
		if (!priv->vlan_filter[i].enabled) {
			/* Unused index, remember it. */
			j = i;
			continue;
		}
		if (priv->vlan_filter[i].id != vlan_id)
			continue;
		/* This VLAN ID is already known, use its index. */
		j = i;
		break;
	}
	/* Check if there's room for another VLAN filter. */
	if (j == (unsigned int)-1)
		return ENOMEM;
	/*
	 * VLAN filters apply to all configured MAC addresses, flow
	 * specifications must be reconfigured accordingly.
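	 * Toggling a filter therefore deletes the MAC flows attached to
	 * every RX queue (or to the RSS parent queue), updates the filter
	 * state and re-attaches the flows if the port is started.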
4899 */ 4900 priv->vlan_filter[j].id = vlan_id; 4901 if ((on) && (!priv->vlan_filter[j].enabled)) { 4902 /* 4903 * Filter is disabled, enable it. 4904 * Rehashing flows in all RX queues is necessary. 4905 */ 4906 if (priv->rss) 4907 rxq_mac_addrs_del(&priv->rxq_parent); 4908 else 4909 for (i = 0; (i != priv->rxqs_n); ++i) 4910 if ((*priv->rxqs)[i] != NULL) 4911 rxq_mac_addrs_del((*priv->rxqs)[i]); 4912 priv->vlan_filter[j].enabled = 1; 4913 if (priv->started) { 4914 if (priv->rss) 4915 rxq_mac_addrs_add(&priv->rxq_parent); 4916 else 4917 for (i = 0; (i != priv->rxqs_n); ++i) { 4918 if ((*priv->rxqs)[i] == NULL) 4919 continue; 4920 rxq_mac_addrs_add((*priv->rxqs)[i]); 4921 } 4922 } 4923 } else if ((!on) && (priv->vlan_filter[j].enabled)) { 4924 /* 4925 * Filter is enabled, disable it. 4926 * Rehashing flows in all RX queues is necessary. 4927 */ 4928 if (priv->rss) 4929 rxq_mac_addrs_del(&priv->rxq_parent); 4930 else 4931 for (i = 0; (i != priv->rxqs_n); ++i) 4932 if ((*priv->rxqs)[i] != NULL) 4933 rxq_mac_addrs_del((*priv->rxqs)[i]); 4934 priv->vlan_filter[j].enabled = 0; 4935 if (priv->started) { 4936 if (priv->rss) 4937 rxq_mac_addrs_add(&priv->rxq_parent); 4938 else 4939 for (i = 0; (i != priv->rxqs_n); ++i) { 4940 if ((*priv->rxqs)[i] == NULL) 4941 continue; 4942 rxq_mac_addrs_add((*priv->rxqs)[i]); 4943 } 4944 } 4945 } 4946 return 0; 4947 } 4948 4949 /** 4950 * DPDK callback to configure a VLAN filter. 4951 * 4952 * @param dev 4953 * Pointer to Ethernet device structure. 4954 * @param vlan_id 4955 * VLAN ID to filter. 4956 * @param on 4957 * Toggle filter. 4958 * 4959 * @return 4960 * 0 on success, negative errno value on failure. 4961 */ 4962 static int 4963 mlx4_vlan_filter_set(struct rte_eth_dev *dev, uint16_t vlan_id, int on) 4964 { 4965 struct priv *priv = dev->data->dev_private; 4966 int ret; 4967 4968 if (mlx4_is_secondary()) 4969 return -E_RTE_SECONDARY; 4970 priv_lock(priv); 4971 ret = vlan_filter_set(dev, vlan_id, on); 4972 priv_unlock(priv); 4973 assert(ret >= 0); 4974 return -ret; 4975 } 4976 4977 static const struct eth_dev_ops mlx4_dev_ops = { 4978 .dev_configure = mlx4_dev_configure, 4979 .dev_start = mlx4_dev_start, 4980 .dev_stop = mlx4_dev_stop, 4981 .dev_close = mlx4_dev_close, 4982 .promiscuous_enable = mlx4_promiscuous_enable, 4983 .promiscuous_disable = mlx4_promiscuous_disable, 4984 .allmulticast_enable = mlx4_allmulticast_enable, 4985 .allmulticast_disable = mlx4_allmulticast_disable, 4986 .link_update = mlx4_link_update, 4987 .stats_get = mlx4_stats_get, 4988 .stats_reset = mlx4_stats_reset, 4989 .queue_stats_mapping_set = NULL, 4990 .dev_infos_get = mlx4_dev_infos_get, 4991 .vlan_filter_set = mlx4_vlan_filter_set, 4992 .vlan_tpid_set = NULL, 4993 .vlan_strip_queue_set = NULL, 4994 .vlan_offload_set = NULL, 4995 .rx_queue_setup = mlx4_rx_queue_setup, 4996 .tx_queue_setup = mlx4_tx_queue_setup, 4997 .rx_queue_release = mlx4_rx_queue_release, 4998 .tx_queue_release = mlx4_tx_queue_release, 4999 .dev_led_on = NULL, 5000 .dev_led_off = NULL, 5001 .flow_ctrl_get = mlx4_dev_get_flow_ctrl, 5002 .flow_ctrl_set = mlx4_dev_set_flow_ctrl, 5003 .priority_flow_ctrl_set = NULL, 5004 .mac_addr_remove = mlx4_mac_addr_remove, 5005 .mac_addr_add = mlx4_mac_addr_add, 5006 .mtu_set = mlx4_dev_set_mtu, 5007 }; 5008 5009 /** 5010 * Get PCI information from struct ibv_device. 5011 * 5012 * @param device 5013 * Pointer to Ethernet device structure. 5014 * @param[out] pci_addr 5015 * PCI bus address output buffer. 
5016 * 5017 * @return 5018 * 0 on success, -1 on failure and errno is set. 5019 */ 5020 static int 5021 mlx4_ibv_device_to_pci_addr(const struct ibv_device *device, 5022 struct rte_pci_addr *pci_addr) 5023 { 5024 FILE *file; 5025 char line[32]; 5026 MKSTR(path, "%s/device/uevent", device->ibdev_path); 5027 5028 file = fopen(path, "rb"); 5029 if (file == NULL) 5030 return -1; 5031 while (fgets(line, sizeof(line), file) == line) { 5032 size_t len = strlen(line); 5033 int ret; 5034 5035 /* Truncate long lines. */ 5036 if (len == (sizeof(line) - 1)) 5037 while (line[(len - 1)] != '\n') { 5038 ret = fgetc(file); 5039 if (ret == EOF) 5040 break; 5041 line[(len - 1)] = ret; 5042 } 5043 /* Extract information. */ 5044 if (sscanf(line, 5045 "PCI_SLOT_NAME=" 5046 "%" SCNx16 ":%" SCNx8 ":%" SCNx8 ".%" SCNx8 "\n", 5047 &pci_addr->domain, 5048 &pci_addr->bus, 5049 &pci_addr->devid, 5050 &pci_addr->function) == 4) { 5051 ret = 0; 5052 break; 5053 } 5054 } 5055 fclose(file); 5056 return 0; 5057 } 5058 5059 /** 5060 * Get MAC address by querying netdevice. 5061 * 5062 * @param[in] priv 5063 * struct priv for the requested device. 5064 * @param[out] mac 5065 * MAC address output buffer. 5066 * 5067 * @return 5068 * 0 on success, -1 on failure and errno is set. 5069 */ 5070 static int 5071 priv_get_mac(struct priv *priv, uint8_t (*mac)[ETHER_ADDR_LEN]) 5072 { 5073 struct ifreq request; 5074 5075 if (priv_ifreq(priv, SIOCGIFHWADDR, &request)) 5076 return -1; 5077 memcpy(mac, request.ifr_hwaddr.sa_data, ETHER_ADDR_LEN); 5078 return 0; 5079 } 5080 5081 /* Support up to 32 adapters. */ 5082 static struct { 5083 struct rte_pci_addr pci_addr; /* associated PCI address */ 5084 uint32_t ports; /* physical ports bitfield. */ 5085 } mlx4_dev[32]; 5086 5087 /** 5088 * Get device index in mlx4_dev[] from PCI bus address. 5089 * 5090 * @param[in] pci_addr 5091 * PCI bus address to look for. 5092 * 5093 * @return 5094 * mlx4_dev[] index on success, -1 on failure. 5095 */ 5096 static int 5097 mlx4_dev_idx(struct rte_pci_addr *pci_addr) 5098 { 5099 unsigned int i; 5100 int ret = -1; 5101 5102 assert(pci_addr != NULL); 5103 for (i = 0; (i != elemof(mlx4_dev)); ++i) { 5104 if ((mlx4_dev[i].pci_addr.domain == pci_addr->domain) && 5105 (mlx4_dev[i].pci_addr.bus == pci_addr->bus) && 5106 (mlx4_dev[i].pci_addr.devid == pci_addr->devid) && 5107 (mlx4_dev[i].pci_addr.function == pci_addr->function)) 5108 return i; 5109 if ((mlx4_dev[i].ports == 0) && (ret == -1)) 5110 ret = i; 5111 } 5112 return ret; 5113 } 5114 5115 /** 5116 * Retrieve integer value from environment variable. 5117 * 5118 * @param[in] name 5119 * Environment variable name. 5120 * 5121 * @return 5122 * Integer value, 0 if the variable is not set. 5123 */ 5124 static int 5125 mlx4_getenv_int(const char *name) 5126 { 5127 const char *val = getenv(name); 5128 5129 if (val == NULL) 5130 return 0; 5131 return atoi(val); 5132 } 5133 5134 static void 5135 mlx4_dev_link_status_handler(void *); 5136 static void 5137 mlx4_dev_interrupt_handler(struct rte_intr_handle *, void *); 5138 5139 /** 5140 * Link status handler. 5141 * 5142 * @param priv 5143 * Pointer to private structure. 5144 * @param dev 5145 * Pointer to the rte_eth_dev structure. 5146 * 5147 * @return 5148 * Nonzero if the callback process can be called immediately. 5149 */ 5150 static int 5151 priv_dev_link_status_handler(struct priv *priv, struct rte_eth_dev *dev) 5152 { 5153 struct ibv_async_event event; 5154 int port_change = 0; 5155 int ret = 0; 5156 5157 /* Read all message and acknowledge them. 
*/ 5158 for (;;) { 5159 if (ibv_get_async_event(priv->ctx, &event)) 5160 break; 5161 5162 if (event.event_type == IBV_EVENT_PORT_ACTIVE || 5163 event.event_type == IBV_EVENT_PORT_ERR) 5164 port_change = 1; 5165 else 5166 DEBUG("event type %d on port %d not handled", 5167 event.event_type, event.element.port_num); 5168 ibv_ack_async_event(&event); 5169 } 5170 5171 if (port_change ^ priv->pending_alarm) { 5172 struct rte_eth_link *link = &dev->data->dev_link; 5173 5174 priv->pending_alarm = 0; 5175 mlx4_link_update_unlocked(dev, 0); 5176 if (((link->link_speed == 0) && link->link_status) || 5177 ((link->link_speed != 0) && !link->link_status)) { 5178 /* Inconsistent status, check again later. */ 5179 priv->pending_alarm = 1; 5180 rte_eal_alarm_set(MLX4_ALARM_TIMEOUT_US, 5181 mlx4_dev_link_status_handler, 5182 dev); 5183 } else 5184 ret = 1; 5185 } 5186 return ret; 5187 } 5188 5189 /** 5190 * Handle delayed link status event. 5191 * 5192 * @param arg 5193 * Registered argument. 5194 */ 5195 static void 5196 mlx4_dev_link_status_handler(void *arg) 5197 { 5198 struct rte_eth_dev *dev = arg; 5199 struct priv *priv = dev->data->dev_private; 5200 int ret; 5201 5202 priv_lock(priv); 5203 assert(priv->pending_alarm == 1); 5204 ret = priv_dev_link_status_handler(priv, dev); 5205 priv_unlock(priv); 5206 if (ret) 5207 _rte_eth_dev_callback_process(dev, RTE_ETH_EVENT_INTR_LSC); 5208 } 5209 5210 /** 5211 * Handle interrupts from the NIC. 5212 * 5213 * @param[in] intr_handle 5214 * Interrupt handler. 5215 * @param cb_arg 5216 * Callback argument. 5217 */ 5218 static void 5219 mlx4_dev_interrupt_handler(struct rte_intr_handle *intr_handle, void *cb_arg) 5220 { 5221 struct rte_eth_dev *dev = cb_arg; 5222 struct priv *priv = dev->data->dev_private; 5223 int ret; 5224 5225 (void)intr_handle; 5226 priv_lock(priv); 5227 ret = priv_dev_link_status_handler(priv, dev); 5228 priv_unlock(priv); 5229 if (ret) 5230 _rte_eth_dev_callback_process(dev, RTE_ETH_EVENT_INTR_LSC); 5231 } 5232 5233 /** 5234 * Uninstall interrupt handler. 5235 * 5236 * @param priv 5237 * Pointer to private structure. 5238 * @param dev 5239 * Pointer to the rte_eth_dev structure. 5240 */ 5241 static void 5242 priv_dev_interrupt_handler_uninstall(struct priv *priv, struct rte_eth_dev *dev) 5243 { 5244 if (!dev->data->dev_conf.intr_conf.lsc) 5245 return; 5246 rte_intr_callback_unregister(&priv->intr_handle, 5247 mlx4_dev_interrupt_handler, 5248 dev); 5249 if (priv->pending_alarm) 5250 rte_eal_alarm_cancel(mlx4_dev_link_status_handler, dev); 5251 priv->pending_alarm = 0; 5252 priv->intr_handle.fd = 0; 5253 priv->intr_handle.type = 0; 5254 } 5255 5256 /** 5257 * Install interrupt handler. 5258 * 5259 * @param priv 5260 * Pointer to private structure. 5261 * @param dev 5262 * Pointer to the rte_eth_dev structure. 
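 *
 * The verbs asynchronous event file descriptor is switched to
 * non-blocking mode and registered with the EAL interrupt thread; if
 * this fails, link state change (LSC) interrupts are disabled for this
 * device.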
5263 */ 5264 static void 5265 priv_dev_interrupt_handler_install(struct priv *priv, struct rte_eth_dev *dev) 5266 { 5267 int rc, flags; 5268 5269 if (!dev->data->dev_conf.intr_conf.lsc) 5270 return; 5271 assert(priv->ctx->async_fd > 0); 5272 flags = fcntl(priv->ctx->async_fd, F_GETFL); 5273 rc = fcntl(priv->ctx->async_fd, F_SETFL, flags | O_NONBLOCK); 5274 if (rc < 0) { 5275 INFO("failed to change file descriptor async event queue"); 5276 dev->data->dev_conf.intr_conf.lsc = 0; 5277 } else { 5278 priv->intr_handle.fd = priv->ctx->async_fd; 5279 priv->intr_handle.type = RTE_INTR_HANDLE_EXT; 5280 rte_intr_callback_register(&priv->intr_handle, 5281 mlx4_dev_interrupt_handler, 5282 dev); 5283 } 5284 } 5285 5286 static struct eth_driver mlx4_driver; 5287 5288 /** 5289 * DPDK callback to register a PCI device. 5290 * 5291 * This function creates an Ethernet device for each port of a given 5292 * PCI device. 5293 * 5294 * @param[in] pci_drv 5295 * PCI driver structure (mlx4_driver). 5296 * @param[in] pci_dev 5297 * PCI device information. 5298 * 5299 * @return 5300 * 0 on success, negative errno value on failure. 5301 */ 5302 static int 5303 mlx4_pci_devinit(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev) 5304 { 5305 struct ibv_device **list; 5306 struct ibv_device *ibv_dev; 5307 int err = 0; 5308 struct ibv_context *attr_ctx = NULL; 5309 struct ibv_device_attr device_attr; 5310 unsigned int vf; 5311 int idx; 5312 int i; 5313 5314 (void)pci_drv; 5315 assert(pci_drv == &mlx4_driver.pci_drv); 5316 /* Get mlx4_dev[] index. */ 5317 idx = mlx4_dev_idx(&pci_dev->addr); 5318 if (idx == -1) { 5319 ERROR("this driver cannot support any more adapters"); 5320 return -ENOMEM; 5321 } 5322 DEBUG("using driver device index %d", idx); 5323 5324 /* Save PCI address. */ 5325 mlx4_dev[idx].pci_addr = pci_dev->addr; 5326 list = ibv_get_device_list(&i); 5327 if (list == NULL) { 5328 assert(errno); 5329 if (errno == ENOSYS) { 5330 WARN("cannot list devices, is ib_uverbs loaded?"); 5331 return 0; 5332 } 5333 return -errno; 5334 } 5335 assert(i >= 0); 5336 /* 5337 * For each listed device, check related sysfs entry against 5338 * the provided PCI ID. 5339 */ 5340 while (i != 0) { 5341 struct rte_pci_addr pci_addr; 5342 5343 --i; 5344 DEBUG("checking device \"%s\"", list[i]->name); 5345 if (mlx4_ibv_device_to_pci_addr(list[i], &pci_addr)) 5346 continue; 5347 if ((pci_dev->addr.domain != pci_addr.domain) || 5348 (pci_dev->addr.bus != pci_addr.bus) || 5349 (pci_dev->addr.devid != pci_addr.devid) || 5350 (pci_dev->addr.function != pci_addr.function)) 5351 continue; 5352 vf = (pci_dev->id.device_id == 5353 PCI_DEVICE_ID_MELLANOX_CONNECTX3VF); 5354 INFO("PCI information matches, using device \"%s\" (VF: %s)", 5355 list[i]->name, (vf ? 
"true" : "false")); 5356 attr_ctx = ibv_open_device(list[i]); 5357 err = errno; 5358 break; 5359 } 5360 if (attr_ctx == NULL) { 5361 ibv_free_device_list(list); 5362 switch (err) { 5363 case 0: 5364 WARN("cannot access device, is mlx4_ib loaded?"); 5365 return 0; 5366 case EINVAL: 5367 WARN("cannot use device, are drivers up to date?"); 5368 return 0; 5369 } 5370 assert(err > 0); 5371 return -err; 5372 } 5373 ibv_dev = list[i]; 5374 5375 DEBUG("device opened"); 5376 if (ibv_query_device(attr_ctx, &device_attr)) 5377 goto error; 5378 INFO("%u port(s) detected", device_attr.phys_port_cnt); 5379 5380 for (i = 0; i < device_attr.phys_port_cnt; i++) { 5381 uint32_t port = i + 1; /* ports are indexed from one */ 5382 uint32_t test = (1 << i); 5383 struct ibv_context *ctx = NULL; 5384 struct ibv_port_attr port_attr; 5385 struct ibv_pd *pd = NULL; 5386 struct priv *priv = NULL; 5387 struct rte_eth_dev *eth_dev = NULL; 5388 #ifdef HAVE_EXP_QUERY_DEVICE 5389 struct ibv_exp_device_attr exp_device_attr; 5390 #endif /* HAVE_EXP_QUERY_DEVICE */ 5391 struct ether_addr mac; 5392 5393 #ifdef HAVE_EXP_QUERY_DEVICE 5394 exp_device_attr.comp_mask = IBV_EXP_DEVICE_ATTR_EXP_CAP_FLAGS; 5395 #ifdef RSS_SUPPORT 5396 exp_device_attr.comp_mask |= IBV_EXP_DEVICE_ATTR_RSS_TBL_SZ; 5397 #endif /* RSS_SUPPORT */ 5398 #endif /* HAVE_EXP_QUERY_DEVICE */ 5399 5400 DEBUG("using port %u (%08" PRIx32 ")", port, test); 5401 5402 ctx = ibv_open_device(ibv_dev); 5403 if (ctx == NULL) 5404 goto port_error; 5405 5406 /* Check port status. */ 5407 err = ibv_query_port(ctx, port, &port_attr); 5408 if (err) { 5409 ERROR("port query failed: %s", strerror(err)); 5410 goto port_error; 5411 } 5412 if (port_attr.state != IBV_PORT_ACTIVE) 5413 DEBUG("port %d is not active: \"%s\" (%d)", 5414 port, ibv_port_state_str(port_attr.state), 5415 port_attr.state); 5416 5417 /* Allocate protection domain. */ 5418 pd = ibv_alloc_pd(ctx); 5419 if (pd == NULL) { 5420 ERROR("PD allocation failure"); 5421 err = ENOMEM; 5422 goto port_error; 5423 } 5424 5425 mlx4_dev[idx].ports |= test; 5426 5427 /* from rte_ethdev.c */ 5428 priv = rte_zmalloc("ethdev private structure", 5429 sizeof(*priv), 5430 RTE_CACHE_LINE_SIZE); 5431 if (priv == NULL) { 5432 ERROR("priv allocation failure"); 5433 err = ENOMEM; 5434 goto port_error; 5435 } 5436 5437 priv->ctx = ctx; 5438 priv->device_attr = device_attr; 5439 priv->port = port; 5440 priv->pd = pd; 5441 priv->mtu = ETHER_MTU; 5442 #ifdef HAVE_EXP_QUERY_DEVICE 5443 if (ibv_exp_query_device(ctx, &exp_device_attr)) { 5444 ERROR("ibv_exp_query_device() failed"); 5445 goto port_error; 5446 } 5447 #ifdef RSS_SUPPORT 5448 if ((exp_device_attr.exp_device_cap_flags & 5449 IBV_EXP_DEVICE_QPG) && 5450 (exp_device_attr.exp_device_cap_flags & 5451 IBV_EXP_DEVICE_UD_RSS) && 5452 (exp_device_attr.comp_mask & 5453 IBV_EXP_DEVICE_ATTR_RSS_TBL_SZ) && 5454 (exp_device_attr.max_rss_tbl_sz > 0)) { 5455 priv->hw_qpg = 1; 5456 priv->hw_rss = 1; 5457 priv->max_rss_tbl_sz = exp_device_attr.max_rss_tbl_sz; 5458 } else { 5459 priv->hw_qpg = 0; 5460 priv->hw_rss = 0; 5461 priv->max_rss_tbl_sz = 0; 5462 } 5463 priv->hw_tss = !!(exp_device_attr.exp_device_cap_flags & 5464 IBV_EXP_DEVICE_UD_TSS); 5465 DEBUG("device flags: %s%s%s", 5466 (priv->hw_qpg ? "IBV_DEVICE_QPG " : ""), 5467 (priv->hw_tss ? "IBV_DEVICE_TSS " : ""), 5468 (priv->hw_rss ? 
"IBV_DEVICE_RSS " : "")); 5469 if (priv->hw_rss) 5470 DEBUG("maximum RSS indirection table size: %u", 5471 exp_device_attr.max_rss_tbl_sz); 5472 #endif /* RSS_SUPPORT */ 5473 5474 priv->hw_csum = 5475 ((exp_device_attr.exp_device_cap_flags & 5476 IBV_EXP_DEVICE_RX_CSUM_TCP_UDP_PKT) && 5477 (exp_device_attr.exp_device_cap_flags & 5478 IBV_EXP_DEVICE_RX_CSUM_IP_PKT)); 5479 DEBUG("checksum offloading is %ssupported", 5480 (priv->hw_csum ? "" : "not ")); 5481 5482 priv->hw_csum_l2tun = !!(exp_device_attr.exp_device_cap_flags & 5483 IBV_EXP_DEVICE_VXLAN_SUPPORT); 5484 DEBUG("L2 tunnel checksum offloads are %ssupported", 5485 (priv->hw_csum_l2tun ? "" : "not ")); 5486 5487 #ifdef INLINE_RECV 5488 priv->inl_recv_size = mlx4_getenv_int("MLX4_INLINE_RECV_SIZE"); 5489 5490 if (priv->inl_recv_size) { 5491 exp_device_attr.comp_mask = 5492 IBV_EXP_DEVICE_ATTR_INLINE_RECV_SZ; 5493 if (ibv_exp_query_device(ctx, &exp_device_attr)) { 5494 INFO("Couldn't query device for inline-receive" 5495 " capabilities."); 5496 priv->inl_recv_size = 0; 5497 } else { 5498 if ((unsigned)exp_device_attr.inline_recv_sz < 5499 priv->inl_recv_size) { 5500 INFO("Max inline-receive (%d) <" 5501 " requested inline-receive (%u)", 5502 exp_device_attr.inline_recv_sz, 5503 priv->inl_recv_size); 5504 priv->inl_recv_size = 5505 exp_device_attr.inline_recv_sz; 5506 } 5507 } 5508 INFO("Set inline receive size to %u", 5509 priv->inl_recv_size); 5510 } 5511 #endif /* INLINE_RECV */ 5512 #endif /* HAVE_EXP_QUERY_DEVICE */ 5513 5514 (void)mlx4_getenv_int; 5515 priv->vf = vf; 5516 /* Configure the first MAC address by default. */ 5517 if (priv_get_mac(priv, &mac.addr_bytes)) { 5518 ERROR("cannot get MAC address, is mlx4_en loaded?" 5519 " (errno: %s)", strerror(errno)); 5520 goto port_error; 5521 } 5522 INFO("port %u MAC address is %02x:%02x:%02x:%02x:%02x:%02x", 5523 priv->port, 5524 mac.addr_bytes[0], mac.addr_bytes[1], 5525 mac.addr_bytes[2], mac.addr_bytes[3], 5526 mac.addr_bytes[4], mac.addr_bytes[5]); 5527 /* Register MAC and broadcast addresses. */ 5528 claim_zero(priv_mac_addr_add(priv, 0, 5529 (const uint8_t (*)[ETHER_ADDR_LEN]) 5530 mac.addr_bytes)); 5531 claim_zero(priv_mac_addr_add(priv, (elemof(priv->mac) - 1), 5532 &(const uint8_t [ETHER_ADDR_LEN]) 5533 { "\xff\xff\xff\xff\xff\xff" })); 5534 #ifndef NDEBUG 5535 { 5536 char ifname[IF_NAMESIZE]; 5537 5538 if (priv_get_ifname(priv, &ifname) == 0) 5539 DEBUG("port %u ifname is \"%s\"", 5540 priv->port, ifname); 5541 else 5542 DEBUG("port %u ifname is unknown", priv->port); 5543 } 5544 #endif 5545 /* Get actual MTU if possible. */ 5546 priv_get_mtu(priv, &priv->mtu); 5547 DEBUG("port %u MTU is %u", priv->port, priv->mtu); 5548 5549 /* from rte_ethdev.c */ 5550 { 5551 char name[RTE_ETH_NAME_MAX_LEN]; 5552 5553 snprintf(name, sizeof(name), "%s port %u", 5554 ibv_get_device_name(ibv_dev), port); 5555 eth_dev = rte_eth_dev_allocate(name, RTE_ETH_DEV_PCI); 5556 } 5557 if (eth_dev == NULL) { 5558 ERROR("can not allocate rte ethdev"); 5559 err = ENOMEM; 5560 goto port_error; 5561 } 5562 5563 /* Secondary processes have to use local storage for their 5564 * private data as well as a copy of eth_dev->data, but this 5565 * pointer must not be modified before burst functions are 5566 * actually called. 
		 */
		if (mlx4_is_secondary()) {
			struct mlx4_secondary_data *sd =
				&mlx4_secondary_data[eth_dev->data->port_id];

			sd->primary_priv = eth_dev->data->dev_private;
			if (sd->primary_priv == NULL) {
				ERROR("no private data for port %u",
				      eth_dev->data->port_id);
				err = EINVAL;
				goto port_error;
			}
			sd->shared_dev_data = eth_dev->data;
			rte_spinlock_init(&sd->lock);
			memcpy(sd->data.name, sd->shared_dev_data->name,
			       sizeof(sd->data.name));
			sd->data.dev_private = priv;
			sd->data.rx_mbuf_alloc_failed = 0;
			sd->data.mtu = ETHER_MTU;
			sd->data.port_id = sd->shared_dev_data->port_id;
			sd->data.mac_addrs = priv->mac;
			eth_dev->tx_pkt_burst = mlx4_tx_burst_secondary_setup;
			eth_dev->rx_pkt_burst = mlx4_rx_burst_secondary_setup;
		} else {
			eth_dev->data->dev_private = priv;
			eth_dev->data->rx_mbuf_alloc_failed = 0;
			eth_dev->data->mtu = ETHER_MTU;
			eth_dev->data->mac_addrs = priv->mac;
		}
		eth_dev->pci_dev = pci_dev;

		rte_eth_copy_pci_info(eth_dev, pci_dev);

		eth_dev->driver = &mlx4_driver;

		priv->dev = eth_dev;
		eth_dev->dev_ops = &mlx4_dev_ops;
		TAILQ_INIT(&eth_dev->link_intr_cbs);

		/* Bring Ethernet device up. */
		DEBUG("forcing Ethernet interface up");
		priv_set_flags(priv, ~IFF_UP, IFF_UP);
		continue;

port_error:
		rte_free(priv);
		if (pd)
			claim_zero(ibv_dealloc_pd(pd));
		if (ctx)
			claim_zero(ibv_close_device(ctx));
		if (eth_dev)
			rte_eth_dev_release_port(eth_dev);
		break;
	}

	/*
	 * XXX if something went wrong in the loop above, there is a resource
	 * leak (ctx, pd, priv, dpdk ethdev) but we can do nothing about it as
	 * long as the dpdk does not provide a way to deallocate an ethdev and
	 * a way to enumerate the registered ethdevs to free the previous ones.
	 */

	/* no port found, complain */
	if (!mlx4_dev[idx].ports) {
		err = ENODEV;
		goto error;
	}

error:
	if (attr_ctx)
		claim_zero(ibv_close_device(attr_ctx));
	if (list)
		ibv_free_device_list(list);
	assert(err >= 0);
	return -err;
}

static const struct rte_pci_id mlx4_pci_id_map[] = {
	{
		.vendor_id = PCI_VENDOR_ID_MELLANOX,
		.device_id = PCI_DEVICE_ID_MELLANOX_CONNECTX3,
		.subsystem_vendor_id = PCI_ANY_ID,
		.subsystem_device_id = PCI_ANY_ID
	},
	{
		.vendor_id = PCI_VENDOR_ID_MELLANOX,
		.device_id = PCI_DEVICE_ID_MELLANOX_CONNECTX3PRO,
		.subsystem_vendor_id = PCI_ANY_ID,
		.subsystem_device_id = PCI_ANY_ID
	},
	{
		.vendor_id = PCI_VENDOR_ID_MELLANOX,
		.device_id = PCI_DEVICE_ID_MELLANOX_CONNECTX3VF,
		.subsystem_vendor_id = PCI_ANY_ID,
		.subsystem_device_id = PCI_ANY_ID
	},
	{
		.vendor_id = 0
	}
};

static struct eth_driver mlx4_driver = {
	.pci_drv = {
		.name = MLX4_DRIVER_NAME,
		.id_table = mlx4_pci_id_map,
		.devinit = mlx4_pci_devinit,
		.drv_flags = RTE_PCI_DRV_INTR_LSC,
	},
	.dev_private_size = sizeof(struct priv)
};

/**
 * Driver initialization routine.
 */
static int
rte_mlx4_pmd_init(const char *name, const char *args)
{
	(void)name;
	(void)args;
	/*
	 * RDMAV_HUGEPAGES_SAFE tells ibv_fork_init() we intend to use
	 * huge pages. Calling ibv_fork_init() during init allows
	 * applications to use fork() safely for purposes other than
	 * using this PMD, which is not supported in forked processes.
	 */
	setenv("RDMAV_HUGEPAGES_SAFE", "1", 1);
	ibv_fork_init();
	rte_eal_pci_register(&mlx4_driver.pci_drv);
	return 0;
}

static struct rte_driver rte_mlx4_driver = {
	.type = PMD_PDEV,
	.name = MLX4_DRIVER_NAME,
	.init = rte_mlx4_pmd_init,
};

PMD_REGISTER_DRIVER(rte_mlx4_driver)
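
/*
 * Illustrative sketch (not part of the driver): how an application is
 * typically expected to reach the callbacks implemented above through the
 * generic ethdev API. Identifiers such as "mbuf_pool", "port_id" and the
 * pool/ring sizes below are arbitrary example values, not requirements of
 * this PMD.
 *
 *	uint8_t port_id = 0;	// assuming this PMD owns ethdev port 0
 *	struct rte_eth_conf port_conf;
 *	struct rte_mempool *mbuf_pool;
 *	struct rte_mbuf *pkts[32];
 *	uint16_t nb_rx;
 *
 *	memset(&port_conf, 0, sizeof(port_conf));
 *	mbuf_pool = rte_pktmbuf_pool_create("mbuf_pool", 4096, 128, 0,
 *					    RTE_MBUF_DEFAULT_BUF_SIZE,
 *					    rte_socket_id());
 *	// Handled by mlx4_dev_configure().
 *	rte_eth_dev_configure(port_id, 1, 1, &port_conf);
 *	// Handled by mlx4_rx_queue_setup() and mlx4_tx_queue_setup().
 *	rte_eth_rx_queue_setup(port_id, 0, 256, rte_socket_id(), NULL,
 *			       mbuf_pool);
 *	rte_eth_tx_queue_setup(port_id, 0, 256, rte_socket_id(), NULL);
 *	// Handled by mlx4_dev_start(), which attaches the configured flows.
 *	rte_eth_dev_start(port_id);
 *	// Serviced by mlx4_rx_burst() or mlx4_rx_burst_sp().
 *	nb_rx = rte_eth_rx_burst(port_id, 0, pkts, 32);
 */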