1 /*- 2 * BSD LICENSE 3 * 4 * Copyright 2015 6WIND S.A. 5 * Copyright 2015 Mellanox. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * * Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * * Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * * Neither the name of 6WIND S.A. nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 */ 33 34 #include <stddef.h> 35 #include <unistd.h> 36 #include <string.h> 37 #include <assert.h> 38 #include <stdint.h> 39 #include <stdlib.h> 40 #include <errno.h> 41 #include <net/if.h> 42 #include <sys/mman.h> 43 44 /* Verbs header. */ 45 /* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */ 46 #ifdef PEDANTIC 47 #pragma GCC diagnostic ignored "-Wpedantic" 48 #endif 49 #include <infiniband/verbs.h> 50 #ifdef PEDANTIC 51 #pragma GCC diagnostic error "-Wpedantic" 52 #endif 53 54 #include <rte_malloc.h> 55 #include <rte_ethdev_driver.h> 56 #include <rte_ethdev_pci.h> 57 #include <rte_pci.h> 58 #include <rte_bus_pci.h> 59 #include <rte_common.h> 60 #include <rte_eal_memconfig.h> 61 #include <rte_kvargs.h> 62 63 #include "mlx5.h" 64 #include "mlx5_utils.h" 65 #include "mlx5_rxtx.h" 66 #include "mlx5_autoconf.h" 67 #include "mlx5_defs.h" 68 69 /* Device parameter to enable RX completion queue compression. */ 70 #define MLX5_RXQ_CQE_COMP_EN "rxq_cqe_comp_en" 71 72 /* Device parameter to configure inline send. */ 73 #define MLX5_TXQ_INLINE "txq_inline" 74 75 /* 76 * Device parameter to configure the number of TX queues threshold for 77 * enabling inline send. 78 */ 79 #define MLX5_TXQS_MIN_INLINE "txqs_min_inline" 80 81 /* Device parameter to enable multi-packet send WQEs. */ 82 #define MLX5_TXQ_MPW_EN "txq_mpw_en" 83 84 /* Device parameter to include 2 dsegs in the title WQEBB. */ 85 #define MLX5_TXQ_MPW_HDR_DSEG_EN "txq_mpw_hdr_dseg_en" 86 87 /* Device parameter to limit the size of inlining packet. */ 88 #define MLX5_TXQ_MAX_INLINE_LEN "txq_max_inline_len" 89 90 /* Device parameter to enable hardware Tx vector. */ 91 #define MLX5_TX_VEC_EN "tx_vec_en" 92 93 /* Device parameter to enable hardware Rx vector. */ 94 #define MLX5_RX_VEC_EN "rx_vec_en" 95 96 #ifndef HAVE_IBV_MLX5_MOD_MPW 97 #define MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED (1 << 2) 98 #define MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW (1 << 3) 99 #endif 100 101 #ifndef HAVE_IBV_MLX5_MOD_CQE_128B_COMP 102 #define MLX5DV_CONTEXT_FLAGS_CQE_128B_COMP (1 << 4) 103 #endif 104 105 /** 106 * Retrieve integer value from environment variable. 107 * 108 * @param[in] name 109 * Environment variable name. 110 * 111 * @return 112 * Integer value, 0 if the variable is not set. 113 */ 114 int 115 mlx5_getenv_int(const char *name) 116 { 117 const char *val = getenv(name); 118 119 if (val == NULL) 120 return 0; 121 return atoi(val); 122 } 123 124 /** 125 * Verbs callback to allocate a memory. This function should allocate the space 126 * according to the size provided residing inside a huge page. 127 * Please note that all allocation must respect the alignment from libmlx5 128 * (i.e. currently sysconf(_SC_PAGESIZE)). 129 * 130 * @param[in] size 131 * The size in bytes of the memory to allocate. 132 * @param[in] data 133 * A pointer to the callback data. 134 * 135 * @return 136 * a pointer to the allocate space. 137 */ 138 static void * 139 mlx5_alloc_verbs_buf(size_t size, void *data) 140 { 141 struct priv *priv = data; 142 void *ret; 143 size_t alignment = sysconf(_SC_PAGESIZE); 144 unsigned int socket = SOCKET_ID_ANY; 145 146 if (priv->verbs_alloc_ctx.type == MLX5_VERBS_ALLOC_TYPE_TX_QUEUE) { 147 const struct mlx5_txq_ctrl *ctrl = priv->verbs_alloc_ctx.obj; 148 149 socket = ctrl->socket; 150 } else if (priv->verbs_alloc_ctx.type == 151 MLX5_VERBS_ALLOC_TYPE_RX_QUEUE) { 152 const struct mlx5_rxq_ctrl *ctrl = priv->verbs_alloc_ctx.obj; 153 154 socket = ctrl->socket; 155 } 156 assert(data != NULL); 157 ret = rte_malloc_socket(__func__, size, alignment, socket); 158 DEBUG("Extern alloc size: %lu, align: %lu: %p", size, alignment, ret); 159 return ret; 160 } 161 162 /** 163 * Verbs callback to free a memory. 164 * 165 * @param[in] ptr 166 * A pointer to the memory to free. 167 * @param[in] data 168 * A pointer to the callback data. 169 */ 170 static void 171 mlx5_free_verbs_buf(void *ptr, void *data __rte_unused) 172 { 173 assert(data != NULL); 174 DEBUG("Extern free request: %p", ptr); 175 rte_free(ptr); 176 } 177 178 /** 179 * DPDK callback to close the device. 180 * 181 * Destroy all queues and objects, free memory. 182 * 183 * @param dev 184 * Pointer to Ethernet device structure. 185 */ 186 static void 187 mlx5_dev_close(struct rte_eth_dev *dev) 188 { 189 struct priv *priv = dev->data->dev_private; 190 unsigned int i; 191 int ret; 192 193 priv_lock(priv); 194 DEBUG("%p: closing device \"%s\"", 195 (void *)dev, 196 ((priv->ctx != NULL) ? priv->ctx->device->name : "")); 197 /* In case mlx5_dev_stop() has not been called. */ 198 priv_dev_interrupt_handler_uninstall(priv, dev); 199 priv_dev_traffic_disable(priv, dev); 200 /* Prevent crashes when queues are still in use. */ 201 dev->rx_pkt_burst = removed_rx_burst; 202 dev->tx_pkt_burst = removed_tx_burst; 203 if (priv->rxqs != NULL) { 204 /* XXX race condition if mlx5_rx_burst() is still running. */ 205 usleep(1000); 206 for (i = 0; (i != priv->rxqs_n); ++i) 207 mlx5_priv_rxq_release(priv, i); 208 priv->rxqs_n = 0; 209 priv->rxqs = NULL; 210 } 211 if (priv->txqs != NULL) { 212 /* XXX race condition if mlx5_tx_burst() is still running. */ 213 usleep(1000); 214 for (i = 0; (i != priv->txqs_n); ++i) 215 mlx5_priv_txq_release(priv, i); 216 priv->txqs_n = 0; 217 priv->txqs = NULL; 218 } 219 if (priv->pd != NULL) { 220 assert(priv->ctx != NULL); 221 claim_zero(ibv_dealloc_pd(priv->pd)); 222 claim_zero(ibv_close_device(priv->ctx)); 223 } else 224 assert(priv->ctx == NULL); 225 if (priv->rss_conf.rss_key != NULL) 226 rte_free(priv->rss_conf.rss_key); 227 if (priv->reta_idx != NULL) 228 rte_free(priv->reta_idx); 229 priv_socket_uninit(priv); 230 ret = mlx5_priv_hrxq_ibv_verify(priv); 231 if (ret) 232 WARN("%p: some Hash Rx queue still remain", (void *)priv); 233 ret = mlx5_priv_ind_table_ibv_verify(priv); 234 if (ret) 235 WARN("%p: some Indirection table still remain", (void *)priv); 236 ret = mlx5_priv_rxq_ibv_verify(priv); 237 if (ret) 238 WARN("%p: some Verbs Rx queue still remain", (void *)priv); 239 ret = mlx5_priv_rxq_verify(priv); 240 if (ret) 241 WARN("%p: some Rx Queues still remain", (void *)priv); 242 ret = mlx5_priv_txq_ibv_verify(priv); 243 if (ret) 244 WARN("%p: some Verbs Tx queue still remain", (void *)priv); 245 ret = mlx5_priv_txq_verify(priv); 246 if (ret) 247 WARN("%p: some Tx Queues still remain", (void *)priv); 248 ret = priv_flow_verify(priv); 249 if (ret) 250 WARN("%p: some flows still remain", (void *)priv); 251 ret = priv_mr_verify(priv); 252 if (ret) 253 WARN("%p: some Memory Region still remain", (void *)priv); 254 priv_unlock(priv); 255 memset(priv, 0, sizeof(*priv)); 256 } 257 258 const struct eth_dev_ops mlx5_dev_ops = { 259 .dev_configure = mlx5_dev_configure, 260 .dev_start = mlx5_dev_start, 261 .dev_stop = mlx5_dev_stop, 262 .dev_set_link_down = mlx5_set_link_down, 263 .dev_set_link_up = mlx5_set_link_up, 264 .dev_close = mlx5_dev_close, 265 .promiscuous_enable = mlx5_promiscuous_enable, 266 .promiscuous_disable = mlx5_promiscuous_disable, 267 .allmulticast_enable = mlx5_allmulticast_enable, 268 .allmulticast_disable = mlx5_allmulticast_disable, 269 .link_update = mlx5_link_update, 270 .stats_get = mlx5_stats_get, 271 .stats_reset = mlx5_stats_reset, 272 .xstats_get = mlx5_xstats_get, 273 .xstats_reset = mlx5_xstats_reset, 274 .xstats_get_names = mlx5_xstats_get_names, 275 .dev_infos_get = mlx5_dev_infos_get, 276 .dev_supported_ptypes_get = mlx5_dev_supported_ptypes_get, 277 .vlan_filter_set = mlx5_vlan_filter_set, 278 .rx_queue_setup = mlx5_rx_queue_setup, 279 .tx_queue_setup = mlx5_tx_queue_setup, 280 .rx_queue_release = mlx5_rx_queue_release, 281 .tx_queue_release = mlx5_tx_queue_release, 282 .flow_ctrl_get = mlx5_dev_get_flow_ctrl, 283 .flow_ctrl_set = mlx5_dev_set_flow_ctrl, 284 .mac_addr_remove = mlx5_mac_addr_remove, 285 .mac_addr_add = mlx5_mac_addr_add, 286 .mac_addr_set = mlx5_mac_addr_set, 287 .mtu_set = mlx5_dev_set_mtu, 288 .vlan_strip_queue_set = mlx5_vlan_strip_queue_set, 289 .vlan_offload_set = mlx5_vlan_offload_set, 290 .reta_update = mlx5_dev_rss_reta_update, 291 .reta_query = mlx5_dev_rss_reta_query, 292 .rss_hash_update = mlx5_rss_hash_update, 293 .rss_hash_conf_get = mlx5_rss_hash_conf_get, 294 .filter_ctrl = mlx5_dev_filter_ctrl, 295 .rx_descriptor_status = mlx5_rx_descriptor_status, 296 .tx_descriptor_status = mlx5_tx_descriptor_status, 297 .rx_queue_intr_enable = mlx5_rx_intr_enable, 298 .rx_queue_intr_disable = mlx5_rx_intr_disable, 299 .is_removed = mlx5_is_removed, 300 }; 301 302 static const struct eth_dev_ops mlx5_dev_sec_ops = { 303 .stats_get = mlx5_stats_get, 304 .stats_reset = mlx5_stats_reset, 305 .xstats_get = mlx5_xstats_get, 306 .xstats_reset = mlx5_xstats_reset, 307 .xstats_get_names = mlx5_xstats_get_names, 308 .dev_infos_get = mlx5_dev_infos_get, 309 .rx_descriptor_status = mlx5_rx_descriptor_status, 310 .tx_descriptor_status = mlx5_tx_descriptor_status, 311 }; 312 313 /* Available operators in flow isolated mode. */ 314 const struct eth_dev_ops mlx5_dev_ops_isolate = { 315 .dev_configure = mlx5_dev_configure, 316 .dev_start = mlx5_dev_start, 317 .dev_stop = mlx5_dev_stop, 318 .dev_set_link_down = mlx5_set_link_down, 319 .dev_set_link_up = mlx5_set_link_up, 320 .dev_close = mlx5_dev_close, 321 .link_update = mlx5_link_update, 322 .stats_get = mlx5_stats_get, 323 .stats_reset = mlx5_stats_reset, 324 .xstats_get = mlx5_xstats_get, 325 .xstats_reset = mlx5_xstats_reset, 326 .xstats_get_names = mlx5_xstats_get_names, 327 .dev_infos_get = mlx5_dev_infos_get, 328 .dev_supported_ptypes_get = mlx5_dev_supported_ptypes_get, 329 .vlan_filter_set = mlx5_vlan_filter_set, 330 .rx_queue_setup = mlx5_rx_queue_setup, 331 .tx_queue_setup = mlx5_tx_queue_setup, 332 .rx_queue_release = mlx5_rx_queue_release, 333 .tx_queue_release = mlx5_tx_queue_release, 334 .flow_ctrl_get = mlx5_dev_get_flow_ctrl, 335 .flow_ctrl_set = mlx5_dev_set_flow_ctrl, 336 .mac_addr_remove = mlx5_mac_addr_remove, 337 .mac_addr_add = mlx5_mac_addr_add, 338 .mac_addr_set = mlx5_mac_addr_set, 339 .mtu_set = mlx5_dev_set_mtu, 340 .vlan_strip_queue_set = mlx5_vlan_strip_queue_set, 341 .vlan_offload_set = mlx5_vlan_offload_set, 342 .filter_ctrl = mlx5_dev_filter_ctrl, 343 .rx_descriptor_status = mlx5_rx_descriptor_status, 344 .tx_descriptor_status = mlx5_tx_descriptor_status, 345 .rx_queue_intr_enable = mlx5_rx_intr_enable, 346 .rx_queue_intr_disable = mlx5_rx_intr_disable, 347 .is_removed = mlx5_is_removed, 348 }; 349 350 static struct { 351 struct rte_pci_addr pci_addr; /* associated PCI address */ 352 uint32_t ports; /* physical ports bitfield. */ 353 } mlx5_dev[32]; 354 355 /** 356 * Get device index in mlx5_dev[] from PCI bus address. 357 * 358 * @param[in] pci_addr 359 * PCI bus address to look for. 360 * 361 * @return 362 * mlx5_dev[] index on success, -1 on failure. 363 */ 364 static int 365 mlx5_dev_idx(struct rte_pci_addr *pci_addr) 366 { 367 unsigned int i; 368 int ret = -1; 369 370 assert(pci_addr != NULL); 371 for (i = 0; (i != RTE_DIM(mlx5_dev)); ++i) { 372 if ((mlx5_dev[i].pci_addr.domain == pci_addr->domain) && 373 (mlx5_dev[i].pci_addr.bus == pci_addr->bus) && 374 (mlx5_dev[i].pci_addr.devid == pci_addr->devid) && 375 (mlx5_dev[i].pci_addr.function == pci_addr->function)) 376 return i; 377 if ((mlx5_dev[i].ports == 0) && (ret == -1)) 378 ret = i; 379 } 380 return ret; 381 } 382 383 /** 384 * Verify and store value for device argument. 385 * 386 * @param[in] key 387 * Key argument to verify. 388 * @param[in] val 389 * Value associated with key. 390 * @param opaque 391 * User data. 392 * 393 * @return 394 * 0 on success, negative errno value on failure. 395 */ 396 static int 397 mlx5_args_check(const char *key, const char *val, void *opaque) 398 { 399 struct mlx5_dev_config *config = opaque; 400 unsigned long tmp; 401 402 errno = 0; 403 tmp = strtoul(val, NULL, 0); 404 if (errno) { 405 WARN("%s: \"%s\" is not a valid integer", key, val); 406 return errno; 407 } 408 if (strcmp(MLX5_RXQ_CQE_COMP_EN, key) == 0) { 409 config->cqe_comp = !!tmp; 410 } else if (strcmp(MLX5_TXQ_INLINE, key) == 0) { 411 config->txq_inline = tmp; 412 } else if (strcmp(MLX5_TXQS_MIN_INLINE, key) == 0) { 413 config->txqs_inline = tmp; 414 } else if (strcmp(MLX5_TXQ_MPW_EN, key) == 0) { 415 config->mps = !!tmp ? config->mps : 0; 416 } else if (strcmp(MLX5_TXQ_MPW_HDR_DSEG_EN, key) == 0) { 417 config->mpw_hdr_dseg = !!tmp; 418 } else if (strcmp(MLX5_TXQ_MAX_INLINE_LEN, key) == 0) { 419 config->inline_max_packet_sz = tmp; 420 } else if (strcmp(MLX5_TX_VEC_EN, key) == 0) { 421 config->tx_vec_en = !!tmp; 422 } else if (strcmp(MLX5_RX_VEC_EN, key) == 0) { 423 config->rx_vec_en = !!tmp; 424 } else { 425 WARN("%s: unknown parameter", key); 426 return -EINVAL; 427 } 428 return 0; 429 } 430 431 /** 432 * Parse device parameters. 433 * 434 * @param config 435 * Pointer to device configuration structure. 436 * @param devargs 437 * Device arguments structure. 438 * 439 * @return 440 * 0 on success, errno value on failure. 441 */ 442 static int 443 mlx5_args(struct mlx5_dev_config *config, struct rte_devargs *devargs) 444 { 445 const char **params = (const char *[]){ 446 MLX5_RXQ_CQE_COMP_EN, 447 MLX5_TXQ_INLINE, 448 MLX5_TXQS_MIN_INLINE, 449 MLX5_TXQ_MPW_EN, 450 MLX5_TXQ_MPW_HDR_DSEG_EN, 451 MLX5_TXQ_MAX_INLINE_LEN, 452 MLX5_TX_VEC_EN, 453 MLX5_RX_VEC_EN, 454 NULL, 455 }; 456 struct rte_kvargs *kvlist; 457 int ret = 0; 458 int i; 459 460 if (devargs == NULL) 461 return 0; 462 /* Following UGLY cast is done to pass checkpatch. */ 463 kvlist = rte_kvargs_parse(devargs->args, params); 464 if (kvlist == NULL) 465 return 0; 466 /* Process parameters. */ 467 for (i = 0; (params[i] != NULL); ++i) { 468 if (rte_kvargs_count(kvlist, params[i])) { 469 ret = rte_kvargs_process(kvlist, params[i], 470 mlx5_args_check, config); 471 if (ret != 0) { 472 rte_kvargs_free(kvlist); 473 return ret; 474 } 475 } 476 } 477 rte_kvargs_free(kvlist); 478 return 0; 479 } 480 481 static struct rte_pci_driver mlx5_driver; 482 483 /* 484 * Reserved UAR address space for TXQ UAR(hw doorbell) mapping, process 485 * local resource used by both primary and secondary to avoid duplicate 486 * reservation. 487 * The space has to be available on both primary and secondary process, 488 * TXQ UAR maps to this area using fixed mmap w/o double check. 489 */ 490 static void *uar_base; 491 492 /** 493 * Reserve UAR address space for primary process. 494 * 495 * @param[in] priv 496 * Pointer to private structure. 497 * 498 * @return 499 * 0 on success, errno value on failure. 500 */ 501 static int 502 priv_uar_init_primary(struct priv *priv) 503 { 504 void *addr = (void *)0; 505 int i; 506 const struct rte_mem_config *mcfg; 507 int ret; 508 509 if (uar_base) { /* UAR address space mapped. */ 510 priv->uar_base = uar_base; 511 return 0; 512 } 513 /* find out lower bound of hugepage segments */ 514 mcfg = rte_eal_get_configuration()->mem_config; 515 for (i = 0; i < RTE_MAX_MEMSEG && mcfg->memseg[i].addr; i++) { 516 if (addr) 517 addr = RTE_MIN(addr, mcfg->memseg[i].addr); 518 else 519 addr = mcfg->memseg[i].addr; 520 } 521 /* keep distance to hugepages to minimize potential conflicts. */ 522 addr = RTE_PTR_SUB(addr, MLX5_UAR_OFFSET + MLX5_UAR_SIZE); 523 /* anonymous mmap, no real memory consumption. */ 524 addr = mmap(addr, MLX5_UAR_SIZE, 525 PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); 526 if (addr == MAP_FAILED) { 527 ERROR("Failed to reserve UAR address space, please adjust " 528 "MLX5_UAR_SIZE or try --base-virtaddr"); 529 ret = ENOMEM; 530 return ret; 531 } 532 /* Accept either same addr or a new addr returned from mmap if target 533 * range occupied. 534 */ 535 INFO("Reserved UAR address space: %p", addr); 536 priv->uar_base = addr; /* for primary and secondary UAR re-mmap. */ 537 uar_base = addr; /* process local, don't reserve again. */ 538 return 0; 539 } 540 541 /** 542 * Reserve UAR address space for secondary process, align with 543 * primary process. 544 * 545 * @param[in] priv 546 * Pointer to private structure. 547 * 548 * @return 549 * 0 on success, errno value on failure. 550 */ 551 static int 552 priv_uar_init_secondary(struct priv *priv) 553 { 554 void *addr; 555 int ret; 556 557 assert(priv->uar_base); 558 if (uar_base) { /* already reserved. */ 559 assert(uar_base == priv->uar_base); 560 return 0; 561 } 562 /* anonymous mmap, no real memory consumption. */ 563 addr = mmap(priv->uar_base, MLX5_UAR_SIZE, 564 PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); 565 if (addr == MAP_FAILED) { 566 ERROR("UAR mmap failed: %p size: %llu", 567 priv->uar_base, MLX5_UAR_SIZE); 568 ret = ENXIO; 569 return ret; 570 } 571 if (priv->uar_base != addr) { 572 ERROR("UAR address %p size %llu occupied, please adjust " 573 "MLX5_UAR_OFFSET or try EAL parameter --base-virtaddr", 574 priv->uar_base, MLX5_UAR_SIZE); 575 ret = ENXIO; 576 return ret; 577 } 578 uar_base = addr; /* process local, don't reserve again */ 579 INFO("Reserved UAR address space: %p", addr); 580 return 0; 581 } 582 583 /** 584 * DPDK callback to register a PCI device. 585 * 586 * This function creates an Ethernet device for each port of a given 587 * PCI device. 588 * 589 * @param[in] pci_drv 590 * PCI driver structure (mlx5_driver). 591 * @param[in] pci_dev 592 * PCI device information. 593 * 594 * @return 595 * 0 on success, negative errno value on failure. 596 */ 597 static int 598 mlx5_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev) 599 { 600 struct ibv_device **list; 601 struct ibv_device *ibv_dev; 602 int err = 0; 603 struct ibv_context *attr_ctx = NULL; 604 struct ibv_device_attr_ex device_attr; 605 unsigned int sriov; 606 unsigned int mps; 607 unsigned int cqe_comp; 608 unsigned int tunnel_en = 0; 609 int idx; 610 int i; 611 struct mlx5dv_context attrs_out; 612 #ifdef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT 613 struct ibv_counter_set_description cs_desc; 614 #endif 615 616 (void)pci_drv; 617 assert(pci_drv == &mlx5_driver); 618 /* Get mlx5_dev[] index. */ 619 idx = mlx5_dev_idx(&pci_dev->addr); 620 if (idx == -1) { 621 ERROR("this driver cannot support any more adapters"); 622 return -ENOMEM; 623 } 624 DEBUG("using driver device index %d", idx); 625 626 /* Save PCI address. */ 627 mlx5_dev[idx].pci_addr = pci_dev->addr; 628 list = ibv_get_device_list(&i); 629 if (list == NULL) { 630 assert(errno); 631 if (errno == ENOSYS) 632 ERROR("cannot list devices, is ib_uverbs loaded?"); 633 return -errno; 634 } 635 assert(i >= 0); 636 /* 637 * For each listed device, check related sysfs entry against 638 * the provided PCI ID. 639 */ 640 while (i != 0) { 641 struct rte_pci_addr pci_addr; 642 643 --i; 644 DEBUG("checking device \"%s\"", list[i]->name); 645 if (mlx5_ibv_device_to_pci_addr(list[i], &pci_addr)) 646 continue; 647 if ((pci_dev->addr.domain != pci_addr.domain) || 648 (pci_dev->addr.bus != pci_addr.bus) || 649 (pci_dev->addr.devid != pci_addr.devid) || 650 (pci_dev->addr.function != pci_addr.function)) 651 continue; 652 sriov = ((pci_dev->id.device_id == 653 PCI_DEVICE_ID_MELLANOX_CONNECTX4VF) || 654 (pci_dev->id.device_id == 655 PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF) || 656 (pci_dev->id.device_id == 657 PCI_DEVICE_ID_MELLANOX_CONNECTX5VF) || 658 (pci_dev->id.device_id == 659 PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF)); 660 switch (pci_dev->id.device_id) { 661 case PCI_DEVICE_ID_MELLANOX_CONNECTX4: 662 tunnel_en = 1; 663 break; 664 case PCI_DEVICE_ID_MELLANOX_CONNECTX4LX: 665 case PCI_DEVICE_ID_MELLANOX_CONNECTX5: 666 case PCI_DEVICE_ID_MELLANOX_CONNECTX5VF: 667 case PCI_DEVICE_ID_MELLANOX_CONNECTX5EX: 668 case PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF: 669 tunnel_en = 1; 670 break; 671 default: 672 break; 673 } 674 INFO("PCI information matches, using device \"%s\"" 675 " (SR-IOV: %s)", 676 list[i]->name, 677 sriov ? "true" : "false"); 678 attr_ctx = ibv_open_device(list[i]); 679 err = errno; 680 break; 681 } 682 if (attr_ctx == NULL) { 683 ibv_free_device_list(list); 684 switch (err) { 685 case 0: 686 ERROR("cannot access device, is mlx5_ib loaded?"); 687 return -ENODEV; 688 case EINVAL: 689 ERROR("cannot use device, are drivers up to date?"); 690 return -EINVAL; 691 } 692 assert(err > 0); 693 return -err; 694 } 695 ibv_dev = list[i]; 696 697 DEBUG("device opened"); 698 /* 699 * Multi-packet send is supported by ConnectX-4 Lx PF as well 700 * as all ConnectX-5 devices. 701 */ 702 mlx5dv_query_device(attr_ctx, &attrs_out); 703 if (attrs_out.flags & MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED) { 704 if (attrs_out.flags & MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW) { 705 DEBUG("Enhanced MPW is supported"); 706 mps = MLX5_MPW_ENHANCED; 707 } else { 708 DEBUG("MPW is supported"); 709 mps = MLX5_MPW; 710 } 711 } else { 712 DEBUG("MPW isn't supported"); 713 mps = MLX5_MPW_DISABLED; 714 } 715 if (RTE_CACHE_LINE_SIZE == 128 && 716 !(attrs_out.flags & MLX5DV_CONTEXT_FLAGS_CQE_128B_COMP)) 717 cqe_comp = 0; 718 else 719 cqe_comp = 1; 720 if (ibv_query_device_ex(attr_ctx, NULL, &device_attr)) 721 goto error; 722 INFO("%u port(s) detected", device_attr.orig_attr.phys_port_cnt); 723 724 for (i = 0; i < device_attr.orig_attr.phys_port_cnt; i++) { 725 char name[RTE_ETH_NAME_MAX_LEN]; 726 int len; 727 uint32_t port = i + 1; /* ports are indexed from one */ 728 uint32_t test = (1 << i); 729 struct ibv_context *ctx = NULL; 730 struct ibv_port_attr port_attr; 731 struct ibv_pd *pd = NULL; 732 struct priv *priv = NULL; 733 struct rte_eth_dev *eth_dev; 734 struct ibv_device_attr_ex device_attr_ex; 735 struct ether_addr mac; 736 uint16_t num_vfs = 0; 737 struct ibv_device_attr_ex device_attr; 738 struct mlx5_dev_config config = { 739 .cqe_comp = cqe_comp, 740 .mps = mps, 741 .tunnel_en = tunnel_en, 742 .tx_vec_en = 1, 743 .rx_vec_en = 1, 744 .mpw_hdr_dseg = 0, 745 .txq_inline = MLX5_ARG_UNSET, 746 .txqs_inline = MLX5_ARG_UNSET, 747 .inline_max_packet_sz = MLX5_ARG_UNSET, 748 }; 749 750 len = snprintf(name, sizeof(name), PCI_PRI_FMT, 751 pci_dev->addr.domain, pci_dev->addr.bus, 752 pci_dev->addr.devid, pci_dev->addr.function); 753 if (device_attr.orig_attr.phys_port_cnt > 1) 754 snprintf(name + len, sizeof(name), " port %u", i); 755 756 mlx5_dev[idx].ports |= test; 757 758 if (rte_eal_process_type() == RTE_PROC_SECONDARY) { 759 eth_dev = rte_eth_dev_attach_secondary(name); 760 if (eth_dev == NULL) { 761 ERROR("can not attach rte ethdev"); 762 err = ENOMEM; 763 goto error; 764 } 765 eth_dev->device = &pci_dev->device; 766 eth_dev->dev_ops = &mlx5_dev_sec_ops; 767 priv = eth_dev->data->dev_private; 768 err = priv_uar_init_secondary(priv); 769 if (err < 0) { 770 err = -err; 771 goto error; 772 } 773 /* Receive command fd from primary process */ 774 err = priv_socket_connect(priv); 775 if (err < 0) { 776 err = -err; 777 goto error; 778 } 779 /* Remap UAR for Tx queues. */ 780 err = priv_tx_uar_remap(priv, err); 781 if (err) 782 goto error; 783 /* 784 * Ethdev pointer is still required as input since 785 * the primary device is not accessible from the 786 * secondary process. 787 */ 788 eth_dev->rx_pkt_burst = 789 priv_select_rx_function(priv, eth_dev); 790 eth_dev->tx_pkt_burst = 791 priv_select_tx_function(priv, eth_dev); 792 continue; 793 } 794 795 DEBUG("using port %u (%08" PRIx32 ")", port, test); 796 797 ctx = ibv_open_device(ibv_dev); 798 if (ctx == NULL) { 799 err = ENODEV; 800 goto port_error; 801 } 802 803 ibv_query_device_ex(ctx, NULL, &device_attr); 804 /* Check port status. */ 805 err = ibv_query_port(ctx, port, &port_attr); 806 if (err) { 807 ERROR("port query failed: %s", strerror(err)); 808 goto port_error; 809 } 810 811 if (port_attr.link_layer != IBV_LINK_LAYER_ETHERNET) { 812 ERROR("port %d is not configured in Ethernet mode", 813 port); 814 err = EINVAL; 815 goto port_error; 816 } 817 818 if (port_attr.state != IBV_PORT_ACTIVE) 819 DEBUG("port %d is not active: \"%s\" (%d)", 820 port, ibv_port_state_str(port_attr.state), 821 port_attr.state); 822 823 /* Allocate protection domain. */ 824 pd = ibv_alloc_pd(ctx); 825 if (pd == NULL) { 826 ERROR("PD allocation failure"); 827 err = ENOMEM; 828 goto port_error; 829 } 830 831 mlx5_dev[idx].ports |= test; 832 833 /* from rte_ethdev.c */ 834 priv = rte_zmalloc("ethdev private structure", 835 sizeof(*priv), 836 RTE_CACHE_LINE_SIZE); 837 if (priv == NULL) { 838 ERROR("priv allocation failure"); 839 err = ENOMEM; 840 goto port_error; 841 } 842 843 priv->ctx = ctx; 844 strncpy(priv->ibdev_path, priv->ctx->device->ibdev_path, 845 sizeof(priv->ibdev_path)); 846 priv->device_attr = device_attr; 847 priv->port = port; 848 priv->pd = pd; 849 priv->mtu = ETHER_MTU; 850 err = mlx5_args(&config, pci_dev->device.devargs); 851 if (err) { 852 ERROR("failed to process device arguments: %s", 853 strerror(err)); 854 goto port_error; 855 } 856 if (ibv_query_device_ex(ctx, NULL, &device_attr_ex)) { 857 ERROR("ibv_query_device_ex() failed"); 858 goto port_error; 859 } 860 861 config.hw_csum = !!(device_attr_ex.device_cap_flags_ex & 862 IBV_DEVICE_RAW_IP_CSUM); 863 DEBUG("checksum offloading is %ssupported", 864 (config.hw_csum ? "" : "not ")); 865 866 #ifdef HAVE_IBV_DEVICE_VXLAN_SUPPORT 867 config.hw_csum_l2tun = 868 !!(exp_device_attr.exp_device_cap_flags & 869 IBV_DEVICE_VXLAN_SUPPORT); 870 #endif 871 DEBUG("Rx L2 tunnel checksum offloads are %ssupported", 872 (config.hw_csum_l2tun ? "" : "not ")); 873 874 #ifdef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT 875 config.flow_counter_en = !!(device_attr.max_counter_sets); 876 ibv_describe_counter_set(ctx, 0, &cs_desc); 877 DEBUG("counter type = %d, num of cs = %ld, attributes = %d", 878 cs_desc.counter_type, cs_desc.num_of_cs, 879 cs_desc.attributes); 880 #endif 881 config.ind_table_max_size = 882 device_attr_ex.rss_caps.max_rwq_indirection_table_size; 883 /* Remove this check once DPDK supports larger/variable 884 * indirection tables. */ 885 if (config.ind_table_max_size > 886 (unsigned int)ETH_RSS_RETA_SIZE_512) 887 config.ind_table_max_size = ETH_RSS_RETA_SIZE_512; 888 DEBUG("maximum RX indirection table size is %u", 889 config.ind_table_max_size); 890 config.hw_vlan_strip = !!(device_attr_ex.raw_packet_caps & 891 IBV_RAW_PACKET_CAP_CVLAN_STRIPPING); 892 DEBUG("VLAN stripping is %ssupported", 893 (config.hw_vlan_strip ? "" : "not ")); 894 895 config.hw_fcs_strip = 896 !!(device_attr_ex.orig_attr.device_cap_flags & 897 IBV_WQ_FLAGS_SCATTER_FCS); 898 DEBUG("FCS stripping configuration is %ssupported", 899 (config.hw_fcs_strip ? "" : "not ")); 900 901 #ifdef HAVE_IBV_WQ_FLAG_RX_END_PADDING 902 config.hw_padding = !!device_attr_ex.rx_pad_end_addr_align; 903 #endif 904 DEBUG("hardware RX end alignment padding is %ssupported", 905 (config.hw_padding ? "" : "not ")); 906 907 priv_get_num_vfs(priv, &num_vfs); 908 config.sriov = (num_vfs || sriov); 909 config.tso = ((device_attr_ex.tso_caps.max_tso > 0) && 910 (device_attr_ex.tso_caps.supported_qpts & 911 (1 << IBV_QPT_RAW_PACKET))); 912 if (config.tso) 913 config.tso_max_payload_sz = 914 device_attr_ex.tso_caps.max_tso; 915 if (config.mps && !mps) { 916 ERROR("multi-packet send not supported on this device" 917 " (" MLX5_TXQ_MPW_EN ")"); 918 err = ENOTSUP; 919 goto port_error; 920 } 921 INFO("%sMPS is %s", 922 config.mps == MLX5_MPW_ENHANCED ? "Enhanced " : "", 923 config.mps != MLX5_MPW_DISABLED ? "enabled" : "disabled"); 924 if (config.cqe_comp && !cqe_comp) { 925 WARN("Rx CQE compression isn't supported"); 926 config.cqe_comp = 0; 927 } 928 err = priv_uar_init_primary(priv); 929 if (err) 930 goto port_error; 931 /* Configure the first MAC address by default. */ 932 if (priv_get_mac(priv, &mac.addr_bytes)) { 933 ERROR("cannot get MAC address, is mlx5_en loaded?" 934 " (errno: %s)", strerror(errno)); 935 err = ENODEV; 936 goto port_error; 937 } 938 INFO("port %u MAC address is %02x:%02x:%02x:%02x:%02x:%02x", 939 priv->port, 940 mac.addr_bytes[0], mac.addr_bytes[1], 941 mac.addr_bytes[2], mac.addr_bytes[3], 942 mac.addr_bytes[4], mac.addr_bytes[5]); 943 #ifndef NDEBUG 944 { 945 char ifname[IF_NAMESIZE]; 946 947 if (priv_get_ifname(priv, &ifname) == 0) 948 DEBUG("port %u ifname is \"%s\"", 949 priv->port, ifname); 950 else 951 DEBUG("port %u ifname is unknown", priv->port); 952 } 953 #endif 954 /* Get actual MTU if possible. */ 955 priv_get_mtu(priv, &priv->mtu); 956 DEBUG("port %u MTU is %u", priv->port, priv->mtu); 957 958 eth_dev = rte_eth_dev_allocate(name); 959 if (eth_dev == NULL) { 960 ERROR("can not allocate rte ethdev"); 961 err = ENOMEM; 962 goto port_error; 963 } 964 eth_dev->data->dev_private = priv; 965 eth_dev->data->mac_addrs = priv->mac; 966 eth_dev->device = &pci_dev->device; 967 rte_eth_copy_pci_info(eth_dev, pci_dev); 968 eth_dev->device->driver = &mlx5_driver.driver; 969 /* 970 * Initialize burst functions to prevent crashes before link-up. 971 */ 972 eth_dev->rx_pkt_burst = removed_rx_burst; 973 eth_dev->tx_pkt_burst = removed_tx_burst; 974 priv->dev = eth_dev; 975 eth_dev->dev_ops = &mlx5_dev_ops; 976 /* Register MAC address. */ 977 claim_zero(mlx5_mac_addr_add(eth_dev, &mac, 0, 0)); 978 TAILQ_INIT(&priv->flows); 979 TAILQ_INIT(&priv->ctrl_flows); 980 981 /* Hint libmlx5 to use PMD allocator for data plane resources */ 982 struct mlx5dv_ctx_allocators alctr = { 983 .alloc = &mlx5_alloc_verbs_buf, 984 .free = &mlx5_free_verbs_buf, 985 .data = priv, 986 }; 987 mlx5dv_set_context_attr(ctx, MLX5DV_CTX_ATTR_BUF_ALLOCATORS, 988 (void *)((uintptr_t)&alctr)); 989 990 /* Bring Ethernet device up. */ 991 DEBUG("forcing Ethernet interface up"); 992 priv_set_flags(priv, ~IFF_UP, IFF_UP); 993 /* Store device configuration on private structure. */ 994 priv->config = config; 995 continue; 996 997 port_error: 998 if (priv) 999 rte_free(priv); 1000 if (pd) 1001 claim_zero(ibv_dealloc_pd(pd)); 1002 if (ctx) 1003 claim_zero(ibv_close_device(ctx)); 1004 break; 1005 } 1006 1007 /* 1008 * XXX if something went wrong in the loop above, there is a resource 1009 * leak (ctx, pd, priv, dpdk ethdev) but we can do nothing about it as 1010 * long as the dpdk does not provide a way to deallocate a ethdev and a 1011 * way to enumerate the registered ethdevs to free the previous ones. 1012 */ 1013 1014 /* no port found, complain */ 1015 if (!mlx5_dev[idx].ports) { 1016 err = ENODEV; 1017 goto error; 1018 } 1019 1020 error: 1021 if (attr_ctx) 1022 claim_zero(ibv_close_device(attr_ctx)); 1023 if (list) 1024 ibv_free_device_list(list); 1025 assert(err >= 0); 1026 return -err; 1027 } 1028 1029 static const struct rte_pci_id mlx5_pci_id_map[] = { 1030 { 1031 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, 1032 PCI_DEVICE_ID_MELLANOX_CONNECTX4) 1033 }, 1034 { 1035 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, 1036 PCI_DEVICE_ID_MELLANOX_CONNECTX4VF) 1037 }, 1038 { 1039 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, 1040 PCI_DEVICE_ID_MELLANOX_CONNECTX4LX) 1041 }, 1042 { 1043 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, 1044 PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF) 1045 }, 1046 { 1047 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, 1048 PCI_DEVICE_ID_MELLANOX_CONNECTX5) 1049 }, 1050 { 1051 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, 1052 PCI_DEVICE_ID_MELLANOX_CONNECTX5VF) 1053 }, 1054 { 1055 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, 1056 PCI_DEVICE_ID_MELLANOX_CONNECTX5EX) 1057 }, 1058 { 1059 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, 1060 PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF) 1061 }, 1062 { 1063 .vendor_id = 0 1064 } 1065 }; 1066 1067 static struct rte_pci_driver mlx5_driver = { 1068 .driver = { 1069 .name = MLX5_DRIVER_NAME 1070 }, 1071 .id_table = mlx5_pci_id_map, 1072 .probe = mlx5_pci_probe, 1073 .drv_flags = RTE_PCI_DRV_INTR_LSC | RTE_PCI_DRV_INTR_RMV, 1074 }; 1075 1076 /** 1077 * Driver initialization routine. 1078 */ 1079 RTE_INIT(rte_mlx5_pmd_init); 1080 static void 1081 rte_mlx5_pmd_init(void) 1082 { 1083 /* Build the static table for ptype conversion. */ 1084 mlx5_set_ptype_table(); 1085 /* 1086 * RDMAV_HUGEPAGES_SAFE tells ibv_fork_init() we intend to use 1087 * huge pages. Calling ibv_fork_init() during init allows 1088 * applications to use fork() safely for purposes other than 1089 * using this PMD, which is not supported in forked processes. 1090 */ 1091 setenv("RDMAV_HUGEPAGES_SAFE", "1", 1); 1092 /* Match the size of Rx completion entry to the size of a cacheline. */ 1093 if (RTE_CACHE_LINE_SIZE == 128) 1094 setenv("MLX5_CQE_SIZE", "128", 0); 1095 ibv_fork_init(); 1096 rte_pci_register(&mlx5_driver); 1097 } 1098 1099 RTE_PMD_EXPORT_NAME(net_mlx5, __COUNTER__); 1100 RTE_PMD_REGISTER_PCI_TABLE(net_mlx5, mlx5_pci_id_map); 1101 RTE_PMD_REGISTER_KMOD_DEP(net_mlx5, "* ib_uverbs & mlx5_core & mlx5_ib"); 1102