1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright 2015 6WIND S.A. 3 * Copyright 2015 Mellanox. 4 */ 5 6 #include <stddef.h> 7 #include <unistd.h> 8 #include <string.h> 9 #include <assert.h> 10 #include <dlfcn.h> 11 #include <stdint.h> 12 #include <stdlib.h> 13 #include <errno.h> 14 #include <net/if.h> 15 #include <sys/mman.h> 16 17 /* Verbs header. */ 18 /* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */ 19 #ifdef PEDANTIC 20 #pragma GCC diagnostic ignored "-Wpedantic" 21 #endif 22 #include <infiniband/verbs.h> 23 #ifdef PEDANTIC 24 #pragma GCC diagnostic error "-Wpedantic" 25 #endif 26 27 #include <rte_malloc.h> 28 #include <rte_ethdev_driver.h> 29 #include <rte_ethdev_pci.h> 30 #include <rte_pci.h> 31 #include <rte_bus_pci.h> 32 #include <rte_common.h> 33 #include <rte_config.h> 34 #include <rte_eal_memconfig.h> 35 #include <rte_kvargs.h> 36 37 #include "mlx5.h" 38 #include "mlx5_utils.h" 39 #include "mlx5_rxtx.h" 40 #include "mlx5_autoconf.h" 41 #include "mlx5_defs.h" 42 #include "mlx5_glue.h" 43 44 /* Device parameter to enable RX completion queue compression. */ 45 #define MLX5_RXQ_CQE_COMP_EN "rxq_cqe_comp_en" 46 47 /* Device parameter to configure inline send. */ 48 #define MLX5_TXQ_INLINE "txq_inline" 49 50 /* 51 * Device parameter to configure the number of TX queues threshold for 52 * enabling inline send. 53 */ 54 #define MLX5_TXQS_MIN_INLINE "txqs_min_inline" 55 56 /* Device parameter to enable multi-packet send WQEs. */ 57 #define MLX5_TXQ_MPW_EN "txq_mpw_en" 58 59 /* Device parameter to include 2 dsegs in the title WQEBB. */ 60 #define MLX5_TXQ_MPW_HDR_DSEG_EN "txq_mpw_hdr_dseg_en" 61 62 /* Device parameter to limit the size of inlining packet. */ 63 #define MLX5_TXQ_MAX_INLINE_LEN "txq_max_inline_len" 64 65 /* Device parameter to enable hardware Tx vector. */ 66 #define MLX5_TX_VEC_EN "tx_vec_en" 67 68 /* Device parameter to enable hardware Rx vector. */ 69 #define MLX5_RX_VEC_EN "rx_vec_en" 70 71 #ifndef HAVE_IBV_MLX5_MOD_MPW 72 #define MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED (1 << 2) 73 #define MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW (1 << 3) 74 #endif 75 76 #ifndef HAVE_IBV_MLX5_MOD_CQE_128B_COMP 77 #define MLX5DV_CONTEXT_FLAGS_CQE_128B_COMP (1 << 4) 78 #endif 79 80 /** 81 * Retrieve integer value from environment variable. 82 * 83 * @param[in] name 84 * Environment variable name. 85 * 86 * @return 87 * Integer value, 0 if the variable is not set. 88 */ 89 int 90 mlx5_getenv_int(const char *name) 91 { 92 const char *val = getenv(name); 93 94 if (val == NULL) 95 return 0; 96 return atoi(val); 97 } 98 99 /** 100 * Verbs callback to allocate a memory. This function should allocate the space 101 * according to the size provided residing inside a huge page. 102 * Please note that all allocation must respect the alignment from libmlx5 103 * (i.e. currently sysconf(_SC_PAGESIZE)). 104 * 105 * @param[in] size 106 * The size in bytes of the memory to allocate. 107 * @param[in] data 108 * A pointer to the callback data. 109 * 110 * @return 111 * a pointer to the allocate space. 112 */ 113 static void * 114 mlx5_alloc_verbs_buf(size_t size, void *data) 115 { 116 struct priv *priv = data; 117 void *ret; 118 size_t alignment = sysconf(_SC_PAGESIZE); 119 unsigned int socket = SOCKET_ID_ANY; 120 121 if (priv->verbs_alloc_ctx.type == MLX5_VERBS_ALLOC_TYPE_TX_QUEUE) { 122 const struct mlx5_txq_ctrl *ctrl = priv->verbs_alloc_ctx.obj; 123 124 socket = ctrl->socket; 125 } else if (priv->verbs_alloc_ctx.type == 126 MLX5_VERBS_ALLOC_TYPE_RX_QUEUE) { 127 const struct mlx5_rxq_ctrl *ctrl = priv->verbs_alloc_ctx.obj; 128 129 socket = ctrl->socket; 130 } 131 assert(data != NULL); 132 ret = rte_malloc_socket(__func__, size, alignment, socket); 133 DEBUG("Extern alloc size: %lu, align: %lu: %p", size, alignment, ret); 134 return ret; 135 } 136 137 /** 138 * Verbs callback to free a memory. 139 * 140 * @param[in] ptr 141 * A pointer to the memory to free. 142 * @param[in] data 143 * A pointer to the callback data. 144 */ 145 static void 146 mlx5_free_verbs_buf(void *ptr, void *data __rte_unused) 147 { 148 assert(data != NULL); 149 DEBUG("Extern free request: %p", ptr); 150 rte_free(ptr); 151 } 152 153 /** 154 * DPDK callback to close the device. 155 * 156 * Destroy all queues and objects, free memory. 157 * 158 * @param dev 159 * Pointer to Ethernet device structure. 160 */ 161 static void 162 mlx5_dev_close(struct rte_eth_dev *dev) 163 { 164 struct priv *priv = dev->data->dev_private; 165 unsigned int i; 166 int ret; 167 168 priv_lock(priv); 169 DEBUG("%p: closing device \"%s\"", 170 (void *)dev, 171 ((priv->ctx != NULL) ? priv->ctx->device->name : "")); 172 /* In case mlx5_dev_stop() has not been called. */ 173 priv_dev_interrupt_handler_uninstall(priv, dev); 174 priv_dev_traffic_disable(priv, dev); 175 /* Prevent crashes when queues are still in use. */ 176 dev->rx_pkt_burst = removed_rx_burst; 177 dev->tx_pkt_burst = removed_tx_burst; 178 if (priv->rxqs != NULL) { 179 /* XXX race condition if mlx5_rx_burst() is still running. */ 180 usleep(1000); 181 for (i = 0; (i != priv->rxqs_n); ++i) 182 mlx5_priv_rxq_release(priv, i); 183 priv->rxqs_n = 0; 184 priv->rxqs = NULL; 185 } 186 if (priv->txqs != NULL) { 187 /* XXX race condition if mlx5_tx_burst() is still running. */ 188 usleep(1000); 189 for (i = 0; (i != priv->txqs_n); ++i) 190 mlx5_priv_txq_release(priv, i); 191 priv->txqs_n = 0; 192 priv->txqs = NULL; 193 } 194 if (priv->pd != NULL) { 195 assert(priv->ctx != NULL); 196 claim_zero(mlx5_glue->dealloc_pd(priv->pd)); 197 claim_zero(mlx5_glue->close_device(priv->ctx)); 198 } else 199 assert(priv->ctx == NULL); 200 if (priv->rss_conf.rss_key != NULL) 201 rte_free(priv->rss_conf.rss_key); 202 if (priv->reta_idx != NULL) 203 rte_free(priv->reta_idx); 204 priv_socket_uninit(priv); 205 ret = mlx5_priv_hrxq_ibv_verify(priv); 206 if (ret) 207 WARN("%p: some Hash Rx queue still remain", (void *)priv); 208 ret = mlx5_priv_ind_table_ibv_verify(priv); 209 if (ret) 210 WARN("%p: some Indirection table still remain", (void *)priv); 211 ret = mlx5_priv_rxq_ibv_verify(priv); 212 if (ret) 213 WARN("%p: some Verbs Rx queue still remain", (void *)priv); 214 ret = mlx5_priv_rxq_verify(priv); 215 if (ret) 216 WARN("%p: some Rx Queues still remain", (void *)priv); 217 ret = mlx5_priv_txq_ibv_verify(priv); 218 if (ret) 219 WARN("%p: some Verbs Tx queue still remain", (void *)priv); 220 ret = mlx5_priv_txq_verify(priv); 221 if (ret) 222 WARN("%p: some Tx Queues still remain", (void *)priv); 223 ret = priv_flow_verify(priv); 224 if (ret) 225 WARN("%p: some flows still remain", (void *)priv); 226 ret = priv_mr_verify(priv); 227 if (ret) 228 WARN("%p: some Memory Region still remain", (void *)priv); 229 priv_unlock(priv); 230 memset(priv, 0, sizeof(*priv)); 231 } 232 233 const struct eth_dev_ops mlx5_dev_ops = { 234 .dev_configure = mlx5_dev_configure, 235 .dev_start = mlx5_dev_start, 236 .dev_stop = mlx5_dev_stop, 237 .dev_set_link_down = mlx5_set_link_down, 238 .dev_set_link_up = mlx5_set_link_up, 239 .dev_close = mlx5_dev_close, 240 .promiscuous_enable = mlx5_promiscuous_enable, 241 .promiscuous_disable = mlx5_promiscuous_disable, 242 .allmulticast_enable = mlx5_allmulticast_enable, 243 .allmulticast_disable = mlx5_allmulticast_disable, 244 .link_update = mlx5_link_update, 245 .stats_get = mlx5_stats_get, 246 .stats_reset = mlx5_stats_reset, 247 .xstats_get = mlx5_xstats_get, 248 .xstats_reset = mlx5_xstats_reset, 249 .xstats_get_names = mlx5_xstats_get_names, 250 .dev_infos_get = mlx5_dev_infos_get, 251 .dev_supported_ptypes_get = mlx5_dev_supported_ptypes_get, 252 .vlan_filter_set = mlx5_vlan_filter_set, 253 .rx_queue_setup = mlx5_rx_queue_setup, 254 .tx_queue_setup = mlx5_tx_queue_setup, 255 .rx_queue_release = mlx5_rx_queue_release, 256 .tx_queue_release = mlx5_tx_queue_release, 257 .flow_ctrl_get = mlx5_dev_get_flow_ctrl, 258 .flow_ctrl_set = mlx5_dev_set_flow_ctrl, 259 .mac_addr_remove = mlx5_mac_addr_remove, 260 .mac_addr_add = mlx5_mac_addr_add, 261 .mac_addr_set = mlx5_mac_addr_set, 262 .mtu_set = mlx5_dev_set_mtu, 263 .vlan_strip_queue_set = mlx5_vlan_strip_queue_set, 264 .vlan_offload_set = mlx5_vlan_offload_set, 265 .reta_update = mlx5_dev_rss_reta_update, 266 .reta_query = mlx5_dev_rss_reta_query, 267 .rss_hash_update = mlx5_rss_hash_update, 268 .rss_hash_conf_get = mlx5_rss_hash_conf_get, 269 .filter_ctrl = mlx5_dev_filter_ctrl, 270 .rx_descriptor_status = mlx5_rx_descriptor_status, 271 .tx_descriptor_status = mlx5_tx_descriptor_status, 272 .rx_queue_intr_enable = mlx5_rx_intr_enable, 273 .rx_queue_intr_disable = mlx5_rx_intr_disable, 274 .is_removed = mlx5_is_removed, 275 }; 276 277 static const struct eth_dev_ops mlx5_dev_sec_ops = { 278 .stats_get = mlx5_stats_get, 279 .stats_reset = mlx5_stats_reset, 280 .xstats_get = mlx5_xstats_get, 281 .xstats_reset = mlx5_xstats_reset, 282 .xstats_get_names = mlx5_xstats_get_names, 283 .dev_infos_get = mlx5_dev_infos_get, 284 .rx_descriptor_status = mlx5_rx_descriptor_status, 285 .tx_descriptor_status = mlx5_tx_descriptor_status, 286 }; 287 288 /* Available operators in flow isolated mode. */ 289 const struct eth_dev_ops mlx5_dev_ops_isolate = { 290 .dev_configure = mlx5_dev_configure, 291 .dev_start = mlx5_dev_start, 292 .dev_stop = mlx5_dev_stop, 293 .dev_set_link_down = mlx5_set_link_down, 294 .dev_set_link_up = mlx5_set_link_up, 295 .dev_close = mlx5_dev_close, 296 .link_update = mlx5_link_update, 297 .stats_get = mlx5_stats_get, 298 .stats_reset = mlx5_stats_reset, 299 .xstats_get = mlx5_xstats_get, 300 .xstats_reset = mlx5_xstats_reset, 301 .xstats_get_names = mlx5_xstats_get_names, 302 .dev_infos_get = mlx5_dev_infos_get, 303 .dev_supported_ptypes_get = mlx5_dev_supported_ptypes_get, 304 .vlan_filter_set = mlx5_vlan_filter_set, 305 .rx_queue_setup = mlx5_rx_queue_setup, 306 .tx_queue_setup = mlx5_tx_queue_setup, 307 .rx_queue_release = mlx5_rx_queue_release, 308 .tx_queue_release = mlx5_tx_queue_release, 309 .flow_ctrl_get = mlx5_dev_get_flow_ctrl, 310 .flow_ctrl_set = mlx5_dev_set_flow_ctrl, 311 .mac_addr_remove = mlx5_mac_addr_remove, 312 .mac_addr_add = mlx5_mac_addr_add, 313 .mac_addr_set = mlx5_mac_addr_set, 314 .mtu_set = mlx5_dev_set_mtu, 315 .vlan_strip_queue_set = mlx5_vlan_strip_queue_set, 316 .vlan_offload_set = mlx5_vlan_offload_set, 317 .filter_ctrl = mlx5_dev_filter_ctrl, 318 .rx_descriptor_status = mlx5_rx_descriptor_status, 319 .tx_descriptor_status = mlx5_tx_descriptor_status, 320 .rx_queue_intr_enable = mlx5_rx_intr_enable, 321 .rx_queue_intr_disable = mlx5_rx_intr_disable, 322 .is_removed = mlx5_is_removed, 323 }; 324 325 static struct { 326 struct rte_pci_addr pci_addr; /* associated PCI address */ 327 uint32_t ports; /* physical ports bitfield. */ 328 } mlx5_dev[32]; 329 330 /** 331 * Get device index in mlx5_dev[] from PCI bus address. 332 * 333 * @param[in] pci_addr 334 * PCI bus address to look for. 335 * 336 * @return 337 * mlx5_dev[] index on success, -1 on failure. 338 */ 339 static int 340 mlx5_dev_idx(struct rte_pci_addr *pci_addr) 341 { 342 unsigned int i; 343 int ret = -1; 344 345 assert(pci_addr != NULL); 346 for (i = 0; (i != RTE_DIM(mlx5_dev)); ++i) { 347 if ((mlx5_dev[i].pci_addr.domain == pci_addr->domain) && 348 (mlx5_dev[i].pci_addr.bus == pci_addr->bus) && 349 (mlx5_dev[i].pci_addr.devid == pci_addr->devid) && 350 (mlx5_dev[i].pci_addr.function == pci_addr->function)) 351 return i; 352 if ((mlx5_dev[i].ports == 0) && (ret == -1)) 353 ret = i; 354 } 355 return ret; 356 } 357 358 /** 359 * Verify and store value for device argument. 360 * 361 * @param[in] key 362 * Key argument to verify. 363 * @param[in] val 364 * Value associated with key. 365 * @param opaque 366 * User data. 367 * 368 * @return 369 * 0 on success, negative errno value on failure. 370 */ 371 static int 372 mlx5_args_check(const char *key, const char *val, void *opaque) 373 { 374 struct mlx5_dev_config *config = opaque; 375 unsigned long tmp; 376 377 errno = 0; 378 tmp = strtoul(val, NULL, 0); 379 if (errno) { 380 WARN("%s: \"%s\" is not a valid integer", key, val); 381 return errno; 382 } 383 if (strcmp(MLX5_RXQ_CQE_COMP_EN, key) == 0) { 384 config->cqe_comp = !!tmp; 385 } else if (strcmp(MLX5_TXQ_INLINE, key) == 0) { 386 config->txq_inline = tmp; 387 } else if (strcmp(MLX5_TXQS_MIN_INLINE, key) == 0) { 388 config->txqs_inline = tmp; 389 } else if (strcmp(MLX5_TXQ_MPW_EN, key) == 0) { 390 config->mps = !!tmp ? config->mps : 0; 391 } else if (strcmp(MLX5_TXQ_MPW_HDR_DSEG_EN, key) == 0) { 392 config->mpw_hdr_dseg = !!tmp; 393 } else if (strcmp(MLX5_TXQ_MAX_INLINE_LEN, key) == 0) { 394 config->inline_max_packet_sz = tmp; 395 } else if (strcmp(MLX5_TX_VEC_EN, key) == 0) { 396 config->tx_vec_en = !!tmp; 397 } else if (strcmp(MLX5_RX_VEC_EN, key) == 0) { 398 config->rx_vec_en = !!tmp; 399 } else { 400 WARN("%s: unknown parameter", key); 401 return -EINVAL; 402 } 403 return 0; 404 } 405 406 /** 407 * Parse device parameters. 408 * 409 * @param config 410 * Pointer to device configuration structure. 411 * @param devargs 412 * Device arguments structure. 413 * 414 * @return 415 * 0 on success, errno value on failure. 416 */ 417 static int 418 mlx5_args(struct mlx5_dev_config *config, struct rte_devargs *devargs) 419 { 420 const char **params = (const char *[]){ 421 MLX5_RXQ_CQE_COMP_EN, 422 MLX5_TXQ_INLINE, 423 MLX5_TXQS_MIN_INLINE, 424 MLX5_TXQ_MPW_EN, 425 MLX5_TXQ_MPW_HDR_DSEG_EN, 426 MLX5_TXQ_MAX_INLINE_LEN, 427 MLX5_TX_VEC_EN, 428 MLX5_RX_VEC_EN, 429 NULL, 430 }; 431 struct rte_kvargs *kvlist; 432 int ret = 0; 433 int i; 434 435 if (devargs == NULL) 436 return 0; 437 /* Following UGLY cast is done to pass checkpatch. */ 438 kvlist = rte_kvargs_parse(devargs->args, params); 439 if (kvlist == NULL) 440 return 0; 441 /* Process parameters. */ 442 for (i = 0; (params[i] != NULL); ++i) { 443 if (rte_kvargs_count(kvlist, params[i])) { 444 ret = rte_kvargs_process(kvlist, params[i], 445 mlx5_args_check, config); 446 if (ret != 0) { 447 rte_kvargs_free(kvlist); 448 return ret; 449 } 450 } 451 } 452 rte_kvargs_free(kvlist); 453 return 0; 454 } 455 456 static struct rte_pci_driver mlx5_driver; 457 458 /* 459 * Reserved UAR address space for TXQ UAR(hw doorbell) mapping, process 460 * local resource used by both primary and secondary to avoid duplicate 461 * reservation. 462 * The space has to be available on both primary and secondary process, 463 * TXQ UAR maps to this area using fixed mmap w/o double check. 464 */ 465 static void *uar_base; 466 467 /** 468 * Reserve UAR address space for primary process. 469 * 470 * @param[in] priv 471 * Pointer to private structure. 472 * 473 * @return 474 * 0 on success, errno value on failure. 475 */ 476 static int 477 priv_uar_init_primary(struct priv *priv) 478 { 479 void *addr = (void *)0; 480 int i; 481 const struct rte_mem_config *mcfg; 482 int ret; 483 484 if (uar_base) { /* UAR address space mapped. */ 485 priv->uar_base = uar_base; 486 return 0; 487 } 488 /* find out lower bound of hugepage segments */ 489 mcfg = rte_eal_get_configuration()->mem_config; 490 for (i = 0; i < RTE_MAX_MEMSEG && mcfg->memseg[i].addr; i++) { 491 if (addr) 492 addr = RTE_MIN(addr, mcfg->memseg[i].addr); 493 else 494 addr = mcfg->memseg[i].addr; 495 } 496 /* keep distance to hugepages to minimize potential conflicts. */ 497 addr = RTE_PTR_SUB(addr, MLX5_UAR_OFFSET + MLX5_UAR_SIZE); 498 /* anonymous mmap, no real memory consumption. */ 499 addr = mmap(addr, MLX5_UAR_SIZE, 500 PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); 501 if (addr == MAP_FAILED) { 502 ERROR("Failed to reserve UAR address space, please adjust " 503 "MLX5_UAR_SIZE or try --base-virtaddr"); 504 ret = ENOMEM; 505 return ret; 506 } 507 /* Accept either same addr or a new addr returned from mmap if target 508 * range occupied. 509 */ 510 INFO("Reserved UAR address space: %p", addr); 511 priv->uar_base = addr; /* for primary and secondary UAR re-mmap. */ 512 uar_base = addr; /* process local, don't reserve again. */ 513 return 0; 514 } 515 516 /** 517 * Reserve UAR address space for secondary process, align with 518 * primary process. 519 * 520 * @param[in] priv 521 * Pointer to private structure. 522 * 523 * @return 524 * 0 on success, errno value on failure. 525 */ 526 static int 527 priv_uar_init_secondary(struct priv *priv) 528 { 529 void *addr; 530 int ret; 531 532 assert(priv->uar_base); 533 if (uar_base) { /* already reserved. */ 534 assert(uar_base == priv->uar_base); 535 return 0; 536 } 537 /* anonymous mmap, no real memory consumption. */ 538 addr = mmap(priv->uar_base, MLX5_UAR_SIZE, 539 PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); 540 if (addr == MAP_FAILED) { 541 ERROR("UAR mmap failed: %p size: %llu", 542 priv->uar_base, MLX5_UAR_SIZE); 543 ret = ENXIO; 544 return ret; 545 } 546 if (priv->uar_base != addr) { 547 ERROR("UAR address %p size %llu occupied, please adjust " 548 "MLX5_UAR_OFFSET or try EAL parameter --base-virtaddr", 549 priv->uar_base, MLX5_UAR_SIZE); 550 ret = ENXIO; 551 return ret; 552 } 553 uar_base = addr; /* process local, don't reserve again */ 554 INFO("Reserved UAR address space: %p", addr); 555 return 0; 556 } 557 558 /** 559 * DPDK callback to register a PCI device. 560 * 561 * This function creates an Ethernet device for each port of a given 562 * PCI device. 563 * 564 * @param[in] pci_drv 565 * PCI driver structure (mlx5_driver). 566 * @param[in] pci_dev 567 * PCI device information. 568 * 569 * @return 570 * 0 on success, negative errno value on failure. 571 */ 572 static int 573 mlx5_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev) 574 { 575 struct ibv_device **list; 576 struct ibv_device *ibv_dev; 577 int err = 0; 578 struct ibv_context *attr_ctx = NULL; 579 struct ibv_device_attr_ex device_attr; 580 unsigned int sriov; 581 unsigned int mps; 582 unsigned int cqe_comp; 583 unsigned int tunnel_en = 0; 584 int idx; 585 int i; 586 struct mlx5dv_context attrs_out; 587 #ifdef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT 588 struct ibv_counter_set_description cs_desc; 589 #endif 590 591 (void)pci_drv; 592 assert(pci_drv == &mlx5_driver); 593 /* Get mlx5_dev[] index. */ 594 idx = mlx5_dev_idx(&pci_dev->addr); 595 if (idx == -1) { 596 ERROR("this driver cannot support any more adapters"); 597 return -ENOMEM; 598 } 599 DEBUG("using driver device index %d", idx); 600 601 /* Save PCI address. */ 602 mlx5_dev[idx].pci_addr = pci_dev->addr; 603 list = mlx5_glue->get_device_list(&i); 604 if (list == NULL) { 605 assert(errno); 606 if (errno == ENOSYS) 607 ERROR("cannot list devices, is ib_uverbs loaded?"); 608 return -errno; 609 } 610 assert(i >= 0); 611 /* 612 * For each listed device, check related sysfs entry against 613 * the provided PCI ID. 614 */ 615 while (i != 0) { 616 struct rte_pci_addr pci_addr; 617 618 --i; 619 DEBUG("checking device \"%s\"", list[i]->name); 620 if (mlx5_ibv_device_to_pci_addr(list[i], &pci_addr)) 621 continue; 622 if ((pci_dev->addr.domain != pci_addr.domain) || 623 (pci_dev->addr.bus != pci_addr.bus) || 624 (pci_dev->addr.devid != pci_addr.devid) || 625 (pci_dev->addr.function != pci_addr.function)) 626 continue; 627 sriov = ((pci_dev->id.device_id == 628 PCI_DEVICE_ID_MELLANOX_CONNECTX4VF) || 629 (pci_dev->id.device_id == 630 PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF) || 631 (pci_dev->id.device_id == 632 PCI_DEVICE_ID_MELLANOX_CONNECTX5VF) || 633 (pci_dev->id.device_id == 634 PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF)); 635 switch (pci_dev->id.device_id) { 636 case PCI_DEVICE_ID_MELLANOX_CONNECTX4: 637 tunnel_en = 1; 638 break; 639 case PCI_DEVICE_ID_MELLANOX_CONNECTX4LX: 640 case PCI_DEVICE_ID_MELLANOX_CONNECTX5: 641 case PCI_DEVICE_ID_MELLANOX_CONNECTX5VF: 642 case PCI_DEVICE_ID_MELLANOX_CONNECTX5EX: 643 case PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF: 644 tunnel_en = 1; 645 break; 646 default: 647 break; 648 } 649 INFO("PCI information matches, using device \"%s\"" 650 " (SR-IOV: %s)", 651 list[i]->name, 652 sriov ? "true" : "false"); 653 attr_ctx = mlx5_glue->open_device(list[i]); 654 err = errno; 655 break; 656 } 657 if (attr_ctx == NULL) { 658 mlx5_glue->free_device_list(list); 659 switch (err) { 660 case 0: 661 ERROR("cannot access device, is mlx5_ib loaded?"); 662 return -ENODEV; 663 case EINVAL: 664 ERROR("cannot use device, are drivers up to date?"); 665 return -EINVAL; 666 } 667 assert(err > 0); 668 return -err; 669 } 670 ibv_dev = list[i]; 671 672 DEBUG("device opened"); 673 /* 674 * Multi-packet send is supported by ConnectX-4 Lx PF as well 675 * as all ConnectX-5 devices. 676 */ 677 mlx5_glue->dv_query_device(attr_ctx, &attrs_out); 678 if (attrs_out.flags & MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED) { 679 if (attrs_out.flags & MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW) { 680 DEBUG("Enhanced MPW is supported"); 681 mps = MLX5_MPW_ENHANCED; 682 } else { 683 DEBUG("MPW is supported"); 684 mps = MLX5_MPW; 685 } 686 } else { 687 DEBUG("MPW isn't supported"); 688 mps = MLX5_MPW_DISABLED; 689 } 690 if (RTE_CACHE_LINE_SIZE == 128 && 691 !(attrs_out.flags & MLX5DV_CONTEXT_FLAGS_CQE_128B_COMP)) 692 cqe_comp = 0; 693 else 694 cqe_comp = 1; 695 if (mlx5_glue->query_device_ex(attr_ctx, NULL, &device_attr)) 696 goto error; 697 INFO("%u port(s) detected", device_attr.orig_attr.phys_port_cnt); 698 699 for (i = 0; i < device_attr.orig_attr.phys_port_cnt; i++) { 700 char name[RTE_ETH_NAME_MAX_LEN]; 701 int len; 702 uint32_t port = i + 1; /* ports are indexed from one */ 703 uint32_t test = (1 << i); 704 struct ibv_context *ctx = NULL; 705 struct ibv_port_attr port_attr; 706 struct ibv_pd *pd = NULL; 707 struct priv *priv = NULL; 708 struct rte_eth_dev *eth_dev; 709 struct ibv_device_attr_ex device_attr_ex; 710 struct ether_addr mac; 711 uint16_t num_vfs = 0; 712 struct ibv_device_attr_ex device_attr; 713 struct mlx5_dev_config config = { 714 .cqe_comp = cqe_comp, 715 .mps = mps, 716 .tunnel_en = tunnel_en, 717 .tx_vec_en = 1, 718 .rx_vec_en = 1, 719 .mpw_hdr_dseg = 0, 720 .txq_inline = MLX5_ARG_UNSET, 721 .txqs_inline = MLX5_ARG_UNSET, 722 .inline_max_packet_sz = MLX5_ARG_UNSET, 723 }; 724 725 len = snprintf(name, sizeof(name), PCI_PRI_FMT, 726 pci_dev->addr.domain, pci_dev->addr.bus, 727 pci_dev->addr.devid, pci_dev->addr.function); 728 if (device_attr.orig_attr.phys_port_cnt > 1) 729 snprintf(name + len, sizeof(name), " port %u", i); 730 731 mlx5_dev[idx].ports |= test; 732 733 if (rte_eal_process_type() == RTE_PROC_SECONDARY) { 734 eth_dev = rte_eth_dev_attach_secondary(name); 735 if (eth_dev == NULL) { 736 ERROR("can not attach rte ethdev"); 737 err = ENOMEM; 738 goto error; 739 } 740 eth_dev->device = &pci_dev->device; 741 eth_dev->dev_ops = &mlx5_dev_sec_ops; 742 priv = eth_dev->data->dev_private; 743 err = priv_uar_init_secondary(priv); 744 if (err < 0) { 745 err = -err; 746 goto error; 747 } 748 /* Receive command fd from primary process */ 749 err = priv_socket_connect(priv); 750 if (err < 0) { 751 err = -err; 752 goto error; 753 } 754 /* Remap UAR for Tx queues. */ 755 err = priv_tx_uar_remap(priv, err); 756 if (err) 757 goto error; 758 /* 759 * Ethdev pointer is still required as input since 760 * the primary device is not accessible from the 761 * secondary process. 762 */ 763 eth_dev->rx_pkt_burst = 764 priv_select_rx_function(priv, eth_dev); 765 eth_dev->tx_pkt_burst = 766 priv_select_tx_function(priv, eth_dev); 767 continue; 768 } 769 770 DEBUG("using port %u (%08" PRIx32 ")", port, test); 771 772 ctx = mlx5_glue->open_device(ibv_dev); 773 if (ctx == NULL) { 774 err = ENODEV; 775 goto port_error; 776 } 777 778 mlx5_glue->query_device_ex(ctx, NULL, &device_attr); 779 /* Check port status. */ 780 err = mlx5_glue->query_port(ctx, port, &port_attr); 781 if (err) { 782 ERROR("port query failed: %s", strerror(err)); 783 goto port_error; 784 } 785 786 if (port_attr.link_layer != IBV_LINK_LAYER_ETHERNET) { 787 ERROR("port %d is not configured in Ethernet mode", 788 port); 789 err = EINVAL; 790 goto port_error; 791 } 792 793 if (port_attr.state != IBV_PORT_ACTIVE) 794 DEBUG("port %d is not active: \"%s\" (%d)", 795 port, mlx5_glue->port_state_str(port_attr.state), 796 port_attr.state); 797 798 /* Allocate protection domain. */ 799 pd = mlx5_glue->alloc_pd(ctx); 800 if (pd == NULL) { 801 ERROR("PD allocation failure"); 802 err = ENOMEM; 803 goto port_error; 804 } 805 806 mlx5_dev[idx].ports |= test; 807 808 /* from rte_ethdev.c */ 809 priv = rte_zmalloc("ethdev private structure", 810 sizeof(*priv), 811 RTE_CACHE_LINE_SIZE); 812 if (priv == NULL) { 813 ERROR("priv allocation failure"); 814 err = ENOMEM; 815 goto port_error; 816 } 817 818 priv->ctx = ctx; 819 strncpy(priv->ibdev_path, priv->ctx->device->ibdev_path, 820 sizeof(priv->ibdev_path)); 821 priv->device_attr = device_attr; 822 priv->port = port; 823 priv->pd = pd; 824 priv->mtu = ETHER_MTU; 825 err = mlx5_args(&config, pci_dev->device.devargs); 826 if (err) { 827 ERROR("failed to process device arguments: %s", 828 strerror(err)); 829 goto port_error; 830 } 831 if (mlx5_glue->query_device_ex(ctx, NULL, &device_attr_ex)) { 832 ERROR("ibv_query_device_ex() failed"); 833 goto port_error; 834 } 835 836 config.hw_csum = !!(device_attr_ex.device_cap_flags_ex & 837 IBV_DEVICE_RAW_IP_CSUM); 838 DEBUG("checksum offloading is %ssupported", 839 (config.hw_csum ? "" : "not ")); 840 841 #ifdef HAVE_IBV_DEVICE_VXLAN_SUPPORT 842 config.hw_csum_l2tun = 843 !!(exp_device_attr.exp_device_cap_flags & 844 IBV_DEVICE_VXLAN_SUPPORT); 845 #endif 846 DEBUG("Rx L2 tunnel checksum offloads are %ssupported", 847 (config.hw_csum_l2tun ? "" : "not ")); 848 849 #ifdef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT 850 config.flow_counter_en = !!(device_attr.max_counter_sets); 851 mlx5_glue->describe_counter_set(ctx, 0, &cs_desc); 852 DEBUG("counter type = %d, num of cs = %ld, attributes = %d", 853 cs_desc.counter_type, cs_desc.num_of_cs, 854 cs_desc.attributes); 855 #endif 856 config.ind_table_max_size = 857 device_attr_ex.rss_caps.max_rwq_indirection_table_size; 858 /* Remove this check once DPDK supports larger/variable 859 * indirection tables. */ 860 if (config.ind_table_max_size > 861 (unsigned int)ETH_RSS_RETA_SIZE_512) 862 config.ind_table_max_size = ETH_RSS_RETA_SIZE_512; 863 DEBUG("maximum RX indirection table size is %u", 864 config.ind_table_max_size); 865 config.hw_vlan_strip = !!(device_attr_ex.raw_packet_caps & 866 IBV_RAW_PACKET_CAP_CVLAN_STRIPPING); 867 DEBUG("VLAN stripping is %ssupported", 868 (config.hw_vlan_strip ? "" : "not ")); 869 870 config.hw_fcs_strip = 871 !!(device_attr_ex.orig_attr.device_cap_flags & 872 IBV_WQ_FLAGS_SCATTER_FCS); 873 DEBUG("FCS stripping configuration is %ssupported", 874 (config.hw_fcs_strip ? "" : "not ")); 875 876 #ifdef HAVE_IBV_WQ_FLAG_RX_END_PADDING 877 config.hw_padding = !!device_attr_ex.rx_pad_end_addr_align; 878 #endif 879 DEBUG("hardware RX end alignment padding is %ssupported", 880 (config.hw_padding ? "" : "not ")); 881 882 priv_get_num_vfs(priv, &num_vfs); 883 config.sriov = (num_vfs || sriov); 884 config.tso = ((device_attr_ex.tso_caps.max_tso > 0) && 885 (device_attr_ex.tso_caps.supported_qpts & 886 (1 << IBV_QPT_RAW_PACKET))); 887 if (config.tso) 888 config.tso_max_payload_sz = 889 device_attr_ex.tso_caps.max_tso; 890 if (config.mps && !mps) { 891 ERROR("multi-packet send not supported on this device" 892 " (" MLX5_TXQ_MPW_EN ")"); 893 err = ENOTSUP; 894 goto port_error; 895 } 896 INFO("%sMPS is %s", 897 config.mps == MLX5_MPW_ENHANCED ? "Enhanced " : "", 898 config.mps != MLX5_MPW_DISABLED ? "enabled" : "disabled"); 899 if (config.cqe_comp && !cqe_comp) { 900 WARN("Rx CQE compression isn't supported"); 901 config.cqe_comp = 0; 902 } 903 err = priv_uar_init_primary(priv); 904 if (err) 905 goto port_error; 906 /* Configure the first MAC address by default. */ 907 if (priv_get_mac(priv, &mac.addr_bytes)) { 908 ERROR("cannot get MAC address, is mlx5_en loaded?" 909 " (errno: %s)", strerror(errno)); 910 err = ENODEV; 911 goto port_error; 912 } 913 INFO("port %u MAC address is %02x:%02x:%02x:%02x:%02x:%02x", 914 priv->port, 915 mac.addr_bytes[0], mac.addr_bytes[1], 916 mac.addr_bytes[2], mac.addr_bytes[3], 917 mac.addr_bytes[4], mac.addr_bytes[5]); 918 #ifndef NDEBUG 919 { 920 char ifname[IF_NAMESIZE]; 921 922 if (priv_get_ifname(priv, &ifname) == 0) 923 DEBUG("port %u ifname is \"%s\"", 924 priv->port, ifname); 925 else 926 DEBUG("port %u ifname is unknown", priv->port); 927 } 928 #endif 929 /* Get actual MTU if possible. */ 930 priv_get_mtu(priv, &priv->mtu); 931 DEBUG("port %u MTU is %u", priv->port, priv->mtu); 932 933 eth_dev = rte_eth_dev_allocate(name); 934 if (eth_dev == NULL) { 935 ERROR("can not allocate rte ethdev"); 936 err = ENOMEM; 937 goto port_error; 938 } 939 eth_dev->data->dev_private = priv; 940 eth_dev->data->mac_addrs = priv->mac; 941 eth_dev->device = &pci_dev->device; 942 rte_eth_copy_pci_info(eth_dev, pci_dev); 943 eth_dev->device->driver = &mlx5_driver.driver; 944 /* 945 * Initialize burst functions to prevent crashes before link-up. 946 */ 947 eth_dev->rx_pkt_burst = removed_rx_burst; 948 eth_dev->tx_pkt_burst = removed_tx_burst; 949 priv->dev = eth_dev; 950 eth_dev->dev_ops = &mlx5_dev_ops; 951 /* Register MAC address. */ 952 claim_zero(mlx5_mac_addr_add(eth_dev, &mac, 0, 0)); 953 TAILQ_INIT(&priv->flows); 954 TAILQ_INIT(&priv->ctrl_flows); 955 956 /* Hint libmlx5 to use PMD allocator for data plane resources */ 957 struct mlx5dv_ctx_allocators alctr = { 958 .alloc = &mlx5_alloc_verbs_buf, 959 .free = &mlx5_free_verbs_buf, 960 .data = priv, 961 }; 962 mlx5_glue->dv_set_context_attr(ctx, 963 MLX5DV_CTX_ATTR_BUF_ALLOCATORS, 964 (void *)((uintptr_t)&alctr)); 965 966 /* Bring Ethernet device up. */ 967 DEBUG("forcing Ethernet interface up"); 968 priv_set_flags(priv, ~IFF_UP, IFF_UP); 969 /* Store device configuration on private structure. */ 970 priv->config = config; 971 continue; 972 973 port_error: 974 if (priv) 975 rte_free(priv); 976 if (pd) 977 claim_zero(mlx5_glue->dealloc_pd(pd)); 978 if (ctx) 979 claim_zero(mlx5_glue->close_device(ctx)); 980 break; 981 } 982 983 /* 984 * XXX if something went wrong in the loop above, there is a resource 985 * leak (ctx, pd, priv, dpdk ethdev) but we can do nothing about it as 986 * long as the dpdk does not provide a way to deallocate a ethdev and a 987 * way to enumerate the registered ethdevs to free the previous ones. 988 */ 989 990 /* no port found, complain */ 991 if (!mlx5_dev[idx].ports) { 992 err = ENODEV; 993 goto error; 994 } 995 996 error: 997 if (attr_ctx) 998 claim_zero(mlx5_glue->close_device(attr_ctx)); 999 if (list) 1000 mlx5_glue->free_device_list(list); 1001 assert(err >= 0); 1002 return -err; 1003 } 1004 1005 static const struct rte_pci_id mlx5_pci_id_map[] = { 1006 { 1007 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, 1008 PCI_DEVICE_ID_MELLANOX_CONNECTX4) 1009 }, 1010 { 1011 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, 1012 PCI_DEVICE_ID_MELLANOX_CONNECTX4VF) 1013 }, 1014 { 1015 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, 1016 PCI_DEVICE_ID_MELLANOX_CONNECTX4LX) 1017 }, 1018 { 1019 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, 1020 PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF) 1021 }, 1022 { 1023 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, 1024 PCI_DEVICE_ID_MELLANOX_CONNECTX5) 1025 }, 1026 { 1027 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, 1028 PCI_DEVICE_ID_MELLANOX_CONNECTX5VF) 1029 }, 1030 { 1031 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, 1032 PCI_DEVICE_ID_MELLANOX_CONNECTX5EX) 1033 }, 1034 { 1035 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, 1036 PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF) 1037 }, 1038 { 1039 .vendor_id = 0 1040 } 1041 }; 1042 1043 static struct rte_pci_driver mlx5_driver = { 1044 .driver = { 1045 .name = MLX5_DRIVER_NAME 1046 }, 1047 .id_table = mlx5_pci_id_map, 1048 .probe = mlx5_pci_probe, 1049 .drv_flags = RTE_PCI_DRV_INTR_LSC | RTE_PCI_DRV_INTR_RMV, 1050 }; 1051 1052 #ifdef RTE_LIBRTE_MLX5_DLOPEN_DEPS 1053 1054 /** 1055 * Initialization routine for run-time dependency on rdma-core. 1056 */ 1057 static int 1058 mlx5_glue_init(void) 1059 { 1060 void *handle = NULL; 1061 void **sym; 1062 const char *dlmsg; 1063 1064 handle = dlopen(MLX5_GLUE, RTLD_LAZY); 1065 if (!handle) { 1066 rte_errno = EINVAL; 1067 dlmsg = dlerror(); 1068 if (dlmsg) 1069 WARN("cannot load glue library: %s", dlmsg); 1070 goto glue_error; 1071 } 1072 sym = dlsym(handle, "mlx5_glue"); 1073 if (!sym || !*sym) { 1074 rte_errno = EINVAL; 1075 dlmsg = dlerror(); 1076 if (dlmsg) 1077 ERROR("cannot resolve glue symbol: %s", dlmsg); 1078 goto glue_error; 1079 } 1080 mlx5_glue = *sym; 1081 return 0; 1082 glue_error: 1083 if (handle) 1084 dlclose(handle); 1085 WARN("cannot initialize PMD due to missing run-time" 1086 " dependency on rdma-core libraries (libibverbs," 1087 " libmlx5)"); 1088 return -rte_errno; 1089 } 1090 1091 #endif 1092 1093 /** 1094 * Driver initialization routine. 1095 */ 1096 RTE_INIT(rte_mlx5_pmd_init); 1097 static void 1098 rte_mlx5_pmd_init(void) 1099 { 1100 /* Build the static table for ptype conversion. */ 1101 mlx5_set_ptype_table(); 1102 /* 1103 * RDMAV_HUGEPAGES_SAFE tells ibv_fork_init() we intend to use 1104 * huge pages. Calling ibv_fork_init() during init allows 1105 * applications to use fork() safely for purposes other than 1106 * using this PMD, which is not supported in forked processes. 1107 */ 1108 setenv("RDMAV_HUGEPAGES_SAFE", "1", 1); 1109 /* Match the size of Rx completion entry to the size of a cacheline. */ 1110 if (RTE_CACHE_LINE_SIZE == 128) 1111 setenv("MLX5_CQE_SIZE", "128", 0); 1112 #ifdef RTE_LIBRTE_MLX5_DLOPEN_DEPS 1113 if (mlx5_glue_init()) 1114 return; 1115 assert(mlx5_glue); 1116 #endif 1117 mlx5_glue->fork_init(); 1118 rte_pci_register(&mlx5_driver); 1119 } 1120 1121 RTE_PMD_EXPORT_NAME(net_mlx5, __COUNTER__); 1122 RTE_PMD_REGISTER_PCI_TABLE(net_mlx5, mlx5_pci_id_map); 1123 RTE_PMD_REGISTER_KMOD_DEP(net_mlx5, "* ib_uverbs & mlx5_core & mlx5_ib"); 1124