1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright 2015 6WIND S.A. 3 * Copyright 2015 Mellanox. 4 */ 5 6 #include <stddef.h> 7 #include <unistd.h> 8 #include <string.h> 9 #include <assert.h> 10 #include <dlfcn.h> 11 #include <stdint.h> 12 #include <stdlib.h> 13 #include <errno.h> 14 #include <net/if.h> 15 #include <sys/mman.h> 16 17 /* Verbs header. */ 18 /* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */ 19 #ifdef PEDANTIC 20 #pragma GCC diagnostic ignored "-Wpedantic" 21 #endif 22 #include <infiniband/verbs.h> 23 #ifdef PEDANTIC 24 #pragma GCC diagnostic error "-Wpedantic" 25 #endif 26 27 #include <rte_malloc.h> 28 #include <rte_ethdev_driver.h> 29 #include <rte_ethdev_pci.h> 30 #include <rte_pci.h> 31 #include <rte_bus_pci.h> 32 #include <rte_common.h> 33 #include <rte_config.h> 34 #include <rte_eal_memconfig.h> 35 #include <rte_kvargs.h> 36 37 #include "mlx5.h" 38 #include "mlx5_utils.h" 39 #include "mlx5_rxtx.h" 40 #include "mlx5_autoconf.h" 41 #include "mlx5_defs.h" 42 #include "mlx5_glue.h" 43 44 /* Device parameter to enable RX completion queue compression. */ 45 #define MLX5_RXQ_CQE_COMP_EN "rxq_cqe_comp_en" 46 47 /* Device parameter to configure inline send. */ 48 #define MLX5_TXQ_INLINE "txq_inline" 49 50 /* 51 * Device parameter to configure the number of TX queues threshold for 52 * enabling inline send. 53 */ 54 #define MLX5_TXQS_MIN_INLINE "txqs_min_inline" 55 56 /* Device parameter to enable multi-packet send WQEs. */ 57 #define MLX5_TXQ_MPW_EN "txq_mpw_en" 58 59 /* Device parameter to include 2 dsegs in the title WQEBB. */ 60 #define MLX5_TXQ_MPW_HDR_DSEG_EN "txq_mpw_hdr_dseg_en" 61 62 /* Device parameter to limit the size of inlining packet. */ 63 #define MLX5_TXQ_MAX_INLINE_LEN "txq_max_inline_len" 64 65 /* Device parameter to enable hardware Tx vector. */ 66 #define MLX5_TX_VEC_EN "tx_vec_en" 67 68 /* Device parameter to enable hardware Rx vector. */ 69 #define MLX5_RX_VEC_EN "rx_vec_en" 70 71 #ifndef HAVE_IBV_MLX5_MOD_MPW 72 #define MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED (1 << 2) 73 #define MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW (1 << 3) 74 #endif 75 76 #ifndef HAVE_IBV_MLX5_MOD_CQE_128B_COMP 77 #define MLX5DV_CONTEXT_FLAGS_CQE_128B_COMP (1 << 4) 78 #endif 79 80 /** 81 * Retrieve integer value from environment variable. 82 * 83 * @param[in] name 84 * Environment variable name. 85 * 86 * @return 87 * Integer value, 0 if the variable is not set. 88 */ 89 int 90 mlx5_getenv_int(const char *name) 91 { 92 const char *val = getenv(name); 93 94 if (val == NULL) 95 return 0; 96 return atoi(val); 97 } 98 99 /** 100 * Verbs callback to allocate a memory. This function should allocate the space 101 * according to the size provided residing inside a huge page. 102 * Please note that all allocation must respect the alignment from libmlx5 103 * (i.e. currently sysconf(_SC_PAGESIZE)). 104 * 105 * @param[in] size 106 * The size in bytes of the memory to allocate. 107 * @param[in] data 108 * A pointer to the callback data. 109 * 110 * @return 111 * a pointer to the allocate space. 112 */ 113 static void * 114 mlx5_alloc_verbs_buf(size_t size, void *data) 115 { 116 struct priv *priv = data; 117 void *ret; 118 size_t alignment = sysconf(_SC_PAGESIZE); 119 unsigned int socket = SOCKET_ID_ANY; 120 121 if (priv->verbs_alloc_ctx.type == MLX5_VERBS_ALLOC_TYPE_TX_QUEUE) { 122 const struct mlx5_txq_ctrl *ctrl = priv->verbs_alloc_ctx.obj; 123 124 socket = ctrl->socket; 125 } else if (priv->verbs_alloc_ctx.type == 126 MLX5_VERBS_ALLOC_TYPE_RX_QUEUE) { 127 const struct mlx5_rxq_ctrl *ctrl = priv->verbs_alloc_ctx.obj; 128 129 socket = ctrl->socket; 130 } 131 assert(data != NULL); 132 ret = rte_malloc_socket(__func__, size, alignment, socket); 133 DEBUG("Extern alloc size: %lu, align: %lu: %p", size, alignment, ret); 134 return ret; 135 } 136 137 /** 138 * Verbs callback to free a memory. 139 * 140 * @param[in] ptr 141 * A pointer to the memory to free. 142 * @param[in] data 143 * A pointer to the callback data. 144 */ 145 static void 146 mlx5_free_verbs_buf(void *ptr, void *data __rte_unused) 147 { 148 assert(data != NULL); 149 DEBUG("Extern free request: %p", ptr); 150 rte_free(ptr); 151 } 152 153 /** 154 * DPDK callback to close the device. 155 * 156 * Destroy all queues and objects, free memory. 157 * 158 * @param dev 159 * Pointer to Ethernet device structure. 160 */ 161 static void 162 mlx5_dev_close(struct rte_eth_dev *dev) 163 { 164 struct priv *priv = dev->data->dev_private; 165 unsigned int i; 166 int ret; 167 168 priv_lock(priv); 169 DEBUG("%p: closing device \"%s\"", 170 (void *)dev, 171 ((priv->ctx != NULL) ? priv->ctx->device->name : "")); 172 /* In case mlx5_dev_stop() has not been called. */ 173 priv_dev_interrupt_handler_uninstall(priv, dev); 174 priv_dev_traffic_disable(priv, dev); 175 /* Prevent crashes when queues are still in use. */ 176 dev->rx_pkt_burst = removed_rx_burst; 177 dev->tx_pkt_burst = removed_tx_burst; 178 if (priv->rxqs != NULL) { 179 /* XXX race condition if mlx5_rx_burst() is still running. */ 180 usleep(1000); 181 for (i = 0; (i != priv->rxqs_n); ++i) 182 mlx5_priv_rxq_release(priv, i); 183 priv->rxqs_n = 0; 184 priv->rxqs = NULL; 185 } 186 if (priv->txqs != NULL) { 187 /* XXX race condition if mlx5_tx_burst() is still running. */ 188 usleep(1000); 189 for (i = 0; (i != priv->txqs_n); ++i) 190 mlx5_priv_txq_release(priv, i); 191 priv->txqs_n = 0; 192 priv->txqs = NULL; 193 } 194 if (priv->pd != NULL) { 195 assert(priv->ctx != NULL); 196 claim_zero(mlx5_glue->dealloc_pd(priv->pd)); 197 claim_zero(mlx5_glue->close_device(priv->ctx)); 198 } else 199 assert(priv->ctx == NULL); 200 if (priv->rss_conf.rss_key != NULL) 201 rte_free(priv->rss_conf.rss_key); 202 if (priv->reta_idx != NULL) 203 rte_free(priv->reta_idx); 204 if (priv->primary_socket) 205 priv_socket_uninit(priv); 206 ret = mlx5_priv_hrxq_ibv_verify(priv); 207 if (ret) 208 WARN("%p: some Hash Rx queue still remain", (void *)priv); 209 ret = mlx5_priv_ind_table_ibv_verify(priv); 210 if (ret) 211 WARN("%p: some Indirection table still remain", (void *)priv); 212 ret = mlx5_priv_rxq_ibv_verify(priv); 213 if (ret) 214 WARN("%p: some Verbs Rx queue still remain", (void *)priv); 215 ret = mlx5_priv_rxq_verify(priv); 216 if (ret) 217 WARN("%p: some Rx Queues still remain", (void *)priv); 218 ret = mlx5_priv_txq_ibv_verify(priv); 219 if (ret) 220 WARN("%p: some Verbs Tx queue still remain", (void *)priv); 221 ret = mlx5_priv_txq_verify(priv); 222 if (ret) 223 WARN("%p: some Tx Queues still remain", (void *)priv); 224 ret = priv_flow_verify(priv); 225 if (ret) 226 WARN("%p: some flows still remain", (void *)priv); 227 ret = priv_mr_verify(priv); 228 if (ret) 229 WARN("%p: some Memory Region still remain", (void *)priv); 230 priv_unlock(priv); 231 memset(priv, 0, sizeof(*priv)); 232 } 233 234 const struct eth_dev_ops mlx5_dev_ops = { 235 .dev_configure = mlx5_dev_configure, 236 .dev_start = mlx5_dev_start, 237 .dev_stop = mlx5_dev_stop, 238 .dev_set_link_down = mlx5_set_link_down, 239 .dev_set_link_up = mlx5_set_link_up, 240 .dev_close = mlx5_dev_close, 241 .promiscuous_enable = mlx5_promiscuous_enable, 242 .promiscuous_disable = mlx5_promiscuous_disable, 243 .allmulticast_enable = mlx5_allmulticast_enable, 244 .allmulticast_disable = mlx5_allmulticast_disable, 245 .link_update = mlx5_link_update, 246 .stats_get = mlx5_stats_get, 247 .stats_reset = mlx5_stats_reset, 248 .xstats_get = mlx5_xstats_get, 249 .xstats_reset = mlx5_xstats_reset, 250 .xstats_get_names = mlx5_xstats_get_names, 251 .dev_infos_get = mlx5_dev_infos_get, 252 .dev_supported_ptypes_get = mlx5_dev_supported_ptypes_get, 253 .vlan_filter_set = mlx5_vlan_filter_set, 254 .rx_queue_setup = mlx5_rx_queue_setup, 255 .tx_queue_setup = mlx5_tx_queue_setup, 256 .rx_queue_release = mlx5_rx_queue_release, 257 .tx_queue_release = mlx5_tx_queue_release, 258 .flow_ctrl_get = mlx5_dev_get_flow_ctrl, 259 .flow_ctrl_set = mlx5_dev_set_flow_ctrl, 260 .mac_addr_remove = mlx5_mac_addr_remove, 261 .mac_addr_add = mlx5_mac_addr_add, 262 .mac_addr_set = mlx5_mac_addr_set, 263 .mtu_set = mlx5_dev_set_mtu, 264 .vlan_strip_queue_set = mlx5_vlan_strip_queue_set, 265 .vlan_offload_set = mlx5_vlan_offload_set, 266 .reta_update = mlx5_dev_rss_reta_update, 267 .reta_query = mlx5_dev_rss_reta_query, 268 .rss_hash_update = mlx5_rss_hash_update, 269 .rss_hash_conf_get = mlx5_rss_hash_conf_get, 270 .filter_ctrl = mlx5_dev_filter_ctrl, 271 .rx_descriptor_status = mlx5_rx_descriptor_status, 272 .tx_descriptor_status = mlx5_tx_descriptor_status, 273 .rx_queue_intr_enable = mlx5_rx_intr_enable, 274 .rx_queue_intr_disable = mlx5_rx_intr_disable, 275 .is_removed = mlx5_is_removed, 276 }; 277 278 static const struct eth_dev_ops mlx5_dev_sec_ops = { 279 .stats_get = mlx5_stats_get, 280 .stats_reset = mlx5_stats_reset, 281 .xstats_get = mlx5_xstats_get, 282 .xstats_reset = mlx5_xstats_reset, 283 .xstats_get_names = mlx5_xstats_get_names, 284 .dev_infos_get = mlx5_dev_infos_get, 285 .rx_descriptor_status = mlx5_rx_descriptor_status, 286 .tx_descriptor_status = mlx5_tx_descriptor_status, 287 }; 288 289 /* Available operators in flow isolated mode. */ 290 const struct eth_dev_ops mlx5_dev_ops_isolate = { 291 .dev_configure = mlx5_dev_configure, 292 .dev_start = mlx5_dev_start, 293 .dev_stop = mlx5_dev_stop, 294 .dev_set_link_down = mlx5_set_link_down, 295 .dev_set_link_up = mlx5_set_link_up, 296 .dev_close = mlx5_dev_close, 297 .link_update = mlx5_link_update, 298 .stats_get = mlx5_stats_get, 299 .stats_reset = mlx5_stats_reset, 300 .xstats_get = mlx5_xstats_get, 301 .xstats_reset = mlx5_xstats_reset, 302 .xstats_get_names = mlx5_xstats_get_names, 303 .dev_infos_get = mlx5_dev_infos_get, 304 .dev_supported_ptypes_get = mlx5_dev_supported_ptypes_get, 305 .vlan_filter_set = mlx5_vlan_filter_set, 306 .rx_queue_setup = mlx5_rx_queue_setup, 307 .tx_queue_setup = mlx5_tx_queue_setup, 308 .rx_queue_release = mlx5_rx_queue_release, 309 .tx_queue_release = mlx5_tx_queue_release, 310 .flow_ctrl_get = mlx5_dev_get_flow_ctrl, 311 .flow_ctrl_set = mlx5_dev_set_flow_ctrl, 312 .mac_addr_remove = mlx5_mac_addr_remove, 313 .mac_addr_add = mlx5_mac_addr_add, 314 .mac_addr_set = mlx5_mac_addr_set, 315 .mtu_set = mlx5_dev_set_mtu, 316 .vlan_strip_queue_set = mlx5_vlan_strip_queue_set, 317 .vlan_offload_set = mlx5_vlan_offload_set, 318 .filter_ctrl = mlx5_dev_filter_ctrl, 319 .rx_descriptor_status = mlx5_rx_descriptor_status, 320 .tx_descriptor_status = mlx5_tx_descriptor_status, 321 .rx_queue_intr_enable = mlx5_rx_intr_enable, 322 .rx_queue_intr_disable = mlx5_rx_intr_disable, 323 .is_removed = mlx5_is_removed, 324 }; 325 326 static struct { 327 struct rte_pci_addr pci_addr; /* associated PCI address */ 328 uint32_t ports; /* physical ports bitfield. */ 329 } mlx5_dev[32]; 330 331 /** 332 * Get device index in mlx5_dev[] from PCI bus address. 333 * 334 * @param[in] pci_addr 335 * PCI bus address to look for. 336 * 337 * @return 338 * mlx5_dev[] index on success, -1 on failure. 339 */ 340 static int 341 mlx5_dev_idx(struct rte_pci_addr *pci_addr) 342 { 343 unsigned int i; 344 int ret = -1; 345 346 assert(pci_addr != NULL); 347 for (i = 0; (i != RTE_DIM(mlx5_dev)); ++i) { 348 if ((mlx5_dev[i].pci_addr.domain == pci_addr->domain) && 349 (mlx5_dev[i].pci_addr.bus == pci_addr->bus) && 350 (mlx5_dev[i].pci_addr.devid == pci_addr->devid) && 351 (mlx5_dev[i].pci_addr.function == pci_addr->function)) 352 return i; 353 if ((mlx5_dev[i].ports == 0) && (ret == -1)) 354 ret = i; 355 } 356 return ret; 357 } 358 359 /** 360 * Verify and store value for device argument. 361 * 362 * @param[in] key 363 * Key argument to verify. 364 * @param[in] val 365 * Value associated with key. 366 * @param opaque 367 * User data. 368 * 369 * @return 370 * 0 on success, negative errno value on failure. 371 */ 372 static int 373 mlx5_args_check(const char *key, const char *val, void *opaque) 374 { 375 struct mlx5_dev_config *config = opaque; 376 unsigned long tmp; 377 378 errno = 0; 379 tmp = strtoul(val, NULL, 0); 380 if (errno) { 381 WARN("%s: \"%s\" is not a valid integer", key, val); 382 return errno; 383 } 384 if (strcmp(MLX5_RXQ_CQE_COMP_EN, key) == 0) { 385 config->cqe_comp = !!tmp; 386 } else if (strcmp(MLX5_TXQ_INLINE, key) == 0) { 387 config->txq_inline = tmp; 388 } else if (strcmp(MLX5_TXQS_MIN_INLINE, key) == 0) { 389 config->txqs_inline = tmp; 390 } else if (strcmp(MLX5_TXQ_MPW_EN, key) == 0) { 391 config->mps = !!tmp ? config->mps : 0; 392 } else if (strcmp(MLX5_TXQ_MPW_HDR_DSEG_EN, key) == 0) { 393 config->mpw_hdr_dseg = !!tmp; 394 } else if (strcmp(MLX5_TXQ_MAX_INLINE_LEN, key) == 0) { 395 config->inline_max_packet_sz = tmp; 396 } else if (strcmp(MLX5_TX_VEC_EN, key) == 0) { 397 config->tx_vec_en = !!tmp; 398 } else if (strcmp(MLX5_RX_VEC_EN, key) == 0) { 399 config->rx_vec_en = !!tmp; 400 } else { 401 WARN("%s: unknown parameter", key); 402 return -EINVAL; 403 } 404 return 0; 405 } 406 407 /** 408 * Parse device parameters. 409 * 410 * @param config 411 * Pointer to device configuration structure. 412 * @param devargs 413 * Device arguments structure. 414 * 415 * @return 416 * 0 on success, errno value on failure. 417 */ 418 static int 419 mlx5_args(struct mlx5_dev_config *config, struct rte_devargs *devargs) 420 { 421 const char **params = (const char *[]){ 422 MLX5_RXQ_CQE_COMP_EN, 423 MLX5_TXQ_INLINE, 424 MLX5_TXQS_MIN_INLINE, 425 MLX5_TXQ_MPW_EN, 426 MLX5_TXQ_MPW_HDR_DSEG_EN, 427 MLX5_TXQ_MAX_INLINE_LEN, 428 MLX5_TX_VEC_EN, 429 MLX5_RX_VEC_EN, 430 NULL, 431 }; 432 struct rte_kvargs *kvlist; 433 int ret = 0; 434 int i; 435 436 if (devargs == NULL) 437 return 0; 438 /* Following UGLY cast is done to pass checkpatch. */ 439 kvlist = rte_kvargs_parse(devargs->args, params); 440 if (kvlist == NULL) 441 return 0; 442 /* Process parameters. */ 443 for (i = 0; (params[i] != NULL); ++i) { 444 if (rte_kvargs_count(kvlist, params[i])) { 445 ret = rte_kvargs_process(kvlist, params[i], 446 mlx5_args_check, config); 447 if (ret != 0) { 448 rte_kvargs_free(kvlist); 449 return ret; 450 } 451 } 452 } 453 rte_kvargs_free(kvlist); 454 return 0; 455 } 456 457 static struct rte_pci_driver mlx5_driver; 458 459 /* 460 * Reserved UAR address space for TXQ UAR(hw doorbell) mapping, process 461 * local resource used by both primary and secondary to avoid duplicate 462 * reservation. 463 * The space has to be available on both primary and secondary process, 464 * TXQ UAR maps to this area using fixed mmap w/o double check. 465 */ 466 static void *uar_base; 467 468 /** 469 * Reserve UAR address space for primary process. 470 * 471 * @param[in] priv 472 * Pointer to private structure. 473 * 474 * @return 475 * 0 on success, errno value on failure. 476 */ 477 static int 478 priv_uar_init_primary(struct priv *priv) 479 { 480 void *addr = (void *)0; 481 int i; 482 const struct rte_mem_config *mcfg; 483 int ret; 484 485 if (uar_base) { /* UAR address space mapped. */ 486 priv->uar_base = uar_base; 487 return 0; 488 } 489 /* find out lower bound of hugepage segments */ 490 mcfg = rte_eal_get_configuration()->mem_config; 491 for (i = 0; i < RTE_MAX_MEMSEG && mcfg->memseg[i].addr; i++) { 492 if (addr) 493 addr = RTE_MIN(addr, mcfg->memseg[i].addr); 494 else 495 addr = mcfg->memseg[i].addr; 496 } 497 /* keep distance to hugepages to minimize potential conflicts. */ 498 addr = RTE_PTR_SUB(addr, MLX5_UAR_OFFSET + MLX5_UAR_SIZE); 499 /* anonymous mmap, no real memory consumption. */ 500 addr = mmap(addr, MLX5_UAR_SIZE, 501 PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); 502 if (addr == MAP_FAILED) { 503 ERROR("Failed to reserve UAR address space, please adjust " 504 "MLX5_UAR_SIZE or try --base-virtaddr"); 505 ret = ENOMEM; 506 return ret; 507 } 508 /* Accept either same addr or a new addr returned from mmap if target 509 * range occupied. 510 */ 511 INFO("Reserved UAR address space: %p", addr); 512 priv->uar_base = addr; /* for primary and secondary UAR re-mmap. */ 513 uar_base = addr; /* process local, don't reserve again. */ 514 return 0; 515 } 516 517 /** 518 * Reserve UAR address space for secondary process, align with 519 * primary process. 520 * 521 * @param[in] priv 522 * Pointer to private structure. 523 * 524 * @return 525 * 0 on success, errno value on failure. 526 */ 527 static int 528 priv_uar_init_secondary(struct priv *priv) 529 { 530 void *addr; 531 int ret; 532 533 assert(priv->uar_base); 534 if (uar_base) { /* already reserved. */ 535 assert(uar_base == priv->uar_base); 536 return 0; 537 } 538 /* anonymous mmap, no real memory consumption. */ 539 addr = mmap(priv->uar_base, MLX5_UAR_SIZE, 540 PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); 541 if (addr == MAP_FAILED) { 542 ERROR("UAR mmap failed: %p size: %llu", 543 priv->uar_base, MLX5_UAR_SIZE); 544 ret = ENXIO; 545 return ret; 546 } 547 if (priv->uar_base != addr) { 548 ERROR("UAR address %p size %llu occupied, please adjust " 549 "MLX5_UAR_OFFSET or try EAL parameter --base-virtaddr", 550 priv->uar_base, MLX5_UAR_SIZE); 551 ret = ENXIO; 552 return ret; 553 } 554 uar_base = addr; /* process local, don't reserve again */ 555 INFO("Reserved UAR address space: %p", addr); 556 return 0; 557 } 558 559 /** 560 * DPDK callback to register a PCI device. 561 * 562 * This function creates an Ethernet device for each port of a given 563 * PCI device. 564 * 565 * @param[in] pci_drv 566 * PCI driver structure (mlx5_driver). 567 * @param[in] pci_dev 568 * PCI device information. 569 * 570 * @return 571 * 0 on success, negative errno value on failure. 572 */ 573 static int 574 mlx5_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev) 575 { 576 struct ibv_device **list; 577 struct ibv_device *ibv_dev; 578 int err = 0; 579 struct ibv_context *attr_ctx = NULL; 580 struct ibv_device_attr_ex device_attr; 581 unsigned int sriov; 582 unsigned int mps; 583 unsigned int cqe_comp; 584 unsigned int tunnel_en = 0; 585 int idx; 586 int i; 587 struct mlx5dv_context attrs_out; 588 #ifdef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT 589 struct ibv_counter_set_description cs_desc; 590 #endif 591 592 (void)pci_drv; 593 assert(pci_drv == &mlx5_driver); 594 /* Get mlx5_dev[] index. */ 595 idx = mlx5_dev_idx(&pci_dev->addr); 596 if (idx == -1) { 597 ERROR("this driver cannot support any more adapters"); 598 return -ENOMEM; 599 } 600 DEBUG("using driver device index %d", idx); 601 602 /* Save PCI address. */ 603 mlx5_dev[idx].pci_addr = pci_dev->addr; 604 list = mlx5_glue->get_device_list(&i); 605 if (list == NULL) { 606 assert(errno); 607 if (errno == ENOSYS) 608 ERROR("cannot list devices, is ib_uverbs loaded?"); 609 return -errno; 610 } 611 assert(i >= 0); 612 /* 613 * For each listed device, check related sysfs entry against 614 * the provided PCI ID. 615 */ 616 while (i != 0) { 617 struct rte_pci_addr pci_addr; 618 619 --i; 620 DEBUG("checking device \"%s\"", list[i]->name); 621 if (mlx5_ibv_device_to_pci_addr(list[i], &pci_addr)) 622 continue; 623 if ((pci_dev->addr.domain != pci_addr.domain) || 624 (pci_dev->addr.bus != pci_addr.bus) || 625 (pci_dev->addr.devid != pci_addr.devid) || 626 (pci_dev->addr.function != pci_addr.function)) 627 continue; 628 sriov = ((pci_dev->id.device_id == 629 PCI_DEVICE_ID_MELLANOX_CONNECTX4VF) || 630 (pci_dev->id.device_id == 631 PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF) || 632 (pci_dev->id.device_id == 633 PCI_DEVICE_ID_MELLANOX_CONNECTX5VF) || 634 (pci_dev->id.device_id == 635 PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF)); 636 switch (pci_dev->id.device_id) { 637 case PCI_DEVICE_ID_MELLANOX_CONNECTX4: 638 tunnel_en = 1; 639 break; 640 case PCI_DEVICE_ID_MELLANOX_CONNECTX4LX: 641 case PCI_DEVICE_ID_MELLANOX_CONNECTX5: 642 case PCI_DEVICE_ID_MELLANOX_CONNECTX5VF: 643 case PCI_DEVICE_ID_MELLANOX_CONNECTX5EX: 644 case PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF: 645 tunnel_en = 1; 646 break; 647 default: 648 break; 649 } 650 INFO("PCI information matches, using device \"%s\"" 651 " (SR-IOV: %s)", 652 list[i]->name, 653 sriov ? "true" : "false"); 654 attr_ctx = mlx5_glue->open_device(list[i]); 655 err = errno; 656 break; 657 } 658 if (attr_ctx == NULL) { 659 mlx5_glue->free_device_list(list); 660 switch (err) { 661 case 0: 662 ERROR("cannot access device, is mlx5_ib loaded?"); 663 return -ENODEV; 664 case EINVAL: 665 ERROR("cannot use device, are drivers up to date?"); 666 return -EINVAL; 667 } 668 assert(err > 0); 669 return -err; 670 } 671 ibv_dev = list[i]; 672 673 DEBUG("device opened"); 674 /* 675 * Multi-packet send is supported by ConnectX-4 Lx PF as well 676 * as all ConnectX-5 devices. 677 */ 678 mlx5_glue->dv_query_device(attr_ctx, &attrs_out); 679 if (attrs_out.flags & MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED) { 680 if (attrs_out.flags & MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW) { 681 DEBUG("Enhanced MPW is supported"); 682 mps = MLX5_MPW_ENHANCED; 683 } else { 684 DEBUG("MPW is supported"); 685 mps = MLX5_MPW; 686 } 687 } else { 688 DEBUG("MPW isn't supported"); 689 mps = MLX5_MPW_DISABLED; 690 } 691 if (RTE_CACHE_LINE_SIZE == 128 && 692 !(attrs_out.flags & MLX5DV_CONTEXT_FLAGS_CQE_128B_COMP)) 693 cqe_comp = 0; 694 else 695 cqe_comp = 1; 696 if (mlx5_glue->query_device_ex(attr_ctx, NULL, &device_attr)) 697 goto error; 698 INFO("%u port(s) detected", device_attr.orig_attr.phys_port_cnt); 699 700 for (i = 0; i < device_attr.orig_attr.phys_port_cnt; i++) { 701 char name[RTE_ETH_NAME_MAX_LEN]; 702 int len; 703 uint32_t port = i + 1; /* ports are indexed from one */ 704 uint32_t test = (1 << i); 705 struct ibv_context *ctx = NULL; 706 struct ibv_port_attr port_attr; 707 struct ibv_pd *pd = NULL; 708 struct priv *priv = NULL; 709 struct rte_eth_dev *eth_dev; 710 struct ibv_device_attr_ex device_attr_ex; 711 struct ether_addr mac; 712 uint16_t num_vfs = 0; 713 struct ibv_device_attr_ex device_attr; 714 struct mlx5_dev_config config = { 715 .cqe_comp = cqe_comp, 716 .mps = mps, 717 .tunnel_en = tunnel_en, 718 .tx_vec_en = 1, 719 .rx_vec_en = 1, 720 .mpw_hdr_dseg = 0, 721 .txq_inline = MLX5_ARG_UNSET, 722 .txqs_inline = MLX5_ARG_UNSET, 723 .inline_max_packet_sz = MLX5_ARG_UNSET, 724 }; 725 726 len = snprintf(name, sizeof(name), PCI_PRI_FMT, 727 pci_dev->addr.domain, pci_dev->addr.bus, 728 pci_dev->addr.devid, pci_dev->addr.function); 729 if (device_attr.orig_attr.phys_port_cnt > 1) 730 snprintf(name + len, sizeof(name), " port %u", i); 731 732 mlx5_dev[idx].ports |= test; 733 734 if (rte_eal_process_type() == RTE_PROC_SECONDARY) { 735 eth_dev = rte_eth_dev_attach_secondary(name); 736 if (eth_dev == NULL) { 737 ERROR("can not attach rte ethdev"); 738 err = ENOMEM; 739 goto error; 740 } 741 eth_dev->device = &pci_dev->device; 742 eth_dev->dev_ops = &mlx5_dev_sec_ops; 743 priv = eth_dev->data->dev_private; 744 err = priv_uar_init_secondary(priv); 745 if (err < 0) { 746 err = -err; 747 goto error; 748 } 749 /* Receive command fd from primary process */ 750 err = priv_socket_connect(priv); 751 if (err < 0) { 752 err = -err; 753 goto error; 754 } 755 /* Remap UAR for Tx queues. */ 756 err = priv_tx_uar_remap(priv, err); 757 if (err) 758 goto error; 759 /* 760 * Ethdev pointer is still required as input since 761 * the primary device is not accessible from the 762 * secondary process. 763 */ 764 eth_dev->rx_pkt_burst = 765 priv_select_rx_function(priv, eth_dev); 766 eth_dev->tx_pkt_burst = 767 priv_select_tx_function(priv, eth_dev); 768 continue; 769 } 770 771 DEBUG("using port %u (%08" PRIx32 ")", port, test); 772 773 ctx = mlx5_glue->open_device(ibv_dev); 774 if (ctx == NULL) { 775 err = ENODEV; 776 goto port_error; 777 } 778 779 mlx5_glue->query_device_ex(ctx, NULL, &device_attr); 780 /* Check port status. */ 781 err = mlx5_glue->query_port(ctx, port, &port_attr); 782 if (err) { 783 ERROR("port query failed: %s", strerror(err)); 784 goto port_error; 785 } 786 787 if (port_attr.link_layer != IBV_LINK_LAYER_ETHERNET) { 788 ERROR("port %d is not configured in Ethernet mode", 789 port); 790 err = EINVAL; 791 goto port_error; 792 } 793 794 if (port_attr.state != IBV_PORT_ACTIVE) 795 DEBUG("port %d is not active: \"%s\" (%d)", 796 port, mlx5_glue->port_state_str(port_attr.state), 797 port_attr.state); 798 799 /* Allocate protection domain. */ 800 pd = mlx5_glue->alloc_pd(ctx); 801 if (pd == NULL) { 802 ERROR("PD allocation failure"); 803 err = ENOMEM; 804 goto port_error; 805 } 806 807 mlx5_dev[idx].ports |= test; 808 809 /* from rte_ethdev.c */ 810 priv = rte_zmalloc("ethdev private structure", 811 sizeof(*priv), 812 RTE_CACHE_LINE_SIZE); 813 if (priv == NULL) { 814 ERROR("priv allocation failure"); 815 err = ENOMEM; 816 goto port_error; 817 } 818 819 priv->ctx = ctx; 820 strncpy(priv->ibdev_path, priv->ctx->device->ibdev_path, 821 sizeof(priv->ibdev_path)); 822 priv->device_attr = device_attr; 823 priv->port = port; 824 priv->pd = pd; 825 priv->mtu = ETHER_MTU; 826 err = mlx5_args(&config, pci_dev->device.devargs); 827 if (err) { 828 ERROR("failed to process device arguments: %s", 829 strerror(err)); 830 goto port_error; 831 } 832 if (mlx5_glue->query_device_ex(ctx, NULL, &device_attr_ex)) { 833 ERROR("ibv_query_device_ex() failed"); 834 goto port_error; 835 } 836 837 config.hw_csum = !!(device_attr_ex.device_cap_flags_ex & 838 IBV_DEVICE_RAW_IP_CSUM); 839 DEBUG("checksum offloading is %ssupported", 840 (config.hw_csum ? "" : "not ")); 841 842 #ifdef HAVE_IBV_DEVICE_VXLAN_SUPPORT 843 config.hw_csum_l2tun = 844 !!(exp_device_attr.exp_device_cap_flags & 845 IBV_DEVICE_VXLAN_SUPPORT); 846 #endif 847 DEBUG("Rx L2 tunnel checksum offloads are %ssupported", 848 (config.hw_csum_l2tun ? "" : "not ")); 849 850 #ifdef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT 851 config.flow_counter_en = !!(device_attr.max_counter_sets); 852 mlx5_glue->describe_counter_set(ctx, 0, &cs_desc); 853 DEBUG("counter type = %d, num of cs = %ld, attributes = %d", 854 cs_desc.counter_type, cs_desc.num_of_cs, 855 cs_desc.attributes); 856 #endif 857 config.ind_table_max_size = 858 device_attr_ex.rss_caps.max_rwq_indirection_table_size; 859 /* Remove this check once DPDK supports larger/variable 860 * indirection tables. */ 861 if (config.ind_table_max_size > 862 (unsigned int)ETH_RSS_RETA_SIZE_512) 863 config.ind_table_max_size = ETH_RSS_RETA_SIZE_512; 864 DEBUG("maximum RX indirection table size is %u", 865 config.ind_table_max_size); 866 config.hw_vlan_strip = !!(device_attr_ex.raw_packet_caps & 867 IBV_RAW_PACKET_CAP_CVLAN_STRIPPING); 868 DEBUG("VLAN stripping is %ssupported", 869 (config.hw_vlan_strip ? "" : "not ")); 870 871 config.hw_fcs_strip = !!(device_attr_ex.raw_packet_caps & 872 IBV_RAW_PACKET_CAP_SCATTER_FCS); 873 DEBUG("FCS stripping configuration is %ssupported", 874 (config.hw_fcs_strip ? "" : "not ")); 875 876 #ifdef HAVE_IBV_WQ_FLAG_RX_END_PADDING 877 config.hw_padding = !!device_attr_ex.rx_pad_end_addr_align; 878 #endif 879 DEBUG("hardware RX end alignment padding is %ssupported", 880 (config.hw_padding ? "" : "not ")); 881 882 priv_get_num_vfs(priv, &num_vfs); 883 config.sriov = (num_vfs || sriov); 884 config.tso = ((device_attr_ex.tso_caps.max_tso > 0) && 885 (device_attr_ex.tso_caps.supported_qpts & 886 (1 << IBV_QPT_RAW_PACKET))); 887 if (config.tso) 888 config.tso_max_payload_sz = 889 device_attr_ex.tso_caps.max_tso; 890 if (config.mps && !mps) { 891 ERROR("multi-packet send not supported on this device" 892 " (" MLX5_TXQ_MPW_EN ")"); 893 err = ENOTSUP; 894 goto port_error; 895 } 896 INFO("%sMPS is %s", 897 config.mps == MLX5_MPW_ENHANCED ? "Enhanced " : "", 898 config.mps != MLX5_MPW_DISABLED ? "enabled" : "disabled"); 899 if (config.cqe_comp && !cqe_comp) { 900 WARN("Rx CQE compression isn't supported"); 901 config.cqe_comp = 0; 902 } 903 err = priv_uar_init_primary(priv); 904 if (err) 905 goto port_error; 906 /* Configure the first MAC address by default. */ 907 if (priv_get_mac(priv, &mac.addr_bytes)) { 908 ERROR("cannot get MAC address, is mlx5_en loaded?" 909 " (errno: %s)", strerror(errno)); 910 err = ENODEV; 911 goto port_error; 912 } 913 INFO("port %u MAC address is %02x:%02x:%02x:%02x:%02x:%02x", 914 priv->port, 915 mac.addr_bytes[0], mac.addr_bytes[1], 916 mac.addr_bytes[2], mac.addr_bytes[3], 917 mac.addr_bytes[4], mac.addr_bytes[5]); 918 #ifndef NDEBUG 919 { 920 char ifname[IF_NAMESIZE]; 921 922 if (priv_get_ifname(priv, &ifname) == 0) 923 DEBUG("port %u ifname is \"%s\"", 924 priv->port, ifname); 925 else 926 DEBUG("port %u ifname is unknown", priv->port); 927 } 928 #endif 929 /* Get actual MTU if possible. */ 930 priv_get_mtu(priv, &priv->mtu); 931 DEBUG("port %u MTU is %u", priv->port, priv->mtu); 932 933 eth_dev = rte_eth_dev_allocate(name); 934 if (eth_dev == NULL) { 935 ERROR("can not allocate rte ethdev"); 936 err = ENOMEM; 937 goto port_error; 938 } 939 eth_dev->data->dev_private = priv; 940 eth_dev->data->mac_addrs = priv->mac; 941 eth_dev->device = &pci_dev->device; 942 rte_eth_copy_pci_info(eth_dev, pci_dev); 943 eth_dev->device->driver = &mlx5_driver.driver; 944 /* 945 * Initialize burst functions to prevent crashes before link-up. 946 */ 947 eth_dev->rx_pkt_burst = removed_rx_burst; 948 eth_dev->tx_pkt_burst = removed_tx_burst; 949 priv->dev = eth_dev; 950 eth_dev->dev_ops = &mlx5_dev_ops; 951 /* Register MAC address. */ 952 claim_zero(mlx5_mac_addr_add(eth_dev, &mac, 0, 0)); 953 TAILQ_INIT(&priv->flows); 954 TAILQ_INIT(&priv->ctrl_flows); 955 956 /* Hint libmlx5 to use PMD allocator for data plane resources */ 957 struct mlx5dv_ctx_allocators alctr = { 958 .alloc = &mlx5_alloc_verbs_buf, 959 .free = &mlx5_free_verbs_buf, 960 .data = priv, 961 }; 962 mlx5_glue->dv_set_context_attr(ctx, 963 MLX5DV_CTX_ATTR_BUF_ALLOCATORS, 964 (void *)((uintptr_t)&alctr)); 965 966 /* Bring Ethernet device up. */ 967 DEBUG("forcing Ethernet interface up"); 968 priv_set_flags(priv, ~IFF_UP, IFF_UP); 969 /* Store device configuration on private structure. */ 970 priv->config = config; 971 continue; 972 973 port_error: 974 if (priv) 975 rte_free(priv); 976 if (pd) 977 claim_zero(mlx5_glue->dealloc_pd(pd)); 978 if (ctx) 979 claim_zero(mlx5_glue->close_device(ctx)); 980 break; 981 } 982 983 /* 984 * XXX if something went wrong in the loop above, there is a resource 985 * leak (ctx, pd, priv, dpdk ethdev) but we can do nothing about it as 986 * long as the dpdk does not provide a way to deallocate a ethdev and a 987 * way to enumerate the registered ethdevs to free the previous ones. 988 */ 989 990 /* no port found, complain */ 991 if (!mlx5_dev[idx].ports) { 992 err = ENODEV; 993 goto error; 994 } 995 996 error: 997 if (attr_ctx) 998 claim_zero(mlx5_glue->close_device(attr_ctx)); 999 if (list) 1000 mlx5_glue->free_device_list(list); 1001 assert(err >= 0); 1002 return -err; 1003 } 1004 1005 static const struct rte_pci_id mlx5_pci_id_map[] = { 1006 { 1007 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, 1008 PCI_DEVICE_ID_MELLANOX_CONNECTX4) 1009 }, 1010 { 1011 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, 1012 PCI_DEVICE_ID_MELLANOX_CONNECTX4VF) 1013 }, 1014 { 1015 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, 1016 PCI_DEVICE_ID_MELLANOX_CONNECTX4LX) 1017 }, 1018 { 1019 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, 1020 PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF) 1021 }, 1022 { 1023 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, 1024 PCI_DEVICE_ID_MELLANOX_CONNECTX5) 1025 }, 1026 { 1027 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, 1028 PCI_DEVICE_ID_MELLANOX_CONNECTX5VF) 1029 }, 1030 { 1031 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, 1032 PCI_DEVICE_ID_MELLANOX_CONNECTX5EX) 1033 }, 1034 { 1035 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, 1036 PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF) 1037 }, 1038 { 1039 .vendor_id = 0 1040 } 1041 }; 1042 1043 static struct rte_pci_driver mlx5_driver = { 1044 .driver = { 1045 .name = MLX5_DRIVER_NAME 1046 }, 1047 .id_table = mlx5_pci_id_map, 1048 .probe = mlx5_pci_probe, 1049 .drv_flags = RTE_PCI_DRV_INTR_LSC | RTE_PCI_DRV_INTR_RMV, 1050 }; 1051 1052 #ifdef RTE_LIBRTE_MLX5_DLOPEN_DEPS 1053 1054 /** 1055 * Initialization routine for run-time dependency on rdma-core. 1056 */ 1057 static int 1058 mlx5_glue_init(void) 1059 { 1060 const char *path[] = { 1061 /* 1062 * A basic security check is necessary before trusting 1063 * MLX5_GLUE_PATH, which may override RTE_EAL_PMD_PATH. 1064 */ 1065 (geteuid() == getuid() && getegid() == getgid() ? 1066 getenv("MLX5_GLUE_PATH") : NULL), 1067 RTE_EAL_PMD_PATH, 1068 }; 1069 unsigned int i = 0; 1070 void *handle = NULL; 1071 void **sym; 1072 const char *dlmsg; 1073 1074 while (!handle && i != RTE_DIM(path)) { 1075 const char *end; 1076 size_t len; 1077 int ret; 1078 1079 if (!path[i]) { 1080 ++i; 1081 continue; 1082 } 1083 end = strpbrk(path[i], ":;"); 1084 if (!end) 1085 end = path[i] + strlen(path[i]); 1086 len = end - path[i]; 1087 ret = 0; 1088 do { 1089 char name[ret + 1]; 1090 1091 ret = snprintf(name, sizeof(name), "%.*s%s" MLX5_GLUE, 1092 (int)len, path[i], 1093 (!len || *(end - 1) == '/') ? "" : "/"); 1094 if (ret == -1) 1095 break; 1096 if (sizeof(name) != (size_t)ret + 1) 1097 continue; 1098 DEBUG("looking for rdma-core glue as \"%s\"", name); 1099 handle = dlopen(name, RTLD_LAZY); 1100 break; 1101 } while (1); 1102 path[i] = end + 1; 1103 if (!*end) 1104 ++i; 1105 } 1106 if (!handle) { 1107 rte_errno = EINVAL; 1108 dlmsg = dlerror(); 1109 if (dlmsg) 1110 WARN("cannot load glue library: %s", dlmsg); 1111 goto glue_error; 1112 } 1113 sym = dlsym(handle, "mlx5_glue"); 1114 if (!sym || !*sym) { 1115 rte_errno = EINVAL; 1116 dlmsg = dlerror(); 1117 if (dlmsg) 1118 ERROR("cannot resolve glue symbol: %s", dlmsg); 1119 goto glue_error; 1120 } 1121 mlx5_glue = *sym; 1122 return 0; 1123 glue_error: 1124 if (handle) 1125 dlclose(handle); 1126 WARN("cannot initialize PMD due to missing run-time" 1127 " dependency on rdma-core libraries (libibverbs," 1128 " libmlx5)"); 1129 return -rte_errno; 1130 } 1131 1132 #endif 1133 1134 /** 1135 * Driver initialization routine. 1136 */ 1137 RTE_INIT(rte_mlx5_pmd_init); 1138 static void 1139 rte_mlx5_pmd_init(void) 1140 { 1141 /* Build the static table for ptype conversion. */ 1142 mlx5_set_ptype_table(); 1143 /* 1144 * RDMAV_HUGEPAGES_SAFE tells ibv_fork_init() we intend to use 1145 * huge pages. Calling ibv_fork_init() during init allows 1146 * applications to use fork() safely for purposes other than 1147 * using this PMD, which is not supported in forked processes. 1148 */ 1149 setenv("RDMAV_HUGEPAGES_SAFE", "1", 1); 1150 /* Match the size of Rx completion entry to the size of a cacheline. */ 1151 if (RTE_CACHE_LINE_SIZE == 128) 1152 setenv("MLX5_CQE_SIZE", "128", 0); 1153 #ifdef RTE_LIBRTE_MLX5_DLOPEN_DEPS 1154 if (mlx5_glue_init()) 1155 return; 1156 assert(mlx5_glue); 1157 #endif 1158 #ifndef NDEBUG 1159 /* Glue structure must not contain any NULL pointers. */ 1160 { 1161 unsigned int i; 1162 1163 for (i = 0; i != sizeof(*mlx5_glue) / sizeof(void *); ++i) 1164 assert(((const void *const *)mlx5_glue)[i]); 1165 } 1166 #endif 1167 if (strcmp(mlx5_glue->version, MLX5_GLUE_VERSION)) { 1168 ERROR("rdma-core glue \"%s\" mismatch: \"%s\" is required", 1169 mlx5_glue->version, MLX5_GLUE_VERSION); 1170 return; 1171 } 1172 mlx5_glue->fork_init(); 1173 rte_pci_register(&mlx5_driver); 1174 } 1175 1176 RTE_PMD_EXPORT_NAME(net_mlx5, __COUNTER__); 1177 RTE_PMD_REGISTER_PCI_TABLE(net_mlx5, mlx5_pci_id_map); 1178 RTE_PMD_REGISTER_KMOD_DEP(net_mlx5, "* ib_uverbs & mlx5_core & mlx5_ib"); 1179