/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright 2015 6WIND S.A.
 * Copyright 2015 Mellanox Technologies, Ltd
 */

#include <stddef.h>
#include <unistd.h>
#include <string.h>
#include <assert.h>
#include <dlfcn.h>
#include <stdint.h>
#include <stdlib.h>
#include <errno.h>
#include <net/if.h>
#include <sys/mman.h>
#include <linux/rtnetlink.h>

/* Verbs header. */
/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
#ifdef PEDANTIC
#pragma GCC diagnostic ignored "-Wpedantic"
#endif
#include <infiniband/verbs.h>
#ifdef PEDANTIC
#pragma GCC diagnostic error "-Wpedantic"
#endif

#include <rte_malloc.h>
#include <rte_ethdev_driver.h>
#include <rte_ethdev_pci.h>
#include <rte_pci.h>
#include <rte_bus_pci.h>
#include <rte_common.h>
#include <rte_config.h>
#include <rte_eal_memconfig.h>
#include <rte_kvargs.h>
#include <rte_rwlock.h>
#include <rte_spinlock.h>

#include "mlx5.h"
#include "mlx5_utils.h"
#include "mlx5_rxtx.h"
#include "mlx5_autoconf.h"
#include "mlx5_defs.h"
#include "mlx5_glue.h"
#include "mlx5_mr.h"

/* Device parameter to enable RX completion queue compression. */
#define MLX5_RXQ_CQE_COMP_EN "rxq_cqe_comp_en"

/* Device parameter to enable Multi-Packet Rx queue. */
#define MLX5_RX_MPRQ_EN "mprq_en"

/* Device parameter to configure log 2 of the number of strides for MPRQ. */
#define MLX5_RX_MPRQ_LOG_STRIDE_NUM "mprq_log_stride_num"

/* Device parameter to limit the size of memcpy'd packet for MPRQ. */
#define MLX5_RX_MPRQ_MAX_MEMCPY_LEN "mprq_max_memcpy_len"

/* Device parameter to set the minimum number of Rx queues to enable MPRQ. */
#define MLX5_RXQS_MIN_MPRQ "rxqs_min_mprq"

/* Device parameter to configure inline send. */
#define MLX5_TXQ_INLINE "txq_inline"

/*
 * Device parameter to configure the number of TX queues threshold for
 * enabling inline send.
 */
#define MLX5_TXQS_MIN_INLINE "txqs_min_inline"

/* Device parameter to enable multi-packet send WQEs. */
#define MLX5_TXQ_MPW_EN "txq_mpw_en"

/* Device parameter to include 2 dsegs in the title WQEBB. */
#define MLX5_TXQ_MPW_HDR_DSEG_EN "txq_mpw_hdr_dseg_en"

/* Device parameter to limit the size of inlining packet. */
#define MLX5_TXQ_MAX_INLINE_LEN "txq_max_inline_len"

/* Device parameter to enable hardware Tx vector. */
#define MLX5_TX_VEC_EN "tx_vec_en"

/* Device parameter to enable hardware Rx vector. */
#define MLX5_RX_VEC_EN "rx_vec_en"

/* Allow L3 VXLAN flow creation. */
#define MLX5_L3_VXLAN_EN "l3_vxlan_en"

/* Activate Netlink support in VF mode. */
#define MLX5_VF_NL_EN "vf_nl_en"

#ifndef HAVE_IBV_MLX5_MOD_MPW
#define MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED (1 << 2)
#define MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW (1 << 3)
#endif

#ifndef HAVE_IBV_MLX5_MOD_CQE_128B_COMP
#define MLX5DV_CONTEXT_FLAGS_CQE_128B_COMP (1 << 4)
#endif

static const char *MZ_MLX5_PMD_SHARED_DATA = "mlx5_pmd_shared_data";

/* Shared memory between primary and secondary processes. */
struct mlx5_shared_data *mlx5_shared_data;

/* Spinlock for mlx5_shared_data allocation. */
static rte_spinlock_t mlx5_shared_data_lock = RTE_SPINLOCK_INITIALIZER;

/** Driver-specific log messages type. */
int mlx5_logtype;

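/*
 * Example (address and values are illustrative only): the MLX5_* parameters
 * defined above are passed as device arguments appended to the PCI address,
 * e.g.
 *
 *   -w 0000:05:00.0,rxq_cqe_comp_en=1,txq_inline=200,txqs_min_inline=4
 *
 * They are parsed at probe time by mlx5_args() further below.
 */
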
/**
 * Prepare shared data between primary and secondary process.
 */
static void
mlx5_prepare_shared_data(void)
{
	const struct rte_memzone *mz;

	rte_spinlock_lock(&mlx5_shared_data_lock);
	if (mlx5_shared_data == NULL) {
		if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
			/* Allocate shared memory. */
			mz = rte_memzone_reserve(MZ_MLX5_PMD_SHARED_DATA,
						 sizeof(*mlx5_shared_data),
						 SOCKET_ID_ANY, 0);
		} else {
			/* Lookup allocated shared memory. */
			mz = rte_memzone_lookup(MZ_MLX5_PMD_SHARED_DATA);
		}
		if (mz == NULL)
			rte_panic("Cannot allocate mlx5 shared data\n");
		mlx5_shared_data = mz->addr;
		/* Initialize shared data. */
		if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
			LIST_INIT(&mlx5_shared_data->mem_event_cb_list);
			rte_rwlock_init(&mlx5_shared_data->mem_event_rwlock);
		}
		rte_mem_event_callback_register("MLX5_MEM_EVENT_CB",
						mlx5_mr_mem_event_cb, NULL);
	}
	rte_spinlock_unlock(&mlx5_shared_data_lock);
}

/**
 * Retrieve integer value from environment variable.
 *
 * @param[in] name
 *   Environment variable name.
 *
 * @return
 *   Integer value, 0 if the variable is not set.
 */
int
mlx5_getenv_int(const char *name)
{
	const char *val = getenv(name);

	if (val == NULL)
		return 0;
	return atoi(val);
}

/**
 * Verbs callback to allocate memory. This function should allocate space of
 * the given size inside a huge page.
 * Please note that all allocations must respect the alignment from libmlx5
 * (i.e. currently sysconf(_SC_PAGESIZE)).
 *
 * @param[in] size
 *   The size in bytes of the memory to allocate.
 * @param[in] data
 *   A pointer to the callback data.
 *
 * @return
 *   Allocated buffer, NULL otherwise and rte_errno is set.
 */
static void *
mlx5_alloc_verbs_buf(size_t size, void *data)
{
	struct priv *priv = data;
	void *ret;
	size_t alignment = sysconf(_SC_PAGESIZE);
	unsigned int socket = SOCKET_ID_ANY;

	if (priv->verbs_alloc_ctx.type == MLX5_VERBS_ALLOC_TYPE_TX_QUEUE) {
		const struct mlx5_txq_ctrl *ctrl = priv->verbs_alloc_ctx.obj;

		socket = ctrl->socket;
	} else if (priv->verbs_alloc_ctx.type ==
		   MLX5_VERBS_ALLOC_TYPE_RX_QUEUE) {
		const struct mlx5_rxq_ctrl *ctrl = priv->verbs_alloc_ctx.obj;

		socket = ctrl->socket;
	}
	assert(data != NULL);
	ret = rte_malloc_socket(__func__, size, alignment, socket);
	if (!ret && size)
		rte_errno = ENOMEM;
	return ret;
}

/**
 * Verbs callback to free memory.
 *
 * @param[in] ptr
 *   A pointer to the memory to free.
 * @param[in] data
 *   A pointer to the callback data.
 */
static void
mlx5_free_verbs_buf(void *ptr, void *data __rte_unused)
{
	assert(data != NULL);
	rte_free(ptr);
}

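/*
 * Note: mlx5_alloc_verbs_buf() and mlx5_free_verbs_buf() are registered with
 * libmlx5 via mlx5_glue->dv_set_context_attr(MLX5DV_CTX_ATTR_BUF_ALLOCATORS)
 * in mlx5_pci_probe() below, so that Verbs data-plane buffers are taken from
 * DPDK hugepage memory on the socket of the queue being created.
 */
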
/**
 * DPDK callback to close the device.
 *
 * Destroy all queues and objects, free memory.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 */
static void
mlx5_dev_close(struct rte_eth_dev *dev)
{
	struct priv *priv = dev->data->dev_private;
	unsigned int i;
	int ret;

	DRV_LOG(DEBUG, "port %u closing device \"%s\"",
		dev->data->port_id,
		((priv->ctx != NULL) ? priv->ctx->device->name : ""));
	/* In case mlx5_dev_stop() has not been called. */
	mlx5_dev_interrupt_handler_uninstall(dev);
	mlx5_traffic_disable(dev);
	/* Prevent crashes when queues are still in use. */
	dev->rx_pkt_burst = removed_rx_burst;
	dev->tx_pkt_burst = removed_tx_burst;
	if (priv->rxqs != NULL) {
		/* XXX race condition if mlx5_rx_burst() is still running. */
		usleep(1000);
		for (i = 0; (i != priv->rxqs_n); ++i)
			mlx5_rxq_release(dev, i);
		priv->rxqs_n = 0;
		priv->rxqs = NULL;
	}
	if (priv->txqs != NULL) {
		/* XXX race condition if mlx5_tx_burst() is still running. */
		usleep(1000);
		for (i = 0; (i != priv->txqs_n); ++i)
			mlx5_txq_release(dev, i);
		priv->txqs_n = 0;
		priv->txqs = NULL;
	}
	mlx5_flow_delete_drop_queue(dev);
	mlx5_mprq_free_mp(dev);
	mlx5_mr_release(dev);
	if (priv->pd != NULL) {
		assert(priv->ctx != NULL);
		claim_zero(mlx5_glue->dealloc_pd(priv->pd));
		claim_zero(mlx5_glue->close_device(priv->ctx));
	} else
		assert(priv->ctx == NULL);
	if (priv->rss_conf.rss_key != NULL)
		rte_free(priv->rss_conf.rss_key);
	if (priv->reta_idx != NULL)
		rte_free(priv->reta_idx);
	if (priv->primary_socket)
		mlx5_socket_uninit(dev);
	if (priv->config.vf)
		mlx5_nl_mac_addr_flush(dev);
	if (priv->nl_socket >= 0)
		close(priv->nl_socket);
	ret = mlx5_hrxq_ibv_verify(dev);
	if (ret)
		DRV_LOG(WARNING, "port %u some hash Rx queue still remain",
			dev->data->port_id);
	ret = mlx5_ind_table_ibv_verify(dev);
	if (ret)
		DRV_LOG(WARNING, "port %u some indirection table still remain",
			dev->data->port_id);
	ret = mlx5_rxq_ibv_verify(dev);
	if (ret)
		DRV_LOG(WARNING, "port %u some Verbs Rx queue still remain",
			dev->data->port_id);
	ret = mlx5_rxq_verify(dev);
	if (ret)
		DRV_LOG(WARNING, "port %u some Rx queues still remain",
			dev->data->port_id);
	ret = mlx5_txq_ibv_verify(dev);
	if (ret)
		DRV_LOG(WARNING, "port %u some Verbs Tx queue still remain",
			dev->data->port_id);
	ret = mlx5_txq_verify(dev);
	if (ret)
		DRV_LOG(WARNING, "port %u some Tx queues still remain",
			dev->data->port_id);
	ret = mlx5_flow_verify(dev);
	if (ret)
		DRV_LOG(WARNING, "port %u some flows still remain",
			dev->data->port_id);
	memset(priv, 0, sizeof(*priv));
}

const struct eth_dev_ops mlx5_dev_ops = {
	.dev_configure = mlx5_dev_configure,
	.dev_start = mlx5_dev_start,
	.dev_stop = mlx5_dev_stop,
	.dev_set_link_down = mlx5_set_link_down,
	.dev_set_link_up = mlx5_set_link_up,
	.dev_close = mlx5_dev_close,
	.promiscuous_enable = mlx5_promiscuous_enable,
	.promiscuous_disable = mlx5_promiscuous_disable,
	.allmulticast_enable = mlx5_allmulticast_enable,
	.allmulticast_disable = mlx5_allmulticast_disable,
	.link_update = mlx5_link_update,
	.stats_get = mlx5_stats_get,
	.stats_reset = mlx5_stats_reset,
	.xstats_get = mlx5_xstats_get,
	.xstats_reset = mlx5_xstats_reset,
	.xstats_get_names = mlx5_xstats_get_names,
	.dev_infos_get = mlx5_dev_infos_get,
	.dev_supported_ptypes_get = mlx5_dev_supported_ptypes_get,
	.vlan_filter_set = mlx5_vlan_filter_set,
	.rx_queue_setup = mlx5_rx_queue_setup,
	.tx_queue_setup = mlx5_tx_queue_setup,
	.rx_queue_release = mlx5_rx_queue_release,
	.tx_queue_release = mlx5_tx_queue_release,
	.flow_ctrl_get = mlx5_dev_get_flow_ctrl,
	.flow_ctrl_set = mlx5_dev_set_flow_ctrl,
	.mac_addr_remove = mlx5_mac_addr_remove,
	.mac_addr_add = mlx5_mac_addr_add,
	.mac_addr_set = mlx5_mac_addr_set,
	.set_mc_addr_list = mlx5_set_mc_addr_list,
	.mtu_set = mlx5_dev_set_mtu,
	.vlan_strip_queue_set = mlx5_vlan_strip_queue_set,
	.vlan_offload_set = mlx5_vlan_offload_set,
	.reta_update = mlx5_dev_rss_reta_update,
	.reta_query = mlx5_dev_rss_reta_query,
	.rss_hash_update = mlx5_rss_hash_update,
	.rss_hash_conf_get = mlx5_rss_hash_conf_get,
	.filter_ctrl = mlx5_dev_filter_ctrl,
	.rx_descriptor_status = mlx5_rx_descriptor_status,
	.tx_descriptor_status = mlx5_tx_descriptor_status,
	.rx_queue_intr_enable = mlx5_rx_intr_enable,
	.rx_queue_intr_disable = mlx5_rx_intr_disable,
	.is_removed = mlx5_is_removed,
};

static const struct eth_dev_ops mlx5_dev_sec_ops = {
	.stats_get = mlx5_stats_get,
	.stats_reset = mlx5_stats_reset,
	.xstats_get = mlx5_xstats_get,
	.xstats_reset = mlx5_xstats_reset,
	.xstats_get_names = mlx5_xstats_get_names,
	.dev_infos_get = mlx5_dev_infos_get,
	.rx_descriptor_status = mlx5_rx_descriptor_status,
	.tx_descriptor_status = mlx5_tx_descriptor_status,
};

/* Available operations in flow isolated mode. */
const struct eth_dev_ops mlx5_dev_ops_isolate = {
	.dev_configure = mlx5_dev_configure,
	.dev_start = mlx5_dev_start,
	.dev_stop = mlx5_dev_stop,
	.dev_set_link_down = mlx5_set_link_down,
	.dev_set_link_up = mlx5_set_link_up,
	.dev_close = mlx5_dev_close,
	.link_update = mlx5_link_update,
	.stats_get = mlx5_stats_get,
	.stats_reset = mlx5_stats_reset,
	.xstats_get = mlx5_xstats_get,
	.xstats_reset = mlx5_xstats_reset,
	.xstats_get_names = mlx5_xstats_get_names,
	.dev_infos_get = mlx5_dev_infos_get,
	.dev_supported_ptypes_get = mlx5_dev_supported_ptypes_get,
	.vlan_filter_set = mlx5_vlan_filter_set,
	.rx_queue_setup = mlx5_rx_queue_setup,
	.tx_queue_setup = mlx5_tx_queue_setup,
	.rx_queue_release = mlx5_rx_queue_release,
	.tx_queue_release = mlx5_tx_queue_release,
	.flow_ctrl_get = mlx5_dev_get_flow_ctrl,
	.flow_ctrl_set = mlx5_dev_set_flow_ctrl,
	.mac_addr_remove = mlx5_mac_addr_remove,
	.mac_addr_add = mlx5_mac_addr_add,
	.mac_addr_set = mlx5_mac_addr_set,
	.set_mc_addr_list = mlx5_set_mc_addr_list,
	.mtu_set = mlx5_dev_set_mtu,
	.vlan_strip_queue_set = mlx5_vlan_strip_queue_set,
	.vlan_offload_set = mlx5_vlan_offload_set,
	.filter_ctrl = mlx5_dev_filter_ctrl,
	.rx_descriptor_status = mlx5_rx_descriptor_status,
	.tx_descriptor_status = mlx5_tx_descriptor_status,
	.rx_queue_intr_enable = mlx5_rx_intr_enable,
	.rx_queue_intr_disable = mlx5_rx_intr_disable,
	.is_removed = mlx5_is_removed,
};

/**
 * Verify and store value for device argument.
 *
 * @param[in] key
 *   Key argument to verify.
 * @param[in] val
 *   Value associated with key.
 * @param opaque
 *   User data.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
static int
mlx5_args_check(const char *key, const char *val, void *opaque)
{
	struct mlx5_dev_config *config = opaque;
	unsigned long tmp;

	errno = 0;
	tmp = strtoul(val, NULL, 0);
	if (errno) {
		rte_errno = errno;
		DRV_LOG(WARNING, "%s: \"%s\" is not a valid integer", key, val);
		return -rte_errno;
	}
	if (strcmp(MLX5_RXQ_CQE_COMP_EN, key) == 0) {
		config->cqe_comp = !!tmp;
	} else if (strcmp(MLX5_RX_MPRQ_EN, key) == 0) {
		config->mprq.enabled = !!tmp;
	} else if (strcmp(MLX5_RX_MPRQ_LOG_STRIDE_NUM, key) == 0) {
		config->mprq.stride_num_n = tmp;
	} else if (strcmp(MLX5_RX_MPRQ_MAX_MEMCPY_LEN, key) == 0) {
		config->mprq.max_memcpy_len = tmp;
	} else if (strcmp(MLX5_RXQS_MIN_MPRQ, key) == 0) {
		config->mprq.min_rxqs_num = tmp;
	} else if (strcmp(MLX5_TXQ_INLINE, key) == 0) {
		config->txq_inline = tmp;
	} else if (strcmp(MLX5_TXQS_MIN_INLINE, key) == 0) {
		config->txqs_inline = tmp;
	} else if (strcmp(MLX5_TXQ_MPW_EN, key) == 0) {
		config->mps = !!tmp ? config->mps : 0;
	} else if (strcmp(MLX5_TXQ_MPW_HDR_DSEG_EN, key) == 0) {
		config->mpw_hdr_dseg = !!tmp;
	} else if (strcmp(MLX5_TXQ_MAX_INLINE_LEN, key) == 0) {
		config->inline_max_packet_sz = tmp;
	} else if (strcmp(MLX5_TX_VEC_EN, key) == 0) {
		config->tx_vec_en = !!tmp;
	} else if (strcmp(MLX5_RX_VEC_EN, key) == 0) {
		config->rx_vec_en = !!tmp;
	} else if (strcmp(MLX5_L3_VXLAN_EN, key) == 0) {
		config->l3_vxlan_en = !!tmp;
	} else if (strcmp(MLX5_VF_NL_EN, key) == 0) {
		config->vf_nl_en = !!tmp;
	} else {
		DRV_LOG(WARNING, "%s: unknown parameter", key);
		rte_errno = EINVAL;
		return -rte_errno;
	}
	return 0;
}

/**
 * Parse device parameters.
 *
 * @param config
 *   Pointer to device configuration structure.
 * @param devargs
 *   Device arguments structure.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
static int
mlx5_args(struct mlx5_dev_config *config, struct rte_devargs *devargs)
{
	const char **params = (const char *[]){
		MLX5_RXQ_CQE_COMP_EN,
		MLX5_RX_MPRQ_EN,
		MLX5_RX_MPRQ_LOG_STRIDE_NUM,
		MLX5_RX_MPRQ_MAX_MEMCPY_LEN,
		MLX5_RXQS_MIN_MPRQ,
		MLX5_TXQ_INLINE,
		MLX5_TXQS_MIN_INLINE,
		MLX5_TXQ_MPW_EN,
		MLX5_TXQ_MPW_HDR_DSEG_EN,
		MLX5_TXQ_MAX_INLINE_LEN,
		MLX5_TX_VEC_EN,
		MLX5_RX_VEC_EN,
		MLX5_L3_VXLAN_EN,
		MLX5_VF_NL_EN,
		NULL,
	};
	struct rte_kvargs *kvlist;
	int ret = 0;
	int i;

	if (devargs == NULL)
		return 0;
	/* Following UGLY cast is done to pass checkpatch. */
	kvlist = rte_kvargs_parse(devargs->args, params);
	if (kvlist == NULL)
		return 0;
	/* Process parameters. */
	for (i = 0; (params[i] != NULL); ++i) {
		if (rte_kvargs_count(kvlist, params[i])) {
			ret = rte_kvargs_process(kvlist, params[i],
						 mlx5_args_check, config);
			if (ret) {
				rte_errno = EINVAL;
				rte_kvargs_free(kvlist);
				return -rte_errno;
			}
		}
	}
	rte_kvargs_free(kvlist);
	return 0;
}

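/*
 * Parsing flow (illustrative): for a device argument string such as
 * "rxq_cqe_comp_en=1,txq_inline=128", rte_kvargs_parse() splits it into
 * key/value pairs restricted to the params[] list above, and
 * rte_kvargs_process() then calls mlx5_args_check() once per occurrence of
 * each key, so a repeated key simply takes its last value.
 */
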
static struct rte_pci_driver mlx5_driver;

/*
 * Reserved UAR address space for TXQ UAR(hw doorbell) mapping, process
 * local resource used by both primary and secondary to avoid duplicate
 * reservation.
 * The space has to be available on both primary and secondary process,
 * TXQ UAR maps to this area using fixed mmap w/o double check.
 */
static void *uar_base;

static int
find_lower_va_bound(const struct rte_memseg_list *msl __rte_unused,
		    const struct rte_memseg *ms, void *arg)
{
	void **addr = arg;

	if (*addr == NULL)
		*addr = ms->addr;
	else
		*addr = RTE_MIN(*addr, ms->addr);

	return 0;
}

/**
 * Reserve UAR address space for primary process.
 *
 * @param[in] dev
 *   Pointer to Ethernet device.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
static int
mlx5_uar_init_primary(struct rte_eth_dev *dev)
{
	struct priv *priv = dev->data->dev_private;
	void *addr = (void *)0;

	if (uar_base) { /* UAR address space mapped. */
		priv->uar_base = uar_base;
		return 0;
	}
	/* find out lower bound of hugepage segments */
	rte_memseg_walk(find_lower_va_bound, &addr);

	/* keep distance to hugepages to minimize potential conflicts. */
	addr = RTE_PTR_SUB(addr, MLX5_UAR_OFFSET + MLX5_UAR_SIZE);
	/* anonymous mmap, no real memory consumption. */
	addr = mmap(addr, MLX5_UAR_SIZE,
		    PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (addr == MAP_FAILED) {
		DRV_LOG(ERR,
			"port %u failed to reserve UAR address space, please"
			" adjust MLX5_UAR_SIZE or try --base-virtaddr",
			dev->data->port_id);
		rte_errno = ENOMEM;
		return -rte_errno;
	}
	/* Accept either same addr or a new addr returned from mmap if target
	 * range occupied.
	 */
	DRV_LOG(INFO, "port %u reserved UAR address space: %p",
		dev->data->port_id, addr);
	priv->uar_base = addr; /* for primary and secondary UAR re-mmap. */
	uar_base = addr; /* process local, don't reserve again. */
	return 0;
}

/**
 * Reserve UAR address space for secondary process, align with
 * primary process.
 *
 * @param[in] dev
 *   Pointer to Ethernet device.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
static int
mlx5_uar_init_secondary(struct rte_eth_dev *dev)
{
	struct priv *priv = dev->data->dev_private;
	void *addr;

	assert(priv->uar_base);
	if (uar_base) { /* already reserved. */
		assert(uar_base == priv->uar_base);
		return 0;
	}
	/* anonymous mmap, no real memory consumption. */
	addr = mmap(priv->uar_base, MLX5_UAR_SIZE,
		    PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (addr == MAP_FAILED) {
		DRV_LOG(ERR, "port %u UAR mmap failed: %p size: %llu",
			dev->data->port_id, priv->uar_base, MLX5_UAR_SIZE);
		rte_errno = ENXIO;
		return -rte_errno;
	}
	if (priv->uar_base != addr) {
		DRV_LOG(ERR,
			"port %u UAR address %p size %llu occupied, please"
			" adjust MLX5_UAR_OFFSET or try EAL parameter"
			" --base-virtaddr",
			dev->data->port_id, priv->uar_base, MLX5_UAR_SIZE);
		rte_errno = ENXIO;
		return -rte_errno;
	}
	uar_base = addr; /* process local, don't reserve again */
	DRV_LOG(INFO, "port %u reserved UAR address space: %p",
		dev->data->port_id, addr);
	return 0;
}

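/*
 * Note: the PROT_NONE anonymous mappings above only reserve virtual address
 * space; UAR doorbell pages are later remapped over this range with a fixed
 * mmap() by mlx5_tx_uar_remap(), which is why primary and secondary processes
 * must end up with the same base address (see uar_base above).
 */
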
/**
 * DPDK callback to register a PCI device.
 *
 * This function creates an Ethernet device for each port of a given
 * PCI device.
 *
 * @param[in] pci_drv
 *   PCI driver structure (mlx5_driver).
 * @param[in] pci_dev
 *   PCI device information.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
static int
mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
	       struct rte_pci_device *pci_dev)
{
	struct ibv_device **list = NULL;
	struct ibv_device *ibv_dev;
	int err = 0;
	struct ibv_context *attr_ctx = NULL;
	struct ibv_device_attr_ex device_attr;
	unsigned int vf = 0;
	unsigned int mps;
	unsigned int cqe_comp;
	unsigned int tunnel_en = 0;
	unsigned int mpls_en = 0;
	unsigned int swp = 0;
	unsigned int verb_priorities = 0;
	unsigned int mprq = 0;
	unsigned int mprq_min_stride_size_n = 0;
	unsigned int mprq_max_stride_size_n = 0;
	unsigned int mprq_min_stride_num_n = 0;
	unsigned int mprq_max_stride_num_n = 0;
	int i;
	struct mlx5dv_context attrs_out = {0};
#ifdef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT
	struct ibv_counter_set_description cs_desc = { .counter_type = 0 };
#endif

	/* Prepare shared data between primary and secondary process. */
	mlx5_prepare_shared_data();
	assert(pci_drv == &mlx5_driver);
	list = mlx5_glue->get_device_list(&i);
	if (list == NULL) {
		assert(errno);
		err = errno;
		if (errno == ENOSYS)
			DRV_LOG(ERR,
				"cannot list devices, is ib_uverbs loaded?");
		goto error;
	}
	assert(i >= 0);
	/*
	 * For each listed device, check related sysfs entry against
	 * the provided PCI ID.
	 */
	while (i != 0) {
		struct rte_pci_addr pci_addr;

		--i;
		DRV_LOG(DEBUG, "checking device \"%s\"", list[i]->name);
		if (mlx5_ibv_device_to_pci_addr(list[i], &pci_addr))
			continue;
		if ((pci_dev->addr.domain != pci_addr.domain) ||
		    (pci_dev->addr.bus != pci_addr.bus) ||
		    (pci_dev->addr.devid != pci_addr.devid) ||
		    (pci_dev->addr.function != pci_addr.function))
			continue;
		DRV_LOG(INFO, "PCI information matches, using device \"%s\"",
			list[i]->name);
		vf = ((pci_dev->id.device_id ==
		       PCI_DEVICE_ID_MELLANOX_CONNECTX4VF) ||
		      (pci_dev->id.device_id ==
		       PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF) ||
		      (pci_dev->id.device_id ==
		       PCI_DEVICE_ID_MELLANOX_CONNECTX5VF) ||
		      (pci_dev->id.device_id ==
		       PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF));
		attr_ctx = mlx5_glue->open_device(list[i]);
		rte_errno = errno;
		err = rte_errno;
		break;
	}
	if (attr_ctx == NULL) {
		switch (err) {
		case 0:
			DRV_LOG(ERR,
				"cannot access device, is mlx5_ib loaded?");
			err = ENODEV;
			break;
		case EINVAL:
			DRV_LOG(ERR,
				"cannot use device, are drivers up to date?");
			break;
		}
		goto error;
	}
	ibv_dev = list[i];
	DRV_LOG(DEBUG, "device opened");
#ifdef HAVE_IBV_MLX5_MOD_SWP
	attrs_out.comp_mask |= MLX5DV_CONTEXT_MASK_SWP;
#endif
	/*
	 * Multi-packet send is supported by ConnectX-4 Lx PF as well
	 * as all ConnectX-5 devices.
	 */
#ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT
	attrs_out.comp_mask |= MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS;
#endif
#ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT
	attrs_out.comp_mask |= MLX5DV_CONTEXT_MASK_STRIDING_RQ;
#endif
	mlx5_glue->dv_query_device(attr_ctx, &attrs_out);
	if (attrs_out.flags & MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED) {
		if (attrs_out.flags & MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW) {
			DRV_LOG(DEBUG, "enhanced MPW is supported");
			mps = MLX5_MPW_ENHANCED;
		} else {
			DRV_LOG(DEBUG, "MPW is supported");
			mps = MLX5_MPW;
		}
	} else {
		DRV_LOG(DEBUG, "MPW isn't supported");
		mps = MLX5_MPW_DISABLED;
	}
#ifdef HAVE_IBV_MLX5_MOD_SWP
	if (attrs_out.comp_mask & MLX5DV_CONTEXT_MASK_SWP)
		swp = attrs_out.sw_parsing_caps.sw_parsing_offloads;
	DRV_LOG(DEBUG, "SWP support: %u", swp);
#endif
#ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT
	if (attrs_out.comp_mask & MLX5DV_CONTEXT_MASK_STRIDING_RQ) {
		struct mlx5dv_striding_rq_caps mprq_caps =
			attrs_out.striding_rq_caps;

		DRV_LOG(DEBUG, "\tmin_single_stride_log_num_of_bytes: %d",
			mprq_caps.min_single_stride_log_num_of_bytes);
		DRV_LOG(DEBUG, "\tmax_single_stride_log_num_of_bytes: %d",
			mprq_caps.max_single_stride_log_num_of_bytes);
		DRV_LOG(DEBUG, "\tmin_single_wqe_log_num_of_strides: %d",
			mprq_caps.min_single_wqe_log_num_of_strides);
		DRV_LOG(DEBUG, "\tmax_single_wqe_log_num_of_strides: %d",
			mprq_caps.max_single_wqe_log_num_of_strides);
		DRV_LOG(DEBUG, "\tsupported_qpts: %d",
			mprq_caps.supported_qpts);
		DRV_LOG(DEBUG, "device supports Multi-Packet RQ");
		mprq = 1;
		mprq_min_stride_size_n =
			mprq_caps.min_single_stride_log_num_of_bytes;
		mprq_max_stride_size_n =
			mprq_caps.max_single_stride_log_num_of_bytes;
		mprq_min_stride_num_n =
			mprq_caps.min_single_wqe_log_num_of_strides;
		mprq_max_stride_num_n =
			mprq_caps.max_single_wqe_log_num_of_strides;
	}
#endif
	if (RTE_CACHE_LINE_SIZE == 128 &&
	    !(attrs_out.flags & MLX5DV_CONTEXT_FLAGS_CQE_128B_COMP))
		cqe_comp = 0;
	else
		cqe_comp = 1;
#ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT
	if (attrs_out.comp_mask & MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS) {
		tunnel_en = ((attrs_out.tunnel_offloads_caps &
			      MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_VXLAN) &&
			     (attrs_out.tunnel_offloads_caps &
			      MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_GRE));
	}
	DRV_LOG(DEBUG, "tunnel offloading is %ssupported",
		tunnel_en ? "" : "not ");
#else
	DRV_LOG(WARNING,
		"tunnel offloading disabled due to old OFED/rdma-core version");
#endif
#ifdef HAVE_IBV_DEVICE_MPLS_SUPPORT
	mpls_en = ((attrs_out.tunnel_offloads_caps &
		    MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_CW_MPLS_OVER_GRE) &&
		   (attrs_out.tunnel_offloads_caps &
		    MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_CW_MPLS_OVER_UDP));
	DRV_LOG(DEBUG, "MPLS over GRE/UDP tunnel offloading is %ssupported",
		mpls_en ? "" : "not ");
"" : "not "); 821 #else 822 DRV_LOG(WARNING, "MPLS over GRE/UDP tunnel offloading disabled due to" 823 " old OFED/rdma-core version or firmware configuration"); 824 #endif 825 err = mlx5_glue->query_device_ex(attr_ctx, NULL, &device_attr); 826 if (err) { 827 DEBUG("ibv_query_device_ex() failed"); 828 goto error; 829 } 830 DRV_LOG(INFO, "%u port(s) detected", 831 device_attr.orig_attr.phys_port_cnt); 832 for (i = 0; i < device_attr.orig_attr.phys_port_cnt; i++) { 833 char name[RTE_ETH_NAME_MAX_LEN]; 834 int len; 835 uint32_t port = i + 1; /* ports are indexed from one */ 836 struct ibv_context *ctx = NULL; 837 struct ibv_port_attr port_attr; 838 struct ibv_pd *pd = NULL; 839 struct priv *priv = NULL; 840 struct rte_eth_dev *eth_dev = NULL; 841 struct ibv_device_attr_ex device_attr_ex; 842 struct ether_addr mac; 843 struct mlx5_dev_config config = { 844 .cqe_comp = cqe_comp, 845 .mps = mps, 846 .tunnel_en = tunnel_en, 847 .mpls_en = mpls_en, 848 .tx_vec_en = 1, 849 .rx_vec_en = 1, 850 .mpw_hdr_dseg = 0, 851 .txq_inline = MLX5_ARG_UNSET, 852 .txqs_inline = MLX5_ARG_UNSET, 853 .inline_max_packet_sz = MLX5_ARG_UNSET, 854 .vf_nl_en = 1, 855 .swp = !!swp, 856 .mprq = { 857 .enabled = 0, /* Disabled by default. */ 858 .stride_num_n = RTE_MAX(MLX5_MPRQ_STRIDE_NUM_N, 859 mprq_min_stride_num_n), 860 .max_memcpy_len = MLX5_MPRQ_MEMCPY_DEFAULT_LEN, 861 .min_rxqs_num = MLX5_MPRQ_MIN_RXQS, 862 }, 863 }; 864 865 len = snprintf(name, sizeof(name), PCI_PRI_FMT, 866 pci_dev->addr.domain, pci_dev->addr.bus, 867 pci_dev->addr.devid, pci_dev->addr.function); 868 if (device_attr.orig_attr.phys_port_cnt > 1) 869 snprintf(name + len, sizeof(name), " port %u", i); 870 if (rte_eal_process_type() == RTE_PROC_SECONDARY) { 871 eth_dev = rte_eth_dev_attach_secondary(name); 872 if (eth_dev == NULL) { 873 DRV_LOG(ERR, "can not attach rte ethdev"); 874 rte_errno = ENOMEM; 875 err = rte_errno; 876 goto error; 877 } 878 eth_dev->device = &pci_dev->device; 879 eth_dev->dev_ops = &mlx5_dev_sec_ops; 880 err = mlx5_uar_init_secondary(eth_dev); 881 if (err) { 882 err = rte_errno; 883 goto error; 884 } 885 /* Receive command fd from primary process */ 886 err = mlx5_socket_connect(eth_dev); 887 if (err < 0) { 888 err = rte_errno; 889 goto error; 890 } 891 /* Remap UAR for Tx queues. */ 892 err = mlx5_tx_uar_remap(eth_dev, err); 893 if (err) { 894 err = rte_errno; 895 goto error; 896 } 897 /* 898 * Ethdev pointer is still required as input since 899 * the primary device is not accessible from the 900 * secondary process. 901 */ 902 eth_dev->rx_pkt_burst = 903 mlx5_select_rx_function(eth_dev); 904 eth_dev->tx_pkt_burst = 905 mlx5_select_tx_function(eth_dev); 906 rte_eth_dev_probing_finish(eth_dev); 907 continue; 908 } 909 DRV_LOG(DEBUG, "using port %u", port); 910 ctx = mlx5_glue->open_device(ibv_dev); 911 if (ctx == NULL) { 912 err = ENODEV; 913 goto port_error; 914 } 915 /* Check port status. */ 916 err = mlx5_glue->query_port(ctx, port, &port_attr); 917 if (err) { 918 DRV_LOG(ERR, "port query failed: %s", strerror(err)); 919 goto port_error; 920 } 921 if (port_attr.link_layer != IBV_LINK_LAYER_ETHERNET) { 922 DRV_LOG(ERR, 923 "port %d is not configured in Ethernet mode", 924 port); 925 err = EINVAL; 926 goto port_error; 927 } 928 if (port_attr.state != IBV_PORT_ACTIVE) 929 DRV_LOG(DEBUG, "port %d is not active: \"%s\" (%d)", 930 port, 931 mlx5_glue->port_state_str(port_attr.state), 932 port_attr.state); 933 /* Allocate protection domain. 
		pd = mlx5_glue->alloc_pd(ctx);
		if (pd == NULL) {
			DRV_LOG(ERR, "PD allocation failure");
			err = ENOMEM;
			goto port_error;
		}
		/* from rte_ethdev.c */
		priv = rte_zmalloc("ethdev private structure",
				   sizeof(*priv),
				   RTE_CACHE_LINE_SIZE);
		if (priv == NULL) {
			DRV_LOG(ERR, "priv allocation failure");
			err = ENOMEM;
			goto port_error;
		}
		priv->ctx = ctx;
		strncpy(priv->ibdev_path, priv->ctx->device->ibdev_path,
			sizeof(priv->ibdev_path));
		priv->device_attr = device_attr;
		priv->port = port;
		priv->pd = pd;
		priv->mtu = ETHER_MTU;
		err = mlx5_args(&config, pci_dev->device.devargs);
		if (err) {
			err = rte_errno;
			DRV_LOG(ERR, "failed to process device arguments: %s",
				strerror(rte_errno));
			goto port_error;
		}
		err = mlx5_glue->query_device_ex(ctx, NULL, &device_attr_ex);
		if (err) {
			DRV_LOG(ERR, "ibv_query_device_ex() failed");
			goto port_error;
		}
		config.hw_csum = !!(device_attr_ex.device_cap_flags_ex &
				    IBV_DEVICE_RAW_IP_CSUM);
		DRV_LOG(DEBUG, "checksum offloading is %ssupported",
			(config.hw_csum ? "" : "not "));
#ifdef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT
		config.flow_counter_en = !!(device_attr.max_counter_sets);
		mlx5_glue->describe_counter_set(ctx, 0, &cs_desc);
		DRV_LOG(DEBUG,
			"counter type = %d, num of cs = %ld, attributes = %d",
			cs_desc.counter_type, cs_desc.num_of_cs,
			cs_desc.attributes);
#endif
		config.ind_table_max_size =
			device_attr_ex.rss_caps.max_rwq_indirection_table_size;
		/* Remove this check once DPDK supports larger/variable
		 * indirection tables. */
		if (config.ind_table_max_size >
		    (unsigned int)ETH_RSS_RETA_SIZE_512)
			config.ind_table_max_size = ETH_RSS_RETA_SIZE_512;
		DRV_LOG(DEBUG, "maximum Rx indirection table size is %u",
			config.ind_table_max_size);
		config.hw_vlan_strip = !!(device_attr_ex.raw_packet_caps &
					  IBV_RAW_PACKET_CAP_CVLAN_STRIPPING);
		DRV_LOG(DEBUG, "VLAN stripping is %ssupported",
			(config.hw_vlan_strip ? "" : "not "));

		config.hw_fcs_strip = !!(device_attr_ex.raw_packet_caps &
					 IBV_RAW_PACKET_CAP_SCATTER_FCS);
		DRV_LOG(DEBUG, "FCS stripping configuration is %ssupported",
			(config.hw_fcs_strip ? "" : "not "));

#ifdef HAVE_IBV_WQ_FLAG_RX_END_PADDING
		config.hw_padding = !!device_attr_ex.rx_pad_end_addr_align;
#endif
		DRV_LOG(DEBUG,
			"hardware Rx end alignment padding is %ssupported",
			(config.hw_padding ? "" : "not "));
		config.vf = vf;
		config.tso = ((device_attr_ex.tso_caps.max_tso > 0) &&
			      (device_attr_ex.tso_caps.supported_qpts &
			       (1 << IBV_QPT_RAW_PACKET)));
		if (config.tso)
			config.tso_max_payload_sz =
				device_attr_ex.tso_caps.max_tso;
		if (config.mps && !mps) {
			DRV_LOG(ERR,
				"multi-packet send not supported on this device"
				" (" MLX5_TXQ_MPW_EN ")");
			err = ENOTSUP;
			goto port_error;
		}
		DRV_LOG(INFO, "%s MPS is %s",
			config.mps == MLX5_MPW_ENHANCED ? "enhanced " : "",
			config.mps != MLX5_MPW_DISABLED ? "enabled" :
			"disabled");
"enabled" : 1022 "disabled"); 1023 if (config.cqe_comp && !cqe_comp) { 1024 DRV_LOG(WARNING, "Rx CQE compression isn't supported"); 1025 config.cqe_comp = 0; 1026 } 1027 if (config.mprq.enabled && mprq) { 1028 if (config.mprq.stride_num_n > mprq_max_stride_num_n || 1029 config.mprq.stride_num_n < mprq_min_stride_num_n) { 1030 config.mprq.stride_num_n = 1031 RTE_MAX(MLX5_MPRQ_STRIDE_NUM_N, 1032 mprq_min_stride_num_n); 1033 DRV_LOG(WARNING, 1034 "the number of strides" 1035 " for Multi-Packet RQ is out of range," 1036 " setting default value (%u)", 1037 1 << config.mprq.stride_num_n); 1038 } 1039 config.mprq.min_stride_size_n = mprq_min_stride_size_n; 1040 config.mprq.max_stride_size_n = mprq_max_stride_size_n; 1041 } else if (config.mprq.enabled && !mprq) { 1042 DRV_LOG(WARNING, "Multi-Packet RQ isn't supported"); 1043 config.mprq.enabled = 0; 1044 } 1045 eth_dev = rte_eth_dev_allocate(name); 1046 if (eth_dev == NULL) { 1047 DRV_LOG(ERR, "can not allocate rte ethdev"); 1048 err = ENOMEM; 1049 goto port_error; 1050 } 1051 eth_dev->data->dev_private = priv; 1052 priv->dev_data = eth_dev->data; 1053 eth_dev->data->mac_addrs = priv->mac; 1054 eth_dev->device = &pci_dev->device; 1055 rte_eth_copy_pci_info(eth_dev, pci_dev); 1056 eth_dev->device->driver = &mlx5_driver.driver; 1057 err = mlx5_uar_init_primary(eth_dev); 1058 if (err) { 1059 err = rte_errno; 1060 goto port_error; 1061 } 1062 /* Configure the first MAC address by default. */ 1063 if (mlx5_get_mac(eth_dev, &mac.addr_bytes)) { 1064 DRV_LOG(ERR, 1065 "port %u cannot get MAC address, is mlx5_en" 1066 " loaded? (errno: %s)", 1067 eth_dev->data->port_id, strerror(rte_errno)); 1068 err = ENODEV; 1069 goto port_error; 1070 } 1071 DRV_LOG(INFO, 1072 "port %u MAC address is %02x:%02x:%02x:%02x:%02x:%02x", 1073 eth_dev->data->port_id, 1074 mac.addr_bytes[0], mac.addr_bytes[1], 1075 mac.addr_bytes[2], mac.addr_bytes[3], 1076 mac.addr_bytes[4], mac.addr_bytes[5]); 1077 #ifndef NDEBUG 1078 { 1079 char ifname[IF_NAMESIZE]; 1080 1081 if (mlx5_get_ifname(eth_dev, &ifname) == 0) 1082 DRV_LOG(DEBUG, "port %u ifname is \"%s\"", 1083 eth_dev->data->port_id, ifname); 1084 else 1085 DRV_LOG(DEBUG, "port %u ifname is unknown", 1086 eth_dev->data->port_id); 1087 } 1088 #endif 1089 /* Get actual MTU if possible. */ 1090 err = mlx5_get_mtu(eth_dev, &priv->mtu); 1091 if (err) { 1092 err = rte_errno; 1093 goto port_error; 1094 } 1095 DRV_LOG(DEBUG, "port %u MTU is %u", eth_dev->data->port_id, 1096 priv->mtu); 1097 /* 1098 * Initialize burst functions to prevent crashes before link-up. 1099 */ 1100 eth_dev->rx_pkt_burst = removed_rx_burst; 1101 eth_dev->tx_pkt_burst = removed_tx_burst; 1102 eth_dev->dev_ops = &mlx5_dev_ops; 1103 /* Register MAC address. */ 1104 claim_zero(mlx5_mac_addr_add(eth_dev, &mac, 0, 0)); 1105 priv->nl_socket = -1; 1106 priv->nl_sn = 0; 1107 if (vf && config.vf_nl_en) { 1108 priv->nl_socket = mlx5_nl_init(RTMGRP_LINK); 1109 if (priv->nl_socket < 0) 1110 priv->nl_socket = -1; 1111 mlx5_nl_mac_addr_sync(eth_dev); 1112 } 1113 TAILQ_INIT(&priv->flows); 1114 TAILQ_INIT(&priv->ctrl_flows); 1115 /* Hint libmlx5 to use PMD allocator for data plane resources */ 1116 struct mlx5dv_ctx_allocators alctr = { 1117 .alloc = &mlx5_alloc_verbs_buf, 1118 .free = &mlx5_free_verbs_buf, 1119 .data = priv, 1120 }; 1121 mlx5_glue->dv_set_context_attr(ctx, 1122 MLX5DV_CTX_ATTR_BUF_ALLOCATORS, 1123 (void *)((uintptr_t)&alctr)); 1124 /* Bring Ethernet device up. 
		DRV_LOG(DEBUG, "port %u forcing Ethernet interface up",
			eth_dev->data->port_id);
		mlx5_set_link_up(eth_dev);
		/*
		 * Even though the interrupt handler is not installed yet,
		 * interrupts will still trigger on the async_fd from
		 * Verbs context returned by ibv_open_device().
		 */
		mlx5_link_update(eth_dev, 0);
		/* Store device configuration on private structure. */
		priv->config = config;
		/* Create drop queue. */
		err = mlx5_flow_create_drop_queue(eth_dev);
		if (err) {
			DRV_LOG(ERR, "port %u drop queue allocation failed: %s",
				eth_dev->data->port_id, strerror(rte_errno));
			err = rte_errno;
			goto port_error;
		}
		/* Supported Verbs flow priority number detection. */
		if (verb_priorities == 0)
			verb_priorities = mlx5_get_max_verbs_prio(eth_dev);
		if (verb_priorities < MLX5_VERBS_FLOW_PRIO_8) {
			DRV_LOG(ERR, "port %u wrong Verbs flow priorities: %u",
				eth_dev->data->port_id, verb_priorities);
			err = ENOTSUP;
			goto port_error;
		}
		priv->config.max_verbs_prio = verb_priorities;
		/*
		 * Once the device is added to the list of memory event
		 * callback, its global MR cache table cannot be expanded
		 * on the fly because of deadlock. If it overflows, lookup
		 * should be done by searching MR list linearly, which is slow.
		 */
		err = mlx5_mr_btree_init(&priv->mr.cache,
					 MLX5_MR_BTREE_CACHE_N * 2,
					 eth_dev->device->numa_node);
		if (err) {
			err = rte_errno;
			goto port_error;
		}
		/* Add device to memory callback list. */
		rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock);
		LIST_INSERT_HEAD(&mlx5_shared_data->mem_event_cb_list,
				 priv, mem_event_cb);
		rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
		rte_eth_dev_probing_finish(eth_dev);
		continue;
port_error:
		if (priv)
			rte_free(priv);
		if (pd)
			claim_zero(mlx5_glue->dealloc_pd(pd));
		if (ctx)
			claim_zero(mlx5_glue->close_device(ctx));
		if (eth_dev && rte_eal_process_type() == RTE_PROC_PRIMARY)
			rte_eth_dev_release_port(eth_dev);
		break;
	}
	/*
	 * XXX if something went wrong in the loop above, there is a resource
	 * leak (ctx, pd, priv, dpdk ethdev) but we can do nothing about it as
	 * long as the dpdk does not provide a way to deallocate an ethdev and
	 * a way to enumerate the registered ethdevs to free the previous ones.
	 */
error:
	if (attr_ctx)
		claim_zero(mlx5_glue->close_device(attr_ctx));
	if (list)
		mlx5_glue->free_device_list(list);
	if (err) {
		rte_errno = err;
		return -rte_errno;
	}
	return 0;
}

static const struct rte_pci_id mlx5_pci_id_map[] = {
	{
		RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
			       PCI_DEVICE_ID_MELLANOX_CONNECTX4)
	},
	{
		RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
			       PCI_DEVICE_ID_MELLANOX_CONNECTX4VF)
	},
	{
		RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
			       PCI_DEVICE_ID_MELLANOX_CONNECTX4LX)
	},
	{
		RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
			       PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF)
	},
	{
		RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
			       PCI_DEVICE_ID_MELLANOX_CONNECTX5)
	},
	{
		RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
			       PCI_DEVICE_ID_MELLANOX_CONNECTX5VF)
	},
	{
		RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
			       PCI_DEVICE_ID_MELLANOX_CONNECTX5EX)
	},
	{
		RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
			       PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF)
	},
	{
		RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
			       PCI_DEVICE_ID_MELLANOX_CONNECTX5BF)
	},
	{
		.vendor_id = 0
	}
};

static struct rte_pci_driver mlx5_driver = {
	.driver = {
		.name = MLX5_DRIVER_NAME
	},
	.id_table = mlx5_pci_id_map,
	.probe = mlx5_pci_probe,
	.drv_flags = RTE_PCI_DRV_INTR_LSC | RTE_PCI_DRV_INTR_RMV,
};

#ifdef RTE_LIBRTE_MLX5_DLOPEN_DEPS

/**
 * Suffix RTE_EAL_PMD_PATH with "-glue".
 *
 * This function performs a sanity check on RTE_EAL_PMD_PATH before
 * suffixing its last component.
 *
 * @param[out] buf
 *   Output buffer, should be large enough otherwise NULL is returned.
 * @param size
 *   Size of @p buf.
 *
 * @return
 *   Pointer to @p buf or @p NULL in case suffix cannot be appended.
 */
static char *
mlx5_glue_path(char *buf, size_t size)
{
	static const char *const bad[] = { "/", ".", "..", NULL };
	const char *path = RTE_EAL_PMD_PATH;
	size_t len = strlen(path);
	size_t off;
	int i;

	while (len && path[len - 1] == '/')
		--len;
	for (off = len; off && path[off - 1] != '/'; --off)
		;
	for (i = 0; bad[i]; ++i)
		if (!strncmp(path + off, bad[i], (int)(len - off)))
			goto error;
	i = snprintf(buf, size, "%.*s-glue", (int)len, path);
	if (i == -1 || (size_t)i >= size)
		goto error;
	return buf;
error:
	DRV_LOG(ERR,
		"unable to append \"-glue\" to last component of"
		" RTE_EAL_PMD_PATH (\"" RTE_EAL_PMD_PATH "\"),"
		" please re-configure DPDK");
	return NULL;
}

/**
 * Initialization routine for run-time dependency on rdma-core.
 */
static int
mlx5_glue_init(void)
{
	char glue_path[sizeof(RTE_EAL_PMD_PATH) - 1 + sizeof("-glue")];
	const char *path[] = {
		/*
		 * A basic security check is necessary before trusting
		 * MLX5_GLUE_PATH, which may override RTE_EAL_PMD_PATH.
		 */
		(geteuid() == getuid() && getegid() == getgid() ?
		 getenv("MLX5_GLUE_PATH") : NULL),
		/*
		 * When RTE_EAL_PMD_PATH is set, use its glue-suffixed
		 * variant, otherwise let dlopen() look up libraries on its
		 * own.
		 */
		(*RTE_EAL_PMD_PATH ?
		 mlx5_glue_path(glue_path, sizeof(glue_path)) : ""),
	};
	unsigned int i = 0;
	void *handle = NULL;
	void **sym;
	const char *dlmsg;

	while (!handle && i != RTE_DIM(path)) {
		const char *end;
		size_t len;
		int ret;

		if (!path[i]) {
			++i;
			continue;
		}
		end = strpbrk(path[i], ":;");
		if (!end)
			end = path[i] + strlen(path[i]);
		len = end - path[i];
		ret = 0;
		do {
			char name[ret + 1];

			ret = snprintf(name, sizeof(name), "%.*s%s" MLX5_GLUE,
				       (int)len, path[i],
				       (!len || *(end - 1) == '/') ? "" : "/");
			if (ret == -1)
				break;
			if (sizeof(name) != (size_t)ret + 1)
				continue;
			DRV_LOG(DEBUG, "looking for rdma-core glue as \"%s\"",
				name);
			handle = dlopen(name, RTLD_LAZY);
			break;
		} while (1);
		path[i] = end + 1;
		if (!*end)
			++i;
	}
	if (!handle) {
		rte_errno = EINVAL;
		dlmsg = dlerror();
		if (dlmsg)
			DRV_LOG(WARNING, "cannot load glue library: %s", dlmsg);
		goto glue_error;
	}
	sym = dlsym(handle, "mlx5_glue");
	if (!sym || !*sym) {
		rte_errno = EINVAL;
		dlmsg = dlerror();
		if (dlmsg)
			DRV_LOG(ERR, "cannot resolve glue symbol: %s", dlmsg);
		goto glue_error;
	}
	mlx5_glue = *sym;
	return 0;
glue_error:
	if (handle)
		dlclose(handle);
	DRV_LOG(WARNING,
		"cannot initialize PMD due to missing run-time dependency on"
		" rdma-core libraries (libibverbs, libmlx5)");
	return -rte_errno;
}

#endif

/**
 * Driver initialization routine.
 */
RTE_INIT(rte_mlx5_pmd_init);
static void
rte_mlx5_pmd_init(void)
{
	/* Initialize driver log type. */
	mlx5_logtype = rte_log_register("pmd.net.mlx5");
	if (mlx5_logtype >= 0)
		rte_log_set_level(mlx5_logtype, RTE_LOG_NOTICE);

	/* Build the static tables for Verbs conversion. */
	mlx5_set_ptype_table();
	mlx5_set_cksum_table();
	mlx5_set_swp_types_table();
	/*
	 * RDMAV_HUGEPAGES_SAFE tells ibv_fork_init() we intend to use
	 * huge pages. Calling ibv_fork_init() during init allows
	 * applications to use fork() safely for purposes other than
	 * using this PMD, which is not supported in forked processes.
	 */
	setenv("RDMAV_HUGEPAGES_SAFE", "1", 1);
	/* Match the size of Rx completion entry to the size of a cacheline. */
	if (RTE_CACHE_LINE_SIZE == 128)
		setenv("MLX5_CQE_SIZE", "128", 0);
	/*
	 * MLX5_DEVICE_FATAL_CLEANUP tells ibv_destroy functions to
	 * cleanup all the Verbs resources even when the device was removed.
	 */
	setenv("MLX5_DEVICE_FATAL_CLEANUP", "1", 1);
#ifdef RTE_LIBRTE_MLX5_DLOPEN_DEPS
	if (mlx5_glue_init())
		return;
	assert(mlx5_glue);
#endif
#ifndef NDEBUG
	/* Glue structure must not contain any NULL pointers. */
	{
		unsigned int i;

		for (i = 0; i != sizeof(*mlx5_glue) / sizeof(void *); ++i)
			assert(((const void *const *)mlx5_glue)[i]);
	}
#endif
	if (strcmp(mlx5_glue->version, MLX5_GLUE_VERSION)) {
		DRV_LOG(ERR,
			"rdma-core glue \"%s\" mismatch: \"%s\" is required",
			mlx5_glue->version, MLX5_GLUE_VERSION);
		return;
	}
	mlx5_glue->fork_init();
	rte_pci_register(&mlx5_driver);
}

RTE_PMD_EXPORT_NAME(net_mlx5, __COUNTER__);
RTE_PMD_REGISTER_PCI_TABLE(net_mlx5, mlx5_pci_id_map);
RTE_PMD_REGISTER_KMOD_DEP(net_mlx5, "* ib_uverbs & mlx5_core & mlx5_ib");