/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright 2015 6WIND S.A.
 * Copyright 2015 Mellanox Technologies, Ltd
 */

#include <stddef.h>
#include <unistd.h>
#include <string.h>
#include <stdint.h>
#include <stdlib.h>
#include <errno.h>
#include <net/if.h>
#include <sys/mman.h>
#include <linux/rtnetlink.h>

/* Verbs header. */
/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
#ifdef PEDANTIC
#pragma GCC diagnostic ignored "-Wpedantic"
#endif
#include <infiniband/verbs.h>
#ifdef PEDANTIC
#pragma GCC diagnostic error "-Wpedantic"
#endif

#include <rte_malloc.h>
#include <rte_ethdev_driver.h>
#include <rte_ethdev_pci.h>
#include <rte_pci.h>
#include <rte_bus_pci.h>
#include <rte_common.h>
#include <rte_kvargs.h>
#include <rte_rwlock.h>
#include <rte_spinlock.h>
#include <rte_string_fns.h>
#include <rte_alarm.h>

#include <mlx5_glue.h>
#include <mlx5_devx_cmds.h>
#include <mlx5_common.h>

#include "mlx5_defs.h"
#include "mlx5.h"
#include "mlx5_utils.h"
#include "mlx5_rxtx.h"
#include "mlx5_autoconf.h"
#include "mlx5_mr.h"
#include "mlx5_flow.h"
#include "rte_pmd_mlx5.h"

/* Device parameter to enable RX completion queue compression. */
#define MLX5_RXQ_CQE_COMP_EN "rxq_cqe_comp_en"

/* Device parameter to enable RX completion entry padding to 128B. */
#define MLX5_RXQ_CQE_PAD_EN "rxq_cqe_pad_en"

/* Device parameter to enable padding Rx packet to cacheline size. */
#define MLX5_RXQ_PKT_PAD_EN "rxq_pkt_pad_en"

/* Device parameter to enable Multi-Packet Rx queue. */
#define MLX5_RX_MPRQ_EN "mprq_en"

/* Device parameter to configure log 2 of the number of strides for MPRQ. */
#define MLX5_RX_MPRQ_LOG_STRIDE_NUM "mprq_log_stride_num"

/* Device parameter to limit the size of memcpy'd packet for MPRQ. */
#define MLX5_RX_MPRQ_MAX_MEMCPY_LEN "mprq_max_memcpy_len"

/* Device parameter to set the minimum number of Rx queues to enable MPRQ. */
#define MLX5_RXQS_MIN_MPRQ "rxqs_min_mprq"

/* Device parameter to configure inline send. Deprecated, ignored. */
#define MLX5_TXQ_INLINE "txq_inline"

/* Device parameter to limit packet size to inline with ordinary SEND. */
#define MLX5_TXQ_INLINE_MAX "txq_inline_max"

/* Device parameter to configure minimal data size to inline. */
#define MLX5_TXQ_INLINE_MIN "txq_inline_min"

/* Device parameter to limit packet size to inline with Enhanced MPW. */
#define MLX5_TXQ_INLINE_MPW "txq_inline_mpw"

/*
 * Device parameter to configure the number of TX queues threshold for
 * enabling inline send.
 */
#define MLX5_TXQS_MIN_INLINE "txqs_min_inline"

/*
 * Device parameter to configure the number of TX queues threshold for
 * enabling vectorized Tx, deprecated, ignored (no vectorized Tx routines).
 */
#define MLX5_TXQS_MAX_VEC "txqs_max_vec"

/* Device parameter to enable multi-packet send WQEs. */
#define MLX5_TXQ_MPW_EN "txq_mpw_en"

/*
 * Device parameter to force doorbell register mapping
 * to the non-cached region, eliminating the extra write memory barrier.
 */
#define MLX5_TX_DB_NC "tx_db_nc"

/*
 * Device parameter to include 2 dsegs in the title WQEBB.
 * Deprecated, ignored.
 */
#define MLX5_TXQ_MPW_HDR_DSEG_EN "txq_mpw_hdr_dseg_en"

/*
 * Device parameter to limit the size of inlining packet.
 * Deprecated, ignored.
 */
#define MLX5_TXQ_MAX_INLINE_LEN "txq_max_inline_len"

/*
 * Device parameter to enable hardware Tx vector.
 * Deprecated, ignored (no vectorized Tx routines anymore).
 */
#define MLX5_TX_VEC_EN "tx_vec_en"

/* Device parameter to enable hardware Rx vector. */
#define MLX5_RX_VEC_EN "rx_vec_en"

/* Allow L3 VXLAN flow creation. */
#define MLX5_L3_VXLAN_EN "l3_vxlan_en"

/* Activate DV E-Switch flow steering. */
#define MLX5_DV_ESW_EN "dv_esw_en"

/* Activate DV flow steering. */
#define MLX5_DV_FLOW_EN "dv_flow_en"

/* Enable extensive flow metadata support. */
#define MLX5_DV_XMETA_EN "dv_xmeta_en"

/* Activate Netlink support in VF mode. */
#define MLX5_VF_NL_EN "vf_nl_en"

/* Enable extending memsegs when creating a MR. */
#define MLX5_MR_EXT_MEMSEG_EN "mr_ext_memseg_en"

/* Select port representors to instantiate. */
#define MLX5_REPRESENTOR "representor"

/* Device parameter to configure the maximum number of dump files per queue. */
#define MLX5_MAX_DUMP_FILES_NUM "max_dump_files_num"

/* Configure timeout of LRO session (in microseconds). */
#define MLX5_LRO_TIMEOUT_USEC "lro_timeout_usec"

#ifndef HAVE_IBV_MLX5_MOD_MPW
#define MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED (1 << 2)
#define MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW (1 << 3)
#endif

#ifndef HAVE_IBV_MLX5_MOD_CQE_128B_COMP
#define MLX5DV_CONTEXT_FLAGS_CQE_128B_COMP (1 << 4)
#endif

static const char *MZ_MLX5_PMD_SHARED_DATA = "mlx5_pmd_shared_data";

/* Shared memory between primary and secondary processes. */
struct mlx5_shared_data *mlx5_shared_data;

/* Spinlock for mlx5_shared_data allocation. */
static rte_spinlock_t mlx5_shared_data_lock = RTE_SPINLOCK_INITIALIZER;

/* Process local data for secondary processes. */
static struct mlx5_local_data mlx5_local_data;

/** Driver-specific log messages type. */
int mlx5_logtype;

/** Data associated with devices to spawn. */
struct mlx5_dev_spawn_data {
	uint32_t ifindex; /**< Network interface index. */
	uint32_t max_port; /**< IB device maximal port index. */
	uint32_t ibv_port; /**< IB device physical port index. */
	int pf_bond; /**< bonding device PF index. < 0 - no bonding */
	struct mlx5_switch_info info; /**< Switch information. */
	struct ibv_device *ibv_dev; /**< Associated IB device. */
	struct rte_eth_dev *eth_dev; /**< Associated Ethernet device. */
	struct rte_pci_device *pci_dev; /**< Backend PCI device. */
};

static LIST_HEAD(, mlx5_ibv_shared) mlx5_ibv_list = LIST_HEAD_INITIALIZER();
static pthread_mutex_t mlx5_ibv_list_mutex = PTHREAD_MUTEX_INITIALIZER;

#define MLX5_FLOW_MIN_ID_POOL_SIZE 512
#define MLX5_ID_GENERATION_ARRAY_FACTOR 16

#define MLX5_FLOW_TABLE_HLIST_ARRAY_SIZE 4096
#define MLX5_TAGS_HLIST_ARRAY_SIZE 8192
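/*
 * Illustrative note (not part of the upstream file): the devargs keys defined
 * above are consumed by mlx5_args()/mlx5_args_check() further below. As a
 * sketch, assuming a ConnectX device at PCI address 0000:03:00.0, they could
 * be supplied on the EAL command line roughly as:
 *
 *   testpmd -w 0000:03:00.0,rxq_cqe_comp_en=1,mprq_en=1,txq_inline_max=204 ...
 *
 * The exact EAL option spelling (-w / --pci-whitelist) depends on the DPDK
 * version in use.
 */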
/**
 * Allocate ID pool structure.
 *
 * @param[in] max_id
 *   The maximum ID that can be allocated from the pool.
 *
 * @return
 *   Pointer to pool object, NULL value otherwise.
 */
struct mlx5_flow_id_pool *
mlx5_flow_id_pool_alloc(uint32_t max_id)
{
	struct mlx5_flow_id_pool *pool;
	void *mem;

	pool = rte_zmalloc("id pool allocation", sizeof(*pool),
			   RTE_CACHE_LINE_SIZE);
	if (!pool) {
		DRV_LOG(ERR, "can't allocate id pool");
		rte_errno = ENOMEM;
		return NULL;
	}
	mem = rte_zmalloc("", MLX5_FLOW_MIN_ID_POOL_SIZE * sizeof(uint32_t),
			  RTE_CACHE_LINE_SIZE);
	if (!mem) {
		DRV_LOG(ERR, "can't allocate mem for id pool");
		rte_errno = ENOMEM;
		goto error;
	}
	pool->free_arr = mem;
	pool->curr = pool->free_arr;
	pool->last = pool->free_arr + MLX5_FLOW_MIN_ID_POOL_SIZE;
	pool->base_index = 0;
	pool->max_id = max_id;
	return pool;
error:
	rte_free(pool);
	return NULL;
}

/**
 * Release ID pool structure.
 *
 * @param[in] pool
 *   Pointer to flow id pool object to free.
 */
void
mlx5_flow_id_pool_release(struct mlx5_flow_id_pool *pool)
{
	rte_free(pool->free_arr);
	rte_free(pool);
}

/**
 * Generate ID.
 *
 * @param[in] pool
 *   Pointer to flow id pool.
 * @param[out] id
 *   The generated ID.
 *
 * @return
 *   0 on success, error value otherwise.
 */
uint32_t
mlx5_flow_id_get(struct mlx5_flow_id_pool *pool, uint32_t *id)
{
	if (pool->curr == pool->free_arr) {
		if (pool->base_index == pool->max_id) {
			rte_errno = ENOMEM;
			DRV_LOG(ERR, "no free id");
			return -rte_errno;
		}
		*id = ++pool->base_index;
		return 0;
	}
	*id = *(--pool->curr);
	return 0;
}

/**
 * Release ID.
 *
 * @param[in] pool
 *   Pointer to flow id pool.
 * @param[in] id
 *   The ID to release.
 *
 * @return
 *   0 on success, error value otherwise.
 */
uint32_t
mlx5_flow_id_release(struct mlx5_flow_id_pool *pool, uint32_t id)
{
	uint32_t size;
	uint32_t size2;
	void *mem;

	if (pool->curr == pool->last) {
		size = pool->curr - pool->free_arr;
		size2 = size * MLX5_ID_GENERATION_ARRAY_FACTOR;
		MLX5_ASSERT(size2 > size);
		mem = rte_malloc("", size2 * sizeof(uint32_t), 0);
		if (!mem) {
			DRV_LOG(ERR, "can't allocate mem for id pool");
			rte_errno = ENOMEM;
			return -rte_errno;
		}
		memcpy(mem, pool->free_arr, size * sizeof(uint32_t));
		rte_free(pool->free_arr);
		pool->free_arr = mem;
		pool->curr = pool->free_arr + size;
		pool->last = pool->free_arr + size2;
	}
	*pool->curr = id;
	pool->curr++;
	return 0;
}
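/*
 * Usage sketch for the flow ID pool above (illustrative only, not part of
 * the driver): allocate a pool, draw an ID, give it back, release the pool.
 *
 *   struct mlx5_flow_id_pool *pool = mlx5_flow_id_pool_alloc(UINT32_MAX);
 *   uint32_t id;
 *
 *   if (pool && !mlx5_flow_id_get(pool, &id)) {
 *           ... use id ...
 *           mlx5_flow_id_release(pool, id);
 *   }
 *   if (pool)
 *           mlx5_flow_id_pool_release(pool);
 *
 * Released IDs are kept in free_arr and handed out LIFO before base_index
 * is advanced again.
 */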
/**
 * Initialize the counters management structure.
 *
 * @param[in] sh
 *   Pointer to mlx5_ibv_shared object.
 */
static void
mlx5_flow_counters_mng_init(struct mlx5_ibv_shared *sh)
{
	uint8_t i;

	TAILQ_INIT(&sh->cmng.flow_counters);
	for (i = 0; i < RTE_DIM(sh->cmng.ccont); ++i)
		TAILQ_INIT(&sh->cmng.ccont[i].pool_list);
}

/**
 * Destroy all the resources allocated for a counter memory management.
 *
 * @param[in] mng
 *   Pointer to the memory management structure.
 */
static void
mlx5_flow_destroy_counter_stat_mem_mng(struct mlx5_counter_stats_mem_mng *mng)
{
	uint8_t *mem = (uint8_t *)(uintptr_t)mng->raws[0].data;

	LIST_REMOVE(mng, next);
	claim_zero(mlx5_devx_cmd_destroy(mng->dm));
	claim_zero(mlx5_glue->devx_umem_dereg(mng->umem));
	rte_free(mem);
}

/**
 * Close and release all the resources of the counters management.
 *
 * @param[in] sh
 *   Pointer to mlx5_ibv_shared object to free.
 */
static void
mlx5_flow_counters_mng_close(struct mlx5_ibv_shared *sh)
{
	struct mlx5_counter_stats_mem_mng *mng;
	uint8_t i;
	int j;
	int retries = 1024;

	rte_errno = 0;
	while (--retries) {
		rte_eal_alarm_cancel(mlx5_flow_query_alarm, sh);
		if (rte_errno != EINPROGRESS)
			break;
		rte_pause();
	}
	for (i = 0; i < RTE_DIM(sh->cmng.ccont); ++i) {
		struct mlx5_flow_counter_pool *pool;
		uint32_t batch = !!(i % 2);

		if (!sh->cmng.ccont[i].pools)
			continue;
		pool = TAILQ_FIRST(&sh->cmng.ccont[i].pool_list);
		while (pool) {
			if (batch) {
				if (pool->min_dcs)
					claim_zero
					(mlx5_devx_cmd_destroy(pool->min_dcs));
			}
			for (j = 0; j < MLX5_COUNTERS_PER_POOL; ++j) {
				if (pool->counters_raw[j].action)
					claim_zero
					(mlx5_glue->destroy_flow_action
					 (pool->counters_raw[j].action));
				if (!batch && pool->counters_raw[j].dcs)
					claim_zero(mlx5_devx_cmd_destroy
						  (pool->counters_raw[j].dcs));
			}
			TAILQ_REMOVE(&sh->cmng.ccont[i].pool_list, pool,
				     next);
			rte_free(pool);
			pool = TAILQ_FIRST(&sh->cmng.ccont[i].pool_list);
		}
		rte_free(sh->cmng.ccont[i].pools);
	}
	mng = LIST_FIRST(&sh->cmng.mem_mngs);
	while (mng) {
		mlx5_flow_destroy_counter_stat_mem_mng(mng);
		mng = LIST_FIRST(&sh->cmng.mem_mngs);
	}
	memset(&sh->cmng, 0, sizeof(sh->cmng));
}

/**
 * Extract pdn of PD object using DV API.
 *
 * @param[in] pd
 *   Pointer to the verbs PD object.
 * @param[out] pdn
 *   Pointer to the PD object number variable.
 *
 * @return
 *   0 on success, error value otherwise.
 */
#ifdef HAVE_IBV_FLOW_DV_SUPPORT
static int
mlx5_get_pdn(struct ibv_pd *pd __rte_unused, uint32_t *pdn __rte_unused)
{
	struct mlx5dv_obj obj;
	struct mlx5dv_pd pd_info;
	int ret = 0;

	obj.pd.in = pd;
	obj.pd.out = &pd_info;
	ret = mlx5_glue->dv_init_obj(&obj, MLX5DV_OBJ_PD);
	if (ret) {
		DRV_LOG(DEBUG, "Failed to get PD object info");
		return ret;
	}
	*pdn = pd_info.pdn;
	return 0;
}
#endif /* HAVE_IBV_FLOW_DV_SUPPORT */

static int
mlx5_config_doorbell_mapping_env(const struct mlx5_dev_config *config)
{
	char *env;
	int value;

	MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
	/* Get environment variable to store. */
	env = getenv(MLX5_SHUT_UP_BF);
	value = env ? !!strcmp(env, "0") : MLX5_ARG_UNSET;
	if (config->dbnc == MLX5_ARG_UNSET)
		setenv(MLX5_SHUT_UP_BF, MLX5_SHUT_UP_BF_DEFAULT, 1);
	else
		setenv(MLX5_SHUT_UP_BF,
		       config->dbnc == MLX5_TXDB_NCACHED ? "1" : "0", 1);
	return value;
}

static void
mlx5_restore_doorbell_mapping_env(int value)
{
	MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
	/* Restore the original environment variable state. */
	if (value == MLX5_ARG_UNSET)
		unsetenv(MLX5_SHUT_UP_BF);
	else
		setenv(MLX5_SHUT_UP_BF, value ? "1" : "0", 1);
}
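/*
 * Illustrative pairing of the two helpers above (this is how
 * mlx5_alloc_shared_ibctx() below uses them, shown here as a sketch):
 *
 *   int dbmap_env = mlx5_config_doorbell_mapping_env(config);
 *
 *   ... call mlx5_glue->dv_open_device()/open_device(), which makes
 *   rdma-core read MLX5_SHUT_UP_BF and latch the doorbell mapping ...
 *
 *   mlx5_restore_doorbell_mapping_env(dbmap_env);
 *
 * The saved value is MLX5_ARG_UNSET when the variable was not present, so
 * the restore step can recreate the exact prior state.
 */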
/**
 * Allocate the shared IB device context. If there is a multiport device,
 * the master and representors will share this context; if there is a
 * single-port dedicated IB device, the context is used by that port only,
 * for the sake of unification.
 *
 * The routine first searches the list of contexts for the specified IB
 * device name. If found, the shared context is assumed and its reference
 * counter is incremented. If no context is found, a new one is created and
 * initialized with the specified IB device context and parameters.
 *
 * @param[in] spawn
 *   Pointer to the IB device attributes (name, port, etc).
 * @param[in] config
 *   Pointer to device configuration structure.
 *
 * @return
 *   Pointer to mlx5_ibv_shared object on success,
 *   otherwise NULL and rte_errno is set.
 */
static struct mlx5_ibv_shared *
mlx5_alloc_shared_ibctx(const struct mlx5_dev_spawn_data *spawn,
			const struct mlx5_dev_config *config)
{
	struct mlx5_ibv_shared *sh;
	int dbmap_env;
	int err = 0;
	uint32_t i;
#ifdef HAVE_IBV_FLOW_DV_SUPPORT
	struct mlx5_devx_tis_attr tis_attr = { 0 };
#endif

	MLX5_ASSERT(spawn);
	/* Secondary process should not create the shared context. */
	MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
	pthread_mutex_lock(&mlx5_ibv_list_mutex);
	/* Search for IB context by device name. */
	LIST_FOREACH(sh, &mlx5_ibv_list, next) {
		if (!strcmp(sh->ibdev_name, spawn->ibv_dev->name)) {
			sh->refcnt++;
			goto exit;
		}
	}
	/* No device found, we have to create new shared context. */
	MLX5_ASSERT(spawn->max_port);
	sh = rte_zmalloc("ethdev shared ib context",
			 sizeof(struct mlx5_ibv_shared) +
			 spawn->max_port *
			 sizeof(struct mlx5_ibv_shared_port),
			 RTE_CACHE_LINE_SIZE);
	if (!sh) {
		DRV_LOG(ERR, "shared context allocation failure");
		rte_errno = ENOMEM;
		goto exit;
	}
	/*
	 * Configure environment variable "MLX5_SHUT_UP_BF"
	 * before the device creation. The rdma_core library
	 * checks the variable at device creation and
	 * stores the result internally.
	 */
	dbmap_env = mlx5_config_doorbell_mapping_env(config);
	/* Try to open IB device with DV first, then usual Verbs. */
	errno = 0;
	sh->ctx = mlx5_glue->dv_open_device(spawn->ibv_dev);
	if (sh->ctx) {
		sh->devx = 1;
		DRV_LOG(DEBUG, "DevX is supported");
		/* The device is created, no need for environment. */
		mlx5_restore_doorbell_mapping_env(dbmap_env);
	} else {
		/* The environment variable is still configured. */
		sh->ctx = mlx5_glue->open_device(spawn->ibv_dev);
		err = errno ? errno : ENODEV;
		/*
		 * The environment variable is not needed anymore,
		 * all device creation attempts are completed.
		 */
		mlx5_restore_doorbell_mapping_env(dbmap_env);
		if (!sh->ctx)
			goto error;
		DRV_LOG(DEBUG, "DevX is NOT supported");
	}
	err = mlx5_glue->query_device_ex(sh->ctx, NULL, &sh->device_attr);
	if (err) {
		DRV_LOG(DEBUG, "ibv_query_device_ex() failed");
		goto error;
	}
	sh->refcnt = 1;
	sh->max_port = spawn->max_port;
	strncpy(sh->ibdev_name, sh->ctx->device->name,
		sizeof(sh->ibdev_name));
	strncpy(sh->ibdev_path, sh->ctx->device->ibdev_path,
		sizeof(sh->ibdev_path));
	pthread_mutex_init(&sh->intr_mutex, NULL);
	/*
	 * Setting port_id to the maximum unallowed value means
	 * there is no interrupt subhandler installed for
	 * the given port index i.
	 */
	for (i = 0; i < sh->max_port; i++) {
		sh->port[i].ih_port_id = RTE_MAX_ETHPORTS;
		sh->port[i].devx_ih_port_id = RTE_MAX_ETHPORTS;
	}
	sh->pd = mlx5_glue->alloc_pd(sh->ctx);
	if (sh->pd == NULL) {
		DRV_LOG(ERR, "PD allocation failure");
		err = ENOMEM;
		goto error;
	}
#ifdef HAVE_IBV_FLOW_DV_SUPPORT
	if (sh->devx) {
		err = mlx5_get_pdn(sh->pd, &sh->pdn);
		if (err) {
			DRV_LOG(ERR, "Failed to extract pdn from PD");
			goto error;
		}
		sh->td = mlx5_devx_cmd_create_td(sh->ctx);
		if (!sh->td) {
			DRV_LOG(ERR, "TD allocation failure");
			err = ENOMEM;
			goto error;
		}
		tis_attr.transport_domain = sh->td->id;
		sh->tis = mlx5_devx_cmd_create_tis(sh->ctx, &tis_attr);
		if (!sh->tis) {
			DRV_LOG(ERR, "TIS allocation failure");
			err = ENOMEM;
			goto error;
		}
	}
	sh->flow_id_pool = mlx5_flow_id_pool_alloc(UINT32_MAX);
	if (!sh->flow_id_pool) {
		DRV_LOG(ERR, "can't create flow id pool");
		err = ENOMEM;
		goto error;
	}
#endif /* HAVE_IBV_FLOW_DV_SUPPORT */
	/*
	 * Once the device is added to the list of memory event
	 * callback, its global MR cache table cannot be expanded
	 * on the fly because of deadlock. If it overflows, lookup
	 * should be done by searching MR list linearly, which is slow.
	 *
	 * At this point the device is not added to the memory
	 * event list yet, context is just being created.
	 */
	err = mlx5_mr_btree_init(&sh->mr.cache,
				 MLX5_MR_BTREE_CACHE_N * 2,
				 spawn->pci_dev->device.numa_node);
	if (err) {
		err = rte_errno;
		goto error;
	}
	mlx5_flow_counters_mng_init(sh);
	/* Add device to memory callback list. */
	rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock);
	LIST_INSERT_HEAD(&mlx5_shared_data->mem_event_cb_list,
			 sh, mem_event_cb);
	rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
	/* Add context to the global device list. */
	LIST_INSERT_HEAD(&mlx5_ibv_list, sh, next);
exit:
	pthread_mutex_unlock(&mlx5_ibv_list_mutex);
	return sh;
error:
	pthread_mutex_unlock(&mlx5_ibv_list_mutex);
	MLX5_ASSERT(sh);
	if (sh->tis)
		claim_zero(mlx5_devx_cmd_destroy(sh->tis));
	if (sh->td)
		claim_zero(mlx5_devx_cmd_destroy(sh->td));
	if (sh->pd)
		claim_zero(mlx5_glue->dealloc_pd(sh->pd));
	if (sh->ctx)
		claim_zero(mlx5_glue->close_device(sh->ctx));
	if (sh->flow_id_pool)
		mlx5_flow_id_pool_release(sh->flow_id_pool);
	rte_free(sh);
	MLX5_ASSERT(err > 0);
	rte_errno = err;
	return NULL;
}
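/*
 * Illustrative lifetime sketch (not part of the upstream file): each port
 * spawned from the same IB device obtains the shared context once and
 * returns it once, so the reference counter pairs up:
 *
 *   struct mlx5_ibv_shared *sh = mlx5_alloc_shared_ibctx(spawn, config);
 *
 *   if (sh == NULL)
 *           return -rte_errno;
 *   ... use sh->ctx, sh->pd, ... for this port ...
 *   mlx5_free_shared_ibctx(sh);   (see below; resources are only released
 *                                  when the reference count reaches zero)
 */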
/**
 * Free shared IB device context. Decrement counter and if zero free
 * all allocated resources and close handles.
 *
 * @param[in] sh
 *   Pointer to mlx5_ibv_shared object to free
 */
static void
mlx5_free_shared_ibctx(struct mlx5_ibv_shared *sh)
{
	pthread_mutex_lock(&mlx5_ibv_list_mutex);
#ifdef RTE_LIBRTE_MLX5_DEBUG
	/* Check the object presence in the list. */
	struct mlx5_ibv_shared *lctx;

	LIST_FOREACH(lctx, &mlx5_ibv_list, next)
		if (lctx == sh)
			break;
	MLX5_ASSERT(lctx);
	if (lctx != sh) {
		DRV_LOG(ERR, "Freeing non-existing shared IB context");
		goto exit;
	}
#endif
	MLX5_ASSERT(sh);
	MLX5_ASSERT(sh->refcnt);
	/* Secondary process should not free the shared context. */
	MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
	if (--sh->refcnt)
		goto exit;
	/* Remove from memory callback device list. */
	rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock);
	LIST_REMOVE(sh, mem_event_cb);
	rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
	/* Release created Memory Regions. */
	mlx5_mr_release(sh);
	/* Remove context from the global device list. */
	LIST_REMOVE(sh, next);
	/*
	 * Ensure there is no async event handler installed.
	 * Only primary process handles async device events.
	 */
	mlx5_flow_counters_mng_close(sh);
	MLX5_ASSERT(!sh->intr_cnt);
	if (sh->intr_cnt)
		mlx5_intr_callback_unregister
			(&sh->intr_handle, mlx5_dev_interrupt_handler, sh);
#ifdef HAVE_MLX5_DEVX_ASYNC_SUPPORT
	if (sh->devx_intr_cnt) {
		if (sh->intr_handle_devx.fd)
			rte_intr_callback_unregister(&sh->intr_handle_devx,
					  mlx5_dev_interrupt_handler_devx, sh);
		if (sh->devx_comp)
			mlx5dv_devx_destroy_cmd_comp(sh->devx_comp);
	}
#endif
	pthread_mutex_destroy(&sh->intr_mutex);
	if (sh->pd)
		claim_zero(mlx5_glue->dealloc_pd(sh->pd));
	if (sh->tis)
		claim_zero(mlx5_devx_cmd_destroy(sh->tis));
	if (sh->td)
		claim_zero(mlx5_devx_cmd_destroy(sh->td));
	if (sh->ctx)
		claim_zero(mlx5_glue->close_device(sh->ctx));
	if (sh->flow_id_pool)
		mlx5_flow_id_pool_release(sh->flow_id_pool);
	rte_free(sh);
exit:
	pthread_mutex_unlock(&mlx5_ibv_list_mutex);
}

/**
 * Destroy table hash list and all the root entries per domain.
 *
 * @param[in] priv
 *   Pointer to the private device data structure.
 */
static void
mlx5_free_table_hash_list(struct mlx5_priv *priv)
{
	struct mlx5_ibv_shared *sh = priv->sh;
	struct mlx5_flow_tbl_data_entry *tbl_data;
	union mlx5_flow_tbl_key table_key = {
		{
			.table_id = 0,
			.reserved = 0,
			.domain = 0,
			.direction = 0,
		}
	};
	struct mlx5_hlist_entry *pos;

	if (!sh->flow_tbls)
		return;
	pos = mlx5_hlist_lookup(sh->flow_tbls, table_key.v64);
	if (pos) {
		tbl_data = container_of(pos, struct mlx5_flow_tbl_data_entry,
					entry);
		MLX5_ASSERT(tbl_data);
		mlx5_hlist_remove(sh->flow_tbls, pos);
		rte_free(tbl_data);
	}
	table_key.direction = 1;
	pos = mlx5_hlist_lookup(sh->flow_tbls, table_key.v64);
	if (pos) {
		tbl_data = container_of(pos, struct mlx5_flow_tbl_data_entry,
					entry);
		MLX5_ASSERT(tbl_data);
		mlx5_hlist_remove(sh->flow_tbls, pos);
		rte_free(tbl_data);
	}
	table_key.direction = 0;
	table_key.domain = 1;
	pos = mlx5_hlist_lookup(sh->flow_tbls, table_key.v64);
	if (pos) {
		tbl_data = container_of(pos, struct mlx5_flow_tbl_data_entry,
					entry);
		MLX5_ASSERT(tbl_data);
		mlx5_hlist_remove(sh->flow_tbls, pos);
		rte_free(tbl_data);
	}
	mlx5_hlist_destroy(sh->flow_tbls, NULL, NULL);
}
/**
 * Initialize flow table hash list and create the root tables entry
 * for each domain.
 *
 * @param[in] priv
 *   Pointer to the private device data structure.
 *
 * @return
 *   Zero on success, positive error code otherwise.
 */
static int
mlx5_alloc_table_hash_list(struct mlx5_priv *priv)
{
	struct mlx5_ibv_shared *sh = priv->sh;
	char s[MLX5_HLIST_NAMESIZE];
	int err = 0;

	MLX5_ASSERT(sh);
	snprintf(s, sizeof(s), "%s_flow_table", priv->sh->ibdev_name);
	sh->flow_tbls = mlx5_hlist_create(s, MLX5_FLOW_TABLE_HLIST_ARRAY_SIZE);
	if (!sh->flow_tbls) {
		DRV_LOG(ERR, "flow tables with hash creation failed.\n");
		err = ENOMEM;
		return err;
	}
#ifndef HAVE_MLX5DV_DR
	/*
	 * In case we do not have DR support, the zero-index tables should
	 * still be created because DV expects to see them even if they
	 * cannot be created by RDMA-CORE.
	 */
	union mlx5_flow_tbl_key table_key = {
		{
			.table_id = 0,
			.reserved = 0,
			.domain = 0,
			.direction = 0,
		}
	};
	struct mlx5_flow_tbl_data_entry *tbl_data = rte_zmalloc(NULL,
							  sizeof(*tbl_data), 0);

	if (!tbl_data) {
		err = ENOMEM;
		goto error;
	}
	tbl_data->entry.key = table_key.v64;
	err = mlx5_hlist_insert(sh->flow_tbls, &tbl_data->entry);
	if (err)
		goto error;
	rte_atomic32_init(&tbl_data->tbl.refcnt);
	rte_atomic32_inc(&tbl_data->tbl.refcnt);
	table_key.direction = 1;
	tbl_data = rte_zmalloc(NULL, sizeof(*tbl_data), 0);
	if (!tbl_data) {
		err = ENOMEM;
		goto error;
	}
	tbl_data->entry.key = table_key.v64;
	err = mlx5_hlist_insert(sh->flow_tbls, &tbl_data->entry);
	if (err)
		goto error;
	rte_atomic32_init(&tbl_data->tbl.refcnt);
	rte_atomic32_inc(&tbl_data->tbl.refcnt);
	table_key.direction = 0;
	table_key.domain = 1;
	tbl_data = rte_zmalloc(NULL, sizeof(*tbl_data), 0);
	if (!tbl_data) {
		err = ENOMEM;
		goto error;
	}
	tbl_data->entry.key = table_key.v64;
	err = mlx5_hlist_insert(sh->flow_tbls, &tbl_data->entry);
	if (err)
		goto error;
	rte_atomic32_init(&tbl_data->tbl.refcnt);
	rte_atomic32_inc(&tbl_data->tbl.refcnt);
	return err;
error:
	mlx5_free_table_hash_list(priv);
#endif /* HAVE_MLX5DV_DR */
	return err;
}
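/*
 * Illustrative summary (derived from the code above and from
 * mlx5_free_table_hash_list()): exactly three root entries are kept in the
 * flow table hash list, keyed by {domain, direction} with table_id 0:
 *
 *   { .domain = 0, .direction = 0 }
 *   { .domain = 0, .direction = 1 }
 *   { .domain = 1, .direction = 0 }
 *
 * The free routine earlier in this file removes the same three keys, so the
 * two functions must stay in sync.
 */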
/**
 * Initialize DR related data within private structure.
 * Routine checks the reference counter and does actual
 * resources creation/initialization only if counter is zero.
 *
 * @param[in] priv
 *   Pointer to the private device data structure.
 *
 * @return
 *   Zero on success, positive error code otherwise.
 */
static int
mlx5_alloc_shared_dr(struct mlx5_priv *priv)
{
	struct mlx5_ibv_shared *sh = priv->sh;
	char s[MLX5_HLIST_NAMESIZE];
	int err = 0;

	if (!sh->flow_tbls)
		err = mlx5_alloc_table_hash_list(priv);
	else
		DRV_LOG(DEBUG, "sh->flow_tbls[%p] already created, reuse\n",
			(void *)sh->flow_tbls);
	if (err)
		return err;
	/* Create tags hash list table. */
	snprintf(s, sizeof(s), "%s_tags", sh->ibdev_name);
	sh->tag_table = mlx5_hlist_create(s, MLX5_TAGS_HLIST_ARRAY_SIZE);
	if (!sh->tag_table) {
		DRV_LOG(ERR, "tags with hash creation failed.\n");
		err = ENOMEM;
		goto error;
	}
#ifdef HAVE_MLX5DV_DR
	void *domain;

	if (sh->dv_refcnt) {
		/* Shared DV/DR structures are already initialized. */
		sh->dv_refcnt++;
		priv->dr_shared = 1;
		return 0;
	}
	/* Reference counter is zero, we should initialize structures. */
	domain = mlx5_glue->dr_create_domain(sh->ctx,
					     MLX5DV_DR_DOMAIN_TYPE_NIC_RX);
	if (!domain) {
		DRV_LOG(ERR, "ingress mlx5dv_dr_create_domain failed");
		err = errno;
		goto error;
	}
	sh->rx_domain = domain;
	domain = mlx5_glue->dr_create_domain(sh->ctx,
					     MLX5DV_DR_DOMAIN_TYPE_NIC_TX);
	if (!domain) {
		DRV_LOG(ERR, "egress mlx5dv_dr_create_domain failed");
		err = errno;
		goto error;
	}
	pthread_mutex_init(&sh->dv_mutex, NULL);
	sh->tx_domain = domain;
#ifdef HAVE_MLX5DV_DR_ESWITCH
	if (priv->config.dv_esw_en) {
		domain = mlx5_glue->dr_create_domain
			(sh->ctx, MLX5DV_DR_DOMAIN_TYPE_FDB);
		if (!domain) {
			DRV_LOG(ERR, "FDB mlx5dv_dr_create_domain failed");
			err = errno;
			goto error;
		}
		sh->fdb_domain = domain;
		sh->esw_drop_action = mlx5_glue->dr_create_flow_action_drop();
	}
#endif
	sh->pop_vlan_action = mlx5_glue->dr_create_flow_action_pop_vlan();
#endif /* HAVE_MLX5DV_DR */
	sh->dv_refcnt++;
	priv->dr_shared = 1;
	return 0;
error:
	/* Rollback the created objects. */
	if (sh->rx_domain) {
		mlx5_glue->dr_destroy_domain(sh->rx_domain);
		sh->rx_domain = NULL;
	}
	if (sh->tx_domain) {
		mlx5_glue->dr_destroy_domain(sh->tx_domain);
		sh->tx_domain = NULL;
	}
	if (sh->fdb_domain) {
		mlx5_glue->dr_destroy_domain(sh->fdb_domain);
		sh->fdb_domain = NULL;
	}
	if (sh->esw_drop_action) {
		mlx5_glue->destroy_flow_action(sh->esw_drop_action);
		sh->esw_drop_action = NULL;
	}
	if (sh->pop_vlan_action) {
		mlx5_glue->destroy_flow_action(sh->pop_vlan_action);
		sh->pop_vlan_action = NULL;
	}
	if (sh->tag_table) {
		/* Tags must be destroyed with their flows beforehand. */
		mlx5_hlist_destroy(sh->tag_table, NULL, NULL);
		sh->tag_table = NULL;
	}
	mlx5_free_table_hash_list(priv);
	return err;
}

/**
 * Destroy DR related data within private structure.
 *
 * @param[in] priv
 *   Pointer to the private device data structure.
 */
static void
mlx5_free_shared_dr(struct mlx5_priv *priv)
{
	struct mlx5_ibv_shared *sh;

	if (!priv->dr_shared)
		return;
	priv->dr_shared = 0;
	sh = priv->sh;
	MLX5_ASSERT(sh);
#ifdef HAVE_MLX5DV_DR
	MLX5_ASSERT(sh->dv_refcnt);
	if (sh->dv_refcnt && --sh->dv_refcnt)
		return;
	if (sh->rx_domain) {
		mlx5_glue->dr_destroy_domain(sh->rx_domain);
		sh->rx_domain = NULL;
	}
	if (sh->tx_domain) {
		mlx5_glue->dr_destroy_domain(sh->tx_domain);
		sh->tx_domain = NULL;
	}
#ifdef HAVE_MLX5DV_DR_ESWITCH
	if (sh->fdb_domain) {
		mlx5_glue->dr_destroy_domain(sh->fdb_domain);
		sh->fdb_domain = NULL;
	}
	if (sh->esw_drop_action) {
		mlx5_glue->destroy_flow_action(sh->esw_drop_action);
		sh->esw_drop_action = NULL;
	}
#endif
	if (sh->pop_vlan_action) {
		mlx5_glue->destroy_flow_action(sh->pop_vlan_action);
		sh->pop_vlan_action = NULL;
	}
	pthread_mutex_destroy(&sh->dv_mutex);
#endif /* HAVE_MLX5DV_DR */
	if (sh->tag_table) {
		/* Tags must be destroyed with their flows beforehand. */
		mlx5_hlist_destroy(sh->tag_table, NULL, NULL);
		sh->tag_table = NULL;
	}
	mlx5_free_table_hash_list(priv);
}
/**
 * Initialize shared data between primary and secondary process.
 *
 * A memzone is reserved by primary process and secondary processes attach to
 * the memzone.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
static int
mlx5_init_shared_data(void)
{
	const struct rte_memzone *mz;
	int ret = 0;

	rte_spinlock_lock(&mlx5_shared_data_lock);
	if (mlx5_shared_data == NULL) {
		if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
			/* Allocate shared memory. */
			mz = rte_memzone_reserve(MZ_MLX5_PMD_SHARED_DATA,
						 sizeof(*mlx5_shared_data),
						 SOCKET_ID_ANY, 0);
			if (mz == NULL) {
				DRV_LOG(ERR,
					"Cannot allocate mlx5 shared data");
				ret = -rte_errno;
				goto error;
			}
			mlx5_shared_data = mz->addr;
			memset(mlx5_shared_data, 0, sizeof(*mlx5_shared_data));
			rte_spinlock_init(&mlx5_shared_data->lock);
		} else {
			/* Lookup allocated shared memory. */
			mz = rte_memzone_lookup(MZ_MLX5_PMD_SHARED_DATA);
			if (mz == NULL) {
				DRV_LOG(ERR,
					"Cannot attach mlx5 shared data");
				ret = -rte_errno;
				goto error;
			}
			mlx5_shared_data = mz->addr;
			memset(&mlx5_local_data, 0, sizeof(mlx5_local_data));
		}
	}
error:
	rte_spinlock_unlock(&mlx5_shared_data_lock);
	return ret;
}

/**
 * Retrieve integer value from environment variable.
 *
 * @param[in] name
 *   Environment variable name.
 *
 * @return
 *   Integer value, 0 if the variable is not set.
 */
int
mlx5_getenv_int(const char *name)
{
	const char *val = getenv(name);

	if (val == NULL)
		return 0;
	return atoi(val);
}

/**
 * Verbs callback to allocate memory. This function should allocate the space
 * according to the size provided residing inside a huge page.
 * Please note that all allocation must respect the alignment from libmlx5
 * (i.e. currently sysconf(_SC_PAGESIZE)).
 *
 * @param[in] size
 *   The size in bytes of the memory to allocate.
 * @param[in] data
 *   A pointer to the callback data.
 *
 * @return
 *   Allocated buffer, NULL otherwise and rte_errno is set.
 */
static void *
mlx5_alloc_verbs_buf(size_t size, void *data)
{
	struct mlx5_priv *priv = data;
	void *ret;
	size_t alignment = sysconf(_SC_PAGESIZE);
	unsigned int socket = SOCKET_ID_ANY;

	if (priv->verbs_alloc_ctx.type == MLX5_VERBS_ALLOC_TYPE_TX_QUEUE) {
		const struct mlx5_txq_ctrl *ctrl = priv->verbs_alloc_ctx.obj;

		socket = ctrl->socket;
	} else if (priv->verbs_alloc_ctx.type ==
		   MLX5_VERBS_ALLOC_TYPE_RX_QUEUE) {
		const struct mlx5_rxq_ctrl *ctrl = priv->verbs_alloc_ctx.obj;

		socket = ctrl->socket;
	}
	MLX5_ASSERT(data != NULL);
	ret = rte_malloc_socket(__func__, size, alignment, socket);
	if (!ret && size)
		rte_errno = ENOMEM;
	return ret;
}

/**
 * Verbs callback to free memory.
 *
 * @param[in] ptr
 *   A pointer to the memory to free.
 * @param[in] data
 *   A pointer to the callback data.
 */
static void
mlx5_free_verbs_buf(void *ptr, void *data __rte_unused)
{
	MLX5_ASSERT(data != NULL);
	rte_free(ptr);
}
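/*
 * Usage sketch (illustrative only): mlx5_getenv_int() is a thin getenv()
 * wrapper, e.g.
 *
 *   if (mlx5_getenv_int("MLX5_EXAMPLE_KNOB"))      <- hypothetical variable
 *           do_something();
 *
 * The two Verbs callbacks above form an allocator pair intended to steer
 * Verbs object allocations into DPDK hugepage memory on the right NUMA
 * socket; the registration with rdma-core is presumed to happen elsewhere
 * in the driver and is not shown in this section.
 */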
/**
 * DPDK callback to add UDP tunnel port.
 *
 * @param[in] dev
 *   A pointer to eth_dev.
 * @param[in] udp_tunnel
 *   A pointer to UDP tunnel.
 *
 * @return
 *   0 on valid udp ports and tunnels, -ENOTSUP otherwise.
 */
int
mlx5_udp_tunnel_port_add(struct rte_eth_dev *dev __rte_unused,
			 struct rte_eth_udp_tunnel *udp_tunnel)
{
	MLX5_ASSERT(udp_tunnel != NULL);
	if (udp_tunnel->prot_type == RTE_TUNNEL_TYPE_VXLAN &&
	    udp_tunnel->udp_port == 4789)
		return 0;
	if (udp_tunnel->prot_type == RTE_TUNNEL_TYPE_VXLAN_GPE &&
	    udp_tunnel->udp_port == 4790)
		return 0;
	return -ENOTSUP;
}
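/*
 * Illustrative caller-side sketch (not part of the driver): an application
 * reaches the callback above through the generic ethdev API, e.g.
 *
 *   struct rte_eth_udp_tunnel tunnel = {
 *           .udp_port = 4789,
 *           .prot_type = RTE_TUNNEL_TYPE_VXLAN,
 *   };
 *
 *   rte_eth_dev_udp_tunnel_port_add(port_id, &tunnel);
 *
 * Only the default VXLAN (4789) and VXLAN-GPE (4790) ports are accepted;
 * anything else returns -ENOTSUP.
 */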
/**
 * Initialize process private data structure.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
int
mlx5_proc_priv_init(struct rte_eth_dev *dev)
{
	struct mlx5_priv *priv = dev->data->dev_private;
	struct mlx5_proc_priv *ppriv;
	size_t ppriv_size;

	/*
	 * UAR register table follows the process private structure. BlueFlame
	 * registers for Tx queues are stored in the table.
	 */
	ppriv_size =
		sizeof(struct mlx5_proc_priv) + priv->txqs_n * sizeof(void *);
	ppriv = rte_malloc_socket("mlx5_proc_priv", ppriv_size,
				  RTE_CACHE_LINE_SIZE, dev->device->numa_node);
	if (!ppriv) {
		rte_errno = ENOMEM;
		return -rte_errno;
	}
	ppriv->uar_table_sz = ppriv_size;
	dev->process_private = ppriv;
	return 0;
}

/**
 * Un-initialize process private data structure.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 */
static void
mlx5_proc_priv_uninit(struct rte_eth_dev *dev)
{
	if (!dev->process_private)
		return;
	rte_free(dev->process_private);
	dev->process_private = NULL;
}

/**
 * DPDK callback to close the device.
 *
 * Destroy all queues and objects, free memory.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 */
static void
mlx5_dev_close(struct rte_eth_dev *dev)
{
	struct mlx5_priv *priv = dev->data->dev_private;
	unsigned int i;
	int ret;

	DRV_LOG(DEBUG, "port %u closing device \"%s\"",
		dev->data->port_id,
		((priv->sh->ctx != NULL) ? priv->sh->ctx->device->name : ""));
	/* In case mlx5_dev_stop() has not been called. */
	mlx5_dev_interrupt_handler_uninstall(dev);
	mlx5_dev_interrupt_handler_devx_uninstall(dev);
	mlx5_traffic_disable(dev);
	mlx5_flow_flush(dev, NULL);
	mlx5_flow_meter_flush(dev, NULL);
	/* Prevent crashes when queues are still in use. */
	dev->rx_pkt_burst = removed_rx_burst;
	dev->tx_pkt_burst = removed_tx_burst;
	rte_wmb();
	/* Disable datapath on secondary process. */
	mlx5_mp_req_stop_rxtx(dev);
	if (priv->rxqs != NULL) {
		/* XXX race condition if mlx5_rx_burst() is still running. */
		usleep(1000);
		for (i = 0; (i != priv->rxqs_n); ++i)
			mlx5_rxq_release(dev, i);
		priv->rxqs_n = 0;
		priv->rxqs = NULL;
	}
	if (priv->txqs != NULL) {
		/* XXX race condition if mlx5_tx_burst() is still running. */
		usleep(1000);
		for (i = 0; (i != priv->txqs_n); ++i)
			mlx5_txq_release(dev, i);
		priv->txqs_n = 0;
		priv->txqs = NULL;
	}
	mlx5_proc_priv_uninit(dev);
	if (priv->mreg_cp_tbl)
		mlx5_hlist_destroy(priv->mreg_cp_tbl, NULL, NULL);
	mlx5_mprq_free_mp(dev);
	mlx5_free_shared_dr(priv);
	if (priv->rss_conf.rss_key != NULL)
		rte_free(priv->rss_conf.rss_key);
	if (priv->reta_idx != NULL)
		rte_free(priv->reta_idx);
	if (priv->config.vf)
		mlx5_nl_mac_addr_flush(priv->nl_socket_route, mlx5_ifindex(dev),
				       dev->data->mac_addrs,
				       MLX5_MAX_MAC_ADDRESSES, priv->mac_own);
	if (priv->nl_socket_route >= 0)
		close(priv->nl_socket_route);
	if (priv->nl_socket_rdma >= 0)
		close(priv->nl_socket_rdma);
	if (priv->vmwa_context)
		mlx5_vlan_vmwa_exit(priv->vmwa_context);
	if (priv->sh) {
		/*
		 * Free the shared context in last turn, because the cleanup
		 * routines above may use some shared fields, like
		 * mlx5_nl_mac_addr_flush() uses ibdev_path for retrieving
		 * the ifindex if Netlink fails.
		 */
		mlx5_free_shared_ibctx(priv->sh);
		priv->sh = NULL;
	}
	ret = mlx5_hrxq_verify(dev);
	if (ret)
		DRV_LOG(WARNING, "port %u some hash Rx queue still remain",
			dev->data->port_id);
	ret = mlx5_ind_table_obj_verify(dev);
	if (ret)
		DRV_LOG(WARNING, "port %u some indirection table still remain",
			dev->data->port_id);
	ret = mlx5_rxq_obj_verify(dev);
	if (ret)
		DRV_LOG(WARNING, "port %u some Rx queue objects still remain",
			dev->data->port_id);
	ret = mlx5_rxq_verify(dev);
	if (ret)
		DRV_LOG(WARNING, "port %u some Rx queues still remain",
			dev->data->port_id);
	ret = mlx5_txq_obj_verify(dev);
	if (ret)
		DRV_LOG(WARNING, "port %u some Verbs Tx queue still remain",
			dev->data->port_id);
	ret = mlx5_txq_verify(dev);
	if (ret)
		DRV_LOG(WARNING, "port %u some Tx queues still remain",
			dev->data->port_id);
	ret = mlx5_flow_verify(dev);
	if (ret)
		DRV_LOG(WARNING, "port %u some flows still remain",
			dev->data->port_id);
	if (priv->domain_id != RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID) {
		unsigned int c = 0;
		uint16_t port_id;

		MLX5_ETH_FOREACH_DEV(port_id, priv->pci_dev) {
			struct mlx5_priv *opriv =
				rte_eth_devices[port_id].data->dev_private;

			if (!opriv ||
			    opriv->domain_id != priv->domain_id ||
			    &rte_eth_devices[port_id] == dev)
				continue;
			++c;
			break;
		}
		if (!c)
			claim_zero(rte_eth_switch_domain_free(priv->domain_id));
	}
	memset(priv, 0, sizeof(*priv));
	priv->domain_id = RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID;
	/*
	 * Reset mac_addrs to NULL such that it is not freed as part of
	 * rte_eth_dev_release_port(). mac_addrs is part of dev_private so
	 * it is freed when dev_private is freed.
	 */
	dev->data->mac_addrs = NULL;
}
const struct eth_dev_ops mlx5_dev_ops = {
	.dev_configure = mlx5_dev_configure,
	.dev_start = mlx5_dev_start,
	.dev_stop = mlx5_dev_stop,
	.dev_set_link_down = mlx5_set_link_down,
	.dev_set_link_up = mlx5_set_link_up,
	.dev_close = mlx5_dev_close,
	.promiscuous_enable = mlx5_promiscuous_enable,
	.promiscuous_disable = mlx5_promiscuous_disable,
	.allmulticast_enable = mlx5_allmulticast_enable,
	.allmulticast_disable = mlx5_allmulticast_disable,
	.link_update = mlx5_link_update,
	.stats_get = mlx5_stats_get,
	.stats_reset = mlx5_stats_reset,
	.xstats_get = mlx5_xstats_get,
	.xstats_reset = mlx5_xstats_reset,
	.xstats_get_names = mlx5_xstats_get_names,
	.fw_version_get = mlx5_fw_version_get,
	.dev_infos_get = mlx5_dev_infos_get,
	.read_clock = mlx5_read_clock,
	.dev_supported_ptypes_get = mlx5_dev_supported_ptypes_get,
	.vlan_filter_set = mlx5_vlan_filter_set,
	.rx_queue_setup = mlx5_rx_queue_setup,
	.rx_hairpin_queue_setup = mlx5_rx_hairpin_queue_setup,
	.tx_queue_setup = mlx5_tx_queue_setup,
	.tx_hairpin_queue_setup = mlx5_tx_hairpin_queue_setup,
	.rx_queue_release = mlx5_rx_queue_release,
	.tx_queue_release = mlx5_tx_queue_release,
	.flow_ctrl_get = mlx5_dev_get_flow_ctrl,
	.flow_ctrl_set = mlx5_dev_set_flow_ctrl,
	.mac_addr_remove = mlx5_mac_addr_remove,
	.mac_addr_add = mlx5_mac_addr_add,
	.mac_addr_set = mlx5_mac_addr_set,
	.set_mc_addr_list = mlx5_set_mc_addr_list,
	.mtu_set = mlx5_dev_set_mtu,
	.vlan_strip_queue_set = mlx5_vlan_strip_queue_set,
	.vlan_offload_set = mlx5_vlan_offload_set,
	.reta_update = mlx5_dev_rss_reta_update,
	.reta_query = mlx5_dev_rss_reta_query,
	.rss_hash_update = mlx5_rss_hash_update,
	.rss_hash_conf_get = mlx5_rss_hash_conf_get,
	.filter_ctrl = mlx5_dev_filter_ctrl,
	.rx_descriptor_status = mlx5_rx_descriptor_status,
	.tx_descriptor_status = mlx5_tx_descriptor_status,
	.rxq_info_get = mlx5_rxq_info_get,
	.txq_info_get = mlx5_txq_info_get,
	.rx_burst_mode_get = mlx5_rx_burst_mode_get,
	.tx_burst_mode_get = mlx5_tx_burst_mode_get,
	.rx_queue_count = mlx5_rx_queue_count,
	.rx_queue_intr_enable = mlx5_rx_intr_enable,
	.rx_queue_intr_disable = mlx5_rx_intr_disable,
	.is_removed = mlx5_is_removed,
	.udp_tunnel_port_add = mlx5_udp_tunnel_port_add,
	.get_module_info = mlx5_get_module_info,
	.get_module_eeprom = mlx5_get_module_eeprom,
	.hairpin_cap_get = mlx5_hairpin_cap_get,
	.mtr_ops_get = mlx5_flow_meter_ops_get,
};

/* Available operations from secondary process. */
static const struct eth_dev_ops mlx5_dev_sec_ops = {
	.stats_get = mlx5_stats_get,
	.stats_reset = mlx5_stats_reset,
	.xstats_get = mlx5_xstats_get,
	.xstats_reset = mlx5_xstats_reset,
	.xstats_get_names = mlx5_xstats_get_names,
	.fw_version_get = mlx5_fw_version_get,
	.dev_infos_get = mlx5_dev_infos_get,
	.rx_descriptor_status = mlx5_rx_descriptor_status,
	.tx_descriptor_status = mlx5_tx_descriptor_status,
	.rxq_info_get = mlx5_rxq_info_get,
	.txq_info_get = mlx5_txq_info_get,
	.rx_burst_mode_get = mlx5_rx_burst_mode_get,
	.tx_burst_mode_get = mlx5_tx_burst_mode_get,
	.get_module_info = mlx5_get_module_info,
	.get_module_eeprom = mlx5_get_module_eeprom,
};
/* Available operations in flow isolated mode. */
const struct eth_dev_ops mlx5_dev_ops_isolate = {
	.dev_configure = mlx5_dev_configure,
	.dev_start = mlx5_dev_start,
	.dev_stop = mlx5_dev_stop,
	.dev_set_link_down = mlx5_set_link_down,
	.dev_set_link_up = mlx5_set_link_up,
	.dev_close = mlx5_dev_close,
	.promiscuous_enable = mlx5_promiscuous_enable,
	.promiscuous_disable = mlx5_promiscuous_disable,
	.allmulticast_enable = mlx5_allmulticast_enable,
	.allmulticast_disable = mlx5_allmulticast_disable,
	.link_update = mlx5_link_update,
	.stats_get = mlx5_stats_get,
	.stats_reset = mlx5_stats_reset,
	.xstats_get = mlx5_xstats_get,
	.xstats_reset = mlx5_xstats_reset,
	.xstats_get_names = mlx5_xstats_get_names,
	.fw_version_get = mlx5_fw_version_get,
	.dev_infos_get = mlx5_dev_infos_get,
	.dev_supported_ptypes_get = mlx5_dev_supported_ptypes_get,
	.vlan_filter_set = mlx5_vlan_filter_set,
	.rx_queue_setup = mlx5_rx_queue_setup,
	.rx_hairpin_queue_setup = mlx5_rx_hairpin_queue_setup,
	.tx_queue_setup = mlx5_tx_queue_setup,
	.tx_hairpin_queue_setup = mlx5_tx_hairpin_queue_setup,
	.rx_queue_release = mlx5_rx_queue_release,
	.tx_queue_release = mlx5_tx_queue_release,
	.flow_ctrl_get = mlx5_dev_get_flow_ctrl,
	.flow_ctrl_set = mlx5_dev_set_flow_ctrl,
	.mac_addr_remove = mlx5_mac_addr_remove,
	.mac_addr_add = mlx5_mac_addr_add,
	.mac_addr_set = mlx5_mac_addr_set,
	.set_mc_addr_list = mlx5_set_mc_addr_list,
	.mtu_set = mlx5_dev_set_mtu,
	.vlan_strip_queue_set = mlx5_vlan_strip_queue_set,
	.vlan_offload_set = mlx5_vlan_offload_set,
	.filter_ctrl = mlx5_dev_filter_ctrl,
	.rx_descriptor_status = mlx5_rx_descriptor_status,
	.tx_descriptor_status = mlx5_tx_descriptor_status,
	.rxq_info_get = mlx5_rxq_info_get,
	.txq_info_get = mlx5_txq_info_get,
	.rx_burst_mode_get = mlx5_rx_burst_mode_get,
	.tx_burst_mode_get = mlx5_tx_burst_mode_get,
	.rx_queue_intr_enable = mlx5_rx_intr_enable,
	.rx_queue_intr_disable = mlx5_rx_intr_disable,
	.is_removed = mlx5_is_removed,
	.get_module_info = mlx5_get_module_info,
	.get_module_eeprom = mlx5_get_module_eeprom,
	.hairpin_cap_get = mlx5_hairpin_cap_get,
	.mtr_ops_get = mlx5_flow_meter_ops_get,
};
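/*
 * Illustrative note (not part of the upstream file): applications opt into
 * the reduced ops table above through the generic rte_flow isolation call,
 * e.g.
 *
 *   struct rte_flow_error error;
 *
 *   rte_flow_isolate(port_id, 1, &error);   before dev_configure/dev_start
 *
 * The switch between mlx5_dev_ops and mlx5_dev_ops_isolate is performed by
 * the flow isolation handler elsewhere in the driver and is only assumed
 * here for context.
 */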
/**
 * Verify and store value for device argument.
 *
 * @param[in] key
 *   Key argument to verify.
 * @param[in] val
 *   Value associated with key.
 * @param opaque
 *   User data.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
static int
mlx5_args_check(const char *key, const char *val, void *opaque)
{
	struct mlx5_dev_config *config = opaque;
	unsigned long tmp;

	/* No-op, port representors are processed in mlx5_dev_spawn(). */
	if (!strcmp(MLX5_REPRESENTOR, key))
		return 0;
	errno = 0;
	tmp = strtoul(val, NULL, 0);
	if (errno) {
		rte_errno = errno;
		DRV_LOG(WARNING, "%s: \"%s\" is not a valid integer", key, val);
		return -rte_errno;
	}
	if (strcmp(MLX5_RXQ_CQE_COMP_EN, key) == 0) {
		config->cqe_comp = !!tmp;
	} else if (strcmp(MLX5_RXQ_CQE_PAD_EN, key) == 0) {
		config->cqe_pad = !!tmp;
	} else if (strcmp(MLX5_RXQ_PKT_PAD_EN, key) == 0) {
		config->hw_padding = !!tmp;
	} else if (strcmp(MLX5_RX_MPRQ_EN, key) == 0) {
		config->mprq.enabled = !!tmp;
	} else if (strcmp(MLX5_RX_MPRQ_LOG_STRIDE_NUM, key) == 0) {
		config->mprq.stride_num_n = tmp;
	} else if (strcmp(MLX5_RX_MPRQ_MAX_MEMCPY_LEN, key) == 0) {
		config->mprq.max_memcpy_len = tmp;
	} else if (strcmp(MLX5_RXQS_MIN_MPRQ, key) == 0) {
		config->mprq.min_rxqs_num = tmp;
	} else if (strcmp(MLX5_TXQ_INLINE, key) == 0) {
		DRV_LOG(WARNING, "%s: deprecated parameter,"
				 " converted to txq_inline_max", key);
		config->txq_inline_max = tmp;
	} else if (strcmp(MLX5_TXQ_INLINE_MAX, key) == 0) {
		config->txq_inline_max = tmp;
	} else if (strcmp(MLX5_TXQ_INLINE_MIN, key) == 0) {
		config->txq_inline_min = tmp;
	} else if (strcmp(MLX5_TXQ_INLINE_MPW, key) == 0) {
		config->txq_inline_mpw = tmp;
	} else if (strcmp(MLX5_TXQS_MIN_INLINE, key) == 0) {
		config->txqs_inline = tmp;
	} else if (strcmp(MLX5_TXQS_MAX_VEC, key) == 0) {
		DRV_LOG(WARNING, "%s: deprecated parameter, ignored", key);
	} else if (strcmp(MLX5_TXQ_MPW_EN, key) == 0) {
		config->mps = !!tmp;
	} else if (strcmp(MLX5_TX_DB_NC, key) == 0) {
		if (tmp != MLX5_TXDB_CACHED &&
		    tmp != MLX5_TXDB_NCACHED &&
		    tmp != MLX5_TXDB_HEURISTIC) {
			DRV_LOG(ERR, "invalid Tx doorbell "
				     "mapping parameter");
			rte_errno = EINVAL;
			return -rte_errno;
		}
		config->dbnc = tmp;
	} else if (strcmp(MLX5_TXQ_MPW_HDR_DSEG_EN, key) == 0) {
		DRV_LOG(WARNING, "%s: deprecated parameter, ignored", key);
	} else if (strcmp(MLX5_TXQ_MAX_INLINE_LEN, key) == 0) {
		DRV_LOG(WARNING, "%s: deprecated parameter,"
				 " converted to txq_inline_mpw", key);
		config->txq_inline_mpw = tmp;
	} else if (strcmp(MLX5_TX_VEC_EN, key) == 0) {
		DRV_LOG(WARNING, "%s: deprecated parameter, ignored", key);
	} else if (strcmp(MLX5_RX_VEC_EN, key) == 0) {
		config->rx_vec_en = !!tmp;
	} else if (strcmp(MLX5_L3_VXLAN_EN, key) == 0) {
		config->l3_vxlan_en = !!tmp;
	} else if (strcmp(MLX5_VF_NL_EN, key) == 0) {
		config->vf_nl_en = !!tmp;
	} else if (strcmp(MLX5_DV_ESW_EN, key) == 0) {
		config->dv_esw_en = !!tmp;
	} else if (strcmp(MLX5_DV_FLOW_EN, key) == 0) {
		config->dv_flow_en = !!tmp;
	} else if (strcmp(MLX5_DV_XMETA_EN, key) == 0) {
		if (tmp != MLX5_XMETA_MODE_LEGACY &&
		    tmp != MLX5_XMETA_MODE_META16 &&
		    tmp != MLX5_XMETA_MODE_META32) {
			DRV_LOG(ERR, "invalid extensive "
				     "metadata parameter");
			rte_errno = EINVAL;
			return -rte_errno;
		}
		config->dv_xmeta_en = tmp;
	} else if (strcmp(MLX5_MR_EXT_MEMSEG_EN, key) == 0) {
		config->mr_ext_memseg_en = !!tmp;
	} else if (strcmp(MLX5_MAX_DUMP_FILES_NUM, key) == 0) {
		config->max_dump_files_num = tmp;
	} else if (strcmp(MLX5_LRO_TIMEOUT_USEC, key) == 0) {
		config->lro.timeout = tmp;
	} else if (strcmp(MLX5_CLASS_ARG_NAME, key) == 0) {
		DRV_LOG(DEBUG, "class argument is %s.", val);
	} else {
		DRV_LOG(WARNING, "%s: unknown parameter", key);
		rte_errno = EINVAL;
		return -rte_errno;
	}
	return 0;
}

/**
 * Parse device parameters.
 *
 * @param config
 *   Pointer to device configuration structure.
 * @param devargs
 *   Device arguments structure.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
static int
mlx5_args(struct mlx5_dev_config *config, struct rte_devargs *devargs)
{
	const char **params = (const char *[]){
		MLX5_RXQ_CQE_COMP_EN,
		MLX5_RXQ_CQE_PAD_EN,
		MLX5_RXQ_PKT_PAD_EN,
		MLX5_RX_MPRQ_EN,
		MLX5_RX_MPRQ_LOG_STRIDE_NUM,
		MLX5_RX_MPRQ_MAX_MEMCPY_LEN,
		MLX5_RXQS_MIN_MPRQ,
		MLX5_TXQ_INLINE,
		MLX5_TXQ_INLINE_MIN,
		MLX5_TXQ_INLINE_MAX,
		MLX5_TXQ_INLINE_MPW,
		MLX5_TXQS_MIN_INLINE,
		MLX5_TXQS_MAX_VEC,
		MLX5_TXQ_MPW_EN,
		MLX5_TXQ_MPW_HDR_DSEG_EN,
		MLX5_TXQ_MAX_INLINE_LEN,
		MLX5_TX_DB_NC,
		MLX5_TX_VEC_EN,
		MLX5_RX_VEC_EN,
		MLX5_L3_VXLAN_EN,
		MLX5_VF_NL_EN,
		MLX5_DV_ESW_EN,
		MLX5_DV_FLOW_EN,
		MLX5_DV_XMETA_EN,
		MLX5_MR_EXT_MEMSEG_EN,
		MLX5_REPRESENTOR,
		MLX5_MAX_DUMP_FILES_NUM,
		MLX5_LRO_TIMEOUT_USEC,
		MLX5_CLASS_ARG_NAME,
		NULL,
	};
	struct rte_kvargs *kvlist;
	int ret = 0;
	int i;

	if (devargs == NULL)
		return 0;
	/* Following UGLY cast is done to pass checkpatch. */
	kvlist = rte_kvargs_parse(devargs->args, params);
	if (kvlist == NULL) {
		rte_errno = EINVAL;
		return -rte_errno;
	}
	/* Process parameters. */
	for (i = 0; (params[i] != NULL); ++i) {
		if (rte_kvargs_count(kvlist, params[i])) {
			ret = rte_kvargs_process(kvlist, params[i],
						 mlx5_args_check, config);
			if (ret) {
				rte_errno = EINVAL;
				rte_kvargs_free(kvlist);
				return -rte_errno;
			}
		}
	}
	rte_kvargs_free(kvlist);
	return 0;
}
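/*
 * Processing sketch (illustrative only): given a devargs string such as
 * "txq_inline_max=128,tx_db_nc=1", rte_kvargs_parse() splits it against the
 * params[] key list above and rte_kvargs_process() then invokes
 * mlx5_args_check("txq_inline_max", "128", config) and
 * mlx5_args_check("tx_db_nc", "1", config), which validate the values and
 * store them into struct mlx5_dev_config. Keys outside params[] are rejected
 * already at the parse step because the list acts as a whitelist.
 */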
static struct rte_pci_driver mlx5_driver;

/**
 * PMD global initialization.
 *
 * Independent from individual device, this function initializes global
 * per-PMD data structures distinguishing primary and secondary processes.
 * Hence, each initialization is called once per process.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
static int
mlx5_init_once(void)
{
	struct mlx5_shared_data *sd;
	struct mlx5_local_data *ld = &mlx5_local_data;
	int ret = 0;

	if (mlx5_init_shared_data())
		return -rte_errno;
	sd = mlx5_shared_data;
	MLX5_ASSERT(sd);
	rte_spinlock_lock(&sd->lock);
	switch (rte_eal_process_type()) {
	case RTE_PROC_PRIMARY:
		if (sd->init_done)
			break;
		LIST_INIT(&sd->mem_event_cb_list);
		rte_rwlock_init(&sd->mem_event_rwlock);
		rte_mem_event_callback_register("MLX5_MEM_EVENT_CB",
						mlx5_mr_mem_event_cb, NULL);
		ret = mlx5_mp_init_primary();
		if (ret)
			goto out;
		sd->init_done = true;
		break;
	case RTE_PROC_SECONDARY:
		if (ld->init_done)
			break;
		ret = mlx5_mp_init_secondary();
		if (ret)
			goto out;
		++sd->secondary_cnt;
		ld->init_done = true;
		break;
	default:
		break;
	}
out:
	rte_spinlock_unlock(&sd->lock);
	return ret;
}

/**
 * Configures the minimal amount of data to inline into WQE
 * while sending packets.
 *
 * - the txq_inline_min devarg has the highest priority, if this
 *   key is specified in devargs
 * - if DevX is enabled the inline mode is queried from the
 *   device (HCA attributes and NIC vport context if needed).
 * - otherwise L2 mode (18 bytes) is assumed for ConnectX-4/4 Lx
 *   and none (0 bytes) for other NICs
 *
 * @param spawn
 *   Verbs device parameters (name, port, switch_info) to spawn.
 * @param config
 *   Device configuration parameters.
 */
static void
mlx5_set_min_inline(struct mlx5_dev_spawn_data *spawn,
		    struct mlx5_dev_config *config)
{
	if (config->txq_inline_min != MLX5_ARG_UNSET) {
		/* Application defines size of inlined data explicitly. */
		switch (spawn->pci_dev->id.device_id) {
		case PCI_DEVICE_ID_MELLANOX_CONNECTX4:
		case PCI_DEVICE_ID_MELLANOX_CONNECTX4VF:
			if (config->txq_inline_min <
				    (int)MLX5_INLINE_HSIZE_L2) {
				DRV_LOG(DEBUG,
					"txq_inline_min aligned to minimal"
					" ConnectX-4 required value %d",
					(int)MLX5_INLINE_HSIZE_L2);
				config->txq_inline_min = MLX5_INLINE_HSIZE_L2;
			}
			break;
		}
		goto exit;
	}
	if (config->hca_attr.eth_net_offloads) {
		/* We have DevX enabled, inline mode queried successfully. */
		switch (config->hca_attr.wqe_inline_mode) {
		case MLX5_CAP_INLINE_MODE_L2:
			/* outer L2 header must be inlined. */
			config->txq_inline_min = MLX5_INLINE_HSIZE_L2;
			goto exit;
		case MLX5_CAP_INLINE_MODE_NOT_REQUIRED:
			/* No inline data are required by NIC. */
			config->txq_inline_min = MLX5_INLINE_HSIZE_NONE;
			config->hw_vlan_insert =
				config->hca_attr.wqe_vlan_insert;
			DRV_LOG(DEBUG, "Tx VLAN insertion is supported");
			goto exit;
		case MLX5_CAP_INLINE_MODE_VPORT_CONTEXT:
			/* inline mode is defined by NIC vport context. */
			if (!config->hca_attr.eth_virt)
				break;
			switch (config->hca_attr.vport_inline_mode) {
			case MLX5_INLINE_MODE_NONE:
				config->txq_inline_min =
					MLX5_INLINE_HSIZE_NONE;
				goto exit;
			case MLX5_INLINE_MODE_L2:
				config->txq_inline_min =
					MLX5_INLINE_HSIZE_L2;
				goto exit;
			case MLX5_INLINE_MODE_IP:
				config->txq_inline_min =
					MLX5_INLINE_HSIZE_L3;
				goto exit;
			case MLX5_INLINE_MODE_TCP_UDP:
				config->txq_inline_min =
					MLX5_INLINE_HSIZE_L4;
				goto exit;
			case MLX5_INLINE_MODE_INNER_L2:
				config->txq_inline_min =
					MLX5_INLINE_HSIZE_INNER_L2;
				goto exit;
			case MLX5_INLINE_MODE_INNER_IP:
				config->txq_inline_min =
					MLX5_INLINE_HSIZE_INNER_L3;
				goto exit;
			case MLX5_INLINE_MODE_INNER_TCP_UDP:
				config->txq_inline_min =
					MLX5_INLINE_HSIZE_INNER_L4;
				goto exit;
			}
		}
	}
	/*
	 * We get here if we are unable to deduce
	 * inline data size with DevX. Try PCI ID
	 * to determine old NICs.
	 */
	switch (spawn->pci_dev->id.device_id) {
	case PCI_DEVICE_ID_MELLANOX_CONNECTX4:
	case PCI_DEVICE_ID_MELLANOX_CONNECTX4VF:
	case PCI_DEVICE_ID_MELLANOX_CONNECTX4LX:
	case PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF:
		config->txq_inline_min = MLX5_INLINE_HSIZE_L2;
		config->hw_vlan_insert = 0;
		break;
	case PCI_DEVICE_ID_MELLANOX_CONNECTX5:
	case PCI_DEVICE_ID_MELLANOX_CONNECTX5VF:
	case PCI_DEVICE_ID_MELLANOX_CONNECTX5EX:
	case PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF:
		/*
		 * These NICs support VLAN insertion from WQE and
		 * report the wqe_vlan_insert flag. But there is a bug
		 * that may break PFC control, so the feature is disabled.
		 */
		config->hw_vlan_insert = 0;
		config->txq_inline_min = MLX5_INLINE_HSIZE_NONE;
		break;
	default:
		config->txq_inline_min = MLX5_INLINE_HSIZE_NONE;
		break;
	}
exit:
	DRV_LOG(DEBUG, "min tx inline configured: %d", config->txq_inline_min);
}

/**
 * Configures the metadata mask fields in the shared context.
 *
 * @param [in] dev
 *   Pointer to Ethernet device.
 */
static void
mlx5_set_metadata_mask(struct rte_eth_dev *dev)
{
	struct mlx5_priv *priv = dev->data->dev_private;
	struct mlx5_ibv_shared *sh = priv->sh;
	uint32_t meta, mark, reg_c0;

	reg_c0 = ~priv->vport_meta_mask;
	switch (priv->config.dv_xmeta_en) {
	case MLX5_XMETA_MODE_LEGACY:
		meta = UINT32_MAX;
		mark = MLX5_FLOW_MARK_MASK;
		break;
	case MLX5_XMETA_MODE_META16:
		meta = reg_c0 >> rte_bsf32(reg_c0);
		mark = MLX5_FLOW_MARK_MASK;
		break;
	case MLX5_XMETA_MODE_META32:
		meta = UINT32_MAX;
		mark = (reg_c0 >> rte_bsf32(reg_c0)) & MLX5_FLOW_MARK_MASK;
		break;
	default:
		meta = 0;
		mark = 0;
		MLX5_ASSERT(false);
		break;
	}
	if (sh->dv_mark_mask && sh->dv_mark_mask != mark)
		DRV_LOG(WARNING, "metadata MARK mask mismatch %08X:%08X",
			sh->dv_mark_mask, mark);
	else
		sh->dv_mark_mask = mark;
	if (sh->dv_meta_mask && sh->dv_meta_mask != meta)
		DRV_LOG(WARNING, "metadata META mask mismatch %08X:%08X",
			sh->dv_meta_mask, meta);
	else
		sh->dv_meta_mask = meta;
	if (sh->dv_regc0_mask && sh->dv_regc0_mask != reg_c0)
		DRV_LOG(WARNING, "metadata reg_c0 mask mismatch %08X:%08X",
			sh->dv_regc0_mask, reg_c0);
	else
		sh->dv_regc0_mask = reg_c0;
	DRV_LOG(DEBUG, "metadata mode %u", priv->config.dv_xmeta_en);
	DRV_LOG(DEBUG, "metadata MARK mask %08X", sh->dv_mark_mask);
	DRV_LOG(DEBUG, "metadata META mask %08X", sh->dv_meta_mask);
	DRV_LOG(DEBUG, "metadata reg_c0 mask %08X", sh->dv_regc0_mask);
}
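/*
 * Worked example for the mask derivation above (illustrative numbers only):
 * assume the kernel/E-Switch reserves the upper half of REG_C_0 for the
 * vport, i.e. priv->vport_meta_mask == 0xFFFF0000. Then:
 *
 *   reg_c0 = ~0xFFFF0000          = 0x0000FFFF   (bits available to the PMD)
 *   META16: meta = reg_c0 >> rte_bsf32(reg_c0) = 0x0000FFFF >> 0 = 0xFFFF
 *   META32: meta = UINT32_MAX, and mark is additionally clipped by
 *           MLX5_FLOW_MARK_MASK.
 *
 * In legacy mode the vport mask is ignored and the full 32-bit META plus the
 * standard MARK mask are used.
 */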
1928 /**
1929  * Find the next available door-bell, allocate new page if needed.
1930  *
1931  * @param [in] dev
1932  *   Pointer to Ethernet device.
1933  * @param [out] dbr_page
1934  *   Door-bell page containing the page data.
1935  *
1936  * @return
1937  *   Door-bell address offset on success, a negative error value otherwise.
1938  */
1939 int64_t
1940 mlx5_get_dbr(struct rte_eth_dev *dev, struct mlx5_devx_dbr_page **dbr_page)
1941 {
1942	struct mlx5_priv *priv = dev->data->dev_private;
1943	struct mlx5_devx_dbr_page *page = NULL;
1944	uint32_t i, j;
1945
1946	LIST_FOREACH(page, &priv->dbrpgs, next)
1947		if (page->dbr_count < MLX5_DBR_PER_PAGE)
1948			break;
1949	if (!page) { /* No page with free door-bell exists. */
1950		page = mlx5_alloc_dbr_page(dev);
1951		if (!page) /* Failed to allocate new page. */
1952			return (-1);
1953		LIST_INSERT_HEAD(&priv->dbrpgs, page, next);
1954	}
1955	/* Loop to find bitmap part with clear bit. */
1956	for (i = 0;
1957	     i < MLX5_DBR_BITMAP_SIZE && page->dbr_bitmap[i] == UINT64_MAX;
1958	     i++)
1959		; /* Empty. */
1960	/* Find the first clear bit. */
1961	j = rte_bsf64(~page->dbr_bitmap[i]);
1962	MLX5_ASSERT(i < (MLX5_DBR_PER_PAGE / 64));
1963	page->dbr_bitmap[i] |= (UINT64_C(1) << j);
1964	page->dbr_count++;
1965	*dbr_page = page;
1966	return (((i * 64) + j) * sizeof(uint64_t));
1967 }
1968
1969 /**
1970  * Release a door-bell record.
1971  *
1972  * @param [in] dev
1973  *   Pointer to Ethernet device.
1974  * @param [in] umem_id
1975  *   UMEM ID of page containing the door-bell record to release.
1976  * @param [in] offset
1977  *   Offset of door-bell record in page.
1978  *
1979  * @return
1980  *   0 on success, a negative error value otherwise.
1981  */
1982 int32_t
1983 mlx5_release_dbr(struct rte_eth_dev *dev, uint32_t umem_id, uint64_t offset)
1984 {
1985	struct mlx5_priv *priv = dev->data->dev_private;
1986	struct mlx5_devx_dbr_page *page = NULL;
1987	int ret = 0;
1988
1989	LIST_FOREACH(page, &priv->dbrpgs, next)
1990		/* Find the page this address belongs to. */
1991		if (page->umem->umem_id == umem_id)
1992			break;
1993	if (!page)
1994		return -EINVAL;
1995	page->dbr_count--;
1996	if (!page->dbr_count) {
1997		/* Page not used, free it and remove from list. */
1998		LIST_REMOVE(page, next);
1999		if (page->umem)
2000			ret = -mlx5_glue->devx_umem_dereg(page->umem);
2001		rte_free(page);
2002	} else {
2003		/* Mark in bitmap that this door-bell is not in use. */
2004		offset /= MLX5_DBR_SIZE;
2005		int i = offset / 64;
2006		int j = offset % 64;
2007
2008		page->dbr_bitmap[i] &= ~(UINT64_C(1) << j);
2009	}
2010	return ret;
2011 }
2012
2013 int
2014 rte_pmd_mlx5_get_dyn_flag_names(char *names[], unsigned int n)
2015 {
2016	static const char *const dynf_names[] = {
2017		RTE_PMD_MLX5_FINE_GRANULARITY_INLINE,
2018		RTE_MBUF_DYNFLAG_METADATA_NAME
2019	};
2020	unsigned int i;
2021
2022	if (n < RTE_DIM(dynf_names))
2023		return -ENOMEM;
2024	for (i = 0; i < RTE_DIM(dynf_names); i++) {
2025		if (names[i] == NULL)
2026			return -EINVAL;
2027		strcpy(names[i], dynf_names[i]);
2028	}
2029	return RTE_DIM(dynf_names);
2030 }
2031
2032 /**
2033  * Check sibling device configurations.
2034  *
2035  * Sibling devices sharing the Infiniband device context
2036  * should have compatible configurations. This regards
2037  * representors and bonding slaves.
2038  *
2039  * @param priv
2040  *   Private device descriptor.
2041  * @param config
2042  *   Configuration of the device to be created.
2043  *
2044  * @return
2045  *   0 on success, EINVAL otherwise
2046  */
2047 static int
2048 mlx5_dev_check_sibling_config(struct mlx5_priv *priv,
2049			       struct mlx5_dev_config *config)
2050 {
2051	struct mlx5_ibv_shared *sh = priv->sh;
2052	struct mlx5_dev_config *sh_conf = NULL;
2053	uint16_t port_id;
2054
2055	MLX5_ASSERT(sh);
2056	/* Nothing to compare for the single/first device. */
2057	if (sh->refcnt == 1)
2058		return 0;
2059	/* Find the device with shared context.
*/ 2060 MLX5_ETH_FOREACH_DEV(port_id, priv->pci_dev) { 2061 struct mlx5_priv *opriv = 2062 rte_eth_devices[port_id].data->dev_private; 2063 2064 if (opriv && opriv != priv && opriv->sh == sh) { 2065 sh_conf = &opriv->config; 2066 break; 2067 } 2068 } 2069 if (!sh_conf) 2070 return 0; 2071 if (sh_conf->dv_flow_en ^ config->dv_flow_en) { 2072 DRV_LOG(ERR, "\"dv_flow_en\" configuration mismatch" 2073 " for shared %s context", sh->ibdev_name); 2074 rte_errno = EINVAL; 2075 return rte_errno; 2076 } 2077 if (sh_conf->dv_xmeta_en ^ config->dv_xmeta_en) { 2078 DRV_LOG(ERR, "\"dv_xmeta_en\" configuration mismatch" 2079 " for shared %s context", sh->ibdev_name); 2080 rte_errno = EINVAL; 2081 return rte_errno; 2082 } 2083 return 0; 2084 } 2085 /** 2086 * Spawn an Ethernet device from Verbs information. 2087 * 2088 * @param dpdk_dev 2089 * Backing DPDK device. 2090 * @param spawn 2091 * Verbs device parameters (name, port, switch_info) to spawn. 2092 * @param config 2093 * Device configuration parameters. 2094 * 2095 * @return 2096 * A valid Ethernet device object on success, NULL otherwise and rte_errno 2097 * is set. The following errors are defined: 2098 * 2099 * EBUSY: device is not supposed to be spawned. 2100 * EEXIST: device is already spawned 2101 */ 2102 static struct rte_eth_dev * 2103 mlx5_dev_spawn(struct rte_device *dpdk_dev, 2104 struct mlx5_dev_spawn_data *spawn, 2105 struct mlx5_dev_config config) 2106 { 2107 const struct mlx5_switch_info *switch_info = &spawn->info; 2108 struct mlx5_ibv_shared *sh = NULL; 2109 struct ibv_port_attr port_attr; 2110 struct mlx5dv_context dv_attr = { .comp_mask = 0 }; 2111 struct rte_eth_dev *eth_dev = NULL; 2112 struct mlx5_priv *priv = NULL; 2113 int err = 0; 2114 unsigned int hw_padding = 0; 2115 unsigned int mps; 2116 unsigned int cqe_comp; 2117 unsigned int cqe_pad = 0; 2118 unsigned int tunnel_en = 0; 2119 unsigned int mpls_en = 0; 2120 unsigned int swp = 0; 2121 unsigned int mprq = 0; 2122 unsigned int mprq_min_stride_size_n = 0; 2123 unsigned int mprq_max_stride_size_n = 0; 2124 unsigned int mprq_min_stride_num_n = 0; 2125 unsigned int mprq_max_stride_num_n = 0; 2126 struct rte_ether_addr mac; 2127 char name[RTE_ETH_NAME_MAX_LEN]; 2128 int own_domain_id = 0; 2129 uint16_t port_id; 2130 unsigned int i; 2131 #ifdef HAVE_MLX5DV_DR_DEVX_PORT 2132 struct mlx5dv_devx_port devx_port = { .comp_mask = 0 }; 2133 #endif 2134 2135 /* Determine if this port representor is supposed to be spawned. */ 2136 if (switch_info->representor && dpdk_dev->devargs) { 2137 struct rte_eth_devargs eth_da; 2138 2139 err = rte_eth_devargs_parse(dpdk_dev->devargs->args, ð_da); 2140 if (err) { 2141 rte_errno = -err; 2142 DRV_LOG(ERR, "failed to process device arguments: %s", 2143 strerror(rte_errno)); 2144 return NULL; 2145 } 2146 for (i = 0; i < eth_da.nb_representor_ports; ++i) 2147 if (eth_da.representor_ports[i] == 2148 (uint16_t)switch_info->port_name) 2149 break; 2150 if (i == eth_da.nb_representor_ports) { 2151 rte_errno = EBUSY; 2152 return NULL; 2153 } 2154 } 2155 /* Build device name. */ 2156 if (spawn->pf_bond < 0) { 2157 /* Single device. */ 2158 if (!switch_info->representor) 2159 strlcpy(name, dpdk_dev->name, sizeof(name)); 2160 else 2161 snprintf(name, sizeof(name), "%s_representor_%u", 2162 dpdk_dev->name, switch_info->port_name); 2163 } else { 2164 /* Bonding device. 
*/ 2165 if (!switch_info->representor) 2166 snprintf(name, sizeof(name), "%s_%s", 2167 dpdk_dev->name, spawn->ibv_dev->name); 2168 else 2169 snprintf(name, sizeof(name), "%s_%s_representor_%u", 2170 dpdk_dev->name, spawn->ibv_dev->name, 2171 switch_info->port_name); 2172 } 2173 /* check if the device is already spawned */ 2174 if (rte_eth_dev_get_port_by_name(name, &port_id) == 0) { 2175 rte_errno = EEXIST; 2176 return NULL; 2177 } 2178 DRV_LOG(DEBUG, "naming Ethernet device \"%s\"", name); 2179 if (rte_eal_process_type() == RTE_PROC_SECONDARY) { 2180 eth_dev = rte_eth_dev_attach_secondary(name); 2181 if (eth_dev == NULL) { 2182 DRV_LOG(ERR, "can not attach rte ethdev"); 2183 rte_errno = ENOMEM; 2184 return NULL; 2185 } 2186 eth_dev->device = dpdk_dev; 2187 eth_dev->dev_ops = &mlx5_dev_sec_ops; 2188 err = mlx5_proc_priv_init(eth_dev); 2189 if (err) 2190 return NULL; 2191 /* Receive command fd from primary process */ 2192 err = mlx5_mp_req_verbs_cmd_fd(eth_dev); 2193 if (err < 0) 2194 return NULL; 2195 /* Remap UAR for Tx queues. */ 2196 err = mlx5_tx_uar_init_secondary(eth_dev, err); 2197 if (err) 2198 return NULL; 2199 /* 2200 * Ethdev pointer is still required as input since 2201 * the primary device is not accessible from the 2202 * secondary process. 2203 */ 2204 eth_dev->rx_pkt_burst = mlx5_select_rx_function(eth_dev); 2205 eth_dev->tx_pkt_burst = mlx5_select_tx_function(eth_dev); 2206 return eth_dev; 2207 } 2208 /* 2209 * Some parameters ("tx_db_nc" in particularly) are needed in 2210 * advance to create dv/verbs device context. We proceed the 2211 * devargs here to get ones, and later proceed devargs again 2212 * to override some hardware settings. 2213 */ 2214 err = mlx5_args(&config, dpdk_dev->devargs); 2215 if (err) { 2216 err = rte_errno; 2217 DRV_LOG(ERR, "failed to process device arguments: %s", 2218 strerror(rte_errno)); 2219 goto error; 2220 } 2221 sh = mlx5_alloc_shared_ibctx(spawn, &config); 2222 if (!sh) 2223 return NULL; 2224 config.devx = sh->devx; 2225 #ifdef HAVE_MLX5DV_DR_ACTION_DEST_DEVX_TIR 2226 config.dest_tir = 1; 2227 #endif 2228 #ifdef HAVE_IBV_MLX5_MOD_SWP 2229 dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_SWP; 2230 #endif 2231 /* 2232 * Multi-packet send is supported by ConnectX-4 Lx PF as well 2233 * as all ConnectX-5 devices. 
2234 */ 2235 #ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT 2236 dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS; 2237 #endif 2238 #ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT 2239 dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_STRIDING_RQ; 2240 #endif 2241 mlx5_glue->dv_query_device(sh->ctx, &dv_attr); 2242 if (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED) { 2243 if (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW) { 2244 DRV_LOG(DEBUG, "enhanced MPW is supported"); 2245 mps = MLX5_MPW_ENHANCED; 2246 } else { 2247 DRV_LOG(DEBUG, "MPW is supported"); 2248 mps = MLX5_MPW; 2249 } 2250 } else { 2251 DRV_LOG(DEBUG, "MPW isn't supported"); 2252 mps = MLX5_MPW_DISABLED; 2253 } 2254 #ifdef HAVE_IBV_MLX5_MOD_SWP 2255 if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_SWP) 2256 swp = dv_attr.sw_parsing_caps.sw_parsing_offloads; 2257 DRV_LOG(DEBUG, "SWP support: %u", swp); 2258 #endif 2259 config.swp = !!swp; 2260 #ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT 2261 if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_STRIDING_RQ) { 2262 struct mlx5dv_striding_rq_caps mprq_caps = 2263 dv_attr.striding_rq_caps; 2264 2265 DRV_LOG(DEBUG, "\tmin_single_stride_log_num_of_bytes: %d", 2266 mprq_caps.min_single_stride_log_num_of_bytes); 2267 DRV_LOG(DEBUG, "\tmax_single_stride_log_num_of_bytes: %d", 2268 mprq_caps.max_single_stride_log_num_of_bytes); 2269 DRV_LOG(DEBUG, "\tmin_single_wqe_log_num_of_strides: %d", 2270 mprq_caps.min_single_wqe_log_num_of_strides); 2271 DRV_LOG(DEBUG, "\tmax_single_wqe_log_num_of_strides: %d", 2272 mprq_caps.max_single_wqe_log_num_of_strides); 2273 DRV_LOG(DEBUG, "\tsupported_qpts: %d", 2274 mprq_caps.supported_qpts); 2275 DRV_LOG(DEBUG, "device supports Multi-Packet RQ"); 2276 mprq = 1; 2277 mprq_min_stride_size_n = 2278 mprq_caps.min_single_stride_log_num_of_bytes; 2279 mprq_max_stride_size_n = 2280 mprq_caps.max_single_stride_log_num_of_bytes; 2281 mprq_min_stride_num_n = 2282 mprq_caps.min_single_wqe_log_num_of_strides; 2283 mprq_max_stride_num_n = 2284 mprq_caps.max_single_wqe_log_num_of_strides; 2285 config.mprq.stride_num_n = RTE_MAX(MLX5_MPRQ_STRIDE_NUM_N, 2286 mprq_min_stride_num_n); 2287 } 2288 #endif 2289 if (RTE_CACHE_LINE_SIZE == 128 && 2290 !(dv_attr.flags & MLX5DV_CONTEXT_FLAGS_CQE_128B_COMP)) 2291 cqe_comp = 0; 2292 else 2293 cqe_comp = 1; 2294 config.cqe_comp = cqe_comp; 2295 #ifdef HAVE_IBV_MLX5_MOD_CQE_128B_PAD 2296 /* Whether device supports 128B Rx CQE padding. */ 2297 cqe_pad = RTE_CACHE_LINE_SIZE == 128 && 2298 (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_CQE_128B_PAD); 2299 #endif 2300 #ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT 2301 if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS) { 2302 tunnel_en = ((dv_attr.tunnel_offloads_caps & 2303 MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_VXLAN) && 2304 (dv_attr.tunnel_offloads_caps & 2305 MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_GRE) && 2306 (dv_attr.tunnel_offloads_caps & 2307 MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_GENEVE)); 2308 } 2309 DRV_LOG(DEBUG, "tunnel offloading is %ssupported", 2310 tunnel_en ? "" : "not "); 2311 #else 2312 DRV_LOG(WARNING, 2313 "tunnel offloading disabled due to old OFED/rdma-core version"); 2314 #endif 2315 config.tunnel_en = tunnel_en; 2316 #ifdef HAVE_IBV_DEVICE_MPLS_SUPPORT 2317 mpls_en = ((dv_attr.tunnel_offloads_caps & 2318 MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_CW_MPLS_OVER_GRE) && 2319 (dv_attr.tunnel_offloads_caps & 2320 MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_CW_MPLS_OVER_UDP)); 2321 DRV_LOG(DEBUG, "MPLS over GRE/UDP tunnel offloading is %ssupported", 2322 mpls_en ? 
"" : "not "); 2323 #else 2324 DRV_LOG(WARNING, "MPLS over GRE/UDP tunnel offloading disabled due to" 2325 " old OFED/rdma-core version or firmware configuration"); 2326 #endif 2327 config.mpls_en = mpls_en; 2328 /* Check port status. */ 2329 err = mlx5_glue->query_port(sh->ctx, spawn->ibv_port, &port_attr); 2330 if (err) { 2331 DRV_LOG(ERR, "port query failed: %s", strerror(err)); 2332 goto error; 2333 } 2334 if (port_attr.link_layer != IBV_LINK_LAYER_ETHERNET) { 2335 DRV_LOG(ERR, "port is not configured in Ethernet mode"); 2336 err = EINVAL; 2337 goto error; 2338 } 2339 if (port_attr.state != IBV_PORT_ACTIVE) 2340 DRV_LOG(DEBUG, "port is not active: \"%s\" (%d)", 2341 mlx5_glue->port_state_str(port_attr.state), 2342 port_attr.state); 2343 /* Allocate private eth device data. */ 2344 priv = rte_zmalloc("ethdev private structure", 2345 sizeof(*priv), 2346 RTE_CACHE_LINE_SIZE); 2347 if (priv == NULL) { 2348 DRV_LOG(ERR, "priv allocation failure"); 2349 err = ENOMEM; 2350 goto error; 2351 } 2352 priv->sh = sh; 2353 priv->ibv_port = spawn->ibv_port; 2354 priv->pci_dev = spawn->pci_dev; 2355 priv->mtu = RTE_ETHER_MTU; 2356 #ifndef RTE_ARCH_64 2357 /* Initialize UAR access locks for 32bit implementations. */ 2358 rte_spinlock_init(&priv->uar_lock_cq); 2359 for (i = 0; i < MLX5_UAR_PAGE_NUM_MAX; i++) 2360 rte_spinlock_init(&priv->uar_lock[i]); 2361 #endif 2362 /* Some internal functions rely on Netlink sockets, open them now. */ 2363 priv->nl_socket_rdma = mlx5_nl_init(NETLINK_RDMA); 2364 priv->nl_socket_route = mlx5_nl_init(NETLINK_ROUTE); 2365 priv->representor = !!switch_info->representor; 2366 priv->master = !!switch_info->master; 2367 priv->domain_id = RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID; 2368 priv->vport_meta_tag = 0; 2369 priv->vport_meta_mask = 0; 2370 priv->pf_bond = spawn->pf_bond; 2371 #ifdef HAVE_MLX5DV_DR_DEVX_PORT 2372 /* 2373 * The DevX port query API is implemented. E-Switch may use 2374 * either vport or reg_c[0] metadata register to match on 2375 * vport index. The engaged part of metadata register is 2376 * defined by mask. 2377 */ 2378 if (switch_info->representor || switch_info->master) { 2379 devx_port.comp_mask = MLX5DV_DEVX_PORT_VPORT | 2380 MLX5DV_DEVX_PORT_MATCH_REG_C_0; 2381 err = mlx5_glue->devx_port_query(sh->ctx, spawn->ibv_port, 2382 &devx_port); 2383 if (err) { 2384 DRV_LOG(WARNING, 2385 "can't query devx port %d on device %s", 2386 spawn->ibv_port, spawn->ibv_dev->name); 2387 devx_port.comp_mask = 0; 2388 } 2389 } 2390 if (devx_port.comp_mask & MLX5DV_DEVX_PORT_MATCH_REG_C_0) { 2391 priv->vport_meta_tag = devx_port.reg_c_0.value; 2392 priv->vport_meta_mask = devx_port.reg_c_0.mask; 2393 if (!priv->vport_meta_mask) { 2394 DRV_LOG(ERR, "vport zero mask for port %d" 2395 " on bonding device %s", 2396 spawn->ibv_port, spawn->ibv_dev->name); 2397 err = ENOTSUP; 2398 goto error; 2399 } 2400 if (priv->vport_meta_tag & ~priv->vport_meta_mask) { 2401 DRV_LOG(ERR, "invalid vport tag for port %d" 2402 " on bonding device %s", 2403 spawn->ibv_port, spawn->ibv_dev->name); 2404 err = ENOTSUP; 2405 goto error; 2406 } 2407 } 2408 if (devx_port.comp_mask & MLX5DV_DEVX_PORT_VPORT) { 2409 priv->vport_id = devx_port.vport_num; 2410 } else if (spawn->pf_bond >= 0) { 2411 DRV_LOG(ERR, "can't deduce vport index for port %d" 2412 " on bonding device %s", 2413 spawn->ibv_port, spawn->ibv_dev->name); 2414 err = ENOTSUP; 2415 goto error; 2416 } else { 2417 /* Suppose vport index in compatible way. */ 2418 priv->vport_id = switch_info->representor ? 
2419 switch_info->port_name + 1 : -1; 2420 } 2421 #else 2422 /* 2423 * Kernel/rdma_core support single E-Switch per PF configurations 2424 * only and vport_id field contains the vport index for 2425 * associated VF, which is deduced from representor port name. 2426 * For example, let's have the IB device port 10, it has 2427 * attached network device eth0, which has port name attribute 2428 * pf0vf2, we can deduce the VF number as 2, and set vport index 2429 * as 3 (2+1). This assigning schema should be changed if the 2430 * multiple E-Switch instances per PF configurations or/and PCI 2431 * subfunctions are added. 2432 */ 2433 priv->vport_id = switch_info->representor ? 2434 switch_info->port_name + 1 : -1; 2435 #endif 2436 /* representor_id field keeps the unmodified VF index. */ 2437 priv->representor_id = switch_info->representor ? 2438 switch_info->port_name : -1; 2439 /* 2440 * Look for sibling devices in order to reuse their switch domain 2441 * if any, otherwise allocate one. 2442 */ 2443 MLX5_ETH_FOREACH_DEV(port_id, priv->pci_dev) { 2444 const struct mlx5_priv *opriv = 2445 rte_eth_devices[port_id].data->dev_private; 2446 2447 if (!opriv || 2448 opriv->sh != priv->sh || 2449 opriv->domain_id == 2450 RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID) 2451 continue; 2452 priv->domain_id = opriv->domain_id; 2453 break; 2454 } 2455 if (priv->domain_id == RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID) { 2456 err = rte_eth_switch_domain_alloc(&priv->domain_id); 2457 if (err) { 2458 err = rte_errno; 2459 DRV_LOG(ERR, "unable to allocate switch domain: %s", 2460 strerror(rte_errno)); 2461 goto error; 2462 } 2463 own_domain_id = 1; 2464 } 2465 /* Override some values set by hardware configuration. */ 2466 mlx5_args(&config, dpdk_dev->devargs); 2467 err = mlx5_dev_check_sibling_config(priv, &config); 2468 if (err) 2469 goto error; 2470 config.hw_csum = !!(sh->device_attr.device_cap_flags_ex & 2471 IBV_DEVICE_RAW_IP_CSUM); 2472 DRV_LOG(DEBUG, "checksum offloading is %ssupported", 2473 (config.hw_csum ? "" : "not ")); 2474 #if !defined(HAVE_IBV_DEVICE_COUNTERS_SET_V42) && \ 2475 !defined(HAVE_IBV_DEVICE_COUNTERS_SET_V45) 2476 DRV_LOG(DEBUG, "counters are not supported"); 2477 #endif 2478 #if !defined(HAVE_IBV_FLOW_DV_SUPPORT) || !defined(HAVE_MLX5DV_DR) 2479 if (config.dv_flow_en) { 2480 DRV_LOG(WARNING, "DV flow is not supported"); 2481 config.dv_flow_en = 0; 2482 } 2483 #endif 2484 config.ind_table_max_size = 2485 sh->device_attr.rss_caps.max_rwq_indirection_table_size; 2486 /* 2487 * Remove this check once DPDK supports larger/variable 2488 * indirection tables. 2489 */ 2490 if (config.ind_table_max_size > (unsigned int)ETH_RSS_RETA_SIZE_512) 2491 config.ind_table_max_size = ETH_RSS_RETA_SIZE_512; 2492 DRV_LOG(DEBUG, "maximum Rx indirection table size is %u", 2493 config.ind_table_max_size); 2494 config.hw_vlan_strip = !!(sh->device_attr.raw_packet_caps & 2495 IBV_RAW_PACKET_CAP_CVLAN_STRIPPING); 2496 DRV_LOG(DEBUG, "VLAN stripping is %ssupported", 2497 (config.hw_vlan_strip ? "" : "not ")); 2498 config.hw_fcs_strip = !!(sh->device_attr.raw_packet_caps & 2499 IBV_RAW_PACKET_CAP_SCATTER_FCS); 2500 DRV_LOG(DEBUG, "FCS stripping configuration is %ssupported", 2501 (config.hw_fcs_strip ? 
"" : "not ")); 2502 #if defined(HAVE_IBV_WQ_FLAG_RX_END_PADDING) 2503 hw_padding = !!sh->device_attr.rx_pad_end_addr_align; 2504 #elif defined(HAVE_IBV_WQ_FLAGS_PCI_WRITE_END_PADDING) 2505 hw_padding = !!(sh->device_attr.device_cap_flags_ex & 2506 IBV_DEVICE_PCI_WRITE_END_PADDING); 2507 #endif 2508 if (config.hw_padding && !hw_padding) { 2509 DRV_LOG(DEBUG, "Rx end alignment padding isn't supported"); 2510 config.hw_padding = 0; 2511 } else if (config.hw_padding) { 2512 DRV_LOG(DEBUG, "Rx end alignment padding is enabled"); 2513 } 2514 config.tso = (sh->device_attr.tso_caps.max_tso > 0 && 2515 (sh->device_attr.tso_caps.supported_qpts & 2516 (1 << IBV_QPT_RAW_PACKET))); 2517 if (config.tso) 2518 config.tso_max_payload_sz = sh->device_attr.tso_caps.max_tso; 2519 /* 2520 * MPW is disabled by default, while the Enhanced MPW is enabled 2521 * by default. 2522 */ 2523 if (config.mps == MLX5_ARG_UNSET) 2524 config.mps = (mps == MLX5_MPW_ENHANCED) ? MLX5_MPW_ENHANCED : 2525 MLX5_MPW_DISABLED; 2526 else 2527 config.mps = config.mps ? mps : MLX5_MPW_DISABLED; 2528 DRV_LOG(INFO, "%sMPS is %s", 2529 config.mps == MLX5_MPW_ENHANCED ? "enhanced " : 2530 config.mps == MLX5_MPW ? "legacy " : "", 2531 config.mps != MLX5_MPW_DISABLED ? "enabled" : "disabled"); 2532 if (config.cqe_comp && !cqe_comp) { 2533 DRV_LOG(WARNING, "Rx CQE compression isn't supported"); 2534 config.cqe_comp = 0; 2535 } 2536 if (config.cqe_pad && !cqe_pad) { 2537 DRV_LOG(WARNING, "Rx CQE padding isn't supported"); 2538 config.cqe_pad = 0; 2539 } else if (config.cqe_pad) { 2540 DRV_LOG(INFO, "Rx CQE padding is enabled"); 2541 } 2542 if (config.devx) { 2543 priv->counter_fallback = 0; 2544 err = mlx5_devx_cmd_query_hca_attr(sh->ctx, &config.hca_attr); 2545 if (err) { 2546 err = -err; 2547 goto error; 2548 } 2549 if (!config.hca_attr.flow_counters_dump) 2550 priv->counter_fallback = 1; 2551 #ifndef HAVE_IBV_DEVX_ASYNC 2552 priv->counter_fallback = 1; 2553 #endif 2554 if (priv->counter_fallback) 2555 DRV_LOG(INFO, "Use fall-back DV counter management"); 2556 /* Check for LRO support. */ 2557 if (config.dest_tir && config.hca_attr.lro_cap && 2558 config.dv_flow_en) { 2559 /* TBD check tunnel lro caps. */ 2560 config.lro.supported = config.hca_attr.lro_cap; 2561 DRV_LOG(DEBUG, "Device supports LRO"); 2562 /* 2563 * If LRO timeout is not configured by application, 2564 * use the minimal supported value. 2565 */ 2566 if (!config.lro.timeout) 2567 config.lro.timeout = 2568 config.hca_attr.lro_timer_supported_periods[0]; 2569 DRV_LOG(DEBUG, "LRO session timeout set to %d usec", 2570 config.lro.timeout); 2571 } 2572 #if defined(HAVE_MLX5DV_DR) && defined(HAVE_MLX5_DR_CREATE_ACTION_FLOW_METER) 2573 if (config.hca_attr.qos.sup && config.hca_attr.qos.srtcm_sup && 2574 config.dv_flow_en) { 2575 uint8_t reg_c_mask = 2576 config.hca_attr.qos.flow_meter_reg_c_ids; 2577 /* 2578 * Meter needs two REG_C's for color match and pre-sfx 2579 * flow match. Here get the REG_C for color match. 2580 * REG_C_0 and REG_C_1 is reserved for metadata feature. 
2581 */ 2582 reg_c_mask &= 0xfc; 2583 if (__builtin_popcount(reg_c_mask) < 1) { 2584 priv->mtr_en = 0; 2585 DRV_LOG(WARNING, "No available register for" 2586 " meter."); 2587 } else { 2588 priv->mtr_color_reg = ffs(reg_c_mask) - 1 + 2589 REG_C_0; 2590 priv->mtr_en = 1; 2591 priv->mtr_reg_share = 2592 config.hca_attr.qos.flow_meter_reg_share; 2593 DRV_LOG(DEBUG, "The REG_C meter uses is %d", 2594 priv->mtr_color_reg); 2595 } 2596 } 2597 #endif 2598 } 2599 if (config.mprq.enabled && mprq) { 2600 if (config.mprq.stride_num_n > mprq_max_stride_num_n || 2601 config.mprq.stride_num_n < mprq_min_stride_num_n) { 2602 config.mprq.stride_num_n = 2603 RTE_MAX(MLX5_MPRQ_STRIDE_NUM_N, 2604 mprq_min_stride_num_n); 2605 DRV_LOG(WARNING, 2606 "the number of strides" 2607 " for Multi-Packet RQ is out of range," 2608 " setting default value (%u)", 2609 1 << config.mprq.stride_num_n); 2610 } 2611 config.mprq.min_stride_size_n = mprq_min_stride_size_n; 2612 config.mprq.max_stride_size_n = mprq_max_stride_size_n; 2613 } else if (config.mprq.enabled && !mprq) { 2614 DRV_LOG(WARNING, "Multi-Packet RQ isn't supported"); 2615 config.mprq.enabled = 0; 2616 } 2617 if (config.max_dump_files_num == 0) 2618 config.max_dump_files_num = 128; 2619 eth_dev = rte_eth_dev_allocate(name); 2620 if (eth_dev == NULL) { 2621 DRV_LOG(ERR, "can not allocate rte ethdev"); 2622 err = ENOMEM; 2623 goto error; 2624 } 2625 /* Flag to call rte_eth_dev_release_port() in rte_eth_dev_close(). */ 2626 eth_dev->data->dev_flags |= RTE_ETH_DEV_CLOSE_REMOVE; 2627 if (priv->representor) { 2628 eth_dev->data->dev_flags |= RTE_ETH_DEV_REPRESENTOR; 2629 eth_dev->data->representor_id = priv->representor_id; 2630 } 2631 /* 2632 * Store associated network device interface index. This index 2633 * is permanent throughout the lifetime of device. So, we may store 2634 * the ifindex here and use the cached value further. 2635 */ 2636 MLX5_ASSERT(spawn->ifindex); 2637 priv->if_index = spawn->ifindex; 2638 eth_dev->data->dev_private = priv; 2639 priv->dev_data = eth_dev->data; 2640 eth_dev->data->mac_addrs = priv->mac; 2641 eth_dev->device = dpdk_dev; 2642 /* Configure the first MAC address by default. */ 2643 if (mlx5_get_mac(eth_dev, &mac.addr_bytes)) { 2644 DRV_LOG(ERR, 2645 "port %u cannot get MAC address, is mlx5_en" 2646 " loaded? (errno: %s)", 2647 eth_dev->data->port_id, strerror(rte_errno)); 2648 err = ENODEV; 2649 goto error; 2650 } 2651 DRV_LOG(INFO, 2652 "port %u MAC address is %02x:%02x:%02x:%02x:%02x:%02x", 2653 eth_dev->data->port_id, 2654 mac.addr_bytes[0], mac.addr_bytes[1], 2655 mac.addr_bytes[2], mac.addr_bytes[3], 2656 mac.addr_bytes[4], mac.addr_bytes[5]); 2657 #ifdef RTE_LIBRTE_MLX5_DEBUG 2658 { 2659 char ifname[IF_NAMESIZE]; 2660 2661 if (mlx5_get_ifname(eth_dev, &ifname) == 0) 2662 DRV_LOG(DEBUG, "port %u ifname is \"%s\"", 2663 eth_dev->data->port_id, ifname); 2664 else 2665 DRV_LOG(DEBUG, "port %u ifname is unknown", 2666 eth_dev->data->port_id); 2667 } 2668 #endif 2669 /* Get actual MTU if possible. */ 2670 err = mlx5_get_mtu(eth_dev, &priv->mtu); 2671 if (err) { 2672 err = rte_errno; 2673 goto error; 2674 } 2675 DRV_LOG(DEBUG, "port %u MTU is %u", eth_dev->data->port_id, 2676 priv->mtu); 2677 /* Initialize burst functions to prevent crashes before link-up. */ 2678 eth_dev->rx_pkt_burst = removed_rx_burst; 2679 eth_dev->tx_pkt_burst = removed_tx_burst; 2680 eth_dev->dev_ops = &mlx5_dev_ops; 2681 /* Register MAC address. 
*/ 2682 claim_zero(mlx5_mac_addr_add(eth_dev, &mac, 0, 0)); 2683 if (config.vf && config.vf_nl_en) 2684 mlx5_nl_mac_addr_sync(priv->nl_socket_route, 2685 mlx5_ifindex(eth_dev), 2686 eth_dev->data->mac_addrs, 2687 MLX5_MAX_MAC_ADDRESSES); 2688 TAILQ_INIT(&priv->flows); 2689 TAILQ_INIT(&priv->ctrl_flows); 2690 TAILQ_INIT(&priv->flow_meters); 2691 TAILQ_INIT(&priv->flow_meter_profiles); 2692 /* Hint libmlx5 to use PMD allocator for data plane resources */ 2693 struct mlx5dv_ctx_allocators alctr = { 2694 .alloc = &mlx5_alloc_verbs_buf, 2695 .free = &mlx5_free_verbs_buf, 2696 .data = priv, 2697 }; 2698 mlx5_glue->dv_set_context_attr(sh->ctx, 2699 MLX5DV_CTX_ATTR_BUF_ALLOCATORS, 2700 (void *)((uintptr_t)&alctr)); 2701 /* Bring Ethernet device up. */ 2702 DRV_LOG(DEBUG, "port %u forcing Ethernet interface up", 2703 eth_dev->data->port_id); 2704 mlx5_set_link_up(eth_dev); 2705 /* 2706 * Even though the interrupt handler is not installed yet, 2707 * interrupts will still trigger on the async_fd from 2708 * Verbs context returned by ibv_open_device(). 2709 */ 2710 mlx5_link_update(eth_dev, 0); 2711 #ifdef HAVE_MLX5DV_DR_ESWITCH 2712 if (!(config.hca_attr.eswitch_manager && config.dv_flow_en && 2713 (switch_info->representor || switch_info->master))) 2714 config.dv_esw_en = 0; 2715 #else 2716 config.dv_esw_en = 0; 2717 #endif 2718 /* Detect minimal data bytes to inline. */ 2719 mlx5_set_min_inline(spawn, &config); 2720 /* Store device configuration on private structure. */ 2721 priv->config = config; 2722 /* Create context for virtual machine VLAN workaround. */ 2723 priv->vmwa_context = mlx5_vlan_vmwa_init(eth_dev, spawn->ifindex); 2724 if (config.dv_flow_en) { 2725 err = mlx5_alloc_shared_dr(priv); 2726 if (err) 2727 goto error; 2728 /* 2729 * RSS id is shared with meter flow id. Meter flow id can only 2730 * use the 24 MSB of the register. 2731 */ 2732 priv->qrss_id_pool = mlx5_flow_id_pool_alloc(UINT32_MAX >> 2733 MLX5_MTR_COLOR_BITS); 2734 if (!priv->qrss_id_pool) { 2735 DRV_LOG(ERR, "can't create flow id pool"); 2736 err = ENOMEM; 2737 goto error; 2738 } 2739 } 2740 /* Supported Verbs flow priority number detection. */ 2741 err = mlx5_flow_discover_priorities(eth_dev); 2742 if (err < 0) { 2743 err = -err; 2744 goto error; 2745 } 2746 priv->config.flow_prio = err; 2747 if (!priv->config.dv_esw_en && 2748 priv->config.dv_xmeta_en != MLX5_XMETA_MODE_LEGACY) { 2749 DRV_LOG(WARNING, "metadata mode %u is not supported " 2750 "(no E-Switch)", priv->config.dv_xmeta_en); 2751 priv->config.dv_xmeta_en = MLX5_XMETA_MODE_LEGACY; 2752 } 2753 mlx5_set_metadata_mask(eth_dev); 2754 if (priv->config.dv_xmeta_en != MLX5_XMETA_MODE_LEGACY && 2755 !priv->sh->dv_regc0_mask) { 2756 DRV_LOG(ERR, "metadata mode %u is not supported " 2757 "(no metadata reg_c[0] is available)", 2758 priv->config.dv_xmeta_en); 2759 err = ENOTSUP; 2760 goto error; 2761 } 2762 /* Query availibility of metadata reg_c's. 
*/ 2763 err = mlx5_flow_discover_mreg_c(eth_dev); 2764 if (err < 0) { 2765 err = -err; 2766 goto error; 2767 } 2768 if (!mlx5_flow_ext_mreg_supported(eth_dev)) { 2769 DRV_LOG(DEBUG, 2770 "port %u extensive metadata register is not supported", 2771 eth_dev->data->port_id); 2772 if (priv->config.dv_xmeta_en != MLX5_XMETA_MODE_LEGACY) { 2773 DRV_LOG(ERR, "metadata mode %u is not supported " 2774 "(no metadata registers available)", 2775 priv->config.dv_xmeta_en); 2776 err = ENOTSUP; 2777 goto error; 2778 } 2779 } 2780 if (priv->config.dv_flow_en && 2781 priv->config.dv_xmeta_en != MLX5_XMETA_MODE_LEGACY && 2782 mlx5_flow_ext_mreg_supported(eth_dev) && 2783 priv->sh->dv_regc0_mask) { 2784 priv->mreg_cp_tbl = mlx5_hlist_create(MLX5_FLOW_MREG_HNAME, 2785 MLX5_FLOW_MREG_HTABLE_SZ); 2786 if (!priv->mreg_cp_tbl) { 2787 err = ENOMEM; 2788 goto error; 2789 } 2790 } 2791 return eth_dev; 2792 error: 2793 if (priv) { 2794 if (priv->mreg_cp_tbl) 2795 mlx5_hlist_destroy(priv->mreg_cp_tbl, NULL, NULL); 2796 if (priv->sh) 2797 mlx5_free_shared_dr(priv); 2798 if (priv->nl_socket_route >= 0) 2799 close(priv->nl_socket_route); 2800 if (priv->nl_socket_rdma >= 0) 2801 close(priv->nl_socket_rdma); 2802 if (priv->vmwa_context) 2803 mlx5_vlan_vmwa_exit(priv->vmwa_context); 2804 if (priv->qrss_id_pool) 2805 mlx5_flow_id_pool_release(priv->qrss_id_pool); 2806 if (own_domain_id) 2807 claim_zero(rte_eth_switch_domain_free(priv->domain_id)); 2808 rte_free(priv); 2809 if (eth_dev != NULL) 2810 eth_dev->data->dev_private = NULL; 2811 } 2812 if (eth_dev != NULL) { 2813 /* mac_addrs must not be freed alone because part of dev_private */ 2814 eth_dev->data->mac_addrs = NULL; 2815 rte_eth_dev_release_port(eth_dev); 2816 } 2817 if (sh) 2818 mlx5_free_shared_ibctx(sh); 2819 MLX5_ASSERT(err > 0); 2820 rte_errno = err; 2821 return NULL; 2822 } 2823 2824 /** 2825 * Comparison callback to sort device data. 2826 * 2827 * This is meant to be used with qsort(). 2828 * 2829 * @param a[in] 2830 * Pointer to pointer to first data object. 2831 * @param b[in] 2832 * Pointer to pointer to second data object. 2833 * 2834 * @return 2835 * 0 if both objects are equal, less than 0 if the first argument is less 2836 * than the second, greater than 0 otherwise. 2837 */ 2838 static int 2839 mlx5_dev_spawn_data_cmp(const void *a, const void *b) 2840 { 2841 const struct mlx5_switch_info *si_a = 2842 &((const struct mlx5_dev_spawn_data *)a)->info; 2843 const struct mlx5_switch_info *si_b = 2844 &((const struct mlx5_dev_spawn_data *)b)->info; 2845 int ret; 2846 2847 /* Master device first. */ 2848 ret = si_b->master - si_a->master; 2849 if (ret) 2850 return ret; 2851 /* Then representor devices. */ 2852 ret = si_b->representor - si_a->representor; 2853 if (ret) 2854 return ret; 2855 /* Unidentified devices come last in no specific order. */ 2856 if (!si_a->representor) 2857 return 0; 2858 /* Order representors by name. */ 2859 return si_a->port_name - si_b->port_name; 2860 } 2861 2862 /** 2863 * Match PCI information for possible slaves of bonding device. 2864 * 2865 * @param[in] ibv_dev 2866 * Pointer to Infiniband device structure. 2867 * @param[in] pci_dev 2868 * Pointer to PCI device structure to match PCI address. 2869 * @param[in] nl_rdma 2870 * Netlink RDMA group socket handle. 2871 * 2872 * @return 2873 * negative value if no bonding device found, otherwise 2874 * positive index of slave PF in bonding. 
2875 */ 2876 static int 2877 mlx5_device_bond_pci_match(const struct ibv_device *ibv_dev, 2878 const struct rte_pci_device *pci_dev, 2879 int nl_rdma) 2880 { 2881 char ifname[IF_NAMESIZE + 1]; 2882 unsigned int ifindex; 2883 unsigned int np, i; 2884 FILE *file = NULL; 2885 int pf = -1; 2886 2887 /* 2888 * Try to get master device name. If something goes 2889 * wrong suppose the lack of kernel support and no 2890 * bonding devices. 2891 */ 2892 if (nl_rdma < 0) 2893 return -1; 2894 if (!strstr(ibv_dev->name, "bond")) 2895 return -1; 2896 np = mlx5_nl_portnum(nl_rdma, ibv_dev->name); 2897 if (!np) 2898 return -1; 2899 /* 2900 * The Master device might not be on the predefined 2901 * port (not on port index 1, it is not garanted), 2902 * we have to scan all Infiniband device port and 2903 * find master. 2904 */ 2905 for (i = 1; i <= np; ++i) { 2906 /* Check whether Infiniband port is populated. */ 2907 ifindex = mlx5_nl_ifindex(nl_rdma, ibv_dev->name, i); 2908 if (!ifindex) 2909 continue; 2910 if (!if_indextoname(ifindex, ifname)) 2911 continue; 2912 /* Try to read bonding slave names from sysfs. */ 2913 MKSTR(slaves, 2914 "/sys/class/net/%s/master/bonding/slaves", ifname); 2915 file = fopen(slaves, "r"); 2916 if (file) 2917 break; 2918 } 2919 if (!file) 2920 return -1; 2921 /* Use safe format to check maximal buffer length. */ 2922 MLX5_ASSERT(atol(RTE_STR(IF_NAMESIZE)) == IF_NAMESIZE); 2923 while (fscanf(file, "%" RTE_STR(IF_NAMESIZE) "s", ifname) == 1) { 2924 char tmp_str[IF_NAMESIZE + 32]; 2925 struct rte_pci_addr pci_addr; 2926 struct mlx5_switch_info info; 2927 2928 /* Process slave interface names in the loop. */ 2929 snprintf(tmp_str, sizeof(tmp_str), 2930 "/sys/class/net/%s", ifname); 2931 if (mlx5_dev_to_pci_addr(tmp_str, &pci_addr)) { 2932 DRV_LOG(WARNING, "can not get PCI address" 2933 " for netdev \"%s\"", ifname); 2934 continue; 2935 } 2936 if (pci_dev->addr.domain != pci_addr.domain || 2937 pci_dev->addr.bus != pci_addr.bus || 2938 pci_dev->addr.devid != pci_addr.devid || 2939 pci_dev->addr.function != pci_addr.function) 2940 continue; 2941 /* Slave interface PCI address match found. */ 2942 fclose(file); 2943 snprintf(tmp_str, sizeof(tmp_str), 2944 "/sys/class/net/%s/phys_port_name", ifname); 2945 file = fopen(tmp_str, "rb"); 2946 if (!file) 2947 break; 2948 info.name_type = MLX5_PHYS_PORT_NAME_TYPE_NOTSET; 2949 if (fscanf(file, "%32s", tmp_str) == 1) 2950 mlx5_translate_port_name(tmp_str, &info); 2951 if (info.name_type == MLX5_PHYS_PORT_NAME_TYPE_LEGACY || 2952 info.name_type == MLX5_PHYS_PORT_NAME_TYPE_UPLINK) 2953 pf = info.port_name; 2954 break; 2955 } 2956 if (file) 2957 fclose(file); 2958 return pf; 2959 } 2960 2961 /** 2962 * DPDK callback to register a PCI device. 2963 * 2964 * This function spawns Ethernet devices out of a given PCI device. 2965 * 2966 * @param[in] pci_drv 2967 * PCI driver structure (mlx5_driver). 2968 * @param[in] pci_dev 2969 * PCI device information. 2970 * 2971 * @return 2972 * 0 on success, a negative errno value otherwise and rte_errno is set. 2973 */ 2974 static int 2975 mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused, 2976 struct rte_pci_device *pci_dev) 2977 { 2978 struct ibv_device **ibv_list; 2979 /* 2980 * Number of found IB Devices matching with requested PCI BDF. 2981 * nd != 1 means there are multiple IB devices over the same 2982 * PCI device and we have representors and master. 2983 */ 2984 unsigned int nd = 0; 2985 /* 2986 * Number of found IB device Ports. 
nd = 1 and np = 1..n means 2987 * we have the single multiport IB device, and there may be 2988 * representors attached to some of found ports. 2989 */ 2990 unsigned int np = 0; 2991 /* 2992 * Number of DPDK ethernet devices to Spawn - either over 2993 * multiple IB devices or multiple ports of single IB device. 2994 * Actually this is the number of iterations to spawn. 2995 */ 2996 unsigned int ns = 0; 2997 /* 2998 * Bonding device 2999 * < 0 - no bonding device (single one) 3000 * >= 0 - bonding device (value is slave PF index) 3001 */ 3002 int bd = -1; 3003 struct mlx5_dev_spawn_data *list = NULL; 3004 struct mlx5_dev_config dev_config; 3005 int ret; 3006 3007 if (mlx5_class_get(pci_dev->device.devargs) != MLX5_CLASS_NET) { 3008 DRV_LOG(DEBUG, "Skip probing - should be probed by other mlx5" 3009 " driver."); 3010 return 1; 3011 } 3012 if (rte_eal_process_type() == RTE_PROC_PRIMARY) 3013 mlx5_pmd_socket_init(); 3014 ret = mlx5_init_once(); 3015 if (ret) { 3016 DRV_LOG(ERR, "unable to init PMD global data: %s", 3017 strerror(rte_errno)); 3018 return -rte_errno; 3019 } 3020 MLX5_ASSERT(pci_drv == &mlx5_driver); 3021 errno = 0; 3022 ibv_list = mlx5_glue->get_device_list(&ret); 3023 if (!ibv_list) { 3024 rte_errno = errno ? errno : ENOSYS; 3025 DRV_LOG(ERR, "cannot list devices, is ib_uverbs loaded?"); 3026 return -rte_errno; 3027 } 3028 /* 3029 * First scan the list of all Infiniband devices to find 3030 * matching ones, gathering into the list. 3031 */ 3032 struct ibv_device *ibv_match[ret + 1]; 3033 int nl_route = mlx5_nl_init(NETLINK_ROUTE); 3034 int nl_rdma = mlx5_nl_init(NETLINK_RDMA); 3035 unsigned int i; 3036 3037 while (ret-- > 0) { 3038 struct rte_pci_addr pci_addr; 3039 3040 DRV_LOG(DEBUG, "checking device \"%s\"", ibv_list[ret]->name); 3041 bd = mlx5_device_bond_pci_match 3042 (ibv_list[ret], pci_dev, nl_rdma); 3043 if (bd >= 0) { 3044 /* 3045 * Bonding device detected. Only one match is allowed, 3046 * the bonding is supported over multi-port IB device, 3047 * there should be no matches on representor PCI 3048 * functions or non VF LAG bonding devices with 3049 * specified address. 3050 */ 3051 if (nd) { 3052 DRV_LOG(ERR, 3053 "multiple PCI match on bonding device" 3054 "\"%s\" found", ibv_list[ret]->name); 3055 rte_errno = ENOENT; 3056 ret = -rte_errno; 3057 goto exit; 3058 } 3059 DRV_LOG(INFO, "PCI information matches for" 3060 " slave %d bonding device \"%s\"", 3061 bd, ibv_list[ret]->name); 3062 ibv_match[nd++] = ibv_list[ret]; 3063 break; 3064 } 3065 if (mlx5_dev_to_pci_addr 3066 (ibv_list[ret]->ibdev_path, &pci_addr)) 3067 continue; 3068 if (pci_dev->addr.domain != pci_addr.domain || 3069 pci_dev->addr.bus != pci_addr.bus || 3070 pci_dev->addr.devid != pci_addr.devid || 3071 pci_dev->addr.function != pci_addr.function) 3072 continue; 3073 DRV_LOG(INFO, "PCI information matches for device \"%s\"", 3074 ibv_list[ret]->name); 3075 ibv_match[nd++] = ibv_list[ret]; 3076 } 3077 ibv_match[nd] = NULL; 3078 if (!nd) { 3079 /* No device matches, just complain and bail out. */ 3080 DRV_LOG(WARNING, 3081 "no Verbs device matches PCI device " PCI_PRI_FMT "," 3082 " are kernel drivers loaded?", 3083 pci_dev->addr.domain, pci_dev->addr.bus, 3084 pci_dev->addr.devid, pci_dev->addr.function); 3085 rte_errno = ENOENT; 3086 ret = -rte_errno; 3087 goto exit; 3088 } 3089 if (nd == 1) { 3090 /* 3091 * Found single matching device may have multiple ports. 3092 * Each port may be representor, we have to check the port 3093 * number and check the representors existence. 
3094 */ 3095 if (nl_rdma >= 0) 3096 np = mlx5_nl_portnum(nl_rdma, ibv_match[0]->name); 3097 if (!np) 3098 DRV_LOG(WARNING, "can not get IB device \"%s\"" 3099 " ports number", ibv_match[0]->name); 3100 if (bd >= 0 && !np) { 3101 DRV_LOG(ERR, "can not get ports" 3102 " for bonding device"); 3103 rte_errno = ENOENT; 3104 ret = -rte_errno; 3105 goto exit; 3106 } 3107 } 3108 #ifndef HAVE_MLX5DV_DR_DEVX_PORT 3109 if (bd >= 0) { 3110 /* 3111 * This may happen if there is VF LAG kernel support and 3112 * application is compiled with older rdma_core library. 3113 */ 3114 DRV_LOG(ERR, 3115 "No kernel/verbs support for VF LAG bonding found."); 3116 rte_errno = ENOTSUP; 3117 ret = -rte_errno; 3118 goto exit; 3119 } 3120 #endif 3121 /* 3122 * Now we can determine the maximal 3123 * amount of devices to be spawned. 3124 */ 3125 list = rte_zmalloc("device spawn data", 3126 sizeof(struct mlx5_dev_spawn_data) * 3127 (np ? np : nd), 3128 RTE_CACHE_LINE_SIZE); 3129 if (!list) { 3130 DRV_LOG(ERR, "spawn data array allocation failure"); 3131 rte_errno = ENOMEM; 3132 ret = -rte_errno; 3133 goto exit; 3134 } 3135 if (bd >= 0 || np > 1) { 3136 /* 3137 * Single IB device with multiple ports found, 3138 * it may be E-Switch master device and representors. 3139 * We have to perform identification trough the ports. 3140 */ 3141 MLX5_ASSERT(nl_rdma >= 0); 3142 MLX5_ASSERT(ns == 0); 3143 MLX5_ASSERT(nd == 1); 3144 MLX5_ASSERT(np); 3145 for (i = 1; i <= np; ++i) { 3146 list[ns].max_port = np; 3147 list[ns].ibv_port = i; 3148 list[ns].ibv_dev = ibv_match[0]; 3149 list[ns].eth_dev = NULL; 3150 list[ns].pci_dev = pci_dev; 3151 list[ns].pf_bond = bd; 3152 list[ns].ifindex = mlx5_nl_ifindex 3153 (nl_rdma, list[ns].ibv_dev->name, i); 3154 if (!list[ns].ifindex) { 3155 /* 3156 * No network interface index found for the 3157 * specified port, it means there is no 3158 * representor on this port. It's OK, 3159 * there can be disabled ports, for example 3160 * if sriov_numvfs < sriov_totalvfs. 3161 */ 3162 continue; 3163 } 3164 ret = -1; 3165 if (nl_route >= 0) 3166 ret = mlx5_nl_switch_info 3167 (nl_route, 3168 list[ns].ifindex, 3169 &list[ns].info); 3170 if (ret || (!list[ns].info.representor && 3171 !list[ns].info.master)) { 3172 /* 3173 * We failed to recognize representors with 3174 * Netlink, let's try to perform the task 3175 * with sysfs. 3176 */ 3177 ret = mlx5_sysfs_switch_info 3178 (list[ns].ifindex, 3179 &list[ns].info); 3180 } 3181 if (!ret && bd >= 0) { 3182 switch (list[ns].info.name_type) { 3183 case MLX5_PHYS_PORT_NAME_TYPE_UPLINK: 3184 if (list[ns].info.port_name == bd) 3185 ns++; 3186 break; 3187 case MLX5_PHYS_PORT_NAME_TYPE_PFVF: 3188 if (list[ns].info.pf_num == bd) 3189 ns++; 3190 break; 3191 default: 3192 break; 3193 } 3194 continue; 3195 } 3196 if (!ret && (list[ns].info.representor ^ 3197 list[ns].info.master)) 3198 ns++; 3199 } 3200 if (!ns) { 3201 DRV_LOG(ERR, 3202 "unable to recognize master/representors" 3203 " on the IB device with multiple ports"); 3204 rte_errno = ENOENT; 3205 ret = -rte_errno; 3206 goto exit; 3207 } 3208 } else { 3209 /* 3210 * The existence of several matching entries (nd > 1) means 3211 * port representors have been instantiated. No existing Verbs 3212 * call nor sysfs entries can tell them apart, this can only 3213 * be done through Netlink calls assuming kernel drivers are 3214 * recent enough to support them. 3215 * 3216 * In the event of identification failure through Netlink, 3217 * try again through sysfs, then: 3218 * 3219 * 1. 
A single IB device matches (nd == 1) with single 3220 * port (np=0/1) and is not a representor, assume 3221 * no switch support. 3222 * 3223 * 2. Otherwise no safe assumptions can be made; 3224 * complain louder and bail out. 3225 */ 3226 np = 1; 3227 for (i = 0; i != nd; ++i) { 3228 memset(&list[ns].info, 0, sizeof(list[ns].info)); 3229 list[ns].max_port = 1; 3230 list[ns].ibv_port = 1; 3231 list[ns].ibv_dev = ibv_match[i]; 3232 list[ns].eth_dev = NULL; 3233 list[ns].pci_dev = pci_dev; 3234 list[ns].pf_bond = -1; 3235 list[ns].ifindex = 0; 3236 if (nl_rdma >= 0) 3237 list[ns].ifindex = mlx5_nl_ifindex 3238 (nl_rdma, list[ns].ibv_dev->name, 1); 3239 if (!list[ns].ifindex) { 3240 char ifname[IF_NAMESIZE]; 3241 3242 /* 3243 * Netlink failed, it may happen with old 3244 * ib_core kernel driver (before 4.16). 3245 * We can assume there is old driver because 3246 * here we are processing single ports IB 3247 * devices. Let's try sysfs to retrieve 3248 * the ifindex. The method works for 3249 * master device only. 3250 */ 3251 if (nd > 1) { 3252 /* 3253 * Multiple devices found, assume 3254 * representors, can not distinguish 3255 * master/representor and retrieve 3256 * ifindex via sysfs. 3257 */ 3258 continue; 3259 } 3260 ret = mlx5_get_master_ifname 3261 (ibv_match[i]->ibdev_path, &ifname); 3262 if (!ret) 3263 list[ns].ifindex = 3264 if_nametoindex(ifname); 3265 if (!list[ns].ifindex) { 3266 /* 3267 * No network interface index found 3268 * for the specified device, it means 3269 * there it is neither representor 3270 * nor master. 3271 */ 3272 continue; 3273 } 3274 } 3275 ret = -1; 3276 if (nl_route >= 0) 3277 ret = mlx5_nl_switch_info 3278 (nl_route, 3279 list[ns].ifindex, 3280 &list[ns].info); 3281 if (ret || (!list[ns].info.representor && 3282 !list[ns].info.master)) { 3283 /* 3284 * We failed to recognize representors with 3285 * Netlink, let's try to perform the task 3286 * with sysfs. 3287 */ 3288 ret = mlx5_sysfs_switch_info 3289 (list[ns].ifindex, 3290 &list[ns].info); 3291 } 3292 if (!ret && (list[ns].info.representor ^ 3293 list[ns].info.master)) { 3294 ns++; 3295 } else if ((nd == 1) && 3296 !list[ns].info.representor && 3297 !list[ns].info.master) { 3298 /* 3299 * Single IB device with 3300 * one physical port and 3301 * attached network device. 3302 * May be SRIOV is not enabled 3303 * or there is no representors. 3304 */ 3305 DRV_LOG(INFO, "no E-Switch support detected"); 3306 ns++; 3307 break; 3308 } 3309 } 3310 if (!ns) { 3311 DRV_LOG(ERR, 3312 "unable to recognize master/representors" 3313 " on the multiple IB devices"); 3314 rte_errno = ENOENT; 3315 ret = -rte_errno; 3316 goto exit; 3317 } 3318 } 3319 MLX5_ASSERT(ns); 3320 /* 3321 * Sort list to probe devices in natural order for users convenience 3322 * (i.e. master first, then representors from lowest to highest ID). 3323 */ 3324 qsort(list, ns, sizeof(*list), mlx5_dev_spawn_data_cmp); 3325 /* Default configuration. */ 3326 dev_config = (struct mlx5_dev_config){ 3327 .hw_padding = 0, 3328 .mps = MLX5_ARG_UNSET, 3329 .dbnc = MLX5_ARG_UNSET, 3330 .rx_vec_en = 1, 3331 .txq_inline_max = MLX5_ARG_UNSET, 3332 .txq_inline_min = MLX5_ARG_UNSET, 3333 .txq_inline_mpw = MLX5_ARG_UNSET, 3334 .txqs_inline = MLX5_ARG_UNSET, 3335 .vf_nl_en = 1, 3336 .mr_ext_memseg_en = 1, 3337 .mprq = { 3338 .enabled = 0, /* Disabled by default. 
*/
3339		.stride_num_n = MLX5_MPRQ_STRIDE_NUM_N,
3340		.max_memcpy_len = MLX5_MPRQ_MEMCPY_DEFAULT_LEN,
3341		.min_rxqs_num = MLX5_MPRQ_MIN_RXQS,
3342		},
3343		.dv_esw_en = 1,
3344		.dv_flow_en = 1,
3345	};
3346	/* Device specific configuration. */
3347	switch (pci_dev->id.device_id) {
3348	case PCI_DEVICE_ID_MELLANOX_CONNECTX4VF:
3349	case PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF:
3350	case PCI_DEVICE_ID_MELLANOX_CONNECTX5VF:
3351	case PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF:
3352	case PCI_DEVICE_ID_MELLANOX_CONNECTX5BFVF:
3353	case PCI_DEVICE_ID_MELLANOX_CONNECTX6VF:
3354	case PCI_DEVICE_ID_MELLANOX_CONNECTX6DXVF:
3355		dev_config.vf = 1;
3356		break;
3357	default:
3358		break;
3359	}
3360	for (i = 0; i != ns; ++i) {
3361		uint32_t restore;
3362
3363		list[i].eth_dev = mlx5_dev_spawn(&pci_dev->device,
3364						 &list[i],
3365						 dev_config);
3366		if (!list[i].eth_dev) {
3367			if (rte_errno != EBUSY && rte_errno != EEXIST)
3368				break;
3369			/* Device is disabled or already spawned. Ignore it. */
3370			continue;
3371		}
3372		restore = list[i].eth_dev->data->dev_flags;
3373		rte_eth_copy_pci_info(list[i].eth_dev, pci_dev);
3374		/* Restore non-PCI flags cleared by the above call. */
3375		list[i].eth_dev->data->dev_flags |= restore;
3376		mlx5_dev_interrupt_handler_devx_install(list[i].eth_dev);
3377		rte_eth_dev_probing_finish(list[i].eth_dev);
3378	}
3379	if (i != ns) {
3380		DRV_LOG(ERR,
3381			"probe of PCI device " PCI_PRI_FMT " aborted after"
3382			" encountering an error: %s",
3383			pci_dev->addr.domain, pci_dev->addr.bus,
3384			pci_dev->addr.devid, pci_dev->addr.function,
3385			strerror(rte_errno));
3386		ret = -rte_errno;
3387		/* Roll back. */
3388		while (i--) {
3389			if (!list[i].eth_dev)
3390				continue;
3391			mlx5_dev_close(list[i].eth_dev);
3392			/* mac_addrs must not be freed, it is part of dev_private. */
3393			list[i].eth_dev->data->mac_addrs = NULL;
3394			claim_zero(rte_eth_dev_release_port(list[i].eth_dev));
3395		}
3396		/* Restore original error. */
3397		rte_errno = -ret;
3398	} else {
3399		ret = 0;
3400	}
3401 exit:
3402	/*
3403	 * Do the routine cleanup:
3404	 * - close opened Netlink sockets
3405	 * - free allocated spawn data array
3406	 * - free the Infiniband device list
3407	 */
3408	if (nl_rdma >= 0)
3409		close(nl_rdma);
3410	if (nl_route >= 0)
3411		close(nl_route);
3412	if (list)
3413		rte_free(list);
3414	MLX5_ASSERT(ibv_list);
3415	mlx5_glue->free_device_list(ibv_list);
3416	return ret;
3417 }
3418
3419 /**
3420  * Look for the Ethernet device belonging to the mlx5 driver.
3421  *
3422  * @param[in] port_id
3423  *   port_id to start looking for device.
3424  * @param[in] pci_dev
3425  *   Pointer to the hint PCI device. While the device is being probed,
3426  *   its siblings (master and preceding representors) might not have
3427  *   a driver assigned yet, because mlx5_pci_probe() has not completed.
3428  *   In this case a match on the hint PCI device may be used to detect
3429  *   the sibling device.
3430  *
3431  * @return
3432  *   port_id of found device, RTE_MAX_ETHPORTS if not found.
3433 */ 3434 uint16_t 3435 mlx5_eth_find_next(uint16_t port_id, struct rte_pci_device *pci_dev) 3436 { 3437 while (port_id < RTE_MAX_ETHPORTS) { 3438 struct rte_eth_dev *dev = &rte_eth_devices[port_id]; 3439 3440 if (dev->state != RTE_ETH_DEV_UNUSED && 3441 dev->device && 3442 (dev->device == &pci_dev->device || 3443 (dev->device->driver && 3444 dev->device->driver->name && 3445 !strcmp(dev->device->driver->name, MLX5_DRIVER_NAME)))) 3446 break; 3447 port_id++; 3448 } 3449 if (port_id >= RTE_MAX_ETHPORTS) 3450 return RTE_MAX_ETHPORTS; 3451 return port_id; 3452 } 3453 3454 /** 3455 * DPDK callback to remove a PCI device. 3456 * 3457 * This function removes all Ethernet devices belong to a given PCI device. 3458 * 3459 * @param[in] pci_dev 3460 * Pointer to the PCI device. 3461 * 3462 * @return 3463 * 0 on success, the function cannot fail. 3464 */ 3465 static int 3466 mlx5_pci_remove(struct rte_pci_device *pci_dev) 3467 { 3468 uint16_t port_id; 3469 3470 RTE_ETH_FOREACH_DEV_OF(port_id, &pci_dev->device) 3471 rte_eth_dev_close(port_id); 3472 return 0; 3473 } 3474 3475 static const struct rte_pci_id mlx5_pci_id_map[] = { 3476 { 3477 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, 3478 PCI_DEVICE_ID_MELLANOX_CONNECTX4) 3479 }, 3480 { 3481 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, 3482 PCI_DEVICE_ID_MELLANOX_CONNECTX4VF) 3483 }, 3484 { 3485 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, 3486 PCI_DEVICE_ID_MELLANOX_CONNECTX4LX) 3487 }, 3488 { 3489 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, 3490 PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF) 3491 }, 3492 { 3493 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, 3494 PCI_DEVICE_ID_MELLANOX_CONNECTX5) 3495 }, 3496 { 3497 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, 3498 PCI_DEVICE_ID_MELLANOX_CONNECTX5VF) 3499 }, 3500 { 3501 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, 3502 PCI_DEVICE_ID_MELLANOX_CONNECTX5EX) 3503 }, 3504 { 3505 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, 3506 PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF) 3507 }, 3508 { 3509 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, 3510 PCI_DEVICE_ID_MELLANOX_CONNECTX5BF) 3511 }, 3512 { 3513 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, 3514 PCI_DEVICE_ID_MELLANOX_CONNECTX5BFVF) 3515 }, 3516 { 3517 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, 3518 PCI_DEVICE_ID_MELLANOX_CONNECTX6) 3519 }, 3520 { 3521 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, 3522 PCI_DEVICE_ID_MELLANOX_CONNECTX6VF) 3523 }, 3524 { 3525 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, 3526 PCI_DEVICE_ID_MELLANOX_CONNECTX6DX) 3527 }, 3528 { 3529 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, 3530 PCI_DEVICE_ID_MELLANOX_CONNECTX6DXVF) 3531 }, 3532 { 3533 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, 3534 PCI_DEVICE_ID_MELLANOX_CONNECTX6DXBF) 3535 }, 3536 { 3537 .vendor_id = 0 3538 } 3539 }; 3540 3541 static struct rte_pci_driver mlx5_driver = { 3542 .driver = { 3543 .name = MLX5_DRIVER_NAME 3544 }, 3545 .id_table = mlx5_pci_id_map, 3546 .probe = mlx5_pci_probe, 3547 .remove = mlx5_pci_remove, 3548 .dma_map = mlx5_dma_map, 3549 .dma_unmap = mlx5_dma_unmap, 3550 .drv_flags = RTE_PCI_DRV_INTR_LSC | RTE_PCI_DRV_INTR_RMV | 3551 RTE_PCI_DRV_PROBE_AGAIN, 3552 }; 3553 3554 /** 3555 * Driver initialization routine. 3556 */ 3557 RTE_INIT(rte_mlx5_pmd_init) 3558 { 3559 /* Initialize driver log type. */ 3560 mlx5_logtype = rte_log_register("pmd.net.mlx5"); 3561 if (mlx5_logtype >= 0) 3562 rte_log_set_level(mlx5_logtype, RTE_LOG_NOTICE); 3563 3564 /* Build the static tables for Verbs conversion. 
*/ 3565 mlx5_set_ptype_table(); 3566 mlx5_set_cksum_table(); 3567 mlx5_set_swp_types_table(); 3568 if (mlx5_glue) 3569 rte_pci_register(&mlx5_driver); 3570 } 3571 3572 RTE_PMD_EXPORT_NAME(net_mlx5, __COUNTER__); 3573 RTE_PMD_REGISTER_PCI_TABLE(net_mlx5, mlx5_pci_id_map); 3574 RTE_PMD_REGISTER_KMOD_DEP(net_mlx5, "* ib_uverbs & mlx5_core & mlx5_ib"); 3575
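/*
 * Editor's note: illustrative, application-side sketch of calling the
 * rte_pmd_mlx5_get_dyn_flag_names() API implemented above. The 64-byte
 * buffers are an assumption large enough for the registered dynamic flag
 * names; the API returns the number of names copied, -ENOMEM when the
 * array is too small and -EINVAL on a NULL entry. The function name is
 * hypothetical and the sketch is not referenced by the driver.
 */
static void __rte_unused
mlx5_dyn_flag_names_sketch(void)
{
	char buf[2][64];
	char *names[] = { buf[0], buf[1] };
	int ret, i;

	ret = rte_pmd_mlx5_get_dyn_flag_names(names, RTE_DIM(names));
	if (ret < 0)
		return;
	for (i = 0; i < ret; i++)
		DRV_LOG(DEBUG, "dynamic flag name: %s", names[i]);
}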
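/*
 * Editor's note: illustrative sketch of the legacy vport index deduction
 * described in mlx5_dev_spawn() for kernels without the DevX port query:
 * a representor whose phys_port_name is "pf0vf2" maps to VF 2 and gets
 * vport index 3 (port_name + 1), while a non-representor gets -1. The
 * helper name is hypothetical and the function is not used by the driver.
 */
static inline int
mlx5_legacy_vport_id_sketch(int representor, int port_name)
{
	return representor ? port_name + 1 : -1;
}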