/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright 2015 6WIND S.A.
 * Copyright 2015 Mellanox Technologies, Ltd
 */

#include <stddef.h>
#include <unistd.h>
#include <string.h>
#include <stdint.h>
#include <stdlib.h>
#include <errno.h>
#include <net/if.h>
#include <sys/mman.h>
#include <linux/rtnetlink.h>

/* Verbs header. */
/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
#ifdef PEDANTIC
#pragma GCC diagnostic ignored "-Wpedantic"
#endif
#include <infiniband/verbs.h>
#ifdef PEDANTIC
#pragma GCC diagnostic error "-Wpedantic"
#endif

#include <rte_malloc.h>
#include <rte_ethdev_driver.h>
#include <rte_ethdev_pci.h>
#include <rte_pci.h>
#include <rte_bus_pci.h>
#include <rte_common.h>
#include <rte_kvargs.h>
#include <rte_rwlock.h>
#include <rte_spinlock.h>
#include <rte_string_fns.h>
#include <rte_alarm.h>

#include <mlx5_glue.h>
#include <mlx5_devx_cmds.h>
#include <mlx5_common.h>
#include <mlx5_common_mp.h>

#include "mlx5_defs.h"
#include "mlx5.h"
#include "mlx5_utils.h"
#include "mlx5_rxtx.h"
#include "mlx5_autoconf.h"
#include "mlx5_mr.h"
#include "mlx5_flow.h"
#include "rte_pmd_mlx5.h"

/* Device parameter to enable RX completion queue compression. */
#define MLX5_RXQ_CQE_COMP_EN "rxq_cqe_comp_en"

/* Device parameter to enable RX completion entry padding to 128B. */
#define MLX5_RXQ_CQE_PAD_EN "rxq_cqe_pad_en"

/* Device parameter to enable padding Rx packet to cacheline size. */
#define MLX5_RXQ_PKT_PAD_EN "rxq_pkt_pad_en"

/* Device parameter to enable Multi-Packet Rx queue. */
#define MLX5_RX_MPRQ_EN "mprq_en"

/* Device parameter to configure log 2 of the number of strides for MPRQ. */
#define MLX5_RX_MPRQ_LOG_STRIDE_NUM "mprq_log_stride_num"

/* Device parameter to configure log 2 of the stride size for MPRQ. */
#define MLX5_RX_MPRQ_LOG_STRIDE_SIZE "mprq_log_stride_size"

/* Device parameter to limit the size of memcpy'd packet for MPRQ. */
#define MLX5_RX_MPRQ_MAX_MEMCPY_LEN "mprq_max_memcpy_len"

/* Device parameter to set the minimum number of Rx queues to enable MPRQ. */
#define MLX5_RXQS_MIN_MPRQ "rxqs_min_mprq"

/* Device parameter to configure inline send. Deprecated, ignored. */
#define MLX5_TXQ_INLINE "txq_inline"

/* Device parameter to limit packet size to inline with ordinary SEND. */
#define MLX5_TXQ_INLINE_MAX "txq_inline_max"

/* Device parameter to configure minimal data size to inline. */
#define MLX5_TXQ_INLINE_MIN "txq_inline_min"

/* Device parameter to limit packet size to inline with Enhanced MPW. */
#define MLX5_TXQ_INLINE_MPW "txq_inline_mpw"

/*
 * Device parameter to configure the number of TX queues threshold for
 * enabling inline send.
 */
#define MLX5_TXQS_MIN_INLINE "txqs_min_inline"

/*
 * Device parameter to configure the number of TX queues threshold for
 * enabling vectorized Tx, deprecated, ignored (no vectorized Tx routines).
 */
#define MLX5_TXQS_MAX_VEC "txqs_max_vec"

/* Device parameter to enable multi-packet send WQEs. */
#define MLX5_TXQ_MPW_EN "txq_mpw_en"

/*
 * Device parameter to force doorbell register mapping
 * to the non-cached region, eliminating the extra write memory barrier.
 */
#define MLX5_TX_DB_NC "tx_db_nc"

/*
 * Device parameter to include 2 dsegs in the title WQEBB.
 * Deprecated, ignored.
 */
#define MLX5_TXQ_MPW_HDR_DSEG_EN "txq_mpw_hdr_dseg_en"

/*
 * Device parameter to limit the size of inlining packet.
 * Deprecated, ignored.
 */
#define MLX5_TXQ_MAX_INLINE_LEN "txq_max_inline_len"

/*
 * Device parameter to enable hardware Tx vector.
 * Deprecated, ignored (no vectorized Tx routines anymore).
 */
#define MLX5_TX_VEC_EN "tx_vec_en"

/* Device parameter to enable hardware Rx vector. */
#define MLX5_RX_VEC_EN "rx_vec_en"

/* Allow L3 VXLAN flow creation. */
#define MLX5_L3_VXLAN_EN "l3_vxlan_en"

/* Activate DV E-Switch flow steering. */
#define MLX5_DV_ESW_EN "dv_esw_en"

/* Activate DV flow steering. */
#define MLX5_DV_FLOW_EN "dv_flow_en"

/* Enable extensive flow metadata support. */
#define MLX5_DV_XMETA_EN "dv_xmeta_en"

/* Activate Netlink support in VF mode. */
#define MLX5_VF_NL_EN "vf_nl_en"

/* Enable extending memsegs when creating a MR. */
#define MLX5_MR_EXT_MEMSEG_EN "mr_ext_memseg_en"

/* Select port representors to instantiate. */
#define MLX5_REPRESENTOR "representor"

/* Device parameter to configure the maximum number of dump files per queue. */
#define MLX5_MAX_DUMP_FILES_NUM "max_dump_files_num"

/* Configure timeout of LRO session (in microseconds). */
#define MLX5_LRO_TIMEOUT_USEC "lro_timeout_usec"

/*
 * Device parameter to configure the total data buffer size for a single
 * hairpin queue (logarithm value).
 */
#define MLX5_HP_BUF_SIZE "hp_buf_log_sz"

#ifndef HAVE_IBV_MLX5_MOD_MPW
#define MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED (1 << 2)
#define MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW (1 << 3)
#endif

#ifndef HAVE_IBV_MLX5_MOD_CQE_128B_COMP
#define MLX5DV_CONTEXT_FLAGS_CQE_128B_COMP (1 << 4)
#endif

static const char *MZ_MLX5_PMD_SHARED_DATA = "mlx5_pmd_shared_data";

/* Shared memory between primary and secondary processes. */
struct mlx5_shared_data *mlx5_shared_data;

/* Spinlock for mlx5_shared_data allocation. */
static rte_spinlock_t mlx5_shared_data_lock = RTE_SPINLOCK_INITIALIZER;

/* Process local data for secondary processes. */
static struct mlx5_local_data mlx5_local_data;

/** Driver-specific log messages type. */
int mlx5_logtype;

/** Data associated with devices to spawn. */
struct mlx5_dev_spawn_data {
	uint32_t ifindex; /**< Network interface index. */
	uint32_t max_port; /**< IB device maximal port index. */
	uint32_t ibv_port; /**< IB device physical port index. */
	int pf_bond; /**< bonding device PF index. < 0 - no bonding */
	struct mlx5_switch_info info; /**< Switch information. */
	struct ibv_device *ibv_dev; /**< Associated IB device. */
	struct rte_eth_dev *eth_dev; /**< Associated Ethernet device. */
	struct rte_pci_device *pci_dev; /**< Backend PCI device. */
};

static LIST_HEAD(, mlx5_ibv_shared) mlx5_ibv_list = LIST_HEAD_INITIALIZER();
static pthread_mutex_t mlx5_ibv_list_mutex = PTHREAD_MUTEX_INITIALIZER;

static struct mlx5_indexed_pool_config mlx5_ipool_cfg[] = {
#ifdef HAVE_IBV_FLOW_DV_SUPPORT
	{
		.size = sizeof(struct mlx5_flow_dv_encap_decap_resource),
		.trunk_size = 64,
		.grow_trunk = 3,
		.grow_shift = 2,
		.need_lock = 0,
		.release_mem_en = 1,
		.malloc = rte_malloc_socket,
		.free = rte_free,
		.type = "mlx5_encap_decap_ipool",
	},
	{
		.size = sizeof(struct mlx5_flow_dv_push_vlan_action_resource),
		.trunk_size = 64,
		.grow_trunk = 3,
		.grow_shift = 2,
		.need_lock = 0,
		.release_mem_en = 1,
		.malloc = rte_malloc_socket,
		.free = rte_free,
		.type = "mlx5_push_vlan_ipool",
	},
	{
		.size = sizeof(struct mlx5_flow_dv_tag_resource),
		.trunk_size = 64,
		.grow_trunk = 3,
		.grow_shift = 2,
		.need_lock = 0,
		.release_mem_en = 1,
		.malloc = rte_malloc_socket,
		.free = rte_free,
		.type = "mlx5_tag_ipool",
	},
	{
		.size = sizeof(struct mlx5_flow_dv_port_id_action_resource),
		.trunk_size = 64,
		.grow_trunk = 3,
		.grow_shift = 2,
		.need_lock = 0,
		.release_mem_en = 1,
		.malloc = rte_malloc_socket,
		.free = rte_free,
		.type = "mlx5_port_id_ipool",
	},
	{
		.size = sizeof(struct mlx5_flow_tbl_data_entry),
		.trunk_size = 64,
		.grow_trunk = 3,
		.grow_shift = 2,
		.need_lock = 0,
		.release_mem_en = 1,
		.malloc = rte_malloc_socket,
		.free = rte_free,
		.type = "mlx5_jump_ipool",
	},
#endif
	{
		.size = sizeof(struct mlx5_flow_meter),
		.trunk_size = 64,
		.grow_trunk = 3,
		.grow_shift = 2,
		.need_lock = 0,
		.release_mem_en = 1,
		.malloc = rte_malloc_socket,
		.free = rte_free,
		.type = "mlx5_meter_ipool",
	},
	{
		.size = sizeof(struct mlx5_flow_mreg_copy_resource),
		.trunk_size = 64,
		.grow_trunk = 3,
		.grow_shift = 2,
		.need_lock = 0,
		.release_mem_en = 1,
		.malloc = rte_malloc_socket,
		.free = rte_free,
		.type = "mlx5_mcp_ipool",
	},
	{
		.size = (sizeof(struct mlx5_hrxq) + MLX5_RSS_HASH_KEY_LEN),
		.trunk_size = 64,
		.grow_trunk = 3,
		.grow_shift = 2,
		.need_lock = 0,
		.release_mem_en = 1,
		.malloc = rte_malloc_socket,
		.free = rte_free,
		.type = "mlx5_hrxq_ipool",
	},
	{
		.size = sizeof(struct mlx5_flow_handle),
		.trunk_size = 64,
		.grow_trunk = 3,
		.grow_shift = 2,
		.need_lock = 0,
		.release_mem_en = 1,
		.malloc = rte_malloc_socket,
		.free = rte_free,
		.type = "mlx5_flow_handle_ipool",
	},
	{
		.size = sizeof(struct rte_flow),
		.trunk_size = 4096,
		.need_lock = 1,
		.release_mem_en = 1,
		.malloc = rte_malloc_socket,
		.free = rte_free,
		.type = "rte_flow_ipool",
	},
};

#define MLX5_FLOW_MIN_ID_POOL_SIZE 512
#define MLX5_ID_GENERATION_ARRAY_FACTOR 16

#define MLX5_FLOW_TABLE_HLIST_ARRAY_SIZE 4096
#define MLX5_TAGS_HLIST_ARRAY_SIZE 8192

/**
 * Allocate ID pool structure.
 *
 * @param[in] max_id
 *   The maximum id can be allocated from the pool.
 *
 * @return
 *   Pointer to pool object, NULL value otherwise.
 */
struct mlx5_flow_id_pool *
mlx5_flow_id_pool_alloc(uint32_t max_id)
{
	struct mlx5_flow_id_pool *pool;
	void *mem;

	pool = rte_zmalloc("id pool allocation", sizeof(*pool),
			   RTE_CACHE_LINE_SIZE);
	if (!pool) {
		DRV_LOG(ERR, "can't allocate id pool");
		rte_errno = ENOMEM;
		return NULL;
	}
	mem = rte_zmalloc("", MLX5_FLOW_MIN_ID_POOL_SIZE * sizeof(uint32_t),
			  RTE_CACHE_LINE_SIZE);
	if (!mem) {
		DRV_LOG(ERR, "can't allocate mem for id pool");
		rte_errno = ENOMEM;
		goto error;
	}
	pool->free_arr = mem;
	pool->curr = pool->free_arr;
	pool->last = pool->free_arr + MLX5_FLOW_MIN_ID_POOL_SIZE;
	pool->base_index = 0;
	pool->max_id = max_id;
	return pool;
error:
	rte_free(pool);
	return NULL;
}

/**
 * Release ID pool structure.
 *
 * @param[in] pool
 *   Pointer to flow id pool object to free.
 */
void
mlx5_flow_id_pool_release(struct mlx5_flow_id_pool *pool)
{
	rte_free(pool->free_arr);
	rte_free(pool);
}

/**
 * Generate ID.
 *
 * @param[in] pool
 *   Pointer to flow id pool.
 * @param[out] id
 *   The generated ID.
 *
 * @return
 *   0 on success, error value otherwise.
 */
uint32_t
mlx5_flow_id_get(struct mlx5_flow_id_pool *pool, uint32_t *id)
{
	if (pool->curr == pool->free_arr) {
		if (pool->base_index == pool->max_id) {
			rte_errno = ENOMEM;
			DRV_LOG(ERR, "no free id");
			return -rte_errno;
		}
		*id = ++pool->base_index;
		return 0;
	}
	*id = *(--pool->curr);
	return 0;
}

/**
 * Release ID.
 *
 * @param[in] pool
 *   Pointer to flow id pool.
 * @param[in] id
 *   The ID to release.
 *
 * @return
 *   0 on success, error value otherwise.
 */
uint32_t
mlx5_flow_id_release(struct mlx5_flow_id_pool *pool, uint32_t id)
{
	uint32_t size;
	uint32_t size2;
	void *mem;

	if (pool->curr == pool->last) {
		size = pool->curr - pool->free_arr;
		size2 = size * MLX5_ID_GENERATION_ARRAY_FACTOR;
		MLX5_ASSERT(size2 > size);
		mem = rte_malloc("", size2 * sizeof(uint32_t), 0);
		if (!mem) {
			DRV_LOG(ERR, "can't allocate mem for id pool");
			rte_errno = ENOMEM;
			return -rte_errno;
		}
		memcpy(mem, pool->free_arr, size * sizeof(uint32_t));
		rte_free(pool->free_arr);
		pool->free_arr = mem;
		pool->curr = pool->free_arr + size;
		pool->last = pool->free_arr + size2;
	}
	*pool->curr = id;
	pool->curr++;
	return 0;
}
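
/*
 * Illustrative usage sketch of the flow ID pool helpers above (added for
 * clarity only, not part of the driver flow; the calling scenario is an
 * assumption of the example):
 *
 *	struct mlx5_flow_id_pool *pool = mlx5_flow_id_pool_alloc(UINT32_MAX);
 *	uint32_t id;
 *
 *	if (pool != NULL && mlx5_flow_id_get(pool, &id) == 0) {
 *		// ... use the allocated id ...
 *		mlx5_flow_id_release(pool, id);
 *	}
 *	if (pool != NULL)
 *		mlx5_flow_id_pool_release(pool);
 */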

/**
 * Initialize the shared aging list information per port.
 *
 * @param[in] sh
 *   Pointer to mlx5_ibv_shared object.
 */
static void
mlx5_flow_aging_init(struct mlx5_ibv_shared *sh)
{
	uint32_t i;
	struct mlx5_age_info *age_info;

	for (i = 0; i < sh->max_port; i++) {
		age_info = &sh->port[i].age_info;
		age_info->flags = 0;
		TAILQ_INIT(&age_info->aged_counters);
		rte_spinlock_init(&age_info->aged_sl);
		MLX5_AGE_SET(age_info, MLX5_AGE_TRIGGER);
	}
}

/**
 * Initialize the counters management structure.
 *
 * @param[in] sh
 *   Pointer to mlx5_ibv_shared object to initialize.
 */
static void
mlx5_flow_counters_mng_init(struct mlx5_ibv_shared *sh)
{
	int i;

	memset(&sh->cmng, 0, sizeof(sh->cmng));
	TAILQ_INIT(&sh->cmng.flow_counters);
	for (i = 0; i < MLX5_CCONT_TYPE_MAX; ++i) {
		TAILQ_INIT(&sh->cmng.ccont[i].pool_list);
		rte_spinlock_init(&sh->cmng.ccont[i].resize_sl);
	}
}

/**
 * Destroy all the resources allocated for a counter memory management.
 *
 * @param[in] mng
 *   Pointer to the memory management structure.
 */
static void
mlx5_flow_destroy_counter_stat_mem_mng(struct mlx5_counter_stats_mem_mng *mng)
{
	uint8_t *mem = (uint8_t *)(uintptr_t)mng->raws[0].data;

	LIST_REMOVE(mng, next);
	claim_zero(mlx5_devx_cmd_destroy(mng->dm));
	claim_zero(mlx5_glue->devx_umem_dereg(mng->umem));
	rte_free(mem);
}

/**
 * Close and release all the resources of the counters management.
 *
 * @param[in] sh
 *   Pointer to mlx5_ibv_shared object to free.
 */
static void
mlx5_flow_counters_mng_close(struct mlx5_ibv_shared *sh)
{
	struct mlx5_counter_stats_mem_mng *mng;
	int i;
	int j;
	int retries = 1024;

	rte_errno = 0;
	while (--retries) {
		rte_eal_alarm_cancel(mlx5_flow_query_alarm, sh);
		if (rte_errno != EINPROGRESS)
			break;
		rte_pause();
	}
	for (i = 0; i < MLX5_CCONT_TYPE_MAX; ++i) {
		struct mlx5_flow_counter_pool *pool;
		uint32_t batch = !!(i > 1);

		if (!sh->cmng.ccont[i].pools)
			continue;
		pool = TAILQ_FIRST(&sh->cmng.ccont[i].pool_list);
		while (pool) {
			if (batch && pool->min_dcs)
				claim_zero(mlx5_devx_cmd_destroy
					   (pool->min_dcs));
			for (j = 0; j < MLX5_COUNTERS_PER_POOL; ++j) {
				if (MLX5_POOL_GET_CNT(pool, j)->action)
					claim_zero
					(mlx5_glue->destroy_flow_action
					 (MLX5_POOL_GET_CNT
					  (pool, j)->action));
				if (!batch && MLX5_GET_POOL_CNT_EXT
				    (pool, j)->dcs)
					claim_zero(mlx5_devx_cmd_destroy
						   (MLX5_GET_POOL_CNT_EXT
						    (pool, j)->dcs));
			}
			TAILQ_REMOVE(&sh->cmng.ccont[i].pool_list, pool, next);
			rte_free(pool);
			pool = TAILQ_FIRST(&sh->cmng.ccont[i].pool_list);
		}
		rte_free(sh->cmng.ccont[i].pools);
	}
	mng = LIST_FIRST(&sh->cmng.mem_mngs);
	while (mng) {
		mlx5_flow_destroy_counter_stat_mem_mng(mng);
		mng = LIST_FIRST(&sh->cmng.mem_mngs);
	}
	memset(&sh->cmng, 0, sizeof(sh->cmng));
}

/**
 * Initialize the flow resources' indexed mempool.
 *
 * @param[in] sh
 *   Pointer to mlx5_ibv_shared object.
 * @param[in] config
 *   Pointer to user dev config.
 */
static void
mlx5_flow_ipool_create(struct mlx5_ibv_shared *sh,
		       const struct mlx5_dev_config *config __rte_unused)
{
	uint8_t i;

#ifdef HAVE_IBV_FLOW_DV_SUPPORT
	/*
	 * If DV is supported but the user selects the Verbs flow mode,
	 * the default mlx5 flow handle size differs from
	 * MLX5_FLOW_HANDLE_VERBS_SIZE, so override it here.
	 */
	if (!config->dv_flow_en)
		mlx5_ipool_cfg[MLX5_IPOOL_MLX5_FLOW].size =
			MLX5_FLOW_HANDLE_VERBS_SIZE;
#endif
	for (i = 0; i < MLX5_IPOOL_MAX; ++i)
		sh->ipool[i] = mlx5_ipool_create(&mlx5_ipool_cfg[i]);
}

/**
 * Release the flow resources' indexed mempool.
 *
 * @param[in] sh
 *   Pointer to mlx5_ibv_shared object.
 */
static void
mlx5_flow_ipool_destroy(struct mlx5_ibv_shared *sh)
{
	uint8_t i;

	for (i = 0; i < MLX5_IPOOL_MAX; ++i)
		mlx5_ipool_destroy(sh->ipool[i]);
}

/**
 * Extract pdn of PD object using DV API.
 *
 * @param[in] pd
 *   Pointer to the verbs PD object.
 * @param[out] pdn
 *   Pointer to the PD object number variable.
 *
 * @return
 *   0 on success, error value otherwise.
 */
#ifdef HAVE_IBV_FLOW_DV_SUPPORT
static int
mlx5_get_pdn(struct ibv_pd *pd __rte_unused, uint32_t *pdn __rte_unused)
{
	struct mlx5dv_obj obj;
	struct mlx5dv_pd pd_info;
	int ret = 0;

	obj.pd.in = pd;
	obj.pd.out = &pd_info;
	ret = mlx5_glue->dv_init_obj(&obj, MLX5DV_OBJ_PD);
	if (ret) {
		DRV_LOG(DEBUG, "Fail to get PD object info");
		return ret;
	}
	*pdn = pd_info.pdn;
	return 0;
}
#endif /* HAVE_IBV_FLOW_DV_SUPPORT */

static int
mlx5_config_doorbell_mapping_env(const struct mlx5_dev_config *config)
{
	char *env;
	int value;

	MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
	/* Get environment variable to store. */
	env = getenv(MLX5_SHUT_UP_BF);
	value = env ? !!strcmp(env, "0") : MLX5_ARG_UNSET;
	if (config->dbnc == MLX5_ARG_UNSET)
		setenv(MLX5_SHUT_UP_BF, MLX5_SHUT_UP_BF_DEFAULT, 1);
	else
		setenv(MLX5_SHUT_UP_BF,
		       config->dbnc == MLX5_TXDB_NCACHED ? "1" : "0", 1);
	return value;
}

static void
mlx5_restore_doorbell_mapping_env(int value)
{
	MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
	/* Restore the original environment variable state. */
	if (value == MLX5_ARG_UNSET)
		unsetenv(MLX5_SHUT_UP_BF);
	else
		setenv(MLX5_SHUT_UP_BF, value ? "1" : "0", 1);
}

/**
 * Allocate the shared IB device context. If there is a multiport device,
 * the master and representors will share this context; if there is a
 * single-port dedicated IB device, the context will be used by only the
 * given port due to unification.
 *
 * The routine first searches the context list for the specified IB device
 * name; if found, the shared context is assumed and its reference counter
 * is incremented. If no context is found, a new one is created and
 * initialized with the specified IB device context and parameters.
 *
 * @param[in] spawn
 *   Pointer to the IB device attributes (name, port, etc).
 * @param[in] config
 *   Pointer to device configuration structure.
 *
 * @return
 *   Pointer to mlx5_ibv_shared object on success,
 *   otherwise NULL and rte_errno is set.
 */
static struct mlx5_ibv_shared *
mlx5_alloc_shared_ibctx(const struct mlx5_dev_spawn_data *spawn,
			const struct mlx5_dev_config *config)
{
	struct mlx5_ibv_shared *sh;
	int dbmap_env;
	int err = 0;
	uint32_t i;
#ifdef HAVE_IBV_FLOW_DV_SUPPORT
	struct mlx5_devx_tis_attr tis_attr = { 0 };
#endif

	MLX5_ASSERT(spawn);
	/* Secondary process should not create the shared context. */
	MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
	pthread_mutex_lock(&mlx5_ibv_list_mutex);
	/* Search for IB context by device name. */
	LIST_FOREACH(sh, &mlx5_ibv_list, next) {
		if (!strcmp(sh->ibdev_name, spawn->ibv_dev->name)) {
			sh->refcnt++;
			goto exit;
		}
	}
	/* No device found, we have to create new shared context. */
	MLX5_ASSERT(spawn->max_port);
	sh = rte_zmalloc("ethdev shared ib context",
			 sizeof(struct mlx5_ibv_shared) +
			 spawn->max_port *
			 sizeof(struct mlx5_ibv_shared_port),
			 RTE_CACHE_LINE_SIZE);
	if (!sh) {
		DRV_LOG(ERR, "shared context allocation failure");
		rte_errno = ENOMEM;
		goto exit;
	}
	/*
	 * Configure environment variable "MLX5_BF_SHUT_UP"
	 * before the device creation. The rdma_core library
	 * checks the variable at device creation and
	 * stores the result internally.
	 */
	dbmap_env = mlx5_config_doorbell_mapping_env(config);
	/* Try to open IB device with DV first, then usual Verbs. */
	errno = 0;
	sh->ctx = mlx5_glue->dv_open_device(spawn->ibv_dev);
	if (sh->ctx) {
		sh->devx = 1;
		DRV_LOG(DEBUG, "DevX is supported");
		/* The device is created, no need for environment. */
		mlx5_restore_doorbell_mapping_env(dbmap_env);
	} else {
		/* The environment variable is still configured. */
		sh->ctx = mlx5_glue->open_device(spawn->ibv_dev);
		err = errno ? errno : ENODEV;
		/*
		 * The environment variable is not needed anymore,
		 * all device creation attempts are completed.
		 */
		mlx5_restore_doorbell_mapping_env(dbmap_env);
		if (!sh->ctx)
			goto error;
		DRV_LOG(DEBUG, "DevX is NOT supported");
	}
	err = mlx5_glue->query_device_ex(sh->ctx, NULL, &sh->device_attr);
	if (err) {
		DRV_LOG(DEBUG, "ibv_query_device_ex() failed");
		goto error;
	}
	sh->refcnt = 1;
	sh->max_port = spawn->max_port;
	strncpy(sh->ibdev_name, sh->ctx->device->name,
		sizeof(sh->ibdev_name));
	strncpy(sh->ibdev_path, sh->ctx->device->ibdev_path,
		sizeof(sh->ibdev_path));
	pthread_mutex_init(&sh->intr_mutex, NULL);
	/*
	 * Setting port_id to max unallowed value means
	 * there is no interrupt subhandler installed for
	 * the given port index i.
	 */
	for (i = 0; i < sh->max_port; i++) {
		sh->port[i].ih_port_id = RTE_MAX_ETHPORTS;
		sh->port[i].devx_ih_port_id = RTE_MAX_ETHPORTS;
	}
	sh->pd = mlx5_glue->alloc_pd(sh->ctx);
	if (sh->pd == NULL) {
		DRV_LOG(ERR, "PD allocation failure");
		err = ENOMEM;
		goto error;
	}
#ifdef HAVE_IBV_FLOW_DV_SUPPORT
	if (sh->devx) {
		err = mlx5_get_pdn(sh->pd, &sh->pdn);
		if (err) {
			DRV_LOG(ERR, "Fail to extract pdn from PD");
			goto error;
		}
		sh->td = mlx5_devx_cmd_create_td(sh->ctx);
		if (!sh->td) {
			DRV_LOG(ERR, "TD allocation failure");
			err = ENOMEM;
			goto error;
		}
		tis_attr.transport_domain = sh->td->id;
		sh->tis = mlx5_devx_cmd_create_tis(sh->ctx, &tis_attr);
		if (!sh->tis) {
			DRV_LOG(ERR, "TIS allocation failure");
			err = ENOMEM;
			goto error;
		}
	}
	sh->flow_id_pool = mlx5_flow_id_pool_alloc
					((1 << HAIRPIN_FLOW_ID_BITS) - 1);
	if (!sh->flow_id_pool) {
		DRV_LOG(ERR, "can't create flow id pool");
		err = ENOMEM;
		goto error;
	}
#endif /* HAVE_IBV_FLOW_DV_SUPPORT */
	/*
	 * Once the device is added to the list of memory event
	 * callback, its global MR cache table cannot be expanded
	 * on the fly because of deadlock. If it overflows, lookup
	 * should be done by searching MR list linearly, which is slow.
	 *
	 * At this point the device is not added to the memory
	 * event list yet, context is just being created.
	 */
	err = mlx5_mr_btree_init(&sh->share_cache.cache,
				 MLX5_MR_BTREE_CACHE_N * 2,
				 spawn->pci_dev->device.numa_node);
	if (err) {
		err = rte_errno;
		goto error;
	}
	mlx5_flow_aging_init(sh);
	mlx5_flow_counters_mng_init(sh);
	mlx5_flow_ipool_create(sh, config);
	/* Add device to memory callback list. */
	rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock);
	LIST_INSERT_HEAD(&mlx5_shared_data->mem_event_cb_list,
			 sh, mem_event_cb);
	rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
	/* Add context to the global device list. */
	LIST_INSERT_HEAD(&mlx5_ibv_list, sh, next);
exit:
	pthread_mutex_unlock(&mlx5_ibv_list_mutex);
	return sh;
error:
	pthread_mutex_unlock(&mlx5_ibv_list_mutex);
	MLX5_ASSERT(sh);
	if (sh->tis)
		claim_zero(mlx5_devx_cmd_destroy(sh->tis));
	if (sh->td)
		claim_zero(mlx5_devx_cmd_destroy(sh->td));
	if (sh->pd)
		claim_zero(mlx5_glue->dealloc_pd(sh->pd));
	if (sh->ctx)
		claim_zero(mlx5_glue->close_device(sh->ctx));
	if (sh->flow_id_pool)
		mlx5_flow_id_pool_release(sh->flow_id_pool);
	rte_free(sh);
	MLX5_ASSERT(err > 0);
	rte_errno = err;
	return NULL;
}

/**
 * Free shared IB device context. Decrement counter and if zero free
 * all allocated resources and close handles.
 *
 * @param[in] sh
 *   Pointer to mlx5_ibv_shared object to free.
 */
static void
mlx5_free_shared_ibctx(struct mlx5_ibv_shared *sh)
{
	pthread_mutex_lock(&mlx5_ibv_list_mutex);
#ifdef RTE_LIBRTE_MLX5_DEBUG
	/* Check the object presence in the list. */
	struct mlx5_ibv_shared *lctx;

	LIST_FOREACH(lctx, &mlx5_ibv_list, next)
		if (lctx == sh)
			break;
	MLX5_ASSERT(lctx);
	if (lctx != sh) {
		DRV_LOG(ERR, "Freeing non-existing shared IB context");
		goto exit;
	}
#endif
	MLX5_ASSERT(sh);
	MLX5_ASSERT(sh->refcnt);
	/* Secondary process should not free the shared context. */
	MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
	if (--sh->refcnt)
		goto exit;
	/* Remove from memory callback device list. */
	rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock);
	LIST_REMOVE(sh, mem_event_cb);
	rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
	/* Release created Memory Regions. */
	mlx5_mr_release_cache(&sh->share_cache);
	/* Remove context from the global device list. */
	LIST_REMOVE(sh, next);
	/*
	 * Ensure there is no async event handler installed.
	 * Only primary process handles async device events.
	 */
	mlx5_flow_counters_mng_close(sh);
	mlx5_flow_ipool_destroy(sh);
	MLX5_ASSERT(!sh->intr_cnt);
	if (sh->intr_cnt)
		mlx5_intr_callback_unregister
			(&sh->intr_handle, mlx5_dev_interrupt_handler, sh);
#ifdef HAVE_MLX5_DEVX_ASYNC_SUPPORT
	if (sh->devx_intr_cnt) {
		if (sh->intr_handle_devx.fd)
			rte_intr_callback_unregister(&sh->intr_handle_devx,
					  mlx5_dev_interrupt_handler_devx, sh);
		if (sh->devx_comp)
			mlx5dv_devx_destroy_cmd_comp(sh->devx_comp);
	}
#endif
	pthread_mutex_destroy(&sh->intr_mutex);
	if (sh->pd)
		claim_zero(mlx5_glue->dealloc_pd(sh->pd));
	if (sh->tis)
		claim_zero(mlx5_devx_cmd_destroy(sh->tis));
	if (sh->td)
		claim_zero(mlx5_devx_cmd_destroy(sh->td));
	if (sh->ctx)
		claim_zero(mlx5_glue->close_device(sh->ctx));
	if (sh->flow_id_pool)
		mlx5_flow_id_pool_release(sh->flow_id_pool);
	rte_free(sh);
exit:
	pthread_mutex_unlock(&mlx5_ibv_list_mutex);
}

/**
 * Destroy table hash list and all the root entries per domain.
 *
 * @param[in] priv
 *   Pointer to the private device data structure.
 */
static void
mlx5_free_table_hash_list(struct mlx5_priv *priv)
{
	struct mlx5_ibv_shared *sh = priv->sh;
	struct mlx5_flow_tbl_data_entry *tbl_data;
	union mlx5_flow_tbl_key table_key = {
		{
			.table_id = 0,
			.reserved = 0,
			.domain = 0,
			.direction = 0,
		}
	};
	struct mlx5_hlist_entry *pos;

	if (!sh->flow_tbls)
		return;
	pos = mlx5_hlist_lookup(sh->flow_tbls, table_key.v64);
	if (pos) {
		tbl_data = container_of(pos, struct mlx5_flow_tbl_data_entry,
					entry);
		MLX5_ASSERT(tbl_data);
		mlx5_hlist_remove(sh->flow_tbls, pos);
		rte_free(tbl_data);
	}
	table_key.direction = 1;
	pos = mlx5_hlist_lookup(sh->flow_tbls, table_key.v64);
	if (pos) {
		tbl_data = container_of(pos, struct mlx5_flow_tbl_data_entry,
					entry);
		MLX5_ASSERT(tbl_data);
		mlx5_hlist_remove(sh->flow_tbls, pos);
		rte_free(tbl_data);
	}
	table_key.direction = 0;
	table_key.domain = 1;
	pos = mlx5_hlist_lookup(sh->flow_tbls, table_key.v64);
	if (pos) {
		tbl_data = container_of(pos, struct mlx5_flow_tbl_data_entry,
					entry);
		MLX5_ASSERT(tbl_data);
		mlx5_hlist_remove(sh->flow_tbls, pos);
		rte_free(tbl_data);
	}
	mlx5_hlist_destroy(sh->flow_tbls, NULL, NULL);
}

/**
 * Initialize flow table hash list and create the root tables entry
 * for each domain.
 *
 * @param[in] priv
 *   Pointer to the private device data structure.
 *
 * @return
 *   Zero on success, positive error code otherwise.
 */
static int
mlx5_alloc_table_hash_list(struct mlx5_priv *priv)
{
	struct mlx5_ibv_shared *sh = priv->sh;
	char s[MLX5_HLIST_NAMESIZE];
	int err = 0;

	MLX5_ASSERT(sh);
	snprintf(s, sizeof(s), "%s_flow_table", priv->sh->ibdev_name);
	sh->flow_tbls = mlx5_hlist_create(s, MLX5_FLOW_TABLE_HLIST_ARRAY_SIZE);
	if (!sh->flow_tbls) {
		DRV_LOG(ERR, "flow tables with hash creation failed.\n");
		err = ENOMEM;
		return err;
	}
#ifndef HAVE_MLX5DV_DR
	/*
	 * In case we do not have DR support, the zero tables should be
	 * created because DV expects to see them even if they cannot be
	 * created by RDMA-CORE.
	 */
	union mlx5_flow_tbl_key table_key = {
		{
			.table_id = 0,
			.reserved = 0,
			.domain = 0,
			.direction = 0,
		}
	};
	struct mlx5_flow_tbl_data_entry *tbl_data = rte_zmalloc(NULL,
							  sizeof(*tbl_data), 0);

	if (!tbl_data) {
		err = ENOMEM;
		goto error;
	}
	tbl_data->entry.key = table_key.v64;
	err = mlx5_hlist_insert(sh->flow_tbls, &tbl_data->entry);
	if (err)
		goto error;
	rte_atomic32_init(&tbl_data->tbl.refcnt);
	rte_atomic32_inc(&tbl_data->tbl.refcnt);
	table_key.direction = 1;
	tbl_data = rte_zmalloc(NULL, sizeof(*tbl_data), 0);
	if (!tbl_data) {
		err = ENOMEM;
		goto error;
	}
	tbl_data->entry.key = table_key.v64;
	err = mlx5_hlist_insert(sh->flow_tbls, &tbl_data->entry);
	if (err)
		goto error;
	rte_atomic32_init(&tbl_data->tbl.refcnt);
	rte_atomic32_inc(&tbl_data->tbl.refcnt);
	table_key.direction = 0;
	table_key.domain = 1;
	tbl_data = rte_zmalloc(NULL, sizeof(*tbl_data), 0);
	if (!tbl_data) {
		err = ENOMEM;
		goto error;
	}
	tbl_data->entry.key = table_key.v64;
	err = mlx5_hlist_insert(sh->flow_tbls, &tbl_data->entry);
	if (err)
		goto error;
	rte_atomic32_init(&tbl_data->tbl.refcnt);
	rte_atomic32_inc(&tbl_data->tbl.refcnt);
	return err;
error:
	mlx5_free_table_hash_list(priv);
#endif /* HAVE_MLX5DV_DR */
	return err;
}

/**
 * Initialize DR related data within private structure.
 * Routine checks the reference counter and does actual
 * resources creation/initialization only if counter is zero.
 *
 * @param[in] priv
 *   Pointer to the private device data structure.
 *
 * @return
 *   Zero on success, positive error code otherwise.
 */
static int
mlx5_alloc_shared_dr(struct mlx5_priv *priv)
{
	struct mlx5_ibv_shared *sh = priv->sh;
	char s[MLX5_HLIST_NAMESIZE];
	int err = 0;

	if (!sh->flow_tbls)
		err = mlx5_alloc_table_hash_list(priv);
	else
		DRV_LOG(DEBUG, "sh->flow_tbls[%p] already created, reuse\n",
			(void *)sh->flow_tbls);
	if (err)
		return err;
	/* Create tags hash list table. */
	snprintf(s, sizeof(s), "%s_tags", sh->ibdev_name);
	sh->tag_table = mlx5_hlist_create(s, MLX5_TAGS_HLIST_ARRAY_SIZE);
	if (!sh->tag_table) {
		DRV_LOG(ERR, "tags with hash creation failed.\n");
		err = ENOMEM;
		goto error;
	}
#ifdef HAVE_MLX5DV_DR
	void *domain;

	if (sh->dv_refcnt) {
		/* Shared DV/DR structures are already initialized. */
		sh->dv_refcnt++;
		priv->dr_shared = 1;
		return 0;
	}
	/* Reference counter is zero, we should initialize structures. */
	domain = mlx5_glue->dr_create_domain(sh->ctx,
					     MLX5DV_DR_DOMAIN_TYPE_NIC_RX);
	if (!domain) {
		DRV_LOG(ERR, "ingress mlx5dv_dr_create_domain failed");
		err = errno;
		goto error;
	}
	sh->rx_domain = domain;
	domain = mlx5_glue->dr_create_domain(sh->ctx,
					     MLX5DV_DR_DOMAIN_TYPE_NIC_TX);
	if (!domain) {
		DRV_LOG(ERR, "egress mlx5dv_dr_create_domain failed");
		err = errno;
		goto error;
	}
	pthread_mutex_init(&sh->dv_mutex, NULL);
	sh->tx_domain = domain;
#ifdef HAVE_MLX5DV_DR_ESWITCH
	if (priv->config.dv_esw_en) {
		domain = mlx5_glue->dr_create_domain
			(sh->ctx, MLX5DV_DR_DOMAIN_TYPE_FDB);
		if (!domain) {
			DRV_LOG(ERR, "FDB mlx5dv_dr_create_domain failed");
			err = errno;
			goto error;
		}
		sh->fdb_domain = domain;
		sh->esw_drop_action = mlx5_glue->dr_create_flow_action_drop();
	}
#endif
	sh->pop_vlan_action = mlx5_glue->dr_create_flow_action_pop_vlan();
#endif /* HAVE_MLX5DV_DR */
	sh->dv_refcnt++;
	priv->dr_shared = 1;
	return 0;
error:
	/* Rollback the created objects. */
	if (sh->rx_domain) {
		mlx5_glue->dr_destroy_domain(sh->rx_domain);
		sh->rx_domain = NULL;
	}
	if (sh->tx_domain) {
		mlx5_glue->dr_destroy_domain(sh->tx_domain);
		sh->tx_domain = NULL;
	}
	if (sh->fdb_domain) {
		mlx5_glue->dr_destroy_domain(sh->fdb_domain);
		sh->fdb_domain = NULL;
	}
	if (sh->esw_drop_action) {
		mlx5_glue->destroy_flow_action(sh->esw_drop_action);
		sh->esw_drop_action = NULL;
	}
	if (sh->pop_vlan_action) {
		mlx5_glue->destroy_flow_action(sh->pop_vlan_action);
		sh->pop_vlan_action = NULL;
	}
	if (sh->tag_table) {
		/* tags should be destroyed with flow before. */
		mlx5_hlist_destroy(sh->tag_table, NULL, NULL);
		sh->tag_table = NULL;
	}
	mlx5_free_table_hash_list(priv);
	return err;
}

/**
 * Destroy DR related data within private structure.
 *
 * @param[in] priv
 *   Pointer to the private device data structure.
 */
static void
mlx5_free_shared_dr(struct mlx5_priv *priv)
{
	struct mlx5_ibv_shared *sh;

	if (!priv->dr_shared)
		return;
	priv->dr_shared = 0;
	sh = priv->sh;
	MLX5_ASSERT(sh);
#ifdef HAVE_MLX5DV_DR
	MLX5_ASSERT(sh->dv_refcnt);
	if (sh->dv_refcnt && --sh->dv_refcnt)
		return;
	if (sh->rx_domain) {
		mlx5_glue->dr_destroy_domain(sh->rx_domain);
		sh->rx_domain = NULL;
	}
	if (sh->tx_domain) {
		mlx5_glue->dr_destroy_domain(sh->tx_domain);
		sh->tx_domain = NULL;
	}
#ifdef HAVE_MLX5DV_DR_ESWITCH
	if (sh->fdb_domain) {
		mlx5_glue->dr_destroy_domain(sh->fdb_domain);
		sh->fdb_domain = NULL;
	}
	if (sh->esw_drop_action) {
		mlx5_glue->destroy_flow_action(sh->esw_drop_action);
		sh->esw_drop_action = NULL;
	}
#endif
	if (sh->pop_vlan_action) {
		mlx5_glue->destroy_flow_action(sh->pop_vlan_action);
		sh->pop_vlan_action = NULL;
	}
	pthread_mutex_destroy(&sh->dv_mutex);
#endif /* HAVE_MLX5DV_DR */
	if (sh->tag_table) {
		/* tags should be destroyed with flow before. */
		mlx5_hlist_destroy(sh->tag_table, NULL, NULL);
		sh->tag_table = NULL;
	}
	mlx5_free_table_hash_list(priv);
}

/**
 * Initialize shared data between primary and secondary process.
 *
 * A memzone is reserved by primary process and secondary processes attach to
 * the memzone.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
static int
mlx5_init_shared_data(void)
{
	const struct rte_memzone *mz;
	int ret = 0;

	rte_spinlock_lock(&mlx5_shared_data_lock);
	if (mlx5_shared_data == NULL) {
		if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
			/* Allocate shared memory. */
			mz = rte_memzone_reserve(MZ_MLX5_PMD_SHARED_DATA,
						 sizeof(*mlx5_shared_data),
						 SOCKET_ID_ANY, 0);
			if (mz == NULL) {
				DRV_LOG(ERR,
					"Cannot allocate mlx5 shared data");
				ret = -rte_errno;
				goto error;
			}
			mlx5_shared_data = mz->addr;
			memset(mlx5_shared_data, 0, sizeof(*mlx5_shared_data));
			rte_spinlock_init(&mlx5_shared_data->lock);
		} else {
			/* Lookup allocated shared memory. */
			mz = rte_memzone_lookup(MZ_MLX5_PMD_SHARED_DATA);
			if (mz == NULL) {
				DRV_LOG(ERR,
					"Cannot attach mlx5 shared data");
				ret = -rte_errno;
				goto error;
			}
			mlx5_shared_data = mz->addr;
			memset(&mlx5_local_data, 0, sizeof(mlx5_local_data));
		}
	}
error:
	rte_spinlock_unlock(&mlx5_shared_data_lock);
	return ret;
}

/**
 * Retrieve integer value from environment variable.
 *
 * @param[in] name
 *   Environment variable name.
 *
 * @return
 *   Integer value, 0 if the variable is not set.
 */
int
mlx5_getenv_int(const char *name)
{
	const char *val = getenv(name);

	if (val == NULL)
		return 0;
	return atoi(val);
}

/**
 * Verbs callback to allocate memory. This function should allocate the space
 * according to the size provided residing inside a huge page.
 * Please note that all allocation must respect the alignment from libmlx5
 * (i.e. currently sysconf(_SC_PAGESIZE)).
 *
 * @param[in] size
 *   The size in bytes of the memory to allocate.
 * @param[in] data
 *   A pointer to the callback data.
 *
 * @return
 *   Allocated buffer, NULL otherwise and rte_errno is set.
 */
static void *
mlx5_alloc_verbs_buf(size_t size, void *data)
{
	struct mlx5_priv *priv = data;
	void *ret;
	size_t alignment = sysconf(_SC_PAGESIZE);
	unsigned int socket = SOCKET_ID_ANY;

	if (priv->verbs_alloc_ctx.type == MLX5_VERBS_ALLOC_TYPE_TX_QUEUE) {
		const struct mlx5_txq_ctrl *ctrl = priv->verbs_alloc_ctx.obj;

		socket = ctrl->socket;
	} else if (priv->verbs_alloc_ctx.type ==
		   MLX5_VERBS_ALLOC_TYPE_RX_QUEUE) {
		const struct mlx5_rxq_ctrl *ctrl = priv->verbs_alloc_ctx.obj;

		socket = ctrl->socket;
	}
	MLX5_ASSERT(data != NULL);
	ret = rte_malloc_socket(__func__, size, alignment, socket);
	if (!ret && size)
		rte_errno = ENOMEM;
	return ret;
}

/**
 * Verbs callback to free memory.
 *
 * @param[in] ptr
 *   A pointer to the memory to free.
 * @param[in] data
 *   A pointer to the callback data.
 */
static void
mlx5_free_verbs_buf(void *ptr, void *data __rte_unused)
{
	MLX5_ASSERT(data != NULL);
	rte_free(ptr);
}

/**
 * DPDK callback to add UDP tunnel port.
 *
 * @param[in] dev
 *   A pointer to eth_dev.
 * @param[in] udp_tunnel
 *   A pointer to udp tunnel.
 *
 * @return
 *   0 on valid udp ports and tunnels, -ENOTSUP otherwise.
 */
int
mlx5_udp_tunnel_port_add(struct rte_eth_dev *dev __rte_unused,
			 struct rte_eth_udp_tunnel *udp_tunnel)
{
	MLX5_ASSERT(udp_tunnel != NULL);
	if (udp_tunnel->prot_type == RTE_TUNNEL_TYPE_VXLAN &&
	    udp_tunnel->udp_port == 4789)
		return 0;
	if (udp_tunnel->prot_type == RTE_TUNNEL_TYPE_VXLAN_GPE &&
	    udp_tunnel->udp_port == 4790)
		return 0;
	return -ENOTSUP;
}

/**
 * Initialize process private data structure.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
int
mlx5_proc_priv_init(struct rte_eth_dev *dev)
{
	struct mlx5_priv *priv = dev->data->dev_private;
	struct mlx5_proc_priv *ppriv;
	size_t ppriv_size;

	/*
	 * UAR register table follows the process private structure. BlueFlame
	 * registers for Tx queues are stored in the table.
	 */
	ppriv_size =
		sizeof(struct mlx5_proc_priv) + priv->txqs_n * sizeof(void *);
	ppriv = rte_malloc_socket("mlx5_proc_priv", ppriv_size,
				  RTE_CACHE_LINE_SIZE, dev->device->numa_node);
	if (!ppriv) {
		rte_errno = ENOMEM;
		return -rte_errno;
	}
	ppriv->uar_table_sz = ppriv_size;
	dev->process_private = ppriv;
	return 0;
}

/**
 * Un-initialize process private data structure.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 */
static void
mlx5_proc_priv_uninit(struct rte_eth_dev *dev)
{
	if (!dev->process_private)
		return;
	rte_free(dev->process_private);
	dev->process_private = NULL;
}

/**
 * DPDK callback to close the device.
 *
 * Destroy all queues and objects, free memory.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 */
static void
mlx5_dev_close(struct rte_eth_dev *dev)
{
	struct mlx5_priv *priv = dev->data->dev_private;
	unsigned int i;
	int ret;

	DRV_LOG(DEBUG, "port %u closing device \"%s\"",
		dev->data->port_id,
		((priv->sh->ctx != NULL) ? priv->sh->ctx->device->name : ""));
	/* In case mlx5_dev_stop() has not been called. */
	mlx5_dev_interrupt_handler_uninstall(dev);
	mlx5_dev_interrupt_handler_devx_uninstall(dev);
	/*
	 * If default mreg copy action is removed at the stop stage,
	 * the search will return none and nothing will be done anymore.
	 */
	mlx5_flow_stop_default(dev);
	mlx5_traffic_disable(dev);
	/*
	 * If all the flows are already flushed in the device stop stage,
	 * then this will return directly without any action.
	 */
	mlx5_flow_list_flush(dev, &priv->flows, true);
	mlx5_flow_meter_flush(dev, NULL);
	/* Free the intermediate buffers for flow creation. */
	mlx5_flow_free_intermediate(dev);
	/* Prevent crashes when queues are still in use. */
	dev->rx_pkt_burst = removed_rx_burst;
	dev->tx_pkt_burst = removed_tx_burst;
	rte_wmb();
	/* Disable datapath on secondary process. */
	mlx5_mp_req_stop_rxtx(dev);
	if (priv->rxqs != NULL) {
		/* XXX race condition if mlx5_rx_burst() is still running. */
		usleep(1000);
		for (i = 0; (i != priv->rxqs_n); ++i)
			mlx5_rxq_release(dev, i);
		priv->rxqs_n = 0;
		priv->rxqs = NULL;
	}
	if (priv->txqs != NULL) {
		/* XXX race condition if mlx5_tx_burst() is still running. */
		usleep(1000);
		for (i = 0; (i != priv->txqs_n); ++i)
			mlx5_txq_release(dev, i);
		priv->txqs_n = 0;
		priv->txqs = NULL;
	}
	mlx5_proc_priv_uninit(dev);
	if (priv->mreg_cp_tbl)
		mlx5_hlist_destroy(priv->mreg_cp_tbl, NULL, NULL);
	mlx5_mprq_free_mp(dev);
	mlx5_free_shared_dr(priv);
	if (priv->rss_conf.rss_key != NULL)
		rte_free(priv->rss_conf.rss_key);
	if (priv->reta_idx != NULL)
		rte_free(priv->reta_idx);
	if (priv->config.vf)
		mlx5_nl_mac_addr_flush(priv->nl_socket_route, mlx5_ifindex(dev),
				       dev->data->mac_addrs,
				       MLX5_MAX_MAC_ADDRESSES, priv->mac_own);
	if (priv->nl_socket_route >= 0)
		close(priv->nl_socket_route);
	if (priv->nl_socket_rdma >= 0)
		close(priv->nl_socket_rdma);
	if (priv->vmwa_context)
		mlx5_vlan_vmwa_exit(priv->vmwa_context);
	ret = mlx5_hrxq_verify(dev);
	if (ret)
		DRV_LOG(WARNING, "port %u some hash Rx queue still remain",
			dev->data->port_id);
	ret = mlx5_ind_table_obj_verify(dev);
	if (ret)
		DRV_LOG(WARNING, "port %u some indirection table still remain",
			dev->data->port_id);
	ret = mlx5_rxq_obj_verify(dev);
	if (ret)
		DRV_LOG(WARNING, "port %u some Rx queue objects still remain",
			dev->data->port_id);
	ret = mlx5_rxq_verify(dev);
	if (ret)
		DRV_LOG(WARNING, "port %u some Rx queues still remain",
			dev->data->port_id);
	ret = mlx5_txq_obj_verify(dev);
	if (ret)
		DRV_LOG(WARNING, "port %u some Verbs Tx queue still remain",
			dev->data->port_id);
	ret = mlx5_txq_verify(dev);
	if (ret)
		DRV_LOG(WARNING, "port %u some Tx queues still remain",
			dev->data->port_id);
	ret = mlx5_flow_verify(dev);
	if (ret)
		DRV_LOG(WARNING, "port %u some flows still remain",
			dev->data->port_id);
	if (priv->sh) {
		/*
		 * Free the shared context in last turn, because the cleanup
		 * routines above may use some shared fields, like
		 * mlx5_nl_mac_addr_flush() uses ibdev_path for retrieving
		 * the ifindex if Netlink fails.
		 */
		mlx5_free_shared_ibctx(priv->sh);
		priv->sh = NULL;
	}
	if (priv->domain_id != RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID) {
		unsigned int c = 0;
		uint16_t port_id;

		MLX5_ETH_FOREACH_DEV(port_id, priv->pci_dev) {
			struct mlx5_priv *opriv =
				rte_eth_devices[port_id].data->dev_private;

			if (!opriv ||
			    opriv->domain_id != priv->domain_id ||
			    &rte_eth_devices[port_id] == dev)
				continue;
			++c;
			break;
		}
		if (!c)
			claim_zero(rte_eth_switch_domain_free(priv->domain_id));
	}
	memset(priv, 0, sizeof(*priv));
	priv->domain_id = RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID;
	/*
	 * Reset mac_addrs to NULL such that it is not freed as part of
	 * rte_eth_dev_release_port(). mac_addrs is part of dev_private so
	 * it is freed when dev_private is freed.
	 */
	dev->data->mac_addrs = NULL;
}

const struct eth_dev_ops mlx5_dev_ops = {
	.dev_configure = mlx5_dev_configure,
	.dev_start = mlx5_dev_start,
	.dev_stop = mlx5_dev_stop,
	.dev_set_link_down = mlx5_set_link_down,
	.dev_set_link_up = mlx5_set_link_up,
	.dev_close = mlx5_dev_close,
	.promiscuous_enable = mlx5_promiscuous_enable,
	.promiscuous_disable = mlx5_promiscuous_disable,
	.allmulticast_enable = mlx5_allmulticast_enable,
	.allmulticast_disable = mlx5_allmulticast_disable,
	.link_update = mlx5_link_update,
	.stats_get = mlx5_stats_get,
	.stats_reset = mlx5_stats_reset,
	.xstats_get = mlx5_xstats_get,
	.xstats_reset = mlx5_xstats_reset,
	.xstats_get_names = mlx5_xstats_get_names,
	.fw_version_get = mlx5_fw_version_get,
	.dev_infos_get = mlx5_dev_infos_get,
	.read_clock = mlx5_read_clock,
	.dev_supported_ptypes_get = mlx5_dev_supported_ptypes_get,
	.vlan_filter_set = mlx5_vlan_filter_set,
	.rx_queue_setup = mlx5_rx_queue_setup,
	.rx_hairpin_queue_setup = mlx5_rx_hairpin_queue_setup,
	.tx_queue_setup = mlx5_tx_queue_setup,
	.tx_hairpin_queue_setup = mlx5_tx_hairpin_queue_setup,
	.rx_queue_release = mlx5_rx_queue_release,
	.tx_queue_release = mlx5_tx_queue_release,
	.flow_ctrl_get = mlx5_dev_get_flow_ctrl,
	.flow_ctrl_set = mlx5_dev_set_flow_ctrl,
	.mac_addr_remove = mlx5_mac_addr_remove,
	.mac_addr_add = mlx5_mac_addr_add,
	.mac_addr_set = mlx5_mac_addr_set,
	.set_mc_addr_list = mlx5_set_mc_addr_list,
	.mtu_set = mlx5_dev_set_mtu,
	.vlan_strip_queue_set = mlx5_vlan_strip_queue_set,
	.vlan_offload_set = mlx5_vlan_offload_set,
	.reta_update = mlx5_dev_rss_reta_update,
	.reta_query = mlx5_dev_rss_reta_query,
	.rss_hash_update = mlx5_rss_hash_update,
	.rss_hash_conf_get = mlx5_rss_hash_conf_get,
	.filter_ctrl = mlx5_dev_filter_ctrl,
	.rx_descriptor_status = mlx5_rx_descriptor_status,
	.tx_descriptor_status = mlx5_tx_descriptor_status,
	.rxq_info_get = mlx5_rxq_info_get,
	.txq_info_get = mlx5_txq_info_get,
	.rx_burst_mode_get = mlx5_rx_burst_mode_get,
	.tx_burst_mode_get = mlx5_tx_burst_mode_get,
	.rx_queue_count = mlx5_rx_queue_count,
	.rx_queue_intr_enable = mlx5_rx_intr_enable,
	.rx_queue_intr_disable = mlx5_rx_intr_disable,
	.is_removed = mlx5_is_removed,
	.udp_tunnel_port_add = mlx5_udp_tunnel_port_add,
	.get_module_info = mlx5_get_module_info,
	.get_module_eeprom = mlx5_get_module_eeprom,
	.hairpin_cap_get = mlx5_hairpin_cap_get,
	.mtr_ops_get = mlx5_flow_meter_ops_get,
};

/* Available operations from secondary process. */
static const struct eth_dev_ops mlx5_dev_sec_ops = {
	.stats_get = mlx5_stats_get,
	.stats_reset = mlx5_stats_reset,
	.xstats_get = mlx5_xstats_get,
	.xstats_reset = mlx5_xstats_reset,
	.xstats_get_names = mlx5_xstats_get_names,
	.fw_version_get = mlx5_fw_version_get,
	.dev_infos_get = mlx5_dev_infos_get,
	.rx_descriptor_status = mlx5_rx_descriptor_status,
	.tx_descriptor_status = mlx5_tx_descriptor_status,
	.rxq_info_get = mlx5_rxq_info_get,
	.txq_info_get = mlx5_txq_info_get,
	.rx_burst_mode_get = mlx5_rx_burst_mode_get,
	.tx_burst_mode_get = mlx5_tx_burst_mode_get,
	.get_module_info = mlx5_get_module_info,
	.get_module_eeprom = mlx5_get_module_eeprom,
};

/* Available operations in flow isolated mode. */
const struct eth_dev_ops mlx5_dev_ops_isolate = {
	.dev_configure = mlx5_dev_configure,
	.dev_start = mlx5_dev_start,
	.dev_stop = mlx5_dev_stop,
	.dev_set_link_down = mlx5_set_link_down,
	.dev_set_link_up = mlx5_set_link_up,
	.dev_close = mlx5_dev_close,
	.promiscuous_enable = mlx5_promiscuous_enable,
	.promiscuous_disable = mlx5_promiscuous_disable,
	.allmulticast_enable = mlx5_allmulticast_enable,
	.allmulticast_disable = mlx5_allmulticast_disable,
	.link_update = mlx5_link_update,
	.stats_get = mlx5_stats_get,
	.stats_reset = mlx5_stats_reset,
	.xstats_get = mlx5_xstats_get,
	.xstats_reset = mlx5_xstats_reset,
	.xstats_get_names = mlx5_xstats_get_names,
	.fw_version_get = mlx5_fw_version_get,
	.dev_infos_get = mlx5_dev_infos_get,
	.dev_supported_ptypes_get = mlx5_dev_supported_ptypes_get,
	.vlan_filter_set = mlx5_vlan_filter_set,
	.rx_queue_setup = mlx5_rx_queue_setup,
	.rx_hairpin_queue_setup = mlx5_rx_hairpin_queue_setup,
	.tx_queue_setup = mlx5_tx_queue_setup,
	.tx_hairpin_queue_setup = mlx5_tx_hairpin_queue_setup,
	.rx_queue_release = mlx5_rx_queue_release,
	.tx_queue_release = mlx5_tx_queue_release,
	.flow_ctrl_get = mlx5_dev_get_flow_ctrl,
	.flow_ctrl_set = mlx5_dev_set_flow_ctrl,
	.mac_addr_remove = mlx5_mac_addr_remove,
	.mac_addr_add = mlx5_mac_addr_add,
	.mac_addr_set = mlx5_mac_addr_set,
	.set_mc_addr_list = mlx5_set_mc_addr_list,
	.mtu_set = mlx5_dev_set_mtu,
	.vlan_strip_queue_set = mlx5_vlan_strip_queue_set,
	.vlan_offload_set = mlx5_vlan_offload_set,
	.filter_ctrl = mlx5_dev_filter_ctrl,
	.rx_descriptor_status = mlx5_rx_descriptor_status,
	.tx_descriptor_status = mlx5_tx_descriptor_status,
	.rxq_info_get = mlx5_rxq_info_get,
	.txq_info_get = mlx5_txq_info_get,
	.rx_burst_mode_get = mlx5_rx_burst_mode_get,
	.tx_burst_mode_get = mlx5_tx_burst_mode_get,
	.rx_queue_intr_enable = mlx5_rx_intr_enable,
	.rx_queue_intr_disable = mlx5_rx_intr_disable,
	.is_removed = mlx5_is_removed,
	.get_module_info = mlx5_get_module_info,
	.get_module_eeprom = mlx5_get_module_eeprom,
	.hairpin_cap_get = mlx5_hairpin_cap_get,
	.mtr_ops_get = mlx5_flow_meter_ops_get,
};

/**
 * Verify and store value for device argument.
 *
 * @param[in] key
 *   Key argument to verify.
 * @param[in] val
 *   Value associated with key.
 * @param opaque
 *   User data.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
static int
mlx5_args_check(const char *key, const char *val, void *opaque)
{
	struct mlx5_dev_config *config = opaque;
	unsigned long tmp;

	/* No-op, port representors are processed in mlx5_dev_spawn(). */
	if (!strcmp(MLX5_REPRESENTOR, key))
		return 0;
	errno = 0;
	tmp = strtoul(val, NULL, 0);
	if (errno) {
		rte_errno = errno;
		DRV_LOG(WARNING, "%s: \"%s\" is not a valid integer", key, val);
		return -rte_errno;
	}
	if (strcmp(MLX5_RXQ_CQE_COMP_EN, key) == 0) {
		config->cqe_comp = !!tmp;
	} else if (strcmp(MLX5_RXQ_CQE_PAD_EN, key) == 0) {
		config->cqe_pad = !!tmp;
	} else if (strcmp(MLX5_RXQ_PKT_PAD_EN, key) == 0) {
		config->hw_padding = !!tmp;
	} else if (strcmp(MLX5_RX_MPRQ_EN, key) == 0) {
		config->mprq.enabled = !!tmp;
	} else if (strcmp(MLX5_RX_MPRQ_LOG_STRIDE_NUM, key) == 0) {
		config->mprq.stride_num_n = tmp;
	} else if (strcmp(MLX5_RX_MPRQ_LOG_STRIDE_SIZE, key) == 0) {
		config->mprq.stride_size_n = tmp;
	} else if (strcmp(MLX5_RX_MPRQ_MAX_MEMCPY_LEN, key) == 0) {
		config->mprq.max_memcpy_len = tmp;
	} else if (strcmp(MLX5_RXQS_MIN_MPRQ, key) == 0) {
		config->mprq.min_rxqs_num = tmp;
	} else if (strcmp(MLX5_TXQ_INLINE, key) == 0) {
		DRV_LOG(WARNING, "%s: deprecated parameter,"
				 " converted to txq_inline_max", key);
		config->txq_inline_max = tmp;
	} else if (strcmp(MLX5_TXQ_INLINE_MAX, key) == 0) {
		config->txq_inline_max = tmp;
	} else if (strcmp(MLX5_TXQ_INLINE_MIN, key) == 0) {
		config->txq_inline_min = tmp;
	} else if (strcmp(MLX5_TXQ_INLINE_MPW, key) == 0) {
		config->txq_inline_mpw = tmp;
	} else if (strcmp(MLX5_TXQS_MIN_INLINE, key) == 0) {
		config->txqs_inline = tmp;
	} else if (strcmp(MLX5_TXQS_MAX_VEC, key) == 0) {
		DRV_LOG(WARNING, "%s: deprecated parameter, ignored", key);
	} else if (strcmp(MLX5_TXQ_MPW_EN, key) == 0) {
		config->mps = !!tmp;
	} else if (strcmp(MLX5_TX_DB_NC, key) == 0) {
		if (tmp != MLX5_TXDB_CACHED &&
		    tmp != MLX5_TXDB_NCACHED &&
		    tmp != MLX5_TXDB_HEURISTIC) {
			DRV_LOG(ERR, "invalid Tx doorbell "
				     "mapping parameter");
			rte_errno = EINVAL;
			return -rte_errno;
		}
		config->dbnc = tmp;
	} else if (strcmp(MLX5_TXQ_MPW_HDR_DSEG_EN, key) == 0) {
		DRV_LOG(WARNING, "%s: deprecated parameter, ignored", key);
	} else if (strcmp(MLX5_TXQ_MAX_INLINE_LEN, key) == 0) {
		DRV_LOG(WARNING, "%s: deprecated parameter,"
				 " converted to txq_inline_mpw", key);
		config->txq_inline_mpw = tmp;
	} else if (strcmp(MLX5_TX_VEC_EN, key) == 0) {
		DRV_LOG(WARNING, "%s: deprecated parameter, ignored", key);
	} else if (strcmp(MLX5_RX_VEC_EN, key) == 0) {
		config->rx_vec_en = !!tmp;
	} else if (strcmp(MLX5_L3_VXLAN_EN, key) == 0) {
		config->l3_vxlan_en = !!tmp;
	} else if (strcmp(MLX5_VF_NL_EN, key) == 0) {
		config->vf_nl_en = !!tmp;
	} else if (strcmp(MLX5_DV_ESW_EN, key) == 0) {
		config->dv_esw_en = !!tmp;
	} else if (strcmp(MLX5_DV_FLOW_EN, key) == 0) {
		config->dv_flow_en = !!tmp;
	} else if (strcmp(MLX5_DV_XMETA_EN, key) == 0) {
		if (tmp != MLX5_XMETA_MODE_LEGACY &&
		    tmp != MLX5_XMETA_MODE_META16 &&
		    tmp != MLX5_XMETA_MODE_META32) {
			DRV_LOG(ERR, "invalid extensive "
				     "metadata parameter");
			rte_errno = EINVAL;
			return -rte_errno;
		}
		config->dv_xmeta_en = tmp;
	} else if (strcmp(MLX5_MR_EXT_MEMSEG_EN, key) == 0) {
		config->mr_ext_memseg_en = !!tmp;
	} else if (strcmp(MLX5_MAX_DUMP_FILES_NUM, key) == 0) {
		config->max_dump_files_num = tmp;
	} else if (strcmp(MLX5_LRO_TIMEOUT_USEC, key) == 0) {
		config->lro.timeout = tmp;
	} else if (strcmp(MLX5_CLASS_ARG_NAME, key) == 0) {
		DRV_LOG(DEBUG, "class argument is %s.", val);
	} else if (strcmp(MLX5_HP_BUF_SIZE, key) == 0) {
		config->log_hp_size = tmp;
	} else {
		DRV_LOG(WARNING, "%s: unknown parameter", key);
		rte_errno = EINVAL;
		return -rte_errno;
	}
	return 0;
}

/**
 * Parse device parameters.
 *
 * @param config
 *   Pointer to device configuration structure.
 * @param devargs
 *   Device arguments structure.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
static int
mlx5_args(struct mlx5_dev_config *config, struct rte_devargs *devargs)
{
	const char **params = (const char *[]){
		MLX5_RXQ_CQE_COMP_EN,
		MLX5_RXQ_CQE_PAD_EN,
		MLX5_RXQ_PKT_PAD_EN,
		MLX5_RX_MPRQ_EN,
		MLX5_RX_MPRQ_LOG_STRIDE_NUM,
		MLX5_RX_MPRQ_LOG_STRIDE_SIZE,
		MLX5_RX_MPRQ_MAX_MEMCPY_LEN,
		MLX5_RXQS_MIN_MPRQ,
		MLX5_TXQ_INLINE,
		MLX5_TXQ_INLINE_MIN,
		MLX5_TXQ_INLINE_MAX,
		MLX5_TXQ_INLINE_MPW,
		MLX5_TXQS_MIN_INLINE,
		MLX5_TXQS_MAX_VEC,
		MLX5_TXQ_MPW_EN,
		MLX5_TXQ_MPW_HDR_DSEG_EN,
		MLX5_TXQ_MAX_INLINE_LEN,
		MLX5_TX_DB_NC,
		MLX5_TX_VEC_EN,
		MLX5_RX_VEC_EN,
		MLX5_L3_VXLAN_EN,
		MLX5_VF_NL_EN,
		MLX5_DV_ESW_EN,
		MLX5_DV_FLOW_EN,
		MLX5_DV_XMETA_EN,
		MLX5_MR_EXT_MEMSEG_EN,
		MLX5_REPRESENTOR,
		MLX5_MAX_DUMP_FILES_NUM,
		MLX5_LRO_TIMEOUT_USEC,
		MLX5_CLASS_ARG_NAME,
		MLX5_HP_BUF_SIZE,
		NULL,
	};
	struct rte_kvargs *kvlist;
	int ret = 0;
	int i;

	if (devargs == NULL)
		return 0;
	/* Following UGLY cast is done to pass checkpatch. */
	kvlist = rte_kvargs_parse(devargs->args, params);
	if (kvlist == NULL) {
		rte_errno = EINVAL;
		return -rte_errno;
	}
	/* Process parameters. */
	for (i = 0; (params[i] != NULL); ++i) {
		if (rte_kvargs_count(kvlist, params[i])) {
			ret = rte_kvargs_process(kvlist, params[i],
						 mlx5_args_check, config);
			if (ret) {
				rte_errno = EINVAL;
				rte_kvargs_free(kvlist);
				return -rte_errno;
			}
		}
	}
	rte_kvargs_free(kvlist);
	return 0;
}

static struct rte_pci_driver mlx5_driver;
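
/*
 * Illustrative note (not upstream documentation): the keys handled by
 * mlx5_args_check() are passed as DPDK device arguments, for example
 * (the PCI address below is hypothetical):
 *
 *	-w 0000:05:00.0,rxq_cqe_comp_en=1,mprq_en=1,txq_inline_max=204
 *
 * rte_kvargs_parse() rejects keys that are not listed in params[], and
 * known keys with invalid values make mlx5_args_check() fail with EINVAL.
 */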
1888 */ 1889 static int 1890 mlx5_init_once(void) 1891 { 1892 struct mlx5_shared_data *sd; 1893 struct mlx5_local_data *ld = &mlx5_local_data; 1894 int ret = 0; 1895 1896 if (mlx5_init_shared_data()) 1897 return -rte_errno; 1898 sd = mlx5_shared_data; 1899 MLX5_ASSERT(sd); 1900 rte_spinlock_lock(&sd->lock); 1901 switch (rte_eal_process_type()) { 1902 case RTE_PROC_PRIMARY: 1903 if (sd->init_done) 1904 break; 1905 LIST_INIT(&sd->mem_event_cb_list); 1906 rte_rwlock_init(&sd->mem_event_rwlock); 1907 rte_mem_event_callback_register("MLX5_MEM_EVENT_CB", 1908 mlx5_mr_mem_event_cb, NULL); 1909 ret = mlx5_mp_init_primary(MLX5_MP_NAME, 1910 mlx5_mp_primary_handle); 1911 if (ret) 1912 goto out; 1913 sd->init_done = true; 1914 break; 1915 case RTE_PROC_SECONDARY: 1916 if (ld->init_done) 1917 break; 1918 ret = mlx5_mp_init_secondary(MLX5_MP_NAME, 1919 mlx5_mp_secondary_handle); 1920 if (ret) 1921 goto out; 1922 ++sd->secondary_cnt; 1923 ld->init_done = true; 1924 break; 1925 default: 1926 break; 1927 } 1928 out: 1929 rte_spinlock_unlock(&sd->lock); 1930 return ret; 1931 } 1932 1933 /** 1934 * Configures the minimal amount of data to inline into WQE 1935 * while sending packets. 1936 * 1937 * - the txq_inline_min has the highest priority if this 1938 * key is specified in devargs 1939 * - if DevX is enabled the inline mode is queried from the 1940 * device (HCA attributes and NIC vport context if needed). 1941 * - otherwise L2 mode (18 bytes) is assumed for ConnectX-4/4 Lx 1942 * and none (0 bytes) for other NICs 1943 * 1944 * @param spawn 1945 * Verbs device parameters (name, port, switch_info) to spawn. 1946 * @param config 1947 * Device configuration parameters. 1948 */ 1949 static void 1950 mlx5_set_min_inline(struct mlx5_dev_spawn_data *spawn, 1951 struct mlx5_dev_config *config) 1952 { 1953 if (config->txq_inline_min != MLX5_ARG_UNSET) { 1954 /* Application defines size of inlined data explicitly. */ 1955 switch (spawn->pci_dev->id.device_id) { 1956 case PCI_DEVICE_ID_MELLANOX_CONNECTX4: 1957 case PCI_DEVICE_ID_MELLANOX_CONNECTX4VF: 1958 if (config->txq_inline_min < 1959 (int)MLX5_INLINE_HSIZE_L2) { 1960 DRV_LOG(DEBUG, 1961 "txq_inline_min aligned to minimal" 1962 " ConnectX-4 required value %d", 1963 (int)MLX5_INLINE_HSIZE_L2); 1964 config->txq_inline_min = MLX5_INLINE_HSIZE_L2; 1965 } 1966 break; 1967 } 1968 goto exit; 1969 } 1970 if (config->hca_attr.eth_net_offloads) { 1971 /* We have DevX enabled, inline mode queried successfully. */ 1972 switch (config->hca_attr.wqe_inline_mode) { 1973 case MLX5_CAP_INLINE_MODE_L2: 1974 /* outer L2 header must be inlined. */ 1975 config->txq_inline_min = MLX5_INLINE_HSIZE_L2; 1976 goto exit; 1977 case MLX5_CAP_INLINE_MODE_NOT_REQUIRED: 1978 /* No inline data is required by the NIC. */ 1979 config->txq_inline_min = MLX5_INLINE_HSIZE_NONE; 1980 config->hw_vlan_insert = 1981 config->hca_attr.wqe_vlan_insert; 1982 DRV_LOG(DEBUG, "Tx VLAN insertion is supported"); 1983 goto exit; 1984 case MLX5_CAP_INLINE_MODE_VPORT_CONTEXT: 1985 /* inline mode is defined by NIC vport context.
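 * Each vport inline mode reported by the NIC is mapped to the matching
 * MLX5_INLINE_HSIZE_* value in the switch below (for example,
 * MLX5_INLINE_MODE_IP selects MLX5_INLINE_HSIZE_L3).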
*/ 1986 if (!config->hca_attr.eth_virt) 1987 break; 1988 switch (config->hca_attr.vport_inline_mode) { 1989 case MLX5_INLINE_MODE_NONE: 1990 config->txq_inline_min = 1991 MLX5_INLINE_HSIZE_NONE; 1992 goto exit; 1993 case MLX5_INLINE_MODE_L2: 1994 config->txq_inline_min = 1995 MLX5_INLINE_HSIZE_L2; 1996 goto exit; 1997 case MLX5_INLINE_MODE_IP: 1998 config->txq_inline_min = 1999 MLX5_INLINE_HSIZE_L3; 2000 goto exit; 2001 case MLX5_INLINE_MODE_TCP_UDP: 2002 config->txq_inline_min = 2003 MLX5_INLINE_HSIZE_L4; 2004 goto exit; 2005 case MLX5_INLINE_MODE_INNER_L2: 2006 config->txq_inline_min = 2007 MLX5_INLINE_HSIZE_INNER_L2; 2008 goto exit; 2009 case MLX5_INLINE_MODE_INNER_IP: 2010 config->txq_inline_min = 2011 MLX5_INLINE_HSIZE_INNER_L3; 2012 goto exit; 2013 case MLX5_INLINE_MODE_INNER_TCP_UDP: 2014 config->txq_inline_min = 2015 MLX5_INLINE_HSIZE_INNER_L4; 2016 goto exit; 2017 } 2018 } 2019 } 2020 /* 2021 * We get here if we are unable to deduce 2022 * the inline data size with DevX. Try the PCI ID 2023 * to determine old NICs. 2024 */ 2025 switch (spawn->pci_dev->id.device_id) { 2026 case PCI_DEVICE_ID_MELLANOX_CONNECTX4: 2027 case PCI_DEVICE_ID_MELLANOX_CONNECTX4VF: 2028 case PCI_DEVICE_ID_MELLANOX_CONNECTX4LX: 2029 case PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF: 2030 config->txq_inline_min = MLX5_INLINE_HSIZE_L2; 2031 config->hw_vlan_insert = 0; 2032 break; 2033 case PCI_DEVICE_ID_MELLANOX_CONNECTX5: 2034 case PCI_DEVICE_ID_MELLANOX_CONNECTX5VF: 2035 case PCI_DEVICE_ID_MELLANOX_CONNECTX5EX: 2036 case PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF: 2037 /* 2038 * These NICs support VLAN insertion from WQE and 2039 * report the wqe_vlan_insert flag. But there is a bug 2040 * and PFC control may be broken, so the feature is disabled. 2041 */ 2042 config->hw_vlan_insert = 0; 2043 config->txq_inline_min = MLX5_INLINE_HSIZE_NONE; 2044 break; 2045 default: 2046 config->txq_inline_min = MLX5_INLINE_HSIZE_NONE; 2047 break; 2048 } 2049 exit: 2050 DRV_LOG(DEBUG, "min tx inline configured: %d", config->txq_inline_min); 2051 } 2052 2053 /** 2054 * Configures the metadata mask fields in the shared context. 2055 * 2056 * @param [in] dev 2057 * Pointer to Ethernet device.
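 *
 * The masks depend on the dv_xmeta_en mode selected in devargs (see the
 * switch below): legacy mode keeps the full 32-bit META and the default
 * MARK mask, META16 mode carves META out of the reg_c[0] bits left free
 * by the vport match, and META32 mode keeps the full META while MARK is
 * restricted to the free reg_c[0] bits.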
2058 */ 2059 static void 2060 mlx5_set_metadata_mask(struct rte_eth_dev *dev) 2061 { 2062 struct mlx5_priv *priv = dev->data->dev_private; 2063 struct mlx5_ibv_shared *sh = priv->sh; 2064 uint32_t meta, mark, reg_c0; 2065 2066 reg_c0 = ~priv->vport_meta_mask; 2067 switch (priv->config.dv_xmeta_en) { 2068 case MLX5_XMETA_MODE_LEGACY: 2069 meta = UINT32_MAX; 2070 mark = MLX5_FLOW_MARK_MASK; 2071 break; 2072 case MLX5_XMETA_MODE_META16: 2073 meta = reg_c0 >> rte_bsf32(reg_c0); 2074 mark = MLX5_FLOW_MARK_MASK; 2075 break; 2076 case MLX5_XMETA_MODE_META32: 2077 meta = UINT32_MAX; 2078 mark = (reg_c0 >> rte_bsf32(reg_c0)) & MLX5_FLOW_MARK_MASK; 2079 break; 2080 default: 2081 meta = 0; 2082 mark = 0; 2083 MLX5_ASSERT(false); 2084 break; 2085 } 2086 if (sh->dv_mark_mask && sh->dv_mark_mask != mark) 2087 DRV_LOG(WARNING, "metadata MARK mask mismatch %08X:%08X", 2088 sh->dv_mark_mask, mark); 2089 else 2090 sh->dv_mark_mask = mark; 2091 if (sh->dv_meta_mask && sh->dv_meta_mask != meta) 2092 DRV_LOG(WARNING, "metadata META mask mismatch %08X:%08X", 2093 sh->dv_meta_mask, meta); 2094 else 2095 sh->dv_meta_mask = meta; 2096 if (sh->dv_regc0_mask && sh->dv_regc0_mask != reg_c0) 2097 DRV_LOG(WARNING, "metadata reg_c0 mask mismatch %08X:%08X", 2098 sh->dv_regc0_mask, reg_c0); 2099 else 2100 sh->dv_regc0_mask = reg_c0; 2101 DRV_LOG(DEBUG, "metadata mode %u", priv->config.dv_xmeta_en); 2102 DRV_LOG(DEBUG, "metadata MARK mask %08X", sh->dv_mark_mask); 2103 DRV_LOG(DEBUG, "metadata META mask %08X", sh->dv_meta_mask); 2104 DRV_LOG(DEBUG, "metadata reg_c0 mask %08X", sh->dv_regc0_mask); 2105 } 2106 2107 /** 2108 * Allocate a page of door-bells and register it using DevX API. 2109 * 2110 * @param [in] dev 2111 * Pointer to Ethernet device. 2112 * 2113 * @return 2114 * Pointer to new page on success, NULL otherwise. 2115 */ 2116 static struct mlx5_devx_dbr_page * 2117 mlx5_alloc_dbr_page(struct rte_eth_dev *dev) 2118 { 2119 struct mlx5_priv *priv = dev->data->dev_private; 2120 struct mlx5_devx_dbr_page *page; 2121 2122 /* Allocate space for door-bell page and management data. */ 2123 page = rte_calloc_socket(__func__, 1, sizeof(struct mlx5_devx_dbr_page), 2124 RTE_CACHE_LINE_SIZE, dev->device->numa_node); 2125 if (!page) { 2126 DRV_LOG(ERR, "port %u cannot allocate dbr page", 2127 dev->data->port_id); 2128 return NULL; 2129 } 2130 /* Register allocated memory. */ 2131 page->umem = mlx5_glue->devx_umem_reg(priv->sh->ctx, page->dbrs, 2132 MLX5_DBR_PAGE_SIZE, 0); 2133 if (!page->umem) { 2134 DRV_LOG(ERR, "port %u cannot umem reg dbr page", 2135 dev->data->port_id); 2136 rte_free(page); 2137 return NULL; 2138 } 2139 return page; 2140 } 2141 2142 /** 2143 * Find the next available door-bell, allocate a new page if needed. 2144 * 2145 * @param [in] dev 2146 * Pointer to Ethernet device. 2147 * @param [out] dbr_page 2148 * Door-bell page containing the page data. 2149 * 2150 * @return 2151 * Door-bell address offset on success, a negative error value otherwise. 2152 */ 2153 int64_t 2154 mlx5_get_dbr(struct rte_eth_dev *dev, struct mlx5_devx_dbr_page **dbr_page) 2155 { 2156 struct mlx5_priv *priv = dev->data->dev_private; 2157 struct mlx5_devx_dbr_page *page = NULL; 2158 uint32_t i, j; 2159 2160 LIST_FOREACH(page, &priv->dbrpgs, next) 2161 if (page->dbr_count < MLX5_DBR_PER_PAGE) 2162 break; 2163 if (!page) { /* No page with free door-bell exists. */ 2164 page = mlx5_alloc_dbr_page(dev); 2165 if (!page) /* Failed to allocate new page.
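	 * Report the failure to the caller as a negative value.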
*/ 2166 return (-1); 2167 LIST_INSERT_HEAD(&priv->dbrpgs, page, next); 2168 } 2169 /* Loop to find bitmap part with clear bit. */ 2170 for (i = 0; 2171 i < MLX5_DBR_BITMAP_SIZE && page->dbr_bitmap[i] == UINT64_MAX; 2172 i++) 2173 ; /* Empty. */ 2174 /* Find the first clear bit. */ 2175 MLX5_ASSERT(i < MLX5_DBR_BITMAP_SIZE); 2176 j = rte_bsf64(~page->dbr_bitmap[i]); 2177 page->dbr_bitmap[i] |= (UINT64_C(1) << j); 2178 page->dbr_count++; 2179 *dbr_page = page; 2180 return (((i * 64) + j) * sizeof(uint64_t)); 2181 } 2182 2183 /** 2184 * Release a door-bell record. 2185 * 2186 * @param [in] dev 2187 * Pointer to Ethernet device. 2188 * @param [in] umem_id 2189 * UMEM ID of page containing the door-bell record to release. 2190 * @param [in] offset 2191 * Offset of door-bell record in page. 2192 * 2193 * @return 2194 * 0 on success, a negative error value otherwise. 2195 */ 2196 int32_t 2197 mlx5_release_dbr(struct rte_eth_dev *dev, uint32_t umem_id, uint64_t offset) 2198 { 2199 struct mlx5_priv *priv = dev->data->dev_private; 2200 struct mlx5_devx_dbr_page *page = NULL; 2201 int ret = 0; 2202 2203 LIST_FOREACH(page, &priv->dbrpgs, next) 2204 /* Find the page this address belongs to. */ 2205 if (page->umem->umem_id == umem_id) 2206 break; 2207 if (!page) 2208 return -EINVAL; 2209 page->dbr_count--; 2210 if (!page->dbr_count) { 2211 /* Page not used, free it and remove from list. */ 2212 LIST_REMOVE(page, next); 2213 if (page->umem) 2214 ret = -mlx5_glue->devx_umem_dereg(page->umem); 2215 rte_free(page); 2216 } else { 2217 /* Mark in bitmap that this door-bell is not in use. */ 2218 offset /= MLX5_DBR_SIZE; 2219 int i = offset / 64; 2220 int j = offset % 64; 2221 2222 page->dbr_bitmap[i] &= ~(UINT64_C(1) << j); 2223 } 2224 return ret; 2225 } 2226 2227 int 2228 rte_pmd_mlx5_get_dyn_flag_names(char *names[], unsigned int n) 2229 { 2230 static const char *const dynf_names[] = { 2231 RTE_PMD_MLX5_FINE_GRANULARITY_INLINE, 2232 RTE_MBUF_DYNFLAG_METADATA_NAME 2233 }; 2234 unsigned int i; 2235 2236 if (n < RTE_DIM(dynf_names)) 2237 return -ENOMEM; 2238 for (i = 0; i < RTE_DIM(dynf_names); i++) { 2239 if (names[i] == NULL) 2240 return -EINVAL; 2241 strcpy(names[i], dynf_names[i]); 2242 } 2243 return RTE_DIM(dynf_names); 2244 } 2245 2246 /** 2247 * Check sibling device configurations. 2248 * 2249 * Sibling devices sharing the Infiniband device context 2250 * should have compatible configurations. This regards 2251 * representors and bonding slaves. 2252 * 2253 * @param priv 2254 * Private device descriptor. 2255 * @param config 2256 * Configuration of the device is going to be created. 2257 * 2258 * @return 2259 * 0 on success, EINVAL otherwise 2260 */ 2261 static int 2262 mlx5_dev_check_sibling_config(struct mlx5_priv *priv, 2263 struct mlx5_dev_config *config) 2264 { 2265 struct mlx5_ibv_shared *sh = priv->sh; 2266 struct mlx5_dev_config *sh_conf = NULL; 2267 uint16_t port_id; 2268 2269 MLX5_ASSERT(sh); 2270 /* Nothing to compare for the single/first device. */ 2271 if (sh->refcnt == 1) 2272 return 0; 2273 /* Find the device with shared context. 
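	 * Any other port whose private data points to the same mlx5_ibv_shared
	 * object provides the reference configuration to compare against.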
*/ 2274 MLX5_ETH_FOREACH_DEV(port_id, priv->pci_dev) { 2275 struct mlx5_priv *opriv = 2276 rte_eth_devices[port_id].data->dev_private; 2277 2278 if (opriv && opriv != priv && opriv->sh == sh) { 2279 sh_conf = &opriv->config; 2280 break; 2281 } 2282 } 2283 if (!sh_conf) 2284 return 0; 2285 if (sh_conf->dv_flow_en ^ config->dv_flow_en) { 2286 DRV_LOG(ERR, "\"dv_flow_en\" configuration mismatch" 2287 " for shared %s context", sh->ibdev_name); 2288 rte_errno = EINVAL; 2289 return rte_errno; 2290 } 2291 if (sh_conf->dv_xmeta_en ^ config->dv_xmeta_en) { 2292 DRV_LOG(ERR, "\"dv_xmeta_en\" configuration mismatch" 2293 " for shared %s context", sh->ibdev_name); 2294 rte_errno = EINVAL; 2295 return rte_errno; 2296 } 2297 return 0; 2298 } 2299 /** 2300 * Spawn an Ethernet device from Verbs information. 2301 * 2302 * @param dpdk_dev 2303 * Backing DPDK device. 2304 * @param spawn 2305 * Verbs device parameters (name, port, switch_info) to spawn. 2306 * @param config 2307 * Device configuration parameters. 2308 * 2309 * @return 2310 * A valid Ethernet device object on success, NULL otherwise and rte_errno 2311 * is set. The following errors are defined: 2312 * 2313 * EBUSY: device is not supposed to be spawned. 2314 * EEXIST: device is already spawned. 2315 */ 2316 static struct rte_eth_dev * 2317 mlx5_dev_spawn(struct rte_device *dpdk_dev, 2318 struct mlx5_dev_spawn_data *spawn, 2319 struct mlx5_dev_config config) 2320 { 2321 const struct mlx5_switch_info *switch_info = &spawn->info; 2322 struct mlx5_ibv_shared *sh = NULL; 2323 struct ibv_port_attr port_attr; 2324 struct mlx5dv_context dv_attr = { .comp_mask = 0 }; 2325 struct rte_eth_dev *eth_dev = NULL; 2326 struct mlx5_priv *priv = NULL; 2327 int err = 0; 2328 unsigned int hw_padding = 0; 2329 unsigned int mps; 2330 unsigned int cqe_comp; 2331 unsigned int cqe_pad = 0; 2332 unsigned int tunnel_en = 0; 2333 unsigned int mpls_en = 0; 2334 unsigned int swp = 0; 2335 unsigned int mprq = 0; 2336 unsigned int mprq_min_stride_size_n = 0; 2337 unsigned int mprq_max_stride_size_n = 0; 2338 unsigned int mprq_min_stride_num_n = 0; 2339 unsigned int mprq_max_stride_num_n = 0; 2340 struct rte_ether_addr mac; 2341 char name[RTE_ETH_NAME_MAX_LEN]; 2342 int own_domain_id = 0; 2343 uint16_t port_id; 2344 unsigned int i; 2345 #ifdef HAVE_MLX5DV_DR_DEVX_PORT 2346 struct mlx5dv_devx_port devx_port = { .comp_mask = 0 }; 2347 #endif 2348 2349 /* Determine if this port representor is supposed to be spawned. */ 2350 if (switch_info->representor && dpdk_dev->devargs) { 2351 struct rte_eth_devargs eth_da; 2352 2353 err = rte_eth_devargs_parse(dpdk_dev->devargs->args, &eth_da); 2354 if (err) { 2355 rte_errno = -err; 2356 DRV_LOG(ERR, "failed to process device arguments: %s", 2357 strerror(rte_errno)); 2358 return NULL; 2359 } 2360 for (i = 0; i < eth_da.nb_representor_ports; ++i) 2361 if (eth_da.representor_ports[i] == 2362 (uint16_t)switch_info->port_name) 2363 break; 2364 if (i == eth_da.nb_representor_ports) { 2365 rte_errno = EBUSY; 2366 return NULL; 2367 } 2368 } 2369 /* Build device name. */ 2370 if (spawn->pf_bond < 0) { 2371 /* Single device. */ 2372 if (!switch_info->representor) 2373 strlcpy(name, dpdk_dev->name, sizeof(name)); 2374 else 2375 snprintf(name, sizeof(name), "%s_representor_%u", 2376 dpdk_dev->name, switch_info->port_name); 2377 } else { 2378 /* Bonding device.
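		 * The name is built as "<dpdk-dev>_<ibv-dev>" for the master and
		 * "<dpdk-dev>_<ibv-dev>_representor_<port>" for representors, e.g. a
		 * hypothetical "0000:82:00.0_mlx5_bond_0_representor_2".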
*/ 2379 if (!switch_info->representor) 2380 snprintf(name, sizeof(name), "%s_%s", 2381 dpdk_dev->name, spawn->ibv_dev->name); 2382 else 2383 snprintf(name, sizeof(name), "%s_%s_representor_%u", 2384 dpdk_dev->name, spawn->ibv_dev->name, 2385 switch_info->port_name); 2386 } 2387 /* Check if the device is already spawned. */ 2388 if (rte_eth_dev_get_port_by_name(name, &port_id) == 0) { 2389 rte_errno = EEXIST; 2390 return NULL; 2391 } 2392 DRV_LOG(DEBUG, "naming Ethernet device \"%s\"", name); 2393 if (rte_eal_process_type() == RTE_PROC_SECONDARY) { 2394 struct mlx5_mp_id mp_id; 2395 2396 eth_dev = rte_eth_dev_attach_secondary(name); 2397 if (eth_dev == NULL) { 2398 DRV_LOG(ERR, "can not attach rte ethdev"); 2399 rte_errno = ENOMEM; 2400 return NULL; 2401 } 2402 eth_dev->device = dpdk_dev; 2403 eth_dev->dev_ops = &mlx5_dev_sec_ops; 2404 err = mlx5_proc_priv_init(eth_dev); 2405 if (err) 2406 return NULL; 2407 mp_id.port_id = eth_dev->data->port_id; 2408 strlcpy(mp_id.name, MLX5_MP_NAME, RTE_MP_MAX_NAME_LEN); 2409 /* Receive command fd from the primary process. */ 2410 err = mlx5_mp_req_verbs_cmd_fd(&mp_id); 2411 if (err < 0) 2412 return NULL; 2413 /* Remap UAR for Tx queues. */ 2414 err = mlx5_tx_uar_init_secondary(eth_dev, err); 2415 if (err) 2416 return NULL; 2417 /* 2418 * Ethdev pointer is still required as input since 2419 * the primary device is not accessible from the 2420 * secondary process. 2421 */ 2422 eth_dev->rx_pkt_burst = mlx5_select_rx_function(eth_dev); 2423 eth_dev->tx_pkt_burst = mlx5_select_tx_function(eth_dev); 2424 return eth_dev; 2425 } 2426 /* 2427 * Some parameters ("tx_db_nc" in particular) are needed in 2428 * advance to create the dv/verbs device context. We process the 2429 * devargs here to get them, and later process the devargs again 2430 * to override some hardware settings. 2431 */ 2432 err = mlx5_args(&config, dpdk_dev->devargs); 2433 if (err) { 2434 err = rte_errno; 2435 DRV_LOG(ERR, "failed to process device arguments: %s", 2436 strerror(rte_errno)); 2437 goto error; 2438 } 2439 sh = mlx5_alloc_shared_ibctx(spawn, &config); 2440 if (!sh) 2441 return NULL; 2442 config.devx = sh->devx; 2443 #ifdef HAVE_MLX5DV_DR_ACTION_DEST_DEVX_TIR 2444 config.dest_tir = 1; 2445 #endif 2446 #ifdef HAVE_IBV_MLX5_MOD_SWP 2447 dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_SWP; 2448 #endif 2449 /* 2450 * Multi-packet send is supported by ConnectX-4 Lx PF as well 2451 * as all ConnectX-5 devices.
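	 * The actually available MPW flavor is taken from the flags returned
	 * by the dv_query_device() call just below.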
2452 */ 2453 #ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT 2454 dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS; 2455 #endif 2456 #ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT 2457 dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_STRIDING_RQ; 2458 #endif 2459 mlx5_glue->dv_query_device(sh->ctx, &dv_attr); 2460 if (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED) { 2461 if (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW) { 2462 DRV_LOG(DEBUG, "enhanced MPW is supported"); 2463 mps = MLX5_MPW_ENHANCED; 2464 } else { 2465 DRV_LOG(DEBUG, "MPW is supported"); 2466 mps = MLX5_MPW; 2467 } 2468 } else { 2469 DRV_LOG(DEBUG, "MPW isn't supported"); 2470 mps = MLX5_MPW_DISABLED; 2471 } 2472 #ifdef HAVE_IBV_MLX5_MOD_SWP 2473 if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_SWP) 2474 swp = dv_attr.sw_parsing_caps.sw_parsing_offloads; 2475 DRV_LOG(DEBUG, "SWP support: %u", swp); 2476 #endif 2477 config.swp = !!swp; 2478 #ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT 2479 if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_STRIDING_RQ) { 2480 struct mlx5dv_striding_rq_caps mprq_caps = 2481 dv_attr.striding_rq_caps; 2482 2483 DRV_LOG(DEBUG, "\tmin_single_stride_log_num_of_bytes: %d", 2484 mprq_caps.min_single_stride_log_num_of_bytes); 2485 DRV_LOG(DEBUG, "\tmax_single_stride_log_num_of_bytes: %d", 2486 mprq_caps.max_single_stride_log_num_of_bytes); 2487 DRV_LOG(DEBUG, "\tmin_single_wqe_log_num_of_strides: %d", 2488 mprq_caps.min_single_wqe_log_num_of_strides); 2489 DRV_LOG(DEBUG, "\tmax_single_wqe_log_num_of_strides: %d", 2490 mprq_caps.max_single_wqe_log_num_of_strides); 2491 DRV_LOG(DEBUG, "\tsupported_qpts: %d", 2492 mprq_caps.supported_qpts); 2493 DRV_LOG(DEBUG, "device supports Multi-Packet RQ"); 2494 mprq = 1; 2495 mprq_min_stride_size_n = 2496 mprq_caps.min_single_stride_log_num_of_bytes; 2497 mprq_max_stride_size_n = 2498 mprq_caps.max_single_stride_log_num_of_bytes; 2499 mprq_min_stride_num_n = 2500 mprq_caps.min_single_wqe_log_num_of_strides; 2501 mprq_max_stride_num_n = 2502 mprq_caps.max_single_wqe_log_num_of_strides; 2503 } 2504 #endif 2505 if (RTE_CACHE_LINE_SIZE == 128 && 2506 !(dv_attr.flags & MLX5DV_CONTEXT_FLAGS_CQE_128B_COMP)) 2507 cqe_comp = 0; 2508 else 2509 cqe_comp = 1; 2510 config.cqe_comp = cqe_comp; 2511 #ifdef HAVE_IBV_MLX5_MOD_CQE_128B_PAD 2512 /* Whether device supports 128B Rx CQE padding. */ 2513 cqe_pad = RTE_CACHE_LINE_SIZE == 128 && 2514 (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_CQE_128B_PAD); 2515 #endif 2516 #ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT 2517 if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS) { 2518 tunnel_en = ((dv_attr.tunnel_offloads_caps & 2519 MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_VXLAN) && 2520 (dv_attr.tunnel_offloads_caps & 2521 MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_GRE) && 2522 (dv_attr.tunnel_offloads_caps & 2523 MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_GENEVE)); 2524 } 2525 DRV_LOG(DEBUG, "tunnel offloading is %ssupported", 2526 tunnel_en ? "" : "not "); 2527 #else 2528 DRV_LOG(WARNING, 2529 "tunnel offloading disabled due to old OFED/rdma-core version"); 2530 #endif 2531 config.tunnel_en = tunnel_en; 2532 #ifdef HAVE_IBV_DEVICE_MPLS_SUPPORT 2533 mpls_en = ((dv_attr.tunnel_offloads_caps & 2534 MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_CW_MPLS_OVER_GRE) && 2535 (dv_attr.tunnel_offloads_caps & 2536 MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_CW_MPLS_OVER_UDP)); 2537 DRV_LOG(DEBUG, "MPLS over GRE/UDP tunnel offloading is %ssupported", 2538 mpls_en ? 
"" : "not "); 2539 #else 2540 DRV_LOG(WARNING, "MPLS over GRE/UDP tunnel offloading disabled due to" 2541 " old OFED/rdma-core version or firmware configuration"); 2542 #endif 2543 config.mpls_en = mpls_en; 2544 /* Check port status. */ 2545 err = mlx5_glue->query_port(sh->ctx, spawn->ibv_port, &port_attr); 2546 if (err) { 2547 DRV_LOG(ERR, "port query failed: %s", strerror(err)); 2548 goto error; 2549 } 2550 if (port_attr.link_layer != IBV_LINK_LAYER_ETHERNET) { 2551 DRV_LOG(ERR, "port is not configured in Ethernet mode"); 2552 err = EINVAL; 2553 goto error; 2554 } 2555 if (port_attr.state != IBV_PORT_ACTIVE) 2556 DRV_LOG(DEBUG, "port is not active: \"%s\" (%d)", 2557 mlx5_glue->port_state_str(port_attr.state), 2558 port_attr.state); 2559 /* Allocate private eth device data. */ 2560 priv = rte_zmalloc("ethdev private structure", 2561 sizeof(*priv), 2562 RTE_CACHE_LINE_SIZE); 2563 if (priv == NULL) { 2564 DRV_LOG(ERR, "priv allocation failure"); 2565 err = ENOMEM; 2566 goto error; 2567 } 2568 priv->sh = sh; 2569 priv->ibv_port = spawn->ibv_port; 2570 priv->pci_dev = spawn->pci_dev; 2571 priv->mtu = RTE_ETHER_MTU; 2572 priv->mp_id.port_id = port_id; 2573 strlcpy(priv->mp_id.name, MLX5_MP_NAME, RTE_MP_MAX_NAME_LEN); 2574 #ifndef RTE_ARCH_64 2575 /* Initialize UAR access locks for 32bit implementations. */ 2576 rte_spinlock_init(&priv->uar_lock_cq); 2577 for (i = 0; i < MLX5_UAR_PAGE_NUM_MAX; i++) 2578 rte_spinlock_init(&priv->uar_lock[i]); 2579 #endif 2580 /* Some internal functions rely on Netlink sockets, open them now. */ 2581 priv->nl_socket_rdma = mlx5_nl_init(NETLINK_RDMA); 2582 priv->nl_socket_route = mlx5_nl_init(NETLINK_ROUTE); 2583 priv->representor = !!switch_info->representor; 2584 priv->master = !!switch_info->master; 2585 priv->domain_id = RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID; 2586 priv->vport_meta_tag = 0; 2587 priv->vport_meta_mask = 0; 2588 priv->pf_bond = spawn->pf_bond; 2589 #ifdef HAVE_MLX5DV_DR_DEVX_PORT 2590 /* 2591 * The DevX port query API is implemented. E-Switch may use 2592 * either vport or reg_c[0] metadata register to match on 2593 * vport index. The engaged part of metadata register is 2594 * defined by mask. 
*/ 2596 if (switch_info->representor || switch_info->master) { 2597 devx_port.comp_mask = MLX5DV_DEVX_PORT_VPORT | 2598 MLX5DV_DEVX_PORT_MATCH_REG_C_0; 2599 err = mlx5_glue->devx_port_query(sh->ctx, spawn->ibv_port, 2600 &devx_port); 2601 if (err) { 2602 DRV_LOG(WARNING, 2603 "can't query devx port %d on device %s", 2604 spawn->ibv_port, spawn->ibv_dev->name); 2605 devx_port.comp_mask = 0; 2606 } 2607 } 2608 if (devx_port.comp_mask & MLX5DV_DEVX_PORT_MATCH_REG_C_0) { 2609 priv->vport_meta_tag = devx_port.reg_c_0.value; 2610 priv->vport_meta_mask = devx_port.reg_c_0.mask; 2611 if (!priv->vport_meta_mask) { 2612 DRV_LOG(ERR, "vport zero mask for port %d" 2613 " on bonding device %s", 2614 spawn->ibv_port, spawn->ibv_dev->name); 2615 err = ENOTSUP; 2616 goto error; 2617 } 2618 if (priv->vport_meta_tag & ~priv->vport_meta_mask) { 2619 DRV_LOG(ERR, "invalid vport tag for port %d" 2620 " on bonding device %s", 2621 spawn->ibv_port, spawn->ibv_dev->name); 2622 err = ENOTSUP; 2623 goto error; 2624 } 2625 } 2626 if (devx_port.comp_mask & MLX5DV_DEVX_PORT_VPORT) { 2627 priv->vport_id = devx_port.vport_num; 2628 } else if (spawn->pf_bond >= 0) { 2629 DRV_LOG(ERR, "can't deduce vport index for port %d" 2630 " on bonding device %s", 2631 spawn->ibv_port, spawn->ibv_dev->name); 2632 err = ENOTSUP; 2633 goto error; 2634 } else { 2635 /* Assume the vport index in a compatible way. */ 2636 priv->vport_id = switch_info->representor ? 2637 switch_info->port_name + 1 : -1; 2638 } 2639 #else 2640 /* 2641 * Kernel/rdma_core supports single E-Switch per PF configurations 2642 * only, and the vport_id field contains the vport index for the 2643 * associated VF, which is deduced from the representor port name. 2644 * For example, if the IB device port 10 has the attached network 2645 * device eth0 with the port name attribute pf0vf2, we can deduce 2646 * the VF number as 2 and set the vport index as 3 (2+1). This 2647 * assignment scheme should be changed if multiple E-Switch 2648 * instances per PF and/or PCI subfunctions 2649 * are added. 2650 */ 2651 priv->vport_id = switch_info->representor ? 2652 switch_info->port_name + 1 : -1; 2653 #endif 2654 /* representor_id field keeps the unmodified VF index. */ 2655 priv->representor_id = switch_info->representor ? 2656 switch_info->port_name : -1; 2657 /* 2658 * Look for sibling devices in order to reuse their switch domain 2659 * if any, otherwise allocate one. 2660 */ 2661 MLX5_ETH_FOREACH_DEV(port_id, priv->pci_dev) { 2662 const struct mlx5_priv *opriv = 2663 rte_eth_devices[port_id].data->dev_private; 2664 2665 if (!opriv || 2666 opriv->sh != priv->sh || 2667 opriv->domain_id == 2668 RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID) 2669 continue; 2670 priv->domain_id = opriv->domain_id; 2671 break; 2672 } 2673 if (priv->domain_id == RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID) { 2674 err = rte_eth_switch_domain_alloc(&priv->domain_id); 2675 if (err) { 2676 err = rte_errno; 2677 DRV_LOG(ERR, "unable to allocate switch domain: %s", 2678 strerror(rte_errno)); 2679 goto error; 2680 } 2681 own_domain_id = 1; 2682 } 2683 /* Override some values set by hardware configuration. */ 2684 mlx5_args(&config, dpdk_dev->devargs); 2685 err = mlx5_dev_check_sibling_config(priv, &config); 2686 if (err) 2687 goto error; 2688 config.hw_csum = !!(sh->device_attr.device_cap_flags_ex & 2689 IBV_DEVICE_RAW_IP_CSUM); 2690 DRV_LOG(DEBUG, "checksum offloading is %ssupported", 2691 (config.hw_csum ?
"" : "not ")); 2692 #if !defined(HAVE_IBV_DEVICE_COUNTERS_SET_V42) && \ 2693 !defined(HAVE_IBV_DEVICE_COUNTERS_SET_V45) 2694 DRV_LOG(DEBUG, "counters are not supported"); 2695 #endif 2696 #if !defined(HAVE_IBV_FLOW_DV_SUPPORT) || !defined(HAVE_MLX5DV_DR) 2697 if (config.dv_flow_en) { 2698 DRV_LOG(WARNING, "DV flow is not supported"); 2699 config.dv_flow_en = 0; 2700 } 2701 #endif 2702 config.ind_table_max_size = 2703 sh->device_attr.rss_caps.max_rwq_indirection_table_size; 2704 /* 2705 * Remove this check once DPDK supports larger/variable 2706 * indirection tables. 2707 */ 2708 if (config.ind_table_max_size > (unsigned int)ETH_RSS_RETA_SIZE_512) 2709 config.ind_table_max_size = ETH_RSS_RETA_SIZE_512; 2710 DRV_LOG(DEBUG, "maximum Rx indirection table size is %u", 2711 config.ind_table_max_size); 2712 config.hw_vlan_strip = !!(sh->device_attr.raw_packet_caps & 2713 IBV_RAW_PACKET_CAP_CVLAN_STRIPPING); 2714 DRV_LOG(DEBUG, "VLAN stripping is %ssupported", 2715 (config.hw_vlan_strip ? "" : "not ")); 2716 config.hw_fcs_strip = !!(sh->device_attr.raw_packet_caps & 2717 IBV_RAW_PACKET_CAP_SCATTER_FCS); 2718 DRV_LOG(DEBUG, "FCS stripping configuration is %ssupported", 2719 (config.hw_fcs_strip ? "" : "not ")); 2720 #if defined(HAVE_IBV_WQ_FLAG_RX_END_PADDING) 2721 hw_padding = !!sh->device_attr.rx_pad_end_addr_align; 2722 #elif defined(HAVE_IBV_WQ_FLAGS_PCI_WRITE_END_PADDING) 2723 hw_padding = !!(sh->device_attr.device_cap_flags_ex & 2724 IBV_DEVICE_PCI_WRITE_END_PADDING); 2725 #endif 2726 if (config.hw_padding && !hw_padding) { 2727 DRV_LOG(DEBUG, "Rx end alignment padding isn't supported"); 2728 config.hw_padding = 0; 2729 } else if (config.hw_padding) { 2730 DRV_LOG(DEBUG, "Rx end alignment padding is enabled"); 2731 } 2732 config.tso = (sh->device_attr.tso_caps.max_tso > 0 && 2733 (sh->device_attr.tso_caps.supported_qpts & 2734 (1 << IBV_QPT_RAW_PACKET))); 2735 if (config.tso) 2736 config.tso_max_payload_sz = sh->device_attr.tso_caps.max_tso; 2737 /* 2738 * MPW is disabled by default, while the Enhanced MPW is enabled 2739 * by default. 2740 */ 2741 if (config.mps == MLX5_ARG_UNSET) 2742 config.mps = (mps == MLX5_MPW_ENHANCED) ? MLX5_MPW_ENHANCED : 2743 MLX5_MPW_DISABLED; 2744 else 2745 config.mps = config.mps ? mps : MLX5_MPW_DISABLED; 2746 DRV_LOG(INFO, "%sMPS is %s", 2747 config.mps == MLX5_MPW_ENHANCED ? "enhanced " : 2748 config.mps == MLX5_MPW ? "legacy " : "", 2749 config.mps != MLX5_MPW_DISABLED ? "enabled" : "disabled"); 2750 if (config.cqe_comp && !cqe_comp) { 2751 DRV_LOG(WARNING, "Rx CQE compression isn't supported"); 2752 config.cqe_comp = 0; 2753 } 2754 if (config.cqe_pad && !cqe_pad) { 2755 DRV_LOG(WARNING, "Rx CQE padding isn't supported"); 2756 config.cqe_pad = 0; 2757 } else if (config.cqe_pad) { 2758 DRV_LOG(INFO, "Rx CQE padding is enabled"); 2759 } 2760 if (config.devx) { 2761 priv->counter_fallback = 0; 2762 err = mlx5_devx_cmd_query_hca_attr(sh->ctx, &config.hca_attr); 2763 if (err) { 2764 err = -err; 2765 goto error; 2766 } 2767 if (!config.hca_attr.flow_counters_dump) 2768 priv->counter_fallback = 1; 2769 #ifndef HAVE_IBV_DEVX_ASYNC 2770 priv->counter_fallback = 1; 2771 #endif 2772 if (priv->counter_fallback) 2773 DRV_LOG(INFO, "Use fall-back DV counter management"); 2774 /* Check for LRO support. */ 2775 if (config.dest_tir && config.hca_attr.lro_cap && 2776 config.dv_flow_en) { 2777 /* TBD check tunnel lro caps. 
*/ 2778 config.lro.supported = config.hca_attr.lro_cap; 2779 DRV_LOG(DEBUG, "Device supports LRO"); 2780 /* 2781 * If LRO timeout is not configured by application, 2782 * use the minimal supported value. 2783 */ 2784 if (!config.lro.timeout) 2785 config.lro.timeout = 2786 config.hca_attr.lro_timer_supported_periods[0]; 2787 DRV_LOG(DEBUG, "LRO session timeout set to %d usec", 2788 config.lro.timeout); 2789 } 2790 #if defined(HAVE_MLX5DV_DR) && defined(HAVE_MLX5_DR_CREATE_ACTION_FLOW_METER) 2791 if (config.hca_attr.qos.sup && config.hca_attr.qos.srtcm_sup && 2792 config.dv_flow_en) { 2793 uint8_t reg_c_mask = 2794 config.hca_attr.qos.flow_meter_reg_c_ids; 2795 /* 2796 * Meter needs two REG_C's for color match and pre-sfx 2797 * flow match. Here get the REG_C for color match. 2798 * REG_C_0 and REG_C_1 is reserved for metadata feature. 2799 */ 2800 reg_c_mask &= 0xfc; 2801 if (__builtin_popcount(reg_c_mask) < 1) { 2802 priv->mtr_en = 0; 2803 DRV_LOG(WARNING, "No available register for" 2804 " meter."); 2805 } else { 2806 priv->mtr_color_reg = ffs(reg_c_mask) - 1 + 2807 REG_C_0; 2808 priv->mtr_en = 1; 2809 priv->mtr_reg_share = 2810 config.hca_attr.qos.flow_meter_reg_share; 2811 DRV_LOG(DEBUG, "The REG_C meter uses is %d", 2812 priv->mtr_color_reg); 2813 } 2814 } 2815 #endif 2816 } 2817 if (config.mprq.enabled && mprq) { 2818 if (config.mprq.stride_num_n && 2819 (config.mprq.stride_num_n > mprq_max_stride_num_n || 2820 config.mprq.stride_num_n < mprq_min_stride_num_n)) { 2821 config.mprq.stride_num_n = 2822 RTE_MIN(RTE_MAX(MLX5_MPRQ_STRIDE_NUM_N, 2823 mprq_min_stride_num_n), 2824 mprq_max_stride_num_n); 2825 DRV_LOG(WARNING, 2826 "the number of strides" 2827 " for Multi-Packet RQ is out of range," 2828 " setting default value (%u)", 2829 1 << config.mprq.stride_num_n); 2830 } 2831 if (config.mprq.stride_size_n && 2832 (config.mprq.stride_size_n > mprq_max_stride_size_n || 2833 config.mprq.stride_size_n < mprq_min_stride_size_n)) { 2834 config.mprq.stride_size_n = 2835 RTE_MIN(RTE_MAX(MLX5_MPRQ_STRIDE_SIZE_N, 2836 mprq_min_stride_size_n), 2837 mprq_max_stride_size_n); 2838 DRV_LOG(WARNING, 2839 "the size of a stride" 2840 " for Multi-Packet RQ is out of range," 2841 " setting default value (%u)", 2842 1 << config.mprq.stride_size_n); 2843 } 2844 config.mprq.min_stride_size_n = mprq_min_stride_size_n; 2845 config.mprq.max_stride_size_n = mprq_max_stride_size_n; 2846 } else if (config.mprq.enabled && !mprq) { 2847 DRV_LOG(WARNING, "Multi-Packet RQ isn't supported"); 2848 config.mprq.enabled = 0; 2849 } 2850 if (config.max_dump_files_num == 0) 2851 config.max_dump_files_num = 128; 2852 eth_dev = rte_eth_dev_allocate(name); 2853 if (eth_dev == NULL) { 2854 DRV_LOG(ERR, "can not allocate rte ethdev"); 2855 err = ENOMEM; 2856 goto error; 2857 } 2858 /* Flag to call rte_eth_dev_release_port() in rte_eth_dev_close(). */ 2859 eth_dev->data->dev_flags |= RTE_ETH_DEV_CLOSE_REMOVE; 2860 if (priv->representor) { 2861 eth_dev->data->dev_flags |= RTE_ETH_DEV_REPRESENTOR; 2862 eth_dev->data->representor_id = priv->representor_id; 2863 } 2864 /* 2865 * Store associated network device interface index. This index 2866 * is permanent throughout the lifetime of device. So, we may store 2867 * the ifindex here and use the cached value further. 
2868 */ 2869 MLX5_ASSERT(spawn->ifindex); 2870 priv->if_index = spawn->ifindex; 2871 eth_dev->data->dev_private = priv; 2872 priv->dev_data = eth_dev->data; 2873 eth_dev->data->mac_addrs = priv->mac; 2874 eth_dev->device = dpdk_dev; 2875 /* Configure the first MAC address by default. */ 2876 if (mlx5_get_mac(eth_dev, &mac.addr_bytes)) { 2877 DRV_LOG(ERR, 2878 "port %u cannot get MAC address, is mlx5_en" 2879 " loaded? (errno: %s)", 2880 eth_dev->data->port_id, strerror(rte_errno)); 2881 err = ENODEV; 2882 goto error; 2883 } 2884 DRV_LOG(INFO, 2885 "port %u MAC address is %02x:%02x:%02x:%02x:%02x:%02x", 2886 eth_dev->data->port_id, 2887 mac.addr_bytes[0], mac.addr_bytes[1], 2888 mac.addr_bytes[2], mac.addr_bytes[3], 2889 mac.addr_bytes[4], mac.addr_bytes[5]); 2890 #ifdef RTE_LIBRTE_MLX5_DEBUG 2891 { 2892 char ifname[IF_NAMESIZE]; 2893 2894 if (mlx5_get_ifname(eth_dev, &ifname) == 0) 2895 DRV_LOG(DEBUG, "port %u ifname is \"%s\"", 2896 eth_dev->data->port_id, ifname); 2897 else 2898 DRV_LOG(DEBUG, "port %u ifname is unknown", 2899 eth_dev->data->port_id); 2900 } 2901 #endif 2902 /* Get actual MTU if possible. */ 2903 err = mlx5_get_mtu(eth_dev, &priv->mtu); 2904 if (err) { 2905 err = rte_errno; 2906 goto error; 2907 } 2908 DRV_LOG(DEBUG, "port %u MTU is %u", eth_dev->data->port_id, 2909 priv->mtu); 2910 /* Initialize burst functions to prevent crashes before link-up. */ 2911 eth_dev->rx_pkt_burst = removed_rx_burst; 2912 eth_dev->tx_pkt_burst = removed_tx_burst; 2913 eth_dev->dev_ops = &mlx5_dev_ops; 2914 /* Register MAC address. */ 2915 claim_zero(mlx5_mac_addr_add(eth_dev, &mac, 0, 0)); 2916 if (config.vf && config.vf_nl_en) 2917 mlx5_nl_mac_addr_sync(priv->nl_socket_route, 2918 mlx5_ifindex(eth_dev), 2919 eth_dev->data->mac_addrs, 2920 MLX5_MAX_MAC_ADDRESSES); 2921 priv->flows = 0; 2922 priv->ctrl_flows = 0; 2923 TAILQ_INIT(&priv->flow_meters); 2924 TAILQ_INIT(&priv->flow_meter_profiles); 2925 /* Hint libmlx5 to use PMD allocator for data plane resources */ 2926 struct mlx5dv_ctx_allocators alctr = { 2927 .alloc = &mlx5_alloc_verbs_buf, 2928 .free = &mlx5_free_verbs_buf, 2929 .data = priv, 2930 }; 2931 mlx5_glue->dv_set_context_attr(sh->ctx, 2932 MLX5DV_CTX_ATTR_BUF_ALLOCATORS, 2933 (void *)((uintptr_t)&alctr)); 2934 /* Bring Ethernet device up. */ 2935 DRV_LOG(DEBUG, "port %u forcing Ethernet interface up", 2936 eth_dev->data->port_id); 2937 mlx5_set_link_up(eth_dev); 2938 /* 2939 * Even though the interrupt handler is not installed yet, 2940 * interrupts will still trigger on the async_fd from 2941 * Verbs context returned by ibv_open_device(). 2942 */ 2943 mlx5_link_update(eth_dev, 0); 2944 #ifdef HAVE_MLX5DV_DR_ESWITCH 2945 if (!(config.hca_attr.eswitch_manager && config.dv_flow_en && 2946 (switch_info->representor || switch_info->master))) 2947 config.dv_esw_en = 0; 2948 #else 2949 config.dv_esw_en = 0; 2950 #endif 2951 /* Detect minimal data bytes to inline. */ 2952 mlx5_set_min_inline(spawn, &config); 2953 /* Store device configuration on private structure. */ 2954 priv->config = config; 2955 /* Create context for virtual machine VLAN workaround. */ 2956 priv->vmwa_context = mlx5_vlan_vmwa_init(eth_dev, spawn->ifindex); 2957 if (config.dv_flow_en) { 2958 err = mlx5_alloc_shared_dr(priv); 2959 if (err) 2960 goto error; 2961 /* 2962 * RSS id is shared with meter flow id. Meter flow id can only 2963 * use the 24 MSB of the register. 
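		 * Hence the pool below is sized as UINT32_MAX >> MLX5_MTR_COLOR_BITS,
		 * so that allocated ids never collide with the bits reserved for the
		 * meter color.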
2964 */ 2965 priv->qrss_id_pool = mlx5_flow_id_pool_alloc(UINT32_MAX >> 2966 MLX5_MTR_COLOR_BITS); 2967 if (!priv->qrss_id_pool) { 2968 DRV_LOG(ERR, "can't create flow id pool"); 2969 err = ENOMEM; 2970 goto error; 2971 } 2972 } 2973 /* Supported Verbs flow priority number detection. */ 2974 err = mlx5_flow_discover_priorities(eth_dev); 2975 if (err < 0) { 2976 err = -err; 2977 goto error; 2978 } 2979 priv->config.flow_prio = err; 2980 if (!priv->config.dv_esw_en && 2981 priv->config.dv_xmeta_en != MLX5_XMETA_MODE_LEGACY) { 2982 DRV_LOG(WARNING, "metadata mode %u is not supported " 2983 "(no E-Switch)", priv->config.dv_xmeta_en); 2984 priv->config.dv_xmeta_en = MLX5_XMETA_MODE_LEGACY; 2985 } 2986 mlx5_set_metadata_mask(eth_dev); 2987 if (priv->config.dv_xmeta_en != MLX5_XMETA_MODE_LEGACY && 2988 !priv->sh->dv_regc0_mask) { 2989 DRV_LOG(ERR, "metadata mode %u is not supported " 2990 "(no metadata reg_c[0] is available)", 2991 priv->config.dv_xmeta_en); 2992 err = ENOTSUP; 2993 goto error; 2994 } 2995 /* 2996 * Allocate the buffer for flow creating, just once. 2997 * The allocation must be done before any flow creating. 2998 */ 2999 mlx5_flow_alloc_intermediate(eth_dev); 3000 /* Query availibility of metadata reg_c's. */ 3001 err = mlx5_flow_discover_mreg_c(eth_dev); 3002 if (err < 0) { 3003 err = -err; 3004 goto error; 3005 } 3006 if (!mlx5_flow_ext_mreg_supported(eth_dev)) { 3007 DRV_LOG(DEBUG, 3008 "port %u extensive metadata register is not supported", 3009 eth_dev->data->port_id); 3010 if (priv->config.dv_xmeta_en != MLX5_XMETA_MODE_LEGACY) { 3011 DRV_LOG(ERR, "metadata mode %u is not supported " 3012 "(no metadata registers available)", 3013 priv->config.dv_xmeta_en); 3014 err = ENOTSUP; 3015 goto error; 3016 } 3017 } 3018 if (priv->config.dv_flow_en && 3019 priv->config.dv_xmeta_en != MLX5_XMETA_MODE_LEGACY && 3020 mlx5_flow_ext_mreg_supported(eth_dev) && 3021 priv->sh->dv_regc0_mask) { 3022 priv->mreg_cp_tbl = mlx5_hlist_create(MLX5_FLOW_MREG_HNAME, 3023 MLX5_FLOW_MREG_HTABLE_SZ); 3024 if (!priv->mreg_cp_tbl) { 3025 err = ENOMEM; 3026 goto error; 3027 } 3028 } 3029 return eth_dev; 3030 error: 3031 if (priv) { 3032 if (priv->mreg_cp_tbl) 3033 mlx5_hlist_destroy(priv->mreg_cp_tbl, NULL, NULL); 3034 if (priv->sh) 3035 mlx5_free_shared_dr(priv); 3036 if (priv->nl_socket_route >= 0) 3037 close(priv->nl_socket_route); 3038 if (priv->nl_socket_rdma >= 0) 3039 close(priv->nl_socket_rdma); 3040 if (priv->vmwa_context) 3041 mlx5_vlan_vmwa_exit(priv->vmwa_context); 3042 if (priv->qrss_id_pool) 3043 mlx5_flow_id_pool_release(priv->qrss_id_pool); 3044 if (own_domain_id) 3045 claim_zero(rte_eth_switch_domain_free(priv->domain_id)); 3046 rte_free(priv); 3047 if (eth_dev != NULL) 3048 eth_dev->data->dev_private = NULL; 3049 } 3050 if (eth_dev != NULL) { 3051 /* mac_addrs must not be freed alone because part of dev_private */ 3052 eth_dev->data->mac_addrs = NULL; 3053 rte_eth_dev_release_port(eth_dev); 3054 } 3055 if (sh) 3056 mlx5_free_shared_ibctx(sh); 3057 MLX5_ASSERT(err > 0); 3058 rte_errno = err; 3059 return NULL; 3060 } 3061 3062 /** 3063 * Comparison callback to sort device data. 3064 * 3065 * This is meant to be used with qsort(). 3066 * 3067 * @param a[in] 3068 * Pointer to pointer to first data object. 3069 * @param b[in] 3070 * Pointer to pointer to second data object. 3071 * 3072 * @return 3073 * 0 if both objects are equal, less than 0 if the first argument is less 3074 * than the second, greater than 0 otherwise. 
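 *
 * The resulting order is: master device first, then representors sorted by
 * port name, with unidentified devices last in no particular order.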
3075 */ 3076 static int 3077 mlx5_dev_spawn_data_cmp(const void *a, const void *b) 3078 { 3079 const struct mlx5_switch_info *si_a = 3080 &((const struct mlx5_dev_spawn_data *)a)->info; 3081 const struct mlx5_switch_info *si_b = 3082 &((const struct mlx5_dev_spawn_data *)b)->info; 3083 int ret; 3084 3085 /* Master device first. */ 3086 ret = si_b->master - si_a->master; 3087 if (ret) 3088 return ret; 3089 /* Then representor devices. */ 3090 ret = si_b->representor - si_a->representor; 3091 if (ret) 3092 return ret; 3093 /* Unidentified devices come last in no specific order. */ 3094 if (!si_a->representor) 3095 return 0; 3096 /* Order representors by name. */ 3097 return si_a->port_name - si_b->port_name; 3098 } 3099 3100 /** 3101 * Match PCI information for possible slaves of bonding device. 3102 * 3103 * @param[in] ibv_dev 3104 * Pointer to Infiniband device structure. 3105 * @param[in] pci_dev 3106 * Pointer to PCI device structure to match PCI address. 3107 * @param[in] nl_rdma 3108 * Netlink RDMA group socket handle. 3109 * 3110 * @return 3111 * negative value if no bonding device found, otherwise 3112 * positive index of slave PF in bonding. 3113 */ 3114 static int 3115 mlx5_device_bond_pci_match(const struct ibv_device *ibv_dev, 3116 const struct rte_pci_device *pci_dev, 3117 int nl_rdma) 3118 { 3119 char ifname[IF_NAMESIZE + 1]; 3120 unsigned int ifindex; 3121 unsigned int np, i; 3122 FILE *file = NULL; 3123 int pf = -1; 3124 3125 /* 3126 * Try to get master device name. If something goes 3127 * wrong suppose the lack of kernel support and no 3128 * bonding devices. 3129 */ 3130 if (nl_rdma < 0) 3131 return -1; 3132 if (!strstr(ibv_dev->name, "bond")) 3133 return -1; 3134 np = mlx5_nl_portnum(nl_rdma, ibv_dev->name); 3135 if (!np) 3136 return -1; 3137 /* 3138 * The Master device might not be on the predefined 3139 * port (not on port index 1, it is not garanted), 3140 * we have to scan all Infiniband device port and 3141 * find master. 3142 */ 3143 for (i = 1; i <= np; ++i) { 3144 /* Check whether Infiniband port is populated. */ 3145 ifindex = mlx5_nl_ifindex(nl_rdma, ibv_dev->name, i); 3146 if (!ifindex) 3147 continue; 3148 if (!if_indextoname(ifindex, ifname)) 3149 continue; 3150 /* Try to read bonding slave names from sysfs. */ 3151 MKSTR(slaves, 3152 "/sys/class/net/%s/master/bonding/slaves", ifname); 3153 file = fopen(slaves, "r"); 3154 if (file) 3155 break; 3156 } 3157 if (!file) 3158 return -1; 3159 /* Use safe format to check maximal buffer length. */ 3160 MLX5_ASSERT(atol(RTE_STR(IF_NAMESIZE)) == IF_NAMESIZE); 3161 while (fscanf(file, "%" RTE_STR(IF_NAMESIZE) "s", ifname) == 1) { 3162 char tmp_str[IF_NAMESIZE + 32]; 3163 struct rte_pci_addr pci_addr; 3164 struct mlx5_switch_info info; 3165 3166 /* Process slave interface names in the loop. */ 3167 snprintf(tmp_str, sizeof(tmp_str), 3168 "/sys/class/net/%s", ifname); 3169 if (mlx5_dev_to_pci_addr(tmp_str, &pci_addr)) { 3170 DRV_LOG(WARNING, "can not get PCI address" 3171 " for netdev \"%s\"", ifname); 3172 continue; 3173 } 3174 if (pci_dev->addr.domain != pci_addr.domain || 3175 pci_dev->addr.bus != pci_addr.bus || 3176 pci_dev->addr.devid != pci_addr.devid || 3177 pci_dev->addr.function != pci_addr.function) 3178 continue; 3179 /* Slave interface PCI address match found. 
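		 * Read its phys_port_name below to deduce the PF index of the slave.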
*/ 3180 fclose(file); 3181 snprintf(tmp_str, sizeof(tmp_str), 3182 "/sys/class/net/%s/phys_port_name", ifname); 3183 file = fopen(tmp_str, "rb"); 3184 if (!file) 3185 break; 3186 info.name_type = MLX5_PHYS_PORT_NAME_TYPE_NOTSET; 3187 if (fscanf(file, "%32s", tmp_str) == 1) 3188 mlx5_translate_port_name(tmp_str, &info); 3189 if (info.name_type == MLX5_PHYS_PORT_NAME_TYPE_LEGACY || 3190 info.name_type == MLX5_PHYS_PORT_NAME_TYPE_UPLINK) 3191 pf = info.port_name; 3192 break; 3193 } 3194 if (file) 3195 fclose(file); 3196 return pf; 3197 } 3198 3199 /** 3200 * DPDK callback to register a PCI device. 3201 * 3202 * This function spawns Ethernet devices out of a given PCI device. 3203 * 3204 * @param[in] pci_drv 3205 * PCI driver structure (mlx5_driver). 3206 * @param[in] pci_dev 3207 * PCI device information. 3208 * 3209 * @return 3210 * 0 on success, a negative errno value otherwise and rte_errno is set. 3211 */ 3212 static int 3213 mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused, 3214 struct rte_pci_device *pci_dev) 3215 { 3216 struct ibv_device **ibv_list; 3217 /* 3218 * Number of found IB Devices matching with requested PCI BDF. 3219 * nd != 1 means there are multiple IB devices over the same 3220 * PCI device and we have representors and master. 3221 */ 3222 unsigned int nd = 0; 3223 /* 3224 * Number of found IB device Ports. nd = 1 and np = 1..n means 3225 * we have the single multiport IB device, and there may be 3226 * representors attached to some of found ports. 3227 */ 3228 unsigned int np = 0; 3229 /* 3230 * Number of DPDK ethernet devices to Spawn - either over 3231 * multiple IB devices or multiple ports of single IB device. 3232 * Actually this is the number of iterations to spawn. 3233 */ 3234 unsigned int ns = 0; 3235 /* 3236 * Bonding device 3237 * < 0 - no bonding device (single one) 3238 * >= 0 - bonding device (value is slave PF index) 3239 */ 3240 int bd = -1; 3241 struct mlx5_dev_spawn_data *list = NULL; 3242 struct mlx5_dev_config dev_config; 3243 int ret; 3244 3245 if (mlx5_class_get(pci_dev->device.devargs) != MLX5_CLASS_NET) { 3246 DRV_LOG(DEBUG, "Skip probing - should be probed by other mlx5" 3247 " driver."); 3248 return 1; 3249 } 3250 if (rte_eal_process_type() == RTE_PROC_PRIMARY) 3251 mlx5_pmd_socket_init(); 3252 ret = mlx5_init_once(); 3253 if (ret) { 3254 DRV_LOG(ERR, "unable to init PMD global data: %s", 3255 strerror(rte_errno)); 3256 return -rte_errno; 3257 } 3258 MLX5_ASSERT(pci_drv == &mlx5_driver); 3259 errno = 0; 3260 ibv_list = mlx5_glue->get_device_list(&ret); 3261 if (!ibv_list) { 3262 rte_errno = errno ? errno : ENOSYS; 3263 DRV_LOG(ERR, "cannot list devices, is ib_uverbs loaded?"); 3264 return -rte_errno; 3265 } 3266 /* 3267 * First scan the list of all Infiniband devices to find 3268 * matching ones, gathering into the list. 3269 */ 3270 struct ibv_device *ibv_match[ret + 1]; 3271 int nl_route = mlx5_nl_init(NETLINK_ROUTE); 3272 int nl_rdma = mlx5_nl_init(NETLINK_RDMA); 3273 unsigned int i; 3274 3275 while (ret-- > 0) { 3276 struct rte_pci_addr pci_addr; 3277 3278 DRV_LOG(DEBUG, "checking device \"%s\"", ibv_list[ret]->name); 3279 bd = mlx5_device_bond_pci_match 3280 (ibv_list[ret], pci_dev, nl_rdma); 3281 if (bd >= 0) { 3282 /* 3283 * Bonding device detected. Only one match is allowed, 3284 * the bonding is supported over multi-port IB device, 3285 * there should be no matches on representor PCI 3286 * functions or non VF LAG bonding devices with 3287 * specified address. 
3288 */ 3289 if (nd) { 3290 DRV_LOG(ERR, 3291 "multiple PCI match on bonding device" 3292 "\"%s\" found", ibv_list[ret]->name); 3293 rte_errno = ENOENT; 3294 ret = -rte_errno; 3295 goto exit; 3296 } 3297 DRV_LOG(INFO, "PCI information matches for" 3298 " slave %d bonding device \"%s\"", 3299 bd, ibv_list[ret]->name); 3300 ibv_match[nd++] = ibv_list[ret]; 3301 break; 3302 } 3303 if (mlx5_dev_to_pci_addr 3304 (ibv_list[ret]->ibdev_path, &pci_addr)) 3305 continue; 3306 if (pci_dev->addr.domain != pci_addr.domain || 3307 pci_dev->addr.bus != pci_addr.bus || 3308 pci_dev->addr.devid != pci_addr.devid || 3309 pci_dev->addr.function != pci_addr.function) 3310 continue; 3311 DRV_LOG(INFO, "PCI information matches for device \"%s\"", 3312 ibv_list[ret]->name); 3313 ibv_match[nd++] = ibv_list[ret]; 3314 } 3315 ibv_match[nd] = NULL; 3316 if (!nd) { 3317 /* No device matches, just complain and bail out. */ 3318 DRV_LOG(WARNING, 3319 "no Verbs device matches PCI device " PCI_PRI_FMT "," 3320 " are kernel drivers loaded?", 3321 pci_dev->addr.domain, pci_dev->addr.bus, 3322 pci_dev->addr.devid, pci_dev->addr.function); 3323 rte_errno = ENOENT; 3324 ret = -rte_errno; 3325 goto exit; 3326 } 3327 if (nd == 1) { 3328 /* 3329 * Found single matching device may have multiple ports. 3330 * Each port may be representor, we have to check the port 3331 * number and check the representors existence. 3332 */ 3333 if (nl_rdma >= 0) 3334 np = mlx5_nl_portnum(nl_rdma, ibv_match[0]->name); 3335 if (!np) 3336 DRV_LOG(WARNING, "can not get IB device \"%s\"" 3337 " ports number", ibv_match[0]->name); 3338 if (bd >= 0 && !np) { 3339 DRV_LOG(ERR, "can not get ports" 3340 " for bonding device"); 3341 rte_errno = ENOENT; 3342 ret = -rte_errno; 3343 goto exit; 3344 } 3345 } 3346 #ifndef HAVE_MLX5DV_DR_DEVX_PORT 3347 if (bd >= 0) { 3348 /* 3349 * This may happen if there is VF LAG kernel support and 3350 * application is compiled with older rdma_core library. 3351 */ 3352 DRV_LOG(ERR, 3353 "No kernel/verbs support for VF LAG bonding found."); 3354 rte_errno = ENOTSUP; 3355 ret = -rte_errno; 3356 goto exit; 3357 } 3358 #endif 3359 /* 3360 * Now we can determine the maximal 3361 * amount of devices to be spawned. 3362 */ 3363 list = rte_zmalloc("device spawn data", 3364 sizeof(struct mlx5_dev_spawn_data) * 3365 (np ? np : nd), 3366 RTE_CACHE_LINE_SIZE); 3367 if (!list) { 3368 DRV_LOG(ERR, "spawn data array allocation failure"); 3369 rte_errno = ENOMEM; 3370 ret = -rte_errno; 3371 goto exit; 3372 } 3373 if (bd >= 0 || np > 1) { 3374 /* 3375 * Single IB device with multiple ports found, 3376 * it may be E-Switch master device and representors. 3377 * We have to perform identification through the ports. 3378 */ 3379 MLX5_ASSERT(nl_rdma >= 0); 3380 MLX5_ASSERT(ns == 0); 3381 MLX5_ASSERT(nd == 1); 3382 MLX5_ASSERT(np); 3383 for (i = 1; i <= np; ++i) { 3384 list[ns].max_port = np; 3385 list[ns].ibv_port = i; 3386 list[ns].ibv_dev = ibv_match[0]; 3387 list[ns].eth_dev = NULL; 3388 list[ns].pci_dev = pci_dev; 3389 list[ns].pf_bond = bd; 3390 list[ns].ifindex = mlx5_nl_ifindex 3391 (nl_rdma, list[ns].ibv_dev->name, i); 3392 if (!list[ns].ifindex) { 3393 /* 3394 * No network interface index found for the 3395 * specified port, it means there is no 3396 * representor on this port. It's OK, 3397 * there can be disabled ports, for example 3398 * if sriov_numvfs < sriov_totalvfs. 
3399 */ 3400 continue; 3401 } 3402 ret = -1; 3403 if (nl_route >= 0) 3404 ret = mlx5_nl_switch_info 3405 (nl_route, 3406 list[ns].ifindex, 3407 &list[ns].info); 3408 if (ret || (!list[ns].info.representor && 3409 !list[ns].info.master)) { 3410 /* 3411 * We failed to recognize representors with 3412 * Netlink, let's try to perform the task 3413 * with sysfs. 3414 */ 3415 ret = mlx5_sysfs_switch_info 3416 (list[ns].ifindex, 3417 &list[ns].info); 3418 } 3419 if (!ret && bd >= 0) { 3420 switch (list[ns].info.name_type) { 3421 case MLX5_PHYS_PORT_NAME_TYPE_UPLINK: 3422 if (list[ns].info.port_name == bd) 3423 ns++; 3424 break; 3425 case MLX5_PHYS_PORT_NAME_TYPE_PFVF: 3426 if (list[ns].info.pf_num == bd) 3427 ns++; 3428 break; 3429 default: 3430 break; 3431 } 3432 continue; 3433 } 3434 if (!ret && (list[ns].info.representor ^ 3435 list[ns].info.master)) 3436 ns++; 3437 } 3438 if (!ns) { 3439 DRV_LOG(ERR, 3440 "unable to recognize master/representors" 3441 " on the IB device with multiple ports"); 3442 rte_errno = ENOENT; 3443 ret = -rte_errno; 3444 goto exit; 3445 } 3446 } else { 3447 /* 3448 * The existence of several matching entries (nd > 1) means 3449 * port representors have been instantiated. No existing Verbs 3450 * call nor sysfs entries can tell them apart, this can only 3451 * be done through Netlink calls assuming kernel drivers are 3452 * recent enough to support them. 3453 * 3454 * In the event of identification failure through Netlink, 3455 * try again through sysfs, then: 3456 * 3457 * 1. A single IB device matches (nd == 1) with single 3458 * port (np=0/1) and is not a representor, assume 3459 * no switch support. 3460 * 3461 * 2. Otherwise no safe assumptions can be made; 3462 * complain louder and bail out. 3463 */ 3464 np = 1; 3465 for (i = 0; i != nd; ++i) { 3466 memset(&list[ns].info, 0, sizeof(list[ns].info)); 3467 list[ns].max_port = 1; 3468 list[ns].ibv_port = 1; 3469 list[ns].ibv_dev = ibv_match[i]; 3470 list[ns].eth_dev = NULL; 3471 list[ns].pci_dev = pci_dev; 3472 list[ns].pf_bond = -1; 3473 list[ns].ifindex = 0; 3474 if (nl_rdma >= 0) 3475 list[ns].ifindex = mlx5_nl_ifindex 3476 (nl_rdma, list[ns].ibv_dev->name, 1); 3477 if (!list[ns].ifindex) { 3478 char ifname[IF_NAMESIZE]; 3479 3480 /* 3481 * Netlink failed, it may happen with old 3482 * ib_core kernel driver (before 4.16). 3483 * We can assume there is old driver because 3484 * here we are processing single ports IB 3485 * devices. Let's try sysfs to retrieve 3486 * the ifindex. The method works for 3487 * master device only. 3488 */ 3489 if (nd > 1) { 3490 /* 3491 * Multiple devices found, assume 3492 * representors, can not distinguish 3493 * master/representor and retrieve 3494 * ifindex via sysfs. 3495 */ 3496 continue; 3497 } 3498 ret = mlx5_get_master_ifname 3499 (ibv_match[i]->ibdev_path, &ifname); 3500 if (!ret) 3501 list[ns].ifindex = 3502 if_nametoindex(ifname); 3503 if (!list[ns].ifindex) { 3504 /* 3505 * No network interface index found 3506 * for the specified device, it means 3507 * there it is neither representor 3508 * nor master. 3509 */ 3510 continue; 3511 } 3512 } 3513 ret = -1; 3514 if (nl_route >= 0) 3515 ret = mlx5_nl_switch_info 3516 (nl_route, 3517 list[ns].ifindex, 3518 &list[ns].info); 3519 if (ret || (!list[ns].info.representor && 3520 !list[ns].info.master)) { 3521 /* 3522 * We failed to recognize representors with 3523 * Netlink, let's try to perform the task 3524 * with sysfs. 
3525 */ 3526 ret = mlx5_sysfs_switch_info 3527 (list[ns].ifindex, 3528 &list[ns].info); 3529 } 3530 if (!ret && (list[ns].info.representor ^ 3531 list[ns].info.master)) { 3532 ns++; 3533 } else if ((nd == 1) && 3534 !list[ns].info.representor && 3535 !list[ns].info.master) { 3536 /* 3537 * Single IB device with 3538 * one physical port and 3539 * attached network device. 3540 * May be SRIOV is not enabled 3541 * or there is no representors. 3542 */ 3543 DRV_LOG(INFO, "no E-Switch support detected"); 3544 ns++; 3545 break; 3546 } 3547 } 3548 if (!ns) { 3549 DRV_LOG(ERR, 3550 "unable to recognize master/representors" 3551 " on the multiple IB devices"); 3552 rte_errno = ENOENT; 3553 ret = -rte_errno; 3554 goto exit; 3555 } 3556 } 3557 MLX5_ASSERT(ns); 3558 /* 3559 * Sort list to probe devices in natural order for users convenience 3560 * (i.e. master first, then representors from lowest to highest ID). 3561 */ 3562 qsort(list, ns, sizeof(*list), mlx5_dev_spawn_data_cmp); 3563 /* Default configuration. */ 3564 dev_config = (struct mlx5_dev_config){ 3565 .hw_padding = 0, 3566 .mps = MLX5_ARG_UNSET, 3567 .dbnc = MLX5_ARG_UNSET, 3568 .rx_vec_en = 1, 3569 .txq_inline_max = MLX5_ARG_UNSET, 3570 .txq_inline_min = MLX5_ARG_UNSET, 3571 .txq_inline_mpw = MLX5_ARG_UNSET, 3572 .txqs_inline = MLX5_ARG_UNSET, 3573 .vf_nl_en = 1, 3574 .mr_ext_memseg_en = 1, 3575 .mprq = { 3576 .enabled = 0, /* Disabled by default. */ 3577 .stride_num_n = 0, 3578 .stride_size_n = 0, 3579 .max_memcpy_len = MLX5_MPRQ_MEMCPY_DEFAULT_LEN, 3580 .min_rxqs_num = MLX5_MPRQ_MIN_RXQS, 3581 }, 3582 .dv_esw_en = 1, 3583 .dv_flow_en = 1, 3584 .log_hp_size = MLX5_ARG_UNSET, 3585 }; 3586 /* Device specific configuration. */ 3587 switch (pci_dev->id.device_id) { 3588 case PCI_DEVICE_ID_MELLANOX_CONNECTX4VF: 3589 case PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF: 3590 case PCI_DEVICE_ID_MELLANOX_CONNECTX5VF: 3591 case PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF: 3592 case PCI_DEVICE_ID_MELLANOX_CONNECTX5BFVF: 3593 case PCI_DEVICE_ID_MELLANOX_CONNECTX6VF: 3594 case PCI_DEVICE_ID_MELLANOX_CONNECTX6DXVF: 3595 dev_config.vf = 1; 3596 break; 3597 default: 3598 break; 3599 } 3600 for (i = 0; i != ns; ++i) { 3601 uint32_t restore; 3602 3603 list[i].eth_dev = mlx5_dev_spawn(&pci_dev->device, 3604 &list[i], 3605 dev_config); 3606 if (!list[i].eth_dev) { 3607 if (rte_errno != EBUSY && rte_errno != EEXIST) 3608 break; 3609 /* Device is disabled or already spawned. Ignore it. */ 3610 continue; 3611 } 3612 restore = list[i].eth_dev->data->dev_flags; 3613 rte_eth_copy_pci_info(list[i].eth_dev, pci_dev); 3614 /* Restore non-PCI flags cleared by the above call. */ 3615 list[i].eth_dev->data->dev_flags |= restore; 3616 mlx5_dev_interrupt_handler_devx_install(list[i].eth_dev); 3617 rte_eth_dev_probing_finish(list[i].eth_dev); 3618 } 3619 if (i != ns) { 3620 DRV_LOG(ERR, 3621 "probe of PCI device " PCI_PRI_FMT " aborted after" 3622 " encountering an error: %s", 3623 pci_dev->addr.domain, pci_dev->addr.bus, 3624 pci_dev->addr.devid, pci_dev->addr.function, 3625 strerror(rte_errno)); 3626 ret = -rte_errno; 3627 /* Roll back. */ 3628 while (i--) { 3629 if (!list[i].eth_dev) 3630 continue; 3631 mlx5_dev_close(list[i].eth_dev); 3632 /* mac_addrs must not be freed because in dev_private */ 3633 list[i].eth_dev->data->mac_addrs = NULL; 3634 claim_zero(rte_eth_dev_release_port(list[i].eth_dev)); 3635 } 3636 /* Restore original error. 
	if (i != ns) {
		DRV_LOG(ERR,
			"probe of PCI device " PCI_PRI_FMT " aborted after"
			" encountering an error: %s",
			pci_dev->addr.domain, pci_dev->addr.bus,
			pci_dev->addr.devid, pci_dev->addr.function,
			strerror(rte_errno));
		ret = -rte_errno;
		/* Roll back. */
		while (i--) {
			if (!list[i].eth_dev)
				continue;
			mlx5_dev_close(list[i].eth_dev);
			/* mac_addrs must not be freed, it is part of dev_private. */
			list[i].eth_dev->data->mac_addrs = NULL;
			claim_zero(rte_eth_dev_release_port(list[i].eth_dev));
		}
		/* Restore original error. */
		rte_errno = -ret;
	} else {
		ret = 0;
	}
exit:
	/*
	 * Do the routine cleanup:
	 * - close opened Netlink sockets
	 * - free allocated spawn data array
	 * - free the Infiniband device list
	 */
	if (nl_rdma >= 0)
		close(nl_rdma);
	if (nl_route >= 0)
		close(nl_route);
	if (list)
		rte_free(list);
	MLX5_ASSERT(ibv_list);
	mlx5_glue->free_device_list(ibv_list);
	return ret;
}

/**
 * Look for the Ethernet device belonging to the mlx5 driver.
 *
 * @param[in] port_id
 *   port_id to start looking for the device.
 * @param[in] pci_dev
 *   Pointer to the hint PCI device. While a device is being probed,
 *   its siblings (the master and preceding representors) might not
 *   have a driver assigned yet, because mlx5_pci_probe() has not
 *   completed; in this case matching on the hint PCI device is used
 *   to detect sibling devices.
 *
 * @return
 *   port_id of the found device, RTE_MAX_ETHPORTS if not found.
 */
uint16_t
mlx5_eth_find_next(uint16_t port_id, struct rte_pci_device *pci_dev)
{
	while (port_id < RTE_MAX_ETHPORTS) {
		struct rte_eth_dev *dev = &rte_eth_devices[port_id];

		if (dev->state != RTE_ETH_DEV_UNUSED &&
		    dev->device &&
		    (dev->device == &pci_dev->device ||
		     (dev->device->driver &&
		      dev->device->driver->name &&
		      !strcmp(dev->device->driver->name, MLX5_DRIVER_NAME))))
			break;
		port_id++;
	}
	if (port_id >= RTE_MAX_ETHPORTS)
		return RTE_MAX_ETHPORTS;
	return port_id;
}
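/*
 * Illustrative usage sketch (not part of the driver flow): iterating over
 * every mlx5 port that is a sibling of a given PCI device with the helper
 * above. The handle_port() callee is hypothetical.
 *
 *	uint16_t port_id;
 *
 *	for (port_id = mlx5_eth_find_next(0, pci_dev);
 *	     port_id < RTE_MAX_ETHPORTS;
 *	     port_id = mlx5_eth_find_next(port_id + 1, pci_dev))
 *		handle_port(&rte_eth_devices[port_id]);
 */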
/**
 * DPDK callback to remove a PCI device.
 *
 * This function removes all Ethernet devices belonging to a given PCI device.
 *
 * @param[in] pci_dev
 *   Pointer to the PCI device.
 *
 * @return
 *   0 on success, the function cannot fail.
 */
static int
mlx5_pci_remove(struct rte_pci_device *pci_dev)
{
	uint16_t port_id;

	RTE_ETH_FOREACH_DEV_OF(port_id, &pci_dev->device)
		rte_eth_dev_close(port_id);
	return 0;
}

static const struct rte_pci_id mlx5_pci_id_map[] = {
	{
		RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
			       PCI_DEVICE_ID_MELLANOX_CONNECTX4)
	},
	{
		RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
			       PCI_DEVICE_ID_MELLANOX_CONNECTX4VF)
	},
	{
		RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
			       PCI_DEVICE_ID_MELLANOX_CONNECTX4LX)
	},
	{
		RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
			       PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF)
	},
	{
		RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
			       PCI_DEVICE_ID_MELLANOX_CONNECTX5)
	},
	{
		RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
			       PCI_DEVICE_ID_MELLANOX_CONNECTX5VF)
	},
	{
		RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
			       PCI_DEVICE_ID_MELLANOX_CONNECTX5EX)
	},
	{
		RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
			       PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF)
	},
	{
		RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
			       PCI_DEVICE_ID_MELLANOX_CONNECTX5BF)
	},
	{
		RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
			       PCI_DEVICE_ID_MELLANOX_CONNECTX5BFVF)
	},
	{
		RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
			       PCI_DEVICE_ID_MELLANOX_CONNECTX6)
	},
	{
		RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
			       PCI_DEVICE_ID_MELLANOX_CONNECTX6VF)
	},
	{
		RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
			       PCI_DEVICE_ID_MELLANOX_CONNECTX6DX)
	},
	{
		RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
			       PCI_DEVICE_ID_MELLANOX_CONNECTX6DXVF)
	},
	{
		RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
			       PCI_DEVICE_ID_MELLANOX_CONNECTX6DXBF)
	},
	{
		.vendor_id = 0
	}
};

static struct rte_pci_driver mlx5_driver = {
	.driver = {
		.name = MLX5_DRIVER_NAME
	},
	.id_table = mlx5_pci_id_map,
	.probe = mlx5_pci_probe,
	.remove = mlx5_pci_remove,
	.dma_map = mlx5_dma_map,
	.dma_unmap = mlx5_dma_unmap,
	.drv_flags = RTE_PCI_DRV_INTR_LSC | RTE_PCI_DRV_INTR_RMV |
		     RTE_PCI_DRV_PROBE_AGAIN,
};

/**
 * Driver initialization routine.
 */
RTE_INIT(rte_mlx5_pmd_init)
{
	/* Initialize driver log type. */
	mlx5_logtype = rte_log_register("pmd.net.mlx5");
	if (mlx5_logtype >= 0)
		rte_log_set_level(mlx5_logtype, RTE_LOG_NOTICE);

	/* Build the static tables for Verbs conversion. */
	mlx5_set_ptype_table();
	mlx5_set_cksum_table();
	mlx5_set_swp_types_table();
	if (mlx5_glue)
		rte_pci_register(&mlx5_driver);
}

RTE_PMD_EXPORT_NAME(net_mlx5, __COUNTER__);
RTE_PMD_REGISTER_PCI_TABLE(net_mlx5, mlx5_pci_id_map);
RTE_PMD_REGISTER_KMOD_DEP(net_mlx5, "* ib_uverbs & mlx5_core & mlx5_ib");
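/*
 * Note on the kernel module dependency declared above: probing expects the
 * listed upstream modules to be loaded beforehand. A minimal sketch (exact
 * module packaging may differ per distribution and OFED installation):
 *
 *	modprobe -a ib_uverbs mlx5_core mlx5_ib
 */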