1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright 2015 6WIND S.A. 3 * Copyright 2015 Mellanox Technologies, Ltd 4 */ 5 6 #include <stddef.h> 7 #include <unistd.h> 8 #include <string.h> 9 #include <assert.h> 10 #include <dlfcn.h> 11 #include <stdint.h> 12 #include <stdlib.h> 13 #include <errno.h> 14 #include <net/if.h> 15 #include <sys/mman.h> 16 #include <linux/rtnetlink.h> 17 18 /* Verbs header. */ 19 /* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */ 20 #ifdef PEDANTIC 21 #pragma GCC diagnostic ignored "-Wpedantic" 22 #endif 23 #include <infiniband/verbs.h> 24 #ifdef PEDANTIC 25 #pragma GCC diagnostic error "-Wpedantic" 26 #endif 27 28 #include <rte_malloc.h> 29 #include <rte_ethdev_driver.h> 30 #include <rte_ethdev_pci.h> 31 #include <rte_pci.h> 32 #include <rte_bus_pci.h> 33 #include <rte_common.h> 34 #include <rte_config.h> 35 #include <rte_kvargs.h> 36 #include <rte_rwlock.h> 37 #include <rte_spinlock.h> 38 #include <rte_string_fns.h> 39 #include <rte_alarm.h> 40 41 #include "mlx5.h" 42 #include "mlx5_utils.h" 43 #include "mlx5_rxtx.h" 44 #include "mlx5_autoconf.h" 45 #include "mlx5_defs.h" 46 #include "mlx5_glue.h" 47 #include "mlx5_mr.h" 48 #include "mlx5_flow.h" 49 50 /* Device parameter to enable RX completion queue compression. */ 51 #define MLX5_RXQ_CQE_COMP_EN "rxq_cqe_comp_en" 52 53 /* Device parameter to enable RX completion entry padding to 128B. */ 54 #define MLX5_RXQ_CQE_PAD_EN "rxq_cqe_pad_en" 55 56 /* Device parameter to enable padding Rx packet to cacheline size. */ 57 #define MLX5_RXQ_PKT_PAD_EN "rxq_pkt_pad_en" 58 59 /* Device parameter to enable Multi-Packet Rx queue. */ 60 #define MLX5_RX_MPRQ_EN "mprq_en" 61 62 /* Device parameter to configure log 2 of the number of strides for MPRQ. */ 63 #define MLX5_RX_MPRQ_LOG_STRIDE_NUM "mprq_log_stride_num" 64 65 /* Device parameter to limit the size of memcpy'd packet for MPRQ. */ 66 #define MLX5_RX_MPRQ_MAX_MEMCPY_LEN "mprq_max_memcpy_len" 67 68 /* Device parameter to set the minimum number of Rx queues to enable MPRQ. */ 69 #define MLX5_RXQS_MIN_MPRQ "rxqs_min_mprq" 70 71 /* Device parameter to configure inline send. Deprecated, ignored.*/ 72 #define MLX5_TXQ_INLINE "txq_inline" 73 74 /* Device parameter to limit packet size to inline with ordinary SEND. */ 75 #define MLX5_TXQ_INLINE_MAX "txq_inline_max" 76 77 /* Device parameter to configure minimal data size to inline. */ 78 #define MLX5_TXQ_INLINE_MIN "txq_inline_min" 79 80 /* Device parameter to limit packet size to inline with Enhanced MPW. */ 81 #define MLX5_TXQ_INLINE_MPW "txq_inline_mpw" 82 83 /* 84 * Device parameter to configure the number of TX queues threshold for 85 * enabling inline send. 86 */ 87 #define MLX5_TXQS_MIN_INLINE "txqs_min_inline" 88 89 /* 90 * Device parameter to configure the number of TX queues threshold for 91 * enabling vectorized Tx, deprecated, ignored (no vectorized Tx routines). 92 */ 93 #define MLX5_TXQS_MAX_VEC "txqs_max_vec" 94 95 /* Device parameter to enable multi-packet send WQEs. */ 96 #define MLX5_TXQ_MPW_EN "txq_mpw_en" 97 98 /* 99 * Device parameter to include 2 dsegs in the title WQEBB. 100 * Deprecated, ignored. 101 */ 102 #define MLX5_TXQ_MPW_HDR_DSEG_EN "txq_mpw_hdr_dseg_en" 103 104 /* 105 * Device parameter to limit the size of inlining packet. 106 * Deprecated, ignored. 107 */ 108 #define MLX5_TXQ_MAX_INLINE_LEN "txq_max_inline_len" 109 110 /* 111 * Device parameter to enable hardware Tx vector. 112 * Deprecated, ignored (no vectorized Tx routines anymore). 
113 */ 114 #define MLX5_TX_VEC_EN "tx_vec_en" 115 116 /* Device parameter to enable hardware Rx vector. */ 117 #define MLX5_RX_VEC_EN "rx_vec_en" 118 119 /* Allow L3 VXLAN flow creation. */ 120 #define MLX5_L3_VXLAN_EN "l3_vxlan_en" 121 122 /* Activate DV E-Switch flow steering. */ 123 #define MLX5_DV_ESW_EN "dv_esw_en" 124 125 /* Activate DV flow steering. */ 126 #define MLX5_DV_FLOW_EN "dv_flow_en" 127 128 /* Activate Netlink support in VF mode. */ 129 #define MLX5_VF_NL_EN "vf_nl_en" 130 131 /* Enable extending memsegs when creating a MR. */ 132 #define MLX5_MR_EXT_MEMSEG_EN "mr_ext_memseg_en" 133 134 /* Select port representors to instantiate. */ 135 #define MLX5_REPRESENTOR "representor" 136 137 /* Device parameter to configure the maximum number of dump files per queue. */ 138 #define MLX5_MAX_DUMP_FILES_NUM "max_dump_files_num" 139 140 /* Configure timeout of LRO session (in microseconds). */ 141 #define MLX5_LRO_TIMEOUT_USEC "lro_timeout_usec" 142 143 #ifndef HAVE_IBV_MLX5_MOD_MPW 144 #define MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED (1 << 2) 145 #define MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW (1 << 3) 146 #endif 147 148 #ifndef HAVE_IBV_MLX5_MOD_CQE_128B_COMP 149 #define MLX5DV_CONTEXT_FLAGS_CQE_128B_COMP (1 << 4) 150 #endif 151 152 static const char *MZ_MLX5_PMD_SHARED_DATA = "mlx5_pmd_shared_data"; 153 154 /* Shared memory between primary and secondary processes. */ 155 struct mlx5_shared_data *mlx5_shared_data; 156 157 /* Spinlock for mlx5_shared_data allocation. */ 158 static rte_spinlock_t mlx5_shared_data_lock = RTE_SPINLOCK_INITIALIZER; 159 160 /* Process local data for secondary processes. */ 161 static struct mlx5_local_data mlx5_local_data; 162 163 /** Driver-specific log messages type. */ 164 int mlx5_logtype; 165 166 /** Data associated with devices to spawn. */ 167 struct mlx5_dev_spawn_data { 168 uint32_t ifindex; /**< Network interface index. */ 169 uint32_t max_port; /**< IB device maximal port index. */ 170 uint32_t ibv_port; /**< IB device physical port index. */ 171 struct mlx5_switch_info info; /**< Switch information. */ 172 struct ibv_device *ibv_dev; /**< Associated IB device. */ 173 struct rte_eth_dev *eth_dev; /**< Associated Ethernet device. */ 174 struct rte_pci_device *pci_dev; /**< Backend PCI device. */ 175 }; 176 177 static LIST_HEAD(, mlx5_ibv_shared) mlx5_ibv_list = LIST_HEAD_INITIALIZER(); 178 static pthread_mutex_t mlx5_ibv_list_mutex = PTHREAD_MUTEX_INITIALIZER; 179 180 /** 181 * Initialize the counters management structure. 182 * 183 * @param[in] sh 184 * Pointer to mlx5_ibv_shared object to free 185 */ 186 static void 187 mlx5_flow_counters_mng_init(struct mlx5_ibv_shared *sh) 188 { 189 uint8_t i; 190 191 TAILQ_INIT(&sh->cmng.flow_counters); 192 for (i = 0; i < RTE_DIM(sh->cmng.ccont); ++i) 193 TAILQ_INIT(&sh->cmng.ccont[i].pool_list); 194 } 195 196 /** 197 * Destroy all the resources allocated for a counter memory management. 198 * 199 * @param[in] mng 200 * Pointer to the memory management structure. 201 */ 202 static void 203 mlx5_flow_destroy_counter_stat_mem_mng(struct mlx5_counter_stats_mem_mng *mng) 204 { 205 uint8_t *mem = (uint8_t *)(uintptr_t)mng->raws[0].data; 206 207 LIST_REMOVE(mng, next); 208 claim_zero(mlx5_devx_cmd_destroy(mng->dm)); 209 claim_zero(mlx5_glue->devx_umem_dereg(mng->umem)); 210 rte_free(mem); 211 } 212 213 /** 214 * Close and release all the resources of the counters management. 215 * 216 * @param[in] sh 217 * Pointer to mlx5_ibv_shared object to free. 
218 */ 219 static void 220 mlx5_flow_counters_mng_close(struct mlx5_ibv_shared *sh) 221 { 222 struct mlx5_counter_stats_mem_mng *mng; 223 uint8_t i; 224 int j; 225 int retries = 1024; 226 227 rte_errno = 0; 228 while (--retries) { 229 rte_eal_alarm_cancel(mlx5_flow_query_alarm, sh); 230 if (rte_errno != EINPROGRESS) 231 break; 232 rte_pause(); 233 } 234 for (i = 0; i < RTE_DIM(sh->cmng.ccont); ++i) { 235 struct mlx5_flow_counter_pool *pool; 236 uint32_t batch = !!(i % 2); 237 238 if (!sh->cmng.ccont[i].pools) 239 continue; 240 pool = TAILQ_FIRST(&sh->cmng.ccont[i].pool_list); 241 while (pool) { 242 if (batch) { 243 if (pool->min_dcs) 244 claim_zero 245 (mlx5_devx_cmd_destroy(pool->min_dcs)); 246 } 247 for (j = 0; j < MLX5_COUNTERS_PER_POOL; ++j) { 248 if (pool->counters_raw[j].action) 249 claim_zero 250 (mlx5_glue->destroy_flow_action 251 (pool->counters_raw[j].action)); 252 if (!batch && pool->counters_raw[j].dcs) 253 claim_zero(mlx5_devx_cmd_destroy 254 (pool->counters_raw[j].dcs)); 255 } 256 TAILQ_REMOVE(&sh->cmng.ccont[i].pool_list, pool, 257 next); 258 rte_free(pool); 259 pool = TAILQ_FIRST(&sh->cmng.ccont[i].pool_list); 260 } 261 rte_free(sh->cmng.ccont[i].pools); 262 } 263 mng = LIST_FIRST(&sh->cmng.mem_mngs); 264 while (mng) { 265 mlx5_flow_destroy_counter_stat_mem_mng(mng); 266 mng = LIST_FIRST(&sh->cmng.mem_mngs); 267 } 268 memset(&sh->cmng, 0, sizeof(sh->cmng)); 269 } 270 271 /** 272 * Extract pdn of PD object using DV API. 273 * 274 * @param[in] pd 275 * Pointer to the verbs PD object. 276 * @param[out] pdn 277 * Pointer to the PD object number variable. 278 * 279 * @return 280 * 0 on success, error value otherwise. 281 */ 282 #ifdef HAVE_IBV_FLOW_DV_SUPPORT 283 static int 284 mlx5_get_pdn(struct ibv_pd *pd __rte_unused, uint32_t *pdn __rte_unused) 285 { 286 struct mlx5dv_obj obj; 287 struct mlx5dv_pd pd_info; 288 int ret = 0; 289 290 obj.pd.in = pd; 291 obj.pd.out = &pd_info; 292 ret = mlx5_glue->dv_init_obj(&obj, MLX5DV_OBJ_PD); 293 if (ret) { 294 DRV_LOG(DEBUG, "Fail to get PD object info"); 295 return ret; 296 } 297 *pdn = pd_info.pdn; 298 return 0; 299 } 300 #endif /* HAVE_IBV_FLOW_DV_SUPPORT */ 301 302 /** 303 * Allocate shared IB device context. If there is multiport device the 304 * master and representors will share this context, if there is single 305 * port dedicated IB device, the context will be used by only given 306 * port due to unification. 307 * 308 * Routine first searches the context for the specified IB device name, 309 * if found the shared context assumed and reference counter is incremented. 310 * If no context found the new one is created and initialized with specified 311 * IB device context and parameters. 312 * 313 * @param[in] spawn 314 * Pointer to the IB device attributes (name, port, etc). 315 * 316 * @return 317 * Pointer to mlx5_ibv_shared object on success, 318 * otherwise NULL and rte_errno is set. 319 */ 320 static struct mlx5_ibv_shared * 321 mlx5_alloc_shared_ibctx(const struct mlx5_dev_spawn_data *spawn) 322 { 323 struct mlx5_ibv_shared *sh; 324 int err = 0; 325 uint32_t i; 326 327 assert(spawn); 328 /* Secondary process should not create the shared context. */ 329 assert(rte_eal_process_type() == RTE_PROC_PRIMARY); 330 pthread_mutex_lock(&mlx5_ibv_list_mutex); 331 /* Search for IB context by device name. */ 332 LIST_FOREACH(sh, &mlx5_ibv_list, next) { 333 if (!strcmp(sh->ibdev_name, spawn->ibv_dev->name)) { 334 sh->refcnt++; 335 goto exit; 336 } 337 } 338 /* No device found, we have to create new shared context. 
*/ 339 assert(spawn->max_port); 340 sh = rte_zmalloc("ethdev shared ib context", 341 sizeof(struct mlx5_ibv_shared) + 342 spawn->max_port * 343 sizeof(struct mlx5_ibv_shared_port), 344 RTE_CACHE_LINE_SIZE); 345 if (!sh) { 346 DRV_LOG(ERR, "shared context allocation failure"); 347 rte_errno = ENOMEM; 348 goto exit; 349 } 350 /* Try to open IB device with DV first, then usual Verbs. */ 351 errno = 0; 352 sh->ctx = mlx5_glue->dv_open_device(spawn->ibv_dev); 353 if (sh->ctx) { 354 sh->devx = 1; 355 DRV_LOG(DEBUG, "DevX is supported"); 356 } else { 357 sh->ctx = mlx5_glue->open_device(spawn->ibv_dev); 358 if (!sh->ctx) { 359 err = errno ? errno : ENODEV; 360 goto error; 361 } 362 DRV_LOG(DEBUG, "DevX is NOT supported"); 363 } 364 err = mlx5_glue->query_device_ex(sh->ctx, NULL, &sh->device_attr); 365 if (err) { 366 DRV_LOG(DEBUG, "ibv_query_device_ex() failed"); 367 goto error; 368 } 369 sh->refcnt = 1; 370 sh->max_port = spawn->max_port; 371 strncpy(sh->ibdev_name, sh->ctx->device->name, 372 sizeof(sh->ibdev_name)); 373 strncpy(sh->ibdev_path, sh->ctx->device->ibdev_path, 374 sizeof(sh->ibdev_path)); 375 sh->pci_dev = spawn->pci_dev; 376 pthread_mutex_init(&sh->intr_mutex, NULL); 377 /* 378 * Setting port_id to max unallowed value means 379 * there is no interrupt subhandler installed for 380 * the given port index i. 381 */ 382 for (i = 0; i < sh->max_port; i++) 383 sh->port[i].ih_port_id = RTE_MAX_ETHPORTS; 384 sh->pd = mlx5_glue->alloc_pd(sh->ctx); 385 if (sh->pd == NULL) { 386 DRV_LOG(ERR, "PD allocation failure"); 387 err = ENOMEM; 388 goto error; 389 } 390 #ifdef HAVE_IBV_FLOW_DV_SUPPORT 391 err = mlx5_get_pdn(sh->pd, &sh->pdn); 392 if (err) { 393 DRV_LOG(ERR, "Fail to extract pdn from PD"); 394 goto error; 395 } 396 #endif /* HAVE_IBV_FLOW_DV_SUPPORT */ 397 /* 398 * Once the device is added to the list of memory event 399 * callback, its global MR cache table cannot be expanded 400 * on the fly because of deadlock. If it overflows, lookup 401 * should be done by searching MR list linearly, which is slow. 402 * 403 * At this point the device is not added to the memory 404 * event list yet, context is just being created. 405 */ 406 err = mlx5_mr_btree_init(&sh->mr.cache, 407 MLX5_MR_BTREE_CACHE_N * 2, 408 sh->pci_dev->device.numa_node); 409 if (err) { 410 err = rte_errno; 411 goto error; 412 } 413 mlx5_flow_counters_mng_init(sh); 414 /* Add device to memory callback list. */ 415 rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock); 416 LIST_INSERT_HEAD(&mlx5_shared_data->mem_event_cb_list, 417 sh, mem_event_cb); 418 rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock); 419 /* Add context to the global device list. */ 420 LIST_INSERT_HEAD(&mlx5_ibv_list, sh, next); 421 exit: 422 pthread_mutex_unlock(&mlx5_ibv_list_mutex); 423 return sh; 424 error: 425 pthread_mutex_unlock(&mlx5_ibv_list_mutex); 426 assert(sh); 427 if (sh->pd) 428 claim_zero(mlx5_glue->dealloc_pd(sh->pd)); 429 if (sh->ctx) 430 claim_zero(mlx5_glue->close_device(sh->ctx)); 431 rte_free(sh); 432 assert(err > 0); 433 rte_errno = err; 434 return NULL; 435 } 436 437 /** 438 * Free shared IB device context. Decrement counter and if zero free 439 * all allocated resources and close handles. 440 * 441 * @param[in] sh 442 * Pointer to mlx5_ibv_shared object to free 443 */ 444 static void 445 mlx5_free_shared_ibctx(struct mlx5_ibv_shared *sh) 446 { 447 pthread_mutex_lock(&mlx5_ibv_list_mutex); 448 #ifndef NDEBUG 449 /* Check the object presence in the list. 
*/ 450 struct mlx5_ibv_shared *lctx; 451 452 LIST_FOREACH(lctx, &mlx5_ibv_list, next) 453 if (lctx == sh) 454 break; 455 assert(lctx); 456 if (lctx != sh) { 457 DRV_LOG(ERR, "Freeing non-existing shared IB context"); 458 goto exit; 459 } 460 #endif 461 assert(sh); 462 assert(sh->refcnt); 463 /* Secondary process should not free the shared context. */ 464 assert(rte_eal_process_type() == RTE_PROC_PRIMARY); 465 if (--sh->refcnt) 466 goto exit; 467 /* Release created Memory Regions. */ 468 mlx5_mr_release(sh); 469 /* Remove from memory callback device list. */ 470 rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock); 471 LIST_REMOVE(sh, mem_event_cb); 472 rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock); 473 /* Remove context from the global device list. */ 474 LIST_REMOVE(sh, next); 475 /* 476 * Ensure there is no async event handler installed. 477 * Only primary process handles async device events. 478 **/ 479 mlx5_flow_counters_mng_close(sh); 480 assert(!sh->intr_cnt); 481 if (sh->intr_cnt) 482 mlx5_intr_callback_unregister 483 (&sh->intr_handle, mlx5_dev_interrupt_handler, sh); 484 pthread_mutex_destroy(&sh->intr_mutex); 485 if (sh->pd) 486 claim_zero(mlx5_glue->dealloc_pd(sh->pd)); 487 if (sh->ctx) 488 claim_zero(mlx5_glue->close_device(sh->ctx)); 489 rte_free(sh); 490 exit: 491 pthread_mutex_unlock(&mlx5_ibv_list_mutex); 492 } 493 494 /** 495 * Initialize DR related data within private structure. 496 * Routine checks the reference counter and does actual 497 * resources creation/initialization only if counter is zero. 498 * 499 * @param[in] priv 500 * Pointer to the private device data structure. 501 * 502 * @return 503 * Zero on success, positive error code otherwise. 504 */ 505 static int 506 mlx5_alloc_shared_dr(struct mlx5_priv *priv) 507 { 508 #ifdef HAVE_MLX5DV_DR 509 struct mlx5_ibv_shared *sh = priv->sh; 510 int err = 0; 511 void *domain; 512 513 assert(sh); 514 if (sh->dv_refcnt) { 515 /* Shared DV/DR structures is already initialized. */ 516 sh->dv_refcnt++; 517 priv->dr_shared = 1; 518 return 0; 519 } 520 /* Reference counter is zero, we should initialize structures. */ 521 domain = mlx5_glue->dr_create_domain(sh->ctx, 522 MLX5DV_DR_DOMAIN_TYPE_NIC_RX); 523 if (!domain) { 524 DRV_LOG(ERR, "ingress mlx5dv_dr_create_domain failed"); 525 err = errno; 526 goto error; 527 } 528 sh->rx_domain = domain; 529 domain = mlx5_glue->dr_create_domain(sh->ctx, 530 MLX5DV_DR_DOMAIN_TYPE_NIC_TX); 531 if (!domain) { 532 DRV_LOG(ERR, "egress mlx5dv_dr_create_domain failed"); 533 err = errno; 534 goto error; 535 } 536 pthread_mutex_init(&sh->dv_mutex, NULL); 537 sh->tx_domain = domain; 538 #ifdef HAVE_MLX5DV_DR_ESWITCH 539 if (priv->config.dv_esw_en) { 540 domain = mlx5_glue->dr_create_domain 541 (sh->ctx, MLX5DV_DR_DOMAIN_TYPE_FDB); 542 if (!domain) { 543 DRV_LOG(ERR, "FDB mlx5dv_dr_create_domain failed"); 544 err = errno; 545 goto error; 546 } 547 sh->fdb_domain = domain; 548 sh->esw_drop_action = mlx5_glue->dr_create_flow_action_drop(); 549 } 550 #endif 551 sh->pop_vlan_action = mlx5_glue->dr_create_flow_action_pop_vlan(); 552 sh->dv_refcnt++; 553 priv->dr_shared = 1; 554 return 0; 555 556 error: 557 /* Rollback the created objects. 
*/ 558 if (sh->rx_domain) { 559 mlx5_glue->dr_destroy_domain(sh->rx_domain); 560 sh->rx_domain = NULL; 561 } 562 if (sh->tx_domain) { 563 mlx5_glue->dr_destroy_domain(sh->tx_domain); 564 sh->tx_domain = NULL; 565 } 566 if (sh->fdb_domain) { 567 mlx5_glue->dr_destroy_domain(sh->fdb_domain); 568 sh->fdb_domain = NULL; 569 } 570 if (sh->esw_drop_action) { 571 mlx5_glue->destroy_flow_action(sh->esw_drop_action); 572 sh->esw_drop_action = NULL; 573 } 574 if (sh->pop_vlan_action) { 575 mlx5_glue->destroy_flow_action(sh->pop_vlan_action); 576 sh->pop_vlan_action = NULL; 577 } 578 return err; 579 #else 580 (void)priv; 581 return 0; 582 #endif 583 } 584 585 /** 586 * Destroy DR related data within private structure. 587 * 588 * @param[in] priv 589 * Pointer to the private device data structure. 590 */ 591 static void 592 mlx5_free_shared_dr(struct mlx5_priv *priv) 593 { 594 #ifdef HAVE_MLX5DV_DR 595 struct mlx5_ibv_shared *sh; 596 597 if (!priv->dr_shared) 598 return; 599 priv->dr_shared = 0; 600 sh = priv->sh; 601 assert(sh); 602 assert(sh->dv_refcnt); 603 if (sh->dv_refcnt && --sh->dv_refcnt) 604 return; 605 if (sh->rx_domain) { 606 mlx5_glue->dr_destroy_domain(sh->rx_domain); 607 sh->rx_domain = NULL; 608 } 609 if (sh->tx_domain) { 610 mlx5_glue->dr_destroy_domain(sh->tx_domain); 611 sh->tx_domain = NULL; 612 } 613 #ifdef HAVE_MLX5DV_DR_ESWITCH 614 if (sh->fdb_domain) { 615 mlx5_glue->dr_destroy_domain(sh->fdb_domain); 616 sh->fdb_domain = NULL; 617 } 618 if (sh->esw_drop_action) { 619 mlx5_glue->destroy_flow_action(sh->esw_drop_action); 620 sh->esw_drop_action = NULL; 621 } 622 #endif 623 if (sh->pop_vlan_action) { 624 mlx5_glue->destroy_flow_action(sh->pop_vlan_action); 625 sh->pop_vlan_action = NULL; 626 } 627 pthread_mutex_destroy(&sh->dv_mutex); 628 #else 629 (void)priv; 630 #endif 631 } 632 633 /** 634 * Initialize shared data between primary and secondary process. 635 * 636 * A memzone is reserved by primary process and secondary processes attach to 637 * the memzone. 638 * 639 * @return 640 * 0 on success, a negative errno value otherwise and rte_errno is set. 641 */ 642 static int 643 mlx5_init_shared_data(void) 644 { 645 const struct rte_memzone *mz; 646 int ret = 0; 647 648 rte_spinlock_lock(&mlx5_shared_data_lock); 649 if (mlx5_shared_data == NULL) { 650 if (rte_eal_process_type() == RTE_PROC_PRIMARY) { 651 /* Allocate shared memory. */ 652 mz = rte_memzone_reserve(MZ_MLX5_PMD_SHARED_DATA, 653 sizeof(*mlx5_shared_data), 654 SOCKET_ID_ANY, 0); 655 if (mz == NULL) { 656 DRV_LOG(ERR, 657 "Cannot allocate mlx5 shared data\n"); 658 ret = -rte_errno; 659 goto error; 660 } 661 mlx5_shared_data = mz->addr; 662 memset(mlx5_shared_data, 0, sizeof(*mlx5_shared_data)); 663 rte_spinlock_init(&mlx5_shared_data->lock); 664 } else { 665 /* Lookup allocated shared memory. */ 666 mz = rte_memzone_lookup(MZ_MLX5_PMD_SHARED_DATA); 667 if (mz == NULL) { 668 DRV_LOG(ERR, 669 "Cannot attach mlx5 shared data\n"); 670 ret = -rte_errno; 671 goto error; 672 } 673 mlx5_shared_data = mz->addr; 674 memset(&mlx5_local_data, 0, sizeof(mlx5_local_data)); 675 } 676 } 677 error: 678 rte_spinlock_unlock(&mlx5_shared_data_lock); 679 return ret; 680 } 681 682 /** 683 * Retrieve integer value from environment variable. 684 * 685 * @param[in] name 686 * Environment variable name. 687 * 688 * @return 689 * Integer value, 0 if the variable is not set. 
690 */ 691 int 692 mlx5_getenv_int(const char *name) 693 { 694 const char *val = getenv(name); 695 696 if (val == NULL) 697 return 0; 698 return atoi(val); 699 } 700 701 /** 702 * Verbs callback to allocate a memory. This function should allocate the space 703 * according to the size provided residing inside a huge page. 704 * Please note that all allocation must respect the alignment from libmlx5 705 * (i.e. currently sysconf(_SC_PAGESIZE)). 706 * 707 * @param[in] size 708 * The size in bytes of the memory to allocate. 709 * @param[in] data 710 * A pointer to the callback data. 711 * 712 * @return 713 * Allocated buffer, NULL otherwise and rte_errno is set. 714 */ 715 static void * 716 mlx5_alloc_verbs_buf(size_t size, void *data) 717 { 718 struct mlx5_priv *priv = data; 719 void *ret; 720 size_t alignment = sysconf(_SC_PAGESIZE); 721 unsigned int socket = SOCKET_ID_ANY; 722 723 if (priv->verbs_alloc_ctx.type == MLX5_VERBS_ALLOC_TYPE_TX_QUEUE) { 724 const struct mlx5_txq_ctrl *ctrl = priv->verbs_alloc_ctx.obj; 725 726 socket = ctrl->socket; 727 } else if (priv->verbs_alloc_ctx.type == 728 MLX5_VERBS_ALLOC_TYPE_RX_QUEUE) { 729 const struct mlx5_rxq_ctrl *ctrl = priv->verbs_alloc_ctx.obj; 730 731 socket = ctrl->socket; 732 } 733 assert(data != NULL); 734 ret = rte_malloc_socket(__func__, size, alignment, socket); 735 if (!ret && size) 736 rte_errno = ENOMEM; 737 return ret; 738 } 739 740 /** 741 * Verbs callback to free a memory. 742 * 743 * @param[in] ptr 744 * A pointer to the memory to free. 745 * @param[in] data 746 * A pointer to the callback data. 747 */ 748 static void 749 mlx5_free_verbs_buf(void *ptr, void *data __rte_unused) 750 { 751 assert(data != NULL); 752 rte_free(ptr); 753 } 754 755 /** 756 * DPDK callback to add udp tunnel port 757 * 758 * @param[in] dev 759 * A pointer to eth_dev 760 * @param[in] udp_tunnel 761 * A pointer to udp tunnel 762 * 763 * @return 764 * 0 on valid udp ports and tunnels, -ENOTSUP otherwise. 765 */ 766 int 767 mlx5_udp_tunnel_port_add(struct rte_eth_dev *dev __rte_unused, 768 struct rte_eth_udp_tunnel *udp_tunnel) 769 { 770 assert(udp_tunnel != NULL); 771 if (udp_tunnel->prot_type == RTE_TUNNEL_TYPE_VXLAN && 772 udp_tunnel->udp_port == 4789) 773 return 0; 774 if (udp_tunnel->prot_type == RTE_TUNNEL_TYPE_VXLAN_GPE && 775 udp_tunnel->udp_port == 4790) 776 return 0; 777 return -ENOTSUP; 778 } 779 780 /** 781 * Initialize process private data structure. 782 * 783 * @param dev 784 * Pointer to Ethernet device structure. 785 * 786 * @return 787 * 0 on success, a negative errno value otherwise and rte_errno is set. 788 */ 789 int 790 mlx5_proc_priv_init(struct rte_eth_dev *dev) 791 { 792 struct mlx5_priv *priv = dev->data->dev_private; 793 struct mlx5_proc_priv *ppriv; 794 size_t ppriv_size; 795 796 /* 797 * UAR register table follows the process private structure. BlueFlame 798 * registers for Tx queues are stored in the table. 799 */ 800 ppriv_size = 801 sizeof(struct mlx5_proc_priv) + priv->txqs_n * sizeof(void *); 802 ppriv = rte_malloc_socket("mlx5_proc_priv", ppriv_size, 803 RTE_CACHE_LINE_SIZE, dev->device->numa_node); 804 if (!ppriv) { 805 rte_errno = ENOMEM; 806 return -rte_errno; 807 } 808 ppriv->uar_table_sz = ppriv_size; 809 dev->process_private = ppriv; 810 return 0; 811 } 812 813 /** 814 * Un-initialize process private data structure. 815 * 816 * @param dev 817 * Pointer to Ethernet device structure. 
 */
static void
mlx5_proc_priv_uninit(struct rte_eth_dev *dev)
{
	if (!dev->process_private)
		return;
	rte_free(dev->process_private);
	dev->process_private = NULL;
}

/**
 * DPDK callback to close the device.
 *
 * Destroy all queues and objects, free memory.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 */
static void
mlx5_dev_close(struct rte_eth_dev *dev)
{
	struct mlx5_priv *priv = dev->data->dev_private;
	unsigned int i;
	int ret;

	DRV_LOG(DEBUG, "port %u closing device \"%s\"",
		dev->data->port_id,
		((priv->sh->ctx != NULL) ? priv->sh->ctx->device->name : ""));
	/* In case mlx5_dev_stop() has not been called. */
	mlx5_dev_interrupt_handler_uninstall(dev);
	mlx5_traffic_disable(dev);
	mlx5_flow_flush(dev, NULL);
	/* Prevent crashes when queues are still in use. */
	dev->rx_pkt_burst = removed_rx_burst;
	dev->tx_pkt_burst = removed_tx_burst;
	rte_wmb();
	/* Disable datapath on secondary process. */
	mlx5_mp_req_stop_rxtx(dev);
	if (priv->rxqs != NULL) {
		/* XXX race condition if mlx5_rx_burst() is still running. */
		usleep(1000);
		for (i = 0; (i != priv->rxqs_n); ++i)
			mlx5_rxq_release(dev, i);
		priv->rxqs_n = 0;
		priv->rxqs = NULL;
	}
	if (priv->txqs != NULL) {
		/* XXX race condition if mlx5_tx_burst() is still running. */
		usleep(1000);
		for (i = 0; (i != priv->txqs_n); ++i)
			mlx5_txq_release(dev, i);
		priv->txqs_n = 0;
		priv->txqs = NULL;
	}
	mlx5_proc_priv_uninit(dev);
	mlx5_mprq_free_mp(dev);
	mlx5_free_shared_dr(priv);
	if (priv->rss_conf.rss_key != NULL)
		rte_free(priv->rss_conf.rss_key);
	if (priv->reta_idx != NULL)
		rte_free(priv->reta_idx);
	if (priv->config.vf)
		mlx5_nl_mac_addr_flush(dev);
	if (priv->nl_socket_route >= 0)
		close(priv->nl_socket_route);
	if (priv->nl_socket_rdma >= 0)
		close(priv->nl_socket_rdma);
	if (priv->vmwa_context)
		mlx5_vlan_vmwa_exit(priv->vmwa_context);
	if (priv->sh) {
		/*
		 * Free the shared context last, because the cleanup
		 * routines above may still use some shared fields, e.g.
		 * mlx5_nl_mac_addr_flush() uses ibdev_path to retrieve the
		 * ifindex if Netlink fails.
893 */ 894 mlx5_free_shared_ibctx(priv->sh); 895 priv->sh = NULL; 896 } 897 ret = mlx5_hrxq_verify(dev); 898 if (ret) 899 DRV_LOG(WARNING, "port %u some hash Rx queue still remain", 900 dev->data->port_id); 901 ret = mlx5_ind_table_obj_verify(dev); 902 if (ret) 903 DRV_LOG(WARNING, "port %u some indirection table still remain", 904 dev->data->port_id); 905 ret = mlx5_rxq_obj_verify(dev); 906 if (ret) 907 DRV_LOG(WARNING, "port %u some Rx queue objects still remain", 908 dev->data->port_id); 909 ret = mlx5_rxq_verify(dev); 910 if (ret) 911 DRV_LOG(WARNING, "port %u some Rx queues still remain", 912 dev->data->port_id); 913 ret = mlx5_txq_ibv_verify(dev); 914 if (ret) 915 DRV_LOG(WARNING, "port %u some Verbs Tx queue still remain", 916 dev->data->port_id); 917 ret = mlx5_txq_verify(dev); 918 if (ret) 919 DRV_LOG(WARNING, "port %u some Tx queues still remain", 920 dev->data->port_id); 921 ret = mlx5_flow_verify(dev); 922 if (ret) 923 DRV_LOG(WARNING, "port %u some flows still remain", 924 dev->data->port_id); 925 if (priv->domain_id != RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID) { 926 unsigned int c = 0; 927 uint16_t port_id; 928 929 RTE_ETH_FOREACH_DEV_OF(port_id, dev->device) { 930 struct mlx5_priv *opriv = 931 rte_eth_devices[port_id].data->dev_private; 932 933 if (!opriv || 934 opriv->domain_id != priv->domain_id || 935 &rte_eth_devices[port_id] == dev) 936 continue; 937 ++c; 938 } 939 if (!c) 940 claim_zero(rte_eth_switch_domain_free(priv->domain_id)); 941 } 942 memset(priv, 0, sizeof(*priv)); 943 priv->domain_id = RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID; 944 /* 945 * Reset mac_addrs to NULL such that it is not freed as part of 946 * rte_eth_dev_release_port(). mac_addrs is part of dev_private so 947 * it is freed when dev_private is freed. 948 */ 949 dev->data->mac_addrs = NULL; 950 } 951 952 const struct eth_dev_ops mlx5_dev_ops = { 953 .dev_configure = mlx5_dev_configure, 954 .dev_start = mlx5_dev_start, 955 .dev_stop = mlx5_dev_stop, 956 .dev_set_link_down = mlx5_set_link_down, 957 .dev_set_link_up = mlx5_set_link_up, 958 .dev_close = mlx5_dev_close, 959 .promiscuous_enable = mlx5_promiscuous_enable, 960 .promiscuous_disable = mlx5_promiscuous_disable, 961 .allmulticast_enable = mlx5_allmulticast_enable, 962 .allmulticast_disable = mlx5_allmulticast_disable, 963 .link_update = mlx5_link_update, 964 .stats_get = mlx5_stats_get, 965 .stats_reset = mlx5_stats_reset, 966 .xstats_get = mlx5_xstats_get, 967 .xstats_reset = mlx5_xstats_reset, 968 .xstats_get_names = mlx5_xstats_get_names, 969 .fw_version_get = mlx5_fw_version_get, 970 .dev_infos_get = mlx5_dev_infos_get, 971 .read_clock = mlx5_read_clock, 972 .dev_supported_ptypes_get = mlx5_dev_supported_ptypes_get, 973 .vlan_filter_set = mlx5_vlan_filter_set, 974 .rx_queue_setup = mlx5_rx_queue_setup, 975 .tx_queue_setup = mlx5_tx_queue_setup, 976 .rx_queue_release = mlx5_rx_queue_release, 977 .tx_queue_release = mlx5_tx_queue_release, 978 .flow_ctrl_get = mlx5_dev_get_flow_ctrl, 979 .flow_ctrl_set = mlx5_dev_set_flow_ctrl, 980 .mac_addr_remove = mlx5_mac_addr_remove, 981 .mac_addr_add = mlx5_mac_addr_add, 982 .mac_addr_set = mlx5_mac_addr_set, 983 .set_mc_addr_list = mlx5_set_mc_addr_list, 984 .mtu_set = mlx5_dev_set_mtu, 985 .vlan_strip_queue_set = mlx5_vlan_strip_queue_set, 986 .vlan_offload_set = mlx5_vlan_offload_set, 987 .reta_update = mlx5_dev_rss_reta_update, 988 .reta_query = mlx5_dev_rss_reta_query, 989 .rss_hash_update = mlx5_rss_hash_update, 990 .rss_hash_conf_get = mlx5_rss_hash_conf_get, 991 .filter_ctrl = mlx5_dev_filter_ctrl, 992 
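	/* Descriptor status, Rx queue interrupt and module info callbacks. */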
.rx_descriptor_status = mlx5_rx_descriptor_status, 993 .tx_descriptor_status = mlx5_tx_descriptor_status, 994 .rx_queue_count = mlx5_rx_queue_count, 995 .rx_queue_intr_enable = mlx5_rx_intr_enable, 996 .rx_queue_intr_disable = mlx5_rx_intr_disable, 997 .is_removed = mlx5_is_removed, 998 .udp_tunnel_port_add = mlx5_udp_tunnel_port_add, 999 .get_module_info = mlx5_get_module_info, 1000 .get_module_eeprom = mlx5_get_module_eeprom, 1001 }; 1002 1003 /* Available operations from secondary process. */ 1004 static const struct eth_dev_ops mlx5_dev_sec_ops = { 1005 .stats_get = mlx5_stats_get, 1006 .stats_reset = mlx5_stats_reset, 1007 .xstats_get = mlx5_xstats_get, 1008 .xstats_reset = mlx5_xstats_reset, 1009 .xstats_get_names = mlx5_xstats_get_names, 1010 .fw_version_get = mlx5_fw_version_get, 1011 .dev_infos_get = mlx5_dev_infos_get, 1012 .rx_descriptor_status = mlx5_rx_descriptor_status, 1013 .tx_descriptor_status = mlx5_tx_descriptor_status, 1014 .get_module_info = mlx5_get_module_info, 1015 .get_module_eeprom = mlx5_get_module_eeprom, 1016 }; 1017 1018 /* Available operations in flow isolated mode. */ 1019 const struct eth_dev_ops mlx5_dev_ops_isolate = { 1020 .dev_configure = mlx5_dev_configure, 1021 .dev_start = mlx5_dev_start, 1022 .dev_stop = mlx5_dev_stop, 1023 .dev_set_link_down = mlx5_set_link_down, 1024 .dev_set_link_up = mlx5_set_link_up, 1025 .dev_close = mlx5_dev_close, 1026 .promiscuous_enable = mlx5_promiscuous_enable, 1027 .promiscuous_disable = mlx5_promiscuous_disable, 1028 .allmulticast_enable = mlx5_allmulticast_enable, 1029 .allmulticast_disable = mlx5_allmulticast_disable, 1030 .link_update = mlx5_link_update, 1031 .stats_get = mlx5_stats_get, 1032 .stats_reset = mlx5_stats_reset, 1033 .xstats_get = mlx5_xstats_get, 1034 .xstats_reset = mlx5_xstats_reset, 1035 .xstats_get_names = mlx5_xstats_get_names, 1036 .fw_version_get = mlx5_fw_version_get, 1037 .dev_infos_get = mlx5_dev_infos_get, 1038 .dev_supported_ptypes_get = mlx5_dev_supported_ptypes_get, 1039 .vlan_filter_set = mlx5_vlan_filter_set, 1040 .rx_queue_setup = mlx5_rx_queue_setup, 1041 .tx_queue_setup = mlx5_tx_queue_setup, 1042 .rx_queue_release = mlx5_rx_queue_release, 1043 .tx_queue_release = mlx5_tx_queue_release, 1044 .flow_ctrl_get = mlx5_dev_get_flow_ctrl, 1045 .flow_ctrl_set = mlx5_dev_set_flow_ctrl, 1046 .mac_addr_remove = mlx5_mac_addr_remove, 1047 .mac_addr_add = mlx5_mac_addr_add, 1048 .mac_addr_set = mlx5_mac_addr_set, 1049 .set_mc_addr_list = mlx5_set_mc_addr_list, 1050 .mtu_set = mlx5_dev_set_mtu, 1051 .vlan_strip_queue_set = mlx5_vlan_strip_queue_set, 1052 .vlan_offload_set = mlx5_vlan_offload_set, 1053 .filter_ctrl = mlx5_dev_filter_ctrl, 1054 .rx_descriptor_status = mlx5_rx_descriptor_status, 1055 .tx_descriptor_status = mlx5_tx_descriptor_status, 1056 .rx_queue_intr_enable = mlx5_rx_intr_enable, 1057 .rx_queue_intr_disable = mlx5_rx_intr_disable, 1058 .is_removed = mlx5_is_removed, 1059 .get_module_info = mlx5_get_module_info, 1060 .get_module_eeprom = mlx5_get_module_eeprom, 1061 }; 1062 1063 /** 1064 * Verify and store value for device argument. 1065 * 1066 * @param[in] key 1067 * Key argument to verify. 1068 * @param[in] val 1069 * Value associated with key. 1070 * @param opaque 1071 * User data. 1072 * 1073 * @return 1074 * 0 on success, a negative errno value otherwise and rte_errno is set. 
1075 */ 1076 static int 1077 mlx5_args_check(const char *key, const char *val, void *opaque) 1078 { 1079 struct mlx5_dev_config *config = opaque; 1080 unsigned long tmp; 1081 1082 /* No-op, port representors are processed in mlx5_dev_spawn(). */ 1083 if (!strcmp(MLX5_REPRESENTOR, key)) 1084 return 0; 1085 errno = 0; 1086 tmp = strtoul(val, NULL, 0); 1087 if (errno) { 1088 rte_errno = errno; 1089 DRV_LOG(WARNING, "%s: \"%s\" is not a valid integer", key, val); 1090 return -rte_errno; 1091 } 1092 if (strcmp(MLX5_RXQ_CQE_COMP_EN, key) == 0) { 1093 config->cqe_comp = !!tmp; 1094 } else if (strcmp(MLX5_RXQ_CQE_PAD_EN, key) == 0) { 1095 config->cqe_pad = !!tmp; 1096 } else if (strcmp(MLX5_RXQ_PKT_PAD_EN, key) == 0) { 1097 config->hw_padding = !!tmp; 1098 } else if (strcmp(MLX5_RX_MPRQ_EN, key) == 0) { 1099 config->mprq.enabled = !!tmp; 1100 } else if (strcmp(MLX5_RX_MPRQ_LOG_STRIDE_NUM, key) == 0) { 1101 config->mprq.stride_num_n = tmp; 1102 } else if (strcmp(MLX5_RX_MPRQ_MAX_MEMCPY_LEN, key) == 0) { 1103 config->mprq.max_memcpy_len = tmp; 1104 } else if (strcmp(MLX5_RXQS_MIN_MPRQ, key) == 0) { 1105 config->mprq.min_rxqs_num = tmp; 1106 } else if (strcmp(MLX5_TXQ_INLINE, key) == 0) { 1107 DRV_LOG(WARNING, "%s: deprecated parameter," 1108 " converted to txq_inline_max", key); 1109 config->txq_inline_max = tmp; 1110 } else if (strcmp(MLX5_TXQ_INLINE_MAX, key) == 0) { 1111 config->txq_inline_max = tmp; 1112 } else if (strcmp(MLX5_TXQ_INLINE_MIN, key) == 0) { 1113 config->txq_inline_min = tmp; 1114 } else if (strcmp(MLX5_TXQ_INLINE_MPW, key) == 0) { 1115 config->txq_inline_mpw = tmp; 1116 } else if (strcmp(MLX5_TXQS_MIN_INLINE, key) == 0) { 1117 config->txqs_inline = tmp; 1118 } else if (strcmp(MLX5_TXQS_MAX_VEC, key) == 0) { 1119 DRV_LOG(WARNING, "%s: deprecated parameter, ignored", key); 1120 } else if (strcmp(MLX5_TXQ_MPW_EN, key) == 0) { 1121 config->mps = !!tmp; 1122 } else if (strcmp(MLX5_TXQ_MPW_HDR_DSEG_EN, key) == 0) { 1123 DRV_LOG(WARNING, "%s: deprecated parameter, ignored", key); 1124 } else if (strcmp(MLX5_TXQ_MAX_INLINE_LEN, key) == 0) { 1125 DRV_LOG(WARNING, "%s: deprecated parameter," 1126 " converted to txq_inline_mpw", key); 1127 config->txq_inline_mpw = tmp; 1128 } else if (strcmp(MLX5_TX_VEC_EN, key) == 0) { 1129 DRV_LOG(WARNING, "%s: deprecated parameter, ignored", key); 1130 } else if (strcmp(MLX5_RX_VEC_EN, key) == 0) { 1131 config->rx_vec_en = !!tmp; 1132 } else if (strcmp(MLX5_L3_VXLAN_EN, key) == 0) { 1133 config->l3_vxlan_en = !!tmp; 1134 } else if (strcmp(MLX5_VF_NL_EN, key) == 0) { 1135 config->vf_nl_en = !!tmp; 1136 } else if (strcmp(MLX5_DV_ESW_EN, key) == 0) { 1137 config->dv_esw_en = !!tmp; 1138 } else if (strcmp(MLX5_DV_FLOW_EN, key) == 0) { 1139 config->dv_flow_en = !!tmp; 1140 } else if (strcmp(MLX5_MR_EXT_MEMSEG_EN, key) == 0) { 1141 config->mr_ext_memseg_en = !!tmp; 1142 } else if (strcmp(MLX5_MAX_DUMP_FILES_NUM, key) == 0) { 1143 config->max_dump_files_num = tmp; 1144 } else if (strcmp(MLX5_LRO_TIMEOUT_USEC, key) == 0) { 1145 config->lro.timeout = tmp; 1146 } else { 1147 DRV_LOG(WARNING, "%s: unknown parameter", key); 1148 rte_errno = EINVAL; 1149 return -rte_errno; 1150 } 1151 return 0; 1152 } 1153 1154 /** 1155 * Parse device parameters. 1156 * 1157 * @param config 1158 * Pointer to device configuration structure. 1159 * @param devargs 1160 * Device arguments structure. 1161 * 1162 * @return 1163 * 0 on success, a negative errno value otherwise and rte_errno is set. 
1164 */ 1165 static int 1166 mlx5_args(struct mlx5_dev_config *config, struct rte_devargs *devargs) 1167 { 1168 const char **params = (const char *[]){ 1169 MLX5_RXQ_CQE_COMP_EN, 1170 MLX5_RXQ_CQE_PAD_EN, 1171 MLX5_RXQ_PKT_PAD_EN, 1172 MLX5_RX_MPRQ_EN, 1173 MLX5_RX_MPRQ_LOG_STRIDE_NUM, 1174 MLX5_RX_MPRQ_MAX_MEMCPY_LEN, 1175 MLX5_RXQS_MIN_MPRQ, 1176 MLX5_TXQ_INLINE, 1177 MLX5_TXQ_INLINE_MIN, 1178 MLX5_TXQ_INLINE_MAX, 1179 MLX5_TXQ_INLINE_MPW, 1180 MLX5_TXQS_MIN_INLINE, 1181 MLX5_TXQS_MAX_VEC, 1182 MLX5_TXQ_MPW_EN, 1183 MLX5_TXQ_MPW_HDR_DSEG_EN, 1184 MLX5_TXQ_MAX_INLINE_LEN, 1185 MLX5_TX_VEC_EN, 1186 MLX5_RX_VEC_EN, 1187 MLX5_L3_VXLAN_EN, 1188 MLX5_VF_NL_EN, 1189 MLX5_DV_ESW_EN, 1190 MLX5_DV_FLOW_EN, 1191 MLX5_MR_EXT_MEMSEG_EN, 1192 MLX5_REPRESENTOR, 1193 MLX5_MAX_DUMP_FILES_NUM, 1194 MLX5_LRO_TIMEOUT_USEC, 1195 NULL, 1196 }; 1197 struct rte_kvargs *kvlist; 1198 int ret = 0; 1199 int i; 1200 1201 if (devargs == NULL) 1202 return 0; 1203 /* Following UGLY cast is done to pass checkpatch. */ 1204 kvlist = rte_kvargs_parse(devargs->args, params); 1205 if (kvlist == NULL) { 1206 rte_errno = EINVAL; 1207 return -rte_errno; 1208 } 1209 /* Process parameters. */ 1210 for (i = 0; (params[i] != NULL); ++i) { 1211 if (rte_kvargs_count(kvlist, params[i])) { 1212 ret = rte_kvargs_process(kvlist, params[i], 1213 mlx5_args_check, config); 1214 if (ret) { 1215 rte_errno = EINVAL; 1216 rte_kvargs_free(kvlist); 1217 return -rte_errno; 1218 } 1219 } 1220 } 1221 rte_kvargs_free(kvlist); 1222 return 0; 1223 } 1224 1225 static struct rte_pci_driver mlx5_driver; 1226 1227 /** 1228 * PMD global initialization. 1229 * 1230 * Independent from individual device, this function initializes global 1231 * per-PMD data structures distinguishing primary and secondary processes. 1232 * Hence, each initialization is called once per a process. 1233 * 1234 * @return 1235 * 0 on success, a negative errno value otherwise and rte_errno is set. 1236 */ 1237 static int 1238 mlx5_init_once(void) 1239 { 1240 struct mlx5_shared_data *sd; 1241 struct mlx5_local_data *ld = &mlx5_local_data; 1242 int ret = 0; 1243 1244 if (mlx5_init_shared_data()) 1245 return -rte_errno; 1246 sd = mlx5_shared_data; 1247 assert(sd); 1248 rte_spinlock_lock(&sd->lock); 1249 switch (rte_eal_process_type()) { 1250 case RTE_PROC_PRIMARY: 1251 if (sd->init_done) 1252 break; 1253 LIST_INIT(&sd->mem_event_cb_list); 1254 rte_rwlock_init(&sd->mem_event_rwlock); 1255 rte_mem_event_callback_register("MLX5_MEM_EVENT_CB", 1256 mlx5_mr_mem_event_cb, NULL); 1257 ret = mlx5_mp_init_primary(); 1258 if (ret) 1259 goto out; 1260 sd->init_done = true; 1261 break; 1262 case RTE_PROC_SECONDARY: 1263 if (ld->init_done) 1264 break; 1265 ret = mlx5_mp_init_secondary(); 1266 if (ret) 1267 goto out; 1268 ++sd->secondary_cnt; 1269 ld->init_done = true; 1270 break; 1271 default: 1272 break; 1273 } 1274 out: 1275 rte_spinlock_unlock(&sd->lock); 1276 return ret; 1277 } 1278 1279 /** 1280 * Configures the minimal amount of data to inline into WQE 1281 * while sending packets. 1282 * 1283 * - the txq_inline_min has the maximal priority, if this 1284 * key is specified in devargs 1285 * - if DevX is enabled the inline mode is queried from the 1286 * device (HCA attributes and NIC vport context if needed). 1287 * - otherwise L2 mode (18 bytes) is assumed for ConnectX-4/4LX 1288 * and none (0 bytes) for other NICs 1289 * 1290 * @param spawn 1291 * Verbs device parameters (name, port, switch_info) to spawn. 1292 * @param config 1293 * Device configuration parameters. 
 */
static void
mlx5_set_min_inline(struct mlx5_dev_spawn_data *spawn,
		    struct mlx5_dev_config *config)
{
	if (config->txq_inline_min != MLX5_ARG_UNSET) {
		/* Application defines size of inlined data explicitly. */
		switch (spawn->pci_dev->id.device_id) {
		case PCI_DEVICE_ID_MELLANOX_CONNECTX4:
		case PCI_DEVICE_ID_MELLANOX_CONNECTX4VF:
			if (config->txq_inline_min <
				    (int)MLX5_INLINE_HSIZE_L2) {
				DRV_LOG(DEBUG,
					"txq_inline_min aligned to minimal"
					" ConnectX-4 required value %d",
					(int)MLX5_INLINE_HSIZE_L2);
				config->txq_inline_min = MLX5_INLINE_HSIZE_L2;
			}
			break;
		}
		goto exit;
	}
	if (config->hca_attr.eth_net_offloads) {
		/* We have DevX enabled, inline mode queried successfully. */
		switch (config->hca_attr.wqe_inline_mode) {
		case MLX5_CAP_INLINE_MODE_L2:
			/* Outer L2 header must be inlined. */
			config->txq_inline_min = MLX5_INLINE_HSIZE_L2;
			goto exit;
		case MLX5_CAP_INLINE_MODE_NOT_REQUIRED:
			/* No inline data is required by the NIC. */
			config->txq_inline_min = MLX5_INLINE_HSIZE_NONE;
			config->hw_vlan_insert =
				config->hca_attr.wqe_vlan_insert;
			DRV_LOG(DEBUG, "Tx VLAN insertion is supported");
			goto exit;
		case MLX5_CAP_INLINE_MODE_VPORT_CONTEXT:
			/* Inline mode is defined by the NIC vport context. */
			if (!config->hca_attr.eth_virt)
				break;
			switch (config->hca_attr.vport_inline_mode) {
			case MLX5_INLINE_MODE_NONE:
				config->txq_inline_min =
					MLX5_INLINE_HSIZE_NONE;
				goto exit;
			case MLX5_INLINE_MODE_L2:
				config->txq_inline_min =
					MLX5_INLINE_HSIZE_L2;
				goto exit;
			case MLX5_INLINE_MODE_IP:
				config->txq_inline_min =
					MLX5_INLINE_HSIZE_L3;
				goto exit;
			case MLX5_INLINE_MODE_TCP_UDP:
				config->txq_inline_min =
					MLX5_INLINE_HSIZE_L4;
				goto exit;
			case MLX5_INLINE_MODE_INNER_L2:
				config->txq_inline_min =
					MLX5_INLINE_HSIZE_INNER_L2;
				goto exit;
			case MLX5_INLINE_MODE_INNER_IP:
				config->txq_inline_min =
					MLX5_INLINE_HSIZE_INNER_L3;
				goto exit;
			case MLX5_INLINE_MODE_INNER_TCP_UDP:
				config->txq_inline_min =
					MLX5_INLINE_HSIZE_INNER_L4;
				goto exit;
			}
		}
	}
	/*
	 * We get here if we are unable to deduce
	 * inline data size with DevX. Try PCI ID
	 * to determine old NICs.
	 */
	switch (spawn->pci_dev->id.device_id) {
	case PCI_DEVICE_ID_MELLANOX_CONNECTX4:
	case PCI_DEVICE_ID_MELLANOX_CONNECTX4VF:
	case PCI_DEVICE_ID_MELLANOX_CONNECTX4LX:
	case PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF:
		config->txq_inline_min = MLX5_INLINE_HSIZE_L2;
		config->hw_vlan_insert = 0;
		break;
	case PCI_DEVICE_ID_MELLANOX_CONNECTX5:
	case PCI_DEVICE_ID_MELLANOX_CONNECTX5VF:
	case PCI_DEVICE_ID_MELLANOX_CONNECTX5EX:
	case PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF:
		/*
		 * These NICs support VLAN insertion from WQE and
		 * report the wqe_vlan_insert flag. But there is a bug
		 * that may break PFC control, so the feature is disabled.
		 */
		config->hw_vlan_insert = 0;
		config->txq_inline_min = MLX5_INLINE_HSIZE_NONE;
		break;
	default:
		config->txq_inline_min = MLX5_INLINE_HSIZE_NONE;
		break;
	}
exit:
	DRV_LOG(DEBUG, "min tx inline configured: %d", config->txq_inline_min);
}

/**
 * Allocate page of door-bells and register it using DevX API.
 *
 * @param [in] dev
 *   Pointer to Ethernet device.
 *
 * @return
 *   Pointer to new page on success, NULL otherwise.
 */
static struct mlx5_devx_dbr_page *
mlx5_alloc_dbr_page(struct rte_eth_dev *dev)
{
	struct mlx5_priv *priv = dev->data->dev_private;
	struct mlx5_devx_dbr_page *page;

	/* Allocate space for door-bell page and management data. */
	page = rte_calloc_socket(__func__, 1, sizeof(struct mlx5_devx_dbr_page),
				 RTE_CACHE_LINE_SIZE, dev->device->numa_node);
	if (!page) {
		DRV_LOG(ERR, "port %u cannot allocate dbr page",
			dev->data->port_id);
		return NULL;
	}
	/* Register allocated memory. */
	page->umem = mlx5_glue->devx_umem_reg(priv->sh->ctx, page->dbrs,
					      MLX5_DBR_PAGE_SIZE, 0);
	if (!page->umem) {
		DRV_LOG(ERR, "port %u cannot umem reg dbr page",
			dev->data->port_id);
		rte_free(page);
		return NULL;
	}
	return page;
}

/**
 * Find the next available door-bell, allocate new page if needed.
 *
 * @param [in] dev
 *   Pointer to Ethernet device.
 * @param [out] dbr_page
 *   Door-bell page containing the page data.
 *
 * @return
 *   Door-bell address offset on success, a negative error value otherwise.
 */
int64_t
mlx5_get_dbr(struct rte_eth_dev *dev, struct mlx5_devx_dbr_page **dbr_page)
{
	struct mlx5_priv *priv = dev->data->dev_private;
	struct mlx5_devx_dbr_page *page = NULL;
	uint32_t i, j;

	LIST_FOREACH(page, &priv->dbrpgs, next)
		if (page->dbr_count < MLX5_DBR_PER_PAGE)
			break;
	if (!page) { /* No page with free door-bell exists. */
		page = mlx5_alloc_dbr_page(dev);
		if (!page) /* Failed to allocate new page. */
			return (-1);
		LIST_INSERT_HEAD(&priv->dbrpgs, page, next);
	}
	/* Loop to find bitmap part with clear bit. */
	for (i = 0;
	     i < MLX5_DBR_BITMAP_SIZE && page->dbr_bitmap[i] == UINT64_MAX;
	     i++)
		; /* Empty. */
	/* Find the first clear bit. */
	j = rte_bsf64(~page->dbr_bitmap[i]);
	assert(i < (MLX5_DBR_PER_PAGE / 64));
	page->dbr_bitmap[i] |= (UINT64_C(1) << j);
	page->dbr_count++;
	*dbr_page = page;
	return (((i * 64) + j) * sizeof(uint64_t));
}

/**
 * Release a door-bell record.
 *
 * @param [in] dev
 *   Pointer to Ethernet device.
 * @param [in] umem_id
 *   UMEM ID of page containing the door-bell record to release.
 * @param [in] offset
 *   Offset of door-bell record in page.
 *
 * @return
 *   0 on success, a negative error value otherwise.
 */
int32_t
mlx5_release_dbr(struct rte_eth_dev *dev, uint32_t umem_id, uint64_t offset)
{
	struct mlx5_priv *priv = dev->data->dev_private;
	struct mlx5_devx_dbr_page *page = NULL;
	int ret = 0;

	LIST_FOREACH(page, &priv->dbrpgs, next)
		/* Find the page this address belongs to. */
		if (page->umem->umem_id == umem_id)
			break;
	if (!page)
		return -EINVAL;
	page->dbr_count--;
	if (!page->dbr_count) {
		/* Page not used, free it and remove from list. */
		LIST_REMOVE(page, next);
		if (page->umem)
			ret = -mlx5_glue->devx_umem_dereg(page->umem);
		rte_free(page);
	} else {
		/* Mark in bitmap that this door-bell is not in use. */
		offset /= MLX5_DBR_SIZE;
		int i = offset / 64;
		int j = offset % 64;

		page->dbr_bitmap[i] &= ~(UINT64_C(1) << j);
	}
	return ret;
}

/**
 * Spawn an Ethernet device from Verbs information.
1521 * 1522 * @param dpdk_dev 1523 * Backing DPDK device. 1524 * @param spawn 1525 * Verbs device parameters (name, port, switch_info) to spawn. 1526 * @param config 1527 * Device configuration parameters. 1528 * 1529 * @return 1530 * A valid Ethernet device object on success, NULL otherwise and rte_errno 1531 * is set. The following errors are defined: 1532 * 1533 * EBUSY: device is not supposed to be spawned. 1534 * EEXIST: device is already spawned 1535 */ 1536 static struct rte_eth_dev * 1537 mlx5_dev_spawn(struct rte_device *dpdk_dev, 1538 struct mlx5_dev_spawn_data *spawn, 1539 struct mlx5_dev_config config) 1540 { 1541 const struct mlx5_switch_info *switch_info = &spawn->info; 1542 struct mlx5_ibv_shared *sh = NULL; 1543 struct ibv_port_attr port_attr; 1544 struct mlx5dv_context dv_attr = { .comp_mask = 0 }; 1545 struct rte_eth_dev *eth_dev = NULL; 1546 struct mlx5_priv *priv = NULL; 1547 int err = 0; 1548 unsigned int hw_padding = 0; 1549 unsigned int mps; 1550 unsigned int cqe_comp; 1551 unsigned int cqe_pad = 0; 1552 unsigned int tunnel_en = 0; 1553 unsigned int mpls_en = 0; 1554 unsigned int swp = 0; 1555 unsigned int mprq = 0; 1556 unsigned int mprq_min_stride_size_n = 0; 1557 unsigned int mprq_max_stride_size_n = 0; 1558 unsigned int mprq_min_stride_num_n = 0; 1559 unsigned int mprq_max_stride_num_n = 0; 1560 struct rte_ether_addr mac; 1561 char name[RTE_ETH_NAME_MAX_LEN]; 1562 int own_domain_id = 0; 1563 uint16_t port_id; 1564 unsigned int i; 1565 1566 /* Determine if this port representor is supposed to be spawned. */ 1567 if (switch_info->representor && dpdk_dev->devargs) { 1568 struct rte_eth_devargs eth_da; 1569 1570 err = rte_eth_devargs_parse(dpdk_dev->devargs->args, ð_da); 1571 if (err) { 1572 rte_errno = -err; 1573 DRV_LOG(ERR, "failed to process device arguments: %s", 1574 strerror(rte_errno)); 1575 return NULL; 1576 } 1577 for (i = 0; i < eth_da.nb_representor_ports; ++i) 1578 if (eth_da.representor_ports[i] == 1579 (uint16_t)switch_info->port_name) 1580 break; 1581 if (i == eth_da.nb_representor_ports) { 1582 rte_errno = EBUSY; 1583 return NULL; 1584 } 1585 } 1586 /* Build device name. */ 1587 if (!switch_info->representor) 1588 strlcpy(name, dpdk_dev->name, sizeof(name)); 1589 else 1590 snprintf(name, sizeof(name), "%s_representor_%u", 1591 dpdk_dev->name, switch_info->port_name); 1592 /* check if the device is already spawned */ 1593 if (rte_eth_dev_get_port_by_name(name, &port_id) == 0) { 1594 rte_errno = EEXIST; 1595 return NULL; 1596 } 1597 DRV_LOG(DEBUG, "naming Ethernet device \"%s\"", name); 1598 if (rte_eal_process_type() == RTE_PROC_SECONDARY) { 1599 eth_dev = rte_eth_dev_attach_secondary(name); 1600 if (eth_dev == NULL) { 1601 DRV_LOG(ERR, "can not attach rte ethdev"); 1602 rte_errno = ENOMEM; 1603 return NULL; 1604 } 1605 eth_dev->device = dpdk_dev; 1606 eth_dev->dev_ops = &mlx5_dev_sec_ops; 1607 err = mlx5_proc_priv_init(eth_dev); 1608 if (err) 1609 return NULL; 1610 /* Receive command fd from primary process */ 1611 err = mlx5_mp_req_verbs_cmd_fd(eth_dev); 1612 if (err < 0) 1613 return NULL; 1614 /* Remap UAR for Tx queues. */ 1615 err = mlx5_tx_uar_init_secondary(eth_dev, err); 1616 if (err) 1617 return NULL; 1618 /* 1619 * Ethdev pointer is still required as input since 1620 * the primary device is not accessible from the 1621 * secondary process. 
1622 */ 1623 eth_dev->rx_pkt_burst = mlx5_select_rx_function(eth_dev); 1624 eth_dev->tx_pkt_burst = mlx5_select_tx_function(eth_dev); 1625 return eth_dev; 1626 } 1627 sh = mlx5_alloc_shared_ibctx(spawn); 1628 if (!sh) 1629 return NULL; 1630 config.devx = sh->devx; 1631 #ifdef HAVE_MLX5DV_DR_ACTION_DEST_DEVX_TIR 1632 config.dest_tir = 1; 1633 #endif 1634 #ifdef HAVE_IBV_MLX5_MOD_SWP 1635 dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_SWP; 1636 #endif 1637 /* 1638 * Multi-packet send is supported by ConnectX-4 Lx PF as well 1639 * as all ConnectX-5 devices. 1640 */ 1641 #ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT 1642 dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS; 1643 #endif 1644 #ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT 1645 dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_STRIDING_RQ; 1646 #endif 1647 mlx5_glue->dv_query_device(sh->ctx, &dv_attr); 1648 if (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED) { 1649 if (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW) { 1650 DRV_LOG(DEBUG, "enhanced MPW is supported"); 1651 mps = MLX5_MPW_ENHANCED; 1652 } else { 1653 DRV_LOG(DEBUG, "MPW is supported"); 1654 mps = MLX5_MPW; 1655 } 1656 } else { 1657 DRV_LOG(DEBUG, "MPW isn't supported"); 1658 mps = MLX5_MPW_DISABLED; 1659 } 1660 #ifdef HAVE_IBV_MLX5_MOD_SWP 1661 if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_SWP) 1662 swp = dv_attr.sw_parsing_caps.sw_parsing_offloads; 1663 DRV_LOG(DEBUG, "SWP support: %u", swp); 1664 #endif 1665 config.swp = !!swp; 1666 #ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT 1667 if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_STRIDING_RQ) { 1668 struct mlx5dv_striding_rq_caps mprq_caps = 1669 dv_attr.striding_rq_caps; 1670 1671 DRV_LOG(DEBUG, "\tmin_single_stride_log_num_of_bytes: %d", 1672 mprq_caps.min_single_stride_log_num_of_bytes); 1673 DRV_LOG(DEBUG, "\tmax_single_stride_log_num_of_bytes: %d", 1674 mprq_caps.max_single_stride_log_num_of_bytes); 1675 DRV_LOG(DEBUG, "\tmin_single_wqe_log_num_of_strides: %d", 1676 mprq_caps.min_single_wqe_log_num_of_strides); 1677 DRV_LOG(DEBUG, "\tmax_single_wqe_log_num_of_strides: %d", 1678 mprq_caps.max_single_wqe_log_num_of_strides); 1679 DRV_LOG(DEBUG, "\tsupported_qpts: %d", 1680 mprq_caps.supported_qpts); 1681 DRV_LOG(DEBUG, "device supports Multi-Packet RQ"); 1682 mprq = 1; 1683 mprq_min_stride_size_n = 1684 mprq_caps.min_single_stride_log_num_of_bytes; 1685 mprq_max_stride_size_n = 1686 mprq_caps.max_single_stride_log_num_of_bytes; 1687 mprq_min_stride_num_n = 1688 mprq_caps.min_single_wqe_log_num_of_strides; 1689 mprq_max_stride_num_n = 1690 mprq_caps.max_single_wqe_log_num_of_strides; 1691 config.mprq.stride_num_n = RTE_MAX(MLX5_MPRQ_STRIDE_NUM_N, 1692 mprq_min_stride_num_n); 1693 } 1694 #endif 1695 if (RTE_CACHE_LINE_SIZE == 128 && 1696 !(dv_attr.flags & MLX5DV_CONTEXT_FLAGS_CQE_128B_COMP)) 1697 cqe_comp = 0; 1698 else 1699 cqe_comp = 1; 1700 config.cqe_comp = cqe_comp; 1701 #ifdef HAVE_IBV_MLX5_MOD_CQE_128B_PAD 1702 /* Whether device supports 128B Rx CQE padding. */ 1703 cqe_pad = RTE_CACHE_LINE_SIZE == 128 && 1704 (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_CQE_128B_PAD); 1705 #endif 1706 #ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT 1707 if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS) { 1708 tunnel_en = ((dv_attr.tunnel_offloads_caps & 1709 MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_VXLAN) && 1710 (dv_attr.tunnel_offloads_caps & 1711 MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_GRE)); 1712 } 1713 DRV_LOG(DEBUG, "tunnel offloading is %ssupported", 1714 tunnel_en ? 
"" : "not "); 1715 #else 1716 DRV_LOG(WARNING, 1717 "tunnel offloading disabled due to old OFED/rdma-core version"); 1718 #endif 1719 config.tunnel_en = tunnel_en; 1720 #ifdef HAVE_IBV_DEVICE_MPLS_SUPPORT 1721 mpls_en = ((dv_attr.tunnel_offloads_caps & 1722 MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_CW_MPLS_OVER_GRE) && 1723 (dv_attr.tunnel_offloads_caps & 1724 MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_CW_MPLS_OVER_UDP)); 1725 DRV_LOG(DEBUG, "MPLS over GRE/UDP tunnel offloading is %ssupported", 1726 mpls_en ? "" : "not "); 1727 #else 1728 DRV_LOG(WARNING, "MPLS over GRE/UDP tunnel offloading disabled due to" 1729 " old OFED/rdma-core version or firmware configuration"); 1730 #endif 1731 config.mpls_en = mpls_en; 1732 /* Check port status. */ 1733 err = mlx5_glue->query_port(sh->ctx, spawn->ibv_port, &port_attr); 1734 if (err) { 1735 DRV_LOG(ERR, "port query failed: %s", strerror(err)); 1736 goto error; 1737 } 1738 if (port_attr.link_layer != IBV_LINK_LAYER_ETHERNET) { 1739 DRV_LOG(ERR, "port is not configured in Ethernet mode"); 1740 err = EINVAL; 1741 goto error; 1742 } 1743 if (port_attr.state != IBV_PORT_ACTIVE) 1744 DRV_LOG(DEBUG, "port is not active: \"%s\" (%d)", 1745 mlx5_glue->port_state_str(port_attr.state), 1746 port_attr.state); 1747 /* Allocate private eth device data. */ 1748 priv = rte_zmalloc("ethdev private structure", 1749 sizeof(*priv), 1750 RTE_CACHE_LINE_SIZE); 1751 if (priv == NULL) { 1752 DRV_LOG(ERR, "priv allocation failure"); 1753 err = ENOMEM; 1754 goto error; 1755 } 1756 priv->sh = sh; 1757 priv->ibv_port = spawn->ibv_port; 1758 priv->mtu = RTE_ETHER_MTU; 1759 #ifndef RTE_ARCH_64 1760 /* Initialize UAR access locks for 32bit implementations. */ 1761 rte_spinlock_init(&priv->uar_lock_cq); 1762 for (i = 0; i < MLX5_UAR_PAGE_NUM_MAX; i++) 1763 rte_spinlock_init(&priv->uar_lock[i]); 1764 #endif 1765 /* Some internal functions rely on Netlink sockets, open them now. */ 1766 priv->nl_socket_rdma = mlx5_nl_init(NETLINK_RDMA); 1767 priv->nl_socket_route = mlx5_nl_init(NETLINK_ROUTE); 1768 priv->nl_sn = 0; 1769 priv->representor = !!switch_info->representor; 1770 priv->master = !!switch_info->master; 1771 priv->domain_id = RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID; 1772 /* 1773 * Currently we support single E-Switch per PF configurations 1774 * only and vport_id field contains the vport index for 1775 * associated VF, which is deduced from representor port name. 1776 * For example, let's have the IB device port 10, it has 1777 * attached network device eth0, which has port name attribute 1778 * pf0vf2, we can deduce the VF number as 2, and set vport index 1779 * as 3 (2+1). This assigning schema should be changed if the 1780 * multiple E-Switch instances per PF configurations or/and PCI 1781 * subfunctions are added. 1782 */ 1783 priv->vport_id = switch_info->representor ? 1784 switch_info->port_name + 1 : -1; 1785 /* representor_id field keeps the unmodified port/VF index. */ 1786 priv->representor_id = switch_info->representor ? 1787 switch_info->port_name : -1; 1788 /* 1789 * Look for sibling devices in order to reuse their switch domain 1790 * if any, otherwise allocate one. 
1791 */ 1792 RTE_ETH_FOREACH_DEV_OF(port_id, dpdk_dev) { 1793 const struct mlx5_priv *opriv = 1794 rte_eth_devices[port_id].data->dev_private; 1795 1796 if (!opriv || 1797 opriv->domain_id == 1798 RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID) 1799 continue; 1800 priv->domain_id = opriv->domain_id; 1801 break; 1802 } 1803 if (priv->domain_id == RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID) { 1804 err = rte_eth_switch_domain_alloc(&priv->domain_id); 1805 if (err) { 1806 err = rte_errno; 1807 DRV_LOG(ERR, "unable to allocate switch domain: %s", 1808 strerror(rte_errno)); 1809 goto error; 1810 } 1811 own_domain_id = 1; 1812 } 1813 err = mlx5_args(&config, dpdk_dev->devargs); 1814 if (err) { 1815 err = rte_errno; 1816 DRV_LOG(ERR, "failed to process device arguments: %s", 1817 strerror(rte_errno)); 1818 goto error; 1819 } 1820 config.hw_csum = !!(sh->device_attr.device_cap_flags_ex & 1821 IBV_DEVICE_RAW_IP_CSUM); 1822 DRV_LOG(DEBUG, "checksum offloading is %ssupported", 1823 (config.hw_csum ? "" : "not ")); 1824 #if !defined(HAVE_IBV_DEVICE_COUNTERS_SET_V42) && \ 1825 !defined(HAVE_IBV_DEVICE_COUNTERS_SET_V45) 1826 DRV_LOG(DEBUG, "counters are not supported"); 1827 #endif 1828 #ifndef HAVE_IBV_FLOW_DV_SUPPORT 1829 if (config.dv_flow_en) { 1830 DRV_LOG(WARNING, "DV flow is not supported"); 1831 config.dv_flow_en = 0; 1832 } 1833 #endif 1834 config.ind_table_max_size = 1835 sh->device_attr.rss_caps.max_rwq_indirection_table_size; 1836 /* 1837 * Remove this check once DPDK supports larger/variable 1838 * indirection tables. 1839 */ 1840 if (config.ind_table_max_size > (unsigned int)ETH_RSS_RETA_SIZE_512) 1841 config.ind_table_max_size = ETH_RSS_RETA_SIZE_512; 1842 DRV_LOG(DEBUG, "maximum Rx indirection table size is %u", 1843 config.ind_table_max_size); 1844 config.hw_vlan_strip = !!(sh->device_attr.raw_packet_caps & 1845 IBV_RAW_PACKET_CAP_CVLAN_STRIPPING); 1846 DRV_LOG(DEBUG, "VLAN stripping is %ssupported", 1847 (config.hw_vlan_strip ? "" : "not ")); 1848 config.hw_fcs_strip = !!(sh->device_attr.raw_packet_caps & 1849 IBV_RAW_PACKET_CAP_SCATTER_FCS); 1850 DRV_LOG(DEBUG, "FCS stripping configuration is %ssupported", 1851 (config.hw_fcs_strip ? "" : "not ")); 1852 #if defined(HAVE_IBV_WQ_FLAG_RX_END_PADDING) 1853 hw_padding = !!sh->device_attr.rx_pad_end_addr_align; 1854 #elif defined(HAVE_IBV_WQ_FLAGS_PCI_WRITE_END_PADDING) 1855 hw_padding = !!(sh->device_attr.device_cap_flags_ex & 1856 IBV_DEVICE_PCI_WRITE_END_PADDING); 1857 #endif 1858 if (config.hw_padding && !hw_padding) { 1859 DRV_LOG(DEBUG, "Rx end alignment padding isn't supported"); 1860 config.hw_padding = 0; 1861 } else if (config.hw_padding) { 1862 DRV_LOG(DEBUG, "Rx end alignment padding is enabled"); 1863 } 1864 config.tso = (sh->device_attr.tso_caps.max_tso > 0 && 1865 (sh->device_attr.tso_caps.supported_qpts & 1866 (1 << IBV_QPT_RAW_PACKET))); 1867 if (config.tso) 1868 config.tso_max_payload_sz = sh->device_attr.tso_caps.max_tso; 1869 /* 1870 * MPW is disabled by default, while the Enhanced MPW is enabled 1871 * by default. 1872 */ 1873 if (config.mps == MLX5_ARG_UNSET) 1874 config.mps = (mps == MLX5_MPW_ENHANCED) ? MLX5_MPW_ENHANCED : 1875 MLX5_MPW_DISABLED; 1876 else 1877 config.mps = config.mps ? mps : MLX5_MPW_DISABLED; 1878 DRV_LOG(INFO, "%sMPS is %s", 1879 config.mps == MLX5_MPW_ENHANCED ? "enhanced " : "", 1880 config.mps != MLX5_MPW_DISABLED ? 
"enabled" : "disabled"); 1881 if (config.cqe_comp && !cqe_comp) { 1882 DRV_LOG(WARNING, "Rx CQE compression isn't supported"); 1883 config.cqe_comp = 0; 1884 } 1885 if (config.cqe_pad && !cqe_pad) { 1886 DRV_LOG(WARNING, "Rx CQE padding isn't supported"); 1887 config.cqe_pad = 0; 1888 } else if (config.cqe_pad) { 1889 DRV_LOG(INFO, "Rx CQE padding is enabled"); 1890 } 1891 if (config.devx) { 1892 priv->counter_fallback = 0; 1893 err = mlx5_devx_cmd_query_hca_attr(sh->ctx, &config.hca_attr); 1894 if (err) { 1895 err = -err; 1896 goto error; 1897 } 1898 if (!config.hca_attr.flow_counters_dump) 1899 priv->counter_fallback = 1; 1900 #ifndef HAVE_IBV_DEVX_ASYNC 1901 priv->counter_fallback = 1; 1902 #endif 1903 if (priv->counter_fallback) 1904 DRV_LOG(INFO, "Use fall-back DV counter management\n"); 1905 /* Check for LRO support. */ 1906 if (config.dest_tir && config.hca_attr.lro_cap) { 1907 /* TBD check tunnel lro caps. */ 1908 config.lro.supported = config.hca_attr.lro_cap; 1909 DRV_LOG(DEBUG, "Device supports LRO"); 1910 /* 1911 * If LRO timeout is not configured by application, 1912 * use the minimal supported value. 1913 */ 1914 if (!config.lro.timeout) 1915 config.lro.timeout = 1916 config.hca_attr.lro_timer_supported_periods[0]; 1917 DRV_LOG(DEBUG, "LRO session timeout set to %d usec", 1918 config.lro.timeout); 1919 } 1920 } 1921 if (config.mprq.enabled && mprq) { 1922 if (config.mprq.stride_num_n > mprq_max_stride_num_n || 1923 config.mprq.stride_num_n < mprq_min_stride_num_n) { 1924 config.mprq.stride_num_n = 1925 RTE_MAX(MLX5_MPRQ_STRIDE_NUM_N, 1926 mprq_min_stride_num_n); 1927 DRV_LOG(WARNING, 1928 "the number of strides" 1929 " for Multi-Packet RQ is out of range," 1930 " setting default value (%u)", 1931 1 << config.mprq.stride_num_n); 1932 } 1933 config.mprq.min_stride_size_n = mprq_min_stride_size_n; 1934 config.mprq.max_stride_size_n = mprq_max_stride_size_n; 1935 } else if (config.mprq.enabled && !mprq) { 1936 DRV_LOG(WARNING, "Multi-Packet RQ isn't supported"); 1937 config.mprq.enabled = 0; 1938 } 1939 if (config.max_dump_files_num == 0) 1940 config.max_dump_files_num = 128; 1941 eth_dev = rte_eth_dev_allocate(name); 1942 if (eth_dev == NULL) { 1943 DRV_LOG(ERR, "can not allocate rte ethdev"); 1944 err = ENOMEM; 1945 goto error; 1946 } 1947 /* Flag to call rte_eth_dev_release_port() in rte_eth_dev_close(). */ 1948 eth_dev->data->dev_flags |= RTE_ETH_DEV_CLOSE_REMOVE; 1949 if (priv->representor) { 1950 eth_dev->data->dev_flags |= RTE_ETH_DEV_REPRESENTOR; 1951 eth_dev->data->representor_id = priv->representor_id; 1952 } 1953 /* 1954 * Store associated network device interface index. This index 1955 * is permanent throughout the lifetime of device. So, we may store 1956 * the ifindex here and use the cached value further. 1957 */ 1958 assert(spawn->ifindex); 1959 priv->if_index = spawn->ifindex; 1960 eth_dev->data->dev_private = priv; 1961 priv->dev_data = eth_dev->data; 1962 eth_dev->data->mac_addrs = priv->mac; 1963 eth_dev->device = dpdk_dev; 1964 /* Configure the first MAC address by default. */ 1965 if (mlx5_get_mac(eth_dev, &mac.addr_bytes)) { 1966 DRV_LOG(ERR, 1967 "port %u cannot get MAC address, is mlx5_en" 1968 " loaded? 
(errno: %s)", 1969 eth_dev->data->port_id, strerror(rte_errno)); 1970 err = ENODEV; 1971 goto error; 1972 } 1973 DRV_LOG(INFO, 1974 "port %u MAC address is %02x:%02x:%02x:%02x:%02x:%02x", 1975 eth_dev->data->port_id, 1976 mac.addr_bytes[0], mac.addr_bytes[1], 1977 mac.addr_bytes[2], mac.addr_bytes[3], 1978 mac.addr_bytes[4], mac.addr_bytes[5]); 1979 #ifndef NDEBUG 1980 { 1981 char ifname[IF_NAMESIZE]; 1982 1983 if (mlx5_get_ifname(eth_dev, &ifname) == 0) 1984 DRV_LOG(DEBUG, "port %u ifname is \"%s\"", 1985 eth_dev->data->port_id, ifname); 1986 else 1987 DRV_LOG(DEBUG, "port %u ifname is unknown", 1988 eth_dev->data->port_id); 1989 } 1990 #endif 1991 /* Get actual MTU if possible. */ 1992 err = mlx5_get_mtu(eth_dev, &priv->mtu); 1993 if (err) { 1994 err = rte_errno; 1995 goto error; 1996 } 1997 DRV_LOG(DEBUG, "port %u MTU is %u", eth_dev->data->port_id, 1998 priv->mtu); 1999 /* Initialize burst functions to prevent crashes before link-up. */ 2000 eth_dev->rx_pkt_burst = removed_rx_burst; 2001 eth_dev->tx_pkt_burst = removed_tx_burst; 2002 eth_dev->dev_ops = &mlx5_dev_ops; 2003 /* Register MAC address. */ 2004 claim_zero(mlx5_mac_addr_add(eth_dev, &mac, 0, 0)); 2005 if (config.vf && config.vf_nl_en) 2006 mlx5_nl_mac_addr_sync(eth_dev); 2007 TAILQ_INIT(&priv->flows); 2008 TAILQ_INIT(&priv->ctrl_flows); 2009 /* Hint libmlx5 to use PMD allocator for data plane resources */ 2010 struct mlx5dv_ctx_allocators alctr = { 2011 .alloc = &mlx5_alloc_verbs_buf, 2012 .free = &mlx5_free_verbs_buf, 2013 .data = priv, 2014 }; 2015 mlx5_glue->dv_set_context_attr(sh->ctx, 2016 MLX5DV_CTX_ATTR_BUF_ALLOCATORS, 2017 (void *)((uintptr_t)&alctr)); 2018 /* Bring Ethernet device up. */ 2019 DRV_LOG(DEBUG, "port %u forcing Ethernet interface up", 2020 eth_dev->data->port_id); 2021 mlx5_set_link_up(eth_dev); 2022 /* 2023 * Even though the interrupt handler is not installed yet, 2024 * interrupts will still trigger on the async_fd from 2025 * Verbs context returned by ibv_open_device(). 2026 */ 2027 mlx5_link_update(eth_dev, 0); 2028 #ifdef HAVE_MLX5DV_DR_ESWITCH 2029 if (!(config.hca_attr.eswitch_manager && config.dv_flow_en && 2030 (switch_info->representor || switch_info->master))) 2031 config.dv_esw_en = 0; 2032 #else 2033 config.dv_esw_en = 0; 2034 #endif 2035 /* Detect minimal data bytes to inline. */ 2036 mlx5_set_min_inline(spawn, &config); 2037 /* Store device configuration on private structure. */ 2038 priv->config = config; 2039 /* Create context for virtual machine VLAN workaround. */ 2040 priv->vmwa_context = mlx5_vlan_vmwa_init(eth_dev, spawn->ifindex); 2041 if (config.dv_flow_en) { 2042 err = mlx5_alloc_shared_dr(priv); 2043 if (err) 2044 goto error; 2045 } 2046 /* Supported Verbs flow priority number detection. 
*/ 2047 err = mlx5_flow_discover_priorities(eth_dev); 2048 if (err < 0) { 2049 err = -err; 2050 goto error; 2051 } 2052 priv->config.flow_prio = err; 2053 return eth_dev; 2054 error: 2055 if (priv) { 2056 if (priv->sh) 2057 mlx5_free_shared_dr(priv); 2058 if (priv->nl_socket_route >= 0) 2059 close(priv->nl_socket_route); 2060 if (priv->nl_socket_rdma >= 0) 2061 close(priv->nl_socket_rdma); 2062 if (priv->vmwa_context) 2063 mlx5_vlan_vmwa_exit(priv->vmwa_context); 2064 if (own_domain_id) 2065 claim_zero(rte_eth_switch_domain_free(priv->domain_id)); 2066 rte_free(priv); 2067 if (eth_dev != NULL) 2068 eth_dev->data->dev_private = NULL; 2069 } 2070 if (eth_dev != NULL) { 2071 /* mac_addrs must not be freed alone because part of dev_private */ 2072 eth_dev->data->mac_addrs = NULL; 2073 rte_eth_dev_release_port(eth_dev); 2074 } 2075 if (sh) 2076 mlx5_free_shared_ibctx(sh); 2077 assert(err > 0); 2078 rte_errno = err; 2079 return NULL; 2080 } 2081 2082 /** 2083 * Comparison callback to sort device data. 2084 * 2085 * This is meant to be used with qsort(). 2086 * 2087 * @param a[in] 2088 * Pointer to pointer to first data object. 2089 * @param b[in] 2090 * Pointer to pointer to second data object. 2091 * 2092 * @return 2093 * 0 if both objects are equal, less than 0 if the first argument is less 2094 * than the second, greater than 0 otherwise. 2095 */ 2096 static int 2097 mlx5_dev_spawn_data_cmp(const void *a, const void *b) 2098 { 2099 const struct mlx5_switch_info *si_a = 2100 &((const struct mlx5_dev_spawn_data *)a)->info; 2101 const struct mlx5_switch_info *si_b = 2102 &((const struct mlx5_dev_spawn_data *)b)->info; 2103 int ret; 2104 2105 /* Master device first. */ 2106 ret = si_b->master - si_a->master; 2107 if (ret) 2108 return ret; 2109 /* Then representor devices. */ 2110 ret = si_b->representor - si_a->representor; 2111 if (ret) 2112 return ret; 2113 /* Unidentified devices come last in no specific order. */ 2114 if (!si_a->representor) 2115 return 0; 2116 /* Order representors by name. */ 2117 return si_a->port_name - si_b->port_name; 2118 } 2119 2120 /** 2121 * DPDK callback to register a PCI device. 2122 * 2123 * This function spawns Ethernet devices out of a given PCI device. 2124 * 2125 * @param[in] pci_drv 2126 * PCI driver structure (mlx5_driver). 2127 * @param[in] pci_dev 2128 * PCI device information. 2129 * 2130 * @return 2131 * 0 on success, a negative errno value otherwise and rte_errno is set. 2132 */ 2133 static int 2134 mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused, 2135 struct rte_pci_device *pci_dev) 2136 { 2137 struct ibv_device **ibv_list; 2138 /* 2139 * Number of found IB Devices matching with requested PCI BDF. 2140 * nd != 1 means there are multiple IB devices over the same 2141 * PCI device and we have representors and master. 2142 */ 2143 unsigned int nd = 0; 2144 /* 2145 * Number of found IB device Ports. nd = 1 and np = 1..n means 2146 * we have the single multiport IB device, and there may be 2147 * representors attached to some of found ports. 2148 */ 2149 unsigned int np = 0; 2150 /* 2151 * Number of DPDK ethernet devices to Spawn - either over 2152 * multiple IB devices or multiple ports of single IB device. 2153 * Actually this is the number of iterations to spawn. 
2154 */
2155 unsigned int ns = 0;
2156 struct mlx5_dev_config dev_config;
2157 int ret;
2158
2159 ret = mlx5_init_once();
2160 if (ret) {
2161 DRV_LOG(ERR, "unable to init PMD global data: %s",
2162 strerror(rte_errno));
2163 return -rte_errno;
2164 }
2165 assert(pci_drv == &mlx5_driver);
2166 errno = 0;
2167 ibv_list = mlx5_glue->get_device_list(&ret);
2168 if (!ibv_list) {
2169 rte_errno = errno ? errno : ENOSYS;
2170 DRV_LOG(ERR, "cannot list devices, is ib_uverbs loaded?");
2171 return -rte_errno;
2172 }
2173 /*
2174 * First scan the list of all Infiniband devices to find
2175 * matching ones and gather them into a list.
2176 */
2177 struct ibv_device *ibv_match[ret + 1];
2178 int nl_route = -1;
2179 int nl_rdma = -1;
2180 unsigned int i;
2181
2182 while (ret-- > 0) {
2183 struct rte_pci_addr pci_addr;
2184
2185 DRV_LOG(DEBUG, "checking device \"%s\"", ibv_list[ret]->name);
2186 if (mlx5_ibv_device_to_pci_addr(ibv_list[ret], &pci_addr))
2187 continue;
2188 if (pci_dev->addr.domain != pci_addr.domain ||
2189 pci_dev->addr.bus != pci_addr.bus ||
2190 pci_dev->addr.devid != pci_addr.devid ||
2191 pci_dev->addr.function != pci_addr.function)
2192 continue;
2193 DRV_LOG(INFO, "PCI information matches for device \"%s\"",
2194 ibv_list[ret]->name);
2195 ibv_match[nd++] = ibv_list[ret];
2196 }
2197 ibv_match[nd] = NULL;
2198 if (!nd) {
2199 /* No device matches, just complain and bail out. */
2200 mlx5_glue->free_device_list(ibv_list);
2201 DRV_LOG(WARNING,
2202 "no Verbs device matches PCI device " PCI_PRI_FMT ","
2203 " are kernel drivers loaded?",
2204 pci_dev->addr.domain, pci_dev->addr.bus,
2205 pci_dev->addr.devid, pci_dev->addr.function);
2206 rte_errno = ENOENT;
2207 ret = -rte_errno;
2208 return ret;
2209 }
2210 nl_route = mlx5_nl_init(NETLINK_ROUTE);
2211 nl_rdma = mlx5_nl_init(NETLINK_RDMA);
2212 if (nd == 1) {
2213 /*
2214 * The single matching device found may have multiple ports.
2215 * Each port may be a representor, so query the number of
2216 * ports and check each of them for representors.
2217 */
2218 if (nl_rdma >= 0)
2219 np = mlx5_nl_portnum(nl_rdma, ibv_match[0]->name);
2220 if (!np)
2221 DRV_LOG(WARNING, "cannot get the number of ports"
2222 " for IB device \"%s\"", ibv_match[0]->name);
2223 }
2224 /*
2225 * Now we can determine the maximal
2226 * number of devices to be spawned.
2227 */
2228 struct mlx5_dev_spawn_data list[np ? np : nd];
2229
2230 if (np > 1) {
2231 /*
2232 * A single IB device with multiple ports was found; it may
2233 * be an E-Switch master device with representors.
2234 * We have to perform the identification through the ports.
2235 */
2236 assert(nl_rdma >= 0);
2237 assert(ns == 0);
2238 assert(nd == 1);
2239 for (i = 1; i <= np; ++i) {
2240 list[ns].max_port = np;
2241 list[ns].ibv_port = i;
2242 list[ns].ibv_dev = ibv_match[0];
2243 list[ns].eth_dev = NULL;
2244 list[ns].pci_dev = pci_dev;
2245 list[ns].ifindex = mlx5_nl_ifindex
2246 (nl_rdma, list[ns].ibv_dev->name, i);
2247 if (!list[ns].ifindex) {
2248 /*
2249 * No network interface index found for the
2250 * specified port; it means there is no
2251 * representor on this port. That is OK,
2252 * there can be disabled ports, for example
2253 * if sriov_numvfs < sriov_totalvfs.
2254 */
2255 continue;
2256 }
2257 ret = -1;
2258 if (nl_route >= 0)
2259 ret = mlx5_nl_switch_info
2260 (nl_route,
2261 list[ns].ifindex,
2262 &list[ns].info);
2263 if (ret || (!list[ns].info.representor &&
2264 !list[ns].info.master)) {
2265 /*
2266 * We failed to recognize representors with
2267 * Netlink; let's try to perform the task
2268 * with sysfs.
2269 */
2270 ret = mlx5_sysfs_switch_info
2271 (list[ns].ifindex,
2272 &list[ns].info);
2273 }
2274 if (!ret && (list[ns].info.representor ^
2275 list[ns].info.master))
2276 ns++;
2277 }
2278 if (!ns) {
2279 DRV_LOG(ERR,
2280 "unable to recognize master/representors"
2281 " on the IB device with multiple ports");
2282 rte_errno = ENOENT;
2283 ret = -rte_errno;
2284 goto exit;
2285 }
2286 } else {
2287 /*
2288 * The existence of several matching entries (nd > 1) means
2289 * port representors have been instantiated. No existing Verbs
2290 * call nor sysfs entry can tell them apart; this can only
2291 * be done through Netlink calls assuming kernel drivers are
2292 * recent enough to support them.
2293 *
2294 * In the event of identification failure through Netlink,
2295 * try again through sysfs, then:
2296 *
2297 * 1. A single IB device matches (nd == 1) with a single
2298 * port (np = 0/1) and is not a representor; assume
2299 * no switch support.
2300 *
2301 * 2. Otherwise no safe assumptions can be made;
2302 * complain louder and bail out.
2303 */
2304 np = 1;
2305 for (i = 0; i != nd; ++i) {
2306 memset(&list[ns].info, 0, sizeof(list[ns].info));
2307 list[ns].max_port = 1;
2308 list[ns].ibv_port = 1;
2309 list[ns].ibv_dev = ibv_match[i];
2310 list[ns].eth_dev = NULL;
2311 list[ns].pci_dev = pci_dev;
2312 list[ns].ifindex = 0;
2313 if (nl_rdma >= 0)
2314 list[ns].ifindex = mlx5_nl_ifindex
2315 (nl_rdma, list[ns].ibv_dev->name, 1);
2316 if (!list[ns].ifindex) {
2317 char ifname[IF_NAMESIZE];
2318
2319 /*
2320 * Netlink failed; this may happen with an old
2321 * ib_core kernel driver (before 4.16).
2322 * We can assume the driver is old because
2323 * here we are processing single-port IB
2324 * devices. Let's try sysfs to retrieve
2325 * the ifindex. The method works for the
2326 * master device only.
2327 */
2328 if (nd > 1) {
2329 /*
2330 * Multiple devices found; assume they are
2331 * representors, since we can neither
2332 * distinguish master from representor nor
2333 * retrieve the ifindex via sysfs.
2334 */
2335 continue;
2336 }
2337 ret = mlx5_get_master_ifname
2338 (ibv_match[i]->ibdev_path, &ifname);
2339 if (!ret)
2340 list[ns].ifindex =
2341 if_nametoindex(ifname);
2342 if (!list[ns].ifindex) {
2343 /*
2344 * No network interface index found
2345 * for the specified device; it means
2346 * the device is neither a representor
2347 * nor a master.
2348 */
2349 continue;
2350 }
2351 }
2352 ret = -1;
2353 if (nl_route >= 0)
2354 ret = mlx5_nl_switch_info
2355 (nl_route,
2356 list[ns].ifindex,
2357 &list[ns].info);
2358 if (ret || (!list[ns].info.representor &&
2359 !list[ns].info.master)) {
2360 /*
2361 * We failed to recognize representors with
2362 * Netlink; let's try to perform the task
2363 * with sysfs.
2364 */
2365 ret = mlx5_sysfs_switch_info
2366 (list[ns].ifindex,
2367 &list[ns].info);
2368 }
2369 if (!ret && (list[ns].info.representor ^
2370 list[ns].info.master)) {
2371 ns++;
2372 } else if ((nd == 1) &&
2373 !list[ns].info.representor &&
2374 !list[ns].info.master) {
2375 /*
2376 * A single IB device with
2377 * one physical port and an
2378 * attached network device.
2379 * Maybe SR-IOV is not enabled
2380 * or there are no representors.
2381 */
2382 DRV_LOG(INFO, "no E-Switch support detected");
2383 ns++;
2384 break;
2385 }
2386 }
2387 if (!ns) {
2388 DRV_LOG(ERR,
2389 "unable to recognize master/representors"
2390 " among the multiple IB devices");
2391 rte_errno = ENOENT;
2392 ret = -rte_errno;
2393 goto exit;
2394 }
2395 }
2396 assert(ns);
2397 /*
2398 * Sort list to probe devices in natural order for users' convenience
2399 * (i.e. master first, then representors from lowest to highest ID).
2400 */
2401 qsort(list, ns, sizeof(*list), mlx5_dev_spawn_data_cmp);
2402 /* Default configuration. */
2403 dev_config = (struct mlx5_dev_config){
2404 .hw_padding = 0,
2405 .mps = MLX5_ARG_UNSET,
2406 .rx_vec_en = 1,
2407 .txq_inline_max = MLX5_ARG_UNSET,
2408 .txq_inline_min = MLX5_ARG_UNSET,
2409 .txq_inline_mpw = MLX5_ARG_UNSET,
2410 .txqs_inline = MLX5_ARG_UNSET,
2411 .vf_nl_en = 1,
2412 .mr_ext_memseg_en = 1,
2413 .mprq = {
2414 .enabled = 0, /* Disabled by default. */
2415 .stride_num_n = MLX5_MPRQ_STRIDE_NUM_N,
2416 .max_memcpy_len = MLX5_MPRQ_MEMCPY_DEFAULT_LEN,
2417 .min_rxqs_num = MLX5_MPRQ_MIN_RXQS,
2418 },
2419 .dv_esw_en = 1,
2420 };
2421 /* Device specific configuration. */
2422 switch (pci_dev->id.device_id) {
2423 case PCI_DEVICE_ID_MELLANOX_CONNECTX4VF:
2424 case PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF:
2425 case PCI_DEVICE_ID_MELLANOX_CONNECTX5VF:
2426 case PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF:
2427 dev_config.vf = 1;
2428 break;
2429 default:
2430 break;
2431 }
2432 for (i = 0; i != ns; ++i) {
2433 uint32_t restore;
2434
2435 list[i].eth_dev = mlx5_dev_spawn(&pci_dev->device,
2436 &list[i],
2437 dev_config);
2438 if (!list[i].eth_dev) {
2439 if (rte_errno != EBUSY && rte_errno != EEXIST)
2440 break;
2441 /* Device is disabled or already spawned. Ignore it. */
2442 continue;
2443 }
2444 restore = list[i].eth_dev->data->dev_flags;
2445 rte_eth_copy_pci_info(list[i].eth_dev, pci_dev);
2446 /* Restore non-PCI flags cleared by the above call. */
2447 list[i].eth_dev->data->dev_flags |= restore;
2448 rte_eth_dev_probing_finish(list[i].eth_dev);
2449 }
2450 if (i != ns) {
2451 DRV_LOG(ERR,
2452 "probe of PCI device " PCI_PRI_FMT " aborted after"
2453 " encountering an error: %s",
2454 pci_dev->addr.domain, pci_dev->addr.bus,
2455 pci_dev->addr.devid, pci_dev->addr.function,
2456 strerror(rte_errno));
2457 ret = -rte_errno;
2458 /* Roll back. */
2459 while (i--) {
2460 if (!list[i].eth_dev)
2461 continue;
2462 mlx5_dev_close(list[i].eth_dev);
2463 /* mac_addrs must not be freed because it is part of dev_private. */
2464 list[i].eth_dev->data->mac_addrs = NULL;
2465 claim_zero(rte_eth_dev_release_port(list[i].eth_dev));
2466 }
2467 /* Restore original error. */
2468 rte_errno = -ret;
2469 } else {
2470 ret = 0;
2471 }
2472 exit:
2473 /*
2474 * Do the routine cleanup:
2475 * - close opened Netlink sockets
2476 * - free the Infiniband device list
2477 */
2478 if (nl_rdma >= 0)
2479 close(nl_rdma);
2480 if (nl_route >= 0)
2481 close(nl_route);
2482 assert(ibv_list);
2483 mlx5_glue->free_device_list(ibv_list);
2484 return ret;
2485 }
2486
2487 /**
2488 * DPDK callback to remove a PCI device.
2489 *
2490 * This function removes all Ethernet devices belonging to a given PCI device.
2491 *
2492 * @param[in] pci_dev
2493 * Pointer to the PCI device.
2494 *
2495 * @return
2496 * 0 on success, the function cannot fail.
2497 */ 2498 static int 2499 mlx5_pci_remove(struct rte_pci_device *pci_dev) 2500 { 2501 uint16_t port_id; 2502 2503 RTE_ETH_FOREACH_DEV_OF(port_id, &pci_dev->device) 2504 rte_eth_dev_close(port_id); 2505 return 0; 2506 } 2507 2508 static const struct rte_pci_id mlx5_pci_id_map[] = { 2509 { 2510 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, 2511 PCI_DEVICE_ID_MELLANOX_CONNECTX4) 2512 }, 2513 { 2514 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, 2515 PCI_DEVICE_ID_MELLANOX_CONNECTX4VF) 2516 }, 2517 { 2518 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, 2519 PCI_DEVICE_ID_MELLANOX_CONNECTX4LX) 2520 }, 2521 { 2522 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, 2523 PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF) 2524 }, 2525 { 2526 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, 2527 PCI_DEVICE_ID_MELLANOX_CONNECTX5) 2528 }, 2529 { 2530 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, 2531 PCI_DEVICE_ID_MELLANOX_CONNECTX5VF) 2532 }, 2533 { 2534 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, 2535 PCI_DEVICE_ID_MELLANOX_CONNECTX5EX) 2536 }, 2537 { 2538 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, 2539 PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF) 2540 }, 2541 { 2542 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, 2543 PCI_DEVICE_ID_MELLANOX_CONNECTX5BF) 2544 }, 2545 { 2546 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, 2547 PCI_DEVICE_ID_MELLANOX_CONNECTX5BFVF) 2548 }, 2549 { 2550 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, 2551 PCI_DEVICE_ID_MELLANOX_CONNECTX6) 2552 }, 2553 { 2554 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, 2555 PCI_DEVICE_ID_MELLANOX_CONNECTX6VF) 2556 }, 2557 { 2558 .vendor_id = 0 2559 } 2560 }; 2561 2562 static struct rte_pci_driver mlx5_driver = { 2563 .driver = { 2564 .name = MLX5_DRIVER_NAME 2565 }, 2566 .id_table = mlx5_pci_id_map, 2567 .probe = mlx5_pci_probe, 2568 .remove = mlx5_pci_remove, 2569 .dma_map = mlx5_dma_map, 2570 .dma_unmap = mlx5_dma_unmap, 2571 .drv_flags = RTE_PCI_DRV_INTR_LSC | RTE_PCI_DRV_INTR_RMV | 2572 RTE_PCI_DRV_PROBE_AGAIN, 2573 }; 2574 2575 #ifdef RTE_IBVERBS_LINK_DLOPEN 2576 2577 /** 2578 * Suffix RTE_EAL_PMD_PATH with "-glue". 2579 * 2580 * This function performs a sanity check on RTE_EAL_PMD_PATH before 2581 * suffixing its last component. 2582 * 2583 * @param buf[out] 2584 * Output buffer, should be large enough otherwise NULL is returned. 2585 * @param size 2586 * Size of @p out. 2587 * 2588 * @return 2589 * Pointer to @p buf or @p NULL in case suffix cannot be appended. 2590 */ 2591 static char * 2592 mlx5_glue_path(char *buf, size_t size) 2593 { 2594 static const char *const bad[] = { "/", ".", "..", NULL }; 2595 const char *path = RTE_EAL_PMD_PATH; 2596 size_t len = strlen(path); 2597 size_t off; 2598 int i; 2599 2600 while (len && path[len - 1] == '/') 2601 --len; 2602 for (off = len; off && path[off - 1] != '/'; --off) 2603 ; 2604 for (i = 0; bad[i]; ++i) 2605 if (!strncmp(path + off, bad[i], (int)(len - off))) 2606 goto error; 2607 i = snprintf(buf, size, "%.*s-glue", (int)len, path); 2608 if (i == -1 || (size_t)i >= size) 2609 goto error; 2610 return buf; 2611 error: 2612 DRV_LOG(ERR, 2613 "unable to append \"-glue\" to last component of" 2614 " RTE_EAL_PMD_PATH (\"" RTE_EAL_PMD_PATH "\")," 2615 " please re-configure DPDK"); 2616 return NULL; 2617 } 2618 2619 /** 2620 * Initialization routine for run-time dependency on rdma-core. 2621 */ 2622 static int 2623 mlx5_glue_init(void) 2624 { 2625 char glue_path[sizeof(RTE_EAL_PMD_PATH) - 1 + sizeof("-glue")]; 2626 const char *path[] = { 2627 /* 2628 * A basic security check is necessary before trusting 2629 * MLX5_GLUE_PATH, which may override RTE_EAL_PMD_PATH. 
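 * The variable is honored only when the real and effective user
 * and group IDs match, so that a setuid/setgid process cannot be
 * tricked into dlopen()ing an arbitrary library.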
2630 */ 2631 (geteuid() == getuid() && getegid() == getgid() ? 2632 getenv("MLX5_GLUE_PATH") : NULL), 2633 /* 2634 * When RTE_EAL_PMD_PATH is set, use its glue-suffixed 2635 * variant, otherwise let dlopen() look up libraries on its 2636 * own. 2637 */ 2638 (*RTE_EAL_PMD_PATH ? 2639 mlx5_glue_path(glue_path, sizeof(glue_path)) : ""), 2640 }; 2641 unsigned int i = 0; 2642 void *handle = NULL; 2643 void **sym; 2644 const char *dlmsg; 2645 2646 while (!handle && i != RTE_DIM(path)) { 2647 const char *end; 2648 size_t len; 2649 int ret; 2650 2651 if (!path[i]) { 2652 ++i; 2653 continue; 2654 } 2655 end = strpbrk(path[i], ":;"); 2656 if (!end) 2657 end = path[i] + strlen(path[i]); 2658 len = end - path[i]; 2659 ret = 0; 2660 do { 2661 char name[ret + 1]; 2662 2663 ret = snprintf(name, sizeof(name), "%.*s%s" MLX5_GLUE, 2664 (int)len, path[i], 2665 (!len || *(end - 1) == '/') ? "" : "/"); 2666 if (ret == -1) 2667 break; 2668 if (sizeof(name) != (size_t)ret + 1) 2669 continue; 2670 DRV_LOG(DEBUG, "looking for rdma-core glue as \"%s\"", 2671 name); 2672 handle = dlopen(name, RTLD_LAZY); 2673 break; 2674 } while (1); 2675 path[i] = end + 1; 2676 if (!*end) 2677 ++i; 2678 } 2679 if (!handle) { 2680 rte_errno = EINVAL; 2681 dlmsg = dlerror(); 2682 if (dlmsg) 2683 DRV_LOG(WARNING, "cannot load glue library: %s", dlmsg); 2684 goto glue_error; 2685 } 2686 sym = dlsym(handle, "mlx5_glue"); 2687 if (!sym || !*sym) { 2688 rte_errno = EINVAL; 2689 dlmsg = dlerror(); 2690 if (dlmsg) 2691 DRV_LOG(ERR, "cannot resolve glue symbol: %s", dlmsg); 2692 goto glue_error; 2693 } 2694 mlx5_glue = *sym; 2695 return 0; 2696 glue_error: 2697 if (handle) 2698 dlclose(handle); 2699 DRV_LOG(WARNING, 2700 "cannot initialize PMD due to missing run-time dependency on" 2701 " rdma-core libraries (libibverbs, libmlx5)"); 2702 return -rte_errno; 2703 } 2704 2705 #endif 2706 2707 /** 2708 * Driver initialization routine. 2709 */ 2710 RTE_INIT(rte_mlx5_pmd_init) 2711 { 2712 /* Initialize driver log type. */ 2713 mlx5_logtype = rte_log_register("pmd.net.mlx5"); 2714 if (mlx5_logtype >= 0) 2715 rte_log_set_level(mlx5_logtype, RTE_LOG_NOTICE); 2716 2717 /* Build the static tables for Verbs conversion. */ 2718 mlx5_set_ptype_table(); 2719 mlx5_set_cksum_table(); 2720 mlx5_set_swp_types_table(); 2721 /* 2722 * RDMAV_HUGEPAGES_SAFE tells ibv_fork_init() we intend to use 2723 * huge pages. Calling ibv_fork_init() during init allows 2724 * applications to use fork() safely for purposes other than 2725 * using this PMD, which is not supported in forked processes. 2726 */ 2727 setenv("RDMAV_HUGEPAGES_SAFE", "1", 1); 2728 /* Match the size of Rx completion entry to the size of a cacheline. */ 2729 if (RTE_CACHE_LINE_SIZE == 128) 2730 setenv("MLX5_CQE_SIZE", "128", 0); 2731 /* 2732 * MLX5_DEVICE_FATAL_CLEANUP tells ibv_destroy functions to 2733 * cleanup all the Verbs resources even when the device was removed. 2734 */ 2735 setenv("MLX5_DEVICE_FATAL_CLEANUP", "1", 1); 2736 #ifdef RTE_IBVERBS_LINK_DLOPEN 2737 if (mlx5_glue_init()) 2738 return; 2739 assert(mlx5_glue); 2740 #endif 2741 #ifndef NDEBUG 2742 /* Glue structure must not contain any NULL pointers. 
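 * The structure holds the glue version string and the rdma-core
 * wrapper function pointers; a NULL entry would indicate an
 * incomplete or mismatched glue object.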
*/ 2743 { 2744 unsigned int i; 2745 2746 for (i = 0; i != sizeof(*mlx5_glue) / sizeof(void *); ++i) 2747 assert(((const void *const *)mlx5_glue)[i]); 2748 } 2749 #endif 2750 if (strcmp(mlx5_glue->version, MLX5_GLUE_VERSION)) { 2751 DRV_LOG(ERR, 2752 "rdma-core glue \"%s\" mismatch: \"%s\" is required", 2753 mlx5_glue->version, MLX5_GLUE_VERSION); 2754 return; 2755 } 2756 mlx5_glue->fork_init(); 2757 rte_pci_register(&mlx5_driver); 2758 } 2759 2760 RTE_PMD_EXPORT_NAME(net_mlx5, __COUNTER__); 2761 RTE_PMD_REGISTER_PCI_TABLE(net_mlx5, mlx5_pci_id_map); 2762 RTE_PMD_REGISTER_KMOD_DEP(net_mlx5, "* ib_uverbs & mlx5_core & mlx5_ib"); 2763
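/*
 * Illustrative note on glue library lookup (example path assumed, not part
 * of the build): with RTE_EAL_PMD_PATH set to "/usr/lib/dpdk/pmds",
 * mlx5_glue_path() yields "/usr/lib/dpdk/pmds-glue" and mlx5_glue_init()
 * dlopen()s the glue shared object from there, after first trying the
 * directories listed in MLX5_GLUE_PATH (':' or ';' separated) when the
 * process is not running setuid/setgid. When RTE_EAL_PMD_PATH is empty,
 * the glue library is resolved through the default dlopen() search path
 * instead.
 */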