1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright 2015 6WIND S.A. 3 * Copyright 2020 Mellanox Technologies, Ltd 4 */ 5 6 #include <stddef.h> 7 #include <unistd.h> 8 #include <string.h> 9 #include <stdint.h> 10 #include <stdlib.h> 11 #include <errno.h> 12 #include <net/if.h> 13 #include <linux/rtnetlink.h> 14 #include <linux/sockios.h> 15 #include <linux/ethtool.h> 16 #include <fcntl.h> 17 18 #include <rte_malloc.h> 19 #include <rte_ethdev_driver.h> 20 #include <rte_ethdev_pci.h> 21 #include <rte_pci.h> 22 #include <rte_bus_pci.h> 23 #include <rte_common.h> 24 #include <rte_kvargs.h> 25 #include <rte_rwlock.h> 26 #include <rte_spinlock.h> 27 #include <rte_string_fns.h> 28 #include <rte_alarm.h> 29 #include <rte_eal_paging.h> 30 31 #include <mlx5_glue.h> 32 #include <mlx5_devx_cmds.h> 33 #include <mlx5_common.h> 34 #include <mlx5_common_mp.h> 35 #include <mlx5_common_mr.h> 36 #include <mlx5_malloc.h> 37 38 #include "mlx5_defs.h" 39 #include "mlx5.h" 40 #include "mlx5_common_os.h" 41 #include "mlx5_utils.h" 42 #include "mlx5_rxtx.h" 43 #include "mlx5_autoconf.h" 44 #include "mlx5_mr.h" 45 #include "mlx5_flow.h" 46 #include "rte_pmd_mlx5.h" 47 #include "mlx5_verbs.h" 48 #include "mlx5_nl.h" 49 #include "mlx5_devx.h" 50 51 #define MLX5_TAGS_HLIST_ARRAY_SIZE 8192 52 53 #ifndef HAVE_IBV_MLX5_MOD_MPW 54 #define MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED (1 << 2) 55 #define MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW (1 << 3) 56 #endif 57 58 #ifndef HAVE_IBV_MLX5_MOD_CQE_128B_COMP 59 #define MLX5DV_CONTEXT_FLAGS_CQE_128B_COMP (1 << 4) 60 #endif 61 62 static const char *MZ_MLX5_PMD_SHARED_DATA = "mlx5_pmd_shared_data"; 63 64 /* Spinlock for mlx5_shared_data allocation. */ 65 static rte_spinlock_t mlx5_shared_data_lock = RTE_SPINLOCK_INITIALIZER; 66 67 /* Process local data for secondary processes. */ 68 static struct mlx5_local_data mlx5_local_data; 69 70 /** 71 * Set the completion channel file descriptor interrupt as non-blocking. 72 * 73 * @param[in] rxq_obj 74 * Pointer to RQ channel object, which includes the channel fd 75 * 76 * @param[out] fd 77 * The file descriptor (representing the intetrrupt) used in this channel. 78 * 79 * @return 80 * 0 on successfully setting the fd to non-blocking, non-zero otherwise. 81 */ 82 int 83 mlx5_os_set_nonblock_channel_fd(int fd) 84 { 85 int flags; 86 87 flags = fcntl(fd, F_GETFL); 88 return fcntl(fd, F_SETFL, flags | O_NONBLOCK); 89 } 90 91 /** 92 * Get mlx5 device attributes. The glue function query_device_ex() is called 93 * with out parameter of type 'struct ibv_device_attr_ex *'. Then fill in mlx5 94 * device attributes from the glue out parameter. 95 * 96 * @param dev 97 * Pointer to ibv context. 98 * 99 * @param device_attr 100 * Pointer to mlx5 device attributes. 101 * 102 * @return 103 * 0 on success, non zero error number otherwise 104 */ 105 int 106 mlx5_os_get_dev_attr(void *ctx, struct mlx5_dev_attr *device_attr) 107 { 108 int err; 109 struct ibv_device_attr_ex attr_ex; 110 memset(device_attr, 0, sizeof(*device_attr)); 111 err = mlx5_glue->query_device_ex(ctx, NULL, &attr_ex); 112 if (err) 113 return err; 114 115 device_attr->device_cap_flags_ex = attr_ex.device_cap_flags_ex; 116 device_attr->max_qp_wr = attr_ex.orig_attr.max_qp_wr; 117 device_attr->max_sge = attr_ex.orig_attr.max_sge; 118 device_attr->max_cq = attr_ex.orig_attr.max_cq; 119 device_attr->max_qp = attr_ex.orig_attr.max_qp; 120 device_attr->raw_packet_caps = attr_ex.raw_packet_caps; 121 device_attr->max_rwq_indirection_table_size = 122 attr_ex.rss_caps.max_rwq_indirection_table_size; 123 device_attr->max_tso = attr_ex.tso_caps.max_tso; 124 device_attr->tso_supported_qpts = attr_ex.tso_caps.supported_qpts; 125 126 struct mlx5dv_context dv_attr = { .comp_mask = 0 }; 127 err = mlx5_glue->dv_query_device(ctx, &dv_attr); 128 if (err) 129 return err; 130 131 device_attr->flags = dv_attr.flags; 132 device_attr->comp_mask = dv_attr.comp_mask; 133 #ifdef HAVE_IBV_MLX5_MOD_SWP 134 device_attr->sw_parsing_offloads = 135 dv_attr.sw_parsing_caps.sw_parsing_offloads; 136 #endif 137 device_attr->min_single_stride_log_num_of_bytes = 138 dv_attr.striding_rq_caps.min_single_stride_log_num_of_bytes; 139 device_attr->max_single_stride_log_num_of_bytes = 140 dv_attr.striding_rq_caps.max_single_stride_log_num_of_bytes; 141 device_attr->min_single_wqe_log_num_of_strides = 142 dv_attr.striding_rq_caps.min_single_wqe_log_num_of_strides; 143 device_attr->max_single_wqe_log_num_of_strides = 144 dv_attr.striding_rq_caps.max_single_wqe_log_num_of_strides; 145 device_attr->stride_supported_qpts = 146 dv_attr.striding_rq_caps.supported_qpts; 147 #ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT 148 device_attr->tunnel_offloads_caps = dv_attr.tunnel_offloads_caps; 149 #endif 150 151 return err; 152 } 153 154 /** 155 * Verbs callback to allocate a memory. This function should allocate the space 156 * according to the size provided residing inside a huge page. 157 * Please note that all allocation must respect the alignment from libmlx5 158 * (i.e. currently rte_mem_page_size()). 159 * 160 * @param[in] size 161 * The size in bytes of the memory to allocate. 162 * @param[in] data 163 * A pointer to the callback data. 164 * 165 * @return 166 * Allocated buffer, NULL otherwise and rte_errno is set. 167 */ 168 static void * 169 mlx5_alloc_verbs_buf(size_t size, void *data) 170 { 171 struct mlx5_priv *priv = data; 172 void *ret; 173 unsigned int socket = SOCKET_ID_ANY; 174 size_t alignment = rte_mem_page_size(); 175 if (alignment == (size_t)-1) { 176 DRV_LOG(ERR, "Failed to get mem page size"); 177 rte_errno = ENOMEM; 178 return NULL; 179 } 180 181 if (priv->verbs_alloc_ctx.type == MLX5_VERBS_ALLOC_TYPE_TX_QUEUE) { 182 const struct mlx5_txq_ctrl *ctrl = priv->verbs_alloc_ctx.obj; 183 184 socket = ctrl->socket; 185 } else if (priv->verbs_alloc_ctx.type == 186 MLX5_VERBS_ALLOC_TYPE_RX_QUEUE) { 187 const struct mlx5_rxq_ctrl *ctrl = priv->verbs_alloc_ctx.obj; 188 189 socket = ctrl->socket; 190 } 191 MLX5_ASSERT(data != NULL); 192 ret = mlx5_malloc(0, size, alignment, socket); 193 if (!ret && size) 194 rte_errno = ENOMEM; 195 return ret; 196 } 197 198 /** 199 * Verbs callback to free a memory. 200 * 201 * @param[in] ptr 202 * A pointer to the memory to free. 203 * @param[in] data 204 * A pointer to the callback data. 205 */ 206 static void 207 mlx5_free_verbs_buf(void *ptr, void *data __rte_unused) 208 { 209 MLX5_ASSERT(data != NULL); 210 mlx5_free(ptr); 211 } 212 213 /** 214 * Initialize DR related data within private structure. 215 * Routine checks the reference counter and does actual 216 * resources creation/initialization only if counter is zero. 217 * 218 * @param[in] priv 219 * Pointer to the private device data structure. 220 * 221 * @return 222 * Zero on success, positive error code otherwise. 223 */ 224 static int 225 mlx5_alloc_shared_dr(struct mlx5_priv *priv) 226 { 227 struct mlx5_dev_ctx_shared *sh = priv->sh; 228 char s[MLX5_HLIST_NAMESIZE] __rte_unused; 229 int err; 230 231 MLX5_ASSERT(sh && sh->refcnt); 232 if (sh->refcnt > 1) 233 return 0; 234 err = mlx5_alloc_table_hash_list(priv); 235 if (err) 236 goto error; 237 /* The resources below are only valid with DV support. */ 238 #ifdef HAVE_IBV_FLOW_DV_SUPPORT 239 /* Create tags hash list table. */ 240 snprintf(s, sizeof(s), "%s_tags", sh->ibdev_name); 241 sh->tag_table = mlx5_hlist_create(s, MLX5_TAGS_HLIST_ARRAY_SIZE, 0, 242 MLX5_HLIST_WRITE_MOST, 243 flow_dv_tag_create_cb, NULL, 244 flow_dv_tag_remove_cb); 245 if (!sh->tag_table) { 246 DRV_LOG(ERR, "tags with hash creation failed."); 247 err = ENOMEM; 248 goto error; 249 } 250 sh->tag_table->ctx = sh; 251 snprintf(s, sizeof(s), "%s_hdr_modify", sh->ibdev_name); 252 sh->modify_cmds = mlx5_hlist_create(s, MLX5_FLOW_HDR_MODIFY_HTABLE_SZ, 253 0, MLX5_HLIST_WRITE_MOST | 254 MLX5_HLIST_DIRECT_KEY, 255 flow_dv_modify_create_cb, 256 flow_dv_modify_match_cb, 257 flow_dv_modify_remove_cb); 258 if (!sh->modify_cmds) { 259 DRV_LOG(ERR, "hdr modify hash creation failed"); 260 err = ENOMEM; 261 goto error; 262 } 263 sh->modify_cmds->ctx = sh; 264 snprintf(s, sizeof(s), "%s_encaps_decaps", sh->ibdev_name); 265 sh->encaps_decaps = mlx5_hlist_create(s, 266 MLX5_FLOW_ENCAP_DECAP_HTABLE_SZ, 267 0, 0, NULL, NULL, NULL); 268 if (!sh->encaps_decaps) { 269 DRV_LOG(ERR, "encap decap hash creation failed"); 270 err = ENOMEM; 271 goto error; 272 } 273 #endif 274 #ifdef HAVE_MLX5DV_DR 275 void *domain; 276 277 /* Reference counter is zero, we should initialize structures. */ 278 domain = mlx5_glue->dr_create_domain(sh->ctx, 279 MLX5DV_DR_DOMAIN_TYPE_NIC_RX); 280 if (!domain) { 281 DRV_LOG(ERR, "ingress mlx5dv_dr_create_domain failed"); 282 err = errno; 283 goto error; 284 } 285 sh->rx_domain = domain; 286 domain = mlx5_glue->dr_create_domain(sh->ctx, 287 MLX5DV_DR_DOMAIN_TYPE_NIC_TX); 288 if (!domain) { 289 DRV_LOG(ERR, "egress mlx5dv_dr_create_domain failed"); 290 err = errno; 291 goto error; 292 } 293 pthread_mutex_init(&sh->dv_mutex, NULL); 294 sh->tx_domain = domain; 295 #ifdef HAVE_MLX5DV_DR_ESWITCH 296 if (priv->config.dv_esw_en) { 297 domain = mlx5_glue->dr_create_domain 298 (sh->ctx, MLX5DV_DR_DOMAIN_TYPE_FDB); 299 if (!domain) { 300 DRV_LOG(ERR, "FDB mlx5dv_dr_create_domain failed"); 301 err = errno; 302 goto error; 303 } 304 sh->fdb_domain = domain; 305 sh->esw_drop_action = mlx5_glue->dr_create_flow_action_drop(); 306 } 307 #endif 308 if (!sh->tunnel_hub) 309 err = mlx5_alloc_tunnel_hub(sh); 310 if (err) { 311 DRV_LOG(ERR, "mlx5_alloc_tunnel_hub failed err=%d", err); 312 goto error; 313 } 314 if (priv->config.reclaim_mode == MLX5_RCM_AGGR) { 315 mlx5_glue->dr_reclaim_domain_memory(sh->rx_domain, 1); 316 mlx5_glue->dr_reclaim_domain_memory(sh->tx_domain, 1); 317 if (sh->fdb_domain) 318 mlx5_glue->dr_reclaim_domain_memory(sh->fdb_domain, 1); 319 } 320 sh->pop_vlan_action = mlx5_glue->dr_create_flow_action_pop_vlan(); 321 #endif /* HAVE_MLX5DV_DR */ 322 sh->default_miss_action = 323 mlx5_glue->dr_create_flow_action_default_miss(); 324 if (!sh->default_miss_action) 325 DRV_LOG(WARNING, "Default miss action is not supported."); 326 return 0; 327 error: 328 /* Rollback the created objects. */ 329 if (sh->rx_domain) { 330 mlx5_glue->dr_destroy_domain(sh->rx_domain); 331 sh->rx_domain = NULL; 332 } 333 if (sh->tx_domain) { 334 mlx5_glue->dr_destroy_domain(sh->tx_domain); 335 sh->tx_domain = NULL; 336 } 337 if (sh->fdb_domain) { 338 mlx5_glue->dr_destroy_domain(sh->fdb_domain); 339 sh->fdb_domain = NULL; 340 } 341 if (sh->esw_drop_action) { 342 mlx5_glue->destroy_flow_action(sh->esw_drop_action); 343 sh->esw_drop_action = NULL; 344 } 345 if (sh->pop_vlan_action) { 346 mlx5_glue->destroy_flow_action(sh->pop_vlan_action); 347 sh->pop_vlan_action = NULL; 348 } 349 if (sh->encaps_decaps) { 350 mlx5_hlist_destroy(sh->encaps_decaps); 351 sh->encaps_decaps = NULL; 352 } 353 if (sh->modify_cmds) { 354 mlx5_hlist_destroy(sh->modify_cmds); 355 sh->modify_cmds = NULL; 356 } 357 if (sh->tag_table) { 358 /* tags should be destroyed with flow before. */ 359 mlx5_hlist_destroy(sh->tag_table); 360 sh->tag_table = NULL; 361 } 362 if (sh->tunnel_hub) { 363 mlx5_release_tunnel_hub(sh, priv->dev_port); 364 sh->tunnel_hub = NULL; 365 } 366 mlx5_free_table_hash_list(priv); 367 return err; 368 } 369 370 /** 371 * Destroy DR related data within private structure. 372 * 373 * @param[in] priv 374 * Pointer to the private device data structure. 375 */ 376 void 377 mlx5_os_free_shared_dr(struct mlx5_priv *priv) 378 { 379 struct mlx5_dev_ctx_shared *sh = priv->sh; 380 381 MLX5_ASSERT(sh && sh->refcnt); 382 if (sh->refcnt > 1) 383 return; 384 #ifdef HAVE_MLX5DV_DR 385 if (sh->rx_domain) { 386 mlx5_glue->dr_destroy_domain(sh->rx_domain); 387 sh->rx_domain = NULL; 388 } 389 if (sh->tx_domain) { 390 mlx5_glue->dr_destroy_domain(sh->tx_domain); 391 sh->tx_domain = NULL; 392 } 393 #ifdef HAVE_MLX5DV_DR_ESWITCH 394 if (sh->fdb_domain) { 395 mlx5_glue->dr_destroy_domain(sh->fdb_domain); 396 sh->fdb_domain = NULL; 397 } 398 if (sh->esw_drop_action) { 399 mlx5_glue->destroy_flow_action(sh->esw_drop_action); 400 sh->esw_drop_action = NULL; 401 } 402 #endif 403 if (sh->pop_vlan_action) { 404 mlx5_glue->destroy_flow_action(sh->pop_vlan_action); 405 sh->pop_vlan_action = NULL; 406 } 407 pthread_mutex_destroy(&sh->dv_mutex); 408 #endif /* HAVE_MLX5DV_DR */ 409 if (sh->default_miss_action) 410 mlx5_glue->destroy_flow_action 411 (sh->default_miss_action); 412 if (sh->encaps_decaps) { 413 mlx5_hlist_destroy(sh->encaps_decaps); 414 sh->encaps_decaps = NULL; 415 } 416 if (sh->modify_cmds) { 417 mlx5_hlist_destroy(sh->modify_cmds); 418 sh->modify_cmds = NULL; 419 } 420 if (sh->tag_table) { 421 /* tags should be destroyed with flow before. */ 422 mlx5_hlist_destroy(sh->tag_table); 423 sh->tag_table = NULL; 424 } 425 if (sh->tunnel_hub) { 426 mlx5_release_tunnel_hub(sh, priv->dev_port); 427 sh->tunnel_hub = NULL; 428 } 429 mlx5_free_table_hash_list(priv); 430 } 431 432 /** 433 * Initialize shared data between primary and secondary process. 434 * 435 * A memzone is reserved by primary process and secondary processes attach to 436 * the memzone. 437 * 438 * @return 439 * 0 on success, a negative errno value otherwise and rte_errno is set. 440 */ 441 static int 442 mlx5_init_shared_data(void) 443 { 444 const struct rte_memzone *mz; 445 int ret = 0; 446 447 rte_spinlock_lock(&mlx5_shared_data_lock); 448 if (mlx5_shared_data == NULL) { 449 if (rte_eal_process_type() == RTE_PROC_PRIMARY) { 450 /* Allocate shared memory. */ 451 mz = rte_memzone_reserve(MZ_MLX5_PMD_SHARED_DATA, 452 sizeof(*mlx5_shared_data), 453 SOCKET_ID_ANY, 0); 454 if (mz == NULL) { 455 DRV_LOG(ERR, 456 "Cannot allocate mlx5 shared data"); 457 ret = -rte_errno; 458 goto error; 459 } 460 mlx5_shared_data = mz->addr; 461 memset(mlx5_shared_data, 0, sizeof(*mlx5_shared_data)); 462 rte_spinlock_init(&mlx5_shared_data->lock); 463 } else { 464 /* Lookup allocated shared memory. */ 465 mz = rte_memzone_lookup(MZ_MLX5_PMD_SHARED_DATA); 466 if (mz == NULL) { 467 DRV_LOG(ERR, 468 "Cannot attach mlx5 shared data"); 469 ret = -rte_errno; 470 goto error; 471 } 472 mlx5_shared_data = mz->addr; 473 memset(&mlx5_local_data, 0, sizeof(mlx5_local_data)); 474 } 475 } 476 error: 477 rte_spinlock_unlock(&mlx5_shared_data_lock); 478 return ret; 479 } 480 481 /** 482 * PMD global initialization. 483 * 484 * Independent from individual device, this function initializes global 485 * per-PMD data structures distinguishing primary and secondary processes. 486 * Hence, each initialization is called once per a process. 487 * 488 * @return 489 * 0 on success, a negative errno value otherwise and rte_errno is set. 490 */ 491 static int 492 mlx5_init_once(void) 493 { 494 struct mlx5_shared_data *sd; 495 struct mlx5_local_data *ld = &mlx5_local_data; 496 int ret = 0; 497 498 if (mlx5_init_shared_data()) 499 return -rte_errno; 500 sd = mlx5_shared_data; 501 MLX5_ASSERT(sd); 502 rte_spinlock_lock(&sd->lock); 503 switch (rte_eal_process_type()) { 504 case RTE_PROC_PRIMARY: 505 if (sd->init_done) 506 break; 507 LIST_INIT(&sd->mem_event_cb_list); 508 rte_rwlock_init(&sd->mem_event_rwlock); 509 rte_mem_event_callback_register("MLX5_MEM_EVENT_CB", 510 mlx5_mr_mem_event_cb, NULL); 511 ret = mlx5_mp_init_primary(MLX5_MP_NAME, 512 mlx5_mp_os_primary_handle); 513 if (ret) 514 goto out; 515 sd->init_done = true; 516 break; 517 case RTE_PROC_SECONDARY: 518 if (ld->init_done) 519 break; 520 ret = mlx5_mp_init_secondary(MLX5_MP_NAME, 521 mlx5_mp_os_secondary_handle); 522 if (ret) 523 goto out; 524 ++sd->secondary_cnt; 525 ld->init_done = true; 526 break; 527 default: 528 break; 529 } 530 out: 531 rte_spinlock_unlock(&sd->lock); 532 return ret; 533 } 534 535 /** 536 * Create the Tx queue DevX/Verbs object. 537 * 538 * @param dev 539 * Pointer to Ethernet device. 540 * @param idx 541 * Queue index in DPDK Tx queue array. 542 * 543 * @return 544 * 0 on success, a negative errno value otherwise and rte_errno is set. 545 */ 546 static int 547 mlx5_os_txq_obj_new(struct rte_eth_dev *dev, uint16_t idx) 548 { 549 struct mlx5_priv *priv = dev->data->dev_private; 550 struct mlx5_txq_data *txq_data = (*priv->txqs)[idx]; 551 struct mlx5_txq_ctrl *txq_ctrl = 552 container_of(txq_data, struct mlx5_txq_ctrl, txq); 553 554 if (txq_ctrl->type == MLX5_TXQ_TYPE_HAIRPIN) 555 return mlx5_txq_devx_obj_new(dev, idx); 556 #ifdef HAVE_MLX5DV_DEVX_UAR_OFFSET 557 if (!priv->config.dv_esw_en) 558 return mlx5_txq_devx_obj_new(dev, idx); 559 #endif 560 return mlx5_txq_ibv_obj_new(dev, idx); 561 } 562 563 /** 564 * Release an Tx DevX/verbs queue object. 565 * 566 * @param txq_obj 567 * DevX/Verbs Tx queue object. 568 */ 569 static void 570 mlx5_os_txq_obj_release(struct mlx5_txq_obj *txq_obj) 571 { 572 if (txq_obj->txq_ctrl->type == MLX5_TXQ_TYPE_HAIRPIN) { 573 mlx5_txq_devx_obj_release(txq_obj); 574 return; 575 } 576 #ifdef HAVE_MLX5DV_DEVX_UAR_OFFSET 577 if (!txq_obj->txq_ctrl->priv->config.dv_esw_en) { 578 mlx5_txq_devx_obj_release(txq_obj); 579 return; 580 } 581 #endif 582 mlx5_txq_ibv_obj_release(txq_obj); 583 } 584 585 /** 586 * DV flow counter mode detect and config. 587 * 588 * @param dev 589 * Pointer to rte_eth_dev structure. 590 * 591 */ 592 static void 593 mlx5_flow_counter_mode_config(struct rte_eth_dev *dev __rte_unused) 594 { 595 #ifdef HAVE_IBV_FLOW_DV_SUPPORT 596 struct mlx5_priv *priv = dev->data->dev_private; 597 struct mlx5_dev_ctx_shared *sh = priv->sh; 598 bool fallback; 599 600 #ifndef HAVE_IBV_DEVX_ASYNC 601 fallback = true; 602 #else 603 fallback = false; 604 if (!priv->config.devx || !priv->config.dv_flow_en || 605 !priv->config.hca_attr.flow_counters_dump || 606 !(priv->config.hca_attr.flow_counter_bulk_alloc_bitmap & 0x4) || 607 (mlx5_flow_dv_discover_counter_offset_support(dev) == -ENOTSUP)) 608 fallback = true; 609 #endif 610 if (fallback) 611 DRV_LOG(INFO, "Use fall-back DV counter management. Flow " 612 "counter dump:%d, bulk_alloc_bitmap:0x%hhx.", 613 priv->config.hca_attr.flow_counters_dump, 614 priv->config.hca_attr.flow_counter_bulk_alloc_bitmap); 615 /* Initialize fallback mode only on the port initializes sh. */ 616 if (sh->refcnt == 1) 617 sh->cmng.counter_fallback = fallback; 618 else if (fallback != sh->cmng.counter_fallback) 619 DRV_LOG(WARNING, "Port %d in sh has different fallback mode " 620 "with others:%d.", PORT_ID(priv), fallback); 621 #endif 622 } 623 624 /** 625 * Spawn an Ethernet device from Verbs information. 626 * 627 * @param dpdk_dev 628 * Backing DPDK device. 629 * @param spawn 630 * Verbs device parameters (name, port, switch_info) to spawn. 631 * @param config 632 * Device configuration parameters. 633 * 634 * @return 635 * A valid Ethernet device object on success, NULL otherwise and rte_errno 636 * is set. The following errors are defined: 637 * 638 * EBUSY: device is not supposed to be spawned. 639 * EEXIST: device is already spawned 640 */ 641 static struct rte_eth_dev * 642 mlx5_dev_spawn(struct rte_device *dpdk_dev, 643 struct mlx5_dev_spawn_data *spawn, 644 struct mlx5_dev_config *config) 645 { 646 const struct mlx5_switch_info *switch_info = &spawn->info; 647 struct mlx5_dev_ctx_shared *sh = NULL; 648 struct ibv_port_attr port_attr; 649 struct mlx5dv_context dv_attr = { .comp_mask = 0 }; 650 struct rte_eth_dev *eth_dev = NULL; 651 struct mlx5_priv *priv = NULL; 652 int err = 0; 653 unsigned int hw_padding = 0; 654 unsigned int mps; 655 unsigned int cqe_comp; 656 unsigned int cqe_pad = 0; 657 unsigned int tunnel_en = 0; 658 unsigned int mpls_en = 0; 659 unsigned int swp = 0; 660 unsigned int mprq = 0; 661 unsigned int mprq_min_stride_size_n = 0; 662 unsigned int mprq_max_stride_size_n = 0; 663 unsigned int mprq_min_stride_num_n = 0; 664 unsigned int mprq_max_stride_num_n = 0; 665 struct rte_ether_addr mac; 666 char name[RTE_ETH_NAME_MAX_LEN]; 667 int own_domain_id = 0; 668 uint16_t port_id; 669 unsigned int i; 670 #ifdef HAVE_MLX5DV_DR_DEVX_PORT 671 struct mlx5dv_devx_port devx_port = { .comp_mask = 0 }; 672 #endif 673 674 /* Determine if this port representor is supposed to be spawned. */ 675 if (switch_info->representor && dpdk_dev->devargs) { 676 struct rte_eth_devargs eth_da; 677 678 err = rte_eth_devargs_parse(dpdk_dev->devargs->args, ð_da); 679 if (err) { 680 rte_errno = -err; 681 DRV_LOG(ERR, "failed to process device arguments: %s", 682 strerror(rte_errno)); 683 return NULL; 684 } 685 for (i = 0; i < eth_da.nb_representor_ports; ++i) 686 if (eth_da.representor_ports[i] == 687 (uint16_t)switch_info->port_name) 688 break; 689 if (i == eth_da.nb_representor_ports) { 690 rte_errno = EBUSY; 691 return NULL; 692 } 693 } 694 /* Build device name. */ 695 if (spawn->pf_bond < 0) { 696 /* Single device. */ 697 if (!switch_info->representor) 698 strlcpy(name, dpdk_dev->name, sizeof(name)); 699 else 700 snprintf(name, sizeof(name), "%s_representor_%u", 701 dpdk_dev->name, switch_info->port_name); 702 } else { 703 /* Bonding device. */ 704 if (!switch_info->representor) 705 snprintf(name, sizeof(name), "%s_%s", 706 dpdk_dev->name, 707 mlx5_os_get_dev_device_name(spawn->phys_dev)); 708 else 709 snprintf(name, sizeof(name), "%s_%s_representor_%u", 710 dpdk_dev->name, 711 mlx5_os_get_dev_device_name(spawn->phys_dev), 712 switch_info->port_name); 713 } 714 /* check if the device is already spawned */ 715 if (rte_eth_dev_get_port_by_name(name, &port_id) == 0) { 716 rte_errno = EEXIST; 717 return NULL; 718 } 719 DRV_LOG(DEBUG, "naming Ethernet device \"%s\"", name); 720 if (rte_eal_process_type() == RTE_PROC_SECONDARY) { 721 struct mlx5_mp_id mp_id; 722 723 eth_dev = rte_eth_dev_attach_secondary(name); 724 if (eth_dev == NULL) { 725 DRV_LOG(ERR, "can not attach rte ethdev"); 726 rte_errno = ENOMEM; 727 return NULL; 728 } 729 eth_dev->device = dpdk_dev; 730 eth_dev->dev_ops = &mlx5_os_dev_sec_ops; 731 eth_dev->rx_descriptor_status = mlx5_rx_descriptor_status; 732 eth_dev->tx_descriptor_status = mlx5_tx_descriptor_status; 733 err = mlx5_proc_priv_init(eth_dev); 734 if (err) 735 return NULL; 736 mp_id.port_id = eth_dev->data->port_id; 737 strlcpy(mp_id.name, MLX5_MP_NAME, RTE_MP_MAX_NAME_LEN); 738 /* Receive command fd from primary process */ 739 err = mlx5_mp_req_verbs_cmd_fd(&mp_id); 740 if (err < 0) 741 goto err_secondary; 742 /* Remap UAR for Tx queues. */ 743 err = mlx5_tx_uar_init_secondary(eth_dev, err); 744 if (err) 745 goto err_secondary; 746 /* 747 * Ethdev pointer is still required as input since 748 * the primary device is not accessible from the 749 * secondary process. 750 */ 751 eth_dev->rx_pkt_burst = mlx5_select_rx_function(eth_dev); 752 eth_dev->tx_pkt_burst = mlx5_select_tx_function(eth_dev); 753 return eth_dev; 754 err_secondary: 755 mlx5_dev_close(eth_dev); 756 return NULL; 757 } 758 /* 759 * Some parameters ("tx_db_nc" in particularly) are needed in 760 * advance to create dv/verbs device context. We proceed the 761 * devargs here to get ones, and later proceed devargs again 762 * to override some hardware settings. 763 */ 764 err = mlx5_args(config, dpdk_dev->devargs); 765 if (err) { 766 err = rte_errno; 767 DRV_LOG(ERR, "failed to process device arguments: %s", 768 strerror(rte_errno)); 769 goto error; 770 } 771 if (config->dv_miss_info) { 772 if (switch_info->master || switch_info->representor) 773 config->dv_xmeta_en = MLX5_XMETA_MODE_META16; 774 } 775 mlx5_malloc_mem_select(config->sys_mem_en); 776 sh = mlx5_alloc_shared_dev_ctx(spawn, config); 777 if (!sh) 778 return NULL; 779 config->devx = sh->devx; 780 #ifdef HAVE_MLX5DV_DR_ACTION_DEST_DEVX_TIR 781 config->dest_tir = 1; 782 #endif 783 #ifdef HAVE_IBV_MLX5_MOD_SWP 784 dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_SWP; 785 #endif 786 /* 787 * Multi-packet send is supported by ConnectX-4 Lx PF as well 788 * as all ConnectX-5 devices. 789 */ 790 #ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT 791 dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS; 792 #endif 793 #ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT 794 dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_STRIDING_RQ; 795 #endif 796 mlx5_glue->dv_query_device(sh->ctx, &dv_attr); 797 if (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED) { 798 if (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW) { 799 DRV_LOG(DEBUG, "enhanced MPW is supported"); 800 mps = MLX5_MPW_ENHANCED; 801 } else { 802 DRV_LOG(DEBUG, "MPW is supported"); 803 mps = MLX5_MPW; 804 } 805 } else { 806 DRV_LOG(DEBUG, "MPW isn't supported"); 807 mps = MLX5_MPW_DISABLED; 808 } 809 #ifdef HAVE_IBV_MLX5_MOD_SWP 810 if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_SWP) 811 swp = dv_attr.sw_parsing_caps.sw_parsing_offloads; 812 DRV_LOG(DEBUG, "SWP support: %u", swp); 813 #endif 814 config->swp = !!swp; 815 #ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT 816 if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_STRIDING_RQ) { 817 struct mlx5dv_striding_rq_caps mprq_caps = 818 dv_attr.striding_rq_caps; 819 820 DRV_LOG(DEBUG, "\tmin_single_stride_log_num_of_bytes: %d", 821 mprq_caps.min_single_stride_log_num_of_bytes); 822 DRV_LOG(DEBUG, "\tmax_single_stride_log_num_of_bytes: %d", 823 mprq_caps.max_single_stride_log_num_of_bytes); 824 DRV_LOG(DEBUG, "\tmin_single_wqe_log_num_of_strides: %d", 825 mprq_caps.min_single_wqe_log_num_of_strides); 826 DRV_LOG(DEBUG, "\tmax_single_wqe_log_num_of_strides: %d", 827 mprq_caps.max_single_wqe_log_num_of_strides); 828 DRV_LOG(DEBUG, "\tsupported_qpts: %d", 829 mprq_caps.supported_qpts); 830 DRV_LOG(DEBUG, "device supports Multi-Packet RQ"); 831 mprq = 1; 832 mprq_min_stride_size_n = 833 mprq_caps.min_single_stride_log_num_of_bytes; 834 mprq_max_stride_size_n = 835 mprq_caps.max_single_stride_log_num_of_bytes; 836 mprq_min_stride_num_n = 837 mprq_caps.min_single_wqe_log_num_of_strides; 838 mprq_max_stride_num_n = 839 mprq_caps.max_single_wqe_log_num_of_strides; 840 } 841 #endif 842 if (RTE_CACHE_LINE_SIZE == 128 && 843 !(dv_attr.flags & MLX5DV_CONTEXT_FLAGS_CQE_128B_COMP)) 844 cqe_comp = 0; 845 else 846 cqe_comp = 1; 847 config->cqe_comp = cqe_comp; 848 #ifdef HAVE_IBV_MLX5_MOD_CQE_128B_PAD 849 /* Whether device supports 128B Rx CQE padding. */ 850 cqe_pad = RTE_CACHE_LINE_SIZE == 128 && 851 (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_CQE_128B_PAD); 852 #endif 853 #ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT 854 if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS) { 855 tunnel_en = ((dv_attr.tunnel_offloads_caps & 856 MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_VXLAN) && 857 (dv_attr.tunnel_offloads_caps & 858 MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_GRE) && 859 (dv_attr.tunnel_offloads_caps & 860 MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_GENEVE)); 861 } 862 DRV_LOG(DEBUG, "tunnel offloading is %ssupported", 863 tunnel_en ? "" : "not "); 864 #else 865 DRV_LOG(WARNING, 866 "tunnel offloading disabled due to old OFED/rdma-core version"); 867 #endif 868 config->tunnel_en = tunnel_en; 869 #ifdef HAVE_IBV_DEVICE_MPLS_SUPPORT 870 mpls_en = ((dv_attr.tunnel_offloads_caps & 871 MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_CW_MPLS_OVER_GRE) && 872 (dv_attr.tunnel_offloads_caps & 873 MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_CW_MPLS_OVER_UDP)); 874 DRV_LOG(DEBUG, "MPLS over GRE/UDP tunnel offloading is %ssupported", 875 mpls_en ? "" : "not "); 876 #else 877 DRV_LOG(WARNING, "MPLS over GRE/UDP tunnel offloading disabled due to" 878 " old OFED/rdma-core version or firmware configuration"); 879 #endif 880 config->mpls_en = mpls_en; 881 /* Check port status. */ 882 err = mlx5_glue->query_port(sh->ctx, spawn->phys_port, &port_attr); 883 if (err) { 884 DRV_LOG(ERR, "port query failed: %s", strerror(err)); 885 goto error; 886 } 887 if (port_attr.link_layer != IBV_LINK_LAYER_ETHERNET) { 888 DRV_LOG(ERR, "port is not configured in Ethernet mode"); 889 err = EINVAL; 890 goto error; 891 } 892 if (port_attr.state != IBV_PORT_ACTIVE) 893 DRV_LOG(DEBUG, "port is not active: \"%s\" (%d)", 894 mlx5_glue->port_state_str(port_attr.state), 895 port_attr.state); 896 /* Allocate private eth device data. */ 897 priv = mlx5_malloc(MLX5_MEM_ZERO | MLX5_MEM_RTE, 898 sizeof(*priv), 899 RTE_CACHE_LINE_SIZE, SOCKET_ID_ANY); 900 if (priv == NULL) { 901 DRV_LOG(ERR, "priv allocation failure"); 902 err = ENOMEM; 903 goto error; 904 } 905 priv->sh = sh; 906 priv->dev_port = spawn->phys_port; 907 priv->pci_dev = spawn->pci_dev; 908 priv->mtu = RTE_ETHER_MTU; 909 priv->mp_id.port_id = port_id; 910 strlcpy(priv->mp_id.name, MLX5_MP_NAME, RTE_MP_MAX_NAME_LEN); 911 /* Some internal functions rely on Netlink sockets, open them now. */ 912 priv->nl_socket_rdma = mlx5_nl_init(NETLINK_RDMA); 913 priv->nl_socket_route = mlx5_nl_init(NETLINK_ROUTE); 914 priv->representor = !!switch_info->representor; 915 priv->master = !!switch_info->master; 916 priv->domain_id = RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID; 917 priv->vport_meta_tag = 0; 918 priv->vport_meta_mask = 0; 919 priv->pf_bond = spawn->pf_bond; 920 #ifdef HAVE_MLX5DV_DR_DEVX_PORT 921 /* 922 * The DevX port query API is implemented. E-Switch may use 923 * either vport or reg_c[0] metadata register to match on 924 * vport index. The engaged part of metadata register is 925 * defined by mask. 926 */ 927 if (switch_info->representor || switch_info->master) { 928 devx_port.comp_mask = MLX5DV_DEVX_PORT_VPORT | 929 MLX5DV_DEVX_PORT_MATCH_REG_C_0; 930 err = mlx5_glue->devx_port_query(sh->ctx, spawn->phys_port, 931 &devx_port); 932 if (err) { 933 DRV_LOG(WARNING, 934 "can't query devx port %d on device %s", 935 spawn->phys_port, 936 mlx5_os_get_dev_device_name(spawn->phys_dev)); 937 devx_port.comp_mask = 0; 938 } 939 } 940 if (devx_port.comp_mask & MLX5DV_DEVX_PORT_MATCH_REG_C_0) { 941 priv->vport_meta_tag = devx_port.reg_c_0.value; 942 priv->vport_meta_mask = devx_port.reg_c_0.mask; 943 if (!priv->vport_meta_mask) { 944 DRV_LOG(ERR, "vport zero mask for port %d" 945 " on bonding device %s", 946 spawn->phys_port, 947 mlx5_os_get_dev_device_name 948 (spawn->phys_dev)); 949 err = ENOTSUP; 950 goto error; 951 } 952 if (priv->vport_meta_tag & ~priv->vport_meta_mask) { 953 DRV_LOG(ERR, "invalid vport tag for port %d" 954 " on bonding device %s", 955 spawn->phys_port, 956 mlx5_os_get_dev_device_name 957 (spawn->phys_dev)); 958 err = ENOTSUP; 959 goto error; 960 } 961 } 962 if (devx_port.comp_mask & MLX5DV_DEVX_PORT_VPORT) { 963 priv->vport_id = devx_port.vport_num; 964 } else if (spawn->pf_bond >= 0) { 965 DRV_LOG(ERR, "can't deduce vport index for port %d" 966 " on bonding device %s", 967 spawn->phys_port, 968 mlx5_os_get_dev_device_name(spawn->phys_dev)); 969 err = ENOTSUP; 970 goto error; 971 } else { 972 /* Suppose vport index in compatible way. */ 973 priv->vport_id = switch_info->representor ? 974 switch_info->port_name + 1 : -1; 975 } 976 #else 977 /* 978 * Kernel/rdma_core support single E-Switch per PF configurations 979 * only and vport_id field contains the vport index for 980 * associated VF, which is deduced from representor port name. 981 * For example, let's have the IB device port 10, it has 982 * attached network device eth0, which has port name attribute 983 * pf0vf2, we can deduce the VF number as 2, and set vport index 984 * as 3 (2+1). This assigning schema should be changed if the 985 * multiple E-Switch instances per PF configurations or/and PCI 986 * subfunctions are added. 987 */ 988 priv->vport_id = switch_info->representor ? 989 switch_info->port_name + 1 : -1; 990 #endif 991 /* representor_id field keeps the unmodified VF index. */ 992 priv->representor_id = switch_info->representor ? 993 switch_info->port_name : -1; 994 /* 995 * Look for sibling devices in order to reuse their switch domain 996 * if any, otherwise allocate one. 997 */ 998 MLX5_ETH_FOREACH_DEV(port_id, priv->pci_dev) { 999 const struct mlx5_priv *opriv = 1000 rte_eth_devices[port_id].data->dev_private; 1001 1002 if (!opriv || 1003 opriv->sh != priv->sh || 1004 opriv->domain_id == 1005 RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID) 1006 continue; 1007 priv->domain_id = opriv->domain_id; 1008 break; 1009 } 1010 if (priv->domain_id == RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID) { 1011 err = rte_eth_switch_domain_alloc(&priv->domain_id); 1012 if (err) { 1013 err = rte_errno; 1014 DRV_LOG(ERR, "unable to allocate switch domain: %s", 1015 strerror(rte_errno)); 1016 goto error; 1017 } 1018 own_domain_id = 1; 1019 } 1020 /* Override some values set by hardware configuration. */ 1021 mlx5_args(config, dpdk_dev->devargs); 1022 err = mlx5_dev_check_sibling_config(priv, config); 1023 if (err) 1024 goto error; 1025 config->hw_csum = !!(sh->device_attr.device_cap_flags_ex & 1026 IBV_DEVICE_RAW_IP_CSUM); 1027 DRV_LOG(DEBUG, "checksum offloading is %ssupported", 1028 (config->hw_csum ? "" : "not ")); 1029 #if !defined(HAVE_IBV_DEVICE_COUNTERS_SET_V42) && \ 1030 !defined(HAVE_IBV_DEVICE_COUNTERS_SET_V45) 1031 DRV_LOG(DEBUG, "counters are not supported"); 1032 #endif 1033 #if !defined(HAVE_IBV_FLOW_DV_SUPPORT) || !defined(HAVE_MLX5DV_DR) 1034 if (config->dv_flow_en) { 1035 DRV_LOG(WARNING, "DV flow is not supported"); 1036 config->dv_flow_en = 0; 1037 } 1038 #endif 1039 config->ind_table_max_size = 1040 sh->device_attr.max_rwq_indirection_table_size; 1041 /* 1042 * Remove this check once DPDK supports larger/variable 1043 * indirection tables. 1044 */ 1045 if (config->ind_table_max_size > (unsigned int)ETH_RSS_RETA_SIZE_512) 1046 config->ind_table_max_size = ETH_RSS_RETA_SIZE_512; 1047 DRV_LOG(DEBUG, "maximum Rx indirection table size is %u", 1048 config->ind_table_max_size); 1049 config->hw_vlan_strip = !!(sh->device_attr.raw_packet_caps & 1050 IBV_RAW_PACKET_CAP_CVLAN_STRIPPING); 1051 DRV_LOG(DEBUG, "VLAN stripping is %ssupported", 1052 (config->hw_vlan_strip ? "" : "not ")); 1053 config->hw_fcs_strip = !!(sh->device_attr.raw_packet_caps & 1054 IBV_RAW_PACKET_CAP_SCATTER_FCS); 1055 #if defined(HAVE_IBV_WQ_FLAG_RX_END_PADDING) 1056 hw_padding = !!sh->device_attr.rx_pad_end_addr_align; 1057 #elif defined(HAVE_IBV_WQ_FLAGS_PCI_WRITE_END_PADDING) 1058 hw_padding = !!(sh->device_attr.device_cap_flags_ex & 1059 IBV_DEVICE_PCI_WRITE_END_PADDING); 1060 #endif 1061 if (config->hw_padding && !hw_padding) { 1062 DRV_LOG(DEBUG, "Rx end alignment padding isn't supported"); 1063 config->hw_padding = 0; 1064 } else if (config->hw_padding) { 1065 DRV_LOG(DEBUG, "Rx end alignment padding is enabled"); 1066 } 1067 config->tso = (sh->device_attr.max_tso > 0 && 1068 (sh->device_attr.tso_supported_qpts & 1069 (1 << IBV_QPT_RAW_PACKET))); 1070 if (config->tso) 1071 config->tso_max_payload_sz = sh->device_attr.max_tso; 1072 /* 1073 * MPW is disabled by default, while the Enhanced MPW is enabled 1074 * by default. 1075 */ 1076 if (config->mps == MLX5_ARG_UNSET) 1077 config->mps = (mps == MLX5_MPW_ENHANCED) ? MLX5_MPW_ENHANCED : 1078 MLX5_MPW_DISABLED; 1079 else 1080 config->mps = config->mps ? mps : MLX5_MPW_DISABLED; 1081 DRV_LOG(INFO, "%sMPS is %s", 1082 config->mps == MLX5_MPW_ENHANCED ? "enhanced " : 1083 config->mps == MLX5_MPW ? "legacy " : "", 1084 config->mps != MLX5_MPW_DISABLED ? "enabled" : "disabled"); 1085 if (config->cqe_comp && !cqe_comp) { 1086 DRV_LOG(WARNING, "Rx CQE compression isn't supported"); 1087 config->cqe_comp = 0; 1088 } 1089 if (config->cqe_pad && !cqe_pad) { 1090 DRV_LOG(WARNING, "Rx CQE padding isn't supported"); 1091 config->cqe_pad = 0; 1092 } else if (config->cqe_pad) { 1093 DRV_LOG(INFO, "Rx CQE padding is enabled"); 1094 } 1095 if (config->devx) { 1096 err = mlx5_devx_cmd_query_hca_attr(sh->ctx, &config->hca_attr); 1097 if (err) { 1098 err = -err; 1099 goto error; 1100 } 1101 /* Check relax ordering support. */ 1102 if (config->hca_attr.relaxed_ordering_write && 1103 config->hca_attr.relaxed_ordering_read && 1104 !haswell_broadwell_cpu) 1105 sh->cmng.relaxed_ordering = 1; 1106 /* Check for LRO support. */ 1107 if (config->dest_tir && config->hca_attr.lro_cap && 1108 config->dv_flow_en) { 1109 /* TBD check tunnel lro caps. */ 1110 config->lro.supported = config->hca_attr.lro_cap; 1111 DRV_LOG(DEBUG, "Device supports LRO"); 1112 /* 1113 * If LRO timeout is not configured by application, 1114 * use the minimal supported value. 1115 */ 1116 if (!config->lro.timeout) 1117 config->lro.timeout = 1118 config->hca_attr.lro_timer_supported_periods[0]; 1119 DRV_LOG(DEBUG, "LRO session timeout set to %d usec", 1120 config->lro.timeout); 1121 DRV_LOG(DEBUG, "LRO minimal size of TCP segment " 1122 "required for coalescing is %d bytes", 1123 config->hca_attr.lro_min_mss_size); 1124 } 1125 #if defined(HAVE_MLX5DV_DR) && defined(HAVE_MLX5_DR_CREATE_ACTION_FLOW_METER) 1126 if (config->hca_attr.qos.sup && 1127 config->hca_attr.qos.srtcm_sup && 1128 config->dv_flow_en) { 1129 uint8_t reg_c_mask = 1130 config->hca_attr.qos.flow_meter_reg_c_ids; 1131 /* 1132 * Meter needs two REG_C's for color match and pre-sfx 1133 * flow match. Here get the REG_C for color match. 1134 * REG_C_0 and REG_C_1 is reserved for metadata feature. 1135 */ 1136 reg_c_mask &= 0xfc; 1137 if (__builtin_popcount(reg_c_mask) < 1) { 1138 priv->mtr_en = 0; 1139 DRV_LOG(WARNING, "No available register for" 1140 " meter."); 1141 } else { 1142 priv->mtr_color_reg = ffs(reg_c_mask) - 1 + 1143 REG_C_0; 1144 priv->mtr_en = 1; 1145 priv->mtr_reg_share = 1146 config->hca_attr.qos.flow_meter_reg_share; 1147 DRV_LOG(DEBUG, "The REG_C meter uses is %d", 1148 priv->mtr_color_reg); 1149 } 1150 } 1151 #endif 1152 #if defined(HAVE_MLX5DV_DR) && defined(HAVE_MLX5_DR_CREATE_ACTION_FLOW_SAMPLE) 1153 if (config->hca_attr.log_max_ft_sampler_num > 0 && 1154 config->dv_flow_en) { 1155 priv->sampler_en = 1; 1156 DRV_LOG(DEBUG, "The Sampler enabled!\n"); 1157 } else { 1158 priv->sampler_en = 0; 1159 if (!config->hca_attr.log_max_ft_sampler_num) 1160 DRV_LOG(WARNING, "No available register for" 1161 " Sampler."); 1162 else 1163 DRV_LOG(DEBUG, "DV flow is not supported!\n"); 1164 } 1165 #endif 1166 } 1167 if (config->tx_pp) { 1168 DRV_LOG(DEBUG, "Timestamp counter frequency %u kHz", 1169 config->hca_attr.dev_freq_khz); 1170 DRV_LOG(DEBUG, "Packet pacing is %ssupported", 1171 config->hca_attr.qos.packet_pacing ? "" : "not "); 1172 DRV_LOG(DEBUG, "Cross channel ops are %ssupported", 1173 config->hca_attr.cross_channel ? "" : "not "); 1174 DRV_LOG(DEBUG, "WQE index ignore is %ssupported", 1175 config->hca_attr.wqe_index_ignore ? "" : "not "); 1176 DRV_LOG(DEBUG, "Non-wire SQ feature is %ssupported", 1177 config->hca_attr.non_wire_sq ? "" : "not "); 1178 DRV_LOG(DEBUG, "Static WQE SQ feature is %ssupported (%d)", 1179 config->hca_attr.log_max_static_sq_wq ? "" : "not ", 1180 config->hca_attr.log_max_static_sq_wq); 1181 DRV_LOG(DEBUG, "WQE rate PP mode is %ssupported", 1182 config->hca_attr.qos.wqe_rate_pp ? "" : "not "); 1183 if (!config->devx) { 1184 DRV_LOG(ERR, "DevX is required for packet pacing"); 1185 err = ENODEV; 1186 goto error; 1187 } 1188 if (!config->hca_attr.qos.packet_pacing) { 1189 DRV_LOG(ERR, "Packet pacing is not supported"); 1190 err = ENODEV; 1191 goto error; 1192 } 1193 if (!config->hca_attr.cross_channel) { 1194 DRV_LOG(ERR, "Cross channel operations are" 1195 " required for packet pacing"); 1196 err = ENODEV; 1197 goto error; 1198 } 1199 if (!config->hca_attr.wqe_index_ignore) { 1200 DRV_LOG(ERR, "WQE index ignore feature is" 1201 " required for packet pacing"); 1202 err = ENODEV; 1203 goto error; 1204 } 1205 if (!config->hca_attr.non_wire_sq) { 1206 DRV_LOG(ERR, "Non-wire SQ feature is" 1207 " required for packet pacing"); 1208 err = ENODEV; 1209 goto error; 1210 } 1211 if (!config->hca_attr.log_max_static_sq_wq) { 1212 DRV_LOG(ERR, "Static WQE SQ feature is" 1213 " required for packet pacing"); 1214 err = ENODEV; 1215 goto error; 1216 } 1217 if (!config->hca_attr.qos.wqe_rate_pp) { 1218 DRV_LOG(ERR, "WQE rate mode is required" 1219 " for packet pacing"); 1220 err = ENODEV; 1221 goto error; 1222 } 1223 #ifndef HAVE_MLX5DV_DEVX_UAR_OFFSET 1224 DRV_LOG(ERR, "DevX does not provide UAR offset," 1225 " can't create queues for packet pacing"); 1226 err = ENODEV; 1227 goto error; 1228 #endif 1229 } 1230 if (config->devx) { 1231 uint32_t reg[MLX5_ST_SZ_DW(register_mtutc)]; 1232 1233 err = config->hca_attr.access_register_user ? 1234 mlx5_devx_cmd_register_read 1235 (sh->ctx, MLX5_REGISTER_ID_MTUTC, 0, 1236 reg, MLX5_ST_SZ_DW(register_mtutc)) : ENOTSUP; 1237 if (!err) { 1238 uint32_t ts_mode; 1239 1240 /* MTUTC register is read successfully. */ 1241 ts_mode = MLX5_GET(register_mtutc, reg, 1242 time_stamp_mode); 1243 if (ts_mode == MLX5_MTUTC_TIMESTAMP_MODE_REAL_TIME) 1244 config->rt_timestamp = 1; 1245 } else { 1246 /* Kernel does not support register reading. */ 1247 if (config->hca_attr.dev_freq_khz == 1248 (NS_PER_S / MS_PER_S)) 1249 config->rt_timestamp = 1; 1250 } 1251 } 1252 /* 1253 * If HW has bug working with tunnel packet decapsulation and 1254 * scatter FCS, and decapsulation is needed, clear the hw_fcs_strip 1255 * bit. Then DEV_RX_OFFLOAD_KEEP_CRC bit will not be set anymore. 1256 */ 1257 if (config->hca_attr.scatter_fcs_w_decap_disable && config->decap_en) 1258 config->hw_fcs_strip = 0; 1259 DRV_LOG(DEBUG, "FCS stripping configuration is %ssupported", 1260 (config->hw_fcs_strip ? "" : "not ")); 1261 if (config->mprq.enabled && mprq) { 1262 if (config->mprq.stride_num_n && 1263 (config->mprq.stride_num_n > mprq_max_stride_num_n || 1264 config->mprq.stride_num_n < mprq_min_stride_num_n)) { 1265 config->mprq.stride_num_n = 1266 RTE_MIN(RTE_MAX(MLX5_MPRQ_STRIDE_NUM_N, 1267 mprq_min_stride_num_n), 1268 mprq_max_stride_num_n); 1269 DRV_LOG(WARNING, 1270 "the number of strides" 1271 " for Multi-Packet RQ is out of range," 1272 " setting default value (%u)", 1273 1 << config->mprq.stride_num_n); 1274 } 1275 if (config->mprq.stride_size_n && 1276 (config->mprq.stride_size_n > mprq_max_stride_size_n || 1277 config->mprq.stride_size_n < mprq_min_stride_size_n)) { 1278 config->mprq.stride_size_n = 1279 RTE_MIN(RTE_MAX(MLX5_MPRQ_STRIDE_SIZE_N, 1280 mprq_min_stride_size_n), 1281 mprq_max_stride_size_n); 1282 DRV_LOG(WARNING, 1283 "the size of a stride" 1284 " for Multi-Packet RQ is out of range," 1285 " setting default value (%u)", 1286 1 << config->mprq.stride_size_n); 1287 } 1288 config->mprq.min_stride_size_n = mprq_min_stride_size_n; 1289 config->mprq.max_stride_size_n = mprq_max_stride_size_n; 1290 } else if (config->mprq.enabled && !mprq) { 1291 DRV_LOG(WARNING, "Multi-Packet RQ isn't supported"); 1292 config->mprq.enabled = 0; 1293 } 1294 if (config->max_dump_files_num == 0) 1295 config->max_dump_files_num = 128; 1296 eth_dev = rte_eth_dev_allocate(name); 1297 if (eth_dev == NULL) { 1298 DRV_LOG(ERR, "can not allocate rte ethdev"); 1299 err = ENOMEM; 1300 goto error; 1301 } 1302 if (priv->representor) { 1303 eth_dev->data->dev_flags |= RTE_ETH_DEV_REPRESENTOR; 1304 eth_dev->data->representor_id = priv->representor_id; 1305 } 1306 /* 1307 * Store associated network device interface index. This index 1308 * is permanent throughout the lifetime of device. So, we may store 1309 * the ifindex here and use the cached value further. 1310 */ 1311 MLX5_ASSERT(spawn->ifindex); 1312 priv->if_index = spawn->ifindex; 1313 if (priv->pf_bond >= 0 && priv->master) { 1314 /* Get bond interface info */ 1315 err = mlx5_sysfs_bond_info(priv->if_index, 1316 &priv->bond_ifindex, 1317 priv->bond_name); 1318 if (err) 1319 DRV_LOG(ERR, "unable to get bond info: %s", 1320 strerror(rte_errno)); 1321 else 1322 DRV_LOG(INFO, "PF device %u, bond device %u(%s)", 1323 priv->if_index, priv->bond_ifindex, 1324 priv->bond_name); 1325 } 1326 eth_dev->data->dev_private = priv; 1327 priv->dev_data = eth_dev->data; 1328 eth_dev->data->mac_addrs = priv->mac; 1329 eth_dev->device = dpdk_dev; 1330 eth_dev->data->dev_flags |= RTE_ETH_DEV_AUTOFILL_QUEUE_XSTATS; 1331 /* Configure the first MAC address by default. */ 1332 if (mlx5_get_mac(eth_dev, &mac.addr_bytes)) { 1333 DRV_LOG(ERR, 1334 "port %u cannot get MAC address, is mlx5_en" 1335 " loaded? (errno: %s)", 1336 eth_dev->data->port_id, strerror(rte_errno)); 1337 err = ENODEV; 1338 goto error; 1339 } 1340 DRV_LOG(INFO, 1341 "port %u MAC address is %02x:%02x:%02x:%02x:%02x:%02x", 1342 eth_dev->data->port_id, 1343 mac.addr_bytes[0], mac.addr_bytes[1], 1344 mac.addr_bytes[2], mac.addr_bytes[3], 1345 mac.addr_bytes[4], mac.addr_bytes[5]); 1346 #ifdef RTE_LIBRTE_MLX5_DEBUG 1347 { 1348 char ifname[IF_NAMESIZE]; 1349 1350 if (mlx5_get_ifname(eth_dev, &ifname) == 0) 1351 DRV_LOG(DEBUG, "port %u ifname is \"%s\"", 1352 eth_dev->data->port_id, ifname); 1353 else 1354 DRV_LOG(DEBUG, "port %u ifname is unknown", 1355 eth_dev->data->port_id); 1356 } 1357 #endif 1358 /* Get actual MTU if possible. */ 1359 err = mlx5_get_mtu(eth_dev, &priv->mtu); 1360 if (err) { 1361 err = rte_errno; 1362 goto error; 1363 } 1364 DRV_LOG(DEBUG, "port %u MTU is %u", eth_dev->data->port_id, 1365 priv->mtu); 1366 /* Initialize burst functions to prevent crashes before link-up. */ 1367 eth_dev->rx_pkt_burst = removed_rx_burst; 1368 eth_dev->tx_pkt_burst = removed_tx_burst; 1369 eth_dev->dev_ops = &mlx5_os_dev_ops; 1370 eth_dev->rx_descriptor_status = mlx5_rx_descriptor_status; 1371 eth_dev->tx_descriptor_status = mlx5_tx_descriptor_status; 1372 eth_dev->rx_queue_count = mlx5_rx_queue_count; 1373 /* Register MAC address. */ 1374 claim_zero(mlx5_mac_addr_add(eth_dev, &mac, 0, 0)); 1375 if (config->vf && config->vf_nl_en) 1376 mlx5_nl_mac_addr_sync(priv->nl_socket_route, 1377 mlx5_ifindex(eth_dev), 1378 eth_dev->data->mac_addrs, 1379 MLX5_MAX_MAC_ADDRESSES); 1380 priv->flows = 0; 1381 priv->ctrl_flows = 0; 1382 rte_spinlock_init(&priv->flow_list_lock); 1383 TAILQ_INIT(&priv->flow_meters); 1384 TAILQ_INIT(&priv->flow_meter_profiles); 1385 /* Hint libmlx5 to use PMD allocator for data plane resources */ 1386 mlx5_glue->dv_set_context_attr(sh->ctx, 1387 MLX5DV_CTX_ATTR_BUF_ALLOCATORS, 1388 (void *)((uintptr_t)&(struct mlx5dv_ctx_allocators){ 1389 .alloc = &mlx5_alloc_verbs_buf, 1390 .free = &mlx5_free_verbs_buf, 1391 .data = priv, 1392 })); 1393 /* Bring Ethernet device up. */ 1394 DRV_LOG(DEBUG, "port %u forcing Ethernet interface up", 1395 eth_dev->data->port_id); 1396 mlx5_set_link_up(eth_dev); 1397 /* 1398 * Even though the interrupt handler is not installed yet, 1399 * interrupts will still trigger on the async_fd from 1400 * Verbs context returned by ibv_open_device(). 1401 */ 1402 mlx5_link_update(eth_dev, 0); 1403 #ifdef HAVE_MLX5DV_DR_ESWITCH 1404 if (!(config->hca_attr.eswitch_manager && config->dv_flow_en && 1405 (switch_info->representor || switch_info->master))) 1406 config->dv_esw_en = 0; 1407 #else 1408 config->dv_esw_en = 0; 1409 #endif 1410 /* Detect minimal data bytes to inline. */ 1411 mlx5_set_min_inline(spawn, config); 1412 /* Store device configuration on private structure. */ 1413 priv->config = *config; 1414 /* Create context for virtual machine VLAN workaround. */ 1415 priv->vmwa_context = mlx5_vlan_vmwa_init(eth_dev, spawn->ifindex); 1416 if (config->dv_flow_en) { 1417 err = mlx5_alloc_shared_dr(priv); 1418 if (err) 1419 goto error; 1420 } 1421 if (config->devx && config->dv_flow_en && config->dest_tir) { 1422 priv->obj_ops = devx_obj_ops; 1423 priv->obj_ops.drop_action_create = 1424 ibv_obj_ops.drop_action_create; 1425 priv->obj_ops.drop_action_destroy = 1426 ibv_obj_ops.drop_action_destroy; 1427 #ifndef HAVE_MLX5DV_DEVX_UAR_OFFSET 1428 priv->obj_ops.txq_obj_modify = ibv_obj_ops.txq_obj_modify; 1429 #else 1430 if (config->dv_esw_en) 1431 priv->obj_ops.txq_obj_modify = 1432 ibv_obj_ops.txq_obj_modify; 1433 #endif 1434 /* Use specific wrappers for Tx object. */ 1435 priv->obj_ops.txq_obj_new = mlx5_os_txq_obj_new; 1436 priv->obj_ops.txq_obj_release = mlx5_os_txq_obj_release; 1437 1438 } else { 1439 priv->obj_ops = ibv_obj_ops; 1440 } 1441 priv->drop_queue.hrxq = mlx5_drop_action_create(eth_dev); 1442 if (!priv->drop_queue.hrxq) 1443 goto error; 1444 /* Supported Verbs flow priority number detection. */ 1445 err = mlx5_flow_discover_priorities(eth_dev); 1446 if (err < 0) { 1447 err = -err; 1448 goto error; 1449 } 1450 priv->config.flow_prio = err; 1451 if (!priv->config.dv_esw_en && 1452 priv->config.dv_xmeta_en != MLX5_XMETA_MODE_LEGACY) { 1453 DRV_LOG(WARNING, "metadata mode %u is not supported " 1454 "(no E-Switch)", priv->config.dv_xmeta_en); 1455 priv->config.dv_xmeta_en = MLX5_XMETA_MODE_LEGACY; 1456 } 1457 mlx5_set_metadata_mask(eth_dev); 1458 if (priv->config.dv_xmeta_en != MLX5_XMETA_MODE_LEGACY && 1459 !priv->sh->dv_regc0_mask) { 1460 DRV_LOG(ERR, "metadata mode %u is not supported " 1461 "(no metadata reg_c[0] is available)", 1462 priv->config.dv_xmeta_en); 1463 err = ENOTSUP; 1464 goto error; 1465 } 1466 /* Query availability of metadata reg_c's. */ 1467 err = mlx5_flow_discover_mreg_c(eth_dev); 1468 if (err < 0) { 1469 err = -err; 1470 goto error; 1471 } 1472 if (!mlx5_flow_ext_mreg_supported(eth_dev)) { 1473 DRV_LOG(DEBUG, 1474 "port %u extensive metadata register is not supported", 1475 eth_dev->data->port_id); 1476 if (priv->config.dv_xmeta_en != MLX5_XMETA_MODE_LEGACY) { 1477 DRV_LOG(ERR, "metadata mode %u is not supported " 1478 "(no metadata registers available)", 1479 priv->config.dv_xmeta_en); 1480 err = ENOTSUP; 1481 goto error; 1482 } 1483 } 1484 if (priv->config.dv_flow_en && 1485 priv->config.dv_xmeta_en != MLX5_XMETA_MODE_LEGACY && 1486 mlx5_flow_ext_mreg_supported(eth_dev) && 1487 priv->sh->dv_regc0_mask) { 1488 priv->mreg_cp_tbl = mlx5_hlist_create(MLX5_FLOW_MREG_HNAME, 1489 MLX5_FLOW_MREG_HTABLE_SZ, 1490 0, 0, 1491 NULL, NULL, NULL); 1492 if (!priv->mreg_cp_tbl) { 1493 err = ENOMEM; 1494 goto error; 1495 } 1496 } 1497 mlx5_flow_counter_mode_config(eth_dev); 1498 return eth_dev; 1499 error: 1500 if (priv) { 1501 if (priv->mreg_cp_tbl) 1502 mlx5_hlist_destroy(priv->mreg_cp_tbl); 1503 if (priv->sh) 1504 mlx5_os_free_shared_dr(priv); 1505 if (priv->nl_socket_route >= 0) 1506 close(priv->nl_socket_route); 1507 if (priv->nl_socket_rdma >= 0) 1508 close(priv->nl_socket_rdma); 1509 if (priv->vmwa_context) 1510 mlx5_vlan_vmwa_exit(priv->vmwa_context); 1511 if (eth_dev && priv->drop_queue.hrxq) 1512 mlx5_drop_action_destroy(eth_dev); 1513 if (own_domain_id) 1514 claim_zero(rte_eth_switch_domain_free(priv->domain_id)); 1515 mlx5_free(priv); 1516 if (eth_dev != NULL) 1517 eth_dev->data->dev_private = NULL; 1518 } 1519 if (eth_dev != NULL) { 1520 /* mac_addrs must not be freed alone because part of 1521 * dev_private 1522 **/ 1523 eth_dev->data->mac_addrs = NULL; 1524 rte_eth_dev_release_port(eth_dev); 1525 } 1526 if (sh) 1527 mlx5_free_shared_dev_ctx(sh); 1528 MLX5_ASSERT(err > 0); 1529 rte_errno = err; 1530 return NULL; 1531 } 1532 1533 /** 1534 * Comparison callback to sort device data. 1535 * 1536 * This is meant to be used with qsort(). 1537 * 1538 * @param a[in] 1539 * Pointer to pointer to first data object. 1540 * @param b[in] 1541 * Pointer to pointer to second data object. 1542 * 1543 * @return 1544 * 0 if both objects are equal, less than 0 if the first argument is less 1545 * than the second, greater than 0 otherwise. 1546 */ 1547 static int 1548 mlx5_dev_spawn_data_cmp(const void *a, const void *b) 1549 { 1550 const struct mlx5_switch_info *si_a = 1551 &((const struct mlx5_dev_spawn_data *)a)->info; 1552 const struct mlx5_switch_info *si_b = 1553 &((const struct mlx5_dev_spawn_data *)b)->info; 1554 int ret; 1555 1556 /* Master device first. */ 1557 ret = si_b->master - si_a->master; 1558 if (ret) 1559 return ret; 1560 /* Then representor devices. */ 1561 ret = si_b->representor - si_a->representor; 1562 if (ret) 1563 return ret; 1564 /* Unidentified devices come last in no specific order. */ 1565 if (!si_a->representor) 1566 return 0; 1567 /* Order representors by name. */ 1568 return si_a->port_name - si_b->port_name; 1569 } 1570 1571 /** 1572 * Match PCI information for possible slaves of bonding device. 1573 * 1574 * @param[in] ibv_dev 1575 * Pointer to Infiniband device structure. 1576 * @param[in] pci_dev 1577 * Pointer to PCI device structure to match PCI address. 1578 * @param[in] nl_rdma 1579 * Netlink RDMA group socket handle. 1580 * 1581 * @return 1582 * negative value if no bonding device found, otherwise 1583 * positive index of slave PF in bonding. 1584 */ 1585 static int 1586 mlx5_device_bond_pci_match(const struct ibv_device *ibv_dev, 1587 const struct rte_pci_device *pci_dev, 1588 int nl_rdma) 1589 { 1590 char ifname[IF_NAMESIZE + 1]; 1591 unsigned int ifindex; 1592 unsigned int np, i; 1593 FILE *file = NULL; 1594 int pf = -1; 1595 1596 /* 1597 * Try to get master device name. If something goes 1598 * wrong suppose the lack of kernel support and no 1599 * bonding devices. 1600 */ 1601 if (nl_rdma < 0) 1602 return -1; 1603 if (!strstr(ibv_dev->name, "bond")) 1604 return -1; 1605 np = mlx5_nl_portnum(nl_rdma, ibv_dev->name); 1606 if (!np) 1607 return -1; 1608 /* 1609 * The Master device might not be on the predefined 1610 * port (not on port index 1, it is not garanted), 1611 * we have to scan all Infiniband device port and 1612 * find master. 1613 */ 1614 for (i = 1; i <= np; ++i) { 1615 /* Check whether Infiniband port is populated. */ 1616 ifindex = mlx5_nl_ifindex(nl_rdma, ibv_dev->name, i); 1617 if (!ifindex) 1618 continue; 1619 if (!if_indextoname(ifindex, ifname)) 1620 continue; 1621 /* Try to read bonding slave names from sysfs. */ 1622 MKSTR(slaves, 1623 "/sys/class/net/%s/master/bonding/slaves", ifname); 1624 file = fopen(slaves, "r"); 1625 if (file) 1626 break; 1627 } 1628 if (!file) 1629 return -1; 1630 /* Use safe format to check maximal buffer length. */ 1631 MLX5_ASSERT(atol(RTE_STR(IF_NAMESIZE)) == IF_NAMESIZE); 1632 while (fscanf(file, "%" RTE_STR(IF_NAMESIZE) "s", ifname) == 1) { 1633 char tmp_str[IF_NAMESIZE + 32]; 1634 struct rte_pci_addr pci_addr; 1635 struct mlx5_switch_info info; 1636 1637 /* Process slave interface names in the loop. */ 1638 snprintf(tmp_str, sizeof(tmp_str), 1639 "/sys/class/net/%s", ifname); 1640 if (mlx5_dev_to_pci_addr(tmp_str, &pci_addr)) { 1641 DRV_LOG(WARNING, "can not get PCI address" 1642 " for netdev \"%s\"", ifname); 1643 continue; 1644 } 1645 if (pci_dev->addr.domain != pci_addr.domain || 1646 pci_dev->addr.bus != pci_addr.bus || 1647 pci_dev->addr.devid != pci_addr.devid || 1648 pci_dev->addr.function != pci_addr.function) 1649 continue; 1650 /* Slave interface PCI address match found. */ 1651 fclose(file); 1652 snprintf(tmp_str, sizeof(tmp_str), 1653 "/sys/class/net/%s/phys_port_name", ifname); 1654 file = fopen(tmp_str, "rb"); 1655 if (!file) 1656 break; 1657 info.name_type = MLX5_PHYS_PORT_NAME_TYPE_NOTSET; 1658 if (fscanf(file, "%32s", tmp_str) == 1) 1659 mlx5_translate_port_name(tmp_str, &info); 1660 if (info.name_type == MLX5_PHYS_PORT_NAME_TYPE_LEGACY || 1661 info.name_type == MLX5_PHYS_PORT_NAME_TYPE_UPLINK) 1662 pf = info.port_name; 1663 break; 1664 } 1665 if (file) 1666 fclose(file); 1667 return pf; 1668 } 1669 1670 /** 1671 * DPDK callback to register a PCI device. 1672 * 1673 * This function spawns Ethernet devices out of a given PCI device. 1674 * 1675 * @param[in] pci_drv 1676 * PCI driver structure (mlx5_driver). 1677 * @param[in] pci_dev 1678 * PCI device information. 1679 * 1680 * @return 1681 * 0 on success, a negative errno value otherwise and rte_errno is set. 1682 */ 1683 int 1684 mlx5_os_pci_probe(struct rte_pci_driver *pci_drv __rte_unused, 1685 struct rte_pci_device *pci_dev) 1686 { 1687 struct ibv_device **ibv_list; 1688 /* 1689 * Number of found IB Devices matching with requested PCI BDF. 1690 * nd != 1 means there are multiple IB devices over the same 1691 * PCI device and we have representors and master. 1692 */ 1693 unsigned int nd = 0; 1694 /* 1695 * Number of found IB device Ports. nd = 1 and np = 1..n means 1696 * we have the single multiport IB device, and there may be 1697 * representors attached to some of found ports. 1698 */ 1699 unsigned int np = 0; 1700 /* 1701 * Number of DPDK ethernet devices to Spawn - either over 1702 * multiple IB devices or multiple ports of single IB device. 1703 * Actually this is the number of iterations to spawn. 1704 */ 1705 unsigned int ns = 0; 1706 /* 1707 * Bonding device 1708 * < 0 - no bonding device (single one) 1709 * >= 0 - bonding device (value is slave PF index) 1710 */ 1711 int bd = -1; 1712 struct mlx5_dev_spawn_data *list = NULL; 1713 struct mlx5_dev_config dev_config; 1714 unsigned int dev_config_vf; 1715 int ret; 1716 1717 if (rte_eal_process_type() == RTE_PROC_PRIMARY) 1718 mlx5_pmd_socket_init(); 1719 ret = mlx5_init_once(); 1720 if (ret) { 1721 DRV_LOG(ERR, "unable to init PMD global data: %s", 1722 strerror(rte_errno)); 1723 return -rte_errno; 1724 } 1725 errno = 0; 1726 ibv_list = mlx5_glue->get_device_list(&ret); 1727 if (!ibv_list) { 1728 rte_errno = errno ? errno : ENOSYS; 1729 DRV_LOG(ERR, "cannot list devices, is ib_uverbs loaded?"); 1730 return -rte_errno; 1731 } 1732 /* 1733 * First scan the list of all Infiniband devices to find 1734 * matching ones, gathering into the list. 1735 */ 1736 struct ibv_device *ibv_match[ret + 1]; 1737 int nl_route = mlx5_nl_init(NETLINK_ROUTE); 1738 int nl_rdma = mlx5_nl_init(NETLINK_RDMA); 1739 unsigned int i; 1740 1741 while (ret-- > 0) { 1742 struct rte_pci_addr pci_addr; 1743 1744 DRV_LOG(DEBUG, "checking device \"%s\"", ibv_list[ret]->name); 1745 bd = mlx5_device_bond_pci_match 1746 (ibv_list[ret], pci_dev, nl_rdma); 1747 if (bd >= 0) { 1748 /* 1749 * Bonding device detected. Only one match is allowed, 1750 * the bonding is supported over multi-port IB device, 1751 * there should be no matches on representor PCI 1752 * functions or non VF LAG bonding devices with 1753 * specified address. 1754 */ 1755 if (nd) { 1756 DRV_LOG(ERR, 1757 "multiple PCI match on bonding device" 1758 "\"%s\" found", ibv_list[ret]->name); 1759 rte_errno = ENOENT; 1760 ret = -rte_errno; 1761 goto exit; 1762 } 1763 DRV_LOG(INFO, "PCI information matches for" 1764 " slave %d bonding device \"%s\"", 1765 bd, ibv_list[ret]->name); 1766 ibv_match[nd++] = ibv_list[ret]; 1767 break; 1768 } 1769 if (mlx5_dev_to_pci_addr 1770 (ibv_list[ret]->ibdev_path, &pci_addr)) 1771 continue; 1772 if (pci_dev->addr.domain != pci_addr.domain || 1773 pci_dev->addr.bus != pci_addr.bus || 1774 pci_dev->addr.devid != pci_addr.devid || 1775 pci_dev->addr.function != pci_addr.function) 1776 continue; 1777 DRV_LOG(INFO, "PCI information matches for device \"%s\"", 1778 ibv_list[ret]->name); 1779 ibv_match[nd++] = ibv_list[ret]; 1780 } 1781 ibv_match[nd] = NULL; 1782 if (!nd) { 1783 /* No device matches, just complain and bail out. */ 1784 DRV_LOG(WARNING, 1785 "no Verbs device matches PCI device " PCI_PRI_FMT "," 1786 " are kernel drivers loaded?", 1787 pci_dev->addr.domain, pci_dev->addr.bus, 1788 pci_dev->addr.devid, pci_dev->addr.function); 1789 rte_errno = ENOENT; 1790 ret = -rte_errno; 1791 goto exit; 1792 } 1793 if (nd == 1) { 1794 /* 1795 * Found single matching device may have multiple ports. 1796 * Each port may be representor, we have to check the port 1797 * number and check the representors existence. 1798 */ 1799 if (nl_rdma >= 0) 1800 np = mlx5_nl_portnum(nl_rdma, ibv_match[0]->name); 1801 if (!np) 1802 DRV_LOG(WARNING, "can not get IB device \"%s\"" 1803 " ports number", ibv_match[0]->name); 1804 if (bd >= 0 && !np) { 1805 DRV_LOG(ERR, "can not get ports" 1806 " for bonding device"); 1807 rte_errno = ENOENT; 1808 ret = -rte_errno; 1809 goto exit; 1810 } 1811 } 1812 #ifndef HAVE_MLX5DV_DR_DEVX_PORT 1813 if (bd >= 0) { 1814 /* 1815 * This may happen if there is VF LAG kernel support and 1816 * application is compiled with older rdma_core library. 1817 */ 1818 DRV_LOG(ERR, 1819 "No kernel/verbs support for VF LAG bonding found."); 1820 rte_errno = ENOTSUP; 1821 ret = -rte_errno; 1822 goto exit; 1823 } 1824 #endif 1825 /* 1826 * Now we can determine the maximal 1827 * amount of devices to be spawned. 1828 */ 1829 list = mlx5_malloc(MLX5_MEM_ZERO, 1830 sizeof(struct mlx5_dev_spawn_data) * 1831 (np ? np : nd), 1832 RTE_CACHE_LINE_SIZE, SOCKET_ID_ANY); 1833 if (!list) { 1834 DRV_LOG(ERR, "spawn data array allocation failure"); 1835 rte_errno = ENOMEM; 1836 ret = -rte_errno; 1837 goto exit; 1838 } 1839 if (bd >= 0 || np > 1) { 1840 /* 1841 * Single IB device with multiple ports found, 1842 * it may be E-Switch master device and representors. 1843 * We have to perform identification through the ports. 1844 */ 1845 MLX5_ASSERT(nl_rdma >= 0); 1846 MLX5_ASSERT(ns == 0); 1847 MLX5_ASSERT(nd == 1); 1848 MLX5_ASSERT(np); 1849 for (i = 1; i <= np; ++i) { 1850 list[ns].max_port = np; 1851 list[ns].phys_port = i; 1852 list[ns].phys_dev = ibv_match[0]; 1853 list[ns].eth_dev = NULL; 1854 list[ns].pci_dev = pci_dev; 1855 list[ns].pf_bond = bd; 1856 list[ns].ifindex = mlx5_nl_ifindex 1857 (nl_rdma, 1858 mlx5_os_get_dev_device_name 1859 (list[ns].phys_dev), i); 1860 if (!list[ns].ifindex) { 1861 /* 1862 * No network interface index found for the 1863 * specified port, it means there is no 1864 * representor on this port. It's OK, 1865 * there can be disabled ports, for example 1866 * if sriov_numvfs < sriov_totalvfs. 1867 */ 1868 continue; 1869 } 1870 ret = -1; 1871 if (nl_route >= 0) 1872 ret = mlx5_nl_switch_info 1873 (nl_route, 1874 list[ns].ifindex, 1875 &list[ns].info); 1876 if (ret || (!list[ns].info.representor && 1877 !list[ns].info.master)) { 1878 /* 1879 * We failed to recognize representors with 1880 * Netlink, let's try to perform the task 1881 * with sysfs. 1882 */ 1883 ret = mlx5_sysfs_switch_info 1884 (list[ns].ifindex, 1885 &list[ns].info); 1886 } 1887 if (!ret && bd >= 0) { 1888 switch (list[ns].info.name_type) { 1889 case MLX5_PHYS_PORT_NAME_TYPE_UPLINK: 1890 if (list[ns].info.port_name == bd) 1891 ns++; 1892 break; 1893 case MLX5_PHYS_PORT_NAME_TYPE_PFHPF: 1894 /* Fallthrough */ 1895 case MLX5_PHYS_PORT_NAME_TYPE_PFVF: 1896 if (list[ns].info.pf_num == bd) 1897 ns++; 1898 break; 1899 default: 1900 break; 1901 } 1902 continue; 1903 } 1904 if (!ret && (list[ns].info.representor ^ 1905 list[ns].info.master)) 1906 ns++; 1907 } 1908 if (!ns) { 1909 DRV_LOG(ERR, 1910 "unable to recognize master/representors" 1911 " on the IB device with multiple ports"); 1912 rte_errno = ENOENT; 1913 ret = -rte_errno; 1914 goto exit; 1915 } 1916 } else { 1917 /* 1918 * The existence of several matching entries (nd > 1) means 1919 * port representors have been instantiated. No existing Verbs 1920 * call nor sysfs entries can tell them apart, this can only 1921 * be done through Netlink calls assuming kernel drivers are 1922 * recent enough to support them. 1923 * 1924 * In the event of identification failure through Netlink, 1925 * try again through sysfs, then: 1926 * 1927 * 1. A single IB device matches (nd == 1) with single 1928 * port (np=0/1) and is not a representor, assume 1929 * no switch support. 1930 * 1931 * 2. Otherwise no safe assumptions can be made; 1932 * complain louder and bail out. 1933 */ 1934 for (i = 0; i != nd; ++i) { 1935 memset(&list[ns].info, 0, sizeof(list[ns].info)); 1936 list[ns].max_port = 1; 1937 list[ns].phys_port = 1; 1938 list[ns].phys_dev = ibv_match[i]; 1939 list[ns].eth_dev = NULL; 1940 list[ns].pci_dev = pci_dev; 1941 list[ns].pf_bond = -1; 1942 list[ns].ifindex = 0; 1943 if (nl_rdma >= 0) 1944 list[ns].ifindex = mlx5_nl_ifindex 1945 (nl_rdma, 1946 mlx5_os_get_dev_device_name 1947 (list[ns].phys_dev), 1); 1948 if (!list[ns].ifindex) { 1949 char ifname[IF_NAMESIZE]; 1950 1951 /* 1952 * Netlink failed, it may happen with old 1953 * ib_core kernel driver (before 4.16). 1954 * We can assume there is old driver because 1955 * here we are processing single ports IB 1956 * devices. Let's try sysfs to retrieve 1957 * the ifindex. The method works for 1958 * master device only. 1959 */ 1960 if (nd > 1) { 1961 /* 1962 * Multiple devices found, assume 1963 * representors, can not distinguish 1964 * master/representor and retrieve 1965 * ifindex via sysfs. 1966 */ 1967 continue; 1968 } 1969 ret = mlx5_get_ifname_sysfs 1970 (ibv_match[i]->ibdev_path, ifname); 1971 if (!ret) 1972 list[ns].ifindex = 1973 if_nametoindex(ifname); 1974 if (!list[ns].ifindex) { 1975 /* 1976 * No network interface index found 1977 * for the specified device, it means 1978 * there it is neither representor 1979 * nor master. 1980 */ 1981 continue; 1982 } 1983 } 1984 ret = -1; 1985 if (nl_route >= 0) 1986 ret = mlx5_nl_switch_info 1987 (nl_route, 1988 list[ns].ifindex, 1989 &list[ns].info); 1990 if (ret || (!list[ns].info.representor && 1991 !list[ns].info.master)) { 1992 /* 1993 * We failed to recognize representors with 1994 * Netlink, let's try to perform the task 1995 * with sysfs. 1996 */ 1997 ret = mlx5_sysfs_switch_info 1998 (list[ns].ifindex, 1999 &list[ns].info); 2000 } 2001 if (!ret && (list[ns].info.representor ^ 2002 list[ns].info.master)) { 2003 ns++; 2004 } else if ((nd == 1) && 2005 !list[ns].info.representor && 2006 !list[ns].info.master) { 2007 /* 2008 * Single IB device with 2009 * one physical port and 2010 * attached network device. 2011 * May be SRIOV is not enabled 2012 * or there is no representors. 2013 */ 2014 DRV_LOG(INFO, "no E-Switch support detected"); 2015 ns++; 2016 break; 2017 } 2018 } 2019 if (!ns) { 2020 DRV_LOG(ERR, 2021 "unable to recognize master/representors" 2022 " on the multiple IB devices"); 2023 rte_errno = ENOENT; 2024 ret = -rte_errno; 2025 goto exit; 2026 } 2027 } 2028 MLX5_ASSERT(ns); 2029 /* 2030 * Sort list to probe devices in natural order for users convenience 2031 * (i.e. master first, then representors from lowest to highest ID). 2032 */ 2033 qsort(list, ns, sizeof(*list), mlx5_dev_spawn_data_cmp); 2034 /* Device specific configuration. */ 2035 switch (pci_dev->id.device_id) { 2036 case PCI_DEVICE_ID_MELLANOX_CONNECTX4VF: 2037 case PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF: 2038 case PCI_DEVICE_ID_MELLANOX_CONNECTX5VF: 2039 case PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF: 2040 case PCI_DEVICE_ID_MELLANOX_CONNECTX5BFVF: 2041 case PCI_DEVICE_ID_MELLANOX_CONNECTX6VF: 2042 case PCI_DEVICE_ID_MELLANOX_CONNECTX6DXVF: 2043 dev_config_vf = 1; 2044 break; 2045 default: 2046 dev_config_vf = 0; 2047 break; 2048 } 2049 for (i = 0; i != ns; ++i) { 2050 uint32_t restore; 2051 2052 /* Default configuration. */ 2053 memset(&dev_config, 0, sizeof(struct mlx5_dev_config)); 2054 dev_config.vf = dev_config_vf; 2055 dev_config.mps = MLX5_ARG_UNSET; 2056 dev_config.dbnc = MLX5_ARG_UNSET; 2057 dev_config.rx_vec_en = 1; 2058 dev_config.txq_inline_max = MLX5_ARG_UNSET; 2059 dev_config.txq_inline_min = MLX5_ARG_UNSET; 2060 dev_config.txq_inline_mpw = MLX5_ARG_UNSET; 2061 dev_config.txqs_inline = MLX5_ARG_UNSET; 2062 dev_config.vf_nl_en = 1; 2063 dev_config.mr_ext_memseg_en = 1; 2064 dev_config.mprq.max_memcpy_len = MLX5_MPRQ_MEMCPY_DEFAULT_LEN; 2065 dev_config.mprq.min_rxqs_num = MLX5_MPRQ_MIN_RXQS; 2066 dev_config.dv_esw_en = 1; 2067 dev_config.dv_flow_en = 1; 2068 dev_config.decap_en = 1; 2069 dev_config.log_hp_size = MLX5_ARG_UNSET; 2070 list[i].eth_dev = mlx5_dev_spawn(&pci_dev->device, 2071 &list[i], 2072 &dev_config); 2073 if (!list[i].eth_dev) { 2074 if (rte_errno != EBUSY && rte_errno != EEXIST) 2075 break; 2076 /* Device is disabled or already spawned. Ignore it. */ 2077 continue; 2078 } 2079 restore = list[i].eth_dev->data->dev_flags; 2080 rte_eth_copy_pci_info(list[i].eth_dev, pci_dev); 2081 /* Restore non-PCI flags cleared by the above call. */ 2082 list[i].eth_dev->data->dev_flags |= restore; 2083 rte_eth_dev_probing_finish(list[i].eth_dev); 2084 } 2085 if (i != ns) { 2086 DRV_LOG(ERR, 2087 "probe of PCI device " PCI_PRI_FMT " aborted after" 2088 " encountering an error: %s", 2089 pci_dev->addr.domain, pci_dev->addr.bus, 2090 pci_dev->addr.devid, pci_dev->addr.function, 2091 strerror(rte_errno)); 2092 ret = -rte_errno; 2093 /* Roll back. */ 2094 while (i--) { 2095 if (!list[i].eth_dev) 2096 continue; 2097 mlx5_dev_close(list[i].eth_dev); 2098 /* mac_addrs must not be freed because in dev_private */ 2099 list[i].eth_dev->data->mac_addrs = NULL; 2100 claim_zero(rte_eth_dev_release_port(list[i].eth_dev)); 2101 } 2102 /* Restore original error. */ 2103 rte_errno = -ret; 2104 } else { 2105 ret = 0; 2106 } 2107 exit: 2108 /* 2109 * Do the routine cleanup: 2110 * - close opened Netlink sockets 2111 * - free allocated spawn data array 2112 * - free the Infiniband device list 2113 */ 2114 if (nl_rdma >= 0) 2115 close(nl_rdma); 2116 if (nl_route >= 0) 2117 close(nl_route); 2118 if (list) 2119 mlx5_free(list); 2120 MLX5_ASSERT(ibv_list); 2121 mlx5_glue->free_device_list(ibv_list); 2122 return ret; 2123 } 2124 2125 static int 2126 mlx5_config_doorbell_mapping_env(const struct mlx5_dev_config *config) 2127 { 2128 char *env; 2129 int value; 2130 2131 MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY); 2132 /* Get environment variable to store. */ 2133 env = getenv(MLX5_SHUT_UP_BF); 2134 value = env ? !!strcmp(env, "0") : MLX5_ARG_UNSET; 2135 if (config->dbnc == MLX5_ARG_UNSET) 2136 setenv(MLX5_SHUT_UP_BF, MLX5_SHUT_UP_BF_DEFAULT, 1); 2137 else 2138 setenv(MLX5_SHUT_UP_BF, 2139 config->dbnc == MLX5_TXDB_NCACHED ? "1" : "0", 1); 2140 return value; 2141 } 2142 2143 static void 2144 mlx5_restore_doorbell_mapping_env(int value) 2145 { 2146 MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY); 2147 /* Restore the original environment variable state. */ 2148 if (value == MLX5_ARG_UNSET) 2149 unsetenv(MLX5_SHUT_UP_BF); 2150 else 2151 setenv(MLX5_SHUT_UP_BF, value ? "1" : "0", 1); 2152 } 2153 2154 /** 2155 * Extract pdn of PD object using DV API. 2156 * 2157 * @param[in] pd 2158 * Pointer to the verbs PD object. 2159 * @param[out] pdn 2160 * Pointer to the PD object number variable. 2161 * 2162 * @return 2163 * 0 on success, error value otherwise. 2164 */ 2165 int 2166 mlx5_os_get_pdn(void *pd, uint32_t *pdn) 2167 { 2168 #ifdef HAVE_IBV_FLOW_DV_SUPPORT 2169 struct mlx5dv_obj obj; 2170 struct mlx5dv_pd pd_info; 2171 int ret = 0; 2172 2173 obj.pd.in = pd; 2174 obj.pd.out = &pd_info; 2175 ret = mlx5_glue->dv_init_obj(&obj, MLX5DV_OBJ_PD); 2176 if (ret) { 2177 DRV_LOG(DEBUG, "Fail to get PD object info"); 2178 return ret; 2179 } 2180 *pdn = pd_info.pdn; 2181 return 0; 2182 #else 2183 (void)pd; 2184 (void)pdn; 2185 return -ENOTSUP; 2186 #endif /* HAVE_IBV_FLOW_DV_SUPPORT */ 2187 } 2188 2189 /** 2190 * Function API to open IB device. 2191 * 2192 * This function calls the Linux glue APIs to open a device. 2193 * 2194 * @param[in] spawn 2195 * Pointer to the IB device attributes (name, port, etc). 2196 * @param[out] config 2197 * Pointer to device configuration structure. 2198 * @param[out] sh 2199 * Pointer to shared context structure. 2200 * 2201 * @return 2202 * 0 on success, a positive error value otherwise. 2203 */ 2204 int 2205 mlx5_os_open_device(const struct mlx5_dev_spawn_data *spawn, 2206 const struct mlx5_dev_config *config, 2207 struct mlx5_dev_ctx_shared *sh) 2208 { 2209 int dbmap_env; 2210 int err = 0; 2211 2212 sh->numa_node = spawn->pci_dev->device.numa_node; 2213 pthread_mutex_init(&sh->txpp.mutex, NULL); 2214 /* 2215 * Configure environment variable "MLX5_BF_SHUT_UP" 2216 * before the device creation. The rdma_core library 2217 * checks the variable at device creation and 2218 * stores the result internally. 2219 */ 2220 dbmap_env = mlx5_config_doorbell_mapping_env(config); 2221 /* Try to open IB device with DV first, then usual Verbs. */ 2222 errno = 0; 2223 sh->ctx = mlx5_glue->dv_open_device(spawn->phys_dev); 2224 if (sh->ctx) { 2225 sh->devx = 1; 2226 DRV_LOG(DEBUG, "DevX is supported"); 2227 /* The device is created, no need for environment. */ 2228 mlx5_restore_doorbell_mapping_env(dbmap_env); 2229 } else { 2230 /* The environment variable is still configured. */ 2231 sh->ctx = mlx5_glue->open_device(spawn->phys_dev); 2232 err = errno ? errno : ENODEV; 2233 /* 2234 * The environment variable is not needed anymore, 2235 * all device creation attempts are completed. 2236 */ 2237 mlx5_restore_doorbell_mapping_env(dbmap_env); 2238 if (!sh->ctx) 2239 return err; 2240 DRV_LOG(DEBUG, "DevX is NOT supported"); 2241 err = 0; 2242 } 2243 return err; 2244 } 2245 2246 /** 2247 * Install shared asynchronous device events handler. 2248 * This function is implemented to support event sharing 2249 * between multiple ports of single IB device. 2250 * 2251 * @param sh 2252 * Pointer to mlx5_dev_ctx_shared object. 2253 */ 2254 void 2255 mlx5_os_dev_shared_handler_install(struct mlx5_dev_ctx_shared *sh) 2256 { 2257 int ret; 2258 int flags; 2259 2260 sh->intr_handle.fd = -1; 2261 flags = fcntl(((struct ibv_context *)sh->ctx)->async_fd, F_GETFL); 2262 ret = fcntl(((struct ibv_context *)sh->ctx)->async_fd, 2263 F_SETFL, flags | O_NONBLOCK); 2264 if (ret) { 2265 DRV_LOG(INFO, "failed to change file descriptor async event" 2266 " queue"); 2267 } else { 2268 sh->intr_handle.fd = ((struct ibv_context *)sh->ctx)->async_fd; 2269 sh->intr_handle.type = RTE_INTR_HANDLE_EXT; 2270 if (rte_intr_callback_register(&sh->intr_handle, 2271 mlx5_dev_interrupt_handler, sh)) { 2272 DRV_LOG(INFO, "Fail to install the shared interrupt."); 2273 sh->intr_handle.fd = -1; 2274 } 2275 } 2276 if (sh->devx) { 2277 #ifdef HAVE_IBV_DEVX_ASYNC 2278 sh->intr_handle_devx.fd = -1; 2279 sh->devx_comp = 2280 (void *)mlx5_glue->devx_create_cmd_comp(sh->ctx); 2281 struct mlx5dv_devx_cmd_comp *devx_comp = sh->devx_comp; 2282 if (!devx_comp) { 2283 DRV_LOG(INFO, "failed to allocate devx_comp."); 2284 return; 2285 } 2286 flags = fcntl(devx_comp->fd, F_GETFL); 2287 ret = fcntl(devx_comp->fd, F_SETFL, flags | O_NONBLOCK); 2288 if (ret) { 2289 DRV_LOG(INFO, "failed to change file descriptor" 2290 " devx comp"); 2291 return; 2292 } 2293 sh->intr_handle_devx.fd = devx_comp->fd; 2294 sh->intr_handle_devx.type = RTE_INTR_HANDLE_EXT; 2295 if (rte_intr_callback_register(&sh->intr_handle_devx, 2296 mlx5_dev_interrupt_handler_devx, sh)) { 2297 DRV_LOG(INFO, "Fail to install the devx shared" 2298 " interrupt."); 2299 sh->intr_handle_devx.fd = -1; 2300 } 2301 #endif /* HAVE_IBV_DEVX_ASYNC */ 2302 } 2303 } 2304 2305 /** 2306 * Uninstall shared asynchronous device events handler. 2307 * This function is implemented to support event sharing 2308 * between multiple ports of single IB device. 2309 * 2310 * @param dev 2311 * Pointer to mlx5_dev_ctx_shared object. 2312 */ 2313 void 2314 mlx5_os_dev_shared_handler_uninstall(struct mlx5_dev_ctx_shared *sh) 2315 { 2316 if (sh->intr_handle.fd >= 0) 2317 mlx5_intr_callback_unregister(&sh->intr_handle, 2318 mlx5_dev_interrupt_handler, sh); 2319 #ifdef HAVE_IBV_DEVX_ASYNC 2320 if (sh->intr_handle_devx.fd >= 0) 2321 rte_intr_callback_unregister(&sh->intr_handle_devx, 2322 mlx5_dev_interrupt_handler_devx, sh); 2323 if (sh->devx_comp) 2324 mlx5_glue->devx_destroy_cmd_comp(sh->devx_comp); 2325 #endif 2326 } 2327 2328 /** 2329 * Read statistics by a named counter. 2330 * 2331 * @param[in] priv 2332 * Pointer to the private device data structure. 2333 * @param[in] ctr_name 2334 * Pointer to the name of the statistic counter to read 2335 * @param[out] stat 2336 * Pointer to read statistic value. 2337 * @return 2338 * 0 on success and stat is valud, 1 if failed to read the value 2339 * rte_errno is set. 2340 * 2341 */ 2342 int 2343 mlx5_os_read_dev_stat(struct mlx5_priv *priv, const char *ctr_name, 2344 uint64_t *stat) 2345 { 2346 int fd; 2347 2348 if (priv->sh) { 2349 MKSTR(path, "%s/ports/%d/hw_counters/%s", 2350 priv->sh->ibdev_path, 2351 priv->dev_port, 2352 ctr_name); 2353 fd = open(path, O_RDONLY); 2354 /* 2355 * in switchdev the file location is not per port 2356 * but rather in <ibdev_path>/hw_counters/<file_name>. 2357 */ 2358 if (fd == -1) { 2359 MKSTR(path1, "%s/hw_counters/%s", 2360 priv->sh->ibdev_path, 2361 ctr_name); 2362 fd = open(path1, O_RDONLY); 2363 } 2364 if (fd != -1) { 2365 char buf[21] = {'\0'}; 2366 ssize_t n = read(fd, buf, sizeof(buf)); 2367 2368 close(fd); 2369 if (n != -1) { 2370 *stat = strtoull(buf, NULL, 10); 2371 return 0; 2372 } 2373 } 2374 } 2375 *stat = 0; 2376 return 1; 2377 } 2378 2379 /** 2380 * Set the reg_mr and dereg_mr call backs 2381 * 2382 * @param reg_mr_cb[out] 2383 * Pointer to reg_mr func 2384 * @param dereg_mr_cb[out] 2385 * Pointer to dereg_mr func 2386 * 2387 */ 2388 void 2389 mlx5_os_set_reg_mr_cb(mlx5_reg_mr_t *reg_mr_cb, 2390 mlx5_dereg_mr_t *dereg_mr_cb) 2391 { 2392 *reg_mr_cb = mlx5_verbs_ops.reg_mr; 2393 *dereg_mr_cb = mlx5_verbs_ops.dereg_mr; 2394 } 2395 2396 /** 2397 * Remove a MAC address from device 2398 * 2399 * @param dev 2400 * Pointer to Ethernet device structure. 2401 * @param index 2402 * MAC address index. 2403 */ 2404 void 2405 mlx5_os_mac_addr_remove(struct rte_eth_dev *dev, uint32_t index) 2406 { 2407 struct mlx5_priv *priv = dev->data->dev_private; 2408 const int vf = priv->config.vf; 2409 2410 if (vf) 2411 mlx5_nl_mac_addr_remove(priv->nl_socket_route, 2412 mlx5_ifindex(dev), priv->mac_own, 2413 &dev->data->mac_addrs[index], index); 2414 } 2415 2416 /** 2417 * Adds a MAC address to the device 2418 * 2419 * @param dev 2420 * Pointer to Ethernet device structure. 2421 * @param mac_addr 2422 * MAC address to register. 2423 * @param index 2424 * MAC address index. 2425 * 2426 * @return 2427 * 0 on success, a negative errno value otherwise 2428 */ 2429 int 2430 mlx5_os_mac_addr_add(struct rte_eth_dev *dev, struct rte_ether_addr *mac, 2431 uint32_t index) 2432 { 2433 struct mlx5_priv *priv = dev->data->dev_private; 2434 const int vf = priv->config.vf; 2435 int ret = 0; 2436 2437 if (vf) 2438 ret = mlx5_nl_mac_addr_add(priv->nl_socket_route, 2439 mlx5_ifindex(dev), priv->mac_own, 2440 mac, index); 2441 return ret; 2442 } 2443 2444 /** 2445 * Modify a VF MAC address 2446 * 2447 * @param priv 2448 * Pointer to device private data. 2449 * @param mac_addr 2450 * MAC address to modify into. 2451 * @param iface_idx 2452 * Net device interface index 2453 * @param vf_index 2454 * VF index 2455 * 2456 * @return 2457 * 0 on success, a negative errno value otherwise 2458 */ 2459 int 2460 mlx5_os_vf_mac_addr_modify(struct mlx5_priv *priv, 2461 unsigned int iface_idx, 2462 struct rte_ether_addr *mac_addr, 2463 int vf_index) 2464 { 2465 return mlx5_nl_vf_mac_addr_modify 2466 (priv->nl_socket_route, iface_idx, mac_addr, vf_index); 2467 } 2468 2469 /** 2470 * Set device promiscuous mode 2471 * 2472 * @param dev 2473 * Pointer to Ethernet device structure. 2474 * @param enable 2475 * 0 - promiscuous is disabled, otherwise - enabled 2476 * 2477 * @return 2478 * 0 on success, a negative error value otherwise 2479 */ 2480 int 2481 mlx5_os_set_promisc(struct rte_eth_dev *dev, int enable) 2482 { 2483 struct mlx5_priv *priv = dev->data->dev_private; 2484 2485 return mlx5_nl_promisc(priv->nl_socket_route, 2486 mlx5_ifindex(dev), !!enable); 2487 } 2488 2489 /** 2490 * Set device promiscuous mode 2491 * 2492 * @param dev 2493 * Pointer to Ethernet device structure. 2494 * @param enable 2495 * 0 - all multicase is disabled, otherwise - enabled 2496 * 2497 * @return 2498 * 0 on success, a negative error value otherwise 2499 */ 2500 int 2501 mlx5_os_set_allmulti(struct rte_eth_dev *dev, int enable) 2502 { 2503 struct mlx5_priv *priv = dev->data->dev_private; 2504 2505 return mlx5_nl_allmulti(priv->nl_socket_route, 2506 mlx5_ifindex(dev), !!enable); 2507 } 2508 2509 /** 2510 * Flush device MAC addresses 2511 * 2512 * @param dev 2513 * Pointer to Ethernet device structure. 2514 * 2515 */ 2516 void 2517 mlx5_os_mac_addr_flush(struct rte_eth_dev *dev) 2518 { 2519 struct mlx5_priv *priv = dev->data->dev_private; 2520 2521 mlx5_nl_mac_addr_flush(priv->nl_socket_route, mlx5_ifindex(dev), 2522 dev->data->mac_addrs, 2523 MLX5_MAX_MAC_ADDRESSES, priv->mac_own); 2524 } 2525 2526 const struct eth_dev_ops mlx5_os_dev_ops = { 2527 .dev_configure = mlx5_dev_configure, 2528 .dev_start = mlx5_dev_start, 2529 .dev_stop = mlx5_dev_stop, 2530 .dev_set_link_down = mlx5_set_link_down, 2531 .dev_set_link_up = mlx5_set_link_up, 2532 .dev_close = mlx5_dev_close, 2533 .promiscuous_enable = mlx5_promiscuous_enable, 2534 .promiscuous_disable = mlx5_promiscuous_disable, 2535 .allmulticast_enable = mlx5_allmulticast_enable, 2536 .allmulticast_disable = mlx5_allmulticast_disable, 2537 .link_update = mlx5_link_update, 2538 .stats_get = mlx5_stats_get, 2539 .stats_reset = mlx5_stats_reset, 2540 .xstats_get = mlx5_xstats_get, 2541 .xstats_reset = mlx5_xstats_reset, 2542 .xstats_get_names = mlx5_xstats_get_names, 2543 .fw_version_get = mlx5_fw_version_get, 2544 .dev_infos_get = mlx5_dev_infos_get, 2545 .read_clock = mlx5_txpp_read_clock, 2546 .dev_supported_ptypes_get = mlx5_dev_supported_ptypes_get, 2547 .vlan_filter_set = mlx5_vlan_filter_set, 2548 .rx_queue_setup = mlx5_rx_queue_setup, 2549 .rx_hairpin_queue_setup = mlx5_rx_hairpin_queue_setup, 2550 .tx_queue_setup = mlx5_tx_queue_setup, 2551 .tx_hairpin_queue_setup = mlx5_tx_hairpin_queue_setup, 2552 .rx_queue_release = mlx5_rx_queue_release, 2553 .tx_queue_release = mlx5_tx_queue_release, 2554 .rx_queue_start = mlx5_rx_queue_start, 2555 .rx_queue_stop = mlx5_rx_queue_stop, 2556 .tx_queue_start = mlx5_tx_queue_start, 2557 .tx_queue_stop = mlx5_tx_queue_stop, 2558 .flow_ctrl_get = mlx5_dev_get_flow_ctrl, 2559 .flow_ctrl_set = mlx5_dev_set_flow_ctrl, 2560 .mac_addr_remove = mlx5_mac_addr_remove, 2561 .mac_addr_add = mlx5_mac_addr_add, 2562 .mac_addr_set = mlx5_mac_addr_set, 2563 .set_mc_addr_list = mlx5_set_mc_addr_list, 2564 .mtu_set = mlx5_dev_set_mtu, 2565 .vlan_strip_queue_set = mlx5_vlan_strip_queue_set, 2566 .vlan_offload_set = mlx5_vlan_offload_set, 2567 .reta_update = mlx5_dev_rss_reta_update, 2568 .reta_query = mlx5_dev_rss_reta_query, 2569 .rss_hash_update = mlx5_rss_hash_update, 2570 .rss_hash_conf_get = mlx5_rss_hash_conf_get, 2571 .filter_ctrl = mlx5_dev_filter_ctrl, 2572 .rxq_info_get = mlx5_rxq_info_get, 2573 .txq_info_get = mlx5_txq_info_get, 2574 .rx_burst_mode_get = mlx5_rx_burst_mode_get, 2575 .tx_burst_mode_get = mlx5_tx_burst_mode_get, 2576 .rx_queue_intr_enable = mlx5_rx_intr_enable, 2577 .rx_queue_intr_disable = mlx5_rx_intr_disable, 2578 .is_removed = mlx5_is_removed, 2579 .udp_tunnel_port_add = mlx5_udp_tunnel_port_add, 2580 .get_module_info = mlx5_get_module_info, 2581 .get_module_eeprom = mlx5_get_module_eeprom, 2582 .hairpin_cap_get = mlx5_hairpin_cap_get, 2583 .mtr_ops_get = mlx5_flow_meter_ops_get, 2584 .hairpin_bind = mlx5_hairpin_bind, 2585 .hairpin_unbind = mlx5_hairpin_unbind, 2586 .hairpin_get_peer_ports = mlx5_hairpin_get_peer_ports, 2587 .hairpin_queue_peer_update = mlx5_hairpin_queue_peer_update, 2588 .hairpin_queue_peer_bind = mlx5_hairpin_queue_peer_bind, 2589 .hairpin_queue_peer_unbind = mlx5_hairpin_queue_peer_unbind, 2590 }; 2591 2592 /* Available operations from secondary process. */ 2593 const struct eth_dev_ops mlx5_os_dev_sec_ops = { 2594 .stats_get = mlx5_stats_get, 2595 .stats_reset = mlx5_stats_reset, 2596 .xstats_get = mlx5_xstats_get, 2597 .xstats_reset = mlx5_xstats_reset, 2598 .xstats_get_names = mlx5_xstats_get_names, 2599 .fw_version_get = mlx5_fw_version_get, 2600 .dev_infos_get = mlx5_dev_infos_get, 2601 .read_clock = mlx5_txpp_read_clock, 2602 .rx_queue_start = mlx5_rx_queue_start, 2603 .rx_queue_stop = mlx5_rx_queue_stop, 2604 .tx_queue_start = mlx5_tx_queue_start, 2605 .tx_queue_stop = mlx5_tx_queue_stop, 2606 .rxq_info_get = mlx5_rxq_info_get, 2607 .txq_info_get = mlx5_txq_info_get, 2608 .rx_burst_mode_get = mlx5_rx_burst_mode_get, 2609 .tx_burst_mode_get = mlx5_tx_burst_mode_get, 2610 .get_module_info = mlx5_get_module_info, 2611 .get_module_eeprom = mlx5_get_module_eeprom, 2612 }; 2613 2614 /* Available operations in flow isolated mode. */ 2615 const struct eth_dev_ops mlx5_os_dev_ops_isolate = { 2616 .dev_configure = mlx5_dev_configure, 2617 .dev_start = mlx5_dev_start, 2618 .dev_stop = mlx5_dev_stop, 2619 .dev_set_link_down = mlx5_set_link_down, 2620 .dev_set_link_up = mlx5_set_link_up, 2621 .dev_close = mlx5_dev_close, 2622 .promiscuous_enable = mlx5_promiscuous_enable, 2623 .promiscuous_disable = mlx5_promiscuous_disable, 2624 .allmulticast_enable = mlx5_allmulticast_enable, 2625 .allmulticast_disable = mlx5_allmulticast_disable, 2626 .link_update = mlx5_link_update, 2627 .stats_get = mlx5_stats_get, 2628 .stats_reset = mlx5_stats_reset, 2629 .xstats_get = mlx5_xstats_get, 2630 .xstats_reset = mlx5_xstats_reset, 2631 .xstats_get_names = mlx5_xstats_get_names, 2632 .fw_version_get = mlx5_fw_version_get, 2633 .dev_infos_get = mlx5_dev_infos_get, 2634 .read_clock = mlx5_txpp_read_clock, 2635 .dev_supported_ptypes_get = mlx5_dev_supported_ptypes_get, 2636 .vlan_filter_set = mlx5_vlan_filter_set, 2637 .rx_queue_setup = mlx5_rx_queue_setup, 2638 .rx_hairpin_queue_setup = mlx5_rx_hairpin_queue_setup, 2639 .tx_queue_setup = mlx5_tx_queue_setup, 2640 .tx_hairpin_queue_setup = mlx5_tx_hairpin_queue_setup, 2641 .rx_queue_release = mlx5_rx_queue_release, 2642 .tx_queue_release = mlx5_tx_queue_release, 2643 .rx_queue_start = mlx5_rx_queue_start, 2644 .rx_queue_stop = mlx5_rx_queue_stop, 2645 .tx_queue_start = mlx5_tx_queue_start, 2646 .tx_queue_stop = mlx5_tx_queue_stop, 2647 .flow_ctrl_get = mlx5_dev_get_flow_ctrl, 2648 .flow_ctrl_set = mlx5_dev_set_flow_ctrl, 2649 .mac_addr_remove = mlx5_mac_addr_remove, 2650 .mac_addr_add = mlx5_mac_addr_add, 2651 .mac_addr_set = mlx5_mac_addr_set, 2652 .set_mc_addr_list = mlx5_set_mc_addr_list, 2653 .mtu_set = mlx5_dev_set_mtu, 2654 .vlan_strip_queue_set = mlx5_vlan_strip_queue_set, 2655 .vlan_offload_set = mlx5_vlan_offload_set, 2656 .filter_ctrl = mlx5_dev_filter_ctrl, 2657 .rxq_info_get = mlx5_rxq_info_get, 2658 .txq_info_get = mlx5_txq_info_get, 2659 .rx_burst_mode_get = mlx5_rx_burst_mode_get, 2660 .tx_burst_mode_get = mlx5_tx_burst_mode_get, 2661 .rx_queue_intr_enable = mlx5_rx_intr_enable, 2662 .rx_queue_intr_disable = mlx5_rx_intr_disable, 2663 .is_removed = mlx5_is_removed, 2664 .get_module_info = mlx5_get_module_info, 2665 .get_module_eeprom = mlx5_get_module_eeprom, 2666 .hairpin_cap_get = mlx5_hairpin_cap_get, 2667 .mtr_ops_get = mlx5_flow_meter_ops_get, 2668 .hairpin_bind = mlx5_hairpin_bind, 2669 .hairpin_unbind = mlx5_hairpin_unbind, 2670 .hairpin_get_peer_ports = mlx5_hairpin_get_peer_ports, 2671 .hairpin_queue_peer_update = mlx5_hairpin_queue_peer_update, 2672 .hairpin_queue_peer_bind = mlx5_hairpin_queue_peer_bind, 2673 .hairpin_queue_peer_unbind = mlx5_hairpin_queue_peer_unbind, 2674 }; 2675