/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright 2015 6WIND S.A.
 * Copyright 2020 Mellanox Technologies, Ltd
 */

#include <stddef.h>
#include <unistd.h>
#include <string.h>
#include <stdint.h>
#include <stdlib.h>
#include <errno.h>
#include <net/if.h>
#include <linux/rtnetlink.h>
#include <linux/sockios.h>
#include <linux/ethtool.h>
#include <fcntl.h>

#include <rte_malloc.h>
#include <rte_ethdev_driver.h>
#include <rte_ethdev_pci.h>
#include <rte_pci.h>
#include <rte_bus_pci.h>
#include <rte_common.h>
#include <rte_kvargs.h>
#include <rte_rwlock.h>
#include <rte_spinlock.h>
#include <rte_string_fns.h>
#include <rte_alarm.h>
#include <rte_eal_paging.h>

#include <mlx5_glue.h>
#include <mlx5_devx_cmds.h>
#include <mlx5_common.h>
#include <mlx5_common_mp.h>
#include <mlx5_common_mr.h>
#include <mlx5_malloc.h>

#include "mlx5_defs.h"
#include "mlx5.h"
#include "mlx5_common_os.h"
#include "mlx5_utils.h"
#include "mlx5_rxtx.h"
#include "mlx5_autoconf.h"
#include "mlx5_mr.h"
#include "mlx5_flow.h"
#include "rte_pmd_mlx5.h"
#include "mlx5_verbs.h"
#include "mlx5_nl.h"
#include "mlx5_devx.h"

#define MLX5_TAGS_HLIST_ARRAY_SIZE 8192

#ifndef HAVE_IBV_MLX5_MOD_MPW
#define MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED (1 << 2)
#define MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW (1 << 3)
#endif

#ifndef HAVE_IBV_MLX5_MOD_CQE_128B_COMP
#define MLX5DV_CONTEXT_FLAGS_CQE_128B_COMP (1 << 4)
#endif

static const char *MZ_MLX5_PMD_SHARED_DATA = "mlx5_pmd_shared_data";

/* Spinlock for mlx5_shared_data allocation. */
static rte_spinlock_t mlx5_shared_data_lock = RTE_SPINLOCK_INITIALIZER;

/* Process local data for secondary processes. */
static struct mlx5_local_data mlx5_local_data;

/**
 * Set the completion channel file descriptor interrupt as non-blocking.
 *
 * @param[in] fd
 *   The completion channel file descriptor (representing the interrupt)
 *   to set as non-blocking.
 *
 * @return
 *   0 on successfully setting the fd to non-blocking, non-zero otherwise.
 */
int
mlx5_os_set_nonblock_channel_fd(int fd)
{
	int flags;

	flags = fcntl(fd, F_GETFL);
	if (flags == -1)
		return -1;
	return fcntl(fd, F_SETFL, flags | O_NONBLOCK);
}
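
/*
 * Usage sketch (illustrative only, not part of the driver flow): making a
 * Verbs completion channel non-blocking right after creation, assuming
 * "ctx" is a valid ibv_context. ibv_comp_channel exposes its "fd" field
 * directly, so the helper above can be applied to it:
 *
 *	struct ibv_comp_channel *channel;
 *
 *	channel = mlx5_glue->create_comp_channel(ctx);
 *	if (channel != NULL &&
 *	    mlx5_os_set_nonblock_channel_fd(channel->fd) != 0)
 *		DRV_LOG(WARNING, "completion channel fd left blocking");
 */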

/**
 * Get mlx5 device attributes. The glue function query_device_ex() is called
 * with an out parameter of type 'struct ibv_device_attr_ex *'; the mlx5
 * device attributes are then filled in from that out parameter.
 *
 * @param ctx
 *   Pointer to ibv context.
 *
 * @param device_attr
 *   Pointer to mlx5 device attributes.
 *
 * @return
 *   0 on success, a non-zero error number otherwise.
 */
int
mlx5_os_get_dev_attr(void *ctx, struct mlx5_dev_attr *device_attr)
{
	int err;
	struct ibv_device_attr_ex attr_ex;

	memset(device_attr, 0, sizeof(*device_attr));
	err = mlx5_glue->query_device_ex(ctx, NULL, &attr_ex);
	if (err)
		return err;
	device_attr->device_cap_flags_ex = attr_ex.device_cap_flags_ex;
	device_attr->max_qp_wr = attr_ex.orig_attr.max_qp_wr;
	device_attr->max_sge = attr_ex.orig_attr.max_sge;
	device_attr->max_cq = attr_ex.orig_attr.max_cq;
	device_attr->max_qp = attr_ex.orig_attr.max_qp;
	device_attr->raw_packet_caps = attr_ex.raw_packet_caps;
	device_attr->max_rwq_indirection_table_size =
		attr_ex.rss_caps.max_rwq_indirection_table_size;
	device_attr->max_tso = attr_ex.tso_caps.max_tso;
	device_attr->tso_supported_qpts = attr_ex.tso_caps.supported_qpts;

	struct mlx5dv_context dv_attr = { .comp_mask = 0 };

	err = mlx5_glue->dv_query_device(ctx, &dv_attr);
	if (err)
		return err;
	device_attr->flags = dv_attr.flags;
	device_attr->comp_mask = dv_attr.comp_mask;
#ifdef HAVE_IBV_MLX5_MOD_SWP
	device_attr->sw_parsing_offloads =
		dv_attr.sw_parsing_caps.sw_parsing_offloads;
#endif
	device_attr->min_single_stride_log_num_of_bytes =
		dv_attr.striding_rq_caps.min_single_stride_log_num_of_bytes;
	device_attr->max_single_stride_log_num_of_bytes =
		dv_attr.striding_rq_caps.max_single_stride_log_num_of_bytes;
	device_attr->min_single_wqe_log_num_of_strides =
		dv_attr.striding_rq_caps.min_single_wqe_log_num_of_strides;
	device_attr->max_single_wqe_log_num_of_strides =
		dv_attr.striding_rq_caps.max_single_wqe_log_num_of_strides;
	device_attr->stride_supported_qpts =
		dv_attr.striding_rq_caps.supported_qpts;
#ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT
	device_attr->tunnel_offloads_caps = dv_attr.tunnel_offloads_caps;
#endif

	return err;
}

/**
 * Verbs callback to allocate memory. The allocation must provide space
 * according to the requested size, residing inside a huge page.
 * Please note that all allocations must respect the alignment from libmlx5
 * (i.e. currently rte_mem_page_size()).
 *
 * @param[in] size
 *   The size in bytes of the memory to allocate.
 * @param[in] data
 *   A pointer to the callback data.
 *
 * @return
 *   Allocated buffer, NULL otherwise and rte_errno is set.
 */
static void *
mlx5_alloc_verbs_buf(size_t size, void *data)
{
	struct mlx5_priv *priv = data;
	void *ret;
	unsigned int socket = SOCKET_ID_ANY;
	size_t alignment = rte_mem_page_size();

	if (alignment == (size_t)-1) {
		DRV_LOG(ERR, "Failed to get mem page size");
		rte_errno = ENOMEM;
		return NULL;
	}
	if (priv->verbs_alloc_ctx.type == MLX5_VERBS_ALLOC_TYPE_TX_QUEUE) {
		const struct mlx5_txq_ctrl *ctrl = priv->verbs_alloc_ctx.obj;

		socket = ctrl->socket;
	} else if (priv->verbs_alloc_ctx.type ==
		   MLX5_VERBS_ALLOC_TYPE_RX_QUEUE) {
		const struct mlx5_rxq_ctrl *ctrl = priv->verbs_alloc_ctx.obj;

		socket = ctrl->socket;
	}
	MLX5_ASSERT(data != NULL);
	ret = mlx5_malloc(0, size, alignment, socket);
	if (!ret && size)
		rte_errno = ENOMEM;
	return ret;
}
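
/*
 * Note: the two Verbs buffer callbacks (mlx5_alloc_verbs_buf above and
 * mlx5_free_verbs_buf below) are handed over to rdma-core later in
 * mlx5_dev_spawn(), roughly as follows (see the actual call there):
 *
 *	mlx5_glue->dv_set_context_attr(sh->ctx,
 *			MLX5DV_CTX_ATTR_BUF_ALLOCATORS,
 *			(void *)((uintptr_t)&(struct mlx5dv_ctx_allocators){
 *				.alloc = &mlx5_alloc_verbs_buf,
 *				.free = &mlx5_free_verbs_buf,
 *				.data = priv,
 *			}));
 *
 * This makes libmlx5 allocate data-plane resources from the PMD (huge
 * page backed) allocator instead of plain malloc().
 */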

/**
 * Verbs callback to free memory.
 *
 * @param[in] ptr
 *   A pointer to the memory to free.
 * @param[in] data
 *   A pointer to the callback data.
 */
static void
mlx5_free_verbs_buf(void *ptr, void *data __rte_unused)
{
	MLX5_ASSERT(data != NULL);
	mlx5_free(ptr);
}

/**
 * Initialize DR related data within the private structure.
 * The routine checks the reference counter and performs the actual
 * resource creation/initialization only on the first reference.
 *
 * @param[in] priv
 *   Pointer to the private device data structure.
 *
 * @return
 *   Zero on success, positive error code otherwise.
 */
static int
mlx5_alloc_shared_dr(struct mlx5_priv *priv)
{
	struct mlx5_dev_ctx_shared *sh = priv->sh;
	char s[MLX5_HLIST_NAMESIZE] __rte_unused;
	int err;

	MLX5_ASSERT(sh && sh->refcnt);
	if (sh->refcnt > 1)
		return 0;
	err = mlx5_alloc_table_hash_list(priv);
	if (err)
		goto error;
	/* The resources below are only valid with DV support. */
#ifdef HAVE_IBV_FLOW_DV_SUPPORT
	/* Init port id action cache list. */
	snprintf(s, sizeof(s), "%s_port_id_action_cache", sh->ibdev_name);
	mlx5_cache_list_init(&sh->port_id_action_list, s, 0, sh,
			     flow_dv_port_id_create_cb,
			     flow_dv_port_id_match_cb,
			     flow_dv_port_id_remove_cb);
	/* Init push vlan action cache list. */
	snprintf(s, sizeof(s), "%s_push_vlan_action_cache", sh->ibdev_name);
	mlx5_cache_list_init(&sh->push_vlan_action_list, s, 0, sh,
			     flow_dv_push_vlan_create_cb,
			     flow_dv_push_vlan_match_cb,
			     flow_dv_push_vlan_remove_cb);
	/* Create tags hash list table. */
	snprintf(s, sizeof(s), "%s_tags", sh->ibdev_name);
	sh->tag_table = mlx5_hlist_create(s, MLX5_TAGS_HLIST_ARRAY_SIZE, 0,
					  MLX5_HLIST_WRITE_MOST,
					  flow_dv_tag_create_cb, NULL,
					  flow_dv_tag_remove_cb);
	if (!sh->tag_table) {
		DRV_LOG(ERR, "tags hash list creation failed.");
		err = ENOMEM;
		goto error;
	}
	sh->tag_table->ctx = sh;
	snprintf(s, sizeof(s), "%s_hdr_modify", sh->ibdev_name);
	sh->modify_cmds = mlx5_hlist_create(s, MLX5_FLOW_HDR_MODIFY_HTABLE_SZ,
					    0, MLX5_HLIST_WRITE_MOST |
					    MLX5_HLIST_DIRECT_KEY,
					    flow_dv_modify_create_cb,
					    flow_dv_modify_match_cb,
					    flow_dv_modify_remove_cb);
	if (!sh->modify_cmds) {
		DRV_LOG(ERR, "hdr modify hash list creation failed.");
		err = ENOMEM;
		goto error;
	}
	sh->modify_cmds->ctx = sh;
	snprintf(s, sizeof(s), "%s_encaps_decaps", sh->ibdev_name);
	sh->encaps_decaps = mlx5_hlist_create(s,
					      MLX5_FLOW_ENCAP_DECAP_HTABLE_SZ,
					      0, MLX5_HLIST_DIRECT_KEY |
					      MLX5_HLIST_WRITE_MOST,
					      flow_dv_encap_decap_create_cb,
					      flow_dv_encap_decap_match_cb,
					      flow_dv_encap_decap_remove_cb);
	if (!sh->encaps_decaps) {
		DRV_LOG(ERR, "encap decap hash list creation failed.");
		err = ENOMEM;
		goto error;
	}
	sh->encaps_decaps->ctx = sh;
#endif
#ifdef HAVE_MLX5DV_DR
	void *domain;

	/* First reference, the DR structures should be initialized. */
	domain = mlx5_glue->dr_create_domain(sh->ctx,
					     MLX5DV_DR_DOMAIN_TYPE_NIC_RX);
	if (!domain) {
		DRV_LOG(ERR, "ingress mlx5dv_dr_create_domain failed");
		err = errno;
		goto error;
	}
	sh->rx_domain = domain;
	domain = mlx5_glue->dr_create_domain(sh->ctx,
					     MLX5DV_DR_DOMAIN_TYPE_NIC_TX);
	if (!domain) {
		DRV_LOG(ERR, "egress mlx5dv_dr_create_domain failed");
		err = errno;
		goto error;
	}
	pthread_mutex_init(&sh->dv_mutex, NULL);
	sh->tx_domain = domain;
#ifdef HAVE_MLX5DV_DR_ESWITCH
	if (priv->config.dv_esw_en) {
		domain = mlx5_glue->dr_create_domain
			(sh->ctx, MLX5DV_DR_DOMAIN_TYPE_FDB);
		if (!domain) {
			DRV_LOG(ERR, "FDB mlx5dv_dr_create_domain failed");
			err = errno;
			goto error;
		}
		sh->fdb_domain = domain;
		sh->esw_drop_action = mlx5_glue->dr_create_flow_action_drop();
	}
#endif
	if (!sh->tunnel_hub)
		err = mlx5_alloc_tunnel_hub(sh);
	if (err) {
		DRV_LOG(ERR, "mlx5_alloc_tunnel_hub failed err=%d", err);
		goto error;
	}
	if (priv->config.reclaim_mode == MLX5_RCM_AGGR) {
		mlx5_glue->dr_reclaim_domain_memory(sh->rx_domain, 1);
		mlx5_glue->dr_reclaim_domain_memory(sh->tx_domain, 1);
		if (sh->fdb_domain)
			mlx5_glue->dr_reclaim_domain_memory(sh->fdb_domain, 1);
	}
	sh->pop_vlan_action = mlx5_glue->dr_create_flow_action_pop_vlan();
#endif /* HAVE_MLX5DV_DR */
	sh->default_miss_action =
			mlx5_glue->dr_create_flow_action_default_miss();
	if (!sh->default_miss_action)
		DRV_LOG(WARNING, "Default miss action is not supported.");
	return 0;
error:
	/* Rollback the created objects. */
	if (sh->rx_domain) {
		mlx5_glue->dr_destroy_domain(sh->rx_domain);
		sh->rx_domain = NULL;
	}
	if (sh->tx_domain) {
		mlx5_glue->dr_destroy_domain(sh->tx_domain);
		sh->tx_domain = NULL;
	}
	if (sh->fdb_domain) {
		mlx5_glue->dr_destroy_domain(sh->fdb_domain);
		sh->fdb_domain = NULL;
	}
	if (sh->esw_drop_action) {
		mlx5_glue->destroy_flow_action(sh->esw_drop_action);
		sh->esw_drop_action = NULL;
	}
	if (sh->pop_vlan_action) {
		mlx5_glue->destroy_flow_action(sh->pop_vlan_action);
		sh->pop_vlan_action = NULL;
	}
	if (sh->encaps_decaps) {
		mlx5_hlist_destroy(sh->encaps_decaps);
		sh->encaps_decaps = NULL;
	}
	if (sh->modify_cmds) {
		mlx5_hlist_destroy(sh->modify_cmds);
		sh->modify_cmds = NULL;
	}
	if (sh->tag_table) {
		/* tags should be destroyed with flow before. */
		mlx5_hlist_destroy(sh->tag_table);
		sh->tag_table = NULL;
	}
	if (sh->tunnel_hub) {
		mlx5_release_tunnel_hub(sh, priv->dev_port);
		sh->tunnel_hub = NULL;
	}
	mlx5_free_table_hash_list(priv);
	return err;
}
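
/*
 * Note: the rollback above and mlx5_os_free_shared_dr() below release the
 * same set of objects; keep the two paths in sync when adding resources
 * to mlx5_alloc_shared_dr().
 */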

/**
 * Destroy DR related data within private structure.
 *
 * @param[in] priv
 *   Pointer to the private device data structure.
 */
void
mlx5_os_free_shared_dr(struct mlx5_priv *priv)
{
	struct mlx5_dev_ctx_shared *sh = priv->sh;

	MLX5_ASSERT(sh && sh->refcnt);
	if (sh->refcnt > 1)
		return;
#ifdef HAVE_MLX5DV_DR
	if (sh->rx_domain) {
		mlx5_glue->dr_destroy_domain(sh->rx_domain);
		sh->rx_domain = NULL;
	}
	if (sh->tx_domain) {
		mlx5_glue->dr_destroy_domain(sh->tx_domain);
		sh->tx_domain = NULL;
	}
#ifdef HAVE_MLX5DV_DR_ESWITCH
	if (sh->fdb_domain) {
		mlx5_glue->dr_destroy_domain(sh->fdb_domain);
		sh->fdb_domain = NULL;
	}
	if (sh->esw_drop_action) {
		mlx5_glue->destroy_flow_action(sh->esw_drop_action);
		sh->esw_drop_action = NULL;
	}
#endif
	if (sh->pop_vlan_action) {
		mlx5_glue->destroy_flow_action(sh->pop_vlan_action);
		sh->pop_vlan_action = NULL;
	}
	pthread_mutex_destroy(&sh->dv_mutex);
#endif /* HAVE_MLX5DV_DR */
	if (sh->default_miss_action)
		mlx5_glue->destroy_flow_action
				(sh->default_miss_action);
	if (sh->encaps_decaps) {
		mlx5_hlist_destroy(sh->encaps_decaps);
		sh->encaps_decaps = NULL;
	}
	if (sh->modify_cmds) {
		mlx5_hlist_destroy(sh->modify_cmds);
		sh->modify_cmds = NULL;
	}
	if (sh->tag_table) {
		/* tags should be destroyed with flow before. */
		mlx5_hlist_destroy(sh->tag_table);
		sh->tag_table = NULL;
	}
	if (sh->tunnel_hub) {
		mlx5_release_tunnel_hub(sh, priv->dev_port);
		sh->tunnel_hub = NULL;
	}
	mlx5_cache_list_destroy(&sh->port_id_action_list);
	mlx5_cache_list_destroy(&sh->push_vlan_action_list);
	mlx5_free_table_hash_list(priv);
}

/**
 * Initialize shared data between primary and secondary process.
 *
 * A memzone is reserved by primary process and secondary processes attach to
 * the memzone.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
static int
mlx5_init_shared_data(void)
{
	const struct rte_memzone *mz;
	int ret = 0;

	rte_spinlock_lock(&mlx5_shared_data_lock);
	if (mlx5_shared_data == NULL) {
		if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
			/* Allocate shared memory. */
			mz = rte_memzone_reserve(MZ_MLX5_PMD_SHARED_DATA,
						 sizeof(*mlx5_shared_data),
						 SOCKET_ID_ANY, 0);
			if (mz == NULL) {
				DRV_LOG(ERR,
					"Cannot allocate mlx5 shared data");
				ret = -rte_errno;
				goto error;
			}
			mlx5_shared_data = mz->addr;
			memset(mlx5_shared_data, 0, sizeof(*mlx5_shared_data));
			rte_spinlock_init(&mlx5_shared_data->lock);
		} else {
			/* Lookup allocated shared memory. */
			mz = rte_memzone_lookup(MZ_MLX5_PMD_SHARED_DATA);
			if (mz == NULL) {
				DRV_LOG(ERR,
					"Cannot attach mlx5 shared data");
				ret = -rte_errno;
				goto error;
			}
			mlx5_shared_data = mz->addr;
			memset(&mlx5_local_data, 0, sizeof(mlx5_local_data));
		}
	}
error:
	rte_spinlock_unlock(&mlx5_shared_data_lock);
	return ret;
}

/**
 * PMD global initialization.
 *
 * Independent from individual device, this function initializes global
 * per-PMD data structures distinguishing primary and secondary processes.
 * Hence, each initialization is called once per process.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
static int
mlx5_init_once(void)
{
	struct mlx5_shared_data *sd;
	struct mlx5_local_data *ld = &mlx5_local_data;
	int ret = 0;

	if (mlx5_init_shared_data())
		return -rte_errno;
	sd = mlx5_shared_data;
	MLX5_ASSERT(sd);
	rte_spinlock_lock(&sd->lock);
	switch (rte_eal_process_type()) {
	case RTE_PROC_PRIMARY:
		if (sd->init_done)
			break;
		LIST_INIT(&sd->mem_event_cb_list);
		rte_rwlock_init(&sd->mem_event_rwlock);
		rte_mem_event_callback_register("MLX5_MEM_EVENT_CB",
						mlx5_mr_mem_event_cb, NULL);
		ret = mlx5_mp_init_primary(MLX5_MP_NAME,
					   mlx5_mp_os_primary_handle);
		if (ret)
			goto out;
		sd->init_done = true;
		break;
	case RTE_PROC_SECONDARY:
		if (ld->init_done)
			break;
		ret = mlx5_mp_init_secondary(MLX5_MP_NAME,
					     mlx5_mp_os_secondary_handle);
		if (ret)
			goto out;
		++sd->secondary_cnt;
		ld->init_done = true;
		break;
	default:
		break;
	}
out:
	rte_spinlock_unlock(&sd->lock);
	return ret;
}

/**
 * Create the Tx queue DevX/Verbs object.
 *
 * @param dev
 *   Pointer to Ethernet device.
 * @param idx
 *   Queue index in DPDK Tx queue array.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
static int
mlx5_os_txq_obj_new(struct rte_eth_dev *dev, uint16_t idx)
{
	struct mlx5_priv *priv = dev->data->dev_private;
	struct mlx5_txq_data *txq_data = (*priv->txqs)[idx];
	struct mlx5_txq_ctrl *txq_ctrl =
			container_of(txq_data, struct mlx5_txq_ctrl, txq);

	if (txq_ctrl->type == MLX5_TXQ_TYPE_HAIRPIN)
		return mlx5_txq_devx_obj_new(dev, idx);
#ifdef HAVE_MLX5DV_DEVX_UAR_OFFSET
	if (!priv->config.dv_esw_en)
		return mlx5_txq_devx_obj_new(dev, idx);
#endif
	return mlx5_txq_ibv_obj_new(dev, idx);
}

/**
 * Release a Tx DevX/Verbs queue object.
 *
 * @param txq_obj
 *   DevX/Verbs Tx queue object.
 */
static void
mlx5_os_txq_obj_release(struct mlx5_txq_obj *txq_obj)
{
	if (txq_obj->txq_ctrl->type == MLX5_TXQ_TYPE_HAIRPIN) {
		mlx5_txq_devx_obj_release(txq_obj);
		return;
	}
#ifdef HAVE_MLX5DV_DEVX_UAR_OFFSET
	if (!txq_obj->txq_ctrl->priv->config.dv_esw_en) {
		mlx5_txq_devx_obj_release(txq_obj);
		return;
	}
#endif
	mlx5_txq_ibv_obj_release(txq_obj);
}
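
/*
 * Summary of the Tx queue object selection implemented above: hairpin
 * queues always use the DevX object; regular queues use DevX only when
 * the UAR offset API is available (HAVE_MLX5DV_DEVX_UAR_OFFSET) and
 * E-Switch is disabled; in every other case the Verbs (ibv) object is
 * used. The release path applies the same selection.
 */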

/**
 * Detect and configure the DV flow counter mode.
 *
 * @param dev
 *   Pointer to rte_eth_dev structure.
 */
static void
mlx5_flow_counter_mode_config(struct rte_eth_dev *dev __rte_unused)
{
#ifdef HAVE_IBV_FLOW_DV_SUPPORT
	struct mlx5_priv *priv = dev->data->dev_private;
	struct mlx5_dev_ctx_shared *sh = priv->sh;
	bool fallback;

#ifndef HAVE_IBV_DEVX_ASYNC
	fallback = true;
#else
	fallback = false;
	if (!priv->config.devx || !priv->config.dv_flow_en ||
	    !priv->config.hca_attr.flow_counters_dump ||
	    !(priv->config.hca_attr.flow_counter_bulk_alloc_bitmap & 0x4) ||
	    (mlx5_flow_dv_discover_counter_offset_support(dev) == -ENOTSUP))
		fallback = true;
#endif
	if (fallback)
		DRV_LOG(INFO, "Use fall-back DV counter management. Flow "
			"counter dump:%d, bulk_alloc_bitmap:0x%hhx.",
			priv->config.hca_attr.flow_counters_dump,
			priv->config.hca_attr.flow_counter_bulk_alloc_bitmap);
	/* Initialize fallback mode only on the port that initializes sh. */
	if (sh->refcnt == 1)
		sh->cmng.counter_fallback = fallback;
	else if (fallback != sh->cmng.counter_fallback)
		DRV_LOG(WARNING, "Port %d in sh has different fallback mode "
			"than others:%d.", PORT_ID(priv), fallback);
#endif
}
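
/*
 * As coded above, the non-fallback (DevX async) counter mode requires all
 * of: DevX enabled, DV flow enabled, flow counter dump support, bulk
 * allocation support (bit 2, value 0x4, of the bitmap), a discoverable
 * counter offset, and HAVE_IBV_DEVX_ASYNC at build time. Missing any one
 * of these selects the fallback counter management.
 */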

/**
 * Spawn an Ethernet device from Verbs information.
 *
 * @param dpdk_dev
 *   Backing DPDK device.
 * @param spawn
 *   Verbs device parameters (name, port, switch_info) to spawn.
 * @param config
 *   Device configuration parameters.
 *
 * @return
 *   A valid Ethernet device object on success, NULL otherwise and rte_errno
 *   is set. The following errors are defined:
 *
 *   EBUSY: device is not supposed to be spawned.
 *   EEXIST: device is already spawned.
 */
static struct rte_eth_dev *
mlx5_dev_spawn(struct rte_device *dpdk_dev,
	       struct mlx5_dev_spawn_data *spawn,
	       struct mlx5_dev_config *config)
{
	const struct mlx5_switch_info *switch_info = &spawn->info;
	struct mlx5_dev_ctx_shared *sh = NULL;
	struct ibv_port_attr port_attr;
	struct mlx5dv_context dv_attr = { .comp_mask = 0 };
	struct rte_eth_dev *eth_dev = NULL;
	struct mlx5_priv *priv = NULL;
	int err = 0;
	unsigned int hw_padding = 0;
	unsigned int mps;
	unsigned int cqe_comp;
	unsigned int cqe_pad = 0;
	unsigned int tunnel_en = 0;
	unsigned int mpls_en = 0;
	unsigned int swp = 0;
	unsigned int mprq = 0;
	unsigned int mprq_min_stride_size_n = 0;
	unsigned int mprq_max_stride_size_n = 0;
	unsigned int mprq_min_stride_num_n = 0;
	unsigned int mprq_max_stride_num_n = 0;
	struct rte_ether_addr mac;
	char name[RTE_ETH_NAME_MAX_LEN];
	int own_domain_id = 0;
	uint16_t port_id;
	unsigned int i;
#ifdef HAVE_MLX5DV_DR_DEVX_PORT
	struct mlx5dv_devx_port devx_port = { .comp_mask = 0 };
#endif

	/* Determine if this port representor is supposed to be spawned. */
	if (switch_info->representor && dpdk_dev->devargs) {
		struct rte_eth_devargs eth_da;

		err = rte_eth_devargs_parse(dpdk_dev->devargs->args, &eth_da);
		if (err) {
			rte_errno = -err;
			DRV_LOG(ERR, "failed to process device arguments: %s",
				strerror(rte_errno));
			return NULL;
		}
		for (i = 0; i < eth_da.nb_representor_ports; ++i)
			if (eth_da.representor_ports[i] ==
			    (uint16_t)switch_info->port_name)
				break;
		if (i == eth_da.nb_representor_ports) {
			rte_errno = EBUSY;
			return NULL;
		}
	}
	/* Build device name. */
	if (spawn->pf_bond < 0) {
		/* Single device. */
		if (!switch_info->representor)
			strlcpy(name, dpdk_dev->name, sizeof(name));
		else
			snprintf(name, sizeof(name), "%s_representor_%u",
				 dpdk_dev->name, switch_info->port_name);
	} else {
		/* Bonding device. */
		if (!switch_info->representor)
			snprintf(name, sizeof(name), "%s_%s",
				 dpdk_dev->name,
				 mlx5_os_get_dev_device_name(spawn->phys_dev));
		else
			snprintf(name, sizeof(name), "%s_%s_representor_%u",
				 dpdk_dev->name,
				 mlx5_os_get_dev_device_name(spawn->phys_dev),
				 switch_info->port_name);
	}
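	/*
	 * For illustration, with a PCI device named "0000:08:00.0" the
	 * patterns above would yield "0000:08:00.0" (single device) or
	 * "0000:08:00.0_representor_2" (representor); for bonding devices
	 * the IB device name reported by mlx5_os_get_dev_device_name() is
	 * spliced in the middle (the exact value is device-dependent).
	 */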
	/* Check if the device is already spawned. */
	if (rte_eth_dev_get_port_by_name(name, &port_id) == 0) {
		rte_errno = EEXIST;
		return NULL;
	}
	DRV_LOG(DEBUG, "naming Ethernet device \"%s\"", name);
	if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
		struct mlx5_mp_id mp_id;

		eth_dev = rte_eth_dev_attach_secondary(name);
		if (eth_dev == NULL) {
			DRV_LOG(ERR, "can not attach rte ethdev");
			rte_errno = ENOMEM;
			return NULL;
		}
		eth_dev->device = dpdk_dev;
		eth_dev->dev_ops = &mlx5_os_dev_sec_ops;
		eth_dev->rx_descriptor_status = mlx5_rx_descriptor_status;
		eth_dev->tx_descriptor_status = mlx5_tx_descriptor_status;
		err = mlx5_proc_priv_init(eth_dev);
		if (err)
			return NULL;
		mp_id.port_id = eth_dev->data->port_id;
		strlcpy(mp_id.name, MLX5_MP_NAME, RTE_MP_MAX_NAME_LEN);
		/* Receive command fd from primary process. */
		err = mlx5_mp_req_verbs_cmd_fd(&mp_id);
		if (err < 0)
			goto err_secondary;
		/* Remap UAR for Tx queues. */
		err = mlx5_tx_uar_init_secondary(eth_dev, err);
		if (err)
			goto err_secondary;
		/*
		 * Ethdev pointer is still required as input since
		 * the primary device is not accessible from the
		 * secondary process.
		 */
		eth_dev->rx_pkt_burst = mlx5_select_rx_function(eth_dev);
		eth_dev->tx_pkt_burst = mlx5_select_tx_function(eth_dev);
		return eth_dev;
err_secondary:
		mlx5_dev_close(eth_dev);
		return NULL;
	}
	/*
	 * Some parameters ("tx_db_nc" in particular) are needed in
	 * advance to create the dv/verbs device context. We process the
	 * devargs here to acquire them, and process the devargs again
	 * later to override some hardware settings.
	 */
	err = mlx5_args(config, dpdk_dev->devargs);
	if (err) {
		err = rte_errno;
		DRV_LOG(ERR, "failed to process device arguments: %s",
			strerror(rte_errno));
		goto error;
	}
	if (config->dv_miss_info) {
		if (switch_info->master || switch_info->representor)
			config->dv_xmeta_en = MLX5_XMETA_MODE_META16;
	}
	mlx5_malloc_mem_select(config->sys_mem_en);
	sh = mlx5_alloc_shared_dev_ctx(spawn, config);
	if (!sh)
		return NULL;
	config->devx = sh->devx;
#ifdef HAVE_MLX5DV_DR_ACTION_DEST_DEVX_TIR
	config->dest_tir = 1;
#endif
#ifdef HAVE_IBV_MLX5_MOD_SWP
	dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_SWP;
#endif
	/*
	 * Multi-packet send is supported by ConnectX-4 Lx PF as well
	 * as all ConnectX-5 devices.
	 */
#ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT
	dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS;
#endif
#ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT
	dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_STRIDING_RQ;
#endif
	mlx5_glue->dv_query_device(sh->ctx, &dv_attr);
	if (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED) {
		if (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW) {
			DRV_LOG(DEBUG, "enhanced MPW is supported");
			mps = MLX5_MPW_ENHANCED;
		} else {
			DRV_LOG(DEBUG, "MPW is supported");
			mps = MLX5_MPW;
		}
	} else {
		DRV_LOG(DEBUG, "MPW isn't supported");
		mps = MLX5_MPW_DISABLED;
	}
#ifdef HAVE_IBV_MLX5_MOD_SWP
	if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_SWP)
		swp = dv_attr.sw_parsing_caps.sw_parsing_offloads;
	DRV_LOG(DEBUG, "SWP support: %u", swp);
#endif
	config->swp = !!swp;
#ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT
	if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_STRIDING_RQ) {
		struct mlx5dv_striding_rq_caps mprq_caps =
			dv_attr.striding_rq_caps;

		DRV_LOG(DEBUG, "\tmin_single_stride_log_num_of_bytes: %d",
			mprq_caps.min_single_stride_log_num_of_bytes);
		DRV_LOG(DEBUG, "\tmax_single_stride_log_num_of_bytes: %d",
			mprq_caps.max_single_stride_log_num_of_bytes);
		DRV_LOG(DEBUG, "\tmin_single_wqe_log_num_of_strides: %d",
			mprq_caps.min_single_wqe_log_num_of_strides);
		DRV_LOG(DEBUG, "\tmax_single_wqe_log_num_of_strides: %d",
			mprq_caps.max_single_wqe_log_num_of_strides);
		DRV_LOG(DEBUG, "\tsupported_qpts: %d",
			mprq_caps.supported_qpts);
		DRV_LOG(DEBUG, "device supports Multi-Packet RQ");
		mprq = 1;
		mprq_min_stride_size_n =
			mprq_caps.min_single_stride_log_num_of_bytes;
		mprq_max_stride_size_n =
			mprq_caps.max_single_stride_log_num_of_bytes;
		mprq_min_stride_num_n =
			mprq_caps.min_single_wqe_log_num_of_strides;
		mprq_max_stride_num_n =
			mprq_caps.max_single_wqe_log_num_of_strides;
	}
#endif
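	/*
	 * The MPRQ capabilities above are log2 values. For example
	 * (illustrative numbers only), min_single_stride_log_num_of_bytes
	 * = 6 and max_single_wqe_log_num_of_strides = 9 would mean strides
	 * of at least 2^6 = 64 bytes and WQEs of up to 2^9 = 512 strides.
	 */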
"" : "not "); 895 #else 896 DRV_LOG(WARNING, "MPLS over GRE/UDP tunnel offloading disabled due to" 897 " old OFED/rdma-core version or firmware configuration"); 898 #endif 899 config->mpls_en = mpls_en; 900 /* Check port status. */ 901 err = mlx5_glue->query_port(sh->ctx, spawn->phys_port, &port_attr); 902 if (err) { 903 DRV_LOG(ERR, "port query failed: %s", strerror(err)); 904 goto error; 905 } 906 if (port_attr.link_layer != IBV_LINK_LAYER_ETHERNET) { 907 DRV_LOG(ERR, "port is not configured in Ethernet mode"); 908 err = EINVAL; 909 goto error; 910 } 911 if (port_attr.state != IBV_PORT_ACTIVE) 912 DRV_LOG(DEBUG, "port is not active: \"%s\" (%d)", 913 mlx5_glue->port_state_str(port_attr.state), 914 port_attr.state); 915 /* Allocate private eth device data. */ 916 priv = mlx5_malloc(MLX5_MEM_ZERO | MLX5_MEM_RTE, 917 sizeof(*priv), 918 RTE_CACHE_LINE_SIZE, SOCKET_ID_ANY); 919 if (priv == NULL) { 920 DRV_LOG(ERR, "priv allocation failure"); 921 err = ENOMEM; 922 goto error; 923 } 924 priv->sh = sh; 925 priv->dev_port = spawn->phys_port; 926 priv->pci_dev = spawn->pci_dev; 927 priv->mtu = RTE_ETHER_MTU; 928 priv->mp_id.port_id = port_id; 929 strlcpy(priv->mp_id.name, MLX5_MP_NAME, RTE_MP_MAX_NAME_LEN); 930 /* Some internal functions rely on Netlink sockets, open them now. */ 931 priv->nl_socket_rdma = mlx5_nl_init(NETLINK_RDMA); 932 priv->nl_socket_route = mlx5_nl_init(NETLINK_ROUTE); 933 priv->representor = !!switch_info->representor; 934 priv->master = !!switch_info->master; 935 priv->domain_id = RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID; 936 priv->vport_meta_tag = 0; 937 priv->vport_meta_mask = 0; 938 priv->pf_bond = spawn->pf_bond; 939 #ifdef HAVE_MLX5DV_DR_DEVX_PORT 940 /* 941 * The DevX port query API is implemented. E-Switch may use 942 * either vport or reg_c[0] metadata register to match on 943 * vport index. The engaged part of metadata register is 944 * defined by mask. 945 */ 946 if (switch_info->representor || switch_info->master) { 947 devx_port.comp_mask = MLX5DV_DEVX_PORT_VPORT | 948 MLX5DV_DEVX_PORT_MATCH_REG_C_0; 949 err = mlx5_glue->devx_port_query(sh->ctx, spawn->phys_port, 950 &devx_port); 951 if (err) { 952 DRV_LOG(WARNING, 953 "can't query devx port %d on device %s", 954 spawn->phys_port, 955 mlx5_os_get_dev_device_name(spawn->phys_dev)); 956 devx_port.comp_mask = 0; 957 } 958 } 959 if (devx_port.comp_mask & MLX5DV_DEVX_PORT_MATCH_REG_C_0) { 960 priv->vport_meta_tag = devx_port.reg_c_0.value; 961 priv->vport_meta_mask = devx_port.reg_c_0.mask; 962 if (!priv->vport_meta_mask) { 963 DRV_LOG(ERR, "vport zero mask for port %d" 964 " on bonding device %s", 965 spawn->phys_port, 966 mlx5_os_get_dev_device_name 967 (spawn->phys_dev)); 968 err = ENOTSUP; 969 goto error; 970 } 971 if (priv->vport_meta_tag & ~priv->vport_meta_mask) { 972 DRV_LOG(ERR, "invalid vport tag for port %d" 973 " on bonding device %s", 974 spawn->phys_port, 975 mlx5_os_get_dev_device_name 976 (spawn->phys_dev)); 977 err = ENOTSUP; 978 goto error; 979 } 980 } 981 if (devx_port.comp_mask & MLX5DV_DEVX_PORT_VPORT) { 982 priv->vport_id = devx_port.vport_num; 983 } else if (spawn->pf_bond >= 0) { 984 DRV_LOG(ERR, "can't deduce vport index for port %d" 985 " on bonding device %s", 986 spawn->phys_port, 987 mlx5_os_get_dev_device_name(spawn->phys_dev)); 988 err = ENOTSUP; 989 goto error; 990 } else { 991 /* Suppose vport index in compatible way. */ 992 priv->vport_id = switch_info->representor ? 
#else
	/*
	 * Kernel/rdma_core supports single E-Switch per PF configurations
	 * only and the vport_id field contains the vport index for the
	 * associated VF, which is deduced from the representor port name.
	 * For example, take IB device port 10 with the attached network
	 * device eth0, which has the port name attribute pf0vf2: we can
	 * deduce the VF number as 2 and set the vport index as 3 (2 + 1).
	 * This deduction scheme should be changed if multiple E-Switch
	 * instances per PF configurations and/or PCI subfunctions are
	 * added.
	 */
	priv->vport_id = switch_info->representor ?
			 switch_info->port_name + 1 : -1;
#endif
	/* representor_id field keeps the unmodified VF index. */
	priv->representor_id = switch_info->representor ?
			       switch_info->port_name : -1;
	/*
	 * Look for sibling devices in order to reuse their switch domain
	 * if any, otherwise allocate one.
	 */
	MLX5_ETH_FOREACH_DEV(port_id, priv->pci_dev) {
		const struct mlx5_priv *opriv =
			rte_eth_devices[port_id].data->dev_private;

		if (!opriv ||
		    opriv->sh != priv->sh ||
		    opriv->domain_id ==
		    RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID)
			continue;
		priv->domain_id = opriv->domain_id;
		break;
	}
	if (priv->domain_id == RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID) {
		err = rte_eth_switch_domain_alloc(&priv->domain_id);
		if (err) {
			err = rte_errno;
			DRV_LOG(ERR, "unable to allocate switch domain: %s",
				strerror(rte_errno));
			goto error;
		}
		own_domain_id = 1;
	}
	/* Override some values set by hardware configuration. */
	mlx5_args(config, dpdk_dev->devargs);
	err = mlx5_dev_check_sibling_config(priv, config);
	if (err)
		goto error;
	config->hw_csum = !!(sh->device_attr.device_cap_flags_ex &
			    IBV_DEVICE_RAW_IP_CSUM);
	DRV_LOG(DEBUG, "checksum offloading is %ssupported",
		(config->hw_csum ? "" : "not "));
#if !defined(HAVE_IBV_DEVICE_COUNTERS_SET_V42) && \
	!defined(HAVE_IBV_DEVICE_COUNTERS_SET_V45)
	DRV_LOG(DEBUG, "counters are not supported");
#endif
#if !defined(HAVE_IBV_FLOW_DV_SUPPORT) || !defined(HAVE_MLX5DV_DR)
	if (config->dv_flow_en) {
		DRV_LOG(WARNING, "DV flow is not supported");
		config->dv_flow_en = 0;
	}
#endif
	config->ind_table_max_size =
		sh->device_attr.max_rwq_indirection_table_size;
	/*
	 * Remove this check once DPDK supports larger/variable
	 * indirection tables.
	 */
	if (config->ind_table_max_size > (unsigned int)ETH_RSS_RETA_SIZE_512)
		config->ind_table_max_size = ETH_RSS_RETA_SIZE_512;
	DRV_LOG(DEBUG, "maximum Rx indirection table size is %u",
		config->ind_table_max_size);
	config->hw_vlan_strip = !!(sh->device_attr.raw_packet_caps &
				  IBV_RAW_PACKET_CAP_CVLAN_STRIPPING);
	DRV_LOG(DEBUG, "VLAN stripping is %ssupported",
		(config->hw_vlan_strip ? "" : "not "));
	config->hw_fcs_strip = !!(sh->device_attr.raw_packet_caps &
				 IBV_RAW_PACKET_CAP_SCATTER_FCS);
#if defined(HAVE_IBV_WQ_FLAG_RX_END_PADDING)
	hw_padding = !!sh->device_attr.rx_pad_end_addr_align;
#elif defined(HAVE_IBV_WQ_FLAGS_PCI_WRITE_END_PADDING)
	hw_padding = !!(sh->device_attr.device_cap_flags_ex &
			IBV_DEVICE_PCI_WRITE_END_PADDING);
#endif
	if (config->hw_padding && !hw_padding) {
		DRV_LOG(DEBUG, "Rx end alignment padding isn't supported");
		config->hw_padding = 0;
	} else if (config->hw_padding) {
		DRV_LOG(DEBUG, "Rx end alignment padding is enabled");
	}
	config->tso = (sh->device_attr.max_tso > 0 &&
		       (sh->device_attr.tso_supported_qpts &
			(1 << IBV_QPT_RAW_PACKET)));
	if (config->tso)
		config->tso_max_payload_sz = sh->device_attr.max_tso;
	/*
	 * MPW is disabled by default, while the Enhanced MPW is enabled
	 * by default.
	 */
	if (config->mps == MLX5_ARG_UNSET)
		config->mps = (mps == MLX5_MPW_ENHANCED) ? MLX5_MPW_ENHANCED :
							   MLX5_MPW_DISABLED;
	else
		config->mps = config->mps ? mps : MLX5_MPW_DISABLED;
	DRV_LOG(INFO, "%sMPS is %s",
		config->mps == MLX5_MPW_ENHANCED ? "enhanced " :
		config->mps == MLX5_MPW ? "legacy " : "",
		config->mps != MLX5_MPW_DISABLED ? "enabled" : "disabled");
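	/*
	 * Worked example of the MPS resolution above: with the "mps"
	 * devarg left unset and the device reporting enhanced MPW,
	 * config->mps becomes MLX5_MPW_ENHANCED; with the devarg set to a
	 * non-zero value, whatever level the device supports (mps) is
	 * used; legacy MPW is never enabled by default.
	 */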
"" : "not ")); 1072 config->hw_fcs_strip = !!(sh->device_attr.raw_packet_caps & 1073 IBV_RAW_PACKET_CAP_SCATTER_FCS); 1074 #if defined(HAVE_IBV_WQ_FLAG_RX_END_PADDING) 1075 hw_padding = !!sh->device_attr.rx_pad_end_addr_align; 1076 #elif defined(HAVE_IBV_WQ_FLAGS_PCI_WRITE_END_PADDING) 1077 hw_padding = !!(sh->device_attr.device_cap_flags_ex & 1078 IBV_DEVICE_PCI_WRITE_END_PADDING); 1079 #endif 1080 if (config->hw_padding && !hw_padding) { 1081 DRV_LOG(DEBUG, "Rx end alignment padding isn't supported"); 1082 config->hw_padding = 0; 1083 } else if (config->hw_padding) { 1084 DRV_LOG(DEBUG, "Rx end alignment padding is enabled"); 1085 } 1086 config->tso = (sh->device_attr.max_tso > 0 && 1087 (sh->device_attr.tso_supported_qpts & 1088 (1 << IBV_QPT_RAW_PACKET))); 1089 if (config->tso) 1090 config->tso_max_payload_sz = sh->device_attr.max_tso; 1091 /* 1092 * MPW is disabled by default, while the Enhanced MPW is enabled 1093 * by default. 1094 */ 1095 if (config->mps == MLX5_ARG_UNSET) 1096 config->mps = (mps == MLX5_MPW_ENHANCED) ? MLX5_MPW_ENHANCED : 1097 MLX5_MPW_DISABLED; 1098 else 1099 config->mps = config->mps ? mps : MLX5_MPW_DISABLED; 1100 DRV_LOG(INFO, "%sMPS is %s", 1101 config->mps == MLX5_MPW_ENHANCED ? "enhanced " : 1102 config->mps == MLX5_MPW ? "legacy " : "", 1103 config->mps != MLX5_MPW_DISABLED ? "enabled" : "disabled"); 1104 if (config->cqe_comp && !cqe_comp) { 1105 DRV_LOG(WARNING, "Rx CQE compression isn't supported"); 1106 config->cqe_comp = 0; 1107 } 1108 if (config->cqe_pad && !cqe_pad) { 1109 DRV_LOG(WARNING, "Rx CQE padding isn't supported"); 1110 config->cqe_pad = 0; 1111 } else if (config->cqe_pad) { 1112 DRV_LOG(INFO, "Rx CQE padding is enabled"); 1113 } 1114 if (config->devx) { 1115 err = mlx5_devx_cmd_query_hca_attr(sh->ctx, &config->hca_attr); 1116 if (err) { 1117 err = -err; 1118 goto error; 1119 } 1120 /* Check relax ordering support. */ 1121 if (config->hca_attr.relaxed_ordering_write && 1122 config->hca_attr.relaxed_ordering_read && 1123 !haswell_broadwell_cpu) 1124 sh->cmng.relaxed_ordering = 1; 1125 /* Check for LRO support. */ 1126 if (config->dest_tir && config->hca_attr.lro_cap && 1127 config->dv_flow_en) { 1128 /* TBD check tunnel lro caps. */ 1129 config->lro.supported = config->hca_attr.lro_cap; 1130 DRV_LOG(DEBUG, "Device supports LRO"); 1131 /* 1132 * If LRO timeout is not configured by application, 1133 * use the minimal supported value. 1134 */ 1135 if (!config->lro.timeout) 1136 config->lro.timeout = 1137 config->hca_attr.lro_timer_supported_periods[0]; 1138 DRV_LOG(DEBUG, "LRO session timeout set to %d usec", 1139 config->lro.timeout); 1140 DRV_LOG(DEBUG, "LRO minimal size of TCP segment " 1141 "required for coalescing is %d bytes", 1142 config->hca_attr.lro_min_mss_size); 1143 } 1144 #if defined(HAVE_MLX5DV_DR) && defined(HAVE_MLX5_DR_CREATE_ACTION_FLOW_METER) 1145 if (config->hca_attr.qos.sup && 1146 config->hca_attr.qos.srtcm_sup && 1147 config->dv_flow_en) { 1148 uint8_t reg_c_mask = 1149 config->hca_attr.qos.flow_meter_reg_c_ids; 1150 /* 1151 * Meter needs two REG_C's for color match and pre-sfx 1152 * flow match. Here get the REG_C for color match. 1153 * REG_C_0 and REG_C_1 is reserved for metadata feature. 
	if (config->tx_pp) {
		DRV_LOG(DEBUG, "Timestamp counter frequency %u kHz",
			config->hca_attr.dev_freq_khz);
		DRV_LOG(DEBUG, "Packet pacing is %ssupported",
			config->hca_attr.qos.packet_pacing ? "" : "not ");
		DRV_LOG(DEBUG, "Cross channel ops are %ssupported",
			config->hca_attr.cross_channel ? "" : "not ");
		DRV_LOG(DEBUG, "WQE index ignore is %ssupported",
			config->hca_attr.wqe_index_ignore ? "" : "not ");
		DRV_LOG(DEBUG, "Non-wire SQ feature is %ssupported",
			config->hca_attr.non_wire_sq ? "" : "not ");
		DRV_LOG(DEBUG, "Static WQE SQ feature is %ssupported (%d)",
			config->hca_attr.log_max_static_sq_wq ? "" : "not ",
			config->hca_attr.log_max_static_sq_wq);
		DRV_LOG(DEBUG, "WQE rate PP mode is %ssupported",
			config->hca_attr.qos.wqe_rate_pp ? "" : "not ");
		if (!config->devx) {
			DRV_LOG(ERR, "DevX is required for packet pacing");
			err = ENODEV;
			goto error;
		}
		if (!config->hca_attr.qos.packet_pacing) {
			DRV_LOG(ERR, "Packet pacing is not supported");
			err = ENODEV;
			goto error;
		}
		if (!config->hca_attr.cross_channel) {
			DRV_LOG(ERR, "Cross channel operations are"
				" required for packet pacing");
			err = ENODEV;
			goto error;
		}
		if (!config->hca_attr.wqe_index_ignore) {
			DRV_LOG(ERR, "WQE index ignore feature is"
				" required for packet pacing");
			err = ENODEV;
			goto error;
		}
		if (!config->hca_attr.non_wire_sq) {
			DRV_LOG(ERR, "Non-wire SQ feature is"
				" required for packet pacing");
			err = ENODEV;
			goto error;
		}
		if (!config->hca_attr.log_max_static_sq_wq) {
			DRV_LOG(ERR, "Static WQE SQ feature is"
				" required for packet pacing");
			err = ENODEV;
			goto error;
		}
		if (!config->hca_attr.qos.wqe_rate_pp) {
			DRV_LOG(ERR, "WQE rate mode is required"
				" for packet pacing");
			err = ENODEV;
			goto error;
		}
#ifndef HAVE_MLX5DV_DEVX_UAR_OFFSET
		DRV_LOG(ERR, "DevX does not provide UAR offset,"
			" can't create queues for packet pacing");
		err = ENODEV;
		goto error;
#endif
	}
	if (config->devx) {
		uint32_t reg[MLX5_ST_SZ_DW(register_mtutc)];

		err = config->hca_attr.access_register_user ?
			mlx5_devx_cmd_register_read
				(sh->ctx, MLX5_REGISTER_ID_MTUTC, 0,
				 reg, MLX5_ST_SZ_DW(register_mtutc)) : ENOTSUP;
		if (!err) {
			uint32_t ts_mode;

			/* MTUTC register is read successfully. */
			ts_mode = MLX5_GET(register_mtutc, reg,
					   time_stamp_mode);
			if (ts_mode == MLX5_MTUTC_TIMESTAMP_MODE_REAL_TIME)
				config->rt_timestamp = 1;
		} else {
			/* Kernel does not support register reading. */
			if (config->hca_attr.dev_freq_khz ==
			    (NS_PER_S / MS_PER_S))
				config->rt_timestamp = 1;
		}
	}
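	/*
	 * Note on the fallback above: NS_PER_S / MS_PER_S is 1,000,000 kHz,
	 * i.e. a 1 GHz timestamp counter. When the MTUTC register cannot be
	 * read, a device clocked at exactly 1 GHz is taken as running in
	 * real-time timestamp mode (nanosecond granularity).
	 */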
	/*
	 * If the hardware has a bug combining tunnel packet decapsulation
	 * with FCS scatter, and decapsulation is needed, clear the
	 * hw_fcs_strip bit. The DEV_RX_OFFLOAD_KEEP_CRC bit will then not
	 * be set anymore.
	 */
	if (config->hca_attr.scatter_fcs_w_decap_disable && config->decap_en)
		config->hw_fcs_strip = 0;
	DRV_LOG(DEBUG, "FCS stripping configuration is %ssupported",
		(config->hw_fcs_strip ? "" : "not "));
	if (config->mprq.enabled && mprq) {
		if (config->mprq.stride_num_n &&
		    (config->mprq.stride_num_n > mprq_max_stride_num_n ||
		     config->mprq.stride_num_n < mprq_min_stride_num_n)) {
			config->mprq.stride_num_n =
				RTE_MIN(RTE_MAX(MLX5_MPRQ_STRIDE_NUM_N,
						mprq_min_stride_num_n),
					mprq_max_stride_num_n);
			DRV_LOG(WARNING,
				"the number of strides"
				" for Multi-Packet RQ is out of range,"
				" setting default value (%u)",
				1 << config->mprq.stride_num_n);
		}
		if (config->mprq.stride_size_n &&
		    (config->mprq.stride_size_n > mprq_max_stride_size_n ||
		     config->mprq.stride_size_n < mprq_min_stride_size_n)) {
			config->mprq.stride_size_n =
				RTE_MIN(RTE_MAX(MLX5_MPRQ_STRIDE_SIZE_N,
						mprq_min_stride_size_n),
					mprq_max_stride_size_n);
			DRV_LOG(WARNING,
				"the size of a stride"
				" for Multi-Packet RQ is out of range,"
				" setting default value (%u)",
				1 << config->mprq.stride_size_n);
		}
		config->mprq.min_stride_size_n = mprq_min_stride_size_n;
		config->mprq.max_stride_size_n = mprq_max_stride_size_n;
	} else if (config->mprq.enabled && !mprq) {
		DRV_LOG(WARNING, "Multi-Packet RQ isn't supported");
		config->mprq.enabled = 0;
	}
	if (config->max_dump_files_num == 0)
		config->max_dump_files_num = 128;
	eth_dev = rte_eth_dev_allocate(name);
	if (eth_dev == NULL) {
		DRV_LOG(ERR, "can not allocate rte ethdev");
		err = ENOMEM;
		goto error;
	}
	if (priv->representor) {
		eth_dev->data->dev_flags |= RTE_ETH_DEV_REPRESENTOR;
		eth_dev->data->representor_id = priv->representor_id;
	}
	/*
	 * Store the associated network device interface index. This index
	 * is permanent throughout the lifetime of the device. So, we may
	 * store the ifindex here and use the cached value further.
	 */
	MLX5_ASSERT(spawn->ifindex);
	priv->if_index = spawn->ifindex;
	if (priv->pf_bond >= 0 && priv->master) {
		/* Get bond interface info. */
		err = mlx5_sysfs_bond_info(priv->if_index,
					   &priv->bond_ifindex,
					   priv->bond_name);
		if (err)
			DRV_LOG(ERR, "unable to get bond info: %s",
				strerror(rte_errno));
		else
			DRV_LOG(INFO, "PF device %u, bond device %u(%s)",
				priv->if_index, priv->bond_ifindex,
				priv->bond_name);
	}
	eth_dev->data->dev_private = priv;
	priv->dev_data = eth_dev->data;
	eth_dev->data->mac_addrs = priv->mac;
	eth_dev->device = dpdk_dev;
	eth_dev->data->dev_flags |= RTE_ETH_DEV_AUTOFILL_QUEUE_XSTATS;
	/* Configure the first MAC address by default. */
	if (mlx5_get_mac(eth_dev, &mac.addr_bytes)) {
		DRV_LOG(ERR,
			"port %u cannot get MAC address, is mlx5_en"
			" loaded? (errno: %s)",
			eth_dev->data->port_id, strerror(rte_errno));
		err = ENODEV;
		goto error;
	}
	DRV_LOG(INFO,
		"port %u MAC address is %02x:%02x:%02x:%02x:%02x:%02x",
		eth_dev->data->port_id,
		mac.addr_bytes[0], mac.addr_bytes[1],
		mac.addr_bytes[2], mac.addr_bytes[3],
		mac.addr_bytes[4], mac.addr_bytes[5]);
(errno: %s)", 1355 eth_dev->data->port_id, strerror(rte_errno)); 1356 err = ENODEV; 1357 goto error; 1358 } 1359 DRV_LOG(INFO, 1360 "port %u MAC address is %02x:%02x:%02x:%02x:%02x:%02x", 1361 eth_dev->data->port_id, 1362 mac.addr_bytes[0], mac.addr_bytes[1], 1363 mac.addr_bytes[2], mac.addr_bytes[3], 1364 mac.addr_bytes[4], mac.addr_bytes[5]); 1365 #ifdef RTE_LIBRTE_MLX5_DEBUG 1366 { 1367 char ifname[IF_NAMESIZE]; 1368 1369 if (mlx5_get_ifname(eth_dev, &ifname) == 0) 1370 DRV_LOG(DEBUG, "port %u ifname is \"%s\"", 1371 eth_dev->data->port_id, ifname); 1372 else 1373 DRV_LOG(DEBUG, "port %u ifname is unknown", 1374 eth_dev->data->port_id); 1375 } 1376 #endif 1377 /* Get actual MTU if possible. */ 1378 err = mlx5_get_mtu(eth_dev, &priv->mtu); 1379 if (err) { 1380 err = rte_errno; 1381 goto error; 1382 } 1383 DRV_LOG(DEBUG, "port %u MTU is %u", eth_dev->data->port_id, 1384 priv->mtu); 1385 /* Initialize burst functions to prevent crashes before link-up. */ 1386 eth_dev->rx_pkt_burst = removed_rx_burst; 1387 eth_dev->tx_pkt_burst = removed_tx_burst; 1388 eth_dev->dev_ops = &mlx5_os_dev_ops; 1389 eth_dev->rx_descriptor_status = mlx5_rx_descriptor_status; 1390 eth_dev->tx_descriptor_status = mlx5_tx_descriptor_status; 1391 eth_dev->rx_queue_count = mlx5_rx_queue_count; 1392 /* Register MAC address. */ 1393 claim_zero(mlx5_mac_addr_add(eth_dev, &mac, 0, 0)); 1394 if (config->vf && config->vf_nl_en) 1395 mlx5_nl_mac_addr_sync(priv->nl_socket_route, 1396 mlx5_ifindex(eth_dev), 1397 eth_dev->data->mac_addrs, 1398 MLX5_MAX_MAC_ADDRESSES); 1399 priv->flows = 0; 1400 priv->ctrl_flows = 0; 1401 rte_spinlock_init(&priv->flow_list_lock); 1402 TAILQ_INIT(&priv->flow_meters); 1403 TAILQ_INIT(&priv->flow_meter_profiles); 1404 /* Hint libmlx5 to use PMD allocator for data plane resources */ 1405 mlx5_glue->dv_set_context_attr(sh->ctx, 1406 MLX5DV_CTX_ATTR_BUF_ALLOCATORS, 1407 (void *)((uintptr_t)&(struct mlx5dv_ctx_allocators){ 1408 .alloc = &mlx5_alloc_verbs_buf, 1409 .free = &mlx5_free_verbs_buf, 1410 .data = priv, 1411 })); 1412 /* Bring Ethernet device up. */ 1413 DRV_LOG(DEBUG, "port %u forcing Ethernet interface up", 1414 eth_dev->data->port_id); 1415 mlx5_set_link_up(eth_dev); 1416 /* 1417 * Even though the interrupt handler is not installed yet, 1418 * interrupts will still trigger on the async_fd from 1419 * Verbs context returned by ibv_open_device(). 1420 */ 1421 mlx5_link_update(eth_dev, 0); 1422 #ifdef HAVE_MLX5DV_DR_ESWITCH 1423 if (!(config->hca_attr.eswitch_manager && config->dv_flow_en && 1424 (switch_info->representor || switch_info->master))) 1425 config->dv_esw_en = 0; 1426 #else 1427 config->dv_esw_en = 0; 1428 #endif 1429 /* Detect minimal data bytes to inline. */ 1430 mlx5_set_min_inline(spawn, config); 1431 /* Store device configuration on private structure. */ 1432 priv->config = *config; 1433 /* Create context for virtual machine VLAN workaround. 
	priv->drop_queue.hrxq = mlx5_drop_action_create(eth_dev);
	if (!priv->drop_queue.hrxq)
		goto error;
	/* Supported Verbs flow priority number detection. */
	err = mlx5_flow_discover_priorities(eth_dev);
	if (err < 0) {
		err = -err;
		goto error;
	}
	priv->config.flow_prio = err;
	if (!priv->config.dv_esw_en &&
	    priv->config.dv_xmeta_en != MLX5_XMETA_MODE_LEGACY) {
		DRV_LOG(WARNING, "metadata mode %u is not supported "
			"(no E-Switch)", priv->config.dv_xmeta_en);
		priv->config.dv_xmeta_en = MLX5_XMETA_MODE_LEGACY;
	}
	mlx5_set_metadata_mask(eth_dev);
	if (priv->config.dv_xmeta_en != MLX5_XMETA_MODE_LEGACY &&
	    !priv->sh->dv_regc0_mask) {
		DRV_LOG(ERR, "metadata mode %u is not supported "
			"(no metadata reg_c[0] is available)",
			priv->config.dv_xmeta_en);
		err = ENOTSUP;
		goto error;
	}
	mlx5_cache_list_init(&priv->hrxqs, "hrxq", 0, eth_dev,
			     mlx5_hrxq_create_cb,
			     mlx5_hrxq_match_cb,
			     mlx5_hrxq_remove_cb);
	/* Query availability of metadata reg_c's. */
	err = mlx5_flow_discover_mreg_c(eth_dev);
	if (err < 0) {
		err = -err;
		goto error;
	}
	if (!mlx5_flow_ext_mreg_supported(eth_dev)) {
		DRV_LOG(DEBUG,
			"port %u extensive metadata register is not supported",
			eth_dev->data->port_id);
		if (priv->config.dv_xmeta_en != MLX5_XMETA_MODE_LEGACY) {
			DRV_LOG(ERR, "metadata mode %u is not supported "
				"(no metadata registers available)",
				priv->config.dv_xmeta_en);
			err = ENOTSUP;
			goto error;
		}
	}
	if (priv->config.dv_flow_en &&
	    priv->config.dv_xmeta_en != MLX5_XMETA_MODE_LEGACY &&
	    mlx5_flow_ext_mreg_supported(eth_dev) &&
	    priv->sh->dv_regc0_mask) {
		priv->mreg_cp_tbl = mlx5_hlist_create(MLX5_FLOW_MREG_HNAME,
						      MLX5_FLOW_MREG_HTABLE_SZ,
						      0, 0,
						      flow_dv_mreg_create_cb,
						      NULL,
						      flow_dv_mreg_remove_cb);
		if (!priv->mreg_cp_tbl) {
			err = ENOMEM;
			goto error;
		}
		priv->mreg_cp_tbl->ctx = eth_dev;
	}
	mlx5_flow_counter_mode_config(eth_dev);
	return eth_dev;
error:
	if (priv) {
		if (priv->mreg_cp_tbl)
			mlx5_hlist_destroy(priv->mreg_cp_tbl);
		if (priv->sh)
			mlx5_os_free_shared_dr(priv);
		if (priv->nl_socket_route >= 0)
			close(priv->nl_socket_route);
		if (priv->nl_socket_rdma >= 0)
			close(priv->nl_socket_rdma);
		if (priv->vmwa_context)
			mlx5_vlan_vmwa_exit(priv->vmwa_context);
		if (eth_dev && priv->drop_queue.hrxq)
			mlx5_drop_action_destroy(eth_dev);
		if (own_domain_id)
			claim_zero(rte_eth_switch_domain_free(priv->domain_id));
		mlx5_cache_list_destroy(&priv->hrxqs);
		mlx5_free(priv);
		if (eth_dev != NULL)
			eth_dev->data->dev_private = NULL;
	}
	if (eth_dev != NULL) {
		/* mac_addrs must not be freed alone because it is part
		 * of dev_private.
		 */
		eth_dev->data->mac_addrs = NULL;
		rte_eth_dev_release_port(eth_dev);
	}
	if (sh)
		mlx5_free_shared_dev_ctx(sh);
	MLX5_ASSERT(err > 0);
	rte_errno = err;
	return NULL;
}

/**
 * Comparison callback to sort device data.
 *
 * This is meant to be used with qsort().
 *
 * @param a[in]
 *   Pointer to pointer to first data object.
 * @param b[in]
 *   Pointer to pointer to second data object.
 *
 * @return
 *   0 if both objects are equal, less than 0 if the first argument is less
 *   than the second, greater than 0 otherwise.
 */
static int
mlx5_dev_spawn_data_cmp(const void *a, const void *b)
{
	const struct mlx5_switch_info *si_a =
		&((const struct mlx5_dev_spawn_data *)a)->info;
	const struct mlx5_switch_info *si_b =
		&((const struct mlx5_dev_spawn_data *)b)->info;
	int ret;

	/* Master device first. */
	ret = si_b->master - si_a->master;
	if (ret)
		return ret;
	/* Then representor devices. */
	ret = si_b->representor - si_a->representor;
	if (ret)
		return ret;
	/* Unidentified devices come last in no specific order. */
	if (!si_a->representor)
		return 0;
	/* Order representors by name. */
	return si_a->port_name - si_b->port_name;
}
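
/*
 * Resulting probe order, for example: the E-Switch master device first,
 * then representors sorted by port name (representor 0, 1, 2, ...), and
 * finally any devices that are neither master nor representor, in no
 * specific order.
 */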

/**
 * Match PCI information for possible slaves of bonding device.
 *
 * @param[in] ibv_dev
 *   Pointer to Infiniband device structure.
 * @param[in] pci_dev
 *   Pointer to PCI device structure to match PCI address.
 * @param[in] nl_rdma
 *   Netlink RDMA group socket handle.
 *
 * @return
 *   Negative value if no bonding device found, otherwise
 *   positive index of slave PF in bonding.
 */
static int
mlx5_device_bond_pci_match(const struct ibv_device *ibv_dev,
			   const struct rte_pci_device *pci_dev,
			   int nl_rdma)
{
	char ifname[IF_NAMESIZE + 1];
	unsigned int ifindex;
	unsigned int np, i;
	FILE *file = NULL;
	int pf = -1;

	/*
	 * Try to get the master device name. If something goes wrong,
	 * assume there is no kernel support and no bonding devices.
	 */
	if (nl_rdma < 0)
		return -1;
	if (!strstr(ibv_dev->name, "bond"))
		return -1;
	np = mlx5_nl_portnum(nl_rdma, ibv_dev->name);
	if (!np)
		return -1;
	/*
	 * The master device might not be on the predefined port
	 * (not on port index 1, it is not guaranteed), we have to
	 * scan all Infiniband device ports and find the master.
	 */
	for (i = 1; i <= np; ++i) {
		/* Check whether Infiniband port is populated. */
		ifindex = mlx5_nl_ifindex(nl_rdma, ibv_dev->name, i);
		if (!ifindex)
			continue;
		if (!if_indextoname(ifindex, ifname))
			continue;
		/* Try to read bonding slave names from sysfs. */
		MKSTR(slaves,
		      "/sys/class/net/%s/master/bonding/slaves", ifname);
		file = fopen(slaves, "r");
		if (file)
			break;
	}
	if (!file)
		return -1;
	/* Use safe format to check maximal buffer length. */
	MLX5_ASSERT(atol(RTE_STR(IF_NAMESIZE)) == IF_NAMESIZE);
	while (fscanf(file, "%" RTE_STR(IF_NAMESIZE) "s", ifname) == 1) {
		char tmp_str[IF_NAMESIZE + 32];
		struct rte_pci_addr pci_addr;
		struct mlx5_switch_info info;

		/* Process slave interface names in the loop. */
		snprintf(tmp_str, sizeof(tmp_str),
			 "/sys/class/net/%s", ifname);
		if (mlx5_dev_to_pci_addr(tmp_str, &pci_addr)) {
			DRV_LOG(WARNING, "can not get PCI address"
				" for netdev \"%s\"", ifname);
			continue;
		}
		if (pci_dev->addr.domain != pci_addr.domain ||
		    pci_dev->addr.bus != pci_addr.bus ||
		    pci_dev->addr.devid != pci_addr.devid ||
		    pci_dev->addr.function != pci_addr.function)
			continue;
		/* Slave interface PCI address match found. */
		fclose(file);
		snprintf(tmp_str, sizeof(tmp_str),
			 "/sys/class/net/%s/phys_port_name", ifname);
		file = fopen(tmp_str, "rb");
		if (!file)
			break;
		info.name_type = MLX5_PHYS_PORT_NAME_TYPE_NOTSET;
		if (fscanf(file, "%32s", tmp_str) == 1)
			mlx5_translate_port_name(tmp_str, &info);
		if (info.name_type == MLX5_PHYS_PORT_NAME_TYPE_LEGACY ||
		    info.name_type == MLX5_PHYS_PORT_NAME_TYPE_UPLINK)
			pf = info.port_name;
		break;
	}
	if (file)
		fclose(file);
	return pf;
}
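
/*
 * For reference, phys_port_name values parsed above typically look like
 * "p0" for an uplink (PF) port, a plain number for legacy kernels, or
 * "pf0vf2" for a VF representor; only the legacy and uplink forms yield
 * a slave PF index here.
 */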

/**
 * DPDK callback to register a PCI device.
 *
 * This function spawns Ethernet devices out of a given PCI device.
 *
 * @param[in] pci_drv
 *   PCI driver structure (mlx5_driver).
 * @param[in] pci_dev
 *   PCI device information.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
int
mlx5_os_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
		  struct rte_pci_device *pci_dev)
{
	struct ibv_device **ibv_list;
	/*
	 * Number of found IB Devices matching with requested PCI BDF.
	 * nd != 1 means there are multiple IB devices over the same
	 * PCI device and we have representors and master.
	 */
	unsigned int nd = 0;
	/*
	 * Number of found IB device Ports. nd = 1 and np = 1..n means
	 * we have the single multiport IB device, and there may be
	 * representors attached to some of found ports.
	 */
	unsigned int np = 0;
	/*
	 * Number of DPDK ethernet devices to Spawn - either over
	 * multiple IB devices or multiple ports of single IB device.
	 * Actually this is the number of iterations to spawn.
	 */
	unsigned int ns = 0;
	/*
	 * Bonding device
	 *   < 0 - no bonding device (single one)
	 *  >= 0 - bonding device (value is slave PF index)
	 */
	int bd = -1;
	struct mlx5_dev_spawn_data *list = NULL;
	struct mlx5_dev_config dev_config;
	unsigned int dev_config_vf;
	int ret;

	if (rte_eal_process_type() == RTE_PROC_PRIMARY)
		mlx5_pmd_socket_init();
	ret = mlx5_init_once();
	if (ret) {
		DRV_LOG(ERR, "unable to init PMD global data: %s",
			strerror(rte_errno));
		return -rte_errno;
	}
	errno = 0;
	ibv_list = mlx5_glue->get_device_list(&ret);
	if (!ibv_list) {
		rte_errno = errno ? errno : ENOSYS;
		DRV_LOG(ERR, "cannot list devices, is ib_uverbs loaded?");
		return -rte_errno;
	}
	/*
	 * First scan the list of all Infiniband devices to find
	 * matching ones, gathering into the list.
	 */
	struct ibv_device *ibv_match[ret + 1];
	int nl_route = mlx5_nl_init(NETLINK_ROUTE);
	int nl_rdma = mlx5_nl_init(NETLINK_RDMA);
	unsigned int i;

	while (ret-- > 0) {
		struct rte_pci_addr pci_addr;

		DRV_LOG(DEBUG, "checking device \"%s\"", ibv_list[ret]->name);
		bd = mlx5_device_bond_pci_match
				(ibv_list[ret], pci_dev, nl_rdma);
		if (bd >= 0) {
			/*
			 * Bonding device detected. Only one match is allowed,
			 * the bonding is supported over multi-port IB device,
			 * there should be no matches on representor PCI
			 * functions or non VF LAG bonding devices with
			 * specified address.
			 */
			if (nd) {
				DRV_LOG(ERR,
					"multiple PCI match on bonding device"
					" \"%s\" found", ibv_list[ret]->name);
				rte_errno = ENOENT;
				ret = -rte_errno;
				goto exit;
			}
			DRV_LOG(INFO, "PCI information matches for"
				" slave %d bonding device \"%s\"",
				bd, ibv_list[ret]->name);
			ibv_match[nd++] = ibv_list[ret];
			break;
		}
		if (mlx5_dev_to_pci_addr
			(ibv_list[ret]->ibdev_path, &pci_addr))
			continue;
		if (pci_dev->addr.domain != pci_addr.domain ||
		    pci_dev->addr.bus != pci_addr.bus ||
		    pci_dev->addr.devid != pci_addr.devid ||
		    pci_dev->addr.function != pci_addr.function)
			continue;
		DRV_LOG(INFO, "PCI information matches for device \"%s\"",
			ibv_list[ret]->name);
		ibv_match[nd++] = ibv_list[ret];
	}
	ibv_match[nd] = NULL;
	if (!nd) {
		/* No device matches, just complain and bail out. */
		DRV_LOG(WARNING,
			"no Verbs device matches PCI device " PCI_PRI_FMT ","
			" are kernel drivers loaded?",
			pci_dev->addr.domain, pci_dev->addr.bus,
			pci_dev->addr.devid, pci_dev->addr.function);
		rte_errno = ENOENT;
		ret = -rte_errno;
		goto exit;
	}
		 */
		if (nl_rdma >= 0)
			np = mlx5_nl_portnum(nl_rdma, ibv_match[0]->name);
		if (!np)
			DRV_LOG(WARNING, "cannot get the number of ports"
				" for IB device \"%s\"", ibv_match[0]->name);
		if (bd >= 0 && !np) {
			DRV_LOG(ERR, "cannot get ports"
				" for the bonding device");
			rte_errno = ENOENT;
			ret = -rte_errno;
			goto exit;
		}
	}
#ifndef HAVE_MLX5DV_DR_DEVX_PORT
	if (bd >= 0) {
		/*
		 * This may happen if there is VF LAG kernel support and
		 * the application is compiled with an older rdma-core
		 * library.
		 */
		DRV_LOG(ERR,
			"No kernel/verbs support for VF LAG bonding found.");
		rte_errno = ENOTSUP;
		ret = -rte_errno;
		goto exit;
	}
#endif
	/*
	 * Now we can determine the maximal
	 * number of devices to be spawned.
	 */
	list = mlx5_malloc(MLX5_MEM_ZERO,
			   sizeof(struct mlx5_dev_spawn_data) *
			   (np ? np : nd),
			   RTE_CACHE_LINE_SIZE, SOCKET_ID_ANY);
	if (!list) {
		DRV_LOG(ERR, "spawn data array allocation failure");
		rte_errno = ENOMEM;
		ret = -rte_errno;
		goto exit;
	}
	if (bd >= 0 || np > 1) {
		/*
		 * A single IB device with multiple ports was found; it may
		 * be an E-Switch master device with representors. We have
		 * to perform the identification through the ports.
		 */
		MLX5_ASSERT(nl_rdma >= 0);
		MLX5_ASSERT(ns == 0);
		MLX5_ASSERT(nd == 1);
		MLX5_ASSERT(np);
		for (i = 1; i <= np; ++i) {
			list[ns].max_port = np;
			list[ns].phys_port = i;
			list[ns].phys_dev = ibv_match[0];
			list[ns].eth_dev = NULL;
			list[ns].pci_dev = pci_dev;
			list[ns].pf_bond = bd;
			list[ns].ifindex = mlx5_nl_ifindex
				(nl_rdma,
				 mlx5_os_get_dev_device_name
						(list[ns].phys_dev), i);
			if (!list[ns].ifindex) {
				/*
				 * No network interface index found for the
				 * specified port, it means there is no
				 * representor on this port. It's OK,
				 * there can be disabled ports, for example
				 * if sriov_numvfs < sriov_totalvfs.
				 */
				continue;
			}
			ret = -1;
			if (nl_route >= 0)
				ret = mlx5_nl_switch_info
					(nl_route,
					 list[ns].ifindex,
					 &list[ns].info);
			if (ret || (!list[ns].info.representor &&
				    !list[ns].info.master)) {
				/*
				 * We failed to recognize representors with
				 * Netlink, let's try to perform the task
				 * with sysfs.
				 */
				ret = mlx5_sysfs_switch_info
						(list[ns].ifindex,
						 &list[ns].info);
			}
			if (!ret && bd >= 0) {
				switch (list[ns].info.name_type) {
				case MLX5_PHYS_PORT_NAME_TYPE_UPLINK:
					if (list[ns].info.port_name == bd)
						ns++;
					break;
				case MLX5_PHYS_PORT_NAME_TYPE_PFHPF:
					/* Fallthrough */
				case MLX5_PHYS_PORT_NAME_TYPE_PFVF:
					if (list[ns].info.pf_num == bd)
						ns++;
					break;
				default:
					break;
				}
				continue;
			}
			if (!ret && (list[ns].info.representor ^
				     list[ns].info.master))
				ns++;
		}
		if (!ns) {
			DRV_LOG(ERR,
				"unable to recognize master/representors"
				" on the IB device with multiple ports");
			rte_errno = ENOENT;
			ret = -rte_errno;
			goto exit;
		}
	} else {
		/*
		 * The existence of several matching entries (nd > 1) means
		 * port representors have been instantiated. No existing Verbs
		 * call nor sysfs entries can tell them apart, this can only
		 * be done through Netlink calls assuming kernel drivers are
		 * recent enough to support them.
		 *
		 * In the event of identification failure through Netlink,
		 * try again through sysfs, then:
		 *
		 * 1. A single IB device matches (nd == 1) with a single
		 *    port (np = 0/1) and is not a representor: assume
		 *    no switch support.
		 *
		 * 2. Otherwise no safe assumptions can be made;
		 *    complain louder and bail out.
		 */
		for (i = 0; i != nd; ++i) {
			memset(&list[ns].info, 0, sizeof(list[ns].info));
			list[ns].max_port = 1;
			list[ns].phys_port = 1;
			list[ns].phys_dev = ibv_match[i];
			list[ns].eth_dev = NULL;
			list[ns].pci_dev = pci_dev;
			list[ns].pf_bond = -1;
			list[ns].ifindex = 0;
			if (nl_rdma >= 0)
				list[ns].ifindex = mlx5_nl_ifindex
					(nl_rdma,
					 mlx5_os_get_dev_device_name
						(list[ns].phys_dev), 1);
			if (!list[ns].ifindex) {
				char ifname[IF_NAMESIZE];

				/*
				 * Netlink failed, this may happen with an
				 * old ib_core kernel driver (before 4.16).
				 * We can assume the driver is old because
				 * here we are processing single-port IB
				 * devices. Let's try sysfs to retrieve
				 * the ifindex. This method works for
				 * the master device only.
				 */
				if (nd > 1) {
					/*
					 * Multiple devices found: assume
					 * representors, we cannot distinguish
					 * master from representor nor
					 * retrieve the ifindex via sysfs.
					 */
					continue;
				}
				ret = mlx5_get_ifname_sysfs
					(ibv_match[i]->ibdev_path, ifname);
				if (!ret)
					list[ns].ifindex =
						if_nametoindex(ifname);
				if (!list[ns].ifindex) {
					/*
					 * No network interface index found
					 * for the specified device, which
					 * means it is neither a representor
					 * nor a master.
					 */
					continue;
				}
			}
			ret = -1;
			if (nl_route >= 0)
				ret = mlx5_nl_switch_info
					(nl_route,
					 list[ns].ifindex,
					 &list[ns].info);
			if (ret || (!list[ns].info.representor &&
				    !list[ns].info.master)) {
				/*
				 * We failed to recognize representors with
				 * Netlink, let's try to perform the task
				 * with sysfs.
				 */
				ret = mlx5_sysfs_switch_info
						(list[ns].ifindex,
						 &list[ns].info);
			}
			if (!ret && (list[ns].info.representor ^
				     list[ns].info.master)) {
				ns++;
			} else if ((nd == 1) &&
				   !list[ns].info.representor &&
				   !list[ns].info.master) {
				/*
				 * A single IB device with one physical
				 * port and an attached network device.
				 * Maybe SR-IOV is not enabled or there
				 * are no representors.
				 */
				DRV_LOG(INFO, "no E-Switch support detected");
				ns++;
				break;
			}
		}
		if (!ns) {
			DRV_LOG(ERR,
				"unable to recognize master/representors"
				" on the multiple IB devices");
			rte_errno = ENOENT;
			ret = -rte_errno;
			goto exit;
		}
	}
	MLX5_ASSERT(ns);
	/*
	 * Sort the list to probe devices in natural order for the user's
	 * convenience (i.e. master first, then representors from lowest
	 * to highest ID).
	 */
	qsort(list, ns, sizeof(*list), mlx5_dev_spawn_data_cmp);
	/*
	 * Device specific configuration.
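	 * Known VF device IDs take the VF path below; in this file the
	 * VF flag mainly selects the Netlink-based MAC address management
	 * (see e.g. mlx5_os_mac_addr_add() further down).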
	 */
	switch (pci_dev->id.device_id) {
	case PCI_DEVICE_ID_MELLANOX_CONNECTX4VF:
	case PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF:
	case PCI_DEVICE_ID_MELLANOX_CONNECTX5VF:
	case PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF:
	case PCI_DEVICE_ID_MELLANOX_CONNECTX5BFVF:
	case PCI_DEVICE_ID_MELLANOX_CONNECTX6VF:
	case PCI_DEVICE_ID_MELLANOX_CONNECTX6DXVF:
		dev_config_vf = 1;
		break;
	default:
		dev_config_vf = 0;
		break;
	}
	for (i = 0; i != ns; ++i) {
		uint32_t restore;

		/* Default configuration. */
		memset(&dev_config, 0, sizeof(struct mlx5_dev_config));
		dev_config.vf = dev_config_vf;
		dev_config.mps = MLX5_ARG_UNSET;
		dev_config.dbnc = MLX5_ARG_UNSET;
		dev_config.rx_vec_en = 1;
		dev_config.txq_inline_max = MLX5_ARG_UNSET;
		dev_config.txq_inline_min = MLX5_ARG_UNSET;
		dev_config.txq_inline_mpw = MLX5_ARG_UNSET;
		dev_config.txqs_inline = MLX5_ARG_UNSET;
		dev_config.vf_nl_en = 1;
		dev_config.mr_ext_memseg_en = 1;
		dev_config.mprq.max_memcpy_len = MLX5_MPRQ_MEMCPY_DEFAULT_LEN;
		dev_config.mprq.min_rxqs_num = MLX5_MPRQ_MIN_RXQS;
		dev_config.dv_esw_en = 1;
		dev_config.dv_flow_en = 1;
		dev_config.decap_en = 1;
		dev_config.log_hp_size = MLX5_ARG_UNSET;
		list[i].eth_dev = mlx5_dev_spawn(&pci_dev->device,
						 &list[i],
						 &dev_config);
		if (!list[i].eth_dev) {
			if (rte_errno != EBUSY && rte_errno != EEXIST)
				break;
			/* Device is disabled or already spawned. Ignore it. */
			continue;
		}
		restore = list[i].eth_dev->data->dev_flags;
		rte_eth_copy_pci_info(list[i].eth_dev, pci_dev);
		/* Restore non-PCI flags cleared by the above call. */
		list[i].eth_dev->data->dev_flags |= restore;
		rte_eth_dev_probing_finish(list[i].eth_dev);
	}
	if (i != ns) {
		DRV_LOG(ERR,
			"probe of PCI device " PCI_PRI_FMT " aborted after"
			" encountering an error: %s",
			pci_dev->addr.domain, pci_dev->addr.bus,
			pci_dev->addr.devid, pci_dev->addr.function,
			strerror(rte_errno));
		ret = -rte_errno;
		/* Roll back. */
		while (i--) {
			if (!list[i].eth_dev)
				continue;
			mlx5_dev_close(list[i].eth_dev);
			/* mac_addrs must not be freed because it is part of
			 * dev_private.
			 */
			list[i].eth_dev->data->mac_addrs = NULL;
			claim_zero(rte_eth_dev_release_port(list[i].eth_dev));
		}
		/* Restore the original error. */
		rte_errno = -ret;
	} else {
		ret = 0;
	}
exit:
	/*
	 * Do the routine cleanup:
	 * - close opened Netlink sockets
	 * - free allocated spawn data array
	 * - free the Infiniband device list
	 */
	if (nl_rdma >= 0)
		close(nl_rdma);
	if (nl_route >= 0)
		close(nl_route);
	if (list)
		mlx5_free(list);
	MLX5_ASSERT(ibv_list);
	mlx5_glue->free_device_list(ibv_list);
	return ret;
}

static int
mlx5_config_doorbell_mapping_env(const struct mlx5_dev_config *config)
{
	char *env;
	int value;

	MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
	/* Save the current environment variable state to restore it later. */
	env = getenv(MLX5_SHUT_UP_BF);
	value = env ? !!strcmp(env, "0") : MLX5_ARG_UNSET;
	if (config->dbnc == MLX5_ARG_UNSET)
		setenv(MLX5_SHUT_UP_BF, MLX5_SHUT_UP_BF_DEFAULT, 1);
	else
		setenv(MLX5_SHUT_UP_BF,
		       config->dbnc == MLX5_TXDB_NCACHED ?
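		       /* Non-cached doorbells map to MLX5_SHUT_UP_BF=1,
			* i.e. rdma-core is asked to disable BlueFlame.
			*/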
"1" : "0", 1); 2167 return value; 2168 } 2169 2170 static void 2171 mlx5_restore_doorbell_mapping_env(int value) 2172 { 2173 MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY); 2174 /* Restore the original environment variable state. */ 2175 if (value == MLX5_ARG_UNSET) 2176 unsetenv(MLX5_SHUT_UP_BF); 2177 else 2178 setenv(MLX5_SHUT_UP_BF, value ? "1" : "0", 1); 2179 } 2180 2181 /** 2182 * Extract pdn of PD object using DV API. 2183 * 2184 * @param[in] pd 2185 * Pointer to the verbs PD object. 2186 * @param[out] pdn 2187 * Pointer to the PD object number variable. 2188 * 2189 * @return 2190 * 0 on success, error value otherwise. 2191 */ 2192 int 2193 mlx5_os_get_pdn(void *pd, uint32_t *pdn) 2194 { 2195 #ifdef HAVE_IBV_FLOW_DV_SUPPORT 2196 struct mlx5dv_obj obj; 2197 struct mlx5dv_pd pd_info; 2198 int ret = 0; 2199 2200 obj.pd.in = pd; 2201 obj.pd.out = &pd_info; 2202 ret = mlx5_glue->dv_init_obj(&obj, MLX5DV_OBJ_PD); 2203 if (ret) { 2204 DRV_LOG(DEBUG, "Fail to get PD object info"); 2205 return ret; 2206 } 2207 *pdn = pd_info.pdn; 2208 return 0; 2209 #else 2210 (void)pd; 2211 (void)pdn; 2212 return -ENOTSUP; 2213 #endif /* HAVE_IBV_FLOW_DV_SUPPORT */ 2214 } 2215 2216 /** 2217 * Function API to open IB device. 2218 * 2219 * This function calls the Linux glue APIs to open a device. 2220 * 2221 * @param[in] spawn 2222 * Pointer to the IB device attributes (name, port, etc). 2223 * @param[out] config 2224 * Pointer to device configuration structure. 2225 * @param[out] sh 2226 * Pointer to shared context structure. 2227 * 2228 * @return 2229 * 0 on success, a positive error value otherwise. 2230 */ 2231 int 2232 mlx5_os_open_device(const struct mlx5_dev_spawn_data *spawn, 2233 const struct mlx5_dev_config *config, 2234 struct mlx5_dev_ctx_shared *sh) 2235 { 2236 int dbmap_env; 2237 int err = 0; 2238 2239 sh->numa_node = spawn->pci_dev->device.numa_node; 2240 pthread_mutex_init(&sh->txpp.mutex, NULL); 2241 /* 2242 * Configure environment variable "MLX5_BF_SHUT_UP" 2243 * before the device creation. The rdma_core library 2244 * checks the variable at device creation and 2245 * stores the result internally. 2246 */ 2247 dbmap_env = mlx5_config_doorbell_mapping_env(config); 2248 /* Try to open IB device with DV first, then usual Verbs. */ 2249 errno = 0; 2250 sh->ctx = mlx5_glue->dv_open_device(spawn->phys_dev); 2251 if (sh->ctx) { 2252 sh->devx = 1; 2253 DRV_LOG(DEBUG, "DevX is supported"); 2254 /* The device is created, no need for environment. */ 2255 mlx5_restore_doorbell_mapping_env(dbmap_env); 2256 } else { 2257 /* The environment variable is still configured. */ 2258 sh->ctx = mlx5_glue->open_device(spawn->phys_dev); 2259 err = errno ? errno : ENODEV; 2260 /* 2261 * The environment variable is not needed anymore, 2262 * all device creation attempts are completed. 2263 */ 2264 mlx5_restore_doorbell_mapping_env(dbmap_env); 2265 if (!sh->ctx) 2266 return err; 2267 DRV_LOG(DEBUG, "DevX is NOT supported"); 2268 err = 0; 2269 } 2270 return err; 2271 } 2272 2273 /** 2274 * Install shared asynchronous device events handler. 2275 * This function is implemented to support event sharing 2276 * between multiple ports of single IB device. 2277 * 2278 * @param sh 2279 * Pointer to mlx5_dev_ctx_shared object. 
 */
void
mlx5_os_dev_shared_handler_install(struct mlx5_dev_ctx_shared *sh)
{
	int ret;
	int flags;

	sh->intr_handle.fd = -1;
	flags = fcntl(((struct ibv_context *)sh->ctx)->async_fd, F_GETFL);
	ret = fcntl(((struct ibv_context *)sh->ctx)->async_fd,
		    F_SETFL, flags | O_NONBLOCK);
	if (ret) {
		DRV_LOG(INFO, "failed to set the async event queue file"
			" descriptor to non-blocking");
	} else {
		sh->intr_handle.fd = ((struct ibv_context *)sh->ctx)->async_fd;
		sh->intr_handle.type = RTE_INTR_HANDLE_EXT;
		if (rte_intr_callback_register(&sh->intr_handle,
					mlx5_dev_interrupt_handler, sh)) {
			DRV_LOG(INFO, "Failed to install the shared"
				" interrupt handler.");
			sh->intr_handle.fd = -1;
		}
	}
	if (sh->devx) {
#ifdef HAVE_IBV_DEVX_ASYNC
		sh->intr_handle_devx.fd = -1;
		sh->devx_comp =
			(void *)mlx5_glue->devx_create_cmd_comp(sh->ctx);
		struct mlx5dv_devx_cmd_comp *devx_comp = sh->devx_comp;
		if (!devx_comp) {
			DRV_LOG(INFO, "failed to allocate devx_comp.");
			return;
		}
		flags = fcntl(devx_comp->fd, F_GETFL);
		ret = fcntl(devx_comp->fd, F_SETFL, flags | O_NONBLOCK);
		if (ret) {
			DRV_LOG(INFO, "failed to set the DevX completion"
				" file descriptor to non-blocking");
			return;
		}
		sh->intr_handle_devx.fd = devx_comp->fd;
		sh->intr_handle_devx.type = RTE_INTR_HANDLE_EXT;
		if (rte_intr_callback_register(&sh->intr_handle_devx,
					mlx5_dev_interrupt_handler_devx, sh)) {
			DRV_LOG(INFO, "Failed to install the DevX shared"
				" interrupt handler.");
			sh->intr_handle_devx.fd = -1;
		}
#endif /* HAVE_IBV_DEVX_ASYNC */
	}
}

/**
 * Uninstall the shared asynchronous device events handler.
 * This function is implemented to support event sharing
 * between multiple ports of a single IB device.
 *
 * @param sh
 *   Pointer to mlx5_dev_ctx_shared object.
 */
void
mlx5_os_dev_shared_handler_uninstall(struct mlx5_dev_ctx_shared *sh)
{
	if (sh->intr_handle.fd >= 0)
		mlx5_intr_callback_unregister(&sh->intr_handle,
					      mlx5_dev_interrupt_handler, sh);
#ifdef HAVE_IBV_DEVX_ASYNC
	if (sh->intr_handle_devx.fd >= 0)
		rte_intr_callback_unregister(&sh->intr_handle_devx,
				  mlx5_dev_interrupt_handler_devx, sh);
	if (sh->devx_comp)
		mlx5_glue->devx_destroy_cmd_comp(sh->devx_comp);
#endif
}

/**
 * Read statistics by a named counter.
 *
 * @param[in] priv
 *   Pointer to the private device data structure.
 * @param[in] ctr_name
 *   Pointer to the name of the statistics counter to read.
 * @param[out] stat
 *   Pointer to the variable receiving the read statistic value.
 *
 * @return
 *   0 on success and stat is valid, 1 if the value could not be read
 *   and rte_errno is set.
 */
int
mlx5_os_read_dev_stat(struct mlx5_priv *priv, const char *ctr_name,
		      uint64_t *stat)
{
	int fd;

	if (priv->sh) {
		MKSTR(path, "%s/ports/%d/hw_counters/%s",
		      priv->sh->ibdev_path,
		      priv->dev_port,
		      ctr_name);
		fd = open(path, O_RDONLY);
		/*
		 * In switchdev mode the file location is not per port
		 * but rather in <ibdev_path>/hw_counters/<file_name>.
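		 * (An illustrative example path would be
		 * /sys/class/infiniband/mlx5_0/hw_counters/out_of_buffer.)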
		 */
		if (fd == -1) {
			MKSTR(path1, "%s/hw_counters/%s",
			      priv->sh->ibdev_path,
			      ctr_name);
			fd = open(path1, O_RDONLY);
		}
		if (fd != -1) {
			char buf[21] = {'\0'};
			ssize_t n = read(fd, buf, sizeof(buf));

			close(fd);
			if (n != -1) {
				*stat = strtoull(buf, NULL, 10);
				return 0;
			}
		}
	}
	*stat = 0;
	return 1;
}

/**
 * Set the reg_mr and dereg_mr callbacks.
 *
 * @param[out] reg_mr_cb
 *   Pointer to the reg_mr function pointer.
 * @param[out] dereg_mr_cb
 *   Pointer to the dereg_mr function pointer.
 */
void
mlx5_os_set_reg_mr_cb(mlx5_reg_mr_t *reg_mr_cb,
		      mlx5_dereg_mr_t *dereg_mr_cb)
{
	*reg_mr_cb = mlx5_verbs_ops.reg_mr;
	*dereg_mr_cb = mlx5_verbs_ops.dereg_mr;
}

/**
 * Remove a MAC address from the device.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param index
 *   MAC address index.
 */
void
mlx5_os_mac_addr_remove(struct rte_eth_dev *dev, uint32_t index)
{
	struct mlx5_priv *priv = dev->data->dev_private;
	const int vf = priv->config.vf;

	if (vf)
		mlx5_nl_mac_addr_remove(priv->nl_socket_route,
					mlx5_ifindex(dev), priv->mac_own,
					&dev->data->mac_addrs[index], index);
}

/**
 * Add a MAC address to the device.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param mac
 *   MAC address to register.
 * @param index
 *   MAC address index.
 *
 * @return
 *   0 on success, a negative errno value otherwise.
 */
int
mlx5_os_mac_addr_add(struct rte_eth_dev *dev, struct rte_ether_addr *mac,
		     uint32_t index)
{
	struct mlx5_priv *priv = dev->data->dev_private;
	const int vf = priv->config.vf;
	int ret = 0;

	if (vf)
		ret = mlx5_nl_mac_addr_add(priv->nl_socket_route,
					   mlx5_ifindex(dev), priv->mac_own,
					   mac, index);
	return ret;
}

/**
 * Modify a VF MAC address.
 *
 * @param priv
 *   Pointer to device private data.
 * @param iface_idx
 *   Net device interface index.
 * @param mac_addr
 *   MAC address to modify into.
 * @param vf_index
 *   VF index.
 *
 * @return
 *   0 on success, a negative errno value otherwise.
 */
int
mlx5_os_vf_mac_addr_modify(struct mlx5_priv *priv,
			   unsigned int iface_idx,
			   struct rte_ether_addr *mac_addr,
			   int vf_index)
{
	return mlx5_nl_vf_mac_addr_modify
		(priv->nl_socket_route, iface_idx, mac_addr, vf_index);
}

/**
 * Set device promiscuous mode.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param enable
 *   0 - promiscuous is disabled, otherwise - enabled.
 *
 * @return
 *   0 on success, a negative error value otherwise.
 */
int
mlx5_os_set_promisc(struct rte_eth_dev *dev, int enable)
{
	struct mlx5_priv *priv = dev->data->dev_private;

	return mlx5_nl_promisc(priv->nl_socket_route,
			       mlx5_ifindex(dev), !!enable);
}

/**
 * Set device allmulticast mode.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param enable
 *   0 - all multicast is disabled, otherwise - enabled.
 *
 * @return
 *   0 on success, a negative error value otherwise.
 */
int
mlx5_os_set_allmulti(struct rte_eth_dev *dev, int enable)
{
	struct mlx5_priv *priv = dev->data->dev_private;

	return mlx5_nl_allmulti(priv->nl_socket_route,
				mlx5_ifindex(dev), !!enable);
}

/**
 * Flush device MAC addresses.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 */
void
mlx5_os_mac_addr_flush(struct rte_eth_dev *dev)
{
	struct mlx5_priv *priv = dev->data->dev_private;

	mlx5_nl_mac_addr_flush(priv->nl_socket_route, mlx5_ifindex(dev),
			       dev->data->mac_addrs,
			       MLX5_MAX_MAC_ADDRESSES, priv->mac_own);
}

const struct eth_dev_ops mlx5_os_dev_ops = {
	.dev_configure = mlx5_dev_configure,
	.dev_start = mlx5_dev_start,
	.dev_stop = mlx5_dev_stop,
	.dev_set_link_down = mlx5_set_link_down,
	.dev_set_link_up = mlx5_set_link_up,
	.dev_close = mlx5_dev_close,
	.promiscuous_enable = mlx5_promiscuous_enable,
	.promiscuous_disable = mlx5_promiscuous_disable,
	.allmulticast_enable = mlx5_allmulticast_enable,
	.allmulticast_disable = mlx5_allmulticast_disable,
	.link_update = mlx5_link_update,
	.stats_get = mlx5_stats_get,
	.stats_reset = mlx5_stats_reset,
	.xstats_get = mlx5_xstats_get,
	.xstats_reset = mlx5_xstats_reset,
	.xstats_get_names = mlx5_xstats_get_names,
	.fw_version_get = mlx5_fw_version_get,
	.dev_infos_get = mlx5_dev_infos_get,
	.read_clock = mlx5_txpp_read_clock,
	.dev_supported_ptypes_get = mlx5_dev_supported_ptypes_get,
	.vlan_filter_set = mlx5_vlan_filter_set,
	.rx_queue_setup = mlx5_rx_queue_setup,
	.rx_hairpin_queue_setup = mlx5_rx_hairpin_queue_setup,
	.tx_queue_setup = mlx5_tx_queue_setup,
	.tx_hairpin_queue_setup = mlx5_tx_hairpin_queue_setup,
	.rx_queue_release = mlx5_rx_queue_release,
	.tx_queue_release = mlx5_tx_queue_release,
	.rx_queue_start = mlx5_rx_queue_start,
	.rx_queue_stop = mlx5_rx_queue_stop,
	.tx_queue_start = mlx5_tx_queue_start,
	.tx_queue_stop = mlx5_tx_queue_stop,
	.flow_ctrl_get = mlx5_dev_get_flow_ctrl,
	.flow_ctrl_set = mlx5_dev_set_flow_ctrl,
	.mac_addr_remove = mlx5_mac_addr_remove,
	.mac_addr_add = mlx5_mac_addr_add,
	.mac_addr_set = mlx5_mac_addr_set,
	.set_mc_addr_list = mlx5_set_mc_addr_list,
	.mtu_set = mlx5_dev_set_mtu,
	.vlan_strip_queue_set = mlx5_vlan_strip_queue_set,
	.vlan_offload_set = mlx5_vlan_offload_set,
	.reta_update = mlx5_dev_rss_reta_update,
	.reta_query = mlx5_dev_rss_reta_query,
	.rss_hash_update = mlx5_rss_hash_update,
	.rss_hash_conf_get = mlx5_rss_hash_conf_get,
	.filter_ctrl = mlx5_dev_filter_ctrl,
	.rxq_info_get = mlx5_rxq_info_get,
	.txq_info_get = mlx5_txq_info_get,
	.rx_burst_mode_get = mlx5_rx_burst_mode_get,
	.tx_burst_mode_get = mlx5_tx_burst_mode_get,
	.rx_queue_intr_enable = mlx5_rx_intr_enable,
	.rx_queue_intr_disable = mlx5_rx_intr_disable,
	.is_removed = mlx5_is_removed,
	.udp_tunnel_port_add = mlx5_udp_tunnel_port_add,
	.get_module_info = mlx5_get_module_info,
	.get_module_eeprom = mlx5_get_module_eeprom,
	.hairpin_cap_get = mlx5_hairpin_cap_get,
	.mtr_ops_get = mlx5_flow_meter_ops_get,
	.hairpin_bind = mlx5_hairpin_bind,
	.hairpin_unbind = mlx5_hairpin_unbind,
	.hairpin_get_peer_ports = mlx5_hairpin_get_peer_ports,
	.hairpin_queue_peer_update = mlx5_hairpin_queue_peer_update,
	.hairpin_queue_peer_bind = mlx5_hairpin_queue_peer_bind,
	.hairpin_queue_peer_unbind = mlx5_hairpin_queue_peer_unbind,
};

/* Available operations from secondary process. */
const struct eth_dev_ops mlx5_os_dev_sec_ops = {
	.stats_get = mlx5_stats_get,
	.stats_reset = mlx5_stats_reset,
	.xstats_get = mlx5_xstats_get,
	.xstats_reset = mlx5_xstats_reset,
	.xstats_get_names = mlx5_xstats_get_names,
	.fw_version_get = mlx5_fw_version_get,
	.dev_infos_get = mlx5_dev_infos_get,
	.read_clock = mlx5_txpp_read_clock,
	.rx_queue_start = mlx5_rx_queue_start,
	.rx_queue_stop = mlx5_rx_queue_stop,
	.tx_queue_start = mlx5_tx_queue_start,
	.tx_queue_stop = mlx5_tx_queue_stop,
	.rxq_info_get = mlx5_rxq_info_get,
	.txq_info_get = mlx5_txq_info_get,
	.rx_burst_mode_get = mlx5_rx_burst_mode_get,
	.tx_burst_mode_get = mlx5_tx_burst_mode_get,
	.get_module_info = mlx5_get_module_info,
	.get_module_eeprom = mlx5_get_module_eeprom,
};

/* Available operations in flow isolated mode. */
const struct eth_dev_ops mlx5_os_dev_ops_isolate = {
	.dev_configure = mlx5_dev_configure,
	.dev_start = mlx5_dev_start,
	.dev_stop = mlx5_dev_stop,
	.dev_set_link_down = mlx5_set_link_down,
	.dev_set_link_up = mlx5_set_link_up,
	.dev_close = mlx5_dev_close,
	.promiscuous_enable = mlx5_promiscuous_enable,
	.promiscuous_disable = mlx5_promiscuous_disable,
	.allmulticast_enable = mlx5_allmulticast_enable,
	.allmulticast_disable = mlx5_allmulticast_disable,
	.link_update = mlx5_link_update,
	.stats_get = mlx5_stats_get,
	.stats_reset = mlx5_stats_reset,
	.xstats_get = mlx5_xstats_get,
	.xstats_reset = mlx5_xstats_reset,
	.xstats_get_names = mlx5_xstats_get_names,
	.fw_version_get = mlx5_fw_version_get,
	.dev_infos_get = mlx5_dev_infos_get,
	.read_clock = mlx5_txpp_read_clock,
	.dev_supported_ptypes_get = mlx5_dev_supported_ptypes_get,
	.vlan_filter_set = mlx5_vlan_filter_set,
	.rx_queue_setup = mlx5_rx_queue_setup,
	.rx_hairpin_queue_setup = mlx5_rx_hairpin_queue_setup,
	.tx_queue_setup = mlx5_tx_queue_setup,
	.tx_hairpin_queue_setup = mlx5_tx_hairpin_queue_setup,
	.rx_queue_release = mlx5_rx_queue_release,
	.tx_queue_release = mlx5_tx_queue_release,
	.rx_queue_start = mlx5_rx_queue_start,
	.rx_queue_stop = mlx5_rx_queue_stop,
	.tx_queue_start = mlx5_tx_queue_start,
	.tx_queue_stop = mlx5_tx_queue_stop,
	.flow_ctrl_get = mlx5_dev_get_flow_ctrl,
	.flow_ctrl_set = mlx5_dev_set_flow_ctrl,
	.mac_addr_remove = mlx5_mac_addr_remove,
	.mac_addr_add = mlx5_mac_addr_add,
	.mac_addr_set = mlx5_mac_addr_set,
	.set_mc_addr_list = mlx5_set_mc_addr_list,
	.mtu_set = mlx5_dev_set_mtu,
	.vlan_strip_queue_set = mlx5_vlan_strip_queue_set,
	.vlan_offload_set = mlx5_vlan_offload_set,
	.filter_ctrl = mlx5_dev_filter_ctrl,
	.rxq_info_get = mlx5_rxq_info_get,
	.txq_info_get = mlx5_txq_info_get,
	.rx_burst_mode_get = mlx5_rx_burst_mode_get,
	.tx_burst_mode_get = mlx5_tx_burst_mode_get,
	.rx_queue_intr_enable = mlx5_rx_intr_enable,
	.rx_queue_intr_disable = mlx5_rx_intr_disable,
	.is_removed = mlx5_is_removed,
	.get_module_info = mlx5_get_module_info,
	.get_module_eeprom = mlx5_get_module_eeprom,
	.hairpin_cap_get = mlx5_hairpin_cap_get,
	.mtr_ops_get = mlx5_flow_meter_ops_get,
	.hairpin_bind = mlx5_hairpin_bind,
	.hairpin_unbind = mlx5_hairpin_unbind,
	.hairpin_get_peer_ports = mlx5_hairpin_get_peer_ports,
	.hairpin_queue_peer_update = mlx5_hairpin_queue_peer_update,
	.hairpin_queue_peer_bind = mlx5_hairpin_queue_peer_bind,
	.hairpin_queue_peer_unbind = mlx5_hairpin_queue_peer_unbind,
};
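
/*
 * Note: nothing here registers the three ops tables above; selection
 * happens elsewhere in the PMD (during device spawn and when flow
 * isolation is toggled). A rough sketch of the expected switching,
 * for illustration only (not the literal call site):
 *
 *	eth_dev->dev_ops = isolated ? &mlx5_os_dev_ops_isolate
 *				    : &mlx5_os_dev_ops;
 *
 * while secondary processes are restricted to mlx5_os_dev_sec_ops.
 */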