/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright 2015 6WIND S.A.
 * Copyright 2020 Mellanox Technologies, Ltd
 */

#include <stddef.h>
#include <unistd.h>
#include <string.h>
#include <stdint.h>
#include <stdlib.h>
#include <errno.h>
#include <net/if.h>
#include <linux/rtnetlink.h>
#include <linux/sockios.h>
#include <linux/ethtool.h>
#include <fcntl.h>

#include <rte_malloc.h>
#include <rte_ethdev_driver.h>
#include <rte_ethdev_pci.h>
#include <rte_pci.h>
#include <rte_bus_pci.h>
#include <rte_common.h>
#include <rte_kvargs.h>
#include <rte_rwlock.h>
#include <rte_spinlock.h>
#include <rte_string_fns.h>
#include <rte_alarm.h>
#include <rte_eal_paging.h>

#include <mlx5_glue.h>
#include <mlx5_devx_cmds.h>
#include <mlx5_common.h>
#include <mlx5_common_mp.h>
#include <mlx5_common_mr.h>
#include <mlx5_malloc.h>

#include "mlx5_defs.h"
#include "mlx5.h"
#include "mlx5_common_os.h"
#include "mlx5_utils.h"
#include "mlx5_rxtx.h"
#include "mlx5_autoconf.h"
#include "mlx5_mr.h"
#include "mlx5_flow.h"
#include "rte_pmd_mlx5.h"
#include "mlx5_verbs.h"
#include "mlx5_nl.h"
#include "mlx5_devx.h"

#define MLX5_TAGS_HLIST_ARRAY_SIZE 8192

#ifndef HAVE_IBV_MLX5_MOD_MPW
#define MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED (1 << 2)
#define MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW (1 << 3)
#endif

#ifndef HAVE_IBV_MLX5_MOD_CQE_128B_COMP
#define MLX5DV_CONTEXT_FLAGS_CQE_128B_COMP (1 << 4)
#endif

static const char *MZ_MLX5_PMD_SHARED_DATA = "mlx5_pmd_shared_data";

/* Spinlock for mlx5_shared_data allocation. */
static rte_spinlock_t mlx5_shared_data_lock = RTE_SPINLOCK_INITIALIZER;

/* Process local data for secondary processes. */
static struct mlx5_local_data mlx5_local_data;

/**
 * Set the completion channel file descriptor interrupt as non-blocking.
 *
 * @param fd
 *   The completion channel file descriptor (representing the interrupt)
 *   to set as non-blocking.
 *
 * @return
 *   0 on successfully setting the fd to non-blocking, non-zero otherwise.
 */
int
mlx5_os_set_nonblock_channel_fd(int fd)
{
	int flags;

	flags = fcntl(fd, F_GETFL);
	if (flags == -1)
		return -1;
	return fcntl(fd, F_SETFL, flags | O_NONBLOCK);
}
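/*
 * Usage sketch (illustrative only; "ch" stands for a Verbs completion
 * channel obtained elsewhere, e.g. from mlx5_glue->create_comp_channel()):
 *
 *	if (mlx5_os_set_nonblock_channel_fd(ch->fd))
 *		DRV_LOG(WARNING, "Cannot set channel fd non-blocking");
 */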
/**
 * Get mlx5 device attributes. The glue function query_device_ex() is called
 * with out parameter of type 'struct ibv_device_attr_ex *'. Then fill in mlx5
 * device attributes from the glue out parameter.
 *
 * @param ctx
 *   Pointer to ibv context.
 *
 * @param device_attr
 *   Pointer to mlx5 device attributes.
 *
 * @return
 *   0 on success, a non-zero error number otherwise.
 */
int
mlx5_os_get_dev_attr(void *ctx, struct mlx5_dev_attr *device_attr)
{
	int err;
	struct ibv_device_attr_ex attr_ex;

	memset(device_attr, 0, sizeof(*device_attr));
	err = mlx5_glue->query_device_ex(ctx, NULL, &attr_ex);
	if (err)
		return err;
	device_attr->device_cap_flags_ex = attr_ex.device_cap_flags_ex;
	device_attr->max_qp_wr = attr_ex.orig_attr.max_qp_wr;
	device_attr->max_sge = attr_ex.orig_attr.max_sge;
	device_attr->max_cq = attr_ex.orig_attr.max_cq;
	device_attr->max_qp = attr_ex.orig_attr.max_qp;
	device_attr->raw_packet_caps = attr_ex.raw_packet_caps;
	device_attr->max_rwq_indirection_table_size =
		attr_ex.rss_caps.max_rwq_indirection_table_size;
	device_attr->max_tso = attr_ex.tso_caps.max_tso;
	device_attr->tso_supported_qpts = attr_ex.tso_caps.supported_qpts;

	struct mlx5dv_context dv_attr = { .comp_mask = 0 };

	err = mlx5_glue->dv_query_device(ctx, &dv_attr);
	if (err)
		return err;
	device_attr->flags = dv_attr.flags;
	device_attr->comp_mask = dv_attr.comp_mask;
#ifdef HAVE_IBV_MLX5_MOD_SWP
	device_attr->sw_parsing_offloads =
		dv_attr.sw_parsing_caps.sw_parsing_offloads;
#endif
	device_attr->min_single_stride_log_num_of_bytes =
		dv_attr.striding_rq_caps.min_single_stride_log_num_of_bytes;
	device_attr->max_single_stride_log_num_of_bytes =
		dv_attr.striding_rq_caps.max_single_stride_log_num_of_bytes;
	device_attr->min_single_wqe_log_num_of_strides =
		dv_attr.striding_rq_caps.min_single_wqe_log_num_of_strides;
	device_attr->max_single_wqe_log_num_of_strides =
		dv_attr.striding_rq_caps.max_single_wqe_log_num_of_strides;
	device_attr->stride_supported_qpts =
		dv_attr.striding_rq_caps.supported_qpts;
#ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT
	device_attr->tunnel_offloads_caps = dv_attr.tunnel_offloads_caps;
#endif

	return err;
}

/**
 * Verbs callback to allocate memory. This function should allocate the space
 * according to the size provided residing inside a huge page.
 * Please note that all allocation must respect the alignment from libmlx5
 * (i.e. currently rte_mem_page_size()).
 *
 * @param[in] size
 *   The size in bytes of the memory to allocate.
 * @param[in] data
 *   A pointer to the callback data.
 *
 * @return
 *   Allocated buffer, NULL otherwise and rte_errno is set.
 */
static void *
mlx5_alloc_verbs_buf(size_t size, void *data)
{
	struct mlx5_priv *priv = data;
	void *ret;
	unsigned int socket = SOCKET_ID_ANY;
	size_t alignment = rte_mem_page_size();

	if (alignment == (size_t)-1) {
		DRV_LOG(ERR, "Failed to get mem page size");
		rte_errno = ENOMEM;
		return NULL;
	}
	if (priv->verbs_alloc_ctx.type == MLX5_VERBS_ALLOC_TYPE_TX_QUEUE) {
		const struct mlx5_txq_ctrl *ctrl = priv->verbs_alloc_ctx.obj;

		socket = ctrl->socket;
	} else if (priv->verbs_alloc_ctx.type ==
		   MLX5_VERBS_ALLOC_TYPE_RX_QUEUE) {
		const struct mlx5_rxq_ctrl *ctrl = priv->verbs_alloc_ctx.obj;

		socket = ctrl->socket;
	}
	MLX5_ASSERT(data != NULL);
	ret = mlx5_malloc(0, size, alignment, socket);
	if (!ret && size)
		rte_errno = ENOMEM;
	return ret;
}

/**
 * Verbs callback to free memory.
 *
 * @param[in] ptr
 *   A pointer to the memory to free.
 * @param[in] data
 *   A pointer to the callback data.
 */
static void
mlx5_free_verbs_buf(void *ptr, void *data __rte_unused)
{
	MLX5_ASSERT(data != NULL);
	mlx5_free(ptr);
}
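/*
 * Note: the two callbacks above are not called directly by the PMD. They
 * are handed over to libmlx5 through mlx5_glue->dv_set_context_attr() with
 * MLX5DV_CTX_ATTR_BUF_ALLOCATORS (see mlx5_dev_spawn() below), so that
 * data-plane buffers allocated inside rdma-core come from the PMD
 * allocator and respect the configured memory model.
 */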
/**
 * Initialize DR related data within private structure.
 * Routine checks the reference counter and does actual
 * resources creation/initialization only if counter is zero.
 *
 * @param[in] priv
 *   Pointer to the private device data structure.
 *
 * @return
 *   Zero on success, positive error code otherwise.
 */
static int
mlx5_alloc_shared_dr(struct mlx5_priv *priv)
{
	struct mlx5_dev_ctx_shared *sh = priv->sh;
	char s[MLX5_HLIST_NAMESIZE] __rte_unused;
	int err;

	MLX5_ASSERT(sh && sh->refcnt);
	if (sh->refcnt > 1)
		return 0;
	err = mlx5_alloc_table_hash_list(priv);
	if (err)
		goto error;
	/* The resources below are only valid with DV support. */
#ifdef HAVE_IBV_FLOW_DV_SUPPORT
	/* Init port id action cache list. */
	snprintf(s, sizeof(s), "%s_port_id_action_cache", sh->ibdev_name);
	mlx5_cache_list_init(&sh->port_id_action_list, s, 0, sh,
			     flow_dv_port_id_create_cb,
			     flow_dv_port_id_match_cb,
			     flow_dv_port_id_remove_cb);
	/* Create tags hash list table. */
	snprintf(s, sizeof(s), "%s_tags", sh->ibdev_name);
	sh->tag_table = mlx5_hlist_create(s, MLX5_TAGS_HLIST_ARRAY_SIZE, 0,
					  MLX5_HLIST_WRITE_MOST,
					  flow_dv_tag_create_cb, NULL,
					  flow_dv_tag_remove_cb);
	if (!sh->tag_table) {
		DRV_LOG(ERR, "tags hash list creation failed.");
		err = ENOMEM;
		goto error;
	}
	sh->tag_table->ctx = sh;
	snprintf(s, sizeof(s), "%s_hdr_modify", sh->ibdev_name);
	sh->modify_cmds = mlx5_hlist_create(s, MLX5_FLOW_HDR_MODIFY_HTABLE_SZ,
					    0, MLX5_HLIST_WRITE_MOST |
					    MLX5_HLIST_DIRECT_KEY,
					    flow_dv_modify_create_cb,
					    flow_dv_modify_match_cb,
					    flow_dv_modify_remove_cb);
	if (!sh->modify_cmds) {
		DRV_LOG(ERR, "hdr modify hash creation failed");
		err = ENOMEM;
		goto error;
	}
	sh->modify_cmds->ctx = sh;
	snprintf(s, sizeof(s), "%s_encaps_decaps", sh->ibdev_name);
	sh->encaps_decaps = mlx5_hlist_create(s,
					      MLX5_FLOW_ENCAP_DECAP_HTABLE_SZ,
					      0, MLX5_HLIST_DIRECT_KEY |
					      MLX5_HLIST_WRITE_MOST,
					      flow_dv_encap_decap_create_cb,
					      flow_dv_encap_decap_match_cb,
					      flow_dv_encap_decap_remove_cb);
	if (!sh->encaps_decaps) {
		DRV_LOG(ERR, "encap decap hash creation failed");
		err = ENOMEM;
		goto error;
	}
	sh->encaps_decaps->ctx = sh;
#endif
#ifdef HAVE_MLX5DV_DR
	void *domain;

	/* Reference counter is zero, we should initialize structures. */
	domain = mlx5_glue->dr_create_domain(sh->ctx,
					     MLX5DV_DR_DOMAIN_TYPE_NIC_RX);
	if (!domain) {
		DRV_LOG(ERR, "ingress mlx5dv_dr_create_domain failed");
		err = errno;
		goto error;
	}
	sh->rx_domain = domain;
	domain = mlx5_glue->dr_create_domain(sh->ctx,
					     MLX5DV_DR_DOMAIN_TYPE_NIC_TX);
	if (!domain) {
		DRV_LOG(ERR, "egress mlx5dv_dr_create_domain failed");
		err = errno;
		goto error;
	}
	pthread_mutex_init(&sh->dv_mutex, NULL);
	sh->tx_domain = domain;
#ifdef HAVE_MLX5DV_DR_ESWITCH
	if (priv->config.dv_esw_en) {
		domain = mlx5_glue->dr_create_domain
			(sh->ctx, MLX5DV_DR_DOMAIN_TYPE_FDB);
		if (!domain) {
			DRV_LOG(ERR, "FDB mlx5dv_dr_create_domain failed");
			err = errno;
			goto error;
		}
		sh->fdb_domain = domain;
		sh->esw_drop_action = mlx5_glue->dr_create_flow_action_drop();
	}
#endif
	if (!sh->tunnel_hub)
		err = mlx5_alloc_tunnel_hub(sh);
	if (err) {
		DRV_LOG(ERR, "mlx5_alloc_tunnel_hub failed err=%d", err);
		goto error;
	}
	if (priv->config.reclaim_mode == MLX5_RCM_AGGR) {
		mlx5_glue->dr_reclaim_domain_memory(sh->rx_domain, 1);
		mlx5_glue->dr_reclaim_domain_memory(sh->tx_domain, 1);
		if (sh->fdb_domain)
			mlx5_glue->dr_reclaim_domain_memory(sh->fdb_domain, 1);
	}
	sh->pop_vlan_action = mlx5_glue->dr_create_flow_action_pop_vlan();
#endif /* HAVE_MLX5DV_DR */
	sh->default_miss_action =
		mlx5_glue->dr_create_flow_action_default_miss();
	if (!sh->default_miss_action)
		DRV_LOG(WARNING, "Default miss action is not supported.");
	return 0;
error:
	/* Rollback the created objects. */
	if (sh->rx_domain) {
		mlx5_glue->dr_destroy_domain(sh->rx_domain);
		sh->rx_domain = NULL;
	}
	if (sh->tx_domain) {
		mlx5_glue->dr_destroy_domain(sh->tx_domain);
		sh->tx_domain = NULL;
	}
	if (sh->fdb_domain) {
		mlx5_glue->dr_destroy_domain(sh->fdb_domain);
		sh->fdb_domain = NULL;
	}
	if (sh->esw_drop_action) {
		mlx5_glue->destroy_flow_action(sh->esw_drop_action);
		sh->esw_drop_action = NULL;
	}
	if (sh->pop_vlan_action) {
		mlx5_glue->destroy_flow_action(sh->pop_vlan_action);
		sh->pop_vlan_action = NULL;
	}
	if (sh->encaps_decaps) {
		mlx5_hlist_destroy(sh->encaps_decaps);
		sh->encaps_decaps = NULL;
	}
	if (sh->modify_cmds) {
		mlx5_hlist_destroy(sh->modify_cmds);
		sh->modify_cmds = NULL;
	}
	if (sh->tag_table) {
		/* tags should be destroyed with flows beforehand. */
		mlx5_hlist_destroy(sh->tag_table);
		sh->tag_table = NULL;
	}
	if (sh->tunnel_hub) {
		mlx5_release_tunnel_hub(sh, priv->dev_port);
		sh->tunnel_hub = NULL;
	}
	mlx5_free_table_hash_list(priv);
	return err;
}
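/*
 * Illustrative summary of the contract (not an additional runtime check):
 * mlx5_alloc_shared_dr() and mlx5_os_free_shared_dr() rely on sh->refcnt.
 * Resources are created only by the first port referencing the shared
 * context and released only when the last reference is dropped; all other
 * ports return early after the refcnt test.
 */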
/**
 * Destroy DR related data within private structure.
 *
 * @param[in] priv
 *   Pointer to the private device data structure.
 */
void
mlx5_os_free_shared_dr(struct mlx5_priv *priv)
{
	struct mlx5_dev_ctx_shared *sh = priv->sh;

	MLX5_ASSERT(sh && sh->refcnt);
	if (sh->refcnt > 1)
		return;
#ifdef HAVE_MLX5DV_DR
	if (sh->rx_domain) {
		mlx5_glue->dr_destroy_domain(sh->rx_domain);
		sh->rx_domain = NULL;
	}
	if (sh->tx_domain) {
		mlx5_glue->dr_destroy_domain(sh->tx_domain);
		sh->tx_domain = NULL;
	}
#ifdef HAVE_MLX5DV_DR_ESWITCH
	if (sh->fdb_domain) {
		mlx5_glue->dr_destroy_domain(sh->fdb_domain);
		sh->fdb_domain = NULL;
	}
	if (sh->esw_drop_action) {
		mlx5_glue->destroy_flow_action(sh->esw_drop_action);
		sh->esw_drop_action = NULL;
	}
#endif
	if (sh->pop_vlan_action) {
		mlx5_glue->destroy_flow_action(sh->pop_vlan_action);
		sh->pop_vlan_action = NULL;
	}
	pthread_mutex_destroy(&sh->dv_mutex);
#endif /* HAVE_MLX5DV_DR */
	if (sh->default_miss_action)
		mlx5_glue->destroy_flow_action
				(sh->default_miss_action);
	if (sh->encaps_decaps) {
		mlx5_hlist_destroy(sh->encaps_decaps);
		sh->encaps_decaps = NULL;
	}
	if (sh->modify_cmds) {
		mlx5_hlist_destroy(sh->modify_cmds);
		sh->modify_cmds = NULL;
	}
	if (sh->tag_table) {
		/* tags should be destroyed with flows beforehand. */
		mlx5_hlist_destroy(sh->tag_table);
		sh->tag_table = NULL;
	}
	if (sh->tunnel_hub) {
		mlx5_release_tunnel_hub(sh, priv->dev_port);
		sh->tunnel_hub = NULL;
	}
	mlx5_cache_list_destroy(&sh->port_id_action_list);
	mlx5_free_table_hash_list(priv);
}

/**
 * Initialize shared data between primary and secondary process.
 *
 * A memzone is reserved by primary process and secondary processes attach to
 * the memzone.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
static int
mlx5_init_shared_data(void)
{
	const struct rte_memzone *mz;
	int ret = 0;

	rte_spinlock_lock(&mlx5_shared_data_lock);
	if (mlx5_shared_data == NULL) {
		if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
			/* Allocate shared memory. */
			mz = rte_memzone_reserve(MZ_MLX5_PMD_SHARED_DATA,
						 sizeof(*mlx5_shared_data),
						 SOCKET_ID_ANY, 0);
			if (mz == NULL) {
				DRV_LOG(ERR,
					"Cannot allocate mlx5 shared data");
				ret = -rte_errno;
				goto error;
			}
			mlx5_shared_data = mz->addr;
			memset(mlx5_shared_data, 0, sizeof(*mlx5_shared_data));
			rte_spinlock_init(&mlx5_shared_data->lock);
		} else {
			/* Lookup allocated shared memory. */
			mz = rte_memzone_lookup(MZ_MLX5_PMD_SHARED_DATA);
			if (mz == NULL) {
				DRV_LOG(ERR,
					"Cannot attach mlx5 shared data");
				ret = -rte_errno;
				goto error;
			}
			mlx5_shared_data = mz->addr;
			memset(&mlx5_local_data, 0, sizeof(mlx5_local_data));
		}
	}
error:
	rte_spinlock_unlock(&mlx5_shared_data_lock);
	return ret;
}
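/*
 * Minimal sketch of the primary/secondary handshake used above (the names
 * are the real ones from this file):
 *
 *	primary:   mz = rte_memzone_reserve(MZ_MLX5_PMD_SHARED_DATA, ...);
 *	secondary: mz = rte_memzone_lookup(MZ_MLX5_PMD_SHARED_DATA);
 *
 * Both processes then use mz->addr as mlx5_shared_data, which is why the
 * structure layout must be identical in all processes.
 */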
/**
 * PMD global initialization.
 *
 * Independent from individual device, this function initializes global
 * per-PMD data structures distinguishing primary and secondary processes.
 * Hence, each initialization is called once per process.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
static int
mlx5_init_once(void)
{
	struct mlx5_shared_data *sd;
	struct mlx5_local_data *ld = &mlx5_local_data;
	int ret = 0;

	if (mlx5_init_shared_data())
		return -rte_errno;
	sd = mlx5_shared_data;
	MLX5_ASSERT(sd);
	rte_spinlock_lock(&sd->lock);
	switch (rte_eal_process_type()) {
	case RTE_PROC_PRIMARY:
		if (sd->init_done)
			break;
		LIST_INIT(&sd->mem_event_cb_list);
		rte_rwlock_init(&sd->mem_event_rwlock);
		rte_mem_event_callback_register("MLX5_MEM_EVENT_CB",
						mlx5_mr_mem_event_cb, NULL);
		ret = mlx5_mp_init_primary(MLX5_MP_NAME,
					   mlx5_mp_os_primary_handle);
		if (ret)
			goto out;
		sd->init_done = true;
		break;
	case RTE_PROC_SECONDARY:
		if (ld->init_done)
			break;
		ret = mlx5_mp_init_secondary(MLX5_MP_NAME,
					     mlx5_mp_os_secondary_handle);
		if (ret)
			goto out;
		++sd->secondary_cnt;
		ld->init_done = true;
		break;
	default:
		break;
	}
out:
	rte_spinlock_unlock(&sd->lock);
	return ret;
}

/**
 * Create the Tx queue DevX/Verbs object.
 *
 * @param dev
 *   Pointer to Ethernet device.
 * @param idx
 *   Queue index in DPDK Tx queue array.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
static int
mlx5_os_txq_obj_new(struct rte_eth_dev *dev, uint16_t idx)
{
	struct mlx5_priv *priv = dev->data->dev_private;
	struct mlx5_txq_data *txq_data = (*priv->txqs)[idx];
	struct mlx5_txq_ctrl *txq_ctrl =
			container_of(txq_data, struct mlx5_txq_ctrl, txq);

	if (txq_ctrl->type == MLX5_TXQ_TYPE_HAIRPIN)
		return mlx5_txq_devx_obj_new(dev, idx);
#ifdef HAVE_MLX5DV_DEVX_UAR_OFFSET
	if (!priv->config.dv_esw_en)
		return mlx5_txq_devx_obj_new(dev, idx);
#endif
	return mlx5_txq_ibv_obj_new(dev, idx);
}

/**
 * Release a Tx DevX/Verbs queue object.
 *
 * @param txq_obj
 *   DevX/Verbs Tx queue object.
 */
static void
mlx5_os_txq_obj_release(struct mlx5_txq_obj *txq_obj)
{
	if (txq_obj->txq_ctrl->type == MLX5_TXQ_TYPE_HAIRPIN) {
		mlx5_txq_devx_obj_release(txq_obj);
		return;
	}
#ifdef HAVE_MLX5DV_DEVX_UAR_OFFSET
	if (!txq_obj->txq_ctrl->priv->config.dv_esw_en) {
		mlx5_txq_devx_obj_release(txq_obj);
		return;
	}
#endif
	mlx5_txq_ibv_obj_release(txq_obj);
}
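/*
 * The release logic above deliberately mirrors mlx5_os_txq_obj_new():
 * hairpin queues are always DevX objects; regular queues are DevX objects
 * only when the UAR offset API is available (HAVE_MLX5DV_DEVX_UAR_OFFSET)
 * and E-Switch is disabled, and fall back to Verbs objects otherwise. Any
 * change on one side must be replicated on the other.
 */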
/**
 * DV flow counter mode detect and config.
 *
 * @param dev
 *   Pointer to rte_eth_dev structure.
 *
 */
static void
mlx5_flow_counter_mode_config(struct rte_eth_dev *dev __rte_unused)
{
#ifdef HAVE_IBV_FLOW_DV_SUPPORT
	struct mlx5_priv *priv = dev->data->dev_private;
	struct mlx5_dev_ctx_shared *sh = priv->sh;
	bool fallback;

#ifndef HAVE_IBV_DEVX_ASYNC
	fallback = true;
#else
	fallback = false;
	if (!priv->config.devx || !priv->config.dv_flow_en ||
	    !priv->config.hca_attr.flow_counters_dump ||
	    !(priv->config.hca_attr.flow_counter_bulk_alloc_bitmap & 0x4) ||
	    (mlx5_flow_dv_discover_counter_offset_support(dev) == -ENOTSUP))
		fallback = true;
#endif
	if (fallback)
		DRV_LOG(INFO, "Use fall-back DV counter management. Flow "
			"counter dump:%d, bulk_alloc_bitmap:0x%hhx.",
			priv->config.hca_attr.flow_counters_dump,
			priv->config.hca_attr.flow_counter_bulk_alloc_bitmap);
	/* Initialize fallback mode only on the port that initializes sh. */
	if (sh->refcnt == 1)
		sh->cmng.counter_fallback = fallback;
	else if (fallback != sh->cmng.counter_fallback)
		DRV_LOG(WARNING, "Port %d in sh has different fallback mode "
			"than others:%d.", PORT_ID(priv), fallback);
#endif
}

/**
 * Spawn an Ethernet device from Verbs information.
 *
 * @param dpdk_dev
 *   Backing DPDK device.
 * @param spawn
 *   Verbs device parameters (name, port, switch_info) to spawn.
 * @param config
 *   Device configuration parameters.
 *
 * @return
 *   A valid Ethernet device object on success, NULL otherwise and rte_errno
 *   is set. The following errors are defined:
 *
 *   EBUSY: device is not supposed to be spawned.
 *   EEXIST: device is already spawned.
 */
static struct rte_eth_dev *
mlx5_dev_spawn(struct rte_device *dpdk_dev,
	       struct mlx5_dev_spawn_data *spawn,
	       struct mlx5_dev_config *config)
{
	const struct mlx5_switch_info *switch_info = &spawn->info;
	struct mlx5_dev_ctx_shared *sh = NULL;
	struct ibv_port_attr port_attr;
	struct mlx5dv_context dv_attr = { .comp_mask = 0 };
	struct rte_eth_dev *eth_dev = NULL;
	struct mlx5_priv *priv = NULL;
	int err = 0;
	unsigned int hw_padding = 0;
	unsigned int mps;
	unsigned int cqe_comp;
	unsigned int cqe_pad = 0;
	unsigned int tunnel_en = 0;
	unsigned int mpls_en = 0;
	unsigned int swp = 0;
	unsigned int mprq = 0;
	unsigned int mprq_min_stride_size_n = 0;
	unsigned int mprq_max_stride_size_n = 0;
	unsigned int mprq_min_stride_num_n = 0;
	unsigned int mprq_max_stride_num_n = 0;
	struct rte_ether_addr mac;
	char name[RTE_ETH_NAME_MAX_LEN];
	int own_domain_id = 0;
	uint16_t port_id;
	unsigned int i;
#ifdef HAVE_MLX5DV_DR_DEVX_PORT
	struct mlx5dv_devx_port devx_port = { .comp_mask = 0 };
#endif

	/* Determine if this port representor is supposed to be spawned. */
	if (switch_info->representor && dpdk_dev->devargs) {
		struct rte_eth_devargs eth_da;

		err = rte_eth_devargs_parse(dpdk_dev->devargs->args, &eth_da);
		if (err) {
			rte_errno = -err;
			DRV_LOG(ERR, "failed to process device arguments: %s",
				strerror(rte_errno));
			return NULL;
		}
		for (i = 0; i < eth_da.nb_representor_ports; ++i)
			if (eth_da.representor_ports[i] ==
			    (uint16_t)switch_info->port_name)
				break;
		if (i == eth_da.nb_representor_ports) {
			rte_errno = EBUSY;
			return NULL;
		}
	}
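	/*
	 * Example (illustrative): with devargs "representor=[0-2]" only
	 * the representor ports whose port_name is 0, 1 or 2 pass the
	 * check above; any other representor is skipped with EBUSY.
	 */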
	/* Build device name. */
	if (spawn->pf_bond < 0) {
		/* Single device. */
		if (!switch_info->representor)
			strlcpy(name, dpdk_dev->name, sizeof(name));
		else
			snprintf(name, sizeof(name), "%s_representor_%u",
				 dpdk_dev->name, switch_info->port_name);
	} else {
		/* Bonding device. */
		if (!switch_info->representor)
			snprintf(name, sizeof(name), "%s_%s",
				 dpdk_dev->name,
				 mlx5_os_get_dev_device_name(spawn->phys_dev));
		else
			snprintf(name, sizeof(name), "%s_%s_representor_%u",
				 dpdk_dev->name,
				 mlx5_os_get_dev_device_name(spawn->phys_dev),
				 switch_info->port_name);
	}
	/* Check if the device is already spawned. */
	if (rte_eth_dev_get_port_by_name(name, &port_id) == 0) {
		rte_errno = EEXIST;
		return NULL;
	}
	DRV_LOG(DEBUG, "naming Ethernet device \"%s\"", name);
	if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
		struct mlx5_mp_id mp_id;

		eth_dev = rte_eth_dev_attach_secondary(name);
		if (eth_dev == NULL) {
			DRV_LOG(ERR, "can not attach rte ethdev");
			rte_errno = ENOMEM;
			return NULL;
		}
		eth_dev->device = dpdk_dev;
		eth_dev->dev_ops = &mlx5_os_dev_sec_ops;
		eth_dev->rx_descriptor_status = mlx5_rx_descriptor_status;
		eth_dev->tx_descriptor_status = mlx5_tx_descriptor_status;
		err = mlx5_proc_priv_init(eth_dev);
		if (err)
			return NULL;
		mp_id.port_id = eth_dev->data->port_id;
		strlcpy(mp_id.name, MLX5_MP_NAME, RTE_MP_MAX_NAME_LEN);
		/* Receive command fd from primary process. */
		err = mlx5_mp_req_verbs_cmd_fd(&mp_id);
		if (err < 0)
			goto err_secondary;
		/* Remap UAR for Tx queues. */
		err = mlx5_tx_uar_init_secondary(eth_dev, err);
		if (err)
			goto err_secondary;
		/*
		 * Ethdev pointer is still required as input since
		 * the primary device is not accessible from the
		 * secondary process.
		 */
		eth_dev->rx_pkt_burst = mlx5_select_rx_function(eth_dev);
		eth_dev->tx_pkt_burst = mlx5_select_tx_function(eth_dev);
		return eth_dev;
err_secondary:
		mlx5_dev_close(eth_dev);
		return NULL;
	}
	/*
	 * Some parameters ("tx_db_nc" in particular) are needed in
	 * advance to create dv/verbs device context. We process the
	 * devargs here to get them, and later process devargs again
	 * to override some hardware settings.
	 */
	err = mlx5_args(config, dpdk_dev->devargs);
	if (err) {
		err = rte_errno;
		DRV_LOG(ERR, "failed to process device arguments: %s",
			strerror(rte_errno));
		goto error;
	}
	if (config->dv_miss_info) {
		if (switch_info->master || switch_info->representor)
			config->dv_xmeta_en = MLX5_XMETA_MODE_META16;
	}
	mlx5_malloc_mem_select(config->sys_mem_en);
	sh = mlx5_alloc_shared_dev_ctx(spawn, config);
	if (!sh)
		return NULL;
	config->devx = sh->devx;
#ifdef HAVE_MLX5DV_DR_ACTION_DEST_DEVX_TIR
	config->dest_tir = 1;
#endif
#ifdef HAVE_IBV_MLX5_MOD_SWP
	dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_SWP;
#endif
	/*
	 * Multi-packet send is supported by ConnectX-4 Lx PF as well
	 * as all ConnectX-5 devices.
	 */
#ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT
	dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS;
#endif
#ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT
	dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_STRIDING_RQ;
#endif
	mlx5_glue->dv_query_device(sh->ctx, &dv_attr);
	if (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED) {
		if (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW) {
			DRV_LOG(DEBUG, "enhanced MPW is supported");
			mps = MLX5_MPW_ENHANCED;
		} else {
			DRV_LOG(DEBUG, "MPW is supported");
			mps = MLX5_MPW;
		}
	} else {
		DRV_LOG(DEBUG, "MPW isn't supported");
		mps = MLX5_MPW_DISABLED;
	}
#ifdef HAVE_IBV_MLX5_MOD_SWP
	if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_SWP)
		swp = dv_attr.sw_parsing_caps.sw_parsing_offloads;
	DRV_LOG(DEBUG, "SWP support: %u", swp);
#endif
	config->swp = !!swp;
#ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT
	if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_STRIDING_RQ) {
		struct mlx5dv_striding_rq_caps mprq_caps =
			dv_attr.striding_rq_caps;

		DRV_LOG(DEBUG, "\tmin_single_stride_log_num_of_bytes: %d",
			mprq_caps.min_single_stride_log_num_of_bytes);
		DRV_LOG(DEBUG, "\tmax_single_stride_log_num_of_bytes: %d",
			mprq_caps.max_single_stride_log_num_of_bytes);
		DRV_LOG(DEBUG, "\tmin_single_wqe_log_num_of_strides: %d",
			mprq_caps.min_single_wqe_log_num_of_strides);
		DRV_LOG(DEBUG, "\tmax_single_wqe_log_num_of_strides: %d",
			mprq_caps.max_single_wqe_log_num_of_strides);
		DRV_LOG(DEBUG, "\tsupported_qpts: %d",
			mprq_caps.supported_qpts);
		DRV_LOG(DEBUG, "device supports Multi-Packet RQ");
		mprq = 1;
		mprq_min_stride_size_n =
			mprq_caps.min_single_stride_log_num_of_bytes;
		mprq_max_stride_size_n =
			mprq_caps.max_single_stride_log_num_of_bytes;
		mprq_min_stride_num_n =
			mprq_caps.min_single_wqe_log_num_of_strides;
		mprq_max_stride_num_n =
			mprq_caps.max_single_wqe_log_num_of_strides;
	}
#endif
	if (RTE_CACHE_LINE_SIZE == 128 &&
	    !(dv_attr.flags & MLX5DV_CONTEXT_FLAGS_CQE_128B_COMP))
		cqe_comp = 0;
	else
		cqe_comp = 1;
	config->cqe_comp = cqe_comp;
#ifdef HAVE_IBV_MLX5_MOD_CQE_128B_PAD
	/* Whether device supports 128B Rx CQE padding. */
	cqe_pad = RTE_CACHE_LINE_SIZE == 128 &&
		  (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_CQE_128B_PAD);
#endif
#ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT
	if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS) {
		tunnel_en = ((dv_attr.tunnel_offloads_caps &
			      MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_VXLAN) &&
			     (dv_attr.tunnel_offloads_caps &
			      MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_GRE) &&
			     (dv_attr.tunnel_offloads_caps &
			      MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_GENEVE));
	}
	DRV_LOG(DEBUG, "tunnel offloading is %ssupported",
		tunnel_en ? "" : "not ");
#else
	DRV_LOG(WARNING,
		"tunnel offloading disabled due to old OFED/rdma-core version");
#endif
	config->tunnel_en = tunnel_en;
#ifdef HAVE_IBV_DEVICE_MPLS_SUPPORT
	mpls_en = ((dv_attr.tunnel_offloads_caps &
		    MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_CW_MPLS_OVER_GRE) &&
		   (dv_attr.tunnel_offloads_caps &
		    MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_CW_MPLS_OVER_UDP));
	DRV_LOG(DEBUG, "MPLS over GRE/UDP tunnel offloading is %ssupported",
		mpls_en ? "" : "not ");
"" : "not "); 888 #else 889 DRV_LOG(WARNING, "MPLS over GRE/UDP tunnel offloading disabled due to" 890 " old OFED/rdma-core version or firmware configuration"); 891 #endif 892 config->mpls_en = mpls_en; 893 /* Check port status. */ 894 err = mlx5_glue->query_port(sh->ctx, spawn->phys_port, &port_attr); 895 if (err) { 896 DRV_LOG(ERR, "port query failed: %s", strerror(err)); 897 goto error; 898 } 899 if (port_attr.link_layer != IBV_LINK_LAYER_ETHERNET) { 900 DRV_LOG(ERR, "port is not configured in Ethernet mode"); 901 err = EINVAL; 902 goto error; 903 } 904 if (port_attr.state != IBV_PORT_ACTIVE) 905 DRV_LOG(DEBUG, "port is not active: \"%s\" (%d)", 906 mlx5_glue->port_state_str(port_attr.state), 907 port_attr.state); 908 /* Allocate private eth device data. */ 909 priv = mlx5_malloc(MLX5_MEM_ZERO | MLX5_MEM_RTE, 910 sizeof(*priv), 911 RTE_CACHE_LINE_SIZE, SOCKET_ID_ANY); 912 if (priv == NULL) { 913 DRV_LOG(ERR, "priv allocation failure"); 914 err = ENOMEM; 915 goto error; 916 } 917 priv->sh = sh; 918 priv->dev_port = spawn->phys_port; 919 priv->pci_dev = spawn->pci_dev; 920 priv->mtu = RTE_ETHER_MTU; 921 priv->mp_id.port_id = port_id; 922 strlcpy(priv->mp_id.name, MLX5_MP_NAME, RTE_MP_MAX_NAME_LEN); 923 /* Some internal functions rely on Netlink sockets, open them now. */ 924 priv->nl_socket_rdma = mlx5_nl_init(NETLINK_RDMA); 925 priv->nl_socket_route = mlx5_nl_init(NETLINK_ROUTE); 926 priv->representor = !!switch_info->representor; 927 priv->master = !!switch_info->master; 928 priv->domain_id = RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID; 929 priv->vport_meta_tag = 0; 930 priv->vport_meta_mask = 0; 931 priv->pf_bond = spawn->pf_bond; 932 #ifdef HAVE_MLX5DV_DR_DEVX_PORT 933 /* 934 * The DevX port query API is implemented. E-Switch may use 935 * either vport or reg_c[0] metadata register to match on 936 * vport index. The engaged part of metadata register is 937 * defined by mask. 938 */ 939 if (switch_info->representor || switch_info->master) { 940 devx_port.comp_mask = MLX5DV_DEVX_PORT_VPORT | 941 MLX5DV_DEVX_PORT_MATCH_REG_C_0; 942 err = mlx5_glue->devx_port_query(sh->ctx, spawn->phys_port, 943 &devx_port); 944 if (err) { 945 DRV_LOG(WARNING, 946 "can't query devx port %d on device %s", 947 spawn->phys_port, 948 mlx5_os_get_dev_device_name(spawn->phys_dev)); 949 devx_port.comp_mask = 0; 950 } 951 } 952 if (devx_port.comp_mask & MLX5DV_DEVX_PORT_MATCH_REG_C_0) { 953 priv->vport_meta_tag = devx_port.reg_c_0.value; 954 priv->vport_meta_mask = devx_port.reg_c_0.mask; 955 if (!priv->vport_meta_mask) { 956 DRV_LOG(ERR, "vport zero mask for port %d" 957 " on bonding device %s", 958 spawn->phys_port, 959 mlx5_os_get_dev_device_name 960 (spawn->phys_dev)); 961 err = ENOTSUP; 962 goto error; 963 } 964 if (priv->vport_meta_tag & ~priv->vport_meta_mask) { 965 DRV_LOG(ERR, "invalid vport tag for port %d" 966 " on bonding device %s", 967 spawn->phys_port, 968 mlx5_os_get_dev_device_name 969 (spawn->phys_dev)); 970 err = ENOTSUP; 971 goto error; 972 } 973 } 974 if (devx_port.comp_mask & MLX5DV_DEVX_PORT_VPORT) { 975 priv->vport_id = devx_port.vport_num; 976 } else if (spawn->pf_bond >= 0) { 977 DRV_LOG(ERR, "can't deduce vport index for port %d" 978 " on bonding device %s", 979 spawn->phys_port, 980 mlx5_os_get_dev_device_name(spawn->phys_dev)); 981 err = ENOTSUP; 982 goto error; 983 } else { 984 /* Suppose vport index in compatible way. */ 985 priv->vport_id = switch_info->representor ? 
	}
#else
	/*
	 * Kernel/rdma_core supports single E-Switch per PF configurations
	 * only and the vport_id field contains the vport index for the
	 * associated VF, which is deduced from representor port name.
	 * For example, let's have the IB device port 10, it has
	 * attached network device eth0, which has port name attribute
	 * pf0vf2, we can deduce the VF number as 2, and set vport index
	 * as 3 (2+1). This assignment schema should be changed if the
	 * multiple E-Switch instances per PF configurations or/and PCI
	 * subfunctions are added.
	 */
	priv->vport_id = switch_info->representor ?
			 switch_info->port_name + 1 : -1;
#endif
	/* representor_id field keeps the unmodified VF index. */
	priv->representor_id = switch_info->representor ?
			       switch_info->port_name : -1;
	/*
	 * Look for sibling devices in order to reuse their switch domain
	 * if any, otherwise allocate one.
	 */
	MLX5_ETH_FOREACH_DEV(port_id, priv->pci_dev) {
		const struct mlx5_priv *opriv =
			rte_eth_devices[port_id].data->dev_private;

		if (!opriv ||
		    opriv->sh != priv->sh ||
		    opriv->domain_id ==
		    RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID)
			continue;
		priv->domain_id = opriv->domain_id;
		break;
	}
	if (priv->domain_id == RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID) {
		err = rte_eth_switch_domain_alloc(&priv->domain_id);
		if (err) {
			err = rte_errno;
			DRV_LOG(ERR, "unable to allocate switch domain: %s",
				strerror(rte_errno));
			goto error;
		}
		own_domain_id = 1;
	}
	/* Override some values set by hardware configuration. */
	mlx5_args(config, dpdk_dev->devargs);
	err = mlx5_dev_check_sibling_config(priv, config);
	if (err)
		goto error;
	config->hw_csum = !!(sh->device_attr.device_cap_flags_ex &
			    IBV_DEVICE_RAW_IP_CSUM);
	DRV_LOG(DEBUG, "checksum offloading is %ssupported",
		(config->hw_csum ? "" : "not "));
#if !defined(HAVE_IBV_DEVICE_COUNTERS_SET_V42) && \
	!defined(HAVE_IBV_DEVICE_COUNTERS_SET_V45)
	DRV_LOG(DEBUG, "counters are not supported");
#endif
#if !defined(HAVE_IBV_FLOW_DV_SUPPORT) || !defined(HAVE_MLX5DV_DR)
	if (config->dv_flow_en) {
		DRV_LOG(WARNING, "DV flow is not supported");
		config->dv_flow_en = 0;
	}
#endif
	config->ind_table_max_size =
		sh->device_attr.max_rwq_indirection_table_size;
	/*
	 * Remove this check once DPDK supports larger/variable
	 * indirection tables.
	 */
	if (config->ind_table_max_size > (unsigned int)ETH_RSS_RETA_SIZE_512)
		config->ind_table_max_size = ETH_RSS_RETA_SIZE_512;
	DRV_LOG(DEBUG, "maximum Rx indirection table size is %u",
		config->ind_table_max_size);
	config->hw_vlan_strip = !!(sh->device_attr.raw_packet_caps &
				  IBV_RAW_PACKET_CAP_CVLAN_STRIPPING);
	DRV_LOG(DEBUG, "VLAN stripping is %ssupported",
		(config->hw_vlan_strip ? "" : "not "));
"" : "not ")); 1065 config->hw_fcs_strip = !!(sh->device_attr.raw_packet_caps & 1066 IBV_RAW_PACKET_CAP_SCATTER_FCS); 1067 #if defined(HAVE_IBV_WQ_FLAG_RX_END_PADDING) 1068 hw_padding = !!sh->device_attr.rx_pad_end_addr_align; 1069 #elif defined(HAVE_IBV_WQ_FLAGS_PCI_WRITE_END_PADDING) 1070 hw_padding = !!(sh->device_attr.device_cap_flags_ex & 1071 IBV_DEVICE_PCI_WRITE_END_PADDING); 1072 #endif 1073 if (config->hw_padding && !hw_padding) { 1074 DRV_LOG(DEBUG, "Rx end alignment padding isn't supported"); 1075 config->hw_padding = 0; 1076 } else if (config->hw_padding) { 1077 DRV_LOG(DEBUG, "Rx end alignment padding is enabled"); 1078 } 1079 config->tso = (sh->device_attr.max_tso > 0 && 1080 (sh->device_attr.tso_supported_qpts & 1081 (1 << IBV_QPT_RAW_PACKET))); 1082 if (config->tso) 1083 config->tso_max_payload_sz = sh->device_attr.max_tso; 1084 /* 1085 * MPW is disabled by default, while the Enhanced MPW is enabled 1086 * by default. 1087 */ 1088 if (config->mps == MLX5_ARG_UNSET) 1089 config->mps = (mps == MLX5_MPW_ENHANCED) ? MLX5_MPW_ENHANCED : 1090 MLX5_MPW_DISABLED; 1091 else 1092 config->mps = config->mps ? mps : MLX5_MPW_DISABLED; 1093 DRV_LOG(INFO, "%sMPS is %s", 1094 config->mps == MLX5_MPW_ENHANCED ? "enhanced " : 1095 config->mps == MLX5_MPW ? "legacy " : "", 1096 config->mps != MLX5_MPW_DISABLED ? "enabled" : "disabled"); 1097 if (config->cqe_comp && !cqe_comp) { 1098 DRV_LOG(WARNING, "Rx CQE compression isn't supported"); 1099 config->cqe_comp = 0; 1100 } 1101 if (config->cqe_pad && !cqe_pad) { 1102 DRV_LOG(WARNING, "Rx CQE padding isn't supported"); 1103 config->cqe_pad = 0; 1104 } else if (config->cqe_pad) { 1105 DRV_LOG(INFO, "Rx CQE padding is enabled"); 1106 } 1107 if (config->devx) { 1108 err = mlx5_devx_cmd_query_hca_attr(sh->ctx, &config->hca_attr); 1109 if (err) { 1110 err = -err; 1111 goto error; 1112 } 1113 /* Check relax ordering support. */ 1114 if (config->hca_attr.relaxed_ordering_write && 1115 config->hca_attr.relaxed_ordering_read && 1116 !haswell_broadwell_cpu) 1117 sh->cmng.relaxed_ordering = 1; 1118 /* Check for LRO support. */ 1119 if (config->dest_tir && config->hca_attr.lro_cap && 1120 config->dv_flow_en) { 1121 /* TBD check tunnel lro caps. */ 1122 config->lro.supported = config->hca_attr.lro_cap; 1123 DRV_LOG(DEBUG, "Device supports LRO"); 1124 /* 1125 * If LRO timeout is not configured by application, 1126 * use the minimal supported value. 1127 */ 1128 if (!config->lro.timeout) 1129 config->lro.timeout = 1130 config->hca_attr.lro_timer_supported_periods[0]; 1131 DRV_LOG(DEBUG, "LRO session timeout set to %d usec", 1132 config->lro.timeout); 1133 DRV_LOG(DEBUG, "LRO minimal size of TCP segment " 1134 "required for coalescing is %d bytes", 1135 config->hca_attr.lro_min_mss_size); 1136 } 1137 #if defined(HAVE_MLX5DV_DR) && defined(HAVE_MLX5_DR_CREATE_ACTION_FLOW_METER) 1138 if (config->hca_attr.qos.sup && 1139 config->hca_attr.qos.srtcm_sup && 1140 config->dv_flow_en) { 1141 uint8_t reg_c_mask = 1142 config->hca_attr.qos.flow_meter_reg_c_ids; 1143 /* 1144 * Meter needs two REG_C's for color match and pre-sfx 1145 * flow match. Here get the REG_C for color match. 1146 * REG_C_0 and REG_C_1 is reserved for metadata feature. 
			reg_c_mask &= 0xfc;
			if (__builtin_popcount(reg_c_mask) < 1) {
				priv->mtr_en = 0;
				DRV_LOG(WARNING, "No available register for"
					" meter.");
			} else {
				priv->mtr_color_reg = ffs(reg_c_mask) - 1 +
						      REG_C_0;
				priv->mtr_en = 1;
				priv->mtr_reg_share =
				      config->hca_attr.qos.flow_meter_reg_share;
				DRV_LOG(DEBUG, "The REG_C used by the meter"
					" is %d", priv->mtr_color_reg);
			}
		}
#endif
#if defined(HAVE_MLX5DV_DR) && defined(HAVE_MLX5_DR_CREATE_ACTION_FLOW_SAMPLE)
		if (config->hca_attr.log_max_ft_sampler_num > 0 &&
		    config->dv_flow_en) {
			priv->sampler_en = 1;
			DRV_LOG(DEBUG, "Sampler is enabled.");
		} else {
			priv->sampler_en = 0;
			if (!config->hca_attr.log_max_ft_sampler_num)
				DRV_LOG(WARNING, "No available register for"
					" sampler.");
			else
				DRV_LOG(DEBUG, "DV flow is not supported.");
		}
#endif
	}
	if (config->tx_pp) {
		DRV_LOG(DEBUG, "Timestamp counter frequency %u kHz",
			config->hca_attr.dev_freq_khz);
		DRV_LOG(DEBUG, "Packet pacing is %ssupported",
			config->hca_attr.qos.packet_pacing ? "" : "not ");
		DRV_LOG(DEBUG, "Cross channel ops are %ssupported",
			config->hca_attr.cross_channel ? "" : "not ");
		DRV_LOG(DEBUG, "WQE index ignore is %ssupported",
			config->hca_attr.wqe_index_ignore ? "" : "not ");
		DRV_LOG(DEBUG, "Non-wire SQ feature is %ssupported",
			config->hca_attr.non_wire_sq ? "" : "not ");
		DRV_LOG(DEBUG, "Static WQE SQ feature is %ssupported (%d)",
			config->hca_attr.log_max_static_sq_wq ? "" : "not ",
			config->hca_attr.log_max_static_sq_wq);
		DRV_LOG(DEBUG, "WQE rate PP mode is %ssupported",
			config->hca_attr.qos.wqe_rate_pp ? "" : "not ");
		if (!config->devx) {
			DRV_LOG(ERR, "DevX is required for packet pacing");
			err = ENODEV;
			goto error;
		}
		if (!config->hca_attr.qos.packet_pacing) {
			DRV_LOG(ERR, "Packet pacing is not supported");
			err = ENODEV;
			goto error;
		}
		if (!config->hca_attr.cross_channel) {
			DRV_LOG(ERR, "Cross channel operations are"
				" required for packet pacing");
			err = ENODEV;
			goto error;
		}
		if (!config->hca_attr.wqe_index_ignore) {
			DRV_LOG(ERR, "WQE index ignore feature is"
				" required for packet pacing");
			err = ENODEV;
			goto error;
		}
		if (!config->hca_attr.non_wire_sq) {
			DRV_LOG(ERR, "Non-wire SQ feature is"
				" required for packet pacing");
			err = ENODEV;
			goto error;
		}
		if (!config->hca_attr.log_max_static_sq_wq) {
			DRV_LOG(ERR, "Static WQE SQ feature is"
				" required for packet pacing");
			err = ENODEV;
			goto error;
		}
		if (!config->hca_attr.qos.wqe_rate_pp) {
			DRV_LOG(ERR, "WQE rate mode is required"
				" for packet pacing");
			err = ENODEV;
			goto error;
		}
#ifndef HAVE_MLX5DV_DEVX_UAR_OFFSET
		DRV_LOG(ERR, "DevX does not provide UAR offset,"
			" can't create queues for packet pacing");
		err = ENODEV;
		goto error;
#endif
	}
	if (config->devx) {
		uint32_t reg[MLX5_ST_SZ_DW(register_mtutc)];

		err = config->hca_attr.access_register_user ?
			mlx5_devx_cmd_register_read
				(sh->ctx, MLX5_REGISTER_ID_MTUTC, 0,
				reg, MLX5_ST_SZ_DW(register_mtutc)) : ENOTSUP;
		if (!err) {
			uint32_t ts_mode;

			/* MTUTC register is read successfully. */
			ts_mode = MLX5_GET(register_mtutc, reg,
					   time_stamp_mode);
			if (ts_mode == MLX5_MTUTC_TIMESTAMP_MODE_REAL_TIME)
				config->rt_timestamp = 1;
		} else {
			/* Kernel does not support register reading. */
			if (config->hca_attr.dev_freq_khz ==
						(NS_PER_S / MS_PER_S))
				config->rt_timestamp = 1;
		}
	}
	/*
	 * If HW has a bug with tunnel packet decapsulation combined with
	 * scatter FCS, and decapsulation is needed, clear the hw_fcs_strip
	 * bit. Then DEV_RX_OFFLOAD_KEEP_CRC bit will not be set anymore.
	 */
	if (config->hca_attr.scatter_fcs_w_decap_disable && config->decap_en)
		config->hw_fcs_strip = 0;
	DRV_LOG(DEBUG, "FCS stripping configuration is %ssupported",
		(config->hw_fcs_strip ? "" : "not "));
	if (config->mprq.enabled && mprq) {
		if (config->mprq.stride_num_n &&
		    (config->mprq.stride_num_n > mprq_max_stride_num_n ||
		     config->mprq.stride_num_n < mprq_min_stride_num_n)) {
			config->mprq.stride_num_n =
				RTE_MIN(RTE_MAX(MLX5_MPRQ_STRIDE_NUM_N,
						mprq_min_stride_num_n),
					mprq_max_stride_num_n);
			DRV_LOG(WARNING,
				"the number of strides"
				" for Multi-Packet RQ is out of range,"
				" setting default value (%u)",
				1 << config->mprq.stride_num_n);
		}
		if (config->mprq.stride_size_n &&
		    (config->mprq.stride_size_n > mprq_max_stride_size_n ||
		     config->mprq.stride_size_n < mprq_min_stride_size_n)) {
			config->mprq.stride_size_n =
				RTE_MIN(RTE_MAX(MLX5_MPRQ_STRIDE_SIZE_N,
						mprq_min_stride_size_n),
					mprq_max_stride_size_n);
			DRV_LOG(WARNING,
				"the size of a stride"
				" for Multi-Packet RQ is out of range,"
				" setting default value (%u)",
				1 << config->mprq.stride_size_n);
		}
		config->mprq.min_stride_size_n = mprq_min_stride_size_n;
		config->mprq.max_stride_size_n = mprq_max_stride_size_n;
	} else if (config->mprq.enabled && !mprq) {
		DRV_LOG(WARNING, "Multi-Packet RQ isn't supported");
		config->mprq.enabled = 0;
	}
	if (config->max_dump_files_num == 0)
		config->max_dump_files_num = 128;
	eth_dev = rte_eth_dev_allocate(name);
	if (eth_dev == NULL) {
		DRV_LOG(ERR, "can not allocate rte ethdev");
		err = ENOMEM;
		goto error;
	}
	if (priv->representor) {
		eth_dev->data->dev_flags |= RTE_ETH_DEV_REPRESENTOR;
		eth_dev->data->representor_id = priv->representor_id;
	}
	/*
	 * Store associated network device interface index. This index
	 * is permanent throughout the lifetime of device. So, we may store
	 * the ifindex here and use the cached value further.
	 */
	MLX5_ASSERT(spawn->ifindex);
	priv->if_index = spawn->ifindex;
	if (priv->pf_bond >= 0 && priv->master) {
		/* Get bond interface info. */
		err = mlx5_sysfs_bond_info(priv->if_index,
					   &priv->bond_ifindex,
					   priv->bond_name);
		if (err)
			DRV_LOG(ERR, "unable to get bond info: %s",
				strerror(rte_errno));
		else
			DRV_LOG(INFO, "PF device %u, bond device %u(%s)",
				priv->if_index, priv->bond_ifindex,
				priv->bond_name);
	}
	eth_dev->data->dev_private = priv;
	priv->dev_data = eth_dev->data;
	eth_dev->data->mac_addrs = priv->mac;
	eth_dev->device = dpdk_dev;
	eth_dev->data->dev_flags |= RTE_ETH_DEV_AUTOFILL_QUEUE_XSTATS;
	/* Configure the first MAC address by default. */
	if (mlx5_get_mac(eth_dev, &mac.addr_bytes)) {
		DRV_LOG(ERR,
			"port %u cannot get MAC address, is mlx5_en"
			" loaded? (errno: %s)",
			eth_dev->data->port_id, strerror(rte_errno));
(errno: %s)", 1348 eth_dev->data->port_id, strerror(rte_errno)); 1349 err = ENODEV; 1350 goto error; 1351 } 1352 DRV_LOG(INFO, 1353 "port %u MAC address is %02x:%02x:%02x:%02x:%02x:%02x", 1354 eth_dev->data->port_id, 1355 mac.addr_bytes[0], mac.addr_bytes[1], 1356 mac.addr_bytes[2], mac.addr_bytes[3], 1357 mac.addr_bytes[4], mac.addr_bytes[5]); 1358 #ifdef RTE_LIBRTE_MLX5_DEBUG 1359 { 1360 char ifname[IF_NAMESIZE]; 1361 1362 if (mlx5_get_ifname(eth_dev, &ifname) == 0) 1363 DRV_LOG(DEBUG, "port %u ifname is \"%s\"", 1364 eth_dev->data->port_id, ifname); 1365 else 1366 DRV_LOG(DEBUG, "port %u ifname is unknown", 1367 eth_dev->data->port_id); 1368 } 1369 #endif 1370 /* Get actual MTU if possible. */ 1371 err = mlx5_get_mtu(eth_dev, &priv->mtu); 1372 if (err) { 1373 err = rte_errno; 1374 goto error; 1375 } 1376 DRV_LOG(DEBUG, "port %u MTU is %u", eth_dev->data->port_id, 1377 priv->mtu); 1378 /* Initialize burst functions to prevent crashes before link-up. */ 1379 eth_dev->rx_pkt_burst = removed_rx_burst; 1380 eth_dev->tx_pkt_burst = removed_tx_burst; 1381 eth_dev->dev_ops = &mlx5_os_dev_ops; 1382 eth_dev->rx_descriptor_status = mlx5_rx_descriptor_status; 1383 eth_dev->tx_descriptor_status = mlx5_tx_descriptor_status; 1384 eth_dev->rx_queue_count = mlx5_rx_queue_count; 1385 /* Register MAC address. */ 1386 claim_zero(mlx5_mac_addr_add(eth_dev, &mac, 0, 0)); 1387 if (config->vf && config->vf_nl_en) 1388 mlx5_nl_mac_addr_sync(priv->nl_socket_route, 1389 mlx5_ifindex(eth_dev), 1390 eth_dev->data->mac_addrs, 1391 MLX5_MAX_MAC_ADDRESSES); 1392 priv->flows = 0; 1393 priv->ctrl_flows = 0; 1394 rte_spinlock_init(&priv->flow_list_lock); 1395 TAILQ_INIT(&priv->flow_meters); 1396 TAILQ_INIT(&priv->flow_meter_profiles); 1397 /* Hint libmlx5 to use PMD allocator for data plane resources */ 1398 mlx5_glue->dv_set_context_attr(sh->ctx, 1399 MLX5DV_CTX_ATTR_BUF_ALLOCATORS, 1400 (void *)((uintptr_t)&(struct mlx5dv_ctx_allocators){ 1401 .alloc = &mlx5_alloc_verbs_buf, 1402 .free = &mlx5_free_verbs_buf, 1403 .data = priv, 1404 })); 1405 /* Bring Ethernet device up. */ 1406 DRV_LOG(DEBUG, "port %u forcing Ethernet interface up", 1407 eth_dev->data->port_id); 1408 mlx5_set_link_up(eth_dev); 1409 /* 1410 * Even though the interrupt handler is not installed yet, 1411 * interrupts will still trigger on the async_fd from 1412 * Verbs context returned by ibv_open_device(). 1413 */ 1414 mlx5_link_update(eth_dev, 0); 1415 #ifdef HAVE_MLX5DV_DR_ESWITCH 1416 if (!(config->hca_attr.eswitch_manager && config->dv_flow_en && 1417 (switch_info->representor || switch_info->master))) 1418 config->dv_esw_en = 0; 1419 #else 1420 config->dv_esw_en = 0; 1421 #endif 1422 /* Detect minimal data bytes to inline. */ 1423 mlx5_set_min_inline(spawn, config); 1424 /* Store device configuration on private structure. */ 1425 priv->config = *config; 1426 /* Create context for virtual machine VLAN workaround. 
	priv->vmwa_context = mlx5_vlan_vmwa_init(eth_dev, spawn->ifindex);
	if (config->dv_flow_en) {
		err = mlx5_alloc_shared_dr(priv);
		if (err)
			goto error;
	}
	if (config->devx && config->dv_flow_en && config->dest_tir) {
		priv->obj_ops = devx_obj_ops;
		priv->obj_ops.drop_action_create =
						ibv_obj_ops.drop_action_create;
		priv->obj_ops.drop_action_destroy =
					       ibv_obj_ops.drop_action_destroy;
#ifndef HAVE_MLX5DV_DEVX_UAR_OFFSET
		priv->obj_ops.txq_obj_modify = ibv_obj_ops.txq_obj_modify;
#else
		if (config->dv_esw_en)
			priv->obj_ops.txq_obj_modify =
						ibv_obj_ops.txq_obj_modify;
#endif
		/* Use specific wrappers for Tx object. */
		priv->obj_ops.txq_obj_new = mlx5_os_txq_obj_new;
		priv->obj_ops.txq_obj_release = mlx5_os_txq_obj_release;
	} else {
		priv->obj_ops = ibv_obj_ops;
	}
	priv->drop_queue.hrxq = mlx5_drop_action_create(eth_dev);
	if (!priv->drop_queue.hrxq)
		goto error;
	/* Supported Verbs flow priority number detection. */
	err = mlx5_flow_discover_priorities(eth_dev);
	if (err < 0) {
		err = -err;
		goto error;
	}
	priv->config.flow_prio = err;
	if (!priv->config.dv_esw_en &&
	    priv->config.dv_xmeta_en != MLX5_XMETA_MODE_LEGACY) {
		DRV_LOG(WARNING, "metadata mode %u is not supported "
				 "(no E-Switch)", priv->config.dv_xmeta_en);
		priv->config.dv_xmeta_en = MLX5_XMETA_MODE_LEGACY;
	}
	mlx5_set_metadata_mask(eth_dev);
	if (priv->config.dv_xmeta_en != MLX5_XMETA_MODE_LEGACY &&
	    !priv->sh->dv_regc0_mask) {
		DRV_LOG(ERR, "metadata mode %u is not supported "
			     "(no metadata reg_c[0] is available)",
			     priv->config.dv_xmeta_en);
		err = ENOTSUP;
		goto error;
	}
	mlx5_cache_list_init(&priv->hrxqs, "hrxq", 0, eth_dev,
			     mlx5_hrxq_create_cb,
			     mlx5_hrxq_match_cb,
			     mlx5_hrxq_remove_cb);
	/* Query availability of metadata reg_c's. */
	err = mlx5_flow_discover_mreg_c(eth_dev);
	if (err < 0) {
		err = -err;
		goto error;
	}
	if (!mlx5_flow_ext_mreg_supported(eth_dev)) {
		DRV_LOG(DEBUG,
			"port %u extensive metadata register is not supported",
			eth_dev->data->port_id);
		if (priv->config.dv_xmeta_en != MLX5_XMETA_MODE_LEGACY) {
			DRV_LOG(ERR, "metadata mode %u is not supported "
				     "(no metadata registers available)",
				     priv->config.dv_xmeta_en);
			err = ENOTSUP;
			goto error;
		}
	}
	if (priv->config.dv_flow_en &&
	    priv->config.dv_xmeta_en != MLX5_XMETA_MODE_LEGACY &&
	    mlx5_flow_ext_mreg_supported(eth_dev) &&
	    priv->sh->dv_regc0_mask) {
		priv->mreg_cp_tbl = mlx5_hlist_create(MLX5_FLOW_MREG_HNAME,
						      MLX5_FLOW_MREG_HTABLE_SZ,
						      0, 0,
						      flow_dv_mreg_create_cb,
						      NULL,
						      flow_dv_mreg_remove_cb);
		if (!priv->mreg_cp_tbl) {
			err = ENOMEM;
			goto error;
		}
		priv->mreg_cp_tbl->ctx = eth_dev;
	}
	mlx5_flow_counter_mode_config(eth_dev);
	return eth_dev;
error:
	if (priv) {
		if (priv->mreg_cp_tbl)
			mlx5_hlist_destroy(priv->mreg_cp_tbl);
		if (priv->sh)
			mlx5_os_free_shared_dr(priv);
		if (priv->nl_socket_route >= 0)
			close(priv->nl_socket_route);
		if (priv->nl_socket_rdma >= 0)
			close(priv->nl_socket_rdma);
		if (priv->vmwa_context)
			mlx5_vlan_vmwa_exit(priv->vmwa_context);
		if (eth_dev && priv->drop_queue.hrxq)
			mlx5_drop_action_destroy(eth_dev);
		if (own_domain_id)
			claim_zero(rte_eth_switch_domain_free(priv->domain_id));
		mlx5_cache_list_destroy(&priv->hrxqs);
		mlx5_free(priv);
		if (eth_dev != NULL)
			eth_dev->data->dev_private = NULL;
	}
	if (eth_dev != NULL) {
		/*
		 * mac_addrs must not be freed alone because it is part
		 * of dev_private.
		 */
		eth_dev->data->mac_addrs = NULL;
		rte_eth_dev_release_port(eth_dev);
	}
	if (sh)
		mlx5_free_shared_dev_ctx(sh);
	MLX5_ASSERT(err > 0);
	rte_errno = err;
	return NULL;
}

/**
 * Comparison callback to sort device data.
 *
 * This is meant to be used with qsort().
 *
 * @param[in] a
 *   Pointer to pointer to first data object.
 * @param[in] b
 *   Pointer to pointer to second data object.
 *
 * @return
 *   0 if both objects are equal, less than 0 if the first argument is less
 *   than the second, greater than 0 otherwise.
 */
static int
mlx5_dev_spawn_data_cmp(const void *a, const void *b)
{
	const struct mlx5_switch_info *si_a =
		&((const struct mlx5_dev_spawn_data *)a)->info;
	const struct mlx5_switch_info *si_b =
		&((const struct mlx5_dev_spawn_data *)b)->info;
	int ret;

	/* Master device first. */
	ret = si_b->master - si_a->master;
	if (ret)
		return ret;
	/* Then representor devices. */
	ret = si_b->representor - si_a->representor;
	if (ret)
		return ret;
	/* Unidentified devices come last in no specific order. */
	if (!si_a->representor)
		return 0;
	/* Order representors by name. */
	return si_a->port_name - si_b->port_name;
}
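/*
 * This comparator is used with qsort() at the end of mlx5_os_pci_probe():
 *
 *	qsort(list, ns, sizeof(*list), mlx5_dev_spawn_data_cmp);
 *
 * so that the master device is probed first, followed by representors
 * ordered by port name, with unidentified devices last.
 */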
/**
 * Match PCI information for possible slaves of bonding device.
 *
 * @param[in] ibv_dev
 *   Pointer to Infiniband device structure.
 * @param[in] pci_dev
 *   Pointer to PCI device structure to match PCI address.
 * @param[in] nl_rdma
 *   Netlink RDMA group socket handle.
 *
 * @return
 *   Negative value if no bonding device found, the non-negative index
 *   of the slave PF in the bonding otherwise.
 */
static int
mlx5_device_bond_pci_match(const struct ibv_device *ibv_dev,
			   const struct rte_pci_device *pci_dev,
			   int nl_rdma)
{
	char ifname[IF_NAMESIZE + 1];
	unsigned int ifindex;
	unsigned int np, i;
	FILE *file = NULL;
	int pf = -1;

	/*
	 * Try to get master device name. If something goes
	 * wrong suppose the lack of kernel support and no
	 * bonding devices.
	 */
	if (nl_rdma < 0)
		return -1;
	if (!strstr(ibv_dev->name, "bond"))
		return -1;
	np = mlx5_nl_portnum(nl_rdma, ibv_dev->name);
	if (!np)
		return -1;
	/*
	 * The master device might not be on the predefined
	 * port (not on port index 1, it is not guaranteed),
	 * we have to scan all Infiniband device ports and
	 * find the master.
	 */
	for (i = 1; i <= np; ++i) {
		/* Check whether Infiniband port is populated. */
		ifindex = mlx5_nl_ifindex(nl_rdma, ibv_dev->name, i);
		if (!ifindex)
			continue;
		if (!if_indextoname(ifindex, ifname))
			continue;
		/* Try to read bonding slave names from sysfs. */
		MKSTR(slaves,
		      "/sys/class/net/%s/master/bonding/slaves", ifname);
		file = fopen(slaves, "r");
		if (file)
			break;
	}
	if (!file)
		return -1;
	/* Use safe format to check maximal buffer length. */
	MLX5_ASSERT(atol(RTE_STR(IF_NAMESIZE)) == IF_NAMESIZE);
	while (fscanf(file, "%" RTE_STR(IF_NAMESIZE) "s", ifname) == 1) {
		char tmp_str[IF_NAMESIZE + 32];
		struct rte_pci_addr pci_addr;
		struct mlx5_switch_info info;

		/* Process slave interface names in the loop. */
		snprintf(tmp_str, sizeof(tmp_str),
			 "/sys/class/net/%s", ifname);
		if (mlx5_dev_to_pci_addr(tmp_str, &pci_addr)) {
			DRV_LOG(WARNING, "can not get PCI address"
				" for netdev \"%s\"", ifname);
			continue;
		}
		if (pci_dev->addr.domain != pci_addr.domain ||
		    pci_dev->addr.bus != pci_addr.bus ||
		    pci_dev->addr.devid != pci_addr.devid ||
		    pci_dev->addr.function != pci_addr.function)
			continue;
		/* Slave interface PCI address match found. */
		fclose(file);
		snprintf(tmp_str, sizeof(tmp_str),
			 "/sys/class/net/%s/phys_port_name", ifname);
		file = fopen(tmp_str, "rb");
		if (!file)
			break;
		info.name_type = MLX5_PHYS_PORT_NAME_TYPE_NOTSET;
		if (fscanf(file, "%32s", tmp_str) == 1)
			mlx5_translate_port_name(tmp_str, &info);
		if (info.name_type == MLX5_PHYS_PORT_NAME_TYPE_LEGACY ||
		    info.name_type == MLX5_PHYS_PORT_NAME_TYPE_UPLINK)
			pf = info.port_name;
		break;
	}
	if (file)
		fclose(file);
	return pf;
}
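/*
 * Note: the sysfs phys_port_name values parsed above follow the kernel
 * naming conventions handled by mlx5_translate_port_name(), e.g. "p0"
 * (uplink), "pf0vf2" (VF representor), or a bare number for legacy names.
 * Only legacy and uplink names identify the slave PF index here.
 */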
/**
 * DPDK callback to register a PCI device.
 *
 * This function spawns Ethernet devices out of a given PCI device.
 *
 * @param[in] pci_drv
 *   PCI driver structure (mlx5_driver).
 * @param[in] pci_dev
 *   PCI device information.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
int
mlx5_os_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
		  struct rte_pci_device *pci_dev)
{
	struct ibv_device **ibv_list;
	/*
	 * Number of found IB Devices matching with requested PCI BDF.
	 * nd != 1 means there are multiple IB devices over the same
	 * PCI device and we have representors and master.
	 */
	unsigned int nd = 0;
	/*
	 * Number of found IB device Ports. nd = 1 and np = 1..n means
	 * we have the single multiport IB device, and there may be
	 * representors attached to some of found ports.
	 */
	unsigned int np = 0;
	/*
	 * Number of DPDK ethernet devices to spawn - either over
	 * multiple IB devices or multiple ports of single IB device.
	 * Actually this is the number of iterations to spawn.
	 */
	unsigned int ns = 0;
	/*
	 * Bonding device
	 *   < 0 - no bonding device (single one)
	 *  >= 0 - bonding device (value is slave PF index)
	 */
	int bd = -1;
	struct mlx5_dev_spawn_data *list = NULL;
	struct mlx5_dev_config dev_config;
	unsigned int dev_config_vf;
	int ret;

	if (rte_eal_process_type() == RTE_PROC_PRIMARY)
		mlx5_pmd_socket_init();
	ret = mlx5_init_once();
	if (ret) {
		DRV_LOG(ERR, "unable to init PMD global data: %s",
			strerror(rte_errno));
		return -rte_errno;
	}
	errno = 0;
	ibv_list = mlx5_glue->get_device_list(&ret);
	if (!ibv_list) {
		rte_errno = errno ? errno : ENOSYS;
		DRV_LOG(ERR, "cannot list devices, is ib_uverbs loaded?");
		return -rte_errno;
	}
	/*
	 * First scan the list of all Infiniband devices to find
	 * matching ones, gathering into the list.
	 */
	struct ibv_device *ibv_match[ret + 1];
	int nl_route = mlx5_nl_init(NETLINK_ROUTE);
	int nl_rdma = mlx5_nl_init(NETLINK_RDMA);
	unsigned int i;

	while (ret-- > 0) {
		struct rte_pci_addr pci_addr;

		DRV_LOG(DEBUG, "checking device \"%s\"", ibv_list[ret]->name);
		bd = mlx5_device_bond_pci_match
				(ibv_list[ret], pci_dev, nl_rdma);
		if (bd >= 0) {
			/*
			 * Bonding device detected. Only one match is allowed,
			 * the bonding is supported over multi-port IB device,
			 * there should be no matches on representor PCI
			 * functions or non VF LAG bonding devices with
			 * specified address.
			 */
			if (nd) {
				DRV_LOG(ERR,
					"multiple PCI matches on bonding"
					" device \"%s\" found",
					ibv_list[ret]->name);
				rte_errno = ENOENT;
				ret = -rte_errno;
				goto exit;
			}
			DRV_LOG(INFO, "PCI information matches for"
				      " slave %d bonding device \"%s\"",
				      bd, ibv_list[ret]->name);
			ibv_match[nd++] = ibv_list[ret];
			break;
		}
		if (mlx5_dev_to_pci_addr
			(ibv_list[ret]->ibdev_path, &pci_addr))
			continue;
		if (pci_dev->addr.domain != pci_addr.domain ||
		    pci_dev->addr.bus != pci_addr.bus ||
		    pci_dev->addr.devid != pci_addr.devid ||
		    pci_dev->addr.function != pci_addr.function)
			continue;
		DRV_LOG(INFO, "PCI information matches for device \"%s\"",
			ibv_list[ret]->name);
		ibv_match[nd++] = ibv_list[ret];
	}
	ibv_match[nd] = NULL;
	if (!nd) {
		/* No device matches, just complain and bail out. */
		DRV_LOG(WARNING,
			"no Verbs device matches PCI device " PCI_PRI_FMT ","
			" are kernel drivers loaded?",
			pci_dev->addr.domain, pci_dev->addr.bus,
			pci_dev->addr.devid, pci_dev->addr.function);
		rte_errno = ENOENT;
		ret = -rte_errno;
		goto exit;
	}
	if (nd == 1) {
		/*
		 * The single matching device found may have multiple ports.
		 * Each port may be a representor, so check the port
		 * number and the representors' existence.
		 */
	if (nd == 1) {
		/*
		 * The single matching device found may have multiple ports.
		 * Each port may be a representor, so we have to check the
		 * port number and check for the representors' existence.
		 */
		if (nl_rdma >= 0)
			np = mlx5_nl_portnum(nl_rdma, ibv_match[0]->name);
		if (!np)
			DRV_LOG(WARNING, "cannot get IB device \"%s\""
				" ports number", ibv_match[0]->name);
		if (bd >= 0 && !np) {
			DRV_LOG(ERR, "cannot get ports"
				" for bonding device");
			rte_errno = ENOENT;
			ret = -rte_errno;
			goto exit;
		}
	}
#ifndef HAVE_MLX5DV_DR_DEVX_PORT
	if (bd >= 0) {
		/*
		 * This may happen if there is VF LAG kernel support and
		 * the application is compiled with an older rdma_core
		 * library.
		 */
		DRV_LOG(ERR,
			"No kernel/verbs support for VF LAG bonding found.");
		rte_errno = ENOTSUP;
		ret = -rte_errno;
		goto exit;
	}
#endif
	/*
	 * Now we can determine the maximal
	 * number of devices to be spawned.
	 */
	list = mlx5_malloc(MLX5_MEM_ZERO,
			   sizeof(struct mlx5_dev_spawn_data) *
			   (np ? np : nd),
			   RTE_CACHE_LINE_SIZE, SOCKET_ID_ANY);
	if (!list) {
		DRV_LOG(ERR, "spawn data array allocation failure");
		rte_errno = ENOMEM;
		ret = -rte_errno;
		goto exit;
	}
	if (bd >= 0 || np > 1) {
		/*
		 * Single IB device with multiple ports found,
		 * it may be E-Switch master device and representors.
		 * We have to perform identification through the ports.
		 */
		MLX5_ASSERT(nl_rdma >= 0);
		MLX5_ASSERT(ns == 0);
		MLX5_ASSERT(nd == 1);
		MLX5_ASSERT(np);
		for (i = 1; i <= np; ++i) {
			list[ns].max_port = np;
			list[ns].phys_port = i;
			list[ns].phys_dev = ibv_match[0];
			list[ns].eth_dev = NULL;
			list[ns].pci_dev = pci_dev;
			list[ns].pf_bond = bd;
			list[ns].ifindex = mlx5_nl_ifindex
				(nl_rdma,
				 mlx5_os_get_dev_device_name
						(list[ns].phys_dev), i);
			if (!list[ns].ifindex) {
				/*
				 * No network interface index found for the
				 * specified port, it means there is no
				 * representor on this port. It's OK,
				 * there can be disabled ports, for example
				 * if sriov_numvfs < sriov_totalvfs.
				 */
				continue;
			}
			ret = -1;
			if (nl_route >= 0)
				ret = mlx5_nl_switch_info
					(nl_route,
					 list[ns].ifindex,
					 &list[ns].info);
			if (ret || (!list[ns].info.representor &&
				    !list[ns].info.master)) {
				/*
				 * We failed to recognize representors with
				 * Netlink, let's try to perform the task
				 * with sysfs.
				 */
				ret = mlx5_sysfs_switch_info
						(list[ns].ifindex,
						 &list[ns].info);
			}
			if (!ret && bd >= 0) {
				switch (list[ns].info.name_type) {
				case MLX5_PHYS_PORT_NAME_TYPE_UPLINK:
					if (list[ns].info.port_name == bd)
						ns++;
					break;
				case MLX5_PHYS_PORT_NAME_TYPE_PFHPF:
					/* Fallthrough */
				case MLX5_PHYS_PORT_NAME_TYPE_PFVF:
					if (list[ns].info.pf_num == bd)
						ns++;
					break;
				default:
					break;
				}
				continue;
			}
			if (!ret && (list[ns].info.representor ^
				     list[ns].info.master))
				ns++;
		}
		if (!ns) {
			DRV_LOG(ERR,
				"unable to recognize master/representors"
				" on the IB device with multiple ports");
			rte_errno = ENOENT;
			ret = -rte_errno;
			goto exit;
		}
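		/*
		 * The acceptance rule used above, spelled out: a port is
		 * recorded for spawning when exactly one of the
		 * representor/master flags is set (the XOR), or, in the VF
		 * LAG bonding case, when the parsed phys_port_name refers to
		 * the slave PF index 'bd':
		 *
		 *	representor  master  spawned
		 *	     0          0       no
		 *	     1          0       yes
		 *	     0          1       yes
		 *	     1          1       no
		 */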
	} else {
		/*
		 * The existence of several matching entries (nd > 1) means
		 * port representors have been instantiated. No existing Verbs
		 * call nor sysfs entries can tell them apart, this can only
		 * be done through Netlink calls assuming kernel drivers are
		 * recent enough to support them.
		 *
		 * In the event of identification failure through Netlink,
		 * try again through sysfs, then:
		 *
		 * 1. A single IB device matches (nd == 1) with a single
		 *    port (np = 0/1) and is not a representor, assume
		 *    no switch support.
		 *
		 * 2. Otherwise no safe assumptions can be made;
		 *    complain louder and bail out.
		 */
		for (i = 0; i != nd; ++i) {
			memset(&list[ns].info, 0, sizeof(list[ns].info));
			list[ns].max_port = 1;
			list[ns].phys_port = 1;
			list[ns].phys_dev = ibv_match[i];
			list[ns].eth_dev = NULL;
			list[ns].pci_dev = pci_dev;
			list[ns].pf_bond = -1;
			list[ns].ifindex = 0;
			if (nl_rdma >= 0)
				list[ns].ifindex = mlx5_nl_ifindex
					(nl_rdma,
					 mlx5_os_get_dev_device_name
						(list[ns].phys_dev), 1);
			if (!list[ns].ifindex) {
				char ifname[IF_NAMESIZE];

				/*
				 * Netlink failed, it may happen with an old
				 * ib_core kernel driver (before 4.16).
				 * We can assume the driver is old because
				 * here we are processing single-port IB
				 * devices. Let's try sysfs to retrieve
				 * the ifindex. The method works for
				 * the master device only.
				 */
				if (nd > 1) {
					/*
					 * Multiple devices found, assume
					 * representors, cannot distinguish
					 * master/representor and retrieve
					 * ifindex via sysfs.
					 */
					continue;
				}
				ret = mlx5_get_ifname_sysfs
					(ibv_match[i]->ibdev_path, ifname);
				if (!ret)
					list[ns].ifindex =
						if_nametoindex(ifname);
				if (!list[ns].ifindex) {
					/*
					 * No network interface index found
					 * for the specified device, it means
					 * it is neither a representor
					 * nor a master.
					 */
					continue;
				}
			}
			ret = -1;
			if (nl_route >= 0)
				ret = mlx5_nl_switch_info
					(nl_route,
					 list[ns].ifindex,
					 &list[ns].info);
			if (ret || (!list[ns].info.representor &&
				    !list[ns].info.master)) {
				/*
				 * We failed to recognize representors with
				 * Netlink, let's try to perform the task
				 * with sysfs.
				 */
				ret = mlx5_sysfs_switch_info
						(list[ns].ifindex,
						 &list[ns].info);
			}
			if (!ret && (list[ns].info.representor ^
				     list[ns].info.master)) {
				ns++;
			} else if ((nd == 1) &&
				   !list[ns].info.representor &&
				   !list[ns].info.master) {
				/*
				 * Single IB device with
				 * one physical port and
				 * attached network device.
				 * Maybe SR-IOV is not enabled
				 * or there are no representors.
				 */
				DRV_LOG(INFO, "no E-Switch support detected");
				ns++;
				break;
			}
		}
		if (!ns) {
			DRV_LOG(ERR,
				"unable to recognize master/representors"
				" on the multiple IB devices");
			rte_errno = ENOENT;
			ret = -rte_errno;
			goto exit;
		}
	}
	MLX5_ASSERT(ns);
	/*
	 * Sort list to probe devices in natural order for users' convenience
	 * (i.e. master first, then representors from lowest to highest ID).
	 */
	qsort(list, ns, sizeof(*list), mlx5_dev_spawn_data_cmp);
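	/*
	 * mlx5_dev_spawn_data_cmp() is defined elsewhere in this file; the
	 * ordering it is expected to produce is sketched below (hypothetical
	 * illustration only, not the actual comparator):
	 *
	 *	static int cmp(const void *a, const void *b)
	 *	{
	 *		const struct mlx5_dev_spawn_data *sa = a, *sb = b;
	 *
	 *		// Master ports sort before representors.
	 *		if (sa->info.master != sb->info.master)
	 *			return sb->info.master - sa->info.master;
	 *		// Representors sort by increasing port name/ID.
	 *		return sa->info.port_name - sb->info.port_name;
	 *	}
	 */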
	/* Device-specific configuration. */
	switch (pci_dev->id.device_id) {
	case PCI_DEVICE_ID_MELLANOX_CONNECTX4VF:
	case PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF:
	case PCI_DEVICE_ID_MELLANOX_CONNECTX5VF:
	case PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF:
	case PCI_DEVICE_ID_MELLANOX_CONNECTX5BFVF:
	case PCI_DEVICE_ID_MELLANOX_CONNECTX6VF:
	case PCI_DEVICE_ID_MELLANOX_CONNECTX6DXVF:
		dev_config_vf = 1;
		break;
	default:
		dev_config_vf = 0;
		break;
	}
	for (i = 0; i != ns; ++i) {
		uint32_t restore;

		/* Default configuration. */
		memset(&dev_config, 0, sizeof(struct mlx5_dev_config));
		dev_config.vf = dev_config_vf;
		dev_config.mps = MLX5_ARG_UNSET;
		dev_config.dbnc = MLX5_ARG_UNSET;
		dev_config.rx_vec_en = 1;
		dev_config.txq_inline_max = MLX5_ARG_UNSET;
		dev_config.txq_inline_min = MLX5_ARG_UNSET;
		dev_config.txq_inline_mpw = MLX5_ARG_UNSET;
		dev_config.txqs_inline = MLX5_ARG_UNSET;
		dev_config.vf_nl_en = 1;
		dev_config.mr_ext_memseg_en = 1;
		dev_config.mprq.max_memcpy_len = MLX5_MPRQ_MEMCPY_DEFAULT_LEN;
		dev_config.mprq.min_rxqs_num = MLX5_MPRQ_MIN_RXQS;
		dev_config.dv_esw_en = 1;
		dev_config.dv_flow_en = 1;
		dev_config.decap_en = 1;
		dev_config.log_hp_size = MLX5_ARG_UNSET;
		list[i].eth_dev = mlx5_dev_spawn(&pci_dev->device,
						 &list[i],
						 &dev_config);
		if (!list[i].eth_dev) {
			if (rte_errno != EBUSY && rte_errno != EEXIST)
				break;
			/* Device is disabled or already spawned. Ignore it. */
			continue;
		}
		restore = list[i].eth_dev->data->dev_flags;
		rte_eth_copy_pci_info(list[i].eth_dev, pci_dev);
		/* Restore non-PCI flags cleared by the above call. */
		list[i].eth_dev->data->dev_flags |= restore;
		rte_eth_dev_probing_finish(list[i].eth_dev);
	}
	if (i != ns) {
		DRV_LOG(ERR,
			"probe of PCI device " PCI_PRI_FMT " aborted after"
			" encountering an error: %s",
			pci_dev->addr.domain, pci_dev->addr.bus,
			pci_dev->addr.devid, pci_dev->addr.function,
			strerror(rte_errno));
		ret = -rte_errno;
		/* Roll back. */
		while (i--) {
			if (!list[i].eth_dev)
				continue;
			mlx5_dev_close(list[i].eth_dev);
			/* mac_addrs must not be freed, it is part of dev_private. */
			list[i].eth_dev->data->mac_addrs = NULL;
			claim_zero(rte_eth_dev_release_port(list[i].eth_dev));
		}
		/* Restore original error. */
		rte_errno = -ret;
	} else {
		ret = 0;
	}
exit:
	/*
	 * Do the routine cleanup:
	 * - close opened Netlink sockets
	 * - free allocated spawn data array
	 * - free the InfiniBand device list
	 */
	if (nl_rdma >= 0)
		close(nl_rdma);
	if (nl_route >= 0)
		close(nl_route);
	if (list)
		mlx5_free(list);
	MLX5_ASSERT(ibv_list);
	mlx5_glue->free_device_list(ibv_list);
	return ret;
}
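/*
 * The two helpers below save, override and restore the MLX5_SHUT_UP_BF
 * environment variable consumed by rdma-core at device-open time. The
 * intended mapping, assuming MLX5_TXDB_NCACHED selects the non-cached
 * doorbell mode:
 *
 *	dbnc == MLX5_ARG_UNSET    -> MLX5_SHUT_UP_BF_DEFAULT
 *	dbnc == MLX5_TXDB_NCACHED -> "1" (BlueFlame off)
 *	otherwise                 -> "0" (BlueFlame on)
 *
 * The previous value is returned so that it can be restored once the
 * device has been opened.
 */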
"1" : "0", 1); 2160 return value; 2161 } 2162 2163 static void 2164 mlx5_restore_doorbell_mapping_env(int value) 2165 { 2166 MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY); 2167 /* Restore the original environment variable state. */ 2168 if (value == MLX5_ARG_UNSET) 2169 unsetenv(MLX5_SHUT_UP_BF); 2170 else 2171 setenv(MLX5_SHUT_UP_BF, value ? "1" : "0", 1); 2172 } 2173 2174 /** 2175 * Extract pdn of PD object using DV API. 2176 * 2177 * @param[in] pd 2178 * Pointer to the verbs PD object. 2179 * @param[out] pdn 2180 * Pointer to the PD object number variable. 2181 * 2182 * @return 2183 * 0 on success, error value otherwise. 2184 */ 2185 int 2186 mlx5_os_get_pdn(void *pd, uint32_t *pdn) 2187 { 2188 #ifdef HAVE_IBV_FLOW_DV_SUPPORT 2189 struct mlx5dv_obj obj; 2190 struct mlx5dv_pd pd_info; 2191 int ret = 0; 2192 2193 obj.pd.in = pd; 2194 obj.pd.out = &pd_info; 2195 ret = mlx5_glue->dv_init_obj(&obj, MLX5DV_OBJ_PD); 2196 if (ret) { 2197 DRV_LOG(DEBUG, "Fail to get PD object info"); 2198 return ret; 2199 } 2200 *pdn = pd_info.pdn; 2201 return 0; 2202 #else 2203 (void)pd; 2204 (void)pdn; 2205 return -ENOTSUP; 2206 #endif /* HAVE_IBV_FLOW_DV_SUPPORT */ 2207 } 2208 2209 /** 2210 * Function API to open IB device. 2211 * 2212 * This function calls the Linux glue APIs to open a device. 2213 * 2214 * @param[in] spawn 2215 * Pointer to the IB device attributes (name, port, etc). 2216 * @param[out] config 2217 * Pointer to device configuration structure. 2218 * @param[out] sh 2219 * Pointer to shared context structure. 2220 * 2221 * @return 2222 * 0 on success, a positive error value otherwise. 2223 */ 2224 int 2225 mlx5_os_open_device(const struct mlx5_dev_spawn_data *spawn, 2226 const struct mlx5_dev_config *config, 2227 struct mlx5_dev_ctx_shared *sh) 2228 { 2229 int dbmap_env; 2230 int err = 0; 2231 2232 sh->numa_node = spawn->pci_dev->device.numa_node; 2233 pthread_mutex_init(&sh->txpp.mutex, NULL); 2234 /* 2235 * Configure environment variable "MLX5_BF_SHUT_UP" 2236 * before the device creation. The rdma_core library 2237 * checks the variable at device creation and 2238 * stores the result internally. 2239 */ 2240 dbmap_env = mlx5_config_doorbell_mapping_env(config); 2241 /* Try to open IB device with DV first, then usual Verbs. */ 2242 errno = 0; 2243 sh->ctx = mlx5_glue->dv_open_device(spawn->phys_dev); 2244 if (sh->ctx) { 2245 sh->devx = 1; 2246 DRV_LOG(DEBUG, "DevX is supported"); 2247 /* The device is created, no need for environment. */ 2248 mlx5_restore_doorbell_mapping_env(dbmap_env); 2249 } else { 2250 /* The environment variable is still configured. */ 2251 sh->ctx = mlx5_glue->open_device(spawn->phys_dev); 2252 err = errno ? errno : ENODEV; 2253 /* 2254 * The environment variable is not needed anymore, 2255 * all device creation attempts are completed. 2256 */ 2257 mlx5_restore_doorbell_mapping_env(dbmap_env); 2258 if (!sh->ctx) 2259 return err; 2260 DRV_LOG(DEBUG, "DevX is NOT supported"); 2261 err = 0; 2262 } 2263 return err; 2264 } 2265 2266 /** 2267 * Install shared asynchronous device events handler. 2268 * This function is implemented to support event sharing 2269 * between multiple ports of single IB device. 2270 * 2271 * @param sh 2272 * Pointer to mlx5_dev_ctx_shared object. 
/**
 * Install the shared asynchronous device events handler.
 * This function is implemented to support event sharing
 * between multiple ports of a single IB device.
 *
 * @param sh
 *   Pointer to mlx5_dev_ctx_shared object.
 */
void
mlx5_os_dev_shared_handler_install(struct mlx5_dev_ctx_shared *sh)
{
	int ret;
	int flags;

	sh->intr_handle.fd = -1;
	flags = fcntl(((struct ibv_context *)sh->ctx)->async_fd, F_GETFL);
	ret = fcntl(((struct ibv_context *)sh->ctx)->async_fd,
		    F_SETFL, flags | O_NONBLOCK);
	if (ret) {
		DRV_LOG(INFO, "failed to change the file descriptor of the"
			" async event queue");
	} else {
		sh->intr_handle.fd = ((struct ibv_context *)sh->ctx)->async_fd;
		sh->intr_handle.type = RTE_INTR_HANDLE_EXT;
		if (rte_intr_callback_register(&sh->intr_handle,
					mlx5_dev_interrupt_handler, sh)) {
			DRV_LOG(INFO, "Failed to install the shared"
				" interrupt.");
			sh->intr_handle.fd = -1;
		}
	}
	if (sh->devx) {
#ifdef HAVE_IBV_DEVX_ASYNC
		sh->intr_handle_devx.fd = -1;
		sh->devx_comp =
			(void *)mlx5_glue->devx_create_cmd_comp(sh->ctx);
		struct mlx5dv_devx_cmd_comp *devx_comp = sh->devx_comp;
		if (!devx_comp) {
			DRV_LOG(INFO, "failed to allocate devx_comp.");
			return;
		}
		flags = fcntl(devx_comp->fd, F_GETFL);
		ret = fcntl(devx_comp->fd, F_SETFL, flags | O_NONBLOCK);
		if (ret) {
			DRV_LOG(INFO, "failed to change the file descriptor"
				" of the devx comp");
			return;
		}
		sh->intr_handle_devx.fd = devx_comp->fd;
		sh->intr_handle_devx.type = RTE_INTR_HANDLE_EXT;
		if (rte_intr_callback_register(&sh->intr_handle_devx,
					mlx5_dev_interrupt_handler_devx, sh)) {
			DRV_LOG(INFO, "Failed to install the devx shared"
				" interrupt.");
			sh->intr_handle_devx.fd = -1;
		}
#endif /* HAVE_IBV_DEVX_ASYNC */
	}
}

/**
 * Uninstall the shared asynchronous device events handler.
 * This function is implemented to support event sharing
 * between multiple ports of a single IB device.
 *
 * @param sh
 *   Pointer to mlx5_dev_ctx_shared object.
 */
void
mlx5_os_dev_shared_handler_uninstall(struct mlx5_dev_ctx_shared *sh)
{
	if (sh->intr_handle.fd >= 0)
		mlx5_intr_callback_unregister(&sh->intr_handle,
					      mlx5_dev_interrupt_handler, sh);
#ifdef HAVE_IBV_DEVX_ASYNC
	if (sh->intr_handle_devx.fd >= 0)
		rte_intr_callback_unregister(&sh->intr_handle_devx,
				  mlx5_dev_interrupt_handler_devx, sh);
	if (sh->devx_comp)
		mlx5_glue->devx_destroy_cmd_comp(sh->devx_comp);
#endif
}
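/*
 * The install/uninstall pair above follows the usual DPDK external
 * interrupt pattern. A condensed sketch of that pattern, assuming an
 * arbitrary event file descriptor 'event_fd' obtained elsewhere and a
 * hypothetical 'handler_cb'/'cb_arg' pair:
 *
 *	struct rte_intr_handle ih = { .fd = event_fd,
 *				      .type = RTE_INTR_HANDLE_EXT };
 *	int flags = fcntl(event_fd, F_GETFL);
 *
 *	// The fd must be non-blocking for the interrupt thread.
 *	fcntl(event_fd, F_SETFL, flags | O_NONBLOCK);
 *	rte_intr_callback_register(&ih, handler_cb, cb_arg);
 *	// ... later, on teardown ...
 *	rte_intr_callback_unregister(&ih, handler_cb, cb_arg);
 */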
/**
 * Read statistics by a named counter.
 *
 * @param[in] priv
 *   Pointer to the private device data structure.
 * @param[in] ctr_name
 *   Pointer to the name of the statistic counter to read.
 * @param[out] stat
 *   Pointer to the read statistic value.
 *
 * @return
 *   0 on success and *stat is valid, 1 if failed to read the value
 *   and rte_errno is set.
 */
int
mlx5_os_read_dev_stat(struct mlx5_priv *priv, const char *ctr_name,
		      uint64_t *stat)
{
	int fd;

	if (priv->sh) {
		MKSTR(path, "%s/ports/%d/hw_counters/%s",
		      priv->sh->ibdev_path,
		      priv->dev_port,
		      ctr_name);
		fd = open(path, O_RDONLY);
		/*
		 * In switchdev the file location is not per port
		 * but rather in <ibdev_path>/hw_counters/<file_name>.
		 */
		if (fd == -1) {
			MKSTR(path1, "%s/hw_counters/%s",
			      priv->sh->ibdev_path,
			      ctr_name);
			fd = open(path1, O_RDONLY);
		}
		if (fd != -1) {
			char buf[21] = {'\0'};
			/* Leave room for the terminating NUL byte. */
			ssize_t n = read(fd, buf, sizeof(buf) - 1);

			close(fd);
			if (n != -1) {
				*stat = strtoull(buf, NULL, 10);
				return 0;
			}
		}
	}
	*stat = 0;
	return 1;
}

/**
 * Set the reg_mr and dereg_mr callbacks.
 *
 * @param reg_mr_cb[out]
 *   Pointer to the reg_mr func.
 * @param dereg_mr_cb[out]
 *   Pointer to the dereg_mr func.
 */
void
mlx5_os_set_reg_mr_cb(mlx5_reg_mr_t *reg_mr_cb,
		      mlx5_dereg_mr_t *dereg_mr_cb)
{
	*reg_mr_cb = mlx5_verbs_ops.reg_mr;
	*dereg_mr_cb = mlx5_verbs_ops.dereg_mr;
}

/**
 * Remove a MAC address from the device.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param index
 *   MAC address index.
 */
void
mlx5_os_mac_addr_remove(struct rte_eth_dev *dev, uint32_t index)
{
	struct mlx5_priv *priv = dev->data->dev_private;
	const int vf = priv->config.vf;

	if (vf)
		mlx5_nl_mac_addr_remove(priv->nl_socket_route,
					mlx5_ifindex(dev), priv->mac_own,
					&dev->data->mac_addrs[index], index);
}

/**
 * Add a MAC address to the device.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param mac_addr
 *   MAC address to register.
 * @param index
 *   MAC address index.
 *
 * @return
 *   0 on success, a negative errno value otherwise.
 */
int
mlx5_os_mac_addr_add(struct rte_eth_dev *dev, struct rte_ether_addr *mac,
		     uint32_t index)
{
	struct mlx5_priv *priv = dev->data->dev_private;
	const int vf = priv->config.vf;
	int ret = 0;

	if (vf)
		ret = mlx5_nl_mac_addr_add(priv->nl_socket_route,
					   mlx5_ifindex(dev), priv->mac_own,
					   mac, index);
	return ret;
}

/**
 * Modify a VF MAC address.
 *
 * @param priv
 *   Pointer to device private data.
 * @param mac_addr
 *   MAC address to modify into.
 * @param iface_idx
 *   Net device interface index.
 * @param vf_index
 *   VF index.
 *
 * @return
 *   0 on success, a negative errno value otherwise.
 */
int
mlx5_os_vf_mac_addr_modify(struct mlx5_priv *priv,
			   unsigned int iface_idx,
			   struct rte_ether_addr *mac_addr,
			   int vf_index)
{
	return mlx5_nl_vf_mac_addr_modify
		(priv->nl_socket_route, iface_idx, mac_addr, vf_index);
}

/**
 * Set device promiscuous mode.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param enable
 *   0 - promiscuous is disabled, otherwise - enabled.
 *
 * @return
 *   0 on success, a negative error value otherwise.
 */
int
mlx5_os_set_promisc(struct rte_eth_dev *dev, int enable)
{
	struct mlx5_priv *priv = dev->data->dev_private;

	return mlx5_nl_promisc(priv->nl_socket_route,
			       mlx5_ifindex(dev), !!enable);
}
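/*
 * Usage sketch for mlx5_os_read_dev_stat() above (illustrative;
 * "out_of_buffer" is used here as an example of a hw_counters entry
 * name, its presence depends on the device and kernel):
 *
 *	uint64_t value;
 *
 *	if (mlx5_os_read_dev_stat(priv, "out_of_buffer", &value) == 0)
 *		DRV_LOG(DEBUG, "out_of_buffer: %" PRIu64, value);
 *	// A return of 1 means the sysfs entry could not be read and
 *	// 'value' was zeroed.
 */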
/**
 * Set device allmulticast mode.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param enable
 *   0 - all multicast is disabled, otherwise - enabled.
 *
 * @return
 *   0 on success, a negative error value otherwise.
 */
int
mlx5_os_set_allmulti(struct rte_eth_dev *dev, int enable)
{
	struct mlx5_priv *priv = dev->data->dev_private;

	return mlx5_nl_allmulti(priv->nl_socket_route,
				mlx5_ifindex(dev), !!enable);
}

/**
 * Flush device MAC addresses.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 */
void
mlx5_os_mac_addr_flush(struct rte_eth_dev *dev)
{
	struct mlx5_priv *priv = dev->data->dev_private;

	mlx5_nl_mac_addr_flush(priv->nl_socket_route, mlx5_ifindex(dev),
			       dev->data->mac_addrs,
			       MLX5_MAX_MAC_ADDRESSES, priv->mac_own);
}

const struct eth_dev_ops mlx5_os_dev_ops = {
	.dev_configure = mlx5_dev_configure,
	.dev_start = mlx5_dev_start,
	.dev_stop = mlx5_dev_stop,
	.dev_set_link_down = mlx5_set_link_down,
	.dev_set_link_up = mlx5_set_link_up,
	.dev_close = mlx5_dev_close,
	.promiscuous_enable = mlx5_promiscuous_enable,
	.promiscuous_disable = mlx5_promiscuous_disable,
	.allmulticast_enable = mlx5_allmulticast_enable,
	.allmulticast_disable = mlx5_allmulticast_disable,
	.link_update = mlx5_link_update,
	.stats_get = mlx5_stats_get,
	.stats_reset = mlx5_stats_reset,
	.xstats_get = mlx5_xstats_get,
	.xstats_reset = mlx5_xstats_reset,
	.xstats_get_names = mlx5_xstats_get_names,
	.fw_version_get = mlx5_fw_version_get,
	.dev_infos_get = mlx5_dev_infos_get,
	.read_clock = mlx5_txpp_read_clock,
	.dev_supported_ptypes_get = mlx5_dev_supported_ptypes_get,
	.vlan_filter_set = mlx5_vlan_filter_set,
	.rx_queue_setup = mlx5_rx_queue_setup,
	.rx_hairpin_queue_setup = mlx5_rx_hairpin_queue_setup,
	.tx_queue_setup = mlx5_tx_queue_setup,
	.tx_hairpin_queue_setup = mlx5_tx_hairpin_queue_setup,
	.rx_queue_release = mlx5_rx_queue_release,
	.tx_queue_release = mlx5_tx_queue_release,
	.rx_queue_start = mlx5_rx_queue_start,
	.rx_queue_stop = mlx5_rx_queue_stop,
	.tx_queue_start = mlx5_tx_queue_start,
	.tx_queue_stop = mlx5_tx_queue_stop,
	.flow_ctrl_get = mlx5_dev_get_flow_ctrl,
	.flow_ctrl_set = mlx5_dev_set_flow_ctrl,
	.mac_addr_remove = mlx5_mac_addr_remove,
	.mac_addr_add = mlx5_mac_addr_add,
	.mac_addr_set = mlx5_mac_addr_set,
	.set_mc_addr_list = mlx5_set_mc_addr_list,
	.mtu_set = mlx5_dev_set_mtu,
	.vlan_strip_queue_set = mlx5_vlan_strip_queue_set,
	.vlan_offload_set = mlx5_vlan_offload_set,
	.reta_update = mlx5_dev_rss_reta_update,
	.reta_query = mlx5_dev_rss_reta_query,
	.rss_hash_update = mlx5_rss_hash_update,
	.rss_hash_conf_get = mlx5_rss_hash_conf_get,
	.filter_ctrl = mlx5_dev_filter_ctrl,
	.rxq_info_get = mlx5_rxq_info_get,
	.txq_info_get = mlx5_txq_info_get,
	.rx_burst_mode_get = mlx5_rx_burst_mode_get,
	.tx_burst_mode_get = mlx5_tx_burst_mode_get,
	.rx_queue_intr_enable = mlx5_rx_intr_enable,
	.rx_queue_intr_disable = mlx5_rx_intr_disable,
	.is_removed = mlx5_is_removed,
	.udp_tunnel_port_add = mlx5_udp_tunnel_port_add,
	.get_module_info = mlx5_get_module_info,
	.get_module_eeprom = mlx5_get_module_eeprom,
	.hairpin_cap_get = mlx5_hairpin_cap_get,
	.mtr_ops_get = mlx5_flow_meter_ops_get,
	.hairpin_bind = mlx5_hairpin_bind,
	.hairpin_unbind = mlx5_hairpin_unbind,
	.hairpin_get_peer_ports = mlx5_hairpin_get_peer_ports,
	.hairpin_queue_peer_update = mlx5_hairpin_queue_peer_update,
	.hairpin_queue_peer_bind = mlx5_hairpin_queue_peer_bind,
	.hairpin_queue_peer_unbind = mlx5_hairpin_queue_peer_unbind,
};
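/*
 * How these tables are consumed (sketch; the actual selection is done
 * elsewhere in the PMD at spawn time and in the flow isolation handler):
 * the primary process wires the full table above, and switching a port
 * to flow isolated mode replaces it with the reduced isolated table:
 *
 *	eth_dev->dev_ops = &mlx5_os_dev_ops;
 *	// after a successful rte_flow_isolate(port_id, 1, &error):
 *	eth_dev->dev_ops = &mlx5_os_dev_ops_isolate;
 */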
/* Available operations from secondary process. */
const struct eth_dev_ops mlx5_os_dev_sec_ops = {
	.stats_get = mlx5_stats_get,
	.stats_reset = mlx5_stats_reset,
	.xstats_get = mlx5_xstats_get,
	.xstats_reset = mlx5_xstats_reset,
	.xstats_get_names = mlx5_xstats_get_names,
	.fw_version_get = mlx5_fw_version_get,
	.dev_infos_get = mlx5_dev_infos_get,
	.read_clock = mlx5_txpp_read_clock,
	.rx_queue_start = mlx5_rx_queue_start,
	.rx_queue_stop = mlx5_rx_queue_stop,
	.tx_queue_start = mlx5_tx_queue_start,
	.tx_queue_stop = mlx5_tx_queue_stop,
	.rxq_info_get = mlx5_rxq_info_get,
	.txq_info_get = mlx5_txq_info_get,
	.rx_burst_mode_get = mlx5_rx_burst_mode_get,
	.tx_burst_mode_get = mlx5_tx_burst_mode_get,
	.get_module_info = mlx5_get_module_info,
	.get_module_eeprom = mlx5_get_module_eeprom,
};
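/*
 * Secondary processes get the reduced table above: only operations that
 * do not own the device control path (statistics, info queries, queue
 * start/stop) are exposed. A sketch of the expected selection, assuming
 * it happens during the secondary-process probe path:
 *
 *	if (rte_eal_process_type() == RTE_PROC_SECONDARY)
 *		eth_dev->dev_ops = &mlx5_os_dev_sec_ops;
 */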
/* Available operations in flow isolated mode. */
const struct eth_dev_ops mlx5_os_dev_ops_isolate = {
	.dev_configure = mlx5_dev_configure,
	.dev_start = mlx5_dev_start,
	.dev_stop = mlx5_dev_stop,
	.dev_set_link_down = mlx5_set_link_down,
	.dev_set_link_up = mlx5_set_link_up,
	.dev_close = mlx5_dev_close,
	.promiscuous_enable = mlx5_promiscuous_enable,
	.promiscuous_disable = mlx5_promiscuous_disable,
	.allmulticast_enable = mlx5_allmulticast_enable,
	.allmulticast_disable = mlx5_allmulticast_disable,
	.link_update = mlx5_link_update,
	.stats_get = mlx5_stats_get,
	.stats_reset = mlx5_stats_reset,
	.xstats_get = mlx5_xstats_get,
	.xstats_reset = mlx5_xstats_reset,
	.xstats_get_names = mlx5_xstats_get_names,
	.fw_version_get = mlx5_fw_version_get,
	.dev_infos_get = mlx5_dev_infos_get,
	.read_clock = mlx5_txpp_read_clock,
	.dev_supported_ptypes_get = mlx5_dev_supported_ptypes_get,
	.vlan_filter_set = mlx5_vlan_filter_set,
	.rx_queue_setup = mlx5_rx_queue_setup,
	.rx_hairpin_queue_setup = mlx5_rx_hairpin_queue_setup,
	.tx_queue_setup = mlx5_tx_queue_setup,
	.tx_hairpin_queue_setup = mlx5_tx_hairpin_queue_setup,
	.rx_queue_release = mlx5_rx_queue_release,
	.tx_queue_release = mlx5_tx_queue_release,
	.rx_queue_start = mlx5_rx_queue_start,
	.rx_queue_stop = mlx5_rx_queue_stop,
	.tx_queue_start = mlx5_tx_queue_start,
	.tx_queue_stop = mlx5_tx_queue_stop,
	.flow_ctrl_get = mlx5_dev_get_flow_ctrl,
	.flow_ctrl_set = mlx5_dev_set_flow_ctrl,
	.mac_addr_remove = mlx5_mac_addr_remove,
	.mac_addr_add = mlx5_mac_addr_add,
	.mac_addr_set = mlx5_mac_addr_set,
	.set_mc_addr_list = mlx5_set_mc_addr_list,
	.mtu_set = mlx5_dev_set_mtu,
	.vlan_strip_queue_set = mlx5_vlan_strip_queue_set,
	.vlan_offload_set = mlx5_vlan_offload_set,
	.filter_ctrl = mlx5_dev_filter_ctrl,
	.rxq_info_get = mlx5_rxq_info_get,
	.txq_info_get = mlx5_txq_info_get,
	.rx_burst_mode_get = mlx5_rx_burst_mode_get,
	.tx_burst_mode_get = mlx5_tx_burst_mode_get,
	.rx_queue_intr_enable = mlx5_rx_intr_enable,
	.rx_queue_intr_disable = mlx5_rx_intr_disable,
	.is_removed = mlx5_is_removed,
	.get_module_info = mlx5_get_module_info,
	.get_module_eeprom = mlx5_get_module_eeprom,
	.hairpin_cap_get = mlx5_hairpin_cap_get,
	.mtr_ops_get = mlx5_flow_meter_ops_get,
	.hairpin_bind = mlx5_hairpin_bind,
	.hairpin_unbind = mlx5_hairpin_unbind,
	.hairpin_get_peer_ports = mlx5_hairpin_get_peer_ports,
	.hairpin_queue_peer_update = mlx5_hairpin_queue_peer_update,
	.hairpin_queue_peer_bind = mlx5_hairpin_queue_peer_bind,
	.hairpin_queue_peer_unbind = mlx5_hairpin_queue_peer_unbind,
};