1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright 2015 6WIND S.A. 3 * Copyright 2020 Mellanox Technologies, Ltd 4 */ 5 6 #include <stddef.h> 7 #include <unistd.h> 8 #include <string.h> 9 #include <stdint.h> 10 #include <stdlib.h> 11 #include <errno.h> 12 #include <net/if.h> 13 #include <sys/mman.h> 14 #include <linux/rtnetlink.h> 15 #include <fcntl.h> 16 17 /* Verbs header. */ 18 /* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */ 19 #ifdef PEDANTIC 20 #pragma GCC diagnostic ignored "-Wpedantic" 21 #endif 22 #include <infiniband/verbs.h> 23 #ifdef PEDANTIC 24 #pragma GCC diagnostic error "-Wpedantic" 25 #endif 26 27 #include <rte_malloc.h> 28 #include <rte_ethdev_driver.h> 29 #include <rte_ethdev_pci.h> 30 #include <rte_pci.h> 31 #include <rte_bus_pci.h> 32 #include <rte_common.h> 33 #include <rte_kvargs.h> 34 #include <rte_rwlock.h> 35 #include <rte_spinlock.h> 36 #include <rte_string_fns.h> 37 #include <rte_alarm.h> 38 39 #include <mlx5_glue.h> 40 #include <mlx5_devx_cmds.h> 41 #include <mlx5_common.h> 42 #include <mlx5_common_mp.h> 43 44 #include "mlx5_defs.h" 45 #include "mlx5.h" 46 #include "mlx5_utils.h" 47 #include "mlx5_rxtx.h" 48 #include "mlx5_autoconf.h" 49 #include "mlx5_mr.h" 50 #include "mlx5_flow.h" 51 #include "rte_pmd_mlx5.h" 52 53 #define MLX5_TAGS_HLIST_ARRAY_SIZE 8192 54 55 #ifndef HAVE_IBV_MLX5_MOD_MPW 56 #define MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED (1 << 2) 57 #define MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW (1 << 3) 58 #endif 59 60 #ifndef HAVE_IBV_MLX5_MOD_CQE_128B_COMP 61 #define MLX5DV_CONTEXT_FLAGS_CQE_128B_COMP (1 << 4) 62 #endif 63 64 /** 65 * Get ibv device name. Given an ibv_context pointer - return a 66 * pointer to the corresponding device name. 67 * 68 * @param[in] ctx 69 * Pointer to ibv context. 70 * 71 * @return 72 * Pointer to device name if ctx is valid, NULL otherwise. 73 */ 74 const char * 75 mlx5_os_get_ctx_device_name(void *ctx) 76 { 77 if (!ctx) 78 return NULL; 79 return ((struct ibv_context *)ctx)->device->name; 80 } 81 82 /** 83 * Get ibv device path name. Given an ibv_context pointer - return a 84 * pointer to the corresponding device path name. 85 * 86 * @param[in] ctx 87 * Pointer to ibv context. 88 * 89 * @return 90 * Pointer to device path name if ctx is valid, NULL otherwise. 91 */ 92 const char * 93 mlx5_os_get_ctx_device_path(void *ctx) 94 { 95 if (!ctx) 96 return NULL; 97 98 return ((struct ibv_context *)ctx)->device->ibdev_path; 99 } 100 101 /** 102 * Get umem id. Given a pointer to umem object of type 103 * 'struct mlx5dv_devx_umem *' - return its id. 104 * 105 * @param[in] umem 106 * Pointer to umem object. 107 * 108 * @return 109 * The umem id if umem is valid, 0 otherwise. 110 */ 111 uint32_t 112 mlx5_os_get_umem_id(void *umem) 113 { 114 if (!umem) 115 return 0; 116 return ((struct mlx5dv_devx_umem *)umem)->umem_id; 117 } 118 119 /** 120 * Get mlx5 device attributes. The glue function query_device_ex() is called 121 * with out parameter of type 'struct ibv_device_attr_ex *'. Then fill in mlx5 122 * device attributes from the glue out parameter. 123 * 124 * @param dev 125 * Pointer to ibv context. 126 * 127 * @param device_attr 128 * Pointer to mlx5 device attributes. 129 * 130 * @return 131 * 0 on success, non zero error number otherwise 132 */ 133 int 134 mlx5_os_get_dev_attr(void *ctx, struct mlx5_dev_attr *device_attr) 135 { 136 int err; 137 struct ibv_device_attr_ex attr_ex; 138 memset(device_attr, 0, sizeof(*device_attr)); 139 err = mlx5_glue->query_device_ex(ctx, NULL, &attr_ex); 140 if (err) 141 return err; 142 143 device_attr->device_cap_flags_ex = attr_ex.device_cap_flags_ex; 144 device_attr->max_qp_wr = attr_ex.orig_attr.max_qp_wr; 145 device_attr->max_sge = attr_ex.orig_attr.max_sge; 146 device_attr->max_cq = attr_ex.orig_attr.max_cq; 147 device_attr->max_qp = attr_ex.orig_attr.max_qp; 148 device_attr->raw_packet_caps = attr_ex.raw_packet_caps; 149 device_attr->max_rwq_indirection_table_size = 150 attr_ex.rss_caps.max_rwq_indirection_table_size; 151 device_attr->max_tso = attr_ex.tso_caps.max_tso; 152 device_attr->tso_supported_qpts = attr_ex.tso_caps.supported_qpts; 153 154 struct mlx5dv_context dv_attr = { .comp_mask = 0 }; 155 err = mlx5_glue->dv_query_device(ctx, &dv_attr); 156 if (err) 157 return err; 158 159 device_attr->flags = dv_attr.flags; 160 device_attr->comp_mask = dv_attr.comp_mask; 161 #ifdef HAVE_IBV_MLX5_MOD_SWP 162 device_attr->sw_parsing_offloads = 163 dv_attr.sw_parsing_caps.sw_parsing_offloads; 164 #endif 165 device_attr->min_single_stride_log_num_of_bytes = 166 dv_attr.striding_rq_caps.min_single_stride_log_num_of_bytes; 167 device_attr->max_single_stride_log_num_of_bytes = 168 dv_attr.striding_rq_caps.max_single_stride_log_num_of_bytes; 169 device_attr->min_single_wqe_log_num_of_strides = 170 dv_attr.striding_rq_caps.min_single_wqe_log_num_of_strides; 171 device_attr->max_single_wqe_log_num_of_strides = 172 dv_attr.striding_rq_caps.max_single_wqe_log_num_of_strides; 173 device_attr->stride_supported_qpts = 174 dv_attr.striding_rq_caps.supported_qpts; 175 #ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT 176 device_attr->tunnel_offloads_caps = dv_attr.tunnel_offloads_caps; 177 #endif 178 179 return err; 180 } 181 182 /** 183 * Verbs callback to allocate a memory. This function should allocate the space 184 * according to the size provided residing inside a huge page. 185 * Please note that all allocation must respect the alignment from libmlx5 186 * (i.e. currently sysconf(_SC_PAGESIZE)). 187 * 188 * @param[in] size 189 * The size in bytes of the memory to allocate. 190 * @param[in] data 191 * A pointer to the callback data. 192 * 193 * @return 194 * Allocated buffer, NULL otherwise and rte_errno is set. 195 */ 196 static void * 197 mlx5_alloc_verbs_buf(size_t size, void *data) 198 { 199 struct mlx5_priv *priv = data; 200 void *ret; 201 size_t alignment = sysconf(_SC_PAGESIZE); 202 unsigned int socket = SOCKET_ID_ANY; 203 204 if (priv->verbs_alloc_ctx.type == MLX5_VERBS_ALLOC_TYPE_TX_QUEUE) { 205 const struct mlx5_txq_ctrl *ctrl = priv->verbs_alloc_ctx.obj; 206 207 socket = ctrl->socket; 208 } else if (priv->verbs_alloc_ctx.type == 209 MLX5_VERBS_ALLOC_TYPE_RX_QUEUE) { 210 const struct mlx5_rxq_ctrl *ctrl = priv->verbs_alloc_ctx.obj; 211 212 socket = ctrl->socket; 213 } 214 MLX5_ASSERT(data != NULL); 215 ret = rte_malloc_socket(__func__, size, alignment, socket); 216 if (!ret && size) 217 rte_errno = ENOMEM; 218 return ret; 219 } 220 221 /** 222 * Verbs callback to free a memory. 223 * 224 * @param[in] ptr 225 * A pointer to the memory to free. 226 * @param[in] data 227 * A pointer to the callback data. 228 */ 229 static void 230 mlx5_free_verbs_buf(void *ptr, void *data __rte_unused) 231 { 232 MLX5_ASSERT(data != NULL); 233 rte_free(ptr); 234 } 235 236 /** 237 * Initialize DR related data within private structure. 238 * Routine checks the reference counter and does actual 239 * resources creation/initialization only if counter is zero. 240 * 241 * @param[in] priv 242 * Pointer to the private device data structure. 243 * 244 * @return 245 * Zero on success, positive error code otherwise. 246 */ 247 static int 248 mlx5_alloc_shared_dr(struct mlx5_priv *priv) 249 { 250 struct mlx5_dev_ctx_shared *sh = priv->sh; 251 char s[MLX5_HLIST_NAMESIZE]; 252 int err = 0; 253 254 if (!sh->flow_tbls) 255 err = mlx5_alloc_table_hash_list(priv); 256 else 257 DRV_LOG(DEBUG, "sh->flow_tbls[%p] already created, reuse\n", 258 (void *)sh->flow_tbls); 259 if (err) 260 return err; 261 /* Create tags hash list table. */ 262 snprintf(s, sizeof(s), "%s_tags", sh->ibdev_name); 263 sh->tag_table = mlx5_hlist_create(s, MLX5_TAGS_HLIST_ARRAY_SIZE); 264 if (!sh->tag_table) { 265 DRV_LOG(ERR, "tags with hash creation failed.\n"); 266 err = ENOMEM; 267 goto error; 268 } 269 #ifdef HAVE_MLX5DV_DR 270 void *domain; 271 272 if (sh->dv_refcnt) { 273 /* Shared DV/DR structures is already initialized. */ 274 sh->dv_refcnt++; 275 priv->dr_shared = 1; 276 return 0; 277 } 278 /* Reference counter is zero, we should initialize structures. */ 279 domain = mlx5_glue->dr_create_domain(sh->ctx, 280 MLX5DV_DR_DOMAIN_TYPE_NIC_RX); 281 if (!domain) { 282 DRV_LOG(ERR, "ingress mlx5dv_dr_create_domain failed"); 283 err = errno; 284 goto error; 285 } 286 sh->rx_domain = domain; 287 domain = mlx5_glue->dr_create_domain(sh->ctx, 288 MLX5DV_DR_DOMAIN_TYPE_NIC_TX); 289 if (!domain) { 290 DRV_LOG(ERR, "egress mlx5dv_dr_create_domain failed"); 291 err = errno; 292 goto error; 293 } 294 pthread_mutex_init(&sh->dv_mutex, NULL); 295 sh->tx_domain = domain; 296 #ifdef HAVE_MLX5DV_DR_ESWITCH 297 if (priv->config.dv_esw_en) { 298 domain = mlx5_glue->dr_create_domain 299 (sh->ctx, MLX5DV_DR_DOMAIN_TYPE_FDB); 300 if (!domain) { 301 DRV_LOG(ERR, "FDB mlx5dv_dr_create_domain failed"); 302 err = errno; 303 goto error; 304 } 305 sh->fdb_domain = domain; 306 sh->esw_drop_action = mlx5_glue->dr_create_flow_action_drop(); 307 } 308 #endif 309 if (priv->config.reclaim_mode == MLX5_RCM_AGGR) { 310 mlx5_glue->dr_reclaim_domain_memory(sh->rx_domain, 1); 311 mlx5_glue->dr_reclaim_domain_memory(sh->tx_domain, 1); 312 if (sh->fdb_domain) 313 mlx5_glue->dr_reclaim_domain_memory(sh->fdb_domain, 1); 314 } 315 sh->pop_vlan_action = mlx5_glue->dr_create_flow_action_pop_vlan(); 316 #endif /* HAVE_MLX5DV_DR */ 317 sh->dv_refcnt++; 318 priv->dr_shared = 1; 319 return 0; 320 error: 321 /* Rollback the created objects. */ 322 if (sh->rx_domain) { 323 mlx5_glue->dr_destroy_domain(sh->rx_domain); 324 sh->rx_domain = NULL; 325 } 326 if (sh->tx_domain) { 327 mlx5_glue->dr_destroy_domain(sh->tx_domain); 328 sh->tx_domain = NULL; 329 } 330 if (sh->fdb_domain) { 331 mlx5_glue->dr_destroy_domain(sh->fdb_domain); 332 sh->fdb_domain = NULL; 333 } 334 if (sh->esw_drop_action) { 335 mlx5_glue->destroy_flow_action(sh->esw_drop_action); 336 sh->esw_drop_action = NULL; 337 } 338 if (sh->pop_vlan_action) { 339 mlx5_glue->destroy_flow_action(sh->pop_vlan_action); 340 sh->pop_vlan_action = NULL; 341 } 342 if (sh->tag_table) { 343 /* tags should be destroyed with flow before. */ 344 mlx5_hlist_destroy(sh->tag_table, NULL, NULL); 345 sh->tag_table = NULL; 346 } 347 mlx5_free_table_hash_list(priv); 348 return err; 349 } 350 351 /** 352 * Destroy DR related data within private structure. 353 * 354 * @param[in] priv 355 * Pointer to the private device data structure. 356 */ 357 void 358 mlx5_os_free_shared_dr(struct mlx5_priv *priv) 359 { 360 struct mlx5_dev_ctx_shared *sh; 361 362 if (!priv->dr_shared) 363 return; 364 priv->dr_shared = 0; 365 sh = priv->sh; 366 MLX5_ASSERT(sh); 367 #ifdef HAVE_MLX5DV_DR 368 MLX5_ASSERT(sh->dv_refcnt); 369 if (sh->dv_refcnt && --sh->dv_refcnt) 370 return; 371 if (sh->rx_domain) { 372 mlx5_glue->dr_destroy_domain(sh->rx_domain); 373 sh->rx_domain = NULL; 374 } 375 if (sh->tx_domain) { 376 mlx5_glue->dr_destroy_domain(sh->tx_domain); 377 sh->tx_domain = NULL; 378 } 379 #ifdef HAVE_MLX5DV_DR_ESWITCH 380 if (sh->fdb_domain) { 381 mlx5_glue->dr_destroy_domain(sh->fdb_domain); 382 sh->fdb_domain = NULL; 383 } 384 if (sh->esw_drop_action) { 385 mlx5_glue->destroy_flow_action(sh->esw_drop_action); 386 sh->esw_drop_action = NULL; 387 } 388 #endif 389 if (sh->pop_vlan_action) { 390 mlx5_glue->destroy_flow_action(sh->pop_vlan_action); 391 sh->pop_vlan_action = NULL; 392 } 393 pthread_mutex_destroy(&sh->dv_mutex); 394 #endif /* HAVE_MLX5DV_DR */ 395 if (sh->tag_table) { 396 /* tags should be destroyed with flow before. */ 397 mlx5_hlist_destroy(sh->tag_table, NULL, NULL); 398 sh->tag_table = NULL; 399 } 400 mlx5_free_table_hash_list(priv); 401 } 402 403 /** 404 * Spawn an Ethernet device from Verbs information. 405 * 406 * @param dpdk_dev 407 * Backing DPDK device. 408 * @param spawn 409 * Verbs device parameters (name, port, switch_info) to spawn. 410 * @param config 411 * Device configuration parameters. 412 * 413 * @return 414 * A valid Ethernet device object on success, NULL otherwise and rte_errno 415 * is set. The following errors are defined: 416 * 417 * EBUSY: device is not supposed to be spawned. 418 * EEXIST: device is already spawned 419 */ 420 static struct rte_eth_dev * 421 mlx5_dev_spawn(struct rte_device *dpdk_dev, 422 struct mlx5_dev_spawn_data *spawn, 423 struct mlx5_dev_config config) 424 { 425 const struct mlx5_switch_info *switch_info = &spawn->info; 426 struct mlx5_dev_ctx_shared *sh = NULL; 427 struct ibv_port_attr port_attr; 428 struct mlx5dv_context dv_attr = { .comp_mask = 0 }; 429 struct rte_eth_dev *eth_dev = NULL; 430 struct mlx5_priv *priv = NULL; 431 int err = 0; 432 unsigned int hw_padding = 0; 433 unsigned int mps; 434 unsigned int cqe_comp; 435 unsigned int cqe_pad = 0; 436 unsigned int tunnel_en = 0; 437 unsigned int mpls_en = 0; 438 unsigned int swp = 0; 439 unsigned int mprq = 0; 440 unsigned int mprq_min_stride_size_n = 0; 441 unsigned int mprq_max_stride_size_n = 0; 442 unsigned int mprq_min_stride_num_n = 0; 443 unsigned int mprq_max_stride_num_n = 0; 444 struct rte_ether_addr mac; 445 char name[RTE_ETH_NAME_MAX_LEN]; 446 int own_domain_id = 0; 447 uint16_t port_id; 448 unsigned int i; 449 #ifdef HAVE_MLX5DV_DR_DEVX_PORT 450 struct mlx5dv_devx_port devx_port = { .comp_mask = 0 }; 451 #endif 452 453 /* Determine if this port representor is supposed to be spawned. */ 454 if (switch_info->representor && dpdk_dev->devargs) { 455 struct rte_eth_devargs eth_da; 456 457 err = rte_eth_devargs_parse(dpdk_dev->devargs->args, ð_da); 458 if (err) { 459 rte_errno = -err; 460 DRV_LOG(ERR, "failed to process device arguments: %s", 461 strerror(rte_errno)); 462 return NULL; 463 } 464 for (i = 0; i < eth_da.nb_representor_ports; ++i) 465 if (eth_da.representor_ports[i] == 466 (uint16_t)switch_info->port_name) 467 break; 468 if (i == eth_da.nb_representor_ports) { 469 rte_errno = EBUSY; 470 return NULL; 471 } 472 } 473 /* Build device name. */ 474 if (spawn->pf_bond < 0) { 475 /* Single device. */ 476 if (!switch_info->representor) 477 strlcpy(name, dpdk_dev->name, sizeof(name)); 478 else 479 snprintf(name, sizeof(name), "%s_representor_%u", 480 dpdk_dev->name, switch_info->port_name); 481 } else { 482 /* Bonding device. */ 483 if (!switch_info->representor) 484 snprintf(name, sizeof(name), "%s_%s", 485 dpdk_dev->name, spawn->ibv_dev->name); 486 else 487 snprintf(name, sizeof(name), "%s_%s_representor_%u", 488 dpdk_dev->name, spawn->ibv_dev->name, 489 switch_info->port_name); 490 } 491 /* check if the device is already spawned */ 492 if (rte_eth_dev_get_port_by_name(name, &port_id) == 0) { 493 rte_errno = EEXIST; 494 return NULL; 495 } 496 DRV_LOG(DEBUG, "naming Ethernet device \"%s\"", name); 497 if (rte_eal_process_type() == RTE_PROC_SECONDARY) { 498 struct mlx5_mp_id mp_id; 499 500 eth_dev = rte_eth_dev_attach_secondary(name); 501 if (eth_dev == NULL) { 502 DRV_LOG(ERR, "can not attach rte ethdev"); 503 rte_errno = ENOMEM; 504 return NULL; 505 } 506 eth_dev->device = dpdk_dev; 507 eth_dev->dev_ops = &mlx5_dev_sec_ops; 508 err = mlx5_proc_priv_init(eth_dev); 509 if (err) 510 return NULL; 511 mp_id.port_id = eth_dev->data->port_id; 512 strlcpy(mp_id.name, MLX5_MP_NAME, RTE_MP_MAX_NAME_LEN); 513 /* Receive command fd from primary process */ 514 err = mlx5_mp_req_verbs_cmd_fd(&mp_id); 515 if (err < 0) 516 goto err_secondary; 517 /* Remap UAR for Tx queues. */ 518 err = mlx5_tx_uar_init_secondary(eth_dev, err); 519 if (err) 520 goto err_secondary; 521 /* 522 * Ethdev pointer is still required as input since 523 * the primary device is not accessible from the 524 * secondary process. 525 */ 526 eth_dev->rx_pkt_burst = mlx5_select_rx_function(eth_dev); 527 eth_dev->tx_pkt_burst = mlx5_select_tx_function(eth_dev); 528 return eth_dev; 529 err_secondary: 530 mlx5_dev_close(eth_dev); 531 return NULL; 532 } 533 /* 534 * Some parameters ("tx_db_nc" in particularly) are needed in 535 * advance to create dv/verbs device context. We proceed the 536 * devargs here to get ones, and later proceed devargs again 537 * to override some hardware settings. 538 */ 539 err = mlx5_args(&config, dpdk_dev->devargs); 540 if (err) { 541 err = rte_errno; 542 DRV_LOG(ERR, "failed to process device arguments: %s", 543 strerror(rte_errno)); 544 goto error; 545 } 546 sh = mlx5_alloc_shared_ibctx(spawn, &config); 547 if (!sh) 548 return NULL; 549 config.devx = sh->devx; 550 #ifdef HAVE_MLX5DV_DR_ACTION_DEST_DEVX_TIR 551 config.dest_tir = 1; 552 #endif 553 #ifdef HAVE_IBV_MLX5_MOD_SWP 554 dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_SWP; 555 #endif 556 /* 557 * Multi-packet send is supported by ConnectX-4 Lx PF as well 558 * as all ConnectX-5 devices. 559 */ 560 #ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT 561 dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS; 562 #endif 563 #ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT 564 dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_STRIDING_RQ; 565 #endif 566 mlx5_glue->dv_query_device(sh->ctx, &dv_attr); 567 if (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED) { 568 if (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW) { 569 DRV_LOG(DEBUG, "enhanced MPW is supported"); 570 mps = MLX5_MPW_ENHANCED; 571 } else { 572 DRV_LOG(DEBUG, "MPW is supported"); 573 mps = MLX5_MPW; 574 } 575 } else { 576 DRV_LOG(DEBUG, "MPW isn't supported"); 577 mps = MLX5_MPW_DISABLED; 578 } 579 #ifdef HAVE_IBV_MLX5_MOD_SWP 580 if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_SWP) 581 swp = dv_attr.sw_parsing_caps.sw_parsing_offloads; 582 DRV_LOG(DEBUG, "SWP support: %u", swp); 583 #endif 584 config.swp = !!swp; 585 #ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT 586 if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_STRIDING_RQ) { 587 struct mlx5dv_striding_rq_caps mprq_caps = 588 dv_attr.striding_rq_caps; 589 590 DRV_LOG(DEBUG, "\tmin_single_stride_log_num_of_bytes: %d", 591 mprq_caps.min_single_stride_log_num_of_bytes); 592 DRV_LOG(DEBUG, "\tmax_single_stride_log_num_of_bytes: %d", 593 mprq_caps.max_single_stride_log_num_of_bytes); 594 DRV_LOG(DEBUG, "\tmin_single_wqe_log_num_of_strides: %d", 595 mprq_caps.min_single_wqe_log_num_of_strides); 596 DRV_LOG(DEBUG, "\tmax_single_wqe_log_num_of_strides: %d", 597 mprq_caps.max_single_wqe_log_num_of_strides); 598 DRV_LOG(DEBUG, "\tsupported_qpts: %d", 599 mprq_caps.supported_qpts); 600 DRV_LOG(DEBUG, "device supports Multi-Packet RQ"); 601 mprq = 1; 602 mprq_min_stride_size_n = 603 mprq_caps.min_single_stride_log_num_of_bytes; 604 mprq_max_stride_size_n = 605 mprq_caps.max_single_stride_log_num_of_bytes; 606 mprq_min_stride_num_n = 607 mprq_caps.min_single_wqe_log_num_of_strides; 608 mprq_max_stride_num_n = 609 mprq_caps.max_single_wqe_log_num_of_strides; 610 } 611 #endif 612 if (RTE_CACHE_LINE_SIZE == 128 && 613 !(dv_attr.flags & MLX5DV_CONTEXT_FLAGS_CQE_128B_COMP)) 614 cqe_comp = 0; 615 else 616 cqe_comp = 1; 617 config.cqe_comp = cqe_comp; 618 #ifdef HAVE_IBV_MLX5_MOD_CQE_128B_PAD 619 /* Whether device supports 128B Rx CQE padding. */ 620 cqe_pad = RTE_CACHE_LINE_SIZE == 128 && 621 (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_CQE_128B_PAD); 622 #endif 623 #ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT 624 if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS) { 625 tunnel_en = ((dv_attr.tunnel_offloads_caps & 626 MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_VXLAN) && 627 (dv_attr.tunnel_offloads_caps & 628 MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_GRE) && 629 (dv_attr.tunnel_offloads_caps & 630 MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_GENEVE)); 631 } 632 DRV_LOG(DEBUG, "tunnel offloading is %ssupported", 633 tunnel_en ? "" : "not "); 634 #else 635 DRV_LOG(WARNING, 636 "tunnel offloading disabled due to old OFED/rdma-core version"); 637 #endif 638 config.tunnel_en = tunnel_en; 639 #ifdef HAVE_IBV_DEVICE_MPLS_SUPPORT 640 mpls_en = ((dv_attr.tunnel_offloads_caps & 641 MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_CW_MPLS_OVER_GRE) && 642 (dv_attr.tunnel_offloads_caps & 643 MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_CW_MPLS_OVER_UDP)); 644 DRV_LOG(DEBUG, "MPLS over GRE/UDP tunnel offloading is %ssupported", 645 mpls_en ? "" : "not "); 646 #else 647 DRV_LOG(WARNING, "MPLS over GRE/UDP tunnel offloading disabled due to" 648 " old OFED/rdma-core version or firmware configuration"); 649 #endif 650 config.mpls_en = mpls_en; 651 /* Check port status. */ 652 err = mlx5_glue->query_port(sh->ctx, spawn->ibv_port, &port_attr); 653 if (err) { 654 DRV_LOG(ERR, "port query failed: %s", strerror(err)); 655 goto error; 656 } 657 if (port_attr.link_layer != IBV_LINK_LAYER_ETHERNET) { 658 DRV_LOG(ERR, "port is not configured in Ethernet mode"); 659 err = EINVAL; 660 goto error; 661 } 662 if (port_attr.state != IBV_PORT_ACTIVE) 663 DRV_LOG(DEBUG, "port is not active: \"%s\" (%d)", 664 mlx5_glue->port_state_str(port_attr.state), 665 port_attr.state); 666 /* Allocate private eth device data. */ 667 priv = rte_zmalloc("ethdev private structure", 668 sizeof(*priv), 669 RTE_CACHE_LINE_SIZE); 670 if (priv == NULL) { 671 DRV_LOG(ERR, "priv allocation failure"); 672 err = ENOMEM; 673 goto error; 674 } 675 priv->sh = sh; 676 priv->ibv_port = spawn->ibv_port; 677 priv->pci_dev = spawn->pci_dev; 678 priv->mtu = RTE_ETHER_MTU; 679 priv->mp_id.port_id = port_id; 680 strlcpy(priv->mp_id.name, MLX5_MP_NAME, RTE_MP_MAX_NAME_LEN); 681 #ifndef RTE_ARCH_64 682 /* Initialize UAR access locks for 32bit implementations. */ 683 rte_spinlock_init(&priv->uar_lock_cq); 684 for (i = 0; i < MLX5_UAR_PAGE_NUM_MAX; i++) 685 rte_spinlock_init(&priv->uar_lock[i]); 686 #endif 687 /* Some internal functions rely on Netlink sockets, open them now. */ 688 priv->nl_socket_rdma = mlx5_nl_init(NETLINK_RDMA); 689 priv->nl_socket_route = mlx5_nl_init(NETLINK_ROUTE); 690 priv->representor = !!switch_info->representor; 691 priv->master = !!switch_info->master; 692 priv->domain_id = RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID; 693 priv->vport_meta_tag = 0; 694 priv->vport_meta_mask = 0; 695 priv->pf_bond = spawn->pf_bond; 696 #ifdef HAVE_MLX5DV_DR_DEVX_PORT 697 /* 698 * The DevX port query API is implemented. E-Switch may use 699 * either vport or reg_c[0] metadata register to match on 700 * vport index. The engaged part of metadata register is 701 * defined by mask. 702 */ 703 if (switch_info->representor || switch_info->master) { 704 devx_port.comp_mask = MLX5DV_DEVX_PORT_VPORT | 705 MLX5DV_DEVX_PORT_MATCH_REG_C_0; 706 err = mlx5_glue->devx_port_query(sh->ctx, spawn->ibv_port, 707 &devx_port); 708 if (err) { 709 DRV_LOG(WARNING, 710 "can't query devx port %d on device %s", 711 spawn->ibv_port, spawn->ibv_dev->name); 712 devx_port.comp_mask = 0; 713 } 714 } 715 if (devx_port.comp_mask & MLX5DV_DEVX_PORT_MATCH_REG_C_0) { 716 priv->vport_meta_tag = devx_port.reg_c_0.value; 717 priv->vport_meta_mask = devx_port.reg_c_0.mask; 718 if (!priv->vport_meta_mask) { 719 DRV_LOG(ERR, "vport zero mask for port %d" 720 " on bonding device %s", 721 spawn->ibv_port, spawn->ibv_dev->name); 722 err = ENOTSUP; 723 goto error; 724 } 725 if (priv->vport_meta_tag & ~priv->vport_meta_mask) { 726 DRV_LOG(ERR, "invalid vport tag for port %d" 727 " on bonding device %s", 728 spawn->ibv_port, spawn->ibv_dev->name); 729 err = ENOTSUP; 730 goto error; 731 } 732 } 733 if (devx_port.comp_mask & MLX5DV_DEVX_PORT_VPORT) { 734 priv->vport_id = devx_port.vport_num; 735 } else if (spawn->pf_bond >= 0) { 736 DRV_LOG(ERR, "can't deduce vport index for port %d" 737 " on bonding device %s", 738 spawn->ibv_port, spawn->ibv_dev->name); 739 err = ENOTSUP; 740 goto error; 741 } else { 742 /* Suppose vport index in compatible way. */ 743 priv->vport_id = switch_info->representor ? 744 switch_info->port_name + 1 : -1; 745 } 746 #else 747 /* 748 * Kernel/rdma_core support single E-Switch per PF configurations 749 * only and vport_id field contains the vport index for 750 * associated VF, which is deduced from representor port name. 751 * For example, let's have the IB device port 10, it has 752 * attached network device eth0, which has port name attribute 753 * pf0vf2, we can deduce the VF number as 2, and set vport index 754 * as 3 (2+1). This assigning schema should be changed if the 755 * multiple E-Switch instances per PF configurations or/and PCI 756 * subfunctions are added. 757 */ 758 priv->vport_id = switch_info->representor ? 759 switch_info->port_name + 1 : -1; 760 #endif 761 /* representor_id field keeps the unmodified VF index. */ 762 priv->representor_id = switch_info->representor ? 763 switch_info->port_name : -1; 764 /* 765 * Look for sibling devices in order to reuse their switch domain 766 * if any, otherwise allocate one. 767 */ 768 MLX5_ETH_FOREACH_DEV(port_id, priv->pci_dev) { 769 const struct mlx5_priv *opriv = 770 rte_eth_devices[port_id].data->dev_private; 771 772 if (!opriv || 773 opriv->sh != priv->sh || 774 opriv->domain_id == 775 RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID) 776 continue; 777 priv->domain_id = opriv->domain_id; 778 break; 779 } 780 if (priv->domain_id == RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID) { 781 err = rte_eth_switch_domain_alloc(&priv->domain_id); 782 if (err) { 783 err = rte_errno; 784 DRV_LOG(ERR, "unable to allocate switch domain: %s", 785 strerror(rte_errno)); 786 goto error; 787 } 788 own_domain_id = 1; 789 } 790 /* Override some values set by hardware configuration. */ 791 mlx5_args(&config, dpdk_dev->devargs); 792 err = mlx5_dev_check_sibling_config(priv, &config); 793 if (err) 794 goto error; 795 config.hw_csum = !!(sh->device_attr.device_cap_flags_ex & 796 IBV_DEVICE_RAW_IP_CSUM); 797 DRV_LOG(DEBUG, "checksum offloading is %ssupported", 798 (config.hw_csum ? "" : "not ")); 799 #if !defined(HAVE_IBV_DEVICE_COUNTERS_SET_V42) && \ 800 !defined(HAVE_IBV_DEVICE_COUNTERS_SET_V45) 801 DRV_LOG(DEBUG, "counters are not supported"); 802 #endif 803 #if !defined(HAVE_IBV_FLOW_DV_SUPPORT) || !defined(HAVE_MLX5DV_DR) 804 if (config.dv_flow_en) { 805 DRV_LOG(WARNING, "DV flow is not supported"); 806 config.dv_flow_en = 0; 807 } 808 #endif 809 config.ind_table_max_size = 810 sh->device_attr.max_rwq_indirection_table_size; 811 /* 812 * Remove this check once DPDK supports larger/variable 813 * indirection tables. 814 */ 815 if (config.ind_table_max_size > (unsigned int)ETH_RSS_RETA_SIZE_512) 816 config.ind_table_max_size = ETH_RSS_RETA_SIZE_512; 817 DRV_LOG(DEBUG, "maximum Rx indirection table size is %u", 818 config.ind_table_max_size); 819 config.hw_vlan_strip = !!(sh->device_attr.raw_packet_caps & 820 IBV_RAW_PACKET_CAP_CVLAN_STRIPPING); 821 DRV_LOG(DEBUG, "VLAN stripping is %ssupported", 822 (config.hw_vlan_strip ? "" : "not ")); 823 config.hw_fcs_strip = !!(sh->device_attr.raw_packet_caps & 824 IBV_RAW_PACKET_CAP_SCATTER_FCS); 825 DRV_LOG(DEBUG, "FCS stripping configuration is %ssupported", 826 (config.hw_fcs_strip ? "" : "not ")); 827 #if defined(HAVE_IBV_WQ_FLAG_RX_END_PADDING) 828 hw_padding = !!sh->device_attr.rx_pad_end_addr_align; 829 #elif defined(HAVE_IBV_WQ_FLAGS_PCI_WRITE_END_PADDING) 830 hw_padding = !!(sh->device_attr.device_cap_flags_ex & 831 IBV_DEVICE_PCI_WRITE_END_PADDING); 832 #endif 833 if (config.hw_padding && !hw_padding) { 834 DRV_LOG(DEBUG, "Rx end alignment padding isn't supported"); 835 config.hw_padding = 0; 836 } else if (config.hw_padding) { 837 DRV_LOG(DEBUG, "Rx end alignment padding is enabled"); 838 } 839 config.tso = (sh->device_attr.max_tso > 0 && 840 (sh->device_attr.tso_supported_qpts & 841 (1 << IBV_QPT_RAW_PACKET))); 842 if (config.tso) 843 config.tso_max_payload_sz = sh->device_attr.max_tso; 844 /* 845 * MPW is disabled by default, while the Enhanced MPW is enabled 846 * by default. 847 */ 848 if (config.mps == MLX5_ARG_UNSET) 849 config.mps = (mps == MLX5_MPW_ENHANCED) ? MLX5_MPW_ENHANCED : 850 MLX5_MPW_DISABLED; 851 else 852 config.mps = config.mps ? mps : MLX5_MPW_DISABLED; 853 DRV_LOG(INFO, "%sMPS is %s", 854 config.mps == MLX5_MPW_ENHANCED ? "enhanced " : 855 config.mps == MLX5_MPW ? "legacy " : "", 856 config.mps != MLX5_MPW_DISABLED ? "enabled" : "disabled"); 857 if (config.cqe_comp && !cqe_comp) { 858 DRV_LOG(WARNING, "Rx CQE compression isn't supported"); 859 config.cqe_comp = 0; 860 } 861 if (config.cqe_pad && !cqe_pad) { 862 DRV_LOG(WARNING, "Rx CQE padding isn't supported"); 863 config.cqe_pad = 0; 864 } else if (config.cqe_pad) { 865 DRV_LOG(INFO, "Rx CQE padding is enabled"); 866 } 867 if (config.devx) { 868 priv->counter_fallback = 0; 869 err = mlx5_devx_cmd_query_hca_attr(sh->ctx, &config.hca_attr); 870 if (err) { 871 err = -err; 872 goto error; 873 } 874 if (!config.hca_attr.flow_counters_dump) 875 priv->counter_fallback = 1; 876 #ifndef HAVE_IBV_DEVX_ASYNC 877 priv->counter_fallback = 1; 878 #endif 879 if (priv->counter_fallback) 880 DRV_LOG(INFO, "Use fall-back DV counter management"); 881 /* Check for LRO support. */ 882 if (config.dest_tir && config.hca_attr.lro_cap && 883 config.dv_flow_en) { 884 /* TBD check tunnel lro caps. */ 885 config.lro.supported = config.hca_attr.lro_cap; 886 DRV_LOG(DEBUG, "Device supports LRO"); 887 /* 888 * If LRO timeout is not configured by application, 889 * use the minimal supported value. 890 */ 891 if (!config.lro.timeout) 892 config.lro.timeout = 893 config.hca_attr.lro_timer_supported_periods[0]; 894 DRV_LOG(DEBUG, "LRO session timeout set to %d usec", 895 config.lro.timeout); 896 } 897 #if defined(HAVE_MLX5DV_DR) && defined(HAVE_MLX5_DR_CREATE_ACTION_FLOW_METER) 898 if (config.hca_attr.qos.sup && config.hca_attr.qos.srtcm_sup && 899 config.dv_flow_en) { 900 uint8_t reg_c_mask = 901 config.hca_attr.qos.flow_meter_reg_c_ids; 902 /* 903 * Meter needs two REG_C's for color match and pre-sfx 904 * flow match. Here get the REG_C for color match. 905 * REG_C_0 and REG_C_1 is reserved for metadata feature. 906 */ 907 reg_c_mask &= 0xfc; 908 if (__builtin_popcount(reg_c_mask) < 1) { 909 priv->mtr_en = 0; 910 DRV_LOG(WARNING, "No available register for" 911 " meter."); 912 } else { 913 priv->mtr_color_reg = ffs(reg_c_mask) - 1 + 914 REG_C_0; 915 priv->mtr_en = 1; 916 priv->mtr_reg_share = 917 config.hca_attr.qos.flow_meter_reg_share; 918 DRV_LOG(DEBUG, "The REG_C meter uses is %d", 919 priv->mtr_color_reg); 920 } 921 } 922 #endif 923 } 924 if (config.mprq.enabled && mprq) { 925 if (config.mprq.stride_num_n && 926 (config.mprq.stride_num_n > mprq_max_stride_num_n || 927 config.mprq.stride_num_n < mprq_min_stride_num_n)) { 928 config.mprq.stride_num_n = 929 RTE_MIN(RTE_MAX(MLX5_MPRQ_STRIDE_NUM_N, 930 mprq_min_stride_num_n), 931 mprq_max_stride_num_n); 932 DRV_LOG(WARNING, 933 "the number of strides" 934 " for Multi-Packet RQ is out of range," 935 " setting default value (%u)", 936 1 << config.mprq.stride_num_n); 937 } 938 if (config.mprq.stride_size_n && 939 (config.mprq.stride_size_n > mprq_max_stride_size_n || 940 config.mprq.stride_size_n < mprq_min_stride_size_n)) { 941 config.mprq.stride_size_n = 942 RTE_MIN(RTE_MAX(MLX5_MPRQ_STRIDE_SIZE_N, 943 mprq_min_stride_size_n), 944 mprq_max_stride_size_n); 945 DRV_LOG(WARNING, 946 "the size of a stride" 947 " for Multi-Packet RQ is out of range," 948 " setting default value (%u)", 949 1 << config.mprq.stride_size_n); 950 } 951 config.mprq.min_stride_size_n = mprq_min_stride_size_n; 952 config.mprq.max_stride_size_n = mprq_max_stride_size_n; 953 } else if (config.mprq.enabled && !mprq) { 954 DRV_LOG(WARNING, "Multi-Packet RQ isn't supported"); 955 config.mprq.enabled = 0; 956 } 957 if (config.max_dump_files_num == 0) 958 config.max_dump_files_num = 128; 959 eth_dev = rte_eth_dev_allocate(name); 960 if (eth_dev == NULL) { 961 DRV_LOG(ERR, "can not allocate rte ethdev"); 962 err = ENOMEM; 963 goto error; 964 } 965 /* Flag to call rte_eth_dev_release_port() in rte_eth_dev_close(). */ 966 eth_dev->data->dev_flags |= RTE_ETH_DEV_CLOSE_REMOVE; 967 if (priv->representor) { 968 eth_dev->data->dev_flags |= RTE_ETH_DEV_REPRESENTOR; 969 eth_dev->data->representor_id = priv->representor_id; 970 } 971 /* 972 * Store associated network device interface index. This index 973 * is permanent throughout the lifetime of device. So, we may store 974 * the ifindex here and use the cached value further. 975 */ 976 MLX5_ASSERT(spawn->ifindex); 977 priv->if_index = spawn->ifindex; 978 eth_dev->data->dev_private = priv; 979 priv->dev_data = eth_dev->data; 980 eth_dev->data->mac_addrs = priv->mac; 981 eth_dev->device = dpdk_dev; 982 /* Configure the first MAC address by default. */ 983 if (mlx5_get_mac(eth_dev, &mac.addr_bytes)) { 984 DRV_LOG(ERR, 985 "port %u cannot get MAC address, is mlx5_en" 986 " loaded? (errno: %s)", 987 eth_dev->data->port_id, strerror(rte_errno)); 988 err = ENODEV; 989 goto error; 990 } 991 DRV_LOG(INFO, 992 "port %u MAC address is %02x:%02x:%02x:%02x:%02x:%02x", 993 eth_dev->data->port_id, 994 mac.addr_bytes[0], mac.addr_bytes[1], 995 mac.addr_bytes[2], mac.addr_bytes[3], 996 mac.addr_bytes[4], mac.addr_bytes[5]); 997 #ifdef RTE_LIBRTE_MLX5_DEBUG 998 { 999 char ifname[IF_NAMESIZE]; 1000 1001 if (mlx5_get_ifname(eth_dev, &ifname) == 0) 1002 DRV_LOG(DEBUG, "port %u ifname is \"%s\"", 1003 eth_dev->data->port_id, ifname); 1004 else 1005 DRV_LOG(DEBUG, "port %u ifname is unknown", 1006 eth_dev->data->port_id); 1007 } 1008 #endif 1009 /* Get actual MTU if possible. */ 1010 err = mlx5_get_mtu(eth_dev, &priv->mtu); 1011 if (err) { 1012 err = rte_errno; 1013 goto error; 1014 } 1015 DRV_LOG(DEBUG, "port %u MTU is %u", eth_dev->data->port_id, 1016 priv->mtu); 1017 /* Initialize burst functions to prevent crashes before link-up. */ 1018 eth_dev->rx_pkt_burst = removed_rx_burst; 1019 eth_dev->tx_pkt_burst = removed_tx_burst; 1020 eth_dev->dev_ops = &mlx5_dev_ops; 1021 /* Register MAC address. */ 1022 claim_zero(mlx5_mac_addr_add(eth_dev, &mac, 0, 0)); 1023 if (config.vf && config.vf_nl_en) 1024 mlx5_nl_mac_addr_sync(priv->nl_socket_route, 1025 mlx5_ifindex(eth_dev), 1026 eth_dev->data->mac_addrs, 1027 MLX5_MAX_MAC_ADDRESSES); 1028 priv->flows = 0; 1029 priv->ctrl_flows = 0; 1030 TAILQ_INIT(&priv->flow_meters); 1031 TAILQ_INIT(&priv->flow_meter_profiles); 1032 /* Hint libmlx5 to use PMD allocator for data plane resources */ 1033 struct mlx5dv_ctx_allocators alctr = { 1034 .alloc = &mlx5_alloc_verbs_buf, 1035 .free = &mlx5_free_verbs_buf, 1036 .data = priv, 1037 }; 1038 mlx5_glue->dv_set_context_attr(sh->ctx, 1039 MLX5DV_CTX_ATTR_BUF_ALLOCATORS, 1040 (void *)((uintptr_t)&alctr)); 1041 /* Bring Ethernet device up. */ 1042 DRV_LOG(DEBUG, "port %u forcing Ethernet interface up", 1043 eth_dev->data->port_id); 1044 mlx5_set_link_up(eth_dev); 1045 /* 1046 * Even though the interrupt handler is not installed yet, 1047 * interrupts will still trigger on the async_fd from 1048 * Verbs context returned by ibv_open_device(). 1049 */ 1050 mlx5_link_update(eth_dev, 0); 1051 #ifdef HAVE_MLX5DV_DR_ESWITCH 1052 if (!(config.hca_attr.eswitch_manager && config.dv_flow_en && 1053 (switch_info->representor || switch_info->master))) 1054 config.dv_esw_en = 0; 1055 #else 1056 config.dv_esw_en = 0; 1057 #endif 1058 /* Detect minimal data bytes to inline. */ 1059 mlx5_set_min_inline(spawn, &config); 1060 /* Store device configuration on private structure. */ 1061 priv->config = config; 1062 /* Create context for virtual machine VLAN workaround. */ 1063 priv->vmwa_context = mlx5_vlan_vmwa_init(eth_dev, spawn->ifindex); 1064 if (config.dv_flow_en) { 1065 err = mlx5_alloc_shared_dr(priv); 1066 if (err) 1067 goto error; 1068 /* 1069 * RSS id is shared with meter flow id. Meter flow id can only 1070 * use the 24 MSB of the register. 1071 */ 1072 priv->qrss_id_pool = mlx5_flow_id_pool_alloc(UINT32_MAX >> 1073 MLX5_MTR_COLOR_BITS); 1074 if (!priv->qrss_id_pool) { 1075 DRV_LOG(ERR, "can't create flow id pool"); 1076 err = ENOMEM; 1077 goto error; 1078 } 1079 } 1080 /* Supported Verbs flow priority number detection. */ 1081 err = mlx5_flow_discover_priorities(eth_dev); 1082 if (err < 0) { 1083 err = -err; 1084 goto error; 1085 } 1086 priv->config.flow_prio = err; 1087 if (!priv->config.dv_esw_en && 1088 priv->config.dv_xmeta_en != MLX5_XMETA_MODE_LEGACY) { 1089 DRV_LOG(WARNING, "metadata mode %u is not supported " 1090 "(no E-Switch)", priv->config.dv_xmeta_en); 1091 priv->config.dv_xmeta_en = MLX5_XMETA_MODE_LEGACY; 1092 } 1093 mlx5_set_metadata_mask(eth_dev); 1094 if (priv->config.dv_xmeta_en != MLX5_XMETA_MODE_LEGACY && 1095 !priv->sh->dv_regc0_mask) { 1096 DRV_LOG(ERR, "metadata mode %u is not supported " 1097 "(no metadata reg_c[0] is available)", 1098 priv->config.dv_xmeta_en); 1099 err = ENOTSUP; 1100 goto error; 1101 } 1102 /* 1103 * Allocate the buffer for flow creating, just once. 1104 * The allocation must be done before any flow creating. 1105 */ 1106 mlx5_flow_alloc_intermediate(eth_dev); 1107 /* Query availability of metadata reg_c's. */ 1108 err = mlx5_flow_discover_mreg_c(eth_dev); 1109 if (err < 0) { 1110 err = -err; 1111 goto error; 1112 } 1113 if (!mlx5_flow_ext_mreg_supported(eth_dev)) { 1114 DRV_LOG(DEBUG, 1115 "port %u extensive metadata register is not supported", 1116 eth_dev->data->port_id); 1117 if (priv->config.dv_xmeta_en != MLX5_XMETA_MODE_LEGACY) { 1118 DRV_LOG(ERR, "metadata mode %u is not supported " 1119 "(no metadata registers available)", 1120 priv->config.dv_xmeta_en); 1121 err = ENOTSUP; 1122 goto error; 1123 } 1124 } 1125 if (priv->config.dv_flow_en && 1126 priv->config.dv_xmeta_en != MLX5_XMETA_MODE_LEGACY && 1127 mlx5_flow_ext_mreg_supported(eth_dev) && 1128 priv->sh->dv_regc0_mask) { 1129 priv->mreg_cp_tbl = mlx5_hlist_create(MLX5_FLOW_MREG_HNAME, 1130 MLX5_FLOW_MREG_HTABLE_SZ); 1131 if (!priv->mreg_cp_tbl) { 1132 err = ENOMEM; 1133 goto error; 1134 } 1135 } 1136 return eth_dev; 1137 error: 1138 if (priv) { 1139 if (priv->mreg_cp_tbl) 1140 mlx5_hlist_destroy(priv->mreg_cp_tbl, NULL, NULL); 1141 if (priv->sh) 1142 mlx5_os_free_shared_dr(priv); 1143 if (priv->nl_socket_route >= 0) 1144 close(priv->nl_socket_route); 1145 if (priv->nl_socket_rdma >= 0) 1146 close(priv->nl_socket_rdma); 1147 if (priv->vmwa_context) 1148 mlx5_vlan_vmwa_exit(priv->vmwa_context); 1149 if (priv->qrss_id_pool) 1150 mlx5_flow_id_pool_release(priv->qrss_id_pool); 1151 if (own_domain_id) 1152 claim_zero(rte_eth_switch_domain_free(priv->domain_id)); 1153 rte_free(priv); 1154 if (eth_dev != NULL) 1155 eth_dev->data->dev_private = NULL; 1156 } 1157 if (eth_dev != NULL) { 1158 /* mac_addrs must not be freed alone because part of 1159 * dev_private 1160 **/ 1161 eth_dev->data->mac_addrs = NULL; 1162 rte_eth_dev_release_port(eth_dev); 1163 } 1164 if (sh) 1165 mlx5_free_shared_ibctx(sh); 1166 MLX5_ASSERT(err > 0); 1167 rte_errno = err; 1168 return NULL; 1169 } 1170 1171 /** 1172 * Comparison callback to sort device data. 1173 * 1174 * This is meant to be used with qsort(). 1175 * 1176 * @param a[in] 1177 * Pointer to pointer to first data object. 1178 * @param b[in] 1179 * Pointer to pointer to second data object. 1180 * 1181 * @return 1182 * 0 if both objects are equal, less than 0 if the first argument is less 1183 * than the second, greater than 0 otherwise. 1184 */ 1185 static int 1186 mlx5_dev_spawn_data_cmp(const void *a, const void *b) 1187 { 1188 const struct mlx5_switch_info *si_a = 1189 &((const struct mlx5_dev_spawn_data *)a)->info; 1190 const struct mlx5_switch_info *si_b = 1191 &((const struct mlx5_dev_spawn_data *)b)->info; 1192 int ret; 1193 1194 /* Master device first. */ 1195 ret = si_b->master - si_a->master; 1196 if (ret) 1197 return ret; 1198 /* Then representor devices. */ 1199 ret = si_b->representor - si_a->representor; 1200 if (ret) 1201 return ret; 1202 /* Unidentified devices come last in no specific order. */ 1203 if (!si_a->representor) 1204 return 0; 1205 /* Order representors by name. */ 1206 return si_a->port_name - si_b->port_name; 1207 } 1208 1209 /** 1210 * Match PCI information for possible slaves of bonding device. 1211 * 1212 * @param[in] ibv_dev 1213 * Pointer to Infiniband device structure. 1214 * @param[in] pci_dev 1215 * Pointer to PCI device structure to match PCI address. 1216 * @param[in] nl_rdma 1217 * Netlink RDMA group socket handle. 1218 * 1219 * @return 1220 * negative value if no bonding device found, otherwise 1221 * positive index of slave PF in bonding. 1222 */ 1223 static int 1224 mlx5_device_bond_pci_match(const struct ibv_device *ibv_dev, 1225 const struct rte_pci_device *pci_dev, 1226 int nl_rdma) 1227 { 1228 char ifname[IF_NAMESIZE + 1]; 1229 unsigned int ifindex; 1230 unsigned int np, i; 1231 FILE *file = NULL; 1232 int pf = -1; 1233 1234 /* 1235 * Try to get master device name. If something goes 1236 * wrong suppose the lack of kernel support and no 1237 * bonding devices. 1238 */ 1239 if (nl_rdma < 0) 1240 return -1; 1241 if (!strstr(ibv_dev->name, "bond")) 1242 return -1; 1243 np = mlx5_nl_portnum(nl_rdma, ibv_dev->name); 1244 if (!np) 1245 return -1; 1246 /* 1247 * The Master device might not be on the predefined 1248 * port (not on port index 1, it is not garanted), 1249 * we have to scan all Infiniband device port and 1250 * find master. 1251 */ 1252 for (i = 1; i <= np; ++i) { 1253 /* Check whether Infiniband port is populated. */ 1254 ifindex = mlx5_nl_ifindex(nl_rdma, ibv_dev->name, i); 1255 if (!ifindex) 1256 continue; 1257 if (!if_indextoname(ifindex, ifname)) 1258 continue; 1259 /* Try to read bonding slave names from sysfs. */ 1260 MKSTR(slaves, 1261 "/sys/class/net/%s/master/bonding/slaves", ifname); 1262 file = fopen(slaves, "r"); 1263 if (file) 1264 break; 1265 } 1266 if (!file) 1267 return -1; 1268 /* Use safe format to check maximal buffer length. */ 1269 MLX5_ASSERT(atol(RTE_STR(IF_NAMESIZE)) == IF_NAMESIZE); 1270 while (fscanf(file, "%" RTE_STR(IF_NAMESIZE) "s", ifname) == 1) { 1271 char tmp_str[IF_NAMESIZE + 32]; 1272 struct rte_pci_addr pci_addr; 1273 struct mlx5_switch_info info; 1274 1275 /* Process slave interface names in the loop. */ 1276 snprintf(tmp_str, sizeof(tmp_str), 1277 "/sys/class/net/%s", ifname); 1278 if (mlx5_dev_to_pci_addr(tmp_str, &pci_addr)) { 1279 DRV_LOG(WARNING, "can not get PCI address" 1280 " for netdev \"%s\"", ifname); 1281 continue; 1282 } 1283 if (pci_dev->addr.domain != pci_addr.domain || 1284 pci_dev->addr.bus != pci_addr.bus || 1285 pci_dev->addr.devid != pci_addr.devid || 1286 pci_dev->addr.function != pci_addr.function) 1287 continue; 1288 /* Slave interface PCI address match found. */ 1289 fclose(file); 1290 snprintf(tmp_str, sizeof(tmp_str), 1291 "/sys/class/net/%s/phys_port_name", ifname); 1292 file = fopen(tmp_str, "rb"); 1293 if (!file) 1294 break; 1295 info.name_type = MLX5_PHYS_PORT_NAME_TYPE_NOTSET; 1296 if (fscanf(file, "%32s", tmp_str) == 1) 1297 mlx5_translate_port_name(tmp_str, &info); 1298 if (info.name_type == MLX5_PHYS_PORT_NAME_TYPE_LEGACY || 1299 info.name_type == MLX5_PHYS_PORT_NAME_TYPE_UPLINK) 1300 pf = info.port_name; 1301 break; 1302 } 1303 if (file) 1304 fclose(file); 1305 return pf; 1306 } 1307 1308 /** 1309 * DPDK callback to register a PCI device. 1310 * 1311 * This function spawns Ethernet devices out of a given PCI device. 1312 * 1313 * @param[in] pci_drv 1314 * PCI driver structure (mlx5_driver). 1315 * @param[in] pci_dev 1316 * PCI device information. 1317 * 1318 * @return 1319 * 0 on success, a negative errno value otherwise and rte_errno is set. 1320 */ 1321 int 1322 mlx5_os_pci_probe(struct rte_pci_driver *pci_drv __rte_unused, 1323 struct rte_pci_device *pci_dev) 1324 { 1325 struct ibv_device **ibv_list; 1326 /* 1327 * Number of found IB Devices matching with requested PCI BDF. 1328 * nd != 1 means there are multiple IB devices over the same 1329 * PCI device and we have representors and master. 1330 */ 1331 unsigned int nd = 0; 1332 /* 1333 * Number of found IB device Ports. nd = 1 and np = 1..n means 1334 * we have the single multiport IB device, and there may be 1335 * representors attached to some of found ports. 1336 */ 1337 unsigned int np = 0; 1338 /* 1339 * Number of DPDK ethernet devices to Spawn - either over 1340 * multiple IB devices or multiple ports of single IB device. 1341 * Actually this is the number of iterations to spawn. 1342 */ 1343 unsigned int ns = 0; 1344 /* 1345 * Bonding device 1346 * < 0 - no bonding device (single one) 1347 * >= 0 - bonding device (value is slave PF index) 1348 */ 1349 int bd = -1; 1350 struct mlx5_dev_spawn_data *list = NULL; 1351 struct mlx5_dev_config dev_config; 1352 int ret; 1353 1354 if (mlx5_class_get(pci_dev->device.devargs) != MLX5_CLASS_NET) { 1355 DRV_LOG(DEBUG, "Skip probing - should be probed by other mlx5" 1356 " driver."); 1357 return 1; 1358 } 1359 if (rte_eal_process_type() == RTE_PROC_PRIMARY) 1360 mlx5_pmd_socket_init(); 1361 ret = mlx5_init_once(); 1362 if (ret) { 1363 DRV_LOG(ERR, "unable to init PMD global data: %s", 1364 strerror(rte_errno)); 1365 return -rte_errno; 1366 } 1367 MLX5_ASSERT(pci_drv == &mlx5_driver); 1368 errno = 0; 1369 ibv_list = mlx5_glue->get_device_list(&ret); 1370 if (!ibv_list) { 1371 rte_errno = errno ? errno : ENOSYS; 1372 DRV_LOG(ERR, "cannot list devices, is ib_uverbs loaded?"); 1373 return -rte_errno; 1374 } 1375 /* 1376 * First scan the list of all Infiniband devices to find 1377 * matching ones, gathering into the list. 1378 */ 1379 struct ibv_device *ibv_match[ret + 1]; 1380 int nl_route = mlx5_nl_init(NETLINK_ROUTE); 1381 int nl_rdma = mlx5_nl_init(NETLINK_RDMA); 1382 unsigned int i; 1383 1384 while (ret-- > 0) { 1385 struct rte_pci_addr pci_addr; 1386 1387 DRV_LOG(DEBUG, "checking device \"%s\"", ibv_list[ret]->name); 1388 bd = mlx5_device_bond_pci_match 1389 (ibv_list[ret], pci_dev, nl_rdma); 1390 if (bd >= 0) { 1391 /* 1392 * Bonding device detected. Only one match is allowed, 1393 * the bonding is supported over multi-port IB device, 1394 * there should be no matches on representor PCI 1395 * functions or non VF LAG bonding devices with 1396 * specified address. 1397 */ 1398 if (nd) { 1399 DRV_LOG(ERR, 1400 "multiple PCI match on bonding device" 1401 "\"%s\" found", ibv_list[ret]->name); 1402 rte_errno = ENOENT; 1403 ret = -rte_errno; 1404 goto exit; 1405 } 1406 DRV_LOG(INFO, "PCI information matches for" 1407 " slave %d bonding device \"%s\"", 1408 bd, ibv_list[ret]->name); 1409 ibv_match[nd++] = ibv_list[ret]; 1410 break; 1411 } 1412 if (mlx5_dev_to_pci_addr 1413 (ibv_list[ret]->ibdev_path, &pci_addr)) 1414 continue; 1415 if (pci_dev->addr.domain != pci_addr.domain || 1416 pci_dev->addr.bus != pci_addr.bus || 1417 pci_dev->addr.devid != pci_addr.devid || 1418 pci_dev->addr.function != pci_addr.function) 1419 continue; 1420 DRV_LOG(INFO, "PCI information matches for device \"%s\"", 1421 ibv_list[ret]->name); 1422 ibv_match[nd++] = ibv_list[ret]; 1423 } 1424 ibv_match[nd] = NULL; 1425 if (!nd) { 1426 /* No device matches, just complain and bail out. */ 1427 DRV_LOG(WARNING, 1428 "no Verbs device matches PCI device " PCI_PRI_FMT "," 1429 " are kernel drivers loaded?", 1430 pci_dev->addr.domain, pci_dev->addr.bus, 1431 pci_dev->addr.devid, pci_dev->addr.function); 1432 rte_errno = ENOENT; 1433 ret = -rte_errno; 1434 goto exit; 1435 } 1436 if (nd == 1) { 1437 /* 1438 * Found single matching device may have multiple ports. 1439 * Each port may be representor, we have to check the port 1440 * number and check the representors existence. 1441 */ 1442 if (nl_rdma >= 0) 1443 np = mlx5_nl_portnum(nl_rdma, ibv_match[0]->name); 1444 if (!np) 1445 DRV_LOG(WARNING, "can not get IB device \"%s\"" 1446 " ports number", ibv_match[0]->name); 1447 if (bd >= 0 && !np) { 1448 DRV_LOG(ERR, "can not get ports" 1449 " for bonding device"); 1450 rte_errno = ENOENT; 1451 ret = -rte_errno; 1452 goto exit; 1453 } 1454 } 1455 #ifndef HAVE_MLX5DV_DR_DEVX_PORT 1456 if (bd >= 0) { 1457 /* 1458 * This may happen if there is VF LAG kernel support and 1459 * application is compiled with older rdma_core library. 1460 */ 1461 DRV_LOG(ERR, 1462 "No kernel/verbs support for VF LAG bonding found."); 1463 rte_errno = ENOTSUP; 1464 ret = -rte_errno; 1465 goto exit; 1466 } 1467 #endif 1468 /* 1469 * Now we can determine the maximal 1470 * amount of devices to be spawned. 1471 */ 1472 list = rte_zmalloc("device spawn data", 1473 sizeof(struct mlx5_dev_spawn_data) * 1474 (np ? np : nd), 1475 RTE_CACHE_LINE_SIZE); 1476 if (!list) { 1477 DRV_LOG(ERR, "spawn data array allocation failure"); 1478 rte_errno = ENOMEM; 1479 ret = -rte_errno; 1480 goto exit; 1481 } 1482 if (bd >= 0 || np > 1) { 1483 /* 1484 * Single IB device with multiple ports found, 1485 * it may be E-Switch master device and representors. 1486 * We have to perform identification through the ports. 1487 */ 1488 MLX5_ASSERT(nl_rdma >= 0); 1489 MLX5_ASSERT(ns == 0); 1490 MLX5_ASSERT(nd == 1); 1491 MLX5_ASSERT(np); 1492 for (i = 1; i <= np; ++i) { 1493 list[ns].max_port = np; 1494 list[ns].ibv_port = i; 1495 list[ns].ibv_dev = ibv_match[0]; 1496 list[ns].eth_dev = NULL; 1497 list[ns].pci_dev = pci_dev; 1498 list[ns].pf_bond = bd; 1499 list[ns].ifindex = mlx5_nl_ifindex 1500 (nl_rdma, list[ns].ibv_dev->name, i); 1501 if (!list[ns].ifindex) { 1502 /* 1503 * No network interface index found for the 1504 * specified port, it means there is no 1505 * representor on this port. It's OK, 1506 * there can be disabled ports, for example 1507 * if sriov_numvfs < sriov_totalvfs. 1508 */ 1509 continue; 1510 } 1511 ret = -1; 1512 if (nl_route >= 0) 1513 ret = mlx5_nl_switch_info 1514 (nl_route, 1515 list[ns].ifindex, 1516 &list[ns].info); 1517 if (ret || (!list[ns].info.representor && 1518 !list[ns].info.master)) { 1519 /* 1520 * We failed to recognize representors with 1521 * Netlink, let's try to perform the task 1522 * with sysfs. 1523 */ 1524 ret = mlx5_sysfs_switch_info 1525 (list[ns].ifindex, 1526 &list[ns].info); 1527 } 1528 if (!ret && bd >= 0) { 1529 switch (list[ns].info.name_type) { 1530 case MLX5_PHYS_PORT_NAME_TYPE_UPLINK: 1531 if (list[ns].info.port_name == bd) 1532 ns++; 1533 break; 1534 case MLX5_PHYS_PORT_NAME_TYPE_PFVF: 1535 if (list[ns].info.pf_num == bd) 1536 ns++; 1537 break; 1538 default: 1539 break; 1540 } 1541 continue; 1542 } 1543 if (!ret && (list[ns].info.representor ^ 1544 list[ns].info.master)) 1545 ns++; 1546 } 1547 if (!ns) { 1548 DRV_LOG(ERR, 1549 "unable to recognize master/representors" 1550 " on the IB device with multiple ports"); 1551 rte_errno = ENOENT; 1552 ret = -rte_errno; 1553 goto exit; 1554 } 1555 } else { 1556 /* 1557 * The existence of several matching entries (nd > 1) means 1558 * port representors have been instantiated. No existing Verbs 1559 * call nor sysfs entries can tell them apart, this can only 1560 * be done through Netlink calls assuming kernel drivers are 1561 * recent enough to support them. 1562 * 1563 * In the event of identification failure through Netlink, 1564 * try again through sysfs, then: 1565 * 1566 * 1. A single IB device matches (nd == 1) with single 1567 * port (np=0/1) and is not a representor, assume 1568 * no switch support. 1569 * 1570 * 2. Otherwise no safe assumptions can be made; 1571 * complain louder and bail out. 1572 */ 1573 for (i = 0; i != nd; ++i) { 1574 memset(&list[ns].info, 0, sizeof(list[ns].info)); 1575 list[ns].max_port = 1; 1576 list[ns].ibv_port = 1; 1577 list[ns].ibv_dev = ibv_match[i]; 1578 list[ns].eth_dev = NULL; 1579 list[ns].pci_dev = pci_dev; 1580 list[ns].pf_bond = -1; 1581 list[ns].ifindex = 0; 1582 if (nl_rdma >= 0) 1583 list[ns].ifindex = mlx5_nl_ifindex 1584 (nl_rdma, list[ns].ibv_dev->name, 1); 1585 if (!list[ns].ifindex) { 1586 char ifname[IF_NAMESIZE]; 1587 1588 /* 1589 * Netlink failed, it may happen with old 1590 * ib_core kernel driver (before 4.16). 1591 * We can assume there is old driver because 1592 * here we are processing single ports IB 1593 * devices. Let's try sysfs to retrieve 1594 * the ifindex. The method works for 1595 * master device only. 1596 */ 1597 if (nd > 1) { 1598 /* 1599 * Multiple devices found, assume 1600 * representors, can not distinguish 1601 * master/representor and retrieve 1602 * ifindex via sysfs. 1603 */ 1604 continue; 1605 } 1606 ret = mlx5_get_master_ifname 1607 (ibv_match[i]->ibdev_path, &ifname); 1608 if (!ret) 1609 list[ns].ifindex = 1610 if_nametoindex(ifname); 1611 if (!list[ns].ifindex) { 1612 /* 1613 * No network interface index found 1614 * for the specified device, it means 1615 * there it is neither representor 1616 * nor master. 1617 */ 1618 continue; 1619 } 1620 } 1621 ret = -1; 1622 if (nl_route >= 0) 1623 ret = mlx5_nl_switch_info 1624 (nl_route, 1625 list[ns].ifindex, 1626 &list[ns].info); 1627 if (ret || (!list[ns].info.representor && 1628 !list[ns].info.master)) { 1629 /* 1630 * We failed to recognize representors with 1631 * Netlink, let's try to perform the task 1632 * with sysfs. 1633 */ 1634 ret = mlx5_sysfs_switch_info 1635 (list[ns].ifindex, 1636 &list[ns].info); 1637 } 1638 if (!ret && (list[ns].info.representor ^ 1639 list[ns].info.master)) { 1640 ns++; 1641 } else if ((nd == 1) && 1642 !list[ns].info.representor && 1643 !list[ns].info.master) { 1644 /* 1645 * Single IB device with 1646 * one physical port and 1647 * attached network device. 1648 * May be SRIOV is not enabled 1649 * or there is no representors. 1650 */ 1651 DRV_LOG(INFO, "no E-Switch support detected"); 1652 ns++; 1653 break; 1654 } 1655 } 1656 if (!ns) { 1657 DRV_LOG(ERR, 1658 "unable to recognize master/representors" 1659 " on the multiple IB devices"); 1660 rte_errno = ENOENT; 1661 ret = -rte_errno; 1662 goto exit; 1663 } 1664 } 1665 MLX5_ASSERT(ns); 1666 /* 1667 * Sort list to probe devices in natural order for users convenience 1668 * (i.e. master first, then representors from lowest to highest ID). 1669 */ 1670 qsort(list, ns, sizeof(*list), mlx5_dev_spawn_data_cmp); 1671 /* Default configuration. */ 1672 dev_config = (struct mlx5_dev_config){ 1673 .hw_padding = 0, 1674 .mps = MLX5_ARG_UNSET, 1675 .dbnc = MLX5_ARG_UNSET, 1676 .rx_vec_en = 1, 1677 .txq_inline_max = MLX5_ARG_UNSET, 1678 .txq_inline_min = MLX5_ARG_UNSET, 1679 .txq_inline_mpw = MLX5_ARG_UNSET, 1680 .txqs_inline = MLX5_ARG_UNSET, 1681 .vf_nl_en = 1, 1682 .mr_ext_memseg_en = 1, 1683 .mprq = { 1684 .enabled = 0, /* Disabled by default. */ 1685 .stride_num_n = 0, 1686 .stride_size_n = 0, 1687 .max_memcpy_len = MLX5_MPRQ_MEMCPY_DEFAULT_LEN, 1688 .min_rxqs_num = MLX5_MPRQ_MIN_RXQS, 1689 }, 1690 .dv_esw_en = 1, 1691 .dv_flow_en = 1, 1692 .log_hp_size = MLX5_ARG_UNSET, 1693 }; 1694 /* Device specific configuration. */ 1695 switch (pci_dev->id.device_id) { 1696 case PCI_DEVICE_ID_MELLANOX_CONNECTX4VF: 1697 case PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF: 1698 case PCI_DEVICE_ID_MELLANOX_CONNECTX5VF: 1699 case PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF: 1700 case PCI_DEVICE_ID_MELLANOX_CONNECTX5BFVF: 1701 case PCI_DEVICE_ID_MELLANOX_CONNECTX6VF: 1702 case PCI_DEVICE_ID_MELLANOX_CONNECTX6DXVF: 1703 dev_config.vf = 1; 1704 break; 1705 default: 1706 break; 1707 } 1708 for (i = 0; i != ns; ++i) { 1709 uint32_t restore; 1710 1711 list[i].eth_dev = mlx5_dev_spawn(&pci_dev->device, 1712 &list[i], 1713 dev_config); 1714 if (!list[i].eth_dev) { 1715 if (rte_errno != EBUSY && rte_errno != EEXIST) 1716 break; 1717 /* Device is disabled or already spawned. Ignore it. */ 1718 continue; 1719 } 1720 restore = list[i].eth_dev->data->dev_flags; 1721 rte_eth_copy_pci_info(list[i].eth_dev, pci_dev); 1722 /* Restore non-PCI flags cleared by the above call. */ 1723 list[i].eth_dev->data->dev_flags |= restore; 1724 rte_eth_dev_probing_finish(list[i].eth_dev); 1725 } 1726 if (i != ns) { 1727 DRV_LOG(ERR, 1728 "probe of PCI device " PCI_PRI_FMT " aborted after" 1729 " encountering an error: %s", 1730 pci_dev->addr.domain, pci_dev->addr.bus, 1731 pci_dev->addr.devid, pci_dev->addr.function, 1732 strerror(rte_errno)); 1733 ret = -rte_errno; 1734 /* Roll back. */ 1735 while (i--) { 1736 if (!list[i].eth_dev) 1737 continue; 1738 mlx5_dev_close(list[i].eth_dev); 1739 /* mac_addrs must not be freed because in dev_private */ 1740 list[i].eth_dev->data->mac_addrs = NULL; 1741 claim_zero(rte_eth_dev_release_port(list[i].eth_dev)); 1742 } 1743 /* Restore original error. */ 1744 rte_errno = -ret; 1745 } else { 1746 ret = 0; 1747 } 1748 exit: 1749 /* 1750 * Do the routine cleanup: 1751 * - close opened Netlink sockets 1752 * - free allocated spawn data array 1753 * - free the Infiniband device list 1754 */ 1755 if (nl_rdma >= 0) 1756 close(nl_rdma); 1757 if (nl_route >= 0) 1758 close(nl_route); 1759 if (list) 1760 rte_free(list); 1761 MLX5_ASSERT(ibv_list); 1762 mlx5_glue->free_device_list(ibv_list); 1763 return ret; 1764 } 1765 1766 static int 1767 mlx5_config_doorbell_mapping_env(const struct mlx5_dev_config *config) 1768 { 1769 char *env; 1770 int value; 1771 1772 MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY); 1773 /* Get environment variable to store. */ 1774 env = getenv(MLX5_SHUT_UP_BF); 1775 value = env ? !!strcmp(env, "0") : MLX5_ARG_UNSET; 1776 if (config->dbnc == MLX5_ARG_UNSET) 1777 setenv(MLX5_SHUT_UP_BF, MLX5_SHUT_UP_BF_DEFAULT, 1); 1778 else 1779 setenv(MLX5_SHUT_UP_BF, 1780 config->dbnc == MLX5_TXDB_NCACHED ? "1" : "0", 1); 1781 return value; 1782 } 1783 1784 static void 1785 mlx5_restore_doorbell_mapping_env(int value) 1786 { 1787 MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY); 1788 /* Restore the original environment variable state. */ 1789 if (value == MLX5_ARG_UNSET) 1790 unsetenv(MLX5_SHUT_UP_BF); 1791 else 1792 setenv(MLX5_SHUT_UP_BF, value ? "1" : "0", 1); 1793 } 1794 1795 /** 1796 * Extract pdn of PD object using DV API. 1797 * 1798 * @param[in] pd 1799 * Pointer to the verbs PD object. 1800 * @param[out] pdn 1801 * Pointer to the PD object number variable. 1802 * 1803 * @return 1804 * 0 on success, error value otherwise. 1805 */ 1806 int 1807 mlx5_os_get_pdn(void *pd, uint32_t *pdn) 1808 { 1809 #ifdef HAVE_IBV_FLOW_DV_SUPPORT 1810 struct mlx5dv_obj obj; 1811 struct mlx5dv_pd pd_info; 1812 int ret = 0; 1813 1814 obj.pd.in = pd; 1815 obj.pd.out = &pd_info; 1816 ret = mlx5_glue->dv_init_obj(&obj, MLX5DV_OBJ_PD); 1817 if (ret) { 1818 DRV_LOG(DEBUG, "Fail to get PD object info"); 1819 return ret; 1820 } 1821 *pdn = pd_info.pdn; 1822 return 0; 1823 #else 1824 (void)pd; 1825 (void)pdn; 1826 return -ENOTSUP; 1827 #endif /* HAVE_IBV_FLOW_DV_SUPPORT */ 1828 } 1829 1830 /** 1831 * Function API to open IB device. 1832 * 1833 * This function calls the Linux glue APIs to open a device. 1834 * 1835 * @param[in] spawn 1836 * Pointer to the IB device attributes (name, port, etc). 1837 * @param[out] config 1838 * Pointer to device configuration structure. 1839 * @param[out] sh 1840 * Pointer to shared context structure. 1841 * 1842 * @return 1843 * 0 on success, a positive error value otherwise. 1844 */ 1845 int 1846 mlx5_os_open_device(const struct mlx5_dev_spawn_data *spawn, 1847 const struct mlx5_dev_config *config, 1848 struct mlx5_dev_ctx_shared *sh) 1849 { 1850 int dbmap_env; 1851 int err = 0; 1852 /* 1853 * Configure environment variable "MLX5_BF_SHUT_UP" 1854 * before the device creation. The rdma_core library 1855 * checks the variable at device creation and 1856 * stores the result internally. 1857 */ 1858 dbmap_env = mlx5_config_doorbell_mapping_env(config); 1859 /* Try to open IB device with DV first, then usual Verbs. */ 1860 errno = 0; 1861 sh->ctx = mlx5_glue->dv_open_device(spawn->ibv_dev); 1862 if (sh->ctx) { 1863 sh->devx = 1; 1864 DRV_LOG(DEBUG, "DevX is supported"); 1865 /* The device is created, no need for environment. */ 1866 mlx5_restore_doorbell_mapping_env(dbmap_env); 1867 } else { 1868 /* The environment variable is still configured. */ 1869 sh->ctx = mlx5_glue->open_device(spawn->ibv_dev); 1870 err = errno ? errno : ENODEV; 1871 /* 1872 * The environment variable is not needed anymore, 1873 * all device creation attempts are completed. 1874 */ 1875 mlx5_restore_doorbell_mapping_env(dbmap_env); 1876 if (!sh->ctx) 1877 return err; 1878 DRV_LOG(DEBUG, "DevX is NOT supported"); 1879 err = 0; 1880 } 1881 return err; 1882 } 1883 1884 /** 1885 * Install shared asynchronous device events handler. 1886 * This function is implemented to support event sharing 1887 * between multiple ports of single IB device. 1888 * 1889 * @param sh 1890 * Pointer to mlx5_dev_ctx_shared object. 1891 */ 1892 void 1893 mlx5_os_dev_shared_handler_install(struct mlx5_dev_ctx_shared *sh) 1894 { 1895 int ret; 1896 int flags; 1897 1898 sh->intr_handle.fd = -1; 1899 flags = fcntl(((struct ibv_context *)sh->ctx)->async_fd, F_GETFL); 1900 ret = fcntl(((struct ibv_context *)sh->ctx)->async_fd, 1901 F_SETFL, flags | O_NONBLOCK); 1902 if (ret) { 1903 DRV_LOG(INFO, "failed to change file descriptor async event" 1904 " queue"); 1905 } else { 1906 sh->intr_handle.fd = ((struct ibv_context *)sh->ctx)->async_fd; 1907 sh->intr_handle.type = RTE_INTR_HANDLE_EXT; 1908 if (rte_intr_callback_register(&sh->intr_handle, 1909 mlx5_dev_interrupt_handler, sh)) { 1910 DRV_LOG(INFO, "Fail to install the shared interrupt."); 1911 sh->intr_handle.fd = -1; 1912 } 1913 } 1914 if (sh->devx) { 1915 #ifdef HAVE_IBV_DEVX_ASYNC 1916 sh->intr_handle_devx.fd = -1; 1917 sh->devx_comp = mlx5_glue->devx_create_cmd_comp(sh->ctx); 1918 if (!sh->devx_comp) { 1919 DRV_LOG(INFO, "failed to allocate devx_comp."); 1920 return; 1921 } 1922 flags = fcntl(sh->devx_comp->fd, F_GETFL); 1923 ret = fcntl(sh->devx_comp->fd, F_SETFL, flags | O_NONBLOCK); 1924 if (ret) { 1925 DRV_LOG(INFO, "failed to change file descriptor" 1926 " devx comp"); 1927 return; 1928 } 1929 sh->intr_handle_devx.fd = sh->devx_comp->fd; 1930 sh->intr_handle_devx.type = RTE_INTR_HANDLE_EXT; 1931 if (rte_intr_callback_register(&sh->intr_handle_devx, 1932 mlx5_dev_interrupt_handler_devx, sh)) { 1933 DRV_LOG(INFO, "Fail to install the devx shared" 1934 " interrupt."); 1935 sh->intr_handle_devx.fd = -1; 1936 } 1937 #endif /* HAVE_IBV_DEVX_ASYNC */ 1938 } 1939 } 1940 1941 /** 1942 * Uninstall shared asynchronous device events handler. 1943 * This function is implemented to support event sharing 1944 * between multiple ports of single IB device. 1945 * 1946 * @param dev 1947 * Pointer to mlx5_dev_ctx_shared object. 1948 */ 1949 void 1950 mlx5_os_dev_shared_handler_uninstall(struct mlx5_dev_ctx_shared *sh) 1951 { 1952 if (sh->intr_handle.fd >= 0) 1953 mlx5_intr_callback_unregister(&sh->intr_handle, 1954 mlx5_dev_interrupt_handler, sh); 1955 #ifdef HAVE_IBV_DEVX_ASYNC 1956 if (sh->intr_handle_devx.fd >= 0) 1957 rte_intr_callback_unregister(&sh->intr_handle_devx, 1958 mlx5_dev_interrupt_handler_devx, sh); 1959 if (sh->devx_comp) 1960 mlx5_glue->devx_destroy_cmd_comp(sh->devx_comp); 1961 #endif 1962 } 1963