/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright 2015 6WIND S.A.
 * Copyright 2020 Mellanox Technologies, Ltd
 */

#include <stddef.h>
#include <unistd.h>
#include <string.h>
#include <stdint.h>
#include <stdlib.h>
#include <errno.h>
#include <net/if.h>
#include <sys/mman.h>
#include <linux/rtnetlink.h>
#include <fcntl.h>

/* Verbs header. */
/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
#ifdef PEDANTIC
#pragma GCC diagnostic ignored "-Wpedantic"
#endif
#include <infiniband/verbs.h>
#ifdef PEDANTIC
#pragma GCC diagnostic error "-Wpedantic"
#endif

#include <rte_malloc.h>
#include <rte_ethdev_driver.h>
#include <rte_ethdev_pci.h>
#include <rte_pci.h>
#include <rte_bus_pci.h>
#include <rte_common.h>
#include <rte_kvargs.h>
#include <rte_rwlock.h>
#include <rte_spinlock.h>
#include <rte_string_fns.h>
#include <rte_alarm.h>

#include <mlx5_glue.h>
#include <mlx5_devx_cmds.h>
#include <mlx5_common.h>
#include <mlx5_common_mp.h>

#include "mlx5_defs.h"
#include "mlx5.h"
#include "mlx5_utils.h"
#include "mlx5_rxtx.h"
#include "mlx5_autoconf.h"
#include "mlx5_mr.h"
#include "mlx5_flow.h"
#include "rte_pmd_mlx5.h"

#define MLX5_TAGS_HLIST_ARRAY_SIZE 8192

#ifndef HAVE_IBV_MLX5_MOD_MPW
#define MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED (1 << 2)
#define MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW (1 << 3)
#endif

#ifndef HAVE_IBV_MLX5_MOD_CQE_128B_COMP
#define MLX5DV_CONTEXT_FLAGS_CQE_128B_COMP (1 << 4)
#endif

/**
 * Get device name. Given an ibv_device pointer, return a
 * pointer to the corresponding device name.
 *
 * @param[in] dev
 *   Pointer to ibv device.
 *
 * @return
 *   Pointer to device name if dev is valid, NULL otherwise.
 */
const char *
mlx5_os_get_dev_device_name(void *dev)
{
	if (!dev)
		return NULL;
	return ((struct ibv_device *)dev)->name;
}

/**
 * Get ibv device name. Given an ibv_context pointer, return a
 * pointer to the corresponding device name.
 *
 * @param[in] ctx
 *   Pointer to ibv context.
 *
 * @return
 *   Pointer to device name if ctx is valid, NULL otherwise.
 */
const char *
mlx5_os_get_ctx_device_name(void *ctx)
{
	if (!ctx)
		return NULL;
	return ((struct ibv_context *)ctx)->device->name;
}

/**
 * Get ibv device path name. Given an ibv_context pointer, return a
 * pointer to the corresponding device path name.
 *
 * @param[in] ctx
 *   Pointer to ibv context.
 *
 * @return
 *   Pointer to device path name if ctx is valid, NULL otherwise.
 */
const char *
mlx5_os_get_ctx_device_path(void *ctx)
{
	if (!ctx)
		return NULL;

	return ((struct ibv_context *)ctx)->device->ibdev_path;
}

/**
 * Get umem id. Given a pointer to umem object of type
 * 'struct mlx5dv_devx_umem *', return its id.
 *
 * @param[in] umem
 *   Pointer to umem object.
 *
 * @return
 *   The umem id if umem is valid, 0 otherwise.
 */
uint32_t
mlx5_os_get_umem_id(void *umem)
{
	if (!umem)
		return 0;
	return ((struct mlx5dv_devx_umem *)umem)->umem_id;
}
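/*
 * Illustrative sketch (compiled out): how a caller might use the
 * NULL-tolerant getters above. The ctx/umem pointers are assumed to
 * come from the usual glue calls; the function name is hypothetical.
 */
#if 0
static void
mlx5_example_log_ids(void *ctx, void *umem)
{
	/* Both getters tolerate invalid input, returning NULL and 0. */
	const char *name = mlx5_os_get_ctx_device_name(ctx);
	uint32_t umem_id = mlx5_os_get_umem_id(umem);

	DRV_LOG(DEBUG, "device \"%s\" umem id %u",
		name ? name : "(none)", umem_id);
}
#endif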
/**
 * Get mlx5 device attributes. The glue function query_device_ex() is called
 * with out parameter of type 'struct ibv_device_attr_ex *'. Then fill in mlx5
 * device attributes from the glue out parameter.
 *
 * @param ctx
 *   Pointer to ibv context.
 *
 * @param device_attr
 *   Pointer to mlx5 device attributes.
 *
 * @return
 *   0 on success, non-zero error number otherwise.
 */
int
mlx5_os_get_dev_attr(void *ctx, struct mlx5_dev_attr *device_attr)
{
	int err;
	struct ibv_device_attr_ex attr_ex;

	memset(device_attr, 0, sizeof(*device_attr));
	err = mlx5_glue->query_device_ex(ctx, NULL, &attr_ex);
	if (err)
		return err;

	device_attr->device_cap_flags_ex = attr_ex.device_cap_flags_ex;
	device_attr->max_qp_wr = attr_ex.orig_attr.max_qp_wr;
	device_attr->max_sge = attr_ex.orig_attr.max_sge;
	device_attr->max_cq = attr_ex.orig_attr.max_cq;
	device_attr->max_qp = attr_ex.orig_attr.max_qp;
	device_attr->raw_packet_caps = attr_ex.raw_packet_caps;
	device_attr->max_rwq_indirection_table_size =
		attr_ex.rss_caps.max_rwq_indirection_table_size;
	device_attr->max_tso = attr_ex.tso_caps.max_tso;
	device_attr->tso_supported_qpts = attr_ex.tso_caps.supported_qpts;

	struct mlx5dv_context dv_attr = { .comp_mask = 0 };

	err = mlx5_glue->dv_query_device(ctx, &dv_attr);
	if (err)
		return err;

	device_attr->flags = dv_attr.flags;
	device_attr->comp_mask = dv_attr.comp_mask;
#ifdef HAVE_IBV_MLX5_MOD_SWP
	device_attr->sw_parsing_offloads =
		dv_attr.sw_parsing_caps.sw_parsing_offloads;
#endif
	device_attr->min_single_stride_log_num_of_bytes =
		dv_attr.striding_rq_caps.min_single_stride_log_num_of_bytes;
	device_attr->max_single_stride_log_num_of_bytes =
		dv_attr.striding_rq_caps.max_single_stride_log_num_of_bytes;
	device_attr->min_single_wqe_log_num_of_strides =
		dv_attr.striding_rq_caps.min_single_wqe_log_num_of_strides;
	device_attr->max_single_wqe_log_num_of_strides =
		dv_attr.striding_rq_caps.max_single_wqe_log_num_of_strides;
	device_attr->stride_supported_qpts =
		dv_attr.striding_rq_caps.supported_qpts;
#ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT
	device_attr->tunnel_offloads_caps = dv_attr.tunnel_offloads_caps;
#endif

	return err;
}

/**
 * Verbs callback to allocate memory. This function should allocate space
 * of the provided size, residing inside a huge page.
 * Please note that all allocation must respect the alignment from libmlx5
 * (i.e. currently sysconf(_SC_PAGESIZE)).
 *
 * @param[in] size
 *   The size in bytes of the memory to allocate.
 * @param[in] data
 *   A pointer to the callback data.
 *
 * @return
 *   Allocated buffer, NULL otherwise and rte_errno is set.
 */
static void *
mlx5_alloc_verbs_buf(size_t size, void *data)
{
	struct mlx5_priv *priv = data;
	void *ret;
	size_t alignment = sysconf(_SC_PAGESIZE);
	unsigned int socket = SOCKET_ID_ANY;

	if (priv->verbs_alloc_ctx.type == MLX5_VERBS_ALLOC_TYPE_TX_QUEUE) {
		const struct mlx5_txq_ctrl *ctrl = priv->verbs_alloc_ctx.obj;

		socket = ctrl->socket;
	} else if (priv->verbs_alloc_ctx.type ==
		   MLX5_VERBS_ALLOC_TYPE_RX_QUEUE) {
		const struct mlx5_rxq_ctrl *ctrl = priv->verbs_alloc_ctx.obj;

		socket = ctrl->socket;
	}
	MLX5_ASSERT(data != NULL);
	ret = rte_malloc_socket(__func__, size, alignment, socket);
	if (!ret && size)
		rte_errno = ENOMEM;
	return ret;
}

/**
 * Verbs callback to free memory.
 *
 * @param[in] ptr
 *   A pointer to the memory to free.
 * @param[in] data
 *   A pointer to the callback data.
 */
static void
mlx5_free_verbs_buf(void *ptr, void *data __rte_unused)
{
	MLX5_ASSERT(data != NULL);
	rte_free(ptr);
}
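/*
 * Illustrative sketch (compiled out): mlx5_alloc_verbs_buf() picks the
 * NUMA socket from priv->verbs_alloc_ctx, so queue-creation code is
 * expected to set that context around the Verbs calls. The field and
 * enum names below are assumed from the matching mlx5.h definitions of
 * this driver; the surrounding queue-creation code is elided.
 */
#if 0
	priv->verbs_alloc_ctx.type = MLX5_VERBS_ALLOC_TYPE_TX_QUEUE;
	priv->verbs_alloc_ctx.obj = txq_ctrl;
	/* ... ibv_create_qp() and friends invoke mlx5_alloc_verbs_buf() ... */
	priv->verbs_alloc_ctx.type = MLX5_VERBS_ALLOC_TYPE_NONE;
#endif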
/**
 * Initialize DR related data within private structure.
 * Routine checks the reference counter and does actual
 * resources creation/initialization only if counter is zero.
 *
 * @param[in] priv
 *   Pointer to the private device data structure.
 *
 * @return
 *   Zero on success, positive error code otherwise.
 */
static int
mlx5_alloc_shared_dr(struct mlx5_priv *priv)
{
	struct mlx5_dev_ctx_shared *sh = priv->sh;
	char s[MLX5_HLIST_NAMESIZE];
	int err = 0;

	if (!sh->flow_tbls)
		err = mlx5_alloc_table_hash_list(priv);
	else
		DRV_LOG(DEBUG, "sh->flow_tbls[%p] already created, reuse",
			(void *)sh->flow_tbls);
	if (err)
		return err;
	/* Create tags hash list table. */
	snprintf(s, sizeof(s), "%s_tags", sh->ibdev_name);
	sh->tag_table = mlx5_hlist_create(s, MLX5_TAGS_HLIST_ARRAY_SIZE);
	if (!sh->tag_table) {
		DRV_LOG(ERR, "tags with hash creation failed.");
		err = ENOMEM;
		goto error;
	}
#ifdef HAVE_MLX5DV_DR
	void *domain;

	if (sh->dv_refcnt) {
		/* Shared DV/DR structures are already initialized. */
		sh->dv_refcnt++;
		priv->dr_shared = 1;
		return 0;
	}
	/* Reference counter is zero, we should initialize structures. */
	domain = mlx5_glue->dr_create_domain(sh->ctx,
					     MLX5DV_DR_DOMAIN_TYPE_NIC_RX);
	if (!domain) {
		DRV_LOG(ERR, "ingress mlx5dv_dr_create_domain failed");
		err = errno;
		goto error;
	}
	sh->rx_domain = domain;
	domain = mlx5_glue->dr_create_domain(sh->ctx,
					     MLX5DV_DR_DOMAIN_TYPE_NIC_TX);
	if (!domain) {
		DRV_LOG(ERR, "egress mlx5dv_dr_create_domain failed");
		err = errno;
		goto error;
	}
	pthread_mutex_init(&sh->dv_mutex, NULL);
	sh->tx_domain = domain;
#ifdef HAVE_MLX5DV_DR_ESWITCH
	if (priv->config.dv_esw_en) {
		domain = mlx5_glue->dr_create_domain
			(sh->ctx, MLX5DV_DR_DOMAIN_TYPE_FDB);
		if (!domain) {
			DRV_LOG(ERR, "FDB mlx5dv_dr_create_domain failed");
			err = errno;
			goto error;
		}
		sh->fdb_domain = domain;
		sh->esw_drop_action = mlx5_glue->dr_create_flow_action_drop();
	}
#endif
	if (priv->config.reclaim_mode == MLX5_RCM_AGGR) {
		mlx5_glue->dr_reclaim_domain_memory(sh->rx_domain, 1);
		mlx5_glue->dr_reclaim_domain_memory(sh->tx_domain, 1);
		if (sh->fdb_domain)
			mlx5_glue->dr_reclaim_domain_memory(sh->fdb_domain, 1);
	}
	sh->pop_vlan_action = mlx5_glue->dr_create_flow_action_pop_vlan();
#endif /* HAVE_MLX5DV_DR */
	sh->dv_refcnt++;
	priv->dr_shared = 1;
	return 0;
error:
	/* Rollback the created objects. */
	if (sh->rx_domain) {
		mlx5_glue->dr_destroy_domain(sh->rx_domain);
		sh->rx_domain = NULL;
	}
	if (sh->tx_domain) {
		mlx5_glue->dr_destroy_domain(sh->tx_domain);
		sh->tx_domain = NULL;
	}
	if (sh->fdb_domain) {
		mlx5_glue->dr_destroy_domain(sh->fdb_domain);
		sh->fdb_domain = NULL;
	}
	if (sh->esw_drop_action) {
		mlx5_glue->destroy_flow_action(sh->esw_drop_action);
		sh->esw_drop_action = NULL;
	}
	if (sh->pop_vlan_action) {
		mlx5_glue->destroy_flow_action(sh->pop_vlan_action);
		sh->pop_vlan_action = NULL;
	}
	if (sh->tag_table) {
		/* Tags should be destroyed with flows beforehand. */
		mlx5_hlist_destroy(sh->tag_table, NULL, NULL);
		sh->tag_table = NULL;
	}
	mlx5_free_table_hash_list(priv);
	return err;
}
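/*
 * Illustrative sketch (compiled out): mlx5_alloc_shared_dr() and
 * mlx5_os_free_shared_dr() below pair through sh->dv_refcnt, so a
 * typical per-port sequence looks as follows (error handling elided,
 * surrounding context assumed).
 */
#if 0
	if (config.dv_flow_en && mlx5_alloc_shared_dr(priv))
		goto error;
	/* ... port lifetime ... */
	mlx5_os_free_shared_dr(priv); /* Last reference frees the domains. */
#endif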
/**
 * Destroy DR related data within private structure.
 *
 * @param[in] priv
 *   Pointer to the private device data structure.
 */
void
mlx5_os_free_shared_dr(struct mlx5_priv *priv)
{
	struct mlx5_dev_ctx_shared *sh;

	if (!priv->dr_shared)
		return;
	priv->dr_shared = 0;
	sh = priv->sh;
	MLX5_ASSERT(sh);
#ifdef HAVE_MLX5DV_DR
	MLX5_ASSERT(sh->dv_refcnt);
	if (sh->dv_refcnt && --sh->dv_refcnt)
		return;
	if (sh->rx_domain) {
		mlx5_glue->dr_destroy_domain(sh->rx_domain);
		sh->rx_domain = NULL;
	}
	if (sh->tx_domain) {
		mlx5_glue->dr_destroy_domain(sh->tx_domain);
		sh->tx_domain = NULL;
	}
#ifdef HAVE_MLX5DV_DR_ESWITCH
	if (sh->fdb_domain) {
		mlx5_glue->dr_destroy_domain(sh->fdb_domain);
		sh->fdb_domain = NULL;
	}
	if (sh->esw_drop_action) {
		mlx5_glue->destroy_flow_action(sh->esw_drop_action);
		sh->esw_drop_action = NULL;
	}
#endif
	if (sh->pop_vlan_action) {
		mlx5_glue->destroy_flow_action(sh->pop_vlan_action);
		sh->pop_vlan_action = NULL;
	}
	pthread_mutex_destroy(&sh->dv_mutex);
#endif /* HAVE_MLX5DV_DR */
	if (sh->tag_table) {
		/* Tags should be destroyed with flows beforehand. */
		mlx5_hlist_destroy(sh->tag_table, NULL, NULL);
		sh->tag_table = NULL;
	}
	mlx5_free_table_hash_list(priv);
}

/**
 * Spawn an Ethernet device from Verbs information.
 *
 * @param dpdk_dev
 *   Backing DPDK device.
 * @param spawn
 *   Verbs device parameters (name, port, switch_info) to spawn.
 * @param config
 *   Device configuration parameters.
 *
 * @return
 *   A valid Ethernet device object on success, NULL otherwise and rte_errno
 *   is set. The following errors are defined:
 *
 *   EBUSY: device is not supposed to be spawned.
 *   EEXIST: device is already spawned.
 */
static struct rte_eth_dev *
mlx5_dev_spawn(struct rte_device *dpdk_dev,
	       struct mlx5_dev_spawn_data *spawn,
	       struct mlx5_dev_config config)
{
	const struct mlx5_switch_info *switch_info = &spawn->info;
	struct mlx5_dev_ctx_shared *sh = NULL;
	struct ibv_port_attr port_attr;
	struct mlx5dv_context dv_attr = { .comp_mask = 0 };
	struct rte_eth_dev *eth_dev = NULL;
	struct mlx5_priv *priv = NULL;
	int err = 0;
	unsigned int hw_padding = 0;
	unsigned int mps;
	unsigned int cqe_comp;
	unsigned int cqe_pad = 0;
	unsigned int tunnel_en = 0;
	unsigned int mpls_en = 0;
	unsigned int swp = 0;
	unsigned int mprq = 0;
	unsigned int mprq_min_stride_size_n = 0;
	unsigned int mprq_max_stride_size_n = 0;
	unsigned int mprq_min_stride_num_n = 0;
	unsigned int mprq_max_stride_num_n = 0;
	struct rte_ether_addr mac;
	char name[RTE_ETH_NAME_MAX_LEN];
	int own_domain_id = 0;
	uint16_t port_id;
	unsigned int i;
#ifdef HAVE_MLX5DV_DR_DEVX_PORT
	struct mlx5dv_devx_port devx_port = { .comp_mask = 0 };
#endif

	/* Determine if this port representor is supposed to be spawned. */
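	/*
	 * Illustrative note: devargs such as "representor=[0,2-4]"
	 * select the representor ports to be spawned; a port not in
	 * the list is skipped below with rte_errno set to EBUSY.
	 */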
	if (switch_info->representor && dpdk_dev->devargs) {
		struct rte_eth_devargs eth_da;

		err = rte_eth_devargs_parse(dpdk_dev->devargs->args, &eth_da);
		if (err) {
			rte_errno = -err;
			DRV_LOG(ERR, "failed to process device arguments: %s",
				strerror(rte_errno));
			return NULL;
		}
		for (i = 0; i < eth_da.nb_representor_ports; ++i)
			if (eth_da.representor_ports[i] ==
			    (uint16_t)switch_info->port_name)
				break;
		if (i == eth_da.nb_representor_ports) {
			rte_errno = EBUSY;
			return NULL;
		}
	}
	/* Build device name. */
	if (spawn->pf_bond < 0) {
		/* Single device. */
		if (!switch_info->representor)
			strlcpy(name, dpdk_dev->name, sizeof(name));
		else
			snprintf(name, sizeof(name), "%s_representor_%u",
				 dpdk_dev->name, switch_info->port_name);
	} else {
		/* Bonding device. */
		if (!switch_info->representor)
			snprintf(name, sizeof(name), "%s_%s",
				 dpdk_dev->name,
				 mlx5_os_get_dev_device_name(spawn->phys_dev));
		else
			snprintf(name, sizeof(name), "%s_%s_representor_%u",
				 dpdk_dev->name,
				 mlx5_os_get_dev_device_name(spawn->phys_dev),
				 switch_info->port_name);
	}
	/* Check if the device is already spawned. */
	if (rte_eth_dev_get_port_by_name(name, &port_id) == 0) {
		rte_errno = EEXIST;
		return NULL;
	}
	DRV_LOG(DEBUG, "naming Ethernet device \"%s\"", name);
	if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
		struct mlx5_mp_id mp_id;

		eth_dev = rte_eth_dev_attach_secondary(name);
		if (eth_dev == NULL) {
			DRV_LOG(ERR, "can not attach rte ethdev");
			rte_errno = ENOMEM;
			return NULL;
		}
		eth_dev->device = dpdk_dev;
		eth_dev->dev_ops = &mlx5_os_dev_sec_ops;
		err = mlx5_proc_priv_init(eth_dev);
		if (err)
			return NULL;
		mp_id.port_id = eth_dev->data->port_id;
		strlcpy(mp_id.name, MLX5_MP_NAME, RTE_MP_MAX_NAME_LEN);
		/* Receive command fd from primary process. */
		err = mlx5_mp_req_verbs_cmd_fd(&mp_id);
		if (err < 0)
			goto err_secondary;
		/* Remap UAR for Tx queues. */
		err = mlx5_tx_uar_init_secondary(eth_dev, err);
		if (err)
			goto err_secondary;
		/*
		 * Ethdev pointer is still required as input since
		 * the primary device is not accessible from the
		 * secondary process.
		 */
		eth_dev->rx_pkt_burst = mlx5_select_rx_function(eth_dev);
		eth_dev->tx_pkt_burst = mlx5_select_tx_function(eth_dev);
		return eth_dev;
err_secondary:
		mlx5_dev_close(eth_dev);
		return NULL;
	}
	/*
	 * Some parameters ("tx_db_nc" in particular) are needed in
	 * advance to create the dv/verbs device context. We process the
	 * devargs here to get them, and later process the devargs again
	 * to override some hardware settings.
	 */
	err = mlx5_args(&config, dpdk_dev->devargs);
	if (err) {
		err = rte_errno;
		DRV_LOG(ERR, "failed to process device arguments: %s",
			strerror(rte_errno));
		goto error;
	}
	sh = mlx5_alloc_shared_dev_ctx(spawn, &config);
	if (!sh)
		return NULL;
	config.devx = sh->devx;
#ifdef HAVE_MLX5DV_DR_ACTION_DEST_DEVX_TIR
	config.dest_tir = 1;
#endif
#ifdef HAVE_IBV_MLX5_MOD_SWP
	dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_SWP;
#endif
	/*
	 * Multi-packet send is supported by ConnectX-4 Lx PF as well
	 * as all ConnectX-5 devices.
	 */
579 */ 580 #ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT 581 dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS; 582 #endif 583 #ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT 584 dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_STRIDING_RQ; 585 #endif 586 mlx5_glue->dv_query_device(sh->ctx, &dv_attr); 587 if (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED) { 588 if (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW) { 589 DRV_LOG(DEBUG, "enhanced MPW is supported"); 590 mps = MLX5_MPW_ENHANCED; 591 } else { 592 DRV_LOG(DEBUG, "MPW is supported"); 593 mps = MLX5_MPW; 594 } 595 } else { 596 DRV_LOG(DEBUG, "MPW isn't supported"); 597 mps = MLX5_MPW_DISABLED; 598 } 599 #ifdef HAVE_IBV_MLX5_MOD_SWP 600 if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_SWP) 601 swp = dv_attr.sw_parsing_caps.sw_parsing_offloads; 602 DRV_LOG(DEBUG, "SWP support: %u", swp); 603 #endif 604 config.swp = !!swp; 605 #ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT 606 if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_STRIDING_RQ) { 607 struct mlx5dv_striding_rq_caps mprq_caps = 608 dv_attr.striding_rq_caps; 609 610 DRV_LOG(DEBUG, "\tmin_single_stride_log_num_of_bytes: %d", 611 mprq_caps.min_single_stride_log_num_of_bytes); 612 DRV_LOG(DEBUG, "\tmax_single_stride_log_num_of_bytes: %d", 613 mprq_caps.max_single_stride_log_num_of_bytes); 614 DRV_LOG(DEBUG, "\tmin_single_wqe_log_num_of_strides: %d", 615 mprq_caps.min_single_wqe_log_num_of_strides); 616 DRV_LOG(DEBUG, "\tmax_single_wqe_log_num_of_strides: %d", 617 mprq_caps.max_single_wqe_log_num_of_strides); 618 DRV_LOG(DEBUG, "\tsupported_qpts: %d", 619 mprq_caps.supported_qpts); 620 DRV_LOG(DEBUG, "device supports Multi-Packet RQ"); 621 mprq = 1; 622 mprq_min_stride_size_n = 623 mprq_caps.min_single_stride_log_num_of_bytes; 624 mprq_max_stride_size_n = 625 mprq_caps.max_single_stride_log_num_of_bytes; 626 mprq_min_stride_num_n = 627 mprq_caps.min_single_wqe_log_num_of_strides; 628 mprq_max_stride_num_n = 629 mprq_caps.max_single_wqe_log_num_of_strides; 630 } 631 #endif 632 if (RTE_CACHE_LINE_SIZE == 128 && 633 !(dv_attr.flags & MLX5DV_CONTEXT_FLAGS_CQE_128B_COMP)) 634 cqe_comp = 0; 635 else 636 cqe_comp = 1; 637 config.cqe_comp = cqe_comp; 638 #ifdef HAVE_IBV_MLX5_MOD_CQE_128B_PAD 639 /* Whether device supports 128B Rx CQE padding. */ 640 cqe_pad = RTE_CACHE_LINE_SIZE == 128 && 641 (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_CQE_128B_PAD); 642 #endif 643 #ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT 644 if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS) { 645 tunnel_en = ((dv_attr.tunnel_offloads_caps & 646 MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_VXLAN) && 647 (dv_attr.tunnel_offloads_caps & 648 MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_GRE) && 649 (dv_attr.tunnel_offloads_caps & 650 MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_GENEVE)); 651 } 652 DRV_LOG(DEBUG, "tunnel offloading is %ssupported", 653 tunnel_en ? "" : "not "); 654 #else 655 DRV_LOG(WARNING, 656 "tunnel offloading disabled due to old OFED/rdma-core version"); 657 #endif 658 config.tunnel_en = tunnel_en; 659 #ifdef HAVE_IBV_DEVICE_MPLS_SUPPORT 660 mpls_en = ((dv_attr.tunnel_offloads_caps & 661 MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_CW_MPLS_OVER_GRE) && 662 (dv_attr.tunnel_offloads_caps & 663 MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_CW_MPLS_OVER_UDP)); 664 DRV_LOG(DEBUG, "MPLS over GRE/UDP tunnel offloading is %ssupported", 665 mpls_en ? 
"" : "not "); 666 #else 667 DRV_LOG(WARNING, "MPLS over GRE/UDP tunnel offloading disabled due to" 668 " old OFED/rdma-core version or firmware configuration"); 669 #endif 670 config.mpls_en = mpls_en; 671 /* Check port status. */ 672 err = mlx5_glue->query_port(sh->ctx, spawn->phys_port, &port_attr); 673 if (err) { 674 DRV_LOG(ERR, "port query failed: %s", strerror(err)); 675 goto error; 676 } 677 if (port_attr.link_layer != IBV_LINK_LAYER_ETHERNET) { 678 DRV_LOG(ERR, "port is not configured in Ethernet mode"); 679 err = EINVAL; 680 goto error; 681 } 682 if (port_attr.state != IBV_PORT_ACTIVE) 683 DRV_LOG(DEBUG, "port is not active: \"%s\" (%d)", 684 mlx5_glue->port_state_str(port_attr.state), 685 port_attr.state); 686 /* Allocate private eth device data. */ 687 priv = rte_zmalloc("ethdev private structure", 688 sizeof(*priv), 689 RTE_CACHE_LINE_SIZE); 690 if (priv == NULL) { 691 DRV_LOG(ERR, "priv allocation failure"); 692 err = ENOMEM; 693 goto error; 694 } 695 priv->sh = sh; 696 priv->dev_port = spawn->phys_port; 697 priv->pci_dev = spawn->pci_dev; 698 priv->mtu = RTE_ETHER_MTU; 699 priv->mp_id.port_id = port_id; 700 strlcpy(priv->mp_id.name, MLX5_MP_NAME, RTE_MP_MAX_NAME_LEN); 701 #ifndef RTE_ARCH_64 702 /* Initialize UAR access locks for 32bit implementations. */ 703 rte_spinlock_init(&priv->uar_lock_cq); 704 for (i = 0; i < MLX5_UAR_PAGE_NUM_MAX; i++) 705 rte_spinlock_init(&priv->uar_lock[i]); 706 #endif 707 /* Some internal functions rely on Netlink sockets, open them now. */ 708 priv->nl_socket_rdma = mlx5_nl_init(NETLINK_RDMA); 709 priv->nl_socket_route = mlx5_nl_init(NETLINK_ROUTE); 710 priv->representor = !!switch_info->representor; 711 priv->master = !!switch_info->master; 712 priv->domain_id = RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID; 713 priv->vport_meta_tag = 0; 714 priv->vport_meta_mask = 0; 715 priv->pf_bond = spawn->pf_bond; 716 #ifdef HAVE_MLX5DV_DR_DEVX_PORT 717 /* 718 * The DevX port query API is implemented. E-Switch may use 719 * either vport or reg_c[0] metadata register to match on 720 * vport index. The engaged part of metadata register is 721 * defined by mask. 
722 */ 723 if (switch_info->representor || switch_info->master) { 724 devx_port.comp_mask = MLX5DV_DEVX_PORT_VPORT | 725 MLX5DV_DEVX_PORT_MATCH_REG_C_0; 726 err = mlx5_glue->devx_port_query(sh->ctx, spawn->phys_port, 727 &devx_port); 728 if (err) { 729 DRV_LOG(WARNING, 730 "can't query devx port %d on device %s", 731 spawn->phys_port, 732 mlx5_os_get_dev_device_name(spawn->phys_dev)); 733 devx_port.comp_mask = 0; 734 } 735 } 736 if (devx_port.comp_mask & MLX5DV_DEVX_PORT_MATCH_REG_C_0) { 737 priv->vport_meta_tag = devx_port.reg_c_0.value; 738 priv->vport_meta_mask = devx_port.reg_c_0.mask; 739 if (!priv->vport_meta_mask) { 740 DRV_LOG(ERR, "vport zero mask for port %d" 741 " on bonding device %s", 742 spawn->phys_port, 743 mlx5_os_get_dev_device_name 744 (spawn->phys_dev)); 745 err = ENOTSUP; 746 goto error; 747 } 748 if (priv->vport_meta_tag & ~priv->vport_meta_mask) { 749 DRV_LOG(ERR, "invalid vport tag for port %d" 750 " on bonding device %s", 751 spawn->phys_port, 752 mlx5_os_get_dev_device_name 753 (spawn->phys_dev)); 754 err = ENOTSUP; 755 goto error; 756 } 757 } 758 if (devx_port.comp_mask & MLX5DV_DEVX_PORT_VPORT) { 759 priv->vport_id = devx_port.vport_num; 760 } else if (spawn->pf_bond >= 0) { 761 DRV_LOG(ERR, "can't deduce vport index for port %d" 762 " on bonding device %s", 763 spawn->phys_port, 764 mlx5_os_get_dev_device_name(spawn->phys_dev)); 765 err = ENOTSUP; 766 goto error; 767 } else { 768 /* Suppose vport index in compatible way. */ 769 priv->vport_id = switch_info->representor ? 770 switch_info->port_name + 1 : -1; 771 } 772 #else 773 /* 774 * Kernel/rdma_core support single E-Switch per PF configurations 775 * only and vport_id field contains the vport index for 776 * associated VF, which is deduced from representor port name. 777 * For example, let's have the IB device port 10, it has 778 * attached network device eth0, which has port name attribute 779 * pf0vf2, we can deduce the VF number as 2, and set vport index 780 * as 3 (2+1). This assigning schema should be changed if the 781 * multiple E-Switch instances per PF configurations or/and PCI 782 * subfunctions are added. 783 */ 784 priv->vport_id = switch_info->representor ? 785 switch_info->port_name + 1 : -1; 786 #endif 787 /* representor_id field keeps the unmodified VF index. */ 788 priv->representor_id = switch_info->representor ? 789 switch_info->port_name : -1; 790 /* 791 * Look for sibling devices in order to reuse their switch domain 792 * if any, otherwise allocate one. 793 */ 794 MLX5_ETH_FOREACH_DEV(port_id, priv->pci_dev) { 795 const struct mlx5_priv *opriv = 796 rte_eth_devices[port_id].data->dev_private; 797 798 if (!opriv || 799 opriv->sh != priv->sh || 800 opriv->domain_id == 801 RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID) 802 continue; 803 priv->domain_id = opriv->domain_id; 804 break; 805 } 806 if (priv->domain_id == RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID) { 807 err = rte_eth_switch_domain_alloc(&priv->domain_id); 808 if (err) { 809 err = rte_errno; 810 DRV_LOG(ERR, "unable to allocate switch domain: %s", 811 strerror(rte_errno)); 812 goto error; 813 } 814 own_domain_id = 1; 815 } 816 /* Override some values set by hardware configuration. */ 817 mlx5_args(&config, dpdk_dev->devargs); 818 err = mlx5_dev_check_sibling_config(priv, &config); 819 if (err) 820 goto error; 821 config.hw_csum = !!(sh->device_attr.device_cap_flags_ex & 822 IBV_DEVICE_RAW_IP_CSUM); 823 DRV_LOG(DEBUG, "checksum offloading is %ssupported", 824 (config.hw_csum ? 
"" : "not ")); 825 #if !defined(HAVE_IBV_DEVICE_COUNTERS_SET_V42) && \ 826 !defined(HAVE_IBV_DEVICE_COUNTERS_SET_V45) 827 DRV_LOG(DEBUG, "counters are not supported"); 828 #endif 829 #if !defined(HAVE_IBV_FLOW_DV_SUPPORT) || !defined(HAVE_MLX5DV_DR) 830 if (config.dv_flow_en) { 831 DRV_LOG(WARNING, "DV flow is not supported"); 832 config.dv_flow_en = 0; 833 } 834 #endif 835 config.ind_table_max_size = 836 sh->device_attr.max_rwq_indirection_table_size; 837 /* 838 * Remove this check once DPDK supports larger/variable 839 * indirection tables. 840 */ 841 if (config.ind_table_max_size > (unsigned int)ETH_RSS_RETA_SIZE_512) 842 config.ind_table_max_size = ETH_RSS_RETA_SIZE_512; 843 DRV_LOG(DEBUG, "maximum Rx indirection table size is %u", 844 config.ind_table_max_size); 845 config.hw_vlan_strip = !!(sh->device_attr.raw_packet_caps & 846 IBV_RAW_PACKET_CAP_CVLAN_STRIPPING); 847 DRV_LOG(DEBUG, "VLAN stripping is %ssupported", 848 (config.hw_vlan_strip ? "" : "not ")); 849 config.hw_fcs_strip = !!(sh->device_attr.raw_packet_caps & 850 IBV_RAW_PACKET_CAP_SCATTER_FCS); 851 DRV_LOG(DEBUG, "FCS stripping configuration is %ssupported", 852 (config.hw_fcs_strip ? "" : "not ")); 853 #if defined(HAVE_IBV_WQ_FLAG_RX_END_PADDING) 854 hw_padding = !!sh->device_attr.rx_pad_end_addr_align; 855 #elif defined(HAVE_IBV_WQ_FLAGS_PCI_WRITE_END_PADDING) 856 hw_padding = !!(sh->device_attr.device_cap_flags_ex & 857 IBV_DEVICE_PCI_WRITE_END_PADDING); 858 #endif 859 if (config.hw_padding && !hw_padding) { 860 DRV_LOG(DEBUG, "Rx end alignment padding isn't supported"); 861 config.hw_padding = 0; 862 } else if (config.hw_padding) { 863 DRV_LOG(DEBUG, "Rx end alignment padding is enabled"); 864 } 865 config.tso = (sh->device_attr.max_tso > 0 && 866 (sh->device_attr.tso_supported_qpts & 867 (1 << IBV_QPT_RAW_PACKET))); 868 if (config.tso) 869 config.tso_max_payload_sz = sh->device_attr.max_tso; 870 /* 871 * MPW is disabled by default, while the Enhanced MPW is enabled 872 * by default. 873 */ 874 if (config.mps == MLX5_ARG_UNSET) 875 config.mps = (mps == MLX5_MPW_ENHANCED) ? MLX5_MPW_ENHANCED : 876 MLX5_MPW_DISABLED; 877 else 878 config.mps = config.mps ? mps : MLX5_MPW_DISABLED; 879 DRV_LOG(INFO, "%sMPS is %s", 880 config.mps == MLX5_MPW_ENHANCED ? "enhanced " : 881 config.mps == MLX5_MPW ? "legacy " : "", 882 config.mps != MLX5_MPW_DISABLED ? "enabled" : "disabled"); 883 if (config.cqe_comp && !cqe_comp) { 884 DRV_LOG(WARNING, "Rx CQE compression isn't supported"); 885 config.cqe_comp = 0; 886 } 887 if (config.cqe_pad && !cqe_pad) { 888 DRV_LOG(WARNING, "Rx CQE padding isn't supported"); 889 config.cqe_pad = 0; 890 } else if (config.cqe_pad) { 891 DRV_LOG(INFO, "Rx CQE padding is enabled"); 892 } 893 if (config.devx) { 894 priv->counter_fallback = 0; 895 err = mlx5_devx_cmd_query_hca_attr(sh->ctx, &config.hca_attr); 896 if (err) { 897 err = -err; 898 goto error; 899 } 900 if (!config.hca_attr.flow_counters_dump) 901 priv->counter_fallback = 1; 902 #ifndef HAVE_IBV_DEVX_ASYNC 903 priv->counter_fallback = 1; 904 #endif 905 if (priv->counter_fallback) 906 DRV_LOG(INFO, "Use fall-back DV counter management"); 907 /* Check for LRO support. */ 908 if (config.dest_tir && config.hca_attr.lro_cap && 909 config.dv_flow_en) { 910 /* TBD check tunnel lro caps. */ 911 config.lro.supported = config.hca_attr.lro_cap; 912 DRV_LOG(DEBUG, "Device supports LRO"); 913 /* 914 * If LRO timeout is not configured by application, 915 * use the minimal supported value. 
916 */ 917 if (!config.lro.timeout) 918 config.lro.timeout = 919 config.hca_attr.lro_timer_supported_periods[0]; 920 DRV_LOG(DEBUG, "LRO session timeout set to %d usec", 921 config.lro.timeout); 922 } 923 #if defined(HAVE_MLX5DV_DR) && defined(HAVE_MLX5_DR_CREATE_ACTION_FLOW_METER) 924 if (config.hca_attr.qos.sup && config.hca_attr.qos.srtcm_sup && 925 config.dv_flow_en) { 926 uint8_t reg_c_mask = 927 config.hca_attr.qos.flow_meter_reg_c_ids; 928 /* 929 * Meter needs two REG_C's for color match and pre-sfx 930 * flow match. Here get the REG_C for color match. 931 * REG_C_0 and REG_C_1 is reserved for metadata feature. 932 */ 933 reg_c_mask &= 0xfc; 934 if (__builtin_popcount(reg_c_mask) < 1) { 935 priv->mtr_en = 0; 936 DRV_LOG(WARNING, "No available register for" 937 " meter."); 938 } else { 939 priv->mtr_color_reg = ffs(reg_c_mask) - 1 + 940 REG_C_0; 941 priv->mtr_en = 1; 942 priv->mtr_reg_share = 943 config.hca_attr.qos.flow_meter_reg_share; 944 DRV_LOG(DEBUG, "The REG_C meter uses is %d", 945 priv->mtr_color_reg); 946 } 947 } 948 #endif 949 } 950 if (config.mprq.enabled && mprq) { 951 if (config.mprq.stride_num_n && 952 (config.mprq.stride_num_n > mprq_max_stride_num_n || 953 config.mprq.stride_num_n < mprq_min_stride_num_n)) { 954 config.mprq.stride_num_n = 955 RTE_MIN(RTE_MAX(MLX5_MPRQ_STRIDE_NUM_N, 956 mprq_min_stride_num_n), 957 mprq_max_stride_num_n); 958 DRV_LOG(WARNING, 959 "the number of strides" 960 " for Multi-Packet RQ is out of range," 961 " setting default value (%u)", 962 1 << config.mprq.stride_num_n); 963 } 964 if (config.mprq.stride_size_n && 965 (config.mprq.stride_size_n > mprq_max_stride_size_n || 966 config.mprq.stride_size_n < mprq_min_stride_size_n)) { 967 config.mprq.stride_size_n = 968 RTE_MIN(RTE_MAX(MLX5_MPRQ_STRIDE_SIZE_N, 969 mprq_min_stride_size_n), 970 mprq_max_stride_size_n); 971 DRV_LOG(WARNING, 972 "the size of a stride" 973 " for Multi-Packet RQ is out of range," 974 " setting default value (%u)", 975 1 << config.mprq.stride_size_n); 976 } 977 config.mprq.min_stride_size_n = mprq_min_stride_size_n; 978 config.mprq.max_stride_size_n = mprq_max_stride_size_n; 979 } else if (config.mprq.enabled && !mprq) { 980 DRV_LOG(WARNING, "Multi-Packet RQ isn't supported"); 981 config.mprq.enabled = 0; 982 } 983 if (config.max_dump_files_num == 0) 984 config.max_dump_files_num = 128; 985 eth_dev = rte_eth_dev_allocate(name); 986 if (eth_dev == NULL) { 987 DRV_LOG(ERR, "can not allocate rte ethdev"); 988 err = ENOMEM; 989 goto error; 990 } 991 /* Flag to call rte_eth_dev_release_port() in rte_eth_dev_close(). */ 992 eth_dev->data->dev_flags |= RTE_ETH_DEV_CLOSE_REMOVE; 993 if (priv->representor) { 994 eth_dev->data->dev_flags |= RTE_ETH_DEV_REPRESENTOR; 995 eth_dev->data->representor_id = priv->representor_id; 996 } 997 /* 998 * Store associated network device interface index. This index 999 * is permanent throughout the lifetime of device. So, we may store 1000 * the ifindex here and use the cached value further. 1001 */ 1002 MLX5_ASSERT(spawn->ifindex); 1003 priv->if_index = spawn->ifindex; 1004 eth_dev->data->dev_private = priv; 1005 priv->dev_data = eth_dev->data; 1006 eth_dev->data->mac_addrs = priv->mac; 1007 eth_dev->device = dpdk_dev; 1008 /* Configure the first MAC address by default. */ 1009 if (mlx5_get_mac(eth_dev, &mac.addr_bytes)) { 1010 DRV_LOG(ERR, 1011 "port %u cannot get MAC address, is mlx5_en" 1012 " loaded? 
(errno: %s)", 1013 eth_dev->data->port_id, strerror(rte_errno)); 1014 err = ENODEV; 1015 goto error; 1016 } 1017 DRV_LOG(INFO, 1018 "port %u MAC address is %02x:%02x:%02x:%02x:%02x:%02x", 1019 eth_dev->data->port_id, 1020 mac.addr_bytes[0], mac.addr_bytes[1], 1021 mac.addr_bytes[2], mac.addr_bytes[3], 1022 mac.addr_bytes[4], mac.addr_bytes[5]); 1023 #ifdef RTE_LIBRTE_MLX5_DEBUG 1024 { 1025 char ifname[IF_NAMESIZE]; 1026 1027 if (mlx5_get_ifname(eth_dev, &ifname) == 0) 1028 DRV_LOG(DEBUG, "port %u ifname is \"%s\"", 1029 eth_dev->data->port_id, ifname); 1030 else 1031 DRV_LOG(DEBUG, "port %u ifname is unknown", 1032 eth_dev->data->port_id); 1033 } 1034 #endif 1035 /* Get actual MTU if possible. */ 1036 err = mlx5_get_mtu(eth_dev, &priv->mtu); 1037 if (err) { 1038 err = rte_errno; 1039 goto error; 1040 } 1041 DRV_LOG(DEBUG, "port %u MTU is %u", eth_dev->data->port_id, 1042 priv->mtu); 1043 /* Initialize burst functions to prevent crashes before link-up. */ 1044 eth_dev->rx_pkt_burst = removed_rx_burst; 1045 eth_dev->tx_pkt_burst = removed_tx_burst; 1046 eth_dev->dev_ops = &mlx5_os_dev_ops; 1047 /* Register MAC address. */ 1048 claim_zero(mlx5_mac_addr_add(eth_dev, &mac, 0, 0)); 1049 if (config.vf && config.vf_nl_en) 1050 mlx5_nl_mac_addr_sync(priv->nl_socket_route, 1051 mlx5_ifindex(eth_dev), 1052 eth_dev->data->mac_addrs, 1053 MLX5_MAX_MAC_ADDRESSES); 1054 priv->flows = 0; 1055 priv->ctrl_flows = 0; 1056 TAILQ_INIT(&priv->flow_meters); 1057 TAILQ_INIT(&priv->flow_meter_profiles); 1058 /* Hint libmlx5 to use PMD allocator for data plane resources */ 1059 struct mlx5dv_ctx_allocators alctr = { 1060 .alloc = &mlx5_alloc_verbs_buf, 1061 .free = &mlx5_free_verbs_buf, 1062 .data = priv, 1063 }; 1064 mlx5_glue->dv_set_context_attr(sh->ctx, 1065 MLX5DV_CTX_ATTR_BUF_ALLOCATORS, 1066 (void *)((uintptr_t)&alctr)); 1067 /* Bring Ethernet device up. */ 1068 DRV_LOG(DEBUG, "port %u forcing Ethernet interface up", 1069 eth_dev->data->port_id); 1070 mlx5_set_link_up(eth_dev); 1071 /* 1072 * Even though the interrupt handler is not installed yet, 1073 * interrupts will still trigger on the async_fd from 1074 * Verbs context returned by ibv_open_device(). 1075 */ 1076 mlx5_link_update(eth_dev, 0); 1077 #ifdef HAVE_MLX5DV_DR_ESWITCH 1078 if (!(config.hca_attr.eswitch_manager && config.dv_flow_en && 1079 (switch_info->representor || switch_info->master))) 1080 config.dv_esw_en = 0; 1081 #else 1082 config.dv_esw_en = 0; 1083 #endif 1084 /* Detect minimal data bytes to inline. */ 1085 mlx5_set_min_inline(spawn, &config); 1086 /* Store device configuration on private structure. */ 1087 priv->config = config; 1088 /* Create context for virtual machine VLAN workaround. */ 1089 priv->vmwa_context = mlx5_vlan_vmwa_init(eth_dev, spawn->ifindex); 1090 if (config.dv_flow_en) { 1091 err = mlx5_alloc_shared_dr(priv); 1092 if (err) 1093 goto error; 1094 /* 1095 * RSS id is shared with meter flow id. Meter flow id can only 1096 * use the 24 MSB of the register. 1097 */ 1098 priv->qrss_id_pool = mlx5_flow_id_pool_alloc(UINT32_MAX >> 1099 MLX5_MTR_COLOR_BITS); 1100 if (!priv->qrss_id_pool) { 1101 DRV_LOG(ERR, "can't create flow id pool"); 1102 err = ENOMEM; 1103 goto error; 1104 } 1105 } 1106 /* Supported Verbs flow priority number detection. 
	err = mlx5_flow_discover_priorities(eth_dev);
	if (err < 0) {
		err = -err;
		goto error;
	}
	priv->config.flow_prio = err;
	if (!priv->config.dv_esw_en &&
	    priv->config.dv_xmeta_en != MLX5_XMETA_MODE_LEGACY) {
		DRV_LOG(WARNING, "metadata mode %u is not supported "
			"(no E-Switch)", priv->config.dv_xmeta_en);
		priv->config.dv_xmeta_en = MLX5_XMETA_MODE_LEGACY;
	}
	mlx5_set_metadata_mask(eth_dev);
	if (priv->config.dv_xmeta_en != MLX5_XMETA_MODE_LEGACY &&
	    !priv->sh->dv_regc0_mask) {
		DRV_LOG(ERR, "metadata mode %u is not supported "
			"(no metadata reg_c[0] is available)",
			priv->config.dv_xmeta_en);
		err = ENOTSUP;
		goto error;
	}
	/*
	 * Allocate the buffer for flow creation, just once.
	 * The allocation must be done before any flow is created.
	 */
	mlx5_flow_alloc_intermediate(eth_dev);
	/* Query availability of metadata reg_c's. */
	err = mlx5_flow_discover_mreg_c(eth_dev);
	if (err < 0) {
		err = -err;
		goto error;
	}
	if (!mlx5_flow_ext_mreg_supported(eth_dev)) {
		DRV_LOG(DEBUG,
			"port %u extensive metadata register is not supported",
			eth_dev->data->port_id);
		if (priv->config.dv_xmeta_en != MLX5_XMETA_MODE_LEGACY) {
			DRV_LOG(ERR, "metadata mode %u is not supported "
				"(no metadata registers available)",
				priv->config.dv_xmeta_en);
			err = ENOTSUP;
			goto error;
		}
	}
	if (priv->config.dv_flow_en &&
	    priv->config.dv_xmeta_en != MLX5_XMETA_MODE_LEGACY &&
	    mlx5_flow_ext_mreg_supported(eth_dev) &&
	    priv->sh->dv_regc0_mask) {
		priv->mreg_cp_tbl = mlx5_hlist_create(MLX5_FLOW_MREG_HNAME,
						      MLX5_FLOW_MREG_HTABLE_SZ);
		if (!priv->mreg_cp_tbl) {
			err = ENOMEM;
			goto error;
		}
	}
	return eth_dev;
error:
	if (priv) {
		if (priv->mreg_cp_tbl)
			mlx5_hlist_destroy(priv->mreg_cp_tbl, NULL, NULL);
		if (priv->sh)
			mlx5_os_free_shared_dr(priv);
		if (priv->nl_socket_route >= 0)
			close(priv->nl_socket_route);
		if (priv->nl_socket_rdma >= 0)
			close(priv->nl_socket_rdma);
		if (priv->vmwa_context)
			mlx5_vlan_vmwa_exit(priv->vmwa_context);
		if (priv->qrss_id_pool)
			mlx5_flow_id_pool_release(priv->qrss_id_pool);
		if (own_domain_id)
			claim_zero(rte_eth_switch_domain_free(priv->domain_id));
		rte_free(priv);
		if (eth_dev != NULL)
			eth_dev->data->dev_private = NULL;
	}
	if (eth_dev != NULL) {
		/* mac_addrs must not be freed alone because it is part of
		 * dev_private.
		 */
		eth_dev->data->mac_addrs = NULL;
		rte_eth_dev_release_port(eth_dev);
	}
	if (sh)
		mlx5_free_shared_dev_ctx(sh);
	MLX5_ASSERT(err > 0);
	rte_errno = err;
	return NULL;
}

/**
 * Comparison callback to sort device data.
 *
 * This is meant to be used with qsort().
 *
 * @param a[in]
 *   Pointer to pointer to first data object.
 * @param b[in]
 *   Pointer to pointer to second data object.
 *
 * @return
 *   0 if both objects are equal, less than 0 if the first argument is less
 *   than the second, greater than 0 otherwise.
 */
static int
mlx5_dev_spawn_data_cmp(const void *a, const void *b)
{
	const struct mlx5_switch_info *si_a =
		&((const struct mlx5_dev_spawn_data *)a)->info;
	const struct mlx5_switch_info *si_b =
		&((const struct mlx5_dev_spawn_data *)b)->info;
	int ret;

	/* Master device first. */
	ret = si_b->master - si_a->master;
	if (ret)
		return ret;
	/* Then representor devices. */
	ret = si_b->representor - si_a->representor;
	if (ret)
		return ret;
	/* Unidentified devices come last in no specific order. */
	if (!si_a->representor)
		return 0;
	/* Order representors by name. */
	return si_a->port_name - si_b->port_name;
}
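/*
 * Illustrative sketch (compiled out): the comparator above is intended
 * for qsort() over the spawn-data array, exactly as mlx5_os_pci_probe()
 * does further below.
 */
#if 0
	qsort(list, ns, sizeof(*list), mlx5_dev_spawn_data_cmp);
#endif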
/**
 * Match PCI information for possible slaves of bonding device.
 *
 * @param[in] ibv_dev
 *   Pointer to Infiniband device structure.
 * @param[in] pci_dev
 *   Pointer to PCI device structure to match PCI address.
 * @param[in] nl_rdma
 *   Netlink RDMA group socket handle.
 *
 * @return
 *   Negative value if no bonding device found, non-negative index of
 *   the slave PF in bonding otherwise.
 */
static int
mlx5_device_bond_pci_match(const struct ibv_device *ibv_dev,
			   const struct rte_pci_device *pci_dev,
			   int nl_rdma)
{
	char ifname[IF_NAMESIZE + 1];
	unsigned int ifindex;
	unsigned int np, i;
	FILE *file = NULL;
	int pf = -1;

	/*
	 * Try to get master device name. If something goes wrong,
	 * assume there is no kernel support and no bonding devices.
	 */
	if (nl_rdma < 0)
		return -1;
	if (!strstr(ibv_dev->name, "bond"))
		return -1;
	np = mlx5_nl_portnum(nl_rdma, ibv_dev->name);
	if (!np)
		return -1;
	/*
	 * The master device might not be on the predefined port
	 * (port index 1 is not guaranteed); we have to scan all
	 * Infiniband device ports and find the master.
	 */
	for (i = 1; i <= np; ++i) {
		/* Check whether Infiniband port is populated. */
		ifindex = mlx5_nl_ifindex(nl_rdma, ibv_dev->name, i);
		if (!ifindex)
			continue;
		if (!if_indextoname(ifindex, ifname))
			continue;
		/* Try to read bonding slave names from sysfs. */
		MKSTR(slaves,
		      "/sys/class/net/%s/master/bonding/slaves", ifname);
		file = fopen(slaves, "r");
		if (file)
			break;
	}
	if (!file)
		return -1;
	/* Use safe format to check maximal buffer length. */
	MLX5_ASSERT(atol(RTE_STR(IF_NAMESIZE)) == IF_NAMESIZE);
	while (fscanf(file, "%" RTE_STR(IF_NAMESIZE) "s", ifname) == 1) {
		char tmp_str[IF_NAMESIZE + 32];
		struct rte_pci_addr pci_addr;
		struct mlx5_switch_info info;

		/* Process slave interface names in the loop. */
		snprintf(tmp_str, sizeof(tmp_str),
			 "/sys/class/net/%s", ifname);
		if (mlx5_dev_to_pci_addr(tmp_str, &pci_addr)) {
			DRV_LOG(WARNING, "can not get PCI address"
				" for netdev \"%s\"", ifname);
			continue;
		}
		if (pci_dev->addr.domain != pci_addr.domain ||
		    pci_dev->addr.bus != pci_addr.bus ||
		    pci_dev->addr.devid != pci_addr.devid ||
		    pci_dev->addr.function != pci_addr.function)
			continue;
		/* Slave interface PCI address match found. */
		fclose(file);
		snprintf(tmp_str, sizeof(tmp_str),
			 "/sys/class/net/%s/phys_port_name", ifname);
		file = fopen(tmp_str, "rb");
		if (!file)
			break;
		info.name_type = MLX5_PHYS_PORT_NAME_TYPE_NOTSET;
		if (fscanf(file, "%32s", tmp_str) == 1)
			mlx5_translate_port_name(tmp_str, &info);
		if (info.name_type == MLX5_PHYS_PORT_NAME_TYPE_LEGACY ||
		    info.name_type == MLX5_PHYS_PORT_NAME_TYPE_UPLINK)
			pf = info.port_name;
		break;
	}
	if (file)
		fclose(file);
	return pf;
}

/**
 * DPDK callback to register a PCI device.
 *
 * This function spawns Ethernet devices out of a given PCI device.
 *
 * @param[in] pci_drv
 *   PCI driver structure (mlx5_driver).
 * @param[in] pci_dev
 *   PCI device information.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
int
mlx5_os_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
		  struct rte_pci_device *pci_dev)
{
	struct ibv_device **ibv_list;
	/*
	 * Number of found IB Devices matching with requested PCI BDF.
	 * nd != 1 means there are multiple IB devices over the same
	 * PCI device and we have representors and master.
	 */
	unsigned int nd = 0;
	/*
	 * Number of found IB device Ports. nd = 1 and np = 1..n means
	 * we have the single multiport IB device, and there may be
	 * representors attached to some of found ports.
	 */
	unsigned int np = 0;
	/*
	 * Number of DPDK ethernet devices to spawn - either over
	 * multiple IB devices or multiple ports of single IB device.
	 * Actually this is the number of iterations to spawn.
	 */
	unsigned int ns = 0;
	/*
	 * Bonding device
	 *   < 0 - no bonding device (single one)
	 *  >= 0 - bonding device (value is slave PF index)
	 */
	int bd = -1;
	struct mlx5_dev_spawn_data *list = NULL;
	struct mlx5_dev_config dev_config;
	int ret;

	if (mlx5_class_get(pci_dev->device.devargs) != MLX5_CLASS_NET) {
		DRV_LOG(DEBUG, "Skip probing - should be probed by other mlx5"
			" driver.");
		return 1;
	}
	if (rte_eal_process_type() == RTE_PROC_PRIMARY)
		mlx5_pmd_socket_init();
	ret = mlx5_init_once();
	if (ret) {
		DRV_LOG(ERR, "unable to init PMD global data: %s",
			strerror(rte_errno));
		return -rte_errno;
	}
	MLX5_ASSERT(pci_drv == &mlx5_driver);
	errno = 0;
	ibv_list = mlx5_glue->get_device_list(&ret);
	if (!ibv_list) {
		rte_errno = errno ? errno : ENOSYS;
		DRV_LOG(ERR, "cannot list devices, is ib_uverbs loaded?");
		return -rte_errno;
	}
	/*
	 * First scan the list of all Infiniband devices to find
	 * matching ones, gathering into the list.
	 */
	struct ibv_device *ibv_match[ret + 1];
	int nl_route = mlx5_nl_init(NETLINK_ROUTE);
	int nl_rdma = mlx5_nl_init(NETLINK_RDMA);
	unsigned int i;

	while (ret-- > 0) {
		struct rte_pci_addr pci_addr;

		DRV_LOG(DEBUG, "checking device \"%s\"", ibv_list[ret]->name);
		bd = mlx5_device_bond_pci_match
				(ibv_list[ret], pci_dev, nl_rdma);
		if (bd >= 0) {
			/*
			 * Bonding device detected. Only one match is allowed,
			 * the bonding is supported over multi-port IB device;
			 * there should be no matches on representor PCI
			 * functions or non-VF LAG bonding devices with
			 * the specified address.
			 */
1423 */ 1424 if (nd) { 1425 DRV_LOG(ERR, 1426 "multiple PCI match on bonding device" 1427 "\"%s\" found", ibv_list[ret]->name); 1428 rte_errno = ENOENT; 1429 ret = -rte_errno; 1430 goto exit; 1431 } 1432 DRV_LOG(INFO, "PCI information matches for" 1433 " slave %d bonding device \"%s\"", 1434 bd, ibv_list[ret]->name); 1435 ibv_match[nd++] = ibv_list[ret]; 1436 break; 1437 } 1438 if (mlx5_dev_to_pci_addr 1439 (ibv_list[ret]->ibdev_path, &pci_addr)) 1440 continue; 1441 if (pci_dev->addr.domain != pci_addr.domain || 1442 pci_dev->addr.bus != pci_addr.bus || 1443 pci_dev->addr.devid != pci_addr.devid || 1444 pci_dev->addr.function != pci_addr.function) 1445 continue; 1446 DRV_LOG(INFO, "PCI information matches for device \"%s\"", 1447 ibv_list[ret]->name); 1448 ibv_match[nd++] = ibv_list[ret]; 1449 } 1450 ibv_match[nd] = NULL; 1451 if (!nd) { 1452 /* No device matches, just complain and bail out. */ 1453 DRV_LOG(WARNING, 1454 "no Verbs device matches PCI device " PCI_PRI_FMT "," 1455 " are kernel drivers loaded?", 1456 pci_dev->addr.domain, pci_dev->addr.bus, 1457 pci_dev->addr.devid, pci_dev->addr.function); 1458 rte_errno = ENOENT; 1459 ret = -rte_errno; 1460 goto exit; 1461 } 1462 if (nd == 1) { 1463 /* 1464 * Found single matching device may have multiple ports. 1465 * Each port may be representor, we have to check the port 1466 * number and check the representors existence. 1467 */ 1468 if (nl_rdma >= 0) 1469 np = mlx5_nl_portnum(nl_rdma, ibv_match[0]->name); 1470 if (!np) 1471 DRV_LOG(WARNING, "can not get IB device \"%s\"" 1472 " ports number", ibv_match[0]->name); 1473 if (bd >= 0 && !np) { 1474 DRV_LOG(ERR, "can not get ports" 1475 " for bonding device"); 1476 rte_errno = ENOENT; 1477 ret = -rte_errno; 1478 goto exit; 1479 } 1480 } 1481 #ifndef HAVE_MLX5DV_DR_DEVX_PORT 1482 if (bd >= 0) { 1483 /* 1484 * This may happen if there is VF LAG kernel support and 1485 * application is compiled with older rdma_core library. 1486 */ 1487 DRV_LOG(ERR, 1488 "No kernel/verbs support for VF LAG bonding found."); 1489 rte_errno = ENOTSUP; 1490 ret = -rte_errno; 1491 goto exit; 1492 } 1493 #endif 1494 /* 1495 * Now we can determine the maximal 1496 * amount of devices to be spawned. 1497 */ 1498 list = rte_zmalloc("device spawn data", 1499 sizeof(struct mlx5_dev_spawn_data) * 1500 (np ? np : nd), 1501 RTE_CACHE_LINE_SIZE); 1502 if (!list) { 1503 DRV_LOG(ERR, "spawn data array allocation failure"); 1504 rte_errno = ENOMEM; 1505 ret = -rte_errno; 1506 goto exit; 1507 } 1508 if (bd >= 0 || np > 1) { 1509 /* 1510 * Single IB device with multiple ports found, 1511 * it may be E-Switch master device and representors. 1512 * We have to perform identification through the ports. 1513 */ 1514 MLX5_ASSERT(nl_rdma >= 0); 1515 MLX5_ASSERT(ns == 0); 1516 MLX5_ASSERT(nd == 1); 1517 MLX5_ASSERT(np); 1518 for (i = 1; i <= np; ++i) { 1519 list[ns].max_port = np; 1520 list[ns].phys_port = i; 1521 list[ns].phys_dev = ibv_match[0]; 1522 list[ns].eth_dev = NULL; 1523 list[ns].pci_dev = pci_dev; 1524 list[ns].pf_bond = bd; 1525 list[ns].ifindex = mlx5_nl_ifindex 1526 (nl_rdma, 1527 mlx5_os_get_dev_device_name 1528 (list[ns].phys_dev), i); 1529 if (!list[ns].ifindex) { 1530 /* 1531 * No network interface index found for the 1532 * specified port, it means there is no 1533 * representor on this port. It's OK, 1534 * there can be disabled ports, for example 1535 * if sriov_numvfs < sriov_totalvfs. 
1536 */ 1537 continue; 1538 } 1539 ret = -1; 1540 if (nl_route >= 0) 1541 ret = mlx5_nl_switch_info 1542 (nl_route, 1543 list[ns].ifindex, 1544 &list[ns].info); 1545 if (ret || (!list[ns].info.representor && 1546 !list[ns].info.master)) { 1547 /* 1548 * We failed to recognize representors with 1549 * Netlink, let's try to perform the task 1550 * with sysfs. 1551 */ 1552 ret = mlx5_sysfs_switch_info 1553 (list[ns].ifindex, 1554 &list[ns].info); 1555 } 1556 if (!ret && bd >= 0) { 1557 switch (list[ns].info.name_type) { 1558 case MLX5_PHYS_PORT_NAME_TYPE_UPLINK: 1559 if (list[ns].info.port_name == bd) 1560 ns++; 1561 break; 1562 case MLX5_PHYS_PORT_NAME_TYPE_PFVF: 1563 if (list[ns].info.pf_num == bd) 1564 ns++; 1565 break; 1566 default: 1567 break; 1568 } 1569 continue; 1570 } 1571 if (!ret && (list[ns].info.representor ^ 1572 list[ns].info.master)) 1573 ns++; 1574 } 1575 if (!ns) { 1576 DRV_LOG(ERR, 1577 "unable to recognize master/representors" 1578 " on the IB device with multiple ports"); 1579 rte_errno = ENOENT; 1580 ret = -rte_errno; 1581 goto exit; 1582 } 1583 } else { 1584 /* 1585 * The existence of several matching entries (nd > 1) means 1586 * port representors have been instantiated. No existing Verbs 1587 * call nor sysfs entries can tell them apart, this can only 1588 * be done through Netlink calls assuming kernel drivers are 1589 * recent enough to support them. 1590 * 1591 * In the event of identification failure through Netlink, 1592 * try again through sysfs, then: 1593 * 1594 * 1. A single IB device matches (nd == 1) with single 1595 * port (np=0/1) and is not a representor, assume 1596 * no switch support. 1597 * 1598 * 2. Otherwise no safe assumptions can be made; 1599 * complain louder and bail out. 1600 */ 1601 for (i = 0; i != nd; ++i) { 1602 memset(&list[ns].info, 0, sizeof(list[ns].info)); 1603 list[ns].max_port = 1; 1604 list[ns].phys_port = 1; 1605 list[ns].phys_dev = ibv_match[i]; 1606 list[ns].eth_dev = NULL; 1607 list[ns].pci_dev = pci_dev; 1608 list[ns].pf_bond = -1; 1609 list[ns].ifindex = 0; 1610 if (nl_rdma >= 0) 1611 list[ns].ifindex = mlx5_nl_ifindex 1612 (nl_rdma, 1613 mlx5_os_get_dev_device_name 1614 (list[ns].phys_dev), 1); 1615 if (!list[ns].ifindex) { 1616 char ifname[IF_NAMESIZE]; 1617 1618 /* 1619 * Netlink failed, it may happen with old 1620 * ib_core kernel driver (before 4.16). 1621 * We can assume there is old driver because 1622 * here we are processing single ports IB 1623 * devices. Let's try sysfs to retrieve 1624 * the ifindex. The method works for 1625 * master device only. 1626 */ 1627 if (nd > 1) { 1628 /* 1629 * Multiple devices found, assume 1630 * representors, can not distinguish 1631 * master/representor and retrieve 1632 * ifindex via sysfs. 1633 */ 1634 continue; 1635 } 1636 ret = mlx5_get_master_ifname 1637 (ibv_match[i]->ibdev_path, &ifname); 1638 if (!ret) 1639 list[ns].ifindex = 1640 if_nametoindex(ifname); 1641 if (!list[ns].ifindex) { 1642 /* 1643 * No network interface index found 1644 * for the specified device, it means 1645 * there it is neither representor 1646 * nor master. 1647 */ 1648 continue; 1649 } 1650 } 1651 ret = -1; 1652 if (nl_route >= 0) 1653 ret = mlx5_nl_switch_info 1654 (nl_route, 1655 list[ns].ifindex, 1656 &list[ns].info); 1657 if (ret || (!list[ns].info.representor && 1658 !list[ns].info.master)) { 1659 /* 1660 * We failed to recognize representors with 1661 * Netlink, let's try to perform the task 1662 * with sysfs. 
1663 */ 1664 ret = mlx5_sysfs_switch_info 1665 (list[ns].ifindex, 1666 &list[ns].info); 1667 } 1668 if (!ret && (list[ns].info.representor ^ 1669 list[ns].info.master)) { 1670 ns++; 1671 } else if ((nd == 1) && 1672 !list[ns].info.representor && 1673 !list[ns].info.master) { 1674 /* 1675 * Single IB device with 1676 * one physical port and 1677 * attached network device. 1678 * May be SRIOV is not enabled 1679 * or there is no representors. 1680 */ 1681 DRV_LOG(INFO, "no E-Switch support detected"); 1682 ns++; 1683 break; 1684 } 1685 } 1686 if (!ns) { 1687 DRV_LOG(ERR, 1688 "unable to recognize master/representors" 1689 " on the multiple IB devices"); 1690 rte_errno = ENOENT; 1691 ret = -rte_errno; 1692 goto exit; 1693 } 1694 } 1695 MLX5_ASSERT(ns); 1696 /* 1697 * Sort list to probe devices in natural order for users convenience 1698 * (i.e. master first, then representors from lowest to highest ID). 1699 */ 1700 qsort(list, ns, sizeof(*list), mlx5_dev_spawn_data_cmp); 1701 /* Default configuration. */ 1702 dev_config = (struct mlx5_dev_config){ 1703 .hw_padding = 0, 1704 .mps = MLX5_ARG_UNSET, 1705 .dbnc = MLX5_ARG_UNSET, 1706 .rx_vec_en = 1, 1707 .txq_inline_max = MLX5_ARG_UNSET, 1708 .txq_inline_min = MLX5_ARG_UNSET, 1709 .txq_inline_mpw = MLX5_ARG_UNSET, 1710 .txqs_inline = MLX5_ARG_UNSET, 1711 .vf_nl_en = 1, 1712 .mr_ext_memseg_en = 1, 1713 .mprq = { 1714 .enabled = 0, /* Disabled by default. */ 1715 .stride_num_n = 0, 1716 .stride_size_n = 0, 1717 .max_memcpy_len = MLX5_MPRQ_MEMCPY_DEFAULT_LEN, 1718 .min_rxqs_num = MLX5_MPRQ_MIN_RXQS, 1719 }, 1720 .dv_esw_en = 1, 1721 .dv_flow_en = 1, 1722 .log_hp_size = MLX5_ARG_UNSET, 1723 }; 1724 /* Device specific configuration. */ 1725 switch (pci_dev->id.device_id) { 1726 case PCI_DEVICE_ID_MELLANOX_CONNECTX4VF: 1727 case PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF: 1728 case PCI_DEVICE_ID_MELLANOX_CONNECTX5VF: 1729 case PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF: 1730 case PCI_DEVICE_ID_MELLANOX_CONNECTX5BFVF: 1731 case PCI_DEVICE_ID_MELLANOX_CONNECTX6VF: 1732 case PCI_DEVICE_ID_MELLANOX_CONNECTX6DXVF: 1733 dev_config.vf = 1; 1734 break; 1735 default: 1736 break; 1737 } 1738 for (i = 0; i != ns; ++i) { 1739 uint32_t restore; 1740 1741 list[i].eth_dev = mlx5_dev_spawn(&pci_dev->device, 1742 &list[i], 1743 dev_config); 1744 if (!list[i].eth_dev) { 1745 if (rte_errno != EBUSY && rte_errno != EEXIST) 1746 break; 1747 /* Device is disabled or already spawned. Ignore it. */ 1748 continue; 1749 } 1750 restore = list[i].eth_dev->data->dev_flags; 1751 rte_eth_copy_pci_info(list[i].eth_dev, pci_dev); 1752 /* Restore non-PCI flags cleared by the above call. */ 1753 list[i].eth_dev->data->dev_flags |= restore; 1754 rte_eth_dev_probing_finish(list[i].eth_dev); 1755 } 1756 if (i != ns) { 1757 DRV_LOG(ERR, 1758 "probe of PCI device " PCI_PRI_FMT " aborted after" 1759 " encountering an error: %s", 1760 pci_dev->addr.domain, pci_dev->addr.bus, 1761 pci_dev->addr.devid, pci_dev->addr.function, 1762 strerror(rte_errno)); 1763 ret = -rte_errno; 1764 /* Roll back. */ 1765 while (i--) { 1766 if (!list[i].eth_dev) 1767 continue; 1768 mlx5_dev_close(list[i].eth_dev); 1769 /* mac_addrs must not be freed because in dev_private */ 1770 list[i].eth_dev->data->mac_addrs = NULL; 1771 claim_zero(rte_eth_dev_release_port(list[i].eth_dev)); 1772 } 1773 /* Restore original error. 
		rte_errno = -ret;
	} else {
		ret = 0;
	}
exit:
	/*
	 * Do the routine cleanup:
	 * - close opened Netlink sockets
	 * - free allocated spawn data array
	 * - free the Infiniband device list
	 */
	if (nl_rdma >= 0)
		close(nl_rdma);
	if (nl_route >= 0)
		close(nl_route);
	if (list)
		rte_free(list);
	MLX5_ASSERT(ibv_list);
	mlx5_glue->free_device_list(ibv_list);
	return ret;
}

static int
mlx5_config_doorbell_mapping_env(const struct mlx5_dev_config *config)
{
	char *env;
	int value;

	MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
	/* Save the current environment variable state to restore later. */
	env = getenv(MLX5_SHUT_UP_BF);
	value = env ? !!strcmp(env, "0") : MLX5_ARG_UNSET;
	if (config->dbnc == MLX5_ARG_UNSET)
		setenv(MLX5_SHUT_UP_BF, MLX5_SHUT_UP_BF_DEFAULT, 1);
	else
		setenv(MLX5_SHUT_UP_BF,
		       config->dbnc == MLX5_TXDB_NCACHED ? "1" : "0", 1);
	return value;
}

static void
mlx5_restore_doorbell_mapping_env(int value)
{
	MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
	/* Restore the original environment variable state. */
	if (value == MLX5_ARG_UNSET)
		unsetenv(MLX5_SHUT_UP_BF);
	else
		setenv(MLX5_SHUT_UP_BF, value ? "1" : "0", 1);
}
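/*
 * Illustrative sketch (compiled out): the two helpers above bracket
 * device creation, so MLX5_SHUT_UP_BF is only overridden while
 * rdma-core reads it; mlx5_os_open_device() below follows exactly this
 * pattern (surrounding context elided).
 */
#if 0
	int dbmap_env = mlx5_config_doorbell_mapping_env(config);

	sh->ctx = mlx5_glue->dv_open_device(spawn->phys_dev);
	mlx5_restore_doorbell_mapping_env(dbmap_env);
#endif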
/**
 * Function API to open an IB device.
 *
 * This function calls the Linux glue APIs to open a device.
 *
 * @param[in] spawn
 *   Pointer to the IB device attributes (name, port, etc).
 * @param[in] config
 *   Pointer to device configuration structure.
 * @param[out] sh
 *   Pointer to shared context structure.
 *
 * @return
 *   0 on success, a positive error value otherwise.
 */
int
mlx5_os_open_device(const struct mlx5_dev_spawn_data *spawn,
		    const struct mlx5_dev_config *config,
		    struct mlx5_dev_ctx_shared *sh)
{
	int dbmap_env;
	int err = 0;
	/*
	 * Configure the environment variable "MLX5_SHUT_UP_BF"
	 * before the device creation. The rdma_core library
	 * checks the variable at device creation and
	 * stores the result internally.
	 */
	dbmap_env = mlx5_config_doorbell_mapping_env(config);
	/* Try to open the IB device with DV first, then plain Verbs. */
	errno = 0;
	sh->ctx = mlx5_glue->dv_open_device(spawn->phys_dev);
	if (sh->ctx) {
		sh->devx = 1;
		DRV_LOG(DEBUG, "DevX is supported");
		/* The device is created, no need for environment. */
		mlx5_restore_doorbell_mapping_env(dbmap_env);
	} else {
		/* The environment variable is still configured. */
		sh->ctx = mlx5_glue->open_device(spawn->phys_dev);
		err = errno ? errno : ENODEV;
		/*
		 * The environment variable is not needed anymore,
		 * all device creation attempts are completed.
		 */
		mlx5_restore_doorbell_mapping_env(dbmap_env);
		if (!sh->ctx)
			return err;
		DRV_LOG(DEBUG, "DevX is NOT supported");
		err = 0;
	}
	return err;
}

/**
 * Install the shared asynchronous device events handler.
 * This function is implemented to support event sharing
 * between multiple ports of a single IB device.
 *
 * @param sh
 *   Pointer to mlx5_dev_ctx_shared object.
 */
void
mlx5_os_dev_shared_handler_install(struct mlx5_dev_ctx_shared *sh)
{
	int ret;
	int flags;

	sh->intr_handle.fd = -1;
	flags = fcntl(((struct ibv_context *)sh->ctx)->async_fd, F_GETFL);
	ret = fcntl(((struct ibv_context *)sh->ctx)->async_fd,
		    F_SETFL, flags | O_NONBLOCK);
	if (ret) {
		DRV_LOG(INFO, "failed to change the async event queue"
			" file descriptor");
	} else {
		sh->intr_handle.fd = ((struct ibv_context *)sh->ctx)->async_fd;
		sh->intr_handle.type = RTE_INTR_HANDLE_EXT;
		if (rte_intr_callback_register(&sh->intr_handle,
					mlx5_dev_interrupt_handler, sh)) {
			DRV_LOG(INFO, "Failed to install the shared"
				" interrupt handler.");
			sh->intr_handle.fd = -1;
		}
	}
	if (sh->devx) {
#ifdef HAVE_IBV_DEVX_ASYNC
		sh->intr_handle_devx.fd = -1;
		sh->devx_comp =
			(void *)mlx5_glue->devx_create_cmd_comp(sh->ctx);
		struct mlx5dv_devx_cmd_comp *devx_comp = sh->devx_comp;
		if (!devx_comp) {
			DRV_LOG(INFO, "failed to create the DevX command"
				" completion channel.");
			return;
		}
		flags = fcntl(devx_comp->fd, F_GETFL);
		ret = fcntl(devx_comp->fd, F_SETFL, flags | O_NONBLOCK);
		if (ret) {
			DRV_LOG(INFO, "failed to change the DevX completion"
				" channel file descriptor");
			return;
		}
		sh->intr_handle_devx.fd = devx_comp->fd;
		sh->intr_handle_devx.type = RTE_INTR_HANDLE_EXT;
		if (rte_intr_callback_register(&sh->intr_handle_devx,
					mlx5_dev_interrupt_handler_devx, sh)) {
			DRV_LOG(INFO, "Failed to install the DevX shared"
				" interrupt handler.");
			sh->intr_handle_devx.fd = -1;
		}
#endif /* HAVE_IBV_DEVX_ASYNC */
	}
}
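/*
 * Sequencing sketch (illustrative, loosely following the shared context
 * creation path; the "device_attr" field name is an assumption here):
 * open the device, query its attributes, then install the shared event
 * handler; mlx5_os_dev_shared_handler_uninstall() below is the matching
 * teardown.
 *
 *	err = mlx5_os_open_device(spawn, config, sh);
 *	if (err)
 *		return err;
 *	err = mlx5_os_get_dev_attr(sh->ctx, &sh->device_attr);
 *	...
 *	mlx5_os_dev_shared_handler_install(sh);
 */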
/**
 * Uninstall the shared asynchronous device events handler.
 * This function is implemented to support event sharing
 * between multiple ports of a single IB device.
 *
 * @param sh
 *   Pointer to mlx5_dev_ctx_shared object.
 */
void
mlx5_os_dev_shared_handler_uninstall(struct mlx5_dev_ctx_shared *sh)
{
	if (sh->intr_handle.fd >= 0)
		mlx5_intr_callback_unregister(&sh->intr_handle,
					      mlx5_dev_interrupt_handler, sh);
#ifdef HAVE_IBV_DEVX_ASYNC
	if (sh->intr_handle_devx.fd >= 0)
		rte_intr_callback_unregister(&sh->intr_handle_devx,
				  mlx5_dev_interrupt_handler_devx, sh);
	if (sh->devx_comp)
		mlx5_glue->devx_destroy_cmd_comp(sh->devx_comp);
#endif
}

const struct eth_dev_ops mlx5_os_dev_ops = {
	.dev_configure = mlx5_dev_configure,
	.dev_start = mlx5_dev_start,
	.dev_stop = mlx5_dev_stop,
	.dev_set_link_down = mlx5_set_link_down,
	.dev_set_link_up = mlx5_set_link_up,
	.dev_close = mlx5_dev_close,
	.promiscuous_enable = mlx5_promiscuous_enable,
	.promiscuous_disable = mlx5_promiscuous_disable,
	.allmulticast_enable = mlx5_allmulticast_enable,
	.allmulticast_disable = mlx5_allmulticast_disable,
	.link_update = mlx5_link_update,
	.stats_get = mlx5_stats_get,
	.stats_reset = mlx5_stats_reset,
	.xstats_get = mlx5_xstats_get,
	.xstats_reset = mlx5_xstats_reset,
	.xstats_get_names = mlx5_xstats_get_names,
	.fw_version_get = mlx5_fw_version_get,
	.dev_infos_get = mlx5_dev_infos_get,
	.read_clock = mlx5_read_clock,
	.dev_supported_ptypes_get = mlx5_dev_supported_ptypes_get,
	.vlan_filter_set = mlx5_vlan_filter_set,
	.rx_queue_setup = mlx5_rx_queue_setup,
	.rx_hairpin_queue_setup = mlx5_rx_hairpin_queue_setup,
	.tx_queue_setup = mlx5_tx_queue_setup,
	.tx_hairpin_queue_setup = mlx5_tx_hairpin_queue_setup,
	.rx_queue_release = mlx5_rx_queue_release,
	.tx_queue_release = mlx5_tx_queue_release,
	.flow_ctrl_get = mlx5_dev_get_flow_ctrl,
	.flow_ctrl_set = mlx5_dev_set_flow_ctrl,
	.mac_addr_remove = mlx5_mac_addr_remove,
	.mac_addr_add = mlx5_mac_addr_add,
	.mac_addr_set = mlx5_mac_addr_set,
	.set_mc_addr_list = mlx5_set_mc_addr_list,
	.mtu_set = mlx5_dev_set_mtu,
	.vlan_strip_queue_set = mlx5_vlan_strip_queue_set,
	.vlan_offload_set = mlx5_vlan_offload_set,
	.reta_update = mlx5_dev_rss_reta_update,
	.reta_query = mlx5_dev_rss_reta_query,
	.rss_hash_update = mlx5_rss_hash_update,
	.rss_hash_conf_get = mlx5_rss_hash_conf_get,
	.filter_ctrl = mlx5_dev_filter_ctrl,
	.rx_descriptor_status = mlx5_rx_descriptor_status,
	.tx_descriptor_status = mlx5_tx_descriptor_status,
	.rxq_info_get = mlx5_rxq_info_get,
	.txq_info_get = mlx5_txq_info_get,
	.rx_burst_mode_get = mlx5_rx_burst_mode_get,
	.tx_burst_mode_get = mlx5_tx_burst_mode_get,
	.rx_queue_count = mlx5_rx_queue_count,
	.rx_queue_intr_enable = mlx5_rx_intr_enable,
	.rx_queue_intr_disable = mlx5_rx_intr_disable,
	.is_removed = mlx5_is_removed,
	.udp_tunnel_port_add  = mlx5_udp_tunnel_port_add,
	.get_module_info = mlx5_get_module_info,
	.get_module_eeprom = mlx5_get_module_eeprom,
	.hairpin_cap_get = mlx5_hairpin_cap_get,
	.mtr_ops_get = mlx5_flow_meter_ops_get,
};
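/*
 * Wiring sketch (an assumption about the probing path, not a definitive
 * statement): the reduced table below is expected to be selected when a
 * port is attached from a secondary process, roughly:
 *
 *	if (rte_eal_process_type() == RTE_PROC_SECONDARY)
 *		eth_dev->dev_ops = &mlx5_os_dev_sec_ops;
 */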
/* Available operations in a secondary process. */
const struct eth_dev_ops mlx5_os_dev_sec_ops = {
	.stats_get = mlx5_stats_get,
	.stats_reset = mlx5_stats_reset,
	.xstats_get = mlx5_xstats_get,
	.xstats_reset = mlx5_xstats_reset,
	.xstats_get_names = mlx5_xstats_get_names,
	.fw_version_get = mlx5_fw_version_get,
	.dev_infos_get = mlx5_dev_infos_get,
	.rx_descriptor_status = mlx5_rx_descriptor_status,
	.tx_descriptor_status = mlx5_tx_descriptor_status,
	.rxq_info_get = mlx5_rxq_info_get,
	.txq_info_get = mlx5_txq_info_get,
	.rx_burst_mode_get = mlx5_rx_burst_mode_get,
	.tx_burst_mode_get = mlx5_tx_burst_mode_get,
	.get_module_info = mlx5_get_module_info,
	.get_module_eeprom = mlx5_get_module_eeprom,
};

/* Available operations in flow isolated mode. */
const struct eth_dev_ops mlx5_os_dev_ops_isolate = {
	.dev_configure = mlx5_dev_configure,
	.dev_start = mlx5_dev_start,
	.dev_stop = mlx5_dev_stop,
	.dev_set_link_down = mlx5_set_link_down,
	.dev_set_link_up = mlx5_set_link_up,
	.dev_close = mlx5_dev_close,
	.promiscuous_enable = mlx5_promiscuous_enable,
	.promiscuous_disable = mlx5_promiscuous_disable,
	.allmulticast_enable = mlx5_allmulticast_enable,
	.allmulticast_disable = mlx5_allmulticast_disable,
	.link_update = mlx5_link_update,
	.stats_get = mlx5_stats_get,
	.stats_reset = mlx5_stats_reset,
	.xstats_get = mlx5_xstats_get,
	.xstats_reset = mlx5_xstats_reset,
	.xstats_get_names = mlx5_xstats_get_names,
	.fw_version_get = mlx5_fw_version_get,
	.dev_infos_get = mlx5_dev_infos_get,
	.dev_supported_ptypes_get = mlx5_dev_supported_ptypes_get,
	.vlan_filter_set = mlx5_vlan_filter_set,
	.rx_queue_setup = mlx5_rx_queue_setup,
	.rx_hairpin_queue_setup = mlx5_rx_hairpin_queue_setup,
	.tx_queue_setup = mlx5_tx_queue_setup,
	.tx_hairpin_queue_setup = mlx5_tx_hairpin_queue_setup,
	.rx_queue_release = mlx5_rx_queue_release,
	.tx_queue_release = mlx5_tx_queue_release,
	.flow_ctrl_get = mlx5_dev_get_flow_ctrl,
	.flow_ctrl_set = mlx5_dev_set_flow_ctrl,
	.mac_addr_remove = mlx5_mac_addr_remove,
	.mac_addr_add = mlx5_mac_addr_add,
	.mac_addr_set = mlx5_mac_addr_set,
	.set_mc_addr_list = mlx5_set_mc_addr_list,
	.mtu_set = mlx5_dev_set_mtu,
	.vlan_strip_queue_set = mlx5_vlan_strip_queue_set,
	.vlan_offload_set = mlx5_vlan_offload_set,
	.filter_ctrl = mlx5_dev_filter_ctrl,
	.rx_descriptor_status = mlx5_rx_descriptor_status,
	.tx_descriptor_status = mlx5_tx_descriptor_status,
	.rxq_info_get = mlx5_rxq_info_get,
	.txq_info_get = mlx5_txq_info_get,
	.rx_burst_mode_get = mlx5_rx_burst_mode_get,
	.tx_burst_mode_get = mlx5_tx_burst_mode_get,
	.rx_queue_intr_enable = mlx5_rx_intr_enable,
	.rx_queue_intr_disable = mlx5_rx_intr_disable,
	.is_removed = mlx5_is_removed,
	.get_module_info = mlx5_get_module_info,
	.get_module_eeprom = mlx5_get_module_eeprom,
	.hairpin_cap_get = mlx5_hairpin_cap_get,
	.mtr_ops_get = mlx5_flow_meter_ops_get,
};
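/*
 * Selection sketch for the isolated table (illustrative; the switch is
 * assumed to happen in the rte_flow isolate handler when isolated mode
 * is toggled on a port):
 *
 *	dev->dev_ops = enable ? &mlx5_os_dev_ops_isolate :
 *				&mlx5_os_dev_ops;
 */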