/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright 2015 6WIND S.A.
 * Copyright 2020 Mellanox Technologies, Ltd
 */

#include <stddef.h>
#include <unistd.h>
#include <string.h>
#include <stdint.h>
#include <stdlib.h>
#include <errno.h>
#include <net/if.h>
#include <linux/rtnetlink.h>
#include <linux/sockios.h>
#include <linux/ethtool.h>
#include <fcntl.h>

/* Verbs header. */
/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
#ifdef PEDANTIC
#pragma GCC diagnostic ignored "-Wpedantic"
#endif
#include <infiniband/verbs.h>
#ifdef PEDANTIC
#pragma GCC diagnostic error "-Wpedantic"
#endif

#include <rte_malloc.h>
#include <rte_ethdev_driver.h>
#include <rte_ethdev_pci.h>
#include <rte_pci.h>
#include <rte_bus_pci.h>
#include <rte_common.h>
#include <rte_kvargs.h>
#include <rte_rwlock.h>
#include <rte_spinlock.h>
#include <rte_string_fns.h>
#include <rte_alarm.h>
#include <rte_eal_paging.h>

#include <mlx5_glue.h>
#include <mlx5_devx_cmds.h>
#include <mlx5_common.h>
#include <mlx5_common_mp.h>
#include <mlx5_common_mr.h>
#include <mlx5_malloc.h>

#include "mlx5_defs.h"
#include "mlx5.h"
#include "mlx5_common_os.h"
#include "mlx5_utils.h"
#include "mlx5_rxtx.h"
#include "mlx5_autoconf.h"
#include "mlx5_mr.h"
#include "mlx5_flow.h"
#include "rte_pmd_mlx5.h"
#include "mlx5_verbs.h"

#define MLX5_TAGS_HLIST_ARRAY_SIZE 8192

#ifndef HAVE_IBV_MLX5_MOD_MPW
#define MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED (1 << 2)
#define MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW (1 << 3)
#endif

#ifndef HAVE_IBV_MLX5_MOD_CQE_128B_COMP
#define MLX5DV_CONTEXT_FLAGS_CQE_128B_COMP (1 << 4)
#endif

/**
 * Get MAC address by querying netdevice.
 *
 * @param[in] dev
 *   Pointer to Ethernet device.
 * @param[out] mac
 *   MAC address output buffer.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
static int
mlx5_get_mac(struct rte_eth_dev *dev, uint8_t (*mac)[RTE_ETHER_ADDR_LEN])
{
	struct ifreq request;
	int ret;

	ret = mlx5_ifreq(dev, SIOCGIFHWADDR, &request);
	if (ret)
		return ret;
	memcpy(mac, request.ifr_hwaddr.sa_data, RTE_ETHER_ADDR_LEN);
	return 0;
}
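
/*
 * Illustrative sketch only: mlx5_ifreq() is expected to wrap the standard
 * Linux SIOCGIFHWADDR ioctl on the port's netdevice, roughly equivalent
 * to the following (hypothetical, not part of the driver):
 *
 *	struct ifreq ifr;
 *	int fd = socket(AF_INET, SOCK_DGRAM, 0);
 *
 *	strlcpy(ifr.ifr_name, "eth0", sizeof(ifr.ifr_name));
 *	if (fd >= 0 && ioctl(fd, SIOCGIFHWADDR, &ifr) == 0)
 *		... MAC bytes are in ifr.ifr_hwaddr.sa_data ...
 */
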
/**
 * Get mlx5 device attributes. The glue function query_device_ex() is called
 * with out parameter of type 'struct ibv_device_attr_ex *'. Then fill in mlx5
 * device attributes from the glue out parameter.
 *
 * @param ctx
 *   Pointer to ibv context.
 *
 * @param device_attr
 *   Pointer to mlx5 device attributes.
 *
 * @return
 *   0 on success, non-zero error number otherwise.
 */
int
mlx5_os_get_dev_attr(void *ctx, struct mlx5_dev_attr *device_attr)
{
	int err;
	struct ibv_device_attr_ex attr_ex;

	memset(device_attr, 0, sizeof(*device_attr));
	err = mlx5_glue->query_device_ex(ctx, NULL, &attr_ex);
	if (err)
		return err;

	device_attr->device_cap_flags_ex = attr_ex.device_cap_flags_ex;
	device_attr->max_qp_wr = attr_ex.orig_attr.max_qp_wr;
	device_attr->max_sge = attr_ex.orig_attr.max_sge;
	device_attr->max_cq = attr_ex.orig_attr.max_cq;
	device_attr->max_qp = attr_ex.orig_attr.max_qp;
	device_attr->raw_packet_caps = attr_ex.raw_packet_caps;
	device_attr->max_rwq_indirection_table_size =
		attr_ex.rss_caps.max_rwq_indirection_table_size;
	device_attr->max_tso = attr_ex.tso_caps.max_tso;
	device_attr->tso_supported_qpts = attr_ex.tso_caps.supported_qpts;

	struct mlx5dv_context dv_attr = { .comp_mask = 0 };
	err = mlx5_glue->dv_query_device(ctx, &dv_attr);
	if (err)
		return err;

	device_attr->flags = dv_attr.flags;
	device_attr->comp_mask = dv_attr.comp_mask;
#ifdef HAVE_IBV_MLX5_MOD_SWP
	device_attr->sw_parsing_offloads =
		dv_attr.sw_parsing_caps.sw_parsing_offloads;
#endif
	device_attr->min_single_stride_log_num_of_bytes =
		dv_attr.striding_rq_caps.min_single_stride_log_num_of_bytes;
	device_attr->max_single_stride_log_num_of_bytes =
		dv_attr.striding_rq_caps.max_single_stride_log_num_of_bytes;
	device_attr->min_single_wqe_log_num_of_strides =
		dv_attr.striding_rq_caps.min_single_wqe_log_num_of_strides;
	device_attr->max_single_wqe_log_num_of_strides =
		dv_attr.striding_rq_caps.max_single_wqe_log_num_of_strides;
	device_attr->stride_supported_qpts =
		dv_attr.striding_rq_caps.supported_qpts;
#ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT
	device_attr->tunnel_offloads_caps = dv_attr.tunnel_offloads_caps;
#endif

	return err;
}

/**
 * Verbs callback to allocate memory. This function should allocate the space
 * according to the size provided residing inside a huge page.
 * Please note that all allocation must respect the alignment from libmlx5
 * (i.e. currently rte_mem_page_size()).
 *
 * @param[in] size
 *   The size in bytes of the memory to allocate.
 * @param[in] data
 *   A pointer to the callback data.
 *
 * @return
 *   Allocated buffer, NULL otherwise and rte_errno is set.
 */
static void *
mlx5_alloc_verbs_buf(size_t size, void *data)
{
	struct mlx5_priv *priv = data;
	void *ret;
	unsigned int socket = SOCKET_ID_ANY;
	size_t alignment = rte_mem_page_size();
	if (alignment == (size_t)-1) {
		DRV_LOG(ERR, "Failed to get mem page size");
		rte_errno = ENOMEM;
		return NULL;
	}

	if (priv->verbs_alloc_ctx.type == MLX5_VERBS_ALLOC_TYPE_TX_QUEUE) {
		const struct mlx5_txq_ctrl *ctrl = priv->verbs_alloc_ctx.obj;

		socket = ctrl->socket;
	} else if (priv->verbs_alloc_ctx.type ==
		   MLX5_VERBS_ALLOC_TYPE_RX_QUEUE) {
		const struct mlx5_rxq_ctrl *ctrl = priv->verbs_alloc_ctx.obj;

		socket = ctrl->socket;
	}
	MLX5_ASSERT(data != NULL);
	ret = mlx5_malloc(0, size, alignment, socket);
	if (!ret && size)
		rte_errno = ENOMEM;
	return ret;
}
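
/*
 * Note: this allocator and mlx5_free_verbs_buf() below are handed over to
 * libmlx5 later in mlx5_dev_spawn() through the registration call (see the
 * actual invocation further down in this file):
 *
 *	mlx5_glue->dv_set_context_attr(sh->ctx,
 *			MLX5DV_CTX_ATTR_BUF_ALLOCATORS,
 *			(void *)((uintptr_t)&(struct mlx5dv_ctx_allocators){
 *				.alloc = &mlx5_alloc_verbs_buf,
 *				.free = &mlx5_free_verbs_buf,
 *				.data = priv,
 *			}));
 */
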
/**
 * Verbs callback to free memory.
 *
 * @param[in] ptr
 *   A pointer to the memory to free.
 * @param[in] data
 *   A pointer to the callback data.
 */
static void
mlx5_free_verbs_buf(void *ptr, void *data __rte_unused)
{
	MLX5_ASSERT(data != NULL);
	mlx5_free(ptr);
}

/**
 * Initialize DR related data within private structure.
 * Routine checks the reference counter and does actual
 * resources creation/initialization only if counter is zero.
 *
 * @param[in] priv
 *   Pointer to the private device data structure.
 *
 * @return
 *   Zero on success, positive error code otherwise.
 */
static int
mlx5_alloc_shared_dr(struct mlx5_priv *priv)
{
	struct mlx5_dev_ctx_shared *sh = priv->sh;
	char s[MLX5_HLIST_NAMESIZE];
	int err = 0;

	if (!sh->flow_tbls)
		err = mlx5_alloc_table_hash_list(priv);
	else
		DRV_LOG(DEBUG, "sh->flow_tbls[%p] already created, reuse\n",
			(void *)sh->flow_tbls);
	if (err)
		return err;
	/* Create tags hash list table. */
	snprintf(s, sizeof(s), "%s_tags", sh->ibdev_name);
	sh->tag_table = mlx5_hlist_create(s, MLX5_TAGS_HLIST_ARRAY_SIZE);
	if (!sh->tag_table) {
		DRV_LOG(ERR, "tags hash list creation failed.");
		err = ENOMEM;
		goto error;
	}
#ifdef HAVE_MLX5DV_DR
	void *domain;

	if (sh->dv_refcnt) {
		/* Shared DV/DR structures are already initialized. */
		sh->dv_refcnt++;
		priv->dr_shared = 1;
		return 0;
	}
	/* Reference counter is zero, we should initialize structures. */
	domain = mlx5_glue->dr_create_domain(sh->ctx,
					     MLX5DV_DR_DOMAIN_TYPE_NIC_RX);
	if (!domain) {
		DRV_LOG(ERR, "ingress mlx5dv_dr_create_domain failed");
		err = errno;
		goto error;
	}
	sh->rx_domain = domain;
	domain = mlx5_glue->dr_create_domain(sh->ctx,
					     MLX5DV_DR_DOMAIN_TYPE_NIC_TX);
	if (!domain) {
		DRV_LOG(ERR, "egress mlx5dv_dr_create_domain failed");
		err = errno;
		goto error;
	}
	pthread_mutex_init(&sh->dv_mutex, NULL);
	sh->tx_domain = domain;
#ifdef HAVE_MLX5DV_DR_ESWITCH
	if (priv->config.dv_esw_en) {
		domain = mlx5_glue->dr_create_domain
			(sh->ctx, MLX5DV_DR_DOMAIN_TYPE_FDB);
		if (!domain) {
			DRV_LOG(ERR, "FDB mlx5dv_dr_create_domain failed");
			err = errno;
			goto error;
		}
		sh->fdb_domain = domain;
		sh->esw_drop_action = mlx5_glue->dr_create_flow_action_drop();
	}
#endif
	if (priv->config.reclaim_mode == MLX5_RCM_AGGR) {
		mlx5_glue->dr_reclaim_domain_memory(sh->rx_domain, 1);
		mlx5_glue->dr_reclaim_domain_memory(sh->tx_domain, 1);
		if (sh->fdb_domain)
			mlx5_glue->dr_reclaim_domain_memory(sh->fdb_domain, 1);
	}
	sh->pop_vlan_action = mlx5_glue->dr_create_flow_action_pop_vlan();
#endif /* HAVE_MLX5DV_DR */
	sh->dv_refcnt++;
	priv->dr_shared = 1;
	return 0;
error:
	/* Rollback the created objects. */
	if (sh->rx_domain) {
		mlx5_glue->dr_destroy_domain(sh->rx_domain);
		sh->rx_domain = NULL;
	}
	if (sh->tx_domain) {
		mlx5_glue->dr_destroy_domain(sh->tx_domain);
		sh->tx_domain = NULL;
	}
	if (sh->fdb_domain) {
		mlx5_glue->dr_destroy_domain(sh->fdb_domain);
		sh->fdb_domain = NULL;
	}
	if (sh->esw_drop_action) {
		mlx5_glue->destroy_flow_action(sh->esw_drop_action);
		sh->esw_drop_action = NULL;
	}
	if (sh->pop_vlan_action) {
		mlx5_glue->destroy_flow_action(sh->pop_vlan_action);
		sh->pop_vlan_action = NULL;
	}
	if (sh->tag_table) {
		/* Tags should be destroyed with flows before this point. */
		mlx5_hlist_destroy(sh->tag_table, NULL, NULL);
		sh->tag_table = NULL;
	}
	mlx5_free_table_hash_list(priv);
	return err;
}
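
/*
 * Note: mlx5_alloc_shared_dr() and mlx5_os_free_shared_dr() below must stay
 * balanced per port: sh->dv_refcnt guards the shared DV/DR objects, so the
 * domains are created by the first caller and destroyed by the last one.
 */
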
/**
 * Destroy DR related data within private structure.
 *
 * @param[in] priv
 *   Pointer to the private device data structure.
 */
void
mlx5_os_free_shared_dr(struct mlx5_priv *priv)
{
	struct mlx5_dev_ctx_shared *sh;

	if (!priv->dr_shared)
		return;
	priv->dr_shared = 0;
	sh = priv->sh;
	MLX5_ASSERT(sh);
#ifdef HAVE_MLX5DV_DR
	MLX5_ASSERT(sh->dv_refcnt);
	if (sh->dv_refcnt && --sh->dv_refcnt)
		return;
	if (sh->rx_domain) {
		mlx5_glue->dr_destroy_domain(sh->rx_domain);
		sh->rx_domain = NULL;
	}
	if (sh->tx_domain) {
		mlx5_glue->dr_destroy_domain(sh->tx_domain);
		sh->tx_domain = NULL;
	}
#ifdef HAVE_MLX5DV_DR_ESWITCH
	if (sh->fdb_domain) {
		mlx5_glue->dr_destroy_domain(sh->fdb_domain);
		sh->fdb_domain = NULL;
	}
	if (sh->esw_drop_action) {
		mlx5_glue->destroy_flow_action(sh->esw_drop_action);
		sh->esw_drop_action = NULL;
	}
#endif
	if (sh->pop_vlan_action) {
		mlx5_glue->destroy_flow_action(sh->pop_vlan_action);
		sh->pop_vlan_action = NULL;
	}
	pthread_mutex_destroy(&sh->dv_mutex);
#endif /* HAVE_MLX5DV_DR */
	if (sh->tag_table) {
		/* Tags should be destroyed with flows before this point. */
		mlx5_hlist_destroy(sh->tag_table, NULL, NULL);
		sh->tag_table = NULL;
	}
	mlx5_free_table_hash_list(priv);
}

/**
 * Spawn an Ethernet device from Verbs information.
 *
 * @param dpdk_dev
 *   Backing DPDK device.
 * @param spawn
 *   Verbs device parameters (name, port, switch_info) to spawn.
 * @param config
 *   Device configuration parameters.
 *
 * @return
 *   A valid Ethernet device object on success, NULL otherwise and rte_errno
 *   is set. The following errors are defined:
 *
 *   EBUSY: device is not supposed to be spawned.
 *   EEXIST: device is already spawned.
 */
static struct rte_eth_dev *
mlx5_dev_spawn(struct rte_device *dpdk_dev,
	       struct mlx5_dev_spawn_data *spawn,
	       struct mlx5_dev_config config)
{
	const struct mlx5_switch_info *switch_info = &spawn->info;
	struct mlx5_dev_ctx_shared *sh = NULL;
	struct ibv_port_attr port_attr;
	struct mlx5dv_context dv_attr = { .comp_mask = 0 };
	struct rte_eth_dev *eth_dev = NULL;
	struct mlx5_priv *priv = NULL;
	int err = 0;
	unsigned int hw_padding = 0;
	unsigned int mps;
	unsigned int cqe_comp;
	unsigned int cqe_pad = 0;
	unsigned int tunnel_en = 0;
	unsigned int mpls_en = 0;
	unsigned int swp = 0;
	unsigned int mprq = 0;
	unsigned int mprq_min_stride_size_n = 0;
	unsigned int mprq_max_stride_size_n = 0;
	unsigned int mprq_min_stride_num_n = 0;
	unsigned int mprq_max_stride_num_n = 0;
	struct rte_ether_addr mac;
	char name[RTE_ETH_NAME_MAX_LEN];
	int own_domain_id = 0;
	uint16_t port_id;
	unsigned int i;
#ifdef HAVE_MLX5DV_DR_DEVX_PORT
	struct mlx5dv_devx_port devx_port = { .comp_mask = 0 };
#endif

	/* Determine if this port representor is supposed to be spawned. */
	if (switch_info->representor && dpdk_dev->devargs) {
		struct rte_eth_devargs eth_da;

		err = rte_eth_devargs_parse(dpdk_dev->devargs->args, &eth_da);
		if (err) {
			rte_errno = -err;
			DRV_LOG(ERR, "failed to process device arguments: %s",
				strerror(rte_errno));
			return NULL;
		}
		for (i = 0; i < eth_da.nb_representor_ports; ++i)
			if (eth_da.representor_ports[i] ==
			    (uint16_t)switch_info->port_name)
				break;
		if (i == eth_da.nb_representor_ports) {
			rte_errno = EBUSY;
			return NULL;
		}
	}
	/* Build device name. */
	if (spawn->pf_bond < 0) {
		/* Single device. */
		if (!switch_info->representor)
			strlcpy(name, dpdk_dev->name, sizeof(name));
		else
			snprintf(name, sizeof(name), "%s_representor_%u",
				 dpdk_dev->name, switch_info->port_name);
	} else {
		/* Bonding device. */
		if (!switch_info->representor)
			snprintf(name, sizeof(name), "%s_%s",
				 dpdk_dev->name,
				 mlx5_os_get_dev_device_name(spawn->phys_dev));
		else
			snprintf(name, sizeof(name), "%s_%s_representor_%u",
				 dpdk_dev->name,
				 mlx5_os_get_dev_device_name(spawn->phys_dev),
				 switch_info->port_name);
	}
	/* Check if the device is already spawned. */
	if (rte_eth_dev_get_port_by_name(name, &port_id) == 0) {
		rte_errno = EEXIST;
		return NULL;
	}
	DRV_LOG(DEBUG, "naming Ethernet device \"%s\"", name);
	if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
		struct mlx5_mp_id mp_id;

		eth_dev = rte_eth_dev_attach_secondary(name);
		if (eth_dev == NULL) {
			DRV_LOG(ERR, "can not attach rte ethdev");
			rte_errno = ENOMEM;
			return NULL;
		}
		eth_dev->device = dpdk_dev;
		eth_dev->dev_ops = &mlx5_os_dev_sec_ops;
		err = mlx5_proc_priv_init(eth_dev);
		if (err)
			return NULL;
		mp_id.port_id = eth_dev->data->port_id;
		strlcpy(mp_id.name, MLX5_MP_NAME, RTE_MP_MAX_NAME_LEN);
		/* Receive command fd from primary process. */
		err = mlx5_mp_req_verbs_cmd_fd(&mp_id);
		if (err < 0)
			goto err_secondary;
		/* Remap UAR for Tx queues. */
		err = mlx5_tx_uar_init_secondary(eth_dev, err);
		if (err)
			goto err_secondary;
		/*
		 * Ethdev pointer is still required as input since
		 * the primary device is not accessible from the
		 * secondary process.
		 */
		eth_dev->rx_pkt_burst = mlx5_select_rx_function(eth_dev);
		eth_dev->tx_pkt_burst = mlx5_select_tx_function(eth_dev);
		return eth_dev;
err_secondary:
		mlx5_dev_close(eth_dev);
		return NULL;
	}
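	/*
	 * Note on the secondary-process path above: it only attaches to the
	 * existing port. It requests the Verbs command fd from the primary
	 * over the multi-process channel (mlx5_mp_req_verbs_cmd_fd() returns
	 * that fd on success, which is then fed to the UAR remapping),
	 * and selects the burst functions; all of the hardware probing
	 * below runs in the primary process only.
	 */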
	/*
	 * Some parameters ("tx_db_nc" in particular) are needed in
	 * advance to create the DV/Verbs device context. We process
	 * the devargs here to get them, and process the devargs again
	 * later to override some hardware settings.
	 */
	err = mlx5_args(&config, dpdk_dev->devargs);
	if (err) {
		err = rte_errno;
		DRV_LOG(ERR, "failed to process device arguments: %s",
			strerror(rte_errno));
		goto error;
	}
	mlx5_malloc_mem_select(config.sys_mem_en);
	sh = mlx5_alloc_shared_dev_ctx(spawn, &config);
	if (!sh)
		return NULL;
	config.devx = sh->devx;
#ifdef HAVE_MLX5DV_DR_ACTION_DEST_DEVX_TIR
	config.dest_tir = 1;
#endif
#ifdef HAVE_IBV_MLX5_MOD_SWP
	dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_SWP;
#endif
	/*
	 * Multi-packet send is supported by ConnectX-4 Lx PF as well
	 * as all ConnectX-5 devices.
	 */
#ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT
	dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS;
#endif
#ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT
	dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_STRIDING_RQ;
#endif
	mlx5_glue->dv_query_device(sh->ctx, &dv_attr);
	if (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED) {
		if (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW) {
			DRV_LOG(DEBUG, "enhanced MPW is supported");
			mps = MLX5_MPW_ENHANCED;
		} else {
			DRV_LOG(DEBUG, "MPW is supported");
			mps = MLX5_MPW;
		}
	} else {
		DRV_LOG(DEBUG, "MPW isn't supported");
		mps = MLX5_MPW_DISABLED;
	}
#ifdef HAVE_IBV_MLX5_MOD_SWP
	if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_SWP)
		swp = dv_attr.sw_parsing_caps.sw_parsing_offloads;
	DRV_LOG(DEBUG, "SWP support: %u", swp);
#endif
	config.swp = !!swp;
#ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT
	if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_STRIDING_RQ) {
		struct mlx5dv_striding_rq_caps mprq_caps =
			dv_attr.striding_rq_caps;

		DRV_LOG(DEBUG, "\tmin_single_stride_log_num_of_bytes: %d",
			mprq_caps.min_single_stride_log_num_of_bytes);
		DRV_LOG(DEBUG, "\tmax_single_stride_log_num_of_bytes: %d",
			mprq_caps.max_single_stride_log_num_of_bytes);
		DRV_LOG(DEBUG, "\tmin_single_wqe_log_num_of_strides: %d",
			mprq_caps.min_single_wqe_log_num_of_strides);
		DRV_LOG(DEBUG, "\tmax_single_wqe_log_num_of_strides: %d",
			mprq_caps.max_single_wqe_log_num_of_strides);
		DRV_LOG(DEBUG, "\tsupported_qpts: %d",
			mprq_caps.supported_qpts);
		DRV_LOG(DEBUG, "device supports Multi-Packet RQ");
		mprq = 1;
		mprq_min_stride_size_n =
			mprq_caps.min_single_stride_log_num_of_bytes;
		mprq_max_stride_size_n =
			mprq_caps.max_single_stride_log_num_of_bytes;
		mprq_min_stride_num_n =
			mprq_caps.min_single_wqe_log_num_of_strides;
		mprq_max_stride_num_n =
			mprq_caps.max_single_wqe_log_num_of_strides;
	}
#endif
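	/*
	 * Rx CQE compression note (inference, hedged): on 128-byte
	 * cache-line systems the CQEs are presumably handled on 128B
	 * boundaries, so compression is kept enabled below only when the
	 * device reports MLX5DV_CONTEXT_FLAGS_CQE_128B_COMP; on 64B
	 * cache-line systems it is enabled unconditionally here and may
	 * still be turned off later by the "rxq_cqe_comp_en" checks.
	 */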
"" : "not "); 629 #else 630 DRV_LOG(WARNING, "MPLS over GRE/UDP tunnel offloading disabled due to" 631 " old OFED/rdma-core version or firmware configuration"); 632 #endif 633 config.mpls_en = mpls_en; 634 /* Check port status. */ 635 err = mlx5_glue->query_port(sh->ctx, spawn->phys_port, &port_attr); 636 if (err) { 637 DRV_LOG(ERR, "port query failed: %s", strerror(err)); 638 goto error; 639 } 640 if (port_attr.link_layer != IBV_LINK_LAYER_ETHERNET) { 641 DRV_LOG(ERR, "port is not configured in Ethernet mode"); 642 err = EINVAL; 643 goto error; 644 } 645 if (port_attr.state != IBV_PORT_ACTIVE) 646 DRV_LOG(DEBUG, "port is not active: \"%s\" (%d)", 647 mlx5_glue->port_state_str(port_attr.state), 648 port_attr.state); 649 /* Allocate private eth device data. */ 650 priv = mlx5_malloc(MLX5_MEM_ZERO | MLX5_MEM_RTE, 651 sizeof(*priv), 652 RTE_CACHE_LINE_SIZE, SOCKET_ID_ANY); 653 if (priv == NULL) { 654 DRV_LOG(ERR, "priv allocation failure"); 655 err = ENOMEM; 656 goto error; 657 } 658 priv->sh = sh; 659 priv->dev_port = spawn->phys_port; 660 priv->pci_dev = spawn->pci_dev; 661 priv->mtu = RTE_ETHER_MTU; 662 priv->mp_id.port_id = port_id; 663 strlcpy(priv->mp_id.name, MLX5_MP_NAME, RTE_MP_MAX_NAME_LEN); 664 /* Some internal functions rely on Netlink sockets, open them now. */ 665 priv->nl_socket_rdma = mlx5_nl_init(NETLINK_RDMA); 666 priv->nl_socket_route = mlx5_nl_init(NETLINK_ROUTE); 667 priv->representor = !!switch_info->representor; 668 priv->master = !!switch_info->master; 669 priv->domain_id = RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID; 670 priv->vport_meta_tag = 0; 671 priv->vport_meta_mask = 0; 672 priv->pf_bond = spawn->pf_bond; 673 #ifdef HAVE_MLX5DV_DR_DEVX_PORT 674 /* 675 * The DevX port query API is implemented. E-Switch may use 676 * either vport or reg_c[0] metadata register to match on 677 * vport index. The engaged part of metadata register is 678 * defined by mask. 679 */ 680 if (switch_info->representor || switch_info->master) { 681 devx_port.comp_mask = MLX5DV_DEVX_PORT_VPORT | 682 MLX5DV_DEVX_PORT_MATCH_REG_C_0; 683 err = mlx5_glue->devx_port_query(sh->ctx, spawn->phys_port, 684 &devx_port); 685 if (err) { 686 DRV_LOG(WARNING, 687 "can't query devx port %d on device %s", 688 spawn->phys_port, 689 mlx5_os_get_dev_device_name(spawn->phys_dev)); 690 devx_port.comp_mask = 0; 691 } 692 } 693 if (devx_port.comp_mask & MLX5DV_DEVX_PORT_MATCH_REG_C_0) { 694 priv->vport_meta_tag = devx_port.reg_c_0.value; 695 priv->vport_meta_mask = devx_port.reg_c_0.mask; 696 if (!priv->vport_meta_mask) { 697 DRV_LOG(ERR, "vport zero mask for port %d" 698 " on bonding device %s", 699 spawn->phys_port, 700 mlx5_os_get_dev_device_name 701 (spawn->phys_dev)); 702 err = ENOTSUP; 703 goto error; 704 } 705 if (priv->vport_meta_tag & ~priv->vport_meta_mask) { 706 DRV_LOG(ERR, "invalid vport tag for port %d" 707 " on bonding device %s", 708 spawn->phys_port, 709 mlx5_os_get_dev_device_name 710 (spawn->phys_dev)); 711 err = ENOTSUP; 712 goto error; 713 } 714 } 715 if (devx_port.comp_mask & MLX5DV_DEVX_PORT_VPORT) { 716 priv->vport_id = devx_port.vport_num; 717 } else if (spawn->pf_bond >= 0) { 718 DRV_LOG(ERR, "can't deduce vport index for port %d" 719 " on bonding device %s", 720 spawn->phys_port, 721 mlx5_os_get_dev_device_name(spawn->phys_dev)); 722 err = ENOTSUP; 723 goto error; 724 } else { 725 /* Suppose vport index in compatible way. */ 726 priv->vport_id = switch_info->representor ? 
	/*
	 * Look for sibling devices in order to reuse their switch domain
	 * if any, otherwise allocate one.
	 */
	MLX5_ETH_FOREACH_DEV(port_id, priv->pci_dev) {
		const struct mlx5_priv *opriv =
			rte_eth_devices[port_id].data->dev_private;

		if (!opriv ||
		    opriv->sh != priv->sh ||
		    opriv->domain_id ==
		    RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID)
			continue;
		priv->domain_id = opriv->domain_id;
		break;
	}
	if (priv->domain_id == RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID) {
		err = rte_eth_switch_domain_alloc(&priv->domain_id);
		if (err) {
			err = rte_errno;
			DRV_LOG(ERR, "unable to allocate switch domain: %s",
				strerror(rte_errno));
			goto error;
		}
		own_domain_id = 1;
	}
	/* Override some values set by hardware configuration. */
	mlx5_args(&config, dpdk_dev->devargs);
	err = mlx5_dev_check_sibling_config(priv, &config);
	if (err)
		goto error;
	config.hw_csum = !!(sh->device_attr.device_cap_flags_ex &
			    IBV_DEVICE_RAW_IP_CSUM);
	DRV_LOG(DEBUG, "checksum offloading is %ssupported",
		(config.hw_csum ? "" : "not "));
#if !defined(HAVE_IBV_DEVICE_COUNTERS_SET_V42) && \
	!defined(HAVE_IBV_DEVICE_COUNTERS_SET_V45)
	DRV_LOG(DEBUG, "counters are not supported");
#endif
#if !defined(HAVE_IBV_FLOW_DV_SUPPORT) || !defined(HAVE_MLX5DV_DR)
	if (config.dv_flow_en) {
		DRV_LOG(WARNING, "DV flow is not supported");
		config.dv_flow_en = 0;
	}
#endif
	config.ind_table_max_size =
		sh->device_attr.max_rwq_indirection_table_size;
	/*
	 * Remove this check once DPDK supports larger/variable
	 * indirection tables.
	 */
	if (config.ind_table_max_size > (unsigned int)ETH_RSS_RETA_SIZE_512)
		config.ind_table_max_size = ETH_RSS_RETA_SIZE_512;
	DRV_LOG(DEBUG, "maximum Rx indirection table size is %u",
		config.ind_table_max_size);
	config.hw_vlan_strip = !!(sh->device_attr.raw_packet_caps &
				  IBV_RAW_PACKET_CAP_CVLAN_STRIPPING);
	DRV_LOG(DEBUG, "VLAN stripping is %ssupported",
		(config.hw_vlan_strip ? "" : "not "));
	config.hw_fcs_strip = !!(sh->device_attr.raw_packet_caps &
				 IBV_RAW_PACKET_CAP_SCATTER_FCS);
#if defined(HAVE_IBV_WQ_FLAG_RX_END_PADDING)
	hw_padding = !!sh->device_attr.rx_pad_end_addr_align;
#elif defined(HAVE_IBV_WQ_FLAGS_PCI_WRITE_END_PADDING)
	hw_padding = !!(sh->device_attr.device_cap_flags_ex &
			IBV_DEVICE_PCI_WRITE_END_PADDING);
#endif
	if (config.hw_padding && !hw_padding) {
		DRV_LOG(DEBUG, "Rx end alignment padding isn't supported");
		config.hw_padding = 0;
	} else if (config.hw_padding) {
		DRV_LOG(DEBUG, "Rx end alignment padding is enabled");
	}
	config.tso = (sh->device_attr.max_tso > 0 &&
		      (sh->device_attr.tso_supported_qpts &
		       (1 << IBV_QPT_RAW_PACKET)));
	if (config.tso)
		config.tso_max_payload_sz = sh->device_attr.max_tso;
	/*
	 * MPW is disabled by default, while the Enhanced MPW is enabled
	 * by default.
	 */
	if (config.mps == MLX5_ARG_UNSET)
		config.mps = (mps == MLX5_MPW_ENHANCED) ? MLX5_MPW_ENHANCED :
							  MLX5_MPW_DISABLED;
	else
		config.mps = config.mps ? mps : MLX5_MPW_DISABLED;
	DRV_LOG(INFO, "%sMPS is %s",
		config.mps == MLX5_MPW_ENHANCED ? "enhanced " :
		config.mps == MLX5_MPW ? "legacy " : "",
		config.mps != MLX5_MPW_DISABLED ? "enabled" : "disabled");
"" : "not ")); 806 config.hw_fcs_strip = !!(sh->device_attr.raw_packet_caps & 807 IBV_RAW_PACKET_CAP_SCATTER_FCS); 808 #if defined(HAVE_IBV_WQ_FLAG_RX_END_PADDING) 809 hw_padding = !!sh->device_attr.rx_pad_end_addr_align; 810 #elif defined(HAVE_IBV_WQ_FLAGS_PCI_WRITE_END_PADDING) 811 hw_padding = !!(sh->device_attr.device_cap_flags_ex & 812 IBV_DEVICE_PCI_WRITE_END_PADDING); 813 #endif 814 if (config.hw_padding && !hw_padding) { 815 DRV_LOG(DEBUG, "Rx end alignment padding isn't supported"); 816 config.hw_padding = 0; 817 } else if (config.hw_padding) { 818 DRV_LOG(DEBUG, "Rx end alignment padding is enabled"); 819 } 820 config.tso = (sh->device_attr.max_tso > 0 && 821 (sh->device_attr.tso_supported_qpts & 822 (1 << IBV_QPT_RAW_PACKET))); 823 if (config.tso) 824 config.tso_max_payload_sz = sh->device_attr.max_tso; 825 /* 826 * MPW is disabled by default, while the Enhanced MPW is enabled 827 * by default. 828 */ 829 if (config.mps == MLX5_ARG_UNSET) 830 config.mps = (mps == MLX5_MPW_ENHANCED) ? MLX5_MPW_ENHANCED : 831 MLX5_MPW_DISABLED; 832 else 833 config.mps = config.mps ? mps : MLX5_MPW_DISABLED; 834 DRV_LOG(INFO, "%sMPS is %s", 835 config.mps == MLX5_MPW_ENHANCED ? "enhanced " : 836 config.mps == MLX5_MPW ? "legacy " : "", 837 config.mps != MLX5_MPW_DISABLED ? "enabled" : "disabled"); 838 if (config.cqe_comp && !cqe_comp) { 839 DRV_LOG(WARNING, "Rx CQE compression isn't supported"); 840 config.cqe_comp = 0; 841 } 842 if (config.cqe_pad && !cqe_pad) { 843 DRV_LOG(WARNING, "Rx CQE padding isn't supported"); 844 config.cqe_pad = 0; 845 } else if (config.cqe_pad) { 846 DRV_LOG(INFO, "Rx CQE padding is enabled"); 847 } 848 if (config.devx) { 849 priv->counter_fallback = 0; 850 err = mlx5_devx_cmd_query_hca_attr(sh->ctx, &config.hca_attr); 851 if (err) { 852 err = -err; 853 goto error; 854 } 855 if (!config.hca_attr.flow_counters_dump) 856 priv->counter_fallback = 1; 857 #ifndef HAVE_IBV_DEVX_ASYNC 858 priv->counter_fallback = 1; 859 #endif 860 if (priv->counter_fallback) 861 DRV_LOG(INFO, "Use fall-back DV counter management"); 862 /* Check for LRO support. */ 863 if (config.dest_tir && config.hca_attr.lro_cap && 864 config.dv_flow_en) { 865 /* TBD check tunnel lro caps. */ 866 config.lro.supported = config.hca_attr.lro_cap; 867 DRV_LOG(DEBUG, "Device supports LRO"); 868 /* 869 * If LRO timeout is not configured by application, 870 * use the minimal supported value. 871 */ 872 if (!config.lro.timeout) 873 config.lro.timeout = 874 config.hca_attr.lro_timer_supported_periods[0]; 875 DRV_LOG(DEBUG, "LRO session timeout set to %d usec", 876 config.lro.timeout); 877 } 878 #if defined(HAVE_MLX5DV_DR) && defined(HAVE_MLX5_DR_CREATE_ACTION_FLOW_METER) 879 if (config.hca_attr.qos.sup && config.hca_attr.qos.srtcm_sup && 880 config.dv_flow_en) { 881 uint8_t reg_c_mask = 882 config.hca_attr.qos.flow_meter_reg_c_ids; 883 /* 884 * Meter needs two REG_C's for color match and pre-sfx 885 * flow match. Here get the REG_C for color match. 886 * REG_C_0 and REG_C_1 is reserved for metadata feature. 
	if (config.tx_pp) {
		DRV_LOG(DEBUG, "Timestamp counter frequency %u kHz",
			config.hca_attr.dev_freq_khz);
		DRV_LOG(DEBUG, "Packet pacing is %ssupported",
			config.hca_attr.qos.packet_pacing ? "" : "not ");
		DRV_LOG(DEBUG, "Cross channel ops are %ssupported",
			config.hca_attr.cross_channel ? "" : "not ");
		DRV_LOG(DEBUG, "WQE index ignore is %ssupported",
			config.hca_attr.wqe_index_ignore ? "" : "not ");
		DRV_LOG(DEBUG, "Non-wire SQ feature is %ssupported",
			config.hca_attr.non_wire_sq ? "" : "not ");
		DRV_LOG(DEBUG, "Static WQE SQ feature is %ssupported (%d)",
			config.hca_attr.log_max_static_sq_wq ? "" : "not ",
			config.hca_attr.log_max_static_sq_wq);
		DRV_LOG(DEBUG, "WQE rate PP mode is %ssupported",
			config.hca_attr.qos.wqe_rate_pp ? "" : "not ");
		if (!config.devx) {
			DRV_LOG(ERR, "DevX is required for packet pacing");
			err = ENODEV;
			goto error;
		}
		if (!config.hca_attr.qos.packet_pacing) {
			DRV_LOG(ERR, "Packet pacing is not supported");
			err = ENODEV;
			goto error;
		}
		if (!config.hca_attr.cross_channel) {
			DRV_LOG(ERR, "Cross channel operations are"
				" required for packet pacing");
			err = ENODEV;
			goto error;
		}
		if (!config.hca_attr.wqe_index_ignore) {
			DRV_LOG(ERR, "WQE index ignore feature is"
				" required for packet pacing");
			err = ENODEV;
			goto error;
		}
		if (!config.hca_attr.non_wire_sq) {
			DRV_LOG(ERR, "Non-wire SQ feature is"
				" required for packet pacing");
			err = ENODEV;
			goto error;
		}
		if (!config.hca_attr.log_max_static_sq_wq) {
			DRV_LOG(ERR, "Static WQE SQ feature is"
				" required for packet pacing");
			err = ENODEV;
			goto error;
		}
		if (!config.hca_attr.qos.wqe_rate_pp) {
			DRV_LOG(ERR, "WQE rate mode is required"
				" for packet pacing");
			err = ENODEV;
			goto error;
		}
#ifndef HAVE_MLX5DV_DEVX_UAR_OFFSET
		DRV_LOG(ERR, "DevX does not provide UAR offset,"
			" can't create queues for packet pacing");
		err = ENODEV;
		goto error;
#endif
	}
	if (config.devx) {
		uint32_t reg[MLX5_ST_SZ_DW(register_mtutc)];

		err = mlx5_devx_cmd_register_read
			(sh->ctx, MLX5_REGISTER_ID_MTUTC, 0,
			 reg, MLX5_ST_SZ_DW(register_mtutc));
		if (!err) {
			uint32_t ts_mode;

			/* MTUTC register is read successfully. */
			ts_mode = MLX5_GET(register_mtutc, reg,
					   time_stamp_mode);
			if (ts_mode == MLX5_MTUTC_TIMESTAMP_MODE_REAL_TIME)
				config.rt_timestamp = 1;
		} else {
			/* Kernel does not support register reading. */
			if (config.hca_attr.dev_freq_khz ==
			    (NS_PER_S / MS_PER_S))
				config.rt_timestamp = 1;
		}
	}
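	/*
	 * Note on the fallback above: NS_PER_S / MS_PER_S equals 1000000,
	 * i.e. a device frequency of 1000000 kHz (1 GHz). A free-running
	 * counter ticking at nanosecond resolution is assumed here to
	 * imply real-time timestamp mode even when the MTUTC register
	 * cannot be read.
	 */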
"" : "not ")); 998 if (config.mprq.enabled && mprq) { 999 if (config.mprq.stride_num_n && 1000 (config.mprq.stride_num_n > mprq_max_stride_num_n || 1001 config.mprq.stride_num_n < mprq_min_stride_num_n)) { 1002 config.mprq.stride_num_n = 1003 RTE_MIN(RTE_MAX(MLX5_MPRQ_STRIDE_NUM_N, 1004 mprq_min_stride_num_n), 1005 mprq_max_stride_num_n); 1006 DRV_LOG(WARNING, 1007 "the number of strides" 1008 " for Multi-Packet RQ is out of range," 1009 " setting default value (%u)", 1010 1 << config.mprq.stride_num_n); 1011 } 1012 if (config.mprq.stride_size_n && 1013 (config.mprq.stride_size_n > mprq_max_stride_size_n || 1014 config.mprq.stride_size_n < mprq_min_stride_size_n)) { 1015 config.mprq.stride_size_n = 1016 RTE_MIN(RTE_MAX(MLX5_MPRQ_STRIDE_SIZE_N, 1017 mprq_min_stride_size_n), 1018 mprq_max_stride_size_n); 1019 DRV_LOG(WARNING, 1020 "the size of a stride" 1021 " for Multi-Packet RQ is out of range," 1022 " setting default value (%u)", 1023 1 << config.mprq.stride_size_n); 1024 } 1025 config.mprq.min_stride_size_n = mprq_min_stride_size_n; 1026 config.mprq.max_stride_size_n = mprq_max_stride_size_n; 1027 } else if (config.mprq.enabled && !mprq) { 1028 DRV_LOG(WARNING, "Multi-Packet RQ isn't supported"); 1029 config.mprq.enabled = 0; 1030 } 1031 if (config.max_dump_files_num == 0) 1032 config.max_dump_files_num = 128; 1033 eth_dev = rte_eth_dev_allocate(name); 1034 if (eth_dev == NULL) { 1035 DRV_LOG(ERR, "can not allocate rte ethdev"); 1036 err = ENOMEM; 1037 goto error; 1038 } 1039 /* Flag to call rte_eth_dev_release_port() in rte_eth_dev_close(). */ 1040 eth_dev->data->dev_flags |= RTE_ETH_DEV_CLOSE_REMOVE; 1041 if (priv->representor) { 1042 eth_dev->data->dev_flags |= RTE_ETH_DEV_REPRESENTOR; 1043 eth_dev->data->representor_id = priv->representor_id; 1044 } 1045 /* 1046 * Store associated network device interface index. This index 1047 * is permanent throughout the lifetime of device. So, we may store 1048 * the ifindex here and use the cached value further. 1049 */ 1050 MLX5_ASSERT(spawn->ifindex); 1051 priv->if_index = spawn->ifindex; 1052 eth_dev->data->dev_private = priv; 1053 priv->dev_data = eth_dev->data; 1054 eth_dev->data->mac_addrs = priv->mac; 1055 eth_dev->device = dpdk_dev; 1056 /* Configure the first MAC address by default. */ 1057 if (mlx5_get_mac(eth_dev, &mac.addr_bytes)) { 1058 DRV_LOG(ERR, 1059 "port %u cannot get MAC address, is mlx5_en" 1060 " loaded? (errno: %s)", 1061 eth_dev->data->port_id, strerror(rte_errno)); 1062 err = ENODEV; 1063 goto error; 1064 } 1065 DRV_LOG(INFO, 1066 "port %u MAC address is %02x:%02x:%02x:%02x:%02x:%02x", 1067 eth_dev->data->port_id, 1068 mac.addr_bytes[0], mac.addr_bytes[1], 1069 mac.addr_bytes[2], mac.addr_bytes[3], 1070 mac.addr_bytes[4], mac.addr_bytes[5]); 1071 #ifdef RTE_LIBRTE_MLX5_DEBUG 1072 { 1073 char ifname[IF_NAMESIZE]; 1074 1075 if (mlx5_get_ifname(eth_dev, &ifname) == 0) 1076 DRV_LOG(DEBUG, "port %u ifname is \"%s\"", 1077 eth_dev->data->port_id, ifname); 1078 else 1079 DRV_LOG(DEBUG, "port %u ifname is unknown", 1080 eth_dev->data->port_id); 1081 } 1082 #endif 1083 /* Get actual MTU if possible. */ 1084 err = mlx5_get_mtu(eth_dev, &priv->mtu); 1085 if (err) { 1086 err = rte_errno; 1087 goto error; 1088 } 1089 DRV_LOG(DEBUG, "port %u MTU is %u", eth_dev->data->port_id, 1090 priv->mtu); 1091 /* Initialize burst functions to prevent crashes before link-up. */ 1092 eth_dev->rx_pkt_burst = removed_rx_burst; 1093 eth_dev->tx_pkt_burst = removed_tx_burst; 1094 eth_dev->dev_ops = &mlx5_os_dev_ops; 1095 /* Register MAC address. 
	eth_dev = rte_eth_dev_allocate(name);
	if (eth_dev == NULL) {
		DRV_LOG(ERR, "can not allocate rte ethdev");
		err = ENOMEM;
		goto error;
	}
	/* Flag to call rte_eth_dev_release_port() in rte_eth_dev_close(). */
	eth_dev->data->dev_flags |= RTE_ETH_DEV_CLOSE_REMOVE;
	if (priv->representor) {
		eth_dev->data->dev_flags |= RTE_ETH_DEV_REPRESENTOR;
		eth_dev->data->representor_id = priv->representor_id;
	}
	/*
	 * Store associated network device interface index. This index
	 * is permanent throughout the lifetime of device. So, we may store
	 * the ifindex here and use the cached value further.
	 */
	MLX5_ASSERT(spawn->ifindex);
	priv->if_index = spawn->ifindex;
	eth_dev->data->dev_private = priv;
	priv->dev_data = eth_dev->data;
	eth_dev->data->mac_addrs = priv->mac;
	eth_dev->device = dpdk_dev;
	/* Configure the first MAC address by default. */
	if (mlx5_get_mac(eth_dev, &mac.addr_bytes)) {
		DRV_LOG(ERR,
			"port %u cannot get MAC address, is mlx5_en"
			" loaded? (errno: %s)",
			eth_dev->data->port_id, strerror(rte_errno));
		err = ENODEV;
		goto error;
	}
	DRV_LOG(INFO,
		"port %u MAC address is %02x:%02x:%02x:%02x:%02x:%02x",
		eth_dev->data->port_id,
		mac.addr_bytes[0], mac.addr_bytes[1],
		mac.addr_bytes[2], mac.addr_bytes[3],
		mac.addr_bytes[4], mac.addr_bytes[5]);
#ifdef RTE_LIBRTE_MLX5_DEBUG
	{
		char ifname[IF_NAMESIZE];

		if (mlx5_get_ifname(eth_dev, &ifname) == 0)
			DRV_LOG(DEBUG, "port %u ifname is \"%s\"",
				eth_dev->data->port_id, ifname);
		else
			DRV_LOG(DEBUG, "port %u ifname is unknown",
				eth_dev->data->port_id);
	}
#endif
	/* Get actual MTU if possible. */
	err = mlx5_get_mtu(eth_dev, &priv->mtu);
	if (err) {
		err = rte_errno;
		goto error;
	}
	DRV_LOG(DEBUG, "port %u MTU is %u", eth_dev->data->port_id,
		priv->mtu);
	/* Initialize burst functions to prevent crashes before link-up. */
	eth_dev->rx_pkt_burst = removed_rx_burst;
	eth_dev->tx_pkt_burst = removed_tx_burst;
	eth_dev->dev_ops = &mlx5_os_dev_ops;
	/* Register MAC address. */
	claim_zero(mlx5_mac_addr_add(eth_dev, &mac, 0, 0));
	if (config.vf && config.vf_nl_en)
		mlx5_nl_mac_addr_sync(priv->nl_socket_route,
				      mlx5_ifindex(eth_dev),
				      eth_dev->data->mac_addrs,
				      MLX5_MAX_MAC_ADDRESSES);
	priv->flows = 0;
	priv->ctrl_flows = 0;
	TAILQ_INIT(&priv->flow_meters);
	TAILQ_INIT(&priv->flow_meter_profiles);
	/* Hint libmlx5 to use PMD allocator for data plane resources. */
	mlx5_glue->dv_set_context_attr(sh->ctx,
			MLX5DV_CTX_ATTR_BUF_ALLOCATORS,
			(void *)((uintptr_t)&(struct mlx5dv_ctx_allocators){
				.alloc = &mlx5_alloc_verbs_buf,
				.free = &mlx5_free_verbs_buf,
				.data = priv,
			}));
	/* Bring Ethernet device up. */
	DRV_LOG(DEBUG, "port %u forcing Ethernet interface up",
		eth_dev->data->port_id);
	mlx5_set_link_up(eth_dev);
	/*
	 * Even though the interrupt handler is not installed yet,
	 * interrupts will still trigger on the async_fd from
	 * Verbs context returned by ibv_open_device().
	 */
	mlx5_link_update(eth_dev, 0);
#ifdef HAVE_MLX5DV_DR_ESWITCH
	if (!(config.hca_attr.eswitch_manager && config.dv_flow_en &&
	      (switch_info->representor || switch_info->master)))
		config.dv_esw_en = 0;
#else
	config.dv_esw_en = 0;
#endif
	/* Detect minimal data bytes to inline. */
	mlx5_set_min_inline(spawn, &config);
	/* Store device configuration on private structure. */
	priv->config = config;
	/* Create context for virtual machine VLAN workaround. */
	priv->vmwa_context = mlx5_vlan_vmwa_init(eth_dev, spawn->ifindex);
	if (config.dv_flow_en) {
		err = mlx5_alloc_shared_dr(priv);
		if (err)
			goto error;
		/*
		 * RSS id is shared with meter flow id. Meter flow id can only
		 * use the 24 MSB of the register.
		 */
		priv->qrss_id_pool = mlx5_flow_id_pool_alloc(UINT32_MAX >>
					     MLX5_MTR_COLOR_BITS);
		if (!priv->qrss_id_pool) {
			DRV_LOG(ERR, "can't create flow id pool");
			err = ENOMEM;
			goto error;
		}
	}
	/* Supported Verbs flow priority number detection. */
	err = mlx5_flow_discover_priorities(eth_dev);
	if (err < 0) {
		err = -err;
		goto error;
	}
	priv->config.flow_prio = err;
	if (!priv->config.dv_esw_en &&
	    priv->config.dv_xmeta_en != MLX5_XMETA_MODE_LEGACY) {
		DRV_LOG(WARNING, "metadata mode %u is not supported "
			"(no E-Switch)", priv->config.dv_xmeta_en);
		priv->config.dv_xmeta_en = MLX5_XMETA_MODE_LEGACY;
	}
	mlx5_set_metadata_mask(eth_dev);
	if (priv->config.dv_xmeta_en != MLX5_XMETA_MODE_LEGACY &&
	    !priv->sh->dv_regc0_mask) {
		DRV_LOG(ERR, "metadata mode %u is not supported "
			"(no metadata reg_c[0] is available)",
			priv->config.dv_xmeta_en);
		err = ENOTSUP;
		goto error;
	}
	/*
	 * Allocate the buffer for flow creating, just once.
	 * The allocation must be done before any flow creating.
	 */
	mlx5_flow_alloc_intermediate(eth_dev);
	/* Query availability of metadata reg_c's. */
	err = mlx5_flow_discover_mreg_c(eth_dev);
	if (err < 0) {
		err = -err;
		goto error;
	}
	if (!mlx5_flow_ext_mreg_supported(eth_dev)) {
		DRV_LOG(DEBUG,
			"port %u extensive metadata register is not supported",
			eth_dev->data->port_id);
		if (priv->config.dv_xmeta_en != MLX5_XMETA_MODE_LEGACY) {
			DRV_LOG(ERR, "metadata mode %u is not supported "
				"(no metadata registers available)",
				priv->config.dv_xmeta_en);
			err = ENOTSUP;
			goto error;
		}
	}
	if (priv->config.dv_flow_en &&
	    priv->config.dv_xmeta_en != MLX5_XMETA_MODE_LEGACY &&
	    mlx5_flow_ext_mreg_supported(eth_dev) &&
	    priv->sh->dv_regc0_mask) {
		priv->mreg_cp_tbl = mlx5_hlist_create(MLX5_FLOW_MREG_HNAME,
						      MLX5_FLOW_MREG_HTABLE_SZ);
		if (!priv->mreg_cp_tbl) {
			err = ENOMEM;
			goto error;
		}
	}
	return eth_dev;
error:
	if (priv) {
		if (priv->mreg_cp_tbl)
			mlx5_hlist_destroy(priv->mreg_cp_tbl, NULL, NULL);
		if (priv->sh)
			mlx5_os_free_shared_dr(priv);
		if (priv->nl_socket_route >= 0)
			close(priv->nl_socket_route);
		if (priv->nl_socket_rdma >= 0)
			close(priv->nl_socket_rdma);
		if (priv->vmwa_context)
			mlx5_vlan_vmwa_exit(priv->vmwa_context);
		if (priv->qrss_id_pool)
			mlx5_flow_id_pool_release(priv->qrss_id_pool);
		if (own_domain_id)
			claim_zero(rte_eth_switch_domain_free(priv->domain_id));
		mlx5_free(priv);
		if (eth_dev != NULL)
			eth_dev->data->dev_private = NULL;
	}
	if (eth_dev != NULL) {
		/* mac_addrs must not be freed alone because it is part of
		 * dev_private.
		 */
		eth_dev->data->mac_addrs = NULL;
		rte_eth_dev_release_port(eth_dev);
	}
	if (sh)
		mlx5_free_shared_dev_ctx(sh);
	MLX5_ASSERT(err > 0);
	rte_errno = err;
	return NULL;
}

/**
 * Comparison callback to sort device data.
 *
 * This is meant to be used with qsort().
 *
 * @param[in] a
 *   Pointer to pointer to first data object.
 * @param[in] b
 *   Pointer to pointer to second data object.
 *
 * @return
 *   0 if both objects are equal, less than 0 if the first argument is less
 *   than the second, greater than 0 otherwise.
 */
static int
mlx5_dev_spawn_data_cmp(const void *a, const void *b)
{
	const struct mlx5_switch_info *si_a =
		&((const struct mlx5_dev_spawn_data *)a)->info;
	const struct mlx5_switch_info *si_b =
		&((const struct mlx5_dev_spawn_data *)b)->info;
	int ret;

	/* Master device first. */
	ret = si_b->master - si_a->master;
	if (ret)
		return ret;
	/* Then representor devices. */
	ret = si_b->representor - si_a->representor;
	if (ret)
		return ret;
	/* Unidentified devices come last in no specific order. */
	if (!si_a->representor)
		return 0;
	/* Order representors by name. */
	return si_a->port_name - si_b->port_name;
}
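
/*
 * Usage note: this comparator is invoked from mlx5_os_pci_probe() below as
 *
 *	qsort(list, ns, sizeof(*list), mlx5_dev_spawn_data_cmp);
 *
 * so the probe order is master first, then representors sorted by port
 * name, then unidentified devices last.
 */
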
/**
 * Match PCI information for possible slaves of bonding device.
 *
 * @param[in] ibv_dev
 *   Pointer to Infiniband device structure.
 * @param[in] pci_dev
 *   Pointer to PCI device structure to match PCI address.
 * @param[in] nl_rdma
 *   Netlink RDMA group socket handle.
 *
 * @return
 *   Negative value if no bonding device found, otherwise
 *   non-negative index of slave PF in bonding.
 */
static int
mlx5_device_bond_pci_match(const struct ibv_device *ibv_dev,
			   const struct rte_pci_device *pci_dev,
			   int nl_rdma)
{
	char ifname[IF_NAMESIZE + 1];
	unsigned int ifindex;
	unsigned int np, i;
	FILE *file = NULL;
	int pf = -1;

	/*
	 * Try to get master device name. If something goes wrong,
	 * assume there is no kernel support and no bonding devices.
	 */
	if (nl_rdma < 0)
		return -1;
	if (!strstr(ibv_dev->name, "bond"))
		return -1;
	np = mlx5_nl_portnum(nl_rdma, ibv_dev->name);
	if (!np)
		return -1;
	/*
	 * The master device might not be on the predefined port
	 * (it is not guaranteed to be on port index 1), we have
	 * to scan all Infiniband device ports and find the master.
	 */
	for (i = 1; i <= np; ++i) {
		/* Check whether Infiniband port is populated. */
		ifindex = mlx5_nl_ifindex(nl_rdma, ibv_dev->name, i);
		if (!ifindex)
			continue;
		if (!if_indextoname(ifindex, ifname))
			continue;
		/* Try to read bonding slave names from sysfs. */
		MKSTR(slaves,
		      "/sys/class/net/%s/master/bonding/slaves", ifname);
		file = fopen(slaves, "r");
		if (file)
			break;
	}
	if (!file)
		return -1;
	/* Use safe format to check maximal buffer length. */
	MLX5_ASSERT(atol(RTE_STR(IF_NAMESIZE)) == IF_NAMESIZE);
	while (fscanf(file, "%" RTE_STR(IF_NAMESIZE) "s", ifname) == 1) {
		char tmp_str[IF_NAMESIZE + 32];
		struct rte_pci_addr pci_addr;
		struct mlx5_switch_info info;

		/* Process slave interface names in the loop. */
		snprintf(tmp_str, sizeof(tmp_str),
			 "/sys/class/net/%s", ifname);
		if (mlx5_dev_to_pci_addr(tmp_str, &pci_addr)) {
			DRV_LOG(WARNING, "can not get PCI address"
				" for netdev \"%s\"", ifname);
			continue;
		}
		if (pci_dev->addr.domain != pci_addr.domain ||
		    pci_dev->addr.bus != pci_addr.bus ||
		    pci_dev->addr.devid != pci_addr.devid ||
		    pci_dev->addr.function != pci_addr.function)
			continue;
		/* Slave interface PCI address match found. */
		fclose(file);
		snprintf(tmp_str, sizeof(tmp_str),
			 "/sys/class/net/%s/phys_port_name", ifname);
		file = fopen(tmp_str, "rb");
		if (!file)
			break;
		info.name_type = MLX5_PHYS_PORT_NAME_TYPE_NOTSET;
		if (fscanf(file, "%32s", tmp_str) == 1)
			mlx5_translate_port_name(tmp_str, &info);
		if (info.name_type == MLX5_PHYS_PORT_NAME_TYPE_LEGACY ||
		    info.name_type == MLX5_PHYS_PORT_NAME_TYPE_UPLINK)
			pf = info.port_name;
		break;
	}
	if (file)
		fclose(file);
	return pf;
}
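
/*
 * Background note (assumed kernel naming conventions, hedged): the
 * phys_port_name attribute parsed above typically looks like "p0" for an
 * uplink port, a bare number like "0" for legacy naming, and "pf0vf2" for
 * a VF representor, which mlx5_translate_port_name() is expected to map to
 * MLX5_PHYS_PORT_NAME_TYPE_UPLINK, MLX5_PHYS_PORT_NAME_TYPE_LEGACY and
 * MLX5_PHYS_PORT_NAME_TYPE_PFVF respectively.
 */
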
/**
 * DPDK callback to register a PCI device.
 *
 * This function spawns Ethernet devices out of a given PCI device.
 *
 * @param[in] pci_drv
 *   PCI driver structure (mlx5_driver).
 * @param[in] pci_dev
 *   PCI device information.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
int
mlx5_os_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
		  struct rte_pci_device *pci_dev)
{
	struct ibv_device **ibv_list;
	/*
	 * Number of found IB Devices matching with requested PCI BDF.
	 * nd != 1 means there are multiple IB devices over the same
	 * PCI device and we have representors and master.
	 */
	unsigned int nd = 0;
	/*
	 * Number of found IB device Ports. nd = 1 and np = 1..n means
	 * we have the single multiport IB device, and there may be
	 * representors attached to some of found ports.
	 */
	unsigned int np = 0;
	/*
	 * Number of DPDK ethernet devices to spawn - either over
	 * multiple IB devices or multiple ports of single IB device.
	 * Actually this is the number of iterations to spawn.
	 */
	unsigned int ns = 0;
	/*
	 * Bonding device
	 *   < 0 - no bonding device (single one)
	 *  >= 0 - bonding device (value is slave PF index)
	 */
	int bd = -1;
	struct mlx5_dev_spawn_data *list = NULL;
	struct mlx5_dev_config dev_config;
	int ret;

	if (mlx5_class_get(pci_dev->device.devargs) != MLX5_CLASS_NET) {
		DRV_LOG(DEBUG, "Skip probing - should be probed by other mlx5"
			" driver.");
		return 1;
	}
	if (rte_eal_process_type() == RTE_PROC_PRIMARY)
		mlx5_pmd_socket_init();
	ret = mlx5_init_once();
	if (ret) {
		DRV_LOG(ERR, "unable to init PMD global data: %s",
			strerror(rte_errno));
		return -rte_errno;
	}
	MLX5_ASSERT(pci_drv == &mlx5_driver);
	errno = 0;
	ibv_list = mlx5_glue->get_device_list(&ret);
	if (!ibv_list) {
		rte_errno = errno ? errno : ENOSYS;
		DRV_LOG(ERR, "cannot list devices, is ib_uverbs loaded?");
		return -rte_errno;
	}
	/*
	 * First scan the list of all Infiniband devices to find
	 * matching ones, gathering into the list.
	 */
	struct ibv_device *ibv_match[ret + 1];
	int nl_route = mlx5_nl_init(NETLINK_ROUTE);
	int nl_rdma = mlx5_nl_init(NETLINK_RDMA);
	unsigned int i;

	while (ret-- > 0) {
		struct rte_pci_addr pci_addr;

		DRV_LOG(DEBUG, "checking device \"%s\"", ibv_list[ret]->name);
		bd = mlx5_device_bond_pci_match
				(ibv_list[ret], pci_dev, nl_rdma);
		if (bd >= 0) {
			/*
			 * Bonding device detected. Only one match is allowed,
			 * the bonding is supported over multi-port IB device,
			 * there should be no matches on representor PCI
			 * functions or non VF LAG bonding devices with
			 * specified address.
			 */
			if (nd) {
				DRV_LOG(ERR,
					"multiple PCI match on bonding device"
					" \"%s\" found", ibv_list[ret]->name);
				rte_errno = ENOENT;
				ret = -rte_errno;
				goto exit;
			}
			DRV_LOG(INFO, "PCI information matches for"
				" slave %d bonding device \"%s\"",
				bd, ibv_list[ret]->name);
			ibv_match[nd++] = ibv_list[ret];
			break;
		}
		if (mlx5_dev_to_pci_addr
			(ibv_list[ret]->ibdev_path, &pci_addr))
			continue;
		if (pci_dev->addr.domain != pci_addr.domain ||
		    pci_dev->addr.bus != pci_addr.bus ||
		    pci_dev->addr.devid != pci_addr.devid ||
		    pci_dev->addr.function != pci_addr.function)
			continue;
		DRV_LOG(INFO, "PCI information matches for device \"%s\"",
			ibv_list[ret]->name);
		ibv_match[nd++] = ibv_list[ret];
	}
	ibv_match[nd] = NULL;
	if (!nd) {
		/* No device matches, just complain and bail out. */
		DRV_LOG(WARNING,
			"no Verbs device matches PCI device " PCI_PRI_FMT ","
			" are kernel drivers loaded?",
			pci_dev->addr.domain, pci_dev->addr.bus,
			pci_dev->addr.devid, pci_dev->addr.function);
		rte_errno = ENOENT;
		ret = -rte_errno;
		goto exit;
	}
	if (nd == 1) {
		/*
		 * Found single matching device may have multiple ports.
		 * Each port may be representor, we have to check the port
		 * number and check the representors existence.
		 */
		if (nl_rdma >= 0)
			np = mlx5_nl_portnum(nl_rdma, ibv_match[0]->name);
		if (!np)
			DRV_LOG(WARNING, "can not get IB device \"%s\""
				" ports number", ibv_match[0]->name);
		if (bd >= 0 && !np) {
			DRV_LOG(ERR, "can not get ports"
				" for bonding device");
			rte_errno = ENOENT;
			ret = -rte_errno;
			goto exit;
		}
	}
#ifndef HAVE_MLX5DV_DR_DEVX_PORT
	if (bd >= 0) {
		/*
		 * This may happen if there is VF LAG kernel support and
		 * application is compiled with older rdma_core library.
		 */
		DRV_LOG(ERR,
			"No kernel/verbs support for VF LAG bonding found.");
		rte_errno = ENOTSUP;
		ret = -rte_errno;
		goto exit;
	}
#endif
	/*
	 * Now we can determine the maximal
	 * amount of devices to be spawned.
	 */
	list = mlx5_malloc(MLX5_MEM_ZERO,
			   sizeof(struct mlx5_dev_spawn_data) *
			   (np ? np : nd),
			   RTE_CACHE_LINE_SIZE, SOCKET_ID_ANY);
	if (!list) {
		DRV_LOG(ERR, "spawn data array allocation failure");
		rte_errno = ENOMEM;
		ret = -rte_errno;
		goto exit;
	}
	if (bd >= 0 || np > 1) {
		/*
		 * Single IB device with multiple ports found,
		 * it may be E-Switch master device and representors.
		 * We have to perform identification through the ports.
		 */
		MLX5_ASSERT(nl_rdma >= 0);
		MLX5_ASSERT(ns == 0);
		MLX5_ASSERT(nd == 1);
		MLX5_ASSERT(np);
		for (i = 1; i <= np; ++i) {
			list[ns].max_port = np;
			list[ns].phys_port = i;
			list[ns].phys_dev = ibv_match[0];
			list[ns].eth_dev = NULL;
			list[ns].pci_dev = pci_dev;
			list[ns].pf_bond = bd;
			list[ns].ifindex = mlx5_nl_ifindex
				(nl_rdma,
				mlx5_os_get_dev_device_name
						(list[ns].phys_dev), i);
			if (!list[ns].ifindex) {
				/*
				 * No network interface index found for the
				 * specified port, it means there is no
				 * representor on this port. It's OK,
				 * there can be disabled ports, for example
				 * if sriov_numvfs < sriov_totalvfs.
				 */
				continue;
			}
			ret = -1;
			if (nl_route >= 0)
				ret = mlx5_nl_switch_info
					       (nl_route,
						list[ns].ifindex,
						&list[ns].info);
			if (ret || (!list[ns].info.representor &&
				    !list[ns].info.master)) {
				/*
				 * We failed to recognize representors with
				 * Netlink, let's try to perform the task
				 * with sysfs.
				 */
				ret = mlx5_sysfs_switch_info
						(list[ns].ifindex,
						 &list[ns].info);
			}
			if (!ret && bd >= 0) {
				switch (list[ns].info.name_type) {
				case MLX5_PHYS_PORT_NAME_TYPE_UPLINK:
					if (list[ns].info.port_name == bd)
						ns++;
					break;
				case MLX5_PHYS_PORT_NAME_TYPE_PFHPF:
					/* Fallthrough */
				case MLX5_PHYS_PORT_NAME_TYPE_PFVF:
					if (list[ns].info.pf_num == bd)
						ns++;
					break;
				default:
					break;
				}
				continue;
			}
			if (!ret && (list[ns].info.representor ^
				     list[ns].info.master))
				ns++;
		}
		if (!ns) {
			DRV_LOG(ERR,
				"unable to recognize master/representors"
				" on the IB device with multiple ports");
			rte_errno = ENOENT;
			ret = -rte_errno;
			goto exit;
		}
	} else {
		/*
		 * The existence of several matching entries (nd > 1) means
		 * port representors have been instantiated. No existing Verbs
		 * call nor sysfs entries can tell them apart, this can only
		 * be done through Netlink calls assuming kernel drivers are
		 * recent enough to support them.
		 *
		 * In the event of identification failure through Netlink,
		 * try again through sysfs, then:
		 *
		 * 1. A single IB device matches (nd == 1) with single
		 *    port (np=0/1) and is not a representor, assume
		 *    no switch support.
		 *
		 * 2. Otherwise no safe assumptions can be made;
		 *    complain louder and bail out.
		 */
		for (i = 0; i != nd; ++i) {
			memset(&list[ns].info, 0, sizeof(list[ns].info));
			list[ns].max_port = 1;
			list[ns].phys_port = 1;
			list[ns].phys_dev = ibv_match[i];
			list[ns].eth_dev = NULL;
			list[ns].pci_dev = pci_dev;
			list[ns].pf_bond = -1;
			list[ns].ifindex = 0;
			if (nl_rdma >= 0)
				list[ns].ifindex = mlx5_nl_ifindex
				(nl_rdma,
				mlx5_os_get_dev_device_name
						(list[ns].phys_dev), 1);
			if (!list[ns].ifindex) {
				char ifname[IF_NAMESIZE];

				/*
				 * Netlink failed, it may happen with old
				 * ib_core kernel driver (before 4.16).
				 * We can assume there is an old driver
				 * because here we are processing single
				 * port IB devices. Let's try sysfs to
				 * retrieve the ifindex. The method works
				 * for master device only.
				 */
				if (nd > 1) {
					/*
					 * Multiple devices found, assume
					 * representors, can not distinguish
					 * master/representor and retrieve
					 * ifindex via sysfs.
					 */
					continue;
				}
				ret = mlx5_get_ifname_sysfs
					(ibv_match[i]->ibdev_path, ifname);
				if (!ret)
					list[ns].ifindex =
						if_nametoindex(ifname);
				if (!list[ns].ifindex) {
					/*
					 * No network interface index found
					 * for the specified device, it means
					 * it is neither a representor nor
					 * the master.
					 */
					continue;
				}
			}
			ret = -1;
			if (nl_route >= 0)
				ret = mlx5_nl_switch_info
					       (nl_route,
						list[ns].ifindex,
						&list[ns].info);
			if (ret || (!list[ns].info.representor &&
				    !list[ns].info.master)) {
				/*
				 * We failed to recognize representors with
				 * Netlink, let's try to perform the task
				 * with sysfs.
				 */
				ret = mlx5_sysfs_switch_info
						(list[ns].ifindex,
						 &list[ns].info);
			}
			if (!ret && (list[ns].info.representor ^
				     list[ns].info.master)) {
				ns++;
			} else if ((nd == 1) &&
				   !list[ns].info.representor &&
				   !list[ns].info.master) {
				/*
				 * Single IB device with one physical port
				 * and attached network device. Maybe SR-IOV
				 * is not enabled or there are no
				 * representors.
				 */
				DRV_LOG(INFO, "no E-Switch support detected");
				ns++;
				break;
			}
		}
		if (!ns) {
			DRV_LOG(ERR,
				"unable to recognize master/representors"
				" on the multiple IB devices");
			rte_errno = ENOENT;
			ret = -rte_errno;
			goto exit;
		}
	}
	MLX5_ASSERT(ns);
	/*
	 * Sort list to probe devices in natural order for user's convenience
	 * (i.e. master first, then representors from lowest to highest ID).
	 */
	qsort(list, ns, sizeof(*list), mlx5_dev_spawn_data_cmp);
	/* Default configuration. */
	dev_config = (struct mlx5_dev_config){
		.hw_padding = 0,
		.mps = MLX5_ARG_UNSET,
		.dbnc = MLX5_ARG_UNSET,
		.rx_vec_en = 1,
		.txq_inline_max = MLX5_ARG_UNSET,
		.txq_inline_min = MLX5_ARG_UNSET,
		.txq_inline_mpw = MLX5_ARG_UNSET,
		.txqs_inline = MLX5_ARG_UNSET,
		.vf_nl_en = 1,
		.mr_ext_memseg_en = 1,
		.mprq = {
			.enabled = 0, /* Disabled by default. */
			.stride_num_n = 0,
			.stride_size_n = 0,
			.max_memcpy_len = MLX5_MPRQ_MEMCPY_DEFAULT_LEN,
			.min_rxqs_num = MLX5_MPRQ_MIN_RXQS,
		},
		.dv_esw_en = 1,
		.dv_flow_en = 1,
		.decap_en = 1,
		.log_hp_size = MLX5_ARG_UNSET,
	};
	/* Device specific configuration. */
	switch (pci_dev->id.device_id) {
	case PCI_DEVICE_ID_MELLANOX_CONNECTX4VF:
	case PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF:
	case PCI_DEVICE_ID_MELLANOX_CONNECTX5VF:
	case PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF:
	case PCI_DEVICE_ID_MELLANOX_CONNECTX5BFVF:
	case PCI_DEVICE_ID_MELLANOX_CONNECTX6VF:
	case PCI_DEVICE_ID_MELLANOX_CONNECTX6DXVF:
		dev_config.vf = 1;
		break;
	default:
		break;
	}
	for (i = 0; i != ns; ++i) {
		uint32_t restore;

		list[i].eth_dev = mlx5_dev_spawn(&pci_dev->device,
						 &list[i],
						 dev_config);
		if (!list[i].eth_dev) {
			if (rte_errno != EBUSY && rte_errno != EEXIST)
				break;
			/* Device is disabled or already spawned. Ignore it. */
			continue;
		}
		restore = list[i].eth_dev->data->dev_flags;
		rte_eth_copy_pci_info(list[i].eth_dev, pci_dev);
		/* Restore non-PCI flags cleared by the above call. */
		list[i].eth_dev->data->dev_flags |= restore;
		rte_eth_dev_probing_finish(list[i].eth_dev);
	}
	if (i != ns) {
		DRV_LOG(ERR,
			"probe of PCI device " PCI_PRI_FMT " aborted after"
			" encountering an error: %s",
			pci_dev->addr.domain, pci_dev->addr.bus,
			pci_dev->addr.devid, pci_dev->addr.function,
			strerror(rte_errno));
		ret = -rte_errno;
		/* Roll back. */
		while (i--) {
			if (!list[i].eth_dev)
				continue;
			mlx5_dev_close(list[i].eth_dev);
			/* mac_addrs must not be freed, it is in dev_private. */
			list[i].eth_dev->data->mac_addrs = NULL;
			claim_zero(rte_eth_dev_release_port(list[i].eth_dev));
		}
		/* Restore original error. */
		rte_errno = -ret;
	} else {
		ret = 0;
	}
exit:
	/*
	 * Do the routine cleanup:
	 * - close opened Netlink sockets
	 * - free allocated spawn data array
	 * - free the Infiniband device list
	 */
	if (nl_rdma >= 0)
		close(nl_rdma);
	if (nl_route >= 0)
		close(nl_route);
	if (list)
		mlx5_free(list);
	MLX5_ASSERT(ibv_list);
	mlx5_glue->free_device_list(ibv_list);
	return ret;
}

static int
mlx5_config_doorbell_mapping_env(const struct mlx5_dev_config *config)
{
	char *env;
	int value;

	MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
	/* Get environment variable to store. */
	env = getenv(MLX5_SHUT_UP_BF);
	value = env ? !!strcmp(env, "0") : MLX5_ARG_UNSET;
	if (config->dbnc == MLX5_ARG_UNSET)
		setenv(MLX5_SHUT_UP_BF, MLX5_SHUT_UP_BF_DEFAULT, 1);
	else
		setenv(MLX5_SHUT_UP_BF,
		       config->dbnc == MLX5_TXDB_NCACHED ? "1" : "0", 1);
	return value;
}

static void
mlx5_restore_doorbell_mapping_env(int value)
{
	MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
	/* Restore the original environment variable state. */
	if (value == MLX5_ARG_UNSET)
		unsetenv(MLX5_SHUT_UP_BF);
	else
		setenv(MLX5_SHUT_UP_BF, value ? "1" : "0", 1);
}
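
/*
 * Note (assumed rdma-core behavior, hedged): the MLX5_SHUT_UP_BF
 * environment variable is expected to be sampled by libmlx5 once at
 * context creation; "1" presumably disables BlueFlame doorbells so that
 * doorbells go through regular non-cached writes, which is what the
 * "tx_db_nc" devarg requests. The helpers above therefore set the
 * variable only around ibv context creation and then restore it.
 */
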
/**
 * Extract pdn of PD object using DV API.
 *
 * @param[in] pd
 *   Pointer to the verbs PD object.
 * @param[out] pdn
 *   Pointer to the PD object number variable.
 *
 * @return
 *   0 on success, error value otherwise.
 */
int
mlx5_os_get_pdn(void *pd, uint32_t *pdn)
{
#ifdef HAVE_IBV_FLOW_DV_SUPPORT
	struct mlx5dv_obj obj;
	struct mlx5dv_pd pd_info;
	int ret = 0;

	obj.pd.in = pd;
	obj.pd.out = &pd_info;
	ret = mlx5_glue->dv_init_obj(&obj, MLX5DV_OBJ_PD);
	if (ret) {
		DRV_LOG(DEBUG, "Failed to get PD object info");
		return ret;
	}
	*pdn = pd_info.pdn;
	return 0;
#else
	(void)pd;
	(void)pdn;
	return -ENOTSUP;
#endif /* HAVE_IBV_FLOW_DV_SUPPORT */
}

/**
 * Function API to open IB device.
 *
 * This function calls the Linux glue APIs to open a device.
 *
 * @param[in] spawn
 *   Pointer to the IB device attributes (name, port, etc).
 * @param[in] config
 *   Pointer to device configuration structure.
 * @param[out] sh
 *   Pointer to shared context structure.
 *
 * @return
 *   0 on success, a positive error value otherwise.
 */
int
mlx5_os_open_device(const struct mlx5_dev_spawn_data *spawn,
		    const struct mlx5_dev_config *config,
		    struct mlx5_dev_ctx_shared *sh)
{
	int dbmap_env;
	int err = 0;

	sh->numa_node = spawn->pci_dev->device.numa_node;
	pthread_mutex_init(&sh->txpp.mutex, NULL);
	/*
	 * Configure environment variable "MLX5_SHUT_UP_BF"
	 * before the device creation. The rdma_core library
	 * checks the variable at device creation and
	 * stores the result internally.
	 */
	dbmap_env = mlx5_config_doorbell_mapping_env(config);
	/* Try to open IB device with DV first, then usual Verbs. */
	errno = 0;
	sh->ctx = mlx5_glue->dv_open_device(spawn->phys_dev);
	if (sh->ctx) {
		sh->devx = 1;
		DRV_LOG(DEBUG, "DevX is supported");
		/* The device is created, no need for environment. */
		mlx5_restore_doorbell_mapping_env(dbmap_env);
	} else {
		/* The environment variable is still configured. */
		sh->ctx = mlx5_glue->open_device(spawn->phys_dev);
		err = errno ? errno : ENODEV;
		/*
		 * The environment variable is not needed anymore,
		 * all device creation attempts are completed.
		 */
		mlx5_restore_doorbell_mapping_env(dbmap_env);
		if (!sh->ctx)
			return err;
		DRV_LOG(DEBUG, "DevX is NOT supported");
		err = 0;
	}
	return err;
}
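
/*
 * Usage sketch (illustration only, under the assumption that a Verbs PD has
 * already been allocated on the opened context): after a successful
 * mlx5_os_open_device(), the protection domain number can be extracted from
 * the PD through the DV object API:
 *
 *	struct ibv_pd *pd = mlx5_glue->alloc_pd(sh->ctx);
 *	uint32_t pdn = 0;
 *
 *	if (pd && mlx5_os_get_pdn(pd, &pdn) == 0)
 *		DRV_LOG(DEBUG, "pdn=%u", pdn);
 *
 * Note the DV path is compiled only under HAVE_IBV_FLOW_DV_SUPPORT;
 * otherwise mlx5_os_get_pdn() returns -ENOTSUP.
 */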
/**
 * Install the shared asynchronous device events handler.
 * This function is implemented to support event sharing
 * between multiple ports of a single IB device.
 *
 * @param sh
 *   Pointer to mlx5_dev_ctx_shared object.
 */
void
mlx5_os_dev_shared_handler_install(struct mlx5_dev_ctx_shared *sh)
{
	int ret;
	int flags;

	sh->intr_handle.fd = -1;
	flags = fcntl(((struct ibv_context *)sh->ctx)->async_fd, F_GETFL);
	ret = fcntl(((struct ibv_context *)sh->ctx)->async_fd,
		    F_SETFL, flags | O_NONBLOCK);
	if (ret) {
		DRV_LOG(INFO, "failed to change file descriptor async event"
			" queue");
	} else {
		sh->intr_handle.fd = ((struct ibv_context *)sh->ctx)->async_fd;
		sh->intr_handle.type = RTE_INTR_HANDLE_EXT;
		if (rte_intr_callback_register(&sh->intr_handle,
					mlx5_dev_interrupt_handler, sh)) {
			DRV_LOG(INFO, "Fail to install the shared interrupt.");
			sh->intr_handle.fd = -1;
		}
	}
	if (sh->devx) {
#ifdef HAVE_IBV_DEVX_ASYNC
		sh->intr_handle_devx.fd = -1;
		sh->devx_comp =
			(void *)mlx5_glue->devx_create_cmd_comp(sh->ctx);
		struct mlx5dv_devx_cmd_comp *devx_comp = sh->devx_comp;
		if (!devx_comp) {
			DRV_LOG(INFO, "failed to allocate devx_comp.");
			return;
		}
		flags = fcntl(devx_comp->fd, F_GETFL);
		ret = fcntl(devx_comp->fd, F_SETFL, flags | O_NONBLOCK);
		if (ret) {
			DRV_LOG(INFO, "failed to change file descriptor"
				" devx comp");
			return;
		}
		sh->intr_handle_devx.fd = devx_comp->fd;
		sh->intr_handle_devx.type = RTE_INTR_HANDLE_EXT;
		if (rte_intr_callback_register(&sh->intr_handle_devx,
					mlx5_dev_interrupt_handler_devx, sh)) {
			DRV_LOG(INFO, "Fail to install the devx shared"
				" interrupt.");
			sh->intr_handle_devx.fd = -1;
		}
#endif /* HAVE_IBV_DEVX_ASYNC */
	}
}

/**
 * Uninstall the shared asynchronous device events handler.
 * This function is implemented to support event sharing
 * between multiple ports of a single IB device.
 *
 * @param sh
 *   Pointer to mlx5_dev_ctx_shared object.
 */
void
mlx5_os_dev_shared_handler_uninstall(struct mlx5_dev_ctx_shared *sh)
{
	if (sh->intr_handle.fd >= 0)
		mlx5_intr_callback_unregister(&sh->intr_handle,
					      mlx5_dev_interrupt_handler, sh);
#ifdef HAVE_IBV_DEVX_ASYNC
	if (sh->intr_handle_devx.fd >= 0)
		rte_intr_callback_unregister(&sh->intr_handle_devx,
				  mlx5_dev_interrupt_handler_devx, sh);
	if (sh->devx_comp)
		mlx5_glue->devx_destroy_cmd_comp(sh->devx_comp);
#endif
}

/**
 * Read statistics from a named counter.
 *
 * @param[in] priv
 *   Pointer to the private device data structure.
 * @param[in] ctr_name
 *   Pointer to the name of the statistic counter to read.
 * @param[out] stat
 *   Pointer to the read statistic value.
 * @return
 *   0 on success and stat is valid, 1 if failed to read the value,
 *   rte_errno is set.
 */
int
mlx5_os_read_dev_stat(struct mlx5_priv *priv, const char *ctr_name,
		      uint64_t *stat)
{
	int fd;

	if (priv->sh) {
		MKSTR(path, "%s/ports/%d/hw_counters/%s",
		      priv->sh->ibdev_path,
		      priv->dev_port,
		      ctr_name);
		fd = open(path, O_RDONLY);
		if (fd != -1) {
			char buf[21] = {'\0'};
			ssize_t n = read(fd, buf, sizeof(buf));

			close(fd);
			if (n != -1) {
				*stat = strtoull(buf, NULL, 10);
				return 0;
			}
		}
	}
	*stat = 0;
	return 1;
}
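
/*
 * Example (sketch): mlx5_os_read_dev_stat() reads a single sysfs
 * hw_counters entry, e.g. the per-device "out_of_buffer" drop counter:
 *
 *	uint64_t imissed;
 *
 *	if (mlx5_os_read_dev_stat(priv, "out_of_buffer", &imissed) == 0)
 *		DRV_LOG(DEBUG, "out_of_buffer=%" PRIu64, imissed);
 *
 * The value is parsed with strtoull() from a path of the form
 * "<ibdev_path>/ports/<port>/hw_counters/<name>"; on any failure the
 * output is zeroed and 1 is returned.
 */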
/**
 * Read device counters table.
 *
 * @param dev
 *   Pointer to Ethernet device.
 * @param[out] stats
 *   Counters table output buffer.
 *
 * @return
 *   0 on success and stats is filled, negative errno value otherwise and
 *   rte_errno is set.
 */
int
mlx5_os_read_dev_counters(struct rte_eth_dev *dev, uint64_t *stats)
{
	struct mlx5_priv *priv = dev->data->dev_private;
	struct mlx5_xstats_ctrl *xstats_ctrl = &priv->xstats_ctrl;
	unsigned int i;
	struct ifreq ifr;
	unsigned int stats_sz = xstats_ctrl->stats_n * sizeof(uint64_t);
	unsigned char et_stat_buf[sizeof(struct ethtool_stats) + stats_sz];
	struct ethtool_stats *et_stats = (struct ethtool_stats *)et_stat_buf;
	int ret;

	et_stats->cmd = ETHTOOL_GSTATS;
	et_stats->n_stats = xstats_ctrl->stats_n;
	ifr.ifr_data = (caddr_t)et_stats;
	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
	if (ret) {
		DRV_LOG(WARNING,
			"port %u unable to read statistic values from device",
			dev->data->port_id);
		return ret;
	}
	for (i = 0; i != xstats_ctrl->mlx5_stats_n; ++i) {
		if (xstats_ctrl->info[i].dev) {
			ret = mlx5_os_read_dev_stat(priv,
					    xstats_ctrl->info[i].ctr_name,
					    &stats[i]);
			/* Return the last cached value if the read fails. */
			if (ret == 0)
				xstats_ctrl->xstats[i] = stats[i];
			else
				stats[i] = xstats_ctrl->xstats[i];
		} else {
			stats[i] = (uint64_t)
				et_stats->data[xstats_ctrl->dev_table_idx[i]];
		}
	}
	return 0;
}

/**
 * Query the number of statistics provided by ETHTOOL.
 *
 * @param dev
 *   Pointer to Ethernet device.
 *
 * @return
 *   Number of statistics on success, negative errno value otherwise and
 *   rte_errno is set.
 */
int
mlx5_os_get_stats_n(struct rte_eth_dev *dev)
{
	struct ethtool_drvinfo drvinfo;
	struct ifreq ifr;
	int ret;

	drvinfo.cmd = ETHTOOL_GDRVINFO;
	ifr.ifr_data = (caddr_t)&drvinfo;
	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
	if (ret) {
		DRV_LOG(WARNING, "port %u unable to query number of statistics",
			dev->data->port_id);
		return ret;
	}
	return drvinfo.n_stats;
}

static const struct mlx5_counter_ctrl mlx5_counters_init[] = {
	{
		.dpdk_name = "rx_port_unicast_bytes",
		.ctr_name = "rx_vport_unicast_bytes",
	},
	{
		.dpdk_name = "rx_port_multicast_bytes",
		.ctr_name = "rx_vport_multicast_bytes",
	},
	{
		.dpdk_name = "rx_port_broadcast_bytes",
		.ctr_name = "rx_vport_broadcast_bytes",
	},
	{
		.dpdk_name = "rx_port_unicast_packets",
		.ctr_name = "rx_vport_unicast_packets",
	},
	{
		.dpdk_name = "rx_port_multicast_packets",
		.ctr_name = "rx_vport_multicast_packets",
	},
	{
		.dpdk_name = "rx_port_broadcast_packets",
		.ctr_name = "rx_vport_broadcast_packets",
	},
	{
		.dpdk_name = "tx_port_unicast_bytes",
		.ctr_name = "tx_vport_unicast_bytes",
	},
	{
		.dpdk_name = "tx_port_multicast_bytes",
		.ctr_name = "tx_vport_multicast_bytes",
	},
	{
		.dpdk_name = "tx_port_broadcast_bytes",
		.ctr_name = "tx_vport_broadcast_bytes",
	},
	{
		.dpdk_name = "tx_port_unicast_packets",
		.ctr_name = "tx_vport_unicast_packets",
	},
	{
		.dpdk_name = "tx_port_multicast_packets",
		.ctr_name = "tx_vport_multicast_packets",
	},
	{
		.dpdk_name = "tx_port_broadcast_packets",
		.ctr_name = "tx_vport_broadcast_packets",
	},
	{
		.dpdk_name = "rx_wqe_err",
		.ctr_name = "rx_wqe_err",
	},
	{
		.dpdk_name = "rx_crc_errors_phy",
		.ctr_name = "rx_crc_errors_phy",
	},
	{
		.dpdk_name = "rx_in_range_len_errors_phy",
		.ctr_name = "rx_in_range_len_errors_phy",
	},
	{
		.dpdk_name = "rx_symbol_err_phy",
		.ctr_name = "rx_symbol_err_phy",
	},
	{
		.dpdk_name = "tx_errors_phy",
		.ctr_name = "tx_errors_phy",
	},
	{
		.dpdk_name = "rx_out_of_buffer",
		.ctr_name = "out_of_buffer",
		.dev = 1,
	},
	{
		.dpdk_name = "tx_packets_phy",
		.ctr_name = "tx_packets_phy",
	},
	{
		.dpdk_name = "rx_packets_phy",
		.ctr_name = "rx_packets_phy",
	},
	{
		.dpdk_name = "tx_discards_phy",
		.ctr_name = "tx_discards_phy",
	},
	{
		.dpdk_name = "rx_discards_phy",
		.ctr_name = "rx_discards_phy",
	},
	{
		.dpdk_name = "tx_bytes_phy",
		.ctr_name = "tx_bytes_phy",
	},
	{
		.dpdk_name = "rx_bytes_phy",
		.ctr_name = "rx_bytes_phy",
	},
	/* Representor only */
	{
		.dpdk_name = "rx_packets",
		.ctr_name = "vport_rx_packets",
	},
	{
		.dpdk_name = "rx_bytes",
		.ctr_name = "vport_rx_bytes",
	},
	{
		.dpdk_name = "tx_packets",
		.ctr_name = "vport_tx_packets",
	},
	{
		.dpdk_name = "tx_bytes",
		.ctr_name = "vport_tx_bytes",
	},
};

static const unsigned int xstats_n = RTE_DIM(mlx5_counters_init);
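
/*
 * The translation table above maps kernel ethtool/sysfs counter names
 * (ctr_name) to the names exposed through the DPDK xstats API (dpdk_name);
 * entries with .dev = 1 are read via sysfs rather than ETHTOOL_GSTATS.
 * A minimal lookup sketch (hypothetical helper, not used by the driver):
 *
 *	static const char *
 *	mlx5_xstats_dpdk_name(const char *ctr_name)
 *	{
 *		unsigned int i;
 *
 *		for (i = 0; i != xstats_n; ++i)
 *			if (!strcmp(mlx5_counters_init[i].ctr_name, ctr_name))
 *				return mlx5_counters_init[i].dpdk_name;
 *		return NULL;
 *	}
 *
 * mlx5_os_stats_init() below performs the same matching while building the
 * ethtool index table.
 */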
/**
 * Init the structures to read device counters.
 *
 * @param dev
 *   Pointer to Ethernet device.
 */
void
mlx5_os_stats_init(struct rte_eth_dev *dev)
{
	struct mlx5_priv *priv = dev->data->dev_private;
	struct mlx5_xstats_ctrl *xstats_ctrl = &priv->xstats_ctrl;
	struct mlx5_stats_ctrl *stats_ctrl = &priv->stats_ctrl;
	unsigned int i;
	unsigned int j;
	struct ifreq ifr;
	struct ethtool_gstrings *strings = NULL;
	unsigned int dev_stats_n;
	unsigned int str_sz;
	int ret;

	/* So that it won't aggregate for each init. */
	xstats_ctrl->mlx5_stats_n = 0;
	ret = mlx5_os_get_stats_n(dev);
	if (ret < 0) {
		DRV_LOG(WARNING, "port %u no extended statistics available",
			dev->data->port_id);
		return;
	}
	dev_stats_n = ret;
	/* Allocate memory to grab stat names and values. */
	str_sz = dev_stats_n * ETH_GSTRING_LEN;
	strings = (struct ethtool_gstrings *)
		  mlx5_malloc(0, str_sz + sizeof(struct ethtool_gstrings), 0,
			      SOCKET_ID_ANY);
	if (!strings) {
		DRV_LOG(WARNING, "port %u unable to allocate memory for xstats",
			dev->data->port_id);
		return;
	}
	strings->cmd = ETHTOOL_GSTRINGS;
	strings->string_set = ETH_SS_STATS;
	strings->len = dev_stats_n;
	ifr.ifr_data = (caddr_t)strings;
	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
	if (ret) {
		DRV_LOG(WARNING, "port %u unable to get statistic names",
			dev->data->port_id);
		goto free;
	}
	for (i = 0; i != dev_stats_n; ++i) {
		const char *curr_string = (const char *)
			&strings->data[i * ETH_GSTRING_LEN];

		for (j = 0; j != xstats_n; ++j) {
			if (!strcmp(mlx5_counters_init[j].ctr_name,
				    curr_string)) {
				unsigned int idx = xstats_ctrl->mlx5_stats_n++;

				xstats_ctrl->dev_table_idx[idx] = i;
				xstats_ctrl->info[idx] = mlx5_counters_init[j];
				break;
			}
		}
	}
	/* Add dev counters. */
	for (i = 0; i != xstats_n; ++i) {
		if (mlx5_counters_init[i].dev) {
			unsigned int idx = xstats_ctrl->mlx5_stats_n++;

			xstats_ctrl->info[idx] = mlx5_counters_init[i];
			xstats_ctrl->hw_stats[idx] = 0;
		}
	}
	MLX5_ASSERT(xstats_ctrl->mlx5_stats_n <= MLX5_MAX_XSTATS);
	xstats_ctrl->stats_n = dev_stats_n;
	/* Copy to base at first time. */
	ret = mlx5_os_read_dev_counters(dev, xstats_ctrl->base);
	if (ret)
		DRV_LOG(ERR, "port %u cannot read device counters: %s",
			dev->data->port_id, strerror(rte_errno));
	mlx5_os_read_dev_stat(priv, "out_of_buffer", &stats_ctrl->imissed_base);
	stats_ctrl->imissed = 0;
free:
	mlx5_free(strings);
}

/**
 * Set the reg_mr and dereg_mr callbacks.
 *
 * @param[out] reg_mr_cb
 *   Pointer to the reg_mr func.
 * @param[out] dereg_mr_cb
 *   Pointer to the dereg_mr func.
 */
void
mlx5_os_set_reg_mr_cb(mlx5_reg_mr_t *reg_mr_cb,
		      mlx5_dereg_mr_t *dereg_mr_cb)
{
	*reg_mr_cb = mlx5_verbs_ops.reg_mr;
	*dereg_mr_cb = mlx5_verbs_ops.dereg_mr;
}

/**
 * Remove a MAC address from the device.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param index
 *   MAC address index.
 */
void
mlx5_os_mac_addr_remove(struct rte_eth_dev *dev, uint32_t index)
{
	struct mlx5_priv *priv = dev->data->dev_private;
	const int vf = priv->config.vf;

	if (vf)
		mlx5_nl_mac_addr_remove(priv->nl_socket_route,
					mlx5_ifindex(dev), priv->mac_own,
					&dev->data->mac_addrs[index], index);
}

/**
 * Add a MAC address to the device.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param mac
 *   MAC address to register.
 * @param index
 *   MAC address index.
 *
 * @return
 *   0 on success, a negative errno value otherwise.
 */
int
mlx5_os_mac_addr_add(struct rte_eth_dev *dev, struct rte_ether_addr *mac,
		     uint32_t index)
{
	struct mlx5_priv *priv = dev->data->dev_private;
	const int vf = priv->config.vf;
	int ret = 0;

	if (vf)
		ret = mlx5_nl_mac_addr_add(priv->nl_socket_route,
					   mlx5_ifindex(dev), priv->mac_own,
					   mac, index);
	return ret;
}
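
/*
 * Usage sketch: on a VF, adding and removing a secondary unicast MAC goes
 * through the Netlink route socket opened at probe time. A hypothetical
 * flow (a locally administered address is assumed for illustration):
 *
 *	struct rte_ether_addr mac = {
 *		.addr_bytes = { 0x02, 0x00, 0x00, 0x00, 0x00, 0x01 },
 *	};
 *
 *	if (mlx5_os_mac_addr_add(dev, &mac, 1) == 0)
 *		mlx5_os_mac_addr_remove(dev, 1);
 *
 * When the vf flag is not set (PF/representor ports), both calls are
 * effectively no-ops here.
 */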
/**
 * Modify a VF MAC address.
 *
 * @param priv
 *   Pointer to device private data.
 * @param iface_idx
 *   Net device interface index.
 * @param mac_addr
 *   MAC address to modify into.
 * @param vf_index
 *   VF index.
 *
 * @return
 *   0 on success, a negative errno value otherwise.
 */
int
mlx5_os_vf_mac_addr_modify(struct mlx5_priv *priv,
			   unsigned int iface_idx,
			   struct rte_ether_addr *mac_addr,
			   int vf_index)
{
	return mlx5_nl_vf_mac_addr_modify
		(priv->nl_socket_route, iface_idx, mac_addr, vf_index);
}

const struct eth_dev_ops mlx5_os_dev_ops = {
	.dev_configure = mlx5_dev_configure,
	.dev_start = mlx5_dev_start,
	.dev_stop = mlx5_dev_stop,
	.dev_set_link_down = mlx5_set_link_down,
	.dev_set_link_up = mlx5_set_link_up,
	.dev_close = mlx5_dev_close,
	.promiscuous_enable = mlx5_promiscuous_enable,
	.promiscuous_disable = mlx5_promiscuous_disable,
	.allmulticast_enable = mlx5_allmulticast_enable,
	.allmulticast_disable = mlx5_allmulticast_disable,
	.link_update = mlx5_link_update,
	.stats_get = mlx5_stats_get,
	.stats_reset = mlx5_stats_reset,
	.xstats_get = mlx5_xstats_get,
	.xstats_reset = mlx5_xstats_reset,
	.xstats_get_names = mlx5_xstats_get_names,
	.fw_version_get = mlx5_fw_version_get,
	.dev_infos_get = mlx5_dev_infos_get,
	.read_clock = mlx5_txpp_read_clock,
	.dev_supported_ptypes_get = mlx5_dev_supported_ptypes_get,
	.vlan_filter_set = mlx5_vlan_filter_set,
	.rx_queue_setup = mlx5_rx_queue_setup,
	.rx_hairpin_queue_setup = mlx5_rx_hairpin_queue_setup,
	.tx_queue_setup = mlx5_tx_queue_setup,
	.tx_hairpin_queue_setup = mlx5_tx_hairpin_queue_setup,
	.rx_queue_release = mlx5_rx_queue_release,
	.tx_queue_release = mlx5_tx_queue_release,
	.flow_ctrl_get = mlx5_dev_get_flow_ctrl,
	.flow_ctrl_set = mlx5_dev_set_flow_ctrl,
	.mac_addr_remove = mlx5_mac_addr_remove,
	.mac_addr_add = mlx5_mac_addr_add,
	.mac_addr_set = mlx5_mac_addr_set,
	.set_mc_addr_list = mlx5_set_mc_addr_list,
	.mtu_set = mlx5_dev_set_mtu,
	.vlan_strip_queue_set = mlx5_vlan_strip_queue_set,
	.vlan_offload_set = mlx5_vlan_offload_set,
	.reta_update = mlx5_dev_rss_reta_update,
	.reta_query = mlx5_dev_rss_reta_query,
	.rss_hash_update = mlx5_rss_hash_update,
	.rss_hash_conf_get = mlx5_rss_hash_conf_get,
	.filter_ctrl = mlx5_dev_filter_ctrl,
	.rx_descriptor_status = mlx5_rx_descriptor_status,
	.tx_descriptor_status = mlx5_tx_descriptor_status,
	.rxq_info_get = mlx5_rxq_info_get,
	.txq_info_get = mlx5_txq_info_get,
	.rx_burst_mode_get = mlx5_rx_burst_mode_get,
	.tx_burst_mode_get = mlx5_tx_burst_mode_get,
	.rx_queue_count = mlx5_rx_queue_count,
	.rx_queue_intr_enable = mlx5_rx_intr_enable,
	.rx_queue_intr_disable = mlx5_rx_intr_disable,
	.is_removed = mlx5_is_removed,
	.udp_tunnel_port_add = mlx5_udp_tunnel_port_add,
	.get_module_info = mlx5_get_module_info,
	.get_module_eeprom = mlx5_get_module_eeprom,
	.hairpin_cap_get = mlx5_hairpin_cap_get,
	.mtr_ops_get = mlx5_flow_meter_ops_get,
};
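
/*
 * Note on the ops tables: mlx5_os_dev_ops is the full primary-process
 * vtable; mlx5_os_dev_sec_ops (below) exposes only the subset that is safe
 * to call from a secondary process; mlx5_os_dev_ops_isolate drops the
 * RSS/RETA callbacks that conflict with flow isolated mode. A simplified
 * sketch of how a PMD typically selects among them (assumed logic, for
 * illustration only):
 *
 *	if (rte_eal_process_type() == RTE_PROC_SECONDARY)
 *		eth_dev->dev_ops = &mlx5_os_dev_sec_ops;
 *	else if (priv->isolated)
 *		eth_dev->dev_ops = &mlx5_os_dev_ops_isolate;
 *	else
 *		eth_dev->dev_ops = &mlx5_os_dev_ops;
 */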
/* Available operations for a secondary process. */
const struct eth_dev_ops mlx5_os_dev_sec_ops = {
	.stats_get = mlx5_stats_get,
	.stats_reset = mlx5_stats_reset,
	.xstats_get = mlx5_xstats_get,
	.xstats_reset = mlx5_xstats_reset,
	.xstats_get_names = mlx5_xstats_get_names,
	.fw_version_get = mlx5_fw_version_get,
	.dev_infos_get = mlx5_dev_infos_get,
	.read_clock = mlx5_txpp_read_clock,
	.rx_descriptor_status = mlx5_rx_descriptor_status,
	.tx_descriptor_status = mlx5_tx_descriptor_status,
	.rxq_info_get = mlx5_rxq_info_get,
	.txq_info_get = mlx5_txq_info_get,
	.rx_burst_mode_get = mlx5_rx_burst_mode_get,
	.tx_burst_mode_get = mlx5_tx_burst_mode_get,
	.get_module_info = mlx5_get_module_info,
	.get_module_eeprom = mlx5_get_module_eeprom,
};

/* Available operations in flow isolated mode. */
const struct eth_dev_ops mlx5_os_dev_ops_isolate = {
	.dev_configure = mlx5_dev_configure,
	.dev_start = mlx5_dev_start,
	.dev_stop = mlx5_dev_stop,
	.dev_set_link_down = mlx5_set_link_down,
	.dev_set_link_up = mlx5_set_link_up,
	.dev_close = mlx5_dev_close,
	.promiscuous_enable = mlx5_promiscuous_enable,
	.promiscuous_disable = mlx5_promiscuous_disable,
	.allmulticast_enable = mlx5_allmulticast_enable,
	.allmulticast_disable = mlx5_allmulticast_disable,
	.link_update = mlx5_link_update,
	.stats_get = mlx5_stats_get,
	.stats_reset = mlx5_stats_reset,
	.xstats_get = mlx5_xstats_get,
	.xstats_reset = mlx5_xstats_reset,
	.xstats_get_names = mlx5_xstats_get_names,
	.fw_version_get = mlx5_fw_version_get,
	.dev_infos_get = mlx5_dev_infos_get,
	.read_clock = mlx5_txpp_read_clock,
	.dev_supported_ptypes_get = mlx5_dev_supported_ptypes_get,
	.vlan_filter_set = mlx5_vlan_filter_set,
	.rx_queue_setup = mlx5_rx_queue_setup,
	.rx_hairpin_queue_setup = mlx5_rx_hairpin_queue_setup,
	.tx_queue_setup = mlx5_tx_queue_setup,
	.tx_hairpin_queue_setup = mlx5_tx_hairpin_queue_setup,
	.rx_queue_release = mlx5_rx_queue_release,
	.tx_queue_release = mlx5_tx_queue_release,
	.flow_ctrl_get = mlx5_dev_get_flow_ctrl,
	.flow_ctrl_set = mlx5_dev_set_flow_ctrl,
	.mac_addr_remove = mlx5_mac_addr_remove,
	.mac_addr_add = mlx5_mac_addr_add,
	.mac_addr_set = mlx5_mac_addr_set,
	.set_mc_addr_list = mlx5_set_mc_addr_list,
	.mtu_set = mlx5_dev_set_mtu,
	.vlan_strip_queue_set = mlx5_vlan_strip_queue_set,
	.vlan_offload_set = mlx5_vlan_offload_set,
	.filter_ctrl = mlx5_dev_filter_ctrl,
	.rx_descriptor_status = mlx5_rx_descriptor_status,
	.tx_descriptor_status = mlx5_tx_descriptor_status,
	.rxq_info_get = mlx5_rxq_info_get,
	.txq_info_get = mlx5_txq_info_get,
	.rx_burst_mode_get = mlx5_rx_burst_mode_get,
	.tx_burst_mode_get = mlx5_tx_burst_mode_get,
	.rx_queue_intr_enable = mlx5_rx_intr_enable,
	.rx_queue_intr_disable = mlx5_rx_intr_disable,
	.is_removed = mlx5_is_removed,
	.get_module_info = mlx5_get_module_info,
	.get_module_eeprom = mlx5_get_module_eeprom,
	.hairpin_cap_get = mlx5_hairpin_cap_get,
	.mtr_ops_get = mlx5_flow_meter_ops_get,
};