1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright 2015 6WIND S.A. 3 * Copyright 2020 Mellanox Technologies, Ltd 4 */ 5 6 #include <stddef.h> 7 #include <unistd.h> 8 #include <string.h> 9 #include <stdint.h> 10 #include <stdlib.h> 11 #include <errno.h> 12 #include <net/if.h> 13 #include <linux/rtnetlink.h> 14 #include <linux/sockios.h> 15 #include <linux/ethtool.h> 16 #include <fcntl.h> 17 18 #include <rte_malloc.h> 19 #include <ethdev_driver.h> 20 #include <ethdev_pci.h> 21 #include <rte_pci.h> 22 #include <bus_driver.h> 23 #include <bus_pci_driver.h> 24 #include <bus_auxiliary_driver.h> 25 #include <rte_common.h> 26 #include <rte_kvargs.h> 27 #include <rte_rwlock.h> 28 #include <rte_spinlock.h> 29 #include <rte_string_fns.h> 30 #include <rte_alarm.h> 31 #include <rte_eal_paging.h> 32 33 #include <mlx5_glue.h> 34 #include <mlx5_devx_cmds.h> 35 #include <mlx5_common.h> 36 #include <mlx5_common_mp.h> 37 #include <mlx5_common_mr.h> 38 #include <mlx5_malloc.h> 39 40 #include "mlx5_defs.h" 41 #include "mlx5.h" 42 #include "mlx5_common_os.h" 43 #include "mlx5_utils.h" 44 #include "mlx5_rxtx.h" 45 #include "mlx5_rx.h" 46 #include "mlx5_tx.h" 47 #include "mlx5_autoconf.h" 48 #include "mlx5_flow.h" 49 #include "rte_pmd_mlx5.h" 50 #include "mlx5_verbs.h" 51 #include "mlx5_nl.h" 52 #include "mlx5_devx.h" 53 54 #ifndef HAVE_IBV_MLX5_MOD_MPW 55 #define MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED (1 << 2) 56 #define MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW (1 << 3) 57 #endif 58 59 #ifndef HAVE_IBV_MLX5_MOD_CQE_128B_COMP 60 #define MLX5DV_CONTEXT_FLAGS_CQE_128B_COMP (1 << 4) 61 #endif 62 63 static const char *MZ_MLX5_PMD_SHARED_DATA = "mlx5_pmd_shared_data"; 64 65 /* Spinlock for mlx5_shared_data allocation. */ 66 static rte_spinlock_t mlx5_shared_data_lock = RTE_SPINLOCK_INITIALIZER; 67 68 /* Process local data for secondary processes. */ 69 static struct mlx5_local_data mlx5_local_data; 70 71 /* rte flow indexed pool configuration. */ 72 static const struct mlx5_indexed_pool_config default_icfg[] = { 73 { 74 .size = sizeof(struct rte_flow), 75 .trunk_size = 64, 76 .need_lock = 1, 77 .release_mem_en = 0, 78 .malloc = mlx5_malloc, 79 .free = mlx5_free, 80 .per_core_cache = 0, 81 .type = "ctl_flow_ipool", 82 }, 83 { 84 .size = sizeof(struct rte_flow), 85 .trunk_size = 64, 86 .grow_trunk = 3, 87 .grow_shift = 2, 88 .need_lock = 1, 89 .release_mem_en = 0, 90 .malloc = mlx5_malloc, 91 .free = mlx5_free, 92 .per_core_cache = 1 << 14, 93 .type = "rte_flow_ipool", 94 }, 95 { 96 .size = sizeof(struct rte_flow), 97 .trunk_size = 64, 98 .grow_trunk = 3, 99 .grow_shift = 2, 100 .need_lock = 1, 101 .release_mem_en = 0, 102 .malloc = mlx5_malloc, 103 .free = mlx5_free, 104 .per_core_cache = 0, 105 .type = "mcp_flow_ipool", 106 }, 107 }; 108 109 /** 110 * Set the completion channel file descriptor interrupt as non-blocking. 111 * 112 * @param[in] rxq_obj 113 * Pointer to RQ channel object, which includes the channel fd 114 * 115 * @param[out] fd 116 * The file descriptor (representing the interrupt) used in this channel. 117 * 118 * @return 119 * 0 on successfully setting the fd to non-blocking, non-zero otherwise. 120 */ 121 int 122 mlx5_os_set_nonblock_channel_fd(int fd) 123 { 124 int flags; 125 126 flags = fcntl(fd, F_GETFL); 127 return fcntl(fd, F_SETFL, flags | O_NONBLOCK); 128 } 129 130 /** 131 * Get mlx5 device attributes. The glue function query_device_ex() is called 132 * with out parameter of type 'struct ibv_device_attr_ex *'. Then fill in mlx5 133 * device attributes from the glue out parameter. 134 * 135 * @param sh 136 * Pointer to shared device context. 137 * 138 * @return 139 * 0 on success, a negative errno value otherwise and rte_errno is set. 140 */ 141 int 142 mlx5_os_capabilities_prepare(struct mlx5_dev_ctx_shared *sh) 143 { 144 int err; 145 struct mlx5_common_device *cdev = sh->cdev; 146 struct mlx5_hca_attr *hca_attr = &cdev->config.hca_attr; 147 struct ibv_device_attr_ex attr_ex = { .comp_mask = 0 }; 148 struct mlx5dv_context dv_attr = { .comp_mask = 0 }; 149 150 err = mlx5_glue->query_device_ex(cdev->ctx, NULL, &attr_ex); 151 if (err) { 152 rte_errno = errno; 153 return -rte_errno; 154 } 155 #ifdef HAVE_IBV_MLX5_MOD_SWP 156 dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_SWP; 157 #endif 158 #ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT 159 dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS; 160 #endif 161 #ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT 162 dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_STRIDING_RQ; 163 #endif 164 #ifdef HAVE_IBV_DEVICE_ATTR_ESW_MGR_REG_C0 165 dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_REG_C0; 166 #endif 167 err = mlx5_glue->dv_query_device(cdev->ctx, &dv_attr); 168 if (err) { 169 rte_errno = errno; 170 return -rte_errno; 171 } 172 memset(&sh->dev_cap, 0, sizeof(struct mlx5_dev_cap)); 173 if (mlx5_dev_is_pci(cdev->dev)) 174 sh->dev_cap.vf = mlx5_dev_is_vf_pci(RTE_DEV_TO_PCI(cdev->dev)); 175 else 176 sh->dev_cap.sf = 1; 177 sh->dev_cap.max_qp_wr = attr_ex.orig_attr.max_qp_wr; 178 sh->dev_cap.max_sge = attr_ex.orig_attr.max_sge; 179 sh->dev_cap.max_cq = attr_ex.orig_attr.max_cq; 180 sh->dev_cap.max_qp = attr_ex.orig_attr.max_qp; 181 #ifdef HAVE_MLX5DV_DR_ACTION_DEST_DEVX_TIR 182 sh->dev_cap.dest_tir = 1; 183 #endif 184 #if defined(HAVE_IBV_FLOW_DV_SUPPORT) && defined(HAVE_MLX5DV_DR) 185 DRV_LOG(DEBUG, "DV flow is supported."); 186 sh->dev_cap.dv_flow_en = 1; 187 #endif 188 #ifdef HAVE_MLX5DV_DR_ESWITCH 189 if (hca_attr->eswitch_manager && sh->dev_cap.dv_flow_en && sh->esw_mode) 190 sh->dev_cap.dv_esw_en = 1; 191 #endif 192 /* 193 * Multi-packet send is supported by ConnectX-4 Lx PF as well 194 * as all ConnectX-5 devices. 195 */ 196 if (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED) { 197 if (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW) { 198 DRV_LOG(DEBUG, "Enhanced MPW is supported."); 199 sh->dev_cap.mps = MLX5_MPW_ENHANCED; 200 } else { 201 DRV_LOG(DEBUG, "MPW is supported."); 202 sh->dev_cap.mps = MLX5_MPW; 203 } 204 } else { 205 DRV_LOG(DEBUG, "MPW isn't supported."); 206 sh->dev_cap.mps = MLX5_MPW_DISABLED; 207 } 208 #if (RTE_CACHE_LINE_SIZE == 128) 209 if (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_CQE_128B_COMP) 210 sh->dev_cap.cqe_comp = 1; 211 DRV_LOG(DEBUG, "Rx CQE 128B compression is %ssupported.", 212 sh->dev_cap.cqe_comp ? "" : "not "); 213 #else 214 sh->dev_cap.cqe_comp = 1; 215 #endif 216 #ifdef HAVE_IBV_DEVICE_MPLS_SUPPORT 217 sh->dev_cap.mpls_en = 218 ((dv_attr.tunnel_offloads_caps & 219 MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_CW_MPLS_OVER_GRE) && 220 (dv_attr.tunnel_offloads_caps & 221 MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_CW_MPLS_OVER_UDP)); 222 DRV_LOG(DEBUG, "MPLS over GRE/UDP tunnel offloading is %ssupported.", 223 sh->dev_cap.mpls_en ? "" : "not "); 224 #else 225 DRV_LOG(WARNING, 226 "MPLS over GRE/UDP tunnel offloading disabled due to old OFED/rdma-core version or firmware configuration"); 227 #endif 228 #if defined(HAVE_IBV_WQ_FLAG_RX_END_PADDING) 229 sh->dev_cap.hw_padding = !!attr_ex.rx_pad_end_addr_align; 230 #elif defined(HAVE_IBV_WQ_FLAGS_PCI_WRITE_END_PADDING) 231 sh->dev_cap.hw_padding = !!(attr_ex.device_cap_flags_ex & 232 IBV_DEVICE_PCI_WRITE_END_PADDING); 233 #endif 234 sh->dev_cap.hw_csum = 235 !!(attr_ex.device_cap_flags_ex & IBV_DEVICE_RAW_IP_CSUM); 236 DRV_LOG(DEBUG, "Checksum offloading is %ssupported.", 237 sh->dev_cap.hw_csum ? "" : "not "); 238 sh->dev_cap.hw_vlan_strip = !!(attr_ex.raw_packet_caps & 239 IBV_RAW_PACKET_CAP_CVLAN_STRIPPING); 240 DRV_LOG(DEBUG, "VLAN stripping is %ssupported.", 241 (sh->dev_cap.hw_vlan_strip ? "" : "not ")); 242 sh->dev_cap.hw_fcs_strip = !!(attr_ex.raw_packet_caps & 243 IBV_RAW_PACKET_CAP_SCATTER_FCS); 244 #if !defined(HAVE_IBV_DEVICE_COUNTERS_SET_V42) && \ 245 !defined(HAVE_IBV_DEVICE_COUNTERS_SET_V45) 246 DRV_LOG(DEBUG, "Counters are not supported."); 247 #endif 248 /* 249 * DPDK doesn't support larger/variable indirection tables. 250 * Once DPDK supports it, take max size from device attr. 251 */ 252 sh->dev_cap.ind_table_max_size = 253 RTE_MIN(attr_ex.rss_caps.max_rwq_indirection_table_size, 254 (unsigned int)RTE_ETH_RSS_RETA_SIZE_512); 255 DRV_LOG(DEBUG, "Maximum Rx indirection table size is %u", 256 sh->dev_cap.ind_table_max_size); 257 sh->dev_cap.tso = (attr_ex.tso_caps.max_tso > 0 && 258 (attr_ex.tso_caps.supported_qpts & 259 (1 << IBV_QPT_RAW_PACKET))); 260 if (sh->dev_cap.tso) 261 sh->dev_cap.tso_max_payload_sz = attr_ex.tso_caps.max_tso; 262 strlcpy(sh->dev_cap.fw_ver, attr_ex.orig_attr.fw_ver, 263 sizeof(sh->dev_cap.fw_ver)); 264 #ifdef HAVE_IBV_MLX5_MOD_SWP 265 if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_SWP) 266 sh->dev_cap.swp = dv_attr.sw_parsing_caps.sw_parsing_offloads & 267 (MLX5_SW_PARSING_CAP | 268 MLX5_SW_PARSING_CSUM_CAP | 269 MLX5_SW_PARSING_TSO_CAP); 270 DRV_LOG(DEBUG, "SWP support: %u", sh->dev_cap.swp); 271 #endif 272 #ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT 273 if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_STRIDING_RQ) { 274 struct mlx5dv_striding_rq_caps *strd_rq_caps = 275 &dv_attr.striding_rq_caps; 276 277 sh->dev_cap.mprq.enabled = 1; 278 sh->dev_cap.mprq.log_min_stride_size = 279 strd_rq_caps->min_single_stride_log_num_of_bytes; 280 sh->dev_cap.mprq.log_max_stride_size = 281 strd_rq_caps->max_single_stride_log_num_of_bytes; 282 sh->dev_cap.mprq.log_min_stride_num = 283 strd_rq_caps->min_single_wqe_log_num_of_strides; 284 sh->dev_cap.mprq.log_max_stride_num = 285 strd_rq_caps->max_single_wqe_log_num_of_strides; 286 sh->dev_cap.mprq.log_min_stride_wqe_size = 287 cdev->config.devx ? 288 hca_attr->log_min_stride_wqe_sz : 289 MLX5_MPRQ_LOG_MIN_STRIDE_WQE_SIZE; 290 DRV_LOG(DEBUG, "\tmin_single_stride_log_num_of_bytes: %u", 291 sh->dev_cap.mprq.log_min_stride_size); 292 DRV_LOG(DEBUG, "\tmax_single_stride_log_num_of_bytes: %u", 293 sh->dev_cap.mprq.log_max_stride_size); 294 DRV_LOG(DEBUG, "\tmin_single_wqe_log_num_of_strides: %u", 295 sh->dev_cap.mprq.log_min_stride_num); 296 DRV_LOG(DEBUG, "\tmax_single_wqe_log_num_of_strides: %u", 297 sh->dev_cap.mprq.log_max_stride_num); 298 DRV_LOG(DEBUG, "\tmin_stride_wqe_log_size: %u", 299 sh->dev_cap.mprq.log_min_stride_wqe_size); 300 DRV_LOG(DEBUG, "\tsupported_qpts: %d", 301 strd_rq_caps->supported_qpts); 302 DRV_LOG(DEBUG, "Device supports Multi-Packet RQ."); 303 } 304 #endif 305 #ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT 306 if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS) { 307 sh->dev_cap.tunnel_en = dv_attr.tunnel_offloads_caps & 308 (MLX5_TUNNELED_OFFLOADS_VXLAN_CAP | 309 MLX5_TUNNELED_OFFLOADS_GRE_CAP | 310 MLX5_TUNNELED_OFFLOADS_GENEVE_CAP); 311 } 312 if (sh->dev_cap.tunnel_en) { 313 DRV_LOG(DEBUG, "Tunnel offloading is supported for %s%s%s", 314 sh->dev_cap.tunnel_en & 315 MLX5_TUNNELED_OFFLOADS_VXLAN_CAP ? "[VXLAN]" : "", 316 sh->dev_cap.tunnel_en & 317 MLX5_TUNNELED_OFFLOADS_GRE_CAP ? "[GRE]" : "", 318 sh->dev_cap.tunnel_en & 319 MLX5_TUNNELED_OFFLOADS_GENEVE_CAP ? "[GENEVE]" : ""); 320 } else { 321 DRV_LOG(DEBUG, "Tunnel offloading is not supported."); 322 } 323 #else 324 DRV_LOG(WARNING, 325 "Tunnel offloading disabled due to old OFED/rdma-core version"); 326 #endif 327 if (!sh->cdev->config.devx) 328 return 0; 329 /* Check capabilities for Packet Pacing. */ 330 DRV_LOG(DEBUG, "Timestamp counter frequency %u kHz.", 331 hca_attr->dev_freq_khz); 332 DRV_LOG(DEBUG, "Packet pacing is %ssupported.", 333 hca_attr->qos.packet_pacing ? "" : "not "); 334 DRV_LOG(DEBUG, "Cross channel ops are %ssupported.", 335 hca_attr->cross_channel ? "" : "not "); 336 DRV_LOG(DEBUG, "WQE index ignore is %ssupported.", 337 hca_attr->wqe_index_ignore ? "" : "not "); 338 DRV_LOG(DEBUG, "Non-wire SQ feature is %ssupported.", 339 hca_attr->non_wire_sq ? "" : "not "); 340 DRV_LOG(DEBUG, "Static WQE SQ feature is %ssupported (%d)", 341 hca_attr->log_max_static_sq_wq ? "" : "not ", 342 hca_attr->log_max_static_sq_wq); 343 DRV_LOG(DEBUG, "WQE rate PP mode is %ssupported.", 344 hca_attr->qos.wqe_rate_pp ? "" : "not "); 345 sh->dev_cap.txpp_en = hca_attr->qos.packet_pacing; 346 if (!hca_attr->cross_channel) { 347 DRV_LOG(DEBUG, 348 "Cross channel operations are required for packet pacing."); 349 sh->dev_cap.txpp_en = 0; 350 } 351 if (!hca_attr->wqe_index_ignore) { 352 DRV_LOG(DEBUG, 353 "WQE index ignore feature is required for packet pacing."); 354 sh->dev_cap.txpp_en = 0; 355 } 356 if (!hca_attr->non_wire_sq) { 357 DRV_LOG(DEBUG, 358 "Non-wire SQ feature is required for packet pacing."); 359 sh->dev_cap.txpp_en = 0; 360 } 361 if (!hca_attr->log_max_static_sq_wq) { 362 DRV_LOG(DEBUG, 363 "Static WQE SQ feature is required for packet pacing."); 364 sh->dev_cap.txpp_en = 0; 365 } 366 if (!hca_attr->qos.wqe_rate_pp) { 367 DRV_LOG(DEBUG, 368 "WQE rate mode is required for packet pacing."); 369 sh->dev_cap.txpp_en = 0; 370 } 371 #ifndef HAVE_MLX5DV_DEVX_UAR_OFFSET 372 DRV_LOG(DEBUG, 373 "DevX does not provide UAR offset, can't create queues for packet pacing."); 374 sh->dev_cap.txpp_en = 0; 375 #endif 376 sh->dev_cap.scatter_fcs_w_decap_disable = 377 hca_attr->scatter_fcs_w_decap_disable; 378 sh->dev_cap.rq_delay_drop_en = hca_attr->rq_delay_drop; 379 mlx5_rt_timestamp_config(sh, hca_attr); 380 #ifdef HAVE_IBV_DEVICE_ATTR_ESW_MGR_REG_C0 381 if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_REG_C0) { 382 sh->dev_cap.esw_info.regc_value = dv_attr.reg_c0.value; 383 sh->dev_cap.esw_info.regc_mask = dv_attr.reg_c0.mask; 384 } 385 #else 386 sh->dev_cap.esw_info.regc_value = 0; 387 sh->dev_cap.esw_info.regc_mask = 0; 388 #endif 389 return 0; 390 } 391 392 /** 393 * Detect misc5 support or not 394 * 395 * @param[in] priv 396 * Device private data pointer 397 */ 398 #ifdef HAVE_MLX5DV_DR 399 static void 400 __mlx5_discovery_misc5_cap(struct mlx5_priv *priv) 401 { 402 #ifdef HAVE_IBV_FLOW_DV_SUPPORT 403 /* Dummy VxLAN matcher to detect rdma-core misc5 cap 404 * Case: IPv4--->UDP--->VxLAN--->vni 405 */ 406 void *tbl; 407 struct mlx5_flow_dv_match_params matcher_mask; 408 void *match_m; 409 void *matcher; 410 void *headers_m; 411 void *misc5_m; 412 uint32_t *tunnel_header_m; 413 struct mlx5dv_flow_matcher_attr dv_attr; 414 415 memset(&matcher_mask, 0, sizeof(matcher_mask)); 416 matcher_mask.size = sizeof(matcher_mask.buf); 417 match_m = matcher_mask.buf; 418 headers_m = MLX5_ADDR_OF(fte_match_param, match_m, outer_headers); 419 misc5_m = MLX5_ADDR_OF(fte_match_param, 420 match_m, misc_parameters_5); 421 tunnel_header_m = (uint32_t *) 422 MLX5_ADDR_OF(fte_match_set_misc5, 423 misc5_m, tunnel_header_1); 424 MLX5_SET(fte_match_set_lyr_2_4, headers_m, ip_protocol, 0xff); 425 MLX5_SET(fte_match_set_lyr_2_4, headers_m, ip_version, 4); 426 MLX5_SET(fte_match_set_lyr_2_4, headers_m, udp_dport, 0xffff); 427 *tunnel_header_m = 0xffffff; 428 429 tbl = mlx5_glue->dr_create_flow_tbl(priv->sh->rx_domain, 1); 430 if (!tbl) { 431 DRV_LOG(INFO, "No SW steering support"); 432 return; 433 } 434 dv_attr.type = IBV_FLOW_ATTR_NORMAL, 435 dv_attr.match_mask = (void *)&matcher_mask, 436 dv_attr.match_criteria_enable = 437 (1 << MLX5_MATCH_CRITERIA_ENABLE_OUTER_BIT) | 438 (1 << MLX5_MATCH_CRITERIA_ENABLE_MISC5_BIT); 439 dv_attr.priority = 3; 440 #ifdef HAVE_MLX5DV_DR_ESWITCH 441 void *misc2_m; 442 if (priv->sh->config.dv_esw_en) { 443 /* FDB enabled reg_c_0 */ 444 dv_attr.match_criteria_enable |= 445 (1 << MLX5_MATCH_CRITERIA_ENABLE_MISC2_BIT); 446 misc2_m = MLX5_ADDR_OF(fte_match_param, 447 match_m, misc_parameters_2); 448 MLX5_SET(fte_match_set_misc2, misc2_m, 449 metadata_reg_c_0, 0xffff); 450 } 451 #endif 452 matcher = mlx5_glue->dv_create_flow_matcher(priv->sh->cdev->ctx, 453 &dv_attr, tbl); 454 if (matcher) { 455 priv->sh->misc5_cap = 1; 456 mlx5_glue->dv_destroy_flow_matcher(matcher); 457 } 458 mlx5_glue->dr_destroy_flow_tbl(tbl); 459 #else 460 RTE_SET_USED(priv); 461 #endif 462 } 463 #endif 464 465 /** 466 * Initialize DR related data within private structure. 467 * Routine checks the reference counter and does actual 468 * resources creation/initialization only if counter is zero. 469 * 470 * @param[in] eth_dev 471 * Pointer to the device. 472 * 473 * @return 474 * Zero on success, positive error code otherwise. 475 */ 476 static int 477 mlx5_alloc_shared_dr(struct rte_eth_dev *eth_dev) 478 { 479 struct mlx5_priv *priv = eth_dev->data->dev_private; 480 struct mlx5_dev_ctx_shared *sh = priv->sh; 481 char s[MLX5_NAME_SIZE] __rte_unused; 482 int err; 483 484 MLX5_ASSERT(sh && sh->refcnt); 485 if (sh->refcnt > 1) 486 return 0; 487 err = mlx5_alloc_table_hash_list(priv); 488 if (err) 489 goto error; 490 sh->default_miss_action = 491 mlx5_glue->dr_create_flow_action_default_miss(); 492 if (!sh->default_miss_action) 493 DRV_LOG(WARNING, "Default miss action is not supported."); 494 /* The resources below are only valid with DV support. */ 495 #ifdef HAVE_IBV_FLOW_DV_SUPPORT 496 /* Init shared flex parsers list, no need lcore_share */ 497 snprintf(s, sizeof(s), "%s_flex_parsers_list", sh->ibdev_name); 498 sh->flex_parsers_dv = mlx5_list_create(s, sh, false, 499 mlx5_flex_parser_create_cb, 500 mlx5_flex_parser_match_cb, 501 mlx5_flex_parser_remove_cb, 502 mlx5_flex_parser_clone_cb, 503 mlx5_flex_parser_clone_free_cb); 504 if (!sh->flex_parsers_dv) 505 goto error; 506 if (priv->sh->config.dv_flow_en == 2) 507 return 0; 508 /* Init port id action list. */ 509 snprintf(s, sizeof(s), "%s_port_id_action_list", sh->ibdev_name); 510 sh->port_id_action_list = mlx5_list_create(s, sh, true, 511 flow_dv_port_id_create_cb, 512 flow_dv_port_id_match_cb, 513 flow_dv_port_id_remove_cb, 514 flow_dv_port_id_clone_cb, 515 flow_dv_port_id_clone_free_cb); 516 if (!sh->port_id_action_list) 517 goto error; 518 /* Init push vlan action list. */ 519 snprintf(s, sizeof(s), "%s_push_vlan_action_list", sh->ibdev_name); 520 sh->push_vlan_action_list = mlx5_list_create(s, sh, true, 521 flow_dv_push_vlan_create_cb, 522 flow_dv_push_vlan_match_cb, 523 flow_dv_push_vlan_remove_cb, 524 flow_dv_push_vlan_clone_cb, 525 flow_dv_push_vlan_clone_free_cb); 526 if (!sh->push_vlan_action_list) 527 goto error; 528 /* Init sample action list. */ 529 snprintf(s, sizeof(s), "%s_sample_action_list", sh->ibdev_name); 530 sh->sample_action_list = mlx5_list_create(s, sh, true, 531 flow_dv_sample_create_cb, 532 flow_dv_sample_match_cb, 533 flow_dv_sample_remove_cb, 534 flow_dv_sample_clone_cb, 535 flow_dv_sample_clone_free_cb); 536 if (!sh->sample_action_list) 537 goto error; 538 /* Init dest array action list. */ 539 snprintf(s, sizeof(s), "%s_dest_array_list", sh->ibdev_name); 540 sh->dest_array_list = mlx5_list_create(s, sh, true, 541 flow_dv_dest_array_create_cb, 542 flow_dv_dest_array_match_cb, 543 flow_dv_dest_array_remove_cb, 544 flow_dv_dest_array_clone_cb, 545 flow_dv_dest_array_clone_free_cb); 546 if (!sh->dest_array_list) 547 goto error; 548 #else 549 if (priv->sh->config.dv_flow_en == 2) 550 return 0; 551 #endif 552 #ifdef HAVE_MLX5DV_DR 553 void *domain; 554 555 /* Reference counter is zero, we should initialize structures. */ 556 domain = mlx5_glue->dr_create_domain(sh->cdev->ctx, 557 MLX5DV_DR_DOMAIN_TYPE_NIC_RX); 558 if (!domain) { 559 DRV_LOG(ERR, "ingress mlx5dv_dr_create_domain failed"); 560 err = errno; 561 goto error; 562 } 563 sh->rx_domain = domain; 564 domain = mlx5_glue->dr_create_domain(sh->cdev->ctx, 565 MLX5DV_DR_DOMAIN_TYPE_NIC_TX); 566 if (!domain) { 567 DRV_LOG(ERR, "egress mlx5dv_dr_create_domain failed"); 568 err = errno; 569 goto error; 570 } 571 sh->tx_domain = domain; 572 #ifdef HAVE_MLX5DV_DR_ESWITCH 573 if (sh->config.dv_esw_en) { 574 domain = mlx5_glue->dr_create_domain(sh->cdev->ctx, 575 MLX5DV_DR_DOMAIN_TYPE_FDB); 576 if (!domain) { 577 DRV_LOG(ERR, "FDB mlx5dv_dr_create_domain failed"); 578 err = errno; 579 goto error; 580 } 581 sh->fdb_domain = domain; 582 } 583 /* 584 * The drop action is just some dummy placeholder in rdma-core. It 585 * does not belong to domains and has no any attributes, and, can be 586 * shared by the entire device. 587 */ 588 sh->dr_drop_action = mlx5_glue->dr_create_flow_action_drop(); 589 if (!sh->dr_drop_action) { 590 DRV_LOG(ERR, "FDB mlx5dv_dr_create_flow_action_drop"); 591 err = errno; 592 goto error; 593 } 594 595 if (sh->config.dv_flow_en == 1) { 596 /* Query availability of metadata reg_c's. */ 597 if (!priv->sh->metadata_regc_check_flag) { 598 err = mlx5_flow_discover_mreg_c(eth_dev); 599 if (err < 0) { 600 err = -err; 601 goto error; 602 } 603 } 604 if (!mlx5_flow_ext_mreg_supported(eth_dev)) { 605 DRV_LOG(DEBUG, 606 "port %u extensive metadata register is not supported", 607 eth_dev->data->port_id); 608 if (sh->config.dv_xmeta_en != MLX5_XMETA_MODE_LEGACY) { 609 DRV_LOG(ERR, "metadata mode %u is not supported " 610 "(no metadata registers available)", 611 sh->config.dv_xmeta_en); 612 err = ENOTSUP; 613 goto error; 614 } 615 } 616 if (sh->config.dv_xmeta_en != MLX5_XMETA_MODE_LEGACY && 617 mlx5_flow_ext_mreg_supported(eth_dev) && sh->dv_regc0_mask) { 618 sh->mreg_cp_tbl = mlx5_hlist_create(MLX5_FLOW_MREG_HNAME, 619 MLX5_FLOW_MREG_HTABLE_SZ, 620 false, true, eth_dev, 621 flow_dv_mreg_create_cb, 622 flow_dv_mreg_match_cb, 623 flow_dv_mreg_remove_cb, 624 flow_dv_mreg_clone_cb, 625 flow_dv_mreg_clone_free_cb); 626 if (!sh->mreg_cp_tbl) { 627 err = ENOMEM; 628 goto error; 629 } 630 } 631 } 632 #endif 633 if (!sh->tunnel_hub && sh->config.dv_miss_info) 634 err = mlx5_alloc_tunnel_hub(sh); 635 if (err) { 636 DRV_LOG(ERR, "mlx5_alloc_tunnel_hub failed err=%d", err); 637 goto error; 638 } 639 if (sh->config.reclaim_mode == MLX5_RCM_AGGR) { 640 mlx5_glue->dr_reclaim_domain_memory(sh->rx_domain, 1); 641 mlx5_glue->dr_reclaim_domain_memory(sh->tx_domain, 1); 642 if (sh->fdb_domain) 643 mlx5_glue->dr_reclaim_domain_memory(sh->fdb_domain, 1); 644 } 645 sh->pop_vlan_action = mlx5_glue->dr_create_flow_action_pop_vlan(); 646 if (!sh->config.allow_duplicate_pattern) { 647 #ifndef HAVE_MLX5_DR_ALLOW_DUPLICATE 648 DRV_LOG(WARNING, "Disallow duplicate pattern is not supported - maybe old rdma-core version?"); 649 #endif 650 mlx5_glue->dr_allow_duplicate_rules(sh->rx_domain, 0); 651 mlx5_glue->dr_allow_duplicate_rules(sh->tx_domain, 0); 652 if (sh->fdb_domain) 653 mlx5_glue->dr_allow_duplicate_rules(sh->fdb_domain, 0); 654 } 655 656 __mlx5_discovery_misc5_cap(priv); 657 #endif /* HAVE_MLX5DV_DR */ 658 LIST_INIT(&sh->shared_rxqs); 659 return 0; 660 error: 661 /* Rollback the created objects. */ 662 if (sh->rx_domain) { 663 mlx5_glue->dr_destroy_domain(sh->rx_domain); 664 sh->rx_domain = NULL; 665 } 666 if (sh->tx_domain) { 667 mlx5_glue->dr_destroy_domain(sh->tx_domain); 668 sh->tx_domain = NULL; 669 } 670 if (sh->fdb_domain) { 671 mlx5_glue->dr_destroy_domain(sh->fdb_domain); 672 sh->fdb_domain = NULL; 673 } 674 if (sh->dr_drop_action) { 675 mlx5_glue->destroy_flow_action(sh->dr_drop_action); 676 sh->dr_drop_action = NULL; 677 } 678 if (sh->pop_vlan_action) { 679 mlx5_glue->destroy_flow_action(sh->pop_vlan_action); 680 sh->pop_vlan_action = NULL; 681 } 682 if (sh->encaps_decaps) { 683 mlx5_hlist_destroy(sh->encaps_decaps); 684 sh->encaps_decaps = NULL; 685 } 686 if (sh->modify_cmds) { 687 mlx5_hlist_destroy(sh->modify_cmds); 688 sh->modify_cmds = NULL; 689 } 690 if (sh->tag_table) { 691 /* tags should be destroyed with flow before. */ 692 mlx5_hlist_destroy(sh->tag_table); 693 sh->tag_table = NULL; 694 } 695 if (sh->tunnel_hub) { 696 mlx5_release_tunnel_hub(sh, priv->dev_port); 697 sh->tunnel_hub = NULL; 698 } 699 mlx5_free_table_hash_list(priv); 700 if (sh->port_id_action_list) { 701 mlx5_list_destroy(sh->port_id_action_list); 702 sh->port_id_action_list = NULL; 703 } 704 if (sh->push_vlan_action_list) { 705 mlx5_list_destroy(sh->push_vlan_action_list); 706 sh->push_vlan_action_list = NULL; 707 } 708 if (sh->sample_action_list) { 709 mlx5_list_destroy(sh->sample_action_list); 710 sh->sample_action_list = NULL; 711 } 712 if (sh->dest_array_list) { 713 mlx5_list_destroy(sh->dest_array_list); 714 sh->dest_array_list = NULL; 715 } 716 if (sh->mreg_cp_tbl) { 717 mlx5_hlist_destroy(sh->mreg_cp_tbl); 718 sh->mreg_cp_tbl = NULL; 719 } 720 return err; 721 } 722 723 /** 724 * Destroy DR related data within private structure. 725 * 726 * @param[in] priv 727 * Pointer to the private device data structure. 728 */ 729 void 730 mlx5_os_free_shared_dr(struct mlx5_priv *priv) 731 { 732 struct mlx5_dev_ctx_shared *sh = priv->sh; 733 #ifdef HAVE_MLX5DV_DR 734 int i; 735 #endif 736 737 MLX5_ASSERT(sh && sh->refcnt); 738 if (sh->refcnt > 1) 739 return; 740 MLX5_ASSERT(LIST_EMPTY(&sh->shared_rxqs)); 741 #ifdef HAVE_MLX5DV_DR 742 if (sh->rx_domain) { 743 mlx5_glue->dr_destroy_domain(sh->rx_domain); 744 sh->rx_domain = NULL; 745 } 746 if (sh->tx_domain) { 747 mlx5_glue->dr_destroy_domain(sh->tx_domain); 748 sh->tx_domain = NULL; 749 } 750 #ifdef HAVE_MLX5DV_DR_ESWITCH 751 if (sh->fdb_domain) { 752 mlx5_glue->dr_destroy_domain(sh->fdb_domain); 753 sh->fdb_domain = NULL; 754 } 755 if (sh->dr_drop_action) { 756 mlx5_glue->destroy_flow_action(sh->dr_drop_action); 757 sh->dr_drop_action = NULL; 758 } 759 #endif 760 if (sh->pop_vlan_action) { 761 mlx5_glue->destroy_flow_action(sh->pop_vlan_action); 762 sh->pop_vlan_action = NULL; 763 } 764 for (i = 0; i < MLX5DR_TABLE_TYPE_MAX; i++) { 765 if (sh->send_to_kernel_action[i].action) { 766 void *action = sh->send_to_kernel_action[i].action; 767 768 mlx5_glue->destroy_flow_action(action); 769 sh->send_to_kernel_action[i].action = NULL; 770 } 771 if (sh->send_to_kernel_action[i].tbl) { 772 struct mlx5_flow_tbl_resource *tbl = 773 sh->send_to_kernel_action[i].tbl; 774 775 flow_dv_tbl_resource_release(sh, tbl); 776 sh->send_to_kernel_action[i].tbl = NULL; 777 } 778 } 779 #endif /* HAVE_MLX5DV_DR */ 780 if (sh->default_miss_action) 781 mlx5_glue->destroy_flow_action 782 (sh->default_miss_action); 783 if (sh->encaps_decaps) { 784 mlx5_hlist_destroy(sh->encaps_decaps); 785 sh->encaps_decaps = NULL; 786 } 787 if (sh->modify_cmds) { 788 mlx5_hlist_destroy(sh->modify_cmds); 789 sh->modify_cmds = NULL; 790 } 791 if (sh->tag_table) { 792 /* tags should be destroyed with flow before. */ 793 mlx5_hlist_destroy(sh->tag_table); 794 sh->tag_table = NULL; 795 } 796 if (sh->tunnel_hub) { 797 mlx5_release_tunnel_hub(sh, priv->dev_port); 798 sh->tunnel_hub = NULL; 799 } 800 mlx5_free_table_hash_list(priv); 801 if (sh->port_id_action_list) { 802 mlx5_list_destroy(sh->port_id_action_list); 803 sh->port_id_action_list = NULL; 804 } 805 if (sh->push_vlan_action_list) { 806 mlx5_list_destroy(sh->push_vlan_action_list); 807 sh->push_vlan_action_list = NULL; 808 } 809 if (sh->sample_action_list) { 810 mlx5_list_destroy(sh->sample_action_list); 811 sh->sample_action_list = NULL; 812 } 813 if (sh->dest_array_list) { 814 mlx5_list_destroy(sh->dest_array_list); 815 sh->dest_array_list = NULL; 816 } 817 if (sh->mreg_cp_tbl) { 818 mlx5_hlist_destroy(sh->mreg_cp_tbl); 819 sh->mreg_cp_tbl = NULL; 820 } 821 } 822 823 /** 824 * Initialize shared data between primary and secondary process. 825 * 826 * A memzone is reserved by primary process and secondary processes attach to 827 * the memzone. 828 * 829 * @return 830 * 0 on success, a negative errno value otherwise and rte_errno is set. 831 */ 832 static int 833 mlx5_init_shared_data(void) 834 { 835 const struct rte_memzone *mz; 836 int ret = 0; 837 838 rte_spinlock_lock(&mlx5_shared_data_lock); 839 if (mlx5_shared_data == NULL) { 840 if (rte_eal_process_type() == RTE_PROC_PRIMARY) { 841 /* Allocate shared memory. */ 842 mz = rte_memzone_reserve(MZ_MLX5_PMD_SHARED_DATA, 843 sizeof(*mlx5_shared_data), 844 SOCKET_ID_ANY, 0); 845 if (mz == NULL) { 846 DRV_LOG(ERR, 847 "Cannot allocate mlx5 shared data"); 848 ret = -rte_errno; 849 goto error; 850 } 851 mlx5_shared_data = mz->addr; 852 memset(mlx5_shared_data, 0, sizeof(*mlx5_shared_data)); 853 rte_spinlock_init(&mlx5_shared_data->lock); 854 } else { 855 /* Lookup allocated shared memory. */ 856 mz = rte_memzone_lookup(MZ_MLX5_PMD_SHARED_DATA); 857 if (mz == NULL) { 858 DRV_LOG(ERR, 859 "Cannot attach mlx5 shared data"); 860 ret = -rte_errno; 861 goto error; 862 } 863 mlx5_shared_data = mz->addr; 864 memset(&mlx5_local_data, 0, sizeof(mlx5_local_data)); 865 } 866 } 867 error: 868 rte_spinlock_unlock(&mlx5_shared_data_lock); 869 return ret; 870 } 871 872 /** 873 * PMD global initialization. 874 * 875 * Independent from individual device, this function initializes global 876 * per-PMD data structures distinguishing primary and secondary processes. 877 * Hence, each initialization is called once per a process. 878 * 879 * @return 880 * 0 on success, a negative errno value otherwise and rte_errno is set. 881 */ 882 static int 883 mlx5_init_once(void) 884 { 885 struct mlx5_shared_data *sd; 886 struct mlx5_local_data *ld = &mlx5_local_data; 887 int ret = 0; 888 889 if (mlx5_init_shared_data()) 890 return -rte_errno; 891 sd = mlx5_shared_data; 892 MLX5_ASSERT(sd); 893 rte_spinlock_lock(&sd->lock); 894 switch (rte_eal_process_type()) { 895 case RTE_PROC_PRIMARY: 896 if (sd->init_done) 897 break; 898 ret = mlx5_mp_init_primary(MLX5_MP_NAME, 899 mlx5_mp_os_primary_handle); 900 if (ret) 901 goto out; 902 sd->init_done = true; 903 break; 904 case RTE_PROC_SECONDARY: 905 if (ld->init_done) 906 break; 907 ret = mlx5_mp_init_secondary(MLX5_MP_NAME, 908 mlx5_mp_os_secondary_handle); 909 if (ret) 910 goto out; 911 ++sd->secondary_cnt; 912 ld->init_done = true; 913 break; 914 default: 915 break; 916 } 917 out: 918 rte_spinlock_unlock(&sd->lock); 919 return ret; 920 } 921 922 /** 923 * DR flow drop action support detect. 924 * 925 * @param dev 926 * Pointer to rte_eth_dev structure. 927 * 928 */ 929 static void 930 mlx5_flow_drop_action_config(struct rte_eth_dev *dev __rte_unused) 931 { 932 #ifdef HAVE_MLX5DV_DR 933 struct mlx5_priv *priv = dev->data->dev_private; 934 935 if (!priv->sh->config.dv_flow_en || !priv->sh->dr_drop_action) 936 return; 937 /** 938 * DR supports drop action placeholder when it is supported; 939 * otherwise, use the queue drop action. 940 */ 941 if (!priv->sh->drop_action_check_flag) { 942 if (!mlx5_flow_discover_dr_action_support(dev)) 943 priv->sh->dr_root_drop_action_en = 1; 944 priv->sh->drop_action_check_flag = 1; 945 } 946 if (priv->sh->dr_root_drop_action_en) 947 priv->root_drop_action = priv->sh->dr_drop_action; 948 else 949 priv->root_drop_action = priv->drop_queue.hrxq->action; 950 #endif 951 } 952 953 static void 954 mlx5_queue_counter_id_prepare(struct rte_eth_dev *dev) 955 { 956 struct mlx5_priv *priv = dev->data->dev_private; 957 void *ctx = priv->sh->cdev->ctx; 958 959 priv->q_counters = mlx5_devx_cmd_queue_counter_alloc(ctx); 960 if (!priv->q_counters) { 961 struct ibv_cq *cq = mlx5_glue->create_cq(ctx, 1, NULL, NULL, 0); 962 struct ibv_wq *wq; 963 964 DRV_LOG(DEBUG, "Port %d queue counter object cannot be created " 965 "by DevX - fall-back to use the kernel driver global " 966 "queue counter.", dev->data->port_id); 967 priv->q_counters_allocation_failure = 1; 968 969 /* Create WQ by kernel and query its queue counter ID. */ 970 if (cq) { 971 wq = mlx5_glue->create_wq(ctx, 972 &(struct ibv_wq_init_attr){ 973 .wq_type = IBV_WQT_RQ, 974 .max_wr = 1, 975 .max_sge = 1, 976 .pd = priv->sh->cdev->pd, 977 .cq = cq, 978 }); 979 if (wq) { 980 /* Counter is assigned only on RDY state. */ 981 int ret = mlx5_glue->modify_wq(wq, 982 &(struct ibv_wq_attr){ 983 .attr_mask = IBV_WQ_ATTR_STATE, 984 .wq_state = IBV_WQS_RDY, 985 }); 986 987 if (ret == 0) 988 mlx5_devx_cmd_wq_query(wq, 989 &priv->counter_set_id); 990 claim_zero(mlx5_glue->destroy_wq(wq)); 991 } 992 claim_zero(mlx5_glue->destroy_cq(cq)); 993 } 994 } else { 995 priv->counter_set_id = priv->q_counters->id; 996 } 997 if (priv->counter_set_id == 0) 998 DRV_LOG(INFO, "Part of the port %d statistics will not be " 999 "available.", dev->data->port_id); 1000 } 1001 1002 /** 1003 * Check if representor spawn info match devargs. 1004 * 1005 * @param spawn 1006 * Verbs device parameters (name, port, switch_info) to spawn. 1007 * @param eth_da 1008 * Device devargs to probe. 1009 * 1010 * @return 1011 * Match result. 1012 */ 1013 static bool 1014 mlx5_representor_match(struct mlx5_dev_spawn_data *spawn, 1015 struct rte_eth_devargs *eth_da) 1016 { 1017 struct mlx5_switch_info *switch_info = &spawn->info; 1018 unsigned int p, f; 1019 uint16_t id; 1020 uint16_t repr_id = mlx5_representor_id_encode(switch_info, 1021 eth_da->type); 1022 1023 /* 1024 * Assuming Multiport E-Switch device was detected, 1025 * if spawned port is an uplink, check if the port 1026 * was requested through representor devarg. 1027 */ 1028 if (mlx5_is_probed_port_on_mpesw_device(spawn) && 1029 switch_info->name_type == MLX5_PHYS_PORT_NAME_TYPE_UPLINK) { 1030 for (p = 0; p < eth_da->nb_ports; ++p) 1031 if (switch_info->port_name == eth_da->ports[p]) 1032 return true; 1033 rte_errno = EBUSY; 1034 return false; 1035 } 1036 switch (eth_da->type) { 1037 case RTE_ETH_REPRESENTOR_PF: 1038 /* 1039 * PF representors provided in devargs translate to uplink ports, but 1040 * if and only if the device is a part of MPESW device. 1041 */ 1042 if (!mlx5_is_probed_port_on_mpesw_device(spawn)) { 1043 rte_errno = EBUSY; 1044 return false; 1045 } 1046 break; 1047 case RTE_ETH_REPRESENTOR_SF: 1048 if (!(spawn->info.port_name == -1 && 1049 switch_info->name_type == 1050 MLX5_PHYS_PORT_NAME_TYPE_PFHPF) && 1051 switch_info->name_type != MLX5_PHYS_PORT_NAME_TYPE_PFSF) { 1052 rte_errno = EBUSY; 1053 return false; 1054 } 1055 break; 1056 case RTE_ETH_REPRESENTOR_VF: 1057 /* Allows HPF representor index -1 as exception. */ 1058 if (!(spawn->info.port_name == -1 && 1059 switch_info->name_type == 1060 MLX5_PHYS_PORT_NAME_TYPE_PFHPF) && 1061 switch_info->name_type != MLX5_PHYS_PORT_NAME_TYPE_PFVF) { 1062 rte_errno = EBUSY; 1063 return false; 1064 } 1065 break; 1066 case RTE_ETH_REPRESENTOR_NONE: 1067 rte_errno = EBUSY; 1068 return false; 1069 default: 1070 rte_errno = ENOTSUP; 1071 DRV_LOG(ERR, "unsupported representor type"); 1072 return false; 1073 } 1074 /* Check representor ID: */ 1075 for (p = 0; p < eth_da->nb_ports; ++p) { 1076 if (!mlx5_is_probed_port_on_mpesw_device(spawn) && spawn->pf_bond < 0) { 1077 /* For non-LAG mode, allow and ignore pf. */ 1078 switch_info->pf_num = eth_da->ports[p]; 1079 repr_id = mlx5_representor_id_encode(switch_info, 1080 eth_da->type); 1081 } 1082 for (f = 0; f < eth_da->nb_representor_ports; ++f) { 1083 id = MLX5_REPRESENTOR_ID 1084 (eth_da->ports[p], eth_da->type, 1085 eth_da->representor_ports[f]); 1086 if (repr_id == id) 1087 return true; 1088 } 1089 } 1090 rte_errno = EBUSY; 1091 return false; 1092 } 1093 1094 /** 1095 * Spawn an Ethernet device from Verbs information. 1096 * 1097 * @param dpdk_dev 1098 * Backing DPDK device. 1099 * @param spawn 1100 * Verbs device parameters (name, port, switch_info) to spawn. 1101 * @param eth_da 1102 * Device arguments. 1103 * @param mkvlist 1104 * Pointer to mlx5 kvargs control, can be NULL if there is no devargs. 1105 * 1106 * @return 1107 * A valid Ethernet device object on success, NULL otherwise and rte_errno 1108 * is set. The following errors are defined: 1109 * 1110 * EBUSY: device is not supposed to be spawned. 1111 * EEXIST: device is already spawned 1112 */ 1113 static struct rte_eth_dev * 1114 mlx5_dev_spawn(struct rte_device *dpdk_dev, 1115 struct mlx5_dev_spawn_data *spawn, 1116 struct rte_eth_devargs *eth_da, 1117 struct mlx5_kvargs_ctrl *mkvlist) 1118 { 1119 const struct mlx5_switch_info *switch_info = &spawn->info; 1120 struct mlx5_dev_ctx_shared *sh = NULL; 1121 struct ibv_port_attr port_attr = { .state = IBV_PORT_NOP }; 1122 struct rte_eth_dev *eth_dev = NULL; 1123 struct mlx5_priv *priv = NULL; 1124 int err = 0; 1125 struct rte_ether_addr mac; 1126 char name[RTE_ETH_NAME_MAX_LEN]; 1127 int own_domain_id = 0; 1128 uint16_t port_id; 1129 struct mlx5_port_info vport_info = { .query_flags = 0 }; 1130 int nl_rdma; 1131 int i; 1132 struct mlx5_indexed_pool_config icfg[RTE_DIM(default_icfg)]; 1133 1134 memcpy(icfg, default_icfg, sizeof(icfg)); 1135 /* Determine if this port representor is supposed to be spawned. */ 1136 if (switch_info->representor && dpdk_dev->devargs && 1137 !mlx5_representor_match(spawn, eth_da)) 1138 return NULL; 1139 /* Build device name. */ 1140 if (spawn->pf_bond >= 0) { 1141 /* Bonding device. */ 1142 if (!switch_info->representor) { 1143 err = snprintf(name, sizeof(name), "%s_%s", 1144 dpdk_dev->name, spawn->phys_dev_name); 1145 } else { 1146 err = snprintf(name, sizeof(name), "%s_%s_representor_c%dpf%d%s%u", 1147 dpdk_dev->name, spawn->phys_dev_name, 1148 switch_info->ctrl_num, 1149 switch_info->pf_num, 1150 switch_info->name_type == 1151 MLX5_PHYS_PORT_NAME_TYPE_PFSF ? "sf" : "vf", 1152 switch_info->port_name); 1153 } 1154 } else if (mlx5_is_probed_port_on_mpesw_device(spawn)) { 1155 /* MPESW device. */ 1156 if (switch_info->name_type == MLX5_PHYS_PORT_NAME_TYPE_UPLINK) { 1157 err = snprintf(name, sizeof(name), "%s_p%d", 1158 dpdk_dev->name, spawn->mpesw_port); 1159 } else { 1160 err = snprintf(name, sizeof(name), "%s_representor_c%dpf%d%s%u", 1161 dpdk_dev->name, 1162 switch_info->ctrl_num, 1163 switch_info->pf_num, 1164 switch_info->name_type == 1165 MLX5_PHYS_PORT_NAME_TYPE_PFSF ? "sf" : "vf", 1166 switch_info->port_name); 1167 } 1168 } else { 1169 /* Single device. */ 1170 if (!switch_info->representor) 1171 strlcpy(name, dpdk_dev->name, sizeof(name)); 1172 else 1173 err = snprintf(name, sizeof(name), "%s_representor_%s%u", 1174 dpdk_dev->name, 1175 switch_info->name_type == 1176 MLX5_PHYS_PORT_NAME_TYPE_PFSF ? "sf" : "vf", 1177 switch_info->port_name); 1178 } 1179 if (err >= (int)sizeof(name)) 1180 DRV_LOG(WARNING, "device name overflow %s", name); 1181 /* check if the device is already spawned */ 1182 if (rte_eth_dev_get_port_by_name(name, &port_id) == 0) { 1183 /* 1184 * When device is already spawned, its devargs should be set 1185 * as used. otherwise, mlx5_kvargs_validate() will fail. 1186 */ 1187 if (mkvlist) 1188 mlx5_port_args_set_used(name, port_id, mkvlist); 1189 rte_errno = EEXIST; 1190 return NULL; 1191 } 1192 DRV_LOG(DEBUG, "naming Ethernet device \"%s\"", name); 1193 if (rte_eal_process_type() == RTE_PROC_SECONDARY) { 1194 struct mlx5_mp_id mp_id; 1195 int fd; 1196 1197 eth_dev = rte_eth_dev_attach_secondary(name); 1198 if (eth_dev == NULL) { 1199 DRV_LOG(ERR, "can not attach rte ethdev"); 1200 rte_errno = ENOMEM; 1201 return NULL; 1202 } 1203 eth_dev->device = dpdk_dev; 1204 eth_dev->dev_ops = &mlx5_dev_sec_ops; 1205 eth_dev->rx_descriptor_status = mlx5_rx_descriptor_status; 1206 eth_dev->tx_descriptor_status = mlx5_tx_descriptor_status; 1207 err = mlx5_proc_priv_init(eth_dev); 1208 if (err) 1209 return NULL; 1210 mlx5_mp_id_init(&mp_id, eth_dev->data->port_id); 1211 /* Receive command fd from primary process */ 1212 fd = mlx5_mp_req_verbs_cmd_fd(&mp_id); 1213 if (fd < 0) 1214 goto err_secondary; 1215 /* Remap UAR for Tx queues. */ 1216 err = mlx5_tx_uar_init_secondary(eth_dev, fd); 1217 close(fd); 1218 if (err) 1219 goto err_secondary; 1220 /* 1221 * Ethdev pointer is still required as input since 1222 * the primary device is not accessible from the 1223 * secondary process. 1224 */ 1225 eth_dev->rx_pkt_burst = mlx5_select_rx_function(eth_dev); 1226 eth_dev->tx_pkt_burst = mlx5_select_tx_function(eth_dev); 1227 return eth_dev; 1228 err_secondary: 1229 mlx5_dev_close(eth_dev); 1230 return NULL; 1231 } 1232 sh = mlx5_alloc_shared_dev_ctx(spawn, mkvlist); 1233 if (!sh) 1234 return NULL; 1235 nl_rdma = mlx5_nl_init(NETLINK_RDMA, 0); 1236 /* Check port status. */ 1237 if (spawn->phys_port <= UINT8_MAX) { 1238 /* Legacy Verbs api only support u8 port number. */ 1239 err = mlx5_glue->query_port(sh->cdev->ctx, spawn->phys_port, 1240 &port_attr); 1241 if (err) { 1242 DRV_LOG(ERR, "port query failed: %s", strerror(err)); 1243 goto error; 1244 } 1245 if (port_attr.link_layer != IBV_LINK_LAYER_ETHERNET) { 1246 DRV_LOG(ERR, "port is not configured in Ethernet mode"); 1247 err = EINVAL; 1248 goto error; 1249 } 1250 } else if (nl_rdma >= 0) { 1251 /* IB doesn't allow more than 255 ports, must be Ethernet. */ 1252 err = mlx5_nl_port_state(nl_rdma, 1253 spawn->phys_dev_name, 1254 spawn->phys_port); 1255 if (err < 0) { 1256 DRV_LOG(INFO, "Failed to get netlink port state: %s", 1257 strerror(rte_errno)); 1258 err = -rte_errno; 1259 goto error; 1260 } 1261 port_attr.state = (enum ibv_port_state)err; 1262 } 1263 if (port_attr.state != IBV_PORT_ACTIVE) 1264 DRV_LOG(INFO, "port is not active: \"%s\" (%d)", 1265 mlx5_glue->port_state_str(port_attr.state), 1266 port_attr.state); 1267 /* Allocate private eth device data. */ 1268 priv = mlx5_malloc(MLX5_MEM_ZERO | MLX5_MEM_RTE, 1269 sizeof(*priv), 1270 RTE_CACHE_LINE_SIZE, SOCKET_ID_ANY); 1271 if (priv == NULL) { 1272 DRV_LOG(ERR, "priv allocation failure"); 1273 err = ENOMEM; 1274 goto error; 1275 } 1276 /* 1277 * When user configures remote PD and CTX and device creates RxQ by 1278 * DevX, external RxQ is both supported and requested. 1279 */ 1280 if (mlx5_imported_pd_and_ctx(sh->cdev) && mlx5_devx_obj_ops_en(sh)) { 1281 priv->ext_rxqs = mlx5_malloc(MLX5_MEM_ZERO | MLX5_MEM_RTE, 1282 sizeof(struct mlx5_external_q) * 1283 MLX5_MAX_EXT_RX_QUEUES, 0, 1284 SOCKET_ID_ANY); 1285 if (priv->ext_rxqs == NULL) { 1286 DRV_LOG(ERR, "Fail to allocate external RxQ array."); 1287 err = ENOMEM; 1288 goto error; 1289 } 1290 priv->ext_txqs = mlx5_malloc(MLX5_MEM_ZERO | MLX5_MEM_RTE, 1291 sizeof(struct mlx5_external_q) * 1292 MLX5_MAX_EXT_TX_QUEUES, 0, 1293 SOCKET_ID_ANY); 1294 if (priv->ext_txqs == NULL) { 1295 DRV_LOG(ERR, "Fail to allocate external TxQ array."); 1296 err = ENOMEM; 1297 goto error; 1298 } 1299 DRV_LOG(DEBUG, "External queue is supported."); 1300 } 1301 priv->sh = sh; 1302 priv->dev_port = spawn->phys_port; 1303 priv->pci_dev = spawn->pci_dev; 1304 priv->mtu = RTE_ETHER_MTU; 1305 /* Some internal functions rely on Netlink sockets, open them now. */ 1306 priv->nl_socket_rdma = nl_rdma; 1307 priv->nl_socket_route = mlx5_nl_init(NETLINK_ROUTE, 0); 1308 priv->representor = !!switch_info->representor; 1309 priv->master = !!switch_info->master; 1310 priv->domain_id = RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID; 1311 priv->vport_meta_tag = 0; 1312 priv->vport_meta_mask = 0; 1313 priv->pf_bond = spawn->pf_bond; 1314 priv->mpesw_port = spawn->mpesw_port; 1315 priv->mpesw_uplink = false; 1316 priv->mpesw_owner = spawn->info.mpesw_owner; 1317 if (mlx5_is_port_on_mpesw_device(priv)) 1318 priv->mpesw_uplink = (spawn->info.name_type == MLX5_PHYS_PORT_NAME_TYPE_UPLINK); 1319 1320 DRV_LOG(DEBUG, 1321 "dev_port=%u bus=%s pci=%s master=%d representor=%d pf_bond=%d " 1322 "mpesw_port=%d mpesw_uplink=%d", 1323 priv->dev_port, dpdk_dev->bus->name, 1324 priv->pci_dev ? priv->pci_dev->name : "NONE", 1325 priv->master, priv->representor, priv->pf_bond, 1326 priv->mpesw_port, priv->mpesw_uplink); 1327 1328 if (mlx5_is_port_on_mpesw_device(priv) && priv->sh->config.dv_flow_en != 2) { 1329 DRV_LOG(ERR, "MPESW device is supported only with HWS"); 1330 err = ENOTSUP; 1331 goto error; 1332 } 1333 /* 1334 * If we have E-Switch we should determine the vport attributes. 1335 * E-Switch may use either source vport field or reg_c[0] metadata 1336 * register to match on vport index. The engaged part of metadata 1337 * register is defined by mask. 1338 */ 1339 if (sh->esw_mode) { 1340 err = mlx5_glue->devx_port_query(sh->cdev->ctx, 1341 spawn->phys_port, 1342 &vport_info); 1343 if (err) { 1344 DRV_LOG(WARNING, 1345 "Cannot query devx port %d on device %s", 1346 spawn->phys_port, spawn->phys_dev_name); 1347 vport_info.query_flags = 0; 1348 } 1349 } 1350 if (vport_info.query_flags & MLX5_PORT_QUERY_REG_C0) { 1351 priv->vport_meta_tag = vport_info.vport_meta_tag; 1352 priv->vport_meta_mask = vport_info.vport_meta_mask; 1353 if (!priv->vport_meta_mask) { 1354 DRV_LOG(ERR, 1355 "vport zero mask for port %d on bonding device %s", 1356 spawn->phys_port, spawn->phys_dev_name); 1357 err = ENOTSUP; 1358 goto error; 1359 } 1360 if (priv->vport_meta_tag & ~priv->vport_meta_mask) { 1361 DRV_LOG(ERR, 1362 "Invalid vport tag for port %d on bonding device %s", 1363 spawn->phys_port, spawn->phys_dev_name); 1364 err = ENOTSUP; 1365 goto error; 1366 } 1367 } 1368 if (vport_info.query_flags & MLX5_PORT_QUERY_VPORT) { 1369 priv->vport_id = vport_info.vport_id; 1370 } else if (spawn->pf_bond >= 0 && sh->esw_mode) { 1371 DRV_LOG(ERR, 1372 "Cannot deduce vport index for port %d on bonding device %s", 1373 spawn->phys_port, spawn->phys_dev_name); 1374 err = ENOTSUP; 1375 goto error; 1376 } else { 1377 /* 1378 * Suppose vport index in compatible way. Kernel/rdma_core 1379 * support single E-Switch per PF configurations only and 1380 * vport_id field contains the vport index for associated VF, 1381 * which is deduced from representor port name. 1382 * For example, let's have the IB device port 10, it has 1383 * attached network device eth0, which has port name attribute 1384 * pf0vf2, we can deduce the VF number as 2, and set vport index 1385 * as 3 (2+1). This assigning schema should be changed if the 1386 * multiple E-Switch instances per PF configurations or/and PCI 1387 * subfunctions are added. 1388 */ 1389 priv->vport_id = switch_info->representor ? 1390 switch_info->port_name + 1 : -1; 1391 } 1392 priv->representor_id = mlx5_representor_id_encode(switch_info, 1393 eth_da->type); 1394 /* 1395 * Look for sibling devices in order to reuse their switch domain 1396 * if any, otherwise allocate one. 1397 */ 1398 MLX5_ETH_FOREACH_DEV(port_id, dpdk_dev) { 1399 const struct mlx5_priv *opriv = 1400 rte_eth_devices[port_id].data->dev_private; 1401 1402 if (!opriv || 1403 opriv->sh != priv->sh || 1404 opriv->domain_id == 1405 RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID) 1406 continue; 1407 priv->domain_id = opriv->domain_id; 1408 DRV_LOG(DEBUG, "dev_port-%u inherit domain_id=%u\n", 1409 priv->dev_port, priv->domain_id); 1410 break; 1411 } 1412 if (priv->domain_id == RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID) { 1413 err = rte_eth_switch_domain_alloc(&priv->domain_id); 1414 if (err) { 1415 err = rte_errno; 1416 DRV_LOG(ERR, "unable to allocate switch domain: %s", 1417 strerror(rte_errno)); 1418 goto error; 1419 } 1420 own_domain_id = 1; 1421 DRV_LOG(DEBUG, "dev_port-%u new domain_id=%u\n", 1422 priv->dev_port, priv->domain_id); 1423 } 1424 if (sh->cdev->config.devx) { 1425 struct mlx5_hca_attr *hca_attr = &sh->cdev->config.hca_attr; 1426 1427 sh->steering_format_version = hca_attr->steering_format_version; 1428 #if defined(HAVE_MLX5_DR_CREATE_ACTION_ASO_EXT) 1429 if (hca_attr->qos.sup && hca_attr->qos.flow_meter_old && 1430 sh->config.dv_flow_en) { 1431 if (sh->registers.aso_reg != REG_NON) { 1432 priv->mtr_en = 1; 1433 priv->mtr_reg_share = hca_attr->qos.flow_meter; 1434 } 1435 } 1436 if (hca_attr->qos.sup && hca_attr->qos.flow_meter_aso_sup) { 1437 uint32_t log_obj_size = 1438 rte_log2_u32(MLX5_ASO_MTRS_PER_POOL >> 1); 1439 if (log_obj_size >= 1440 hca_attr->qos.log_meter_aso_granularity && 1441 log_obj_size <= 1442 hca_attr->qos.log_meter_aso_max_alloc) 1443 sh->meter_aso_en = 1; 1444 } 1445 if (priv->mtr_en) { 1446 err = mlx5_aso_flow_mtrs_mng_init(priv->sh); 1447 if (err) { 1448 err = -err; 1449 goto error; 1450 } 1451 } 1452 if (hca_attr->flow.tunnel_header_0_1) 1453 sh->tunnel_header_0_1 = 1; 1454 if (hca_attr->flow.tunnel_header_2_3) 1455 sh->tunnel_header_2_3 = 1; 1456 #endif /* HAVE_MLX5_DR_CREATE_ACTION_ASO_EXT */ 1457 #ifdef HAVE_MLX5_DR_CREATE_ACTION_ASO 1458 if (hca_attr->flow_hit_aso && sh->registers.aso_reg == REG_C_3) { 1459 sh->flow_hit_aso_en = 1; 1460 err = mlx5_flow_aso_age_mng_init(sh); 1461 if (err) { 1462 err = -err; 1463 goto error; 1464 } 1465 DRV_LOG(DEBUG, "Flow Hit ASO is supported."); 1466 } 1467 #endif /* HAVE_MLX5_DR_CREATE_ACTION_ASO */ 1468 #if defined (HAVE_MLX5_DR_CREATE_ACTION_ASO) && \ 1469 defined (HAVE_MLX5_DR_ACTION_ASO_CT) 1470 /* HWS create CT ASO SQ based on HWS configure queue number. */ 1471 if (sh->config.dv_flow_en != 2 && 1472 hca_attr->ct_offload && sh->registers.aso_reg == REG_C_3) { 1473 err = mlx5_flow_aso_ct_mng_init(sh); 1474 if (err) { 1475 err = -err; 1476 goto error; 1477 } 1478 DRV_LOG(DEBUG, "CT ASO is supported."); 1479 sh->ct_aso_en = 1; 1480 } 1481 #endif /* HAVE_MLX5_DR_CREATE_ACTION_ASO && HAVE_MLX5_DR_ACTION_ASO_CT */ 1482 #if defined(HAVE_MLX5DV_DR) && defined(HAVE_MLX5_DR_CREATE_ACTION_FLOW_SAMPLE) 1483 if (hca_attr->log_max_ft_sampler_num > 0 && 1484 sh->config.dv_flow_en) { 1485 priv->sampler_en = 1; 1486 DRV_LOG(DEBUG, "Sampler enabled!"); 1487 } else { 1488 priv->sampler_en = 0; 1489 if (!hca_attr->log_max_ft_sampler_num) 1490 DRV_LOG(WARNING, 1491 "No available register for sampler."); 1492 else 1493 DRV_LOG(DEBUG, "DV flow is not supported!"); 1494 } 1495 #endif 1496 if (hca_attr->lag_rx_port_affinity) { 1497 sh->lag_rx_port_affinity_en = 1; 1498 DRV_LOG(DEBUG, "LAG Rx Port Affinity enabled"); 1499 } 1500 priv->num_lag_ports = hca_attr->num_lag_ports; 1501 DRV_LOG(DEBUG, "The number of lag ports is %d", priv->num_lag_ports); 1502 } 1503 /* Process parameters and store port configuration on priv structure. */ 1504 err = mlx5_port_args_config(priv, mkvlist, &priv->config); 1505 if (err) { 1506 err = rte_errno; 1507 DRV_LOG(ERR, "Failed to process port configure: %s", 1508 strerror(rte_errno)); 1509 goto error; 1510 } 1511 eth_dev = rte_eth_dev_allocate(name); 1512 if (eth_dev == NULL) { 1513 DRV_LOG(ERR, "can not allocate rte ethdev"); 1514 err = ENOMEM; 1515 goto error; 1516 } 1517 if (priv->representor) { 1518 eth_dev->data->dev_flags |= RTE_ETH_DEV_REPRESENTOR; 1519 eth_dev->data->representor_id = priv->representor_id; 1520 MLX5_ETH_FOREACH_DEV(port_id, dpdk_dev) { 1521 struct mlx5_priv *opriv = 1522 rte_eth_devices[port_id].data->dev_private; 1523 if (opriv && 1524 opriv->master && 1525 opriv->domain_id == priv->domain_id && 1526 opriv->sh == priv->sh) { 1527 eth_dev->data->backer_port_id = port_id; 1528 break; 1529 } 1530 } 1531 if (port_id >= RTE_MAX_ETHPORTS) 1532 eth_dev->data->backer_port_id = eth_dev->data->port_id; 1533 } 1534 priv->mp_id.port_id = eth_dev->data->port_id; 1535 strlcpy(priv->mp_id.name, MLX5_MP_NAME, RTE_MP_MAX_NAME_LEN); 1536 /* 1537 * Store associated network device interface index. This index 1538 * is permanent throughout the lifetime of device. So, we may store 1539 * the ifindex here and use the cached value further. 1540 */ 1541 MLX5_ASSERT(spawn->ifindex); 1542 priv->if_index = spawn->ifindex; 1543 priv->lag_affinity_idx = sh->refcnt - 1; 1544 eth_dev->data->dev_private = priv; 1545 priv->dev_data = eth_dev->data; 1546 eth_dev->data->mac_addrs = priv->mac; 1547 eth_dev->device = dpdk_dev; 1548 eth_dev->data->dev_flags |= RTE_ETH_DEV_AUTOFILL_QUEUE_XSTATS; 1549 /* Configure the first MAC address by default. */ 1550 if (mlx5_get_mac(eth_dev, &mac.addr_bytes)) { 1551 DRV_LOG(ERR, 1552 "port %u cannot get MAC address, is mlx5_en" 1553 " loaded? (errno: %s)", 1554 eth_dev->data->port_id, strerror(rte_errno)); 1555 err = ENODEV; 1556 goto error; 1557 } 1558 DRV_LOG(INFO, 1559 "port %u MAC address is " RTE_ETHER_ADDR_PRT_FMT, 1560 eth_dev->data->port_id, RTE_ETHER_ADDR_BYTES(&mac)); 1561 #ifdef RTE_LIBRTE_MLX5_DEBUG 1562 { 1563 char ifname[MLX5_NAMESIZE]; 1564 1565 if (mlx5_get_ifname(eth_dev, &ifname) == 0) 1566 DRV_LOG(DEBUG, "port %u ifname is \"%s\"", 1567 eth_dev->data->port_id, ifname); 1568 else 1569 DRV_LOG(DEBUG, "port %u ifname is unknown", 1570 eth_dev->data->port_id); 1571 } 1572 #endif 1573 /* Get actual MTU if possible. */ 1574 err = mlx5_get_mtu(eth_dev, &priv->mtu); 1575 if (err) { 1576 err = rte_errno; 1577 goto error; 1578 } 1579 DRV_LOG(DEBUG, "port %u MTU is %u", eth_dev->data->port_id, 1580 priv->mtu); 1581 /* Initialize burst functions to prevent crashes before link-up. */ 1582 eth_dev->rx_pkt_burst = rte_eth_pkt_burst_dummy; 1583 eth_dev->tx_pkt_burst = rte_eth_pkt_burst_dummy; 1584 eth_dev->dev_ops = &mlx5_dev_ops; 1585 eth_dev->rx_descriptor_status = mlx5_rx_descriptor_status; 1586 eth_dev->tx_descriptor_status = mlx5_tx_descriptor_status; 1587 eth_dev->rx_queue_count = mlx5_rx_queue_count; 1588 /* Register MAC address. */ 1589 claim_zero(mlx5_mac_addr_add(eth_dev, &mac, 0, 0)); 1590 if (sh->dev_cap.vf && sh->config.vf_nl_en) 1591 mlx5_nl_mac_addr_sync(priv->nl_socket_route, 1592 mlx5_ifindex(eth_dev), 1593 eth_dev->data->mac_addrs, 1594 MLX5_MAX_MAC_ADDRESSES); 1595 priv->ctrl_flows = 0; 1596 rte_spinlock_init(&priv->flow_list_lock); 1597 TAILQ_INIT(&priv->flow_meters); 1598 priv->mtr_profile_tbl = mlx5_l3t_create(MLX5_L3T_TYPE_PTR); 1599 if (!priv->mtr_profile_tbl) 1600 goto error; 1601 /* Bring Ethernet device up. */ 1602 DRV_LOG(DEBUG, "port %u forcing Ethernet interface up", 1603 eth_dev->data->port_id); 1604 /* Read link status in case it is up and there will be no event. */ 1605 mlx5_link_update(eth_dev, 0); 1606 /* Watch LSC interrupts between port probe and port start. */ 1607 priv->sh->port[priv->dev_port - 1].nl_ih_port_id = 1608 eth_dev->data->port_id; 1609 mlx5_set_link_up(eth_dev); 1610 for (i = 0; i < MLX5_FLOW_TYPE_MAXI; i++) { 1611 icfg[i].release_mem_en = !!sh->config.reclaim_mode; 1612 if (sh->config.reclaim_mode) 1613 icfg[i].per_core_cache = 0; 1614 #ifdef HAVE_MLX5_HWS_SUPPORT 1615 if (priv->sh->config.dv_flow_en == 2) 1616 icfg[i].size = sizeof(struct rte_flow_hw) + sizeof(struct rte_flow_nt2hws); 1617 #endif 1618 priv->flows[i] = mlx5_ipool_create(&icfg[i]); 1619 if (!priv->flows[i]) 1620 goto error; 1621 } 1622 /* Create context for virtual machine VLAN workaround. */ 1623 priv->vmwa_context = mlx5_vlan_vmwa_init(eth_dev, spawn->ifindex); 1624 if (mlx5_devx_obj_ops_en(sh)) { 1625 priv->obj_ops = devx_obj_ops; 1626 mlx5_queue_counter_id_prepare(eth_dev); 1627 priv->obj_ops.lb_dummy_queue_create = 1628 mlx5_rxq_ibv_obj_dummy_lb_create; 1629 priv->obj_ops.lb_dummy_queue_release = 1630 mlx5_rxq_ibv_obj_dummy_lb_release; 1631 } else if (spawn->max_port > UINT8_MAX) { 1632 /* Verbs can't support ports larger than 255 by design. */ 1633 DRV_LOG(ERR, "must enable DV and ESW when RDMA link ports > 255"); 1634 err = ENOTSUP; 1635 goto error; 1636 } else { 1637 priv->obj_ops = ibv_obj_ops; 1638 } 1639 if (sh->config.tx_pp && 1640 priv->obj_ops.txq_obj_new != mlx5_txq_devx_obj_new) { 1641 /* 1642 * HAVE_MLX5DV_DEVX_UAR_OFFSET is required to support 1643 * packet pacing and already checked above. 1644 * Hence, we should only make sure the SQs will be created 1645 * with DevX, not with Verbs. 1646 * Verbs allocates the SQ UAR on its own and it can't be shared 1647 * with Clock Queue UAR as required for Tx scheduling. 1648 */ 1649 DRV_LOG(ERR, "Verbs SQs, UAR can't be shared as required for packet pacing"); 1650 err = ENODEV; 1651 goto error; 1652 } 1653 priv->drop_queue.hrxq = mlx5_drop_action_create(eth_dev); 1654 if (!priv->drop_queue.hrxq) 1655 goto error; 1656 priv->hrxqs = mlx5_list_create("hrxq", eth_dev, true, 1657 mlx5_hrxq_create_cb, 1658 mlx5_hrxq_match_cb, 1659 mlx5_hrxq_remove_cb, 1660 mlx5_hrxq_clone_cb, 1661 mlx5_hrxq_clone_free_cb); 1662 if (!priv->hrxqs) 1663 goto error; 1664 mlx5_set_metadata_mask(eth_dev); 1665 if (sh->config.dv_xmeta_en != MLX5_XMETA_MODE_LEGACY && 1666 !priv->sh->dv_regc0_mask) { 1667 DRV_LOG(ERR, "metadata mode %u is not supported " 1668 "(no metadata reg_c[0] is available)", 1669 sh->config.dv_xmeta_en); 1670 err = ENOTSUP; 1671 goto error; 1672 } 1673 rte_rwlock_init(&priv->ind_tbls_lock); 1674 if (sh->config.dv_flow_en) { 1675 err = mlx5_alloc_shared_dr(eth_dev); 1676 if (err) 1677 goto error; 1678 if (mlx5_flex_item_port_init(eth_dev) < 0) 1679 goto error; 1680 } 1681 if (sh->phdev->config.ipv6_tc_fallback == MLX5_IPV6_TC_UNKNOWN) { 1682 sh->phdev->config.ipv6_tc_fallback = MLX5_IPV6_TC_OK; 1683 if (!sh->cdev->config.hca_attr.modify_outer_ipv6_traffic_class || 1684 (sh->config.dv_flow_en == 1 && mlx5_flow_discover_ipv6_tc_support(eth_dev))) 1685 sh->phdev->config.ipv6_tc_fallback = MLX5_IPV6_TC_FALLBACK; 1686 } 1687 if (priv->sh->config.dv_flow_en == 2) { 1688 #ifdef HAVE_MLX5_HWS_SUPPORT 1689 if (priv->sh->config.dv_esw_en) { 1690 uint32_t usable_bits; 1691 uint32_t required_bits; 1692 1693 if (priv->sh->dv_regc0_mask == UINT32_MAX) { 1694 DRV_LOG(ERR, "E-Switch port metadata is required when using HWS " 1695 "but it is disabled (configure it through devlink)"); 1696 err = ENOTSUP; 1697 goto error; 1698 } 1699 if (priv->sh->dv_regc0_mask == 0) { 1700 DRV_LOG(ERR, "E-Switch with HWS is not supported " 1701 "(no available bits in reg_c[0])"); 1702 err = ENOTSUP; 1703 goto error; 1704 } 1705 usable_bits = rte_popcount32(priv->sh->dv_regc0_mask); 1706 required_bits = rte_popcount32(priv->vport_meta_mask); 1707 if (usable_bits < required_bits) { 1708 DRV_LOG(ERR, "Not enough bits available in reg_c[0] to provide " 1709 "representor matching."); 1710 err = ENOTSUP; 1711 goto error; 1712 } 1713 } 1714 if (priv->vport_meta_mask) 1715 flow_hw_set_port_info(eth_dev); 1716 if (priv->sh->config.dv_esw_en && 1717 priv->sh->config.dv_xmeta_en != MLX5_XMETA_MODE_LEGACY && 1718 priv->sh->config.dv_xmeta_en != MLX5_XMETA_MODE_META32_HWS) { 1719 DRV_LOG(ERR, 1720 "metadata mode %u is not supported in HWS eswitch mode", 1721 priv->sh->config.dv_xmeta_en); 1722 err = ENOTSUP; 1723 goto error; 1724 } 1725 if (priv->sh->config.dv_esw_en && 1726 flow_hw_create_vport_action(eth_dev)) { 1727 DRV_LOG(ERR, "port %u failed to create vport action", 1728 eth_dev->data->port_id); 1729 err = EINVAL; 1730 goto error; 1731 } 1732 /* 1733 * If representor matching is disabled, PMD cannot create default flow rules 1734 * to receive traffic for all ports, since implicit source port match is not added. 1735 * Isolated mode is forced. 1736 */ 1737 if (priv->sh->config.dv_esw_en && !priv->sh->config.repr_matching) { 1738 err = mlx5_flow_isolate(eth_dev, 1, NULL); 1739 if (err < 0) { 1740 err = -err; 1741 goto error; 1742 } 1743 DRV_LOG(WARNING, "port %u ingress traffic is restricted to defined " 1744 "flow rules (isolated mode) since representor " 1745 "matching is disabled", 1746 eth_dev->data->port_id); 1747 } 1748 eth_dev->data->dev_flags |= RTE_ETH_DEV_FLOW_OPS_THREAD_SAFE; 1749 return eth_dev; 1750 #else 1751 DRV_LOG(ERR, "DV support is missing for HWS."); 1752 goto error; 1753 #endif 1754 } 1755 if (!priv->sh->flow_priority_check_flag) { 1756 /* Supported Verbs flow priority number detection. */ 1757 err = mlx5_flow_discover_priorities(eth_dev); 1758 priv->sh->flow_max_priority = err; 1759 priv->sh->flow_priority_check_flag = 1; 1760 } else { 1761 err = priv->sh->flow_max_priority; 1762 } 1763 if (err < 0) { 1764 err = -err; 1765 goto error; 1766 } 1767 rte_spinlock_init(&priv->shared_act_sl); 1768 mlx5_flow_counter_mode_config(eth_dev); 1769 mlx5_flow_drop_action_config(eth_dev); 1770 if (sh->config.dv_flow_en) 1771 eth_dev->data->dev_flags |= RTE_ETH_DEV_FLOW_OPS_THREAD_SAFE; 1772 return eth_dev; 1773 error: 1774 if (priv) { 1775 priv->sh->port[priv->dev_port - 1].nl_ih_port_id = 1776 RTE_MAX_ETHPORTS; 1777 rte_io_wmb(); 1778 #ifdef HAVE_MLX5_HWS_SUPPORT 1779 if (eth_dev && 1780 priv->sh && 1781 priv->sh->config.dv_flow_en == 2 && 1782 priv->sh->config.dv_esw_en) 1783 flow_hw_destroy_vport_action(eth_dev); 1784 #endif 1785 if (priv->sh) 1786 mlx5_os_free_shared_dr(priv); 1787 if (priv->nl_socket_route >= 0) 1788 close(priv->nl_socket_route); 1789 if (priv->vmwa_context) 1790 mlx5_vlan_vmwa_exit(priv->vmwa_context); 1791 if (eth_dev && priv->drop_queue.hrxq) 1792 mlx5_drop_action_destroy(eth_dev); 1793 if (priv->mtr_profile_tbl) 1794 mlx5_l3t_destroy(priv->mtr_profile_tbl); 1795 if (own_domain_id) 1796 claim_zero(rte_eth_switch_domain_free(priv->domain_id)); 1797 if (priv->hrxqs) 1798 mlx5_list_destroy(priv->hrxqs); 1799 if (eth_dev && priv->flex_item_map) 1800 mlx5_flex_item_port_cleanup(eth_dev); 1801 mlx5_free(priv->ext_rxqs); 1802 mlx5_free(priv->ext_txqs); 1803 mlx5_free(priv); 1804 if (eth_dev != NULL) 1805 eth_dev->data->dev_private = NULL; 1806 } 1807 if (eth_dev != NULL) { 1808 /* mac_addrs must not be freed alone because part of 1809 * dev_private 1810 **/ 1811 eth_dev->data->mac_addrs = NULL; 1812 rte_eth_dev_release_port(eth_dev); 1813 } 1814 if (sh) 1815 mlx5_free_shared_dev_ctx(sh); 1816 if (nl_rdma >= 0) 1817 close(nl_rdma); 1818 MLX5_ASSERT(err > 0); 1819 rte_errno = err; 1820 return NULL; 1821 } 1822 1823 /** 1824 * Comparison callback to sort device data. 1825 * 1826 * This is meant to be used with qsort(). 1827 * 1828 * @param a[in] 1829 * Pointer to pointer to first data object. 1830 * @param b[in] 1831 * Pointer to pointer to second data object. 1832 * 1833 * @return 1834 * 0 if both objects are equal, less than 0 if the first argument is less 1835 * than the second, greater than 0 otherwise. 1836 */ 1837 static int 1838 mlx5_dev_spawn_data_cmp(const void *a, const void *b) 1839 { 1840 const struct mlx5_switch_info *si_a = 1841 &((const struct mlx5_dev_spawn_data *)a)->info; 1842 const struct mlx5_switch_info *si_b = 1843 &((const struct mlx5_dev_spawn_data *)b)->info; 1844 int uplink_a = si_a->name_type == MLX5_PHYS_PORT_NAME_TYPE_UPLINK; 1845 int uplink_b = si_b->name_type == MLX5_PHYS_PORT_NAME_TYPE_UPLINK; 1846 int ret; 1847 1848 /* Uplink ports first. */ 1849 ret = uplink_b - uplink_a; 1850 if (ret) 1851 return ret; 1852 /* Then master devices. */ 1853 ret = si_b->master - si_a->master; 1854 if (ret) 1855 return ret; 1856 /* Then representor devices. */ 1857 ret = si_b->representor - si_a->representor; 1858 if (ret) 1859 return ret; 1860 /* Unidentified devices come last in no specific order. */ 1861 if (!si_a->representor) 1862 return 0; 1863 /* Order representors by name. */ 1864 return si_a->port_name - si_b->port_name; 1865 } 1866 1867 /** 1868 * Match PCI information for possible slaves of bonding device. 1869 * 1870 * @param[in] ibdev_name 1871 * Name of Infiniband device. 1872 * @param[in] pci_dev 1873 * Pointer to primary PCI address structure to match. 1874 * @param[in] nl_rdma 1875 * Netlink RDMA group socket handle. 1876 * @param[in] owner 1877 * Representor owner PF index. 1878 * @param[out] bond_info 1879 * Pointer to bonding information. 1880 * 1881 * @return 1882 * negative value if no bonding device found, otherwise 1883 * positive index of slave PF in bonding. 1884 */ 1885 static int 1886 mlx5_device_bond_pci_match(const char *ibdev_name, 1887 const struct rte_pci_addr *pci_dev, 1888 int nl_rdma, uint16_t owner, 1889 struct mlx5_bond_info *bond_info) 1890 { 1891 char ifname[IF_NAMESIZE + 1]; 1892 unsigned int ifindex; 1893 unsigned int np, i; 1894 FILE *bond_file = NULL, *file; 1895 int pf = -1; 1896 int ret; 1897 uint8_t cur_guid[32] = {0}; 1898 uint8_t guid[32] = {0}; 1899 1900 /* 1901 * Try to get master device name. If something goes wrong suppose 1902 * the lack of kernel support and no bonding devices. 1903 */ 1904 memset(bond_info, 0, sizeof(*bond_info)); 1905 if (nl_rdma < 0) 1906 return -1; 1907 if (!strstr(ibdev_name, "bond")) 1908 return -1; 1909 np = mlx5_nl_portnum(nl_rdma, ibdev_name); 1910 if (!np) 1911 return -1; 1912 if (mlx5_get_device_guid(pci_dev, cur_guid, sizeof(cur_guid)) < 0) 1913 return -1; 1914 /* 1915 * The master device might not be on the predefined port(not on port 1916 * index 1, it is not guaranteed), we have to scan all Infiniband 1917 * device ports and find master. 1918 */ 1919 for (i = 1; i <= np; ++i) { 1920 /* Check whether Infiniband port is populated. */ 1921 ifindex = mlx5_nl_ifindex(nl_rdma, ibdev_name, i); 1922 if (!ifindex) 1923 continue; 1924 if (!if_indextoname(ifindex, ifname)) 1925 continue; 1926 /* Try to read bonding slave names from sysfs. */ 1927 MKSTR(slaves, 1928 "/sys/class/net/%s/master/bonding/slaves", ifname); 1929 bond_file = fopen(slaves, "r"); 1930 if (bond_file) 1931 break; 1932 } 1933 if (!bond_file) 1934 return -1; 1935 /* Use safe format to check maximal buffer length. */ 1936 MLX5_ASSERT(atol(RTE_STR(IF_NAMESIZE)) == IF_NAMESIZE); 1937 while (fscanf(bond_file, "%" RTE_STR(IF_NAMESIZE) "s", ifname) == 1) { 1938 char tmp_str[IF_NAMESIZE + 32]; 1939 struct rte_pci_addr pci_addr; 1940 struct mlx5_switch_info info; 1941 int ret; 1942 1943 /* Process slave interface names in the loop. */ 1944 snprintf(tmp_str, sizeof(tmp_str), 1945 "/sys/class/net/%s", ifname); 1946 if (mlx5_get_pci_addr(tmp_str, &pci_addr)) { 1947 DRV_LOG(WARNING, 1948 "Cannot get PCI address for netdev \"%s\".", 1949 ifname); 1950 continue; 1951 } 1952 /* Slave interface PCI address match found. */ 1953 snprintf(tmp_str, sizeof(tmp_str), 1954 "/sys/class/net/%s/phys_port_name", ifname); 1955 file = fopen(tmp_str, "rb"); 1956 if (!file) 1957 break; 1958 info.name_type = MLX5_PHYS_PORT_NAME_TYPE_NOTSET; 1959 if (fscanf(file, "%32s", tmp_str) == 1) 1960 mlx5_translate_port_name(tmp_str, &info); 1961 fclose(file); 1962 /* Only process PF ports. */ 1963 if (info.name_type != MLX5_PHYS_PORT_NAME_TYPE_LEGACY && 1964 info.name_type != MLX5_PHYS_PORT_NAME_TYPE_UPLINK) 1965 continue; 1966 /* Check max bonding member. */ 1967 if (info.port_name >= MLX5_BOND_MAX_PORTS) { 1968 DRV_LOG(WARNING, "bonding index out of range, " 1969 "please increase MLX5_BOND_MAX_PORTS: %s", 1970 tmp_str); 1971 break; 1972 } 1973 /* Get ifindex. */ 1974 snprintf(tmp_str, sizeof(tmp_str), 1975 "/sys/class/net/%s/ifindex", ifname); 1976 file = fopen(tmp_str, "rb"); 1977 if (!file) 1978 break; 1979 ret = fscanf(file, "%u", &ifindex); 1980 fclose(file); 1981 if (ret != 1) 1982 break; 1983 /* Save bonding info. */ 1984 strncpy(bond_info->ports[info.port_name].ifname, ifname, 1985 sizeof(bond_info->ports[0].ifname)); 1986 bond_info->ports[info.port_name].pci_addr = pci_addr; 1987 bond_info->ports[info.port_name].ifindex = ifindex; 1988 bond_info->n_port++; 1989 /* 1990 * Under socket direct mode, bonding will use 1991 * system_image_guid as identification. 1992 * After OFED 5.4, guid is readable (ret >= 0) under sysfs. 1993 * All bonding members should have the same guid even if driver 1994 * is using PCIe BDF. 1995 */ 1996 ret = mlx5_get_device_guid(&pci_addr, guid, sizeof(guid)); 1997 if (ret < 0) 1998 break; 1999 else if (ret > 0) { 2000 if (!memcmp(guid, cur_guid, sizeof(guid)) && 2001 owner == info.port_name && 2002 (owner != 0 || (owner == 0 && 2003 !rte_pci_addr_cmp(pci_dev, &pci_addr)))) 2004 pf = info.port_name; 2005 } else if (pci_dev->domain == pci_addr.domain && 2006 pci_dev->bus == pci_addr.bus && 2007 pci_dev->devid == pci_addr.devid && 2008 ((pci_dev->function == 0 && 2009 pci_dev->function + owner == pci_addr.function) || 2010 (pci_dev->function == owner && 2011 pci_addr.function == owner))) 2012 pf = info.port_name; 2013 } 2014 if (pf >= 0) { 2015 /* Get bond interface info */ 2016 ret = mlx5_sysfs_bond_info(ifindex, &bond_info->ifindex, 2017 bond_info->ifname); 2018 if (ret) 2019 DRV_LOG(ERR, "unable to get bond info: %s", 2020 strerror(rte_errno)); 2021 else 2022 DRV_LOG(INFO, "PF device %u, bond device %u(%s)", 2023 ifindex, bond_info->ifindex, bond_info->ifname); 2024 } 2025 if (owner == 0 && pf != 0) { 2026 DRV_LOG(INFO, "PCIe instance " PCI_PRI_FMT " isn't bonding owner", 2027 pci_dev->domain, pci_dev->bus, pci_dev->devid, 2028 pci_dev->function); 2029 } 2030 return pf; 2031 } 2032 2033 static int 2034 mlx5_nl_esw_multiport_get(struct rte_pci_addr *pci_addr, int *enabled) 2035 { 2036 char pci_addr_str[PCI_PRI_STR_SIZE] = { 0 }; 2037 int nlsk_fd; 2038 int devlink_id; 2039 int ret; 2040 2041 /* Provide correct value to have defined enabled state in case of an error. */ 2042 *enabled = 0; 2043 rte_pci_device_name(pci_addr, pci_addr_str, sizeof(pci_addr_str)); 2044 nlsk_fd = mlx5_nl_init(NETLINK_GENERIC, 0); 2045 if (nlsk_fd < 0) 2046 return nlsk_fd; 2047 devlink_id = mlx5_nl_devlink_family_id_get(nlsk_fd); 2048 if (devlink_id < 0) { 2049 ret = devlink_id; 2050 DRV_LOG(DEBUG, "Unable to get devlink family id for Multiport E-Switch checks " 2051 "by netlink, for PCI device %s", pci_addr_str); 2052 goto close_nlsk_fd; 2053 } 2054 ret = mlx5_nl_devlink_esw_multiport_get(nlsk_fd, devlink_id, pci_addr_str, enabled); 2055 if (ret < 0) 2056 DRV_LOG(DEBUG, "Unable to get Multiport E-Switch state by Netlink."); 2057 close_nlsk_fd: 2058 close(nlsk_fd); 2059 return ret; 2060 } 2061 2062 #define SYSFS_MPESW_PARAM_MAX_LEN 16 2063 2064 static int 2065 mlx5_sysfs_esw_multiport_get(struct ibv_device *ibv, struct rte_pci_addr *pci_addr, int *enabled) 2066 { 2067 int nl_rdma; 2068 unsigned int n_ports; 2069 unsigned int i; 2070 int ret; 2071 2072 /* Provide correct value to have defined enabled state in case of an error. */ 2073 *enabled = 0; 2074 nl_rdma = mlx5_nl_init(NETLINK_RDMA, 0); 2075 if (nl_rdma < 0) 2076 return nl_rdma; 2077 n_ports = mlx5_nl_portnum(nl_rdma, ibv->name); 2078 if (!n_ports) { 2079 ret = -rte_errno; 2080 goto close_nl_rdma; 2081 } 2082 for (i = 1; i <= n_ports; ++i) { 2083 unsigned int ifindex; 2084 char ifname[IF_NAMESIZE + 1]; 2085 struct rte_pci_addr if_pci_addr; 2086 char mpesw[SYSFS_MPESW_PARAM_MAX_LEN + 1]; 2087 FILE *sysfs; 2088 int n; 2089 2090 ifindex = mlx5_nl_ifindex(nl_rdma, ibv->name, i); 2091 if (!ifindex) 2092 continue; 2093 if (!if_indextoname(ifindex, ifname)) 2094 continue; 2095 MKSTR(sysfs_if_path, "/sys/class/net/%s", ifname); 2096 if (mlx5_get_pci_addr(sysfs_if_path, &if_pci_addr)) 2097 continue; 2098 if (pci_addr->domain != if_pci_addr.domain || 2099 pci_addr->bus != if_pci_addr.bus || 2100 pci_addr->devid != if_pci_addr.devid || 2101 pci_addr->function != if_pci_addr.function) 2102 continue; 2103 MKSTR(sysfs_mpesw_path, 2104 "/sys/class/net/%s/compat/devlink/lag_port_select_mode", ifname); 2105 sysfs = fopen(sysfs_mpesw_path, "r"); 2106 if (!sysfs) 2107 continue; 2108 n = fscanf(sysfs, "%" RTE_STR(SYSFS_MPESW_PARAM_MAX_LEN) "s", mpesw); 2109 fclose(sysfs); 2110 if (n != 1) 2111 continue; 2112 ret = 0; 2113 if (strcmp(mpesw, "multiport_esw") == 0) { 2114 *enabled = 1; 2115 break; 2116 } 2117 *enabled = 0; 2118 break; 2119 } 2120 if (i > n_ports) { 2121 DRV_LOG(DEBUG, "Unable to get Multiport E-Switch state by sysfs."); 2122 rte_errno = ENOENT; 2123 ret = -rte_errno; 2124 } 2125 2126 close_nl_rdma: 2127 close(nl_rdma); 2128 return ret; 2129 } 2130 2131 static int 2132 mlx5_is_mpesw_enabled(struct ibv_device *ibv, struct rte_pci_addr *ibv_pci_addr, int *enabled) 2133 { 2134 /* 2135 * Try getting Multiport E-Switch state through netlink interface 2136 * If unable, try sysfs interface. If that is unable as well, 2137 * assume that Multiport E-Switch is disabled and return an error. 2138 */ 2139 if (mlx5_nl_esw_multiport_get(ibv_pci_addr, enabled) >= 0 || 2140 mlx5_sysfs_esw_multiport_get(ibv, ibv_pci_addr, enabled) >= 0) 2141 return 0; 2142 DRV_LOG(DEBUG, "Unable to check MPESW state for IB device %s " 2143 "(PCI: " PCI_PRI_FMT ")", 2144 ibv->name, 2145 ibv_pci_addr->domain, ibv_pci_addr->bus, 2146 ibv_pci_addr->devid, ibv_pci_addr->function); 2147 *enabled = 0; 2148 return -rte_errno; 2149 } 2150 2151 static int 2152 mlx5_device_mpesw_pci_match(struct ibv_device *ibv, 2153 const struct rte_pci_addr *owner_pci, 2154 int nl_rdma) 2155 { 2156 struct rte_pci_addr ibdev_pci_addr = { 0 }; 2157 char ifname[IF_NAMESIZE + 1] = { 0 }; 2158 unsigned int ifindex; 2159 unsigned int np; 2160 unsigned int i; 2161 int enabled = 0; 2162 int ret; 2163 2164 /* Check if IB device's PCI address matches the probed PCI address. */ 2165 if (mlx5_get_pci_addr(ibv->ibdev_path, &ibdev_pci_addr)) { 2166 DRV_LOG(DEBUG, "Skipping MPESW check for IB device %s since " 2167 "there is no underlying PCI device", ibv->name); 2168 rte_errno = ENOENT; 2169 return -rte_errno; 2170 } 2171 if (ibdev_pci_addr.domain != owner_pci->domain || 2172 ibdev_pci_addr.bus != owner_pci->bus || 2173 ibdev_pci_addr.devid != owner_pci->devid || 2174 ibdev_pci_addr.function != owner_pci->function) { 2175 return -1; 2176 } 2177 /* Check if IB device has MPESW enabled. */ 2178 if (mlx5_is_mpesw_enabled(ibv, &ibdev_pci_addr, &enabled)) 2179 return -1; 2180 if (!enabled) 2181 return -1; 2182 /* Iterate through IB ports to find MPESW master uplink port. */ 2183 if (nl_rdma < 0) 2184 return -1; 2185 np = mlx5_nl_portnum(nl_rdma, ibv->name); 2186 if (!np) 2187 return -1; 2188 for (i = 1; i <= np; ++i) { 2189 struct rte_pci_addr pci_addr; 2190 FILE *file; 2191 char port_name[IF_NAMESIZE + 1]; 2192 struct mlx5_switch_info info; 2193 2194 /* Check whether IB port has a corresponding netdev. */ 2195 ifindex = mlx5_nl_ifindex(nl_rdma, ibv->name, i); 2196 if (!ifindex) 2197 continue; 2198 if (!if_indextoname(ifindex, ifname)) 2199 continue; 2200 /* Read port name and determine its type. */ 2201 MKSTR(ifphysportname, "/sys/class/net/%s/phys_port_name", ifname); 2202 file = fopen(ifphysportname, "rb"); 2203 if (!file) 2204 continue; 2205 ret = fscanf(file, "%16s", port_name); 2206 fclose(file); 2207 if (ret != 1) 2208 continue; 2209 memset(&info, 0, sizeof(info)); 2210 mlx5_translate_port_name(port_name, &info); 2211 if (info.name_type != MLX5_PHYS_PORT_NAME_TYPE_UPLINK) 2212 continue; 2213 /* Fetch PCI address of the device to which the netdev is bound. */ 2214 MKSTR(ifpath, "/sys/class/net/%s", ifname); 2215 if (mlx5_get_pci_addr(ifpath, &pci_addr)) 2216 continue; 2217 if (pci_addr.domain == ibdev_pci_addr.domain && 2218 pci_addr.bus == ibdev_pci_addr.bus && 2219 pci_addr.devid == ibdev_pci_addr.devid && 2220 pci_addr.function == ibdev_pci_addr.function) { 2221 MLX5_ASSERT(info.port_name >= 0); 2222 return info.port_name; 2223 } 2224 } 2225 /* No matching MPESW uplink port was found. */ 2226 return -1; 2227 } 2228 2229 /** 2230 * Register a PCI device within bonding. 2231 * 2232 * This function spawns Ethernet devices out of a given PCI device and 2233 * bonding owner PF index. 2234 * 2235 * @param[in] cdev 2236 * Pointer to common mlx5 device structure. 2237 * @param[in] req_eth_da 2238 * Requested ethdev device argument. 2239 * @param[in] owner_id 2240 * Requested owner PF port ID within bonding device, default to 0. 2241 * @param[in, out] mkvlist 2242 * Pointer to mlx5 kvargs control, can be NULL if there is no devargs. 2243 * 2244 * @return 2245 * 0 on success, a negative errno value otherwise and rte_errno is set. 2246 */ 2247 static int 2248 mlx5_os_pci_probe_pf(struct mlx5_common_device *cdev, 2249 struct rte_eth_devargs *req_eth_da, 2250 uint16_t owner_id, struct mlx5_kvargs_ctrl *mkvlist) 2251 { 2252 struct ibv_device **ibv_list; 2253 /* 2254 * Number of found IB Devices matching with requested PCI BDF. 2255 * nd != 1 means there are multiple IB devices over the same 2256 * PCI device and we have representors and master. 2257 */ 2258 unsigned int nd = 0; 2259 /* 2260 * Number of found IB device Ports. nd = 1 and np = 1..n means 2261 * we have the single multiport IB device, and there may be 2262 * representors attached to some of found ports. 2263 */ 2264 unsigned int np = 0; 2265 /* 2266 * Number of DPDK ethernet devices to Spawn - either over 2267 * multiple IB devices or multiple ports of single IB device. 2268 * Actually this is the number of iterations to spawn. 2269 */ 2270 unsigned int ns = 0; 2271 /* 2272 * Bonding device 2273 * < 0 - no bonding device (single one) 2274 * >= 0 - bonding device (value is slave PF index) 2275 */ 2276 int bd = -1; 2277 /* 2278 * Multiport E-Switch (MPESW) device: 2279 * < 0 - no MPESW device or could not determine if it is MPESW device, 2280 * >= 0 - MPESW device. Value is the port index of the MPESW owner. 2281 */ 2282 int mpesw = MLX5_MPESW_PORT_INVALID; 2283 struct rte_pci_device *pci_dev = RTE_DEV_TO_PCI(cdev->dev); 2284 struct mlx5_dev_spawn_data *list = NULL; 2285 struct rte_eth_devargs eth_da = *req_eth_da; 2286 struct rte_pci_addr owner_pci = pci_dev->addr; /* Owner PF. */ 2287 struct mlx5_bond_info bond_info; 2288 int ret = -1; 2289 2290 errno = 0; 2291 ibv_list = mlx5_glue->get_device_list(&ret); 2292 if (!ibv_list) { 2293 rte_errno = errno ? errno : ENOSYS; 2294 DRV_LOG(ERR, "Cannot list devices, is ib_uverbs loaded?"); 2295 return -rte_errno; 2296 } 2297 /* 2298 * First scan the list of all Infiniband devices to find 2299 * matching ones, gathering into the list. 2300 */ 2301 struct ibv_device *ibv_match[ret + 1]; 2302 int nl_route = mlx5_nl_init(NETLINK_ROUTE, 0); 2303 int nl_rdma = mlx5_nl_init(NETLINK_RDMA, 0); 2304 unsigned int i; 2305 2306 while (ret-- > 0) { 2307 struct rte_pci_addr pci_addr; 2308 2309 DRV_LOG(DEBUG, "Checking device \"%s\"", ibv_list[ret]->name); 2310 bd = mlx5_device_bond_pci_match(ibv_list[ret]->name, &owner_pci, 2311 nl_rdma, owner_id, &bond_info); 2312 if (bd >= 0) { 2313 /* 2314 * Bonding device detected. Only one match is allowed, 2315 * the bonding is supported over multi-port IB device, 2316 * there should be no matches on representor PCI 2317 * functions or non VF LAG bonding devices with 2318 * specified address. 2319 */ 2320 if (nd) { 2321 DRV_LOG(ERR, 2322 "multiple PCI match on bonding device" 2323 "\"%s\" found", ibv_list[ret]->name); 2324 rte_errno = ENOENT; 2325 ret = -rte_errno; 2326 goto exit; 2327 } 2328 /* Amend owner pci address if owner PF ID specified. */ 2329 if (eth_da.nb_representor_ports) 2330 owner_pci.function += owner_id; 2331 DRV_LOG(INFO, 2332 "PCI information matches for slave %d bonding device \"%s\"", 2333 bd, ibv_list[ret]->name); 2334 ibv_match[nd++] = ibv_list[ret]; 2335 break; 2336 } 2337 mpesw = mlx5_device_mpesw_pci_match(ibv_list[ret], &owner_pci, nl_rdma); 2338 if (mpesw >= 0) { 2339 /* 2340 * MPESW device detected. Only one matching IB device is allowed, 2341 * so if any matches were found previously, fail gracefully. 2342 */ 2343 if (nd) { 2344 DRV_LOG(ERR, 2345 "PCI information matches MPESW device \"%s\", " 2346 "but multiple matching PCI devices were found. " 2347 "Probing failed.", 2348 ibv_list[ret]->name); 2349 rte_errno = ENOENT; 2350 ret = -rte_errno; 2351 goto exit; 2352 } 2353 DRV_LOG(INFO, 2354 "PCI information matches MPESW device \"%s\"", 2355 ibv_list[ret]->name); 2356 ibv_match[nd++] = ibv_list[ret]; 2357 break; 2358 } 2359 /* Bonding or MPESW device was not found. */ 2360 if (mlx5_get_pci_addr(ibv_list[ret]->ibdev_path, 2361 &pci_addr)) 2362 continue; 2363 if (rte_pci_addr_cmp(&owner_pci, &pci_addr) != 0) 2364 continue; 2365 DRV_LOG(INFO, "PCI information matches for device \"%s\"", 2366 ibv_list[ret]->name); 2367 ibv_match[nd++] = ibv_list[ret]; 2368 } 2369 ibv_match[nd] = NULL; 2370 if (!nd) { 2371 /* No device matches, just complain and bail out. */ 2372 DRV_LOG(WARNING, 2373 "PF %u doesn't have Verbs device matches PCI device " PCI_PRI_FMT "," 2374 " are kernel drivers loaded?", 2375 owner_id, owner_pci.domain, owner_pci.bus, 2376 owner_pci.devid, owner_pci.function); 2377 rte_errno = ENOENT; 2378 ret = -rte_errno; 2379 goto exit; 2380 } 2381 if (nd == 1) { 2382 /* 2383 * Found single matching device may have multiple ports. 2384 * Each port may be representor, we have to check the port 2385 * number and check the representors existence. 2386 */ 2387 if (nl_rdma >= 0) 2388 np = mlx5_nl_portnum(nl_rdma, ibv_match[0]->name); 2389 if (!np) 2390 DRV_LOG(WARNING, 2391 "Cannot get IB device \"%s\" ports number.", 2392 ibv_match[0]->name); 2393 if (bd >= 0 && !np) { 2394 DRV_LOG(ERR, "Cannot get ports for bonding device."); 2395 rte_errno = ENOENT; 2396 ret = -rte_errno; 2397 goto exit; 2398 } 2399 if (mpesw >= 0 && !np) { 2400 DRV_LOG(ERR, "Cannot get ports for MPESW device."); 2401 rte_errno = ENOENT; 2402 ret = -rte_errno; 2403 goto exit; 2404 } 2405 } 2406 /* Now we can determine the maximal amount of devices to be spawned. */ 2407 list = mlx5_malloc(MLX5_MEM_ZERO, 2408 sizeof(struct mlx5_dev_spawn_data) * (np ? np : nd), 2409 RTE_CACHE_LINE_SIZE, SOCKET_ID_ANY); 2410 if (!list) { 2411 DRV_LOG(ERR, "Spawn data array allocation failure."); 2412 rte_errno = ENOMEM; 2413 ret = -rte_errno; 2414 goto exit; 2415 } 2416 if (bd >= 0 || mpesw >= 0 || np > 1) { 2417 /* 2418 * Single IB device with multiple ports found, 2419 * it may be E-Switch master device and representors. 2420 * We have to perform identification through the ports. 2421 */ 2422 MLX5_ASSERT(nl_rdma >= 0); 2423 MLX5_ASSERT(ns == 0); 2424 MLX5_ASSERT(nd == 1); 2425 MLX5_ASSERT(np); 2426 for (i = 1; i <= np; ++i) { 2427 list[ns].bond_info = &bond_info; 2428 list[ns].max_port = np; 2429 list[ns].phys_port = i; 2430 list[ns].phys_dev_name = ibv_match[0]->name; 2431 list[ns].eth_dev = NULL; 2432 list[ns].pci_dev = pci_dev; 2433 list[ns].cdev = cdev; 2434 list[ns].pf_bond = bd; 2435 list[ns].mpesw_port = MLX5_MPESW_PORT_INVALID; 2436 list[ns].ifindex = mlx5_nl_ifindex(nl_rdma, 2437 ibv_match[0]->name, 2438 i); 2439 if (!list[ns].ifindex) { 2440 /* 2441 * No network interface index found for the 2442 * specified port, it means there is no 2443 * representor on this port. It's OK, 2444 * there can be disabled ports, for example 2445 * if sriov_numvfs < sriov_totalvfs. 2446 */ 2447 continue; 2448 } 2449 ret = -1; 2450 if (nl_route >= 0) 2451 ret = mlx5_nl_switch_info(nl_route, 2452 list[ns].ifindex, 2453 &list[ns].info); 2454 if (ret || (!list[ns].info.representor && 2455 !list[ns].info.master)) { 2456 /* 2457 * We failed to recognize representors with 2458 * Netlink, let's try to perform the task 2459 * with sysfs. 2460 */ 2461 ret = mlx5_sysfs_switch_info(list[ns].ifindex, 2462 &list[ns].info); 2463 } 2464 if (!ret && bd >= 0) { 2465 switch (list[ns].info.name_type) { 2466 case MLX5_PHYS_PORT_NAME_TYPE_UPLINK: 2467 if (np == 1) { 2468 /* 2469 * Force standalone bonding 2470 * device for ROCE LAG 2471 * configurations. 2472 */ 2473 list[ns].info.master = 0; 2474 list[ns].info.representor = 0; 2475 } 2476 ns++; 2477 break; 2478 case MLX5_PHYS_PORT_NAME_TYPE_PFHPF: 2479 /* Fallthrough */ 2480 case MLX5_PHYS_PORT_NAME_TYPE_PFVF: 2481 /* Fallthrough */ 2482 case MLX5_PHYS_PORT_NAME_TYPE_PFSF: 2483 if (list[ns].info.pf_num == bd) 2484 ns++; 2485 break; 2486 default: 2487 break; 2488 } 2489 continue; 2490 } 2491 if (!ret && mpesw >= 0) { 2492 switch (list[ns].info.name_type) { 2493 case MLX5_PHYS_PORT_NAME_TYPE_UPLINK: 2494 /* Owner port is treated as master port. */ 2495 if (list[ns].info.port_name == mpesw) { 2496 list[ns].info.master = 1; 2497 list[ns].info.representor = 0; 2498 } else { 2499 list[ns].info.master = 0; 2500 list[ns].info.representor = 1; 2501 } 2502 /* 2503 * Ports of this type have uplink port index 2504 * encoded in the name. This index is also a PF index. 2505 */ 2506 list[ns].info.pf_num = list[ns].info.port_name; 2507 list[ns].mpesw_port = list[ns].info.port_name; 2508 list[ns].info.mpesw_owner = mpesw; 2509 ns++; 2510 break; 2511 case MLX5_PHYS_PORT_NAME_TYPE_PFHPF: 2512 case MLX5_PHYS_PORT_NAME_TYPE_PFVF: 2513 case MLX5_PHYS_PORT_NAME_TYPE_PFSF: 2514 /* Only spawn representors related to the probed PF. */ 2515 if (list[ns].info.pf_num == owner_id) { 2516 /* 2517 * Ports of this type have PF index encoded in name, 2518 * which translate to the related uplink port index. 2519 */ 2520 list[ns].mpesw_port = list[ns].info.pf_num; 2521 /* MPESW owner is also saved but not used now. */ 2522 list[ns].info.mpesw_owner = mpesw; 2523 ns++; 2524 } 2525 break; 2526 default: 2527 break; 2528 } 2529 continue; 2530 } 2531 if (!ret && (list[ns].info.representor ^ 2532 list[ns].info.master)) 2533 ns++; 2534 } 2535 if (!ns) { 2536 DRV_LOG(ERR, 2537 "Unable to recognize master/representors on the IB device with multiple ports."); 2538 rte_errno = ENOENT; 2539 ret = -rte_errno; 2540 goto exit; 2541 } 2542 } else { 2543 /* 2544 * The existence of several matching entries (nd > 1) means 2545 * port representors have been instantiated. No existing Verbs 2546 * call nor sysfs entries can tell them apart, this can only 2547 * be done through Netlink calls assuming kernel drivers are 2548 * recent enough to support them. 2549 * 2550 * In the event of identification failure through Netlink, 2551 * try again through sysfs, then: 2552 * 2553 * 1. A single IB device matches (nd == 1) with single 2554 * port (np=0/1) and is not a representor, assume 2555 * no switch support. 2556 * 2557 * 2. Otherwise no safe assumptions can be made; 2558 * complain louder and bail out. 2559 */ 2560 for (i = 0; i != nd; ++i) { 2561 memset(&list[ns].info, 0, sizeof(list[ns].info)); 2562 list[ns].bond_info = NULL; 2563 list[ns].max_port = 1; 2564 list[ns].phys_port = 1; 2565 list[ns].phys_dev_name = ibv_match[i]->name; 2566 list[ns].eth_dev = NULL; 2567 list[ns].pci_dev = pci_dev; 2568 list[ns].cdev = cdev; 2569 list[ns].pf_bond = -1; 2570 list[ns].mpesw_port = MLX5_MPESW_PORT_INVALID; 2571 list[ns].ifindex = 0; 2572 if (nl_rdma >= 0) 2573 list[ns].ifindex = mlx5_nl_ifindex 2574 (nl_rdma, 2575 ibv_match[i]->name, 2576 1); 2577 if (!list[ns].ifindex) { 2578 char ifname[IF_NAMESIZE]; 2579 2580 /* 2581 * Netlink failed, it may happen with old 2582 * ib_core kernel driver (before 4.16). 2583 * We can assume there is old driver because 2584 * here we are processing single ports IB 2585 * devices. Let's try sysfs to retrieve 2586 * the ifindex. The method works for 2587 * master device only. 2588 */ 2589 if (nd > 1) { 2590 /* 2591 * Multiple devices found, assume 2592 * representors, can not distinguish 2593 * master/representor and retrieve 2594 * ifindex via sysfs. 2595 */ 2596 continue; 2597 } 2598 ret = mlx5_get_ifname_sysfs 2599 (ibv_match[i]->ibdev_path, ifname); 2600 if (!ret) 2601 list[ns].ifindex = 2602 if_nametoindex(ifname); 2603 if (!list[ns].ifindex) { 2604 /* 2605 * No network interface index found 2606 * for the specified device, it means 2607 * there it is neither representor 2608 * nor master. 2609 */ 2610 continue; 2611 } 2612 } 2613 ret = -1; 2614 if (nl_route >= 0) 2615 ret = mlx5_nl_switch_info(nl_route, 2616 list[ns].ifindex, 2617 &list[ns].info); 2618 if (ret || (!list[ns].info.representor && 2619 !list[ns].info.master)) { 2620 /* 2621 * We failed to recognize representors with 2622 * Netlink, let's try to perform the task 2623 * with sysfs. 2624 */ 2625 ret = mlx5_sysfs_switch_info(list[ns].ifindex, 2626 &list[ns].info); 2627 } 2628 if (!ret && (list[ns].info.representor ^ 2629 list[ns].info.master)) { 2630 ns++; 2631 } else if ((nd == 1) && 2632 !list[ns].info.representor && 2633 !list[ns].info.master) { 2634 /* 2635 * Single IB device with one physical port and 2636 * attached network device. 2637 * May be SRIOV is not enabled or there is no 2638 * representors. 2639 */ 2640 DRV_LOG(INFO, "No E-Switch support detected."); 2641 ns++; 2642 break; 2643 } 2644 } 2645 if (!ns) { 2646 DRV_LOG(ERR, 2647 "Unable to recognize master/representors on the multiple IB devices."); 2648 rte_errno = ENOENT; 2649 ret = -rte_errno; 2650 goto exit; 2651 } 2652 /* 2653 * New kernels may add the switch_id attribute for the case 2654 * there is no E-Switch and we wrongly recognized the only 2655 * device as master. Override this if there is the single 2656 * device with single port and new device name format present. 2657 */ 2658 if (nd == 1 && 2659 list[0].info.name_type == MLX5_PHYS_PORT_NAME_TYPE_UPLINK) { 2660 list[0].info.master = 0; 2661 list[0].info.representor = 0; 2662 } 2663 } 2664 MLX5_ASSERT(ns); 2665 /* 2666 * Sort list to probe devices in natural order for users convenience 2667 * (i.e. master first, then representors from lowest to highest ID). 2668 */ 2669 qsort(list, ns, sizeof(*list), mlx5_dev_spawn_data_cmp); 2670 if (eth_da.type != RTE_ETH_REPRESENTOR_NONE) { 2671 /* Set devargs default values. */ 2672 if (eth_da.nb_mh_controllers == 0) { 2673 eth_da.nb_mh_controllers = 1; 2674 eth_da.mh_controllers[0] = 0; 2675 } 2676 if (eth_da.nb_ports == 0 && ns > 0) { 2677 if (list[0].pf_bond >= 0 && list[0].info.representor) 2678 DRV_LOG(WARNING, "Representor on Bonding device should use pf#vf# syntax: %s", 2679 pci_dev->device.devargs->args); 2680 eth_da.nb_ports = 1; 2681 eth_da.ports[0] = list[0].info.pf_num; 2682 } 2683 if (eth_da.nb_representor_ports == 0) { 2684 eth_da.nb_representor_ports = 1; 2685 eth_da.representor_ports[0] = 0; 2686 } 2687 } 2688 for (i = 0; i != ns; ++i) { 2689 uint32_t restore; 2690 2691 list[i].eth_dev = mlx5_dev_spawn(cdev->dev, &list[i], ð_da, 2692 mkvlist); 2693 if (!list[i].eth_dev) { 2694 if (rte_errno != EBUSY && rte_errno != EEXIST) 2695 break; 2696 /* Device is disabled or already spawned. Ignore it. */ 2697 continue; 2698 } 2699 restore = list[i].eth_dev->data->dev_flags; 2700 rte_eth_copy_pci_info(list[i].eth_dev, pci_dev); 2701 /** 2702 * Each representor has a dedicated interrupts vector. 2703 * rte_eth_copy_pci_info() assigns PF interrupts handle to 2704 * representor eth_dev object because representor and PF 2705 * share the same PCI address. 2706 * Override representor device with a dedicated 2707 * interrupts handle here. 2708 * Representor interrupts handle is released in mlx5_dev_stop(). 2709 */ 2710 if (list[i].info.representor) { 2711 struct rte_intr_handle *intr_handle = 2712 rte_intr_instance_alloc(RTE_INTR_INSTANCE_F_SHARED); 2713 if (intr_handle == NULL) { 2714 DRV_LOG(ERR, 2715 "port %u failed to allocate memory for interrupt handler " 2716 "Rx interrupts will not be supported", 2717 i); 2718 rte_errno = ENOMEM; 2719 ret = -rte_errno; 2720 goto exit; 2721 } 2722 list[i].eth_dev->intr_handle = intr_handle; 2723 } 2724 /* Restore non-PCI flags cleared by the above call. */ 2725 list[i].eth_dev->data->dev_flags |= restore; 2726 rte_eth_dev_probing_finish(list[i].eth_dev); 2727 } 2728 if (i != ns) { 2729 DRV_LOG(ERR, 2730 "probe of PCI device " PCI_PRI_FMT " aborted after" 2731 " encountering an error: %s", 2732 owner_pci.domain, owner_pci.bus, 2733 owner_pci.devid, owner_pci.function, 2734 strerror(rte_errno)); 2735 ret = -rte_errno; 2736 /* Roll back. */ 2737 while (i--) { 2738 if (!list[i].eth_dev) 2739 continue; 2740 mlx5_dev_close(list[i].eth_dev); 2741 /* mac_addrs must not be freed because in dev_private */ 2742 list[i].eth_dev->data->mac_addrs = NULL; 2743 claim_zero(rte_eth_dev_release_port(list[i].eth_dev)); 2744 } 2745 /* Restore original error. */ 2746 rte_errno = -ret; 2747 } else { 2748 ret = 0; 2749 } 2750 exit: 2751 /* 2752 * Do the routine cleanup: 2753 * - close opened Netlink sockets 2754 * - free allocated spawn data array 2755 * - free the Infiniband device list 2756 */ 2757 if (nl_rdma >= 0) 2758 close(nl_rdma); 2759 if (nl_route >= 0) 2760 close(nl_route); 2761 if (list) 2762 mlx5_free(list); 2763 MLX5_ASSERT(ibv_list); 2764 mlx5_glue->free_device_list(ibv_list); 2765 return ret; 2766 } 2767 2768 static int 2769 mlx5_os_parse_eth_devargs(struct rte_device *dev, 2770 struct rte_eth_devargs *eth_da) 2771 { 2772 int ret = 0; 2773 2774 if (dev->devargs == NULL) 2775 return 0; 2776 memset(eth_da, 0, sizeof(*eth_da)); 2777 /* Parse representor information first from class argument. */ 2778 if (dev->devargs->cls_str) 2779 ret = rte_eth_devargs_parse(dev->devargs->cls_str, eth_da, 1); 2780 if (ret < 0) { 2781 DRV_LOG(ERR, "failed to parse device arguments: %s", 2782 dev->devargs->cls_str); 2783 return -rte_errno; 2784 } 2785 if (eth_da->type == RTE_ETH_REPRESENTOR_NONE && dev->devargs->args) { 2786 /* Parse legacy device argument */ 2787 ret = rte_eth_devargs_parse(dev->devargs->args, eth_da, 1); 2788 if (ret < 0) { 2789 DRV_LOG(ERR, "failed to parse device arguments: %s", 2790 dev->devargs->args); 2791 return -rte_errno; 2792 } 2793 } 2794 return 0; 2795 } 2796 2797 /** 2798 * Callback to register a PCI device. 2799 * 2800 * This function spawns Ethernet devices out of a given PCI device. 2801 * 2802 * @param[in] cdev 2803 * Pointer to common mlx5 device structure. 2804 * @param[in, out] mkvlist 2805 * Pointer to mlx5 kvargs control, can be NULL if there is no devargs. 2806 * 2807 * @return 2808 * 0 on success, a negative errno value otherwise and rte_errno is set. 2809 */ 2810 static int 2811 mlx5_os_pci_probe(struct mlx5_common_device *cdev, 2812 struct mlx5_kvargs_ctrl *mkvlist) 2813 { 2814 struct rte_pci_device *pci_dev = RTE_DEV_TO_PCI(cdev->dev); 2815 struct rte_eth_devargs eth_da = { .nb_ports = 0 }; 2816 int ret = 0; 2817 uint16_t p; 2818 2819 ret = mlx5_os_parse_eth_devargs(cdev->dev, ð_da); 2820 if (ret != 0) 2821 return ret; 2822 2823 if (eth_da.nb_ports > 0) { 2824 /* Iterate all port if devargs pf is range: "pf[0-1]vf[...]". */ 2825 for (p = 0; p < eth_da.nb_ports; p++) { 2826 ret = mlx5_os_pci_probe_pf(cdev, ð_da, 2827 eth_da.ports[p], mkvlist); 2828 if (ret) { 2829 DRV_LOG(INFO, "Probe of PCI device " PCI_PRI_FMT " " 2830 "aborted due to proding failure of PF %u", 2831 pci_dev->addr.domain, pci_dev->addr.bus, 2832 pci_dev->addr.devid, pci_dev->addr.function, 2833 eth_da.ports[p]); 2834 mlx5_net_remove(cdev); 2835 if (p != 0) 2836 break; 2837 } 2838 } 2839 } else { 2840 ret = mlx5_os_pci_probe_pf(cdev, ð_da, 0, mkvlist); 2841 } 2842 return ret; 2843 } 2844 2845 /* Probe a single SF device on auxiliary bus, no representor support. */ 2846 static int 2847 mlx5_os_auxiliary_probe(struct mlx5_common_device *cdev, 2848 struct mlx5_kvargs_ctrl *mkvlist) 2849 { 2850 struct rte_eth_devargs eth_da = { .nb_ports = 0 }; 2851 struct mlx5_dev_spawn_data spawn = { 2852 .pf_bond = -1, 2853 .mpesw_port = MLX5_MPESW_PORT_INVALID, 2854 }; 2855 struct rte_device *dev = cdev->dev; 2856 struct rte_auxiliary_device *adev = RTE_DEV_TO_AUXILIARY(dev); 2857 struct rte_eth_dev *eth_dev; 2858 int ret = 0; 2859 2860 /* Parse ethdev devargs. */ 2861 ret = mlx5_os_parse_eth_devargs(dev, ð_da); 2862 if (ret != 0) 2863 return ret; 2864 /* Init spawn data. */ 2865 spawn.max_port = 1; 2866 spawn.phys_port = 1; 2867 spawn.phys_dev_name = mlx5_os_get_ctx_device_name(cdev->ctx); 2868 ret = mlx5_auxiliary_get_ifindex(dev->name); 2869 if (ret < 0) { 2870 DRV_LOG(ERR, "failed to get ethdev ifindex: %s", dev->name); 2871 return ret; 2872 } 2873 spawn.ifindex = ret; 2874 spawn.cdev = cdev; 2875 /* Spawn device. */ 2876 eth_dev = mlx5_dev_spawn(dev, &spawn, ð_da, mkvlist); 2877 if (eth_dev == NULL) 2878 return -rte_errno; 2879 /* Post create. */ 2880 eth_dev->intr_handle = adev->intr_handle; 2881 if (rte_eal_process_type() == RTE_PROC_PRIMARY) { 2882 eth_dev->data->dev_flags |= RTE_ETH_DEV_INTR_LSC; 2883 eth_dev->data->dev_flags |= RTE_ETH_DEV_INTR_RMV; 2884 eth_dev->data->numa_node = dev->numa_node; 2885 } 2886 rte_eth_dev_probing_finish(eth_dev); 2887 return 0; 2888 } 2889 2890 /** 2891 * Net class driver callback to probe a device. 2892 * 2893 * This function probe PCI bus device(s) or a single SF on auxiliary bus. 2894 * 2895 * @param[in] cdev 2896 * Pointer to the common mlx5 device. 2897 * @param[in, out] mkvlist 2898 * Pointer to mlx5 kvargs control, can be NULL if there is no devargs. 2899 * 2900 * @return 2901 * 0 on success, a negative errno value otherwise and rte_errno is set. 2902 */ 2903 int 2904 mlx5_os_net_probe(struct mlx5_common_device *cdev, 2905 struct mlx5_kvargs_ctrl *mkvlist) 2906 { 2907 int ret; 2908 2909 if (rte_eal_process_type() == RTE_PROC_PRIMARY) 2910 mlx5_pmd_socket_init(); 2911 ret = mlx5_init_once(); 2912 if (ret) { 2913 DRV_LOG(ERR, "Unable to init PMD global data: %s", 2914 strerror(rte_errno)); 2915 return -rte_errno; 2916 } 2917 ret = mlx5_probe_again_args_validate(cdev, mkvlist); 2918 if (ret) { 2919 DRV_LOG(ERR, "Probe again parameters are not compatible : %s", 2920 strerror(rte_errno)); 2921 return -rte_errno; 2922 } 2923 if (mlx5_dev_is_pci(cdev->dev)) 2924 return mlx5_os_pci_probe(cdev, mkvlist); 2925 else 2926 return mlx5_os_auxiliary_probe(cdev, mkvlist); 2927 } 2928 2929 /** 2930 * Cleanup resources when the last device is closed. 2931 */ 2932 void 2933 mlx5_os_net_cleanup(void) 2934 { 2935 mlx5_pmd_socket_uninit(); 2936 } 2937 2938 /** 2939 * Install shared asynchronous device events handler. 2940 * This function is implemented to support event sharing 2941 * between multiple ports of single IB device. 2942 * 2943 * @param sh 2944 * Pointer to mlx5_dev_ctx_shared object. 2945 */ 2946 void 2947 mlx5_os_dev_shared_handler_install(struct mlx5_dev_ctx_shared *sh) 2948 { 2949 struct ibv_context *ctx = sh->cdev->ctx; 2950 int nlsk_fd; 2951 2952 sh->intr_handle = mlx5_os_interrupt_handler_create 2953 (RTE_INTR_INSTANCE_F_SHARED, true, 2954 ctx->async_fd, mlx5_dev_interrupt_handler, sh); 2955 if (!sh->intr_handle) { 2956 DRV_LOG(ERR, "Failed to allocate intr_handle."); 2957 return; 2958 } 2959 nlsk_fd = mlx5_nl_init(NETLINK_ROUTE, RTMGRP_LINK); 2960 if (nlsk_fd < 0) { 2961 DRV_LOG(ERR, "Failed to create a socket for Netlink events: %s", 2962 rte_strerror(rte_errno)); 2963 return; 2964 } 2965 sh->intr_handle_nl = mlx5_os_interrupt_handler_create 2966 (RTE_INTR_INSTANCE_F_SHARED, true, 2967 nlsk_fd, mlx5_dev_interrupt_handler_nl, sh); 2968 if (sh->intr_handle_nl == NULL) { 2969 DRV_LOG(ERR, "Fail to allocate intr_handle"); 2970 return; 2971 } 2972 if (sh->cdev->config.devx) { 2973 #ifdef HAVE_IBV_DEVX_ASYNC 2974 struct mlx5dv_devx_cmd_comp *devx_comp; 2975 2976 sh->devx_comp = (void *)mlx5_glue->devx_create_cmd_comp(ctx); 2977 devx_comp = sh->devx_comp; 2978 if (!devx_comp) { 2979 DRV_LOG(INFO, "failed to allocate devx_comp."); 2980 return; 2981 } 2982 sh->intr_handle_devx = mlx5_os_interrupt_handler_create 2983 (RTE_INTR_INSTANCE_F_SHARED, true, 2984 devx_comp->fd, 2985 mlx5_dev_interrupt_handler_devx, sh); 2986 if (!sh->intr_handle_devx) { 2987 DRV_LOG(ERR, "Failed to allocate intr_handle."); 2988 return; 2989 } 2990 #endif /* HAVE_IBV_DEVX_ASYNC */ 2991 } 2992 } 2993 2994 /** 2995 * Uninstall shared asynchronous device events handler. 2996 * This function is implemented to support event sharing 2997 * between multiple ports of single IB device. 2998 * 2999 * @param dev 3000 * Pointer to mlx5_dev_ctx_shared object. 3001 */ 3002 void 3003 mlx5_os_dev_shared_handler_uninstall(struct mlx5_dev_ctx_shared *sh) 3004 { 3005 mlx5_os_interrupt_handler_destroy(sh->intr_handle, 3006 mlx5_dev_interrupt_handler, sh); 3007 mlx5_os_interrupt_handler_destroy(sh->intr_handle_nl, 3008 mlx5_dev_interrupt_handler_nl, sh); 3009 #ifdef HAVE_IBV_DEVX_ASYNC 3010 mlx5_os_interrupt_handler_destroy(sh->intr_handle_devx, 3011 mlx5_dev_interrupt_handler_devx, sh); 3012 if (sh->devx_comp) 3013 mlx5_glue->devx_destroy_cmd_comp(sh->devx_comp); 3014 #endif 3015 } 3016 3017 /** 3018 * Read statistics by a named counter. 3019 * 3020 * @param[in] priv 3021 * Pointer to the private device data structure. 3022 * @param[in] ctr_name 3023 * Pointer to the name of the statistic counter to read 3024 * @param[out] stat 3025 * Pointer to read statistic value. 3026 * @return 3027 * 0 on success and stat is valud, 1 if failed to read the value 3028 * rte_errno is set. 3029 * 3030 */ 3031 int 3032 mlx5_os_read_dev_stat(struct mlx5_priv *priv, const char *ctr_name, 3033 uint64_t *stat) 3034 { 3035 int fd; 3036 3037 if (priv->sh) { 3038 if (priv->q_counters != NULL && 3039 strcmp(ctr_name, "out_of_buffer") == 0) { 3040 if (rte_eal_process_type() == RTE_PROC_SECONDARY) { 3041 DRV_LOG(WARNING, "DevX out_of_buffer counter is not supported in the secondary process"); 3042 rte_errno = ENOTSUP; 3043 return 1; 3044 } 3045 return mlx5_devx_cmd_queue_counter_query 3046 (priv->q_counters, 0, (uint32_t *)stat); 3047 } 3048 if (priv->q_counters_hairpin != NULL && 3049 strcmp(ctr_name, "hairpin_out_of_buffer") == 0) { 3050 if (rte_eal_process_type() == RTE_PROC_SECONDARY) { 3051 DRV_LOG(WARNING, "DevX out_of_buffer counter is not supported in the secondary process"); 3052 rte_errno = ENOTSUP; 3053 return 1; 3054 } 3055 return mlx5_devx_cmd_queue_counter_query 3056 (priv->q_counters_hairpin, 0, (uint32_t *)stat); 3057 } 3058 MKSTR(path, "%s/ports/%d/hw_counters/%s", 3059 priv->sh->ibdev_path, 3060 priv->dev_port, 3061 ctr_name); 3062 fd = open(path, O_RDONLY); 3063 /* 3064 * in switchdev the file location is not per port 3065 * but rather in <ibdev_path>/hw_counters/<file_name>. 3066 */ 3067 if (fd == -1) { 3068 MKSTR(path1, "%s/hw_counters/%s", 3069 priv->sh->ibdev_path, 3070 ctr_name); 3071 fd = open(path1, O_RDONLY); 3072 } 3073 if (fd != -1) { 3074 char buf[21] = {'\0'}; 3075 ssize_t n = read(fd, buf, sizeof(buf)); 3076 3077 close(fd); 3078 if (n != -1) { 3079 *stat = strtoull(buf, NULL, 10); 3080 return 0; 3081 } 3082 } 3083 } 3084 *stat = 0; 3085 return 1; 3086 } 3087 3088 /** 3089 * Remove a MAC address from device 3090 * 3091 * @param dev 3092 * Pointer to Ethernet device structure. 3093 * @param index 3094 * MAC address index. 3095 */ 3096 void 3097 mlx5_os_mac_addr_remove(struct rte_eth_dev *dev, uint32_t index) 3098 { 3099 struct mlx5_priv *priv = dev->data->dev_private; 3100 const int vf = priv->sh->dev_cap.vf; 3101 3102 if (vf) 3103 mlx5_nl_mac_addr_remove(priv->nl_socket_route, 3104 mlx5_ifindex(dev), priv->mac_own, 3105 &dev->data->mac_addrs[index], index); 3106 } 3107 3108 /** 3109 * Adds a MAC address to the device 3110 * 3111 * @param dev 3112 * Pointer to Ethernet device structure. 3113 * @param mac_addr 3114 * MAC address to register. 3115 * @param index 3116 * MAC address index. 3117 * 3118 * @return 3119 * 0 on success, a negative errno value otherwise 3120 */ 3121 int 3122 mlx5_os_mac_addr_add(struct rte_eth_dev *dev, struct rte_ether_addr *mac, 3123 uint32_t index) 3124 { 3125 struct mlx5_priv *priv = dev->data->dev_private; 3126 const int vf = priv->sh->dev_cap.vf; 3127 int ret = 0; 3128 3129 if (vf) 3130 ret = mlx5_nl_mac_addr_add(priv->nl_socket_route, 3131 mlx5_ifindex(dev), priv->mac_own, 3132 mac, index); 3133 return ret; 3134 } 3135 3136 /** 3137 * Modify a VF MAC address 3138 * 3139 * @param priv 3140 * Pointer to device private data. 3141 * @param mac_addr 3142 * MAC address to modify into. 3143 * @param iface_idx 3144 * Net device interface index 3145 * @param vf_index 3146 * VF index 3147 * 3148 * @return 3149 * 0 on success, a negative errno value otherwise 3150 */ 3151 int 3152 mlx5_os_vf_mac_addr_modify(struct mlx5_priv *priv, 3153 unsigned int iface_idx, 3154 struct rte_ether_addr *mac_addr, 3155 int vf_index) 3156 { 3157 return mlx5_nl_vf_mac_addr_modify 3158 (priv->nl_socket_route, iface_idx, mac_addr, vf_index); 3159 } 3160 3161 /** 3162 * Set device promiscuous mode 3163 * 3164 * @param dev 3165 * Pointer to Ethernet device structure. 3166 * @param enable 3167 * 0 - promiscuous is disabled, otherwise - enabled 3168 * 3169 * @return 3170 * 0 on success, a negative error value otherwise 3171 */ 3172 int 3173 mlx5_os_set_promisc(struct rte_eth_dev *dev, int enable) 3174 { 3175 struct mlx5_priv *priv = dev->data->dev_private; 3176 3177 return mlx5_nl_promisc(priv->nl_socket_route, 3178 mlx5_ifindex(dev), !!enable); 3179 } 3180 3181 /** 3182 * Set device promiscuous mode 3183 * 3184 * @param dev 3185 * Pointer to Ethernet device structure. 3186 * @param enable 3187 * 0 - all multicase is disabled, otherwise - enabled 3188 * 3189 * @return 3190 * 0 on success, a negative error value otherwise 3191 */ 3192 int 3193 mlx5_os_set_allmulti(struct rte_eth_dev *dev, int enable) 3194 { 3195 struct mlx5_priv *priv = dev->data->dev_private; 3196 3197 return mlx5_nl_allmulti(priv->nl_socket_route, 3198 mlx5_ifindex(dev), !!enable); 3199 } 3200 3201 /** 3202 * Flush device MAC addresses 3203 * 3204 * @param dev 3205 * Pointer to Ethernet device structure. 3206 * 3207 */ 3208 void 3209 mlx5_os_mac_addr_flush(struct rte_eth_dev *dev) 3210 { 3211 struct mlx5_priv *priv = dev->data->dev_private; 3212 3213 mlx5_nl_mac_addr_flush(priv->nl_socket_route, mlx5_ifindex(dev), 3214 dev->data->mac_addrs, 3215 MLX5_MAX_MAC_ADDRESSES, priv->mac_own); 3216 } 3217