1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright 2015 6WIND S.A. 3 * Copyright 2015 Mellanox Technologies, Ltd 4 */ 5 6 #include <stddef.h> 7 #include <unistd.h> 8 #include <string.h> 9 #include <assert.h> 10 #include <dlfcn.h> 11 #include <stdint.h> 12 #include <stdlib.h> 13 #include <errno.h> 14 #include <net/if.h> 15 #include <sys/mman.h> 16 #include <linux/rtnetlink.h> 17 18 /* Verbs header. */ 19 /* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */ 20 #ifdef PEDANTIC 21 #pragma GCC diagnostic ignored "-Wpedantic" 22 #endif 23 #include <infiniband/verbs.h> 24 #ifdef PEDANTIC 25 #pragma GCC diagnostic error "-Wpedantic" 26 #endif 27 28 #include <rte_malloc.h> 29 #include <rte_ethdev_driver.h> 30 #include <rte_ethdev_pci.h> 31 #include <rte_pci.h> 32 #include <rte_bus_pci.h> 33 #include <rte_common.h> 34 #include <rte_config.h> 35 #include <rte_eal_memconfig.h> 36 #include <rte_kvargs.h> 37 #include <rte_rwlock.h> 38 #include <rte_spinlock.h> 39 #include <rte_string_fns.h> 40 41 #include "mlx5.h" 42 #include "mlx5_utils.h" 43 #include "mlx5_rxtx.h" 44 #include "mlx5_autoconf.h" 45 #include "mlx5_defs.h" 46 #include "mlx5_glue.h" 47 #include "mlx5_mr.h" 48 #include "mlx5_flow.h" 49 50 /* Device parameter to enable RX completion queue compression. */ 51 #define MLX5_RXQ_CQE_COMP_EN "rxq_cqe_comp_en" 52 53 /* Device parameter to enable RX completion entry padding to 128B. */ 54 #define MLX5_RXQ_CQE_PAD_EN "rxq_cqe_pad_en" 55 56 /* Device parameter to enable padding Rx packet to cacheline size. */ 57 #define MLX5_RXQ_PKT_PAD_EN "rxq_pkt_pad_en" 58 59 /* Device parameter to enable Multi-Packet Rx queue. */ 60 #define MLX5_RX_MPRQ_EN "mprq_en" 61 62 /* Device parameter to configure log 2 of the number of strides for MPRQ. */ 63 #define MLX5_RX_MPRQ_LOG_STRIDE_NUM "mprq_log_stride_num" 64 65 /* Device parameter to limit the size of memcpy'd packet for MPRQ. */ 66 #define MLX5_RX_MPRQ_MAX_MEMCPY_LEN "mprq_max_memcpy_len" 67 68 /* Device parameter to set the minimum number of Rx queues to enable MPRQ. */ 69 #define MLX5_RXQS_MIN_MPRQ "rxqs_min_mprq" 70 71 /* Device parameter to configure inline send. */ 72 #define MLX5_TXQ_INLINE "txq_inline" 73 74 /* 75 * Device parameter to configure the number of TX queues threshold for 76 * enabling inline send. 77 */ 78 #define MLX5_TXQS_MIN_INLINE "txqs_min_inline" 79 80 /* 81 * Device parameter to configure the number of TX queues threshold for 82 * enabling vectorized Tx. 83 */ 84 #define MLX5_TXQS_MAX_VEC "txqs_max_vec" 85 86 /* Device parameter to enable multi-packet send WQEs. */ 87 #define MLX5_TXQ_MPW_EN "txq_mpw_en" 88 89 /* Device parameter to include 2 dsegs in the title WQEBB. */ 90 #define MLX5_TXQ_MPW_HDR_DSEG_EN "txq_mpw_hdr_dseg_en" 91 92 /* Device parameter to limit the size of inlining packet. */ 93 #define MLX5_TXQ_MAX_INLINE_LEN "txq_max_inline_len" 94 95 /* Device parameter to enable hardware Tx vector. */ 96 #define MLX5_TX_VEC_EN "tx_vec_en" 97 98 /* Device parameter to enable hardware Rx vector. */ 99 #define MLX5_RX_VEC_EN "rx_vec_en" 100 101 /* Allow L3 VXLAN flow creation. */ 102 #define MLX5_L3_VXLAN_EN "l3_vxlan_en" 103 104 /* Activate DV flow steering. */ 105 #define MLX5_DV_FLOW_EN "dv_flow_en" 106 107 /* Activate Netlink support in VF mode. */ 108 #define MLX5_VF_NL_EN "vf_nl_en" 109 110 /* Select port representors to instantiate. 
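 *
 * Illustrative example (the exact list syntax depends on the DPDK version
 * in use): a whitelist devargs string such as
 *   -w 0000:03:00.0,representor=[0-2]
 * would restrict instantiation to representors 0, 1 and 2 of that PF.
 * The key itself is parsed by rte_eth_devargs_parse() from
 * mlx5_dev_spawn(), not by mlx5_args_check().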
 */
#define MLX5_REPRESENTOR "representor"

#ifndef HAVE_IBV_MLX5_MOD_MPW
#define MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED (1 << 2)
#define MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW (1 << 3)
#endif

#ifndef HAVE_IBV_MLX5_MOD_CQE_128B_COMP
#define MLX5DV_CONTEXT_FLAGS_CQE_128B_COMP (1 << 4)
#endif

static const char *MZ_MLX5_PMD_SHARED_DATA = "mlx5_pmd_shared_data";

/* Shared memory between primary and secondary processes. */
struct mlx5_shared_data *mlx5_shared_data;

/* Spinlock for mlx5_shared_data allocation. */
static rte_spinlock_t mlx5_shared_data_lock = RTE_SPINLOCK_INITIALIZER;

/** Driver-specific log messages type. */
int mlx5_logtype;

/** Data associated with devices to spawn. */
struct mlx5_dev_spawn_data {
	uint32_t ifindex; /**< Network interface index. */
	uint32_t max_port; /**< IB device maximal port index. */
	uint32_t ibv_port; /**< IB device physical port index. */
	struct mlx5_switch_info info; /**< Switch information. */
	struct ibv_device *ibv_dev; /**< Associated IB device. */
	struct rte_eth_dev *eth_dev; /**< Associated Ethernet device. */
};

static LIST_HEAD(, mlx5_ibv_shared) mlx5_ibv_list = LIST_HEAD_INITIALIZER();
static pthread_mutex_t mlx5_ibv_list_mutex = PTHREAD_MUTEX_INITIALIZER;

/**
 * Allocate a shared IB device context. If there is a multiport device,
 * the master and representors will share this context; if there is a
 * single-port dedicated IB device, the context will be used by only
 * the given port due to unification.
 *
 * The routine first searches the context list for the specified IB device
 * name; if found, the shared context is reused and its reference counter
 * is incremented. If no context is found, a new one is created and
 * initialized with the specified IB device context and parameters.
 *
 * @param[in] spawn
 *   Pointer to the IB device attributes (name, port, etc).
 *
 * @return
 *   Pointer to mlx5_ibv_shared object on success,
 *   otherwise NULL and rte_errno is set.
 */
static struct mlx5_ibv_shared *
mlx5_alloc_shared_ibctx(const struct mlx5_dev_spawn_data *spawn)
{
	struct mlx5_ibv_shared *sh;
	int err = 0;
	uint32_t i;

	assert(spawn);
	/* Secondary process should not create the shared context. */
	assert(rte_eal_process_type() == RTE_PROC_PRIMARY);
	pthread_mutex_lock(&mlx5_ibv_list_mutex);
	/* Search for IB context by device name. */
	LIST_FOREACH(sh, &mlx5_ibv_list, next) {
		if (!strcmp(sh->ibdev_name, spawn->ibv_dev->name)) {
			sh->refcnt++;
			goto exit;
		}
	}
	/* No device found, we have to create a new shared context. */
	assert(spawn->max_port);
	sh = rte_zmalloc("ethdev shared ib context",
			 sizeof(struct mlx5_ibv_shared) +
			 spawn->max_port *
			 sizeof(struct mlx5_ibv_shared_port),
			 RTE_CACHE_LINE_SIZE);
	if (!sh) {
		DRV_LOG(ERR, "shared context allocation failure");
		rte_errno = ENOMEM;
		goto exit;
	}
	/* Try to open IB device with DV first, then usual Verbs. */
	errno = 0;
	sh->ctx = mlx5_glue->dv_open_device(spawn->ibv_dev);
	if (sh->ctx) {
		sh->devx = 1;
		DRV_LOG(DEBUG, "DevX is supported");
	} else {
		sh->ctx = mlx5_glue->open_device(spawn->ibv_dev);
		if (!sh->ctx) {
			err = errno ?
			      errno : ENODEV;
			goto error;
		}
		DRV_LOG(DEBUG, "DevX is NOT supported");
	}
	err = mlx5_glue->query_device_ex(sh->ctx, NULL, &sh->device_attr);
	if (err) {
		DRV_LOG(DEBUG, "ibv_query_device_ex() failed");
		goto error;
	}
	sh->refcnt = 1;
	sh->max_port = spawn->max_port;
	strncpy(sh->ibdev_name, sh->ctx->device->name,
		sizeof(sh->ibdev_name));
	strncpy(sh->ibdev_path, sh->ctx->device->ibdev_path,
		sizeof(sh->ibdev_path));
	pthread_mutex_init(&sh->intr_mutex, NULL);
	/*
	 * Setting port_id to the maximum disallowed value
	 * (RTE_MAX_ETHPORTS) means there is no interrupt handler
	 * installed for the given port index i.
	 */
	for (i = 0; i < sh->max_port; i++)
		sh->port[i].ih_port_id = RTE_MAX_ETHPORTS;
	sh->pd = mlx5_glue->alloc_pd(sh->ctx);
	if (sh->pd == NULL) {
		DRV_LOG(ERR, "PD allocation failure");
		err = ENOMEM;
		goto error;
	}
	LIST_INSERT_HEAD(&mlx5_ibv_list, sh, next);
exit:
	pthread_mutex_unlock(&mlx5_ibv_list_mutex);
	return sh;
error:
	pthread_mutex_unlock(&mlx5_ibv_list_mutex);
	assert(sh);
	if (sh->pd)
		claim_zero(mlx5_glue->dealloc_pd(sh->pd));
	if (sh->ctx)
		claim_zero(mlx5_glue->close_device(sh->ctx));
	rte_free(sh);
	assert(err > 0);
	rte_errno = err;
	return NULL;
}

/**
 * Free the shared IB device context. Decrement the reference counter
 * and, if it reaches zero, free all allocated resources and close handles.
 *
 * @param[in] sh
 *   Pointer to mlx5_ibv_shared object to free
 */
static void
mlx5_free_shared_ibctx(struct mlx5_ibv_shared *sh)
{
	pthread_mutex_lock(&mlx5_ibv_list_mutex);
#ifndef NDEBUG
	/* Check the object presence in the list. */
	struct mlx5_ibv_shared *lctx;

	LIST_FOREACH(lctx, &mlx5_ibv_list, next)
		if (lctx == sh)
			break;
	assert(lctx);
	if (lctx != sh) {
		DRV_LOG(ERR, "Freeing non-existing shared IB context");
		goto exit;
	}
#endif
	assert(sh);
	assert(sh->refcnt);
	/* Secondary process should not free the shared context. */
	assert(rte_eal_process_type() == RTE_PROC_PRIMARY);
	if (--sh->refcnt)
		goto exit;
	LIST_REMOVE(sh, next);
	/*
	 * Ensure there is no async event handler installed.
	 * Only primary process handles async device events.
	 */
	assert(!sh->intr_cnt);
	if (sh->intr_cnt)
		rte_intr_callback_unregister
			(&sh->intr_handle, mlx5_dev_interrupt_handler, sh);
	pthread_mutex_destroy(&sh->intr_mutex);
	if (sh->pd)
		claim_zero(mlx5_glue->dealloc_pd(sh->pd));
	if (sh->ctx)
		claim_zero(mlx5_glue->close_device(sh->ctx));
	rte_free(sh);
exit:
	pthread_mutex_unlock(&mlx5_ibv_list_mutex);
}

/**
 * Prepare shared data between primary and secondary process.
 */
static void
mlx5_prepare_shared_data(void)
{
	const struct rte_memzone *mz;

	rte_spinlock_lock(&mlx5_shared_data_lock);
	if (mlx5_shared_data == NULL) {
		if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
			/* Allocate shared memory. */
			mz = rte_memzone_reserve(MZ_MLX5_PMD_SHARED_DATA,
						 sizeof(*mlx5_shared_data),
						 SOCKET_ID_ANY, 0);
		} else {
			/* Lookup allocated shared memory. */
			mz = rte_memzone_lookup(MZ_MLX5_PMD_SHARED_DATA);
		}
		if (mz == NULL)
			rte_panic("Cannot allocate mlx5 shared data\n");
		mlx5_shared_data = mz->addr;
		/* Initialize shared data.
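		 * Only the primary process initializes the memory event
		 * callback list and its rwlock; both primary and secondary
		 * then register mlx5_mr_mem_event_cb() below so their MR
		 * caches can react to EAL memory hotplug events.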
*/ 322 if (rte_eal_process_type() == RTE_PROC_PRIMARY) { 323 LIST_INIT(&mlx5_shared_data->mem_event_cb_list); 324 rte_rwlock_init(&mlx5_shared_data->mem_event_rwlock); 325 } 326 rte_mem_event_callback_register("MLX5_MEM_EVENT_CB", 327 mlx5_mr_mem_event_cb, NULL); 328 } 329 rte_spinlock_unlock(&mlx5_shared_data_lock); 330 } 331 332 /** 333 * Retrieve integer value from environment variable. 334 * 335 * @param[in] name 336 * Environment variable name. 337 * 338 * @return 339 * Integer value, 0 if the variable is not set. 340 */ 341 int 342 mlx5_getenv_int(const char *name) 343 { 344 const char *val = getenv(name); 345 346 if (val == NULL) 347 return 0; 348 return atoi(val); 349 } 350 351 /** 352 * Verbs callback to allocate a memory. This function should allocate the space 353 * according to the size provided residing inside a huge page. 354 * Please note that all allocation must respect the alignment from libmlx5 355 * (i.e. currently sysconf(_SC_PAGESIZE)). 356 * 357 * @param[in] size 358 * The size in bytes of the memory to allocate. 359 * @param[in] data 360 * A pointer to the callback data. 361 * 362 * @return 363 * Allocated buffer, NULL otherwise and rte_errno is set. 364 */ 365 static void * 366 mlx5_alloc_verbs_buf(size_t size, void *data) 367 { 368 struct mlx5_priv *priv = data; 369 void *ret; 370 size_t alignment = sysconf(_SC_PAGESIZE); 371 unsigned int socket = SOCKET_ID_ANY; 372 373 if (priv->verbs_alloc_ctx.type == MLX5_VERBS_ALLOC_TYPE_TX_QUEUE) { 374 const struct mlx5_txq_ctrl *ctrl = priv->verbs_alloc_ctx.obj; 375 376 socket = ctrl->socket; 377 } else if (priv->verbs_alloc_ctx.type == 378 MLX5_VERBS_ALLOC_TYPE_RX_QUEUE) { 379 const struct mlx5_rxq_ctrl *ctrl = priv->verbs_alloc_ctx.obj; 380 381 socket = ctrl->socket; 382 } 383 assert(data != NULL); 384 ret = rte_malloc_socket(__func__, size, alignment, socket); 385 if (!ret && size) 386 rte_errno = ENOMEM; 387 return ret; 388 } 389 390 /** 391 * Verbs callback to free a memory. 392 * 393 * @param[in] ptr 394 * A pointer to the memory to free. 395 * @param[in] data 396 * A pointer to the callback data. 397 */ 398 static void 399 mlx5_free_verbs_buf(void *ptr, void *data __rte_unused) 400 { 401 assert(data != NULL); 402 rte_free(ptr); 403 } 404 405 /** 406 * DPDK callback to close the device. 407 * 408 * Destroy all queues and objects, free memory. 409 * 410 * @param dev 411 * Pointer to Ethernet device structure. 412 */ 413 static void 414 mlx5_dev_close(struct rte_eth_dev *dev) 415 { 416 struct mlx5_priv *priv = dev->data->dev_private; 417 unsigned int i; 418 int ret; 419 420 DRV_LOG(DEBUG, "port %u closing device \"%s\"", 421 dev->data->port_id, 422 ((priv->sh->ctx != NULL) ? priv->sh->ctx->device->name : "")); 423 /* In case mlx5_dev_stop() has not been called. */ 424 mlx5_dev_interrupt_handler_uninstall(dev); 425 mlx5_traffic_disable(dev); 426 mlx5_flow_flush(dev, NULL); 427 /* Prevent crashes when queues are still in use. */ 428 dev->rx_pkt_burst = removed_rx_burst; 429 dev->tx_pkt_burst = removed_tx_burst; 430 if (priv->rxqs != NULL) { 431 /* XXX race condition if mlx5_rx_burst() is still running. */ 432 usleep(1000); 433 for (i = 0; (i != priv->rxqs_n); ++i) 434 mlx5_rxq_release(dev, i); 435 priv->rxqs_n = 0; 436 priv->rxqs = NULL; 437 } 438 if (priv->txqs != NULL) { 439 /* XXX race condition if mlx5_tx_burst() is still running. 
*/ 440 usleep(1000); 441 for (i = 0; (i != priv->txqs_n); ++i) 442 mlx5_txq_release(dev, i); 443 priv->txqs_n = 0; 444 priv->txqs = NULL; 445 } 446 mlx5_mprq_free_mp(dev); 447 mlx5_mr_release(dev); 448 assert(priv->sh); 449 if (priv->sh) 450 mlx5_free_shared_ibctx(priv->sh); 451 priv->sh = NULL; 452 if (priv->rss_conf.rss_key != NULL) 453 rte_free(priv->rss_conf.rss_key); 454 if (priv->reta_idx != NULL) 455 rte_free(priv->reta_idx); 456 if (priv->primary_socket) 457 mlx5_socket_uninit(dev); 458 if (priv->config.vf) 459 mlx5_nl_mac_addr_flush(dev); 460 if (priv->nl_socket_route >= 0) 461 close(priv->nl_socket_route); 462 if (priv->nl_socket_rdma >= 0) 463 close(priv->nl_socket_rdma); 464 if (priv->tcf_context) 465 mlx5_flow_tcf_context_destroy(priv->tcf_context); 466 ret = mlx5_hrxq_ibv_verify(dev); 467 if (ret) 468 DRV_LOG(WARNING, "port %u some hash Rx queue still remain", 469 dev->data->port_id); 470 ret = mlx5_ind_table_ibv_verify(dev); 471 if (ret) 472 DRV_LOG(WARNING, "port %u some indirection table still remain", 473 dev->data->port_id); 474 ret = mlx5_rxq_ibv_verify(dev); 475 if (ret) 476 DRV_LOG(WARNING, "port %u some Verbs Rx queue still remain", 477 dev->data->port_id); 478 ret = mlx5_rxq_verify(dev); 479 if (ret) 480 DRV_LOG(WARNING, "port %u some Rx queues still remain", 481 dev->data->port_id); 482 ret = mlx5_txq_ibv_verify(dev); 483 if (ret) 484 DRV_LOG(WARNING, "port %u some Verbs Tx queue still remain", 485 dev->data->port_id); 486 ret = mlx5_txq_verify(dev); 487 if (ret) 488 DRV_LOG(WARNING, "port %u some Tx queues still remain", 489 dev->data->port_id); 490 ret = mlx5_flow_verify(dev); 491 if (ret) 492 DRV_LOG(WARNING, "port %u some flows still remain", 493 dev->data->port_id); 494 if (priv->domain_id != RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID) { 495 unsigned int c = 0; 496 unsigned int i = mlx5_dev_to_port_id(dev->device, NULL, 0); 497 uint16_t port_id[i]; 498 499 i = RTE_MIN(mlx5_dev_to_port_id(dev->device, port_id, i), i); 500 while (i--) { 501 struct mlx5_priv *opriv = 502 rte_eth_devices[port_id[i]].data->dev_private; 503 504 if (!opriv || 505 opriv->domain_id != priv->domain_id || 506 &rte_eth_devices[port_id[i]] == dev) 507 continue; 508 ++c; 509 } 510 if (!c) 511 claim_zero(rte_eth_switch_domain_free(priv->domain_id)); 512 } 513 memset(priv, 0, sizeof(*priv)); 514 priv->domain_id = RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID; 515 /* 516 * Reset mac_addrs to NULL such that it is not freed as part of 517 * rte_eth_dev_release_port(). mac_addrs is part of dev_private so 518 * it is freed when dev_private is freed. 
519 */ 520 dev->data->mac_addrs = NULL; 521 } 522 523 const struct eth_dev_ops mlx5_dev_ops = { 524 .dev_configure = mlx5_dev_configure, 525 .dev_start = mlx5_dev_start, 526 .dev_stop = mlx5_dev_stop, 527 .dev_set_link_down = mlx5_set_link_down, 528 .dev_set_link_up = mlx5_set_link_up, 529 .dev_close = mlx5_dev_close, 530 .promiscuous_enable = mlx5_promiscuous_enable, 531 .promiscuous_disable = mlx5_promiscuous_disable, 532 .allmulticast_enable = mlx5_allmulticast_enable, 533 .allmulticast_disable = mlx5_allmulticast_disable, 534 .link_update = mlx5_link_update, 535 .stats_get = mlx5_stats_get, 536 .stats_reset = mlx5_stats_reset, 537 .xstats_get = mlx5_xstats_get, 538 .xstats_reset = mlx5_xstats_reset, 539 .xstats_get_names = mlx5_xstats_get_names, 540 .fw_version_get = mlx5_fw_version_get, 541 .dev_infos_get = mlx5_dev_infos_get, 542 .dev_supported_ptypes_get = mlx5_dev_supported_ptypes_get, 543 .vlan_filter_set = mlx5_vlan_filter_set, 544 .rx_queue_setup = mlx5_rx_queue_setup, 545 .tx_queue_setup = mlx5_tx_queue_setup, 546 .rx_queue_release = mlx5_rx_queue_release, 547 .tx_queue_release = mlx5_tx_queue_release, 548 .flow_ctrl_get = mlx5_dev_get_flow_ctrl, 549 .flow_ctrl_set = mlx5_dev_set_flow_ctrl, 550 .mac_addr_remove = mlx5_mac_addr_remove, 551 .mac_addr_add = mlx5_mac_addr_add, 552 .mac_addr_set = mlx5_mac_addr_set, 553 .set_mc_addr_list = mlx5_set_mc_addr_list, 554 .mtu_set = mlx5_dev_set_mtu, 555 .vlan_strip_queue_set = mlx5_vlan_strip_queue_set, 556 .vlan_offload_set = mlx5_vlan_offload_set, 557 .reta_update = mlx5_dev_rss_reta_update, 558 .reta_query = mlx5_dev_rss_reta_query, 559 .rss_hash_update = mlx5_rss_hash_update, 560 .rss_hash_conf_get = mlx5_rss_hash_conf_get, 561 .filter_ctrl = mlx5_dev_filter_ctrl, 562 .rx_descriptor_status = mlx5_rx_descriptor_status, 563 .tx_descriptor_status = mlx5_tx_descriptor_status, 564 .rx_queue_count = mlx5_rx_queue_count, 565 .rx_queue_intr_enable = mlx5_rx_intr_enable, 566 .rx_queue_intr_disable = mlx5_rx_intr_disable, 567 .is_removed = mlx5_is_removed, 568 }; 569 570 /* Available operations from secondary process. */ 571 static const struct eth_dev_ops mlx5_dev_sec_ops = { 572 .stats_get = mlx5_stats_get, 573 .stats_reset = mlx5_stats_reset, 574 .xstats_get = mlx5_xstats_get, 575 .xstats_reset = mlx5_xstats_reset, 576 .xstats_get_names = mlx5_xstats_get_names, 577 .fw_version_get = mlx5_fw_version_get, 578 .dev_infos_get = mlx5_dev_infos_get, 579 .rx_descriptor_status = mlx5_rx_descriptor_status, 580 .tx_descriptor_status = mlx5_tx_descriptor_status, 581 }; 582 583 /* Available operations in flow isolated mode. 
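 * Compared to mlx5_dev_ops above, this table omits the RSS/RETA
 * update/query and Rx queue count callbacks: in isolated mode the
 * application takes over ingress traffic classification entirely
 * through rte_flow rules.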
*/ 584 const struct eth_dev_ops mlx5_dev_ops_isolate = { 585 .dev_configure = mlx5_dev_configure, 586 .dev_start = mlx5_dev_start, 587 .dev_stop = mlx5_dev_stop, 588 .dev_set_link_down = mlx5_set_link_down, 589 .dev_set_link_up = mlx5_set_link_up, 590 .dev_close = mlx5_dev_close, 591 .promiscuous_enable = mlx5_promiscuous_enable, 592 .promiscuous_disable = mlx5_promiscuous_disable, 593 .allmulticast_enable = mlx5_allmulticast_enable, 594 .allmulticast_disable = mlx5_allmulticast_disable, 595 .link_update = mlx5_link_update, 596 .stats_get = mlx5_stats_get, 597 .stats_reset = mlx5_stats_reset, 598 .xstats_get = mlx5_xstats_get, 599 .xstats_reset = mlx5_xstats_reset, 600 .xstats_get_names = mlx5_xstats_get_names, 601 .fw_version_get = mlx5_fw_version_get, 602 .dev_infos_get = mlx5_dev_infos_get, 603 .dev_supported_ptypes_get = mlx5_dev_supported_ptypes_get, 604 .vlan_filter_set = mlx5_vlan_filter_set, 605 .rx_queue_setup = mlx5_rx_queue_setup, 606 .tx_queue_setup = mlx5_tx_queue_setup, 607 .rx_queue_release = mlx5_rx_queue_release, 608 .tx_queue_release = mlx5_tx_queue_release, 609 .flow_ctrl_get = mlx5_dev_get_flow_ctrl, 610 .flow_ctrl_set = mlx5_dev_set_flow_ctrl, 611 .mac_addr_remove = mlx5_mac_addr_remove, 612 .mac_addr_add = mlx5_mac_addr_add, 613 .mac_addr_set = mlx5_mac_addr_set, 614 .set_mc_addr_list = mlx5_set_mc_addr_list, 615 .mtu_set = mlx5_dev_set_mtu, 616 .vlan_strip_queue_set = mlx5_vlan_strip_queue_set, 617 .vlan_offload_set = mlx5_vlan_offload_set, 618 .filter_ctrl = mlx5_dev_filter_ctrl, 619 .rx_descriptor_status = mlx5_rx_descriptor_status, 620 .tx_descriptor_status = mlx5_tx_descriptor_status, 621 .rx_queue_intr_enable = mlx5_rx_intr_enable, 622 .rx_queue_intr_disable = mlx5_rx_intr_disable, 623 .is_removed = mlx5_is_removed, 624 }; 625 626 /** 627 * Verify and store value for device argument. 628 * 629 * @param[in] key 630 * Key argument to verify. 631 * @param[in] val 632 * Value associated with key. 633 * @param opaque 634 * User data. 635 * 636 * @return 637 * 0 on success, a negative errno value otherwise and rte_errno is set. 638 */ 639 static int 640 mlx5_args_check(const char *key, const char *val, void *opaque) 641 { 642 struct mlx5_dev_config *config = opaque; 643 unsigned long tmp; 644 645 /* No-op, port representors are processed in mlx5_dev_spawn(). 
*/ 646 if (!strcmp(MLX5_REPRESENTOR, key)) 647 return 0; 648 errno = 0; 649 tmp = strtoul(val, NULL, 0); 650 if (errno) { 651 rte_errno = errno; 652 DRV_LOG(WARNING, "%s: \"%s\" is not a valid integer", key, val); 653 return -rte_errno; 654 } 655 if (strcmp(MLX5_RXQ_CQE_COMP_EN, key) == 0) { 656 config->cqe_comp = !!tmp; 657 } else if (strcmp(MLX5_RXQ_CQE_PAD_EN, key) == 0) { 658 config->cqe_pad = !!tmp; 659 } else if (strcmp(MLX5_RXQ_PKT_PAD_EN, key) == 0) { 660 config->hw_padding = !!tmp; 661 } else if (strcmp(MLX5_RX_MPRQ_EN, key) == 0) { 662 config->mprq.enabled = !!tmp; 663 } else if (strcmp(MLX5_RX_MPRQ_LOG_STRIDE_NUM, key) == 0) { 664 config->mprq.stride_num_n = tmp; 665 } else if (strcmp(MLX5_RX_MPRQ_MAX_MEMCPY_LEN, key) == 0) { 666 config->mprq.max_memcpy_len = tmp; 667 } else if (strcmp(MLX5_RXQS_MIN_MPRQ, key) == 0) { 668 config->mprq.min_rxqs_num = tmp; 669 } else if (strcmp(MLX5_TXQ_INLINE, key) == 0) { 670 config->txq_inline = tmp; 671 } else if (strcmp(MLX5_TXQS_MIN_INLINE, key) == 0) { 672 config->txqs_inline = tmp; 673 } else if (strcmp(MLX5_TXQS_MAX_VEC, key) == 0) { 674 config->txqs_vec = tmp; 675 } else if (strcmp(MLX5_TXQ_MPW_EN, key) == 0) { 676 config->mps = !!tmp; 677 } else if (strcmp(MLX5_TXQ_MPW_HDR_DSEG_EN, key) == 0) { 678 config->mpw_hdr_dseg = !!tmp; 679 } else if (strcmp(MLX5_TXQ_MAX_INLINE_LEN, key) == 0) { 680 config->inline_max_packet_sz = tmp; 681 } else if (strcmp(MLX5_TX_VEC_EN, key) == 0) { 682 config->tx_vec_en = !!tmp; 683 } else if (strcmp(MLX5_RX_VEC_EN, key) == 0) { 684 config->rx_vec_en = !!tmp; 685 } else if (strcmp(MLX5_L3_VXLAN_EN, key) == 0) { 686 config->l3_vxlan_en = !!tmp; 687 } else if (strcmp(MLX5_VF_NL_EN, key) == 0) { 688 config->vf_nl_en = !!tmp; 689 } else if (strcmp(MLX5_DV_FLOW_EN, key) == 0) { 690 config->dv_flow_en = !!tmp; 691 } else { 692 DRV_LOG(WARNING, "%s: unknown parameter", key); 693 rte_errno = EINVAL; 694 return -rte_errno; 695 } 696 return 0; 697 } 698 699 /** 700 * Parse device parameters. 701 * 702 * @param config 703 * Pointer to device configuration structure. 704 * @param devargs 705 * Device arguments structure. 706 * 707 * @return 708 * 0 on success, a negative errno value otherwise and rte_errno is set. 709 */ 710 static int 711 mlx5_args(struct mlx5_dev_config *config, struct rte_devargs *devargs) 712 { 713 const char **params = (const char *[]){ 714 MLX5_RXQ_CQE_COMP_EN, 715 MLX5_RXQ_CQE_PAD_EN, 716 MLX5_RXQ_PKT_PAD_EN, 717 MLX5_RX_MPRQ_EN, 718 MLX5_RX_MPRQ_LOG_STRIDE_NUM, 719 MLX5_RX_MPRQ_MAX_MEMCPY_LEN, 720 MLX5_RXQS_MIN_MPRQ, 721 MLX5_TXQ_INLINE, 722 MLX5_TXQS_MIN_INLINE, 723 MLX5_TXQS_MAX_VEC, 724 MLX5_TXQ_MPW_EN, 725 MLX5_TXQ_MPW_HDR_DSEG_EN, 726 MLX5_TXQ_MAX_INLINE_LEN, 727 MLX5_TX_VEC_EN, 728 MLX5_RX_VEC_EN, 729 MLX5_L3_VXLAN_EN, 730 MLX5_VF_NL_EN, 731 MLX5_DV_FLOW_EN, 732 MLX5_REPRESENTOR, 733 NULL, 734 }; 735 struct rte_kvargs *kvlist; 736 int ret = 0; 737 int i; 738 739 if (devargs == NULL) 740 return 0; 741 /* Following UGLY cast is done to pass checkpatch. */ 742 kvlist = rte_kvargs_parse(devargs->args, params); 743 if (kvlist == NULL) 744 return 0; 745 /* Process parameters. 
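	 * Illustrative devargs (PCI address and values are examples only):
	 *   -w 0000:03:00.0,rxq_cqe_comp_en=0,mprq_en=1,txq_inline=256
	 * Each key present in the kvlist is routed to mlx5_args_check().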
*/ 746 for (i = 0; (params[i] != NULL); ++i) { 747 if (rte_kvargs_count(kvlist, params[i])) { 748 ret = rte_kvargs_process(kvlist, params[i], 749 mlx5_args_check, config); 750 if (ret) { 751 rte_errno = EINVAL; 752 rte_kvargs_free(kvlist); 753 return -rte_errno; 754 } 755 } 756 } 757 rte_kvargs_free(kvlist); 758 return 0; 759 } 760 761 static struct rte_pci_driver mlx5_driver; 762 763 /* 764 * Reserved UAR address space for TXQ UAR(hw doorbell) mapping, process 765 * local resource used by both primary and secondary to avoid duplicate 766 * reservation. 767 * The space has to be available on both primary and secondary process, 768 * TXQ UAR maps to this area using fixed mmap w/o double check. 769 */ 770 static void *uar_base; 771 772 static int 773 find_lower_va_bound(const struct rte_memseg_list *msl, 774 const struct rte_memseg *ms, void *arg) 775 { 776 void **addr = arg; 777 778 if (msl->external) 779 return 0; 780 if (*addr == NULL) 781 *addr = ms->addr; 782 else 783 *addr = RTE_MIN(*addr, ms->addr); 784 785 return 0; 786 } 787 788 /** 789 * Reserve UAR address space for primary process. 790 * 791 * @param[in] dev 792 * Pointer to Ethernet device. 793 * 794 * @return 795 * 0 on success, a negative errno value otherwise and rte_errno is set. 796 */ 797 static int 798 mlx5_uar_init_primary(struct rte_eth_dev *dev) 799 { 800 struct mlx5_priv *priv = dev->data->dev_private; 801 void *addr = (void *)0; 802 803 if (uar_base) { /* UAR address space mapped. */ 804 priv->uar_base = uar_base; 805 return 0; 806 } 807 /* find out lower bound of hugepage segments */ 808 rte_memseg_walk(find_lower_va_bound, &addr); 809 810 /* keep distance to hugepages to minimize potential conflicts. */ 811 addr = RTE_PTR_SUB(addr, (uintptr_t)(MLX5_UAR_OFFSET + MLX5_UAR_SIZE)); 812 /* anonymous mmap, no real memory consumption. */ 813 addr = mmap(addr, MLX5_UAR_SIZE, 814 PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); 815 if (addr == MAP_FAILED) { 816 DRV_LOG(ERR, 817 "port %u failed to reserve UAR address space, please" 818 " adjust MLX5_UAR_SIZE or try --base-virtaddr", 819 dev->data->port_id); 820 rte_errno = ENOMEM; 821 return -rte_errno; 822 } 823 /* Accept either same addr or a new addr returned from mmap if target 824 * range occupied. 825 */ 826 DRV_LOG(INFO, "port %u reserved UAR address space: %p", 827 dev->data->port_id, addr); 828 priv->uar_base = addr; /* for primary and secondary UAR re-mmap. */ 829 uar_base = addr; /* process local, don't reserve again. */ 830 return 0; 831 } 832 833 /** 834 * Reserve UAR address space for secondary process, align with 835 * primary process. 836 * 837 * @param[in] dev 838 * Pointer to Ethernet device. 839 * 840 * @return 841 * 0 on success, a negative errno value otherwise and rte_errno is set. 842 */ 843 static int 844 mlx5_uar_init_secondary(struct rte_eth_dev *dev) 845 { 846 struct mlx5_priv *priv = dev->data->dev_private; 847 void *addr; 848 849 assert(priv->uar_base); 850 if (uar_base) { /* already reserved. */ 851 assert(uar_base == priv->uar_base); 852 return 0; 853 } 854 /* anonymous mmap, no real memory consumption. 
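	 * PROT_NONE merely reserves the virtual address range, no
	 * physical pages are committed. The mapping must land exactly
	 * at the address reserved by the primary process so that the
	 * Tx UAR (doorbell) remapping done later can use fixed offsets.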
 */
	addr = mmap(priv->uar_base, MLX5_UAR_SIZE,
		    PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (addr == MAP_FAILED) {
		DRV_LOG(ERR, "port %u UAR mmap failed: %p size: %llu",
			dev->data->port_id, priv->uar_base, MLX5_UAR_SIZE);
		rte_errno = ENXIO;
		return -rte_errno;
	}
	if (priv->uar_base != addr) {
		DRV_LOG(ERR,
			"port %u UAR address %p size %llu occupied, please"
			" adjust MLX5_UAR_OFFSET or try EAL parameter"
			" --base-virtaddr",
			dev->data->port_id, priv->uar_base, MLX5_UAR_SIZE);
		rte_errno = ENXIO;
		return -rte_errno;
	}
	uar_base = addr; /* process local, don't reserve again */
	DRV_LOG(INFO, "port %u reserved UAR address space: %p",
		dev->data->port_id, addr);
	return 0;
}

/**
 * Spawn an Ethernet device from Verbs information.
 *
 * @param dpdk_dev
 *   Backing DPDK device.
 * @param spawn
 *   Verbs device parameters (name, port, switch_info) to spawn.
 * @param config
 *   Device configuration parameters.
 *
 * @return
 *   A valid Ethernet device object on success, NULL otherwise and rte_errno
 *   is set. The following errors are defined:
 *
 *   EBUSY: device is not supposed to be spawned.
 *   EEXIST: device is already spawned.
 */
static struct rte_eth_dev *
mlx5_dev_spawn(struct rte_device *dpdk_dev,
	       struct mlx5_dev_spawn_data *spawn,
	       struct mlx5_dev_config config)
{
	const struct mlx5_switch_info *switch_info = &spawn->info;
	struct mlx5_ibv_shared *sh = NULL;
	struct ibv_port_attr port_attr;
	struct mlx5dv_context dv_attr = { .comp_mask = 0 };
	struct rte_eth_dev *eth_dev = NULL;
	struct mlx5_priv *priv = NULL;
	int err = 0;
	unsigned int hw_padding = 0;
	unsigned int mps;
	unsigned int cqe_comp;
	unsigned int cqe_pad = 0;
	unsigned int tunnel_en = 0;
	unsigned int mpls_en = 0;
	unsigned int swp = 0;
	unsigned int mprq = 0;
	unsigned int mprq_min_stride_size_n = 0;
	unsigned int mprq_max_stride_size_n = 0;
	unsigned int mprq_min_stride_num_n = 0;
	unsigned int mprq_max_stride_num_n = 0;
	struct ether_addr mac;
	char name[RTE_ETH_NAME_MAX_LEN];
	int own_domain_id = 0;
	uint16_t port_id;
	unsigned int i;

	/* Determine if this port representor is supposed to be spawned. */
	if (switch_info->representor && dpdk_dev->devargs) {
		struct rte_eth_devargs eth_da;

		err = rte_eth_devargs_parse(dpdk_dev->devargs->args, &eth_da);
		if (err) {
			rte_errno = -err;
			DRV_LOG(ERR, "failed to process device arguments: %s",
				strerror(rte_errno));
			return NULL;
		}
		for (i = 0; i < eth_da.nb_representor_ports; ++i)
			if (eth_da.representor_ports[i] ==
			    (uint16_t)switch_info->port_name)
				break;
		if (i == eth_da.nb_representor_ports) {
			rte_errno = EBUSY;
			return NULL;
		}
	}
	/* Build device name. */
	if (!switch_info->representor)
		strlcpy(name, dpdk_dev->name, sizeof(name));
	else
		snprintf(name, sizeof(name), "%s_representor_%u",
			 dpdk_dev->name, switch_info->port_name);
	/* check if the device is already spawned */
	if (rte_eth_dev_get_port_by_name(name, &port_id) == 0) {
		rte_errno = EEXIST;
		return NULL;
	}
	/* Prepare shared data between primary and secondary process.
*/ 957 mlx5_prepare_shared_data(); 958 DRV_LOG(DEBUG, "naming Ethernet device \"%s\"", name); 959 if (rte_eal_process_type() == RTE_PROC_SECONDARY) { 960 eth_dev = rte_eth_dev_attach_secondary(name); 961 if (eth_dev == NULL) { 962 DRV_LOG(ERR, "can not attach rte ethdev"); 963 rte_errno = ENOMEM; 964 return NULL; 965 } 966 eth_dev->device = dpdk_dev; 967 eth_dev->dev_ops = &mlx5_dev_sec_ops; 968 err = mlx5_uar_init_secondary(eth_dev); 969 if (err) 970 return NULL; 971 /* Receive command fd from primary process */ 972 err = mlx5_socket_connect(eth_dev); 973 if (err < 0) 974 return NULL; 975 /* Remap UAR for Tx queues. */ 976 err = mlx5_tx_uar_remap(eth_dev, err); 977 if (err) 978 return NULL; 979 /* 980 * Ethdev pointer is still required as input since 981 * the primary device is not accessible from the 982 * secondary process. 983 */ 984 eth_dev->rx_pkt_burst = mlx5_select_rx_function(eth_dev); 985 eth_dev->tx_pkt_burst = mlx5_select_tx_function(eth_dev); 986 return eth_dev; 987 } 988 sh = mlx5_alloc_shared_ibctx(spawn); 989 if (!sh) 990 return NULL; 991 config.devx = sh->devx; 992 #ifdef HAVE_IBV_MLX5_MOD_SWP 993 dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_SWP; 994 #endif 995 /* 996 * Multi-packet send is supported by ConnectX-4 Lx PF as well 997 * as all ConnectX-5 devices. 998 */ 999 #ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT 1000 dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS; 1001 #endif 1002 #ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT 1003 dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_STRIDING_RQ; 1004 #endif 1005 mlx5_glue->dv_query_device(sh->ctx, &dv_attr); 1006 if (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED) { 1007 if (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW) { 1008 DRV_LOG(DEBUG, "enhanced MPW is supported"); 1009 mps = MLX5_MPW_ENHANCED; 1010 } else { 1011 DRV_LOG(DEBUG, "MPW is supported"); 1012 mps = MLX5_MPW; 1013 } 1014 } else { 1015 DRV_LOG(DEBUG, "MPW isn't supported"); 1016 mps = MLX5_MPW_DISABLED; 1017 } 1018 #ifdef HAVE_IBV_MLX5_MOD_SWP 1019 if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_SWP) 1020 swp = dv_attr.sw_parsing_caps.sw_parsing_offloads; 1021 DRV_LOG(DEBUG, "SWP support: %u", swp); 1022 #endif 1023 config.swp = !!swp; 1024 #ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT 1025 if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_STRIDING_RQ) { 1026 struct mlx5dv_striding_rq_caps mprq_caps = 1027 dv_attr.striding_rq_caps; 1028 1029 DRV_LOG(DEBUG, "\tmin_single_stride_log_num_of_bytes: %d", 1030 mprq_caps.min_single_stride_log_num_of_bytes); 1031 DRV_LOG(DEBUG, "\tmax_single_stride_log_num_of_bytes: %d", 1032 mprq_caps.max_single_stride_log_num_of_bytes); 1033 DRV_LOG(DEBUG, "\tmin_single_wqe_log_num_of_strides: %d", 1034 mprq_caps.min_single_wqe_log_num_of_strides); 1035 DRV_LOG(DEBUG, "\tmax_single_wqe_log_num_of_strides: %d", 1036 mprq_caps.max_single_wqe_log_num_of_strides); 1037 DRV_LOG(DEBUG, "\tsupported_qpts: %d", 1038 mprq_caps.supported_qpts); 1039 DRV_LOG(DEBUG, "device supports Multi-Packet RQ"); 1040 mprq = 1; 1041 mprq_min_stride_size_n = 1042 mprq_caps.min_single_stride_log_num_of_bytes; 1043 mprq_max_stride_size_n = 1044 mprq_caps.max_single_stride_log_num_of_bytes; 1045 mprq_min_stride_num_n = 1046 mprq_caps.min_single_wqe_log_num_of_strides; 1047 mprq_max_stride_num_n = 1048 mprq_caps.max_single_wqe_log_num_of_strides; 1049 config.mprq.stride_num_n = RTE_MAX(MLX5_MPRQ_STRIDE_NUM_N, 1050 mprq_min_stride_num_n); 1051 } 1052 #endif 1053 if (RTE_CACHE_LINE_SIZE == 128 && 1054 !(dv_attr.flags & MLX5DV_CONTEXT_FLAGS_CQE_128B_COMP)) 1055 cqe_comp = 0; 1056 
	else
		cqe_comp = 1;
	config.cqe_comp = cqe_comp;
#ifdef HAVE_IBV_MLX5_MOD_CQE_128B_PAD
	/* Whether device supports 128B Rx CQE padding. */
	cqe_pad = RTE_CACHE_LINE_SIZE == 128 &&
		  (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_CQE_128B_PAD);
#endif
#ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT
	if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS) {
		tunnel_en = ((dv_attr.tunnel_offloads_caps &
			      MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_VXLAN) &&
			     (dv_attr.tunnel_offloads_caps &
			      MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_GRE));
	}
	DRV_LOG(DEBUG, "tunnel offloading is %ssupported",
		tunnel_en ? "" : "not ");
#else
	DRV_LOG(WARNING,
		"tunnel offloading disabled due to old OFED/rdma-core version");
#endif
	config.tunnel_en = tunnel_en;
#ifdef HAVE_IBV_DEVICE_MPLS_SUPPORT
	mpls_en = ((dv_attr.tunnel_offloads_caps &
		    MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_CW_MPLS_OVER_GRE) &&
		   (dv_attr.tunnel_offloads_caps &
		    MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_CW_MPLS_OVER_UDP));
	DRV_LOG(DEBUG, "MPLS over GRE/UDP tunnel offloading is %ssupported",
		mpls_en ? "" : "not ");
#else
	DRV_LOG(WARNING, "MPLS over GRE/UDP tunnel offloading disabled due to"
		" old OFED/rdma-core version or firmware configuration");
#endif
	config.mpls_en = mpls_en;
	/* Check port status. */
	err = mlx5_glue->query_port(sh->ctx, spawn->ibv_port, &port_attr);
	if (err) {
		DRV_LOG(ERR, "port query failed: %s", strerror(err));
		goto error;
	}
	if (port_attr.link_layer != IBV_LINK_LAYER_ETHERNET) {
		DRV_LOG(ERR, "port is not configured in Ethernet mode");
		err = EINVAL;
		goto error;
	}
	if (port_attr.state != IBV_PORT_ACTIVE)
		DRV_LOG(DEBUG, "port is not active: \"%s\" (%d)",
			mlx5_glue->port_state_str(port_attr.state),
			port_attr.state);
	/* Allocate private eth device data. */
	priv = rte_zmalloc("ethdev private structure",
			   sizeof(*priv),
			   RTE_CACHE_LINE_SIZE);
	if (priv == NULL) {
		DRV_LOG(ERR, "priv allocation failure");
		err = ENOMEM;
		goto error;
	}
	priv->sh = sh;
	priv->ibv_port = spawn->ibv_port;
	priv->mtu = ETHER_MTU;
#ifndef RTE_ARCH_64
	/* Initialize UAR access locks for 32bit implementations. */
	rte_spinlock_init(&priv->uar_lock_cq);
	for (i = 0; i < MLX5_UAR_PAGE_NUM_MAX; i++)
		rte_spinlock_init(&priv->uar_lock[i]);
#endif
	/* Some internal functions rely on Netlink sockets, open them now. */
	priv->nl_socket_rdma = mlx5_nl_init(NETLINK_RDMA);
	priv->nl_socket_route = mlx5_nl_init(NETLINK_ROUTE);
	priv->nl_sn = 0;
	priv->representor = !!switch_info->representor;
	priv->master = !!switch_info->master;
	priv->domain_id = RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID;
	/*
	 * Currently we support single E-Switch per PF configurations
	 * only and the vport_id field contains the vport index for the
	 * associated VF, which is deduced from the representor port name.
	 * For example, if IB device port 10 has the attached network
	 * device eth0 with port name attribute pf0vf2, we can deduce
	 * the VF number as 2 and set the vport index to 3 (2 + 1).
	 * This assignment scheme should be changed if multiple E-Switch
	 * instances per PF configurations or/and PCI subfunctions are
	 * added.
	 */
	priv->vport_id = switch_info->representor ?
1142 switch_info->port_name + 1 : -1; 1143 /* representor_id field keeps the unmodified port/VF index. */ 1144 priv->representor_id = switch_info->representor ? 1145 switch_info->port_name : -1; 1146 /* 1147 * Look for sibling devices in order to reuse their switch domain 1148 * if any, otherwise allocate one. 1149 */ 1150 i = mlx5_dev_to_port_id(dpdk_dev, NULL, 0); 1151 if (i > 0) { 1152 uint16_t port_id[i]; 1153 1154 i = RTE_MIN(mlx5_dev_to_port_id(dpdk_dev, port_id, i), i); 1155 while (i--) { 1156 const struct mlx5_priv *opriv = 1157 rte_eth_devices[port_id[i]].data->dev_private; 1158 1159 if (!opriv || 1160 opriv->domain_id == 1161 RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID) 1162 continue; 1163 priv->domain_id = opriv->domain_id; 1164 break; 1165 } 1166 } 1167 if (priv->domain_id == RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID) { 1168 err = rte_eth_switch_domain_alloc(&priv->domain_id); 1169 if (err) { 1170 err = rte_errno; 1171 DRV_LOG(ERR, "unable to allocate switch domain: %s", 1172 strerror(rte_errno)); 1173 goto error; 1174 } 1175 own_domain_id = 1; 1176 } 1177 err = mlx5_args(&config, dpdk_dev->devargs); 1178 if (err) { 1179 err = rte_errno; 1180 DRV_LOG(ERR, "failed to process device arguments: %s", 1181 strerror(rte_errno)); 1182 goto error; 1183 } 1184 config.hw_csum = !!(sh->device_attr.device_cap_flags_ex & 1185 IBV_DEVICE_RAW_IP_CSUM); 1186 DRV_LOG(DEBUG, "checksum offloading is %ssupported", 1187 (config.hw_csum ? "" : "not ")); 1188 #if !defined(HAVE_IBV_DEVICE_COUNTERS_SET_V42) && \ 1189 !defined(HAVE_IBV_DEVICE_COUNTERS_SET_V45) 1190 DRV_LOG(DEBUG, "counters are not supported"); 1191 #endif 1192 #ifndef HAVE_IBV_FLOW_DV_SUPPORT 1193 if (config.dv_flow_en) { 1194 DRV_LOG(WARNING, "DV flow is not supported"); 1195 config.dv_flow_en = 0; 1196 } 1197 #endif 1198 config.ind_table_max_size = 1199 sh->device_attr.rss_caps.max_rwq_indirection_table_size; 1200 /* 1201 * Remove this check once DPDK supports larger/variable 1202 * indirection tables. 1203 */ 1204 if (config.ind_table_max_size > (unsigned int)ETH_RSS_RETA_SIZE_512) 1205 config.ind_table_max_size = ETH_RSS_RETA_SIZE_512; 1206 DRV_LOG(DEBUG, "maximum Rx indirection table size is %u", 1207 config.ind_table_max_size); 1208 config.hw_vlan_strip = !!(sh->device_attr.raw_packet_caps & 1209 IBV_RAW_PACKET_CAP_CVLAN_STRIPPING); 1210 DRV_LOG(DEBUG, "VLAN stripping is %ssupported", 1211 (config.hw_vlan_strip ? "" : "not ")); 1212 config.hw_fcs_strip = !!(sh->device_attr.raw_packet_caps & 1213 IBV_RAW_PACKET_CAP_SCATTER_FCS); 1214 DRV_LOG(DEBUG, "FCS stripping configuration is %ssupported", 1215 (config.hw_fcs_strip ? "" : "not ")); 1216 #if defined(HAVE_IBV_WQ_FLAG_RX_END_PADDING) 1217 hw_padding = !!sh->device_attr.rx_pad_end_addr_align; 1218 #elif defined(HAVE_IBV_WQ_FLAGS_PCI_WRITE_END_PADDING) 1219 hw_padding = !!(sh->device_attr.device_cap_flags_ex & 1220 IBV_DEVICE_PCI_WRITE_END_PADDING); 1221 #endif 1222 if (config.hw_padding && !hw_padding) { 1223 DRV_LOG(DEBUG, "Rx end alignment padding isn't supported"); 1224 config.hw_padding = 0; 1225 } else if (config.hw_padding) { 1226 DRV_LOG(DEBUG, "Rx end alignment padding is enabled"); 1227 } 1228 config.tso = (sh->device_attr.tso_caps.max_tso > 0 && 1229 (sh->device_attr.tso_caps.supported_qpts & 1230 (1 << IBV_QPT_RAW_PACKET))); 1231 if (config.tso) 1232 config.tso_max_payload_sz = sh->device_attr.tso_caps.max_tso; 1233 /* 1234 * MPW is disabled by default, while the Enhanced MPW is enabled 1235 * by default. 
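	 * The txq_mpw_en devarg overrides this: a non-zero value requests
	 * whichever MPW flavor the device advertises (mps), while zero
	 * forces MLX5_MPW_DISABLED.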
1236 */ 1237 if (config.mps == MLX5_ARG_UNSET) 1238 config.mps = (mps == MLX5_MPW_ENHANCED) ? MLX5_MPW_ENHANCED : 1239 MLX5_MPW_DISABLED; 1240 else 1241 config.mps = config.mps ? mps : MLX5_MPW_DISABLED; 1242 DRV_LOG(INFO, "%sMPS is %s", 1243 config.mps == MLX5_MPW_ENHANCED ? "enhanced " : "", 1244 config.mps != MLX5_MPW_DISABLED ? "enabled" : "disabled"); 1245 if (config.cqe_comp && !cqe_comp) { 1246 DRV_LOG(WARNING, "Rx CQE compression isn't supported"); 1247 config.cqe_comp = 0; 1248 } 1249 if (config.cqe_pad && !cqe_pad) { 1250 DRV_LOG(WARNING, "Rx CQE padding isn't supported"); 1251 config.cqe_pad = 0; 1252 } else if (config.cqe_pad) { 1253 DRV_LOG(INFO, "Rx CQE padding is enabled"); 1254 } 1255 if (config.mprq.enabled && mprq) { 1256 if (config.mprq.stride_num_n > mprq_max_stride_num_n || 1257 config.mprq.stride_num_n < mprq_min_stride_num_n) { 1258 config.mprq.stride_num_n = 1259 RTE_MAX(MLX5_MPRQ_STRIDE_NUM_N, 1260 mprq_min_stride_num_n); 1261 DRV_LOG(WARNING, 1262 "the number of strides" 1263 " for Multi-Packet RQ is out of range," 1264 " setting default value (%u)", 1265 1 << config.mprq.stride_num_n); 1266 } 1267 config.mprq.min_stride_size_n = mprq_min_stride_size_n; 1268 config.mprq.max_stride_size_n = mprq_max_stride_size_n; 1269 } else if (config.mprq.enabled && !mprq) { 1270 DRV_LOG(WARNING, "Multi-Packet RQ isn't supported"); 1271 config.mprq.enabled = 0; 1272 } 1273 eth_dev = rte_eth_dev_allocate(name); 1274 if (eth_dev == NULL) { 1275 DRV_LOG(ERR, "can not allocate rte ethdev"); 1276 err = ENOMEM; 1277 goto error; 1278 } 1279 /* Flag to call rte_eth_dev_release_port() in rte_eth_dev_close(). */ 1280 eth_dev->data->dev_flags |= RTE_ETH_DEV_CLOSE_REMOVE; 1281 if (priv->representor) { 1282 eth_dev->data->dev_flags |= RTE_ETH_DEV_REPRESENTOR; 1283 eth_dev->data->representor_id = priv->representor_id; 1284 } 1285 eth_dev->data->dev_private = priv; 1286 priv->dev_data = eth_dev->data; 1287 eth_dev->data->mac_addrs = priv->mac; 1288 eth_dev->device = dpdk_dev; 1289 err = mlx5_uar_init_primary(eth_dev); 1290 if (err) { 1291 err = rte_errno; 1292 goto error; 1293 } 1294 /* Configure the first MAC address by default. */ 1295 if (mlx5_get_mac(eth_dev, &mac.addr_bytes)) { 1296 DRV_LOG(ERR, 1297 "port %u cannot get MAC address, is mlx5_en" 1298 " loaded? (errno: %s)", 1299 eth_dev->data->port_id, strerror(rte_errno)); 1300 err = ENODEV; 1301 goto error; 1302 } 1303 DRV_LOG(INFO, 1304 "port %u MAC address is %02x:%02x:%02x:%02x:%02x:%02x", 1305 eth_dev->data->port_id, 1306 mac.addr_bytes[0], mac.addr_bytes[1], 1307 mac.addr_bytes[2], mac.addr_bytes[3], 1308 mac.addr_bytes[4], mac.addr_bytes[5]); 1309 #ifndef NDEBUG 1310 { 1311 char ifname[IF_NAMESIZE]; 1312 1313 if (mlx5_get_ifname(eth_dev, &ifname) == 0) 1314 DRV_LOG(DEBUG, "port %u ifname is \"%s\"", 1315 eth_dev->data->port_id, ifname); 1316 else 1317 DRV_LOG(DEBUG, "port %u ifname is unknown", 1318 eth_dev->data->port_id); 1319 } 1320 #endif 1321 /* Get actual MTU if possible. */ 1322 err = mlx5_get_mtu(eth_dev, &priv->mtu); 1323 if (err) { 1324 err = rte_errno; 1325 goto error; 1326 } 1327 DRV_LOG(DEBUG, "port %u MTU is %u", eth_dev->data->port_id, 1328 priv->mtu); 1329 /* Initialize burst functions to prevent crashes before link-up. */ 1330 eth_dev->rx_pkt_burst = removed_rx_burst; 1331 eth_dev->tx_pkt_burst = removed_tx_burst; 1332 eth_dev->dev_ops = &mlx5_dev_ops; 1333 /* Register MAC address. 
 */
	claim_zero(mlx5_mac_addr_add(eth_dev, &mac, 0, 0));
	if (config.vf && config.vf_nl_en)
		mlx5_nl_mac_addr_sync(eth_dev);
	priv->tcf_context = mlx5_flow_tcf_context_create();
	if (!priv->tcf_context) {
		err = -rte_errno;
		DRV_LOG(WARNING,
			"flow rules relying on switch offloads will not be"
			" supported: cannot open libmnl socket: %s",
			strerror(rte_errno));
	} else {
		struct rte_flow_error error;
		unsigned int ifindex = mlx5_ifindex(eth_dev);

		if (!ifindex) {
			err = -rte_errno;
			error.message =
				"cannot retrieve network interface index";
		} else {
			err = mlx5_flow_tcf_init(priv->tcf_context,
						 ifindex, &error);
		}
		if (err) {
			DRV_LOG(WARNING,
				"flow rules relying on switch offloads will"
				" not be supported: %s: %s",
				error.message, strerror(rte_errno));
			mlx5_flow_tcf_context_destroy(priv->tcf_context);
			priv->tcf_context = NULL;
		}
	}
	TAILQ_INIT(&priv->flows);
	TAILQ_INIT(&priv->ctrl_flows);
	/* Hint libmlx5 to use PMD allocator for data plane resources. */
	struct mlx5dv_ctx_allocators alctr = {
		.alloc = &mlx5_alloc_verbs_buf,
		.free = &mlx5_free_verbs_buf,
		.data = priv,
	};
	mlx5_glue->dv_set_context_attr(sh->ctx,
				       MLX5DV_CTX_ATTR_BUF_ALLOCATORS,
				       (void *)((uintptr_t)&alctr));
	/* Bring Ethernet device up. */
	DRV_LOG(DEBUG, "port %u forcing Ethernet interface up",
		eth_dev->data->port_id);
	mlx5_set_link_up(eth_dev);
	/*
	 * Even though the interrupt handler is not installed yet,
	 * interrupts will still trigger on the async_fd from the
	 * Verbs context returned by ibv_open_device().
	 */
	mlx5_link_update(eth_dev, 0);
	/* Store device configuration on private structure. */
	priv->config = config;
	/* Supported Verbs flow priority number detection. */
	err = mlx5_flow_discover_priorities(eth_dev);
	if (err < 0) {
		err = -err;
		goto error;
	}
	priv->config.flow_prio = err;
	/*
	 * Once the device is added to the list of memory event
	 * callbacks, its global MR cache table cannot be expanded
	 * on the fly because of a deadlock. If it overflows, lookup
	 * should be done by searching the MR list linearly, which is slow.
	 */
	err = mlx5_mr_btree_init(&priv->mr.cache,
				 MLX5_MR_BTREE_CACHE_N * 2,
				 eth_dev->device->numa_node);
	if (err) {
		err = rte_errno;
		goto error;
	}
	/* Add device to memory callback list.
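	 * The list lives in the shared memzone and is walked by the
	 * registered memory event callback (mlx5_mr_mem_event_cb()),
	 * hence the write lock taken around the insertion below.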
*/ 1409 rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock); 1410 LIST_INSERT_HEAD(&mlx5_shared_data->mem_event_cb_list, 1411 priv, mem_event_cb); 1412 rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock); 1413 return eth_dev; 1414 error: 1415 if (priv) { 1416 if (priv->nl_socket_route >= 0) 1417 close(priv->nl_socket_route); 1418 if (priv->nl_socket_rdma >= 0) 1419 close(priv->nl_socket_rdma); 1420 if (priv->tcf_context) 1421 mlx5_flow_tcf_context_destroy(priv->tcf_context); 1422 if (own_domain_id) 1423 claim_zero(rte_eth_switch_domain_free(priv->domain_id)); 1424 rte_free(priv); 1425 if (eth_dev != NULL) 1426 eth_dev->data->dev_private = NULL; 1427 } 1428 if (eth_dev != NULL) { 1429 /* mac_addrs must not be freed alone because part of dev_private */ 1430 eth_dev->data->mac_addrs = NULL; 1431 rte_eth_dev_release_port(eth_dev); 1432 } 1433 if (sh) 1434 mlx5_free_shared_ibctx(sh); 1435 assert(err > 0); 1436 rte_errno = err; 1437 return NULL; 1438 } 1439 1440 /** 1441 * Comparison callback to sort device data. 1442 * 1443 * This is meant to be used with qsort(). 1444 * 1445 * @param a[in] 1446 * Pointer to pointer to first data object. 1447 * @param b[in] 1448 * Pointer to pointer to second data object. 1449 * 1450 * @return 1451 * 0 if both objects are equal, less than 0 if the first argument is less 1452 * than the second, greater than 0 otherwise. 1453 */ 1454 static int 1455 mlx5_dev_spawn_data_cmp(const void *a, const void *b) 1456 { 1457 const struct mlx5_switch_info *si_a = 1458 &((const struct mlx5_dev_spawn_data *)a)->info; 1459 const struct mlx5_switch_info *si_b = 1460 &((const struct mlx5_dev_spawn_data *)b)->info; 1461 int ret; 1462 1463 /* Master device first. */ 1464 ret = si_b->master - si_a->master; 1465 if (ret) 1466 return ret; 1467 /* Then representor devices. */ 1468 ret = si_b->representor - si_a->representor; 1469 if (ret) 1470 return ret; 1471 /* Unidentified devices come last in no specific order. */ 1472 if (!si_a->representor) 1473 return 0; 1474 /* Order representors by name. */ 1475 return si_a->port_name - si_b->port_name; 1476 } 1477 1478 /** 1479 * DPDK callback to register a PCI device. 1480 * 1481 * This function spawns Ethernet devices out of a given PCI device. 1482 * 1483 * @param[in] pci_drv 1484 * PCI driver structure (mlx5_driver). 1485 * @param[in] pci_dev 1486 * PCI device information. 1487 * 1488 * @return 1489 * 0 on success, a negative errno value otherwise and rte_errno is set. 1490 */ 1491 static int 1492 mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused, 1493 struct rte_pci_device *pci_dev) 1494 { 1495 struct ibv_device **ibv_list; 1496 /* 1497 * Number of found IB Devices matching with requested PCI BDF. 1498 * nd != 1 means there are multiple IB devices over the same 1499 * PCI device and we have representors and master. 1500 */ 1501 unsigned int nd = 0; 1502 /* 1503 * Number of found IB device Ports. nd = 1 and np = 1..n means 1504 * we have the single multiport IB device, and there may be 1505 * representors attached to some of found ports. 1506 */ 1507 unsigned int np = 0; 1508 /* 1509 * Number of DPDK ethernet devices to Spawn - either over 1510 * multiple IB devices or multiple ports of single IB device. 1511 * Actually this is the number of iterations to spawn. 1512 */ 1513 unsigned int ns = 0; 1514 struct mlx5_dev_config dev_config; 1515 int ret; 1516 1517 assert(pci_drv == &mlx5_driver); 1518 errno = 0; 1519 ibv_list = mlx5_glue->get_device_list(&ret); 1520 if (!ibv_list) { 1521 rte_errno = errno ? 
			     errno : ENOSYS;
		DRV_LOG(ERR, "cannot list devices, is ib_uverbs loaded?");
		return -rte_errno;
	}
	/*
	 * First scan the list of all Infiniband devices to find
	 * matching ones, gathering into the list.
	 */
	struct ibv_device *ibv_match[ret + 1];
	int nl_route = -1;
	int nl_rdma = -1;
	unsigned int i;

	while (ret-- > 0) {
		struct rte_pci_addr pci_addr;

		DRV_LOG(DEBUG, "checking device \"%s\"", ibv_list[ret]->name);
		if (mlx5_ibv_device_to_pci_addr(ibv_list[ret], &pci_addr))
			continue;
		if (pci_dev->addr.domain != pci_addr.domain ||
		    pci_dev->addr.bus != pci_addr.bus ||
		    pci_dev->addr.devid != pci_addr.devid ||
		    pci_dev->addr.function != pci_addr.function)
			continue;
		DRV_LOG(INFO, "PCI information matches for device \"%s\"",
			ibv_list[ret]->name);
		ibv_match[nd++] = ibv_list[ret];
	}
	ibv_match[nd] = NULL;
	if (!nd) {
		/* No device matches, just complain and bail out. */
		mlx5_glue->free_device_list(ibv_list);
		DRV_LOG(WARNING,
			"no Verbs device matches PCI device " PCI_PRI_FMT ","
			" are kernel drivers loaded?",
			pci_dev->addr.domain, pci_dev->addr.bus,
			pci_dev->addr.devid, pci_dev->addr.function);
		rte_errno = ENOENT;
		ret = -rte_errno;
		return ret;
	}
	nl_route = mlx5_nl_init(NETLINK_ROUTE);
	nl_rdma = mlx5_nl_init(NETLINK_RDMA);
	if (nd == 1) {
		/*
		 * The single matching device found may have multiple ports.
		 * Each port may be a representor, so we have to check the
		 * port number and the representors' existence.
		 */
		if (nl_rdma >= 0)
			np = mlx5_nl_portnum(nl_rdma, ibv_match[0]->name);
		if (!np)
			DRV_LOG(WARNING, "can not get IB device \"%s\""
				" ports number", ibv_match[0]->name);
	}
	/*
	 * Now we can determine the maximal
	 * number of devices to be spawned.
	 */
	struct mlx5_dev_spawn_data list[np ? np : nd];

	if (np > 1) {
		/*
		 * A single IB device with multiple ports was found; it may
		 * be the E-Switch master device together with representors.
		 * We have to perform the identification through the ports.
		 */
		assert(nl_rdma >= 0);
		assert(ns == 0);
		assert(nd == 1);
		for (i = 1; i <= np; ++i) {
			list[ns].max_port = np;
			list[ns].ibv_port = i;
			list[ns].ibv_dev = ibv_match[0];
			list[ns].eth_dev = NULL;
			list[ns].ifindex = mlx5_nl_ifindex
					(nl_rdma, list[ns].ibv_dev->name, i);
			if (!list[ns].ifindex) {
				/*
				 * No network interface index found for the
				 * specified port, it means there is no
				 * representor on this port. It's OK,
				 * there can be disabled ports, for example
				 * if sriov_numvfs < sriov_totalvfs.
				 */
				continue;
			}
			ret = -1;
			if (nl_route >= 0)
				ret = mlx5_nl_switch_info
					       (nl_route,
						list[ns].ifindex,
						&list[ns].info);
			if (ret || (!list[ns].info.representor &&
				    !list[ns].info.master)) {
				/*
				 * We failed to recognize representors with
				 * Netlink, let's try to perform the task
				 * with sysfs.
				 */
				ret = mlx5_sysfs_switch_info
						(list[ns].ifindex,
						 &list[ns].info);
			}
			if (!ret && (list[ns].info.representor ^
				     list[ns].info.master))
				ns++;
		}
		if (!ns) {
			DRV_LOG(ERR,
				"unable to recognize master/representors"
				" on the IB device with multiple ports");
			rte_errno = ENOENT;
			ret = -rte_errno;
			goto exit;
		}
	} else {
		/*
		 * The existence of several matching entries (nd > 1) means
		 * port representors have been instantiated. No existing Verbs
		 * call nor sysfs entries can tell them apart, this can only
		 * be done through Netlink calls assuming kernel drivers are
		 * recent enough to support them.
		 *
		 * In the event of identification failure through Netlink,
		 * try again through sysfs, then:
		 *
		 * 1. A single IB device matches (nd == 1) with single
		 *    port (np=0/1) and is not a representor, assume
		 *    no switch support.
		 *
		 * 2. Otherwise no safe assumptions can be made;
		 *    complain louder and bail out.
		 */
		np = 1;
		for (i = 0; i != nd; ++i) {
			memset(&list[ns].info, 0, sizeof(list[ns].info));
			list[ns].max_port = 1;
			list[ns].ibv_port = 1;
			list[ns].ibv_dev = ibv_match[i];
			list[ns].eth_dev = NULL;
			list[ns].ifindex = 0;
			if (nl_rdma >= 0)
				list[ns].ifindex = mlx5_nl_ifindex
					(nl_rdma, list[ns].ibv_dev->name, 1);
			if (!list[ns].ifindex) {
				/*
				 * No network interface index found for the
				 * specified device, it means it is not a
				 * representor/master.
				 */
				continue;
			}
			ret = -1;
			if (nl_route >= 0)
				ret = mlx5_nl_switch_info
					       (nl_route,
						list[ns].ifindex,
						&list[ns].info);
			if (ret || (!list[ns].info.representor &&
				    !list[ns].info.master)) {
				/*
				 * We failed to recognize representors with
				 * Netlink, let's try to perform the task
				 * with sysfs.
				 */
				ret = mlx5_sysfs_switch_info
						(list[ns].ifindex,
						 &list[ns].info);
			}
			if (!ret && (list[ns].info.representor ^
				     list[ns].info.master)) {
				ns++;
			} else if ((nd == 1) &&
				   !list[ns].info.representor &&
				   !list[ns].info.master) {
				/*
				 * Single IB device with
				 * one physical port and
				 * attached network device.
				 * Maybe SR-IOV is not enabled
				 * or there are no representors.
				 */
				DRV_LOG(INFO, "no E-Switch support detected");
				ns++;
				break;
			}
		}
		if (!ns) {
			DRV_LOG(ERR,
				"unable to recognize master/representors"
				" on the multiple IB devices");
			rte_errno = ENOENT;
			ret = -rte_errno;
			goto exit;
		}
	}
	assert(ns);
	/*
	 * Sort the list to probe devices in natural order for users'
	 * convenience (i.e. master first, then representors from lowest
	 * to highest ID).
	 */
	qsort(list, ns, sizeof(*list), mlx5_dev_spawn_data_cmp);
	/* Default configuration. */
	dev_config = (struct mlx5_dev_config){
		.hw_padding = 0,
		.mps = MLX5_ARG_UNSET,
		.tx_vec_en = 1,
		.rx_vec_en = 1,
		.txq_inline = MLX5_ARG_UNSET,
		.txqs_inline = MLX5_ARG_UNSET,
		.txqs_vec = MLX5_ARG_UNSET,
		.inline_max_packet_sz = MLX5_ARG_UNSET,
		.vf_nl_en = 1,
		.mprq = {
			.enabled = 0, /* Disabled by default. */
			.stride_num_n = MLX5_MPRQ_STRIDE_NUM_N,
			.max_memcpy_len = MLX5_MPRQ_MEMCPY_DEFAULT_LEN,
			.min_rxqs_num = MLX5_MPRQ_MIN_RXQS,
		},
	};
	/* Device specific configuration.
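	 * BlueField gets its own default threshold for vectorized Tx
	 * queues, while the VF device IDs mark the port as a VF so that
	 * Netlink-based MAC address management can be used later on.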
	/* Device specific configuration. */
	switch (pci_dev->id.device_id) {
	case PCI_DEVICE_ID_MELLANOX_CONNECTX5BF:
		dev_config.txqs_vec = MLX5_VPMD_MAX_TXQS_BLUEFIELD;
		break;
	case PCI_DEVICE_ID_MELLANOX_CONNECTX4VF:
	case PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF:
	case PCI_DEVICE_ID_MELLANOX_CONNECTX5VF:
	case PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF:
		dev_config.vf = 1;
		break;
	default:
		break;
	}
	/* Set architecture-dependent default value if unset. */
	if (dev_config.txqs_vec == MLX5_ARG_UNSET)
		dev_config.txqs_vec = MLX5_VPMD_MAX_TXQS;
	for (i = 0; i != ns; ++i) {
		uint32_t restore;

		list[i].eth_dev = mlx5_dev_spawn(&pci_dev->device,
						 &list[i],
						 dev_config);
		if (!list[i].eth_dev) {
			if (rte_errno != EBUSY && rte_errno != EEXIST)
				break;
			/* Device is disabled or already spawned. Ignore it. */
			continue;
		}
		restore = list[i].eth_dev->data->dev_flags;
		rte_eth_copy_pci_info(list[i].eth_dev, pci_dev);
		/* Restore non-PCI flags cleared by the above call. */
		list[i].eth_dev->data->dev_flags |= restore;
		rte_eth_dev_probing_finish(list[i].eth_dev);
	}
	if (i != ns) {
		DRV_LOG(ERR,
			"probe of PCI device " PCI_PRI_FMT " aborted after"
			" encountering an error: %s",
			pci_dev->addr.domain, pci_dev->addr.bus,
			pci_dev->addr.devid, pci_dev->addr.function,
			strerror(rte_errno));
		ret = -rte_errno;
		/* Roll back. */
		while (i--) {
			if (!list[i].eth_dev)
				continue;
			mlx5_dev_close(list[i].eth_dev);
			/* mac_addrs must not be freed, it is part of dev_private. */
			list[i].eth_dev->data->mac_addrs = NULL;
			claim_zero(rte_eth_dev_release_port(list[i].eth_dev));
		}
		/* Restore original error. */
		rte_errno = -ret;
	} else {
		ret = 0;
	}
exit:
	/*
	 * Do the routine cleanup:
	 * - close opened Netlink sockets
	 * - free the Infiniband device list
	 */
	if (nl_rdma >= 0)
		close(nl_rdma);
	if (nl_route >= 0)
		close(nl_route);
	assert(ibv_list);
	mlx5_glue->free_device_list(ibv_list);
	return ret;
}
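/*
 * Illustrative sketch (kept out of the build): a comparison callback
 * consistent with the ordering requested by the qsort() in mlx5_pci_probe()
 * above -- the master entry first, then representors by ascending ID. The
 * driver's actual callback, mlx5_dev_spawn_data_cmp(), is defined earlier in
 * this file; the representor ID field used below (info.port_name) is an
 * assumption made for the sake of the example.
 */
#if 0
static int
example_spawn_data_cmp(const void *a, const void *b)
{
	const struct mlx5_dev_spawn_data *da = a;
	const struct mlx5_dev_spawn_data *db = b;

	/* Master device sorts before everything else. */
	if (da->info.master != db->info.master)
		return db->info.master - da->info.master;
	/* Representors sort after the master, by ascending ID. */
	return da->info.port_name - db->info.port_name;
}
#endif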

/**
 * DPDK callback to remove a PCI device.
 *
 * This function removes all Ethernet devices belonging to a given PCI device.
 *
 * @param[in] pci_dev
 *   Pointer to the PCI device.
 *
 * @return
 *   0 on success, the function cannot fail.
 */
static int
mlx5_pci_remove(struct rte_pci_device *pci_dev)
{
	uint16_t port_id;
	struct rte_eth_dev *port;

	for (port_id = 0; port_id < RTE_MAX_ETHPORTS; port_id++) {
		port = &rte_eth_devices[port_id];
		if (port->state != RTE_ETH_DEV_UNUSED &&
		    port->device == &pci_dev->device)
			rte_eth_dev_close(port_id);
	}
	return 0;
}

static const struct rte_pci_id mlx5_pci_id_map[] = {
	{
		RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
			       PCI_DEVICE_ID_MELLANOX_CONNECTX4)
	},
	{
		RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
			       PCI_DEVICE_ID_MELLANOX_CONNECTX4VF)
	},
	{
		RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
			       PCI_DEVICE_ID_MELLANOX_CONNECTX4LX)
	},
	{
		RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
			       PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF)
	},
	{
		RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
			       PCI_DEVICE_ID_MELLANOX_CONNECTX5)
	},
	{
		RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
			       PCI_DEVICE_ID_MELLANOX_CONNECTX5VF)
	},
	{
		RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
			       PCI_DEVICE_ID_MELLANOX_CONNECTX5EX)
	},
	{
		RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
			       PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF)
	},
	{
		RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
			       PCI_DEVICE_ID_MELLANOX_CONNECTX5BF)
	},
	{
		RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
			       PCI_DEVICE_ID_MELLANOX_CONNECTX5BFVF)
	},
	{
		RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
			       PCI_DEVICE_ID_MELLANOX_CONNECTX6)
	},
	{
		RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
			       PCI_DEVICE_ID_MELLANOX_CONNECTX6VF)
	},
	{
		.vendor_id = 0
	}
};

static struct rte_pci_driver mlx5_driver = {
	.driver = {
		.name = MLX5_DRIVER_NAME
	},
	.id_table = mlx5_pci_id_map,
	.probe = mlx5_pci_probe,
	.remove = mlx5_pci_remove,
	.dma_map = mlx5_dma_map,
	.dma_unmap = mlx5_dma_unmap,
	.drv_flags = (RTE_PCI_DRV_INTR_LSC | RTE_PCI_DRV_INTR_RMV |
		      RTE_PCI_DRV_PROBE_AGAIN),
};

#ifdef RTE_IBVERBS_LINK_DLOPEN

/**
 * Suffix RTE_EAL_PMD_PATH with "-glue".
 *
 * This function performs a sanity check on RTE_EAL_PMD_PATH before
 * suffixing its last component.
 *
 * @param[out] buf
 *   Output buffer, should be large enough otherwise NULL is returned.
 * @param size
 *   Size of @p buf.
 *
 * @return
 *   Pointer to @p buf or NULL in case the suffix cannot be appended.
 */
static char *
mlx5_glue_path(char *buf, size_t size)
{
	static const char *const bad[] = { "/", ".", "..", NULL };
	const char *path = RTE_EAL_PMD_PATH;
	size_t len = strlen(path);
	size_t off;
	int i;

	while (len && path[len - 1] == '/')
		--len;
	for (off = len; off && path[off - 1] != '/'; --off)
		;
	for (i = 0; bad[i]; ++i)
		if (!strncmp(path + off, bad[i], (int)(len - off)))
			goto error;
	i = snprintf(buf, size, "%.*s-glue", (int)len, path);
	if (i == -1 || (size_t)i >= size)
		goto error;
	return buf;
error:
	DRV_LOG(ERR,
		"unable to append \"-glue\" to last component of"
		" RTE_EAL_PMD_PATH (\"" RTE_EAL_PMD_PATH "\"),"
		" please re-configure DPDK");
	return NULL;
}
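/*
 * Worked example (illustrative only): assuming RTE_EAL_PMD_PATH is
 * "/usr/local/lib/dpdk/pmds", mlx5_glue_path() strips trailing slashes,
 * checks that the last path component is not "/", "." or "..", and yields
 * "/usr/local/lib/dpdk/pmds-glue". A sketch of how a caller could size and
 * fill its buffer, mirroring mlx5_glue_init() below:
 */
#if 0
	char buf[sizeof(RTE_EAL_PMD_PATH) - 1 + sizeof("-glue")];

	if (mlx5_glue_path(buf, sizeof(buf)))
		DRV_LOG(DEBUG, "glue libraries are expected in \"%s\"", buf);
#endif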

/**
 * Initialization routine for run-time dependency on rdma-core.
 */
static int
mlx5_glue_init(void)
{
	char glue_path[sizeof(RTE_EAL_PMD_PATH) - 1 + sizeof("-glue")];
	const char *path[] = {
		/*
		 * A basic security check is necessary before trusting
		 * MLX5_GLUE_PATH, which may override RTE_EAL_PMD_PATH.
		 */
		(geteuid() == getuid() && getegid() == getgid() ?
		 getenv("MLX5_GLUE_PATH") : NULL),
		/*
		 * When RTE_EAL_PMD_PATH is set, use its glue-suffixed
		 * variant, otherwise let dlopen() look up libraries on its
		 * own.
		 */
		(*RTE_EAL_PMD_PATH ?
		 mlx5_glue_path(glue_path, sizeof(glue_path)) : ""),
	};
	unsigned int i = 0;
	void *handle = NULL;
	void **sym;
	const char *dlmsg;

	while (!handle && i != RTE_DIM(path)) {
		const char *end;
		size_t len;
		int ret;

		if (!path[i]) {
			++i;
			continue;
		}
		end = strpbrk(path[i], ":;");
		if (!end)
			end = path[i] + strlen(path[i]);
		len = end - path[i];
		ret = 0;
		do {
			char name[ret + 1];

			ret = snprintf(name, sizeof(name), "%.*s%s" MLX5_GLUE,
				       (int)len, path[i],
				       (!len || *(end - 1) == '/') ? "" : "/");
			if (ret == -1)
				break;
			if (sizeof(name) != (size_t)ret + 1)
				continue;
			DRV_LOG(DEBUG, "looking for rdma-core glue as \"%s\"",
				name);
			handle = dlopen(name, RTLD_LAZY);
			break;
		} while (1);
		path[i] = end + 1;
		if (!*end)
			++i;
	}
	if (!handle) {
		rte_errno = EINVAL;
		dlmsg = dlerror();
		if (dlmsg)
			DRV_LOG(WARNING, "cannot load glue library: %s", dlmsg);
		goto glue_error;
	}
	sym = dlsym(handle, "mlx5_glue");
	if (!sym || !*sym) {
		rte_errno = EINVAL;
		dlmsg = dlerror();
		if (dlmsg)
			DRV_LOG(ERR, "cannot resolve glue symbol: %s", dlmsg);
		goto glue_error;
	}
	mlx5_glue = *sym;
	return 0;
glue_error:
	if (handle)
		dlclose(handle);
	DRV_LOG(WARNING,
		"cannot initialize PMD due to missing run-time dependency on"
		" rdma-core libraries (libibverbs, libmlx5)");
	return -rte_errno;
}

#endif
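/*
 * Worked example of the search order used by mlx5_glue_init() above (only
 * relevant when built with RTE_IBVERBS_LINK_DLOPEN), under an assumed
 * environment: with MLX5_GLUE_PATH="/opt/rdma-glue:/usr/local/lib" and
 * matching real/effective UID and GID, the glue library named by MLX5_GLUE
 * is tried as "/opt/rdma-glue/<MLX5_GLUE>", then "/usr/local/lib/<MLX5_GLUE>",
 * then either in the RTE_EAL_PMD_PATH "-glue" directory (when RTE_EAL_PMD_PATH
 * is set) or through the default dlopen() search path. For instance (PCI
 * address is a placeholder):
 *
 *   MLX5_GLUE_PATH=/opt/rdma-glue ./testpmd -w 0000:03:00.0 -- -i
 */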

/**
 * Driver initialization routine.
 */
RTE_INIT(rte_mlx5_pmd_init)
{
	/* Initialize driver log type. */
	mlx5_logtype = rte_log_register("pmd.net.mlx5");
	if (mlx5_logtype >= 0)
		rte_log_set_level(mlx5_logtype, RTE_LOG_NOTICE);

	/* Build the static tables for Verbs conversion. */
	mlx5_set_ptype_table();
	mlx5_set_cksum_table();
	mlx5_set_swp_types_table();
	/*
	 * RDMAV_HUGEPAGES_SAFE tells ibv_fork_init() we intend to use
	 * huge pages. Calling ibv_fork_init() during init allows
	 * applications to use fork() safely for purposes other than
	 * using this PMD, which is not supported in forked processes.
	 */
	setenv("RDMAV_HUGEPAGES_SAFE", "1", 1);
	/* Match the size of Rx completion entry to the size of a cacheline. */
	if (RTE_CACHE_LINE_SIZE == 128)
		setenv("MLX5_CQE_SIZE", "128", 0);
	/*
	 * MLX5_DEVICE_FATAL_CLEANUP tells ibv_destroy functions to
	 * clean up all the Verbs resources even when the device was removed.
	 */
	setenv("MLX5_DEVICE_FATAL_CLEANUP", "1", 1);
#ifdef RTE_IBVERBS_LINK_DLOPEN
	if (mlx5_glue_init())
		return;
	assert(mlx5_glue);
#endif
#ifndef NDEBUG
	/* Glue structure must not contain any NULL pointers. */
	{
		unsigned int i;

		for (i = 0; i != sizeof(*mlx5_glue) / sizeof(void *); ++i)
			assert(((const void *const *)mlx5_glue)[i]);
	}
#endif
	if (strcmp(mlx5_glue->version, MLX5_GLUE_VERSION)) {
		DRV_LOG(ERR,
			"rdma-core glue \"%s\" mismatch: \"%s\" is required",
			mlx5_glue->version, MLX5_GLUE_VERSION);
		return;
	}
	mlx5_glue->fork_init();
	rte_pci_register(&mlx5_driver);
}

RTE_PMD_EXPORT_NAME(net_mlx5, __COUNTER__);
RTE_PMD_REGISTER_PCI_TABLE(net_mlx5, mlx5_pci_id_map);
RTE_PMD_REGISTER_KMOD_DEP(net_mlx5, "* ib_uverbs & mlx5_core & mlx5_ib");
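/*
 * Minimal usage sketch (illustrative only, not part of the driver): once this
 * PMD is registered, an application only needs to initialize the EAL; probing
 * of matching PCI devices, and therefore mlx5_pci_probe(), happens during the
 * PCI bus scan inside rte_eal_init(). The whitelist address is a placeholder.
 */
#if 0
#include <stdio.h>

#include <rte_eal.h>
#include <rte_ethdev.h>

int
main(int argc, char **argv)
{
	uint16_t port_id;

	/* e.g. run as: ./app -w 0000:03:00.0 */
	if (rte_eal_init(argc, argv) < 0)
		return -1;
	RTE_ETH_FOREACH_DEV(port_id)
		printf("port %u initialized\n", port_id);
	return 0;
}
#endif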