xref: /dpdk/drivers/net/mlx5/linux/mlx5_os.c (revision 679f46c7751fd5e7ff3b5039d28c06602e634223)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright 2015 6WIND S.A.
3  * Copyright 2020 Mellanox Technologies, Ltd
4  */
5 
6 #include <stddef.h>
7 #include <unistd.h>
8 #include <string.h>
9 #include <stdint.h>
10 #include <stdlib.h>
11 #include <errno.h>
12 #include <net/if.h>
13 #include <linux/rtnetlink.h>
14 #include <linux/sockios.h>
15 #include <linux/ethtool.h>
16 #include <fcntl.h>
17 
18 #include <rte_malloc.h>
19 #include <ethdev_driver.h>
20 #include <ethdev_pci.h>
21 #include <rte_pci.h>
22 #include <rte_bus_pci.h>
23 #include <rte_common.h>
24 #include <rte_kvargs.h>
25 #include <rte_rwlock.h>
26 #include <rte_spinlock.h>
27 #include <rte_string_fns.h>
28 #include <rte_alarm.h>
29 #include <rte_eal_paging.h>
30 
31 #include <mlx5_glue.h>
32 #include <mlx5_devx_cmds.h>
33 #include <mlx5_common.h>
34 #include <mlx5_common_mp.h>
35 #include <mlx5_common_mr.h>
36 #include <mlx5_malloc.h>
37 
38 #include "mlx5_defs.h"
39 #include "mlx5.h"
40 #include "mlx5_common_os.h"
41 #include "mlx5_utils.h"
42 #include "mlx5_rxtx.h"
43 #include "mlx5_rx.h"
44 #include "mlx5_tx.h"
45 #include "mlx5_autoconf.h"
46 #include "mlx5_mr.h"
47 #include "mlx5_flow.h"
48 #include "rte_pmd_mlx5.h"
49 #include "mlx5_verbs.h"
50 #include "mlx5_nl.h"
51 #include "mlx5_devx.h"
52 
53 #define MLX5_TAGS_HLIST_ARRAY_SIZE 8192
54 
55 #ifndef HAVE_IBV_MLX5_MOD_MPW
56 #define MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED (1 << 2)
57 #define MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW (1 << 3)
58 #endif
59 
60 #ifndef HAVE_IBV_MLX5_MOD_CQE_128B_COMP
61 #define MLX5DV_CONTEXT_FLAGS_CQE_128B_COMP (1 << 4)
62 #endif
63 
64 static const char *MZ_MLX5_PMD_SHARED_DATA = "mlx5_pmd_shared_data";
65 
66 /* Spinlock for mlx5_shared_data allocation. */
67 static rte_spinlock_t mlx5_shared_data_lock = RTE_SPINLOCK_INITIALIZER;
68 
69 /* Process local data for secondary processes. */
70 static struct mlx5_local_data mlx5_local_data;
71 
72 /* rte flow indexed pool configuration. */
73 static struct mlx5_indexed_pool_config icfg[] = {
74 	{
75 		.size = sizeof(struct rte_flow),
76 		.trunk_size = 64,
77 		.need_lock = 1,
78 		.release_mem_en = 0,
79 		.malloc = mlx5_malloc,
80 		.free = mlx5_free,
81 		.per_core_cache = 0,
82 		.type = "ctl_flow_ipool",
83 	},
84 	{
85 		.size = sizeof(struct rte_flow),
86 		.trunk_size = 64,
87 		.grow_trunk = 3,
88 		.grow_shift = 2,
89 		.need_lock = 1,
90 		.release_mem_en = 0,
91 		.malloc = mlx5_malloc,
92 		.free = mlx5_free,
93 		.per_core_cache = 1 << 14,
94 		.type = "rte_flow_ipool",
95 	},
96 	{
97 		.size = sizeof(struct rte_flow),
98 		.trunk_size = 64,
99 		.grow_trunk = 3,
100 		.grow_shift = 2,
101 		.need_lock = 1,
102 		.release_mem_en = 0,
103 		.malloc = mlx5_malloc,
104 		.free = mlx5_free,
105 		.per_core_cache = 0,
106 		.type = "mcp_flow_ipool",
107 	},
108 };
109 
110 /**
111  * Set the completion event channel file descriptor as non-blocking.
112  *
113  * @param[in] fd
114  *   The file descriptor (representing the interrupt) used in this
115  *   channel. It is taken from the RQ channel object, which includes
116  *   the channel fd, and this is the descriptor that is switched to
117  *   non-blocking mode.
118  *
119  * @return
120  *   0 on successfully setting the fd to non-blocking, non-zero otherwise.
121  */
122 int
123 mlx5_os_set_nonblock_channel_fd(int fd)
124 {
125 	int flags;
126 
127 	flags = fcntl(fd, F_GETFL);
128 	return fcntl(fd, F_SETFL, flags | O_NONBLOCK);
129 }
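
/*
 * Usage sketch (illustrative only, not part of the driver): the fd of a
 * Verbs completion event channel is typically switched to non-blocking
 * mode right after the channel is created, e.g.:
 *
 *	struct ibv_comp_channel *channel;
 *
 *	channel = mlx5_glue->create_comp_channel(sh->ctx);
 *	if (channel && mlx5_os_set_nonblock_channel_fd(channel->fd))
 *		DRV_LOG(ERR, "Cannot make event channel non-blocking.");
 */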
130 
131 /**
132  * Get mlx5 device attributes. The glue function query_device_ex() is called
133  * with an out parameter of type 'struct ibv_device_attr_ex *'. The mlx5
134  * device attributes are then filled in from the glue out parameter.
135  *
136  * @param ctx
137  *   Pointer to ibv context.
138  *
139  * @param device_attr
140  *   Pointer to mlx5 device attributes.
141  *
142  * @return
143  *   0 on success, a non-zero error number otherwise.
144  */
145 int
146 mlx5_os_get_dev_attr(void *ctx, struct mlx5_dev_attr *device_attr)
147 {
148 	int err;
149 	struct ibv_device_attr_ex attr_ex;
150 	memset(device_attr, 0, sizeof(*device_attr));
151 	err = mlx5_glue->query_device_ex(ctx, NULL, &attr_ex);
152 	if (err)
153 		return err;
154 
155 	device_attr->device_cap_flags_ex = attr_ex.device_cap_flags_ex;
156 	device_attr->max_qp_wr = attr_ex.orig_attr.max_qp_wr;
157 	device_attr->max_sge = attr_ex.orig_attr.max_sge;
158 	device_attr->max_cq = attr_ex.orig_attr.max_cq;
159 	device_attr->max_cqe = attr_ex.orig_attr.max_cqe;
160 	device_attr->max_mr = attr_ex.orig_attr.max_mr;
161 	device_attr->max_pd = attr_ex.orig_attr.max_pd;
162 	device_attr->max_qp = attr_ex.orig_attr.max_qp;
163 	device_attr->max_srq = attr_ex.orig_attr.max_srq;
164 	device_attr->max_srq_wr = attr_ex.orig_attr.max_srq_wr;
165 	device_attr->raw_packet_caps = attr_ex.raw_packet_caps;
166 	device_attr->max_rwq_indirection_table_size =
167 		attr_ex.rss_caps.max_rwq_indirection_table_size;
168 	device_attr->max_tso = attr_ex.tso_caps.max_tso;
169 	device_attr->tso_supported_qpts = attr_ex.tso_caps.supported_qpts;
170 
171 	struct mlx5dv_context dv_attr = { .comp_mask = 0 };
172 	err = mlx5_glue->dv_query_device(ctx, &dv_attr);
173 	if (err)
174 		return err;
175 
176 	device_attr->flags = dv_attr.flags;
177 	device_attr->comp_mask = dv_attr.comp_mask;
178 #ifdef HAVE_IBV_MLX5_MOD_SWP
179 	device_attr->sw_parsing_offloads =
180 		dv_attr.sw_parsing_caps.sw_parsing_offloads;
181 #endif
182 	device_attr->min_single_stride_log_num_of_bytes =
183 		dv_attr.striding_rq_caps.min_single_stride_log_num_of_bytes;
184 	device_attr->max_single_stride_log_num_of_bytes =
185 		dv_attr.striding_rq_caps.max_single_stride_log_num_of_bytes;
186 	device_attr->min_single_wqe_log_num_of_strides =
187 		dv_attr.striding_rq_caps.min_single_wqe_log_num_of_strides;
188 	device_attr->max_single_wqe_log_num_of_strides =
189 		dv_attr.striding_rq_caps.max_single_wqe_log_num_of_strides;
190 	device_attr->stride_supported_qpts =
191 		dv_attr.striding_rq_caps.supported_qpts;
192 #ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT
193 	device_attr->tunnel_offloads_caps = dv_attr.tunnel_offloads_caps;
194 #endif
195 	strlcpy(device_attr->fw_ver, attr_ex.orig_attr.fw_ver,
196 		sizeof(device_attr->fw_ver));
197 
198 	return err;
199 }
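
/*
 * Call sketch (illustrative only): the attributes are typically queried
 * once per shared device context and then cached, e.g.:
 *
 *	struct mlx5_dev_attr attr;
 *
 *	if (mlx5_os_get_dev_attr(sh->ctx, &attr) == 0)
 *		DRV_LOG(DEBUG, "FW version: %s", attr.fw_ver);
 */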
200 
201 /**
202  * Verbs callback to allocate memory. This function should allocate the space
203  * according to the provided size, residing inside a huge page.
204  * Please note that all allocations must respect the alignment from libmlx5
205  * (i.e. currently rte_mem_page_size()).
206  *
207  * @param[in] size
208  *   The size in bytes of the memory to allocate.
209  * @param[in] data
210  *   A pointer to the callback data.
211  *
212  * @return
213  *   Allocated buffer, NULL otherwise and rte_errno is set.
214  */
215 static void *
216 mlx5_alloc_verbs_buf(size_t size, void *data)
217 {
218 	struct mlx5_dev_ctx_shared *sh = data;
219 	void *ret;
220 	size_t alignment = rte_mem_page_size();
221 	if (alignment == (size_t)-1) {
222 		DRV_LOG(ERR, "Failed to get mem page size");
223 		rte_errno = ENOMEM;
224 		return NULL;
225 	}
226 
227 	MLX5_ASSERT(data != NULL);
228 	ret = mlx5_malloc(0, size, alignment, sh->numa_node);
229 	if (!ret && size)
230 		rte_errno = ENOMEM;
231 	return ret;
232 }
233 
234 /**
235  * Detect whether misc5 matching is supported.
236  *
237  * @param[in] priv
238  *   Device private data pointer
239  */
240 #ifdef HAVE_MLX5DV_DR
241 static void
242 __mlx5_discovery_misc5_cap(struct mlx5_priv *priv)
243 {
244 #ifdef HAVE_IBV_FLOW_DV_SUPPORT
245 	/* Dummy VxLAN matcher to detect rdma-core misc5 cap
246 	 * Case: IPv4--->UDP--->VxLAN--->vni
247 	 */
248 	void *tbl;
249 	struct mlx5_flow_dv_match_params matcher_mask;
250 	void *match_m;
251 	void *matcher;
252 	void *headers_m;
253 	void *misc5_m;
254 	uint32_t *tunnel_header_m;
255 	struct mlx5dv_flow_matcher_attr dv_attr;
256 
257 	memset(&matcher_mask, 0, sizeof(matcher_mask));
258 	matcher_mask.size = sizeof(matcher_mask.buf);
259 	match_m = matcher_mask.buf;
260 	headers_m = MLX5_ADDR_OF(fte_match_param, match_m, outer_headers);
261 	misc5_m = MLX5_ADDR_OF(fte_match_param,
262 			       match_m, misc_parameters_5);
263 	tunnel_header_m = (uint32_t *)
264 				MLX5_ADDR_OF(fte_match_set_misc5,
265 				misc5_m, tunnel_header_1);
266 	MLX5_SET(fte_match_set_lyr_2_4, headers_m, ip_protocol, 0xff);
267 	MLX5_SET(fte_match_set_lyr_2_4, headers_m, ip_version, 4);
268 	MLX5_SET(fte_match_set_lyr_2_4, headers_m, udp_dport, 0xffff);
269 	*tunnel_header_m = 0xffffff;
270 
271 	tbl = mlx5_glue->dr_create_flow_tbl(priv->sh->rx_domain, 1);
272 	if (!tbl) {
273 		DRV_LOG(INFO, "No SW steering support");
274 		return;
275 	}
276 	dv_attr.type = IBV_FLOW_ATTR_NORMAL;
277 	dv_attr.match_mask = (void *)&matcher_mask;
278 	dv_attr.match_criteria_enable =
279 			(1 << MLX5_MATCH_CRITERIA_ENABLE_OUTER_BIT) |
280 			(1 << MLX5_MATCH_CRITERIA_ENABLE_MISC5_BIT);
281 	dv_attr.priority = 3;
282 #ifdef HAVE_MLX5DV_DR_ESWITCH
283 	void *misc2_m;
284 	if (priv->config.dv_esw_en) {
285 		/* FDB enabled reg_c_0 */
286 		dv_attr.match_criteria_enable |=
287 				(1 << MLX5_MATCH_CRITERIA_ENABLE_MISC2_BIT);
288 		misc2_m = MLX5_ADDR_OF(fte_match_param,
289 				       match_m, misc_parameters_2);
290 		MLX5_SET(fte_match_set_misc2, misc2_m,
291 			 metadata_reg_c_0, 0xffff);
292 	}
293 #endif
294 	matcher = mlx5_glue->dv_create_flow_matcher(priv->sh->ctx,
295 						    &dv_attr, tbl);
296 	if (matcher) {
297 		priv->sh->misc5_cap = 1;
298 		mlx5_glue->dv_destroy_flow_matcher(matcher);
299 	}
300 	mlx5_glue->dr_destroy_flow_tbl(tbl);
301 #else
302 	RTE_SET_USED(priv);
303 #endif
304 }
305 #endif
306 
307 /**
308  * Verbs callback to free memory.
309  *
310  * @param[in] ptr
311  *   A pointer to the memory to free.
312  * @param[in] data
313  *   A pointer to the callback data.
314  */
315 static void
316 mlx5_free_verbs_buf(void *ptr, void *data __rte_unused)
317 {
318 	MLX5_ASSERT(data != NULL);
319 	mlx5_free(ptr);
320 }
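
/*
 * Registration sketch (for illustration; the two callbacks above are
 * wired into rdma-core elsewhere in this driver, roughly as follows):
 *
 *	struct mlx5dv_ctx_allocators alctr = {
 *		.alloc = &mlx5_alloc_verbs_buf,
 *		.free = &mlx5_free_verbs_buf,
 *		.data = sh,
 *	};
 *	mlx5_glue->dv_set_context_attr(sh->ctx,
 *				       MLX5DV_CTX_ATTR_BUF_ALLOCATORS,
 *				       (void *)((uintptr_t)&alctr));
 */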
321 
322 /**
323  * Initialize DR related data within private structure.
324  * The routine checks the reference counter and does the actual
325  * resource creation/initialization only if the counter is zero.
326  *
327  * @param[in] priv
328  *   Pointer to the private device data structure.
329  *
330  * @return
331  *   Zero on success, positive error code otherwise.
332  */
333 static int
334 mlx5_alloc_shared_dr(struct mlx5_priv *priv)
335 {
336 	struct mlx5_dev_ctx_shared *sh = priv->sh;
337 	char s[MLX5_HLIST_NAMESIZE] __rte_unused;
338 	int err;
339 
340 	MLX5_ASSERT(sh && sh->refcnt);
341 	if (sh->refcnt > 1)
342 		return 0;
343 	err = mlx5_alloc_table_hash_list(priv);
344 	if (err)
345 		goto error;
346 	/* The resources below are only valid with DV support. */
347 #ifdef HAVE_IBV_FLOW_DV_SUPPORT
348 	/* Init port id action list. */
349 	snprintf(s, sizeof(s), "%s_port_id_action_list", sh->ibdev_name);
350 	sh->port_id_action_list = mlx5_list_create(s, sh,
351 						   flow_dv_port_id_create_cb,
352 						   flow_dv_port_id_match_cb,
353 						   flow_dv_port_id_remove_cb,
354 						   flow_dv_port_id_clone_cb,
355 						 flow_dv_port_id_clone_free_cb);
356 	if (!sh->port_id_action_list)
357 		goto error;
358 	/* Init push vlan action list. */
359 	snprintf(s, sizeof(s), "%s_push_vlan_action_list", sh->ibdev_name);
360 	sh->push_vlan_action_list = mlx5_list_create(s, sh,
361 						    flow_dv_push_vlan_create_cb,
362 						    flow_dv_push_vlan_match_cb,
363 						    flow_dv_push_vlan_remove_cb,
364 						    flow_dv_push_vlan_clone_cb,
365 					       flow_dv_push_vlan_clone_free_cb);
366 	if (!sh->push_vlan_action_list)
367 		goto error;
368 	/* Init sample action list. */
369 	snprintf(s, sizeof(s), "%s_sample_action_list", sh->ibdev_name);
370 	sh->sample_action_list = mlx5_list_create(s, sh,
371 						  flow_dv_sample_create_cb,
372 						  flow_dv_sample_match_cb,
373 						  flow_dv_sample_remove_cb,
374 						  flow_dv_sample_clone_cb,
375 						  flow_dv_sample_clone_free_cb);
376 	if (!sh->sample_action_list)
377 		goto error;
378 	/* Init dest array action list. */
379 	snprintf(s, sizeof(s), "%s_dest_array_list", sh->ibdev_name);
380 	sh->dest_array_list = mlx5_list_create(s, sh,
381 					       flow_dv_dest_array_create_cb,
382 					       flow_dv_dest_array_match_cb,
383 					       flow_dv_dest_array_remove_cb,
384 					       flow_dv_dest_array_clone_cb,
385 					      flow_dv_dest_array_clone_free_cb);
386 	if (!sh->dest_array_list)
387 		goto error;
388 	/* Create tags hash list table. */
389 	snprintf(s, sizeof(s), "%s_tags", sh->ibdev_name);
390 	sh->tag_table = mlx5_hlist_create(s, MLX5_TAGS_HLIST_ARRAY_SIZE, 0,
391 					  MLX5_HLIST_WRITE_MOST,
392 					  flow_dv_tag_create_cb,
393 					  flow_dv_tag_match_cb,
394 					  flow_dv_tag_remove_cb);
395 	if (!sh->tag_table) {
396 		DRV_LOG(ERR, "tags with hash creation failed.");
397 		err = ENOMEM;
398 		goto error;
399 	}
400 	sh->tag_table->ctx = sh;
401 	snprintf(s, sizeof(s), "%s_hdr_modify", sh->ibdev_name);
402 	sh->modify_cmds = mlx5_hlist_create(s, MLX5_FLOW_HDR_MODIFY_HTABLE_SZ,
403 					    0, MLX5_HLIST_WRITE_MOST |
404 					    MLX5_HLIST_DIRECT_KEY,
405 					    flow_dv_modify_create_cb,
406 					    flow_dv_modify_match_cb,
407 					    flow_dv_modify_remove_cb);
408 	if (!sh->modify_cmds) {
409 		DRV_LOG(ERR, "hdr modify hash creation failed");
410 		err = ENOMEM;
411 		goto error;
412 	}
413 	sh->modify_cmds->ctx = sh;
414 	snprintf(s, sizeof(s), "%s_encaps_decaps", sh->ibdev_name);
415 	sh->encaps_decaps = mlx5_hlist_create(s,
416 					      MLX5_FLOW_ENCAP_DECAP_HTABLE_SZ,
417 					      0, MLX5_HLIST_DIRECT_KEY |
418 					      MLX5_HLIST_WRITE_MOST,
419 					      flow_dv_encap_decap_create_cb,
420 					      flow_dv_encap_decap_match_cb,
421 					      flow_dv_encap_decap_remove_cb);
422 	if (!sh->encaps_decaps) {
423 		DRV_LOG(ERR, "encap decap hash creation failed");
424 		err = ENOMEM;
425 		goto error;
426 	}
427 	sh->encaps_decaps->ctx = sh;
428 #endif
429 #ifdef HAVE_MLX5DV_DR
430 	void *domain;
431 
432 	/* Reference counter is zero, we should initialize structures. */
433 	domain = mlx5_glue->dr_create_domain(sh->ctx,
434 					     MLX5DV_DR_DOMAIN_TYPE_NIC_RX);
435 	if (!domain) {
436 		DRV_LOG(ERR, "ingress mlx5dv_dr_create_domain failed");
437 		err = errno;
438 		goto error;
439 	}
440 	sh->rx_domain = domain;
441 	domain = mlx5_glue->dr_create_domain(sh->ctx,
442 					     MLX5DV_DR_DOMAIN_TYPE_NIC_TX);
443 	if (!domain) {
444 		DRV_LOG(ERR, "egress mlx5dv_dr_create_domain failed");
445 		err = errno;
446 		goto error;
447 	}
448 	sh->tx_domain = domain;
449 #ifdef HAVE_MLX5DV_DR_ESWITCH
450 	if (priv->config.dv_esw_en) {
451 		domain  = mlx5_glue->dr_create_domain
452 			(sh->ctx, MLX5DV_DR_DOMAIN_TYPE_FDB);
453 		if (!domain) {
454 			DRV_LOG(ERR, "FDB mlx5dv_dr_create_domain failed");
455 			err = errno;
456 			goto error;
457 		}
458 		sh->fdb_domain = domain;
459 	}
460 	/*
461 	 * The drop action is just a dummy placeholder in rdma-core. It
462 	 * does not belong to any domain, has no attributes, and can be
463 	 * shared by the entire device.
464 	 */
465 	sh->dr_drop_action = mlx5_glue->dr_create_flow_action_drop();
466 	if (!sh->dr_drop_action) {
467 		DRV_LOG(ERR, "mlx5dv_dr_create_flow_action_drop failed");
468 		err = errno;
469 		goto error;
470 	}
471 #endif
472 	if (!sh->tunnel_hub)
473 		err = mlx5_alloc_tunnel_hub(sh);
474 	if (err) {
475 		DRV_LOG(ERR, "mlx5_alloc_tunnel_hub failed err=%d", err);
476 		goto error;
477 	}
478 	if (priv->config.reclaim_mode == MLX5_RCM_AGGR) {
479 		mlx5_glue->dr_reclaim_domain_memory(sh->rx_domain, 1);
480 		mlx5_glue->dr_reclaim_domain_memory(sh->tx_domain, 1);
481 		if (sh->fdb_domain)
482 			mlx5_glue->dr_reclaim_domain_memory(sh->fdb_domain, 1);
483 	}
484 	sh->pop_vlan_action = mlx5_glue->dr_create_flow_action_pop_vlan();
485 	if (!priv->config.allow_duplicate_pattern) {
486 #ifndef HAVE_MLX5_DR_ALLOW_DUPLICATE
487 		DRV_LOG(WARNING, "Disallowing duplicate patterns is not supported - maybe old rdma-core version?");
488 #endif
489 		mlx5_glue->dr_allow_duplicate_rules(sh->rx_domain, 0);
490 		mlx5_glue->dr_allow_duplicate_rules(sh->tx_domain, 0);
491 		if (sh->fdb_domain)
492 			mlx5_glue->dr_allow_duplicate_rules(sh->fdb_domain, 0);
493 	}
494 
495 	__mlx5_discovery_misc5_cap(priv);
496 #endif /* HAVE_MLX5DV_DR */
497 	sh->default_miss_action =
498 			mlx5_glue->dr_create_flow_action_default_miss();
499 	if (!sh->default_miss_action)
500 		DRV_LOG(WARNING, "Default miss action is not supported.");
501 	return 0;
502 error:
503 	/* Rollback the created objects. */
504 	if (sh->rx_domain) {
505 		mlx5_glue->dr_destroy_domain(sh->rx_domain);
506 		sh->rx_domain = NULL;
507 	}
508 	if (sh->tx_domain) {
509 		mlx5_glue->dr_destroy_domain(sh->tx_domain);
510 		sh->tx_domain = NULL;
511 	}
512 	if (sh->fdb_domain) {
513 		mlx5_glue->dr_destroy_domain(sh->fdb_domain);
514 		sh->fdb_domain = NULL;
515 	}
516 	if (sh->dr_drop_action) {
517 		mlx5_glue->destroy_flow_action(sh->dr_drop_action);
518 		sh->dr_drop_action = NULL;
519 	}
520 	if (sh->pop_vlan_action) {
521 		mlx5_glue->destroy_flow_action(sh->pop_vlan_action);
522 		sh->pop_vlan_action = NULL;
523 	}
524 	if (sh->encaps_decaps) {
525 		mlx5_hlist_destroy(sh->encaps_decaps);
526 		sh->encaps_decaps = NULL;
527 	}
528 	if (sh->modify_cmds) {
529 		mlx5_hlist_destroy(sh->modify_cmds);
530 		sh->modify_cmds = NULL;
531 	}
532 	if (sh->tag_table) {
533 		/* Tags should have been destroyed with the flows already. */
534 		mlx5_hlist_destroy(sh->tag_table);
535 		sh->tag_table = NULL;
536 	}
537 	if (sh->tunnel_hub) {
538 		mlx5_release_tunnel_hub(sh, priv->dev_port);
539 		sh->tunnel_hub = NULL;
540 	}
541 	mlx5_free_table_hash_list(priv);
542 	if (sh->port_id_action_list) {
543 		mlx5_list_destroy(sh->port_id_action_list);
544 		sh->port_id_action_list = NULL;
545 	}
546 	if (sh->push_vlan_action_list) {
547 		mlx5_list_destroy(sh->push_vlan_action_list);
548 		sh->push_vlan_action_list = NULL;
549 	}
550 	if (sh->sample_action_list) {
551 		mlx5_list_destroy(sh->sample_action_list);
552 		sh->sample_action_list = NULL;
553 	}
554 	if (sh->dest_array_list) {
555 		mlx5_list_destroy(sh->dest_array_list);
556 		sh->dest_array_list = NULL;
557 	}
558 	return err;
559 }
560 
561 /**
562  * Destroy DR related data within private structure.
563  *
564  * @param[in] priv
565  *   Pointer to the private device data structure.
566  */
567 void
568 mlx5_os_free_shared_dr(struct mlx5_priv *priv)
569 {
570 	struct mlx5_dev_ctx_shared *sh = priv->sh;
571 
572 	MLX5_ASSERT(sh && sh->refcnt);
573 	if (sh->refcnt > 1)
574 		return;
575 #ifdef HAVE_MLX5DV_DR
576 	if (sh->rx_domain) {
577 		mlx5_glue->dr_destroy_domain(sh->rx_domain);
578 		sh->rx_domain = NULL;
579 	}
580 	if (sh->tx_domain) {
581 		mlx5_glue->dr_destroy_domain(sh->tx_domain);
582 		sh->tx_domain = NULL;
583 	}
584 #ifdef HAVE_MLX5DV_DR_ESWITCH
585 	if (sh->fdb_domain) {
586 		mlx5_glue->dr_destroy_domain(sh->fdb_domain);
587 		sh->fdb_domain = NULL;
588 	}
589 	if (sh->dr_drop_action) {
590 		mlx5_glue->destroy_flow_action(sh->dr_drop_action);
591 		sh->dr_drop_action = NULL;
592 	}
593 #endif
594 	if (sh->pop_vlan_action) {
595 		mlx5_glue->destroy_flow_action(sh->pop_vlan_action);
596 		sh->pop_vlan_action = NULL;
597 	}
598 #endif /* HAVE_MLX5DV_DR */
599 	if (sh->default_miss_action)
600 		mlx5_glue->destroy_flow_action
601 				(sh->default_miss_action);
602 	if (sh->encaps_decaps) {
603 		mlx5_hlist_destroy(sh->encaps_decaps);
604 		sh->encaps_decaps = NULL;
605 	}
606 	if (sh->modify_cmds) {
607 		mlx5_hlist_destroy(sh->modify_cmds);
608 		sh->modify_cmds = NULL;
609 	}
610 	if (sh->tag_table) {
611 		/* Tags should have been destroyed with the flows already. */
612 		mlx5_hlist_destroy(sh->tag_table);
613 		sh->tag_table = NULL;
614 	}
615 	if (sh->tunnel_hub) {
616 		mlx5_release_tunnel_hub(sh, priv->dev_port);
617 		sh->tunnel_hub = NULL;
618 	}
619 	mlx5_free_table_hash_list(priv);
620 	if (sh->port_id_action_list) {
621 		mlx5_list_destroy(sh->port_id_action_list);
622 		sh->port_id_action_list = NULL;
623 	}
624 	if (sh->push_vlan_action_list) {
625 		mlx5_list_destroy(sh->push_vlan_action_list);
626 		sh->push_vlan_action_list = NULL;
627 	}
628 	if (sh->sample_action_list) {
629 		mlx5_list_destroy(sh->sample_action_list);
630 		sh->sample_action_list = NULL;
631 	}
632 	if (sh->dest_array_list) {
633 		mlx5_list_destroy(sh->dest_array_list);
634 		sh->dest_array_list = NULL;
635 	}
636 }
637 
638 /**
639  * Initialize shared data between the primary and secondary processes.
640  *
641  * A memzone is reserved by the primary process and the secondary processes
642  * attach to the memzone.
643  *
644  * @return
645  *   0 on success, a negative errno value otherwise and rte_errno is set.
646  */
647 static int
648 mlx5_init_shared_data(void)
649 {
650 	const struct rte_memzone *mz;
651 	int ret = 0;
652 
653 	rte_spinlock_lock(&mlx5_shared_data_lock);
654 	if (mlx5_shared_data == NULL) {
655 		if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
656 			/* Allocate shared memory. */
657 			mz = rte_memzone_reserve(MZ_MLX5_PMD_SHARED_DATA,
658 						 sizeof(*mlx5_shared_data),
659 						 SOCKET_ID_ANY, 0);
660 			if (mz == NULL) {
661 				DRV_LOG(ERR,
662 					"Cannot allocate mlx5 shared data");
663 				ret = -rte_errno;
664 				goto error;
665 			}
666 			mlx5_shared_data = mz->addr;
667 			memset(mlx5_shared_data, 0, sizeof(*mlx5_shared_data));
668 			rte_spinlock_init(&mlx5_shared_data->lock);
669 		} else {
670 			/* Lookup allocated shared memory. */
671 			mz = rte_memzone_lookup(MZ_MLX5_PMD_SHARED_DATA);
672 			if (mz == NULL) {
673 				DRV_LOG(ERR,
674 					"Cannot attach mlx5 shared data");
675 				ret = -rte_errno;
676 				goto error;
677 			}
678 			mlx5_shared_data = mz->addr;
679 			memset(&mlx5_local_data, 0, sizeof(mlx5_local_data));
680 		}
681 	}
682 error:
683 	rte_spinlock_unlock(&mlx5_shared_data_lock);
684 	return ret;
685 }
686 
687 /**
688  * PMD global initialization.
689  *
690  * Independent of any individual device, this function initializes global
691  * per-PMD data structures distinguishing primary and secondary processes.
692  * Hence, each initialization is called once per process.
693  *
694  * @return
695  *   0 on success, a negative errno value otherwise and rte_errno is set.
696  */
697 static int
698 mlx5_init_once(void)
699 {
700 	struct mlx5_shared_data *sd;
701 	struct mlx5_local_data *ld = &mlx5_local_data;
702 	int ret = 0;
703 
704 	if (mlx5_init_shared_data())
705 		return -rte_errno;
706 	sd = mlx5_shared_data;
707 	MLX5_ASSERT(sd);
708 	rte_spinlock_lock(&sd->lock);
709 	switch (rte_eal_process_type()) {
710 	case RTE_PROC_PRIMARY:
711 		if (sd->init_done)
712 			break;
713 		LIST_INIT(&sd->mem_event_cb_list);
714 		rte_rwlock_init(&sd->mem_event_rwlock);
715 		rte_mem_event_callback_register("MLX5_MEM_EVENT_CB",
716 						mlx5_mr_mem_event_cb, NULL);
717 		ret = mlx5_mp_init_primary(MLX5_MP_NAME,
718 					   mlx5_mp_os_primary_handle);
719 		if (ret)
720 			goto out;
721 		sd->init_done = true;
722 		break;
723 	case RTE_PROC_SECONDARY:
724 		if (ld->init_done)
725 			break;
726 		ret = mlx5_mp_init_secondary(MLX5_MP_NAME,
727 					     mlx5_mp_os_secondary_handle);
728 		if (ret)
729 			goto out;
730 		++sd->secondary_cnt;
731 		ld->init_done = true;
732 		break;
733 	default:
734 		break;
735 	}
736 out:
737 	rte_spinlock_unlock(&sd->lock);
738 	return ret;
739 }
740 
741 /**
742  * Create the Tx queue DevX/Verbs object.
743  *
744  * @param dev
745  *   Pointer to Ethernet device.
746  * @param idx
747  *   Queue index in DPDK Tx queue array.
748  *
749  * @return
750  *   0 on success, a negative errno value otherwise and rte_errno is set.
751  */
752 static int
753 mlx5_os_txq_obj_new(struct rte_eth_dev *dev, uint16_t idx)
754 {
755 	struct mlx5_priv *priv = dev->data->dev_private;
756 	struct mlx5_txq_data *txq_data = (*priv->txqs)[idx];
757 	struct mlx5_txq_ctrl *txq_ctrl =
758 			container_of(txq_data, struct mlx5_txq_ctrl, txq);
759 
760 	if (txq_ctrl->type == MLX5_TXQ_TYPE_HAIRPIN)
761 		return mlx5_txq_devx_obj_new(dev, idx);
762 #ifdef HAVE_MLX5DV_DEVX_UAR_OFFSET
763 	if (!priv->config.dv_esw_en)
764 		return mlx5_txq_devx_obj_new(dev, idx);
765 #endif
766 	return mlx5_txq_ibv_obj_new(dev, idx);
767 }
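
/*
 * Selection summary: hairpin queues always take the DevX path; otherwise
 * DevX is used only when the UAR offset is available from DevX
 * (HAVE_MLX5DV_DEVX_UAR_OFFSET) and E-Switch is disabled, while Verbs
 * is used in all remaining cases.
 */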
768 
769 /**
770  * Release a Tx DevX/Verbs queue object.
771  *
772  * @param txq_obj
773  *   DevX/Verbs Tx queue object.
774  */
775 static void
776 mlx5_os_txq_obj_release(struct mlx5_txq_obj *txq_obj)
777 {
778 	if (txq_obj->txq_ctrl->type == MLX5_TXQ_TYPE_HAIRPIN) {
779 		mlx5_txq_devx_obj_release(txq_obj);
780 		return;
781 	}
782 #ifdef HAVE_MLX5DV_DEVX_UAR_OFFSET
783 	if (!txq_obj->txq_ctrl->priv->config.dv_esw_en) {
784 		mlx5_txq_devx_obj_release(txq_obj);
785 		return;
786 	}
787 #endif
788 	mlx5_txq_ibv_obj_release(txq_obj);
789 }
790 
791 /**
792  * Detect and configure the DV flow counter mode.
793  *
794  * @param dev
795  *   Pointer to rte_eth_dev structure.
796  *
797  */
798 static void
799 mlx5_flow_counter_mode_config(struct rte_eth_dev *dev __rte_unused)
800 {
801 #ifdef HAVE_IBV_FLOW_DV_SUPPORT
802 	struct mlx5_priv *priv = dev->data->dev_private;
803 	struct mlx5_dev_ctx_shared *sh = priv->sh;
804 	bool fallback;
805 
806 #ifndef HAVE_IBV_DEVX_ASYNC
807 	fallback = true;
808 #else
809 	fallback = false;
810 	if (!priv->config.devx || !priv->config.dv_flow_en ||
811 	    !priv->config.hca_attr.flow_counters_dump ||
812 	    !(priv->config.hca_attr.flow_counter_bulk_alloc_bitmap & 0x4) ||
813 	    (mlx5_flow_dv_discover_counter_offset_support(dev) == -ENOTSUP))
814 		fallback = true;
815 #endif
816 	if (fallback)
817 		DRV_LOG(INFO, "Use fall-back DV counter management. Flow "
818 			"counter dump:%d, bulk_alloc_bitmap:0x%hhx.",
819 			priv->config.hca_attr.flow_counters_dump,
820 			priv->config.hca_attr.flow_counter_bulk_alloc_bitmap);
821 	/* Initialize fallback mode only on the port that initializes sh. */
822 	if (sh->refcnt == 1)
823 		sh->cmng.counter_fallback = fallback;
824 	else if (fallback != sh->cmng.counter_fallback)
825 		DRV_LOG(WARNING, "Port %d in sh has a different fallback mode "
826 			"than the others: %d.", PORT_ID(priv), fallback);
827 #endif
828 }
829 
830 static void
831 mlx5_queue_counter_id_prepare(struct rte_eth_dev *dev)
832 {
833 	struct mlx5_priv *priv = dev->data->dev_private;
834 	void *ctx = priv->sh->ctx;
835 
836 	priv->q_counters = mlx5_devx_cmd_queue_counter_alloc(ctx);
837 	if (!priv->q_counters) {
838 		struct ibv_cq *cq = mlx5_glue->create_cq(ctx, 1, NULL, NULL, 0);
839 		struct ibv_wq *wq;
840 
841 		DRV_LOG(DEBUG, "Port %d queue counter object cannot be created "
842 			"by DevX - fall-back to use the kernel driver global "
843 			"queue counter.", dev->data->port_id);
844 		/* Create WQ by kernel and query its queue counter ID. */
845 		if (cq) {
846 			wq = mlx5_glue->create_wq(ctx,
847 						  &(struct ibv_wq_init_attr){
848 						    .wq_type = IBV_WQT_RQ,
849 						    .max_wr = 1,
850 						    .max_sge = 1,
851 						    .pd = priv->sh->pd,
852 						    .cq = cq,
853 						});
854 			if (wq) {
855 				/* Counter is assigned only in RDY state. */
856 				int ret = mlx5_glue->modify_wq(wq,
857 						 &(struct ibv_wq_attr){
858 						 .attr_mask = IBV_WQ_ATTR_STATE,
859 						 .wq_state = IBV_WQS_RDY,
860 						});
861 
862 				if (ret == 0)
863 					mlx5_devx_cmd_wq_query(wq,
864 							 &priv->counter_set_id);
865 				claim_zero(mlx5_glue->destroy_wq(wq));
866 			}
867 			claim_zero(mlx5_glue->destroy_cq(cq));
868 		}
869 	} else {
870 		priv->counter_set_id = priv->q_counters->id;
871 	}
872 	if (priv->counter_set_id == 0)
873 		DRV_LOG(INFO, "Part of the port %d statistics will not be "
874 			"available.", dev->data->port_id);
875 }
876 
877 /**
878  * Check if the representor spawn info matches the devargs.
879  *
880  * @param spawn
881  *   Verbs device parameters (name, port, switch_info) to spawn.
882  * @param eth_da
883  *   Device devargs to probe.
884  *
885  * @return
886  *   Match result.
887  */
888 static bool
889 mlx5_representor_match(struct mlx5_dev_spawn_data *spawn,
890 		       struct rte_eth_devargs *eth_da)
891 {
892 	struct mlx5_switch_info *switch_info = &spawn->info;
893 	unsigned int p, f;
894 	uint16_t id;
895 	uint16_t repr_id = mlx5_representor_id_encode(switch_info,
896 						      eth_da->type);
897 
898 	switch (eth_da->type) {
899 	case RTE_ETH_REPRESENTOR_SF:
900 		if (!(spawn->info.port_name == -1 &&
901 		      switch_info->name_type ==
902 				MLX5_PHYS_PORT_NAME_TYPE_PFHPF) &&
903 		    switch_info->name_type != MLX5_PHYS_PORT_NAME_TYPE_PFSF) {
904 			rte_errno = EBUSY;
905 			return false;
906 		}
907 		break;
908 	case RTE_ETH_REPRESENTOR_VF:
909 		/* Allow HPF representor index -1 as an exception. */
910 		if (!(spawn->info.port_name == -1 &&
911 		      switch_info->name_type ==
912 				MLX5_PHYS_PORT_NAME_TYPE_PFHPF) &&
913 		    switch_info->name_type != MLX5_PHYS_PORT_NAME_TYPE_PFVF) {
914 			rte_errno = EBUSY;
915 			return false;
916 		}
917 		break;
918 	case RTE_ETH_REPRESENTOR_NONE:
919 		rte_errno = EBUSY;
920 		return false;
921 	default:
922 		rte_errno = ENOTSUP;
923 		DRV_LOG(ERR, "unsupported representor type");
924 		return false;
925 	}
926 	/* Check representor ID: */
927 	for (p = 0; p < eth_da->nb_ports; ++p) {
928 		if (spawn->pf_bond < 0) {
929 			/* For non-LAG mode, allow and ignore pf. */
930 			switch_info->pf_num = eth_da->ports[p];
931 			repr_id = mlx5_representor_id_encode(switch_info,
932 							     eth_da->type);
933 		}
934 		for (f = 0; f < eth_da->nb_representor_ports; ++f) {
935 			id = MLX5_REPRESENTOR_ID
936 				(eth_da->ports[p], eth_da->type,
937 				 eth_da->representor_ports[f]);
938 			if (repr_id == id)
939 				return true;
940 		}
941 	}
942 	rte_errno = EBUSY;
943 	return false;
944 }
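
/*
 * Devargs examples matched by this function (illustrative; see the mlx5
 * PMD guide for the authoritative syntax): "representor=vf2" probes a
 * single VF representor, "representor=vf[0-3]" probes VF representors
 * 0 through 3, "representor=sf[0,2]" probes SF representors 0 and 2,
 * and "representor=pf[0-1]vf[0-3]" restricts them to the given PFs on
 * a bonding device.
 */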
945 
946 
947 /**
948  * Spawn an Ethernet device from Verbs information.
949  *
950  * @param dpdk_dev
951  *   Backing DPDK device.
952  * @param spawn
953  *   Verbs device parameters (name, port, switch_info) to spawn.
954  * @param config
955  *   Device configuration parameters.
956  * @param eth_da
957  *   Device arguments.
958  *
959  * @return
960  *   A valid Ethernet device object on success, NULL otherwise and rte_errno
961  *   is set. The following errors are defined:
962  *
963  *   EBUSY: device is not supposed to be spawned.
964  *   EEXIST: device is already spawned
965  */
966 static struct rte_eth_dev *
967 mlx5_dev_spawn(struct rte_device *dpdk_dev,
968 	       struct mlx5_dev_spawn_data *spawn,
969 	       struct mlx5_dev_config *config,
970 	       struct rte_eth_devargs *eth_da)
971 {
972 	const struct mlx5_switch_info *switch_info = &spawn->info;
973 	struct mlx5_dev_ctx_shared *sh = NULL;
974 	struct ibv_port_attr port_attr;
975 	struct mlx5dv_context dv_attr = { .comp_mask = 0 };
976 	struct rte_eth_dev *eth_dev = NULL;
977 	struct mlx5_priv *priv = NULL;
978 	int err = 0;
979 	unsigned int hw_padding = 0;
980 	unsigned int mps;
981 	unsigned int tunnel_en = 0;
982 	unsigned int mpls_en = 0;
983 	unsigned int swp = 0;
984 	unsigned int mprq = 0;
985 	unsigned int mprq_min_stride_size_n = 0;
986 	unsigned int mprq_max_stride_size_n = 0;
987 	unsigned int mprq_min_stride_num_n = 0;
988 	unsigned int mprq_max_stride_num_n = 0;
989 	struct rte_ether_addr mac;
990 	char name[RTE_ETH_NAME_MAX_LEN];
991 	int own_domain_id = 0;
992 	uint16_t port_id;
993 	struct mlx5_port_info vport_info = { .query_flags = 0 };
994 	int i;
995 
996 	/* Determine if this port representor is supposed to be spawned. */
997 	if (switch_info->representor && dpdk_dev->devargs &&
998 	    !mlx5_representor_match(spawn, eth_da))
999 		return NULL;
1000 	/* Build device name. */
1001 	if (spawn->pf_bond < 0) {
1002 		/* Single device. */
1003 		if (!switch_info->representor)
1004 			strlcpy(name, dpdk_dev->name, sizeof(name));
1005 		else
1006 			err = snprintf(name, sizeof(name), "%s_representor_%s%u",
1007 				 dpdk_dev->name,
1008 				 switch_info->name_type ==
1009 				 MLX5_PHYS_PORT_NAME_TYPE_PFSF ? "sf" : "vf",
1010 				 switch_info->port_name);
1011 	} else {
1012 		/* Bonding device. */
1013 		if (!switch_info->representor) {
1014 			err = snprintf(name, sizeof(name), "%s_%s",
1015 				 dpdk_dev->name,
1016 				 mlx5_os_get_dev_device_name(spawn->phys_dev));
1017 		} else {
1018 			err = snprintf(name, sizeof(name), "%s_%s_representor_c%dpf%d%s%u",
1019 				dpdk_dev->name,
1020 				mlx5_os_get_dev_device_name(spawn->phys_dev),
1021 				switch_info->ctrl_num,
1022 				switch_info->pf_num,
1023 				switch_info->name_type ==
1024 				MLX5_PHYS_PORT_NAME_TYPE_PFSF ? "sf" : "vf",
1025 				switch_info->port_name);
1026 		}
1027 	}
1028 	if (err >= (int)sizeof(name))
1029 		DRV_LOG(WARNING, "device name overflow %s", name);
1030 	/* check if the device is already spawned */
1031 	if (rte_eth_dev_get_port_by_name(name, &port_id) == 0) {
1032 		rte_errno = EEXIST;
1033 		return NULL;
1034 	}
1035 	DRV_LOG(DEBUG, "naming Ethernet device \"%s\"", name);
1036 	if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
1037 		struct mlx5_mp_id mp_id;
1038 
1039 		eth_dev = rte_eth_dev_attach_secondary(name);
1040 		if (eth_dev == NULL) {
1041 			DRV_LOG(ERR, "can not attach rte ethdev");
1042 			rte_errno = ENOMEM;
1043 			return NULL;
1044 		}
1045 		eth_dev->device = dpdk_dev;
1046 		eth_dev->dev_ops = &mlx5_dev_sec_ops;
1047 		eth_dev->rx_descriptor_status = mlx5_rx_descriptor_status;
1048 		eth_dev->tx_descriptor_status = mlx5_tx_descriptor_status;
1049 		err = mlx5_proc_priv_init(eth_dev);
1050 		if (err)
1051 			return NULL;
1052 		mp_id.port_id = eth_dev->data->port_id;
1053 		strlcpy(mp_id.name, MLX5_MP_NAME, RTE_MP_MAX_NAME_LEN);
1054 		/* Receive command fd from primary process */
1055 		err = mlx5_mp_req_verbs_cmd_fd(&mp_id);
1056 		if (err < 0)
1057 			goto err_secondary;
1058 		/* Remap UAR for Tx queues. */
1059 		err = mlx5_tx_uar_init_secondary(eth_dev, err);
1060 		if (err)
1061 			goto err_secondary;
1062 		/*
1063 		 * Ethdev pointer is still required as input since
1064 		 * the primary device is not accessible from the
1065 		 * secondary process.
1066 		 */
1067 		eth_dev->rx_pkt_burst = mlx5_select_rx_function(eth_dev);
1068 		eth_dev->tx_pkt_burst = mlx5_select_tx_function(eth_dev);
1069 		return eth_dev;
1070 err_secondary:
1071 		mlx5_dev_close(eth_dev);
1072 		return NULL;
1073 	}
1074 	/*
1075 	 * Some parameters ("tx_db_nc" in particular) are needed in
1076 	 * advance to create the dv/verbs device context. We process the
1077 	 * devargs here to get them, and later process the devargs again
1078 	 * to override some hardware settings.
1079 	 */
1080 	err = mlx5_args(config, dpdk_dev->devargs);
1081 	if (err) {
1082 		err = rte_errno;
1083 		DRV_LOG(ERR, "failed to process device arguments: %s",
1084 			strerror(rte_errno));
1085 		goto error;
1086 	}
1087 	if (config->dv_miss_info) {
1088 		if (switch_info->master || switch_info->representor)
1089 			config->dv_xmeta_en = MLX5_XMETA_MODE_META16;
1090 	}
1091 	mlx5_malloc_mem_select(config->sys_mem_en);
1092 	sh = mlx5_alloc_shared_dev_ctx(spawn, config);
1093 	if (!sh)
1094 		return NULL;
1095 	config->devx = sh->devx;
1096 #ifdef HAVE_MLX5DV_DR_ACTION_DEST_DEVX_TIR
1097 	config->dest_tir = 1;
1098 #endif
1099 #ifdef HAVE_IBV_MLX5_MOD_SWP
1100 	dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_SWP;
1101 #endif
1102 	/*
1103 	 * Multi-packet send is supported by ConnectX-4 Lx PF as well
1104 	 * as all ConnectX-5 devices.
1105 	 */
1106 #ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT
1107 	dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS;
1108 #endif
1109 #ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT
1110 	dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_STRIDING_RQ;
1111 #endif
1112 	mlx5_glue->dv_query_device(sh->ctx, &dv_attr);
1113 	if (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED) {
1114 		if (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW) {
1115 			DRV_LOG(DEBUG, "enhanced MPW is supported");
1116 			mps = MLX5_MPW_ENHANCED;
1117 		} else {
1118 			DRV_LOG(DEBUG, "MPW is supported");
1119 			mps = MLX5_MPW;
1120 		}
1121 	} else {
1122 		DRV_LOG(DEBUG, "MPW isn't supported");
1123 		mps = MLX5_MPW_DISABLED;
1124 	}
1125 #ifdef HAVE_IBV_MLX5_MOD_SWP
1126 	if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_SWP)
1127 		swp = dv_attr.sw_parsing_caps.sw_parsing_offloads;
1128 	DRV_LOG(DEBUG, "SWP support: %u", swp);
1129 #endif
1130 	config->swp = !!swp;
1131 #ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT
1132 	if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_STRIDING_RQ) {
1133 		struct mlx5dv_striding_rq_caps mprq_caps =
1134 			dv_attr.striding_rq_caps;
1135 
1136 		DRV_LOG(DEBUG, "\tmin_single_stride_log_num_of_bytes: %d",
1137 			mprq_caps.min_single_stride_log_num_of_bytes);
1138 		DRV_LOG(DEBUG, "\tmax_single_stride_log_num_of_bytes: %d",
1139 			mprq_caps.max_single_stride_log_num_of_bytes);
1140 		DRV_LOG(DEBUG, "\tmin_single_wqe_log_num_of_strides: %d",
1141 			mprq_caps.min_single_wqe_log_num_of_strides);
1142 		DRV_LOG(DEBUG, "\tmax_single_wqe_log_num_of_strides: %d",
1143 			mprq_caps.max_single_wqe_log_num_of_strides);
1144 		DRV_LOG(DEBUG, "\tsupported_qpts: %d",
1145 			mprq_caps.supported_qpts);
1146 		DRV_LOG(DEBUG, "device supports Multi-Packet RQ");
1147 		mprq = 1;
1148 		mprq_min_stride_size_n =
1149 			mprq_caps.min_single_stride_log_num_of_bytes;
1150 		mprq_max_stride_size_n =
1151 			mprq_caps.max_single_stride_log_num_of_bytes;
1152 		mprq_min_stride_num_n =
1153 			mprq_caps.min_single_wqe_log_num_of_strides;
1154 		mprq_max_stride_num_n =
1155 			mprq_caps.max_single_wqe_log_num_of_strides;
1156 	}
1157 #endif
1158 	/* Rx CQE compression is enabled by default. */
1159 	config->cqe_comp = 1;
1160 #ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT
1161 	if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS) {
1162 		tunnel_en = ((dv_attr.tunnel_offloads_caps &
1163 			      MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_VXLAN) &&
1164 			     (dv_attr.tunnel_offloads_caps &
1165 			      MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_GRE) &&
1166 			     (dv_attr.tunnel_offloads_caps &
1167 			      MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_GENEVE));
1168 	}
1169 	DRV_LOG(DEBUG, "tunnel offloading is %ssupported",
1170 		tunnel_en ? "" : "not ");
1171 #else
1172 	DRV_LOG(WARNING,
1173 		"tunnel offloading disabled due to old OFED/rdma-core version");
1174 #endif
1175 	config->tunnel_en = tunnel_en;
1176 #ifdef HAVE_IBV_DEVICE_MPLS_SUPPORT
1177 	mpls_en = ((dv_attr.tunnel_offloads_caps &
1178 		    MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_CW_MPLS_OVER_GRE) &&
1179 		   (dv_attr.tunnel_offloads_caps &
1180 		    MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_CW_MPLS_OVER_UDP));
1181 	DRV_LOG(DEBUG, "MPLS over GRE/UDP tunnel offloading is %ssupported",
1182 		mpls_en ? "" : "not ");
1183 #else
1184 	DRV_LOG(WARNING, "MPLS over GRE/UDP tunnel offloading disabled due to"
1185 		" old OFED/rdma-core version or firmware configuration");
1186 #endif
1187 	config->mpls_en = mpls_en;
1188 	/* Check port status. */
1189 	err = mlx5_glue->query_port(sh->ctx, spawn->phys_port, &port_attr);
1190 	if (err) {
1191 		DRV_LOG(ERR, "port query failed: %s", strerror(err));
1192 		goto error;
1193 	}
1194 	if (port_attr.link_layer != IBV_LINK_LAYER_ETHERNET) {
1195 		DRV_LOG(ERR, "port is not configured in Ethernet mode");
1196 		err = EINVAL;
1197 		goto error;
1198 	}
1199 	if (port_attr.state != IBV_PORT_ACTIVE)
1200 		DRV_LOG(DEBUG, "port is not active: \"%s\" (%d)",
1201 			mlx5_glue->port_state_str(port_attr.state),
1202 			port_attr.state);
1203 	/* Allocate private eth device data. */
1204 	priv = mlx5_malloc(MLX5_MEM_ZERO | MLX5_MEM_RTE,
1205 			   sizeof(*priv),
1206 			   RTE_CACHE_LINE_SIZE, SOCKET_ID_ANY);
1207 	if (priv == NULL) {
1208 		DRV_LOG(ERR, "priv allocation failure");
1209 		err = ENOMEM;
1210 		goto error;
1211 	}
1212 	priv->sh = sh;
1213 	priv->dev_port = spawn->phys_port;
1214 	priv->pci_dev = spawn->pci_dev;
1215 	priv->mtu = RTE_ETHER_MTU;
1216 	/* Some internal functions rely on Netlink sockets, open them now. */
1217 	priv->nl_socket_rdma = mlx5_nl_init(NETLINK_RDMA);
1218 	priv->nl_socket_route =	mlx5_nl_init(NETLINK_ROUTE);
1219 	priv->representor = !!switch_info->representor;
1220 	priv->master = !!switch_info->master;
1221 	priv->domain_id = RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID;
1222 	priv->vport_meta_tag = 0;
1223 	priv->vport_meta_mask = 0;
1224 	priv->pf_bond = spawn->pf_bond;
1225 	/*
1226 	 * If we have an E-Switch, we should determine the vport attributes.
1227 	 * E-Switch may use either source vport field or reg_c[0] metadata
1228 	 * register to match on vport index. The engaged part of metadata
1229 	 * register is defined by mask.
1230 	 */
1231 	if (switch_info->representor || switch_info->master) {
1232 		err = mlx5_glue->devx_port_query(sh->ctx,
1233 						 spawn->phys_port,
1234 						 &vport_info);
1235 		if (err) {
1236 			DRV_LOG(WARNING,
1237 				"can't query devx port %d on device %s",
1238 				spawn->phys_port,
1239 				mlx5_os_get_dev_device_name(spawn->phys_dev));
1240 			vport_info.query_flags = 0;
1241 		}
1242 	}
1243 	if (vport_info.query_flags & MLX5_PORT_QUERY_REG_C0) {
1244 		priv->vport_meta_tag = vport_info.vport_meta_tag;
1245 		priv->vport_meta_mask = vport_info.vport_meta_mask;
1246 		if (!priv->vport_meta_mask) {
1247 			DRV_LOG(ERR, "vport zero mask for port %d"
1248 				     " on bonding device %s",
1249 				     spawn->phys_port,
1250 				     mlx5_os_get_dev_device_name
1251 							(spawn->phys_dev));
1252 			err = ENOTSUP;
1253 			goto error;
1254 		}
1255 		if (priv->vport_meta_tag & ~priv->vport_meta_mask) {
1256 			DRV_LOG(ERR, "invalid vport tag for port %d"
1257 				     " on bonding device %s",
1258 				     spawn->phys_port,
1259 				     mlx5_os_get_dev_device_name
1260 							(spawn->phys_dev));
1261 			err = ENOTSUP;
1262 			goto error;
1263 		}
1264 	}
1265 	if (vport_info.query_flags & MLX5_PORT_QUERY_VPORT) {
1266 		priv->vport_id = vport_info.vport_id;
1267 	} else if (spawn->pf_bond >= 0 &&
1268 		   (switch_info->representor || switch_info->master)) {
1269 		DRV_LOG(ERR, "can't deduce vport index for port %d"
1270 			     " on bonding device %s",
1271 			     spawn->phys_port,
1272 			     mlx5_os_get_dev_device_name(spawn->phys_dev));
1273 		err = ENOTSUP;
1274 		goto error;
1275 	} else {
1276 		/*
1277 		 * Deduce the vport index in a compatible way. Kernel/rdma_core
1278 		 * support single E-Switch per PF configurations only, and the
1279 		 * vport_id field contains the vport index for the associated VF,
1280 		 * which is deduced from the representor port name.
1281 		 * For example, let's have the IB device port 10, it has
1282 		 * attached network device eth0, which has port name attribute
1283 		 * pf0vf2, we can deduce the VF number as 2, and set vport index
1284 		 * as 3 (2+1). This assigning schema should be changed if the
1285 		 * multiple E-Switch instances per PF configurations or/and PCI
1286 		 * subfunctions are added.
1287 		 */
1288 		priv->vport_id = switch_info->representor ?
1289 				 switch_info->port_name + 1 : -1;
1290 	}
1291 	priv->representor_id = mlx5_representor_id_encode(switch_info,
1292 							  eth_da->type);
1293 	/*
1294 	 * Look for sibling devices in order to reuse their switch domain
1295 	 * if any, otherwise allocate one.
1296 	 */
1297 	MLX5_ETH_FOREACH_DEV(port_id, priv->pci_dev) {
1298 		const struct mlx5_priv *opriv =
1299 			rte_eth_devices[port_id].data->dev_private;
1300 
1301 		if (!opriv ||
1302 		    opriv->sh != priv->sh ||
1303 		    opriv->domain_id ==
1304 		    RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID)
1305 			continue;
1306 		priv->domain_id = opriv->domain_id;
1307 		break;
1308 	}
1309 	if (priv->domain_id == RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID) {
1310 		err = rte_eth_switch_domain_alloc(&priv->domain_id);
1311 		if (err) {
1312 			err = rte_errno;
1313 			DRV_LOG(ERR, "unable to allocate switch domain: %s",
1314 				strerror(rte_errno));
1315 			goto error;
1316 		}
1317 		own_domain_id = 1;
1318 	}
1319 	/* Override some values set by hardware configuration. */
1320 	mlx5_args(config, dpdk_dev->devargs);
1321 	err = mlx5_dev_check_sibling_config(priv, config);
1322 	if (err)
1323 		goto error;
1324 	config->hw_csum = !!(sh->device_attr.device_cap_flags_ex &
1325 			    IBV_DEVICE_RAW_IP_CSUM);
1326 	DRV_LOG(DEBUG, "checksum offloading is %ssupported",
1327 		(config->hw_csum ? "" : "not "));
1328 #if !defined(HAVE_IBV_DEVICE_COUNTERS_SET_V42) && \
1329 	!defined(HAVE_IBV_DEVICE_COUNTERS_SET_V45)
1330 	DRV_LOG(DEBUG, "counters are not supported");
1331 #endif
1332 #if !defined(HAVE_IBV_FLOW_DV_SUPPORT) || !defined(HAVE_MLX5DV_DR)
1333 	if (config->dv_flow_en) {
1334 		DRV_LOG(WARNING, "DV flow is not supported");
1335 		config->dv_flow_en = 0;
1336 	}
1337 #endif
1338 	config->ind_table_max_size =
1339 		sh->device_attr.max_rwq_indirection_table_size;
1340 	/*
1341 	 * Remove this check once DPDK supports larger/variable
1342 	 * indirection tables.
1343 	 */
1344 	if (config->ind_table_max_size > (unsigned int)ETH_RSS_RETA_SIZE_512)
1345 		config->ind_table_max_size = ETH_RSS_RETA_SIZE_512;
1346 	DRV_LOG(DEBUG, "maximum Rx indirection table size is %u",
1347 		config->ind_table_max_size);
1348 	config->hw_vlan_strip = !!(sh->device_attr.raw_packet_caps &
1349 				  IBV_RAW_PACKET_CAP_CVLAN_STRIPPING);
1350 	DRV_LOG(DEBUG, "VLAN stripping is %ssupported",
1351 		(config->hw_vlan_strip ? "" : "not "));
1352 	config->hw_fcs_strip = !!(sh->device_attr.raw_packet_caps &
1353 				 IBV_RAW_PACKET_CAP_SCATTER_FCS);
1354 #if defined(HAVE_IBV_WQ_FLAG_RX_END_PADDING)
1355 	hw_padding = !!sh->device_attr.rx_pad_end_addr_align;
1356 #elif defined(HAVE_IBV_WQ_FLAGS_PCI_WRITE_END_PADDING)
1357 	hw_padding = !!(sh->device_attr.device_cap_flags_ex &
1358 			IBV_DEVICE_PCI_WRITE_END_PADDING);
1359 #endif
1360 	if (config->hw_padding && !hw_padding) {
1361 		DRV_LOG(DEBUG, "Rx end alignment padding isn't supported");
1362 		config->hw_padding = 0;
1363 	} else if (config->hw_padding) {
1364 		DRV_LOG(DEBUG, "Rx end alignment padding is enabled");
1365 	}
1366 	config->tso = (sh->device_attr.max_tso > 0 &&
1367 		      (sh->device_attr.tso_supported_qpts &
1368 		       (1 << IBV_QPT_RAW_PACKET)));
1369 	if (config->tso)
1370 		config->tso_max_payload_sz = sh->device_attr.max_tso;
1371 	/*
1372 	 * MPW is disabled by default, while the Enhanced MPW is enabled
1373 	 * by default.
1374 	 */
1375 	if (config->mps == MLX5_ARG_UNSET)
1376 		config->mps = (mps == MLX5_MPW_ENHANCED) ? MLX5_MPW_ENHANCED :
1377 							  MLX5_MPW_DISABLED;
1378 	else
1379 		config->mps = config->mps ? mps : MLX5_MPW_DISABLED;
1380 	DRV_LOG(INFO, "%sMPS is %s",
1381 		config->mps == MLX5_MPW_ENHANCED ? "enhanced " :
1382 		config->mps == MLX5_MPW ? "legacy " : "",
1383 		config->mps != MLX5_MPW_DISABLED ? "enabled" : "disabled");
1384 	if (config->devx) {
1385 		err = mlx5_devx_cmd_query_hca_attr(sh->ctx, &config->hca_attr);
1386 		if (err) {
1387 			err = -err;
1388 			goto error;
1389 		}
1390 		/* Check relax ordering support. */
1391 		if (!haswell_broadwell_cpu) {
1392 			sh->cmng.relaxed_ordering_write =
1393 				config->hca_attr.relaxed_ordering_write;
1394 			sh->cmng.relaxed_ordering_read =
1395 				config->hca_attr.relaxed_ordering_read;
1396 		} else {
1397 			sh->cmng.relaxed_ordering_read = 0;
1398 			sh->cmng.relaxed_ordering_write = 0;
1399 		}
1400 		sh->rq_ts_format = config->hca_attr.rq_ts_format;
1401 		sh->sq_ts_format = config->hca_attr.sq_ts_format;
1402 		sh->qp_ts_format = config->hca_attr.qp_ts_format;
1403 		/* Check for LRO support. */
1404 		if (config->dest_tir && config->hca_attr.lro_cap &&
1405 		    config->dv_flow_en) {
1406 			/* TBD check tunnel lro caps. */
1407 			config->lro.supported = config->hca_attr.lro_cap;
1408 			DRV_LOG(DEBUG, "Device supports LRO");
1409 			/*
1410 			 * If LRO timeout is not configured by application,
1411 			 * use the minimal supported value.
1412 			 */
1413 			if (!config->lro.timeout)
1414 				config->lro.timeout =
1415 				config->hca_attr.lro_timer_supported_periods[0];
1416 			DRV_LOG(DEBUG, "LRO session timeout set to %d usec",
1417 				config->lro.timeout);
1418 			DRV_LOG(DEBUG, "LRO minimal size of TCP segment "
1419 				"required for coalescing is %d bytes",
1420 				config->hca_attr.lro_min_mss_size);
1421 		}
1422 #if defined(HAVE_MLX5DV_DR) && \
1423 	(defined(HAVE_MLX5_DR_CREATE_ACTION_FLOW_METER) || \
1424 	 defined(HAVE_MLX5_DR_CREATE_ACTION_ASO))
1425 		if (config->hca_attr.qos.sup &&
1426 		    config->hca_attr.qos.flow_meter_old &&
1427 		    config->dv_flow_en) {
1428 			uint8_t reg_c_mask =
1429 				config->hca_attr.qos.flow_meter_reg_c_ids;
1430 			/*
1431 			 * Meter needs two REG_C's for color match and pre-sfx
1432 			 * flow match. Here get the REG_C for color match.
1433 			 * REG_C_0 and REG_C_1 are reserved for the metadata feature.
1434 			 */
1435 			reg_c_mask &= 0xfc;
1436 			if (__builtin_popcount(reg_c_mask) < 1) {
1437 				priv->mtr_en = 0;
1438 				DRV_LOG(WARNING, "No available register for"
1439 					" meter.");
1440 			} else {
1441 				/*
1442 				 * The meter color register is used by the
1443 				 * flow-hit feature as well.
1444 				 * The flow-hit feature must use REG_C_3,
1445 				 * so prefer REG_C_3 if it is available.
1446 				 */
1447 				if (reg_c_mask & (1 << (REG_C_3 - REG_C_0)))
1448 					priv->mtr_color_reg = REG_C_3;
1449 				else
1450 					priv->mtr_color_reg = ffs(reg_c_mask)
1451 							      - 1 + REG_C_0;
1452 				priv->mtr_en = 1;
1453 				priv->mtr_reg_share =
1454 				      config->hca_attr.qos.flow_meter;
1455 				DRV_LOG(DEBUG, "The REG_C index used by the meter is %d",
1456 					priv->mtr_color_reg);
1457 			}
1458 		}
1459 		if (config->hca_attr.qos.sup &&
1460 			config->hca_attr.qos.flow_meter_aso_sup) {
1461 			uint32_t log_obj_size =
1462 				rte_log2_u32(MLX5_ASO_MTRS_PER_POOL >> 1);
1463 			if (log_obj_size >=
1464 			config->hca_attr.qos.log_meter_aso_granularity &&
1465 			log_obj_size <=
1466 			config->hca_attr.qos.log_meter_aso_max_alloc)
1467 				sh->meter_aso_en = 1;
1468 		}
1469 		if (priv->mtr_en) {
1470 			err = mlx5_aso_flow_mtrs_mng_init(priv->sh);
1471 			if (err) {
1472 				err = -err;
1473 				goto error;
1474 			}
1475 		}
1476 		if (config->hca_attr.flow.tunnel_header_0_1)
1477 			sh->tunnel_header_0_1 = 1;
1478 #endif
1479 #ifdef HAVE_MLX5_DR_CREATE_ACTION_ASO
1480 		if (config->hca_attr.flow_hit_aso &&
1481 		    priv->mtr_color_reg == REG_C_3) {
1482 			sh->flow_hit_aso_en = 1;
1483 			err = mlx5_flow_aso_age_mng_init(sh);
1484 			if (err) {
1485 				err = -err;
1486 				goto error;
1487 			}
1488 			DRV_LOG(DEBUG, "Flow Hit ASO is supported.");
1489 		}
1490 #endif /* HAVE_MLX5_DR_CREATE_ACTION_ASO */
1491 #if defined(HAVE_MLX5_DR_CREATE_ACTION_ASO) && \
1492 	defined(HAVE_MLX5_DR_ACTION_ASO_CT)
1493 		if (config->hca_attr.ct_offload &&
1494 		    priv->mtr_color_reg == REG_C_3) {
1495 			err = mlx5_flow_aso_ct_mng_init(sh);
1496 			if (err) {
1497 				err = -err;
1498 				goto error;
1499 			}
1500 			DRV_LOG(DEBUG, "CT ASO is supported.");
1501 			sh->ct_aso_en = 1;
1502 		}
1503 #endif /* HAVE_MLX5_DR_CREATE_ACTION_ASO && HAVE_MLX5_DR_ACTION_ASO_CT */
1504 #if defined(HAVE_MLX5DV_DR) && defined(HAVE_MLX5_DR_CREATE_ACTION_FLOW_SAMPLE)
1505 		if (config->hca_attr.log_max_ft_sampler_num > 0  &&
1506 		    config->dv_flow_en) {
1507 			priv->sampler_en = 1;
1508 			DRV_LOG(DEBUG, "Sampler enabled!");
1509 		} else {
1510 			priv->sampler_en = 0;
1511 			if (!config->hca_attr.log_max_ft_sampler_num)
1512 				DRV_LOG(WARNING,
1513 					"No available register for sampler.");
1514 			else
1515 				DRV_LOG(DEBUG, "DV flow is not supported!");
1516 		}
1517 #endif
1518 	}
1519 	if (config->cqe_comp && RTE_CACHE_LINE_SIZE == 128 &&
1520 	    !(dv_attr.flags & MLX5DV_CONTEXT_FLAGS_CQE_128B_COMP)) {
1521 		DRV_LOG(WARNING, "Rx CQE 128B compression is not supported");
1522 		config->cqe_comp = 0;
1523 	}
1524 	if (config->cqe_comp_fmt == MLX5_CQE_RESP_FORMAT_FTAG_STRIDX &&
1525 	    (!config->devx || !config->hca_attr.mini_cqe_resp_flow_tag)) {
1526 		DRV_LOG(WARNING, "Flow Tag CQE compression"
1527 				 " format isn't supported.");
1528 		config->cqe_comp = 0;
1529 	}
1530 	if (config->cqe_comp_fmt == MLX5_CQE_RESP_FORMAT_L34H_STRIDX &&
1531 	    (!config->devx || !config->hca_attr.mini_cqe_resp_l3_l4_tag)) {
1532 		DRV_LOG(WARNING, "L3/L4 Header CQE compression"
1533 				 " format isn't supported.");
1534 		config->cqe_comp = 0;
1535 	}
1536 	DRV_LOG(DEBUG, "Rx CQE compression is %ssupported",
1537 			config->cqe_comp ? "" : "not ");
1538 	if (config->tx_pp) {
1539 		DRV_LOG(DEBUG, "Timestamp counter frequency %u kHz",
1540 			config->hca_attr.dev_freq_khz);
1541 		DRV_LOG(DEBUG, "Packet pacing is %ssupported",
1542 			config->hca_attr.qos.packet_pacing ? "" : "not ");
1543 		DRV_LOG(DEBUG, "Cross channel ops are %ssupported",
1544 			config->hca_attr.cross_channel ? "" : "not ");
1545 		DRV_LOG(DEBUG, "WQE index ignore is %ssupported",
1546 			config->hca_attr.wqe_index_ignore ? "" : "not ");
1547 		DRV_LOG(DEBUG, "Non-wire SQ feature is %ssupported",
1548 			config->hca_attr.non_wire_sq ? "" : "not ");
1549 		DRV_LOG(DEBUG, "Static WQE SQ feature is %ssupported (%d)",
1550 			config->hca_attr.log_max_static_sq_wq ? "" : "not ",
1551 			config->hca_attr.log_max_static_sq_wq);
1552 		DRV_LOG(DEBUG, "WQE rate PP mode is %ssupported",
1553 			config->hca_attr.qos.wqe_rate_pp ? "" : "not ");
1554 		if (!config->devx) {
1555 			DRV_LOG(ERR, "DevX is required for packet pacing");
1556 			err = ENODEV;
1557 			goto error;
1558 		}
1559 		if (!config->hca_attr.qos.packet_pacing) {
1560 			DRV_LOG(ERR, "Packet pacing is not supported");
1561 			err = ENODEV;
1562 			goto error;
1563 		}
1564 		if (!config->hca_attr.cross_channel) {
1565 			DRV_LOG(ERR, "Cross channel operations are"
1566 				     " required for packet pacing");
1567 			err = ENODEV;
1568 			goto error;
1569 		}
1570 		if (!config->hca_attr.wqe_index_ignore) {
1571 			DRV_LOG(ERR, "WQE index ignore feature is"
1572 				     " required for packet pacing");
1573 			err = ENODEV;
1574 			goto error;
1575 		}
1576 		if (!config->hca_attr.non_wire_sq) {
1577 			DRV_LOG(ERR, "Non-wire SQ feature is"
1578 				     " required for packet pacing");
1579 			err = ENODEV;
1580 			goto error;
1581 		}
1582 		if (!config->hca_attr.log_max_static_sq_wq) {
1583 			DRV_LOG(ERR, "Static WQE SQ feature is"
1584 				     " required for packet pacing");
1585 			err = ENODEV;
1586 			goto error;
1587 		}
1588 		if (!config->hca_attr.qos.wqe_rate_pp) {
1589 			DRV_LOG(ERR, "WQE rate mode is required"
1590 				     " for packet pacing");
1591 			err = ENODEV;
1592 			goto error;
1593 		}
1594 #ifndef HAVE_MLX5DV_DEVX_UAR_OFFSET
1595 		DRV_LOG(ERR, "DevX does not provide UAR offset,"
1596 			     " can't create queues for packet pacing");
1597 		err = ENODEV;
1598 		goto error;
1599 #endif
1600 	}
1601 	if (config->devx) {
1602 		uint32_t reg[MLX5_ST_SZ_DW(register_mtutc)];
1603 
1604 		err = config->hca_attr.access_register_user ?
1605 			mlx5_devx_cmd_register_read
1606 				(sh->ctx, MLX5_REGISTER_ID_MTUTC, 0,
1607 				reg, MLX5_ST_SZ_DW(register_mtutc)) : ENOTSUP;
1608 		if (!err) {
1609 			uint32_t ts_mode;
1610 
1611 			/* MTUTC register is read successfully. */
1612 			ts_mode = MLX5_GET(register_mtutc, reg,
1613 					   time_stamp_mode);
1614 			if (ts_mode == MLX5_MTUTC_TIMESTAMP_MODE_REAL_TIME)
1615 				config->rt_timestamp = 1;
1616 		} else {
1617 			/* Kernel does not support register reading. */
1618 			if (config->hca_attr.dev_freq_khz ==
1619 						 (NS_PER_S / MS_PER_S))
1620 				config->rt_timestamp = 1;
1621 		}
1622 	}
1623 	/*
1624 	 * If the HW has a bug working with tunnel packet decapsulation and
1625 	 * scatter FCS, and decapsulation is needed, clear the hw_fcs_strip
1626 	 * bit. Then the DEV_RX_OFFLOAD_KEEP_CRC bit will not be set anymore.
1627 	 */
1628 	if (config->hca_attr.scatter_fcs_w_decap_disable && config->decap_en)
1629 		config->hw_fcs_strip = 0;
1630 	DRV_LOG(DEBUG, "FCS stripping configuration is %ssupported",
1631 		(config->hw_fcs_strip ? "" : "not "));
1632 	if (config->mprq.enabled && mprq) {
1633 		if (config->mprq.stride_num_n &&
1634 		    (config->mprq.stride_num_n > mprq_max_stride_num_n ||
1635 		     config->mprq.stride_num_n < mprq_min_stride_num_n)) {
1636 			config->mprq.stride_num_n =
1637 				RTE_MIN(RTE_MAX(MLX5_MPRQ_STRIDE_NUM_N,
1638 						mprq_min_stride_num_n),
1639 					mprq_max_stride_num_n);
1640 			DRV_LOG(WARNING,
1641 				"the number of strides"
1642 				" for Multi-Packet RQ is out of range,"
1643 				" setting default value (%u)",
1644 				1 << config->mprq.stride_num_n);
1645 		}
1646 		if (config->mprq.stride_size_n &&
1647 		    (config->mprq.stride_size_n > mprq_max_stride_size_n ||
1648 		     config->mprq.stride_size_n < mprq_min_stride_size_n)) {
1649 			config->mprq.stride_size_n =
1650 				RTE_MIN(RTE_MAX(MLX5_MPRQ_STRIDE_SIZE_N,
1651 						mprq_min_stride_size_n),
1652 					mprq_max_stride_size_n);
1653 			DRV_LOG(WARNING,
1654 				"the size of a stride"
1655 				" for Multi-Packet RQ is out of range,"
1656 				" setting default value (%u)",
1657 				1 << config->mprq.stride_size_n);
1658 		}
1659 		config->mprq.min_stride_size_n = mprq_min_stride_size_n;
1660 		config->mprq.max_stride_size_n = mprq_max_stride_size_n;
1661 	} else if (config->mprq.enabled && !mprq) {
1662 		DRV_LOG(WARNING, "Multi-Packet RQ isn't supported");
1663 		config->mprq.enabled = 0;
1664 	}
1665 	if (config->max_dump_files_num == 0)
1666 		config->max_dump_files_num = 128;
1667 	eth_dev = rte_eth_dev_allocate(name);
1668 	if (eth_dev == NULL) {
1669 		DRV_LOG(ERR, "can not allocate rte ethdev");
1670 		err = ENOMEM;
1671 		goto error;
1672 	}
1673 	if (priv->representor) {
1674 		eth_dev->data->dev_flags |= RTE_ETH_DEV_REPRESENTOR;
1675 		eth_dev->data->representor_id = priv->representor_id;
1676 	}
1677 	priv->mp_id.port_id = eth_dev->data->port_id;
1678 	strlcpy(priv->mp_id.name, MLX5_MP_NAME, RTE_MP_MAX_NAME_LEN);
1679 	/*
1680 	 * Store associated network device interface index. This index
1681 	 * is permanent throughout the lifetime of the device. So, we may
1682 	 * store the ifindex here and use the cached value afterwards.
1683 	 */
1684 	MLX5_ASSERT(spawn->ifindex);
1685 	priv->if_index = spawn->ifindex;
1686 	eth_dev->data->dev_private = priv;
1687 	priv->dev_data = eth_dev->data;
1688 	eth_dev->data->mac_addrs = priv->mac;
1689 	eth_dev->device = dpdk_dev;
1690 	eth_dev->data->dev_flags |= RTE_ETH_DEV_AUTOFILL_QUEUE_XSTATS;
1691 	/* Configure the first MAC address by default. */
1692 	if (mlx5_get_mac(eth_dev, &mac.addr_bytes)) {
1693 		DRV_LOG(ERR,
1694 			"port %u cannot get MAC address, is mlx5_en"
1695 			" loaded? (errno: %s)",
1696 			eth_dev->data->port_id, strerror(rte_errno));
1697 		err = ENODEV;
1698 		goto error;
1699 	}
1700 	DRV_LOG(INFO,
1701 		"port %u MAC address is %02x:%02x:%02x:%02x:%02x:%02x",
1702 		eth_dev->data->port_id,
1703 		mac.addr_bytes[0], mac.addr_bytes[1],
1704 		mac.addr_bytes[2], mac.addr_bytes[3],
1705 		mac.addr_bytes[4], mac.addr_bytes[5]);
1706 #ifdef RTE_LIBRTE_MLX5_DEBUG
1707 	{
1708 		char ifname[MLX5_NAMESIZE];
1709 
1710 		if (mlx5_get_ifname(eth_dev, &ifname) == 0)
1711 			DRV_LOG(DEBUG, "port %u ifname is \"%s\"",
1712 				eth_dev->data->port_id, ifname);
1713 		else
1714 			DRV_LOG(DEBUG, "port %u ifname is unknown",
1715 				eth_dev->data->port_id);
1716 	}
1717 #endif
1718 	/* Get actual MTU if possible. */
1719 	err = mlx5_get_mtu(eth_dev, &priv->mtu);
1720 	if (err) {
1721 		err = rte_errno;
1722 		goto error;
1723 	}
1724 	DRV_LOG(DEBUG, "port %u MTU is %u", eth_dev->data->port_id,
1725 		priv->mtu);
1726 	/* Initialize burst functions to prevent crashes before link-up. */
1727 	eth_dev->rx_pkt_burst = removed_rx_burst;
1728 	eth_dev->tx_pkt_burst = removed_tx_burst;
1729 	eth_dev->dev_ops = &mlx5_dev_ops;
1730 	eth_dev->rx_descriptor_status = mlx5_rx_descriptor_status;
1731 	eth_dev->tx_descriptor_status = mlx5_tx_descriptor_status;
1732 	eth_dev->rx_queue_count = mlx5_rx_queue_count;
1733 	/* Register MAC address. */
1734 	claim_zero(mlx5_mac_addr_add(eth_dev, &mac, 0, 0));
1735 	if (config->vf && config->vf_nl_en)
1736 		mlx5_nl_mac_addr_sync(priv->nl_socket_route,
1737 				      mlx5_ifindex(eth_dev),
1738 				      eth_dev->data->mac_addrs,
1739 				      MLX5_MAX_MAC_ADDRESSES);
1740 	priv->ctrl_flows = 0;
1741 	rte_spinlock_init(&priv->flow_list_lock);
1742 	TAILQ_INIT(&priv->flow_meters);
1743 	priv->mtr_profile_tbl = mlx5_l3t_create(MLX5_L3T_TYPE_PTR);
1744 	if (!priv->mtr_profile_tbl)
1745 		goto error;
1746 	/* Hint libmlx5 to use PMD allocator for data plane resources */
1747 	mlx5_glue->dv_set_context_attr(sh->ctx,
1748 			MLX5DV_CTX_ATTR_BUF_ALLOCATORS,
1749 			(void *)((uintptr_t)&(struct mlx5dv_ctx_allocators){
1750 				.alloc = &mlx5_alloc_verbs_buf,
1751 				.free = &mlx5_free_verbs_buf,
1752 				.data = sh,
1753 			}));
1754 	/* Bring Ethernet device up. */
1755 	DRV_LOG(DEBUG, "port %u forcing Ethernet interface up",
1756 		eth_dev->data->port_id);
1757 	mlx5_set_link_up(eth_dev);
1758 	/*
1759 	 * Even though the interrupt handler is not installed yet,
1760 	 * interrupts will still trigger on the async_fd from
1761 	 * Verbs context returned by ibv_open_device().
1762 	 */
1763 	mlx5_link_update(eth_dev, 0);
1764 #ifdef HAVE_MLX5DV_DR_ESWITCH
1765 	if (!(config->hca_attr.eswitch_manager && config->dv_flow_en &&
1766 	      (switch_info->representor || switch_info->master)))
1767 		config->dv_esw_en = 0;
1768 #else
1769 	config->dv_esw_en = 0;
1770 #endif
1771 	/* Detect minimal data bytes to inline. */
1772 	mlx5_set_min_inline(spawn, config);
1773 	/* Store device configuration on private structure. */
1774 	priv->config = *config;
1775 	for (i = 0; i < MLX5_FLOW_TYPE_MAXI; i++) {
1776 		icfg[i].release_mem_en = !!config->reclaim_mode;
1777 		if (config->reclaim_mode)
1778 			icfg[i].per_core_cache = 0;
1779 		priv->flows[i] = mlx5_ipool_create(&icfg[i]);
1780 		if (!priv->flows[i])
1781 			goto error;
1782 	}
1783 	/* Create context for virtual machine VLAN workaround. */
1784 	priv->vmwa_context = mlx5_vlan_vmwa_init(eth_dev, spawn->ifindex);
1785 	if (config->dv_flow_en) {
1786 		err = mlx5_alloc_shared_dr(priv);
1787 		if (err)
1788 			goto error;
1789 	}
1790 	if (config->devx && config->dv_flow_en && config->dest_tir) {
1791 		priv->obj_ops = devx_obj_ops;
1792 		priv->obj_ops.drop_action_create =
1793 						ibv_obj_ops.drop_action_create;
1794 		priv->obj_ops.drop_action_destroy =
1795 						ibv_obj_ops.drop_action_destroy;
1796 #ifndef HAVE_MLX5DV_DEVX_UAR_OFFSET
1797 		priv->obj_ops.txq_obj_modify = ibv_obj_ops.txq_obj_modify;
1798 #else
1799 		if (config->dv_esw_en)
1800 			priv->obj_ops.txq_obj_modify =
1801 						ibv_obj_ops.txq_obj_modify;
1802 #endif
1803 		/* Use specific wrappers for Tx object. */
1804 		priv->obj_ops.txq_obj_new = mlx5_os_txq_obj_new;
1805 		priv->obj_ops.txq_obj_release = mlx5_os_txq_obj_release;
1806 		mlx5_queue_counter_id_prepare(eth_dev);
1807 		priv->obj_ops.lb_dummy_queue_create =
1808 					mlx5_rxq_ibv_obj_dummy_lb_create;
1809 		priv->obj_ops.lb_dummy_queue_release =
1810 					mlx5_rxq_ibv_obj_dummy_lb_release;
1811 	} else {
1812 		priv->obj_ops = ibv_obj_ops;
1813 	}
1814 	priv->drop_queue.hrxq = mlx5_drop_action_create(eth_dev);
1815 	if (!priv->drop_queue.hrxq)
1816 		goto error;
1817 	/* Supported Verbs flow priority number detection. */
1818 	err = mlx5_flow_discover_priorities(eth_dev);
1819 	if (err < 0) {
1820 		err = -err;
1821 		goto error;
1822 	}
1823 	priv->config.flow_prio = err;
1824 	if (!priv->config.dv_esw_en &&
1825 	    priv->config.dv_xmeta_en != MLX5_XMETA_MODE_LEGACY) {
1826 		DRV_LOG(WARNING, "metadata mode %u is not supported "
1827 				 "(no E-Switch)", priv->config.dv_xmeta_en);
1828 		priv->config.dv_xmeta_en = MLX5_XMETA_MODE_LEGACY;
1829 	}
1830 	mlx5_set_metadata_mask(eth_dev);
1831 	if (priv->config.dv_xmeta_en != MLX5_XMETA_MODE_LEGACY &&
1832 	    !priv->sh->dv_regc0_mask) {
1833 		DRV_LOG(ERR, "metadata mode %u is not supported "
1834 			     "(no metadata reg_c[0] is available)",
1835 			     priv->config.dv_xmeta_en);
1836 			err = ENOTSUP;
1837 			goto error;
1838 	}
1839 	priv->hrxqs = mlx5_list_create("hrxq", eth_dev, mlx5_hrxq_create_cb,
1840 				       mlx5_hrxq_match_cb,
1841 				       mlx5_hrxq_remove_cb,
1842 				       mlx5_hrxq_clone_cb,
1843 				       mlx5_hrxq_clone_free_cb);
1844 	if (!priv->hrxqs)
1845 		goto error;
1846 	rte_rwlock_init(&priv->ind_tbls_lock);
1847 	/* Query availability of metadata reg_c's. */
1848 	err = mlx5_flow_discover_mreg_c(eth_dev);
1849 	if (err < 0) {
1850 		err = -err;
1851 		goto error;
1852 	}
1853 	if (!mlx5_flow_ext_mreg_supported(eth_dev)) {
1854 		DRV_LOG(DEBUG,
1855 			"port %u extensive metadata register is not supported",
1856 			eth_dev->data->port_id);
1857 		if (priv->config.dv_xmeta_en != MLX5_XMETA_MODE_LEGACY) {
1858 			DRV_LOG(ERR, "metadata mode %u is not supported "
1859 				     "(no metadata registers available)",
1860 				     priv->config.dv_xmeta_en);
1861 			err = ENOTSUP;
1862 			goto error;
1863 		}
1864 	}
1865 	if (priv->config.dv_flow_en &&
1866 	    priv->config.dv_xmeta_en != MLX5_XMETA_MODE_LEGACY &&
1867 	    mlx5_flow_ext_mreg_supported(eth_dev) &&
1868 	    priv->sh->dv_regc0_mask) {
1869 		priv->mreg_cp_tbl = mlx5_hlist_create(MLX5_FLOW_MREG_HNAME,
1870 						      MLX5_FLOW_MREG_HTABLE_SZ,
1871 						      0, 0,
1872 						      flow_dv_mreg_create_cb,
1873 						      flow_dv_mreg_match_cb,
1874 						      flow_dv_mreg_remove_cb);
1875 		if (!priv->mreg_cp_tbl) {
1876 			err = ENOMEM;
1877 			goto error;
1878 		}
1879 		priv->mreg_cp_tbl->ctx = eth_dev;
1880 	}
1881 	rte_spinlock_init(&priv->shared_act_sl);
1882 	mlx5_flow_counter_mode_config(eth_dev);
1883 	if (priv->config.dv_flow_en)
1884 		eth_dev->data->dev_flags |= RTE_ETH_DEV_FLOW_OPS_THREAD_SAFE;
1885 	return eth_dev;
1886 error:
1887 	if (priv) {
1888 		if (priv->mreg_cp_tbl)
1889 			mlx5_hlist_destroy(priv->mreg_cp_tbl);
1890 		if (priv->sh)
1891 			mlx5_os_free_shared_dr(priv);
1892 		if (priv->nl_socket_route >= 0)
1893 			close(priv->nl_socket_route);
1894 		if (priv->nl_socket_rdma >= 0)
1895 			close(priv->nl_socket_rdma);
1896 		if (priv->vmwa_context)
1897 			mlx5_vlan_vmwa_exit(priv->vmwa_context);
1898 		if (eth_dev && priv->drop_queue.hrxq)
1899 			mlx5_drop_action_destroy(eth_dev);
1900 		if (priv->mtr_profile_tbl)
1901 			mlx5_l3t_destroy(priv->mtr_profile_tbl);
1902 		if (own_domain_id)
1903 			claim_zero(rte_eth_switch_domain_free(priv->domain_id));
1904 		if (priv->hrxqs)
1905 			mlx5_list_destroy(priv->hrxqs);
1906 		mlx5_free(priv);
1907 		if (eth_dev != NULL)
1908 			eth_dev->data->dev_private = NULL;
1909 	}
1910 	if (eth_dev != NULL) {
1911 		/* mac_addrs must not be freed alone because it is part of
1912 		 * dev_private.
1913 		 */
1914 		eth_dev->data->mac_addrs = NULL;
1915 		rte_eth_dev_release_port(eth_dev);
1916 	}
1917 	if (sh)
1918 		mlx5_free_shared_dev_ctx(sh);
1919 	MLX5_ASSERT(err > 0);
1920 	rte_errno = err;
1921 	return NULL;
1922 }
1923 
1924 /**
1925  * Comparison callback to sort device data.
1926  *
1927  * This is meant to be used with qsort().
1928  *
1929  * @param[in] a
1930  *   Pointer to pointer to first data object.
1931  * @param[in] b
1932  *   Pointer to pointer to second data object.
1933  *
1934  * @return
1935  *   0 if both objects are equal, less than 0 if the first argument is less
1936  *   than the second, greater than 0 otherwise.
1937  */
1938 static int
1939 mlx5_dev_spawn_data_cmp(const void *a, const void *b)
1940 {
1941 	const struct mlx5_switch_info *si_a =
1942 		&((const struct mlx5_dev_spawn_data *)a)->info;
1943 	const struct mlx5_switch_info *si_b =
1944 		&((const struct mlx5_dev_spawn_data *)b)->info;
1945 	int ret;
1946 
1947 	/* Master device first. */
1948 	ret = si_b->master - si_a->master;
1949 	if (ret)
1950 		return ret;
1951 	/* Then representor devices. */
1952 	ret = si_b->representor - si_a->representor;
1953 	if (ret)
1954 		return ret;
1955 	/* Unidentified devices come last in no specific order. */
1956 	if (!si_a->representor)
1957 		return 0;
1958 	/* Order representors by name. */
1959 	return si_a->port_name - si_b->port_name;
1960 }
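
/*
 * Illustrative result (port names hypothetical): given spawn entries
 * { representor "pf0vf1", master, representor "pf0vf0" }, qsort() with
 * this comparator yields { master, "pf0vf0", "pf0vf1" }: the master
 * first, then representors ordered by ascending port name.
 */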
1961 
1962 /**
1963  * Match PCI information for possible slaves of bonding device.
1964  *
1965  * @param[in] ibv_dev
1966  *   Pointer to Infiniband device structure.
1967  * @param[in] pci_dev
1968  *   Pointer to primary PCI address structure to match.
1969  * @param[in] nl_rdma
1970  *   Netlink RDMA group socket handle.
1971  * @param[in] owner
1972  *   Representor owner PF index.
1973  * @param[out] bond_info
1974  *   Pointer to bonding information.
1975  *
1976  * @return
1977  *   negative value if no bonding device found, otherwise
1978  *   positive index of slave PF in bonding.
1979  */
1980 static int
1981 mlx5_device_bond_pci_match(const struct ibv_device *ibv_dev,
1982 			   const struct rte_pci_addr *pci_dev,
1983 			   int nl_rdma, uint16_t owner,
1984 			   struct mlx5_bond_info *bond_info)
1985 {
1986 	char ifname[IF_NAMESIZE + 1];
1987 	unsigned int ifindex;
1988 	unsigned int np, i;
1989 	FILE *bond_file = NULL, *file;
1990 	int pf = -1;
1991 	int ret;
1992 
1993 	/*
1994 	 * Try to get master device name. If something goes
1995 	 * wrong suppose the lack of kernel support and no
1996 	 * bonding devices.
1997 	 */
1998 	memset(bond_info, 0, sizeof(*bond_info));
1999 	if (nl_rdma < 0)
2000 		return -1;
2001 	if (!strstr(ibv_dev->name, "bond"))
2002 		return -1;
2003 	np = mlx5_nl_portnum(nl_rdma, ibv_dev->name);
2004 	if (!np)
2005 		return -1;
2006 	/*
2007 	 * The master device might not be on the predefined
2008 	 * port (port index 1 is not guaranteed), so we have
2009 	 * to scan all Infiniband device ports and find the
2010 	 * master.
2011 	 */
2012 	for (i = 1; i <= np; ++i) {
2013 		/* Check whether Infiniband port is populated. */
2014 		ifindex = mlx5_nl_ifindex(nl_rdma, ibv_dev->name, i);
2015 		if (!ifindex)
2016 			continue;
2017 		if (!if_indextoname(ifindex, ifname))
2018 			continue;
2019 		/* Try to read bonding slave names from sysfs. */
2020 		MKSTR(slaves,
2021 		      "/sys/class/net/%s/master/bonding/slaves", ifname);
2022 		bond_file = fopen(slaves, "r");
2023 		if (bond_file)
2024 			break;
2025 	}
2026 	if (!bond_file)
2027 		return -1;
2028 	/* Use safe format to check maximal buffer length. */
2029 	MLX5_ASSERT(atol(RTE_STR(IF_NAMESIZE)) == IF_NAMESIZE);
2030 	while (fscanf(bond_file, "%" RTE_STR(IF_NAMESIZE) "s", ifname) == 1) {
2031 		char tmp_str[IF_NAMESIZE + 32];
2032 		struct rte_pci_addr pci_addr;
2033 		struct mlx5_switch_info	info;
2034 
2035 		/* Process slave interface names in the loop. */
2036 		snprintf(tmp_str, sizeof(tmp_str),
2037 			 "/sys/class/net/%s", ifname);
2038 		if (mlx5_dev_to_pci_addr(tmp_str, &pci_addr)) {
2039 			DRV_LOG(WARNING, "can not get PCI address"
2040 					 " for netdev \"%s\"", ifname);
2041 			continue;
2042 		}
2043 		/* Slave interface PCI address resolved, check port name. */
2044 		snprintf(tmp_str, sizeof(tmp_str),
2045 			 "/sys/class/net/%s/phys_port_name", ifname);
2046 		file = fopen(tmp_str, "rb");
2047 		if (!file)
2048 			break;
2049 		info.name_type = MLX5_PHYS_PORT_NAME_TYPE_NOTSET;
2050 		if (fscanf(file, "%32s", tmp_str) == 1)
2051 			mlx5_translate_port_name(tmp_str, &info);
2052 		fclose(file);
2053 		/* Only process PF ports. */
2054 		if (info.name_type != MLX5_PHYS_PORT_NAME_TYPE_LEGACY &&
2055 		    info.name_type != MLX5_PHYS_PORT_NAME_TYPE_UPLINK)
2056 			continue;
2057 		/* Check max bonding member. */
2058 		if (info.port_name >= MLX5_BOND_MAX_PORTS) {
2059 			DRV_LOG(WARNING, "bonding index out of range, "
2060 				"please increase MLX5_BOND_MAX_PORTS: %s",
2061 				tmp_str);
2062 			break;
2063 		}
2064 		/* Match PCI address, allows BDF0+pfx or BDFx+pfx. */
2065 		if (pci_dev->domain == pci_addr.domain &&
2066 		    pci_dev->bus == pci_addr.bus &&
2067 		    pci_dev->devid == pci_addr.devid &&
2068 		    ((pci_dev->function == 0 &&
2069 		      pci_dev->function + owner == pci_addr.function) ||
2070 		     (pci_dev->function == owner &&
2071 		      pci_addr.function == owner)))
2072 			pf = info.port_name;
2073 		/* Get ifindex. */
2074 		snprintf(tmp_str, sizeof(tmp_str),
2075 			 "/sys/class/net/%s/ifindex", ifname);
2076 		file = fopen(tmp_str, "rb");
2077 		if (!file)
2078 			break;
2079 		ret = fscanf(file, "%u", &ifindex);
2080 		fclose(file);
2081 		if (ret != 1)
2082 			break;
2083 		/* Save bonding info. */
2084 		strncpy(bond_info->ports[info.port_name].ifname, ifname,
2085 			sizeof(bond_info->ports[0].ifname));
2086 		bond_info->ports[info.port_name].pci_addr = pci_addr;
2087 		bond_info->ports[info.port_name].ifindex = ifindex;
2088 		bond_info->n_port++;
2089 	}
2090 	if (pf >= 0) {
2091 		/* Get bond interface info */
2092 		ret = mlx5_sysfs_bond_info(ifindex, &bond_info->ifindex,
2093 					   bond_info->ifname);
2094 		if (ret)
2095 			DRV_LOG(ERR, "unable to get bond info: %s",
2096 				strerror(rte_errno));
2097 		else
2098 			DRV_LOG(INFO, "PF device %u, bond device %u(%s)",
2099 				ifindex, bond_info->ifindex, bond_info->ifname);
2100 	}
2101 	return pf;
2102 }
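
/*
 * Illustrative sysfs layout scanned by the function above (interface
 * names and values hypothetical):
 *
 *	/sys/class/net/eth2/master/bonding/slaves -> "eth2 eth3"
 *	/sys/class/net/eth2/phys_port_name        -> "p0" (uplink of PF#0)
 *	/sys/class/net/eth2/ifindex               -> "42"
 *
 * The slave whose PCI address matches the probed BDF (plus the owner
 * PF offset) determines the returned PF index.
 */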
2103 
2104 /**
2105  * Register a PCI device within bonding.
2106  *
2107  * This function spawns Ethernet devices out of a given PCI device and
2108  * bonding owner PF index.
2109  *
2110  * @param[in] pci_dev
2111  *   PCI device information.
2112  * @param[in] req_eth_da
2113  *   Requested ethdev device argument.
2114  * @param[in] owner_id
2115  *   Requested owner PF port ID within bonding device, defaults to 0.
2116  *
2117  * @return
2118  *   0 on success, a negative errno value otherwise and rte_errno is set.
2119  */
2120 static int
2121 mlx5_os_pci_probe_pf(struct rte_pci_device *pci_dev,
2122 		     struct rte_eth_devargs *req_eth_da,
2123 		     uint16_t owner_id)
2124 {
2125 	struct ibv_device **ibv_list;
2126 	/*
2127 	 * Number of found IB devices matching the requested PCI BDF.
2128 	 * nd != 1 means there are multiple IB devices over the same
2129 	 * PCI device and we have representors and master.
2130 	 */
2131 	unsigned int nd = 0;
2132 	/*
2133 	 * Number of found IB device ports. nd = 1 and np = 1..n means
2134 	 * we have a single multiport IB device, and there may be
2135 	 * representors attached to some of the found ports.
2136 	 */
2137 	unsigned int np = 0;
2138 	/*
2139 	 * Number of DPDK ethernet devices to spawn - either over
2140 	 * multiple IB devices or multiple ports of single IB device.
2141 	 * Actually this is the number of iterations to spawn.
2142 	 */
2143 	unsigned int ns = 0;
2144 	/*
2145 	 * Bonding device
2146 	 *   < 0 - no bonding device (single one)
2147 	 *  >= 0 - bonding device (value is slave PF index)
2148 	 */
2149 	int bd = -1;
2150 	struct mlx5_dev_spawn_data *list = NULL;
2151 	struct mlx5_dev_config dev_config;
2152 	unsigned int dev_config_vf;
2153 	struct rte_eth_devargs eth_da = *req_eth_da;
2154 	struct rte_pci_addr owner_pci = pci_dev->addr; /* Owner PF. */
2155 	struct mlx5_bond_info bond_info;
2156 	int ret = -1;
2157 
2158 	if (rte_eal_process_type() == RTE_PROC_PRIMARY)
2159 		mlx5_pmd_socket_init();
2160 	ret = mlx5_init_once();
2161 	if (ret) {
2162 		DRV_LOG(ERR, "unable to init PMD global data: %s",
2163 			strerror(rte_errno));
2164 		return -rte_errno;
2165 	}
2166 	errno = 0;
2167 	ibv_list = mlx5_glue->get_device_list(&ret);
2168 	if (!ibv_list) {
2169 		rte_errno = errno ? errno : ENOSYS;
2170 		DRV_LOG(ERR, "cannot list devices, is ib_uverbs loaded?");
2171 		return -rte_errno;
2172 	}
2173 	/*
2174 	 * First scan the list of all Infiniband devices to find
2175 	 * matching ones, gathering into the list.
2176 	 */
2177 	struct ibv_device *ibv_match[ret + 1];
2178 	int nl_route = mlx5_nl_init(NETLINK_ROUTE);
2179 	int nl_rdma = mlx5_nl_init(NETLINK_RDMA);
2180 	unsigned int i;
2181 
2182 	while (ret-- > 0) {
2183 		struct rte_pci_addr pci_addr;
2184 
2185 		DRV_LOG(DEBUG, "checking device \"%s\"", ibv_list[ret]->name);
2186 		bd = mlx5_device_bond_pci_match
2187 				(ibv_list[ret], &owner_pci, nl_rdma, owner_id,
2188 				 &bond_info);
2189 		if (bd >= 0) {
2190 			/*
2191 			 * Bonding device detected. Only one match is allowed;
2192 			 * bonding is supported over a multi-port IB device,
2193 			 * so there should be no matches on representor PCI
2194 			 * functions or non-VF-LAG bonding devices with the
2195 			 * specified address.
2196 			 */
2197 			if (nd) {
2198 				DRV_LOG(ERR,
2199 					"multiple PCI match on bonding device"
2200 					" \"%s\" found", ibv_list[ret]->name);
2201 				rte_errno = ENOENT;
2202 				ret = -rte_errno;
2203 				goto exit;
2204 			}
2205 			/* Amend owner pci address if owner PF ID specified. */
2206 			if (eth_da.nb_representor_ports)
2207 				owner_pci.function += owner_id;
2208 			DRV_LOG(INFO, "PCI information matches for"
2209 				      " slave %d bonding device \"%s\"",
2210 				      bd, ibv_list[ret]->name);
2211 			ibv_match[nd++] = ibv_list[ret];
2212 			break;
2213 		} else {
2214 			/* Bonding device not found. */
2215 			if (mlx5_dev_to_pci_addr
2216 				(ibv_list[ret]->ibdev_path, &pci_addr))
2217 				continue;
2218 			if (owner_pci.domain != pci_addr.domain ||
2219 			    owner_pci.bus != pci_addr.bus ||
2220 			    owner_pci.devid != pci_addr.devid ||
2221 			    owner_pci.function != pci_addr.function)
2222 				continue;
2223 			DRV_LOG(INFO, "PCI information matches for device \"%s\"",
2224 				ibv_list[ret]->name);
2225 			ibv_match[nd++] = ibv_list[ret];
2226 		}
2227 	}
2228 	ibv_match[nd] = NULL;
2229 	if (!nd) {
2230 		/* No device matches, just complain and bail out. */
2231 		DRV_LOG(WARNING,
2232 			"no Verbs device matches PCI device " PCI_PRI_FMT ","
2233 			" are kernel drivers loaded?",
2234 			owner_pci.domain, owner_pci.bus,
2235 			owner_pci.devid, owner_pci.function);
2236 		rte_errno = ENOENT;
2237 		ret = -rte_errno;
2238 		goto exit;
2239 	}
2240 	if (nd == 1) {
2241 		/*
2242 		 * The single matching device found may have multiple ports.
2243 		 * Each port may be a representor, so we have to check the
2244 		 * port number and the existence of representors.
2245 		 */
2246 		if (nl_rdma >= 0)
2247 			np = mlx5_nl_portnum(nl_rdma, ibv_match[0]->name);
2248 		if (!np)
2249 			DRV_LOG(WARNING, "can not get the port count of IB"
2250 					 " device \"%s\"", ibv_match[0]->name);
2251 		if (bd >= 0 && !np) {
2252 			DRV_LOG(ERR, "can not get ports"
2253 				     " for bonding device");
2254 			rte_errno = ENOENT;
2255 			ret = -rte_errno;
2256 			goto exit;
2257 		}
2258 	}
2259 #ifndef HAVE_MLX5DV_DR_DEVX_PORT
2260 	if (bd >= 0) {
2261 		/*
2262 		 * This may happen if there is VF LAG kernel support and
2263 		 * the application is compiled with an older rdma-core library.
2264 		 */
2265 		DRV_LOG(ERR,
2266 			"No kernel/verbs support for VF LAG bonding found.");
2267 		rte_errno = ENOTSUP;
2268 		ret = -rte_errno;
2269 		goto exit;
2270 	}
2271 #endif
2272 	/*
2273 	 * Now we can determine the maximal number
2274 	 * of devices to be spawned.
2275 	 */
2276 	list = mlx5_malloc(MLX5_MEM_ZERO,
2277 			   sizeof(struct mlx5_dev_spawn_data) *
2278 			   (np ? np : nd),
2279 			   RTE_CACHE_LINE_SIZE, SOCKET_ID_ANY);
2280 	if (!list) {
2281 		DRV_LOG(ERR, "spawn data array allocation failure");
2282 		rte_errno = ENOMEM;
2283 		ret = -rte_errno;
2284 		goto exit;
2285 	}
2286 	if (bd >= 0 || np > 1) {
2287 		/*
2288 		 * Single IB device with multiple ports found,
2289 		 * it may be E-Switch master device and representors.
2290 		 * We have to perform identification through the ports.
2291 		 */
2292 		MLX5_ASSERT(nl_rdma >= 0);
2293 		MLX5_ASSERT(ns == 0);
2294 		MLX5_ASSERT(nd == 1);
2295 		MLX5_ASSERT(np);
2296 		for (i = 1; i <= np; ++i) {
2297 			list[ns].bond_info = &bond_info;
2298 			list[ns].max_port = np;
2299 			list[ns].phys_port = i;
2300 			list[ns].phys_dev = ibv_match[0];
2301 			list[ns].eth_dev = NULL;
2302 			list[ns].pci_dev = pci_dev;
2303 			list[ns].pf_bond = bd;
2304 			list[ns].ifindex = mlx5_nl_ifindex
2305 				(nl_rdma,
2306 				mlx5_os_get_dev_device_name
2307 						(list[ns].phys_dev), i);
2308 			if (!list[ns].ifindex) {
2309 				/*
2310 				 * No network interface index found for the
2311 				 * specified port, it means there is no
2312 				 * representor on this port. It's OK,
2313 				 * there can be disabled ports, for example
2314 				 * if sriov_numvfs < sriov_totalvfs.
2315 				 */
2316 				continue;
2317 			}
2318 			ret = -1;
2319 			if (nl_route >= 0)
2320 				ret = mlx5_nl_switch_info
2321 					       (nl_route,
2322 						list[ns].ifindex,
2323 						&list[ns].info);
2324 			if (ret || (!list[ns].info.representor &&
2325 				    !list[ns].info.master)) {
2326 				/*
2327 				 * We failed to recognize representors with
2328 				 * Netlink, let's try to perform the task
2329 				 * with sysfs.
2330 				 */
2331 				ret = mlx5_sysfs_switch_info
2332 						(list[ns].ifindex,
2333 						 &list[ns].info);
2334 			}
2335 #ifdef HAVE_MLX5DV_DR_DEVX_PORT
2336 			if (!ret && bd >= 0) {
2337 				switch (list[ns].info.name_type) {
2338 				case MLX5_PHYS_PORT_NAME_TYPE_UPLINK:
2339 					if (list[ns].info.port_name == bd)
2340 						ns++;
2341 					break;
2342 				case MLX5_PHYS_PORT_NAME_TYPE_PFHPF:
2343 					/* Fallthrough */
2344 				case MLX5_PHYS_PORT_NAME_TYPE_PFVF:
2345 					/* Fallthrough */
2346 				case MLX5_PHYS_PORT_NAME_TYPE_PFSF:
2347 					if (list[ns].info.pf_num == bd)
2348 						ns++;
2349 					break;
2350 				default:
2351 					break;
2352 				}
2353 				continue;
2354 			}
2355 #endif
2356 			if (!ret && (list[ns].info.representor ^
2357 				     list[ns].info.master))
2358 				ns++;
2359 		}
2360 		if (!ns) {
2361 			DRV_LOG(ERR,
2362 				"unable to recognize master/representors"
2363 				" on the IB device with multiple ports");
2364 			rte_errno = ENOENT;
2365 			ret = -rte_errno;
2366 			goto exit;
2367 		}
2368 	} else {
2369 		/*
2370 		 * The existence of several matching entries (nd > 1) means
2371 		 * port representors have been instantiated. No existing Verbs
2372 		 * call nor sysfs entries can tell them apart, this can only
2373 		 * be done through Netlink calls assuming kernel drivers are
2374 		 * recent enough to support them.
2375 		 *
2376 		 * In the event of identification failure through Netlink,
2377 		 * try again through sysfs, then:
2378 		 *
2379 		 * 1. A single IB device matches (nd == 1) with single
2380 		 *    port (np=0/1) and is not a representor, assume
2381 		 *    no switch support.
2382 		 *
2383 		 * 2. Otherwise no safe assumptions can be made;
2384 		 *    complain louder and bail out.
2385 		 */
2386 		for (i = 0; i != nd; ++i) {
2387 			memset(&list[ns].info, 0, sizeof(list[ns].info));
2388 			list[ns].bond_info = NULL;
2389 			list[ns].max_port = 1;
2390 			list[ns].phys_port = 1;
2391 			list[ns].phys_dev = ibv_match[i];
2392 			list[ns].eth_dev = NULL;
2393 			list[ns].pci_dev = pci_dev;
2394 			list[ns].pf_bond = -1;
2395 			list[ns].ifindex = 0;
2396 			if (nl_rdma >= 0)
2397 				list[ns].ifindex = mlx5_nl_ifindex
2398 				(nl_rdma,
2399 				mlx5_os_get_dev_device_name
2400 						(list[ns].phys_dev), 1);
2401 			if (!list[ns].ifindex) {
2402 				char ifname[IF_NAMESIZE];
2403 
2404 				/*
2405 				 * Netlink failed; this may happen with an old
2406 				 * ib_core kernel driver (before 4.16).
2407 				 * We can assume the driver is old because
2408 				 * here we are processing single-port IB
2409 				 * devices. Let's try sysfs to retrieve
2410 				 * the ifindex. The method works for the
2411 				 * master device only.
2412 				 */
2413 				if (nd > 1) {
2414 					/*
2415 					 * Multiple devices found; assume
2416 					 * representors, we can not distinguish
2417 					 * the master from representors nor
2418 					 * retrieve the ifindex via sysfs.
2419 					 */
2420 					continue;
2421 				}
2422 				ret = mlx5_get_ifname_sysfs
2423 					(ibv_match[i]->ibdev_path, ifname);
2424 				if (!ret)
2425 					list[ns].ifindex =
2426 						if_nametoindex(ifname);
2427 				if (!list[ns].ifindex) {
2428 					/*
2429 					 * No network interface index found
2430 					 * for the specified device, meaning
2431 					 * it is neither a representor
2432 					 * nor the master.
2433 					 */
2434 					continue;
2435 				}
2436 			}
2437 			ret = -1;
2438 			if (nl_route >= 0)
2439 				ret = mlx5_nl_switch_info
2440 					       (nl_route,
2441 						list[ns].ifindex,
2442 						&list[ns].info);
2443 			if (ret || (!list[ns].info.representor &&
2444 				    !list[ns].info.master)) {
2445 				/*
2446 				 * We failed to recognize representors with
2447 				 * Netlink, let's try to perform the task
2448 				 * with sysfs.
2449 				 */
2450 				ret = mlx5_sysfs_switch_info
2451 						(list[ns].ifindex,
2452 						 &list[ns].info);
2453 			}
2454 			if (!ret && (list[ns].info.representor ^
2455 				     list[ns].info.master)) {
2456 				ns++;
2457 			} else if ((nd == 1) &&
2458 				   !list[ns].info.representor &&
2459 				   !list[ns].info.master) {
2460 				/*
2461 				 * Single IB device with
2462 				 * one physical port and an
2463 				 * attached network device.
2464 				 * Maybe SR-IOV is not enabled
2465 				 * or there are no representors.
2466 				 */
2467 				DRV_LOG(INFO, "no E-Switch support detected");
2468 				ns++;
2469 				break;
2470 			}
2471 		}
2472 		if (!ns) {
2473 			DRV_LOG(ERR,
2474 				"unable to recognize master/representors"
2475 				" on the multiple IB devices");
2476 			rte_errno = ENOENT;
2477 			ret = -rte_errno;
2478 			goto exit;
2479 		}
2480 		/*
2481 		 * New kernels may add the switch_id attribute even when
2482 		 * there is no E-Switch, and we may wrongly recognize the
2483 		 * only device as the master. Override this if a single
2484 		 * device with a single port and the new device name
2485 		 * format is present.
2486 		 */
2487 		if (nd == 1 &&
2488 		    list[0].info.name_type == MLX5_PHYS_PORT_NAME_TYPE_UPLINK) {
2489 			list[0].info.master = 0;
2490 			list[0].info.representor = 0;
2491 		}
2492 	}
2493 	MLX5_ASSERT(ns);
2494 	/*
2495 	 * Sort the list to probe devices in natural order for user convenience
2496 	 * (i.e. master first, then representors from lowest to highest ID).
2497 	 */
2498 	qsort(list, ns, sizeof(*list), mlx5_dev_spawn_data_cmp);
2499 	/* Device specific configuration. */
2500 	switch (pci_dev->id.device_id) {
2501 	case PCI_DEVICE_ID_MELLANOX_CONNECTX4VF:
2502 	case PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF:
2503 	case PCI_DEVICE_ID_MELLANOX_CONNECTX5VF:
2504 	case PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF:
2505 	case PCI_DEVICE_ID_MELLANOX_CONNECTX5BFVF:
2506 	case PCI_DEVICE_ID_MELLANOX_CONNECTX6VF:
2507 	case PCI_DEVICE_ID_MELLANOX_CONNECTXVF:
2508 		dev_config_vf = 1;
2509 		break;
2510 	default:
2511 		dev_config_vf = 0;
2512 		break;
2513 	}
2514 	if (eth_da.type != RTE_ETH_REPRESENTOR_NONE) {
2515 		/* Set devargs default values. */
2516 		if (eth_da.nb_mh_controllers == 0) {
2517 			eth_da.nb_mh_controllers = 1;
2518 			eth_da.mh_controllers[0] = 0;
2519 		}
2520 		if (eth_da.nb_ports == 0 && ns > 0) {
2521 			if (list[0].pf_bond >= 0 && list[0].info.representor)
2522 				DRV_LOG(WARNING, "Representors on a bonding device should use the pf#vf# syntax: %s",
2523 					pci_dev->device.devargs->args);
2524 			eth_da.nb_ports = 1;
2525 			eth_da.ports[0] = list[0].info.pf_num;
2526 		}
2527 		if (eth_da.nb_representor_ports == 0) {
2528 			eth_da.nb_representor_ports = 1;
2529 			eth_da.representor_ports[0] = 0;
2530 		}
2531 	}
2532 	for (i = 0; i != ns; ++i) {
2533 		uint32_t restore;
2534 
2535 		/* Default configuration. */
2536 		memset(&dev_config, 0, sizeof(struct mlx5_dev_config));
2537 		dev_config.vf = dev_config_vf;
2538 		dev_config.mps = MLX5_ARG_UNSET;
2539 		dev_config.dbnc = MLX5_ARG_UNSET;
2540 		dev_config.rx_vec_en = 1;
2541 		dev_config.txq_inline_max = MLX5_ARG_UNSET;
2542 		dev_config.txq_inline_min = MLX5_ARG_UNSET;
2543 		dev_config.txq_inline_mpw = MLX5_ARG_UNSET;
2544 		dev_config.txqs_inline = MLX5_ARG_UNSET;
2545 		dev_config.vf_nl_en = 1;
2546 		dev_config.mr_ext_memseg_en = 1;
2547 		dev_config.mprq.max_memcpy_len = MLX5_MPRQ_MEMCPY_DEFAULT_LEN;
2548 		dev_config.mprq.min_rxqs_num = MLX5_MPRQ_MIN_RXQS;
2549 		dev_config.dv_esw_en = 1;
2550 		dev_config.dv_flow_en = 1;
2551 		dev_config.decap_en = 1;
2552 		dev_config.log_hp_size = MLX5_ARG_UNSET;
2553 		dev_config.allow_duplicate_pattern = 1;
2554 		list[i].eth_dev = mlx5_dev_spawn(&pci_dev->device,
2555 						 &list[i],
2556 						 &dev_config,
2557 						 &eth_da);
2558 		if (!list[i].eth_dev) {
2559 			if (rte_errno != EBUSY && rte_errno != EEXIST)
2560 				break;
2561 			/* Device is disabled or already spawned. Ignore it. */
2562 			continue;
2563 		}
2564 		restore = list[i].eth_dev->data->dev_flags;
2565 		rte_eth_copy_pci_info(list[i].eth_dev, pci_dev);
2566 		/* Restore non-PCI flags cleared by the above call. */
2567 		list[i].eth_dev->data->dev_flags |= restore;
2568 		rte_eth_dev_probing_finish(list[i].eth_dev);
2569 	}
2570 	if (i != ns) {
2571 		DRV_LOG(ERR,
2572 			"probe of PCI device " PCI_PRI_FMT " aborted after"
2573 			" encountering an error: %s",
2574 			owner_pci.domain, owner_pci.bus,
2575 			owner_pci.devid, owner_pci.function,
2576 			strerror(rte_errno));
2577 		ret = -rte_errno;
2578 		/* Roll back. */
2579 		while (i--) {
2580 			if (!list[i].eth_dev)
2581 				continue;
2582 			mlx5_dev_close(list[i].eth_dev);
2583 			/* mac_addrs must not be freed; it is part of dev_private. */
2584 			list[i].eth_dev->data->mac_addrs = NULL;
2585 			claim_zero(rte_eth_dev_release_port(list[i].eth_dev));
2586 		}
2587 		/* Restore original error. */
2588 		rte_errno = -ret;
2589 	} else {
2590 		ret = 0;
2591 	}
2592 exit:
2593 	/*
2594 	 * Do the routine cleanup:
2595 	 * - close opened Netlink sockets
2596 	 * - free allocated spawn data array
2597 	 * - free the Infiniband device list
2598 	 */
2599 	if (nl_rdma >= 0)
2600 		close(nl_rdma);
2601 	if (nl_route >= 0)
2602 		close(nl_route);
2603 	if (list)
2604 		mlx5_free(list);
2605 	MLX5_ASSERT(ibv_list);
2606 	mlx5_glue->free_device_list(ibv_list);
2607 	return ret;
2608 }
2609 
2610 /**
2611  * DPDK callback to register a PCI device.
2612  *
2613  * This function spawns Ethernet devices out of a given PCI device.
2614  *
2615  * @param[in] pci_drv
2616  *   PCI driver structure (mlx5_driver).
2617  * @param[in] pci_dev
2618  *   PCI device information.
2619  *
2620  * @return
2621  *   0 on success, a negative errno value otherwise and rte_errno is set.
2622  */
2623 int
2624 mlx5_os_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
2625 		  struct rte_pci_device *pci_dev)
2626 {
2627 	struct rte_eth_devargs eth_da = { .type = RTE_ETH_REPRESENTOR_NONE };
2628 	int ret = 0;
2629 	uint16_t p;
2630 
2631 	if (pci_dev->device.devargs) {
2632 		/* Parse representor information from device argument. */
2633 		if (pci_dev->device.devargs->cls_str)
2634 			ret = rte_eth_devargs_parse
2635 				(pci_dev->device.devargs->cls_str, &eth_da);
2636 		if (ret) {
2637 			DRV_LOG(ERR, "failed to parse device arguments: %s",
2638 				pci_dev->device.devargs->cls_str);
2639 			return -rte_errno;
2640 		}
2641 		if (eth_da.type == RTE_ETH_REPRESENTOR_NONE) {
2642 			/* Support legacy device argument */
2643 			ret = rte_eth_devargs_parse
2644 				(pci_dev->device.devargs->args, &eth_da);
2645 			if (ret) {
2646 				DRV_LOG(ERR, "failed to parse device arguments: %s",
2647 					pci_dev->device.devargs->args);
2648 				return -rte_errno;
2649 			}
2650 		}
2651 	}
2652 
2653 	if (eth_da.nb_ports > 0) {
2654 		/* Iterate all ports if the devargs PF is a range: "pf[0-1]vf[...]". */
2655 		for (p = 0; p < eth_da.nb_ports; p++)
2656 			ret = mlx5_os_pci_probe_pf(pci_dev, &eth_da,
2657 						   eth_da.ports[p]);
2658 	} else {
2659 		ret = mlx5_os_pci_probe_pf(pci_dev, &eth_da, 0);
2660 	}
2661 	return ret;
2662 }
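
/*
 * Illustrative devargs accepted by the parsing above (PCI address
 * hypothetical):
 *
 *	-a 0000:08:00.0,representor=pf[0-1]vf[0-3]
 *
 * With a PF range the loop above invokes mlx5_os_pci_probe_pf() once
 * per listed PF; without any representor class string a single probe
 * with owner PF 0 is performed.
 */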
2663 
2664 static int
2665 mlx5_config_doorbell_mapping_env(const struct mlx5_dev_config *config)
2666 {
2667 	char *env;
2668 	int value;
2669 
2670 	MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
2671 	/* Save the current environment variable value to restore it later. */
2672 	env = getenv(MLX5_SHUT_UP_BF);
2673 	value = env ? !!strcmp(env, "0") : MLX5_ARG_UNSET;
2674 	if (config->dbnc == MLX5_ARG_UNSET)
2675 		setenv(MLX5_SHUT_UP_BF, MLX5_SHUT_UP_BF_DEFAULT, 1);
2676 	else
2677 		setenv(MLX5_SHUT_UP_BF,
2678 		       config->dbnc == MLX5_TXDB_NCACHED ? "1" : "0", 1);
2679 	return value;
2680 }
2681 
2682 static void
2683 mlx5_restore_doorbell_mapping_env(int value)
2684 {
2685 	MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
2686 	/* Restore the original environment variable state. */
2687 	if (value == MLX5_ARG_UNSET)
2688 		unsetenv(MLX5_SHUT_UP_BF);
2689 	else
2690 		setenv(MLX5_SHUT_UP_BF, value ? "1" : "0", 1);
2691 }
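
/*
 * Usage sketch for the two helpers above, mirroring
 * mlx5_os_open_device() below:
 *
 *	int dbmap_env = mlx5_config_doorbell_mapping_env(config);
 *	ctx = mlx5_glue->dv_open_device(ibv_dev);
 *	mlx5_restore_doorbell_mapping_env(dbmap_env);
 *
 * The saved value preserves the user's MLX5_SHUT_UP_BF setting so it
 * can be restored once rdma-core has sampled it at device creation.
 */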
2692 
2693 /**
2694  * Extract pdn of PD object using DV API.
2695  *
2696  * @param[in] pd
2697  *   Pointer to the verbs PD object.
2698  * @param[out] pdn
2699  *   Pointer to the PD object number variable.
2700  *
2701  * @return
2702  *   0 on success, error value otherwise.
2703  */
2704 int
2705 mlx5_os_get_pdn(void *pd, uint32_t *pdn)
2706 {
2707 #ifdef HAVE_IBV_FLOW_DV_SUPPORT
2708 	struct mlx5dv_obj obj;
2709 	struct mlx5dv_pd pd_info;
2710 	int ret = 0;
2711 
2712 	obj.pd.in = pd;
2713 	obj.pd.out = &pd_info;
2714 	ret = mlx5_glue->dv_init_obj(&obj, MLX5DV_OBJ_PD);
2715 	if (ret) {
2716 		DRV_LOG(DEBUG, "Failed to get PD object info");
2717 		return ret;
2718 	}
2719 	*pdn = pd_info.pdn;
2720 	return 0;
2721 #else
2722 	(void)pd;
2723 	(void)pdn;
2724 	return -ENOTSUP;
2725 #endif /* HAVE_IBV_FLOW_DV_SUPPORT */
2726 }
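
/*
 * Usage sketch (illustrative): extract the PD object number from a
 * Verbs PD, e.g. to pass it to DevX commands:
 *
 *	uint32_t pdn;
 *
 *	if (mlx5_os_get_pdn(sh->pd, &pdn))
 *		DRV_LOG(ERR, "can not get PD object number");
 */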
2727 
2728 /**
2729  * Function API to open IB device.
2730  *
2731  * This function calls the Linux glue APIs to open a device.
2732  *
2733  * @param[in] spawn
2734  *   Pointer to the IB device attributes (name, port, etc).
2735  * @param[out] config
2736  *   Pointer to device configuration structure.
2737  * @param[out] sh
2738  *   Pointer to shared context structure.
2739  *
2740  * @return
2741  *   0 on success, a positive error value otherwise.
2742  */
2743 int
2744 mlx5_os_open_device(const struct mlx5_dev_spawn_data *spawn,
2745 		     const struct mlx5_dev_config *config,
2746 		     struct mlx5_dev_ctx_shared *sh)
2747 {
2748 	int dbmap_env;
2749 	int err = 0;
2750 
2751 	sh->numa_node = spawn->pci_dev->device.numa_node;
2752 	pthread_mutex_init(&sh->txpp.mutex, NULL);
2753 	/*
2754 	 * Configure environment variable "MLX5_SHUT_UP_BF"
2755 	 * before the device creation. The rdma_core library
2756 	 * checks the variable at device creation and
2757 	 * stores the result internally.
2758 	 */
2759 	dbmap_env = mlx5_config_doorbell_mapping_env(config);
2760 	/* Try to open IB device with DV first, then usual Verbs. */
2761 	errno = 0;
2762 	sh->ctx = mlx5_glue->dv_open_device(spawn->phys_dev);
2763 	if (sh->ctx) {
2764 		sh->devx = 1;
2765 		DRV_LOG(DEBUG, "DevX is supported");
2766 		/* The device is created, no need for environment. */
2767 		mlx5_restore_doorbell_mapping_env(dbmap_env);
2768 	} else {
2769 		/* The environment variable is still configured. */
2770 		sh->ctx = mlx5_glue->open_device(spawn->phys_dev);
2771 		err = errno ? errno : ENODEV;
2772 		/*
2773 		 * The environment variable is not needed anymore,
2774 		 * all device creation attempts are completed.
2775 		 */
2776 		mlx5_restore_doorbell_mapping_env(dbmap_env);
2777 		if (!sh->ctx)
2778 			return err;
2779 		DRV_LOG(DEBUG, "DevX is NOT supported");
2780 		err = 0;
2781 	}
2782 	if (!err && sh->ctx) {
2783 		/* Hint libmlx5 to use PMD allocator for data plane resources */
2784 		mlx5_glue->dv_set_context_attr(sh->ctx,
2785 			MLX5DV_CTX_ATTR_BUF_ALLOCATORS,
2786 			(void *)((uintptr_t)&(struct mlx5dv_ctx_allocators){
2787 				.alloc = &mlx5_alloc_verbs_buf,
2788 				.free = &mlx5_free_verbs_buf,
2789 				.data = sh,
2790 			}));
2791 	}
2792 	return err;
2793 }
2794 
2795 /**
2796  * Install shared asynchronous device events handler.
2797  * This function is implemented to support event sharing
2798  * between multiple ports of single IB device.
2799  *
2800  * @param sh
2801  *   Pointer to mlx5_dev_ctx_shared object.
2802  */
2803 void
2804 mlx5_os_dev_shared_handler_install(struct mlx5_dev_ctx_shared *sh)
2805 {
2806 	int ret;
2807 	int flags;
2808 
2809 	sh->intr_handle.fd = -1;
2810 	flags = fcntl(((struct ibv_context *)sh->ctx)->async_fd, F_GETFL);
2811 	ret = fcntl(((struct ibv_context *)sh->ctx)->async_fd,
2812 		    F_SETFL, flags | O_NONBLOCK);
2813 	if (ret) {
2814 		DRV_LOG(INFO, "failed to change the async event queue file"
2815 			" descriptor to non-blocking");
2816 	} else {
2817 		sh->intr_handle.fd = ((struct ibv_context *)sh->ctx)->async_fd;
2818 		sh->intr_handle.type = RTE_INTR_HANDLE_EXT;
2819 		if (rte_intr_callback_register(&sh->intr_handle,
2820 					mlx5_dev_interrupt_handler, sh)) {
2821 			DRV_LOG(INFO, "Failed to install the shared interrupt handler.");
2822 			sh->intr_handle.fd = -1;
2823 		}
2824 	}
2825 	if (sh->devx) {
2826 #ifdef HAVE_IBV_DEVX_ASYNC
2827 		sh->intr_handle_devx.fd = -1;
2828 		sh->devx_comp =
2829 			(void *)mlx5_glue->devx_create_cmd_comp(sh->ctx);
2830 		struct mlx5dv_devx_cmd_comp *devx_comp = sh->devx_comp;
2831 		if (!devx_comp) {
2832 			DRV_LOG(INFO, "failed to allocate devx_comp.");
2833 			return;
2834 		}
2835 		flags = fcntl(devx_comp->fd, F_GETFL);
2836 		ret = fcntl(devx_comp->fd, F_SETFL, flags | O_NONBLOCK);
2837 		if (ret) {
2838 			DRV_LOG(INFO, "failed to change the DevX completion"
2839 				" file descriptor to non-blocking");
2840 			return;
2841 		}
2842 		sh->intr_handle_devx.fd = devx_comp->fd;
2843 		sh->intr_handle_devx.type = RTE_INTR_HANDLE_EXT;
2844 		if (rte_intr_callback_register(&sh->intr_handle_devx,
2845 					mlx5_dev_interrupt_handler_devx, sh)) {
2846 			DRV_LOG(INFO, "Failed to install the DevX shared"
2847 				" interrupt handler.");
2848 			sh->intr_handle_devx.fd = -1;
2849 		}
2850 #endif /* HAVE_IBV_DEVX_ASYNC */
2851 	}
2852 }
2853 
2854 /**
2855  * Uninstall shared asynchronous device events handler.
2856  * This function is implemented to support event sharing
2857  * between multiple ports of single IB device.
2858  *
2859  * @param sh
2860  *   Pointer to mlx5_dev_ctx_shared object.
2861  */
2862 void
2863 mlx5_os_dev_shared_handler_uninstall(struct mlx5_dev_ctx_shared *sh)
2864 {
2865 	if (sh->intr_handle.fd >= 0)
2866 		mlx5_intr_callback_unregister(&sh->intr_handle,
2867 					      mlx5_dev_interrupt_handler, sh);
2868 #ifdef HAVE_IBV_DEVX_ASYNC
2869 	if (sh->intr_handle_devx.fd >= 0)
2870 		rte_intr_callback_unregister(&sh->intr_handle_devx,
2871 				  mlx5_dev_interrupt_handler_devx, sh);
2872 	if (sh->devx_comp)
2873 		mlx5_glue->devx_destroy_cmd_comp(sh->devx_comp);
2874 #endif
2875 }
2876 
2877 /**
2878  * Read statistics by a named counter.
2879  *
2880  * @param[in] priv
2881  *   Pointer to the private device data structure.
2882  * @param[in] ctr_name
2883  *   Pointer to the name of the statistic counter to read.
2884  * @param[out] stat
2885  *   Pointer to the read statistic value.
2886  * @return
2887  *   0 on success and stat is valid, 1 if it failed to read the value,
2888  *   rte_errno is set.
2889  *
2890  */
2891 int
2892 mlx5_os_read_dev_stat(struct mlx5_priv *priv, const char *ctr_name,
2893 		      uint64_t *stat)
2894 {
2895 	int fd;
2896 
2897 	if (priv->sh) {
2898 		if (priv->q_counters != NULL &&
2899 		    strcmp(ctr_name, "out_of_buffer") == 0)
2900 			return mlx5_devx_cmd_queue_counter_query
2901 					(priv->q_counters, 0, (uint32_t *)stat);
2902 		MKSTR(path, "%s/ports/%d/hw_counters/%s",
2903 		      priv->sh->ibdev_path,
2904 		      priv->dev_port,
2905 		      ctr_name);
2906 		fd = open(path, O_RDONLY);
2907 		/*
2908 		 * In switchdev mode the file location is not per port
2909 		 * but rather <ibdev_path>/hw_counters/<file_name>.
2910 		 */
2911 		if (fd == -1) {
2912 			MKSTR(path1, "%s/hw_counters/%s",
2913 			      priv->sh->ibdev_path,
2914 			      ctr_name);
2915 			fd = open(path1, O_RDONLY);
2916 		}
2917 		if (fd != -1) {
2918 			char buf[21] = {'\0'};
2919 			ssize_t n = read(fd, buf, sizeof(buf) - 1);
2920 
2921 			close(fd);
2922 			if (n != -1) {
2923 				*stat = strtoull(buf, NULL, 10);
2924 				return 0;
2925 			}
2926 		}
2927 	}
2928 	*stat = 0;
2929 	return 1;
2930 }
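
/*
 * Illustrative paths resolved above for ctr_name = "out_of_buffer"
 * (IB device name and port number hypothetical):
 *
 *	/sys/class/infiniband/mlx5_0/ports/1/hw_counters/out_of_buffer
 *	/sys/class/infiniband/mlx5_0/hw_counters/out_of_buffer (switchdev)
 */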
2931 
2932 /**
2933  * Set the reg_mr and dereg_mr callbacks.
2934  *
2935  * @param[out] reg_mr_cb
2936  *   Pointer to the reg_mr function.
2937  * @param[out] dereg_mr_cb
2938  *   Pointer to the dereg_mr function.
2939  *
2940  */
2941 void
2942 mlx5_os_set_reg_mr_cb(mlx5_reg_mr_t *reg_mr_cb,
2943 		      mlx5_dereg_mr_t *dereg_mr_cb)
2944 {
2945 	*reg_mr_cb = mlx5_mr_verbs_ops.reg_mr;
2946 	*dereg_mr_cb = mlx5_mr_verbs_ops.dereg_mr;
2947 }
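
/*
 * Usage sketch (illustrative):
 *
 *	mlx5_reg_mr_t reg_mr;
 *	mlx5_dereg_mr_t dereg_mr;
 *
 *	mlx5_os_set_reg_mr_cb(&reg_mr, &dereg_mr);
 *
 * Both callbacks then dispatch to the Verbs memory registration
 * routines in mlx5_mr_verbs_ops.
 */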
2948 
2949 /**
2950  * Remove a MAC address from the device.
2951  *
2952  * @param dev
2953  *   Pointer to Ethernet device structure.
2954  * @param index
2955  *   MAC address index.
2956  */
2957 void
2958 mlx5_os_mac_addr_remove(struct rte_eth_dev *dev, uint32_t index)
2959 {
2960 	struct mlx5_priv *priv = dev->data->dev_private;
2961 	const int vf = priv->config.vf;
2962 
2963 	if (vf)
2964 		mlx5_nl_mac_addr_remove(priv->nl_socket_route,
2965 					mlx5_ifindex(dev), priv->mac_own,
2966 					&dev->data->mac_addrs[index], index);
2967 }
2968 
2969 /**
2970  * Add a MAC address to the device.
2971  *
2972  * @param dev
2973  *   Pointer to Ethernet device structure.
2974  * @param mac
2975  *   MAC address to register.
2976  * @param index
2977  *   MAC address index.
2978  *
2979  * @return
2980  *   0 on success, a negative errno value otherwise
2981  */
2982 int
2983 mlx5_os_mac_addr_add(struct rte_eth_dev *dev, struct rte_ether_addr *mac,
2984 		     uint32_t index)
2985 {
2986 	struct mlx5_priv *priv = dev->data->dev_private;
2987 	const int vf = priv->config.vf;
2988 	int ret = 0;
2989 
2990 	if (vf)
2991 		ret = mlx5_nl_mac_addr_add(priv->nl_socket_route,
2992 					   mlx5_ifindex(dev), priv->mac_own,
2993 					   mac, index);
2994 	return ret;
2995 }
2996 
2997 /**
2998  * Modify a VF MAC address
2999  *
3000  * @param priv
3001  *   Pointer to device private data.
3002  * @param mac_addr
3003  *   MAC address to modify into.
3004  * @param iface_idx
3005  *   Net device interface index
3006  * @param vf_index
3007  *   VF index
3008  *
3009  * @return
3010  *   0 on success, a negative errno value otherwise
3011  */
3012 int
3013 mlx5_os_vf_mac_addr_modify(struct mlx5_priv *priv,
3014 			   unsigned int iface_idx,
3015 			   struct rte_ether_addr *mac_addr,
3016 			   int vf_index)
3017 {
3018 	return mlx5_nl_vf_mac_addr_modify
3019 		(priv->nl_socket_route, iface_idx, mac_addr, vf_index);
3020 }
3021 
3022 /**
3023  * Set device promiscuous mode
3024  *
3025  * @param dev
3026  *   Pointer to Ethernet device structure.
3027  * @param enable
3028  *   0 - promiscuous is disabled, otherwise - enabled
3029  *
3030  * @return
3031  *   0 on success, a negative error value otherwise
3032  */
3033 int
3034 mlx5_os_set_promisc(struct rte_eth_dev *dev, int enable)
3035 {
3036 	struct mlx5_priv *priv = dev->data->dev_private;
3037 
3038 	return mlx5_nl_promisc(priv->nl_socket_route,
3039 			       mlx5_ifindex(dev), !!enable);
3040 }
3041 
3042 /**
3043  * Set device allmulticast mode
3044  *
3045  * @param dev
3046  *   Pointer to Ethernet device structure.
3047  * @param enable
3048  *   0 - all multicast is disabled, otherwise - enabled
3049  *
3050  * @return
3051  *   0 on success, a negative error value otherwise
3052  */
3053 int
3054 mlx5_os_set_allmulti(struct rte_eth_dev *dev, int enable)
3055 {
3056 	struct mlx5_priv *priv = dev->data->dev_private;
3057 
3058 	return mlx5_nl_allmulti(priv->nl_socket_route,
3059 				mlx5_ifindex(dev), !!enable);
3060 }
3061 
3062 /**
3063  * Flush device MAC addresses
3064  *
3065  * @param dev
3066  *   Pointer to Ethernet device structure.
3067  *
3068  */
3069 void
3070 mlx5_os_mac_addr_flush(struct rte_eth_dev *dev)
3071 {
3072 	struct mlx5_priv *priv = dev->data->dev_private;
3073 
3074 	mlx5_nl_mac_addr_flush(priv->nl_socket_route, mlx5_ifindex(dev),
3075 			       dev->data->mac_addrs,
3076 			       MLX5_MAX_MAC_ADDRESSES, priv->mac_own);
3077 }
3078