xref: /dpdk/drivers/net/mlx5/mlx5.c (revision b9765e96c225f468eaa0ee5e5b0626743b1d4e9c)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright 2015 6WIND S.A.
3  * Copyright 2015 Mellanox Technologies, Ltd
4  */
5 
6 #include <stddef.h>
7 #include <unistd.h>
8 #include <string.h>
9 #include <stdint.h>
10 #include <stdlib.h>
11 #include <errno.h>
12 #include <net/if.h>
13 #include <sys/mman.h>
14 #include <linux/rtnetlink.h>
15 
16 /* Verbs header. */
17 /* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
18 #ifdef PEDANTIC
19 #pragma GCC diagnostic ignored "-Wpedantic"
20 #endif
21 #include <infiniband/verbs.h>
22 #ifdef PEDANTIC
23 #pragma GCC diagnostic error "-Wpedantic"
24 #endif
25 
26 #include <rte_malloc.h>
27 #include <rte_ethdev_driver.h>
28 #include <rte_ethdev_pci.h>
29 #include <rte_pci.h>
30 #include <rte_bus_pci.h>
31 #include <rte_common.h>
32 #include <rte_kvargs.h>
33 #include <rte_rwlock.h>
34 #include <rte_spinlock.h>
35 #include <rte_string_fns.h>
36 #include <rte_alarm.h>
37 
38 #include <mlx5_glue.h>
39 #include <mlx5_devx_cmds.h>
40 #include <mlx5_common.h>
41 #include <mlx5_common_os.h>
42 #include <mlx5_common_mp.h>
43 
44 #include "mlx5_defs.h"
45 #include "mlx5.h"
46 #include "mlx5_utils.h"
47 #include "mlx5_rxtx.h"
48 #include "mlx5_autoconf.h"
49 #include "mlx5_mr.h"
50 #include "mlx5_flow.h"
51 #include "rte_pmd_mlx5.h"
52 
53 /* Device parameter to enable RX completion queue compression. */
54 #define MLX5_RXQ_CQE_COMP_EN "rxq_cqe_comp_en"
55 
56 /* Device parameter to enable RX completion entry padding to 128B. */
57 #define MLX5_RXQ_CQE_PAD_EN "rxq_cqe_pad_en"
58 
59 /* Device parameter to enable padding Rx packet to cacheline size. */
60 #define MLX5_RXQ_PKT_PAD_EN "rxq_pkt_pad_en"
61 
62 /* Device parameter to enable Multi-Packet Rx queue. */
63 #define MLX5_RX_MPRQ_EN "mprq_en"
64 
65 /* Device parameter to configure log 2 of the number of strides for MPRQ. */
66 #define MLX5_RX_MPRQ_LOG_STRIDE_NUM "mprq_log_stride_num"
67 
68 /* Device parameter to configure log 2 of the stride size for MPRQ. */
69 #define MLX5_RX_MPRQ_LOG_STRIDE_SIZE "mprq_log_stride_size"
70 
71 /* Device parameter to limit the size of memcpy'd packet for MPRQ. */
72 #define MLX5_RX_MPRQ_MAX_MEMCPY_LEN "mprq_max_memcpy_len"
73 
74 /* Device parameter to set the minimum number of Rx queues to enable MPRQ. */
75 #define MLX5_RXQS_MIN_MPRQ "rxqs_min_mprq"
76 
77 /* Device parameter to configure inline send. Deprecated, ignored.*/
78 #define MLX5_TXQ_INLINE "txq_inline"
79 
80 /* Device parameter to limit packet size to inline with ordinary SEND. */
81 #define MLX5_TXQ_INLINE_MAX "txq_inline_max"
82 
83 /* Device parameter to configure minimal data size to inline. */
84 #define MLX5_TXQ_INLINE_MIN "txq_inline_min"
85 
86 /* Device parameter to limit packet size to inline with Enhanced MPW. */
87 #define MLX5_TXQ_INLINE_MPW "txq_inline_mpw"
88 
89 /*
90  * Device parameter to configure the number of TX queues threshold for
91  * enabling inline send.
92  */
93 #define MLX5_TXQS_MIN_INLINE "txqs_min_inline"
94 
95 /*
96  * Device parameter to configure the number of TX queues threshold for
97  * enabling vectorized Tx, deprecated, ignored (no vectorized Tx routines).
98  */
99 #define MLX5_TXQS_MAX_VEC "txqs_max_vec"
100 
101 /* Device parameter to enable multi-packet send WQEs. */
102 #define MLX5_TXQ_MPW_EN "txq_mpw_en"
103 
104 /*
105  * Device parameter to force doorbell register mapping
106  * to non-cached region, eliminating the extra write memory barrier.
107  */
108 #define MLX5_TX_DB_NC "tx_db_nc"
109 
110 /*
111  * Device parameter to include 2 dsegs in the title WQEBB.
112  * Deprecated, ignored.
113  */
114 #define MLX5_TXQ_MPW_HDR_DSEG_EN "txq_mpw_hdr_dseg_en"
115 
116 /*
117  * Device parameter to limit the packet size to be inlined.
118  * Deprecated, ignored.
119  */
120 #define MLX5_TXQ_MAX_INLINE_LEN "txq_max_inline_len"
121 
122 /*
123  * Device parameter to enable hardware Tx vector.
124  * Deprecated, ignored (no vectorized Tx routines anymore).
125  */
126 #define MLX5_TX_VEC_EN "tx_vec_en"
127 
128 /* Device parameter to enable hardware Rx vector. */
129 #define MLX5_RX_VEC_EN "rx_vec_en"
130 
131 /* Allow L3 VXLAN flow creation. */
132 #define MLX5_L3_VXLAN_EN "l3_vxlan_en"
133 
134 /* Activate DV E-Switch flow steering. */
135 #define MLX5_DV_ESW_EN "dv_esw_en"
136 
137 /* Activate DV flow steering. */
138 #define MLX5_DV_FLOW_EN "dv_flow_en"
139 
140 /* Enable extensive flow metadata support. */
141 #define MLX5_DV_XMETA_EN "dv_xmeta_en"
142 
143 /* Device parameter to let the user manage the LACP traffic of the bonded device. */
144 #define MLX5_LACP_BY_USER "lacp_by_user"
145 
146 /* Activate Netlink support in VF mode. */
147 #define MLX5_VF_NL_EN "vf_nl_en"
148 
149 /* Enable extending memsegs when creating a MR. */
150 #define MLX5_MR_EXT_MEMSEG_EN "mr_ext_memseg_en"
151 
152 /* Select port representors to instantiate. */
153 #define MLX5_REPRESENTOR "representor"
154 
155 /* Device parameter to configure the maximum number of dump files per queue. */
156 #define MLX5_MAX_DUMP_FILES_NUM "max_dump_files_num"
157 
158 /* Configure timeout of LRO session (in microseconds). */
159 #define MLX5_LRO_TIMEOUT_USEC "lro_timeout_usec"
160 
161 /*
162  * Device parameter to configure the total data buffer size for a single
163  * hairpin queue (logarithm value).
164  */
165 #define MLX5_HP_BUF_SIZE "hp_buf_log_sz"
166 
167 /* Flow memory reclaim mode. */
168 #define MLX5_RECLAIM_MEM "reclaim_mem_mode"
169 
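/*
 * Illustrative devargs string (an assumed example, not taken from the
 * driver): the keys defined above are appended to the PCI address in the
 * EAL device arguments. The address and values below are placeholders,
 * not recommended defaults.
 *
 *	0000:03:00.0,rxq_cqe_comp_en=1,mprq_en=1,txq_inline_max=128,dv_flow_en=1
 */
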
170 static const char *MZ_MLX5_PMD_SHARED_DATA = "mlx5_pmd_shared_data";
171 
172 /* Shared memory between primary and secondary processes. */
173 struct mlx5_shared_data *mlx5_shared_data;
174 
175 /* Spinlock for mlx5_shared_data allocation. */
176 static rte_spinlock_t mlx5_shared_data_lock = RTE_SPINLOCK_INITIALIZER;
177 
178 /* Process local data for secondary processes. */
179 static struct mlx5_local_data mlx5_local_data;
180 
181 static LIST_HEAD(, mlx5_dev_ctx_shared) mlx5_dev_ctx_list =
182 						LIST_HEAD_INITIALIZER();
183 static pthread_mutex_t mlx5_dev_ctx_list_mutex = PTHREAD_MUTEX_INITIALIZER;
184 
185 static const struct mlx5_indexed_pool_config mlx5_ipool_cfg[] = {
186 #ifdef HAVE_IBV_FLOW_DV_SUPPORT
187 	{
188 		.size = sizeof(struct mlx5_flow_dv_encap_decap_resource),
189 		.trunk_size = 64,
190 		.grow_trunk = 3,
191 		.grow_shift = 2,
192 		.need_lock = 0,
193 		.release_mem_en = 1,
194 		.malloc = rte_malloc_socket,
195 		.free = rte_free,
196 		.type = "mlx5_encap_decap_ipool",
197 	},
198 	{
199 		.size = sizeof(struct mlx5_flow_dv_push_vlan_action_resource),
200 		.trunk_size = 64,
201 		.grow_trunk = 3,
202 		.grow_shift = 2,
203 		.need_lock = 0,
204 		.release_mem_en = 1,
205 		.malloc = rte_malloc_socket,
206 		.free = rte_free,
207 		.type = "mlx5_push_vlan_ipool",
208 	},
209 	{
210 		.size = sizeof(struct mlx5_flow_dv_tag_resource),
211 		.trunk_size = 64,
212 		.grow_trunk = 3,
213 		.grow_shift = 2,
214 		.need_lock = 0,
215 		.release_mem_en = 1,
216 		.malloc = rte_malloc_socket,
217 		.free = rte_free,
218 		.type = "mlx5_tag_ipool",
219 	},
220 	{
221 		.size = sizeof(struct mlx5_flow_dv_port_id_action_resource),
222 		.trunk_size = 64,
223 		.grow_trunk = 3,
224 		.grow_shift = 2,
225 		.need_lock = 0,
226 		.release_mem_en = 1,
227 		.malloc = rte_malloc_socket,
228 		.free = rte_free,
229 		.type = "mlx5_port_id_ipool",
230 	},
231 	{
232 		.size = sizeof(struct mlx5_flow_tbl_data_entry),
233 		.trunk_size = 64,
234 		.grow_trunk = 3,
235 		.grow_shift = 2,
236 		.need_lock = 0,
237 		.release_mem_en = 1,
238 		.malloc = rte_malloc_socket,
239 		.free = rte_free,
240 		.type = "mlx5_jump_ipool",
241 	},
242 #endif
243 	{
244 		.size = sizeof(struct mlx5_flow_meter),
245 		.trunk_size = 64,
246 		.grow_trunk = 3,
247 		.grow_shift = 2,
248 		.need_lock = 0,
249 		.release_mem_en = 1,
250 		.malloc = rte_malloc_socket,
251 		.free = rte_free,
252 		.type = "mlx5_meter_ipool",
253 	},
254 	{
255 		.size = sizeof(struct mlx5_flow_mreg_copy_resource),
256 		.trunk_size = 64,
257 		.grow_trunk = 3,
258 		.grow_shift = 2,
259 		.need_lock = 0,
260 		.release_mem_en = 1,
261 		.malloc = rte_malloc_socket,
262 		.free = rte_free,
263 		.type = "mlx5_mcp_ipool",
264 	},
265 	{
266 		.size = (sizeof(struct mlx5_hrxq) + MLX5_RSS_HASH_KEY_LEN),
267 		.trunk_size = 64,
268 		.grow_trunk = 3,
269 		.grow_shift = 2,
270 		.need_lock = 0,
271 		.release_mem_en = 1,
272 		.malloc = rte_malloc_socket,
273 		.free = rte_free,
274 		.type = "mlx5_hrxq_ipool",
275 	},
276 	{
277 		/*
278 		 * MLX5_IPOOL_MLX5_FLOW size varies for DV and VERBS flows.
279 		 * It is set at run time according to the PCI function configuration.
280 		 */
281 		.size = 0,
282 		.trunk_size = 64,
283 		.grow_trunk = 3,
284 		.grow_shift = 2,
285 		.need_lock = 0,
286 		.release_mem_en = 1,
287 		.malloc = rte_malloc_socket,
288 		.free = rte_free,
289 		.type = "mlx5_flow_handle_ipool",
290 	},
291 	{
292 		.size = sizeof(struct rte_flow),
293 		.trunk_size = 4096,
294 		.need_lock = 1,
295 		.release_mem_en = 1,
296 		.malloc = rte_malloc_socket,
297 		.free = rte_free,
298 		.type = "rte_flow_ipool",
299 	},
300 };
301 
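/*
 * Usage sketch for one of the configurations above (assumed example; it
 * mirrors mlx5_flow_ipool_create() below, while mlx5_ipool_zmalloc(),
 * mlx5_ipool_get() and mlx5_ipool_free() are helpers assumed to be
 * declared in mlx5_utils.h):
 *
 *	struct mlx5_indexed_pool_config cfg = mlx5_ipool_cfg[0];
 *	struct mlx5_indexed_pool *pool = mlx5_ipool_create(&cfg);
 *	uint32_t idx;
 *	void *entry = pool ? mlx5_ipool_zmalloc(pool, &idx) : NULL;
 *
 *	if (entry) {
 *		... use entry, look it up later by idx with mlx5_ipool_get() ...
 *		mlx5_ipool_free(pool, idx);
 *	}
 *	if (pool)
 *		mlx5_ipool_destroy(pool);
 */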
302 
303 #define MLX5_FLOW_MIN_ID_POOL_SIZE 512
304 #define MLX5_ID_GENERATION_ARRAY_FACTOR 16
305 
306 #define MLX5_FLOW_TABLE_HLIST_ARRAY_SIZE 4096
307 
308 /**
309  * Allocate ID pool structure.
310  *
311  * @param[in] max_id
312  *   The maximum ID that can be allocated from the pool.
313  *
314  * @return
315  *   Pointer to pool object, NULL value otherwise.
316  */
317 struct mlx5_flow_id_pool *
318 mlx5_flow_id_pool_alloc(uint32_t max_id)
319 {
320 	struct mlx5_flow_id_pool *pool;
321 	void *mem;
322 
323 	pool = rte_zmalloc("id pool allocation", sizeof(*pool),
324 			   RTE_CACHE_LINE_SIZE);
325 	if (!pool) {
326 		DRV_LOG(ERR, "can't allocate id pool");
327 		rte_errno  = ENOMEM;
328 		return NULL;
329 	}
330 	mem = rte_zmalloc("", MLX5_FLOW_MIN_ID_POOL_SIZE * sizeof(uint32_t),
331 			  RTE_CACHE_LINE_SIZE);
332 	if (!mem) {
333 		DRV_LOG(ERR, "can't allocate mem for id pool");
334 		rte_errno  = ENOMEM;
335 		goto error;
336 	}
337 	pool->free_arr = mem;
338 	pool->curr = pool->free_arr;
339 	pool->last = pool->free_arr + MLX5_FLOW_MIN_ID_POOL_SIZE;
340 	pool->base_index = 0;
341 	pool->max_id = max_id;
342 	return pool;
343 error:
344 	rte_free(pool);
345 	return NULL;
346 }
347 
348 /**
349  * Release ID pool structure.
350  *
351  * @param[in] pool
352  *   Pointer to flow id pool object to free.
353  */
354 void
355 mlx5_flow_id_pool_release(struct mlx5_flow_id_pool *pool)
356 {
357 	rte_free(pool->free_arr);
358 	rte_free(pool);
359 }
360 
361 /**
362  * Generate ID.
363  *
364  * @param[in] pool
365  *   Pointer to flow id pool.
366  * @param[out] id
367  *   The generated ID.
368  *
369  * @return
370  *   0 on success, error value otherwise.
371  */
372 uint32_t
373 mlx5_flow_id_get(struct mlx5_flow_id_pool *pool, uint32_t *id)
374 {
375 	if (pool->curr == pool->free_arr) {
376 		if (pool->base_index == pool->max_id) {
377 			rte_errno  = ENOMEM;
378 			DRV_LOG(ERR, "no free id");
379 			return -rte_errno;
380 		}
381 		*id = ++pool->base_index;
382 		return 0;
383 	}
384 	*id = *(--pool->curr);
385 	return 0;
386 }
387 
388 /**
389  * Release ID.
390  *
391  * @param[in] pool
392  *   Pointer to flow id pool.
393  * @param[in] id
394  *   The ID to release.
395  *
396  * @return
397  *   0 on success, error value otherwise.
398  */
399 uint32_t
400 mlx5_flow_id_release(struct mlx5_flow_id_pool *pool, uint32_t id)
401 {
402 	uint32_t size;
403 	uint32_t size2;
404 	void *mem;
405 
406 	if (pool->curr == pool->last) {
407 		size = pool->curr - pool->free_arr;
408 		size2 = size * MLX5_ID_GENERATION_ARRAY_FACTOR;
409 		MLX5_ASSERT(size2 > size);
410 		mem = rte_malloc("", size2 * sizeof(uint32_t), 0);
411 		if (!mem) {
412 			DRV_LOG(ERR, "can't allocate mem for id pool");
413 			rte_errno  = ENOMEM;
414 			return -rte_errno;
415 		}
416 		memcpy(mem, pool->free_arr, size * sizeof(uint32_t));
417 		rte_free(pool->free_arr);
418 		pool->free_arr = mem;
419 		pool->curr = pool->free_arr + size;
420 		pool->last = pool->free_arr + size2;
421 	}
422 	*pool->curr = id;
423 	pool->curr++;
424 	return 0;
425 }
426 
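/*
 * Minimal usage sketch for the ID pool API above (illustrative only; the
 * limit, the error handling and the work done with the ID are placeholders):
 *
 *	struct mlx5_flow_id_pool *pool = mlx5_flow_id_pool_alloc(UINT32_MAX);
 *	uint32_t id;
 *
 *	if (pool && !mlx5_flow_id_get(pool, &id)) {
 *		... use id ...
 *		mlx5_flow_id_release(pool, id);
 *	}
 *	if (pool)
 *		mlx5_flow_id_pool_release(pool);
 */
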
427 /**
428  * Initialize the shared aging list information per port.
429  *
430  * @param[in] sh
431  *   Pointer to mlx5_dev_ctx_shared object.
432  */
433 static void
434 mlx5_flow_aging_init(struct mlx5_dev_ctx_shared *sh)
435 {
436 	uint32_t i;
437 	struct mlx5_age_info *age_info;
438 
439 	for (i = 0; i < sh->max_port; i++) {
440 		age_info = &sh->port[i].age_info;
441 		age_info->flags = 0;
442 		TAILQ_INIT(&age_info->aged_counters);
443 		rte_spinlock_init(&age_info->aged_sl);
444 		MLX5_AGE_SET(age_info, MLX5_AGE_TRIGGER);
445 	}
446 }
447 
448 /**
449  * Initialize the counters management structure.
450  *
451  * @param[in] sh
452  *   Pointer to mlx5_dev_ctx_shared object to free
453  */
454 static void
455 mlx5_flow_counters_mng_init(struct mlx5_dev_ctx_shared *sh)
456 {
457 	int i;
458 
459 	memset(&sh->cmng, 0, sizeof(sh->cmng));
460 	TAILQ_INIT(&sh->cmng.flow_counters);
461 	for (i = 0; i < MLX5_CCONT_TYPE_MAX; ++i) {
462 		sh->cmng.ccont[i].min_id = MLX5_CNT_BATCH_OFFSET;
463 		sh->cmng.ccont[i].max_id = -1;
464 		sh->cmng.ccont[i].last_pool_idx = POOL_IDX_INVALID;
465 		TAILQ_INIT(&sh->cmng.ccont[i].pool_list);
466 		rte_spinlock_init(&sh->cmng.ccont[i].resize_sl);
467 		TAILQ_INIT(&sh->cmng.ccont[i].counters);
468 		rte_spinlock_init(&sh->cmng.ccont[i].csl);
469 	}
470 }
471 
472 /**
473  * Destroy all the resources allocated for a counter memory management.
474  *
475  * @param[in] mng
476  *   Pointer to the memory management structure.
477  */
478 static void
479 mlx5_flow_destroy_counter_stat_mem_mng(struct mlx5_counter_stats_mem_mng *mng)
480 {
481 	uint8_t *mem = (uint8_t *)(uintptr_t)mng->raws[0].data;
482 
483 	LIST_REMOVE(mng, next);
484 	claim_zero(mlx5_devx_cmd_destroy(mng->dm));
485 	claim_zero(mlx5_glue->devx_umem_dereg(mng->umem));
486 	rte_free(mem);
487 }
488 
489 /**
490  * Close and release all the resources of the counters management.
491  *
492  * @param[in] sh
493  *   Pointer to mlx5_dev_ctx_shared object to free.
494  */
495 static void
496 mlx5_flow_counters_mng_close(struct mlx5_dev_ctx_shared *sh)
497 {
498 	struct mlx5_counter_stats_mem_mng *mng;
499 	int i;
500 	int j;
501 	int retries = 1024;
502 
503 	rte_errno = 0;
504 	while (--retries) {
505 		rte_eal_alarm_cancel(mlx5_flow_query_alarm, sh);
506 		if (rte_errno != EINPROGRESS)
507 			break;
508 		rte_pause();
509 	}
510 	for (i = 0; i < MLX5_CCONT_TYPE_MAX; ++i) {
511 		struct mlx5_flow_counter_pool *pool;
512 		uint32_t batch = !!(i > 1);
513 
514 		if (!sh->cmng.ccont[i].pools)
515 			continue;
516 		pool = TAILQ_FIRST(&sh->cmng.ccont[i].pool_list);
517 		while (pool) {
518 			if (batch && pool->min_dcs)
519 				claim_zero(mlx5_devx_cmd_destroy
520 							       (pool->min_dcs));
521 			for (j = 0; j < MLX5_COUNTERS_PER_POOL; ++j) {
522 				if (MLX5_POOL_GET_CNT(pool, j)->action)
523 					claim_zero
524 					 (mlx5_glue->destroy_flow_action
525 					  (MLX5_POOL_GET_CNT
526 					  (pool, j)->action));
527 				if (!batch && MLX5_GET_POOL_CNT_EXT
528 				    (pool, j)->dcs)
529 					claim_zero(mlx5_devx_cmd_destroy
530 						   (MLX5_GET_POOL_CNT_EXT
531 						    (pool, j)->dcs));
532 			}
533 			TAILQ_REMOVE(&sh->cmng.ccont[i].pool_list, pool, next);
534 			rte_free(pool);
535 			pool = TAILQ_FIRST(&sh->cmng.ccont[i].pool_list);
536 		}
537 		rte_free(sh->cmng.ccont[i].pools);
538 	}
539 	mng = LIST_FIRST(&sh->cmng.mem_mngs);
540 	while (mng) {
541 		mlx5_flow_destroy_counter_stat_mem_mng(mng);
542 		mng = LIST_FIRST(&sh->cmng.mem_mngs);
543 	}
544 	memset(&sh->cmng, 0, sizeof(sh->cmng));
545 }
546 
547 /**
548  * Initialize the flow resources' indexed mempool.
549  *
550  * @param[in] sh
551  *   Pointer to mlx5_dev_ctx_shared object.
552  * @param[in] config
553  *   Pointer to user dev config.
554  */
555 static void
556 mlx5_flow_ipool_create(struct mlx5_dev_ctx_shared *sh,
557 		       const struct mlx5_dev_config *config)
558 {
559 	uint8_t i;
560 	struct mlx5_indexed_pool_config cfg;
561 
562 	for (i = 0; i < MLX5_IPOOL_MAX; ++i) {
563 		cfg = mlx5_ipool_cfg[i];
564 		switch (i) {
565 		default:
566 			break;
567 		/*
568 		 * Set MLX5_IPOOL_MLX5_FLOW ipool size
569 		 * according to PCI function flow configuration.
570 		 */
571 		case MLX5_IPOOL_MLX5_FLOW:
572 			cfg.size = config->dv_flow_en ?
573 				sizeof(struct mlx5_flow_handle) :
574 				MLX5_FLOW_HANDLE_VERBS_SIZE;
575 			break;
576 		}
577 		if (config->reclaim_mode)
578 			cfg.release_mem_en = 1;
579 		sh->ipool[i] = mlx5_ipool_create(&cfg);
580 	}
581 }
582 
583 /**
584  * Release the flow resources' indexed mempool.
585  *
586  * @param[in] sh
587  *   Pointer to mlx5_dev_ctx_shared object.
588  */
589 static void
590 mlx5_flow_ipool_destroy(struct mlx5_dev_ctx_shared *sh)
591 {
592 	uint8_t i;
593 
594 	for (i = 0; i < MLX5_IPOOL_MAX; ++i)
595 		mlx5_ipool_destroy(sh->ipool[i]);
596 }
597 
598 /**
599  * Allocate the shared device context. If there is a multiport device, the
600  * master and representors will share this context; if there is a
601  * single-port dedicated device, the context will be used by this port
602  * only.
603  *
604  * The routine first searches the context list for the specified device
605  * name; if found, the shared context is assumed and its reference counter
606  * is incremented. If no context is found, a new one is created and
607  * initialized with the specified device context and parameters.
608  *
609  * @param[in] spawn
610  *   Pointer to the device attributes (name, port, etc).
611  * @param[in] config
612  *   Pointer to device configuration structure.
613  *
614  * @return
615  *   Pointer to mlx5_dev_ctx_shared object on success,
616  *   otherwise NULL and rte_errno is set.
617  */
618 struct mlx5_dev_ctx_shared *
619 mlx5_alloc_shared_dev_ctx(const struct mlx5_dev_spawn_data *spawn,
620 			   const struct mlx5_dev_config *config)
621 {
622 	struct mlx5_dev_ctx_shared *sh;
623 	int err = 0;
624 	uint32_t i;
625 	struct mlx5_devx_tis_attr tis_attr = { 0 };
626 
627 	MLX5_ASSERT(spawn);
628 	/* Secondary process should not create the shared context. */
629 	MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
630 	pthread_mutex_lock(&mlx5_dev_ctx_list_mutex);
631 	/* Search for IB context by device name. */
632 	LIST_FOREACH(sh, &mlx5_dev_ctx_list, next) {
633 		if (!strcmp(sh->ibdev_name,
634 			mlx5_os_get_dev_device_name(spawn->phys_dev))) {
635 			sh->refcnt++;
636 			goto exit;
637 		}
638 	}
639 	/* No device found, we have to create new shared context. */
640 	MLX5_ASSERT(spawn->max_port);
641 	sh = rte_zmalloc("ethdev shared ib context",
642 			 sizeof(struct mlx5_dev_ctx_shared) +
643 			 spawn->max_port *
644 			 sizeof(struct mlx5_dev_shared_port),
645 			 RTE_CACHE_LINE_SIZE);
646 	if (!sh) {
647 		DRV_LOG(ERR, "shared context allocation failure");
648 		rte_errno  = ENOMEM;
649 		goto exit;
650 	}
651 	err = mlx5_os_open_device(spawn, config, sh);
652 	if (!sh->ctx)
653 		goto error;
654 	err = mlx5_os_get_dev_attr(sh->ctx, &sh->device_attr);
655 	if (err) {
656 		DRV_LOG(DEBUG, "mlx5_os_get_dev_attr() failed");
657 		goto error;
658 	}
659 	sh->refcnt = 1;
660 	sh->max_port = spawn->max_port;
661 	strncpy(sh->ibdev_name, mlx5_os_get_ctx_device_name(sh->ctx),
662 		sizeof(sh->ibdev_name) - 1);
663 	strncpy(sh->ibdev_path, mlx5_os_get_ctx_device_path(sh->ctx),
664 		sizeof(sh->ibdev_path) - 1);
665 	/*
666 	 * Setting port_id to the maximum unallowed value (RTE_MAX_ETHPORTS)
667 	 * means there is no interrupt subhandler installed for
668 	 * the given port index i.
669 	 */
670 	for (i = 0; i < sh->max_port; i++) {
671 		sh->port[i].ih_port_id = RTE_MAX_ETHPORTS;
672 		sh->port[i].devx_ih_port_id = RTE_MAX_ETHPORTS;
673 	}
674 	sh->pd = mlx5_glue->alloc_pd(sh->ctx);
675 	if (sh->pd == NULL) {
676 		DRV_LOG(ERR, "PD allocation failure");
677 		err = ENOMEM;
678 		goto error;
679 	}
680 	if (sh->devx) {
681 		err = mlx5_os_get_pdn(sh->pd, &sh->pdn);
682 		if (err) {
683 			DRV_LOG(ERR, "Failed to extract pdn from PD");
684 			goto error;
685 		}
686 		sh->td = mlx5_devx_cmd_create_td(sh->ctx);
687 		if (!sh->td) {
688 			DRV_LOG(ERR, "TD allocation failure");
689 			err = ENOMEM;
690 			goto error;
691 		}
692 		tis_attr.transport_domain = sh->td->id;
693 		sh->tis = mlx5_devx_cmd_create_tis(sh->ctx, &tis_attr);
694 		if (!sh->tis) {
695 			DRV_LOG(ERR, "TIS allocation failure");
696 			err = ENOMEM;
697 			goto error;
698 		}
699 	}
700 	sh->flow_id_pool = mlx5_flow_id_pool_alloc
701 					((1 << HAIRPIN_FLOW_ID_BITS) - 1);
702 	if (!sh->flow_id_pool) {
703 		DRV_LOG(ERR, "can't create flow id pool");
704 		err = ENOMEM;
705 		goto error;
706 	}
707 	/*
708 	 * Once the device is added to the memory event callback
709 	 * list, its global MR cache table cannot be expanded
710 	 * on the fly because of deadlock. If it overflows, lookup
711 	 * should be done by searching MR list linearly, which is slow.
712 	 *
713 	 * At this point the device is not added to the memory
714 	 * event list yet, context is just being created.
715 	 */
716 	err = mlx5_mr_btree_init(&sh->share_cache.cache,
717 				 MLX5_MR_BTREE_CACHE_N * 2,
718 				 spawn->pci_dev->device.numa_node);
719 	if (err) {
720 		err = rte_errno;
721 		goto error;
722 	}
723 	mlx5_os_set_reg_mr_cb(&sh->share_cache.reg_mr_cb,
724 			      &sh->share_cache.dereg_mr_cb);
725 	mlx5_os_dev_shared_handler_install(sh);
726 	sh->cnt_id_tbl = mlx5_l3t_create(MLX5_L3T_TYPE_DWORD);
727 	if (!sh->cnt_id_tbl) {
728 		err = rte_errno;
729 		goto error;
730 	}
731 	mlx5_flow_aging_init(sh);
732 	mlx5_flow_counters_mng_init(sh);
733 	mlx5_flow_ipool_create(sh, config);
734 	/* Add device to memory callback list. */
735 	rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock);
736 	LIST_INSERT_HEAD(&mlx5_shared_data->mem_event_cb_list,
737 			 sh, mem_event_cb);
738 	rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
739 	/* Add context to the global device list. */
740 	LIST_INSERT_HEAD(&mlx5_dev_ctx_list, sh, next);
741 exit:
742 	pthread_mutex_unlock(&mlx5_dev_ctx_list_mutex);
743 	return sh;
744 error:
745 	pthread_mutex_unlock(&mlx5_dev_ctx_list_mutex);
746 	MLX5_ASSERT(sh);
747 	if (sh->cnt_id_tbl) {
748 		mlx5_l3t_destroy(sh->cnt_id_tbl);
749 		sh->cnt_id_tbl = NULL;
750 	}
751 	if (sh->tis)
752 		claim_zero(mlx5_devx_cmd_destroy(sh->tis));
753 	if (sh->td)
754 		claim_zero(mlx5_devx_cmd_destroy(sh->td));
755 	if (sh->pd)
756 		claim_zero(mlx5_glue->dealloc_pd(sh->pd));
757 	if (sh->ctx)
758 		claim_zero(mlx5_glue->close_device(sh->ctx));
759 	if (sh->flow_id_pool)
760 		mlx5_flow_id_pool_release(sh->flow_id_pool);
761 	rte_free(sh);
762 	MLX5_ASSERT(err > 0);
763 	rte_errno = err;
764 	return NULL;
765 }
766 
767 /**
768  * Free the shared IB device context. Decrement the reference counter and,
769  * if it reaches zero, free all allocated resources and close handles.
770  *
771  * @param[in] sh
772  *   Pointer to mlx5_dev_ctx_shared object to free
773  */
774 void
775 mlx5_free_shared_dev_ctx(struct mlx5_dev_ctx_shared *sh)
776 {
777 	pthread_mutex_lock(&mlx5_dev_ctx_list_mutex);
778 #ifdef RTE_LIBRTE_MLX5_DEBUG
779 	/* Check the object presence in the list. */
780 	struct mlx5_dev_ctx_shared *lctx;
781 
782 	LIST_FOREACH(lctx, &mlx5_dev_ctx_list, next)
783 		if (lctx == sh)
784 			break;
785 	MLX5_ASSERT(lctx);
786 	if (lctx != sh) {
787 		DRV_LOG(ERR, "Freeing non-existing shared IB context");
788 		goto exit;
789 	}
790 #endif
791 	MLX5_ASSERT(sh);
792 	MLX5_ASSERT(sh->refcnt);
793 	/* Secondary process should not free the shared context. */
794 	MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
795 	if (--sh->refcnt)
796 		goto exit;
797 	/* Remove from memory callback device list. */
798 	rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock);
799 	LIST_REMOVE(sh, mem_event_cb);
800 	rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
801 	/* Release created Memory Regions. */
802 	mlx5_mr_release_cache(&sh->share_cache);
803 	/* Remove context from the global device list. */
804 	LIST_REMOVE(sh, next);
805 	/*
806 	 * Ensure there is no async event handler installed.
807 	 * Only the primary process handles async device events.
808 	 */
809 	mlx5_flow_counters_mng_close(sh);
810 	mlx5_flow_ipool_destroy(sh);
811 	mlx5_os_dev_shared_handler_uninstall(sh);
812 	if (sh->cnt_id_tbl) {
813 		mlx5_l3t_destroy(sh->cnt_id_tbl);
814 		sh->cnt_id_tbl = NULL;
815 	}
816 	if (sh->pd)
817 		claim_zero(mlx5_glue->dealloc_pd(sh->pd));
818 	if (sh->tis)
819 		claim_zero(mlx5_devx_cmd_destroy(sh->tis));
820 	if (sh->td)
821 		claim_zero(mlx5_devx_cmd_destroy(sh->td));
822 	if (sh->ctx)
823 		claim_zero(mlx5_glue->close_device(sh->ctx));
824 	if (sh->flow_id_pool)
825 		mlx5_flow_id_pool_release(sh->flow_id_pool);
826 	rte_free(sh);
827 exit:
828 	pthread_mutex_unlock(&mlx5_dev_ctx_list_mutex);
829 }
830 
831 /**
832  * Destroy table hash list and all the root entries per domain.
833  *
834  * @param[in] priv
835  *   Pointer to the private device data structure.
836  */
837 void
838 mlx5_free_table_hash_list(struct mlx5_priv *priv)
839 {
840 	struct mlx5_dev_ctx_shared *sh = priv->sh;
841 	struct mlx5_flow_tbl_data_entry *tbl_data;
842 	union mlx5_flow_tbl_key table_key = {
843 		{
844 			.table_id = 0,
845 			.reserved = 0,
846 			.domain = 0,
847 			.direction = 0,
848 		}
849 	};
850 	struct mlx5_hlist_entry *pos;
851 
852 	if (!sh->flow_tbls)
853 		return;
854 	pos = mlx5_hlist_lookup(sh->flow_tbls, table_key.v64);
855 	if (pos) {
856 		tbl_data = container_of(pos, struct mlx5_flow_tbl_data_entry,
857 					entry);
858 		MLX5_ASSERT(tbl_data);
859 		mlx5_hlist_remove(sh->flow_tbls, pos);
860 		rte_free(tbl_data);
861 	}
862 	table_key.direction = 1;
863 	pos = mlx5_hlist_lookup(sh->flow_tbls, table_key.v64);
864 	if (pos) {
865 		tbl_data = container_of(pos, struct mlx5_flow_tbl_data_entry,
866 					entry);
867 		MLX5_ASSERT(tbl_data);
868 		mlx5_hlist_remove(sh->flow_tbls, pos);
869 		rte_free(tbl_data);
870 	}
871 	table_key.direction = 0;
872 	table_key.domain = 1;
873 	pos = mlx5_hlist_lookup(sh->flow_tbls, table_key.v64);
874 	if (pos) {
875 		tbl_data = container_of(pos, struct mlx5_flow_tbl_data_entry,
876 					entry);
877 		MLX5_ASSERT(tbl_data);
878 		mlx5_hlist_remove(sh->flow_tbls, pos);
879 		rte_free(tbl_data);
880 	}
881 	mlx5_hlist_destroy(sh->flow_tbls, NULL, NULL);
882 }
883 
884 /**
885  * Initialize the flow table hash list and create the root table entry
886  * for each domain.
887  *
888  * @param[in] priv
889  *   Pointer to the private device data structure.
890  *
891  * @return
892  *   Zero on success, positive error code otherwise.
893  */
894 int
895 mlx5_alloc_table_hash_list(struct mlx5_priv *priv)
896 {
897 	struct mlx5_dev_ctx_shared *sh = priv->sh;
898 	char s[MLX5_HLIST_NAMESIZE];
899 	int err = 0;
900 
901 	MLX5_ASSERT(sh);
902 	snprintf(s, sizeof(s), "%s_flow_table", priv->sh->ibdev_name);
903 	sh->flow_tbls = mlx5_hlist_create(s, MLX5_FLOW_TABLE_HLIST_ARRAY_SIZE);
904 	if (!sh->flow_tbls) {
905 		DRV_LOG(ERR, "flow table hash list creation failed.");
906 		err = ENOMEM;
907 		return err;
908 	}
909 #ifndef HAVE_MLX5DV_DR
910 	/*
911 	 * In case we do not have DR support, the zero tables should be created
912 	 * because DV expects to see them even if they cannot be created by
913 	 * RDMA-CORE.
914 	 */
915 	union mlx5_flow_tbl_key table_key = {
916 		{
917 			.table_id = 0,
918 			.reserved = 0,
919 			.domain = 0,
920 			.direction = 0,
921 		}
922 	};
923 	struct mlx5_flow_tbl_data_entry *tbl_data = rte_zmalloc(NULL,
924 							  sizeof(*tbl_data), 0);
925 
926 	if (!tbl_data) {
927 		err = ENOMEM;
928 		goto error;
929 	}
930 	tbl_data->entry.key = table_key.v64;
931 	err = mlx5_hlist_insert(sh->flow_tbls, &tbl_data->entry);
932 	if (err)
933 		goto error;
934 	rte_atomic32_init(&tbl_data->tbl.refcnt);
935 	rte_atomic32_inc(&tbl_data->tbl.refcnt);
936 	table_key.direction = 1;
937 	tbl_data = rte_zmalloc(NULL, sizeof(*tbl_data), 0);
938 	if (!tbl_data) {
939 		err = ENOMEM;
940 		goto error;
941 	}
942 	tbl_data->entry.key = table_key.v64;
943 	err = mlx5_hlist_insert(sh->flow_tbls, &tbl_data->entry);
944 	if (err)
945 		goto error;
946 	rte_atomic32_init(&tbl_data->tbl.refcnt);
947 	rte_atomic32_inc(&tbl_data->tbl.refcnt);
948 	table_key.direction = 0;
949 	table_key.domain = 1;
950 	tbl_data = rte_zmalloc(NULL, sizeof(*tbl_data), 0);
951 	if (!tbl_data) {
952 		err = ENOMEM;
953 		goto error;
954 	}
955 	tbl_data->entry.key = table_key.v64;
956 	err = mlx5_hlist_insert(sh->flow_tbls, &tbl_data->entry);
957 	if (err)
958 		goto error;
959 	rte_atomic32_init(&tbl_data->tbl.refcnt);
960 	rte_atomic32_inc(&tbl_data->tbl.refcnt);
961 	return err;
962 error:
963 	mlx5_free_table_hash_list(priv);
964 #endif /* HAVE_MLX5DV_DR */
965 	return err;
966 }
967 
968 /**
969  * Initialize shared data between primary and secondary process.
970  *
971  * A memzone is reserved by primary process and secondary processes attach to
972  * the memzone.
973  *
974  * @return
975  *   0 on success, a negative errno value otherwise and rte_errno is set.
976  */
977 static int
978 mlx5_init_shared_data(void)
979 {
980 	const struct rte_memzone *mz;
981 	int ret = 0;
982 
983 	rte_spinlock_lock(&mlx5_shared_data_lock);
984 	if (mlx5_shared_data == NULL) {
985 		if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
986 			/* Allocate shared memory. */
987 			mz = rte_memzone_reserve(MZ_MLX5_PMD_SHARED_DATA,
988 						 sizeof(*mlx5_shared_data),
989 						 SOCKET_ID_ANY, 0);
990 			if (mz == NULL) {
991 				DRV_LOG(ERR,
992 					"Cannot allocate mlx5 shared data");
993 				ret = -rte_errno;
994 				goto error;
995 			}
996 			mlx5_shared_data = mz->addr;
997 			memset(mlx5_shared_data, 0, sizeof(*mlx5_shared_data));
998 			rte_spinlock_init(&mlx5_shared_data->lock);
999 		} else {
1000 			/* Lookup allocated shared memory. */
1001 			mz = rte_memzone_lookup(MZ_MLX5_PMD_SHARED_DATA);
1002 			if (mz == NULL) {
1003 				DRV_LOG(ERR,
1004 					"Cannot attach mlx5 shared data");
1005 				ret = -rte_errno;
1006 				goto error;
1007 			}
1008 			mlx5_shared_data = mz->addr;
1009 			memset(&mlx5_local_data, 0, sizeof(mlx5_local_data));
1010 		}
1011 	}
1012 error:
1013 	rte_spinlock_unlock(&mlx5_shared_data_lock);
1014 	return ret;
1015 }
1016 
1017 /**
1018  * Retrieve integer value from environment variable.
1019  *
1020  * @param[in] name
1021  *   Environment variable name.
1022  *
1023  * @return
1024  *   Integer value, 0 if the variable is not set.
1025  */
1026 int
1027 mlx5_getenv_int(const char *name)
1028 {
1029 	const char *val = getenv(name);
1030 
1031 	if (val == NULL)
1032 		return 0;
1033 	return atoi(val);
1034 }
1035 
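/*
 * Example (illustrative; the variable name is a placeholder):
 *
 *	int val = mlx5_getenv_int("MLX5_EXAMPLE_VAR");
 */
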
1036 /**
1037  * DPDK callback to add UDP tunnel port.
1038  *
1039  * @param[in] dev
1040  *   A pointer to eth_dev
1041  * @param[in] udp_tunnel
1042  *   A pointer to udp tunnel
1043  *
1044  * @return
1045  *   0 on valid udp ports and tunnels, -ENOTSUP otherwise.
1046  */
1047 int
1048 mlx5_udp_tunnel_port_add(struct rte_eth_dev *dev __rte_unused,
1049 			 struct rte_eth_udp_tunnel *udp_tunnel)
1050 {
1051 	MLX5_ASSERT(udp_tunnel != NULL);
1052 	if (udp_tunnel->prot_type == RTE_TUNNEL_TYPE_VXLAN &&
1053 	    udp_tunnel->udp_port == 4789)
1054 		return 0;
1055 	if (udp_tunnel->prot_type == RTE_TUNNEL_TYPE_VXLAN_GPE &&
1056 	    udp_tunnel->udp_port == 4790)
1057 		return 0;
1058 	return -ENOTSUP;
1059 }
1060 
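/*
 * Caller-side sketch (assumed application code, not part of the PMD): only
 * the VXLAN/VXLAN-GPE ports accepted above will succeed; port_id is a
 * placeholder.
 *
 *	struct rte_eth_udp_tunnel tunnel = {
 *		.udp_port = 4789,
 *		.prot_type = RTE_TUNNEL_TYPE_VXLAN,
 *	};
 *	int ret = rte_eth_dev_udp_tunnel_port_add(port_id, &tunnel);
 */
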
1061 /**
1062  * Initialize process private data structure.
1063  *
1064  * @param dev
1065  *   Pointer to Ethernet device structure.
1066  *
1067  * @return
1068  *   0 on success, a negative errno value otherwise and rte_errno is set.
1069  */
1070 int
1071 mlx5_proc_priv_init(struct rte_eth_dev *dev)
1072 {
1073 	struct mlx5_priv *priv = dev->data->dev_private;
1074 	struct mlx5_proc_priv *ppriv;
1075 	size_t ppriv_size;
1076 
1077 	/*
1078 	 * UAR register table follows the process private structure. BlueFlame
1079 	 * registers for Tx queues are stored in the table.
1080 	 */
1081 	ppriv_size =
1082 		sizeof(struct mlx5_proc_priv) + priv->txqs_n * sizeof(void *);
1083 	ppriv = rte_malloc_socket("mlx5_proc_priv", ppriv_size,
1084 				  RTE_CACHE_LINE_SIZE, dev->device->numa_node);
1085 	if (!ppriv) {
1086 		rte_errno = ENOMEM;
1087 		return -rte_errno;
1088 	}
1089 	ppriv->uar_table_sz = ppriv_size;
1090 	dev->process_private = ppriv;
1091 	return 0;
1092 }
1093 
1094 /**
1095  * Un-initialize process private data structure.
1096  *
1097  * @param dev
1098  *   Pointer to Ethernet device structure.
1099  */
1100 static void
1101 mlx5_proc_priv_uninit(struct rte_eth_dev *dev)
1102 {
1103 	if (!dev->process_private)
1104 		return;
1105 	rte_free(dev->process_private);
1106 	dev->process_private = NULL;
1107 }
1108 
1109 /**
1110  * DPDK callback to close the device.
1111  *
1112  * Destroy all queues and objects, free memory.
1113  *
1114  * @param dev
1115  *   Pointer to Ethernet device structure.
1116  */
1117 void
1118 mlx5_dev_close(struct rte_eth_dev *dev)
1119 {
1120 	struct mlx5_priv *priv = dev->data->dev_private;
1121 	unsigned int i;
1122 	int ret;
1123 
1124 	if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
1125 		/* Check if process_private released. */
1126 		if (!dev->process_private)
1127 			return;
1128 		mlx5_tx_uar_uninit_secondary(dev);
1129 		mlx5_proc_priv_uninit(dev);
1130 		rte_eth_dev_release_port(dev);
1131 		return;
1132 	}
1133 	if (!priv->sh)
1134 		return;
1135 	DRV_LOG(DEBUG, "port %u closing device \"%s\"",
1136 		dev->data->port_id,
1137 		((priv->sh->ctx != NULL) ?
1138 		mlx5_os_get_ctx_device_name(priv->sh->ctx) : ""));
1139 	/*
1140 	 * If default mreg copy action is removed at the stop stage,
1141 	 * the search will find nothing and no further action will be taken.
1142 	 */
1143 	mlx5_flow_stop_default(dev);
1144 	mlx5_traffic_disable(dev);
1145 	/*
1146 	 * If all the flows are already flushed in the device stop stage,
1147 	 * then this will return directly without any action.
1148 	 */
1149 	mlx5_flow_list_flush(dev, &priv->flows, true);
1150 	mlx5_flow_meter_flush(dev, NULL);
1151 	/* Free the intermediate buffers for flow creation. */
1152 	mlx5_flow_free_intermediate(dev);
1153 	/* Prevent crashes when queues are still in use. */
1154 	dev->rx_pkt_burst = removed_rx_burst;
1155 	dev->tx_pkt_burst = removed_tx_burst;
1156 	rte_wmb();
1157 	/* Disable datapath on secondary process. */
1158 	mlx5_mp_req_stop_rxtx(dev);
1159 	if (priv->rxqs != NULL) {
1160 		/* XXX race condition if mlx5_rx_burst() is still running. */
1161 		usleep(1000);
1162 		for (i = 0; (i != priv->rxqs_n); ++i)
1163 			mlx5_rxq_release(dev, i);
1164 		priv->rxqs_n = 0;
1165 		priv->rxqs = NULL;
1166 	}
1167 	if (priv->txqs != NULL) {
1168 		/* XXX race condition if mlx5_tx_burst() is still running. */
1169 		usleep(1000);
1170 		for (i = 0; (i != priv->txqs_n); ++i)
1171 			mlx5_txq_release(dev, i);
1172 		priv->txqs_n = 0;
1173 		priv->txqs = NULL;
1174 	}
1175 	mlx5_proc_priv_uninit(dev);
1176 	if (priv->mreg_cp_tbl)
1177 		mlx5_hlist_destroy(priv->mreg_cp_tbl, NULL, NULL);
1178 	mlx5_mprq_free_mp(dev);
1179 	mlx5_os_free_shared_dr(priv);
1180 	if (priv->rss_conf.rss_key != NULL)
1181 		rte_free(priv->rss_conf.rss_key);
1182 	if (priv->reta_idx != NULL)
1183 		rte_free(priv->reta_idx);
1184 	if (priv->config.vf)
1185 		mlx5_nl_mac_addr_flush(priv->nl_socket_route, mlx5_ifindex(dev),
1186 				       dev->data->mac_addrs,
1187 				       MLX5_MAX_MAC_ADDRESSES, priv->mac_own);
1188 	if (priv->nl_socket_route >= 0)
1189 		close(priv->nl_socket_route);
1190 	if (priv->nl_socket_rdma >= 0)
1191 		close(priv->nl_socket_rdma);
1192 	if (priv->vmwa_context)
1193 		mlx5_vlan_vmwa_exit(priv->vmwa_context);
1194 	ret = mlx5_hrxq_verify(dev);
1195 	if (ret)
1196 		DRV_LOG(WARNING, "port %u some hash Rx queue still remain",
1197 			dev->data->port_id);
1198 	ret = mlx5_ind_table_obj_verify(dev);
1199 	if (ret)
1200 		DRV_LOG(WARNING, "port %u some indirection table still remain",
1201 			dev->data->port_id);
1202 	ret = mlx5_rxq_obj_verify(dev);
1203 	if (ret)
1204 		DRV_LOG(WARNING, "port %u some Rx queue objects still remain",
1205 			dev->data->port_id);
1206 	ret = mlx5_rxq_verify(dev);
1207 	if (ret)
1208 		DRV_LOG(WARNING, "port %u some Rx queues still remain",
1209 			dev->data->port_id);
1210 	ret = mlx5_txq_obj_verify(dev);
1211 	if (ret)
1212 		DRV_LOG(WARNING, "port %u some Verbs Tx queue still remain",
1213 			dev->data->port_id);
1214 	ret = mlx5_txq_verify(dev);
1215 	if (ret)
1216 		DRV_LOG(WARNING, "port %u some Tx queues still remain",
1217 			dev->data->port_id);
1218 	ret = mlx5_flow_verify(dev);
1219 	if (ret)
1220 		DRV_LOG(WARNING, "port %u some flows still remain",
1221 			dev->data->port_id);
1222 	/*
1223 	 * Free the shared context last, because the cleanup routines
1224 	 * above may still use some shared fields, e.g.
1225 	 * mlx5_nl_mac_addr_flush() uses ibdev_path for retrieving the
1226 	 * ifindex if Netlink fails.
1227 	 */
1228 	mlx5_free_shared_dev_ctx(priv->sh);
1229 	if (priv->domain_id != RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID) {
1230 		unsigned int c = 0;
1231 		uint16_t port_id;
1232 
1233 		MLX5_ETH_FOREACH_DEV(port_id, priv->pci_dev) {
1234 			struct mlx5_priv *opriv =
1235 				rte_eth_devices[port_id].data->dev_private;
1236 
1237 			if (!opriv ||
1238 			    opriv->domain_id != priv->domain_id ||
1239 			    &rte_eth_devices[port_id] == dev)
1240 				continue;
1241 			++c;
1242 			break;
1243 		}
1244 		if (!c)
1245 			claim_zero(rte_eth_switch_domain_free(priv->domain_id));
1246 	}
1247 	memset(priv, 0, sizeof(*priv));
1248 	priv->domain_id = RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID;
1249 	/*
1250 	 * Reset mac_addrs to NULL such that it is not freed as part of
1251 	 * rte_eth_dev_release_port(). mac_addrs is part of dev_private so
1252 	 * it is freed when dev_private is freed.
1253 	 */
1254 	dev->data->mac_addrs = NULL;
1255 }
1256 
1257 /**
1258  * Verify and store value for device argument.
1259  *
1260  * @param[in] key
1261  *   Key argument to verify.
1262  * @param[in] val
1263  *   Value associated with key.
1264  * @param opaque
1265  *   User data.
1266  *
1267  * @return
1268  *   0 on success, a negative errno value otherwise and rte_errno is set.
1269  */
1270 static int
1271 mlx5_args_check(const char *key, const char *val, void *opaque)
1272 {
1273 	struct mlx5_dev_config *config = opaque;
1274 	unsigned long tmp;
1275 
1276 	/* No-op, port representors are processed in mlx5_dev_spawn(). */
1277 	if (!strcmp(MLX5_REPRESENTOR, key))
1278 		return 0;
1279 	errno = 0;
1280 	tmp = strtoul(val, NULL, 0);
1281 	if (errno) {
1282 		rte_errno = errno;
1283 		DRV_LOG(WARNING, "%s: \"%s\" is not a valid integer", key, val);
1284 		return -rte_errno;
1285 	}
1286 	if (strcmp(MLX5_RXQ_CQE_COMP_EN, key) == 0) {
1287 		config->cqe_comp = !!tmp;
1288 	} else if (strcmp(MLX5_RXQ_CQE_PAD_EN, key) == 0) {
1289 		config->cqe_pad = !!tmp;
1290 	} else if (strcmp(MLX5_RXQ_PKT_PAD_EN, key) == 0) {
1291 		config->hw_padding = !!tmp;
1292 	} else if (strcmp(MLX5_RX_MPRQ_EN, key) == 0) {
1293 		config->mprq.enabled = !!tmp;
1294 	} else if (strcmp(MLX5_RX_MPRQ_LOG_STRIDE_NUM, key) == 0) {
1295 		config->mprq.stride_num_n = tmp;
1296 	} else if (strcmp(MLX5_RX_MPRQ_LOG_STRIDE_SIZE, key) == 0) {
1297 		config->mprq.stride_size_n = tmp;
1298 	} else if (strcmp(MLX5_RX_MPRQ_MAX_MEMCPY_LEN, key) == 0) {
1299 		config->mprq.max_memcpy_len = tmp;
1300 	} else if (strcmp(MLX5_RXQS_MIN_MPRQ, key) == 0) {
1301 		config->mprq.min_rxqs_num = tmp;
1302 	} else if (strcmp(MLX5_TXQ_INLINE, key) == 0) {
1303 		DRV_LOG(WARNING, "%s: deprecated parameter,"
1304 				 " converted to txq_inline_max", key);
1305 		config->txq_inline_max = tmp;
1306 	} else if (strcmp(MLX5_TXQ_INLINE_MAX, key) == 0) {
1307 		config->txq_inline_max = tmp;
1308 	} else if (strcmp(MLX5_TXQ_INLINE_MIN, key) == 0) {
1309 		config->txq_inline_min = tmp;
1310 	} else if (strcmp(MLX5_TXQ_INLINE_MPW, key) == 0) {
1311 		config->txq_inline_mpw = tmp;
1312 	} else if (strcmp(MLX5_TXQS_MIN_INLINE, key) == 0) {
1313 		config->txqs_inline = tmp;
1314 	} else if (strcmp(MLX5_TXQS_MAX_VEC, key) == 0) {
1315 		DRV_LOG(WARNING, "%s: deprecated parameter, ignored", key);
1316 	} else if (strcmp(MLX5_TXQ_MPW_EN, key) == 0) {
1317 		config->mps = !!tmp;
1318 	} else if (strcmp(MLX5_TX_DB_NC, key) == 0) {
1319 		if (tmp != MLX5_TXDB_CACHED &&
1320 		    tmp != MLX5_TXDB_NCACHED &&
1321 		    tmp != MLX5_TXDB_HEURISTIC) {
1322 			DRV_LOG(ERR, "invalid Tx doorbell "
1323 				     "mapping parameter");
1324 			rte_errno = EINVAL;
1325 			return -rte_errno;
1326 		}
1327 		config->dbnc = tmp;
1328 	} else if (strcmp(MLX5_TXQ_MPW_HDR_DSEG_EN, key) == 0) {
1329 		DRV_LOG(WARNING, "%s: deprecated parameter, ignored", key);
1330 	} else if (strcmp(MLX5_TXQ_MAX_INLINE_LEN, key) == 0) {
1331 		DRV_LOG(WARNING, "%s: deprecated parameter,"
1332 				 " converted to txq_inline_mpw", key);
1333 		config->txq_inline_mpw = tmp;
1334 	} else if (strcmp(MLX5_TX_VEC_EN, key) == 0) {
1335 		DRV_LOG(WARNING, "%s: deprecated parameter, ignored", key);
1336 	} else if (strcmp(MLX5_RX_VEC_EN, key) == 0) {
1337 		config->rx_vec_en = !!tmp;
1338 	} else if (strcmp(MLX5_L3_VXLAN_EN, key) == 0) {
1339 		config->l3_vxlan_en = !!tmp;
1340 	} else if (strcmp(MLX5_VF_NL_EN, key) == 0) {
1341 		config->vf_nl_en = !!tmp;
1342 	} else if (strcmp(MLX5_DV_ESW_EN, key) == 0) {
1343 		config->dv_esw_en = !!tmp;
1344 	} else if (strcmp(MLX5_DV_FLOW_EN, key) == 0) {
1345 		config->dv_flow_en = !!tmp;
1346 	} else if (strcmp(MLX5_DV_XMETA_EN, key) == 0) {
1347 		if (tmp != MLX5_XMETA_MODE_LEGACY &&
1348 		    tmp != MLX5_XMETA_MODE_META16 &&
1349 		    tmp != MLX5_XMETA_MODE_META32) {
1350 			DRV_LOG(ERR, "invalid extensive "
1351 				     "metadata parameter");
1352 			rte_errno = EINVAL;
1353 			return -rte_errno;
1354 		}
1355 		config->dv_xmeta_en = tmp;
1356 	} else if (strcmp(MLX5_LACP_BY_USER, key) == 0) {
1357 		config->lacp_by_user = !!tmp;
1358 	} else if (strcmp(MLX5_MR_EXT_MEMSEG_EN, key) == 0) {
1359 		config->mr_ext_memseg_en = !!tmp;
1360 	} else if (strcmp(MLX5_MAX_DUMP_FILES_NUM, key) == 0) {
1361 		config->max_dump_files_num = tmp;
1362 	} else if (strcmp(MLX5_LRO_TIMEOUT_USEC, key) == 0) {
1363 		config->lro.timeout = tmp;
1364 	} else if (strcmp(MLX5_CLASS_ARG_NAME, key) == 0) {
1365 		DRV_LOG(DEBUG, "class argument is %s.", val);
1366 	} else if (strcmp(MLX5_HP_BUF_SIZE, key) == 0) {
1367 		config->log_hp_size = tmp;
1368 	} else if (strcmp(MLX5_RECLAIM_MEM, key) == 0) {
1369 		if (tmp != MLX5_RCM_NONE &&
1370 		    tmp != MLX5_RCM_LIGHT &&
1371 		    tmp != MLX5_RCM_AGGR) {
1372 			DRV_LOG(ERR, "Unrecognized %s: \"%s\"", key, val);
1373 			rte_errno = EINVAL;
1374 			return -rte_errno;
1375 		}
1376 		config->reclaim_mode = tmp;
1377 	} else {
1378 		DRV_LOG(WARNING, "%s: unknown parameter", key);
1379 		rte_errno = EINVAL;
1380 		return -rte_errno;
1381 	}
1382 	return 0;
1383 }
1384 
1385 /**
1386  * Parse device parameters.
1387  *
1388  * @param config
1389  *   Pointer to device configuration structure.
1390  * @param devargs
1391  *   Device arguments structure.
1392  *
1393  * @return
1394  *   0 on success, a negative errno value otherwise and rte_errno is set.
1395  */
1396 int
1397 mlx5_args(struct mlx5_dev_config *config, struct rte_devargs *devargs)
1398 {
1399 	const char **params = (const char *[]){
1400 		MLX5_RXQ_CQE_COMP_EN,
1401 		MLX5_RXQ_CQE_PAD_EN,
1402 		MLX5_RXQ_PKT_PAD_EN,
1403 		MLX5_RX_MPRQ_EN,
1404 		MLX5_RX_MPRQ_LOG_STRIDE_NUM,
1405 		MLX5_RX_MPRQ_LOG_STRIDE_SIZE,
1406 		MLX5_RX_MPRQ_MAX_MEMCPY_LEN,
1407 		MLX5_RXQS_MIN_MPRQ,
1408 		MLX5_TXQ_INLINE,
1409 		MLX5_TXQ_INLINE_MIN,
1410 		MLX5_TXQ_INLINE_MAX,
1411 		MLX5_TXQ_INLINE_MPW,
1412 		MLX5_TXQS_MIN_INLINE,
1413 		MLX5_TXQS_MAX_VEC,
1414 		MLX5_TXQ_MPW_EN,
1415 		MLX5_TXQ_MPW_HDR_DSEG_EN,
1416 		MLX5_TXQ_MAX_INLINE_LEN,
1417 		MLX5_TX_DB_NC,
1418 		MLX5_TX_VEC_EN,
1419 		MLX5_RX_VEC_EN,
1420 		MLX5_L3_VXLAN_EN,
1421 		MLX5_VF_NL_EN,
1422 		MLX5_DV_ESW_EN,
1423 		MLX5_DV_FLOW_EN,
1424 		MLX5_DV_XMETA_EN,
1425 		MLX5_LACP_BY_USER,
1426 		MLX5_MR_EXT_MEMSEG_EN,
1427 		MLX5_REPRESENTOR,
1428 		MLX5_MAX_DUMP_FILES_NUM,
1429 		MLX5_LRO_TIMEOUT_USEC,
1430 		MLX5_CLASS_ARG_NAME,
1431 		MLX5_HP_BUF_SIZE,
1432 		MLX5_RECLAIM_MEM,
1433 		NULL,
1434 	};
1435 	struct rte_kvargs *kvlist;
1436 	int ret = 0;
1437 	int i;
1438 
1439 	if (devargs == NULL)
1440 		return 0;
1441 	/* Following UGLY cast is done to pass checkpatch. */
1442 	kvlist = rte_kvargs_parse(devargs->args, params);
1443 	if (kvlist == NULL) {
1444 		rte_errno = EINVAL;
1445 		return -rte_errno;
1446 	}
1447 	/* Process parameters. */
1448 	for (i = 0; (params[i] != NULL); ++i) {
1449 		if (rte_kvargs_count(kvlist, params[i])) {
1450 			ret = rte_kvargs_process(kvlist, params[i],
1451 						 mlx5_args_check, config);
1452 			if (ret) {
1453 				rte_errno = EINVAL;
1454 				rte_kvargs_free(kvlist);
1455 				return -rte_errno;
1456 			}
1457 		}
1458 	}
1459 	rte_kvargs_free(kvlist);
1460 	return 0;
1461 }
1462 
1463 /**
1464  * PMD global initialization.
1465  *
1466  * Independent from individual device, this function initializes global
1467  * per-PMD data structures distinguishing primary and secondary processes.
1468  * Hence, each initialization is called once per process.
1469  *
1470  * @return
1471  *   0 on success, a negative errno value otherwise and rte_errno is set.
1472  */
1473 int
1474 mlx5_init_once(void)
1475 {
1476 	struct mlx5_shared_data *sd;
1477 	struct mlx5_local_data *ld = &mlx5_local_data;
1478 	int ret = 0;
1479 
1480 	if (mlx5_init_shared_data())
1481 		return -rte_errno;
1482 	sd = mlx5_shared_data;
1483 	MLX5_ASSERT(sd);
1484 	rte_spinlock_lock(&sd->lock);
1485 	switch (rte_eal_process_type()) {
1486 	case RTE_PROC_PRIMARY:
1487 		if (sd->init_done)
1488 			break;
1489 		LIST_INIT(&sd->mem_event_cb_list);
1490 		rte_rwlock_init(&sd->mem_event_rwlock);
1491 		rte_mem_event_callback_register("MLX5_MEM_EVENT_CB",
1492 						mlx5_mr_mem_event_cb, NULL);
1493 		ret = mlx5_mp_init_primary(MLX5_MP_NAME,
1494 					   mlx5_mp_primary_handle);
1495 		if (ret)
1496 			goto out;
1497 		sd->init_done = true;
1498 		break;
1499 	case RTE_PROC_SECONDARY:
1500 		if (ld->init_done)
1501 			break;
1502 		ret = mlx5_mp_init_secondary(MLX5_MP_NAME,
1503 					     mlx5_mp_secondary_handle);
1504 		if (ret)
1505 			goto out;
1506 		++sd->secondary_cnt;
1507 		ld->init_done = true;
1508 		break;
1509 	default:
1510 		break;
1511 	}
1512 out:
1513 	rte_spinlock_unlock(&sd->lock);
1514 	return ret;
1515 }
1516 
1517 /**
1518  * Configures the minimal amount of data to inline into WQE
1519  * while sending packets.
1520  *
1521  * - txq_inline_min has the highest priority if this
1522  *   key is specified in devargs;
1523  * - if DevX is enabled, the inline mode is queried from the
1524  *   device (HCA attributes and NIC vport context if needed);
1525  * - otherwise, L2 mode (18 bytes) is assumed for ConnectX-4/4 Lx
1526  *   and none (0 bytes) for other NICs.
1527  *
1528  * @param spawn
1529  *   Verbs device parameters (name, port, switch_info) to spawn.
1530  * @param config
1531  *   Device configuration parameters.
1532  */
1533 void
1534 mlx5_set_min_inline(struct mlx5_dev_spawn_data *spawn,
1535 		    struct mlx5_dev_config *config)
1536 {
1537 	if (config->txq_inline_min != MLX5_ARG_UNSET) {
1538 		/* Application defines size of inlined data explicitly. */
1539 		switch (spawn->pci_dev->id.device_id) {
1540 		case PCI_DEVICE_ID_MELLANOX_CONNECTX4:
1541 		case PCI_DEVICE_ID_MELLANOX_CONNECTX4VF:
1542 			if (config->txq_inline_min <
1543 				       (int)MLX5_INLINE_HSIZE_L2) {
1544 				DRV_LOG(DEBUG,
1545 					"txq_inline_min aligned to minimal"
1546 					" ConnectX-4 required value %d",
1547 					(int)MLX5_INLINE_HSIZE_L2);
1548 				config->txq_inline_min = MLX5_INLINE_HSIZE_L2;
1549 			}
1550 			break;
1551 		}
1552 		goto exit;
1553 	}
1554 	if (config->hca_attr.eth_net_offloads) {
1555 		/* We have DevX enabled, inline mode queried successfully. */
1556 		switch (config->hca_attr.wqe_inline_mode) {
1557 		case MLX5_CAP_INLINE_MODE_L2:
1558 			/* outer L2 header must be inlined. */
1559 			config->txq_inline_min = MLX5_INLINE_HSIZE_L2;
1560 			goto exit;
1561 		case MLX5_CAP_INLINE_MODE_NOT_REQUIRED:
1562 			/* No inline data are required by NIC. */
1563 			config->txq_inline_min = MLX5_INLINE_HSIZE_NONE;
1564 			config->hw_vlan_insert =
1565 				config->hca_attr.wqe_vlan_insert;
1566 			DRV_LOG(DEBUG, "Tx VLAN insertion is supported");
1567 			goto exit;
1568 		case MLX5_CAP_INLINE_MODE_VPORT_CONTEXT:
1569 			/* inline mode is defined by NIC vport context. */
1570 			if (!config->hca_attr.eth_virt)
1571 				break;
1572 			switch (config->hca_attr.vport_inline_mode) {
1573 			case MLX5_INLINE_MODE_NONE:
1574 				config->txq_inline_min =
1575 					MLX5_INLINE_HSIZE_NONE;
1576 				goto exit;
1577 			case MLX5_INLINE_MODE_L2:
1578 				config->txq_inline_min =
1579 					MLX5_INLINE_HSIZE_L2;
1580 				goto exit;
1581 			case MLX5_INLINE_MODE_IP:
1582 				config->txq_inline_min =
1583 					MLX5_INLINE_HSIZE_L3;
1584 				goto exit;
1585 			case MLX5_INLINE_MODE_TCP_UDP:
1586 				config->txq_inline_min =
1587 					MLX5_INLINE_HSIZE_L4;
1588 				goto exit;
1589 			case MLX5_INLINE_MODE_INNER_L2:
1590 				config->txq_inline_min =
1591 					MLX5_INLINE_HSIZE_INNER_L2;
1592 				goto exit;
1593 			case MLX5_INLINE_MODE_INNER_IP:
1594 				config->txq_inline_min =
1595 					MLX5_INLINE_HSIZE_INNER_L3;
1596 				goto exit;
1597 			case MLX5_INLINE_MODE_INNER_TCP_UDP:
1598 				config->txq_inline_min =
1599 					MLX5_INLINE_HSIZE_INNER_L4;
1600 				goto exit;
1601 			}
1602 		}
1603 	}
1604 	/*
1605 	 * We get here if we are unable to deduce
1606 	 * inline data size with DevX. Try PCI ID
1607 	 * to determine old NICs.
1608 	 */
1609 	switch (spawn->pci_dev->id.device_id) {
1610 	case PCI_DEVICE_ID_MELLANOX_CONNECTX4:
1611 	case PCI_DEVICE_ID_MELLANOX_CONNECTX4VF:
1612 	case PCI_DEVICE_ID_MELLANOX_CONNECTX4LX:
1613 	case PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF:
1614 		config->txq_inline_min = MLX5_INLINE_HSIZE_L2;
1615 		config->hw_vlan_insert = 0;
1616 		break;
1617 	case PCI_DEVICE_ID_MELLANOX_CONNECTX5:
1618 	case PCI_DEVICE_ID_MELLANOX_CONNECTX5VF:
1619 	case PCI_DEVICE_ID_MELLANOX_CONNECTX5EX:
1620 	case PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF:
1621 		/*
1622 		 * These NICs support VLAN insertion from WQE and
1623 		 * report the wqe_vlan_insert flag. But there is a bug
1624 		 * that may break PFC control, so disable the feature.
1625 		 */
1626 		config->hw_vlan_insert = 0;
1627 		config->txq_inline_min = MLX5_INLINE_HSIZE_NONE;
1628 		break;
1629 	default:
1630 		config->txq_inline_min = MLX5_INLINE_HSIZE_NONE;
1631 		break;
1632 	}
1633 exit:
1634 	DRV_LOG(DEBUG, "min tx inline configured: %d", config->txq_inline_min);
1635 }
1636 
1637 /**
1638  * Configures the metadata mask fields in the shared context.
1639  *
1640  * @param [in] dev
1641  *   Pointer to Ethernet device.
1642  */
1643 void
1644 mlx5_set_metadata_mask(struct rte_eth_dev *dev)
1645 {
1646 	struct mlx5_priv *priv = dev->data->dev_private;
1647 	struct mlx5_dev_ctx_shared *sh = priv->sh;
1648 	uint32_t meta, mark, reg_c0;
1649 
1650 	reg_c0 = ~priv->vport_meta_mask;
1651 	switch (priv->config.dv_xmeta_en) {
1652 	case MLX5_XMETA_MODE_LEGACY:
1653 		meta = UINT32_MAX;
1654 		mark = MLX5_FLOW_MARK_MASK;
1655 		break;
1656 	case MLX5_XMETA_MODE_META16:
1657 		meta = reg_c0 >> rte_bsf32(reg_c0);
1658 		mark = MLX5_FLOW_MARK_MASK;
1659 		break;
1660 	case MLX5_XMETA_MODE_META32:
1661 		meta = UINT32_MAX;
1662 		mark = (reg_c0 >> rte_bsf32(reg_c0)) & MLX5_FLOW_MARK_MASK;
1663 		break;
1664 	default:
1665 		meta = 0;
1666 		mark = 0;
1667 		MLX5_ASSERT(false);
1668 		break;
1669 	}
1670 	if (sh->dv_mark_mask && sh->dv_mark_mask != mark)
1671 		DRV_LOG(WARNING, "metadata MARK mask mismatch %08X:%08X",
1672 				 sh->dv_mark_mask, mark);
1673 	else
1674 		sh->dv_mark_mask = mark;
1675 	if (sh->dv_meta_mask && sh->dv_meta_mask != meta)
1676 		DRV_LOG(WARNING, "metadata META mask mismatch %08X:%08X",
1677 				 sh->dv_meta_mask, meta);
1678 	else
1679 		sh->dv_meta_mask = meta;
1680 	if (sh->dv_regc0_mask && sh->dv_regc0_mask != reg_c0)
1681 		DRV_LOG(WARNING, "metadata reg_c0 mask mismatch %08X:%08X",
1682 				 sh->dv_regc0_mask, reg_c0);
1683 	else
1684 		sh->dv_regc0_mask = reg_c0;
1685 	DRV_LOG(DEBUG, "metadata mode %u", priv->config.dv_xmeta_en);
1686 	DRV_LOG(DEBUG, "metadata MARK mask %08X", sh->dv_mark_mask);
1687 	DRV_LOG(DEBUG, "metadata META mask %08X", sh->dv_meta_mask);
1688 	DRV_LOG(DEBUG, "metadata reg_c0 mask %08X", sh->dv_regc0_mask);
1689 }
1690 
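/*
 * Worked example for the MLX5_XMETA_MODE_META16 branch above (the mask value
 * is assumed for illustration): with priv->vport_meta_mask == 0x0000FFFF,
 * reg_c0 = ~0x0000FFFF = 0xFFFF0000, rte_bsf32(reg_c0) = 16, hence
 * meta = 0xFFFF0000 >> 16 = 0x0000FFFF and mark = MLX5_FLOW_MARK_MASK.
 */
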
1691 int
1692 rte_pmd_mlx5_get_dyn_flag_names(char *names[], unsigned int n)
1693 {
1694 	static const char *const dynf_names[] = {
1695 		RTE_PMD_MLX5_FINE_GRANULARITY_INLINE,
1696 		RTE_MBUF_DYNFLAG_METADATA_NAME
1697 	};
1698 	unsigned int i;
1699 
1700 	if (n < RTE_DIM(dynf_names))
1701 		return -ENOMEM;
1702 	for (i = 0; i < RTE_DIM(dynf_names); i++) {
1703 		if (names[i] == NULL)
1704 			return -EINVAL;
1705 		strcpy(names[i], dynf_names[i]);
1706 	}
1707 	return RTE_DIM(dynf_names);
1708 }
1709 
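/*
 * Caller-side sketch (assumed application code; RTE_MBUF_DYN_NAMESIZE from
 * rte_mbuf_dyn.h is assumed as the per-name buffer size):
 *
 *	char buf[2][RTE_MBUF_DYN_NAMESIZE];
 *	char *names[2] = { buf[0], buf[1] };
 *	int n = rte_pmd_mlx5_get_dyn_flag_names(names, 2);
 *
 * On success n equals RTE_DIM(dynf_names) above.
 */
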
1710 /**
1711  * Check sibling device configurations.
1712  *
1713  * Sibling devices sharing the Infiniband device context
1714  * should have compatible configurations.
1715  *
1716  * @param[in] priv
1717  *   Pointer to the private device data structure.
1718  * @param[in] config
1719  *   Pointer to the device configuration to check.
1720  *
1721  * @return
1722  *   0 on success, a positive EINVAL value otherwise and rte_errno is set.
1723  */
1724 int
1725 mlx5_dev_check_sibling_config(struct mlx5_priv *priv,
1726 			      struct mlx5_dev_config *config)
1727 {
1728 	struct mlx5_dev_ctx_shared *sh = priv->sh;
1729 	struct mlx5_dev_config *sh_conf = NULL;
1730 	uint16_t port_id;
1731 
1732 	MLX5_ASSERT(sh);
1733 	/* Nothing to compare for the single/first device. */
1734 	if (sh->refcnt == 1)
1735 		return 0;
1736 	/* Find the device with shared context. */
1737 	MLX5_ETH_FOREACH_DEV(port_id, priv->pci_dev) {
1738 		struct mlx5_priv *opriv =
1739 			rte_eth_devices[port_id].data->dev_private;
1740 
1741 		if (opriv && opriv != priv && opriv->sh == sh) {
1742 			sh_conf = &opriv->config;
1743 			break;
1744 		}
1745 	}
1746 	if (!sh_conf)
1747 		return 0;
1748 	if (sh_conf->dv_flow_en ^ config->dv_flow_en) {
1749 		DRV_LOG(ERR, "\"dv_flow_en\" configuration mismatch"
1750 			     " for shared %s context", sh->ibdev_name);
1751 		rte_errno = EINVAL;
1752 		return rte_errno;
1753 	}
1754 	if (sh_conf->dv_xmeta_en ^ config->dv_xmeta_en) {
1755 		DRV_LOG(ERR, "\"dv_xmeta_en\" configuration mismatch"
1756 			     " for shared %s context", sh->ibdev_name);
1757 		rte_errno = EINVAL;
1758 		return rte_errno;
1759 	}
1760 	return 0;
1761 }
1762 
1763 /**
1764  * Look for the Ethernet device belonging to the mlx5 driver.
1765  *
1766  * @param[in] port_id
1767  *   port_id to start looking for the device.
1768  * @param[in] pci_dev
1769  *   Pointer to the hint PCI device. When the device is being probed,
1770  *   its siblings (master and preceding representors) might not have
1771  *   an assigned driver yet (because mlx5_os_pci_probe() is not
1772  *   completed yet); in this case matching on the hint PCI device
1773  *   may be used to detect a sibling device.
1774  *
1775  * @return
1776  *   port_id of the found device, RTE_MAX_ETHPORTS if not found.
1777  */
1778 uint16_t
1779 mlx5_eth_find_next(uint16_t port_id, struct rte_pci_device *pci_dev)
1780 {
1781 	while (port_id < RTE_MAX_ETHPORTS) {
1782 		struct rte_eth_dev *dev = &rte_eth_devices[port_id];
1783 
1784 		if (dev->state != RTE_ETH_DEV_UNUSED &&
1785 		    dev->device &&
1786 		    (dev->device == &pci_dev->device ||
1787 		     (dev->device->driver &&
1788 		     dev->device->driver->name &&
1789 		     !strcmp(dev->device->driver->name, MLX5_DRIVER_NAME))))
1790 			break;
1791 		port_id++;
1792 	}
1793 	if (port_id >= RTE_MAX_ETHPORTS)
1794 		return RTE_MAX_ETHPORTS;
1795 	return port_id;
1796 }
1797 
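/*
 * Typical iteration sketch over sibling ports (illustrative; handle_port()
 * is a placeholder), a pattern similar to what the MLX5_ETH_FOREACH_DEV()
 * macro used elsewhere in this file provides:
 *
 *	uint16_t port_id;
 *
 *	for (port_id = mlx5_eth_find_next(0, pci_dev);
 *	     port_id < RTE_MAX_ETHPORTS;
 *	     port_id = mlx5_eth_find_next(port_id + 1, pci_dev))
 *		handle_port(port_id);
 */
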
1798 /**
1799  * DPDK callback to remove a PCI device.
1800  *
1801  * This function removes all Ethernet devices belonging to a given PCI device.
1802  *
1803  * @param[in] pci_dev
1804  *   Pointer to the PCI device.
1805  *
1806  * @return
1807  *   0 on success, the function cannot fail.
1808  */
1809 static int
1810 mlx5_pci_remove(struct rte_pci_device *pci_dev)
1811 {
1812 	uint16_t port_id;
1813 
1814 	RTE_ETH_FOREACH_DEV_OF(port_id, &pci_dev->device) {
1815 		/*
1816 		 * mlx5_dev_close() is not registered to secondary process,
1817 		 * call the close function explicitly for secondary process.
1818 		 */
1819 		if (rte_eal_process_type() == RTE_PROC_SECONDARY)
1820 			mlx5_dev_close(&rte_eth_devices[port_id]);
1821 		else
1822 			rte_eth_dev_close(port_id);
1823 	}
1824 	return 0;
1825 }
1826 
1827 static const struct rte_pci_id mlx5_pci_id_map[] = {
1828 	{
1829 		RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
1830 			       PCI_DEVICE_ID_MELLANOX_CONNECTX4)
1831 	},
1832 	{
1833 		RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
1834 			       PCI_DEVICE_ID_MELLANOX_CONNECTX4VF)
1835 	},
1836 	{
1837 		RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
1838 			       PCI_DEVICE_ID_MELLANOX_CONNECTX4LX)
1839 	},
1840 	{
1841 		RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
1842 			       PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF)
1843 	},
1844 	{
1845 		RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
1846 			       PCI_DEVICE_ID_MELLANOX_CONNECTX5)
1847 	},
1848 	{
1849 		RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
1850 			       PCI_DEVICE_ID_MELLANOX_CONNECTX5VF)
1851 	},
1852 	{
1853 		RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
1854 			       PCI_DEVICE_ID_MELLANOX_CONNECTX5EX)
1855 	},
1856 	{
1857 		RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
1858 			       PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF)
1859 	},
1860 	{
1861 		RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
1862 			       PCI_DEVICE_ID_MELLANOX_CONNECTX5BF)
1863 	},
1864 	{
1865 		RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
1866 			       PCI_DEVICE_ID_MELLANOX_CONNECTX5BFVF)
1867 	},
1868 	{
1869 		RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
1870 				PCI_DEVICE_ID_MELLANOX_CONNECTX6)
1871 	},
1872 	{
1873 		RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
1874 				PCI_DEVICE_ID_MELLANOX_CONNECTX6VF)
1875 	},
1876 	{
1877 		RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
1878 				PCI_DEVICE_ID_MELLANOX_CONNECTX6DX)
1879 	},
1880 	{
1881 		RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
1882 				PCI_DEVICE_ID_MELLANOX_CONNECTX6DXVF)
1883 	},
1884 	{
1885 		RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
1886 				PCI_DEVICE_ID_MELLANOX_CONNECTX6DXBF)
1887 	},
1888 	{
1889 		RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
1890 				PCI_DEVICE_ID_MELLANOX_CONNECTX6LX)
1891 	},
1892 	{
1893 		.vendor_id = 0
1894 	}
1895 };
1896 
1897 struct rte_pci_driver mlx5_driver = {
1898 	.driver = {
1899 		.name = MLX5_DRIVER_NAME
1900 	},
1901 	.id_table = mlx5_pci_id_map,
1902 	.probe = mlx5_os_pci_probe,
1903 	.remove = mlx5_pci_remove,
1904 	.dma_map = mlx5_dma_map,
1905 	.dma_unmap = mlx5_dma_unmap,
1906 	.drv_flags = PCI_DRV_FLAGS,
1907 };
1908 
1909 /* Initialize driver log type. */
1910 RTE_LOG_REGISTER(mlx5_logtype, pmd.net.mlx5, NOTICE)
1911 
1912 /**
1913  * Driver initialization routine.
1914  */
1915 RTE_INIT(rte_mlx5_pmd_init)
1916 {
1917 	/* Build the static tables for Verbs conversion. */
1918 	mlx5_set_ptype_table();
1919 	mlx5_set_cksum_table();
1920 	mlx5_set_swp_types_table();
1921 	if (mlx5_glue)
1922 		rte_pci_register(&mlx5_driver);
1923 }
1924 
1925 RTE_PMD_EXPORT_NAME(net_mlx5, __COUNTER__);
1926 RTE_PMD_REGISTER_PCI_TABLE(net_mlx5, mlx5_pci_id_map);
1927 RTE_PMD_REGISTER_KMOD_DEP(net_mlx5, "* ib_uverbs & mlx5_core & mlx5_ib");
1928