1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright 2016 6WIND S.A.
3  * Copyright 2016 Mellanox Technologies, Ltd
4  */
5 
6 #include <sys/queue.h>
7 #include <stdalign.h>
8 #include <stdint.h>
9 #include <string.h>
10 
11 /* Verbs header. */
12 /* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
13 #ifdef PEDANTIC
14 #pragma GCC diagnostic ignored "-Wpedantic"
15 #endif
16 #include <infiniband/verbs.h>
17 #ifdef PEDANTIC
18 #pragma GCC diagnostic error "-Wpedantic"
19 #endif
20 
21 #include <rte_common.h>
22 #include <rte_ether.h>
23 #include <rte_eth_ctrl.h>
24 #include <rte_ethdev_driver.h>
25 #include <rte_flow.h>
26 #include <rte_flow_driver.h>
27 #include <rte_malloc.h>
28 #include <rte_ip.h>
29 
30 #include "mlx5.h"
31 #include "mlx5_defs.h"
32 #include "mlx5_prm.h"
33 #include "mlx5_glue.h"
34 
35 /* Dev ops structures defined in mlx5.c */
36 extern const struct eth_dev_ops mlx5_dev_ops;
37 extern const struct eth_dev_ops mlx5_dev_ops_isolate;
38 
39 /* Pattern outer Layer bits. */
40 #define MLX5_FLOW_LAYER_OUTER_L2 (1u << 0)
41 #define MLX5_FLOW_LAYER_OUTER_L3_IPV4 (1u << 1)
42 #define MLX5_FLOW_LAYER_OUTER_L3_IPV6 (1u << 2)
43 #define MLX5_FLOW_LAYER_OUTER_L4_UDP (1u << 3)
44 #define MLX5_FLOW_LAYER_OUTER_L4_TCP (1u << 4)
45 #define MLX5_FLOW_LAYER_OUTER_VLAN (1u << 5)
46 
47 /* Pattern inner Layer bits. */
48 #define MLX5_FLOW_LAYER_INNER_L2 (1u << 6)
49 #define MLX5_FLOW_LAYER_INNER_L3_IPV4 (1u << 7)
50 #define MLX5_FLOW_LAYER_INNER_L3_IPV6 (1u << 8)
51 #define MLX5_FLOW_LAYER_INNER_L4_UDP (1u << 9)
52 #define MLX5_FLOW_LAYER_INNER_L4_TCP (1u << 10)
53 #define MLX5_FLOW_LAYER_INNER_VLAN (1u << 11)
54 
55 /* Pattern tunnel Layer bits. */
56 #define MLX5_FLOW_LAYER_VXLAN (1u << 12)
57 #define MLX5_FLOW_LAYER_VXLAN_GPE (1u << 13)
58 #define MLX5_FLOW_LAYER_GRE (1u << 14)
59 #define MLX5_FLOW_LAYER_MPLS (1u << 15)
60 
61 /* Outer Masks. */
62 #define MLX5_FLOW_LAYER_OUTER_L3 \
63 	(MLX5_FLOW_LAYER_OUTER_L3_IPV4 | MLX5_FLOW_LAYER_OUTER_L3_IPV6)
64 #define MLX5_FLOW_LAYER_OUTER_L4 \
65 	(MLX5_FLOW_LAYER_OUTER_L4_UDP | MLX5_FLOW_LAYER_OUTER_L4_TCP)
66 #define MLX5_FLOW_LAYER_OUTER \
67 	(MLX5_FLOW_LAYER_OUTER_L2 | MLX5_FLOW_LAYER_OUTER_L3 | \
68 	 MLX5_FLOW_LAYER_OUTER_L4)
69 
70 /* Tunnel Masks. */
71 #define MLX5_FLOW_LAYER_TUNNEL \
72 	(MLX5_FLOW_LAYER_VXLAN | MLX5_FLOW_LAYER_VXLAN_GPE | \
73 	 MLX5_FLOW_LAYER_GRE | MLX5_FLOW_LAYER_MPLS)
74 
75 /* Inner Masks. */
76 #define MLX5_FLOW_LAYER_INNER_L3 \
77 	(MLX5_FLOW_LAYER_INNER_L3_IPV4 | MLX5_FLOW_LAYER_INNER_L3_IPV6)
78 #define MLX5_FLOW_LAYER_INNER_L4 \
79 	(MLX5_FLOW_LAYER_INNER_L4_UDP | MLX5_FLOW_LAYER_INNER_L4_TCP)
80 #define MLX5_FLOW_LAYER_INNER \
81 	(MLX5_FLOW_LAYER_INNER_L2 | MLX5_FLOW_LAYER_INNER_L3 | \
82 	 MLX5_FLOW_LAYER_INNER_L4)
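
/*
 * For illustration: a pattern such as eth / ipv4 / udp / vxlan / eth / ipv4
 * accumulates
 *
 *   MLX5_FLOW_LAYER_OUTER_L2 | MLX5_FLOW_LAYER_OUTER_L3_IPV4 |
 *   MLX5_FLOW_LAYER_OUTER_L4_UDP | MLX5_FLOW_LAYER_VXLAN |
 *   MLX5_FLOW_LAYER_INNER_L2 | MLX5_FLOW_LAYER_INNER_L3_IPV4
 *
 * in the flow layers bit-field; once a tunnel bit is set, the item
 * translators below treat subsequent items as inner headers.
 */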
83 
84 /* Actions that modify the fate of matching traffic. */
85 #define MLX5_FLOW_FATE_DROP (1u << 0)
86 #define MLX5_FLOW_FATE_QUEUE (1u << 1)
87 #define MLX5_FLOW_FATE_RSS (1u << 2)
88 
89 /* Modify a packet. */
90 #define MLX5_FLOW_MOD_FLAG (1u << 0)
91 #define MLX5_FLOW_MOD_MARK (1u << 1)
92 #define MLX5_FLOW_MOD_COUNT (1u << 2)
93 
94 /* Possible L3 layer protocols for filtering. */
95 #define MLX5_IP_PROTOCOL_TCP 6
96 #define MLX5_IP_PROTOCOL_UDP 17
97 #define MLX5_IP_PROTOCOL_GRE 47
98 #define MLX5_IP_PROTOCOL_MPLS 147
99 
100 /* Priority reserved for default flows. */
101 #define MLX5_FLOW_PRIO_RSVD ((uint32_t)-1)
102 
103 enum mlx5_expansion {
104 	MLX5_EXPANSION_ROOT,
105 	MLX5_EXPANSION_ROOT_OUTER,
106 	MLX5_EXPANSION_ROOT_ETH_VLAN,
107 	MLX5_EXPANSION_ROOT_OUTER_ETH_VLAN,
108 	MLX5_EXPANSION_OUTER_ETH,
109 	MLX5_EXPANSION_OUTER_ETH_VLAN,
110 	MLX5_EXPANSION_OUTER_VLAN,
111 	MLX5_EXPANSION_OUTER_IPV4,
112 	MLX5_EXPANSION_OUTER_IPV4_UDP,
113 	MLX5_EXPANSION_OUTER_IPV4_TCP,
114 	MLX5_EXPANSION_OUTER_IPV6,
115 	MLX5_EXPANSION_OUTER_IPV6_UDP,
116 	MLX5_EXPANSION_OUTER_IPV6_TCP,
117 	MLX5_EXPANSION_VXLAN,
118 	MLX5_EXPANSION_VXLAN_GPE,
119 	MLX5_EXPANSION_GRE,
120 	MLX5_EXPANSION_MPLS,
121 	MLX5_EXPANSION_ETH,
122 	MLX5_EXPANSION_ETH_VLAN,
123 	MLX5_EXPANSION_VLAN,
124 	MLX5_EXPANSION_IPV4,
125 	MLX5_EXPANSION_IPV4_UDP,
126 	MLX5_EXPANSION_IPV4_TCP,
127 	MLX5_EXPANSION_IPV6,
128 	MLX5_EXPANSION_IPV6_UDP,
129 	MLX5_EXPANSION_IPV6_TCP,
130 };
131 
132 /** Supported expansion of items. */
133 static const struct rte_flow_expand_node mlx5_support_expansion[] = {
134 	[MLX5_EXPANSION_ROOT] = {
135 		.next = RTE_FLOW_EXPAND_RSS_NEXT(MLX5_EXPANSION_ETH,
136 						 MLX5_EXPANSION_IPV4,
137 						 MLX5_EXPANSION_IPV6),
138 		.type = RTE_FLOW_ITEM_TYPE_END,
139 	},
140 	[MLX5_EXPANSION_ROOT_OUTER] = {
141 		.next = RTE_FLOW_EXPAND_RSS_NEXT(MLX5_EXPANSION_OUTER_ETH,
142 						 MLX5_EXPANSION_OUTER_IPV4,
143 						 MLX5_EXPANSION_OUTER_IPV6),
144 		.type = RTE_FLOW_ITEM_TYPE_END,
145 	},
146 	[MLX5_EXPANSION_ROOT_ETH_VLAN] = {
147 		.next = RTE_FLOW_EXPAND_RSS_NEXT(MLX5_EXPANSION_ETH_VLAN),
148 		.type = RTE_FLOW_ITEM_TYPE_END,
149 	},
150 	[MLX5_EXPANSION_ROOT_OUTER_ETH_VLAN] = {
151 		.next = RTE_FLOW_EXPAND_RSS_NEXT(MLX5_EXPANSION_OUTER_ETH_VLAN),
152 		.type = RTE_FLOW_ITEM_TYPE_END,
153 	},
154 	[MLX5_EXPANSION_OUTER_ETH] = {
155 		.next = RTE_FLOW_EXPAND_RSS_NEXT(MLX5_EXPANSION_OUTER_IPV4,
156 						 MLX5_EXPANSION_OUTER_IPV6,
157 						 MLX5_EXPANSION_MPLS),
158 		.type = RTE_FLOW_ITEM_TYPE_ETH,
159 		.rss_types = 0,
160 	},
161 	[MLX5_EXPANSION_OUTER_ETH_VLAN] = {
162 		.next = RTE_FLOW_EXPAND_RSS_NEXT(MLX5_EXPANSION_OUTER_VLAN),
163 		.type = RTE_FLOW_ITEM_TYPE_ETH,
164 		.rss_types = 0,
165 	},
166 	[MLX5_EXPANSION_OUTER_VLAN] = {
167 		.next = RTE_FLOW_EXPAND_RSS_NEXT(MLX5_EXPANSION_OUTER_IPV4,
168 						 MLX5_EXPANSION_OUTER_IPV6),
169 		.type = RTE_FLOW_ITEM_TYPE_VLAN,
170 	},
171 	[MLX5_EXPANSION_OUTER_IPV4] = {
172 		.next = RTE_FLOW_EXPAND_RSS_NEXT
173 			(MLX5_EXPANSION_OUTER_IPV4_UDP,
174 			 MLX5_EXPANSION_OUTER_IPV4_TCP,
175 			 MLX5_EXPANSION_GRE),
176 		.type = RTE_FLOW_ITEM_TYPE_IPV4,
177 		.rss_types = ETH_RSS_IPV4 | ETH_RSS_FRAG_IPV4 |
178 			ETH_RSS_NONFRAG_IPV4_OTHER,
179 	},
180 	[MLX5_EXPANSION_OUTER_IPV4_UDP] = {
181 		.next = RTE_FLOW_EXPAND_RSS_NEXT(MLX5_EXPANSION_VXLAN,
182 						 MLX5_EXPANSION_VXLAN_GPE),
183 		.type = RTE_FLOW_ITEM_TYPE_UDP,
184 		.rss_types = ETH_RSS_NONFRAG_IPV4_UDP,
185 	},
186 	[MLX5_EXPANSION_OUTER_IPV4_TCP] = {
187 		.type = RTE_FLOW_ITEM_TYPE_TCP,
188 		.rss_types = ETH_RSS_NONFRAG_IPV4_TCP,
189 	},
190 	[MLX5_EXPANSION_OUTER_IPV6] = {
191 		.next = RTE_FLOW_EXPAND_RSS_NEXT
192 			(MLX5_EXPANSION_OUTER_IPV6_UDP,
193 			 MLX5_EXPANSION_OUTER_IPV6_TCP),
194 		.type = RTE_FLOW_ITEM_TYPE_IPV6,
195 		.rss_types = ETH_RSS_IPV6 | ETH_RSS_FRAG_IPV6 |
196 			ETH_RSS_NONFRAG_IPV6_OTHER,
197 	},
198 	[MLX5_EXPANSION_OUTER_IPV6_UDP] = {
199 		.next = RTE_FLOW_EXPAND_RSS_NEXT(MLX5_EXPANSION_VXLAN,
200 						 MLX5_EXPANSION_VXLAN_GPE),
201 		.type = RTE_FLOW_ITEM_TYPE_UDP,
202 		.rss_types = ETH_RSS_NONFRAG_IPV6_UDP,
203 	},
204 	[MLX5_EXPANSION_OUTER_IPV6_TCP] = {
205 		.type = RTE_FLOW_ITEM_TYPE_TCP,
206 		.rss_types = ETH_RSS_NONFRAG_IPV6_TCP,
207 	},
208 	[MLX5_EXPANSION_VXLAN] = {
209 		.next = RTE_FLOW_EXPAND_RSS_NEXT(MLX5_EXPANSION_ETH),
210 		.type = RTE_FLOW_ITEM_TYPE_VXLAN,
211 	},
212 	[MLX5_EXPANSION_VXLAN_GPE] = {
213 		.next = RTE_FLOW_EXPAND_RSS_NEXT(MLX5_EXPANSION_ETH,
214 						 MLX5_EXPANSION_IPV4,
215 						 MLX5_EXPANSION_IPV6),
216 		.type = RTE_FLOW_ITEM_TYPE_VXLAN_GPE,
217 	},
218 	[MLX5_EXPANSION_GRE] = {
219 		.next = RTE_FLOW_EXPAND_RSS_NEXT(MLX5_EXPANSION_IPV4),
220 		.type = RTE_FLOW_ITEM_TYPE_GRE,
221 	},
222 	[MLX5_EXPANSION_MPLS] = {
223 		.next = RTE_FLOW_EXPAND_RSS_NEXT(MLX5_EXPANSION_IPV4,
224 						 MLX5_EXPANSION_IPV6),
225 		.type = RTE_FLOW_ITEM_TYPE_MPLS,
226 	},
227 	[MLX5_EXPANSION_ETH] = {
228 		.next = RTE_FLOW_EXPAND_RSS_NEXT(MLX5_EXPANSION_IPV4,
229 						 MLX5_EXPANSION_IPV6),
230 		.type = RTE_FLOW_ITEM_TYPE_ETH,
231 	},
232 	[MLX5_EXPANSION_ETH_VLAN] = {
233 		.next = RTE_FLOW_EXPAND_RSS_NEXT(MLX5_EXPANSION_VLAN),
234 		.type = RTE_FLOW_ITEM_TYPE_ETH,
235 	},
236 	[MLX5_EXPANSION_VLAN] = {
237 		.next = RTE_FLOW_EXPAND_RSS_NEXT(MLX5_EXPANSION_IPV4,
238 						 MLX5_EXPANSION_IPV6),
239 		.type = RTE_FLOW_ITEM_TYPE_VLAN,
240 	},
241 	[MLX5_EXPANSION_IPV4] = {
242 		.next = RTE_FLOW_EXPAND_RSS_NEXT(MLX5_EXPANSION_IPV4_UDP,
243 						 MLX5_EXPANSION_IPV4_TCP),
244 		.type = RTE_FLOW_ITEM_TYPE_IPV4,
245 		.rss_types = ETH_RSS_IPV4 | ETH_RSS_FRAG_IPV4 |
246 			ETH_RSS_NONFRAG_IPV4_OTHER,
247 	},
248 	[MLX5_EXPANSION_IPV4_UDP] = {
249 		.type = RTE_FLOW_ITEM_TYPE_UDP,
250 		.rss_types = ETH_RSS_NONFRAG_IPV4_UDP,
251 	},
252 	[MLX5_EXPANSION_IPV4_TCP] = {
253 		.type = RTE_FLOW_ITEM_TYPE_TCP,
254 		.rss_types = ETH_RSS_NONFRAG_IPV4_TCP,
255 	},
256 	[MLX5_EXPANSION_IPV6] = {
257 		.next = RTE_FLOW_EXPAND_RSS_NEXT(MLX5_EXPANSION_IPV6_UDP,
258 						 MLX5_EXPANSION_IPV6_TCP),
259 		.type = RTE_FLOW_ITEM_TYPE_IPV6,
260 		.rss_types = ETH_RSS_IPV6 | ETH_RSS_FRAG_IPV6 |
261 			ETH_RSS_NONFRAG_IPV6_OTHER,
262 	},
263 	[MLX5_EXPANSION_IPV6_UDP] = {
264 		.type = RTE_FLOW_ITEM_TYPE_UDP,
265 		.rss_types = ETH_RSS_NONFRAG_IPV6_UDP,
266 	},
267 	[MLX5_EXPANSION_IPV6_TCP] = {
268 		.type = RTE_FLOW_ITEM_TYPE_TCP,
269 		.rss_types = ETH_RSS_NONFRAG_IPV6_TCP,
270 	},
271 };
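
/*
 * A sketch of how this table is meant to be read (the expansion itself is
 * performed by the generic rte_flow_expand_rss() helper, outside this
 * excerpt): an RSS rule with the pattern "eth / ipv4 / end" and RSS types
 * covering UDP and TCP is expanded along MLX5_EXPANSION_IPV4 into
 *
 *   eth / ipv4
 *   eth / ipv4 / udp
 *   eth / ipv4 / tcp
 *
 * so that a dedicated Verbs flow, with the matching hash fields, can be
 * created for each variant.
 */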
272 
273 /** Verbs flow specification and the resources attached to it. */
274 struct mlx5_flow_verbs {
275 	LIST_ENTRY(mlx5_flow_verbs) next;
276 	unsigned int size; /**< Size of the attribute. */
277 	struct {
278 		struct ibv_flow_attr *attr;
279 		/**< Pointer to the Specification buffer. */
280 		uint8_t *specs; /**< Pointer to the specifications. */
281 	};
282 	struct ibv_flow *flow; /**< Verbs flow pointer. */
283 	struct mlx5_hrxq *hrxq; /**< Hash Rx queue object. */
284 	uint64_t hash_fields; /**< Verbs hash Rx queue hash fields. */
285 };
286 
287 /* Counters information. */
288 struct mlx5_flow_counter {
289 	LIST_ENTRY(mlx5_flow_counter) next; /**< Pointer to the next counter. */
290 	uint32_t shared:1; /**< Share counter ID with other flow rules. */
291 	uint32_t ref_cnt:31; /**< Reference counter. */
292 	uint32_t id; /**< Counter ID. */
293 	struct ibv_counter_set *cs; /**< Holds the counters for the rule. */
294 	uint64_t hits; /**< Number of packets matched by the rule. */
295 	uint64_t bytes; /**< Number of bytes matched by the rule. */
296 };
297 
298 /* Flow structure. */
299 struct rte_flow {
300 	TAILQ_ENTRY(rte_flow) next; /**< Pointer to the next flow structure. */
301 	struct rte_flow_attr attributes; /**< User flow attribute. */
302 	uint32_t l3_protocol_en:1; /**< Protocol filtering requested. */
303 	uint32_t layers;
304 	/**< Bit-fields of present layers see MLX5_FLOW_LAYER_*. */
305 	uint32_t modifier;
306 	/**< Bit-fields of present modifier see MLX5_FLOW_MOD_*. */
307 	uint32_t fate;
308 	/**< Bit-fields of present fate see MLX5_FLOW_FATE_*. */
309 	uint8_t l3_protocol; /**< valid when l3_protocol_en is set. */
310 	LIST_HEAD(verbs, mlx5_flow_verbs) verbs; /**< Verbs flows list. */
311 	struct mlx5_flow_verbs *cur_verbs;
312 	/**< Current Verbs flow structure being filled. */
313 	struct mlx5_flow_counter *counter; /**< Holds Verbs flow counter. */
314 	struct rte_flow_action_rss rss;/**< RSS context. */
315 	uint8_t key[MLX5_RSS_HASH_KEY_LEN]; /**< RSS hash key. */
316 	uint16_t (*queue)[]; /**< Destination queues to redirect traffic to. */
317 	void *nl_flow; /**< Netlink flow buffer if relevant. */
318 };
319 
320 static const struct rte_flow_ops mlx5_flow_ops = {
321 	.validate = mlx5_flow_validate,
322 	.create = mlx5_flow_create,
323 	.destroy = mlx5_flow_destroy,
324 	.flush = mlx5_flow_flush,
325 	.isolate = mlx5_flow_isolate,
326 	.query = mlx5_flow_query,
327 };
328 
329 /* Convert FDIR request to Generic flow. */
330 struct mlx5_fdir {
331 	struct rte_flow_attr attr;
332 	struct rte_flow_action actions[2];
333 	struct rte_flow_item items[4];
334 	struct rte_flow_item_eth l2;
335 	struct rte_flow_item_eth l2_mask;
336 	union {
337 		struct rte_flow_item_ipv4 ipv4;
338 		struct rte_flow_item_ipv6 ipv6;
339 	} l3;
340 	union {
341 		struct rte_flow_item_ipv4 ipv4;
342 		struct rte_flow_item_ipv6 ipv6;
343 	} l3_mask;
344 	union {
345 		struct rte_flow_item_udp udp;
346 		struct rte_flow_item_tcp tcp;
347 	} l4;
348 	union {
349 		struct rte_flow_item_udp udp;
350 		struct rte_flow_item_tcp tcp;
351 	} l4_mask;
352 	struct rte_flow_action_queue queue;
353 };
354 
355 /* Verbs specification header. */
356 struct ibv_spec_header {
357 	enum ibv_flow_spec_type type;
358 	uint16_t size;
359 };
360 
361 /*
362  * Number of sub-priorities.
363  * For each kind of pattern matching (i.e. L2, L3, L4) to match correctly
364  * on the NIC (firmware dependent), L4 must have the highest priority,
365  * followed by L3 and finally L2.
366  */
367 #define MLX5_PRIORITY_MAP_L2 2
368 #define MLX5_PRIORITY_MAP_L3 1
369 #define MLX5_PRIORITY_MAP_L4 0
370 #define MLX5_PRIORITY_MAP_MAX 3
371 
372 /* Map of Verbs to Flow priority with 8 Verbs priorities. */
373 static const uint32_t priority_map_3[][MLX5_PRIORITY_MAP_MAX] = {
374 	{ 0, 1, 2 }, { 2, 3, 4 }, { 5, 6, 7 },
375 };
376 
377 /* Map of Verbs to Flow priority with 16 Verbs priorities. */
378 static const uint32_t priority_map_5[][MLX5_PRIORITY_MAP_MAX] = {
379 	{ 0, 1, 2 }, { 3, 4, 5 }, { 6, 7, 8 },
380 	{ 9, 10, 11 }, { 12, 13, 14 },
381 };
382 
383 /* Tunnel information. */
384 struct mlx5_flow_tunnel_info {
385 	uint32_t tunnel; /**< Tunnel bit (see MLX5_FLOW_*). */
386 	uint32_t ptype; /**< Tunnel Ptype (see RTE_PTYPE_*). */
387 };
388 
389 static struct mlx5_flow_tunnel_info tunnels_info[] = {
390 	{
391 		.tunnel = MLX5_FLOW_LAYER_VXLAN,
392 		.ptype = RTE_PTYPE_TUNNEL_VXLAN | RTE_PTYPE_L4_UDP,
393 	},
394 	{
395 		.tunnel = MLX5_FLOW_LAYER_VXLAN_GPE,
396 		.ptype = RTE_PTYPE_TUNNEL_VXLAN_GPE | RTE_PTYPE_L4_UDP,
397 	},
398 	{
399 		.tunnel = MLX5_FLOW_LAYER_GRE,
400 		.ptype = RTE_PTYPE_TUNNEL_GRE,
401 	},
402 	{
403 		.tunnel = MLX5_FLOW_LAYER_MPLS | MLX5_FLOW_LAYER_OUTER_L4_UDP,
404 		.ptype = RTE_PTYPE_TUNNEL_MPLS_IN_GRE | RTE_PTYPE_L4_UDP,
405 	},
406 	{
407 		.tunnel = MLX5_FLOW_LAYER_MPLS,
408 		.ptype = RTE_PTYPE_TUNNEL_MPLS_IN_GRE,
409 	},
410 };
411 
412 /**
413  * Discover the maximum number of priorities available.
414  *
415  * @param[in] dev
416  *   Pointer to Ethernet device.
417  *
418  * @return
419  *   number of supported flow priorities on success, a negative errno
420  *   value otherwise and rte_errno is set.
421  */
422 int
423 mlx5_flow_discover_priorities(struct rte_eth_dev *dev)
424 {
425 	struct {
426 		struct ibv_flow_attr attr;
427 		struct ibv_flow_spec_eth eth;
428 		struct ibv_flow_spec_action_drop drop;
429 	} flow_attr = {
430 		.attr = {
431 			.num_of_specs = 2,
432 		},
433 		.eth = {
434 			.type = IBV_FLOW_SPEC_ETH,
435 			.size = sizeof(struct ibv_flow_spec_eth),
436 		},
437 		.drop = {
438 			.size = sizeof(struct ibv_flow_spec_action_drop),
439 			.type = IBV_FLOW_SPEC_ACTION_DROP,
440 		},
441 	};
442 	struct ibv_flow *flow;
443 	struct mlx5_hrxq *drop = mlx5_hrxq_drop_new(dev);
444 	uint16_t vprio[] = { 8, 16 };
445 	int i;
446 	int priority = 0;
447 
448 	if (!drop) {
449 		rte_errno = ENOTSUP;
450 		return -rte_errno;
451 	}
452 	for (i = 0; i != RTE_DIM(vprio); i++) {
453 		flow_attr.attr.priority = vprio[i] - 1;
454 		flow = mlx5_glue->create_flow(drop->qp, &flow_attr.attr);
455 		if (!flow)
456 			break;
457 		claim_zero(mlx5_glue->destroy_flow(flow));
458 		priority = vprio[i];
459 	}
460 	switch (priority) {
461 	case 8:
462 		priority = RTE_DIM(priority_map_3);
463 		break;
464 	case 16:
465 		priority = RTE_DIM(priority_map_5);
466 		break;
467 	default:
468 		rte_errno = ENOTSUP;
469 		DRV_LOG(ERR,
470 			"port %u verbs maximum priority: %d expected 8/16",
471 			dev->data->port_id, vprio[i]);
472 		return -rte_errno;
473 	}
474 	mlx5_hrxq_drop_release(dev);
475 	DRV_LOG(INFO, "port %u flow maximum priority: %d",
476 		dev->data->port_id, priority);
477 	return priority;
478 }
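
/*
 * A worked example of the probe above: when 16 Verbs priorities are
 * available, the drop rules created at attr.priority 7 and 15 (vprio[i] - 1)
 * both succeed and the function reports RTE_DIM(priority_map_5) == 5 usable
 * flow priorities; when only 8 are available the second creation fails and
 * RTE_DIM(priority_map_3) == 3 is reported instead.
 */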
479 
480 /**
481  * Adjust flow priority: combine the rule priority and the layer sub-priority.
482  *
483  * @param dev
484  *   Pointer to Ethernet device.
485  * @param flow
486  *   Pointer to an rte flow.
487  */
488 static void
489 mlx5_flow_adjust_priority(struct rte_eth_dev *dev, struct rte_flow *flow)
490 {
491 	struct priv *priv = dev->data->dev_private;
492 	uint32_t priority = flow->attributes.priority;
493 	uint32_t subpriority = flow->cur_verbs->attr->priority;
494 
495 	switch (priv->config.flow_prio) {
496 	case RTE_DIM(priority_map_3):
497 		priority = priority_map_3[priority][subpriority];
498 		break;
499 	case RTE_DIM(priority_map_5):
500 		priority = priority_map_5[priority][subpriority];
501 		break;
502 	}
503 	flow->cur_verbs->attr->priority = priority;
504 }
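
/*
 * For example, with flow_prio == RTE_DIM(priority_map_3) (i.e. a device
 * exposing 8 Verbs priorities), a rule created at flow priority 1 whose last
 * matched layer set the sub-priority to MLX5_PRIORITY_MAP_L3 is installed at
 * Verbs priority priority_map_3[1][MLX5_PRIORITY_MAP_L3] == 3.
 */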
505 
506 /**
507  * Get a flow counter.
508  *
509  * @param[in] dev
510  *   Pointer to Ethernet device.
511  * @param[in] shared
512  *   Indicate if this counter is shared with other flows.
513  * @param[in] id
514  *   Counter identifier.
515  *
516  * @return
517  *   A pointer to the counter, NULL otherwise and rte_errno is set.
518  */
519 static struct mlx5_flow_counter *
520 mlx5_flow_counter_new(struct rte_eth_dev *dev, uint32_t shared, uint32_t id)
521 {
522 	struct priv *priv = dev->data->dev_private;
523 	struct mlx5_flow_counter *cnt;
524 
525 	LIST_FOREACH(cnt, &priv->flow_counters, next) {
526 		if (!cnt->shared || cnt->shared != shared)
527 			continue;
528 		if (cnt->id != id)
529 			continue;
530 		cnt->ref_cnt++;
531 		return cnt;
532 	}
533 #ifdef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT
534 
535 	struct mlx5_flow_counter tmpl = {
536 		.shared = shared,
537 		.id = id,
538 		.cs = mlx5_glue->create_counter_set
539 			(priv->ctx,
540 			 &(struct ibv_counter_set_init_attr){
541 				 .counter_set_id = id,
542 			 }),
543 		.hits = 0,
544 		.bytes = 0,
545 	};
546 
547 	if (!tmpl.cs) {
548 		rte_errno = errno;
549 		return NULL;
550 	}
551 	cnt = rte_calloc(__func__, 1, sizeof(*cnt), 0);
552 	if (!cnt) {
553 		rte_errno = ENOMEM;
554 		return NULL;
555 	}
556 	*cnt = tmpl;
557 	LIST_INSERT_HEAD(&priv->flow_counters, cnt, next);
558 	return cnt;
559 #endif
560 	rte_errno = ENOTSUP;
561 	return NULL;
562 }
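
/*
 * Usage sketch, assuming counter sets are supported by the device: two rules
 * sharing a counter simply request the same identifier, e.g.
 *
 *   struct mlx5_flow_counter *cnt = mlx5_flow_counter_new(dev, 1, 42);
 *
 * A second call with shared == 1 and id == 42 returns the same object with
 * its reference count bumped, and mlx5_flow_counter_release() destroys the
 * Verbs counter set only once the last reference is gone.
 */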
563 
564 /**
565  * Release a flow counter.
566  *
567  * @param[in] counter
568  *   Pointer to the counter handler.
569  */
570 static void
571 mlx5_flow_counter_release(struct mlx5_flow_counter *counter)
572 {
573 	if (--counter->ref_cnt == 0) {
574 		claim_zero(mlx5_glue->destroy_counter_set(counter->cs));
575 		LIST_REMOVE(counter, next);
576 		rte_free(counter);
577 	}
578 }
579 
580 /**
581  * Verify the @p attributes will be correctly understood by the NIC and store
582  * them in the @p flow if everything is correct.
583  *
584  * @param[in] dev
585  *   Pointer to Ethernet device.
586  * @param[in] attributes
587  *   Pointer to flow attributes
588  * @param[in, out] flow
589  *   Pointer to the rte_flow structure.
590  * @param[out] error
591  *   Pointer to error structure.
592  *
593  * @return
594  *   0 on success, a negative errno value otherwise and rte_errno is set.
595  */
596 static int
597 mlx5_flow_attributes(struct rte_eth_dev *dev,
598 		     const struct rte_flow_attr *attributes,
599 		     struct rte_flow *flow,
600 		     struct rte_flow_error *error)
601 {
602 	uint32_t priority_max =
603 		((struct priv *)dev->data->dev_private)->config.flow_prio - 1;
604 
605 	if (attributes->group)
606 		return rte_flow_error_set(error, ENOTSUP,
607 					  RTE_FLOW_ERROR_TYPE_ATTR_GROUP,
608 					  NULL,
609 					  "groups is not supported");
610 	if (attributes->priority != MLX5_FLOW_PRIO_RSVD &&
611 	    attributes->priority >= priority_max)
612 		return rte_flow_error_set(error, ENOTSUP,
613 					  RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
614 					  NULL,
615 					  "priority out of range");
616 	if (attributes->egress)
617 		return rte_flow_error_set(error, ENOTSUP,
618 					  RTE_FLOW_ERROR_TYPE_ATTR_EGRESS,
619 					  NULL,
620 					  "egress is not supported");
621 	if (attributes->transfer)
622 		return rte_flow_error_set(error, ENOTSUP,
623 					  RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
624 					  NULL,
625 					  "transfer is not supported");
626 	if (!attributes->ingress)
627 		return rte_flow_error_set(error, ENOTSUP,
628 					  RTE_FLOW_ERROR_TYPE_ATTR_INGRESS,
629 					  NULL,
630 					  "ingress attribute is mandatory");
631 	flow->attributes = *attributes;
632 	if (attributes->priority == MLX5_FLOW_PRIO_RSVD)
633 		flow->attributes.priority = priority_max;
634 	return 0;
635 }
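
/*
 * For instance, the attributes below pass the checks above, the reserved
 * priority being replaced by the device maximum:
 *
 *   const struct rte_flow_attr attr = {
 *           .ingress = 1,
 *           .priority = MLX5_FLOW_PRIO_RSVD,
 *   };
 *
 * whereas any attribute requesting a non-zero .group, .egress or .transfer
 * is rejected with ENOTSUP.
 */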
636 
637 /**
638  * Verify the @p item specifications (spec, last, mask) are compatible with the
639  * NIC capabilities.
640  *
641  * @param[in] item
642  *   Item specification.
643  * @param[in] mask
644  *   @p item->mask or flow default bit-masks.
645  * @param[in] nic_mask
646  *   Bit-mask of the fields supported by the NIC, to compare with the user mask.
647  * @param[in] size
648  *   Bit-mask size in bytes.
649  * @param[out] error
650  *   Pointer to error structure.
651  *
652  * @return
653  *   0 on success, a negative errno value otherwise and rte_errno is set.
654  */
655 static int
656 mlx5_flow_item_acceptable(const struct rte_flow_item *item,
657 			  const uint8_t *mask,
658 			  const uint8_t *nic_mask,
659 			  unsigned int size,
660 			  struct rte_flow_error *error)
661 {
662 	unsigned int i;
663 
664 	assert(nic_mask);
665 	for (i = 0; i < size; ++i)
666 		if ((nic_mask[i] | mask[i]) != nic_mask[i])
667 			return rte_flow_error_set(error, ENOTSUP,
668 						  RTE_FLOW_ERROR_TYPE_ITEM,
669 						  item,
670 						  "mask enables non supported"
671 						  " bits");
672 	if (!item->spec && (item->mask || item->last))
673 		return rte_flow_error_set(error, EINVAL,
674 					  RTE_FLOW_ERROR_TYPE_ITEM,
675 					  item,
676 					  "mask/last without a spec is not"
677 					  " supported");
678 	if (item->spec && item->last) {
679 		uint8_t spec[size];
680 		uint8_t last[size];
681 		unsigned int i;
682 		int ret;
683 
684 		for (i = 0; i < size; ++i) {
685 			spec[i] = ((const uint8_t *)item->spec)[i] & mask[i];
686 			last[i] = ((const uint8_t *)item->last)[i] & mask[i];
687 		}
688 		ret = memcmp(spec, last, size);
689 		if (ret != 0)
690 			return rte_flow_error_set(error, ENOTSUP,
691 						  RTE_FLOW_ERROR_TYPE_ITEM,
692 						  item,
693 						  "range is not supported");
694 	}
695 	return 0;
696 }
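
/*
 * Example: with the IPv4 nic_mask used further below (source/destination
 * address, ToS and next protocol only), an item mask that also sets
 * hdr.time_to_live = 0xff enables bits the NIC cannot match on and is
 * rejected here; similarly, a spec/last pair describing a real range (e.g.
 * src_addr 10.0.0.1 to 10.0.0.9 under a full mask) is refused as
 * unsupported.
 */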
697 
698 /**
699  * Add a verbs item specification into @p flow.
700  *
701  * @param[in, out] flow
702  *   Pointer to flow structure.
703  * @param[in] src
704  *   Specification to be copied into @p flow.
705  * @param[in] size
706  *   Size in bytes of the specification to copy.
707  */
708 static void
709 mlx5_flow_spec_verbs_add(struct rte_flow *flow, void *src, unsigned int size)
710 {
711 	struct mlx5_flow_verbs *verbs = flow->cur_verbs;
712 
713 	if (verbs->specs) {
714 		void *dst;
715 
716 		dst = (void *)(verbs->specs + verbs->size);
717 		memcpy(dst, src, size);
718 		++verbs->attr->num_of_specs;
719 	}
720 	verbs->size += size;
721 }
722 
723 /**
724  * Adjust verbs hash fields according to the @p flow information.
725  *
726  * @param[in, out] flow
727  *   Pointer to flow structure.
728  * @param[in] tunnel
729  *   1 when the hash field is for a tunnel item.
730  * @param[in] layer_types
731  *   ETH_RSS_* types.
732  * @param[in] hash_fields
733  *   Item hash fields.
734  */
735 static void
736 mlx5_flow_verbs_hashfields_adjust(struct rte_flow *flow,
737 				  int tunnel __rte_unused,
738 				  uint32_t layer_types, uint64_t hash_fields)
739 {
740 #ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT
741 	hash_fields |= (tunnel ? IBV_RX_HASH_INNER : 0);
742 	if (flow->rss.level == 2 && !tunnel)
743 		hash_fields = 0;
744 	else if (flow->rss.level < 2 && tunnel)
745 		hash_fields = 0;
746 #endif
747 	if (!(flow->rss.types & layer_types))
748 		hash_fields = 0;
749 	flow->cur_verbs->hash_fields |= hash_fields;
750 }
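
/*
 * For example, an outer IPv4 item on a flow whose rss.types include
 * ETH_RSS_IPV4 contributes IBV_RX_HASH_SRC_IPV4 | IBV_RX_HASH_DST_IPV4 to
 * the Verbs hash fields; the contribution is zeroed when rss.types selects
 * no matching layer or, with tunnel offload support, when the RSS level and
 * the item do not agree (level 2 with an outer item, level below 2 with an
 * inner item).
 */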
751 
752 /**
753  * Convert the @p item into a Verbs specification after ensuring the NIC
754  * will understand and process it correctly.
755  * If the necessary size for the conversion is greater than the @p flow_size,
756  * nothing is written in @p flow; the validation is still performed.
757  *
758  * @param[in] item
759  *   Item specification.
760  * @param[in, out] flow
761  *   Pointer to flow structure.
762  * @param[in] flow_size
763  *   Size in bytes of the available space in @p flow, if too small, nothing is
764  *   written.
765  * @param[out] error
766  *   Pointer to error structure.
767  *
768  * @return
769  *   On success, the number of bytes consumed/necessary; if the returned value
770  *   is less than or equal to @p flow_size, the @p item has been fully converted;
771  *   otherwise another call with this returned memory size should be done.
772  *   On error, a negative errno value is returned and rte_errno is set.
773  */
774 static int
775 mlx5_flow_item_eth(const struct rte_flow_item *item, struct rte_flow *flow,
776 		   const size_t flow_size, struct rte_flow_error *error)
777 {
778 	const struct rte_flow_item_eth *spec = item->spec;
779 	const struct rte_flow_item_eth *mask = item->mask;
780 	const struct rte_flow_item_eth nic_mask = {
781 		.dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
782 		.src.addr_bytes = "\xff\xff\xff\xff\xff\xff",
783 		.type = RTE_BE16(0xffff),
784 	};
785 	const int tunnel = !!(flow->layers & MLX5_FLOW_LAYER_TUNNEL);
786 	const unsigned int size = sizeof(struct ibv_flow_spec_eth);
787 	struct ibv_flow_spec_eth eth = {
788 		.type = IBV_FLOW_SPEC_ETH | (tunnel ? IBV_FLOW_SPEC_INNER : 0),
789 		.size = size,
790 	};
791 	int ret;
792 
793 	if (flow->layers & (tunnel ? MLX5_FLOW_LAYER_INNER_L2 :
794 			    MLX5_FLOW_LAYER_OUTER_L2))
795 		return rte_flow_error_set(error, ENOTSUP,
796 					  RTE_FLOW_ERROR_TYPE_ITEM,
797 					  item,
798 					  "L2 layers already configured");
799 	if (!mask)
800 		mask = &rte_flow_item_eth_mask;
801 	ret = mlx5_flow_item_acceptable(item, (const uint8_t *)mask,
802 					(const uint8_t *)&nic_mask,
803 					sizeof(struct rte_flow_item_eth),
804 					error);
805 	if (ret)
806 		return ret;
807 	flow->layers |= tunnel ? MLX5_FLOW_LAYER_INNER_L2 :
808 		MLX5_FLOW_LAYER_OUTER_L2;
809 	if (size > flow_size)
810 		return size;
811 	if (spec) {
812 		unsigned int i;
813 
814 		memcpy(&eth.val.dst_mac, spec->dst.addr_bytes, ETHER_ADDR_LEN);
815 		memcpy(&eth.val.src_mac, spec->src.addr_bytes, ETHER_ADDR_LEN);
816 		eth.val.ether_type = spec->type;
817 		memcpy(&eth.mask.dst_mac, mask->dst.addr_bytes, ETHER_ADDR_LEN);
818 		memcpy(&eth.mask.src_mac, mask->src.addr_bytes, ETHER_ADDR_LEN);
819 		eth.mask.ether_type = mask->type;
820 		/* Remove unwanted bits from values. */
821 		for (i = 0; i < ETHER_ADDR_LEN; ++i) {
822 			eth.val.dst_mac[i] &= eth.mask.dst_mac[i];
823 			eth.val.src_mac[i] &= eth.mask.src_mac[i];
824 		}
825 		eth.val.ether_type &= eth.mask.ether_type;
826 	}
827 	flow->cur_verbs->attr->priority = MLX5_PRIORITY_MAP_L2;
828 	mlx5_flow_spec_verbs_add(flow, &eth, size);
829 	return size;
830 }
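
/*
 * All the item translators here follow the same size-query convention; a
 * minimal sketch of a caller (hypothetical, the real conversion routine
 * appears later in this file):
 *
 *   int needed = mlx5_flow_item_eth(item, flow, 0, error);
 *
 *   if (needed < 0)
 *           return needed;
 *
 * A first pass with no buffer space only validates the item and returns the
 * number of bytes required; once the Verbs attribute buffer is large enough,
 * the translator is called again and actually writes the specification
 * through mlx5_flow_spec_verbs_add().
 */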
831 
832 /**
833  * Update the VLAN tag in the Verbs Ethernet specification.
834  *
835  * @param[in, out] attr
836  *   Pointer to Verbs attributes structure.
837  * @param[in] eth
838  *   Verbs structure containing the VLAN information to copy.
839  */
840 static void
841 mlx5_flow_item_vlan_update(struct ibv_flow_attr *attr,
842 			   struct ibv_flow_spec_eth *eth)
843 {
844 	unsigned int i;
845 	const enum ibv_flow_spec_type search = eth->type;
846 	struct ibv_spec_header *hdr = (struct ibv_spec_header *)
847 		((uint8_t *)attr + sizeof(struct ibv_flow_attr));
848 
849 	for (i = 0; i != attr->num_of_specs; ++i) {
850 		if (hdr->type == search) {
851 			struct ibv_flow_spec_eth *e =
852 				(struct ibv_flow_spec_eth *)hdr;
853 
854 			e->val.vlan_tag = eth->val.vlan_tag;
855 			e->mask.vlan_tag = eth->mask.vlan_tag;
856 			e->val.ether_type = eth->val.ether_type;
857 			e->mask.ether_type = eth->mask.ether_type;
858 			break;
859 		}
860 		hdr = (struct ibv_spec_header *)((uint8_t *)hdr + hdr->size);
861 	}
862 }
863 
864 /**
865  * Convert the @p item into @p flow (or by updating the already present
866  * Ethernet Verbs) specification after ensuring the NIC will understand and
867  * process it correctly.
868  * If the necessary size for the conversion is greater than the @p flow_size,
869  * nothing is written in @p flow; the validation is still performed.
870  *
871  * @param[in] item
872  *   Item specification.
873  * @param[in, out] flow
874  *   Pointer to flow structure.
875  * @param[in] flow_size
876  *   Size in bytes of the available space in @p flow, if too small, nothing is
877  *   written.
878  * @param[out] error
879  *   Pointer to error structure.
880  *
881  *   On success, the number of bytes consumed/necessary; if the returned value
882  *   is less than or equal to @p flow_size, the @p item has been fully converted;
883  *   otherwise another call with this returned memory size should be done.
884  *   otherwise another call with this returned memory size should be done.
885  *   On error, a negative errno value is returned and rte_errno is set.
886  */
887 static int
888 mlx5_flow_item_vlan(const struct rte_flow_item *item, struct rte_flow *flow,
889 		    const size_t flow_size, struct rte_flow_error *error)
890 {
891 	const struct rte_flow_item_vlan *spec = item->spec;
892 	const struct rte_flow_item_vlan *mask = item->mask;
893 	const struct rte_flow_item_vlan nic_mask = {
894 		.tci = RTE_BE16(0x0fff),
895 		.inner_type = RTE_BE16(0xffff),
896 	};
897 	unsigned int size = sizeof(struct ibv_flow_spec_eth);
898 	const int tunnel = !!(flow->layers & MLX5_FLOW_LAYER_TUNNEL);
899 	struct ibv_flow_spec_eth eth = {
900 		.type = IBV_FLOW_SPEC_ETH | (tunnel ? IBV_FLOW_SPEC_INNER : 0),
901 		.size = size,
902 	};
903 	int ret;
904 	const uint32_t l34m = tunnel ? (MLX5_FLOW_LAYER_INNER_L3 |
905 					MLX5_FLOW_LAYER_INNER_L4) :
906 		(MLX5_FLOW_LAYER_OUTER_L3 | MLX5_FLOW_LAYER_OUTER_L4);
907 	const uint32_t vlanm = tunnel ? MLX5_FLOW_LAYER_INNER_VLAN :
908 		MLX5_FLOW_LAYER_OUTER_VLAN;
909 	const uint32_t l2m = tunnel ? MLX5_FLOW_LAYER_INNER_L2 :
910 		MLX5_FLOW_LAYER_OUTER_L2;
911 
912 	if (flow->layers & vlanm)
913 		return rte_flow_error_set(error, ENOTSUP,
914 					  RTE_FLOW_ERROR_TYPE_ITEM,
915 					  item,
916 					  "VLAN layer already configured");
917 	else if ((flow->layers & l34m) != 0)
918 		return rte_flow_error_set(error, ENOTSUP,
919 					  RTE_FLOW_ERROR_TYPE_ITEM,
920 					  item,
921 					  "L2 layer cannot follow L3/L4 layer");
922 	if (!mask)
923 		mask = &rte_flow_item_vlan_mask;
924 	ret = mlx5_flow_item_acceptable
925 		(item, (const uint8_t *)mask,
926 		 (const uint8_t *)&nic_mask,
927 		 sizeof(struct rte_flow_item_vlan), error);
928 	if (ret)
929 		return ret;
930 	if (spec) {
931 		eth.val.vlan_tag = spec->tci;
932 		eth.mask.vlan_tag = mask->tci;
933 		eth.val.vlan_tag &= eth.mask.vlan_tag;
934 		eth.val.ether_type = spec->inner_type;
935 		eth.mask.ether_type = mask->inner_type;
936 		eth.val.ether_type &= eth.mask.ether_type;
937 	}
938 	/*
939 	 * From the Verbs perspective, an empty VLAN is equivalent
940 	 * to a packet without a VLAN layer.
941 	 */
942 	if (!eth.mask.vlan_tag)
943 		return rte_flow_error_set(error, EINVAL,
944 					  RTE_FLOW_ERROR_TYPE_ITEM_SPEC,
945 					  item->spec,
946 					  "VLAN cannot be empty");
947 	if (!(flow->layers & l2m)) {
948 		if (size <= flow_size) {
949 			flow->cur_verbs->attr->priority = MLX5_PRIORITY_MAP_L2;
950 			mlx5_flow_spec_verbs_add(flow, &eth, size);
951 		}
952 	} else {
953 		if (flow->cur_verbs)
954 			mlx5_flow_item_vlan_update(flow->cur_verbs->attr,
955 						   &eth);
956 		size = 0; /* Only an update is done in eth specification. */
957 	}
958 	flow->layers |= tunnel ?
959 		(MLX5_FLOW_LAYER_INNER_L2 | MLX5_FLOW_LAYER_INNER_VLAN) :
960 		(MLX5_FLOW_LAYER_OUTER_L2 | MLX5_FLOW_LAYER_OUTER_VLAN);
961 	return size;
962 }
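
/*
 * For example, the pattern "eth / vlan (tci = 0x123) / ipv4" ends up with a
 * single Verbs ibv_flow_spec_eth: the ETH item creates it, and the VLAN item
 * only folds its vlan_tag and ether_type into that existing specification
 * through mlx5_flow_item_vlan_update(), contributing zero extra bytes.
 */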
963 
964 /**
965  * Convert the @p item into a Verbs specification after ensuring the NIC
966  * will understand and process it correctly.
967  * If the necessary size for the conversion is greater than the @p flow_size,
968  * nothing is written in @p flow; the validation is still performed.
969  *
970  * @param[in] item
971  *   Item specification.
972  * @param[in, out] flow
973  *   Pointer to flow structure.
974  * @param[in] flow_size
975  *   Size in bytes of the available space in @p flow, if too small, nothing is
976  *   written.
977  * @param[out] error
978  *   Pointer to error structure.
979  *
980  * @return
981  *   On success, the number of bytes consumed/necessary; if the returned value
982  *   is less than or equal to @p flow_size, the @p item has been fully converted;
983  *   otherwise another call with this returned memory size should be done.
984  *   On error, a negative errno value is returned and rte_errno is set.
985  */
986 static int
987 mlx5_flow_item_ipv4(const struct rte_flow_item *item, struct rte_flow *flow,
988 		    const size_t flow_size, struct rte_flow_error *error)
989 {
990 	const struct rte_flow_item_ipv4 *spec = item->spec;
991 	const struct rte_flow_item_ipv4 *mask = item->mask;
992 	const struct rte_flow_item_ipv4 nic_mask = {
993 		.hdr = {
994 			.src_addr = RTE_BE32(0xffffffff),
995 			.dst_addr = RTE_BE32(0xffffffff),
996 			.type_of_service = 0xff,
997 			.next_proto_id = 0xff,
998 		},
999 	};
1000 	const int tunnel = !!(flow->layers & MLX5_FLOW_LAYER_TUNNEL);
1001 	unsigned int size = sizeof(struct ibv_flow_spec_ipv4_ext);
1002 	struct ibv_flow_spec_ipv4_ext ipv4 = {
1003 		.type = IBV_FLOW_SPEC_IPV4_EXT |
1004 			(tunnel ? IBV_FLOW_SPEC_INNER : 0),
1005 		.size = size,
1006 	};
1007 	int ret;
1008 
1009 	if (flow->layers & (tunnel ? MLX5_FLOW_LAYER_INNER_L3 :
1010 			    MLX5_FLOW_LAYER_OUTER_L3))
1011 		return rte_flow_error_set(error, ENOTSUP,
1012 					  RTE_FLOW_ERROR_TYPE_ITEM,
1013 					  item,
1014 					  "multiple L3 layers not supported");
1015 	else if (flow->layers & (tunnel ? MLX5_FLOW_LAYER_INNER_L4 :
1016 				 MLX5_FLOW_LAYER_OUTER_L4))
1017 		return rte_flow_error_set(error, ENOTSUP,
1018 					  RTE_FLOW_ERROR_TYPE_ITEM,
1019 					  item,
1020 					  "L3 cannot follow an L4 layer.");
1021 	if (!mask)
1022 		mask = &rte_flow_item_ipv4_mask;
1023 	ret = mlx5_flow_item_acceptable
1024 		(item, (const uint8_t *)mask,
1025 		 (const uint8_t *)&nic_mask,
1026 		 sizeof(struct rte_flow_item_ipv4), error);
1027 	if (ret < 0)
1028 		return ret;
1029 	flow->layers |= tunnel ? MLX5_FLOW_LAYER_INNER_L3_IPV4 :
1030 		MLX5_FLOW_LAYER_OUTER_L3_IPV4;
1031 	if (spec) {
1032 		ipv4.val = (struct ibv_flow_ipv4_ext_filter){
1033 			.src_ip = spec->hdr.src_addr,
1034 			.dst_ip = spec->hdr.dst_addr,
1035 			.proto = spec->hdr.next_proto_id,
1036 			.tos = spec->hdr.type_of_service,
1037 		};
1038 		ipv4.mask = (struct ibv_flow_ipv4_ext_filter){
1039 			.src_ip = mask->hdr.src_addr,
1040 			.dst_ip = mask->hdr.dst_addr,
1041 			.proto = mask->hdr.next_proto_id,
1042 			.tos = mask->hdr.type_of_service,
1043 		};
1044 		/* Remove unwanted bits from values. */
1045 		ipv4.val.src_ip &= ipv4.mask.src_ip;
1046 		ipv4.val.dst_ip &= ipv4.mask.dst_ip;
1047 		ipv4.val.proto &= ipv4.mask.proto;
1048 		ipv4.val.tos &= ipv4.mask.tos;
1049 	}
1050 	flow->l3_protocol_en = !!ipv4.mask.proto;
1051 	flow->l3_protocol = ipv4.val.proto;
1052 	if (size <= flow_size) {
1053 		mlx5_flow_verbs_hashfields_adjust
1054 			(flow, tunnel,
1055 			 (ETH_RSS_IPV4 | ETH_RSS_FRAG_IPV4 |
1056 			  ETH_RSS_NONFRAG_IPV4_OTHER),
1057 			 (IBV_RX_HASH_SRC_IPV4 | IBV_RX_HASH_DST_IPV4));
1058 		flow->cur_verbs->attr->priority = MLX5_PRIORITY_MAP_L3;
1059 		mlx5_flow_spec_verbs_add(flow, &ipv4, size);
1060 	}
1061 	return size;
1062 }
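
/*
 * Note that a non-zero next_proto_id mask also primes l3_protocol_en and
 * l3_protocol above: an IPv4 item matching next_proto_id == 6 (TCP) makes a
 * subsequent UDP item fail with "protocol filtering not compatible with UDP
 * layer", while a TCP item is still accepted.
 */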
1063 
1064 /**
1065  * Convert the @p item into a Verbs specification after ensuring the NIC
1066  * will understand and process it correctly.
1067  * If the necessary size for the conversion is greater than the @p flow_size,
1068  * nothing is written in @p flow; the validation is still performed.
1069  *
1070  * @param[in] item
1071  *   Item specification.
1072  * @param[in, out] flow
1073  *   Pointer to flow structure.
1074  * @param[in] flow_size
1075  *   Size in bytes of the available space in @p flow, if too small, nothing is
1076  *   written.
1077  * @param[out] error
1078  *   Pointer to error structure.
1079  *
1080  * @return
1081  *   On success, the number of bytes consumed/necessary; if the returned value
1082  *   is less than or equal to @p flow_size, the @p item has been fully converted;
1083  *   otherwise another call with this returned memory size should be done.
1084  *   On error, a negative errno value is returned and rte_errno is set.
1085  */
1086 static int
1087 mlx5_flow_item_ipv6(const struct rte_flow_item *item, struct rte_flow *flow,
1088 		    const size_t flow_size, struct rte_flow_error *error)
1089 {
1090 	const struct rte_flow_item_ipv6 *spec = item->spec;
1091 	const struct rte_flow_item_ipv6 *mask = item->mask;
1092 	const struct rte_flow_item_ipv6 nic_mask = {
1093 		.hdr = {
1094 			.src_addr =
1095 				"\xff\xff\xff\xff\xff\xff\xff\xff"
1096 				"\xff\xff\xff\xff\xff\xff\xff\xff",
1097 			.dst_addr =
1098 				"\xff\xff\xff\xff\xff\xff\xff\xff"
1099 				"\xff\xff\xff\xff\xff\xff\xff\xff",
1100 			.vtc_flow = RTE_BE32(0xffffffff),
1101 			.proto = 0xff,
1102 			.hop_limits = 0xff,
1103 		},
1104 	};
1105 	const int tunnel = !!(flow->layers & MLX5_FLOW_LAYER_TUNNEL);
1106 	unsigned int size = sizeof(struct ibv_flow_spec_ipv6);
1107 	struct ibv_flow_spec_ipv6 ipv6 = {
1108 		.type = IBV_FLOW_SPEC_IPV6 | (tunnel ? IBV_FLOW_SPEC_INNER : 0),
1109 		.size = size,
1110 	};
1111 	int ret;
1112 
1113 	if (flow->layers & (tunnel ? MLX5_FLOW_LAYER_INNER_L3 :
1114 			    MLX5_FLOW_LAYER_OUTER_L3))
1115 		return rte_flow_error_set(error, ENOTSUP,
1116 					  RTE_FLOW_ERROR_TYPE_ITEM,
1117 					  item,
1118 					  "multiple L3 layers not supported");
1119 	else if (flow->layers & (tunnel ? MLX5_FLOW_LAYER_INNER_L4 :
1120 				 MLX5_FLOW_LAYER_OUTER_L4))
1121 		return rte_flow_error_set(error, ENOTSUP,
1122 					  RTE_FLOW_ERROR_TYPE_ITEM,
1123 					  item,
1124 					  "L3 cannot follow an L4 layer.");
1125 	/*
1126 	 * IPv6 is not recognised by the NIC inside a GRE tunnel.
1127 	 * Such support has to be disabled here because the rule would
1128 	 * still be accepted by the NIC.  Issue reproduced with Mellanox
1129 	 * OFED 4.3-3.0.2.1 and Mellanox OFED 4.4-1.0.0.0.
1130 	 */
1131 	if (tunnel && flow->layers & MLX5_FLOW_LAYER_GRE)
1132 		return rte_flow_error_set(error, ENOTSUP,
1133 					  RTE_FLOW_ERROR_TYPE_ITEM,
1134 					  item,
1135 					  "IPv6 inside a GRE tunnel is"
1136 					  " not recognised.");
1137 	if (!mask)
1138 		mask = &rte_flow_item_ipv6_mask;
1139 	ret = mlx5_flow_item_acceptable
1140 		(item, (const uint8_t *)mask,
1141 		 (const uint8_t *)&nic_mask,
1142 		 sizeof(struct rte_flow_item_ipv6), error);
1143 	if (ret < 0)
1144 		return ret;
1145 	flow->layers |= tunnel ? MLX5_FLOW_LAYER_INNER_L3_IPV6 :
1146 		MLX5_FLOW_LAYER_OUTER_L3_IPV6;
1147 	if (spec) {
1148 		unsigned int i;
1149 		uint32_t vtc_flow_val;
1150 		uint32_t vtc_flow_mask;
1151 
1152 		memcpy(&ipv6.val.src_ip, spec->hdr.src_addr,
1153 		       RTE_DIM(ipv6.val.src_ip));
1154 		memcpy(&ipv6.val.dst_ip, spec->hdr.dst_addr,
1155 		       RTE_DIM(ipv6.val.dst_ip));
1156 		memcpy(&ipv6.mask.src_ip, mask->hdr.src_addr,
1157 		       RTE_DIM(ipv6.mask.src_ip));
1158 		memcpy(&ipv6.mask.dst_ip, mask->hdr.dst_addr,
1159 		       RTE_DIM(ipv6.mask.dst_ip));
1160 		vtc_flow_val = rte_be_to_cpu_32(spec->hdr.vtc_flow);
1161 		vtc_flow_mask = rte_be_to_cpu_32(mask->hdr.vtc_flow);
1162 		ipv6.val.flow_label =
1163 			rte_cpu_to_be_32((vtc_flow_val & IPV6_HDR_FL_MASK) >>
1164 					 IPV6_HDR_FL_SHIFT);
1165 		ipv6.val.traffic_class = (vtc_flow_val & IPV6_HDR_TC_MASK) >>
1166 					 IPV6_HDR_TC_SHIFT;
1167 		ipv6.val.next_hdr = spec->hdr.proto;
1168 		ipv6.val.hop_limit = spec->hdr.hop_limits;
1169 		ipv6.mask.flow_label =
1170 			rte_cpu_to_be_32((vtc_flow_mask & IPV6_HDR_FL_MASK) >>
1171 					 IPV6_HDR_FL_SHIFT);
1172 		ipv6.mask.traffic_class = (vtc_flow_mask & IPV6_HDR_TC_MASK) >>
1173 					  IPV6_HDR_TC_SHIFT;
1174 		ipv6.mask.next_hdr = mask->hdr.proto;
1175 		ipv6.mask.hop_limit = mask->hdr.hop_limits;
1176 		/* Remove unwanted bits from values. */
1177 		for (i = 0; i < RTE_DIM(ipv6.val.src_ip); ++i) {
1178 			ipv6.val.src_ip[i] &= ipv6.mask.src_ip[i];
1179 			ipv6.val.dst_ip[i] &= ipv6.mask.dst_ip[i];
1180 		}
1181 		ipv6.val.flow_label &= ipv6.mask.flow_label;
1182 		ipv6.val.traffic_class &= ipv6.mask.traffic_class;
1183 		ipv6.val.next_hdr &= ipv6.mask.next_hdr;
1184 		ipv6.val.hop_limit &= ipv6.mask.hop_limit;
1185 	}
1186 	flow->l3_protocol_en = !!ipv6.mask.next_hdr;
1187 	flow->l3_protocol = ipv6.val.next_hdr;
1188 	if (size <= flow_size) {
1189 		mlx5_flow_verbs_hashfields_adjust
1190 			(flow, tunnel,
1191 			 (ETH_RSS_IPV6 | ETH_RSS_NONFRAG_IPV6_OTHER),
1192 			 (IBV_RX_HASH_SRC_IPV6 | IBV_RX_HASH_DST_IPV6));
1193 		flow->cur_verbs->attr->priority = MLX5_PRIORITY_MAP_L3;
1194 		mlx5_flow_spec_verbs_add(flow, &ipv6, size);
1195 	}
1196 	return size;
1197 }
1198 
1199 /**
1200  * Convert the @p item into a Verbs specification after ensuring the NIC
1201  * will understand and process it correctly.
1202  * If the necessary size for the conversion is greater than the @p flow_size,
1203  * nothing is written in @p flow; the validation is still performed.
1204  *
1205  * @param[in] item
1206  *   Item specification.
1207  * @param[in, out] flow
1208  *   Pointer to flow structure.
1209  * @param[in] flow_size
1210  *   Size in bytes of the available space in @p flow, if too small, nothing is
1211  *   written.
1212  * @param[out] error
1213  *   Pointer to error structure.
1214  *
1215  * @return
1216  *   On success, the number of bytes consumed/necessary; if the returned value
1217  *   is less than or equal to @p flow_size, the @p item has been fully converted;
1218  *   otherwise another call with this returned memory size should be done.
1219  *   On error, a negative errno value is returned and rte_errno is set.
1220  */
1221 static int
1222 mlx5_flow_item_udp(const struct rte_flow_item *item, struct rte_flow *flow,
1223 		   const size_t flow_size, struct rte_flow_error *error)
1224 {
1225 	const struct rte_flow_item_udp *spec = item->spec;
1226 	const struct rte_flow_item_udp *mask = item->mask;
1227 	const int tunnel = !!(flow->layers & MLX5_FLOW_LAYER_TUNNEL);
1228 	unsigned int size = sizeof(struct ibv_flow_spec_tcp_udp);
1229 	struct ibv_flow_spec_tcp_udp udp = {
1230 		.type = IBV_FLOW_SPEC_UDP | (tunnel ? IBV_FLOW_SPEC_INNER : 0),
1231 		.size = size,
1232 	};
1233 	int ret;
1234 
1235 	if (flow->l3_protocol_en && flow->l3_protocol != MLX5_IP_PROTOCOL_UDP)
1236 		return rte_flow_error_set(error, ENOTSUP,
1237 					  RTE_FLOW_ERROR_TYPE_ITEM,
1238 					  item,
1239 					  "protocol filtering not compatible"
1240 					  " with UDP layer");
1241 	if (!(flow->layers & (tunnel ? MLX5_FLOW_LAYER_INNER_L3 :
1242 			      MLX5_FLOW_LAYER_OUTER_L3)))
1243 		return rte_flow_error_set(error, ENOTSUP,
1244 					  RTE_FLOW_ERROR_TYPE_ITEM,
1245 					  item,
1246 					  "L3 is mandatory to filter"
1247 					  " on L4");
1248 	if (flow->layers & (tunnel ? MLX5_FLOW_LAYER_INNER_L4 :
1249 			    MLX5_FLOW_LAYER_OUTER_L4))
1250 		return rte_flow_error_set(error, ENOTSUP,
1251 					  RTE_FLOW_ERROR_TYPE_ITEM,
1252 					  item,
1253 					  "L4 layer is already"
1254 					  " present");
1255 	if (!mask)
1256 		mask = &rte_flow_item_udp_mask;
1257 	ret = mlx5_flow_item_acceptable
1258 		(item, (const uint8_t *)mask,
1259 		 (const uint8_t *)&rte_flow_item_udp_mask,
1260 		 sizeof(struct rte_flow_item_udp), error);
1261 	if (ret < 0)
1262 		return ret;
1263 	flow->layers |= tunnel ? MLX5_FLOW_LAYER_INNER_L4_UDP :
1264 		MLX5_FLOW_LAYER_OUTER_L4_UDP;
1265 	if (spec) {
1266 		udp.val.dst_port = spec->hdr.dst_port;
1267 		udp.val.src_port = spec->hdr.src_port;
1268 		udp.mask.dst_port = mask->hdr.dst_port;
1269 		udp.mask.src_port = mask->hdr.src_port;
1270 		/* Remove unwanted bits from values. */
1271 		udp.val.src_port &= udp.mask.src_port;
1272 		udp.val.dst_port &= udp.mask.dst_port;
1273 	}
1274 	if (size <= flow_size) {
1275 		mlx5_flow_verbs_hashfields_adjust(flow, tunnel, ETH_RSS_UDP,
1276 						  (IBV_RX_HASH_SRC_PORT_UDP |
1277 						   IBV_RX_HASH_DST_PORT_UDP));
1278 		flow->cur_verbs->attr->priority = MLX5_PRIORITY_MAP_L4;
1279 		mlx5_flow_spec_verbs_add(flow, &udp, size);
1280 	}
1281 	return size;
1282 }
1283 
1284 /**
1285  * Convert the @p item into a Verbs specification after ensuring the NIC
1286  * will understand and process it correctly.
1287  * If the necessary size for the conversion is greater than the @p flow_size,
1288  * nothing is written in @p flow; the validation is still performed.
1289  *
1290  * @param[in] item
1291  *   Item specification.
1292  * @param[in, out] flow
1293  *   Pointer to flow structure.
1294  * @param[in] flow_size
1295  *   Size in bytes of the available space in @p flow, if too small, nothing is
1296  *   written.
1297  * @param[out] error
1298  *   Pointer to error structure.
1299  *
1300  * @return
1301  *   On success, the number of bytes consumed/necessary; if the returned value
1302  *   is less than or equal to @p flow_size, the @p item has been fully converted;
1303  *   otherwise another call with this returned memory size should be done.
1304  *   On error, a negative errno value is returned and rte_errno is set.
1305  */
1306 static int
1307 mlx5_flow_item_tcp(const struct rte_flow_item *item, struct rte_flow *flow,
1308 		   const size_t flow_size, struct rte_flow_error *error)
1309 {
1310 	const struct rte_flow_item_tcp *spec = item->spec;
1311 	const struct rte_flow_item_tcp *mask = item->mask;
1312 	const int tunnel = !!(flow->layers & MLX5_FLOW_LAYER_TUNNEL);
1313 	unsigned int size = sizeof(struct ibv_flow_spec_tcp_udp);
1314 	struct ibv_flow_spec_tcp_udp tcp = {
1315 		.type = IBV_FLOW_SPEC_TCP | (tunnel ? IBV_FLOW_SPEC_INNER : 0),
1316 		.size = size,
1317 	};
1318 	int ret;
1319 
1320 	if (flow->l3_protocol_en && flow->l3_protocol != MLX5_IP_PROTOCOL_TCP)
1321 		return rte_flow_error_set(error, ENOTSUP,
1322 					  RTE_FLOW_ERROR_TYPE_ITEM,
1323 					  item,
1324 					  "protocol filtering not compatible"
1325 					  " with TCP layer");
1326 	if (!(flow->layers & (tunnel ? MLX5_FLOW_LAYER_INNER_L3 :
1327 			      MLX5_FLOW_LAYER_OUTER_L3)))
1328 		return rte_flow_error_set(error, ENOTSUP,
1329 					  RTE_FLOW_ERROR_TYPE_ITEM,
1330 					  item,
1331 					  "L3 is mandatory to filter on L4");
1332 	if (flow->layers & (tunnel ? MLX5_FLOW_LAYER_INNER_L4 :
1333 			    MLX5_FLOW_LAYER_OUTER_L4))
1334 		return rte_flow_error_set(error, ENOTSUP,
1335 					  RTE_FLOW_ERROR_TYPE_ITEM,
1336 					  item,
1337 					  "L4 layer is already present");
1338 	if (!mask)
1339 		mask = &rte_flow_item_tcp_mask;
1340 	ret = mlx5_flow_item_acceptable
1341 		(item, (const uint8_t *)mask,
1342 		 (const uint8_t *)&rte_flow_item_tcp_mask,
1343 		 sizeof(struct rte_flow_item_tcp), error);
1344 	if (ret < 0)
1345 		return ret;
1346 	flow->layers |=  tunnel ? MLX5_FLOW_LAYER_INNER_L4_TCP :
1347 		MLX5_FLOW_LAYER_OUTER_L4_TCP;
1348 	if (spec) {
1349 		tcp.val.dst_port = spec->hdr.dst_port;
1350 		tcp.val.src_port = spec->hdr.src_port;
1351 		tcp.mask.dst_port = mask->hdr.dst_port;
1352 		tcp.mask.src_port = mask->hdr.src_port;
1353 		/* Remove unwanted bits from values. */
1354 		tcp.val.src_port &= tcp.mask.src_port;
1355 		tcp.val.dst_port &= tcp.mask.dst_port;
1356 	}
1357 	if (size <= flow_size) {
1358 		mlx5_flow_verbs_hashfields_adjust(flow, tunnel, ETH_RSS_TCP,
1359 						  (IBV_RX_HASH_SRC_PORT_TCP |
1360 						   IBV_RX_HASH_DST_PORT_TCP));
1361 		flow->cur_verbs->attr->priority = MLX5_PRIORITY_MAP_L4;
1362 		mlx5_flow_spec_verbs_add(flow, &tcp, size);
1363 	}
1364 	return size;
1365 }
1366 
1367 /**
1368  * Convert the @p item into a Verbs specification after ensuring the NIC
1369  * will understand and process it correctly.
1370  * If the necessary size for the conversion is greater than the @p flow_size,
1371  * nothing is written in @p flow; the validation is still performed.
1372  *
1373  * @param[in] item
1374  *   Item specification.
1375  * @param[in, out] flow
1376  *   Pointer to flow structure.
1377  * @param[in] flow_size
1378  *   Size in bytes of the available space in @p flow, if too small, nothing is
1379  *   written.
1380  * @param[out] error
1381  *   Pointer to error structure.
1382  *
1383  * @return
1384  *   On success, the number of bytes consumed/necessary; if the returned value
1385  *   is less than or equal to @p flow_size, the @p item has been fully converted;
1386  *   otherwise another call with this returned memory size should be done.
1387  *   On error, a negative errno value is returned and rte_errno is set.
1388  */
1389 static int
1390 mlx5_flow_item_vxlan(const struct rte_flow_item *item, struct rte_flow *flow,
1391 		     const size_t flow_size, struct rte_flow_error *error)
1392 {
1393 	const struct rte_flow_item_vxlan *spec = item->spec;
1394 	const struct rte_flow_item_vxlan *mask = item->mask;
1395 	unsigned int size = sizeof(struct ibv_flow_spec_tunnel);
1396 	struct ibv_flow_spec_tunnel vxlan = {
1397 		.type = IBV_FLOW_SPEC_VXLAN_TUNNEL,
1398 		.size = size,
1399 	};
1400 	int ret;
1401 	union vni {
1402 		uint32_t vlan_id;
1403 		uint8_t vni[4];
1404 	} id = { .vlan_id = 0, };
1405 
1406 	if (flow->layers & MLX5_FLOW_LAYER_TUNNEL)
1407 		return rte_flow_error_set(error, ENOTSUP,
1408 					  RTE_FLOW_ERROR_TYPE_ITEM,
1409 					  item,
1410 					  "a tunnel is already present");
1411 	/*
1412 	 * Verify only UDPv4 is present as defined in
1413 	 * https://tools.ietf.org/html/rfc7348
1414 	 */
1415 	if (!(flow->layers & MLX5_FLOW_LAYER_OUTER_L4_UDP))
1416 		return rte_flow_error_set(error, ENOTSUP,
1417 					  RTE_FLOW_ERROR_TYPE_ITEM,
1418 					  item,
1419 					  "no outer UDP layer found");
1420 	if (!mask)
1421 		mask = &rte_flow_item_vxlan_mask;
1422 	ret = mlx5_flow_item_acceptable
1423 		(item, (const uint8_t *)mask,
1424 		 (const uint8_t *)&rte_flow_item_vxlan_mask,
1425 		 sizeof(struct rte_flow_item_vxlan), error);
1426 	if (ret < 0)
1427 		return ret;
1428 	if (spec) {
1429 		memcpy(&id.vni[1], spec->vni, 3);
1430 		vxlan.val.tunnel_id = id.vlan_id;
1431 		memcpy(&id.vni[1], mask->vni, 3);
1432 		vxlan.mask.tunnel_id = id.vlan_id;
1433 		/* Remove unwanted bits from values. */
1434 		vxlan.val.tunnel_id &= vxlan.mask.tunnel_id;
1435 	}
1436 	/*
1437 	 * Tunnel id 0 is equivalent to not adding a VXLAN layer: if
1438 	 * only this layer is defined in the Verbs specification it is
1439 	 * interpreted as a wildcard and all packets will match this
1440 	 * rule; if it follows a full stack layer (ex: eth / ipv4 /
1441 	 * udp), all packets matching the preceding layers will also
1442 	 * match this rule.  To avoid such a situation, VNI 0 is
1443 	 * currently refused.
1444 	 */
1445 	if (!vxlan.val.tunnel_id)
1446 		return rte_flow_error_set(error, EINVAL,
1447 					  RTE_FLOW_ERROR_TYPE_ITEM,
1448 					  item,
1449 					  "VXLAN vni cannot be 0");
1450 	if (!(flow->layers & MLX5_FLOW_LAYER_OUTER))
1451 		return rte_flow_error_set(error, EINVAL,
1452 					  RTE_FLOW_ERROR_TYPE_ITEM,
1453 					  item,
1454 					  "VXLAN tunnel must be fully defined");
1455 	if (size <= flow_size) {
1456 		mlx5_flow_spec_verbs_add(flow, &vxlan, size);
1457 		flow->cur_verbs->attr->priority = MLX5_PRIORITY_MAP_L2;
1458 	}
1459 	flow->layers |= MLX5_FLOW_LAYER_VXLAN;
1460 	return size;
1461 }
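
/*
 * A VXLAN rule therefore has to spell out the whole outer stack, for
 * instance
 *
 *   eth / ipv4 / udp (dst port 4789) / vxlan (vni 100) / eth / ipv4
 *
 * whereas a bare "vxlan" pattern is rejected above for lacking the outer
 * UDP layer, and VNI 0 is refused as well.
 */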
1462 
1463 /**
1464  * Convert the @p item into a Verbs specification after ensuring the NIC
1465  * will understand and process it correctly.
1466  * If the necessary size for the conversion is greater than the @p flow_size,
1467  * nothing is written in @p flow; the validation is still performed.
1468  *
1469  * @param dev
1470  *   Pointer to Ethernet device.
1471  * @param[in] item
1472  *   Item specification.
1473  * @param[in, out] flow
1474  *   Pointer to flow structure.
1475  * @param[in] flow_size
1476  *   Size in bytes of the available space in @p flow, if too small, nothing is
1477  *   written.
1478  * @param[out] error
1479  *   Pointer to error structure.
1480  *
1481  * @return
1482  *   On success, the number of bytes consumed/necessary; if the returned value
1483  *   is less than or equal to @p flow_size, the @p item has been fully converted;
1484  *   otherwise another call with this returned memory size should be done.
1485  *   On error, a negative errno value is returned and rte_errno is set.
1486  */
1487 static int
1488 mlx5_flow_item_vxlan_gpe(struct rte_eth_dev *dev,
1489 			 const struct rte_flow_item *item,
1490 			 struct rte_flow *flow, const size_t flow_size,
1491 			 struct rte_flow_error *error)
1492 {
1493 	const struct rte_flow_item_vxlan_gpe *spec = item->spec;
1494 	const struct rte_flow_item_vxlan_gpe *mask = item->mask;
1495 	unsigned int size = sizeof(struct ibv_flow_spec_tunnel);
1496 	struct ibv_flow_spec_tunnel vxlan_gpe = {
1497 		.type = IBV_FLOW_SPEC_VXLAN_TUNNEL,
1498 		.size = size,
1499 	};
1500 	int ret;
1501 	union vni {
1502 		uint32_t vlan_id;
1503 		uint8_t vni[4];
1504 	} id = { .vlan_id = 0, };
1505 
1506 	if (!((struct priv *)dev->data->dev_private)->config.l3_vxlan_en)
1507 		return rte_flow_error_set(error, ENOTSUP,
1508 					  RTE_FLOW_ERROR_TYPE_ITEM,
1509 					  item,
1510 					  "L3 VXLAN is not enabled by device"
1511 					  " parameter and/or not configured in"
1512 					  " firmware");
1513 	if (flow->layers & MLX5_FLOW_LAYER_TUNNEL)
1514 		return rte_flow_error_set(error, ENOTSUP,
1515 					  RTE_FLOW_ERROR_TYPE_ITEM,
1516 					  item,
1517 					  "a tunnel is already present");
1518 	/*
1519 	 * Verify only UDPv4 is present as defined in
1520 	 * https://tools.ietf.org/html/rfc7348
1521 	 */
1522 	if (!(flow->layers & MLX5_FLOW_LAYER_OUTER_L4_UDP))
1523 		return rte_flow_error_set(error, ENOTSUP,
1524 					  RTE_FLOW_ERROR_TYPE_ITEM,
1525 					  item,
1526 					  "no outer UDP layer found");
1527 	if (!mask)
1528 		mask = &rte_flow_item_vxlan_gpe_mask;
1529 	ret = mlx5_flow_item_acceptable
1530 		(item, (const uint8_t *)mask,
1531 		 (const uint8_t *)&rte_flow_item_vxlan_gpe_mask,
1532 		 sizeof(struct rte_flow_item_vxlan_gpe), error);
1533 	if (ret < 0)
1534 		return ret;
1535 	if (spec) {
1536 		memcpy(&id.vni[1], spec->vni, 3);
1537 		vxlan_gpe.val.tunnel_id = id.vlan_id;
1538 		memcpy(&id.vni[1], mask->vni, 3);
1539 		vxlan_gpe.mask.tunnel_id = id.vlan_id;
1540 		if (spec->protocol)
1541 			return rte_flow_error_set
1542 				(error, EINVAL,
1543 				 RTE_FLOW_ERROR_TYPE_ITEM,
1544 				 item,
1545 				 "VxLAN-GPE protocol not supported");
1546 		/* Remove unwanted bits from values. */
1547 		vxlan_gpe.val.tunnel_id &= vxlan_gpe.mask.tunnel_id;
1548 	}
1549 	/*
1550 	 * Tunnel id 0 is equivalent to not adding a VXLAN layer: if only this
1551 	 * layer is defined in the Verbs specification it is interpreted as a
1552 	 * wildcard and all packets will match this rule; if it follows a full
1553 	 * stack layer (ex: eth / ipv4 / udp), all packets matching the preceding
1554 	 * layers will also match this rule.  To avoid such a situation, VNI 0
1555 	 * is currently refused.
1556 	 */
1557 	if (!vxlan_gpe.val.tunnel_id)
1558 		return rte_flow_error_set(error, EINVAL,
1559 					  RTE_FLOW_ERROR_TYPE_ITEM,
1560 					  item,
1561 					  "VXLAN-GPE vni cannot be 0");
1562 	if (!(flow->layers & MLX5_FLOW_LAYER_OUTER))
1563 		return rte_flow_error_set(error, EINVAL,
1564 					  RTE_FLOW_ERROR_TYPE_ITEM,
1565 					  item,
1566 					  "VXLAN-GPE tunnel must be fully"
1567 					  " defined");
1568 	if (size <= flow_size) {
1569 		mlx5_flow_spec_verbs_add(flow, &vxlan_gpe, size);
1570 		flow->cur_verbs->attr->priority = MLX5_PRIORITY_MAP_L2;
1571 	}
1572 	flow->layers |= MLX5_FLOW_LAYER_VXLAN_GPE;
1573 	return size;
1574 }
1575 
1576 /**
1577  * Update the protocol in Verbs IPv4/IPv6 spec.
1578  *
1579  * @param[in, out] attr
1580  *   Pointer to Verbs attributes structure.
1581  * @param[in] search
1582  *   Specification type to search in order to update the IP protocol.
1583  * @param[in] protocol
1584  *   Protocol value to set if none is present in the specification.
1585  */
1586 static void
1587 mlx5_flow_item_gre_ip_protocol_update(struct ibv_flow_attr *attr,
1588 				      enum ibv_flow_spec_type search,
1589 				      uint8_t protocol)
1590 {
1591 	unsigned int i;
1592 	struct ibv_spec_header *hdr = (struct ibv_spec_header *)
1593 		((uint8_t *)attr + sizeof(struct ibv_flow_attr));
1594 
1595 	if (!attr)
1596 		return;
1597 	for (i = 0; i != attr->num_of_specs; ++i) {
1598 		if (hdr->type == search) {
1599 			union {
1600 				struct ibv_flow_spec_ipv4_ext *ipv4;
1601 				struct ibv_flow_spec_ipv6 *ipv6;
1602 			} ip;
1603 
1604 			switch (search) {
1605 			case IBV_FLOW_SPEC_IPV4_EXT:
1606 				ip.ipv4 = (struct ibv_flow_spec_ipv4_ext *)hdr;
1607 				if (!ip.ipv4->val.proto) {
1608 					ip.ipv4->val.proto = protocol;
1609 					ip.ipv4->mask.proto = 0xff;
1610 				}
1611 				break;
1612 			case IBV_FLOW_SPEC_IPV6:
1613 				ip.ipv6 = (struct ibv_flow_spec_ipv6 *)hdr;
1614 				if (!ip.ipv6->val.next_hdr) {
1615 					ip.ipv6->val.next_hdr = protocol;
1616 					ip.ipv6->mask.next_hdr = 0xff;
1617 				}
1618 				break;
1619 			default:
1620 				break;
1621 			}
1622 			break;
1623 		}
1624 		hdr = (struct ibv_spec_header *)((uint8_t *)hdr + hdr->size);
1625 	}
1626 }
1627 
1628 /**
1629  * Convert the @p item into a Verbs specification after ensuring the NIC
1630  * will understand and process it correctly.
1631  * It will also update the previous L3 layer with the protocol value matching
1632  * the GRE.
1633  * If the necessary size for the conversion is greater than the @p flow_size,
1634  * nothing is written in @p flow, the validation is still performed.
1635  *
1638  * @param[in] item
1639  *   Item specification.
1640  * @param[in, out] flow
1641  *   Pointer to flow structure.
1642  * @param[in] flow_size
1643  *   Size in bytes of the available space in @p flow, if too small, nothing is
1644  *   written.
1645  * @param[out] error
1646  *   Pointer to error structure.
1647  *
1648  * @return
1649  *   On success the number of bytes consumed/necessary, if the returned value
1650  *   is lesser or equal to @p flow_size, the @p item has fully been converted,
1651  *   otherwise another call with this returned memory size should be done.
1652  *   On error, a negative errno value is returned and rte_errno is set.
1653  */
1654 static int
1655 mlx5_flow_item_gre(const struct rte_flow_item *item,
1656 		   struct rte_flow *flow, const size_t flow_size,
1657 		   struct rte_flow_error *error)
1658 {
1659 	struct mlx5_flow_verbs *verbs = flow->cur_verbs;
1660 	const struct rte_flow_item_gre *spec = item->spec;
1661 	const struct rte_flow_item_gre *mask = item->mask;
1662 #ifdef HAVE_IBV_DEVICE_MPLS_SUPPORT
1663 	unsigned int size = sizeof(struct ibv_flow_spec_gre);
1664 	struct ibv_flow_spec_gre tunnel = {
1665 		.type = IBV_FLOW_SPEC_GRE,
1666 		.size = size,
1667 	};
1668 #else
1669 	unsigned int size = sizeof(struct ibv_flow_spec_tunnel);
1670 	struct ibv_flow_spec_tunnel tunnel = {
1671 		.type = IBV_FLOW_SPEC_VXLAN_TUNNEL,
1672 		.size = size,
1673 	};
1674 #endif
1675 	int ret;
1676 
1677 	if (flow->l3_protocol_en && flow->l3_protocol != MLX5_IP_PROTOCOL_GRE)
1678 		return rte_flow_error_set(error, ENOTSUP,
1679 					  RTE_FLOW_ERROR_TYPE_ITEM,
1680 					  item,
1681 					  "protocol filtering not compatible"
1682 					  " with this GRE layer");
1683 	if (flow->layers & MLX5_FLOW_LAYER_TUNNEL)
1684 		return rte_flow_error_set(error, ENOTSUP,
1685 					  RTE_FLOW_ERROR_TYPE_ITEM,
1686 					  item,
1687 					  "a tunnel is already present");
1688 	if (!(flow->layers & MLX5_FLOW_LAYER_OUTER_L3))
1689 		return rte_flow_error_set(error, ENOTSUP,
1690 					  RTE_FLOW_ERROR_TYPE_ITEM,
1691 					  item,
1692 					  "L3 Layer is missing");
1693 	if (!mask)
1694 		mask = &rte_flow_item_gre_mask;
1695 	ret = mlx5_flow_item_acceptable
1696 		(item, (const uint8_t *)mask,
1697 		 (const uint8_t *)&rte_flow_item_gre_mask,
1698 		 sizeof(struct rte_flow_item_gre), error);
1699 	if (ret < 0)
1700 		return ret;
1701 #ifdef HAVE_IBV_DEVICE_MPLS_SUPPORT
1702 	if (spec) {
1703 		tunnel.val.c_ks_res0_ver = spec->c_rsvd0_ver;
1704 		tunnel.val.protocol = spec->protocol;
1705 		tunnel.mask.c_ks_res0_ver = mask->c_rsvd0_ver;
1706 		tunnel.mask.protocol = mask->protocol;
1707 		/* Remove unwanted bits from values. */
1708 		tunnel.val.c_ks_res0_ver &= tunnel.mask.c_ks_res0_ver;
1709 		tunnel.val.protocol &= tunnel.mask.protocol;
1710 		tunnel.val.key &= tunnel.mask.key;
1711 	}
1712 #else
1713 	if (spec && (spec->protocol & mask->protocol))
1714 		return rte_flow_error_set(error, ENOTSUP,
1715 					  RTE_FLOW_ERROR_TYPE_ITEM,
1716 					  item,
1717 					  "without MPLS support the"
1718 					  " specification cannot be used for"
1719 					  " filtering");
1720 #endif /* !HAVE_IBV_DEVICE_MPLS_SUPPORT */
1721 	if (size <= flow_size) {
1722 		if (flow->layers & MLX5_FLOW_LAYER_OUTER_L3_IPV4)
1723 			mlx5_flow_item_gre_ip_protocol_update
1724 				(verbs->attr, IBV_FLOW_SPEC_IPV4_EXT,
1725 				 MLX5_IP_PROTOCOL_GRE);
1726 		else
1727 			mlx5_flow_item_gre_ip_protocol_update
1728 				(verbs->attr, IBV_FLOW_SPEC_IPV6,
1729 				 MLX5_IP_PROTOCOL_GRE);
1730 		mlx5_flow_spec_verbs_add(flow, &tunnel, size);
1731 		flow->cur_verbs->attr->priority = MLX5_PRIORITY_MAP_L2;
1732 	}
1733 	flow->layers |= MLX5_FLOW_LAYER_GRE;
1734 	return size;
1735 }
1736 
1737 /**
1738  * Convert the @p item into a Verbs specification after ensuring the NIC
1739  * will understand and process it correctly.
1740  * If the necessary size for the conversion is greater than the @p flow_size,
1741  * nothing is written in @p flow, the validation is still performed.
1742  *
1743  * @param[in] item
1744  *   Item specification.
1745  * @param[in, out] flow
1746  *   Pointer to flow structure.
1747  * @param[in] flow_size
1748  *   Size in bytes of the available space in @p flow, if too small, nothing is
1749  *   written.
1750  * @param[out] error
1751  *   Pointer to error structure.
1752  *
1753  * @return
1754  *   On success the number of bytes consumed/necessary, if the returned value
1755  *   is lesser or equal to @p flow_size, the @p item has fully been converted,
1756  *   otherwise another call with this returned memory size should be done.
1757  *   On error, a negative errno value is returned and rte_errno is set.
1758  */
1759 static int
1760 mlx5_flow_item_mpls(const struct rte_flow_item *item __rte_unused,
1761 		    struct rte_flow *flow __rte_unused,
1762 		    const size_t flow_size __rte_unused,
1763 		    struct rte_flow_error *error)
1764 {
1765 #ifdef HAVE_IBV_DEVICE_MPLS_SUPPORT
1766 	const struct rte_flow_item_mpls *spec = item->spec;
1767 	const struct rte_flow_item_mpls *mask = item->mask;
1768 	unsigned int size = sizeof(struct ibv_flow_spec_mpls);
1769 	struct ibv_flow_spec_mpls mpls = {
1770 		.type = IBV_FLOW_SPEC_MPLS,
1771 		.size = size,
1772 	};
1773 	int ret;
1774 
1775 	if (flow->l3_protocol_en && flow->l3_protocol != MLX5_IP_PROTOCOL_MPLS)
1776 		return rte_flow_error_set(error, ENOTSUP,
1777 					  RTE_FLOW_ERROR_TYPE_ITEM,
1778 					  item,
1779 					  "protocol filtering not compatible"
1780 					  " with MPLS layer");
1781 	/* Multi-tunnel isn't allowed but MPLS over GRE is an exception. */
1782 	if (flow->layers & MLX5_FLOW_LAYER_TUNNEL &&
1783 	    (flow->layers & MLX5_FLOW_LAYER_GRE) != MLX5_FLOW_LAYER_GRE)
1784 		return rte_flow_error_set(error, ENOTSUP,
1785 					  RTE_FLOW_ERROR_TYPE_ITEM,
1786 					  item,
1787 					  "a tunnel is already"
1788 					  " present");
1789 	if (!mask)
1790 		mask = &rte_flow_item_mpls_mask;
1791 	ret = mlx5_flow_item_acceptable
1792 		(item, (const uint8_t *)mask,
1793 		 (const uint8_t *)&rte_flow_item_mpls_mask,
1794 		 sizeof(struct rte_flow_item_mpls), error);
1795 	if (ret < 0)
1796 		return ret;
1797 	if (spec) {
1798 		memcpy(&mpls.val.label, spec, sizeof(mpls.val.label));
1799 		memcpy(&mpls.mask.label, mask, sizeof(mpls.mask.label));
1800 		/* Remove unwanted bits from values.  */
1801 		mpls.val.label &= mpls.mask.label;
1802 	}
1803 	if (size <= flow_size) {
1804 		mlx5_flow_spec_verbs_add(flow, &mpls, size);
1805 		flow->cur_verbs->attr->priority = MLX5_PRIORITY_MAP_L2;
1806 	}
1807 	flow->layers |= MLX5_FLOW_LAYER_MPLS;
1808 	return size;
1809 #endif /* !HAVE_IBV_DEVICE_MPLS_SUPPORT */
1810 	return rte_flow_error_set(error, ENOTSUP,
1811 				  RTE_FLOW_ERROR_TYPE_ITEM,
1812 				  item,
1813 				  "MPLS is not supported by Verbs, please"
1814 				  " update.");
1815 }
1816 
1817 /**
 * Convert the @p pattern into Verbs specifications after ensuring the NIC
1819  * will understand and process it correctly.
1820  * The conversion is performed item per item, each of them is written into
1821  * the @p flow if its size is lesser or equal to @p flow_size.
1822  * Validation and memory consumption computation are still performed until the
1823  * end of @p pattern, unless an error is encountered.
1824  *
 * @param[in] dev
 *   Pointer to Ethernet device structure.
 * @param[in] pattern
 *   Flow pattern.
1827  * @param[in, out] flow
1828  *   Pointer to the rte_flow structure.
1829  * @param[in] flow_size
1830  *   Size in bytes of the available space in @p flow, if too small some
1831  *   garbage may be present.
1832  * @param[out] error
1833  *   Pointer to error structure.
1834  *
1835  * @return
1836  *   On success the number of bytes consumed/necessary, if the returned value
 *   is lesser or equal to @p flow_size, the @p pattern has fully been
1838  *   converted, otherwise another call with this returned memory size should
1839  *   be done.
1840  *   On error, a negative errno value is returned and rte_errno is set.
1841  */
1842 static int
1843 mlx5_flow_items(struct rte_eth_dev *dev,
1844 		const struct rte_flow_item pattern[],
1845 		struct rte_flow *flow, const size_t flow_size,
1846 		struct rte_flow_error *error)
1847 {
1848 	int remain = flow_size;
1849 	size_t size = 0;
1850 
1851 	for (; pattern->type != RTE_FLOW_ITEM_TYPE_END; pattern++) {
1852 		int ret = 0;
1853 
1854 		switch (pattern->type) {
1855 		case RTE_FLOW_ITEM_TYPE_VOID:
1856 			break;
1857 		case RTE_FLOW_ITEM_TYPE_ETH:
1858 			ret = mlx5_flow_item_eth(pattern, flow, remain, error);
1859 			break;
1860 		case RTE_FLOW_ITEM_TYPE_VLAN:
1861 			ret = mlx5_flow_item_vlan(pattern, flow, remain, error);
1862 			break;
1863 		case RTE_FLOW_ITEM_TYPE_IPV4:
1864 			ret = mlx5_flow_item_ipv4(pattern, flow, remain, error);
1865 			break;
1866 		case RTE_FLOW_ITEM_TYPE_IPV6:
1867 			ret = mlx5_flow_item_ipv6(pattern, flow, remain, error);
1868 			break;
1869 		case RTE_FLOW_ITEM_TYPE_UDP:
1870 			ret = mlx5_flow_item_udp(pattern, flow, remain, error);
1871 			break;
1872 		case RTE_FLOW_ITEM_TYPE_TCP:
1873 			ret = mlx5_flow_item_tcp(pattern, flow, remain, error);
1874 			break;
1875 		case RTE_FLOW_ITEM_TYPE_VXLAN:
1876 			ret = mlx5_flow_item_vxlan(pattern, flow, remain,
1877 						   error);
1878 			break;
1879 		case RTE_FLOW_ITEM_TYPE_VXLAN_GPE:
1880 			ret = mlx5_flow_item_vxlan_gpe(dev, pattern, flow,
1881 						       remain, error);
1882 			break;
1883 		case RTE_FLOW_ITEM_TYPE_GRE:
1884 			ret = mlx5_flow_item_gre(pattern, flow, remain, error);
1885 			break;
1886 		case RTE_FLOW_ITEM_TYPE_MPLS:
1887 			ret = mlx5_flow_item_mpls(pattern, flow, remain, error);
1888 			break;
1889 		default:
1890 			return rte_flow_error_set(error, ENOTSUP,
1891 						  RTE_FLOW_ERROR_TYPE_ITEM,
1892 						  pattern,
1893 						  "item not supported");
1894 		}
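		/*
		 * On success ret is the size consumed by the converted item;
		 * shrink the remaining room accordingly (never below zero)
		 * while size keeps accumulating so the caller still learns
		 * the total memory required even when the buffer is too
		 * small.
		 */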
1895 		if (ret < 0)
1896 			return ret;
1897 		if (remain > ret)
1898 			remain -= ret;
1899 		else
1900 			remain = 0;
1901 		size += ret;
1902 	}
1903 	if (!flow->layers) {
1904 		const struct rte_flow_item item = {
1905 			.type = RTE_FLOW_ITEM_TYPE_ETH,
1906 		};
1907 
1908 		return mlx5_flow_item_eth(&item, flow, flow_size, error);
1909 	}
1910 	return size;
1911 }
1912 
1913 /**
1914  * Convert the @p action into a Verbs specification after ensuring the NIC
1915  * will understand and process it correctly.
1916  * If the necessary size for the conversion is greater than the @p flow_size,
1917  * nothing is written in @p flow, the validation is still performed.
1918  *
1919  * @param[in] action
1920  *   Action configuration.
1921  * @param[in, out] flow
1922  *   Pointer to flow structure.
1923  * @param[in] flow_size
1924  *   Size in bytes of the available space in @p flow, if too small, nothing is
1925  *   written.
1926  * @param[out] error
1927  *   Pointer to error structure.
1928  *
1929  * @return
1930  *   On success the number of bytes consumed/necessary, if the returned value
1931  *   is lesser or equal to @p flow_size, the @p action has fully been
1932  *   converted, otherwise another call with this returned memory size should
1933  *   be done.
1934  *   On error, a negative errno value is returned and rte_errno is set.
1935  */
1936 static int
1937 mlx5_flow_action_drop(const struct rte_flow_action *action,
1938 		      struct rte_flow *flow, const size_t flow_size,
1939 		      struct rte_flow_error *error)
1940 {
1941 	unsigned int size = sizeof(struct ibv_flow_spec_action_drop);
1942 	struct ibv_flow_spec_action_drop drop = {
1943 			.type = IBV_FLOW_SPEC_ACTION_DROP,
1944 			.size = size,
1945 	};
1946 
1947 	if (flow->fate)
1948 		return rte_flow_error_set(error, ENOTSUP,
1949 					  RTE_FLOW_ERROR_TYPE_ACTION,
1950 					  action,
1951 					  "multiple fate actions are not"
1952 					  " supported");
1953 	if (flow->modifier & (MLX5_FLOW_MOD_FLAG | MLX5_FLOW_MOD_MARK))
1954 		return rte_flow_error_set(error, ENOTSUP,
1955 					  RTE_FLOW_ERROR_TYPE_ACTION,
1956 					  action,
1957 					  "drop is not compatible with"
1958 					  " flag/mark action");
	if (size <= flow_size)
1960 		mlx5_flow_spec_verbs_add(flow, &drop, size);
1961 	flow->fate |= MLX5_FLOW_FATE_DROP;
1962 	return size;
1963 }
1964 
1965 /**
1966  * Convert the @p action into @p flow after ensuring the NIC will understand
1967  * and process it correctly.
1968  *
1969  * @param[in] dev
1970  *   Pointer to Ethernet device structure.
1971  * @param[in] action
1972  *   Action configuration.
1973  * @param[in, out] flow
1974  *   Pointer to flow structure.
1975  * @param[out] error
1976  *   Pointer to error structure.
1977  *
1978  * @return
1979  *   0 on success, a negative errno value otherwise and rte_errno is set.
1980  */
1981 static int
1982 mlx5_flow_action_queue(struct rte_eth_dev *dev,
1983 		       const struct rte_flow_action *action,
1984 		       struct rte_flow *flow,
1985 		       struct rte_flow_error *error)
1986 {
1987 	struct priv *priv = dev->data->dev_private;
1988 	const struct rte_flow_action_queue *queue = action->conf;
1989 
1990 	if (flow->fate)
1991 		return rte_flow_error_set(error, ENOTSUP,
1992 					  RTE_FLOW_ERROR_TYPE_ACTION,
1993 					  action,
1994 					  "multiple fate actions are not"
1995 					  " supported");
1996 	if (queue->index >= priv->rxqs_n)
1997 		return rte_flow_error_set(error, EINVAL,
1998 					  RTE_FLOW_ERROR_TYPE_ACTION_CONF,
1999 					  &queue->index,
2000 					  "queue index out of range");
2001 	if (!(*priv->rxqs)[queue->index])
2002 		return rte_flow_error_set(error, EINVAL,
2003 					  RTE_FLOW_ERROR_TYPE_ACTION_CONF,
2004 					  &queue->index,
2005 					  "queue is not configured");
2006 	if (flow->queue)
2007 		(*flow->queue)[0] = queue->index;
2008 	flow->rss.queue_num = 1;
2009 	flow->fate |= MLX5_FLOW_FATE_QUEUE;
2010 	return 0;
2011 }
2012 
2013 /**
 * Ensure the @p action will be understood and used correctly by the NIC.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param[in] action
 *   Action configuration.
 * @param[in, out] flow
 *   Pointer to the rte_flow structure.
 * @param[out] error
 *   Pointer to error structure.
 *
 * @return
 *   0 on success, the @p flow->queue array and @p flow->rss are filled and
 *   valid.
2027  *   On error, a negative errno value is returned and rte_errno is set.
2028  */
2029 static int
2030 mlx5_flow_action_rss(struct rte_eth_dev *dev,
2031 		     const struct rte_flow_action *action,
2032 		     struct rte_flow *flow,
2033 		     struct rte_flow_error *error)
2034 {
2035 	struct priv *priv = dev->data->dev_private;
2036 	const struct rte_flow_action_rss *rss = action->conf;
2037 	unsigned int i;
2038 
2039 	if (flow->fate)
2040 		return rte_flow_error_set(error, ENOTSUP,
2041 					  RTE_FLOW_ERROR_TYPE_ACTION,
2042 					  action,
2043 					  "multiple fate actions are not"
2044 					  " supported");
2045 	if (rss->func != RTE_ETH_HASH_FUNCTION_DEFAULT &&
2046 	    rss->func != RTE_ETH_HASH_FUNCTION_TOEPLITZ)
2047 		return rte_flow_error_set(error, ENOTSUP,
2048 					  RTE_FLOW_ERROR_TYPE_ACTION_CONF,
2049 					  &rss->func,
2050 					  "RSS hash function not supported");
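	/*
	 * rte_flow RSS level semantics: 0 requests the default behavior, 1
	 * restricts hashing to the outermost encapsulation and 2 requests
	 * inner (tunneled) RSS, which is only accepted when Verbs tunnel
	 * support is compiled in.
	 */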
2051 #ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT
2052 	if (rss->level > 2)
2053 #else
2054 	if (rss->level > 1)
2055 #endif
2056 		return rte_flow_error_set(error, ENOTSUP,
2057 					  RTE_FLOW_ERROR_TYPE_ACTION_CONF,
2058 					  &rss->level,
2059 					  "tunnel RSS is not supported");
2060 	if (rss->key_len < MLX5_RSS_HASH_KEY_LEN)
2061 		return rte_flow_error_set(error, ENOTSUP,
2062 					  RTE_FLOW_ERROR_TYPE_ACTION_CONF,
2063 					  &rss->key_len,
2064 					  "RSS hash key too small");
2065 	if (rss->key_len > MLX5_RSS_HASH_KEY_LEN)
2066 		return rte_flow_error_set(error, ENOTSUP,
2067 					  RTE_FLOW_ERROR_TYPE_ACTION_CONF,
2068 					  &rss->key_len,
2069 					  "RSS hash key too large");
2070 	if (!rss->queue_num)
2071 		return rte_flow_error_set(error, ENOTSUP,
2072 					  RTE_FLOW_ERROR_TYPE_ACTION_CONF,
2073 					  rss,
2074 					  "no queues were provided for RSS");
2075 	if (rss->queue_num > priv->config.ind_table_max_size)
2076 		return rte_flow_error_set(error, ENOTSUP,
2077 					  RTE_FLOW_ERROR_TYPE_ACTION_CONF,
2078 					  &rss->queue_num,
2079 					  "number of queues too large");
2080 	if (rss->types & MLX5_RSS_HF_MASK)
2081 		return rte_flow_error_set(error, ENOTSUP,
2082 					  RTE_FLOW_ERROR_TYPE_ACTION_CONF,
2083 					  &rss->types,
2084 					  "some RSS protocols are not"
2085 					  " supported");
2086 	for (i = 0; i != rss->queue_num; ++i) {
2087 		if (rss->queue[i] >= priv->rxqs_n)
2088 			return rte_flow_error_set
2089 				(error, EINVAL,
2090 				 RTE_FLOW_ERROR_TYPE_ACTION_CONF,
2091 				 rss,
2092 				 "queue index out of range");
2093 		if (!(*priv->rxqs)[rss->queue[i]])
2094 			return rte_flow_error_set
2095 				(error, EINVAL,
2096 				 RTE_FLOW_ERROR_TYPE_ACTION_CONF,
2097 				 &rss->queue[i],
2098 				 "queue is not configured");
2099 	}
2100 	if (flow->queue)
2101 		memcpy((*flow->queue), rss->queue,
2102 		       rss->queue_num * sizeof(uint16_t));
2103 	flow->rss.queue_num = rss->queue_num;
2104 	memcpy(flow->key, rss->key, MLX5_RSS_HASH_KEY_LEN);
2105 	flow->rss.types = rss->types;
2106 	flow->rss.level = rss->level;
2107 	flow->fate |= MLX5_FLOW_FATE_RSS;
2108 	return 0;
2109 }
2110 
2111 /**
2112  * Convert the @p action into a Verbs specification after ensuring the NIC
2113  * will understand and process it correctly.
2114  * If the necessary size for the conversion is greater than the @p flow_size,
2115  * nothing is written in @p flow, the validation is still performed.
2116  *
2117  * @param[in] action
2118  *   Action configuration.
2119  * @param[in, out] flow
2120  *   Pointer to flow structure.
2121  * @param[in] flow_size
2122  *   Size in bytes of the available space in @p flow, if too small, nothing is
2123  *   written.
2124  * @param[out] error
2125  *   Pointer to error structure.
2126  *
2127  * @return
2128  *   On success the number of bytes consumed/necessary, if the returned value
2129  *   is lesser or equal to @p flow_size, the @p action has fully been
2130  *   converted, otherwise another call with this returned memory size should
2131  *   be done.
2132  *   On error, a negative errno value is returned and rte_errno is set.
2133  */
2134 static int
2135 mlx5_flow_action_flag(const struct rte_flow_action *action,
2136 		      struct rte_flow *flow, const size_t flow_size,
2137 		      struct rte_flow_error *error)
2138 {
2139 	unsigned int size = sizeof(struct ibv_flow_spec_action_tag);
2140 	struct ibv_flow_spec_action_tag tag = {
2141 		.type = IBV_FLOW_SPEC_ACTION_TAG,
2142 		.size = size,
2143 		.tag_id = mlx5_flow_mark_set(MLX5_FLOW_MARK_DEFAULT),
2144 	};
2145 	struct mlx5_flow_verbs *verbs = flow->cur_verbs;
2146 
2147 	if (flow->modifier & MLX5_FLOW_MOD_FLAG)
2148 		return rte_flow_error_set(error, ENOTSUP,
2149 					  RTE_FLOW_ERROR_TYPE_ACTION,
2150 					  action,
2151 					  "flag action already present");
2152 	if (flow->fate & MLX5_FLOW_FATE_DROP)
2153 		return rte_flow_error_set(error, ENOTSUP,
2154 					  RTE_FLOW_ERROR_TYPE_ACTION,
2155 					  action,
2156 					  "flag is not compatible with drop"
2157 					  " action");
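	/*
	 * A MARK action has already added a tag specification carrying its
	 * own identifier, only account for the FLAG modifier without adding
	 * a duplicate specification.
	 */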
2158 	if (flow->modifier & MLX5_FLOW_MOD_MARK)
2159 		size = 0;
2160 	else if (size <= flow_size && verbs)
2161 		mlx5_flow_spec_verbs_add(flow, &tag, size);
2162 	flow->modifier |= MLX5_FLOW_MOD_FLAG;
2163 	return size;
2164 }
2165 
2166 /**
2167  * Update verbs specification to modify the flag to mark.
2168  *
2169  * @param[in, out] verbs
2170  *   Pointer to the mlx5_flow_verbs structure.
2171  * @param[in] mark_id
2172  *   Mark identifier to replace the flag.
2173  */
2174 static void
2175 mlx5_flow_verbs_mark_update(struct mlx5_flow_verbs *verbs, uint32_t mark_id)
2176 {
2177 	struct ibv_spec_header *hdr;
2178 	int i;
2179 
2180 	if (!verbs)
2181 		return;
2182 	/* Update Verbs specification. */
2183 	hdr = (struct ibv_spec_header *)verbs->specs;
2184 	if (!hdr)
2185 		return;
2186 	for (i = 0; i != verbs->attr->num_of_specs; ++i) {
2187 		if (hdr->type == IBV_FLOW_SPEC_ACTION_TAG) {
2188 			struct ibv_flow_spec_action_tag *t =
2189 				(struct ibv_flow_spec_action_tag *)hdr;
2190 
2191 			t->tag_id = mlx5_flow_mark_set(mark_id);
2192 		}
2193 		hdr = (struct ibv_spec_header *)((uintptr_t)hdr + hdr->size);
2194 	}
2195 }
2196 
2197 /**
 * Convert the @p action into @p flow (or update the already present
2199  * Flag Verbs specification) after ensuring the NIC will understand and
2200  * process it correctly.
2201  * If the necessary size for the conversion is greater than the @p flow_size,
2202  * nothing is written in @p flow, the validation is still performed.
2203  *
2204  * @param[in] action
2205  *   Action configuration.
2206  * @param[in, out] flow
2207  *   Pointer to flow structure.
2208  * @param[in] flow_size
2209  *   Size in bytes of the available space in @p flow, if too small, nothing is
2210  *   written.
2211  * @param[out] error
2212  *   Pointer to error structure.
2213  *
2214  * @return
2215  *   On success the number of bytes consumed/necessary, if the returned value
2216  *   is lesser or equal to @p flow_size, the @p action has fully been
2217  *   converted, otherwise another call with this returned memory size should
2218  *   be done.
2219  *   On error, a negative errno value is returned and rte_errno is set.
2220  */
2221 static int
2222 mlx5_flow_action_mark(const struct rte_flow_action *action,
2223 		      struct rte_flow *flow, const size_t flow_size,
2224 		      struct rte_flow_error *error)
2225 {
2226 	const struct rte_flow_action_mark *mark = action->conf;
2227 	unsigned int size = sizeof(struct ibv_flow_spec_action_tag);
2228 	struct ibv_flow_spec_action_tag tag = {
2229 		.type = IBV_FLOW_SPEC_ACTION_TAG,
2230 		.size = size,
2231 	};
2232 	struct mlx5_flow_verbs *verbs = flow->cur_verbs;
2233 
2234 	if (!mark)
2235 		return rte_flow_error_set(error, EINVAL,
2236 					  RTE_FLOW_ERROR_TYPE_ACTION,
2237 					  action,
2238 					  "configuration cannot be null");
2239 	if (mark->id >= MLX5_FLOW_MARK_MAX)
2240 		return rte_flow_error_set(error, EINVAL,
2241 					  RTE_FLOW_ERROR_TYPE_ACTION_CONF,
2242 					  &mark->id,
					  "mark id must be in 0 <= id < "
2244 					  RTE_STR(MLX5_FLOW_MARK_MAX));
2245 	if (flow->modifier & MLX5_FLOW_MOD_MARK)
2246 		return rte_flow_error_set(error, ENOTSUP,
2247 					  RTE_FLOW_ERROR_TYPE_ACTION,
2248 					  action,
2249 					  "mark action already present");
2250 	if (flow->fate & MLX5_FLOW_FATE_DROP)
2251 		return rte_flow_error_set(error, ENOTSUP,
2252 					  RTE_FLOW_ERROR_TYPE_ACTION,
2253 					  action,
2254 					  "mark is not compatible with drop"
2255 					  " action");
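	/*
	 * A FLAG action has already added a tag specification with the
	 * default identifier, rewrite its identifier in place instead of
	 * adding a second one.
	 */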
2256 	if (flow->modifier & MLX5_FLOW_MOD_FLAG) {
2257 		mlx5_flow_verbs_mark_update(verbs, mark->id);
2258 		size = 0;
2259 	} else if (size <= flow_size) {
2260 		tag.tag_id = mlx5_flow_mark_set(mark->id);
2261 		mlx5_flow_spec_verbs_add(flow, &tag, size);
2262 	}
2263 	flow->modifier |= MLX5_FLOW_MOD_MARK;
2264 	return size;
2265 }
2266 
2267 /**
2268  * Convert the @p action into a Verbs specification after ensuring the NIC
2269  * will understand and process it correctly.
2270  * If the necessary size for the conversion is greater than the @p flow_size,
2271  * nothing is written in @p flow, the validation is still performed.
2272  *
 * @param[in] dev
 *   Pointer to Ethernet device structure.
 * @param[in] action
 *   Action configuration.
 * @param[in, out] flow
 *   Pointer to flow structure.
 * @param[in] flow_size
 *   Size in bytes of the available space in @p flow, if too small, nothing is
 *   written.
 * @param[out] error
2281  *   Pointer to error structure.
2282  *
2283  * @return
2284  *   On success the number of bytes consumed/necessary, if the returned value
2285  *   is lesser or equal to @p flow_size, the @p action has fully been
2286  *   converted, otherwise another call with this returned memory size should
2287  *   be done.
2288  *   On error, a negative errno value is returned and rte_errno is set.
2289  */
2290 static int
2291 mlx5_flow_action_count(struct rte_eth_dev *dev,
2292 		       const struct rte_flow_action *action,
2293 		       struct rte_flow *flow,
2294 		       const size_t flow_size __rte_unused,
2295 		       struct rte_flow_error *error)
2296 {
2297 	const struct rte_flow_action_count *count = action->conf;
2298 #ifdef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT
2299 	unsigned int size = sizeof(struct ibv_flow_spec_counter_action);
2300 	struct ibv_flow_spec_counter_action counter = {
2301 		.type = IBV_FLOW_SPEC_ACTION_COUNT,
2302 		.size = size,
2303 	};
2304 #endif
2305 
2306 	if (!flow->counter) {
2307 		flow->counter = mlx5_flow_counter_new(dev, count->shared,
2308 						      count->id);
2309 		if (!flow->counter)
2310 			return rte_flow_error_set(error, ENOTSUP,
2311 						  RTE_FLOW_ERROR_TYPE_ACTION,
2312 						  action,
2313 						  "cannot get counter"
2314 						  " context.");
2315 	}
2316 	if (!((struct priv *)dev->data->dev_private)->config.flow_counter_en)
2317 		return rte_flow_error_set(error, ENOTSUP,
2318 					  RTE_FLOW_ERROR_TYPE_ACTION,
2319 					  action,
2320 					  "flow counters are not supported.");
2321 	flow->modifier |= MLX5_FLOW_MOD_COUNT;
2322 #ifdef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT
2323 	counter.counter_set_handle = flow->counter->cs->handle;
2324 	if (size <= flow_size)
2325 		mlx5_flow_spec_verbs_add(flow, &counter, size);
2326 	return size;
2327 #endif
2328 	return 0;
2329 }
2330 
2331 /**
 * Convert the @p actions into @p flow after ensuring the NIC will understand
2333  * and process it correctly.
2334  * The conversion is performed action per action, each of them is written into
2335  * the @p flow if its size is lesser or equal to @p flow_size.
2336  * Validation and memory consumption computation are still performed until the
 * end of @p actions, unless an error is encountered.
2338  *
2339  * @param[in] dev
2340  *   Pointer to Ethernet device structure.
2341  * @param[in] actions
2342  *   Pointer to flow actions array.
2343  * @param[in, out] flow
2344  *   Pointer to the rte_flow structure.
2345  * @param[in] flow_size
2346  *   Size in bytes of the available space in @p flow, if too small some
2347  *   garbage may be present.
2348  * @param[out] error
2349  *   Pointer to error structure.
2350  *
2351  * @return
2352  *   On success the number of bytes consumed/necessary, if the returned value
 *   is lesser or equal to @p flow_size, the @p actions have fully been
2354  *   converted, otherwise another call with this returned memory size should
2355  *   be done.
2356  *   On error, a negative errno value is returned and rte_errno is set.
2357  */
2358 static int
2359 mlx5_flow_actions(struct rte_eth_dev *dev,
2360 		  const struct rte_flow_action actions[],
2361 		  struct rte_flow *flow, const size_t flow_size,
2362 		  struct rte_flow_error *error)
2363 {
2364 	size_t size = 0;
2365 	int remain = flow_size;
2366 	int ret = 0;
2367 
2368 	for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
2369 		switch (actions->type) {
2370 		case RTE_FLOW_ACTION_TYPE_VOID:
2371 			break;
2372 		case RTE_FLOW_ACTION_TYPE_FLAG:
2373 			ret = mlx5_flow_action_flag(actions, flow, remain,
2374 						    error);
2375 			break;
2376 		case RTE_FLOW_ACTION_TYPE_MARK:
2377 			ret = mlx5_flow_action_mark(actions, flow, remain,
2378 						    error);
2379 			break;
2380 		case RTE_FLOW_ACTION_TYPE_DROP:
2381 			ret = mlx5_flow_action_drop(actions, flow, remain,
2382 						    error);
2383 			break;
2384 		case RTE_FLOW_ACTION_TYPE_QUEUE:
2385 			ret = mlx5_flow_action_queue(dev, actions, flow, error);
2386 			break;
2387 		case RTE_FLOW_ACTION_TYPE_RSS:
2388 			ret = mlx5_flow_action_rss(dev, actions, flow, error);
2389 			break;
2390 		case RTE_FLOW_ACTION_TYPE_COUNT:
2391 			ret = mlx5_flow_action_count(dev, actions, flow, remain,
2392 						     error);
2393 			break;
2394 		default:
2395 			return rte_flow_error_set(error, ENOTSUP,
2396 						  RTE_FLOW_ERROR_TYPE_ACTION,
2397 						  actions,
2398 						  "action not supported");
2399 		}
2400 		if (ret < 0)
2401 			return ret;
2402 		if (remain > ret)
2403 			remain -= ret;
2404 		else
2405 			remain = 0;
2406 		size += ret;
2407 	}
2408 	if (!flow->fate)
2409 		return rte_flow_error_set(error, ENOTSUP,
2410 					  RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
2411 					  NULL,
2412 					  "no fate action found");
2413 	return size;
2414 }
2415 
2416 /**
2417  * Validate flow rule and fill flow structure accordingly.
2418  *
2419  * @param dev
2420  *   Pointer to Ethernet device.
2421  * @param[out] flow
2422  *   Pointer to flow structure.
2423  * @param flow_size
2424  *   Size of allocated space for @p flow.
2425  * @param[in] attr
2426  *   Flow rule attributes.
2427  * @param[in] pattern
2428  *   Pattern specification (list terminated by the END pattern item).
2429  * @param[in] actions
2430  *   Associated actions (list terminated by the END action).
2431  * @param[out] error
2432  *   Perform verbose error reporting if not NULL.
2433  *
2434  * @return
2435  *   A positive value representing the size of the flow object in bytes
2436  *   regardless of @p flow_size on success, a negative errno value otherwise
2437  *   and rte_errno is set.
2438  */
2439 static int
2440 mlx5_flow_merge_switch(struct rte_eth_dev *dev,
2441 		       struct rte_flow *flow,
2442 		       size_t flow_size,
2443 		       const struct rte_flow_attr *attr,
2444 		       const struct rte_flow_item pattern[],
2445 		       const struct rte_flow_action actions[],
2446 		       struct rte_flow_error *error)
2447 {
2448 	unsigned int n = mlx5_dev_to_port_id(dev->device, NULL, 0);
2449 	uint16_t port_id[!n + n];
2450 	struct mlx5_nl_flow_ptoi ptoi[!n + n + 1];
2451 	size_t off = RTE_ALIGN_CEIL(sizeof(*flow), alignof(max_align_t));
2452 	unsigned int i;
2453 	unsigned int own = 0;
2454 	int ret;
2455 
2456 	/* At least one port is needed when no switch domain is present. */
2457 	if (!n) {
2458 		n = 1;
2459 		port_id[0] = dev->data->port_id;
2460 	} else {
2461 		n = RTE_MIN(mlx5_dev_to_port_id(dev->device, port_id, n), n);
2462 	}
2463 	for (i = 0; i != n; ++i) {
2464 		struct rte_eth_dev_info dev_info;
2465 
2466 		rte_eth_dev_info_get(port_id[i], &dev_info);
2467 		if (port_id[i] == dev->data->port_id)
2468 			own = i;
2469 		ptoi[i].port_id = port_id[i];
2470 		ptoi[i].ifindex = dev_info.if_index;
2471 	}
2472 	/* Ensure first entry of ptoi[] is the current device. */
2473 	if (own) {
2474 		ptoi[n] = ptoi[0];
2475 		ptoi[0] = ptoi[own];
2476 		ptoi[own] = ptoi[n];
2477 	}
2478 	/* An entry with zero ifindex terminates ptoi[]. */
2479 	ptoi[n].port_id = 0;
2480 	ptoi[n].ifindex = 0;
2481 	if (flow_size < off)
2482 		flow_size = 0;
2483 	ret = mlx5_nl_flow_transpose((uint8_t *)flow + off,
2484 				     flow_size ? flow_size - off : 0,
2485 				     ptoi, attr, pattern, actions, error);
2486 	if (ret < 0)
2487 		return ret;
2488 	if (flow_size) {
2489 		*flow = (struct rte_flow){
2490 			.attributes = *attr,
2491 			.nl_flow = (uint8_t *)flow + off,
2492 		};
2493 		/*
2494 		 * Generate a reasonably unique handle based on the address
2495 		 * of the target buffer.
2496 		 *
2497 		 * This is straightforward on 32-bit systems where the flow
2498 		 * pointer can be used directly. Otherwise, its least
2499 		 * significant part is taken after shifting it by the
2500 		 * previous power of two of the pointed buffer size.
2501 		 */
2502 		if (sizeof(flow) <= 4)
2503 			mlx5_nl_flow_brand(flow->nl_flow, (uintptr_t)flow);
2504 		else
2505 			mlx5_nl_flow_brand
2506 				(flow->nl_flow,
2507 				 (uintptr_t)flow >>
2508 				 rte_log2_u32(rte_align32prevpow2(flow_size)));
2509 	}
2510 	return off + ret;
2511 }
2512 
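/**
 * Find the RSS expansion graph root to start from for a given pattern.
 *
 * @param[in] pattern
 *   Flow pattern (list terminated by the END pattern item).
 * @param[in] rss_level
 *   RSS level requested by the RSS action.
 *
 * @return
 *   The MLX5_EXPANSION_* root to use, selected according to the presence of
 *   a VLAN item in @p pattern and to the requested RSS level (level >= 2
 *   picks the *_OUTER variants).
 */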
2513 static unsigned int
2514 mlx5_find_graph_root(const struct rte_flow_item pattern[], uint32_t rss_level)
2515 {
2516 	const struct rte_flow_item *item;
2517 	unsigned int has_vlan = 0;
2518 
2519 	for (item = pattern; item->type != RTE_FLOW_ITEM_TYPE_END; item++) {
2520 		if (item->type == RTE_FLOW_ITEM_TYPE_VLAN) {
2521 			has_vlan = 1;
2522 			break;
2523 		}
2524 	}
2525 	if (has_vlan)
2526 		return rss_level < 2 ? MLX5_EXPANSION_ROOT_ETH_VLAN :
2527 				       MLX5_EXPANSION_ROOT_OUTER_ETH_VLAN;
2528 	return rss_level < 2 ? MLX5_EXPANSION_ROOT :
2529 			       MLX5_EXPANSION_ROOT_OUTER;
2530 }
2531 
2532 /**
 * Convert the @p attributes, @p pattern and @p actions into a flow for the NIC
2534  * after ensuring the NIC will understand and process it correctly.
 * The conversion is performed item per item and action per action, each of
2536  * them is written into the @p flow if its size is lesser or equal to @p
2537  * flow_size.
2538  * Validation and memory consumption computation are still performed until the
2539  * end, unless an error is encountered.
2540  *
2541  * @param[in] dev
2542  *   Pointer to Ethernet device.
2543  * @param[in, out] flow
2544  *   Pointer to flow structure.
2545  * @param[in] flow_size
2546  *   Size in bytes of the available space in @p flow, if too small some
2547  *   garbage may be present.
2548  * @param[in] attributes
2549  *   Flow rule attributes.
2550  * @param[in] pattern
2551  *   Pattern specification (list terminated by the END pattern item).
2552  * @param[in] actions
2553  *   Associated actions (list terminated by the END action).
2554  * @param[out] error
2555  *   Perform verbose error reporting if not NULL.
2556  *
2557  * @return
2558  *   On success the number of bytes consumed/necessary, if the returned value
2559  *   is lesser or equal to @p flow_size, the flow has fully been converted and
2560  *   can be applied, otherwise another call with this returned memory size
2561  *   should be done.
2562  *   On error, a negative errno value is returned and rte_errno is set.
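 *
 *   A minimal sketch of the expected two-pass usage, mirroring
 *   mlx5_flow_list_create() below (error handling omitted):
 *
 *   @code
 *   int size = mlx5_flow_merge(dev, NULL, 0, attr, items, actions, error);
 *   struct rte_flow *flow = rte_calloc(__func__, 1, size, 0);
 *
 *   size = mlx5_flow_merge(dev, flow, size, attr, items, actions, error);
 *   @endcode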
2563  */
2564 static int
2565 mlx5_flow_merge(struct rte_eth_dev *dev, struct rte_flow *flow,
2566 		const size_t flow_size,
2567 		const struct rte_flow_attr *attributes,
2568 		const struct rte_flow_item pattern[],
2569 		const struct rte_flow_action actions[],
2570 		struct rte_flow_error *error)
2571 {
2572 	struct rte_flow local_flow = { .layers = 0, };
2573 	size_t size = sizeof(*flow);
2574 	union {
2575 		struct rte_flow_expand_rss buf;
2576 		uint8_t buffer[2048];
2577 	} expand_buffer;
2578 	struct rte_flow_expand_rss *buf = &expand_buffer.buf;
2579 	struct mlx5_flow_verbs *original_verbs = NULL;
2580 	size_t original_verbs_size = 0;
2581 	uint32_t original_layers = 0;
2582 	int expanded_pattern_idx = 0;
2583 	int ret;
2584 	uint32_t i;
2585 
2586 	if (attributes->transfer)
2587 		return mlx5_flow_merge_switch(dev, flow, flow_size,
2588 					      attributes, pattern,
2589 					      actions, error);
2590 	if (size > flow_size)
2591 		flow = &local_flow;
2592 	ret = mlx5_flow_attributes(dev, attributes, flow, error);
2593 	if (ret < 0)
2594 		return ret;
2595 	ret = mlx5_flow_actions(dev, actions, &local_flow, 0, error);
2596 	if (ret < 0)
2597 		return ret;
2598 	if (local_flow.rss.types) {
2599 		unsigned int graph_root;
2600 
2601 		graph_root = mlx5_find_graph_root(pattern,
2602 						  local_flow.rss.level);
2603 		ret = rte_flow_expand_rss(buf, sizeof(expand_buffer.buffer),
2604 					  pattern, local_flow.rss.types,
2605 					  mlx5_support_expansion,
2606 					  graph_root);
2607 		assert(ret > 0 &&
2608 		       (unsigned int)ret < sizeof(expand_buffer.buffer));
2609 	} else {
2610 		buf->entries = 1;
2611 		buf->entry[0].pattern = (void *)(uintptr_t)pattern;
2612 	}
2613 	size += RTE_ALIGN_CEIL(local_flow.rss.queue_num * sizeof(uint16_t),
2614 			       sizeof(void *));
2615 	if (size <= flow_size)
2616 		flow->queue = (void *)(flow + 1);
2617 	LIST_INIT(&flow->verbs);
2618 	flow->layers = 0;
2619 	flow->modifier = 0;
2620 	flow->fate = 0;
2621 	for (i = 0; i != buf->entries; ++i) {
2622 		size_t off = size;
2623 		size_t off2;
2624 
2625 		flow->layers = original_layers;
2626 		size += sizeof(struct ibv_flow_attr) +
2627 			sizeof(struct mlx5_flow_verbs);
2628 		off2 = size;
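		/*
		 * off is the byte offset of this entry's mlx5_flow_verbs
		 * inside flow while off2 marks the end of its attribute and
		 * verbs headers, so that original_verbs_size below accounts
		 * for the Verbs specifications only.
		 */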
2629 		if (size < flow_size) {
2630 			flow->cur_verbs = (void *)((uintptr_t)flow + off);
2631 			flow->cur_verbs->attr = (void *)(flow->cur_verbs + 1);
2632 			flow->cur_verbs->specs =
2633 				(void *)(flow->cur_verbs->attr + 1);
2634 		}
2635 		/* First iteration convert the pattern into Verbs. */
2636 		if (i == 0) {
			/* Actions don't need to be converted several times. */
2638 			ret = mlx5_flow_actions(dev, actions, flow,
2639 						(size < flow_size) ?
2640 						flow_size - size : 0,
2641 						error);
2642 			if (ret < 0)
2643 				return ret;
2644 			size += ret;
2645 		} else {
2646 			/*
2647 			 * Next iteration means the pattern has already been
2648 			 * converted and an expansion is necessary to match
			 * the user RSS request.  For that, only the expanded
			 * items will be converted; the common part shared with
			 * the user pattern is just copied into the next buffer
			 * zone.
2653 			 */
2654 			size += original_verbs_size;
2655 			if (size < flow_size) {
2656 				rte_memcpy(flow->cur_verbs->attr,
2657 					   original_verbs->attr,
2658 					   original_verbs_size +
2659 					   sizeof(struct ibv_flow_attr));
2660 				flow->cur_verbs->size = original_verbs_size;
2661 			}
2662 		}
2663 		ret = mlx5_flow_items
2664 			(dev,
2665 			 (const struct rte_flow_item *)
2666 			 &buf->entry[i].pattern[expanded_pattern_idx],
2667 			 flow,
2668 			 (size < flow_size) ? flow_size - size : 0, error);
2669 		if (ret < 0)
2670 			return ret;
2671 		size += ret;
2672 		if (size <= flow_size) {
2673 			mlx5_flow_adjust_priority(dev, flow);
2674 			LIST_INSERT_HEAD(&flow->verbs, flow->cur_verbs, next);
2675 		}
2676 		/*
2677 		 * Keep a pointer of the first verbs conversion and the layers
2678 		 * it has encountered.
2679 		 */
2680 		if (i == 0) {
2681 			original_verbs = flow->cur_verbs;
2682 			original_verbs_size = size - off2;
2683 			original_layers = flow->layers;
2684 			/*
			 * Move the index of the expanded pattern to the
2686 			 * first item not addressed yet.
2687 			 */
2688 			if (pattern->type == RTE_FLOW_ITEM_TYPE_END) {
2689 				expanded_pattern_idx++;
2690 			} else {
2691 				const struct rte_flow_item *item = pattern;
2692 
2693 				for (item = pattern;
2694 				     item->type != RTE_FLOW_ITEM_TYPE_END;
2695 				     ++item)
2696 					expanded_pattern_idx++;
2697 			}
2698 		}
2699 	}
2700 	/* Restore the origin layers in the flow. */
2701 	flow->layers = original_layers;
2702 	return size;
2703 }
2704 
2705 /**
 * Look up and set the tunnel ptype in the Rx queue data.  A single tunnel
 * ptype can be used; if tunnel rules of several different types are attached
 * to this queue, the tunnel ptype is cleared.
2709  *
2710  * @param rxq_ctrl
2711  *   Rx queue to update.
2712  */
2713 static void
2714 mlx5_flow_rxq_tunnel_ptype_update(struct mlx5_rxq_ctrl *rxq_ctrl)
2715 {
2716 	unsigned int i;
2717 	uint32_t tunnel_ptype = 0;
2718 
	/* Look up the tunnel ptype to use. */
2720 	for (i = 0; i != MLX5_FLOW_TUNNEL; ++i) {
2721 		if (!rxq_ctrl->flow_tunnels_n[i])
2722 			continue;
2723 		if (!tunnel_ptype) {
2724 			tunnel_ptype = tunnels_info[i].ptype;
2725 		} else {
2726 			tunnel_ptype = 0;
2727 			break;
2728 		}
2729 	}
2730 	rxq_ctrl->rxq.tunnel = tunnel_ptype;
2731 }
2732 
2733 /**
2734  * Set the Rx queue flags (Mark/Flag and Tunnel Ptypes) according to the flow.
2735  *
2736  * @param[in] dev
2737  *   Pointer to Ethernet device.
2738  * @param[in] flow
2739  *   Pointer to flow structure.
2740  */
2741 static void
2742 mlx5_flow_rxq_flags_set(struct rte_eth_dev *dev, struct rte_flow *flow)
2743 {
2744 	struct priv *priv = dev->data->dev_private;
2745 	const int mark = !!(flow->modifier &
2746 			    (MLX5_FLOW_MOD_FLAG | MLX5_FLOW_MOD_MARK));
2747 	const int tunnel = !!(flow->layers & MLX5_FLOW_LAYER_TUNNEL);
2748 	unsigned int i;
2749 
2750 	for (i = 0; i != flow->rss.queue_num; ++i) {
2751 		int idx = (*flow->queue)[i];
2752 		struct mlx5_rxq_ctrl *rxq_ctrl =
2753 			container_of((*priv->rxqs)[idx],
2754 				     struct mlx5_rxq_ctrl, rxq);
2755 
2756 		if (mark) {
2757 			rxq_ctrl->rxq.mark = 1;
2758 			rxq_ctrl->flow_mark_n++;
2759 		}
2760 		if (tunnel) {
2761 			unsigned int j;
2762 
2763 			/* Increase the counter matching the flow. */
2764 			for (j = 0; j != MLX5_FLOW_TUNNEL; ++j) {
2765 				if ((tunnels_info[j].tunnel & flow->layers) ==
2766 				    tunnels_info[j].tunnel) {
2767 					rxq_ctrl->flow_tunnels_n[j]++;
2768 					break;
2769 				}
2770 			}
2771 			mlx5_flow_rxq_tunnel_ptype_update(rxq_ctrl);
2772 		}
2773 	}
2774 }
2775 
2776 /**
2777  * Clear the Rx queue flags (Mark/Flag and Tunnel Ptype) associated with the
2778  * @p flow if no other flow uses it with the same kind of request.
2779  *
2780  * @param dev
2781  *   Pointer to Ethernet device.
2782  * @param[in] flow
2783  *   Pointer to the flow.
2784  */
2785 static void
2786 mlx5_flow_rxq_flags_trim(struct rte_eth_dev *dev, struct rte_flow *flow)
2787 {
2788 	struct priv *priv = dev->data->dev_private;
2789 	const int mark = !!(flow->modifier &
2790 			    (MLX5_FLOW_MOD_FLAG | MLX5_FLOW_MOD_MARK));
2791 	const int tunnel = !!(flow->layers & MLX5_FLOW_LAYER_TUNNEL);
2792 	unsigned int i;
2793 
2794 	assert(dev->data->dev_started);
2795 	for (i = 0; i != flow->rss.queue_num; ++i) {
2796 		int idx = (*flow->queue)[i];
2797 		struct mlx5_rxq_ctrl *rxq_ctrl =
2798 			container_of((*priv->rxqs)[idx],
2799 				     struct mlx5_rxq_ctrl, rxq);
2800 
2801 		if (mark) {
2802 			rxq_ctrl->flow_mark_n--;
2803 			rxq_ctrl->rxq.mark = !!rxq_ctrl->flow_mark_n;
2804 		}
2805 		if (tunnel) {
2806 			unsigned int j;
2807 
2808 			/* Decrease the counter matching the flow. */
2809 			for (j = 0; j != MLX5_FLOW_TUNNEL; ++j) {
2810 				if ((tunnels_info[j].tunnel & flow->layers) ==
2811 				    tunnels_info[j].tunnel) {
2812 					rxq_ctrl->flow_tunnels_n[j]--;
2813 					break;
2814 				}
2815 			}
2816 			mlx5_flow_rxq_tunnel_ptype_update(rxq_ctrl);
2817 		}
2818 	}
2819 }
2820 
2821 /**
2822  * Clear the Mark/Flag and Tunnel ptype information in all Rx queues.
2823  *
2824  * @param dev
2825  *   Pointer to Ethernet device.
2826  */
2827 static void
2828 mlx5_flow_rxq_flags_clear(struct rte_eth_dev *dev)
2829 {
2830 	struct priv *priv = dev->data->dev_private;
2831 	unsigned int i;
2832 
2833 	for (i = 0; i != priv->rxqs_n; ++i) {
2834 		struct mlx5_rxq_ctrl *rxq_ctrl;
2835 		unsigned int j;
2836 
2837 		if (!(*priv->rxqs)[i])
2838 			continue;
2839 		rxq_ctrl = container_of((*priv->rxqs)[i],
2840 					struct mlx5_rxq_ctrl, rxq);
2841 		rxq_ctrl->flow_mark_n = 0;
2842 		rxq_ctrl->rxq.mark = 0;
2843 		for (j = 0; j != MLX5_FLOW_TUNNEL; ++j)
2844 			rxq_ctrl->flow_tunnels_n[j] = 0;
2845 		rxq_ctrl->rxq.tunnel = 0;
2846 	}
2847 }
2848 
2849 /**
2850  * Validate a flow supported by the NIC.
2851  *
2852  * @see rte_flow_validate()
2853  * @see rte_flow_ops
2854  */
2855 int
2856 mlx5_flow_validate(struct rte_eth_dev *dev,
2857 		   const struct rte_flow_attr *attr,
2858 		   const struct rte_flow_item items[],
2859 		   const struct rte_flow_action actions[],
2860 		   struct rte_flow_error *error)
2861 {
2862 	int ret = mlx5_flow_merge(dev, NULL, 0, attr, items, actions, error);
2863 
2864 	if (ret < 0)
2865 		return ret;
2866 	return 0;
2867 }
2868 
2869 /**
2870  * Remove the flow.
2871  *
2872  * @param[in] dev
2873  *   Pointer to Ethernet device.
2874  * @param[in, out] flow
2875  *   Pointer to flow structure.
2876  */
2877 static void
2878 mlx5_flow_remove(struct rte_eth_dev *dev, struct rte_flow *flow)
2879 {
2880 	struct priv *priv = dev->data->dev_private;
2881 	struct mlx5_flow_verbs *verbs;
2882 
2883 	if (flow->nl_flow && priv->mnl_socket)
2884 		mlx5_nl_flow_destroy(priv->mnl_socket, flow->nl_flow, NULL);
2885 	LIST_FOREACH(verbs, &flow->verbs, next) {
2886 		if (verbs->flow) {
2887 			claim_zero(mlx5_glue->destroy_flow(verbs->flow));
2888 			verbs->flow = NULL;
2889 		}
2890 		if (verbs->hrxq) {
2891 			if (flow->fate & MLX5_FLOW_FATE_DROP)
2892 				mlx5_hrxq_drop_release(dev);
2893 			else
2894 				mlx5_hrxq_release(dev, verbs->hrxq);
2895 			verbs->hrxq = NULL;
2896 		}
2897 	}
2898 	if (flow->counter) {
2899 		mlx5_flow_counter_release(flow->counter);
2900 		flow->counter = NULL;
2901 	}
2902 }
2903 
2904 /**
2905  * Apply the flow.
2906  *
2907  * @param[in] dev
2908  *   Pointer to Ethernet device structure.
2909  * @param[in, out] flow
2910  *   Pointer to flow structure.
2911  * @param[out] error
2912  *   Pointer to error structure.
2913  *
2914  * @return
2915  *   0 on success, a negative errno value otherwise and rte_errno is set.
2916  */
2917 static int
2918 mlx5_flow_apply(struct rte_eth_dev *dev, struct rte_flow *flow,
2919 		struct rte_flow_error *error)
2920 {
2921 	struct priv *priv = dev->data->dev_private;
2922 	struct mlx5_flow_verbs *verbs;
2923 	int err;
2924 
2925 	LIST_FOREACH(verbs, &flow->verbs, next) {
2926 		if (flow->fate & MLX5_FLOW_FATE_DROP) {
2927 			verbs->hrxq = mlx5_hrxq_drop_new(dev);
2928 			if (!verbs->hrxq) {
2929 				rte_flow_error_set
2930 					(error, errno,
2931 					 RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
2932 					 NULL,
2933 					 "cannot get drop hash queue");
2934 				goto error;
2935 			}
2936 		} else {
2937 			struct mlx5_hrxq *hrxq;
2938 
2939 			hrxq = mlx5_hrxq_get(dev, flow->key,
2940 					     MLX5_RSS_HASH_KEY_LEN,
2941 					     verbs->hash_fields,
2942 					     (*flow->queue),
2943 					     flow->rss.queue_num);
2944 			if (!hrxq)
2945 				hrxq = mlx5_hrxq_new(dev, flow->key,
2946 						     MLX5_RSS_HASH_KEY_LEN,
2947 						     verbs->hash_fields,
2948 						     (*flow->queue),
2949 						     flow->rss.queue_num,
2950 						     !!(flow->layers &
2951 						      MLX5_FLOW_LAYER_TUNNEL));
2952 			if (!hrxq) {
2953 				rte_flow_error_set
2954 					(error, rte_errno,
2955 					 RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
2956 					 NULL,
2957 					 "cannot get hash queue");
2958 				goto error;
2959 			}
2960 			verbs->hrxq = hrxq;
2961 		}
2962 		verbs->flow =
2963 			mlx5_glue->create_flow(verbs->hrxq->qp, verbs->attr);
2964 		if (!verbs->flow) {
2965 			rte_flow_error_set(error, errno,
2966 					   RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
2967 					   NULL,
2968 					   "hardware refuses to create flow");
2969 			goto error;
2970 		}
2971 	}
2972 	if (flow->nl_flow &&
2973 	    priv->mnl_socket &&
2974 	    mlx5_nl_flow_create(priv->mnl_socket, flow->nl_flow, error))
2975 		goto error;
2976 	return 0;
2977 error:
2978 	err = rte_errno; /* Save rte_errno before cleanup. */
2979 	LIST_FOREACH(verbs, &flow->verbs, next) {
2980 		if (verbs->hrxq) {
2981 			if (flow->fate & MLX5_FLOW_FATE_DROP)
2982 				mlx5_hrxq_drop_release(dev);
2983 			else
2984 				mlx5_hrxq_release(dev, verbs->hrxq);
2985 			verbs->hrxq = NULL;
2986 		}
2987 	}
2988 	rte_errno = err; /* Restore rte_errno. */
2989 	return -rte_errno;
2990 }
2991 
2992 /**
2993  * Create a flow and add it to @p list.
2994  *
2995  * @param dev
2996  *   Pointer to Ethernet device.
2997  * @param list
2998  *   Pointer to a TAILQ flow list.
2999  * @param[in] attr
3000  *   Flow rule attributes.
3001  * @param[in] items
3002  *   Pattern specification (list terminated by the END pattern item).
3003  * @param[in] actions
3004  *   Associated actions (list terminated by the END action).
3005  * @param[out] error
3006  *   Perform verbose error reporting if not NULL.
3007  *
3008  * @return
3009  *   A flow on success, NULL otherwise and rte_errno is set.
3010  */
3011 static struct rte_flow *
3012 mlx5_flow_list_create(struct rte_eth_dev *dev,
3013 		      struct mlx5_flows *list,
3014 		      const struct rte_flow_attr *attr,
3015 		      const struct rte_flow_item items[],
3016 		      const struct rte_flow_action actions[],
3017 		      struct rte_flow_error *error)
3018 {
3019 	struct rte_flow *flow = NULL;
3020 	size_t size = 0;
3021 	int ret;
3022 
3023 	ret = mlx5_flow_merge(dev, flow, size, attr, items, actions, error);
3024 	if (ret < 0)
3025 		return NULL;
3026 	size = ret;
3027 	flow = rte_calloc(__func__, 1, size, 0);
3028 	if (!flow) {
3029 		rte_flow_error_set(error, ENOMEM,
3030 				   RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
3031 				   NULL,
3032 				   "not enough memory to create flow");
3033 		return NULL;
3034 	}
3035 	ret = mlx5_flow_merge(dev, flow, size, attr, items, actions, error);
3036 	if (ret < 0) {
3037 		rte_free(flow);
3038 		return NULL;
3039 	}
3040 	assert((size_t)ret == size);
3041 	if (dev->data->dev_started) {
3042 		ret = mlx5_flow_apply(dev, flow, error);
3043 		if (ret < 0) {
3044 			ret = rte_errno; /* Save rte_errno before cleanup. */
3045 			if (flow) {
3046 				mlx5_flow_remove(dev, flow);
3047 				rte_free(flow);
3048 			}
3049 			rte_errno = ret; /* Restore rte_errno. */
3050 			return NULL;
3051 		}
3052 	}
3053 	TAILQ_INSERT_TAIL(list, flow, next);
3054 	mlx5_flow_rxq_flags_set(dev, flow);
3055 	return flow;
3056 }
3057 
3058 /**
3059  * Create a flow.
3060  *
3061  * @see rte_flow_create()
3062  * @see rte_flow_ops
3063  */
3064 struct rte_flow *
3065 mlx5_flow_create(struct rte_eth_dev *dev,
3066 		 const struct rte_flow_attr *attr,
3067 		 const struct rte_flow_item items[],
3068 		 const struct rte_flow_action actions[],
3069 		 struct rte_flow_error *error)
3070 {
3071 	return mlx5_flow_list_create
3072 		(dev, &((struct priv *)dev->data->dev_private)->flows,
3073 		 attr, items, actions, error);
3074 }
3075 
3076 /**
3077  * Destroy a flow in a list.
3078  *
3079  * @param dev
3080  *   Pointer to Ethernet device.
3081  * @param list
3082  *   Pointer to a TAILQ flow list.
3083  * @param[in] flow
3084  *   Flow to destroy.
3085  */
3086 static void
3087 mlx5_flow_list_destroy(struct rte_eth_dev *dev, struct mlx5_flows *list,
3088 		       struct rte_flow *flow)
3089 {
3090 	mlx5_flow_remove(dev, flow);
3091 	TAILQ_REMOVE(list, flow, next);
3092 	/*
3093 	 * Update RX queue flags only if port is started, otherwise it is
3094 	 * already clean.
3095 	 */
3096 	if (dev->data->dev_started)
3097 		mlx5_flow_rxq_flags_trim(dev, flow);
3098 	rte_free(flow);
3099 }
3100 
3101 /**
3102  * Destroy all flows.
3103  *
3104  * @param dev
3105  *   Pointer to Ethernet device.
3106  * @param list
3107  *   Pointer to a TAILQ flow list.
3108  */
3109 void
3110 mlx5_flow_list_flush(struct rte_eth_dev *dev, struct mlx5_flows *list)
3111 {
3112 	while (!TAILQ_EMPTY(list)) {
3113 		struct rte_flow *flow;
3114 
3115 		flow = TAILQ_FIRST(list);
3116 		mlx5_flow_list_destroy(dev, list, flow);
3117 	}
3118 }
3119 
3120 /**
3121  * Remove all flows.
3122  *
3123  * @param dev
3124  *   Pointer to Ethernet device.
3125  * @param list
3126  *   Pointer to a TAILQ flow list.
3127  */
3128 void
3129 mlx5_flow_stop(struct rte_eth_dev *dev, struct mlx5_flows *list)
3130 {
3131 	struct rte_flow *flow;
3132 
3133 	TAILQ_FOREACH_REVERSE(flow, list, mlx5_flows, next)
3134 		mlx5_flow_remove(dev, flow);
3135 	mlx5_flow_rxq_flags_clear(dev);
3136 }
3137 
3138 /**
3139  * Add all flows.
3140  *
3141  * @param dev
3142  *   Pointer to Ethernet device.
3143  * @param list
3144  *   Pointer to a TAILQ flow list.
3145  *
3146  * @return
3147  *   0 on success, a negative errno value otherwise and rte_errno is set.
3148  */
3149 int
3150 mlx5_flow_start(struct rte_eth_dev *dev, struct mlx5_flows *list)
3151 {
3152 	struct rte_flow *flow;
3153 	struct rte_flow_error error;
3154 	int ret = 0;
3155 
3156 	TAILQ_FOREACH(flow, list, next) {
3157 		ret = mlx5_flow_apply(dev, flow, &error);
3158 		if (ret < 0)
3159 			goto error;
3160 		mlx5_flow_rxq_flags_set(dev, flow);
3161 	}
3162 	return 0;
3163 error:
3164 	ret = rte_errno; /* Save rte_errno before cleanup. */
3165 	mlx5_flow_stop(dev, list);
3166 	rte_errno = ret; /* Restore rte_errno. */
3167 	return -rte_errno;
3168 }
3169 
3170 /**
 * Verify the flow list is empty.
 *
 * @param dev
 *   Pointer to Ethernet device.
 *
 * @return
 *   The number of flows not released.
3177  */
3178 int
3179 mlx5_flow_verify(struct rte_eth_dev *dev)
3180 {
3181 	struct priv *priv = dev->data->dev_private;
3182 	struct rte_flow *flow;
3183 	int ret = 0;
3184 
3185 	TAILQ_FOREACH(flow, &priv->flows, next) {
3186 		DRV_LOG(DEBUG, "port %u flow %p still referenced",
3187 			dev->data->port_id, (void *)flow);
3188 		++ret;
3189 	}
3190 	return ret;
3191 }
3192 
3193 /**
3194  * Enable a control flow configured from the control plane.
3195  *
3196  * @param dev
3197  *   Pointer to Ethernet device.
3198  * @param eth_spec
3199  *   An Ethernet flow spec to apply.
3200  * @param eth_mask
3201  *   An Ethernet flow mask to apply.
3202  * @param vlan_spec
3203  *   A VLAN flow spec to apply.
3204  * @param vlan_mask
3205  *   A VLAN flow mask to apply.
3206  *
3207  * @return
3208  *   0 on success, a negative errno value otherwise and rte_errno is set.
3209  */
3210 int
3211 mlx5_ctrl_flow_vlan(struct rte_eth_dev *dev,
3212 		    struct rte_flow_item_eth *eth_spec,
3213 		    struct rte_flow_item_eth *eth_mask,
3214 		    struct rte_flow_item_vlan *vlan_spec,
3215 		    struct rte_flow_item_vlan *vlan_mask)
3216 {
3217 	struct priv *priv = dev->data->dev_private;
3218 	const struct rte_flow_attr attr = {
3219 		.ingress = 1,
3220 		.priority = MLX5_FLOW_PRIO_RSVD,
3221 	};
3222 	struct rte_flow_item items[] = {
3223 		{
3224 			.type = RTE_FLOW_ITEM_TYPE_ETH,
3225 			.spec = eth_spec,
3226 			.last = NULL,
3227 			.mask = eth_mask,
3228 		},
3229 		{
3230 			.type = (vlan_spec) ? RTE_FLOW_ITEM_TYPE_VLAN :
3231 				RTE_FLOW_ITEM_TYPE_END,
3232 			.spec = vlan_spec,
3233 			.last = NULL,
3234 			.mask = vlan_mask,
3235 		},
3236 		{
3237 			.type = RTE_FLOW_ITEM_TYPE_END,
3238 		},
3239 	};
3240 	uint16_t queue[priv->reta_idx_n];
3241 	struct rte_flow_action_rss action_rss = {
3242 		.func = RTE_ETH_HASH_FUNCTION_DEFAULT,
3243 		.level = 0,
3244 		.types = priv->rss_conf.rss_hf,
3245 		.key_len = priv->rss_conf.rss_key_len,
3246 		.queue_num = priv->reta_idx_n,
3247 		.key = priv->rss_conf.rss_key,
3248 		.queue = queue,
3249 	};
3250 	struct rte_flow_action actions[] = {
3251 		{
3252 			.type = RTE_FLOW_ACTION_TYPE_RSS,
3253 			.conf = &action_rss,
3254 		},
3255 		{
3256 			.type = RTE_FLOW_ACTION_TYPE_END,
3257 		},
3258 	};
3259 	struct rte_flow *flow;
3260 	struct rte_flow_error error;
3261 	unsigned int i;
3262 
3263 	if (!priv->reta_idx_n) {
3264 		rte_errno = EINVAL;
3265 		return -rte_errno;
3266 	}
3267 	for (i = 0; i != priv->reta_idx_n; ++i)
3268 		queue[i] = (*priv->reta_idx)[i];
3269 	flow = mlx5_flow_list_create(dev, &priv->ctrl_flows, &attr, items,
3270 				     actions, &error);
3271 	if (!flow)
3272 		return -rte_errno;
3273 	return 0;
3274 }
3275 
3276 /**
 * Enable a control flow configured from the control plane.
3278  *
3279  * @param dev
3280  *   Pointer to Ethernet device.
3281  * @param eth_spec
3282  *   An Ethernet flow spec to apply.
3283  * @param eth_mask
3284  *   An Ethernet flow mask to apply.
3285  *
3286  * @return
3287  *   0 on success, a negative errno value otherwise and rte_errno is set.
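 *
 *   A minimal sketch of a hypothetical caller enabling a broadcast control
 *   flow (values are for illustration only):
 *
 *   @code
 *   struct rte_flow_item_eth bcast = {
 *           .dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
 *   };
 *
 *   if (mlx5_ctrl_flow(dev, &bcast, &bcast))
 *           DRV_LOG(WARNING, "port %u cannot enable broadcast control flow",
 *                   dev->data->port_id);
 *   @endcode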
3288  */
3289 int
3290 mlx5_ctrl_flow(struct rte_eth_dev *dev,
3291 	       struct rte_flow_item_eth *eth_spec,
3292 	       struct rte_flow_item_eth *eth_mask)
3293 {
3294 	return mlx5_ctrl_flow_vlan(dev, eth_spec, eth_mask, NULL, NULL);
3295 }
3296 
3297 /**
3298  * Destroy a flow.
3299  *
3300  * @see rte_flow_destroy()
3301  * @see rte_flow_ops
3302  */
3303 int
3304 mlx5_flow_destroy(struct rte_eth_dev *dev,
3305 		  struct rte_flow *flow,
3306 		  struct rte_flow_error *error __rte_unused)
3307 {
3308 	struct priv *priv = dev->data->dev_private;
3309 
3310 	mlx5_flow_list_destroy(dev, &priv->flows, flow);
3311 	return 0;
3312 }
3313 
3314 /**
3315  * Destroy all flows.
3316  *
3317  * @see rte_flow_flush()
3318  * @see rte_flow_ops
3319  */
3320 int
3321 mlx5_flow_flush(struct rte_eth_dev *dev,
3322 		struct rte_flow_error *error __rte_unused)
3323 {
3324 	struct priv *priv = dev->data->dev_private;
3325 
3326 	mlx5_flow_list_flush(dev, &priv->flows);
3327 	return 0;
3328 }
3329 
3330 /**
3331  * Enable or disable isolated mode.
3332  *
3333  * @see rte_flow_isolate()
3334  * @see rte_flow_ops
3335  */
3336 int
3337 mlx5_flow_isolate(struct rte_eth_dev *dev,
3338 		  int enable,
3339 		  struct rte_flow_error *error)
3340 {
3341 	struct priv *priv = dev->data->dev_private;
3342 
3343 	if (dev->data->dev_started) {
3344 		rte_flow_error_set(error, EBUSY,
3345 				   RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
3346 				   NULL,
3347 				   "port must be stopped first");
3348 		return -rte_errno;
3349 	}
3350 	priv->isolated = !!enable;
3351 	if (enable)
3352 		dev->dev_ops = &mlx5_dev_ops_isolate;
3353 	else
3354 		dev->dev_ops = &mlx5_dev_ops;
3355 	return 0;
3356 }
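/*
 * Illustrative sketch (application side, assuming a configured "port_id"):
 * because of the dev_started check above, isolated mode has to be toggled
 * while the port is stopped.
 *
 *	struct rte_flow_error error;
 *
 *	rte_eth_dev_stop(port_id);
 *	if (rte_flow_isolate(port_id, 1, &error))
 *		printf("cannot enable isolated mode: %s\n",
 *		       error.message ? error.message : "(no message)");
 *	rte_eth_dev_start(port_id);
 */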
3357 
3358 /**
3359  * Query flow counter.
3360  *
3361  * @param flow
3362  *   Pointer to the flow.
3363  *
3364  * @return
3365  *   0 on success, a negative errno value otherwise and rte_errno is set.
3366  */
3367 static int
3368 mlx5_flow_query_count(struct rte_flow *flow __rte_unused,
3369 		      void *data __rte_unused,
3370 		      struct rte_flow_error *error)
3371 {
3372 #ifdef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT
3373 	if (flow->modifier & MLX5_FLOW_MOD_COUNT) {
3374 		struct rte_flow_query_count *qc = data;
3375 		uint64_t counters[2] = {0, 0};
3376 		struct ibv_query_counter_set_attr query_cs_attr = {
3377 			.cs = flow->counter->cs,
3378 			.query_flags = IBV_COUNTER_SET_FORCE_UPDATE,
3379 		};
3380 		struct ibv_counter_set_data query_out = {
3381 			.out = counters,
3382 			.outlen = 2 * sizeof(uint64_t),
3383 		};
3384 		int err = mlx5_glue->query_counter_set(&query_cs_attr,
3385 						       &query_out);
3386 
3387 		if (err)
3388 			return rte_flow_error_set
3389 				(error, err,
3390 				 RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
3391 				 NULL,
3392 				 "cannot read counter");
3393 		qc->hits_set = 1;
3394 		qc->bytes_set = 1;
3395 		qc->hits = counters[0] - flow->counter->hits;
3396 		qc->bytes = counters[1] - flow->counter->bytes;
3397 		if (qc->reset) {
3398 			flow->counter->hits = counters[0];
3399 			flow->counter->bytes = counters[1];
3400 		}
3401 		return 0;
3402 	}
3403 	return rte_flow_error_set(error, ENOTSUP,
3404 				  RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
3405 				  NULL,
3406 				  "flow does not have counter");
3407 #endif
3408 	return rte_flow_error_set(error, ENOTSUP,
3409 				  RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
3410 				  NULL,
3411 				  "counters are not available");
3412 }
3413 
3414 /**
3415  * Query a flow.
3416  *
3417  * @see rte_flow_query()
3418  * @see rte_flow_ops
3419  */
3420 int
3421 mlx5_flow_query(struct rte_eth_dev *dev __rte_unused,
3422 		struct rte_flow *flow,
3423 		const struct rte_flow_action *actions,
3424 		void *data,
3425 		struct rte_flow_error *error)
3426 {
3427 	int ret = 0;
3428 
3429 	for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
3430 		switch (actions->type) {
3431 		case RTE_FLOW_ACTION_TYPE_VOID:
3432 			break;
3433 		case RTE_FLOW_ACTION_TYPE_COUNT:
3434 			ret = mlx5_flow_query_count(flow, data, error);
3435 			break;
3436 		default:
3437 			return rte_flow_error_set(error, ENOTSUP,
3438 						  RTE_FLOW_ERROR_TYPE_ACTION,
3439 						  actions,
3440 						  "action not supported");
3441 		}
3442 		if (ret < 0)
3443 			return ret;
3444 	}
3445 	return 0;
3446 }
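/*
 * Illustrative sketch (application side): querying the counter of a rule
 * created with a COUNT action. "port_id" and "flow" are assumed to exist,
 * and rte_flow_query() is assumed to take an action list terminated by
 * END, which is what the handler above iterates over.
 *
 *	struct rte_flow_query_count qc = { .reset = 1 };
 *	struct rte_flow_action count[] = {
 *		{ .type = RTE_FLOW_ACTION_TYPE_COUNT },
 *		{ .type = RTE_FLOW_ACTION_TYPE_END },
 *	};
 *	struct rte_flow_error error;
 *
 *	if (!rte_flow_query(port_id, flow, count, &qc, &error) && qc.hits_set)
 *		printf("hits=%llu bytes=%llu\n",
 *		       (unsigned long long)qc.hits,
 *		       (unsigned long long)qc.bytes);
 */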
3447 
3448 /**
3449  * Convert a flow director filter to a generic flow.
3450  *
3451  * @param dev
3452  *   Pointer to Ethernet device.
3453  * @param fdir_filter
3454  *   Flow director filter to add.
3455  * @param attributes
3456  *   Generic flow parameters structure.
3457  *
3458  * @return
3459  *   0 on success, a negative errno value otherwise and rte_errno is set.
3460  */
3461 static int
3462 mlx5_fdir_filter_convert(struct rte_eth_dev *dev,
3463 			 const struct rte_eth_fdir_filter *fdir_filter,
3464 			 struct mlx5_fdir *attributes)
3465 {
3466 	struct priv *priv = dev->data->dev_private;
3467 	const struct rte_eth_fdir_input *input = &fdir_filter->input;
3468 	const struct rte_eth_fdir_masks *mask =
3469 		&dev->data->dev_conf.fdir_conf.mask;
3470 
3471 	/* Validate queue number. */
3472 	if (fdir_filter->action.rx_queue >= priv->rxqs_n) {
3473 		DRV_LOG(ERR, "port %u invalid queue number %d",
3474 			dev->data->port_id, fdir_filter->action.rx_queue);
3475 		rte_errno = EINVAL;
3476 		return -rte_errno;
3477 	}
3478 	attributes->attr.ingress = 1;
3479 	attributes->items[0] = (struct rte_flow_item) {
3480 		.type = RTE_FLOW_ITEM_TYPE_ETH,
3481 		.spec = &attributes->l2,
3482 		.mask = &attributes->l2_mask,
3483 	};
3484 	switch (fdir_filter->action.behavior) {
3485 	case RTE_ETH_FDIR_ACCEPT:
3486 		attributes->actions[0] = (struct rte_flow_action){
3487 			.type = RTE_FLOW_ACTION_TYPE_QUEUE,
3488 			.conf = &attributes->queue,
3489 		};
3490 		break;
3491 	case RTE_ETH_FDIR_REJECT:
3492 		attributes->actions[0] = (struct rte_flow_action){
3493 			.type = RTE_FLOW_ACTION_TYPE_DROP,
3494 		};
3495 		break;
3496 	default:
3497 		DRV_LOG(ERR, "port %u invalid behavior %d",
3498 			dev->data->port_id,
3499 			fdir_filter->action.behavior);
3500 		rte_errno = ENOTSUP;
3501 		return -rte_errno;
3502 	}
3503 	attributes->queue.index = fdir_filter->action.rx_queue;
3504 	/* Handle L3. */
3505 	switch (fdir_filter->input.flow_type) {
3506 	case RTE_ETH_FLOW_NONFRAG_IPV4_UDP:
3507 	case RTE_ETH_FLOW_NONFRAG_IPV4_TCP:
3508 	case RTE_ETH_FLOW_NONFRAG_IPV4_OTHER:
3509 		attributes->l3.ipv4.hdr = (struct ipv4_hdr){
3510 			.src_addr = input->flow.ip4_flow.src_ip,
3511 			.dst_addr = input->flow.ip4_flow.dst_ip,
3512 			.time_to_live = input->flow.ip4_flow.ttl,
3513 			.type_of_service = input->flow.ip4_flow.tos,
3514 			.next_proto_id = input->flow.ip4_flow.proto,
3515 		};
3516 		attributes->l3_mask.ipv4.hdr = (struct ipv4_hdr){
3517 			.src_addr = mask->ipv4_mask.src_ip,
3518 			.dst_addr = mask->ipv4_mask.dst_ip,
3519 			.time_to_live = mask->ipv4_mask.ttl,
3520 			.type_of_service = mask->ipv4_mask.tos,
3521 			.next_proto_id = mask->ipv4_mask.proto,
3522 		};
3523 		attributes->items[1] = (struct rte_flow_item){
3524 			.type = RTE_FLOW_ITEM_TYPE_IPV4,
3525 			.spec = &attributes->l3,
3526 			.mask = &attributes->l3_mask,
3527 		};
3528 		break;
3529 	case RTE_ETH_FLOW_NONFRAG_IPV6_UDP:
3530 	case RTE_ETH_FLOW_NONFRAG_IPV6_TCP:
3531 	case RTE_ETH_FLOW_NONFRAG_IPV6_OTHER:
3532 		attributes->l3.ipv6.hdr = (struct ipv6_hdr){
3533 			.hop_limits = input->flow.ipv6_flow.hop_limits,
3534 			.proto = input->flow.ipv6_flow.proto,
3535 		};
3536 
3537 		memcpy(attributes->l3.ipv6.hdr.src_addr,
3538 		       input->flow.ipv6_flow.src_ip,
3539 		       RTE_DIM(attributes->l3.ipv6.hdr.src_addr));
3540 		memcpy(attributes->l3.ipv6.hdr.dst_addr,
3541 		       input->flow.ipv6_flow.dst_ip,
3542 		       RTE_DIM(attributes->l3.ipv6.hdr.dst_addr));
3543 		memcpy(attributes->l3_mask.ipv6.hdr.src_addr,
3544 		       mask->ipv6_mask.src_ip,
3545 		       RTE_DIM(attributes->l3_mask.ipv6.hdr.src_addr));
3546 		memcpy(attributes->l3_mask.ipv6.hdr.dst_addr,
3547 		       mask->ipv6_mask.dst_ip,
3548 		       RTE_DIM(attributes->l3_mask.ipv6.hdr.dst_addr));
3549 		attributes->items[1] = (struct rte_flow_item){
3550 			.type = RTE_FLOW_ITEM_TYPE_IPV6,
3551 			.spec = &attributes->l3,
3552 			.mask = &attributes->l3_mask,
3553 		};
3554 		break;
3555 	default:
3556 		DRV_LOG(ERR, "port %u invalid flow type %d",
3557 			dev->data->port_id, fdir_filter->input.flow_type);
3558 		rte_errno = ENOTSUP;
3559 		return -rte_errno;
3560 	}
3561 	/* Handle L4. */
3562 	switch (fdir_filter->input.flow_type) {
3563 	case RTE_ETH_FLOW_NONFRAG_IPV4_UDP:
3564 		attributes->l4.udp.hdr = (struct udp_hdr){
3565 			.src_port = input->flow.udp4_flow.src_port,
3566 			.dst_port = input->flow.udp4_flow.dst_port,
3567 		};
3568 		attributes->l4_mask.udp.hdr = (struct udp_hdr){
3569 			.src_port = mask->src_port_mask,
3570 			.dst_port = mask->dst_port_mask,
3571 		};
3572 		attributes->items[2] = (struct rte_flow_item){
3573 			.type = RTE_FLOW_ITEM_TYPE_UDP,
3574 			.spec = &attributes->l4,
3575 			.mask = &attributes->l4_mask,
3576 		};
3577 		break;
3578 	case RTE_ETH_FLOW_NONFRAG_IPV4_TCP:
3579 		attributes->l4.tcp.hdr = (struct tcp_hdr){
3580 			.src_port = input->flow.tcp4_flow.src_port,
3581 			.dst_port = input->flow.tcp4_flow.dst_port,
3582 		};
3583 		attributes->l4_mask.tcp.hdr = (struct tcp_hdr){
3584 			.src_port = mask->src_port_mask,
3585 			.dst_port = mask->dst_port_mask,
3586 		};
3587 		attributes->items[2] = (struct rte_flow_item){
3588 			.type = RTE_FLOW_ITEM_TYPE_TCP,
3589 			.spec = &attributes->l4,
3590 			.mask = &attributes->l4_mask,
3591 		};
3592 		break;
3593 	case RTE_ETH_FLOW_NONFRAG_IPV6_UDP:
3594 		attributes->l4.udp.hdr = (struct udp_hdr){
3595 			.src_port = input->flow.udp6_flow.src_port,
3596 			.dst_port = input->flow.udp6_flow.dst_port,
3597 		};
3598 		attributes->l4_mask.udp.hdr = (struct udp_hdr){
3599 			.src_port = mask->src_port_mask,
3600 			.dst_port = mask->dst_port_mask,
3601 		};
3602 		attributes->items[2] = (struct rte_flow_item){
3603 			.type = RTE_FLOW_ITEM_TYPE_UDP,
3604 			.spec = &attributes->l4,
3605 			.mask = &attributes->l4_mask,
3606 		};
3607 		break;
3608 	case RTE_ETH_FLOW_NONFRAG_IPV6_TCP:
3609 		attributes->l4.tcp.hdr = (struct tcp_hdr){
3610 			.src_port = input->flow.tcp6_flow.src_port,
3611 			.dst_port = input->flow.tcp6_flow.dst_port,
3612 		};
3613 		attributes->l4_mask.tcp.hdr = (struct tcp_hdr){
3614 			.src_port = mask->src_port_mask,
3615 			.dst_port = mask->dst_port_mask,
3616 		};
3617 		attributes->items[2] = (struct rte_flow_item){
3618 			.type = RTE_FLOW_ITEM_TYPE_TCP,
3619 			.spec = &attributes->l4,
3620 			.mask = &attributes->l4_mask,
3621 		};
3622 		break;
3623 	case RTE_ETH_FLOW_NONFRAG_IPV4_OTHER:
3624 	case RTE_ETH_FLOW_NONFRAG_IPV6_OTHER:
3625 		break;
3626 	default:
3627 		DRV_LOG(ERR, "port %u invalid flow type %d",
3628 			dev->data->port_id, fdir_filter->input.flow_type);
3629 		rte_errno = ENOTSUP;
3630 		return -rte_errno;
3631 	}
3632 	return 0;
3633 }
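/*
 * Worked example of the conversion above: a filter with flow_type
 * RTE_ETH_FLOW_NONFRAG_IPV4_UDP and behavior RTE_ETH_FDIR_ACCEPT becomes,
 * roughly, the following generic rule (the END entries come from the
 * zero-initialized parts of struct mlx5_fdir):
 *
 *	pattern: ETH / IPV4 (addresses, TTL, TOS, proto) / UDP (ports) / END
 *	actions: QUEUE (index = fdir_filter->action.rx_queue) / END
 */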
3634 
3635 /**
3636  * Add a new flow director filter and store it in the list.
3637  *
3638  * @param dev
3639  *   Pointer to Ethernet device.
3640  * @param fdir_filter
3641  *   Flow director filter to add.
3642  *
3643  * @return
3644  *   0 on success, a negative errno value otherwise and rte_errno is set.
3645  */
3646 static int
3647 mlx5_fdir_filter_add(struct rte_eth_dev *dev,
3648 		     const struct rte_eth_fdir_filter *fdir_filter)
3649 {
3650 	struct priv *priv = dev->data->dev_private;
3651 	struct mlx5_fdir attributes = {
3652 		.attr.group = 0,
3653 		.l2_mask = {
3654 			.dst.addr_bytes = "\x00\x00\x00\x00\x00\x00",
3655 			.src.addr_bytes = "\x00\x00\x00\x00\x00\x00",
3656 			.type = 0,
3657 		},
3658 	};
3659 	struct rte_flow_error error;
3660 	struct rte_flow *flow;
3661 	int ret;
3662 
3663 	ret = mlx5_fdir_filter_convert(dev, fdir_filter, &attributes);
3664 	if (ret)
3665 		return ret;
3666 	flow = mlx5_flow_list_create(dev, &priv->flows, &attributes.attr,
3667 				     attributes.items, attributes.actions,
3668 				     &error);
3669 	if (flow) {
3670 		DRV_LOG(DEBUG, "port %u FDIR created %p", dev->data->port_id,
3671 			(void *)flow);
3672 		return 0;
3673 	}
3674 	return -rte_errno;
3675 }
3676 
3677 /**
3678  * Delete specific filter.
3679  * Delete a specific filter.
3680  * @param dev
3681  *   Pointer to Ethernet device.
3682  * @param fdir_filter
3683  *   Filter to be deleted.
3684  *
3685  * @return
3686  *   0 on success, a negative errno value otherwise and rte_errno is set.
3687  */
3688 static int
3689 mlx5_fdir_filter_delete(struct rte_eth_dev *dev __rte_unused,
3690 			const struct rte_eth_fdir_filter *fdir_filter
3691 			__rte_unused)
3692 {
3693 	rte_errno = ENOTSUP;
3694 	return -rte_errno;
3695 }
3696 
3697 /**
3698  * Update queue for specific filter.
3699  * Update the queue for a specific filter.
3700  * @param dev
3701  *   Pointer to Ethernet device.
3702  * @param fdir_filter
3703  *   Filter to be updated.
3704  *
3705  * @return
3706  *   0 on success, a negative errno value otherwise and rte_errno is set.
3707  */
3708 static int
3709 mlx5_fdir_filter_update(struct rte_eth_dev *dev,
3710 			const struct rte_eth_fdir_filter *fdir_filter)
3711 {
3712 	int ret;
3713 
3714 	ret = mlx5_fdir_filter_delete(dev, fdir_filter);
3715 	if (ret)
3716 		return ret;
3717 	return mlx5_fdir_filter_add(dev, fdir_filter);
3718 }
3719 
3720 /**
3721  * Flush all filters.
3722  *
3723  * @param dev
3724  *   Pointer to Ethernet device.
3725  */
3726 static void
3727 mlx5_fdir_filter_flush(struct rte_eth_dev *dev)
3728 {
3729 	struct priv *priv = dev->data->dev_private;
3730 
3731 	mlx5_flow_list_flush(dev, &priv->flows);
3732 }
3733 
3734 /**
3735  * Get flow director information.
3736  *
3737  * @param dev
3738  *   Pointer to Ethernet device.
3739  * @param[out] fdir_info
3740  *   Resulting flow director information.
3741  */
3742 static void
3743 mlx5_fdir_info_get(struct rte_eth_dev *dev, struct rte_eth_fdir_info *fdir_info)
3744 {
3745 	struct rte_eth_fdir_masks *mask =
3746 		&dev->data->dev_conf.fdir_conf.mask;
3747 
3748 	fdir_info->mode = dev->data->dev_conf.fdir_conf.mode;
3749 	fdir_info->guarant_spc = 0;
3750 	rte_memcpy(&fdir_info->mask, mask, sizeof(fdir_info->mask));
3751 	fdir_info->max_flexpayload = 0;
3752 	fdir_info->flow_types_mask[0] = 0;
3753 	fdir_info->flex_payload_unit = 0;
3754 	fdir_info->max_flex_payload_segment_num = 0;
3755 	fdir_info->flex_payload_limit = 0;
3756 	memset(&fdir_info->flex_conf, 0, sizeof(fdir_info->flex_conf));
3757 }
3758 
3759 /**
3760  * Deal with flow director operations.
3761  *
3762  * @param dev
3763  *   Pointer to Ethernet device.
3764  * @param filter_op
3765  *   Operation to perform.
3766  * @param arg
3767  *   Pointer to operation-specific structure.
3768  *
3769  * @return
3770  *   0 on success, a negative errno value otherwise and rte_errno is set.
3771  */
3772 static int
3773 mlx5_fdir_ctrl_func(struct rte_eth_dev *dev, enum rte_filter_op filter_op,
3774 		    void *arg)
3775 {
3776 	enum rte_fdir_mode fdir_mode =
3777 		dev->data->dev_conf.fdir_conf.mode;
3778 
3779 	if (filter_op == RTE_ETH_FILTER_NOP)
3780 		return 0;
3781 	if (fdir_mode != RTE_FDIR_MODE_PERFECT &&
3782 	    fdir_mode != RTE_FDIR_MODE_PERFECT_MAC_VLAN) {
3783 		DRV_LOG(ERR, "port %u flow director mode %d not supported",
3784 			dev->data->port_id, fdir_mode);
3785 		rte_errno = EINVAL;
3786 		return -rte_errno;
3787 	}
3788 	switch (filter_op) {
3789 	case RTE_ETH_FILTER_ADD:
3790 		return mlx5_fdir_filter_add(dev, arg);
3791 	case RTE_ETH_FILTER_UPDATE:
3792 		return mlx5_fdir_filter_update(dev, arg);
3793 	case RTE_ETH_FILTER_DELETE:
3794 		return mlx5_fdir_filter_delete(dev, arg);
3795 	case RTE_ETH_FILTER_FLUSH:
3796 		mlx5_fdir_filter_flush(dev);
3797 		break;
3798 	case RTE_ETH_FILTER_INFO:
3799 		mlx5_fdir_info_get(dev, arg);
3800 		break;
3801 	default:
3802 		DRV_LOG(DEBUG, "port %u unknown operation %u",
3803 			dev->data->port_id, filter_op);
3804 		rte_errno = EINVAL;
3805 		return -rte_errno;
3806 	}
3807 	return 0;
3808 }
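/*
 * Illustrative sketch (application side, assuming "port_id", big-endian
 * port values and a matching dev_conf.fdir_conf.mask): dropping IPv4/UDP
 * traffic to a given destination port through the legacy flow director
 * API, which the code above translates into a generic flow rule.
 *
 *	struct rte_eth_fdir_filter fdir = {
 *		.input = {
 *			.flow_type = RTE_ETH_FLOW_NONFRAG_IPV4_UDP,
 *			.flow.udp4_flow.dst_port = rte_cpu_to_be_16(4789),
 *		},
 *		.action.behavior = RTE_ETH_FDIR_REJECT,
 *	};
 *
 *	if (rte_eth_dev_filter_ctrl(port_id, RTE_ETH_FILTER_FDIR,
 *				    RTE_ETH_FILTER_ADD, &fdir))
 *		printf("cannot add flow director filter\n");
 */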
3809 
3810 /**
3811  * Manage filter operations.
3812  *
3813  * @param dev
3814  *   Pointer to Ethernet device structure.
3815  * @param filter_type
3816  *   Filter type.
3817  * @param filter_op
3818  *   Operation to perform.
3819  * @param arg
3820  *   Pointer to operation-specific structure.
3821  *
3822  * @return
3823  *   0 on success, a negative errno value otherwise and rte_errno is set.
3824  */
3825 int
3826 mlx5_dev_filter_ctrl(struct rte_eth_dev *dev,
3827 		     enum rte_filter_type filter_type,
3828 		     enum rte_filter_op filter_op,
3829 		     void *arg)
3830 {
3831 	switch (filter_type) {
3832 	case RTE_ETH_FILTER_GENERIC:
3833 		if (filter_op != RTE_ETH_FILTER_GET) {
3834 			rte_errno = EINVAL;
3835 			return -rte_errno;
3836 		}
3837 		*(const void **)arg = &mlx5_flow_ops;
3838 		return 0;
3839 	case RTE_ETH_FILTER_FDIR:
3840 		return mlx5_fdir_ctrl_func(dev, filter_op, arg);
3841 	default:
3842 		DRV_LOG(ERR, "port %u filter type (%d) not supported",
3843 			dev->data->port_id, filter_type);
3844 		rte_errno = ENOTSUP;
3845 		return -rte_errno;
3846 	}
3847 	return 0;
3848 }
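/*
 * Illustrative sketch of how the rte_flow layer obtains the driver
 * callbacks through the RTE_ETH_FILTER_GENERIC/RTE_ETH_FILTER_GET path
 * handled above; on success "ops" points at mlx5_flow_ops and
 * rte_flow_create() and friends dispatch through it (a sketch, not the
 * exact ethdev code).
 *
 *	const struct rte_flow_ops *ops = NULL;
 *
 *	if (mlx5_dev_filter_ctrl(dev, RTE_ETH_FILTER_GENERIC,
 *				 RTE_ETH_FILTER_GET, &ops))
 *		return -rte_errno;
 */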
3849