/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright 2017 6WIND S.A.
 * Copyright 2017 Mellanox Technologies, Ltd
 */

#include <errno.h>
#include <string.h>
#include <unistd.h>
#include <sys/queue.h>
#include <sys/resource.h>

#include <rte_byteorder.h>
#include <rte_jhash.h>
#include <rte_random.h>
#include <rte_malloc.h>
#include <rte_eth_tap.h>
#include <tap_flow.h>
#include <tap_autoconf.h>
#include <tap_tcmsgs.h>
#include <tap_rss.h>

#ifndef HAVE_TC_FLOWER
/*
 * For kernels < 4.2, this enum is not defined. Runtime checks will be made to
 * avoid sending TC messages the kernel cannot understand.
 */
enum {
	TCA_FLOWER_UNSPEC,
	TCA_FLOWER_CLASSID,
	TCA_FLOWER_INDEV,
	TCA_FLOWER_ACT,
	TCA_FLOWER_KEY_ETH_DST,         /* ETH_ALEN */
	TCA_FLOWER_KEY_ETH_DST_MASK,    /* ETH_ALEN */
	TCA_FLOWER_KEY_ETH_SRC,         /* ETH_ALEN */
	TCA_FLOWER_KEY_ETH_SRC_MASK,    /* ETH_ALEN */
	TCA_FLOWER_KEY_ETH_TYPE,        /* be16 */
	TCA_FLOWER_KEY_IP_PROTO,        /* u8 */
	TCA_FLOWER_KEY_IPV4_SRC,        /* be32 */
	TCA_FLOWER_KEY_IPV4_SRC_MASK,   /* be32 */
	TCA_FLOWER_KEY_IPV4_DST,        /* be32 */
	TCA_FLOWER_KEY_IPV4_DST_MASK,   /* be32 */
	TCA_FLOWER_KEY_IPV6_SRC,        /* struct in6_addr */
	TCA_FLOWER_KEY_IPV6_SRC_MASK,   /* struct in6_addr */
	TCA_FLOWER_KEY_IPV6_DST,        /* struct in6_addr */
	TCA_FLOWER_KEY_IPV6_DST_MASK,   /* struct in6_addr */
	TCA_FLOWER_KEY_TCP_SRC,         /* be16 */
	TCA_FLOWER_KEY_TCP_DST,         /* be16 */
	TCA_FLOWER_KEY_UDP_SRC,         /* be16 */
	TCA_FLOWER_KEY_UDP_DST,         /* be16 */
};
#endif
#ifndef HAVE_TC_VLAN_ID
enum {
	/* TCA_FLOWER_FLAGS, */
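	/* + 2 below skips TCA_FLOWER_FLAGS, defined between UDP_DST and VLAN_ID. */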
	TCA_FLOWER_KEY_VLAN_ID = TCA_FLOWER_KEY_UDP_DST + 2, /* be16 */
	TCA_FLOWER_KEY_VLAN_PRIO,       /* u8   */
	TCA_FLOWER_KEY_VLAN_ETH_TYPE,   /* be16 */
};
#endif
/*
 * For kernels < 4.2, BPF-related enums may not be defined.
 * Runtime checks will be carried out to gracefully report on TC messages that
 * are rejected by the kernel. Rejection reasons may be:
 * 1. the enum is not defined
 * 2. the enum is defined but the kernel is not configured to support BPF
 *    system calls, BPF classifiers or BPF actions.
 */
#ifndef HAVE_TC_BPF
enum {
	TCA_BPF_UNSPEC,
	TCA_BPF_ACT,
	TCA_BPF_POLICE,
	TCA_BPF_CLASSID,
	TCA_BPF_OPS_LEN,
	TCA_BPF_OPS,
};
#endif
#ifndef HAVE_TC_BPF_FD
enum {
	TCA_BPF_FD = TCA_BPF_OPS + 1,
	TCA_BPF_NAME,
};
#endif
#ifndef HAVE_TC_ACT_BPF
#define tc_gen \
	__u32                 index; \
	__u32                 capab; \
	int                   action; \
	int                   refcnt; \
	int                   bindcnt

struct tc_act_bpf {
	tc_gen;
};

enum {
	TCA_ACT_BPF_UNSPEC,
	TCA_ACT_BPF_TM,
	TCA_ACT_BPF_PARMS,
	TCA_ACT_BPF_OPS_LEN,
	TCA_ACT_BPF_OPS,
};

#endif
#ifndef HAVE_TC_ACT_BPF_FD
enum {
	TCA_ACT_BPF_FD = TCA_ACT_BPF_OPS + 1,
	TCA_ACT_BPF_NAME,
};
#endif

/* RSS key management */
enum bpf_rss_key_e {
	KEY_CMD_GET = 1,
	KEY_CMD_RELEASE,
	KEY_CMD_INIT,
	KEY_CMD_DEINIT,
};

enum key_status_e {
	KEY_STAT_UNSPEC,
	KEY_STAT_USED,
	KEY_STAT_AVAILABLE,
};

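/*
 * Fixed TC handles for the two implicit rules that must keep a stable
 * identity across updates; see the rationale in tap_flow_implicit_create().
 */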
#define ISOLATE_HANDLE 1
#define REMOTE_PROMISCUOUS_HANDLE 2

struct rte_flow {
	LIST_ENTRY(rte_flow) next; /* Pointer to the next rte_flow structure */
	struct rte_flow *remote_flow; /* associated remote flow */
	int bpf_fd[SEC_MAX]; /* list of BPF fds per ELF section */
	uint32_t key_idx; /* RSS rule key index into BPF map */
	struct nlmsg msg;
};

struct convert_data {
	uint16_t eth_type;
	uint16_t ip_proto;
	uint8_t vlan;
	struct rte_flow *flow;
};

struct remote_rule {
	struct rte_flow_attr attr;
	struct rte_flow_item items[2];
	struct rte_flow_action actions[2];
	int mirred;
};

struct action_data {
	char id[16];

	union {
		struct tc_gact gact;
		struct tc_mirred mirred;
		struct skbedit {
			struct tc_skbedit skbedit;
			uint16_t queue;
		} skbedit;
		struct bpf {
			struct tc_act_bpf bpf;
			int bpf_fd;
			const char *annotation;
		} bpf;
	};
};

static int tap_flow_create_eth(const struct rte_flow_item *item, void *data);
static int tap_flow_create_vlan(const struct rte_flow_item *item, void *data);
static int tap_flow_create_ipv4(const struct rte_flow_item *item, void *data);
static int tap_flow_create_ipv6(const struct rte_flow_item *item, void *data);
static int tap_flow_create_udp(const struct rte_flow_item *item, void *data);
static int tap_flow_create_tcp(const struct rte_flow_item *item, void *data);
static int
tap_flow_validate(struct rte_eth_dev *dev,
		  const struct rte_flow_attr *attr,
		  const struct rte_flow_item items[],
		  const struct rte_flow_action actions[],
		  struct rte_flow_error *error);

static struct rte_flow *
tap_flow_create(struct rte_eth_dev *dev,
		const struct rte_flow_attr *attr,
		const struct rte_flow_item items[],
		const struct rte_flow_action actions[],
		struct rte_flow_error *error);

static void
tap_flow_free(struct pmd_internals *pmd,
	struct rte_flow *flow);

static int
tap_flow_destroy(struct rte_eth_dev *dev,
		 struct rte_flow *flow,
		 struct rte_flow_error *error);

static int
tap_flow_isolate(struct rte_eth_dev *dev,
		 int set,
		 struct rte_flow_error *error);

static int bpf_rss_key(enum bpf_rss_key_e cmd, __u32 *key_idx);
static int rss_enable(struct pmd_internals *pmd,
			const struct rte_flow_attr *attr,
			struct rte_flow_error *error);
static int rss_add_actions(struct rte_flow *flow, struct pmd_internals *pmd,
			const struct rte_flow_action_rss *rss,
			struct rte_flow_error *error);

static const struct rte_flow_ops tap_flow_ops = {
	.validate = tap_flow_validate,
	.create = tap_flow_create,
	.destroy = tap_flow_destroy,
	.flush = tap_flow_flush,
	.isolate = tap_flow_isolate,
};

/* Static initializer for items. */
#define ITEMS(...) \
	(const enum rte_flow_item_type []){ \
		__VA_ARGS__, RTE_FLOW_ITEM_TYPE_END, \
	}

/* Structure to generate a simple graph of layers supported by the NIC. */
struct tap_flow_items {
	/* Bit-mask corresponding to what is supported for this item. */
	const void *mask;
	const unsigned int mask_sz; /* Bit-mask size in bytes. */
	/*
	 * Bit-mask corresponding to the default mask, if none is provided
	 * along with the item.
	 */
	const void *default_mask;
	/**
	 * Conversion function from rte_flow to netlink attributes.
	 *
	 * @param item
	 *   rte_flow item to convert.
	 * @param data
	 *   Internal structure to store the conversion.
	 *
	 * @return
	 *   0 on success, negative value otherwise.
	 */
	int (*convert)(const struct rte_flow_item *item, void *data);
	/** List of possible following items. */
	const enum rte_flow_item_type *const items;
};

/* Graph of supported items and associated actions. */
static const struct tap_flow_items tap_flow_items[] = {
	[RTE_FLOW_ITEM_TYPE_END] = {
		.items = ITEMS(RTE_FLOW_ITEM_TYPE_ETH),
	},
	[RTE_FLOW_ITEM_TYPE_ETH] = {
		.items = ITEMS(
			RTE_FLOW_ITEM_TYPE_VLAN,
			RTE_FLOW_ITEM_TYPE_IPV4,
			RTE_FLOW_ITEM_TYPE_IPV6),
		.mask = &(const struct rte_flow_item_eth){
			.hdr.dst_addr.addr_bytes = "\xff\xff\xff\xff\xff\xff",
			.hdr.src_addr.addr_bytes = "\xff\xff\xff\xff\xff\xff",
			.hdr.ether_type = -1,
		},
		.mask_sz = sizeof(struct rte_flow_item_eth),
		.default_mask = &rte_flow_item_eth_mask,
		.convert = tap_flow_create_eth,
	},
	[RTE_FLOW_ITEM_TYPE_VLAN] = {
		.items = ITEMS(RTE_FLOW_ITEM_TYPE_IPV4,
			       RTE_FLOW_ITEM_TYPE_IPV6),
		.mask = &(const struct rte_flow_item_vlan){
			/* DEI matching is not supported */
#if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN
			.hdr.vlan_tci = 0xffef,
#else
			.hdr.vlan_tci = 0xefff,
#endif
			.hdr.eth_proto = -1,
		},
		.mask_sz = sizeof(struct rte_flow_item_vlan),
		.default_mask = &rte_flow_item_vlan_mask,
		.convert = tap_flow_create_vlan,
	},
	[RTE_FLOW_ITEM_TYPE_IPV4] = {
		.items = ITEMS(RTE_FLOW_ITEM_TYPE_UDP,
			       RTE_FLOW_ITEM_TYPE_TCP),
		.mask = &(const struct rte_flow_item_ipv4){
			.hdr = {
				.src_addr = -1,
				.dst_addr = -1,
				.next_proto_id = -1,
			},
		},
		.mask_sz = sizeof(struct rte_flow_item_ipv4),
		.default_mask = &rte_flow_item_ipv4_mask,
		.convert = tap_flow_create_ipv4,
	},
	[RTE_FLOW_ITEM_TYPE_IPV6] = {
		.items = ITEMS(RTE_FLOW_ITEM_TYPE_UDP,
			       RTE_FLOW_ITEM_TYPE_TCP),
		.mask = &(const struct rte_flow_item_ipv6){
			.hdr = {
				.src_addr = {
					"\xff\xff\xff\xff\xff\xff\xff\xff"
					"\xff\xff\xff\xff\xff\xff\xff\xff",
				},
				.dst_addr = {
					"\xff\xff\xff\xff\xff\xff\xff\xff"
					"\xff\xff\xff\xff\xff\xff\xff\xff",
				},
				.proto = -1,
			},
		},
		.mask_sz = sizeof(struct rte_flow_item_ipv6),
		.default_mask = &rte_flow_item_ipv6_mask,
		.convert = tap_flow_create_ipv6,
	},
	[RTE_FLOW_ITEM_TYPE_UDP] = {
		.mask = &(const struct rte_flow_item_udp){
			.hdr = {
				.src_port = -1,
				.dst_port = -1,
			},
		},
		.mask_sz = sizeof(struct rte_flow_item_udp),
		.default_mask = &rte_flow_item_udp_mask,
		.convert = tap_flow_create_udp,
	},
	[RTE_FLOW_ITEM_TYPE_TCP] = {
		.mask = &(const struct rte_flow_item_tcp){
			.hdr = {
				.src_port = -1,
				.dst_port = -1,
			},
		},
		.mask_sz = sizeof(struct rte_flow_item_tcp),
		.default_mask = &rte_flow_item_tcp_mask,
		.convert = tap_flow_create_tcp,
	},
};

/*
 *                TC rules, by growing priority
 *
 *        Remote netdevice                  Tap netdevice
 * +-------------+-------------+  +-------------+-------------+
 * |   Ingress   |   Egress    |  |   Ingress   |   Egress    |
 * |-------------|-------------|  |-------------|-------------|
 * |             |  \       /  |  |             |  REMOTE TX  | prio 1
 * |             |   \     /   |  |             |   \     /   | prio 2
 * |  EXPLICIT   |    \   /    |  |  EXPLICIT   |    \   /    |   .
 * |             |     \ /     |  |             |     \ /     |   .
 * |    RULES    |      X      |  |    RULES    |      X      |   .
 * |      .      |     / \     |  |      .      |     / \     |   .
 * |      .      |    /   \    |  |      .      |    /   \    |   .
 * |      .      |   /     \   |  |      .      |   /     \   |   .
 * |      .      |  /       \  |  |      .      |  /       \  |   .
 *
 *      ....           ....           ....           ....
 *
 * |      .      |  \       /  |  |      .      |  \       /  |   .
 * |      .      |   \     /   |  |      .      |   \     /   |   .
 * |             |    \   /    |  |             |    \   /    |
 * |  LOCAL_MAC  |     \ /     |  |    \   /    |     \ /     | last prio - 5
 * |   PROMISC   |      X      |  |     \ /     |      X      | last prio - 4
 * |   ALLMULTI  |     / \     |  |      X      |     / \     | last prio - 3
 * |  BROADCAST  |    /   \    |  |     / \     |    /   \    | last prio - 2
 * | BROADCASTV6 |   /     \   |  |    /   \    |   /     \   | last prio - 1
 * |     xx      |  /       \  |  |   ISOLATE   |  /       \  | last prio
 * +-------------+-------------+  +-------------+-------------+
 *
 * The implicit flow rules are stored in a list, with the last two mandatorily
 * being the ISOLATE and REMOTE_TX rules, e.g.:
 *
 * LOCAL_MAC -> BROADCAST -> BROADCASTV6 -> REMOTE_TX -> ISOLATE -> NULL
 *
 * That enables tap_flow_isolate() to remove implicit rules by popping the list
 * head and removing it, as long as it applies on the remote netdevice. The
 * implicit rule for TX redirection is not removed, as isolate concerns only
 * incoming traffic.
 */

static struct remote_rule implicit_rte_flows[TAP_REMOTE_MAX_IDX] = {
	[TAP_REMOTE_LOCAL_MAC] = {
		.attr = {
			.group = MAX_GROUP,
			.priority = PRIORITY_MASK - TAP_REMOTE_LOCAL_MAC,
			.ingress = 1,
		},
		.items[0] = {
			.type = RTE_FLOW_ITEM_TYPE_ETH,
			.mask = &(const struct rte_flow_item_eth){
				.hdr.dst_addr.addr_bytes = "\xff\xff\xff\xff\xff\xff",
			},
		},
		.items[1] = {
			.type = RTE_FLOW_ITEM_TYPE_END,
		},
		.mirred = TCA_EGRESS_REDIR,
	},
	[TAP_REMOTE_BROADCAST] = {
		.attr = {
			.group = MAX_GROUP,
			.priority = PRIORITY_MASK - TAP_REMOTE_BROADCAST,
			.ingress = 1,
		},
		.items[0] = {
			.type = RTE_FLOW_ITEM_TYPE_ETH,
			.mask = &(const struct rte_flow_item_eth){
				.hdr.dst_addr.addr_bytes = "\xff\xff\xff\xff\xff\xff",
			},
			.spec = &(const struct rte_flow_item_eth){
				.hdr.dst_addr.addr_bytes = "\xff\xff\xff\xff\xff\xff",
			},
		},
		.items[1] = {
			.type = RTE_FLOW_ITEM_TYPE_END,
		},
		.mirred = TCA_EGRESS_MIRROR,
	},
	[TAP_REMOTE_BROADCASTV6] = {
		.attr = {
			.group = MAX_GROUP,
			.priority = PRIORITY_MASK - TAP_REMOTE_BROADCASTV6,
			.ingress = 1,
		},
		.items[0] = {
			.type = RTE_FLOW_ITEM_TYPE_ETH,
			.mask = &(const struct rte_flow_item_eth){
				.hdr.dst_addr.addr_bytes = "\x33\x33\x00\x00\x00\x00",
			},
			.spec = &(const struct rte_flow_item_eth){
				.hdr.dst_addr.addr_bytes = "\x33\x33\x00\x00\x00\x00",
			},
		},
		.items[1] = {
			.type = RTE_FLOW_ITEM_TYPE_END,
		},
		.mirred = TCA_EGRESS_MIRROR,
	},
	[TAP_REMOTE_PROMISC] = {
		.attr = {
			.group = MAX_GROUP,
			.priority = PRIORITY_MASK - TAP_REMOTE_PROMISC,
			.ingress = 1,
		},
		.items[0] = {
			.type = RTE_FLOW_ITEM_TYPE_VOID,
		},
		.items[1] = {
			.type = RTE_FLOW_ITEM_TYPE_END,
		},
		.mirred = TCA_EGRESS_MIRROR,
	},
	[TAP_REMOTE_ALLMULTI] = {
		.attr = {
			.group = MAX_GROUP,
			.priority = PRIORITY_MASK - TAP_REMOTE_ALLMULTI,
			.ingress = 1,
		},
		.items[0] = {
			.type = RTE_FLOW_ITEM_TYPE_ETH,
			.mask = &(const struct rte_flow_item_eth){
				.hdr.dst_addr.addr_bytes = "\x01\x00\x00\x00\x00\x00",
			},
			.spec = &(const struct rte_flow_item_eth){
				.hdr.dst_addr.addr_bytes = "\x01\x00\x00\x00\x00\x00",
			},
		},
		.items[1] = {
			.type = RTE_FLOW_ITEM_TYPE_END,
		},
		.mirred = TCA_EGRESS_MIRROR,
	},
	[TAP_REMOTE_TX] = {
		.attr = {
			.group = 0,
			.priority = TAP_REMOTE_TX,
			.egress = 1,
		},
		.items[0] = {
			.type = RTE_FLOW_ITEM_TYPE_VOID,
		},
		.items[1] = {
			.type = RTE_FLOW_ITEM_TYPE_END,
		},
		.mirred = TCA_EGRESS_MIRROR,
	},
	[TAP_ISOLATE] = {
		.attr = {
			.group = MAX_GROUP,
			.priority = PRIORITY_MASK - TAP_ISOLATE,
			.ingress = 1,
		},
		.items[0] = {
			.type = RTE_FLOW_ITEM_TYPE_VOID,
		},
		.items[1] = {
			.type = RTE_FLOW_ITEM_TYPE_END,
		},
	},
};

/**
 * Perform as many checks as possible on an Ethernet item, and if a flow is
 * provided, fill it appropriately with Ethernet info.
 *
 * @param[in] item
 *   Item specification.
 * @param[in, out] data
 *   Additional data structure to tell next layers we've been here.
 *
 * @return
 *   0 if checks are alright, -1 otherwise.
 */
static int
tap_flow_create_eth(const struct rte_flow_item *item, void *data)
{
	struct convert_data *info = (struct convert_data *)data;
	const struct rte_flow_item_eth *spec = item->spec;
	const struct rte_flow_item_eth *mask = item->mask;
	struct rte_flow *flow = info->flow;
	struct nlmsg *msg;

	/* use default mask if none provided */
	if (!mask)
		mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_ETH].default_mask;
	/* TC does not support eth_type masking. Only accept if exact match. */
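	/* e.g. mask 0xffff with spec 0x0800 (ETH_P_IP) passes; 0x0fff fails. */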
	if (mask->hdr.ether_type && mask->hdr.ether_type != 0xffff)
		return -1;
	if (!spec)
		return 0;
	/* store eth_type for consistency if ipv4/6 pattern item comes next */
	if (spec->hdr.ether_type & mask->hdr.ether_type)
		info->eth_type = spec->hdr.ether_type;
	if (!flow)
		return 0;
	msg = &flow->msg;
	if (!rte_is_zero_ether_addr(&mask->hdr.dst_addr)) {
		tap_nlattr_add(&msg->nh, TCA_FLOWER_KEY_ETH_DST,
			   RTE_ETHER_ADDR_LEN,
			   &spec->hdr.dst_addr.addr_bytes);
		tap_nlattr_add(&msg->nh,
			   TCA_FLOWER_KEY_ETH_DST_MASK, RTE_ETHER_ADDR_LEN,
			   &mask->hdr.dst_addr.addr_bytes);
	}
	if (!rte_is_zero_ether_addr(&mask->hdr.src_addr)) {
		tap_nlattr_add(&msg->nh, TCA_FLOWER_KEY_ETH_SRC,
			   RTE_ETHER_ADDR_LEN,
			   &spec->hdr.src_addr.addr_bytes);
		tap_nlattr_add(&msg->nh,
			   TCA_FLOWER_KEY_ETH_SRC_MASK, RTE_ETHER_ADDR_LEN,
			   &mask->hdr.src_addr.addr_bytes);
	}
	return 0;
}

/**
 * Perform as many checks as possible on a VLAN item, and if a flow is
 * provided, fill it appropriately with VLAN info.
 *
 * @param[in] item
 *   Item specification.
 * @param[in, out] data
 *   Additional data structure to tell next layers we've been here.
 *
 * @return
 *   0 if checks are alright, -1 otherwise.
 */
static int
tap_flow_create_vlan(const struct rte_flow_item *item, void *data)
{
	struct convert_data *info = (struct convert_data *)data;
	const struct rte_flow_item_vlan *spec = item->spec;
	const struct rte_flow_item_vlan *mask = item->mask;
	struct rte_flow *flow = info->flow;
	struct nlmsg *msg;

	/* use default mask if none provided */
	if (!mask)
		mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_VLAN].default_mask;
	/* Outer TPID cannot be matched. */
	if (info->eth_type)
		return -1;
	/* Double-tagging not supported. */
	if (info->vlan)
		return -1;
	info->vlan = 1;
	if (mask->hdr.eth_proto) {
		/* TC does not support partial eth_type masking */
		if (mask->hdr.eth_proto != RTE_BE16(0xffff))
			return -1;
		info->eth_type = spec->hdr.eth_proto;
	}
	if (!flow)
		return 0;
	msg = &flow->msg;
	msg->t.tcm_info = TC_H_MAKE(msg->t.tcm_info, htons(ETH_P_8021Q));
#define VLAN_PRIO(tci) ((tci) >> 13)
#define VLAN_ID(tci) ((tci) & 0xfff)
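/* e.g. a host-order TCI of 0xa07b yields prio 5 and vid 0x07b (123). */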
	if (!spec)
		return 0;
	if (spec->hdr.vlan_tci) {
		uint16_t tci = ntohs(spec->hdr.vlan_tci) & mask->hdr.vlan_tci;
		uint16_t prio = VLAN_PRIO(tci);
		uint16_t vid = VLAN_ID(tci);

		if (prio)
			tap_nlattr_add8(&msg->nh,
					TCA_FLOWER_KEY_VLAN_PRIO, prio);
		if (vid)
			tap_nlattr_add16(&msg->nh,
					 TCA_FLOWER_KEY_VLAN_ID, vid);
	}
	return 0;
}

/**
 * Perform as many checks as possible on an IPv4 item, and if a flow is
 * provided, fill it appropriately with IPv4 info.
 *
 * @param[in] item
 *   Item specification.
 * @param[in, out] data
 *   Additional data structure to tell next layers we've been here.
 *
 * @return
 *   0 if checks are alright, -1 otherwise.
 */
static int
tap_flow_create_ipv4(const struct rte_flow_item *item, void *data)
{
	struct convert_data *info = (struct convert_data *)data;
	const struct rte_flow_item_ipv4 *spec = item->spec;
	const struct rte_flow_item_ipv4 *mask = item->mask;
	struct rte_flow *flow = info->flow;
	struct nlmsg *msg;

	/* use default mask if none provided */
	if (!mask)
		mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_IPV4].default_mask;
	/* check that previous eth type is compatible with ipv4 */
	if (info->eth_type && info->eth_type != htons(ETH_P_IP))
		return -1;
	/* store ip_proto for consistency if udp/tcp pattern item comes next */
	if (spec)
		info->ip_proto = spec->hdr.next_proto_id;
	if (!flow)
		return 0;
	msg = &flow->msg;
	if (!info->eth_type)
		info->eth_type = htons(ETH_P_IP);
	if (!spec)
		return 0;
	if (mask->hdr.dst_addr) {
		tap_nlattr_add32(&msg->nh, TCA_FLOWER_KEY_IPV4_DST,
			     spec->hdr.dst_addr);
		tap_nlattr_add32(&msg->nh, TCA_FLOWER_KEY_IPV4_DST_MASK,
			     mask->hdr.dst_addr);
	}
	if (mask->hdr.src_addr) {
		tap_nlattr_add32(&msg->nh, TCA_FLOWER_KEY_IPV4_SRC,
			     spec->hdr.src_addr);
		tap_nlattr_add32(&msg->nh, TCA_FLOWER_KEY_IPV4_SRC_MASK,
			     mask->hdr.src_addr);
	}
	if (spec->hdr.next_proto_id)
		tap_nlattr_add8(&msg->nh, TCA_FLOWER_KEY_IP_PROTO,
			    spec->hdr.next_proto_id);
	return 0;
}

/**
 * Perform as many checks as possible on an IPv6 item, and if a flow is
 * provided, fill it appropriately with IPv6 info.
 *
 * @param[in] item
 *   Item specification.
 * @param[in, out] data
 *   Additional data structure to tell next layers we've been here.
 *
 * @return
 *   0 if checks are alright, -1 otherwise.
 */
static int
tap_flow_create_ipv6(const struct rte_flow_item *item, void *data)
{
	struct convert_data *info = (struct convert_data *)data;
	const struct rte_flow_item_ipv6 *spec = item->spec;
	const struct rte_flow_item_ipv6 *mask = item->mask;
	struct rte_flow *flow = info->flow;
	uint8_t empty_addr[16] = { 0 };
	struct nlmsg *msg;

	/* use default mask if none provided */
	if (!mask)
		mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_IPV6].default_mask;
	/* check that previous eth type is compatible with ipv6 */
	if (info->eth_type && info->eth_type != htons(ETH_P_IPV6))
		return -1;
	/* store ip_proto for consistency if udp/tcp pattern item comes next */
	if (spec)
		info->ip_proto = spec->hdr.proto;
	if (!flow)
		return 0;
	msg = &flow->msg;
	if (!info->eth_type)
		info->eth_type = htons(ETH_P_IPV6);
	if (!spec)
		return 0;
	if (memcmp(mask->hdr.dst_addr, empty_addr, 16)) {
		tap_nlattr_add(&msg->nh, TCA_FLOWER_KEY_IPV6_DST,
			   sizeof(spec->hdr.dst_addr), &spec->hdr.dst_addr);
		tap_nlattr_add(&msg->nh, TCA_FLOWER_KEY_IPV6_DST_MASK,
			   sizeof(mask->hdr.dst_addr), &mask->hdr.dst_addr);
	}
	if (memcmp(mask->hdr.src_addr, empty_addr, 16)) {
		tap_nlattr_add(&msg->nh, TCA_FLOWER_KEY_IPV6_SRC,
			   sizeof(spec->hdr.src_addr), &spec->hdr.src_addr);
		tap_nlattr_add(&msg->nh, TCA_FLOWER_KEY_IPV6_SRC_MASK,
			   sizeof(mask->hdr.src_addr), &mask->hdr.src_addr);
	}
	if (spec->hdr.proto)
		tap_nlattr_add8(&msg->nh,
				TCA_FLOWER_KEY_IP_PROTO, spec->hdr.proto);
	return 0;
}

/**
 * Perform as many checks as possible on a UDP item, and if a flow is
 * provided, fill it appropriately with UDP info.
 *
 * @param[in] item
 *   Item specification.
 * @param[in, out] data
 *   Additional data structure to tell next layers we've been here.
 *
 * @return
 *   0 if checks are alright, -1 otherwise.
 */
static int
tap_flow_create_udp(const struct rte_flow_item *item, void *data)
{
	struct convert_data *info = (struct convert_data *)data;
	const struct rte_flow_item_udp *spec = item->spec;
	const struct rte_flow_item_udp *mask = item->mask;
	struct rte_flow *flow = info->flow;
	struct nlmsg *msg;

	/* use default mask if none provided */
	if (!mask)
		mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_UDP].default_mask;
	/* check that previous ip_proto is compatible with udp */
	if (info->ip_proto && info->ip_proto != IPPROTO_UDP)
		return -1;
	/* TC does not support UDP port masking. Only accept if exact match. */
	if ((mask->hdr.src_port && mask->hdr.src_port != 0xffff) ||
	    (mask->hdr.dst_port && mask->hdr.dst_port != 0xffff))
		return -1;
	if (!flow)
		return 0;
	msg = &flow->msg;
	tap_nlattr_add8(&msg->nh, TCA_FLOWER_KEY_IP_PROTO, IPPROTO_UDP);
	if (!spec)
		return 0;
	if (mask->hdr.dst_port)
		tap_nlattr_add16(&msg->nh, TCA_FLOWER_KEY_UDP_DST,
			     spec->hdr.dst_port);
	if (mask->hdr.src_port)
		tap_nlattr_add16(&msg->nh, TCA_FLOWER_KEY_UDP_SRC,
			     spec->hdr.src_port);
	return 0;
}

/**
 * Perform as many checks as possible on a TCP item, and if a flow is
 * provided, fill it appropriately with TCP info.
 *
 * @param[in] item
 *   Item specification.
 * @param[in, out] data
 *   Additional data structure to tell next layers we've been here.
 *
 * @return
 *   0 if checks are alright, -1 otherwise.
 */
static int
tap_flow_create_tcp(const struct rte_flow_item *item, void *data)
{
	struct convert_data *info = (struct convert_data *)data;
	const struct rte_flow_item_tcp *spec = item->spec;
	const struct rte_flow_item_tcp *mask = item->mask;
	struct rte_flow *flow = info->flow;
	struct nlmsg *msg;

	/* use default mask if none provided */
	if (!mask)
		mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_TCP].default_mask;
	/* check that previous ip_proto is compatible with tcp */
	if (info->ip_proto && info->ip_proto != IPPROTO_TCP)
		return -1;
	/* TC does not support TCP port masking. Only accept if exact match. */
	if ((mask->hdr.src_port && mask->hdr.src_port != 0xffff) ||
	    (mask->hdr.dst_port && mask->hdr.dst_port != 0xffff))
		return -1;
	if (!flow)
		return 0;
	msg = &flow->msg;
	tap_nlattr_add8(&msg->nh, TCA_FLOWER_KEY_IP_PROTO, IPPROTO_TCP);
	if (!spec)
		return 0;
	if (mask->hdr.dst_port)
		tap_nlattr_add16(&msg->nh, TCA_FLOWER_KEY_TCP_DST,
			     spec->hdr.dst_port);
	if (mask->hdr.src_port)
		tap_nlattr_add16(&msg->nh, TCA_FLOWER_KEY_TCP_SRC,
			     spec->hdr.src_port);
	return 0;
}

/**
 * Check support for a given item.
 *
 * @param[in] item
 *   Item specification.
 * @param size
 *   Bit-mask size in bytes.
 * @param[in] supported_mask
 *   Bit-mask covering supported fields to compare with spec, last and mask in
 *   \item.
 * @param[in] default_mask
 *   Bit-mask default mask if none is provided in \item.
 *
 * @return
 *   0 on success.
 */
static int
tap_flow_item_validate(const struct rte_flow_item *item,
		       unsigned int size,
		       const uint8_t *supported_mask,
		       const uint8_t *default_mask)
{
	int ret = 0;

	/* An empty layer is allowed, as long as all fields are NULL */
	if (!item->spec && (item->mask || item->last))
		return -1;
	/* Is the item spec compatible with what the NIC supports? */
	if (item->spec && !item->mask) {
		unsigned int i;
		const uint8_t *spec = item->spec;

		for (i = 0; i < size; ++i)
			if ((spec[i] | supported_mask[i]) != supported_mask[i])
				return -1;
		/* Is the default mask compatible with what the NIC supports? */
		for (i = 0; i < size; i++)
			if ((default_mask[i] | supported_mask[i]) !=
			    supported_mask[i])
				return -1;
	}
	/* Is the item last compatible with what the NIC supports? */
	if (item->last && !item->mask) {
		unsigned int i;
		const uint8_t *spec = item->last;

		for (i = 0; i < size; ++i)
			if ((spec[i] | supported_mask[i]) != supported_mask[i])
				return -1;
	}
	/* Is the item mask compatible with what the NIC supports? */
	if (item->mask) {
		unsigned int i;
		const uint8_t *spec = item->mask;

		for (i = 0; i < size; ++i)
			if ((spec[i] | supported_mask[i]) != supported_mask[i])
				return -1;
	}
	/*
	 * Once masked, are item spec and item last equal?
	 * TC does not support ranges, so anything else is invalid.
	 */
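	/*
	 * e.g. under a 255.255.255.0 mask, spec 10.0.0.0 and last 10.0.0.255
	 * both reduce to 10.0.0.0 and the range is accepted; with a
	 * 255.255.255.255 mask the memcmp() below rejects it.
	 */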
	if (item->spec && item->last) {
		uint8_t spec[size];
		uint8_t last[size];
		const uint8_t *apply = default_mask;
		unsigned int i;

		if (item->mask)
			apply = item->mask;
		for (i = 0; i < size; ++i) {
			spec[i] = ((const uint8_t *)item->spec)[i] & apply[i];
			last[i] = ((const uint8_t *)item->last)[i] & apply[i];
		}
		ret = memcmp(spec, last, size);
	}
	return ret;
}

/**
 * Configure the kernel with a TC action and its parameters.
 * Handled actions: "gact", "mirred", "skbedit", "bpf".
 *
 * @param[in] flow
 *   Pointer to rte flow containing the netlink message
 *
 * @param[in, out] act_index
 *   Pointer to action sequence number in the TC command
 *
 * @param[in] adata
 *   Pointer to struct holding the action parameters
 *
 * @return
 *   -1 on failure, 0 on success
 */
static int
add_action(struct rte_flow *flow, size_t *act_index, struct action_data *adata)
{
	struct nlmsg *msg = &flow->msg;

	if (tap_nlattr_nested_start(msg, (*act_index)++) < 0)
		return -1;

	tap_nlattr_add(&msg->nh, TCA_ACT_KIND,
				strlen(adata->id) + 1, adata->id);
	if (tap_nlattr_nested_start(msg, TCA_ACT_OPTIONS) < 0)
		return -1;
	if (strcmp("gact", adata->id) == 0) {
		tap_nlattr_add(&msg->nh, TCA_GACT_PARMS, sizeof(adata->gact),
			   &adata->gact);
	} else if (strcmp("mirred", adata->id) == 0) {
		if (adata->mirred.eaction == TCA_EGRESS_MIRROR)
			adata->mirred.action = TC_ACT_PIPE;
		else /* REDIRECT */
			adata->mirred.action = TC_ACT_STOLEN;
		tap_nlattr_add(&msg->nh, TCA_MIRRED_PARMS,
			   sizeof(adata->mirred),
			   &adata->mirred);
	} else if (strcmp("skbedit", adata->id) == 0) {
		tap_nlattr_add(&msg->nh, TCA_SKBEDIT_PARMS,
			   sizeof(adata->skbedit.skbedit),
			   &adata->skbedit.skbedit);
		tap_nlattr_add16(&msg->nh, TCA_SKBEDIT_QUEUE_MAPPING,
			     adata->skbedit.queue);
	} else if (strcmp("bpf", adata->id) == 0) {
		tap_nlattr_add32(&msg->nh, TCA_ACT_BPF_FD, adata->bpf.bpf_fd);
		tap_nlattr_add(&msg->nh, TCA_ACT_BPF_NAME,
			   strlen(adata->bpf.annotation) + 1,
			   adata->bpf.annotation);
		tap_nlattr_add(&msg->nh, TCA_ACT_BPF_PARMS,
			   sizeof(adata->bpf.bpf),
			   &adata->bpf.bpf);
	} else {
		return -1;
	}
	tap_nlattr_nested_finish(msg); /* nested TCA_ACT_OPTIONS */
	tap_nlattr_nested_finish(msg); /* nested act_index */
	return 0;
}
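
/*
 * For illustration, a single "skbedit" action steering to queue 3 yields a
 * nested attribute layout of roughly this shape (indentation shows nesting):
 *
 *   TCA_FLOWER_ACT
 *     1 (act_index)
 *       TCA_ACT_KIND    "skbedit"
 *       TCA_ACT_OPTIONS
 *         TCA_SKBEDIT_PARMS          { .action = TC_ACT_PIPE }
 *         TCA_SKBEDIT_QUEUE_MAPPING  3
 */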

/**
 * Helper function to send a series of TC actions to the kernel
 *
 * @param[in] flow
 *   Pointer to rte flow containing the netlink message
 *
 * @param[in] nb_actions
 *   Number of actions in an array of action structs
 *
 * @param[in] data
 *   Pointer to an array of action structs
 *
 * @param[in] classifier_action
 *   The classifier on behalf of which the actions are configured
 *
 * @return
 *   -1 on failure, 0 on success
 */
static int
add_actions(struct rte_flow *flow, int nb_actions, struct action_data *data,
	    int classifier_action)
{
	struct nlmsg *msg = &flow->msg;
	size_t act_index = 1;
	int i;

	if (tap_nlattr_nested_start(msg, classifier_action) < 0)
		return -1;
	for (i = 0; i < nb_actions; i++)
		if (add_action(flow, &act_index, data + i) < 0)
			return -1;
	tap_nlattr_nested_finish(msg); /* nested TCA_FLOWER_ACT */
	return 0;
}

/**
 * Validate a flow supported by TC.
 * If the flow param is not NULL, also fill the netlink message inside.
 *
 * @param pmd
 *   Pointer to private structure.
 * @param[in] attr
 *   Flow rule attributes.
 * @param[in] pattern
 *   Pattern specification (list terminated by the END pattern item).
 * @param[in] actions
 *   Associated actions (list terminated by the END action).
 * @param[out] error
 *   Perform verbose error reporting if not NULL.
 * @param[in, out] flow
 *   Flow structure to update.
 * @param[in] mirred
 *   If set to TCA_EGRESS_REDIR, provided actions will be replaced with a
 *   redirection to the tap netdevice, and the TC rule will be configured
 *   on the remote netdevice in pmd.
 *   If set to TCA_EGRESS_MIRROR, provided actions will be replaced with a
 *   mirroring to the tap netdevice, and the TC rule will be configured
 *   on the remote netdevice in pmd. Matching packets will thus be duplicated.
 *   If set to 0, the standard behavior is to be used: set correct actions for
 *   the TC rule, and apply it on the tap netdevice.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
static int
priv_flow_process(struct pmd_internals *pmd,
		  const struct rte_flow_attr *attr,
		  const struct rte_flow_item items[],
		  const struct rte_flow_action actions[],
		  struct rte_flow_error *error,
		  struct rte_flow *flow,
		  int mirred)
{
	const struct tap_flow_items *cur_item = tap_flow_items;
	struct convert_data data = {
		.eth_type = 0,
		.ip_proto = 0,
		.flow = flow,
	};
	int action = 0; /* Only one action authorized for now */

	if (attr->transfer) {
		rte_flow_error_set(
			error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
			NULL, "transfer is not supported");
		return -rte_errno;
	}
	if (attr->group > MAX_GROUP) {
		rte_flow_error_set(
			error, EINVAL, RTE_FLOW_ERROR_TYPE_ATTR_GROUP,
			NULL, "group value too big: cannot exceed 15");
		return -rte_errno;
	}
	if (attr->priority > MAX_PRIORITY) {
		rte_flow_error_set(
			error, EINVAL, RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
			NULL, "priority value too big");
		return -rte_errno;
	} else if (flow) {
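		/*
		 * tcm_info packs the rule priority in its upper 16 bits;
		 * the lower 16 bits keep the protocol to match (set by the
		 * caller, e.g. ETH_P_ALL in tap_flow_create()).
		 */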
		uint16_t group = attr->group << GROUP_SHIFT;
		uint16_t prio = group | (attr->priority +
				RSS_PRIORITY_OFFSET + PRIORITY_OFFSET);
		flow->msg.t.tcm_info = TC_H_MAKE(prio << 16,
						 flow->msg.t.tcm_info);
	}
	if (flow) {
		if (mirred) {
			/*
			 * If attr->ingress, the rule applies on remote ingress
			 * to match incoming packets.
			 * If attr->egress, the rule applies on tap ingress (as
			 * seen from the kernel) to deal with packets going out
			 * from the DPDK app.
			 */
			flow->msg.t.tcm_parent = TC_H_MAKE(TC_H_INGRESS, 0);
		} else {
			/* Standard rule on tap egress (kernel standpoint). */
			flow->msg.t.tcm_parent =
				TC_H_MAKE(MULTIQ_MAJOR_HANDLE, 0);
		}
		/* use flower filter type */
		tap_nlattr_add(&flow->msg.nh, TCA_KIND, sizeof("flower"), "flower");
		if (tap_nlattr_nested_start(&flow->msg, TCA_OPTIONS) < 0) {
			rte_flow_error_set(error, ENOMEM, RTE_FLOW_ERROR_TYPE_ACTION,
					   actions, "could not allocate netlink msg");
			goto exit_return_error;
		}
	}
	for (; items->type != RTE_FLOW_ITEM_TYPE_END; ++items) {
		const struct tap_flow_items *token = NULL;
		unsigned int i;
		int err = 0;

		if (items->type == RTE_FLOW_ITEM_TYPE_VOID)
			continue;
		for (i = 0;
		     cur_item->items &&
		     cur_item->items[i] != RTE_FLOW_ITEM_TYPE_END;
		     ++i) {
			if (cur_item->items[i] == items->type) {
				token = &tap_flow_items[items->type];
				break;
			}
		}
		if (!token)
			goto exit_item_not_supported;
		cur_item = token;
		err = tap_flow_item_validate(
			items, cur_item->mask_sz,
			(const uint8_t *)cur_item->mask,
			(const uint8_t *)cur_item->default_mask);
		if (err)
			goto exit_item_not_supported;
		if (flow && cur_item->convert) {
			err = cur_item->convert(items, &data);
			if (err)
				goto exit_item_not_supported;
		}
	}
	if (flow) {
		if (data.vlan) {
			tap_nlattr_add16(&flow->msg.nh, TCA_FLOWER_KEY_ETH_TYPE,
				     htons(ETH_P_8021Q));
			tap_nlattr_add16(&flow->msg.nh,
				     TCA_FLOWER_KEY_VLAN_ETH_TYPE,
				     data.eth_type ?
				     data.eth_type : htons(ETH_P_ALL));
		} else if (data.eth_type) {
			tap_nlattr_add16(&flow->msg.nh, TCA_FLOWER_KEY_ETH_TYPE,
				     data.eth_type);
		}
	}
	if (mirred && flow) {
		struct action_data adata = {
			.id = "mirred",
			.mirred = {
				.eaction = mirred,
			},
		};

		/*
		 * If attr->egress && mirred, then this is a special
		 * case where the rule must be applied on the tap, to
		 * redirect packets coming from the DPDK App, out
		 * through the remote netdevice.
		 */
		adata.mirred.ifindex = attr->ingress ? pmd->if_index :
			pmd->remote_if_index;
		if (mirred == TCA_EGRESS_MIRROR)
			adata.mirred.action = TC_ACT_PIPE;
		else
			adata.mirred.action = TC_ACT_STOLEN;
		if (add_actions(flow, 1, &adata, TCA_FLOWER_ACT) < 0)
			goto exit_action_not_supported;
		else
			goto end;
	}
actions:
	for (; actions->type != RTE_FLOW_ACTION_TYPE_END; ++actions) {
		int err = 0;

		if (actions->type == RTE_FLOW_ACTION_TYPE_VOID) {
			continue;
		} else if (actions->type == RTE_FLOW_ACTION_TYPE_DROP) {
			if (action)
				goto exit_action_not_supported;
			action = 1;
			if (flow) {
				struct action_data adata = {
					.id = "gact",
					.gact = {
						.action = TC_ACT_SHOT,
					},
				};

				err = add_actions(flow, 1, &adata,
						  TCA_FLOWER_ACT);
			}
		} else if (actions->type == RTE_FLOW_ACTION_TYPE_PASSTHRU) {
			if (action)
				goto exit_action_not_supported;
			action = 1;
			if (flow) {
				struct action_data adata = {
					.id = "gact",
					.gact = {
						/* continue */
						.action = TC_ACT_UNSPEC,
					},
				};

				err = add_actions(flow, 1, &adata,
						  TCA_FLOWER_ACT);
			}
		} else if (actions->type == RTE_FLOW_ACTION_TYPE_QUEUE) {
			const struct rte_flow_action_queue *queue =
				(const struct rte_flow_action_queue *)
				actions->conf;

			if (action)
				goto exit_action_not_supported;
			action = 1;
			if (queue->index >= pmd->dev->data->nb_rx_queues) {
				rte_flow_error_set(error, ERANGE,
						   RTE_FLOW_ERROR_TYPE_ACTION, actions,
						   "queue index out of range");
				goto exit_return_error;
			}
			if (flow) {
				struct action_data adata = {
					.id = "skbedit",
					.skbedit = {
						.skbedit = {
							.action = TC_ACT_PIPE,
						},
						.queue = queue->index,
					},
				};

				err = add_actions(flow, 1, &adata,
					TCA_FLOWER_ACT);
			}
		} else if (actions->type == RTE_FLOW_ACTION_TYPE_RSS) {
			const struct rte_flow_action_rss *rss =
				(const struct rte_flow_action_rss *)
				actions->conf;

			if (action++)
				goto exit_action_not_supported;

			if (!pmd->rss_enabled) {
				err = rss_enable(pmd, attr, error);
				if (err)
					goto exit_return_error;
			}
			if (flow)
				err = rss_add_actions(flow, pmd, rss, error);
		} else {
			goto exit_action_not_supported;
		}
		if (err)
			goto exit_return_error;
	}
	/* When fate is unknown, drop traffic. */
	if (!action) {
		static const struct rte_flow_action drop[] = {
			{ .type = RTE_FLOW_ACTION_TYPE_DROP, },
			{ .type = RTE_FLOW_ACTION_TYPE_END, },
		};

		actions = drop;
		goto actions;
	}
end:
	if (flow)
		tap_nlattr_nested_finish(&flow->msg); /* nested TCA_OPTIONS */
	return 0;
exit_item_not_supported:
	rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM,
			   items, "item not supported");
	return -rte_errno;
exit_action_not_supported:
	rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION,
			   actions, "action not supported");
exit_return_error:
	return -rte_errno;
}

/**
 * Validate a flow.
 *
 * @see rte_flow_validate()
 * @see rte_flow_ops
 */
static int
tap_flow_validate(struct rte_eth_dev *dev,
		  const struct rte_flow_attr *attr,
		  const struct rte_flow_item items[],
		  const struct rte_flow_action actions[],
		  struct rte_flow_error *error)
{
	struct pmd_internals *pmd = dev->data->dev_private;

	return priv_flow_process(pmd, attr, items, actions, error, NULL, 0);
}

/**
 * Set a unique handle in a flow.
 *
 * The kernel supports TC rules with equal priority, as long as they use the
 * same matching fields (e.g.: dst mac and ipv4) with different values (and
 * full mask to ensure no collision is possible).
 * In those rules, the handle (uint32_t) is the part that identifies each rule
 * specifically.
 *
 * Use a jhash of the flow pointer to make a unique handle.
 *
 * @param[in, out] flow
 *   The flow that needs its handle set.
 */
static void
tap_flow_set_handle(struct rte_flow *flow)
{
	union {
		struct rte_flow *flow;
		uint32_t words[sizeof(flow) / sizeof(uint32_t)];
	} tmp = {
		.flow = flow,
	};
	uint32_t handle;
	static uint64_t hash_seed;

	if (hash_seed == 0)
		hash_seed = rte_rand();

	handle = rte_jhash_32b(tmp.words, sizeof(flow) / sizeof(uint32_t), hash_seed);

	/* must be at least 1 to avoid letting the kernel choose one for us */
	if (!handle)
		handle = 1;
	flow->msg.t.tcm_handle = handle;
}

/**
 * Free the flow's opened file descriptors and allocated memory.
 *
 * @param[in] flow
 *   Pointer to the flow to free
 *
 */
static void
tap_flow_free(struct pmd_internals *pmd, struct rte_flow *flow)
{
	int i;

	if (!flow)
		return;

	if (pmd->rss_enabled) {
		/* Close flow BPF file descriptors */
		for (i = 0; i < SEC_MAX; i++)
			if (flow->bpf_fd[i] != 0) {
				close(flow->bpf_fd[i]);
				flow->bpf_fd[i] = 0;
			}

		/* Release the map key for this RSS rule */
		bpf_rss_key(KEY_CMD_RELEASE, &flow->key_idx);
		flow->key_idx = 0;
	}

	/* Free flow allocated memory */
	rte_free(flow);
}

/**
 * Create a flow.
 *
 * @see rte_flow_create()
 * @see rte_flow_ops
 */
static struct rte_flow *
tap_flow_create(struct rte_eth_dev *dev,
		const struct rte_flow_attr *attr,
		const struct rte_flow_item items[],
		const struct rte_flow_action actions[],
		struct rte_flow_error *error)
{
	struct pmd_internals *pmd = dev->data->dev_private;
	struct rte_flow *remote_flow = NULL;
	struct rte_flow *flow = NULL;
	struct nlmsg *msg = NULL;
	int err;

	if (!pmd->if_index) {
		rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE,
				   NULL,
				   "can't create rule, ifindex not found");
		goto fail;
	}
	/*
	 * No rules configured through standard rte_flow should be set on the
	 * priorities used by implicit rules.
	 */
	if ((attr->group == MAX_GROUP) &&
	    attr->priority > (MAX_PRIORITY - TAP_REMOTE_MAX_IDX)) {
		rte_flow_error_set(
			error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
			NULL, "priority value too big");
		goto fail;
	}
	flow = rte_zmalloc(__func__, sizeof(struct rte_flow), 0);
	if (!flow) {
		rte_flow_error_set(error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
				   NULL, "cannot allocate memory for rte_flow");
		goto fail;
	}
	msg = &flow->msg;
	tc_init_msg(msg, pmd->if_index, RTM_NEWTFILTER,
		    NLM_F_REQUEST | NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE);
	msg->t.tcm_info = TC_H_MAKE(0, htons(ETH_P_ALL));
	tap_flow_set_handle(flow);
	if (priv_flow_process(pmd, attr, items, actions, error, flow, 0))
		goto fail;
	err = tap_nl_send(pmd->nlsk_fd, &msg->nh);
	if (err < 0) {
		rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE,
				   NULL, "couldn't send request to kernel");
		goto fail;
	}
	err = tap_nl_recv_ack(pmd->nlsk_fd);
	if (err < 0) {
		TAP_LOG(ERR,
			"Kernel refused TC filter rule creation (%d): %s",
			errno, strerror(errno));
		rte_flow_error_set(error, EEXIST, RTE_FLOW_ERROR_TYPE_HANDLE,
				   NULL,
				   "overlapping rules or Kernel too old for flower support");
		goto fail;
	}
	LIST_INSERT_HEAD(&pmd->flows, flow, next);
	/*
	 * If a remote device is configured, a TC rule with identical items for
	 * matching must be set on that device, with a single action: redirect
	 * to the local pmd->if_index.
	 */
	if (pmd->remote_if_index) {
		remote_flow = rte_zmalloc(__func__, sizeof(struct rte_flow), 0);
		if (!remote_flow) {
			rte_flow_error_set(
				error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE, NULL,
				"cannot allocate memory for rte_flow");
			goto fail;
		}
		msg = &remote_flow->msg;
		/* set the rule if_index for the remote netdevice */
		tc_init_msg(
			msg, pmd->remote_if_index, RTM_NEWTFILTER,
			NLM_F_REQUEST | NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE);
		msg->t.tcm_info = TC_H_MAKE(0, htons(ETH_P_ALL));
		tap_flow_set_handle(remote_flow);
		if (priv_flow_process(pmd, attr, items, NULL,
				      error, remote_flow, TCA_EGRESS_REDIR)) {
			rte_flow_error_set(
				error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
				NULL, "rte flow rule validation failed");
			goto fail;
		}
		err = tap_nl_send(pmd->nlsk_fd, &msg->nh);
		if (err < 0) {
			rte_flow_error_set(
				error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
				NULL, "Failure sending nl request");
			goto fail;
		}
		err = tap_nl_recv_ack(pmd->nlsk_fd);
		if (err < 0) {
			TAP_LOG(ERR,
				"Kernel refused TC filter rule creation (%d): %s",
				errno, strerror(errno));
			rte_flow_error_set(
				error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
				NULL,
				"overlapping rules or Kernel too old for flower support");
			goto fail;
		}
		flow->remote_flow = remote_flow;
	}
	return flow;
fail:
	rte_free(remote_flow);
	if (flow)
		tap_flow_free(pmd, flow);
	return NULL;
}
1481 
1482 /**
1483  * Destroy a flow using pointer to pmd_internal.
1484  *
1485  * @param[in, out] pmd
1486  *   Pointer to private structure.
1487  * @param[in] flow
1488  *   Pointer to the flow to destroy.
1489  * @param[in, out] error
1490  *   Pointer to the flow error handler
1491  *
1492  * @return 0 if the flow could be destroyed, -1 otherwise.
1493  */
1494 static int
1495 tap_flow_destroy_pmd(struct pmd_internals *pmd,
1496 		     struct rte_flow *flow,
1497 		     struct rte_flow_error *error)
1498 {
1499 	struct rte_flow *remote_flow = flow->remote_flow;
1500 	int ret = 0;
1501 
1502 	LIST_REMOVE(flow, next);
1503 	flow->msg.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
1504 	flow->msg.nh.nlmsg_type = RTM_DELTFILTER;
1505 
1506 	ret = tap_nl_send(pmd->nlsk_fd, &flow->msg.nh);
1507 	if (ret < 0) {
1508 		rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE,
1509 				   NULL, "couldn't send request to kernel");
1510 		goto end;
1511 	}
1512 	ret = tap_nl_recv_ack(pmd->nlsk_fd);
1513 	/* If errno is ENOENT, the rule is already no longer in the kernel. */
1514 	if (ret < 0 && errno == ENOENT)
1515 		ret = 0;
1516 	if (ret < 0) {
1517 		TAP_LOG(ERR,
1518 			"Kernel refused TC filter rule deletion (%d): %s",
1519 			errno, strerror(errno));
1520 		rte_flow_error_set(
1521 			error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE, NULL,
1522 			"couldn't receive kernel ack to our request");
1523 		goto end;
1524 	}
1525 
1526 	if (remote_flow) {
1527 		remote_flow->msg.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
1528 		remote_flow->msg.nh.nlmsg_type = RTM_DELTFILTER;
1529 
1530 		ret = tap_nl_send(pmd->nlsk_fd, &remote_flow->msg.nh);
1531 		if (ret < 0) {
1532 			rte_flow_error_set(
1533 				error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
1534 				NULL, "Failure sending nl request");
1535 			goto end;
1536 		}
1537 		ret = tap_nl_recv_ack(pmd->nlsk_fd);
1538 		if (ret < 0 && errno == ENOENT)
1539 			ret = 0;
1540 		if (ret < 0) {
1541 			TAP_LOG(ERR,
1542 				"Kernel refused TC filter rule deletion (%d): %s",
1543 				errno, strerror(errno));
1544 			rte_flow_error_set(
1545 				error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
1546 				NULL, "Failure trying to receive nl ack");
1547 			goto end;
1548 		}
1549 	}
1550 end:
1551 	rte_free(remote_flow);
1552 	tap_flow_free(pmd, flow);
1553 	return ret;
1554 }
1555 
1556 /**
1557  * Destroy a flow.
1558  *
1559  * @see rte_flow_destroy()
1560  * @see rte_flow_ops
1561  */
1562 static int
1563 tap_flow_destroy(struct rte_eth_dev *dev,
1564 		 struct rte_flow *flow,
1565 		 struct rte_flow_error *error)
1566 {
1567 	struct pmd_internals *pmd = dev->data->dev_private;
1568 
1569 	return tap_flow_destroy_pmd(pmd, flow, error);
1570 }
1571 
1572 /**
1573  * Enable/disable flow isolation.
1574  *
1575  * @see rte_flow_isolate()
1576  * @see rte_flow_ops
1577  */
1578 static int
1579 tap_flow_isolate(struct rte_eth_dev *dev,
1580 		 int set,
1581 		 struct rte_flow_error *error __rte_unused)
1582 {
1583 	struct pmd_internals *pmd = dev->data->dev_private;
1584 	struct pmd_process_private *process_private = dev->process_private;
1585 
1586 	/* normalize 'set' variable to contain 0 or 1 values */
1587 	if (set)
1588 		set = 1;
1589 	/* if already in the right isolation mode - nothing to do */
1590 	if ((set ^ pmd->flow_isolate) == 0)
1591 		return 0;
1592 	/* mark the isolation mode for tap_flow_implicit_create() */
1593 	pmd->flow_isolate = set;
1594 	/*
1595 	 * If netdevice is there, setup appropriate flow rules immediately.
1596 	 * Otherwise it will be set when bringing up the netdevice (tun_alloc).
1597 	 */
1598 	if (!process_private->rxq_fds[0])
1599 		return 0;
1600 	if (set) {
1601 		struct rte_flow *remote_flow;
1602 
1603 		while (1) {
1604 			remote_flow = LIST_FIRST(&pmd->implicit_flows);
1605 			if (!remote_flow)
1606 				break;
1607 			/*
1608 			 * Remove all implicit rules on the remote.
1609 			 * Keep the local rule to redirect packets on TX.
1610 			 * Keep also the last implicit local rule: ISOLATE.
1611 			 */
1612 			if (remote_flow->msg.t.tcm_ifindex == pmd->if_index)
1613 				break;
1614 			if (tap_flow_destroy_pmd(pmd, remote_flow, NULL) < 0)
1615 				goto error;
1616 		}
1617 		/* Switch the TC rule according to pmd->flow_isolate */
1618 		if (tap_flow_implicit_create(pmd, TAP_ISOLATE) == -1)
1619 			goto error;
1620 	} else {
1621 		/* Switch the TC rule according to pmd->flow_isolate */
1622 		if (tap_flow_implicit_create(pmd, TAP_ISOLATE) == -1)
1623 			goto error;
1624 		if (!pmd->remote_if_index)
1625 			return 0;
1626 		if (tap_flow_implicit_create(pmd, TAP_REMOTE_TX) < 0)
1627 			goto error;
1628 		if (tap_flow_implicit_create(pmd, TAP_REMOTE_LOCAL_MAC) < 0)
1629 			goto error;
1630 		if (tap_flow_implicit_create(pmd, TAP_REMOTE_BROADCAST) < 0)
1631 			goto error;
1632 		if (tap_flow_implicit_create(pmd, TAP_REMOTE_BROADCASTV6) < 0)
1633 			goto error;
1634 		if (dev->data->promiscuous &&
1635 		    tap_flow_implicit_create(pmd, TAP_REMOTE_PROMISC) < 0)
1636 			goto error;
1637 		if (dev->data->all_multicast &&
1638 		    tap_flow_implicit_create(pmd, TAP_REMOTE_ALLMULTI) < 0)
1639 			goto error;
1640 	}
1641 	return 0;
1642 error:
1643 	pmd->flow_isolate = 0;
1644 	return rte_flow_error_set(
1645 		error, ENOTSUP, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
1646 		"TC rule creation failed");
1647 }
1648 
1649 /**
1650  * Destroy all flows.
1651  *
1652  * @see rte_flow_flush()
1653  * @see rte_flow_ops
1654  */
1655 int
1656 tap_flow_flush(struct rte_eth_dev *dev, struct rte_flow_error *error)
1657 {
1658 	struct pmd_internals *pmd = dev->data->dev_private;
1659 	struct rte_flow *flow;
1660 
1661 	while (!LIST_EMPTY(&pmd->flows)) {
1662 		flow = LIST_FIRST(&pmd->flows);
1663 		if (tap_flow_destroy(dev, flow, error) < 0)
1664 			return -1;
1665 	}
1666 	return 0;
1667 }
1668 
1669 /**
1670  * Add an implicit flow rule on the remote device to make sure traffic gets to
1671  * the tap netdevice from there.
1672  *
1673  * @param pmd
1674  *   Pointer to private structure.
1675  * @param[in] idx
1676  *   The idx in the implicit_rte_flows array specifying which rule to apply.
1677  *
1678  * @return -1 if the rule couldn't be applied, 0 otherwise.
1679  */
1680 int tap_flow_implicit_create(struct pmd_internals *pmd,
1681 			     enum implicit_rule_index idx)
1682 {
1683 	uint16_t flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE;
1684 	struct rte_flow_action *actions = implicit_rte_flows[idx].actions;
1685 	struct rte_flow_action isolate_actions[2] = {
1686 		[1] = {
1687 			.type = RTE_FLOW_ACTION_TYPE_END,
1688 		},
1689 	};
1690 	struct rte_flow_item *items = implicit_rte_flows[idx].items;
1691 	struct rte_flow_attr *attr = &implicit_rte_flows[idx].attr;
1692 	struct rte_flow_item_eth eth_local = { .hdr.ether_type = 0 };
1693 	unsigned int if_index = pmd->remote_if_index;
1694 	struct rte_flow *remote_flow = NULL;
1695 	struct nlmsg *msg = NULL;
1696 	int err = 0;
1697 	struct rte_flow_item items_local[2] = {
1698 		[0] = {
1699 			.type = items[0].type,
1700 			.spec = &eth_local,
1701 			.mask = items[0].mask,
1702 		},
1703 		[1] = {
1704 			.type = items[1].type,
1705 		}
1706 	};
1707 
1708 	remote_flow = rte_zmalloc(__func__, sizeof(struct rte_flow), 0);
1709 	if (!remote_flow) {
1710 		TAP_LOG(ERR, "Cannot allocate memory for rte_flow");
1711 		goto fail;
1712 	}
1713 	msg = &remote_flow->msg;
1714 	if (idx == TAP_REMOTE_TX) {
1715 		if_index = pmd->if_index;
1716 	} else if (idx == TAP_ISOLATE) {
1717 		if_index = pmd->if_index;
1718 		/* Don't be exclusive for this rule, it can be changed later. */
1719 		flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_CREATE;
1720 		isolate_actions[0].type = pmd->flow_isolate ?
1721 			RTE_FLOW_ACTION_TYPE_DROP :
1722 			RTE_FLOW_ACTION_TYPE_PASSTHRU;
1723 		actions = isolate_actions;
1724 	} else if (idx == TAP_REMOTE_LOCAL_MAC) {
1725 		/*
1726 		 * eth addr couldn't be set in implicit_rte_flows[] as it is not
1727 		 * known at compile time.
1728 		 */
1729 		memcpy(&eth_local.hdr.dst_addr, &pmd->eth_addr, sizeof(pmd->eth_addr));
1730 		items = items_local;
1731 	}
1732 	tc_init_msg(msg, if_index, RTM_NEWTFILTER, flags);
1733 	msg->t.tcm_info = TC_H_MAKE(0, htons(ETH_P_ALL));
1734 	/*
1735 	 * The ISOLATE rule is always present and must have a static handle, as
1736 	 * the action is changed depending on whether the feature is enabled
1737 	 * (DROP) or disabled (PASSTHRU).
1738 	 * There is only one REMOTE_PROMISCUOUS rule in all cases. It must also
1739 	 * have a static handle so that adding it twice fails with EEXIST on
1740 	 * any kernel version. Remark: old kernels may wrongly accept duplicate
1741 	 * REMOTE_PROMISCUOUS rules if they carry different handles.
1742 	 */
1743 	if (idx == TAP_ISOLATE)
1744 		remote_flow->msg.t.tcm_handle = ISOLATE_HANDLE;
1745 	else if (idx == TAP_REMOTE_PROMISC)
1746 		remote_flow->msg.t.tcm_handle = REMOTE_PROMISCUOUS_HANDLE;
1747 	else
1748 		tap_flow_set_handle(remote_flow);
1749 	if (priv_flow_process(pmd, attr, items, actions, NULL,
1750 			      remote_flow, implicit_rte_flows[idx].mirred)) {
1751 		TAP_LOG(ERR, "rte flow rule validation failed");
1752 		goto fail;
1753 	}
1754 	err = tap_nl_send(pmd->nlsk_fd, &msg->nh);
1755 	if (err < 0) {
1756 		TAP_LOG(ERR, "Failure sending nl request");
1757 		goto fail;
1758 	}
1759 	err = tap_nl_recv_ack(pmd->nlsk_fd);
1760 	if (err < 0) {
1761 		/* Silently ignore a rule that already exists (EEXIST) */
1762 		if (errno == EEXIST)
1763 			goto success;
1764 		TAP_LOG(ERR,
1765 			"Kernel refused TC filter rule creation (%d): %s",
1766 			errno, strerror(errno));
1767 		goto fail;
1768 	}
1769 	LIST_INSERT_HEAD(&pmd->implicit_flows, remote_flow, next);
1770 success:
1771 	return 0;
1772 fail:
1773 	rte_free(remote_flow);
1774 	return -1;
1775 }
1776 
1777 /**
1778  * Remove specific implicit flow rule on the remote device.
1779  *
1780  * @param[in, out] pmd
1781  *   Pointer to private structure.
1782  * @param[in] idx
1783  *   The idx in the implicit_rte_flows array specifying which rule to remove.
1784  *
1785  * @return -1 if the rule couldn't be removed, 0 otherwise.
1786  */
1787 int tap_flow_implicit_destroy(struct pmd_internals *pmd,
1788 			      enum implicit_rule_index idx)
1789 {
1790 	struct rte_flow *remote_flow;
1791 	int cur_prio = -1;
1792 	int idx_prio = implicit_rte_flows[idx].attr.priority + PRIORITY_OFFSET;
1793 
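	/* Each implicit rule uses a distinct attr.priority, so the TC priority
	 * stored in the upper 16 bits of tcm_info identifies which implicit
	 * rule a list entry implements. */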
1794 	for (remote_flow = LIST_FIRST(&pmd->implicit_flows);
1795 	     remote_flow;
1796 	     remote_flow = LIST_NEXT(remote_flow, next)) {
1797 		cur_prio = (remote_flow->msg.t.tcm_info >> 16) & PRIORITY_MASK;
1798 		if (cur_prio != idx_prio)
1799 			continue;
1800 		return tap_flow_destroy_pmd(pmd, remote_flow, NULL);
1801 	}
1802 	return 0;
1803 }
1804 
1805 /**
1806  * Destroy all implicit flows.
1807  *
1808  * @see rte_flow_flush()
1809  */
1810 int
1811 tap_flow_implicit_flush(struct pmd_internals *pmd, struct rte_flow_error *error)
1812 {
1813 	struct rte_flow *remote_flow;
1814 
1815 	while (!LIST_EMPTY(&pmd->implicit_flows)) {
1816 		remote_flow = LIST_FIRST(&pmd->implicit_flows);
1817 		if (tap_flow_destroy_pmd(pmd, remote_flow, error) < 0)
1818 			return -1;
1819 	}
1820 	return 0;
1821 }
1822 
1823 #define MAX_RSS_KEYS 256
1824 #define KEY_IDX_OFFSET (3 * MAX_RSS_KEYS)
1825 #define SEC_NAME_CLS_Q "cls_q"
1826 
1827 static const char *sec_name[SEC_MAX] = {
1828 	[SEC_L3_L4] = "l3_l4",
1829 };
1830 
1831 /**
1832  * Enable RSS on tap: create TC rules for queuing.
1833  *
1834  * @param[in, out] pmd
1835  *   Pointer to private structure.
1836  *
1837  * @param[in] attr
1838  *   Pointer to the flow attributes, used to retrieve the flow group
1839  *
1840  * @param[out] error
1841  *   Pointer to error reporting if not NULL.
1842  *
1843  * @return 0 on success, negative value on failure.
1844  */
1845 static int rss_enable(struct pmd_internals *pmd,
1846 			const struct rte_flow_attr *attr,
1847 			struct rte_flow_error *error)
1848 {
1849 	struct rte_flow *rss_flow = NULL;
1850 	struct nlmsg *msg = NULL;
1851 	/* BPF programs are limited to BPF_MAXINSNS (4096) instructions */
1852 	char annotation[64];	/* BPF program name passed via TCA_BPF_NAME */
1853 	int i;
1854 	int err = 0;
1855 
1856 	/* unlimit locked memory */
1857 	struct rlimit memlock_limit = {
1858 		.rlim_cur = RLIM_INFINITY,
1859 		.rlim_max = RLIM_INFINITY,
1860 	};
1861 	setrlimit(RLIMIT_MEMLOCK, &memlock_limit);
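	/* BPF maps and programs are accounted against RLIMIT_MEMLOCK (at least
	 * on older kernels); the setrlimit() return value is ignored here, as
	 * loading may still succeed under the default limit. */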
1862 
1863 	/* Initialize the BPF RSS key repository */
1864 	err = bpf_rss_key(KEY_CMD_INIT, NULL);
1865 	if (err < 0) {
1866 		rte_flow_error_set(
1867 			error, EINVAL, RTE_FLOW_ERROR_TYPE_HANDLE, NULL,
1868 			"Failed to initialize BPF RSS keys");
1869 
1870 		return -1;
1871 	}
1872 
1873 	/*
1874 	 * Create the BPF RSS map: __u32 key index -> struct rss_key entry
1875 	 */
1876 	pmd->map_fd = tap_flow_bpf_rss_map_create(sizeof(__u32), /* key size */
1877 				sizeof(struct rss_key),
1878 				MAX_RSS_KEYS);
1879 	if (pmd->map_fd < 0) {
1880 		TAP_LOG(ERR,
1881 			"Failed to create BPF map (%d): %s",
1882 				errno, strerror(errno));
1883 		rte_flow_error_set(
1884 			error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE, NULL,
1885 			"Kernel too old or not configured "
1886 			"to support BPF maps");
1887 
1888 		return -ENOTSUP;
1889 	}
1890 
1891 	/*
1892 	 * Add a rule per queue to match reclassified packets and direct them to
1893 	 * the correct queue.
1894 	 */
1895 	for (i = 0; i < pmd->dev->data->nb_rx_queues; i++) {
1896 		pmd->bpf_fd[i] = tap_flow_bpf_cls_q(i);
1897 		if (pmd->bpf_fd[i] < 0) {
1898 			TAP_LOG(ERR,
1899 				"Failed to load BPF section %s for queue %d",
1900 				SEC_NAME_CLS_Q, i);
1901 			rte_flow_error_set(
1902 				error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE,
1903 				NULL,
1904 				"Kernel too old or not configured "
1905 				"to support BPF programs loading");
1906 
1907 			return -ENOTSUP;
1908 		}
1909 
1910 		rss_flow = rte_zmalloc(__func__, sizeof(struct rte_flow), 0);
1911 		if (!rss_flow) {
1912 			TAP_LOG(ERR,
1913 				"Cannot allocate memory for rte_flow");
1914 			return -1;
1915 		}
1916 		msg = &rss_flow->msg;
1917 		tc_init_msg(msg, pmd->if_index, RTM_NEWTFILTER, NLM_F_REQUEST |
1918 			    NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE);
1919 		msg->t.tcm_info = TC_H_MAKE(0, htons(ETH_P_ALL));
1920 		tap_flow_set_handle(rss_flow);
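		/* Give each per-queue filter a distinct TC priority, derived
		 * from the flow group and the queue number, and attach it to
		 * the multiq qdisc. */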
1921 		uint16_t group = attr->group << GROUP_SHIFT;
1922 		uint16_t prio = group | (i + PRIORITY_OFFSET);
1923 		msg->t.tcm_info = TC_H_MAKE(prio << 16, msg->t.tcm_info);
1924 		msg->t.tcm_parent = TC_H_MAKE(MULTIQ_MAJOR_HANDLE, 0);
1925 
1926 		tap_nlattr_add(&msg->nh, TCA_KIND, sizeof("bpf"), "bpf");
1927 		if (tap_nlattr_nested_start(msg, TCA_OPTIONS) < 0)
1928 			return -1;
1929 		tap_nlattr_add32(&msg->nh, TCA_BPF_FD, pmd->bpf_fd[i]);
1930 		snprintf(annotation, sizeof(annotation), "[%s%d]",
1931 			SEC_NAME_CLS_Q, i);
1932 		tap_nlattr_add(&msg->nh, TCA_BPF_NAME, strlen(annotation) + 1,
1933 			   annotation);
1934 		/* Actions */
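		/* The skbedit action below rewrites skb->queue_mapping so that
		 * packets reclassified by the RSS BPF program land on rx queue
		 * i; TC_ACT_PIPE lets processing continue. */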
1935 		{
1936 			struct action_data adata = {
1937 				.id = "skbedit",
1938 				.skbedit = {
1939 					.skbedit = {
1940 						.action = TC_ACT_PIPE,
1941 					},
1942 					.queue = i,
1943 				},
1944 			};
1945 			if (add_actions(rss_flow, 1, &adata, TCA_BPF_ACT) < 0)
1946 				return -1;
1947 		}
1948 		tap_nlattr_nested_finish(msg); /* nested TCA_OPTIONS */
1949 
1950 		/* Netlink message is now ready to be sent */
1951 		if (tap_nl_send(pmd->nlsk_fd, &msg->nh) < 0)
1952 			return -1;
1953 		err = tap_nl_recv_ack(pmd->nlsk_fd);
1954 		if (err < 0) {
1955 			TAP_LOG(ERR,
1956 				"Kernel refused TC filter rule creation (%d): %s",
1957 				errno, strerror(errno));
1958 			return err;
1959 		}
1960 		LIST_INSERT_HEAD(&pmd->rss_flows, rss_flow, next);
1961 	}
1962 
1963 	pmd->rss_enabled = 1;
1964 	return err;
1965 }
1966 
1967 /**
1968  * Manage the BPF RSS key repository: init, get, release, deinit
1969  *
1970  * @param[in] cmd
1971  *   Command on RSS keys: init, get, release, deinit
1972  *
1973  * @param[in, out] key_idx
1974  *   Pointer to the RSS key index (out for get, in for release)
1975  *
1976  * @return -1 if the keys couldn't be initialized or obtained, 0 otherwise.
1977  */
1978 static int bpf_rss_key(enum bpf_rss_key_e cmd, __u32 *key_idx)
1979 {
1980 	__u32 i;
1981 	int err = 0;
1982 	static __u32 num_used_keys;
1983 	static __u32 rss_keys[MAX_RSS_KEYS] = {KEY_STAT_UNSPEC};
1984 	static __u32 rss_keys_initialized;
1985 	__u32 key;
1986 
1987 	switch (cmd) {
1988 	case KEY_CMD_GET:
1989 		if (!rss_keys_initialized) {
1990 			err = -1;
1991 			break;
1992 		}
1993 
1994 		if (num_used_keys == RTE_DIM(rss_keys)) {
1995 			err = -1;
1996 			break;
1997 		}
1998 
1999 		*key_idx = num_used_keys % RTE_DIM(rss_keys);
2000 		while (rss_keys[*key_idx] == KEY_STAT_USED)
2001 			*key_idx = (*key_idx + 1) % RTE_DIM(rss_keys);
2002 
2003 		rss_keys[*key_idx] = KEY_STAT_USED;
2004 
2005 		/*
2006 		 * Add an offset to key_idx in order to handle a case of
2007 		 * RSS and non RSS flows mixture.
2008 		 * If a non RSS flow is destroyed it has an eBPF map
2009 		 * index 0 (initialized on flow creation) and might
2010 		 * unintentionally remove RSS entry 0 from eBPF map.
2011 		 * To avoid this issue, add an offset to the real index
2012 		 * during a KEY_CMD_GET operation and subtract this offset
2013 		 * during a KEY_CMD_RELEASE operation in order to restore
2014 		 * the real index.
2015 		 */
2016 		*key_idx += KEY_IDX_OFFSET;
2017 		num_used_keys++;
2018 		break;
2019 
2020 	case KEY_CMD_RELEASE:
2021 		if (!rss_keys_initialized)
2022 			break;
2023 
2024 		/*
2025 		 * Subtract offset to restore real key index
2026 		 * If a non RSS flow is falsely trying to release map
2027 		 * entry 0 - the offset subtraction will calculate the real
2028 		 * map index as an out-of-range value and the release operation
2029 		 * will be silently ignored.
2030 		 */
2031 		key = *key_idx - KEY_IDX_OFFSET;
2032 		if (key >= RTE_DIM(rss_keys))
2033 			break;
2034 
2035 		if (rss_keys[key] == KEY_STAT_USED) {
2036 			rss_keys[key] = KEY_STAT_AVAILABLE;
2037 			num_used_keys--;
2038 		}
2039 		break;
2040 
2041 	case KEY_CMD_INIT:
2042 		for (i = 0; i < RTE_DIM(rss_keys); i++)
2043 			rss_keys[i] = KEY_STAT_AVAILABLE;
2044 
2045 		rss_keys_initialized = 1;
2046 		num_used_keys = 0;
2047 		break;
2048 
2049 	case KEY_CMD_DEINIT:
2050 		for (i = 0; i < RTE_DIM(rss_keys); i++)
2051 			rss_keys[i] = KEY_STAT_UNSPEC;
2052 
2053 		rss_keys_initialized = 0;
2054 		num_used_keys = 0;
2055 		break;
2056 
2057 	default:
2058 		break;
2059 	}
2060 
2061 	return err;
2062 }
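
/*
 * A sketch of the intended key life cycle (KEY_CMD_INIT is issued from
 * rss_enable(), KEY_CMD_GET from rss_add_actions(); RELEASE and DEINIT are
 * expected from the flow and device tear-down paths):
 *
 *	__u32 idx;
 *	bpf_rss_key(KEY_CMD_INIT, NULL);
 *	bpf_rss_key(KEY_CMD_GET, &idx);      idx = real index + KEY_IDX_OFFSET
 *	bpf_rss_key(KEY_CMD_RELEASE, &idx);
 *	bpf_rss_key(KEY_CMD_DEINIT, NULL);
 */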
2063 
2064 /**
2065  * Add RSS hash calculations and queue selection
2066  *
 * @param[in, out] flow
 *   Pointer to the flow to receive the RSS BPF action and map entry
 *
2067  * @param[in, out] pmd
2068  *   Pointer to internal structure. Used to set/get RSS map fd
2069  *
2070  * @param[in] rss
2071  *   Pointer to RSS flow actions
2072  *
2073  * @param[out] error
2074  *   Pointer to error reporting if not NULL.
2075  *
2076  * @return 0 on success, negative value on failure
2077  */
2078 static int rss_add_actions(struct rte_flow *flow, struct pmd_internals *pmd,
2079 			   const struct rte_flow_action_rss *rss,
2080 			   struct rte_flow_error *error)
2081 {
2082 	/* BPF programs are limited to BPF_MAXINSNS (4096) instructions */
2083 	unsigned int i;
2084 	int err;
2085 	struct rss_key rss_entry = { .hash_fields = 0,
2086 				     .key_size = 0 };
2087 
2088 	/* Check supported RSS features */
2089 	if (rss->func != RTE_ETH_HASH_FUNCTION_DEFAULT)
2090 		return rte_flow_error_set
2091 			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
2092 			 "non-default RSS hash functions are not supported");
2093 	if (rss->level)
2094 		return rte_flow_error_set
2095 			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
2096 			 "a nonzero RSS encapsulation level is not supported");
2097 
2098 	/* Get a new map key for a new RSS rule */
2099 	err = bpf_rss_key(KEY_CMD_GET, &flow->key_idx);
2100 	if (err < 0) {
2101 		rte_flow_error_set(
2102 			error, EINVAL, RTE_FLOW_ERROR_TYPE_HANDLE, NULL,
2103 			"Failed to get BPF RSS key");
2104 
2105 		return -1;
2106 	}
2107 
2108 	/* Update RSS map entry with queues */
2109 	rss_entry.nb_queues = rss->queue_num;
2110 	for (i = 0; i < rss->queue_num; i++)
2111 		rss_entry.queues[i] = rss->queue[i];
2112 	rss_entry.hash_fields =
2113 		(1 << HASH_FIELD_IPV4_L3_L4) | (1 << HASH_FIELD_IPV6_L3_L4);
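	/* Note: rss->types is not taken into account here; hashing is always
	 * performed on the IPv4/IPv6 L3+L4 fields. */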
2114 
2115 	/* Add this RSS entry to map */
2116 	err = tap_flow_bpf_update_rss_elem(pmd->map_fd,
2117 				&flow->key_idx, &rss_entry);
2118 
2119 	if (err) {
2120 		TAP_LOG(ERR,
2121 			"Failed to update BPF map entry #%u (%d): %s",
2122 			flow->key_idx, errno, strerror(errno));
2123 		rte_flow_error_set(
2124 			error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE, NULL,
2125 			"Kernel too old or not configured "
2126 			"to support BPF maps updates");
2127 
2128 		return -ENOTSUP;
2129 	}
2130
2132 	/*
2133 	 * Load the BPF program that computes the packet hash for this key_idx
2134 	 */
2135 
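	/* Each RSS flow loads its own copy of the l3_l4 program, parameterized
	 * with key_idx so the program can fetch this flow's rss_key entry (and
	 * thus its queue list) from map_fd. */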
2136 	flow->bpf_fd[SEC_L3_L4] =
2137 		tap_flow_bpf_calc_l3_l4_hash(flow->key_idx, pmd->map_fd);
2138 	if (flow->bpf_fd[SEC_L3_L4] < 0) {
2139 		TAP_LOG(ERR,
2140 			"Failed to load BPF section %s (%d): %s",
2141 				sec_name[SEC_L3_L4], errno, strerror(errno));
2142 		rte_flow_error_set(
2143 			error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE, NULL,
2144 			"Kernel too old or not configured "
2145 			"to support BPF program loading");
2146 
2147 		return -ENOTSUP;
2148 	}
2149 
2150 	/* Actions */
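	/* Attach the hash program as a TC "bpf" action on this flow's filter;
	 * packets it reclassifies are then matched by the per-queue cls_q
	 * filters installed in rss_enable(). */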
2151 	{
2152 		struct action_data adata[] = {
2153 			{
2154 				.id = "bpf",
2155 				.bpf = {
2156 					.bpf_fd = flow->bpf_fd[SEC_L3_L4],
2157 					.annotation = sec_name[SEC_L3_L4],
2158 					.bpf = {
2159 						.action = TC_ACT_PIPE,
2160 					},
2161 				},
2162 			},
2163 		};
2164 
2165 		if (add_actions(flow, RTE_DIM(adata), adata,
2166 			TCA_FLOWER_ACT) < 0)
2167 			return -1;
2168 	}
2169 
2170 	return 0;
2171 }
2172 
2173 /**
2174  * Get rte_flow operations.
2175  *
2176  * @param dev
2177  *   Pointer to Ethernet device structure.
2178  * @param ops
2179  *   Pointer to operation-specific structure.
2180  *
2181  * @return
2182  *   0 on success, negative errno value on failure.
2183  */
2184 int
2185 tap_dev_flow_ops_get(struct rte_eth_dev *dev __rte_unused,
2186 		     const struct rte_flow_ops **ops)
2187 {
2188 	*ops = &tap_flow_ops;
2189 	return 0;
2190 }
2191