xref: /dpdk/drivers/net/tap/tap_flow.c (revision 89f0711f9ddfb5822da9d34f384b92f72a61c4dc)
/*-
 *   BSD LICENSE
 *
 *   Copyright 2017 6WIND S.A.
 *   Copyright 2017 Mellanox.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of 6WIND S.A. nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <errno.h>
#include <string.h>
#include <unistd.h>
#include <sys/queue.h>
#include <sys/resource.h>

#include <rte_byteorder.h>
#include <rte_jhash.h>
#include <rte_malloc.h>
#include <rte_eth_tap.h>
#include <tap_flow.h>
#include <tap_autoconf.h>
#include <tap_tcmsgs.h>
#include <tap_rss.h>

#ifndef HAVE_TC_FLOWER
/*
 * For kernels < 4.2, this enum is not defined. Runtime checks will be made to
 * avoid sending TC messages the kernel cannot understand.
 */
enum {
	TCA_FLOWER_UNSPEC,
	TCA_FLOWER_CLASSID,
	TCA_FLOWER_INDEV,
	TCA_FLOWER_ACT,
	TCA_FLOWER_KEY_ETH_DST,         /* ETH_ALEN */
	TCA_FLOWER_KEY_ETH_DST_MASK,    /* ETH_ALEN */
	TCA_FLOWER_KEY_ETH_SRC,         /* ETH_ALEN */
	TCA_FLOWER_KEY_ETH_SRC_MASK,    /* ETH_ALEN */
	TCA_FLOWER_KEY_ETH_TYPE,        /* be16 */
	TCA_FLOWER_KEY_IP_PROTO,        /* u8 */
	TCA_FLOWER_KEY_IPV4_SRC,        /* be32 */
	TCA_FLOWER_KEY_IPV4_SRC_MASK,   /* be32 */
	TCA_FLOWER_KEY_IPV4_DST,        /* be32 */
	TCA_FLOWER_KEY_IPV4_DST_MASK,   /* be32 */
	TCA_FLOWER_KEY_IPV6_SRC,        /* struct in6_addr */
	TCA_FLOWER_KEY_IPV6_SRC_MASK,   /* struct in6_addr */
	TCA_FLOWER_KEY_IPV6_DST,        /* struct in6_addr */
	TCA_FLOWER_KEY_IPV6_DST_MASK,   /* struct in6_addr */
	TCA_FLOWER_KEY_TCP_SRC,         /* be16 */
	TCA_FLOWER_KEY_TCP_DST,         /* be16 */
	TCA_FLOWER_KEY_UDP_SRC,         /* be16 */
	TCA_FLOWER_KEY_UDP_DST,         /* be16 */
};
#endif
#ifndef HAVE_TC_VLAN_ID
enum {
	/* TCA_FLOWER_FLAGS, */
	TCA_FLOWER_KEY_VLAN_ID = TCA_FLOWER_KEY_UDP_DST + 2, /* be16 */
	TCA_FLOWER_KEY_VLAN_PRIO,       /* u8   */
	TCA_FLOWER_KEY_VLAN_ETH_TYPE,   /* be16 */
};
#endif
/*
 * For kernels < 4.2, BPF-related enums may not be defined.
 * Runtime checks will be carried out to gracefully report on TC messages that
 * are rejected by the kernel. Possible rejection reasons:
 * 1. the enum is not defined
 * 2. the enum is defined but the kernel is not configured to support BPF
 *    system calls, BPF classifiers or BPF actions.
 */
#ifndef HAVE_TC_BPF
enum {
	TCA_BPF_UNSPEC,
	TCA_BPF_ACT,
	TCA_BPF_POLICE,
	TCA_BPF_CLASSID,
	TCA_BPF_OPS_LEN,
	TCA_BPF_OPS,
};
#endif
#ifndef HAVE_TC_BPF_FD
enum {
	TCA_BPF_FD = TCA_BPF_OPS + 1,
	TCA_BPF_NAME,
};
#endif
#ifndef HAVE_TC_ACT_BPF
#define tc_gen \
	__u32                 index; \
	__u32                 capab; \
	int                   action; \
	int                   refcnt; \
	int                   bindcnt

struct tc_act_bpf {
	tc_gen;
};

enum {
	TCA_ACT_BPF_UNSPEC,
	TCA_ACT_BPF_TM,
	TCA_ACT_BPF_PARMS,
	TCA_ACT_BPF_OPS_LEN,
	TCA_ACT_BPF_OPS,
};

#endif
#ifndef HAVE_TC_ACT_BPF_FD
enum {
	TCA_ACT_BPF_FD = TCA_ACT_BPF_OPS + 1,
	TCA_ACT_BPF_NAME,
};
#endif

/* RSS key management */
enum bpf_rss_key_e {
	KEY_CMD_GET = 1,
	KEY_CMD_RELEASE,
	KEY_CMD_INIT,
	KEY_CMD_DEINIT,
};

enum key_status_e {
	KEY_STAT_UNSPEC,
	KEY_STAT_USED,
	KEY_STAT_AVAILABLE,
};

#define ISOLATE_HANDLE 1

struct rte_flow {
	LIST_ENTRY(rte_flow) next; /* Pointer to the next rte_flow structure */
	struct rte_flow *remote_flow; /* associated remote flow */
	int bpf_fd[SEC_MAX]; /* list of BPF fds per ELF section */
	uint32_t key_idx; /* RSS rule key index into BPF map */
	struct nlmsg msg;
};

struct convert_data {
	uint16_t eth_type;
	uint16_t ip_proto;
	uint8_t vlan;
	struct rte_flow *flow;
};

struct remote_rule {
	struct rte_flow_attr attr;
	struct rte_flow_item items[2];
	struct rte_flow_action actions[2];
	int mirred;
};

struct action_data {
	char id[16];

	union {
		struct tc_gact gact;
		struct tc_mirred mirred;
		struct skbedit {
			struct tc_skbedit skbedit;
			uint16_t queue;
		} skbedit;
		struct bpf {
			struct tc_act_bpf bpf;
			int bpf_fd;
			const char *annotation;
		} bpf;
	};
};

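/*
 * For instance, a "skbedit" action steering matched packets to Rx queue 3 is
 * described as:
 *
 *   struct action_data adata = {
 *           .id = "skbedit",
 *           .skbedit = {
 *                   .skbedit = { .action = TC_ACT_PIPE },
 *                   .queue = 3,
 *           },
 *   };
 *
 * and serialized into netlink attributes by add_action() below.
 */
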
static int tap_flow_create_eth(const struct rte_flow_item *item, void *data);
static int tap_flow_create_vlan(const struct rte_flow_item *item, void *data);
static int tap_flow_create_ipv4(const struct rte_flow_item *item, void *data);
static int tap_flow_create_ipv6(const struct rte_flow_item *item, void *data);
static int tap_flow_create_udp(const struct rte_flow_item *item, void *data);
static int tap_flow_create_tcp(const struct rte_flow_item *item, void *data);
static int
tap_flow_validate(struct rte_eth_dev *dev,
		  const struct rte_flow_attr *attr,
		  const struct rte_flow_item items[],
		  const struct rte_flow_action actions[],
		  struct rte_flow_error *error);

static struct rte_flow *
tap_flow_create(struct rte_eth_dev *dev,
		const struct rte_flow_attr *attr,
		const struct rte_flow_item items[],
		const struct rte_flow_action actions[],
		struct rte_flow_error *error);

static int
tap_flow_destroy(struct rte_eth_dev *dev,
		 struct rte_flow *flow,
		 struct rte_flow_error *error);

static int
tap_flow_isolate(struct rte_eth_dev *dev,
		 int set,
		 struct rte_flow_error *error);

static int bpf_rss_key(enum bpf_rss_key_e cmd, __u32 *key_idx);
static int rss_enable(struct pmd_internals *pmd,
			const struct rte_flow_attr *attr,
			struct rte_flow_error *error);
static int rss_add_actions(struct rte_flow *flow, struct pmd_internals *pmd,
			const struct rte_flow_action_rss *rss,
			struct rte_flow_error *error);

static const struct rte_flow_ops tap_flow_ops = {
	.validate = tap_flow_validate,
	.create = tap_flow_create,
	.destroy = tap_flow_destroy,
	.flush = tap_flow_flush,
	.isolate = tap_flow_isolate,
};

/* Static initializer for items. */
#define ITEMS(...) \
	(const enum rte_flow_item_type []){ \
		__VA_ARGS__, RTE_FLOW_ITEM_TYPE_END, \
	}

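/*
 * For example, ITEMS(RTE_FLOW_ITEM_TYPE_VLAN, RTE_FLOW_ITEM_TYPE_IPV4)
 * expands to the anonymous array:
 *
 *   (const enum rte_flow_item_type []){
 *           RTE_FLOW_ITEM_TYPE_VLAN, RTE_FLOW_ITEM_TYPE_IPV4,
 *           RTE_FLOW_ITEM_TYPE_END,
 *   }
 *
 * i.e. an END-terminated list of the item types allowed to follow.
 */
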
/* Structure to generate a simple graph of layers supported by the NIC. */
struct tap_flow_items {
	/* Bit-mask corresponding to what is supported for this item. */
	const void *mask;
	const unsigned int mask_sz; /* Bit-mask size in bytes. */
	/*
	 * Bit-mask corresponding to the default mask, if none is provided
	 * along with the item.
	 */
	const void *default_mask;
	/**
	 * Conversion function from rte_flow to netlink attributes.
	 *
	 * @param item
	 *   rte_flow item to convert.
	 * @param data
	 *   Internal structure to store the conversion.
	 *
	 * @return
	 *   0 on success, negative value otherwise.
	 */
	int (*convert)(const struct rte_flow_item *item, void *data);
	/** List of possible following items. */
	const enum rte_flow_item_type *const items;
};

/* Graph of supported items and associated actions. */
static const struct tap_flow_items tap_flow_items[] = {
	[RTE_FLOW_ITEM_TYPE_END] = {
		.items = ITEMS(RTE_FLOW_ITEM_TYPE_ETH),
	},
	[RTE_FLOW_ITEM_TYPE_ETH] = {
		.items = ITEMS(
			RTE_FLOW_ITEM_TYPE_VLAN,
			RTE_FLOW_ITEM_TYPE_IPV4,
			RTE_FLOW_ITEM_TYPE_IPV6),
		.mask = &(const struct rte_flow_item_eth){
			.dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
			.src.addr_bytes = "\xff\xff\xff\xff\xff\xff",
			.type = -1,
		},
		.mask_sz = sizeof(struct rte_flow_item_eth),
		.default_mask = &rte_flow_item_eth_mask,
		.convert = tap_flow_create_eth,
	},
	[RTE_FLOW_ITEM_TYPE_VLAN] = {
		.items = ITEMS(RTE_FLOW_ITEM_TYPE_IPV4,
			       RTE_FLOW_ITEM_TYPE_IPV6),
		.mask = &(const struct rte_flow_item_vlan){
			.tpid = -1,
			/* DEI matching is not supported */
#if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN
			.tci = 0xffef,
#else
			.tci = 0xefff,
#endif
		},
		.mask_sz = sizeof(struct rte_flow_item_vlan),
		.default_mask = &rte_flow_item_vlan_mask,
		.convert = tap_flow_create_vlan,
	},
	[RTE_FLOW_ITEM_TYPE_IPV4] = {
		.items = ITEMS(RTE_FLOW_ITEM_TYPE_UDP,
			       RTE_FLOW_ITEM_TYPE_TCP),
		.mask = &(const struct rte_flow_item_ipv4){
			.hdr = {
				.src_addr = -1,
				.dst_addr = -1,
				.next_proto_id = -1,
			},
		},
		.mask_sz = sizeof(struct rte_flow_item_ipv4),
		.default_mask = &rte_flow_item_ipv4_mask,
		.convert = tap_flow_create_ipv4,
	},
	[RTE_FLOW_ITEM_TYPE_IPV6] = {
		.items = ITEMS(RTE_FLOW_ITEM_TYPE_UDP,
			       RTE_FLOW_ITEM_TYPE_TCP),
		.mask = &(const struct rte_flow_item_ipv6){
			.hdr = {
				.src_addr = {
					"\xff\xff\xff\xff\xff\xff\xff\xff"
					"\xff\xff\xff\xff\xff\xff\xff\xff",
				},
				.dst_addr = {
					"\xff\xff\xff\xff\xff\xff\xff\xff"
					"\xff\xff\xff\xff\xff\xff\xff\xff",
				},
				.proto = -1,
			},
		},
		.mask_sz = sizeof(struct rte_flow_item_ipv6),
		.default_mask = &rte_flow_item_ipv6_mask,
		.convert = tap_flow_create_ipv6,
	},
	[RTE_FLOW_ITEM_TYPE_UDP] = {
		.mask = &(const struct rte_flow_item_udp){
			.hdr = {
				.src_port = -1,
				.dst_port = -1,
			},
		},
		.mask_sz = sizeof(struct rte_flow_item_udp),
		.default_mask = &rte_flow_item_udp_mask,
		.convert = tap_flow_create_udp,
	},
	[RTE_FLOW_ITEM_TYPE_TCP] = {
		.mask = &(const struct rte_flow_item_tcp){
			.hdr = {
				.src_port = -1,
				.dst_port = -1,
			},
		},
		.mask_sz = sizeof(struct rte_flow_item_tcp),
		.default_mask = &rte_flow_item_tcp_mask,
		.convert = tap_flow_create_tcp,
	},
};

/*
 *                TC rules, by growing priority
 *
 *        Remote netdevice                  Tap netdevice
 * +-------------+-------------+  +-------------+-------------+
 * |   Ingress   |   Egress    |  |   Ingress   |   Egress    |
 * |-------------|-------------|  |-------------|-------------|
 * |             |  \       /  |  |             |  REMOTE TX  | prio 1
 * |             |   \     /   |  |             |   \     /   | prio 2
 * |  EXPLICIT   |    \   /    |  |  EXPLICIT   |    \   /    |   .
 * |             |     \ /     |  |             |     \ /     |   .
 * |    RULES    |      X      |  |    RULES    |      X      |   .
 * |      .      |     / \     |  |      .      |     / \     |   .
 * |      .      |    /   \    |  |      .      |    /   \    |   .
 * |      .      |   /     \   |  |      .      |   /     \   |   .
 * |      .      |  /       \  |  |      .      |  /       \  |   .
 *
 *      ....           ....           ....           ....
 *
 * |      .      |  \       /  |  |      .      |  \       /  |   .
 * |      .      |   \     /   |  |      .      |   \     /   |   .
 * |             |    \   /    |  |             |    \   /    |
 * |  LOCAL_MAC  |     \ /     |  |    \   /    |     \ /     | last prio - 5
 * |   PROMISC   |      X      |  |     \ /     |      X      | last prio - 4
 * |   ALLMULTI  |     / \     |  |      X      |     / \     | last prio - 3
 * |  BROADCAST  |    /   \    |  |     / \     |    /   \    | last prio - 2
 * | BROADCASTV6 |   /     \   |  |    /   \    |   /     \   | last prio - 1
 * |     xx      |  /       \  |  |   ISOLATE   |  /       \  | last prio
 * +-------------+-------------+  +-------------+-------------+
 *
 * The implicit flow rules are stored in a list, with the last two mandatorily
 * being the ISOLATE and REMOTE_TX rules, e.g.:
 *
 * LOCAL_MAC -> BROADCAST -> BROADCASTV6 -> REMOTE_TX -> ISOLATE -> NULL
 *
 * That enables tap_flow_isolate() to remove implicit rules by popping the list
 * head and removing it as long as it applies on the remote netdevice. The
 * implicit rule for TX redirection is not removed, as isolate concerns only
 * incoming traffic.
 */

static struct remote_rule implicit_rte_flows[TAP_REMOTE_MAX_IDX] = {
	[TAP_REMOTE_LOCAL_MAC] = {
		.attr = {
			.group = MAX_GROUP,
			.priority = PRIORITY_MASK - TAP_REMOTE_LOCAL_MAC,
			.ingress = 1,
		},
		.items[0] = {
			.type = RTE_FLOW_ITEM_TYPE_ETH,
			.mask = &(const struct rte_flow_item_eth){
				.dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
			},
		},
		.items[1] = {
			.type = RTE_FLOW_ITEM_TYPE_END,
		},
		.mirred = TCA_EGRESS_REDIR,
	},
	[TAP_REMOTE_BROADCAST] = {
		.attr = {
			.group = MAX_GROUP,
			.priority = PRIORITY_MASK - TAP_REMOTE_BROADCAST,
			.ingress = 1,
		},
		.items[0] = {
			.type = RTE_FLOW_ITEM_TYPE_ETH,
			.mask = &(const struct rte_flow_item_eth){
				.dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
			},
			.spec = &(const struct rte_flow_item_eth){
				.dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
			},
		},
		.items[1] = {
			.type = RTE_FLOW_ITEM_TYPE_END,
		},
		.mirred = TCA_EGRESS_MIRROR,
	},
	[TAP_REMOTE_BROADCASTV6] = {
		.attr = {
			.group = MAX_GROUP,
			.priority = PRIORITY_MASK - TAP_REMOTE_BROADCASTV6,
			.ingress = 1,
		},
		.items[0] = {
			.type = RTE_FLOW_ITEM_TYPE_ETH,
			.mask = &(const struct rte_flow_item_eth){
				.dst.addr_bytes = "\x33\x33\x00\x00\x00\x00",
			},
			.spec = &(const struct rte_flow_item_eth){
				.dst.addr_bytes = "\x33\x33\x00\x00\x00\x00",
			},
		},
		.items[1] = {
			.type = RTE_FLOW_ITEM_TYPE_END,
		},
		.mirred = TCA_EGRESS_MIRROR,
	},
	[TAP_REMOTE_PROMISC] = {
		.attr = {
			.group = MAX_GROUP,
			.priority = PRIORITY_MASK - TAP_REMOTE_PROMISC,
			.ingress = 1,
		},
		.items[0] = {
			.type = RTE_FLOW_ITEM_TYPE_VOID,
		},
		.items[1] = {
			.type = RTE_FLOW_ITEM_TYPE_END,
		},
		.mirred = TCA_EGRESS_MIRROR,
	},
	[TAP_REMOTE_ALLMULTI] = {
		.attr = {
			.group = MAX_GROUP,
			.priority = PRIORITY_MASK - TAP_REMOTE_ALLMULTI,
			.ingress = 1,
		},
		.items[0] = {
			.type = RTE_FLOW_ITEM_TYPE_ETH,
			.mask = &(const struct rte_flow_item_eth){
				.dst.addr_bytes = "\x01\x00\x00\x00\x00\x00",
			},
			.spec = &(const struct rte_flow_item_eth){
				.dst.addr_bytes = "\x01\x00\x00\x00\x00\x00",
			},
		},
		.items[1] = {
			.type = RTE_FLOW_ITEM_TYPE_END,
		},
		.mirred = TCA_EGRESS_MIRROR,
	},
	[TAP_REMOTE_TX] = {
		.attr = {
			.group = 0,
			.priority = TAP_REMOTE_TX,
			.egress = 1,
		},
		.items[0] = {
			.type = RTE_FLOW_ITEM_TYPE_VOID,
		},
		.items[1] = {
			.type = RTE_FLOW_ITEM_TYPE_END,
		},
		.mirred = TCA_EGRESS_MIRROR,
	},
	[TAP_ISOLATE] = {
		.attr = {
			.group = MAX_GROUP,
			.priority = PRIORITY_MASK - TAP_ISOLATE,
			.ingress = 1,
		},
		.items[0] = {
			.type = RTE_FLOW_ITEM_TYPE_VOID,
		},
		.items[1] = {
			.type = RTE_FLOW_ITEM_TYPE_END,
		},
	},
};

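/*
 * Each implicit rule above is installed at priority PRIORITY_MASK - <index>,
 * i.e. in the lowest-precedence band sketched in the diagram, so explicit
 * rte_flow rules (which get lower pref values) always match first.
 */
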
/**
 * Make as many checks as possible on an Ethernet item, and if a flow is
 * provided, fill it appropriately with Ethernet info.
 *
 * @param[in] item
 *   Item specification.
 * @param[in, out] data
 *   Additional data structure to tell next layers we've been here.
 *
 * @return
 *   0 if checks are alright, -1 otherwise.
 */
static int
tap_flow_create_eth(const struct rte_flow_item *item, void *data)
{
	struct convert_data *info = (struct convert_data *)data;
	const struct rte_flow_item_eth *spec = item->spec;
	const struct rte_flow_item_eth *mask = item->mask;
	struct rte_flow *flow = info->flow;
	struct nlmsg *msg;

	/* use default mask if none provided */
	if (!mask)
		mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_ETH].default_mask;
	/* TC does not support eth_type masking. Only accept if exact match. */
	if (mask->type && mask->type != 0xffff)
		return -1;
	if (!spec)
		return 0;
	/* store eth_type for consistency if ipv4/6 pattern item comes next */
	if (spec->type & mask->type)
		info->eth_type = spec->type;
	if (!flow)
		return 0;
	msg = &flow->msg;
	/*
	 * Key off the mask, as for the source address below: a zero spec
	 * address with a non-zero mask is a legitimate match on an all-zero
	 * MAC and must not be skipped.
	 */
	if (!is_zero_ether_addr(&mask->dst)) {
		tap_nlattr_add(&msg->nh, TCA_FLOWER_KEY_ETH_DST, ETHER_ADDR_LEN,
			   &spec->dst.addr_bytes);
		tap_nlattr_add(&msg->nh,
			   TCA_FLOWER_KEY_ETH_DST_MASK, ETHER_ADDR_LEN,
			   &mask->dst.addr_bytes);
	}
	if (!is_zero_ether_addr(&mask->src)) {
		tap_nlattr_add(&msg->nh, TCA_FLOWER_KEY_ETH_SRC, ETHER_ADDR_LEN,
			   &spec->src.addr_bytes);
		tap_nlattr_add(&msg->nh,
			   TCA_FLOWER_KEY_ETH_SRC_MASK, ETHER_ADDR_LEN,
			   &mask->src.addr_bytes);
	}
	return 0;
}

/**
 * Make as many checks as possible on a VLAN item, and if a flow is provided,
 * fill it appropriately with VLAN info.
 *
 * @param[in] item
 *   Item specification.
 * @param[in, out] data
 *   Additional data structure to tell next layers we've been here.
 *
 * @return
 *   0 if checks are alright, -1 otherwise.
 */
static int
tap_flow_create_vlan(const struct rte_flow_item *item, void *data)
{
	struct convert_data *info = (struct convert_data *)data;
	const struct rte_flow_item_vlan *spec = item->spec;
	const struct rte_flow_item_vlan *mask = item->mask;
	struct rte_flow *flow = info->flow;
	struct nlmsg *msg;

	/* use default mask if none provided */
	if (!mask)
		mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_VLAN].default_mask;
	/* TC does not support tpid masking. Only accept if exact match. */
	if (mask->tpid && mask->tpid != 0xffff)
		return -1;
	/* Double-tagging not supported. */
	if (spec && mask->tpid && spec->tpid != htons(ETH_P_8021Q))
		return -1;
	info->vlan = 1;
	if (!flow)
		return 0;
	msg = &flow->msg;
	msg->t.tcm_info = TC_H_MAKE(msg->t.tcm_info, htons(ETH_P_8021Q));
#define VLAN_PRIO(tci) ((tci) >> 13)
#define VLAN_ID(tci) ((tci) & 0xfff)
	if (!spec)
		return 0;
	if (spec->tci) {
		uint16_t tci = ntohs(spec->tci) & mask->tci;
		uint16_t prio = VLAN_PRIO(tci);
		uint16_t vid = VLAN_ID(tci); /* VLAN ID is a 12-bit field */

		if (prio)
			tap_nlattr_add8(&msg->nh,
					TCA_FLOWER_KEY_VLAN_PRIO, prio);
		if (vid)
			tap_nlattr_add16(&msg->nh,
					 TCA_FLOWER_KEY_VLAN_ID, vid);
	}
	return 0;
}

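/*
 * For example, a TCI of 0x6005 yields VLAN_PRIO(0x6005) = 3 (top three bits)
 * and VLAN_ID(0x6005) = 5 (low twelve bits); bit 12 is the DEI bit, excluded
 * from matching by the 0xefff TCI mask in tap_flow_items[].
 */
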
/**
 * Make as many checks as possible on an IPv4 item, and if a flow is provided,
 * fill it appropriately with IPv4 info.
 *
 * @param[in] item
 *   Item specification.
 * @param[in, out] data
 *   Additional data structure to tell next layers we've been here.
 *
 * @return
 *   0 if checks are alright, -1 otherwise.
 */
static int
tap_flow_create_ipv4(const struct rte_flow_item *item, void *data)
{
	struct convert_data *info = (struct convert_data *)data;
	const struct rte_flow_item_ipv4 *spec = item->spec;
	const struct rte_flow_item_ipv4 *mask = item->mask;
	struct rte_flow *flow = info->flow;
	struct nlmsg *msg;

	/* use default mask if none provided */
	if (!mask)
		mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_IPV4].default_mask;
	/* check that previous eth type is compatible with ipv4 */
	if (info->eth_type && info->eth_type != htons(ETH_P_IP))
		return -1;
	/* store ip_proto for consistency if udp/tcp pattern item comes next */
	if (spec)
		info->ip_proto = spec->hdr.next_proto_id;
	if (!flow)
		return 0;
	msg = &flow->msg;
	if (!info->eth_type)
		info->eth_type = htons(ETH_P_IP);
	if (!spec)
		return 0;
	if (spec->hdr.dst_addr) {
		tap_nlattr_add32(&msg->nh, TCA_FLOWER_KEY_IPV4_DST,
			     spec->hdr.dst_addr);
		tap_nlattr_add32(&msg->nh, TCA_FLOWER_KEY_IPV4_DST_MASK,
			     mask->hdr.dst_addr);
	}
	if (spec->hdr.src_addr) {
		tap_nlattr_add32(&msg->nh, TCA_FLOWER_KEY_IPV4_SRC,
			     spec->hdr.src_addr);
		tap_nlattr_add32(&msg->nh, TCA_FLOWER_KEY_IPV4_SRC_MASK,
			     mask->hdr.src_addr);
	}
	if (spec->hdr.next_proto_id)
		tap_nlattr_add8(&msg->nh, TCA_FLOWER_KEY_IP_PROTO,
			    spec->hdr.next_proto_id);
	return 0;
}

/**
 * Make as many checks as possible on an IPv6 item, and if a flow is provided,
 * fill it appropriately with IPv6 info.
 *
 * @param[in] item
 *   Item specification.
 * @param[in, out] data
 *   Additional data structure to tell next layers we've been here.
 *
 * @return
 *   0 if checks are alright, -1 otherwise.
 */
static int
tap_flow_create_ipv6(const struct rte_flow_item *item, void *data)
{
	struct convert_data *info = (struct convert_data *)data;
	const struct rte_flow_item_ipv6 *spec = item->spec;
	const struct rte_flow_item_ipv6 *mask = item->mask;
	struct rte_flow *flow = info->flow;
	uint8_t empty_addr[16] = { 0 };
	struct nlmsg *msg;

	/* use default mask if none provided */
	if (!mask)
		mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_IPV6].default_mask;
	/* check that previous eth type is compatible with ipv6 */
	if (info->eth_type && info->eth_type != htons(ETH_P_IPV6))
		return -1;
	/* store ip_proto for consistency if udp/tcp pattern item comes next */
	if (spec)
		info->ip_proto = spec->hdr.proto;
	if (!flow)
		return 0;
	msg = &flow->msg;
	if (!info->eth_type)
		info->eth_type = htons(ETH_P_IPV6);
	if (!spec)
		return 0;
	if (memcmp(spec->hdr.dst_addr, empty_addr, 16)) {
		tap_nlattr_add(&msg->nh, TCA_FLOWER_KEY_IPV6_DST,
			   sizeof(spec->hdr.dst_addr), &spec->hdr.dst_addr);
		tap_nlattr_add(&msg->nh, TCA_FLOWER_KEY_IPV6_DST_MASK,
			   sizeof(mask->hdr.dst_addr), &mask->hdr.dst_addr);
	}
	if (memcmp(spec->hdr.src_addr, empty_addr, 16)) {
		tap_nlattr_add(&msg->nh, TCA_FLOWER_KEY_IPV6_SRC,
			   sizeof(spec->hdr.src_addr), &spec->hdr.src_addr);
		tap_nlattr_add(&msg->nh, TCA_FLOWER_KEY_IPV6_SRC_MASK,
			   sizeof(mask->hdr.src_addr), &mask->hdr.src_addr);
	}
	if (spec->hdr.proto)
		tap_nlattr_add8(&msg->nh,
				TCA_FLOWER_KEY_IP_PROTO, spec->hdr.proto);
	return 0;
}

/**
 * Make as many checks as possible on a UDP item, and if a flow is provided,
 * fill it appropriately with UDP info.
 *
 * @param[in] item
 *   Item specification.
 * @param[in, out] data
 *   Additional data structure to tell next layers we've been here.
 *
 * @return
 *   0 if checks are alright, -1 otherwise.
 */
static int
tap_flow_create_udp(const struct rte_flow_item *item, void *data)
{
	struct convert_data *info = (struct convert_data *)data;
	const struct rte_flow_item_udp *spec = item->spec;
	const struct rte_flow_item_udp *mask = item->mask;
	struct rte_flow *flow = info->flow;
	struct nlmsg *msg;

	/* use default mask if none provided */
	if (!mask)
		mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_UDP].default_mask;
	/* check that previous ip_proto is compatible with udp */
	if (info->ip_proto && info->ip_proto != IPPROTO_UDP)
		return -1;
	/* TC does not support UDP port masking. Only accept if exact match. */
	if ((mask->hdr.src_port && mask->hdr.src_port != 0xffff) ||
	    (mask->hdr.dst_port && mask->hdr.dst_port != 0xffff))
		return -1;
	if (!flow)
		return 0;
	msg = &flow->msg;
	tap_nlattr_add8(&msg->nh, TCA_FLOWER_KEY_IP_PROTO, IPPROTO_UDP);
	if (!spec)
		return 0;
	if (spec->hdr.dst_port & mask->hdr.dst_port)
		tap_nlattr_add16(&msg->nh, TCA_FLOWER_KEY_UDP_DST,
			     spec->hdr.dst_port);
	if (spec->hdr.src_port & mask->hdr.src_port)
		tap_nlattr_add16(&msg->nh, TCA_FLOWER_KEY_UDP_SRC,
			     spec->hdr.src_port);
	return 0;
}

/**
 * Make as many checks as possible on a TCP item, and if a flow is provided,
 * fill it appropriately with TCP info.
 *
 * @param[in] item
 *   Item specification.
 * @param[in, out] data
 *   Additional data structure to tell next layers we've been here.
 *
 * @return
 *   0 if checks are alright, -1 otherwise.
 */
static int
tap_flow_create_tcp(const struct rte_flow_item *item, void *data)
{
	struct convert_data *info = (struct convert_data *)data;
	const struct rte_flow_item_tcp *spec = item->spec;
	const struct rte_flow_item_tcp *mask = item->mask;
	struct rte_flow *flow = info->flow;
	struct nlmsg *msg;

	/* use default mask if none provided */
	if (!mask)
		mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_TCP].default_mask;
	/* check that previous ip_proto is compatible with tcp */
	if (info->ip_proto && info->ip_proto != IPPROTO_TCP)
		return -1;
	/* TC does not support TCP port masking. Only accept if exact match. */
	if ((mask->hdr.src_port && mask->hdr.src_port != 0xffff) ||
	    (mask->hdr.dst_port && mask->hdr.dst_port != 0xffff))
		return -1;
	if (!flow)
		return 0;
	msg = &flow->msg;
	tap_nlattr_add8(&msg->nh, TCA_FLOWER_KEY_IP_PROTO, IPPROTO_TCP);
	if (!spec)
		return 0;
	if (spec->hdr.dst_port & mask->hdr.dst_port)
		tap_nlattr_add16(&msg->nh, TCA_FLOWER_KEY_TCP_DST,
			     spec->hdr.dst_port);
	if (spec->hdr.src_port & mask->hdr.src_port)
		tap_nlattr_add16(&msg->nh, TCA_FLOWER_KEY_TCP_SRC,
			     spec->hdr.src_port);
	return 0;
}

/**
 * Check support for a given item.
 *
 * @param[in] item
 *   Item specification.
 * @param size
 *   Bit-mask size in bytes.
 * @param[in] supported_mask
 *   Bit-mask covering supported fields to compare with spec, last and mask in
 *   \item.
 * @param[in] default_mask
 *   Bit-mask default mask if none is provided in \item.
 *
 * @return
 *   0 on success, nonzero otherwise.
 */
static int
tap_flow_item_validate(const struct rte_flow_item *item,
		       unsigned int size,
		       const uint8_t *supported_mask,
		       const uint8_t *default_mask)
{
	int ret = 0;

	/* An empty layer is allowed, as long as all fields are NULL */
	if (!item->spec && (item->mask || item->last))
		return -1;
	/* Is the item spec compatible with what the NIC supports? */
	if (item->spec && !item->mask) {
		unsigned int i;
		const uint8_t *spec = item->spec;

		for (i = 0; i < size; ++i)
			if ((spec[i] | supported_mask[i]) != supported_mask[i])
				return -1;
		/* Is the default mask compatible with what the NIC supports? */
		for (i = 0; i < size; i++)
			if ((default_mask[i] | supported_mask[i]) !=
			    supported_mask[i])
				return -1;
	}
	/* Is the item last compatible with what the NIC supports? */
	if (item->last && !item->mask) {
		unsigned int i;
		const uint8_t *spec = item->last;

		for (i = 0; i < size; ++i)
			if ((spec[i] | supported_mask[i]) != supported_mask[i])
				return -1;
	}
	/* Is the item mask compatible with what the NIC supports? */
	if (item->mask) {
		unsigned int i;
		const uint8_t *spec = item->mask;

		for (i = 0; i < size; ++i)
			if ((spec[i] | supported_mask[i]) != supported_mask[i])
				return -1;
	}
	/*
	 * Once masked, are item spec and item last equal?
	 * TC does not support ranges, so anything else is invalid.
	 */
	if (item->spec && item->last) {
		uint8_t spec[size];
		uint8_t last[size];
		const uint8_t *apply = default_mask;
		unsigned int i;

		if (item->mask)
			apply = item->mask;
		for (i = 0; i < size; ++i) {
			spec[i] = ((const uint8_t *)item->spec)[i] & apply[i];
			last[i] = ((const uint8_t *)item->last)[i] & apply[i];
		}
		ret = memcmp(spec, last, size);
	}
	return ret;
}

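/*
 * For instance, a TCP item with spec dst_port = 80, last dst_port = 80 and a
 * full 0xffff mask passes the final check (spec equals last once masked),
 * whereas last dst_port = 90 would describe a range and be rejected through
 * a nonzero memcmp() result.
 */
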
/**
 * Configure the kernel with a TC action and its parameters.
 * Handled actions: "gact", "mirred", "skbedit", "bpf".
 *
 * @param[in] flow
 *   Pointer to rte flow containing the netlink message
 *
 * @param[in, out] act_index
 *   Pointer to action sequence number in the TC command
 *
 * @param[in] adata
 *   Pointer to struct holding the action parameters
 *
 * @return
 *   -1 on failure, 0 on success
 */
static int
add_action(struct rte_flow *flow, size_t *act_index, struct action_data *adata)
{
	struct nlmsg *msg = &flow->msg;

	if (tap_nlattr_nested_start(msg, (*act_index)++) < 0)
		return -1;

	tap_nlattr_add(&msg->nh, TCA_ACT_KIND,
				strlen(adata->id) + 1, adata->id);
	if (tap_nlattr_nested_start(msg, TCA_ACT_OPTIONS) < 0)
		return -1;
	if (strcmp("gact", adata->id) == 0) {
		tap_nlattr_add(&msg->nh, TCA_GACT_PARMS, sizeof(adata->gact),
			   &adata->gact);
	} else if (strcmp("mirred", adata->id) == 0) {
		if (adata->mirred.eaction == TCA_EGRESS_MIRROR)
			adata->mirred.action = TC_ACT_PIPE;
		else /* REDIRECT */
			adata->mirred.action = TC_ACT_STOLEN;
		tap_nlattr_add(&msg->nh, TCA_MIRRED_PARMS,
			   sizeof(adata->mirred),
			   &adata->mirred);
	} else if (strcmp("skbedit", adata->id) == 0) {
		tap_nlattr_add(&msg->nh, TCA_SKBEDIT_PARMS,
			   sizeof(adata->skbedit.skbedit),
			   &adata->skbedit.skbedit);
		tap_nlattr_add16(&msg->nh, TCA_SKBEDIT_QUEUE_MAPPING,
			     adata->skbedit.queue);
	} else if (strcmp("bpf", adata->id) == 0) {
		tap_nlattr_add32(&msg->nh, TCA_ACT_BPF_FD, adata->bpf.bpf_fd);
		tap_nlattr_add(&msg->nh, TCA_ACT_BPF_NAME,
			   strlen(adata->bpf.annotation) + 1,
			   adata->bpf.annotation);
		tap_nlattr_add(&msg->nh, TCA_ACT_BPF_PARMS,
			   sizeof(adata->bpf.bpf),
			   &adata->bpf.bpf);
	} else {
		return -1;
	}
	tap_nlattr_nested_finish(msg); /* nested TCA_ACT_OPTIONS */
	tap_nlattr_nested_finish(msg); /* nested act_index */
	return 0;
}

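/*
 * Each action serialized by add_action() lands in the netlink message as:
 *
 *   [act_index]                     nested attribute
 *     TCA_ACT_KIND = "gact" | "mirred" | "skbedit" | "bpf"
 *     TCA_ACT_OPTIONS               nested attribute
 *       kind-specific parameters (e.g. TCA_GACT_PARMS)
 *
 * matching the layout the kernel expects under TCA_FLOWER_ACT.
 */
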
/**
 * Helper function to send a series of TC actions to the kernel
 *
 * @param[in] flow
 *   Pointer to rte flow containing the netlink message
 *
 * @param[in] nb_actions
 *   Number of actions in an array of action structs
 *
 * @param[in] data
 *   Pointer to an array of action structs
 *
 * @param[in] classifier_action
 *   The classifier on whose behalf the actions are configured
 *
 * @return
 *   -1 on failure, 0 on success
 */
static int
add_actions(struct rte_flow *flow, int nb_actions, struct action_data *data,
	    int classifier_action)
{
	struct nlmsg *msg = &flow->msg;
	size_t act_index = 1;
	int i;

	if (tap_nlattr_nested_start(msg, classifier_action) < 0)
		return -1;
	for (i = 0; i < nb_actions; i++)
		if (add_action(flow, &act_index, data + i) < 0)
			return -1;
	tap_nlattr_nested_finish(msg); /* nested TCA_FLOWER_ACT */
	return 0;
}

/**
 * Validate a flow supported by TC.
 * If flow param is not NULL, then also fill the netlink message inside.
 *
 * @param pmd
 *   Pointer to private structure.
 * @param[in] attr
 *   Flow rule attributes.
 * @param[in] pattern
 *   Pattern specification (list terminated by the END pattern item).
 * @param[in] actions
 *   Associated actions (list terminated by the END action).
 * @param[out] error
 *   Perform verbose error reporting if not NULL.
 * @param[in, out] flow
 *   Flow structure to update.
 * @param[in] mirred
 *   If set to TCA_EGRESS_REDIR, provided actions will be replaced with a
 *   redirection to the tap netdevice, and the TC rule will be configured
 *   on the remote netdevice in pmd.
 *   If set to TCA_EGRESS_MIRROR, provided actions will be replaced with a
 *   mirroring to the tap netdevice, and the TC rule will be configured
 *   on the remote netdevice in pmd. Matching packets will thus be duplicated.
 *   If set to 0, the standard behavior is to be used: set correct actions for
 *   the TC rule, and apply it on the tap netdevice.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
static int
priv_flow_process(struct pmd_internals *pmd,
		  const struct rte_flow_attr *attr,
		  const struct rte_flow_item items[],
		  const struct rte_flow_action actions[],
		  struct rte_flow_error *error,
		  struct rte_flow *flow,
		  int mirred)
{
	const struct tap_flow_items *cur_item = tap_flow_items;
	struct convert_data data = {
		.eth_type = 0,
		.ip_proto = 0,
		.flow = flow,
	};
	int action = 0; /* Only one action authorized for now */

	if (attr->group > MAX_GROUP) {
		rte_flow_error_set(
			error, EINVAL, RTE_FLOW_ERROR_TYPE_ATTR_GROUP,
			NULL, "group value too big: cannot exceed 15");
		return -rte_errno;
	}
	if (attr->priority > MAX_PRIORITY) {
		rte_flow_error_set(
			error, EINVAL, RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
			NULL, "priority value too big");
		return -rte_errno;
	} else if (flow) {
		uint16_t group = attr->group << GROUP_SHIFT;
		uint16_t prio = group | (attr->priority +
				RSS_PRIORITY_OFFSET + PRIORITY_OFFSET);
		flow->msg.t.tcm_info = TC_H_MAKE(prio << 16,
						 flow->msg.t.tcm_info);
	}
	if (flow) {
		if (mirred) {
			/*
			 * If attr->ingress, the rule applies on remote ingress
			 * to match incoming packets
			 * If attr->egress, the rule applies on tap ingress (as
			 * seen from the kernel) to deal with packets going out
			 * from the DPDK app.
			 */
			flow->msg.t.tcm_parent = TC_H_MAKE(TC_H_INGRESS, 0);
		} else {
			/* Standard rule on tap egress (kernel standpoint). */
			flow->msg.t.tcm_parent =
				TC_H_MAKE(MULTIQ_MAJOR_HANDLE, 0);
		}
		/* use flower filter type */
		tap_nlattr_add(&flow->msg.nh, TCA_KIND, sizeof("flower"), "flower");
		if (tap_nlattr_nested_start(&flow->msg, TCA_OPTIONS) < 0)
			goto exit_item_not_supported;
	}
	for (; items->type != RTE_FLOW_ITEM_TYPE_END; ++items) {
		const struct tap_flow_items *token = NULL;
		unsigned int i;
		int err = 0;

		if (items->type == RTE_FLOW_ITEM_TYPE_VOID)
			continue;
		for (i = 0;
		     cur_item->items &&
		     cur_item->items[i] != RTE_FLOW_ITEM_TYPE_END;
		     ++i) {
			if (cur_item->items[i] == items->type) {
				token = &tap_flow_items[items->type];
				break;
			}
		}
		if (!token)
			goto exit_item_not_supported;
		cur_item = token;
		err = tap_flow_item_validate(
			items, cur_item->mask_sz,
			(const uint8_t *)cur_item->mask,
			(const uint8_t *)cur_item->default_mask);
		if (err)
			goto exit_item_not_supported;
		if (flow && cur_item->convert) {
			err = cur_item->convert(items, &data);
			if (err)
				goto exit_item_not_supported;
		}
	}
	if (flow) {
		if (data.vlan) {
			tap_nlattr_add16(&flow->msg.nh, TCA_FLOWER_KEY_ETH_TYPE,
				     htons(ETH_P_8021Q));
			tap_nlattr_add16(&flow->msg.nh,
				     TCA_FLOWER_KEY_VLAN_ETH_TYPE,
				     data.eth_type ?
				     data.eth_type : htons(ETH_P_ALL));
		} else if (data.eth_type) {
			tap_nlattr_add16(&flow->msg.nh, TCA_FLOWER_KEY_ETH_TYPE,
				     data.eth_type);
		}
	}
	if (mirred && flow) {
		struct action_data adata = {
			.id = "mirred",
			.mirred = {
				.eaction = mirred,
			},
		};

		/*
		 * If attr->egress && mirred, then this is a special
		 * case where the rule must be applied on the tap, to
		 * redirect packets coming from the DPDK App, out
		 * through the remote netdevice.
		 */
		adata.mirred.ifindex = attr->ingress ? pmd->if_index :
			pmd->remote_if_index;
		if (mirred == TCA_EGRESS_MIRROR)
			adata.mirred.action = TC_ACT_PIPE;
		else
			adata.mirred.action = TC_ACT_STOLEN;
		if (add_actions(flow, 1, &adata, TCA_FLOWER_ACT) < 0)
			goto exit_action_not_supported;
		else
			goto end;
	}
	for (; actions->type != RTE_FLOW_ACTION_TYPE_END; ++actions) {
		int err = 0;

		if (actions->type == RTE_FLOW_ACTION_TYPE_VOID) {
			continue;
		} else if (actions->type == RTE_FLOW_ACTION_TYPE_DROP) {
			if (action)
				goto exit_action_not_supported;
			action = 1;
			if (flow) {
				struct action_data adata = {
					.id = "gact",
					.gact = {
						.action = TC_ACT_SHOT,
					},
				};

				err = add_actions(flow, 1, &adata,
						  TCA_FLOWER_ACT);
			}
		} else if (actions->type == RTE_FLOW_ACTION_TYPE_PASSTHRU) {
			if (action)
				goto exit_action_not_supported;
			action = 1;
			if (flow) {
				struct action_data adata = {
					.id = "gact",
					.gact = {
						/* continue */
						.action = TC_ACT_UNSPEC,
					},
				};

				err = add_actions(flow, 1, &adata,
						  TCA_FLOWER_ACT);
			}
		} else if (actions->type == RTE_FLOW_ACTION_TYPE_QUEUE) {
			const struct rte_flow_action_queue *queue =
				(const struct rte_flow_action_queue *)
				actions->conf;

			if (action)
				goto exit_action_not_supported;
			action = 1;
			if (!queue ||
			    (queue->index > pmd->dev->data->nb_rx_queues - 1))
				goto exit_action_not_supported;
			if (flow) {
				struct action_data adata = {
					.id = "skbedit",
					.skbedit = {
						.skbedit = {
							.action = TC_ACT_PIPE,
						},
						.queue = queue->index,
					},
				};

				err = add_actions(flow, 1, &adata,
					TCA_FLOWER_ACT);
			}
		} else if (actions->type == RTE_FLOW_ACTION_TYPE_RSS) {
			const struct rte_flow_action_rss *rss =
				(const struct rte_flow_action_rss *)
				actions->conf;

			if (action++)
				goto exit_action_not_supported;

			if (!pmd->rss_enabled) {
				err = rss_enable(pmd, attr, error);
				if (err)
					goto exit_action_not_supported;
			}
			if (flow && rss)
				err = rss_add_actions(flow, pmd, rss, error);
		} else {
			goto exit_action_not_supported;
		}
		if (err)
			goto exit_action_not_supported;
	}
end:
	if (flow)
		tap_nlattr_nested_finish(&flow->msg); /* nested TCA_OPTIONS */
	return 0;
exit_item_not_supported:
	rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM,
			   items, "item not supported");
	return -rte_errno;
exit_action_not_supported:
	rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION,
			   actions, "action not supported");
	return -rte_errno;
}
1261 
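/*
 * As an illustration, a rule matching IPv4/UDP destination port 53 with a
 * QUEUE action to Rx queue 1 corresponds roughly to this tc(8) invocation
 * (a sketch only: the real filter is attached under the tap's multiq qdisc,
 * with a handle and pref computed as above):
 *
 *   tc filter add dev tap0 parent <qdisc> pref <prio> protocol ip flower \
 *           ip_proto udp dst_port 53 action skbedit queue_mapping 1
 */
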
/**
 * Validate a flow.
 *
 * @see rte_flow_validate()
 * @see rte_flow_ops
 */
static int
tap_flow_validate(struct rte_eth_dev *dev,
		  const struct rte_flow_attr *attr,
		  const struct rte_flow_item items[],
		  const struct rte_flow_action actions[],
		  struct rte_flow_error *error)
{
	struct pmd_internals *pmd = dev->data->dev_private;

	return priv_flow_process(pmd, attr, items, actions, error, NULL, 0);
}

/**
 * Set a unique handle in a flow.
 *
 * The kernel supports TC rules with equal priority, as long as they use the
 * same matching fields (e.g.: dst mac and ipv4) with different values (and
 * full mask to ensure no collision is possible).
 * In those rules, the handle (uint32_t) is the part that would identify
 * specifically each rule.
 *
 * On 32-bit architectures, the handle can simply be the flow's pointer
 * address. On 64-bit architectures, we rely on jhash(flow) to find a
 * (sufficiently) unique handle.
 *
 * @param[in, out] flow
 *   The flow that needs its handle set.
 */
static void
tap_flow_set_handle(struct rte_flow *flow)
{
	uint32_t handle = 0;

	if (sizeof(flow) > 4)
		handle = rte_jhash(&flow, sizeof(flow), 1);
	else
		handle = (uintptr_t)flow;
	/* must be at least 1 to avoid letting the kernel choose one for us */
	if (!handle)
		handle = 1;
	flow->msg.t.tcm_handle = handle;
}

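/*
 * Note that rte_jhash() runs on the pointer value itself (sizeof(flow) bytes
 * at &flow), not on the pointed-to structure, so the handle stays stable for
 * the whole lifetime of the flow:
 *
 *   uint32_t h = rte_jhash(&flow, sizeof(flow), 1);
 */
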
/**
 * Create a flow.
 *
 * @see rte_flow_create()
 * @see rte_flow_ops
 */
static struct rte_flow *
tap_flow_create(struct rte_eth_dev *dev,
		const struct rte_flow_attr *attr,
		const struct rte_flow_item items[],
		const struct rte_flow_action actions[],
		struct rte_flow_error *error)
{
	struct pmd_internals *pmd = dev->data->dev_private;
	struct rte_flow *remote_flow = NULL;
	struct rte_flow *flow = NULL;
	struct nlmsg *msg = NULL;
	int err;

	if (!pmd->if_index) {
		rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE,
				   NULL,
				   "can't create rule, ifindex not found");
		goto fail;
	}
	/*
	 * No rules configured through standard rte_flow should be set on the
	 * priorities used by implicit rules.
	 */
	if ((attr->group == MAX_GROUP) &&
	    attr->priority > (MAX_PRIORITY - TAP_REMOTE_MAX_IDX)) {
		rte_flow_error_set(
			error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
			NULL, "priority value too big");
		goto fail;
	}
	flow = rte_malloc(__func__, sizeof(struct rte_flow), 0);
	if (!flow) {
		rte_flow_error_set(error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
				   NULL, "cannot allocate memory for rte_flow");
		goto fail;
	}
	msg = &flow->msg;
	tc_init_msg(msg, pmd->if_index, RTM_NEWTFILTER,
		    NLM_F_REQUEST | NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE);
	msg->t.tcm_info = TC_H_MAKE(0, htons(ETH_P_ALL));
	tap_flow_set_handle(flow);
	if (priv_flow_process(pmd, attr, items, actions, error, flow, 0))
		goto fail;
	err = tap_nl_send(pmd->nlsk_fd, &msg->nh);
	if (err < 0) {
		rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE,
				   NULL, "couldn't send request to kernel");
		goto fail;
	}
	err = tap_nl_recv_ack(pmd->nlsk_fd);
	if (err < 0) {
		RTE_LOG(ERR, PMD,
			"Kernel refused TC filter rule creation (%d): %s\n",
			errno, strerror(errno));
		rte_flow_error_set(error, EEXIST, RTE_FLOW_ERROR_TYPE_HANDLE,
				   NULL,
				   "overlapping rules or Kernel too old for flower support");
		goto fail;
	}
	/*
	 * If a remote device is configured, a TC rule with identical items for
	 * matching must be set on that device, with a single action: redirect
	 * to the local pmd->if_index.
	 */
	if (pmd->remote_if_index) {
		remote_flow = rte_malloc(__func__, sizeof(struct rte_flow), 0);
		if (!remote_flow) {
			rte_flow_error_set(
				error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE, NULL,
				"cannot allocate memory for rte_flow");
			goto fail;
		}
		msg = &remote_flow->msg;
		/* set the rule if_index for the remote netdevice */
		tc_init_msg(
			msg, pmd->remote_if_index, RTM_NEWTFILTER,
			NLM_F_REQUEST | NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE);
		msg->t.tcm_info = TC_H_MAKE(0, htons(ETH_P_ALL));
		tap_flow_set_handle(remote_flow);
		if (priv_flow_process(pmd, attr, items, NULL,
				      error, remote_flow, TCA_EGRESS_REDIR)) {
			rte_flow_error_set(
				error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
				NULL, "rte flow rule validation failed");
			goto fail;
		}
		err = tap_nl_send(pmd->nlsk_fd, &msg->nh);
		if (err < 0) {
			rte_flow_error_set(
				error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
				NULL, "Failure sending nl request");
			goto fail;
		}
		err = tap_nl_recv_ack(pmd->nlsk_fd);
		if (err < 0) {
			RTE_LOG(ERR, PMD,
				"Kernel refused TC filter rule creation (%d): %s\n",
				errno, strerror(errno));
			rte_flow_error_set(
				error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
				NULL,
				"overlapping rules or Kernel too old for flower support");
			goto fail;
		}
		flow->remote_flow = remote_flow;
	}
	/*
	 * Track the flow only once both the local and the (optional) remote
	 * rules were accepted by the kernel, so the fail path never frees a
	 * flow that is still queued in pmd->flows.
	 */
	LIST_INSERT_HEAD(&pmd->flows, flow, next);
	return flow;
fail:
	if (remote_flow)
		rte_free(remote_flow);
	if (flow)
		rte_free(flow);
	return NULL;
}

/**
 * Destroy a flow using pointer to pmd_internals.
 *
 * @param[in, out] pmd
 *   Pointer to private structure.
 * @param[in] flow
 *   Pointer to the flow to destroy.
 * @param[in, out] error
 *   Pointer to the flow error handler
 *
 * @return 0 if the flow could be destroyed, -1 otherwise.
 */
static int
tap_flow_destroy_pmd(struct pmd_internals *pmd,
		     struct rte_flow *flow,
		     struct rte_flow_error *error)
{
	struct rte_flow *remote_flow = flow->remote_flow;
	int i;
	int ret = 0;

	LIST_REMOVE(flow, next);
	flow->msg.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
	flow->msg.nh.nlmsg_type = RTM_DELTFILTER;

	ret = tap_nl_send(pmd->nlsk_fd, &flow->msg.nh);
	if (ret < 0) {
		rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE,
				   NULL, "couldn't send request to kernel");
		goto end;
	}
	ret = tap_nl_recv_ack(pmd->nlsk_fd);
	/* If errno is ENOENT, the rule is already no longer in the kernel. */
	if (ret < 0 && errno == ENOENT)
		ret = 0;
	if (ret < 0) {
		RTE_LOG(ERR, PMD,
			"Kernel refused TC filter rule deletion (%d): %s\n",
			errno, strerror(errno));
		rte_flow_error_set(
			error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE, NULL,
			"couldn't receive kernel ack to our request");
		goto end;
	}
	/* Close opened BPF file descriptors of this flow */
	for (i = 0; i < SEC_MAX; i++)
		if (flow->bpf_fd[i] != 0) {
			close(flow->bpf_fd[i]);
			flow->bpf_fd[i] = 0;
		}

	/* Release map key for this RSS rule */
	ret = bpf_rss_key(KEY_CMD_RELEASE, &flow->key_idx);
	if (ret < 0) {
		rte_flow_error_set(
			error, EINVAL, RTE_FLOW_ERROR_TYPE_HANDLE, NULL,
			"Failed to release BPF RSS key");

		goto end;
	}

	if (remote_flow) {
		remote_flow->msg.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
		remote_flow->msg.nh.nlmsg_type = RTM_DELTFILTER;

		ret = tap_nl_send(pmd->nlsk_fd, &remote_flow->msg.nh);
		if (ret < 0) {
			rte_flow_error_set(
				error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
				NULL, "Failure sending nl request");
			goto end;
		}
		ret = tap_nl_recv_ack(pmd->nlsk_fd);
		if (ret < 0 && errno == ENOENT)
			ret = 0;
		if (ret < 0) {
			RTE_LOG(ERR, PMD,
				"Kernel refused TC filter rule deletion (%d): %s\n",
				errno, strerror(errno));
			rte_flow_error_set(
				error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
				NULL, "Failure trying to receive nl ack");
			goto end;
		}
	}
end:
	if (remote_flow)
		rte_free(remote_flow);
	rte_free(flow);
	return ret;
}

/**
 * Destroy a flow.
 *
 * @see rte_flow_destroy()
 * @see rte_flow_ops
 */
static int
tap_flow_destroy(struct rte_eth_dev *dev,
		 struct rte_flow *flow,
		 struct rte_flow_error *error)
{
	struct pmd_internals *pmd = dev->data->dev_private;

	return tap_flow_destroy_pmd(pmd, flow, error);
}

/**
 * Enable/disable flow isolation.
 *
 * @see rte_flow_isolate()
 * @see rte_flow_ops
 */
static int
tap_flow_isolate(struct rte_eth_dev *dev,
		 int set,
		 struct rte_flow_error *error __rte_unused)
{
	struct pmd_internals *pmd = dev->data->dev_private;

	if (set)
		pmd->flow_isolate = 1;
	else
		pmd->flow_isolate = 0;
	/*
	 * If netdevice is there, setup appropriate flow rules immediately.
	 * Otherwise it will be set when bringing up the netdevice (tun_alloc).
	 */
	if (!pmd->rxq[0].fd)
		return 0;
	if (set) {
		struct rte_flow *flow;

		while (1) {
			flow = LIST_FIRST(&pmd->implicit_flows);
			if (!flow)
				break;
			/*
			 * Remove all implicit rules on the remote.
			 * Keep the local rule to redirect packets on TX.
			 * Keep also the last implicit local rule: ISOLATE.
			 */
			if (flow->msg.t.tcm_ifindex == pmd->if_index)
				break;
			if (tap_flow_destroy_pmd(pmd, flow, NULL) < 0)
				goto error;
		}
		/* Switch the TC rule according to pmd->flow_isolate */
		if (tap_flow_implicit_create(pmd, TAP_ISOLATE) == -1)
			goto error;
	} else {
		/* Switch the TC rule according to pmd->flow_isolate */
		if (tap_flow_implicit_create(pmd, TAP_ISOLATE) == -1)
			goto error;
		if (!pmd->remote_if_index)
			return 0;
		if (tap_flow_implicit_create(pmd, TAP_REMOTE_TX) < 0)
			goto error;
		if (tap_flow_implicit_create(pmd, TAP_REMOTE_LOCAL_MAC) < 0)
			goto error;
		if (tap_flow_implicit_create(pmd, TAP_REMOTE_BROADCAST) < 0)
			goto error;
		if (tap_flow_implicit_create(pmd, TAP_REMOTE_BROADCASTV6) < 0)
			goto error;
		if (dev->data->promiscuous &&
		    tap_flow_implicit_create(pmd, TAP_REMOTE_PROMISC) < 0)
			goto error;
		if (dev->data->all_multicast &&
		    tap_flow_implicit_create(pmd, TAP_REMOTE_ALLMULTI) < 0)
			goto error;
	}
	return 0;
error:
	pmd->flow_isolate = 0;
	return rte_flow_error_set(
		error, ENOTSUP, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
		"TC rule creation failed");
}

/**
 * Destroy all flows.
 *
 * @see rte_flow_flush()
 * @see rte_flow_ops
 */
int
tap_flow_flush(struct rte_eth_dev *dev, struct rte_flow_error *error)
{
	struct pmd_internals *pmd = dev->data->dev_private;
	struct rte_flow *flow;

	while (!LIST_EMPTY(&pmd->flows)) {
		flow = LIST_FIRST(&pmd->flows);
		if (tap_flow_destroy(dev, flow, error) < 0)
			return -1;
	}
	return 0;
}

1635 /**
1636  * Add an implicit flow rule on the remote device to make sure traffic gets to
1637  * the tap netdevice from there.
1638  *
1639  * @param pmd
1640  *   Pointer to private structure.
1641  * @param[in] idx
1642  *   The idx in the implicit_rte_flows array specifying which rule to apply.
1643  *
1644  * @return -1 if the rule couldn't be applied, 0 otherwise.
1645  */
1646 int tap_flow_implicit_create(struct pmd_internals *pmd,
1647 			     enum implicit_rule_index idx)
1648 {
1649 	uint16_t flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE;
1650 	struct rte_flow_action *actions = implicit_rte_flows[idx].actions;
1651 	struct rte_flow_action isolate_actions[2] = {
1652 		[1] = {
1653 			.type = RTE_FLOW_ACTION_TYPE_END,
1654 		},
1655 	};
1656 	struct rte_flow_item *items = implicit_rte_flows[idx].items;
1657 	struct rte_flow_attr *attr = &implicit_rte_flows[idx].attr;
1658 	struct rte_flow_item_eth eth_local = { .type = 0 };
1659 	uint16_t if_index = pmd->remote_if_index;
1660 	struct rte_flow *remote_flow = NULL;
1661 	struct nlmsg *msg = NULL;
1662 	int err = 0;
1663 	struct rte_flow_item items_local[2] = {
1664 		[0] = {
1665 			.type = items[0].type,
1666 			.spec = &eth_local,
1667 			.mask = items[0].mask,
1668 		},
1669 		[1] = {
1670 			.type = items[1].type,
1671 		}
1672 	};
1673 
1674 	remote_flow = rte_malloc(__func__, sizeof(struct rte_flow), 0);
1675 	if (!remote_flow) {
1676 		RTE_LOG(ERR, PMD, "Cannot allocate memory for rte_flow\n");
1677 		goto fail;
1678 	}
1679 	msg = &remote_flow->msg;
1680 	if (idx == TAP_REMOTE_TX) {
1681 		if_index = pmd->if_index;
1682 	} else if (idx == TAP_ISOLATE) {
1683 		if_index = pmd->if_index;
1684 		/* Don't be exclusive for this rule, it can be changed later. */
1685 		flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_CREATE;
1686 		isolate_actions[0].type = pmd->flow_isolate ?
1687 			RTE_FLOW_ACTION_TYPE_DROP :
1688 			RTE_FLOW_ACTION_TYPE_PASSTHRU;
1689 		actions = isolate_actions;
1690 	} else if (idx == TAP_REMOTE_LOCAL_MAC) {
1691 		/*
1692 		 * eth addr couldn't be set in implicit_rte_flows[] as it is not
1693 		 * known at compile time.
1694 		 */
1695 		memcpy(&eth_local.dst, &pmd->eth_addr, sizeof(pmd->eth_addr));
1696 		items = items_local;
1697 	}
1698 	tc_init_msg(msg, if_index, RTM_NEWTFILTER, flags);
1699 	msg->t.tcm_info = TC_H_MAKE(0, htons(ETH_P_ALL));
1700 	/*
1701 	 * The ISOLATE rule is always present and must have a static handle, as
1702 	 * the action is changed whether the feature is enabled (DROP) or
1703 	 * disabled (PASSTHRU).
1704 	 */
1705 	if (idx == TAP_ISOLATE)
1706 		remote_flow->msg.t.tcm_handle = ISOLATE_HANDLE;
1707 	else
1708 		tap_flow_set_handle(remote_flow);
1709 	if (priv_flow_process(pmd, attr, items, actions, NULL,
1710 			      remote_flow, implicit_rte_flows[idx].mirred)) {
1711 		RTE_LOG(ERR, PMD, "rte flow rule validation failed\n");
1712 		goto fail;
1713 	}
1714 	err = tap_nl_send(pmd->nlsk_fd, &msg->nh);
1715 	if (err < 0) {
1716 		RTE_LOG(ERR, PMD, "Failure sending nl request\n");
1717 		goto fail;
1718 	}
1719 	err = tap_nl_recv_ack(pmd->nlsk_fd);
1720 	if (err < 0) {
1721 		RTE_LOG(ERR, PMD,
1722 			"Kernel refused TC filter rule creation (%d): %s\n",
1723 			errno, strerror(errno));
1724 		goto fail;
1725 	}
1726 	LIST_INSERT_HEAD(&pmd->implicit_flows, remote_flow, next);
1727 	return 0;
1728 fail:
1729 	if (remote_flow)
1730 		rte_free(remote_flow);
1731 	return -1;
1732 }
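
/*
 * Illustrative sketch (assumption: the implicit_rule_index enum ends with
 * a TAP_REMOTE_MAX_IDX terminator): a caller configuring a remote
 * netdevice would typically apply each implicit rule in turn.
 *
 *	enum implicit_rule_index idx;
 *
 *	for (idx = 0; idx < TAP_REMOTE_MAX_IDX; idx++)
 *		if (tap_flow_implicit_create(pmd, idx) < 0)
 *			goto error;
 *
 * Note that TAP_ISOLATE keeps the fixed ISOLATE_HANDLE and drops
 * NLM_F_EXCL precisely so that a later call can overwrite the same TC
 * filter when isolation is toggled.
 */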
1733 
1734 /**
1735  * Remove a specific implicit flow rule on the remote device.
1736  *
1737  * @param[in, out] pmd
1738  *   Pointer to private structure.
1739  * @param[in] idx
1740  *   The idx in the implicit_rte_flows array specifying which rule to remove.
1741  *
1742  * @return -1 if the implicit rule couldn't be removed, 0 otherwise.
1743  */
1744 int tap_flow_implicit_destroy(struct pmd_internals *pmd,
1745 			      enum implicit_rule_index idx)
1746 {
1747 	struct rte_flow *remote_flow;
1748 	int cur_prio = -1;
1749 	int idx_prio = implicit_rte_flows[idx].attr.priority + PRIORITY_OFFSET;
1750 
1751 	for (remote_flow = LIST_FIRST(&pmd->implicit_flows);
1752 	     remote_flow;
1753 	     remote_flow = LIST_NEXT(remote_flow, next)) {
1754 		cur_prio = (remote_flow->msg.t.tcm_info >> 16) & PRIORITY_MASK;
1755 		if (cur_prio != idx_prio)
1756 			continue;
1757 		return tap_flow_destroy_pmd(pmd, remote_flow, NULL);
1758 	}
1759 	return 0;
1760 }
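
/*
 * For reference, the priority match above relies on how tcm_info is built
 * at creation time: TC_H_MAKE() stores the 16-bit TC priority in the upper
 * half and the protocol in the lower half. A minimal round-trip sketch,
 * valid for priorities that fit within PRIORITY_MASK:
 *
 *	uint32_t info = TC_H_MAKE(prio << 16, htons(ETH_P_ALL));
 *	uint16_t prio_back = (info >> 16) & PRIORITY_MASK;  // == prio
 */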
1761 
1762 /**
1763  * Destroy all implicit flows.
1764  *
1765  * @see rte_flow_flush()
1766  */
1767 int
1768 tap_flow_implicit_flush(struct pmd_internals *pmd, struct rte_flow_error *error)
1769 {
1770 	struct rte_flow *remote_flow;
1771 
1772 	while (!LIST_EMPTY(&pmd->implicit_flows)) {
1773 		remote_flow = LIST_FIRST(&pmd->implicit_flows);
1774 		if (tap_flow_destroy_pmd(pmd, remote_flow, error) < 0)
1775 			return -1;
1776 	}
1777 	return 0;
1778 }
1779 
1780 #define MAX_RSS_KEYS 256
1781 #define SEC_NAME_CLS_Q "cls_q"
1782 
1783 const char *sec_name[SEC_MAX] = {
1784 	[SEC_L3_L4] = "l3_l4",
1785 };
1786 
1787 /**
1788  * Enable RSS on tap: create TC rules for queuing.
1789  *
1790  * @param[in, out] pmd
1791  *   Pointer to private structure.
1792  *
1793  * @param[in] attr
1794  *   Pointer to the flow attributes, used to get the flow group.
1795  *
1796  * @param[out] error
1797  *   Pointer to error reporting if not NULL.
1798  *
1799  * @return 0 on success, negative value on failure.
1800  */
1801 static int rss_enable(struct pmd_internals *pmd,
1802 			const struct rte_flow_attr *attr,
1803 			struct rte_flow_error *error)
1804 {
1805 	struct rte_flow *rss_flow = NULL;
1806 	struct nlmsg *msg = NULL;
1807 	/* Name annotation for the per-queue BPF classifier, e.g. "[cls_q0]" */
1808 	char annotation[64];
1809 	int i;
1810 	int err = 0;
1811 
1812 	/* Lift the locked memory limit: BPF maps/programs count against it */
1813 	struct rlimit memlock_limit = {
1814 		.rlim_cur = RLIM_INFINITY,
1815 		.rlim_max = RLIM_INFINITY,
1816 	};
1817 	setrlimit(RLIMIT_MEMLOCK, &memlock_limit); /* best effort */
1818 
1819 	/* Initialize the BPF RSS keys repository */
1820 	err = bpf_rss_key(KEY_CMD_INIT, NULL);
1821 	if (err < 0) {
1822 		rte_flow_error_set(
1823 			error, EINVAL, RTE_FLOW_ERROR_TYPE_HANDLE, NULL,
1824 			"Failed to initialize BPF RSS keys");
1825 
1826 		return -1;
1827 	}
1828 
1829 	/* Create the BPF RSS map */
1832 	pmd->map_fd = tap_flow_bpf_rss_map_create(sizeof(__u32), /* key size */
1833 				sizeof(struct rss_key),
1834 				MAX_RSS_KEYS);
1835 	if (pmd->map_fd < 0) {
1836 		RTE_LOG(ERR, PMD,
1837 			"Failed to create BPF map (%d): %s\n",
1838 				errno, strerror(errno));
1839 		rte_flow_error_set(
1840 			error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE, NULL,
1841 			"Kernel too old or not configured "
1842 			"to support BPF maps");
1843 
1844 		return -ENOTSUP;
1845 	}
1846 
1847 	/*
1848 	 * Add a rule per queue to match reclassified packets and direct them to
1849 	 * the correct queue.
1850 	 */
1851 	for (i = 0; i < pmd->dev->data->nb_rx_queues; i++) {
1852 		pmd->bpf_fd[i] = tap_flow_bpf_cls_q(i);
1853 		if (pmd->bpf_fd[i] < 0) {
1854 			RTE_LOG(ERR, PMD,
1855 				"Failed to load BPF section %s for queue %d\n",
1856 				SEC_NAME_CLS_Q, i);
1857 			rte_flow_error_set(
1858 				error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE,
1859 				NULL,
1860 				"Kernel too old or not configured "
1861 				"to support BPF programs loading");
1862 
1863 			return -ENOTSUP;
1864 		}
1865 
1866 		rss_flow = rte_malloc(__func__, sizeof(struct rte_flow), 0);
1867 		if (!rss_flow) {
1868 			RTE_LOG(ERR, PMD,
1869 				"Cannot allocate memory for rte_flow\n");
1870 			return -1;
1871 		}
1872 		msg = &rss_flow->msg;
1873 		tc_init_msg(msg, pmd->if_index, RTM_NEWTFILTER, NLM_F_REQUEST |
1874 			    NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE);
1875 		msg->t.tcm_info = TC_H_MAKE(0, htons(ETH_P_ALL));
1876 		tap_flow_set_handle(rss_flow);
1877 		uint16_t group = attr->group << GROUP_SHIFT;
1878 		uint16_t prio = group | (i + PRIORITY_OFFSET);
1879 		msg->t.tcm_info = TC_H_MAKE(prio << 16, msg->t.tcm_info);
1880 		msg->t.tcm_parent = TC_H_MAKE(MULTIQ_MAJOR_HANDLE, 0);
1881 
1882 		tap_nlattr_add(&msg->nh, TCA_KIND, sizeof("bpf"), "bpf");
1883 		if (tap_nlattr_nested_start(msg, TCA_OPTIONS) < 0)
1884 			return -1;
1885 		tap_nlattr_add32(&msg->nh, TCA_BPF_FD, pmd->bpf_fd[i]);
1886 		snprintf(annotation, sizeof(annotation), "[%s%d]",
1887 			SEC_NAME_CLS_Q, i);
1888 		tap_nlattr_add(&msg->nh, TCA_BPF_NAME, strlen(annotation) + 1,
1889 			   annotation);
1890 		/* Actions */
1891 		{
1892 			struct action_data adata = {
1893 				.id = "skbedit",
1894 				.skbedit = {
1895 					.skbedit = {
1896 						.action = TC_ACT_PIPE,
1897 					},
1898 					.queue = i,
1899 				},
1900 			};
1901 			if (add_actions(rss_flow, 1, &adata, TCA_BPF_ACT) < 0)
1902 				return -1;
1903 		}
1904 		tap_nlattr_nested_finish(msg); /* nested TCA_OPTIONS */
1905 
1906 		/* Netlink message is now ready to be sent */
1907 		if (tap_nl_send(pmd->nlsk_fd, &msg->nh) < 0)
1908 			return -1;
1909 		err = tap_nl_recv_ack(pmd->nlsk_fd);
1910 		if (err < 0) {
1911 			RTE_LOG(ERR, PMD,
1912 				"Kernel refused TC filter rule creation (%d): %s\n",
1913 				errno, strerror(errno));
1914 			return err;
1915 		}
1916 		LIST_INSERT_HEAD(&pmd->rss_flows, rss_flow, next);
1917 	}
1918 
1919 	pmd->rss_enabled = 1;
1920 	return err;
1921 }
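
/*
 * For illustration, the TC priority encoding used in the loop above keeps
 * per-queue filters from different flow groups apart: the group occupies
 * the bits above GROUP_SHIFT and each RX queue gets its own priority
 * within that group ("queue" below stands for the loop index i).
 *
 *	uint16_t group = attr->group << GROUP_SHIFT;
 *	uint16_t prio = group | (queue + PRIORITY_OFFSET);
 *	msg->t.tcm_info = TC_H_MAKE(prio << 16, htons(ETH_P_ALL));
 *
 * Each such filter runs the "cls_q" BPF section, which matches packets
 * reclassified by an RSS flow rule and steers them to its queue via the
 * skbedit action (TC_ACT_PIPE).
 */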
1922 
1923 /**
1924  * Manage the BPF RSS keys repository: init, get, release, deinit.
1925  *
1926  * @param[in] cmd
1927  *   Command on RSS keys: init, get, release, deinit
1928  *
1929  * @param[in, out] key_idx
1930  *   Pointer to the RSS key index (out for get command, in for release)
1931  *
1932  * @return 0 on success, -1 if the command could not be carried out.
1933  */
1934 static int bpf_rss_key(enum bpf_rss_key_e cmd, __u32 *key_idx)
1935 {
1936 	__u32 i;
1937 	int err = -1;
1938 	static __u32 num_used_keys;
1939 	static __u32 rss_keys[MAX_RSS_KEYS] = {KEY_STAT_UNSPEC};
1940 	static __u32 rss_keys_initialized;
1941 
1942 	switch (cmd) {
1943 	case KEY_CMD_GET:
1944 		if (!rss_keys_initialized)
1945 			break;
1946 
1947 		if (num_used_keys == RTE_DIM(rss_keys))
1948 			break;
1949 
1950 		*key_idx = num_used_keys % RTE_DIM(rss_keys);
1951 		while (rss_keys[*key_idx] == KEY_STAT_USED)
1952 			*key_idx = (*key_idx + 1) % RTE_DIM(rss_keys);
1953 
1954 		rss_keys[*key_idx] = KEY_STAT_USED;
1955 		num_used_keys++;
1956 		err = 0;
1957 		break;
1958 
1959 	case KEY_CMD_RELEASE:
1960 		if (!rss_keys_initialized) {
1961 			err = 0;
1962 			break;
1963 		}
1964 
1965 		if (rss_keys[*key_idx] == KEY_STAT_USED) {
1966 			rss_keys[*key_idx] = KEY_STAT_AVAILABLE;
1967 			num_used_keys--;
1968 			err = 0;
1969 		}
1970 		break;
1971 
1972 	case KEY_CMD_INIT:
1973 		for (i = 0; i < RTE_DIM(rss_keys); i++)
1974 			rss_keys[i] = KEY_STAT_AVAILABLE;
1975 
1976 		rss_keys_initialized = 1;
1977 		num_used_keys = 0;
1978 		err = 0;
1979 		break;
1980 
1981 	case KEY_CMD_DEINIT:
1982 		for (i = 0; i < RTE_DIM(rss_keys); i++)
1983 			rss_keys[i] = KEY_STAT_UNSPEC;
1984 
1985 		rss_keys_initialized = 0;
1986 		num_used_keys = 0;
1987 		err = 0;
1988 		break;
1989 
1990 	default:
1991 		break;
1992 	}
1993 
1994 	return err;
1995 }
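
/*
 * Minimal usage sketch for the key repository above (illustrative only):
 *
 *	__u32 key;
 *
 *	bpf_rss_key(KEY_CMD_INIT, NULL);	// mark all keys available
 *	if (bpf_rss_key(KEY_CMD_GET, &key) == 0) {
 *		// "key" now indexes one BPF map entry for one RSS rule
 *		bpf_rss_key(KEY_CMD_RELEASE, &key);
 *	}
 */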
1996 
1997 /**
1998  * Add RSS hash calculations and queue selection.
1999  *
2000  * @param[in, out] flow
2001  *   Pointer to rte_flow, used to store the RSS key index and BPF fds.
2002  * @param[in, out] pmd
2003  *   Pointer to internal structure, used to set/get the RSS map fd.
2004  * @param[in] rss
2005  *   Pointer to RSS flow actions.
2006  * @param[out] error
2007  *   Pointer to error reporting if not NULL.
2008  *
2009  * @return 0 on success, negative value on failure
2010  */
2011 static int rss_add_actions(struct rte_flow *flow, struct pmd_internals *pmd,
2012 			   const struct rte_flow_action_rss *rss,
2013 			   struct rte_flow_error *error)
2014 {
2016 	int i;
2017 	int err;
2018 	struct rss_key rss_entry = { .hash_fields = 0,
2019 				     .key_size = 0 };
2020 
2021 	/* Get a new map key for a new RSS rule */
2022 	err = bpf_rss_key(KEY_CMD_GET, &flow->key_idx);
2023 	if (err < 0) {
2024 		rte_flow_error_set(
2025 			error, EINVAL, RTE_FLOW_ERROR_TYPE_HANDLE, NULL,
2026 			"Failed to get BPF RSS key");
2027 
2028 		return -1;
2029 	}
2030 
2031 	/* Update RSS map entry with queues */
2032 	rss_entry.nb_queues = rss->num;
2033 	for (i = 0; i < rss->num; i++)
2034 		rss_entry.queues[i] = rss->queue[i];
2035 	rss_entry.hash_fields =
2036 		(1 << HASH_FIELD_IPV4_L3_L4) | (1 << HASH_FIELD_IPV6_L3_L4);
2037 
2038 	/* Add this RSS entry to map */
2039 	err = tap_flow_bpf_update_rss_elem(pmd->map_fd,
2040 				&flow->key_idx, &rss_entry);
2041 
2042 	if (err) {
2043 		RTE_LOG(ERR, PMD,
2044 			"Failed to update BPF map entry #%u (%d): %s\n",
2045 			flow->key_idx, errno, strerror(errno));
2046 		rte_flow_error_set(
2047 			error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE, NULL,
2048 			"Kernel too old or not configured "
2049 			"to support BPF maps updates");
2050 
2051 		return -ENOTSUP;
2052 	}
2053 
2054 	/* Load the BPF program computing the L3/L4 hash for this key_idx */
2059 	flow->bpf_fd[SEC_L3_L4] =
2060 		tap_flow_bpf_calc_l3_l4_hash(flow->key_idx, pmd->map_fd);
2061 	if (flow->bpf_fd[SEC_L3_L4] < 0) {
2062 		RTE_LOG(ERR, PMD,
2063 			"Failed to load BPF section %s (%d): %s\n",
2064 				sec_name[SEC_L3_L4], errno, strerror(errno));
2065 		rte_flow_error_set(
2066 			error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE, NULL,
2067 			"Kernel too old or not configured "
2068 			"to support BPF program loading");
2069 
2070 		return -ENOTSUP;
2071 	}
2072 
2073 	/* Actions */
2074 	{
2075 		struct action_data adata[] = {
2076 			{
2077 				.id = "bpf",
2078 				.bpf = {
2079 					.bpf_fd = flow->bpf_fd[SEC_L3_L4],
2080 					.annotation = sec_name[SEC_L3_L4],
2081 					.bpf = {
2082 						.action = TC_ACT_PIPE,
2083 					},
2084 				},
2085 			},
2086 		};
2087 
2088 		if (add_actions(flow, RTE_DIM(adata), adata,
2089 			TCA_FLOWER_ACT) < 0)
2090 			return -1;
2091 	}
2092 
2093 	return 0;
2094 }
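
/*
 * For illustration, an RSS action spreading over queues 0 and 1 yields a
 * map entry along these lines (only fields used above are shown):
 *
 *	struct rss_key rss_entry = {
 *		.hash_fields = (1 << HASH_FIELD_IPV4_L3_L4) |
 *			       (1 << HASH_FIELD_IPV6_L3_L4),
 *		.nb_queues = 2,
 *		.queues = { 0, 1 },
 *	};
 *
 * The l3_l4 BPF program loaded above looks this entry up by flow->key_idx
 * and picks the destination queue from the computed hash.
 */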
2095 
2096 /**
2097  * Manage filter operations.
2098  *
2099  * @param dev
2100  *   Pointer to Ethernet device structure.
2101  * @param filter_type
2102  *   Filter type.
2103  * @param filter_op
2104  *   Operation to perform.
2105  * @param arg
2106  *   Pointer to operation-specific structure.
2107  *
2108  * @return
2109  *   0 on success, negative errno value on failure.
2110  */
2111 int
2112 tap_dev_filter_ctrl(struct rte_eth_dev *dev,
2113 		    enum rte_filter_type filter_type,
2114 		    enum rte_filter_op filter_op,
2115 		    void *arg)
2116 {
2117 	switch (filter_type) {
2118 	case RTE_ETH_FILTER_GENERIC:
2119 		if (filter_op != RTE_ETH_FILTER_GET)
2120 			return -EINVAL;
2121 		*(const void **)arg = &tap_flow_ops;
2122 		return 0;
2123 	default:
2124 		RTE_LOG(ERR, PMD, "%p: filter type (%d) not supported\n",
2125 			(void *)dev, filter_type);
2126 	}
2127 	return -EINVAL;
2128 }
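
/*
 * Illustrative sketch of the caller side (simplified; the exact lookup
 * lives in librte_ether): rte_flow resolves a port's flow ops through
 * this entry point with RTE_ETH_FILTER_GENERIC/RTE_ETH_FILTER_GET.
 *
 *	const struct rte_flow_ops *ops;
 *
 *	if (rte_eth_dev_filter_ctrl(port_id, RTE_ETH_FILTER_GENERIC,
 *				    RTE_ETH_FILTER_GET, &ops) < 0)
 *		ops = NULL;	// generic flow API not supported
 */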
2129 
2130