xref: /dpdk/drivers/net/tap/tap_flow.c (revision f96fe1ab35b74b1e0914bc8a99aac633c5b15807)
/*-
 *   BSD LICENSE
 *
 *   Copyright 2017 6WIND S.A.
 *   Copyright 2017 Mellanox.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of 6WIND S.A. nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <errno.h>
#include <string.h>
#include <sys/queue.h>

#include <rte_byteorder.h>
#include <rte_jhash.h>
#include <rte_malloc.h>
#include <rte_eth_tap.h>
#include <tap_flow.h>
#include <tap_autoconf.h>
#include <tap_tcmsgs.h>

#ifndef HAVE_TC_FLOWER
/*
 * For kernels < 4.2, this enum is not defined. Runtime checks will be made to
 * avoid sending TC messages the kernel cannot understand.
 */
enum {
	TCA_FLOWER_UNSPEC,
	TCA_FLOWER_CLASSID,
	TCA_FLOWER_INDEV,
	TCA_FLOWER_ACT,
	TCA_FLOWER_KEY_ETH_DST,         /* ETH_ALEN */
	TCA_FLOWER_KEY_ETH_DST_MASK,    /* ETH_ALEN */
	TCA_FLOWER_KEY_ETH_SRC,         /* ETH_ALEN */
	TCA_FLOWER_KEY_ETH_SRC_MASK,    /* ETH_ALEN */
	TCA_FLOWER_KEY_ETH_TYPE,        /* be16 */
	TCA_FLOWER_KEY_IP_PROTO,        /* u8 */
	TCA_FLOWER_KEY_IPV4_SRC,        /* be32 */
	TCA_FLOWER_KEY_IPV4_SRC_MASK,   /* be32 */
	TCA_FLOWER_KEY_IPV4_DST,        /* be32 */
	TCA_FLOWER_KEY_IPV4_DST_MASK,   /* be32 */
	TCA_FLOWER_KEY_IPV6_SRC,        /* struct in6_addr */
	TCA_FLOWER_KEY_IPV6_SRC_MASK,   /* struct in6_addr */
	TCA_FLOWER_KEY_IPV6_DST,        /* struct in6_addr */
	TCA_FLOWER_KEY_IPV6_DST_MASK,   /* struct in6_addr */
	TCA_FLOWER_KEY_TCP_SRC,         /* be16 */
	TCA_FLOWER_KEY_TCP_DST,         /* be16 */
	TCA_FLOWER_KEY_UDP_SRC,         /* be16 */
	TCA_FLOWER_KEY_UDP_DST,         /* be16 */
};
#endif
#ifndef HAVE_TC_VLAN_ID
enum {
	/* TCA_FLOWER_FLAGS, */
	TCA_FLOWER_KEY_VLAN_ID = TCA_FLOWER_KEY_UDP_DST + 2, /* be16 */
	TCA_FLOWER_KEY_VLAN_PRIO,       /* u8   */
	TCA_FLOWER_KEY_VLAN_ETH_TYPE,   /* be16 */
};
#endif
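/*
 * Note on the "+ 2" above: in kernel headers that define these attributes,
 * TCA_FLOWER_FLAGS sits between TCA_FLOWER_KEY_UDP_DST and
 * TCA_FLOWER_KEY_VLAN_ID (hence the commented-out placeholder), so the VLAN
 * attributes start two values past TCA_FLOWER_KEY_UDP_DST.
 */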

struct rte_flow {
	LIST_ENTRY(rte_flow) next; /* Pointer to the next rte_flow structure. */
	struct rte_flow *remote_flow; /* Associated remote flow. */
	struct nlmsg msg;
};

struct convert_data {
	uint16_t eth_type;
	uint16_t ip_proto;
	uint8_t vlan;
	struct rte_flow *flow;
};

struct remote_rule {
	struct rte_flow_attr attr;
	struct rte_flow_item items[2];
	int mirred;
};

static int tap_flow_create_eth(const struct rte_flow_item *item, void *data);
static int tap_flow_create_vlan(const struct rte_flow_item *item, void *data);
static int tap_flow_create_ipv4(const struct rte_flow_item *item, void *data);
static int tap_flow_create_ipv6(const struct rte_flow_item *item, void *data);
static int tap_flow_create_udp(const struct rte_flow_item *item, void *data);
static int tap_flow_create_tcp(const struct rte_flow_item *item, void *data);
static int
tap_flow_validate(struct rte_eth_dev *dev,
		  const struct rte_flow_attr *attr,
		  const struct rte_flow_item items[],
		  const struct rte_flow_action actions[],
		  struct rte_flow_error *error);

static struct rte_flow *
tap_flow_create(struct rte_eth_dev *dev,
		const struct rte_flow_attr *attr,
		const struct rte_flow_item items[],
		const struct rte_flow_action actions[],
		struct rte_flow_error *error);

static int
tap_flow_destroy(struct rte_eth_dev *dev,
		 struct rte_flow *flow,
		 struct rte_flow_error *error);

static const struct rte_flow_ops tap_flow_ops = {
	.validate = tap_flow_validate,
	.create = tap_flow_create,
	.destroy = tap_flow_destroy,
	.flush = tap_flow_flush,
};
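/*
 * These callbacks are not invoked directly by applications: the generic
 * rte_flow layer retrieves this struct through tap_dev_filter_ctrl() (at the
 * bottom of this file) using RTE_ETH_FILTER_GENERIC/RTE_ETH_FILTER_GET, then
 * dispatches rte_flow_validate()/create()/destroy()/flush() calls to it.
 */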

/* Static initializer for items. */
#define ITEMS(...) \
	(const enum rte_flow_item_type []){ \
		__VA_ARGS__, RTE_FLOW_ITEM_TYPE_END, \
	}
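/*
 * For illustration, ITEMS(RTE_FLOW_ITEM_TYPE_UDP, RTE_FLOW_ITEM_TYPE_TCP)
 * expands to the compound literal:
 *   (const enum rte_flow_item_type []){
 *           RTE_FLOW_ITEM_TYPE_UDP, RTE_FLOW_ITEM_TYPE_TCP,
 *           RTE_FLOW_ITEM_TYPE_END,
 *   }
 * i.e. an anonymous array always terminated by the END item, which is how
 * the graph below marks the possible next layers.
 */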

/* Structure to generate a simple graph of layers supported by the NIC. */
struct tap_flow_items {
	/* Bit-mask corresponding to what is supported for this item. */
	const void *mask;
	const unsigned int mask_sz; /* Bit-mask size in bytes. */
	/*
	 * Bit-mask corresponding to the default mask, if none is provided
	 * along with the item.
	 */
	const void *default_mask;
	/**
	 * Conversion function from rte_flow to netlink attributes.
	 *
	 * @param item
	 *   rte_flow item to convert.
	 * @param data
	 *   Internal structure to store the conversion.
	 *
	 * @return
	 *   0 on success, negative value otherwise.
	 */
	int (*convert)(const struct rte_flow_item *item, void *data);
	/** List of possible following items. */
	const enum rte_flow_item_type *const items;
};

/* Graph of supported items and associated actions. */
static const struct tap_flow_items tap_flow_items[] = {
	[RTE_FLOW_ITEM_TYPE_END] = {
		.items = ITEMS(RTE_FLOW_ITEM_TYPE_ETH),
	},
	[RTE_FLOW_ITEM_TYPE_ETH] = {
		.items = ITEMS(
			RTE_FLOW_ITEM_TYPE_VLAN,
			RTE_FLOW_ITEM_TYPE_IPV4,
			RTE_FLOW_ITEM_TYPE_IPV6),
		.mask = &(const struct rte_flow_item_eth){
			.dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
			.src.addr_bytes = "\xff\xff\xff\xff\xff\xff",
			.type = -1,
		},
		.mask_sz = sizeof(struct rte_flow_item_eth),
		.default_mask = &rte_flow_item_eth_mask,
		.convert = tap_flow_create_eth,
	},
	[RTE_FLOW_ITEM_TYPE_VLAN] = {
		.items = ITEMS(RTE_FLOW_ITEM_TYPE_IPV4,
			       RTE_FLOW_ITEM_TYPE_IPV6),
		.mask = &(const struct rte_flow_item_vlan){
			.tpid = -1,
			/* DEI matching is not supported. */
#if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN
			.tci = 0xffef,
#else
			.tci = 0xefff,
#endif
		},
		.mask_sz = sizeof(struct rte_flow_item_vlan),
		.default_mask = &rte_flow_item_vlan_mask,
		.convert = tap_flow_create_vlan,
	},
	[RTE_FLOW_ITEM_TYPE_IPV4] = {
		.items = ITEMS(RTE_FLOW_ITEM_TYPE_UDP,
			       RTE_FLOW_ITEM_TYPE_TCP),
		.mask = &(const struct rte_flow_item_ipv4){
			.hdr = {
				.src_addr = -1,
				.dst_addr = -1,
				.next_proto_id = -1,
			},
		},
		.mask_sz = sizeof(struct rte_flow_item_ipv4),
		.default_mask = &rte_flow_item_ipv4_mask,
		.convert = tap_flow_create_ipv4,
	},
	[RTE_FLOW_ITEM_TYPE_IPV6] = {
		.items = ITEMS(RTE_FLOW_ITEM_TYPE_UDP,
			       RTE_FLOW_ITEM_TYPE_TCP),
		.mask = &(const struct rte_flow_item_ipv6){
			.hdr = {
				.src_addr = {
					"\xff\xff\xff\xff\xff\xff\xff\xff"
					"\xff\xff\xff\xff\xff\xff\xff\xff",
				},
				.dst_addr = {
					"\xff\xff\xff\xff\xff\xff\xff\xff"
					"\xff\xff\xff\xff\xff\xff\xff\xff",
				},
				.proto = -1,
			},
		},
		.mask_sz = sizeof(struct rte_flow_item_ipv6),
		.default_mask = &rte_flow_item_ipv6_mask,
		.convert = tap_flow_create_ipv6,
	},
	[RTE_FLOW_ITEM_TYPE_UDP] = {
		.mask = &(const struct rte_flow_item_udp){
			.hdr = {
				.src_port = -1,
				.dst_port = -1,
			},
		},
		.mask_sz = sizeof(struct rte_flow_item_udp),
		.default_mask = &rte_flow_item_udp_mask,
		.convert = tap_flow_create_udp,
	},
	[RTE_FLOW_ITEM_TYPE_TCP] = {
		.mask = &(const struct rte_flow_item_tcp){
			.hdr = {
				.src_port = -1,
				.dst_port = -1,
			},
		},
		.mask_sz = sizeof(struct rte_flow_item_tcp),
		.default_mask = &rte_flow_item_tcp_mask,
		.convert = tap_flow_create_tcp,
	},
};

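/*
 * Implicit rules applied between the remote netdevice and the tap. Note the
 * destination MAC masks: 33:33:00:00:00:00 matches the IPv6 multicast MAC
 * prefix, and 01:00:00:00:00:00 matches only the group (multicast) bit of
 * the first octet, i.e. any multicast destination.
 */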
static struct remote_rule implicit_rte_flows[TAP_REMOTE_MAX_IDX] = {
	[TAP_REMOTE_LOCAL_MAC] = {
		.attr = {
			.group = MAX_GROUP,
			.priority = PRIORITY_MASK - TAP_REMOTE_LOCAL_MAC,
			.ingress = 1,
		},
		.items[0] = {
			.type = RTE_FLOW_ITEM_TYPE_ETH,
			.mask = &(const struct rte_flow_item_eth){
				.dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
			},
		},
		.items[1] = {
			.type = RTE_FLOW_ITEM_TYPE_END,
		},
		.mirred = TCA_EGRESS_REDIR,
	},
	[TAP_REMOTE_BROADCAST] = {
		.attr = {
			.group = MAX_GROUP,
			.priority = PRIORITY_MASK - TAP_REMOTE_BROADCAST,
			.ingress = 1,
		},
		.items[0] = {
			.type = RTE_FLOW_ITEM_TYPE_ETH,
			.mask = &(const struct rte_flow_item_eth){
				.dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
			},
			.spec = &(const struct rte_flow_item_eth){
				.dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
			},
		},
		.items[1] = {
			.type = RTE_FLOW_ITEM_TYPE_END,
		},
		.mirred = TCA_EGRESS_MIRROR,
	},
	[TAP_REMOTE_BROADCASTV6] = {
		.attr = {
			.group = MAX_GROUP,
			.priority = PRIORITY_MASK - TAP_REMOTE_BROADCASTV6,
			.ingress = 1,
		},
		.items[0] = {
			.type = RTE_FLOW_ITEM_TYPE_ETH,
			.mask = &(const struct rte_flow_item_eth){
				.dst.addr_bytes = "\x33\x33\x00\x00\x00\x00",
			},
			.spec = &(const struct rte_flow_item_eth){
				.dst.addr_bytes = "\x33\x33\x00\x00\x00\x00",
			},
		},
		.items[1] = {
			.type = RTE_FLOW_ITEM_TYPE_END,
		},
		.mirred = TCA_EGRESS_MIRROR,
	},
	[TAP_REMOTE_PROMISC] = {
		.attr = {
			.group = MAX_GROUP,
			.priority = PRIORITY_MASK - TAP_REMOTE_PROMISC,
			.ingress = 1,
		},
		.items[0] = {
			.type = RTE_FLOW_ITEM_TYPE_VOID,
		},
		.items[1] = {
			.type = RTE_FLOW_ITEM_TYPE_END,
		},
		.mirred = TCA_EGRESS_MIRROR,
	},
	[TAP_REMOTE_ALLMULTI] = {
		.attr = {
			.group = MAX_GROUP,
			.priority = PRIORITY_MASK - TAP_REMOTE_ALLMULTI,
			.ingress = 1,
		},
		.items[0] = {
			.type = RTE_FLOW_ITEM_TYPE_ETH,
			.mask = &(const struct rte_flow_item_eth){
				.dst.addr_bytes = "\x01\x00\x00\x00\x00\x00",
			},
			.spec = &(const struct rte_flow_item_eth){
				.dst.addr_bytes = "\x01\x00\x00\x00\x00\x00",
			},
		},
		.items[1] = {
			.type = RTE_FLOW_ITEM_TYPE_END,
		},
		.mirred = TCA_EGRESS_MIRROR,
	},
	[TAP_REMOTE_TX] = {
		.attr = {
			.group = 0,
			.priority = TAP_REMOTE_TX,
			.egress = 1,
		},
		.items[0] = {
			.type = RTE_FLOW_ITEM_TYPE_VOID,
		},
		.items[1] = {
			.type = RTE_FLOW_ITEM_TYPE_END,
		},
		.mirred = TCA_EGRESS_MIRROR,
	},
};

/**
 * Perform as many checks as possible on an Ethernet item, and if a flow is
 * provided, fill it appropriately with Ethernet info.
 *
 * @param[in] item
 *   Item specification.
 * @param[in, out] data
 *   Additional data structure to tell next layers we've been here.
 *
 * @return
 *   0 if the checks pass, -1 otherwise.
 */
static int
tap_flow_create_eth(const struct rte_flow_item *item, void *data)
{
	struct convert_data *info = (struct convert_data *)data;
	const struct rte_flow_item_eth *spec = item->spec;
	const struct rte_flow_item_eth *mask = item->mask;
	struct rte_flow *flow = info->flow;
	struct nlmsg *msg;

	/* use default mask if none provided */
	if (!mask)
		mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_ETH].default_mask;
	/* TC does not support eth_type masking. Only accept if exact match. */
	if (mask->type && mask->type != 0xffff)
		return -1;
	if (!spec)
		return 0;
	/* store eth_type for consistency if ipv4/6 pattern item comes next */
	if (spec->type & mask->type)
		info->eth_type = spec->type;
	if (!flow)
		return 0;
	msg = &flow->msg;
	if (!is_zero_ether_addr(&mask->dst)) {
		nlattr_add(&msg->nh, TCA_FLOWER_KEY_ETH_DST, ETHER_ADDR_LEN,
			   &spec->dst.addr_bytes);
		nlattr_add(&msg->nh,
			   TCA_FLOWER_KEY_ETH_DST_MASK, ETHER_ADDR_LEN,
			   &mask->dst.addr_bytes);
	}
	if (!is_zero_ether_addr(&mask->src)) {
		nlattr_add(&msg->nh, TCA_FLOWER_KEY_ETH_SRC, ETHER_ADDR_LEN,
			   &spec->src.addr_bytes);
		nlattr_add(&msg->nh,
			   TCA_FLOWER_KEY_ETH_SRC_MASK, ETHER_ADDR_LEN,
			   &mask->src.addr_bytes);
	}
	return 0;
}

/**
 * Perform as many checks as possible on a VLAN item, and if a flow is
 * provided, fill it appropriately with VLAN info.
 *
 * @param[in] item
 *   Item specification.
 * @param[in, out] data
 *   Additional data structure to tell next layers we've been here.
 *
 * @return
 *   0 if the checks pass, -1 otherwise.
 */
static int
tap_flow_create_vlan(const struct rte_flow_item *item, void *data)
{
	struct convert_data *info = (struct convert_data *)data;
	const struct rte_flow_item_vlan *spec = item->spec;
	const struct rte_flow_item_vlan *mask = item->mask;
	struct rte_flow *flow = info->flow;
	struct nlmsg *msg;

	/* use default mask if none provided */
	if (!mask)
		mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_VLAN].default_mask;
	/* TC does not support tpid masking. Only accept if exact match. */
	if (mask->tpid && mask->tpid != 0xffff)
		return -1;
	/* Double-tagging not supported. */
	if (spec && mask->tpid && spec->tpid != htons(ETH_P_8021Q))
		return -1;
	info->vlan = 1;
	if (!flow)
		return 0;
	msg = &flow->msg;
	msg->t.tcm_info = TC_H_MAKE(msg->t.tcm_info, htons(ETH_P_8021Q));
#define VLAN_PRIO(tci) ((tci) >> 13)
#define VLAN_ID(tci) ((tci) & 0xfff)
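	/*
	 * 802.1Q TCI layout: PCP (3 bits) | DEI (1 bit) | VID (12 bits).
	 * VLAN_PRIO() extracts the 3-bit priority, VLAN_ID() the 12-bit
	 * VLAN ID; the DEI bit belongs to neither (and is excluded from the
	 * supported mask above).
	 */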
	if (!spec)
		return 0;
	if (spec->tci) {
		uint16_t tci = ntohs(spec->tci) & mask->tci;
		uint16_t prio = VLAN_PRIO(tci);
		uint16_t vid = VLAN_ID(tci);

		if (prio)
			nlattr_add8(&msg->nh, TCA_FLOWER_KEY_VLAN_PRIO, prio);
		if (vid)
			nlattr_add16(&msg->nh, TCA_FLOWER_KEY_VLAN_ID, vid);
	}
	return 0;
}

/**
 * Perform as many checks as possible on an IPv4 item, and if a flow is
 * provided, fill it appropriately with IPv4 info.
 *
 * @param[in] item
 *   Item specification.
 * @param[in, out] data
 *   Additional data structure to tell next layers we've been here.
 *
 * @return
 *   0 if the checks pass, -1 otherwise.
 */
static int
tap_flow_create_ipv4(const struct rte_flow_item *item, void *data)
{
	struct convert_data *info = (struct convert_data *)data;
	const struct rte_flow_item_ipv4 *spec = item->spec;
	const struct rte_flow_item_ipv4 *mask = item->mask;
	struct rte_flow *flow = info->flow;
	struct nlmsg *msg;

	/* use default mask if none provided */
	if (!mask)
		mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_IPV4].default_mask;
	/* check that previous eth type is compatible with ipv4 */
	if (info->eth_type && info->eth_type != htons(ETH_P_IP))
		return -1;
	/* store ip_proto for consistency if udp/tcp pattern item comes next */
	if (spec)
		info->ip_proto = spec->hdr.next_proto_id;
	if (!flow)
		return 0;
	msg = &flow->msg;
	if (!info->eth_type)
		info->eth_type = htons(ETH_P_IP);
	if (!spec)
		return 0;
	if (spec->hdr.dst_addr) {
		nlattr_add32(&msg->nh, TCA_FLOWER_KEY_IPV4_DST,
			     spec->hdr.dst_addr);
		nlattr_add32(&msg->nh, TCA_FLOWER_KEY_IPV4_DST_MASK,
			     mask->hdr.dst_addr);
	}
	if (spec->hdr.src_addr) {
		nlattr_add32(&msg->nh, TCA_FLOWER_KEY_IPV4_SRC,
			     spec->hdr.src_addr);
		nlattr_add32(&msg->nh, TCA_FLOWER_KEY_IPV4_SRC_MASK,
			     mask->hdr.src_addr);
	}
	if (spec->hdr.next_proto_id)
		nlattr_add8(&msg->nh, TCA_FLOWER_KEY_IP_PROTO,
			    spec->hdr.next_proto_id);
	return 0;
}

/**
 * Perform as many checks as possible on an IPv6 item, and if a flow is
 * provided, fill it appropriately with IPv6 info.
 *
 * @param[in] item
 *   Item specification.
 * @param[in, out] data
 *   Additional data structure to tell next layers we've been here.
 *
 * @return
 *   0 if the checks pass, -1 otherwise.
 */
static int
tap_flow_create_ipv6(const struct rte_flow_item *item, void *data)
{
	struct convert_data *info = (struct convert_data *)data;
	const struct rte_flow_item_ipv6 *spec = item->spec;
	const struct rte_flow_item_ipv6 *mask = item->mask;
	struct rte_flow *flow = info->flow;
	uint8_t empty_addr[16] = { 0 };
	struct nlmsg *msg;

	/* use default mask if none provided */
	if (!mask)
		mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_IPV6].default_mask;
	/* check that previous eth type is compatible with ipv6 */
	if (info->eth_type && info->eth_type != htons(ETH_P_IPV6))
		return -1;
	/* store ip_proto for consistency if udp/tcp pattern item comes next */
	if (spec)
		info->ip_proto = spec->hdr.proto;
	if (!flow)
		return 0;
	msg = &flow->msg;
	if (!info->eth_type)
		info->eth_type = htons(ETH_P_IPV6);
	if (!spec)
		return 0;
	if (memcmp(spec->hdr.dst_addr, empty_addr, 16)) {
		nlattr_add(&msg->nh, TCA_FLOWER_KEY_IPV6_DST,
			   sizeof(spec->hdr.dst_addr), &spec->hdr.dst_addr);
		nlattr_add(&msg->nh, TCA_FLOWER_KEY_IPV6_DST_MASK,
			   sizeof(mask->hdr.dst_addr), &mask->hdr.dst_addr);
	}
	if (memcmp(spec->hdr.src_addr, empty_addr, 16)) {
		nlattr_add(&msg->nh, TCA_FLOWER_KEY_IPV6_SRC,
			   sizeof(spec->hdr.src_addr), &spec->hdr.src_addr);
		nlattr_add(&msg->nh, TCA_FLOWER_KEY_IPV6_SRC_MASK,
			   sizeof(mask->hdr.src_addr), &mask->hdr.src_addr);
	}
	if (spec->hdr.proto)
		nlattr_add8(&msg->nh, TCA_FLOWER_KEY_IP_PROTO, spec->hdr.proto);
	return 0;
}

/**
 * Perform as many checks as possible on a UDP item, and if a flow is
 * provided, fill it appropriately with UDP info.
 *
 * @param[in] item
 *   Item specification.
 * @param[in, out] data
 *   Additional data structure to tell next layers we've been here.
 *
 * @return
 *   0 if the checks pass, -1 otherwise.
 */
static int
tap_flow_create_udp(const struct rte_flow_item *item, void *data)
{
	struct convert_data *info = (struct convert_data *)data;
	const struct rte_flow_item_udp *spec = item->spec;
	const struct rte_flow_item_udp *mask = item->mask;
	struct rte_flow *flow = info->flow;
	struct nlmsg *msg;

	/* use default mask if none provided */
	if (!mask)
		mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_UDP].default_mask;
	/* check that previous ip_proto is compatible with udp */
	if (info->ip_proto && info->ip_proto != IPPROTO_UDP)
		return -1;
	/* TC does not support UDP port masking. Only accept if exact match. */
	if ((mask->hdr.src_port && mask->hdr.src_port != 0xffff) ||
	    (mask->hdr.dst_port && mask->hdr.dst_port != 0xffff))
		return -1;
	if (!flow)
		return 0;
	msg = &flow->msg;
	nlattr_add8(&msg->nh, TCA_FLOWER_KEY_IP_PROTO, IPPROTO_UDP);
	if (!spec)
		return 0;
	if (spec->hdr.dst_port & mask->hdr.dst_port)
		nlattr_add16(&msg->nh, TCA_FLOWER_KEY_UDP_DST,
			     spec->hdr.dst_port);
	if (spec->hdr.src_port & mask->hdr.src_port)
		nlattr_add16(&msg->nh, TCA_FLOWER_KEY_UDP_SRC,
			     spec->hdr.src_port);
	return 0;
}

/**
 * Perform as many checks as possible on a TCP item, and if a flow is
 * provided, fill it appropriately with TCP info.
 *
 * @param[in] item
 *   Item specification.
 * @param[in, out] data
 *   Additional data structure to tell next layers we've been here.
 *
 * @return
 *   0 if the checks pass, -1 otherwise.
 */
static int
tap_flow_create_tcp(const struct rte_flow_item *item, void *data)
{
	struct convert_data *info = (struct convert_data *)data;
	const struct rte_flow_item_tcp *spec = item->spec;
	const struct rte_flow_item_tcp *mask = item->mask;
	struct rte_flow *flow = info->flow;
	struct nlmsg *msg;

	/* use default mask if none provided */
	if (!mask)
		mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_TCP].default_mask;
	/* check that previous ip_proto is compatible with tcp */
	if (info->ip_proto && info->ip_proto != IPPROTO_TCP)
		return -1;
	/* TC does not support TCP port masking. Only accept if exact match. */
	if ((mask->hdr.src_port && mask->hdr.src_port != 0xffff) ||
	    (mask->hdr.dst_port && mask->hdr.dst_port != 0xffff))
		return -1;
	if (!flow)
		return 0;
	msg = &flow->msg;
	nlattr_add8(&msg->nh, TCA_FLOWER_KEY_IP_PROTO, IPPROTO_TCP);
	if (!spec)
		return 0;
	if (spec->hdr.dst_port & mask->hdr.dst_port)
		nlattr_add16(&msg->nh, TCA_FLOWER_KEY_TCP_DST,
			     spec->hdr.dst_port);
	if (spec->hdr.src_port & mask->hdr.src_port)
		nlattr_add16(&msg->nh, TCA_FLOWER_KEY_TCP_SRC,
			     spec->hdr.src_port);
	return 0;
}

/**
 * Check support for a given item.
 *
 * @param[in] item
 *   Item specification.
 * @param size
 *   Bit-mask size in bytes.
 * @param[in] supported_mask
 *   Bit-mask covering supported fields to compare with spec, last and mask in
 *   \item.
 * @param[in] default_mask
 *   Bit-mask default mask if none is provided in \item.
 *
 * @return
 *   0 on success, nonzero otherwise.
 */
static int
tap_flow_item_validate(const struct rte_flow_item *item,
		       unsigned int size,
		       const uint8_t *supported_mask,
		       const uint8_t *default_mask)
{
	int ret = 0;

	/* An empty layer is allowed, as long as all fields are NULL */
	if (!item->spec && (item->mask || item->last))
		return -1;
	/* Is the item spec compatible with what the NIC supports? */
	if (item->spec && !item->mask) {
		unsigned int i;
		const uint8_t *spec = item->spec;

		for (i = 0; i < size; ++i)
			if ((spec[i] | supported_mask[i]) != supported_mask[i])
				return -1;
		/* Is the default mask compatible with what the NIC supports? */
		for (i = 0; i < size; ++i)
			if ((default_mask[i] | supported_mask[i]) !=
			    supported_mask[i])
				return -1;
	}
	/* Is the item last compatible with what the NIC supports? */
	if (item->last && !item->mask) {
		unsigned int i;
		const uint8_t *spec = item->last;

		for (i = 0; i < size; ++i)
			if ((spec[i] | supported_mask[i]) != supported_mask[i])
				return -1;
	}
	/* Is the item mask compatible with what the NIC supports? */
	if (item->mask) {
		unsigned int i;
		const uint8_t *spec = item->mask;

		for (i = 0; i < size; ++i)
			if ((spec[i] | supported_mask[i]) != supported_mask[i])
				return -1;
	}
	/*
	 * Once masked, are item spec and item last equal?
	 * TC does not support ranges, so anything else is invalid.
	 */
	if (item->spec && item->last) {
		uint8_t spec[size];
		uint8_t last[size];
		const uint8_t *apply = default_mask;
		unsigned int i;

		if (item->mask)
			apply = item->mask;
		for (i = 0; i < size; ++i) {
			spec[i] = ((const uint8_t *)item->spec)[i] & apply[i];
			last[i] = ((const uint8_t *)item->last)[i] & apply[i];
		}
		ret = memcmp(spec, last, size);
	}
	return ret;
}

/**
 * Transform a DROP/PASSTHRU action item in the provided flow for TC.
 *
 * @param[in, out] flow
 *   Flow to be filled.
 * @param[in] action
 *   Appropriate action to be set in the TCA_GACT_PARMS structure.
 *
 * @return
 *   0 if the checks pass, -1 otherwise.
 */
static int
add_action_gact(struct rte_flow *flow, int action)
{
	struct nlmsg *msg = &flow->msg;
	size_t act_index = 1;
	struct tc_gact p = {
		.action = action
	};

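	/*
	 * Nested attribute layout built below (illustrative):
	 *   TCA_FLOWER_ACT
	 *     act_index (1)
	 *       TCA_ACT_KIND    = "gact"
	 *       TCA_ACT_OPTIONS
	 *         TCA_GACT_PARMS = p
	 * add_action_mirred() and add_action_skbedit() follow the same shape.
	 */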
	if (nlattr_nested_start(msg, TCA_FLOWER_ACT) < 0)
		return -1;
	if (nlattr_nested_start(msg, act_index++) < 0)
		return -1;
	nlattr_add(&msg->nh, TCA_ACT_KIND, sizeof("gact"), "gact");
	if (nlattr_nested_start(msg, TCA_ACT_OPTIONS) < 0)
		return -1;
	nlattr_add(&msg->nh, TCA_GACT_PARMS, sizeof(p), &p);
	nlattr_nested_finish(msg); /* nested TCA_ACT_OPTIONS */
	nlattr_nested_finish(msg); /* nested act_index */
	nlattr_nested_finish(msg); /* nested TCA_FLOWER_ACT */
	return 0;
}

/**
 * Transform a MIRRED action item in the provided flow for TC.
 *
 * @param[in, out] flow
 *   Flow to be filled.
 * @param[in] ifindex
 *   Netdevice ifindex, where to mirror/redirect packet to.
 * @param[in] action_type
 *   Either TCA_EGRESS_REDIR for redirection or TCA_EGRESS_MIRROR for mirroring.
 *
 * @return
 *   0 if the checks pass, -1 otherwise.
 */
static int
add_action_mirred(struct rte_flow *flow, uint16_t ifindex, uint16_t action_type)
{
	struct nlmsg *msg = &flow->msg;
	size_t act_index = 1;
	struct tc_mirred p = {
		.eaction = action_type,
		.ifindex = ifindex,
	};

	if (nlattr_nested_start(msg, TCA_FLOWER_ACT) < 0)
		return -1;
	if (nlattr_nested_start(msg, act_index++) < 0)
		return -1;
	nlattr_add(&msg->nh, TCA_ACT_KIND, sizeof("mirred"), "mirred");
	if (nlattr_nested_start(msg, TCA_ACT_OPTIONS) < 0)
		return -1;
	if (action_type == TCA_EGRESS_MIRROR)
		p.action = TC_ACT_PIPE;
	else /* REDIRECT */
		p.action = TC_ACT_STOLEN;
	nlattr_add(&msg->nh, TCA_MIRRED_PARMS, sizeof(p), &p);
	nlattr_nested_finish(msg); /* nested TCA_ACT_OPTIONS */
	nlattr_nested_finish(msg); /* nested act_index */
	nlattr_nested_finish(msg); /* nested TCA_FLOWER_ACT */
	return 0;
}

/**
 * Transform a QUEUE action item in the provided flow for TC.
 *
 * @param[in, out] flow
 *   Flow to be filled.
 * @param[in] queue
 *   Queue id to use.
 *
 * @return
 *   0 if the checks pass, -1 otherwise.
 */
static int
add_action_skbedit(struct rte_flow *flow, uint16_t queue)
{
	struct nlmsg *msg = &flow->msg;
	size_t act_index = 1;
	struct tc_skbedit p = {
		.action = TC_ACT_PIPE
	};

	if (nlattr_nested_start(msg, TCA_FLOWER_ACT) < 0)
		return -1;
	if (nlattr_nested_start(msg, act_index++) < 0)
		return -1;
	nlattr_add(&msg->nh, TCA_ACT_KIND, sizeof("skbedit"), "skbedit");
	if (nlattr_nested_start(msg, TCA_ACT_OPTIONS) < 0)
		return -1;
	nlattr_add(&msg->nh, TCA_SKBEDIT_PARMS, sizeof(p), &p);
	nlattr_add16(&msg->nh, TCA_SKBEDIT_QUEUE_MAPPING, queue);
	nlattr_nested_finish(msg); /* nested TCA_ACT_OPTIONS */
	nlattr_nested_finish(msg); /* nested act_index */
	nlattr_nested_finish(msg); /* nested TCA_FLOWER_ACT */
	return 0;
}

/**
 * Validate a flow supported by TC.
 * If flow param is not NULL, then also fill the netlink message inside.
 *
 * @param pmd
 *   Pointer to private structure.
 * @param[in] attr
 *   Flow rule attributes.
 * @param[in] items
 *   Pattern specification (list terminated by the END pattern item).
 * @param[in] actions
 *   Associated actions (list terminated by the END action).
 * @param[out] error
 *   Perform verbose error reporting if not NULL.
 * @param[in, out] flow
 *   Flow structure to update.
 * @param[in] mirred
 *   If set to TCA_EGRESS_REDIR, provided actions will be replaced with a
 *   redirection to the tap netdevice, and the TC rule will be configured
 *   on the remote netdevice in pmd.
 *   If set to TCA_EGRESS_MIRROR, provided actions will be replaced with a
 *   mirroring to the tap netdevice, and the TC rule will be configured
 *   on the remote netdevice in pmd. Matching packets will thus be duplicated.
 *   If set to 0, the standard behavior is to be used: set correct actions for
 *   the TC rule, and apply it on the tap netdevice.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
static int
priv_flow_process(struct pmd_internals *pmd,
		  const struct rte_flow_attr *attr,
		  const struct rte_flow_item items[],
		  const struct rte_flow_action actions[],
		  struct rte_flow_error *error,
		  struct rte_flow *flow,
		  int mirred)
{
	const struct tap_flow_items *cur_item = tap_flow_items;
	struct convert_data data = {
		.eth_type = 0,
		.ip_proto = 0,
		.flow = flow,
	};
	int action = 0; /* Only one action authorized for now */

	if (attr->group > MAX_GROUP) {
		rte_flow_error_set(
			error, EINVAL, RTE_FLOW_ERROR_TYPE_ATTR_GROUP,
			NULL, "group value too big: cannot exceed 15");
		return -rte_errno;
	}
	if (attr->priority > MAX_PRIORITY) {
		rte_flow_error_set(
			error, EINVAL, RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
			NULL, "priority value too big");
		return -rte_errno;
	} else if (flow) {
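		/*
		 * TC filter convention: the upper 16 bits of tcm_info carry
		 * the rule priority, the lower 16 bits the protocol. Group
		 * and priority are packed together into that 16-bit priority.
		 */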
		uint16_t group = attr->group << GROUP_SHIFT;
		uint16_t prio = group | (attr->priority + PRIORITY_OFFSET);

		flow->msg.t.tcm_info = TC_H_MAKE(prio << 16,
						 flow->msg.t.tcm_info);
	}
	if (flow) {
		if (mirred) {
			/*
			 * If attr->ingress, the rule applies on remote ingress
			 * to match incoming packets.
			 * If attr->egress, the rule applies on tap ingress (as
			 * seen from the kernel) to deal with packets going out
			 * from the DPDK app.
			 */
			flow->msg.t.tcm_parent = TC_H_MAKE(TC_H_INGRESS, 0);
		} else {
			/* Standard rule on tap egress (kernel standpoint). */
			flow->msg.t.tcm_parent =
				TC_H_MAKE(MULTIQ_MAJOR_HANDLE, 0);
		}
		/* use flower filter type */
		nlattr_add(&flow->msg.nh, TCA_KIND, sizeof("flower"), "flower");
		if (nlattr_nested_start(&flow->msg, TCA_OPTIONS) < 0)
			goto exit_item_not_supported;
	}
	for (; items->type != RTE_FLOW_ITEM_TYPE_END; ++items) {
		const struct tap_flow_items *token = NULL;
		unsigned int i;
		int err = 0;

		if (items->type == RTE_FLOW_ITEM_TYPE_VOID)
			continue;
		for (i = 0;
		     cur_item->items &&
		     cur_item->items[i] != RTE_FLOW_ITEM_TYPE_END;
		     ++i) {
			if (cur_item->items[i] == items->type) {
				token = &tap_flow_items[items->type];
				break;
			}
		}
		if (!token)
			goto exit_item_not_supported;
		cur_item = token;
		err = tap_flow_item_validate(
			items, cur_item->mask_sz,
			(const uint8_t *)cur_item->mask,
			(const uint8_t *)cur_item->default_mask);
		if (err)
			goto exit_item_not_supported;
		if (flow && cur_item->convert) {
			if (!pmd->flower_vlan_support &&
			    cur_item->convert == tap_flow_create_vlan)
				goto exit_item_not_supported;
			err = cur_item->convert(items, &data);
			if (err)
				goto exit_item_not_supported;
		}
	}
	if (flow) {
		if (pmd->flower_vlan_support && data.vlan) {
			nlattr_add16(&flow->msg.nh, TCA_FLOWER_KEY_ETH_TYPE,
				     htons(ETH_P_8021Q));
			nlattr_add16(&flow->msg.nh,
				     TCA_FLOWER_KEY_VLAN_ETH_TYPE,
				     data.eth_type ?
				     data.eth_type : htons(ETH_P_ALL));
		} else if (data.eth_type) {
			nlattr_add16(&flow->msg.nh, TCA_FLOWER_KEY_ETH_TYPE,
				     data.eth_type);
		}
	}
	if (mirred && flow) {
		uint16_t if_index = pmd->if_index;

		/*
		 * If attr->egress && mirred, then this is a special
		 * case where the rule must be applied on the tap, to
		 * redirect packets coming from the DPDK App, out
		 * through the remote netdevice.
		 */
		if (attr->egress)
			if_index = pmd->remote_if_index;
		if (add_action_mirred(flow, if_index, mirred) < 0)
			goto exit_action_not_supported;
		else
			goto end;
	}
	for (; actions->type != RTE_FLOW_ACTION_TYPE_END; ++actions) {
		int err = 0;

		if (actions->type == RTE_FLOW_ACTION_TYPE_VOID) {
			continue;
		} else if (actions->type == RTE_FLOW_ACTION_TYPE_DROP) {
			if (action)
				goto exit_action_not_supported;
			action = 1;
			if (flow)
				err = add_action_gact(flow, TC_ACT_SHOT);
		} else if (actions->type == RTE_FLOW_ACTION_TYPE_PASSTHRU) {
			if (action)
				goto exit_action_not_supported;
			action = 1;
			if (flow)
				err = add_action_gact(flow, TC_ACT_UNSPEC);
		} else if (actions->type == RTE_FLOW_ACTION_TYPE_QUEUE) {
			const struct rte_flow_action_queue *queue =
				(const struct rte_flow_action_queue *)
				actions->conf;

			if (action)
				goto exit_action_not_supported;
			action = 1;
			if (!queue || (queue->index >= pmd->nb_queues))
				goto exit_action_not_supported;
			if (flow)
				err = add_action_skbedit(flow, queue->index);
		} else {
			goto exit_action_not_supported;
		}
		if (err)
			goto exit_action_not_supported;
	}
end:
	if (flow)
		nlattr_nested_finish(&flow->msg); /* nested TCA_OPTIONS */
	return 0;
exit_item_not_supported:
	rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM,
			   items, "item not supported");
	return -rte_errno;
exit_action_not_supported:
	rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION,
			   actions, "action not supported");
	return -rte_errno;
}

/**
 * Validate a flow.
 *
 * @see rte_flow_validate()
 * @see rte_flow_ops
 */
static int
tap_flow_validate(struct rte_eth_dev *dev,
		  const struct rte_flow_attr *attr,
		  const struct rte_flow_item items[],
		  const struct rte_flow_action actions[],
		  struct rte_flow_error *error)
{
	struct pmd_internals *pmd = dev->data->dev_private;

	return priv_flow_process(pmd, attr, items, actions, error, NULL, 0);
}

/**
 * Set a unique handle in a flow.
 *
 * The kernel supports TC rules with equal priority, as long as they use the
 * same matching fields (e.g. dst mac and ipv4) with different values (and
 * full mask to ensure no collision is possible).
 * In those rules, the handle (uint32_t) is the part that would identify
 * specifically each rule.
 *
 * On 32-bit architectures, the handle can simply be the flow's pointer
 * address. On 64-bit architectures, we rely on jhash(flow) to find a
 * (sufficiently) unique handle.
 *
 * @param[in, out] flow
 *   The flow that needs its handle set.
 */
static void
tap_flow_set_handle(struct rte_flow *flow)
{
	uint32_t handle = 0;

	if (sizeof(flow) > 4)
		handle = rte_jhash(&flow, sizeof(flow), 1);
	else
		handle = (uintptr_t)flow;
	/* must be at least 1 to avoid letting the kernel choose one for us */
	if (!handle)
		handle = 1;
	flow->msg.t.tcm_handle = handle;
}

/**
 * Create a flow.
 *
 * @see rte_flow_create()
 * @see rte_flow_ops
 */
static struct rte_flow *
tap_flow_create(struct rte_eth_dev *dev,
		const struct rte_flow_attr *attr,
		const struct rte_flow_item items[],
		const struct rte_flow_action actions[],
		struct rte_flow_error *error)
{
	struct pmd_internals *pmd = dev->data->dev_private;
	struct rte_flow *remote_flow = NULL;
	struct rte_flow *flow = NULL;
	struct nlmsg *msg = NULL;
	int err;

	if (!pmd->if_index) {
		rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE,
				   NULL,
				   "can't create rule, ifindex not found");
		goto fail;
	}
	/*
	 * No rules configured through standard rte_flow should be set on the
	 * priorities used by implicit rules.
	 */
	if ((attr->group == MAX_GROUP) &&
	    attr->priority > (MAX_PRIORITY - TAP_REMOTE_MAX_IDX)) {
		rte_flow_error_set(
			error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
			NULL, "priority value too big");
		goto fail;
	}
	flow = rte_malloc(__func__, sizeof(struct rte_flow), 0);
	if (!flow) {
		rte_flow_error_set(error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
				   NULL, "cannot allocate memory for rte_flow");
		goto fail;
	}
	msg = &flow->msg;
	tc_init_msg(msg, pmd->if_index, RTM_NEWTFILTER,
		    NLM_F_REQUEST | NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE);
	msg->t.tcm_info = TC_H_MAKE(0, htons(ETH_P_ALL));
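	/*
	 * The lower 16 bits of tcm_info (protocol) were just set to
	 * ETH_P_ALL; priv_flow_process() fills in the upper 16 bits
	 * (priority) below.
	 */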
	tap_flow_set_handle(flow);
	if (priv_flow_process(pmd, attr, items, actions, error, flow, 0))
		goto fail;
	err = nl_send(pmd->nlsk_fd, &msg->nh);
	if (err < 0) {
		rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE,
				   NULL, "couldn't send request to kernel");
		goto fail;
	}
	err = nl_recv_ack(pmd->nlsk_fd);
	if (err < 0) {
		RTE_LOG(ERR, PMD,
			"Kernel refused TC filter rule creation (%d): %s\n",
			errno, strerror(errno));
		rte_flow_error_set(error, EEXIST, RTE_FLOW_ERROR_TYPE_HANDLE,
				   NULL, "overlapping rules");
		goto fail;
	}
	LIST_INSERT_HEAD(&pmd->flows, flow, next);
	/*
	 * If a remote device is configured, a TC rule with identical items for
	 * matching must be set on that device, with a single action: redirect
	 * to the local pmd->if_index.
	 */
	if (pmd->remote_if_index) {
		remote_flow = rte_malloc(__func__, sizeof(struct rte_flow), 0);
		if (!remote_flow) {
			rte_flow_error_set(
				error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE, NULL,
				"cannot allocate memory for rte_flow");
			goto fail;
		}
		msg = &remote_flow->msg;
		/* set the rule if_index for the remote netdevice */
		tc_init_msg(
			msg, pmd->remote_if_index, RTM_NEWTFILTER,
			NLM_F_REQUEST | NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE);
		msg->t.tcm_info = TC_H_MAKE(0, htons(ETH_P_ALL));
		tap_flow_set_handle(remote_flow);
		if (priv_flow_process(pmd, attr, items, NULL,
				      error, remote_flow, TCA_EGRESS_REDIR)) {
			rte_flow_error_set(
				error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
				NULL, "rte flow rule validation failed");
			goto fail;
		}
		err = nl_send(pmd->nlsk_fd, &msg->nh);
		if (err < 0) {
			rte_flow_error_set(
				error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
				NULL, "failure sending nl request");
			goto fail;
		}
		err = nl_recv_ack(pmd->nlsk_fd);
		if (err < 0) {
			RTE_LOG(ERR, PMD,
				"Kernel refused TC filter rule creation (%d): %s\n",
				errno, strerror(errno));
			rte_flow_error_set(
				error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
				NULL, "overlapping rules");
			goto fail;
		}
		flow->remote_flow = remote_flow;
	}
	return flow;
fail:
	if (remote_flow)
		rte_free(remote_flow);
	if (flow)
		rte_free(flow);
	return NULL;
}

/**
 * Destroy a flow using a pointer to pmd_internals.
 *
 * @param[in, out] pmd
 *   Pointer to private structure.
 * @param[in] flow
 *   Pointer to the flow to destroy.
 * @param[in, out] error
 *   Pointer to the flow error handler.
 *
 * @return 0 if the flow could be destroyed, -1 otherwise.
 */
static int
tap_flow_destroy_pmd(struct pmd_internals *pmd,
		     struct rte_flow *flow,
		     struct rte_flow_error *error)
{
	struct rte_flow *remote_flow = flow->remote_flow;
	int ret = 0;

	LIST_REMOVE(flow, next);
	flow->msg.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
	flow->msg.nh.nlmsg_type = RTM_DELTFILTER;

	ret = nl_send(pmd->nlsk_fd, &flow->msg.nh);
	if (ret < 0) {
		rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE,
				   NULL, "couldn't send request to kernel");
		goto end;
	}
	ret = nl_recv_ack(pmd->nlsk_fd);
	/* If errno is ENOENT, the rule is already no longer in the kernel. */
	if (ret < 0 && errno == ENOENT)
		ret = 0;
	if (ret < 0) {
		RTE_LOG(ERR, PMD,
			"Kernel refused TC filter rule deletion (%d): %s\n",
			errno, strerror(errno));
		rte_flow_error_set(
			error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE, NULL,
			"couldn't receive kernel ack to our request");
		goto end;
	}
	if (remote_flow) {
		remote_flow->msg.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
		remote_flow->msg.nh.nlmsg_type = RTM_DELTFILTER;

		ret = nl_send(pmd->nlsk_fd, &remote_flow->msg.nh);
		if (ret < 0) {
			rte_flow_error_set(
				error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
				NULL, "failure sending nl request");
			goto end;
		}
		ret = nl_recv_ack(pmd->nlsk_fd);
		if (ret < 0 && errno == ENOENT)
			ret = 0;
		if (ret < 0) {
			RTE_LOG(ERR, PMD,
				"Kernel refused TC filter rule deletion (%d): %s\n",
				errno, strerror(errno));
			rte_flow_error_set(
				error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
				NULL, "failure trying to receive nl ack");
			goto end;
		}
	}
end:
	if (remote_flow)
		rte_free(remote_flow);
	rte_free(flow);
	return ret;
}

/**
 * Destroy a flow.
 *
 * @see rte_flow_destroy()
 * @see rte_flow_ops
 */
static int
tap_flow_destroy(struct rte_eth_dev *dev,
		 struct rte_flow *flow,
		 struct rte_flow_error *error)
{
	struct pmd_internals *pmd = dev->data->dev_private;

	return tap_flow_destroy_pmd(pmd, flow, error);
}

/**
 * Destroy all flows.
 *
 * @see rte_flow_flush()
 * @see rte_flow_ops
 */
int
tap_flow_flush(struct rte_eth_dev *dev, struct rte_flow_error *error)
{
	struct pmd_internals *pmd = dev->data->dev_private;
	struct rte_flow *flow;

	while (!LIST_EMPTY(&pmd->flows)) {
		flow = LIST_FIRST(&pmd->flows);
		if (tap_flow_destroy(dev, flow, error) < 0)
			return -1;
	}
	return 0;
}

/**
 * Add an implicit flow rule on the remote device to make sure traffic gets to
 * the tap netdevice from there.
 *
 * @param pmd
 *   Pointer to private structure.
 * @param[in] idx
 *   The idx in the implicit_rte_flows array specifying which rule to apply.
 *
 * @return -1 if the rule couldn't be applied, 0 otherwise.
 */
int tap_flow_implicit_create(struct pmd_internals *pmd,
			     enum implicit_rule_index idx)
{
	struct rte_flow_item *items = implicit_rte_flows[idx].items;
	struct rte_flow_attr *attr = &implicit_rte_flows[idx].attr;
	struct rte_flow_item_eth eth_local = { .type = 0 };
	uint16_t if_index = pmd->remote_if_index;
	struct rte_flow *remote_flow = NULL;
	struct nlmsg *msg = NULL;
	int err = 0;
	struct rte_flow_item items_local[2] = {
		[0] = {
			.type = items[0].type,
			.spec = &eth_local,
			.mask = items[0].mask,
		},
		[1] = {
			.type = items[1].type,
		}
	};

	remote_flow = rte_malloc(__func__, sizeof(struct rte_flow), 0);
	if (!remote_flow) {
		RTE_LOG(ERR, PMD, "Cannot allocate memory for rte_flow\n");
		goto fail;
	}
	msg = &remote_flow->msg;
	if (idx == TAP_REMOTE_TX) {
		if_index = pmd->if_index;
	} else if (idx == TAP_REMOTE_LOCAL_MAC) {
		/*
		 * eth addr couldn't be set in implicit_rte_flows[] as it is
		 * not known at compile time.
		 */
		memcpy(&eth_local.dst, &pmd->eth_addr, sizeof(pmd->eth_addr));
		items = items_local;
	}
	tc_init_msg(msg, if_index, RTM_NEWTFILTER,
		    NLM_F_REQUEST | NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE);
	msg->t.tcm_info = TC_H_MAKE(0, htons(ETH_P_ALL));
	tap_flow_set_handle(remote_flow);
	if (priv_flow_process(pmd, attr, items, NULL, NULL,
			      remote_flow, implicit_rte_flows[idx].mirred)) {
		RTE_LOG(ERR, PMD, "rte flow rule validation failed\n");
		goto fail;
	}
	err = nl_send(pmd->nlsk_fd, &msg->nh);
	if (err < 0) {
		RTE_LOG(ERR, PMD, "Failure sending nl request\n");
		goto fail;
	}
	err = nl_recv_ack(pmd->nlsk_fd);
	if (err < 0) {
		RTE_LOG(ERR, PMD,
			"Kernel refused TC filter rule creation (%d): %s\n",
			errno, strerror(errno));
		goto fail;
	}
	LIST_INSERT_HEAD(&pmd->implicit_flows, remote_flow, next);
	return 0;
fail:
	if (remote_flow)
		rte_free(remote_flow);
	return -1;
}

/**
 * Remove a specific implicit flow rule on the remote device.
 *
 * @param[in, out] pmd
 *   Pointer to private structure.
 * @param[in] idx
 *   The idx in the implicit_rte_flows array specifying which rule to remove.
 *
 * @return -1 if the rule couldn't be removed, 0 otherwise.
 */
int tap_flow_implicit_destroy(struct pmd_internals *pmd,
			      enum implicit_rule_index idx)
{
	struct rte_flow *remote_flow;
	int cur_prio = -1;
	int idx_prio = implicit_rte_flows[idx].attr.priority + PRIORITY_OFFSET;

	for (remote_flow = LIST_FIRST(&pmd->implicit_flows);
	     remote_flow;
	     remote_flow = LIST_NEXT(remote_flow, next)) {
		cur_prio = (remote_flow->msg.t.tcm_info >> 16) & PRIORITY_MASK;
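		/*
		 * cur_prio recovers the priority packed into the upper 16
		 * bits of tcm_info by priv_flow_process(); implicit rules
		 * are identified by that priority rather than by handle.
		 */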
		if (cur_prio != idx_prio)
			continue;
		return tap_flow_destroy_pmd(pmd, remote_flow, NULL);
	}
	return 0;
}

/**
 * Destroy all implicit flows.
 *
 * @see rte_flow_flush()
 */
int
tap_flow_implicit_flush(struct pmd_internals *pmd, struct rte_flow_error *error)
{
	struct rte_flow *remote_flow;

	while (!LIST_EMPTY(&pmd->implicit_flows)) {
		remote_flow = LIST_FIRST(&pmd->implicit_flows);
		if (tap_flow_destroy_pmd(pmd, remote_flow, error) < 0)
			return -1;
	}
	return 0;
}

/**
 * Manage filter operations.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param filter_type
 *   Filter type.
 * @param filter_op
 *   Operation to perform.
 * @param arg
 *   Pointer to operation-specific structure.
 *
 * @return
 *   0 on success, negative errno value on failure.
 */
int
tap_dev_filter_ctrl(struct rte_eth_dev *dev,
		    enum rte_filter_type filter_type,
		    enum rte_filter_op filter_op,
		    void *arg)
{
	struct pmd_internals *pmd = dev->data->dev_private;

	if (!pmd->flower_support)
		return -ENOTSUP;
	switch (filter_type) {
	case RTE_ETH_FILTER_GENERIC:
		if (filter_op != RTE_ETH_FILTER_GET)
			return -EINVAL;
		*(const void **)arg = &tap_flow_ops;
		return 0;
	default:
		RTE_LOG(ERR, PMD, "%p: filter type (%d) not supported\n",
			(void *)dev, filter_type);
	}
	return -EINVAL;
}