1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright 2017 6WIND S.A.
3  * Copyright 2017 Mellanox Technologies, Ltd
4  */
5 
6 #include <errno.h>
7 #include <string.h>
8 #include <unistd.h>
9 #include <sys/queue.h>
10 #include <sys/resource.h>
11 
12 #include <rte_byteorder.h>
13 #include <rte_jhash.h>
14 #include <rte_thash.h>
15 #include <rte_random.h>
16 #include <rte_malloc.h>
17 #include <rte_eth_tap.h>
18 #include <rte_uuid.h>
19 
20 #include <tap_flow.h>
21 #include <tap_tcmsgs.h>
22 #include <tap_rss.h>
23 
24 #ifdef HAVE_BPF_RSS
25 /* Workaround for warning in bpftool generated skeleton code */
26 #pragma GCC diagnostic push
27 #pragma GCC diagnostic ignored "-Wcast-qual"
28 #include "tap_rss.skel.h"
29 #pragma GCC diagnostic pop
30 #endif
31 
32 #define ISOLATE_HANDLE 1
33 #define REMOTE_PROMISCUOUS_HANDLE 2
34 
35 struct rte_flow {
36 	LIST_ENTRY(rte_flow) next; /* Pointer to the next rte_flow structure */
37 	struct rte_flow *remote_flow; /* associated remote flow */
38 	struct nlmsg msg;
39 };
40 
41 struct convert_data {
42 	uint16_t eth_type;
43 	uint16_t ip_proto;
44 	uint8_t vlan;
45 	struct rte_flow *flow;
46 };
47 
48 struct remote_rule {
49 	struct rte_flow_attr attr;
50 	struct rte_flow_item items[2];
51 	struct rte_flow_action actions[2];
52 	int mirred;
53 };
54 
55 struct action_data {
56 	char id[16];
57 
58 	union {
59 		struct tc_gact gact;
60 		struct tc_mirred mirred;
61 		struct skbedit {
62 			struct tc_skbedit skbedit;
63 			uint16_t queue;
64 			uint32_t mark;
65 		} skbedit;
66 #ifdef HAVE_BPF_RSS
67 		struct bpf {
68 			struct tc_act_bpf bpf;
69 			uint32_t map_key;
70 			int bpf_fd;
71 			const char *annotation;
72 		} bpf;
73 #endif
74 	};
75 };
76 
77 static int tap_flow_create_eth(const struct rte_flow_item *item, void *data);
78 static int tap_flow_create_vlan(const struct rte_flow_item *item, void *data);
79 static int tap_flow_create_ipv4(const struct rte_flow_item *item, void *data);
80 static int tap_flow_create_ipv6(const struct rte_flow_item *item, void *data);
81 static int tap_flow_create_udp(const struct rte_flow_item *item, void *data);
82 static int tap_flow_create_tcp(const struct rte_flow_item *item, void *data);
83 static int
84 tap_flow_validate(struct rte_eth_dev *dev,
85 		  const struct rte_flow_attr *attr,
86 		  const struct rte_flow_item items[],
87 		  const struct rte_flow_action actions[],
88 		  struct rte_flow_error *error);
89 
90 static struct rte_flow *
91 tap_flow_create(struct rte_eth_dev *dev,
92 		const struct rte_flow_attr *attr,
93 		const struct rte_flow_item items[],
94 		const struct rte_flow_action actions[],
95 		struct rte_flow_error *error);
96 
97 static void
98 tap_flow_free(struct pmd_internals *pmd,
99 	struct rte_flow *flow);
100 
101 static int
102 tap_flow_destroy(struct rte_eth_dev *dev,
103 		 struct rte_flow *flow,
104 		 struct rte_flow_error *error);
105 
106 static int
107 tap_flow_isolate(struct rte_eth_dev *dev,
108 		 int set,
109 		 struct rte_flow_error *error);
110 
111 #ifdef HAVE_BPF_RSS
112 static int rss_enable(struct pmd_internals *pmd, struct rte_flow_error *error);
113 static int rss_add_actions(struct rte_flow *flow, struct pmd_internals *pmd,
114 			const struct rte_flow_action_rss *rss,
115 			struct rte_flow_error *error);
116 #endif
117 
118 static const struct rte_flow_ops tap_flow_ops = {
119 	.validate = tap_flow_validate,
120 	.create = tap_flow_create,
121 	.destroy = tap_flow_destroy,
122 	.flush = tap_flow_flush,
123 	.isolate = tap_flow_isolate,
124 };
125 
126 /* Static initializer for items. */
127 #define ITEMS(...) \
128 	(const enum rte_flow_item_type []){ \
129 		__VA_ARGS__, RTE_FLOW_ITEM_TYPE_END, \
130 	}
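/*
 * Illustrative expansion (added note, not in the original source):
 * ITEMS(RTE_FLOW_ITEM_TYPE_VLAN) produces the anonymous array
 *   (const enum rte_flow_item_type []){
 *           RTE_FLOW_ITEM_TYPE_VLAN, RTE_FLOW_ITEM_TYPE_END,
 *   }
 * so each list of follow-up items is implicitly END-terminated.
 */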
131 
132 /* Structure to generate a simple graph of layers supported by the NIC. */
133 struct tap_flow_items {
134 	/* Bit-mask corresponding to what is supported for this item. */
135 	const void *mask;
136 	const unsigned int mask_sz; /* Bit-mask size in bytes. */
137 	/*
138 	 * Bit-mask corresponding to the default mask, if none is provided
139 	 * along with the item.
140 	 */
141 	const void *default_mask;
142 	/**
143 	 * Conversion function from rte_flow to netlink attributes.
144 	 *
145 	 * @param item
146 	 *   rte_flow item to convert.
147 	 * @param data
148 	 *   Internal structure to store the conversion.
149 	 *
150 	 * @return
151 	 *   0 on success, negative value otherwise.
152 	 */
153 	int (*convert)(const struct rte_flow_item *item, void *data);
154 	/** List of possible following items.  */
155 	const enum rte_flow_item_type *const items;
156 };
157 
158 /* Graph of supported items and associated actions. */
159 static const struct tap_flow_items tap_flow_items[] = {
160 	[RTE_FLOW_ITEM_TYPE_END] = {
161 		.items = ITEMS(RTE_FLOW_ITEM_TYPE_ETH),
162 	},
163 	[RTE_FLOW_ITEM_TYPE_ETH] = {
164 		.items = ITEMS(
165 			RTE_FLOW_ITEM_TYPE_VLAN,
166 			RTE_FLOW_ITEM_TYPE_IPV4,
167 			RTE_FLOW_ITEM_TYPE_IPV6),
168 		.mask = &(const struct rte_flow_item_eth){
169 			.hdr.dst_addr.addr_bytes = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
170 			.hdr.src_addr.addr_bytes = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
171 			.hdr.ether_type = -1,
172 		},
173 		.mask_sz = sizeof(struct rte_flow_item_eth),
174 		.default_mask = &rte_flow_item_eth_mask,
175 		.convert = tap_flow_create_eth,
176 	},
177 	[RTE_FLOW_ITEM_TYPE_VLAN] = {
178 		.items = ITEMS(RTE_FLOW_ITEM_TYPE_IPV4,
179 			       RTE_FLOW_ITEM_TYPE_IPV6),
180 		.mask = &(const struct rte_flow_item_vlan){
181 			/* DEI matching is not supported */
182 #if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN
183 			.hdr.vlan_tci = 0xffef,
184 #else
185 			.hdr.vlan_tci = 0xefff,
186 #endif
187 			.hdr.eth_proto = -1,
188 		},
189 		.mask_sz = sizeof(struct rte_flow_item_vlan),
190 		.default_mask = &rte_flow_item_vlan_mask,
191 		.convert = tap_flow_create_vlan,
192 	},
193 	[RTE_FLOW_ITEM_TYPE_IPV4] = {
194 		.items = ITEMS(RTE_FLOW_ITEM_TYPE_UDP,
195 			       RTE_FLOW_ITEM_TYPE_TCP),
196 		.mask = &(const struct rte_flow_item_ipv4){
197 			.hdr = {
198 				.src_addr = -1,
199 				.dst_addr = -1,
200 				.next_proto_id = -1,
201 			},
202 		},
203 		.mask_sz = sizeof(struct rte_flow_item_ipv4),
204 		.default_mask = &rte_flow_item_ipv4_mask,
205 		.convert = tap_flow_create_ipv4,
206 	},
207 	[RTE_FLOW_ITEM_TYPE_IPV6] = {
208 		.items = ITEMS(RTE_FLOW_ITEM_TYPE_UDP,
209 			       RTE_FLOW_ITEM_TYPE_TCP),
210 		.mask = &(const struct rte_flow_item_ipv6){
211 			.hdr = {
212 				.src_addr = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
213 					      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
214 				.dst_addr = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
215 					      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
216 				.proto = -1,
217 			},
218 		},
219 		.mask_sz = sizeof(struct rte_flow_item_ipv6),
220 		.default_mask = &rte_flow_item_ipv6_mask,
221 		.convert = tap_flow_create_ipv6,
222 	},
223 	[RTE_FLOW_ITEM_TYPE_UDP] = {
224 		.mask = &(const struct rte_flow_item_udp){
225 			.hdr = {
226 				.src_port = -1,
227 				.dst_port = -1,
228 			},
229 		},
230 		.mask_sz = sizeof(struct rte_flow_item_udp),
231 		.default_mask = &rte_flow_item_udp_mask,
232 		.convert = tap_flow_create_udp,
233 	},
234 	[RTE_FLOW_ITEM_TYPE_TCP] = {
235 		.mask = &(const struct rte_flow_item_tcp){
236 			.hdr = {
237 				.src_port = -1,
238 				.dst_port = -1,
239 			},
240 		},
241 		.mask_sz = sizeof(struct rte_flow_item_tcp),
242 		.default_mask = &rte_flow_item_tcp_mask,
243 		.convert = tap_flow_create_tcp,
244 	},
245 };
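/*
 * Example of walking the graph above (illustrative, not from the original
 * source): the pattern eth / vlan / ipv4 / tcp is accepted because each item
 * appears in the .items list of its predecessor, whereas eth / udp is
 * rejected since UDP is only reachable from IPV4 or IPV6.
 */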
246 
247 /*
248  *                TC rules, by growing priority
249  *
250  *        Remote netdevice                  Tap netdevice
251  * +-------------+-------------+  +-------------+-------------+
252  * |   Ingress   |   Egress    |  |   Ingress   |   Egress    |
253  * |-------------|-------------|  |-------------|-------------|
254  * |             |  \       /  |  |             |  REMOTE TX  | prio 1
255  * |             |   \     /   |  |             |   \     /   | prio 2
256  * |  EXPLICIT   |    \   /    |  |  EXPLICIT   |    \   /    |   .
257  * |             |     \ /     |  |             |     \ /     |   .
258  * |    RULES    |      X      |  |    RULES    |      X      |   .
259  * |      .      |     / \     |  |      .      |     / \     |   .
260  * |      .      |    /   \    |  |      .      |    /   \    |   .
261  * |      .      |   /     \   |  |      .      |   /     \   |   .
262  * |      .      |  /       \  |  |      .      |  /       \  |   .
263  *
264  *      ....           ....           ....           ....
265  *
266  * |      .      |  \       /  |  |      .      |  \       /  |   .
267  * |      .      |   \     /   |  |      .      |   \     /   |   .
268  * |             |    \   /    |  |             |    \   /    |
269  * |  LOCAL_MAC  |     \ /     |  |    \   /    |     \ /     | last prio - 5
270  * |   PROMISC   |      X      |  |     \ /     |      X      | last prio - 4
271  * |   ALLMULTI  |     / \     |  |      X      |     / \     | last prio - 3
272  * |  BROADCAST  |    /   \    |  |     / \     |    /   \    | last prio - 2
273  * | BROADCASTV6 |   /     \   |  |    /   \    |   /     \   | last prio - 1
274  * |     xx      |  /       \  |  |   ISOLATE   |  /       \  | last prio
275  * +-------------+-------------+  +-------------+-------------+
276  *
277  * The implicit flow rules are stored in a list, with the last two mandatorily
278  * being the ISOLATE and REMOTE_TX rules, e.g.:
279  *
280  * LOCAL_MAC -> BROADCAST -> BROADCASTV6 -> REMOTE_TX -> ISOLATE -> NULL
281  *
282  * That enables tap_flow_isolate() to remove implicit rules by popping the list
283  * head and removing them as long as they apply to the remote netdevice. The
284  * implicit rule for TX redirection is not removed, as isolate concerns only
285  * incoming traffic.
286  */
287 
288 static struct remote_rule implicit_rte_flows[TAP_REMOTE_MAX_IDX] = {
289 	[TAP_REMOTE_LOCAL_MAC] = {
290 		.attr = {
291 			.group = MAX_GROUP,
292 			.priority = PRIORITY_MASK - TAP_REMOTE_LOCAL_MAC,
293 			.ingress = 1,
294 		},
295 		.items[0] = {
296 			.type = RTE_FLOW_ITEM_TYPE_ETH,
297 			.mask =  &(const struct rte_flow_item_eth){
298 				.hdr.dst_addr.addr_bytes = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
299 			},
300 		},
301 		.items[1] = {
302 			.type = RTE_FLOW_ITEM_TYPE_END,
303 		},
304 		.mirred = TCA_EGRESS_REDIR,
305 	},
306 	[TAP_REMOTE_BROADCAST] = {
307 		.attr = {
308 			.group = MAX_GROUP,
309 			.priority = PRIORITY_MASK - TAP_REMOTE_BROADCAST,
310 			.ingress = 1,
311 		},
312 		.items[0] = {
313 			.type = RTE_FLOW_ITEM_TYPE_ETH,
314 			.mask =  &(const struct rte_flow_item_eth){
315 				.hdr.dst_addr.addr_bytes = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
316 			},
317 			.spec = &(const struct rte_flow_item_eth){
318 				.hdr.dst_addr.addr_bytes = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
319 			},
320 		},
321 		.items[1] = {
322 			.type = RTE_FLOW_ITEM_TYPE_END,
323 		},
324 		.mirred = TCA_EGRESS_MIRROR,
325 	},
326 	[TAP_REMOTE_BROADCASTV6] = {
327 		.attr = {
328 			.group = MAX_GROUP,
329 			.priority = PRIORITY_MASK - TAP_REMOTE_BROADCASTV6,
330 			.ingress = 1,
331 		},
332 		.items[0] = {
333 			.type = RTE_FLOW_ITEM_TYPE_ETH,
334 			.mask =  &(const struct rte_flow_item_eth){
335 				.hdr.dst_addr.addr_bytes = { 0x33, 0x33, 0x00, 0x00, 0x00, 0x00 },
336 			},
337 			.spec = &(const struct rte_flow_item_eth){
338 				.hdr.dst_addr.addr_bytes = { 0x33, 0x33, 0x00, 0x00, 0x00, 0x00 },
339 			},
340 		},
341 		.items[1] = {
342 			.type = RTE_FLOW_ITEM_TYPE_END,
343 		},
344 		.mirred = TCA_EGRESS_MIRROR,
345 	},
346 	[TAP_REMOTE_PROMISC] = {
347 		.attr = {
348 			.group = MAX_GROUP,
349 			.priority = PRIORITY_MASK - TAP_REMOTE_PROMISC,
350 			.ingress = 1,
351 		},
352 		.items[0] = {
353 			.type = RTE_FLOW_ITEM_TYPE_VOID,
354 		},
355 		.items[1] = {
356 			.type = RTE_FLOW_ITEM_TYPE_END,
357 		},
358 		.mirred = TCA_EGRESS_MIRROR,
359 	},
360 	[TAP_REMOTE_ALLMULTI] = {
361 		.attr = {
362 			.group = MAX_GROUP,
363 			.priority = PRIORITY_MASK - TAP_REMOTE_ALLMULTI,
364 			.ingress = 1,
365 		},
366 		.items[0] = {
367 			.type = RTE_FLOW_ITEM_TYPE_ETH,
368 			.mask =  &(const struct rte_flow_item_eth){
369 				.hdr.dst_addr.addr_bytes = { 0x01, 0x00, 0x00, 0x00, 0x00, 0x00 },
370 			},
371 			.spec = &(const struct rte_flow_item_eth){
372 				.hdr.dst_addr.addr_bytes = { 0x01, 0x00, 0x00, 0x00, 0x00, 0x00 },
373 			},
374 		},
375 		.items[1] = {
376 			.type = RTE_FLOW_ITEM_TYPE_END,
377 		},
378 		.mirred = TCA_EGRESS_MIRROR,
379 	},
380 	[TAP_REMOTE_TX] = {
381 		.attr = {
382 			.group = 0,
383 			.priority = TAP_REMOTE_TX,
384 			.egress = 1,
385 		},
386 		.items[0] = {
387 			.type = RTE_FLOW_ITEM_TYPE_VOID,
388 		},
389 		.items[1] = {
390 			.type = RTE_FLOW_ITEM_TYPE_END,
391 		},
392 		.mirred = TCA_EGRESS_MIRROR,
393 	},
394 	[TAP_ISOLATE] = {
395 		.attr = {
396 			.group = MAX_GROUP,
397 			.priority = PRIORITY_MASK - TAP_ISOLATE,
398 			.ingress = 1,
399 		},
400 		.items[0] = {
401 			.type = RTE_FLOW_ITEM_TYPE_VOID,
402 		},
403 		.items[1] = {
404 			.type = RTE_FLOW_ITEM_TYPE_END,
405 		},
406 	},
407 };
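/*
 * Note added for clarity: the LOCAL_MAC rule above deliberately has no .spec;
 * the destination MAC address is copied from pmd->eth_addr at runtime in
 * tap_flow_implicit_create(), as it is not known at compile time.
 */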
408 
409 /**
410  * Make as many checks as possible on an Ethernet item, and if a flow is
411  * provided, fill it appropriately with Ethernet info.
412  *
413  * @param[in] item
414  *   Item specification.
415  * @param[in, out] data
416  *   Additional data structure to tell next layers we've been here.
417  *
418  * @return
419  *   0 if checks are alright, -1 otherwise.
420  */
421 static int
422 tap_flow_create_eth(const struct rte_flow_item *item, void *data)
423 {
424 	struct convert_data *info = (struct convert_data *)data;
425 	const struct rte_flow_item_eth *spec = item->spec;
426 	const struct rte_flow_item_eth *mask = item->mask;
427 	struct rte_flow *flow = info->flow;
428 	struct nlmsg *msg;
429 
430 	/* use default mask if none provided */
431 	if (!mask)
432 		mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_ETH].default_mask;
433 	/* TC does not support eth_type masking. Only accept if exact match. */
434 	if (mask->hdr.ether_type && mask->hdr.ether_type != 0xffff)
435 		return -1;
436 	if (!spec)
437 		return 0;
438 	/* store eth_type for consistency if ipv4/6 pattern item comes next */
439 	if (spec->hdr.ether_type & mask->hdr.ether_type)
440 		info->eth_type = spec->hdr.ether_type;
441 	if (!flow)
442 		return 0;
443 	msg = &flow->msg;
444 	if (!rte_is_zero_ether_addr(&mask->hdr.dst_addr)) {
445 		tap_nlattr_add(&msg->nh, TCA_FLOWER_KEY_ETH_DST,
446 			RTE_ETHER_ADDR_LEN,
447 			   &spec->hdr.dst_addr.addr_bytes);
448 		tap_nlattr_add(&msg->nh,
449 			   TCA_FLOWER_KEY_ETH_DST_MASK, RTE_ETHER_ADDR_LEN,
450 			   &mask->hdr.dst_addr.addr_bytes);
451 	}
452 	if (!rte_is_zero_ether_addr(&mask->hdr.src_addr)) {
453 		tap_nlattr_add(&msg->nh, TCA_FLOWER_KEY_ETH_SRC,
454 			RTE_ETHER_ADDR_LEN,
455 			&spec->hdr.src_addr.addr_bytes);
456 		tap_nlattr_add(&msg->nh,
457 			   TCA_FLOWER_KEY_ETH_SRC_MASK, RTE_ETHER_ADDR_LEN,
458 			   &mask->hdr.src_addr.addr_bytes);
459 	}
460 	return 0;
461 }
462 
463 /**
464  * Make as many checks as possible on a VLAN item, and if a flow is provided,
465  * fill it appropriately with VLAN info.
466  *
467  * @param[in] item
468  *   Item specification.
469  * @param[in, out] data
470  *   Additional data structure to tell next layers we've been here.
471  *
472  * @return
473  *   0 if checks are alright, -1 otherwise.
474  */
475 static int
476 tap_flow_create_vlan(const struct rte_flow_item *item, void *data)
477 {
478 	struct convert_data *info = (struct convert_data *)data;
479 	const struct rte_flow_item_vlan *spec = item->spec;
480 	const struct rte_flow_item_vlan *mask = item->mask;
481 	struct rte_flow *flow = info->flow;
482 	struct nlmsg *msg;
483 
484 	/* use default mask if none provided */
485 	if (!mask)
486 		mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_VLAN].default_mask;
487 	/* Outer TPID cannot be matched. */
488 	if (info->eth_type)
489 		return -1;
490 	/* Double-tagging not supported. */
491 	if (info->vlan)
492 		return -1;
493 	info->vlan = 1;
494 	if (mask->hdr.eth_proto) {
495 		/* TC does not support partial eth_type masking */
496 		if (mask->hdr.eth_proto != RTE_BE16(0xffff))
497 			return -1;
498 		info->eth_type = spec->hdr.eth_proto;
499 	}
500 	if (!flow)
501 		return 0;
502 	msg = &flow->msg;
503 	msg->t.tcm_info = TC_H_MAKE(msg->t.tcm_info, htons(ETH_P_8021Q));
504 #define VLAN_PRIO(tci) ((tci) >> 13)
505 #define VLAN_ID(tci) ((tci) & 0xfff)
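	/*
	 * Worked example (illustrative): a TCI of 0x6123 yields
	 * VLAN_PRIO(0x6123) = 0x6123 >> 13 = 3 (the PCP bits) and
	 * VLAN_ID(0x6123) = 0x6123 & 0xfff = 0x123 (VLAN ID 291).
	 */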
506 	if (!spec)
507 		return 0;
508 	if (spec->hdr.vlan_tci) {
509 		uint16_t tci = ntohs(spec->hdr.vlan_tci) & mask->hdr.vlan_tci;
510 		uint16_t prio = VLAN_PRIO(tci);
511 		uint8_t vid = VLAN_ID(tci);
512 
513 		if (prio)
514 			tap_nlattr_add8(&msg->nh,
515 					TCA_FLOWER_KEY_VLAN_PRIO, prio);
516 		if (vid)
517 			tap_nlattr_add16(&msg->nh,
518 					 TCA_FLOWER_KEY_VLAN_ID, vid);
519 	}
520 	return 0;
521 }
522 
523 /**
524  * Make as many checks as possible on an IPv4 item, and if a flow is provided,
525  * fill it appropriately with IPv4 info.
526  *
527  * @param[in] item
528  *   Item specification.
529  * @param[in, out] data
530  *   Additional data structure to tell next layers we've been here.
531  *
532  * @return
533  *   0 if checks are alright, -1 otherwise.
534  */
535 static int
536 tap_flow_create_ipv4(const struct rte_flow_item *item, void *data)
537 {
538 	struct convert_data *info = (struct convert_data *)data;
539 	const struct rte_flow_item_ipv4 *spec = item->spec;
540 	const struct rte_flow_item_ipv4 *mask = item->mask;
541 	struct rte_flow *flow = info->flow;
542 	struct nlmsg *msg;
543 
544 	/* use default mask if none provided */
545 	if (!mask)
546 		mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_IPV4].default_mask;
547 	/* check that previous eth type is compatible with ipv4 */
548 	if (info->eth_type && info->eth_type != htons(ETH_P_IP))
549 		return -1;
550 	/* store ip_proto for consistency if udp/tcp pattern item comes next */
551 	if (spec)
552 		info->ip_proto = spec->hdr.next_proto_id;
553 	if (!flow)
554 		return 0;
555 	msg = &flow->msg;
556 	if (!info->eth_type)
557 		info->eth_type = htons(ETH_P_IP);
558 	if (!spec)
559 		return 0;
560 	if (mask->hdr.dst_addr) {
561 		tap_nlattr_add32(&msg->nh, TCA_FLOWER_KEY_IPV4_DST,
562 			     spec->hdr.dst_addr);
563 		tap_nlattr_add32(&msg->nh, TCA_FLOWER_KEY_IPV4_DST_MASK,
564 			     mask->hdr.dst_addr);
565 	}
566 	if (mask->hdr.src_addr) {
567 		tap_nlattr_add32(&msg->nh, TCA_FLOWER_KEY_IPV4_SRC,
568 			     spec->hdr.src_addr);
569 		tap_nlattr_add32(&msg->nh, TCA_FLOWER_KEY_IPV4_SRC_MASK,
570 			     mask->hdr.src_addr);
571 	}
572 	if (spec->hdr.next_proto_id)
573 		tap_nlattr_add8(&msg->nh, TCA_FLOWER_KEY_IP_PROTO,
574 			    spec->hdr.next_proto_id);
575 	return 0;
576 }
577 
578 /**
579  * Make as many checks as possible on an IPv6 item, and if a flow is provided,
580  * fill it appropriately with IPv6 info.
581  *
582  * @param[in] item
583  *   Item specification.
584  * @param[in, out] data
585  *   Additional data structure to tell next layers we've been here.
586  *
587  * @return
588  *   0 if checks are alright, -1 otherwise.
589  */
590 static int
591 tap_flow_create_ipv6(const struct rte_flow_item *item, void *data)
592 {
593 	struct convert_data *info = (struct convert_data *)data;
594 	const struct rte_flow_item_ipv6 *spec = item->spec;
595 	const struct rte_flow_item_ipv6 *mask = item->mask;
596 	struct rte_flow *flow = info->flow;
597 	uint8_t empty_addr[16] = { 0 };
598 	struct nlmsg *msg;
599 
600 	/* use default mask if none provided */
601 	if (!mask)
602 		mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_IPV6].default_mask;
603 	/* check that previous eth type is compatible with ipv6 */
604 	if (info->eth_type && info->eth_type != htons(ETH_P_IPV6))
605 		return -1;
606 	/* store ip_proto for consistency if udp/tcp pattern item comes next */
607 	if (spec)
608 		info->ip_proto = spec->hdr.proto;
609 	if (!flow)
610 		return 0;
611 	msg = &flow->msg;
612 	if (!info->eth_type)
613 		info->eth_type = htons(ETH_P_IPV6);
614 	if (!spec)
615 		return 0;
616 	if (memcmp(mask->hdr.dst_addr, empty_addr, 16)) {
617 		tap_nlattr_add(&msg->nh, TCA_FLOWER_KEY_IPV6_DST,
618 			   sizeof(spec->hdr.dst_addr), &spec->hdr.dst_addr);
619 		tap_nlattr_add(&msg->nh, TCA_FLOWER_KEY_IPV6_DST_MASK,
620 			   sizeof(mask->hdr.dst_addr), &mask->hdr.dst_addr);
621 	}
622 	if (memcmp(mask->hdr.src_addr, empty_addr, 16)) {
623 		tap_nlattr_add(&msg->nh, TCA_FLOWER_KEY_IPV6_SRC,
624 			   sizeof(spec->hdr.src_addr), &spec->hdr.src_addr);
625 		tap_nlattr_add(&msg->nh, TCA_FLOWER_KEY_IPV6_SRC_MASK,
626 			   sizeof(mask->hdr.src_addr), &mask->hdr.src_addr);
627 	}
628 	if (spec->hdr.proto)
629 		tap_nlattr_add8(&msg->nh,
630 				TCA_FLOWER_KEY_IP_PROTO, spec->hdr.proto);
631 	return 0;
632 }
633 
634 /**
635  * Make as many checks as possible on a UDP item, and if a flow is provided,
636  * fill it appropriately with UDP info.
637  *
638  * @param[in] item
639  *   Item specification.
640  * @param[in, out] data
641  *   Additional data structure to tell next layers we've been here.
642  *
643  * @return
644  *   0 if checks are alright, -1 otherwise.
645  */
646 static int
647 tap_flow_create_udp(const struct rte_flow_item *item, void *data)
648 {
649 	struct convert_data *info = (struct convert_data *)data;
650 	const struct rte_flow_item_udp *spec = item->spec;
651 	const struct rte_flow_item_udp *mask = item->mask;
652 	struct rte_flow *flow = info->flow;
653 	struct nlmsg *msg;
654 
655 	/* use default mask if none provided */
656 	if (!mask)
657 		mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_UDP].default_mask;
658 	/* check that previous ip_proto is compatible with udp */
659 	if (info->ip_proto && info->ip_proto != IPPROTO_UDP)
660 		return -1;
661 	/* TC does not support UDP port masking. Only accept if exact match. */
662 	if ((mask->hdr.src_port && mask->hdr.src_port != 0xffff) ||
663 	    (mask->hdr.dst_port && mask->hdr.dst_port != 0xffff))
664 		return -1;
665 	if (!flow)
666 		return 0;
667 	msg = &flow->msg;
668 	tap_nlattr_add8(&msg->nh, TCA_FLOWER_KEY_IP_PROTO, IPPROTO_UDP);
669 	if (!spec)
670 		return 0;
671 	if (mask->hdr.dst_port)
672 		tap_nlattr_add16(&msg->nh, TCA_FLOWER_KEY_UDP_DST,
673 			     spec->hdr.dst_port);
674 	if (mask->hdr.src_port)
675 		tap_nlattr_add16(&msg->nh, TCA_FLOWER_KEY_UDP_SRC,
676 			     spec->hdr.src_port);
677 	return 0;
678 }
679 
680 /**
681  * Make as many checks as possible on a TCP item, and if a flow is provided,
682  * fill it appropriately with TCP info.
683  *
684  * @param[in] item
685  *   Item specification.
686  * @param[in, out] data
687  *   Additional data structure to tell next layers we've been here.
688  *
689  * @return
690  *   0 if checks are alright, -1 otherwise.
691  */
692 static int
693 tap_flow_create_tcp(const struct rte_flow_item *item, void *data)
694 {
695 	struct convert_data *info = (struct convert_data *)data;
696 	const struct rte_flow_item_tcp *spec = item->spec;
697 	const struct rte_flow_item_tcp *mask = item->mask;
698 	struct rte_flow *flow = info->flow;
699 	struct nlmsg *msg;
700 
701 	/* use default mask if none provided */
702 	if (!mask)
703 		mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_TCP].default_mask;
704 	/* check that previous ip_proto is compatible with tcp */
705 	if (info->ip_proto && info->ip_proto != IPPROTO_TCP)
706 		return -1;
707 	/* TC does not support TCP port masking. Only accept if exact match. */
708 	if ((mask->hdr.src_port && mask->hdr.src_port != 0xffff) ||
709 	    (mask->hdr.dst_port && mask->hdr.dst_port != 0xffff))
710 		return -1;
711 	if (!flow)
712 		return 0;
713 	msg = &flow->msg;
714 	tap_nlattr_add8(&msg->nh, TCA_FLOWER_KEY_IP_PROTO, IPPROTO_TCP);
715 	if (!spec)
716 		return 0;
717 	if (mask->hdr.dst_port)
718 		tap_nlattr_add16(&msg->nh, TCA_FLOWER_KEY_TCP_DST,
719 			     spec->hdr.dst_port);
720 	if (mask->hdr.src_port)
721 		tap_nlattr_add16(&msg->nh, TCA_FLOWER_KEY_TCP_SRC,
722 			     spec->hdr.src_port);
723 	return 0;
724 }
725 
726 /**
727  * Check support for a given item.
728  *
729  * @param[in] item
730  *   Item specification.
731  * @param size
732  *   Bit-mask size in bytes.
733  * @param[in] supported_mask
734  *   Bit-mask covering supported fields to compare with spec, last and mask in
735  *   \item.
736  * @param[in] default_mask
737  *   Bit-mask default mask if none is provided in \item.
738  *
739  * @return
740  *   0 on success, nonzero otherwise.
741  */
742 static int
743 tap_flow_item_validate(const struct rte_flow_item *item,
744 		       unsigned int size,
745 		       const uint8_t *supported_mask,
746 		       const uint8_t *default_mask)
747 {
748 	int ret = 0;
749 
750 	/* An empty layer is allowed, as long as all fields are NULL */
751 	if (!item->spec && (item->mask || item->last))
752 		return -1;
753 	/* Is the item spec compatible with what the NIC supports? */
754 	if (item->spec && !item->mask) {
755 		unsigned int i;
756 		const uint8_t *spec = item->spec;
757 
758 		for (i = 0; i < size; ++i)
759 			if ((spec[i] | supported_mask[i]) != supported_mask[i])
760 				return -1;
761 		/* Is the default mask compatible with what the NIC supports? */
762 		for (i = 0; i < size; i++)
763 			if ((default_mask[i] | supported_mask[i]) !=
764 			    supported_mask[i])
765 				return -1;
766 	}
767 	/* Is the item last compatible with what the NIC supports? */
768 	if (item->last && !item->mask) {
769 		unsigned int i;
770 		const uint8_t *spec = item->last;
771 
772 		for (i = 0; i < size; ++i)
773 			if ((spec[i] | supported_mask[i]) != supported_mask[i])
774 				return -1;
775 	}
776 	/* Is the item mask compatible with what the NIC supports? */
777 	if (item->mask) {
778 		unsigned int i;
779 		const uint8_t *spec = item->mask;
780 
781 		for (i = 0; i < size; ++i)
782 			if ((spec[i] | supported_mask[i]) != supported_mask[i])
783 				return -1;
784 	}
785 	/**
786 	 * Once masked, are item spec and item last equal?
787 	 * TC does not support ranges, so anything else is invalid.
788 	 */
789 	if (item->spec && item->last) {
790 		uint8_t spec[size];
791 		uint8_t last[size];
792 		const uint8_t *apply = default_mask;
793 		unsigned int i;
794 
795 		if (item->mask)
796 			apply = item->mask;
797 		for (i = 0; i < size; ++i) {
798 			spec[i] = ((const uint8_t *)item->spec)[i] & apply[i];
799 			last[i] = ((const uint8_t *)item->last)[i] & apply[i];
800 		}
801 		ret = memcmp(spec, last, size);
802 	}
803 	return ret;
804 }
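/*
 * Illustrative example (not from the original source): an IPv4 item with
 * spec.hdr.dst_addr = 10.0.0.1 and last.hdr.dst_addr = 10.0.0.255 fails the
 * check above, because after applying the full /32 mask the masked spec and
 * masked last differ, and TC flower cannot express such a range.
 */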
805 
806 /**
807  * Configure the kernel with a TC action and its parameters
808  * Handled actions: "gact", "mirred", "skbedit", "bpf"
809  *
810  * @param[in] flow
811  *   Pointer to rte flow containing the netlink message
812  *
813  * @param[in, out] act_index
814  *   Pointer to action sequence number in the TC command
815  *
816  * @param[in] adata
817  *  Pointer to struct holding the action parameters
818  *
819  * @return
820  *   -1 on failure, 0 on success
821  */
822 static int
823 add_action(struct rte_flow *flow, size_t *act_index, struct action_data *adata)
824 {
825 	struct nlmsg *msg = &flow->msg;
826 
827 	if (tap_nlattr_nested_start(msg, (*act_index)++) < 0)
828 		return -1;
829 
830 	tap_nlattr_add(&msg->nh, TCA_ACT_KIND,
831 				strlen(adata->id) + 1, adata->id);
832 	if (tap_nlattr_nested_start(msg, TCA_ACT_OPTIONS) < 0)
833 		return -1;
834 	if (strcmp("gact", adata->id) == 0) {
835 		tap_nlattr_add(&msg->nh, TCA_GACT_PARMS, sizeof(adata->gact),
836 			   &adata->gact);
837 	} else if (strcmp("mirred", adata->id) == 0) {
838 		if (adata->mirred.eaction == TCA_EGRESS_MIRROR)
839 			adata->mirred.action = TC_ACT_PIPE;
840 		else /* REDIRECT */
841 			adata->mirred.action = TC_ACT_STOLEN;
842 		tap_nlattr_add(&msg->nh, TCA_MIRRED_PARMS,
843 			   sizeof(adata->mirred),
844 			   &adata->mirred);
845 	} else if (strcmp("skbedit", adata->id) == 0) {
846 		tap_nlattr_add(&msg->nh, TCA_SKBEDIT_PARMS,
847 			   sizeof(adata->skbedit.skbedit), &adata->skbedit.skbedit);
848 		if (adata->skbedit.mark)
849 			tap_nlattr_add32(&msg->nh, TCA_SKBEDIT_MARK, adata->skbedit.mark);
850 		else
851 			tap_nlattr_add16(&msg->nh, TCA_SKBEDIT_QUEUE_MAPPING, adata->skbedit.queue);
852 	} else if (strcmp("bpf", adata->id) == 0) {
853 #ifdef HAVE_BPF_RSS
854 		tap_nlattr_add32(&msg->nh, TCA_ACT_BPF_FD, adata->bpf.bpf_fd);
855 		tap_nlattr_add(&msg->nh, TCA_ACT_BPF_NAME,
856 			   strlen(adata->bpf.annotation) + 1,
857 			   adata->bpf.annotation);
858 		tap_nlattr_add(&msg->nh, TCA_ACT_BPF_PARMS,
859 			   sizeof(adata->bpf.bpf),
860 			   &adata->bpf.bpf);
861 #else
862 		TAP_LOG(ERR, "Internal error: bpf requested but not supported");
863 		return -1;
864 #endif
865 	} else {
866 		TAP_LOG(ERR, "Internal error: unknown action: %s", adata->id);
867 		return -1;
868 	}
869 	tap_nlattr_nested_finish(msg); /* nested TCA_ACT_OPTIONS */
870 	tap_nlattr_nested_finish(msg); /* nested act_index */
871 	return 0;
872 }
873 
874 /**
875  * Helper function to send a series of TC actions to the kernel
876  *
877  * @param[in] flow
878  *   Pointer to rte flow containing the netlink message
879  *
880  * @param[in] nb_actions
881  *   Number of actions in an array of action structs
882  *
883  * @param[in] data
884  *   Pointer to an array of action structs
885  *
886  * @param[in] classifier_action
887  *   The classifier on behalf of which the actions are configured
888  *
889  * @return
890  *   -1 on failure, 0 on success
891  */
892 static int
893 add_actions(struct rte_flow *flow, int nb_actions, struct action_data *data,
894 	    int classifier_action)
895 {
896 	struct nlmsg *msg = &flow->msg;
897 	size_t act_index = 1;
898 	int i;
899 
900 	if (tap_nlattr_nested_start(msg, classifier_action) < 0)
901 		return -1;
902 	for (i = 0; i < nb_actions; i++)
903 		if (add_action(flow, &act_index, data + i) < 0)
904 			return -1;
905 	tap_nlattr_nested_finish(msg); /* nested TCA_FLOWER_ACT */
906 	return 0;
907 }
908 
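/*
 * Illustrative translation (added note, not from the original source): a flow
 * with pattern [eth / ipv4 / udp dst_port 53] and action [queue index 1] is
 * turned by priv_flow_process() below into a single flower filter carrying
 * TCA_FLOWER_KEY_ETH_TYPE, TCA_FLOWER_KEY_IP_PROTO and TCA_FLOWER_KEY_UDP_DST
 * attributes, plus one skbedit action setting the queue mapping to 1.
 */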
909 /**
910  * Validate a flow supported by TC.
911  * If flow param is not NULL, then also fill the netlink message inside.
912  *
913  * @param pmd
914  *   Pointer to private structure.
915  * @param[in] attr
916  *   Flow rule attributes.
917  * @param[in] pattern
918  *   Pattern specification (list terminated by the END pattern item).
919  * @param[in] actions
920  *   Associated actions (list terminated by the END action).
921  * @param[out] error
922  *   Perform verbose error reporting if not NULL.
923  * @param[in, out] flow
924  *   Flow structure to update.
925  * @param[in] mirred
926  *   If set to TCA_EGRESS_REDIR, provided actions will be replaced with a
927  *   redirection to the tap netdevice, and the TC rule will be configured
928  *   on the remote netdevice in pmd.
929  *   If set to TCA_EGRESS_MIRROR, provided actions will be replaced with a
930  *   mirroring to the tap netdevice, and the TC rule will be configured
931  *   on the remote netdevice in pmd. Matching packets will thus be duplicated.
932  *   If set to 0, the standard behavior is to be used: set correct actions for
933  *   the TC rule, and apply it on the tap netdevice.
934  *
935  * @return
936  *   0 on success, a negative errno value otherwise and rte_errno is set.
937  */
938 static int
939 priv_flow_process(struct pmd_internals *pmd,
940 		  const struct rte_flow_attr *attr,
941 		  const struct rte_flow_item items[],
942 		  const struct rte_flow_action actions[],
943 		  struct rte_flow_error *error,
944 		  struct rte_flow *flow,
945 		  int mirred)
946 {
947 	const struct tap_flow_items *cur_item = tap_flow_items;
948 	struct convert_data data = {
949 		.eth_type = 0,
950 		.ip_proto = 0,
951 		.flow = flow,
952 	};
953 	int action = 0; /* Only one action authorized for now */
954 
955 	if (attr->transfer) {
956 		rte_flow_error_set(
957 			error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
958 			NULL, "transfer is not supported");
959 		return -rte_errno;
960 	}
961 	if (attr->group > MAX_GROUP) {
962 		rte_flow_error_set(
963 			error, EINVAL, RTE_FLOW_ERROR_TYPE_ATTR_GROUP,
964 			NULL, "group value too big: cannot exceed 15");
965 		return -rte_errno;
966 	}
967 	if (attr->priority > MAX_PRIORITY) {
968 		rte_flow_error_set(
969 			error, EINVAL, RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
970 			NULL, "priority value too big");
971 		return -rte_errno;
972 	} else if (flow) {
973 		uint16_t group = attr->group << GROUP_SHIFT;
974 		uint16_t prio = group | (attr->priority +
975 				RSS_PRIORITY_OFFSET + PRIORITY_OFFSET);
976 		flow->msg.t.tcm_info = TC_H_MAKE(prio << 16,
977 						 flow->msg.t.tcm_info);
978 	}
979 	if (flow) {
980 		if (mirred) {
981 			/*
982 			 * If attr->ingress, the rule applies on remote ingress
983 			 * to match incoming packets
984 			 * If attr->egress, the rule applies on tap ingress (as
985 			 * seen from the kernel) to deal with packets going out
986 			 * from the DPDK app.
987 			 */
988 			flow->msg.t.tcm_parent = TC_H_MAKE(TC_H_INGRESS, 0);
989 		} else {
990 			/* Standard rule on tap egress (kernel standpoint). */
991 			flow->msg.t.tcm_parent =
992 				TC_H_MAKE(MULTIQ_MAJOR_HANDLE, 0);
993 		}
994 		/* use flower filter type */
995 		tap_nlattr_add(&flow->msg.nh, TCA_KIND, sizeof("flower"), "flower");
996 		if (tap_nlattr_nested_start(&flow->msg, TCA_OPTIONS) < 0) {
997 			rte_flow_error_set(error, ENOMEM, RTE_FLOW_ERROR_TYPE_ACTION,
998 					   actions, "could not allocate netlink msg");
999 			goto exit_return_error;
1000 		}
1001 	}
1002 	for (; items->type != RTE_FLOW_ITEM_TYPE_END; ++items) {
1003 		const struct tap_flow_items *token = NULL;
1004 		unsigned int i;
1005 		int err = 0;
1006 
1007 		if (items->type == RTE_FLOW_ITEM_TYPE_VOID)
1008 			continue;
1009 		for (i = 0;
1010 		     cur_item->items &&
1011 		     cur_item->items[i] != RTE_FLOW_ITEM_TYPE_END;
1012 		     ++i) {
1013 			if (cur_item->items[i] == items->type) {
1014 				token = &tap_flow_items[items->type];
1015 				break;
1016 			}
1017 		}
1018 		if (!token)
1019 			goto exit_item_not_supported;
1020 		cur_item = token;
1021 		err = tap_flow_item_validate(
1022 			items, cur_item->mask_sz,
1023 			(const uint8_t *)cur_item->mask,
1024 			(const uint8_t *)cur_item->default_mask);
1025 		if (err)
1026 			goto exit_item_not_supported;
1027 		if (flow && cur_item->convert) {
1028 			err = cur_item->convert(items, &data);
1029 			if (err)
1030 				goto exit_item_not_supported;
1031 		}
1032 	}
1033 	if (flow) {
1034 		if (data.vlan) {
1035 			tap_nlattr_add16(&flow->msg.nh, TCA_FLOWER_KEY_ETH_TYPE,
1036 				     htons(ETH_P_8021Q));
1037 			tap_nlattr_add16(&flow->msg.nh,
1038 				     TCA_FLOWER_KEY_VLAN_ETH_TYPE,
1039 				     data.eth_type ?
1040 				     data.eth_type : htons(ETH_P_ALL));
1041 		} else if (data.eth_type) {
1042 			tap_nlattr_add16(&flow->msg.nh, TCA_FLOWER_KEY_ETH_TYPE,
1043 				     data.eth_type);
1044 		}
1045 	}
1046 	if (mirred && flow) {
1047 		struct action_data adata = {
1048 			.id = "mirred",
1049 			.mirred = {
1050 				.eaction = mirred,
1051 			},
1052 		};
1053 
1054 		/*
1055 		 * If attr->egress && mirred, then this is a special
1056 		 * case where the rule must be applied on the tap, to
1057 		 * redirect packets coming from the DPDK App, out
1058 		 * through the remote netdevice.
1059 		 */
1060 		adata.mirred.ifindex = attr->ingress ? pmd->if_index :
1061 			pmd->remote_if_index;
1062 		if (mirred == TCA_EGRESS_MIRROR)
1063 			adata.mirred.action = TC_ACT_PIPE;
1064 		else
1065 			adata.mirred.action = TC_ACT_STOLEN;
1066 		if (add_actions(flow, 1, &adata, TCA_FLOWER_ACT) < 0)
1067 			goto exit_action_not_supported;
1068 		else
1069 			goto end;
1070 	}
1071 actions:
1072 	for (; actions->type != RTE_FLOW_ACTION_TYPE_END; ++actions) {
1073 		int err = 0;
1074 
1075 		if (actions->type == RTE_FLOW_ACTION_TYPE_VOID) {
1076 			continue;
1077 		} else if (actions->type == RTE_FLOW_ACTION_TYPE_DROP) {
1078 			if (action)
1079 				goto exit_action_not_supported;
1080 			action = 1;
1081 			if (flow) {
1082 				struct action_data adata = {
1083 					.id = "gact",
1084 					.gact = {
1085 						.action = TC_ACT_SHOT,
1086 					},
1087 				};
1088 
1089 				err = add_actions(flow, 1, &adata,
1090 						  TCA_FLOWER_ACT);
1091 			}
1092 		} else if (actions->type == RTE_FLOW_ACTION_TYPE_PASSTHRU) {
1093 			if (action)
1094 				goto exit_action_not_supported;
1095 			action = 1;
1096 			if (flow) {
1097 				struct action_data adata = {
1098 					.id = "gact",
1099 					.gact = {
1100 						/* continue */
1101 						.action = TC_ACT_UNSPEC,
1102 					},
1103 				};
1104 
1105 				err = add_actions(flow, 1, &adata, TCA_FLOWER_ACT);
1106 			}
1107 		} else if (actions->type == RTE_FLOW_ACTION_TYPE_QUEUE) {
1108 			const struct rte_flow_action_queue *queue =
1109 				(const struct rte_flow_action_queue *)
1110 				actions->conf;
1111 
1112 			if (action)
1113 				goto exit_action_not_supported;
1114 			action = 1;
1115 			if (queue->index >= pmd->dev->data->nb_rx_queues) {
1116 				rte_flow_error_set(error, ERANGE,
1117 						   RTE_FLOW_ERROR_TYPE_ACTION, actions,
1118 						   "queue index out of range");
1119 				goto exit_return_error;
1120 			}
1121 			if (flow) {
1122 				struct action_data adata = {
1123 					.id = "skbedit",
1124 					.skbedit = {
1125 						.skbedit = {
1126 							.action = TC_ACT_PIPE,
1127 						},
1128 						.queue = queue->index,
1129 					},
1130 				};
1131 
1132 				err = add_actions(flow, 1, &adata,
1133 					TCA_FLOWER_ACT);
1134 			}
1135 #ifdef HAVE_BPF_RSS
1136 		} else if (actions->type == RTE_FLOW_ACTION_TYPE_RSS) {
1137 			const struct rte_flow_action_rss *rss =
1138 				(const struct rte_flow_action_rss *)
1139 				actions->conf;
1140 
1141 			if (action++)
1142 				goto exit_action_not_supported;
1143 
1144 			if (pmd->rss == NULL) {
1145 				err = rss_enable(pmd, error);
1146 				if (err)
1147 					goto exit_return_error;
1148 			}
1149 			if (flow)
1150 				err = rss_add_actions(flow, pmd, rss, error);
1151 #endif
1152 		} else {
1153 			goto exit_action_not_supported;
1154 		}
1155 		if (err)
1156 			goto exit_return_error;
1157 	}
1158 	/* When fate is unknown, drop traffic. */
1159 	if (!action) {
1160 		static const struct rte_flow_action drop[] = {
1161 			{ .type = RTE_FLOW_ACTION_TYPE_DROP, },
1162 			{ .type = RTE_FLOW_ACTION_TYPE_END, },
1163 		};
1164 
1165 		actions = drop;
1166 		goto actions;
1167 	}
1168 end:
1169 	if (flow)
1170 		tap_nlattr_nested_finish(&flow->msg); /* nested TCA_OPTIONS */
1171 	return 0;
1172 exit_item_not_supported:
1173 	rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM,
1174 			   items, "item not supported");
1175 	return -rte_errno;
1176 exit_action_not_supported:
1177 	rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION,
1178 			   actions, "action not supported");
1179 exit_return_error:
1180 	return -rte_errno;
1181 }
1182 
1183 
1184 
1185 /**
1186  * Validate a flow.
1187  *
1188  * @see rte_flow_validate()
1189  * @see rte_flow_ops
1190  */
1191 static int
1192 tap_flow_validate(struct rte_eth_dev *dev,
1193 		  const struct rte_flow_attr *attr,
1194 		  const struct rte_flow_item items[],
1195 		  const struct rte_flow_action actions[],
1196 		  struct rte_flow_error *error)
1197 {
1198 	struct pmd_internals *pmd = dev->data->dev_private;
1199 
1200 	return priv_flow_process(pmd, attr, items, actions, error, NULL, 0);
1201 }
1202 
1203 /**
1204  * Set a unique handle in a flow.
1205  *
1206  * The kernel supports TC rules with equal priority, as long as they use the
1207  * same matching fields (e.g.: dst mac and ipv4) with different values (and
1208  * full mask to ensure no collision is possible).
1209  * In those rules, the handle (uint32_t) is the part that specifically
1210  * identifies each rule.
1211  *
1212  * Use jhash of the flow pointer to make a unique handle.
1213  *
1214  * @param[in, out] flow
1215  *   The flow that needs its handle set.
1216  */
1217 static void
1218 tap_flow_set_handle(struct rte_flow *flow)
1219 {
1220 	union {
1221 		struct rte_flow *flow;
1222 		uint32_t words[sizeof(flow) / sizeof(uint32_t)];
1223 	} tmp = {
1224 		.flow = flow,
1225 	};
1226 	uint32_t handle;
1227 	static uint64_t hash_seed;
1228 
1229 	if (hash_seed == 0)
1230 		hash_seed = rte_rand();
1231 
1232 	handle = rte_jhash_32b(tmp.words, sizeof(flow) / sizeof(uint32_t), hash_seed);
1233 
1234 	/* must be at least 1 to avoid letting the kernel choose one for us */
1235 	if (!handle)
1236 		handle = 1;
1237 	flow->msg.t.tcm_handle = handle;
1238 }
1239 
1240 /**
1241  * Free the flow's allocated memory and remove its RSS BPF map entry, if any
1242  *
1243  * @param[in] flow
1244  *   Pointer to the flow to free
1245  *
1246  */
1247 static void
1248 tap_flow_free(struct pmd_internals *pmd __rte_unused, struct rte_flow *flow)
1249 {
1250 	if (!flow)
1251 		return;
1252 
1253 #ifdef HAVE_BPF_RSS
1254 	struct tap_rss *rss = pmd->rss;
1255 	if (rss)
1256 		bpf_map__delete_elem(rss->maps.rss_map,
1257 				     &flow->msg.t.tcm_handle, sizeof(uint32_t), 0);
1258 #endif
1259 	/* Free flow allocated memory */
1260 	rte_free(flow);
1261 }
1262 
1263 /**
1264  * Create a flow.
1265  *
1266  * @see rte_flow_create()
1267  * @see rte_flow_ops
1268  */
1269 static struct rte_flow *
1270 tap_flow_create(struct rte_eth_dev *dev,
1271 		const struct rte_flow_attr *attr,
1272 		const struct rte_flow_item items[],
1273 		const struct rte_flow_action actions[],
1274 		struct rte_flow_error *error)
1275 {
1276 	struct pmd_internals *pmd = dev->data->dev_private;
1277 	struct rte_flow *remote_flow = NULL;
1278 	struct rte_flow *flow = NULL;
1279 	struct nlmsg *msg = NULL;
1280 	int err;
1281 
1282 	if (!pmd->if_index) {
1283 		rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE,
1284 				   NULL,
1285 				   "can't create rule, ifindex not found");
1286 		goto fail;
1287 	}
1288 	/*
1289 	 * No rules configured through standard rte_flow should be set on the
1290 	 * priorities used by implicit rules.
1291 	 */
1292 	if ((attr->group == MAX_GROUP) &&
1293 	    attr->priority > (MAX_PRIORITY - TAP_REMOTE_MAX_IDX)) {
1294 		rte_flow_error_set(
1295 			error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
1296 			NULL, "priority value too big");
1297 		goto fail;
1298 	}
1299 	flow = rte_zmalloc(__func__, sizeof(struct rte_flow), 0);
1300 	if (!flow) {
1301 		rte_flow_error_set(error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
1302 				   NULL, "cannot allocate memory for rte_flow");
1303 		goto fail;
1304 	}
1305 	msg = &flow->msg;
1306 	tc_init_msg(msg, pmd->if_index, RTM_NEWTFILTER,
1307 		    NLM_F_REQUEST | NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE);
1308 	msg->t.tcm_info = TC_H_MAKE(0, htons(ETH_P_ALL));
1309 	tap_flow_set_handle(flow);
1310 	if (priv_flow_process(pmd, attr, items, actions, error, flow, 0))
1311 		goto fail;
1312 	err = tap_nl_send(pmd->nlsk_fd, &msg->nh);
1313 	if (err < 0) {
1314 		rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE,
1315 				   NULL, "couldn't send request to kernel");
1316 		goto fail;
1317 	}
1318 	err = tap_nl_recv_ack(pmd->nlsk_fd);
1319 	if (err < 0) {
1320 		TAP_LOG(ERR,
1321 			"Kernel refused TC filter rule creation (%d): %s",
1322 			errno, strerror(errno));
1323 		rte_flow_error_set(error, EEXIST, RTE_FLOW_ERROR_TYPE_HANDLE,
1324 				   NULL,
1325 				   "overlapping rules or Kernel too old for flower support");
1326 		goto fail;
1327 	}
1328 	LIST_INSERT_HEAD(&pmd->flows, flow, next);
1329 	/**
1330 	 * If a remote device is configured, a TC rule with identical items for
1331 	 * matching must be set on that device, with a single action: redirect
1332 	 * to the local pmd->if_index.
1333 	 */
1334 	if (pmd->remote_if_index) {
1335 		remote_flow = rte_zmalloc(__func__, sizeof(struct rte_flow), 0);
1336 		if (!remote_flow) {
1337 			rte_flow_error_set(
1338 				error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE, NULL,
1339 				"cannot allocate memory for rte_flow");
1340 			goto fail;
1341 		}
1342 		msg = &remote_flow->msg;
1343 		/* set the rule if_index for the remote netdevice */
1344 		tc_init_msg(
1345 			msg, pmd->remote_if_index, RTM_NEWTFILTER,
1346 			NLM_F_REQUEST | NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE);
1347 		msg->t.tcm_info = TC_H_MAKE(0, htons(ETH_P_ALL));
1348 		tap_flow_set_handle(remote_flow);
1349 		if (priv_flow_process(pmd, attr, items, NULL,
1350 				      error, remote_flow, TCA_EGRESS_REDIR)) {
1351 			rte_flow_error_set(
1352 				error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
1353 				NULL, "rte flow rule validation failed");
1354 			goto fail;
1355 		}
1356 		err = tap_nl_send(pmd->nlsk_fd, &msg->nh);
1357 		if (err < 0) {
1358 			rte_flow_error_set(
1359 				error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
1360 				NULL, "Failure sending nl request");
1361 			goto fail;
1362 		}
1363 		err = tap_nl_recv_ack(pmd->nlsk_fd);
1364 		if (err < 0) {
1365 			TAP_LOG(ERR,
1366 				"Kernel refused TC filter rule creation (%d): %s",
1367 				errno, strerror(errno));
1368 			rte_flow_error_set(
1369 				error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
1370 				NULL,
1371 				"overlapping rules or Kernel too old for flower support");
1372 			goto fail;
1373 		}
1374 		flow->remote_flow = remote_flow;
1375 	}
1376 	return flow;
1377 fail:
1378 	rte_free(remote_flow);
1379 	if (flow)
1380 		tap_flow_free(pmd, flow);
1381 	return NULL;
1382 }
1383 
1384 /**
1385  * Destroy a flow using pointer to pmd_internal.
1386  *
1387  * @param[in, out] pmd
1388  *   Pointer to private structure.
1389  * @param[in] flow
1390  *   Pointer to the flow to destroy.
1391  * @param[in, out] error
1392  *   Pointer to the flow error handler
1393  *
1394  * @return 0 if the flow could be destroyed, -1 otherwise.
1395  */
1396 static int
1397 tap_flow_destroy_pmd(struct pmd_internals *pmd,
1398 		     struct rte_flow *flow,
1399 		     struct rte_flow_error *error)
1400 {
1401 	struct rte_flow *remote_flow = flow->remote_flow;
1402 	int ret = 0;
1403 
1404 	LIST_REMOVE(flow, next);
1405 	flow->msg.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
1406 	flow->msg.nh.nlmsg_type = RTM_DELTFILTER;
1407 
1408 	ret = tap_nl_send(pmd->nlsk_fd, &flow->msg.nh);
1409 	if (ret < 0) {
1410 		rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE,
1411 				   NULL, "couldn't send request to kernel");
1412 		goto end;
1413 	}
1414 	ret = tap_nl_recv_ack(pmd->nlsk_fd);
1415 	/* If errno is ENOENT, the rule is no longer present in the kernel. */
1416 	if (ret < 0 && errno == ENOENT)
1417 		ret = 0;
1418 	if (ret < 0) {
1419 		TAP_LOG(ERR,
1420 			"Kernel refused TC filter rule deletion (%d): %s",
1421 			errno, strerror(errno));
1422 		rte_flow_error_set(
1423 			error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE, NULL,
1424 			"couldn't receive kernel ack to our request");
1425 		goto end;
1426 	}
1427 
1428 	if (remote_flow) {
1429 		remote_flow->msg.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
1430 		remote_flow->msg.nh.nlmsg_type = RTM_DELTFILTER;
1431 
1432 		ret = tap_nl_send(pmd->nlsk_fd, &remote_flow->msg.nh);
1433 		if (ret < 0) {
1434 			rte_flow_error_set(
1435 				error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
1436 				NULL, "Failure sending nl request");
1437 			goto end;
1438 		}
1439 		ret = tap_nl_recv_ack(pmd->nlsk_fd);
1440 		if (ret < 0 && errno == ENOENT)
1441 			ret = 0;
1442 		if (ret < 0) {
1443 			TAP_LOG(ERR,
1444 				"Kernel refused TC filter rule deletion (%d): %s",
1445 				errno, strerror(errno));
1446 			rte_flow_error_set(
1447 				error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
1448 				NULL, "Failure trying to receive nl ack");
1449 			goto end;
1450 		}
1451 	}
1452 end:
1453 	rte_free(remote_flow);
1454 	tap_flow_free(pmd, flow);
1455 	return ret;
1456 }
1457 
1458 /**
1459  * Destroy a flow.
1460  *
1461  * @see rte_flow_destroy()
1462  * @see rte_flow_ops
1463  */
1464 static int
1465 tap_flow_destroy(struct rte_eth_dev *dev,
1466 		 struct rte_flow *flow,
1467 		 struct rte_flow_error *error)
1468 {
1469 	struct pmd_internals *pmd = dev->data->dev_private;
1470 
1471 	return tap_flow_destroy_pmd(pmd, flow, error);
1472 }
1473 
1474 /**
1475  * Enable/disable flow isolation.
1476  *
1477  * @see rte_flow_isolate()
1478  * @see rte_flow_ops
1479  */
1480 static int
1481 tap_flow_isolate(struct rte_eth_dev *dev,
1482 		 int set,
1483 		 struct rte_flow_error *error __rte_unused)
1484 {
1485 	struct pmd_internals *pmd = dev->data->dev_private;
1486 	struct pmd_process_private *process_private = dev->process_private;
1487 
1488 	/* normalize 'set' variable to contain 0 or 1 values */
1489 	if (set)
1490 		set = 1;
1491 	/* if already in the right isolation mode - nothing to do */
1492 	if ((set ^ pmd->flow_isolate) == 0)
1493 		return 0;
1494 	/* mark the isolation mode for tap_flow_implicit_create() */
1495 	pmd->flow_isolate = set;
1496 	/*
1497 	 * If the netdevice is there, set up the appropriate flow rules immediately.
1498 	 * Otherwise they will be set when bringing up the netdevice (tun_alloc).
1499 	 */
1500 	if (process_private->fds[0] == -1)
1501 		return 0;
1502 
1503 	if (set) {
1504 		struct rte_flow *remote_flow;
1505 
1506 		while (1) {
1507 			remote_flow = LIST_FIRST(&pmd->implicit_flows);
1508 			if (!remote_flow)
1509 				break;
1510 			/*
1511 			 * Remove all implicit rules on the remote.
1512 			 * Keep the local rule to redirect packets on TX.
1513 			 * Keep also the last implicit local rule: ISOLATE.
1514 			 */
1515 			if (remote_flow->msg.t.tcm_ifindex == pmd->if_index)
1516 				break;
1517 			if (tap_flow_destroy_pmd(pmd, remote_flow, NULL) < 0)
1518 				goto error;
1519 		}
1520 		/* Switch the TC rule according to pmd->flow_isolate */
1521 		if (tap_flow_implicit_create(pmd, TAP_ISOLATE) == -1)
1522 			goto error;
1523 	} else {
1524 		/* Switch the TC rule according to pmd->flow_isolate */
1525 		if (tap_flow_implicit_create(pmd, TAP_ISOLATE) == -1)
1526 			goto error;
1527 		if (!pmd->remote_if_index)
1528 			return 0;
1529 		if (tap_flow_implicit_create(pmd, TAP_REMOTE_TX) < 0)
1530 			goto error;
1531 		if (tap_flow_implicit_create(pmd, TAP_REMOTE_LOCAL_MAC) < 0)
1532 			goto error;
1533 		if (tap_flow_implicit_create(pmd, TAP_REMOTE_BROADCAST) < 0)
1534 			goto error;
1535 		if (tap_flow_implicit_create(pmd, TAP_REMOTE_BROADCASTV6) < 0)
1536 			goto error;
1537 		if (dev->data->promiscuous &&
1538 		    tap_flow_implicit_create(pmd, TAP_REMOTE_PROMISC) < 0)
1539 			goto error;
1540 		if (dev->data->all_multicast &&
1541 		    tap_flow_implicit_create(pmd, TAP_REMOTE_ALLMULTI) < 0)
1542 			goto error;
1543 	}
1544 	return 0;
1545 error:
1546 	pmd->flow_isolate = 0;
1547 	return rte_flow_error_set(
1548 		error, ENOTSUP, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
1549 		"TC rule creation failed");
1550 }
1551 
1552 /**
1553  * Destroy all flows.
1554  *
1555  * @see rte_flow_flush()
1556  * @see rte_flow_ops
1557  */
1558 int
1559 tap_flow_flush(struct rte_eth_dev *dev, struct rte_flow_error *error)
1560 {
1561 	struct pmd_internals *pmd = dev->data->dev_private;
1562 	struct rte_flow *flow;
1563 
1564 	while (!LIST_EMPTY(&pmd->flows)) {
1565 		flow = LIST_FIRST(&pmd->flows);
1566 		if (tap_flow_destroy(dev, flow, error) < 0)
1567 			return -1;
1568 	}
1569 	return 0;
1570 }
1571 
1572 /**
1573  * Add an implicit flow rule on the remote device to make sure traffic gets to
1574  * the tap netdevice from there.
1575  *
1576  * @param pmd
1577  *   Pointer to private structure.
1578  * @param[in] idx
1579  *   The idx in the implicit_rte_flows array specifying which rule to apply.
1580  *
1581  * @return -1 if the rule couldn't be applied, 0 otherwise.
1582  */
1583 int tap_flow_implicit_create(struct pmd_internals *pmd,
1584 			     enum implicit_rule_index idx)
1585 {
1586 	uint16_t flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE;
1587 	struct rte_flow_action *actions = implicit_rte_flows[idx].actions;
1588 	struct rte_flow_action isolate_actions[2] = {
1589 		[1] = {
1590 			.type = RTE_FLOW_ACTION_TYPE_END,
1591 		},
1592 	};
1593 	struct rte_flow_item *items = implicit_rte_flows[idx].items;
1594 	struct rte_flow_attr *attr = &implicit_rte_flows[idx].attr;
1595 	struct rte_flow_item_eth eth_local = { .hdr.ether_type = 0 };
1596 	unsigned int if_index = pmd->remote_if_index;
1597 	struct rte_flow *remote_flow = NULL;
1598 	struct nlmsg *msg = NULL;
1599 	int err = 0;
1600 	struct rte_flow_item items_local[2] = {
1601 		[0] = {
1602 			.type = items[0].type,
1603 			.spec = &eth_local,
1604 			.mask = items[0].mask,
1605 		},
1606 		[1] = {
1607 			.type = items[1].type,
1608 		}
1609 	};
1610 
1611 	remote_flow = rte_zmalloc(__func__, sizeof(struct rte_flow), 0);
1612 	if (!remote_flow) {
1613 		TAP_LOG(ERR, "Cannot allocate memory for rte_flow");
1614 		goto fail;
1615 	}
1616 	msg = &remote_flow->msg;
1617 	if (idx == TAP_REMOTE_TX) {
1618 		if_index = pmd->if_index;
1619 	} else if (idx == TAP_ISOLATE) {
1620 		if_index = pmd->if_index;
1621 		/* Don't be exclusive for this rule, it can be changed later. */
1622 		flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_CREATE;
1623 		isolate_actions[0].type = pmd->flow_isolate ?
1624 			RTE_FLOW_ACTION_TYPE_DROP :
1625 			RTE_FLOW_ACTION_TYPE_PASSTHRU;
1626 		actions = isolate_actions;
1627 	} else if (idx == TAP_REMOTE_LOCAL_MAC) {
1628 		/*
1629 		 * eth addr couldn't be set in implicit_rte_flows[] as it is not
1630 		 * known at compile time.
1631 		 */
1632 		memcpy(&eth_local.hdr.dst_addr, &pmd->eth_addr, sizeof(pmd->eth_addr));
1633 		items = items_local;
1634 	}
1635 	tc_init_msg(msg, if_index, RTM_NEWTFILTER, flags);
1636 	msg->t.tcm_info = TC_H_MAKE(0, htons(ETH_P_ALL));
1637 	/*
1638 	 * The ISOLATE rule is always present and must have a static handle, as
1639 	 * the action is changed whether the feature is enabled (DROP) or
1640 	 * disabled (PASSTHRU).
1641 	 * There is just one REMOTE_PROMISCUOUS rule in all cases. It should
1642 	 * have a static handle such that adding it twice will fail with EEXIST
1643 	 * with any kernel version. Remark: old kernels may falsely accept the
1644 	 * same REMOTE_PROMISCUOUS rules if they had different handles.
1645 	 */
1646 	if (idx == TAP_ISOLATE)
1647 		remote_flow->msg.t.tcm_handle = ISOLATE_HANDLE;
1648 	else if (idx == TAP_REMOTE_PROMISC)
1649 		remote_flow->msg.t.tcm_handle = REMOTE_PROMISCUOUS_HANDLE;
1650 	else
1651 		tap_flow_set_handle(remote_flow);
1652 	if (priv_flow_process(pmd, attr, items, actions, NULL,
1653 			      remote_flow, implicit_rte_flows[idx].mirred)) {
1654 		TAP_LOG(ERR, "rte flow rule validation failed");
1655 		goto fail;
1656 	}
1657 	err = tap_nl_send(pmd->nlsk_fd, &msg->nh);
1658 	if (err < 0) {
1659 		TAP_LOG(ERR, "Failure sending nl request");
1660 		goto fail;
1661 	}
1662 	err = tap_nl_recv_ack(pmd->nlsk_fd);
1663 	if (err < 0) {
1664 		/* Silently ignore re-entering existing rule */
1665 		if (errno == EEXIST)
1666 			goto success;
1667 		TAP_LOG(ERR,
1668 			"Kernel refused TC filter rule creation (%d): %s",
1669 			errno, strerror(errno));
1670 		goto fail;
1671 	}
1672 	LIST_INSERT_HEAD(&pmd->implicit_flows, remote_flow, next);
1673 success:
1674 	return 0;
1675 fail:
1676 	rte_free(remote_flow);
1677 	return -1;
1678 }
1679 
1680 /**
1681  * Remove specific implicit flow rule on the remote device.
1682  *
1683  * @param[in, out] pmd
1684  *   Pointer to private structure.
1685  * @param[in] idx
1686  *   The idx in the implicit_rte_flows array specifying which rule to remove.
1687  *
1688  * @return -1 if the implicit rule couldn't be destroyed, 0 otherwise.
1689  */
1690 int tap_flow_implicit_destroy(struct pmd_internals *pmd,
1691 			      enum implicit_rule_index idx)
1692 {
1693 	struct rte_flow *remote_flow;
1694 	int cur_prio = -1;
1695 	int idx_prio = implicit_rte_flows[idx].attr.priority + PRIORITY_OFFSET;
1696 
1697 	for (remote_flow = LIST_FIRST(&pmd->implicit_flows);
1698 	     remote_flow;
1699 	     remote_flow = LIST_NEXT(remote_flow, next)) {
1700 		cur_prio = (remote_flow->msg.t.tcm_info >> 16) & PRIORITY_MASK;
1701 		if (cur_prio != idx_prio)
1702 			continue;
1703 		return tap_flow_destroy_pmd(pmd, remote_flow, NULL);
1704 	}
1705 	return 0;
1706 }
1707 
1708 /**
1709  * Destroy all implicit flows.
1710  *
1711  * @see rte_flow_flush()
1712  */
1713 int
1714 tap_flow_implicit_flush(struct pmd_internals *pmd, struct rte_flow_error *error)
1715 {
1716 	struct rte_flow *remote_flow;
1717 
1718 	while (!LIST_EMPTY(&pmd->implicit_flows)) {
1719 		remote_flow = LIST_FIRST(&pmd->implicit_flows);
1720 		if (tap_flow_destroy_pmd(pmd, remote_flow, error) < 0)
1721 			return -1;
1722 	}
1723 	return 0;
1724 }
1725 
1726 /**
1727  * Cleanup when device is closed
1728  */
1729 void tap_flow_bpf_destroy(struct pmd_internals *pmd __rte_unused)
1730 {
1731 #ifdef HAVE_BPF_RSS
1732 	tap_rss__destroy(pmd->rss);
1733 	pmd->rss = NULL;
1734 #endif
1735 }
1736 
1737 #ifdef HAVE_BPF_RSS
1738 /**
1739  * Enable RSS on tap: load and attach the BPF program used for queue selection.
1740  *
1741  * @param[in, out] pmd
1742  *   Pointer to private structure.
1743  *
1747  * @param[out] error
1748  *   Pointer to error reporting if not NULL.
1749  *
1750  * @return 0 on success, negative value on failure.
1751  */
1752 static int rss_enable(struct pmd_internals *pmd, struct rte_flow_error *error)
1753 {
1754 	int err;
1755 
1756 	/* Load the BPF program from the generated skeleton (tap_rss.skel.h) */
1757 	pmd->rss = tap_rss__open_and_load();
1758 	if (pmd->rss == NULL) {
1759 		TAP_LOG(ERR, "Failed to load BPF object: %s", strerror(errno));
1760 		rte_flow_error_set(error, errno, RTE_FLOW_ERROR_TYPE_HANDLE, NULL,
1761 			"BPF object could not be loaded");
1762 		return -errno;
1763 	}
1764 
1765 	/* Attach the BPF program defined in the skeleton */
1766 	err = tap_rss__attach(pmd->rss);
1767 	if (err < 0) {
1768 		TAP_LOG(ERR, "Failed to attach BPF object: %d", err);
1769 		rte_flow_error_set(error, -err, RTE_FLOW_ERROR_TYPE_HANDLE, NULL,
1770 			"BPF object could not be attached");
1771 		tap_flow_bpf_destroy(pmd);
1772 		return err;
1773 	}
1774 
1775 	return 0;
1776 }
1777 
1778 /* Default RSS hash key also used by mlx devices */
1779 static const uint8_t rss_hash_default_key[] = {
1780 	0x2c, 0xc6, 0x81, 0xd1,
1781 	0x5b, 0xdb, 0xf4, 0xf7,
1782 	0xfc, 0xa2, 0x83, 0x19,
1783 	0xdb, 0x1a, 0x3e, 0x94,
1784 	0x6b, 0x9e, 0x38, 0xd9,
1785 	0x2c, 0x9c, 0x03, 0xd1,
1786 	0xad, 0x99, 0x44, 0xa7,
1787 	0xd9, 0x56, 0x3d, 0x59,
1788 	0x06, 0x3c, 0x25, 0xf3,
1789 	0xfc, 0x1f, 0xdc, 0x2a,
1790 };
1791 
1792 /**
1793  * Add RSS hash calculations and queue selection
1794  *
1795  * @param[in, out] pmd
1796  *   Pointer to internal structure. Used to set/get RSS map fd
1797  *
1798  * @param[in] rss
1799  *   Pointer to RSS flow actions
1800  *
1801  * @param[out] error
1802  *   Pointer to error reporting if not NULL.
1803  *
1804  * @return 0 on success, negative value on failure
1805  */
1806 static int rss_add_actions(struct rte_flow *flow, struct pmd_internals *pmd,
1807 			   const struct rte_flow_action_rss *rss,
1808 			   struct rte_flow_error *error)
1809 {
1810 	const struct bpf_program *rss_prog = pmd->rss->progs.rss_flow_action;
1811 	struct rss_key rss_entry = { };
1812 	const uint8_t *key_in;
1813 	uint32_t hash_type = 0;
1814 	uint32_t handle = flow->msg.t.tcm_handle;
1815 	unsigned int i;
1816 	int err;
1817 
1818 	/* Check supported RSS features */
1819 	if (rss->func != RTE_ETH_HASH_FUNCTION_DEFAULT)
1820 		return rte_flow_error_set
1821 			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
1822 			 "non-default RSS hash functions are not supported");
1823 	if (rss->level)
1824 		return rte_flow_error_set
1825 			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
1826 			 "a nonzero RSS encapsulation level is not supported");
1827 
1828 	if (rss->queue_num == 0 || rss->queue_num >= TAP_MAX_QUEUES)
1829 		return rte_flow_error_set(error, EINVAL, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
1830 					  "invalid number of queues");
1831 
1832 	/*
1833 	 * Follow the semantics of RSS key (see rte_ethdev.h)
1834 	 * There are two valid cases:
1835 	 *   1. key_length of zero, and key must be NULL;
1836 	 *      this uses the default driver key.
1837 	 *
1838 	 *   2. key_length is the TAP_RSS_HASH_KEY_SIZE (40 bytes)
1839 	 *      and the key must not be NULL.
1840 	 *
1841 	 * Anything else is an error.
1842 	 */
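	/*
	 * Example (illustrative): key_len == 0 with key == NULL selects
	 * rss_hash_default_key; key_len == 40 with a non-NULL key uses the
	 * caller's key; any other length (e.g. 20) is rejected with EINVAL.
	 */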
1843 	if (rss->key_len == 0) {
1844 		if (rss->key != NULL)
1845 			return rte_flow_error_set(error, ENOTSUP,
1846 						  RTE_FLOW_ERROR_TYPE_ACTION_CONF,
1847 						  &rss->key_len, "RSS hash key length 0");
1848 		key_in = rss_hash_default_key;
1849 	} else {
1850 		if (rss->key_len != TAP_RSS_HASH_KEY_SIZE)
1851 			return rte_flow_error_set(error, EINVAL, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
1852 						  NULL, "RSS hash invalid key length");
1853 		if (rss->key == NULL)
1854 			return rte_flow_error_set(error, EINVAL, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
1855 						  NULL, "RSS hash key is NULL");
1856 		key_in = rss->key;
1857 	}
1858 
1859 	if (rss->types & TAP_RSS_HF_MASK)
1860 		return rte_flow_error_set(error, EINVAL, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
1861 					  NULL, "RSS hash type not supported");
1862 
1863 	if (rss->types & (RTE_ETH_RSS_NONFRAG_IPV4_UDP | RTE_ETH_RSS_NONFRAG_IPV4_TCP))
1864 		hash_type |= RTE_BIT32(HASH_FIELD_IPV4_L3_L4);
1865 	else if (rss->types & (RTE_ETH_RSS_IPV4 | RTE_ETH_RSS_FRAG_IPV4))
1866 		hash_type |= RTE_BIT32(HASH_FIELD_IPV4_L3);
1867 
1868 	if (rss->types & (RTE_ETH_RSS_NONFRAG_IPV6_UDP | RTE_ETH_RSS_NONFRAG_IPV6_TCP))
1869 		hash_type |= RTE_BIT32(HASH_FIELD_IPV6_L3_L4);
1870 	else if (rss->types & (RTE_ETH_RSS_IPV6 | RTE_ETH_RSS_FRAG_IPV6 | RTE_ETH_RSS_IPV6_EX))
1871 		hash_type |= RTE_BIT32(HASH_FIELD_IPV6_L3);
1872 
1873 	rss_entry.hash_fields = hash_type;
1874 	rte_convert_rss_key((const uint32_t *)key_in, (uint32_t *)rss_entry.key,
1875 			    TAP_RSS_HASH_KEY_SIZE);
1876 
1877 	/* Update RSS map entry with queues */
1878 	rss_entry.nb_queues = rss->queue_num;
1879 	for (i = 0; i < rss->queue_num; i++)
1880 		rss_entry.queues[i] = rss->queue[i];
1881 
1882 
1883 	/* Key the map entry by the flow handle so the BPF program can find it */
1884 	err = bpf_map__update_elem(pmd->rss->maps.rss_map,
1885 				   &handle, sizeof(handle),
1886 				   &rss_entry, sizeof(rss_entry), 0);
1887 	if (err) {
1888 		TAP_LOG(ERR,
1889 			"Failed to update BPF map entry %#x (%d): %s",
1890 			handle,  errno, strerror(errno));
1891 		rte_flow_error_set(
1892 			error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE, NULL,
1893 			"Kernel too old or not configured "
1894 			"to support BPF map updates");
1895 
1896 		return -ENOTSUP;
1897 	}
1898 
1899 	/* Add actions to mark packet then run the RSS BPF program */
1900 	struct action_data adata[] = {
1901 		{
1902 			.id = "skbedit",
1903 			.skbedit = {
1904 				.skbedit.action = TC_ACT_PIPE,
1905 				.mark = handle,
1906 			},
1907 		},
1908 		{
1909 			.id = "bpf",
1910 			.bpf = {
1911 				.bpf.action = TC_ACT_PIPE,
1912 				.annotation = "tap_rss",
1913 				.bpf_fd = bpf_program__fd(rss_prog),
1914 			},
1915 		},
1916 	};
1917 
1918 	return add_actions(flow, RTE_DIM(adata), adata, TCA_FLOWER_ACT);
1919 }
1920 #endif
1921 
1922 /**
1923  * Get rte_flow operations.
1924  *
1925  * @param dev
1926  *   Pointer to Ethernet device structure.
1927  * @param ops
1928  *   Pointer to operation-specific structure.
1929  *
1930  * @return
1931  *   0 on success, negative errno value on failure.
1932  */
1933 int
1934 tap_dev_flow_ops_get(struct rte_eth_dev *dev __rte_unused,
1935 		     const struct rte_flow_ops **ops)
1936 {
1937 	*ops = &tap_flow_ops;
1938 	return 0;
1939 }
1940