/*-
 *   BSD LICENSE
 *
 *   Copyright 2017 6WIND S.A.
 *   Copyright 2017 Mellanox.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of 6WIND S.A. nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <errno.h>
#include <string.h>
#include <sys/queue.h>

#include <rte_byteorder.h>
#include <rte_jhash.h>
#include <rte_malloc.h>
#include <rte_eth_tap.h>
#include <tap_flow.h>
#include <tap_autoconf.h>
#include <tap_tcmsgs.h>

#ifndef HAVE_TC_FLOWER
/*
 * For kernels < 4.2, this enum is not defined. Runtime checks will be made to
 * avoid sending TC messages the kernel cannot understand.
 */
enum {
	TCA_FLOWER_UNSPEC,
	TCA_FLOWER_CLASSID,
	TCA_FLOWER_INDEV,
	TCA_FLOWER_ACT,
	TCA_FLOWER_KEY_ETH_DST,         /* ETH_ALEN */
	TCA_FLOWER_KEY_ETH_DST_MASK,    /* ETH_ALEN */
	TCA_FLOWER_KEY_ETH_SRC,         /* ETH_ALEN */
	TCA_FLOWER_KEY_ETH_SRC_MASK,    /* ETH_ALEN */
	TCA_FLOWER_KEY_ETH_TYPE,        /* be16 */
	TCA_FLOWER_KEY_IP_PROTO,        /* u8 */
	TCA_FLOWER_KEY_IPV4_SRC,        /* be32 */
	TCA_FLOWER_KEY_IPV4_SRC_MASK,   /* be32 */
	TCA_FLOWER_KEY_IPV4_DST,        /* be32 */
	TCA_FLOWER_KEY_IPV4_DST_MASK,   /* be32 */
	TCA_FLOWER_KEY_IPV6_SRC,        /* struct in6_addr */
	TCA_FLOWER_KEY_IPV6_SRC_MASK,   /* struct in6_addr */
	TCA_FLOWER_KEY_IPV6_DST,        /* struct in6_addr */
	TCA_FLOWER_KEY_IPV6_DST_MASK,   /* struct in6_addr */
	TCA_FLOWER_KEY_TCP_SRC,         /* be16 */
	TCA_FLOWER_KEY_TCP_DST,         /* be16 */
	TCA_FLOWER_KEY_UDP_SRC,         /* be16 */
	TCA_FLOWER_KEY_UDP_DST,         /* be16 */
};
#endif
#ifndef HAVE_TC_VLAN_ID
enum {
	/* TCA_FLOWER_FLAGS, */
	TCA_FLOWER_KEY_VLAN_ID = TCA_FLOWER_KEY_UDP_DST + 2, /* be16 */
	TCA_FLOWER_KEY_VLAN_PRIO,       /* u8   */
	TCA_FLOWER_KEY_VLAN_ETH_TYPE,   /* be16 */
};
#endif

#define ISOLATE_HANDLE 1 /* static TC handle of the always-present ISOLATE rule */

struct rte_flow {
	LIST_ENTRY(rte_flow) next; /* Pointer to the next rte_flow structure */
	struct rte_flow *remote_flow; /* associated remote flow */
	struct nlmsg msg;
};

struct convert_data {
	uint16_t eth_type; /* EtherType seen in the pattern (network order) */
	uint16_t ip_proto; /* L4 protocol seen in the pattern */
	uint8_t vlan; /* set when a VLAN item is present in the pattern */
	struct rte_flow *flow; /* flow to fill, NULL when only validating */
};

struct remote_rule {
	struct rte_flow_attr attr;
	struct rte_flow_item items[2];
	struct rte_flow_action actions[2];
	int mirred;
};

static int tap_flow_create_eth(const struct rte_flow_item *item, void *data);
static int tap_flow_create_vlan(const struct rte_flow_item *item, void *data);
static int tap_flow_create_ipv4(const struct rte_flow_item *item, void *data);
static int tap_flow_create_ipv6(const struct rte_flow_item *item, void *data);
static int tap_flow_create_udp(const struct rte_flow_item *item, void *data);
static int tap_flow_create_tcp(const struct rte_flow_item *item, void *data);
static int
tap_flow_validate(struct rte_eth_dev *dev,
		  const struct rte_flow_attr *attr,
		  const struct rte_flow_item items[],
		  const struct rte_flow_action actions[],
		  struct rte_flow_error *error);

static struct rte_flow *
tap_flow_create(struct rte_eth_dev *dev,
		const struct rte_flow_attr *attr,
		const struct rte_flow_item items[],
		const struct rte_flow_action actions[],
		struct rte_flow_error *error);

static int
tap_flow_destroy(struct rte_eth_dev *dev,
		 struct rte_flow *flow,
		 struct rte_flow_error *error);

static int
tap_flow_isolate(struct rte_eth_dev *dev,
		 int set,
		 struct rte_flow_error *error);

static const struct rte_flow_ops tap_flow_ops = {
	.validate = tap_flow_validate,
	.create = tap_flow_create,
	.destroy = tap_flow_destroy,
	.flush = tap_flow_flush,
	.isolate = tap_flow_isolate,
};
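
/*
 * Illustrative note (sketch, not part of the driver logic): applications do
 * not call the callbacks above directly. They go through the generic
 * rte_flow API, which retrieves tap_flow_ops via tap_dev_filter_ctrl()
 * at the end of this file, e.g. (error checks omitted):
 *
 *   struct rte_flow_error err;
 *   struct rte_flow *f;
 *
 *   f = rte_flow_create(port_id, &attr, pattern, actions, &err);
 *   ...
 *   rte_flow_destroy(port_id, f, &err);
 */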

/* Static initializer for items. */
#define ITEMS(...) \
	(const enum rte_flow_item_type []){ \
		__VA_ARGS__, RTE_FLOW_ITEM_TYPE_END, \
	}
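
/*
 * For instance, ITEMS(RTE_FLOW_ITEM_TYPE_IPV4, RTE_FLOW_ITEM_TYPE_IPV6)
 * expands to a compound literal equivalent to:
 *
 *   (const enum rte_flow_item_type []){
 *           RTE_FLOW_ITEM_TYPE_IPV4,
 *           RTE_FLOW_ITEM_TYPE_IPV6,
 *           RTE_FLOW_ITEM_TYPE_END,
 *   }
 */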

/* Structure to generate a simple graph of layers supported by the NIC. */
struct tap_flow_items {
	/* Bit-mask corresponding to what is supported for this item. */
	const void *mask;
	const unsigned int mask_sz; /* Bit-mask size in bytes. */
	/*
	 * Bit-mask corresponding to the default mask, if none is provided
	 * along with the item.
	 */
	const void *default_mask;
	/**
	 * Conversion function from rte_flow to netlink attributes.
	 *
	 * @param item
	 *   rte_flow item to convert.
	 * @param data
	 *   Internal structure to store the conversion.
	 *
	 * @return
	 *   0 on success, negative value otherwise.
	 */
	int (*convert)(const struct rte_flow_item *item, void *data);
	/** List of possible following items.  */
	const enum rte_flow_item_type *const items;
};

/* Graph of supported items and associated actions. */
static const struct tap_flow_items tap_flow_items[] = {
	[RTE_FLOW_ITEM_TYPE_END] = {
		.items = ITEMS(RTE_FLOW_ITEM_TYPE_ETH),
	},
	[RTE_FLOW_ITEM_TYPE_ETH] = {
		.items = ITEMS(
			RTE_FLOW_ITEM_TYPE_VLAN,
			RTE_FLOW_ITEM_TYPE_IPV4,
			RTE_FLOW_ITEM_TYPE_IPV6),
		.mask = &(const struct rte_flow_item_eth){
			.dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
			.src.addr_bytes = "\xff\xff\xff\xff\xff\xff",
			.type = -1,
		},
		.mask_sz = sizeof(struct rte_flow_item_eth),
		.default_mask = &rte_flow_item_eth_mask,
		.convert = tap_flow_create_eth,
	},
	[RTE_FLOW_ITEM_TYPE_VLAN] = {
		.items = ITEMS(RTE_FLOW_ITEM_TYPE_IPV4,
			       RTE_FLOW_ITEM_TYPE_IPV6),
		.mask = &(const struct rte_flow_item_vlan){
			.tpid = -1,
			/* DEI matching is not supported */
#if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN
			.tci = 0xffef,
#else
			.tci = 0xefff,
#endif
		},
		.mask_sz = sizeof(struct rte_flow_item_vlan),
		.default_mask = &rte_flow_item_vlan_mask,
		.convert = tap_flow_create_vlan,
	},
	[RTE_FLOW_ITEM_TYPE_IPV4] = {
		.items = ITEMS(RTE_FLOW_ITEM_TYPE_UDP,
			       RTE_FLOW_ITEM_TYPE_TCP),
		.mask = &(const struct rte_flow_item_ipv4){
			.hdr = {
				.src_addr = -1,
				.dst_addr = -1,
				.next_proto_id = -1,
			},
		},
		.mask_sz = sizeof(struct rte_flow_item_ipv4),
		.default_mask = &rte_flow_item_ipv4_mask,
		.convert = tap_flow_create_ipv4,
	},
	[RTE_FLOW_ITEM_TYPE_IPV6] = {
		.items = ITEMS(RTE_FLOW_ITEM_TYPE_UDP,
			       RTE_FLOW_ITEM_TYPE_TCP),
		.mask = &(const struct rte_flow_item_ipv6){
			.hdr = {
				.src_addr = {
					"\xff\xff\xff\xff\xff\xff\xff\xff"
					"\xff\xff\xff\xff\xff\xff\xff\xff",
				},
				.dst_addr = {
					"\xff\xff\xff\xff\xff\xff\xff\xff"
					"\xff\xff\xff\xff\xff\xff\xff\xff",
				},
				.proto = -1,
			},
		},
		.mask_sz = sizeof(struct rte_flow_item_ipv6),
		.default_mask = &rte_flow_item_ipv6_mask,
		.convert = tap_flow_create_ipv6,
	},
	[RTE_FLOW_ITEM_TYPE_UDP] = {
		.mask = &(const struct rte_flow_item_udp){
			.hdr = {
				.src_port = -1,
				.dst_port = -1,
			},
		},
		.mask_sz = sizeof(struct rte_flow_item_udp),
		.default_mask = &rte_flow_item_udp_mask,
		.convert = tap_flow_create_udp,
	},
	[RTE_FLOW_ITEM_TYPE_TCP] = {
		.mask = &(const struct rte_flow_item_tcp){
			.hdr = {
				.src_port = -1,
				.dst_port = -1,
			},
		},
		.mask_sz = sizeof(struct rte_flow_item_tcp),
		.default_mask = &rte_flow_item_tcp_mask,
		.convert = tap_flow_create_tcp,
	},
};
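
/*
 * Example walk through the graph above (illustrative): a pattern of
 * ETH / VLAN / IPV4 / UDP is accepted because each item appears in the
 * .items list of its predecessor, starting from RTE_FLOW_ITEM_TYPE_END:
 *
 *   END -> ETH -> VLAN -> IPV4 -> UDP
 *
 * priv_flow_process() performs exactly this traversal.
 */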

/*
 *                TC rules, by growing priority
 *
 *        Remote netdevice                  Tap netdevice
 * +-------------+-------------+  +-------------+-------------+
 * |   Ingress   |   Egress    |  |   Ingress   |   Egress    |
 * |-------------|-------------|  |-------------|-------------|
 * |             |  \       /  |  |             |  REMOTE TX  | prio 1
 * |             |   \     /   |  |             |   \     /   | prio 2
 * |  EXPLICIT   |    \   /    |  |  EXPLICIT   |    \   /    |   .
 * |             |     \ /     |  |             |     \ /     |   .
 * |    RULES    |      X      |  |    RULES    |      X      |   .
 * |      .      |     / \     |  |      .      |     / \     |   .
 * |      .      |    /   \    |  |      .      |    /   \    |   .
 * |      .      |   /     \   |  |      .      |   /     \   |   .
 * |      .      |  /       \  |  |      .      |  /       \  |   .
 *
 *      ....           ....           ....           ....
 *
 * |      .      |  \       /  |  |      .      |  \       /  |   .
 * |      .      |   \     /   |  |      .      |   \     /   |   .
 * |             |    \   /    |  |             |    \   /    |
 * |  LOCAL_MAC  |     \ /     |  |    \   /    |     \ /     | last prio - 5
 * |   PROMISC   |      X      |  |     \ /     |      X      | last prio - 4
 * |   ALLMULTI  |     / \     |  |      X      |     / \     | last prio - 3
 * |  BROADCAST  |    /   \    |  |     / \     |    /   \    | last prio - 2
 * | BROADCASTV6 |   /     \   |  |    /   \    |   /     \   | last prio - 1
 * |     xx      |  /       \  |  |   ISOLATE   |  /       \  | last prio
 * +-------------+-------------+  +-------------+-------------+
 *
 * The implicit flow rules are stored in a list, the last two mandatorily
 * being the REMOTE_TX and ISOLATE rules, e.g.:
 *
 * LOCAL_MAC -> BROADCAST -> BROADCASTV6 -> REMOTE_TX -> ISOLATE -> NULL
 *
 * That enables tap_flow_isolate() to remove implicit rules by popping the
 * list head and removing it, as long as it applies on the remote netdevice.
 * The implicit rule for TX redirection is not removed, as isolate concerns
 * only incoming traffic.
 */

static struct remote_rule implicit_rte_flows[TAP_REMOTE_MAX_IDX] = {
	[TAP_REMOTE_LOCAL_MAC] = {
		.attr = {
			.group = MAX_GROUP,
			.priority = PRIORITY_MASK - TAP_REMOTE_LOCAL_MAC,
			.ingress = 1,
		},
		.items[0] = {
			.type = RTE_FLOW_ITEM_TYPE_ETH,
			.mask =  &(const struct rte_flow_item_eth){
				.dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
			},
		},
		.items[1] = {
			.type = RTE_FLOW_ITEM_TYPE_END,
		},
		.mirred = TCA_EGRESS_REDIR,
	},
	[TAP_REMOTE_BROADCAST] = {
		.attr = {
			.group = MAX_GROUP,
			.priority = PRIORITY_MASK - TAP_REMOTE_BROADCAST,
			.ingress = 1,
		},
		.items[0] = {
			.type = RTE_FLOW_ITEM_TYPE_ETH,
			.mask =  &(const struct rte_flow_item_eth){
				.dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
			},
			.spec = &(const struct rte_flow_item_eth){
				.dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
			},
		},
		.items[1] = {
			.type = RTE_FLOW_ITEM_TYPE_END,
		},
		.mirred = TCA_EGRESS_MIRROR,
	},
	[TAP_REMOTE_BROADCASTV6] = {
		.attr = {
			.group = MAX_GROUP,
			.priority = PRIORITY_MASK - TAP_REMOTE_BROADCASTV6,
			.ingress = 1,
		},
		.items[0] = {
			.type = RTE_FLOW_ITEM_TYPE_ETH,
			.mask =  &(const struct rte_flow_item_eth){
				.dst.addr_bytes = "\x33\x33\x00\x00\x00\x00",
			},
			.spec = &(const struct rte_flow_item_eth){
				.dst.addr_bytes = "\x33\x33\x00\x00\x00\x00",
			},
		},
		.items[1] = {
			.type = RTE_FLOW_ITEM_TYPE_END,
		},
		.mirred = TCA_EGRESS_MIRROR,
	},
	[TAP_REMOTE_PROMISC] = {
		.attr = {
			.group = MAX_GROUP,
			.priority = PRIORITY_MASK - TAP_REMOTE_PROMISC,
			.ingress = 1,
		},
		.items[0] = {
			.type = RTE_FLOW_ITEM_TYPE_VOID,
		},
		.items[1] = {
			.type = RTE_FLOW_ITEM_TYPE_END,
		},
		.mirred = TCA_EGRESS_MIRROR,
	},
	[TAP_REMOTE_ALLMULTI] = {
		.attr = {
			.group = MAX_GROUP,
			.priority = PRIORITY_MASK - TAP_REMOTE_ALLMULTI,
			.ingress = 1,
		},
		.items[0] = {
			.type = RTE_FLOW_ITEM_TYPE_ETH,
			.mask =  &(const struct rte_flow_item_eth){
				.dst.addr_bytes = "\x01\x00\x00\x00\x00\x00",
			},
			.spec = &(const struct rte_flow_item_eth){
				.dst.addr_bytes = "\x01\x00\x00\x00\x00\x00",
			},
		},
		.items[1] = {
			.type = RTE_FLOW_ITEM_TYPE_END,
		},
		.mirred = TCA_EGRESS_MIRROR,
	},
	[TAP_REMOTE_TX] = {
		.attr = {
			.group = 0,
			.priority = TAP_REMOTE_TX,
			.egress = 1,
		},
		.items[0] = {
			.type = RTE_FLOW_ITEM_TYPE_VOID,
		},
		.items[1] = {
			.type = RTE_FLOW_ITEM_TYPE_END,
		},
		.mirred = TCA_EGRESS_MIRROR,
	},
	[TAP_ISOLATE] = {
		.attr = {
			.group = MAX_GROUP,
			.priority = PRIORITY_MASK - TAP_ISOLATE,
			.ingress = 1,
		},
		.items[0] = {
			.type = RTE_FLOW_ITEM_TYPE_VOID,
		},
		.items[1] = {
			.type = RTE_FLOW_ITEM_TYPE_END,
		},
	},
};
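
/*
 * Note that TAP_ISOLATE is the only entry leaving .mirred at 0: its action
 * is not a mirred redirection but is chosen at runtime by
 * tap_flow_implicit_create() (DROP when isolation is on, PASSTHRU when off).
 */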

/**
 * Make as many checks as possible on an Ethernet item, and if a flow is
 * provided, fill it appropriately with Ethernet info.
 *
 * @param[in] item
 *   Item specification.
 * @param[in, out] data
 *   Additional data structure to tell next layers we've been here.
 *
 * @return
 *   0 if checks are alright, -1 otherwise.
 */
static int
tap_flow_create_eth(const struct rte_flow_item *item, void *data)
{
	struct convert_data *info = (struct convert_data *)data;
	const struct rte_flow_item_eth *spec = item->spec;
	const struct rte_flow_item_eth *mask = item->mask;
	struct rte_flow *flow = info->flow;
	struct nlmsg *msg;

	/* use default mask if none provided */
	if (!mask)
		mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_ETH].default_mask;
	/* TC does not support eth_type masking. Only accept exact matches. */
	if (mask->type && mask->type != 0xffff)
		return -1;
	if (!spec)
		return 0;
	/* store eth_type for consistency if ipv4/6 pattern item comes next */
	if (spec->type & mask->type)
		info->eth_type = spec->type;
	if (!flow)
		return 0;
	msg = &flow->msg;
	/* Check the mask, as for src below: a zero address in the spec is a
	 * valid match as long as the mask requests it.
	 */
	if (!is_zero_ether_addr(&mask->dst)) {
		tap_nlattr_add(&msg->nh, TCA_FLOWER_KEY_ETH_DST, ETHER_ADDR_LEN,
			   &spec->dst.addr_bytes);
		tap_nlattr_add(&msg->nh,
			   TCA_FLOWER_KEY_ETH_DST_MASK, ETHER_ADDR_LEN,
			   &mask->dst.addr_bytes);
	}
	if (!is_zero_ether_addr(&mask->src)) {
		tap_nlattr_add(&msg->nh, TCA_FLOWER_KEY_ETH_SRC, ETHER_ADDR_LEN,
			   &spec->src.addr_bytes);
		tap_nlattr_add(&msg->nh,
			   TCA_FLOWER_KEY_ETH_SRC_MASK, ETHER_ADDR_LEN,
			   &mask->src.addr_bytes);
	}
	return 0;
}

/**
 * Make as many checks as possible on a VLAN item, and if a flow is provided,
 * fill it appropriately with VLAN info.
 *
 * @param[in] item
 *   Item specification.
 * @param[in, out] data
 *   Additional data structure to tell next layers we've been here.
 *
 * @return
 *   0 if checks are alright, -1 otherwise.
 */
static int
tap_flow_create_vlan(const struct rte_flow_item *item, void *data)
{
	struct convert_data *info = (struct convert_data *)data;
	const struct rte_flow_item_vlan *spec = item->spec;
	const struct rte_flow_item_vlan *mask = item->mask;
	struct rte_flow *flow = info->flow;
	struct nlmsg *msg;

	/* use default mask if none provided */
	if (!mask)
		mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_VLAN].default_mask;
	/* TC does not support tpid masking. Only accept exact matches. */
	if (mask->tpid && mask->tpid != 0xffff)
		return -1;
	/* Double-tagging not supported. */
	if (spec && mask->tpid && spec->tpid != htons(ETH_P_8021Q))
		return -1;
	info->vlan = 1;
	if (!flow)
		return 0;
	msg = &flow->msg;
	msg->t.tcm_info = TC_H_MAKE(msg->t.tcm_info, htons(ETH_P_8021Q));
/* TCI layout: PCP (3 bits) | DEI (1 bit) | VID (12 bits) */
#define VLAN_PRIO(tci) ((tci) >> 13)
#define VLAN_ID(tci) ((tci) & 0xfff)
	if (!spec)
		return 0;
	if (spec->tci) {
		uint16_t tci = ntohs(spec->tci) & mask->tci;
		uint16_t prio = VLAN_PRIO(tci);
		/* VLAN IDs are 12-bit; a uint8_t would truncate IDs > 255 */
		uint16_t vid = VLAN_ID(tci);

		if (prio)
			tap_nlattr_add8(&msg->nh,
					TCA_FLOWER_KEY_VLAN_PRIO, prio);
		if (vid)
			tap_nlattr_add16(&msg->nh,
					 TCA_FLOWER_KEY_VLAN_ID, vid);
	}
	return 0;
}

/**
 * Make as many checks as possible on an IPv4 item, and if a flow is provided,
 * fill it appropriately with IPv4 info.
 *
 * @param[in] item
 *   Item specification.
 * @param[in, out] data
 *   Additional data structure to tell next layers we've been here.
 *
 * @return
 *   0 if checks are alright, -1 otherwise.
 */
static int
tap_flow_create_ipv4(const struct rte_flow_item *item, void *data)
{
	struct convert_data *info = (struct convert_data *)data;
	const struct rte_flow_item_ipv4 *spec = item->spec;
	const struct rte_flow_item_ipv4 *mask = item->mask;
	struct rte_flow *flow = info->flow;
	struct nlmsg *msg;

	/* use default mask if none provided */
	if (!mask)
		mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_IPV4].default_mask;
	/* check that previous eth type is compatible with ipv4 */
	if (info->eth_type && info->eth_type != htons(ETH_P_IP))
		return -1;
	/* store ip_proto for consistency if udp/tcp pattern item comes next */
	if (spec)
		info->ip_proto = spec->hdr.next_proto_id;
	if (!flow)
		return 0;
	msg = &flow->msg;
	if (!info->eth_type)
		info->eth_type = htons(ETH_P_IP);
	if (!spec)
		return 0;
	if (spec->hdr.dst_addr) {
		tap_nlattr_add32(&msg->nh, TCA_FLOWER_KEY_IPV4_DST,
			     spec->hdr.dst_addr);
		tap_nlattr_add32(&msg->nh, TCA_FLOWER_KEY_IPV4_DST_MASK,
			     mask->hdr.dst_addr);
	}
	if (spec->hdr.src_addr) {
		tap_nlattr_add32(&msg->nh, TCA_FLOWER_KEY_IPV4_SRC,
			     spec->hdr.src_addr);
		tap_nlattr_add32(&msg->nh, TCA_FLOWER_KEY_IPV4_SRC_MASK,
			     mask->hdr.src_addr);
	}
	if (spec->hdr.next_proto_id)
		tap_nlattr_add8(&msg->nh, TCA_FLOWER_KEY_IP_PROTO,
			    spec->hdr.next_proto_id);
	return 0;
}

/**
 * Make as many checks as possible on an IPv6 item, and if a flow is provided,
 * fill it appropriately with IPv6 info.
 *
 * @param[in] item
 *   Item specification.
 * @param[in, out] data
 *   Additional data structure to tell next layers we've been here.
 *
 * @return
 *   0 if checks are alright, -1 otherwise.
 */
static int
tap_flow_create_ipv6(const struct rte_flow_item *item, void *data)
{
	struct convert_data *info = (struct convert_data *)data;
	const struct rte_flow_item_ipv6 *spec = item->spec;
	const struct rte_flow_item_ipv6 *mask = item->mask;
	struct rte_flow *flow = info->flow;
	uint8_t empty_addr[16] = { 0 };
	struct nlmsg *msg;

	/* use default mask if none provided */
	if (!mask)
		mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_IPV6].default_mask;
	/* check that previous eth type is compatible with ipv6 */
	if (info->eth_type && info->eth_type != htons(ETH_P_IPV6))
		return -1;
	/* store ip_proto for consistency if udp/tcp pattern item comes next */
	if (spec)
		info->ip_proto = spec->hdr.proto;
	if (!flow)
		return 0;
	msg = &flow->msg;
	if (!info->eth_type)
		info->eth_type = htons(ETH_P_IPV6);
	if (!spec)
		return 0;
	if (memcmp(spec->hdr.dst_addr, empty_addr, 16)) {
		tap_nlattr_add(&msg->nh, TCA_FLOWER_KEY_IPV6_DST,
			   sizeof(spec->hdr.dst_addr), &spec->hdr.dst_addr);
		tap_nlattr_add(&msg->nh, TCA_FLOWER_KEY_IPV6_DST_MASK,
			   sizeof(mask->hdr.dst_addr), &mask->hdr.dst_addr);
	}
	if (memcmp(spec->hdr.src_addr, empty_addr, 16)) {
		tap_nlattr_add(&msg->nh, TCA_FLOWER_KEY_IPV6_SRC,
			   sizeof(spec->hdr.src_addr), &spec->hdr.src_addr);
		tap_nlattr_add(&msg->nh, TCA_FLOWER_KEY_IPV6_SRC_MASK,
			   sizeof(mask->hdr.src_addr), &mask->hdr.src_addr);
	}
	if (spec->hdr.proto)
		tap_nlattr_add8(&msg->nh,
				TCA_FLOWER_KEY_IP_PROTO, spec->hdr.proto);
	return 0;
}

/**
 * Make as many checks as possible on a UDP item, and if a flow is provided,
 * fill it appropriately with UDP info.
 *
 * @param[in] item
 *   Item specification.
 * @param[in, out] data
 *   Additional data structure to tell next layers we've been here.
 *
 * @return
 *   0 if checks are alright, -1 otherwise.
 */
static int
tap_flow_create_udp(const struct rte_flow_item *item, void *data)
{
	struct convert_data *info = (struct convert_data *)data;
	const struct rte_flow_item_udp *spec = item->spec;
	const struct rte_flow_item_udp *mask = item->mask;
	struct rte_flow *flow = info->flow;
	struct nlmsg *msg;

	/* use default mask if none provided */
	if (!mask)
		mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_UDP].default_mask;
	/* check that previous ip_proto is compatible with udp */
	if (info->ip_proto && info->ip_proto != IPPROTO_UDP)
		return -1;
	/* TC does not support UDP port masking. Only accept exact matches. */
	if ((mask->hdr.src_port && mask->hdr.src_port != 0xffff) ||
	    (mask->hdr.dst_port && mask->hdr.dst_port != 0xffff))
		return -1;
	if (!flow)
		return 0;
	msg = &flow->msg;
	tap_nlattr_add8(&msg->nh, TCA_FLOWER_KEY_IP_PROTO, IPPROTO_UDP);
	if (!spec)
		return 0;
	if (spec->hdr.dst_port & mask->hdr.dst_port)
		tap_nlattr_add16(&msg->nh, TCA_FLOWER_KEY_UDP_DST,
			     spec->hdr.dst_port);
	if (spec->hdr.src_port & mask->hdr.src_port)
		tap_nlattr_add16(&msg->nh, TCA_FLOWER_KEY_UDP_SRC,
			     spec->hdr.src_port);
	return 0;
}

/**
 * Make as many checks as possible on a TCP item, and if a flow is provided,
 * fill it appropriately with TCP info.
 *
 * @param[in] item
 *   Item specification.
 * @param[in, out] data
 *   Additional data structure to tell next layers we've been here.
 *
 * @return
 *   0 if checks are alright, -1 otherwise.
 */
static int
tap_flow_create_tcp(const struct rte_flow_item *item, void *data)
{
	struct convert_data *info = (struct convert_data *)data;
	const struct rte_flow_item_tcp *spec = item->spec;
	const struct rte_flow_item_tcp *mask = item->mask;
	struct rte_flow *flow = info->flow;
	struct nlmsg *msg;

	/* use default mask if none provided */
	if (!mask)
		mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_TCP].default_mask;
	/* check that previous ip_proto is compatible with tcp */
	if (info->ip_proto && info->ip_proto != IPPROTO_TCP)
		return -1;
	/* TC does not support TCP port masking. Only accept exact matches. */
	if ((mask->hdr.src_port && mask->hdr.src_port != 0xffff) ||
	    (mask->hdr.dst_port && mask->hdr.dst_port != 0xffff))
		return -1;
	if (!flow)
		return 0;
	msg = &flow->msg;
	tap_nlattr_add8(&msg->nh, TCA_FLOWER_KEY_IP_PROTO, IPPROTO_TCP);
	if (!spec)
		return 0;
	if (spec->hdr.dst_port & mask->hdr.dst_port)
		tap_nlattr_add16(&msg->nh, TCA_FLOWER_KEY_TCP_DST,
			     spec->hdr.dst_port);
	if (spec->hdr.src_port & mask->hdr.src_port)
		tap_nlattr_add16(&msg->nh, TCA_FLOWER_KEY_TCP_SRC,
			     spec->hdr.src_port);
	return 0;
}

/**
 * Check support for a given item.
 *
 * @param[in] item
 *   Item specification.
 * @param size
 *   Bit-mask size in bytes.
 * @param[in] supported_mask
 *   Bit-mask covering supported fields to compare with spec, last and mask in
 *   \item.
 * @param[in] default_mask
 *   Default bit-mask, used if none is provided in \item.
 *
 * @return
 *   0 on success.
 */
static int
tap_flow_item_validate(const struct rte_flow_item *item,
		       unsigned int size,
		       const uint8_t *supported_mask,
		       const uint8_t *default_mask)
{
	int ret = 0;

	/* An empty layer is allowed, as long as spec, last and mask are all NULL */
	if (!item->spec && (item->mask || item->last))
		return -1;
	/* Is the item spec compatible with what the NIC supports? */
	if (item->spec && !item->mask) {
		unsigned int i;
		const uint8_t *spec = item->spec;

		for (i = 0; i < size; ++i)
			if ((spec[i] | supported_mask[i]) != supported_mask[i])
				return -1;
		/* Is the default mask compatible with what the NIC supports? */
		for (i = 0; i < size; i++)
			if ((default_mask[i] | supported_mask[i]) !=
			    supported_mask[i])
				return -1;
	}
	/* Is the item last compatible with what the NIC supports? */
	if (item->last && !item->mask) {
		unsigned int i;
		const uint8_t *spec = item->last;

		for (i = 0; i < size; ++i)
			if ((spec[i] | supported_mask[i]) != supported_mask[i])
				return -1;
	}
	/* Is the item mask compatible with what the NIC supports? */
	if (item->mask) {
		unsigned int i;
		const uint8_t *spec = item->mask;

		for (i = 0; i < size; ++i)
			if ((spec[i] | supported_mask[i]) != supported_mask[i])
				return -1;
	}
	/**
	 * Once masked, are item spec and item last equal?
	 * TC does not support ranges, so anything else is invalid.
	 */
	if (item->spec && item->last) {
		uint8_t spec[size];
		uint8_t last[size];
		const uint8_t *apply = default_mask;
		unsigned int i;

		if (item->mask)
			apply = item->mask;
		for (i = 0; i < size; ++i) {
			spec[i] = ((const uint8_t *)item->spec)[i] & apply[i];
			last[i] = ((const uint8_t *)item->last)[i] & apply[i];
		}
		ret = memcmp(spec, last, size);
	}
	return ret;
}
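
/*
 * Illustrative call, mirroring how priv_flow_process() uses this function
 * for each pattern item (sketch):
 *
 *   const struct tap_flow_items *graph_item =
 *           &tap_flow_items[RTE_FLOW_ITEM_TYPE_ETH];
 *
 *   tap_flow_item_validate(item, graph_item->mask_sz,
 *                          (const uint8_t *)graph_item->mask,
 *                          (const uint8_t *)graph_item->default_mask);
 */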

/**
 * Transform a DROP/PASSTHRU action item in the provided flow for TC.
 *
 * @param[in, out] flow
 *   Flow to be filled.
 * @param[in] action
 *   Appropriate action to be set in the TCA_GACT_PARMS structure.
 *
 * @return
 *   0 if checks are alright, -1 otherwise.
 */
static int
add_action_gact(struct rte_flow *flow, int action)
{
	struct nlmsg *msg = &flow->msg;
	size_t act_index = 1;
	struct tc_gact p = {
		.action = action
	};

	if (tap_nlattr_nested_start(msg, TCA_FLOWER_ACT) < 0)
		return -1;
	if (tap_nlattr_nested_start(msg, act_index++) < 0)
		return -1;
	tap_nlattr_add(&msg->nh, TCA_ACT_KIND, sizeof("gact"), "gact");
	if (tap_nlattr_nested_start(msg, TCA_ACT_OPTIONS) < 0)
		return -1;
	tap_nlattr_add(&msg->nh, TCA_GACT_PARMS, sizeof(p), &p);
	tap_nlattr_nested_finish(msg); /* nested TCA_ACT_OPTIONS */
	tap_nlattr_nested_finish(msg); /* nested act_index */
	tap_nlattr_nested_finish(msg); /* nested TCA_FLOWER_ACT */
	return 0;
}

/**
 * Transform a MIRRED action item in the provided flow for TC.
 *
 * @param[in, out] flow
 *   Flow to be filled.
 * @param[in] ifindex
 *   Netdevice ifindex, where to mirror/redirect packet to.
 * @param[in] action_type
 *   Either TCA_EGRESS_REDIR for redirection or TCA_EGRESS_MIRROR for mirroring.
 *
 * @return
 *   0 if checks are alright, -1 otherwise.
 */
static int
add_action_mirred(struct rte_flow *flow, uint16_t ifindex, uint16_t action_type)
{
	struct nlmsg *msg = &flow->msg;
	size_t act_index = 1;
	struct tc_mirred p = {
		.eaction = action_type,
		.ifindex = ifindex,
	};

	if (tap_nlattr_nested_start(msg, TCA_FLOWER_ACT) < 0)
		return -1;
	if (tap_nlattr_nested_start(msg, act_index++) < 0)
		return -1;
	tap_nlattr_add(&msg->nh, TCA_ACT_KIND, sizeof("mirred"), "mirred");
	if (tap_nlattr_nested_start(msg, TCA_ACT_OPTIONS) < 0)
		return -1;
	if (action_type == TCA_EGRESS_MIRROR)
		p.action = TC_ACT_PIPE;
	else /* REDIRECT */
		p.action = TC_ACT_STOLEN;
	tap_nlattr_add(&msg->nh, TCA_MIRRED_PARMS, sizeof(p), &p);
	tap_nlattr_nested_finish(msg); /* nested TCA_ACT_OPTIONS */
	tap_nlattr_nested_finish(msg); /* nested act_index */
	tap_nlattr_nested_finish(msg); /* nested TCA_FLOWER_ACT */
	return 0;
}

/**
 * Transform a QUEUE action item in the provided flow for TC.
 *
 * @param[in, out] flow
 *   Flow to be filled.
 * @param[in] queue
 *   Queue id to use.
 *
 * @return
 *   0 if checks are alright, -1 otherwise.
 */
static int
add_action_skbedit(struct rte_flow *flow, uint16_t queue)
{
	struct nlmsg *msg = &flow->msg;
	size_t act_index = 1;
	struct tc_skbedit p = {
		.action = TC_ACT_PIPE
	};

	if (tap_nlattr_nested_start(msg, TCA_FLOWER_ACT) < 0)
		return -1;
	if (tap_nlattr_nested_start(msg, act_index++) < 0)
		return -1;
	tap_nlattr_add(&msg->nh, TCA_ACT_KIND, sizeof("skbedit"), "skbedit");
	if (tap_nlattr_nested_start(msg, TCA_ACT_OPTIONS) < 0)
		return -1;
	tap_nlattr_add(&msg->nh, TCA_SKBEDIT_PARMS, sizeof(p), &p);
	tap_nlattr_add16(&msg->nh, TCA_SKBEDIT_QUEUE_MAPPING, queue);
	tap_nlattr_nested_finish(msg); /* nested TCA_ACT_OPTIONS */
	tap_nlattr_nested_finish(msg); /* nested act_index */
	tap_nlattr_nested_finish(msg); /* nested TCA_FLOWER_ACT */
	return 0;
}
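
/*
 * The three helpers above emit the same nested netlink attribute layout;
 * only the action kind and its parameters differ:
 *
 *   TCA_FLOWER_ACT
 *     act_index (1)
 *       TCA_ACT_KIND     "gact" / "mirred" / "skbedit"
 *       TCA_ACT_OPTIONS
 *         TCA_*_PARMS    struct tc_gact / tc_mirred / tc_skbedit
 */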

/**
 * Validate a flow supported by TC.
 * If flow param is not NULL, then also fill the netlink message inside.
 *
 * @param pmd
 *   Pointer to private structure.
 * @param[in] attr
 *   Flow rule attributes.
 * @param[in] pattern
 *   Pattern specification (list terminated by the END pattern item).
 * @param[in] actions
 *   Associated actions (list terminated by the END action).
 * @param[out] error
 *   Perform verbose error reporting if not NULL.
 * @param[in, out] flow
 *   Flow structure to update.
 * @param[in] mirred
 *   If set to TCA_EGRESS_REDIR, provided actions will be replaced with a
 *   redirection to the tap netdevice, and the TC rule will be configured
 *   on the remote netdevice in pmd.
 *   If set to TCA_EGRESS_MIRROR, provided actions will be replaced with a
 *   mirroring to the tap netdevice, and the TC rule will be configured
 *   on the remote netdevice in pmd. Matching packets will thus be duplicated.
 *   If set to 0, the standard behavior is to be used: set correct actions for
 *   the TC rule, and apply it on the tap netdevice.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
static int
priv_flow_process(struct pmd_internals *pmd,
		  const struct rte_flow_attr *attr,
		  const struct rte_flow_item items[],
		  const struct rte_flow_action actions[],
		  struct rte_flow_error *error,
		  struct rte_flow *flow,
		  int mirred)
{
	const struct tap_flow_items *cur_item = tap_flow_items;
	struct convert_data data = {
		.eth_type = 0,
		.ip_proto = 0,
		.flow = flow,
	};
	int action = 0; /* Only one action authorized for now */

	if (attr->group > MAX_GROUP) {
		rte_flow_error_set(
			error, EINVAL, RTE_FLOW_ERROR_TYPE_ATTR_GROUP,
			NULL, "group value too big: cannot exceed 15");
		return -rte_errno;
	}
	if (attr->priority > MAX_PRIORITY) {
		rte_flow_error_set(
			error, EINVAL, RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
			NULL, "priority value too big");
		return -rte_errno;
	} else if (flow) {
		/*
		 * Pack the flow group and its offset priority into the upper
		 * 16 bits of tcm_info (the TC filter preference): group in
		 * the high bits, priority in the low ones.
		 */
		uint16_t group = attr->group << GROUP_SHIFT;
		uint16_t prio = group | (attr->priority + PRIORITY_OFFSET);
		flow->msg.t.tcm_info = TC_H_MAKE(prio << 16,
						 flow->msg.t.tcm_info);
	}
	if (flow) {
		if (mirred) {
			/*
			 * If attr->ingress, the rule applies on remote ingress
			 * to match incoming packets.
			 * If attr->egress, the rule applies on tap ingress (as
			 * seen from the kernel) to deal with packets going out
			 * from the DPDK app.
			 */
			flow->msg.t.tcm_parent = TC_H_MAKE(TC_H_INGRESS, 0);
		} else {
			/* Standard rule on tap egress (kernel standpoint). */
			flow->msg.t.tcm_parent =
				TC_H_MAKE(MULTIQ_MAJOR_HANDLE, 0);
		}
		/* use flower filter type */
		tap_nlattr_add(&flow->msg.nh, TCA_KIND, sizeof("flower"), "flower");
		if (tap_nlattr_nested_start(&flow->msg, TCA_OPTIONS) < 0)
			goto exit_item_not_supported;
	}
	for (; items->type != RTE_FLOW_ITEM_TYPE_END; ++items) {
		const struct tap_flow_items *token = NULL;
		unsigned int i;
		int err = 0;

		if (items->type == RTE_FLOW_ITEM_TYPE_VOID)
			continue;
		for (i = 0;
		     cur_item->items &&
		     cur_item->items[i] != RTE_FLOW_ITEM_TYPE_END;
		     ++i) {
			if (cur_item->items[i] == items->type) {
				token = &tap_flow_items[items->type];
				break;
			}
		}
		if (!token)
			goto exit_item_not_supported;
		cur_item = token;
		err = tap_flow_item_validate(
			items, cur_item->mask_sz,
			(const uint8_t *)cur_item->mask,
			(const uint8_t *)cur_item->default_mask);
		if (err)
			goto exit_item_not_supported;
		if (flow && cur_item->convert) {
			err = cur_item->convert(items, &data);
			if (err)
				goto exit_item_not_supported;
		}
	}
	if (flow) {
		if (data.vlan) {
			tap_nlattr_add16(&flow->msg.nh, TCA_FLOWER_KEY_ETH_TYPE,
				     htons(ETH_P_8021Q));
			tap_nlattr_add16(&flow->msg.nh,
				     TCA_FLOWER_KEY_VLAN_ETH_TYPE,
				     data.eth_type ?
				     data.eth_type : htons(ETH_P_ALL));
		} else if (data.eth_type) {
			tap_nlattr_add16(&flow->msg.nh, TCA_FLOWER_KEY_ETH_TYPE,
				     data.eth_type);
		}
	}
	if (mirred && flow) {
		uint16_t if_index = pmd->if_index;

		/*
		 * If attr->egress && mirred, then this is a special
		 * case where the rule must be applied on the tap, to
		 * redirect packets coming from the DPDK App, out
		 * through the remote netdevice.
		 */
		if (attr->egress)
			if_index = pmd->remote_if_index;
		if (add_action_mirred(flow, if_index, mirred) < 0)
			goto exit_action_not_supported;
		else
			goto end;
	}
	for (; actions->type != RTE_FLOW_ACTION_TYPE_END; ++actions) {
		int err = 0;

		if (actions->type == RTE_FLOW_ACTION_TYPE_VOID) {
			continue;
		} else if (actions->type == RTE_FLOW_ACTION_TYPE_DROP) {
			if (action)
				goto exit_action_not_supported;
			action = 1;
			if (flow)
				err = add_action_gact(flow, TC_ACT_SHOT);
		} else if (actions->type == RTE_FLOW_ACTION_TYPE_PASSTHRU) {
			if (action)
				goto exit_action_not_supported;
			action = 1;
			if (flow)
				err = add_action_gact(flow, TC_ACT_UNSPEC);
		} else if (actions->type == RTE_FLOW_ACTION_TYPE_QUEUE) {
			const struct rte_flow_action_queue *queue =
				(const struct rte_flow_action_queue *)
				actions->conf;

			if (action)
				goto exit_action_not_supported;
			action = 1;
			if (!queue ||
			    (queue->index > pmd->dev->data->nb_rx_queues - 1))
				goto exit_action_not_supported;
			if (flow)
				err = add_action_skbedit(flow, queue->index);
		} else if (actions->type == RTE_FLOW_ACTION_TYPE_RSS) {
			/* Fake RSS support. */
			const struct rte_flow_action_rss *rss =
				(const struct rte_flow_action_rss *)
				actions->conf;

			if (action)
				goto exit_action_not_supported;
			action = 1;
			if (!rss || rss->num < 1 ||
			    (rss->queue[0] > pmd->dev->data->nb_rx_queues - 1))
				goto exit_action_not_supported;
			if (flow)
				err = add_action_skbedit(flow, rss->queue[0]);
		} else {
			goto exit_action_not_supported;
		}
		if (err)
			goto exit_action_not_supported;
	}
end:
	if (flow)
		tap_nlattr_nested_finish(&flow->msg); /* nested TCA_OPTIONS */
	return 0;
exit_item_not_supported:
	rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM,
			   items, "item not supported");
	return -rte_errno;
exit_action_not_supported:
	rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION,
			   actions, "action not supported");
	return -rte_errno;
}

/**
 * Validate a flow.
 *
 * @see rte_flow_validate()
 * @see rte_flow_ops
 */
static int
tap_flow_validate(struct rte_eth_dev *dev,
		  const struct rte_flow_attr *attr,
		  const struct rte_flow_item items[],
		  const struct rte_flow_action actions[],
		  struct rte_flow_error *error)
{
	struct pmd_internals *pmd = dev->data->dev_private;

	return priv_flow_process(pmd, attr, items, actions, error, NULL, 0);
}

/**
 * Set a unique handle in a flow.
 *
 * The kernel supports TC rules with equal priority, as long as they use the
 * same matching fields (e.g.: dst mac and ipv4) with different values (and
 * full mask to ensure no collision is possible).
 * In those rules, the handle (uint32_t) is the part that identifies each
 * rule specifically.
 *
 * On 32-bit architectures, the handle can simply be the flow's pointer address.
 * On 64-bit architectures, we rely on jhash(flow) to find a (sufficiently)
 * unique handle.
 *
 * @param[in, out] flow
 *   The flow that needs its handle set.
 */
static void
tap_flow_set_handle(struct rte_flow *flow)
{
	uint32_t handle = 0;

	if (sizeof(flow) > 4)
		handle = rte_jhash(&flow, sizeof(flow), 1);
	else
		handle = (uintptr_t)flow;
	/* must be at least 1 to avoid letting the kernel choose one for us */
	if (!handle)
		handle = 1;
	flow->msg.t.tcm_handle = handle;
}

/**
 * Create a flow.
 *
 * @see rte_flow_create()
 * @see rte_flow_ops
 */
static struct rte_flow *
tap_flow_create(struct rte_eth_dev *dev,
		const struct rte_flow_attr *attr,
		const struct rte_flow_item items[],
		const struct rte_flow_action actions[],
		struct rte_flow_error *error)
{
	struct pmd_internals *pmd = dev->data->dev_private;
	struct rte_flow *remote_flow = NULL;
	struct rte_flow *flow = NULL;
	struct nlmsg *msg = NULL;
	int err;

	if (!pmd->if_index) {
		rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE,
				   NULL,
				   "can't create rule, ifindex not found");
		goto fail;
	}
	/*
	 * No rules configured through standard rte_flow should be set on the
	 * priorities used by implicit rules.
	 */
	if ((attr->group == MAX_GROUP) &&
	    attr->priority > (MAX_PRIORITY - TAP_REMOTE_MAX_IDX)) {
		rte_flow_error_set(
			error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
			NULL, "priority value too big");
		goto fail;
	}
	flow = rte_malloc(__func__, sizeof(struct rte_flow), 0);
	if (!flow) {
		rte_flow_error_set(error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
				   NULL, "cannot allocate memory for rte_flow");
		goto fail;
	}
	msg = &flow->msg;
	tc_init_msg(msg, pmd->if_index, RTM_NEWTFILTER,
		    NLM_F_REQUEST | NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE);
	msg->t.tcm_info = TC_H_MAKE(0, htons(ETH_P_ALL));
	tap_flow_set_handle(flow);
	if (priv_flow_process(pmd, attr, items, actions, error, flow, 0))
		goto fail;
	err = tap_nl_send(pmd->nlsk_fd, &msg->nh);
	if (err < 0) {
		rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE,
				   NULL, "couldn't send request to kernel");
		goto fail;
	}
	err = tap_nl_recv_ack(pmd->nlsk_fd);
	if (err < 0) {
		RTE_LOG(ERR, PMD,
			"Kernel refused TC filter rule creation (%d): %s\n",
			errno, strerror(errno));
		rte_flow_error_set(error, EEXIST, RTE_FLOW_ERROR_TYPE_HANDLE,
				   NULL,
				   "overlapping rules or Kernel too old for flower support");
		goto fail;
	}
	LIST_INSERT_HEAD(&pmd->flows, flow, next);
	/**
	 * If a remote device is configured, a TC rule with identical items for
	 * matching must be set on that device, with a single action: redirect
	 * to the local pmd->if_index.
	 */
	if (pmd->remote_if_index) {
		remote_flow = rte_malloc(__func__, sizeof(struct rte_flow), 0);
		if (!remote_flow) {
			rte_flow_error_set(
				error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE, NULL,
				"cannot allocate memory for rte_flow");
			goto fail;
		}
		msg = &remote_flow->msg;
		/* set the rule if_index for the remote netdevice */
		tc_init_msg(
			msg, pmd->remote_if_index, RTM_NEWTFILTER,
			NLM_F_REQUEST | NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE);
		msg->t.tcm_info = TC_H_MAKE(0, htons(ETH_P_ALL));
		tap_flow_set_handle(remote_flow);
		if (priv_flow_process(pmd, attr, items, NULL,
				      error, remote_flow, TCA_EGRESS_REDIR)) {
			rte_flow_error_set(
				error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
				NULL, "rte flow rule validation failed");
			goto fail;
		}
		err = tap_nl_send(pmd->nlsk_fd, &msg->nh);
		if (err < 0) {
			rte_flow_error_set(
				error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
				NULL, "Failure sending nl request");
			goto fail;
		}
		err = tap_nl_recv_ack(pmd->nlsk_fd);
		if (err < 0) {
			RTE_LOG(ERR, PMD,
				"Kernel refused TC filter rule creation (%d): %s\n",
				errno, strerror(errno));
			rte_flow_error_set(
				error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
				NULL,
				"overlapping rules or Kernel too old for flower support");
			goto fail;
		}
		flow->remote_flow = remote_flow;
	}
	return flow;
fail:
	if (remote_flow)
		rte_free(remote_flow);
	if (flow)
		rte_free(flow);
	return NULL;
}

/**
 * Destroy a flow using a pointer to pmd_internals.
 *
 * @param[in, out] pmd
 *   Pointer to private structure.
 * @param[in] flow
 *   Pointer to the flow to destroy.
 * @param[in, out] error
 *   Pointer to the flow error handler.
 *
 * @return 0 if the flow could be destroyed, -1 otherwise.
 */
static int
tap_flow_destroy_pmd(struct pmd_internals *pmd,
		     struct rte_flow *flow,
		     struct rte_flow_error *error)
{
	struct rte_flow *remote_flow = flow->remote_flow;
	int ret = 0;

	LIST_REMOVE(flow, next);
	flow->msg.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
	flow->msg.nh.nlmsg_type = RTM_DELTFILTER;

	ret = tap_nl_send(pmd->nlsk_fd, &flow->msg.nh);
	if (ret < 0) {
		rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE,
				   NULL, "couldn't send request to kernel");
		goto end;
	}
	ret = tap_nl_recv_ack(pmd->nlsk_fd);
	/* If errno is ENOENT, the rule is already no longer in the kernel. */
	if (ret < 0 && errno == ENOENT)
		ret = 0;
	if (ret < 0) {
		RTE_LOG(ERR, PMD,
			"Kernel refused TC filter rule deletion (%d): %s\n",
			errno, strerror(errno));
		rte_flow_error_set(
			error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE, NULL,
			"couldn't receive kernel ack to our request");
		goto end;
	}
	if (remote_flow) {
		remote_flow->msg.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
		remote_flow->msg.nh.nlmsg_type = RTM_DELTFILTER;

		ret = tap_nl_send(pmd->nlsk_fd, &remote_flow->msg.nh);
		if (ret < 0) {
			rte_flow_error_set(
				error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
				NULL, "Failure sending nl request");
			goto end;
		}
		ret = tap_nl_recv_ack(pmd->nlsk_fd);
		if (ret < 0 && errno == ENOENT)
			ret = 0;
		if (ret < 0) {
			RTE_LOG(ERR, PMD,
				"Kernel refused TC filter rule deletion (%d): %s\n",
				errno, strerror(errno));
			rte_flow_error_set(
				error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
				NULL, "Failure trying to receive nl ack");
			goto end;
		}
	}
end:
	if (remote_flow)
		rte_free(remote_flow);
	rte_free(flow);
	return ret;
}

/**
 * Destroy a flow.
 *
 * @see rte_flow_destroy()
 * @see rte_flow_ops
 */
static int
tap_flow_destroy(struct rte_eth_dev *dev,
		 struct rte_flow *flow,
		 struct rte_flow_error *error)
{
	struct pmd_internals *pmd = dev->data->dev_private;

	return tap_flow_destroy_pmd(pmd, flow, error);
}

/**
 * Enable/disable flow isolation.
 *
 * @see rte_flow_isolate()
 * @see rte_flow_ops
 */
static int
tap_flow_isolate(struct rte_eth_dev *dev,
		 int set,
		 struct rte_flow_error *error __rte_unused)
{
	struct pmd_internals *pmd = dev->data->dev_private;

	if (set)
		pmd->flow_isolate = 1;
	else
		pmd->flow_isolate = 0;
	/*
	 * If the netdevice is there, set up appropriate flow rules immediately.
	 * Otherwise it will be set when bringing up the netdevice (tun_alloc).
	 */
	if (!pmd->rxq[0].fd)
		return 0;
	if (set) {
		struct rte_flow *flow;

		while (1) {
			flow = LIST_FIRST(&pmd->implicit_flows);
			if (!flow)
				break;
			/*
			 * Remove all implicit rules on the remote.
			 * Keep the local rule to redirect packets on TX.
			 * Keep also the last implicit local rule: ISOLATE.
			 */
			if (flow->msg.t.tcm_ifindex == pmd->if_index)
				break;
			if (tap_flow_destroy_pmd(pmd, flow, NULL) < 0)
				goto error;
		}
		/* Switch the TC rule according to pmd->flow_isolate */
		if (tap_flow_implicit_create(pmd, TAP_ISOLATE) == -1)
			goto error;
	} else {
		/* Switch the TC rule according to pmd->flow_isolate */
		if (tap_flow_implicit_create(pmd, TAP_ISOLATE) == -1)
			goto error;
		if (!pmd->remote_if_index)
			return 0;
		if (tap_flow_implicit_create(pmd, TAP_REMOTE_TX) < 0)
			goto error;
		if (tap_flow_implicit_create(pmd, TAP_REMOTE_LOCAL_MAC) < 0)
			goto error;
		if (tap_flow_implicit_create(pmd, TAP_REMOTE_BROADCAST) < 0)
			goto error;
		if (tap_flow_implicit_create(pmd, TAP_REMOTE_BROADCASTV6) < 0)
			goto error;
		if (dev->data->promiscuous &&
		    tap_flow_implicit_create(pmd, TAP_REMOTE_PROMISC) < 0)
			goto error;
		if (dev->data->all_multicast &&
		    tap_flow_implicit_create(pmd, TAP_REMOTE_ALLMULTI) < 0)
			goto error;
	}
	return 0;
error:
	pmd->flow_isolate = 0;
	return rte_flow_error_set(
		error, ENOTSUP, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
		"TC rule creation failed");
}

/**
 * Destroy all flows.
 *
 * @see rte_flow_flush()
 * @see rte_flow_ops
 */
int
tap_flow_flush(struct rte_eth_dev *dev, struct rte_flow_error *error)
{
	struct pmd_internals *pmd = dev->data->dev_private;
	struct rte_flow *flow;

	while (!LIST_EMPTY(&pmd->flows)) {
		flow = LIST_FIRST(&pmd->flows);
		if (tap_flow_destroy(dev, flow, error) < 0)
			return -1;
	}
	return 0;
}

/**
 * Add an implicit flow rule on the remote device to make sure traffic gets to
 * the tap netdevice from there.
 *
 * @param pmd
 *   Pointer to private structure.
 * @param[in] idx
 *   The idx in the implicit_rte_flows array specifying which rule to apply.
 *
 * @return -1 if the rule couldn't be applied, 0 otherwise.
 */
int tap_flow_implicit_create(struct pmd_internals *pmd,
			     enum implicit_rule_index idx)
{
	uint16_t flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE;
	struct rte_flow_action *actions = implicit_rte_flows[idx].actions;
	struct rte_flow_action isolate_actions[2] = {
		[1] = {
			.type = RTE_FLOW_ACTION_TYPE_END,
		},
	};
	struct rte_flow_item *items = implicit_rte_flows[idx].items;
	struct rte_flow_attr *attr = &implicit_rte_flows[idx].attr;
	struct rte_flow_item_eth eth_local = { .type = 0 };
	uint16_t if_index = pmd->remote_if_index;
	struct rte_flow *remote_flow = NULL;
	struct nlmsg *msg = NULL;
	int err = 0;
	struct rte_flow_item items_local[2] = {
		[0] = {
			.type = items[0].type,
			.spec = &eth_local,
			.mask = items[0].mask,
		},
		[1] = {
			.type = items[1].type,
		}
	};

	remote_flow = rte_malloc(__func__, sizeof(struct rte_flow), 0);
	if (!remote_flow) {
		RTE_LOG(ERR, PMD, "Cannot allocate memory for rte_flow\n");
		goto fail;
	}
	msg = &remote_flow->msg;
	if (idx == TAP_REMOTE_TX) {
		if_index = pmd->if_index;
	} else if (idx == TAP_ISOLATE) {
		if_index = pmd->if_index;
		/* Don't be exclusive for this rule, it can be changed later. */
		flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_CREATE;
		isolate_actions[0].type = pmd->flow_isolate ?
			RTE_FLOW_ACTION_TYPE_DROP :
			RTE_FLOW_ACTION_TYPE_PASSTHRU;
		actions = isolate_actions;
	} else if (idx == TAP_REMOTE_LOCAL_MAC) {
		/*
		 * eth addr couldn't be set in implicit_rte_flows[] as it is not
		 * known at compile time.
		 */
		memcpy(&eth_local.dst, &pmd->eth_addr, sizeof(pmd->eth_addr));
		items = items_local;
	}
	tc_init_msg(msg, if_index, RTM_NEWTFILTER, flags);
	msg->t.tcm_info = TC_H_MAKE(0, htons(ETH_P_ALL));
	/*
	 * The ISOLATE rule is always present and must have a static handle, as
	 * the action is changed depending on whether the feature is enabled
	 * (DROP) or disabled (PASSTHRU).
	 */
	if (idx == TAP_ISOLATE)
		remote_flow->msg.t.tcm_handle = ISOLATE_HANDLE;
	else
		tap_flow_set_handle(remote_flow);
	if (priv_flow_process(pmd, attr, items, actions, NULL,
			      remote_flow, implicit_rte_flows[idx].mirred)) {
		RTE_LOG(ERR, PMD, "rte flow rule validation failed\n");
		goto fail;
	}
	err = tap_nl_send(pmd->nlsk_fd, &msg->nh);
	if (err < 0) {
		RTE_LOG(ERR, PMD, "Failure sending nl request\n");
		goto fail;
	}
	err = tap_nl_recv_ack(pmd->nlsk_fd);
	if (err < 0) {
		RTE_LOG(ERR, PMD,
			"Kernel refused TC filter rule creation (%d): %s\n",
			errno, strerror(errno));
		goto fail;
	}
	LIST_INSERT_HEAD(&pmd->implicit_flows, remote_flow, next);
	return 0;
fail:
	if (remote_flow)
		rte_free(remote_flow);
	return -1;
}

/**
 * Remove specific implicit flow rule on the remote device.
 *
 * @param[in, out] pmd
 *   Pointer to private structure.
 * @param[in] idx
 *   The idx in the implicit_rte_flows array specifying which rule to remove.
 *
 * @return -1 if the rule couldn't be removed, 0 otherwise.
 */
int tap_flow_implicit_destroy(struct pmd_internals *pmd,
			      enum implicit_rule_index idx)
{
	struct rte_flow *remote_flow;
	int cur_prio = -1;
	int idx_prio = implicit_rte_flows[idx].attr.priority + PRIORITY_OFFSET;

	for (remote_flow = LIST_FIRST(&pmd->implicit_flows);
	     remote_flow;
	     remote_flow = LIST_NEXT(remote_flow, next)) {
		cur_prio = (remote_flow->msg.t.tcm_info >> 16) & PRIORITY_MASK;
		if (cur_prio != idx_prio)
			continue;
		return tap_flow_destroy_pmd(pmd, remote_flow, NULL);
	}
	return 0;
}

/**
 * Destroy all implicit flows.
 *
 * @see rte_flow_flush()
 */
int
tap_flow_implicit_flush(struct pmd_internals *pmd, struct rte_flow_error *error)
{
	struct rte_flow *remote_flow;

	while (!LIST_EMPTY(&pmd->implicit_flows)) {
		remote_flow = LIST_FIRST(&pmd->implicit_flows);
		if (tap_flow_destroy_pmd(pmd, remote_flow, error) < 0)
			return -1;
	}
	return 0;
}

/**
 * Manage filter operations.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param filter_type
 *   Filter type.
 * @param filter_op
 *   Operation to perform.
 * @param arg
 *   Pointer to operation-specific structure.
 *
 * @return
 *   0 on success, negative errno value on failure.
 */
int
tap_dev_filter_ctrl(struct rte_eth_dev *dev,
		    enum rte_filter_type filter_type,
		    enum rte_filter_op filter_op,
		    void *arg)
{
	switch (filter_type) {
	case RTE_ETH_FILTER_GENERIC:
		if (filter_op != RTE_ETH_FILTER_GET)
			return -EINVAL;
		*(const void **)arg = &tap_flow_ops;
		return 0;
	default:
		RTE_LOG(ERR, PMD, "%p: filter type (%d) not supported\n",
			(void *)dev, filter_type);
	}
	return -EINVAL;
}