/* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0
 * Copyright 2017 Mellanox Technologies, Ltd
 */

#include <linux/in.h>
#include <linux/if_ether.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/pkt_cls.h>
#include <linux/bpf.h>

#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>

#include "../tap_rss.h"

/*
 * This map provides configuration information about flows which need BPF RSS.
 *
 * The hash table is keyed by the skb mark.
 */
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(struct rss_key));
	__uint(max_entries, TAP_RSS_MAX);
} rss_map SEC(".maps");
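/*
 * Entries are installed from user space (in DPDK, by the TAP PMD when a
 * flow with an RSS action is created); this program only reads the map.
 */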

#define IP_MF		0x2000		/* IP header flag: more fragments */
#define IP_OFFSET	0x1FFF		/* IP header fragment offset mask */

/*
 * Compute the Toeplitz hash over the input tuple.
 * This is the same as rte_softrss_be() in lib/hash,
 * but the loops are written to satisfy BPF restrictions.
 */
static __always_inline __u32
softrss_be(const __u32 *input_tuple, __u32 input_len, const __u32 *key)
{
	__u32 i, j, hash = 0;

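	/*
	 * Bit-serial Toeplitz: for each set bit of the input, XOR in the
	 * 32-bit window of the key starting at that bit position. The window
	 * spans two adjacent key words, so the key array must be one word
	 * longer than the input tuple.
	 */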
#pragma unroll
	for (j = 0; j < input_len; j++) {
#pragma unroll
		for (i = 0; i < 32; i++) {
			if (input_tuple[j] & (1U << (31 - i)))
				hash ^= key[j] << i | key[j + 1] >> (32 - i);
		}
	}
	return hash;
}

/*
 * Compute the RSS hash for an IPv4 packet.
 * Returns 0 if RSS is not specified or the hash cannot be computed.
 */
static __always_inline __u32
parse_ipv4(const struct __sk_buff *skb, __u32 hash_type, const __u32 *key)
{
	struct iphdr iph;
	__u32 off = 0;

	if (bpf_skb_load_bytes_relative(skb, off, &iph, sizeof(iph), BPF_HDR_START_NET))
		return 0;	/* no IP header present */

	struct {
		__u32    src_addr;
		__u32    dst_addr;
		__u16    dport;
		__u16    sport;
	} v4_tuple = {
		.src_addr = bpf_ntohl(iph.saddr),
		.dst_addr = bpf_ntohl(iph.daddr),
	};

	/* If only calculating the L3 hash, do it now */
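	/* ("- 1" drops the trailing port word, so only the addresses are hashed) */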
	if (hash_type & (1 << HASH_FIELD_IPV4_L3))
		return softrss_be((__u32 *)&v4_tuple, sizeof(v4_tuple) / sizeof(__u32) - 1, key);

	/* If the packet is fragmented then no L4 hash is possible */
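	/* frag_off stays in network byte order, hence the bpf_htons() on the mask */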
	if ((iph.frag_off & bpf_htons(IP_MF | IP_OFFSET)) != 0)
		return 0;

	/* Do RSS on UDP or TCP protocols */
	if (iph.protocol == IPPROTO_UDP || iph.protocol == IPPROTO_TCP) {
		__u16 src_dst_port[2];

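		/* ihl counts 32-bit words, so the L4 header starts ihl * 4 bytes in */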
		off += iph.ihl * 4;
		if (bpf_skb_load_bytes_relative(skb, off, &src_dst_port, sizeof(src_dst_port),
						BPF_HDR_START_NET))
			return 0; /* TCP or UDP header missing */

		v4_tuple.sport = bpf_ntohs(src_dst_port[0]);
		v4_tuple.dport = bpf_ntohs(src_dst_port[1]);
		return softrss_be((__u32 *)&v4_tuple, sizeof(v4_tuple) / sizeof(__u32), key);
	}

	/* Other protocol */
	return 0;
}

/*
 * Parse IPv6 extension headers; update the offset and return the next protocol.
 * Returns the next protocol on success, -1 on a malformed header.
 */
static __always_inline int
skip_ip6_ext(__u16 proto, const struct __sk_buff *skb, __u32 *off, int *frag)
{
	struct ext_hdr {
		__u8 next_hdr;
		__u8 len;
	} xh;
	unsigned int i;

	*frag = 0;

#define MAX_EXT_HDRS 5
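/* A fixed bound keeps the unrolled loop acceptable to the BPF verifier */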
#pragma unroll
	for (i = 0; i < MAX_EXT_HDRS; i++) {
		switch (proto) {
		case IPPROTO_HOPOPTS:
		case IPPROTO_ROUTING:
		case IPPROTO_DSTOPTS:
			if (bpf_skb_load_bytes_relative(skb, *off, &xh, sizeof(xh),
							BPF_HDR_START_NET))
				return -1;

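			/* hdr ext len is in 8-octet units, not counting the first 8 octets (RFC 8200) */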
			*off += (xh.len + 1) * 8;
			proto = xh.next_hdr;
			break;
		case IPPROTO_FRAGMENT:
			if (bpf_skb_load_bytes_relative(skb, *off, &xh, sizeof(xh),
							BPF_HDR_START_NET))
				return -1;

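			/* the fragment header has a fixed length of 8 octets */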
			*off += 8;
			proto = xh.next_hdr;
			*frag = 1;
			return proto; /* this is always the last ext hdr */
		default:
			return proto;
		}
	}

	/* Too many extension headers; give up */
	return -1;
}

/*
 * Compute the RSS hash for an IPv6 packet.
 * Returns 0 if RSS is not specified or the hash cannot be computed.
 */
static __always_inline __u32
parse_ipv6(const struct __sk_buff *skb, __u32 hash_type, const __u32 *key)
{
	struct {
		__u32       src_addr[4];
		__u32       dst_addr[4];
		__u16       dport;
		__u16       sport;
	} v6_tuple = { };
	struct ipv6hdr ip6h;
	__u32 off = 0, j;
	int proto, frag;

	if (bpf_skb_load_bytes_relative(skb, off, &ip6h, sizeof(ip6h), BPF_HDR_START_NET))
		return 0;	/* missing IPv6 header */

#pragma unroll
	for (j = 0; j < 4; j++) {
		v6_tuple.src_addr[j] = bpf_ntohl(ip6h.saddr.in6_u.u6_addr32[j]);
		v6_tuple.dst_addr[j] = bpf_ntohl(ip6h.daddr.in6_u.u6_addr32[j]);
	}

	/* If only doing the L3 hash, do it now */
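	/* As with IPv4, "- 1" drops the trailing port word from the hash input */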
	if (hash_type & (1 << HASH_FIELD_IPV6_L3))
		return softrss_be((__u32 *)&v6_tuple, sizeof(v6_tuple) / sizeof(__u32) - 1, key);

	/* Skip extension headers if present */
	off += sizeof(ip6h);
	proto = skip_ip6_ext(ip6h.nexthdr, skb, &off, &frag);
	if (proto < 0)
		return 0;

	/* If the packet is a fragment then no L4 hash is possible */
	if (frag)
		return 0;

	/* Do RSS on UDP or TCP */
	if (proto == IPPROTO_UDP || proto == IPPROTO_TCP) {
		__u16 src_dst_port[2];

		if (bpf_skb_load_bytes_relative(skb, off, &src_dst_port, sizeof(src_dst_port),
						BPF_HDR_START_NET))
			return 0;

		v6_tuple.sport = bpf_ntohs(src_dst_port[0]);
		v6_tuple.dport = bpf_ntohs(src_dst_port[1]);

		return softrss_be((__u32 *)&v6_tuple, sizeof(v6_tuple) / sizeof(__u32), key);
	}

	return 0;
}

/*
 * Scale a value into the range [0, n).
 * Assumes val is uniformly distributed (i.e. the hash covers the whole u32 range).
 */
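/* Same multiply-and-shift trick as the kernel's reciprocal_scale(); cheaper than a modulo */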
static __always_inline __u32
reciprocal_scale(__u32 val, __u32 n)
{
	return (__u32)(((__u64)val * n) >> 32);
}

/*
 * When this BPF program is run by tc from the filter classifier,
 * it is able to read skb metadata and packet data.
 *
 * For packets where RSS is not possible, just return TC_ACT_OK.
 * When RSS is desired, change skb->queue_mapping and return TC_ACT_PIPE
 * to continue processing.
 *
 * This should be BPF_PROG_TYPE_SCHED_ACT, so the section needs to be "action".
 */
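/*
 * Note: loading and attachment are handled outside this file (in DPDK, the
 * TAP PMD is expected to install the program over netlink when RSS is used).
 */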
SEC("action") int
rss_flow_action(struct __sk_buff *skb)
{
	const struct rss_key *rsskey;
	const __u32 *key;
	__be16 proto;
	__u32 mark;
	__u32 hash;
	__u16 queue;

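	/*
	 * __builtin_preserve_access_index records CO-RE relocations so these
	 * skb field offsets are fixed up for the running kernel at load time.
	 */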
	__builtin_preserve_access_index(({
		mark = skb->mark;
		proto = skb->protocol;
	}));

	/* Look up the RSS configuration for this flow, keyed by the skb mark */
	rsskey = bpf_map_lookup_elem(&rss_map, &mark);
	if (rsskey == NULL)
		return TC_ACT_OK;

	key = (const __u32 *)rsskey->key;

	if (proto == bpf_htons(ETH_P_IP))
		hash = parse_ipv4(skb, rsskey->hash_fields, key);
	else if (proto == bpf_htons(ETH_P_IPV6))
		hash = parse_ipv6(skb, rsskey->hash_fields, key);
	else
		hash = 0;

	if (hash == 0)
		return TC_ACT_OK;

	/* Fold hash to the number of queues configured */
	queue = reciprocal_scale(hash, rsskey->nb_queues);

	__builtin_preserve_access_index(({
		skb->queue_mapping = queue;
	}));
	return TC_ACT_PIPE;
}

char _license[] SEC("license") = "Dual BSD/GPL";