/* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0
 * Copyright 2017 Mellanox Technologies, Ltd
 */

#include <linux/in.h>
#include <linux/if_ether.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/pkt_cls.h>
#include <linux/bpf.h>

#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>

#include "../tap_rss.h"
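
/*
 * Note: "../tap_rss.h" is expected to provide struct rss_key, TAP_RSS_MAX and
 * the HASH_FIELD_* bit positions used below, shared with the tap PMD on the
 * user space side.
 */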

/*
 * This map provides configuration information about flows which need BPF RSS.
 *
 * The map is indexed by the skb mark.
 */
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(struct rss_key));
	__uint(max_entries, TAP_RSS_MAX);
} rss_map SEC(".maps");
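
/*
 * Illustrative control-plane sketch (an assumption about the user space side,
 * not part of this program): one map entry is expected per RSS flow, keyed by
 * the mark that the corresponding tc filter sets on matching packets, e.g.
 *
 *	struct rss_key conf = { ... };	// hash key, hash_fields, nb_queues
 *	__u32 mark = ...;		// mark used by the tc filter
 *	bpf_map_update_elem(map_fd, &mark, &conf, BPF_ANY);
 */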

#define IP_MF		0x2000	/** IP header Flags **/
#define IP_OFFSET	0x1FFF	/** IP header fragment offset **/

/*
 * Compute the Toeplitz hash over the input tuple.
 * This is the same as rte_softrss_be in lib/hash,
 * but the loop is set up to match BPF restrictions.
 */
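/*
 * For input bit position (j * 32 + i), the hash is XORed with the 32-bit
 * window of the key starting at that bit offset, i.e.
 * "key[j] << i | key[j + 1] >> (32 - i)". The key array must therefore be at
 * least one 32-bit word longer than the longest input tuple.
 */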
static __always_inline __u32
softrss_be(const __u32 *input_tuple, __u32 input_len, const __u32 *key)
{
	__u32 i, j, hash = 0;

#pragma unroll
	for (j = 0; j < input_len; j++) {
#pragma unroll
		for (i = 0; i < 32; i++) {
			if (input_tuple[j] & (1U << (31 - i)))
				hash ^= key[j] << i | key[j + 1] >> (32 - i);
		}
	}
	return hash;
}

/*
 * Compute the RSS hash for an IPv4 packet.
 * Returns 0 if RSS is not specified or cannot be computed.
 */
static __always_inline __u32
parse_ipv4(const struct __sk_buff *skb, __u32 hash_type, const __u32 *key)
{
	struct iphdr iph;
	__u32 off = 0;

	if (bpf_skb_load_bytes_relative(skb, off, &iph, sizeof(iph), BPF_HDR_START_NET))
		return 0; /* no IP header present */

	struct {
		__u32 src_addr;
		__u32 dst_addr;
		__u16 dport;
		__u16 sport;
	} v4_tuple = {
		.src_addr = bpf_ntohl(iph.saddr),
		.dst_addr = bpf_ntohl(iph.daddr),
	};

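	/*
	 * For an L3-only hash the port fields are still zero, so one 32-bit
	 * word less than the full tuple is hashed (the "- 1" in the word
	 * count below).
	 */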
	/* If only calculating L3 hash, do it now */
	if (hash_type & (1 << HASH_FIELD_IPV4_L3))
		return softrss_be((__u32 *)&v4_tuple, sizeof(v4_tuple) / sizeof(__u32) - 1, key);

	/* If packet is fragmented then no L4 hash is possible */
	if ((iph.frag_off & bpf_htons(IP_MF | IP_OFFSET)) != 0)
		return 0;

	/* Do RSS on UDP or TCP protocols */
	if (iph.protocol == IPPROTO_UDP || iph.protocol == IPPROTO_TCP) {
		__u16 src_dst_port[2];

		off += iph.ihl * 4;	/* ihl counts 32-bit words */
		if (bpf_skb_load_bytes_relative(skb, off, &src_dst_port, sizeof(src_dst_port),
						BPF_HDR_START_NET))
			return 0; /* TCP or UDP header missing */

		v4_tuple.sport = bpf_ntohs(src_dst_port[0]);
		v4_tuple.dport = bpf_ntohs(src_dst_port[1]);
		return softrss_be((__u32 *)&v4_tuple, sizeof(v4_tuple) / sizeof(__u32), key);
	}

	/* Other protocol */
	return 0;
}

/*
 * Parse IPv6 extension headers, update the offset and return the next proto.
 * Returns the next proto on success, -1 on a malformed header.
 */
static __always_inline int
skip_ip6_ext(__u16 proto, const struct __sk_buff *skb, __u32 *off, int *frag)
{
	struct ext_hdr {
		__u8 next_hdr;
		__u8 len;
	} xh;
	unsigned int i;

	*frag = 0;

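	/*
	 * The walk over extension headers needs a fixed bound so the unrolled
	 * loop passes the BPF verifier; longer chains are treated as
	 * malformed.
	 */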
#define MAX_EXT_HDRS 5
#pragma unroll
	for (i = 0; i < MAX_EXT_HDRS; i++) {
		switch (proto) {
		case IPPROTO_HOPOPTS:
		case IPPROTO_ROUTING:
		case IPPROTO_DSTOPTS:
			if (bpf_skb_load_bytes_relative(skb, *off, &xh, sizeof(xh),
							BPF_HDR_START_NET))
				return -1;

			/* hdr ext len counts 8-byte units beyond the first 8 bytes */
			*off += (xh.len + 1) * 8;
			proto = xh.next_hdr;
			break;
		case IPPROTO_FRAGMENT:
			if (bpf_skb_load_bytes_relative(skb, *off, &xh, sizeof(xh),
							BPF_HDR_START_NET))
				return -1;

			*off += 8;	/* fragment header is a fixed 8 bytes */
			proto = xh.next_hdr;
			*frag = 1;
			return proto; /* this is always the last ext hdr */
		default:
			return proto;
		}
	}

	/* too many extension headers, give up */
	return -1;
}

/*
 * Compute the RSS hash for an IPv6 packet.
 * Returns 0 if RSS is not specified or cannot be computed.
 */
static __always_inline __u32
parse_ipv6(const struct __sk_buff *skb, __u32 hash_type, const __u32 *key)
{
	struct {
		__u32 src_addr[4];
		__u32 dst_addr[4];
		__u16 dport;
		__u16 sport;
	} v6_tuple = { };
	struct ipv6hdr ip6h;
	__u32 off = 0, j;
	int proto, frag;

	if (bpf_skb_load_bytes_relative(skb, off, &ip6h, sizeof(ip6h), BPF_HDR_START_NET))
		return 0; /* missing IPv6 header */

#pragma unroll
	for (j = 0; j < 4; j++) {
		v6_tuple.src_addr[j] = bpf_ntohl(ip6h.saddr.in6_u.u6_addr32[j]);
		v6_tuple.dst_addr[j] = bpf_ntohl(ip6h.daddr.in6_u.u6_addr32[j]);
	}

	/* If only doing L3 hash, do it now */
	if (hash_type & (1 << HASH_FIELD_IPV6_L3))
		return softrss_be((__u32 *)&v6_tuple, sizeof(v6_tuple) / sizeof(__u32) - 1, key);

	/* Skip extension headers if present */
	off += sizeof(ip6h);
	proto = skip_ip6_ext(ip6h.nexthdr, skb, &off, &frag);
	if (proto < 0)
		return 0;

	/* If packet is a fragment then no L4 hash is possible */
	if (frag)
		return 0;

	/* Do RSS on UDP or TCP */
	if (proto == IPPROTO_UDP || proto == IPPROTO_TCP) {
		__u16 src_dst_port[2];

		if (bpf_skb_load_bytes_relative(skb, off, &src_dst_port, sizeof(src_dst_port),
						BPF_HDR_START_NET))
			return 0;

		v6_tuple.sport = bpf_ntohs(src_dst_port[0]);
		v6_tuple.dport = bpf_ntohs(src_dst_port[1]);

		return softrss_be((__u32 *)&v6_tuple, sizeof(v6_tuple) / sizeof(__u32), key);
	}

	return 0;
}

/*
 * Scale value into the range [0, n).
 * Assumes val is large (i.e. the hash covers the whole u32 range).
 */
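/* Example: val = 0x80000000 with n = 4 maps to (0x80000000ULL * 4) >> 32 = 2 */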
static __always_inline __u32
reciprocal_scale(__u32 val, __u32 n)
{
	return (__u32)(((__u64)val * n) >> 32);
}

/*
 * When this BPF program is run by tc from the filter classifier,
 * it is able to read skb metadata and packet data.
 *
 * For packets where RSS is not possible, just return TC_ACT_OK.
 * When RSS is desired, set skb->queue_mapping and return TC_ACT_PIPE
 * to continue processing.
 *
 * This should be BPF_PROG_TYPE_SCHED_ACT, so the section needs to be "action".
 */
SEC("action") int
rss_flow_action(struct __sk_buff *skb)
{
	const struct rss_key *rsskey;
	const __u32 *key;
	__be16 proto;
	__u32 mark;
	__u32 hash;
	__u16 queue;

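	/* Read the skb fields through CO-RE relocatable accesses so the member
	 * offsets are fixed up when the program is loaded.
	 */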
	__builtin_preserve_access_index(({
		mark = skb->mark;
		proto = skb->protocol;
	}));

	/* Look up the RSS configuration for this flow's mark */
	rsskey = bpf_map_lookup_elem(&rss_map, &mark);
	if (rsskey == NULL)
		return TC_ACT_OK;

	key = (const __u32 *)rsskey->key;

	if (proto == bpf_htons(ETH_P_IP))
		hash = parse_ipv4(skb, rsskey->hash_fields, key);
	else if (proto == bpf_htons(ETH_P_IPV6))
		hash = parse_ipv6(skb, rsskey->hash_fields, key);
	else
		hash = 0;

	if (hash == 0)
		return TC_ACT_OK;

	/* Fold hash to the number of queues configured */
	queue = reciprocal_scale(hash, rsskey->nb_queues);

	__builtin_preserve_access_index(({
		skb->queue_mapping = queue;
	}));
	return TC_ACT_PIPE;
}

char _license[] SEC("license") = "Dual BSD/GPL";