/* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0
 * Copyright 2017 Mellanox Technologies, Ltd
 */
4*d8d06504SStephen Hemminger
5*d8d06504SStephen Hemminger #include <linux/in.h>
6*d8d06504SStephen Hemminger #include <linux/if_ether.h>
7*d8d06504SStephen Hemminger #include <linux/ip.h>
8*d8d06504SStephen Hemminger #include <linux/ipv6.h>
9*d8d06504SStephen Hemminger #include <linux/pkt_cls.h>
10*d8d06504SStephen Hemminger #include <linux/bpf.h>
11*d8d06504SStephen Hemminger
12*d8d06504SStephen Hemminger #include <bpf/bpf_helpers.h>
13*d8d06504SStephen Hemminger #include <bpf/bpf_endian.h>
14*d8d06504SStephen Hemminger
15*d8d06504SStephen Hemminger #include "../tap_rss.h"
16*d8d06504SStephen Hemminger
17*d8d06504SStephen Hemminger /*
18*d8d06504SStephen Hemminger * This map provides configuration information about flows which need BPF RSS.
19*d8d06504SStephen Hemminger *
20*d8d06504SStephen Hemminger * The hash is indexed by the skb mark.
21*d8d06504SStephen Hemminger */
22*d8d06504SStephen Hemminger struct {
23*d8d06504SStephen Hemminger __uint(type, BPF_MAP_TYPE_HASH);
24*d8d06504SStephen Hemminger __uint(key_size, sizeof(__u32));
25*d8d06504SStephen Hemminger __uint(value_size, sizeof(struct rss_key));
26*d8d06504SStephen Hemminger __uint(max_entries, TAP_RSS_MAX);
27*d8d06504SStephen Hemminger } rss_map SEC(".maps");
28*d8d06504SStephen Hemminger
/* Masks applied (after bpf_htons) to the IPv4 header frag_off field. */
#define IP_MF		0x2000	/* IP header flags: more fragments */
#define IP_OFFSET	0x1FFF	/* IP header fragment offset mask */
31*d8d06504SStephen Hemminger
/*
 * Compute Toeplitz hash over the input tuple.
 * This is same as rte_softrss_be in lib/hash
 * but loop needs to be setup to match BPF restrictions.
 *
 * input_tuple: input_len 32-bit words to hash; bits are consumed from the
 *              most significant bit of each word downwards.
 * input_len:   number of 32-bit words in input_tuple.
 * key:         hash key as 32-bit words; words key[0] .. key[input_len]
 *              are read, so the key must hold at least input_len + 1 words.
 *
 * Returns the 32-bit hash (0 when no input bit is set or input_len is 0).
 */
static __always_inline __u32
softrss_be(const __u32 *input_tuple, __u32 input_len, const __u32 *key)
{
	__u32 i, j, hash = 0;

#pragma unroll
	for (j = 0; j < input_len; j++) {
#pragma unroll
		for (i = 0; i < 32; i++) {
			/*
			 * For every set input bit, XOR in the 32-bit window of
			 * the key that starts i bits into key[j].
			 *
			 * The next key word is widened to 64 bits before the
			 * right shift: for i == 0 the shift count is 32, which
			 * is undefined on a 32-bit operand. With the widening
			 * the i == 0 contribution is exactly key[j].
			 */
			if (input_tuple[j] & (1U << (31 - i)))
				hash ^= key[j] << i |
					(__u32)((__u64)key[j + 1] >> (32 - i));
		}
	}
	return hash;
}
52*d8d06504SStephen Hemminger
53*d8d06504SStephen Hemminger /*
54*d8d06504SStephen Hemminger * Compute RSS hash for IPv4 packet.
55*d8d06504SStephen Hemminger * return in 0 if RSS not specified
56*d8d06504SStephen Hemminger */
57*d8d06504SStephen Hemminger static __always_inline __u32
parse_ipv4(const struct __sk_buff * skb,__u32 hash_type,const __u32 * key)58*d8d06504SStephen Hemminger parse_ipv4(const struct __sk_buff *skb, __u32 hash_type, const __u32 *key)
59*d8d06504SStephen Hemminger {
60*d8d06504SStephen Hemminger struct iphdr iph;
61*d8d06504SStephen Hemminger __u32 off = 0;
62*d8d06504SStephen Hemminger
63*d8d06504SStephen Hemminger if (bpf_skb_load_bytes_relative(skb, off, &iph, sizeof(iph), BPF_HDR_START_NET))
64*d8d06504SStephen Hemminger return 0; /* no IP header present */
65*d8d06504SStephen Hemminger
66*d8d06504SStephen Hemminger struct {
67*d8d06504SStephen Hemminger __u32 src_addr;
68*d8d06504SStephen Hemminger __u32 dst_addr;
69*d8d06504SStephen Hemminger __u16 dport;
70*d8d06504SStephen Hemminger __u16 sport;
71*d8d06504SStephen Hemminger } v4_tuple = {
72*d8d06504SStephen Hemminger .src_addr = bpf_ntohl(iph.saddr),
73*d8d06504SStephen Hemminger .dst_addr = bpf_ntohl(iph.daddr),
74*d8d06504SStephen Hemminger };
75*d8d06504SStephen Hemminger
76*d8d06504SStephen Hemminger /* If only calculating L3 hash, do it now */
77*d8d06504SStephen Hemminger if (hash_type & (1 << HASH_FIELD_IPV4_L3))
78*d8d06504SStephen Hemminger return softrss_be((__u32 *)&v4_tuple, sizeof(v4_tuple) / sizeof(__u32) - 1, key);
79*d8d06504SStephen Hemminger
80*d8d06504SStephen Hemminger /* If packet is fragmented then no L4 hash is possible */
81*d8d06504SStephen Hemminger if ((iph.frag_off & bpf_htons(IP_MF | IP_OFFSET)) != 0)
82*d8d06504SStephen Hemminger return 0;
83*d8d06504SStephen Hemminger
84*d8d06504SStephen Hemminger /* Do RSS on UDP or TCP protocols */
85*d8d06504SStephen Hemminger if (iph.protocol == IPPROTO_UDP || iph.protocol == IPPROTO_TCP) {
86*d8d06504SStephen Hemminger __u16 src_dst_port[2];
87*d8d06504SStephen Hemminger
88*d8d06504SStephen Hemminger off += iph.ihl * 4;
89*d8d06504SStephen Hemminger if (bpf_skb_load_bytes_relative(skb, off, &src_dst_port, sizeof(src_dst_port),
90*d8d06504SStephen Hemminger BPF_HDR_START_NET))
91*d8d06504SStephen Hemminger return 0; /* TCP or UDP header missing */
92*d8d06504SStephen Hemminger
93*d8d06504SStephen Hemminger v4_tuple.sport = bpf_ntohs(src_dst_port[0]);
94*d8d06504SStephen Hemminger v4_tuple.dport = bpf_ntohs(src_dst_port[1]);
95*d8d06504SStephen Hemminger return softrss_be((__u32 *)&v4_tuple, sizeof(v4_tuple) / sizeof(__u32), key);
96*d8d06504SStephen Hemminger }
97*d8d06504SStephen Hemminger
98*d8d06504SStephen Hemminger /* Other protocol */
99*d8d06504SStephen Hemminger return 0;
100*d8d06504SStephen Hemminger }
101*d8d06504SStephen Hemminger
102*d8d06504SStephen Hemminger /*
103*d8d06504SStephen Hemminger * Parse Ipv6 extended headers, update offset and return next proto.
104*d8d06504SStephen Hemminger * returns next proto on success, -1 on malformed header
105*d8d06504SStephen Hemminger */
106*d8d06504SStephen Hemminger static __always_inline int
skip_ip6_ext(__u16 proto,const struct __sk_buff * skb,__u32 * off,int * frag)107*d8d06504SStephen Hemminger skip_ip6_ext(__u16 proto, const struct __sk_buff *skb, __u32 *off, int *frag)
108*d8d06504SStephen Hemminger {
109*d8d06504SStephen Hemminger struct ext_hdr {
110*d8d06504SStephen Hemminger __u8 next_hdr;
111*d8d06504SStephen Hemminger __u8 len;
112*d8d06504SStephen Hemminger } xh;
113*d8d06504SStephen Hemminger unsigned int i;
114*d8d06504SStephen Hemminger
115*d8d06504SStephen Hemminger *frag = 0;
116*d8d06504SStephen Hemminger
117*d8d06504SStephen Hemminger #define MAX_EXT_HDRS 5
118*d8d06504SStephen Hemminger #pragma unroll
119*d8d06504SStephen Hemminger for (i = 0; i < MAX_EXT_HDRS; i++) {
120*d8d06504SStephen Hemminger switch (proto) {
121*d8d06504SStephen Hemminger case IPPROTO_HOPOPTS:
122*d8d06504SStephen Hemminger case IPPROTO_ROUTING:
123*d8d06504SStephen Hemminger case IPPROTO_DSTOPTS:
124*d8d06504SStephen Hemminger if (bpf_skb_load_bytes_relative(skb, *off, &xh, sizeof(xh),
125*d8d06504SStephen Hemminger BPF_HDR_START_NET))
126*d8d06504SStephen Hemminger return -1;
127*d8d06504SStephen Hemminger
128*d8d06504SStephen Hemminger *off += (xh.len + 1) * 8;
129*d8d06504SStephen Hemminger proto = xh.next_hdr;
130*d8d06504SStephen Hemminger break;
131*d8d06504SStephen Hemminger case IPPROTO_FRAGMENT:
132*d8d06504SStephen Hemminger if (bpf_skb_load_bytes_relative(skb, *off, &xh, sizeof(xh),
133*d8d06504SStephen Hemminger BPF_HDR_START_NET))
134*d8d06504SStephen Hemminger return -1;
135*d8d06504SStephen Hemminger
136*d8d06504SStephen Hemminger *off += 8;
137*d8d06504SStephen Hemminger proto = xh.next_hdr;
138*d8d06504SStephen Hemminger *frag = 1;
139*d8d06504SStephen Hemminger return proto; /* this is always the last ext hdr */
140*d8d06504SStephen Hemminger default:
141*d8d06504SStephen Hemminger return proto;
142*d8d06504SStephen Hemminger }
143*d8d06504SStephen Hemminger }
144*d8d06504SStephen Hemminger
145*d8d06504SStephen Hemminger /* too many extension headers give up */
146*d8d06504SStephen Hemminger return -1;
147*d8d06504SStephen Hemminger }
148*d8d06504SStephen Hemminger
149*d8d06504SStephen Hemminger /*
150*d8d06504SStephen Hemminger * Compute RSS hash for IPv6 packet.
151*d8d06504SStephen Hemminger * return in 0 if RSS not specified
152*d8d06504SStephen Hemminger */
153*d8d06504SStephen Hemminger static __always_inline __u32
parse_ipv6(const struct __sk_buff * skb,__u32 hash_type,const __u32 * key)154*d8d06504SStephen Hemminger parse_ipv6(const struct __sk_buff *skb, __u32 hash_type, const __u32 *key)
155*d8d06504SStephen Hemminger {
156*d8d06504SStephen Hemminger struct {
157*d8d06504SStephen Hemminger __u32 src_addr[4];
158*d8d06504SStephen Hemminger __u32 dst_addr[4];
159*d8d06504SStephen Hemminger __u16 dport;
160*d8d06504SStephen Hemminger __u16 sport;
161*d8d06504SStephen Hemminger } v6_tuple = { };
162*d8d06504SStephen Hemminger struct ipv6hdr ip6h;
163*d8d06504SStephen Hemminger __u32 off = 0, j;
164*d8d06504SStephen Hemminger int proto, frag;
165*d8d06504SStephen Hemminger
166*d8d06504SStephen Hemminger if (bpf_skb_load_bytes_relative(skb, off, &ip6h, sizeof(ip6h), BPF_HDR_START_NET))
167*d8d06504SStephen Hemminger return 0; /* missing IPv6 header */
168*d8d06504SStephen Hemminger
169*d8d06504SStephen Hemminger #pragma unroll
170*d8d06504SStephen Hemminger for (j = 0; j < 4; j++) {
171*d8d06504SStephen Hemminger v6_tuple.src_addr[j] = bpf_ntohl(ip6h.saddr.in6_u.u6_addr32[j]);
172*d8d06504SStephen Hemminger v6_tuple.dst_addr[j] = bpf_ntohl(ip6h.daddr.in6_u.u6_addr32[j]);
173*d8d06504SStephen Hemminger }
174*d8d06504SStephen Hemminger
175*d8d06504SStephen Hemminger /* If only doing L3 hash, do it now */
176*d8d06504SStephen Hemminger if (hash_type & (1 << HASH_FIELD_IPV6_L3))
177*d8d06504SStephen Hemminger return softrss_be((__u32 *)&v6_tuple, sizeof(v6_tuple) / sizeof(__u32) - 1, key);
178*d8d06504SStephen Hemminger
179*d8d06504SStephen Hemminger /* Skip extension headers if present */
180*d8d06504SStephen Hemminger off += sizeof(ip6h);
181*d8d06504SStephen Hemminger proto = skip_ip6_ext(ip6h.nexthdr, skb, &off, &frag);
182*d8d06504SStephen Hemminger if (proto < 0)
183*d8d06504SStephen Hemminger return 0;
184*d8d06504SStephen Hemminger
185*d8d06504SStephen Hemminger /* If packet is a fragment then no L4 hash is possible */
186*d8d06504SStephen Hemminger if (frag)
187*d8d06504SStephen Hemminger return 0;
188*d8d06504SStephen Hemminger
189*d8d06504SStephen Hemminger /* Do RSS on UDP or TCP */
190*d8d06504SStephen Hemminger if (proto == IPPROTO_UDP || proto == IPPROTO_TCP) {
191*d8d06504SStephen Hemminger __u16 src_dst_port[2];
192*d8d06504SStephen Hemminger
193*d8d06504SStephen Hemminger if (bpf_skb_load_bytes_relative(skb, off, &src_dst_port, sizeof(src_dst_port),
194*d8d06504SStephen Hemminger BPF_HDR_START_NET))
195*d8d06504SStephen Hemminger return 0;
196*d8d06504SStephen Hemminger
197*d8d06504SStephen Hemminger v6_tuple.sport = bpf_ntohs(src_dst_port[0]);
198*d8d06504SStephen Hemminger v6_tuple.dport = bpf_ntohs(src_dst_port[1]);
199*d8d06504SStephen Hemminger
200*d8d06504SStephen Hemminger return softrss_be((__u32 *)&v6_tuple, sizeof(v6_tuple) / sizeof(__u32), key);
201*d8d06504SStephen Hemminger }
202*d8d06504SStephen Hemminger
203*d8d06504SStephen Hemminger return 0;
204*d8d06504SStephen Hemminger }
205*d8d06504SStephen Hemminger
/*
 * Scale value to be into range [0, n)
 * Assumes val is large (ie hash covers whole u32 range)
 *
 * Computes (val * n) / 2^32 using a 64-bit product, so no division is
 * needed. Returns 0 when n is 0.
 */
static __always_inline __u32
reciprocal_scale(__u32 val, __u32 n)
{
	return (__u32)(((__u64)val * n) >> 32);
}
215*d8d06504SStephen Hemminger
216*d8d06504SStephen Hemminger /*
217*d8d06504SStephen Hemminger * When this BPF program is run by tc from the filter classifier,
218*d8d06504SStephen Hemminger * it is able to read skb metadata and packet data.
219*d8d06504SStephen Hemminger *
220*d8d06504SStephen Hemminger * For packets where RSS is not possible, then just return TC_ACT_OK.
221*d8d06504SStephen Hemminger * When RSS is desired, change the skb->queue_mapping and set TC_ACT_PIPE
222*d8d06504SStephen Hemminger * to continue processing.
223*d8d06504SStephen Hemminger *
224*d8d06504SStephen Hemminger * This should be BPF_PROG_TYPE_SCHED_ACT so section needs to be "action"
225*d8d06504SStephen Hemminger */
226*d8d06504SStephen Hemminger SEC("action") int
rss_flow_action(struct __sk_buff * skb)227*d8d06504SStephen Hemminger rss_flow_action(struct __sk_buff *skb)
228*d8d06504SStephen Hemminger {
229*d8d06504SStephen Hemminger const struct rss_key *rsskey;
230*d8d06504SStephen Hemminger const __u32 *key;
231*d8d06504SStephen Hemminger __be16 proto;
232*d8d06504SStephen Hemminger __u32 mark;
233*d8d06504SStephen Hemminger __u32 hash;
234*d8d06504SStephen Hemminger __u16 queue;
235*d8d06504SStephen Hemminger
236*d8d06504SStephen Hemminger __builtin_preserve_access_index(({
237*d8d06504SStephen Hemminger mark = skb->mark;
238*d8d06504SStephen Hemminger proto = skb->protocol;
239*d8d06504SStephen Hemminger }));
240*d8d06504SStephen Hemminger
241*d8d06504SStephen Hemminger /* Lookup RSS configuration for that BPF class */
242*d8d06504SStephen Hemminger rsskey = bpf_map_lookup_elem(&rss_map, &mark);
243*d8d06504SStephen Hemminger if (rsskey == NULL)
244*d8d06504SStephen Hemminger return TC_ACT_OK;
245*d8d06504SStephen Hemminger
246*d8d06504SStephen Hemminger key = (const __u32 *)rsskey->key;
247*d8d06504SStephen Hemminger
248*d8d06504SStephen Hemminger if (proto == bpf_htons(ETH_P_IP))
249*d8d06504SStephen Hemminger hash = parse_ipv4(skb, rsskey->hash_fields, key);
250*d8d06504SStephen Hemminger else if (proto == bpf_htons(ETH_P_IPV6))
251*d8d06504SStephen Hemminger hash = parse_ipv6(skb, rsskey->hash_fields, key);
252*d8d06504SStephen Hemminger else
253*d8d06504SStephen Hemminger hash = 0;
254*d8d06504SStephen Hemminger
255*d8d06504SStephen Hemminger if (hash == 0)
256*d8d06504SStephen Hemminger return TC_ACT_OK;
257*d8d06504SStephen Hemminger
258*d8d06504SStephen Hemminger /* Fold hash to the number of queues configured */
259*d8d06504SStephen Hemminger queue = reciprocal_scale(hash, rsskey->nb_queues);
260*d8d06504SStephen Hemminger
261*d8d06504SStephen Hemminger __builtin_preserve_access_index(({
262*d8d06504SStephen Hemminger skb->queue_mapping = queue;
263*d8d06504SStephen Hemminger }));
264*d8d06504SStephen Hemminger return TC_ACT_PIPE;
265*d8d06504SStephen Hemminger }
266*d8d06504SStephen Hemminger
267*d8d06504SStephen Hemminger char _license[] SEC("license") = "Dual BSD/GPL";
268