/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2016-2018 Intel Corporation.
 * Copyright(c) 2017-2018 Linaro Limited.
 */

#ifndef _L3FWD_NEON_H_
#define _L3FWD_NEON_H_

#include "l3fwd.h"
#include "neon/port_group.h"
#include "l3fwd_common.h"

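/*
 * Size the dst_port[] arrays with extra headroom: the 8-lane NEON loads used
 * for destination port grouping below may read past the last valid entry,
 * hence the 2 * FWDSTEP spare slots.
 */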
#undef SENDM_PORT_OVERHEAD
#define SENDM_PORT_OVERHEAD(x) ((x) + 2 * FWDSTEP)

/*
 * Update source and destination MAC addresses in the ethernet header.
 * Perform RFC1812 checks and updates for IPV4 packets.
 */
static inline void
processx4_step3(struct rte_mbuf *pkt[FWDSTEP], uint16_t dst_port[FWDSTEP])
{
	uint32x4_t te[FWDSTEP];
	uint32x4_t ve[FWDSTEP];
	uint32_t *p[FWDSTEP];

	p[0] = rte_pktmbuf_mtod(pkt[0], uint32_t *);
	p[1] = rte_pktmbuf_mtod(pkt[1], uint32_t *);
	p[2] = rte_pktmbuf_mtod(pkt[2], uint32_t *);
	p[3] = rte_pktmbuf_mtod(pkt[3], uint32_t *);

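	/*
	 * val_eth[] holds the pre-built destination/source MAC pair for each
	 * destination port.  Load it as the new L2 header, and load the first
	 * 16 bytes of each packet (the 14-byte ethernet header plus the 2
	 * bytes that follow it).
	 */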
	ve[0] = vreinterpretq_u32_s32(val_eth[dst_port[0]]);
	te[0] = vld1q_u32(p[0]);

	ve[1] = vreinterpretq_u32_s32(val_eth[dst_port[1]]);
	te[1] = vld1q_u32(p[1]);

	ve[2] = vreinterpretq_u32_s32(val_eth[dst_port[2]]);
	te[2] = vld1q_u32(p[2]);

	ve[3] = vreinterpretq_u32_s32(val_eth[dst_port[3]]);
	te[3] = vld1q_u32(p[3]);

	/* Keep the original last 4 bytes (ether_type + 2 trailing bytes) */
	ve[0] = vsetq_lane_u32(vgetq_lane_u32(te[0], 3), ve[0], 3);
	ve[1] = vsetq_lane_u32(vgetq_lane_u32(te[1], 3), ve[1], 3);
	ve[2] = vsetq_lane_u32(vgetq_lane_u32(te[2], 3), ve[2], 3);
	ve[3] = vsetq_lane_u32(vgetq_lane_u32(te[3], 3), ve[3], 3);

	vst1q_u32(p[0], ve[0]);
	vst1q_u32(p[1], ve[1]);
	vst1q_u32(p[2], ve[2]);
	vst1q_u32(p[3], ve[3]);

	rfc1812_process((struct rte_ipv4_hdr *)
			((struct rte_ether_hdr *)p[0] + 1),
			&dst_port[0], pkt[0]->packet_type);
	rfc1812_process((struct rte_ipv4_hdr *)
			((struct rte_ether_hdr *)p[1] + 1),
			&dst_port[1], pkt[1]->packet_type);
	rfc1812_process((struct rte_ipv4_hdr *)
			((struct rte_ether_hdr *)p[2] + 1),
			&dst_port[2], pkt[2]->packet_type);
	rfc1812_process((struct rte_ipv4_hdr *)
			((struct rte_ether_hdr *)p[3] + 1),
			&dst_port[3], pkt[3]->packet_type);
}

/**
 * Process one packet:
 * Update source and destination MAC addresses in the ethernet header.
 * Perform RFC1812 checks and updates for IPV4 packets.
 */
static inline void
process_packet(struct rte_mbuf *pkt, uint16_t *dst_port)
{
	struct rte_ether_hdr *eth_hdr;
	uint32x4_t te, ve;

	eth_hdr = rte_pktmbuf_mtod(pkt, struct rte_ether_hdr *);

	te = vld1q_u32((uint32_t *)eth_hdr);
	ve = vreinterpretq_u32_s32(val_eth[dst_port[0]]);

	rfc1812_process((struct rte_ipv4_hdr *)(eth_hdr + 1), dst_port,
			pkt->packet_type);

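	/*
	 * Keep lane 3 (ether_type and the 2 bytes that follow the header)
	 * from the original packet, then write the rebuilt header back.
	 */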
	ve = vcopyq_laneq_u32(ve, 3, te, 3);
	vst1q_u32((uint32_t *)eth_hdr, ve);
}

/**
 * Send a burst of packets from pkts_burst to the ports in the dst_port array.
 */
static __rte_always_inline void
send_packets_multi(struct lcore_conf *qconf, struct rte_mbuf **pkts_burst,
		uint16_t dst_port[SENDM_PORT_OVERHEAD(MAX_PKT_BURST)],
		int nb_rx)
{
	int32_t k;
	int j = 0;
	uint16_t dlp;
	uint16_t *lp;
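	/*
	 * pnum[j] will hold the number of consecutive packets, starting at
	 * index j, that share the same destination port; lp points at the
	 * counter of the group currently being built.
	 */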
	uint16_t pnum[MAX_PKT_BURST + 1];

	/*
	 * Finish packet processing and group consecutive
	 * packets with the same destination port.
	 */
	k = RTE_ALIGN_FLOOR(nb_rx, FWDSTEP);
	if (k != 0) {
		uint16x8_t dp1, dp2;

		lp = pnum;
		lp[0] = 1;

		processx4_step3(pkts_burst, dst_port);

		/* dp1: <d[0], d[1], d[2], d[3], ... > */
		dp1 = vld1q_u16(dst_port);

		for (j = FWDSTEP; j != k; j += FWDSTEP) {
			processx4_step3(&pkts_burst[j], &dst_port[j]);

			/*
			 * dp2:
			 * <d[j-3], d[j-2], d[j-1], d[j], ... >
			 */
			dp2 = vld1q_u16(&dst_port[j - FWDSTEP + 1]);
			lp  = port_groupx4(&pnum[j - FWDSTEP], lp, dp1, dp2);

			/*
			 * dp1:
			 * <d[j], d[j+1], d[j+2], d[j+3], ... >
			 */
			dp1 = vextq_u16(dp2, dp1, FWDSTEP - 1);
		}

		/*
		 * dp2: <d[j-3], d[j-2], d[j-1], d[j-1], ... >
		 */
		dp2 = vextq_u16(dp1, dp1, 1);
		dp2 = vsetq_lane_u16(vgetq_lane_u16(dp2, 2), dp2, 3);
		lp  = port_groupx4(&pnum[j - FWDSTEP], lp, dp1, dp2);

		/*
		 * remove values added by the last repeated
		 * dst port.
		 */
		lp[0]--;
		dlp = dst_port[j - 1];
	} else {
		/* set dlp and lp to the never used values. */
		dlp = BAD_PORT - 1;
		lp = pnum + MAX_PKT_BURST;
	}

	/*
	 * Process up to the last 3 packets one by one.  GROUP_PORT_STEP()
	 * either extends the current group of equal destination ports or
	 * starts a new one.
	 */
	switch (nb_rx % FWDSTEP) {
	case 3:
		process_packet(pkts_burst[j], dst_port + j);
		GROUP_PORT_STEP(dlp, dst_port, lp, pnum, j);
		j++;
		/* fallthrough */
	case 2:
		process_packet(pkts_burst[j], dst_port + j);
		GROUP_PORT_STEP(dlp, dst_port, lp, pnum, j);
		j++;
		/* fallthrough */
	case 1:
		process_packet(pkts_burst[j], dst_port + j);
		GROUP_PORT_STEP(dlp, dst_port, lp, pnum, j);
		j++;
	}

	/*
	 * Send the packets out through their destination ports.
	 * Consecutive packets with the same destination port
	 * are already grouped together.
	 * If the destination port of a packet equals BAD_PORT,
	 * free the packet without sending it out.
	 */
	for (j = 0; j < nb_rx; j += k) {

		int32_t m;
		uint16_t pn;

		pn = dst_port[j];
		k = pnum[j];

		if (likely(pn != BAD_PORT))
			send_packetsx4(qconf, pn, pkts_burst + j, k);
		else
			for (m = j; m != j + k; m++)
				rte_pktmbuf_free(pkts_burst[m]);

	}
}

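/**
 * Return the common destination port if all nb_elem entries of dst_ports
 * are equal, otherwise return BAD_PORT.
 */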
static __rte_always_inline uint16_t
process_dst_port(uint16_t *dst_ports, uint16_t nb_elem)
{
	uint16_t i = 0;

#if defined(RTE_ARCH_ARM64)
	uint64_t res;

	while (nb_elem > 7) {
		uint16x8_t dp = vdupq_n_u16(dst_ports[0]);
		uint16x8_t dp1;

		dp1 = vld1q_u16(&dst_ports[i]);
		dp1 = vceqq_u16(dp1, dp);
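		/*
		 * Narrow each 16-bit compare result to 8 bits (SHRN by 4):
		 * res is all ones only if every lane matched dst_ports[0].
		 */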
		res = vget_lane_u64(vreinterpret_u64_u8(vshrn_n_u16(dp1, 4)),
				    0);
		if (res != ~0ULL)
			return BAD_PORT;

		nb_elem -= 8;
		i += 8;
	}

	while (nb_elem > 3) {
		uint16x4_t dp = vdup_n_u16(dst_ports[0]);
		uint16x4_t dp1;

		dp1 = vld1_u16(&dst_ports[i]);
		dp1 = vceq_u16(dp1, dp);
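		/* With 4 x 16-bit lanes the compare result already fits in 64 bits. */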
		res = vget_lane_u64(vreinterpret_u64_u16(dp1), 0);
		if (res != ~0ULL)
			return BAD_PORT;

		nb_elem -= 4;
		i += 4;
	}
#endif

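	/* Check any remaining elements (all of them on non-ARM64 builds) one by one. */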
	while (nb_elem) {
		if (dst_ports[i] != dst_ports[0])
			return BAD_PORT;
		nb_elem--;
		i++;
	}

	return dst_ports[0];
}

#endif /* _L3FWD_NEON_H_ */