/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(C) 2022 Marvell.
 */

#ifndef IPSEC_NEON_H
#define IPSEC_NEON_H

#include "ipsec.h"
#include "neon/port_group.h"

#define MAX_TX_BURST	(MAX_PKT_BURST / 2)

extern xmm_t val_eth[RTE_MAX_ETHPORTS];

/*
 * Update source and destination MAC addresses in the ethernet header.
 */
static inline void
processx4_step3(struct rte_mbuf *pkts[FWDSTEP], uint16_t dst_port[FWDSTEP],
		uint64_t tx_offloads, bool ip_cksum, bool is_ipv4, uint8_t *l_pkt)
{
	uint32x4_t te[FWDSTEP];
	uint32x4_t ve[FWDSTEP];
	uint32_t *p[FWDSTEP];
	struct rte_mbuf *pkt;
	uint32_t val;
	uint8_t i;

	for (i = 0; i < FWDSTEP; i++) {
		pkt = pkts[i];

		/* Check if it is a large packet */
		if (pkt->pkt_len - RTE_ETHER_HDR_LEN > mtu_size)
			*l_pkt |= 1;

		p[i] = rte_pktmbuf_mtod(pkt, uint32_t *);
		ve[i] = vreinterpretq_u32_s32(val_eth[dst_port[i]]);
		te[i] = vld1q_u32(p[i]);

		/* Update last 4 bytes */
		val = vgetq_lane_u32(te[i], 3);
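		/*
		 * Lane 3 spans the 2-byte ether type plus the first two bytes
		 * of the L3 header: keep those payload bytes from the original
		 * packet and rewrite only the ether type.
		 */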
#if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN
		val &= 0xFFFFUL << 16;
		val |= rte_cpu_to_be_16(is_ipv4 ? RTE_ETHER_TYPE_IPV4 : RTE_ETHER_TYPE_IPV6);
#else
		val &= 0xFFFFUL;
		val |= rte_cpu_to_be_16(is_ipv4 ? RTE_ETHER_TYPE_IPV4 : RTE_ETHER_TYPE_IPV6) << 16;
#endif
		ve[i] = vsetq_lane_u32(val, ve[i], 3);
		vst1q_u32(p[i], ve[i]);

		if (ip_cksum) {
			struct rte_ipv4_hdr *ip;

			pkt->ol_flags |= tx_offloads;

			ip = (struct rte_ipv4_hdr *)
				(((uintptr_t)p[i]) + RTE_ETHER_HDR_LEN);
			ip->hdr_checksum = 0;

			/* calculate IPv4 cksum in SW */
			if ((pkt->ol_flags & RTE_MBUF_F_TX_IP_CKSUM) == 0)
				ip->hdr_checksum = rte_ipv4_cksum(ip);
		}

	}
}

/**
 * Process a single packet:
 * Update source and destination MAC addresses in the ethernet header.
 */
static inline void
process_packet(struct rte_mbuf *pkt, uint16_t *dst_port, uint64_t tx_offloads,
		bool ip_cksum, bool is_ipv4, uint8_t *l_pkt)
{
	struct rte_ether_hdr *eth_hdr;
	uint32x4_t te, ve;
	uint32_t val;

	/* Check if it is a large packet */
	if (pkt->pkt_len - RTE_ETHER_HDR_LEN > mtu_size)
		*l_pkt |= 1;

	eth_hdr = rte_pktmbuf_mtod(pkt, struct rte_ether_hdr *);

	te = vld1q_u32((uint32_t *)eth_hdr);
	ve = vreinterpretq_u32_s32(val_eth[dst_port[0]]);

	val = vgetq_lane_u32(te, 3);
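	/* As in processx4_step3(), only the ether type half of lane 3 is rewritten. */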
#if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN
	val &= 0xFFFFUL << 16;
	val |= rte_cpu_to_be_16(is_ipv4 ? RTE_ETHER_TYPE_IPV4 : RTE_ETHER_TYPE_IPV6);
#else
	val &= 0xFFFFUL;
	val |= rte_cpu_to_be_16(is_ipv4 ? RTE_ETHER_TYPE_IPV4 : RTE_ETHER_TYPE_IPV6) << 16;
#endif
	ve = vsetq_lane_u32(val, ve, 3);
	vst1q_u32((uint32_t *)eth_hdr, ve);

	if (ip_cksum) {
		struct rte_ipv4_hdr *ip;

		pkt->ol_flags |= tx_offloads;

		ip = (struct rte_ipv4_hdr *)(eth_hdr + 1);
		ip->hdr_checksum = 0;

		/* calculate IPv4 cksum in SW */
		if ((pkt->ol_flags & RTE_MBUF_F_TX_IP_CKSUM) == 0)
			ip->hdr_checksum = rte_ipv4_cksum(ip);
	}
}

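/*
 * Send each packet individually so that oversized packets can take the
 * fragmentation path; the protocol value tells that path whether IPv4 or
 * IPv6 fragmentation applies.
 */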
static inline void
send_packets(struct rte_mbuf *m[], uint16_t port, uint32_t num, bool is_ipv4)
{
	uint8_t proto;
	uint32_t i;

	proto = is_ipv4 ? IPPROTO_IP : IPPROTO_IPV6;
	for (i = 0; i < num; i++)
		send_single_packet(m[i], port, proto);
}

static inline void
send_packetsx4(struct rte_mbuf *m[], uint16_t port, uint32_t num)
{
	unsigned int lcoreid = rte_lcore_id();
	struct lcore_conf *qconf;
	uint32_t len, j, n;

	qconf = &lcore_conf[lcoreid];

	len = qconf->tx_mbufs[port].len;

	/*
	 * If the TX buffer for that queue is empty and we have enough packets,
	 * send them straight away.
	 */
	if (num >= MAX_TX_BURST && len == 0) {
		n = rte_eth_tx_burst(port, qconf->tx_queue_id[port], m, num);
		core_stats_update_tx(n);
		if (unlikely(n < num)) {
			do {
				rte_pktmbuf_free(m[n]);
			} while (++n < num);
		}
		return;
	}

	/*
	 * Put packets into the TX buffer for that queue.
	 */

	n = len + num;
	n = (n > MAX_PKT_BURST) ? MAX_PKT_BURST - len : num;

	j = 0;
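	/*
	 * Duff's device style copy: the switch jumps into the loop body so
	 * that the first pass copies only the n % FWDSTEP remainder (when
	 * non-zero); every later pass copies FWDSTEP packets at a time.
	 */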
	switch (n % FWDSTEP) {
	while (j < n) {
	case 0:
		qconf->tx_mbufs[port].m_table[len + j] = m[j];
		j++;
		/* fallthrough */
	case 3:
		qconf->tx_mbufs[port].m_table[len + j] = m[j];
		j++;
		/* fallthrough */
	case 2:
		qconf->tx_mbufs[port].m_table[len + j] = m[j];
		j++;
		/* fallthrough */
	case 1:
		qconf->tx_mbufs[port].m_table[len + j] = m[j];
		j++;
	}
	}

	len += n;

	/* enough pkts to be sent */
	if (unlikely(len == MAX_PKT_BURST)) {

		send_burst(qconf, MAX_PKT_BURST, port);

		/* copy rest of the packets into the TX buffer. */
		len = num - n;
		if (len == 0)
			goto exit;

		j = 0;
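		/* Same remainder-first copy as above, refilling the drained buffer from index 0. */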
		switch (len % FWDSTEP) {
		while (j < len) {
		case 0:
			qconf->tx_mbufs[port].m_table[j] = m[n + j];
			j++;
			/* fallthrough */
		case 3:
			qconf->tx_mbufs[port].m_table[j] = m[n + j];
			j++;
			/* fallthrough */
		case 2:
			qconf->tx_mbufs[port].m_table[j] = m[n + j];
			j++;
			/* fallthrough */
		case 1:
			qconf->tx_mbufs[port].m_table[j] = m[n + j];
			j++;
		}
		}
	}

exit:
	qconf->tx_mbufs[port].len = len;
}

/**
 * Send a burst of packets out to the ports given in the dst_port array.
 */
static __rte_always_inline void
send_multi_pkts(struct rte_mbuf **pkts, uint16_t dst_port[MAX_PKT_BURST],
		int nb_rx, uint64_t tx_offloads, bool ip_cksum, bool is_ipv4)
{
	unsigned int lcoreid = rte_lcore_id();
	uint16_t pnum[MAX_PKT_BURST + 1];
	uint8_t l_pkt = 0;
	uint16_t dlp, *lp;
	int i = 0, k;

	/*
	 * Finish packet processing and group consecutive
	 * packets with the same destination port.
	 */
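	/*
	 * pnum[i] will hold, at each index where a new run of equal
	 * destination ports starts, the length of that run; the send loop
	 * below walks the burst run by run.
	 */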
	k = RTE_ALIGN_FLOOR(nb_rx, FWDSTEP);

	if (k != 0) {
		uint16x8_t dp1, dp2;

		lp = pnum;
		lp[0] = 1;

		processx4_step3(pkts, dst_port, tx_offloads, ip_cksum, is_ipv4, &l_pkt);

		/* dp1: <d[0], d[1], d[2], d[3], ... > */
		dp1 = vld1q_u16(dst_port);

		for (i = FWDSTEP; i != k; i += FWDSTEP) {
			processx4_step3(&pkts[i], &dst_port[i], tx_offloads, ip_cksum, is_ipv4,
					&l_pkt);

			/*
			 * dp2:
			 * <d[j-3], d[j-2], d[j-1], d[j], ... >
			 */
			dp2 = vld1q_u16(&dst_port[i - FWDSTEP + 1]);
			lp = port_groupx4(&pnum[i - FWDSTEP], lp, dp1, dp2);

			/*
			 * dp1:
			 * <d[j], d[j+1], d[j+2], d[j+3], ... >
			 */
			dp1 = vextq_u16(dp2, dp1, FWDSTEP - 1);
		}

		/*
		 * dp2: <d[j-3], d[j-2], d[j-1], d[j-1], ... >
		 */
		dp2 = vextq_u16(dp1, dp1, 1);
		dp2 = vsetq_lane_u16(vgetq_lane_u16(dp2, 2), dp2, 3);
		lp = port_groupx4(&pnum[i - FWDSTEP], lp, dp1, dp2);

		/*
		 * remove values added by the last repeated
		 * dst port.
		 */
		lp[0]--;
		dlp = dst_port[i - 1];
	} else {
		/* set dlp and lp to never-used values. */
		dlp = BAD_PORT - 1;
		lp = pnum + MAX_PKT_BURST;
	}

	/* Process up to the last 3 packets one by one. */
	switch (nb_rx % FWDSTEP) {
	case 3:
		process_packet(pkts[i], dst_port + i, tx_offloads, ip_cksum, is_ipv4, &l_pkt);
		GROUP_PORT_STEP(dlp, dst_port, lp, pnum, i);
		i++;
		/* fallthrough */
	case 2:
		process_packet(pkts[i], dst_port + i, tx_offloads, ip_cksum, is_ipv4, &l_pkt);
		GROUP_PORT_STEP(dlp, dst_port, lp, pnum, i);
		i++;
		/* fallthrough */
	case 1:
		process_packet(pkts[i], dst_port + i, tx_offloads, ip_cksum, is_ipv4, &l_pkt);
		GROUP_PORT_STEP(dlp, dst_port, lp, pnum, i);
	}

	/*
	 * Send the packets out through their destination ports.
	 * Consecutive packets with the same destination port
	 * are already grouped together.
	 * If the destination port for a packet equals BAD_PORT,
	 * free the packet without sending it out.
	 */
	for (i = 0; i < nb_rx; i += k) {

		uint16_t pn;

		pn = dst_port[i];
		k = pnum[i];

		if (likely(pn != BAD_PORT)) {
			if (l_pkt)
				/* A large packet is present: send packets
				 * one at a time so they can be fragmented.
				 */
				send_packets(pkts + i, pn, k, is_ipv4);
			else
				send_packetsx4(pkts + i, pn, k);

		} else {
			free_pkts(&pkts[i], k);
			if (is_ipv4)
				core_statistics[lcoreid].lpm4.miss++;
			else
				core_statistics[lcoreid].lpm6.miss++;
		}
	}
}
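
/*
 * Usage sketch (illustrative): the caller is expected to receive a burst of
 * mbufs and resolve a destination port for each of them, e.g. via its own
 * route lookup (not shown here), marking packets to drop with BAD_PORT,
 * before handing the whole burst over:
 *
 *	uint16_t dst_port[MAX_PKT_BURST];
 *
 *	// fill dst_port[0..nb_rx - 1]
 *	send_multi_pkts(pkts, dst_port, nb_rx, tx_offloads, ip_cksum, is_ipv4);
 */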

#endif /* IPSEC_NEON_H */