/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(C) 2022 Marvell.
 */

#ifndef IPSEC_NEON_H
#define IPSEC_NEON_H

#include "ipsec.h"
#include "neon/port_group.h"

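/*
 * Minimum burst size that send_packetsx4() transmits immediately when the
 * TX buffer for the queue is empty, instead of buffering the packets.
 */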
#define MAX_TX_BURST	(MAX_PKT_BURST / 2)

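/*
 * Per-port table of pre-built Ethernet addresses (destination and source
 * MAC) used below to rewrite a packet's Ethernet header with vector stores.
 */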
extern xmm_t val_eth[RTE_MAX_ETHPORTS];

/*
 * Update the source and destination MAC addresses and the EtherType in the
 * Ethernet header, optionally prepare the IPv4 checksum, and flag packets
 * larger than the MTU via *l_pkt.
 */
static inline void
processx4_step3(struct rte_mbuf *pkts[FWDSTEP], uint16_t dst_port[FWDSTEP],
		uint64_t tx_offloads, bool ip_cksum, bool is_ipv4, uint8_t *l_pkt)
{
	uint32x4_t te[FWDSTEP];
	uint32x4_t ve[FWDSTEP];
	uint32_t *p[FWDSTEP];
	struct rte_mbuf *pkt;
	uint32_t val;
	uint8_t i;

	for (i = 0; i < FWDSTEP; i++) {
		pkt = pkts[i];

		/* Check if it is a large packet */
		if (pkt->pkt_len - RTE_ETHER_HDR_LEN > mtu_size)
			*l_pkt |= 1;

		p[i] = rte_pktmbuf_mtod(pkt, uint32_t *);
		ve[i] = vreinterpretq_u32_s32(val_eth[dst_port[i]]);
		te[i] = vld1q_u32(p[i]);
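		/*
		 * te[i] holds the first 16 bytes of the packet: both MAC
		 * addresses, the EtherType and the first two bytes of the
		 * L3 header. Lane 3 is rebuilt below so the store keeps
		 * those L3 bytes while forcing the EtherType.
		 */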

		/* Update last 4 bytes */
		val = vgetq_lane_u32(te[i], 3);
#if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN
		val &= 0xFFFFUL << 16;
		val |= rte_cpu_to_be_16(is_ipv4 ? RTE_ETHER_TYPE_IPV4 : RTE_ETHER_TYPE_IPV6);
#else
		val &= 0xFFFFUL;
		val |= rte_cpu_to_be_16(is_ipv4 ? RTE_ETHER_TYPE_IPV4 : RTE_ETHER_TYPE_IPV6) << 16;
#endif
		ve[i] = vsetq_lane_u32(val, ve[i], 3);
		vst1q_u32(p[i], ve[i]);

		if (ip_cksum) {
			struct rte_ipv4_hdr *ip;

			pkt->ol_flags |= tx_offloads;

			ip = (struct rte_ipv4_hdr *)
				(((uintptr_t)p[i]) + RTE_ETHER_HDR_LEN);
			ip->hdr_checksum = 0;

			/* calculate IPv4 cksum in SW */
			if ((pkt->ol_flags & RTE_MBUF_F_TX_IP_CKSUM) == 0)
				ip->hdr_checksum = rte_ipv4_cksum(ip);
		}

	}
}

/**
 * Process a single packet:
 * update the source and destination MAC addresses and the EtherType in the
 * Ethernet header, optionally prepare the IPv4 checksum, and flag packets
 * larger than the MTU via *l_pkt.
 */
static inline void
process_packet(struct rte_mbuf *pkt, uint16_t *dst_port, uint64_t tx_offloads,
	       bool ip_cksum, bool is_ipv4, uint8_t *l_pkt)
{
	struct rte_ether_hdr *eth_hdr;
	uint32x4_t te, ve;
	uint32_t val;

	/* Check if it is a large packet */
	if (pkt->pkt_len - RTE_ETHER_HDR_LEN > mtu_size)
		*l_pkt |= 1;

	eth_hdr = rte_pktmbuf_mtod(pkt, struct rte_ether_hdr *);

	te = vld1q_u32((uint32_t *)eth_hdr);
	ve = vreinterpretq_u32_s32(val_eth[dst_port[0]]);

	val = vgetq_lane_u32(te, 3);
#if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN
	val &= 0xFFFFUL << 16;
	val |= rte_cpu_to_be_16(is_ipv4 ? RTE_ETHER_TYPE_IPV4 : RTE_ETHER_TYPE_IPV6);
#else
	val &= 0xFFFFUL;
	val |= rte_cpu_to_be_16(is_ipv4 ? RTE_ETHER_TYPE_IPV4 : RTE_ETHER_TYPE_IPV6) << 16;
#endif
	ve = vsetq_lane_u32(val, ve, 3);
	vst1q_u32((uint32_t *)eth_hdr, ve);

	if (ip_cksum) {
		struct rte_ipv4_hdr *ip;

		pkt->ol_flags |= tx_offloads;

		ip = (struct rte_ipv4_hdr *)(eth_hdr + 1);
		ip->hdr_checksum = 0;

		/* calculate IPv4 cksum in SW */
		if ((pkt->ol_flags & RTE_MBUF_F_TX_IP_CKSUM) == 0)
			ip->hdr_checksum = rte_ipv4_cksum(ip);
	}
}

static inline void
send_packets(struct rte_mbuf *m[], uint16_t port, uint32_t num, bool is_ipv4)
{
	uint8_t proto;
	uint32_t i;

	proto = is_ipv4 ? IPPROTO_IP : IPPROTO_IPV6;
	for (i = 0; i < num; i++)
		send_single_packet(m[i], port, proto);
}

static inline void
send_packetsx4(struct rte_mbuf *m[], uint16_t port, uint32_t num)
{
	unsigned int lcoreid = rte_lcore_id();
	struct lcore_conf *qconf;
	uint32_t len, j, n;

	qconf = &lcore_conf[lcoreid];

	len = qconf->tx_mbufs[port].len;

	/*
	 * If the TX buffer for that queue is empty and we have enough packets,
	 * then send them straight away.
	 */
	if (num >= MAX_TX_BURST && len == 0) {
		n = rte_eth_tx_burst(port, qconf->tx_queue_id[port], m, num);
		core_stats_update_tx(n);
		if (unlikely(n < num)) {
			do {
				rte_pktmbuf_free(m[n]);
			} while (++n < num);
		}
		return;
	}

	/*
	 * Put packets into TX buffer for that queue.
	 */

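	/* n: how many of the new packets fit into the TX buffer this round. */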
	n = len + num;
	n = (n > MAX_PKT_BURST) ? MAX_PKT_BURST - len : num;

	j = 0;
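	/*
	 * Interleaved switch/while (Duff's device): the switch jumps into
	 * the unrolled loop so the first pass copies the n % FWDSTEP
	 * remainder (or a full FWDSTEP when n is a multiple of it) and
	 * every further pass copies FWDSTEP packets.
	 */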
	switch (n % FWDSTEP) {
	while (j < n) {
		case 0:
			qconf->tx_mbufs[port].m_table[len + j] = m[j];
			j++;
			/* fallthrough */
		case 3:
			qconf->tx_mbufs[port].m_table[len + j] = m[j];
			j++;
			/* fallthrough */
		case 2:
			qconf->tx_mbufs[port].m_table[len + j] = m[j];
			j++;
			/* fallthrough */
		case 1:
			qconf->tx_mbufs[port].m_table[len + j] = m[j];
			j++;
		}
	}

	len += n;

	/* enough pkts to be sent */
	if (unlikely(len == MAX_PKT_BURST)) {

		send_burst(qconf, MAX_PKT_BURST, port);

		/* copy rest of the packets into the TX buffer. */
		len = num - n;
		if (len == 0)
			goto exit;

		j = 0;
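		/* Same interleaved copy as above, for the packets that did not fit. */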
		switch (len % FWDSTEP) {
		while (j < len) {
			case 0:
				qconf->tx_mbufs[port].m_table[j] = m[n + j];
				j++;
				/* fallthrough */
			case 3:
				qconf->tx_mbufs[port].m_table[j] = m[n + j];
				j++;
				/* fallthrough */
			case 2:
				qconf->tx_mbufs[port].m_table[j] = m[n + j];
				j++;
				/* fallthrough */
			case 1:
				qconf->tx_mbufs[port].m_table[j] = m[n + j];
				j++;
		}
		}
	}

exit:
	qconf->tx_mbufs[port].len = len;
}

/**
 * Send a burst of packets out via the ports given in the dst_port array.
 */
static __rte_always_inline void
send_multi_pkts(struct rte_mbuf **pkts, uint16_t dst_port[MAX_PKT_BURST],
		int nb_rx, uint64_t tx_offloads, bool ip_cksum, bool is_ipv4)
{
	unsigned int lcoreid = rte_lcore_id();
	uint16_t pnum[MAX_PKT_BURST + 1];
	uint8_t l_pkt = 0;
	uint16_t dlp, *lp;
	int i = 0, k;

	/*
	 * Finish packet processing and group consecutive
	 * packets with the same destination port.
	 */
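	/*
	 * pnum[] is filled so that, at the first index of each run of
	 * packets sharing a destination port, it holds the length of that
	 * run; the send loop at the end walks the bursts using these counts.
	 */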
	k = RTE_ALIGN_FLOOR(nb_rx, FWDSTEP);

	if (k != 0) {
		uint16x8_t dp1, dp2;

		lp = pnum;
		lp[0] = 1;

		processx4_step3(pkts, dst_port, tx_offloads, ip_cksum, is_ipv4, &l_pkt);

		/* dp1: <d[0], d[1], d[2], d[3], ... > */
		dp1 = vld1q_u16(dst_port);

		for (i = FWDSTEP; i != k; i += FWDSTEP) {
			processx4_step3(&pkts[i], &dst_port[i], tx_offloads, ip_cksum, is_ipv4,
					&l_pkt);

			/*
			 * dp2:
			 * <d[j-3], d[j-2], d[j-1], d[j], ... >
			 */
			dp2 = vld1q_u16(&dst_port[i - FWDSTEP + 1]);
			lp  = port_groupx4(&pnum[i - FWDSTEP], lp, dp1, dp2);

			/*
			 * dp1:
			 * <d[j], d[j+1], d[j+2], d[j+3], ... >
			 */
			dp1 = vextq_u16(dp2, dp1, FWDSTEP - 1);
		}

		/*
		 * dp2: <d[j-3], d[j-2], d[j-1], d[j-1], ... >
		 */
		dp2 = vextq_u16(dp1, dp1, 1);
		dp2 = vsetq_lane_u16(vgetq_lane_u16(dp2, 2), dp2, 3);
		lp  = port_groupx4(&pnum[i - FWDSTEP], lp, dp1, dp2);

		/*
		 * remove values added by the last repeated
		 * dst port.
		 */
		lp[0]--;
		dlp = dst_port[i - 1];
	} else {
		/* set dlp and lp to values that are never used. */
		dlp = BAD_PORT - 1;
		lp = pnum + MAX_PKT_BURST;
	}

	/* Process up to last 3 packets one by one. */
	switch (nb_rx % FWDSTEP) {
	case 3:
		process_packet(pkts[i], dst_port + i, tx_offloads, ip_cksum, is_ipv4, &l_pkt);
		GROUP_PORT_STEP(dlp, dst_port, lp, pnum, i);
		i++;
		/* fallthrough */
	case 2:
		process_packet(pkts[i], dst_port + i, tx_offloads, ip_cksum, is_ipv4, &l_pkt);
		GROUP_PORT_STEP(dlp, dst_port, lp, pnum, i);
		i++;
		/* fallthrough */
	case 1:
		process_packet(pkts[i], dst_port + i, tx_offloads, ip_cksum, is_ipv4, &l_pkt);
		GROUP_PORT_STEP(dlp, dst_port, lp, pnum, i);
	}

	/*
	 * Send the packets out through their destination ports.
	 * Consecutive packets with the same destination port are
	 * already grouped together.
	 * If the destination port of a packet equals BAD_PORT,
	 * free the packet without sending it out.
	 */
	for (i = 0; i < nb_rx; i += k) {

		uint16_t pn;

		pn = dst_port[i];
		k = pnum[i];

		if (likely(pn != BAD_PORT)) {
			if (l_pkt)
				/* A large packet is present; send the packets
				 * individually so that they can be fragmented.
				 */
				send_packets(pkts + i, pn, k, is_ipv4);
			else
				send_packetsx4(pkts + i, pn, k);

		} else {
			free_pkts(&pkts[i], k);
			if (is_ipv4)
				core_statistics[lcoreid].lpm4.miss++;
			else
				core_statistics[lcoreid].lpm6.miss++;
		}
	}
}

#endif /* IPSEC_NEON_H */