/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2019 Arm Limited
 *
 * Copyright(c) 2019 Intel Corporation
 *
 * Derived do_macswap implementation from app/test-pmd/macswap_sse.h
 */

#ifndef _MACSWAP_NEON_H_
#define _MACSWAP_NEON_H_

#include "macswap_common.h"
#include "rte_vect.h"
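
/*
 * Swap the source and destination MAC address of each packet in the
 * burst using NEON byte shuffles, and apply the configured Tx offload
 * flags and VLAN/QinQ tags.
 */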
static inline void
do_macswap(struct rte_mbuf *pkts[], uint16_t nb,
		struct rte_port *txp)
{
	struct rte_ether_hdr *eth_hdr[4];
	struct rte_mbuf *mb[4];
	uint64_t ol_flags;
	int i;
	int r;
	uint8x16_t v0, v1, v2, v3;
	/**
	 * Index map used to shuffle the 16 bytes:
	 * bytes 0-5 are swapped with bytes 6-11,
	 * bytes 12-15 remain unchanged.
	 */
	const uint8x16_t idx_map = {6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5,
				12, 13, 14, 15};
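
	/* Derive Tx offload flags from the port config and apply VLAN/QinQ tags. */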
	ol_flags = ol_flags_init(txp->dev_conf.txmode.offloads);
	vlan_qinq_set(pkts, nb, ol_flags,
			txp->tx_vlan_id, txp->tx_vlan_id_outer);

	i = 0;
	r = nb;
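
	/* Main loop: swap the MAC addresses of four packets per iteration. */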
	while (r >= 4) {
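		/* Prefetch the headers of the next four packets while this batch is processed. */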
		if (r >= 8) {
			rte_prefetch0(rte_pktmbuf_mtod(pkts[i + 4], void *));
			rte_prefetch0(rte_pktmbuf_mtod(pkts[i + 5], void *));
			rte_prefetch0(rte_pktmbuf_mtod(pkts[i + 6], void *));
			rte_prefetch0(rte_pktmbuf_mtod(pkts[i + 7], void *));
		}
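
		/* Resolve the Ethernet header of each packet in the batch. */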
		mb[0] = pkts[i++];
		eth_hdr[0] = rte_pktmbuf_mtod(mb[0], struct rte_ether_hdr *);

		mb[1] = pkts[i++];
		eth_hdr[1] = rte_pktmbuf_mtod(mb[1], struct rte_ether_hdr *);

		mb[2] = pkts[i++];
		eth_hdr[2] = rte_pktmbuf_mtod(mb[2], struct rte_ether_hdr *);

		mb[3] = pkts[i++];
		eth_hdr[3] = rte_pktmbuf_mtod(mb[3], struct rte_ether_hdr *);
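
		/* Load the first 16 bytes of each frame: dst MAC (6), src MAC (6) and the following 4 bytes. */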
		v0 = vld1q_u8((uint8_t const *)eth_hdr[0]);
		v1 = vld1q_u8((uint8_t const *)eth_hdr[1]);
		v2 = vld1q_u8((uint8_t const *)eth_hdr[2]);
		v3 = vld1q_u8((uint8_t const *)eth_hdr[3]);
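
		/* Swap the MAC addresses with a single table-lookup shuffle; bytes 12-15 pass through unchanged. */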
		v0 = vqtbl1q_u8(v0, idx_map);
		v1 = vqtbl1q_u8(v1, idx_map);
		v2 = vqtbl1q_u8(v2, idx_map);
		v3 = vqtbl1q_u8(v3, idx_map);
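
		/* Store the swapped headers back into the packet buffers. */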
		vst1q_u8((uint8_t *)eth_hdr[0], v0);
		vst1q_u8((uint8_t *)eth_hdr[1], v1);
		vst1q_u8((uint8_t *)eth_hdr[2], v2);
		vst1q_u8((uint8_t *)eth_hdr[3], v3);
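
		/* Apply the Tx offload flags to each mbuf. */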
		mbuf_field_set(mb[0], ol_flags);
		mbuf_field_set(mb[1], ol_flags);
		mbuf_field_set(mb[2], ol_flags);
		mbuf_field_set(mb[3], ol_flags);
		r -= 4;
	}
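
	/* Handle the remaining packets (at most three) one at a time. */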
	for ( ; i < nb; i++) {
		if (i < nb - 1)
			rte_prefetch0(rte_pktmbuf_mtod(pkts[i + 1], void *));
		mb[0] = pkts[i];
		eth_hdr[0] = rte_pktmbuf_mtod(mb[0], struct rte_ether_hdr *);

		/* Swap dest and src mac addresses. */
		v0 = vld1q_u8((uint8_t const *)eth_hdr[0]);
		v0 = vqtbl1q_u8(v0, idx_map);
		vst1q_u8((uint8_t *)eth_hdr[0], v0);

		mbuf_field_set(mb[0], ol_flags);
	}
}

#endif /* _MACSWAP_NEON_H_ */