xref: /dpdk/lib/eal/arm/include/rte_memcpy_32.h (revision 719834a6849e1daf4a70ff7742bbcc3ae7e25607)
/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2015 RehiveTech. All rights reserved.
 */
499a2dd95SBruce Richardson 
599a2dd95SBruce Richardson #ifndef _RTE_MEMCPY_ARM32_H_
699a2dd95SBruce Richardson #define _RTE_MEMCPY_ARM32_H_
799a2dd95SBruce Richardson 
899a2dd95SBruce Richardson #include <stdint.h>
999a2dd95SBruce Richardson #include <string.h>
1099a2dd95SBruce Richardson 
1199a2dd95SBruce Richardson #include "generic/rte_memcpy.h"
1299a2dd95SBruce Richardson 
1399a2dd95SBruce Richardson #ifdef RTE_ARCH_ARM_NEON_MEMCPY
1499a2dd95SBruce Richardson 
1599a2dd95SBruce Richardson #ifndef __ARM_NEON
1699a2dd95SBruce Richardson #error "Cannot optimize memcpy by NEON as the CPU seems to not support this"
1799a2dd95SBruce Richardson #endif
1899a2dd95SBruce Richardson 
1999a2dd95SBruce Richardson /* ARM NEON Intrinsics are used to copy data */
2099a2dd95SBruce Richardson #include <arm_neon.h>
2199a2dd95SBruce Richardson 
22*719834a6SMattias Rönnblom #ifdef __cplusplus
23*719834a6SMattias Rönnblom extern "C" {
24*719834a6SMattias Rönnblom #endif
25*719834a6SMattias Rönnblom 
/**
 * Copy 16 bytes from src to dst with a single NEON quad-word load
 * (vld1q_u8) followed by a single quad-word store (vst1q_u8).
 *
 * @param dst
 *   Destination; the 16 bytes starting here are written.
 * @param src
 *   Source; the 16 bytes starting here are read.
 */
static inline void
rte_mov16(uint8_t *dst, const uint8_t *src)
{
	vst1q_u8(dst, vld1q_u8(src));
}
3199a2dd95SBruce Richardson 
3299a2dd95SBruce Richardson static inline void
3399a2dd95SBruce Richardson rte_mov32(uint8_t *dst, const uint8_t *src)
3499a2dd95SBruce Richardson {
3599a2dd95SBruce Richardson 	asm volatile (
3699a2dd95SBruce Richardson 		"vld1.8 {d0-d3}, [%0]\n\t"
3799a2dd95SBruce Richardson 		"vst1.8 {d0-d3}, [%1]\n\t"
3899a2dd95SBruce Richardson 		: "+r" (src), "+r" (dst)
3999a2dd95SBruce Richardson 		: : "memory", "d0", "d1", "d2", "d3");
4099a2dd95SBruce Richardson }
4199a2dd95SBruce Richardson 
4299a2dd95SBruce Richardson static inline void
4399a2dd95SBruce Richardson rte_mov48(uint8_t *dst, const uint8_t *src)
4499a2dd95SBruce Richardson {
4599a2dd95SBruce Richardson 	asm volatile (
4699a2dd95SBruce Richardson 		"vld1.8 {d0-d3}, [%0]!\n\t"
4799a2dd95SBruce Richardson 		"vld1.8 {d4-d5}, [%0]\n\t"
4899a2dd95SBruce Richardson 		"vst1.8 {d0-d3}, [%1]!\n\t"
4999a2dd95SBruce Richardson 		"vst1.8 {d4-d5}, [%1]\n\t"
5099a2dd95SBruce Richardson 		: "+r" (src), "+r" (dst)
5199a2dd95SBruce Richardson 		:
5299a2dd95SBruce Richardson 		: "memory", "d0", "d1", "d2", "d3", "d4", "d5");
5399a2dd95SBruce Richardson }
5499a2dd95SBruce Richardson 
5599a2dd95SBruce Richardson static inline void
5699a2dd95SBruce Richardson rte_mov64(uint8_t *dst, const uint8_t *src)
5799a2dd95SBruce Richardson {
5899a2dd95SBruce Richardson 	asm volatile (
5999a2dd95SBruce Richardson 		"vld1.8 {d0-d3}, [%0]!\n\t"
6099a2dd95SBruce Richardson 		"vld1.8 {d4-d7}, [%0]\n\t"
6199a2dd95SBruce Richardson 		"vst1.8 {d0-d3}, [%1]!\n\t"
6299a2dd95SBruce Richardson 		"vst1.8 {d4-d7}, [%1]\n\t"
6399a2dd95SBruce Richardson 		: "+r" (src), "+r" (dst)
6499a2dd95SBruce Richardson 		:
6599a2dd95SBruce Richardson 		: "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7");
6699a2dd95SBruce Richardson }
6799a2dd95SBruce Richardson 
6899a2dd95SBruce Richardson static inline void
6999a2dd95SBruce Richardson rte_mov128(uint8_t *dst, const uint8_t *src)
7099a2dd95SBruce Richardson {
7199a2dd95SBruce Richardson 	asm volatile ("pld [%0, #64]" : : "r" (src));
7299a2dd95SBruce Richardson 	asm volatile (
7399a2dd95SBruce Richardson 		"vld1.8 {d0-d3},   [%0]!\n\t"
7499a2dd95SBruce Richardson 		"vld1.8 {d4-d7},   [%0]!\n\t"
7599a2dd95SBruce Richardson 		"vld1.8 {d8-d11},  [%0]!\n\t"
7699a2dd95SBruce Richardson 		"vld1.8 {d12-d15}, [%0]\n\t"
7799a2dd95SBruce Richardson 		"vst1.8 {d0-d3},   [%1]!\n\t"
7899a2dd95SBruce Richardson 		"vst1.8 {d4-d7},   [%1]!\n\t"
7999a2dd95SBruce Richardson 		"vst1.8 {d8-d11},  [%1]!\n\t"
8099a2dd95SBruce Richardson 		"vst1.8 {d12-d15}, [%1]\n\t"
8199a2dd95SBruce Richardson 		: "+r" (src), "+r" (dst)
8299a2dd95SBruce Richardson 		:
8399a2dd95SBruce Richardson 		: "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
8499a2dd95SBruce Richardson 		"d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15");
8599a2dd95SBruce Richardson }
8699a2dd95SBruce Richardson 
8799a2dd95SBruce Richardson static inline void
8899a2dd95SBruce Richardson rte_mov256(uint8_t *dst, const uint8_t *src)
8999a2dd95SBruce Richardson {
9099a2dd95SBruce Richardson 	asm volatile ("pld [%0,  #64]" : : "r" (src));
9199a2dd95SBruce Richardson 	asm volatile ("pld [%0, #128]" : : "r" (src));
9299a2dd95SBruce Richardson 	asm volatile ("pld [%0, #192]" : : "r" (src));
9399a2dd95SBruce Richardson 	asm volatile ("pld [%0, #256]" : : "r" (src));
9499a2dd95SBruce Richardson 	asm volatile ("pld [%0, #320]" : : "r" (src));
9599a2dd95SBruce Richardson 	asm volatile ("pld [%0, #384]" : : "r" (src));
9699a2dd95SBruce Richardson 	asm volatile ("pld [%0, #448]" : : "r" (src));
9799a2dd95SBruce Richardson 	asm volatile (
9899a2dd95SBruce Richardson 		"vld1.8 {d0-d3},   [%0]!\n\t"
9999a2dd95SBruce Richardson 		"vld1.8 {d4-d7},   [%0]!\n\t"
10099a2dd95SBruce Richardson 		"vld1.8 {d8-d11},  [%0]!\n\t"
10199a2dd95SBruce Richardson 		"vld1.8 {d12-d15}, [%0]!\n\t"
10299a2dd95SBruce Richardson 		"vld1.8 {d16-d19}, [%0]!\n\t"
10399a2dd95SBruce Richardson 		"vld1.8 {d20-d23}, [%0]!\n\t"
10499a2dd95SBruce Richardson 		"vld1.8 {d24-d27}, [%0]!\n\t"
10599a2dd95SBruce Richardson 		"vld1.8 {d28-d31}, [%0]\n\t"
10699a2dd95SBruce Richardson 		"vst1.8 {d0-d3},   [%1]!\n\t"
10799a2dd95SBruce Richardson 		"vst1.8 {d4-d7},   [%1]!\n\t"
10899a2dd95SBruce Richardson 		"vst1.8 {d8-d11},  [%1]!\n\t"
10999a2dd95SBruce Richardson 		"vst1.8 {d12-d15}, [%1]!\n\t"
11099a2dd95SBruce Richardson 		"vst1.8 {d16-d19}, [%1]!\n\t"
11199a2dd95SBruce Richardson 		"vst1.8 {d20-d23}, [%1]!\n\t"
11299a2dd95SBruce Richardson 		"vst1.8 {d24-d27}, [%1]!\n\t"
11399a2dd95SBruce Richardson 		"vst1.8 {d28-d31}, [%1]!\n\t"
11499a2dd95SBruce Richardson 		: "+r" (src), "+r" (dst)
11599a2dd95SBruce Richardson 		:
11699a2dd95SBruce Richardson 		: "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
11799a2dd95SBruce Richardson 		"d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15",
11899a2dd95SBruce Richardson 		"d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23",
11999a2dd95SBruce Richardson 		"d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31");
12099a2dd95SBruce Richardson }
12199a2dd95SBruce Richardson 
/**
 * Copy n bytes from src to dst and yield the destination pointer.
 *
 * When the compiler can prove that n is a compile-time constant the copy
 * is handed to memcpy(); otherwise it goes to rte_memcpy_func().
 * Each argument appears exactly once in each arm of the conditional, so
 * only one arm's copy of the arguments is evaluated at run time.
 *
 * Relies on the GNU statement-expression and __builtin_constant_p
 * extensions, hence the __extension__ marker.
 */
#define rte_memcpy(dst, src, n)              \
	__extension__ ({                     \
	(__builtin_constant_p(n)) ?          \
	memcpy((dst), (src), (n)) :          \
	rte_memcpy_func((dst), (src), (n)); })
12799a2dd95SBruce Richardson 
/**
 * Copy n bytes from src to dst when n is only known at run time.
 *
 * Strategy by size:
 *   - n < 16:   one fixed-size piece (1, 2, 4, 8 bytes) per set bit of n;
 *   - n <= 128: two copies of 16, 32 or 64 bytes, one anchored at the
 *               start and one at the end of the range, which may overlap
 *               each other in the middle;
 *   - n > 128:  256-byte blocks, then up to three 64-byte blocks, then up
 *               to three 16-byte blocks, then one final 16-byte copy
 *               anchored at the end of the range.
 *
 * The end-anchored copies re-write bytes that were already copied, so the
 * routine is only meaningful for non-overlapping source and destination,
 * as with memcpy().
 *
 * @param dst
 *   Destination buffer, at least n bytes.
 * @param src
 *   Source buffer, at least n bytes.
 * @param n
 *   Number of bytes to copy; 0 copies nothing.
 * @return
 *   The original value of dst.
 */
static inline void *
rte_memcpy_func(void *dst, const void *src, size_t n)
{
	void *ret = dst;

	/*
	 * Fewer than 16 bytes cannot fill a NEON quad-word register, so the
	 * copy is done piecewise. Fixed-size memcpy() calls are used rather
	 * than dereferencing uint16_t/uint32_t casts: dst and src carry no
	 * alignment guarantee, and accessing them through wider integer
	 * lvalues would be undefined behaviour (alignment and aliasing).
	 */
	if (n < 16) {
		if (n & 0x01) {
			*(uint8_t *)dst = *(const uint8_t *)src;
			dst = (uint8_t *)dst + 1;
			src = (const uint8_t *)src + 1;
		}
		if (n & 0x02) {
			memcpy(dst, src, 2);
			dst = (uint8_t *)dst + 2;
			src = (const uint8_t *)src + 2;
		}
		if (n & 0x04) {
			memcpy(dst, src, 4);
			dst = (uint8_t *)dst + 4;
			src = (const uint8_t *)src + 4;
		}
		if (n & 0x08)
			memcpy(dst, src, 8);
		return ret;
	}

	/*
	 * Special fast cases for <= 128 bytes: copy the head and the tail
	 * with the same fixed size; the two copies may overlap in the middle.
	 */
	if (n <= 32) {
		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
		rte_mov16((uint8_t *)dst - 16 + n,
			(const uint8_t *)src - 16 + n);
		return ret;
	}

	if (n <= 64) {
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		rte_mov32((uint8_t *)dst - 32 + n,
			(const uint8_t *)src - 32 + n);
		return ret;
	}

	if (n <= 128) {
		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
		rte_mov64((uint8_t *)dst - 64 + n,
			(const uint8_t *)src - 64 + n);
		return ret;
	}

	/*
	 * For large copies > 128 bytes. This combination of 256, 64 and 16 byte
	 * copies was found to be faster than doing 128 and 32 byte copies as
	 * well.
	 */
	for ( ; n >= 256; n -= 256) {
		rte_mov256((uint8_t *)dst, (const uint8_t *)src);
		dst = (uint8_t *)dst + 256;
		src = (const uint8_t *)src + 256;
	}

	/*
	 * We split the remaining bytes (which will be less than 256) into
	 * 64byte (2^6) chunks.
	 * Using incrementing integers in the case labels of a switch statement
	 * encourages the compiler to use a jump table. To get incrementing
	 * integers, we shift the 2 relevant bits to the LSB position to first
	 * get decrementing integers, and then subtract.
	 */
	switch (3 - (n >> 6)) {
	case 0x00:
		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
		n -= 64;
		dst = (uint8_t *)dst + 64;
		src = (const uint8_t *)src + 64;      /* fallthrough */
	case 0x01:
		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
		n -= 64;
		dst = (uint8_t *)dst + 64;
		src = (const uint8_t *)src + 64;      /* fallthrough */
	case 0x02:
		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
		n -= 64;
		dst = (uint8_t *)dst + 64;
		src = (const uint8_t *)src + 64;      /* fallthrough */
	default:
		break;
	}

	/*
	 * We split the remaining bytes (which will be less than 64) into
	 * 16byte (2^4) chunks, using the same switch structure as above.
	 */
	switch (3 - (n >> 4)) {
	case 0x00:
		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
		n -= 16;
		dst = (uint8_t *)dst + 16;
		src = (const uint8_t *)src + 16;      /* fallthrough */
	case 0x01:
		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
		n -= 16;
		dst = (uint8_t *)dst + 16;
		src = (const uint8_t *)src + 16;      /* fallthrough */
	case 0x02:
		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
		n -= 16;
		dst = (uint8_t *)dst + 16;
		src = (const uint8_t *)src + 16;      /* fallthrough */
	default:
		break;
	}

	/*
	 * Copy any remaining bytes (fewer than 16) without going beyond the
	 * end of the buffers: the 16-byte window is anchored at the end of
	 * the range and re-copies bytes already written above.
	 */
	if (n != 0)
		rte_mov16((uint8_t *)dst - 16 + n,
			(const uint8_t *)src - 16 + n);
	return ret;
}
25499a2dd95SBruce Richardson 
25599a2dd95SBruce Richardson #else
25699a2dd95SBruce Richardson 
/**
 * Copy 16 bytes from src to dst using memcpy().
 *
 * This is the variant compiled when RTE_ARCH_ARM_NEON_MEMCPY is not
 * defined.
 *
 * @param dst
 *   Destination; the 16 bytes starting here are written.
 * @param src
 *   Source; the 16 bytes starting here are read.
 */
static inline void
rte_mov16(uint8_t *dst, const uint8_t *src)
{
	memcpy(dst, src, 16);
}
26299a2dd95SBruce Richardson 
/**
 * Copy 32 bytes from src to dst using memcpy().
 *
 * This is the variant compiled when RTE_ARCH_ARM_NEON_MEMCPY is not
 * defined.
 *
 * @param dst
 *   Destination; the 32 bytes starting here are written.
 * @param src
 *   Source; the 32 bytes starting here are read.
 */
static inline void
rte_mov32(uint8_t *dst, const uint8_t *src)
{
	memcpy(dst, src, 32);
}
26899a2dd95SBruce Richardson 
/**
 * Copy 48 bytes from src to dst using memcpy().
 *
 * This is the variant compiled when RTE_ARCH_ARM_NEON_MEMCPY is not
 * defined.
 *
 * @param dst
 *   Destination; the 48 bytes starting here are written.
 * @param src
 *   Source; the 48 bytes starting here are read.
 */
static inline void
rte_mov48(uint8_t *dst, const uint8_t *src)
{
	memcpy(dst, src, 48);
}
27499a2dd95SBruce Richardson 
/**
 * Copy 64 bytes from src to dst using memcpy().
 *
 * This is the variant compiled when RTE_ARCH_ARM_NEON_MEMCPY is not
 * defined.
 *
 * @param dst
 *   Destination; the 64 bytes starting here are written.
 * @param src
 *   Source; the 64 bytes starting here are read.
 */
static inline void
rte_mov64(uint8_t *dst, const uint8_t *src)
{
	memcpy(dst, src, 64);
}
28099a2dd95SBruce Richardson 
/**
 * Copy 128 bytes from src to dst using memcpy().
 *
 * This is the variant compiled when RTE_ARCH_ARM_NEON_MEMCPY is not
 * defined.
 *
 * @param dst
 *   Destination; the 128 bytes starting here are written.
 * @param src
 *   Source; the 128 bytes starting here are read.
 */
static inline void
rte_mov128(uint8_t *dst, const uint8_t *src)
{
	memcpy(dst, src, 128);
}
28699a2dd95SBruce Richardson 
/**
 * Copy 256 bytes from src to dst using memcpy().
 *
 * This is the variant compiled when RTE_ARCH_ARM_NEON_MEMCPY is not
 * defined.
 *
 * @param dst
 *   Destination; the 256 bytes starting here are written.
 * @param src
 *   Source; the 256 bytes starting here are read.
 */
static inline void
rte_mov256(uint8_t *dst, const uint8_t *src)
{
	memcpy(dst, src, 256);
}
29299a2dd95SBruce Richardson 
/**
 * Copy n bytes from src to dst by forwarding to memcpy().
 *
 * This is the variant compiled when RTE_ARCH_ARM_NEON_MEMCPY is not
 * defined.
 *
 * @param dst
 *   Destination buffer, at least n bytes.
 * @param src
 *   Source buffer, at least n bytes.
 * @param n
 *   Number of bytes to copy.
 * @return
 *   The value returned by memcpy(), i.e. dst.
 */
static inline void *
rte_memcpy(void *dst, const void *src, size_t n)
{
	return memcpy(dst, src, n);
}
29899a2dd95SBruce Richardson 
29999a2dd95SBruce Richardson #endif /* RTE_ARCH_ARM_NEON_MEMCPY */
30099a2dd95SBruce Richardson 
30199a2dd95SBruce Richardson #ifdef __cplusplus
30299a2dd95SBruce Richardson }
30399a2dd95SBruce Richardson #endif
30499a2dd95SBruce Richardson 
30599a2dd95SBruce Richardson #endif /* _RTE_MEMCPY_ARM32_H_ */
306