/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2015 Cavium, Inc
 */

#ifndef _RTE_MEMCPY_ARM64_H_
#define _RTE_MEMCPY_ARM64_H_

#include <stdint.h>
#include <string.h>

#include "generic/rte_memcpy.h"

#ifdef RTE_ARCH_ARM64_MEMCPY
#include <rte_common.h>
#include <rte_branch_prediction.h>

#ifdef __cplusplus
extern "C" {
#endif

/*
 * Memory copy performance differs across AArch64 micro-architectures.
 * Recent glibc versions (e.g. 2.23 or later) also provide a faster memcpy()
 * than older ones. Using a more recent glibc is always recommended when
 * possible, as the entire system benefits from it.
 *
 * This implementation improves memory copy on some AArch64 micro-architectures
 * when an old glibc (e.g. 2.19, 2.17...) is being used. It is disabled by
 * default and requires "RTE_ARCH_ARM64_MEMCPY" to be defined to activate it.
 * It does not always outperform memcpy(), so users need to run the
 * "memcpy_perf_autotest" unit test and tune the parameters in the
 * customization section below for best performance.
 *
 * The compiler version also impacts rte_memcpy() performance. On some
 * platforms, binaries built from the same code with GCC 7.2.0 have been
 * observed to outperform binaries built with GCC 4.8.5.
 */

/**************************************
 * Beginning of customization section
 **************************************/
#ifndef RTE_ARM64_MEMCPY_ALIGN_MASK
#define RTE_ARM64_MEMCPY_ALIGN_MASK ((RTE_CACHE_LINE_SIZE >> 3) - 1)
#endif

#ifndef RTE_ARM64_MEMCPY_STRICT_ALIGN
/* Only src unalignment is treated as an unaligned copy */
#define RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY(dst, src) \
	((uintptr_t)(src) & RTE_ARM64_MEMCPY_ALIGN_MASK)
#else
/* Unalignment of either dst or src is treated as an unaligned copy */
#define RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY(dst, src) \
	(((uintptr_t)(dst) | (uintptr_t)(src)) & RTE_ARM64_MEMCPY_ALIGN_MASK)
#endif
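
/*
 * Example: with a 64-byte cache line, RTE_ARM64_MEMCPY_ALIGN_MASK above is
 * (64 >> 3) - 1 = 7, so in the default (non-strict) mode a copy is treated
 * as unaligned whenever the source address is not 8-byte aligned, while the
 * destination alignment is ignored.
 */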


/*
 * If the copy size is larger than the threshold, memcpy() will be used.
 * Run "memcpy_perf_autotest" to determine the proper threshold.
 */
#ifdef RTE_ARM64_MEMCPY_ALIGNED_THRESHOLD
#define USE_ALIGNED_RTE_MEMCPY(dst, src, n) \
(!RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY(dst, src) && \
n <= (size_t)RTE_ARM64_MEMCPY_ALIGNED_THRESHOLD)
#else
#define USE_ALIGNED_RTE_MEMCPY(dst, src, n) \
(!RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY(dst, src))
#endif
#ifdef RTE_ARM64_MEMCPY_UNALIGNED_THRESHOLD
#define USE_UNALIGNED_RTE_MEMCPY(dst, src, n) \
(RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY(dst, src) && \
n <= (size_t)RTE_ARM64_MEMCPY_UNALIGNED_THRESHOLD)
#else
#define USE_UNALIGNED_RTE_MEMCPY(dst, src, n) \
(RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY(dst, src))
#endif
/*
 * The logic of USE_RTE_MEMCPY() can also be modified to best fit the platform.
 */
#if defined(RTE_ARM64_MEMCPY_ALIGNED_THRESHOLD) \
|| defined(RTE_ARM64_MEMCPY_UNALIGNED_THRESHOLD)
#define USE_RTE_MEMCPY(dst, src, n) \
(USE_ALIGNED_RTE_MEMCPY(dst, src, n) || USE_UNALIGNED_RTE_MEMCPY(dst, src, n))
#else
#define USE_RTE_MEMCPY(dst, src, n) (1)
#endif
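
/*
 * Example (hypothetical values): the customization macros above can be
 * supplied on the compiler command line (e.g. via the build system's C flags)
 * after measuring with the "memcpy_perf_autotest" unit test:
 *
 *   -DRTE_ARCH_ARM64_MEMCPY
 *   -DRTE_ARM64_MEMCPY_ALIGNED_THRESHOLD=2048
 *   -DRTE_ARM64_MEMCPY_UNALIGNED_THRESHOLD=512
 *
 * The threshold values shown are placeholders for illustration only; suitable
 * values depend on the target micro-architecture and must be measured.
 */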
/**************************************
 * End of customization section
 **************************************/


#if RTE_CC_IS_GNU && !defined RTE_ARM64_MEMCPY_SKIP_GCC_VER_CHECK
#if (GCC_VERSION < 50400)
#warning "The GCC version is quite old, which may result in sub-optimal \
performance of the compiled code. It is suggested that at least GCC 5.4.0 \
be used."
#endif
#endif

static __rte_always_inline
void rte_mov16(uint8_t *dst, const uint8_t *src)
{
	__uint128_t *dst128 = (__uint128_t *)dst;
	const __uint128_t *src128 = (const __uint128_t *)src;
	*dst128 = *src128;
}

static __rte_always_inline
void rte_mov32(uint8_t *dst, const uint8_t *src)
{
	__uint128_t *dst128 = (__uint128_t *)dst;
	const __uint128_t *src128 = (const __uint128_t *)src;
	const __uint128_t x0 = src128[0], x1 = src128[1];
	dst128[0] = x0;
	dst128[1] = x1;
}

static __rte_always_inline
void rte_mov48(uint8_t *dst, const uint8_t *src)
{
	__uint128_t *dst128 = (__uint128_t *)dst;
	const __uint128_t *src128 = (const __uint128_t *)src;
	const __uint128_t x0 = src128[0], x1 = src128[1], x2 = src128[2];
	dst128[0] = x0;
	dst128[1] = x1;
	dst128[2] = x2;
}

static __rte_always_inline
void rte_mov64(uint8_t *dst, const uint8_t *src)
{
	__uint128_t *dst128 = (__uint128_t *)dst;
	const __uint128_t *src128 = (const __uint128_t *)src;
	const __uint128_t
		x0 = src128[0], x1 = src128[1], x2 = src128[2], x3 = src128[3];
	dst128[0] = x0;
	dst128[1] = x1;
	dst128[2] = x2;
	dst128[3] = x3;
}

static __rte_always_inline
void rte_mov128(uint8_t *dst, const uint8_t *src)
{
	__uint128_t *dst128 = (__uint128_t *)dst;
	const __uint128_t *src128 = (const __uint128_t *)src;
	/*
	 * Keep the declaration & copy sequence below as-is; interleaving the
	 * loads and stores lets the compiler schedule them into optimized
	 * load/store instructions.
	 */
	const __uint128_t
		x0 = src128[0], x1 = src128[1], x2 = src128[2], x3 = src128[3];
	dst128[0] = x0;
	__uint128_t x4 = src128[4];
	dst128[1] = x1;
	__uint128_t x5 = src128[5];
	dst128[2] = x2;
	__uint128_t x6 = src128[6];
	dst128[3] = x3;
	__uint128_t x7 = src128[7];
	dst128[4] = x4;
	dst128[5] = x5;
	dst128[6] = x6;
	dst128[7] = x7;
}

static __rte_always_inline
void rte_mov256(uint8_t *dst, const uint8_t *src)
{
	rte_mov128(dst, src);
	rte_mov128(dst + 128, src + 128);
}

static __rte_always_inline void
rte_memcpy_lt16(uint8_t *dst, const uint8_t *src, size_t n)
{
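	/*
	 * Each branch below copies the first and the last word of the given
	 * size class; the two stores may overlap, which covers every length
	 * in the range without a byte-by-byte loop.
	 */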
	if (n & 0x08) {
		/* copy 8 ~ 15 bytes */
		*(uint64_t *)dst = *(const uint64_t *)src;
		*(uint64_t *)(dst - 8 + n) = *(const uint64_t *)(src - 8 + n);
	} else if (n & 0x04) {
		/* copy 4 ~ 7 bytes */
		*(uint32_t *)dst = *(const uint32_t *)src;
		*(uint32_t *)(dst - 4 + n) = *(const uint32_t *)(src - 4 + n);
	} else if (n & 0x02) {
		/* copy 2 ~ 3 bytes */
		*(uint16_t *)dst = *(const uint16_t *)src;
		*(uint16_t *)(dst - 2 + n) = *(const uint16_t *)(src - 2 + n);
	} else if (n & 0x01) {
		/* copy 1 byte */
		*dst = *src;
	}
}

static __rte_always_inline
void rte_memcpy_ge16_lt128(uint8_t *dst, const uint8_t *src, size_t n)
{
	if (n < 64) {
		if (n == 16) {
			rte_mov16(dst, src);
		} else if (n <= 32) {
			rte_mov16(dst, src);
			rte_mov16(dst - 16 + n, src - 16 + n);
		} else if (n <= 48) {
			rte_mov32(dst, src);
			rte_mov16(dst - 16 + n, src - 16 + n);
		} else {
			rte_mov48(dst, src);
			rte_mov16(dst - 16 + n, src - 16 + n);
		}
	} else {
		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
		if (n > 48 + 64)
			rte_mov64(dst - 64 + n, src - 64 + n);
		else if (n > 32 + 64)
			rte_mov48(dst - 48 + n, src - 48 + n);
		else if (n > 16 + 64)
			rte_mov32(dst - 32 + n, src - 32 + n);
		else if (n > 64)
			rte_mov16(dst - 16 + n, src - 16 + n);
	}
}

static __rte_always_inline
void rte_memcpy_ge128(uint8_t *dst, const uint8_t *src, size_t n)
{
	do {
		rte_mov128(dst, src);
		src += 128;
		dst += 128;
		n -= 128;
	} while (likely(n >= 128));

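	/*
	 * Copy the remaining 1..127 bytes; the stores may overlap bytes
	 * that have already been written above.
	 */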
	if (likely(n)) {
		if (n <= 16)
			rte_mov16(dst - 16 + n, src - 16 + n);
		else if (n <= 32)
			rte_mov32(dst - 32 + n, src - 32 + n);
		else if (n <= 48)
			rte_mov48(dst - 48 + n, src - 48 + n);
		else if (n <= 64)
			rte_mov64(dst - 64 + n, src - 64 + n);
		else
			rte_memcpy_ge16_lt128(dst, src, n);
	}
}

static __rte_always_inline
void rte_memcpy_ge16_lt64(uint8_t *dst, const uint8_t *src, size_t n)
{
	if (n == 16) {
		rte_mov16(dst, src);
	} else if (n <= 32) {
		rte_mov16(dst, src);
		rte_mov16(dst - 16 + n, src - 16 + n);
	} else if (n <= 48) {
		rte_mov32(dst, src);
		rte_mov16(dst - 16 + n, src - 16 + n);
	} else {
		rte_mov48(dst, src);
		rte_mov16(dst - 16 + n, src - 16 + n);
	}
}

static __rte_always_inline
void rte_memcpy_ge64(uint8_t *dst, const uint8_t *src, size_t n)
{
	do {
		rte_mov64(dst, src);
		src += 64;
		dst += 64;
		n -= 64;
	} while (likely(n >= 64));

	if (likely(n)) {
		if (n <= 16)
			rte_mov16(dst - 16 + n, src - 16 + n);
		else if (n <= 32)
			rte_mov32(dst - 32 + n, src - 32 + n);
		else if (n <= 48)
			rte_mov48(dst - 48 + n, src - 48 + n);
		else
			rte_mov64(dst - 64 + n, src - 64 + n);
	}
}

#if RTE_CACHE_LINE_SIZE >= 128
static __rte_always_inline
void *rte_memcpy(void *dst, const void *src, size_t n)
{
	if (n < 16) {
		rte_memcpy_lt16((uint8_t *)dst, (const uint8_t *)src, n);
		return dst;
	}
	if (n < 128) {
		rte_memcpy_ge16_lt128((uint8_t *)dst, (const uint8_t *)src, n);
		return dst;
	}
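	/*
	 * Prefetch src for reading and dst for writing; the locality hint 0
	 * indicates the data need not remain in the cache afterwards.
	 */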
	__builtin_prefetch(src, 0, 0);
	__builtin_prefetch(dst, 1, 0);
	if (likely(USE_RTE_MEMCPY(dst, src, n))) {
		rte_memcpy_ge128((uint8_t *)dst, (const uint8_t *)src, n);
		return dst;
	} else
		return memcpy(dst, src, n);
}

#else
static __rte_always_inline
void *rte_memcpy(void *dst, const void *src, size_t n)
{
	if (n < 16) {
		rte_memcpy_lt16((uint8_t *)dst, (const uint8_t *)src, n);
		return dst;
	}
	if (n < 64) {
		rte_memcpy_ge16_lt64((uint8_t *)dst, (const uint8_t *)src, n);
		return dst;
	}
	__builtin_prefetch(src, 0, 0);
	__builtin_prefetch(dst, 1, 0);
	if (likely(USE_RTE_MEMCPY(dst, src, n))) {
		rte_memcpy_ge64((uint8_t *)dst, (const uint8_t *)src, n);
		return dst;
	} else
		return memcpy(dst, src, n);
}
#endif /* RTE_CACHE_LINE_SIZE >= 128 */
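
/*
 * Minimal usage sketch (buffer names are illustrative only):
 *
 *	uint8_t in[256], out[256];
 *
 *	rte_memcpy(out, in, sizeof(in));
 *
 * Large copies that fail the USE_RTE_MEMCPY() check fall back to the
 * standard memcpy().
 */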

#ifdef __cplusplus
}
#endif

#else /* RTE_ARCH_ARM64_MEMCPY */

#ifdef __cplusplus
extern "C" {
#endif

static inline void
rte_mov16(uint8_t *dst, const uint8_t *src)
{
	memcpy(dst, src, 16);
}

static inline void
rte_mov32(uint8_t *dst, const uint8_t *src)
{
	memcpy(dst, src, 32);
}

static inline void
rte_mov48(uint8_t *dst, const uint8_t *src)
{
	memcpy(dst, src, 48);
}

static inline void
rte_mov64(uint8_t *dst, const uint8_t *src)
{
	memcpy(dst, src, 64);
}

static inline void
rte_mov128(uint8_t *dst, const uint8_t *src)
{
	memcpy(dst, src, 128);
}

static inline void
rte_mov256(uint8_t *dst, const uint8_t *src)
{
	memcpy(dst, src, 256);
}

#define rte_memcpy(d, s, n)	memcpy((d), (s), (n))

#ifdef __cplusplus
}
#endif

#endif /* RTE_ARCH_ARM64_MEMCPY */

#endif /* _RTE_MEMCPY_ARM64_H_ */