/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2014 Intel Corporation
 */

#ifndef _RTE_MEMCPY_X86_64_H_
#define _RTE_MEMCPY_X86_64_H_

/**
 * @file
 *
 * Functions for SSE/AVX/AVX2/AVX512 implementation of memcpy().
 */

#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <rte_vect.h>
#include <rte_common.h>
#include <rte_config.h>

#ifdef __cplusplus
extern "C" {
#endif

#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION >= 100000)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wstringop-overflow"
#endif

/*
 * GCC older than version 11 doesn't compile AVX properly, so use SSE instead.
 * There are no problems with AVX2.
 */
#if defined __AVX2__
#define RTE_MEMCPY_AVX
#elif defined __AVX__ && !(defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION < 110000))
#define RTE_MEMCPY_AVX
#endif

/**
 * Copy bytes from one location to another. The locations must not overlap.
 *
 * @note This is implemented as a macro, so its address should not be taken,
 * and care is needed as parameter expressions may be evaluated multiple times.
 *
 * @param dst
 *   Pointer to the destination of the data.
 * @param src
 *   Pointer to the source data.
 * @param n
 *   Number of bytes to copy.
 * @return
 *   Pointer to the destination data.
 */
static __rte_always_inline void *
rte_memcpy(void *dst, const void *src, size_t n);
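
/*
 * Example (illustrative sketch only): the buffer names and size below are
 * arbitrary; the only requirements are that the two regions do not overlap
 * and that n bytes are valid at both pointers.
 *
 *   uint8_t src_buf[256], dst_buf[256];
 *   rte_memcpy(dst_buf, src_buf, sizeof(src_buf));
 */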

/**
 * Copy bytes from one location to another,
 * locations should not overlap.
 * Use with n <= 15.
 */
static __rte_always_inline void *
rte_mov15_or_less(void *dst, const void *src, size_t n)
{
	/**
	 * Use the following structs to avoid violating C standard
	 * alignment requirements and to avoid strict aliasing bugs
	 */
	struct __rte_packed_begin rte_uint64_alias {
		uint64_t val;
	} __rte_packed_end __rte_may_alias;
	struct __rte_packed_begin rte_uint32_alias {
		uint32_t val;
	} __rte_packed_end __rte_may_alias;
	struct __rte_packed_begin rte_uint16_alias {
		uint16_t val;
	} __rte_packed_end __rte_may_alias;

	void *ret = dst;
	if (n & 8) {
		((struct rte_uint64_alias *)dst)->val =
			((const struct rte_uint64_alias *)src)->val;
		src = (const uint64_t *)src + 1;
		dst = (uint64_t *)dst + 1;
	}
	if (n & 4) {
		((struct rte_uint32_alias *)dst)->val =
			((const struct rte_uint32_alias *)src)->val;
		src = (const uint32_t *)src + 1;
		dst = (uint32_t *)dst + 1;
	}
	if (n & 2) {
		((struct rte_uint16_alias *)dst)->val =
			((const struct rte_uint16_alias *)src)->val;
		src = (const uint16_t *)src + 1;
		dst = (uint16_t *)dst + 1;
	}
	if (n & 1)
		*(uint8_t *)dst = *(const uint8_t *)src;
	return ret;
}
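
/*
 * Worked example (illustrative): for n = 13 (binary 1101), the tests above
 * copy 8 bytes, then 4 bytes, then the final 1 byte, i.e. one store per set
 * bit of n, so any length up to 15 needs at most four scalar stores.
 */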

/**
 * Copy 16 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov16(uint8_t *dst, const uint8_t *src)
{
	__m128i xmm0;

	xmm0 = _mm_loadu_si128((const __m128i *)(const void *)src);
	_mm_storeu_si128((__m128i *)(void *)dst, xmm0);
}

/**
 * Copy 32 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov32(uint8_t *dst, const uint8_t *src)
{
#if defined RTE_MEMCPY_AVX
	__m256i ymm0;

	ymm0 = _mm256_loadu_si256((const __m256i *)(const void *)src);
	_mm256_storeu_si256((__m256i *)(void *)dst, ymm0);
#else /* SSE implementation */
	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
#endif
}

/**
 * Copy 64 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov64(uint8_t *dst, const uint8_t *src)
{
#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
	__m512i zmm0;

	zmm0 = _mm512_loadu_si512((const void *)src);
	_mm512_storeu_si512((void *)dst, zmm0);
#else /* AVX2, AVX & SSE implementation */
	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
#endif
}

/**
 * Copy 128 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov128(uint8_t *dst, const uint8_t *src)
{
	rte_mov64(dst + 0 * 64, src + 0 * 64);
	rte_mov64(dst + 1 * 64, src + 1 * 64);
}

/**
 * Copy 256 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov256(uint8_t *dst, const uint8_t *src)
{
	rte_mov128(dst + 0 * 128, src + 0 * 128);
	rte_mov128(dst + 1 * 128, src + 1 * 128);
}

#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512

/**
 * AVX512 implementation below
 */

#define ALIGNMENT_MASK 0x3F

/**
 * Copy 128-byte blocks from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
{
	__m512i zmm0, zmm1;

	while (n >= 128) {
		zmm0 = _mm512_loadu_si512((const void *)(src + 0 * 64));
		n -= 128;
		zmm1 = _mm512_loadu_si512((const void *)(src + 1 * 64));
		src = src + 128;
		_mm512_storeu_si512((void *)(dst + 0 * 64), zmm0);
		_mm512_storeu_si512((void *)(dst + 1 * 64), zmm1);
		dst = dst + 128;
	}
}

/**
 * Copy 512-byte blocks from one location to another,
 * locations should not overlap.
 */
static inline void
rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
{
	__m512i zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7;

	while (n >= 512) {
		zmm0 = _mm512_loadu_si512((const void *)(src + 0 * 64));
		n -= 512;
		zmm1 = _mm512_loadu_si512((const void *)(src + 1 * 64));
		zmm2 = _mm512_loadu_si512((const void *)(src + 2 * 64));
		zmm3 = _mm512_loadu_si512((const void *)(src + 3 * 64));
		zmm4 = _mm512_loadu_si512((const void *)(src + 4 * 64));
		zmm5 = _mm512_loadu_si512((const void *)(src + 5 * 64));
		zmm6 = _mm512_loadu_si512((const void *)(src + 6 * 64));
		zmm7 = _mm512_loadu_si512((const void *)(src + 7 * 64));
		src = src + 512;
		_mm512_storeu_si512((void *)(dst + 0 * 64), zmm0);
		_mm512_storeu_si512((void *)(dst + 1 * 64), zmm1);
		_mm512_storeu_si512((void *)(dst + 2 * 64), zmm2);
		_mm512_storeu_si512((void *)(dst + 3 * 64), zmm3);
		_mm512_storeu_si512((void *)(dst + 4 * 64), zmm4);
		_mm512_storeu_si512((void *)(dst + 5 * 64), zmm5);
		_mm512_storeu_si512((void *)(dst + 6 * 64), zmm6);
		_mm512_storeu_si512((void *)(dst + 7 * 64), zmm7);
		dst = dst + 512;
	}
}

static __rte_always_inline void *
rte_memcpy_generic(void *dst, const void *src, size_t n)
{
	void *ret = dst;
	size_t dstofss;
	size_t bits;

	/**
	 * Copy less than 16 bytes
	 */
	if (n < 16) {
		return rte_mov15_or_less(dst, src, n);
	}

	/**
	 * Fast way when copy size doesn't exceed 512 bytes
	 */
	if (__rte_constant(n) && n == 32) {
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		return ret;
	}
	if (n <= 32) {
		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
		if (__rte_constant(n) && n == 16)
			return ret; /* avoid (harmless) duplicate copy */
		rte_mov16((uint8_t *)dst - 16 + n,
				  (const uint8_t *)src - 16 + n);
		return ret;
	}
	if (__rte_constant(n) && n == 64) {
		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
		return ret;
	}
	if (n <= 64) {
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		rte_mov32((uint8_t *)dst - 32 + n,
				  (const uint8_t *)src - 32 + n);
		return ret;
	}
	if (n <= 512) {
		if (n >= 256) {
			n -= 256;
			rte_mov256((uint8_t *)dst, (const uint8_t *)src);
			src = (const uint8_t *)src + 256;
			dst = (uint8_t *)dst + 256;
		}
		if (n >= 128) {
			n -= 128;
			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
			src = (const uint8_t *)src + 128;
			dst = (uint8_t *)dst + 128;
		}
COPY_BLOCK_128_BACK63:
		if (n > 64) {
			rte_mov64((uint8_t *)dst, (const uint8_t *)src);
			rte_mov64((uint8_t *)dst - 64 + n,
					  (const uint8_t *)src - 64 + n);
			return ret;
		}
		if (n > 0)
			rte_mov64((uint8_t *)dst - 64 + n,
					  (const uint8_t *)src - 64 + n);
		return ret;
	}

	/**
	 * Make store aligned when copy size exceeds 512 bytes
	 */
	dstofss = ((uintptr_t)dst & 0x3F);
	if (dstofss > 0) {
		dstofss = 64 - dstofss;
		n -= dstofss;
		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
		src = (const uint8_t *)src + dstofss;
		dst = (uint8_t *)dst + dstofss;
	}
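
	/*
	 * Worked example (illustrative): if ((uintptr_t)dst & 0x3F) == 7,
	 * dstofss becomes 64 - 7 = 57. The rte_mov64() above copies 64 bytes,
	 * so the last 7 of them are written again by the block copies below;
	 * after advancing both pointers by 57 bytes, dst is 64-byte aligned.
	 */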

	/**
	 * Copy 512-byte blocks.
	 * Use the block copy function for better instruction order control,
	 * which is important when the load is unaligned.
	 */
	rte_mov512blocks((uint8_t *)dst, (const uint8_t *)src, n);
	bits = n;
	n = n & 511;
	bits -= n;
	src = (const uint8_t *)src + bits;
	dst = (uint8_t *)dst + bits;

	/**
	 * Copy 128-byte blocks.
	 * Use the block copy function for better instruction order control,
	 * which is important when the load is unaligned.
	 */
	if (n >= 128) {
		rte_mov128blocks((uint8_t *)dst, (const uint8_t *)src, n);
		bits = n;
		n = n & 127;
		bits -= n;
		src = (const uint8_t *)src + bits;
		dst = (uint8_t *)dst + bits;
	}

	/**
	 * Copy whatever is left
	 */
	goto COPY_BLOCK_128_BACK63;
}

#elif defined RTE_MEMCPY_AVX

/**
 * AVX implementation below
 */

#define ALIGNMENT_MASK 0x1F

/**
 * Copy 128-byte blocks from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
{
	__m256i ymm0, ymm1, ymm2, ymm3;

	while (n >= 128) {
		ymm0 = _mm256_loadu_si256((const __m256i *)(const void *)
					  ((const uint8_t *)src + 0 * 32));
		n -= 128;
		ymm1 = _mm256_loadu_si256((const __m256i *)(const void *)
					  ((const uint8_t *)src + 1 * 32));
		ymm2 = _mm256_loadu_si256((const __m256i *)(const void *)
					  ((const uint8_t *)src + 2 * 32));
		ymm3 = _mm256_loadu_si256((const __m256i *)(const void *)
					  ((const uint8_t *)src + 3 * 32));
		src = (const uint8_t *)src + 128;
		_mm256_storeu_si256((__m256i *)(void *)
				    ((uint8_t *)dst + 0 * 32), ymm0);
		_mm256_storeu_si256((__m256i *)(void *)
				    ((uint8_t *)dst + 1 * 32), ymm1);
		_mm256_storeu_si256((__m256i *)(void *)
				    ((uint8_t *)dst + 2 * 32), ymm2);
		_mm256_storeu_si256((__m256i *)(void *)
				    ((uint8_t *)dst + 3 * 32), ymm3);
		dst = (uint8_t *)dst + 128;
	}
}

static __rte_always_inline void *
rte_memcpy_generic(void *dst, const void *src, size_t n)
{
	void *ret = dst;
	size_t dstofss;
	size_t bits;

	/**
	 * Copy less than 16 bytes
	 */
	if (n < 16) {
		return rte_mov15_or_less(dst, src, n);
	}

	/**
	 * Fast way when copy size doesn't exceed 256 bytes
	 */
	if (__rte_constant(n) && n == 32) {
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		return ret;
	}
	if (n <= 32) {
		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
		if (__rte_constant(n) && n == 16)
			return ret; /* avoid (harmless) duplicate copy */
		rte_mov16((uint8_t *)dst - 16 + n,
				(const uint8_t *)src - 16 + n);
		return ret;
	}
	if (n <= 64) {
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		rte_mov32((uint8_t *)dst - 32 + n,
				(const uint8_t *)src - 32 + n);
		return ret;
	}
	if (n <= 256) {
		if (n >= 128) {
			n -= 128;
			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
			src = (const uint8_t *)src + 128;
			dst = (uint8_t *)dst + 128;
		}
COPY_BLOCK_128_BACK31:
		if (n >= 64) {
			n -= 64;
			rte_mov64((uint8_t *)dst, (const uint8_t *)src);
			src = (const uint8_t *)src + 64;
			dst = (uint8_t *)dst + 64;
		}
		if (n > 32) {
			rte_mov32((uint8_t *)dst, (const uint8_t *)src);
			rte_mov32((uint8_t *)dst - 32 + n,
					(const uint8_t *)src - 32 + n);
			return ret;
		}
		if (n > 0) {
			rte_mov32((uint8_t *)dst - 32 + n,
					(const uint8_t *)src - 32 + n);
		}
		return ret;
	}

	/**
	 * Make store aligned when copy size exceeds 256 bytes
	 */
	dstofss = (uintptr_t)dst & 0x1F;
	if (dstofss > 0) {
		dstofss = 32 - dstofss;
		n -= dstofss;
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		src = (const uint8_t *)src + dstofss;
		dst = (uint8_t *)dst + dstofss;
	}

	/**
	 * Copy 128-byte blocks
	 */
	rte_mov128blocks((uint8_t *)dst, (const uint8_t *)src, n);
	bits = n;
	n = n & 127;
	bits -= n;
	src = (const uint8_t *)src + bits;
	dst = (uint8_t *)dst + bits;

	/**
	 * Copy whatever is left
	 */
	goto COPY_BLOCK_128_BACK31;
}

#else /* __AVX512F__ */

/**
 * SSE implementation below
 */

#define ALIGNMENT_MASK 0x0F

/**
 * Macro for copying unaligned block from one location to another with constant load offset,
 * 47 bytes leftover maximum,
 * locations should not overlap.
 * Requirements:
 * - Store is aligned
 * - Load offset is <offset>, which must be an immediate value within [1, 15]
 * - For <src>, make sure <offset> bytes backwards & <16 - offset> bytes forwards are available for loading
 * - <dst>, <src>, <len> must be variables
 * - __m128i <xmm0> ~ <xmm8> must be pre-defined
 */
#define MOVEUNALIGNED_LEFT47_IMM(dst, src, len, offset)                                                     \
{                                                                                                           \
    size_t tmp;                                                                                             \
    while (len >= 128 + 16 - offset) {                                                                      \
        xmm0 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 0 * 16));   \
        len -= 128;                                                                                         \
        xmm1 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 1 * 16));   \
        xmm2 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 2 * 16));   \
        xmm3 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 3 * 16));   \
        xmm4 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 4 * 16));   \
        xmm5 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 5 * 16));   \
        xmm6 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 6 * 16));   \
        xmm7 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 7 * 16));   \
        xmm8 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 8 * 16));   \
        src = (const uint8_t *)src + 128;                                                                   \
        _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset));        \
        _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset));        \
        _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 2 * 16), _mm_alignr_epi8(xmm3, xmm2, offset));        \
        _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 3 * 16), _mm_alignr_epi8(xmm4, xmm3, offset));        \
        _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 4 * 16), _mm_alignr_epi8(xmm5, xmm4, offset));        \
        _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 5 * 16), _mm_alignr_epi8(xmm6, xmm5, offset));        \
        _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 6 * 16), _mm_alignr_epi8(xmm7, xmm6, offset));        \
        _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 7 * 16), _mm_alignr_epi8(xmm8, xmm7, offset));        \
        dst = (uint8_t *)dst + 128;                                                                         \
    }                                                                                                       \
    tmp = len;                                                                                              \
    len = ((len - 16 + offset) & 127) + 16 - offset;                                                        \
    tmp -= len;                                                                                             \
    src = (const uint8_t *)src + tmp;                                                                       \
    dst = (uint8_t *)dst + tmp;                                                                             \
    if (len >= 32 + 16 - offset) {                                                                          \
        while (len >= 32 + 16 - offset) {                                                                   \
            xmm0 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 0 * 16));\
            len -= 32;                                                                                      \
            xmm1 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 1 * 16));\
            xmm2 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 2 * 16));\
            src = (const uint8_t *)src + 32;                                                                \
            _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset));    \
            _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset));    \
            dst = (uint8_t *)dst + 32;                                                                      \
        }                                                                                                   \
        tmp = len;                                                                                          \
        len = ((len - 16 + offset) & 31) + 16 - offset;                                                     \
        tmp -= len;                                                                                         \
        src = (const uint8_t *)src + tmp;                                                                   \
        dst = (uint8_t *)dst + tmp;                                                                         \
    }                                                                                                       \
}
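
/*
 * Illustrative note on the technique (sketch only, with hypothetical names
 * s and d): for offset = 3, the loads above fetch 16-byte chunks starting
 * 3 bytes before src, and _mm_alignr_epi8(hi, lo, 3) concatenates the two
 * chunks (lo in the low half) and extracts bytes [3, 19) of the pair, i.e.
 * the 16 source bytes beginning exactly at the unaligned src. For a single
 * chunk this is equivalent to:
 *
 *   __m128i lo = _mm_loadu_si128((const __m128i *)(const void *)(s - 3));
 *   __m128i hi = _mm_loadu_si128((const __m128i *)(const void *)(s + 13));
 *   _mm_storeu_si128((__m128i *)(void *)d, _mm_alignr_epi8(hi, lo, 3));
 */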

/**
 * Macro for copying unaligned block from one location to another,
 * 47 bytes leftover maximum,
 * locations should not overlap.
 * Use switch here because the aligning instruction requires an immediate value for the shift count.
 * Requirements:
 * - Store is aligned
 * - Load offset is <offset>, which must be within [1, 15]
 * - For <src>, make sure <offset> bytes backwards & <16 - offset> bytes forwards are available for loading
 * - <dst>, <src>, <len> must be variables
 * - __m128i <xmm0> ~ <xmm8> used in MOVEUNALIGNED_LEFT47_IMM must be pre-defined
 */
#define MOVEUNALIGNED_LEFT47(dst, src, len, offset)                   \
{                                                                     \
    switch (offset) {                                                 \
    case 0x01: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x01); break;    \
    case 0x02: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x02); break;    \
    case 0x03: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x03); break;    \
    case 0x04: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x04); break;    \
    case 0x05: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x05); break;    \
    case 0x06: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x06); break;    \
    case 0x07: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x07); break;    \
    case 0x08: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x08); break;    \
    case 0x09: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x09); break;    \
    case 0x0A: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0A); break;    \
    case 0x0B: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0B); break;    \
    case 0x0C: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0C); break;    \
    case 0x0D: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0D); break;    \
    case 0x0E: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0E); break;    \
    case 0x0F: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0F); break;    \
    default:;                                                         \
    }                                                                 \
}

static __rte_always_inline void *
rte_memcpy_generic(void *dst, const void *src, size_t n)
{
	__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
	void *ret = dst;
	size_t dstofss;
	size_t srcofs;

	/**
	 * Copy less than 16 bytes
	 */
	if (n < 16) {
		return rte_mov15_or_less(dst, src, n);
	}

	/**
	 * Fast way when copy size doesn't exceed 512 bytes
	 */
	if (n <= 32) {
		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
		if (__rte_constant(n) && n == 16)
			return ret; /* avoid (harmless) duplicate copy */
		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
		return ret;
	}
	if (n <= 64) {
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		if (n > 48)
			rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
		return ret;
	}
	if (n <= 128) {
		goto COPY_BLOCK_128_BACK15;
	}
	if (n <= 512) {
		if (n >= 256) {
			n -= 256;
			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
			rte_mov128((uint8_t *)dst + 128, (const uint8_t *)src + 128);
			src = (const uint8_t *)src + 256;
			dst = (uint8_t *)dst + 256;
		}
COPY_BLOCK_255_BACK15:
		if (n >= 128) {
			n -= 128;
			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
			src = (const uint8_t *)src + 128;
			dst = (uint8_t *)dst + 128;
		}
COPY_BLOCK_128_BACK15:
		if (n >= 64) {
			n -= 64;
			rte_mov64((uint8_t *)dst, (const uint8_t *)src);
			src = (const uint8_t *)src + 64;
			dst = (uint8_t *)dst + 64;
		}
COPY_BLOCK_64_BACK15:
		if (n >= 32) {
			n -= 32;
			rte_mov32((uint8_t *)dst, (const uint8_t *)src);
			src = (const uint8_t *)src + 32;
			dst = (uint8_t *)dst + 32;
		}
		if (n > 16) {
			rte_mov16((uint8_t *)dst, (const uint8_t *)src);
			rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
			return ret;
		}
		if (n > 0) {
			rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
		}
		return ret;
	}

	/**
	 * Make store aligned when copy size exceeds 512 bytes,
	 * and make sure the first 15 bytes are copied, because
	 * unaligned copy functions require up to 15 bytes
	 * backwards access.
	 */
	dstofss = (uintptr_t)dst & 0x0F;
	if (dstofss > 0) {
		dstofss = 16 - dstofss + 16;
		n -= dstofss;
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		src = (const uint8_t *)src + dstofss;
		dst = (uint8_t *)dst + dstofss;
	}
	srcofs = ((uintptr_t)src & 0x0F);

	/**
	 * For aligned copy
	 */
	if (srcofs == 0) {
		/**
		 * Copy 256-byte blocks
		 */
		for (; n >= 256; n -= 256) {
			rte_mov256((uint8_t *)dst, (const uint8_t *)src);
			dst = (uint8_t *)dst + 256;
			src = (const uint8_t *)src + 256;
		}

		/**
		 * Copy whatever is left
		 */
		goto COPY_BLOCK_255_BACK15;
	}

	/**
	 * For copy with unaligned load
	 */
	MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);

	/**
	 * Copy whatever is left
	 */
	goto COPY_BLOCK_64_BACK15;
}

#endif /* __AVX512F__ */

static __rte_always_inline void *
rte_memcpy_aligned(void *dst, const void *src, size_t n)
{
	void *ret = dst;

	/* Copy size < 16 bytes */
	if (n < 16) {
		return rte_mov15_or_less(dst, src, n);
	}

	/* Copy 16 <= size <= 32 bytes */
	if (__rte_constant(n) && n == 32) {
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		return ret;
	}
	if (n <= 32) {
		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
		if (__rte_constant(n) && n == 16)
			return ret; /* avoid (harmless) duplicate copy */
		rte_mov16((uint8_t *)dst - 16 + n,
				(const uint8_t *)src - 16 + n);

		return ret;
	}

	/* Copy 32 < size <= 64 bytes */
	if (__rte_constant(n) && n == 64) {
		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
		return ret;
	}
	if (n <= 64) {
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		rte_mov32((uint8_t *)dst - 32 + n,
				(const uint8_t *)src - 32 + n);

		return ret;
	}

	/* Copy 64-byte blocks */
	for (; n > 64; n -= 64) {
		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
		dst = (uint8_t *)dst + 64;
		src = (const uint8_t *)src + 64;
	}

	/* Copy whatever is left */
	rte_mov64((uint8_t *)dst - 64 + n,
			(const uint8_t *)src - 64 + n);

	return ret;
}

static __rte_always_inline void *
rte_memcpy(void *dst, const void *src, size_t n)
{
	if (!(((uintptr_t)dst | (uintptr_t)src) & ALIGNMENT_MASK))
		return rte_memcpy_aligned(dst, src, n);
	else
		return rte_memcpy_generic(dst, src, n);
}
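
/*
 * Illustrative note: ALIGNMENT_MASK corresponds to the widest vector size of
 * the selected implementation (0x0F for SSE, 0x1F for AVX, 0x3F for AVX512).
 * For example, in an SSE build, dst = 0x1000 and src = 0x2010 are both
 * 16-byte aligned, so rte_memcpy_aligned() is taken; with src = 0x2018 the
 * OR of the low address bits is non-zero and rte_memcpy_generic() handles
 * the unaligned case. The addresses here are hypothetical.
 */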

#undef ALIGNMENT_MASK

#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION >= 100000)
#pragma GCC diagnostic pop
#endif

#ifdef __cplusplus
}
#endif

#endif /* _RTE_MEMCPY_X86_64_H_ */