/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2014 Intel Corporation
 */

#ifndef _RTE_MEMCPY_X86_64_H_
#define _RTE_MEMCPY_X86_64_H_

/**
 * @file
 *
 * Functions for SSE/AVX/AVX2/AVX512 implementation of memcpy().
 */

#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <rte_vect.h>
#include <rte_common.h>
#include <rte_config.h>

#ifdef __cplusplus
extern "C" {
#endif

#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION >= 100000)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wstringop-overflow"
#endif

/*
 * GCC older than version 11 doesn't compile AVX properly, so use SSE instead.
 * There are no problems with AVX2.
 */
#if defined __AVX2__
#define RTE_MEMCPY_AVX
#elif defined __AVX__ && !(defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION < 110000))
#define RTE_MEMCPY_AVX
#endif

/**
 * Copy bytes from one location to another. The locations must not overlap.
 *
 * @note This is implemented as a macro, so its address should not be taken
 * and care is needed as parameter expressions may be evaluated multiple times.
 *
 * @param dst
 *   Pointer to the destination of the data.
 * @param src
 *   Pointer to the source data.
 * @param n
 *   Number of bytes to copy.
 * @return
 *   Pointer to the destination data.
 */
static __rte_always_inline void *
rte_memcpy(void *dst, const void *src, size_t n);
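/*
 * Usage sketch (illustrative only, not part of this header; the buffer
 * names below are hypothetical). rte_memcpy() follows the memcpy() calling
 * convention and returns dst, but per the note above it may be a macro, so
 * do not take its address and avoid argument expressions with side effects:
 *
 *	uint8_t pkt[1500];
 *	uint8_t hdr[64];
 *	...
 *	rte_memcpy(hdr, pkt, sizeof(hdr));
 */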
/**
 * Copy bytes from one location to another,
 * locations should not overlap.
 * Use with n <= 15.
 */
static __rte_always_inline void *
rte_mov15_or_less(void *dst, const void *src, size_t n)
{
	/**
	 * Use the following structs to avoid violating C standard
	 * alignment requirements and to avoid strict aliasing bugs
	 */
	struct __rte_packed_begin rte_uint64_alias {
		uint64_t val;
	} __rte_packed_end __rte_may_alias;
	struct __rte_packed_begin rte_uint32_alias {
		uint32_t val;
	} __rte_packed_end __rte_may_alias;
	struct __rte_packed_begin rte_uint16_alias {
		uint16_t val;
	} __rte_packed_end __rte_may_alias;

	void *ret = dst;
	if (n & 8) {
		((struct rte_uint64_alias *)dst)->val =
			((const struct rte_uint64_alias *)src)->val;
		src = (const uint64_t *)src + 1;
		dst = (uint64_t *)dst + 1;
	}
	if (n & 4) {
		((struct rte_uint32_alias *)dst)->val =
			((const struct rte_uint32_alias *)src)->val;
		src = (const uint32_t *)src + 1;
		dst = (uint32_t *)dst + 1;
	}
	if (n & 2) {
		((struct rte_uint16_alias *)dst)->val =
			((const struct rte_uint16_alias *)src)->val;
		src = (const uint16_t *)src + 1;
		dst = (uint16_t *)dst + 1;
	}
	if (n & 1)
		*(uint8_t *)dst = *(const uint8_t *)src;
	return ret;
}

/**
 * Copy 16 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov16(uint8_t *dst, const uint8_t *src)
{
	__m128i xmm0;

	xmm0 = _mm_loadu_si128((const __m128i *)(const void *)src);
	_mm_storeu_si128((__m128i *)(void *)dst, xmm0);
}
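/*
 * Worked example for rte_mov15_or_less() above (illustrative): with
 * n = 13 (binary 1101), bit 3 selects one 8-byte store, bit 2 one 4-byte
 * store and bit 0 one 1-byte store; src and dst advance after each, so the
 * stores land at offsets 0, 8 and 12 and together cover all 13 bytes.
 */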
/**
 * Copy 32 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov32(uint8_t *dst, const uint8_t *src)
{
#if defined RTE_MEMCPY_AVX
	__m256i ymm0;

	ymm0 = _mm256_loadu_si256((const __m256i *)(const void *)src);
	_mm256_storeu_si256((__m256i *)(void *)dst, ymm0);
#else /* SSE implementation */
	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
#endif
}

/**
 * Copy 64 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov64(uint8_t *dst, const uint8_t *src)
{
#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
	__m512i zmm0;

	zmm0 = _mm512_loadu_si512((const void *)src);
	_mm512_storeu_si512((void *)dst, zmm0);
#else /* AVX2, AVX & SSE implementation */
	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
#endif
}

/**
 * Copy 128 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov128(uint8_t *dst, const uint8_t *src)
{
	rte_mov64(dst + 0 * 64, src + 0 * 64);
	rte_mov64(dst + 1 * 64, src + 1 * 64);
}

/**
 * Copy 256 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov256(uint8_t *dst, const uint8_t *src)
{
	rte_mov128(dst + 0 * 128, src + 0 * 128);
	rte_mov128(dst + 1 * 128, src + 1 * 128);
}

#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512

/**
 * AVX512 implementation below
 */

#define ALIGNMENT_MASK 0x3F
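/*
 * Note (illustrative): the fixed-size helpers above double up on each other,
 * e.g. rte_mov256() is two rte_mov128() calls and thus four rte_mov64()
 * calls; with AVX512 enabled each rte_mov64() compiles to a single 64-byte
 * load/store pair, so rte_mov256() becomes four such pairs.
 */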
/**
 * Copy 128-byte blocks from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
{
	__m512i zmm0, zmm1;

	while (n >= 128) {
		zmm0 = _mm512_loadu_si512((const void *)(src + 0 * 64));
		n -= 128;
		zmm1 = _mm512_loadu_si512((const void *)(src + 1 * 64));
		src = src + 128;
		_mm512_storeu_si512((void *)(dst + 0 * 64), zmm0);
		_mm512_storeu_si512((void *)(dst + 1 * 64), zmm1);
		dst = dst + 128;
	}
}

/**
 * Copy 512-byte blocks from one location to another,
 * locations should not overlap.
 */
static inline void
rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
{
	__m512i zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7;

	while (n >= 512) {
		zmm0 = _mm512_loadu_si512((const void *)(src + 0 * 64));
		n -= 512;
		zmm1 = _mm512_loadu_si512((const void *)(src + 1 * 64));
		zmm2 = _mm512_loadu_si512((const void *)(src + 2 * 64));
		zmm3 = _mm512_loadu_si512((const void *)(src + 3 * 64));
		zmm4 = _mm512_loadu_si512((const void *)(src + 4 * 64));
		zmm5 = _mm512_loadu_si512((const void *)(src + 5 * 64));
		zmm6 = _mm512_loadu_si512((const void *)(src + 6 * 64));
		zmm7 = _mm512_loadu_si512((const void *)(src + 7 * 64));
		src = src + 512;
		_mm512_storeu_si512((void *)(dst + 0 * 64), zmm0);
		_mm512_storeu_si512((void *)(dst + 1 * 64), zmm1);
		_mm512_storeu_si512((void *)(dst + 2 * 64), zmm2);
		_mm512_storeu_si512((void *)(dst + 3 * 64), zmm3);
		_mm512_storeu_si512((void *)(dst + 4 * 64), zmm4);
		_mm512_storeu_si512((void *)(dst + 5 * 64), zmm5);
		_mm512_storeu_si512((void *)(dst + 6 * 64), zmm6);
		_mm512_storeu_si512((void *)(dst + 7 * 64), zmm7);
		dst = dst + 512;
	}
}

static __rte_always_inline void *
rte_memcpy_generic(void *dst, const void *src, size_t n)
{
	void *ret = dst;
	size_t dstofss;
	size_t bits;

	/**
	 * Copy less than 16 bytes
	 */
	if (n < 16) {
		return rte_mov15_or_less(dst, src, n);
	}

	/**
	 * Fast way when copy size doesn't exceed 512 bytes
	 */
	if (__rte_constant(n) && n == 32) {
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		return ret;
	}
	if (n <= 32) {
		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
		if (__rte_constant(n) && n == 16)
			return ret; /* avoid (harmless) duplicate copy */
		rte_mov16((uint8_t *)dst - 16 + n,
				(const uint8_t *)src - 16 + n);
		return ret;
	}
	if (__rte_constant(n) && n == 64) {
		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
		return ret;
	}
	if (n <= 64) {
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		rte_mov32((uint8_t *)dst - 32 + n,
				(const uint8_t *)src - 32 + n);
		return ret;
	}
	if (n <= 512) {
		if (n >= 256) {
			n -= 256;
			rte_mov256((uint8_t *)dst, (const uint8_t *)src);
			src = (const uint8_t *)src + 256;
			dst = (uint8_t *)dst + 256;
		}
		if (n >= 128) {
			n -= 128;
			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
			src = (const uint8_t *)src + 128;
			dst = (uint8_t *)dst + 128;
		}
COPY_BLOCK_128_BACK63:
		if (n > 64) {
			rte_mov64((uint8_t *)dst, (const uint8_t *)src);
			rte_mov64((uint8_t *)dst - 64 + n,
					(const uint8_t *)src - 64 + n);
			return ret;
		}
		if (n > 0)
			rte_mov64((uint8_t *)dst - 64 + n,
					(const uint8_t *)src - 64 + n);
		return ret;
	}

	/**
	 * Make store aligned when copy size exceeds 512 bytes
	 */
	dstofss = ((uintptr_t)dst & 0x3F);
	if (dstofss > 0) {
		dstofss = 64 - dstofss;
		n -= dstofss;
		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
		src = (const uint8_t *)src + dstofss;
		dst = (uint8_t *)dst + dstofss;
	}

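	/*
	 * Illustrative trace of the alignment step above (hypothetical
	 * address): if dst ends in 0x30, dstofss is first 0x30 and then
	 * 64 - 0x30 = 16, so rte_mov64() writes a 64-byte head (its upper
	 * 48 bytes are harmlessly rewritten by the block copies below) and
	 * src/dst advance by 16, leaving the destination 64-byte aligned.
	 */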
	/**
	 * Copy 512-byte blocks.
	 * Use copy block function for better instruction order control,
	 * which is important when load is unaligned.
	 */
	rte_mov512blocks((uint8_t *)dst, (const uint8_t *)src, n);
	bits = n;
	n = n & 511;
	bits -= n;
	src = (const uint8_t *)src + bits;
	dst = (uint8_t *)dst + bits;

	/**
	 * Copy 128-byte blocks.
	 * Use copy block function for better instruction order control,
	 * which is important when load is unaligned.
	 */
	if (n >= 128) {
		rte_mov128blocks((uint8_t *)dst, (const uint8_t *)src, n);
		bits = n;
		n = n & 127;
		bits -= n;
		src = (const uint8_t *)src + bits;
		dst = (uint8_t *)dst + bits;
	}

	/**
	 * Copy whatever is left
	 */
	goto COPY_BLOCK_128_BACK63;
}

#elif defined RTE_MEMCPY_AVX

/**
 * AVX implementation below
 */

#define ALIGNMENT_MASK 0x1F

/**
 * Copy 128-byte blocks from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
{
	__m256i ymm0, ymm1, ymm2, ymm3;

	while (n >= 128) {
		ymm0 = _mm256_loadu_si256((const __m256i *)(const void *)
					  ((const uint8_t *)src + 0 * 32));
		n -= 128;
		ymm1 = _mm256_loadu_si256((const __m256i *)(const void *)
					  ((const uint8_t *)src + 1 * 32));
		ymm2 = _mm256_loadu_si256((const __m256i *)(const void *)
					  ((const uint8_t *)src + 2 * 32));
		ymm3 = _mm256_loadu_si256((const __m256i *)(const void *)
					  ((const uint8_t *)src + 3 * 32));
		src = (const uint8_t *)src + 128;
		_mm256_storeu_si256((__m256i *)(void *)
				    ((uint8_t *)dst + 0 * 32), ymm0);
		_mm256_storeu_si256((__m256i *)(void *)
				    ((uint8_t *)dst + 1 * 32), ymm1);
		_mm256_storeu_si256((__m256i *)(void *)
				    ((uint8_t *)dst + 2 * 32), ymm2);
		_mm256_storeu_si256((__m256i *)(void *)
				    ((uint8_t *)dst + 3 * 32), ymm3);
		dst = (uint8_t *)dst + 128;
	}
}

static __rte_always_inline void *
rte_memcpy_generic(void *dst, const void *src, size_t n)
{
	void *ret = dst;
	size_t dstofss;
	size_t bits;

	/**
	 * Copy less than 16 bytes
	 */
	if (n < 16) {
		return rte_mov15_or_less(dst, src, n);
	}

	/**
	 * Fast way when copy size doesn't exceed 256 bytes
	 */
	if (__rte_constant(n) && n == 32) {
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		return ret;
	}
	if (n <= 32) {
		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
		if (__rte_constant(n) && n == 16)
			return ret; /* avoid (harmless) duplicate copy */
		rte_mov16((uint8_t *)dst - 16 + n,
				(const uint8_t *)src - 16 + n);
		return ret;
	}
	if (n <= 64) {
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		rte_mov32((uint8_t *)dst - 32 + n,
				(const uint8_t *)src - 32 + n);
		return ret;
	}
	if (n <= 256) {
		if (n >= 128) {
			n -= 128;
			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
			src = (const uint8_t *)src + 128;
			dst = (uint8_t *)dst + 128;
		}
COPY_BLOCK_128_BACK31:
		if (n >= 64) {
			n -= 64;
			rte_mov64((uint8_t *)dst, (const uint8_t *)src);
			src = (const uint8_t *)src + 64;
			dst = (uint8_t *)dst + 64;
		}
		if (n > 32) {
			rte_mov32((uint8_t *)dst, (const uint8_t *)src);
			rte_mov32((uint8_t *)dst - 32 + n,
					(const uint8_t *)src - 32 + n);
			return ret;
		}
		if (n > 0) {
			rte_mov32((uint8_t *)dst - 32 + n,
					(const uint8_t *)src - 32 + n);
		}
		return ret;
	}

	/**
	 * Make store aligned when copy size exceeds 256 bytes
	 */
	dstofss = (uintptr_t)dst & 0x1F;
	if (dstofss > 0) {
		dstofss = 32 - dstofss;
		n -= dstofss;
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		src = (const uint8_t *)src + dstofss;
		dst = (uint8_t *)dst + dstofss;
	}

	/**
	 * Copy 128-byte blocks
	 */
	rte_mov128blocks((uint8_t *)dst, (const uint8_t *)src, n);
	bits = n;
	n = n & 127;
	bits -= n;
	src = (const uint8_t *)src + bits;
	dst = (uint8_t *)dst + bits;

	/**
	 * Copy whatever is left
	 */
	goto COPY_BLOCK_128_BACK31;
}

#else /* __AVX512F__ */

/**
 * SSE implementation below
 */

#define ALIGNMENT_MASK 0x0F
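/*
 * Background for the macros below (illustrative): the SSE path handles a
 * misaligned source by loading 16-byte chunks starting <offset> bytes
 * before src (a 16-byte aligned address) and stitching neighbouring chunks
 * together with _mm_alignr_epi8 (PALIGNR), which takes the shift count as
 * an immediate. For example, with offset = 4, xmm0 holds src[-4..11] and
 * xmm1 holds src[12..27], and _mm_alignr_epi8(xmm1, xmm0, 4) yields
 * src[0..15], which is then stored to the already-aligned destination.
 */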
/**
 * Macro for copying unaligned block from one location to another with constant load offset,
 * 47 bytes leftover maximum,
 * locations should not overlap.
 * Requirements:
 * - Store is aligned
 * - Load offset is <offset>, which must be an immediate value within [1, 15]
 * - For <src>, make sure <offset> bytes backwards & <16 - offset> bytes forwards are available for loading
 * - <dst>, <src>, <len> must be variables
 * - __m128i <xmm0> ~ <xmm8> must be pre-defined
 */
#define MOVEUNALIGNED_LEFT47_IMM(dst, src, len, offset) \
{ \
	size_t tmp; \
	while (len >= 128 + 16 - offset) { \
		xmm0 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 0 * 16)); \
		len -= 128; \
		xmm1 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 1 * 16)); \
		xmm2 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 2 * 16)); \
		xmm3 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 3 * 16)); \
		xmm4 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 4 * 16)); \
		xmm5 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 5 * 16)); \
		xmm6 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 6 * 16)); \
		xmm7 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 7 * 16)); \
		xmm8 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 8 * 16)); \
		src = (const uint8_t *)src + 128; \
		_mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset)); \
		_mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset)); \
		_mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 2 * 16), _mm_alignr_epi8(xmm3, xmm2, offset)); \
		_mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 3 * 16), _mm_alignr_epi8(xmm4, xmm3, offset)); \
		_mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 4 * 16), _mm_alignr_epi8(xmm5, xmm4, offset)); \
		_mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 5 * 16), _mm_alignr_epi8(xmm6, xmm5, offset)); \
		_mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 6 * 16), _mm_alignr_epi8(xmm7, xmm6, offset)); \
		_mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 7 * 16), _mm_alignr_epi8(xmm8, xmm7, offset)); \
		dst = (uint8_t *)dst + 128; \
	} \
	tmp = len; \
	len = ((len - 16 + offset) & 127) + 16 - offset; \
	tmp -= len; \
	src = (const uint8_t *)src + tmp; \
	dst = (uint8_t *)dst + tmp; \
	if (len >= 32 + 16 - offset) { \
		while (len >= 32 + 16 - offset) { \
			xmm0 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 0 * 16)); \
			len -= 32; \
			xmm1 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 1 * 16)); \
			xmm2 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 2 * 16)); \
			src = (const uint8_t *)src + 32; \
			_mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset)); \
			_mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset)); \
			dst = (uint8_t *)dst + 32; \
		} \
		tmp = len; \
		len = ((len - 16 + offset) & 31) + 16 - offset; \
		tmp -= len; \
		src = (const uint8_t *)src + tmp; \
		dst = (uint8_t *)dst + tmp; \
	} \
}

/**
 * Macro for copying unaligned block from one location to another,
 * 47 bytes leftover maximum,
 * locations should not overlap.
 * Use switch here because the aligning instruction requires an immediate value for the shift count.
 * Requirements:
 * - Store is aligned
 * - Load offset is <offset>, which must be within [1, 15]
 * - For <src>, make sure <offset> bytes backwards & <16 - offset> bytes forwards are available for loading
 * - <dst>, <src>, <len> must be variables
 * - __m128i <xmm0> ~ <xmm8> used in MOVEUNALIGNED_LEFT47_IMM must be pre-defined
 */
#define MOVEUNALIGNED_LEFT47(dst, src, len, offset) \
{ \
	switch (offset) { \
	case 0x01: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x01); break; \
	case 0x02: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x02); break; \
	case 0x03: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x03); break; \
	case 0x04: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x04); break; \
	case 0x05: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x05); break; \
	case 0x06: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x06); break; \
	case 0x07: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x07); break; \
	case 0x08: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x08); break; \
	case 0x09: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x09); break; \
	case 0x0A: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0A); break; \
	case 0x0B: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0B); break; \
	case 0x0C: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0C); break; \
	case 0x0D: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0D); break; \
	case 0x0E: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0E); break; \
	case 0x0F: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0F); break; \
	default:; \
	} \
}

static __rte_always_inline void *
rte_memcpy_generic(void *dst, const void *src, size_t n)
{
	__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
	void *ret = dst;
	size_t dstofss;
	size_t srcofs;

	/**
	 * Copy less than 16 bytes
	 */
	if (n < 16) {
		return rte_mov15_or_less(dst, src, n);
	}

	/**
	 * Fast way when copy size doesn't exceed 512 bytes
	 */
	if (n <= 32) {
		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
		if (__rte_constant(n) && n == 16)
			return ret; /* avoid (harmless) duplicate copy */
		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
		return ret;
	}
	if (n <= 64) {
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		if (n > 48)
			rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
		return ret;
	}
	if (n <= 128) {
		goto COPY_BLOCK_128_BACK15;
	}
	if (n <= 512) {
		if (n >= 256) {
			n -= 256;
			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
			rte_mov128((uint8_t *)dst + 128, (const uint8_t *)src + 128);
			src = (const uint8_t *)src + 256;
			dst = (uint8_t *)dst + 256;
		}
COPY_BLOCK_255_BACK15:
		if (n >= 128) {
			n -= 128;
			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
			src = (const uint8_t *)src + 128;
			dst = (uint8_t *)dst + 128;
		}
COPY_BLOCK_128_BACK15:
		if (n >= 64) {
			n -= 64;
			rte_mov64((uint8_t *)dst, (const uint8_t *)src);
			src = (const uint8_t *)src + 64;
			dst = (uint8_t *)dst + 64;
		}
COPY_BLOCK_64_BACK15:
		if (n >= 32) {
			n -= 32;
			rte_mov32((uint8_t *)dst, (const uint8_t *)src);
			src = (const uint8_t *)src + 32;
			dst = (uint8_t *)dst + 32;
		}
		if (n > 16) {
			rte_mov16((uint8_t *)dst, (const uint8_t *)src);
			rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
			return ret;
		}
		if (n > 0) {
			rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
		}
		return ret;
	}

	/**
	 * Make store aligned when copy size exceeds 512 bytes,
	 * and make sure the first 15 bytes are copied, because
	 * unaligned copy functions require up to 15 bytes
	 * backwards access.
	 */
	dstofss = (uintptr_t)dst & 0x0F;
	if (dstofss > 0) {
		dstofss = 16 - dstofss + 16;
		n -= dstofss;
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		src = (const uint8_t *)src + dstofss;
		dst = (uint8_t *)dst + dstofss;
	}
	srcofs = ((uintptr_t)src & 0x0F);

	/**
	 * For aligned copy
	 */
	if (srcofs == 0) {
		/**
		 * Copy 256-byte blocks
		 */
		for (; n >= 256; n -= 256) {
			rte_mov256((uint8_t *)dst, (const uint8_t *)src);
			dst = (uint8_t *)dst + 256;
			src = (const uint8_t *)src + 256;
		}

		/**
		 * Copy whatever is left
		 */
		goto COPY_BLOCK_255_BACK15;
	}

	/**
	 * For copy with unaligned load
	 */
	MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);

	/**
	 * Copy whatever is left
	 */
	goto COPY_BLOCK_64_BACK15;
}

#endif /* __AVX512F__ */
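/*
 * Dispatch note (illustrative): rte_memcpy() at the bottom of this file
 * chooses between rte_memcpy_aligned() (below) and rte_memcpy_generic()
 * (above) using ALIGNMENT_MASK. For example, with RTE_MEMCPY_AVX the mask
 * is 0x1F, so dst = 0x...40 and src = 0x...60 (both 32-byte aligned) take
 * the aligned path, while dst = 0x...44 makes ((dst | src) & 0x1F)
 * non-zero and falls back to rte_memcpy_generic().
 */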
static __rte_always_inline void *
rte_memcpy_aligned(void *dst, const void *src, size_t n)
{
	void *ret = dst;

	/* Copy size < 16 bytes */
	if (n < 16) {
		return rte_mov15_or_less(dst, src, n);
	}

	/* Copy 16 <= size <= 32 bytes */
	if (__rte_constant(n) && n == 32) {
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		return ret;
	}
	if (n <= 32) {
		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
		if (__rte_constant(n) && n == 16)
			return ret; /* avoid (harmless) duplicate copy */
		rte_mov16((uint8_t *)dst - 16 + n,
				(const uint8_t *)src - 16 + n);

		return ret;
	}

	/* Copy 32 < size <= 64 bytes */
	if (__rte_constant(n) && n == 64) {
		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
		return ret;
	}
	if (n <= 64) {
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		rte_mov32((uint8_t *)dst - 32 + n,
				(const uint8_t *)src - 32 + n);

		return ret;
	}

	/* Copy 64-byte blocks */
	for (; n > 64; n -= 64) {
		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
		dst = (uint8_t *)dst + 64;
		src = (const uint8_t *)src + 64;
	}

	/* Copy whatever is left */
	rte_mov64((uint8_t *)dst - 64 + n,
			(const uint8_t *)src - 64 + n);

	return ret;
}

static __rte_always_inline void *
rte_memcpy(void *dst, const void *src, size_t n)
{
	if (!(((uintptr_t)dst | (uintptr_t)src) & ALIGNMENT_MASK))
		return rte_memcpy_aligned(dst, src, n);
	else
		return rte_memcpy_generic(dst, src, n);
}

#undef ALIGNMENT_MASK

#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION >= 100000)
#pragma GCC diagnostic pop
#endif

#ifdef __cplusplus
}
#endif

#endif /* _RTE_MEMCPY_X86_64_H_ */