199a2dd95SBruce Richardson /* SPDX-License-Identifier: BSD-3-Clause 299a2dd95SBruce Richardson * Copyright(c) 2015 RehiveTech. All rights reserved. 399a2dd95SBruce Richardson */ 499a2dd95SBruce Richardson 599a2dd95SBruce Richardson #ifndef _RTE_MEMCPY_ARM32_H_ 699a2dd95SBruce Richardson #define _RTE_MEMCPY_ARM32_H_ 799a2dd95SBruce Richardson 899a2dd95SBruce Richardson #include <stdint.h> 999a2dd95SBruce Richardson #include <string.h> 1099a2dd95SBruce Richardson 1199a2dd95SBruce Richardson #include "generic/rte_memcpy.h" 1299a2dd95SBruce Richardson 1399a2dd95SBruce Richardson #ifdef RTE_ARCH_ARM_NEON_MEMCPY 1499a2dd95SBruce Richardson 1599a2dd95SBruce Richardson #ifndef __ARM_NEON 1699a2dd95SBruce Richardson #error "Cannot optimize memcpy by NEON as the CPU seems to not support this" 1799a2dd95SBruce Richardson #endif 1899a2dd95SBruce Richardson 1999a2dd95SBruce Richardson /* ARM NEON Intrinsics are used to copy data */ 2099a2dd95SBruce Richardson #include <arm_neon.h> 2199a2dd95SBruce Richardson 22*719834a6SMattias Rönnblom #ifdef __cplusplus 23*719834a6SMattias Rönnblom extern "C" { 24*719834a6SMattias Rönnblom #endif 25*719834a6SMattias Rönnblom 2699a2dd95SBruce Richardson static inline void 2799a2dd95SBruce Richardson rte_mov16(uint8_t *dst, const uint8_t *src) 2899a2dd95SBruce Richardson { 2999a2dd95SBruce Richardson vst1q_u8(dst, vld1q_u8(src)); 3099a2dd95SBruce Richardson } 3199a2dd95SBruce Richardson 3299a2dd95SBruce Richardson static inline void 3399a2dd95SBruce Richardson rte_mov32(uint8_t *dst, const uint8_t *src) 3499a2dd95SBruce Richardson { 3599a2dd95SBruce Richardson asm volatile ( 3699a2dd95SBruce Richardson "vld1.8 {d0-d3}, [%0]\n\t" 3799a2dd95SBruce Richardson "vst1.8 {d0-d3}, [%1]\n\t" 3899a2dd95SBruce Richardson : "+r" (src), "+r" (dst) 3999a2dd95SBruce Richardson : : "memory", "d0", "d1", "d2", "d3"); 4099a2dd95SBruce Richardson } 4199a2dd95SBruce Richardson 4299a2dd95SBruce Richardson static inline void 4399a2dd95SBruce Richardson rte_mov48(uint8_t *dst, const uint8_t *src) 4499a2dd95SBruce Richardson { 4599a2dd95SBruce Richardson asm volatile ( 4699a2dd95SBruce Richardson "vld1.8 {d0-d3}, [%0]!\n\t" 4799a2dd95SBruce Richardson "vld1.8 {d4-d5}, [%0]\n\t" 4899a2dd95SBruce Richardson "vst1.8 {d0-d3}, [%1]!\n\t" 4999a2dd95SBruce Richardson "vst1.8 {d4-d5}, [%1]\n\t" 5099a2dd95SBruce Richardson : "+r" (src), "+r" (dst) 5199a2dd95SBruce Richardson : 5299a2dd95SBruce Richardson : "memory", "d0", "d1", "d2", "d3", "d4", "d5"); 5399a2dd95SBruce Richardson } 5499a2dd95SBruce Richardson 5599a2dd95SBruce Richardson static inline void 5699a2dd95SBruce Richardson rte_mov64(uint8_t *dst, const uint8_t *src) 5799a2dd95SBruce Richardson { 5899a2dd95SBruce Richardson asm volatile ( 5999a2dd95SBruce Richardson "vld1.8 {d0-d3}, [%0]!\n\t" 6099a2dd95SBruce Richardson "vld1.8 {d4-d7}, [%0]\n\t" 6199a2dd95SBruce Richardson "vst1.8 {d0-d3}, [%1]!\n\t" 6299a2dd95SBruce Richardson "vst1.8 {d4-d7}, [%1]\n\t" 6399a2dd95SBruce Richardson : "+r" (src), "+r" (dst) 6499a2dd95SBruce Richardson : 6599a2dd95SBruce Richardson : "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7"); 6699a2dd95SBruce Richardson } 6799a2dd95SBruce Richardson 6899a2dd95SBruce Richardson static inline void 6999a2dd95SBruce Richardson rte_mov128(uint8_t *dst, const uint8_t *src) 7099a2dd95SBruce Richardson { 7199a2dd95SBruce Richardson asm volatile ("pld [%0, #64]" : : "r" (src)); 7299a2dd95SBruce Richardson asm volatile ( 7399a2dd95SBruce Richardson "vld1.8 {d0-d3}, [%0]!\n\t" 7499a2dd95SBruce Richardson "vld1.8 {d4-d7}, [%0]!\n\t" 7599a2dd95SBruce Richardson "vld1.8 {d8-d11}, [%0]!\n\t" 7699a2dd95SBruce Richardson "vld1.8 {d12-d15}, [%0]\n\t" 7799a2dd95SBruce Richardson "vst1.8 {d0-d3}, [%1]!\n\t" 7899a2dd95SBruce Richardson "vst1.8 {d4-d7}, [%1]!\n\t" 7999a2dd95SBruce Richardson "vst1.8 {d8-d11}, [%1]!\n\t" 8099a2dd95SBruce Richardson "vst1.8 {d12-d15}, [%1]\n\t" 8199a2dd95SBruce Richardson : "+r" (src), "+r" (dst) 8299a2dd95SBruce Richardson : 8399a2dd95SBruce Richardson : "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", 8499a2dd95SBruce Richardson "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15"); 8599a2dd95SBruce Richardson } 8699a2dd95SBruce Richardson 8799a2dd95SBruce Richardson static inline void 8899a2dd95SBruce Richardson rte_mov256(uint8_t *dst, const uint8_t *src) 8999a2dd95SBruce Richardson { 9099a2dd95SBruce Richardson asm volatile ("pld [%0, #64]" : : "r" (src)); 9199a2dd95SBruce Richardson asm volatile ("pld [%0, #128]" : : "r" (src)); 9299a2dd95SBruce Richardson asm volatile ("pld [%0, #192]" : : "r" (src)); 9399a2dd95SBruce Richardson asm volatile ("pld [%0, #256]" : : "r" (src)); 9499a2dd95SBruce Richardson asm volatile ("pld [%0, #320]" : : "r" (src)); 9599a2dd95SBruce Richardson asm volatile ("pld [%0, #384]" : : "r" (src)); 9699a2dd95SBruce Richardson asm volatile ("pld [%0, #448]" : : "r" (src)); 9799a2dd95SBruce Richardson asm volatile ( 9899a2dd95SBruce Richardson "vld1.8 {d0-d3}, [%0]!\n\t" 9999a2dd95SBruce Richardson "vld1.8 {d4-d7}, [%0]!\n\t" 10099a2dd95SBruce Richardson "vld1.8 {d8-d11}, [%0]!\n\t" 10199a2dd95SBruce Richardson "vld1.8 {d12-d15}, [%0]!\n\t" 10299a2dd95SBruce Richardson "vld1.8 {d16-d19}, [%0]!\n\t" 10399a2dd95SBruce Richardson "vld1.8 {d20-d23}, [%0]!\n\t" 10499a2dd95SBruce Richardson "vld1.8 {d24-d27}, [%0]!\n\t" 10599a2dd95SBruce Richardson "vld1.8 {d28-d31}, [%0]\n\t" 10699a2dd95SBruce Richardson "vst1.8 {d0-d3}, [%1]!\n\t" 10799a2dd95SBruce Richardson "vst1.8 {d4-d7}, [%1]!\n\t" 10899a2dd95SBruce Richardson "vst1.8 {d8-d11}, [%1]!\n\t" 10999a2dd95SBruce Richardson "vst1.8 {d12-d15}, [%1]!\n\t" 11099a2dd95SBruce Richardson "vst1.8 {d16-d19}, [%1]!\n\t" 11199a2dd95SBruce Richardson "vst1.8 {d20-d23}, [%1]!\n\t" 11299a2dd95SBruce Richardson "vst1.8 {d24-d27}, [%1]!\n\t" 11399a2dd95SBruce Richardson "vst1.8 {d28-d31}, [%1]!\n\t" 11499a2dd95SBruce Richardson : "+r" (src), "+r" (dst) 11599a2dd95SBruce Richardson : 11699a2dd95SBruce Richardson : "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", 11799a2dd95SBruce Richardson "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", 11899a2dd95SBruce Richardson "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", 11999a2dd95SBruce Richardson "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31"); 12099a2dd95SBruce Richardson } 12199a2dd95SBruce Richardson 12299a2dd95SBruce Richardson #define rte_memcpy(dst, src, n) \ 12399a2dd95SBruce Richardson __extension__ ({ \ 12499a2dd95SBruce Richardson (__builtin_constant_p(n)) ? \ 12599a2dd95SBruce Richardson memcpy((dst), (src), (n)) : \ 12699a2dd95SBruce Richardson rte_memcpy_func((dst), (src), (n)); }) 12799a2dd95SBruce Richardson 12899a2dd95SBruce Richardson static inline void * 12999a2dd95SBruce Richardson rte_memcpy_func(void *dst, const void *src, size_t n) 13099a2dd95SBruce Richardson { 13199a2dd95SBruce Richardson void *ret = dst; 13299a2dd95SBruce Richardson 13399a2dd95SBruce Richardson /* We can't copy < 16 bytes using XMM registers so do it manually. */ 13499a2dd95SBruce Richardson if (n < 16) { 13599a2dd95SBruce Richardson if (n & 0x01) { 13699a2dd95SBruce Richardson *(uint8_t *)dst = *(const uint8_t *)src; 13799a2dd95SBruce Richardson dst = (uint8_t *)dst + 1; 13899a2dd95SBruce Richardson src = (const uint8_t *)src + 1; 13999a2dd95SBruce Richardson } 14099a2dd95SBruce Richardson if (n & 0x02) { 14199a2dd95SBruce Richardson *(uint16_t *)dst = *(const uint16_t *)src; 14299a2dd95SBruce Richardson dst = (uint16_t *)dst + 1; 14399a2dd95SBruce Richardson src = (const uint16_t *)src + 1; 14499a2dd95SBruce Richardson } 14599a2dd95SBruce Richardson if (n & 0x04) { 14699a2dd95SBruce Richardson *(uint32_t *)dst = *(const uint32_t *)src; 14799a2dd95SBruce Richardson dst = (uint32_t *)dst + 1; 14899a2dd95SBruce Richardson src = (const uint32_t *)src + 1; 14999a2dd95SBruce Richardson } 15099a2dd95SBruce Richardson if (n & 0x08) { 15199a2dd95SBruce Richardson /* ARMv7 can not handle unaligned access to long long 15299a2dd95SBruce Richardson * (uint64_t). Therefore two uint32_t operations are 15399a2dd95SBruce Richardson * used. 15499a2dd95SBruce Richardson */ 15599a2dd95SBruce Richardson *(uint32_t *)dst = *(const uint32_t *)src; 15699a2dd95SBruce Richardson dst = (uint32_t *)dst + 1; 15799a2dd95SBruce Richardson src = (const uint32_t *)src + 1; 15899a2dd95SBruce Richardson *(uint32_t *)dst = *(const uint32_t *)src; 15999a2dd95SBruce Richardson } 16099a2dd95SBruce Richardson return ret; 16199a2dd95SBruce Richardson } 16299a2dd95SBruce Richardson 16399a2dd95SBruce Richardson /* Special fast cases for <= 128 bytes */ 16499a2dd95SBruce Richardson if (n <= 32) { 16599a2dd95SBruce Richardson rte_mov16((uint8_t *)dst, (const uint8_t *)src); 16699a2dd95SBruce Richardson rte_mov16((uint8_t *)dst - 16 + n, 16799a2dd95SBruce Richardson (const uint8_t *)src - 16 + n); 16899a2dd95SBruce Richardson return ret; 16999a2dd95SBruce Richardson } 17099a2dd95SBruce Richardson 17199a2dd95SBruce Richardson if (n <= 64) { 17299a2dd95SBruce Richardson rte_mov32((uint8_t *)dst, (const uint8_t *)src); 17399a2dd95SBruce Richardson rte_mov32((uint8_t *)dst - 32 + n, 17499a2dd95SBruce Richardson (const uint8_t *)src - 32 + n); 17599a2dd95SBruce Richardson return ret; 17699a2dd95SBruce Richardson } 17799a2dd95SBruce Richardson 17899a2dd95SBruce Richardson if (n <= 128) { 17999a2dd95SBruce Richardson rte_mov64((uint8_t *)dst, (const uint8_t *)src); 18099a2dd95SBruce Richardson rte_mov64((uint8_t *)dst - 64 + n, 18199a2dd95SBruce Richardson (const uint8_t *)src - 64 + n); 18299a2dd95SBruce Richardson return ret; 18399a2dd95SBruce Richardson } 18499a2dd95SBruce Richardson 18599a2dd95SBruce Richardson /* 18699a2dd95SBruce Richardson * For large copies > 128 bytes. This combination of 256, 64 and 16 byte 18799a2dd95SBruce Richardson * copies was found to be faster than doing 128 and 32 byte copies as 18899a2dd95SBruce Richardson * well. 18999a2dd95SBruce Richardson */ 19099a2dd95SBruce Richardson for ( ; n >= 256; n -= 256) { 19199a2dd95SBruce Richardson rte_mov256((uint8_t *)dst, (const uint8_t *)src); 19299a2dd95SBruce Richardson dst = (uint8_t *)dst + 256; 19399a2dd95SBruce Richardson src = (const uint8_t *)src + 256; 19499a2dd95SBruce Richardson } 19599a2dd95SBruce Richardson 19699a2dd95SBruce Richardson /* 19799a2dd95SBruce Richardson * We split the remaining bytes (which will be less than 256) into 19899a2dd95SBruce Richardson * 64byte (2^6) chunks. 19999a2dd95SBruce Richardson * Using incrementing integers in the case labels of a switch statement 20099a2dd95SBruce Richardson * encourages the compiler to use a jump table. To get incrementing 20199a2dd95SBruce Richardson * integers, we shift the 2 relevant bits to the LSB position to first 20299a2dd95SBruce Richardson * get decrementing integers, and then subtract. 20399a2dd95SBruce Richardson */ 20499a2dd95SBruce Richardson switch (3 - (n >> 6)) { 20599a2dd95SBruce Richardson case 0x00: 20699a2dd95SBruce Richardson rte_mov64((uint8_t *)dst, (const uint8_t *)src); 20799a2dd95SBruce Richardson n -= 64; 20899a2dd95SBruce Richardson dst = (uint8_t *)dst + 64; 20999a2dd95SBruce Richardson src = (const uint8_t *)src + 64; /* fallthrough */ 21099a2dd95SBruce Richardson case 0x01: 21199a2dd95SBruce Richardson rte_mov64((uint8_t *)dst, (const uint8_t *)src); 21299a2dd95SBruce Richardson n -= 64; 21399a2dd95SBruce Richardson dst = (uint8_t *)dst + 64; 21499a2dd95SBruce Richardson src = (const uint8_t *)src + 64; /* fallthrough */ 21599a2dd95SBruce Richardson case 0x02: 21699a2dd95SBruce Richardson rte_mov64((uint8_t *)dst, (const uint8_t *)src); 21799a2dd95SBruce Richardson n -= 64; 21899a2dd95SBruce Richardson dst = (uint8_t *)dst + 64; 21999a2dd95SBruce Richardson src = (const uint8_t *)src + 64; /* fallthrough */ 22099a2dd95SBruce Richardson default: 22199a2dd95SBruce Richardson break; 22299a2dd95SBruce Richardson } 22399a2dd95SBruce Richardson 22499a2dd95SBruce Richardson /* 22599a2dd95SBruce Richardson * We split the remaining bytes (which will be less than 64) into 22699a2dd95SBruce Richardson * 16byte (2^4) chunks, using the same switch structure as above. 22799a2dd95SBruce Richardson */ 22899a2dd95SBruce Richardson switch (3 - (n >> 4)) { 22999a2dd95SBruce Richardson case 0x00: 23099a2dd95SBruce Richardson rte_mov16((uint8_t *)dst, (const uint8_t *)src); 23199a2dd95SBruce Richardson n -= 16; 23299a2dd95SBruce Richardson dst = (uint8_t *)dst + 16; 23399a2dd95SBruce Richardson src = (const uint8_t *)src + 16; /* fallthrough */ 23499a2dd95SBruce Richardson case 0x01: 23599a2dd95SBruce Richardson rte_mov16((uint8_t *)dst, (const uint8_t *)src); 23699a2dd95SBruce Richardson n -= 16; 23799a2dd95SBruce Richardson dst = (uint8_t *)dst + 16; 23899a2dd95SBruce Richardson src = (const uint8_t *)src + 16; /* fallthrough */ 23999a2dd95SBruce Richardson case 0x02: 24099a2dd95SBruce Richardson rte_mov16((uint8_t *)dst, (const uint8_t *)src); 24199a2dd95SBruce Richardson n -= 16; 24299a2dd95SBruce Richardson dst = (uint8_t *)dst + 16; 24399a2dd95SBruce Richardson src = (const uint8_t *)src + 16; /* fallthrough */ 24499a2dd95SBruce Richardson default: 24599a2dd95SBruce Richardson break; 24699a2dd95SBruce Richardson } 24799a2dd95SBruce Richardson 24899a2dd95SBruce Richardson /* Copy any remaining bytes, without going beyond end of buffers */ 24999a2dd95SBruce Richardson if (n != 0) 25099a2dd95SBruce Richardson rte_mov16((uint8_t *)dst - 16 + n, 25199a2dd95SBruce Richardson (const uint8_t *)src - 16 + n); 25299a2dd95SBruce Richardson return ret; 25399a2dd95SBruce Richardson } 25499a2dd95SBruce Richardson 25599a2dd95SBruce Richardson #else 25699a2dd95SBruce Richardson 25799a2dd95SBruce Richardson static inline void 25899a2dd95SBruce Richardson rte_mov16(uint8_t *dst, const uint8_t *src) 25999a2dd95SBruce Richardson { 26099a2dd95SBruce Richardson memcpy(dst, src, 16); 26199a2dd95SBruce Richardson } 26299a2dd95SBruce Richardson 26399a2dd95SBruce Richardson static inline void 26499a2dd95SBruce Richardson rte_mov32(uint8_t *dst, const uint8_t *src) 26599a2dd95SBruce Richardson { 26699a2dd95SBruce Richardson memcpy(dst, src, 32); 26799a2dd95SBruce Richardson } 26899a2dd95SBruce Richardson 26999a2dd95SBruce Richardson static inline void 27099a2dd95SBruce Richardson rte_mov48(uint8_t *dst, const uint8_t *src) 27199a2dd95SBruce Richardson { 27299a2dd95SBruce Richardson memcpy(dst, src, 48); 27399a2dd95SBruce Richardson } 27499a2dd95SBruce Richardson 27599a2dd95SBruce Richardson static inline void 27699a2dd95SBruce Richardson rte_mov64(uint8_t *dst, const uint8_t *src) 27799a2dd95SBruce Richardson { 27899a2dd95SBruce Richardson memcpy(dst, src, 64); 27999a2dd95SBruce Richardson } 28099a2dd95SBruce Richardson 28199a2dd95SBruce Richardson static inline void 28299a2dd95SBruce Richardson rte_mov128(uint8_t *dst, const uint8_t *src) 28399a2dd95SBruce Richardson { 28499a2dd95SBruce Richardson memcpy(dst, src, 128); 28599a2dd95SBruce Richardson } 28699a2dd95SBruce Richardson 28799a2dd95SBruce Richardson static inline void 28899a2dd95SBruce Richardson rte_mov256(uint8_t *dst, const uint8_t *src) 28999a2dd95SBruce Richardson { 29099a2dd95SBruce Richardson memcpy(dst, src, 256); 29199a2dd95SBruce Richardson } 29299a2dd95SBruce Richardson 29399a2dd95SBruce Richardson static inline void * 29499a2dd95SBruce Richardson rte_memcpy(void *dst, const void *src, size_t n) 29599a2dd95SBruce Richardson { 29699a2dd95SBruce Richardson return memcpy(dst, src, n); 29799a2dd95SBruce Richardson } 29899a2dd95SBruce Richardson 29999a2dd95SBruce Richardson #endif /* RTE_ARCH_ARM_NEON_MEMCPY */ 30099a2dd95SBruce Richardson 30199a2dd95SBruce Richardson #ifdef __cplusplus 30299a2dd95SBruce Richardson } 30399a2dd95SBruce Richardson #endif 30499a2dd95SBruce Richardson 30599a2dd95SBruce Richardson #endif /* _RTE_MEMCPY_ARM32_H_ */ 306