/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2015 RehiveTech. All rights reserved.
 */

#ifndef _RTE_MEMCPY_ARM32_H_
#define _RTE_MEMCPY_ARM32_H_

#include <stdint.h>
#include <string.h>

#include "generic/rte_memcpy.h"

#ifdef RTE_ARCH_ARM_NEON_MEMCPY

#ifndef __ARM_NEON
#error "Cannot optimize memcpy by NEON as the CPU seems to not support this"
#endif

/* ARM NEON Intrinsics are used to copy data */
#include <arm_neon.h>

#ifdef __cplusplus
extern "C" {
#endif

static inline void
rte_mov16(uint8_t *dst, const uint8_t *src)
{
	vst1q_u8(dst, vld1q_u8(src));
}

static inline void
rte_mov32(uint8_t *dst, const uint8_t *src)
{
	asm volatile (
		"vld1.8 {d0-d3}, [%0]\n\t"
		"vst1.8 {d0-d3}, [%1]\n\t"
		: "+r" (src), "+r" (dst)
		: : "memory", "d0", "d1", "d2", "d3");
}

static inline void
rte_mov48(uint8_t *dst, const uint8_t *src)
{
	asm volatile (
		"vld1.8 {d0-d3}, [%0]!\n\t"
		"vld1.8 {d4-d5}, [%0]\n\t"
		"vst1.8 {d0-d3}, [%1]!\n\t"
		"vst1.8 {d4-d5}, [%1]\n\t"
		: "+r" (src), "+r" (dst)
		:
		: "memory", "d0", "d1", "d2", "d3", "d4", "d5");
}

static inline void
rte_mov64(uint8_t *dst, const uint8_t *src)
{
	asm volatile (
		"vld1.8 {d0-d3}, [%0]!\n\t"
		"vld1.8 {d4-d7}, [%0]\n\t"
		"vst1.8 {d0-d3}, [%1]!\n\t"
		"vst1.8 {d4-d7}, [%1]\n\t"
		: "+r" (src), "+r" (dst)
		:
		: "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7");
}

static inline void
rte_mov128(uint8_t *dst, const uint8_t *src)
{
	asm volatile ("pld [%0, #64]" : : "r" (src));
	asm volatile (
		"vld1.8 {d0-d3}, [%0]!\n\t"
		"vld1.8 {d4-d7}, [%0]!\n\t"
		"vld1.8 {d8-d11}, [%0]!\n\t"
		"vld1.8 {d12-d15}, [%0]\n\t"
		"vst1.8 {d0-d3}, [%1]!\n\t"
		"vst1.8 {d4-d7}, [%1]!\n\t"
		"vst1.8 {d8-d11}, [%1]!\n\t"
		"vst1.8 {d12-d15}, [%1]\n\t"
		: "+r" (src), "+r" (dst)
		:
		: "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
		"d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15");
}

static inline void
rte_mov256(uint8_t *dst, const uint8_t *src)
{
	asm volatile ("pld [%0, #64]" : : "r" (src));
	asm volatile ("pld [%0, #128]" : : "r" (src));
	asm volatile ("pld [%0, #192]" : : "r" (src));
	asm volatile ("pld [%0, #256]" : : "r" (src));
	asm volatile ("pld [%0, #320]" : : "r" (src));
	asm volatile ("pld [%0, #384]" : : "r" (src));
	asm volatile ("pld [%0, #448]" : : "r" (src));
	asm volatile (
		"vld1.8 {d0-d3}, [%0]!\n\t"
		"vld1.8 {d4-d7}, [%0]!\n\t"
		"vld1.8 {d8-d11}, [%0]!\n\t"
		"vld1.8 {d12-d15}, [%0]!\n\t"
		"vld1.8 {d16-d19}, [%0]!\n\t"
		"vld1.8 {d20-d23}, [%0]!\n\t"
		"vld1.8 {d24-d27}, [%0]!\n\t"
		"vld1.8 {d28-d31}, [%0]\n\t"
		"vst1.8 {d0-d3}, [%1]!\n\t"
		"vst1.8 {d4-d7}, [%1]!\n\t"
		"vst1.8 {d8-d11}, [%1]!\n\t"
		"vst1.8 {d12-d15}, [%1]!\n\t"
		"vst1.8 {d16-d19}, [%1]!\n\t"
		"vst1.8 {d20-d23}, [%1]!\n\t"
		"vst1.8 {d24-d27}, [%1]!\n\t"
		"vst1.8 {d28-d31}, [%1]!\n\t"
		: "+r" (src), "+r" (dst)
		:
		: "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
		"d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15",
		"d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23",
		"d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31");
}
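
/*
 * Usage sketch: for a compile-time-constant size the rte_memcpy() macro
 * below resolves to the compiler's/libc memcpy(), while a runtime size is
 * routed to the NEON-based rte_memcpy_func(). With hypothetical buffers
 * dst/src and a runtime length pkt_len:
 *
 *	rte_memcpy(dst, src, 64);       -> memcpy(dst, src, 64)
 *	rte_memcpy(dst, src, pkt_len);  -> rte_memcpy_func(dst, src, pkt_len)
 */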
#define rte_memcpy(dst, src, n)              \
	__extension__ ({                     \
	(__builtin_constant_p(n)) ?          \
	memcpy((dst), (src), (n)) :          \
	rte_memcpy_func((dst), (src), (n)); })

static inline void *
rte_memcpy_func(void *dst, const void *src, size_t n)
{
	void *ret = dst;

	/* We can't copy < 16 bytes using NEON registers, so do it manually. */
	if (n < 16) {
		if (n & 0x01) {
			*(uint8_t *)dst = *(const uint8_t *)src;
			dst = (uint8_t *)dst + 1;
			src = (const uint8_t *)src + 1;
		}
		if (n & 0x02) {
			*(uint16_t *)dst = *(const uint16_t *)src;
			dst = (uint16_t *)dst + 1;
			src = (const uint16_t *)src + 1;
		}
		if (n & 0x04) {
			*(uint32_t *)dst = *(const uint32_t *)src;
			dst = (uint32_t *)dst + 1;
			src = (const uint32_t *)src + 1;
		}
		if (n & 0x08) {
			/* ARMv7 cannot handle unaligned access to long long
			 * (uint64_t). Therefore two uint32_t operations are
			 * used.
			 */
			*(uint32_t *)dst = *(const uint32_t *)src;
			dst = (uint32_t *)dst + 1;
			src = (const uint32_t *)src + 1;
			*(uint32_t *)dst = *(const uint32_t *)src;
		}
		return ret;
	}

	/* Special fast cases for <= 128 bytes */
	if (n <= 32) {
		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
		rte_mov16((uint8_t *)dst - 16 + n,
			(const uint8_t *)src - 16 + n);
		return ret;
	}

	if (n <= 64) {
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		rte_mov32((uint8_t *)dst - 32 + n,
			(const uint8_t *)src - 32 + n);
		return ret;
	}

	if (n <= 128) {
		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
		rte_mov64((uint8_t *)dst - 64 + n,
			(const uint8_t *)src - 64 + n);
		return ret;
	}

	/*
	 * For large copies of more than 128 bytes: this combination of 256,
	 * 64 and 16 byte copies was found to be faster than doing 128 and
	 * 32 byte copies as well.
	 */
	for ( ; n >= 256; n -= 256) {
		rte_mov256((uint8_t *)dst, (const uint8_t *)src);
		dst = (uint8_t *)dst + 256;
		src = (const uint8_t *)src + 256;
	}

	/*
	 * We split the remaining bytes (which will be less than 256) into
	 * 64-byte (2^6) chunks.
	 * Using incrementing integers in the case labels of a switch statement
	 * encourages the compiler to use a jump table. To get incrementing
	 * integers, we shift the 2 relevant bits to the LSB position to first
	 * get decrementing integers, and then subtract.
	 */
	switch (3 - (n >> 6)) {
	case 0x00:
		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
		n -= 64;
		dst = (uint8_t *)dst + 64;
		src = (const uint8_t *)src + 64; /* fallthrough */
	case 0x01:
		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
		n -= 64;
		dst = (uint8_t *)dst + 64;
		src = (const uint8_t *)src + 64; /* fallthrough */
	case 0x02:
		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
		n -= 64;
		dst = (uint8_t *)dst + 64;
		src = (const uint8_t *)src + 64; /* fallthrough */
	default:
		break;
	}
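
	/*
	 * Worked example (illustrative): if n == 200 after the 256-byte loop,
	 * then 3 - (200 >> 6) == 0, so the switch above falls through all
	 * three cases and copies 192 bytes, leaving n == 8. The 16-byte
	 * switch below then hits its default case (3 - (8 >> 4) == 3) and
	 * copies nothing, and the final rte_mov16() call copies the 16 bytes
	 * ending exactly at the buffer end, re-copying 8 already-copied bytes
	 * to cover the 8-byte tail without touching memory past either buffer.
	 */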

	/*
	 * We split the remaining bytes (which will be less than 64) into
	 * 16-byte (2^4) chunks, using the same switch structure as above.
	 */
	switch (3 - (n >> 4)) {
	case 0x00:
		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
		n -= 16;
		dst = (uint8_t *)dst + 16;
		src = (const uint8_t *)src + 16; /* fallthrough */
	case 0x01:
		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
		n -= 16;
		dst = (uint8_t *)dst + 16;
		src = (const uint8_t *)src + 16; /* fallthrough */
	case 0x02:
		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
		n -= 16;
		dst = (uint8_t *)dst + 16;
		src = (const uint8_t *)src + 16; /* fallthrough */
	default:
		break;
	}

	/* Copy any remaining bytes, without going beyond the end of the buffers */
	if (n != 0)
		rte_mov16((uint8_t *)dst - 16 + n,
			(const uint8_t *)src - 16 + n);
	return ret;
}

#else

#ifdef __cplusplus
extern "C" {
#endif

static inline void
rte_mov16(uint8_t *dst, const uint8_t *src)
{
	memcpy(dst, src, 16);
}

static inline void
rte_mov32(uint8_t *dst, const uint8_t *src)
{
	memcpy(dst, src, 32);
}

static inline void
rte_mov48(uint8_t *dst, const uint8_t *src)
{
	memcpy(dst, src, 48);
}

static inline void
rte_mov64(uint8_t *dst, const uint8_t *src)
{
	memcpy(dst, src, 64);
}

static inline void
rte_mov128(uint8_t *dst, const uint8_t *src)
{
	memcpy(dst, src, 128);
}

static inline void
rte_mov256(uint8_t *dst, const uint8_t *src)
{
	memcpy(dst, src, 256);
}

static inline void *
rte_memcpy(void *dst, const void *src, size_t n)
{
	return memcpy(dst, src, n);
}

#endif /* RTE_ARCH_ARM_NEON_MEMCPY */

#ifdef __cplusplus
}
#endif

#endif /* _RTE_MEMCPY_ARM32_H_ */