/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2015 Cavium, Inc
 */

#ifndef _RTE_MEMCPY_ARM64_H_
#define _RTE_MEMCPY_ARM64_H_

#include <stdint.h>
#include <string.h>

#include "generic/rte_memcpy.h"

#ifdef RTE_ARCH_ARM64_MEMCPY
#include <rte_common.h>
#include <rte_branch_prediction.h>

#ifdef __cplusplus
extern "C" {
#endif

/*
 * Memory copy performance differs across AArch64 micro-architectures.
 * Recent glibc releases (e.g. 2.23 or later) also provide a faster memcpy()
 * than older versions, so using a more recent glibc is recommended whenever
 * possible; the entire system benefits from it.
 *
 * This implementation improves memory copy on some AArch64 micro-architectures
 * when an old glibc (e.g. 2.19, 2.17...) is in use. It is disabled by default
 * and is activated by defining "RTE_ARCH_ARM64_MEMCPY". It does not always
 * outperform memcpy(), so users should run the "memcpy_perf_autotest" unit
 * test and tune the parameters in the customization section below for best
 * performance.
 *
 * The compiler version also affects rte_memcpy() performance. On some
 * platforms, binaries built from the same code with GCC 7.2.0 have been
 * observed to outperform those built with GCC 4.8.5.
 */

/**************************************
 * Beginning of customization section
 **************************************/
#ifndef RTE_ARM64_MEMCPY_ALIGN_MASK
#define RTE_ARM64_MEMCPY_ALIGN_MASK ((RTE_CACHE_LINE_SIZE >> 3) - 1)
#endif

#ifndef RTE_ARM64_MEMCPY_STRICT_ALIGN
/* Only src unalignment is treated as an unaligned copy */
#define RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY(dst, src) \
	((uintptr_t)(src) & RTE_ARM64_MEMCPY_ALIGN_MASK)
#else
/* Both dst and src unalignment are treated as an unaligned copy */
#define RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY(dst, src) \
	(((uintptr_t)(dst) | (uintptr_t)(src)) & RTE_ARM64_MEMCPY_ALIGN_MASK)
#endif


/*
 * If the copy size is larger than the threshold, memcpy() is used instead.
 * Run "memcpy_perf_autotest" to determine the proper threshold.
 */
#ifdef RTE_ARM64_MEMCPY_ALIGNED_THRESHOLD
#define USE_ALIGNED_RTE_MEMCPY(dst, src, n) \
	(!RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY(dst, src) && \
	n <= (size_t)RTE_ARM64_MEMCPY_ALIGNED_THRESHOLD)
#else
#define USE_ALIGNED_RTE_MEMCPY(dst, src, n) \
	(!RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY(dst, src))
#endif
#ifdef RTE_ARM64_MEMCPY_UNALIGNED_THRESHOLD
#define USE_UNALIGNED_RTE_MEMCPY(dst, src, n) \
	(RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY(dst, src) && \
	n <= (size_t)RTE_ARM64_MEMCPY_UNALIGNED_THRESHOLD)
#else
#define USE_UNALIGNED_RTE_MEMCPY(dst, src, n) \
	(RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY(dst, src))
#endif
/*
 * The logic of USE_RTE_MEMCPY() can also be modified to best fit the platform.
 */
#if defined(RTE_ARM64_MEMCPY_ALIGNED_THRESHOLD) \
	|| defined(RTE_ARM64_MEMCPY_UNALIGNED_THRESHOLD)
#define USE_RTE_MEMCPY(dst, src, n) \
	(USE_ALIGNED_RTE_MEMCPY(dst, src, n) || USE_UNALIGNED_RTE_MEMCPY(dst, src, n))
#else
#define USE_RTE_MEMCPY(dst, src, n) (1)
#endif
/**************************************
 * End of customization section
 **************************************/
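
/*
 * Illustrative example only: the customization macros above are normally
 * supplied at build time as compiler definitions (the exact mechanism
 * depends on the DPDK version and build system), e.g.:
 *
 *	-DRTE_ARCH_ARM64_MEMCPY
 *	-DRTE_ARM64_MEMCPY_ALIGNED_THRESHOLD=2048
 *	-DRTE_ARM64_MEMCPY_UNALIGNED_THRESHOLD=512
 *
 * The threshold values shown here are placeholders, not recommended
 * defaults; suitable values are platform specific and should be derived
 * from "memcpy_perf_autotest" results.
 */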

#if RTE_CC_IS_GNU && !defined RTE_ARM64_MEMCPY_SKIP_GCC_VER_CHECK
#if (GCC_VERSION < 50400)
#warning "The GCC version is quite old, which may result in sub-optimal \
performance of the compiled code. It is suggested that at least GCC 5.4.0 \
be used."
#endif
#endif

static __rte_always_inline
void rte_mov16(uint8_t *dst, const uint8_t *src)
{
	__uint128_t *dst128 = (__uint128_t *)dst;
	const __uint128_t *src128 = (const __uint128_t *)src;
	*dst128 = *src128;
}

static __rte_always_inline
void rte_mov32(uint8_t *dst, const uint8_t *src)
{
	__uint128_t *dst128 = (__uint128_t *)dst;
	const __uint128_t *src128 = (const __uint128_t *)src;
	const __uint128_t x0 = src128[0], x1 = src128[1];
	dst128[0] = x0;
	dst128[1] = x1;
}

static __rte_always_inline
void rte_mov48(uint8_t *dst, const uint8_t *src)
{
	__uint128_t *dst128 = (__uint128_t *)dst;
	const __uint128_t *src128 = (const __uint128_t *)src;
	const __uint128_t x0 = src128[0], x1 = src128[1], x2 = src128[2];
	dst128[0] = x0;
	dst128[1] = x1;
	dst128[2] = x2;
}

static __rte_always_inline
void rte_mov64(uint8_t *dst, const uint8_t *src)
{
	__uint128_t *dst128 = (__uint128_t *)dst;
	const __uint128_t *src128 = (const __uint128_t *)src;
	const __uint128_t
		x0 = src128[0], x1 = src128[1], x2 = src128[2], x3 = src128[3];
	dst128[0] = x0;
	dst128[1] = x1;
	dst128[2] = x2;
	dst128[3] = x3;
}

static __rte_always_inline
void rte_mov128(uint8_t *dst, const uint8_t *src)
{
	__uint128_t *dst128 = (__uint128_t *)dst;
	const __uint128_t *src128 = (const __uint128_t *)src;
	/* Keep below declaration & copy sequence for optimized instructions */
	const __uint128_t
		x0 = src128[0], x1 = src128[1], x2 = src128[2], x3 = src128[3];
	dst128[0] = x0;
	__uint128_t x4 = src128[4];
	dst128[1] = x1;
	__uint128_t x5 = src128[5];
	dst128[2] = x2;
	__uint128_t x6 = src128[6];
	dst128[3] = x3;
	__uint128_t x7 = src128[7];
	dst128[4] = x4;
	dst128[5] = x5;
	dst128[6] = x6;
	dst128[7] = x7;
}

static __rte_always_inline
void rte_mov256(uint8_t *dst, const uint8_t *src)
{
	rte_mov128(dst, src);
	rte_mov128(dst + 128, src + 128);
}
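
/*
 * The helpers below copy a leading block from the start of the buffer and a
 * trailing block aligned to the end of the buffer (dst - N + n, src - N + n).
 * When n is not an exact multiple of the block size, the head and tail
 * accesses overlap; this avoids byte-by-byte tail handling at the cost of
 * copying a few bytes twice.
 */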
static __rte_always_inline void
rte_memcpy_lt16(uint8_t *dst, const uint8_t *src, size_t n)
{
	if (n & 0x08) {
		/* copy 8 ~ 15 bytes */
		*(uint64_t *)dst = *(const uint64_t *)src;
		*(uint64_t *)(dst - 8 + n) = *(const uint64_t *)(src - 8 + n);
	} else if (n & 0x04) {
		/* copy 4 ~ 7 bytes */
		*(uint32_t *)dst = *(const uint32_t *)src;
		*(uint32_t *)(dst - 4 + n) = *(const uint32_t *)(src - 4 + n);
	} else if (n & 0x02) {
		/* copy 2 ~ 3 bytes */
		*(uint16_t *)dst = *(const uint16_t *)src;
		*(uint16_t *)(dst - 2 + n) = *(const uint16_t *)(src - 2 + n);
	} else if (n & 0x01) {
		/* copy 1 byte */
		*dst = *src;
	}
}

static __rte_always_inline
void rte_memcpy_ge16_lt128(uint8_t *dst, const uint8_t *src, size_t n)
{
	if (n < 64) {
		if (n == 16) {
			rte_mov16(dst, src);
		} else if (n <= 32) {
			rte_mov16(dst, src);
			rte_mov16(dst - 16 + n, src - 16 + n);
		} else if (n <= 48) {
			rte_mov32(dst, src);
			rte_mov16(dst - 16 + n, src - 16 + n);
		} else {
			rte_mov48(dst, src);
			rte_mov16(dst - 16 + n, src - 16 + n);
		}
	} else {
		rte_mov64(dst, src);
		if (n > 48 + 64)
			rte_mov64(dst - 64 + n, src - 64 + n);
		else if (n > 32 + 64)
			rte_mov48(dst - 48 + n, src - 48 + n);
		else if (n > 16 + 64)
			rte_mov32(dst - 32 + n, src - 32 + n);
		else if (n > 64)
			rte_mov16(dst - 16 + n, src - 16 + n);
	}
}

static __rte_always_inline
void rte_memcpy_ge128(uint8_t *dst, const uint8_t *src, size_t n)
{
	do {
		rte_mov128(dst, src);
		src += 128;
		dst += 128;
		n -= 128;
	} while (likely(n >= 128));

	if (likely(n)) {
		if (n <= 16)
			rte_mov16(dst - 16 + n, src - 16 + n);
		else if (n <= 32)
			rte_mov32(dst - 32 + n, src - 32 + n);
		else if (n <= 48)
			rte_mov48(dst - 48 + n, src - 48 + n);
		else if (n <= 64)
			rte_mov64(dst - 64 + n, src - 64 + n);
		else
			rte_memcpy_ge16_lt128(dst, src, n);
	}
}

static __rte_always_inline
void rte_memcpy_ge16_lt64(uint8_t *dst, const uint8_t *src, size_t n)
{
	if (n == 16) {
		rte_mov16(dst, src);
	} else if (n <= 32) {
		rte_mov16(dst, src);
		rte_mov16(dst - 16 + n, src - 16 + n);
	} else if (n <= 48) {
		rte_mov32(dst, src);
		rte_mov16(dst - 16 + n, src - 16 + n);
	} else {
		rte_mov48(dst, src);
		rte_mov16(dst - 16 + n, src - 16 + n);
	}
}

static __rte_always_inline
void rte_memcpy_ge64(uint8_t *dst, const uint8_t *src, size_t n)
{
	do {
		rte_mov64(dst, src);
		src += 64;
		dst += 64;
		n -= 64;
	} while (likely(n >= 64));

	if (likely(n)) {
		if (n <= 16)
			rte_mov16(dst - 16 + n, src - 16 + n);
		else if (n <= 32)
			rte_mov32(dst - 32 + n, src - 32 + n);
		else if (n <= 48)
			rte_mov48(dst - 48 + n, src - 48 + n);
		else
			rte_mov64(dst - 64 + n, src - 64 + n);
	}
}
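
/*
 * rte_memcpy() below dispatches on the copy size: small copies are expanded
 * inline, while larger copies run the block-copy loop (128-byte blocks when
 * RTE_CACHE_LINE_SIZE >= 128, 64-byte blocks otherwise) after prefetching
 * the first source and destination lines. When USE_RTE_MEMCPY() evaluates to
 * false (e.g. the size exceeds a configured threshold), the copy falls back
 * to the libc memcpy().
 *
 * Minimal usage sketch (illustrative only; the buffer names are
 * hypothetical):
 *
 *	uint8_t dst_buf[256], src_buf[256];
 *	rte_memcpy(dst_buf, src_buf, sizeof(src_buf));
 */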
#if RTE_CACHE_LINE_SIZE >= 128
static __rte_always_inline
void *rte_memcpy(void *dst, const void *src, size_t n)
{
	if (n < 16) {
		rte_memcpy_lt16((uint8_t *)dst, (const uint8_t *)src, n);
		return dst;
	}
	if (n < 128) {
		rte_memcpy_ge16_lt128((uint8_t *)dst, (const uint8_t *)src, n);
		return dst;
	}
	__builtin_prefetch(src, 0, 0);
	__builtin_prefetch(dst, 1, 0);
	if (likely(USE_RTE_MEMCPY(dst, src, n))) {
		rte_memcpy_ge128((uint8_t *)dst, (const uint8_t *)src, n);
		return dst;
	} else
		return memcpy(dst, src, n);
}

#else
static __rte_always_inline
void *rte_memcpy(void *dst, const void *src, size_t n)
{
	if (n < 16) {
		rte_memcpy_lt16((uint8_t *)dst, (const uint8_t *)src, n);
		return dst;
	}
	if (n < 64) {
		rte_memcpy_ge16_lt64((uint8_t *)dst, (const uint8_t *)src, n);
		return dst;
	}
	__builtin_prefetch(src, 0, 0);
	__builtin_prefetch(dst, 1, 0);
	if (likely(USE_RTE_MEMCPY(dst, src, n))) {
		rte_memcpy_ge64((uint8_t *)dst, (const uint8_t *)src, n);
		return dst;
	} else
		return memcpy(dst, src, n);
}
#endif /* RTE_CACHE_LINE_SIZE >= 128 */

#ifdef __cplusplus
}
#endif

#else /* RTE_ARCH_ARM64_MEMCPY */

#ifdef __cplusplus
extern "C" {
#endif

static inline void
rte_mov16(uint8_t *dst, const uint8_t *src)
{
	memcpy(dst, src, 16);
}

static inline void
rte_mov32(uint8_t *dst, const uint8_t *src)
{
	memcpy(dst, src, 32);
}

static inline void
rte_mov48(uint8_t *dst, const uint8_t *src)
{
	memcpy(dst, src, 48);
}

static inline void
rte_mov64(uint8_t *dst, const uint8_t *src)
{
	memcpy(dst, src, 64);
}

static inline void
rte_mov128(uint8_t *dst, const uint8_t *src)
{
	memcpy(dst, src, 128);
}

static inline void
rte_mov256(uint8_t *dst, const uint8_t *src)
{
	memcpy(dst, src, 256);
}

#define rte_memcpy(d, s, n)	memcpy((d), (s), (n))

#ifdef __cplusplus
}
#endif

#endif /* RTE_ARCH_ARM64_MEMCPY */

#endif /* _RTE_MEMCPY_ARM64_H_ */