/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2014 Intel Corporation
 */

#ifndef _RTE_MEMCPY_X86_64_H_
#define _RTE_MEMCPY_X86_64_H_

/**
 * @file
 *
 * Functions for SSE/AVX/AVX2/AVX512 implementation of memcpy().
 */

#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <rte_vect.h>
#include <rte_common.h>
#include <rte_config.h>

#ifdef __cplusplus
extern "C" {
#endif

#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION >= 100000)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wstringop-overflow"
#endif

/*
 * GCC versions older than 11 do not compile AVX properly, so use SSE instead.
 * AVX2 is not affected.
 */
#if defined __AVX2__
#define RTE_MEMCPY_AVX
#elif defined __AVX__ && !(defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION < 110000))
#define RTE_MEMCPY_AVX
#endif

/**
 * Copy bytes from one location to another. The locations must not overlap.
 *
 * @note This may be implemented as a macro, so its address should not be
 * taken, and care is needed as parameter expressions may be evaluated
 * multiple times.
 *
 * @param dst
 *   Pointer to the destination of the data.
 * @param src
 *   Pointer to the source data.
 * @param n
 *   Number of bytes to copy.
 * @return
 *   Pointer to the destination data.
 */
static __rte_always_inline void *
rte_memcpy(void *dst, const void *src, size_t n);
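
/*
 * Illustrative usage sketch (not part of the API; buffer names are
 * hypothetical): rte_memcpy() follows the standard memcpy() calling
 * convention and returns the destination pointer, e.g.
 *
 *	uint8_t in[1500], out[1500];
 *	rte_memcpy(out, in, sizeof(in));	(returns out)
 *
 * The dispatch between the aligned and generic paths defined below happens
 * per call, based on the pointer values and the vector width compiled in.
 */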

/**
 * Copy bytes from one location to another,
 * locations should not overlap.
 * Use with n <= 15.
 */
static __rte_always_inline void *
rte_mov15_or_less(void *dst, const void *src, size_t n)
{
	/**
	 * Use the following structs to avoid violating C standard
	 * alignment requirements and to avoid strict aliasing bugs
	 */
	struct __rte_packed_begin rte_uint64_alias {
		uint64_t val;
	} __rte_packed_end __rte_may_alias;
	struct __rte_packed_begin rte_uint32_alias {
		uint32_t val;
	} __rte_packed_end __rte_may_alias;
	struct __rte_packed_begin rte_uint16_alias {
		uint16_t val;
	} __rte_packed_end __rte_may_alias;

	void *ret = dst;
	if (n & 8) {
		((struct rte_uint64_alias *)dst)->val =
			((const struct rte_uint64_alias *)src)->val;
		src = (const uint64_t *)src + 1;
		dst = (uint64_t *)dst + 1;
	}
	if (n & 4) {
		((struct rte_uint32_alias *)dst)->val =
			((const struct rte_uint32_alias *)src)->val;
		src = (const uint32_t *)src + 1;
		dst = (uint32_t *)dst + 1;
	}
	if (n & 2) {
		((struct rte_uint16_alias *)dst)->val =
			((const struct rte_uint16_alias *)src)->val;
		src = (const uint16_t *)src + 1;
		dst = (uint16_t *)dst + 1;
	}
	if (n & 1)
		*(uint8_t *)dst = *(const uint8_t *)src;
	return ret;
}
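
/*
 * Worked example (illustration only): for n = 13 = 0b1101, the branches above
 * copy 8 bytes (n & 8), then 4 bytes (n & 4), skip the 2-byte step (n & 2 is
 * zero), and finish with 1 byte (n & 1), advancing both pointers after each
 * step, so exactly 13 bytes are written without a byte-by-byte loop.
 */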

/**
 * Copy 16 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov16(uint8_t *dst, const uint8_t *src)
{
	__m128i xmm0;

	xmm0 = _mm_loadu_si128((const __m128i *)(const void *)src);
	_mm_storeu_si128((__m128i *)(void *)dst, xmm0);
}

/**
 * Copy 32 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov32(uint8_t *dst, const uint8_t *src)
{
#if defined RTE_MEMCPY_AVX
	__m256i ymm0;

	ymm0 = _mm256_loadu_si256((const __m256i *)(const void *)src);
	_mm256_storeu_si256((__m256i *)(void *)dst, ymm0);
#else /* SSE implementation */
	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
#endif
}

/**
 * Copy 64 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov64(uint8_t *dst, const uint8_t *src)
{
#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
	__m512i zmm0;

	zmm0 = _mm512_loadu_si512((const void *)src);
	_mm512_storeu_si512((void *)dst, zmm0);
#else /* AVX2, AVX & SSE implementation */
	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
#endif
}

/**
 * Copy 128 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov128(uint8_t *dst, const uint8_t *src)
{
	rte_mov64(dst + 0 * 64, src + 0 * 64);
	rte_mov64(dst + 1 * 64, src + 1 * 64);
}

/**
 * Copy 256 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov256(uint8_t *dst, const uint8_t *src)
{
	rte_mov128(dst + 0 * 128, src + 0 * 128);
	rte_mov128(dst + 1 * 128, src + 1 * 128);
}

#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512

/**
 * AVX512 implementation below
 */

#define ALIGNMENT_MASK 0x3F

/**
 * Copy 128-byte blocks from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
{
	__m512i zmm0, zmm1;

	while (n >= 128) {
		zmm0 = _mm512_loadu_si512((const void *)(src + 0 * 64));
		n -= 128;
		zmm1 = _mm512_loadu_si512((const void *)(src + 1 * 64));
		src = src + 128;
		_mm512_storeu_si512((void *)(dst + 0 * 64), zmm0);
		_mm512_storeu_si512((void *)(dst + 1 * 64), zmm1);
		dst = dst + 128;
	}
}

/**
 * Copy 512-byte blocks from one location to another,
 * locations should not overlap.
 */
static inline void
rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
{
	__m512i zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7;

	while (n >= 512) {
		zmm0 = _mm512_loadu_si512((const void *)(src + 0 * 64));
		n -= 512;
		zmm1 = _mm512_loadu_si512((const void *)(src + 1 * 64));
		zmm2 = _mm512_loadu_si512((const void *)(src + 2 * 64));
		zmm3 = _mm512_loadu_si512((const void *)(src + 3 * 64));
		zmm4 = _mm512_loadu_si512((const void *)(src + 4 * 64));
		zmm5 = _mm512_loadu_si512((const void *)(src + 5 * 64));
		zmm6 = _mm512_loadu_si512((const void *)(src + 6 * 64));
		zmm7 = _mm512_loadu_si512((const void *)(src + 7 * 64));
		src = src + 512;
		_mm512_storeu_si512((void *)(dst + 0 * 64), zmm0);
		_mm512_storeu_si512((void *)(dst + 1 * 64), zmm1);
		_mm512_storeu_si512((void *)(dst + 2 * 64), zmm2);
		_mm512_storeu_si512((void *)(dst + 3 * 64), zmm3);
		_mm512_storeu_si512((void *)(dst + 4 * 64), zmm4);
		_mm512_storeu_si512((void *)(dst + 5 * 64), zmm5);
		_mm512_storeu_si512((void *)(dst + 6 * 64), zmm6);
		_mm512_storeu_si512((void *)(dst + 7 * 64), zmm7);
		dst = dst + 512;
	}
}
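
/*
 * Note (illustration only): rte_mov512blocks() copies only whole 512-byte
 * blocks; with n = 1300 the loop above runs twice (1024 bytes) and the last
 * 276 bytes are left for the caller, which advances both pointers by
 * n - (n & 511) and finishes with smaller copies. Issuing all eight loads
 * before the eight stores gives the CPU more freedom to overlap the
 * (possibly unaligned) loads with the aligned stores.
 */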

static __rte_always_inline void *
rte_memcpy_generic(void *dst, const void *src, size_t n)
{
	void *ret = dst;
	size_t dstofss;
	size_t bits;

	/**
	 * Copy less than 16 bytes
	 */
	if (n < 16) {
		return rte_mov15_or_less(dst, src, n);
	}

	/**
	 * Fast way when copy size doesn't exceed 512 bytes
	 */
	if (__rte_constant(n) && n == 32) {
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		return ret;
	}
	if (n <= 32) {
		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
		if (__rte_constant(n) && n == 16)
			return ret; /* avoid (harmless) duplicate copy */
		rte_mov16((uint8_t *)dst - 16 + n,
				(const uint8_t *)src - 16 + n);
		return ret;
	}
	if (__rte_constant(n) && n == 64) {
		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
		return ret;
	}
	if (n <= 64) {
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		rte_mov32((uint8_t *)dst - 32 + n,
				(const uint8_t *)src - 32 + n);
		return ret;
	}
	if (n <= 512) {
		if (n >= 256) {
			n -= 256;
			rte_mov256((uint8_t *)dst, (const uint8_t *)src);
			src = (const uint8_t *)src + 256;
			dst = (uint8_t *)dst + 256;
		}
		if (n >= 128) {
			n -= 128;
			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
			src = (const uint8_t *)src + 128;
			dst = (uint8_t *)dst + 128;
		}
COPY_BLOCK_128_BACK63:
		if (n > 64) {
			rte_mov64((uint8_t *)dst, (const uint8_t *)src);
			rte_mov64((uint8_t *)dst - 64 + n,
					(const uint8_t *)src - 64 + n);
			return ret;
		}
		if (n > 0)
			rte_mov64((uint8_t *)dst - 64 + n,
					(const uint8_t *)src - 64 + n);
		return ret;
	}

	/**
	 * Make store aligned when copy size exceeds 512 bytes
	 */
	dstofss = ((uintptr_t)dst & 0x3F);
	if (dstofss > 0) {
		dstofss = 64 - dstofss;
		n -= dstofss;
		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
		src = (const uint8_t *)src + dstofss;
		dst = (uint8_t *)dst + dstofss;
	}

	/**
	 * Copy 512-byte blocks.
	 * Use copy block function for better instruction order control,
	 * which is important when load is unaligned.
	 */
	rte_mov512blocks((uint8_t *)dst, (const uint8_t *)src, n);
	bits = n;
	n = n & 511;
	bits -= n;
	src = (const uint8_t *)src + bits;
	dst = (uint8_t *)dst + bits;

	/**
	 * Copy 128-byte blocks.
	 * Use copy block function for better instruction order control,
	 * which is important when load is unaligned.
	 */
	if (n >= 128) {
		rte_mov128blocks((uint8_t *)dst, (const uint8_t *)src, n);
		bits = n;
		n = n & 127;
		bits -= n;
		src = (const uint8_t *)src + bits;
		dst = (uint8_t *)dst + bits;
	}

	/**
	 * Copy whatever is left
	 */
	goto COPY_BLOCK_128_BACK63;
}
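
/*
 * Worked example for the AVX512 path above (illustration only): with n = 1000
 * and dst % 64 == 20, the alignment step copies 64 bytes but advances by
 * dstofss = 44, leaving dst 64-byte aligned and n = 956. rte_mov512blocks()
 * then moves 512 bytes, rte_mov128blocks() moves 3 * 128 = 384 bytes, and the
 * final 60 bytes are written by COPY_BLOCK_128_BACK63 as one rte_mov64()
 * ending exactly at dst + n (44 + 512 + 384 + 60 = 1000). Re-writing a few
 * already-copied bytes is harmless because source and destination do not
 * overlap.
 */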

#elif defined RTE_MEMCPY_AVX

/**
 * AVX implementation below
 */

#define ALIGNMENT_MASK 0x1F

/**
 * Copy 128-byte blocks from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
{
	__m256i ymm0, ymm1, ymm2, ymm3;

	while (n >= 128) {
		ymm0 = _mm256_loadu_si256((const __m256i *)(const void *)
				((const uint8_t *)src + 0 * 32));
		n -= 128;
		ymm1 = _mm256_loadu_si256((const __m256i *)(const void *)
				((const uint8_t *)src + 1 * 32));
		ymm2 = _mm256_loadu_si256((const __m256i *)(const void *)
				((const uint8_t *)src + 2 * 32));
		ymm3 = _mm256_loadu_si256((const __m256i *)(const void *)
				((const uint8_t *)src + 3 * 32));
		src = (const uint8_t *)src + 128;
		_mm256_storeu_si256((__m256i *)(void *)
				((uint8_t *)dst + 0 * 32), ymm0);
		_mm256_storeu_si256((__m256i *)(void *)
				((uint8_t *)dst + 1 * 32), ymm1);
		_mm256_storeu_si256((__m256i *)(void *)
				((uint8_t *)dst + 2 * 32), ymm2);
		_mm256_storeu_si256((__m256i *)(void *)
				((uint8_t *)dst + 3 * 32), ymm3);
		dst = (uint8_t *)dst + 128;
	}
}

static __rte_always_inline void *
rte_memcpy_generic(void *dst, const void *src, size_t n)
{
	void *ret = dst;
	size_t dstofss;
	size_t bits;

	/**
	 * Copy less than 16 bytes
	 */
	if (n < 16) {
		return rte_mov15_or_less(dst, src, n);
	}

	/**
	 * Fast way when copy size doesn't exceed 256 bytes
	 */
	if (__rte_constant(n) && n == 32) {
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		return ret;
	}
	if (n <= 32) {
		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
		if (__rte_constant(n) && n == 16)
			return ret; /* avoid (harmless) duplicate copy */
		rte_mov16((uint8_t *)dst - 16 + n,
				(const uint8_t *)src - 16 + n);
		return ret;
	}
	if (n <= 64) {
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		rte_mov32((uint8_t *)dst - 32 + n,
				(const uint8_t *)src - 32 + n);
		return ret;
	}
	if (n <= 256) {
		if (n >= 128) {
			n -= 128;
			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
			src = (const uint8_t *)src + 128;
			dst = (uint8_t *)dst + 128;
		}
COPY_BLOCK_128_BACK31:
		if (n >= 64) {
			n -= 64;
			rte_mov64((uint8_t *)dst, (const uint8_t *)src);
			src = (const uint8_t *)src + 64;
			dst = (uint8_t *)dst + 64;
		}
		if (n > 32) {
			rte_mov32((uint8_t *)dst, (const uint8_t *)src);
			rte_mov32((uint8_t *)dst - 32 + n,
					(const uint8_t *)src - 32 + n);
			return ret;
		}
		if (n > 0) {
			rte_mov32((uint8_t *)dst - 32 + n,
					(const uint8_t *)src - 32 + n);
		}
		return ret;
	}

	/**
	 * Make store aligned when copy size exceeds 256 bytes
	 */
	dstofss = (uintptr_t)dst & 0x1F;
	if (dstofss > 0) {
		dstofss = 32 - dstofss;
		n -= dstofss;
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		src = (const uint8_t *)src + dstofss;
		dst = (uint8_t *)dst + dstofss;
	}

	/**
	 * Copy 128-byte blocks
	 */
	rte_mov128blocks((uint8_t *)dst, (const uint8_t *)src, n);
	bits = n;
	n = n & 127;
	bits -= n;
	src = (const uint8_t *)src + bits;
	dst = (uint8_t *)dst + bits;

	/**
	 * Copy whatever is left
	 */
	goto COPY_BLOCK_128_BACK31;
}
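
/*
 * Note on the fast paths above (illustration only): the "backward" tail copy,
 * e.g. rte_mov32((uint8_t *)dst - 32 + n, ...), always writes the last 32
 * bytes of the destination. For n = 50 this means a head copy at dst and a
 * tail copy at dst + 18; the 14 bytes written twice are identical because
 * source and destination do not overlap, so the result is still an exact copy
 * without any scalar tail loop.
 */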

#else /* __AVX512F__ */

/**
 * SSE implementation below
 */

#define ALIGNMENT_MASK 0x0F

/**
 * Macro for copying an unaligned block from one location to another
 * with a constant load offset,
 * 47 bytes leftover maximum,
 * locations should not overlap.
 * Requirements:
 * - Store is aligned
 * - Load offset is <offset>, which must be an immediate value within [1, 15]
 * - For <src>, make sure <offset> bytes backwards and <16 - offset> bytes
 *   forwards are accessible for loading
 * - <dst>, <src>, <len> must be variables
 * - __m128i <xmm0> ~ <xmm8> must be pre-defined
 */
#define MOVEUNALIGNED_LEFT47_IMM(dst, src, len, offset) \
{ \
	size_t tmp; \
	while (len >= 128 + 16 - offset) { \
		xmm0 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 0 * 16)); \
		len -= 128; \
		xmm1 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 1 * 16)); \
		xmm2 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 2 * 16)); \
		xmm3 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 3 * 16)); \
		xmm4 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 4 * 16)); \
		xmm5 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 5 * 16)); \
		xmm6 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 6 * 16)); \
		xmm7 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 7 * 16)); \
		xmm8 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 8 * 16)); \
		src = (const uint8_t *)src + 128; \
		_mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset)); \
		_mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset)); \
		_mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 2 * 16), _mm_alignr_epi8(xmm3, xmm2, offset)); \
		_mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 3 * 16), _mm_alignr_epi8(xmm4, xmm3, offset)); \
		_mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 4 * 16), _mm_alignr_epi8(xmm5, xmm4, offset)); \
		_mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 5 * 16), _mm_alignr_epi8(xmm6, xmm5, offset)); \
		_mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 6 * 16), _mm_alignr_epi8(xmm7, xmm6, offset)); \
		_mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 7 * 16), _mm_alignr_epi8(xmm8, xmm7, offset)); \
		dst = (uint8_t *)dst + 128; \
	} \
	tmp = len; \
	len = ((len - 16 + offset) & 127) + 16 - offset; \
	tmp -= len; \
	src = (const uint8_t *)src + tmp; \
	dst = (uint8_t *)dst + tmp; \
	if (len >= 32 + 16 - offset) { \
		while (len >= 32 + 16 - offset) { \
			xmm0 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 0 * 16)); \
			len -= 32; \
			xmm1 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 1 * 16)); \
			xmm2 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 2 * 16)); \
			src = (const uint8_t *)src + 32; \
			_mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset)); \
			_mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset)); \
			dst = (uint8_t *)dst + 32; \
		} \
		tmp = len; \
		len = ((len - 16 + offset) & 31) + 16 - offset; \
		tmp -= len; \
		src = (const uint8_t *)src + tmp; \
		dst = (uint8_t *)dst + tmp; \
	} \
}

/**
 * Macro for copying an unaligned block from one location to another,
 * 47 bytes leftover maximum,
 * locations should not overlap.
 * Use a switch here because the aligning instruction requires an immediate
 * value for the shift count.
 * Requirements:
 * - Store is aligned
 * - Load offset is <offset>, which must be within [1, 15]
 * - For <src>, make sure <offset> bytes backwards and <16 - offset> bytes
 *   forwards are accessible for loading
 * - <dst>, <src>, <len> must be variables
 * - __m128i <xmm0> ~ <xmm8> used in MOVEUNALIGNED_LEFT47_IMM must be pre-defined
 */
#define MOVEUNALIGNED_LEFT47(dst, src, len, offset) \
{ \
	switch (offset) { \
	case 0x01: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x01); break; \
	case 0x02: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x02); break; \
	case 0x03: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x03); break; \
	case 0x04: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x04); break; \
	case 0x05: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x05); break; \
	case 0x06: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x06); break; \
	case 0x07: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x07); break; \
	case 0x08: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x08); break; \
	case 0x09: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x09); break; \
	case 0x0A: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0A); break; \
	case 0x0B: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0B); break; \
	case 0x0C: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0C); break; \
	case 0x0D: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0D); break; \
	case 0x0E: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0E); break; \
	case 0x0F: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0F); break; \
	default:; \
	} \
}
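
/*
 * How the macros above work (illustration only): with offset = 5, xmm0 holds
 * bytes src[-5..10] and xmm1 holds bytes src[11..26], both loaded starting at
 * 16-byte-aligned addresses (src - offset is aligned at the call site below).
 * _mm_alignr_epi8(xmm1, xmm0, 5) concatenates the two registers and shifts
 * right by 5 bytes, yielding exactly src[0..15], which is then written to the
 * already-aligned destination. This is why the caller must guarantee that
 * <offset> bytes before src are readable.
 */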

static __rte_always_inline void *
rte_memcpy_generic(void *dst, const void *src, size_t n)
{
	__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
	void *ret = dst;
	size_t dstofss;
	size_t srcofs;

	/**
	 * Copy less than 16 bytes
	 */
	if (n < 16) {
		return rte_mov15_or_less(dst, src, n);
	}

	/**
	 * Fast way when copy size doesn't exceed 512 bytes
	 */
	if (n <= 32) {
		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
		if (__rte_constant(n) && n == 16)
			return ret; /* avoid (harmless) duplicate copy */
		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
		return ret;
	}
	if (n <= 64) {
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		if (n > 48)
			rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
		return ret;
	}
	if (n <= 128) {
		goto COPY_BLOCK_128_BACK15;
	}
	if (n <= 512) {
		if (n >= 256) {
			n -= 256;
			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
			rte_mov128((uint8_t *)dst + 128, (const uint8_t *)src + 128);
			src = (const uint8_t *)src + 256;
			dst = (uint8_t *)dst + 256;
		}
COPY_BLOCK_255_BACK15:
		if (n >= 128) {
			n -= 128;
			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
			src = (const uint8_t *)src + 128;
			dst = (uint8_t *)dst + 128;
		}
COPY_BLOCK_128_BACK15:
		if (n >= 64) {
			n -= 64;
			rte_mov64((uint8_t *)dst, (const uint8_t *)src);
			src = (const uint8_t *)src + 64;
			dst = (uint8_t *)dst + 64;
		}
COPY_BLOCK_64_BACK15:
		if (n >= 32) {
			n -= 32;
			rte_mov32((uint8_t *)dst, (const uint8_t *)src);
			src = (const uint8_t *)src + 32;
			dst = (uint8_t *)dst + 32;
		}
		if (n > 16) {
			rte_mov16((uint8_t *)dst, (const uint8_t *)src);
			rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
			return ret;
		}
		if (n > 0) {
			rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
		}
		return ret;
	}

	/**
	 * Make store aligned when copy size exceeds 512 bytes,
	 * and make sure the first 15 bytes are copied, because
	 * unaligned copy functions require up to 15 bytes
	 * backwards access.
	 */
	dstofss = (uintptr_t)dst & 0x0F;
	if (dstofss > 0) {
		dstofss = 16 - dstofss + 16;
		n -= dstofss;
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		src = (const uint8_t *)src + dstofss;
		dst = (uint8_t *)dst + dstofss;
	}
	srcofs = ((uintptr_t)src & 0x0F);

	/**
	 * For aligned copy
	 */
	if (srcofs == 0) {
		/**
		 * Copy 256-byte blocks
		 */
		for (; n >= 256; n -= 256) {
			rte_mov256((uint8_t *)dst, (const uint8_t *)src);
			dst = (uint8_t *)dst + 256;
			src = (const uint8_t *)src + 256;
		}

		/**
		 * Copy whatever is left
		 */
		goto COPY_BLOCK_255_BACK15;
	}

	/**
	 * For copy with unaligned load
	 */
	MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);

	/**
	 * Copy whatever is left
	 */
	goto COPY_BLOCK_64_BACK15;
}

#endif /* __AVX512F__ */

static __rte_always_inline void *
rte_memcpy_aligned(void *dst, const void *src, size_t n)
{
	void *ret = dst;

	/* Copy size < 16 bytes */
	if (n < 16) {
		return rte_mov15_or_less(dst, src, n);
	}

	/* Copy 16 <= size <= 32 bytes */
	if (__rte_constant(n) && n == 32) {
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		return ret;
	}
	if (n <= 32) {
		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
		if (__rte_constant(n) && n == 16)
			return ret; /* avoid (harmless) duplicate copy */
		rte_mov16((uint8_t *)dst - 16 + n,
				(const uint8_t *)src - 16 + n);

		return ret;
	}

	/* Copy 32 < size <= 64 bytes */
	if (__rte_constant(n) && n == 64) {
		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
		return ret;
	}
	if (n <= 64) {
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		rte_mov32((uint8_t *)dst - 32 + n,
				(const uint8_t *)src - 32 + n);

		return ret;
	}

	/* Copy 64-byte blocks */
	for (; n > 64; n -= 64) {
		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
		dst = (uint8_t *)dst + 64;
		src = (const uint8_t *)src + 64;
	}

	/* Copy whatever is left */
	rte_mov64((uint8_t *)dst - 64 + n,
			(const uint8_t *)src - 64 + n);

	return ret;
}
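
/*
 * Dispatch note (illustration only): ALIGNMENT_MASK is 0x3F, 0x1F or 0x0F
 * depending on the vector width selected above, so the test below takes the
 * aligned path only when both dst and src are aligned to that width. For
 * example, with the SSE build (mask 0x0F), dst = 0x...7010 and src = 0x...9020
 * give (dst | src) & 0x0F == 0 and use rte_memcpy_aligned(); any other offset
 * falls back to rte_memcpy_generic().
 */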

static __rte_always_inline void *
rte_memcpy(void *dst, const void *src, size_t n)
{
	if (!(((uintptr_t)dst | (uintptr_t)src) & ALIGNMENT_MASK))
		return rte_memcpy_aligned(dst, src, n);
	else
		return rte_memcpy_generic(dst, src, n);
}

#undef ALIGNMENT_MASK

#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION >= 100000)
#pragma GCC diagnostic pop
#endif

#ifdef __cplusplus
}
#endif

#endif /* _RTE_MEMCPY_X86_64_H_ */