/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2014 Intel Corporation
 */

#ifndef _RTE_MEMCPY_X86_64_H_
#define _RTE_MEMCPY_X86_64_H_

/**
 * @file
 *
 * Functions for SSE/AVX/AVX2/AVX512 implementation of memcpy().
 */

#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <rte_vect.h>
#include <rte_common.h>
#include <rte_config.h>

#ifdef __cplusplus
extern "C" {
#endif

#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION >= 100000)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wstringop-overflow"
#endif

/**
 * Copy bytes from one location to another. The locations must not overlap.
 *
 * @note This is implemented as a macro, so its address should not be taken,
 * and care is needed as parameter expressions may be evaluated multiple times.
 *
 * @param dst
 *   Pointer to the destination of the data.
 * @param src
 *   Pointer to the source data.
 * @param n
 *   Number of bytes to copy.
 * @return
 *   Pointer to the destination data.
 */
static __rte_always_inline void *
rte_memcpy(void *dst, const void *src, size_t n);

/**
 * Copy bytes from one location to another,
 * locations should not overlap.
 * Use with n <= 15.
 */
static __rte_always_inline void *
rte_mov15_or_less(void *dst, const void *src, size_t n)
{
	/**
	 * Use the following structs to avoid violating C standard
	 * alignment requirements and to avoid strict aliasing bugs
	 */
	struct rte_uint64_alias {
		uint64_t val;
	} __rte_packed __rte_may_alias;
	struct rte_uint32_alias {
		uint32_t val;
	} __rte_packed __rte_may_alias;
	struct rte_uint16_alias {
		uint16_t val;
	} __rte_packed __rte_may_alias;

	void *ret = dst;
	if (n & 8) {
		((struct rte_uint64_alias *)dst)->val =
			((const struct rte_uint64_alias *)src)->val;
		src = (const uint64_t *)src + 1;
		dst = (uint64_t *)dst + 1;
	}
	if (n & 4) {
		((struct rte_uint32_alias *)dst)->val =
			((const struct rte_uint32_alias *)src)->val;
		src = (const uint32_t *)src + 1;
		dst = (uint32_t *)dst + 1;
	}
	if (n & 2) {
		((struct rte_uint16_alias *)dst)->val =
			((const struct rte_uint16_alias *)src)->val;
		src = (const uint16_t *)src + 1;
		dst = (uint16_t *)dst + 1;
	}
	if (n & 1)
		*(uint8_t *)dst = *(const uint8_t *)src;
	return ret;
}

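/*
 * Illustrative walk-through of rte_mov15_or_less() (the concrete size is an
 * example, not taken from the code above): for n = 13 (binary 1101) the copy
 * is decomposed by testing the size bits, which for non-overlapping buffers
 * is equivalent to memcpy(dst, src, 13):
 *
 *   n & 8 -> copy bytes  0..7  (8-byte move, pointers advance by 8)
 *   n & 4 -> copy bytes  8..11 (4-byte move, pointers advance by 4)
 *   n & 2 -> skipped (bit not set)
 *   n & 1 -> copy byte  12
 */
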
#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512

#define ALIGNMENT_MASK 0x3F

/**
 * AVX512 implementation below
 */

/**
 * Copy 16 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov16(uint8_t *dst, const uint8_t *src)
{
	__m128i xmm0;

	xmm0 = _mm_loadu_si128((const __m128i *)src);
	_mm_storeu_si128((__m128i *)dst, xmm0);
}

/**
 * Copy 32 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov32(uint8_t *dst, const uint8_t *src)
{
	__m256i ymm0;

	ymm0 = _mm256_loadu_si256((const __m256i *)src);
	_mm256_storeu_si256((__m256i *)dst, ymm0);
}

/**
 * Copy 64 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov64(uint8_t *dst, const uint8_t *src)
{
	__m512i zmm0;

	zmm0 = _mm512_loadu_si512((const void *)src);
	_mm512_storeu_si512((void *)dst, zmm0);
}

/**
 * Copy 128 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov128(uint8_t *dst, const uint8_t *src)
{
	rte_mov64(dst + 0 * 64, src + 0 * 64);
	rte_mov64(dst + 1 * 64, src + 1 * 64);
}

/**
 * Copy 256 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov256(uint8_t *dst, const uint8_t *src)
{
	rte_mov64(dst + 0 * 64, src + 0 * 64);
	rte_mov64(dst + 1 * 64, src + 1 * 64);
	rte_mov64(dst + 2 * 64, src + 2 * 64);
	rte_mov64(dst + 3 * 64, src + 3 * 64);
}

/**
 * Copy 128-byte blocks from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
{
	__m512i zmm0, zmm1;

	while (n >= 128) {
		zmm0 = _mm512_loadu_si512((const void *)(src + 0 * 64));
		n -= 128;
		zmm1 = _mm512_loadu_si512((const void *)(src + 1 * 64));
		src = src + 128;
		_mm512_storeu_si512((void *)(dst + 0 * 64), zmm0);
		_mm512_storeu_si512((void *)(dst + 1 * 64), zmm1);
		dst = dst + 128;
	}
}

/**
 * Copy 512-byte blocks from one location to another,
 * locations should not overlap.
 */
static inline void
rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
{
	__m512i zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7;

	while (n >= 512) {
		zmm0 = _mm512_loadu_si512((const void *)(src + 0 * 64));
		n -= 512;
		zmm1 = _mm512_loadu_si512((const void *)(src + 1 * 64));
		zmm2 = _mm512_loadu_si512((const void *)(src + 2 * 64));
		zmm3 = _mm512_loadu_si512((const void *)(src + 3 * 64));
		zmm4 = _mm512_loadu_si512((const void *)(src + 4 * 64));
		zmm5 = _mm512_loadu_si512((const void *)(src + 5 * 64));
		zmm6 = _mm512_loadu_si512((const void *)(src + 6 * 64));
		zmm7 = _mm512_loadu_si512((const void *)(src + 7 * 64));
		src = src + 512;
		_mm512_storeu_si512((void *)(dst + 0 * 64), zmm0);
		_mm512_storeu_si512((void *)(dst + 1 * 64), zmm1);
		_mm512_storeu_si512((void *)(dst + 2 * 64), zmm2);
		_mm512_storeu_si512((void *)(dst + 3 * 64), zmm3);
		_mm512_storeu_si512((void *)(dst + 4 * 64), zmm4);
		_mm512_storeu_si512((void *)(dst + 5 * 64), zmm5);
		_mm512_storeu_si512((void *)(dst + 6 * 64), zmm6);
		_mm512_storeu_si512((void *)(dst + 7 * 64), zmm7);
		dst = dst + 512;
	}
}

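/*
 * Illustrative walk-through of rte_memcpy_generic() below (the concrete
 * numbers are an example, not taken from the code): for n = 1300 with dst
 * 24 bytes past a 64-byte boundary, the copy proceeds as
 *
 *   head : one rte_mov64() copy, then advance by dstofss = 64 - 24 = 40
 *          bytes so that subsequent stores are 64-byte aligned (n = 1260)
 *   body : rte_mov512blocks() copies 2 * 512 = 1024 bytes    (n = 236)
 *          rte_mov128blocks() copies 1 * 128 =  128 bytes    (n = 108)
 *   tail : COPY_BLOCK_128_BACK63 issues rte_mov64(dst) and
 *          rte_mov64(dst - 64 + n); the two stores overlap by 20 bytes,
 *          which is harmless because source and destination do not overlap,
 *          so the overlapping region is simply written twice with the same
 *          data.
 */
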
static __rte_always_inline void *
rte_memcpy_generic(void *dst, const void *src, size_t n)
{
	void *ret = dst;
	size_t dstofss;
	size_t bits;

	/**
	 * Copy less than 16 bytes
	 */
	if (n < 16) {
		return rte_mov15_or_less(dst, src, n);
	}

	/**
	 * Fast way when copy size doesn't exceed 512 bytes
	 */
	if (n <= 32) {
		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
		rte_mov16((uint8_t *)dst - 16 + n,
				(const uint8_t *)src - 16 + n);
		return ret;
	}
	if (n <= 64) {
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		rte_mov32((uint8_t *)dst - 32 + n,
				(const uint8_t *)src - 32 + n);
		return ret;
	}
	if (n <= 512) {
		if (n >= 256) {
			n -= 256;
			rte_mov256((uint8_t *)dst, (const uint8_t *)src);
			src = (const uint8_t *)src + 256;
			dst = (uint8_t *)dst + 256;
		}
		if (n >= 128) {
			n -= 128;
			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
			src = (const uint8_t *)src + 128;
			dst = (uint8_t *)dst + 128;
		}
COPY_BLOCK_128_BACK63:
		if (n > 64) {
			rte_mov64((uint8_t *)dst, (const uint8_t *)src);
			rte_mov64((uint8_t *)dst - 64 + n,
					(const uint8_t *)src - 64 + n);
			return ret;
		}
		if (n > 0)
			rte_mov64((uint8_t *)dst - 64 + n,
					(const uint8_t *)src - 64 + n);
		return ret;
	}

	/**
	 * Make store aligned when copy size exceeds 512 bytes
	 */
	dstofss = ((uintptr_t)dst & 0x3F);
	if (dstofss > 0) {
		dstofss = 64 - dstofss;
		n -= dstofss;
		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
		src = (const uint8_t *)src + dstofss;
		dst = (uint8_t *)dst + dstofss;
	}

	/**
	 * Copy 512-byte blocks.
	 * Use copy block function for better instruction order control,
	 * which is important when the load is unaligned.
	 */
	rte_mov512blocks((uint8_t *)dst, (const uint8_t *)src, n);
	bits = n;
	n = n & 511;
	bits -= n;
	src = (const uint8_t *)src + bits;
	dst = (uint8_t *)dst + bits;

	/**
	 * Copy 128-byte blocks.
	 * Use copy block function for better instruction order control,
	 * which is important when the load is unaligned.
	 */
	if (n >= 128) {
		rte_mov128blocks((uint8_t *)dst, (const uint8_t *)src, n);
		bits = n;
		n = n & 127;
		bits -= n;
		src = (const uint8_t *)src + bits;
		dst = (uint8_t *)dst + bits;
	}

	/**
	 * Copy whatever is left
	 */
	goto COPY_BLOCK_128_BACK63;
}

#elif defined __AVX2__

#define ALIGNMENT_MASK 0x1F

/**
 * AVX2 implementation below
 */

/**
 * Copy 16 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov16(uint8_t *dst, const uint8_t *src)
{
	__m128i xmm0;

	xmm0 = _mm_loadu_si128((const __m128i *)(const void *)src);
	_mm_storeu_si128((__m128i *)(void *)dst, xmm0);
}

/**
 * Copy 32 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov32(uint8_t *dst, const uint8_t *src)
{
	__m256i ymm0;

	ymm0 = _mm256_loadu_si256((const __m256i *)(const void *)src);
	_mm256_storeu_si256((__m256i *)(void *)dst, ymm0);
}

/**
 * Copy 64 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov64(uint8_t *dst, const uint8_t *src)
{
	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
}

/**
 * Copy 128 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov128(uint8_t *dst, const uint8_t *src)
{
	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
	rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32);
	rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32);
}

/**
 * Copy 128-byte blocks from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
{
	__m256i ymm0, ymm1, ymm2, ymm3;

	while (n >= 128) {
		ymm0 = _mm256_loadu_si256((const __m256i *)(const void *)
				((const uint8_t *)src + 0 * 32));
		n -= 128;
		ymm1 = _mm256_loadu_si256((const __m256i *)(const void *)
				((const uint8_t *)src + 1 * 32));
		ymm2 = _mm256_loadu_si256((const __m256i *)(const void *)
				((const uint8_t *)src + 2 * 32));
		ymm3 = _mm256_loadu_si256((const __m256i *)(const void *)
				((const uint8_t *)src + 3 * 32));
		src = (const uint8_t *)src + 128;
		_mm256_storeu_si256((__m256i *)(void *)
				((uint8_t *)dst + 0 * 32), ymm0);
		_mm256_storeu_si256((__m256i *)(void *)
				((uint8_t *)dst + 1 * 32), ymm1);
		_mm256_storeu_si256((__m256i *)(void *)
				((uint8_t *)dst + 2 * 32), ymm2);
		_mm256_storeu_si256((__m256i *)(void *)
				((uint8_t *)dst + 3 * 32), ymm3);
		dst = (uint8_t *)dst + 128;
	}
}

static __rte_always_inline void *
rte_memcpy_generic(void *dst, const void *src, size_t n)
{
	void *ret = dst;
	size_t dstofss;
	size_t bits;

	/**
	 * Copy less than 16 bytes
	 */
	if (n < 16) {
		return rte_mov15_or_less(dst, src, n);
	}

	/**
	 * Fast way when copy size doesn't exceed 256 bytes
	 */
	if (n <= 32) {
		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
		rte_mov16((uint8_t *)dst - 16 + n,
				(const uint8_t *)src - 16 + n);
		return ret;
	}
	if (n <= 48) {
		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
		rte_mov16((uint8_t *)dst + 16, (const uint8_t *)src + 16);
		rte_mov16((uint8_t *)dst - 16 + n,
				(const uint8_t *)src - 16 + n);
		return ret;
	}
	if (n <= 64) {
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		rte_mov32((uint8_t *)dst - 32 + n,
				(const uint8_t *)src - 32 + n);
		return ret;
	}
	if (n <= 256) {
		if (n >= 128) {
			n -= 128;
			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
			src = (const uint8_t *)src + 128;
			dst = (uint8_t *)dst + 128;
		}
COPY_BLOCK_128_BACK31:
		if (n >= 64) {
			n -= 64;
			rte_mov64((uint8_t *)dst, (const uint8_t *)src);
			src = (const uint8_t *)src + 64;
			dst = (uint8_t *)dst + 64;
		}
		if (n > 32) {
			rte_mov32((uint8_t *)dst, (const uint8_t *)src);
			rte_mov32((uint8_t *)dst - 32 + n,
					(const uint8_t *)src - 32 + n);
			return ret;
		}
		if (n > 0) {
			rte_mov32((uint8_t *)dst - 32 + n,
					(const uint8_t *)src - 32 + n);
		}
		return ret;
	}

	/**
	 * Make store aligned when copy size exceeds 256 bytes
	 */
	dstofss = (uintptr_t)dst & 0x1F;
	if (dstofss > 0) {
		dstofss = 32 - dstofss;
		n -= dstofss;
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		src = (const uint8_t *)src + dstofss;
		dst = (uint8_t *)dst + dstofss;
	}

	/**
	 * Copy 128-byte blocks
	 */
	rte_mov128blocks((uint8_t *)dst, (const uint8_t *)src, n);
	bits = n;
	n = n & 127;
	bits -= n;
	src = (const uint8_t *)src + bits;
	dst = (uint8_t *)dst + bits;

	/**
	 * Copy whatever is left
	 */
	goto COPY_BLOCK_128_BACK31;
}

#else /* __AVX512F__ */

#define ALIGNMENT_MASK 0x0F

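/*
 * Note on ALIGNMENT_MASK (the addresses below are illustrative, not taken
 * from the code): rte_memcpy() at the end of this file takes the
 * rte_memcpy_aligned() fast path only when both pointers are aligned to the
 * mask. For this SSE build, with dst = 0x7f0000001230 and
 * src = 0x7f0000005640, (0x1230 | 0x5640) & 0x0F == 0, so the aligned path
 * is taken; an AVX2 build (mask 0x1F) would fall back to
 * rte_memcpy_generic() for the same pair because 0x1230 & 0x1F != 0.
 */
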
/**
 * SSE & AVX implementation below
 */

/**
 * Copy 16 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov16(uint8_t *dst, const uint8_t *src)
{
	__m128i xmm0;

	xmm0 = _mm_loadu_si128((const __m128i *)(const void *)src);
	_mm_storeu_si128((__m128i *)(void *)dst, xmm0);
}

/**
 * Copy 32 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov32(uint8_t *dst, const uint8_t *src)
{
	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
}

/**
 * Copy 64 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov64(uint8_t *dst, const uint8_t *src)
{
	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
}

/**
 * Copy 128 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov128(uint8_t *dst, const uint8_t *src)
{
	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
	rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
	rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
	rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
	rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
}

/**
 * Copy 256 bytes from one location to another,
 * locations should not overlap.
 */
static inline void
rte_mov256(uint8_t *dst, const uint8_t *src)
{
	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
	rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
	rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
	rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
	rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
	rte_mov16((uint8_t *)dst + 8 * 16, (const uint8_t *)src + 8 * 16);
	rte_mov16((uint8_t *)dst + 9 * 16, (const uint8_t *)src + 9 * 16);
	rte_mov16((uint8_t *)dst + 10 * 16, (const uint8_t *)src + 10 * 16);
	rte_mov16((uint8_t *)dst + 11 * 16, (const uint8_t *)src + 11 * 16);
	rte_mov16((uint8_t *)dst + 12 * 16, (const uint8_t *)src + 12 * 16);
	rte_mov16((uint8_t *)dst + 13 * 16, (const uint8_t *)src + 13 * 16);
	rte_mov16((uint8_t *)dst + 14 * 16, (const uint8_t *)src + 14 * 16);
	rte_mov16((uint8_t *)dst + 15 * 16, (const uint8_t *)src + 15 * 16);
}

/**
 * Macro for copying an unaligned block from one location to another with a constant load offset,
 * at most 47 bytes are left over to be copied afterwards,
 * locations should not overlap.
 * Requirements:
 * - Store is aligned
 * - Load offset is <offset>, which must be an immediate value within [1, 15]
 * - For <src>, make sure <offset> bytes backwards & <16 - offset> bytes forwards are available for loading
 * - <dst>, <src>, <len> must be variables
 * - __m128i <xmm0> ~ <xmm8> must be pre-defined
 */
#define MOVEUNALIGNED_LEFT47_IMM(dst, src, len, offset) \
__extension__ ({ \
	size_t tmp; \
	while (len >= 128 + 16 - offset) { \
		xmm0 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 0 * 16)); \
		len -= 128; \
		xmm1 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 1 * 16)); \
		xmm2 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 2 * 16)); \
		xmm3 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 3 * 16)); \
		xmm4 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 4 * 16)); \
		xmm5 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 5 * 16)); \
		xmm6 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 6 * 16)); \
		xmm7 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 7 * 16)); \
		xmm8 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 8 * 16)); \
		src = (const uint8_t *)src + 128; \
		_mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset)); \
		_mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset)); \
		_mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 2 * 16), _mm_alignr_epi8(xmm3, xmm2, offset)); \
		_mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 3 * 16), _mm_alignr_epi8(xmm4, xmm3, offset)); \
		_mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 4 * 16), _mm_alignr_epi8(xmm5, xmm4, offset)); \
		_mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 5 * 16), _mm_alignr_epi8(xmm6, xmm5, offset)); \
		_mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 6 * 16), _mm_alignr_epi8(xmm7, xmm6, offset)); \
		_mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 7 * 16), _mm_alignr_epi8(xmm8, xmm7, offset)); \
		dst = (uint8_t *)dst + 128; \
	} \
	tmp = len; \
	len = ((len - 16 + offset) & 127) + 16 - offset; \
	tmp -= len; \
	src = (const uint8_t *)src + tmp; \
	dst = (uint8_t *)dst + tmp; \
	if (len >= 32 + 16 - offset) { \
		while (len >= 32 + 16 - offset) { \
			xmm0 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 0 * 16)); \
			len -= 32; \
			xmm1 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 1 * 16)); \
			xmm2 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 2 * 16)); \
			src = (const uint8_t *)src + 32; \
			_mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset)); \
			_mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset)); \
			dst = (uint8_t *)dst + 32; \
		} \
		tmp = len; \
		len = ((len - 16 + offset) & 31) + 16 - offset; \
		tmp -= len; \
		src = (const uint8_t *)src + tmp; \
		dst = (uint8_t *)dst + tmp; \
	} \
})

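/*
 * Illustrative sketch of the PALIGNR stitching performed by
 * MOVEUNALIGNED_LEFT47_IMM above (offset = 5 is an example value, not taken
 * from the code): with two 16-byte loads taken 5 bytes before the data of
 * interest,
 *
 *   xmm0 = _mm_loadu_si128((const __m128i *)(src - 5));    holds src[-5..10]
 *   xmm1 = _mm_loadu_si128((const __m128i *)(src + 11));   holds src[11..26]
 *   _mm_storeu_si128((__m128i *)dst, _mm_alignr_epi8(xmm1, xmm0, 5));
 *
 * _mm_alignr_epi8() concatenates xmm1:xmm0 and shifts the 32-byte value
 * right by 5 bytes, so the 16 bytes stored are exactly src[0..15], while the
 * store itself targets the already aligned dst.
 */
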
/**
 * Macro for copying an unaligned block from one location to another,
 * at most 47 bytes are left over to be copied afterwards,
 * locations should not overlap.
 * Use a switch here because the aligning instruction requires an immediate value for the shift count.
 * Requirements:
 * - Store is aligned
 * - Load offset is <offset>, which must be within [1, 15]
 * - For <src>, make sure <offset> bytes backwards & <16 - offset> bytes forwards are available for loading
 * - <dst>, <src>, <len> must be variables
 * - __m128i <xmm0> ~ <xmm8> used in MOVEUNALIGNED_LEFT47_IMM must be pre-defined
 */
#define MOVEUNALIGNED_LEFT47(dst, src, len, offset) \
__extension__ ({ \
	switch (offset) { \
	case 0x01: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x01); break; \
	case 0x02: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x02); break; \
	case 0x03: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x03); break; \
	case 0x04: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x04); break; \
	case 0x05: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x05); break; \
	case 0x06: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x06); break; \
	case 0x07: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x07); break; \
	case 0x08: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x08); break; \
	case 0x09: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x09); break; \
	case 0x0A: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0A); break; \
	case 0x0B: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0B); break; \
	case 0x0C: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0C); break; \
	case 0x0D: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0D); break; \
	case 0x0E: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0E); break; \
	case 0x0F: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0F); break; \
	default:; \
	} \
})

static __rte_always_inline void *
rte_memcpy_generic(void *dst, const void *src, size_t n)
{
	__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
	void *ret = dst;
	size_t dstofss;
	size_t srcofs;

	/**
	 * Copy less than 16 bytes
	 */
	if (n < 16) {
		return rte_mov15_or_less(dst, src, n);
	}

	/**
	 * Fast way when copy size doesn't exceed 512 bytes
	 */
	if (n <= 32) {
		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
		return ret;
	}
	if (n <= 48) {
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
		return ret;
	}
	if (n <= 64) {
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
		return ret;
	}
	if (n <= 128) {
		goto COPY_BLOCK_128_BACK15;
	}
	if (n <= 512) {
		if (n >= 256) {
			n -= 256;
			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
			rte_mov128((uint8_t *)dst + 128, (const uint8_t *)src + 128);
			src = (const uint8_t *)src + 256;
			dst = (uint8_t *)dst + 256;
		}
COPY_BLOCK_255_BACK15:
		if (n >= 128) {
			n -= 128;
			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
			src = (const uint8_t *)src + 128;
			dst = (uint8_t *)dst + 128;
		}
COPY_BLOCK_128_BACK15:
		if (n >= 64) {
			n -= 64;
			rte_mov64((uint8_t *)dst, (const uint8_t *)src);
			src = (const uint8_t *)src + 64;
			dst = (uint8_t *)dst + 64;
		}
COPY_BLOCK_64_BACK15:
		if (n >= 32) {
			n -= 32;
			rte_mov32((uint8_t *)dst, (const uint8_t *)src);
			src = (const uint8_t *)src + 32;
			dst = (uint8_t *)dst + 32;
		}
		if (n > 16) {
			rte_mov16((uint8_t *)dst, (const uint8_t *)src);
			rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
			return ret;
		}
		if (n > 0) {
			rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
		}
		return ret;
	}

	/**
	 * Make store aligned when copy size exceeds 512 bytes,
	 * and make sure the first 15 bytes are copied, because
	 * unaligned copy functions require up to 15 bytes
	 * backwards access.
	 */
	dstofss = (uintptr_t)dst & 0x0F;
	if (dstofss > 0) {
		dstofss = 16 - dstofss + 16;
		n -= dstofss;
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		src = (const uint8_t *)src + dstofss;
		dst = (uint8_t *)dst + dstofss;
	}
	srcofs = ((uintptr_t)src & 0x0F);

	/**
	 * For aligned copy
	 */
	if (srcofs == 0) {
		/**
		 * Copy 256-byte blocks
		 */
		for (; n >= 256; n -= 256) {
			rte_mov256((uint8_t *)dst, (const uint8_t *)src);
			dst = (uint8_t *)dst + 256;
			src = (const uint8_t *)src + 256;
		}

		/**
		 * Copy whatever is left
		 */
		goto COPY_BLOCK_255_BACK15;
	}

	/**
	 * For copy with unaligned load
	 */
	MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);

	/**
	 * Copy whatever is left
	 */
	goto COPY_BLOCK_64_BACK15;
}

#endif /* __AVX512F__ */

static __rte_always_inline void *
rte_memcpy_aligned(void *dst, const void *src, size_t n)
{
	void *ret = dst;

	/* Copy size < 16 bytes */
	if (n < 16) {
		return rte_mov15_or_less(dst, src, n);
	}

	/* Copy 16 <= size <= 32 bytes */
	if (n <= 32) {
		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
		rte_mov16((uint8_t *)dst - 16 + n,
				(const uint8_t *)src - 16 + n);

		return ret;
	}

	/* Copy 32 < size <= 64 bytes */
	if (n <= 64) {
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		rte_mov32((uint8_t *)dst - 32 + n,
				(const uint8_t *)src - 32 + n);

		return ret;
	}

	/* Copy 64-byte blocks */
	for (; n >= 64; n -= 64) {
		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
		dst = (uint8_t *)dst + 64;
		src = (const uint8_t *)src + 64;
	}

	/* Copy whatever is left */
	rte_mov64((uint8_t *)dst - 64 + n,
			(const uint8_t *)src - 64 + n);

	return ret;
}

static __rte_always_inline void *
rte_memcpy(void *dst, const void *src, size_t n)
{
	if (!(((uintptr_t)dst | (uintptr_t)src) & ALIGNMENT_MASK))
		return rte_memcpy_aligned(dst, src, n);
	else
		return rte_memcpy_generic(dst, src, n);
}

#undef ALIGNMENT_MASK

#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION >= 100000)
#pragma GCC diagnostic pop
#endif

#ifdef __cplusplus
}
#endif

#endif /* _RTE_MEMCPY_X86_64_H_ */