/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2014 Intel Corporation
 */

#ifndef _RTE_MEMCPY_X86_64_H_
#define _RTE_MEMCPY_X86_64_H_

/**
 * @file
 *
 * Functions for SSE/AVX/AVX2/AVX512 implementations of memcpy().
 */

#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <rte_vect.h>
#include <rte_common.h>
#include <rte_config.h>

#ifdef __cplusplus
extern "C" {
#endif

#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION >= 100000)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wstringop-overflow"
#endif

/**
 * Copy bytes from one location to another. The locations must not overlap.
 *
 * @note This may be implemented as a macro, so its address should not be taken
 * and care is needed as parameter expressions may be evaluated multiple times.
 *
 * @param dst
 *   Pointer to the destination of the data.
 * @param src
 *   Pointer to the source data.
 * @param n
 *   Number of bytes to copy.
 * @return
 *   Pointer to the destination data.
 */
static __rte_always_inline void *
rte_memcpy(void *dst, const void *src, size_t n);

/**
 * Copy bytes from one location to another,
 * locations should not overlap.
 * Use with n <= 15.
 */
static __rte_always_inline void *
rte_mov15_or_less(void *dst, const void *src, size_t n)
{
	/**
	 * Use the following structs to avoid violating C standard
	 * alignment requirements and to avoid strict aliasing bugs
	 */
	struct rte_uint64_alias {
		uint64_t val;
	} __rte_packed __rte_may_alias;
	struct rte_uint32_alias {
		uint32_t val;
	} __rte_packed __rte_may_alias;
	struct rte_uint16_alias {
		uint16_t val;
	} __rte_packed __rte_may_alias;

	void *ret = dst;
	if (n & 8) {
		((struct rte_uint64_alias *)dst)->val =
			((const struct rte_uint64_alias *)src)->val;
		src = (const uint64_t *)src + 1;
		dst = (uint64_t *)dst + 1;
	}
	if (n & 4) {
		((struct rte_uint32_alias *)dst)->val =
			((const struct rte_uint32_alias *)src)->val;
		src = (const uint32_t *)src + 1;
		dst = (uint32_t *)dst + 1;
	}
	if (n & 2) {
		((struct rte_uint16_alias *)dst)->val =
			((const struct rte_uint16_alias *)src)->val;
		src = (const uint16_t *)src + 1;
		dst = (uint16_t *)dst + 1;
	}
	if (n & 1)
		*(uint8_t *)dst = *(const uint8_t *)src;
	return ret;
}
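
/*
 * Example (illustrative only): for a small copy such as 13 bytes,
 * rte_memcpy() ends up in rte_mov15_or_less(), which decomposes n bit by
 * bit: 13 = 8 + 4 + 1, so one 8-byte, one 4-byte and one 1-byte move are
 * issued and the 2-byte step is skipped.
 *
 *	char src_buf[13] = "hello, world";
 *	char dst_buf[13];
 *
 *	rte_memcpy(dst_buf, src_buf, sizeof(src_buf));
 */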

#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512

#define ALIGNMENT_MASK 0x3F

/**
 * AVX512 implementation below
 */

/**
 * Copy 16 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov16(uint8_t *dst, const uint8_t *src)
{
	__m128i xmm0;

	xmm0 = _mm_loadu_si128((const __m128i *)src);
	_mm_storeu_si128((__m128i *)dst, xmm0);
}

/**
 * Copy 32 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov32(uint8_t *dst, const uint8_t *src)
{
	__m256i ymm0;

	ymm0 = _mm256_loadu_si256((const __m256i *)src);
	_mm256_storeu_si256((__m256i *)dst, ymm0);
}

/**
 * Copy 64 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov64(uint8_t *dst, const uint8_t *src)
{
	__m512i zmm0;

	zmm0 = _mm512_loadu_si512((const void *)src);
	_mm512_storeu_si512((void *)dst, zmm0);
}

/**
 * Copy 128 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov128(uint8_t *dst, const uint8_t *src)
{
	rte_mov64(dst + 0 * 64, src + 0 * 64);
	rte_mov64(dst + 1 * 64, src + 1 * 64);
}

/**
 * Copy 256 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov256(uint8_t *dst, const uint8_t *src)
{
	rte_mov64(dst + 0 * 64, src + 0 * 64);
	rte_mov64(dst + 1 * 64, src + 1 * 64);
	rte_mov64(dst + 2 * 64, src + 2 * 64);
	rte_mov64(dst + 3 * 64, src + 3 * 64);
}

/**
 * Copy 128-byte blocks from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
{
	__m512i zmm0, zmm1;

	while (n >= 128) {
		zmm0 = _mm512_loadu_si512((const void *)(src + 0 * 64));
		n -= 128;
		zmm1 = _mm512_loadu_si512((const void *)(src + 1 * 64));
		src = src + 128;
		_mm512_storeu_si512((void *)(dst + 0 * 64), zmm0);
		_mm512_storeu_si512((void *)(dst + 1 * 64), zmm1);
		dst = dst + 128;
	}
}

/**
 * Copy 512-byte blocks from one location to another,
 * locations should not overlap.
 */
static inline void
rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
{
	__m512i zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7;

	while (n >= 512) {
		zmm0 = _mm512_loadu_si512((const void *)(src + 0 * 64));
		n -= 512;
		zmm1 = _mm512_loadu_si512((const void *)(src + 1 * 64));
		zmm2 = _mm512_loadu_si512((const void *)(src + 2 * 64));
		zmm3 = _mm512_loadu_si512((const void *)(src + 3 * 64));
		zmm4 = _mm512_loadu_si512((const void *)(src + 4 * 64));
		zmm5 = _mm512_loadu_si512((const void *)(src + 5 * 64));
		zmm6 = _mm512_loadu_si512((const void *)(src + 6 * 64));
		zmm7 = _mm512_loadu_si512((const void *)(src + 7 * 64));
		src = src + 512;
		_mm512_storeu_si512((void *)(dst + 0 * 64), zmm0);
		_mm512_storeu_si512((void *)(dst + 1 * 64), zmm1);
		_mm512_storeu_si512((void *)(dst + 2 * 64), zmm2);
		_mm512_storeu_si512((void *)(dst + 3 * 64), zmm3);
		_mm512_storeu_si512((void *)(dst + 4 * 64), zmm4);
		_mm512_storeu_si512((void *)(dst + 5 * 64), zmm5);
		_mm512_storeu_si512((void *)(dst + 6 * 64), zmm6);
		_mm512_storeu_si512((void *)(dst + 7 * 64), zmm7);
		dst = dst + 512;
	}
}

static __rte_always_inline void *
rte_memcpy_generic(void *dst, const void *src, size_t n)
{
	void *ret = dst;
	size_t dstofss;
	size_t bits;

	/**
	 * Copy less than 16 bytes
	 */
	if (n < 16) {
		return rte_mov15_or_less(dst, src, n);
	}

	/**
	 * Fast way when copy size doesn't exceed 512 bytes
	 */
	if (n <= 32) {
		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
		rte_mov16((uint8_t *)dst - 16 + n,
				(const uint8_t *)src - 16 + n);
		return ret;
	}
	if (n <= 64) {
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		rte_mov32((uint8_t *)dst - 32 + n,
				(const uint8_t *)src - 32 + n);
		return ret;
	}
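	/*
	 * Note on the tail handling here and at COPY_BLOCK_128_BACK63 below:
	 * the last move is anchored at the end of the region (dst - 64 + n,
	 * or dst - 16/32 + n above), so any size in the range is covered by
	 * a fixed number of moves at the cost of re-copying a few overlapping
	 * bytes. For example, n = 100 is copied as bytes [0, 64) plus
	 * bytes [36, 100).
	 */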
	if (n <= 512) {
		if (n >= 256) {
			n -= 256;
			rte_mov256((uint8_t *)dst, (const uint8_t *)src);
			src = (const uint8_t *)src + 256;
			dst = (uint8_t *)dst + 256;
		}
		if (n >= 128) {
			n -= 128;
			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
			src = (const uint8_t *)src + 128;
			dst = (uint8_t *)dst + 128;
		}
COPY_BLOCK_128_BACK63:
		if (n > 64) {
			rte_mov64((uint8_t *)dst, (const uint8_t *)src);
			rte_mov64((uint8_t *)dst - 64 + n,
					(const uint8_t *)src - 64 + n);
			return ret;
		}
		if (n > 0)
			rte_mov64((uint8_t *)dst - 64 + n,
					(const uint8_t *)src - 64 + n);
		return ret;
	}

	/**
	 * Make store aligned when copy size exceeds 512 bytes
	 */
	dstofss = ((uintptr_t)dst & 0x3F);
	if (dstofss > 0) {
		dstofss = 64 - dstofss;
		n -= dstofss;
		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
		src = (const uint8_t *)src + dstofss;
		dst = (uint8_t *)dst + dstofss;
	}

	/**
	 * Copy 512-byte blocks.
	 * Use copy block function for better instruction order control,
	 * which is important when the load is unaligned.
	 */
	rte_mov512blocks((uint8_t *)dst, (const uint8_t *)src, n);
	bits = n;
	n = n & 511;
	bits -= n;
	src = (const uint8_t *)src + bits;
	dst = (uint8_t *)dst + bits;

	/**
	 * Copy 128-byte blocks.
	 * Use copy block function for better instruction order control,
	 * which is important when the load is unaligned.
	 */
	if (n >= 128) {
		rte_mov128blocks((uint8_t *)dst, (const uint8_t *)src, n);
		bits = n;
		n = n & 127;
		bits -= n;
		src = (const uint8_t *)src + bits;
		dst = (uint8_t *)dst + bits;
	}

	/**
	 * Copy whatever is left
	 */
	goto COPY_BLOCK_128_BACK63;
}

#elif defined __AVX2__

#define ALIGNMENT_MASK 0x1F

/**
 * AVX2 implementation below
 */

/**
 * Copy 16 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov16(uint8_t *dst, const uint8_t *src)
{
	__m128i xmm0;

	xmm0 = _mm_loadu_si128((const __m128i *)(const void *)src);
	_mm_storeu_si128((__m128i *)(void *)dst, xmm0);
}

/**
 * Copy 32 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov32(uint8_t *dst, const uint8_t *src)
{
	__m256i ymm0;

	ymm0 = _mm256_loadu_si256((const __m256i *)(const void *)src);
	_mm256_storeu_si256((__m256i *)(void *)dst, ymm0);
}

/**
 * Copy 64 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov64(uint8_t *dst, const uint8_t *src)
{
	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
}

/**
 * Copy 128 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov128(uint8_t *dst, const uint8_t *src)
{
	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
	rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32);
	rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32);
}

/**
 * Copy 256 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov256(uint8_t *dst, const uint8_t *src)
{
	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
	rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32);
	rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32);
	rte_mov32((uint8_t *)dst + 4 * 32, (const uint8_t *)src + 4 * 32);
	rte_mov32((uint8_t *)dst + 5 * 32, (const uint8_t *)src + 5 * 32);
	rte_mov32((uint8_t *)dst + 6 * 32, (const uint8_t *)src + 6 * 32);
	rte_mov32((uint8_t *)dst + 7 * 32, (const uint8_t *)src + 7 * 32);
}
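
/*
 * Note: like the AVX512 block copiers above, the 128-byte block copier
 * below issues all of an iteration's loads before any of its stores,
 * which helps instruction ordering when the loads are unaligned.
 */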

/**
 * Copy 128-byte blocks from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
{
	__m256i ymm0, ymm1, ymm2, ymm3;

	while (n >= 128) {
		ymm0 = _mm256_loadu_si256((const __m256i *)(const void *)
				((const uint8_t *)src + 0 * 32));
		n -= 128;
		ymm1 = _mm256_loadu_si256((const __m256i *)(const void *)
				((const uint8_t *)src + 1 * 32));
		ymm2 = _mm256_loadu_si256((const __m256i *)(const void *)
				((const uint8_t *)src + 2 * 32));
		ymm3 = _mm256_loadu_si256((const __m256i *)(const void *)
				((const uint8_t *)src + 3 * 32));
		src = (const uint8_t *)src + 128;
		_mm256_storeu_si256((__m256i *)(void *)
				((uint8_t *)dst + 0 * 32), ymm0);
		_mm256_storeu_si256((__m256i *)(void *)
				((uint8_t *)dst + 1 * 32), ymm1);
		_mm256_storeu_si256((__m256i *)(void *)
				((uint8_t *)dst + 2 * 32), ymm2);
		_mm256_storeu_si256((__m256i *)(void *)
				((uint8_t *)dst + 3 * 32), ymm3);
		dst = (uint8_t *)dst + 128;
	}
}

static __rte_always_inline void *
rte_memcpy_generic(void *dst, const void *src, size_t n)
{
	void *ret = dst;
	size_t dstofss;
	size_t bits;

	/**
	 * Copy less than 16 bytes
	 */
	if (n < 16) {
		return rte_mov15_or_less(dst, src, n);
	}

	/**
	 * Fast way when copy size doesn't exceed 256 bytes
	 */
	if (n <= 32) {
		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
		rte_mov16((uint8_t *)dst - 16 + n,
				(const uint8_t *)src - 16 + n);
		return ret;
	}
	if (n <= 48) {
		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
		rte_mov16((uint8_t *)dst + 16, (const uint8_t *)src + 16);
		rte_mov16((uint8_t *)dst - 16 + n,
				(const uint8_t *)src - 16 + n);
		return ret;
	}
	if (n <= 64) {
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		rte_mov32((uint8_t *)dst - 32 + n,
				(const uint8_t *)src - 32 + n);
		return ret;
	}
	if (n <= 256) {
		if (n >= 128) {
			n -= 128;
			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
			src = (const uint8_t *)src + 128;
			dst = (uint8_t *)dst + 128;
		}
COPY_BLOCK_128_BACK31:
		if (n >= 64) {
			n -= 64;
			rte_mov64((uint8_t *)dst, (const uint8_t *)src);
			src = (const uint8_t *)src + 64;
			dst = (uint8_t *)dst + 64;
		}
		if (n > 32) {
			rte_mov32((uint8_t *)dst, (const uint8_t *)src);
			rte_mov32((uint8_t *)dst - 32 + n,
					(const uint8_t *)src - 32 + n);
			return ret;
		}
		if (n > 0) {
			rte_mov32((uint8_t *)dst - 32 + n,
					(const uint8_t *)src - 32 + n);
		}
		return ret;
	}

	/**
	 * Make store aligned when copy size exceeds 256 bytes
	 */
	dstofss = (uintptr_t)dst & 0x1F;
	if (dstofss > 0) {
		dstofss = 32 - dstofss;
		n -= dstofss;
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		src = (const uint8_t *)src + dstofss;
		dst = (uint8_t *)dst + dstofss;
	}
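	/*
	 * At this point the destination is 32-byte aligned. For example, if
	 * dst & 0x1F was 5, dstofss became 27: rte_mov32() copied 32 bytes
	 * but the pointers advanced by only 27, so the overlapping 5 bytes
	 * are simply written again by the block copy that follows.
	 */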

	/**
	 * Copy 128-byte blocks
	 */
	rte_mov128blocks((uint8_t *)dst, (const uint8_t *)src, n);
	bits = n;
	n = n & 127;
	bits -= n;
	src = (const uint8_t *)src + bits;
	dst = (uint8_t *)dst + bits;

	/**
	 * Copy whatever is left
	 */
	goto COPY_BLOCK_128_BACK31;
}
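
/*
 * Illustrative walk-through of the block-copy arithmetic above: with
 * n = 300 after the alignment step, rte_mov128blocks() copies 256 bytes,
 * the mask leaves n = 300 & 127 = 44, and bits = 256 advances both
 * pointers past the copied blocks; the final goto then finishes the
 * 44-byte tail with two overlapping 32-byte moves.
 */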

#else /* __AVX512F__ */

#define ALIGNMENT_MASK 0x0F

/**
 * SSE & AVX implementation below
 */

/**
 * Copy 16 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov16(uint8_t *dst, const uint8_t *src)
{
	__m128i xmm0;

	xmm0 = _mm_loadu_si128((const __m128i *)(const void *)src);
	_mm_storeu_si128((__m128i *)(void *)dst, xmm0);
}

/**
 * Copy 32 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov32(uint8_t *dst, const uint8_t *src)
{
	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
}

/**
 * Copy 64 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov64(uint8_t *dst, const uint8_t *src)
{
	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
}

/**
 * Copy 128 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov128(uint8_t *dst, const uint8_t *src)
{
	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
	rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
	rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
	rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
	rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
}

/**
 * Copy 256 bytes from one location to another,
 * locations should not overlap.
 */
static inline void
rte_mov256(uint8_t *dst, const uint8_t *src)
{
	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
	rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
	rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
	rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
	rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
	rte_mov16((uint8_t *)dst + 8 * 16, (const uint8_t *)src + 8 * 16);
	rte_mov16((uint8_t *)dst + 9 * 16, (const uint8_t *)src + 9 * 16);
	rte_mov16((uint8_t *)dst + 10 * 16, (const uint8_t *)src + 10 * 16);
	rte_mov16((uint8_t *)dst + 11 * 16, (const uint8_t *)src + 11 * 16);
	rte_mov16((uint8_t *)dst + 12 * 16, (const uint8_t *)src + 12 * 16);
	rte_mov16((uint8_t *)dst + 13 * 16, (const uint8_t *)src + 13 * 16);
	rte_mov16((uint8_t *)dst + 14 * 16, (const uint8_t *)src + 14 * 16);
	rte_mov16((uint8_t *)dst + 15 * 16, (const uint8_t *)src + 15 * 16);
}

/**
 * Macro for copying an unaligned block from one location to another with a
 * constant load offset, leaving at most 47 bytes to be copied afterwards;
 * locations should not overlap.
 * Requirements:
 * - Store is aligned
 * - Load offset is <offset>, which must be an immediate value within [1, 15]
 * - For <src>, make sure <offset> bytes backwards & <16 - offset> bytes
 *   forwards are available for loading
 * - <dst>, <src>, <len> must be variables
 * - __m128i <xmm0> ~ <xmm8> must be pre-defined
 */
#define MOVEUNALIGNED_LEFT47_IMM(dst, src, len, offset) \
{ \
	size_t tmp; \
	while (len >= 128 + 16 - offset) { \
		xmm0 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 0 * 16)); \
		len -= 128; \
		xmm1 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 1 * 16)); \
		xmm2 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 2 * 16)); \
		xmm3 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 3 * 16)); \
		xmm4 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 4 * 16)); \
		xmm5 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 5 * 16)); \
		xmm6 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 6 * 16)); \
		xmm7 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 7 * 16)); \
		xmm8 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 8 * 16)); \
		src = (const uint8_t *)src + 128; \
		_mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset)); \
		_mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset)); \
		_mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 2 * 16), _mm_alignr_epi8(xmm3, xmm2, offset)); \
		_mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 3 * 16), _mm_alignr_epi8(xmm4, xmm3, offset)); \
		_mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 4 * 16), _mm_alignr_epi8(xmm5, xmm4, offset)); \
		_mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 5 * 16), _mm_alignr_epi8(xmm6, xmm5, offset)); \
		_mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 6 * 16), _mm_alignr_epi8(xmm7, xmm6, offset)); \
		_mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 7 * 16), _mm_alignr_epi8(xmm8, xmm7, offset)); \
		dst = (uint8_t *)dst + 128; \
	} \
	tmp = len; \
	len = ((len - 16 + offset) & 127) + 16 - offset; \
	tmp -= len; \
	src = (const uint8_t *)src + tmp; \
	dst = (uint8_t *)dst + tmp; \
	if (len >= 32 + 16 - offset) { \
		while (len >= 32 + 16 - offset) { \
			xmm0 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 0 * 16)); \
			len -= 32; \
			xmm1 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 1 * 16)); \
			xmm2 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 2 * 16)); \
			src = (const uint8_t *)src + 32; \
			_mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset)); \
			_mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset)); \
			dst = (uint8_t *)dst + 32; \
		} \
		tmp = len; \
		len = ((len - 16 + offset) & 31) + 16 - offset; \
		tmp -= len; \
		src = (const uint8_t *)src + tmp; \
		dst = (uint8_t *)dst + tmp; \
	} \
}
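
/*
 * Illustrative note on the technique above: every load is taken at a fixed
 * negative offset from src (src - offset + k * 16; the caller passes
 * offset = (uintptr_t)src & 0x0F, so these addresses are 16-byte aligned),
 * and _mm_alignr_epi8() concatenates two consecutive registers and shifts
 * the pair right by <offset> bytes to rebuild the data that starts at src.
 * For example, with offset = 5 the first store receives bytes 5..15 of
 * xmm0 (src[0..10]) followed by bytes 0..4 of xmm1 (src[11..15]).
 */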

/**
 * Macro for copying an unaligned block from one location to another,
 * leaving at most 47 bytes to be copied afterwards;
 * locations should not overlap.
 * Use a switch here because the aligning instruction requires an immediate
 * value for the shift count.
 * Requirements:
 * - Store is aligned
 * - Load offset is <offset>, which must be within [1, 15]
 * - For <src>, make sure <offset> bytes backwards & <16 - offset> bytes
 *   forwards are available for loading
 * - <dst>, <src>, <len> must be variables
 * - __m128i <xmm0> ~ <xmm8> used in MOVEUNALIGNED_LEFT47_IMM must be pre-defined
 */
#define MOVEUNALIGNED_LEFT47(dst, src, len, offset) \
{ \
	switch (offset) { \
	case 0x01: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x01); break; \
	case 0x02: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x02); break; \
	case 0x03: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x03); break; \
	case 0x04: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x04); break; \
	case 0x05: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x05); break; \
	case 0x06: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x06); break; \
	case 0x07: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x07); break; \
	case 0x08: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x08); break; \
	case 0x09: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x09); break; \
	case 0x0A: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0A); break; \
	case 0x0B: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0B); break; \
	case 0x0C: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0C); break; \
	case 0x0D: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0D); break; \
	case 0x0E: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0E); break; \
	case 0x0F: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0F); break; \
	default:; \
	} \
}
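
/*
 * Note: the case bodies above expand MOVEUNALIGNED_LEFT47_IMM() with the
 * enclosing variable n rather than the len parameter, so the macro is only
 * usable where the remaining length is named n, as in rte_memcpy_generic()
 * below. After it runs, fewer than 48 bytes remain to be copied (hence the
 * "at most 47 bytes" note above), which COPY_BLOCK_64_BACK15 finishes.
 */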

static __rte_always_inline void *
rte_memcpy_generic(void *dst, const void *src, size_t n)
{
	__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
	void *ret = dst;
	size_t dstofss;
	size_t srcofs;

	/**
	 * Copy less than 16 bytes
	 */
	if (n < 16) {
		return rte_mov15_or_less(dst, src, n);
	}

	/**
	 * Fast way when copy size doesn't exceed 512 bytes
	 */
	if (n <= 32) {
		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
		return ret;
	}
	if (n <= 48) {
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
		return ret;
	}
	if (n <= 64) {
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
		return ret;
	}
	if (n <= 128) {
		goto COPY_BLOCK_128_BACK15;
	}
	if (n <= 512) {
		if (n >= 256) {
			n -= 256;
			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
			rte_mov128((uint8_t *)dst + 128, (const uint8_t *)src + 128);
			src = (const uint8_t *)src + 256;
			dst = (uint8_t *)dst + 256;
		}
COPY_BLOCK_255_BACK15:
		if (n >= 128) {
			n -= 128;
			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
			src = (const uint8_t *)src + 128;
			dst = (uint8_t *)dst + 128;
		}
COPY_BLOCK_128_BACK15:
		if (n >= 64) {
			n -= 64;
			rte_mov64((uint8_t *)dst, (const uint8_t *)src);
			src = (const uint8_t *)src + 64;
			dst = (uint8_t *)dst + 64;
		}
COPY_BLOCK_64_BACK15:
		if (n >= 32) {
			n -= 32;
			rte_mov32((uint8_t *)dst, (const uint8_t *)src);
			src = (const uint8_t *)src + 32;
			dst = (uint8_t *)dst + 32;
		}
		if (n > 16) {
			rte_mov16((uint8_t *)dst, (const uint8_t *)src);
			rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
			return ret;
		}
		if (n > 0) {
			rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
		}
		return ret;
	}

	/**
	 * Make store aligned when copy size exceeds 512 bytes,
	 * and make sure the first 15 bytes are copied, because
	 * unaligned copy functions require up to 15 bytes
	 * backwards access.
	 */
	dstofss = (uintptr_t)dst & 0x0F;
	if (dstofss > 0) {
		dstofss = 16 - dstofss + 16;
		n -= dstofss;
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		src = (const uint8_t *)src + dstofss;
		dst = (uint8_t *)dst + dstofss;
	}
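	/*
	 * Example of the arithmetic above: if dst & 0x0F was 9, dstofss
	 * became 16 - 9 + 16 = 23, so 32 bytes were copied but the pointers
	 * advanced by only 23. The destination is now 16-byte aligned, and
	 * because src has advanced by 23 bytes (never fewer than 17 when
	 * this branch runs), the unaligned-load macro below can safely read
	 * up to 15 bytes before the new src.
	 */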
	srcofs = ((uintptr_t)src & 0x0F);

	/**
	 * For aligned copy
	 */
	if (srcofs == 0) {
		/**
		 * Copy 256-byte blocks
		 */
		for (; n >= 256; n -= 256) {
			rte_mov256((uint8_t *)dst, (const uint8_t *)src);
			dst = (uint8_t *)dst + 256;
			src = (const uint8_t *)src + 256;
		}

		/**
		 * Copy whatever is left
		 */
		goto COPY_BLOCK_255_BACK15;
	}

	/**
	 * For copy with unaligned load
	 */
	MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);

	/**
	 * Copy whatever is left
	 */
	goto COPY_BLOCK_64_BACK15;
}

#endif /* __AVX512F__ */

static __rte_always_inline void *
rte_memcpy_aligned(void *dst, const void *src, size_t n)
{
	void *ret = dst;

	/* Copy size < 16 bytes */
	if (n < 16) {
		return rte_mov15_or_less(dst, src, n);
	}

	/* Copy 16 <= size <= 32 bytes */
	if (n <= 32) {
		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
		rte_mov16((uint8_t *)dst - 16 + n,
				(const uint8_t *)src - 16 + n);

		return ret;
	}

	/* Copy 32 < size <= 64 bytes */
	if (n <= 64) {
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		rte_mov32((uint8_t *)dst - 32 + n,
				(const uint8_t *)src - 32 + n);

		return ret;
	}

	/* Copy 64-byte blocks */
	for (; n > 64; n -= 64) {
		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
		dst = (uint8_t *)dst + 64;
		src = (const uint8_t *)src + 64;
	}

	/* Copy whatever is left */
	rte_mov64((uint8_t *)dst - 64 + n,
			(const uint8_t *)src - 64 + n);

	return ret;
}

static __rte_always_inline void *
rte_memcpy(void *dst, const void *src, size_t n)
{
	if (!(((uintptr_t)dst | (uintptr_t)src) & ALIGNMENT_MASK))
		return rte_memcpy_aligned(dst, src, n);
	else
		return rte_memcpy_generic(dst, src, n);
}

#undef ALIGNMENT_MASK

#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION >= 100000)
#pragma GCC diagnostic pop
#endif

#ifdef __cplusplus
}
#endif

#endif /* _RTE_MEMCPY_X86_64_H_ */