/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2014 Intel Corporation
 */

#ifndef _RTE_MEMCPY_X86_64_H_
#define _RTE_MEMCPY_X86_64_H_

/**
 * @file
 *
 * Functions for SSE/AVX/AVX2/AVX512 implementation of memcpy().
 */

#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <rte_vect.h>
#include <rte_common.h>
#include <rte_config.h>

#ifdef __cplusplus
extern "C" {
#endif

#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION >= 100000)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wstringop-overflow"
#endif

/**
 * Copy bytes from one location to another. The locations must not overlap.
 *
 * @note This is implemented as a macro, so its address should not be taken,
 * and care is needed as parameter expressions may be evaluated multiple times.
 *
 * @param dst
 *   Pointer to the destination of the data.
 * @param src
 *   Pointer to the source data.
 * @param n
 *   Number of bytes to copy.
 * @return
 *   Pointer to the destination data.
 */
static __rte_always_inline void *
rte_memcpy(void *dst, const void *src, size_t n);

#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512

#define ALIGNMENT_MASK 0x3F

/**
 * AVX512 implementation below
 */

/**
 * Copy 16 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov16(uint8_t *dst, const uint8_t *src)
{
	__m128i xmm0;

	xmm0 = _mm_loadu_si128((const __m128i *)src);
	_mm_storeu_si128((__m128i *)dst, xmm0);
}

/**
 * Copy 32 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov32(uint8_t *dst, const uint8_t *src)
{
	__m256i ymm0;

	ymm0 = _mm256_loadu_si256((const __m256i *)src);
	_mm256_storeu_si256((__m256i *)dst, ymm0);
}

/**
 * Copy 64 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov64(uint8_t *dst, const uint8_t *src)
{
	__m512i zmm0;

	zmm0 = _mm512_loadu_si512((const void *)src);
	_mm512_storeu_si512((void *)dst, zmm0);
}

/**
 * Copy 128 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov128(uint8_t *dst, const uint8_t *src)
{
	rte_mov64(dst + 0 * 64, src + 0 * 64);
	rte_mov64(dst + 1 * 64, src + 1 * 64);
}

/**
 * Copy 256 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov256(uint8_t *dst, const uint8_t *src)
{
	rte_mov64(dst + 0 * 64, src + 0 * 64);
	rte_mov64(dst + 1 * 64, src + 1 * 64);
	rte_mov64(dst + 2 * 64, src + 2 * 64);
	rte_mov64(dst + 3 * 64, src + 3 * 64);
}
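/*
 * Illustrative sketch, not part of the original header: the fast paths in
 * rte_memcpy_generic() below handle sizes that are not a multiple of the
 * vector width by copying one vector from the start of the region and one
 * from its end, letting the two copies overlap instead of falling back to a
 * byte loop. A hypothetical helper for 16..32 byte copies built on the
 * primitives above would look like this (the function name is invented for
 * illustration only):
 */
static __rte_always_inline void
rte_mov16_to_32_example(uint8_t *dst, const uint8_t *src, size_t n)
{
	/* head: first 16 bytes */
	rte_mov16(dst, src);
	/* tail: last 16 bytes; overlaps the head whenever n < 32 */
	rte_mov16(dst - 16 + n, src - 16 + n);
}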
/**
 * Copy 128-byte blocks from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
{
	__m512i zmm0, zmm1;

	while (n >= 128) {
		zmm0 = _mm512_loadu_si512((const void *)(src + 0 * 64));
		n -= 128;
		zmm1 = _mm512_loadu_si512((const void *)(src + 1 * 64));
		src = src + 128;
		_mm512_storeu_si512((void *)(dst + 0 * 64), zmm0);
		_mm512_storeu_si512((void *)(dst + 1 * 64), zmm1);
		dst = dst + 128;
	}
}

/**
 * Copy 512-byte blocks from one location to another,
 * locations should not overlap.
 */
static inline void
rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
{
	__m512i zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7;

	while (n >= 512) {
		zmm0 = _mm512_loadu_si512((const void *)(src + 0 * 64));
		n -= 512;
		zmm1 = _mm512_loadu_si512((const void *)(src + 1 * 64));
		zmm2 = _mm512_loadu_si512((const void *)(src + 2 * 64));
		zmm3 = _mm512_loadu_si512((const void *)(src + 3 * 64));
		zmm4 = _mm512_loadu_si512((const void *)(src + 4 * 64));
		zmm5 = _mm512_loadu_si512((const void *)(src + 5 * 64));
		zmm6 = _mm512_loadu_si512((const void *)(src + 6 * 64));
		zmm7 = _mm512_loadu_si512((const void *)(src + 7 * 64));
		src = src + 512;
		_mm512_storeu_si512((void *)(dst + 0 * 64), zmm0);
		_mm512_storeu_si512((void *)(dst + 1 * 64), zmm1);
		_mm512_storeu_si512((void *)(dst + 2 * 64), zmm2);
		_mm512_storeu_si512((void *)(dst + 3 * 64), zmm3);
		_mm512_storeu_si512((void *)(dst + 4 * 64), zmm4);
		_mm512_storeu_si512((void *)(dst + 5 * 64), zmm5);
		_mm512_storeu_si512((void *)(dst + 6 * 64), zmm6);
		_mm512_storeu_si512((void *)(dst + 7 * 64), zmm7);
		dst = dst + 512;
	}
}

static __rte_always_inline void *
rte_memcpy_generic(void *dst, const void *src, size_t n)
{
	uintptr_t dstu = (uintptr_t)dst;
	uintptr_t srcu = (uintptr_t)src;
	void *ret = dst;
	size_t dstofss;
	size_t bits;

	/**
	 * Copy less than 16 bytes
	 */
	if (n < 16) {
		if (n & 0x01) {
			*(uint8_t *)dstu = *(const uint8_t *)srcu;
			srcu = (uintptr_t)((const uint8_t *)srcu + 1);
			dstu = (uintptr_t)((uint8_t *)dstu + 1);
		}
		if (n & 0x02) {
			*(uint16_t *)dstu = *(const uint16_t *)srcu;
			srcu = (uintptr_t)((const uint16_t *)srcu + 1);
			dstu = (uintptr_t)((uint16_t *)dstu + 1);
		}
		if (n & 0x04) {
			*(uint32_t *)dstu = *(const uint32_t *)srcu;
			srcu = (uintptr_t)((const uint32_t *)srcu + 1);
			dstu = (uintptr_t)((uint32_t *)dstu + 1);
		}
		if (n & 0x08)
			*(uint64_t *)dstu = *(const uint64_t *)srcu;
		return ret;
	}

	/**
	 * Fast way when copy size doesn't exceed 512 bytes
	 */
	if (n <= 32) {
		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
		rte_mov16((uint8_t *)dst - 16 + n,
				(const uint8_t *)src - 16 + n);
		return ret;
	}
	if (n <= 64) {
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		rte_mov32((uint8_t *)dst - 32 + n,
				(const uint8_t *)src - 32 + n);
		return ret;
	}
	if (n <= 512) {
		if (n >= 256) {
			n -= 256;
			rte_mov256((uint8_t *)dst, (const uint8_t *)src);
			src = (const uint8_t *)src + 256;
			dst = (uint8_t *)dst + 256;
		}
		if (n >= 128) {
			n -= 128;
			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
			src = (const uint8_t *)src + 128;
			dst = (uint8_t *)dst + 128;
		}
COPY_BLOCK_128_BACK63:
		if (n > 64) {
			rte_mov64((uint8_t *)dst, (const uint8_t *)src);
			rte_mov64((uint8_t *)dst - 64 + n,
					(const uint8_t *)src - 64 + n);
			return ret;
		}
		if (n > 0)
			rte_mov64((uint8_t *)dst - 64 + n,
					(const uint8_t *)src - 64 + n);
		return ret;
	}

	/**
	 * Make store aligned when copy size exceeds 512 bytes
	 */
	dstofss = ((uintptr_t)dst & 0x3F);
	if (dstofss > 0) {
		dstofss = 64 - dstofss;
		n -= dstofss;
		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
		src = (const uint8_t *)src + dstofss;
		dst = (uint8_t *)dst + dstofss;
	}

	/**
	 * Copy 512-byte blocks.
	 * Use copy block function for better instruction order control,
	 * which is important when load is unaligned.
	 */
	rte_mov512blocks((uint8_t *)dst, (const uint8_t *)src, n);
	bits = n;
	n = n & 511;
	bits -= n;
	src = (const uint8_t *)src + bits;
	dst = (uint8_t *)dst + bits;

	/**
	 * Copy 128-byte blocks.
	 * Use copy block function for better instruction order control,
	 * which is important when load is unaligned.
	 */
	if (n >= 128) {
		rte_mov128blocks((uint8_t *)dst, (const uint8_t *)src, n);
		bits = n;
		n = n & 127;
		bits -= n;
		src = (const uint8_t *)src + bits;
		dst = (uint8_t *)dst + bits;
	}

	/**
	 * Copy whatever is left
	 */
	goto COPY_BLOCK_128_BACK63;
}

#elif defined __AVX2__

#define ALIGNMENT_MASK 0x1F

/**
 * AVX2 implementation below
 */

/**
 * Copy 16 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov16(uint8_t *dst, const uint8_t *src)
{
	__m128i xmm0;

	xmm0 = _mm_loadu_si128((const __m128i *)(const void *)src);
	_mm_storeu_si128((__m128i *)(void *)dst, xmm0);
}

/**
 * Copy 32 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov32(uint8_t *dst, const uint8_t *src)
{
	__m256i ymm0;

	ymm0 = _mm256_loadu_si256((const __m256i *)(const void *)src);
	_mm256_storeu_si256((__m256i *)(void *)dst, ymm0);
}

/**
 * Copy 64 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov64(uint8_t *dst, const uint8_t *src)
{
	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
}

/**
 * Copy 128 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov128(uint8_t *dst, const uint8_t *src)
{
	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
	rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32);
	rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32);
}
/**
 * Copy 128-byte blocks from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
{
	__m256i ymm0, ymm1, ymm2, ymm3;

	while (n >= 128) {
		ymm0 = _mm256_loadu_si256((const __m256i *)(const void *)
					  ((const uint8_t *)src + 0 * 32));
		n -= 128;
		ymm1 = _mm256_loadu_si256((const __m256i *)(const void *)
					  ((const uint8_t *)src + 1 * 32));
		ymm2 = _mm256_loadu_si256((const __m256i *)(const void *)
					  ((const uint8_t *)src + 2 * 32));
		ymm3 = _mm256_loadu_si256((const __m256i *)(const void *)
					  ((const uint8_t *)src + 3 * 32));
		src = (const uint8_t *)src + 128;
		_mm256_storeu_si256((__m256i *)(void *)
				    ((uint8_t *)dst + 0 * 32), ymm0);
		_mm256_storeu_si256((__m256i *)(void *)
				    ((uint8_t *)dst + 1 * 32), ymm1);
		_mm256_storeu_si256((__m256i *)(void *)
				    ((uint8_t *)dst + 2 * 32), ymm2);
		_mm256_storeu_si256((__m256i *)(void *)
				    ((uint8_t *)dst + 3 * 32), ymm3);
		dst = (uint8_t *)dst + 128;
	}
}

static __rte_always_inline void *
rte_memcpy_generic(void *dst, const void *src, size_t n)
{
	uintptr_t dstu = (uintptr_t)dst;
	uintptr_t srcu = (uintptr_t)src;
	void *ret = dst;
	size_t dstofss;
	size_t bits;

	/**
	 * Copy less than 16 bytes
	 */
	if (n < 16) {
		if (n & 0x01) {
			*(uint8_t *)dstu = *(const uint8_t *)srcu;
			srcu = (uintptr_t)((const uint8_t *)srcu + 1);
			dstu = (uintptr_t)((uint8_t *)dstu + 1);
		}
		if (n & 0x02) {
			*(uint16_t *)dstu = *(const uint16_t *)srcu;
			srcu = (uintptr_t)((const uint16_t *)srcu + 1);
			dstu = (uintptr_t)((uint16_t *)dstu + 1);
		}
		if (n & 0x04) {
			*(uint32_t *)dstu = *(const uint32_t *)srcu;
			srcu = (uintptr_t)((const uint32_t *)srcu + 1);
			dstu = (uintptr_t)((uint32_t *)dstu + 1);
		}
		if (n & 0x08) {
			*(uint64_t *)dstu = *(const uint64_t *)srcu;
		}
		return ret;
	}

	/**
	 * Fast way when copy size doesn't exceed 256 bytes
	 */
	if (n <= 32) {
		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
		rte_mov16((uint8_t *)dst - 16 + n,
				(const uint8_t *)src - 16 + n);
		return ret;
	}
	if (n <= 48) {
		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
		rte_mov16((uint8_t *)dst + 16, (const uint8_t *)src + 16);
		rte_mov16((uint8_t *)dst - 16 + n,
				(const uint8_t *)src - 16 + n);
		return ret;
	}
	if (n <= 64) {
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		rte_mov32((uint8_t *)dst - 32 + n,
				(const uint8_t *)src - 32 + n);
		return ret;
	}
	if (n <= 256) {
		if (n >= 128) {
			n -= 128;
			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
			src = (const uint8_t *)src + 128;
			dst = (uint8_t *)dst + 128;
		}
COPY_BLOCK_128_BACK31:
		if (n >= 64) {
			n -= 64;
			rte_mov64((uint8_t *)dst, (const uint8_t *)src);
			src = (const uint8_t *)src + 64;
			dst = (uint8_t *)dst + 64;
		}
		if (n > 32) {
			rte_mov32((uint8_t *)dst, (const uint8_t *)src);
			rte_mov32((uint8_t *)dst - 32 + n,
					(const uint8_t *)src - 32 + n);
			return ret;
		}
		if (n > 0) {
			rte_mov32((uint8_t *)dst - 32 + n,
					(const uint8_t *)src - 32 + n);
		}
		return ret;
	}

	/**
	 * Make store aligned when copy size exceeds 256 bytes
	 */
	dstofss = (uintptr_t)dst & 0x1F;
	if (dstofss > 0) {
		dstofss = 32 - dstofss;
		n -= dstofss;
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		src = (const uint8_t *)src + dstofss;
		dst = (uint8_t *)dst + dstofss;
	}
	/**
	 * Copy 128-byte blocks
	 */
	rte_mov128blocks((uint8_t *)dst, (const uint8_t *)src, n);
	bits = n;
	n = n & 127;
	bits -= n;
	src = (const uint8_t *)src + bits;
	dst = (uint8_t *)dst + bits;

	/**
	 * Copy whatever is left
	 */
	goto COPY_BLOCK_128_BACK31;
}

#else /* __AVX512F__ */

#define ALIGNMENT_MASK 0x0F

/**
 * SSE & AVX implementation below
 */

/**
 * Copy 16 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov16(uint8_t *dst, const uint8_t *src)
{
	__m128i xmm0;

	xmm0 = _mm_loadu_si128((const __m128i *)(const void *)src);
	_mm_storeu_si128((__m128i *)(void *)dst, xmm0);
}

/**
 * Copy 32 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov32(uint8_t *dst, const uint8_t *src)
{
	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
}

/**
 * Copy 64 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov64(uint8_t *dst, const uint8_t *src)
{
	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
}

/**
 * Copy 128 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov128(uint8_t *dst, const uint8_t *src)
{
	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
	rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
	rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
	rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
	rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
}
/**
 * Copy 256 bytes from one location to another,
 * locations should not overlap.
 */
static inline void
rte_mov256(uint8_t *dst, const uint8_t *src)
{
	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
	rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
	rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
	rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
	rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
	rte_mov16((uint8_t *)dst + 8 * 16, (const uint8_t *)src + 8 * 16);
	rte_mov16((uint8_t *)dst + 9 * 16, (const uint8_t *)src + 9 * 16);
	rte_mov16((uint8_t *)dst + 10 * 16, (const uint8_t *)src + 10 * 16);
	rte_mov16((uint8_t *)dst + 11 * 16, (const uint8_t *)src + 11 * 16);
	rte_mov16((uint8_t *)dst + 12 * 16, (const uint8_t *)src + 12 * 16);
	rte_mov16((uint8_t *)dst + 13 * 16, (const uint8_t *)src + 13 * 16);
	rte_mov16((uint8_t *)dst + 14 * 16, (const uint8_t *)src + 14 * 16);
	rte_mov16((uint8_t *)dst + 15 * 16, (const uint8_t *)src + 15 * 16);
}

/**
 * Macro for copying an unaligned block from one location to another with a constant load offset,
 * 47 bytes leftover maximum,
 * locations should not overlap.
 * Requirements:
 * - Store is aligned
 * - Load offset is <offset>, which must be an immediate value within [1, 15]
 * - For <src>, make sure <offset> bytes backwards & <16 - offset> bytes forwards are available for loading
 * - <dst>, <src>, <len> must be variables
 * - __m128i <xmm0> ~ <xmm8> must be pre-defined
 */
#define MOVEUNALIGNED_LEFT47_IMM(dst, src, len, offset) \
__extension__ ({ \
	size_t tmp; \
	while (len >= 128 + 16 - offset) { \
		xmm0 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 0 * 16)); \
		len -= 128; \
		xmm1 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 1 * 16)); \
		xmm2 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 2 * 16)); \
		xmm3 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 3 * 16)); \
		xmm4 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 4 * 16)); \
		xmm5 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 5 * 16)); \
		xmm6 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 6 * 16)); \
		xmm7 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 7 * 16)); \
		xmm8 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 8 * 16)); \
		src = (const uint8_t *)src + 128; \
		_mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset)); \
		_mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset)); \
		_mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 2 * 16), _mm_alignr_epi8(xmm3, xmm2, offset)); \
		_mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 3 * 16), _mm_alignr_epi8(xmm4, xmm3, offset)); \
		_mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 4 * 16), _mm_alignr_epi8(xmm5, xmm4, offset)); \
		_mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 5 * 16), _mm_alignr_epi8(xmm6, xmm5, offset)); \
		_mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 6 * 16), _mm_alignr_epi8(xmm7, xmm6, offset)); \
		_mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 7 * 16), _mm_alignr_epi8(xmm8, xmm7, offset)); \
		dst = (uint8_t *)dst + 128; \
	} \
	tmp = len; \
	len = ((len - 16 + offset) & 127) + 16 - offset; \
	tmp -= len; \
	src = (const uint8_t *)src + tmp; \
	dst = (uint8_t *)dst + tmp; \
	if (len >= 32 + 16 - offset) { \
		while (len >= 32 + 16 - offset) { \
			xmm0 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 0 * 16)); \
			len -= 32; \
			xmm1 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 1 * 16)); \
			xmm2 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 2 * 16)); \
			src = (const uint8_t *)src + 32; \
			_mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset)); \
			_mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset)); \
			dst = (uint8_t *)dst + 32; \
		} \
		tmp = len; \
		len = ((len - 16 + offset) & 31) + 16 - offset; \
		tmp -= len; \
		src = (const uint8_t *)src + tmp; \
		dst = (uint8_t *)dst + tmp; \
	} \
})

/**
 * Macro for copying an unaligned block from one location to another,
 * 47 bytes leftover maximum,
 * locations should not overlap.
 * Use a switch here because the aligning instruction requires an immediate value for the shift count.
 * Requirements:
 * - Store is aligned
 * - Load offset is <offset>, which must be within [1, 15]
 * - For <src>, make sure <offset> bytes backwards & <16 - offset> bytes forwards are available for loading
 * - __m128i <xmm0> ~ <xmm8> used in MOVEUNALIGNED_LEFT47_IMM must be pre-defined
 */
#define MOVEUNALIGNED_LEFT47(dst, src, len, offset) \
__extension__ ({ \
	switch (offset) { \
	case 0x01: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x01); break; \
	case 0x02: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x02); break; \
	case 0x03: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x03); break; \
	case 0x04: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x04); break; \
	case 0x05: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x05); break; \
	case 0x06: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x06); break; \
	case 0x07: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x07); break; \
	case 0x08: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x08); break; \
	case 0x09: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x09); break; \
	case 0x0A: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0A); break; \
	case 0x0B: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0B); break; \
	case 0x0C: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0C); break; \
	case 0x0D: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0D); break; \
	case 0x0E: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0E); break; \
	case 0x0F: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0F); break; \
	default:; \
	} \
})
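/*
 * Illustrative sketch, not part of the original header: the macros above are
 * built on _mm_alignr_epi8(), which concatenates two 16-byte registers and
 * extracts 16 bytes starting <offset> bytes into the pair. By loading at
 * (src - offset) and (src - offset + 16) and shifting by <offset>, the 16
 * unaligned source bytes are reassembled from two loads that share the
 * destination's alignment. A hypothetical single-vector version for a fixed
 * offset of 4 (note that, like the macros, it reads 4 bytes before src and
 * 12 bytes past src + 15):
 */
static __rte_always_inline void
rte_mov16_offset4_example(uint8_t *dst, const uint8_t *src)
{
	__m128i lo, hi;

	/* two loads covering src[-4 .. 11] and src[12 .. 27] */
	lo = _mm_loadu_si128((const __m128i *)(const void *)(src - 4));
	hi = _mm_loadu_si128((const __m128i *)(const void *)(src + 12));
	/* bytes 4..19 of the hi:lo pair are exactly src[0 .. 15] */
	_mm_storeu_si128((__m128i *)(void *)dst, _mm_alignr_epi8(hi, lo, 4));
}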
static __rte_always_inline void *
rte_memcpy_generic(void *dst, const void *src, size_t n)
{
	__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
	uintptr_t dstu = (uintptr_t)dst;
	uintptr_t srcu = (uintptr_t)src;
	void *ret = dst;
	size_t dstofss;
	size_t srcofs;

	/**
	 * Copy less than 16 bytes
	 */
	if (n < 16) {
		if (n & 0x01) {
			*(uint8_t *)dstu = *(const uint8_t *)srcu;
			srcu = (uintptr_t)((const uint8_t *)srcu + 1);
			dstu = (uintptr_t)((uint8_t *)dstu + 1);
		}
		if (n & 0x02) {
			*(uint16_t *)dstu = *(const uint16_t *)srcu;
			srcu = (uintptr_t)((const uint16_t *)srcu + 1);
			dstu = (uintptr_t)((uint16_t *)dstu + 1);
		}
		if (n & 0x04) {
			*(uint32_t *)dstu = *(const uint32_t *)srcu;
			srcu = (uintptr_t)((const uint32_t *)srcu + 1);
			dstu = (uintptr_t)((uint32_t *)dstu + 1);
		}
		if (n & 0x08) {
			*(uint64_t *)dstu = *(const uint64_t *)srcu;
		}
		return ret;
	}

	/**
	 * Fast way when copy size doesn't exceed 512 bytes
	 */
	if (n <= 32) {
		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
		return ret;
	}
	if (n <= 48) {
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
		return ret;
	}
	if (n <= 64) {
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
		return ret;
	}
	if (n <= 128) {
		goto COPY_BLOCK_128_BACK15;
	}
	if (n <= 512) {
		if (n >= 256) {
			n -= 256;
			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
			rte_mov128((uint8_t *)dst + 128, (const uint8_t *)src + 128);
			src = (const uint8_t *)src + 256;
			dst = (uint8_t *)dst + 256;
		}
COPY_BLOCK_255_BACK15:
		if (n >= 128) {
			n -= 128;
			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
			src = (const uint8_t *)src + 128;
			dst = (uint8_t *)dst + 128;
		}
COPY_BLOCK_128_BACK15:
		if (n >= 64) {
			n -= 64;
			rte_mov64((uint8_t *)dst, (const uint8_t *)src);
			src = (const uint8_t *)src + 64;
			dst = (uint8_t *)dst + 64;
		}
COPY_BLOCK_64_BACK15:
		if (n >= 32) {
			n -= 32;
			rte_mov32((uint8_t *)dst, (const uint8_t *)src);
			src = (const uint8_t *)src + 32;
			dst = (uint8_t *)dst + 32;
		}
		if (n > 16) {
			rte_mov16((uint8_t *)dst, (const uint8_t *)src);
			rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
			return ret;
		}
		if (n > 0) {
			rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
		}
		return ret;
	}

	/**
	 * Make store aligned when copy size exceeds 512 bytes,
	 * and make sure the first 15 bytes are copied, because
	 * unaligned copy functions require up to 15 bytes
	 * backwards access.
	 */
	dstofss = (uintptr_t)dst & 0x0F;
	if (dstofss > 0) {
		dstofss = 16 - dstofss + 16;
		n -= dstofss;
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		src = (const uint8_t *)src + dstofss;
		dst = (uint8_t *)dst + dstofss;
	}
	srcofs = ((uintptr_t)src & 0x0F);

	/**
	 * For aligned copy
	 */
	if (srcofs == 0) {
		/**
		 * Copy 256-byte blocks
		 */
		for (; n >= 256; n -= 256) {
			rte_mov256((uint8_t *)dst, (const uint8_t *)src);
			dst = (uint8_t *)dst + 256;
			src = (const uint8_t *)src + 256;
		}

		/**
		 * Copy whatever is left
		 */
		goto COPY_BLOCK_255_BACK15;
	}

	/**
	 * For copy with unaligned load
	 */
	MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);

	/**
	 * Copy whatever is left
	 */
	goto COPY_BLOCK_64_BACK15;
}

#endif /* __AVX512F__ */
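/*
 * Illustrative usage sketch, not part of the original header: rte_memcpy()
 * defined below checks both pointers against ALIGNMENT_MASK (0x0F, 0x1F or
 * 0x3F, depending on which implementation was selected above) and takes the
 * aligned fast path only when neither pointer has any of those low bits set.
 * Cache-line aligned buffers therefore always hit rte_memcpy_aligned(); the
 * buffer and function names here are hypothetical.
 */
static __rte_always_inline void
rte_memcpy_aligned_usage_example(void)
{
	static uint8_t src_buf[256] __rte_aligned(64);
	static uint8_t dst_buf[256] __rte_aligned(64);

	/* 64-byte alignment satisfies ALIGNMENT_MASK in every build variant */
	rte_memcpy(dst_buf, src_buf, sizeof(src_buf));
}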
static __rte_always_inline void *
rte_memcpy_aligned(void *dst, const void *src, size_t n)
{
	void *ret = dst;

	/* Copy size <= 16 bytes */
	if (n < 16) {
		if (n & 0x01) {
			*(uint8_t *)dst = *(const uint8_t *)src;
			src = (const uint8_t *)src + 1;
			dst = (uint8_t *)dst + 1;
		}
		if (n & 0x02) {
			*(uint16_t *)dst = *(const uint16_t *)src;
			src = (const uint16_t *)src + 1;
			dst = (uint16_t *)dst + 1;
		}
		if (n & 0x04) {
			*(uint32_t *)dst = *(const uint32_t *)src;
			src = (const uint32_t *)src + 1;
			dst = (uint32_t *)dst + 1;
		}
		if (n & 0x08)
			*(uint64_t *)dst = *(const uint64_t *)src;

		return ret;
	}

	/* Copy 16 <= size <= 32 bytes */
	if (n <= 32) {
		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
		rte_mov16((uint8_t *)dst - 16 + n,
				(const uint8_t *)src - 16 + n);

		return ret;
	}

	/* Copy 32 < size <= 64 bytes */
	if (n <= 64) {
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		rte_mov32((uint8_t *)dst - 32 + n,
				(const uint8_t *)src - 32 + n);

		return ret;
	}

	/* Copy 64-byte blocks */
	for (; n >= 64; n -= 64) {
		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
		dst = (uint8_t *)dst + 64;
		src = (const uint8_t *)src + 64;
	}

	/* Copy whatever is left */
	rte_mov64((uint8_t *)dst - 64 + n,
			(const uint8_t *)src - 64 + n);

	return ret;
}

static __rte_always_inline void *
rte_memcpy(void *dst, const void *src, size_t n)
{
	if (!(((uintptr_t)dst | (uintptr_t)src) & ALIGNMENT_MASK))
		return rte_memcpy_aligned(dst, src, n);
	else
		return rte_memcpy_generic(dst, src, n);
}

#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION >= 100000)
#pragma GCC diagnostic pop
#endif

#ifdef __cplusplus
}
#endif

#endif /* _RTE_MEMCPY_X86_64_H_ */