/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2014 Intel Corporation
 */

#ifndef _RTE_MEMCPY_X86_64_H_
#define _RTE_MEMCPY_X86_64_H_

/**
 * @file
 *
 * Functions for SSE/AVX/AVX2/AVX512 implementation of memcpy().
 */

#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <rte_vect.h>
#include <rte_common.h>
#include <rte_config.h>

#ifdef __cplusplus
extern "C" {
#endif

#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION >= 100000)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wstringop-overflow"
#endif

/**
 * Copy bytes from one location to another. The locations must not overlap.
 *
 * @note This is implemented as a macro, so its address should not be taken
 * and care is needed as parameter expressions may be evaluated multiple times.
 *
 * @param dst
 *   Pointer to the destination of the data.
 * @param src
 *   Pointer to the source data.
 * @param n
 *   Number of bytes to copy.
 * @return
 *   Pointer to the destination data.
 */
static __rte_always_inline void *
rte_memcpy(void *dst, const void *src, size_t n);
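/*
 * Usage sketch (illustrative only; the buffer names are hypothetical):
 * rte_memcpy() follows the standard memcpy() contract, taking two
 * non-overlapping buffers and a byte count, and returning the destination.
 *
 *	uint8_t pkt_copy[256];
 *	const uint8_t *pkt_data = ...;          // some non-overlapping source
 *	rte_memcpy(pkt_copy, pkt_data, 128);    // copies 128 bytes, returns pkt_copy
 */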
#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512

#define ALIGNMENT_MASK 0x3F

/**
 * AVX512 implementation below
 */

/**
 * Copy 16 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov16(uint8_t *dst, const uint8_t *src)
{
	__m128i xmm0;

	xmm0 = _mm_loadu_si128((const __m128i *)src);
	_mm_storeu_si128((__m128i *)dst, xmm0);
}

/**
 * Copy 32 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov32(uint8_t *dst, const uint8_t *src)
{
	__m256i ymm0;

	ymm0 = _mm256_loadu_si256((const __m256i *)src);
	_mm256_storeu_si256((__m256i *)dst, ymm0);
}

/**
 * Copy 64 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov64(uint8_t *dst, const uint8_t *src)
{
	__m512i zmm0;

	zmm0 = _mm512_loadu_si512((const void *)src);
	_mm512_storeu_si512((void *)dst, zmm0);
}

/**
 * Copy 128 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov128(uint8_t *dst, const uint8_t *src)
{
	rte_mov64(dst + 0 * 64, src + 0 * 64);
	rte_mov64(dst + 1 * 64, src + 1 * 64);
}

/**
 * Copy 256 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov256(uint8_t *dst, const uint8_t *src)
{
	rte_mov64(dst + 0 * 64, src + 0 * 64);
	rte_mov64(dst + 1 * 64, src + 1 * 64);
	rte_mov64(dst + 2 * 64, src + 2 * 64);
	rte_mov64(dst + 3 * 64, src + 3 * 64);
}

/**
 * Copy 128-byte blocks from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
{
	__m512i zmm0, zmm1;

	while (n >= 128) {
		zmm0 = _mm512_loadu_si512((const void *)(src + 0 * 64));
		n -= 128;
		zmm1 = _mm512_loadu_si512((const void *)(src + 1 * 64));
		src = src + 128;
		_mm512_storeu_si512((void *)(dst + 0 * 64), zmm0);
		_mm512_storeu_si512((void *)(dst + 1 * 64), zmm1);
		dst = dst + 128;
	}
}

/**
 * Copy 512-byte blocks from one location to another,
 * locations should not overlap.
 */
static inline void
rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
{
	__m512i zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7;

	while (n >= 512) {
		zmm0 = _mm512_loadu_si512((const void *)(src + 0 * 64));
		n -= 512;
		zmm1 = _mm512_loadu_si512((const void *)(src + 1 * 64));
		zmm2 = _mm512_loadu_si512((const void *)(src + 2 * 64));
		zmm3 = _mm512_loadu_si512((const void *)(src + 3 * 64));
		zmm4 = _mm512_loadu_si512((const void *)(src + 4 * 64));
		zmm5 = _mm512_loadu_si512((const void *)(src + 5 * 64));
		zmm6 = _mm512_loadu_si512((const void *)(src + 6 * 64));
		zmm7 = _mm512_loadu_si512((const void *)(src + 7 * 64));
		src = src + 512;
		_mm512_storeu_si512((void *)(dst + 0 * 64), zmm0);
		_mm512_storeu_si512((void *)(dst + 1 * 64), zmm1);
		_mm512_storeu_si512((void *)(dst + 2 * 64), zmm2);
		_mm512_storeu_si512((void *)(dst + 3 * 64), zmm3);
		_mm512_storeu_si512((void *)(dst + 4 * 64), zmm4);
		_mm512_storeu_si512((void *)(dst + 5 * 64), zmm5);
		_mm512_storeu_si512((void *)(dst + 6 * 64), zmm6);
		_mm512_storeu_si512((void *)(dst + 7 * 64), zmm7);
		dst = dst + 512;
	}
}

static __rte_always_inline void *
rte_memcpy_generic(void *dst, const void *src, size_t n)
{
	uintptr_t dstu = (uintptr_t)dst;
	uintptr_t srcu = (uintptr_t)src;
	void *ret = dst;
	size_t dstofss;
	size_t bits;

	/**
	 * Copy less than 16 bytes
	 */
	if (n < 16) {
		if (n & 0x01) {
			*(uint8_t *)dstu = *(const uint8_t *)srcu;
			srcu = (uintptr_t)((const uint8_t *)srcu + 1);
			dstu = (uintptr_t)((uint8_t *)dstu + 1);
		}
		if (n & 0x02) {
			*(uint16_t *)dstu = *(const uint16_t *)srcu;
			srcu = (uintptr_t)((const uint16_t *)srcu + 1);
			dstu = (uintptr_t)((uint16_t *)dstu + 1);
		}
		if (n & 0x04) {
			*(uint32_t *)dstu = *(const uint32_t *)srcu;
			srcu = (uintptr_t)((const uint32_t *)srcu + 1);
			dstu = (uintptr_t)((uint32_t *)dstu + 1);
		}
		if (n & 0x08)
			*(uint64_t *)dstu = *(const uint64_t *)srcu;
		return ret;
	}

	/**
	 * Fast way when copy size doesn't exceed 512 bytes
	 */
	if (n <= 32) {
		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
		rte_mov16((uint8_t *)dst - 16 + n,
				(const uint8_t *)src - 16 + n);
		return ret;
	}
	if (n <= 64) {
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		rte_mov32((uint8_t *)dst - 32 + n,
				(const uint8_t *)src - 32 + n);
		return ret;
	}
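	/*
	 * The branches above rely on an overlapping head/tail copy rather than
	 * branching on the exact size: for example (illustrative), with n = 24
	 * the 16 < n <= 32 case copies bytes [0, 16) and then bytes [8, 24),
	 * so some bytes are written twice, which is harmless because the
	 * locations do not overlap.
	 */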
	if (n <= 512) {
		if (n >= 256) {
			n -= 256;
			rte_mov256((uint8_t *)dst, (const uint8_t *)src);
			src = (const uint8_t *)src + 256;
			dst = (uint8_t *)dst + 256;
		}
		if (n >= 128) {
			n -= 128;
			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
			src = (const uint8_t *)src + 128;
			dst = (uint8_t *)dst + 128;
		}
COPY_BLOCK_128_BACK63:
		if (n > 64) {
			rte_mov64((uint8_t *)dst, (const uint8_t *)src);
			rte_mov64((uint8_t *)dst - 64 + n,
					(const uint8_t *)src - 64 + n);
			return ret;
		}
		if (n > 0)
			rte_mov64((uint8_t *)dst - 64 + n,
					(const uint8_t *)src - 64 + n);
		return ret;
	}

	/**
	 * Make store aligned when copy size exceeds 512 bytes
	 */
	dstofss = ((uintptr_t)dst & 0x3F);
	if (dstofss > 0) {
		dstofss = 64 - dstofss;
		n -= dstofss;
		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
		src = (const uint8_t *)src + dstofss;
		dst = (uint8_t *)dst + dstofss;
	}

	/**
	 * Copy 512-byte blocks.
	 * Use copy block function for better instruction order control,
	 * which is important when load is unaligned.
	 */
	rte_mov512blocks((uint8_t *)dst, (const uint8_t *)src, n);
	bits = n;
	n = n & 511;
	bits -= n;
	src = (const uint8_t *)src + bits;
	dst = (uint8_t *)dst + bits;

	/**
	 * Copy 128-byte blocks.
	 * Use copy block function for better instruction order control,
	 * which is important when load is unaligned.
	 */
	if (n >= 128) {
		rte_mov128blocks((uint8_t *)dst, (const uint8_t *)src, n);
		bits = n;
		n = n & 127;
		bits -= n;
		src = (const uint8_t *)src + bits;
		dst = (uint8_t *)dst + bits;
	}

	/**
	 * Copy whatever is left
	 */
	goto COPY_BLOCK_128_BACK63;
}

#elif defined __AVX2__

#define ALIGNMENT_MASK 0x1F

/**
 * AVX2 implementation below
 */

/**
 * Copy 16 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov16(uint8_t *dst, const uint8_t *src)
{
	__m128i xmm0;

	xmm0 = _mm_loadu_si128((const __m128i *)src);
	_mm_storeu_si128((__m128i *)dst, xmm0);
}

/**
 * Copy 32 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov32(uint8_t *dst, const uint8_t *src)
{
	__m256i ymm0;

	ymm0 = _mm256_loadu_si256((const __m256i *)src);
	_mm256_storeu_si256((__m256i *)dst, ymm0);
}

/**
 * Copy 64 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov64(uint8_t *dst, const uint8_t *src)
{
	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
}

/**
 * Copy 128 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov128(uint8_t *dst, const uint8_t *src)
{
	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
	rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32);
	rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32);
}
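/*
 * As in the AVX512 path above, the block-copy loop below issues all loads of
 * an iteration before the stores, giving the compiler and CPU more freedom to
 * schedule the unaligned loads around the stores.
 */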
/**
 * Copy 128-byte blocks from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
{
	__m256i ymm0, ymm1, ymm2, ymm3;

	while (n >= 128) {
		ymm0 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 0 * 32));
		n -= 128;
		ymm1 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 1 * 32));
		ymm2 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 2 * 32));
		ymm3 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 3 * 32));
		src = (const uint8_t *)src + 128;
		_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 0 * 32), ymm0);
		_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 1 * 32), ymm1);
		_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 2 * 32), ymm2);
		_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 3 * 32), ymm3);
		dst = (uint8_t *)dst + 128;
	}
}

static __rte_always_inline void *
rte_memcpy_generic(void *dst, const void *src, size_t n)
{
	uintptr_t dstu = (uintptr_t)dst;
	uintptr_t srcu = (uintptr_t)src;
	void *ret = dst;
	size_t dstofss;
	size_t bits;

	/**
	 * Copy less than 16 bytes
	 */
	if (n < 16) {
		if (n & 0x01) {
			*(uint8_t *)dstu = *(const uint8_t *)srcu;
			srcu = (uintptr_t)((const uint8_t *)srcu + 1);
			dstu = (uintptr_t)((uint8_t *)dstu + 1);
		}
		if (n & 0x02) {
			*(uint16_t *)dstu = *(const uint16_t *)srcu;
			srcu = (uintptr_t)((const uint16_t *)srcu + 1);
			dstu = (uintptr_t)((uint16_t *)dstu + 1);
		}
		if (n & 0x04) {
			*(uint32_t *)dstu = *(const uint32_t *)srcu;
			srcu = (uintptr_t)((const uint32_t *)srcu + 1);
			dstu = (uintptr_t)((uint32_t *)dstu + 1);
		}
		if (n & 0x08) {
			*(uint64_t *)dstu = *(const uint64_t *)srcu;
		}
		return ret;
	}

	/**
	 * Fast way when copy size doesn't exceed 256 bytes
	 */
	if (n <= 32) {
		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
		rte_mov16((uint8_t *)dst - 16 + n,
				(const uint8_t *)src - 16 + n);
		return ret;
	}
	if (n <= 48) {
		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
		rte_mov16((uint8_t *)dst + 16, (const uint8_t *)src + 16);
		rte_mov16((uint8_t *)dst - 16 + n,
				(const uint8_t *)src - 16 + n);
		return ret;
	}
	if (n <= 64) {
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		rte_mov32((uint8_t *)dst - 32 + n,
				(const uint8_t *)src - 32 + n);
		return ret;
	}
	if (n <= 256) {
		if (n >= 128) {
			n -= 128;
			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
			src = (const uint8_t *)src + 128;
			dst = (uint8_t *)dst + 128;
		}
COPY_BLOCK_128_BACK31:
		if (n >= 64) {
			n -= 64;
			rte_mov64((uint8_t *)dst, (const uint8_t *)src);
			src = (const uint8_t *)src + 64;
			dst = (uint8_t *)dst + 64;
		}
		if (n > 32) {
			rte_mov32((uint8_t *)dst, (const uint8_t *)src);
			rte_mov32((uint8_t *)dst - 32 + n,
					(const uint8_t *)src - 32 + n);
			return ret;
		}
		if (n > 0) {
			rte_mov32((uint8_t *)dst - 32 + n,
					(const uint8_t *)src - 32 + n);
		}
		return ret;
	}

	/**
	 * Make store aligned when copy size exceeds 256 bytes
	 */
	dstofss = (uintptr_t)dst & 0x1F;
	if (dstofss > 0) {
		dstofss = 32 - dstofss;
		n -= dstofss;
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		src = (const uint8_t *)src + dstofss;
		dst = (uint8_t *)dst + dstofss;
	}
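	/*
	 * rte_mov128blocks() consumes as many whole 128-byte blocks as fit in n;
	 * the arithmetic below keeps only the remainder in n and advances the
	 * pointers by the amount actually copied. For example (illustrative),
	 * n = 300 copies 256 bytes in blocks, leaves n = 44, and advances both
	 * pointers by bits = 256.
	 */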
	/**
	 * Copy 128-byte blocks
	 */
	rte_mov128blocks((uint8_t *)dst, (const uint8_t *)src, n);
	bits = n;
	n = n & 127;
	bits -= n;
	src = (const uint8_t *)src + bits;
	dst = (uint8_t *)dst + bits;

	/**
	 * Copy whatever is left
	 */
	goto COPY_BLOCK_128_BACK31;
}

#else /* __AVX512F__ */

#define ALIGNMENT_MASK 0x0F

/**
 * SSE & AVX implementation below
 */

/**
 * Copy 16 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov16(uint8_t *dst, const uint8_t *src)
{
	__m128i xmm0;

	xmm0 = _mm_loadu_si128((const __m128i *)src);
	_mm_storeu_si128((__m128i *)dst, xmm0);
}

/**
 * Copy 32 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov32(uint8_t *dst, const uint8_t *src)
{
	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
}

/**
 * Copy 64 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov64(uint8_t *dst, const uint8_t *src)
{
	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
}

/**
 * Copy 128 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov128(uint8_t *dst, const uint8_t *src)
{
	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
	rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
	rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
	rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
	rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
}
/**
 * Copy 256 bytes from one location to another,
 * locations should not overlap.
 */
static inline void
rte_mov256(uint8_t *dst, const uint8_t *src)
{
	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
	rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
	rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
	rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
	rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
	rte_mov16((uint8_t *)dst + 8 * 16, (const uint8_t *)src + 8 * 16);
	rte_mov16((uint8_t *)dst + 9 * 16, (const uint8_t *)src + 9 * 16);
	rte_mov16((uint8_t *)dst + 10 * 16, (const uint8_t *)src + 10 * 16);
	rte_mov16((uint8_t *)dst + 11 * 16, (const uint8_t *)src + 11 * 16);
	rte_mov16((uint8_t *)dst + 12 * 16, (const uint8_t *)src + 12 * 16);
	rte_mov16((uint8_t *)dst + 13 * 16, (const uint8_t *)src + 13 * 16);
	rte_mov16((uint8_t *)dst + 14 * 16, (const uint8_t *)src + 14 * 16);
	rte_mov16((uint8_t *)dst + 15 * 16, (const uint8_t *)src + 15 * 16);
}
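/*
 * How the unaligned-load macros below work (worked example is illustrative):
 * the source pointer is rounded down to a 16-byte boundary, 16-byte chunks
 * are loaded from that aligned base, and _mm_alignr_epi8() shifts each pair
 * of adjacent chunks right by <offset> bytes to reassemble the bytes that
 * start at the unaligned source address. With offset = 5, for instance,
 * _mm_alignr_epi8(xmm1, xmm0, 5) returns bytes 5..20 of the 32-byte pair
 * (xmm1 high, xmm0 low), i.e. 16 contiguous source bytes, which are then
 * stored to the aligned destination.
 */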
/**
 * Macro for copying unaligned block from one location to another with constant load offset,
 * 47 bytes leftover maximum,
 * locations should not overlap.
 * Requirements:
 * - Store is aligned
 * - Load offset is <offset>, which must be an immediate value within [1, 15]
 * - For <src>, make sure <offset> bytes backwards & <16 - offset> bytes forwards are available for loading
 * - <dst>, <src>, <len> must be variables
 * - __m128i <xmm0> ~ <xmm8> must be pre-defined
 */
#define MOVEUNALIGNED_LEFT47_IMM(dst, src, len, offset) \
__extension__ ({ \
	size_t tmp; \
	while (len >= 128 + 16 - offset) { \
		xmm0 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 0 * 16)); \
		len -= 128; \
		xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 1 * 16)); \
		xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 2 * 16)); \
		xmm3 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 3 * 16)); \
		xmm4 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 4 * 16)); \
		xmm5 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 5 * 16)); \
		xmm6 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 6 * 16)); \
		xmm7 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 7 * 16)); \
		xmm8 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 8 * 16)); \
		src = (const uint8_t *)src + 128; \
		_mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset)); \
		_mm_storeu_si128((__m128i *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset)); \
		_mm_storeu_si128((__m128i *)((uint8_t *)dst + 2 * 16), _mm_alignr_epi8(xmm3, xmm2, offset)); \
		_mm_storeu_si128((__m128i *)((uint8_t *)dst + 3 * 16), _mm_alignr_epi8(xmm4, xmm3, offset)); \
		_mm_storeu_si128((__m128i *)((uint8_t *)dst + 4 * 16), _mm_alignr_epi8(xmm5, xmm4, offset)); \
		_mm_storeu_si128((__m128i *)((uint8_t *)dst + 5 * 16), _mm_alignr_epi8(xmm6, xmm5, offset)); \
		_mm_storeu_si128((__m128i *)((uint8_t *)dst + 6 * 16), _mm_alignr_epi8(xmm7, xmm6, offset)); \
		_mm_storeu_si128((__m128i *)((uint8_t *)dst + 7 * 16), _mm_alignr_epi8(xmm8, xmm7, offset)); \
		dst = (uint8_t *)dst + 128; \
	} \
	tmp = len; \
	len = ((len - 16 + offset) & 127) + 16 - offset; \
	tmp -= len; \
	src = (const uint8_t *)src + tmp; \
	dst = (uint8_t *)dst + tmp; \
	if (len >= 32 + 16 - offset) { \
		while (len >= 32 + 16 - offset) { \
			xmm0 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 0 * 16)); \
			len -= 32; \
			xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 1 * 16)); \
			xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 2 * 16)); \
			src = (const uint8_t *)src + 32; \
			_mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset)); \
			_mm_storeu_si128((__m128i *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset)); \
			dst = (uint8_t *)dst + 32; \
		} \
		tmp = len; \
		len = ((len - 16 + offset) & 31) + 16 - offset; \
		tmp -= len; \
		src = (const uint8_t *)src + tmp; \
		dst = (uint8_t *)dst + tmp; \
	} \
})

/**
 * Macro for copying unaligned block from one location to another,
 * 47 bytes leftover maximum,
 * locations should not overlap.
 * Use switch here because the aligning instruction requires an immediate value for the shift count.
 * Requirements:
 * - Store is aligned
 * - Load offset is <offset>, which must be within [1, 15]
 * - For <src>, make sure <offset> bytes backwards & <16 - offset> bytes forwards are available for loading
 * - <dst>, <src>, <len> must be variables
 * - __m128i <xmm0> ~ <xmm8> used in MOVEUNALIGNED_LEFT47_IMM must be pre-defined
 */
#define MOVEUNALIGNED_LEFT47(dst, src, len, offset) \
__extension__ ({ \
	switch (offset) { \
	case 0x01: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x01); break; \
	case 0x02: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x02); break; \
	case 0x03: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x03); break; \
	case 0x04: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x04); break; \
	case 0x05: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x05); break; \
	case 0x06: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x06); break; \
	case 0x07: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x07); break; \
	case 0x08: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x08); break; \
	case 0x09: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x09); break; \
	case 0x0A: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0A); break; \
	case 0x0B: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0B); break; \
	case 0x0C: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0C); break; \
	case 0x0D: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0D); break; \
	case 0x0E: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0E); break; \
	case 0x0F: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0F); break; \
	default:; \
	} \
})
static __rte_always_inline void *
rte_memcpy_generic(void *dst, const void *src, size_t n)
{
	__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
	uintptr_t dstu = (uintptr_t)dst;
	uintptr_t srcu = (uintptr_t)src;
	void *ret = dst;
	size_t dstofss;
	size_t srcofs;

	/**
	 * Copy less than 16 bytes
	 */
	if (n < 16) {
		if (n & 0x01) {
			*(uint8_t *)dstu = *(const uint8_t *)srcu;
			srcu = (uintptr_t)((const uint8_t *)srcu + 1);
			dstu = (uintptr_t)((uint8_t *)dstu + 1);
		}
		if (n & 0x02) {
			*(uint16_t *)dstu = *(const uint16_t *)srcu;
			srcu = (uintptr_t)((const uint16_t *)srcu + 1);
			dstu = (uintptr_t)((uint16_t *)dstu + 1);
		}
		if (n & 0x04) {
			*(uint32_t *)dstu = *(const uint32_t *)srcu;
			srcu = (uintptr_t)((const uint32_t *)srcu + 1);
			dstu = (uintptr_t)((uint32_t *)dstu + 1);
		}
		if (n & 0x08) {
			*(uint64_t *)dstu = *(const uint64_t *)srcu;
		}
		return ret;
	}

	/**
	 * Fast way when copy size doesn't exceed 512 bytes
	 */
	if (n <= 32) {
		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
		return ret;
	}
	if (n <= 48) {
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
		return ret;
	}
	if (n <= 64) {
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
		return ret;
	}
	if (n <= 128) {
		goto COPY_BLOCK_128_BACK15;
	}
	if (n <= 512) {
		if (n >= 256) {
			n -= 256;
			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
			rte_mov128((uint8_t *)dst + 128, (const uint8_t *)src + 128);
			src = (const uint8_t *)src + 256;
			dst = (uint8_t *)dst + 256;
		}
COPY_BLOCK_255_BACK15:
		if (n >= 128) {
			n -= 128;
			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
			src = (const uint8_t *)src + 128;
			dst = (uint8_t *)dst + 128;
		}
COPY_BLOCK_128_BACK15:
		if (n >= 64) {
			n -= 64;
			rte_mov64((uint8_t *)dst, (const uint8_t *)src);
			src = (const uint8_t *)src + 64;
			dst = (uint8_t *)dst + 64;
		}
COPY_BLOCK_64_BACK15:
		if (n >= 32) {
			n -= 32;
			rte_mov32((uint8_t *)dst, (const uint8_t *)src);
			src = (const uint8_t *)src + 32;
			dst = (uint8_t *)dst + 32;
		}
		if (n > 16) {
			rte_mov16((uint8_t *)dst, (const uint8_t *)src);
			rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
			return ret;
		}
		if (n > 0) {
			rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
		}
		return ret;
	}

	/**
	 * Make store aligned when copy size exceeds 512 bytes,
	 * and make sure the first 15 bytes are copied, because
	 * unaligned copy functions require up to 15 bytes
	 * backwards access.
	 */
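	/*
	 * For example (illustrative), if dst sits 9 bytes past a 16-byte
	 * boundary, dstofss becomes 16 - 9 + 16 = 23: rte_mov32() covers those
	 * 23 bytes (with a harmless overlap), dst becomes 16-byte aligned, and
	 * the 15 bytes immediately before the advanced src now lie inside the
	 * region already copied, satisfying the backwards-access requirement of
	 * the unaligned copy macros.
	 */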
	dstofss = (uintptr_t)dst & 0x0F;
	if (dstofss > 0) {
		dstofss = 16 - dstofss + 16;
		n -= dstofss;
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		src = (const uint8_t *)src + dstofss;
		dst = (uint8_t *)dst + dstofss;
	}
	srcofs = ((uintptr_t)src & 0x0F);

	/**
	 * For aligned copy
	 */
	if (srcofs == 0) {
		/**
		 * Copy 256-byte blocks
		 */
		for (; n >= 256; n -= 256) {
			rte_mov256((uint8_t *)dst, (const uint8_t *)src);
			dst = (uint8_t *)dst + 256;
			src = (const uint8_t *)src + 256;
		}

		/**
		 * Copy whatever is left
		 */
		goto COPY_BLOCK_255_BACK15;
	}

	/**
	 * For copy with unaligned load
	 */
	MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);

	/**
	 * Copy whatever is left
	 */
	goto COPY_BLOCK_64_BACK15;
}

#endif /* __AVX512F__ */

static __rte_always_inline void *
rte_memcpy_aligned(void *dst, const void *src, size_t n)
{
	void *ret = dst;

	/* Copy size <= 16 bytes */
	if (n < 16) {
		if (n & 0x01) {
			*(uint8_t *)dst = *(const uint8_t *)src;
			src = (const uint8_t *)src + 1;
			dst = (uint8_t *)dst + 1;
		}
		if (n & 0x02) {
			*(uint16_t *)dst = *(const uint16_t *)src;
			src = (const uint16_t *)src + 1;
			dst = (uint16_t *)dst + 1;
		}
		if (n & 0x04) {
			*(uint32_t *)dst = *(const uint32_t *)src;
			src = (const uint32_t *)src + 1;
			dst = (uint32_t *)dst + 1;
		}
		if (n & 0x08)
			*(uint64_t *)dst = *(const uint64_t *)src;

		return ret;
	}

	/* Copy 16 <= size <= 32 bytes */
	if (n <= 32) {
		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
		rte_mov16((uint8_t *)dst - 16 + n,
				(const uint8_t *)src - 16 + n);

		return ret;
	}

	/* Copy 32 < size <= 64 bytes */
	if (n <= 64) {
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		rte_mov32((uint8_t *)dst - 32 + n,
				(const uint8_t *)src - 32 + n);

		return ret;
	}

	/* Copy 64-byte blocks */
	for (; n >= 64; n -= 64) {
		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
		dst = (uint8_t *)dst + 64;
		src = (const uint8_t *)src + 64;
	}

	/* Copy whatever is left */
	rte_mov64((uint8_t *)dst - 64 + n,
			(const uint8_t *)src - 64 + n);

	return ret;
}

static __rte_always_inline void *
rte_memcpy(void *dst, const void *src, size_t n)
{
	if (!(((uintptr_t)dst | (uintptr_t)src) & ALIGNMENT_MASK))
		return rte_memcpy_aligned(dst, src, n);
	else
		return rte_memcpy_generic(dst, src, n);
}

#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION >= 100000)
#pragma GCC diagnostic pop
#endif

#ifdef __cplusplus
}
#endif

#endif /* _RTE_MEMCPY_X86_64_H_ */