;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2019 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%ifndef __MEMCPY_ASM__
%define __MEMCPY_ASM__

%include "reg_sizes.asm"


; This file defines a series of macros to copy small to medium amounts
; of data from memory to memory, where the size is variable but limited.
;
; The macros are all called as:
;       memcpy DST, SRC, SIZE, TMP0, TMP1, XTMP0, XTMP1, XTMP2, XTMP3
; with the parameters defined as:
;       DST   : register: pointer to dst (not modified)
;       SRC   : register: pointer to src (not modified)
;       SIZE  : register: length in bytes (not modified)
;       TMP0  : 64-bit temp GPR (clobbered)
;       TMP1  : 64-bit temp GPR (clobbered)
;       XTMP0 : temp XMM (clobbered)
;       XTMP1 : temp XMM (clobbered)
;       XTMP2 : temp XMM (clobbered)
;       XTMP3 : temp XMM (clobbered)
;
; The name indicates the options. The name is of the form:
;       memcpy_<VEC>_<SZ><ZERO><RET>
; where:
;       <VEC>  is either "sse" or "avx" or "avx2"
;       <SZ>   is either "64" or "128" and is an exclusive upper bound on SIZE
;       <ZERO> is blank or "_1". If "_1", then the min SIZE is 1 (otherwise 0)
;       <RET>  is blank or "_ret". If blank, the code falls through. If "_ret",
;              it does a "ret" at the end.
;
; For the avx2 versions, the temp XMM registers need to be YMM registers.
; If the SZ is 64, then only two YMM temps are needed, i.e. it is called as:
;       memcpy_avx2_64  DST, SRC, SIZE, TMP0, TMP1, YTMP0, YTMP1
;       memcpy_avx2_128 DST, SRC, SIZE, TMP0, TMP1, YTMP0, YTMP1, YTMP2, YTMP3
;
; For example:
;       memcpy_sse_64        : SSE,  0 <= size < 64, falls through
;       memcpy_avx_64_1      : AVX1, 1 <= size < 64, falls through
;       memcpy_sse_128_ret   : SSE,  0 <= size < 128, ends with ret
;       memcpy_avx_128_1_ret : AVX1, 1 <= size < 128, ends with ret
;
; The memcpy_<VEC>_16 and memcpy_<VEC>_16_1 variants take only the five
; GPR arguments and handle sizes below 16 without SIMD registers.
;
; A sample invocation is shown after the __memcpy_int definition below.

%macro memcpy_sse_64 9
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 64, 0, 0
%endm

%macro memcpy_sse_64_1 9
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 64, 0, 0
%endm

%macro memcpy_sse_128 9
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 128, 0, 0
%endm

%macro memcpy_sse_128_1 9
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 128, 0, 0
%endm

%macro memcpy_sse_64_ret 9
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 64, 1, 0
%endm

%macro memcpy_sse_64_1_ret 9
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 64, 1, 0
%endm

%macro memcpy_sse_128_ret 9
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 128, 1, 0
%endm

%macro memcpy_sse_128_1_ret 9
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 128, 1, 0
%endm


%macro memcpy_sse_16 5
        __memcpy_int %1,%2,%3,%4,%5,,,,, 0, 16, 0, 0
%endm

%macro memcpy_sse_16_1 5
        __memcpy_int %1,%2,%3,%4,%5,,,,, 1, 16, 0, 0
%endm

        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%macro memcpy_avx_64 9
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 64, 0, 1
%endm

%macro memcpy_avx_64_1 9
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 64, 0, 1
%endm

%macro memcpy_avx_128 9
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 128, 0, 1
%endm

%macro memcpy_avx_128_1 9
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 128, 0, 1
%endm

%macro memcpy_avx_64_ret 9
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 64, 1, 1
%endm

%macro memcpy_avx_64_1_ret 9
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 64, 1, 1
%endm

%macro memcpy_avx_128_ret 9
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 128, 1, 1
%endm

%macro memcpy_avx_128_1_ret 9
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 128, 1, 1
%endm


%macro memcpy_avx_16 5
        __memcpy_int %1,%2,%3,%4,%5,,,,, 0, 16, 0, 1
%endm

%macro memcpy_avx_16_1 5
        __memcpy_int %1,%2,%3,%4,%5,,,,, 1, 16, 0, 1
%endm

        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%macro memcpy_avx2_64 7
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,--,--, 0, 64, 0, 2
%endm

%macro memcpy_avx2_64_1 7
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,--,--, 1, 64, 0, 2
%endm

%macro memcpy_avx2_128 9
        __memcpy_int %1,%2,%3,%4,%5,%6,%7, %8, %9, 0, 128, 0, 2
%endm

%macro memcpy_avx2_128_1 9
        __memcpy_int %1,%2,%3,%4,%5,%6,%7, %8, %9, 1, 128, 0, 2
%endm

%macro memcpy_avx2_64_ret 7
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,--,--, 0, 64, 1, 2
%endm

%macro memcpy_avx2_64_1_ret 7
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,--,--, 1, 64, 1, 2
%endm

%macro memcpy_avx2_128_ret 9
        __memcpy_int %1,%2,%3,%4,%5,%6,%7, %8, %9, 0, 128, 1, 2
%endm

%macro memcpy_avx2_128_1_ret 9
        __memcpy_int %1,%2,%3,%4,%5,%6,%7, %8, %9, 1, 128, 1, 2
%endm



;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;


%macro __memcpy_int 13
%define %%DST     %1    ; register: pointer to dst (not modified)
%define %%SRC     %2    ; register: pointer to src (not modified)
%define %%SIZE    %3    ; register: length in bytes (not modified)
%define %%TMP0    %4    ; 64-bit temp GPR (clobbered)
%define %%TMP1    %5    ; 64-bit temp GPR (clobbered)
%define %%XTMP0   %6    ; temp XMM (clobbered)
%define %%XTMP1   %7    ; temp XMM (clobbered)
%define %%XTMP2   %8    ; temp XMM (clobbered)
%define %%XTMP3   %9    ; temp XMM (clobbered)
%define %%NOT0    %10   ; if not 0, then assume size cannot be zero
%define %%MAXSIZE %11   ; 128, 64, etc
%define %%USERET  %12   ; if not 0, use "ret" at end
%define %%USEAVX  %13   ; 0 = SSE, 1 = AVX1, 2 = AVX2

%if (%%USERET != 0)
 %define %%DONE ret
%else
 %define %%DONE jmp %%end
%endif

%if (%%USEAVX != 0)
 %define %%MOVDQU vmovdqu
%else
 %define %%MOVDQU movdqu
%endif

; Each block below handles lengths in [N, 2*N): when bit N of SIZE is
; set, the first N and the last N bytes are copied with (possibly
; overlapping) unaligned loads and stores, covering the whole buffer
; in one step; otherwise execution falls through to the next block.
%if (%%MAXSIZE >= 128)
        test    %%SIZE, 64
        jz      %%lt64
 %if (%%USEAVX >= 2)
        %%MOVDQU        %%XTMP0, [%%SRC + 0*32]
        %%MOVDQU        %%XTMP1, [%%SRC + 1*32]
        %%MOVDQU        %%XTMP2, [%%SRC + %%SIZE - 2*32]
        %%MOVDQU        %%XTMP3, [%%SRC + %%SIZE - 1*32]

        %%MOVDQU        [%%DST + 0*32], %%XTMP0
        %%MOVDQU        [%%DST + 1*32], %%XTMP1
        %%MOVDQU        [%%DST + %%SIZE - 2*32], %%XTMP2
        %%MOVDQU        [%%DST + %%SIZE - 1*32], %%XTMP3
 %else
        %%MOVDQU        %%XTMP0, [%%SRC + 0*16]
        %%MOVDQU        %%XTMP1, [%%SRC + 1*16]
        %%MOVDQU        %%XTMP2, [%%SRC + 2*16]
        %%MOVDQU        %%XTMP3, [%%SRC + 3*16]
        %%MOVDQU        [%%DST + 0*16], %%XTMP0
        %%MOVDQU        [%%DST + 1*16], %%XTMP1
        %%MOVDQU        [%%DST + 2*16], %%XTMP2
        %%MOVDQU        [%%DST + 3*16], %%XTMP3

        %%MOVDQU        %%XTMP0, [%%SRC + %%SIZE - 4*16]
        %%MOVDQU        %%XTMP1, [%%SRC + %%SIZE - 3*16]
        %%MOVDQU        %%XTMP2, [%%SRC + %%SIZE - 2*16]
        %%MOVDQU        %%XTMP3, [%%SRC + %%SIZE - 1*16]
        %%MOVDQU        [%%DST + %%SIZE - 4*16], %%XTMP0
        %%MOVDQU        [%%DST + %%SIZE - 3*16], %%XTMP1
        %%MOVDQU        [%%DST + %%SIZE - 2*16], %%XTMP2
        %%MOVDQU        [%%DST + %%SIZE - 1*16], %%XTMP3
 %endif
        %%DONE
%endif

%if (%%MAXSIZE >= 64)
%%lt64:
        test    %%SIZE, 32
        jz      %%lt32
 %if (%%USEAVX >= 2)
        %%MOVDQU        %%XTMP0, [%%SRC + 0*32]
        %%MOVDQU        %%XTMP1, [%%SRC + %%SIZE - 1*32]
        %%MOVDQU        [%%DST + 0*32], %%XTMP0
        %%MOVDQU        [%%DST + %%SIZE - 1*32], %%XTMP1
 %else
        %%MOVDQU        %%XTMP0, [%%SRC + 0*16]
        %%MOVDQU        %%XTMP1, [%%SRC + 1*16]
        %%MOVDQU        %%XTMP2, [%%SRC + %%SIZE - 2*16]
        %%MOVDQU        %%XTMP3, [%%SRC + %%SIZE - 1*16]
        %%MOVDQU        [%%DST + 0*16], %%XTMP0
        %%MOVDQU        [%%DST + 1*16], %%XTMP1
        %%MOVDQU        [%%DST + %%SIZE - 2*16], %%XTMP2
        %%MOVDQU        [%%DST + %%SIZE - 1*16], %%XTMP3
 %endif
        %%DONE
%endif

%if (%%MAXSIZE >= 32)
%%lt32:
        test    %%SIZE, 16
        jz      %%lt16
 %if (%%USEAVX >= 2)
        %%MOVDQU        XWORD(%%XTMP0), [%%SRC + 0*16]
        %%MOVDQU        XWORD(%%XTMP1), [%%SRC + %%SIZE - 1*16]
        %%MOVDQU        [%%DST + 0*16], XWORD(%%XTMP0)
        %%MOVDQU        [%%DST + %%SIZE - 1*16], XWORD(%%XTMP1)
 %else
        %%MOVDQU        %%XTMP0, [%%SRC + 0*16]
        %%MOVDQU        %%XTMP1, [%%SRC + %%SIZE - 1*16]
        %%MOVDQU        [%%DST + 0*16], %%XTMP0
        %%MOVDQU        [%%DST + %%SIZE - 1*16], %%XTMP1
 %endif
        %%DONE
%endif

%if (%%MAXSIZE >= 16)
%%lt16:
        test    %%SIZE, 8
        jz      %%lt8
        mov     %%TMP0, [%%SRC]
        mov     %%TMP1, [%%SRC + %%SIZE - 8]
        mov     [%%DST], %%TMP0
        mov     [%%DST + %%SIZE - 8], %%TMP1
        %%DONE
%endif

%if (%%MAXSIZE >= 8)
%%lt8:
        test    %%SIZE, 4
        jz      %%lt4
        mov     DWORD(%%TMP0), [%%SRC]
        mov     DWORD(%%TMP1), [%%SRC + %%SIZE - 4]
        mov     [%%DST], DWORD(%%TMP0)
        mov     [%%DST + %%SIZE - 4], DWORD(%%TMP1)
        %%DONE
%endif

%if (%%MAXSIZE >= 4)
%%lt4:
        test    %%SIZE, 2
        jz      %%lt2
        movzx   DWORD(%%TMP0), word [%%SRC]
        movzx   DWORD(%%TMP1), byte [%%SRC + %%SIZE - 1]
        mov     [%%DST], WORD(%%TMP0)
        mov     [%%DST + %%SIZE - 1], BYTE(%%TMP1)
        %%DONE
%endif

%%lt2:
%if (%%NOT0 == 0)
        test    %%SIZE, 1
        jz      %%end
%endif
        movzx   DWORD(%%TMP0), byte [%%SRC]
        mov     [%%DST], BYTE(%%TMP0)
%%end:
%if (%%USERET != 0)
        ret
%endif
%endm
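
; --------------------------------------------------------------------
; Sample invocation (an illustrative sketch only; the function name and
; register assignments below are hypothetical, assuming the SysV AMD64
; convention with dst/src/size arriving in rdi/rsi/rdx):
;
;       section .text
;       global  copy_under_128
;       copy_under_128:         ; copy n bytes, 1 <= n < 128
;               memcpy_avx_128_1_ret rdi, rsi, rdx, rax, r10, xmm0, xmm1, xmm2, xmm3
;
; The "_ret" variant emits the trailing "ret" itself; the fall-through
; variants are intended to be expanded inline inside a larger routine.
; --------------------------------------------------------------------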

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Utility macro to assist with SIMD shifting
%macro _PSRLDQ 3
%define %%VEC %1
%define %%REG %2
%define %%IMM %3

%ifidn %%VEC, SSE
        psrldq  %%REG, %%IMM
%else
        vpsrldq %%REG, %%REG, %%IMM
%endif
%endm

        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; This section defines a series of macros to store small to medium amounts
; of data from SIMD registers to memory, where the size is variable but limited.
;
; The macros are all called as:
;       simd_store DST, SRC, SIZE, TMP, IDX
; with the parameters defined as:
;       DST  : register: pointer to dst (not modified)
;       SRC  : register: src data (clobbered)
;       SIZE : register: length in bytes (not modified)
;       TMP  : 64-bit temp GPR (clobbered)
;       IDX  : 64-bit GPR to store dst index/offset (clobbered)
;
; The name indicates the options. The name is of the form:
;       simd_store_<VEC>
; where <VEC> is the SIMD instruction type e.g. "sse" or "avx"
;
; A sample invocation is shown after the __simd_store definition below.


%macro simd_store_sse 5
        __simd_store %1,%2,%3,%4,%5,SSE
%endm

%macro simd_store_avx 5
        __simd_store %1,%2,%3,%4,%5,AVX
%endm

%macro simd_store_sse_15 5
        __simd_store %1,%2,%3,%4,%5,SSE,15
%endm

%macro simd_store_avx_15 5
        __simd_store %1,%2,%3,%4,%5,AVX,15
%endm

%macro __simd_store 6-7
%define %%DST      %1 ; register: pointer to dst (not modified)
%define %%SRC      %2 ; register: src data (clobbered)
%define %%SIZE     %3 ; register: length in bytes (not modified)
%define %%TMP      %4 ; 64-bit temp GPR (clobbered)
%define %%IDX      %5 ; 64-bit temp GPR to store dst idx (clobbered)
%define %%SIMDTYPE %6 ; "SSE" or "AVX"
%define %%MAX_LEN  %7 ; [optional] maximum length to be stored, default 16

%define %%PSRLDQ _PSRLDQ %%SIMDTYPE,

%ifidn %%SIMDTYPE, SSE
 %define %%MOVDQU movdqu
 %define %%MOVQ movq
%else
 %define %%MOVDQU vmovdqu
 %define %%MOVQ vmovq
%endif

;; determine max byte size for store operation
%if %0 > 6
%assign max_length_to_store %%MAX_LEN
%else
%assign max_length_to_store 16
%endif

%if max_length_to_store > 16
%error "__simd_store macro invoked with MAX_LEN bigger than 16!"
%endif

        xor     %%IDX, %%IDX    ; zero idx

%if max_length_to_store == 16
        test    %%SIZE, 16
        jz      %%lt16
        %%MOVDQU [%%DST], %%SRC
        jmp     %%end
%%lt16:
%endif

%if max_length_to_store >= 8
        test    %%SIZE, 8
        jz      %%lt8
        %%MOVQ  [%%DST + %%IDX], %%SRC
        %%PSRLDQ %%SRC, 8
        add     %%IDX, 8
%%lt8:
%endif

        %%MOVQ  %%TMP, %%SRC    ; use GPR from now on

%if max_length_to_store >= 4
        test    %%SIZE, 4
        jz      %%lt4
        mov     [%%DST + %%IDX], DWORD(%%TMP)
        shr     %%TMP, 32
        add     %%IDX, 4
%%lt4:
%endif

        test    %%SIZE, 2
        jz      %%lt2
        mov     [%%DST + %%IDX], WORD(%%TMP)
        shr     %%TMP, 16
        add     %%IDX, 2
%%lt2:
        test    %%SIZE, 1
        jz      %%end
        mov     [%%DST + %%IDX], BYTE(%%TMP)
%%end:
%endm
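
; --------------------------------------------------------------------
; Sample invocation (an illustrative sketch only; register choices are
; hypothetical): store the low rdx bytes (0 <= rdx <= 16) of xmm0 to
; the buffer at rdi, e.g. when flushing a final partial block:
;
;       simd_store_sse rdi, xmm0, rdx, rax, r10
;
; xmm0 is shifted right in place as bytes are written, so the source
; register is clobbered; rax and r10 serve as the TMP and IDX GPRs.
; --------------------------------------------------------------------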

; This section defines a series of macros to load small to medium amounts
; (from 0 to 16 bytes) of data from memory to SIMD registers,
; where the size is variable but limited.
;
; The macros are all called as:
;       simd_load DST, SRC, SIZE
; with the parameters defined as:
;       DST  : register: destination XMM register
;       SRC  : register: pointer to src data (not modified)
;       SIZE : register: length in bytes (not modified)
;
; The name indicates the options. The name is of the form:
;       simd_load_<VEC>_<SZ><ZERO>
; where:
;       <VEC>  is either "sse" or "avx"
;       <SZ>   is either "15" or "16" and defines largest value of SIZE
;       <ZERO> is blank or "_1". If "_1", then the min SIZE is 1 (otherwise 0)
;
; For example:
;       simd_load_sse_16   : SSE, 0 <= size <= 16
;       simd_load_avx_15_1 : AVX, 1 <= size <= 15
;
; A sample invocation is shown after the __simd_load definition below.

%macro simd_load_sse_15_1 3
        __simd_load %1,%2,%3,0,0,SSE
%endm
%macro simd_load_sse_15 3
        __simd_load %1,%2,%3,1,0,SSE
%endm
%macro simd_load_sse_16_1 3
        __simd_load %1,%2,%3,0,1,SSE
%endm
%macro simd_load_sse_16 3
        __simd_load %1,%2,%3,1,1,SSE
%endm

%macro simd_load_avx_15_1 3
        __simd_load %1,%2,%3,0,0,AVX
%endm
%macro simd_load_avx_15 3
        __simd_load %1,%2,%3,1,0,AVX
%endm
%macro simd_load_avx_16_1 3
        __simd_load %1,%2,%3,0,1,AVX
%endm
%macro simd_load_avx_16 3
        __simd_load %1,%2,%3,1,1,AVX
%endm

%macro __simd_load 6
%define %%DST       %1 ; [out] destination XMM register
%define %%SRC       %2 ; [in] pointer to src data
%define %%SIZE      %3 ; [in] length in bytes (0-16 bytes)
%define %%ACCEPT_0  %4 ; 0 = min length = 1, 1 = min length = 0
%define %%ACCEPT_16 %5 ; 0 = max length = 15, 1 = max length = 16
%define %%SIMDTYPE  %6 ; "SSE" or "AVX"

%ifidn %%SIMDTYPE, SSE
 %define %%MOVDQU movdqu
 %define %%PINSRB pinsrb
 %define %%PINSRQ pinsrq
 %define %%PXOR pxor
%else
 %define %%MOVDQU vmovdqu
 %define %%PINSRB vpinsrb
 %define %%PINSRQ vpinsrq
 %define %%PXOR vpxor
%endif

%if (%%ACCEPT_16 != 0)
        test    %%SIZE, 16
        jz      %%_skip_16
        %%MOVDQU %%DST, [%%SRC]
        jmp     %%end_load

%%_skip_16:
%endif
        %%PXOR  %%DST, %%DST    ; clear XMM register
%if (%%ACCEPT_0 != 0)
        or      %%SIZE, %%SIZE
        je      %%end_load
%endif
        ; Dispatch on the exact length, then fall through the insert
        ; chain below: lengths above 8 insert the tail bytes one at a
        ; time and finish with a qword insert of the low 8 bytes.
        cmp     %%SIZE, 1
        je      %%_size_1
        cmp     %%SIZE, 2
        je      %%_size_2
        cmp     %%SIZE, 3
        je      %%_size_3
        cmp     %%SIZE, 4
        je      %%_size_4
        cmp     %%SIZE, 5
        je      %%_size_5
        cmp     %%SIZE, 6
        je      %%_size_6
        cmp     %%SIZE, 7
        je      %%_size_7
        cmp     %%SIZE, 8
        je      %%_size_8
        cmp     %%SIZE, 9
        je      %%_size_9
        cmp     %%SIZE, 10
        je      %%_size_10
        cmp     %%SIZE, 11
        je      %%_size_11
        cmp     %%SIZE, 12
        je      %%_size_12
        cmp     %%SIZE, 13
        je      %%_size_13
        cmp     %%SIZE, 14
        je      %%_size_14

%%_size_15:
        %%PINSRB %%DST, [%%SRC + 14], 14
%%_size_14:
        %%PINSRB %%DST, [%%SRC + 13], 13
%%_size_13:
        %%PINSRB %%DST, [%%SRC + 12], 12
%%_size_12:
        %%PINSRB %%DST, [%%SRC + 11], 11
%%_size_11:
        %%PINSRB %%DST, [%%SRC + 10], 10
%%_size_10:
        %%PINSRB %%DST, [%%SRC + 9], 9
%%_size_9:
        %%PINSRB %%DST, [%%SRC + 8], 8
%%_size_8:
        %%PINSRQ %%DST, [%%SRC], 0
        jmp     %%end_load
%%_size_7:
        %%PINSRB %%DST, [%%SRC + 6], 6
%%_size_6:
        %%PINSRB %%DST, [%%SRC + 5], 5
%%_size_5:
        %%PINSRB %%DST, [%%SRC + 4], 4
%%_size_4:
        %%PINSRB %%DST, [%%SRC + 3], 3
%%_size_3:
        %%PINSRB %%DST, [%%SRC + 2], 2
%%_size_2:
        %%PINSRB %%DST, [%%SRC + 1], 1
%%_size_1:
        %%PINSRB %%DST, [%%SRC + 0], 0
%%end_load:
%endm
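
; --------------------------------------------------------------------
; Sample invocation (an illustrative sketch only; register choices are
; hypothetical): load rdx bytes (0 <= rdx <= 16) from the buffer at
; rsi into xmm0, e.g. when reading a final partial block:
;
;       simd_load_sse_16 xmm0, rsi, rdx
;
; Bytes beyond the requested length are zero, because the destination
; register is cleared with PXOR before the partial-load sequence runs.
; --------------------------------------------------------------------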

%endif ; ifndef __MEMCPY_ASM__