;;
;; Copyright (c) 2023, Intel Corporation
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are met:
;;
;;     * Redistributions of source code must retain the above copyright notice,
;;       this list of conditions and the following disclaimer.
;;     * Redistributions in binary form must reproduce the above copyright
;;       notice, this list of conditions and the following disclaimer in the
;;       documentation and/or other materials provided with the distribution.
;;     * Neither the name of Intel Corporation nor the names of its contributors
;;       may be used to endorse or promote products derived from this software
;;       without specific prior written permission.
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;

%ifndef __MEMCPY_INC__
%define __MEMCPY_INC__

%include "reg_sizes.asm"

; This section defines a series of macros to copy small to medium amounts
; of data from memory to memory, where the size is variable but limited.
;
; The macros are all called as:
;       memcpy DST, SRC, SIZE, TMP0, TMP1, XTMP0, XTMP1, XTMP2, XTMP3
; with the parameters defined as:
;       DST   : register: pointer to dst (not modified)
;       SRC   : register: pointer to src (not modified)
;       SIZE  : register: length in bytes (not modified)
;       TMP0  : 64-bit temp GPR (clobbered)
;       TMP1  : 64-bit temp GPR (clobbered)
;       XTMP0 : temp XMM (clobbered)
;       XTMP1 : temp XMM (clobbered)
;       XTMP2 : temp XMM (clobbered)
;       XTMP3 : temp XMM (clobbered)
;
; The name indicates the options. The name is of the form:
;       memcpy_<VEC>_<SZ><ZERO><RET>
; where:
;       <VEC>  is either "sse" or "avx" or "avx2"
;       <SZ>   is either "64" or "128" and defines the largest value of SIZE
;              (reduced "_16" variants also exist; they take only DST, SRC,
;              SIZE, TMP0, TMP1)
;       <ZERO> is blank or "_1". If "_1" then the min SIZE is 1 (otherwise 0)
;       <RET>  is blank or "_ret". If blank, the code falls through. If "_ret",
;              it does a "ret" at the end
;
; For the avx2 versions, the temp XMM registers need to be YMM registers.
; If the SZ is 64, then only two YMM temps are needed, i.e. it is called as:
;       memcpy_avx2_64  DST, SRC, SIZE, TMP0, TMP1, YTMP0, YTMP1
;       memcpy_avx2_128 DST, SRC, SIZE, TMP0, TMP1, YTMP0, YTMP1, YTMP2, YTMP3
;
; For example:
;       memcpy_sse_64        : SSE,  0 <= size < 64, falls through
;       memcpy_avx_64_1      : AVX1, 1 <= size < 64, falls through
;       memcpy_sse_128_ret   : SSE,  0 <= size < 128, ends with ret
;       memcpy_avx_128_1_ret : AVX1, 1 <= size < 128, ends with ret
;
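; A minimal usage sketch (illustrative only, not assembled as part of this
; file; the function name and the System V argument registers below are
; assumptions made for the example):
;
;       section .text
;       global  copy_up_to_127
;       ; void copy_up_to_127(void *dst, const void *src, uint64_t size)
;       ; with 1 <= size < 128 (the "_1" variant assumes size is non-zero)
;       copy_up_to_127:
;               memcpy_sse_128_1_ret rdi, rsi, rdx, rax, r10, xmm0, xmm1, xmm2, xmm3
;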

%macro memcpy_sse_64 9
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 64, 0, 0
%endm

%macro memcpy_sse_64_1 9
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 64, 0, 0
%endm

%macro memcpy_sse_128 9
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 128, 0, 0
%endm

%macro memcpy_sse_128_1 9
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 128, 0, 0
%endm

%macro memcpy_sse_64_ret 9
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 64, 1, 0
%endm

%macro memcpy_sse_64_1_ret 9
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 64, 1, 0
%endm

%macro memcpy_sse_128_ret 9
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 128, 1, 0
%endm

%macro memcpy_sse_128_1_ret 9
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 128, 1, 0
%endm

%macro memcpy_sse_16 5
        __memcpy_int %1,%2,%3,%4,%5,,,,, 0, 16, 0, 0
%endm

%macro memcpy_sse_16_1 5
        __memcpy_int %1,%2,%3,%4,%5,,,,, 1, 16, 0, 0
%endm

        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%macro memcpy_avx_64 9
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 64, 0, 1
%endm

%macro memcpy_avx_64_1 9
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 64, 0, 1
%endm

%macro memcpy_avx_128 9
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 128, 0, 1
%endm

%macro memcpy_avx_128_1 9
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 128, 0, 1
%endm

%macro memcpy_avx_64_ret 9
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 64, 1, 1
%endm

%macro memcpy_avx_64_1_ret 9
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 64, 1, 1
%endm

%macro memcpy_avx_128_ret 9
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 128, 1, 1
%endm

%macro memcpy_avx_128_1_ret 9
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 128, 1, 1
%endm

%macro memcpy_avx_16 5
        __memcpy_int %1,%2,%3,%4,%5,,,,, 0, 16, 0, 1
%endm

%macro memcpy_avx_16_1 5
        __memcpy_int %1,%2,%3,%4,%5,,,,, 1, 16, 0, 1
%endm

        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%macro memcpy_avx2_64 7
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,--,--, 0, 64, 0, 2
%endm

%macro memcpy_avx2_64_1 7
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,--,--, 1, 64, 0, 2
%endm

%macro memcpy_avx2_128 9
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 128, 0, 2
%endm

%macro memcpy_avx2_128_1 9
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 128, 0, 2
%endm

%macro memcpy_avx2_64_ret 7
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,--,--, 0, 64, 1, 2
%endm

%macro memcpy_avx2_64_1_ret 7
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,--,--, 1, 64, 1, 2
%endm

%macro memcpy_avx2_128_ret 9
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 128, 1, 2
%endm

%macro memcpy_avx2_128_1_ret 9
        __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 128, 1, 2
%endm

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
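
; __memcpy_int below is the common implementation behind all of the memcpy_*
; macros above. There is no copy loop: the code finds the highest set bit of
; SIZE and, for that power of two N, copies the first N bytes and the last N
; bytes of the buffer (the two regions may overlap). Because SIZE is less
; than 2*N, the two copies cover the whole range. Sizes below 16 bytes are
; handled the same way with 8/4/2/1-byte GPR moves.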

%macro __memcpy_int 13
%define %%DST     %1    ; register: pointer to dst (not modified)
%define %%SRC     %2    ; register: pointer to src (not modified)
%define %%SIZE    %3    ; register: length in bytes (not modified)
%define %%TMP0    %4    ; 64-bit temp GPR (clobbered)
%define %%TMP1    %5    ; 64-bit temp GPR (clobbered)
%define %%XTMP0   %6    ; temp XMM (clobbered)
%define %%XTMP1   %7    ; temp XMM (clobbered)
%define %%XTMP2   %8    ; temp XMM (clobbered)
%define %%XTMP3   %9    ; temp XMM (clobbered)
%define %%NOT0    %10   ; if not 0, then assume size cannot be zero
%define %%MAXSIZE %11   ; 128, 64, etc
%define %%USERET  %12   ; if not 0, use "ret" at end
%define %%USEAVX  %13   ; 0 = SSE, 1 = AVX1, 2 = AVX2

%if (%%USERET != 0)
 %define %%DONE ret
%else
 %define %%DONE jmp %%end
%endif

%if (%%USEAVX != 0)
 %define %%MOVDQU vmovdqu
%else
 %define %%MOVDQU movdqu
%endif

%if (%%MAXSIZE >= 128)
        test    %%SIZE, 64
        jz      %%lt64
 %if (%%USEAVX >= 2)
        %%MOVDQU        %%XTMP0, [%%SRC + 0*32]
        %%MOVDQU        %%XTMP1, [%%SRC + 1*32]
        %%MOVDQU        %%XTMP2, [%%SRC + %%SIZE - 2*32]
        %%MOVDQU        %%XTMP3, [%%SRC + %%SIZE - 1*32]

        %%MOVDQU        [%%DST + 0*32], %%XTMP0
        %%MOVDQU        [%%DST + 1*32], %%XTMP1
        %%MOVDQU        [%%DST + %%SIZE - 2*32], %%XTMP2
        %%MOVDQU        [%%DST + %%SIZE - 1*32], %%XTMP3
 %else
        %%MOVDQU        %%XTMP0, [%%SRC + 0*16]
        %%MOVDQU        %%XTMP1, [%%SRC + 1*16]
        %%MOVDQU        %%XTMP2, [%%SRC + 2*16]
        %%MOVDQU        %%XTMP3, [%%SRC + 3*16]
        %%MOVDQU        [%%DST + 0*16], %%XTMP0
        %%MOVDQU        [%%DST + 1*16], %%XTMP1
        %%MOVDQU        [%%DST + 2*16], %%XTMP2
        %%MOVDQU        [%%DST + 3*16], %%XTMP3

        %%MOVDQU        %%XTMP0, [%%SRC + %%SIZE - 4*16]
        %%MOVDQU        %%XTMP1, [%%SRC + %%SIZE - 3*16]
        %%MOVDQU        %%XTMP2, [%%SRC + %%SIZE - 2*16]
        %%MOVDQU        %%XTMP3, [%%SRC + %%SIZE - 1*16]
        %%MOVDQU        [%%DST + %%SIZE - 4*16], %%XTMP0
        %%MOVDQU        [%%DST + %%SIZE - 3*16], %%XTMP1
        %%MOVDQU        [%%DST + %%SIZE - 2*16], %%XTMP2
        %%MOVDQU        [%%DST + %%SIZE - 1*16], %%XTMP3
 %endif
        %%DONE
%endif

%if (%%MAXSIZE >= 64)
%%lt64:
        test    %%SIZE, 32
        jz      %%lt32
 %if (%%USEAVX >= 2)
        %%MOVDQU        %%XTMP0, [%%SRC + 0*32]
        %%MOVDQU        %%XTMP1, [%%SRC + %%SIZE - 1*32]
        %%MOVDQU        [%%DST + 0*32], %%XTMP0
        %%MOVDQU        [%%DST + %%SIZE - 1*32], %%XTMP1
 %else
        %%MOVDQU        %%XTMP0, [%%SRC + 0*16]
        %%MOVDQU        %%XTMP1, [%%SRC + 1*16]
        %%MOVDQU        %%XTMP2, [%%SRC + %%SIZE - 2*16]
        %%MOVDQU        %%XTMP3, [%%SRC + %%SIZE - 1*16]
        %%MOVDQU        [%%DST + 0*16], %%XTMP0
        %%MOVDQU        [%%DST + 1*16], %%XTMP1
        %%MOVDQU        [%%DST + %%SIZE - 2*16], %%XTMP2
        %%MOVDQU        [%%DST + %%SIZE - 1*16], %%XTMP3
 %endif
        %%DONE
%endif

%if (%%MAXSIZE >= 32)
%%lt32:
        test    %%SIZE, 16
        jz      %%lt16
 %if (%%USEAVX >= 2)
        %%MOVDQU        XWORD(%%XTMP0), [%%SRC + 0*16]
        %%MOVDQU        XWORD(%%XTMP1), [%%SRC + %%SIZE - 1*16]
        %%MOVDQU        [%%DST + 0*16], XWORD(%%XTMP0)
        %%MOVDQU        [%%DST + %%SIZE - 1*16], XWORD(%%XTMP1)
 %else
        %%MOVDQU        %%XTMP0, [%%SRC + 0*16]
        %%MOVDQU        %%XTMP1, [%%SRC + %%SIZE - 1*16]
        %%MOVDQU        [%%DST + 0*16], %%XTMP0
        %%MOVDQU        [%%DST + %%SIZE - 1*16], %%XTMP1
 %endif
        %%DONE
%endif

%if (%%MAXSIZE >= 16)
        test    %%SIZE, 16
        jz      %%lt16
        mov     %%TMP0, [%%SRC]
        mov     %%TMP1, [%%SRC + 8]
        mov     [%%DST], %%TMP0
        mov     [%%DST + 8], %%TMP1
%%lt16:
        test    %%SIZE, 8
        jz      %%lt8
        mov     %%TMP0, [%%SRC]
        mov     %%TMP1, [%%SRC + %%SIZE - 8]
        mov     [%%DST], %%TMP0
        mov     [%%DST + %%SIZE - 8], %%TMP1
        %%DONE
%endif

%if (%%MAXSIZE >= 8)
%%lt8:
        test    %%SIZE, 4
        jz      %%lt4
        mov     DWORD(%%TMP0), [%%SRC]
        mov     DWORD(%%TMP1), [%%SRC + %%SIZE - 4]
        mov     [%%DST], DWORD(%%TMP0)
        mov     [%%DST + %%SIZE - 4], DWORD(%%TMP1)
        %%DONE
%endif

%if (%%MAXSIZE >= 4)
%%lt4:
        test    %%SIZE, 2
        jz      %%lt2
        movzx   DWORD(%%TMP0), word [%%SRC]
        movzx   DWORD(%%TMP1), byte [%%SRC + %%SIZE - 1]
        mov     [%%DST], WORD(%%TMP0)
        mov     [%%DST + %%SIZE - 1], BYTE(%%TMP1)
        %%DONE
%endif

%%lt2:
%if (%%NOT0 == 0)
        test    %%SIZE, 1
        jz      %%end
%endif
        movzx   DWORD(%%TMP0), byte [%%SRC]
        mov     [%%DST], BYTE(%%TMP0)
%%end:
%if (%%USERET != 0)
        ret
%endif
%endm

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Utility macro to assist with SIMD shifting
%macro _PSRLDQ 3
%define %%VEC   %1
%define %%REG   %2
%define %%IMM   %3

%ifidn %%VEC, SSE
        psrldq  %%REG, %%IMM
%else
        vpsrldq %%REG, %%REG, %%IMM
%endif
%endm
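
; For example, "_PSRLDQ AVX, xmm0, 4" expands to "vpsrldq xmm0, xmm0, 4",
; and "_PSRLDQ SSE, xmm0, 4" expands to "psrldq xmm0, 4" (the register is
; chosen here just for illustration).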

        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; This section defines a series of macros to store small to medium amounts
; of data from SIMD registers to memory, where the size is variable but limited.
;
; The macros are all called as:
;       simd_store DST, SRC, SIZE, TMP, IDX [, OFFSET]
; with the parameters defined as:
;       DST    : register: pointer to dst (not modified)
;       SRC    : register: src data (clobbered)
;       SIZE   : register: length in bytes (not modified)
;       TMP    : 64-bit temp GPR (clobbered)
;       IDX    : 64-bit GPR to store dst index/offset (clobbered)
;       OFFSET : offset to be applied to the destination pointer (optional)
;
; The name indicates the options. The name is of the form:
;       simd_store_<VEC>
; where <VEC> is the SIMD instruction type e.g. "sse" or "avx"
; (the "_15" variants limit SIZE to a maximum of 15 bytes)
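;
; A usage sketch (illustrative only; the register choices are assumptions):
; store the low "rdx" bytes (0 <= rdx <= 16) of xmm0 at [rdi]:
;
;       simd_store_sse rdi, xmm0, rdx, rax, rcx
;
; and the same store, but starting at byte offset 32 of the destination:
;
;       simd_store_sse rdi, xmm0, rdx, rax, rcx, 32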
"sse" or "avx" 381 382%macro simd_store_sse 5-6 383%if %0 == 6 384 __simd_store %1,%2,%3,%4,%5,SSE,16,%6 385%else 386 __simd_store %1,%2,%3,%4,%5,SSE,16 387%endif 388%endm 389 390%macro simd_store_avx 5-6 391%if %0 == 6 392 __simd_store %1,%2,%3,%4,%5,AVX,16,%6 393%else 394 __simd_store %1,%2,%3,%4,%5,AVX,16 395%endif 396%endm 397 398%macro simd_store_sse_15 5-6 399%if %0 == 6 400 __simd_store %1,%2,%3,%4,%5,SSE,15,%6 401%else 402 __simd_store %1,%2,%3,%4,%5,SSE,15 403%endif 404%endm 405 406%macro simd_store_avx_15 5-6 407%if %0 == 6 408 __simd_store %1,%2,%3,%4,%5,AVX,15,%6 409%else 410 __simd_store %1,%2,%3,%4,%5,AVX,15 411%endif 412%endm 413 414%macro __simd_store 7-8 415%define %%DST %1 ; register: pointer to dst (not modified) 416%define %%SRC %2 ; register: src data (clobbered) 417%define %%SIZE %3 ; register: length in bytes (not modified) 418%define %%TMP %4 ; 64-bit temp GPR (clobbered) 419%define %%IDX %5 ; 64-bit temp GPR to store dst idx (clobbered) 420%define %%SIMDTYPE %6 ; "SSE" or "AVX" 421%define %%MAX_LEN %7 ; maximum length to be stored 422%define %%OFFSET %8 ; offset to be applied to destination pointer 423 424%define %%PSRLDQ _PSRLDQ %%SIMDTYPE, 425 426%ifidn %%SIMDTYPE, SSE 427 %define %%MOVDQU movdqu 428 %define %%MOVQ movq 429%else 430 %define %%MOVDQU vmovdqu 431 %define %%MOVQ vmovq 432%endif 433 434;; determine max byte size for store operation 435%assign max_length_to_store %%MAX_LEN 436 437%if max_length_to_store > 16 438%error "__simd_store macro invoked with MAX_LEN bigger than 16!" 439%endif 440 441%if %0 == 8 442 mov %%IDX, %%OFFSET 443%else 444 xor %%IDX, %%IDX ; zero idx 445%endif 446 447%if max_length_to_store == 16 448 test %%SIZE, 16 449 jz %%lt16 450 %%MOVDQU [%%DST + %%IDX], %%SRC 451 jmp %%end 452%%lt16: 453%endif 454 455%if max_length_to_store >= 8 456 test %%SIZE, 8 457 jz %%lt8 458 %%MOVQ [%%DST + %%IDX], %%SRC 459 %%PSRLDQ %%SRC, 8 460 add %%IDX, 8 461%%lt8: 462%endif 463 464 %%MOVQ %%TMP, %%SRC ; use GPR from now on 465 466%if max_length_to_store >= 4 467 test %%SIZE, 4 468 jz %%lt4 469 mov [%%DST + %%IDX], DWORD(%%TMP) 470 shr %%TMP, 32 471 add %%IDX, 4 472%%lt4: 473%endif 474 475 test %%SIZE, 2 476 jz %%lt2 477 mov [%%DST + %%IDX], WORD(%%TMP) 478 shr %%TMP, 16 479 add %%IDX, 2 480%%lt2: 481 test %%SIZE, 1 482 jz %%end 483 mov [%%DST + %%IDX], BYTE(%%TMP) 484%%end: 485%endm 486 487; This section defines a series of macros to load small to medium amounts 488; (from 0 to 16 bytes) of data from memory to SIMD registers, 489; where the size is variable but limited. 490; 491; The macros are all called as: 492; simd_load DST, SRC, SIZE 493; with the parameters defined as: 494; DST : register: destination XMM register 495; SRC : register: pointer to src data (not modified) 496; SIZE : register: length in bytes (not modified) 497; 498; The name indicates the options. The name is of the form: 499; simd_load_<VEC>_<SZ><ZERO> 500; where: 501; <VEC> is either "sse" or "avx" 502; <SZ> is either "15" or "16" and defines largest value of SIZE 503; <ZERO> is blank or "_1". 
If "_1" then the min SIZE is 1 (otherwise 0) 504; 505; For example: 506; simd_load_sse_16 : SSE, 0 <= size <= 16 507; simd_load_avx_15_1 : AVX, 1 <= size <= 15 508 509%macro simd_load_sse_15_1 3 510 __simd_load %1,%2,%3,0,0,SSE 511%endm 512%macro simd_load_sse_15 3 513 __simd_load %1,%2,%3,1,0,SSE 514%endm 515%macro simd_load_sse_16_1 3 516 __simd_load %1,%2,%3,0,1,SSE 517%endm 518%macro simd_load_sse_16 3 519 __simd_load %1,%2,%3,1,1,SSE 520%endm 521 522%macro simd_load_avx_15_1 3 523 __simd_load %1,%2,%3,0,0,AVX 524%endm 525%macro simd_load_avx_15 3 526 __simd_load %1,%2,%3,1,0,AVX 527%endm 528%macro simd_load_avx_16_1 3 529 __simd_load %1,%2,%3,0,1,AVX 530%endm 531%macro simd_load_avx_16 3 532 __simd_load %1,%2,%3,1,1,AVX 533%endm 534 535%macro __simd_load 6 536%define %%DST %1 ; [out] destination XMM register 537%define %%SRC %2 ; [in] pointer to src data 538%define %%SIZE %3 ; [in] length in bytes (0-16 bytes) 539%define %%ACCEPT_0 %4 ; 0 = min length = 1, 1 = min length = 0 540%define %%ACCEPT_16 %5 ; 0 = max length = 15 , 1 = max length = 16 541%define %%SIMDTYPE %6 ; "SSE" or "AVX" 542 543%ifidn %%SIMDTYPE, SSE 544 %define %%MOVDQU movdqu 545 %define %%PINSRB pinsrb 546 %define %%PINSRQ pinsrq 547 %define %%PXOR pxor 548%else 549 %define %%MOVDQU vmovdqu 550 %define %%PINSRB vpinsrb 551 %define %%PINSRQ vpinsrq 552 %define %%PXOR vpxor 553%endif 554 555%if (%%ACCEPT_16 != 0) 556 test %%SIZE, 16 557 jz %%_skip_16 558 %%MOVDQU %%DST, [%%SRC] 559 jmp %%end_load 560 561%%_skip_16: 562%endif 563 %%PXOR %%DST, %%DST ; clear XMM register 564%if (%%ACCEPT_0 != 0) 565 or %%SIZE, %%SIZE 566 je %%end_load 567%endif 568 cmp %%SIZE, 2 569 jb %%_size_1 570 je %%_size_2 571 cmp %%SIZE, 4 572 jb %%_size_3 573 je %%_size_4 574 cmp %%SIZE, 6 575 jb %%_size_5 576 je %%_size_6 577 cmp %%SIZE, 8 578 jb %%_size_7 579 je %%_size_8 580 cmp %%SIZE, 10 581 jb %%_size_9 582 je %%_size_10 583 cmp %%SIZE, 12 584 jb %%_size_11 585 je %%_size_12 586 cmp %%SIZE, 14 587 jb %%_size_13 588 je %%_size_14 589 590%%_size_15: 591 %%PINSRB %%DST, [%%SRC + 14], 14 592%%_size_14: 593 %%PINSRB %%DST, [%%SRC + 13], 13 594%%_size_13: 595 %%PINSRB %%DST, [%%SRC + 12], 12 596%%_size_12: 597 %%PINSRB %%DST, [%%SRC + 11], 11 598%%_size_11: 599 %%PINSRB %%DST, [%%SRC + 10], 10 600%%_size_10: 601 %%PINSRB %%DST, [%%SRC + 9], 9 602%%_size_9: 603 %%PINSRB %%DST, [%%SRC + 8], 8 604%%_size_8: 605 %%PINSRQ %%DST, [%%SRC], 0 606 jmp %%end_load 607%%_size_7: 608 %%PINSRB %%DST, [%%SRC + 6], 6 609%%_size_6: 610 %%PINSRB %%DST, [%%SRC + 5], 5 611%%_size_5: 612 %%PINSRB %%DST, [%%SRC + 4], 4 613%%_size_4: 614 %%PINSRB %%DST, [%%SRC + 3], 3 615%%_size_3: 616 %%PINSRB %%DST, [%%SRC + 2], 2 617%%_size_2: 618 %%PINSRB %%DST, [%%SRC + 1], 1 619%%_size_1: 620 %%PINSRB %%DST, [%%SRC + 0], 0 621%%end_load: 622%endm 623 624%macro simd_load_avx2 5 625%define %%DST %1 ; [out] destination YMM register 626%define %%SRC %2 ; [in] pointer to src data 627%define %%SIZE %3 ; [in] length in bytes (0-32 bytes) 628%define %%IDX %4 ; [clobbered] Temp GP register to store src idx 629%define %%TMP %5 ; [clobbered] Temp GP register 630 631 test %%SIZE, 32 632 jz %%_skip_32 633 vmovdqu %%DST, [%%SRC] 634 jmp %%end_load 635 636%%_skip_32: 637 vpxor %%DST, %%DST ; clear YMM register 638 or %%SIZE, %%SIZE 639 je %%end_load 640 641 lea %%IDX, [%%SRC] 642 mov %%TMP, %%SIZE 643 cmp %%SIZE, 16 644 jle %%_check_size 645 646 add %%IDX, 16 647 sub %%TMP, 16 648 649%%_check_size: 650 cmp %%TMP, 2 651 jb %%_size_1 652 je %%_size_2 653 cmp %%TMP, 4 654 jb %%_size_3 655 je 

%macro simd_store_avx2 5
%define %%DST  %1 ; register: pointer to dst (not modified)
%define %%SRC  %2 ; register: src data (clobbered)
%define %%SIZE %3 ; register: length in bytes (not modified)
%define %%TMP  %4 ; 64-bit temp GPR (clobbered)
%define %%IDX  %5 ; 64-bit temp GPR to store dst idx (clobbered)

        xor     %%IDX, %%IDX ; zero idx

        test    %%SIZE, 32
        jz      %%lt32
        vmovdqu [%%DST], %%SRC
        jmp     %%end
%%lt32:

        test    %%SIZE, 16
        jz      %%lt16
        vmovdqu [%%DST], XWORD(%%SRC)
        ; Move upper half to lower half for further stores
        vperm2i128 %%SRC, %%SRC, %%SRC, 0x81
        add     %%IDX, 16
%%lt16:

        test    %%SIZE, 8
        jz      %%lt8
        vmovq   [%%DST + %%IDX], XWORD(%%SRC)
        vpsrldq XWORD(%%SRC), 8
        add     %%IDX, 8
%%lt8:

        vmovq   %%TMP, XWORD(%%SRC) ; use GPR from now on

        test    %%SIZE, 4
        jz      %%lt4
        mov     [%%DST + %%IDX], DWORD(%%TMP)
        shr     %%TMP, 32
        add     %%IDX, 4
%%lt4:

        test    %%SIZE, 2
        jz      %%lt2
        mov     [%%DST + %%IDX], WORD(%%TMP)
        shr     %%TMP, 16
        add     %%IDX, 2
%%lt2:
        test    %%SIZE, 1
        jz      %%end
        mov     [%%DST + %%IDX], BYTE(%%TMP)
%%end:
%endm

%endif ; ifndef __MEMCPY_INC__