;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2019 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%ifndef __MEMCPY_ASM__
%define __MEMCPY_ASM__

%include "reg_sizes.asm"


; This file defines a series of macros to copy small to medium amounts
; of data from memory to memory, where the size is variable but limited.
;
; The macros are all called as:
; memcpy DST, SRC, SIZE, TMP0, TMP1, XTMP0, XTMP1, XTMP2, XTMP3
; with the parameters defined as:
;    DST     : register: pointer to dst (not modified)
;    SRC     : register: pointer to src (not modified)
;    SIZE    : register: length in bytes (not modified)
;    TMP0    : 64-bit temp GPR (clobbered)
;    TMP1    : 64-bit temp GPR (clobbered)
;    XTMP0   : temp XMM (clobbered)
;    XTMP1   : temp XMM (clobbered)
;    XTMP2   : temp XMM (clobbered)
;    XTMP3   : temp XMM (clobbered)
;
; The name indicates the options. The name is of the form:
; memcpy_<VEC>_<SZ><ZERO><RET>
; where:
; <VEC> is either "sse" or "avx" or "avx2"
; <SZ> is "16", "64" or "128" and defines the largest value of SIZE
;      (the "16" variants take only the DST, SRC, SIZE, TMP0, TMP1 arguments)
; <ZERO> is blank or "_1". If "_1" then the min SIZE is 1 (otherwise 0)
; <RET> is blank or "_ret". If blank, the code falls through. If "_ret",
;                           it ends with a "ret"
;
; For the avx2 versions, the temp XMM registers need to be YMM registers.
; If the SZ is 64, then only two YMM temps are needed, i.e. they are called as:
; memcpy_avx2_64 DST, SRC, SIZE, TMP0, TMP1, YTMP0, YTMP1
; memcpy_avx2_128 DST, SRC, SIZE, TMP0, TMP1, YTMP0, YTMP1, YTMP2, YTMP3
;
; For example:
; memcpy_sse_64		: SSE,  0 <= size < 64, falls through
; memcpy_avx_64_1	: AVX1, 1 <= size < 64, falls through
; memcpy_sse_128_ret	: SSE,  0 <= size < 128, ends with ret
; memcpy_avx_128_1_ret	: AVX1, 1 <= size < 128, ends with ret
;
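; A minimal usage sketch (illustrative only; the register and macro choices
; below are arbitrary examples, not requirements of this file):
;
;	memcpy_sse_128_1 rdi, rsi, rdx, rax, rbx, xmm0, xmm1, xmm2, xmm3
;	memcpy_avx2_64_ret rdi, rsi, rdx, rax, rbx, ymm0, ymm1
;
; The first invocation copies 1 <= rdx < 128 bytes from [rsi] to [rdi] and
; falls through; the second copies 0 <= rdx < 64 bytes and then returns.
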
%macro memcpy_sse_64 9
	__memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 64, 0, 0
%endm

%macro memcpy_sse_64_1 9
	__memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 64, 0, 0
%endm

%macro memcpy_sse_128 9
	__memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 128, 0, 0
%endm

%macro memcpy_sse_128_1 9
	__memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 128, 0, 0
%endm

%macro memcpy_sse_64_ret 9
	__memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 64, 1, 0
%endm

%macro memcpy_sse_64_1_ret 9
	__memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 64, 1, 0
%endm

%macro memcpy_sse_128_ret 9
	__memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 128, 1, 0
%endm

%macro memcpy_sse_128_1_ret 9
	__memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 128, 1, 0
%endm


%macro memcpy_sse_16 5
	__memcpy_int %1,%2,%3,%4,%5,,,,, 0, 16, 0, 0
%endm

%macro memcpy_sse_16_1 5
	__memcpy_int %1,%2,%3,%4,%5,,,,, 1, 16, 0, 0
%endm

	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%macro memcpy_avx_64 9
	__memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 64, 0, 1
%endm

%macro memcpy_avx_64_1 9
	__memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 64, 0, 1
%endm

%macro memcpy_avx_128 9
	__memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 128, 0, 1
%endm

%macro memcpy_avx_128_1 9
	__memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 128, 0, 1
%endm

%macro memcpy_avx_64_ret 9
	__memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 64, 1, 1
%endm

%macro memcpy_avx_64_1_ret 9
	__memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 64, 1, 1
%endm

%macro memcpy_avx_128_ret 9
	__memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 128, 1, 1
%endm

%macro memcpy_avx_128_1_ret 9
	__memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 128, 1, 1
%endm


%macro memcpy_avx_16 5
	__memcpy_int %1,%2,%3,%4,%5,,,,, 0, 16, 0, 1
%endm

%macro memcpy_avx_16_1 5
	__memcpy_int %1,%2,%3,%4,%5,,,,, 1, 16, 0, 1
%endm

	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%macro memcpy_avx2_64 7
	__memcpy_int %1,%2,%3,%4,%5,%6,%7,--,--, 0, 64, 0, 2
%endm

%macro memcpy_avx2_64_1 7
	__memcpy_int %1,%2,%3,%4,%5,%6,%7,--,--, 1, 64, 0, 2
%endm

%macro memcpy_avx2_128 9
	__memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 128, 0, 2
%endm

%macro memcpy_avx2_128_1 9
	__memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 128, 0, 2
%endm

%macro memcpy_avx2_64_ret 7
	__memcpy_int %1,%2,%3,%4,%5,%6,%7,--,--, 0, 64, 1, 2
%endm

%macro memcpy_avx2_64_1_ret 7
	__memcpy_int %1,%2,%3,%4,%5,%6,%7,--,--, 1, 64, 1, 2
%endm

%macro memcpy_avx2_128_ret 9
	__memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 128, 1, 2
%endm

%macro memcpy_avx2_128_1_ret 9
	__memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 128, 1, 2
%endm


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;


%macro __memcpy_int 13
%define %%DST     %1	; register: pointer to dst (not modified)
%define %%SRC     %2	; register: pointer to src (not modified)
%define %%SIZE    %3	; register: length in bytes (not modified)
%define %%TMP0    %4	; 64-bit temp GPR (clobbered)
%define %%TMP1    %5	; 64-bit temp GPR (clobbered)
%define %%XTMP0   %6	; temp XMM (clobbered)
%define %%XTMP1   %7	; temp XMM (clobbered)
%define %%XTMP2   %8	; temp XMM (clobbered)
%define %%XTMP3   %9	; temp XMM (clobbered)
%define %%NOT0    %10	; if not 0, then assume size cannot be zero
%define %%MAXSIZE %11	; 128, 64, etc
%define %%USERET  %12   ; if not 0, use "ret" at end
%define %%USEAVX  %13   ; 0 = SSE, 1 = AVX1, 2 = AVX2

%if (%%USERET != 0)
 %define %%DONE	ret
%else
 %define %%DONE jmp %%end
%endif

%if (%%USEAVX != 0)
 %define %%MOVDQU vmovdqu
%else
 %define %%MOVDQU movdqu
%endif

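	; Strategy: each branch below handles the case where the tested bit is
	; the highest set bit of SIZE, copying a chunk from the start of the
	; buffer and a chunk ending at SIZE. The two chunks may overlap in the
	; middle but together cover the whole range, so no loop is needed.
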
%if (%%MAXSIZE >= 128)
	test	%%SIZE, 64
	jz	%%lt64
  %if (%%USEAVX >= 2)
	%%MOVDQU	%%XTMP0, [%%SRC + 0*32]
	%%MOVDQU	%%XTMP1, [%%SRC + 1*32]
	%%MOVDQU	%%XTMP2, [%%SRC + %%SIZE - 2*32]
	%%MOVDQU	%%XTMP3, [%%SRC + %%SIZE - 1*32]

	%%MOVDQU	[%%DST + 0*32], %%XTMP0
	%%MOVDQU	[%%DST + 1*32], %%XTMP1
	%%MOVDQU	[%%DST + %%SIZE - 2*32], %%XTMP2
	%%MOVDQU	[%%DST + %%SIZE - 1*32], %%XTMP3
  %else
	%%MOVDQU	%%XTMP0, [%%SRC + 0*16]
	%%MOVDQU	%%XTMP1, [%%SRC + 1*16]
	%%MOVDQU	%%XTMP2, [%%SRC + 2*16]
	%%MOVDQU	%%XTMP3, [%%SRC + 3*16]
	%%MOVDQU	[%%DST + 0*16], %%XTMP0
	%%MOVDQU	[%%DST + 1*16], %%XTMP1
	%%MOVDQU	[%%DST + 2*16], %%XTMP2
	%%MOVDQU	[%%DST + 3*16], %%XTMP3

	%%MOVDQU	%%XTMP0, [%%SRC + %%SIZE - 4*16]
	%%MOVDQU	%%XTMP1, [%%SRC + %%SIZE - 3*16]
	%%MOVDQU	%%XTMP2, [%%SRC + %%SIZE - 2*16]
	%%MOVDQU	%%XTMP3, [%%SRC + %%SIZE - 1*16]
	%%MOVDQU	[%%DST + %%SIZE - 4*16], %%XTMP0
	%%MOVDQU	[%%DST + %%SIZE - 3*16], %%XTMP1
	%%MOVDQU	[%%DST + %%SIZE - 2*16], %%XTMP2
	%%MOVDQU	[%%DST + %%SIZE - 1*16], %%XTMP3
  %endif
	%%DONE
%endif

%if (%%MAXSIZE >= 64)
%%lt64:
	test	%%SIZE, 32
	jz	%%lt32
  %if (%%USEAVX >= 2)
	%%MOVDQU	%%XTMP0, [%%SRC + 0*32]
	%%MOVDQU	%%XTMP1, [%%SRC + %%SIZE - 1*32]
	%%MOVDQU	[%%DST + 0*32], %%XTMP0
	%%MOVDQU	[%%DST + %%SIZE - 1*32], %%XTMP1
  %else
	%%MOVDQU	%%XTMP0, [%%SRC + 0*16]
	%%MOVDQU	%%XTMP1, [%%SRC + 1*16]
	%%MOVDQU	%%XTMP2, [%%SRC + %%SIZE - 2*16]
	%%MOVDQU	%%XTMP3, [%%SRC + %%SIZE - 1*16]
	%%MOVDQU	[%%DST + 0*16], %%XTMP0
	%%MOVDQU	[%%DST + 1*16], %%XTMP1
	%%MOVDQU	[%%DST + %%SIZE - 2*16], %%XTMP2
	%%MOVDQU	[%%DST + %%SIZE - 1*16], %%XTMP3
  %endif
	%%DONE
%endif

%if (%%MAXSIZE >= 32)
%%lt32:
	test	%%SIZE, 16
	jz	%%lt16
  %if (%%USEAVX >= 2)
	%%MOVDQU	XWORD(%%XTMP0), [%%SRC + 0*16]
	%%MOVDQU	XWORD(%%XTMP1), [%%SRC + %%SIZE - 1*16]
	%%MOVDQU	[%%DST + 0*16], XWORD(%%XTMP0)
	%%MOVDQU	[%%DST + %%SIZE - 1*16], XWORD(%%XTMP1)
  %else
	%%MOVDQU	%%XTMP0, [%%SRC + 0*16]
	%%MOVDQU	%%XTMP1, [%%SRC + %%SIZE - 1*16]
	%%MOVDQU	[%%DST + 0*16], %%XTMP0
	%%MOVDQU	[%%DST + %%SIZE - 1*16], %%XTMP1
  %endif
	%%DONE
%endif

%if (%%MAXSIZE >= 16)
%%lt16:
	test	%%SIZE, 8
	jz	%%lt8
	mov	%%TMP0, [%%SRC]
	mov	%%TMP1, [%%SRC + %%SIZE - 8]
	mov	[%%DST], %%TMP0
	mov	[%%DST + %%SIZE - 8], %%TMP1
	%%DONE
%endif

%if (%%MAXSIZE >= 8)
%%lt8:
	test	%%SIZE, 4
	jz	%%lt4
	mov	DWORD(%%TMP0), [%%SRC]
	mov	DWORD(%%TMP1), [%%SRC + %%SIZE - 4]
	mov	[%%DST], DWORD(%%TMP0)
	mov	[%%DST + %%SIZE - 4], DWORD(%%TMP1)
	%%DONE
%endif

%if (%%MAXSIZE >= 4)
%%lt4:
	test	%%SIZE, 2
	jz	%%lt2
	movzx	DWORD(%%TMP0), word [%%SRC]
	movzx	DWORD(%%TMP1), byte [%%SRC + %%SIZE - 1]
	mov	[%%DST], WORD(%%TMP0)
	mov	[%%DST + %%SIZE - 1], BYTE(%%TMP1)
	%%DONE
%endif

%%lt2:
%if (%%NOT0 == 0)
	 test	 %%SIZE, 1
	 jz	 %%end
%endif
	movzx	DWORD(%%TMP0), byte [%%SRC]
	mov	[%%DST], BYTE(%%TMP0)
%%end:
%if (%%USERET != 0)
	ret
%endif
%endm

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Utility macro to assist with SIMD shifting
%macro _PSRLDQ 3
%define %%VEC   %1
%define %%REG   %2
%define %%IMM   %3

%ifidn %%VEC, SSE
        psrldq  %%REG, %%IMM
%else
        vpsrldq %%REG, %%REG, %%IMM
%endif
%endm

        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; This section defines a series of macros to store small to medium amounts
; of data from SIMD registers to memory, where the size is variable but limited.
;
; The macros are all called as:
; simd_store_<VEC> DST, SRC, SIZE, TMP, IDX
; with the parameters defined as:
;    DST     : register: pointer to dst (not modified)
;    SRC     : register: src data (clobbered)
;    SIZE    : register: length in bytes (not modified)
;    TMP     : 64-bit temp GPR (clobbered)
;    IDX     : 64-bit GPR to store dst index/offset (clobbered)
;
; The name indicates the options. The name is of the form:
; simd_store_<VEC>
; where <VEC> is the SIMD instruction type e.g. "sse" or "avx"
; An optional "_15" suffix (e.g. simd_store_sse_15) limits the max SIZE to 15.

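; A minimal usage sketch (illustrative only; the register choices below are
; arbitrary examples, not requirements of this file):
;
;       simd_store_avx rdi, xmm0, rdx, rax, r10
;
; This stores the low rdx bytes (0 to 16) of xmm0 at [rdi], clobbering xmm0,
; rax and r10.
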
%macro simd_store_sse 5
        __simd_store %1,%2,%3,%4,%5,SSE
%endm

%macro simd_store_avx 5
        __simd_store %1,%2,%3,%4,%5,AVX
%endm

%macro simd_store_sse_15 5
        __simd_store %1,%2,%3,%4,%5,SSE,15
%endm

%macro simd_store_avx_15 5
        __simd_store %1,%2,%3,%4,%5,AVX,15
%endm

%macro __simd_store 6-7
%define %%DST      %1    ; register: pointer to dst (not modified)
%define %%SRC      %2    ; register: src data (clobbered)
%define %%SIZE     %3    ; register: length in bytes (not modified)
%define %%TMP      %4    ; 64-bit temp GPR (clobbered)
%define %%IDX      %5    ; 64-bit temp GPR to store dst idx (clobbered)
%define %%SIMDTYPE %6    ; "SSE" or "AVX"
%define %%MAX_LEN  %7    ; [optional] maximum length to be stored, default 16

%define %%PSRLDQ _PSRLDQ %%SIMDTYPE,

%ifidn %%SIMDTYPE, SSE
 %define %%MOVDQU movdqu
 %define %%MOVQ movq
%else
 %define %%MOVDQU vmovdqu
 %define %%MOVQ vmovq
%endif

;; determine max byte size for store operation
%if %0 > 6
%assign max_length_to_store %%MAX_LEN
%else
%assign max_length_to_store 16
%endif

%if max_length_to_store > 16
%error "__simd_store macro invoked with MAX_LEN bigger than 16!"
%endif

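        ;; Store strategy: check SIZE bit by bit, from the largest chunk down.
        ;; A full 16-byte store finishes immediately; after an 8-byte store
        ;; the source register is shifted right by 8 so the remaining bytes
        ;; move into the low qword, which is then copied to a GPR and written
        ;; out with 4/2/1-byte stores. IDX tracks the destination offset.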
        xor %%IDX, %%IDX        ; zero idx

%if max_length_to_store == 16
        test    %%SIZE, 16
        jz      %%lt16
        %%MOVDQU [%%DST], %%SRC
        jmp     %%end
%%lt16:
%endif

%if max_length_to_store >= 8
        test    %%SIZE, 8
        jz      %%lt8
        %%MOVQ  [%%DST + %%IDX], %%SRC
        %%PSRLDQ %%SRC, 8
        add     %%IDX, 8
%%lt8:
%endif

        %%MOVQ %%TMP, %%SRC     ; use GPR from now on

%if max_length_to_store >= 4
        test    %%SIZE, 4
        jz      %%lt4
        mov     [%%DST + %%IDX], DWORD(%%TMP)
        shr     %%TMP, 32
        add     %%IDX, 4
%%lt4:
%endif

        test    %%SIZE, 2
        jz      %%lt2
        mov     [%%DST + %%IDX], WORD(%%TMP)
        shr     %%TMP, 16
        add     %%IDX, 2
%%lt2:
        test    %%SIZE, 1
        jz      %%end
        mov     [%%DST + %%IDX], BYTE(%%TMP)
%%end:
%endm

; This section defines a series of macros to load small to medium amounts
; (from 0 to 16 bytes) of data from memory to SIMD registers,
; where the size is variable but limited.
;
; The macros are all called as:
; simd_load DST, SRC, SIZE
; with the parameters defined as:
;    DST     : register: destination XMM register
;    SRC     : register: pointer to src data (not modified)
;    SIZE    : register: length in bytes (not modified)
;
; The name indicates the options. The name is of the form:
; simd_load_<VEC>_<SZ><ZERO>
; where:
; <VEC> is either "sse" or "avx"
; <SZ> is either "15" or "16" and defines largest value of SIZE
; <ZERO> is blank or "_1". If "_1" then the min SIZE is 1 (otherwise 0)
;
; For example:
; simd_load_sse_16	: SSE, 0 <= size <= 16
; simd_load_avx_15_1	: AVX, 1 <= size <= 15

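; A minimal usage sketch (illustrative only; the register choices below are
; arbitrary examples, not requirements of this file):
;
;       simd_load_sse_16 xmm0, rsi, rdx
;
; This loads the first rdx bytes (0 to 16) from [rsi] into xmm0; any unused
; upper bytes of xmm0 are zeroed.
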
%macro simd_load_sse_15_1 3
        __simd_load %1,%2,%3,0,0,SSE
%endm
%macro simd_load_sse_15 3
        __simd_load %1,%2,%3,1,0,SSE
%endm
%macro simd_load_sse_16_1 3
        __simd_load %1,%2,%3,0,1,SSE
%endm
%macro simd_load_sse_16 3
        __simd_load %1,%2,%3,1,1,SSE
%endm

%macro simd_load_avx_15_1 3
        __simd_load %1,%2,%3,0,0,AVX
%endm
%macro simd_load_avx_15 3
        __simd_load %1,%2,%3,1,0,AVX
%endm
%macro simd_load_avx_16_1 3
        __simd_load %1,%2,%3,0,1,AVX
%endm
%macro simd_load_avx_16 3
        __simd_load %1,%2,%3,1,1,AVX
%endm

%macro __simd_load 6
%define %%DST       %1    ; [out] destination XMM register
%define %%SRC       %2    ; [in] pointer to src data
%define %%SIZE      %3    ; [in] length in bytes (0-16 bytes)
%define %%ACCEPT_0  %4    ; if not 0, SIZE may be 0 (otherwise min SIZE is 1)
%define %%ACCEPT_16 %5    ; if not 0, max SIZE is 16 (otherwise 15)
%define %%SIMDTYPE  %6    ; "SSE" or "AVX"

%ifidn %%SIMDTYPE, SSE
 %define %%MOVDQU movdqu
 %define %%PINSRB pinsrb
 %define %%PINSRQ pinsrq
 %define %%PXOR   pxor
%else
 %define %%MOVDQU vmovdqu
 %define %%PINSRB vpinsrb
 %define %%PINSRQ vpinsrq
 %define %%PXOR   vpxor
%endif

%if (%%ACCEPT_16 != 0)
        test    %%SIZE, 16
        jz      %%_skip_16
        %%MOVDQU %%DST, [%%SRC]
        jmp     %%end_load

%%_skip_16:
%endif
        %%PXOR  %%DST, %%DST ; clear XMM register
%if (%%ACCEPT_0 != 0)
        or      %%SIZE, %%SIZE
        je      %%end_load
%endif
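        ;; Dispatch on the exact size and fall through a chain of inserts:
        ;; entering at %%_size_N inserts bytes N-1 down to 8 one at a time
        ;; (when N > 8) and then the whole low qword with a single 8-byte
        ;; insert; for N <= 7 the chain inserts bytes N-1 down to 0.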
        cmp     %%SIZE, 1
        je      %%_size_1
        cmp     %%SIZE, 2
        je      %%_size_2
        cmp     %%SIZE, 3
        je      %%_size_3
        cmp     %%SIZE, 4
        je      %%_size_4
        cmp     %%SIZE, 5
        je      %%_size_5
        cmp     %%SIZE, 6
        je      %%_size_6
        cmp     %%SIZE, 7
        je      %%_size_7
        cmp     %%SIZE, 8
        je      %%_size_8
        cmp     %%SIZE, 9
        je      %%_size_9
        cmp     %%SIZE, 10
        je      %%_size_10
        cmp     %%SIZE, 11
        je      %%_size_11
        cmp     %%SIZE, 12
        je      %%_size_12
        cmp     %%SIZE, 13
        je      %%_size_13
        cmp     %%SIZE, 14
        je      %%_size_14

%%_size_15:
        %%PINSRB %%DST, [%%SRC + 14], 14
%%_size_14:
        %%PINSRB %%DST, [%%SRC + 13], 13
%%_size_13:
        %%PINSRB %%DST, [%%SRC + 12], 12
%%_size_12:
        %%PINSRB %%DST, [%%SRC + 11], 11
%%_size_11:
        %%PINSRB %%DST, [%%SRC + 10], 10
%%_size_10:
        %%PINSRB %%DST, [%%SRC + 9], 9
%%_size_9:
        %%PINSRB %%DST, [%%SRC + 8], 8
%%_size_8:
        %%PINSRQ %%DST, [%%SRC], 0
        jmp    %%end_load
%%_size_7:
        %%PINSRB %%DST, [%%SRC + 6], 6
%%_size_6:
        %%PINSRB %%DST, [%%SRC + 5], 5
%%_size_5:
        %%PINSRB %%DST, [%%SRC + 4], 4
%%_size_4:
        %%PINSRB %%DST, [%%SRC + 3], 3
%%_size_3:
        %%PINSRB %%DST, [%%SRC + 2], 2
%%_size_2:
        %%PINSRB %%DST, [%%SRC + 1], 1
%%_size_1:
        %%PINSRB %%DST, [%%SRC + 0], 0
%%end_load:
%endm

%endif ; ifndef __MEMCPY_ASM__