;;
;; Copyright (c) 2023, Intel Corporation
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are met:
;;
;;     * Redistributions of source code must retain the above copyright notice,
;;       this list of conditions and the following disclaimer.
;;     * Redistributions in binary form must reproduce the above copyright
;;       notice, this list of conditions and the following disclaimer in the
;;       documentation and/or other materials provided with the distribution.
;;     * Neither the name of Intel Corporation nor the names of its contributors
;;       may be used to endorse or promote products derived from this software
;;       without specific prior written permission.
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;

%ifndef __MEMCPY_INC__
%define __MEMCPY_INC__

%include "reg_sizes.asm"

; This section defines a series of macros to copy small to medium amounts
; of data from memory to memory, where the size is variable but limited.
;
; The macros are all called as:
; memcpy DST, SRC, SIZE, TMP0, TMP1, XTMP0, XTMP1, XTMP2, XTMP3
; with the parameters defined as:
;    DST     : register: pointer to dst (not modified)
;    SRC     : register: pointer to src (not modified)
;    SIZE    : register: length in bytes (not modified)
;    TMP0    : 64-bit temp GPR (clobbered)
;    TMP1    : 64-bit temp GPR (clobbered)
;    XTMP0   : temp XMM (clobbered)
;    XTMP1   : temp XMM (clobbered)
;    XTMP2   : temp XMM (clobbered)
;    XTMP3   : temp XMM (clobbered)
;
; The name indicates the options. The name is of the form:
; memcpy_<VEC>_<SZ><ZERO><RET>
; where:
; <VEC> is either "sse" or "avx" or "avx2"
; <SZ> is either "64" or "128" and defines the largest value of SIZE
; <ZERO> is blank or "_1". If "_1" then the min SIZE is 1 (otherwise 0)
; <RET> is blank or "_ret". If blank, the code falls through. If "_ret",
;                           it does a "ret" at the end
;
; For the avx2 versions, the temp XMM registers need to be YMM registers.
; If the SZ is 64, then only two YMM temps are needed, i.e. it is called as:
; memcpy_avx2_64 DST, SRC, SIZE, TMP0, TMP1, YTMP0, YTMP1
; memcpy_avx2_128 DST, SRC, SIZE, TMP0, TMP1, YTMP0, YTMP1, YTMP2, YTMP3
;
; For example:
; memcpy_sse_64		: SSE,  0 <= size < 64, falls through
; memcpy_avx_64_1	: AVX1, 1 <= size < 64, falls through
; memcpy_sse_128_ret	: SSE,  0 <= size < 128, ends with ret
; memcpy_avx_128_1_ret	: AVX1, 1 <= size < 128, ends with ret
;

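; As an illustrative sketch only (the routine names, registers and calling
; convention below are assumptions for the example, not part of this file):
; a SysV x86-64 helper copying size bytes (0 <= size < 128) from rsi to rdi
; could invoke one of these macros as
;
;	copy_small_sse:			; rdi = dst, rsi = src, rdx = size
;		memcpy_sse_128_ret rdi, rsi, rdx, rax, rcx, xmm0, xmm1, xmm2, xmm3
;
; and an AVX2 variant for sizes below 64 needs only two YMM temporaries:
;
;	copy_small_avx2:		; rdi = dst, rsi = src, rdx = size
;		memcpy_avx2_64_ret rdi, rsi, rdx, rax, rcx, ymm0, ymm1
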
%macro memcpy_sse_64 9
	__memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 64, 0, 0
%endm

%macro memcpy_sse_64_1 9
	__memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 64, 0, 0
%endm

%macro memcpy_sse_128 9
	__memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 128, 0, 0
%endm

%macro memcpy_sse_128_1 9
	__memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 128, 0, 0
%endm

%macro memcpy_sse_64_ret 9
	__memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 64, 1, 0
%endm

%macro memcpy_sse_64_1_ret 9
	__memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 64, 1, 0
%endm

%macro memcpy_sse_128_ret 9
	__memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 128, 1, 0
%endm

%macro memcpy_sse_128_1_ret 9
	__memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 128, 1, 0
%endm

%macro memcpy_sse_16 5
	__memcpy_int %1,%2,%3,%4,%5,,,,, 0, 16, 0, 0
%endm

%macro memcpy_sse_16_1 5
	__memcpy_int %1,%2,%3,%4,%5,,,,, 1, 16, 0, 0
%endm

	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%macro memcpy_avx_64 9
	__memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 64, 0, 1
%endm

%macro memcpy_avx_64_1 9
	__memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 64, 0, 1
%endm

%macro memcpy_avx_128 9
	__memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 128, 0, 1
%endm

%macro memcpy_avx_128_1 9
	__memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 128, 0, 1
%endm

%macro memcpy_avx_64_ret 9
	__memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 64, 1, 1
%endm

%macro memcpy_avx_64_1_ret 9
	__memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 64, 1, 1
%endm

%macro memcpy_avx_128_ret 9
	__memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 128, 1, 1
%endm

%macro memcpy_avx_128_1_ret 9
	__memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 128, 1, 1
%endm

%macro memcpy_avx_16 5
	__memcpy_int %1,%2,%3,%4,%5,,,,, 0, 16, 0, 1
%endm

%macro memcpy_avx_16_1 5
	__memcpy_int %1,%2,%3,%4,%5,,,,, 1, 16, 0, 1
%endm

	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%macro memcpy_avx2_64 7
	__memcpy_int %1,%2,%3,%4,%5,%6,%7,--,--, 0, 64, 0, 2
%endm

%macro memcpy_avx2_64_1 7
	__memcpy_int %1,%2,%3,%4,%5,%6,%7,--,--, 1, 64, 0, 2
%endm

%macro memcpy_avx2_128 9
	__memcpy_int %1,%2,%3,%4,%5,%6,%7, %8, %9, 0, 128, 0, 2
%endm

%macro memcpy_avx2_128_1 9
	__memcpy_int %1,%2,%3,%4,%5,%6,%7, %8, %9, 1, 128, 0, 2
%endm

%macro memcpy_avx2_64_ret 7
	__memcpy_int %1,%2,%3,%4,%5,%6,%7,--,--, 0, 64, 1, 2
%endm

%macro memcpy_avx2_64_1_ret 7
	__memcpy_int %1,%2,%3,%4,%5,%6,%7,--,--, 1, 64, 1, 2
%endm

%macro memcpy_avx2_128_ret 9
	__memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 128, 1, 2
%endm

%macro memcpy_avx2_128_1_ret 9
	__memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 128, 1, 2
%endm

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%macro __memcpy_int 13
%define %%DST     %1	; register: pointer to dst (not modified)
%define %%SRC     %2	; register: pointer to src (not modified)
%define %%SIZE    %3	; register: length in bytes (not modified)
%define %%TMP0    %4	; 64-bit temp GPR (clobbered)
%define %%TMP1    %5	; 64-bit temp GPR (clobbered)
%define %%XTMP0   %6	; temp XMM (clobbered)
%define %%XTMP1   %7	; temp XMM (clobbered)
%define %%XTMP2   %8	; temp XMM (clobbered)
%define %%XTMP3   %9	; temp XMM (clobbered)
%define %%NOT0    %10	; if not 0, then assume size cannot be zero
%define %%MAXSIZE %11	; 128, 64, etc
%define %%USERET  %12   ; if not 0, use "ret" at end
%define %%USEAVX  %13   ; 0 = SSE, 1 = AVX1, 2 = AVX2

%if (%%USERET != 0)
 %define %%DONE	ret
%else
 %define %%DONE jmp %%end
%endif

%if (%%USEAVX != 0)
 %define %%MOVDQU vmovdqu
%else
 %define %%MOVDQU movdqu
%endif

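;; The body dispatches on the bits of SIZE, working down from the largest
;; supported chunk size to a single byte.  When a bit is set, the remaining
;; size is less than twice that chunk size, so copying one chunk from the
;; start and one chunk ending at offset SIZE (the two may overlap) covers
;; the whole region; the stage then finishes via %%DONE.  When the bit is
;; clear, control falls through to the test for the next smaller chunk.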
%if (%%MAXSIZE >= 128)
	test	%%SIZE, 64
	jz	%%lt64
  %if (%%USEAVX >= 2)
	%%MOVDQU	%%XTMP0, [%%SRC + 0*32]
	%%MOVDQU	%%XTMP1, [%%SRC + 1*32]
	%%MOVDQU	%%XTMP2, [%%SRC + %%SIZE - 2*32]
	%%MOVDQU	%%XTMP3, [%%SRC + %%SIZE - 1*32]

	%%MOVDQU	[%%DST + 0*32], %%XTMP0
	%%MOVDQU	[%%DST + 1*32], %%XTMP1
	%%MOVDQU	[%%DST + %%SIZE - 2*32], %%XTMP2
	%%MOVDQU	[%%DST + %%SIZE - 1*32], %%XTMP3
  %else
	%%MOVDQU	%%XTMP0, [%%SRC + 0*16]
	%%MOVDQU	%%XTMP1, [%%SRC + 1*16]
	%%MOVDQU	%%XTMP2, [%%SRC + 2*16]
	%%MOVDQU	%%XTMP3, [%%SRC + 3*16]
	%%MOVDQU	[%%DST + 0*16], %%XTMP0
	%%MOVDQU	[%%DST + 1*16], %%XTMP1
	%%MOVDQU	[%%DST + 2*16], %%XTMP2
	%%MOVDQU	[%%DST + 3*16], %%XTMP3

	%%MOVDQU	%%XTMP0, [%%SRC + %%SIZE - 4*16]
	%%MOVDQU	%%XTMP1, [%%SRC + %%SIZE - 3*16]
	%%MOVDQU	%%XTMP2, [%%SRC + %%SIZE - 2*16]
	%%MOVDQU	%%XTMP3, [%%SRC + %%SIZE - 1*16]
	%%MOVDQU	[%%DST + %%SIZE - 4*16], %%XTMP0
	%%MOVDQU	[%%DST + %%SIZE - 3*16], %%XTMP1
	%%MOVDQU	[%%DST + %%SIZE - 2*16], %%XTMP2
	%%MOVDQU	[%%DST + %%SIZE - 1*16], %%XTMP3
  %endif
	%%DONE
%endif

%if (%%MAXSIZE >= 64)
%%lt64:
	test	%%SIZE, 32
	jz	%%lt32
  %if (%%USEAVX >= 2)
	%%MOVDQU	%%XTMP0, [%%SRC + 0*32]
	%%MOVDQU	%%XTMP1, [%%SRC + %%SIZE - 1*32]
	%%MOVDQU	[%%DST + 0*32], %%XTMP0
	%%MOVDQU	[%%DST + %%SIZE - 1*32], %%XTMP1
  %else
	%%MOVDQU	%%XTMP0, [%%SRC + 0*16]
	%%MOVDQU	%%XTMP1, [%%SRC + 1*16]
	%%MOVDQU	%%XTMP2, [%%SRC + %%SIZE - 2*16]
	%%MOVDQU	%%XTMP3, [%%SRC + %%SIZE - 1*16]
	%%MOVDQU	[%%DST + 0*16], %%XTMP0
	%%MOVDQU	[%%DST + 1*16], %%XTMP1
	%%MOVDQU	[%%DST + %%SIZE - 2*16], %%XTMP2
	%%MOVDQU	[%%DST + %%SIZE - 1*16], %%XTMP3
  %endif
	%%DONE
%endif

%if (%%MAXSIZE >= 32)
%%lt32:
	test	%%SIZE, 16
	jz	%%lt16
  %if (%%USEAVX >= 2)
	%%MOVDQU	XWORD(%%XTMP0), [%%SRC + 0*16]
	%%MOVDQU	XWORD(%%XTMP1), [%%SRC + %%SIZE - 1*16]
	%%MOVDQU	[%%DST + 0*16], XWORD(%%XTMP0)
	%%MOVDQU	[%%DST + %%SIZE - 1*16], XWORD(%%XTMP1)
  %else
	%%MOVDQU	%%XTMP0, [%%SRC + 0*16]
	%%MOVDQU	%%XTMP1, [%%SRC + %%SIZE - 1*16]
	%%MOVDQU	[%%DST + 0*16], %%XTMP0
	%%MOVDQU	[%%DST + %%SIZE - 1*16], %%XTMP1
  %endif
	%%DONE
%endif

%if (%%MAXSIZE >= 16)
	test	%%SIZE, 16
	jz	%%lt16
	mov	%%TMP0, [%%SRC]
	mov	%%TMP1, [%%SRC + 8]
	mov	[%%DST], %%TMP0
	mov	[%%DST + 8], %%TMP1
%%lt16:
	test	%%SIZE, 8
	jz	%%lt8
	mov	%%TMP0, [%%SRC]
	mov	%%TMP1, [%%SRC + %%SIZE - 8]
	mov	[%%DST], %%TMP0
	mov	[%%DST + %%SIZE - 8], %%TMP1
	%%DONE
%endif

%if (%%MAXSIZE >= 8)
%%lt8:
	test	%%SIZE, 4
	jz	%%lt4
	mov	DWORD(%%TMP0), [%%SRC]
	mov	DWORD(%%TMP1), [%%SRC + %%SIZE - 4]
	mov	[%%DST], DWORD(%%TMP0)
	mov	[%%DST + %%SIZE - 4], DWORD(%%TMP1)
	%%DONE
%endif

%if (%%MAXSIZE >= 4)
%%lt4:
	test	%%SIZE, 2
	jz	%%lt2
	movzx	DWORD(%%TMP0), word [%%SRC]
	movzx	DWORD(%%TMP1), byte [%%SRC + %%SIZE - 1]
	mov	[%%DST], WORD(%%TMP0)
	mov	[%%DST + %%SIZE - 1], BYTE(%%TMP1)
	%%DONE
%endif

%%lt2:
%if (%%NOT0 == 0)
	test	%%SIZE, 1
	jz	%%end
%endif
	movzx	DWORD(%%TMP0), byte [%%SRC]
	mov	[%%DST], BYTE(%%TMP0)
%%end:
%if (%%USERET != 0)
	ret
%endif
%endm

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Utility macro to assist with SIMD shifting
%macro _PSRLDQ 3
%define %%VEC   %1
%define %%REG   %2
%define %%IMM   %3

%ifidn %%VEC, SSE
        psrldq  %%REG, %%IMM
%else
        vpsrldq %%REG, %%REG, %%IMM
%endif
%endm

        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; This section defines a series of macros to store small to medium amounts
; of data from SIMD registers to memory, where the size is variable but limited.
;
; The macros are all called as:
; simd_store DST, SRC, SIZE, TMP, IDX [, OFFSET]
; with the parameters defined as:
;    DST     : register: pointer to dst (not modified)
;    SRC     : register: src data (clobbered)
;    SIZE    : register: length in bytes (not modified)
;    TMP     : 64-bit temp GPR (clobbered)
;    IDX     : 64-bit GPR to store dst index/offset (clobbered)
;    OFFSET  : offset to be applied to destination pointer (optional)
;
; The name indicates the options. The name is of the form:
; simd_store_<VEC> or simd_store_<VEC>_15
; where <VEC> is the SIMD instruction type e.g. "sse" or "avx", and the
; "_15" suffix limits the maximum SIZE to 15 bytes instead of 16
%macro simd_store_sse 5-6
%if %0 == 6
        __simd_store %1,%2,%3,%4,%5,SSE,16,%6
%else
        __simd_store %1,%2,%3,%4,%5,SSE,16
%endif
%endm

%macro simd_store_avx 5-6
%if %0 == 6
        __simd_store %1,%2,%3,%4,%5,AVX,16,%6
%else
        __simd_store %1,%2,%3,%4,%5,AVX,16
%endif
%endm

%macro simd_store_sse_15 5-6
%if %0 == 6
        __simd_store %1,%2,%3,%4,%5,SSE,15,%6
%else
        __simd_store %1,%2,%3,%4,%5,SSE,15
%endif
%endm

%macro simd_store_avx_15 5-6
%if %0 == 6
        __simd_store %1,%2,%3,%4,%5,AVX,15,%6
%else
        __simd_store %1,%2,%3,%4,%5,AVX,15
%endif
%endm

%macro __simd_store 7-8
%define %%DST      %1    ; register: pointer to dst (not modified)
%define %%SRC      %2    ; register: src data (clobbered)
%define %%SIZE     %3    ; register: length in bytes (not modified)
%define %%TMP      %4    ; 64-bit temp GPR (clobbered)
%define %%IDX      %5    ; 64-bit temp GPR to store dst idx (clobbered)
%define %%SIMDTYPE %6    ; "SSE" or "AVX"
%define %%MAX_LEN  %7    ; maximum length to be stored
%define %%OFFSET   %8    ; offset to be applied to destination pointer

%define %%PSRLDQ _PSRLDQ %%SIMDTYPE,

%ifidn %%SIMDTYPE, SSE
 %define %%MOVDQU movdqu
 %define %%MOVQ movq
%else
 %define %%MOVDQU vmovdqu
 %define %%MOVQ vmovq
%endif

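;; Store strategy: if bit 4 of SIZE is set (only possible when MAX_LEN is 16),
;; the full 16 bytes are stored in one move.  Otherwise 8-, 4-, 2- and 1-byte
;; pieces are stored according to the low bits of SIZE, shifting the source
;; data right after each partial store and advancing IDX as the destination
;; offset; the 4/2/1-byte stores go through the TMP GPR.
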
;; determine max byte size for store operation
%assign max_length_to_store %%MAX_LEN

%if max_length_to_store > 16
%error "__simd_store macro invoked with MAX_LEN bigger than 16!"
%endif

%if %0 == 8
        mov     %%IDX, %%OFFSET
%else
        xor     %%IDX, %%IDX        ; zero idx
%endif

%if max_length_to_store == 16
        test    %%SIZE, 16
        jz      %%lt16
        %%MOVDQU [%%DST + %%IDX], %%SRC
        jmp     %%end
%%lt16:
%endif

%if max_length_to_store >= 8
        test    %%SIZE, 8
        jz      %%lt8
        %%MOVQ  [%%DST + %%IDX], %%SRC
        %%PSRLDQ %%SRC, 8
        add     %%IDX, 8
%%lt8:
%endif

        %%MOVQ %%TMP, %%SRC     ; use GPR from now on

%if max_length_to_store >= 4
        test    %%SIZE, 4
        jz      %%lt4
        mov     [%%DST + %%IDX], DWORD(%%TMP)
        shr     %%TMP, 32
        add     %%IDX, 4
%%lt4:
%endif

        test    %%SIZE, 2
        jz      %%lt2
        mov     [%%DST + %%IDX], WORD(%%TMP)
        shr     %%TMP, 16
        add     %%IDX, 2
%%lt2:
        test    %%SIZE, 1
        jz      %%end
        mov     [%%DST + %%IDX], BYTE(%%TMP)
%%end:
%endm

; This section defines a series of macros to load small to medium amounts
; (from 0 to 16 bytes) of data from memory to SIMD registers,
; where the size is variable but limited.
;
; The macros are all called as:
; simd_load DST, SRC, SIZE
; with the parameters defined as:
;    DST     : register: destination XMM register
;    SRC     : register: pointer to src data (not modified)
;    SIZE    : register: length in bytes (not modified)
;
; The name indicates the options. The name is of the form:
; simd_load_<VEC>_<SZ><ZERO>
; where:
; <VEC> is either "sse" or "avx"
; <SZ> is either "15" or "16" and defines the largest value of SIZE
; <ZERO> is blank or "_1". If "_1" then the min SIZE is 1 (otherwise 0)
;
; For example:
; simd_load_sse_16		: SSE, 0 <= size <= 16
; simd_load_avx_15_1	        : AVX, 1 <= size <= 15

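; As an illustrative sketch only (the register choices are assumptions, not
; part of this file): to load rdx bytes (1 <= rdx <= 15) from the buffer at
; rsi into xmm2, with the bytes beyond rdx cleared to zero:
;
;	simd_load_avx_15_1 xmm2, rsi, rdx
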
%macro simd_load_sse_15_1 3
        __simd_load %1,%2,%3,0,0,SSE
%endm
%macro simd_load_sse_15 3
        __simd_load %1,%2,%3,1,0,SSE
%endm
%macro simd_load_sse_16_1 3
        __simd_load %1,%2,%3,0,1,SSE
%endm
%macro simd_load_sse_16 3
        __simd_load %1,%2,%3,1,1,SSE
%endm

%macro simd_load_avx_15_1 3
        __simd_load %1,%2,%3,0,0,AVX
%endm
%macro simd_load_avx_15 3
        __simd_load %1,%2,%3,1,0,AVX
%endm
%macro simd_load_avx_16_1 3
        __simd_load %1,%2,%3,0,1,AVX
%endm
%macro simd_load_avx_16 3
        __simd_load %1,%2,%3,1,1,AVX
%endm

%macro __simd_load 6
%define %%DST       %1    ; [out] destination XMM register
%define %%SRC       %2    ; [in] pointer to src data
%define %%SIZE      %3    ; [in] length in bytes (0-16 bytes)
%define %%ACCEPT_0  %4    ; 0 = min length = 1, 1 = min length = 0
%define %%ACCEPT_16 %5    ; 0 = max length = 15, 1 = max length = 16
%define %%SIMDTYPE  %6    ; "SSE" or "AVX"

%ifidn %%SIMDTYPE, SSE
 %define %%MOVDQU movdqu
 %define %%PINSRB pinsrb
 %define %%PINSRQ pinsrq
 %define %%PXOR   pxor
%else
 %define %%MOVDQU vmovdqu
 %define %%PINSRB vpinsrb
 %define %%PINSRQ vpinsrq
 %define %%PXOR   vpxor
%endif

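;; A compare/branch ladder on SIZE selects an entry point into a chain of
;; single-byte inserts that falls through towards offset 0; sizes of 8 or
;; more load their low 8 bytes with one %%PINSRQ.  The destination register
;; is cleared first, so any bytes beyond SIZE are zero.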
%if (%%ACCEPT_16 != 0)
        test    %%SIZE, 16
        jz      %%_skip_16
        %%MOVDQU %%DST, [%%SRC]
        jmp     %%end_load

%%_skip_16:
%endif
        %%PXOR  %%DST, %%DST ; clear XMM register
%if (%%ACCEPT_0 != 0)
        or      %%SIZE, %%SIZE
        je      %%end_load
%endif
        cmp     %%SIZE, 2
        jb      %%_size_1
        je      %%_size_2
        cmp     %%SIZE, 4
        jb      %%_size_3
        je      %%_size_4
        cmp     %%SIZE, 6
        jb      %%_size_5
        je      %%_size_6
        cmp     %%SIZE, 8
        jb      %%_size_7
        je      %%_size_8
        cmp     %%SIZE, 10
        jb      %%_size_9
        je      %%_size_10
        cmp     %%SIZE, 12
        jb      %%_size_11
        je      %%_size_12
        cmp     %%SIZE, 14
        jb      %%_size_13
        je      %%_size_14

%%_size_15:
        %%PINSRB %%DST, [%%SRC + 14], 14
%%_size_14:
        %%PINSRB %%DST, [%%SRC + 13], 13
%%_size_13:
        %%PINSRB %%DST, [%%SRC + 12], 12
%%_size_12:
        %%PINSRB %%DST, [%%SRC + 11], 11
%%_size_11:
        %%PINSRB %%DST, [%%SRC + 10], 10
%%_size_10:
        %%PINSRB %%DST, [%%SRC + 9], 9
%%_size_9:
        %%PINSRB %%DST, [%%SRC + 8], 8
%%_size_8:
        %%PINSRQ %%DST, [%%SRC], 0
        jmp    %%end_load
%%_size_7:
        %%PINSRB %%DST, [%%SRC + 6], 6
%%_size_6:
        %%PINSRB %%DST, [%%SRC + 5], 5
%%_size_5:
        %%PINSRB %%DST, [%%SRC + 4], 4
%%_size_4:
        %%PINSRB %%DST, [%%SRC + 3], 3
%%_size_3:
        %%PINSRB %%DST, [%%SRC + 2], 2
%%_size_2:
        %%PINSRB %%DST, [%%SRC + 1], 1
%%_size_1:
        %%PINSRB %%DST, [%%SRC + 0], 0
%%end_load:
%endm

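; Variable-size load of 0 to 32 bytes from memory into a YMM register (the
; AVX2 counterpart of __simd_load above).  Called as:
; simd_load_avx2 DST, SRC, SIZE, IDX, TMP
; where DST is the destination YMM register, SRC points to the source data
; (not modified), SIZE is the length in bytes (not modified), and IDX and
; TMP are 64-bit temp GPRs (clobbered).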
%macro simd_load_avx2 5
%define %%DST       %1    ; [out] destination YMM register
%define %%SRC       %2    ; [in] pointer to src data
%define %%SIZE      %3    ; [in] length in bytes (0-32 bytes)
%define %%IDX       %4    ; [clobbered] Temp GP register to store src idx
%define %%TMP       %5    ; [clobbered] Temp GP register

        test    %%SIZE, 32
        jz      %%_skip_32
        vmovdqu %%DST, [%%SRC]
        jmp     %%end_load

%%_skip_32:
        vpxor   %%DST, %%DST ; clear YMM register
        or      %%SIZE, %%SIZE
        je      %%end_load

        lea     %%IDX, [%%SRC]
        mov     %%TMP, %%SIZE
        cmp     %%SIZE, 16
        jle     %%_check_size

        add     %%IDX, 16
        sub     %%TMP, 16

%%_check_size:
        cmp     %%TMP, 2
        jb      %%_size_1
        je      %%_size_2
        cmp     %%TMP, 4
        jb      %%_size_3
        je      %%_size_4
        cmp     %%TMP, 6
        jb      %%_size_5
        je      %%_size_6
        cmp     %%TMP, 8
        jb      %%_size_7
        je      %%_size_8
        cmp     %%TMP, 10
        jb      %%_size_9
        je      %%_size_10
        cmp     %%TMP, 12
        jb      %%_size_11
        je      %%_size_12
        cmp     %%TMP, 14
        jb      %%_size_13
        je      %%_size_14
        cmp     %%TMP, 15
        je      %%_size_15

%%_size_16:
        vmovdqu XWORD(%%DST), [%%IDX]
        jmp    %%end_load
%%_size_15:
        vpinsrb XWORD(%%DST), [%%IDX + 14], 14
%%_size_14:
        vpinsrb XWORD(%%DST), [%%IDX + 13], 13
%%_size_13:
        vpinsrb XWORD(%%DST), [%%IDX + 12], 12
%%_size_12:
        vpinsrb XWORD(%%DST), [%%IDX + 11], 11
%%_size_11:
        vpinsrb XWORD(%%DST), [%%IDX + 10], 10
%%_size_10:
        vpinsrb XWORD(%%DST), [%%IDX + 9], 9
%%_size_9:
        vpinsrb XWORD(%%DST), [%%IDX + 8], 8
%%_size_8:
        vpinsrq XWORD(%%DST), [%%IDX], 0
        jmp    %%_check_higher_16
%%_size_7:
        vpinsrb XWORD(%%DST), [%%IDX + 6], 6
%%_size_6:
        vpinsrb XWORD(%%DST), [%%IDX + 5], 5
%%_size_5:
        vpinsrb XWORD(%%DST), [%%IDX + 4], 4
%%_size_4:
        vpinsrb XWORD(%%DST), [%%IDX + 3], 3
%%_size_3:
        vpinsrb XWORD(%%DST), [%%IDX + 2], 2
%%_size_2:
        vpinsrb XWORD(%%DST), [%%IDX + 1], 1
%%_size_1:
        vpinsrb XWORD(%%DST), [%%IDX + 0], 0
%%_check_higher_16:
        test    %%SIZE, 16
        jz      %%end_load

        ; Move last bytes loaded to upper half and load 16 bytes in lower half
        vinserti128 %%DST, XWORD(%%DST), 1
        vinserti128 %%DST, [%%SRC], 0
%%end_load:
%endm

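; Variable-size store of 0 to 32 bytes from a YMM register to memory (the
; AVX2 counterpart of __simd_store above).  Called as:
; simd_store_avx2 DST, SRC, SIZE, TMP, IDX
; where DST points to the destination (not modified), SRC holds the data
; (clobbered), SIZE is the length in bytes (not modified), and TMP and IDX
; are 64-bit temp GPRs (clobbered).  For example (register choices are an
; assumption, for illustration only):
;
;	simd_store_avx2 rdi, ymm0, rdx, rax, r8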
%macro simd_store_avx2 5
%define %%DST      %1    ; register: pointer to dst (not modified)
%define %%SRC      %2    ; register: src data (clobbered)
%define %%SIZE     %3    ; register: length in bytes (not modified)
%define %%TMP      %4    ; 64-bit temp GPR (clobbered)
%define %%IDX      %5    ; 64-bit temp GPR to store dst idx (clobbered)

        xor %%IDX, %%IDX        ; zero idx

        test    %%SIZE, 32
        jz      %%lt32
        vmovdqu [%%DST], %%SRC
        jmp     %%end
%%lt32:

        test    %%SIZE, 16
        jz      %%lt16
        vmovdqu [%%DST], XWORD(%%SRC)
        ; Move upper half to lower half for further stores
        vperm2i128 %%SRC, %%SRC, %%SRC, 0x81
        add     %%IDX, 16
%%lt16:

        test    %%SIZE, 8
        jz      %%lt8
        vmovq  [%%DST + %%IDX], XWORD(%%SRC)
        vpsrldq XWORD(%%SRC), 8
        add     %%IDX, 8
%%lt8:

        vmovq %%TMP, XWORD(%%SRC)     ; use GPR from now on

        test    %%SIZE, 4
        jz      %%lt4
        mov     [%%DST + %%IDX], DWORD(%%TMP)
        shr     %%TMP, 32
        add     %%IDX, 4
%%lt4:

        test    %%SIZE, 2
        jz      %%lt2
        mov     [%%DST + %%IDX], WORD(%%TMP)
        shr     %%TMP, 16
        add     %%IDX, 2
%%lt2:
        test    %%SIZE, 1
        jz      %%end
        mov     [%%DST + %%IDX], BYTE(%%TMP)
%%end:
%endm

%endif ; ifndef __MEMCPY_INC__