xref: /isa-l/erasure_code/gf_6vect_dot_prod_avx512_gfni.asm (revision d65d2b5572a86946668df6c7c91581699d49ea4a)
1;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2;  Copyright(c) 2023 Intel Corporation All rights reserved.
3;
4;  Redistribution and use in source and binary forms, with or without
5;  modification, are permitted provided that the following conditions
6;  are met:
7;    * Redistributions of source code must retain the above copyright
8;      notice, this list of conditions and the following disclaimer.
9;    * Redistributions in binary form must reproduce the above copyright
10;      notice, this list of conditions and the following disclaimer in
11;      the documentation and/or other materials provided with the
12;      distribution.
13;    * Neither the name of Intel Corporation nor the names of its
14;      contributors may be used to endorse or promote products derived
15;      from this software without specific prior written permission.
16;
17;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
29
30;;;
31;;; gf_6vect_dot_prod_avx512_gfni(len, vec, *g_tbls, **buffs, **dests);
32;;;
33
34%include "reg_sizes.asm"
35%include "gf_vect_gfni.inc"
36
37%if AS_FEATURE_LEVEL >= 10
38
%ifidn __OUTPUT_FORMAT__, elf64
;;; ELF64 build: integer/pointer arguments 0-5 arrive in
;;; rdi, rsi, rdx, rcx, r8, r9 (System V AMD64 order).
 %define arg0  rdi
 %define arg1  rsi
 %define arg2  rdx
 %define arg3  rcx
 %define arg4  r8
 %define arg5  r9

;;; Scratch registers that need no preservation.
 %define tmp2  r10
 %define tmp   r11

;;; Scratch registers that are callee-saved: every one of them is pushed by
;;; FUNC_SAVE and popped by FUNC_RESTORE below.
 %define tmp4  r12
 %define tmp3  r13
 %define tmp5  r14
 %define tmp6  r15
 %define tmp7  rbp
 %define tmp8  rbx

;;; Entry point: plain label followed by `endbranch`
;;; (`endbranch` is not defined in this file).
 %define func(x) x: endbranch

;;; Prologue: 6 pushes (48 bytes). Pop order in FUNC_RESTORE is the exact
;;; reverse of the push order here.
 %macro FUNC_SAVE 0
	push	r12
	push	r13
	push	r14
	push	r15
	push	rbp
	push	rbx
 %endmacro

;;; Epilogue: undo FUNC_SAVE.
 %macro FUNC_RESTORE 0
	pop	rbx
	pop	rbp
	pop	r15
	pop	r14
	pop	r13
	pop	r12
 %endmacro
%endif
74
%ifidn __OUTPUT_FORMAT__, win64
;;; Win64 build: only the first four integer/pointer arguments arrive in
;;; registers (rcx, rdx, r8, r9).
 %define arg0   rcx
 %define arg1   rdx
 %define arg2   r8
 %define arg3   r9

;;; The fifth argument lives on the stack; FUNC_SAVE loads it into r12.
;;; arg5 is not an incoming argument here, just a callee-saved work register.
 %define arg4   r12		; saved, then loaded from arg(4), restored on exit
 %define arg5   r15		; saved and restored

;;; Scratch registers that need no preservation.
 %define tmp2   r10
 %define tmp    r11

;;; Scratch registers that are callee-saved on Win64 (note rdi/rsi are too):
;;; all of them are spilled by FUNC_SAVE and reloaded by FUNC_RESTORE.
 %define tmp3   r13
 %define tmp4   r14
 %define tmp5   rdi
 %define tmp6   rsi
 %define tmp7   rbp
 %define tmp8   rbx

;;; Frame layout (offsets from rsp after allocation):
;;;   0*16 .. 6*16        xmm6 - xmm12  (7 slots of 16 bytes)
;;;   7*16 + 0*8 .. 7*8   r12, r13, r14, r15, rdi, rsi, rbp, rbx
;;;   7*16 + 8*8          unused; keeps the size an odd multiple of 8
;;; An odd multiple of 8 is required so that rsp becomes 16-byte aligned
;;; (rsp is 8 mod 16 on entry), which the vmovdqa spills rely on.
 %define stack_size  7*16 + 9*8
;;; x-th argument as seen after the frame is allocated: skip the frame, the
;;; return address (+8), then index the 8-byte argument slots. Slots 0-3 are
;;; the home space of the register arguments, so arg(4) is the first real
;;; stack argument.
 %define arg(x)      [rsp + stack_size + 8 + 8*x]

;;; Entry point goes through proc_frame (not defined in this file).
 %define func(x) proc_frame x

;;; Prologue: allocate the frame, spill xmm6-xmm12 and the eight callee-saved
;;; general purpose registers, close the prologue, then fetch the 5th argument.
;;; alloc_stack/end_prolog are not defined in this file; FUNC_RESTORE's
;;; `add rsp, stack_size` assumes alloc_stack lowers rsp by stack_size.
 %macro FUNC_SAVE 0
	alloc_stack	stack_size
	vmovdqa	[rsp + 0*16], xmm6
	vmovdqa	[rsp + 1*16], xmm7
	vmovdqa	[rsp + 2*16], xmm8
	vmovdqa	[rsp + 3*16], xmm9
	vmovdqa	[rsp + 4*16], xmm10
	vmovdqa	[rsp + 5*16], xmm11
	vmovdqa	[rsp + 6*16], xmm12
	mov	[rsp + 7*16 + 0*8], r12
	mov	[rsp + 7*16 + 1*8], r13
	mov	[rsp + 7*16 + 2*8], r14
	mov	[rsp + 7*16 + 3*8], r15
	mov	[rsp + 7*16 + 4*8], rdi
	mov	[rsp + 7*16 + 5*8], rsi
	mov	[rsp + 7*16 + 6*8], rbp
	mov	[rsp + 7*16 + 7*8], rbx
	end_prolog
	mov	arg4, arg(4)		; 5th argument comes from the caller's stack
 %endmacro

;;; Epilogue: reload everything FUNC_SAVE spilled and release the frame.
 %macro FUNC_RESTORE 0
	vmovdqa	xmm6, [rsp + 0*16]
	vmovdqa	xmm7, [rsp + 1*16]
	vmovdqa	xmm8, [rsp + 2*16]
	vmovdqa	xmm9, [rsp + 3*16]
	vmovdqa	xmm10, [rsp + 4*16]
	vmovdqa	xmm11, [rsp + 5*16]
	vmovdqa	xmm12, [rsp + 6*16]
	mov	r12,  [rsp + 7*16 + 0*8]
	mov	r13,  [rsp + 7*16 + 1*8]
	mov	r14,  [rsp + 7*16 + 2*8]
	mov	r15,  [rsp + 7*16 + 3*8]
	mov	rdi,  [rsp + 7*16 + 4*8]
	mov	rsi,  [rsp + 7*16 + 5*8]
	mov	rbp,  [rsp + 7*16 + 6*8]
	mov	rbx,  [rsp + 7*16 + 7*8]
	add	rsp, stack_size
 %endmacro
%endif
135
136
;;; Symbolic names for the general purpose registers used below.

;;; Incoming arguments (len, vec, *g_tbls, **buffs, **dests)
%define len       arg0		; bytes still to process; reduced by 64 per full block
%define vec       arg1		; number of sources; scaled by 8 at function entry
%define mul_array arg2		; base of the coefficient table (g_tbls)
%define src       arg3		; array of source pointers (buffs)
%define dest1     arg4		; array of destination pointers (dests), indexed [dest1 + n*8]

;;; Destination pointers 2-5, loaded once at function entry.
;;; Destinations 1 and 6 are reloaded from the dest1 array for every block.
%define dest2     tmp3
%define dest3     tmp4
%define dest4     tmp5
%define dest5     tmp7

;;; Byte offsets of the 4th and 6th coefficient rows from the current table
;;; position (3*8*sources and 5*8*sources); the other rows are addressed with
;;; vec, vec*2 and vec*4.
%define vskip3    tmp6
%define vskip5    tmp8

;;; Working registers
%define ptr       arg5		; current source pointer; reused as destination 1 pointer
%define vec_i     tmp2		; byte offset into the source pointer array
%define pos       rax		; byte offset into every source/destination buffer
151
152
;;; XLDR / XSTR: instructions used for the full 64-byte loads and stores.
;;;   EC_ALIGNED_ADDR undefined        -> unaligned moves
;;;   EC_ALIGNED_ADDR, NO_NT_LDST      -> aligned moves
;;;   EC_ALIGNED_ADDR, no NO_NT_LDST   -> non-temporal load/store
%ifdef EC_ALIGNED_ADDR
 %ifndef NO_NT_LDST
;;; Use non-temporal load/store
  %define XLDR vmovntdqa
  %define XSTR vmovntdq
 %else
;;; Use aligned load/store
  %define XLDR vmovdqa64
  %define XSTR vmovdqa64
 %endif
%else
;;; Use un-aligned load/store
 %define XLDR vmovdqu8
 %define XSTR vmovdqu8
%endif
167
;;; Vector register roles.

;;; Source data: 64 bytes (or a masked partial block) of the current source.
%define x0     zmm0

;;; Accumulators, one per destination; xpN ends up stored to destination N.
%define xp1    zmm1
%define xp2    zmm2
%define xp3    zmm3
%define xp4    zmm4
%define xp5    zmm5
%define xp6    zmm6

;;; Coefficients, one per destination: the 8-byte table entry for the current
;;; source, broadcast across the whole register.
%define xgft1  zmm7
%define xgft2  zmm8
%define xgft3  zmm9
%define xgft4  zmm10
%define xgft5  zmm11
%define xgft6  zmm12
182
183default rel
184[bits 64]
185
186section .text
187
;;
;; ENCODE_64B_6 [kmask]
;;
;; Produces one block of all six destinations at byte offset `pos`:
;; the six accumulators are cleared, every source contributes once through
;; GF_MUL_XOR, and the accumulators are written to the six destinations.
;;
;;  - Without an argument a full 64-byte block is loaded with XLDR and
;;    stored with XSTR.
;;  - With one argument (an opmask register) loads and stores are done with
;;    masked vmovdqu8, so only the bytes selected by the mask are written.
;;
;; Expects: pos, src, mul_array, dest1 (pointer array), dest2..dest5,
;;          vec (= 8 * number of sources), vskip3, vskip5.
;; Clobbers: tmp, ptr, vec_i, flags, x0, xp1..xp6, xgft1..xgft6.
;; The source loop tests its counter at the bottom, so the first source is
;; always read.
;; GF_MUL_XOR is not defined in this file; judging by its operands it
;; multiplies x0 by each xgftN (also used as the temporary) and XORs the
;; product into xpN -- confirm against its definition.
;;
%macro ENCODE_64B_6 0-1
%define %%MASK %1

	mov	tmp, mul_array		;tmp walks the coefficient table, 8 bytes per source
	xor	vec_i, vec_i		;start with the first source pointer
	vpxorq	xp1, xp1, xp1		;clear all six accumulators
	vpxorq	xp2, xp2, xp2
	vpxorq	xp3, xp3, xp3
	vpxorq	xp4, xp4, xp4
	vpxorq	xp5, xp5, xp5
	vpxorq	xp6, xp6, xp6

%%next_src:
	mov	ptr, [src + vec_i]	;pointer to the current source buffer
	add	vec_i, 8
%if %0 == 1
	vmovdqu8 x0{%%MASK}, [ptr + pos]	;partial block, fewer than 64 bytes
%else
	XLDR	x0, [ptr + pos]		;full 64-byte block
%endif

	;; One 8-byte coefficient per destination; consecutive destination rows
	;; are `vec` bytes apart in the table.
	vbroadcastf32x2 xgft1, [tmp]
	vbroadcastf32x2 xgft2, [tmp + vec]
	vbroadcastf32x2 xgft3, [tmp + vec*2]
	vbroadcastf32x2 xgft4, [tmp + vskip3]
	vbroadcastf32x2 xgft5, [tmp + vec*4]
	vbroadcastf32x2 xgft6, [tmp + vskip5]
	add	tmp, 8			;coefficients of the next source

	GF_MUL_XOR EVEX, x0, xgft1, xgft1, xp1, xgft2, xgft2, xp2, xgft3, xgft3, xp3, \
			xgft4, xgft4, xp4, xgft5, xgft5, xp5, xgft6, xgft6, xp6

	cmp	vec_i, vec
	jl	%%next_src

	;; Destinations 1 and 6 have no dedicated register: fetch their pointers
	;; now that ptr and tmp are free.
	mov	ptr, [dest1]		;destination 1
	mov	tmp, [dest1 + 5*8]	;destination 6

%if %0 == 1
	vmovdqu8 [dest2 + pos]{%%MASK}, xp2
	vmovdqu8 [dest3 + pos]{%%MASK}, xp3
	vmovdqu8 [dest4 + pos]{%%MASK}, xp4
	vmovdqu8 [dest5 + pos]{%%MASK}, xp5
	vmovdqu8 [ptr + pos]{%%MASK}, xp1	;destination 1
	vmovdqu8 [tmp + pos]{%%MASK}, xp6	;destination 6
%else
	XSTR	[dest2 + pos], xp2
	XSTR	[dest3 + pos], xp3
	XSTR	[dest4 + pos], xp4
	XSTR	[dest5 + pos], xp5
	XSTR	[ptr + pos], xp1	;destination 1
	XSTR	[tmp + pos], xp6	;destination 6
%endif
%endmacro
245
;;
;; gf_6vect_dot_prod_avx512_gfni(len, vec, *g_tbls, **buffs, **dests)
;;
;; Computes six destination buffers from `vec` source buffers, 64 bytes per
;; pass, with a masked final pass for a remainder of 1-63 bytes.
;;
;;   len     number of bytes to produce in each destination; compared as a
;;           signed 64-bit value, so len <= 0 writes nothing
;;   vec     number of source buffers (the source loop always runs at least
;;           once, so callers must pass vec >= 1)
;;   g_tbls  coefficient table: 8 bytes per (destination, source) pair,
;;           destination rows 8*vec bytes apart
;;   buffs   array of `vec` source pointers
;;   dests   array of 6 destination pointers
;;
;; No return value is set up; callee-saved registers are preserved through
;; FUNC_SAVE / FUNC_RESTORE.
;;
align 16
mk_global gf_6vect_dot_prod_avx512_gfni, function
func(gf_6vect_dot_prod_avx512_gfni)
	FUNC_SAVE

	xor	pos, pos
	shl	vec, 3			;vec *= 8. Make vec_i count by 8
	lea	vskip3, [vec + vec*2]	;3 * 8 * sources: offset of coefficient row 4
	lea	vskip5, [vec + vec*4]	;5 * 8 * sources: offset of coefficient row 6

	;; Destinations 2-5 stay in registers for the whole call; destinations
	;; 1 and 6 are fetched inside ENCODE_64B_6.
	mov	dest2, [dest1 + 1*8]
	mov	dest3, [dest1 + 2*8]
	mov	dest4, [dest1 + 3*8]
	mov	dest5, [dest1 + 4*8]

	cmp	len, 64
	jl	.tail

.full_block:
	ENCODE_64B_6

	add	pos, 64			;advance 64 bytes in every buffer
	sub	len, 64
	cmp	len, 64
	jge	.full_block

.tail:
	test	len, len		;anything left (1..63 bytes)?
	jle	.done

	;; k1 = (1 << len) - 1: one mask bit per remaining byte
	xor	tmp, tmp
	bts	tmp, len
	dec	tmp
	kmovq	k1, tmp

	ENCODE_64B_6 k1

.done:
	vzeroupper

	FUNC_RESTORE
	ret

endproc_frame
292%endif  ; if AS_FEATURE_LEVEL >= 10
293