xref: /isa-l/erasure_code/gf_2vect_dot_prod_avx512_gfni.asm (revision 2ca781df19598766427435a011f263680d9e8faa)
1;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2;  Copyright(c) 2023 Intel Corporation All rights reserved.
3;
4;  Redistribution and use in source and binary forms, with or without
5;  modification, are permitted provided that the following conditions
6;  are met:
7;    * Redistributions of source code must retain the above copyright
8;      notice, this list of conditions and the following disclaimer.
9;    * Redistributions in binary form must reproduce the above copyright
10;      notice, this list of conditions and the following disclaimer in
11;      the documentation and/or other materials provided with the
12;      distribution.
13;    * Neither the name of Intel Corporation nor the names of its
14;      contributors may be used to endorse or promote products derived
15;      from this software without specific prior written permission.
16;
17;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
29
30;;;
31;;; gf_2vect_dot_prod_avx512_gfni(len, vec, *g_tbls, **buffs, **dests);
32;;;
33
34%include "reg_sizes.asm"
35%include "gf_vect_gfni.inc"
36
37%if AS_FEATURE_LEVEL >= 10
38
%ifidn __OUTPUT_FORMAT__, elf64
;;; ELF64 build: arguments are taken from the System V AMD64 integer
;;; argument registers, in order.
 %define arg0  rdi
 %define arg1  rsi
 %define arg2  rdx
 %define arg3  rcx
 %define arg4  r8
 %define arg5  r9

;;; Scratch registers. r10/r11 are volatile; r12 is callee-saved, so the
;;; FUNC_SAVE / FUNC_RESTORE pair below preserves it around the routine.
 %define tmp   r11
 %define tmp2  r10
 %define tmp3  r12		; must be saved and restored

;;; Entry label followed by the endbranch marker.
 %define func(x) x: endbranch

;;; Prologue: preserve the one callee-saved register this file uses.
 %macro FUNC_SAVE 0
	push	tmp3
 %endmacro

;;; Epilogue: undo FUNC_SAVE.
 %macro FUNC_RESTORE 0
	pop	tmp3
 %endmacro
%endif
59
%ifidn __OUTPUT_FORMAT__, win64
;;; Win64 build: the first four arguments arrive in rcx, rdx, r8, r9.
 %define arg0   rcx
 %define arg1   rdx
 %define arg2   r8
 %define arg3   r9

;;; The fifth argument lives on the caller's stack and is loaded into r12 by
;;; FUNC_SAVE. r12, r13 and r14 are callee-saved, so all three are spilled
;;; to the local frame and reloaded on exit.
 %define arg4   r12 		; must be saved, loaded and restored
 %define arg5   r14 		; must be saved and restored
 %define tmp3   r13		; must be saved and restored

;;; Volatile scratch registers, no preservation needed.
 %define tmp    r11
 %define tmp2   r10

;;; Local frame: one 8-byte slot per preserved register. An odd multiple of 8
;;; brings rsp back to 16-byte alignment after the return address push.
 %define stack_size  3*8		; must be an odd multiple of 8

;;; Address of the x-th incoming argument slot, as seen after alloc_stack:
;;; skip the local frame, then the return address, then x slots.
 %define arg(x)      [rsp + stack_size + 8 + 8*x]

 %define func(x) proc_frame x

;;; Prologue: allocate the frame, spill r12-r14, then fetch argument 4.
;;; NOTE(review): the spills are plain mov instructions placed before
;;; end_prolog; proc_frame/alloc_stack/end_prolog are not defined in this
;;; file, so confirm whether the unwind data needs these saves described.
 %macro FUNC_SAVE 0
	alloc_stack	stack_size
	mov	[rsp + 2*8], r14
	mov	[rsp + 1*8], r13
	mov	[rsp + 0*8], r12
	end_prolog
	mov	arg4, arg(4)
 %endmacro

;;; Epilogue: reload r12-r14 from their slots and release the frame.
 %macro FUNC_RESTORE 0
	mov	r14,  [rsp + 2*8]
	mov	r13,  [rsp + 1*8]
	mov	r12,  [rsp + 0*8]
	add	rsp, stack_size
 %endmacro
%endif
91
92
;;; Incoming arguments, in call order
%define len		arg0	; bytes left to encode in each buffer
%define vec		arg1	; source count; the routine scales it by 8 on entry
%define mul_array	arg2	; table of 8-byte coefficient entries
%define src		arg3	; array of source buffer pointers
%define dest1		arg4	; output pointer array on entry, then first output buffer

;;; Working registers
%define dest2		tmp3	; second output buffer
%define ptr		arg5	; source buffer currently being read
%define vec_i		tmp2	; byte offset into the src pointer array
%define pos		rax	; byte offset into every source and output buffer
102
103
;;; XLDR / XSTR select the 64-byte load and store instructions used for full
;;; vectors, depending on what the build promises about buffer alignment.
%ifdef EC_ALIGNED_ADDR
 %ifndef NO_NT_LDST
;;; Aligned buffers: use non-temporal load/store
  %define XLDR vmovntdqa
  %define XSTR vmovntdq
 %else
;;; Aligned buffers, non-temporal access disabled: use aligned load/store
  %define XLDR vmovdqa64
  %define XSTR vmovdqa64
 %endif
%else
;;; No alignment guarantee: use unaligned load/store
 %define XLDR vmovdqu8
 %define XSTR vmovdqu8
%endif
118
;;; Vector register roles
%define x0        zmm0		; current source vector
%define xp1       zmm1		; accumulator stored to dest1
%define xp2       zmm2		; accumulator stored to dest2

%define xgft1  zmm3		; broadcast table entry paired with xp1
%define xgft2  zmm4		; broadcast table entry paired with xp2

;;; RIP-relative addressing by default, 64-bit code, emitted into .text
default rel
[bits 64]

section .text
130
;;
;; ENCODE_64B_2 [kmask]
;;
;; Makes one pass over every pointer in the src array. For each source it
;; loads the vector at byte offset pos, broadcasts that source's two 8-byte
;; table entries and hands them to GF_MUL_XOR (not defined in this file),
;; accumulating into xp1 and xp2. The two accumulators are then written to
;; dest1 + pos and dest2 + pos.
;;
;; With no argument a full 64-byte vector is loaded and stored through
;; XLDR/XSTR. With an opmask register argument, only the bytes selected by
;; that mask are loaded and stored.
;;
;; Expects vec to already hold 8 * (number of sources).
;; Overwrites tmp, vec_i, ptr, x0, xgft1, xgft2, xp1, xp2 and the flags.
;;
%macro ENCODE_64B_2 0-1
%define %%mask %1

	xor	vec_i, vec_i		; start at the first source pointer
	mov	tmp, mul_array		; tmp walks the table, one entry per source
	vpxorq	xp1, xp1, xp1		; clear both accumulators
	vpxorq	xp2, xp2, xp2

%%each_source:
	mov	ptr, [src + vec_i]
%if %0 == 0
	XLDR	x0, [ptr + pos]			; full 64-byte source vector
%else
	vmovdqu8 x0{%%mask}, [ptr + pos]	; partial source vector, tail bytes only
%endif

	vbroadcastf32x2 xgft1, [tmp]		; entry for the first output
	vbroadcastf32x2 xgft2, [tmp + vec]	; entry for the second output, vec bytes further on
	add	tmp, 8
	add	vec_i, 8

	GF_MUL_XOR EVEX, x0, xgft1, xgft1, xp1, xgft2, xgft2, xp2

	cmp	vec_i, vec
	jl	%%each_source

%if %0 == 0
	XSTR	[dest1 + pos], xp1
	XSTR	[dest2 + pos], xp2
%else
	vmovdqu8 [dest1 + pos]{%%mask}, xp1
	vmovdqu8 [dest2 + pos]{%%mask}, xp2
%endif
%endmacro
168
;;
;; gf_2vect_dot_prod_avx512_gfni(len, vec, *g_tbls, **buffs, **dests)
;;
;; Walks the buffers in 64-byte steps and lets ENCODE_64B_2 produce two output
;; vectors per step, one for dests[0] and one for dests[1]. A final partial
;; step of 1..63 bytes is handled with a byte mask in k1.
;;
;;   len    (arg0)  bytes to process in each buffer; zero or negative is a no-op
;;   vec    (arg1)  number of source buffers (multiplied by 8 on entry)
;;   g_tbls (arg2)  table of 8-byte entries, read through mul_array
;;   buffs  (arg3)  array of source buffer pointers
;;   dests  (arg4)  array holding the two output buffer pointers
;;
;; rax is the running byte offset (pos); no status value is loaded into it.
;; Overwrites the registers behind pos, tmp, tmp2 and ptr, plus zmm0-zmm4, k1
;; and the flags, and whatever GF_MUL_XOR uses internally.
;;
;; NOTE(review): len and vec are compared as full 64-bit registers. Confirm
;; that callers pass them extended to 64 bits.
;;
align 16
mk_global gf_2vect_dot_prod_avx512_gfni, function
func(gf_2vect_dot_prod_avx512_gfni)
	FUNC_SAVE

	xor	pos, pos
	shl	vec, 3			;vec *= 8. Make vec_i count by 8
	mov	dest2, [dest1 + 8]	;dests[1]; read before dest1 is overwritten
	mov	dest1, [dest1]		;dests[0]

	;; Signed compare, matching the jge/jle checks below. With an unsigned
	;; test a negative len would fall into the loop and encode one full
	;; 64-byte chunk before the signed checks stopped it.
	cmp	len, 64
	jl	.len_lt_64

.loop64:
	;; Invariant: len is at least 64 here.
	ENCODE_64B_2

	add	pos, 64			;Loop on 64 bytes at a time
	sub	len, 64
	cmp	len, 64
	jge	.loop64

.len_lt_64:
	;; Fewer than 64 bytes remain; leave if there is nothing to do.
	test	len, len
	jle	.exit

	;; k1 = (1 << len) - 1: one mask bit per remaining byte (len is 1..63).
	xor	tmp, tmp
	bts	tmp, len
	dec	tmp
	kmovq	k1, tmp

	ENCODE_64B_2 k1

.exit:
	vzeroupper

	FUNC_RESTORE
	ret

endproc_frame
209%endif  ; if AS_FEATURE_LEVEL >= 10
210