xref: /isa-l/erasure_code/gf_vect_mad_avx512.asm (revision d3cfb2fb772e375cf2007e484e0a6ec0c6a7c993)
1;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
3;
4;  Redistribution and use in source and binary forms, with or without
5;  modification, are permitted provided that the following conditions
6;  are met:
7;    * Redistributions of source code must retain the above copyright
8;      notice, this list of conditions and the following disclaimer.
9;    * Redistributions in binary form must reproduce the above copyright
10;      notice, this list of conditions and the following disclaimer in
11;      the documentation and/or other materials provided with the
12;      distribution.
13;    * Neither the name of Intel Corporation nor the names of its
14;      contributors may be used to endorse or promote products derived
15;      from this software without specific prior written permission.
16;
17;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
29
30;;;
31;;; gf_vect_mad_avx512(len, vec, vec_i, mul_array, src, dest);
32;;;
33
34%include "reg_sizes.asm"
35
36%ifdef HAVE_AS_KNOWS_AVX512
37
38%ifidn __OUTPUT_FORMAT__, elf64
;;; ELF64 build: bind arg0..arg5 to the six integer argument registers
;;; of the System V AMD64 calling convention (rdi, rsi, rdx, rcx, r8, r9).
39 %define arg0  rdi
40 %define arg1  rsi
41 %define arg2  rdx
42 %define arg3  rcx
43 %define arg4  r8
44 %define arg5  r9
;;; Scratch register and return-value register.
45 %define tmp   r11
46 %define return rax
;;; func(x) emits the entry label followed by `endbranch`.
;;; NOTE(review): `endbranch` is not defined in this file; presumably it is
;;; supplied by reg_sizes.asm -- confirm.
47 %define func(x) x: endbranch
;;; Prologue/epilogue expand to nothing here: all six arguments are already
;;; in registers, and the routine only touches rax, r11, k1 and zmm0-zmm8,
;;; none of which are callee-saved under this ABI.
48 %define FUNC_SAVE
49 %define FUNC_RESTORE
50%endif
51
52%ifidn __OUTPUT_FORMAT__, win64
;;; Win64 build: the first four arguments arrive in rcx, rdx, r8, r9.
;;; Arguments 4 and 5 arrive on the stack and are loaded by FUNC_SAVE into
;;; r12 and r15, which are callee-saved and therefore spilled first.
53 %define arg0   rcx
54 %define arg1   rdx
55 %define arg2   r8
56 %define arg3   r9
57 %define arg4   r12 		; must be saved and loaded
58 %define arg5   r15		; must be saved and loaded (same as arg4)
59 %define tmp    r11
60 %define return rax
;;; PS = size in bytes of one stack slot / pointer.
61 %define PS 8
;;; Local frame: 3 x 16 bytes for xmm6-xmm8, then 3 x 8 bytes of which two
;;; slots hold r12 and r15 (the third slot is not written by FUNC_SAVE).
;;; NOTE(review): assuming rsp % 16 == 8 at entry, subtracting 72 leaves rsp
;;; 16-byte aligned, which the vmovdqa spills below require -- confirm.
62 %define stack_size 16*3 + 3*8
;;; arg(x): stack slot of argument x as seen AFTER `sub rsp, stack_size`.
;;; [rsp + stack_size] is the return address; argument slots start PS above
;;; it (slots 0-3 are the caller-allocated shadow space).
63 %define arg(x)      [rsp + stack_size + PS + PS*x]
;;; NOTE(review): proc_frame / save_reg / end_prolog / endproc_frame are not
;;; defined in this file; presumably assembler built-ins or reg_sizes.asm
;;; macros -- confirm.
64 %define func(x) proc_frame x
65
;;; FUNC_SAVE: allocate the frame, spill the callee-saved registers this
;;; routine uses (xmm6-xmm8, r12, r15), then load arg4/arg5 from the
;;; caller's stack.
;;; NOTE(review): the stack allocation and xmm spills are plain instructions
;;; rather than prologue pseudo-ops; whether unwind data describes them
;;; depends on macros outside this file -- confirm.
66 %macro FUNC_SAVE 0
67	sub	rsp, stack_size
68	vmovdqa	[rsp+16*0],xmm6
69	vmovdqa	[rsp+16*1],xmm7
70	vmovdqa	[rsp+16*2],xmm8
71	save_reg	r12,  3*16 + 0*8
72	save_reg	r15,  3*16 + 1*8
73	end_prolog
74	mov	arg4, arg(4)
75	mov	arg5, arg(5)
76 %endmacro
77
;;; FUNC_RESTORE: reload everything FUNC_SAVE spilled (same offsets) and
;;; release the frame. Must be expanded immediately before each `ret`.
78 %macro FUNC_RESTORE 0
79	vmovdqa	xmm6, [rsp+16*0]
80	vmovdqa	xmm7, [rsp+16*1]
81	vmovdqa	xmm8, [rsp+16*2]
82	mov	r12,  [rsp + 3*16 + 0*8]
83	mov	r15,  [rsp + 3*16 + 1*8]
84	add	rsp, stack_size
85 %endmacro
86%endif
87
88;;; gf_vect_mad_avx512(len, vec, vec_i, mul_array, src, dest)
;;; Symbolic names for the six arguments, in call order (arg0..arg5 are
;;; bound to physical registers by the per-ABI section above).
;;;   len       - number of bytes to process
;;;   vec       - bound here but never referenced by the code in this file
;;;   vec_i     - index of the 32-byte table to use inside mul_array
;;;   mul_array - base address of the lookup tables
;;;   src       - source buffer
;;;   dest      - destination buffer, updated in place
89%define len   arg0
90%define vec   arg1
91%define vec_i    arg2
92%define mul_array arg3
93%define	src   arg4
94%define dest  arg5
;;; pos is the running byte offset. It shares rax with the return value and
;;; is overwritten by the status code just before the function returns.
95%define pos   return
96
97%ifndef EC_ALIGNED_ADDR
98;;; Use Un-aligned load/store
99 %define XLDR vmovdqu8
100 %define XSTR vmovdqu8
101%else
102;;; Use aligned load/store: plain aligned moves when NO_NT_LDST is
;;; defined, non-temporal load/store otherwise.
;;; XLDR/XSTR are only ever used with zmm operands in this file. VMOVDQA has
;;; no 512-bit encoding; the EVEX aligned move is VMOVDQA32/VMOVDQA64, so the
;;; NO_NT_LDST variant must use vmovdqa64 to assemble.
;;; NOTE(review): the tail pass of gf_vect_mad_avx512 accesses src/dest at
;;; offset (len - 64), which is 64-byte aligned only when len is a multiple
;;; of 64; callers defining EC_ALIGNED_ADDR presumably guarantee that --
;;; confirm.
103 %ifdef NO_NT_LDST
104  %define XLDR vmovdqa64
105  %define XSTR vmovdqa64
106 %else
107  %define XLDR vmovntdqa
108  %define XSTR vmovntdq
109 %endif
110%endif
111
112
113default rel
114
115[bits 64]
116section .text
117
;;; Vector register roles inside gf_vect_mad_avx512:
;;;   x0       - 64 source bytes, then reduced to their high nibbles
;;;   xtmpa    - low nibbles of the source bytes
;;;   xtmph    - table lookup result for the high nibbles, then the sum
;;;   xtmpl    - table lookup result for the low nibbles
;;;   xd       - 64 destination bytes (loaded, updated, stored back)
;;;   xtmpd    - defined but not referenced by the code in this file
;;;   xgft_hi  - 16-byte high-nibble table replicated to all four lanes
;;;   xgft_lo  - 16-byte low-nibble table replicated to all four lanes
;;;   xgft_loy - ymm view of xgft_lo, used for the initial 32-byte table load
;;;   xmask0f  - 0x0f in every byte
;;; zmm6-zmm8 overlap xmm6-xmm8, which the win64 FUNC_SAVE/FUNC_RESTORE
;;; macros in this file spill and reload.
118%define x0       zmm0
119%define xtmpa    zmm1
120%define xtmph    zmm2
121%define xtmpl    zmm3
122%define xd       zmm4
123%define xtmpd    zmm5
124%define xgft_hi  zmm6
125%define xgft_lo  zmm7
126%define xgft_loy ymm7
127%define xmask0f  zmm8
128
129align 16
;;;---------------------------------------------------------------------
;;; gf_vect_mad_avx512(len, vec, vec_i, mul_array, src, dest)
;;;
;;; Multiply-and-add over len bytes, 64 bytes per iteration: each source
;;; byte is split into nibbles, each nibble is looked up in a 16-entry
;;; table, the two partial products are XORed together and the result is
;;; XORed into the corresponding dest byte.
;;;
;;; In:   len       byte count; must be >= 64
;;;       vec       not read by this routine
;;;       vec_i     table index; the 32-byte table is read from
;;;                 mul_array + vec_i*32 (low-nibble table in bytes 0-15,
;;;                 high-nibble table in bytes 16-31)
;;;       src       source buffer
;;;       dest      destination buffer, updated in place
;;; Out:  rax = 0 on success, 1 if len < 64 (nothing is written)
;;; Clobbers: rax, r11, k1, zmm0-zmm4, zmm6-zmm8, flags; the registers
;;;       holding len and vec_i are modified.
;;;
;;; A length that is not a multiple of 64 is finished with one extra pass
;;; over the LAST 64 bytes of the buffers; mask k1 restricts the update to
;;; the bytes that the main loop has not processed yet.
;;; NOTE(review): that extra pass uses the address (buffer + len - 64),
;;; which is not 64-byte aligned in general; if XLDR/XSTR are built as
;;; aligned or non-temporal moves the caller presumably has to guarantee
;;; len % 64 == 0 -- confirm.
;;; NOTE(review): no vzeroupper is issued before ret.
;;;---------------------------------------------------------------------
130mk_global gf_vect_mad_avx512, function
131func(gf_vect_mad_avx512)
132	FUNC_SAVE
133	sub	len, 64			;From here on len holds (length - 64)
134	jl	.return_fail		;Fewer than 64 bytes: not supported
135	xor	pos, pos
136	mov	tmp, 0x0f
137	vpbroadcastb xmask0f, tmp	;Construct mask 0x0f0f0f...
138	sal	vec_i, 5		;Multiply by 32
139	vmovdqu8 xgft_loy, [vec_i+mul_array]	;Load array Cx{00}..{0f}, Cx{00}..{f0}
	;; Replicate 128-bit lane 1 (high-nibble table) into xgft_hi first:
	;; the next instruction overwrites xgft_lo, which is the source here.
140	vshufi64x2 xgft_hi, xgft_lo, xgft_lo, 0x55
	;; Replicate 128-bit lane 0 (low-nibble table) across xgft_lo.
141	vshufi64x2 xgft_lo, xgft_lo, xgft_lo, 0x00
142	mov	tmp, -1
143	kmovq	k1, tmp			;All 64 byte lanes enabled for full blocks
144
	;; Loop invariant: pos is the offset of the 64-byte block to process.
145.loop64:
146	XLDR	xd, [dest+pos]		;Get next dest vector
147	XLDR	x0, [src+pos]		;Get next source vector
148
149	vpandq	xtmpa, x0, xmask0f	;Mask low src nibble in bits 3-0
150	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 3-0
151	vpandq	x0, x0, xmask0f		;Mask high src nibble in bits 3-0
152
	;; Zero-masking with k1: disabled byte lanes produce 0, so the XORs
	;; below leave the corresponding dest bytes unchanged.
153	vpshufb	xtmph {k1}{z}, xgft_hi, x0	;Lookup mul table of high nibble
154	vpshufb	xtmpl {k1}{z}, xgft_lo, xtmpa	;Lookup mul table of low nibble
155	vpxorq	xtmph, xtmph, xtmpl	;GF add high and low partials
156	vpxorq	xd, xd, xtmph		;xd += partial
157
158	XSTR	[dest+pos], xd
159	add	pos, 64			;Loop on 64 bytes at a time
160	cmp	pos, len		;len = length - 64
161	jle	.loop64			;Another full block fits
162
163	lea	tmp, [len + 64]		;tmp = original length
164	cmp	pos, tmp
165	je	.return_pass		;Everything processed (also ends the tail pass)
166
167	;; Tail len
	;; Build a mask with the top (length mod 64) bits set: arithmetic
	;; right shift of 1<<63 by ((length - 1) & 63) replicates the sign bit.
	;; Those bits select the last (length mod 64) bytes of the block that
	;; starts at offset length-64, i.e. exactly the unprocessed bytes.
168	mov	pos, (1 << 63)
169	lea	tmp, [len + 64 - 1]
170	and	tmp, 63
171	sarx	pos, pos, tmp
172	kmovq	k1, pos
173	mov	pos, len	;Overlapped offset length-64
174	jmp	.loop64		;Do one more overlap pass
175
176.return_pass:
177	mov	return, 0
178	FUNC_RESTORE
179	ret
180
181.return_fail:
182	mov	return, 1
183	FUNC_RESTORE
184	ret
185
;;; Closes the frame opened by func() on win64.
;;; NOTE(review): endproc_frame is not defined in this file; how it expands
;;; on other output formats depends on reg_sizes.asm / the assembler -- confirm.
186endproc_frame
187
188%else
;;; Assembler without AVX-512 support (HAVE_AS_KNOWS_AVX512 undefined):
;;; the routine is omitted. On win64 only, a placeholder global label is
;;; emitted instead.
;;; NOTE(review): presumably this keeps the object file from being empty on
;;; that toolchain -- confirm.
189%ifidn __OUTPUT_FORMAT__, win64
190global no_gf_vect_mad_avx512
191no_gf_vect_mad_avx512:
192%endif
193%endif  ; ifdef HAVE_AS_KNOWS_AVX512
194
194