xref: /isa-l/erasure_code/gf_2vect_mad_sse.asm (revision e1470f70f6d996a4ed995fc6b378e563c795128a)
1;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
3;
4;  Redistribution and use in source and binary forms, with or without
5;  modification, are permitted provided that the following conditions
6;  are met:
7;    * Redistributions of source code must retain the above copyright
8;      notice, this list of conditions and the following disclaimer.
9;    * Redistributions in binary form must reproduce the above copyright
10;      notice, this list of conditions and the following disclaimer in
11;      the documentation and/or other materials provided with the
12;      distribution.
13;    * Neither the name of Intel Corporation nor the names of its
14;      contributors may be used to endorse or promote products derived
15;      from this software without specific prior written permission.
16;
17;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
29
30;;;
31;;; gf_2vect_mad_sse(len, vec, vec_i, mul_array, src, dest);
32;;;
33
34%include "reg_sizes.asm"
35
36%define PS 8
37
;;; Calling-convention glue. For each supported output format this section
;;; names the six argument registers, two scratch registers and the return
;;; register, and provides func(), FUNC_SAVE and FUNC_RESTORE. There is no
;;; branch for output formats other than win64 and elf64.
38%ifidn __OUTPUT_FORMAT__, win64
;;; win64: arguments 0-3 are taken from rcx, rdx, r8, r9.
39 %define arg0  rcx
40 %define arg0.w ecx
41 %define arg1  rdx
42 %define arg2  r8
43 %define arg3  r9
;;; Arguments 4 and 5 are copied from the stack into r12 / r15 by FUNC_SAVE.
44 %define arg4  r12
45 %define arg5  r15
46 %define tmp   r11
47 %define tmp2   r10
48 %define return rax
49 %define return.w eax
;;; Local frame: nine 16-byte xmm slots followed by three 8-byte slots
;;; (FUNC_SAVE below uses two of the 8-byte slots, for r12 and r15).
50 %define stack_size 16*9 + 3*8
;;; Address of stack argument x once the frame has been allocated: skip the
;;; frame and one PS-sized slot (presumably the return address), then index
;;; by PS.
51 %define arg(x)      [rsp + stack_size + PS + PS*x]
52 %define func(x) proc_frame x
53
;;; FUNC_SAVE: allocate the frame, spill xmm6-xmm14 plus r12 and r15, then
;;; load arguments 4 and 5 from the caller's stack.
;;; movdqa requires [rsp+16*n] to be 16-byte aligned; presumably this holds
;;; because rsp is 8 mod 16 at entry and stack_size is 168 - confirm.
;;; NOTE(review): rsp is adjusted with a plain `sub` and the xmm registers are
;;; stored with plain `movdqa`; save_reg / end_prolog / proc_frame are defined
;;; outside this file, so confirm the unwind data they emit matches this frame.
54%macro FUNC_SAVE 0
55	sub	rsp, stack_size
56	movdqa	[rsp+16*0],xmm6
57	movdqa	[rsp+16*1],xmm7
58	movdqa	[rsp+16*2],xmm8
59	movdqa	[rsp+16*3],xmm9
60	movdqa	[rsp+16*4],xmm10
61	movdqa	[rsp+16*5],xmm11
62	movdqa	[rsp+16*6],xmm12
63	movdqa	[rsp+16*7],xmm13
64	movdqa	[rsp+16*8],xmm14
65	save_reg	r12,  9*16 + 0*8
66	save_reg	r15,  9*16 + 1*8
67	end_prolog
68	mov	arg4, arg(4)
69	mov	arg5, arg(5)
70%endmacro
71
;;; FUNC_RESTORE: reload everything FUNC_SAVE spilled, from the same slots,
;;; and release the frame.
72%macro FUNC_RESTORE 0
73	movdqa	xmm6, [rsp+16*0]
74	movdqa	xmm7, [rsp+16*1]
75	movdqa	xmm8, [rsp+16*2]
76	movdqa	xmm9, [rsp+16*3]
77	movdqa	xmm10, [rsp+16*4]
78	movdqa	xmm11, [rsp+16*5]
79	movdqa	xmm12, [rsp+16*6]
80	movdqa	xmm13, [rsp+16*7]
81	movdqa	xmm14, [rsp+16*8]
82	mov	r12,  [rsp + 9*16 + 0*8]
83	mov	r15,  [rsp + 9*16 + 1*8]
84	add	rsp, stack_size
85%endmacro
86
87%elifidn __OUTPUT_FORMAT__, elf64
;;; elf64: all six arguments are taken from registers; func() is a plain label
;;; and FUNC_SAVE / FUNC_RESTORE expand to nothing.
88 %define arg0  rdi
89 %define arg0.w edi
90 %define arg1  rsi
91 %define arg2  rdx
92 %define arg3  rcx
93 %define arg4  r8
94 %define arg5  r9
95 %define tmp   r11
96 %define tmp2   r10
97 %define return rax
98 %define return.w eax
99
100 %define func(x) x:
101 %define FUNC_SAVE
102 %define FUNC_RESTORE
103%endif
104
105;;; gf_2vect_mad_sse(len, vec, vec_i, mul_array, src, dest)
;;; Symbolic names for the six arguments, in the order listed above.
106%define len   arg0
107%define len.w arg0.w
108%define vec    arg1
109%define vec_i    arg2
110%define mul_array arg3
111%define	src   arg4
112%define dest1  arg5
;;; pos (byte offset into src and both destinations) shares the return
;;; register; the function overwrites it with the status code before ret.
113%define pos   return
114%define pos.w return.w
115
;;; dest2 (second destination pointer) lives in the tmp2 scratch register.
116%define dest2 tmp2
117
;;; XLDR / XSTR are the 16-byte load / store instructions used on the src and
;;; dest buffers.
118%ifndef EC_ALIGNED_ADDR
119;;; Use unaligned load/store
120 %define XLDR movdqu
121 %define XSTR movdqu
122%else
123;;; EC_ALIGNED_ADDR: use aligned load/store; non-temporal unless NO_NT_LDST is defined
;;; NOTE(review): these forms are also applied to the [dest+len] tail
;;; accesses in the function; presumably callers that build with
;;; EC_ALIGNED_ADDR guarantee 16-byte aligned buffers and lengths - confirm.
124 %ifdef NO_NT_LDST
125  %define XLDR movdqa
126  %define XSTR movdqa
127 %else
128  %define XLDR movntdqa
129  %define XSTR movntdq
130 %endif
131%endif
132
;;; RIP-relative addressing by default (used for the [mask0f] load).
133default rel
134
135[bits 64]
136section .text
137
;;; Registers that stay constant for the whole call.
;;; xmask0f      : mask loaded from mask0f (0x0f in every byte)
;;; xgft1_lo/_hi : the two 16-byte halves of the table applied to dest1
;;; xgft2_lo/_hi : the two 16-byte halves of the table applied to dest2
138%define xmask0f  xmm14
139%define xgft1_lo  xmm13
140%define xgft1_hi  xmm12
141%define xgft2_lo  xmm11
142%define xgft2_hi  xmm10
143
;;; Per-iteration working registers.
;;; x0           : source chunk, then its high nibbles
;;; xtmpa        : low nibbles of the source chunk
;;; xtmph*/xtmpl*: per-iteration copies of the table halves (pshufb overwrites them)
;;; xd1 / xd2    : current 16 bytes of each destination
;;; xtmpd1/xtmpd2: copy of the last 16 bytes of each destination, taken before the loop
;;; On win64, xmm6-xmm14 are the registers spilled by FUNC_SAVE.
144%define x0      xmm0
145%define xtmpa   xmm1
146%define xtmph1  xmm2
147%define xtmpl1  xmm3
148%define xtmph2  xmm4
149%define xtmpl2  xmm5
150%define xd1     xmm6
151%define xd2     xmm7
152%define xtmpd1  xmm8
153%define xtmpd2  xmm9
154
155
;;; gf_2vect_mad_sse(len, vec, vec_i, mul_array, src, dest)
;;;
;;; For every byte of src, forms a product from two nibble-indexed table
;;; lookups (pshufb on a 16-byte low-nibble half and a 16-byte high-nibble
;;; half) and XORs it into the matching byte of two destination buffers, each
;;; destination using its own 32-byte table.
;;;
;;;   len       byte count; must be at least 16 (checked as a signed value)
;;;   vec       table stride: the second table lies vec*32 bytes after the first
;;;   vec_i     table index: the first table is at mul_array + vec_i*32
;;;   mul_array base address of the tables
;;;   src       source buffer
;;;   dest      pointer to two consecutive pointers (dest[0] and dest[1])
;;;
;;; Returns 0 on success, or 1 when len < 16 (no store is executed then).
;;; 16 bytes are processed per iteration. When len is not a multiple of 16, a
;;; final pass runs at offset len-16; it starts from the copy of the last 16
;;; destination bytes taken before the loop instead of re-reading memory that
;;; the loop has already updated.
;;; NOTE(review): len, vec and vec_i are used as full 64-bit registers;
;;; confirm that callers pass 64-bit-clean values.
156align 16
157global gf_2vect_mad_sse:ISAL_SYM_TYPE_FUNCTION
158func(gf_2vect_mad_sse)
159	FUNC_SAVE
160	sub	len, 16			;len = length-16: offset of the last 16-byte window
161	jl	.return_fail		;Fewer than 16 bytes (signed compare)
162
163	xor	pos, pos		;pos = 0
164	movdqa	xmask0f, [mask0f]	;Load mask of lower nibble in each byte
165	sal	vec_i, 5		;Multiply by 32
166	sal	vec, 5			;Multiply by 32: byte distance between the two tables
167	lea	tmp, [mul_array + vec_i]	;tmp = address of the first table
168	movdqu	xgft1_lo,[tmp]		;Load array Ax{00}, Ax{01}, Ax{02}, ...
169	movdqu	xgft1_hi, [tmp+16]	; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0}
170	movdqu	xgft2_lo, [tmp+vec]	;Load array Bx{00}, Bx{01}, Bx{02}, ...
171	movdqu	xgft2_hi, [tmp+vec+16]	; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
172	mov	dest2, [dest1+PS]	;dest2 = dest[1]; read before dest1 is overwritten
173	mov	dest1, [dest1]		;dest1 = dest[0]
174
175	XLDR	xtmpd1, [dest1+len]	;Back up the last 16 bytes of dest1
176	XLDR	xtmpd2, [dest2+len]	;Back up the last 16 bytes of dest2
177
178.loop16:
179	XLDR	xd1, [dest1+pos]		;Get next dest vector
180	XLDR	xd2, [dest2+pos]		;Get next dest vector
;;; The tail pass enters here with xd1/xd2 already set from the backups.
181.loop16_overlap:
182	XLDR	x0, [src+pos]		;Get next source vector
183	movdqa	xtmph1, xgft1_hi		;Reload const array registers
184	movdqa	xtmpl1, xgft1_lo
185	movdqa	xtmph2, xgft2_hi		;Reload const array registers
186	movdqa	xtmpl2, xgft2_lo
187	movdqa	xtmpa, x0		;Keep unshifted copy of src
188	psraw	x0, 4			;Shift to put high nibble into bits 3-0 (other bits masked next)
189	pand	x0, xmask0f		;Mask high src nibble in bits 3-0
190	pand	xtmpa, xmask0f		;Mask low src nibble in bits 3-0
191
192	pshufb	xtmph1, x0		;Lookup mul table of high nibble
193	pshufb	xtmpl1, xtmpa		;Lookup mul table of low nibble
194	pxor	xtmph1, xtmpl1		;GF add high and low partials
195	pxor	xd1, xtmph1		;Accumulate into dest1 data
196
197	pshufb	xtmph2, x0		;Lookup mul table of high nibble
198	pshufb	xtmpl2, xtmpa		;Lookup mul table of low nibble
199	pxor	xtmph2, xtmpl2		;GF add high and low partials
200	pxor	xd2, xtmph2		;Accumulate into dest2 data
201
202	XSTR	[dest1+pos], xd1	;Store result
203	XSTR	[dest2+pos], xd2	;Store result
204
205	add	pos, 16			;Loop on 16 bytes at a time
206	cmp	pos, len
207	jle	.loop16			;Continue while pos <= length-16 (signed compare)
208
209	lea	tmp, [len + 16]		;tmp = original length
210	cmp	pos, tmp
211	je	.return_pass		;pos == length: every byte has been processed
212
213	;; Tail len
214	mov	pos, len	;Overlapped offset length-16
215	movdqa	xd1, xtmpd1	;Restore xd1
216	movdqa	xd2, xtmpd2	;Restore xd2
217	jmp	.loop16_overlap	;Do one more overlap pass
218
219.return_pass:
220	FUNC_RESTORE
221	mov	return, 0		;Status: success
222	ret
223
224.return_fail:
225	FUNC_RESTORE
226	mov	return, 1		;Status: len was below 16
227	ret
228
;;; endproc_frame is not defined in this file (presumably supplied by
;;; reg_sizes.asm for every supported output format - confirm).
229endproc_frame
230
;;; Constant data used by gf_2vect_mad_sse.
;;; NOTE(review): mask0f is only read by the code in this file, yet it is
;;; placed in the writable .data section; a read-only section may be
;;; preferable - confirm section naming for every targeted output format.
231section .data
232
;;; 16-byte alignment is needed because mask0f is loaded with movdqa.
233align 16
234
;;; 16 bytes of 0x0f: keeps the low nibble of every byte under pand.
235mask0f:
236	dq 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
237
;;; slversion is a macro defined outside this file; the column header below
;;; names its arguments.
238;;;       func             core, ver, snum
239slversion gf_2vect_mad_sse, 00,  01,  0203
240