;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2017 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; uint32_t adler32_avx2_4(uint32_t init, const unsigned char *buf, uint64_t len)
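;
; Adler-32 (RFC 1950) keeps two running sums over the input bytes D1..Dn:
;     a = 1 + D1 + D2 + ... + Dn                 (mod 65521)
;     b = n*D1 + (n-1)*D2 + ... + 1*Dn + n       (mod 65521)
; and packs the result as (b << 16) | a.  'init' carries the running
; checksum in the same packed form: seed with 1 for a fresh buffer and
; chain the return value as 'init' across calls.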

%define LIMIT 5552
%define BASE  0xFFF1 ; 65521
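; LIMIT = 5552 is zlib's NMAX: the largest byte count whose unreduced
; 32-bit 'b' sum cannot overflow, so a and b are folded mod BASE after
; each block of at most LIMIT bytes.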

%define CHUNKSIZE 16
%define CHUNKSIZE_M1 (CHUNKSIZE-1)
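; the vector loop consumes one CHUNKSIZE-byte chunk per iteration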

%include "reg_sizes.asm"

default rel
[bits 64]

; eax, ecx and edx must stay free: the mod-BASE reductions below use
; div (edx:eax / ecx)

%ifidn __OUTPUT_FORMAT__, elf64
 %define arg1   rdi
 %define arg2   rsi
 %define arg3   rdx

 %define init_d edi
 %define data   r9
 %define size   r10
 %define s      r11
 %define a_d    r12d
 %define b_d    r8d
 %define end    r13

 %define func(x) x: endbranch
 %macro FUNC_SAVE 0
	push	r12
	push	r13
 %endmacro
 %macro FUNC_RESTORE 0
	pop	r13
	pop	r12
 %endmacro
%endif

%ifidn __OUTPUT_FORMAT__, win64
 %define arg1   rcx
 %define arg2   rdx
 %define arg3   r8

 %define init_d r12d
 %define data   r9
 %define size	r10
 %define s	r11
 %define a_d	esi
 %define b_d	edi
 %define end	r13

 %define stack_size  2*16 + 5*8		; must be an odd multiple of 8
 %define arg(x)      [rsp + stack_size + PS + PS*x]
 %define func(x) proc_frame x
 %macro FUNC_SAVE 0
	alloc_stack	stack_size
	vmovdqa	[rsp + 0*16], xmm6
	vmovdqa	[rsp + 1*16], xmm7
	save_reg	rdi,  2*16 + 0*8
	save_reg	rsi,  2*16 + 1*8
	save_reg	r12,  2*16 + 2*8
	save_reg	r13,  2*16 + 3*8
	end_prolog
	mov	init_d, ecx	; initialize init_d from arg1 to keep ecx free
 %endmacro

 %macro FUNC_RESTORE 0
	vmovdqa	xmm6, [rsp + 0*16]
	vmovdqa	xmm7, [rsp + 1*16]
	mov	rdi,  [rsp + 2*16 + 0*8]
	mov	rsi,  [rsp + 2*16 + 1*8]
	mov	r12,  [rsp + 2*16 + 2*8]
	mov	r13,  [rsp + 2*16 + 3*8]
	add	rsp, stack_size
 %endmacro
%endif

%define ya	ymm0
%define yb	ymm1
%define ydata0	ymm2
%define ydata1	ymm3
%define ysa	ymm4
%define ydata   ysa
%define ytmp0   ydata0
%define ytmp1   ydata1
%define ytmp2   ymm5
%define xa	xmm0
%define xb      xmm1
%define xtmp0   xmm2
%define xtmp1   xmm3
%define xsa     xmm4
%define xtmp2   xmm5
%define yshuf0	ymm6
%define yshuf1	ymm7
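; ya/yb hold eight dword lanes of partial a/b sums; ysa holds a scaled by
; the lane index for the final lane recombination; yshuf0/yshuf1 are the
; vpshufb masks that expand the low/high 8 bytes of a chunk to dwords.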

section .text

mk_global adler32_avx2_4, function
func(adler32_avx2_4)
	FUNC_SAVE

	vmovdqa	yshuf0, [SHUF0]
	vmovdqa	yshuf1, [SHUF1]

	mov	data, arg2
	mov	size, arg3

	mov	b_d, init_d
	shr	b_d, 16
	and	init_d, 0xFFFF
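	; a is carried in init_d (low 16 bits of init), b in b_d (high 16);
	; buffers shorter than 32 bytes skip the vector path entirely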
	cmp	size, 32
	jb	.lt32
	vmovd	xa, init_d
	vpxor	yb, yb, yb
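
	; outer loop: process blocks of at most LIMIT bytes, then fold the
	; vector lanes and reduce a and b mod BASE before continuing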
.sloop1:
	mov	s, LIMIT
	cmp	s, size
	cmova	s, size		; s = min(size, LIMIT)
	lea	end, [data + s - CHUNKSIZE_M1]
	cmp	data, end
	jae	.skip_loop_1a
align 32
.sloop1a:
	; process one 16-byte chunk: broadcast it to both 128-bit lanes,
	; expand bytes 0..7 and then 8..15 to dwords, and accumulate into
	; ya; yb picks up a copy of ya once per 8 bytes
	vbroadcastf128	ydata, [data]
	add	data, CHUNKSIZE
	vpshufb	ydata0, ydata, yshuf0
	vpaddd	ya, ya, ydata0
	vpaddd	yb, yb, ya
	vpshufb	ydata1, ydata, yshuf1
	vpaddd	ya, ya, ydata1
	vpaddd	yb, yb, ya
	cmp	data, end
	jb	.sloop1a

.skip_loop_1a:
	add	end, CHUNKSIZE_M1
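	; restore end to data + s; the CHUNKSIZE_M1 bias kept the 16-byte
	; loads of the chunk loop inside the block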

	test	s, CHUNKSIZE_M1
	jnz	.do_final

	; either we're done, or we just did LIMIT
	sub	size, s

	; reduce
	vpslld	yb, 3   ; b is scaled by 8
	vpmulld	ysa, ya, [A_SCALE] ; scaled a
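	; lane j's bytes sit j positions after lane 0's within each 8-byte
	; group, so the true sums are a = sum(ya[j]) and
	; b = 8*sum(yb[j]) - sum(j*ya[j]); the shift and multiply above
	; prepare both terms of b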

	; compute horizontal sums of ya, yb, ysa
	vextracti128 xtmp0, ya, 1
	vextracti128 xtmp1, yb, 1
	vextracti128 xtmp2, ysa, 1
	vpaddd	xa, xa, xtmp0
	vpaddd	xb, xb, xtmp1
	vpaddd	xsa, xsa, xtmp2
	vphaddd	xa, xa, xa
	vphaddd	xb, xb, xb
	vphaddd	xsa, xsa, xsa
	vphaddd	xa, xa, xa
	vphaddd	xb, xb, xb
	vphaddd	xsa, xsa, xsa

	vmovd	eax, xa
	xor	edx, edx
	mov	ecx, BASE
	div	ecx		; divide edx:eax by ecx, quot->eax, rem->edx
	mov	a_d, edx

	vpsubd	xb, xb, xsa
	vmovd	eax, xb
	add	eax, b_d
	xor	edx, edx
	mov	ecx, BASE
	div	ecx		; divide edx:eax by ecx, quot->eax, rem->edx
	mov	b_d, edx

	test	size, size
	jz	.finish

	; continue loop
	vmovd	xa, a_d
	vpxor	yb, yb
	jmp	.sloop1

.finish:
	mov	eax, b_d
	shl	eax, 16
	or	eax, a_d
	jmp	.end

.lt32:
	mov	a_d, init_d
	lea	end, [data + size]
	test	size, size
	jnz	.final_loop
	jmp	.zero_size

	; handle the remaining 1..15 bytes of the block: fold the vector
	; sums into a_d/b_d (left unreduced) and finish byte-by-byte in
	; .final_loop
.do_final:
	; reduce
	vpslld	yb, 3   ; b is scaled by 8
	vpmulld	ysa, ya, [A_SCALE] ; scaled a

	vextracti128 xtmp0, ya, 1
	vextracti128 xtmp1, yb, 1
	vextracti128 xtmp2, ysa, 1
	vpaddd	xa, xa, xtmp0
	vpaddd	xb, xb, xtmp1
	vpaddd	xsa, xsa, xtmp2
	vphaddd	xa, xa, xa
	vphaddd	xb, xb, xb
	vphaddd	xsa, xsa, xsa
	vphaddd	xa, xa, xa
	vphaddd	xb, xb, xb
	vphaddd	xsa, xsa, xsa
	vpsubd	xb, xb, xsa

	vmovd	a_d, xa
	vmovd	eax, xb
	add	b_d, eax

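	; scalar tail: each path reaches here with at most LIMIT bytes in
	; flight, so a_d/b_d cannot overflow 32 bits before .zero_size
	; reduces them mod BASE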
align 32
.final_loop:
	movzx	eax, byte[data]
	add	a_d, eax
	inc	data
	add	b_d, a_d
	cmp	data, end
	jb	.final_loop

.zero_size:
	mov	eax, a_d
	xor	edx, edx
	mov	ecx, BASE
	div	ecx		; divide edx:eax by ecx, quot->eax, rem->edx
	mov	a_d, edx

	mov	eax, b_d
	xor	edx, edx
	mov	ecx, BASE
	div	ecx		; divide edx:eax by ecx, quot->eax, rem->edx
	shl	edx, 16
	or	edx, a_d
	mov	eax, edx

.end:
	FUNC_RESTORE
	ret

endproc_frame

section .data
align 32
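; A_SCALE holds the dword lane indices 0..7, used to form sum(j*ya[j])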
A_SCALE:
	dq	0x0000000100000000, 0x0000000300000002
	dq	0x0000000500000004, 0x0000000700000006
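; SHUF0/SHUF1 are vpshufb masks selecting bytes 0..7 and 8..15 of the
; broadcast chunk into zero-extended dwords (an index byte with the high
; bit set makes vpshufb write zero)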
SHUF0:
	dq	0xFFFFFF01FFFFFF00, 0xFFFFFF03FFFFFF02
	dq	0xFFFFFF05FFFFFF04, 0xFFFFFF07FFFFFF06
SHUF1:
	dq	0xFFFFFF09FFFFFF08, 0xFFFFFF0BFFFFFF0A
	dq	0xFFFFFF0DFFFFFF0C, 0xFFFFFF0FFFFFFF0E