xref: /isa-l/igzip/igzip_compare_types.asm (revision ba1a0006802c8f857e536282e77a9b4ca34f43e8)
1;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2;  Copyright(c) 2011-2016 Intel Corporation All rights reserved.
3;
4;  Redistribution and use in source and binary forms, with or without
5;  modification, are permitted provided that the following conditions
6;  are met:
7;    * Redistributions of source code must retain the above copyright
8;      notice, this list of conditions and the following disclaimer.
9;    * Redistributions in binary form must reproduce the above copyright
10;      notice, this list of conditions and the following disclaimer in
11;      the documentation and/or other materials provided with the
12;      distribution.
13;    * Neither the name of Intel Corporation nor the names of its
14;      contributors may be used to endorse or promote products derived
15;      from this software without specific prior written permission.
16;
17;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
29
30%include "options.asm"
31%include "stdmac.asm"
32
33%ifndef UTILS_ASM
34%define UTILS_ASM
35; compare macro
36
37;; sttni2 is faster, but it can't be debugged
38;; so following code is based on "mine5"
39
;; compare_r src1, src2, result, result_max, tmp
;;
;; Compares 8 bytes at a time, using 64-bit loads and xor.
;; Assumes the input buffer has size at least 8: when fewer than 8 bytes are
;; left, the last load is taken at [result_max - 8] and therefore overlaps
;; bytes below the current offset.
;;
;;   src1, src2  - base addresses of the two buffers; not modified
;;   result      - in:  offset at which the comparison starts
;;                 out: offset of the first mismatching byte that was found,
;;                      or result_max if every compared byte was equal
;;   result_max  - offset at which the comparison stops; clobbered (left
;;                 reduced by 16 or 8 when a mismatch is found before the
;;                 final tail compare)
;;   tmp         - 64-bit scratch register; clobbered
;;
;; Offsets are compared as signed values. Flags are clobbered.
%macro compare_r 5
%define %%src1		%1
%define %%src2		%2
%define %%result	%3
%define %%result_max	%4
%define %%tmp		%5

	;; result_max - 16 is the highest offset that still has 16 bytes
	;; in front of it; skip the unrolled loop if we are already past it
	sub	%%result_max, 16
	cmp	%%result, %%result_max
	jg	%%_by_8

	;; main loop, unrolled twice: 2 x 8 bytes per iteration
%%loop1:
	mov	%%tmp, [%%src1 + %%result]
	xor	%%tmp, [%%src2 + %%result]	; non-zero iff the 8 bytes differ
	jnz	%%miscompare_reg
	add	%%result, 8

	mov	%%tmp, [%%src1 + %%result]
	xor	%%tmp, [%%src2 + %%result]
	jnz	%%miscompare_reg
	add	%%result, 8
	cmp	%%result, %%result_max
	jle	%%loop1

%%_by_8:
	;; result_max is now (input value - 8): one more full 8-byte step
	;; fits only if result is not beyond it
	add	%%result_max, 8
	cmp	%%result, %%result_max
	jg	%%_cmp_last

	; compare one more block of 8 bytes
	mov	%%tmp, [%%src1 + %%result]
	xor	%%tmp, [%%src2 + %%result]
	jnz	%%miscompare_reg
	add	%%result, 8

%%_cmp_last:
	add	%%result_max, 8		; back at the input value
	cmp	%%result, %%result_max
	je	%%end			; nothing left to compare

	;; fewer than 8 bytes remain: step back so that the final load covers
	;; exactly the last 8 bytes of the range
	lea	%%result, [%%result_max - 8]

	; compare the last 8 bytes
	mov	%%tmp, [%%src1 + %%result]
	xor	%%tmp, [%%src2 + %%result]
	jnz	%%miscompare_reg
	add	%%result, 8
	jmp	%%end

%%miscompare_reg:
	;; tmp = xor of the two 8-byte words at offset result. The lowest set
	;; bit belongs to the first differing byte (little-endian load);
	;; bit index / 8 is that byte's index within the word.
	bsf	%%tmp, %%tmp
	shr	%%tmp, 3
	add	%%result, %%tmp
%%end:
%endm
98
;; compare_x src1, src2, result, result_max, tmp, xtmp1, xtmp2
;;
;; Compares 16 bytes at a time, using pcmpeqb/pmovmskb, then finishes with
;; 8-byte integer compares.
;; Assumes the input buffer has size at least 8: when fewer than 8 bytes are
;; left, the last load is taken at [result_max - 8] and therefore overlaps
;; bytes below the current offset.
;;
;;   src1, src2    - base addresses of the two buffers; not modified
;;   result        - in:  offset at which the comparison starts
;;                   out: offset of the first mismatching byte that was
;;                        found, or result_max if every compared byte was
;;                        equal
;;   result_max    - offset at which the comparison stops; clobbered (left
;;                   reduced when a mismatch is found before the final tail
;;                   compare)
;;   tmp           - 64-bit scratch register; clobbered
;;   xtmp1, xtmp2  - vector scratch registers; clobbered
;;
;; Offsets are compared as signed values. Flags are clobbered.
;; NOTE(review): MOVDQU, PCMPEQB and PMOVMSKB are not defined in this file;
;; presumably they are wrapper macros from one of the included files.
%macro compare_x 7
%define %%src1		%1
%define %%src2		%2
%define %%result	%3	; Accumulator for match_length
%define %%result_max	%4
%define %%tmp		%5
%define %%tmp32		%5d	; tmp as a 32-bit register
%define %%xtmp		%6
%define %%xtmp2		%7

	;; result_max - 32 is the highest offset that still has 32 bytes
	;; in front of it; skip the unrolled loop if we are already past it
	sub	%%result_max, 32
	cmp	%%result, %%result_max
	jg	%%_by_16

	;; main loop, unrolled twice: 2 x 16 bytes per iteration
%%loop1:
	MOVDQU		%%xtmp, [%%src1 + %%result]
	MOVDQU		%%xtmp2, [%%src2 + %%result]
	PCMPEQB		%%xtmp, %%xtmp, %%xtmp2
	PMOVMSKB	%%tmp32, %%xtmp		; one bit per equal byte
	xor		%%tmp, 0xFFFF		; flip: set bits = mismatches
	jnz		%%miscompare_vect
	add		%%result, 16

	MOVDQU		%%xtmp, [%%src1 + %%result]
	MOVDQU		%%xtmp2, [%%src2 + %%result]
	PCMPEQB		%%xtmp, %%xtmp, %%xtmp2
	PMOVMSKB	%%tmp32, %%xtmp
	xor		%%tmp, 0xFFFF
	jnz		%%miscompare_vect
	add		%%result, 16

	cmp	%%result, %%result_max
	jle	%%loop1

%%_by_16:
	;; result_max is now (input value - 16): one more 16-byte step fits
	;; only if result is not beyond it
	add	%%result_max, 16
	cmp	%%result, %%result_max
	jg	%%_by_8

	MOVDQU		%%xtmp, [%%src1 + %%result]
	MOVDQU		%%xtmp2, [%%src2 + %%result]
	PCMPEQB		%%xtmp, %%xtmp, %%xtmp2
	PMOVMSKB	%%tmp32, %%xtmp
	xor		%%tmp, 0xFFFF
	jnz		%%miscompare_vect
	add		%%result, 16

%%_by_8:
	;; result_max becomes (input value - 8): room for one 8-byte step?
	add	%%result_max, 8
	cmp	%%result, %%result_max
	jg	%%_cmp_last

	; compare one more block of 8 bytes
	mov	%%tmp, [%%src1 + %%result]
	xor	%%tmp, [%%src2 + %%result]	; non-zero iff the 8 bytes differ
	jnz	%%miscompare_reg
	add	%%result, 8

%%_cmp_last:
	add	%%result_max, 8		; back at the input value
	cmp	%%result, %%result_max
	je	%%end			; nothing left to compare

	;; fewer than 8 bytes remain: step back so that the final load covers
	;; exactly the last 8 bytes of the range
	lea	%%result, [%%result_max - 8]

	; compare the last 8 bytes
	mov	%%tmp, [%%src1 + %%result]
	xor	%%tmp, [%%src2 + %%result]
	jnz	%%miscompare_reg
	add	%%result, 8
	jmp	%%end

%%miscompare_reg:
	;; tmp = xor of two 8-byte words: lowest set bit index / 8 is the
	;; index of the first differing byte within the word
	bsf	%%tmp, %%tmp
	shr	%%tmp, 3
	add	%%result, %%tmp
	jmp	%%end

%%miscompare_vect:
	;; tmp = per-byte mismatch mask: the index of the lowest set bit is
	;; the index of the first differing byte within the 16-byte block
	bsf	%%tmp, %%tmp
	add	%%result, %%tmp
%%end:
%endm
185
;; compare_y src1, src2, result, result_max, tmp, ytmp1, ytmp2
;;
;; Compares 32 bytes at a time, using vpcmpeqb/vpmovmskb, then finishes with
;; one optional 16-byte vector compare and 8-byte integer compares.
;; Assumes the input buffer has size at least 8: when fewer than 8 bytes are
;; left, the last load is taken at [result_max - 8] and therefore overlaps
;; bytes below the current offset.
;;
;;   src1, src2    - base addresses of the two buffers; not modified
;;   result        - in:  offset at which the comparison starts
;;                   out: offset of the first mismatching byte that was
;;                        found, or result_max if every compared byte was
;;                        equal
;;   result_max    - offset at which the comparison stops; clobbered (left
;;                   reduced when a mismatch is found before the final tail
;;                   compare)
;;   tmp           - 64-bit scratch register; clobbered
;;   ytmp1, ytmp2  - ymm scratch registers; clobbered
;;
;; Offsets are compared as signed values. Flags are clobbered.
%macro compare_y 7
%define %%src1		%1
%define %%src2		%2
%define %%result	%3	; Accumulator for match_length
%define %%result_max	%4
%define %%tmp		%5
%define %%tmp32		%5d	; tmp as a 32-bit register
%define %%ytmp		%6
%define %%ytmp2		%7

	;; result_max - 64 is the highest offset that still has 64 bytes
	;; in front of it; skip the unrolled loop if we are already past it
	sub	%%result_max, 64
	cmp	%%result, %%result_max
	jg	%%_by_32

	;; main loop, unrolled twice: 2 x 32 bytes per iteration
%%loop1:
	vmovdqu		%%ytmp, [%%src1 + %%result]
	vmovdqu		%%ytmp2, [%%src2 + %%result]
	vpcmpeqb	%%ytmp, %%ytmp, %%ytmp2
	vpmovmskb	%%tmp, %%ytmp		; one bit per equal byte
	xor		%%tmp32, 0xFFFFFFFF	; flip: set bits = mismatches
	jnz		%%miscompare_vect
	add		%%result, 32

	vmovdqu		%%ytmp, [%%src1 + %%result]
	vmovdqu		%%ytmp2, [%%src2 + %%result]
	vpcmpeqb	%%ytmp, %%ytmp, %%ytmp2
	vpmovmskb	%%tmp, %%ytmp
	xor		%%tmp32, 0xFFFFFFFF
	jnz		%%miscompare_vect
	add		%%result, 32

	cmp	%%result, %%result_max
	jle	%%loop1

%%_by_32:
	;; result_max is now (input value - 32): one more 32-byte step fits
	;; only if result is not beyond it
	add	%%result_max, 32
	cmp	%%result, %%result_max
	jg	%%_by_16

	vmovdqu		%%ytmp, [%%src1 + %%result]
	vmovdqu		%%ytmp2, [%%src2 + %%result]
	vpcmpeqb	%%ytmp, %%ytmp, %%ytmp2
	vpmovmskb	%%tmp, %%ytmp
	xor		%%tmp32, 0xFFFFFFFF
	jnz		%%miscompare_vect
	add		%%result, 32

%%_by_16:
	;; result_max becomes (input value - 16): room for one 16-byte step?
	add	%%result_max, 16
	cmp	%%result, %%result_max
	jg	%%_by_8

	;; NOTE(review): the "x" suffix is presumably mapped to the 128-bit
	;; view of the ymm register by a definition outside this file
	vmovdqu		%%ytmp %+ x, [%%src1 + %%result]
	vmovdqu		%%ytmp2 %+ x, [%%src2 + %%result]
	vpcmpeqb	%%ytmp %+ x, %%ytmp %+ x, %%ytmp2 %+ x
	vpmovmskb	%%tmp, %%ytmp %+ x
	xor		%%tmp32, 0xFFFF		; only 16 mask bits here
	jnz		%%miscompare_vect
	add		%%result, 16

%%_by_8:
	;; result_max becomes (input value - 8): room for one 8-byte step?
	add	%%result_max, 8
	cmp	%%result, %%result_max
	jg	%%_cmp_last

	; compare one more block of 8 bytes
	mov	%%tmp, [%%src1 + %%result]
	xor	%%tmp, [%%src2 + %%result]	; non-zero iff the 8 bytes differ
	jnz	%%miscompare_reg
	add	%%result, 8

%%_cmp_last:
	add	%%result_max, 8		; back at the input value
	cmp	%%result, %%result_max
	je	%%end			; nothing left to compare

	;; fewer than 8 bytes remain: step back so that the final load covers
	;; exactly the last 8 bytes of the range
	lea	%%result, [%%result_max - 8]

	; compare the last 8 bytes
	mov	%%tmp, [%%src1 + %%result]
	xor	%%tmp, [%%src2 + %%result]
	jnz	%%miscompare_reg
	add	%%result, 8
	jmp	%%end

%%miscompare_reg:
	;; tmp = xor of two 8-byte words: lowest set bit index / 8 is the
	;; index of the first differing byte within the word
	bsf	%%tmp, %%tmp
	shr	%%tmp, 3
	add	%%result, %%tmp
	jmp	%%end

%%miscompare_vect:
	;; tmp = per-byte mismatch mask (non-zero here): the number of
	;; trailing zero bits is the index of the first differing byte
	tzcnt	%%tmp, %%tmp
	add	%%result, %%tmp
%%end:
%endm
285
;; compare_z src1, src2, result, result_max, tmp, ktmp, ztmp1, ztmp2
;;
;; Compares 64 bytes at a time with a byte-wise "not equal" vector compare
;; whose outcome lands in a mask register. The final partial block is read
;; through a byte mask that selects only the bytes that remain.
;;
;;   src1, src2    - base addresses of the two buffers; not modified
;;   result        - in:  offset at which the comparison starts
;;                   out: offset of the first mismatching byte that was
;;                        found, or the input result_max if every compared
;;                        byte was equal
;;   result_max    - offset at which the comparison stops
;;   tmp           - 64-bit scratch register
;;   ktmp          - mask scratch register
;;   ztmp1, ztmp2  - vector scratch registers
;;
;; Clobbers result_max, tmp, ktmp, ztmp1, ztmp2 and the flags.
;; NOTE(review): NEQ is not defined in this file; presumably it is the
;; "not equal" predicate immediate for vpcmpb, defined by an included file.
%macro compare_z 8
%define %%src1		%1
%define %%src2		%2
%define %%result	%3	; Accumulator for match_length
%define %%result_max	%4
%define %%tmp		%5	; 64-bit scratch (holds the byte mask)
%define %%ktmp		%6
%define %%ztmp		%7
%define %%ztmp2		%8

	;; result_max - 128 is the highest offset that still has 128 bytes
	;; in front of it; skip the unrolled loop if we are already past it
	sub	%%result_max, 128
	cmp	%%result, %%result_max
	jg	%%_one_block

	;; main loop, unrolled twice: 2 x 64 bytes per iteration
%%_loop_128:
	vmovdqu8	%%ztmp, [%%src1 + %%result]
	vmovdqu8	%%ztmp2, [%%src2 + %%result]
	vpcmpb		%%ktmp, %%ztmp, %%ztmp2, NEQ
	ktestq		%%ktmp, %%ktmp		; ZF clear iff any byte differs
	jnz		%%_mismatch
	add		%%result, 64

	vmovdqu8	%%ztmp, [%%src1 + %%result]
	vmovdqu8	%%ztmp2, [%%src2 + %%result]
	vpcmpb		%%ktmp, %%ztmp, %%ztmp2, NEQ
	ktestq		%%ktmp, %%ktmp
	jnz		%%_mismatch
	add		%%result, 64

	cmp	%%result, %%result_max
	jle	%%_loop_128

%%_one_block:
	;; result_max is now (input value - 64): one more full 64-byte step
	;; fits only if result is not beyond it
	add	%%result_max, 64
	cmp	%%result, %%result_max
	jg	%%_tail

	vmovdqu8	%%ztmp, [%%src1 + %%result]
	vmovdqu8	%%ztmp2, [%%src2 + %%result]
	vpcmpb		%%ktmp, %%ztmp, %%ztmp2, NEQ
	ktestq		%%ktmp, %%ktmp
	jnz		%%_mismatch
	add		%%result, 64

%%_tail:
	;; turn result_max into the number of bytes still to compare
	add	%%result_max, 64
	sub	%%result_max, %%result
	jle	%%_done

	;; build a mask with the low result_max bits set: one bit per
	;; remaining byte
	mov	%%tmp, -1
	bzhi	%%tmp, %%tmp, %%result_max
	kmovq	%%ktmp, %%tmp

	;; masked loads with zeroing: unselected bytes read as zero in both
	;; vectors and therefore never compare as different
	vmovdqu8	%%ztmp {%%ktmp}{z}, [%%src1 + %%result]
	vmovdqu8	%%ztmp2 {%%ktmp}{z}, [%%src2 + %%result]
	vpcmpb		%%ktmp, %%ztmp, %%ztmp2, NEQ
	ktestq		%%ktmp, %%ktmp
	jnz		%%_mismatch
	add		%%result, %%result_max	; all remaining bytes matched

	jmp	%%_done
%%_mismatch:
	;; ktmp has one set bit per differing byte: the number of trailing
	;; zero bits is the index of the first difference within the block
	kmovq	%%tmp, %%ktmp
	tzcnt	%%tmp, %%tmp
	add	%%result, %%tmp
%%_done:
%endm
356
;; compare250 src1, src2, result, result_max, tmp, ytmp0, ytmp1
;;
;; Limits result_max to at most 250 (signed compare) and then expands the
;; compare macro selected by COMPARE_TYPE:
;;   1 -> compare_r, 2 -> compare_x, 3 -> compare_y
;; Any other value stops the build with an error.
;; tmp is overwritten with the constant 250 before being handed to the
;; selected macro as scratch; result_max is modified.
;; NOTE(review): COMPARE_TYPE is not defined in this file; presumably it
;; comes from one of the included files or from the including source.
%macro compare250 7
	;; operands forwarded unchanged to the selected macro
%define %%src1		%1
%define %%src2		%2
%define %%result	%3
%define %%result_max	%4
%define %%tmp		%5
	;; vector scratch as passed in, used by compare_y
%define %%ytmp0		%6
%define %%ytmp1		%7
	;; same arguments with an "x" suffix appended, used by compare_x
%define %%xtmp0		%6x
%define %%xtmp1		%7x

	;; result_max = min(result_max, 250)
	mov	%%tmp, 250
	cmp	%%result_max, 250
	cmovg	%%result_max, %%tmp

%if (COMPARE_TYPE == 1)
	compare_r	%%src1, %%src2, %%result, %%result_max, %%tmp
%elif (COMPARE_TYPE == 2)
	compare_x	%%src1, %%src2, %%result, %%result_max, %%tmp, %%xtmp0, %%xtmp1
%elif (COMPARE_TYPE == 3)
	compare_y	%%src1, %%src2, %%result, %%result_max, %%tmp, %%ytmp0, %%ytmp1
%else
	;; NOTE(review): the second line below is not a valid directive;
	;; presumably it is there to make sure assembly fails - confirm
%error Unknown Compare type COMPARE_TYPE
 % error
%endif
%endmacro
383
;; compare_large src1, src2, result, result_max, tmp, ytmp0, ytmp1
;;
;; Expands the compare macro selected by COMPARE_TYPE, passing result_max
;; through as given:
;;   1 -> compare_r, 2 -> compare_x, 3 -> compare_y
;; Any other value stops the build with an error.
;; Assumes the buffer has at least 8 bytes.
;; Accumulates match length onto result.
;; NOTE(review): COMPARE_TYPE is not defined in this file; presumably it
;; comes from one of the included files or from the including source.
%macro compare_large 7
	;; operands forwarded unchanged to the selected macro
%define %%src1		%1
%define %%src2		%2
%define %%result	%3
%define %%result_max	%4
%define %%tmp		%5
	;; vector scratch as passed in, used by compare_y
%define %%ytmp0		%6
%define %%ytmp1		%7
	;; same arguments with an "x" suffix appended, used by compare_x
%define %%xtmp0		%6x
%define %%xtmp1		%7x

%if (COMPARE_TYPE == 1)
	compare_r	%%src1, %%src2, %%result, %%result_max, %%tmp
%elif (COMPARE_TYPE == 2)
	compare_x	%%src1, %%src2, %%result, %%result_max, %%tmp, %%xtmp0, %%xtmp1
%elif (COMPARE_TYPE == 3)
	compare_y	%%src1, %%src2, %%result, %%result_max, %%tmp, %%ytmp0, %%ytmp1
%else
	;; NOTE(review): the second line below is not a valid directive;
	;; presumably it is there to make sure assembly fails - confirm
%error Unknown Compare type COMPARE_TYPE
 % error
%endif
%endmacro
408
409;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
410;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
411
;; compare size, src1, src2, result, tmp
;;
;; Counts how many leading bytes of src1 and src2 are equal, looking at no
;; more than size bytes. 8-byte words are compared while at least 8 bytes
;; remain; the rest is compared one byte at a time.
;;
;;   size        - in: number of bytes to compare; clobbered
;;   src1, src2  - base addresses of the two buffers; not modified
;;   result      - out: number of matching leading bytes (zeroed on entry)
;;   tmp         - 64-bit scratch register; clobbered
;;
;; Flags are clobbered.
%macro compare 5
%define %%size		%1
%define %%src1		%2
%define %%src2		%3
%define %%result	%4
%define %%tmp		%5
%define %%tmp8		%5b	; low byte of tmp

	xor	%%result, %%result
	;; size is kept biased by -7 during the word loop, so "size > 0"
	;; means that at least 8 bytes remain
	sub	%%size, 7
	jle	%%_tail
%%_word_loop:
	mov	%%tmp, [%%src1 + %%result]
	xor	%%tmp, [%%src2 + %%result]	; non-zero iff the 8 bytes differ
	jnz	%%_word_differs
	add	%%result, 8
	sub	%%size, 8
	jg	%%_word_loop
%%_tail:
	;; no mismatch so far; removing the bias leaves the number of bytes
	;; (fewer than 8) that still have to be checked
	add	%%size, 7
	jle	%%_done
%%_byte_loop:
	mov	%%tmp8, [%%src1 + %%result]
	cmp	%%tmp8, [%%src2 + %%result]
	jne	%%_done
	inc	%%result
	dec	%%size
	jg	%%_byte_loop
	jmp	%%_done
%%_word_differs:
	;; lowest set bit of the xor marks the first differing byte;
	;; bit index / 8 is its index within the 8-byte word
	bsf	%%tmp, %%tmp
	shr	%%tmp, 3
	add	%%result, %%tmp
%%_done:
%endm
451
452%endif	;UTILS_ASM
453