xref: /isa-l/igzip/igzip_set_long_icf_fg_06.asm (revision 22ec5c7469da5a39d3178e79b20df5f3eb450d7c)
1%include "reg_sizes.asm"
2%include "lz0a_const.asm"
3%include "data_struct2.asm"
4%include "igzip_compare_types.asm"
;; Predicate immediate for vpcmpb: 4 selects "not equal".
%define NEQ		4

%ifdef HAVE_AS_KNOWS_AVX512

;; Calling-convention dependent names.  arg1..arg3 are the first three
;; integer argument registers of the target ABI; dist_code and len are
;; picked from registers that do not collide with them.  On win64 those
;; two are rsi/rdi, which FUNC_SAVE/FUNC_RESTORE spill and reload.
%ifnidn __OUTPUT_FORMAT__, win64
%define arg1		rdi
%define arg2		rsi
%define arg3		rdx
%define dist_code	rcx
%define len		r8
%else
%define arg1		rcx
%define arg2		rdx
%define arg3		r8
%define dist_code	rsi
%define len		rdi
%endif

;; Roles of the incoming arguments.
%define next_in		arg1
%define end_in		arg2
%define match_lookup	arg3

;; General purpose scratch registers (same on both ABIs).
%define match_in	rax
%define dist		r9
%define match_offset	r10
%define tmp1		r11

;; Vector working registers, rewritten on every loop iteration.
%define zmatch_lookup	zmm0
%define zmatch_lookup2	zmm1
%define zlens		zmm2
%define zdist_codes	zmm3
%define zdist_extras	zmm4
%define zdists		zmm5
%define zdists2		zmm6
%define zlens1		zmm7
%define zlens2		zmm8
%define zlookup		zmm9
%define zlookup2	zmm10
%define datas		zmm11
%define ztmp1		zmm12
%define ztmp2		zmm13

;; Vector constants, each loaded once from the table of the same name
;; before the main loop starts.
%define zvect_size	zmm17
%define ztwofiftyfour	zmm18
%define ztwofiftysix	zmm19
%define ztwosixtytwo	zmm20
%define znlen_mask	zmm21
%define zbswap		zmm22
%define zqword_shuf	zmm23
%define zdatas_perm3	zmm24
%define zdatas_perm2	zmm25
%define zincrement	zmm26
%define zdists_mask	zmm27
%define zdists_start	zmm28
%define zlong_lens2	zmm29
%define zlong_lens	zmm30
%define zlens_mask	zmm31

59
%ifidn __OUTPUT_FORMAT__, win64
;; Win64 treats xmm6-xmm15, rsi and rdi as callee-saved.  This routine
;; uses zmm6-zmm13 as scratch and rdi as `len`, so they are spilled in the
;; prologue and reloaded before returning.
;;
;; Frame layout (offsets from rsp after alloc_stack):
;;   [0*16 .. 7*16]   xmm6 .. xmm13
;;   [8*16 + 0*8]     rsi
;;   [8*16 + 1*8]     rdi
;; The trailing 8 bytes of stack_size presumably bring rsp back to 16-byte
;; alignment (rsp is 8 mod 16 on entry), which the vmovdqa slots require.
;;
;; NOTE(review): proc_frame, alloc_stack, save_reg and end_prolog are not
;; defined in this file; they are assumed to come from reg_sizes.asm.
%define stack_size  8*16 + 2 * 8 + 8
%define func(x) proc_frame x

;; Prologue: allocate the frame and save the callee-saved registers.
%macro FUNC_SAVE 0
	alloc_stack	stack_size
	vmovdqa	[rsp + 0*16], xmm6
	vmovdqa	[rsp + 1*16], xmm7
	vmovdqa	[rsp + 2*16], xmm8
	vmovdqa	[rsp + 3*16], xmm9
	vmovdqa	[rsp + 4*16], xmm10
	vmovdqa	[rsp + 5*16], xmm11
	vmovdqa	[rsp + 6*16], xmm12
	vmovdqa	[rsp + 7*16], xmm13
	save_reg	rsi, 8*16 + 0*8
	save_reg	rdi, 8*16 + 1*8
	end_prolog
%endm

;; Epilogue: reload everything FUNC_SAVE spilled and release the frame.
%macro FUNC_RESTORE 0
	vmovdqa	xmm6, [rsp + 0*16]
	vmovdqa	xmm7, [rsp + 1*16]
	vmovdqa	xmm8, [rsp + 2*16]
	vmovdqa	xmm9, [rsp + 3*16]
	vmovdqa	xmm10, [rsp + 4*16]
	vmovdqa	xmm11, [rsp + 5*16]
	vmovdqa	xmm12, [rsp + 6*16]
	vmovdqa	xmm13, [rsp + 7*16]

	;; Load (not store) the saved values back into rsi/rdi.
	mov	rsi, [rsp + 8*16 + 0*8]
	mov	rdi, [rsp + 8*16 + 1*8]
	add	rsp, stack_size
%endm
%else
;; Non-Windows targets: nothing callee-saved is touched, so the function
;; label is plain and the prologue/epilogue macros expand to nothing.
%define func(x) x:
%macro FUNC_SAVE 0
%endm

%macro FUNC_RESTORE 0
%endm
%endif
;; Number of dword lanes in a zmm register; also the number of lookup
;; entries (and input bytes) consumed per main-loop iteration.
%define VECT_SIZE 16

;; ---------------------------------------------------------------------
;; set_long_icf_fg_06
;;
;; In:    arg1 = next_in       pointer into the input bytes
;;        arg2 = end_in        pointer used as the upper bound of the scan
;;        arg3 = match_lookup  pointer to the lookup entries; entries are
;;                             handled as dwords and advance in lock-step
;;                             with next_in (one entry per input byte)
;; Out:   no return value is set up (rax is used as scratch)
;; Clobb: rax, r9, r10, r11, len, arg1-arg3, zmm0-zmm13, zmm17-zmm31,
;;        k3, k4, k6, k7, flags
;;
;; Each entry's length field (entry & len_mask) is biased by 0xfe: the
;; update loop below writes (length + 0xfe) for a match of `length` bytes.
;; The routine scans entries 16 at a time.  For every entry whose length
;; field is > 0x105 (i.e. length >= 8 under that bias) it checks the next
;; 8 bytes as well and rewrites the length as 8 + (0..8).  When one of
;; those reaches 16 bytes, the match is extended with 64-byte compares up
;; to 258 bytes, and the entries covered by that match are upgraded.
;;
;; NOTE(review): LA, ICF_CODE_BYTES, DIST_OFFSET, EXTRA_BITS_OFFSET and the
;; LIT_*_MASK constants come from the include files and are not visible here.
;; ---------------------------------------------------------------------
global set_long_icf_fg_06
func(set_long_icf_fg_06)
	FUNC_SAVE

	;; Turn end_in into the loop bound: scanning stops LA + 15 bytes early.
	;; NOTE(review): presumably this keeps the wide loads inside the input.
	sub	end_in, LA + 15

	;; Load the loop-invariant vector constants once.
	vmovdqu32 zlong_lens, [long_len]
	vmovdqu32 zlong_lens2, [long_len2]
	vmovdqu32 zlens_mask, [len_mask]
	vmovdqu16 zdists_start, [dist_start]
	vmovdqu32 zdists_mask, [dists_mask]
	vmovdqu32 zincrement, [increment]
	vmovdqu64 zdatas_perm2, [datas_perm2]
	vmovdqu64 zdatas_perm3, [datas_perm3]
	vmovdqu64 zqword_shuf, [qword_shuf]
	vmovdqu64 zbswap, [bswap_shuf]
	vmovdqu64 znlen_mask, [nlen_mask]
	vmovdqu64 zvect_size, [vect_size]
	vmovdqu64 ztwofiftyfour, [twofiftyfour]
	vmovdqu64 ztwofiftysix, [twofiftysix]
	vmovdqu64 ztwosixtytwo, [twosixtytwo]

	;; Prime the pipeline with the first 16 entries.
	vmovdqu32 zmatch_lookup, [match_lookup]

;; Main scan.  zmatch_lookup2 = the 16 entries being examined,
;; zmatch_lookup = the following 16 entries (loaded one iteration ahead).
fill_loop:
	vmovdqu32 zmatch_lookup2, zmatch_lookup
	vmovdqu32 zmatch_lookup, [match_lookup + ICF_CODE_BYTES * VECT_SIZE]

	cmp	next_in, end_in
	jae	end_fill

	;; k3 = lanes whose length field is > 0x105.
	vpandd	zlens, zmatch_lookup2, zlens_mask
	vpcmpgtd k3, zlens, zlong_lens

	;; Speculatively advance; from here on the current group lives at
	;; next_in - VECT_SIZE and match_lookup - ICF_CODE_BYTES * VECT_SIZE.
	add	next_in, VECT_SIZE
	add	match_lookup, ICF_CODE_BYTES * VECT_SIZE

	ktestw	k3, k3
	jz	fill_loop

	;; Decode the match distance per lane: the distance code indexes
	;; dist_start (vpermw only uses the low 5 index bits, so whatever sits
	;; above the code is ignored), then the extra bits are added.
	vpsrld	zdist_codes, zmatch_lookup2, DIST_OFFSET
	vpmovdw	zdists %+ y, zdist_codes ; Relies on perm working mod 32
	vpermw	zdists, zdists, zdists_start
	vpmovzxwd zdists, zdists %+ y

	vpsrld	zdist_extras, zmatch_lookup2, EXTRA_BITS_OFFSET
	vpsubd	zdist_extras, zincrement, zdist_extras

	;; zdists = lane - (base + extra) = lane - distance.
	vpsubd	zdists, zdist_extras, zdists
	vextracti32x8 zdists2 %+ y, zdists, 1

	;; Gather 8 bytes from each candidate's source at offset 8.  next_in is
	;; already 16 past the group, so next_in + lane - dist - 8 equals
	;; (position + 8) - dist.  The gathers clear their mask as they
	;; complete, hence the copies of k3 (low / high 8 lanes) in k6 / k7.
	kmovb	k6, k3
	kshiftrw k7, k3, 8
	vpgatherdq zlens1 {k6}, [next_in + zdists %+ y - 8]
	vpgatherdq zlens2 {k7}, [next_in + zdists2 %+ y - 8]

	;; Build the matching 8-byte windows of the current data: qword i of
	;; zlookup / zlookup2 holds the bytes at offsets 8..15 from position i
	;; (lanes 0-7) / position 8 + i (lanes 8-15).
	vmovdqu8 datas %+ y, [next_in - 8]
	vpermq	zlookup, zdatas_perm2, datas
	vpshufb	zlookup, zlookup, zqword_shuf
	vpermq	zlookup2, zdatas_perm3, datas
	vpshufb	zlookup2, zlookup2, zqword_shuf

	;; XOR leaves zero bytes where source and data agree.
	vpxorq	zlens1, zlens1, zlookup
	vpxorq	zlens2, zlens2, zlookup2

	;; Byte-reverse so the lowest-address byte becomes most significant;
	;; the leading-zero bit count / 8 is then the number of equal bytes
	;; (0..8).  Lanes outside k3 are zeroed by the masked shift.
	vpshufb	zlens1, zlens1, zbswap
	vpshufb	zlens2, zlens2, zbswap
	vplzcntq zlens1, zlens1
	vplzcntq zlens2, zlens2
	vpmovqd	zlens1 %+ y, zlens1
	vpmovqd	zlens2 %+ y, zlens2
	vinserti32x8 zlens1, zlens2 %+ y, 1
	vpsrld	zlens1 {k3}{z}, zlens1, 3

	;; New entry = non-length bits + 0x106 (8 + 0xfe) + extra equal bytes.
	vpandd	zmatch_lookup2 {k3}{z}, zmatch_lookup2, znlen_mask
	vpaddd	zmatch_lookup2 {k3}{z}, zmatch_lookup2, ztwosixtytwo
	vpaddd	zmatch_lookup2 {k3}{z}, zmatch_lookup2, zlens1

	vmovdqu32 [match_lookup - ICF_CODE_BYTES * VECT_SIZE] {k3}, zmatch_lookup2

	;; k3 = lanes where all 8 extra bytes matched (16 bytes in total).
	vpcmpgtd k3, zlens1, zlong_lens2
	ktestw	k3, k3
	jz	fill_loop

	;; Recover the plain distance: lane - (lane - dist).
	vpsubd	zdists, zincrement, zdists

	;; Move the first such lane's distance and entry to element 0 and get
	;; its lane index.
	vpcompressd zdists2 {k3}, zdists
	vpcompressd zmatch_lookup2 {k3}, zmatch_lookup2
	kmovq	match_offset, k3
	tzcnt	match_offset, match_offset

	;; Rewind both pointers to that lane and locate the match source.
	vmovd	dist %+ d, zdists2 %+ x
	lea	next_in, [next_in + match_offset - VECT_SIZE]
	lea	match_lookup, [match_lookup + ICF_CODE_BYTES * (match_offset - VECT_SIZE)]
	mov	match_in, next_in
	sub	match_in, dist

	;; Extend the match in 64-byte chunks starting at offset 2 (bytes below
	;; 16 are already known equal).  Four chunks cap the length at
	;; 2 + 4 * 64 = 258.
	mov	len, 2
%rep 3
	vmovdqu8 ztmp1, [next_in + len]
	vmovdqu8 ztmp2, [match_in + len]
	vpcmpb	k3, ztmp1, ztmp2, NEQ
	ktestq	k3, k3
	jnz	miscompare

	add	len, 64
%endrep

	;; Last chunk: falls through.  If it is fully equal k3 is 0 and the
	;; tzcnt below yields 64.
	vmovdqu8 ztmp1, [next_in + len]
	vmovdqu8 ztmp2, [match_in + len]
	vpcmpb	k3, ztmp1, ztmp2, NEQ

miscompare:
	;; len += index of the first differing byte within the chunk.
	kmovq	tmp1, k3
	tzcnt	tmp1, tmp1
	add	len, tmp1

	;; Continue scanning after the match; reload the look-ahead vector for
	;; the new position before returning to fill_loop.
	add	next_in, len
	lea	match_lookup, [match_lookup + ICF_CODE_BYTES * len]
	vmovdqu32 zmatch_lookup, [match_lookup]

	;; Broadcast the long match's entry, keeping only its non-length bits.
	vpbroadcastd zmatch_lookup2, zmatch_lookup2 %+ x
	vpandd	zmatch_lookup2, zmatch_lookup2, znlen_mask

	;; zlens1[lane] = (len - lane) + 0xfe: the biased length still available
	;; when starting `lane` bytes into the match.
	vpbroadcastd zlens1, len %+ d
	vpsubd	zlens1, zlens1, zincrement
	vpaddd	zlens1, zlens1, ztwofiftyfour

	;; Negative index so [match_lookup + ICF_CODE_BYTES * len] addresses
	;; the entry at the start of the match.
	neg	len

;; Walk the entries covered by the match, 16 at a time, and upgrade each
;; one whose stored length is shorter than what the long match provides.
update_match_lookup:
	vpandd	zlens2, zlens_mask, [match_lookup + ICF_CODE_BYTES * len]

	;; Replace only where the new length is greater than the stored one
	;; and still above 0x100.
	vpcmpgtd k3, zlens1, zlens2
	vpcmpgtd k4, zlens1, ztwofiftysix
	kandw	k3, k3, k4

	vpaddd	zlens2 {k3}{z}, zlens1, zmatch_lookup2

	vmovdqu32 [match_lookup + ICF_CODE_BYTES * len] {k3}, zlens2

	;; Stop as soon as any lane was left untouched.
	knotw	k3, k3
	ktestw	k3, k3
	jnz	fill_loop

	;; Next 16 entries: 16 fewer bytes of the match remain.
	add	len, VECT_SIZE
	vpsubd	zlens1, zlens1, zvect_size

	jmp	update_match_lookup
end_fill:

	FUNC_RESTORE
	ret
249
section .data
align 64
;; Base distance for each distance code, used as the vpermw table.  The
;; 30 values match the DEFLATE (RFC 1951) distance bases; the last two
;; words pad the table to 32 entries.
dist_start:
	dw     1,     2,     3,     4,     5,     7,     9,    13
	dw    17,    25,    33,    49,    65,    97,   129,   193
	dw   257,   385,   513,   769,  1025,  1537,  2049,  3073
	dw  4097,  6145,  8193, 12289, 16385, 24577,     0,     0

;; Per-lane mask selecting an entry's length field.
len_mask:
	times 16 dd LIT_LEN_MASK

;; Per-lane LIT_DIST_MASK (loaded into zdists_mask).
dists_mask:
	times 16 dd LIT_DIST_MASK

;; Length-field threshold for entries worth extending.
long_len:
	times 16 dd 0x105

;; Threshold on the number of extra equal bytes (0..8).
long_len2:
	times 16 dd 0x7

;; Lane indices 0..15.
increment:
%assign inc_lane 0
%rep 16
	dd inc_lane
%assign inc_lane inc_lane + 1
%endrep
%undef inc_lane

;; vpermq selectors: replicate qwords 0,1 (resp. 1,2) into every 128-bit lane.
datas_perm2:
%rep 4
	dq 0x0, 0x1
%endrep
datas_perm3:
%rep 4
	dq 0x1, 0x2
%endrep

;; vpshufb control reversing the byte order inside every qword.
bswap_shuf:
%rep 4
	db 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00
	db 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08
%endrep

;; vpshufb control: row r selects bytes r..r+7, giving eight overlapping
;; 8-byte windows.  Nine rows (72 bytes) are emitted; only the first 64
;; bytes are loaded into zqword_shuf.
qword_shuf:
%assign qs_row 0
%rep 9
	db qs_row + 0, qs_row + 1, qs_row + 2, qs_row + 3, qs_row + 4, qs_row + 5, qs_row + 6, qs_row + 7
%assign qs_row qs_row + 1
%endrep
%undef qs_row

;; Per-lane VECT_SIZE, subtracted from the remaining lengths per step.
vect_size:
	times 16 dd VECT_SIZE

;; 0xfe: bias added to a byte count to form the length field.
twofiftyfour:
	times 16 dd 0xfe

;; 0x100: lower bound a new length field must exceed to be written.
twofiftysix:
	times 16 dd 0x100

;; 0x106: 8 + 0xfe, the length field of an 8-byte match.
twosixtytwo:
	times 16 dd 0x106

;; Per-lane mask clearing the low 10 bits of an entry.
nlen_mask:
	times 16 dd 0xfffffc00
319%endif
320