; xref: /isa-l/igzip/encode_df_04.asm (revision cd888f01a447dd04c3a8b50362079648d432d2ca)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2018 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%include "reg_sizes.asm"
%include "lz0a_const.asm"
%include "data_struct2.asm"
%include "stdmac.asm"

%define ARCH 04
%define USE_HSWNI

; tree entry is 4 bytes:
; lit/len tree (513 entries)
; |  3  |  2   |  1 | 0 |
; | len |       code    |
;
; dist tree
; |  3  |  2   |  1 | 0 |
; |eblen:codlen|   code |

; token format:
; DIST_OFFSET:0 : lit/len
; 31:(DIST_OFFSET + 5) : dist Extra Bits
; (DIST_OFFSET + 5):DIST_OFFSET : dist code
; lit/len: 0-256 (literal)
;          257-512 (dist + 254)

; returns final token pointer
; equal to token_end if successful
;    uint32_t* encode_df(uint32_t *token_start, uint32_t *token_end,
;                            BitBuf *out_buf, uint32_t *trees);

;; Per-ABI argument and scratch register assignments. Note that on win64
;; hufftables aliases arg4 and on Linux ptr aliases arg1, so the function
;; entry only moves them when the aliases differ.
%ifidn __OUTPUT_FORMAT__, win64
%define arg1 rcx
%define arg2 rdx
%define arg3 r8
%define arg4 r9
%define sym		rsi
%define dsym		rdi
%define hufftables	r9
%define ptr		r11
%else
; Linux
%define arg1 rdi
%define arg2 rsi
%define arg3 rdx
%define arg4 rcx
%define sym		r9
%define dsym		r8
%define hufftables	r11
%define ptr		rdi
%endif

;; out_buf deliberately aliases bitbuf: the BitBuf pointer is spilled to
;; the stack at entry and the register is reused as the output cursor.
%define in_buf_end	arg2
%define bitbuf		arg3
%define out_buf		bitbuf
; bit_count is rcx
%define bits		rax
%define data		r12
%define tmp		rbx
%define len 		dsym
%define tmp2 		r10
%define end_ptr		rbp

%define LIT_MASK	((0x1 << LIT_LEN_BIT_COUNT) - 1)
%define DIST_MASK	((0x1 << DIST_LIT_BIT_COUNT) - 1)

;; ymm register roles. Several names alias the same register (codes4/syms,
;; code_lens4/dsyms); the aliased uses never overlap in time.
%define codes1		ymm1
%define code_lens1	ymm2
%define codes2		ymm3
%define code_lens2	ymm4
%define codes3		ymm5
%define	code_lens3	ymm6
%define codes4		ymm7
%define syms		ymm7

%define code_lens4	ymm8
%define dsyms		ymm8

%define ytmp		ymm9
%define codes_lookup1	ymm10
%define	codes_lookup2	ymm11
%define datas		ymm12
%define ybits		ymm13
%define ybits_count	ymm14
%define yoffset_mask	ymm15

;; One vector iteration loads 0x20 bytes (8 dword tokens); the loop bound
;; is biased by two iterations, and VECTOR_SLOP reserves room for the
;; final (up to 32-byte) vector store past the logical write position.
%define VECTOR_SIZE 0x20
%define VECTOR_LOOP_PROCESSED (2 * VECTOR_SIZE)
%define VECTOR_SLOP 0x20 - 8

;; Stack frame layout used by FUNC_SAVE/FUNC_RESTORE: 6 gpr slots,
;; 10 x 16-byte xmm slots (win64 only), then the spilled BitBuf pointer.
gpr_save_mem_offset	equ	0
gpr_save_mem_size	equ	8 * 6
xmm_save_mem_offset	equ	gpr_save_mem_offset + gpr_save_mem_size
xmm_save_mem_size	equ	10 * 16
bitbuf_mem_offset	equ	xmm_save_mem_offset + xmm_save_mem_size
bitbuf_mem_size		equ	8
stack_size		equ	gpr_save_mem_size + xmm_save_mem_size + bitbuf_mem_size

;; Spill/restore callee-saved state around the encoder.
;; FIX: xmm registers are 128 bits wide, so the save area must be walked
;; with a 16-byte stride. The previous `+ n*8` stride made consecutive
;; MOVDQU stores overlap, so the upper halves of saved xmm6-xmm14 were
;; overwritten by the following store and restored corrupted — an ABI
;; violation on win64 (xmm6-xmm15 are callee-saved). The allocated area
;; (xmm_save_mem_size = 10*16) already fits the non-overlapping layout.
%macro FUNC_SAVE 0
	sub	rsp, stack_size
	mov	[rsp + gpr_save_mem_offset + 0*8], rbx
	mov	[rsp + gpr_save_mem_offset + 1*8], rbp
	mov	[rsp + gpr_save_mem_offset + 2*8], r12

%ifidn __OUTPUT_FORMAT__, win64
	;; rsi, rdi and xmm6-xmm15 are callee-saved on win64 only
	mov	[rsp + gpr_save_mem_offset + 3*8], rsi
	mov	[rsp + gpr_save_mem_offset + 4*8], rdi

	MOVDQU	[rsp + xmm_save_mem_offset + 0*16], xmm6
	MOVDQU	[rsp + xmm_save_mem_offset + 1*16], xmm7
	MOVDQU	[rsp + xmm_save_mem_offset + 2*16], xmm8
	MOVDQU	[rsp + xmm_save_mem_offset + 3*16], xmm9
	MOVDQU	[rsp + xmm_save_mem_offset + 4*16], xmm10
	MOVDQU	[rsp + xmm_save_mem_offset + 5*16], xmm11
	MOVDQU	[rsp + xmm_save_mem_offset + 6*16], xmm12
	MOVDQU	[rsp + xmm_save_mem_offset + 7*16], xmm13
	MOVDQU	[rsp + xmm_save_mem_offset + 8*16], xmm14
	MOVDQU	[rsp + xmm_save_mem_offset + 9*16], xmm15
%endif

%endm

%macro FUNC_RESTORE 0
	mov	rbx, [rsp + gpr_save_mem_offset + 0*8]
	mov	rbp, [rsp + gpr_save_mem_offset + 1*8]
	mov	r12, [rsp + gpr_save_mem_offset + 2*8]

%ifidn __OUTPUT_FORMAT__, win64
	mov	rsi, [rsp + gpr_save_mem_offset + 3*8]
	mov	rdi, [rsp + gpr_save_mem_offset + 4*8]

	;; 16-byte stride to match FUNC_SAVE (see note above)
	MOVDQU	xmm6, [rsp + xmm_save_mem_offset + 0*16]
	MOVDQU	xmm7, [rsp + xmm_save_mem_offset + 1*16]
	MOVDQU	xmm8, [rsp + xmm_save_mem_offset + 2*16]
	MOVDQU	xmm9, [rsp + xmm_save_mem_offset + 3*16]
	MOVDQU	xmm10, [rsp + xmm_save_mem_offset + 4*16]
	MOVDQU	xmm11, [rsp + xmm_save_mem_offset + 5*16]
	MOVDQU	xmm12, [rsp + xmm_save_mem_offset + 6*16]
	MOVDQU	xmm13, [rsp + xmm_save_mem_offset + 7*16]
	MOVDQU	xmm14, [rsp + xmm_save_mem_offset + 8*16]
	MOVDQU	xmm15, [rsp + xmm_save_mem_offset + 9*16]
%endif
	add	rsp, stack_size

%endmacro

default rel
section .text

;-----------------------------------------------------------------------
; uint32_t *encode_deflate_icf_04(uint32_t *next_in, uint32_t *end_in,
;                                 BitBuf *bb, hufftables *trees)
;
; Encodes the deflate ICF tokens in [next_in, end_in) into the bit buffer
; using the supplied huffman tables. Returns (rax) the pointer to the
; first unencoded token; equal to end_in on success, earlier if the
; output buffer filled up (bit buffer state is written back either way).
;
; Register roles (see %defines above):
;   ptr        - current token pointer; also the return value
;   in_buf_end - token end (biased by -VECTOR_LOOP_PROCESSED in the loop)
;   bits / rcx - scalar bit accumulator and bit count (m_bits/m_bit_count)
;   out_buf    - output byte cursor; end_ptr - output limit
;-----------------------------------------------------------------------
global encode_deflate_icf_ %+ ARCH
encode_deflate_icf_ %+ ARCH:
	endbranch
	FUNC_SAVE

%ifnidn ptr, arg1
	mov	ptr, arg1
%endif
%ifnidn hufftables, arg4
	mov	hufftables, arg4
%endif

	;; Spill the BitBuf pointer: out_buf aliases the same register
	mov	[rsp + bitbuf_mem_offset], bitbuf
	mov	bits, [bitbuf + _m_bits]
	mov	ecx, [bitbuf + _m_bit_count]
	mov	end_ptr, [bitbuf + _m_out_end]
	mov	out_buf, [bitbuf + _m_out_buf]	; clobbers bitbuf

	;; Reserve slop so a full vector store cannot overrun the output,
	;; and bias the input bound so a whole iteration always fits
	sub	end_ptr, VECTOR_SLOP
	sub	in_buf_end, VECTOR_LOOP_PROCESSED
	cmp	ptr, in_buf_end
	jge	.finish		; NOTE(review): signed ptr compare — confirm intended

	;; Prime the pipeline: gather lit/len and dist codes for the first
	;; 8 tokens (vpgatherdd consumes the all-ones mask in ytmp)
	vpcmpeqq	ytmp, ytmp, ytmp
	vmovdqu	datas, [ptr]
	vpand	syms, datas, [lit_mask]
	vpgatherdd codes_lookup1, [hufftables + _lit_len_table + 4 * syms], ytmp

	vpcmpeqq	ytmp, ytmp, ytmp
	vpsrld	dsyms, datas, DIST_OFFSET
	vpand	dsyms, dsyms, [dist_mask]
	vpgatherdd codes_lookup2, [hufftables + _dist_table + 4 * dsyms], ytmp

	;; Move the scalar bit-buffer state into vector registers
	vmovq	ybits %+ x, bits
	vmovq	ybits_count %+ x, rcx
	vmovdqa	yoffset_mask, [offset_mask]

.main_loop:
	;; Sets codes1 to contain lit/len codes and code_lens1 the corresponding lengths
	vpsrld	code_lens1, codes_lookup1, 24
	vpand	codes1, codes_lookup1, [lit_icr_mask]

	;; Sets codes2 to contain dist codes, code_lens2 the corresponding lengths,
	;; and code_lens3 the extra bit counts
	vpblendw	codes2, ybits, codes_lookup2, 0x55 ;Bits 8 and above of ybits are 0
	vpsrld	code_lens2, codes_lookup2, 24
	vpsrld	code_lens3, codes_lookup2, 16
	vpand	code_lens3, [eb_icr_mask]

	;; Set codes3 to contain the extra bits
	vpsrld	codes3, datas, EXTRA_BITS_OFFSET

	cmp	out_buf, end_ptr
	ja	.main_loop_exit

	;; Start code lookups for next iteration
	add	ptr, VECTOR_SIZE
	vpcmpeqq	ytmp, ytmp, ytmp
	vmovdqu	datas, [ptr]
	vpand	syms, datas, [lit_mask]
	vpgatherdd codes_lookup1, [hufftables + _lit_len_table + 4 * syms], ytmp

	vpcmpeqq	ytmp, ytmp, ytmp
	vpsrld	dsyms, datas, DIST_OFFSET
	vpand	dsyms, dsyms, [dist_mask]
	vpgatherdd codes_lookup2, [hufftables + _dist_table + 4 * dsyms], ytmp

	;; Merge dist code with extra bits
	vpsllvd	codes3, codes3, code_lens2
	vpxor	codes2, codes2, codes3
	vpaddd	code_lens2, code_lens2, code_lens3

	;; Check for long codes: if any lane's combined bit length exceeds
	;; max_write_d, fall back to the scalar emit path
	vpaddd	code_lens3, code_lens1, code_lens2
	vpcmpgtd	ytmp, code_lens3, [max_write_d]
	vptest	ytmp, ytmp
	jnz	.long_codes

	;; Merge dist and len codes
	vpsllvd	codes2, codes2, code_lens1
	vpxor	codes1, codes1, codes2

	;; Split buffer data into qwords, ytmp is 0 after last branch
	vpblendd	codes3, ytmp, codes1, 0x55
	vpsrlq	codes1, codes1, 32
	vpsrlq	code_lens1, code_lens3, 32
	vpblendd	code_lens3, ytmp, code_lens3, 0x55

	;; Merge bitbuf bits
	vpsllvq	codes3, codes3, ybits_count
	vpxor	codes3, codes3, ybits
	vpaddq	code_lens3, code_lens3, ybits_count

	;; Merge two symbols into qwords
	vpsllvq	codes1, codes1, code_lens3
	vpxor	codes1, codes1, codes3
	vpaddq	code_lens1, code_lens1, code_lens3

	;; Split buffer data into dqwords, ytmp is 0 after last branch
	vpblendd	codes2, ytmp, codes1, 0x33
	vpblendd	code_lens2, ytmp, code_lens1, 0x33
	vpsrldq	codes1, 8
	vpsrldq	code_lens1, 8

	;; Bit align dqwords
	vpaddq	code_lens1, code_lens1, code_lens2
	vpand	ybits_count, code_lens1, yoffset_mask ;Extra bits
	vpermq	ybits_count, ybits_count, 0xcf
	vpaddq	code_lens2, ybits_count
	vpsllvq	codes2, codes2, ybits_count

	;; Merge two qwords into dqwords
	vmovdqa	ytmp, [q_64]
	vpsubq	code_lens3, ytmp, code_lens2
	vpsrlvq	codes3, codes1, code_lens3
	vpslldq	codes3, codes3, 8

	vpsllvq	codes1, codes1, code_lens2

	vpxor	codes1, codes1, codes3
	vpxor	codes1, codes1, codes2

	vmovq	tmp, code_lens1 %+ x 	;Number of bytes
	shr	tmp, 3

	;; Extract last bytes
	vpaddq	code_lens2, code_lens1, ybits_count
	vpsrlq	code_lens2, code_lens2, 3
	vpshufb	codes2, codes1, code_lens2
	vpand	codes2, codes2, [bytes_mask]
	vextracti128	ybits %+ x, codes2, 1

	;; Check for short codes
	vptest	code_lens2, [min_write_mask]
	jz	.short_codes
.short_codes_next:

	vpermq	codes2, codes2, 0x45
	vpor	codes1, codes1, codes2

	;; bit shift upper dqword combined bits to line up with lower dqword
	vextracti128	code_lens2 %+ x, code_lens1, 1

	; Write out lower dqword of combined bits
	vmovdqu	[out_buf], codes1
	vpaddq	code_lens1, code_lens1, code_lens2

	vmovq	tmp2, code_lens1 %+ x	;Number of bytes
	shr	tmp2, 3
	vpand	ybits_count, code_lens1, yoffset_mask ;Extra bits

	; Write out upper dqword of combined bits
	vextracti128	[out_buf + tmp], codes1, 1
	add	out_buf, tmp2

	cmp	ptr, in_buf_end
	jbe	.main_loop

.main_loop_exit:
	;; Move vector bit-buffer state back to the scalar registers
	vmovq	rcx, ybits_count %+ x
	vmovq	bits, ybits %+ x
	jmp	.finish

.short_codes:
	;; Merge last bytes when the second dqword contains less than a byte
	vpor	ybits %+ x, codes2 %+ x
	jmp	.short_codes_next

.long_codes:
	;; Slow path: some lane's merged code is too long for the packed
	;; vector write, so the 8 gathered tokens are re-merged per-qword
	;; and emitted one token at a time through the scalar bit buffer.
	add	end_ptr, VECTOR_SLOP
	sub	ptr, VECTOR_SIZE

	vpxor	ytmp, ytmp, ytmp
	vpblendd	codes3, ytmp, codes1, 0x55
	vpblendd	code_lens3, ytmp, code_lens1, 0x55
	vpblendd	codes4, ytmp, codes2, 0x55

	vpsllvq	codes4, codes4, code_lens3
	vpxor	codes3, codes3, codes4
	vpaddd	code_lens3, code_lens1, code_lens2

	vpsrlq	codes1, codes1, 32
	vpsrlq	code_lens1, code_lens1, 32
	vpsrlq	codes2, codes2, 32

	vpsllvq	codes2, codes2, code_lens1
	vpxor	codes1, codes1, codes2

	vpsrlq	code_lens1, code_lens3, 32
	vpblendd	code_lens3, ytmp, code_lens3, 0x55

	;; Merge bitbuf bits
	vpsllvq	codes3, codes3, ybits_count
	vpxor	codes3, codes3, ybits
	vpaddq	code_lens3, code_lens3, ybits_count
	vpaddq	code_lens1, code_lens1, code_lens3

	;; Scalar bit buffer restarts empty; its pending bits were folded
	;; into the even-lane qwords above
	xor	bits, bits
	xor	rcx, rcx
	vpsubq	code_lens1, code_lens1, code_lens3
%rep 2
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	cmp	out_buf, end_ptr
	ja	.overflow
	;; insert LL code
	vmovq	sym, codes3 %+ x
	vmovq	tmp2, code_lens3 %+ x
	SHLX	sym, sym, rcx
	or	bits, sym
	add	rcx, tmp2

	; empty bits
	mov	[out_buf], bits
	mov	tmp, rcx
	shr	tmp, 3		; byte count
	add	out_buf, tmp
	mov	tmp, rcx
	and	rcx, ~7
	SHRX	bits, bits, rcx
	mov	rcx, tmp
	and	rcx, 7
	add	ptr, 4

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	cmp	out_buf, end_ptr
	ja	.overflow
	;; insert LL code
	vmovq	sym, codes1 %+ x
	vmovq	tmp2, code_lens1 %+ x
	SHLX	sym, sym, rcx
	or	bits, sym
	add	rcx, tmp2

	; empty bits
	mov	[out_buf], bits
	mov	tmp, rcx
	shr	tmp, 3		; byte count
	add	out_buf, tmp
	mov	tmp, rcx
	and	rcx, ~7
	SHRX	bits, bits, rcx
	mov	rcx, tmp
	and	rcx, 7
	add	ptr, 4

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	cmp	out_buf, end_ptr
	ja	.overflow
	;; insert LL code
	vpextrq	sym, codes3 %+ x, 1
	vpextrq	tmp2, code_lens3 %+ x, 1
	SHLX	sym, sym, rcx
	or	bits, sym
	add	rcx, tmp2

	; empty bits
	mov	[out_buf], bits
	mov	tmp, rcx
	shr	tmp, 3		; byte count
	add	out_buf, tmp
	mov	tmp, rcx
	and	rcx, ~7
	SHRX	bits, bits, rcx
	mov	rcx, tmp
	and	rcx, 7
	add	ptr, 4

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	cmp	out_buf, end_ptr
	ja	.overflow
	;; insert LL code
	vpextrq	sym, codes1 %+ x, 1
	vpextrq	tmp2, code_lens1 %+ x, 1
	SHLX	sym, sym, rcx
	or	bits, sym
	add	rcx, tmp2

	; empty bits
	mov	[out_buf], bits
	mov	tmp, rcx
	shr	tmp, 3		; byte count
	add	out_buf, tmp
	mov	tmp, rcx
	and	rcx, ~7
	SHRX	bits, bits, rcx
	mov	rcx, tmp
	and	rcx, 7
	add	ptr, 4

	;; Move to the upper 128-bit lane for the second %rep pass
	vextracti128 codes3 %+ x, codes3, 1
	vextracti128 code_lens3 %+ x, code_lens3, 1
	vextracti128 codes1 %+ x, codes1, 1
	vextracti128 code_lens1 %+ x, code_lens1, 1
%endrep
	sub	end_ptr, VECTOR_SLOP

	;; Re-seed the vector bit-buffer state and resume the fast loop
	vmovq	ybits %+ x, bits
	vmovq	ybits_count %+ x, rcx
	cmp	ptr, in_buf_end
	jbe	.main_loop

.finish:
	;; Undo the loop-bound biases, then encode remaining tokens scalar
	add	in_buf_end, VECTOR_LOOP_PROCESSED
	add	end_ptr, VECTOR_SLOP

	cmp	ptr, in_buf_end
	jge	.overflow	; NOTE(review): signed ptr compare — confirm intended

.finish_loop:
	mov	DWORD(data), [ptr]

	cmp	out_buf, end_ptr
	ja	.overflow

	mov	sym, data
	and	sym, LIT_MASK	; sym has ll_code
	mov	DWORD(sym), [hufftables + _lit_len_table + sym * 4]

	; look up dist sym
	mov	dsym, data
	shr	dsym, DIST_OFFSET
	and	dsym, DIST_MASK
	mov	DWORD(dsym), [hufftables + _dist_table + dsym * 4]

	; insert LL code
	; sym: 31:24 length; 23:0 code
	mov	tmp2, sym
	and	sym, 0xFFFFFF
	SHLX	sym, sym, rcx
	shr	tmp2, 24
	or	bits, sym
	add	rcx, tmp2

	; insert dist code
	movzx	tmp, WORD(dsym)
	SHLX	tmp, tmp, rcx
	or	bits, tmp
	mov	tmp, dsym
	shr	tmp, 24
	add	rcx, tmp

	; insert dist extra bits
	shr	data, EXTRA_BITS_OFFSET
	add	ptr, 4
	SHLX	data, data, rcx
	or	bits, data
	shr	dsym, 16
	and	dsym, 0xFF
	add	rcx, dsym

	; empty bits: flush whole bytes, keep 0-7 residual bits in `bits`
	mov	[out_buf], bits
	mov	tmp, rcx
	shr	tmp, 3		; byte count
	add	out_buf, tmp
	mov	tmp, rcx
	and	rcx, ~7
	SHRX	bits, bits, rcx
	mov	rcx, tmp
	and	rcx, 7

	cmp	ptr, in_buf_end
	jb	.finish_loop

.overflow:
	;; Write the bit buffer state back and return the token cursor
	mov	tmp, [rsp + bitbuf_mem_offset]
	mov	[tmp + _m_bits], bits
	mov	[tmp + _m_bit_count], ecx
	mov	[tmp + _m_out_buf], out_buf

	mov	rax, ptr

	FUNC_RESTORE

	ret


section .data
	align 32
; Per-dword-lane limits on the merged code length used by the long-code
; check (values 0x1c..0x20 — presumably chosen so the packed write fits;
; TODO confirm derivation against the bit-packing math above)
max_write_d:
	dd	0x1c, 0x1d, 0x1f, 0x20, 0x1c, 0x1d, 0x1f, 0x20
; vptest mask selecting the byte checked by the short-code path
min_write_mask:
	dq	0x00, 0x00, 0xff, 0x00
; Low-qword mask (7) used to keep the 0-7 residual bit count
offset_mask:
	dq	0x0000000000000007, 0x0000000000000000
	dq	0x0000000000000000, 0x0000000000000000
; Constant 64 per 128-bit lane, for computing complementary shifts
q_64:
	dq	0x0000000000000040, 0x0000000000000000
	dq	0x0000000000000040, 0x0000000000000000
; Broadcast lit/len and dist symbol extraction masks (see token format)
lit_mask:
	dd LIT_MASK, LIT_MASK, LIT_MASK, LIT_MASK
	dd LIT_MASK, LIT_MASK, LIT_MASK, LIT_MASK
dist_mask:
	dd DIST_MASK, DIST_MASK, DIST_MASK, DIST_MASK
	dd DIST_MASK, DIST_MASK, DIST_MASK, DIST_MASK
; Keep code bits 23:0 of a lit/len table entry (bits 31:24 are the length)
lit_icr_mask:
	dd 0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF
	dd 0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF
; Keep the extra-bit count byte of a dist table entry
eb_icr_mask:
	dd 0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF
	dd 0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF
; Keep the low byte of each 128-bit lane (last-byte extraction)
bytes_mask:
	dq	0x00000000000000ff, 0x0000000000000000
	dq	0x00000000000000ff, 0x0000000000000000
581