;; xref: /isa-l/igzip/igzip_body.asm (revision 1500db751d08b6c4ad6097135fe78259540a2807)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2016 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%include "options.asm"

%include "lz0a_const.asm"
%include "data_struct2.asm"
%include "bitbuf2.asm"
%include "huffman.asm"
%include "igzip_compare_types.asm"
%include "reg_sizes.asm"

%include "stdmac.asm"

%define LARGE_MATCH_HASH_REP 1  ; Hash 4 * LARGE_MATCH_HASH_REP elements
%define LARGE_MATCH_MIN 264 	; Minimum match size to enter large match emit loop
%define MIN_INBUF_PADDING 16
%define MAX_EMIT_SIZE 258 * 16
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Register role map.  Several logical names alias the same physical
;; register; aliases sharing a register are never live simultaneously.

%define	tmp2		rcx
%define	hash2		rcx

%define	curr_data	rax
%define	code		rax
%define	tmp5		rax

%define	tmp4		rbx
%define	dist		rbx
%define	code2		rbx
%define hmask1		rbx

%define	hash		rdx
%define	len		rdx
%define	code_len3	rdx
%define	tmp8		rdx

%define	tmp1		rsi
%define	code_len2	rsi

%define	file_start	rdi

%define	m_bit_count	rbp

%define	curr_data2	r8
%define	len2		r8
%define	tmp6		r8
%define	f_end_i		r8

%define	m_bits		r9

%define	f_i		r10

%define	m_out_buf	r11

%define	dist2		r12
%define	tmp7		r12
%define	code4		r12

%define	tmp3		r13
%define	code3		r13

%define	stream		r14

%define	hufftables	r15

;; GPR r8 & r15 can be used
;; NOTE(review): r15 is also assigned to hufftables above -- confirm this
;; note is still current.

%define xtmp0		xmm0	; tmp
%define xtmp1		xmm1	; tmp
%define	xhash		xmm2
%define	xmask		xmm3
%define	xdata		xmm4

%define ytmp0		ymm0	; tmp
%define ytmp1		ymm1	; tmp

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;


;; Local stack-frame layout (offsets from rsp after the prologue).
blen_mem_offset     equ  0	 ; local variable (8 bytes)
f_end_i_mem_offset  equ  8
inbuf_slop_offset    equ 16
gpr_save_mem_offset equ 32       ; gpr save area (8*8 bytes)
xmm_save_mem_offset equ 32 + 8*8 ; xmm save area (4*16 bytes) (16 byte aligned)
stack_size          equ 4*8 + 8*8 + 4*16 + 8
;;; 8 because stack address is odd multiple of 8 after a function call and
;;; we want it aligned to 16 bytes

;; Defines to generate functions for different architecture
%xdefine ARCH 01
%xdefine ARCH1 02
%xdefine ARCH2 04

%ifndef COMPARE_TYPE
%xdefine COMPARE_TYPE_NOT_DEF
%xdefine COMPARE_TYPE 1
%xdefine COMPARE_TYPE1 2
%xdefine COMPARE_TYPE2 3
%endif

%rep 3
%if ARCH == 04
%define USE_HSWNI
%endif

[bits 64]
default rel
section .text

;-----------------------------------------------------------------------
; void isal_deflate_body ( isal_zstream *stream )
; arg 1: rcx: addr of stream
;
; Deflate body loop: hashes input bytes, probes the head table for
; matches, and writes literal / len-dist huffman codes into the
; stream's bitbuf until input or output space is exhausted.
; On elf64 the SysV argument (rdi) is moved into rcx so the body below
; is shared with the Win64 convention.  Callee-saved GPRs are spilled
; to the local frame; state is written back at .output_end.
; One copy of this function is emitted per ARCH by the enclosing %rep.
;-----------------------------------------------------------------------
global isal_deflate_body_ %+ ARCH
isal_deflate_body_ %+ ARCH %+ :
	endbranch
%ifidn __OUTPUT_FORMAT__, elf64
	mov	rcx, rdi
%endif

	;; do nothing if (avail_in == 0)
	cmp	dword [rcx + _avail_in], 0
	jne	.skip1

	;; Set stream's next state
	mov	rdx, ZSTATE_FLUSH_READ_BUFFER
	mov	rax, ZSTATE_BODY
	cmp	word [rcx + _end_of_stream], 0
	cmovne	rax, rdx
	cmp	word [rcx + _flush], _NO_FLUSH
	cmovne	rax, rdx
	mov	dword [rcx + _internal_state_state], eax
	ret
.skip1:

%ifdef ALIGN_STACK
	push	rbp
	mov	rbp, rsp
	sub	rsp, stack_size
	and	rsp, ~15
%else
	sub	rsp, stack_size
%endif

	mov [rsp + gpr_save_mem_offset + 0*8], rbx
	mov [rsp + gpr_save_mem_offset + 1*8], rsi
	mov [rsp + gpr_save_mem_offset + 2*8], rdi
	mov [rsp + gpr_save_mem_offset + 3*8], rbp
	mov [rsp + gpr_save_mem_offset + 4*8], r12
	mov [rsp + gpr_save_mem_offset + 5*8], r13
	mov [rsp + gpr_save_mem_offset + 6*8], r14
	mov [rsp + gpr_save_mem_offset + 7*8], r15

	mov	stream, rcx
	mov	byte [stream + _internal_state_has_eob], 0

	MOVD	xmask, [stream + _internal_state_hash_mask]
	PSHUFD	xmask, xmask, 0

	; state->bitbuf.set_buf(stream->next_out, stream->avail_out);
	mov	m_out_buf, [stream + _next_out]
	mov	[stream + _internal_state_bitbuf_m_out_start], m_out_buf
	mov	tmp1 %+ d, [stream + _avail_out]
	add	tmp1, m_out_buf
	sub	tmp1, SLOP

	mov	[stream + _internal_state_bitbuf_m_out_end], tmp1

	mov	m_bits,           [stream + _internal_state_bitbuf_m_bits]
	mov	m_bit_count %+ d, [stream + _internal_state_bitbuf_m_bit_count]
	mov	hufftables, [stream + _hufftables]

	mov	file_start, [stream + _next_in]

	mov	f_i %+ d, dword [stream + _total_in]
	sub	file_start, f_i

	mov	f_end_i %+ d, [stream + _avail_in]
	add	f_end_i, f_i

	;; Choose input padding: small MIN_INBUF_PADDING when the caller
	;; will flush / end the stream, otherwise the full look-ahead LA.
	mov	qword [rsp + inbuf_slop_offset], MIN_INBUF_PADDING
	cmp	byte [stream + _end_of_stream], 0
	jnz	.default_inbuf_padding
	cmp	byte [stream + _flush], 0
	jnz	.default_inbuf_padding
	mov	qword [rsp + inbuf_slop_offset], LA
.default_inbuf_padding:

	; f_end_i -= INBUF_PADDING;
	sub	f_end_i, [rsp + inbuf_slop_offset]
	mov	[rsp + f_end_i_mem_offset], f_end_i
	; if (f_end_i <= 0) continue;

	cmp	f_end_i, f_i
	jle	.input_end

	MOVD	hmask1 %+ d, xmask
	; for (f_i = f_start_i; f_i < f_end_i; f_i++) {
	MOVDQU	xdata, [file_start + f_i]
	mov	curr_data, [file_start + f_i]
	mov	tmp3, curr_data
	mov	tmp6, curr_data

	compute_hash	hash, curr_data

	shr	tmp3, 8
	compute_hash	hash2, tmp3

	and	hash %+ d, hmask1 %+ d
	and	hash2 %+ d, hmask1 %+ d

	cmp	byte [stream + _internal_state_has_hist], IGZIP_NO_HIST
	je	.write_first_byte

	jmp	.loop2
	align	16

.loop2:
	mov	tmp3 %+ d, dword [stream + _internal_state_dist_mask]

	; if (state->bitbuf.is_full()) {
	cmp	m_out_buf, [stream + _internal_state_bitbuf_m_out_end]
	ja	.output_end

	xor	dist, dist
	xor	dist2, dist2

	lea	tmp1, [file_start + f_i]

	mov	dist %+ w, f_i %+ w
	dec	dist
	sub	dist %+ w, word [stream + _internal_state_head + 2 * hash]
	mov	[stream + _internal_state_head + 2 * hash], f_i %+ w

	inc	f_i

	MOVQ	tmp6, xdata
	shr	tmp5, 16
	mov	tmp8, tmp5
	compute_hash	tmp6, tmp5

	mov	dist2 %+ w, f_i %+ w
	dec	dist2
	sub	dist2 %+ w, word [stream + _internal_state_head + 2 * hash2]
	mov	[stream + _internal_state_head + 2 * hash2], f_i %+ w

	; if ((dist-1) < (D-1)) {
	and	dist, tmp3
	neg	dist

	shr	tmp8, 8
	compute_hash	tmp2, tmp8

	and	dist2, tmp3
	neg	dist2

	;; Check for long len/dist match (>7) with first literal
	MOVQ	len, xdata
	mov	curr_data, len
	PSRLDQ	xdata, 1
	xor	len, [tmp1 + dist - 1]
	jz	.compare_loop

	MOVD	xhash, tmp6 %+ d
	PINSRD	xhash, tmp2 %+ d, 1
	PAND	xhash, xhash, xmask

	;; Check for len/dist match (>7) with second literal
	MOVQ	len2, xdata
	xor	len2, [tmp1 + dist2]
	jz	.compare_loop2

	;; Speculatively load the code for the first literal
	movzx   tmp1, curr_data %+ b
	get_lit_code    tmp1, code3, rcx, hufftables

	;; Check for len/dist match for first literal
	test    len %+ d, 0xFFFFFFFF
	jz      .len_dist_huffman_pre

	;; Speculatively load the code for the second literal
	shr     curr_data, 8
	and     curr_data, 0xff
	get_lit_code    curr_data, code2, code_len2, hufftables

	SHLX    code2, code2, rcx
	or      code2, code3
	add     code_len2, rcx

	;; Check for len/dist match for second literal
	test    len2 %+ d, 0xFFFFFFFF
	jnz     .write_lit_bits

.len_dist_lit_huffman_pre:
	mov     code_len3, rcx
	bsf	len2, len2
	shr	len2, 3

.len_dist_lit_huffman:
	neg	dist2

%ifndef LONGER_HUFFTABLE
	mov	tmp4, dist2
	get_dist_code	tmp4, code4, code_len2, hufftables ;; clobbers dist, rcx
%else
	get_dist_code	dist2, code4, code_len2, hufftables
%endif
	get_len_code	len2, code, rcx, hufftables ;; rcx is code_len

	MOVD	hmask1 %+ d, xmask

	SHLX	code4, code4, rcx
	or	code4, code
	add	code_len2, rcx

	add	f_i, len2
	neg	len2

	SHLX	code4, code4, code_len3

	MOVQ	tmp5, xdata
	shr	tmp5, 24
	compute_hash	hash2, tmp5
	and	hash2 %+ d, hmask1 %+ d

	or	code4, code3
	add	code_len2, code_len3

	;; Setup for updating hash
	lea	tmp3, [f_i + len2 + 1]	; tmp3 <= k

	mov	tmp6, [rsp + f_end_i_mem_offset]
	cmp	f_i, tmp6
	jge	.len_dist_lit_huffman_finish

	MOVDQU	xdata, [file_start + f_i]
	mov	curr_data, [file_start + f_i]

	MOVD	hash %+ d, xhash
	PEXTRD	tmp6 %+ d, xhash, 1
	mov	[stream + _internal_state_head + 2 * hash], tmp3 %+ w

	compute_hash	hash, curr_data

	add	tmp3,1
	mov	[stream + _internal_state_head + 2 * tmp6], tmp3 %+ w

	add	tmp3, 1
	mov	[stream + _internal_state_head + 2 * hash2], tmp3 %+ w

	write_bits	m_bits, m_bit_count, code4, code_len2, m_out_buf

	mov	curr_data2, curr_data
	shr	curr_data2, 8
	compute_hash	hash2, curr_data2

%ifdef	NO_LIMIT_HASH_UPDATE
.loop3:
	add     tmp3,1
	cmp	tmp3, f_i
	jae	.loop3_done
	mov     tmp6, [file_start + tmp3]
	compute_hash    tmp1, tmp6
	and     tmp1 %+ d, hmask1 %+ d
	; state->head[hash] = k;
	mov     [stream + _internal_state_head + 2 * tmp1], tmp3 %+ w
	jmp      .loop3
.loop3_done:
%endif
	; hash = compute_hash(state->file_start + f_i) & hash_mask;
	and	hash %+ d, hmask1 %+ d
	and	hash2 %+ d, hmask1 %+ d

	; continue
	jmp	.loop2
	;; encode as dist/len
.len_dist_lit_huffman_finish:
	MOVD	hash %+ d, xhash
	PEXTRD	tmp6 %+ d, xhash, 1
	mov	[stream + _internal_state_head + 2 * hash], tmp3 %+ w
	add	tmp3,1
	mov	[stream + _internal_state_head + 2 * tmp6], tmp3 %+ w
	add	tmp3, 1
	mov	[stream + _internal_state_head + 2 * hash2], tmp3 %+ w

	write_bits	m_bits, m_bit_count, code4, code_len2, m_out_buf
	jmp	.input_end

align	16
.len_dist_huffman_pre:
	bsf	len, len
	shr	len, 3

.len_dist_huffman:
	dec	f_i
	neg	dist

	; get_dist_code(dist, &code2, &code_len2);
%ifndef LONGER_HUFFTABLE
	mov tmp3, dist	; since code2 and dist are rbx
	get_dist_code	tmp3, code2, code_len2, hufftables ;; clobbers dist, rcx
%else
	get_dist_code	dist, code2, code_len2, hufftables
%endif
	; get_len_code(len, &code, &code_len);
	get_len_code	len, code, rcx, hufftables ;; rcx is code_len

	; code2 <<= code_len
	; code2 |= code
	; code_len2 += code_len
	SHLX	code4, code2, rcx
	or	code4, code
	add	code_len2, rcx

	;; Setup for updating hash
	lea	tmp3, [f_i + 2]	; tmp3 <= k
	add	f_i, len

	MOVD	hash %+ d, xhash
	PEXTRD	hash2 %+ d, xhash, 1
	mov	[stream + _internal_state_head + 2 * hash], tmp3 %+ w
	add	tmp3,1
	mov	[stream + _internal_state_head + 2 * hash2], tmp3 %+ w

	MOVD	hmask1 %+ d, xmask

	cmp	f_i, [rsp + f_end_i_mem_offset]
	jge	.len_dist_huffman_finish

	MOVDQU	xdata, [file_start + f_i]
	mov	curr_data, [file_start + f_i]
	compute_hash	hash, curr_data

	write_bits	m_bits, m_bit_count, code4, code_len2, m_out_buf

	mov	curr_data2, curr_data
	shr	curr_data2, 8
	compute_hash	hash2, curr_data2

%ifdef	NO_LIMIT_HASH_UPDATE
.loop4:
	add     tmp3,1
	cmp	tmp3, f_i
	jae	.loop4_done
	mov     tmp6, [file_start + tmp3]
	compute_hash    tmp1, tmp6
	and     tmp1 %+ d, hmask1 %+ d
	mov     [stream + _internal_state_head + 2 * tmp1], tmp3 %+ w
	jmp     .loop4
.loop4_done:
%endif

	; hash = compute_hash(state->file_start + f_i) & hash_mask;
	and	hash %+ d, hmask1 %+ d
	and	hash2 %+ d, hmask1 %+ d

	; continue
	jmp	.loop2

.len_dist_huffman_finish:
	write_bits	m_bits, m_bit_count, code4, code_len2, m_out_buf
	jmp	.input_end

align	16
.write_lit_bits:
	PSRLDQ	xdata, 1

	add	f_i, 1
	cmp     f_i, [rsp + f_end_i_mem_offset]
	jge     .write_lit_bits_finish

	MOVQ	curr_data, xdata
	MOVDQU	xdata, [file_start + f_i]

	MOVD	hash %+ d, xhash

	write_bits	m_bits, m_bit_count, code2, code_len2, m_out_buf

	PEXTRD	hash2 %+ d, xhash, 1
	jmp      .loop2

.write_lit_bits_finish:
	write_bits	m_bits, m_bit_count, code2, code_len2, m_out_buf

.input_end:
	;; All usable input consumed: pick the next state (mirrors the
	;; avail_in == 0 fast path at function entry).
	mov	tmp1, ZSTATE_FLUSH_READ_BUFFER
	mov	tmp5, ZSTATE_BODY
	cmp	word [stream + _end_of_stream], 0
	cmovne	tmp5, tmp1
	cmp	word [stream + _flush], _NO_FLUSH
	cmovne	tmp5, tmp1
	mov	dword [stream + _internal_state_state], tmp5 %+ d

.output_end:
	;; update input buffer
	mov	f_end_i, [rsp + f_end_i_mem_offset]
	add	f_end_i, [rsp + inbuf_slop_offset]
	mov	[stream + _total_in], f_i %+ d
	add	file_start, f_i
	mov     [stream + _next_in], file_start
	sub	f_end_i, f_i
	mov     [stream + _avail_in], f_end_i %+ d

	;; update output buffer
	mov	[stream + _next_out], m_out_buf
	sub	m_out_buf, [stream + _internal_state_bitbuf_m_out_start]
	sub	[stream + _avail_out], m_out_buf %+ d
	add	[stream + _total_out], m_out_buf %+ d

	mov	[stream + _internal_state_bitbuf_m_bits], m_bits
	mov	[stream + _internal_state_bitbuf_m_bit_count], m_bit_count %+ d

	mov rbx, [rsp + gpr_save_mem_offset + 0*8]
	mov rsi, [rsp + gpr_save_mem_offset + 1*8]
	mov rdi, [rsp + gpr_save_mem_offset + 2*8]
	mov rbp, [rsp + gpr_save_mem_offset + 3*8]
	mov r12, [rsp + gpr_save_mem_offset + 4*8]
	mov r13, [rsp + gpr_save_mem_offset + 5*8]
	mov r14, [rsp + gpr_save_mem_offset + 6*8]
	mov r15, [rsp + gpr_save_mem_offset + 7*8]

%ifndef ALIGN_STACK
	add	rsp, stack_size
%else
	mov	rsp, rbp
	pop	rbp
%endif
	ret

align	16
.compare_loop:
	MOVD	xhash, tmp6 %+ d
	PINSRD	xhash, tmp2 %+ d, 1
	PAND	xhash, xhash, xmask
	lea	tmp2, [tmp1 + dist - 1]

	mov	len2, [rsp + f_end_i_mem_offset]
	sub	len2, f_i
	add	len2, [rsp + inbuf_slop_offset]
	add	len2, 1
	mov	tmp3,  MAX_EMIT_SIZE
	cmp	len2, tmp3
	cmovg	len2, tmp3

	mov	len, 8
	compare_large	tmp1, tmp2, len, len2, tmp3, ytmp0, ytmp1

	cmp	len, 258
	jle	.len_dist_huffman
	cmp	len, LARGE_MATCH_MIN
	jge	.do_emit
	mov	len, 258
	jmp	.len_dist_huffman

align	16
.compare_loop2:
	lea	tmp2, [tmp1 + dist2]
	add	tmp1, 1

	mov	len, [rsp + f_end_i_mem_offset]
	sub	len, f_i
	add	len, [rsp + inbuf_slop_offset]
	mov	tmp3,  MAX_EMIT_SIZE
	cmp	len, tmp3
	cmovg	len, tmp3

	mov	len2, 8
	compare_large	tmp1, tmp2, len2, len, tmp3, ytmp0, ytmp1

	and	curr_data, 0xff
	get_lit_code	curr_data, code3, code_len3, hufftables
	cmp	len2, 258
	jle	.len_dist_lit_huffman
	cmp	len2, LARGE_MATCH_MIN
	jge	.do_emit2
	mov	len2, 258
	jmp	.len_dist_lit_huffman

align	16
.do_emit2:
	neg	dist2

	; get_dist_code(dist2, &code2, &code_len2);
	get_dist_code	dist2, code2, code_len2, hufftables

	; get_len_code(len, &code, &code_len);
	get_len_code	258, code, rcx, hufftables ;; rcx is code_len

	; code2 <<= code_len
	; code2 |= code
	; code_len2 += code_len
	SHLX	code4, code2, rcx
	or	code4, code
	add	code_len2, rcx
	mov	tmp5, rcx

	mov	rcx, code_len3
	SHLX	tmp8, code4, rcx
	or	code3, tmp8
	add	rcx, code_len2
	mov	code_len3, rcx

	write_bits	m_bits, m_bit_count, code3, code_len3, m_out_buf

	lea	tmp3, [f_i + 2]	; tmp3 <= k
	MOVD	tmp2 %+ d, xhash
	mov	[stream + _internal_state_head + 2 * tmp2], tmp3 %+ w
	add	tmp3,1
	PEXTRD	tmp2 %+ d, xhash, 1
	mov	[stream + _internal_state_head + 2 * tmp2], tmp3 %+ w

	add	f_i, 258
	lea	len, [len2 - 258]

	jmp	.emit_loop

.do_emit:
	dec	f_i
	neg	dist

	; get_dist_code(dist, &code2, &code_len2);
%ifndef LONGER_HUFFTABLE
	mov tmp3, dist	; since code2 and dist are rbx
	get_dist_code	tmp3, code2, code_len2, hufftables ;; clobbers dist, rcx
%else
	get_dist_code	dist, code2, code_len2, hufftables
%endif
	; get_len_code(len, &code, &code_len);
	get_len_code	258, code, rcx, hufftables ;; rcx is code_len

	; code2 <<= code_len
	; code2 |= code
	; code_len2 += code_len
	SHLX	code4, code2, rcx
	or	code4, code
	add	code_len2, rcx

	lea	tmp3, [f_i + 2]	; tmp3 <= k
	MOVD	tmp6 %+ d, xhash
	PEXTRD	tmp5 %+ d, xhash, 1
	mov	[stream + _internal_state_head + 2 * tmp6], tmp3 %+ w
	add	tmp3,1
	mov	[stream + _internal_state_head + 2 * tmp5], tmp3 %+ w
	mov	tmp5, rcx

.emit:
	add	f_i, 258
	sub	len, 258
	mov	code3, code4

	write_bits	m_bits, m_bit_count, code3, code_len2, m_out_buf

.emit_loop:
	cmp	m_out_buf, [stream + _internal_state_bitbuf_m_out_end]
	ja	.output_end
	cmp	len, LARGE_MATCH_MIN
	jge	.emit

	mov	len2, 258
	cmp	len, len2
	cmovg	len, len2

	add	f_i, len

	sub	code_len2, tmp5
	get_len_code	len, code, rcx, hufftables
	SHLX	code4, code2, rcx
	or	code4, code
	add	code_len2, rcx

	write_bits	m_bits, m_bit_count, code4, code_len2, m_out_buf

	cmp	f_i, [rsp + f_end_i_mem_offset]
	jge	.input_end

	;; Refresh head-table entries for the tail of the large match.
	lea	tmp7, [f_i - 4 * LARGE_MATCH_HASH_REP]
	MOVD	hmask1 %+ d, xmask
%rep LARGE_MATCH_HASH_REP
	mov	curr_data %+ d, dword [file_start + tmp7]
	mov	curr_data2 %+ d, dword [file_start + tmp7 + 1]

	compute_hash	hash, curr_data
	compute_hash	hash2, curr_data2

	and	hash %+ d, hmask1 %+ d
	and	hash2 %+ d, hmask1 %+ d

	mov	[stream + _internal_state_head + 2 * hash], tmp7 %+ w
	add	tmp7, 1
	mov	[stream + _internal_state_head + 2 * hash2], tmp7 %+ w
	add	tmp7, 1

	mov	curr_data %+ d, dword [file_start + tmp7]
	mov	curr_data2 %+ d, dword [file_start + tmp7 + 1]

	compute_hash	hash, curr_data
	compute_hash	hash2, curr_data2

	and	hash %+ d, hmask1 %+ d
	and	hash2 %+ d, hmask1 %+ d

	mov	[stream + _internal_state_head + 2 * hash], tmp7 %+ w
	add	tmp7, 1
	mov	[stream + _internal_state_head + 2 * hash2], tmp7 %+ w
%if (LARGE_MATCH_HASH_REP > 1)
	add	tmp7, 1
%endif
%endrep

	MOVDQU	xdata, [file_start + f_i]
	mov	curr_data, [file_start + f_i]
	compute_hash	hash, curr_data


	mov	curr_data2, curr_data
	shr	curr_data2, 8
	compute_hash	hash2, curr_data2

	; hash = compute_hash(state->file_start + f_i) & hash_mask;
	and	hash %+ d, hmask1 %+ d
	and	hash2 %+ d, hmask1 %+ d

	; continue
	jmp	.loop2

.write_first_byte:
	cmp	m_out_buf, [stream + _internal_state_bitbuf_m_out_end]
	ja	.output_end

	mov	byte [stream + _internal_state_has_hist], IGZIP_HIST

	mov	[stream + _internal_state_head + 2 * hash], f_i %+ w

	mov	hash, hash2
	shr	tmp6, 16
	compute_hash	hash2, tmp6

	MOVD	xhash, hash %+ d
	PINSRD	xhash, hash2 %+ d, 1
	PAND	xhash, xhash, xmask

	and	curr_data, 0xff
	get_lit_code	curr_data, code2, code_len2, hufftables
	jmp	.write_lit_bits

%ifdef USE_HSWNI
%undef USE_HSWNI
%endif

;; Shift defines over in order to iterate over all versions
%undef ARCH
%xdefine ARCH ARCH1
%undef ARCH1
%xdefine ARCH1 ARCH2

%ifdef COMPARE_TYPE_NOT_DEF
%undef COMPARE_TYPE
%xdefine COMPARE_TYPE COMPARE_TYPE1
%undef COMPARE_TYPE1
%xdefine COMPARE_TYPE1 COMPARE_TYPE2
%endif
%endrep

793