xref: /isa-l/igzip/igzip_update_histogram.asm (revision cd888f01a447dd04c3a8b50362079648d432d2ca)
1b1c45175SGreg Tucker;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2b1c45175SGreg Tucker;  Copyright(c) 2011-2018 Intel Corporation All rights reserved.
3b1c45175SGreg Tucker;
4b1c45175SGreg Tucker;  Redistribution and use in source and binary forms, with or without
5b1c45175SGreg Tucker;  modification, are permitted provided that the following conditions
6b1c45175SGreg Tucker;  are met:
7b1c45175SGreg Tucker;    * Redistributions of source code must retain the above copyright
8b1c45175SGreg Tucker;      notice, this list of conditions and the following disclaimer.
9b1c45175SGreg Tucker;    * Redistributions in binary form must reproduce the above copyright
10b1c45175SGreg Tucker;      notice, this list of conditions and the following disclaimer in
11b1c45175SGreg Tucker;      the documentation and/or other materials provided with the
12b1c45175SGreg Tucker;      distribution.
13b1c45175SGreg Tucker;    * Neither the name of Intel Corporation nor the names of its
14b1c45175SGreg Tucker;      contributors may be used to endorse or promote products derived
15b1c45175SGreg Tucker;      from this software without specific prior written permission.
16b1c45175SGreg Tucker;
17b1c45175SGreg Tucker;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18b1c45175SGreg Tucker;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19b1c45175SGreg Tucker;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20b1c45175SGreg Tucker;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21b1c45175SGreg Tucker;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22b1c45175SGreg Tucker;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23b1c45175SGreg Tucker;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24b1c45175SGreg Tucker;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25b1c45175SGreg Tucker;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26b1c45175SGreg Tucker;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27b1c45175SGreg Tucker;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28b1c45175SGreg Tucker;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2931814483SRoy Oursler
3031814483SRoy Oursler%include "options.asm"
3131814483SRoy Oursler
3231814483SRoy Oursler%include "lz0a_const.asm"
3331814483SRoy Oursler%include "data_struct2.asm"
3431814483SRoy Oursler%include "bitbuf2.asm"
3531814483SRoy Oursler%include "huffman.asm"
3631814483SRoy Oursler%include "igzip_compare_types.asm"
3731814483SRoy Oursler%include "reg_sizes.asm"
3831814483SRoy Oursler
3931814483SRoy Oursler%include "stdmac.asm"
4031814483SRoy Oursler
; External lookup table; its address is loaded into rfc_lookup by the function
; below and it is indexed by the len_to_len_code macro.
4131814483SRoy Ourslerextern rfc1951_lookup_table
; Byte offset, from the start of the lookup table, of the per-length code
; bytes read by len_to_len_code.
4231814483SRoy Oursler_len_to_code_offset	equ	0
4331814483SRoy Oursler
4431814483SRoy Oursler%define LAST_BYTES_COUNT	3 ; Bytes to prevent reading out of array bounds
45cf30138cSRoy Oursler%define LA_STATELESS	280	  ; Max number of bytes read in loop2 rounded up to 8 byte boundary
; Number of 8-byte counters in the literal/length region and in the distance
; region of the histogram; together they determine _dist_offset and
; _hash_offset further down.
4631814483SRoy Oursler%define LIT_LEN 286
4731814483SRoy Oursler%define DIST_LEN 30
; Index scale (bytes per counter) used when incrementing histogram counters.
4831814483SRoy Oursler%define HIST_ELEM_SIZE	8
4931814483SRoy Oursler
; MARK <name>
; When DEBUG is defined, declares <name> global and places a label with that
; name at the point of use. Without DEBUG the macro expands to nothing, so
; invocations cost no code in non-debug builds.
5031814483SRoy Oursler%ifdef DEBUG
5131814483SRoy Oursler%macro MARK 1
5231814483SRoy Ourslerglobal %1
5331814483SRoy Oursler%1:
5431814483SRoy Oursler%endm
5531814483SRoy Oursler%else
5631814483SRoy Oursler%macro MARK 1
5731814483SRoy Oursler%endm
5831814483SRoy Oursler%endif
5931814483SRoy Oursler
6031814483SRoy Oursler;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
6131814483SRoy Oursler;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
6231814483SRoy Oursler;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Register aliases used by isal_update_histogram.
; Several names deliberately share one physical register, so only one name of
; each group can be live at a time:
;   rbx : dist, dist_code2
;   r12 : dist2, dist_code
;   rbp : len, len_code, hash3
;   r8  : curr_data2, len2, tmp4
; tmp2 is rcx, which the dist_to_dist_code / dist_to_dist_code2 macros
; overwrite (they use ecx as the bsr result and shift count).
6331814483SRoy Oursler%define	file_start	rdi
6431814483SRoy Oursler%define file_length	rsi
6531814483SRoy Oursler%define	histogram	rdx
6631814483SRoy Oursler%define rfc_lookup	r9
6731814483SRoy Oursler%define	f_i		r10
6831814483SRoy Oursler
6931814483SRoy Oursler%define	curr_data	rax
7031814483SRoy Oursler
7131814483SRoy Oursler%define	tmp2		rcx
7231814483SRoy Oursler
7331814483SRoy Oursler%define	dist		rbx
7431814483SRoy Oursler%define	dist_code2	rbx
7531814483SRoy Oursler
7631814483SRoy Oursler%define	dist2		r12
7731814483SRoy Oursler%define	dist_code	r12
7831814483SRoy Oursler
7931814483SRoy Oursler%define	len		rbp
8031814483SRoy Oursler%define	len_code	rbp
8131814483SRoy Oursler%define	hash3		rbp
8231814483SRoy Oursler
8331814483SRoy Oursler%define	curr_data2	r8
8431814483SRoy Oursler%define	len2		r8
854d1fe78bSRoy Oursler%define	tmp4		r8
8631814483SRoy Oursler
8731814483SRoy Oursler%define	tmp1		r11
8831814483SRoy Oursler
8931814483SRoy Oursler%define	tmp3		r13
9031814483SRoy Oursler
9131814483SRoy Oursler%define	hash		r14
9231814483SRoy Oursler
9331814483SRoy Oursler%define	hash2		r15
9431814483SRoy Oursler
9531814483SRoy Oursler%define	xtmp0		xmm0
9631814483SRoy Oursler%define	xtmp1		xmm1
9717dac9f6SRoy Oursler%define	xdata		xmm2
9831814483SRoy Oursler
9931814483SRoy Oursler%define	ytmp0		ymm0
10031814483SRoy Oursler%define	ytmp1		ymm1
10131814483SRoy Oursler
; Vector temporaries and their width in bytes: 16-byte xmm registers when
; ARCH == 01, 32-byte ymm registers for every other ARCH value. V_LENGTH is
; the step used by the hash-table clearing loop.
1024d40cd36SRoy Oursler%if(ARCH == 01)
1034d40cd36SRoy Oursler%define	vtmp0	xtmp0
1044d40cd36SRoy Oursler%define	vtmp1	xtmp1
1054d40cd36SRoy Oursler%define	V_LENGTH	16
1064d40cd36SRoy Oursler%else
1074d40cd36SRoy Oursler%define	vtmp0	ytmp0
1084d40cd36SRoy Oursler%define	vtmp1	ytmp1
1094d40cd36SRoy Oursler%define	V_LENGTH	32
1104d40cd36SRoy Oursler%endif
11131814483SRoy Oursler;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
11231814483SRoy Oursler;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
11331814483SRoy Oursler;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Stack frame layout, as offsets from rsp after FUNC_SAVE:
;    0 ..  7   _eob_count_offset    saved counter-256 value (see function body)
;    8 .. 15   f_end_i_mem_offset   reserved; not referenced in the visible code
;   16 .. 79   gpr_save_mem_offset  eight 8-byte GPR slots
;   80 .. 143  xmm_save_mem_offset  reserved; the FUNC_SAVE macros in this file
;                                   store general-purpose registers only
;  144 .. 151  padding
; stack_size = 152, i.e. 8 modulo 16.
11431814483SRoy Oursler_eob_count_offset   equ  0	 ; local variable (8 bytes)
11531814483SRoy Ourslerf_end_i_mem_offset  equ  8
11631814483SRoy Ourslergpr_save_mem_offset equ 16       ; gpr save area (8*8 bytes)
11731814483SRoy Ourslerxmm_save_mem_offset equ 16 + 8*8 ; xmm save area (4*16 bytes) (16 byte aligned)
11831814483SRoy Ourslerstack_size          equ 2*8 + 8*8 + 4*16 + 8
11931814483SRoy Oursler;;; 8 because stack address is odd multiple of 8 after a function call and
12031814483SRoy Oursler;;; we want it aligned to 16 bytes
121d06e14b9SRoy Oursler
; ---- elf64 output: argument registers and prologue/epilogue macros ----
; arg0..arg2 are the first three integer argument registers of the
; System V AMD64 calling convention.
122d06e14b9SRoy Oursler%ifidn __OUTPUT_FORMAT__, elf64
123d06e14b9SRoy Oursler%define arg0	rdi
124d06e14b9SRoy Oursler%define	arg1	rsi
125d06e14b9SRoy Oursler%define arg2	rdx
126d06e14b9SRoy Oursler
; FUNC_SAVE: allocate the stack frame and store rbx, rbp and r12-r15 in the
; GPR save area.
;   ALIGN_STACK defined:   push rbp, use it as a frame pointer, then force rsp
;                          down to a 16-byte boundary.
;   ALIGN_STACK undefined: plain subtraction; alignment then depends on the
;                          caller's rsp, as described by the stack_size comment.
127d06e14b9SRoy Oursler%macro FUNC_SAVE 0
128d06e14b9SRoy Oursler%ifdef ALIGN_STACK
129d06e14b9SRoy Oursler	push	rbp
130d06e14b9SRoy Oursler	mov	rbp, rsp
131d06e14b9SRoy Oursler	sub	rsp, stack_size
132d06e14b9SRoy Oursler	and	rsp, ~15
133d06e14b9SRoy Oursler%else
134d06e14b9SRoy Oursler	sub	rsp, stack_size
135d06e14b9SRoy Oursler%endif
136d06e14b9SRoy Oursler
137d06e14b9SRoy Oursler	mov [rsp + gpr_save_mem_offset + 0*8], rbx
	;; Under ALIGN_STACK rbp already holds the frame pointer here, so this
	;; slot keeps the frame pointer (the caller's rbp is on the stack from
	;; the push above). Otherwise it keeps the caller's rbp.
138d06e14b9SRoy Oursler	mov [rsp + gpr_save_mem_offset + 1*8], rbp
139d06e14b9SRoy Oursler	mov [rsp + gpr_save_mem_offset + 2*8], r12
140d06e14b9SRoy Oursler	mov [rsp + gpr_save_mem_offset + 3*8], r13
141d06e14b9SRoy Oursler	mov [rsp + gpr_save_mem_offset + 4*8], r14
142d06e14b9SRoy Oursler	mov [rsp + gpr_save_mem_offset + 5*8], r15
143d06e14b9SRoy Oursler%endm
144d06e14b9SRoy Oursler
; FUNC_RESTORE: reload the registers stored by FUNC_SAVE and release the frame.
; rsp must have the value it had right after FUNC_SAVE.
145d06e14b9SRoy Oursler%macro FUNC_RESTORE 0
146d06e14b9SRoy Oursler	mov	rbx, [rsp + gpr_save_mem_offset + 0*8]
	;; rbp is used as a scratch register (len / len_code / hash3) by the
	;; function body, so it has to be reloaded before it is used to
	;; restore rsp in the ALIGN_STACK case below.
147d06e14b9SRoy Oursler	mov	rbp, [rsp + gpr_save_mem_offset + 1*8]
148d06e14b9SRoy Oursler	mov	r12, [rsp + gpr_save_mem_offset + 2*8]
149d06e14b9SRoy Oursler	mov	r13, [rsp + gpr_save_mem_offset + 3*8]
150d06e14b9SRoy Oursler	mov	r14, [rsp + gpr_save_mem_offset + 4*8]
151d06e14b9SRoy Oursler	mov	r15, [rsp + gpr_save_mem_offset + 5*8]
152d06e14b9SRoy Oursler
153d06e14b9SRoy Oursler%ifndef ALIGN_STACK
154d06e14b9SRoy Oursler	add	rsp, stack_size
155d06e14b9SRoy Oursler%else
156d06e14b9SRoy Oursler	mov	rsp, rbp
157d06e14b9SRoy Oursler	pop	rbp
158d06e14b9SRoy Oursler%endif
159d06e14b9SRoy Oursler%endm
160d06e14b9SRoy Oursler%endif
161d06e14b9SRoy Oursler
; ---- win64 output: argument registers and prologue/epilogue macros ----
; arg0..arg2 are the first three integer argument registers of the
; Microsoft x64 calling convention.
162d06e14b9SRoy Oursler%ifidn __OUTPUT_FORMAT__, win64
163d06e14b9SRoy Oursler%define arg0	rcx
164d06e14b9SRoy Oursler%define	arg1	rdx
165d06e14b9SRoy Oursler%define	arg2	r8
166d06e14b9SRoy Oursler
; FUNC_SAVE: allocate the stack frame and store rbx, rsi, rdi, rbp and r12-r15
; in the GPR save area (all eight slots are used). rsi and rdi are saved here,
; unlike in the elf64 variant, because the function body overwrites them
; (file_length / file_start). No xmm registers are stored.
;   ALIGN_STACK defined:   push rbp, use it as a frame pointer, then force rsp
;                          down to a 16-byte boundary.
;   ALIGN_STACK undefined: plain subtraction; alignment then depends on the
;                          caller's rsp, as described by the stack_size comment.
167d06e14b9SRoy Oursler%macro FUNC_SAVE 0
168d06e14b9SRoy Oursler%ifdef ALIGN_STACK
169d06e14b9SRoy Oursler	push	rbp
170d06e14b9SRoy Oursler	mov	rbp, rsp
171d06e14b9SRoy Oursler	sub	rsp, stack_size
172d06e14b9SRoy Oursler	and	rsp, ~15
173d06e14b9SRoy Oursler%else
174d06e14b9SRoy Oursler	sub	rsp, stack_size
175d06e14b9SRoy Oursler%endif
176d06e14b9SRoy Oursler
177d06e14b9SRoy Oursler	mov [rsp + gpr_save_mem_offset + 0*8], rbx
178d06e14b9SRoy Oursler	mov [rsp + gpr_save_mem_offset + 1*8], rsi
179d06e14b9SRoy Oursler	mov [rsp + gpr_save_mem_offset + 2*8], rdi
	;; Under ALIGN_STACK rbp already holds the frame pointer here, so this
	;; slot keeps the frame pointer (the caller's rbp is on the stack from
	;; the push above). Otherwise it keeps the caller's rbp.
180d06e14b9SRoy Oursler	mov [rsp + gpr_save_mem_offset + 3*8], rbp
181d06e14b9SRoy Oursler	mov [rsp + gpr_save_mem_offset + 4*8], r12
182d06e14b9SRoy Oursler	mov [rsp + gpr_save_mem_offset + 5*8], r13
183d06e14b9SRoy Oursler	mov [rsp + gpr_save_mem_offset + 6*8], r14
184d06e14b9SRoy Oursler	mov [rsp + gpr_save_mem_offset + 7*8], r15
185d06e14b9SRoy Oursler%endm
186d06e14b9SRoy Oursler
; FUNC_RESTORE: reload the registers stored by FUNC_SAVE and release the frame.
; rsp must have the value it had right after FUNC_SAVE.
187d06e14b9SRoy Oursler%macro FUNC_RESTORE 0
188d06e14b9SRoy Oursler	mov	rbx, [rsp + gpr_save_mem_offset + 0*8]
189d06e14b9SRoy Oursler	mov	rsi, [rsp + gpr_save_mem_offset + 1*8]
190d06e14b9SRoy Oursler	mov	rdi, [rsp + gpr_save_mem_offset + 2*8]
	;; rbp is used as a scratch register (len / len_code / hash3) by the
	;; function body, so it has to be reloaded before it is used to
	;; restore rsp in the ALIGN_STACK case below.
191d06e14b9SRoy Oursler	mov	rbp, [rsp + gpr_save_mem_offset + 3*8]
192d06e14b9SRoy Oursler	mov	r12, [rsp + gpr_save_mem_offset + 4*8]
193d06e14b9SRoy Oursler	mov	r13, [rsp + gpr_save_mem_offset + 5*8]
194d06e14b9SRoy Oursler	mov	r14, [rsp + gpr_save_mem_offset + 6*8]
195d06e14b9SRoy Oursler	mov	r15, [rsp + gpr_save_mem_offset + 7*8]
196d06e14b9SRoy Oursler
197d06e14b9SRoy Oursler%ifndef ALIGN_STACK
198d06e14b9SRoy Oursler	add	rsp, stack_size
199d06e14b9SRoy Oursler%else
200d06e14b9SRoy Oursler	mov	rsp, rbp
201d06e14b9SRoy Oursler	pop	rbp
202d06e14b9SRoy Oursler%endif
203d06e14b9SRoy Oursler%endm
204d06e14b9SRoy Oursler%endif
205d06e14b9SRoy Oursler
206d06e14b9SRoy Oursler
; Byte offsets of the three regions addressed through the histogram pointer:
;   _lit_len_offset  LIT_LEN  8-byte literal/length counters
;   _dist_offset     DIST_LEN 8-byte distance counters
;   _hash_offset     table of 16-bit entries (accessed with a scale of 2)
;                    holding the low 16 bits of input positions
20731814483SRoy Oursler_lit_len_offset	equ	0
20831814483SRoy Oursler_dist_offset	equ	(8 * LIT_LEN)
20931814483SRoy Oursler_hash_offset	equ	(_dist_offset + 8 * DIST_LEN)
21031814483SRoy Oursler
211d06e14b9SRoy Oursler
; len_to_len_code len_code, len, rfc_lookup
; Turns a match length into a literal/length histogram index:
;   len_code = 0x100 | (byte at rfc_lookup + _len_to_code_offset + len)
; %1 may be the same register as %2 (the callers in this file do that): the
; input is consumed by the load before the output is written.
; Modifies the flags (or).
21231814483SRoy Oursler%macro len_to_len_code 3
21331814483SRoy Oursler%define %%len_code	%1 	; Output
21431814483SRoy Oursler%define	%%len		%2	; Input
21531814483SRoy Oursler%define	%%rfc_lookup	%3
21631814483SRoy Oursler	movzx	%%len_code, byte [%%rfc_lookup + _len_to_code_offset + %%len]
21731814483SRoy Oursler	or	%%len_code, 0x100
21831814483SRoy Oursler%endm
21931814483SRoy Oursler
; dist_to_dist_code dist_code, dist
; Converts a match distance into a distance histogram index.
; With d = dist - 1 and n = (index of the highest set bit of d) - 1:
;   dist_code = (d >> n) + 2 * n     when d > 1
;   dist_code = d                    when d <= 1 (signed compare)
; The second case also covers d == 0, for which bsr does not produce a
; defined result.
; The 32-bit views of both operands are formed by appending "d" to the
; register names (presumably resolved by reg_sizes.asm). SHRX is a macro
; defined outside this file.
22031814483SRoy Oursler;;; Clobbers rcx and dist
22131814483SRoy Oursler%macro	dist_to_dist_code 2
22231814483SRoy Oursler%define %%dist_code	%1	; Output code associated with dist
22331814483SRoy Oursler%define	%%dist_coded	%1d
22431814483SRoy Oursler%define	%%dist		%2d	; Input dist
	;; d = dist - 1
22531814483SRoy Oursler	dec	%%dist
22631814483SRoy Oursler	mov	%%dist_coded, %%dist
	;; ecx = n = position of the highest set bit of d, minus one
22731814483SRoy Oursler	bsr	ecx, %%dist_coded
22831814483SRoy Oursler	dec	ecx
	;; dist_code = (d >> n) + 2 * n
22931814483SRoy Oursler	SHRX	%%dist_code, %%dist_code, rcx
23031814483SRoy Oursler	lea	%%dist_coded, [%%dist_coded + 2*ecx]
23131814483SRoy Oursler
	;; For d <= 1 the code is d itself
23231814483SRoy Oursler	cmp	%%dist, 1
23331814483SRoy Oursler	cmovle	%%dist_coded, %%dist
23431814483SRoy Oursler%endm
23531814483SRoy Oursler
; dist_to_dist_code2 dist_code, dist
; Same mapping as dist_to_dist_code, but the input register already holds
; -(dist - 1), which is the form loop2 keeps its distances in. The macro
; negates it to obtain d = dist - 1 and then computes, with
; n = (index of the highest set bit of d) - 1:
;   dist_code = (d >> n) + 2 * n     when d > 1
;   dist_code = d                    when d <= 1 (signed compare)
; The second case also covers d == 0, for which bsr does not produce a
; defined result. SHRX is a macro defined outside this file.
23631814483SRoy Oursler;;; Clobbers rcx and dist
23731814483SRoy Oursler%macro	dist_to_dist_code2 2
23831814483SRoy Oursler%define	%%dist_code	%1	; Output code associated with dist
23931814483SRoy Oursler%define %%dist_coded	%1d
24031814483SRoy Oursler%define	%%dist		%2d	; Input -(dist - 1)
	;; d = dist - 1
24131814483SRoy Oursler	neg	%%dist
24231814483SRoy Oursler	mov	%%dist_coded, %%dist
	;; ecx = n = position of the highest set bit of d, minus one
24331814483SRoy Oursler	bsr	ecx, %%dist_coded
24431814483SRoy Oursler	dec	ecx
	;; dist_code = (d >> n) + 2 * n
24531814483SRoy Oursler	SHRX	%%dist_code, %%dist_code, rcx
24631814483SRoy Oursler	lea	%%dist_coded, [%%dist_coded + 2*ecx]
24731814483SRoy Oursler
	;; For d <= 1 the code is d itself
24831814483SRoy Oursler	cmp	%%dist, 1
24931814483SRoy Oursler	cmovle	%%dist_coded, %%dist
25031814483SRoy Oursler%endm
25131814483SRoy Oursler
252ede04f0aSGreg Tucker[bits 64]
253ede04f0aSGreg Tuckerdefault rel
254ede04f0aSGreg Tuckersection .text
255ede04f0aSGreg Tucker
;-----------------------------------------------------------------------
; isal_update_histogram_<ARCH>
;
; Arguments (arg0..arg2 are the platform argument registers; they are copied
; into the working registers when the two differ):
;   arg0 -> file_start   pointer to the input bytes
;   arg1 -> file_length  number of input bytes
;   arg2 -> histogram    base of the structure updated here: 8-byte
;                        literal/length counters at _lit_len_offset, 8-byte
;                        distance counters at _dist_offset and a table of
;                        16-bit positions at _hash_offset
;
; Flow:
;   1. Return immediately when file_length is 0 (nothing is modified).
;   2. Read counter 256, add one, park the result on the stack; zero the
;      hash table.
;   3. loop2: examines two input positions per iteration while f_i is below
;      file_length - LA_STATELESS.
;   4. loop2_finish: one position per iteration while f_i is below
;      file_length - LAST_BYTES_COUNT.
;   5. final_bytes_loop: the remaining bytes are counted as literals.
;   6. Counter 256 is overwritten with the value parked in step 2.
;
; Distances inside loop2 are kept as -(distance - 1) so that they can be
; added directly to a pointer; dist_to_dist_code2 expects that form.
;
; compute_hash, compare, compare250, PXOR, MOVDQU, MOVQ, PSRLDQ, SHRX and
; endbranch, and the constants D, SHORTEST_MATCH, LVL0_HASH_MASK and
; IGZIP_LVL0_HASH_SIZE, are not defined in this file (presumably they come
; from the included headers).
;-----------------------------------------------------------------------
25631814483SRoy Oursler; void isal_update_histogram
25731814483SRoy Ourslerglobal isal_update_histogram_ %+ ARCH
25831814483SRoy Ourslerisal_update_histogram_ %+ ARCH %+ :
259*cd888f01SH.J. Lu	endbranch
260d06e14b9SRoy Oursler	FUNC_SAVE
26131814483SRoy Oursler
	;; Copy the arguments into the working registers. On win64 the order
	;; matters: file_length (rsi) must read arg1 (rdx) before histogram
	;; (rdx) is overwritten with arg2 (r8). On elf64 all three already
	;; match and no moves are emitted.
262d06e14b9SRoy Oursler%ifnidn	file_start, arg0
263d06e14b9SRoy Oursler	mov	file_start, arg0
264d06e14b9SRoy Oursler%endif
265d06e14b9SRoy Oursler%ifnidn	file_length, arg1
266d06e14b9SRoy Oursler	mov	file_length, arg1
267d06e14b9SRoy Oursler%endif
268d06e14b9SRoy Oursler%ifnidn	histogram, arg2
269d06e14b9SRoy Oursler	mov	histogram, arg2
27031814483SRoy Oursler%endif
27131814483SRoy Oursler	mov	f_i, 0
272bda088b3SGreg Tucker	cmp	file_length, 0
273bda088b3SGreg Tucker	je	exit_ret	; If nothing to do then exit
27431814483SRoy Oursler
	;; Park (counter 256 + 1) on the stack; it is written back at "end".
27531814483SRoy Oursler	mov	tmp1, qword [histogram + _lit_len_offset + 8*256]
27631814483SRoy Oursler	inc	tmp1
27731814483SRoy Oursler	mov	[rsp + _eob_count_offset], tmp1
27831814483SRoy Oursler
27931814483SRoy Oursler	lea	rfc_lookup, [rfc1951_lookup_table]
28031814483SRoy Oursler
28131814483SRoy Oursler	;; Init hash_table
	;; rcx counts 16-bit entries downwards from the top of the table. Each
	;; pass writes two vectors (V_LENGTH entries in total) of zeros.
282ee2e2bceSRoy Oursler	PXOR	vtmp0, vtmp0, vtmp0
2834ae2d1beSRoy Oursler	mov	rcx, (IGZIP_LVL0_HASH_SIZE - V_LENGTH)
28431814483SRoy Ourslerinit_hash_table:
2854d40cd36SRoy Oursler	MOVDQU	[histogram + _hash_offset + 2 * rcx], vtmp0
2864d40cd36SRoy Oursler	MOVDQU	[histogram + _hash_offset + 2 * (rcx + V_LENGTH / 2)], vtmp0
2874d40cd36SRoy Oursler	sub	rcx, V_LENGTH
28831814483SRoy Oursler	jge	init_hash_table
28931814483SRoy Oursler
	;; From here until end_loop_2, file_length holds the original length
	;; minus LA_STATELESS. It can be zero or negative for short inputs,
	;; which is why the comparisons against it are signed.
29031814483SRoy Oursler	sub	file_length, LA_STATELESS
29131814483SRoy Oursler	cmp	file_length, 0
29231814483SRoy Oursler	jle	end_loop_2
29331814483SRoy Oursler
29431814483SRoy Oursler
29531814483SRoy Oursler	;; Load first literal into histogram
29631814483SRoy Oursler	mov	curr_data, [file_start + f_i]
29731814483SRoy Oursler	compute_hash	hash, curr_data
2984ae2d1beSRoy Oursler	and	hash %+ d, LVL0_HASH_MASK
29931814483SRoy Oursler	mov	[histogram + _hash_offset + 2 * hash], f_i %+ w
30031814483SRoy Oursler	and	curr_data, 0xff
30131814483SRoy Oursler	inc	qword [histogram + _lit_len_offset + HIST_ELEM_SIZE * curr_data]
30231814483SRoy Oursler	inc	f_i
30331814483SRoy Oursler
30431814483SRoy Oursler	;; Setup to begin loop 2
30517dac9f6SRoy Oursler	MOVDQU	xdata, [file_start + f_i]
30631814483SRoy Oursler	mov	curr_data, [file_start + f_i]
30731814483SRoy Oursler	mov	curr_data2, curr_data
30831814483SRoy Oursler	compute_hash	hash, curr_data
30931814483SRoy Oursler	shr	curr_data2, 8
31031814483SRoy Oursler	compute_hash	hash2, curr_data2
31131814483SRoy Oursler
3124ae2d1beSRoy Oursler	and	hash2 %+ d, LVL0_HASH_MASK
3134ae2d1beSRoy Oursler	and	hash, LVL0_HASH_MASK
	;; Loop entry state: xdata holds the 16 bytes at file_start + f_i;
	;; hash and hash2 are the masked hashes of positions f_i and f_i + 1.
31431814483SRoy Ourslerloop2:
31531814483SRoy Oursler	xor	dist, dist
31631814483SRoy Oursler	xor	dist2, dist2
31731814483SRoy Oursler	xor	tmp3, tmp3
31831814483SRoy Oursler
	;; tmp1 = address of the first position (f_i before the increment)
31931814483SRoy Oursler	lea	tmp1, [file_start + f_i]
32031814483SRoy Oursler
	;; curr_data = 8 bytes at the first position; xdata then advances one
	;; byte so that it starts at the second position.
32117dac9f6SRoy Oursler	MOVQ	curr_data, xdata
32217dac9f6SRoy Oursler	PSRLDQ	xdata, 1
32317dac9f6SRoy Oursler
32431814483SRoy Oursler	;; Load possible look back distances and update hash data
	;; dist = (f_i - 1 - previous position stored for hash), 16-bit
32531814483SRoy Oursler	mov	dist %+ w, f_i %+ w
326ee2e2bceSRoy Oursler	sub	dist, 1
32731814483SRoy Oursler	sub	dist %+ w, word [histogram + _hash_offset + 2 * hash]
32831814483SRoy Oursler	mov	[histogram + _hash_offset + 2 * hash], f_i %+ w
32931814483SRoy Oursler
33031814483SRoy Oursler	add	f_i, 1
33131814483SRoy Oursler
	;; Same computation for the second position (f_i is now that position)
33231814483SRoy Oursler	mov	dist2 %+ w, f_i %+ w
333ee2e2bceSRoy Oursler	sub	dist2, 1
33431814483SRoy Oursler	sub	dist2 %+ w, word [histogram + _hash_offset + 2 * hash2]
33531814483SRoy Oursler	mov	[histogram + _hash_offset + 2 * hash2], f_i %+ w
33631814483SRoy Oursler
33731814483SRoy Oursler	;; Start computing hashes to be used in either the next loop or
33831814483SRoy Oursler	;; for updating the hash if a match is found
	;; hash  <- hash of the bytes two positions past the first position
	;; hash2 <- hash of the bytes three positions past it (computed below)
33917dac9f6SRoy Oursler	MOVQ	curr_data2, xdata
34017dac9f6SRoy Oursler	MOVQ	tmp2, xdata
34117dac9f6SRoy Oursler	shr	curr_data2, 8
34231814483SRoy Oursler	compute_hash	hash, curr_data2
34331814483SRoy Oursler
34431814483SRoy Oursler	;; Check if look back distances are valid. Load a junk distance of 1
34531814483SRoy Oursler	;; if the look back distance is too long for speculative lookups.
	;; The mask keeps (distance - 1) within 0 .. D-1; after the negation
	;; dist and dist2 hold -(distance - 1).
346ee2e2bceSRoy Oursler	and	dist %+ d, (D-1)
34731814483SRoy Oursler	neg	dist
34831814483SRoy Oursler
349ee2e2bceSRoy Oursler	and	dist2 %+ d, (D-1)
35031814483SRoy Oursler	neg	dist2
35131814483SRoy Oursler
35217dac9f6SRoy Oursler	shr	tmp2, 16
35331814483SRoy Oursler	compute_hash	hash2, tmp2
35431814483SRoy Oursler
35531814483SRoy Oursler	;; Check for long len/dist matches (>7)
	;; len = current 8 bytes XOR the 8 bytes at the candidate position;
	;; zero means all 8 bytes match.
35617dac9f6SRoy Oursler	mov	len, curr_data
35731814483SRoy Oursler	xor	len, [tmp1 + dist - 1]
35831814483SRoy Oursler	jz	compare_loop
35931814483SRoy Oursler
3604ae2d1beSRoy Oursler	and	hash %+ d, LVL0_HASH_MASK
3614ae2d1beSRoy Oursler	and	hash2 %+ d, LVL0_HASH_MASK
36231814483SRoy Oursler
	;; Same 8-byte test for the second position. It is done before the
	;; short-match test of the first position, so a long match at the
	;; second position takes priority and the first byte becomes a literal.
36317dac9f6SRoy Oursler	MOVQ	len2, xdata
36431814483SRoy Oursler	xor	len2, [tmp1 + dist2]
36531814483SRoy Oursler	jz	compare_loop2
36631814483SRoy Oursler
36731814483SRoy Oursler	;; Speculatively load the code for the first literal
36831814483SRoy Oursler	movzx   tmp1, curr_data %+ b
36931814483SRoy Oursler	shr	curr_data, 8
37031814483SRoy Oursler
	;; tmp3 = position whose hash entry is refreshed first if a match is taken
37131814483SRoy Oursler	lea	tmp3, [f_i + 1]
37231814483SRoy Oursler
37331814483SRoy Oursler	;; Check for len/dist match for first literal
	;; Low 32 bits of the XOR equal to zero means at least 4 bytes match
37431814483SRoy Oursler	test    len %+ d, 0xFFFFFFFF
37531814483SRoy Oursler	jz      len_dist_huffman_pre
37631814483SRoy Oursler
37731814483SRoy Oursler	;; Store first literal
37831814483SRoy Oursler	inc	qword [histogram + _lit_len_offset + HIST_ELEM_SIZE * tmp1]
37931814483SRoy Oursler
38031814483SRoy Oursler	;; Check for len/dist match for second literal
38131814483SRoy Oursler	test    len2 %+ d, 0xFFFFFFFF
38231814483SRoy Oursler	jnz     lit_lit_huffman
38331814483SRoy Ourslerlen_dist_lit_huffman_pre:
38431814483SRoy Oursler	;; Calculate repeat length
	;; len2 is non-zero here with its low 32 bits clear, so the number of
	;; trailing zero bits divided by 8 is the count of matching bytes (4..7).
38531814483SRoy Oursler	tzcnt	len2, len2
38631814483SRoy Oursler	shr	len2, 3
38731814483SRoy Oursler
	;; Literal at the first position followed by a match of len2 bytes at
	;; the second position.
38831814483SRoy Ourslerlen_dist_lit_huffman:
	;; hash3 = hash of the bytes three positions past the second position
38917dac9f6SRoy Oursler	MOVQ	curr_data, xdata
39017dac9f6SRoy Oursler	shr	curr_data, 24
391af9c0c0fSRoy Oursler	compute_hash hash3, curr_data
392af9c0c0fSRoy Oursler
39331814483SRoy Oursler	;; Store updated hashes
39431814483SRoy Oursler	mov	[histogram + _hash_offset + 2 * hash], tmp3 %+ w
39531814483SRoy Oursler	add	tmp3,1
39631814483SRoy Oursler	mov	[histogram + _hash_offset + 2 * hash2], tmp3 %+ w
397af9c0c0fSRoy Oursler	add	tmp3, 1
39831814483SRoy Oursler
	;; Skip over the match and reload the data / hashes for the next pass
39931814483SRoy Oursler	add	f_i, len2
40031814483SRoy Oursler
40117dac9f6SRoy Oursler	MOVDQU	xdata, [file_start + f_i]
40231814483SRoy Oursler	mov	curr_data, [file_start + f_i]
40331814483SRoy Oursler	mov	tmp1, curr_data
40431814483SRoy Oursler	compute_hash	hash, curr_data
40531814483SRoy Oursler
4064ae2d1beSRoy Oursler	and	hash3, LVL0_HASH_MASK
407af9c0c0fSRoy Oursler	mov	[histogram + _hash_offset + 2 * hash3], tmp3 %+ w
408af9c0c0fSRoy Oursler
	;; dist_code2 (rbx) <- code for dist2 (r12). len_code shares rbp with
	;; hash3, which is no longer needed at this point.
40931814483SRoy Oursler	dist_to_dist_code2 dist_code2, dist2
41031814483SRoy Oursler
41131814483SRoy Oursler	len_to_len_code len_code, len2, rfc_lookup
41231814483SRoy Oursler
41331814483SRoy Oursler	shr	tmp1, 8
41431814483SRoy Oursler	compute_hash	hash2, tmp1
41531814483SRoy Oursler
41631814483SRoy Oursler	inc	qword [histogram + _lit_len_offset + HIST_ELEM_SIZE * len_code]
41731814483SRoy Oursler	inc	qword [histogram + _dist_offset + HIST_ELEM_SIZE * dist_code2]
41831814483SRoy Oursler
4194ae2d1beSRoy Oursler	and	hash2 %+ d, LVL0_HASH_MASK
4204ae2d1beSRoy Oursler	and	hash, LVL0_HASH_MASK
42131814483SRoy Oursler
42231814483SRoy Oursler	cmp	f_i, file_length
42331814483SRoy Oursler	jl	loop2
42431814483SRoy Oursler	jmp	end_loop_2
42531814483SRoy Oursler	;; encode as dist/len
42631814483SRoy Oursler
42731814483SRoy Ourslerlen_dist_huffman_pre:
	;; len is non-zero here with its low 32 bits clear, so the number of
	;; trailing zero bits divided by 8 is the count of matching bytes (4..7).
42831814483SRoy Oursler	tzcnt	len, len
42931814483SRoy Oursler	shr	len, 3
43031814483SRoy Oursler
	;; Match of len bytes starting at the first position.
43131814483SRoy Ourslerlen_dist_huffman:
43231814483SRoy Oursler	mov	[histogram + _hash_offset + 2 * hash], tmp3 %+ w
433af9c0c0fSRoy Oursler	add	tmp3,1
434af9c0c0fSRoy Oursler	mov	[histogram + _hash_offset + 2 * hash2], tmp3 %+ w
43531814483SRoy Oursler
	;; f_i was already advanced by one in loop2; undo that, then skip the match
43631814483SRoy Oursler	dec	f_i
43731814483SRoy Oursler	add	f_i, len
43831814483SRoy Oursler
43917dac9f6SRoy Oursler	MOVDQU	xdata, [file_start + f_i]
44031814483SRoy Oursler	mov	curr_data, [file_start + f_i]
44131814483SRoy Oursler	mov	tmp1, curr_data
44231814483SRoy Oursler	compute_hash	hash, curr_data
44331814483SRoy Oursler
	;; dist_code (r12) <- code for dist (rbx)
44431814483SRoy Oursler	dist_to_dist_code2 dist_code, dist
44531814483SRoy Oursler
44631814483SRoy Oursler	len_to_len_code len_code, len, rfc_lookup
44731814483SRoy Oursler
44831814483SRoy Oursler	shr	tmp1, 8
44931814483SRoy Oursler	compute_hash	hash2, tmp1
45031814483SRoy Oursler
45131814483SRoy Oursler	inc	qword [histogram + _lit_len_offset + HIST_ELEM_SIZE * len_code]
45231814483SRoy Oursler	inc	qword [histogram + _dist_offset + HIST_ELEM_SIZE * dist_code]
45331814483SRoy Oursler
4544ae2d1beSRoy Oursler	and	hash2 %+ d, LVL0_HASH_MASK
4554ae2d1beSRoy Oursler	and	hash, LVL0_HASH_MASK
45631814483SRoy Oursler
45731814483SRoy Oursler	cmp	f_i, file_length
45831814483SRoy Oursler	jl	loop2
45931814483SRoy Oursler	jmp	end_loop_2
46031814483SRoy Oursler
	;; Neither position matched. The first literal was counted above;
	;; curr_data was shifted right by 8, so its low byte is the second
	;; literal. hash / hash2 already describe the next two positions.
46131814483SRoy Ourslerlit_lit_huffman:
46217dac9f6SRoy Oursler	MOVDQU	xdata, [file_start + f_i + 1]
463af9c0c0fSRoy Oursler	and     curr_data, 0xff
46431814483SRoy Oursler	add	f_i, 1
46531814483SRoy Oursler	inc	qword [histogram + _lit_len_offset + HIST_ELEM_SIZE * curr_data]
46631814483SRoy Oursler
46731814483SRoy Oursler	cmp	f_i, file_length
46831814483SRoy Oursler	jl	loop2
46931814483SRoy Oursler
47031814483SRoy Ourslerend_loop_2:
	;; file_length becomes the original length minus LAST_BYTES_COUNT, so
	;; the 4-byte load in loop2_finish stays inside the input.
47131814483SRoy Oursler	add	file_length, LA_STATELESS - LAST_BYTES_COUNT
47231814483SRoy Oursler	cmp	f_i, file_length
47331814483SRoy Oursler	jge	final_bytes
47431814483SRoy Oursler
	;; One position per iteration, bounded match search.
47531814483SRoy Ourslerloop2_finish:
4764d1fe78bSRoy Oursler	mov	curr_data %+ d, dword [file_start + f_i]
47731814483SRoy Oursler	compute_hash	hash, curr_data
4784ae2d1beSRoy Oursler	and	hash %+ d, LVL0_HASH_MASK
47931814483SRoy Oursler
48031814483SRoy Oursler	;; Calculate possible distance for length/dist pair.
48131814483SRoy Oursler	xor	dist, dist
48231814483SRoy Oursler	mov	dist %+ w, f_i %+ w
48331814483SRoy Oursler	sub	dist %+ w, word [histogram + _hash_offset + 2 * hash]
48431814483SRoy Oursler	mov	[histogram + _hash_offset + 2 * hash], f_i %+ w
48531814483SRoy Oursler
48631814483SRoy Oursler	;; Check if look back distance is valid (the dec is to handle when dist = 0)
	;; The unsigned compare accepts only 1 <= dist <= D-1
48731814483SRoy Oursler	dec	dist
48831814483SRoy Oursler	cmp	dist %+ d, (D-1)
48931814483SRoy Oursler	jae	encode_literal_finish
49031814483SRoy Oursler	inc	dist
49131814483SRoy Oursler
49231814483SRoy Oursler	;; Check if look back distance is a match
	;; tmp4 = bytes remaining from f_i to the end of the input,
	;; tmp1 = current position, tmp2 = candidate position (tmp1 - dist).
	;; NOTE(review): compare is defined elsewhere; presumably it returns in
	;; len the number of equal leading bytes, limited by tmp4 - confirm.
4934d1fe78bSRoy Oursler	lea	tmp4, [file_length + LAST_BYTES_COUNT]
4944d1fe78bSRoy Oursler	sub	tmp4, f_i
49531814483SRoy Oursler	lea	tmp1, [file_start + f_i]
49631814483SRoy Oursler	mov	tmp2, tmp1
49731814483SRoy Oursler	sub	tmp2, dist
4984d1fe78bSRoy Oursler	compare	tmp4, tmp1, tmp2, len, tmp3
49931814483SRoy Oursler
50031814483SRoy Oursler	;; Limit len to maximum value of 258
50131814483SRoy Oursler	mov	tmp2, 258
50231814483SRoy Oursler	cmp	len, 258
50331814483SRoy Oursler	cmova	len, tmp2
	;; Matches shorter than SHORTEST_MATCH are counted as a literal instead
50431814483SRoy Oursler	cmp	len, SHORTEST_MATCH
50531814483SRoy Oursler	jb	encode_literal_finish
50631814483SRoy Oursler
50731814483SRoy Oursler	add	f_i, len
50831814483SRoy Oursler
50931814483SRoy Oursler	len_to_len_code	len_code, len, rfc_lookup
51031814483SRoy Oursler	dist_to_dist_code dist_code, dist
51131814483SRoy Oursler
51231814483SRoy Oursler	inc	qword [histogram + _lit_len_offset + HIST_ELEM_SIZE * len_code]
51331814483SRoy Oursler	inc	qword [histogram + _dist_offset + HIST_ELEM_SIZE * dist_code]
51431814483SRoy Oursler
51531814483SRoy Oursler	cmp	f_i, file_length
51631814483SRoy Oursler	jl	loop2_finish
51731814483SRoy Oursler	jmp	final_bytes
51831814483SRoy Oursler
51931814483SRoy Ourslerencode_literal_finish:
52031814483SRoy Oursler	;; Encode literal
52131814483SRoy Oursler	and	curr_data %+ d, 0xFF
52231814483SRoy Oursler	inc	qword [histogram + _lit_len_offset + HIST_ELEM_SIZE * curr_data]
52331814483SRoy Oursler
52431814483SRoy Oursler	;; Setup for next loop
52531814483SRoy Oursler	add	f_i, 1
52631814483SRoy Oursler	cmp	f_i, file_length
52731814483SRoy Oursler	jl	loop2_finish
52831814483SRoy Oursler
52931814483SRoy Ourslerfinal_bytes:
	;; file_length is restored to the original input length; every byte
	;; still left is counted as a literal without any match search.
53031814483SRoy Oursler	add	file_length, LAST_BYTES_COUNT
53131814483SRoy Ourslerfinal_bytes_loop:
53231814483SRoy Oursler	cmp	f_i, file_length
53331814483SRoy Oursler	jge	end
53431814483SRoy Oursler	movzx	curr_data, byte [file_start + f_i]
53531814483SRoy Oursler	inc	qword [histogram + _lit_len_offset + HIST_ELEM_SIZE * curr_data]
53631814483SRoy Oursler	inc	f_i
53731814483SRoy Oursler	jmp	final_bytes_loop
53831814483SRoy Oursler
53931814483SRoy Ourslerend:
54031814483SRoy Oursler	;; Handle eob at end of stream
	;; Counter 256 is set to the value saved at entry plus one; whatever the
	;; loops may have left in that slot is replaced.
54131814483SRoy Oursler	mov	tmp1, [rsp + _eob_count_offset]
54231814483SRoy Oursler	mov	qword [histogram + _lit_len_offset + HIST_ELEM_SIZE * 256], tmp1
54331814483SRoy Oursler
544bda088b3SGreg Tuckerexit_ret:
545d06e14b9SRoy Oursler	FUNC_RESTORE
54631814483SRoy Oursler	ret
54731814483SRoy Oursler
	;; Out-of-line path: all 8 bytes at the first position matched.
	;; tmp1 = current position, tmp2 = candidate position.
	;; NOTE(review): compare250 is defined elsewhere; from the operands it
	;; presumably extends len beyond the 8 bytes already known to match,
	;; using len2 = 250 as the remaining limit - confirm.
54831814483SRoy Ourslercompare_loop:
5494ae2d1beSRoy Oursler	and	hash %+ d, LVL0_HASH_MASK
5504ae2d1beSRoy Oursler	and	hash2 %+ d, LVL0_HASH_MASK
55131814483SRoy Oursler	lea	tmp2, [tmp1 + dist - 1]
552d389b8d6SRoy Oursler
55373454909SRoy Oursler	mov	len2, 250
55473454909SRoy Oursler	mov	len, 8
55573454909SRoy Oursler	compare250	tmp1, tmp2, len, len2, tmp3, ytmp0, ytmp1
556d389b8d6SRoy Oursler
55731814483SRoy Oursler	lea	tmp3, [f_i + 1]
55831814483SRoy Oursler	jmp	len_dist_huffman
55931814483SRoy Oursler
	;; Out-of-line path: all 8 bytes at the second position matched.
	;; tmp1 is advanced to the second position and tmp2 is its candidate.
	;; The byte at the first position is counted as a literal before
	;; joining len_dist_lit_huffman. hash and hash2 were already masked
	;; before the jump to this label.
56031814483SRoy Ourslercompare_loop2:
56131814483SRoy Oursler	add	tmp1, 1
56231814483SRoy Oursler	lea	tmp2, [tmp1 + dist2 - 1]
56331814483SRoy Oursler
56473454909SRoy Oursler	mov	len, 250
56573454909SRoy Oursler	mov	len2, 8
56673454909SRoy Oursler	compare250	tmp1, tmp2, len2, len, tmp3, ytmp0, ytmp1
567d389b8d6SRoy Oursler
56831814483SRoy Oursler	and	curr_data, 0xff
56931814483SRoy Oursler	inc	qword [histogram + _lit_len_offset + 8 * curr_data]
57031814483SRoy Oursler	lea	tmp3, [f_i + 1]
57131814483SRoy Oursler	jmp	len_dist_lit_huffman
57231814483SRoy Oursler
; D_vector: sixteen 16-bit words (32 bytes, 32-byte aligned), each holding the
; low 16 bits of -(D + 1). Nothing in the visible part of this file
; references it.
57331814483SRoy Ourslersection .data
5744d40cd36SRoy Oursler	align 32
575b25ef61aSRoy OurslerD_vector:
576b25ef61aSRoy Oursler	dw	-(D + 1) & 0xFFFF, -(D + 1) & 0xFFFF, -(D + 1) & 0xFFFF, -(D + 1) & 0xFFFF
577b25ef61aSRoy Oursler	dw	-(D + 1) & 0xFFFF, -(D + 1) & 0xFFFF, -(D + 1) & 0xFFFF, -(D + 1) & 0xFFFF
578b25ef61aSRoy Oursler	dw	-(D + 1) & 0xFFFF, -(D + 1) & 0xFFFF, -(D + 1) & 0xFFFF, -(D + 1) & 0xFFFF
579b25ef61aSRoy Oursler	dw	-(D + 1) & 0xFFFF, -(D + 1) & 0xFFFF, -(D + 1) & 0xFFFF, -(D + 1) & 0xFFFF
580