xref: /isa-l/igzip/igzip_decode_block_stateless.asm (revision 1500db751d08b6c4ad6097135fe78259540a2807)
1;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2;  Copyright(c) 2011-2018 Intel Corporation All rights reserved.
3;
4;  Redistribution and use in source and binary forms, with or without
5;  modification, are permitted provided that the following conditions
6;  are met:
7;    * Redistributions of source code must retain the above copyright
8;      notice, this list of conditions and the following disclaimer.
9;    * Redistributions in binary form must reproduce the above copyright
10;      notice, this list of conditions and the following disclaimer in
11;      the documentation and/or other materials provided with the
12;      distribution.
13;    * Neither the name of Intel Corporation nor the names of its
14;      contributors may be used to endorse or promote products derived
15;      from this software without specific prior written permission.
16;
17;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
29
30default rel
31
32%include "reg_sizes.asm"
33
;; Status codes left in rax when the decode routine in this file returns.
34%define DECOMP_OK 0
35%define END_INPUT 1
36%define OUT_OVERFLOW 2
37%define INVALID_BLOCK -1
38%define INVALID_SYMBOL -2
39%define INVALID_LOOKBACK -3
40
;; Number of input bits used to index the first-level lookup tables:
;; ISAL_DECODE_LONG_BITS for the literal/length table (_lit_huff_code) and
;; ISAL_DECODE_SHORT_BITS for the distance table (_dist_huff_code).
41%define ISAL_DECODE_LONG_BITS 12
42%define ISAL_DECODE_SHORT_BITS 10
43
;; COPY_SIZE is the step, in bytes, of one MOVDQU transfer in the copy loops.
;; COPY_LEN_MAX is presumably the longest DEFLATE match length - TODO confirm.
44%define COPY_SIZE 16
45%define	COPY_LEN_MAX 258
46
;; Margins subtracted from end_in / end_out while the fast main loop runs and
;; added back before the byte-accurate tail code.
;; NOTE: OUT_BUFFER_SLOP expands textually without parentheses; in this file
;; it is only ever used as a complete add/sub operand.
47%define IN_BUFFER_SLOP 8
48%define OUT_BUFFER_SLOP	COPY_SIZE + COPY_LEN_MAX
49
50%include "inflate_data_structs.asm"
51%include "stdmac.asm"
52
53extern rfc1951_lookup_table
54
55
56
;; Bit layout of the "large" (literal/length) lookup entries, as consumed by
;; decode_next_lit_len:
;;   short entry (dword): bits 0-24 symbol data, bit 25 long-code flag,
;;                        bits 26-27 symbol count, bits 28 and up code length.
;;   long entry (word):   bits 0-9 symbol, bits 10 and up code length.
;; For a flagged short entry the decoder uses the whole field from bit 26 up
;; as a bit count (see LARGE_SHORT_MAX_LEN_OFFSET, same value as
;; LARGE_SYM_COUNT_OFFSET).
57%define LARGE_SHORT_SYM_LEN 25
58%define LARGE_SHORT_SYM_MASK ((1 << LARGE_SHORT_SYM_LEN) - 1)
59%define LARGE_LONG_SYM_LEN 10
60%define LARGE_LONG_SYM_MASK ((1 << LARGE_LONG_SYM_LEN) - 1)
61%define LARGE_SHORT_CODE_LEN_OFFSET 28
62%define LARGE_LONG_CODE_LEN_OFFSET 10
63%define LARGE_FLAG_BIT_OFFSET 25
64%define LARGE_FLAG_BIT (1 << LARGE_FLAG_BIT_OFFSET)
65%define LARGE_SYM_COUNT_OFFSET 26
66%define LARGE_SYM_COUNT_LEN 2
67%define LARGE_SYM_COUNT_MASK ((1 << LARGE_SYM_COUNT_LEN) - 1)
68%define LARGE_SHORT_MAX_LEN_OFFSET 26
69
;; Bit layout of the "small" (distance) lookup entries, as consumed by
;; decode_next_dist:
;;   short entry (word): bits 0-8 symbol data, bit 10 long-code flag,
;;                       bits 11 and up code length.
;;   long entry (word):  bits 0-8 symbol data, bits 10 and up code length.
70%define SMALL_SHORT_SYM_LEN 9
71%define SMALL_SHORT_SYM_MASK ((1 << SMALL_SHORT_SYM_LEN) - 1)
72%define SMALL_LONG_SYM_LEN 9
73%define SMALL_LONG_SYM_MASK ((1 << SMALL_LONG_SYM_LEN) - 1)
74%define SMALL_SHORT_CODE_LEN_OFFSET 11
75%define SMALL_LONG_CODE_LEN_OFFSET 10
76%define SMALL_FLAG_BIT_OFFSET 10
77%define SMALL_FLAG_BIT (1 << SMALL_FLAG_BIT_OFFSET)
78
;; Split of the symbol data decoded from the distance table: the low
;; DIST_SYM_LEN bits are the distance symbol, the bits from
;; DIST_SYM_EXTRA_OFFSET up are the number of extra bits to read.
79%define DIST_SYM_OFFSET 0
80%define DIST_SYM_LEN 5
81%define DIST_SYM_MASK ((1 << DIST_SYM_LEN) - 1)
82%define DIST_SYM_EXTRA_OFFSET 5
83%define DIST_SYM_EXTRA_LEN 4
84%define DIST_SYM_EXTRA_MASK ((1 << DIST_SYM_EXTRA_LEN) - 1)
85
;; Register allocation for the decode routine.
;; Several names below alias the same physical register (for example tmp1,
;; look_back_dist2, next_bits2 and next_sym3 are all rdi), so writing one of
;; them destroys the others.
86;; rax
87%define	tmp3		rax
88%define	read_in_2	rax
89%define	look_back_dist	rax
90
91;; rcx - no alias; the macros in this file use it by name as shift count / scratch
92;; rdx	arg3
93%define	next_sym2	rdx
94%define copy_start	rdx
95%define	tmp4		rdx
96
97;; rdi	arg1
98%define	tmp1		rdi
99%define look_back_dist2 rdi
100%define next_bits2	rdi
101%define next_sym3	rdi
102
103;; rsi	arg2
104%define	tmp2		rsi
105%define next_sym_num	rsi
106%define	next_bits	rsi
107
108;; rbx ; Saved
109%define	next_in		rbx
110
111;; rbp ; Saved
112%define	end_in		rbp
113
114;; r8
115%define	repeat_length	r8
116
117;; r9
118%define	read_in		r9
119
120;; r10
121%define read_in_length	r10
122
123;; r11
124%define	state		r11
125
126;; r12 ; Saved
127%define next_out	r12
128
129;; r13 ; Saved
130%define	end_out		r13
131
132;; r14 ; Saved
133%define	next_sym	r14
134
135;; r15 ; Saved
136%define rfc_lookup	r15
137
;; Stack frame layout (byte offsets from rsp after FUNC_SAVE):
;;   four qword locals followed by eight qword slots for saved registers.
138start_out_mem_offset	equ	0
139read_in_mem_offset	equ	8
140read_in_length_mem_offset	equ	16
141next_out_mem_offset	equ	24
142gpr_save_mem_offset	equ	32
143stack_size		equ	4 * 8 + 8 * 8
144
;; Byte offsets added to rfc_lookup (the address of rfc1951_lookup_table).
;; Only _dist_start is referenced by the code in this file.
;; NOTE(review): the base value 264 and the 1*32 / 4*32 strides presumably
;; mirror the layout of the table's definition - confirm against that source.
145%define	_dist_extra_bit_count	264
146%define	_dist_start		_dist_extra_bit_count + 1*32
147%define	_len_extra_bit_count	_dist_start + 4*32
148%define	_len_start		_len_extra_bit_count + 1*32
149
;; ELF64 build: the first two arguments are taken from rdi and rsi.
150%ifidn __OUTPUT_FORMAT__, elf64
151%define arg0	rdi
152%define arg1	rsi
153
;; FUNC_SAVE: reserve stack_size bytes and store rbx, rbp, r12-r15 in the
;; save area at gpr_save_mem_offset.
;; With ALIGN_STACK defined, rbp is pushed and set to the incoming rsp so that
;; rsp can be rounded down to a 16-byte boundary; the rbp value stored in
;; slot 1 is then that frame pointer.
154%macro FUNC_SAVE 0
155%ifdef ALIGN_STACK
156	push	rbp
157	mov	rbp, rsp
158	sub	rsp, stack_size
159	and	rsp, ~15
160%else
161	sub	rsp, stack_size
162%endif
163
164	mov [rsp + gpr_save_mem_offset + 0*8], rbx
165	mov [rsp + gpr_save_mem_offset + 1*8], rbp
166	mov [rsp + gpr_save_mem_offset + 2*8], r12
167	mov [rsp + gpr_save_mem_offset + 3*8], r13
168	mov [rsp + gpr_save_mem_offset + 4*8], r14
169	mov [rsp + gpr_save_mem_offset + 5*8], r15
170%endm
171
;; FUNC_RESTORE: reload the registers stored by FUNC_SAVE and release the
;; frame. In the ALIGN_STACK variant rbp must be reloaded from slot 1 first,
;; because rsp is recovered from it before the original rbp is popped.
172%macro FUNC_RESTORE 0
173	mov	rbx, [rsp + gpr_save_mem_offset + 0*8]
174	mov	rbp, [rsp + gpr_save_mem_offset + 1*8]
175	mov	r12, [rsp + gpr_save_mem_offset + 2*8]
176	mov	r13, [rsp + gpr_save_mem_offset + 3*8]
177	mov	r14, [rsp + gpr_save_mem_offset + 4*8]
178	mov	r15, [rsp + gpr_save_mem_offset + 5*8]
179
180%ifndef ALIGN_STACK
181	add	rsp, stack_size
182%else
183	mov	rsp, rbp
184	pop	rbp
185%endif
186%endm
187%endif
188
;; Win64 build: the first two arguments are taken from rcx and rdx.
189%ifidn __OUTPUT_FORMAT__, win64
190%define arg0	rcx
191%define arg1	rdx
192
;; FUNC_SAVE: reserve stack_size bytes and store rbx, rsi, rdi, rbp, r12-r15
;; in the save area at gpr_save_mem_offset (rsi and rdi are saved here in
;; addition to the registers saved by the ELF64 variant).
;; With ALIGN_STACK defined, rbp is pushed and set to the incoming rsp so that
;; rsp can be rounded down to a 16-byte boundary; the rbp value stored in
;; slot 3 is then that frame pointer.
193%macro FUNC_SAVE 0
194%ifdef ALIGN_STACK
195	push	rbp
196	mov	rbp, rsp
197	sub	rsp, stack_size
198	and	rsp, ~15
199%else
200	sub	rsp, stack_size
201%endif
202
203	mov [rsp + gpr_save_mem_offset + 0*8], rbx
204	mov [rsp + gpr_save_mem_offset + 1*8], rsi
205	mov [rsp + gpr_save_mem_offset + 2*8], rdi
206	mov [rsp + gpr_save_mem_offset + 3*8], rbp
207	mov [rsp + gpr_save_mem_offset + 4*8], r12
208	mov [rsp + gpr_save_mem_offset + 5*8], r13
209	mov [rsp + gpr_save_mem_offset + 6*8], r14
210	mov [rsp + gpr_save_mem_offset + 7*8], r15
211%endm
212
;; FUNC_RESTORE: reload the registers stored by FUNC_SAVE and release the
;; frame. In the ALIGN_STACK variant rbp must be reloaded from slot 3 first,
;; because rsp is recovered from it before the original rbp is popped.
213%macro FUNC_RESTORE 0
214	mov	rbx, [rsp + gpr_save_mem_offset + 0*8]
215	mov	rsi, [rsp + gpr_save_mem_offset + 1*8]
216	mov	rdi, [rsp + gpr_save_mem_offset + 2*8]
217	mov	rbp, [rsp + gpr_save_mem_offset + 3*8]
218	mov	r12, [rsp + gpr_save_mem_offset + 4*8]
219	mov	r13, [rsp + gpr_save_mem_offset + 5*8]
220	mov	r14, [rsp + gpr_save_mem_offset + 6*8]
221	mov	r15, [rsp + gpr_save_mem_offset + 7*8]
222
223%ifndef ALIGN_STACK
224	add	rsp, stack_size
225%else
226	mov	rsp, rbp
227	pop	rbp
228%endif
229%endm
230%endif
231
232;; Refill read_in and update the input pointer accordingly.
;; The value at %%next_in is shifted left by %%read_in_length and ORed into
;; %%read_in; %%next_in then advances by (64 - %%read_in_length) / 8 bytes and
;; %%read_in_length grows by 8 bits for each of those bytes.
233;; Only valid when there are at least 8 bytes in the in buffer: the load is
;; not bounds-checked here (%%end_in and %%tmp2 are accepted but unused).
234;; Clobbers %%tmp1. Clobbers rcx, unless rcx is %%read_in_length
;; NOTE(review): rcx is not named in this body, so any rcx clobber would come
;; from the SHLX helper macro defined elsewhere - confirm.
235%macro inflate_in_load 6
236%define	%%next_in		%1
237%define %%end_in		%2
238%define %%read_in		%3
239%define %%read_in_length	%4
240%define %%tmp1			%5 ; Tmp registers
241%define %%tmp2			%6
242
	;; Merge the new input above the bits that are still valid
243	SHLX	%%tmp1, [%%next_in], %%read_in_length
244	or	%%read_in, %%tmp1
245
	;; %%tmp1 = number of whole bytes that fit in the free part of read_in
246	mov	%%tmp1, 64
247	sub	%%tmp1, %%read_in_length
248	shr	%%tmp1, 3
249
250	add	%%next_in, %%tmp1
251	lea	%%read_in_length, [%%read_in_length + 8 * %%tmp1]
252%%end:
253%endm
254
255;; Refill read_in one byte at a time and update the input pointer accordingly.
;; Loads min((64 - %%read_in_length) / 8, %%end_in - %%next_in) bytes, so it
;; never reads at or beyond %%end_in. Does nothing when that count is zero.
256;; Clobbers rcx, unless rcx is %%read_in_length; also clobbers %5 and %6
257%macro inflate_in_small_load 6
258%define	%%next_in		%1
259%define %%end_in		%2
260%define %%read_in		%3
261%define %%read_in_length	%4
262%define %%avail_in		%5 ; Tmp registers
263%define %%tmp1			%5
264%define %%loop_count		%6
265
	;; %%avail_in = bytes left in the input buffer
266	mov	%%avail_in, %%end_in
267	sub	%%avail_in, %%next_in
268
	;; rcx tracks the bit position at which the next byte is inserted
269%ifnidn %%read_in_length, rcx
270	mov	rcx, %%read_in_length
271%endif
272
	;; %%loop_count = whole bytes that fit in the free part of read_in
273	mov	%%loop_count, 64
274	sub	%%loop_count, %%read_in_length
275	shr	%%loop_count, 3
276
	;; Limit the count to the bytes actually available (signed compare)
277	cmp	%%loop_count, %%avail_in
278	cmovg	%%loop_count, %%avail_in
279	cmp	%%loop_count, 0
280	je	%%end
281
	;; %%avail_in is no longer needed, so its register is reused as %%tmp1
282%%load_byte:
283	xor	%%tmp1, %%tmp1
284	mov	%%tmp1 %+ b, byte [%%next_in]
285	SHLX	%%tmp1, %%tmp1, rcx
286	or	%%read_in, %%tmp1
287	add	rcx, 8
288	add	%%next_in, 1
289	sub	%%loop_count, 1
290	jg	%%load_byte
	;; Publish the updated bit count
291%ifnidn %%read_in_length, rcx
292	mov	%%read_in_length, rcx
293%endif
294%%end:
295%endm
296
297;; Clears all bits at index (%%bit_count - %%lookup_size) and above in %%next_bits,
;; i.e. keeps only the low (%%bit_count - %%lookup_size) bits.
298;; Always modifies %%bit_count; the non-USE_HSWNI path also clobbers rcx
299%macro CLEAR_HIGH_BITS		3
300%define %%next_bits		%1
301%define %%bit_count		%2
302%define %%lookup_size		%3
303
	;; %%bit_count = (bit count - %%lookup_size) - 64
304	sub	%%bit_count, 0x40 + %%lookup_size
305;; Extract the 15-DECODE_LOOKUP_SIZE bits beyond the first DECODE_LOOKUP_SIZE bits.
306%ifdef USE_HSWNI
	;; 64 is a multiple of 32, so the low 5 bits equal (bit count - %%lookup_size) mod 32
307	and	%%bit_count, 0x1F
308	bzhi	%%next_bits, %%next_bits, %%bit_count
309%else
310%ifnidn %%bit_count, rcx
311	mov rcx, %%bit_count
312%endif
	;; rcx = 64 - (bit count - %%lookup_size): shift the unwanted high bits out and back
313	neg	rcx
314	shl	%%next_bits, cl
315	shr	%%next_bits, cl
316%endif
317
318%endm
319
320;; Decode next literal/length symbol from a short-table entry that the caller
;; has already loaded into %%next_sym.
;; On exit %%next_sym holds the decoded symbol data, %%next_sym_num the symbol
;; count (set to 1 on the long-code path), and the code length has been
;; shifted out of %%read_in and subtracted from %%read_in_length.
;; Jumps to invalid_symbol when a code length field of zero is found.
321;; Clobber rcx (left holding the code length); %%next_bits is written on the long-code path
322%macro decode_next_lit_len	8
323%define	%%state			%1 ; State structure associated with compressed stream
324%define %%lookup_size		%2 ; Number of bits used for small lookup
325%define	%%state_offset		%3 ; Type of huff code, should be either LIT or DIST
326%define %%read_in		%4 ; Bits read in from compressed stream
327%define %%read_in_length	%5 ; Number of valid bits in read_in
328%define %%next_sym		%6 ; Returned symbols
329%define %%next_sym_num		%7 ; Returned symbols count
330%define	%%next_bits		%8
331
	;; rcx = code length field of the short entry; zero marks an invalid code
332	mov	%%next_sym_num, %%next_sym
333	mov	rcx, %%next_sym
334	shr	rcx, LARGE_SHORT_CODE_LEN_OFFSET
335	jz	invalid_symbol
336
	;; Extract the symbol count field
337	and	%%next_sym_num, LARGE_SYM_COUNT_MASK << LARGE_SYM_COUNT_OFFSET
338	shr	%%next_sym_num, LARGE_SYM_COUNT_OFFSET
339
340	;; Check if symbol or hint was looked up
341	and	%%next_sym, LARGE_FLAG_BIT | LARGE_SHORT_SYM_MASK
342	test	%%next_sym, LARGE_FLAG_BIT
343	jz	%%end
344
	;; Flagged entry: recombine length and count fields so that rcx equals
	;; the entry shifted right by LARGE_SYM_COUNT_OFFSET, used as a bit count
345	shl	rcx, LARGE_SYM_COUNT_LEN
346	or	rcx, %%next_sym_num
347
348	;; Take the input bits that follow the first %%lookup_size bits
349	mov	%%next_bits, %%read_in
350	shr	%%next_bits, %%lookup_size
351
352	;; Extract the bits beyond the first %%lookup_size bits.
353	CLEAR_HIGH_BITS %%next_bits, rcx, %%lookup_size
354
	;; Long-table index = symbol data of the flagged entry + extracted bits
355	and	%%next_sym, LARGE_SHORT_SYM_MASK
356	add	%%next_sym, %%next_bits
357
358	;; Lookup actual next symbol; long entries start after the (1 << %%lookup_size) short entries
359	movzx	%%next_sym, word [%%state + LARGE_LONG_CODE_SIZE * %%next_sym + %%state_offset + LARGE_SHORT_CODE_SIZE * (1 << %%lookup_size)]
360	mov	%%next_sym_num, 1
361
362	;; Save length associated with symbol; zero marks an invalid code
363	mov	rcx, %%next_sym
364	shr	rcx, LARGE_LONG_CODE_LEN_OFFSET
365	jz	invalid_symbol
366	and	%%next_sym, LARGE_LONG_SYM_MASK
367
368%%end:
369;; Updated read_in to reflect the bits which were decoded
370	SHRX	%%read_in, %%read_in, rcx
371	sub	%%read_in_length, rcx
372%endm
373
374;; Decode next literal/length symbol, loading the short-table entry first.
;; Indexes the table at %%state + %%state_offset with the low %%lookup_size
;; bits of %%read_in, then expands decode_next_lit_len on that entry.
375;; Clobber rcx and %%next_bits
376%macro decode_next_lit_len_with_load	8
377%define	%%state			%1 ; State structure associated with compressed stream
378%define %%lookup_size		%2 ; Number of bits used for small lookup
379%define	%%state_offset		%3
380%define %%read_in		%4 ; Bits read in from compressed stream
381%define %%read_in_length	%5 ; Number of valid bits in read_in
382%define %%next_sym		%6 ; Returned symbols
383%define %%next_sym_num		%7 ; Returned symbols count
384%define %%next_bits		%8
385
386	;; Lookup possible next symbol
387	mov	%%next_bits, %%read_in
388	and	%%next_bits, (1 << %%lookup_size) - 1
389	mov	%%next_sym %+ d, dword [%%state + %%state_offset + LARGE_SHORT_CODE_SIZE * %%next_bits]
390
391	decode_next_lit_len %%state, %%lookup_size, %%state_offset, %%read_in, %%read_in_length, %%next_sym, %%next_sym_num, %%next_bits
392%endm
393
394;; Decode next distance symbol from a short-table entry that the caller has
;; already loaded into %%next_sym.
;; On exit %%next_sym holds the distance symbol (low DIST_SYM_LEN bits), rcx
;; holds the extra-bit count, and the code length has been shifted out of
;; %%read_in and subtracted from %%read_in_length.
;; A zero code length jumps to the label invalid_dist_symbol_<register>, where
;; <register> is the register name passed as %%next_sym.
395;; Clobber rcx; %%next_bits is written on the long-code path.
;; %%next_extra_bits is accepted but not referenced: the count is always left in rcx.
396%macro decode_next_dist	8
397%define	%%state			%1 ; State structure associated with compressed stream
398%define %%lookup_size		%2 ; Number of bits used for small lookup
399%define	%%state_offset		%3 ; Type of huff code, should be either LIT or DIST
400%define %%read_in		%4 ; Bits read in from compressed stream
401%define %%read_in_length	%5 ; Number of valid bits in read_in
402%define %%next_sym		%6 ; Returned symbol
403%define %%next_extra_bits	%7
404%define	%%next_bits		%8
405
	;; rcx = code length field of the short entry; zero marks an invalid code
406	mov	rcx, %%next_sym
407	shr	rcx, SMALL_SHORT_CODE_LEN_OFFSET
408	jz	invalid_dist_symbol_ %+ %%next_sym
409
410	;; Check if symbol or hint was looked up
411	and	%%next_sym, SMALL_FLAG_BIT | SMALL_SHORT_SYM_MASK
412	test	%%next_sym, SMALL_FLAG_BIT
413	jz	%%end
414
415	;; Take the input bits that follow the first %%lookup_size bits
416	mov	%%next_bits, %%read_in
417	shr	%%next_bits, %%lookup_size
418
419	;; Start forming the long-table address. %%next_sym still has
	;; SMALL_FLAG_BIT set here; the displacement of the load below subtracts
	;; SMALL_LONG_CODE_SIZE * SMALL_FLAG_BIT to cancel it.
420	lea	%%next_sym, [%%state + SMALL_LONG_CODE_SIZE * %%next_sym]
421
	;; Extract the bits beyond the first %%lookup_size bits.
422	CLEAR_HIGH_BITS %%next_bits, rcx, %%lookup_size
423
424	;; Lookup actual next symbol; long entries start after the (1 << %%lookup_size) short entries
425	movzx	%%next_sym, word [%%next_sym + %%state_offset + SMALL_LONG_CODE_SIZE * %%next_bits + SMALL_SHORT_CODE_SIZE * (1 << %%lookup_size) - SMALL_LONG_CODE_SIZE * SMALL_FLAG_BIT]
426
427	;; Save length associated with symbol; zero marks an invalid code
428	mov	rcx, %%next_sym
429	shr	rcx, SMALL_LONG_CODE_LEN_OFFSET
430	jz	invalid_dist_symbol_ %+ %%next_sym
431	and	%%next_sym, SMALL_SHORT_SYM_MASK
432
433%%end:
434	;; Updated read_in to reflect the bits which were decoded
435	SHRX	%%read_in, %%read_in, rcx
436	sub	%%read_in_length, rcx
	;; Split the symbol data: rcx = extra-bit count, %%next_sym = distance symbol
437	mov	rcx, %%next_sym
438	shr	rcx, DIST_SYM_EXTRA_OFFSET
439	and	%%next_sym, DIST_SYM_MASK
440%endm
441
442;; Decode next distance symbol, loading the short-table entry first.
;; Indexes the table at %%state + %%state_offset with the low %%lookup_size
;; bits of %%read_in, then expands decode_next_dist on that entry.
443;; Clobber rcx and %%next_bits
444%macro decode_next_dist_with_load	8
445%define	%%state			%1 ; State structure associated with compressed stream
446%define %%lookup_size		%2 ; Number of bits used for small lookup
447%define	%%state_offset		%3
448%define %%read_in		%4 ; Bits read in from compressed stream
449%define %%read_in_length	%5 ; Number of valid bits in read_in
450%define %%next_sym		%6 ; Returned symbol
451%define %%next_extra_bits	%7
452%define %%next_bits		%8
453
454	;; Lookup possible next symbol
455	mov	%%next_bits, %%read_in
456	and	%%next_bits, (1 << %%lookup_size) - 1
457	movzx	%%next_sym, word [%%state + %%state_offset + SMALL_SHORT_CODE_SIZE * %%next_bits]
458
459	decode_next_dist %%state, %%lookup_size, %%state_offset, %%read_in, %%read_in_length, %%next_sym, %%next_extra_bits, %%next_bits
460%endm
461
462[bits 64]
463default rel
464section .text
465
;; decode_huffman_code_block_stateless_<ARCH>
;; In:  arg0 = pointer to the inflate state structure (kept in `state`)
;;      arg1 = address saved at start_out_mem_offset and used as the lower
;;             bound when look-back distances are validated
;; Out: rax = status code (0 after an end-of-block symbol, otherwise one of
;;      END_INPUT, OUT_OVERFLOW, INVALID_SYMBOL, INVALID_LOOKBACK); the
;;      buffer fields of the state structure are written back before return.
466global decode_huffman_code_block_stateless_ %+ ARCH
467decode_huffman_code_block_stateless_ %+ ARCH %+ :
468	endbranch
469
470	FUNC_SAVE
471
472	mov	state, arg0
473	mov	[rsp + start_out_mem_offset], arg1
474	lea	rfc_lookup, [rfc1951_lookup_table]
475
	;; Load the bit buffer and turn the avail_in / avail_out counts into end pointers
476	mov	read_in,[state + _read_in]
477	mov	read_in_length %+ d, dword [state + _read_in_length]
478	mov	next_out, [state + _next_out]
479	mov	end_out %+ d, dword [state + _avail_out]
480	add	end_out, next_out
481	mov	next_in, [state + _next_in]
482	mov	end_in %+ d, dword [state + _avail_in]
483	add	end_in, next_in
484
485	mov	dword [state + _copy_overflow_len], 0
486	mov	dword [state + _copy_overflow_dist], 0
487
	;; Pull both end pointers in by the slop so the main loop can read and
	;; write past next_in / next_out without per-byte bounds checks
488	sub	end_out, OUT_BUFFER_SLOP
489	sub	end_in, IN_BUFFER_SLOP
490
	;; Too little input for the main loop: go straight to the tail code
491	cmp	next_in, end_in
492	jg	end_loop_block_pre
493
	;; Nothing to refill when read_in already holds 64 bits
494	cmp	read_in_length, 64
495	je	skip_load
496
497	inflate_in_load	next_in, end_in, read_in, read_in_length, tmp1, tmp2
498
499skip_load:
	;; Preload the short-table entry for the first symbol of the main loop
500	mov	tmp3, read_in
501	and	tmp3, (1 << ISAL_DECODE_LONG_BITS) - 1
502	mov	next_sym %+ d, dword [state + _lit_huff_code + LARGE_SHORT_CODE_SIZE * tmp3]
503
504;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
505; Main Loop
;  On entry to each iteration next_sym already holds the short-table entry
;  for the low ISAL_DECODE_LONG_BITS bits of read_in, and end_in / end_out
;  are reduced by IN_BUFFER_SLOP / OUT_BUFFER_SLOP.
506;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
507loop_block:
508	;; Check if near end of in buffer or out buffer
509	cmp	next_in, end_in
510	jg	end_loop_block_pre
511	cmp	next_out, end_out
512	jg	end_loop_block_pre
513
514	;; Decode next symbol and reload the read_in buffer
515	decode_next_lit_len	state, ISAL_DECODE_LONG_BITS, _lit_huff_code, read_in, read_in_length, next_sym, next_sym_num, tmp1
516
517	;; Speculatively write next_sym if it is a literal. All 8 bytes of
	;; next_sym are stored, but next_out only advances by next_sym_num.
	;; next_sym2 = next_sym shifted right by 8 * (next_sym_num - 1) bits,
	;; i.e. the last of the packed symbols.
518	mov	[next_out], next_sym
519	add	next_out, next_sym_num
520	lea	next_sym2, [8 * next_sym_num - 8]
521	SHRX	next_sym2, next_sym, next_sym2
522
523	;; Find index to speculatively preload next_sym from
524	mov	tmp3, (1 << ISAL_DECODE_LONG_BITS) - 1
525	and	tmp3, read_in
526
527	;; Start reloading read_in
528	mov	tmp1, [next_in]
529	SHLX	tmp1, tmp1, read_in_length
530	or	read_in, tmp1
531
532	;; Speculatively load data associated with length symbol
533	lea	repeat_length, [next_sym2 - 254]
534
535	;; Test for end of block symbol
536	cmp	next_sym2, 256
537	je	end_symbol_pre
538
539	;; Speculatively load next_sym for next loop if a literal was decoded
540	mov	next_sym %+ d, dword [state + _lit_huff_code + LARGE_SHORT_CODE_SIZE * tmp3]
541
542	;; Finish updating read_in_length for read_in
543	mov	tmp1, 64
544	sub	tmp1, read_in_length
545	shr	tmp1, 3
546	add	next_in, tmp1
547	lea	read_in_length, [read_in_length + 8 * tmp1]
548
549	;; Speculatively load next dist code
550	mov	next_bits2, (1 << ISAL_DECODE_SHORT_BITS) - 1
551	and	next_bits2, read_in
552	movzx	next_sym3, word [state + _dist_huff_code + SMALL_SHORT_CODE_SIZE * next_bits2]
553
554	;; Check if next_sym2 is a literal, length, or end of block symbol
555	cmp	next_sym2, 256
556	jl	loop_block
557
558decode_len_dist:
559	;; Determine next_out after the copy is finished. The -1 takes back the
	;; byte that the speculative literal store counted for the length symbol.
560	lea	next_out, [next_out + repeat_length - 1]
561
562	;; Decode distance code (entry was preloaded into next_sym3); rcx returns the extra-bit count
563	decode_next_dist state, ISAL_DECODE_SHORT_BITS, _dist_huff_code, read_in, read_in_length, next_sym3, rcx, tmp2
564
	;; Base distance for the symbol. look_back_dist2 and next_sym3 are the
	;; same register, so this load replaces the symbol with its base value.
565	mov	look_back_dist2 %+ d, [rfc_lookup + _dist_start + 4 * next_sym3]
566
567	;; Load distance code extra bits
568	mov	next_bits, read_in
569
570	;; Calculate the look back distance
571	BZHI	next_bits, next_bits, rcx, tmp4
572	SHRX	read_in, read_in, rcx
573
574	;; Setup next_sym, read_in, and read_in_length for next loop
575	mov	read_in_2, (1 << ISAL_DECODE_LONG_BITS) - 1
576	and	read_in_2, read_in
577	mov	next_sym %+ d, dword [state + _lit_huff_code + LARGE_SHORT_CODE_SIZE * read_in_2]
578	sub	read_in_length, rcx
579
580	;; Copy distance in len/dist pair
581	add	look_back_dist2, next_bits
582
583	;; Find beginning of copy: next_out already points past the copy, so
	;; the source starts repeat_length + look_back_dist2 bytes before it
584	mov	copy_start, next_out
585	sub	copy_start, repeat_length
586	sub	copy_start, look_back_dist2
587
588	;; Check if a valid look back distance was decoded
589	cmp	copy_start, [rsp + start_out_mem_offset]
590	jl	invalid_look_back_distance
591	MOVDQU	xmm1, [copy_start]
592
593	;; Set tmp2 to be the minimum of COPY_SIZE and repeat_length
594	;; This is to decrease use of small_byte_copy branch
595	mov	tmp2, COPY_SIZE
596	cmp	tmp2, repeat_length
597	cmovg	tmp2, repeat_length
598
599	;; Check for overlapping memory in the copy
600	cmp	look_back_dist2, tmp2
601	jl	small_byte_copy_pre
602
603large_byte_copy:
604	;; Copy length distance pair when memory overlap is not an issue.
	;; The destination is copy_start + look_back_dist2; each pass stores the
	;; block held in xmm1 and stops once repeat_length drops to zero or below.
605	MOVDQU [copy_start + look_back_dist2], xmm1
606
607	sub	repeat_length, COPY_SIZE
608	jle	loop_block
609
610	add	copy_start, COPY_SIZE
611	MOVDQU	xmm1, [copy_start]
612	jmp	large_byte_copy
613
614small_byte_copy_pre:
615	;; Copy length distance pair when source and destination overlap
616	add	repeat_length, look_back_dist2
617small_byte_copy:
	;; Store the block, double look_back_dist2 and reload the block from
	;; copy_start; repeat until look_back_dist2 reaches COPY_SIZE
618	MOVDQU [copy_start + look_back_dist2], xmm1
619
620	shl	look_back_dist2, 1
621	MOVDQU	xmm1, [copy_start]
622	cmp	look_back_dist2, COPY_SIZE
623	jl	small_byte_copy
624
	;; Continue with the large copy if bytes remain (flags come from the sub)
625	sub	repeat_length, look_back_dist2
626	jge	large_byte_copy
627	jmp	loop_block
628
629;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
630; Finish Main Loop
;  Tail path used near the end of either buffer: input is loaded byte by
;  byte, output is bounds-checked, and one symbol is handled per pass.
631;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
632end_loop_block_pre:
633	;; Fix up in buffer and out buffer to reflect the actual buffer end
634	add	end_out, OUT_BUFFER_SLOP
635	add	end_in, IN_BUFFER_SLOP
636
637end_loop_block:
638	;; Load read in buffer and decode next lit/len symbol.
	;; read_in, read_in_length and next_out are saved on the stack first so
	;; that end_of_input can roll back a partially decoded symbol.
639	inflate_in_small_load	next_in, end_in, read_in, read_in_length, tmp1, tmp2
640	mov	[rsp + read_in_mem_offset], read_in
641	mov	[rsp + read_in_length_mem_offset], read_in_length
642	mov	[rsp + next_out_mem_offset], next_out
643
644	decode_next_lit_len_with_load state, ISAL_DECODE_LONG_BITS, _lit_huff_code, read_in, read_in_length, next_sym, next_sym_num, tmp1
645
646	;; Check that enough input was available to decode symbol
647	cmp	read_in_length, 0
648	jl	end_of_input
649
650multi_symbol_start:
	;; With more than one symbol still packed in next_sym, the low byte is
	;; written as a literal without further classification
651	cmp	next_sym_num, 1
652	jg	decode_literal
653
	;; Single symbol: below 256 literal, 256 end of block, above 256 length
654	cmp	next_sym, 256
655	jl	decode_literal
656	je	end_symbol
657
658decode_len_dist_2:
659	lea	repeat_length, [next_sym - 254]
660	;; Decode distance code; rcx returns the extra-bit count
661	decode_next_dist_with_load state, ISAL_DECODE_SHORT_BITS, _dist_huff_code, read_in, read_in_length, next_sym, rcx, tmp1
662
663	;; Load distance code extra bits
664	mov	next_bits, read_in
665	mov	look_back_dist %+ d, [rfc_lookup + _dist_start + 4 * next_sym]
666
667	;; Calculate the look back distance and check for enough input
668	BZHI	next_bits, next_bits, rcx, tmp1
669	SHRX	read_in, read_in, rcx
670	add	look_back_dist, next_bits
671	sub	read_in_length, rcx
672	jl	end_of_input
673
674	;; Setup code for byte copy using rep  movsb:
	;; rdi = destination (next_out), rsi = source (next_out - look_back_dist),
	;; rcx = byte count
675	mov	rsi, next_out
676	mov	rdi, rsi
677	mov	rcx, repeat_length
678	sub	rsi, look_back_dist
679
680	;; Check if a valid look back distance was decoded
681	cmp	rsi, [rsp + start_out_mem_offset]
682	jl	invalid_look_back_distance
683
684	;; Check for out buffer overflow; repeat_length now holds the address
	;; just past the end of the copy
685	add	repeat_length, next_out
686	cmp	repeat_length, end_out
687	jg	out_buffer_overflow_repeat
688
689	mov	next_out, repeat_length
690
691	rep	movsb
692	jmp	end_loop_block
693
694decode_literal:
695	;; Store literal decoded from the input stream
696	cmp	next_out, end_out
697	jge	out_buffer_overflow_lit
698	add	next_out, 1
699	mov	byte [next_out - 1], next_sym %+ b
700	sub	next_sym_num, 1
701	jz	end_loop_block
	;; More packed symbols remain: move the next one into the low byte
702	shr	next_sym, 8
703	jmp	multi_symbol_start
704
705;; Set exit codes
706end_of_input:
	;; Roll back to the values saved at end_loop_block so the partially
	;; decoded symbol is not consumed, and clear the literal overflow fields
707	mov	read_in, [rsp + read_in_mem_offset]
708	mov	read_in_length, [rsp + read_in_length_mem_offset]
709	mov	next_out, [rsp + next_out_mem_offset]
710	xor	tmp1, tmp1
711	mov	dword [state + _write_overflow_lits], tmp1 %+ d
712	mov	dword [state + _write_overflow_len], tmp1 %+ d
713	mov	rax, END_INPUT
714	jmp	end
715
716out_buffer_overflow_repeat:
	;; On entry repeat_length = next_out + copy length (set by the caller).
	;; rcx = bytes that still fit; repeat_length becomes the bytes left over.
717	mov	rcx, end_out
718	sub	rcx, next_out
719	sub	repeat_length, rcx
720	sub	repeat_length, next_out
	;; Copy only the part that fits (rsi / rdi were set up by the caller)
721	rep	movsb
722
	;; Record the unfinished part of the copy in the state structure
723	mov	[state + _copy_overflow_len], repeat_length %+ d
724	mov	[state + _copy_overflow_dist], look_back_dist %+ d
725
726	mov	next_out, end_out
727
728	mov	rax, OUT_OVERFLOW
729	jmp	end
730
731out_buffer_overflow_lit:
	;; Record the symbols that did not fit and how many there are
732	mov	dword [state + _write_overflow_lits], next_sym %+ d
733	mov	dword [state + _write_overflow_len], next_sym_num %+ d
	;; next_sym = last packed symbol; next_sym_num ends up as count - 1
734	sub	next_sym_num, 1
735	shl	next_sym_num, 3
736	SHRX	next_sym, next_sym, next_sym_num
737	mov	rax, OUT_OVERFLOW
738	shr	next_sym_num, 3
	;; Last symbol below 256: everything recorded is a literal, done
739	cmp	next_sym, 256
740	jl	end
	;; Otherwise exclude the last symbol from the recorded count. The mov
	;; leaves the flags of the cmp intact for the jg that follows.
741	mov	dword [state + _write_overflow_len], next_sym_num %+ d
742	jg	decode_len_dist_2
743	jmp	end_state
744
745invalid_look_back_distance:
746	mov	rax, INVALID_LOOKBACK
747	jmp	end
748
;; Targets of the zero-code-length checks in decode_next_dist; the label
;; suffix is the register name that was passed as the macro's symbol operand.
;; Each reports END_INPUT when read_in_length is below the value in that
;; register, otherwise INVALID_SYMBOL.
749invalid_dist_symbol_ %+ next_sym:
750	cmp	read_in_length, next_sym
751	jl	end_of_input
752	jmp	invalid_symbol
753invalid_dist_symbol_ %+ next_sym3:
754	cmp	read_in_length, next_sym3
755	jl	end_of_input
756invalid_symbol:
757	mov	rax, INVALID_SYMBOL
758	jmp	end
759
760end_symbol_pre:
761	;; Fix up in buffer and out buffer to reflect the actual buffer.
	;; next_out is stepped back by one to take back the byte that the main
	;; loop's speculative store counted for the end-of-block symbol.
762	sub	next_out, 1
763	add	end_out, OUT_BUFFER_SLOP
764	add	end_in, IN_BUFFER_SLOP
765end_symbol:
	;; Return value 0
766	xor	rax, rax
767end_state:
768	;;  Set flag identifying a new block is required, or that input is
	;;  done when the bfinal field is non-zero
769	mov	byte [state + _block_state], ISAL_BLOCK_NEW_HDR
770	cmp	dword [state + _bfinal], 0
771	je	end
772	mov	byte [state + _block_state], ISAL_BLOCK_INPUT_DONE
773
774end:
	;; Common exit: rax already holds the return value
775	;; Save current buffer states
776	mov	[state + _read_in], read_in
777	mov	[state + _read_in_length], read_in_length %+ d
778
779	;; Set avail_out
780	sub	end_out, next_out
781	mov	dword [state + _avail_out], end_out %+ d
782
783	;; Set total_out: add the number of bytes produced by this call
784	mov	tmp1, next_out
785	sub	tmp1, [state + _next_out]
786	add	[state + _total_out], tmp1 %+ d
787
788	;; Set next_out
789	mov	[state + _next_out], next_out
790
791	;; Set next_in
792	mov	[state + _next_in], next_in
793
794	;; Set avail_in
795	sub	end_in, next_in
796	mov	[state + _avail_in], end_in %+ d
797
798	FUNC_RESTORE
799
800	ret
801