xref: /isa-l/igzip/igzip_update_histogram.asm (revision cd888f01a447dd04c3a8b50362079648d432d2ca)
1;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2;  Copyright(c) 2011-2018 Intel Corporation All rights reserved.
3;
4;  Redistribution and use in source and binary forms, with or without
5;  modification, are permitted provided that the following conditions
6;  are met:
7;    * Redistributions of source code must retain the above copyright
8;      notice, this list of conditions and the following disclaimer.
9;    * Redistributions in binary form must reproduce the above copyright
10;      notice, this list of conditions and the following disclaimer in
11;      the documentation and/or other materials provided with the
12;      distribution.
13;    * Neither the name of Intel Corporation nor the names of its
14;      contributors may be used to endorse or promote products derived
15;      from this software without specific prior written permission.
16;
17;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
29
30%include "options.asm"
31
32%include "lz0a_const.asm"
33%include "data_struct2.asm"
34%include "bitbuf2.asm"
35%include "huffman.asm"
36%include "igzip_compare_types.asm"
37%include "reg_sizes.asm"
38
39%include "stdmac.asm"
40
41extern rfc1951_lookup_table
42_len_to_code_offset	equ	0
43
44%define LAST_BYTES_COUNT	3 ; Bytes to prevent reading out of array bounds
45%define LA_STATELESS	280	  ; Max number of bytes read in loop2 rounded up to 8 byte boundary
46%define LIT_LEN 286
47%define DIST_LEN 30
48%define HIST_ELEM_SIZE	8
49
50%ifdef DEBUG
51%macro MARK 1
52global %1
53%1:
54%endm
55%else
56%macro MARK 1
57%endm
58%endif
59
60;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
61;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
62;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
63%define	file_start	rdi
64%define file_length	rsi
65%define	histogram	rdx
66%define rfc_lookup	r9
67%define	f_i		r10
68
69%define	curr_data	rax
70
71%define	tmp2		rcx
72
73%define	dist		rbx
74%define	dist_code2	rbx
75
76%define	dist2		r12
77%define	dist_code	r12
78
79%define	len		rbp
80%define	len_code	rbp
81%define	hash3		rbp
82
83%define	curr_data2	r8
84%define	len2		r8
85%define	tmp4		r8
86
87%define	tmp1		r11
88
89%define	tmp3		r13
90
91%define	hash		r14
92
93%define	hash2		r15
94
95%define	xtmp0		xmm0
96%define	xtmp1		xmm1
97%define	xdata		xmm2
98
99%define	ytmp0		ymm0
100%define	ytmp1		ymm1
101
102%if(ARCH == 01)
103%define	vtmp0	xtmp0
104%define	vtmp1	xtmp1
105%define	V_LENGTH	16
106%else
107%define	vtmp0	ytmp0
108%define	vtmp1	ytmp1
109%define	V_LENGTH	32
110%endif
111;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
112;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
113;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
114_eob_count_offset   equ  0	 ; local variable (8 bytes)
115f_end_i_mem_offset  equ  8
116gpr_save_mem_offset equ 16       ; gpr save area (8*8 bytes)
117xmm_save_mem_offset equ 16 + 8*8 ; xmm save area (4*16 bytes) (16 byte aligned)
118stack_size          equ 2*8 + 8*8 + 4*16 + 8
119;;; 8 because stack address is odd multiple of 8 after a function call and
120;;; we want it aligned to 16 bytes
121
122%ifidn __OUTPUT_FORMAT__, elf64
123%define arg0	rdi
124%define	arg1	rsi
125%define arg2	rdx
126
127%macro FUNC_SAVE 0
128%ifdef ALIGN_STACK
129	push	rbp
130	mov	rbp, rsp
131	sub	rsp, stack_size
132	and	rsp, ~15
133%else
134	sub	rsp, stack_size
135%endif
136
137	mov [rsp + gpr_save_mem_offset + 0*8], rbx
138	mov [rsp + gpr_save_mem_offset + 1*8], rbp
139	mov [rsp + gpr_save_mem_offset + 2*8], r12
140	mov [rsp + gpr_save_mem_offset + 3*8], r13
141	mov [rsp + gpr_save_mem_offset + 4*8], r14
142	mov [rsp + gpr_save_mem_offset + 5*8], r15
143%endm
144
145%macro FUNC_RESTORE 0
146	mov	rbx, [rsp + gpr_save_mem_offset + 0*8]
147	mov	rbp, [rsp + gpr_save_mem_offset + 1*8]
148	mov	r12, [rsp + gpr_save_mem_offset + 2*8]
149	mov	r13, [rsp + gpr_save_mem_offset + 3*8]
150	mov	r14, [rsp + gpr_save_mem_offset + 4*8]
151	mov	r15, [rsp + gpr_save_mem_offset + 5*8]
152
153%ifndef ALIGN_STACK
154	add	rsp, stack_size
155%else
156	mov	rsp, rbp
157	pop	rbp
158%endif
159%endm
160%endif
161
162%ifidn __OUTPUT_FORMAT__, win64
163%define arg0	rcx
164%define	arg1	rdx
165%define	arg2	r8
166
167%macro FUNC_SAVE 0
168%ifdef ALIGN_STACK
169	push	rbp
170	mov	rbp, rsp
171	sub	rsp, stack_size
172	and	rsp, ~15
173%else
174	sub	rsp, stack_size
175%endif
176
177	mov [rsp + gpr_save_mem_offset + 0*8], rbx
178	mov [rsp + gpr_save_mem_offset + 1*8], rsi
179	mov [rsp + gpr_save_mem_offset + 2*8], rdi
180	mov [rsp + gpr_save_mem_offset + 3*8], rbp
181	mov [rsp + gpr_save_mem_offset + 4*8], r12
182	mov [rsp + gpr_save_mem_offset + 5*8], r13
183	mov [rsp + gpr_save_mem_offset + 6*8], r14
184	mov [rsp + gpr_save_mem_offset + 7*8], r15
185%endm
186
187%macro FUNC_RESTORE 0
188	mov	rbx, [rsp + gpr_save_mem_offset + 0*8]
189	mov	rsi, [rsp + gpr_save_mem_offset + 1*8]
190	mov	rdi, [rsp + gpr_save_mem_offset + 2*8]
191	mov	rbp, [rsp + gpr_save_mem_offset + 3*8]
192	mov	r12, [rsp + gpr_save_mem_offset + 4*8]
193	mov	r13, [rsp + gpr_save_mem_offset + 5*8]
194	mov	r14, [rsp + gpr_save_mem_offset + 6*8]
195	mov	r15, [rsp + gpr_save_mem_offset + 7*8]
196
197%ifndef ALIGN_STACK
198	add	rsp, stack_size
199%else
200	mov	rsp, rbp
201	pop	rbp
202%endif
203%endm
204%endif
205
206
207_lit_len_offset	equ	0
208_dist_offset	equ	(8 * LIT_LEN)
209_hash_offset	equ	(_dist_offset + 8 * DIST_LEN)
210
211
212%macro len_to_len_code 3
213%define %%len_code	%1 	; Output
214%define	%%len		%2	; Input
215%define	%%rfc_lookup	%3
216	movzx	%%len_code, byte [%%rfc_lookup + _len_to_code_offset + %%len]
217	or	%%len_code, 0x100
218%endm
219
220;;; Clobbers rcx and dist
221%macro	dist_to_dist_code 2
222%define %%dist_code	%1	; Output code associated with dist
223%define	%%dist_coded	%1d
224%define	%%dist		%2d	; Input dist
225	dec	%%dist
226	mov	%%dist_coded, %%dist
227	bsr	ecx, %%dist_coded
228	dec	ecx
229	SHRX	%%dist_code, %%dist_code, rcx
230	lea	%%dist_coded, [%%dist_coded + 2*ecx]
231
232	cmp	%%dist, 1
233	cmovle	%%dist_coded, %%dist
234%endm
235
236;;; Clobbers rcx and dist
237%macro	dist_to_dist_code2 2
238%define	%%dist_code	%1	; Output code associated with dist
239%define %%dist_coded	%1d
240%define	%%dist		%2d	; Input -(dist - 1)
241	neg	%%dist
242	mov	%%dist_coded, %%dist
243	bsr	ecx, %%dist_coded
244	dec	ecx
245	SHRX	%%dist_code, %%dist_code, rcx
246	lea	%%dist_coded, [%%dist_coded + 2*ecx]
247
248	cmp	%%dist, 1
249	cmovle	%%dist_coded, %%dist
250%endm
251
252[bits 64]
253default rel
254section .text
255
256; void isal_update_histogram
257global isal_update_histogram_ %+ ARCH
258isal_update_histogram_ %+ ARCH %+ :
259	endbranch
260	FUNC_SAVE
261
262%ifnidn	file_start, arg0
263	mov	file_start, arg0
264%endif
265%ifnidn	file_length, arg1
266	mov	file_length, arg1
267%endif
268%ifnidn	histogram, arg2
269	mov	histogram, arg2
270%endif
271	mov	f_i, 0
272	cmp	file_length, 0
273	je	exit_ret	; If nothing to do then exit
274
275	mov	tmp1, qword [histogram + _lit_len_offset + 8*256]
276	inc	tmp1
277	mov	[rsp + _eob_count_offset], tmp1
278
279	lea	rfc_lookup, [rfc1951_lookup_table]
280
281	;; Init hash_table
282	PXOR	vtmp0, vtmp0, vtmp0
283	mov	rcx, (IGZIP_LVL0_HASH_SIZE - V_LENGTH)
284init_hash_table:
285	MOVDQU	[histogram + _hash_offset + 2 * rcx], vtmp0
286	MOVDQU	[histogram + _hash_offset + 2 * (rcx + V_LENGTH / 2)], vtmp0
287	sub	rcx, V_LENGTH
288	jge	init_hash_table
289
290	sub	file_length, LA_STATELESS
291	cmp	file_length, 0
292	jle	end_loop_2
293
294
295	;; Load first literal into histogram
296	mov	curr_data, [file_start + f_i]
297	compute_hash	hash, curr_data
298	and	hash %+ d, LVL0_HASH_MASK
299	mov	[histogram + _hash_offset + 2 * hash], f_i %+ w
300	and	curr_data, 0xff
301	inc	qword [histogram + _lit_len_offset + HIST_ELEM_SIZE * curr_data]
302	inc	f_i
303
304	;; Setup to begin loop 2
305	MOVDQU	xdata, [file_start + f_i]
306	mov	curr_data, [file_start + f_i]
307	mov	curr_data2, curr_data
308	compute_hash	hash, curr_data
309	shr	curr_data2, 8
310	compute_hash	hash2, curr_data2
311
312	and	hash2 %+ d, LVL0_HASH_MASK
313	and	hash, LVL0_HASH_MASK
314loop2:
315	xor	dist, dist
316	xor	dist2, dist2
317	xor	tmp3, tmp3
318
319	lea	tmp1, [file_start + f_i]
320
321	MOVQ	curr_data, xdata
322	PSRLDQ	xdata, 1
323
324	;; Load possible look back distances and update hash data
325	mov	dist %+ w, f_i %+ w
326	sub	dist, 1
327	sub	dist %+ w, word [histogram + _hash_offset + 2 * hash]
328	mov	[histogram + _hash_offset + 2 * hash], f_i %+ w
329
330	add	f_i, 1
331
332	mov	dist2 %+ w, f_i %+ w
333	sub	dist2, 1
334	sub	dist2 %+ w, word [histogram + _hash_offset + 2 * hash2]
335	mov	[histogram + _hash_offset + 2 * hash2], f_i %+ w
336
337	;; Start computing hashes to be used in either the next loop or
338	;; for updating the hash if a match is found
339	MOVQ	curr_data2, xdata
340	MOVQ	tmp2, xdata
341	shr	curr_data2, 8
342	compute_hash	hash, curr_data2
343
344	;; Check if look back distances are valid. Load a junk distance of 1
345	;; if the look back distance is too long for speculative lookups.
346	and	dist %+ d, (D-1)
347	neg	dist
348
349	and	dist2 %+ d, (D-1)
350	neg	dist2
351
352	shr	tmp2, 16
353	compute_hash	hash2, tmp2
354
355	;; Check for long len/dist matches (>7)
356	mov	len, curr_data
357	xor	len, [tmp1 + dist - 1]
358	jz	compare_loop
359
360	and	hash %+ d, LVL0_HASH_MASK
361	and	hash2 %+ d, LVL0_HASH_MASK
362
363	MOVQ	len2, xdata
364	xor	len2, [tmp1 + dist2]
365	jz	compare_loop2
366
367	;; Specutively load the code for the first literal
368	movzx   tmp1, curr_data %+ b
369	shr	curr_data, 8
370
371	lea	tmp3, [f_i + 1]
372
373	;; Check for len/dist match for first literal
374	test    len %+ d, 0xFFFFFFFF
375	jz      len_dist_huffman_pre
376
377	;; Store first literal
378	inc	qword [histogram + _lit_len_offset + HIST_ELEM_SIZE * tmp1]
379
380	;; Check for len/dist match for second literal
381	test    len2 %+ d, 0xFFFFFFFF
382	jnz     lit_lit_huffman
383len_dist_lit_huffman_pre:
384	;; Calculate repeat length
385	tzcnt	len2, len2
386	shr	len2, 3
387
388len_dist_lit_huffman:
389	MOVQ	curr_data, xdata
390	shr	curr_data, 24
391	compute_hash hash3, curr_data
392
393	;; Store updated hashes
394	mov	[histogram + _hash_offset + 2 * hash], tmp3 %+ w
395	add	tmp3,1
396	mov	[histogram + _hash_offset + 2 * hash2], tmp3 %+ w
397	add	tmp3, 1
398
399	add	f_i, len2
400
401	MOVDQU	xdata, [file_start + f_i]
402	mov	curr_data, [file_start + f_i]
403	mov	tmp1, curr_data
404	compute_hash	hash, curr_data
405
406	and	hash3, LVL0_HASH_MASK
407	mov	[histogram + _hash_offset + 2 * hash3], tmp3 %+ w
408
409	dist_to_dist_code2 dist_code2, dist2
410
411	len_to_len_code len_code, len2, rfc_lookup
412
413	shr	tmp1, 8
414	compute_hash	hash2, tmp1
415
416	inc	qword [histogram + _lit_len_offset + HIST_ELEM_SIZE * len_code]
417	inc	qword [histogram + _dist_offset + HIST_ELEM_SIZE * dist_code2]
418
419	and	hash2 %+ d, LVL0_HASH_MASK
420	and	hash, LVL0_HASH_MASK
421
422	cmp	f_i, file_length
423	jl	loop2
424	jmp	end_loop_2
425	;; encode as dist/len
426
427len_dist_huffman_pre:
428	tzcnt	len, len
429	shr	len, 3
430
431len_dist_huffman:
432	mov	[histogram + _hash_offset + 2 * hash], tmp3 %+ w
433	add	tmp3,1
434	mov	[histogram + _hash_offset + 2 * hash2], tmp3 %+ w
435
436	dec	f_i
437	add	f_i, len
438
439	MOVDQU	xdata, [file_start + f_i]
440	mov	curr_data, [file_start + f_i]
441	mov	tmp1, curr_data
442	compute_hash	hash, curr_data
443
444	dist_to_dist_code2 dist_code, dist
445
446	len_to_len_code len_code, len, rfc_lookup
447
448	shr	tmp1, 8
449	compute_hash	hash2, tmp1
450
451	inc	qword [histogram + _lit_len_offset + HIST_ELEM_SIZE * len_code]
452	inc	qword [histogram + _dist_offset + HIST_ELEM_SIZE * dist_code]
453
454	and	hash2 %+ d, LVL0_HASH_MASK
455	and	hash, LVL0_HASH_MASK
456
457	cmp	f_i, file_length
458	jl	loop2
459	jmp	end_loop_2
460
461lit_lit_huffman:
462	MOVDQU	xdata, [file_start + f_i + 1]
463	and     curr_data, 0xff
464	add	f_i, 1
465	inc	qword [histogram + _lit_len_offset + HIST_ELEM_SIZE * curr_data]
466
467	cmp	f_i, file_length
468	jl	loop2
469
470end_loop_2:
471	add	file_length, LA_STATELESS - LAST_BYTES_COUNT
472	cmp	f_i, file_length
473	jge	final_bytes
474
475loop2_finish:
476	mov	curr_data %+ d, dword [file_start + f_i]
477	compute_hash	hash, curr_data
478	and	hash %+ d, LVL0_HASH_MASK
479
480	;; Calculate possible distance for length/dist pair.
481	xor	dist, dist
482	mov	dist %+ w, f_i %+ w
483	sub	dist %+ w, word [histogram + _hash_offset + 2 * hash]
484	mov	[histogram + _hash_offset + 2 * hash], f_i %+ w
485
486	;; Check if look back distance is valid (the dec is to handle when dist = 0)
487	dec	dist
488	cmp	dist %+ d, (D-1)
489	jae	encode_literal_finish
490	inc	dist
491
492	;; Check if look back distance is a match
493	lea	tmp4, [file_length + LAST_BYTES_COUNT]
494	sub	tmp4, f_i
495	lea	tmp1, [file_start + f_i]
496	mov	tmp2, tmp1
497	sub	tmp2, dist
498	compare	tmp4, tmp1, tmp2, len, tmp3
499
500	;; Limit len to maximum value of 258
501	mov	tmp2, 258
502	cmp	len, 258
503	cmova	len, tmp2
504	cmp	len, SHORTEST_MATCH
505	jb	encode_literal_finish
506
507	add	f_i, len
508
509	len_to_len_code	len_code, len, rfc_lookup
510	dist_to_dist_code dist_code, dist
511
512	inc	qword [histogram + _lit_len_offset + HIST_ELEM_SIZE * len_code]
513	inc	qword [histogram + _dist_offset + HIST_ELEM_SIZE * dist_code]
514
515	cmp	f_i, file_length
516	jl	loop2_finish
517	jmp	final_bytes
518
519encode_literal_finish:
520	;; Encode literal
521	and	curr_data %+ d, 0xFF
522	inc	qword [histogram + _lit_len_offset + HIST_ELEM_SIZE * curr_data]
523
524	;; Setup for next loop
525	add	f_i, 1
526	cmp	f_i, file_length
527	jl	loop2_finish
528
529final_bytes:
530	add	file_length, LAST_BYTES_COUNT
531final_bytes_loop:
532	cmp	f_i, file_length
533	jge	end
534	movzx	curr_data, byte [file_start + f_i]
535	inc	qword [histogram + _lit_len_offset + HIST_ELEM_SIZE * curr_data]
536	inc	f_i
537	jmp	final_bytes_loop
538
539end:
540	;; Handle eob at end of stream
541	mov	tmp1, [rsp + _eob_count_offset]
542	mov	qword [histogram + _lit_len_offset + HIST_ELEM_SIZE * 256], tmp1
543
544exit_ret:
545	FUNC_RESTORE
546	ret
547
548compare_loop:
549	and	hash %+ d, LVL0_HASH_MASK
550	and	hash2 %+ d, LVL0_HASH_MASK
551	lea	tmp2, [tmp1 + dist - 1]
552
553	mov	len2, 250
554	mov	len, 8
555	compare250	tmp1, tmp2, len, len2, tmp3, ytmp0, ytmp1
556
557	lea	tmp3, [f_i + 1]
558	jmp	len_dist_huffman
559
560compare_loop2:
561	add	tmp1, 1
562	lea	tmp2, [tmp1 + dist2 - 1]
563
564	mov	len, 250
565	mov	len2, 8
566	compare250	tmp1, tmp2, len2, len, tmp3, ytmp0, ytmp1
567
568	and	curr_data, 0xff
569	inc	qword [histogram + _lit_len_offset + 8 * curr_data]
570	lea	tmp3, [f_i + 1]
571	jmp	len_dist_lit_huffman
572
573section .data
574	align 32
575D_vector:
576	dw	-(D + 1) & 0xFFFF, -(D + 1) & 0xFFFF, -(D + 1) & 0xFFFF, -(D + 1) & 0xFFFF
577	dw	-(D + 1) & 0xFFFF, -(D + 1) & 0xFFFF, -(D + 1) & 0xFFFF, -(D + 1) & 0xFFFF
578	dw	-(D + 1) & 0xFFFF, -(D + 1) & 0xFFFF, -(D + 1) & 0xFFFF, -(D + 1) & 0xFFFF
579	dw	-(D + 1) & 0xFFFF, -(D + 1) & 0xFFFF, -(D + 1) & 0xFFFF, -(D + 1) & 0xFFFF
580