;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2018 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%include "options.asm"

%include "lz0a_const.asm"
%include "data_struct2.asm"
%include "bitbuf2.asm"
%include "huffman.asm"
%include "igzip_compare_types.asm"
%include "reg_sizes.asm"

%include "stdmac.asm"

extern rfc1951_lookup_table
_len_to_code_offset equ 0

%define LAST_BYTES_COUNT 3 ; Bytes to prevent reading out of array bounds
%define LA_STATELESS 280 ; Max number of bytes read in loop2 rounded up to 8 byte boundary
%define LIT_LEN 286
%define DIST_LEN 30
%define HIST_ELEM_SIZE 8

;;; MARK <name>: emits a global label in DEBUG builds, nothing otherwise.
%ifdef DEBUG
%macro MARK 1
global %1
%1:
%endm
%else
%macro MARK 1
%endm
%endif

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; Register roles. Several names alias the same physical register
;;; (rbx, r12, rbp and r8 each carry more than one name), so two aliases
;;; of one register are never live at the same time.
%define file_start rdi
%define file_length rsi
%define histogram rdx
%define rfc_lookup r9
%define f_i r10

%define curr_data rax

%define tmp2 rcx

%define dist rbx
%define dist_code2 rbx

%define dist2 r12
%define dist_code r12

%define len rbp
%define len_code rbp
%define hash3 rbp

%define curr_data2 r8
%define len2 r8
%define tmp4 r8

%define tmp1 r11

%define tmp3 r13

%define hash r14

%define hash2 r15

%define xtmp0 xmm0
%define xtmp1 xmm1
%define xdata xmm2

%define ytmp0 ymm0
%define ytmp1 ymm1

;;; Vector width used to clear the hash table: 16 bytes when ARCH == 01,
;;; 32 bytes otherwise.
%if(ARCH == 01)
%define vtmp0 xtmp0
%define vtmp1 xtmp1
%define V_LENGTH 16
%else
%define vtmp0 ytmp0
%define vtmp1 ytmp1
%define V_LENGTH 32
%endif
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
_eob_count_offset equ 0 ; local variable (8 bytes)
f_end_i_mem_offset equ 8
gpr_save_mem_offset equ 16 ; gpr save area (8*8 bytes)
xmm_save_mem_offset equ 16 + 8*8 ; xmm save area (4*16 bytes) (16 byte aligned)
stack_size equ 2*8 + 8*8 + 4*16 + 8
;;; 8 because stack address is odd multiple of 8 after a function call and
;;; we want it aligned to 16 bytes

%ifidn __OUTPUT_FORMAT__, elf64
%define arg0 rdi
%define arg1 rsi
%define arg2 rdx

;;; FUNC_SAVE (elf64): reserve stack_size bytes and spill rbx, rbp, r12-r15.
;;; With ALIGN_STACK the old rsp is kept in rbp and rsp is forced to a
;;; 16 byte boundary; the rbp value spilled below is then that frame value,
;;; which is what FUNC_RESTORE needs to get rsp back.
%macro FUNC_SAVE 0
%ifdef ALIGN_STACK
        push rbp
        mov rbp, rsp
        sub rsp, stack_size
        and rsp, ~15
%else
        sub rsp, stack_size
%endif

        mov [rsp + gpr_save_mem_offset + 0*8], rbx
        mov [rsp + gpr_save_mem_offset + 1*8], rbp
        mov [rsp + gpr_save_mem_offset + 2*8], r12
        mov [rsp + gpr_save_mem_offset + 3*8], r13
        mov [rsp + gpr_save_mem_offset + 4*8], r14
        mov [rsp + gpr_save_mem_offset + 5*8], r15
%endm

;;; FUNC_RESTORE (elf64): reload the registers spilled by FUNC_SAVE and
;;; release the stack frame.
%macro FUNC_RESTORE 0
        mov rbx, [rsp + gpr_save_mem_offset + 0*8]
        mov rbp, [rsp + gpr_save_mem_offset + 1*8]
        mov r12, [rsp + gpr_save_mem_offset + 2*8]
        mov r13, [rsp + gpr_save_mem_offset + 3*8]
        mov r14, [rsp + gpr_save_mem_offset + 4*8]
        mov r15, [rsp + gpr_save_mem_offset + 5*8]

%ifndef ALIGN_STACK
        add rsp, stack_size
%else
        mov rsp, rbp
        pop rbp
%endif
%endm
%endif

%ifidn __OUTPUT_FORMAT__, win64
%define arg0 rcx
%define arg1 rdx
%define arg2 r8

;;; FUNC_SAVE (win64): same as the elf64 variant but additionally spills
;;; rsi and rdi.
%macro FUNC_SAVE 0
%ifdef ALIGN_STACK
        push rbp
        mov rbp, rsp
        sub rsp, stack_size
        and rsp, ~15
%else
        sub rsp, stack_size
%endif

        mov [rsp + gpr_save_mem_offset + 0*8], rbx
        mov [rsp + gpr_save_mem_offset + 1*8], rsi
        mov [rsp + gpr_save_mem_offset + 2*8], rdi
        mov [rsp + gpr_save_mem_offset + 3*8], rbp
        mov [rsp + gpr_save_mem_offset + 4*8], r12
        mov [rsp + gpr_save_mem_offset + 5*8], r13
        mov [rsp + gpr_save_mem_offset + 6*8], r14
        mov [rsp + gpr_save_mem_offset + 7*8], r15
%endm

;;; FUNC_RESTORE (win64): reload the registers spilled by FUNC_SAVE and
;;; release the stack frame.
%macro FUNC_RESTORE 0
        mov rbx, [rsp + gpr_save_mem_offset + 0*8]
        mov rsi, [rsp + gpr_save_mem_offset + 1*8]
        mov rdi, [rsp + gpr_save_mem_offset + 2*8]
        mov rbp, [rsp + gpr_save_mem_offset + 3*8]
        mov r12, [rsp + gpr_save_mem_offset + 4*8]
        mov r13, [rsp + gpr_save_mem_offset + 5*8]
        mov r14, [rsp + gpr_save_mem_offset + 6*8]
        mov r15, [rsp + gpr_save_mem_offset + 7*8]

%ifndef ALIGN_STACK
        add rsp, stack_size
%else
        mov rsp, rbp
        pop rbp
%endif
%endm
%endif


;;; Byte offsets of the three regions addressed relative to `histogram`.
_lit_len_offset equ 0
_dist_offset equ (8 * LIT_LEN)
_hash_offset equ (_dist_offset + 8 * DIST_LEN)


;;; len_to_len_code len_code, len, rfc_lookup
;;; Loads one byte from [rfc_lookup + _len_to_code_offset + len] and ORs in
;;; 0x100, giving an index into the literal/length counters.
%macro len_to_len_code 3
%define %%len_code %1 ; Output
%define %%len %2 ; Input
%define %%rfc_lookup %3
        movzx %%len_code, byte [%%rfc_lookup + _len_to_code_offset + %%len]
        or %%len_code, 0x100
%endm

;;; Clobbers rcx and dist
%macro dist_to_dist_code 2
%define %%dist_code %1 ; Output code associated with dist
%define %%dist_coded %1d
%define %%dist %2d ; Input dist
        dec %%dist
        mov %%dist_coded, %%dist
        bsr ecx, %%dist_coded
        dec ecx
        SHRX %%dist_code, %%dist_code, rcx
        lea %%dist_coded, [%%dist_coded + 2*ecx]

        ;; For (dist - 1) <= 1 the code is (dist - 1) itself
        cmp %%dist, 1
        cmovle %%dist_coded, %%dist
%endm

;;; Clobbers rcx and dist
%macro dist_to_dist_code2 2
%define %%dist_code %1 ; Output code associated with dist
%define %%dist_coded %1d
%define %%dist %2d ; Input -(dist - 1)
        neg %%dist
        mov %%dist_coded, %%dist
        bsr ecx, %%dist_coded
        dec ecx
        SHRX %%dist_code, %%dist_code, rcx
        lea %%dist_coded, [%%dist_coded + 2*ecx]

        ;; For (dist - 1) <= 1 the code is (dist - 1) itself
        cmp %%dist, 1
        cmovle %%dist_coded, %%dist
%endm

[bits 64]
default rel
section .text

; void isal_update_histogram
;;; Arguments (moved into the fixed register aliases right after FUNC_SAVE):
;;;   arg0 -> file_start   address of the first input byte
;;;   arg1 -> file_length  number of input bytes
;;;   arg2 -> histogram    base of the counter/hash block
;;;
;;; Memory touched relative to `histogram`:
;;;   + _lit_len_offset  8 byte counters, indexed by literal byte value or
;;;                      by the index produced by len_to_len_code
;;;   + _dist_offset     8 byte counters, indexed by distance code
;;;   + _hash_offset     16 bit entries holding the low word of f_i for the
;;;                      most recent position with a given hash; the table
;;;                      is overwritten with vtmp0 on entry
;;;
;;; The counter at index 256 is read on entry, incremented, parked at
;;; [rsp + _eob_count_offset] and written back at `end`, so on return it
;;; holds its entry value plus one.
;;; If file_length is 0 the routine returns without touching `histogram`.
global isal_update_histogram_ %+ ARCH
isal_update_histogram_ %+ ARCH %+ :
        endbranch
        FUNC_SAVE

        ;; Order matters on win64: arg1 lives in rdx, which is the
        ;; `histogram` register, so it must be copied out first.
%ifnidn file_start, arg0
        mov file_start, arg0
%endif
%ifnidn file_length, arg1
        mov file_length, arg1
%endif
%ifnidn histogram, arg2
        mov histogram, arg2
%endif
        mov f_i, 0
        cmp file_length, 0
        je exit_ret ; If nothing to do then exit

        ;; Remember (entry count + 1) for index 256; stored back at `end`
        mov tmp1, qword [histogram + _lit_len_offset + 8*256]
        inc tmp1
        mov [rsp + _eob_count_offset], tmp1

        lea rfc_lookup, [rfc1951_lookup_table]

        ;; Init hash_table
        ;; rcx counts 16 bit entries; each pass stores two vectors, i.e.
        ;; V_LENGTH entries, walking down from the top of the table.
        PXOR vtmp0, vtmp0, vtmp0
        mov rcx, (IGZIP_LVL0_HASH_SIZE - V_LENGTH)
init_hash_table:
        MOVDQU [histogram + _hash_offset + 2 * rcx], vtmp0
        MOVDQU [histogram + _hash_offset + 2 * (rcx + V_LENGTH / 2)], vtmp0
        sub rcx, V_LENGTH
        jge init_hash_table

        ;; loop2 only runs while f_i < file_length - LA_STATELESS, which
        ;; keeps its wide loads inside the input. Signed compare: the
        ;; difference is negative for short inputs.
        sub file_length, LA_STATELESS
        cmp file_length, 0
        jle end_loop_2


        ;; Load first literal into histogram
        mov curr_data, [file_start + f_i]
        compute_hash hash, curr_data
        and hash %+ d, LVL0_HASH_MASK
        mov [histogram + _hash_offset + 2 * hash], f_i %+ w
        and curr_data, 0xff
        inc qword [histogram + _lit_len_offset + HIST_ELEM_SIZE * curr_data]
        inc f_i

        ;; Setup to begin loop 2
        ;; hash is for the data at f_i, hash2 for the data at f_i + 1
        MOVDQU xdata, [file_start + f_i]
        mov curr_data, [file_start + f_i]
        mov curr_data2, curr_data
        compute_hash hash, curr_data
        shr curr_data2, 8
        compute_hash hash2, curr_data2

        and hash2 %+ d, LVL0_HASH_MASK
        and hash, LVL0_HASH_MASK
loop2:
        xor dist, dist
        xor dist2, dist2
        xor tmp3, tmp3

        lea tmp1, [file_start + f_i]

        MOVQ curr_data, xdata
        PSRLDQ xdata, 1

        ;; Load possible look back distances and update hash data
        mov dist %+ w, f_i %+ w
        sub dist, 1
        sub dist %+ w, word [histogram + _hash_offset + 2 * hash]
        mov [histogram + _hash_offset + 2 * hash], f_i %+ w

        add f_i, 1

        mov dist2 %+ w, f_i %+ w
        sub dist2, 1
        sub dist2 %+ w, word [histogram + _hash_offset + 2 * hash2]
        mov [histogram + _hash_offset + 2 * hash2], f_i %+ w

        ;; Start computing hashes to be used in either the next loop or
        ;; for updating the hash if a match is found
        MOVQ curr_data2, xdata
        MOVQ tmp2, xdata
        shr curr_data2, 8
        compute_hash hash, curr_data2

        ;; Check if look back distances are valid. Load a junk distance of 1
        ;; if the look back distance is too long for speculative lookups.
        and dist %+ d, (D-1)
        neg dist

        and dist2 %+ d, (D-1)
        neg dist2

        shr tmp2, 16
        compute_hash hash2, tmp2

        ;; Check for long len/dist matches (>7)
        ;; len = 8 input bytes XOR the 8 bytes at the candidate position;
        ;; zero means all 8 bytes match.
        mov len, curr_data
        xor len, [tmp1 + dist - 1]
        jz compare_loop

        and hash %+ d, LVL0_HASH_MASK
        and hash2 %+ d, LVL0_HASH_MASK

        MOVQ len2, xdata
        xor len2, [tmp1 + dist2]
        jz compare_loop2

        ;; Speculatively load the code for the first literal
        movzx tmp1, curr_data %+ b
        shr curr_data, 8

        lea tmp3, [f_i + 1]

        ;; Check for len/dist match for first literal
        test len %+ d, 0xFFFFFFFF
        jz len_dist_huffman_pre

        ;; Store first literal
        inc qword [histogram + _lit_len_offset + HIST_ELEM_SIZE * tmp1]

        ;; Check for len/dist match for second literal
        test len2 %+ d, 0xFFFFFFFF
        jnz lit_lit_huffman
len_dist_lit_huffman_pre:
        ;; Calculate repeat length
        ;; (trailing zero bits of the XOR) / 8 = number of equal bytes
        tzcnt len2, len2
        shr len2, 3

len_dist_lit_huffman:
        MOVQ curr_data, xdata
        shr curr_data, 24
        compute_hash hash3, curr_data

        ;; Store updated hashes
        mov [histogram + _hash_offset + 2 * hash], tmp3 %+ w
        add tmp3,1
        mov [histogram + _hash_offset + 2 * hash2], tmp3 %+ w
        add tmp3, 1

        add f_i, len2

        ;; Reload data and hashes for the position after the match
        MOVDQU xdata, [file_start + f_i]
        mov curr_data, [file_start + f_i]
        mov tmp1, curr_data
        compute_hash hash, curr_data

        and hash3, LVL0_HASH_MASK
        mov [histogram + _hash_offset + 2 * hash3], tmp3 %+ w

        dist_to_dist_code2 dist_code2, dist2

        len_to_len_code len_code, len2, rfc_lookup

        shr tmp1, 8
        compute_hash hash2, tmp1

        inc qword [histogram + _lit_len_offset + HIST_ELEM_SIZE * len_code]
        inc qword [histogram + _dist_offset + HIST_ELEM_SIZE * dist_code2]

        and hash2 %+ d, LVL0_HASH_MASK
        and hash, LVL0_HASH_MASK

        cmp f_i, file_length
        jl loop2
        jmp end_loop_2
        ;; encode as dist/len

len_dist_huffman_pre:
        ;; (trailing zero bits of the XOR) / 8 = number of equal bytes
        tzcnt len, len
        shr len, 3

len_dist_huffman:
        mov [histogram + _hash_offset + 2 * hash], tmp3 %+ w
        add tmp3,1
        mov [histogram + _hash_offset + 2 * hash2], tmp3 %+ w

        ;; f_i was already advanced by one in loop2; undo that before
        ;; skipping over the match
        dec f_i
        add f_i, len

        ;; Reload data and hashes for the position after the match
        MOVDQU xdata, [file_start + f_i]
        mov curr_data, [file_start + f_i]
        mov tmp1, curr_data
        compute_hash hash, curr_data

        dist_to_dist_code2 dist_code, dist

        len_to_len_code len_code, len, rfc_lookup

        shr tmp1, 8
        compute_hash hash2, tmp1

        inc qword [histogram + _lit_len_offset + HIST_ELEM_SIZE * len_code]
        inc qword [histogram + _dist_offset + HIST_ELEM_SIZE * dist_code]

        and hash2 %+ d, LVL0_HASH_MASK
        and hash, LVL0_HASH_MASK

        cmp f_i, file_length
        jl loop2
        jmp end_loop_2

lit_lit_huffman:
        ;; Two literals: the first was counted above, count the second here
        MOVDQU xdata, [file_start + f_i + 1]
        and curr_data, 0xff
        add f_i, 1
        inc qword [histogram + _lit_len_offset + HIST_ELEM_SIZE * curr_data]

        cmp f_i, file_length
        jl loop2

end_loop_2:
        ;; file_length becomes (original length - LAST_BYTES_COUNT): the
        ;; 4 byte load in loop2_finish must stay inside the input
        add file_length, LA_STATELESS - LAST_BYTES_COUNT
        cmp f_i, file_length
        jge final_bytes

loop2_finish:
        mov curr_data %+ d, dword [file_start + f_i]
        compute_hash hash, curr_data
        and hash %+ d, LVL0_HASH_MASK

        ;; Calculate possible distance for length/dist pair.
        xor dist, dist
        mov dist %+ w, f_i %+ w
        sub dist %+ w, word [histogram + _hash_offset + 2 * hash]
        mov [histogram + _hash_offset + 2 * hash], f_i %+ w

        ;; Check if look back distance is valid (the dec is to handle when dist = 0)
        dec dist
        cmp dist %+ d, (D-1)
        jae encode_literal_finish
        inc dist

        ;; Check if look back distance is a match
        ;; tmp4 = bytes left in the input from f_i, tmp1 = current position,
        ;; tmp2 = candidate position `dist` bytes back
        lea tmp4, [file_length + LAST_BYTES_COUNT]
        sub tmp4, f_i
        lea tmp1, [file_start + f_i]
        mov tmp2, tmp1
        sub tmp2, dist
        compare tmp4, tmp1, tmp2, len, tmp3

        ;; Limit len to maximum value of 258
        mov tmp2, 258
        cmp len, 258
        cmova len, tmp2
        cmp len, SHORTEST_MATCH
        jb encode_literal_finish

        add f_i, len

        len_to_len_code len_code, len, rfc_lookup
        dist_to_dist_code dist_code, dist

        inc qword [histogram + _lit_len_offset + HIST_ELEM_SIZE * len_code]
        inc qword [histogram + _dist_offset + HIST_ELEM_SIZE * dist_code]

        cmp f_i, file_length
        jl loop2_finish
        jmp final_bytes

encode_literal_finish:
        ;; Encode literal
        and curr_data %+ d, 0xFF
        inc qword [histogram + _lit_len_offset + HIST_ELEM_SIZE * curr_data]

        ;; Setup for next loop
        add f_i, 1
        cmp f_i, file_length
        jl loop2_finish

final_bytes:
        ;; Count the remaining bytes (at most LAST_BYTES_COUNT past the
        ;; loop2_finish limit) as plain literals
        add file_length, LAST_BYTES_COUNT
final_bytes_loop:
        cmp f_i, file_length
        jge end
        movzx curr_data, byte [file_start + f_i]
        inc qword [histogram + _lit_len_offset + HIST_ELEM_SIZE * curr_data]
        inc f_i
        jmp final_bytes_loop

end:
        ;; Handle eob at end of stream
        mov tmp1, [rsp + _eob_count_offset]
        mov qword [histogram + _lit_len_offset + HIST_ELEM_SIZE * 256], tmp1

exit_ret:
        FUNC_RESTORE
        ret

        ;; First candidate matched on all 8 compared bytes: extend the match.
        ;; NOTE(review): compare250 is defined outside this file; it appears
        ;; to start at len = 8 with len2 = 250 as the bound - confirm.
compare_loop:
        and hash %+ d, LVL0_HASH_MASK
        and hash2 %+ d, LVL0_HASH_MASK
        lea tmp2, [tmp1 + dist - 1]

        mov len2, 250
        mov len, 8
        compare250 tmp1, tmp2, len, len2, tmp3, ytmp0, ytmp1

        lea tmp3, [f_i + 1]
        jmp len_dist_huffman

        ;; Second candidate (one byte further on) matched on all 8 compared
        ;; bytes: extend it, then count the skipped first byte as a literal.
compare_loop2:
        add tmp1, 1
        lea tmp2, [tmp1 + dist2 - 1]

        mov len, 250
        mov len2, 8
        compare250 tmp1, tmp2, len2, len, tmp3, ytmp0, ytmp1

        and curr_data, 0xff
        inc qword [histogram + _lit_len_offset + 8 * curr_data]
        lea tmp3, [f_i + 1]
        jmp len_dist_lit_huffman

section .data
        align 32
;;; 16 words of -(D + 1) truncated to 16 bits.
;;; NOTE(review): not referenced by the code visible in this file.
D_vector:
        dw -(D + 1) & 0xFFFF, -(D + 1) & 0xFFFF, -(D + 1) & 0xFFFF, -(D + 1) & 0xFFFF
        dw -(D + 1) & 0xFFFF, -(D + 1) & 0xFFFF, -(D + 1) & 0xFFFF, -(D + 1) & 0xFFFF
        dw -(D + 1) & 0xFFFF, -(D + 1) & 0xFFFF, -(D + 1) & 0xFFFF, -(D + 1) & 0xFFFF
        dw -(D + 1) & 0xFFFF, -(D + 1) & 0xFFFF, -(D + 1) & 0xFFFF, -(D + 1) & 0xFFFF