1;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2; Copyright(c) 2011-2018 Intel Corporation All rights reserved. 3; 4; Redistribution and use in source and binary forms, with or without 5; modification, are permitted provided that the following conditions 6; are met: 7; * Redistributions of source code must retain the above copyright 8; notice, this list of conditions and the following disclaimer. 9; * Redistributions in binary form must reproduce the above copyright 10; notice, this list of conditions and the following disclaimer in 11; the documentation and/or other materials provided with the 12; distribution. 13; * Neither the name of Intel Corporation nor the names of its 14; contributors may be used to endorse or promote products derived 15; from this software without specific prior written permission. 16; 17; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 18; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 19; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 20; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 21; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 22; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 23; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 24; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 25; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 29 30%include "options.asm" 31 32%include "lz0a_const.asm" 33%include "data_struct2.asm" 34%include "bitbuf2.asm" 35%include "huffman.asm" 36%include "igzip_compare_types.asm" 37%include "reg_sizes.asm" 38 39%include "stdmac.asm" 40 41extern rfc1951_lookup_table 42_len_to_code_offset equ 0 43 44%define LAST_BYTES_COUNT 3 ; Bytes to prevent reading out of array bounds 45%define LA_STATELESS 280 ; Max number of bytes read in loop2 rounded up to 8 byte boundary 46%define LIT_LEN 286 47%define DIST_LEN 30 48%define HIST_ELEM_SIZE 8 49 50%ifdef DEBUG 51%macro MARK 1 52global %1 53%1: 54%endm 55%else 56%macro MARK 1 57%endm 58%endif 59 60;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 61;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 62;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 63%define file_start rdi 64%define file_length rsi 65%define histogram rdx 66%define rfc_lookup r9 67%define f_i r10 68 69%define curr_data rax 70 71%define tmp2 rcx 72 73%define dist rbx 74%define dist_code2 rbx 75 76%define dist2 r12 77%define dist_code r12 78 79%define len rbp 80%define len_code rbp 81%define hash3 rbp 82 83%define curr_data2 r8 84%define len2 r8 85%define tmp4 r8 86 87%define tmp1 r11 88 89%define tmp3 r13 90 91%define hash r14 92 93%define hash2 r15 94 95%define xtmp0 xmm0 96%define xtmp1 xmm1 97%define xdata xmm2 98 99%define ytmp0 ymm0 100%define ytmp1 ymm1 101 102%if(ARCH == 01) 103%define vtmp0 xtmp0 104%define vtmp1 xtmp1 105%define V_LENGTH 16 106%else 107%define vtmp0 ytmp0 108%define vtmp1 ytmp1 109%define V_LENGTH 32 110%endif 111;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 112;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 113;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 114_eob_count_offset equ 0 ; local variable (8 bytes) 115f_end_i_mem_offset equ 8 116gpr_save_mem_offset equ 16 ; gpr save area (8*8 bytes) 117xmm_save_mem_offset equ 16 + 8*8 ; xmm save area (4*16 bytes) (16 byte aligned) 118stack_size equ 2*8 + 8*8 + 4*16 + 8 119;;; 8 because stack address is odd multiple of 8 after a function call and 120;;; we want it aligned to 16 bytes 121 122%ifidn __OUTPUT_FORMAT__, elf64 123%define arg0 rdi 124%define arg1 rsi 125%define arg2 rdx 126 127%macro FUNC_SAVE 0 128%ifdef ALIGN_STACK 129 push rbp 130 mov rbp, rsp 131 sub rsp, stack_size 132 and rsp, ~15 133%else 134 sub rsp, stack_size 135%endif 136 137 mov [rsp + gpr_save_mem_offset + 0*8], rbx 138 mov [rsp + gpr_save_mem_offset + 1*8], rbp 139 mov [rsp + gpr_save_mem_offset + 2*8], r12 140 mov [rsp + gpr_save_mem_offset + 3*8], r13 141 mov [rsp + gpr_save_mem_offset + 4*8], r14 142 mov [rsp + gpr_save_mem_offset + 5*8], r15 143%endm 144 145%macro FUNC_RESTORE 0 146 mov rbx, [rsp + gpr_save_mem_offset + 0*8] 147 mov rbp, [rsp + gpr_save_mem_offset + 1*8] 148 mov r12, [rsp + gpr_save_mem_offset + 2*8] 149 mov r13, [rsp + gpr_save_mem_offset + 3*8] 150 mov r14, [rsp + gpr_save_mem_offset + 4*8] 151 mov r15, [rsp + gpr_save_mem_offset + 5*8] 152 153%ifndef ALIGN_STACK 154 add rsp, stack_size 155%else 156 mov rsp, rbp 157 pop rbp 158%endif 159%endm 160%endif 161 162%ifidn __OUTPUT_FORMAT__, win64 163%define arg0 rcx 164%define arg1 rdx 165%define arg2 r8 166 167%macro FUNC_SAVE 0 168%ifdef ALIGN_STACK 169 push rbp 170 mov rbp, rsp 171 sub rsp, stack_size 172 and rsp, ~15 173%else 174 sub rsp, stack_size 175%endif 176 177 mov [rsp + gpr_save_mem_offset + 0*8], rbx 178 mov [rsp + gpr_save_mem_offset + 1*8], rsi 179 mov [rsp + gpr_save_mem_offset + 2*8], rdi 180 mov [rsp + gpr_save_mem_offset + 3*8], rbp 181 mov [rsp + gpr_save_mem_offset + 4*8], r12 182 mov [rsp + gpr_save_mem_offset + 5*8], r13 183 mov [rsp + gpr_save_mem_offset + 6*8], r14 184 mov [rsp + gpr_save_mem_offset + 7*8], r15 185%endm 186 187%macro FUNC_RESTORE 0 188 mov rbx, [rsp + gpr_save_mem_offset + 0*8] 189 mov rsi, [rsp + gpr_save_mem_offset + 1*8] 190 mov rdi, [rsp + gpr_save_mem_offset + 2*8] 191 mov rbp, [rsp + gpr_save_mem_offset + 3*8] 192 mov r12, [rsp + gpr_save_mem_offset + 4*8] 193 mov r13, [rsp + gpr_save_mem_offset + 5*8] 194 mov r14, [rsp + gpr_save_mem_offset + 6*8] 195 mov r15, [rsp + gpr_save_mem_offset + 7*8] 196 197%ifndef ALIGN_STACK 198 add rsp, stack_size 199%else 200 mov rsp, rbp 201 pop rbp 202%endif 203%endm 204%endif 205 206 207_lit_len_offset equ 0 208_dist_offset equ (8 * LIT_LEN) 209_hash_offset equ (_dist_offset + 8 * DIST_LEN) 210 211 212%macro len_to_len_code 3 213%define %%len_code %1 ; Output 214%define %%len %2 ; Input 215%define %%rfc_lookup %3 216 movzx %%len_code, byte [%%rfc_lookup + _len_to_code_offset + %%len] 217 or %%len_code, 0x100 218%endm 219 220;;; Clobbers rcx and dist 221%macro dist_to_dist_code 2 222%define %%dist_code %1 ; Output code associated with dist 223%define %%dist_coded %1d 224%define %%dist %2d ; Input dist 225 dec %%dist 226 mov %%dist_coded, %%dist 227 bsr ecx, %%dist_coded 228 dec ecx 229 SHRX %%dist_code, %%dist_code, rcx 230 lea %%dist_coded, [%%dist_coded + 2*ecx] 231 232 cmp %%dist, 1 233 cmovle %%dist_coded, %%dist 234%endm 235 236;;; Clobbers rcx and dist 237%macro dist_to_dist_code2 2 238%define %%dist_code %1 ; Output code associated with dist 239%define %%dist_coded %1d 240%define %%dist %2d ; Input -(dist - 1) 241 neg %%dist 242 mov %%dist_coded, %%dist 243 bsr ecx, %%dist_coded 244 dec ecx 245 SHRX %%dist_code, %%dist_code, rcx 246 lea %%dist_coded, [%%dist_coded + 2*ecx] 247 248 cmp %%dist, 1 249 cmovle %%dist_coded, %%dist 250%endm 251 252[bits 64] 253default rel 254section .text 255 256; void isal_update_histogram 257global isal_update_histogram_ %+ ARCH 258isal_update_histogram_ %+ ARCH %+ : 259 endbranch 260 FUNC_SAVE 261 262%ifnidn file_start, arg0 263 mov file_start, arg0 264%endif 265%ifnidn file_length, arg1 266 mov file_length, arg1 267%endif 268%ifnidn histogram, arg2 269 mov histogram, arg2 270%endif 271 mov f_i, 0 272 cmp file_length, 0 273 je exit_ret ; If nothing to do then exit 274 275 mov tmp1, qword [histogram + _lit_len_offset + 8*256] 276 inc tmp1 277 mov [rsp + _eob_count_offset], tmp1 278 279 lea rfc_lookup, [rfc1951_lookup_table] 280 281 ;; Init hash_table 282 PXOR vtmp0, vtmp0, vtmp0 283 mov rcx, (IGZIP_LVL0_HASH_SIZE - V_LENGTH) 284init_hash_table: 285 MOVDQU [histogram + _hash_offset + 2 * rcx], vtmp0 286 MOVDQU [histogram + _hash_offset + 2 * (rcx + V_LENGTH / 2)], vtmp0 287 sub rcx, V_LENGTH 288 jge init_hash_table 289 290 sub file_length, LA_STATELESS 291 cmp file_length, 0 292 jle end_loop_2 293 294 295 ;; Load first literal into histogram 296 mov curr_data, [file_start + f_i] 297 compute_hash hash, curr_data 298 and hash %+ d, LVL0_HASH_MASK 299 mov [histogram + _hash_offset + 2 * hash], f_i %+ w 300 and curr_data, 0xff 301 inc qword [histogram + _lit_len_offset + HIST_ELEM_SIZE * curr_data] 302 inc f_i 303 304 ;; Setup to begin loop 2 305 MOVDQU xdata, [file_start + f_i] 306 mov curr_data, [file_start + f_i] 307 mov curr_data2, curr_data 308 compute_hash hash, curr_data 309 shr curr_data2, 8 310 compute_hash hash2, curr_data2 311 312 and hash2 %+ d, LVL0_HASH_MASK 313 and hash, LVL0_HASH_MASK 314loop2: 315 xor dist, dist 316 xor dist2, dist2 317 xor tmp3, tmp3 318 319 lea tmp1, [file_start + f_i] 320 321 MOVQ curr_data, xdata 322 PSRLDQ xdata, 1 323 324 ;; Load possible look back distances and update hash data 325 mov dist %+ w, f_i %+ w 326 sub dist, 1 327 sub dist %+ w, word [histogram + _hash_offset + 2 * hash] 328 mov [histogram + _hash_offset + 2 * hash], f_i %+ w 329 330 add f_i, 1 331 332 mov dist2 %+ w, f_i %+ w 333 sub dist2, 1 334 sub dist2 %+ w, word [histogram + _hash_offset + 2 * hash2] 335 mov [histogram + _hash_offset + 2 * hash2], f_i %+ w 336 337 ;; Start computing hashes to be used in either the next loop or 338 ;; for updating the hash if a match is found 339 MOVQ curr_data2, xdata 340 MOVQ tmp2, xdata 341 shr curr_data2, 8 342 compute_hash hash, curr_data2 343 344 ;; Check if look back distances are valid. Load a junk distance of 1 345 ;; if the look back distance is too long for speculative lookups. 346 and dist %+ d, (D-1) 347 neg dist 348 349 and dist2 %+ d, (D-1) 350 neg dist2 351 352 shr tmp2, 16 353 compute_hash hash2, tmp2 354 355 ;; Check for long len/dist matches (>7) 356 mov len, curr_data 357 xor len, [tmp1 + dist - 1] 358 jz compare_loop 359 360 and hash %+ d, LVL0_HASH_MASK 361 and hash2 %+ d, LVL0_HASH_MASK 362 363 MOVQ len2, xdata 364 xor len2, [tmp1 + dist2] 365 jz compare_loop2 366 367 ;; Specutively load the code for the first literal 368 movzx tmp1, curr_data %+ b 369 shr curr_data, 8 370 371 lea tmp3, [f_i + 1] 372 373 ;; Check for len/dist match for first literal 374 test len %+ d, 0xFFFFFFFF 375 jz len_dist_huffman_pre 376 377 ;; Store first literal 378 inc qword [histogram + _lit_len_offset + HIST_ELEM_SIZE * tmp1] 379 380 ;; Check for len/dist match for second literal 381 test len2 %+ d, 0xFFFFFFFF 382 jnz lit_lit_huffman 383len_dist_lit_huffman_pre: 384 ;; Calculate repeat length 385 tzcnt len2, len2 386 shr len2, 3 387 388len_dist_lit_huffman: 389 MOVQ curr_data, xdata 390 shr curr_data, 24 391 compute_hash hash3, curr_data 392 393 ;; Store updated hashes 394 mov [histogram + _hash_offset + 2 * hash], tmp3 %+ w 395 add tmp3,1 396 mov [histogram + _hash_offset + 2 * hash2], tmp3 %+ w 397 add tmp3, 1 398 399 add f_i, len2 400 401 MOVDQU xdata, [file_start + f_i] 402 mov curr_data, [file_start + f_i] 403 mov tmp1, curr_data 404 compute_hash hash, curr_data 405 406 and hash3, LVL0_HASH_MASK 407 mov [histogram + _hash_offset + 2 * hash3], tmp3 %+ w 408 409 dist_to_dist_code2 dist_code2, dist2 410 411 len_to_len_code len_code, len2, rfc_lookup 412 413 shr tmp1, 8 414 compute_hash hash2, tmp1 415 416 inc qword [histogram + _lit_len_offset + HIST_ELEM_SIZE * len_code] 417 inc qword [histogram + _dist_offset + HIST_ELEM_SIZE * dist_code2] 418 419 and hash2 %+ d, LVL0_HASH_MASK 420 and hash, LVL0_HASH_MASK 421 422 cmp f_i, file_length 423 jl loop2 424 jmp end_loop_2 425 ;; encode as dist/len 426 427len_dist_huffman_pre: 428 tzcnt len, len 429 shr len, 3 430 431len_dist_huffman: 432 mov [histogram + _hash_offset + 2 * hash], tmp3 %+ w 433 add tmp3,1 434 mov [histogram + _hash_offset + 2 * hash2], tmp3 %+ w 435 436 dec f_i 437 add f_i, len 438 439 MOVDQU xdata, [file_start + f_i] 440 mov curr_data, [file_start + f_i] 441 mov tmp1, curr_data 442 compute_hash hash, curr_data 443 444 dist_to_dist_code2 dist_code, dist 445 446 len_to_len_code len_code, len, rfc_lookup 447 448 shr tmp1, 8 449 compute_hash hash2, tmp1 450 451 inc qword [histogram + _lit_len_offset + HIST_ELEM_SIZE * len_code] 452 inc qword [histogram + _dist_offset + HIST_ELEM_SIZE * dist_code] 453 454 and hash2 %+ d, LVL0_HASH_MASK 455 and hash, LVL0_HASH_MASK 456 457 cmp f_i, file_length 458 jl loop2 459 jmp end_loop_2 460 461lit_lit_huffman: 462 MOVDQU xdata, [file_start + f_i + 1] 463 and curr_data, 0xff 464 add f_i, 1 465 inc qword [histogram + _lit_len_offset + HIST_ELEM_SIZE * curr_data] 466 467 cmp f_i, file_length 468 jl loop2 469 470end_loop_2: 471 add file_length, LA_STATELESS - LAST_BYTES_COUNT 472 cmp f_i, file_length 473 jge final_bytes 474 475loop2_finish: 476 mov curr_data %+ d, dword [file_start + f_i] 477 compute_hash hash, curr_data 478 and hash %+ d, LVL0_HASH_MASK 479 480 ;; Calculate possible distance for length/dist pair. 481 xor dist, dist 482 mov dist %+ w, f_i %+ w 483 sub dist %+ w, word [histogram + _hash_offset + 2 * hash] 484 mov [histogram + _hash_offset + 2 * hash], f_i %+ w 485 486 ;; Check if look back distance is valid (the dec is to handle when dist = 0) 487 dec dist 488 cmp dist %+ d, (D-1) 489 jae encode_literal_finish 490 inc dist 491 492 ;; Check if look back distance is a match 493 lea tmp4, [file_length + LAST_BYTES_COUNT] 494 sub tmp4, f_i 495 lea tmp1, [file_start + f_i] 496 mov tmp2, tmp1 497 sub tmp2, dist 498 compare tmp4, tmp1, tmp2, len, tmp3 499 500 ;; Limit len to maximum value of 258 501 mov tmp2, 258 502 cmp len, 258 503 cmova len, tmp2 504 cmp len, SHORTEST_MATCH 505 jb encode_literal_finish 506 507 add f_i, len 508 509 len_to_len_code len_code, len, rfc_lookup 510 dist_to_dist_code dist_code, dist 511 512 inc qword [histogram + _lit_len_offset + HIST_ELEM_SIZE * len_code] 513 inc qword [histogram + _dist_offset + HIST_ELEM_SIZE * dist_code] 514 515 cmp f_i, file_length 516 jl loop2_finish 517 jmp final_bytes 518 519encode_literal_finish: 520 ;; Encode literal 521 and curr_data %+ d, 0xFF 522 inc qword [histogram + _lit_len_offset + HIST_ELEM_SIZE * curr_data] 523 524 ;; Setup for next loop 525 add f_i, 1 526 cmp f_i, file_length 527 jl loop2_finish 528 529final_bytes: 530 add file_length, LAST_BYTES_COUNT 531final_bytes_loop: 532 cmp f_i, file_length 533 jge end 534 movzx curr_data, byte [file_start + f_i] 535 inc qword [histogram + _lit_len_offset + HIST_ELEM_SIZE * curr_data] 536 inc f_i 537 jmp final_bytes_loop 538 539end: 540 ;; Handle eob at end of stream 541 mov tmp1, [rsp + _eob_count_offset] 542 mov qword [histogram + _lit_len_offset + HIST_ELEM_SIZE * 256], tmp1 543 544exit_ret: 545 FUNC_RESTORE 546 ret 547 548compare_loop: 549 and hash %+ d, LVL0_HASH_MASK 550 and hash2 %+ d, LVL0_HASH_MASK 551 lea tmp2, [tmp1 + dist - 1] 552 553 mov len2, 250 554 mov len, 8 555 compare250 tmp1, tmp2, len, len2, tmp3, ytmp0, ytmp1 556 557 lea tmp3, [f_i + 1] 558 jmp len_dist_huffman 559 560compare_loop2: 561 add tmp1, 1 562 lea tmp2, [tmp1 + dist2 - 1] 563 564 mov len, 250 565 mov len2, 8 566 compare250 tmp1, tmp2, len2, len, tmp3, ytmp0, ytmp1 567 568 and curr_data, 0xff 569 inc qword [histogram + _lit_len_offset + 8 * curr_data] 570 lea tmp3, [f_i + 1] 571 jmp len_dist_lit_huffman 572 573section .data 574 align 32 575D_vector: 576 dw -(D + 1) & 0xFFFF, -(D + 1) & 0xFFFF, -(D + 1) & 0xFFFF, -(D + 1) & 0xFFFF 577 dw -(D + 1) & 0xFFFF, -(D + 1) & 0xFFFF, -(D + 1) & 0xFFFF, -(D + 1) & 0xFFFF 578 dw -(D + 1) & 0xFFFF, -(D + 1) & 0xFFFF, -(D + 1) & 0xFFFF, -(D + 1) & 0xFFFF 579 dw -(D + 1) & 0xFFFF, -(D + 1) & 0xFFFF, -(D + 1) & 0xFFFF, -(D + 1) & 0xFFFF 580