;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2016 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%include "options.asm"

%include "lz0a_const.asm"
%include "data_struct2.asm"
%include "bitbuf2.asm"
%include "huffman.asm"
%include "igzip_compare_types.asm"
%include "reg_sizes.asm"

%include "stdmac.asm"

%define LARGE_MATCH_HASH_REP 1	; Hash 4 * LARGE_MATCH_HASH_REP elements
%define LARGE_MATCH_MIN 264	; Minimum match size to enter large match emit loop
%define MIN_INBUF_PADDING 16	; Input bytes held back when end_of_stream or flush is set
%define MAX_EMIT_SIZE 258 * 16	; Upper bound on the length handed to compare_large
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Register aliases.  Several names deliberately map onto the same
;; physical register; which name is meaningful depends on the code path.

%define tmp2		rcx
%define hash2		rcx

%define curr_data	rax
%define code		rax
%define tmp5		rax

%define tmp4		rbx
%define dist		rbx
%define code2		rbx
%define hmask1		rbx

%define hash		rdx
%define len		rdx
%define code_len3	rdx
%define tmp8		rdx

%define tmp1		rsi
%define code_len2	rsi

%define file_start	rdi

%define m_bit_count	rbp

%define curr_data2	r8
%define len2		r8
%define tmp6		r8
%define f_end_i		r8

%define m_bits		r9

%define f_i		r10

%define m_out_buf	r11

%define dist2		r12
%define tmp7		r12
%define code4		r12

%define tmp3		r13
%define code3		r13

%define stream		r14

%define hufftables	r15

;; NOTE(review): the original comment here read "GPR r8 & r15 can be used",
;; but both registers are assigned by the alias table above
;; (r8 = curr_data2/len2/tmp6/f_end_i, r15 = hufftables).

%define xtmp0		xmm0	; tmp
%define xtmp1		xmm1	; tmp
%define xhash		xmm2
%define xmask		xmm3
%define xdata		xmm4

%define ytmp0		ymm0	; tmp
%define ytmp1		ymm1	; tmp


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Stack frame layout (offsets from rsp after the prologue)

blen_mem_offset		equ	0	 ; local variable (8 bytes)
f_end_i_mem_offset	equ	8	 ; last input index to process (input end minus padding)
inbuf_slop_offset	equ	16	 ; input padding chosen for this call
gpr_save_mem_offset	equ	32	 ; gpr save area (8*8 bytes)
xmm_save_mem_offset	equ	32 + 8*8 ; xmm save area (4*16 bytes) (16 byte aligned)
stack_size		equ	4*8 + 8*8 + 4*16 + 8
;;; 8 because stack address is odd multiple of 8 after a function call and
;;; we want it aligned to 16 bytes

;; Defines to generate functions for different architecture
;; The %rep block below is expanded three times; at the end of every pass
;; ARCH takes the value of ARCH1 and ARCH1 the value of ARCH2, so the
;; generated symbols are isal_deflate_body_01, _02 and _04.
%xdefine ARCH 01
%xdefine ARCH1 02
%xdefine ARCH2 04

;; When the includer did not choose a COMPARE_TYPE, cycle it 1 -> 2 -> 3 in
;; step with ARCH (presumably consumed by the compare_large macro - confirm
;; against igzip_compare_types.asm).
%ifndef COMPARE_TYPE
%xdefine COMPARE_TYPE_NOT_DEF
%xdefine COMPARE_TYPE 1
%xdefine COMPARE_TYPE1 2
%xdefine COMPARE_TYPE2 3
%endif

%rep 3
%if ARCH == 04
%define USE_HSWNI
%endif

[bits 64]
default rel
section .text

;-----------------------------------------------------------------------
; void isal_deflate_body ( isal_zstream *stream )
; arg 1: rcx: addr of stream
;        (for elf64 output the argument arrives in rdi and is copied to rcx)
;
; Consumes input starting at stream->next_in, looks up candidate matches
; through the 16-bit hash table at _internal_state_head and emits literal
; and length/distance codes with write_bits into stream->next_out.
; On exit next_in/avail_in/total_in, next_out/avail_out/total_out and the
; saved bit buffer (m_bits, m_bit_count) are written back to the stream.
; The state is advanced to ZSTATE_FLUSH_READ_BUFFER when the input is
; exhausted and end_of_stream or flush is set; it is left untouched when
; the routine stops because the output buffer is full.
;
; Saved/restored: rbx, rsi, rdi, rbp, r12-r15
; Clobbered:      rax, rcx, rdx, r8-r11, xmm0-xmm4 (ymm0/ymm1 are handed
;                 to compare_large as scratch), flags
;-----------------------------------------------------------------------
global isal_deflate_body_ %+ ARCH
isal_deflate_body_ %+ ARCH %+ :
	endbranch
%ifidn __OUTPUT_FORMAT__, elf64
	mov	rcx, rdi
%endif

	;; do nothing if (avail_in == 0)
	cmp	dword [rcx + _avail_in], 0
	jne	.skip1

	;; Set stream's next state
	mov	rdx, ZSTATE_FLUSH_READ_BUFFER
	mov	rax, ZSTATE_BODY
	cmp	word [rcx + _end_of_stream], 0
	cmovne	rax, rdx
	cmp	word [rcx + _flush], _NO_FLUSH
	cmovne	rax, rdx
	mov	dword [rcx + _internal_state_state], eax
	ret
.skip1:

%ifdef ALIGN_STACK
	push	rbp
	mov	rbp, rsp
	sub	rsp, stack_size
	and	rsp, ~15
%else
	sub	rsp, stack_size
%endif

	mov	[rsp + gpr_save_mem_offset + 0*8], rbx
	mov	[rsp + gpr_save_mem_offset + 1*8], rsi
	mov	[rsp + gpr_save_mem_offset + 2*8], rdi
	mov	[rsp + gpr_save_mem_offset + 3*8], rbp
	mov	[rsp + gpr_save_mem_offset + 4*8], r12
	mov	[rsp + gpr_save_mem_offset + 5*8], r13
	mov	[rsp + gpr_save_mem_offset + 6*8], r14
	mov	[rsp + gpr_save_mem_offset + 7*8], r15

	mov	stream, rcx
	mov	byte [stream + _internal_state_has_eob], 0

	;; Broadcast the hash mask to all four dwords of xmask
	MOVD	xmask, [stream + _internal_state_hash_mask]
	PSHUFD	xmask, xmask, 0

	; state->bitbuf.set_buf(stream->next_out, stream->avail_out);
	mov	m_out_buf, [stream + _next_out]
	mov	[stream + _internal_state_bitbuf_m_out_start], m_out_buf
	mov	tmp1 %+ d, [stream + _avail_out]
	add	tmp1, m_out_buf
	sub	tmp1, SLOP

	mov	[stream + _internal_state_bitbuf_m_out_end], tmp1

	mov	m_bits, [stream + _internal_state_bitbuf_m_bits]
	mov	m_bit_count %+ d, [stream + _internal_state_bitbuf_m_bit_count]
	mov	hufftables, [stream + _hufftables]

	;; file_start = next_in - total_in, so [file_start + f_i] addresses
	;; the input byte with stream index f_i
	mov	file_start, [stream + _next_in]

	mov	f_i %+ d, dword [stream + _total_in]
	sub	file_start, f_i

	mov	f_end_i %+ d, [stream + _avail_in]
	add	f_end_i, f_i

	;; Choose the input padding: MIN_INBUF_PADDING when end_of_stream or
	;; flush is set, LA otherwise.  Both fields are tested as 16-bit
	;; words, exactly like the early-exit check above and .input_end
	;; below, so the three decisions cannot disagree.
	mov	qword [rsp + inbuf_slop_offset], MIN_INBUF_PADDING
	cmp	word [stream + _end_of_stream], 0
	jnz	.default_inbuf_padding
	cmp	word [stream + _flush], _NO_FLUSH
	jnz	.default_inbuf_padding
	mov	qword [rsp + inbuf_slop_offset], LA
.default_inbuf_padding:

	; f_end_i -= INBUF_PADDING;
	sub	f_end_i, [rsp + inbuf_slop_offset]
	mov	[rsp + f_end_i_mem_offset], f_end_i
	; if (f_end_i <= f_i) nothing can be processed in this call
	; (signed compare: f_end_i may be below f_i after the subtraction)

	cmp	f_end_i, f_i
	jle	.input_end

	MOVD	hmask1 %+ d, xmask
	; for (f_i = f_start_i; f_i < f_end_i; f_i++) {
	MOVDQU	xdata, [file_start + f_i]
	mov	curr_data, [file_start + f_i]
	mov	tmp3, curr_data
	mov	tmp6, curr_data

	;; hash  <- hash of the data at f_i
	;; hash2 <- hash of the data at f_i + 1
	compute_hash	hash, curr_data

	shr	tmp3, 8
	compute_hash	hash2, tmp3

	and	hash %+ d, hmask1 %+ d
	and	hash2 %+ d, hmask1 %+ d

	cmp	byte [stream + _internal_state_has_hist], IGZIP_NO_HIST
	je	.write_first_byte

	jmp	.loop2
	align	16

	;; Main loop.  On entry: hash/hash2 are the masked hashes for f_i and
	;; f_i + 1, xdata holds 16 input bytes starting at f_i and curr_data
	;; the first 8 of them.
.loop2:
	mov	tmp3 %+ d, dword [stream + _internal_state_dist_mask]

	; if (state->bitbuf.is_full()) {
	cmp	m_out_buf, [stream + _internal_state_bitbuf_m_out_end]
	ja	.output_end

	xor	dist, dist
	xor	dist2, dist2

	lea	tmp1, [file_start + f_i]

	;; dist = (f_i - 1 - head[hash]) in 16 bits; head[hash] = f_i
	mov	dist %+ w, f_i %+ w
	dec	dist
	sub	dist %+ w, word [stream + _internal_state_head + 2 * hash]
	mov	[stream + _internal_state_head + 2 * hash], f_i %+ w

	inc	f_i

	;; tmp6 <- hash of the data two bytes past the loop-entry position
	MOVQ	tmp6, xdata
	shr	tmp5, 16
	mov	tmp8, tmp5
	compute_hash	tmp6, tmp5

	;; Same distance computation for the second position
	mov	dist2 %+ w, f_i %+ w
	dec	dist2
	sub	dist2 %+ w, word [stream + _internal_state_head + 2 * hash2]
	mov	[stream + _internal_state_head + 2 * hash2], f_i %+ w

	; if ((dist-1) < (D-1)) {
	and	dist, tmp3
	neg	dist

	;; tmp2 <- hash of the data three bytes past the loop-entry position
	shr	tmp8, 8
	compute_hash	tmp2, tmp8

	and	dist2, tmp3
	neg	dist2

	;; Check for long len/dist match (>7) with first literal
	MOVQ	len, xdata
	mov	curr_data, len
	PSRLDQ	xdata, 1
	xor	len, [tmp1 + dist - 1]
	jz	.compare_loop

	;; xhash <- the two look-ahead hashes (tmp6, tmp2), masked
	MOVD	xhash, tmp6 %+ d
	PINSRD	xhash, tmp2 %+ d, 1
	PAND	xhash, xhash, xmask

	;; Check for len/dist match (>7) with second literal
	MOVQ	len2, xdata
	xor	len2, [tmp1 + dist2]
	jz	.compare_loop2

	;; Speculatively load the code for the first literal
	movzx	tmp1, curr_data %+ b
	get_lit_code	tmp1, code3, rcx, hufftables

	;; Check for len/dist match for first literal
	;; (low 4 bytes of the xor are zero => at least 4 bytes match)
	test	len %+ d, 0xFFFFFFFF
	jz	.len_dist_huffman_pre

	;; Speculatively load the code for the second literal
	shr	curr_data, 8
	and	curr_data, 0xff
	get_lit_code	curr_data, code2, code_len2, hufftables

	;; code2 = (second literal code << first literal length) | first literal code
	SHLX	code2, code2, rcx
	or	code2, code3
	add	code_len2, rcx

	;; Check for len/dist match for second literal
	test	len2 %+ d, 0xFFFFFFFF
	jnz	.write_lit_bits

.len_dist_lit_huffman_pre:
	mov	code_len3, rcx
	;; index of the lowest differing bit / 8 = number of matching bytes
	bsf	len2, len2
	shr	len2, 3

	;; Emit: first literal (code3/code_len3) followed by a match of
	;; length len2 at distance dist2
.len_dist_lit_huffman:
	neg	dist2

%ifndef LONGER_HUFFTABLE
	mov	tmp4, dist2
	get_dist_code	tmp4, code4, code_len2, hufftables ;; clobbers dist, rcx
%else
	get_dist_code	dist2, code4, code_len2, hufftables
%endif
	get_len_code	len2, code, rcx, hufftables ;; rcx is code_len

	MOVD	hmask1 %+ d, xmask

	SHLX	code4, code4, rcx
	or	code4, code
	add	code_len2, rcx

	add	f_i, len2
	neg	len2

	SHLX	code4, code4, code_len3

	MOVQ	tmp5, xdata
	shr	tmp5, 24
	compute_hash	hash2, tmp5
	and	hash2 %+ d, hmask1 %+ d

	or	code4, code3
	add	code_len2, code_len3

	;; Setup for updating hash
	lea	tmp3, [f_i + len2 + 1]	; tmp3 <= k

	mov	tmp6, [rsp + f_end_i_mem_offset]
	cmp	f_i, tmp6
	jge	.len_dist_lit_huffman_finish

	MOVDQU	xdata, [file_start + f_i]
	mov	curr_data, [file_start + f_i]

	;; Record three consecutive positions (k, k+1, k+2) in the hash table
	MOVD	hash %+ d, xhash
	PEXTRD	tmp6 %+ d, xhash, 1
	mov	[stream + _internal_state_head + 2 * hash], tmp3 %+ w

	compute_hash	hash, curr_data

	add	tmp3, 1
	mov	[stream + _internal_state_head + 2 * tmp6], tmp3 %+ w

	add	tmp3, 1
	mov	[stream + _internal_state_head + 2 * hash2], tmp3 %+ w

	write_bits	m_bits, m_bit_count, code4, code_len2, m_out_buf

	mov	curr_data2, curr_data
	shr	curr_data2, 8
	compute_hash	hash2, curr_data2

%ifdef NO_LIMIT_HASH_UPDATE
	;; Hash every remaining position covered by the match
.loop3:
	add	tmp3, 1
	cmp	tmp3, f_i
	jae	.loop3_done
	mov	tmp6, [file_start + tmp3]
	compute_hash	tmp1, tmp6
	and	tmp1 %+ d, hmask1 %+ d
	; state->head[hash] = k;
	mov	[stream + _internal_state_head + 2 * tmp1], tmp3 %+ w
	jmp	.loop3
.loop3_done:
%endif
	; hash = compute_hash(state->file_start + f_i) & hash_mask;
	and	hash %+ d, hmask1 %+ d
	and	hash2 %+ d, hmask1 %+ d

	; continue
	jmp	.loop2
	;; encode as dist/len
	;; Input exhausted: update the hash table, flush the pending code and stop
.len_dist_lit_huffman_finish:
	MOVD	hash %+ d, xhash
	PEXTRD	tmp6 %+ d, xhash, 1
	mov	[stream + _internal_state_head + 2 * hash], tmp3 %+ w
	add	tmp3, 1
	mov	[stream + _internal_state_head + 2 * tmp6], tmp3 %+ w
	add	tmp3, 1
	mov	[stream + _internal_state_head + 2 * hash2], tmp3 %+ w

	write_bits	m_bits, m_bit_count, code4, code_len2, m_out_buf
	jmp	.input_end

align 16
.len_dist_huffman_pre:
	;; index of the lowest differing bit / 8 = number of matching bytes
	bsf	len, len
	shr	len, 3

	;; Emit a match of length len at distance dist starting at the first
	;; position (f_i was already incremented in .loop2, hence the dec)
.len_dist_huffman:
	dec	f_i
	neg	dist

	; get_dist_code(dist, &code2, &code_len2);
%ifndef LONGER_HUFFTABLE
	mov	tmp3, dist	; since code2 and dist are rbx
	get_dist_code	tmp3, code2, code_len2, hufftables ;; clobbers dist, rcx
%else
	get_dist_code	dist, code2, code_len2, hufftables
%endif
	; get_len_code(len, &code, &code_len);
	get_len_code	len, code, rcx, hufftables ;; rcx is code_len

	; code2 <<= code_len
	; code2 |= code
	; code_len2 += code_len
	SHLX	code4, code2, rcx
	or	code4, code
	add	code_len2, rcx

	;; Setup for updating hash
	lea	tmp3, [f_i + 2]	; tmp3 <= k
	add	f_i, len

	MOVD	hash %+ d, xhash
	PEXTRD	hash2 %+ d, xhash, 1
	mov	[stream + _internal_state_head + 2 * hash], tmp3 %+ w
	add	tmp3, 1
	mov	[stream + _internal_state_head + 2 * hash2], tmp3 %+ w

	MOVD	hmask1 %+ d, xmask

	cmp	f_i, [rsp + f_end_i_mem_offset]
	jge	.len_dist_huffman_finish

	MOVDQU	xdata, [file_start + f_i]
	mov	curr_data, [file_start + f_i]
	compute_hash	hash, curr_data

	write_bits	m_bits, m_bit_count, code4, code_len2, m_out_buf

	mov	curr_data2, curr_data
	shr	curr_data2, 8
	compute_hash	hash2, curr_data2

%ifdef NO_LIMIT_HASH_UPDATE
	;; Hash every remaining position covered by the match
.loop4:
	add	tmp3, 1
	cmp	tmp3, f_i
	jae	.loop4_done
	mov	tmp6, [file_start + tmp3]
	compute_hash	tmp1, tmp6
	and	tmp1 %+ d, hmask1 %+ d
	mov	[stream + _internal_state_head + 2 * tmp1], tmp3 %+ w
	jmp	.loop4
.loop4_done:
%endif

	; hash = compute_hash(state->file_start + f_i) & hash_mask;
	and	hash %+ d, hmask1 %+ d
	and	hash2 %+ d, hmask1 %+ d

	; continue
	jmp	.loop2

.len_dist_huffman_finish:
	write_bits	m_bits, m_bit_count, code4, code_len2, m_out_buf
	jmp	.input_end

	;; No match at either position: emit the two literals combined in
	;; code2/code_len2 and advance
align 16
.write_lit_bits:
	PSRLDQ	xdata, 1

	add	f_i, 1
	cmp	f_i, [rsp + f_end_i_mem_offset]
	jge	.write_lit_bits_finish

	MOVQ	curr_data, xdata
	MOVDQU	xdata, [file_start + f_i]

	;; The look-ahead hashes kept in xhash become hash/hash2 for the
	;; next iteration
	MOVD	hash %+ d, xhash

	write_bits	m_bits, m_bit_count, code2, code_len2, m_out_buf

	PEXTRD	hash2 %+ d, xhash, 1
	jmp	.loop2

.write_lit_bits_finish:
	write_bits	m_bits, m_bit_count, code2, code_len2, m_out_buf

	;; All processable input consumed: select the next state
.input_end:
	mov	tmp1, ZSTATE_FLUSH_READ_BUFFER
	mov	tmp5, ZSTATE_BODY
	cmp	word [stream + _end_of_stream], 0
	cmovne	tmp5, tmp1
	cmp	word [stream + _flush], _NO_FLUSH
	cmovne	tmp5, tmp1
	mov	dword [stream + _internal_state_state], tmp5 %+ d

	;; Common exit: write the stream bookkeeping back and return.
	;; Reached directly (state unchanged) when the output buffer is full.
.output_end:
	;; update input buffer
	;; f_end_i + padding restores the true end-of-input index
	mov	f_end_i, [rsp + f_end_i_mem_offset]
	add	f_end_i, [rsp + inbuf_slop_offset]
	mov	[stream + _total_in], f_i %+ d
	add	file_start, f_i
	mov	[stream + _next_in], file_start
	sub	f_end_i, f_i
	mov	[stream + _avail_in], f_end_i %+ d

	;; update output buffer
	mov	[stream + _next_out], m_out_buf
	sub	m_out_buf, [stream + _internal_state_bitbuf_m_out_start]
	sub	[stream + _avail_out], m_out_buf %+ d
	add	[stream + _total_out], m_out_buf %+ d

	mov	[stream + _internal_state_bitbuf_m_bits], m_bits
	mov	[stream + _internal_state_bitbuf_m_bit_count], m_bit_count %+ d

	mov	rbx, [rsp + gpr_save_mem_offset + 0*8]
	mov	rsi, [rsp + gpr_save_mem_offset + 1*8]
	mov	rdi, [rsp + gpr_save_mem_offset + 2*8]
	mov	rbp, [rsp + gpr_save_mem_offset + 3*8]
	mov	r12, [rsp + gpr_save_mem_offset + 4*8]
	mov	r13, [rsp + gpr_save_mem_offset + 5*8]
	mov	r14, [rsp + gpr_save_mem_offset + 6*8]
	mov	r15, [rsp + gpr_save_mem_offset + 7*8]

%ifndef ALIGN_STACK
	add	rsp, stack_size
%else
	mov	rsp, rbp
	pop	rbp
%endif
	ret

	;; The first 8 bytes at the first position match: extend the match
	;; with compare_large, bounded by the remaining input and MAX_EMIT_SIZE
align 16
.compare_loop:
	MOVD	xhash, tmp6 %+ d
	PINSRD	xhash, tmp2 %+ d, 1
	PAND	xhash, xhash, xmask
	lea	tmp2, [tmp1 + dist - 1]

	;; len2 = min(bytes of input left from the match start, MAX_EMIT_SIZE)
	mov	len2, [rsp + f_end_i_mem_offset]
	sub	len2, f_i
	add	len2, [rsp + inbuf_slop_offset]
	add	len2, 1
	mov	tmp3, MAX_EMIT_SIZE
	cmp	len2, tmp3
	cmovg	len2, tmp3

	mov	len, 8
	compare_large	tmp1, tmp2, len, len2, tmp3, ytmp0, ytmp1

	;; len <= 258            : single length/distance code
	;; 258 < len < LARGE_MIN : clamp to 258 and emit one code
	;; len >= LARGE_MIN      : multi-code emit loop
	cmp	len, 258
	jle	.len_dist_huffman
	cmp	len, LARGE_MATCH_MIN
	jge	.do_emit
	mov	len, 258
	jmp	.len_dist_huffman

	;; The first 8 bytes at the second position match: same as above,
	;; but the first byte is still emitted as a literal
align 16
.compare_loop2:
	lea	tmp2, [tmp1 + dist2]
	add	tmp1, 1

	;; len = min(bytes of input left from the match start, MAX_EMIT_SIZE)
	mov	len, [rsp + f_end_i_mem_offset]
	sub	len, f_i
	add	len, [rsp + inbuf_slop_offset]
	mov	tmp3, MAX_EMIT_SIZE
	cmp	len, tmp3
	cmovg	len, tmp3

	mov	len2, 8
	compare_large	tmp1, tmp2, len2, len, tmp3, ytmp0, ytmp1

	and	curr_data, 0xff
	get_lit_code	curr_data, code3, code_len3, hufftables
	cmp	len2, 258
	jle	.len_dist_lit_huffman
	cmp	len2, LARGE_MATCH_MIN
	jge	.do_emit2
	mov	len2, 258
	jmp	.len_dist_lit_huffman

	;; Large match at the second position: emit the literal plus the
	;; first 258-byte length/distance code, then join the emit loop
align 16
.do_emit2:
	neg	dist2

	; get_dist_code(dist2, &code2, &code_len2);
	get_dist_code	dist2, code2, code_len2, hufftables

	; get_len_code(len, &code, &code_len);
	get_len_code	258, code, rcx, hufftables ;; rcx is code_len

	; code2 <<= code_len
	; code2 |= code
	; code_len2 += code_len
	SHLX	code4, code2, rcx
	or	code4, code
	add	code_len2, rcx
	mov	tmp5, rcx	; tmp5 = code length of the 258 length code

	;; code_len3 and tmp8 are both rdx, so the literal's code length is
	;; moved to rcx before tmp8 is written
	mov	rcx, code_len3
	SHLX	tmp8, code4, rcx
	or	code3, tmp8
	add	rcx, code_len2
	mov	code_len3, rcx

	write_bits	m_bits, m_bit_count, code3, code_len3, m_out_buf

	lea	tmp3, [f_i + 2]	; tmp3 <= k
	MOVD	tmp2 %+ d, xhash
	mov	[stream + _internal_state_head + 2 * tmp2], tmp3 %+ w
	add	tmp3, 1
	PEXTRD	tmp2 %+ d, xhash, 1
	mov	[stream + _internal_state_head + 2 * tmp2], tmp3 %+ w

	add	f_i, 258
	lea	len, [len2 - 258]

	jmp	.emit_loop

	;; Large match at the first position: build the 258-byte
	;; length/distance code once and fall into the emit loop
.do_emit:
	dec	f_i
	neg	dist

	; get_dist_code(dist, &code2, &code_len2);
%ifndef LONGER_HUFFTABLE
	mov	tmp3, dist	; since code2 and dist are rbx
	get_dist_code	tmp3, code2, code_len2, hufftables ;; clobbers dist, rcx
%else
	get_dist_code	dist, code2, code_len2, hufftables
%endif
	; get_len_code(len, &code, &code_len);
	get_len_code	258, code, rcx, hufftables ;; rcx is code_len

	; code2 <<= code_len
	; code2 |= code
	; code_len2 += code_len
	SHLX	code4, code2, rcx
	or	code4, code
	add	code_len2, rcx

	lea	tmp3, [f_i + 2]	; tmp3 <= k
	MOVD	tmp6 %+ d, xhash
	PEXTRD	tmp5 %+ d, xhash, 1
	mov	[stream + _internal_state_head + 2 * tmp6], tmp3 %+ w
	add	tmp3, 1
	mov	[stream + _internal_state_head + 2 * tmp5], tmp3 %+ w
	mov	tmp5, rcx	; tmp5 = code length of the 258 length code

	;; Emit one 258-byte length/distance code
.emit:
	add	f_i, 258
	sub	len, 258
	mov	code3, code4

	write_bits	m_bits, m_bit_count, code3, code_len2, m_out_buf

	;; Keep emitting 258-byte codes while at least LARGE_MATCH_MIN bytes
	;; remain and the output buffer has room
.emit_loop:
	cmp	m_out_buf, [stream + _internal_state_bitbuf_m_out_end]
	ja	.output_end
	cmp	len, LARGE_MATCH_MIN
	jge	.emit

	;; Final piece: at most 258 bytes
	mov	len2, 258
	cmp	len, len2
	cmovg	len, len2

	add	f_i, len

	;; Drop the 258 length-code length (tmp5) so code_len2 is the
	;; distance code length again, then append the final length code
	sub	code_len2, tmp5
	get_len_code	len, code, rcx, hufftables
	SHLX	code4, code2, rcx
	or	code4, code
	add	code_len2, rcx

	write_bits	m_bits, m_bit_count, code4, code_len2, m_out_buf

	cmp	f_i, [rsp + f_end_i_mem_offset]
	jge	.input_end

	;; Re-seed the hash table with the last 4 * LARGE_MATCH_HASH_REP
	;; positions before f_i
	lea	tmp7, [f_i - 4 * LARGE_MATCH_HASH_REP]
	MOVD	hmask1 %+ d, xmask
%rep LARGE_MATCH_HASH_REP
	mov	curr_data %+ d, dword [file_start + tmp7]
	mov	curr_data2 %+ d, dword [file_start + tmp7 + 1]

	compute_hash	hash, curr_data
	compute_hash	hash2, curr_data2

	and	hash %+ d, hmask1 %+ d
	and	hash2 %+ d, hmask1 %+ d

	mov	[stream + _internal_state_head + 2 * hash], tmp7 %+ w
	add	tmp7, 1
	mov	[stream + _internal_state_head + 2 * hash2], tmp7 %+ w
	add	tmp7, 1

	mov	curr_data %+ d, dword [file_start + tmp7]
	mov	curr_data2 %+ d, dword [file_start + tmp7 + 1]

	compute_hash	hash, curr_data
	compute_hash	hash2, curr_data2

	and	hash %+ d, hmask1 %+ d
	and	hash2 %+ d, hmask1 %+ d

	mov	[stream + _internal_state_head + 2 * hash], tmp7 %+ w
	add	tmp7, 1
	mov	[stream + _internal_state_head + 2 * hash2], tmp7 %+ w
%if (LARGE_MATCH_HASH_REP > 1)
	add	tmp7, 1
%endif
%endrep

	;; Reload the .loop2 entry state for the new f_i
	MOVDQU	xdata, [file_start + f_i]
	mov	curr_data, [file_start + f_i]
	compute_hash	hash, curr_data


	mov	curr_data2, curr_data
	shr	curr_data2, 8
	compute_hash	hash2, curr_data2

	; hash = compute_hash(state->file_start + f_i) & hash_mask;
	and	hash %+ d, hmask1 %+ d
	and	hash2 %+ d, hmask1 %+ d

	; continue
	jmp	.loop2

	;; First byte of a stream without history: record it in the hash
	;; table and emit it as a literal through .write_lit_bits
.write_first_byte:
	cmp	m_out_buf, [stream + _internal_state_bitbuf_m_out_end]
	ja	.output_end

	mov	byte [stream + _internal_state_has_hist], IGZIP_HIST

	mov	[stream + _internal_state_head + 2 * hash], f_i %+ w

	;; xhash <- (hash for f_i + 1, hash for f_i + 2), masked
	mov	hash, hash2
	shr	tmp6, 16
	compute_hash	hash2, tmp6

	MOVD	xhash, hash %+ d
	PINSRD	xhash, hash2 %+ d, 1
	PAND	xhash, xhash, xmask

	and	curr_data, 0xff
	get_lit_code	curr_data, code2, code_len2, hufftables
	jmp	.write_lit_bits

%ifdef USE_HSWNI
%undef USE_HSWNI
%endif

;; Shift defines over in order to iterate over all versions
%undef ARCH
%xdefine ARCH ARCH1
%undef ARCH1
%xdefine ARCH1 ARCH2

%ifdef COMPARE_TYPE_NOT_DEF
%undef COMPARE_TYPE
%xdefine COMPARE_TYPE COMPARE_TYPE1
%undef COMPARE_TYPE1
%xdefine COMPARE_TYPE1 COMPARE_TYPE2
%endif
%endrep