1;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2; Copyright(c) 2011-2018 Intel Corporation All rights reserved. 3; 4; Redistribution and use in source and binary forms, with or without 5; modification, are permitted provided that the following conditions 6; are met: 7; * Redistributions of source code must retain the above copyright 8; notice, this list of conditions and the following disclaimer. 9; * Redistributions in binary form must reproduce the above copyright 10; notice, this list of conditions and the following disclaimer in 11; the documentation and/or other materials provided with the 12; distribution. 13; * Neither the name of Intel Corporation nor the names of its 14; contributors may be used to endorse or promote products derived 15; from this software without specific prior written permission. 16; 17; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 18; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 19; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 20; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 21; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 22; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 23; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 24; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 25; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%include "reg_sizes.asm"
%include "lz0a_const.asm"
%include "data_struct2.asm"
%include "stdmac.asm"

; This whole module is only assembled when the assembler understands AVX-512;
; the matching %endif is at the end of the file.
%ifdef HAVE_AS_KNOWS_AVX512

%define ARCH 06
%define USE_HSWNI

; Huffman-table entry layout: each tree entry is 4 bytes.
; lit/len tree (513 entries)
; |   3   |   2   |   1   |   0   |
; |  len  |            code       |
;
; dist tree
; |     3       |   2   |   1   |   0   |
; | eblen:codelen |         code        |

; Input token format:
; DIST_OFFSET:0                 : lit/len
; 31:(DIST_OFFSET + 5)          : dist Extra Bits
; (DIST_OFFSET + 5):DIST_OFFSET : dist code
; lit/len: 0-256 (literal)
;          257-512 (dist + 254)

; Returns the final token pointer,
; equal to token_end if successful.
; C-equivalent prototype (the symbol is exported below as
; encode_deflate_icf_<ARCH>):
; uint32_t* encode_df(uint32_t *token_start, uint32_t *token_end,
;                     BitBuf *out_buf, uint32_t *trees);

%ifidn __OUTPUT_FORMAT__, win64
; Windows x64 ABI: integer args arrive in rcx, rdx, r8, r9.
%define arg1 rcx
%define arg2 rdx
%define arg3 r8
%define arg4 r9
; rsi/rdi are callee-saved on win64, so FUNC_SAVE preserves them.
%define sym rsi
%define dsym rdi
%define hufftables r9                   ; aliases arg4 (no copy needed)
%define ptr r11
%else
; Linux (System V AMD64): integer args arrive in rdi, rsi, rdx, rcx.
%define arg1 rdi
%define arg2 rsi
%define arg3 rdx
%define arg4 rcx
%define sym r9
%define dsym r8
%define hufftables r11
%define ptr rdi                         ; aliases arg1 (no copy needed)
%endif

; Role names for the GPRs used by the encoder.
%define in_buf_end arg2                 ; one-past-last input token (adjusted by slop)
%define bitbuf arg3                     ; BitBuf* — only valid until out_buf is loaded
%define out_buf bitbuf                  ; reuses bitbuf's register once fields are read
; bit_count is rcx
%define bits rax                        ; pending bit accumulator (m_bits)
%define data r12                        ; current 32-bit token in the scalar tail loop
%define tmp rbx
%define len dsym
%define tmp2 r10
%define end_ptr rbp                     ; output limit pointer

; Field extraction masks built from constants in lz0a_const.asm.
%define LIT_MASK ((0x1 << LIT_LEN_BIT_COUNT) - 1)
%define DIST_MASK ((0x1 << DIST_LIT_BIT_COUNT) - 1)

; Vector register roles.  Several registers are deliberately aliased:
; the aliased names are never live at the same time.
%define codes1 zmm1
%define code_lens1 zmm2
%define codes2 zmm3
%define code_lens2 zmm4
%define codes3 zmm5
%define ztmp zmm5                       ; alias of codes3
%define code_lens3 zmm6
%define codes4 zmm7
%define syms zmm7                       ; alias of codes4: lit/len symbols for gather

%define code_lens4 zmm8
%define dsyms zmm8                      ; alias: dist symbols for gather
%define zbits_count_q zmm8              ; alias: per-qword starting bit offsets

%define codes_lookup1 zmm9              ; gathered lit/len table entries
%define codes_lookup2 zmm10             ; gathered dist table entries
%define datas zmm11                     ; 16 raw input tokens
%define zbits zmm12                     ; carried bit-buffer remainder (qword 0)
%define zbits_count zmm13               ; carried bit count (qword 0)
%define zoffset_mask zmm14              ; 0x7 broadcast: bit offset within a byte
%define znotoffset_mask zmm23           ; ~0x7 broadcast (built with vpternlogd)
%define zq_64 zmm15
%define zlit_mask zmm16
%define zdist_mask zmm17
%define zlit_icr_mask zmm18
%define zeb_icr_mask zmm19
%define zmax_write zmm20
%define zrot_perm zmm21
%define zq_8 zmm22

%define VECTOR_SIZE 0x40                ; 64 bytes = 16 tokens per vector load
%define VECTOR_LOOP_PROCESSED (2 * VECTOR_SIZE)
%define VECTOR_SLOP 0x40 - 8            ; output headroom needed by the scatter

;; Stack frame layout (allocated by FUNC_SAVE):
;;   [gpr_save_mem_offset]  6 qword slots for callee-saved GPRs
;;   [xmm_save_mem_offset]  10 * 16 bytes for xmm6-xmm15 (win64 only)
;;   [bitbuf_mem_offset]    saved BitBuf* (arg3's register is reused as out_buf)
gpr_save_mem_offset     equ 0
gpr_save_mem_size       equ 8 * 6
xmm_save_mem_offset     equ gpr_save_mem_offset + gpr_save_mem_size
xmm_save_mem_size       equ 10 * 16
bitbuf_mem_offset       equ xmm_save_mem_offset + xmm_save_mem_size
bitbuf_mem_size         equ 8
stack_size              equ gpr_save_mem_size + xmm_save_mem_size + bitbuf_mem_size


;; Save callee-saved state.  On win64 this includes rsi/rdi and the full
;; xmm6-xmm15 set (all callee-saved under the Microsoft x64 ABI).
%macro FUNC_SAVE 0
	sub	rsp, stack_size
	mov	[rsp + gpr_save_mem_offset + 0*8], rbx
	mov	[rsp + gpr_save_mem_offset + 1*8], rbp
	mov	[rsp + gpr_save_mem_offset + 2*8], r12

%ifidn __OUTPUT_FORMAT__, win64
	mov	[rsp + gpr_save_mem_offset + 3*8], rsi
	mov	[rsp + gpr_save_mem_offset + 4*8], rdi

	;; Each MOVDQU moves 16 bytes, so consecutive slots must be 16 bytes
	;; apart (i*16).  The previous i*8 stride made the stores overlap:
	;; every save clobbered the upper half of the preceding register's
	;; slot, so xmm6-xmm14 were restored with corrupted upper halves,
	;; violating the win64 ABI.  The save area is sized 10*16 to match.
	MOVDQU	[rsp + xmm_save_mem_offset + 0*16], xmm6
	MOVDQU	[rsp + xmm_save_mem_offset + 1*16], xmm7
	MOVDQU	[rsp + xmm_save_mem_offset + 2*16], xmm8
	MOVDQU	[rsp + xmm_save_mem_offset + 3*16], xmm9
	MOVDQU	[rsp + xmm_save_mem_offset + 4*16], xmm10
	MOVDQU	[rsp + xmm_save_mem_offset + 5*16], xmm11
	MOVDQU	[rsp + xmm_save_mem_offset + 6*16], xmm12
	MOVDQU	[rsp + xmm_save_mem_offset + 7*16], xmm13
	MOVDQU	[rsp + xmm_save_mem_offset + 8*16], xmm14
	MOVDQU	[rsp + xmm_save_mem_offset + 9*16], xmm15
%endif

%endm

;; Restore callee-saved state and release the frame (mirror of FUNC_SAVE).
%macro FUNC_RESTORE 0
	mov	rbx, [rsp + gpr_save_mem_offset + 0*8]
	mov	rbp, [rsp + gpr_save_mem_offset + 1*8]
	mov	r12, [rsp + gpr_save_mem_offset + 2*8]

%ifidn __OUTPUT_FORMAT__, win64
	mov	rsi, [rsp + gpr_save_mem_offset + 3*8]
	mov	rdi, [rsp + gpr_save_mem_offset + 4*8]

	;; 16-byte stride — must match FUNC_SAVE exactly.
	MOVDQU	xmm6, [rsp + xmm_save_mem_offset + 0*16]
	MOVDQU	xmm7, [rsp + xmm_save_mem_offset + 1*16]
	MOVDQU	xmm8, [rsp + xmm_save_mem_offset + 2*16]
	MOVDQU	xmm9, [rsp + xmm_save_mem_offset + 3*16]
	MOVDQU	xmm10, [rsp + xmm_save_mem_offset + 4*16]
	MOVDQU	xmm11, [rsp + xmm_save_mem_offset + 5*16]
	MOVDQU	xmm12, [rsp + xmm_save_mem_offset + 6*16]
	MOVDQU	xmm13, [rsp + xmm_save_mem_offset + 7*16]
	MOVDQU	xmm14, [rsp + xmm_save_mem_offset + 8*16]
	MOVDQU	xmm15, [rsp + xmm_save_mem_offset + 9*16]
%endif
	add	rsp, stack_size

%endmacro

default rel
section .text

;-----------------------------------------------------------------------
; uint32_t* encode_deflate_icf_06(uint32_t *token_start, uint32_t *token_end,
;                                 BitBuf *out_buf, uint32_t *trees);
; Encodes a stream of 32-bit deflate ICF tokens into Huffman-coded bits.
; In:   arg1 = first token, arg2 = one past last token,
;       arg3 = BitBuf state, arg4 = Huffman tables
; Out:  rax = final token pointer (== token_end on success);
;       BitBuf m_bits/m_bit_count/m_out_buf updated in memory.
; Main loop processes 16 tokens per iteration with AVX-512 gathers,
; falling back to a scalar loop for the tail and for long codes.
;-----------------------------------------------------------------------
global encode_deflate_icf_ %+ ARCH
encode_deflate_icf_ %+ ARCH:
	endbranch
	FUNC_SAVE

	;; On ABIs where ptr/hufftables already alias the argument register,
	;; these moves are elided at assembly time.
%ifnidn ptr, arg1
	mov	ptr, arg1
%endif
%ifnidn hufftables, arg4
	mov	hufftables, arg4
%endif

	;; Load BitBuf fields; bitbuf's register is reused as out_buf, so
	;; the BitBuf pointer itself is parked on the stack for the epilogue.
	mov	[rsp + bitbuf_mem_offset], bitbuf
	mov	bits, [bitbuf + _m_bits]
	mov	ecx, [bitbuf + _m_bit_count]
	mov	end_ptr, [bitbuf + _m_out_end]
	mov	out_buf, [bitbuf + _m_out_buf]	; clobbers bitbuf

	;; Reserve scatter headroom in the output and require two full
	;; vectors of input before entering the vector loop.
	sub	end_ptr, VECTOR_SLOP
	sub	in_buf_end, VECTOR_LOOP_PROCESSED
	cmp	ptr, in_buf_end
	jge	.finish

	;; k0 cannot be used as a write mask, so an all-ones mask is
	;; materialized from it via knotq below whenever a gather/scatter
	;; needs a fresh (fully-set) completion mask.
	kxorq	k0, k0, k0
	kmovq	k1, [k_mask_1]          ; 0x55555555: even words/dwords
	kmovq	k2, [k_mask_2]          ; 0xf0: qwords 4-7
	kmovq	k3, [k_mask_3]          ; 0xfc: qwords 2-7
	kmovq	k4, [k_mask_4]          ; NOTE(review): loaded but not used below — confirm
	kmovq	k5, [k_mask_5]          ; 0xfe: qwords 1-7

	vmovdqa64 zrot_perm, [rot_perm]

	;; NOTE(review): zq_64/zq_8 are loaded but not referenced in this
	;; file's visible code paths — possibly leftovers; confirm upstream.
	vbroadcasti64x2 zq_64, [q_64]
	vbroadcasti64x2 zq_8, [q_8]

	vpbroadcastq zoffset_mask, [offset_mask]
	;; znotoffset_mask = ~zoffset_mask (ternlog 0x55 = NOT of operand B)
	vpternlogd znotoffset_mask, znotoffset_mask, zoffset_mask, 0x55

	vpbroadcastd zlit_mask, [lit_mask]
	vpbroadcastd zdist_mask, [dist_mask]
	vpbroadcastd zlit_icr_mask, [lit_icr_mask]
	vpbroadcastd zeb_icr_mask, [eb_icr_mask]
	vpbroadcastd zmax_write, [max_write_d]

	;; Prime the software pipeline: gather lit/len table entries for the
	;; first 16 tokens.
	knotq	k6, k0
	vmovdqu64 datas, [ptr]
	vpandd	syms, datas, zlit_mask
	vpgatherdd codes_lookup1 {k6}, [hufftables + _lit_len_table + 4 * syms]

	;; ... and the matching dist table entries.
	knotq	k7, k0
	vpsrld	dsyms, datas, DIST_OFFSET
	vpandd	dsyms, dsyms, zdist_mask
	vpgatherdd codes_lookup2 {k7}, [hufftables + _dist_table + 4 * dsyms]

	;; Seed the carried bit buffer/count into qword 0 of the vectors.
	vmovq	zbits %+ x, bits
	vmovq	zbits_count %+ x, rcx

.main_loop:
	;; Sets codes1 to contain lit/len codes and code_lens1 the
	;; corresponding lengths
	vpsrld	code_lens1, codes_lookup1, 24
	vpandd	codes1, codes_lookup1, zlit_icr_mask

	;; Sets codes2 to contain dist codes, code_lens2 the corresponding
	;; lengths, and code_lens3 the extra bit counts
	vmovdqu16 codes2 {k1}{z}, codes_lookup2	; Bits 8 and above of zbits are 0
	vpsrld	code_lens2, codes_lookup2, 24
	vpsrld	code_lens3, codes_lookup2, 16
	vpandd	code_lens3, code_lens3, zeb_icr_mask

	;; Set codes3 to contain the extra bits
	vpsrld	codes3, datas, EXTRA_BITS_OFFSET

	cmp	out_buf, end_ptr
	ja	.main_loop_exit

	;; Start code lookups for next iteration (overlaps gather latency
	;; with this iteration's merge work)
	knotq	k6, k0
	add	ptr, VECTOR_SIZE
	vmovdqu64 datas, [ptr]
	vpandd	syms, datas, zlit_mask
	vpgatherdd codes_lookup1 {k6}, [hufftables + _lit_len_table + 4 * syms]

	knotq	k7, k0
	vpsrld	dsyms, datas, DIST_OFFSET
	vpandd	dsyms, dsyms, zdist_mask
	vpgatherdd codes_lookup2 {k7}, [hufftables + _dist_table + 4 * dsyms]

	;; Merge dist code with extra bits
	vpsllvd	codes3, codes3, code_lens2
	vpxord	codes2, codes2, codes3
	vpaddd	code_lens2, code_lens2, code_lens3

	;; Check for long codes: if any token's total (len+dist+eb) exceeds
	;; max_write bits, the packed-qword fast path cannot hold it.
	vpaddd	code_lens3, code_lens1, code_lens2
	vpcmpgtd k6, code_lens3, zmax_write
	ktestd	k6, k6
	jnz	.long_codes

	;; Merge dist and len codes
	vpsllvd	codes2, codes2, code_lens1
	vpxord	codes1, codes1, codes2

	;; Split even/odd dword lanes so two tokens can be packed per qword.
	vmovdqa32 codes3 {k1}{z}, codes1
	vpsrlq	codes1, codes1, 32
	vpsrlq	code_lens1, code_lens3, 32
	vmovdqa32 code_lens3 {k1}{z}, code_lens3

	;; Merge bitbuf bits (the carried remainder sits in qword 0)
	vpsllvq	codes3, codes3, zbits_count
	vpxord	codes3, codes3, zbits
	vpaddq	code_lens3, code_lens3, zbits_count

	;; Merge two symbols into qwords
	vpsllvq	codes1, codes1, code_lens3
	vpxord	codes1, codes1, codes3
	vpaddq	code_lens1, code_lens1, code_lens3

	;; Determine total bits at end of each qword (prefix sum across the
	;; 8 qwords using a rotate + two lane shuffles)
	vpermq	zbits_count {k5}{z}, zrot_perm, code_lens1
	vpaddq	code_lens2, zbits_count, code_lens1
	vshufi64x2 zbits_count {k3}{z}, code_lens2, code_lens2, 0x90
	vpaddq	code_lens2, code_lens2, zbits_count
	vshufi64x2 zbits_count {k2}{z}, code_lens2, code_lens2, 0x40
	vpaddq	code_lens2, code_lens2, zbits_count

	;; Bit align quadwords: shift each qword left by the bit offset at
	;; which it will land in the output stream
	vpandd	zbits_count, code_lens2, zoffset_mask
	vpermq	zbits_count_q {k5}{z}, zrot_perm, zbits_count
	vpsllvq	codes1, codes1, zbits_count_q

	;; Check whether any of the last bytes overlap
	vpcmpq	k6 {k5}, code_lens1, zbits_count, 1

	;; Get last byte in each qword
	vpsrlq	code_lens2, code_lens2, 3
	vpaddq	code_lens1, code_lens1, zbits_count_q
	vpandq	code_lens1, code_lens1, znotoffset_mask
	vpsrlvq	codes3, codes1, code_lens1

	;; Branch to handle overlapping last bytes
	ktestd	k6, k6
	jnz	.small_codes

.small_codes_next:
	;; Save off zbits and zbits_count for next loop
	knotq	k7, k5
	vpermq	zbits {k7}{z}, zrot_perm, codes3
	vpermq	zbits_count {k7}{z}, zrot_perm, zbits_count

	;; Merge last byte in each qword with the next qword
	vpermq	codes3 {k5}{z}, zrot_perm, codes3
	vpxord	codes1, codes1, codes3

	;; Determine total bytes written
	vextracti64x2 code_lens1 %+ x, code_lens2, 3
	vpextrq	tmp2, code_lens1 %+ x, 1

	;; Write out qwords at their byte offsets via scatter
	knotq	k6, k0
	vpermq	code_lens2 {k5}{z}, zrot_perm, code_lens2
	vpscatterqq [out_buf + code_lens2] {k6}, codes1

	add	out_buf, tmp2

	cmp	ptr, in_buf_end
	jbe	.main_loop

.main_loop_exit:
	;; Pull the carried bit remainder back into GPRs for the tail loop
	vmovq	rcx, zbits_count %+ x
	vmovq	bits, zbits %+ x
	jmp	.finish

.small_codes:
	;; Merge overlapping last bytes: OR each qword's spill byte into its
	;; neighbor, iterating while adjacent overlaps remain
	vpermq	codes4 {k6}{z}, zrot_perm, codes3
	vporq	codes3, codes3, codes4
	kshiftlq k7, k6, 1
	ktestd	k6, k7
	jz	.small_codes_next

	kandq	k6, k6, k7
	jmp	.small_codes

.long_codes:
	;; Scalar fallback: at least one token's bit length exceeds the fast
	;; path's limit, so emit the 16 tokens one qword at a time.
	add	end_ptr, VECTOR_SLOP
	sub	ptr, VECTOR_SIZE

	vmovdqa32 codes3 {k1}{z}, codes1
	vmovdqa32 code_lens3 {k1}{z}, code_lens1
	vmovdqa32 codes4 {k1}{z}, codes2

	vpsllvq	codes4, codes4, code_lens3
	vpxord	codes3, codes3, codes4
	vpaddd	code_lens3, code_lens1, code_lens2

	vpsrlq	codes1, codes1, 32
	vpsrlq	code_lens1, code_lens1, 32
	vpsrlq	codes2, codes2, 32

	vpsllvq	codes2, codes2, code_lens1
	vpxord	codes1, codes1, codes2

	vpsrlq	code_lens1, code_lens3, 32
	vmovdqa32 code_lens3 {k1}{z}, code_lens3

	;; Merge bitbuf bits
	vpsllvq	codes3, codes3, zbits_count
	vpxord	codes3, codes3, zbits
	vpaddq	code_lens3, code_lens3, zbits_count
	vpaddq	code_lens1, code_lens1, code_lens3

	xor	bits, bits
	xor	rcx, rcx
	vpsubq	code_lens1, code_lens1, code_lens3

	vmovdqu64 codes2, codes1
	vmovdqu64 code_lens2, code_lens1
	vmovdqu64 codes4, codes3
	vmovdqu64 code_lens4, code_lens3
%assign i 0
%rep 4
%assign i (i + 1)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	cmp	out_buf, end_ptr
	ja	.overflow
	;; insert merged code qword (even-lane pair)
	vmovq	sym, codes3 %+ x
	vmovq	tmp2, code_lens3 %+ x
	SHLX	sym, sym, rcx
	or	bits, sym
	add	rcx, tmp2

	; empty bits
	mov	[out_buf], bits
	mov	tmp, rcx
	shr	tmp, 3	; byte count
	add	out_buf, tmp
	mov	tmp, rcx
	and	rcx, ~7
	SHRX	bits, bits, rcx
	mov	rcx, tmp
	and	rcx, 7
	add	ptr, 4

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	cmp	out_buf, end_ptr
	ja	.overflow
	;; insert merged code qword (odd-lane pair)
	vmovq	sym, codes1 %+ x
	vmovq	tmp2, code_lens1 %+ x
	SHLX	sym, sym, rcx
	or	bits, sym
	add	rcx, tmp2

	; empty bits
	mov	[out_buf], bits
	mov	tmp, rcx
	shr	tmp, 3	; byte count
	add	out_buf, tmp
	mov	tmp, rcx
	and	rcx, ~7
	SHRX	bits, bits, rcx
	mov	rcx, tmp
	and	rcx, 7
	add	ptr, 4

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	cmp	out_buf, end_ptr
	ja	.overflow
	;; insert merged code qword (upper even-lane pair)
	vpextrq	sym, codes3 %+ x, 1
	vpextrq	tmp2, code_lens3 %+ x, 1
	SHLX	sym, sym, rcx
	or	bits, sym
	add	rcx, tmp2

	; empty bits
	mov	[out_buf], bits
	mov	tmp, rcx
	shr	tmp, 3	; byte count
	add	out_buf, tmp
	mov	tmp, rcx
	and	rcx, ~7
	SHRX	bits, bits, rcx
	mov	rcx, tmp
	and	rcx, 7
	add	ptr, 4

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	cmp	out_buf, end_ptr
	ja	.overflow
	;; insert merged code qword (upper odd-lane pair)
	vpextrq	sym, codes1 %+ x, 1
	vpextrq	tmp2, code_lens1 %+ x, 1
	SHLX	sym, sym, rcx
	or	bits, sym
	add	rcx, tmp2

	; empty bits
	mov	[out_buf], bits
	mov	tmp, rcx
	shr	tmp, 3	; byte count
	add	out_buf, tmp
	mov	tmp, rcx
	and	rcx, ~7
	SHRX	bits, bits, rcx
	mov	rcx, tmp
	and	rcx, 7
	add	ptr, 4

	;; Pull the next 128-bit lane down for the following %rep pass.
	;; NOTE(review): on the final pass i == 4, so these extracts are dead
	;; (results unused after the loop); kept to match upstream behavior.
	vextracti32x4 codes3 %+ x, codes4, i
	vextracti32x4 code_lens3 %+ x, code_lens4, i
	vextracti32x4 codes1 %+ x, codes2, i
	vextracti32x4 code_lens1 %+ x, code_lens2, i
%endrep
	sub	end_ptr, VECTOR_SLOP

	;; Re-seed the carried bit state and resume the vector loop
	vmovq	zbits %+ x, bits
	vmovq	zbits_count %+ x, rcx
	cmp	ptr, in_buf_end
	jbe	.main_loop

.finish:
	;; Undo the entry adjustments, then encode any remaining tokens
	;; one at a time.
	add	in_buf_end, VECTOR_LOOP_PROCESSED
	add	end_ptr, VECTOR_SLOP

	cmp	ptr, in_buf_end
	jge	.overflow

.finish_loop:
	mov	DWORD(data), [ptr]

	cmp	out_buf, end_ptr
	ja	.overflow

	mov	sym, data
	and	sym, LIT_MASK	; sym has ll_code
	mov	DWORD(sym), [hufftables + _lit_len_table + sym * 4]

	; look up dist sym
	mov	dsym, data
	shr	dsym, DIST_OFFSET
	and	dsym, DIST_MASK
	mov	DWORD(dsym), [hufftables + _dist_table + dsym * 4]

	; insert LL code
	; sym: 31:24 length; 23:0 code
	mov	tmp2, sym
	and	sym, 0xFFFFFF
	SHLX	sym, sym, rcx
	shr	tmp2, 24
	or	bits, sym
	add	rcx, tmp2

	; insert dist code
	movzx	tmp, WORD(dsym)
	SHLX	tmp, tmp, rcx
	or	bits, tmp
	mov	tmp, dsym
	shr	tmp, 24
	add	rcx, tmp

	; insert dist extra bits
	shr	data, EXTRA_BITS_OFFSET
	add	ptr, 4
	SHLX	data, data, rcx
	or	bits, data
	shr	dsym, 16
	and	dsym, 0xFF
	add	rcx, dsym

	; empty bits: flush whole bytes, keep the 0-7 bit remainder
	mov	[out_buf], bits
	mov	tmp, rcx
	shr	tmp, 3	; byte count
	add	out_buf, tmp
	mov	tmp, rcx
	and	rcx, ~7
	SHRX	bits, bits, rcx
	mov	rcx, tmp
	and	rcx, 7

	cmp	ptr, in_buf_end
	jb	.finish_loop

.overflow:
	;; Write back BitBuf state and return the final token pointer.
	mov	tmp, [rsp + bitbuf_mem_offset]
	mov	[tmp + _m_bits], bits
	mov	[tmp + _m_bit_count], ecx
	mov	[tmp + _m_out_buf], out_buf

	mov	rax, ptr

	FUNC_RESTORE

	ret

section .data
	align 64
;; 64 byte data
;; rotate-right-by-one qword permutation (index 7 into lane 0, etc.)
rot_perm:
	dq 0x00000007, 0x00000000, 0x00000001, 0x00000002
	dq 0x00000003, 0x00000004, 0x00000005, 0x00000006

;; 16 byte data
q_64:
	dq 0x0000000000000040, 0x0000000000000000
q_8:
	dq 0x0000000000000000, 0x0000000000000008

;; 8 byte data
offset_mask:
	dq 0x0000000000000007

;; 4 byte data
max_write_d:
	dd 0x1c
lit_mask:
	dd LIT_MASK
dist_mask:
	dd DIST_MASK
lit_icr_mask:
	dd 0x00ffffff
eb_icr_mask:
	dd 0x000000ff

;; k mask constants
k_mask_1: dq 0x55555555
k_mask_2: dq 0xfffffff0
k_mask_3: dq 0xfffffffc
k_mask_4: dw 0x0101, 0x0101, 0x0101, 0x0101
k_mask_5: dq 0xfffffffe

%endif