;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2018 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%include "reg_sizes.asm"
%include "lz0a_const.asm"
%include "data_struct2.asm"
%include "stdmac.asm"

%define ARCH 04
%define USE_HSWNI

; tree entry is 4 bytes:
; lit/len tree (513 entries)
; |  3  |  2  |  1  |  0  |
; | len |       code      |
;
; dist tree
; |  3  |  2  |  1  |  0  |
; |eblen:codlen|   code   |

; token format:
; DIST_OFFSET:0 : lit/len
; 31:(DIST_OFFSET + 5) : dist Extra Bits
; (DIST_OFFSET + 5):DIST_OFFSET : dist code
; lit/len: 0-256 (literal)
;          257-512 (dist + 254)

; returns final token pointer
; equal to token_end if successful
; uint32_t* encode_df(uint32_t *token_start, uint32_t *token_end,
;                     BitBuf *out_buf, uint32_t *trees);

%ifidn __OUTPUT_FORMAT__, win64
%define arg1 rcx
%define arg2 rdx
%define arg3 r8
%define arg4 r9
%define sym rsi
%define dsym rdi
%define hufftables r9
%define ptr r11
%else
; Linux
%define arg1 rdi
%define arg2 rsi
%define arg3 rdx
%define arg4 rcx
%define sym r9
%define dsym r8
%define hufftables r11
%define ptr rdi
%endif

%define in_buf_end   arg2
%define bitbuf       arg3
%define out_buf      bitbuf
; bit_count is rcx
%define bits         rax
%define data         r12
%define tmp          rbx
%define len          dsym
%define tmp2         r10
%define end_ptr      rbp

%define LIT_MASK     ((0x1 << LIT_LEN_BIT_COUNT) - 1)
%define DIST_MASK    ((0x1 << DIST_LIT_BIT_COUNT) - 1)

%define codes1        ymm1
%define code_lens1    ymm2
%define codes2        ymm3
%define code_lens2    ymm4
%define codes3        ymm5
%define code_lens3    ymm6
%define codes4        ymm7
%define syms          ymm7

%define code_lens4    ymm8
%define dsyms         ymm8

%define ytmp          ymm9
%define codes_lookup1 ymm10
%define codes_lookup2 ymm11
%define datas         ymm12
%define ybits         ymm13
%define ybits_count   ymm14
%define yoffset_mask  ymm15
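
;; For orientation, a rough C-level sketch of one token's encode step
;; (illustrative only: write_bits() is shorthand for appending a code to
;; the bit buffer, and the field constants DIST_OFFSET and
;; EXTRA_BITS_OFFSET come from lz0a_const.asm, not from this file):
;;
;;	uint32_t tok     = *token;
;;	uint32_t ll_code = lit_len_table[tok & LIT_MASK];
;;	uint32_t d_code  = dist_table[(tok >> DIST_OFFSET) & DIST_MASK];
;;	uint32_t eb      = tok >> EXTRA_BITS_OFFSET; // dist extra bits
;;	write_bits(ll_code); // lit/len code, then dist code, then eb
;;	write_bits(d_code);
;;	write_bits(eb);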

%define VECTOR_SIZE 0x20
%define VECTOR_LOOP_PROCESSED (2 * VECTOR_SIZE)
%define VECTOR_SLOP 0x20 - 8

gpr_save_mem_offset  equ 0
gpr_save_mem_size    equ 8 * 6
xmm_save_mem_offset  equ gpr_save_mem_offset + gpr_save_mem_size
xmm_save_mem_size    equ 10 * 16
bitbuf_mem_offset    equ xmm_save_mem_offset + xmm_save_mem_size
bitbuf_mem_size      equ 8
stack_size           equ gpr_save_mem_size + xmm_save_mem_size + bitbuf_mem_size


%macro FUNC_SAVE 0
        sub     rsp, stack_size
        mov     [rsp + gpr_save_mem_offset + 0*8], rbx
        mov     [rsp + gpr_save_mem_offset + 1*8], rbp
        mov     [rsp + gpr_save_mem_offset + 2*8], r12

%ifidn __OUTPUT_FORMAT__, win64
        mov     [rsp + gpr_save_mem_offset + 3*8], rsi
        mov     [rsp + gpr_save_mem_offset + 4*8], rdi

        ; each xmm slot is 16 bytes wide
        MOVDQU  [rsp + xmm_save_mem_offset + 0*16], xmm6
        MOVDQU  [rsp + xmm_save_mem_offset + 1*16], xmm7
        MOVDQU  [rsp + xmm_save_mem_offset + 2*16], xmm8
        MOVDQU  [rsp + xmm_save_mem_offset + 3*16], xmm9
        MOVDQU  [rsp + xmm_save_mem_offset + 4*16], xmm10
        MOVDQU  [rsp + xmm_save_mem_offset + 5*16], xmm11
        MOVDQU  [rsp + xmm_save_mem_offset + 6*16], xmm12
        MOVDQU  [rsp + xmm_save_mem_offset + 7*16], xmm13
        MOVDQU  [rsp + xmm_save_mem_offset + 8*16], xmm14
        MOVDQU  [rsp + xmm_save_mem_offset + 9*16], xmm15
%endif

%endm

%macro FUNC_RESTORE 0
        mov     rbx, [rsp + gpr_save_mem_offset + 0*8]
        mov     rbp, [rsp + gpr_save_mem_offset + 1*8]
        mov     r12, [rsp + gpr_save_mem_offset + 2*8]

%ifidn __OUTPUT_FORMAT__, win64
        mov     rsi, [rsp + gpr_save_mem_offset + 3*8]
        mov     rdi, [rsp + gpr_save_mem_offset + 4*8]

        MOVDQU  xmm6, [rsp + xmm_save_mem_offset + 0*16]
        MOVDQU  xmm7, [rsp + xmm_save_mem_offset + 1*16]
        MOVDQU  xmm8, [rsp + xmm_save_mem_offset + 2*16]
        MOVDQU  xmm9, [rsp + xmm_save_mem_offset + 3*16]
        MOVDQU  xmm10, [rsp + xmm_save_mem_offset + 4*16]
        MOVDQU  xmm11, [rsp + xmm_save_mem_offset + 5*16]
        MOVDQU  xmm12, [rsp + xmm_save_mem_offset + 6*16]
        MOVDQU  xmm13, [rsp + xmm_save_mem_offset + 7*16]
        MOVDQU  xmm14, [rsp + xmm_save_mem_offset + 8*16]
        MOVDQU  xmm15, [rsp + xmm_save_mem_offset + 9*16]
%endif
        add     rsp, stack_size

%endmacro

default rel
section .text

global encode_deflate_icf_ %+ ARCH
encode_deflate_icf_ %+ ARCH:
        endbranch
        FUNC_SAVE

%ifnidn ptr, arg1
        mov     ptr, arg1
%endif
%ifnidn hufftables, arg4
        mov     hufftables, arg4
%endif

        mov     [rsp + bitbuf_mem_offset], bitbuf
        mov     bits, [bitbuf + _m_bits]
        mov     ecx, [bitbuf + _m_bit_count]
        mov     end_ptr, [bitbuf + _m_out_end]
        mov     out_buf, [bitbuf + _m_out_buf]  ; clobbers bitbuf

        sub     end_ptr, VECTOR_SLOP
        sub     in_buf_end, VECTOR_LOOP_PROCESSED
        cmp     ptr, in_buf_end
        jge     .finish

        vpcmpeqq        ytmp, ytmp, ytmp
        vmovdqu datas, [ptr]
        vpand   syms, datas, [lit_mask]
        vpgatherdd      codes_lookup1, [hufftables + _lit_len_table + 4 * syms], ytmp

        vpcmpeqq        ytmp, ytmp, ytmp
        vpsrld  dsyms, datas, DIST_OFFSET
        vpand   dsyms, dsyms, [dist_mask]
        vpgatherdd      codes_lookup2, [hufftables + _dist_table + 4 * dsyms], ytmp

        vmovq   ybits %+ x, bits
        vmovq   ybits_count %+ x, rcx
        vmovdqa yoffset_mask, [offset_mask]
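
;; Each main-loop iteration encodes 8 tokens. Codes and bit lengths are
;; gathered per dword lane, then merged pairwise into ever-wider units:
;; dist code with its extra bits, then lit/len with dist into dwords,
;; dword pairs into qwords, and qword pairs into a dqword. Each merge
;; step is conceptually (a sketch, not literal code):
;;	merged     = code_lo | (code_hi << len_lo)
;;	merged_len = len_lo + len_hi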
.main_loop:
        ;; Sets codes1 to contain lit/len codes and code_lens1 the corresponding lengths
        vpsrld  code_lens1, codes_lookup1, 24
        vpand   codes1, codes_lookup1, [lit_icr_mask]

        ;; Sets codes2 to contain dist codes, code_lens2 the corresponding lengths,
        ;; and code_lens3 the extra bit counts
        vpblendw        codes2, ybits, codes_lookup2, 0x55 ; Bits 8 and above of ybits are 0
        vpsrld  code_lens2, codes_lookup2, 24
        vpsrld  code_lens3, codes_lookup2, 16
        vpand   code_lens3, code_lens3, [eb_icr_mask]

        ;; Set codes3 to contain the extra bits
        vpsrld  codes3, datas, EXTRA_BITS_OFFSET

        cmp     out_buf, end_ptr
        ja      .main_loop_exit

        ;; Start code lookups for next iteration
        add     ptr, VECTOR_SIZE
        vpcmpeqq        ytmp, ytmp, ytmp
        vmovdqu datas, [ptr]
        vpand   syms, datas, [lit_mask]
        vpgatherdd      codes_lookup1, [hufftables + _lit_len_table + 4 * syms], ytmp

        vpcmpeqq        ytmp, ytmp, ytmp
        vpsrld  dsyms, datas, DIST_OFFSET
        vpand   dsyms, dsyms, [dist_mask]
        vpgatherdd      codes_lookup2, [hufftables + _dist_table + 4 * dsyms], ytmp

        ;; Merge dist code with extra bits
        vpsllvd codes3, codes3, code_lens2
        vpxor   codes2, codes2, codes3
        vpaddd  code_lens2, code_lens2, code_lens3

        ;; Check for long codes
        vpaddd  code_lens3, code_lens1, code_lens2
        vpcmpgtd        ytmp, code_lens3, [max_write_d]
        vptest  ytmp, ytmp
        jnz     .long_codes

        ;; Merge dist and len codes
        vpsllvd codes2, codes2, code_lens1
        vpxor   codes1, codes1, codes2

        ;; Split buffer data into qwords, ytmp is 0 after last branch
        vpblendd        codes3, ytmp, codes1, 0x55
        vpsrlq  codes1, codes1, 32
        vpsrlq  code_lens1, code_lens3, 32
        vpblendd        code_lens3, ytmp, code_lens3, 0x55

        ;; Merge bitbuf bits
        vpsllvq codes3, codes3, ybits_count
        vpxor   codes3, codes3, ybits
        vpaddq  code_lens3, code_lens3, ybits_count

        ;; Merge two symbols into qwords
        vpsllvq codes1, codes1, code_lens3
        vpxor   codes1, codes1, codes3
        vpaddq  code_lens1, code_lens1, code_lens3

        ;; Split buffer data into dqwords, ytmp is 0 after last branch
        vpblendd        codes2, ytmp, codes1, 0x33
        vpblendd        code_lens2, ytmp, code_lens1, 0x33
        vpsrldq codes1, codes1, 8
        vpsrldq code_lens1, code_lens1, 8

        ;; Bit align dqwords
        vpaddq  code_lens1, code_lens1, code_lens2
        vpand   ybits_count, code_lens1, yoffset_mask   ; Extra bits
        vpermq  ybits_count, ybits_count, 0xcf
        vpaddq  code_lens2, code_lens2, ybits_count
        vpsllvq codes2, codes2, ybits_count

        ;; Merge two qwords into dqwords
        vmovdqa ytmp, [q_64]
        vpsubq  code_lens3, ytmp, code_lens2
        vpsrlvq codes3, codes1, code_lens3
        vpslldq codes3, codes3, 8

        vpsllvq codes1, codes1, code_lens2

        vpxor   codes1, codes1, codes3
        vpxor   codes1, codes1, codes2

        vmovq   tmp, code_lens1 %+ x    ; Number of bits
        shr     tmp, 3                  ; Number of bytes

        ;; Extract last bytes
        vpaddq  code_lens2, code_lens1, ybits_count
        vpsrlq  code_lens2, code_lens2, 3
        vpshufb codes2, codes1, code_lens2
        vpand   codes2, codes2, [bytes_mask]
        vextracti128    ybits %+ x, codes2, 1

        ;; Check for short codes
        vptest  code_lens2, [min_write_mask]
        jz      .short_codes
.short_codes_next:

        vpermq  codes2, codes2, 0x45
        vpor    codes1, codes1, codes2

        ;; bit shift upper dqword combined bits to line up with lower dqword
        vextracti128    code_lens2 %+ x, code_lens1, 1

        ; Write out lower dqword of combined bits
        vmovdqu [out_buf], codes1
        vpaddq  code_lens1, code_lens1, code_lens2

        vmovq   tmp2, code_lens1 %+ x   ; Number of bits
        shr     tmp2, 3                 ; Number of bytes
        vpand   ybits_count, code_lens1, yoffset_mask   ; Extra bits

        ; Write out upper dqword of combined bits
        vextracti128    [out_buf + tmp], codes1, 1
        add     out_buf, tmp2

        cmp     ptr, in_buf_end
        jbe     .main_loop

.main_loop_exit:
        vmovq   rcx, ybits_count %+ x
        vmovq   bits, ybits %+ x
        jmp     .finish

.short_codes:
        ;; Merge last bytes when the second dqword contains less than a byte
        vpor    ybits %+ x, ybits %+ x, codes2 %+ x
        jmp     .short_codes_next
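
;; The scalar emit blocks below flush the bit buffer with one idiom:
;; store all 8 buffered bytes, advance out_buf by the number of whole
;; bytes held, then keep only the 0-7 leftover bits. Roughly, in C:
;;	*(uint64_t *)out_buf = bits;
;;	out_buf += bit_count >> 3;
;;	bits   >>= bit_count & ~7;
;;	bit_count &= 7;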
.long_codes:
        add     end_ptr, VECTOR_SLOP
        sub     ptr, VECTOR_SIZE

        vpxor   ytmp, ytmp, ytmp
        vpblendd        codes3, ytmp, codes1, 0x55
        vpblendd        code_lens3, ytmp, code_lens1, 0x55
        vpblendd        codes4, ytmp, codes2, 0x55

        vpsllvq codes4, codes4, code_lens3
        vpxor   codes3, codes3, codes4
        vpaddd  code_lens3, code_lens1, code_lens2

        vpsrlq  codes1, codes1, 32
        vpsrlq  code_lens1, code_lens1, 32
        vpsrlq  codes2, codes2, 32

        vpsllvq codes2, codes2, code_lens1
        vpxor   codes1, codes1, codes2

        vpsrlq  code_lens1, code_lens3, 32
        vpblendd        code_lens3, ytmp, code_lens3, 0x55

        ;; Merge bitbuf bits
        vpsllvq codes3, codes3, ybits_count
        vpxor   codes3, codes3, ybits
        vpaddq  code_lens3, code_lens3, ybits_count
        vpaddq  code_lens1, code_lens1, code_lens3

        xor     bits, bits
        xor     rcx, rcx
        vpsubq  code_lens1, code_lens1, code_lens3
%rep 2
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        cmp     out_buf, end_ptr
        ja      .overflow
        ;; insert LL code
        vmovq   sym, codes3 %+ x
        vmovq   tmp2, code_lens3 %+ x
        SHLX    sym, sym, rcx
        or      bits, sym
        add     rcx, tmp2

        ; empty bits
        mov     [out_buf], bits
        mov     tmp, rcx
        shr     tmp, 3  ; byte count
        add     out_buf, tmp
        mov     tmp, rcx
        and     rcx, ~7
        SHRX    bits, bits, rcx
        mov     rcx, tmp
        and     rcx, 7
        add     ptr, 4

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        cmp     out_buf, end_ptr
        ja      .overflow
        ;; insert LL code
        vmovq   sym, codes1 %+ x
        vmovq   tmp2, code_lens1 %+ x
        SHLX    sym, sym, rcx
        or      bits, sym
        add     rcx, tmp2

        ; empty bits
        mov     [out_buf], bits
        mov     tmp, rcx
        shr     tmp, 3  ; byte count
        add     out_buf, tmp
        mov     tmp, rcx
        and     rcx, ~7
        SHRX    bits, bits, rcx
        mov     rcx, tmp
        and     rcx, 7
        add     ptr, 4

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        cmp     out_buf, end_ptr
        ja      .overflow
        ;; insert LL code
        vpextrq sym, codes3 %+ x, 1
        vpextrq tmp2, code_lens3 %+ x, 1
        SHLX    sym, sym, rcx
        or      bits, sym
        add     rcx, tmp2

        ; empty bits
        mov     [out_buf], bits
        mov     tmp, rcx
        shr     tmp, 3  ; byte count
        add     out_buf, tmp
        mov     tmp, rcx
        and     rcx, ~7
        SHRX    bits, bits, rcx
        mov     rcx, tmp
        and     rcx, 7
        add     ptr, 4

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        cmp     out_buf, end_ptr
        ja      .overflow
        ;; insert LL code
        vpextrq sym, codes1 %+ x, 1
        vpextrq tmp2, code_lens1 %+ x, 1
        SHLX    sym, sym, rcx
        or      bits, sym
        add     rcx, tmp2

        ; empty bits
        mov     [out_buf], bits
        mov     tmp, rcx
        shr     tmp, 3  ; byte count
        add     out_buf, tmp
        mov     tmp, rcx
        and     rcx, ~7
        SHRX    bits, bits, rcx
        mov     rcx, tmp
        and     rcx, 7
        add     ptr, 4

        vextracti128    codes3 %+ x, codes3, 1
        vextracti128    code_lens3 %+ x, code_lens3, 1
        vextracti128    codes1 %+ x, codes1, 1
        vextracti128    code_lens1 %+ x, code_lens1, 1
%endrep
        sub     end_ptr, VECTOR_SLOP

        vmovq   ybits %+ x, bits
        vmovq   ybits_count %+ x, rcx
        cmp     ptr, in_buf_end
        jbe     .main_loop

.finish:
        add     in_buf_end, VECTOR_LOOP_PROCESSED
        add     end_ptr, VECTOR_SLOP

        cmp     ptr, in_buf_end
        jge     .overflow
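
;; Tail loop: encode the remaining tokens one at a time. A rough sketch
;; of one iteration (field layout per the format comments at the top of
;; this file; emit(code, len) stands for the flush idiom above):
;;	tok = *ptr;  ptr++;
;;	ll  = lit_len_table[tok & LIT_MASK];  // 31:24 len, 23:0 code
;;	d   = dist_table[(tok >> DIST_OFFSET) & DIST_MASK];
;;	emit(ll & 0xFFFFFF, ll >> 24);        // lit/len code
;;	emit(d & 0xFFFF, d >> 24);            // dist code
;;	emit(tok >> EXTRA_BITS_OFFSET, (d >> 16) & 0xFF); // extra bits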
.finish_loop:
        mov     DWORD(data), [ptr]

        cmp     out_buf, end_ptr
        ja      .overflow

        mov     sym, data
        and     sym, LIT_MASK   ; sym has ll_code
        mov     DWORD(sym), [hufftables + _lit_len_table + sym * 4]

        ; look up dist sym
        mov     dsym, data
        shr     dsym, DIST_OFFSET
        and     dsym, DIST_MASK
        mov     DWORD(dsym), [hufftables + _dist_table + dsym * 4]

        ; insert LL code
        ; sym: 31:24 length; 23:0 code
        mov     tmp2, sym
        and     sym, 0xFFFFFF
        SHLX    sym, sym, rcx
        shr     tmp2, 24
        or      bits, sym
        add     rcx, tmp2

        ; insert dist code
        movzx   tmp, WORD(dsym)
        SHLX    tmp, tmp, rcx
        or      bits, tmp
        mov     tmp, dsym
        shr     tmp, 24
        add     rcx, tmp

        ; insert dist extra bits
        shr     data, EXTRA_BITS_OFFSET
        add     ptr, 4
        SHLX    data, data, rcx
        or      bits, data
        shr     dsym, 16
        and     dsym, 0xFF
        add     rcx, dsym

        ; empty bits
        mov     [out_buf], bits
        mov     tmp, rcx
        shr     tmp, 3  ; byte count
        add     out_buf, tmp
        mov     tmp, rcx
        and     rcx, ~7
        SHRX    bits, bits, rcx
        mov     rcx, tmp
        and     rcx, 7

        cmp     ptr, in_buf_end
        jb      .finish_loop

.overflow:
        mov     tmp, [rsp + bitbuf_mem_offset]
        mov     [tmp + _m_bits], bits
        mov     [tmp + _m_bit_count], ecx
        mov     [tmp + _m_out_buf], out_buf

        mov     rax, ptr

        FUNC_RESTORE

        ret

section .data
        align 32
max_write_d:
        dd      0x1c, 0x1d, 0x1f, 0x20, 0x1c, 0x1d, 0x1f, 0x20
min_write_mask:
        dq      0x00, 0x00, 0xff, 0x00
offset_mask:
        dq      0x0000000000000007, 0x0000000000000000
        dq      0x0000000000000000, 0x0000000000000000
q_64:
        dq      0x0000000000000040, 0x0000000000000000
        dq      0x0000000000000040, 0x0000000000000000
lit_mask:
        dd      LIT_MASK, LIT_MASK, LIT_MASK, LIT_MASK
        dd      LIT_MASK, LIT_MASK, LIT_MASK, LIT_MASK
dist_mask:
        dd      DIST_MASK, DIST_MASK, DIST_MASK, DIST_MASK
        dd      DIST_MASK, DIST_MASK, DIST_MASK, DIST_MASK
lit_icr_mask:
        dd      0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF
        dd      0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF
eb_icr_mask:
        dd      0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF
        dd      0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF
bytes_mask:
        dq      0x00000000000000ff, 0x0000000000000000
        dq      0x00000000000000ff, 0x0000000000000000