%include "reg_sizes.asm"
%include "lz0a_const.asm"
%include "data_struct2.asm"
%include "igzip_compare_types.asm"
%define NEQ 4				; vpcmpb predicate immediate: "not equal"

%ifdef HAVE_AS_KNOWS_AVX512
;; ---------------------------------------------------------------------------
;; set_long_icf_fg_06(arg1 = next_in, arg2 = end_in, arg3 = match_lookup)
;;
;; Syntax: NASM/YASM (Intel operand order).  Target: x86-64 with AVX-512.
;; ABI:    Microsoft x64 when __OUTPUT_FORMAT__ is win64, System V otherwise.
;;
;; Walks the 32-bit entries at match_lookup VECT_SIZE at a time, in step with
;; the input bytes at next_in.  For every entry whose length field is above
;; 0x105 the match is extended by comparing more input bytes, and the entry's
;; length field is rewritten.  When an entry extends by a further 8 bytes, up
;; to 256 more bytes are compared and the following entries are raised to the
;; correspondingly decreasing lengths.
;;
;; NOTE(review): the layout of a match_lookup entry (LIT_LEN_MASK, DIST_OFFSET,
;; EXTRA_BITS_OFFSET, ICF_CODE_BYTES) and the constant LA come from the
;; included headers and are not visible here - confirm against them.
;;
;; Clobbers: rax, r9, r10, r11, len (rdi on win64, r8 otherwise), the argument
;;           registers, k3, k4, k6, k7, zmm0-zmm13, zmm17-zmm31, flags.
;;           On win64 xmm6-xmm13, rsi and rdi are saved and restored.
;; ---------------------------------------------------------------------------
%ifidn __OUTPUT_FORMAT__, win64
%define arg1 rcx
%define arg2 rdx
%define arg3 r8
%define dist_code rsi
%define len rdi
%else
%define arg1 rdi
%define arg2 rsi
%define arg3 rdx
%define dist_code rcx
%define len r8
%endif

;; General purpose register roles
%define next_in arg1
%define end_in arg2
%define match_lookup arg3
%define match_in rax
%define dist r9
%define match_offset r10
%define tmp1 r11

;; Vector working registers
%define zmatch_lookup zmm0
%define zmatch_lookup2 zmm1
%define zlens zmm2
%define zdist_codes zmm3
%define zdist_extras zmm4
%define zdists zmm5
%define zdists2 zmm6
%define zlens1 zmm7
%define zlens2 zmm8
%define zlookup zmm9
%define zlookup2 zmm10
%define datas zmm11
%define ztmp1 zmm12
%define ztmp2 zmm13

;; Vector constants, loaded once at function entry
%define zvect_size zmm17
%define ztwofiftyfour zmm18
%define ztwofiftysix zmm19
%define ztwosixtytwo zmm20
%define znlen_mask zmm21
%define zbswap zmm22
%define zqword_shuf zmm23
%define zdatas_perm3 zmm24
%define zdatas_perm2 zmm25
%define zincrement zmm26
%define zdists_mask zmm27
%define zdists_start zmm28
%define zlong_lens2 zmm29
%define zlong_lens zmm30
%define zlens_mask zmm31

%ifidn __OUTPUT_FORMAT__, win64
;; Frame layout: 8 xmm slots (xmm6-xmm13), 2 qword slots (rsi, rdi) and 8 bytes
;; of padding.  With rsp % 16 == 8 at entry, the padding leaves rsp 16-byte
;; aligned after the allocation, which the vmovdqa spills below require.
%define stack_size 8*16 + 2 * 8 + 8
%define func(x) proc_frame x

;; Spill the win64 callee-saved registers this function overwrites.
%macro FUNC_SAVE 0
	alloc_stack	stack_size
	vmovdqa	[rsp + 0*16], xmm6
	vmovdqa	[rsp + 1*16], xmm7
	vmovdqa	[rsp + 2*16], xmm8
	vmovdqa	[rsp + 3*16], xmm9
	vmovdqa	[rsp + 4*16], xmm10
	vmovdqa	[rsp + 5*16], xmm11
	vmovdqa	[rsp + 6*16], xmm12
	vmovdqa	[rsp + 7*16], xmm13
	save_reg	rsi, 8*16 + 0*8
	save_reg	rdi, 8*16 + 1*8
	end_prolog
%endm

;; Reload everything FUNC_SAVE spilled and release the frame.
%macro FUNC_RESTORE 0
	vmovdqa	xmm6, [rsp + 0*16]
	vmovdqa	xmm7, [rsp + 1*16]
	vmovdqa	xmm8, [rsp + 2*16]
	vmovdqa	xmm9, [rsp + 3*16]
	vmovdqa	xmm10, [rsp + 4*16]
	vmovdqa	xmm11, [rsp + 5*16]
	vmovdqa	xmm12, [rsp + 6*16]
	vmovdqa	xmm13, [rsp + 7*16]

	;; rsi/rdi are callee-saved on win64: load the saved values back.
	;; (Storing the registers to the slots here would hand the caller a
	;; clobbered rdi, since len lives in rdi.)
	mov	rsi, [rsp + 8*16 + 0*8]
	mov	rdi, [rsp + 8*16 + 1*8]
	add	rsp, stack_size
%endm
%else
;; System V: only caller-saved registers are used, nothing to preserve.
%define func(x) x:
%macro FUNC_SAVE 0
%endm

%macro FUNC_RESTORE 0
%endm
%endif
%define VECT_SIZE 16			; match_lookup entries handled per iteration

;; Reference the constant tables RIP-relative so the object can be linked into
;; position-independent code.
default rel

global set_long_icf_fg_06
func(set_long_icf_fg_06)
	FUNC_SAVE

	;; Pull the end pointer back by LA + 15 bytes; the loop exits once
	;; next_in reaches it.
	sub	end_in, LA + 15

	;; Load the loop-invariant vector constants.
	vmovdqu32 zlong_lens, [long_len]
	vmovdqu32 zlong_lens2, [long_len2]
	vmovdqu32 zlens_mask, [len_mask]
	vmovdqu16 zdists_start, [dist_start]
	vmovdqu32 zdists_mask, [dists_mask]
	vmovdqu32 zincrement, [increment]
	vmovdqu64 zdatas_perm2, [datas_perm2]
	vmovdqu64 zdatas_perm3, [datas_perm3]
	vmovdqu64 zqword_shuf, [qword_shuf]
	vmovdqu64 zbswap, [bswap_shuf]
	vmovdqu64 znlen_mask, [nlen_mask]
	vmovdqu64 zvect_size, [vect_size]
	vmovdqu64 ztwofiftyfour, [twofiftyfour]
	vmovdqu64 ztwofiftysix, [twofiftysix]
	vmovdqu64 ztwosixtytwo, [twosixtytwo]
	vmovdqu32 zmatch_lookup, [match_lookup]

;; Main loop: zmatch_lookup2 holds the VECT_SIZE entries for the current
;; position, zmatch_lookup is preloaded with the following VECT_SIZE entries.
fill_loop:
	vmovdqu32 zmatch_lookup2, zmatch_lookup
	vmovdqu32 zmatch_lookup, [match_lookup + ICF_CODE_BYTES * VECT_SIZE]

	cmp	next_in, end_in
	jae	end_fill

	;; k3 = lanes whose length field is greater than 0x105
	vpandd	zlens, zmatch_lookup2, zlens_mask
	vpcmpgtd k3, zlens, zlong_lens

	;; Speculatively increment
	add	next_in, VECT_SIZE
	add	match_lookup, ICF_CODE_BYTES * VECT_SIZE

	ktestw	k3, k3
	jz	fill_loop		; no candidate in this vector

	;; Translate each distance code to its base distance through the
	;; dist_start word table.
	vpsrld	zdist_codes, zmatch_lookup2, DIST_OFFSET
	vpmovdw	zdists %+ y, zdist_codes ; Relies on perm working mod 32
	vpermw	zdists, zdists, zdists_start
	vpmovzxwd zdists, zdists %+ y

	;; zdists = (lane index - extra bits) - base distance
	vpsrld	zdist_extras, zmatch_lookup2, EXTRA_BITS_OFFSET
	vpsubd	zdist_extras, zincrement, zdist_extras

	vpsubd	zdists, zdist_extras, zdists

	;; Gather 8 bytes of match data per selected lane.  next_in has already
	;; advanced by VECT_SIZE, so the address for lane i is
	;; (old next_in) + i + 8 - distance.  k6 covers lanes 0-7, k7 lanes 8-15.
	vextracti32x8 zdists2 %+ y, zdists, 1
	kmovb	k6, k3
	kshiftrw k7, k3, 8
	vpgatherdq zlens1 {k6}, [next_in + zdists %+ y - 8]
	vpgatherdq zlens2 {k7}, [next_in + zdists2 %+ y - 8]

	;; Build the matching 8-byte windows of input data: qword i of zlookup
	;; starts at [next_in - 8 + i], qword i of zlookup2 at [next_in + i].
	vmovdqu8 datas %+ y, [next_in - 8]
	vpermq	zlookup, zdatas_perm2, datas
	vpshufb	zlookup, zlookup, zqword_shuf
	vpermq	zlookup2, zdatas_perm3, datas
	vpshufb	zlookup2, zlookup2, zqword_shuf

	;; XOR input against match data; equal bytes become zero.
	vpxorq	zlens1, zlens1, zlookup
	vpxorq	zlens2, zlens2, zlookup2

	;; Byte-reverse every qword so that the first byte in memory is the most
	;; significant; leading zero bits / 8 is then the count of equal bytes.
	vpshufb	zlens1, zlens1, zbswap
	vpshufb	zlens2, zlens2, zbswap
	vplzcntq zlens1, zlens1
	vplzcntq zlens2, zlens2
	vpmovqd	zlens1 %+ y, zlens1
	vpmovqd	zlens2 %+ y, zlens2
	vinserti32x8 zlens1, zlens2 %+ y, 1
	vpsrld	zlens1 {k3}{z}, zlens1, 3	; 0..8 equal bytes, zero in unselected lanes

	;; Rebuild the selected entries: clear the low 10 bits, then add
	;; 0x106 + number of additional equal bytes.
	vpandd	zmatch_lookup2 {k3}{z}, zmatch_lookup2, znlen_mask
	vpaddd	zmatch_lookup2 {k3}{z}, zmatch_lookup2, ztwosixtytwo
	vpaddd	zmatch_lookup2 {k3}{z}, zmatch_lookup2, zlens1

	vmovdqu32 [match_lookup - ICF_CODE_BYTES * VECT_SIZE] {k3}, zmatch_lookup2

	;; k3 = lanes where all 8 additional bytes matched
	vpcmpgtd k3, zlens1, zlong_lens2
	ktestw	k3, k3
	jz	fill_loop

	;; zdists = lane index - (lane index - distance) = distance
	vpsubd	zdists, zincrement, zdists

	;; Pull the first such lane's distance and entry into element 0 and
	;; get that lane's index.
	vpcompressd zdists2 {k3}, zdists
	vpcompressd zmatch_lookup2 {k3}, zmatch_lookup2
	kmovq	match_offset, k3
	tzcnt	match_offset, match_offset

	;; Rewind both cursors to that lane and locate the match source.
	vmovd	dist %+ d, zdists2 %+ x
	lea	next_in, [next_in + match_offset - VECT_SIZE]
	lea	match_lookup, [match_lookup + ICF_CODE_BYTES * (match_offset - VECT_SIZE)]
	mov	match_in, next_in
	sub	match_in, dist

	;; Compare up to four 64-byte chunks starting at offset 2, so the final
	;; len is at most 2 + 4*64 = 258.
	mov	len, 2
%rep 3
	vmovdqu8 ztmp1, [next_in + len]
	vmovdqu8 ztmp2, [match_in + len]
	vpcmpb	k3, ztmp1, ztmp2, NEQ
	ktestq	k3, k3
	jnz	miscompare

	add	len, 64
%endrep

	vmovdqu8 ztmp1, [next_in + len]
	vmovdqu8 ztmp2, [match_in + len]
	vpcmpb	k3, ztmp1, ztmp2, NEQ

miscompare:
	;; tmp1 = index of the first differing byte in the last chunk compared
	;; (64 when the whole chunk matched, since tzcnt of 0 is the operand size)
	kmovq	tmp1, k3
	tzcnt	tmp1, tmp1
	add	len, tmp1

	;; Skip both cursors past the match and preload the entries found there
	;; for the next fill_loop iteration.
	add	next_in, len
	lea	match_lookup, [match_lookup + ICF_CODE_BYTES * len]
	vmovdqu32 zmatch_lookup, [match_lookup]

	;; Broadcast the selected entry with its low 10 bits cleared.
	vpbroadcastd zmatch_lookup2, zmatch_lookup2 %+ x
	vpandd	zmatch_lookup2, zmatch_lookup2, znlen_mask

	;; zlens1[i] = len - i + 0xfe: the value written to the i-th entry of
	;; the match.  len is negated so it indexes back from match_lookup to
	;; the first entry of the match.
	vpbroadcastd zlens1, len %+ d
	vpsubd	zlens1, zlens1, zincrement
	vpaddd	zlens1, zlens1, ztwofiftyfour
	neg	len

;; Overwrite the entries covered by the match, VECT_SIZE at a time, wherever
;; the new value exceeds both the stored length field and 0x100.
update_match_lookup:
	vpandd	zlens2, zlens_mask, [match_lookup + ICF_CODE_BYTES * len]
	vpcmpgtd k3, zlens1, zlens2
	vpcmpgtd k4, zlens1, ztwofiftysix
	kandw	k3, k3, k4

	vpaddd	zlens2 {k3}{z}, zlens1, zmatch_lookup2

	vmovdqu32 [match_lookup + ICF_CODE_BYTES * len] {k3}, zlens2

	;; Stop as soon as any lane was left untouched.
	knotw	k3, k3
	ktestw	k3, k3
	jnz	fill_loop

	;; All VECT_SIZE lanes were updated: move to the next group of entries.
	add	len, VECT_SIZE
	vpsubd	zlens1, zlens1, zvect_size

	jmp	update_match_lookup

end_fill:
	FUNC_RESTORE
	ret

section .data
align 64
;; Word table indexed by distance code (30 entries plus two zero pads).
dist_start:
	dw 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0007, 0x0009, 0x000d
	dw 0x0011, 0x0019, 0x0021, 0x0031, 0x0041, 0x0061, 0x0081, 0x00c1
	dw 0x0101, 0x0181, 0x0201, 0x0301, 0x0401, 0x0601, 0x0801, 0x0c01
	dw 0x1001, 0x1801, 0x2001, 0x3001, 0x4001, 0x6001, 0x0000, 0x0000
;; Per-lane mask selecting the length field of an entry.
len_mask:
	dd LIT_LEN_MASK, LIT_LEN_MASK, LIT_LEN_MASK, LIT_LEN_MASK
	dd LIT_LEN_MASK, LIT_LEN_MASK, LIT_LEN_MASK, LIT_LEN_MASK
	dd LIT_LEN_MASK, LIT_LEN_MASK, LIT_LEN_MASK, LIT_LEN_MASK
	dd LIT_LEN_MASK, LIT_LEN_MASK, LIT_LEN_MASK, LIT_LEN_MASK
;; Loaded into zdists_mask at entry; not referenced by the code in this file.
dists_mask:
	dd LIT_DIST_MASK, LIT_DIST_MASK, LIT_DIST_MASK, LIT_DIST_MASK
	dd LIT_DIST_MASK, LIT_DIST_MASK, LIT_DIST_MASK, LIT_DIST_MASK
	dd LIT_DIST_MASK, LIT_DIST_MASK, LIT_DIST_MASK, LIT_DIST_MASK
	dd LIT_DIST_MASK, LIT_DIST_MASK, LIT_DIST_MASK, LIT_DIST_MASK
;; Length-field threshold above which an entry is examined.
long_len:
	dd 0x105, 0x105, 0x105, 0x105, 0x105, 0x105, 0x105, 0x105
	dd 0x105, 0x105, 0x105, 0x105, 0x105, 0x105, 0x105, 0x105
;; Threshold on additional equal bytes: above 7 means all 8 matched.
long_len2:
	dd 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7
	dd 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7

;; Lane indices 0..15.
increment:
	dd 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
	dd 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf
;; vpermq selectors: replicate qwords {0,1} / {1,2} into every 128-bit lane.
datas_perm2:
	dq 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1
datas_perm3:
	dq 0x1, 0x2, 0x1, 0x2, 0x1, 0x2, 0x1, 0x2
;; vpshufb control reversing the bytes of every qword.
bswap_shuf:
	db 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00
	db 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08
	db 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00
	db 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08
	db 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00
	db 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08
	db 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00
	db 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08
;; vpshufb control producing 8-byte windows that start one byte apart.
qword_shuf:
	db 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
	db 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8
	db 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9
	db 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa
	db 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb
	db 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc
	db 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd
	db 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe
	db 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf
;; VECT_SIZE in every lane.
vect_size:
	dd VECT_SIZE, VECT_SIZE, VECT_SIZE, VECT_SIZE
	dd VECT_SIZE, VECT_SIZE, VECT_SIZE, VECT_SIZE
	dd VECT_SIZE, VECT_SIZE, VECT_SIZE, VECT_SIZE
	dd VECT_SIZE, VECT_SIZE, VECT_SIZE, VECT_SIZE
;; Offsets added to byte counts when forming length-field values.
twofiftyfour:
	dd 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe
	dd 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe
twofiftysix:
	dd 0x100, 0x100, 0x100, 0x100, 0x100, 0x100, 0x100, 0x100
	dd 0x100, 0x100, 0x100, 0x100, 0x100, 0x100, 0x100, 0x100
twosixtytwo:
	dd 0x106, 0x106, 0x106, 0x106, 0x106, 0x106, 0x106, 0x106
	dd 0x106, 0x106, 0x106, 0x106, 0x106, 0x106, 0x106, 0x106
;; Clears the low 10 bits of an entry.
nlen_mask:
	dd 0xfffffc00, 0xfffffc00, 0xfffffc00, 0xfffffc00
	dd 0xfffffc00, 0xfffffc00, 0xfffffc00, 0xfffffc00
	dd 0xfffffc00, 0xfffffc00, 0xfffffc00, 0xfffffc00
	dd 0xfffffc00, 0xfffffc00, 0xfffffc00, 0xfffffc00
%endif