xref: /isa-l/igzip/encode_df_06.asm (revision cd888f01a447dd04c3a8b50362079648d432d2ca)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2018 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%include "reg_sizes.asm"
%include "lz0a_const.asm"
%include "data_struct2.asm"
%include "stdmac.asm"

;; AVX-512 variant — only built when the assembler understands AVX-512.
%ifdef HAVE_AS_KNOWS_AVX512

%define ARCH 06
%define USE_HSWNI

; tree entry is 4 bytes:
; lit/len tree (513 entries)
; |  3  |  2   |  1 | 0 |
; | len |       code    |
;
; dist tree
; |  3  |  2   |  1 | 0 |
; |eblen:codelen|  code |

; token format:
; DIST_OFFSET:0 : lit/len
; 31:(DIST_OFFSET + 5) : dist Extra Bits
; (DIST_OFFSET + 5):DIST_OFFSET : dist code
; lit/len: 0-256 (literal)
;          257-512 (dist + 254)

; returns final token pointer
; equal to token_end if successful
;    uint32_t* encode_df(uint32_t *token_start, uint32_t *token_end,
;                            BitBuf *out_buf, uint32_t *trees);

;; Register roles per ABI.  Scratch registers are chosen so that `ptr`
;; can alias arg1 on Linux (no copy needed) but not on win64.
%ifidn __OUTPUT_FORMAT__, win64
%define arg1 rcx
%define arg2 rdx
%define arg3 r8
%define arg4 r9
%define sym		rsi
%define dsym		rdi
%define hufftables	r9
%define ptr		r11
%else
; Linux
%define arg1 rdi
%define arg2 rsi
%define arg3 rdx
%define arg4 rcx
%define sym		r9
%define dsym		r8
%define hufftables	r11
%define ptr		rdi
%endif

%define in_buf_end	arg2
%define bitbuf		arg3
%define out_buf		bitbuf		; out_buf reuses bitbuf's register after bitbuf is spilled
; bit_count is rcx
%define bits		rax
%define data		r12
%define tmp		rbx
%define len 		dsym
%define tmp2 		r10
%define end_ptr		rbp

%define LIT_MASK	((0x1 << LIT_LEN_BIT_COUNT) - 1)
%define DIST_MASK	((0x1 << DIST_LIT_BIT_COUNT) - 1)

;; zmm register roles.  Several registers are deliberately aliased
;; (codes3/ztmp, codes4/syms, code_lens4/dsyms/zbits_count_q) — the
;; aliased uses never overlap in time.
%define codes1		zmm1
%define code_lens1	zmm2
%define codes2		zmm3
%define code_lens2	zmm4
%define codes3		zmm5
%define ztmp		zmm5
%define	code_lens3	zmm6
%define codes4		zmm7
%define syms		zmm7

%define code_lens4	zmm8
%define dsyms		zmm8
%define zbits_count_q	zmm8

%define codes_lookup1	zmm9
%define	codes_lookup2	zmm10
%define datas		zmm11
%define zbits		zmm12
%define zbits_count	zmm13
%define zoffset_mask	zmm14
%define znotoffset_mask	zmm23

%define zq_64		zmm15
%define zlit_mask	zmm16
%define zdist_mask	zmm17
%define zlit_icr_mask	zmm18
%define zeb_icr_mask	zmm19
%define zmax_write	zmm20
%define zrot_perm	zmm21
%define zq_8		zmm22

%define VECTOR_SIZE 0x40
%define VECTOR_LOOP_PROCESSED (2 * VECTOR_SIZE)
%define VECTOR_SLOP 0x40 - 8

;; Stack frame layout: saved GPRs, then saved XMMs (win64 only), then a
;; spill slot for the bitbuf pointer.
gpr_save_mem_offset	equ	0
gpr_save_mem_size	equ	8 * 6
xmm_save_mem_offset	equ	gpr_save_mem_offset + gpr_save_mem_size
xmm_save_mem_size	equ	10 * 16		; xmm6-xmm15, 16 bytes each
bitbuf_mem_offset	equ	xmm_save_mem_offset + xmm_save_mem_size
bitbuf_mem_size		equ	8
stack_size		equ	gpr_save_mem_size + xmm_save_mem_size + bitbuf_mem_size

;; Allocate the stack frame and save every callee-saved register this
;; function clobbers: rbx, rbp, r12 always; rsi, rdi and xmm6-xmm15 on
;; win64 (where they are callee-saved per the Microsoft x64 ABI).
;;
;; BUGFIX: the XMM save/restore slots must use a 16-byte stride.  The
;; original code used `i*8`, so each 16-byte MOVDQU store overlapped the
;; low half of the next slot, corrupting the upper 64 bits of the
;; restored xmm6-xmm14.  The frame already reserves 10*16 bytes
;; (xmm_save_mem_size), so `i*16` fits exactly.
%macro FUNC_SAVE 0
	sub	rsp, stack_size
	mov	[rsp + gpr_save_mem_offset + 0*8], rbx
	mov	[rsp + gpr_save_mem_offset + 1*8], rbp
	mov	[rsp + gpr_save_mem_offset + 2*8], r12

%ifidn __OUTPUT_FORMAT__, win64
	mov	[rsp + gpr_save_mem_offset + 3*8], rsi
	mov	[rsp + gpr_save_mem_offset + 4*8], rdi

	MOVDQU	[rsp + xmm_save_mem_offset + 0*16], xmm6
	MOVDQU	[rsp + xmm_save_mem_offset + 1*16], xmm7
	MOVDQU	[rsp + xmm_save_mem_offset + 2*16], xmm8
	MOVDQU	[rsp + xmm_save_mem_offset + 3*16], xmm9
	MOVDQU	[rsp + xmm_save_mem_offset + 4*16], xmm10
	MOVDQU	[rsp + xmm_save_mem_offset + 5*16], xmm11
	MOVDQU	[rsp + xmm_save_mem_offset + 6*16], xmm12
	MOVDQU	[rsp + xmm_save_mem_offset + 7*16], xmm13
	MOVDQU	[rsp + xmm_save_mem_offset + 8*16], xmm14
	MOVDQU	[rsp + xmm_save_mem_offset + 9*16], xmm15
%endif

%endm

;; Restore everything FUNC_SAVE saved and release the stack frame.
;; Offsets must mirror FUNC_SAVE exactly (16-byte XMM stride).
%macro FUNC_RESTORE 0
	mov	rbx, [rsp + gpr_save_mem_offset + 0*8]
	mov	rbp, [rsp + gpr_save_mem_offset + 1*8]
	mov	r12, [rsp + gpr_save_mem_offset + 2*8]

%ifidn __OUTPUT_FORMAT__, win64
	mov	rsi, [rsp + gpr_save_mem_offset + 3*8]
	mov	rdi, [rsp + gpr_save_mem_offset + 4*8]

	MOVDQU	xmm6, [rsp + xmm_save_mem_offset + 0*16]
	MOVDQU	xmm7, [rsp + xmm_save_mem_offset + 1*16]
	MOVDQU	xmm8, [rsp + xmm_save_mem_offset + 2*16]
	MOVDQU	xmm9, [rsp + xmm_save_mem_offset + 3*16]
	MOVDQU	xmm10, [rsp + xmm_save_mem_offset + 4*16]
	MOVDQU	xmm11, [rsp + xmm_save_mem_offset + 5*16]
	MOVDQU	xmm12, [rsp + xmm_save_mem_offset + 6*16]
	MOVDQU	xmm13, [rsp + xmm_save_mem_offset + 7*16]
	MOVDQU	xmm14, [rsp + xmm_save_mem_offset + 8*16]
	MOVDQU	xmm15, [rsp + xmm_save_mem_offset + 9*16]
%endif
	add	rsp, stack_size

%endmacro

default rel
section .text

;-----------------------------------------------------------------------
; uint32_t *encode_deflate_icf_06(uint32_t *next_in, uint32_t *end_in,
;                                 BitBuf2 *bb, void *hufftables)
; Huffman-encodes the ICF tokens in [next_in, end_in) into the bit
; buffer bb using the lit/len and dist tables in hufftables.
; Out:   rax = final token pointer; equal to end_in when all tokens
;        were consumed (otherwise the output buffer filled up first).
; The bulk path packs 16 tokens per iteration with AVX-512; tokens whose
; combined code length exceeds max_write_d bits fall back to .long_codes,
; and the tail (< 2 vectors remaining) is handled scalar in .finish_loop.
;-----------------------------------------------------------------------
global encode_deflate_icf_ %+ ARCH
encode_deflate_icf_ %+ ARCH:
	endbranch
	FUNC_SAVE

%ifnidn ptr, arg1
	mov	ptr, arg1
%endif
%ifnidn hufftables, arg4
	mov	hufftables, arg4
%endif

	;; Spill the BitBuf pointer; its register is reused as out_buf below.
	mov	[rsp + bitbuf_mem_offset], bitbuf
	mov	bits, [bitbuf + _m_bits]
	mov	ecx, [bitbuf + _m_bit_count]
	mov	end_ptr, [bitbuf + _m_out_end]
	mov	out_buf, [bitbuf + _m_out_buf]	; clobbers bitbuf

	;; Keep slack so a full 64-byte scatter never passes m_out_end, and
	;; reserve two vectors of input for the software-pipelined lookahead.
	sub	end_ptr, VECTOR_SLOP
	sub	in_buf_end, VECTOR_LOOP_PROCESSED
	cmp	ptr, in_buf_end
	jge	.finish

	kxorq	k0, k0, k0
	kmovq	k1, [k_mask_1]
	kmovq	k2, [k_mask_2]
	kmovq	k3, [k_mask_3]
	kmovq	k4, [k_mask_4]
	kmovq	k5, [k_mask_5]

	vmovdqa64 zrot_perm, [rot_perm]

	vbroadcasti64x2 zq_64, [q_64]
	vbroadcasti64x2 zq_8, [q_8]

	vpbroadcastq zoffset_mask, [offset_mask]
	;; znotoffset_mask = ~zoffset_mask (ternary logic imm 0x55 = NOT src3)
	vpternlogd znotoffset_mask, znotoffset_mask, zoffset_mask, 0x55

	vpbroadcastd zlit_mask, [lit_mask]
	vpbroadcastd zdist_mask, [dist_mask]
	vpbroadcastd zlit_icr_mask, [lit_icr_mask]
	vpbroadcastd zeb_icr_mask, [eb_icr_mask]
	vpbroadcastd zmax_write, [max_write_d]

	;; First iteration's gathers: lit/len codes for the low token field,
	;; dist codes for the dist field.  knot of k0 gives an all-ones mask
	;; (gathers consume their mask, so it is rebuilt each time).
	knotq	k6, k0
	vmovdqu64	datas, [ptr]
	vpandd	syms, datas, zlit_mask
	vpgatherdd codes_lookup1 {k6}, [hufftables + _lit_len_table + 4 * syms]

	knotq	k7, k0
	vpsrld	dsyms, datas, DIST_OFFSET
	vpandd	dsyms, dsyms, zdist_mask
	vpgatherdd codes_lookup2 {k7}, [hufftables + _dist_table + 4 * dsyms]

	;; Seed lane 0 with the bits still pending in the bit buffer.
	vmovq	zbits %+ x, bits
	vmovq	zbits_count %+ x, rcx

.main_loop:
	;; Sets codes1 to contain lit/len codes and code_lens1 the corresponding lengths
	vpsrld	code_lens1, codes_lookup1, 24
	vpandd	codes1, codes_lookup1, zlit_icr_mask

	;; Sets codes2 to contain dist codes, code_lens2 the corresponding lengths,
	;; and code_lens3 the extra bit counts
	vmovdqu16	codes2 {k1}{z}, codes_lookup2 ;Bits 8 and above of zbits are 0
	vpsrld	code_lens2, codes_lookup2, 24
	vpsrld	code_lens3, codes_lookup2, 16
	vpandd	code_lens3, code_lens3, zeb_icr_mask

	;; Set codes3 to contain the extra bits
	vpsrld	codes3, datas, EXTRA_BITS_OFFSET

	cmp	out_buf, end_ptr
	ja	.main_loop_exit

	;; Start code lookups for next iteration
	knotq	k6, k0
	add	ptr, VECTOR_SIZE
	vmovdqu64	datas, [ptr]
	vpandd	syms, datas, zlit_mask
	vpgatherdd codes_lookup1 {k6}, [hufftables + _lit_len_table + 4 * syms]

	knotq	k7, k0
	vpsrld	dsyms, datas, DIST_OFFSET
	vpandd	dsyms, dsyms, zdist_mask
	vpgatherdd codes_lookup2 {k7}, [hufftables + _dist_table + 4 * dsyms]

	;; Merge dist code with extra bits
	vpsllvd	codes3, codes3, code_lens2
	vpxord	codes2, codes2, codes3
	vpaddd	code_lens2, code_lens2, code_lens3

	;; Check for long codes: if any token's total code length exceeds
	;; max_write_d bits it cannot be packed by the fast path.
	vpaddd	code_lens3, code_lens1, code_lens2
	vpcmpgtd	k6, code_lens3, zmax_write
	ktestd	k6, k6
	jnz	.long_codes

	;; Merge dist and len codes
	vpsllvd	codes2, codes2, code_lens1
	vpxord	codes1, codes1, codes2

	;; Split even/odd dword lanes so two symbols can be merged per qword.
	vmovdqa32 codes3 {k1}{z}, codes1
	vpsrlq	codes1, codes1, 32
	vpsrlq	code_lens1, code_lens3, 32
	vmovdqa32	code_lens3 {k1}{z}, code_lens3

	;; Merge bitbuf bits
	vpsllvq codes3, codes3, zbits_count
	vpxord	codes3, codes3, zbits
	vpaddq	code_lens3, code_lens3, zbits_count

	;; Merge two symbols into qwords
	vpsllvq	codes1, codes1, code_lens3
	vpxord codes1, codes1, codes3
	vpaddq code_lens1, code_lens1, code_lens3

	;; Determine total bits at end of each qword (prefix sum across lanes
	;; via rotate-and-add passes)
	vpermq	zbits_count {k5}{z}, zrot_perm, code_lens1
	vpaddq	code_lens2, zbits_count, code_lens1
	vshufi64x2 zbits_count {k3}{z}, code_lens2, code_lens2, 0x90
	vpaddq	code_lens2, code_lens2, zbits_count
	vshufi64x2 zbits_count {k2}{z}, code_lens2, code_lens2, 0x40
	vpaddq	code_lens2, code_lens2, zbits_count

	;; Bit align quadwords
	vpandd	zbits_count, code_lens2, zoffset_mask
	vpermq	zbits_count_q {k5}{z}, zrot_perm, zbits_count
	vpsllvq	codes1, codes1, zbits_count_q

	;; Check whether any of the last bytes overlap
	vpcmpq k6 {k5}, code_lens1, zbits_count, 1

	;; Get last byte in each qword
	vpsrlq	code_lens2, code_lens2, 3
	vpaddq	code_lens1, code_lens1, zbits_count_q
	vpandq	code_lens1, code_lens1, znotoffset_mask
	vpsrlvq	codes3, codes1, code_lens1

	;; Branch to handle overlapping last bytes
	ktestd k6, k6
	jnz .small_codes

.small_codes_next:
	;; Save off zbits and zbits_count for next loop
	knotq	k7, k5
	vpermq	zbits {k7}{z}, zrot_perm, codes3
	vpermq	zbits_count {k7}{z}, zrot_perm, zbits_count

	;; Merge last byte in each qword with the next qword
	vpermq	codes3 {k5}{z}, zrot_perm, codes3
	vpxord codes1, codes1, codes3

	;; Determine total bytes written
	vextracti64x2 code_lens1 %+ x, code_lens2, 3
	vpextrq tmp2, code_lens1 %+ x, 1

	;; Write out qwords at their byte offsets (code_lens2 holds the
	;; per-lane byte offset after rotation)
	knotq	k6, k0
	vpermq code_lens2 {k5}{z}, zrot_perm, code_lens2
	vpscatterqq [out_buf + code_lens2] {k6}, codes1

	add	out_buf, tmp2

	cmp	ptr, in_buf_end
	jbe	.main_loop

.main_loop_exit:
	;; Pull the pending bit-buffer state back into GPRs for .finish
	vmovq	rcx, zbits_count %+ x
	vmovq	bits, zbits %+ x
	jmp	.finish

.small_codes:
	;; Merge overlapping last bytes: OR each lane's spill byte into its
	;; neighbor, iterating while overlaps chain across lanes
	vpermq	codes4 {k6}{z}, zrot_perm, codes3
	vporq codes3, codes3, codes4
	kshiftlq k7, k6, 1
	ktestd k6, k7
	jz .small_codes_next

	kandq k6, k6, k7
	jmp .small_codes

.long_codes:
	;; Slow path: at least one token needs more than max_write_d bits.
	;; Re-process this vector scalar-style, four tokens per %rep pass.
	add	end_ptr, VECTOR_SLOP
	sub	ptr, VECTOR_SIZE

	vmovdqa32 codes3 {k1}{z}, codes1
	vmovdqa32 code_lens3 {k1}{z}, code_lens1
	vmovdqa32 codes4 {k1}{z}, codes2

	vpsllvq	codes4, codes4, code_lens3
	vpxord	codes3, codes3, codes4
	vpaddd	code_lens3, code_lens1, code_lens2

	vpsrlq	codes1, codes1, 32
	vpsrlq	code_lens1, code_lens1, 32
	vpsrlq	codes2, codes2, 32

	vpsllvd	codes2, codes2, code_lens1
	vpxord codes1, codes1, codes2

	vpsrlq code_lens1, code_lens3, 32
	vmovdqa32	code_lens3 {k1}{z}, code_lens3

	;; Merge bitbuf bits
	vpsllvq codes3, codes3, zbits_count
	vpxord	codes3, codes3, zbits
	vpaddq	code_lens3, code_lens3, zbits_count
	vpaddq code_lens1, code_lens1, code_lens3

	xor	bits, bits
	xor	rcx, rcx
	vpsubq code_lens1, code_lens1, code_lens3

	vmovdqu64 codes2, codes1
	vmovdqu64 code_lens2, code_lens1
	vmovdqu64 codes4, codes3
	vmovdqu64 code_lens4, code_lens3
%assign i 0
%rep 4
%assign i (i + 1)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	cmp	out_buf, end_ptr
	ja	.overflow
	;; insert LL code
	vmovq	sym, codes3 %+ x
	vmovq	tmp2, code_lens3 %+ x
	SHLX	sym, sym, rcx
	or	bits, sym
	add	rcx, tmp2

	; empty bits
	mov	[out_buf], bits
	mov	tmp, rcx
	shr	tmp, 3		; byte count
	add	out_buf, tmp
	mov	tmp, rcx
	and	rcx, ~7
	SHRX	bits, bits, rcx
	mov	rcx, tmp
	and	rcx, 7
	add	ptr, 4

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	cmp	out_buf, end_ptr
	ja	.overflow
	;; insert LL code
	vmovq	sym, codes1 %+ x
	vmovq	tmp2, code_lens1 %+ x
	SHLX	sym, sym, rcx
	or	bits, sym
	add	rcx, tmp2

	; empty bits
	mov	[out_buf], bits
	mov	tmp, rcx
	shr	tmp, 3		; byte count
	add	out_buf, tmp
	mov	tmp, rcx
	and	rcx, ~7
	SHRX	bits, bits, rcx
	mov	rcx, tmp
	and	rcx, 7
	add	ptr, 4

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	cmp	out_buf, end_ptr
	ja	.overflow
	;; insert LL code
	vpextrq	sym, codes3 %+ x, 1
	vpextrq	tmp2, code_lens3 %+ x, 1
	SHLX	sym, sym, rcx
	or	bits, sym
	add	rcx, tmp2

	; empty bits
	mov	[out_buf], bits
	mov	tmp, rcx
	shr	tmp, 3		; byte count
	add	out_buf, tmp
	mov	tmp, rcx
	and	rcx, ~7
	SHRX	bits, bits, rcx
	mov	rcx, tmp
	and	rcx, 7
	add	ptr, 4

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	cmp	out_buf, end_ptr
	ja	.overflow
	;; insert LL code
	vpextrq	sym, codes1 %+ x, 1
	vpextrq	tmp2, code_lens1 %+ x, 1
	SHLX	sym, sym, rcx
	or	bits, sym
	add	rcx, tmp2

	; empty bits
	mov	[out_buf], bits
	mov	tmp, rcx
	shr	tmp, 3		; byte count
	add	out_buf, tmp
	mov	tmp, rcx
	and	rcx, ~7
	SHRX	bits, bits, rcx
	mov	rcx, tmp
	and	rcx, 7
	add	ptr, 4

	;; Advance to the next 128-bit lane's four tokens
	vextracti32x4 codes3 %+ x, codes4, i
	vextracti32x4 code_lens3 %+ x, code_lens4, i
	vextracti32x4 codes1 %+ x, codes2, i
	vextracti32x4 code_lens1 %+ x, code_lens2, i
%endrep
	sub	end_ptr, VECTOR_SLOP

	vmovq	zbits %+ x, bits
	vmovq	zbits_count %+ x, rcx
	cmp	ptr, in_buf_end
	jbe	.main_loop

.finish:
	;; Scalar tail: undo the slack adjustments and encode one token at a
	;; time until the input is exhausted or the output fills up.
	add	in_buf_end, VECTOR_LOOP_PROCESSED
	add	end_ptr, VECTOR_SLOP

	cmp	ptr, in_buf_end
	jge	.overflow

.finish_loop:
	mov	DWORD(data), [ptr]

	cmp	out_buf, end_ptr
	ja	.overflow

	mov	sym, data
	and	sym, LIT_MASK	; sym has ll_code
	mov	DWORD(sym), [hufftables + _lit_len_table + sym * 4]

	; look up dist sym
	mov	dsym, data
	shr	dsym, DIST_OFFSET
	and	dsym, DIST_MASK
	mov	DWORD(dsym), [hufftables + _dist_table + dsym * 4]

	; insert LL code
	; sym: 31:24 length; 23:0 code
	mov	tmp2, sym
	and	sym, 0xFFFFFF
	SHLX	sym, sym, rcx
	shr	tmp2, 24
	or	bits, sym
	add	rcx, tmp2

	; insert dist code
	movzx	tmp, WORD(dsym)
	SHLX	tmp, tmp, rcx
	or	bits, tmp
	mov	tmp, dsym
	shr	tmp, 24
	add	rcx, tmp

	; insert dist extra bits
	shr	data, EXTRA_BITS_OFFSET
	add	ptr, 4
	SHLX	data, data, rcx
	or	bits, data
	shr	dsym, 16
	and	dsym, 0xFF
	add	rcx, dsym

	; empty bits
	mov	[out_buf], bits
	mov	tmp, rcx
	shr	tmp, 3		; byte count
	add	out_buf, tmp
	mov	tmp, rcx
	and	rcx, ~7
	SHRX	bits, bits, rcx
	mov	rcx, tmp
	and	rcx, 7

	cmp	ptr, in_buf_end
	jb	.finish_loop

.overflow:
	;; Write the bit-buffer state back through the spilled BitBuf pointer
	;; and return the current token pointer.
	mov	tmp, [rsp + bitbuf_mem_offset]
	mov	[tmp + _m_bits], bits
	mov	[tmp + _m_bit_count], ecx
	mov	[tmp + _m_out_buf], out_buf

	mov	rax, ptr

	FUNC_RESTORE

	ret

section .data
	align 64
;; 64 byte data
;; Lane rotation permutation: each qword lane takes the previous lane's
;; value (lane 0 takes lane 7).
rot_perm:
	dq 0x00000007, 0x00000000, 0x00000001, 0x00000002
	dq 0x00000003, 0x00000004, 0x00000005, 0x00000006

;; 16 byte data
q_64:
	dq 0x0000000000000040, 0x0000000000000000
q_8:
	dq 0x0000000000000000, 0x0000000000000008

;; 8 byte data
offset_mask:
	dq 0x0000000000000007

;; 4 byte data
;; Max total code bits per token handled by the fast path (28 bits).
max_write_d:
	dd 0x1c
lit_mask:
	dd LIT_MASK
dist_mask:
	dd DIST_MASK
lit_icr_mask:
	dd 0x00ffffff
eb_icr_mask:
	dd 0x000000ff

;; k mask constants
k_mask_1: dq 0x55555555
k_mask_2: dq 0xfffffff0
k_mask_3: dq 0xfffffffc
k_mask_4: dw 0x0101, 0x0101, 0x0101, 0x0101
k_mask_5: dq 0xfffffffe

%endif