xref: /isa-l/crc/crc32_gzip_refl_by16_10.asm (revision acbe0deecfbd7afce3eacca744b672bde5118927)
1;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2;  Copyright(c) 2011-2020 Intel Corporation All rights reserved.
3;
4;  Redistribution and use in source and binary forms, with or without
5;  modification, are permitted provided that the following conditions
6;  are met:
7;    * Redistributions of source code must retain the above copyright
8;      notice, this list of conditions and the following disclaimer.
9;    * Redistributions in binary form must reproduce the above copyright
10;      notice, this list of conditions and the following disclaimer in
11;      the documentation and/or other materials provided with the
12;      distribution.
13;    * Neither the name of Intel Corporation nor the names of its
14;      contributors may be used to endorse or promote products derived
15;      from this software without specific prior written permission.
16;
17;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
29
30;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
31;       Function API:
32;       UINT32 crc32_gzip_refl_by16_10(
33;               UINT32 init_crc, //initial CRC value, 32 bits
34;               const unsigned char *buf, //buffer pointer to calculate CRC on
35;               UINT64 len //buffer length in bytes (64-bit data)
36;       );
37;
38;       Authors:
39;               Erdinc Ozturk
40;               Vinodh Gopal
41;               James Guilford
42;
43;       Reference paper titled "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
44;       URL: http://download.intel.com/design/intarch/papers/323102.pdf
45;
46;
47;       sample yasm command line:
48;       yasm -f x64 -f elf64 -X gnu -g dwarf2 crc32_gzip_refl_by8
49;
50;       As explained here:
51;       http://docs.oracle.com/javase/7/docs/api/java/util/zip/package-summary.html
52;       CRC-32 checksum is described in RFC 1952
53;       Implementing RFC 1952 CRC:
54;       http://www.ietf.org/rfc/rfc1952.txt
55
56%include "reg_sizes.asm"
57
58%ifndef FUNCTION_NAME
59%define FUNCTION_NAME crc32_gzip_refl_by16_10
60%endif
61
62%if (AS_FEATURE_LEVEL) >= 10
63
64%define	fetch_dist	1024
65
66[bits 64]
67default rel
68
69section .text
70
71
72%ifidn __OUTPUT_FORMAT__, win64
73	%xdefine	arg1 rcx
74	%xdefine	arg2 rdx
75	%xdefine	arg3 r8
76
77	%xdefine	arg1_low32 ecx
78%else
79	%xdefine	arg1 rdi
80	%xdefine	arg2 rsi
81	%xdefine	arg3 rdx
82
83	%xdefine	arg1_low32 edi
84%endif
85
86align 16
87mk_global FUNCTION_NAME, function
88FUNCTION_NAME:
89	endbranch
90
91	not		arg1_low32
92
93%ifidn __OUTPUT_FORMAT__, win64
94	sub		rsp, (16*10 + 8)
95
96	; push the xmm registers into the stack to maintain
97	vmovdqa		[rsp + 16*0], xmm6
98	vmovdqa		[rsp + 16*1], xmm7
99	vmovdqa		[rsp + 16*2], xmm8
100	vmovdqa		[rsp + 16*3], xmm9
101	vmovdqa		[rsp + 16*4], xmm10
102	vmovdqa		[rsp + 16*5], xmm11
103	vmovdqa		[rsp + 16*6], xmm12
104	vmovdqa		[rsp + 16*7], xmm13
105	vmovdqa		[rsp + 16*8], xmm14
106	vmovdqa		[rsp + 16*9], xmm15
107%endif
108
109	; check if smaller than 256B
110	cmp		arg3, 256
111	jl		.less_than_256
112
113	; load the initial crc value
114	vmovd		xmm10, arg1_low32      ; initial crc
115
116	; receive the initial 64B data, xor the initial crc value
117	vmovdqu8	zmm0, [arg2+16*0]
118	vmovdqu8	zmm4, [arg2+16*4]
119	vpxorq		zmm0, zmm10
120	vbroadcasti32x4	zmm10, [rk3]	;xmm10 has rk3 and rk4
121					;imm value of pclmulqdq instruction will determine which constant to use
122
123	sub		arg3, 256
124	cmp		arg3, 256
125	jl		.fold_128_B_loop
126
127	vmovdqu8	zmm7, [arg2+16*8]
128	vmovdqu8	zmm8, [arg2+16*12]
129	vbroadcasti32x4 zmm16, [rk_1]	;zmm16 has rk-1 and rk-2
130	sub		arg3, 256
131
132align 16
133.fold_256_B_loop:
134	add		arg2, 256
135	vpclmulqdq	zmm1, zmm0, zmm16, 0x10
136	vpclmulqdq	zmm0, zmm0, zmm16, 0x01
137	vpternlogq	zmm0, zmm1, [arg2+16*0], 0x96
138
139	vpclmulqdq	zmm2, zmm4, zmm16, 0x10
140	vpclmulqdq	zmm4, zmm4, zmm16, 0x01
141	vpternlogq	zmm4, zmm2, [arg2+16*4], 0x96
142
143	vpclmulqdq	zmm3, zmm7, zmm16, 0x10
144	vpclmulqdq	zmm7, zmm7, zmm16, 0x01
145	vpternlogq	zmm7, zmm3, [arg2+16*8], 0x96
146
147	vpclmulqdq	zmm5, zmm8, zmm16, 0x10
148	vpclmulqdq	zmm8, zmm8, zmm16, 0x01
149	vpternlogq	zmm8, zmm5, [arg2+16*12], 0x96
150
151	sub		arg3, 256
152	jge     	.fold_256_B_loop
153
154	;; Fold 256 into 128
155	add		arg2, 256
156	vpclmulqdq	zmm1, zmm0, zmm10, 0x01
157	vpclmulqdq	zmm2, zmm0, zmm10, 0x10
158	vpternlogq	zmm7, zmm1, zmm2, 0x96	; xor ABC
159
160	vpclmulqdq	zmm5, zmm4, zmm10, 0x01
161	vpclmulqdq	zmm6, zmm4, zmm10, 0x10
162	vpternlogq	zmm8, zmm5, zmm6, 0x96	; xor ABC
163
164	vmovdqa32	zmm0, zmm7
165	vmovdqa32	zmm4, zmm8
166
167	add		arg3, 128
168	jmp		.less_than_128_B
169
170	; at this section of the code, there is 128*x+y (0<=y<128) bytes of buffer. The fold_128_B_loop
171	; loop will fold 128B at a time until we have 128+y Bytes of buffer
172
173	; fold 128B at a time. This section of the code folds 8 xmm registers in parallel
174align 16
175.fold_128_B_loop:
176	add		arg2, 128
177	vpclmulqdq	zmm2, zmm0, zmm10, 0x10
178	vpclmulqdq	zmm0, zmm0, zmm10, 0x01
179	vpternlogq	zmm0, zmm2, [arg2+16*0], 0x96
180
181	vpclmulqdq	zmm5, zmm4, zmm10, 0x10
182	vpclmulqdq	zmm4, zmm4, zmm10, 0x01
183	vpternlogq	zmm4, zmm5, [arg2+16*4], 0x96
184
185	sub		arg3, 128
186	jge		.fold_128_B_loop
187	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
188
189	add		arg2, 128
190align 16
191.less_than_128_B:
192        ;; At this point, the buffer pointer is pointing at the last
193        ;; y bytes of the buffer, where 0 <= y < 128.
194        ;; The 128 bytes of folded data is in 2 of the zmm registers:
195        ;;     zmm0 and zmm4
196
197        cmp             arg3, -64
198        jl              .fold_128_B_register
199
200        vbroadcasti32x4 zmm10, [rk15]
201        ;; If there are still 64 bytes left, folds from 128 bytes to 64 bytes
202        ;; and handles the next 64 bytes
203        vpclmulqdq      zmm2, zmm0, zmm10, 0x10
204        vpclmulqdq      zmm0, zmm0, zmm10, 0x01
205        vpternlogq      zmm0, zmm2, zmm4, 0x96
206        add             arg3, 128
207
208        jmp             .fold_64B_loop
209
210align 16
211.fold_128_B_register:
212	; fold the 8 128b parts into 1 xmm register with different constants
213	vmovdqu8	zmm16, [rk9]		; multiply by rk9-rk16
214	vmovdqu8	zmm11, [rk17]		; multiply by rk17-rk20, rk1,rk2, 0,0
215	vpclmulqdq	zmm1, zmm0, zmm16, 0x01
216	vpclmulqdq	zmm2, zmm0, zmm16, 0x10
217	vextracti64x2	xmm7, zmm4, 3		; save last that has no multiplicand
218
219	vpclmulqdq	zmm5, zmm4, zmm11, 0x01
220	vpclmulqdq	zmm6, zmm4, zmm11, 0x10
221	vmovdqa		xmm10, [rk1]		; Needed later in reduction loop
222	vpternlogq	zmm1, zmm2, zmm5, 0x96	; xor ABC
223	vpternlogq	zmm1, zmm6, zmm7, 0x96	; xor ABC
224
225	vshufi64x2      zmm8, zmm1, zmm1, 0x4e ; Swap 1,0,3,2 - 01 00 11 10
226	vpxorq          ymm8, ymm8, ymm1
227	vextracti64x2   xmm5, ymm8, 1
228	vpxorq          xmm7, xmm5, xmm8
229
230	; instead of 128, we add 128-16 to the loop counter to save 1 instruction from the loop
231	; instead of a cmp instruction, we use the negative flag with the jl instruction
232	add		arg3, 128-16
233	jl		.final_reduction_for_128
234
235	; now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7 and the rest is in memory
236	; we can fold 16 bytes at a time if y>=16
237	; continue folding 16B at a time
238
239align 16
240.16B_reduction_loop:
241	vpclmulqdq	xmm8, xmm7, xmm10, 0x1
242	vpclmulqdq	xmm7, xmm7, xmm10, 0x10
243        vpternlogq      xmm7, xmm8, [arg2], 0x96
244	add		arg2, 16
245	sub		arg3, 16
246	; instead of a cmp instruction, we utilize the flags with the jge instruction
247	; equivalent of: cmp arg3, 16-16
248	; check if there is any more 16B in the buffer to be able to fold
249	jge		.16B_reduction_loop
250
251	;now we have 16+z bytes left to reduce, where 0<= z < 16.
252	;first, we reduce the data in the xmm7 register
253
254
255align 16
256.final_reduction_for_128:
257	add		arg3, 16
258	je		.128_done
259
260	; here we are getting data that is less than 16 bytes.
261	; since we know that there was data before the pointer, we can offset
262	; the input pointer before the actual point, to receive exactly 16 bytes.
263	; after that the registers need to be adjusted.
264align 16
265.get_last_two_xmms:
266
267	vmovdqa		xmm2, xmm7
268	vmovdqu		xmm1, [arg2 - 16 + arg3]
269
270	; get rid of the extra data that was loaded before
271	; load the shift constant
272	lea		rax, [rel pshufb_shf_table]
273	add		rax, arg3
274	vmovdqu		xmm0, [rax]
275
276	vpshufb		xmm7, xmm0
277	vpxor		xmm0, [mask3]
278	vpshufb		xmm2, xmm0
279
280	vpblendvb	xmm2, xmm2, xmm1, xmm0
281	;;;;;;;;;;
282	vpclmulqdq	xmm8, xmm7, xmm10, 0x1
283	vpclmulqdq	xmm7, xmm7, xmm10, 0x10
284        vpternlogq      xmm7, xmm8, xmm2, 0x96
285
286align 16
287.128_done:
288	; compute crc of a 128-bit value
289	vmovdqa		xmm10, [rk5]
290	vmovdqa		xmm0, xmm7
291
292	;64b fold
293	vpclmulqdq	xmm7, xmm10, 0
294	vpsrldq		xmm0, 8
295	vpxor		xmm7, xmm0
296
297	;32b fold
298	vmovdqa		xmm0, xmm7
299	vpslldq		xmm7, 4
300	vpclmulqdq	xmm7, xmm10, 0x10
301	vpxor		xmm7, xmm0
302
303
304	;barrett reduction
305align 16
306.barrett:
307	vpand		xmm7, [mask2]
308	vmovdqa		xmm1, xmm7
309	vmovdqa		xmm2, xmm7
310	vmovdqa		xmm10, [rk7]
311
312	vpclmulqdq	xmm7, xmm10, 0
313        vpternlogq      xmm7, xmm2, [mask], 0x28
314	vmovdqa		xmm2, xmm7
315	vpclmulqdq	xmm7, xmm10, 0x10
316        vpternlogq      xmm7, xmm2, xmm1, 0x96
317	vpextrd		eax, xmm7, 2
318
319align 16
320.cleanup:
321	not		eax
322
323
324%ifidn __OUTPUT_FORMAT__, win64
325	vmovdqa		xmm6, [rsp + 16*0]
326	vmovdqa		xmm7, [rsp + 16*1]
327	vmovdqa		xmm8, [rsp + 16*2]
328	vmovdqa		xmm9, [rsp + 16*3]
329	vmovdqa		xmm10, [rsp + 16*4]
330	vmovdqa		xmm11, [rsp + 16*5]
331	vmovdqa		xmm12, [rsp + 16*6]
332	vmovdqa		xmm13, [rsp + 16*7]
333	vmovdqa		xmm14, [rsp + 16*8]
334	vmovdqa		xmm15, [rsp + 16*9]
335
336	add		rsp, (16*10 + 8)
337%endif
338	ret
339
340
341;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
342;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
343;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
344;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
345
346align 16
347.less_than_256:
348
349	; check if there is enough buffer to be able to fold 16B at a time
350	cmp	arg3, 32
351	jl	.less_than_32
352
353	vmovd	xmm1, arg1_low32	; get the initial crc value	; if there is, load the constants
354
355	cmp	arg3, 64
356	jl	.less_than_64
357
358        ;; receive the initial 64B data, xor the initial crc value
359        vmovdqu8        zmm0, [arg2]
360        vpxorq          zmm0, zmm1
361        add             arg2, 64
362        sub             arg3, 64
363
364        cmp             arg3, 64
365        jb              .reduce_64B
366
367        vbroadcasti32x4 zmm10, [rk15]
368
369align 16
370.fold_64B_loop:
371        vmovdqu8        zmm4, [arg2]
372        vpclmulqdq      zmm2, zmm0, zmm10, 0x10
373        vpclmulqdq      zmm0, zmm0, zmm10, 0x01
374        vpternlogq      zmm0, zmm2, zmm4, 0x96
375
376        add             arg2, 64
377        sub             arg3, 64
378
379        cmp             arg3, 64
380        jge             .fold_64B_loop
381
382align 16
383.reduce_64B:
384        ; Reduce from 64 bytes to 16 bytes
385	vmovdqu8	zmm11, [rk17]
386	vpclmulqdq	zmm1, zmm0, zmm11, 0x01
387	vpclmulqdq	zmm2, zmm0, zmm11, 0x10
388	vextracti64x2	xmm7, zmm0, 3		; save last that has no multiplicand
389        vpternlogq      zmm1, zmm2, zmm7, 0x96
390
391	vmovdqa		xmm10, [rk_1b] ; Needed later in reduction loop
392
393	vshufi64x2      zmm8, zmm1, zmm1, 0x4e ; Swap 1,0,3,2 - 01 00 11 10
394	vpxorq          ymm8, ymm8, ymm1
395	vextracti64x2   xmm5, ymm8, 1
396	vpxorq          xmm7, xmm5, xmm8
397
398        sub             arg3, 16
399        jns             .16B_reduction_loop ; At least 16 bytes of data to digest
400        jmp             .final_reduction_for_128
401
402align 16
403.less_than_64:
404	;; if there is, load the constants
405	vmovdqa	xmm10, [rk_1b]
406
407	vmovdqu	xmm7, [arg2]		; load the plaintext
408	vpxor	xmm7, xmm1              ; xmm1 already has initial crc value
409
410	;; update the buffer pointer
411	add	arg2, 16
412
413        ;; update the counter
414        ;; - subtract 32 instead of 16 to save one instruction from the loop
415	sub	arg3, 32
416	jmp	.16B_reduction_loop
417
418
419align 16
420.less_than_32:
421	; mov initial crc to the return value. this is necessary for zero-length buffers.
422	mov	eax, arg1_low32
423	test	arg3, arg3
424	je	.cleanup
425
426	vmovd	xmm0, arg1_low32	; get the initial crc value
427
428	cmp	arg3, 16
429	je	.exact_16_left
430	jl	.less_than_16_left
431
432	vmovdqu	xmm7, [arg2]		; load the plaintext
433	vpxor	xmm7, xmm0		; xor the initial crc value
434	add	arg2, 16
435	sub	arg3, 16
436	vmovdqa	xmm10, [rk1]		; rk1 and rk2 in xmm10
437	jmp	.get_last_two_xmms
438
439align 16
440.less_than_16_left:
441        xor     r10, r10
442        bts     r10, arg3
443        dec     r10
444        kmovw   k2, r10d
445        vmovdqu8 xmm7{k2}{z}, [arg2]
446
447	vpxor	xmm7, xmm0	; xor the initial crc value
448
449	cmp	arg3, 4
450	jb	.only_less_than_4
451
452	lea	rax, [rel pshufb_shf_table]
453	vmovdqu	xmm0, [rax + arg3]
454	vpshufb	xmm7,xmm0
455	jmp	.128_done
456
457align 16
458.exact_16_left:
459	vmovdqu	xmm7, [arg2]
460	vpxor	xmm7, xmm0      ; xor the initial crc value
461	jmp	.128_done
462
463align 16
464.only_less_than_4:
465        lea     r11, [rel pshufb_shift_table]
466        vmovdqu	xmm0, [r11 + arg3]
467        vpshufb	xmm7, xmm0
468        jmp	.barrett
469
470section .data
471align 32
472
473%ifndef USE_CONSTS
474; precomputed constants
475rk_1: dq 0x00000000e95c1271
476rk_2: dq 0x00000000ce3371cb
477rk1:  dq 0x00000000ccaa009e
478rk2:  dq 0x00000001751997d0
479rk3:  dq 0x000000014a7fe880
480rk4:  dq 0x00000001e88ef372
481rk5:  dq 0x00000000ccaa009e
482rk6:  dq 0x0000000163cd6124
483rk7:  dq 0x00000001f7011640
484rk8:  dq 0x00000001db710640
485rk9:  dq 0x00000001d7cfc6ac
486rk10: dq 0x00000001ea89367e
487rk11: dq 0x000000018cb44e58
488rk12: dq 0x00000000df068dc2
489rk13: dq 0x00000000ae0b5394
490rk14: dq 0x00000001c7569e54
491rk15: dq 0x00000001c6e41596
492rk16: dq 0x0000000154442bd4
493rk17: dq 0x0000000174359406
494rk18: dq 0x000000003db1ecdc
495rk19: dq 0x000000015a546366
496rk20: dq 0x00000000f1da05aa
497
498rk_1b: dq 0x00000000ccaa009e
499rk_2b: dq 0x00000001751997d0
500	dq 0x0000000000000000
501	dq 0x0000000000000000
502%else
503INCLUDE_CONSTS
504%endif
505
506pshufb_shf_table:
507; use these values for shift constants for the pshufb instruction
508; different alignments result in values as shown:
509;       dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
510;       dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-3) / shr2
511;       dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-4) / shr3
512;       dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
513;       dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
514;       dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
515;       dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9  (16-7) / shr7
516;       dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8  (16-8) / shr8
517;       dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7  (16-9) / shr9
518;       dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6  (16-10) / shr10
519;       dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5  (16-11) / shr11
520;       dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4  (16-12) / shr12
521;       dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3  (16-13) / shr13
522;       dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2  (16-14) / shr14
523;       dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1  (16-15) / shr15
524dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
525dq 0x0706050403020100, 0x000e0d0c0b0a0908
526
527align 16
528pshufb_shift_table:
529        ;; use these values to shift data for the pshufb instruction
530        db 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
531        db 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
532        db 0x08, 0x09, 0x0A
533
534mask:  dq     0xFFFFFFFFFFFFFFFF, 0x0000000000000000
535mask2: dq     0xFFFFFFFF00000000, 0xFFFFFFFFFFFFFFFF
536mask3: dq     0x8080808080808080, 0x8080808080808080
537
538%else  ; Assembler doesn't understand these opcodes. Add empty symbol for windows.
539%ifidn __OUTPUT_FORMAT__, win64
540global no_ %+ FUNCTION_NAME
541no_ %+ FUNCTION_NAME %+ :
542%endif
543%endif ; (AS_FEATURE_LEVEL) >= 10
544