xref: /isa-l/crc/crc64_iso_refl_by16_10.asm (revision c2bec3ea65ce35b01311d1cc4b314f6b4986b9c8)
1;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2;  Copyright(c) 2011-2019 Intel Corporation All rights reserved.
3;
4;  Redistribution and use in source and binary forms, with or without
5;  modification, are permitted provided that the following conditions
6;  are met:
7;    * Redistributions of source code must retain the above copyright
8;      notice, this list of conditions and the following disclaimer.
9;    * Redistributions in binary form must reproduce the above copyright
10;      notice, this list of conditions and the following disclaimer in
11;      the documentation and/or other materials provided with the
12;      distribution.
13;    * Neither the name of Intel Corporation nor the names of its
14;      contributors may be used to endorse or promote products derived
15;      from this software without specific prior written permission.
16;
17;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
29
30;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
31;       Function API:
32;       uint64_t crc64_iso_refl_by16_10(
33;               uint64_t init_crc, //initial CRC value, 64 bits
34;               const unsigned char *buf, //buffer pointer to calculate CRC on
35;               uint64_t len //buffer length in bytes (64-bit data)
36;       );
37;
38%include "reg_sizes.asm"
39
; Exported symbol name. A build can pre-define FUNCTION_NAME to assemble this
; body under another name; otherwise the default below is used.
40%ifndef FUNCTION_NAME
41%define FUNCTION_NAME crc64_iso_refl_by16_10
42%endif
43
; The implementation is only assembled when AS_FEATURE_LEVEL >= 10; the %else
; branch at the end of the file handles assemblers without these opcodes.
; NOTE(review): AS_FEATURE_LEVEL is not defined in this file - presumably it
; comes from reg_sizes.asm or the build command line; confirm.
44%if (AS_FEATURE_LEVEL) >= 10
45
; NOTE(review): fetch_dist is defined here but not referenced in this file.
46%define	fetch_dist	1024
47
48[bits 64]
; Label-only memory operands such as [rk1] are assembled RIP-relative.
49default rel
50
51section .text
52
53
; Argument registers: arg1 = init_crc, arg2 = buf, arg3 = len.
; win64 output uses rcx/rdx/r8; every other output format uses rdi/rsi/rdx.
54%ifidn __OUTPUT_FORMAT__, win64
55	%xdefine	arg1 rcx
56	%xdefine	arg2 rdx
57	%xdefine	arg3 r8
58%else
59	%xdefine	arg1 rdi
60	%xdefine	arg2 rsi
61	%xdefine	arg3 rdx
62%endif
63
; Stack frame layout (byte offsets from rsp after the prologue's sub):
;   [rsp, rsp+16)       16-byte scratch slot used by the below-16-byte input path
;   [rsp+XMM_SAVE, ...) win64 only: ten 16-byte slots holding xmm6-xmm15
; VARIABLE_OFFSET is the frame size: 200 bytes on win64, 40 bytes otherwise.
; Both are odd multiples of 8, so when rsp % 16 == 8 on entry (an
; ABI-conforming call) rsp is 16-byte aligned after the sub, which the aligned
; vmovdqa accesses to the frame rely on.
; NOTE(review): TMP is defined but never referenced; the scratch slot is
; addressed as [rsp] directly.
64%define TMP 16*0
65%ifidn __OUTPUT_FORMAT__, win64
66	%define XMM_SAVE 16*2
67	%define VARIABLE_OFFSET 16*12+8
68%else
69	%define VARIABLE_OFFSET 16*2+8
70%endif
71
; uint64_t FUNCTION_NAME(uint64_t init_crc, const unsigned char *buf, uint64_t len)
;   In:   arg1 = init_crc, arg2 = buf, arg3 = len in bytes
;   Out:  rax = CRC value
;   Uses: rax, r9, r11, the three argument registers, flags,
;         zmm0-zmm8, zmm10, zmm11, zmm16 (xmm6-xmm15 are saved/restored on win64)
; The CRC is processed in complemented form: arg1 is inverted on entry and rax
; is inverted again at _cleanup.
; NOTE(review): mk_global and endbranch are not defined in this file -
; presumably macros from reg_sizes.asm; confirm.
72align 16
73mk_global FUNCTION_NAME, function
74FUNCTION_NAME:
75	endbranch
76	not		arg1			; work on ~init_crc
77	sub		rsp, VARIABLE_OFFSET	; allocate the stack frame
78
79%ifidn __OUTPUT_FORMAT__, win64
80	; win64: save xmm6-xmm15 in the stack frame; they are reloaded at _cleanup
81	vmovdqa		[rsp + XMM_SAVE + 16*0], xmm6
82	vmovdqa		[rsp + XMM_SAVE + 16*1], xmm7
83	vmovdqa		[rsp + XMM_SAVE + 16*2], xmm8
84	vmovdqa		[rsp + XMM_SAVE + 16*3], xmm9
85	vmovdqa		[rsp + XMM_SAVE + 16*4], xmm10
86	vmovdqa		[rsp + XMM_SAVE + 16*5], xmm11
87	vmovdqa		[rsp + XMM_SAVE + 16*6], xmm12
88	vmovdqa		[rsp + XMM_SAVE + 16*7], xmm13
89	vmovdqa		[rsp + XMM_SAVE + 16*8], xmm14
90	vmovdqa		[rsp + XMM_SAVE + 16*9], xmm15
91%endif
92
	; inputs shorter than 256 bytes take the short-input path (signed compare)
93	cmp		arg3, 256
94	jl		_less_than_256
95
96	; load the initial crc value
97	vmovq		xmm10, arg1      ; initial crc in the low qword, upper bits zero
98
99	; load the first 128 bytes of the buffer and xor the initial crc into the first 8 bytes
100	vmovdqu8	zmm0, [arg2+16*0]
101	vmovdqu8	zmm4, [arg2+16*4]
102	vpxorq		zmm0, zmm10
103	vbroadcasti32x4 zmm10, [rk3]	;every 128-bit lane of zmm10 holds rk3 (low qword) and rk4 (high qword)
104					;imm value of pclmulqdq instruction will determine which constant to use
105
	; arg3 = len - 256: the 128 bytes already loaded plus a 128-byte bias, so
	; the folding loops can branch on the sign of the counter after each sub
106	sub		arg3, 256
107	cmp		arg3, 256
108	jl		_fold_128_B_loop	; len < 512: use only the 128-byte loop
109
	; len >= 512: load 128 more bytes so that 256 bytes are in flight in
	; zmm0, zmm4, zmm7 and zmm8
110	vmovdqu8	zmm7, [arg2+16*8]
111	vmovdqu8	zmm8, [arg2+16*12]
112	vbroadcasti32x4 zmm16, [rk_1]	;every 128-bit lane of zmm16 holds rk_1 (low qword) and rk_2 (high qword)
113	sub		arg3, 256		; arg3 = len - 512 (256 bytes loaded + 256-byte bias)
114
	; Fold 256 bytes per iteration.
	; On entry: zmm0, zmm4, zmm7, zmm8 hold 256 bytes of running state,
	; zmm16 = rk_1/rk_2 in every lane, zmm10 = rk3/rk4 in every lane,
	; arg2 points at the block just consumed, arg3 = bytes remaining - 256.
	; Each accumulator is carry-less multiplied by both constants (the two
	; immediates pick opposite qwords of data and constant) and the two
	; products are xored with the next 64 bytes of input; vpternlogq with
	; immediate 0x96 is a three-input xor.
115_fold_256_B_loop:
116	add		arg2, 256		; advance to the next 256-byte block
117	vpclmulqdq	zmm1, zmm0, zmm16, 0x10
118	vpclmulqdq	zmm0, zmm0, zmm16, 0x01
119	vpternlogq	zmm0, zmm1, [arg2+16*0], 0x96
120
121	vpclmulqdq	zmm2, zmm4, zmm16, 0x10
122	vpclmulqdq	zmm4, zmm4, zmm16, 0x01
123	vpternlogq	zmm4, zmm2, [arg2+16*4], 0x96
124
125	vpclmulqdq	zmm3, zmm7, zmm16, 0x10
126	vpclmulqdq	zmm7, zmm7, zmm16, 0x01
127	vpternlogq	zmm7, zmm3, [arg2+16*8], 0x96
128
129	vpclmulqdq	zmm5, zmm8, zmm16, 0x10
130	vpclmulqdq	zmm8, zmm8, zmm16, 0x01
131	vpternlogq	zmm8, zmm5, [arg2+16*12], 0x96
132
133	sub		arg3, 256
134	jge     	_fold_256_B_loop	; loop while at least 256 more bytes remain
135
136	;; Fold 256 into 128: fold zmm0/zmm4 with rk3/rk4 onto zmm7/zmm8
137	add		arg2, 256		; arg2 now points at the first unconsumed byte
138	vpclmulqdq	zmm1, zmm0, zmm10, 0x01
139	vpclmulqdq	zmm2, zmm0, zmm10, 0x10
140	vpternlogq	zmm7, zmm1, zmm2, 0x96	; xor ABC
141
142	vpclmulqdq	zmm5, zmm4, zmm10, 0x01
143	vpclmulqdq	zmm6, zmm4, zmm10, 0x10
144	vpternlogq	zmm8, zmm5, zmm6, 0x96	; xor ABC
145
146	vmovdqa32	zmm0, zmm7		; the 128 bytes of state now live in zmm0/zmm4
147	vmovdqa32	zmm4, zmm8
148
149	add		arg3, 128		; convert the 256-byte bias to the 128-byte bias _fold_128_B_register expects
150	jmp		_fold_128_B_register
151
152	; fold 128B at a time. This section of the code folds 2 zmm registers in parallel
	; On entry: zmm0/zmm4 hold 128 bytes of running state, zmm10 = rk3/rk4 in
	; every lane, arg2 points at the block just consumed,
	; arg3 = bytes remaining - 128 (non-negative on the first pass).
153_fold_128_B_loop:
154	add		arg2, 128	; update the buffer pointer
155	vpclmulqdq	zmm1, zmm0, zmm10, 0x10
156	vpclmulqdq	zmm0, zmm0, zmm10, 0x01
157	vpternlogq	zmm0, zmm1, [arg2+16*0], 0x96	; zmm0 = both products xor next 64 bytes
158
159	vpclmulqdq	zmm5, zmm4, zmm10, 0x10
160	vpclmulqdq	zmm4, zmm4, zmm10, 0x01
161	vpternlogq	zmm4, zmm5, [arg2+16*4], 0x96	; zmm4 = both products xor the following 64 bytes
162
163	sub		arg3, 128
164	jge		_fold_128_B_loop	; loop while at least 128 more bytes remain
165	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
166
167	add     arg2, 128		; step past the last block that was loaded
168	; at this point, the buffer pointer is pointing at the last y Bytes of the buffer, where 0 <= y < 128
169	; the 128B of folded data is in 2 zmm registers: zmm0, zmm4
170
	; Reached by fall-through from the 128-byte loop and by jump from the
	; 256-byte path. On entry: zmm0/zmm4 = 128 bytes of state, arg2 = first
	; unconsumed byte, arg3 = bytes remaining - 128.
171_fold_128_B_register:
172	; fold the 8 128b parts into 1 xmm register with different constants
173	vmovdqu8	zmm16, [rk9]		; multiply by rk9-rk16
174	vmovdqu8	zmm11, [rk17]		; multiply by rk17-rk20, rk1,rk2, 0,0 (the 64 bytes starting at rk17)
175	vpclmulqdq	zmm1, zmm0, zmm16, 0x01
176	vpclmulqdq	zmm2, zmm0, zmm16, 0x10
177	vextracti64x2	xmm7, zmm4, 3		; save last that has no multiplicand (upper lanes of zmm7 become zero)
178
179	vpclmulqdq	zmm5, zmm4, zmm11, 0x01
180	vpclmulqdq	zmm6, zmm4, zmm11, 0x10
181	vmovdqa		xmm10, [rk1]		; Needed later in reduction loop
182	vpternlogq	zmm1, zmm2, zmm5, 0x96	; xor ABC
183	vpternlogq	zmm1, zmm6, zmm7, 0x96	; xor ABC
184
	; horizontal xor of the four 128-bit lanes of zmm1 into xmm7
185	vshufi64x2      zmm8, zmm1, zmm1, 0x4e ; Swap 1,0,3,2 - 01 00 11 10
186	vpxorq          ymm8, ymm8, ymm1	; lanes (2,3) xor lanes (0,1)
187	vextracti64x2   xmm5, ymm8, 1
188	vpxorq          xmm7, xmm5, xmm8	; xmm7 = xor of all four lanes
189
190	; instead of 128, we add 128-16 to the loop counter to save 1 instruction from the loop
191	; instead of a cmp instruction, we use the negative flag with the jl instruction
192	add		arg3, 128-16		; arg3 = bytes remaining - 16
193	jl		_final_reduction_for_128	; fewer than 16 bytes remain
194
195	; now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7 and the rest is in memory
196	; we can fold 16 bytes at a time if y>=16
197	; continue folding 16B at a time
198
	; Fold 16 bytes per iteration. Entered by fall-through from
	; _fold_128_B_register and by jump from _less_than_256.
	; On entry: xmm7 = 16-byte running state, xmm10 = rk1 (low qword) and
	; rk2 (high qword), arg2 = next input byte, arg3 = bytes remaining - 16 (>= 0).
199_16B_reduction_loop:
200	vmovdqa		xmm8, xmm7
201	vpclmulqdq	xmm7, xmm10, 0x1	; high qword of state times rk1
202	vpclmulqdq	xmm8, xmm10, 0x10	; low qword of state times rk2
203	vpxor		xmm7, xmm8
204	vmovdqu		xmm0, [arg2]		; next 16 input bytes
205	vpxor		xmm7, xmm0
206	add		arg2, 16
207	sub		arg3, 16
208	; instead of a cmp instruction, we utilize the flags with the jge instruction
209	; equivalent of: cmp arg3, 16-16
210	; check if there is any more 16B in the buffer to be able to fold
211	jge		_16B_reduction_loop
212
213	;now we have 16+z bytes left to reduce, where 0<= z < 16.
214	;first, we reduce the data in the xmm7 register
215
216
	; On entry: xmm7 = 16-byte running state, arg3 = (bytes remaining) - 16,
	; which is negative here; adding 16 recovers the tail length z (0..15).
217_final_reduction_for_128:
218	add		arg3, 16
219	je		_128_done		; no tail bytes: xmm7 is ready for the final reduction
220	; here we are getting data that is less than 16 bytes.
221	; since we know that there was data before the pointer, we can offset
222	; the input pointer before the actual point, to receive exactly 16 bytes.
223	; after that the registers need to be adjusted.
	; Also entered from _less_than_32 with arg3 = len - 16 (1..15),
	; xmm7 = first 16 bytes xor initial crc, xmm10 = rk1/rk2.
224_get_last_two_xmms:
225
226
227	vmovdqa		xmm2, xmm7
228	vmovdqu		xmm1, [arg2 - 16 + arg3]	; the last 16 bytes of the buffer
229
230	; get rid of the extra data that was loaded before
231	; load the shift constant
232	lea		rax, [pshufb_shf_table]
233	add		rax, arg3
234	vmovdqu		xmm0, [rax]		; shuffle control taken at offset arg3 into the table
235
236
237	vpshufb		xmm7, xmm0		; xmm7 = state shifted left by (16 - arg3) bytes
238	vpxor		xmm0, [mask3]		; flip bit 7 of every control byte
239	vpshufb		xmm2, xmm0		; xmm2 = state shifted right by arg3 bytes
240
241	vpblendvb	xmm2, xmm2, xmm1, xmm0	; top arg3 bytes of xmm2 come from the tail load
242	;;;;;;;;;;
	; one more 16-byte fold: fold xmm7 by rk1/rk2 and xor with xmm2
243	vmovdqa		xmm8, xmm7
244	vpclmulqdq	xmm7, xmm10, 0x1
245
246	vpclmulqdq	xmm8, xmm10, 0x10
247	vpxor		xmm7, xmm8
248	vpxor		xmm7, xmm2
249
	; Final reduction of the 128-bit state in xmm7 to the 64-bit result in rax.
	; _128_done is also the target of _exact_16_left and _end_8to15.
250_128_done:
251	; compute crc of a 128-bit value
252	vmovdqa		xmm10, [rk5]		; xmm10 = rk5 (low qword), rk6 (high qword)
253	vmovdqa		xmm0, xmm7
254
255	;64b fold
256	vpclmulqdq	xmm7, xmm10, 0		; low qword of the state times rk5
257	vpsrldq		xmm0, 8			; high qword of the original state moved to the low half
258	vpxor		xmm7, xmm0
259
260	;barrett reduction
	; _barrett is also entered directly from _end_1to7, skipping the 64b fold.
261_barrett:
262	vmovdqa		xmm1, xmm7		; keep the value being reduced
263	vmovdqa		xmm10, [rk7]		; xmm10 = rk7 (low qword), rk8 (high qword)
264
265	vpclmulqdq	xmm7, xmm10, 0		; low qword of xmm7 times rk7
266	vmovdqa		xmm2, xmm7
267	vpclmulqdq	xmm7, xmm10, 0x10	; low qword of that product times rk8
268	vpslldq		xmm2, 8
269	vpxor		xmm7, xmm2
270	vpxor		xmm7, xmm1
271	vpextrq		rax, xmm7, 1		; the result is the high qword of xmm7
272
	; Common exit. On entry rax holds the complemented result; the
	; zero-length path arrives here with rax = ~init_crc, so init_crc is returned.
273_cleanup:
274	not		rax			; undo the complement applied to the CRC on entry
275
276
277%ifidn __OUTPUT_FORMAT__, win64
	; win64: reload xmm6-xmm15 from the slots written in the prologue
278	vmovdqa		xmm6, [rsp + XMM_SAVE + 16*0]
279	vmovdqa		xmm7, [rsp + XMM_SAVE + 16*1]
280	vmovdqa		xmm8, [rsp + XMM_SAVE + 16*2]
281	vmovdqa		xmm9, [rsp + XMM_SAVE + 16*3]
282	vmovdqa		xmm10, [rsp + XMM_SAVE + 16*4]
283	vmovdqa		xmm11, [rsp + XMM_SAVE + 16*5]
284	vmovdqa		xmm12, [rsp + XMM_SAVE + 16*6]
285	vmovdqa		xmm13, [rsp + XMM_SAVE + 16*7]
286	vmovdqa		xmm14, [rsp + XMM_SAVE + 16*8]
287	vmovdqa		xmm15, [rsp + XMM_SAVE + 16*9]
288%endif
289	add		rsp, VARIABLE_OFFSET	; release the stack frame
290	ret
291
292;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
293;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
294;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
295;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
296
	; Short-input path: reached when len < 256 (signed compare at entry).
	; Inputs of 32 bytes or more are handled entirely by the 16-byte loop.
297align 16
298_less_than_256:
299
300	; check if there is enough buffer to be able to fold 16B at a time
301	cmp	arg3, 32
302	jl	_less_than_32
303
304	; if there is, load the constants
305	vmovdqa	xmm10, [rk1]	; rk1 and rk2 in xmm10
306
307	vmovq	xmm0, arg1	; get the initial crc value
308	vmovdqu	xmm7, [arg2]	; load the plaintext
309	vpxor	xmm7, xmm0	; xor the crc into the first 8 bytes
310
311	; update the buffer pointer
312	add	arg2, 16
313
314	; update the counter. subtract 32 instead of 16 to save one instruction from the loop
	; (16 bytes already loaded plus the 16-byte bias the loop expects in arg3)
315	sub	arg3, 32
316
317	jmp	_16B_reduction_loop
318
	; Inputs shorter than 32 bytes. Dispatches on len:
	;   0        -> return init_crc unchanged
	;   below 16 -> _less_than_16_left
	;   16       -> _exact_16_left
	;   17..31   -> load the first 16 bytes here, then _get_last_two_xmms
319align 16
320_less_than_32:
321	; mov initial crc to the return value. this is necessary for zero-length buffers.
322	mov	rax, arg1	; arg1 is ~init_crc here; _cleanup inverts rax again
323	test	arg3, arg3
324	je	_cleanup
325
326	vmovq	xmm0, arg1	; get the initial crc value
327
328	cmp	arg3, 16
329	je	_exact_16_left
330	jl	_less_than_16_left
331
332	vmovdqu	xmm7, [arg2]	; load the plaintext
333	vpxor	xmm7, xmm0	; xor the initial crc value
334	add	arg2, 16
335	sub	arg3, 16	; arg3 = number of tail bytes (1..15)
336	vmovdqa	xmm10, [rk1]    ; rk1 and rk2 in xmm10
337	jmp	_get_last_two_xmms
338
339
	; Non-zero inputs shorter than 16 bytes. On entry xmm0 = initial crc
	; (complemented) in its low qword. The bytes are copied into a zeroed
	; 16-byte scratch slot at [rsp] in chunks of 8, 4, 2 and 1, so only bytes
	; inside the buffer are read.
340align 16
341_less_than_16_left:
342	; use stack space to load data less than 16 bytes, zero-out the 16B in memory first.
343
344	vpxor	xmm1, xmm1
345	mov	r11, rsp	; r11 = write cursor into the scratch slot
346	vmovdqa	[r11], xmm1
347
348	; backup the counter value
349	mov	r9, arg3	; r9 keeps the original length; arg3 is consumed below
350	cmp	arg3, 8
351	jl	_less_than_8_left
352
353	; load 8 Bytes
354	mov	rax, [arg2]
355	mov	[r11], rax
356	add	r11, 8
357	sub	arg3, 8
358	add	arg2, 8
359_less_than_8_left:
360
361	cmp	arg3, 4
362	jl	_less_than_4_left
363
364	; load 4 Bytes
365	mov	eax, [arg2]
366	mov	[r11], eax
367	add	r11, 4
368	sub	arg3, 4
369	add	arg2, 4
370_less_than_4_left:
371
372	cmp	arg3, 2
373	jl	_less_than_2_left
374
375	; load 2 Bytes
376	mov	ax, [arg2]
377	mov	[r11], ax
378	add	r11, 2
379	sub	arg3, 2
380	add	arg2, 2
381_less_than_2_left:
382	cmp	arg3, 1
383	jl	_zero_left
384
385	; load 1 Byte
386	mov	al, [arg2]
387	mov	[r11], al
388
389_zero_left:
390	vmovdqa	xmm7, [rsp]	; the zero-padded copy of the input
391	vpxor	xmm7, xmm0	; xor the initial crc value
392
393	lea	rax,[pshufb_shf_table]
394
395	cmp	r9, 8
396	jl	_end_1to7
397
	; 8..15 bytes: shift the data left by (16 - length) bytes, then run the
	; full final reduction
398_end_8to15:
399	vmovdqu	xmm0, [rax + r9]
400	vpshufb	xmm7,xmm0
401	jmp	_128_done
402
	; 1..7 bytes: the data fits in 64 bits, so the 64b fold is skipped
403_end_1to7:
404	; Left shift (8-length) bytes in XMM
405	vmovdqu	xmm0, [rax + r9 + 8]
406	vpshufb	xmm7,xmm0
407
408	jmp	_barrett
409
	; Exactly 16 bytes of input: load them, xor in the initial crc (xmm0 was
	; set in _less_than_32) and go straight to the final reduction.
410align 16
411_exact_16_left:
412	vmovdqu	xmm7, [arg2]
413	vpxor	xmm7, xmm0	; xor the initial crc value
414
415	jmp	_128_done
416
; Constant tables. The 32-byte alignment keeps every pair that is loaded with
; an aligned vmovdqa (rk1, rk5, rk7) on a 16-byte boundary in the default layout.
417section .data
418align 32
419
; If USE_CONSTS is defined the constants come from the INCLUDE_CONSTS macro
; instead of the table below.
; NOTE(review): INCLUDE_CONSTS is not defined in this file - presumably it is
; supplied by a file that %includes this one and must provide the same labels.
420%ifndef USE_CONSTS
421; precomputed constants
; Where the code loads each pair (low qword first):
;   rk_1, rk_2   256-byte folding loop (broadcast into zmm16)
;   rk1,  rk2    16-byte folding loop and the tail fold
;   rk3,  rk4    128-byte folding loop and the 256-to-128 fold (broadcast into zmm10)
;   rk5,  rk6    64b fold at _128_done
;   rk7,  rk8    barrett reduction
;   rk9 .. rk16  one 64-byte load: per-lane multipliers for zmm0 in _fold_128_B_register
;   rk17 .. rk20, rk_1b, rk_2b, 0, 0
;                one 64-byte load: per-lane multipliers for zmm4 in _fold_128_B_register
422rk_1: dq 0x45000000b0000000
423rk_2: dq 0x6b700000f5000000
424rk1:  dq 0xf500000000000001
425rk2:  dq 0x6b70000000000001
426rk3:  dq 0xb001000000010000
427rk4:  dq 0xf501b0000001b000
428rk5:  dq 0xf500000000000001
429rk6:  dq 0x0000000000000000
430rk7:  dq 0xb000000000000001
431rk8:  dq 0xb000000000000000
432rk9:  dq 0xe014514514501501
433rk10: dq 0x771db6db6db71c71
434rk11: dq 0xa101101101110001
435rk12: dq 0x1ab1ab1ab1aab001
436rk13: dq 0xf445014445000001
437rk14: dq 0x6aab71daab700001
438rk15: dq 0xb100010100000001
439rk16: dq 0x01b001b1b0000001
440rk17: dq 0xe145150000000001
441rk18: dq 0x76db6c7000000001
442rk19: dq 0xa011000000000001
443rk20: dq 0x1b1ab00000000001
444
; rk_1b/rk_2b repeat the rk1/rk2 values; the two zero qwords after them
; complete the 64-byte vector that is loaded from rk17.
445rk_1b: dq 0xf500000000000001
446rk_2b: dq 0x6b70000000000001
447	dq 0x0000000000000000
448	dq 0x0000000000000000
449%else
450INCLUDE_CONSTS
451%endif
452
; 32-byte table of pshufb control bytes. A 16-byte load at offset k (1..15)
; gives a control that shifts a register left by (16 - k) bytes, zero-filling
; the low bytes; xoring that control with mask3 turns it into a right shift by
; k bytes. The commented-out rows list the 16 bytes read at each offset.
453pshufb_shf_table:
454; use these values for shift constants for the pshufb instruction
455; different alignments result in values as shown:
456;       dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
457;       dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
458;       dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
459;       dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
460;       dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
461;       dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
462;       dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9  (16-7) / shr7
463;       dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8  (16-8) / shr8
464;       dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7  (16-9) / shr9
465;       dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6  (16-10) / shr10
466;       dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5  (16-11) / shr11
467;       dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4  (16-12) / shr12
468;       dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3  (16-13) / shr13
469;       dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2  (16-14) / shr14
470;       dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1  (16-15) / shr15
471dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
472dq 0x0706050403020100, 0x000e0d0c0b0a0908
473
; mask3 flips bit 7 of every pshufb control byte (used in _get_last_two_xmms).
; NOTE(review): mask and mask2 are not referenced anywhere in this file.
474mask:  dq     0xFFFFFFFFFFFFFFFF, 0x0000000000000000
475mask2: dq     0xFFFFFFFF00000000, 0xFFFFFFFFFFFFFFFF
476mask3: dq     0x8080808080808080, 0x8080808080808080
477
478%else  ; Assembler doesn't understand these opcodes. Add empty symbol for windows.
; Fallback when AS_FEATURE_LEVEL < 10: the function itself is not assembled.
; Only win64 output gets a placeholder, an exported label named
; no_<FUNCTION_NAME> with no code behind it; other formats emit nothing here.
479%ifidn __OUTPUT_FORMAT__, win64
480global no_ %+ FUNCTION_NAME
481no_ %+ FUNCTION_NAME %+ :
482%endif
483%endif ; (AS_FEATURE_LEVEL) >= 10
484