xref: /isa-l/crc/crc16_t10dif_01.asm (revision 4f59eeda903272a4daf93cbe5352ba7917b7a85d)
1;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
3;
4;  Redistribution and use in source and binary forms, with or without
5;  modification, are permitted provided that the following conditions
6;  are met:
7;    * Redistributions of source code must retain the above copyright
8;      notice, this list of conditions and the following disclaimer.
9;    * Redistributions in binary form must reproduce the above copyright
10;      notice, this list of conditions and the following disclaimer in
11;      the documentation and/or other materials provided with the
12;      distribution.
13;    * Neither the name of Intel Corporation nor the names of its
14;      contributors may be used to endorse or promote products derived
15;      from this software without specific prior written permission.
16;
17;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
29
30;       Function API:
31;       UINT16 crc16_t10dif_01(
32;               UINT16 init_crc, //initial CRC value, 16 bits
33;               const unsigned char *buf, //buffer pointer to calculate CRC on
34;               UINT64 len //buffer length in bytes (64-bit data)
35;       );
36;
37;       Authors:
38;               Erdinc Ozturk
39;               Vinodh Gopal
40;               James Guilford
41;
42;       Reference paper titled "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
43;       URL: http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
44
45%include "reg_sizes.asm"
46
; fetch_dist: byte distance ahead of the current buffer pointer that the
; prefetchnta instructions in _fold_128_B_loop touch.
47%define	fetch_dist	1024
48
49[bits 64]
; default rel: memory references to labels are assembled RIP-relative.
50default rel
51
52section .text
53
; Argument registers, in the order of the API comment above
; (arg1 = init_crc, arg2 = buf, arg3 = len). win64 output takes them from
; rcx/rdx/r8; every other output format takes them from rdi/rsi/rdx.
; arg1_low32 is the 32-bit name of the arg1 register.
54%ifidn __OUTPUT_FORMAT__, win64
55        %xdefine        arg1 rcx
56        %xdefine        arg2 rdx
57        %xdefine        arg3 r8
58
59        %xdefine        arg1_low32 ecx
60%else
61        %xdefine        arg1 rdi
62        %xdefine        arg2 rsi
63        %xdefine        arg3 rdx
64
65        %xdefine        arg1_low32 edi
66%endif
67
; VARIABLE_OFFSET is the number of bytes subtracted from rsp on entry and
; added back in _cleanup.
;   win64: 16*10+8 bytes; the code stores xmm6..xmm13 at [rsp+16*2]..[rsp+16*9].
;   other: 16*2+8 bytes.
; In both layouts the 16 bytes at [rsp] serve as scratch for inputs shorter
; than 16 bytes.
; NOTE(review): the "+8" presumably makes rsp 16-byte aligned after the sub
; (the movdqa accesses to [rsp+...] need that), assuming rsp % 16 == 8 at
; entry -- confirm against the calling convention in use.
; XMM_SAVE is not referenced by name in the code visible in this file; the
; save/restore code uses the literal offset 16*2 instead.
68%ifidn __OUTPUT_FORMAT__, win64
69        %define XMM_SAVE 16*2
70        %define VARIABLE_OFFSET 16*10+8
71%else
72        %define VARIABLE_OFFSET 16*2+8
73%endif
74
; crc16_t10dif_01 -- function entry and setup for buffers of 256 bytes or more.
;   In:  arg1_low32 = init_crc, arg2 = buf, arg3 = len (see the arg* macros).
;   Out: eax = CRC (extracted in _barrett, scaled back to 16 bits in _cleanup).
; Shorter buffers branch to _less_than_256. Otherwise the first 128 bytes are
; loaded byte-reversed into xmm0..xmm7, the initial CRC is XORed into the top
; of xmm0, and execution falls through into _fold_128_B_loop with
;   xmm10 = rk3 (low qword) | rk4 (high qword), xmm11 = SHUF_MASK,
;   arg2 = buf, arg3 = len - 256.
75align 16
76global	crc16_t10dif_01:function
77crc16_t10dif_01:
78
79	; adjust the 16-bit initial_crc value, scale it to 32 bits
80	shl	arg1_low32, 16	; 32-bit shift: CRC ends up in bits 16..31, higher input bits are shifted out
81
82	; After this point, code flow is exactly same as a 32-bit CRC.
83	; The only difference is before returning eax, we will shift it right 16 bits, to scale back to 16 bits.
84
85	sub	rsp, VARIABLE_OFFSET	; reserve the stack frame (released in _cleanup)
86%ifidn __OUTPUT_FORMAT__, win64
87	; win64 only: save xmm6-xmm13 on the stack; they are reloaded in _cleanup
88	movdqa [rsp+16*2],xmm6
89	movdqa [rsp+16*3],xmm7
90	movdqa [rsp+16*4],xmm8
91	movdqa [rsp+16*5],xmm9
92	movdqa [rsp+16*6],xmm10
93	movdqa [rsp+16*7],xmm11
94	movdqa [rsp+16*8],xmm12
95	movdqa [rsp+16*9],xmm13
96%endif
97
98	; check if smaller than 256
99	cmp	arg3, 256
100
101	; for sizes less than 256, we can't fold 128B at a time...
	; NOTE(review): jl is a signed comparison, so a len with bit 63 set would also take this branch.
102	jl	_less_than_256
103
104
105	; load the initial crc value
106	movd	xmm10, arg1_low32	; initial crc
107
108	; crc value does not need to be byte-reflected, but it needs to be moved to the high part of the register.
109	; because data will be byte-reflected and will align with initial crc at correct place.
110	pslldq	xmm10, 12	; move the dword from bytes 0..3 to bytes 12..15
111
112	movdqa xmm11, [SHUF_MASK]	; pshufb control that reverses the order of the 16 bytes
113	; receive the initial 128B data, xor the initial crc value
114	movdqu	xmm0, [arg2+16*0]
115	movdqu	xmm1, [arg2+16*1]
116	movdqu	xmm2, [arg2+16*2]
117	movdqu	xmm3, [arg2+16*3]
118	movdqu	xmm4, [arg2+16*4]
119	movdqu	xmm5, [arg2+16*5]
120	movdqu	xmm6, [arg2+16*6]
121	movdqu	xmm7, [arg2+16*7]
122
123	pshufb	xmm0, xmm11
124	; XOR the initial_crc value
125	pxor	xmm0, xmm10
126	pshufb	xmm1, xmm11
127	pshufb	xmm2, xmm11
128	pshufb	xmm3, xmm11
129	pshufb	xmm4, xmm11
130	pshufb	xmm5, xmm11
131	pshufb	xmm6, xmm11
132	pshufb	xmm7, xmm11
133
134	movdqa	xmm10, [rk3]	;xmm10 has rk3 and rk4
135					;imm value of pclmulqdq instruction will determine which constant to use
136	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
137	; we subtract 256 instead of 128 to save one instruction from the loop
138	sub	arg3, 256	; arg3 = (bytes not yet loaded) - 128; the loop keeps this relation
139
140	; at this section of the code, there is 128*x+y (0<=y<128) bytes of buffer. The _fold_128_B_loop
141	; loop will fold 128B at a time until we have 128+y Bytes of buffer
142
143
; _fold_128_B_loop -- fold 128 bytes per iteration into the eight 16-byte
; accumulators xmm0..xmm7, then collapse the eight accumulators into xmm7.
;   On entry: xmm0..xmm7 = accumulators, xmm10 = rk3 | rk4, xmm11 = SHUF_MASK,
;             arg2 = start of the 128 bytes absorbed last,
;             arg3 = (bytes not yet loaded) - 128.
;   Per accumulator: acc = clmul(acc.low, rk3) ^ clmul(acc.high, rk4) ^ next 16 data bytes (byte-reversed).
;   Scratch: xmm8, xmm9, xmm12, xmm13.
;   On exit (either branch of the final jl): xmm7 = single 16-byte accumulator,
;             xmm10 = rk1 | rk2, arg2 = first byte not yet loaded,
;             arg3 = (bytes not yet loaded) - 16.
144	; fold 128B at a time. This section of the code folds 8 xmm registers in parallel
145_fold_128_B_loop:
146
147	; update the buffer pointer
148	add	arg2, 128		;    buf += 128;
149
150	prefetchnta [arg2+fetch_dist+0]
151	movdqu	xmm9, [arg2+16*0]
152	movdqu	xmm12, [arg2+16*1]
153	pshufb	xmm9, xmm11
154	pshufb	xmm12, xmm11
155	movdqa	xmm8, xmm0
156	movdqa	xmm13, xmm1
157	pclmulqdq	xmm0, xmm10, 0x0	; imm 0x00: low qword of xmm0 times low qword of xmm10 (rk3)
158	pclmulqdq	xmm8, xmm10 , 0x11	; imm 0x11: high qword of the copy times high qword of xmm10 (rk4)
159	pclmulqdq	xmm1, xmm10, 0x0
160	pclmulqdq	xmm13, xmm10 , 0x11
161	pxor	xmm0, xmm9
162	xorps	xmm0, xmm8	; xorps performs the same bitwise XOR as pxor
163	pxor	xmm1, xmm12
164	xorps	xmm1, xmm13
165
	; same pattern for accumulators xmm2/xmm3 and data bytes 32..63
166	prefetchnta [arg2+fetch_dist+32]
167	movdqu	xmm9, [arg2+16*2]
168	movdqu	xmm12, [arg2+16*3]
169	pshufb	xmm9, xmm11
170	pshufb	xmm12, xmm11
171	movdqa	xmm8, xmm2
172	movdqa	xmm13, xmm3
173	pclmulqdq	xmm2, xmm10, 0x0
174	pclmulqdq	xmm8, xmm10 , 0x11
175	pclmulqdq	xmm3, xmm10, 0x0
176	pclmulqdq	xmm13, xmm10 , 0x11
177	pxor	xmm2, xmm9
178	xorps	xmm2, xmm8
179	pxor	xmm3, xmm12
180	xorps	xmm3, xmm13
181
	; same pattern for accumulators xmm4/xmm5 and data bytes 64..95
182	prefetchnta [arg2+fetch_dist+64]
183	movdqu	xmm9, [arg2+16*4]
184	movdqu	xmm12, [arg2+16*5]
185	pshufb	xmm9, xmm11
186	pshufb	xmm12, xmm11
187	movdqa	xmm8, xmm4
188	movdqa	xmm13, xmm5
189	pclmulqdq	xmm4, xmm10, 0x0
190	pclmulqdq	xmm8, xmm10 , 0x11
191	pclmulqdq	xmm5, xmm10, 0x0
192	pclmulqdq	xmm13, xmm10 , 0x11
193	pxor	xmm4, xmm9
194	xorps	xmm4, xmm8
195	pxor	xmm5, xmm12
196	xorps	xmm5, xmm13
197
	; same pattern for accumulators xmm6/xmm7 and data bytes 96..127
198	prefetchnta [arg2+fetch_dist+96]
199	movdqu	xmm9, [arg2+16*6]
200	movdqu	xmm12, [arg2+16*7]
201	pshufb	xmm9, xmm11
202	pshufb	xmm12, xmm11
203	movdqa	xmm8, xmm6
204	movdqa	xmm13, xmm7
205	pclmulqdq	xmm6, xmm10, 0x0
206	pclmulqdq	xmm8, xmm10 , 0x11
207	pclmulqdq	xmm7, xmm10, 0x0
208	pclmulqdq	xmm13, xmm10 , 0x11
209	pxor	xmm6, xmm9
210	xorps	xmm6, xmm8
211	pxor	xmm7, xmm12
212	xorps	xmm7, xmm13
213
214	sub	arg3, 128	; sets the flags tested by the jge below
215
216	; check if there is another 128B in the buffer to be able to fold
217	jge	_fold_128_B_loop	; signed test: loop while at least 128 unloaded bytes remain
218	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
219
220
221	add	arg2, 128
222	; at this point, the buffer pointer is pointing at the last y Bytes of the buffer
223	; fold the 8 xmm registers to 1 xmm register with different constants
	; each step below: xmm7 ^= clmul(xmmN.low, rk_odd) ^ clmul(xmmN.high, rk_even)
224
225	movdqa	xmm10, [rk9]	; rk9 (low qword) and rk10 (high qword): fold xmm0 into xmm7
226	movdqa	xmm8, xmm0
227	pclmulqdq	xmm0, xmm10, 0x11
228	pclmulqdq	xmm8, xmm10, 0x0
229	pxor	xmm7, xmm8
230	xorps	xmm7, xmm0
231
232	movdqa	xmm10, [rk11]	; rk11 and rk12: fold xmm1 into xmm7
233	movdqa	xmm8, xmm1
234	pclmulqdq	xmm1, xmm10, 0x11
235	pclmulqdq	xmm8, xmm10, 0x0
236	pxor	xmm7, xmm8
237	xorps	xmm7, xmm1
238
239	movdqa	xmm10, [rk13]	; rk13 and rk14: fold xmm2 into xmm7
240	movdqa	xmm8, xmm2
241	pclmulqdq	xmm2, xmm10, 0x11
242	pclmulqdq	xmm8, xmm10, 0x0
243	pxor	xmm7, xmm8
244	pxor	xmm7, xmm2
245
246	movdqa	xmm10, [rk15]	; rk15 and rk16: fold xmm3 into xmm7
247	movdqa	xmm8, xmm3
248	pclmulqdq	xmm3, xmm10, 0x11
249	pclmulqdq	xmm8, xmm10, 0x0
250	pxor	xmm7, xmm8
251	xorps	xmm7, xmm3
252
253	movdqa	xmm10, [rk17]	; rk17 and rk18: fold xmm4 into xmm7
254	movdqa	xmm8, xmm4
255	pclmulqdq	xmm4, xmm10, 0x11
256	pclmulqdq	xmm8, xmm10, 0x0
257	pxor	xmm7, xmm8
258	pxor	xmm7, xmm4
259
260	movdqa	xmm10, [rk19]	; rk19 and rk20: fold xmm5 into xmm7
261	movdqa	xmm8, xmm5
262	pclmulqdq	xmm5, xmm10, 0x11
263	pclmulqdq	xmm8, xmm10, 0x0
264	pxor	xmm7, xmm8
265	xorps	xmm7, xmm5
266
267	movdqa	xmm10, [rk1]	;xmm10 has rk1 and rk2
268				;imm value of pclmulqdq instruction will determine which constant to use
	; fold xmm6 into xmm7; xmm10 keeps rk1/rk2 for the 16-byte folds that follow
269	movdqa	xmm8, xmm6
270	pclmulqdq	xmm6, xmm10, 0x11
271	pclmulqdq	xmm8, xmm10, 0x0
272	pxor	xmm7, xmm8
273	pxor	xmm7, xmm6
274
275
276	; instead of 128, we add 112 to the loop counter to save 1 instruction from the loop
277	; instead of a cmp instruction, we use the negative flag with the jl instruction
278	add	arg3, 128-16	; arg3 = (bytes not yet loaded) - 16
279	jl	_final_reduction_for_128	; fewer than 16 bytes left: skip the 16-byte loop
280
; _16B_reduction_loop -- fold 16 bytes per iteration into xmm7.
;   On entry: xmm7 = accumulator, xmm10 = rk1 | rk2, xmm11 = SHUF_MASK,
;             arg2 = next byte to load, arg3 = (bytes not yet loaded) - 16, >= 0.
;   Reached by fall-through after the 128-byte fold and by jmp from _less_than_256.
;   Scratch: xmm0, xmm8.
281	; now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7 and the rest is in memory
282	; we can fold 16 bytes at a time if y>=16
283	; continue folding 16B at a time
284
285_16B_reduction_loop:
286	movdqa	xmm8, xmm7
287	pclmulqdq	xmm7, xmm10, 0x11	; high qword of xmm7 times rk2
288	pclmulqdq	xmm8, xmm10, 0x0	; low qword of xmm7 (copy) times rk1
289	pxor	xmm7, xmm8
290	movdqu	xmm0, [arg2]
291	pshufb	xmm0, xmm11	; reverse the byte order of the new data
292	pxor	xmm7, xmm0
293	add	arg2, 16
294	sub	arg3, 16
295	; instead of a cmp instruction, we utilize the flags with the jge instruction
296	; equivalent of: cmp arg3, 16-16
297	; check if there is any more 16B in the buffer to be able to fold
298	jge	_16B_reduction_loop
299
300	;now we have 16+z bytes left to reduce, where 0<= z < 16.
301	;first, we reduce the data in the xmm7 register
302
303
; _final_reduction_for_128 -- decide whether a tail shorter than 16 bytes remains.
;   On entry: arg3 = (bytes not yet loaded) - 16, which is negative here.
;   After the add, arg3 = number of tail bytes; zero goes straight to
;   _128_done, otherwise execution falls through to the tail handling.
304_final_reduction_for_128:
305	; check if any more data to fold. If not, compute the CRC of the final 128 bits
306	add	arg3, 16
307	je	_128_done
308
; _get_last_two_xmms -- absorb a tail of arg3 bytes (arg3 nonzero and below 16).
;   On entry: xmm7 = accumulator, xmm10 = rk1 | rk2, xmm11 = SHUF_MASK,
;             arg2 = first tail byte, arg3 = tail length.
;   Reached by fall-through from _final_reduction_for_128 and by jmp from _less_than_32.
;   Clobbers rax, xmm0, xmm1, xmm2, xmm8. Falls through to _128_done.
309	; here we are getting data that is less than 16 bytes.
310	; since we know that there was data before the pointer, we can offset the input pointer before the actual point, to receive exactly 16 bytes.
311	; after that the registers need to be adjusted.
312_get_last_two_xmms:
313	movdqa	xmm2, xmm7
314
315	movdqu	xmm1, [arg2 - 16 + arg3]	; the last 16 bytes of the buffer (overlaps bytes already folded)
316	pshufb	xmm1, xmm11
317
318	; get rid of the extra data that was loaded before
319	; load the shift constant
320	lea	rax, [pshufb_shf_table + 16]
321	sub	rax, arg3
322	movdqu	xmm0, [rax]	; 16 control bytes starting at pshufb_shf_table + 16 - arg3
323
324	; shift xmm2 to the left by arg3 bytes
325	pshufb	xmm2, xmm0	; control bytes with bit 7 set produce zero bytes
326
327	; shift xmm7 to the right by 16-arg3 bytes
328	pxor	xmm0, [mask1]	; flip bit 7 of every control byte
329	pshufb	xmm7, xmm0
330	pblendvb	xmm1, xmm2	;xmm0 is implicit
				; bytes whose xmm0 control byte has bit 7 set come from xmm2, the rest stay from xmm1
331
332	; fold 16 Bytes
333	movdqa	xmm2, xmm1
334	movdqa	xmm8, xmm7
335	pclmulqdq	xmm7, xmm10, 0x11
336	pclmulqdq	xmm8, xmm10, 0x0
337	pxor	xmm7, xmm8
338	pxor	xmm7, xmm2
339
; _128_done -- reduce the 128-bit accumulator in xmm7 ahead of the Barrett step.
;   On entry: xmm7 = 128-bit value to reduce. Clobbers xmm0 and xmm10.
;   Falls through to _barrett.
340_128_done:
341	; compute crc of a 128-bit value
342	movdqa	xmm10, [rk5]	; rk5 and rk6 in xmm10
343	movdqa	xmm0, xmm7
344
345	;64b fold
346	pclmulqdq	xmm7, xmm10, 0x1	; imm 0x01: high qword of xmm7 times low qword of xmm10 (rk5)
347	pslldq	xmm0, 8	; move the low qword of the saved copy into the high qword
348	pxor	xmm7, xmm0
349
350	;32b fold
351	movdqa	xmm0, xmm7
352
353	pand	xmm0, [mask2]	; keep the low 96 bits of the copy
354
355	psrldq	xmm7, 12	; bring the top dword down to the low dword
356	pclmulqdq	xmm7, xmm10, 0x10	; imm 0x10: low qword of xmm7 times high qword of xmm10 (rk6)
357	pxor	xmm7, xmm0
358
; _barrett -- final reduction of the value in xmm7; leaves the 32-bit result in eax.
;   Reached by fall-through from _128_done and by jmp from the 1..3 byte
;   paths (_only_less_than_4 and the labels after it).
;   Clobbers xmm0 and xmm10. Falls through to _cleanup.
359	;barrett reduction
360_barrett:
361	movdqa	xmm10, [rk7]	; rk7 and rk8 in xmm10
362	movdqa	xmm0, xmm7
363	pclmulqdq	xmm7, xmm10, 0x01	; high qword of xmm7 times rk7
364	pslldq	xmm7, 4
365	pclmulqdq	xmm7, xmm10, 0x11	; high qword of xmm7 times rk8
366
367	pslldq	xmm7, 4
368	pxor	xmm7, xmm0
369	pextrd	eax, xmm7,1	; eax = dword 1 (bits 32..63) of xmm7
370
; _cleanup -- common exit path.
;   On entry: eax = CRC scaled to 32 bits (value in bits 16..31).
;   Reached by fall-through from _barrett and, for len == 0, by je from
;   _less_than_32 with eax = the shifted initial CRC.
371_cleanup:
372	; scale the result back to 16 bits
373	shr	eax, 16
374%ifidn __OUTPUT_FORMAT__, win64
	; win64 only: reload xmm6-xmm13 from the slots written at function entry
375	movdqa	xmm6, [rsp+16*2]
376	movdqa	xmm7, [rsp+16*3]
377	movdqa	xmm8, [rsp+16*4]
378	movdqa	xmm9, [rsp+16*5]
379	movdqa	xmm10, [rsp+16*6]
380	movdqa	xmm11, [rsp+16*7]
381	movdqa	xmm12, [rsp+16*8]
382	movdqa	xmm13, [rsp+16*9]
383%endif
384	add	rsp, VARIABLE_OFFSET	; release the frame reserved at entry
385	ret
386
387
388;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
389;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
390;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
391;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
392
; _less_than_256 -- path taken when the entry check `cmp arg3, 256 / jl` succeeds.
;   Lengths below 32 go on to _less_than_32. Otherwise the first 16 bytes
;   are loaded into xmm7 (byte-reversed, XORed with the initial CRC) and the
;   rest is handed to _16B_reduction_loop with xmm10 = rk1 | rk2,
;   xmm11 = SHUF_MASK, arg2 = buf + 16 and arg3 = len - 32.
393align 16
394_less_than_256:
395
396	; check if there is enough buffer to be able to fold 16B at a time
397	cmp	arg3, 32
398	jl	_less_than_32
399	movdqa xmm11, [SHUF_MASK]
400
401	; if there is, load the constants
402	movdqa	xmm10, [rk1]	; rk1 and rk2 in xmm10
403
404	movd	xmm0, arg1_low32	; get the initial crc value
405	pslldq	xmm0, 12	; align it to its correct place
406	movdqu	xmm7, [arg2]	; load the plaintext
407	pshufb	xmm7, xmm11	; byte-reflect the plaintext
408	pxor	xmm7, xmm0
409
410
411	; update the buffer pointer
412	add	arg2, 16
413
414	; update the counter. subtract 32 instead of 16 to save one instruction from the loop
415	sub	arg3, 32	; arg3 = (bytes not yet loaded) - 16, the form _16B_reduction_loop expects
416
417	jmp	_16B_reduction_loop
418
419
; _less_than_32 -- dispatch for lengths below 32.
;   len == 0      : return the initial CRC through _cleanup.
;   len == 16     : _exact_16_left.
;   len <  16     : _less_than_16_left.
;   otherwise     : load the first 16 bytes into xmm7 and let
;                   _get_last_two_xmms absorb the remaining len - 16 bytes.
;   Sets xmm11 = SHUF_MASK and xmm0 = initial CRC in bytes 12..15 for the
;   paths it branches to.
420align 16
421_less_than_32:
422	; mov initial crc to the return value. this is necessary for zero-length buffers.
423	mov	eax, arg1_low32	; already shifted left by 16 at entry; _cleanup shifts it back
424	test	arg3, arg3
425	je	_cleanup
426
427	movdqa xmm11, [SHUF_MASK]
428
429	movd	xmm0, arg1_low32	; get the initial crc value
430	pslldq	xmm0, 12	; align it to its correct place
431
432	cmp	arg3, 16
433	je	_exact_16_left
434	jl	_less_than_16_left
435
436	movdqu	xmm7, [arg2]	; load the plaintext
437	pshufb	xmm7, xmm11	; byte-reflect the plaintext
438	pxor	xmm7, xmm0	; xor the initial crc value
439	add	arg2, 16
440	sub	arg3, 16	; arg3 = tail length for _get_last_two_xmms
441	movdqa	xmm10, [rk1]	; rk1 and rk2 in xmm10
442	jmp	_get_last_two_xmms
443
444
; _less_than_16_left -- inputs shorter than 16 bytes.
;   On entry (from _less_than_32): xmm0 = initial CRC in bytes 12..15,
;   xmm11 = SHUF_MASK, arg2 = buf, arg3 = len (nonzero, below 16).
;   The input is copied into a zeroed 16-byte scratch area at [rsp] so that
;   a full 16-byte load never reads past the caller's buffer. Lengths below
;   4 are handled by _only_less_than_4; the others are copied in 8/4/2/1
;   byte pieces, then shifted into place and passed to _128_done.
;   Uses r11 as the write cursor, r9 as the saved length, rax as scratch.
445align 16
446_less_than_16_left:
447	; use stack space to load data less than 16 bytes, zero-out the 16B in memory first.
448
449	pxor	xmm1, xmm1
450	mov	r11, rsp
451	movdqa	[r11], xmm1
452
453	cmp	arg3, 4
454	jl	_only_less_than_4
455
456	;	backup the counter value
457	mov	r9, arg3	; arg3 is decremented below; r9 keeps the original length for the final shift
458	cmp	arg3, 8
459	jl	_less_than_8_left
460
461	; load 8 Bytes
462	mov	rax, [arg2]
463	mov	[r11], rax
464	add	r11, 8
465	sub	arg3, 8
466	add	arg2, 8
467_less_than_8_left:
468
469	cmp	arg3, 4
470	jl	_less_than_4_left
471
472	; load 4 Bytes
473	mov	eax, [arg2]
474	mov	[r11], eax
475	add	r11, 4
476	sub	arg3, 4
477	add	arg2, 4
478_less_than_4_left:
479
480	cmp	arg3, 2
481	jl	_less_than_2_left
482
483	; load 2 Bytes
484	mov	ax, [arg2]
485	mov	[r11], ax
486	add	r11, 2
487	sub	arg3, 2
488	add	arg2, 2
489_less_than_2_left:
490	cmp     arg3, 1
491        jl      _zero_left
492
493	; load 1 Byte
494	mov	al, [arg2]
495	mov	[r11], al
496_zero_left:
497	movdqa	xmm7, [rsp]	; the copied input followed by zero padding
498	pshufb	xmm7, xmm11
499	pxor	xmm7, xmm0	; xor the initial crc value
500
	; shift xmm7 right by 16 - r9 bytes using the pshufb shift table
501	lea	rax, [pshufb_shf_table + 16]
502	sub	rax, r9
503	movdqu	xmm0, [rax]
504	pxor	xmm0, [mask1]	; flip bit 7 of every control byte
505
506	pshufb	xmm7, xmm0
507	jmp	_128_done
508
; _exact_16_left -- input is exactly 16 bytes.
;   On entry (from _less_than_32): xmm0 = initial CRC in bytes 12..15,
;   xmm11 = SHUF_MASK, arg2 = buf. The single block goes straight to _128_done.
509align 16
510_exact_16_left:
511	movdqu	xmm7, [arg2]
512	pshufb	xmm7, xmm11
513	pxor	xmm7, xmm0	; xor the initial crc value
514
515	jmp	_128_done
516
; _only_less_than_4 / _only_less_than_3 / _only_less_than_2 -- inputs of 3, 2 or 1 bytes.
;   On entry (from _less_than_16_left): r11 = rsp with the 16 bytes at [rsp]
;   zeroed, xmm0 = initial CRC in bytes 12..15, xmm11 = SHUF_MASK,
;   arg2 = buf, arg3 = len (below 4, nonzero).
;   Each path copies the bytes one at a time into the scratch area, loads and
;   byte-reverses it, XORs the initial CRC, shifts right by 5, 6 or 7 bytes
;   (3, 2 or 1 input bytes respectively) and jumps directly to _barrett,
;   bypassing _128_done.
517_only_less_than_4:
518	cmp	arg3, 3
519	jl	_only_less_than_3
520
521	; load 3 Bytes
522	mov	al, [arg2]
523	mov	[r11], al
524
525	mov	al, [arg2+1]
526	mov	[r11+1], al
527
528	mov	al, [arg2+2]
529	mov	[r11+2], al
530
531	movdqa	xmm7, [rsp]
532	pshufb	xmm7, xmm11
533	pxor	xmm7, xmm0	; xor the initial crc value
534
535	psrldq	xmm7, 5
536
537	jmp	_barrett
538_only_less_than_3:
539	cmp	arg3, 2
540	jl	_only_less_than_2
541
542	; load 2 Bytes
543	mov	al, [arg2]
544	mov	[r11], al
545
546	mov	al, [arg2+1]
547	mov	[r11+1], al
548
549	movdqa	xmm7, [rsp]
550	pshufb	xmm7, xmm11
551	pxor	xmm7, xmm0	; xor the initial crc value
552
553	psrldq	xmm7, 6
554
555	jmp	_barrett
556_only_less_than_2:
557
558	; load 1 Byte
559	mov	al, [arg2]
560	mov	[r11], al
561
562	movdqa	xmm7, [rsp]
563	pshufb	xmm7, xmm11
564	pxor	xmm7, xmm0	; xor the initial crc value
565
566	psrldq	xmm7, 7
567
568	jmp	_barrett
569
570section .data
571
572; precomputed constants
573; these constants are precomputed from the poly: 0x8bb70000 (0x8bb7 scaled to 32 bits)
; The code loads the constants two at a time with a 16-byte movdqa from the
; odd-numbered label, so each pair (rkN, rkN+1) must stay adjacent and
; 16-byte aligned:
;   rk1/rk2   16-byte folds (_16B_reduction_loop, _get_last_two_xmms) and the fold of xmm6 into xmm7
;   rk3/rk4   128-byte fold loop (_fold_128_B_loop)
;   rk5/rk6   64-bit and 32-bit folds in _128_done
;   rk7/rk8   _barrett
;   rk9..rk20 fold of xmm0..xmm5 into xmm7 after the 128-byte loop
;             (rk9/rk10 for xmm0 up to rk19/rk20 for xmm5); their derivation
;             is not listed below
574align 16
575; Q = 0x18BB70000
576; rk1 = 2^(32*3) mod Q << 32
577; rk2 = 2^(32*5) mod Q << 32
578; rk3 = 2^(32*15) mod Q << 32
579; rk4 = 2^(32*17) mod Q << 32
580; rk5 = 2^(32*3) mod Q << 32
581; rk6 = 2^(32*2) mod Q << 32
582; rk7 = floor(2^64/Q)
583; rk8 = Q
584rk1:
585DQ 0x2d56000000000000
586rk2:
587DQ 0x06df000000000000
588rk3:
589DQ 0x9d9d000000000000
590rk4:
591DQ 0x7cf5000000000000
592rk5:
593DQ 0x2d56000000000000
594rk6:
595DQ 0x1368000000000000
596rk7:
597DQ 0x00000001f65a57f8
598rk8:
599DQ 0x000000018bb70000
600
601rk9:
602DQ 0xceae000000000000
603rk10:
604DQ 0xbfd6000000000000
605rk11:
606DQ 0x1e16000000000000
607rk12:
608DQ 0x713c000000000000
609rk13:
610DQ 0xf7f9000000000000
611rk14:
612DQ 0x80a6000000000000
613rk15:
614DQ 0x044c000000000000
615rk16:
616DQ 0xe658000000000000
617rk17:
618DQ 0xad18000000000000
619rk18:
620DQ 0xa497000000000000
621rk19:
622DQ 0x6ee3000000000000
623rk20:
624DQ 0xe7b5000000000000
625
626
627
628
629
630
631
632
633
; mask1: 0x80 in every byte. XORed into a pshufb control vector to flip bit 7
;        of each control byte (pshufb writes zero where bit 7 is set).
634mask1:
635dq 0x8080808080808080, 0x8080808080808080
; mask2: keeps the low 96 bits and clears the top 32 bits (used by the 32-bit
;        fold in _128_done).
636mask2:
637dq 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF
638
; SHUF_MASK: pshufb control that reverses the order of the 16 bytes
;        (destination byte i takes source byte 15-i).
; mask1, mask2 and SHUF_MASK are all read as 16-byte memory operands of
; pxor/pand/movdqa, so they must remain 16-byte aligned.
639SHUF_MASK:
640dq 0x08090A0B0C0D0E0F, 0x0001020304050607
641
; pshufb_shf_table -- 32-byte sliding window of pshufb control bytes.
; The code loads 16 bytes from pshufb_shf_table + 16 - n (unaligned movdqu):
; used directly as a pshufb control the vector shifts a register left by n
; bytes, and after XOR with mask1 it shifts right by 16 - n bytes.
642pshufb_shf_table:
643; use these values for shift constants for the pshufb instruction
644; different alignments result in values as shown:
645;	dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
646;	dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
647;	dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
648;	dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
649;	dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
650;	dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
651;	dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9  (16-7) / shr7
652;	dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8  (16-8) / shr8
653;	dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7  (16-9) / shr9
654;	dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6  (16-10) / shr10
655;	dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5  (16-11) / shr11
656;	dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4  (16-12) / shr12
657;	dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3  (16-13) / shr13
658;	dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2  (16-14) / shr14
659;	dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1  (16-15) / shr15
; NOTE(review): the last byte of the table is 0x00, not 0x0f. With n in
; 1..15 the 16-byte loads cover table bytes 1..30, so that byte appears to be
; unread -- confirm before changing it.
660dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
661dq 0x0706050403020100, 0x000e0d0c0b0a0908
662
; NOTE(review): slversion is not defined in this file; presumably a macro
; from reg_sizes.asm that records version data for crc16_t10dif_01 -- confirm
; what it emits. The column header below names the arguments.
663;;;       func          core, ver, snum
664slversion crc16_t10dif_01, 01,   06,  0010
665
666