xref: /isa-l/crc/crc32_ieee_by16_10.asm (revision c2bec3ea65ce35b01311d1cc4b314f6b4986b9c8)
1;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2;  Copyright(c) 2011-2020 Intel Corporation All rights reserved.
3;
4;  Redistribution and use in source and binary forms, with or without
5;  modification, are permitted provided that the following conditions
6;  are met:
7;    * Redistributions of source code must retain the above copyright
8;      notice, this list of conditions and the following disclaimer.
9;    * Redistributions in binary form must reproduce the above copyright
10;      notice, this list of conditions and the following disclaimer in
11;      the documentation and/or other materials provided with the
12;      distribution.
13;    * Neither the name of Intel Corporation nor the names of its
14;      contributors may be used to endorse or promote products derived
15;      from this software without specific prior written permission.
16;
17;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
29
30;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
31;       Function API:
32;       UINT32 crc32_ieee_by16_10(
33;               UINT32 init_crc, //initial CRC value, 32 bits
34;               const unsigned char *buf, //buffer pointer to calculate CRC on
35;               UINT64 len //buffer length in bytes (64-bit data)
36;       );
37;
38;       Authors:
39;               Erdinc Ozturk
40;               Vinodh Gopal
41;               James Guilford
42;
43;       Reference paper titled "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
44;       URL: http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
45;
46;
47
48%include "reg_sizes.asm"
49
; The exported symbol name can be chosen by defining FUNCTION_NAME before
; this point; otherwise the default below is used.
50%ifndef FUNCTION_NAME
51%define FUNCTION_NAME crc32_ieee_by16_10
52%endif
53
; The implementation is assembled only when AS_FEATURE_LEVEL is at least 10
; (presumably provided by reg_sizes.asm or the build - not visible here).
; The matching %else near the end of the file emits only a placeholder.
54%if (AS_FEATURE_LEVEL) >= 10
55
56[bits 64]
; default rel: plain [label] memory operands are RIP-relative
57default rel
58
59section .text
60
61
; Argument register aliases for the two supported calling conventions.
; Per the API comment at the top of the file:
;   arg1 = init_crc (only the low 32 bits are used, via arg1_low32)
;   arg2 = buf
;   arg3 = len
62%ifidn __OUTPUT_FORMAT__, win64
63	%xdefine	arg1 rcx
64	%xdefine	arg2 rdx
65	%xdefine	arg3 r8
66
67	%xdefine	arg1_low32 ecx
68%else
69	%xdefine	arg1 rdi
70	%xdefine	arg2 rsi
71	%xdefine	arg3 rdx
72
73	%xdefine	arg1_low32 edi
74%endif
75
; Stack frame layout, as offsets from rsp after "sub rsp, VARIABLE_OFFSET":
;   TMP      - offset 0; the code below addresses this scratch slot as
;              [rsp] directly and does not reference TMP by name
;   XMM_SAVE - (win64 only) start of ten 16-byte slots for xmm6-xmm15
; VARIABLE_OFFSET is 16*n+8: with rsp at 8 (mod 16) on entry, subtracting it
; leaves rsp 16-byte aligned, which the vmovdqa accesses to the frame require.
76%define TMP 16*0
77%ifidn __OUTPUT_FORMAT__, win64
78	%define XMM_SAVE 16*2
79	%define VARIABLE_OFFSET 16*12+8
80%else
81	%define VARIABLE_OFFSET 16*2+8
82%endif
83
;-----------------------------------------------------------------------
; uint32 FUNCTION_NAME(uint32 init_crc, const unsigned char *buf, uint64 len)
;   (default symbol name: crc32_ieee_by16_10)
;
; Byte-swaps the input, folds it with carry-less multiplies (vpclmulqdq)
; and finishes with a Barrett reduction.
;
; In:    arg1_low32 = init_crc, arg2 = buf, arg3 = len in bytes
; Out:   eax = CRC
; Uses:  rax, r9, r11, arg1, arg2, arg3, flags,
;        zmm0-zmm12, zmm14, zmm16-zmm18
;        (xmm6-xmm15 are saved and restored on win64)
; Stack: VARIABLE_OFFSET bytes; [rsp] is a 16-byte scratch slot used by
;        the short-buffer paths that follow the ret
;
; Counter convention: arg3 is kept biased (remaining bytes minus one
; loop step) so that each loop can branch on the flags of its own sub.
;
; NOTE(review): every length test uses signed jumps (jl/jge), so a len
; of 2^63 or more is treated as negative - confirm callers never pass it.
;-----------------------------------------------------------------------
84align 16
85mk_global FUNCTION_NAME, function
86FUNCTION_NAME:
87	endbranch
88
	; work on the inverted crc; .cleanup inverts the result back
89	not		arg1_low32
90	sub		rsp, VARIABLE_OFFSET
91
92%ifidn __OUTPUT_FORMAT__, win64
93	; save xmm6-xmm15 on the stack; they are reloaded before returning
94	vmovdqa		[rsp + XMM_SAVE + 16*0], xmm6
95	vmovdqa		[rsp + XMM_SAVE + 16*1], xmm7
96	vmovdqa		[rsp + XMM_SAVE + 16*2], xmm8
97	vmovdqa		[rsp + XMM_SAVE + 16*3], xmm9
98	vmovdqa		[rsp + XMM_SAVE + 16*4], xmm10
99	vmovdqa		[rsp + XMM_SAVE + 16*5], xmm11
100	vmovdqa		[rsp + XMM_SAVE + 16*6], xmm12
101	vmovdqa		[rsp + XMM_SAVE + 16*7], xmm13
102	vmovdqa		[rsp + XMM_SAVE + 16*8], xmm14
103	vmovdqa		[rsp + XMM_SAVE + 16*9], xmm15
104%endif
105
	; zmm18 = byte-reversal shuffle control in every 128-bit lane;
	; it stays live for the whole function (xmm18 is its low lane)
106	vbroadcasti32x4 zmm18, [SHUF_MASK]
	; fewer than 256 bytes: handled by the out-of-line short paths
107	cmp		arg3, 256
108	jl		.less_than_256
109
110	; load the initial crc value
111	vmovd		xmm10, arg1_low32      ; initial crc
112
113	; crc value does not need to be byte-reflected, but it needs to be moved to the high part of the register.
114	; because data will be byte-reflected and will align with initial crc at correct place.
115	vpslldq		xmm10, 12
116
117	; load the first 128 bytes (2 x 64B), byte-swap each 16B lane, xor the initial crc value into the first lane
118	vmovdqu8	zmm0, [arg2+16*0]
119	vmovdqu8	zmm4, [arg2+16*4]
120	vpshufb		zmm0, zmm0, zmm18
121	vpshufb		zmm4, zmm4, zmm18
122	vpxorq		zmm0, zmm10
123	vbroadcasti32x4	zmm10, [rk3]	;every 128-bit lane of zmm10 has rk3 and rk4
124					;imm value of pclmulqdq instruction will determine which constant to use
125
	; arg3 = len - 256: bytes left after the 128 already loaded, minus 128
126	sub		arg3, 256
	; fewer than 512 bytes in total: use the 128-byte loop only
127	cmp		arg3, 256
128	jl		.fold_128_B_loop
129
	; load and byte-swap bytes 128..255 so that 256 bytes are in flight
	; in zmm0, zmm4, zmm7 and zmm8
130	vmovdqu8	zmm7, [arg2+16*8]
131	vmovdqu8	zmm8, [arg2+16*12]
132	vpshufb		zmm7, zmm7, zmm18
133	vpshufb		zmm8, zmm8, zmm18
134	vbroadcasti32x4 zmm16, [rk_1]	;every 128-bit lane of zmm16 has rk_1 and rk_2
	; arg3 = len - 512 (biased for the 256-byte loop)
135	sub		arg3, 256
136
	; each iteration folds the four accumulators over the next 256 bytes:
	; acc = clmul(acc.lo, k) xor clmul(acc.hi, k) xor new_data
	; (vpternlogq with imm 0x96 is a three-input xor)
137.fold_256_B_loop:
138	add		arg2, 256
139	vmovdqu8	zmm3, [arg2+16*0]
140	vpshufb		zmm3, zmm3, zmm18
141	vpclmulqdq	zmm1, zmm0, zmm16, 0x00
142	vpclmulqdq	zmm0, zmm0, zmm16, 0x11
143	vpternlogq	zmm0, zmm1, zmm3, 0x96
144
145	vmovdqu8	zmm9, [arg2+16*4]
146	vpshufb		zmm9, zmm9, zmm18
147	vpclmulqdq	zmm5, zmm4, zmm16, 0x00
148	vpclmulqdq	zmm4, zmm4, zmm16, 0x11
149	vpternlogq	zmm4, zmm5, zmm9, 0x96
150
151	vmovdqu8	zmm11, [arg2+16*8]
152	vpshufb		zmm11, zmm11, zmm18
153	vpclmulqdq	zmm12, zmm7, zmm16, 0x00
154	vpclmulqdq	zmm7, zmm7, zmm16, 0x11
155	vpternlogq	zmm7, zmm12, zmm11, 0x96
156
157	vmovdqu8	zmm17, [arg2+16*12]
158	vpshufb		zmm17, zmm17, zmm18
159	vpclmulqdq	zmm14, zmm8, zmm16, 0x00
160	vpclmulqdq	zmm8, zmm8, zmm16, 0x11
161	vpternlogq	zmm8, zmm14, zmm17, 0x96
162
	; loop while at least another 256 bytes remain
163	sub		arg3, 256
164	jge     	.fold_256_B_loop
165
166	;; Fold 256 into 128: zmm0 is folded into zmm7 and zmm4 into zmm8 using rk3/rk4
	; arg2 now points at the first unprocessed byte
167	add		arg2, 256
168	vpclmulqdq	zmm1, zmm0, zmm10, 0x00
169	vpclmulqdq	zmm2, zmm0, zmm10, 0x11
170	vpternlogq	zmm7, zmm1, zmm2, 0x96	; xor ABC
171
172	vpclmulqdq	zmm5, zmm4, zmm10, 0x00
173	vpclmulqdq	zmm6, zmm4, zmm10, 0x11
174	vpternlogq	zmm8, zmm5, zmm6, 0x96	; xor ABC
175
	; move the result to zmm0/zmm4, where .fold_128_B_register expects it
176	vmovdqa32	zmm0, zmm7
177	vmovdqa32	zmm4, zmm8
178
	; re-bias the counter from the 256-byte to the 128-byte convention
179	add		arg3, 128
180	jmp		.fold_128_B_register
181
182
183
184	; at this section of the code, there are 128*x+y (0<=y<128) bytes of buffer. The fold_128_B_loop
185	; will fold 128B at a time until we have 128+y Bytes of buffer
186
187	; fold 128B at a time. This section of the code folds two zmm registers (8 x 128-bit lanes) in parallel
188.fold_128_B_loop:
189	add		arg2, 128
190	vmovdqu8	zmm8, [arg2+16*0]
191	vpshufb		zmm8, zmm8, zmm18
192	vpclmulqdq	zmm2, zmm0, zmm10, 0x00
193	vpclmulqdq	zmm0, zmm0, zmm10, 0x11
194	vpternlogq	zmm0, zmm2, zmm8, 0x96
195
196	vmovdqu8	zmm9, [arg2+16*4]
197	vpshufb		zmm9, zmm9, zmm18
198	vpclmulqdq	zmm5, zmm4, zmm10, 0x00
199	vpclmulqdq	zmm4, zmm4, zmm10, 0x11
200	vpternlogq	zmm4, zmm5, zmm9, 0x96
201
	; loop while at least another 128 bytes remain
202	sub		arg3, 128
203	jge		.fold_128_B_loop
204	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
205
206	add		arg2, 128
207	; at this point, the buffer pointer is pointing at the last y Bytes of the buffer, where 0 <= y < 128
208	; the 128B of folded data is in zmm0 and zmm4
209
	; on entry from either path: arg3 = (bytes still in memory) - 128
210.fold_128_B_register:
211	; fold the 8 128b parts into 1 xmm register with different constants
212	vmovdqu8	zmm16, [rk9]		; multiply by rk9-rk16
213	vmovdqu8	zmm11, [rk17]		; multiply by rk17-rk20, rk1,rk2 (stored as rk_1b,rk_2b), 0,0
214	vpclmulqdq	zmm1, zmm0, zmm16, 0x00
215	vpclmulqdq	zmm2, zmm0, zmm16, 0x11
216	vextracti64x2	xmm7, zmm4, 3		; save last that has no multiplicand
217
218	vpclmulqdq	zmm5, zmm4, zmm11, 0x00
219	vpclmulqdq	zmm6, zmm4, zmm11, 0x11
220	vmovdqa		xmm10, [rk1]		; Needed later in reduction loop
221	vpternlogq	zmm1, zmm2, zmm5, 0x96	; xor ABC
222	vpternlogq	zmm1, zmm6, zmm7, 0x96	; xor ABC
223
	; xor the four 128-bit lanes of zmm1 together into xmm7
224	vshufi64x2      zmm8, zmm1, zmm1, 0x4e ; Swap 1,0,3,2 - 01 00 11 10
225	vpxorq          ymm8, ymm8, ymm1
226	vextracti64x2   xmm5, ymm8, 1
227	vpxorq          xmm7, xmm5, xmm8
228
229	; instead of 128, we add 128-16 to the loop counter to save 1 instruction from the loop
230	; instead of a cmp instruction, we use the negative flag with the jl instruction
231	add		arg3, 128-16
232	jl		.final_reduction_for_128
233
234	; now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7 and the rest is in memory
235	; we can fold 16 bytes at a time if y>=16
236	; continue folding 16B at a time
237
	; also entered from .less_than_256 with xmm7 = first 16 bytes xor crc,
	; xmm10 = rk1:rk2 and arg3 = len - 32
238.16B_reduction_loop:
239	vpclmulqdq	xmm8, xmm7, xmm10, 0x11
240	vpclmulqdq	xmm7, xmm7, xmm10, 0x00
241	vpxor		xmm7, xmm8
242	vmovdqu		xmm0, [arg2]
243	vpshufb		xmm0, xmm0, xmm18
244	vpxor		xmm7, xmm0
245	add		arg2, 16
246	sub		arg3, 16
247	; instead of a cmp instruction, we utilize the flags with the jge instruction
248	; equivalent of: cmp arg3, 16-16
249	; check if there is any more 16B in the buffer to be able to fold
250	jge		.16B_reduction_loop
251
252	;now we have 16+z bytes left to reduce, where 0<= z < 16.
253	;first, we reduce the data in the xmm7 register
254
255
256.final_reduction_for_128:
	; undo the -16 bias: arg3 = z, the number of tail bytes (0..15)
257	add		arg3, 16
258	je		.128_done
259
260	; here we are getting data that is less than 16 bytes.
261	; since we know that there was data before the pointer, we can offset
262	; the input pointer before the actual point, to receive exactly 16 bytes.
263	; after that the registers need to be adjusted.
	; also entered from .less_than_32 (16 < len < 32) with arg3 = len - 16
264.get_last_two_xmms:
265
266	vmovdqa		xmm2, xmm7
	; xmm1 = the last 16 bytes of the buffer, byte-swapped
267	vmovdqu		xmm1, [arg2 - 16 + arg3]
268	vpshufb		xmm1, xmm18
269
270	; get rid of the extra data that was loaded before
271	; load the shift constant: 16 bytes read at pshufb_shf_table + 16 - arg3
272	lea		rax, [pshufb_shf_table + 16]
273	sub		rax, arg3
274	vmovdqu		xmm0, [rax]
275
	; xmm2 is shuffled with the table value, xmm7 with the same value
	; after flipping the high bit of every control byte (xor mask1);
	; vpblendvb then merges xmm2 into xmm1 under control of xmm0
276	vpshufb		xmm2, xmm0
277	vpxor		xmm0, [mask1]
278	vpshufb		xmm7, xmm0
279	vpblendvb	xmm1, xmm1, xmm2, xmm0
280
	; one more 16-byte fold with rk1/rk2, then xor in the merged tail
281	vpclmulqdq	xmm8, xmm7, xmm10, 0x11
282	vpclmulqdq	xmm7, xmm7, xmm10, 0x00
283	vpxor		xmm7, xmm8
284	vpxor		xmm7, xmm1
285
	; entry condition: xmm7 holds the 128-bit value still to be reduced
286.128_done:
287	; compute crc of a 128-bit value
288	vmovdqa		xmm10, [rk5]	; rk5 and rk6 in xmm10
289	vmovdqa		xmm0, xmm7
290
291	;64b fold
292	vpclmulqdq	xmm7, xmm10, 0x01	; H*L
293	vpslldq		xmm0, 8
294	vpxor		xmm7, xmm0
295
296	;32b fold
297	vmovdqa		xmm0, xmm7
298	vpand		xmm0, [mask2]
299	vpsrldq		xmm7, 12
300	vpclmulqdq	xmm7, xmm10, 0x10
301	vpxor		xmm7, xmm0
302
303	;barrett reduction
	; also entered directly by the 1..3 byte paths after the ret
304.barrett:
305	vmovdqa		xmm10, [rk7]	; rk7 and rk8 in xmm10
306	vmovdqa		xmm0, xmm7
307	vpclmulqdq	xmm7, xmm10, 0x01
308	vpslldq		xmm7, 4
309	vpclmulqdq	xmm7, xmm10, 0x11
310
311	vpslldq		xmm7, 4
312	vpxor		xmm7, xmm0
	; the result is dword 1 of xmm7
313	vpextrd		eax, xmm7, 1
314
	; also entered from .less_than_32 for len == 0 with eax = inverted
	; init_crc, so the not below returns init_crc unchanged
315.cleanup:
316	not		eax
317
318
319%ifidn __OUTPUT_FORMAT__, win64
	; reload the xmm registers saved at entry
320	vmovdqa		xmm6, [rsp + XMM_SAVE + 16*0]
321	vmovdqa		xmm7, [rsp + XMM_SAVE + 16*1]
322	vmovdqa		xmm8, [rsp + XMM_SAVE + 16*2]
323	vmovdqa		xmm9, [rsp + XMM_SAVE + 16*3]
324	vmovdqa		xmm10, [rsp + XMM_SAVE + 16*4]
325	vmovdqa		xmm11, [rsp + XMM_SAVE + 16*5]
326	vmovdqa		xmm12, [rsp + XMM_SAVE + 16*6]
327	vmovdqa		xmm13, [rsp + XMM_SAVE + 16*7]
328	vmovdqa		xmm14, [rsp + XMM_SAVE + 16*8]
329	vmovdqa		xmm15, [rsp + XMM_SAVE + 16*9]
330%endif
	; NOTE(review): no vzeroupper is issued before returning although
	; zmm0-zmm12 and zmm14 were written - confirm whether callers that
	; run legacy SSE code need one here.
331	add		rsp, VARIABLE_OFFSET
332	ret
333
334
335;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
336;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
337;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
338;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
339
; Short-buffer path for len < 256 (signed compare at the function entry).
; Buffers of at least 32 bytes seed xmm7 with the first 16 bytes xor the
; crc and join the 16-byte reduction loop; shorter ones go to .less_than_32.
; xmm18 already holds the byte-swap shuffle control.
340align 16
341.less_than_256:
342
343	; check if there is enough buffer to be able to fold 16B at a time
344	cmp	arg3, 32
345	jl	.less_than_32
346
347	; if there is, load the constants
348	vmovdqa	xmm10, [rk1]    ; rk1 and rk2 in xmm10
349
350	vmovd	xmm0, arg1_low32	; get the initial crc value (already inverted at entry)
351	vpslldq	xmm0, 12		; move it to the top dword of the register
352	vmovdqu	xmm7, [arg2]		; load the first 16 bytes of the buffer
353	vpshufb	xmm7, xmm18		; byte-swap them
354	vpxor	xmm7, xmm0		; xor the initial crc value
355
356	; update the buffer pointer
357	add	arg2, 16
358
359	; update the counter. subtract 32 instead of 16 to save one instruction from the loop
	; (16 bytes were consumed above; the extra 16 is the bias the loop's sub/jge expects)
360	sub	arg3, 32
361
362	jmp	.16B_reduction_loop
363
364
; Path for len < 32. Dispatches on the length:
;   0       -> return init_crc unchanged (via .cleanup)
;   1..15   -> .less_than_16_left
;   16      -> .exact_16_left
;   17..31  -> load the first 16 bytes, then merge the tail in
;              .get_last_two_xmms
; Both 16-or-less targets are entered with xmm0 = crc in its top dword.
365align 16
366.less_than_32:
367	; mov initial crc to the return value. this is necessary for zero-length buffers.
	; (arg1_low32 was inverted at entry and .cleanup inverts eax again)
368	mov	eax, arg1_low32
369	test	arg3, arg3
370	je	.cleanup
371
372	vmovd	xmm0, arg1_low32	; get the initial crc value
373	vpslldq	xmm0, 12		; align it to its correct place
374
	; one cmp feeds both branches: equal -> 16 bytes, less -> 1..15 bytes
375	cmp	arg3, 16
376	je	.exact_16_left
377	jl	.less_than_16_left
378
379	vmovdqu	xmm7, [arg2]		; load the first 16 bytes of the buffer
380	vpshufb	xmm7, xmm18		; byte-swap them
381	vpxor	xmm7, xmm0		; xor the initial crc value
382	add	arg2, 16
383	sub	arg3, 16		; arg3 = number of tail bytes (1..15)
384	vmovdqa	xmm10, [rk1]		; rk1 and rk2 in xmm10
385	jmp	.get_last_two_xmms
386
; Path for 1 <= len <= 15, entered from .less_than_32 with xmm0 = crc in
; its top dword. The input is copied piecewise (8/4/2/1 bytes) into the
; zeroed 16-byte scratch slot at [rsp] so that no byte beyond the buffer
; is read. Lengths below 4 branch to .only_less_than_4 instead.
; Uses rax, r9 (original length), r11 (write cursor into the slot).
387align 16
388.less_than_16_left:
389	; use stack space to load data less than 16 bytes, zero-out the 16B in memory first.
390
391	vpxor	xmm1, xmm1
392	mov	r11, rsp
393	vmovdqa	[r11], xmm1
394
395	cmp	arg3, 4
396	jl	.only_less_than_4
397
398	; backup the counter value (arg3 is consumed by the copy below)
399	mov	r9, arg3
400	cmp	arg3, 8
401	jl	.less_than_8_left
402
403	; copy 8 Bytes
404	mov	rax, [arg2]
405	mov	[r11], rax
406	add	r11, 8
407	sub	arg3, 8
408	add	arg2, 8
409.less_than_8_left:
410
411	cmp	arg3, 4
412	jl	.less_than_4_left
413
414	; copy 4 Bytes
415	mov	eax, [arg2]
416	mov	[r11], eax
417	add	r11, 4
418	sub	arg3, 4
419	add	arg2, 4
420.less_than_4_left:
421
422	cmp	arg3, 2
423	jl	.less_than_2_left
424
425	; copy 2 Bytes
426	mov	ax, [arg2]
427	mov	[r11], ax
428	add	r11, 2
429	sub	arg3, 2
430	add	arg2, 2
431.less_than_2_left:
432	cmp	arg3, 1
433	jl	.zero_left
434
435	; copy 1 Byte
436	mov	al, [arg2]
437	mov	[r11], al
438
439.zero_left:
	; xmm7 = zero-padded input, byte-swapped, with the crc xored in
440	vmovdqa	xmm7, [rsp]
441	vpshufb	xmm7, xmm18
442	vpxor	xmm7, xmm0	; xor the initial crc value
443
	; shuffle control = 16 bytes at pshufb_shf_table + 16 - len with the
	; high bit of every byte flipped; the resulting vpshufb shifts xmm7
	; right by (16 - len) bytes
444	lea	rax, [pshufb_shf_table + 16]
445	sub	rax, r9
446	vmovdqu	xmm0, [rax]
447	vpxor	xmm0, [mask1]
448
449	vpshufb	xmm7,xmm0
450	jmp	.128_done
451
; Path for len == 16, entered from .less_than_32 with xmm0 = crc in its
; top dword: the whole buffer fits one register, so it goes straight to
; the 128-bit reduction.
452align 16
453.exact_16_left:
454	vmovdqu	xmm7, [arg2]	; load all 16 bytes
455	vpshufb	xmm7, xmm18	; byte-swap them
456	vpxor	xmm7, xmm0      ; xor the initial crc value
457	jmp	.128_done
458
; Paths for len = 3, 2 and 1, entered from .less_than_16_left with
; r11 = rsp, the 16-byte slot at [rsp] zeroed and xmm0 = crc in its top
; dword. The bytes are copied one at a time into the slot; the combined
; value is shifted right by a per-length byte count and handed directly
; to .barrett, skipping the 64-bit and 32-bit fold steps of .128_done.
459.only_less_than_4:
460	cmp	arg3, 3
461	jl	.only_less_than_3
462
463	; copy 3 Bytes
464	mov	al, [arg2]
465	mov	[r11], al
466
467	mov	al, [arg2+1]
468	mov	[r11+1], al
469
470	mov	al, [arg2+2]
471	mov	[r11+2], al
472
473	vmovdqa	xmm7, [rsp]
474	vpshufb	xmm7, xmm18	; byte-swap the zero-padded input
475	vpxor	xmm7, xmm0	; xor the initial crc value
476
477	vpsrldq	xmm7, 5		; shift right by 5 bytes for the 3-byte case
478	jmp	.barrett
479
480.only_less_than_3:
481	cmp	arg3, 2
482	jl	.only_less_than_2
483
484	; copy 2 Bytes
485	mov	al, [arg2]
486	mov	[r11], al
487
488	mov	al, [arg2+1]
489	mov	[r11+1], al
490
491	vmovdqa	xmm7, [rsp]
492	vpshufb	xmm7, xmm18	; byte-swap the zero-padded input
493	vpxor	xmm7, xmm0	; xor the initial crc value
494
495	vpsrldq	xmm7, 6		; shift right by 6 bytes for the 2-byte case
496	jmp	.barrett
497
498.only_less_than_2:
499	; copy 1 Byte
500	mov	al, [arg2]
501	mov	[r11], al
502
503	vmovdqa	xmm7, [rsp]
504	vpshufb	xmm7, xmm18	; byte-swap the zero-padded input
505	vpxor	xmm7, xmm0      ; xor the initial crc value
506
507	vpsrldq	xmm7, 7		; shift right by 7 bytes for the 1-byte case
508	jmp	.barrett
509
; Constant tables used by the CRC routine above. The code in this file
; only reads them.
; NOTE(review): they live in a writable .data section; confirm against the
; project's convention whether a read-only section is preferred.
510section .data
511align 32
512
; A wrapper may supply its own constants by defining USE_CONSTS and
; INCLUDE_CONSTS; otherwise the defaults below are used.
;
; Layout matters - the code loads adjacent entries as one vector:
;   16-byte pairs : rk_1:rk_2, rk1:rk2, rk3:rk4, rk5:rk6, rk7:rk8
;   64 bytes at rk9  : rk9 .. rk16
;   64 bytes at rk17 : rk17 .. rk20, rk_1b, rk_2b, and two zero qwords
; rk_1b/rk_2b hold the same values as rk1/rk2.
; rk1, rk5 and rk7 are read with aligned loads (vmovdqa), so each pair
; must stay 16-byte aligned.
513%ifndef USE_CONSTS
514; precomputed constants
515rk_1: dq 0x1851689900000000
516rk_2: dq 0xa3dc855100000000
517rk1:  dq 0xf200aa6600000000
518rk2:  dq 0x17d3315d00000000
519rk3:  dq 0x022ffca500000000
520rk4:  dq 0x9d9ee22f00000000
521rk5:  dq 0xf200aa6600000000
522rk6:  dq 0x490d678d00000000
523rk7:  dq 0x0000000104d101df
524rk8:  dq 0x0000000104c11db7
525rk9:  dq 0x6ac7e7d700000000
526rk10: dq 0xfcd922af00000000
527rk11: dq 0x34e45a6300000000
528rk12: dq 0x8762c1f600000000
529rk13: dq 0x5395a0ea00000000
530rk14: dq 0x54f2d5c700000000
531rk15: dq 0xd3504ec700000000
532rk16: dq 0x57a8445500000000
533rk17: dq 0xc053585d00000000
534rk18: dq 0x766f1b7800000000
535rk19: dq 0xcd8c54b500000000
536rk20: dq 0xab40b71e00000000
537
538rk_1b: dq 0xf200aa6600000000
539rk_2b: dq 0x17d3315d00000000
540	dq 0x0000000000000000
541	dq 0x0000000000000000
542%else
543INCLUDE_CONSTS
544%endif
545
; mask1: xored into a pshufb control vector to flip the high bit of every byte
; mask2: anded with the intermediate value in the "32b fold" step (keeps the low 96 bits)
546mask1: dq 0x8080808080808080, 0x8080808080808080
547mask2: dq 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF
548
; pshufb control that reverses the 16 bytes of a 128-bit lane
549SHUF_MASK: dq 0x08090A0B0C0D0E0F, 0x0001020304050607
550
; The code reads 16 bytes at (pshufb_shf_table + 16 - n) for a byte count n
; of 1..15, i.e. a sliding window over the rows below.
; NOTE(review): with n limited to 1..15 the last two rows appear not to be
; reached by the reads in this file - confirm before removing them.
551pshufb_shf_table:
552; use these values for shift constants for the pshufb instruction
553; different alignments result in values as shown:
554;       dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
555;       dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
556;       dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
557;       dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
558;       dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
559;       dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
560;       dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9  (16-7) / shr7
561;       dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8  (16-8) / shr8
562;       dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7  (16-9) / shr9
563;       dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6  (16-10) / shr10
564;       dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5  (16-11) / shr11
565;       dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4  (16-12) / shr12
566;       dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3  (16-13) / shr13
567;       dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2  (16-14) / shr14
568;       dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1  (16-15) / shr15
569dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
570dq 0x0706050403020100, 0x000e0d0c0b0a0908
571dq 0x8080808080808080, 0x0f0e0d0c0b0a0908
572dq 0x8080808080808080, 0x8080808080808080
573
; Fallback when AS_FEATURE_LEVEL is below 10: none of the code or data above
; is assembled. For win64 output a global placeholder label named
; no_<FUNCTION_NAME> is emitted instead; other output formats get nothing.
574%else  ; Assembler doesn't understand these opcodes. Add empty symbol for windows.
575%ifidn __OUTPUT_FORMAT__, win64
576global no_ %+ FUNCTION_NAME
; %+ pastes the tokens together, forming the label "no_<FUNCTION_NAME>:"
577no_ %+ FUNCTION_NAME %+ :
578%endif
579%endif ; (AS_FEATURE_LEVEL) >= 10
580