;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2020 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;       Function API:
;       UINT32 crc32_ieee_by16_10(          ; or whatever FUNCTION_NAME is defined to
;               UINT32 init_crc, //initial CRC value, 32 bits
;               const unsigned char *buf, //buffer pointer to calculate CRC on
;               UINT64 len //buffer length in bytes (64-bit data)
;       );
;
;       Authors:
;               Erdinc Ozturk
;               Vinodh Gopal
;               James Guilford
;
;       Reference paper titled "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
;       URL: http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
;
;       Syntax: NASM, 64-bit. ABI: Microsoft x64 when __OUTPUT_FORMAT__ is win64,
;       otherwise the rdi/rsi/rdx argument registers are used (see arg1..arg3 below).
;

%include "reg_sizes.asm"

%ifndef FUNCTION_NAME
%define FUNCTION_NAME crc32_ieee_by16_10
%endif

%if (AS_FEATURE_LEVEL) >= 10

[bits 64]
default rel

section .text


; argument registers for the selected output format
%ifidn __OUTPUT_FORMAT__, win64
	%xdefine	arg1 rcx
	%xdefine	arg2 rdx
	%xdefine	arg3 r8

	%xdefine	arg1_low32 ecx
%else
	%xdefine	arg1 rdi
	%xdefine	arg2 rsi
	%xdefine	arg3 rdx

	%xdefine	arg1_low32 edi
%endif

; stack frame layout (offsets from rsp after the prologue)
;   [rsp + 0 .. 15]   16-byte scratch slot used by the "less than 16 bytes" paths
;   [rsp + XMM_SAVE]  win64 only: save area for xmm6-xmm15 (10 * 16 bytes)
; VARIABLE_OFFSET is 8 mod 16, so an rsp that is 8 mod 16 on entry becomes
; 16-byte aligned after "sub rsp, VARIABLE_OFFSET"; the vmovdqa accesses to the
; stack below rely on that.
%define TMP 16*0
%ifidn __OUTPUT_FORMAT__, win64
	%define XMM_SAVE 16*2
	%define VARIABLE_OFFSET 16*12+8
%else
	%define VARIABLE_OFFSET 16*2+8
%endif

;-----------------------------------------------------------------------
; UINT32 FUNCTION_NAME(UINT32 init_crc, const unsigned char *buf, UINT64 len)
; In:     arg1_low32 = init_crc, arg2 = buf, arg3 = len in bytes
; Out:    eax = crc
; Clobb:  rax, r9, r11, arg1, arg2, arg3, flags,
;         zmm0-zmm12, zmm14, zmm16-zmm18
;         (win64: the low 128 bits of xmm6-xmm15 are saved and restored)
; Stack:  VARIABLE_OFFSET bytes below the return address
; Notes:  zmm18 holds SHUF_MASK (byte-reflection shuffle) for the whole call.
;         Length tests use signed branches (jl/jge) because the loop counter
;         is deliberately driven negative.
;         NOTE(review): this presumably assumes len < 2^63 - confirm with callers.
;-----------------------------------------------------------------------
align 16
mk_global FUNCTION_NAME, function
FUNCTION_NAME:
	endbranch

	; work on the complement of init_crc; the result is complemented again at .cleanup
	not		arg1_low32
	sub		rsp, VARIABLE_OFFSET

%ifidn __OUTPUT_FORMAT__, win64
	; save xmm6-xmm15 on the stack so they can be restored before returning
	vmovdqa		[rsp + XMM_SAVE + 16*0], xmm6
	vmovdqa		[rsp + XMM_SAVE + 16*1], xmm7
	vmovdqa		[rsp + XMM_SAVE + 16*2], xmm8
	vmovdqa		[rsp + XMM_SAVE + 16*3], xmm9
	vmovdqa		[rsp + XMM_SAVE + 16*4], xmm10
	vmovdqa		[rsp + XMM_SAVE + 16*5], xmm11
	vmovdqa		[rsp + XMM_SAVE + 16*6], xmm12
	vmovdqa		[rsp + XMM_SAVE + 16*7], xmm13
	vmovdqa		[rsp + XMM_SAVE + 16*8], xmm14
	vmovdqa		[rsp + XMM_SAVE + 16*9], xmm15
%endif

	; zmm18 = SHUF_MASK replicated into every 128-bit lane
	vbroadcasti32x4	zmm18, [SHUF_MASK]
	cmp		arg3, 256
	jl		.less_than_256

	; load the initial crc value
	vmovd		xmm10, arg1_low32	; initial crc

	; crc value does not need to be byte-reflected, but it needs to be moved to the high part of the register.
	; because data will be byte-reflected and will align with initial crc at correct place.
	vpslldq		xmm10, 12

	; load the first 128 bytes of data into zmm0 and zmm4, byte-reflect them,
	; and xor the initial crc value into zmm0
	vmovdqu8	zmm0, [arg2+16*0]
	vmovdqu8	zmm4, [arg2+16*4]
	vpshufb		zmm0, zmm0, zmm18
	vpshufb		zmm4, zmm4, zmm18
	vpxorq		zmm0, zmm10
	vbroadcasti32x4	zmm10, [rk3]	;every 128-bit lane of zmm10 has rk3 and rk4
					;imm value of pclmulqdq instruction will determine which constant to use

	sub		arg3, 256
	cmp		arg3, 256
	jl		.fold_128_B_loop

	; at least 256 more bytes follow: load the second 128 bytes into zmm7 and zmm8
	; and fold 256 bytes per iteration
	vmovdqu8	zmm7, [arg2+16*8]
	vmovdqu8	zmm8, [arg2+16*12]
	vpshufb		zmm7, zmm7, zmm18
	vpshufb		zmm8, zmm8, zmm18
	vbroadcasti32x4	zmm16, [rk_1]	;zmm16 has rk-1 and rk-2
	sub		arg3, 256

	; each step: acc = clmul(acc, lo const) xor clmul(acc, hi const) xor new data
	; (vpternlogq with imm 0x96 is a three-way xor)
.fold_256_B_loop:
	add		arg2, 256
	vmovdqu8	zmm3, [arg2+16*0]
	vpshufb		zmm3, zmm3, zmm18
	vpclmulqdq	zmm1, zmm0, zmm16, 0x00
	vpclmulqdq	zmm0, zmm0, zmm16, 0x11
	vpternlogq	zmm0, zmm1, zmm3, 0x96

	vmovdqu8	zmm9, [arg2+16*4]
	vpshufb		zmm9, zmm9, zmm18
	vpclmulqdq	zmm5, zmm4, zmm16, 0x00
	vpclmulqdq	zmm4, zmm4, zmm16, 0x11
	vpternlogq	zmm4, zmm5, zmm9, 0x96

	vmovdqu8	zmm11, [arg2+16*8]
	vpshufb		zmm11, zmm11, zmm18
	vpclmulqdq	zmm12, zmm7, zmm16, 0x00
	vpclmulqdq	zmm7, zmm7, zmm16, 0x11
	vpternlogq	zmm7, zmm12, zmm11, 0x96

	vmovdqu8	zmm17, [arg2+16*12]
	vpshufb		zmm17, zmm17, zmm18
	vpclmulqdq	zmm14, zmm8, zmm16, 0x00
	vpclmulqdq	zmm8, zmm8, zmm16, 0x11
	vpternlogq	zmm8, zmm14, zmm17, 0x96

	sub		arg3, 256
	jge		.fold_256_B_loop

	;; Fold 256 into 128
	add		arg2, 256
	vpclmulqdq	zmm1, zmm0, zmm10, 0x00
	vpclmulqdq	zmm2, zmm0, zmm10, 0x11
	vpternlogq	zmm7, zmm1, zmm2, 0x96	; xor ABC

	vpclmulqdq	zmm5, zmm4, zmm10, 0x00
	vpclmulqdq	zmm6, zmm4, zmm10, 0x11
	vpternlogq	zmm8, zmm5, zmm6, 0x96	; xor ABC

	vmovdqa32	zmm0, zmm7
	vmovdqa32	zmm4, zmm8

	add		arg3, 128
	jmp		.fold_128_B_register



	; at this section of the code, there is 128*x+y (0<=y<128) bytes of buffer. The fold_128_B_loop
	; loop will fold 128B at a time until we have 128+y Bytes of buffer

	; fold 128B at a time. This section of the code folds 2 zmm registers (zmm0, zmm4) in parallel
.fold_128_B_loop:
	add		arg2, 128
	vmovdqu8	zmm8, [arg2+16*0]
	vpshufb		zmm8, zmm8, zmm18
	vpclmulqdq	zmm2, zmm0, zmm10, 0x00
	vpclmulqdq	zmm0, zmm0, zmm10, 0x11
	vpternlogq	zmm0, zmm2, zmm8, 0x96

	vmovdqu8	zmm9, [arg2+16*4]
	vpshufb		zmm9, zmm9, zmm18
	vpclmulqdq	zmm5, zmm4, zmm10, 0x00
	vpclmulqdq	zmm4, zmm4, zmm10, 0x11
	vpternlogq	zmm4, zmm5, zmm9, 0x96

	sub		arg3, 128
	jge		.fold_128_B_loop
	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

	add		arg2, 128
	; at this point, the buffer pointer is pointing at the last y Bytes of the buffer, where 0 <= y < 128
	; the 128B of folded data is in zmm0 and zmm4

.fold_128_B_register:
	; fold the 8 128b parts into 1 xmm register with different constants
	vmovdqu8	zmm16, [rk9]		; multiply by rk9-rk16
	vmovdqu8	zmm11, [rk17]		; multiply by rk17-rk20, rk1,rk2, 0,0
	vpclmulqdq	zmm1, zmm0, zmm16, 0x00
	vpclmulqdq	zmm2, zmm0, zmm16, 0x11
	vextracti64x2	xmm7, zmm4, 3		; save last that has no multiplicand

	vpclmulqdq	zmm5, zmm4, zmm11, 0x00
	vpclmulqdq	zmm6, zmm4, zmm11, 0x11
	vmovdqa		xmm10, [rk1]		; Needed later in reduction loop
	vpternlogq	zmm1, zmm2, zmm5, 0x96	; xor ABC
	vpternlogq	zmm1, zmm6, zmm7, 0x96	; xor ABC

	; xor the four 128-bit lanes of zmm1 together into xmm7
	vshufi64x2	zmm8, zmm1, zmm1, 0x4e	; Swap 1,0,3,2 - 01 00 11 10
	vpxorq		ymm8, ymm8, ymm1
	vextracti64x2	xmm5, ymm8, 1
	vpxorq		xmm7, xmm5, xmm8

	; instead of 128, we add 128-16 to the loop counter to save 1 instruction from the loop
	; instead of a cmp instruction, we use the negative flag with the jl instruction
	add		arg3, 128-16
	jl		.final_reduction_for_128

	; now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7 and the rest is in memory
	; we can fold 16 bytes at a time if y>=16
	; continue folding 16B at a time

.16B_reduction_loop:
	vpclmulqdq	xmm8, xmm7, xmm10, 0x11
	vpclmulqdq	xmm7, xmm7, xmm10, 0x00
	vpxor		xmm7, xmm8
	vmovdqu		xmm0, [arg2]
	vpshufb		xmm0, xmm0, xmm18
	vpxor		xmm7, xmm0
	add		arg2, 16
	sub		arg3, 16
	; instead of a cmp instruction, we utilize the flags with the jge instruction
	; equivalent of: cmp arg3, 16-16
	; check if there is any more 16B in the buffer to be able to fold
	jge		.16B_reduction_loop

	;now we have 16+z bytes left to reduce, where 0<= z < 16.
	;first, we reduce the data in the xmm7 register


.final_reduction_for_128:
	add		arg3, 16
	je		.128_done

	; here we are getting data that is less than 16 bytes.
	; since we know that there was data before the pointer, we can offset
	; the input pointer before the actual point, to receive exactly 16 bytes.
	; after that the registers need to be adjusted.
.get_last_two_xmms:

	vmovdqa		xmm2, xmm7
	vmovdqu		xmm1, [arg2 - 16 + arg3]
	vpshufb		xmm1, xmm18

	; get rid of the extra data that was loaded before
	; load the shift constant
	lea		rax, [pshufb_shf_table + 16]
	sub		rax, arg3
	vmovdqu		xmm0, [rax]

	vpshufb		xmm2, xmm0
	vpxor		xmm0, [mask1]
	vpshufb		xmm7, xmm0
	vpblendvb	xmm1, xmm1, xmm2, xmm0

	vpclmulqdq	xmm8, xmm7, xmm10, 0x11
	vpclmulqdq	xmm7, xmm7, xmm10, 0x00
	vpxor		xmm7, xmm8
	vpxor		xmm7, xmm1

.128_done:
	; compute crc of a 128-bit value
	vmovdqa		xmm10, [rk5]
	vmovdqa		xmm0, xmm7

	;64b fold
	vpclmulqdq	xmm7, xmm10, 0x01	; H*L
	vpslldq		xmm0, 8
	vpxor		xmm7, xmm0

	;32b fold
	vmovdqa		xmm0, xmm7
	vpand		xmm0, [mask2]
	vpsrldq		xmm7, 12
	vpclmulqdq	xmm7, xmm10, 0x10
	vpxor		xmm7, xmm0

	;barrett reduction
.barrett:
	vmovdqa		xmm10, [rk7]	; rk7 and rk8 in xmm10
	vmovdqa		xmm0, xmm7
	vpclmulqdq	xmm7, xmm10, 0x01
	vpslldq		xmm7, 4
	vpclmulqdq	xmm7, xmm10, 0x11

	vpslldq		xmm7, 4
	vpxor		xmm7, xmm0
	vpextrd		eax, xmm7, 1

	; common exit: eax holds the complemented result (or the complemented
	; init_crc when len == 0)
.cleanup:
	not		eax


%ifidn __OUTPUT_FORMAT__, win64
	vmovdqa		xmm6, [rsp + XMM_SAVE + 16*0]
	vmovdqa		xmm7, [rsp + XMM_SAVE + 16*1]
	vmovdqa		xmm8, [rsp + XMM_SAVE + 16*2]
	vmovdqa		xmm9, [rsp + XMM_SAVE + 16*3]
	vmovdqa		xmm10, [rsp + XMM_SAVE + 16*4]
	vmovdqa		xmm11, [rsp + XMM_SAVE + 16*5]
	vmovdqa		xmm12, [rsp + XMM_SAVE + 16*6]
	vmovdqa		xmm13, [rsp + XMM_SAVE + 16*7]
	vmovdqa		xmm14, [rsp + XMM_SAVE + 16*8]
	vmovdqa		xmm15, [rsp + XMM_SAVE + 16*9]
%endif
	; the upper halves of zmm0-zmm15 may be dirty at this point; clear them so
	; that callers using legacy SSE code do not pay AVX/SSE transition penalties.
	; only bits above 127 are affected: eax and the restored xmm6-xmm15 keep their values.
	vzeroupper
	add		rsp, VARIABLE_OFFSET
	ret


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

align 16
.less_than_256:

	; check if there is enough buffer to be able to fold 16B at a time
	cmp		arg3, 32
	jl		.less_than_32

	; if there is, load the constants
	vmovdqa		xmm10, [rk1]	; rk1 and rk2 in xmm10

	vmovd		xmm0, arg1_low32	; get the initial crc value
	vpslldq		xmm0, 12		; align it to its correct place
	vmovdqu		xmm7, [arg2]		; load the plaintext
	vpshufb		xmm7, xmm18		; byte-reflect the plaintext
	vpxor		xmm7, xmm0

	; update the buffer pointer
	add		arg2, 16

	; update the counter. subtract 32 instead of 16 to save one instruction from the loop
	sub		arg3, 32

	jmp		.16B_reduction_loop


align 16
.less_than_32:
	; mov initial crc to the return value. this is necessary for zero-length buffers.
	mov		eax, arg1_low32
	test		arg3, arg3
	je		.cleanup

	vmovd		xmm0, arg1_low32	; get the initial crc value
	vpslldq		xmm0, 12		; align it to its correct place

	cmp		arg3, 16
	je		.exact_16_left
	jl		.less_than_16_left

	; 17..31 bytes: take the first 16 bytes now, the tail is handled by .get_last_two_xmms
	vmovdqu		xmm7, [arg2]		; load the plaintext
	vpshufb		xmm7, xmm18
	vpxor		xmm7, xmm0		; xor the initial crc value
	add		arg2, 16
	sub		arg3, 16
	vmovdqa		xmm10, [rk1]		; rk1 and rk2 in xmm10
	jmp		.get_last_two_xmms

align 16
.less_than_16_left:
	; use stack space to load data less than 16 bytes, zero-out the 16B in memory first.

	vpxor		xmm1, xmm1
	mov		r11, rsp
	vmovdqa		[r11], xmm1

	cmp		arg3, 4
	jl		.only_less_than_4

	; backup the counter value
	mov		r9, arg3
	cmp		arg3, 8
	jl		.less_than_8_left

	; load 8 Bytes
	mov		rax, [arg2]
	mov		[r11], rax
	add		r11, 8
	sub		arg3, 8
	add		arg2, 8
.less_than_8_left:

	cmp		arg3, 4
	jl		.less_than_4_left

	; load 4 Bytes
	mov		eax, [arg2]
	mov		[r11], eax
	add		r11, 4
	sub		arg3, 4
	add		arg2, 4
.less_than_4_left:

	cmp		arg3, 2
	jl		.less_than_2_left

	; load 2 Bytes
	mov		ax, [arg2]
	mov		[r11], ax
	add		r11, 2
	sub		arg3, 2
	add		arg2, 2
.less_than_2_left:
	cmp		arg3, 1
	jl		.zero_left

	; load 1 Byte
	mov		al, [arg2]
	mov		[r11], al

.zero_left:
	vmovdqa		xmm7, [rsp]
	vpshufb		xmm7, xmm18
	vpxor		xmm7, xmm0	; xor the initial crc value

	; r9 = original byte count (4..15); pick the matching shift constant
	lea		rax, [pshufb_shf_table + 16]
	sub		rax, r9
	vmovdqu		xmm0, [rax]
	vpxor		xmm0, [mask1]

	vpshufb		xmm7,xmm0
	jmp		.128_done

align 16
.exact_16_left:
	vmovdqu		xmm7, [arg2]
	vpshufb		xmm7, xmm18
	vpxor		xmm7, xmm0	; xor the initial crc value
	jmp		.128_done

	; 1..3 bytes: copy them byte by byte into the zeroed stack slot, shift the
	; value into place and go straight to the barrett reduction
.only_less_than_4:
	cmp		arg3, 3
	jl		.only_less_than_3

	; load 3 Bytes
	mov		al, [arg2]
	mov		[r11], al

	mov		al, [arg2+1]
	mov		[r11+1], al

	mov		al, [arg2+2]
	mov		[r11+2], al

	vmovdqa		xmm7, [rsp]
	vpshufb		xmm7, xmm18
	vpxor		xmm7, xmm0	; xor the initial crc value

	vpsrldq		xmm7, 5
	jmp		.barrett

.only_less_than_3:
	cmp		arg3, 2
	jl		.only_less_than_2

	; load 2 Bytes
	mov		al, [arg2]
	mov		[r11], al

	mov		al, [arg2+1]
	mov		[r11+1], al

	vmovdqa		xmm7, [rsp]
	vpshufb		xmm7, xmm18
	vpxor		xmm7, xmm0	; xor the initial crc value

	vpsrldq		xmm7, 6
	jmp		.barrett

.only_less_than_2:
	; load 1 Byte
	mov		al, [arg2]
	mov		[r11], al

	vmovdqa		xmm7, [rsp]
	vpshufb		xmm7, xmm18
	vpxor		xmm7, xmm0	; xor the initial crc value

	vpsrldq		xmm7, 7
	jmp		.barrett

section .data
align 32

%ifndef USE_CONSTS
; precomputed constants
; the layout matters: the code loads adjacent pairs as 128-bit values and
; loads 64 bytes starting at rk9 and at rk17 (rk17-rk20, rk_1b, rk_2b, 0, 0)
rk_1: dq 0x1851689900000000
rk_2: dq 0xa3dc855100000000
rk1: dq 0xf200aa6600000000
rk2: dq 0x17d3315d00000000
rk3: dq 0x022ffca500000000
rk4: dq 0x9d9ee22f00000000
rk5: dq 0xf200aa6600000000
rk6: dq 0x490d678d00000000
rk7: dq 0x0000000104d101df
rk8: dq 0x0000000104c11db7
rk9: dq 0x6ac7e7d700000000
rk10: dq 0xfcd922af00000000
rk11: dq 0x34e45a6300000000
rk12: dq 0x8762c1f600000000
rk13: dq 0x5395a0ea00000000
rk14: dq 0x54f2d5c700000000
rk15: dq 0xd3504ec700000000
rk16: dq 0x57a8445500000000
rk17: dq 0xc053585d00000000
rk18: dq 0x766f1b7800000000
rk19: dq 0xcd8c54b500000000
rk20: dq 0xab40b71e00000000

rk_1b: dq 0xf200aa6600000000
rk_2b: dq 0x17d3315d00000000
	dq 0x0000000000000000
	dq 0x0000000000000000
%else
INCLUDE_CONSTS
%endif

mask1: dq 0x8080808080808080, 0x8080808080808080
mask2: dq 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF

SHUF_MASK: dq 0x08090A0B0C0D0E0F, 0x0001020304050607

pshufb_shf_table:
; use these values for shift constants for the pshufb instruction
; different alignments result in values as shown:
;	dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
;	dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
;	dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
;	dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
;	dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
;	dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
;	dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9  (16-7) / shr7
;	dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8  (16-8) / shr8
;	dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7  (16-9) / shr9
;	dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6  (16-10) / shr10
;	dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5  (16-11) / shr11
;	dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4  (16-12) / shr12
;	dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3  (16-13) / shr13
;	dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2  (16-14) / shr14
;	dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1  (16-15) / shr15
dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
dq 0x0706050403020100, 0x000e0d0c0b0a0908
dq 0x8080808080808080, 0x0f0e0d0c0b0a0908
dq 0x8080808080808080, 0x8080808080808080

%else  ; Assembler doesn't understand these opcodes. Add empty symbol for windows.
%ifidn __OUTPUT_FORMAT__, win64
global no_ %+ FUNCTION_NAME
no_ %+ FUNCTION_NAME %+ :
%endif
%endif ; (AS_FEATURE_LEVEL) >= 10