;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2019 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;       Function API:
;       uint64_t crc64_iso_refl_by16_10(
;               uint64_t init_crc, //initial CRC value, 64 bits
;               const unsigned char *buf, //buffer pointer to calculate CRC on
;               uint64_t len //buffer length in bytes (64-bit data)
;       );
;
;       Syntax: NASM (Intel operand order), 64-bit mode, RIP-relative data.
;       ABI:    Microsoft x64 when __OUTPUT_FORMAT__ is win64, otherwise
;               System V AMD64 (see the arg1/arg2/arg3 definitions below).
;       In:     arg1 = init_crc (bit-inverted on entry by this routine)
;               arg2 = buf      (advanced as data is consumed)
;               arg3 = len      (used as a biased "bytes remaining" counter)
;       Out:    rax  = computed value, bit-inverted before returning
;       Clobb:  rax, r9, r11, arg1-arg3, zmm0-zmm11, zmm16, flags.
;               On win64, xmm6-xmm15 are saved on entry and restored on exit.
;       Stack:  VARIABLE_OFFSET bytes.  The first 16 bytes at [rsp] are used
;               as scratch by the "< 16 bytes" path.  The vmovdqa accesses
;               relative to rsp need rsp to be 16-byte aligned after the
;               subtraction, i.e. the standard entry alignment rsp % 16 == 8.
;       NOTE:   len is tested with signed branches (jl/jge), so lengths of
;               2^63 bytes or more are not handled.
;
%include "reg_sizes.asm"

%ifndef FUNCTION_NAME
%define FUNCTION_NAME crc64_iso_refl_by16_10
%endif

%if (AS_FEATURE_LEVEL) >= 10

%define	fetch_dist	1024

[bits 64]
default rel

section .text


%ifidn __OUTPUT_FORMAT__, win64
	%xdefine	arg1 rcx
	%xdefine	arg2 rdx
	%xdefine	arg3 r8
%else
	%xdefine	arg1 rdi
	%xdefine	arg2 rsi
	%xdefine	arg3 rdx
%endif

%define TMP 16*0
%ifidn __OUTPUT_FORMAT__, win64
	%define XMM_SAVE 16*2
	%define VARIABLE_OFFSET 16*12+8
%else
	%define VARIABLE_OFFSET 16*2+8
%endif

align 16
mk_global FUNCTION_NAME, function
FUNCTION_NAME:
	endbranch
	not		arg1			; work on the inverted initial value
	sub		rsp, VARIABLE_OFFSET

%ifidn __OUTPUT_FORMAT__, win64
	; save xmm6-xmm15 on the stack; they are callee-saved in the Microsoft x64 ABI
	vmovdqa		[rsp + XMM_SAVE + 16*0], xmm6
	vmovdqa		[rsp + XMM_SAVE + 16*1], xmm7
	vmovdqa		[rsp + XMM_SAVE + 16*2], xmm8
	vmovdqa		[rsp + XMM_SAVE + 16*3], xmm9
	vmovdqa		[rsp + XMM_SAVE + 16*4], xmm10
	vmovdqa		[rsp + XMM_SAVE + 16*5], xmm11
	vmovdqa		[rsp + XMM_SAVE + 16*6], xmm12
	vmovdqa		[rsp + XMM_SAVE + 16*7], xmm13
	vmovdqa		[rsp + XMM_SAVE + 16*8], xmm14
	vmovdqa		[rsp + XMM_SAVE + 16*9], xmm15
%endif

	; buffers shorter than 256 bytes take the xmm-only paths further down
	cmp		arg3, 256
	jl		_less_than_256

	; load the initial crc value
	vmovq		xmm10, arg1		; initial crc (upper bits of zmm10 are zeroed)

	; load the first 128B of data, xor the initial crc value into the low qword
	vmovdqu8	zmm0, [arg2+16*0]
	vmovdqu8	zmm4, [arg2+16*4]
	vpxorq		zmm0, zmm10
	vbroadcasti32x4	zmm10, [rk3]		;zmm10 has rk3 and rk4
						;imm value of pclmulqdq instruction will determine which constant to use

	; arg3 = len - 256: 128 bytes are loaded, and the counter is biased by a
	; further -128 so the 128B loop can test the sign after its own subtraction
	sub		arg3, 256
	cmp		arg3, 256
	jl		_fold_128_B_loop

	; at least 512 bytes in total: load 128 more bytes and fold 256B per iteration
	vmovdqu8	zmm7, [arg2+16*8]
	vmovdqu8	zmm8, [arg2+16*12]
	vbroadcasti32x4	zmm16, [rk_1]		;zmm16 has rk-1 and rk-2
	sub		arg3, 256		; arg3 = len - 512 (256 loaded, biased by -256)

_fold_256_B_loop:
	add		arg2, 256
	vpclmulqdq	zmm1, zmm0, zmm16, 0x10
	vpclmulqdq	zmm0, zmm0, zmm16, 0x01
	vpternlogq	zmm0, zmm1, [arg2+16*0], 0x96	; 0x96 = three-way xor

	vpclmulqdq	zmm2, zmm4, zmm16, 0x10
	vpclmulqdq	zmm4, zmm4, zmm16, 0x01
	vpternlogq	zmm4, zmm2, [arg2+16*4], 0x96

	vpclmulqdq	zmm3, zmm7, zmm16, 0x10
	vpclmulqdq	zmm7, zmm7, zmm16, 0x01
	vpternlogq	zmm7, zmm3, [arg2+16*8], 0x96

	vpclmulqdq	zmm5, zmm8, zmm16, 0x10
	vpclmulqdq	zmm8, zmm8, zmm16, 0x01
	vpternlogq	zmm8, zmm5, [arg2+16*12], 0x96

	sub		arg3, 256
	jge		_fold_256_B_loop

	;; Fold 256 into 128
	add		arg2, 256
	vpclmulqdq	zmm1, zmm0, zmm10, 0x01
	vpclmulqdq	zmm2, zmm0, zmm10, 0x10
	vpternlogq	zmm7, zmm1, zmm2, 0x96	; xor ABC

	vpclmulqdq	zmm5, zmm4, zmm10, 0x01
	vpclmulqdq	zmm6, zmm4, zmm10, 0x10
	vpternlogq	zmm8, zmm5, zmm6, 0x96	; xor ABC

	vmovdqa32	zmm0, zmm7
	vmovdqa32	zmm4, zmm8

	; re-bias the counter from the 256B loop (-256) to the 128B convention (-128)
	add		arg3, 128
	jmp		_fold_128_B_register

	; fold 128B at a time. This section of the code folds 2 zmm registers in parallel
_fold_128_B_loop:
	add		arg2, 128		; update the buffer pointer
	vpclmulqdq	zmm1, zmm0, zmm10, 0x10
	vpclmulqdq	zmm0, zmm0, zmm10, 0x01
	vpternlogq	zmm0, zmm1, [arg2+16*0], 0x96

	vpclmulqdq	zmm5, zmm4, zmm10, 0x10
	vpclmulqdq	zmm4, zmm4, zmm10, 0x01
	vpternlogq	zmm4, zmm5, [arg2+16*4], 0x96

	sub		arg3, 128
	jge		_fold_128_B_loop
	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

	add		arg2, 128
	; at this point, the buffer pointer is pointing at the last y Bytes of the buffer, where 0 <= y < 128
	; the 128B of folded data is in 2 zmm registers: zmm0, zmm4

_fold_128_B_register:
	; (when entered from the 256B path, up to 255 bytes can still be unread;
	;  the 16B reduction loop below consumes them)
	; fold the 8 128b parts into 1 xmm register with different constants
	vmovdqu8	zmm16, [rk9]		; multiply by rk9-rk16
	vmovdqu8	zmm11, [rk17]		; multiply by rk17-rk20, rk1,rk2, 0,0
	vpclmulqdq	zmm1, zmm0, zmm16, 0x01
	vpclmulqdq	zmm2, zmm0, zmm16, 0x10
	vextracti64x2	xmm7, zmm4, 3		; save last that has no multiplicand

	vpclmulqdq	zmm5, zmm4, zmm11, 0x01
	vpclmulqdq	zmm6, zmm4, zmm11, 0x10
	vmovdqa		xmm10, [rk1]		; Needed later in reduction loop
	vpternlogq	zmm1, zmm2, zmm5, 0x96	; xor ABC
	vpternlogq	zmm1, zmm6, zmm7, 0x96	; xor ABC

	; xor the four 128-bit lanes of zmm1 together into xmm7
	vshufi64x2	zmm8, zmm1, zmm1, 0x4e	; Swap 1,0,3,2 - 01 00 11 10
	vpxorq		ymm8, ymm8, ymm1
	vextracti64x2	xmm5, ymm8, 1
	vpxorq		xmm7, xmm5, xmm8

	; instead of 128, we add 128-16 to the loop counter to save 1 instruction from the loop
	; instead of a cmp instruction, we use the negative flag with the jl instruction
	add		arg3, 128-16
	jl		_final_reduction_for_128

	; now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7 and the rest is in memory
	; we can fold 16 bytes at a time if y>=16
	; continue folding 16B at a time

_16B_reduction_loop:
	vmovdqa		xmm8, xmm7
	vpclmulqdq	xmm7, xmm10, 0x1
	vpclmulqdq	xmm8, xmm10, 0x10
	vpxor		xmm7, xmm8
	vmovdqu		xmm0, [arg2]
	vpxor		xmm7, xmm0
	add		arg2, 16
	sub		arg3, 16
	; instead of a cmp instruction, we utilize the flags with the jge instruction
	; equivalent of: cmp arg3, 16-16
	; check if there is any more 16B in the buffer to be able to fold
	jge		_16B_reduction_loop

	;now we have 16+z bytes left to reduce, where 0<= z < 16.
	;first, we reduce the data in the xmm7 register


_final_reduction_for_128:
	add		arg3, 16		; arg3 = z, the number of trailing bytes (0..15)
	je		_128_done
	; here we are getting data that is less than 16 bytes.
	; since we know that there was data before the pointer, we can offset
	; the input pointer before the actual point, to receive exactly 16 bytes.
	; after that the registers need to be adjusted.
_get_last_two_xmms:


	vmovdqa		xmm2, xmm7
	vmovdqu		xmm1, [arg2 - 16 + arg3]	; last 16 bytes of the buffer

	; get rid of the extra data that was loaded before
	; load the shift constant
	lea		rax, [pshufb_shf_table]
	add		rax, arg3
	vmovdqu		xmm0, [rax]


	vpshufb		xmm7, xmm0
	vpxor		xmm0, [mask3]		; flip bit 7 of every control byte
	vpshufb		xmm2, xmm0

	vpblendvb	xmm2, xmm2, xmm1, xmm0
	;;;;;;;;;;
	vmovdqa		xmm8, xmm7
	vpclmulqdq	xmm7, xmm10, 0x1

	vpclmulqdq	xmm8, xmm10, 0x10
	vpxor		xmm7, xmm8
	vpxor		xmm7, xmm2

_128_done:
	; compute crc of a 128-bit value
	vmovdqa		xmm10, [rk5]
	vmovdqa		xmm0, xmm7

	;64b fold
	vpclmulqdq	xmm7, xmm10, 0
	vpsrldq		xmm0, 8
	vpxor		xmm7, xmm0

	;barrett reduction
_barrett:
	vmovdqa		xmm1, xmm7
	vmovdqa		xmm10, [rk7]

	vpclmulqdq	xmm7, xmm10, 0
	vmovdqa		xmm2, xmm7
	vpclmulqdq	xmm7, xmm10, 0x10
	vpslldq		xmm2, 8
	vpxor		xmm7, xmm2
	vpxor		xmm7, xmm1
	vpextrq		rax, xmm7, 1		; result is the high qword of xmm7

_cleanup:
	not		rax			; undo the inversion applied on entry


%ifidn __OUTPUT_FORMAT__, win64
	vmovdqa		xmm6, [rsp + XMM_SAVE + 16*0]
	vmovdqa		xmm7, [rsp + XMM_SAVE + 16*1]
	vmovdqa		xmm8, [rsp + XMM_SAVE + 16*2]
	vmovdqa		xmm9, [rsp + XMM_SAVE + 16*3]
	vmovdqa		xmm10, [rsp + XMM_SAVE + 16*4]
	vmovdqa		xmm11, [rsp + XMM_SAVE + 16*5]
	vmovdqa		xmm12, [rsp + XMM_SAVE + 16*6]
	vmovdqa		xmm13, [rsp + XMM_SAVE + 16*7]
	vmovdqa		xmm14, [rsp + XMM_SAVE + 16*8]
	vmovdqa		xmm15, [rsp + XMM_SAVE + 16*9]
%endif
	; The >= 256 byte paths execute 512-bit instructions on zmm0-zmm11 and
	; leave their upper bits dirty.  Clear the upper vector state before
	; returning so legacy-SSE code in the caller does not pay AVX/SSE
	; transition penalties.  vzeroupper keeps bits 127:0 of every register,
	; so rax and the xmm6-xmm15 values restored above are unaffected.
	vzeroupper
	add		rsp, VARIABLE_OFFSET
	ret

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

align 16
_less_than_256:

	; check if there is enough buffer to be able to fold 16B at a time
	cmp		arg3, 32
	jl		_less_than_32

	; if there is, load the constants
	vmovdqa		xmm10, [rk1]		; rk1 and rk2 in xmm10

	vmovq		xmm0, arg1		; get the initial crc value
	vmovdqu		xmm7, [arg2]		; load the plaintext
	vpxor		xmm7, xmm0

	; update the buffer pointer
	add		arg2, 16

	; update the counter. subtract 32 instead of 16 to save one instruction from the loop
	sub		arg3, 32

	jmp		_16B_reduction_loop

align 16
_less_than_32:
	; mov initial crc to the return value. this is necessary for zero-length buffers.
	; (arg1 is already inverted; _cleanup inverts it back, so init_crc is returned)
	mov		rax, arg1
	test		arg3, arg3
	je		_cleanup

	vmovq		xmm0, arg1		; get the initial crc value

	cmp		arg3, 16
	je		_exact_16_left
	jl		_less_than_16_left

	; 17..31 bytes: take the first 16 bytes, then merge in the tail
	vmovdqu		xmm7, [arg2]		; load the plaintext
	vpxor		xmm7, xmm0		; xor the initial crc value
	add		arg2, 16
	sub		arg3, 16
	vmovdqa		xmm10, [rk1]		; rk1 and rk2 in xmm10
	jmp		_get_last_two_xmms


align 16
_less_than_16_left:
	; use stack space to load data less than 16 bytes, zero-out the 16B in memory first.

	vpxor		xmm1, xmm1
	mov		r11, rsp
	vmovdqa		[r11], xmm1

	; backup the counter value
	mov		r9, arg3
	cmp		arg3, 8
	jl		_less_than_8_left

	; load 8 Bytes
	mov		rax, [arg2]
	mov		[r11], rax
	add		r11, 8
	sub		arg3, 8
	add		arg2, 8
_less_than_8_left:

	cmp		arg3, 4
	jl		_less_than_4_left

	; load 4 Bytes
	mov		eax, [arg2]
	mov		[r11], eax
	add		r11, 4
	sub		arg3, 4
	add		arg2, 4
_less_than_4_left:

	cmp		arg3, 2
	jl		_less_than_2_left

	; load 2 Bytes
	mov		ax, [arg2]
	mov		[r11], ax
	add		r11, 2
	sub		arg3, 2
	add		arg2, 2
_less_than_2_left:
	cmp		arg3, 1
	jl		_zero_left

	; load 1 Byte
	mov		al, [arg2]
	mov		[r11], al

_zero_left:
	vmovdqa		xmm7, [rsp]		; the 1..15 input bytes, zero padded to 16
	vpxor		xmm7, xmm0		; xor the initial crc value

	lea		rax,[pshufb_shf_table]

	; r9 still holds the original length (1..15)
	cmp		r9, 8
	jl		_end_1to7

_end_8to15:
	vmovdqu		xmm0, [rax + r9]
	vpshufb		xmm7,xmm0
	jmp		_128_done

_end_1to7:
	; Left shift (8-length) bytes in XMM
	vmovdqu		xmm0, [rax + r9 + 8]
	vpshufb		xmm7,xmm0

	jmp		_barrett

align 16
_exact_16_left:
	vmovdqu		xmm7, [arg2]
	vpxor		xmm7, xmm0		; xor the initial crc value

	jmp		_128_done

section .data
align 32

%ifndef USE_CONSTS
; precomputed constants
; (rk1, rk5 and rk7 are read with aligned 16-byte loads; the layout below keeps
;  each of them on a 16-byte boundary relative to the "align 32" above)
rk_1: dq 0x45000000b0000000
rk_2: dq 0x6b700000f5000000
rk1:  dq 0xf500000000000001
rk2:  dq 0x6b70000000000001
rk3:  dq 0xb001000000010000
rk4:  dq 0xf501b0000001b000
rk5:  dq 0xf500000000000001
rk6:  dq 0x0000000000000000
rk7:  dq 0xb000000000000001
rk8:  dq 0xb000000000000000
rk9:  dq 0xe014514514501501
rk10: dq 0x771db6db6db71c71
rk11: dq 0xa101101101110001
rk12: dq 0x1ab1ab1ab1aab001
rk13: dq 0xf445014445000001
rk14: dq 0x6aab71daab700001
rk15: dq 0xb100010100000001
rk16: dq 0x01b001b1b0000001
rk17: dq 0xe145150000000001
rk18: dq 0x76db6c7000000001
rk19: dq 0xa011000000000001
rk20: dq 0x1b1ab00000000001

; tail of the 64-byte load from rk17: same values as rk1/rk2, then zero padding
rk_1b: dq 0xf500000000000001
rk_2b: dq 0x6b70000000000001
	dq 0x0000000000000000
	dq 0x0000000000000000
%else
INCLUDE_CONSTS
%endif

pshufb_shf_table:
; use these values for shift constants for the pshufb instruction
; different alignments result in values as shown:
;	dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
;	dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
;	dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
;	dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
;	dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
;	dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
;	dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9  (16-7) / shr7
;	dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8  (16-8) / shr8
;	dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7  (16-9) / shr9
;	dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6  (16-10) / shr10
;	dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5  (16-11) / shr11
;	dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4  (16-12) / shr12
;	dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3  (16-13) / shr13
;	dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2  (16-14) / shr14
;	dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1  (16-15) / shr15
; the code indexes this 32-byte table at a byte offset equal to the tail length
dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
dq 0x0706050403020100, 0x000e0d0c0b0a0908

mask:  dq 0xFFFFFFFFFFFFFFFF, 0x0000000000000000
mask2: dq 0xFFFFFFFF00000000, 0xFFFFFFFFFFFFFFFF
mask3: dq 0x8080808080808080, 0x8080808080808080

%else  ; Assembler doesn't understand these opcodes. Add empty symbol for windows.
%ifidn __OUTPUT_FORMAT__, win64
global no_ %+ FUNCTION_NAME
no_ %+ FUNCTION_NAME %+ :
%endif
%endif ; (AS_FEATURE_LEVEL) >= 10