;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2016 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%include "sha512_job.asm"
%include "sha512_mb_mgr_datastruct.asm"
%include "reg_sizes.asm"

extern sha512_mb_x8_avx512

[bits 64]
default rel
section .text

%ifidn __OUTPUT_FORMAT__, elf64
; LINUX register definitions
%define arg1            rdi
%define arg2            rsi

; idx needs to be other than arg1, arg2, rbx, r12
%define idx             rdx
%else
; WINDOWS register definitions
%define arg1            rcx
%define arg2            rdx

; idx needs to be other than arg1, arg2, rbx, r12
%define idx             rsi
%endif

; Common definitions.
; Several aliases deliberately share one physical register; each pair is
; never live at the same time (lane_data is dead before unused_lanes is
; loaded, tmp is dead before job_rax is loaded).
%define state           arg1    ; manager pointer, first argument
%define len2            arg2    ; minimum length, second argument of the x8 kernel

%define num_lanes_inuse r9
%define unused_lanes    rbx
%define lane_data       rbx

%define job_rax         rax     ; return value
%define tmp             rax

struc stack_frame
        .xmm:   resb 16*10      ; xmm6-xmm15 (only written for win64 output)
        .gpr:   resb 8*8        ; rbx, rsi, rdi, rbp, r12-r15
        .rsp:   resb 8          ; caller's rsp
endstruc

; STACK_SPACE needs to be an odd multiple of 8
; (16*10 + 8*8 + 8 = 232 = 29*8), so that rsp is 16-byte aligned again
; after the return address pushed by the caller.
%define _XMM_SAVE       stack_frame.xmm
%define _GPR_SAVE       stack_frame.gpr
%define STACK_SPACE     stack_frame_size

%define APPEND(a,b) a %+ b

;-----------------------------------------------------------------------
; ISAL_SHA512_JOB* _sha512_mb_mgr_flush_avx512(ISAL_SHA512_MB_JOB_MGR *state)
;
; In:    arg1 = state (rdi for elf64 output, rcx otherwise)
; Out:   rax  = job that was in the lane with the smallest remaining
;               length, with _status set to ISAL_STS_COMPLETED and the
;               lane's digest copied to _result_digest;
;               0 when _num_lanes_inuse is zero.
;
; Steps:
;   1. Return 0 if no lane is in use.
;   2. Pick a lane that holds a job and copy its data pointer into every
;      empty lane; empty lanes get length 0xFFFFFFFF so they never win
;      the minimum search.
;   3. Find the minimum length and its lane index, subtract that length
;      from all lanes, and (if it is non-zero) run sha512_mb_x8_avx512.
;   4. Release the lane (push its index onto _unused_lanes, decrement
;      _num_lanes_inuse) and hand the finished job back.
;
; Saved and restored: rbx, rbp, r12-r15; for win64 output also rsi, rdi
; and xmm6-xmm15.
; Relies on sha512_mb_x8_avx512 leaving "state" and "idx" intact.
;-----------------------------------------------------------------------
mk_global _sha512_mb_mgr_flush_avx512, function, internal
_sha512_mb_mgr_flush_avx512:
        endbranch

        mov     rax, rsp
        sub     rsp, STACK_SPACE
        mov     [rsp + stack_frame.rsp], rax    ; epilogue restores rsp from here

        mov     [rsp + _GPR_SAVE + 8*0], rbx
        mov     [rsp + _GPR_SAVE + 8*3], rbp
        mov     [rsp + _GPR_SAVE + 8*4], r12
        mov     [rsp + _GPR_SAVE + 8*5], r13
        mov     [rsp + _GPR_SAVE + 8*6], r14
        mov     [rsp + _GPR_SAVE + 8*7], r15
%ifidn __OUTPUT_FORMAT__, win64
        mov     [rsp + _GPR_SAVE + 8*1], rsi
        mov     [rsp + _GPR_SAVE + 8*2], rdi
        vmovdqu [rsp + _XMM_SAVE + 16*0], xmm6
        vmovdqu [rsp + _XMM_SAVE + 16*1], xmm7
        vmovdqu [rsp + _XMM_SAVE + 16*2], xmm8
        vmovdqu [rsp + _XMM_SAVE + 16*3], xmm9
        vmovdqu [rsp + _XMM_SAVE + 16*4], xmm10
        vmovdqu [rsp + _XMM_SAVE + 16*5], xmm11
        vmovdqu [rsp + _XMM_SAVE + 16*6], xmm12
        vmovdqu [rsp + _XMM_SAVE + 16*7], xmm13
        vmovdqu [rsp + _XMM_SAVE + 16*8], xmm14
        vmovdqu [rsp + _XMM_SAVE + 16*9], xmm15
%endif

        ; Nothing to flush when no lane is in use.
        ; The 32-bit load zero-extends, so testing the dword is sufficient.
        mov     DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
        test    DWORD(num_lanes_inuse), DWORD(num_lanes_inuse)
        jz      return_null

        ; find a lane with a non-null job
        ; idx starts at 0 and is overwritten by every occupied lane 1..7,
        ; so the highest occupied lane wins.
        xor     idx, idx
%assign I 1
%rep 7
        cmp     qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
        cmovne  idx, [APPEND(lane_,I)]
%assign I (I+1)
%endrep

        ; copy idx to empty lanes
copy_lane_data:
        mov     tmp, [state + _args + _data_ptr + 8*idx]

%assign I 0
%rep 8
        cmp     qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
        jne     APPEND(skip_,I)
        mov     [state + _args + _data_ptr + 8*I], tmp
        mov     dword [state + _lens + 4 + 8*I], 0xFFFFFFFF    ; max length: never the minimum
APPEND(skip_,I):
%assign I (I+1)
%endrep

        ; Find min length, len in sha512_mgr is 64bit, high 32bit is block num,
        ; low bits are the lane index (masked with 0xF below)
        vmovdqu ymm0, [state + _lens + 0*32]    ; lanes 0..3
        vmovdqu ymm1, [state + _lens + 1*32]    ; lanes 4..7

        vpminuq ymm2, ymm0, ymm1                ; 4 candidate qwords
        ; Swap the two qwords inside each 128-bit half. Both sources are
        ; ymm2 so no uninitialised register is read.
        vpalignr ymm3, ymm2, ymm2, 8
        vpminuq ymm2, ymm2, ymm3                ; each 128-bit half holds its own minimum
        vperm2i128 ymm3, ymm2, ymm2, 1          ; swap the 128-bit halves
        vpminuq ymm2, ymm2, ymm3                ; low qword = overall minimum

        vmovq   idx, xmm2
        mov     len2, idx
        and     idx, 0xF                        ; idx  = lane index
        shr     len2, 32                        ; len2 = block count (SHA512 blocksize is 1024bit)
        jz      len_is_0                        ; nothing left to hash for that lane

        ; Build a vector holding the minimum length in the high dword of
        ; every qword (low dword zero) and subtract it from all lanes.
        vperm2i128 ymm2, ymm2, ymm2, 0          ; low 128 bits copied to both halves
        vpand   ymm2, ymm2, [rel clear_low_nibble] ; keep only the high dword of the low qword
        vpshufd ymm2, ymm2, 0x44                ; replicate that qword within each half

        vpsubd  ymm0, ymm0, ymm2
        vpsubd  ymm1, ymm1, ymm2

        vmovdqu [state + _lens + 0*32], ymm0
        vmovdqu [state + _lens + 1*32], ymm1

        ; "state" and "args" are the same address, arg1
        ; len is arg2
        call    sha512_mb_x8_avx512
        ; state and idx are intact

len_is_0:
        ; process completed job "idx"
        imul    lane_data, idx, _LANE_DATA_size
        lea     lane_data, [state + _ldata + lane_data]

        mov     job_rax, [lane_data + _job_in_lane]
        mov     qword [lane_data + _job_in_lane], 0     ; lane is now empty
        mov     dword [job_rax + _status], ISAL_STS_COMPLETED

        ; push the freed lane index onto the byte-packed free list
        mov     unused_lanes, [state + _unused_lanes]
        shl     unused_lanes, 8
        or      unused_lanes, idx
        mov     [state + _unused_lanes], unused_lanes

        mov     DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
        sub     num_lanes_inuse, 1
        mov     [state + _num_lanes_inuse], DWORD(num_lanes_inuse)

        ; Gather the 8 digest qwords of lane idx (one every 64 bytes in
        ; _args_digest) into 4 xmm registers.
        vmovq   xmm0, [state + _args_digest + 8*idx + 0*64]
        vpinsrq xmm0, [state + _args_digest + 8*idx + 1*64], 1
        vmovq   xmm1, [state + _args_digest + 8*idx + 2*64]
        vpinsrq xmm1, [state + _args_digest + 8*idx + 3*64], 1
        vmovq   xmm2, [state + _args_digest + 8*idx + 4*64]
        vpinsrq xmm2, [state + _args_digest + 8*idx + 5*64], 1
        vmovq   xmm3, [state + _args_digest + 8*idx + 6*64]
        vpinsrq xmm3, [state + _args_digest + 8*idx + 7*64], 1

        ; Unaligned stores: no alignment requirement is placed on the job.
        vmovdqu [job_rax + _result_digest + 0*16], xmm0
        vmovdqu [job_rax + _result_digest + 1*16], xmm1
        vmovdqu [job_rax + _result_digest + 2*16], xmm2
        vmovdqu [job_rax + _result_digest + 3*16], xmm3

        ; ymm registers were used above; clear the upper halves so the
        ; caller does not pay AVX/SSE transition penalties.
        vzeroupper

return:

%ifidn __OUTPUT_FORMAT__, win64
        vmovdqu xmm6,  [rsp + _XMM_SAVE + 16*0]
        vmovdqu xmm7,  [rsp + _XMM_SAVE + 16*1]
        vmovdqu xmm8,  [rsp + _XMM_SAVE + 16*2]
        vmovdqu xmm9,  [rsp + _XMM_SAVE + 16*3]
        vmovdqu xmm10, [rsp + _XMM_SAVE + 16*4]
        vmovdqu xmm11, [rsp + _XMM_SAVE + 16*5]
        vmovdqu xmm12, [rsp + _XMM_SAVE + 16*6]
        vmovdqu xmm13, [rsp + _XMM_SAVE + 16*7]
        vmovdqu xmm14, [rsp + _XMM_SAVE + 16*8]
        vmovdqu xmm15, [rsp + _XMM_SAVE + 16*9]
        mov     rsi, [rsp + _GPR_SAVE + 8*1]
        mov     rdi, [rsp + _GPR_SAVE + 8*2]
%endif
        mov     rbx, [rsp + _GPR_SAVE + 8*0]
        mov     rbp, [rsp + _GPR_SAVE + 8*3]
        mov     r12, [rsp + _GPR_SAVE + 8*4]
        mov     r13, [rsp + _GPR_SAVE + 8*5]
        mov     r14, [rsp + _GPR_SAVE + 8*6]
        mov     r15, [rsp + _GPR_SAVE + 8*7]

        mov     rsp, [rsp + stack_frame.rsp]

        ret

return_null:
        xor     job_rax, job_rax                ; no job to return
        jmp     return

section .data align=32

align 32
clear_low_nibble:       ; mgr len element 0xnnnnnnnn 0000000m, nnnnnnnn is blocknum, m is index
        ; Mask keeps the block count of the low qword of each 128-bit half
        ; and clears everything else.
        dq 0xFFFFFFFF00000000, 0x0000000000000000
        dq 0xFFFFFFFF00000000, 0x0000000000000000
; Lane numbers as memory operands for the cmovne lane search above.
lane_1: dq 1
lane_2: dq 2
lane_3: dq 3
lane_4: dq 4
lane_5: dq 5
lane_6: dq 6
lane_7: dq 7