1;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2; Copyright(c) 2011-2016 Intel Corporation All rights reserved. 3; 4; Redistribution and use in source and binary forms, with or without 5; modification, are permitted provided that the following conditions 6; are met: 7; * Redistributions of source code must retain the above copyright 8; notice, this list of conditions and the following disclaimer. 9; * Redistributions in binary form must reproduce the above copyright 10; notice, this list of conditions and the following disclaimer in 11; the documentation and/or other materials provided with the 12; distribution. 13; * Neither the name of Intel Corporation nor the names of its 14; contributors may be used to endorse or promote products derived 15; from this software without specific prior written permission. 16; 17; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 18; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 19; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 20; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 21; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 22; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 23; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 24; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 25; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%include "sha512_job.asm"
%include "sha512_mb_mgr_datastruct.asm"

%include "reg_sizes.asm"

; 2-lane AVX SHA-512 compute core: hashes the common (minimum) number of
; blocks across both lanes in one call.
extern sha512_mb_x2_avx

%ifidn __OUTPUT_FORMAT__, elf64
; Linux register definitions
%define arg1	rdi ; rcx
%define arg2	rsi ; rdx

; idx needs to be other than arg1, arg2, rbx, r12
%define idx		rdx ; rsi
%define last_len	rdx ; rsi

%define size_offset	rcx ; rdi
%define tmp2		rcx ; rdi

%else
; WINDOWS register definitions
%define arg1	rcx
%define arg2	rdx

; idx needs to be other than arg1, arg2, rbx, r12
%define last_len	rsi
%define idx		rsi

%define size_offset	rdi
%define tmp2		rdi

%endif

; Common definitions
; NOTE: several symbolic names alias the same physical register (e.g. job,
; len2 and p2 are all arg2). Lifetimes must not overlap — preserve statement
; order when editing.
%define state	arg1
%define job	arg2
%define len2	arg2
%define p2	arg2

%define p		r11
%define start_offset	r11

%define unused_lanes	rbx

%define job_rax		rax
%define len		rax

%define lane		rbp
%define tmp3		rbp
%define lens3		rbp

%define extra_blocks	r8
%define lens0		r8

%define tmp		r9
%define lens1		r9

%define lane_data	r10
%define lens2		r10

; Local stack frame: 10 xmm save slots (for win64 xmm6-xmm15), 5 GPR save
; slots, and one slot holding the caller's original rsp.
struc stack_frame
	.xmm: resb 16*10
	.gpr: resb 8*5
	.rsp: resb 8
endstruc

; STACK_SPACE needs to be an odd multiple of 8
; NOTE(review): the names below are misleading but functional — _XMM_SAVE is
; the offset of the GPR save area (stack_frame.gpr) and _GPR_SAVE is the
; offset of the saved-rsp slot (stack_frame.rsp). The xmm registers are
; actually saved at [rsp + 16*n], i.e. offset 0 (stack_frame.xmm).
%define _XMM_SAVE	stack_frame.gpr
%define _GPR_SAVE	stack_frame.rsp
%define STACK_SPACE	stack_frame_size

; ISAL_SHA512_JOB* _sha512_mb_mgr_submit_avx(ISAL_SHA512_MB_JOB_MGR *state, ISAL_SHA512_JOB *job)
; arg 1 : rcx : state
; arg 2 : rdx : job
;
; Queues "job" into a free lane of the 2-lane manager. If a free lane remains
; afterwards, returns NULL (job is only queued). Otherwise both lanes are
; busy: runs sha512_mb_x2_avx for the minimum common length, completes the
; shortest job, frees its lane and returns that job.
;
; Clobbers: rax, rcx, rdx, r8-r11, xmm0-xmm3, flags.
; Callee-saved rbx, rbp, r12 (and rsi, rdi, xmm6-xmm15 on win64) are
; saved/restored in the local frame.
mk_global _sha512_mb_mgr_submit_avx, function, internal
_sha512_mb_mgr_submit_avx:
	endbranch			; CET/IBT landing pad for indirect calls

	mov	rax, rsp		; remember caller rsp before realignment

	sub	rsp, STACK_SPACE
	and	rsp, ~31		; 32-byte align the local frame

	mov	[rsp + stack_frame.rsp], rax	; save original rsp for epilogue

	; Save callee-saved GPRs (see NOTE above: _XMM_SAVE = GPR area)
	mov	[rsp + _XMM_SAVE + 8*0], rbx
	mov	[rsp + _XMM_SAVE + 8*1], rbp
	mov	[rsp + _XMM_SAVE + 8*2], r12
%ifidn __OUTPUT_FORMAT__, win64
	; win64 additionally treats rsi, rdi and xmm6-xmm15 as callee-saved
	mov	[rsp + _XMM_SAVE + 8*3], rsi
	mov	[rsp + _XMM_SAVE + 8*4], rdi
	vmovdqa	[rsp + 16*0], xmm6
	vmovdqa	[rsp + 16*1], xmm7
	vmovdqa	[rsp + 16*2], xmm8
	vmovdqa	[rsp + 16*3], xmm9
	vmovdqa	[rsp + 16*4], xmm10
	vmovdqa	[rsp + 16*5], xmm11
	vmovdqa	[rsp + 16*6], xmm12
	vmovdqa	[rsp + 16*7], xmm13
	vmovdqa	[rsp + 16*8], xmm14
	vmovdqa	[rsp + 16*9], xmm15
%endif

	; Pop a free lane index from the unused_lanes byte-stack
	; (one lane index per byte, low byte = next free lane).
	mov	unused_lanes, [state + _unused_lanes]
	movzx	lane, BYTE(unused_lanes)
	shr	unused_lanes, 8
	imul	lane_data, lane, _LANE_DATA_size
	mov	dword [job + _status], ISAL_STS_BEING_PROCESSED
	lea	lane_data, [state + _ldata + lane_data]
	mov	[state + _unused_lanes], unused_lanes
	mov	DWORD(len), [job + _len]

	mov	[lane_data + _job_in_lane], job
	; Each lens[] entry is 8 bytes: length in the upper dword, lane index in
	; the low bits (extracted with "and idx, 0xF" in the min search below).
	; Only the upper dword is written here.
	mov	[state + _lens + 4 + 8*lane], DWORD(len)


	; Load digest words from result_digest
	vmovdqa	xmm0, [job + _result_digest + 0*16]
	vmovdqa	xmm1, [job + _result_digest + 1*16]
	vmovdqa	xmm2, [job + _result_digest + 2*16]
	vmovdqa	xmm3, [job + _result_digest + 3*16]
	; Scatter the 8 x 64-bit digest words into the transposed per-lane
	; layout of args_digest (stride 32 bytes between words, 8*lane offset).
	vmovq	[state + _args_digest + 8*lane + 0*32], xmm0
	vpextrq	[state + _args_digest + 8*lane + 1*32], xmm0, 1
	vmovq	[state + _args_digest + 8*lane + 2*32], xmm1
	vpextrq	[state + _args_digest + 8*lane + 3*32], xmm1, 1
	vmovq	[state + _args_digest + 8*lane + 4*32], xmm2
	vpextrq	[state + _args_digest + 8*lane + 5*32], xmm2, 1
	vmovq	[state + _args_digest + 8*lane + 6*32], xmm3
	vpextrq	[state + _args_digest + 8*lane + 7*32], xmm3, 1

	mov	p, [job + _buffer]
	mov	[state + _args_data_ptr + 8*lane], p

	add	dword [state + _num_lanes_inuse], 1
	; 0xFF is the "stack empty" sentinel: if more lane bytes remain, a lane
	; is still free, so just queue this job and return NULL.
	cmp	unused_lanes, 0xff
	jne	return_null

start_loop:

	; Find min length across the 2 lanes (64-bit compare: length occupies
	; the upper dword, so it dominates; lane index rides in the low bits)
	mov	lens0, [state + _lens + 0*8]
	mov	idx, lens0
	mov	lens1, [state + _lens + 1*8]
	cmp	lens1, idx
	cmovb	idx, lens1

	mov	len2, idx
	and	idx, 0xF		; idx = lane index of the shortest job
	and	len2, ~0xFF		; len2 = length bits only
	jz	len_is_0		; min length 0 => that job is already done

	; Subtract the common length from both lanes, then pass the block
	; count (upper dword) to the 2-lane compute core.
	sub	lens0, len2
	sub	lens1, len2
	shr	len2, 32
	mov	[state + _lens + 0*8], lens0
	mov	[state + _lens + 1*8], lens1

	; "state" and "args" are the same address, arg1
	; len is arg2
	call	sha512_mb_x2_avx
	; state and idx are intact

len_is_0:

	; process completed job "idx"
	imul	lane_data, idx, _LANE_DATA_size
	lea	lane_data, [state + _ldata + lane_data]

	mov	job_rax, [lane_data + _job_in_lane]

	; Release the lane: push its index back onto the unused_lanes byte-stack
	mov	unused_lanes, [state + _unused_lanes]
	mov	qword [lane_data + _job_in_lane], 0
	mov	dword [job_rax + _status], ISAL_STS_COMPLETED
	shl	unused_lanes, 8
	or	unused_lanes, idx
	mov	[state + _unused_lanes], unused_lanes

	sub	dword [state + _num_lanes_inuse], 1

	; Gather the finished lane's digest back from the transposed layout
	vmovq	xmm0, [state + _args_digest + 8*idx + 0*32]
	vpinsrq	xmm0, [state + _args_digest + 8*idx + 1*32], 1
	vmovq	xmm1, [state + _args_digest + 8*idx + 2*32]
	vpinsrq	xmm1, [state + _args_digest + 8*idx + 3*32], 1
	vmovq	xmm2, [state + _args_digest + 8*idx + 4*32]
	vpinsrq	xmm2, [state + _args_digest + 8*idx + 5*32], 1
	vmovq	xmm3, [state + _args_digest + 8*idx + 6*32]
	vpinsrq	xmm3, [state + _args_digest + 8*idx + 7*32], 1


	vmovdqa	[job_rax + _result_digest + 0*16], xmm0
	vmovdqa	[job_rax + _result_digest + 1*16], xmm1
	vmovdqa	[job_rax + _result_digest + 2*16], xmm2
	vmovdqa	[job_rax + _result_digest + 3*16], xmm3

return:

	; Epilogue: restore callee-saved state and the caller's rsp
%ifidn __OUTPUT_FORMAT__, win64
	vmovdqa	xmm6,  [rsp + 16*0]
	vmovdqa	xmm7,  [rsp + 16*1]
	vmovdqa	xmm8,  [rsp + 16*2]
	vmovdqa	xmm9,  [rsp + 16*3]
	vmovdqa	xmm10, [rsp + 16*4]
	vmovdqa	xmm11, [rsp + 16*5]
	vmovdqa	xmm12, [rsp + 16*6]
	vmovdqa	xmm13, [rsp + 16*7]
	vmovdqa	xmm14, [rsp + 16*8]
	vmovdqa	xmm15, [rsp + 16*9]
	mov	rsi, [rsp + _XMM_SAVE + 8*3]
	mov	rdi, [rsp + _XMM_SAVE + 8*4]
%endif
	mov	rbx, [rsp + _XMM_SAVE + 8*0]
	mov	rbp, [rsp + _XMM_SAVE + 8*1]
	mov	r12, [rsp + _XMM_SAVE + 8*2]
	mov	rsp, [rsp + stack_frame.rsp]

	ret

return_null:
	xor	job_rax, job_rax	; return NULL: job queued, none completed
	jmp	return

section .data align=16
align 16
; NOTE(review): these 32-bit words are the SHA-256 initial hash values
; (FIPS 180-4), not the 64-bit SHA-512 initial state, and no code in this
; file references H0-H7. They look like unused copy-paste leftovers from the
; SHA-256 manager — confirm no external consumer before removing. They also
; sit in .data although they are never written (candidates for .rodata).
H0:	dd 0x6a09e667
H1:	dd 0xbb67ae85
H2:	dd 0x3c6ef372
H3:	dd 0xa54ff53a
H4:	dd 0x510e527f
H5:	dd 0x9b05688c
H6:	dd 0x1f83d9ab
H7:	dd 0x5be0cd19