1;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2; Copyright(c) 2011-2016 Intel Corporation All rights reserved. 3; 4; Redistribution and use in source and binary forms, with or without 5; modification, are permitted provided that the following conditions 6; are met: 7; * Redistributions of source code must retain the above copyright 8; notice, this list of conditions and the following disclaimer. 9; * Redistributions in binary form must reproduce the above copyright 10; notice, this list of conditions and the following disclaimer in 11; the documentation and/or other materials provided with the 12; distribution. 13; * Neither the name of Intel Corporation nor the names of its 14; contributors may be used to endorse or promote products derived 15; from this software without specific prior written permission. 16; 17; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 18; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 19; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 20; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 21; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 22; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 23; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 24; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 25; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%include "sha512_job.asm"
%include "sha512_mb_mgr_datastruct.asm"
%include "reg_sizes.asm"

; 8-lane AVX-512 SHA-512 compression core; processes min-length blocks
; across all lanes. Contract (per comment below): preserves state/arg1
; and idx across the call.
extern sha512_mb_x8_avx512

%ifidn __OUTPUT_FORMAT__, elf64
; LINUX register definitions
%define arg1    rdi ; rcx
%define arg2    rsi ; rdx

; idx needs to be other than arg1, arg2, rbx, r12
; (idx must survive the call to sha512_mb_x8_avx512, which preserves it)
%define idx     rdx ; rsi
%define last_len rdx ; rsi

%define size_offset rcx ; rdi
%define tmp2    rcx ; rdi

%else
; WINDOWS register definitions
%define arg1    rcx
%define arg2    rdx

; idx needs to be other than arg1, arg2, rbx, r12
%define last_len rsi
%define idx     rsi

%define size_offset rdi
%define tmp2    rdi

%endif

; Common definitions.  Several names alias one physical register; the
; aliases are never live at the same time (e.g. job is consumed before
; len2 is produced in arg2).
%define state   arg1
%define job     arg2
%define len2    arg2
%define p2      arg2

%define p       r11
%define start_offset r11

%define unused_lanes rbx

%define job_rax rax
%define len     rax

%define lane    rbp
%define tmp3    rbp
%define lens3   rbp

%define extra_blocks r8
%define lens0   r8

%define num_lanes_inuse r9
%define tmp     r9
%define lens1   r9

%define lane_data r10
%define lens2   r10

; Stack layout: 10 xmm save slots at [rsp+0], then 8 GPR save slots,
; then the caller's rsp.  NOTE(review): the _XMM_SAVE macro actually
; names the GPR area (stack_frame.gpr) — historical naming, kept as-is;
; the xmm registers themselves are saved at [rsp + 16*0 .. 16*9].
struc stack_frame
	.xmm: resb 16*10
	.gpr: resb 8*8
	.rsp: resb 8
endstruc

; STACK_SPACE needs to be an odd multiple of 8
%define _XMM_SAVE stack_frame.gpr
%define _GPR_SAVE stack_frame.rsp
%define STACK_SPACE stack_frame_size

;-----------------------------------------------------------------------
; ISAL_SHA512_JOB* _sha512_mb_mgr_submit_avx512(ISAL_SHA512_MB_JOB_MGR *state, ISAL_SHA512_JOB *job)
; arg 1 : rcx : state  (rdi on ELF64)
; arg 2 : rdx : job    (rsi on ELF64)
; Return: rax = completed job pointer, or NULL if no job completed yet.
;
; Places "job" in a free lane of the manager.  If that fills all 8
; lanes, runs the x8 core for the minimum length over all lanes, then
; retires and returns the lane that finished.  Otherwise returns NULL.
; Saves/restores rbx, rbp, r12-r15 (and rsi, rdi, xmm6-xmm15 on win64).
;-----------------------------------------------------------------------
mk_global _sha512_mb_mgr_submit_avx512, function, internal
_sha512_mb_mgr_submit_avx512:
	endbranch

	mov	rax, rsp			; keep caller rsp; restored via the frame slot

	sub	rsp, STACK_SPACE

	mov	[rsp + stack_frame.rsp], rax

	; Save callee-saved GPRs (ABI-common set)
	mov	[rsp + _XMM_SAVE + 8*0], rbx
	mov	[rsp + _XMM_SAVE + 8*1], rbp
	mov	[rsp + _XMM_SAVE + 8*2], r12
	mov	[rsp + _XMM_SAVE + 8*5], r13
	mov	[rsp + _XMM_SAVE + 8*6], r14
	mov	[rsp + _XMM_SAVE + 8*7], r15
%ifidn __OUTPUT_FORMAT__, win64
	; rsi/rdi and xmm6-xmm15 are callee-saved on Windows x64 only
	mov	[rsp + _XMM_SAVE + 8*3], rsi
	mov	[rsp + _XMM_SAVE + 8*4], rdi
	vmovdqu	[rsp + 16*0], xmm6
	vmovdqu	[rsp + 16*1], xmm7
	vmovdqu	[rsp + 16*2], xmm8
	vmovdqu	[rsp + 16*3], xmm9
	vmovdqu	[rsp + 16*4], xmm10
	vmovdqu	[rsp + 16*5], xmm11
	vmovdqu	[rsp + 16*6], xmm12
	vmovdqu	[rsp + 16*7], xmm13
	vmovdqu	[rsp + 16*8], xmm14
	vmovdqu	[rsp + 16*9], xmm15
%endif

	; Pop a free lane index off the unused_lanes byte-stack
	; (one lane id per byte, next free lane in the low byte).
	mov	unused_lanes, [state + _unused_lanes]
	movzx	lane, BYTE(unused_lanes)
	shr	unused_lanes, 8
	imul	lane_data, lane, _LANE_DATA_size
	mov	dword [job + _status], ISAL_STS_BEING_PROCESSED
	lea	lane_data, [state + _ldata + lane_data]
	mov	[state + _unused_lanes], unused_lanes
	mov	DWORD(len), [job + _len]		; length in blocks

	mov	[lane_data + _job_in_lane], job
	; lens[lane] is a qword: {block count : 32 | lane index : 32};
	; only the high dword (block count) is written here.
	mov	[state + _lens + 4 + 8*lane], DWORD(len)


	; Load digest words from result_digest and scatter them into the
	; transposed args_digest layout (one qword per lane per word, 64B apart).
	vmovdqa	xmm0, [job + _result_digest + 0*16]
	vmovdqa	xmm1, [job + _result_digest + 1*16]
	vmovdqa	xmm2, [job + _result_digest + 2*16]
	vmovdqa	xmm3, [job + _result_digest + 3*16]
	vmovq	[state + _args_digest + 8*lane + 0*64], xmm0
	vpextrq	[state + _args_digest + 8*lane + 1*64], xmm0, 1
	vmovq	[state + _args_digest + 8*lane + 2*64], xmm1
	vpextrq	[state + _args_digest + 8*lane + 3*64], xmm1, 1
	vmovq	[state + _args_digest + 8*lane + 4*64], xmm2
	vpextrq	[state + _args_digest + 8*lane + 5*64], xmm2, 1
	vmovq	[state + _args_digest + 8*lane + 6*64], xmm3
	vpextrq	[state + _args_digest + 8*lane + 7*64], xmm3, 1

	mov	p, [job + _buffer]
	mov	[state + _args_data_ptr + 8*lane], p

	mov	DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
	add	num_lanes_inuse, 1
	mov	[state + _num_lanes_inuse], DWORD(num_lanes_inuse)
	cmp	num_lanes_inuse, 8
	jne	return_null			; not full yet: accept job, return NULL

start_loop:
	; Find min length, len in sha512_mgr is 64bit, high 32bit is block num,
	; low 8bit is idx.  Because the lane index sits in the low bits of each
	; unsigned qword, vpminuq selects the minimum block count AND carries
	; the winning lane's index along for free.
	vmovdqu	ymm0, [state + _lens + 0*32]	; ymm0 has {D,d,C,c,B,b,A,a}
	vmovdqu	ymm1, [state + _lens + 1*32]

	vpminuq	ymm2, ymm0, ymm1		; ymm2 has {D,i,C,i,B,i,A,i}
	vpalignr ymm3, ymm3, ymm2, 8		; ymm3 has {x,i,D,i,x,i,B,i}
	vpminuq	ymm2, ymm2, ymm3		; ymm2 has {x,i,F,i,x,i,E,i}
	vperm2i128 ymm3, ymm2, ymm2, 1		; ymm3 has {x,i,x,i,x,i,F,i}
	vpminuq	ymm2, ymm2, ymm3		; ymm2 has min value in high dword

	vmovq	idx, xmm2			; low qword = {min blocks : 32 | idx : 32}
	mov	len2, idx
	and	idx, 0xF			; isolate winning lane index
	shr	len2, 32			; isolate minimum block count
	jz	len_is_0			; min length 0: that lane is already done


	; Broadcast {min_len:32|0:32} to all 8 qword slots, then subtract it
	; from every lane's block count (lane indices are untouched because
	; the mask zeroes the low dword of each entry).
	vperm2i128 ymm2, ymm2, ymm2, 0		; ymm2 has {x,x,E,i,x,x,E,i}
	vpand	ymm2, ymm2, [rel clear_low_nibble] ; ymm2 has {0,0,E,0,0,0,E,0}
	vpshufd	ymm2, ymm2, 0x44		; ymm2 has {E,0,E,0,E,0,E,0}

	vpsubd	ymm0, ymm0, ymm2
	vpsubd	ymm1, ymm1, ymm2

	vmovdqu	[state + _lens + 0*32], ymm0
	vmovdqu	[state + _lens + 1*32], ymm1

	; "state" and "args" are the same address, arg1
	; len is arg2
	call	sha512_mb_x8_avx512
	; state and idx are intact

len_is_0:

	; process completed job "idx": retire the lane and copy its digest back
	imul	lane_data, idx, _LANE_DATA_size
	lea	lane_data, [state + _ldata + lane_data]

	mov	job_rax, [lane_data + _job_in_lane]


	; Push the freed lane back onto the unused_lanes byte-stack
	mov	unused_lanes, [state + _unused_lanes]
	mov	qword [lane_data + _job_in_lane], 0
	mov	dword [job_rax + _status], ISAL_STS_COMPLETED
	shl	unused_lanes, 8
	or	unused_lanes, idx
	mov	[state + _unused_lanes], unused_lanes

	mov	DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
	sub	num_lanes_inuse, 1
	mov	[state + _num_lanes_inuse], DWORD(num_lanes_inuse)
	; Gather the 8 digest qwords of lane "idx" from the transposed
	; args_digest layout back into the job's contiguous result_digest.
	vmovq	xmm0, [state + _args_digest + 8*idx + 0*64]
	vpinsrq	xmm0, [state + _args_digest + 8*idx + 1*64], 1
	vmovq	xmm1, [state + _args_digest + 8*idx + 2*64]
	vpinsrq	xmm1, [state + _args_digest + 8*idx + 3*64], 1
	vmovq	xmm2, [state + _args_digest + 8*idx + 4*64]
	vpinsrq	xmm2, [state + _args_digest + 8*idx + 5*64], 1
	vmovq	xmm3, [state + _args_digest + 8*idx + 6*64]
	vpinsrq	xmm3, [state + _args_digest + 8*idx + 7*64], 1
	vmovdqa	[job_rax + _result_digest + 0*16], xmm0
	vmovdqa	[job_rax + _result_digest + 1*16], xmm1
	vmovdqa	[job_rax + _result_digest + 2*16], xmm2
	vmovdqa	[job_rax + _result_digest + 3*16], xmm3

return:
	; NOTE(review): ymm registers are used above but no vzeroupper is
	; emitted before ret; consider whether the caller/core handles the
	; AVX->SSE transition — confirm against the x8 core's epilogue.

%ifidn __OUTPUT_FORMAT__, win64
	vmovdqu	xmm6, [rsp + 16*0]
	vmovdqu	xmm7, [rsp + 16*1]
	vmovdqu	xmm8, [rsp + 16*2]
	vmovdqu	xmm9, [rsp + 16*3]
	vmovdqu	xmm10, [rsp + 16*4]
	vmovdqu	xmm11, [rsp + 16*5]
	vmovdqu	xmm12, [rsp + 16*6]
	vmovdqu	xmm13, [rsp + 16*7]
	vmovdqu	xmm14, [rsp + 16*8]
	vmovdqu	xmm15, [rsp + 16*9]
	mov	rsi, [rsp + _XMM_SAVE + 8*3]
	mov	rdi, [rsp + _XMM_SAVE + 8*4]
%endif
	mov	rbx, [rsp + _XMM_SAVE + 8*0]
	mov	rbp, [rsp + _XMM_SAVE + 8*1]
	mov	r12, [rsp + _XMM_SAVE + 8*2]
	mov	r13, [rsp + _XMM_SAVE + 8*5]
	mov	r14, [rsp + _XMM_SAVE + 8*6]
	mov	r15, [rsp + _XMM_SAVE + 8*7]

	mov	rsp, [rsp + stack_frame.rsp]	; restore caller's rsp

	ret

return_null:
	xor	job_rax, job_rax		; NULL: job accepted, none completed
	jmp	return

section .data align=32

align 32
clear_low_nibble:	; mgr len element 0xnnnnnnnn 0000000m, nnnnnnnn is blocknum, m is index
	dq 0xFFFFFFFF00000000, 0x0000000000000000
	dq 0xFFFFFFFF00000000, 0x0000000000000000