;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2016 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%include "sha256_job.asm"
%include "sha256_mb_mgr_datastruct.asm"

%include "reg_sizes.asm"

; Hash kernels provided elsewhere in the library:
;   sha256_mb_x8_avx2 - 8-lane multi-buffer SHA-256 kernel (AVX2)
;   sha256_opt_x1     - single-buffer SHA-256 kernel, used when only a few
;                       lanes hold jobs (multi-buffer overhead not worth it)
extern sha256_mb_x8_avx2
extern sha256_opt_x1

[bits 64]
default rel
section .text

%ifidn __OUTPUT_FORMAT__, elf64
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; LINUX register definitions
%define arg1    rdi ; rcx
%define arg2    rsi ; rdx

%define tmp4    rdx
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%else

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; WINDOWS register definitions
%define arg1    rcx
%define arg2    rdx

%define tmp4    rsi
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%endif

; Common register definitions
; NOTE: several of the aliases below share one physical register (e.g.
; unused_lanes/lane_data/tmp2 are all rbx); the aliases are never live at
; the same time.

%define state   arg1
%define job     arg2
%define len2    arg2

; idx must be a register not clobbered by sha256_mb_x8_avx2 and sha256_opt_x1
; (rbp is callee-saved in both the SysV and Windows x64 ABIs, so it survives
; the kernel calls)
%define idx     rbp

%define unused_lanes    rbx
%define lane_data       rbx
%define tmp2            rbx

%define job_rax         rax
%define tmp1            rax
%define size_offset     rax
%define tmp             rax
%define start_offset    rax

%define tmp3            arg1

%define extra_blocks    arg2
%define p               arg2


; STACK_SPACE needs to be an odd multiple of 8
; (entry rsp % 16 == 8 after the call pushed the return address; an odd
; multiple of 8 restores 16-byte alignment for the vmovdqa XMM saves and
; the kernel calls)
_XMM_SAVE_SIZE  equ 10*16
_GPR_SAVE_SIZE  equ 8*8
_ALIGN_SIZE     equ 8

_XMM_SAVE       equ 0
_GPR_SAVE       equ _XMM_SAVE + _XMM_SAVE_SIZE
STACK_SPACE     equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE

%define APPEND(a,b) a %+ b

;-----------------------------------------------------------------------
; ISAL_SHA256_JOB* _sha256_mb_mgr_flush_avx2(ISAL_SHA256_MB_JOB_MGR *state)
; arg 1 : rcx : state
;
; Flush one outstanding job from the multi-buffer manager:
;  - return NULL immediately if no lanes are in use;
;  - pick any lane holding a job, copy its data pointer into every empty
;    lane (so all 8 lanes hash valid memory during the kernel run);
;  - find the lane with the minimum remaining length and run a hash
;    kernel for that many blocks, completing that lane's job;
;  - retire the completed job (mark ISAL_STS_COMPLETED, free the lane,
;    copy the digest out) and return it.
;
; Each _lens entry is packed as (blocks_remaining << 4) | lane_index, so
; a single unsigned min over the array yields both the shortest length
; and which lane owns it.
;-----------------------------------------------------------------------
mk_global _sha256_mb_mgr_flush_avx2, function, internal
_sha256_mb_mgr_flush_avx2:
	endbranch
	sub     rsp, STACK_SPACE
	; save callee-saved GPRs (rbp is used as idx across the kernel calls)
	mov     [rsp + _GPR_SAVE + 8*0], rbx
	mov     [rsp + _GPR_SAVE + 8*3], rbp
	mov     [rsp + _GPR_SAVE + 8*4], r12
	mov     [rsp + _GPR_SAVE + 8*5], r13
	mov     [rsp + _GPR_SAVE + 8*6], r14
	mov     [rsp + _GPR_SAVE + 8*7], r15
%ifidn __OUTPUT_FORMAT__, win64
	; Windows x64 additionally treats rsi, rdi and xmm6-xmm15 as callee-saved
	mov     [rsp + _GPR_SAVE + 8*1], rsi
	mov     [rsp + _GPR_SAVE + 8*2], rdi
	vmovdqa [rsp + _XMM_SAVE + 16*0], xmm6
	vmovdqa [rsp + _XMM_SAVE + 16*1], xmm7
	vmovdqa [rsp + _XMM_SAVE + 16*2], xmm8
	vmovdqa [rsp + _XMM_SAVE + 16*3], xmm9
	vmovdqa [rsp + _XMM_SAVE + 16*4], xmm10
	vmovdqa [rsp + _XMM_SAVE + 16*5], xmm11
	vmovdqa [rsp + _XMM_SAVE + 16*6], xmm12
	vmovdqa [rsp + _XMM_SAVE + 16*7], xmm13
	vmovdqa [rsp + _XMM_SAVE + 16*8], xmm14
	vmovdqa [rsp + _XMM_SAVE + 16*9], xmm15
%endif

	; use num_lanes_inuse to judge all lanes are empty
	cmp     dword [state + _num_lanes_inuse], 0
	jz      return_null

	; find a lane with a non-null job
	; idx starts at 0, so lane 0 is the implicit default and the scan
	; only tests lanes 1..7; cmov has no immediate form, so the lane
	; numbers are loaded from the qword constants one..seven below
	xor     idx, idx
	cmp     qword [state + _ldata + 1 * _LANE_DATA_size + _job_in_lane], 0
	cmovne  idx, [one]
	cmp     qword [state + _ldata + 2 * _LANE_DATA_size + _job_in_lane], 0
	cmovne  idx, [two]
	cmp     qword [state + _ldata + 3 * _LANE_DATA_size + _job_in_lane], 0
	cmovne  idx, [three]
	cmp     qword [state + _ldata + 4 * _LANE_DATA_size + _job_in_lane], 0
	cmovne  idx, [four]
	cmp     qword [state + _ldata + 5 * _LANE_DATA_size + _job_in_lane], 0
	cmovne  idx, [five]
	cmp     qword [state + _ldata + 6 * _LANE_DATA_size + _job_in_lane], 0
	cmovne  idx, [six]
	cmp     qword [state + _ldata + 7 * _LANE_DATA_size + _job_in_lane], 0
	cmovne  idx, [seven]

	; copy idx to empty lanes
copy_lane_data:
	mov     tmp, [state + _args + _data_ptr + 8*idx]

	; give every empty lane lane idx's data pointer and a length of
	; 0xFFFFFFFF so empty lanes never win the minimum-length search
%assign I 0
%rep 8
	cmp     qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
	jne     APPEND(skip_,I)
	mov     [state + _args + _data_ptr + 8*I], tmp
	mov     dword [state + _lens + 4*I], 0xFFFFFFFF
APPEND(skip_,I):
%assign I (I+1)
%endrep

	; Find min length (unsigned min reduction over the 8 packed lens)
	vmovdqa xmm0, [state + _lens + 0*16]
	vmovdqa xmm1, [state + _lens + 1*16]

	vpminud xmm2, xmm0, xmm1        ; xmm2 has {D,C,B,A}
	vpalignr xmm3, xmm3, xmm2, 8    ; xmm3 has {x,x,D,C}
	vpminud xmm2, xmm2, xmm3        ; xmm2 has {x,x,E,F}
	vpalignr xmm3, xmm3, xmm2, 4    ; xmm3 has {x,x,x,E}
	vpminud xmm2, xmm2, xmm3        ; xmm2 has min value in low dword

	; unpack (len << 4) | idx: low nibble is the winning lane, the rest
	; is the number of blocks to hash
	vmovd   DWORD(idx), xmm2
	mov     len2, idx
	and     idx, 0xF
	shr     len2, 4
	jz      len_is_0                ; shortest job already complete: skip hashing

	; compare with sha-sb threshold, if num_lanes_inuse <= threshold, using sb func
	cmp     dword [state + _num_lanes_inuse], SHA256_SB_THRESHOLD_AVX2
	ja      mb_processing

	; lensN-len2=idx
	mov     [state + _lens + idx*4], DWORD(idx)
	mov     r10, idx
	or      r10, 0x2000             ; avx2 has 8 lanes *4, r10b is idx, r10b2 is 32
	; "state" and "args" are the same address, arg1
	; len is arg2, idx and nlane in r10
	call    sha256_opt_x1
	; state and idx are intact
	jmp     len_is_0

mb_processing:

	; broadcast the min length (lane nibble cleared) and subtract it
	; from every lane's len; the winning lane's len becomes its bare idx
	vpand   xmm2, xmm2, [rel clear_low_nibble]
	vpshufd xmm2, xmm2, 0

	vpsubd  xmm0, xmm0, xmm2
	vpsubd  xmm1, xmm1, xmm2

	vmovdqa [state + _lens + 0*16], xmm0
	vmovdqa [state + _lens + 1*16], xmm1

	; "state" and "args" are the same address, arg1
	; len is arg2
	call    sha256_mb_x8_avx2
	; state and idx are intact

len_is_0:
	; process completed job "idx"
	imul    lane_data, idx, _LANE_DATA_size
	lea     lane_data, [state + _ldata + lane_data]

	mov     job_rax, [lane_data + _job_in_lane]
	mov     qword [lane_data + _job_in_lane], 0
	mov     dword [job_rax + _status], ISAL_STS_COMPLETED
	; push the freed lane index onto the unused_lanes nibble stack
	mov     unused_lanes, [state + _unused_lanes]
	shl     unused_lanes, 4
	or      unused_lanes, idx
	mov     [state + _unused_lanes], unused_lanes

	sub     dword [state + _num_lanes_inuse], 1

	; gather the 8 digest words of lane idx out of the transposed
	; args_digest layout (consecutive words of one lane are 4*8 = 32
	; bytes apart) into xmm0/xmm1
	vmovd   xmm0, [state + _args_digest + 4*idx + 0*4*8]
	vpinsrd xmm0, [state + _args_digest + 4*idx + 1*4*8], 1
	vpinsrd xmm0, [state + _args_digest + 4*idx + 2*4*8], 2
	vpinsrd xmm0, [state + _args_digest + 4*idx + 3*4*8], 3
	vmovd   xmm1, [state + _args_digest + 4*idx + 4*4*8]
	vpinsrd xmm1, [state + _args_digest + 4*idx + 5*4*8], 1
	vpinsrd xmm1, [state + _args_digest + 4*idx + 6*4*8], 2
	vpinsrd xmm1, [state + _args_digest + 4*idx + 7*4*8], 3

	vmovdqa [job_rax + _result_digest + 0*16], xmm0
	vmovdqa [job_rax + _result_digest + 1*16], xmm1

return:
	; NOTE(review): no vzeroupper before ret — presumably no ymm state is
	; dirtied here or the kernels clean up; confirm against the kernels.
%ifidn __OUTPUT_FORMAT__, win64
	vmovdqa xmm6,  [rsp + _XMM_SAVE + 16*0]
	vmovdqa xmm7,  [rsp + _XMM_SAVE + 16*1]
	vmovdqa xmm8,  [rsp + _XMM_SAVE + 16*2]
	vmovdqa xmm9,  [rsp + _XMM_SAVE + 16*3]
	vmovdqa xmm10, [rsp + _XMM_SAVE + 16*4]
	vmovdqa xmm11, [rsp + _XMM_SAVE + 16*5]
	vmovdqa xmm12, [rsp + _XMM_SAVE + 16*6]
	vmovdqa xmm13, [rsp + _XMM_SAVE + 16*7]
	vmovdqa xmm14, [rsp + _XMM_SAVE + 16*8]
	vmovdqa xmm15, [rsp + _XMM_SAVE + 16*9]
	mov     rsi, [rsp + _GPR_SAVE + 8*1]
	mov     rdi, [rsp + _GPR_SAVE + 8*2]
%endif
	mov     rbx, [rsp + _GPR_SAVE + 8*0]
	mov     rbp, [rsp + _GPR_SAVE + 8*3]
	mov     r12, [rsp + _GPR_SAVE + 8*4]
	mov     r13, [rsp + _GPR_SAVE + 8*5]
	mov     r14, [rsp + _GPR_SAVE + 8*6]
	mov     r15, [rsp + _GPR_SAVE + 8*7]
	add     rsp, STACK_SPACE

	ret

return_null:
	xor     job_rax, job_rax        ; NULL job pointer
	jmp     return

section .data align=16

align 16
; mask that clears the lane-index nibble of the packed min value
; (only the low dword is meaningful; vpshufd broadcasts it afterwards)
clear_low_nibble:
	dq 0x00000000FFFFFFF0, 0x0000000000000000
; qword lane-number constants for the cmovne lane scan above
one:    dq 1
two:    dq 2
three:  dq 3
four:   dq 4
five:   dq 5
six:    dq 6
seven:  dq 7