;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2016 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%include "sha512_job.asm"
%include "sha512_mb_mgr_datastruct.asm"
%include "reg_sizes.asm"

extern sha512_mb_x8_avx512

[bits 64]
default rel
section .text

%ifidn __OUTPUT_FORMAT__, elf64
; LINUX register definitions
%define arg1    rdi ; rcx
%define arg2    rsi ; rdx

; idx needs to be other than arg1, arg2, rbx, r12
%define idx     rdx ; rsi
%else
; WINDOWS register definitions
%define arg1    rcx
%define arg2    rdx

; idx needs to be other than arg1, arg2, rbx, r12
%define idx     rsi
%endif

; Common definitions
%define state   arg1
%define job     arg2
%define len2    arg2

%define num_lanes_inuse r9
%define unused_lanes    rbx
%define lane_data       rbx
%define tmp2            rbx

%define job_rax         rax
%define tmp1            rax
%define size_offset     rax
%define tmp             rax
%define start_offset    rax

%define tmp3            arg1

%define extra_blocks    arg2
%define p               arg2

%define tmp4            r8
%define lens0           r8

%define num_lanes_inuse r9
%define lens1           r9
%define lens2           r10
%define lens3           r11

struc stack_frame
	.xmm: resb 16*10
	.gpr: resb 8*8
	.rsp: resb 8
endstruc

; STACK_SPACE needs to be an odd multiple of 8
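; (16*10 + 8*8 + 8 = 232 bytes = 29*8. On entry rsp is 8 mod 16 right
; after the call, so subtracting an odd multiple of 8 restores 16-byte
; alignment, assuming the caller kept the ABI's 16-byte stack alignment.)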
%define _XMM_SAVE       stack_frame.xmm
%define _GPR_SAVE       stack_frame.gpr
%define STACK_SPACE     stack_frame_size

%define APPEND(a,b) a %+ b

; ISAL_SHA512_JOB* _sha512_mb_mgr_flush_avx512(ISAL_SHA512_MB_JOB_MGR *state)
; arg 1 : rcx (win64) / rdi (elf64) : state
mk_global _sha512_mb_mgr_flush_avx512, function, internal
_sha512_mb_mgr_flush_avx512:
	endbranch

	mov     rax, rsp

	sub     rsp, STACK_SPACE

	mov     [rsp + stack_frame.rsp], rax

	mov     [rsp + _GPR_SAVE + 8*0], rbx
	mov     [rsp + _GPR_SAVE + 8*3], rbp
	mov     [rsp + _GPR_SAVE + 8*4], r12
	mov     [rsp + _GPR_SAVE + 8*5], r13
	mov     [rsp + _GPR_SAVE + 8*6], r14
	mov     [rsp + _GPR_SAVE + 8*7], r15
%ifidn __OUTPUT_FORMAT__, win64
	mov     [rsp + _GPR_SAVE + 8*1], rsi
	mov     [rsp + _GPR_SAVE + 8*2], rdi
	vmovdqu  [rsp + _XMM_SAVE + 16*0], xmm6
	vmovdqu  [rsp + _XMM_SAVE + 16*1], xmm7
	vmovdqu  [rsp + _XMM_SAVE + 16*2], xmm8
	vmovdqu  [rsp + _XMM_SAVE + 16*3], xmm9
	vmovdqu  [rsp + _XMM_SAVE + 16*4], xmm10
	vmovdqu  [rsp + _XMM_SAVE + 16*5], xmm11
	vmovdqu  [rsp + _XMM_SAVE + 16*6], xmm12
	vmovdqu  [rsp + _XMM_SAVE + 16*7], xmm13
	vmovdqu  [rsp + _XMM_SAVE + 16*8], xmm14
	vmovdqu  [rsp + _XMM_SAVE + 16*9], xmm15
%endif

	mov	DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
	cmp	num_lanes_inuse, 0
	jz	return_null

	; find a lane with a non-null job
	xor     idx, idx
%assign I 1
%rep 7
	cmp	qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
	cmovne	idx, [APPEND(lane_,I)]
%assign I (I+1)
%endrep
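	; A minimal sketch of what the %rep above expands to, for I = 1:
	;   cmp    qword [state + _ldata + 1 * _LANE_DATA_size + _job_in_lane], 0
	;   cmovne idx, [lane_1]	; lane_1 (in .data below) holds the constant 1
	; Each iteration overwrites idx when lane I has a job, so idx ends up
	; as the highest-numbered lane with a non-null job (lane 0 by default).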

	; copy idx to empty lanes
copy_lane_data:
	mov     tmp, [state + _args + _data_ptr + 8*idx]

%assign I 0
%rep 8
	cmp     qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
	jne     APPEND(skip_,I)
	mov     [state + _args + _data_ptr + 8*I], tmp
	mov     dword [state + _lens + 4 + 8*I], 0xFFFFFFFF
APPEND(skip_,I):
%assign I (I+1)
%endrep
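	; Each empty lane borrows lane idx's data pointer (the 8-lane core
	; always processes all lanes, so every lane must read valid memory)
	; and gets a block count of 0xFFFFFFFF in the high dword of its lens
	; entry (offset +4), so an idle lane can never win the minimum-length
	; search below.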

	; Find min length. A lens entry is 64-bit: the high 32 bits hold the
	; block count, the low 8 bits hold the lane index (idx).
	vmovdqu ymm0, [state + _lens + 0*32]	; ymm0 has {D,d,C,c,B,b,A,a}
	vmovdqu ymm1, [state + _lens + 1*32]

	vpminuq ymm2, ymm0, ymm1	; ymm2 has {D,i,C,i,B,i,A,i}
	vpalignr ymm3, ymm3, ymm2, 8	; ymm3 has {x,i,D,i,x,i,B,i}
	vpminuq ymm2, ymm2, ymm3	; ymm2 has {x,i,F,i,x,i,E,i}
	vperm2i128 ymm3, ymm2, ymm2, 1	; ymm3 has {x,i,x,i,x,i,F,i}
	vpminuq ymm2, ymm2, ymm3	; ymm2 has min value in high dword
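	; A sketch of the reduction, with hypothetical block counts bN for
	; lane N and each qword written as bN:N:
	;   ymm0 = {b3:3, b2:2, b1:1, b0:0}, ymm1 = {b7:7, b6:6, b5:5, b4:4}
	;   1st vpminuq   : per-qword min of lane pairs {0,4} {1,5} {2,6} {3,7}
	;   vpalignr + min: folds the two qwords within each 128-bit half
	;   vperm2i128 + min: folds the two halves; the low qword of ymm2 now
	;                   holds the global minimum
	; Because each value is blockcount:idx, the unsigned 64-bit min
	; carries the winning lane's index along in the low bits for free.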

	vmovq   idx, xmm2
	mov     len2, idx
	and     idx, 0xF
	shr     len2, 32	; len2 = block count (a SHA-512 block is 1024 bits)
	jz      len_is_0
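	; e.g. a (hypothetical) minimum lens entry of 0x0000000300000005
	; decodes to idx = 5 and len2 = 3 blocks (3 * 128 bytes of input)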

	vperm2i128 ymm2, ymm2, ymm2, 0	; ymm2 has {x,x,E,i,x,x,E,i}
	vpand   ymm2, ymm2, [rel clear_low_nibble]	; ymm2 has {0,0,E,0,0,0,E,0}
	vpshufd ymm2, ymm2, 0x44	; ymm2 has {E,0,E,0,E,0,E,0}

	vpsubd  ymm0, ymm0, ymm2
	vpsubd  ymm1, ymm1, ymm2

	vmovdqu [state + _lens + 0*32], ymm0
	vmovdqu [state + _lens + 1*32], ymm1

	; "state" and "args" are the same address, arg1
	; len is arg2
	call    sha512_mb_x8_avx512
	; state and idx are intact

len_is_0:
	; process completed job "idx"
	imul    lane_data, idx, _LANE_DATA_size
	lea     lane_data, [state + _ldata + lane_data]

	mov     job_rax, [lane_data + _job_in_lane]
	mov     qword [lane_data + _job_in_lane], 0
	mov     dword [job_rax + _status], ISAL_STS_COMPLETED
	mov     unused_lanes, [state + _unused_lanes]
	shl     unused_lanes, 8
	or      unused_lanes, idx
	mov     [state + _unused_lanes], unused_lanes
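	; unused_lanes is a byte-packed stack of free lane indices; the
	; shl 8 / or idx pair pushes the freed lane. E.g. a (hypothetical)
	; value of 0x...0201 becomes 0x...020103 after lane 3 is retired.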

	mov     DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
	sub     num_lanes_inuse, 1
	mov     [state + _num_lanes_inuse], DWORD(num_lanes_inuse)
	vmovq    xmm0, [state + _args_digest + 8*idx + 0*64]
	vpinsrq  xmm0, [state + _args_digest + 8*idx + 1*64], 1
	vmovq    xmm1, [state + _args_digest + 8*idx + 2*64]
	vpinsrq  xmm1, [state + _args_digest + 8*idx + 3*64], 1
	vmovq    xmm2, [state + _args_digest + 8*idx + 4*64]
	vpinsrq  xmm2, [state + _args_digest + 8*idx + 5*64], 1
	vmovq    xmm3, [state + _args_digest + 8*idx + 6*64]
	vpinsrq  xmm3, [state + _args_digest + 8*idx + 7*64], 1
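	; _args_digest is stored transposed (struct-of-arrays): digest word W
	; of lane N lives at [_args_digest + 8*N + W*64], so consecutive words
	; of one lane are 64 bytes apart. The vmovq/vpinsrq pairs above gather
	; lane idx's eight 64-bit digest words into xmm0..xmm3 before the
	; contiguous stores into the job below.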

	vmovdqa  [job_rax + _result_digest + 0*16], xmm0
	vmovdqa  [job_rax + _result_digest + 1*16], xmm1
	vmovdqa  [job_rax + _result_digest + 2*16], xmm2
	vmovdqa  [job_rax + _result_digest + 3*16], xmm3

return:

%ifidn __OUTPUT_FORMAT__, win64
	vmovdqu  xmm6,  [rsp + _XMM_SAVE + 16*0]
	vmovdqu  xmm7,  [rsp + _XMM_SAVE + 16*1]
	vmovdqu  xmm8,  [rsp + _XMM_SAVE + 16*2]
	vmovdqu  xmm9,  [rsp + _XMM_SAVE + 16*3]
	vmovdqu  xmm10, [rsp + _XMM_SAVE + 16*4]
	vmovdqu  xmm11, [rsp + _XMM_SAVE + 16*5]
	vmovdqu  xmm12, [rsp + _XMM_SAVE + 16*6]
	vmovdqu  xmm13, [rsp + _XMM_SAVE + 16*7]
	vmovdqu  xmm14, [rsp + _XMM_SAVE + 16*8]
	vmovdqu  xmm15, [rsp + _XMM_SAVE + 16*9]
	mov     rsi, [rsp + _GPR_SAVE + 8*1]
	mov     rdi, [rsp + _GPR_SAVE + 8*2]
%endif
	mov     rbx, [rsp + _GPR_SAVE + 8*0]
	mov     rbp, [rsp + _GPR_SAVE + 8*3]
	mov     r12, [rsp + _GPR_SAVE + 8*4]
	mov     r13, [rsp + _GPR_SAVE + 8*5]
	mov     r14, [rsp + _GPR_SAVE + 8*6]
	mov     r15, [rsp + _GPR_SAVE + 8*7]

	mov	rsp, [rsp + stack_frame.rsp]

	ret

return_null:
	xor     job_rax, job_rax
	jmp     return

section .data align=32

align 32
clear_low_nibble:	; a mgr lens element is 0xnnnnnnnn0000000m: nnnnnnnn is the block count, m is the lane index
	dq 0xFFFFFFFF00000000, 0x0000000000000000
	dq 0xFFFFFFFF00000000, 0x0000000000000000
lane_1:     dq  1
lane_2:     dq  2
lane_3:     dq  3
lane_4:     dq  4
lane_5:     dq  5
lane_6:     dq  6
lane_7:     dq  7
263