xref: /isa-l_crypto/sha512_mb/sha512_mb_mgr_submit_avx.asm (revision 592e639e5cd0e9fa1a927dd7459a23176ec36070)
1;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2;  Copyright(c) 2011-2016 Intel Corporation All rights reserved.
3;
4;  Redistribution and use in source and binary forms, with or without
5;  modification, are permitted provided that the following conditions
6;  are met:
7;    * Redistributions of source code must retain the above copyright
8;      notice, this list of conditions and the following disclaimer.
9;    * Redistributions in binary form must reproduce the above copyright
10;      notice, this list of conditions and the following disclaimer in
11;      the documentation and/or other materials provided with the
12;      distribution.
13;    * Neither the name of Intel Corporation nor the names of its
14;      contributors may be used to endorse or promote products derived
15;      from this software without specific prior written permission.
16;
17;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
29
30%include "sha512_job.asm"
31%include "sha512_mb_mgr_datastruct.asm"
32
33%include "reg_sizes.asm"
34
35extern sha512_mb_x2_avx
36
%ifidn __OUTPUT_FORMAT__, elf64
; ---- elf64: arguments arrive in rdi, rsi ----
; The trailing comment on each line names the register that the %else
; branch gives to the same symbol.
%define arg1		rdi	; rcx
%define arg2		rsi	; rdx

; rdx carries both idx and last_len.
; idx needs to be other than arg1, arg2, rbx, r12
%define last_len	rdx	; rsi
%define idx		rdx	; rsi

; rcx carries both size_offset and tmp2.
%define tmp2		rcx	; rdi
%define size_offset	rcx	; rdi

%else
; ---- any other output format: Windows registers, arguments in rcx, rdx ----
%define arg1		rcx
%define arg2		rdx

; rsi carries both idx and last_len.
; idx needs to be other than arg1, arg2, rbx, r12
%define idx		rsi
%define last_len	rsi

; rdi carries both size_offset and tmp2.
%define tmp2		rdi
%define size_offset	rdi

%endif
62
; Common definitions
;
; Symbolic register names shared by both platform variants.  Several symbols
; map to the same physical register, so only one of each group can be live at
; a time.  Symbols marked (unused) are not referenced by the code in this file.

; arg1: manager state for the whole routine
%define state		arg1

; arg2: the submitted job on entry, later overwritten with the length (len2)
%define job		arg2
%define len2		arg2
%define p2		arg2		; (unused)

; r11
%define p		r11
%define start_offset	r11		; (unused)

; rbx
%define unused_lanes	rbx

; rax: job length while submitting, returned job pointer at the end
%define len		rax
%define job_rax		rax

; rbp
%define lane		rbp
%define tmp3		rbp		; (unused)
%define lens3		rbp		; (unused)

; r8
%define lens0		r8
%define extra_blocks	r8		; (unused)

; r9
%define lens1		r9
%define tmp		r9		; (unused)

; r10
%define lane_data	r10
%define lens2		r10		; (unused)
89
; Stack frame, addressed from rsp after it has been realigned.
struc stack_frame
	.xmm: resb 16*10	; xmm6-xmm15 (only written on win64)
	.gpr: resb 8*5		; rbx, rbp, r12 (+ rsi, rdi on win64)
	.rsp: resb 8		; value of rsp on entry
endstruc

; Each save-area macro names the frame field it addresses.
; STACK_SPACE has no parity requirement: rsp is explicitly rounded down to a
; 32-byte boundary right after the frame is reserved.
%define _XMM_SAVE	stack_frame.xmm
%define _GPR_SAVE	stack_frame.gpr
%define STACK_SPACE	stack_frame_size

;-----------------------------------------------------------------------
; ISAL_SHA512_JOB* _sha512_mb_mgr_submit_avx(ISAL_SHA512_MB_JOB_MGR *state,
;                                            ISAL_SHA512_JOB *job)
;
; Puts "job" into the lane taken from the low byte of state's unused_lanes
; list.  If the remaining list is anything other than 0xff the routine
; returns NULL.  Otherwise it picks the smaller of the two lens entries,
; subtracts it from both, calls sha512_mb_x2_avx for that length (skipped
; when the length is zero), releases the lane that held the minimum and
; returns its job with status ISAL_STS_COMPLETED and the digest copied back.
;
; In:    arg1 = state  (rdi on elf64, rcx otherwise)
;        arg2 = job    (rsi on elf64, rdx otherwise)
; Out:   rax  = completed job, or 0 if no job completed
; Saved: rbx, rbp, r12; on win64 also rsi, rdi and xmm6-xmm15
; Note:  job + _result_digest is accessed with vmovdqa and therefore has to
;        be 16-byte aligned.
;-----------------------------------------------------------------------
mk_global _sha512_mb_mgr_submit_avx, function, internal
_sha512_mb_mgr_submit_avx:
	endbranch

	; Reserve the frame and align it to 32 bytes.  The entry rsp is kept
	; in the frame because the masking makes the adjustment irreversible.
	mov	rax, rsp

	sub	rsp, STACK_SPACE
	and	rsp, ~31

	mov	[rsp + stack_frame.rsp], rax

	; NOTE(review): r12 is preserved although this routine never uses it;
	; presumably sha512_mb_x2_avx modifies it - confirm against the kernel.
	mov	[rsp + _GPR_SAVE + 8*0], rbx
	mov	[rsp + _GPR_SAVE + 8*1], rbp
	mov	[rsp + _GPR_SAVE + 8*2], r12
%ifidn __OUTPUT_FORMAT__, win64
	mov	[rsp + _GPR_SAVE + 8*3], rsi
	mov	[rsp + _GPR_SAVE + 8*4], rdi
	vmovdqa	[rsp + _XMM_SAVE + 16*0], xmm6
	vmovdqa	[rsp + _XMM_SAVE + 16*1], xmm7
	vmovdqa	[rsp + _XMM_SAVE + 16*2], xmm8
	vmovdqa	[rsp + _XMM_SAVE + 16*3], xmm9
	vmovdqa	[rsp + _XMM_SAVE + 16*4], xmm10
	vmovdqa	[rsp + _XMM_SAVE + 16*5], xmm11
	vmovdqa	[rsp + _XMM_SAVE + 16*6], xmm12
	vmovdqa	[rsp + _XMM_SAVE + 16*7], xmm13
	vmovdqa	[rsp + _XMM_SAVE + 16*8], xmm14
	vmovdqa	[rsp + _XMM_SAVE + 16*9], xmm15
%endif

	; Take the lane number from the low byte of unused_lanes and write the
	; shortened list back.
	mov	unused_lanes, [state + _unused_lanes]
	movzx	lane, BYTE(unused_lanes)
	shr	unused_lanes, 8
	imul	lane_data, lane, _LANE_DATA_size
	mov	dword [job + _status], ISAL_STS_BEING_PROCESSED
	lea	lane_data, [state + _ldata + lane_data]	; lane_data = &state ldata[lane]
	mov	[state + _unused_lanes], unused_lanes
	mov	DWORD(len), [job + _len]

	mov	[lane_data + _job_in_lane], job
	; The length goes into the upper dword of the lane's 8-byte lens entry;
	; the lower dword is left as it is.
	mov	[state + _lens + 4 + 8*lane], DWORD(len)


	; Load digest words from result_digest and scatter them into the
	; manager: 64-bit word k of this lane lives at _args_digest + 32*k + 8*lane.
	vmovdqa	xmm0, [job + _result_digest + 0*16]
	vmovdqa	xmm1, [job + _result_digest + 1*16]
	vmovdqa	xmm2, [job + _result_digest + 2*16]
	vmovdqa	xmm3, [job + _result_digest + 3*16]
	vmovq	[state + _args_digest + 8*lane + 0*32], xmm0
	vpextrq	[state + _args_digest + 8*lane + 1*32], xmm0, 1
	vmovq	[state + _args_digest + 8*lane + 2*32], xmm1
	vpextrq	[state + _args_digest + 8*lane + 3*32], xmm1, 1
	vmovq	[state + _args_digest + 8*lane + 4*32], xmm2
	vpextrq	[state + _args_digest + 8*lane + 5*32], xmm2, 1
	vmovq	[state + _args_digest + 8*lane + 6*32], xmm3
	vpextrq	[state + _args_digest + 8*lane + 7*32], xmm3, 1

	mov	p, [job + _buffer]
	mov	[state + _args_data_ptr + 8*lane], p

	add	dword [state + _num_lanes_inuse], 1
	; Only when the remaining list equals 0xff does processing start;
	; in every other case the caller gets NULL.
	cmp	unused_lanes, 0xff
	jne	return_null

start_loop:

	; Find min length.  The full 64-bit lens entries are compared as
	; unsigned values; the length occupies the upper dword.
	mov	lens0, [state + _lens + 0*8]
	mov	idx, lens0
	mov	lens1, [state + _lens + 1*8]
	cmp	lens1, idx
	cmovb	idx, lens1

	mov	len2, idx		; len2 shares arg2 with job: the job pointer is gone from here
	and	idx, 0xF		; idx  = low four bits of the minimum entry (its lane number)
	and	len2, ~0xFF		; len2 = minimum entry with the low byte cleared
	jz	len_is_0		; nothing to process, skip the kernel call

	sub	lens0, len2
	sub	lens1, len2
	shr	len2, 32		; move the length from the upper dword down
	mov	[state + _lens + 0*8], lens0
	mov	[state + _lens + 1*8], lens1

	; "state" and "args" are the same address, arg1
	; len is arg2
	; rsp is 32-byte aligned at this point.
	; NOTE(review): no scratch area is reserved below the save slots, so on
	; win64 this relies on sha512_mb_x2_avx not writing to the 32 bytes
	; above its return address - confirm against the kernel.
	call	sha512_mb_x2_avx
	; state and idx are intact

len_is_0:

	; process completed job "idx"
	imul	lane_data, idx, _LANE_DATA_size
	lea	lane_data, [state + _ldata + lane_data]

	mov	job_rax, [lane_data + _job_in_lane]	; return value

	; Clear the lane's job slot and push idx back onto unused_lanes.
	mov	unused_lanes, [state + _unused_lanes]
	mov	qword [lane_data + _job_in_lane], 0
	mov	dword [job_rax + _status], ISAL_STS_COMPLETED
	shl	unused_lanes, 8
	or	unused_lanes, idx
	mov	[state + _unused_lanes], unused_lanes

	sub	dword [state + _num_lanes_inuse], 1

	; Gather the lane's eight digest words and store them into the job.
	vmovq	xmm0, [state + _args_digest + 8*idx + 0*32]
	vpinsrq	xmm0, [state + _args_digest + 8*idx + 1*32], 1
	vmovq	xmm1, [state + _args_digest + 8*idx + 2*32]
	vpinsrq	xmm1, [state + _args_digest + 8*idx + 3*32], 1
	vmovq	xmm2, [state + _args_digest + 8*idx + 4*32]
	vpinsrq	xmm2, [state + _args_digest + 8*idx + 5*32], 1
	vmovq	xmm3, [state + _args_digest + 8*idx + 6*32]
	vpinsrq	xmm3, [state + _args_digest + 8*idx + 7*32], 1


	vmovdqa	[job_rax + _result_digest + 0*16], xmm0
	vmovdqa	[job_rax + _result_digest + 1*16], xmm1
	vmovdqa	[job_rax + _result_digest + 2*16], xmm2
	vmovdqa	[job_rax + _result_digest + 3*16], xmm3

return:
	; Common exit: rax already holds the return value.

%ifidn __OUTPUT_FORMAT__, win64
	vmovdqa	xmm6,  [rsp + _XMM_SAVE + 16*0]
	vmovdqa	xmm7,  [rsp + _XMM_SAVE + 16*1]
	vmovdqa	xmm8,  [rsp + _XMM_SAVE + 16*2]
	vmovdqa	xmm9,  [rsp + _XMM_SAVE + 16*3]
	vmovdqa	xmm10, [rsp + _XMM_SAVE + 16*4]
	vmovdqa	xmm11, [rsp + _XMM_SAVE + 16*5]
	vmovdqa	xmm12, [rsp + _XMM_SAVE + 16*6]
	vmovdqa	xmm13, [rsp + _XMM_SAVE + 16*7]
	vmovdqa	xmm14, [rsp + _XMM_SAVE + 16*8]
	vmovdqa	xmm15, [rsp + _XMM_SAVE + 16*9]
	mov	rsi, [rsp + _GPR_SAVE + 8*3]
	mov	rdi, [rsp + _GPR_SAVE + 8*4]
%endif
	mov	rbx, [rsp + _GPR_SAVE + 8*0]
	mov	rbp, [rsp + _GPR_SAVE + 8*1]
	mov	r12, [rsp + _GPR_SAVE + 8*2]
	mov	rsp, [rsp + stack_frame.rsp]	; back to the entry rsp

	ret

return_null:
	; No job completed: return NULL through the common exit.
	xor	job_rax, job_rax
	jmp	return
250
section .data align=16

align 16
; Eight individually labelled 32-bit words.  The code in this file does not
; reference H0-H7.
; NOTE(review): the values coincide with the SHA-256 initial hash words of
; FIPS 180-4; they look like a leftover in this file - confirm before
; relying on or removing them.
H0:	dd	0x6a09e667
H1:	dd	0xbb67ae85
H2:	dd	0x3c6ef372
H3:	dd	0xa54ff53a
H4:	dd	0x510e527f
H5:	dd	0x9b05688c
H6:	dd	0x1f83d9ab
H7:	dd	0x5be0cd19
262
263