; xref: /isa-l_crypto/sha512_mb/sha512_mb_mgr_submit_avx2.asm (revision 592e639e5cd0e9fa1a927dd7459a23176ec36070)
1;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2;  Copyright(c) 2011-2016 Intel Corporation All rights reserved.
3;
4;  Redistribution and use in source and binary forms, with or without
5;  modification, are permitted provided that the following conditions
6;  are met:
7;    * Redistributions of source code must retain the above copyright
8;      notice, this list of conditions and the following disclaimer.
9;    * Redistributions in binary form must reproduce the above copyright
10;      notice, this list of conditions and the following disclaimer in
11;      the documentation and/or other materials provided with the
12;      distribution.
13;    * Neither the name of Intel Corporation nor the names of its
14;      contributors may be used to endorse or promote products derived
15;      from this software without specific prior written permission.
16;
17;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
29
; Shared job/layout definitions used by the multi-buffer SHA-512 manager.
%include "sha512_job.asm"
%include "sha512_mb_mgr_datastruct.asm"

%include "reg_sizes.asm"

; 4-lane SHA-512 AVX2 compute core; invoked once all lanes hold work.
extern sha512_mb_x4_avx2

%ifidn __OUTPUT_FORMAT__, elf64
; LINUX register definitions
%define arg1    rdi ; rcx
%define arg2    rsi ; rdx

; idx needs to be other than arg1, arg2, rbx, r12
%define idx             rdx ; rsi
%define last_len        rdx ; rsi

%define size_offset     rcx ; rdi
%define tmp2            rcx ; rdi

%else
; WINDOWS register definitions
%define arg1    rcx
%define arg2    rdx

; idx needs to be other than arg1, arg2, rbx, r12
%define last_len        rsi
%define idx             rsi

%define size_offset     rdi
%define tmp2            rdi

%endif

; Common definitions
; NOTE: several aliases below share one physical register (e.g. job/len2/p2
; all live in arg2); their live ranges in the code never overlap.
%define state   arg1
%define job     arg2
%define len2    arg2
%define p2      arg2

%define p               r11
%define start_offset    r11

%define unused_lanes    rbx

%define job_rax         rax
%define len             rax

%define lane            rbp
%define tmp3            rbp
%define lens3           rbp

%define extra_blocks    r8
%define lens0           r8

%define tmp             r9
%define lens1           r9

%define lane_data       r10
%define lens2           r10

; Stack save area: 10 XMM slots (win64 callee-saved xmm6-xmm15), then GPR
; save slots, then one slot holding the caller's pre-alignment rsp.
struc stack_frame
	.xmm: resb 16*10
	.gpr: resb 8*5
	.rsp: resb 8
endstruc

; STACK_SPACE needs to be an odd multiple of 8
; NOTE(review): the names look swapped - _XMM_SAVE addresses the GPR save
; area (.gpr) and _GPR_SAVE the saved-rsp slot (.rsp) - but all uses below
; are self-consistent (XMMs are stored at [rsp + 16*n], i.e. .xmm == 0),
; so they are documented here rather than renamed.
%define _XMM_SAVE       stack_frame.gpr
%define _GPR_SAVE       stack_frame.rsp
%define STACK_SPACE     stack_frame_size
100
; ISAL_SHA512_JOB* _sha512_mb_mgr_submit_avx2(ISAL_SHA512_MB_JOB_MGR *state, ISAL_SHA512_JOB *job)
; arg 1 : rcx : state
; arg 2 : rdx : job
;
; Queue one job into a free lane of the 4-lane AVX2 SHA-512 manager.
; Returns NULL (rax = 0) while free lanes remain; once every lane is
; busy it runs sha512_mb_x4_avx2 for the minimum common length and
; returns the job that completed (pointer in rax).
; Saves/restores rbx, rbp, r12 (plus rsi, rdi, xmm6-xmm15 on win64).
mk_global _sha512_mb_mgr_submit_avx2, function, internal
_sha512_mb_mgr_submit_avx2:
	endbranch

	mov     rax, rsp                ; keep caller's rsp across the alignment

	sub     rsp, STACK_SPACE
	and     rsp, ~31                ; 32-byte align the save area (vmovdqa below)

	mov     [rsp + stack_frame.rsp], rax    ; restored in the epilogue

	; save callee-saved registers this routine uses
	mov     [rsp + _XMM_SAVE + 8*0], rbx
	mov     [rsp + _XMM_SAVE + 8*1], rbp
	mov     [rsp + _XMM_SAVE + 8*2], r12
%ifidn __OUTPUT_FORMAT__, win64
	mov     [rsp + _XMM_SAVE + 8*3], rsi
	mov     [rsp + _XMM_SAVE + 8*4], rdi
	; xmm6-xmm15 are callee-saved in the Microsoft x64 ABI
	vmovdqa  [rsp + 16*0], xmm6
	vmovdqa  [rsp + 16*1], xmm7
	vmovdqa  [rsp + 16*2], xmm8
	vmovdqa  [rsp + 16*3], xmm9
	vmovdqa  [rsp + 16*4], xmm10
	vmovdqa  [rsp + 16*5], xmm11
	vmovdqa  [rsp + 16*6], xmm12
	vmovdqa  [rsp + 16*7], xmm13
	vmovdqa  [rsp + 16*8], xmm14
	vmovdqa  [rsp + 16*9], xmm15
%endif

	; pop the next free lane index off the byte-packed unused_lanes stack
	mov     unused_lanes, [state + _unused_lanes]
	movzx   lane, BYTE(unused_lanes)
	shr     unused_lanes, 8
	imul    lane_data, lane, _LANE_DATA_size
	mov     dword [job + _status], ISAL_STS_BEING_PROCESSED
	lea     lane_data, [state + _ldata + lane_data]
	mov     [state + _unused_lanes], unused_lanes
	mov     DWORD(len), [job + _len]

	; record the job in its lane; the length goes into the upper dword of
	; the 8-byte _lens entry (the low bits carry the lane index, as the
	; "and idx, 0xF" extraction in the scheduling loop below relies on)
	mov     [lane_data + _job_in_lane], job
	mov     [state + _lens + 4 + 8*lane], DWORD(len)


	; Load digest words from result_digest
	; job->result_digest is 8 contiguous qwords; scatter them into the
	; lane-interleaved args_digest layout (stride 32 = 4 lanes x 8 bytes)
	vmovdqa  xmm0, [job + _result_digest + 0*16]
	vmovdqa  xmm1, [job + _result_digest + 1*16]
	vmovdqa  xmm2, [job + _result_digest + 2*16]
	vmovdqa  xmm3, [job + _result_digest + 3*16]
	vmovq    [state + _args_digest + 8*lane + 0*32], xmm0
	vpextrq  [state + _args_digest + 8*lane + 1*32], xmm0, 1
	vmovq    [state + _args_digest + 8*lane + 2*32], xmm1
	vpextrq  [state + _args_digest + 8*lane + 3*32], xmm1, 1
	vmovq    [state + _args_digest + 8*lane + 4*32], xmm2
	vpextrq  [state + _args_digest + 8*lane + 5*32], xmm2, 1
	vmovq    [state + _args_digest + 8*lane + 6*32], xmm3
	vpextrq  [state + _args_digest + 8*lane + 7*32], xmm3, 1

	mov     p, [job + _buffer]
	mov     [state + _args_data_ptr + 8*lane], p

	add     dword [state + _num_lanes_inuse], 1
	cmp     unused_lanes, 0xff      ; 0xff sentinel => the stack is now empty
	jne     return_null             ; lanes still free: job queued, return NULL

start_loop:

	; Find min length
	; each _lens qword packs (length << 32) | lane-index bits, so an
	; unsigned qword min also selects the owning lane
	mov     lens0, [state + _lens + 0*8]
	mov     idx, lens0
	mov     lens1, [state + _lens + 1*8]
	cmp     lens1, idx
	cmovb   idx, lens1
	mov     lens2, [state + _lens + 2*8]
	cmp     lens2, idx
	cmovb   idx, lens2
	mov     lens3, [state + _lens + 3*8]
	cmp     lens3, idx
	cmovb   idx, lens3
	mov     len2, idx
	and     idx, 0xF                ; low bits = winning lane index
	and     len2, ~0xFF             ; drop index bits, keep the length
	jz      len_is_0                ; min length 0 => that lane's job is already done

	; charge the common minimum against every lane
	sub     lens0, len2
	sub     lens1, len2
	sub     lens2, len2
	sub     lens3, len2
	shr     len2, 32                ; unpack length into the arg2 dword
	mov     [state + _lens + 0*8], lens0
	mov     [state + _lens + 1*8], lens1
	mov     [state + _lens + 2*8], lens2
	mov     [state + _lens + 3*8], lens3

	; "state" and "args" are the same address, arg1
	; len is arg2
	call    sha512_mb_x4_avx2
	; state and idx are intact

len_is_0:

	; process completed job "idx"
	imul    lane_data, idx, _LANE_DATA_size
	lea     lane_data, [state + _ldata + lane_data]

	mov     job_rax, [lane_data + _job_in_lane]


	; push lane idx back onto the free-lane stack and mark the job done
	mov     unused_lanes, [state + _unused_lanes]
	mov     qword [lane_data + _job_in_lane], 0
	mov     dword [job_rax + _status], ISAL_STS_COMPLETED
	shl     unused_lanes, 8
	or      unused_lanes, idx
	mov     [state + _unused_lanes], unused_lanes

	sub     dword [state + _num_lanes_inuse], 1

	; gather the finished lane's digest back into job->result_digest
	vmovq    xmm0, [state + _args_digest + 8*idx + 0*32]
	vpinsrq  xmm0, [state + _args_digest + 8*idx + 1*32], 1
	vmovq    xmm1, [state + _args_digest + 8*idx + 2*32]
	vpinsrq  xmm1, [state + _args_digest + 8*idx + 3*32], 1
	vmovq    xmm2, [state + _args_digest + 8*idx + 4*32]
	vpinsrq  xmm2, [state + _args_digest + 8*idx + 5*32], 1
	vmovq    xmm3, [state + _args_digest + 8*idx + 6*32]
	vpinsrq  xmm3, [state + _args_digest + 8*idx + 7*32], 1
	vmovdqa  [job_rax + _result_digest + 0*16], xmm0
	vmovdqa  [job_rax + _result_digest + 1*16], xmm1
	vmovdqa  [job_rax + _result_digest + 2*16], xmm2
	vmovdqa  [job_rax + _result_digest + 3*16], xmm3

return:

	; restore callee-saved state and the caller's original rsp
%ifidn __OUTPUT_FORMAT__, win64
	vmovdqa  xmm6,  [rsp + 16*0]
	vmovdqa  xmm7,  [rsp + 16*1]
	vmovdqa  xmm8,  [rsp + 16*2]
	vmovdqa  xmm9,  [rsp + 16*3]
	vmovdqa  xmm10, [rsp + 16*4]
	vmovdqa  xmm11, [rsp + 16*5]
	vmovdqa  xmm12, [rsp + 16*6]
	vmovdqa  xmm13, [rsp + 16*7]
	vmovdqa  xmm14, [rsp + 16*8]
	vmovdqa  xmm15, [rsp + 16*9]
	mov     rsi, [rsp + _XMM_SAVE + 8*3]
	mov     rdi, [rsp + _XMM_SAVE + 8*4]
%endif
	mov     rbx, [rsp + _XMM_SAVE + 8*0]
	mov     rbp, [rsp + _XMM_SAVE + 8*1]
	mov     r12, [rsp + _XMM_SAVE + 8*2]
	mov	rsp, [rsp + stack_frame.rsp]

	ret

return_null:
	xor     job_rax, job_rax        ; NULL: job accepted but not yet complete
	jmp     return
258
section .data align=16

align 16
; NOTE(review): these are the 32-bit SHA-256 initial hash values, not the
; 64-bit SHA-512 ones, and nothing in this file references H0..H7 - this
; table appears to be unused leftover data; confirm before removing.
H0:     dd  0x6a09e667
H1:     dd  0xbb67ae85
H2:     dd  0x3c6ef372
H3:     dd  0xa54ff53a
H4:     dd  0x510e527f
H5:     dd  0x9b05688c
H6:     dd  0x1f83d9ab
H7:     dd  0x5be0cd19
270
271