; xref: /isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_sse.asm (revision 0a437795c8360736f38dfa5934aa03a1861d784c)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2016 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; Project includes. The structure-offset symbols used below (_status, _len,
; _ldata, _lens, ...) and the DWORD()/BYTE()/mk_global/endbranch helpers are
; not defined in this file, so they are expected to come from these includes.
%include "sha256_job.asm"
%include "sha256_mb_mgr_datastruct.asm"

%include "reg_sizes.asm"

; Multi-buffer SSE routine called by the submit function in this file.
extern  sha256_mb_x4_sse

[bits 64]
default rel
section .text

; Per-ABI register assignment.
; Several names deliberately share one register (idx/last_len,
; size_offset/tmp2); they must never be live at the same time.
%ifidn __OUTPUT_FORMAT__, elf64
; Linux register definitions
; (the trailing comment on each line names the register that the Windows
;  branch below assigns to the same role)
%define arg1    rdi ; rcx
%define arg2    rsi ; rdx

; idx needs to be other than arg1, arg2, rbx, r12
%define idx             rdx ; rsi
%define last_len        rdx ; rsi

%define size_offset     rcx ; rdi
%define tmp2            rcx ; rdi

%else
; WINDOWS register definitions (used for every output format other than elf64)
%define arg1    rcx
%define arg2    rdx

; idx needs to be other than arg1, arg2, rbx, r12
; rsi and rdi are callee-saved in the Microsoft x64 convention; the submit
; function saves and restores them in its win64-only prologue/epilogue.
%define last_len        rsi
%define idx             rsi

%define size_offset     rdi
%define tmp2            rdi

%endif

; Common definitions
; Names grouped together below alias the SAME register, so at most one name
; of each group may hold a live value at any point in the code.

; arg1 holds the manager pointer for the whole function.
%define state   arg1
; arg2 holds the job pointer on entry and is later reused as len2, the
; length argument passed to sha256_mb_x4_sse.
%define job     arg2
%define len2    arg2
%define p2      arg2

%define p               r11
%define start_offset    r11

%define unused_lanes    rbx

; rax: scratch for the packed length, then the returned job pointer.
%define job_rax         rax
%define len             rax

; rbp: lane index while submitting, then lens[3] during the min search.
%define lane            rbp
%define tmp3            rbp
%define lens3           rbp

%define extra_blocks    r8
%define lens0           r8

%define tmp             r9
%define lens1           r9

; r10: pointer to the per-lane data record, then lens[2] during the min search.
%define lane_data       r10
%define lens2           r10


; Stack frame layout, as offsets from rsp after "sub rsp, STACK_SPACE":
;   [rsp + 0         .. rsp + _XMM_SAVE)              xmm6-xmm15 save slots (win64 only)
;   [rsp + _XMM_SAVE .. rsp + _XMM_SAVE + _GPR_SAVE)  rbx, rbp, r12, rsi, rdi save slots
; _XMM_SAVE is therefore both the size of the XMM area and the offset of
; the GPR area.
;
; STACK_SPACE needs to be an odd multiple of 8: rsp is 8 mod 16 on function
; entry (return address pushed), so subtracting an odd multiple of 8 leaves
; rsp 16-byte aligned for the movdqa saves and for the nested call.
; The expressions are parenthesised so the macros stay correct if they are
; ever used inside a larger expression.
%define _XMM_SAVE       (16*10)
%define _GPR_SAVE       (8*5)
%define STACK_SPACE     (_GPR_SAVE + _XMM_SAVE)

;-----------------------------------------------------------------------
; ISAL_SHA256_JOB* _sha256_mb_mgr_submit_sse(ISAL_SHA256_MB_JOB_MGR *state, ISAL_SHA256_JOB *job)
;
; Places "job" into the lane named by the low nibble of state->unused_lanes.
; If lanes are still free afterwards the function returns NULL. Otherwise it
; runs sha256_mb_x4_sse for the shortest remaining length among the lanes and
; returns the job that completed.
;
; arg 1 : state : rdi on elf64, rcx otherwise
; arg 2 : job   : rsi on elf64, rdx otherwise
; return: rax   : completed job pointer, or 0 when nothing was processed
;
; Preserved for the caller: rbx, rbp, r12; on win64 also rsi, rdi and
; xmm6-xmm15. r12 is not used by the code here; presumably it is saved
; because sha256_mb_x4_sse modifies it -- confirm against that routine.
; Alignment: movdqa is used on job->result_digest, so that field must be
; 16-byte aligned.
;-----------------------------------------------------------------------
mk_global _sha256_mb_mgr_submit_sse, function, internal
_sha256_mb_mgr_submit_sse:
	endbranch

	; Prologue: reserve the frame and save the callee-saved registers.
	sub     rsp, STACK_SPACE
	mov     [rsp + _XMM_SAVE + 8*0], rbx
	mov     [rsp + _XMM_SAVE + 8*1], rbp
	mov     [rsp + _XMM_SAVE + 8*2], r12
%ifidn __OUTPUT_FORMAT__, win64
	; rsi/rdi and xmm6-xmm15 are callee-saved only in the Microsoft x64 ABI.
	mov     [rsp + _XMM_SAVE + 8*3], rsi
	mov     [rsp + _XMM_SAVE + 8*4], rdi
	movdqa  [rsp + 16*0], xmm6
	movdqa  [rsp + 16*1], xmm7
	movdqa  [rsp + 16*2], xmm8
	movdqa  [rsp + 16*3], xmm9
	movdqa  [rsp + 16*4], xmm10
	movdqa  [rsp + 16*5], xmm11
	movdqa  [rsp + 16*6], xmm12
	movdqa  [rsp + 16*7], xmm13
	movdqa  [rsp + 16*8], xmm14
	movdqa  [rsp + 16*9], xmm15
%endif

	; Pop a lane index off the nibble stack kept in state->unused_lanes:
	; the low nibble is the lane to use, the rest shifts down by 4 bits.
	mov     unused_lanes, [state + _unused_lanes]
	movzx   lane, BYTE(unused_lanes)
	and     lane, 0xF
	shr     unused_lanes, 4
	; lane_data = &state->ldata[lane]
	imul    lane_data, lane, _LANE_DATA_size
	mov     dword [job + _status], ISAL_STS_BEING_PROCESSED
	lea     lane_data, [state + _ldata + lane_data]
	mov     [state + _unused_lanes], unused_lanes
	mov     DWORD(len), [job + _len]	; 32-bit load, zero-extended

	; Pack the lane index into the low 4 bits: lens[lane] = (len << 4) | lane.
	; Only the low 32 bits are stored below, so a len of 2^28 or more would
	; lose its top bits.
	shl     len, 4
	or      len, lane

	mov     [lane_data + _job_in_lane], job
	mov     [state + _lens + 4*lane], DWORD(len)

	; Load digest words from result_digest and scatter them into the
	; manager's digest array: word i of this job is written to
	; args_digest + 16*i + 4*lane.
	movdqa  xmm0, [job + _result_digest + 0*16]
	movdqa  xmm1, [job + _result_digest + 1*16]
	movd    [state + _args_digest + 4*lane + 0*16], xmm0
	pextrd  [state + _args_digest + 4*lane + 1*16], xmm0, 1
	pextrd  [state + _args_digest + 4*lane + 2*16], xmm0, 2
	pextrd  [state + _args_digest + 4*lane + 3*16], xmm0, 3
	movd    [state + _args_digest + 4*lane + 4*16], xmm1
	pextrd  [state + _args_digest + 4*lane + 5*16], xmm1, 1
	pextrd  [state + _args_digest + 4*lane + 6*16], xmm1, 2
	pextrd  [state + _args_digest + 4*lane + 7*16], xmm1, 3


	; args.data_ptr[lane] = job->buffer
	mov     p, [job + _buffer]
	mov     [state + _args_data_ptr + 8*lane], p

	add     dword [state + _num_lanes_inuse], 1
	; Return NULL unless the value left in unused_lanes is exactly 0xF
	; (presumably the sentinel that remains once every lane is taken --
	; confirm against the manager init code).
	cmp     unused_lanes, 0xF
	jne     return_null

start_loop:
	; Find min length.
	; Each lens[] entry is (length << 4) | lane, so the unsigned minimum
	; gives both the shortest length and the lane that owns it. The 32-bit
	; loads zero-extend, which makes the 64-bit cmp/cmovb below an unsigned
	; comparison of the 32-bit values.
	mov     DWORD(lens0), [state + _lens + 0*4]
	mov     idx, lens0
	mov     DWORD(lens1), [state + _lens + 1*4]
	cmp     lens1, idx
	cmovb   idx, lens1
	mov     DWORD(lens2), [state + _lens + 2*4]
	cmp     lens2, idx
	cmovb   idx, lens2
	mov     DWORD(lens3), [state + _lens + 3*4]
	cmp     lens3, idx
	cmovb   idx, lens3
	; Split the minimum: idx = lane index, len2 = length still shifted left
	; by 4. len2 shares arg2 with job; the job pointer is no longer needed
	; in a register because it was stored in the lane data above.
	mov     len2, idx
	and     idx, 0xF
	and     len2, ~0xF
	jz      len_is_0			; ZF from the "and": nothing left to hash

	; Subtract the common length from every lane (lane nibbles are kept
	; because the low 4 bits of len2 are zero).
	sub     lens0, len2
	sub     lens1, len2
	sub     lens2, len2
	sub     lens3, len2
	shr     len2, 4				; back to an unshifted length
	mov     [state + _lens + 0*4], DWORD(lens0)
	mov     [state + _lens + 1*4], DWORD(lens1)
	mov     [state + _lens + 2*4], DWORD(lens2)
	mov     [state + _lens + 3*4], DWORD(lens3)

	; "state" and "args" are the same address, arg1
	; len is arg2
	; NOTE(review): no 32-byte Win64 shadow space is reserved below the XMM
	; save area before this call; that is only safe if sha256_mb_x4_sse never
	; writes to its caller's shadow space -- confirm.
	call     sha256_mb_x4_sse
	; state and idx are intact (relied on below; every other scratch value
	; is reloaded or recomputed after the call)

len_is_0:
	; process completed job "idx"
	imul    lane_data, idx, _LANE_DATA_size
	lea     lane_data, [state + _ldata + lane_data]

	; Detach the job from its lane, mark it completed and push the lane
	; index back onto the unused_lanes nibble stack.
	mov     job_rax, [lane_data + _job_in_lane]
	mov     unused_lanes, [state + _unused_lanes]
	mov     qword [lane_data + _job_in_lane], 0
	mov     dword [job_rax + _status], ISAL_STS_COMPLETED
	shl     unused_lanes, 4
	or      unused_lanes, idx
	mov     [state + _unused_lanes], unused_lanes

	sub     dword [state + _num_lanes_inuse], 1

	; Gather the lane's eight digest words back out of the manager's digest
	; array (inverse of the scatter done at submit time).
	movd    xmm0, [state + _args_digest + 4*idx + 0*16]
	pinsrd  xmm0, [state + _args_digest + 4*idx + 1*16], 1
	pinsrd  xmm0, [state + _args_digest + 4*idx + 2*16], 2
	pinsrd  xmm0, [state + _args_digest + 4*idx + 3*16], 3
	movd    xmm1, [state + _args_digest + 4*idx + 4*16]
	pinsrd  xmm1, [state + _args_digest + 4*idx + 5*16], 1
	pinsrd  xmm1, [state + _args_digest + 4*idx + 6*16], 2
	pinsrd  xmm1, [state + _args_digest + 4*idx + 7*16], 3

	movdqa  [job_rax + _result_digest + 0*16], xmm0
	movdqa  [job_rax + _result_digest + 1*16], xmm1

return:
	; Epilogue: rax already holds the return value (job pointer or 0).

%ifidn __OUTPUT_FORMAT__, win64
	movdqa  xmm6,  [rsp + 16*0]
	movdqa  xmm7,  [rsp + 16*1]
	movdqa  xmm8,  [rsp + 16*2]
	movdqa  xmm9,  [rsp + 16*3]
	movdqa  xmm10, [rsp + 16*4]
	movdqa  xmm11, [rsp + 16*5]
	movdqa  xmm12, [rsp + 16*6]
	movdqa  xmm13, [rsp + 16*7]
	movdqa  xmm14, [rsp + 16*8]
	movdqa  xmm15, [rsp + 16*9]
	mov     rsi, [rsp + _XMM_SAVE + 8*3]
	mov     rdi, [rsp + _XMM_SAVE + 8*4]
%endif
	mov     rbx, [rsp + _XMM_SAVE + 8*0]
	mov     rbp, [rsp + _XMM_SAVE + 8*1]
	mov     r12, [rsp + _XMM_SAVE + 8*2]
	add     rsp, STACK_SPACE

	ret

return_null:
	; No job completed on this call: return NULL through the shared epilogue.
	xor     job_rax, job_rax
	jmp     return


section .data align=16

; SHA-256 initial hash values H0..H7, one dword each, 16-byte aligned.
; No instruction in this file references these labels.
align 16
H0:     dd  0x6a09e667
H1:     dd  0xbb67ae85
H2:     dd  0x3c6ef372
H3:     dd  0xa54ff53a
H4:     dd  0x510e527f
H5:     dd  0x9b05688c
H6:     dd  0x1f83d9ab
H7:     dd  0x5be0cd19


262