xref: /isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_sse_ni.asm (revision d28f1034f736e3eb791c3cf6bff3e2fa81fb5331)
1;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2;  Copyright(c) 2011-2017 Intel Corporation All rights reserved.
3;
4;  Redistribution and use in source and binary forms, with or without
5;  modification, are permitted provided that the following conditions
6;  are met:
7;    * Redistributions of source code must retain the above copyright
8;      notice, this list of conditions and the following disclaimer.
9;    * Redistributions in binary form must reproduce the above copyright
10;      notice, this list of conditions and the following disclaimer in
11;      the documentation and/or other materials provided with the
12;      distribution.
13;    * Neither the name of Intel Corporation nor the names of its
14;      contributors may be used to endorse or promote products derived
15;      from this software without specific prior written permission.
16;
17;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
29
30%include "sha256_job.asm"
31%include "sha256_mb_mgr_datastruct.asm"
32
33%include "reg_sizes.asm"
34
35extern  sha256_mb_x4_sse
36extern  sha256_ni_x2
37
[bits 64]
default rel
section .text

; Register aliases. Several aliases deliberately map to the same physical
; register (e.g. idx/last_len, job/len2/p2); such names must never be live
; at the same time.

%ifidn __OUTPUT_FORMAT__, elf64
; Linux register definitions (trailing comment = the win64 counterpart)
%define arg1    rdi ; rcx
%define arg2    rsi ; rdx

; idx needs to be other than arg1, arg2, rbx, r12
%define idx             rdx ; rsi
%define last_len        rdx ; rsi

%define size_offset     rcx ; rdi
%define tmp2            rcx ; rdi

%else
; WINDOWS register definitions
%define arg1    rcx
%define arg2    rdx

; idx needs to be other than arg1, arg2, rbx, r12
%define last_len        rsi
%define idx             rsi

%define size_offset     rdi
%define tmp2            rdi

%endif

; Common definitions
%define state   arg1
%define job     arg2
%define len2    arg2
%define p2      arg2

%define p               r11
%define start_offset    r11

%define unused_lanes    rbx

%define job_rax         rax
%define len             rax

%define lane            rbp
%define tmp3            rbp
%define lens3           rbp

%define extra_blocks    r8
%define lens0           r8

%define tmp             r9
%define lens1           r9

%define lane_data       r10
%define lens2           r10

; Stack frame layout (offsets from rsp after the prologue):
;   [0, _XMM_SAVE)                        : 10 x 16-byte slots for xmm6-xmm15 (win64 only)
;   [_XMM_SAVE, _XMM_SAVE + _GPR_SAVE)    : 7 x 8-byte slots for saved GPRs
;
; STACK_SPACE needs to be an odd multiple of 8: rsp is 8 (mod 16) on entry
; because of the pushed return address, so subtracting an odd multiple of 8
; leaves rsp 16-byte aligned, which the movdqa saves/restores require.
;
; The expressions are parenthesized so the macros expand safely inside any
; larger arithmetic expression.
%define _XMM_SAVE       (16*10)
%define _GPR_SAVE       (8*7)
%define STACK_SPACE     (_GPR_SAVE + _XMM_SAVE)
99
;-----------------------------------------------------------------------
; ISAL_SHA256_JOB* _sha256_mb_mgr_submit_sse_ni(ISAL_SHA256_MB_JOB_MGR *state, ISAL_SHA256_JOB *job)
; arg 1 : state : rdi (elf64) / rcx (win64)
; arg 2 : job   : rsi (elf64) / rdx (win64)
; return: rax   : pointer to a job that completed during this call,
;                 or NULL when no lane was processed.
;
; Puts the job into the next free lane (taken from the low nibble of
; state->_unused_lanes), copies its digest words and buffer pointer into
; the per-lane argument arrays, and records (len << 4) | lane in
; state->_lens so one unsigned minimum yields both the shortest length
; and the lane that owns it.
;
; When the required number of lanes is occupied, the hash kernel is run
; for the shortest pending length, and the job in the lane with that
; minimum is marked completed and returned.
;
; Saves/restores rbx, rbp, r12, r13, r14 (plus rsi, rdi, xmm6-xmm15 on win64).
;-----------------------------------------------------------------------
mk_global _sha256_mb_mgr_submit_sse_ni, function, internal
_sha256_mb_mgr_submit_sse_ni:
	endbranch

	; Prologue: reserve the frame and save the preserved registers.
	sub     rsp, STACK_SPACE
	mov     [rsp + _XMM_SAVE + 8*0], rbx
	mov     [rsp + _XMM_SAVE + 8*1], rbp
	mov     [rsp + _XMM_SAVE + 8*2], r12
	mov     [rsp + _XMM_SAVE + 8*5], r13
	mov     [rsp + _XMM_SAVE + 8*6], r14
%ifidn __OUTPUT_FORMAT__, win64
	mov     [rsp + _XMM_SAVE + 8*3], rsi
	mov     [rsp + _XMM_SAVE + 8*4], rdi
	movdqa  [rsp + 16*0], xmm6
	movdqa  [rsp + 16*1], xmm7
	movdqa  [rsp + 16*2], xmm8
	movdqa  [rsp + 16*3], xmm9
	movdqa  [rsp + 16*4], xmm10
	movdqa  [rsp + 16*5], xmm11
	movdqa  [rsp + 16*6], xmm12
	movdqa  [rsp + 16*7], xmm13
	movdqa  [rsp + 16*8], xmm14
	movdqa  [rsp + 16*9], xmm15
%endif

	; Pop the next free lane index: it is the low nibble of _unused_lanes.
	mov     unused_lanes, [state + _unused_lanes]
	movzx   lane, BYTE(unused_lanes)
	and     lane, 0xF
	shr     unused_lanes, 4
	imul    lane_data, lane, _LANE_DATA_size
	mov     dword [job + _status], ISAL_STS_BEING_PROCESSED
	lea     lane_data, [state + _ldata + lane_data]
	mov     [state + _unused_lanes], unused_lanes
	mov     DWORD(len), [job + _len]

	; lens[lane] = (len << 4) | lane
	shl     len, 4
	or      len, lane

	mov     [lane_data + _job_in_lane], job
	mov     [state + _lens + 4*lane], DWORD(len)

	; Load digest words from result_digest and scatter them into the
	; per-lane digest array: word i of this lane lives at
	; _args_digest + 4*lane + 16*i.
	movdqa  xmm0, [job + _result_digest + 0*16]
	movdqa  xmm1, [job + _result_digest + 1*16]
	movd    [state + _args_digest + 4*lane + 0*16], xmm0
	pextrd  [state + _args_digest + 4*lane + 1*16], xmm0, 1
	pextrd  [state + _args_digest + 4*lane + 2*16], xmm0, 2
	pextrd  [state + _args_digest + 4*lane + 3*16], xmm0, 3
	movd    [state + _args_digest + 4*lane + 4*16], xmm1
	pextrd  [state + _args_digest + 4*lane + 5*16], xmm1, 1
	pextrd  [state + _args_digest + 4*lane + 6*16], xmm1, 2
	pextrd  [state + _args_digest + 4*lane + 7*16], xmm1, 3

	mov     p, [job + _buffer]
	mov     [state + _args_data_ptr + 8*lane], p

	add     dword [state + _num_lanes_inuse], 1

	; compare with shani-sb threshold, if num_lanes_sse <= threshold, using shani func
  %if SHA256_NI_SB_THRESHOLD_SSE >= 4   ; there are 4 lanes in sse mb
  ; shani glue code
	; This gate belongs to the SHA-NI path only. Placing it ahead of the
	; %if made the 4-lane path below unreachable, because that path needs
	; unused_lanes == 0xF.
	cmp     unused_lanes, 0xF32	; we will process two jobs at the same time
	jne 	return_null		; wait for another sha_ni job

	; Find min length over lanes 0 and 1
	mov     DWORD(lens0), [state + _lens + 0*4]
	mov     idx, lens0
	mov     DWORD(lens1), [state + _lens + 1*4]
	cmp     lens1, idx
	cmovb   idx, lens1
	mov     len2, idx
	and     idx, 0xF		; idx  = lane holding the minimum
	and     len2, ~0xF		; len2 = minimum length, still shifted left by 4
	jz      len_is_0		; nothing to hash: complete the job directly
	; lensN-len2=idx
	sub     lens0, len2
	sub     lens1, len2

	shr     len2, 4
	mov     [state + _lens + 0*4], DWORD(lens0)
	mov     [state + _lens + 1*4], DWORD(lens1)
	mov     r10, idx
	or      r10, 0x1000     ; sse has 4 lanes *4, r10b is idx, r10b2 is 16
	; "state" and "args" are the same address, arg1
	; len is arg2, idx and nlane in r10
	call    sha256_ni_x2
	; state and idx are intact
  %else
  ; original mb code
	cmp     unused_lanes, 0xF	; only the 0xF terminator left: all 4 lanes in use
	jne     return_null

    start_loop:
	; Find min length
	mov     DWORD(lens0), [state + _lens + 0*4]
	mov     idx, lens0
	mov     DWORD(lens1), [state + _lens + 1*4]
	cmp     lens1, idx
	cmovb   idx, lens1
	mov     DWORD(lens2), [state + _lens + 2*4]
	cmp     lens2, idx
	cmovb   idx, lens2
	mov     DWORD(lens3), [state + _lens + 3*4]
	cmp     lens3, idx
	cmovb   idx, lens3
	mov     len2, idx
	and     idx, 0xF		; idx  = lane holding the minimum
	and     len2, ~0xF		; len2 = minimum length, still shifted left by 4
	jz      len_is_0		; nothing to hash: complete the job directly

	sub     lens0, len2
	sub     lens1, len2
	sub     lens2, len2
	sub     lens3, len2
	shr     len2, 4
	mov     [state + _lens + 0*4], DWORD(lens0)
	mov     [state + _lens + 1*4], DWORD(lens1)
	mov     [state + _lens + 2*4], DWORD(lens2)
	mov     [state + _lens + 3*4], DWORD(lens3)

	; "state" and "args" are the same address, arg1
	; len is arg2
	call     sha256_mb_x4_sse
	; state and idx are intact
  %endif
len_is_0:
	; process completed job "idx"
	imul    lane_data, idx, _LANE_DATA_size
	lea     lane_data, [state + _ldata + lane_data]

	mov     job_rax, [lane_data + _job_in_lane]
	mov     unused_lanes, [state + _unused_lanes]
	mov     qword [lane_data + _job_in_lane], 0
	mov     dword [job_rax + _status], ISAL_STS_COMPLETED
	; Push lane idx back onto the free-lane nibble stack.
	shl     unused_lanes, 4
	or      unused_lanes, idx
	mov     [state + _unused_lanes], unused_lanes

	sub     dword [state + _num_lanes_inuse], 1

	; Gather this lane's eight digest words back into the job's result_digest.
	movd    xmm0, [state + _args_digest + 4*idx + 0*16]
	pinsrd  xmm0, [state + _args_digest + 4*idx + 1*16], 1
	pinsrd  xmm0, [state + _args_digest + 4*idx + 2*16], 2
	pinsrd  xmm0, [state + _args_digest + 4*idx + 3*16], 3
	movd    xmm1, [state + _args_digest + 4*idx + 4*16]
	pinsrd  xmm1, [state + _args_digest + 4*idx + 5*16], 1
	pinsrd  xmm1, [state + _args_digest + 4*idx + 6*16], 2
	pinsrd  xmm1, [state + _args_digest + 4*idx + 7*16], 3

	movdqa  [job_rax + _result_digest + 0*16], xmm0
	movdqa  [job_rax + _result_digest + 1*16], xmm1

return:
	; Epilogue: restore everything saved in the prologue. rax holds the result.

%ifidn __OUTPUT_FORMAT__, win64
	movdqa  xmm6,  [rsp + 16*0]
	movdqa  xmm7,  [rsp + 16*1]
	movdqa  xmm8,  [rsp + 16*2]
	movdqa  xmm9,  [rsp + 16*3]
	movdqa  xmm10, [rsp + 16*4]
	movdqa  xmm11, [rsp + 16*5]
	movdqa  xmm12, [rsp + 16*6]
	movdqa  xmm13, [rsp + 16*7]
	movdqa  xmm14, [rsp + 16*8]
	movdqa  xmm15, [rsp + 16*9]
	mov     rsi, [rsp + _XMM_SAVE + 8*3]
	mov     rdi, [rsp + _XMM_SAVE + 8*4]
%endif
	mov     rbx, [rsp + _XMM_SAVE + 8*0]
	mov     rbp, [rsp + _XMM_SAVE + 8*1]
	mov     r12, [rsp + _XMM_SAVE + 8*2]
	mov     r13, [rsp + _XMM_SAVE + 8*5]
	mov     r14, [rsp + _XMM_SAVE + 8*6]
	add     rsp, STACK_SPACE

	ret

return_null:
	; No job completed on this call: return NULL.
	; Writing the 32-bit register zeroes the full 64-bit register.
	xor     DWORD(job_rax), DWORD(job_rax)
	jmp     return
282
section .data align=16

; SHA-256 initial hash values H0..H7 (FIPS 180-4), laid out as eight
; consecutive dwords starting on a 16-byte boundary.
; Not referenced by the code visible in this file.
align 16
H0:	dd	0x6A09E667
H1:	dd	0xBB67AE85
H2:	dd	0x3C6EF372
H3:	dd	0xA54FF53A
H4:	dd	0x510E527F
H5:	dd	0x9B05688C
H6:	dd	0x1F83D9AB
H7:	dd	0x5BE0CD19
294