;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Implement fast SHA-256 with SSSE3 instructions. (x86_64)
;
; Copyright (C) 2013 Intel Corporation.
;
; Authors:
;     James Guilford <james.guilford@intel.com>
;     Kirk Yap <kirk.s.yap@intel.com>
;     Tim Chen <tim.c.chen@linux.intel.com>
; Transcoded by:
;     Xiaodong Liu <xiaodong.liu@intel.com>
;
; This software is available to you under the OpenIB.org BSD license
; below:
;
;     Redistribution and use in source and binary forms, with or
;     without modification, are permitted provided that the following
;     conditions are met:
;
;      - Redistributions of source code must retain the above
;        copyright notice, this list of conditions and the following
;        disclaimer.
;
;      - Redistributions in binary form must reproduce the above
;        copyright notice, this list of conditions and the following
;        disclaimer in the documentation and/or other materials
;        provided with the distribution.
;
; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
; EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
; MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
; NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
; BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
; ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
; CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
; SOFTWARE.
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; This code is described in an Intel White-Paper:
; "Fast SHA-256 Implementations on Intel Architecture Processors"
;
; To find it, surf to http://www.intel.com/p/en_US/embedded
; and search for that title.
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%include "sha256_mb_mgr_datastruct.asm"
%include "reg_sizes.asm"

[bits 64]
default rel
section .text

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%ifidn __OUTPUT_FORMAT__, elf64
 ; Linux
 %define arg0  rdi
 %define arg1  rsi
%else
 ; Windows
 %define arg0   rcx
 %define arg1   rdx
%endif

%xdefine X0 xmm4
%xdefine X1 xmm5
%xdefine X2 xmm6
%xdefine X3 xmm7

%xdefine XTMP0 xmm0
%xdefine XTMP1 xmm1
%xdefine XTMP2 xmm2
%xdefine XTMP3 xmm3
%xdefine XTMP4 xmm8
%xdefine XFER xmm9

%define SHUF_00BA xmm10      ; shuffle xBxA -> 00BA
%define SHUF_DC00 xmm11      ; shuffle xDxC -> DC00
%define BYTE_FLIP_MASK xmm12

; arg indices start from 0, while those of mgr_flush/submit start from 1
%define MGR	arg0	; rdi or rcx
%define NBLK	arg1	; rsi or rdx
%define IDX	r8	; local variable -- consistent with caller
%define NLANX4	r10	; consistent with caller, should be r10

%define TMGR r9	; manager pointer, saved on the stack at _TMGR
%define INP r9	; input data pointer, saved on the stack at _INP
%define SRND r9	; clobbers INP
%define TMP r9	; local variable -- used to address the digest

%xdefine TBL rbp
%xdefine c ecx
%xdefine d esi
%xdefine e edx
%xdefine a eax
%xdefine b ebx

%xdefine f edi
%xdefine g r12d
%xdefine h r11d

%xdefine y0 r13d
%xdefine y1 r14d
%xdefine y2 r15d


;; STACK_SIZE plus pushes must be an odd multiple of 8
%define _STACK_ALIGN_SIZE 8	; 0 or 8 depends on pushes
%define _INP_END_SIZE 8
%define _INP_SIZE 8
%define _TMGR_SIZE 8
%define _XFER_SIZE 16
%define _XMM_SAVE_SIZE 0
%define _GPR_SAVE_SIZE 8*9	;rbx, rdx, rbp, (rdi, rsi), r12~r15

%define _STACK_ALIGN 0
%define _INP_END (_STACK_ALIGN  + _STACK_ALIGN_SIZE)
%define _INP (_INP_END  + _INP_END_SIZE)
%define _TMGR (_INP + _INP_SIZE)
%define _XFER (_TMGR + _TMGR_SIZE)
%define _XMM_SAVE (_XFER + _XFER_SIZE)
%define _GPR_SAVE (_XMM_SAVE + _XMM_SAVE_SIZE)
%define STACK_SIZE (_GPR_SAVE + _GPR_SAVE_SIZE)
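
; For reference, the resulting frame layout (byte offsets from rsp after
; "sub rsp, STACK_SIZE"; derived from the defines above):
;    0 ..   7  alignment padding
;    8 ..  15  _INP_END   pointer to the end of the input data
;   16 ..  23  _INP       current input pointer
;   24 ..  31  _TMGR      saved MGR (args) pointer
;   32 ..  47  _XFER      K[t] + W[t] values for the current four rounds
;   48 .. 119  _GPR_SAVE  nine saved general-purpose registers
;                         (_XMM_SAVE is unused here, size 0)
; STACK_SIZE = 120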

;; assume buffers not aligned
%define    MOVDQ movdqu

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros

; addm reg, [mem]
; Add reg to mem, then reload the sum into reg
%macro addm 2
        add     %2, %1 ;changed
        mov     %1, %2 ;changed
%endmacro

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
; Load xmm with mem and byte swap each dword
%macro COPY_XMM_AND_BSWAP 3
        MOVDQ %1, %2 ;changed
        pshufb %1, %3 ;changed
%endmacro

; rotate_Xs
; Rotate values of symbols X0...X3
%macro rotate_Xs 0
%xdefine X_ X0
%xdefine X0 X1
%xdefine X1 X2
%xdefine X2 X3
%xdefine X3 X_
%endmacro

; ROTATE_ARGS
; Rotate values of symbols a...h
%macro ROTATE_ARGS 0
%xdefine TMP_ h
%xdefine h g
%xdefine g f
%xdefine f e
%xdefine e d
%xdefine d c
%xdefine c b
%xdefine b a
%xdefine a TMP_
%endmacro

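; Reference: the macros below implement the FIPS 180-4 SHA-256 round and
; message-schedule functions (rotates written as "ror", shifts as ">>"):
;   Ch(e,f,g)  = (e & f) ^ (~e & g)           -- computed as ((f ^ g) & e) ^ g
;   Maj(a,b,c) = (a & b) ^ (a & c) ^ (b & c)  -- computed as ((a | c) & b) | (a & c)
;   S0(a) = (a ror 2)  ^ (a ror 13) ^ (a ror 22)
;   S1(e) = (e ror 6)  ^ (e ror 11) ^ (e ror 25)
;   s0(w) = (w ror 7)  ^ (w ror 18) ^ (w >> 3)
;   s1(w) = (w ror 17) ^ (w ror 19) ^ (w >> 10)
;   W[t]  = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16]        for t >= 16
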
%macro FOUR_ROUNDS_AND_SCHED 0
	;; compute s0 four at a time and s1 two at a time
	;; compute W[-16] + W[-7] 4 at a time
	movdqa  XTMP0, X3
	mov     y0, e 			; y0 = e
	ror     y0, (25-11)             ; y0 = e >> (25-11)
	mov     y1, a                   ; y1 = a
	palignr XTMP0, X2, 4            ; XTMP0 = W[-7]
	ror     y1, (22-13)             ; y1 = a >> (22-13)
	xor     y0, e                   ; y0 = e ^ (e >> (25-11))
	mov     y2, f                   ; y2 = f
	ror     y0, (11-6)              ; y0 = (e >> (11-6)) ^ (e >> (25-6))
	movdqa  XTMP1, X1
	xor     y1, a                   ; y1 = a ^ (a >> (22-13))
	xor     y2, g                   ; y2 = f^g
	paddd   XTMP0, X0               ; XTMP0 = W[-7] + W[-16]
	xor     y0, e                   ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and     y2, e                   ; y2 = (f^g)&e
	ror     y1, (13-2)              ; y1 = (a >> (13-2)) ^ (a >> (22-2))
	;; compute s0
	palignr XTMP1, X0, 4            ; XTMP1 = W[-15]
	xor     y1, a                   ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	ror     y0, 6                   ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	xor     y2, g                   ; y2 = CH = ((f^g)&e)^g
	movdqa  XTMP2, XTMP1            ; XTMP2 = W[-15]
	ror     y1, 2                   ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	add     y2, y0                  ; y2 = S1 + CH
	add     y2, [rsp + _XFER]       ; y2 = k + w + S1 + CH
	movdqa  XTMP3, XTMP1            ; XTMP3 = W[-15]
	mov     y0, a                   ; y0 = a
	add     h, y2                   ; h = h + S1 + CH + k + w
	mov     y2, a                   ; y2 = a
	pslld   XTMP1, (32-7)           ;
	or      y0, c                   ; y0 = a|c
	add     d, h                    ; d = d + h + S1 + CH + k + w
	and     y2, c                   ; y2 = a&c
	psrld   XTMP2, 7                ;
	and     y0, b                   ; y0 = (a|c)&b
	add     h, y1                   ; h = h + S1 + CH + k + w + S0
	por     XTMP1, XTMP2            ; XTMP1 = W[-15] ror 7
	or      y0, y2                  ; y0 = MAJ = ((a|c)&b)|(a&c)
	add     h, y0                   ; h = h + S1 + CH + k + w + S0 + MAJ

	ROTATE_ARGS
	movdqa  XTMP2, XTMP3            ; XTMP2 = W[-15]
	mov     y0, e                   ; y0 = e
	mov     y1, a                   ; y1 = a
	movdqa  XTMP4, XTMP3            ; XTMP4 = W[-15]
	ror     y0, (25-11)             ; y0 = e >> (25-11)
	xor     y0, e                   ; y0 = e ^ (e >> (25-11))
	mov     y2, f                   ; y2 = f
	ror     y1, (22-13)             ; y1 = a >> (22-13)
	pslld   XTMP3, (32-18)          ;
	xor     y1, a                   ; y1 = a ^ (a >> (22-13))
	ror     y0, (11-6)              ; y0 = (e >> (11-6)) ^ (e >> (25-6))
	xor     y2, g                   ; y2 = f^g
	psrld   XTMP2, 18               ;
	ror     y1, (13-2)              ; y1 = (a >> (13-2)) ^ (a >> (22-2))
	xor     y0, e                   ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and     y2, e                   ; y2 = (f^g)&e
	ror     y0, 6                   ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	pxor    XTMP1, XTMP3
	xor     y1, a                   ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	xor     y2, g                   ; y2 = CH = ((f^g)&e)^g
	psrld   XTMP4, 3                ; XTMP4 = W[-15] >> 3
	add     y2, y0                  ; y2 = S1 + CH
	add     y2, [rsp + (1*4 + _XFER)] ; y2 = k + w + S1 + CH
	ror     y1, 2                   ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	pxor    XTMP1, XTMP2            ; XTMP1 = W[-15] ror 7 ^ W[-15] ror 18
	mov     y0, a                   ; y0 = a
	add     h, y2                   ; h = h + S1 + CH + k + w
	mov     y2, a                   ; y2 = a
	pxor    XTMP1, XTMP4            ; XTMP1 = s0
	or      y0, c                   ; y0 = a|c
	add     d, h                    ; d = d + h + S1 + CH + k + w
	and     y2, c                   ; y2 = a&c
	;; compute low s1
	pshufd  XTMP2, X3, 11111010B    ; XTMP2 = W[-2] {BBAA}
	and     y0, b 			; y0 = (a|c)&b
	add     h, y1                   ; h = h + S1 + CH + k + w + S0
	paddd   XTMP0, XTMP1            ; XTMP0 = W[-16] + W[-7] + s0
	or      y0, y2                  ; y0 = MAJ = ((a|c)&b)|(a&c)
	add     h, y0                   ; h = h + S1 + CH + k + w + S0 + MAJ

	ROTATE_ARGS
	movdqa  XTMP3, XTMP2            ; XTMP3 = W[-2] {BBAA}
	mov     y0, e                   ; y0 = e
	mov     y1, a                   ; y1 = a
	ror     y0, (25-11)             ; y0 = e >> (25-11)
	movdqa  XTMP4, XTMP2            ; XTMP4 = W[-2] {BBAA}
	xor     y0, e                   ; y0 = e ^ (e >> (25-11))
	ror     y1, (22-13)             ; y1 = a >> (22-13)
	mov     y2, f                   ; y2 = f
	xor     y1, a                   ; y1 = a ^ (a >> (22-13))
	ror     y0, (11-6)              ; y0 = (e >> (11-6)) ^ (e >> (25-6))
	psrlq   XTMP2, 17               ; XTMP2 = W[-2] ror 17 {xBxA}
	xor     y2, g                   ; y2 = f^g
	psrlq   XTMP3, 19               ; XTMP3 = W[-2] ror 19 {xBxA}
	xor     y0, e                   ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and     y2, e                   ; y2 = (f^g)&e
	psrld   XTMP4, 10               ; XTMP4 = W[-2] >> 10 {BBAA}
	ror     y1, (13-2)              ; y1 = (a >> (13-2)) ^ (a >> (22-2))
	xor     y1, a                   ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	xor     y2, g                   ; y2 = CH = ((f^g)&e)^g
	ror     y0, 6                   ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	pxor    XTMP2, XTMP3
	add     y2, y0                  ; y2 = S1 + CH
	ror     y1, 2                   ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	add     y2, [rsp + (2*4 + _XFER)] ; y2 = k + w + S1 + CH
	pxor    XTMP4, XTMP2            ; XTMP4 = s1 {xBxA}
	mov     y0, a                   ; y0 = a
	add     h, y2                   ; h = h + S1 + CH + k + w
	mov     y2, a                   ; y2 = a
	pshufb  XTMP4, SHUF_00BA        ; XTMP4 = s1 {00BA}
	or      y0, c                   ; y0 = a|c
	add     d, h                    ; d = d + h + S1 + CH + k + w
	and     y2, c                   ; y2 = a&c
	paddd   XTMP0, XTMP4            ; XTMP0 = {..., ..., W[1], W[0]}
	and     y0, b                   ; y0 = (a|c)&b
	add     h, y1                   ; h = h + S1 + CH + k + w + S0
	;; compute high s1
	pshufd  XTMP2, XTMP0, 01010000B ; XTMP2 = W[-2] {DDCC}
	or      y0, y2                  ; y0 = MAJ = ((a|c)&b)|(a&c)
	add     h, y0                   ; h = h + S1 + CH + k + w + S0 + MAJ

	ROTATE_ARGS
	movdqa  XTMP3, XTMP2            ; XTMP3 = W[-2] {DDCC}
	mov     y0, e                   ; y0 = e
	ror     y0, (25-11)             ; y0 = e >> (25-11)
	mov     y1, a                   ; y1 = a
	movdqa  X0, XTMP2               ; X0    = W[-2] {DDCC}
	ror     y1, (22-13)             ; y1 = a >> (22-13)
	xor     y0, e                   ; y0 = e ^ (e >> (25-11))
	mov     y2, f                   ; y2 = f
	ror     y0, (11-6)              ; y0 = (e >> (11-6)) ^ (e >> (25-6))
	psrlq   XTMP2, 17               ; XTMP2 = W[-2] ror 17 {xDxC}
	xor     y1, a                   ; y1 = a ^ (a >> (22-13))
	xor     y2, g                   ; y2 = f^g
	psrlq   XTMP3, 19               ; XTMP3 = W[-2] ror 19 {xDxC}
	xor     y0, e                   ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and     y2, e                   ; y2 = (f^g)&e
	ror     y1, (13-2)              ; y1 = (a >> (13-2)) ^ (a >> (22-2))
	psrld   X0, 10                  ; X0 = W[-2] >> 10 {DDCC}
	xor     y1, a                   ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	ror     y0, 6                   ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	xor     y2, g                   ; y2 = CH = ((f^g)&e)^g
	pxor    XTMP2, XTMP3            ;
	ror     y1, 2                   ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	add     y2, y0                  ; y2 = S1 + CH
	add     y2, [rsp + (3*4 + _XFER)] ; y2 = k + w + S1 + CH
	pxor    X0, XTMP2               ; X0 = s1 {xDxC}
	mov     y0, a                   ; y0 = a
	add     h, y2                   ; h = h + S1 + CH + k + w
	mov     y2, a                   ; y2 = a
	pshufb  X0, SHUF_DC00           ; X0 = s1 {DC00}
	or      y0, c                   ; y0 = a|c
	add     d, h                    ; d = d + h + S1 + CH + k + w
	and     y2, c                   ; y2 = a&c
	paddd   X0, XTMP0               ; X0 = {W[3], W[2], W[1], W[0]}
	and     y0, b                   ; y0 = (a|c)&b
	add     h, y1                   ; h = h + S1 + CH + k + w + S0
	or      y0, y2                  ; y0 = MAJ = ((a|c)&b)|(a&c)
	add     h, y0                   ; h = h + S1 + CH + k + w + S0 + MAJ

	ROTATE_ARGS
	rotate_Xs
%endmacro
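
; Each invocation of FOUR_ROUNDS_AND_SCHED performs four rounds using the
; K[t] + W[t] values staged at [rsp + _XFER] and schedules the next four
; message words; after rotate_Xs the newly computed words sit in X3, with
; X0..X2 holding the three older 4-word windows.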

;; input is [rsp + _XFER + %1 * 4]
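;; In FIPS 180-4 terms, each DO_ROUND computes T1 = h + S1(e) + Ch(e,f,g)
;; + K[t] + W[t] and T2 = S0(a) + Maj(a,b,c), adds T1 into d and leaves
;; T1 + T2 in h; ROTATE_ARGS then renames the registers so those sums
;; become the new e and a for the next round.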
%macro DO_ROUND 1
	mov     y0, e                 ; y0 = e
	ror     y0, (25-11)           ; y0 = e >> (25-11)
	mov     y1, a                 ; y1 = a
	xor     y0, e                 ; y0 = e ^ (e >> (25-11))
	ror     y1, (22-13)           ; y1 = a >> (22-13)
	mov     y2, f                 ; y2 = f
	xor     y1, a                 ; y1 = a ^ (a >> (22-13))
	ror     y0, (11-6)            ; y0 = (e >> (11-6)) ^ (e >> (25-6))
	xor     y2, g                 ; y2 = f^g
	xor     y0, e                 ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	ror     y1, (13-2)            ; y1 = (a >> (13-2)) ^ (a >> (22-2))
	and     y2, e                 ; y2 = (f^g)&e
	xor     y1, a                 ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	ror     y0, 6                 ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	xor     y2, g                 ; y2 = CH = ((f^g)&e)^g
	add     y2, y0                ; y2 = S1 + CH
	ror     y1, 2                 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	%xdefine offset (%1 * 4 + _XFER)
	add     y2, [rsp + offset]    ; y2 = k + w + S1 + CH
	mov     y0, a                 ; y0 = a
	add     h, y2                 ; h = h + S1 + CH + k + w
	mov     y2, a                 ; y2 = a
	or      y0, c                 ; y0 = a|c
	add     d, h                  ; d = d + h + S1 + CH + k + w
	and     y2, c                 ; y2 = a&c
	and     y0, b                 ; y0 = (a|c)&b
	add     h, y1                 ; h = h + S1 + CH + k + w + S0
	or      y0, y2 		      ; y0 = MAJ = ((a|c)&b)|(a&c)
	add     h, y0 		      ; h = h + S1 + CH + k + w + S0 + MAJ
	ROTATE_ARGS
%endmacro

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; void sha256_opt_x1(SHA256_MB_ARGS_Xn *args, uint32_t size_in_blocks);
; arg 0 : MGR : pointer to args (only 4 of the 16 lanes used)
; arg 1 : NBLK : size (in blocks) ;; assumed to be >= 1
; invisible arg 2 : IDX : which lane to hash
; invisible arg 3 : NLANX4 : max lanes*4 for this arch (the digest is laid out with this stride)
; 		 (sse/avx is 4, avx2 is 8, avx512 is 16)
;
; Clobbers registers: all general regs, xmm0-xmm12
;	{rbx, rdx, rbp, (rdi, rsi), r12~r15 are saved on stack}
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
section .text
mk_global sha256_opt_x1, function, internal
sha256_opt_x1:
	endbranch
	sub     rsp, STACK_SIZE
	mov     [rsp + _GPR_SAVE + 8*0], rbx
	mov     [rsp + _GPR_SAVE + 8*1], rbp
%ifidn __OUTPUT_FORMAT__, win64
	mov     [rsp + _GPR_SAVE + 8*2], rdi
	mov     [rsp + _GPR_SAVE + 8*3], rsi
	; caller has already stored XMM6~10
%endif
	mov     [rsp + _GPR_SAVE + 8*4], r12
	mov     [rsp + _GPR_SAVE + 8*5], r13
	mov     [rsp + _GPR_SAVE + 8*6], r14
	mov     [rsp + _GPR_SAVE + 8*7], r15
	mov     [rsp + _GPR_SAVE + 8*8], rdx

	shl     NBLK, 6 		 ; convert to bytes
	jz      done_hash

	; detach idx from nlanx4
	mov	IDX, NLANX4
	shr	NLANX4, 8
	and	IDX, 0xff
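	; (The caller packs both values into r10: the lane index IDX in the
	; low byte and NLANX4, the lane stride in bytes, in the bits above it.)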

	mov     [rsp + _TMGR], MGR
	;; Load input pointers
	mov     INP, [MGR + _data_ptr + IDX*8]
	mov     [rsp + _INP], INP
	;; convert NBLK into a pointer to the end of the data
	add     NBLK, INP
	mov     [rsp + _INP_END], NBLK  ; pointer to end of data


	mov     TMGR, [rsp + _TMGR]
	;; load initial digest
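	;; Digest words are stored interleaved across lanes: word i of lane
	;; IDX lives at TMGR + 4*IDX + i*NLANX4 (the digest array appears to
	;; start at offset 0 of the args structure), so a..h below are loaded
	;; from words 0..7 of this lane.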
	lea	TMP, [TMGR + 4*IDX]
	mov     a, [TMP + 0*NLANX4]
	mov     b, [TMP + 1*NLANX4]
	mov     c, [TMP + 2*NLANX4]
	lea	TMP, [TMP + 2*NLANX4]	; MGR + 4*IDX + 2*NLANX4
	mov     d, [TMP + 1*NLANX4]
	mov     e, [TMP + 2*NLANX4]
	mov     g, [TMP + 4*NLANX4]
	lea	TMP, [TMP + 1*NLANX4]	; MGR + 4*IDX + 3*NLANX4
	mov     f, [TMP + 2*NLANX4]
	mov     h, [TMP + 4*NLANX4]

	movdqa  BYTE_FLIP_MASK, [PSHUFFLE_BYTE_FLIP_MASK]
	movdqa  SHUF_00BA, [_SHUF_00BA]
	movdqa  SHUF_DC00, [_SHUF_DC00]

	mov     INP, [rsp + _INP]
loop0:
	lea     TBL, [K256]

	;; byte swap first 16 dwords
	COPY_XMM_AND_BSWAP      X0, [INP + 0*16], BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP      X1, [INP + 1*16], BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP      X2, [INP + 2*16], BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP      X3, [INP + 3*16], BYTE_FLIP_MASK

	mov     [rsp + _INP], INP

	;; schedule 48 input dwords, by doing 3 iterations of 16 rounds each
	mov     SRND, 3

loop1:
	movdqa  XFER, [TBL]
	paddd   XFER, X0
	movdqa  [rsp + _XFER], XFER
	FOUR_ROUNDS_AND_SCHED

	movdqa  XFER, [TBL + 1*16]
	paddd   XFER, X0
	movdqa  [rsp + _XFER], XFER
	FOUR_ROUNDS_AND_SCHED

	movdqa  XFER, [TBL + 2*16]
	paddd   XFER, X0
	movdqa  [rsp + _XFER], XFER
	FOUR_ROUNDS_AND_SCHED

	movdqa  XFER, [TBL + 3*16]
	paddd   XFER, X0
	movdqa  [rsp + _XFER], XFER
	add     TBL, 4*16
	FOUR_ROUNDS_AND_SCHED

	sub     SRND, 1
	jne     loop1

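	;; Final 16 rounds (t = 48..63): the message words are already in
	;; X0..X3, so no further scheduling is needed; two iterations of
	;; eight rounds each consume them.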
	mov     SRND, 2
loop2:
	paddd   X0, [TBL]
	movdqa  [rsp + _XFER], X0
	DO_ROUND        0
	DO_ROUND        1
	DO_ROUND        2
	DO_ROUND        3
	paddd   X1, [TBL + 1*16]
	movdqa  [rsp + _XFER], X1
	add     TBL, 2*16
	DO_ROUND        0
	DO_ROUND        1
	DO_ROUND        2
	DO_ROUND        3

	movdqa  X0, X2
	movdqa  X1, X3

	sub     SRND, 1
	jne     loop2

	; write out digests
	mov     TMGR, [rsp + _TMGR]
	lea	TMP, [TMGR + 4*IDX]
	addm    a, [TMP + 0*NLANX4]
	addm    b, [TMP + 1*NLANX4]
	addm    c, [TMP + 2*NLANX4]
	lea	TMP, [TMP + 2*NLANX4]	; MGR + 4*IDX + 2*NLANX4
	addm    d, [TMP + 1*NLANX4]
	addm    e, [TMP + 2*NLANX4]
	addm    g, [TMP + 4*NLANX4]
	lea	TMP, [TMP + 1*NLANX4]	; MGR + 4*IDX + 3*NLANX4
	addm    f, [TMP + 2*NLANX4]
	addm    h, [TMP + 4*NLANX4]

	mov     INP, [rsp + _INP]
	add     INP, 64
	cmp     INP, [rsp + _INP_END]
	jne     loop0

done_hash:
	mov     MGR, [rsp + _TMGR]

	mov     rdx, [rsp + _GPR_SAVE + 8*8]
	mov     r15, [rsp + _GPR_SAVE + 8*7]
	mov     r14, [rsp + _GPR_SAVE + 8*6]
	mov     r13, [rsp + _GPR_SAVE + 8*5]
	mov     r12, [rsp + _GPR_SAVE + 8*4]
%ifidn __OUTPUT_FORMAT__, win64
	mov     rsi, [rsp + _GPR_SAVE + 8*3]
	mov     rdi, [rsp + _GPR_SAVE + 8*2]
%endif
	mov     rbp, [rsp + _GPR_SAVE + 8*1]
	mov     rbx, [rsp + _GPR_SAVE + 8*0]
	add     rsp, STACK_SIZE

	ret

section .data
align 64
K256:
        DD 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
        DD 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
        DD 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
        DD 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
        DD 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
        DD 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
        DD 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
        DD 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
        DD 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
        DD 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
        DD 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
        DD 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
        DD 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
        DD 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
        DD 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
        DD 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

PSHUFFLE_BYTE_FLIP_MASK:
	DQ 0x0405060700010203, 0x0c0d0e0f08090a0b

; shuffle xBxA -> 00BA
_SHUF_00BA:
	DQ 0x0b0a090803020100, 0xFFFFFFFFFFFFFFFF

; shuffle xDxC -> DC00
_SHUF_DC00:
	DQ 0xFFFFFFFFFFFFFFFF, 0x0b0a090803020100
568