xref: /isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_sse.asm (revision 0a437795c8360736f38dfa5934aa03a1861d784c)
16df3ef80SGreg Tucker;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
26df3ef80SGreg Tucker;  Copyright(c) 2011-2016 Intel Corporation All rights reserved.
36df3ef80SGreg Tucker;
46df3ef80SGreg Tucker;  Redistribution and use in source and binary forms, with or without
56df3ef80SGreg Tucker;  modification, are permitted provided that the following conditions
66df3ef80SGreg Tucker;  are met:
76df3ef80SGreg Tucker;    * Redistributions of source code must retain the above copyright
86df3ef80SGreg Tucker;      notice, this list of conditions and the following disclaimer.
96df3ef80SGreg Tucker;    * Redistributions in binary form must reproduce the above copyright
106df3ef80SGreg Tucker;      notice, this list of conditions and the following disclaimer in
116df3ef80SGreg Tucker;      the documentation and/or other materials provided with the
126df3ef80SGreg Tucker;      distribution.
136df3ef80SGreg Tucker;    * Neither the name of Intel Corporation nor the names of its
146df3ef80SGreg Tucker;      contributors may be used to endorse or promote products derived
156df3ef80SGreg Tucker;      from this software without specific prior written permission.
166df3ef80SGreg Tucker;
176df3ef80SGreg Tucker;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
186df3ef80SGreg Tucker;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
196df3ef80SGreg Tucker;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
206df3ef80SGreg Tucker;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
216df3ef80SGreg Tucker;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
226df3ef80SGreg Tucker;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
236df3ef80SGreg Tucker;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
246df3ef80SGreg Tucker;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
256df3ef80SGreg Tucker;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
266df3ef80SGreg Tucker;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
276df3ef80SGreg Tucker;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
286df3ef80SGreg Tucker;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
296df3ef80SGreg Tucker
306df3ef80SGreg Tucker%include "sha256_job.asm"
316df3ef80SGreg Tucker%include "sha256_mb_mgr_datastruct.asm"
326df3ef80SGreg Tucker
336df3ef80SGreg Tucker%include "reg_sizes.asm"
346df3ef80SGreg Tucker
356df3ef80SGreg Tuckerextern  sha256_mb_x4_sse
36246221b0SXiaodong Liuextern sha256_opt_x1
37246221b0SXiaodong Liu
387516bd6bSGreg Tucker[bits 64]
396df3ef80SGreg Tuckerdefault rel
407516bd6bSGreg Tuckersection .text
416df3ef80SGreg Tucker
; Register selection depends on the assembler output format: the elf64 branch
; takes arguments in rdi/rsi and uses rdx for idx; the %else branch takes
; arguments in rcx/rdx and uses rsi for idx.
426df3ef80SGreg Tucker%ifidn __OUTPUT_FORMAT__, elf64
436df3ef80SGreg Tucker; LINUX register definitions
446df3ef80SGreg Tucker%define arg1    rdi ; rcx
456df3ef80SGreg Tucker%define arg2    rsi ; rdx
466df3ef80SGreg Tucker
476df3ef80SGreg Tucker; idx needs to be other than arg1, arg2, rbx, r12
486df3ef80SGreg Tucker%define idx     rdx ; rsi
496df3ef80SGreg Tucker%else
506df3ef80SGreg Tucker; WINDOWS register definitions
516df3ef80SGreg Tucker%define arg1    rcx
526df3ef80SGreg Tucker%define arg2    rdx
536df3ef80SGreg Tucker
546df3ef80SGreg Tucker; idx needs to be other than arg1, arg2, rbx, r12
556df3ef80SGreg Tucker%define idx     rsi
566df3ef80SGreg Tucker%endif
576df3ef80SGreg Tucker
586df3ef80SGreg Tucker; Common definitions
; Several names below are aliases for the same physical register (for example
; unused_lanes, lane_data and tmp2 are all rbx; job, len2, extra_blocks and p
; are all arg2), so writing one alias overwrites every other alias of that
; register.
596df3ef80SGreg Tucker%define state   arg1
606df3ef80SGreg Tucker%define job     arg2
616df3ef80SGreg Tucker%define len2    arg2
626df3ef80SGreg Tucker
636df3ef80SGreg Tucker%define unused_lanes    rbx
646df3ef80SGreg Tucker%define lane_data       rbx
656df3ef80SGreg Tucker%define tmp2            rbx
666df3ef80SGreg Tucker
676df3ef80SGreg Tucker%define job_rax         rax
686df3ef80SGreg Tucker%define tmp1            rax
696df3ef80SGreg Tucker%define size_offset     rax
706df3ef80SGreg Tucker%define tmp             rax
716df3ef80SGreg Tucker%define start_offset    rax
726df3ef80SGreg Tucker
736df3ef80SGreg Tucker%define tmp3            arg1
746df3ef80SGreg Tucker
756df3ef80SGreg Tucker%define extra_blocks    arg2
766df3ef80SGreg Tucker%define p               arg2
776df3ef80SGreg Tucker
786df3ef80SGreg Tucker%define tmp4            r8
796df3ef80SGreg Tucker%define lens0           r8
806df3ef80SGreg Tucker
816df3ef80SGreg Tucker%define lens1           r9
826df3ef80SGreg Tucker%define lens2           r10
836df3ef80SGreg Tucker%define lens3           r11
846df3ef80SGreg Tucker
856df3ef80SGreg Tucker
866df3ef80SGreg Tucker; STACK_SPACE needs to be an odd multiple of 8
; Reason: with rsp % 16 == 8 on entry (return address just pushed), subtracting
; an odd multiple of 8 leaves rsp 16-byte aligned, which the movdqa saves at
; [rsp + _XMM_SAVE] rely on. Current value: 10*16 + 8*3 + 0 = 184 = 23*8.
; _XMM_SAVE_SIZE is ten 16-byte slots (xmm6-xmm15 are stored there in the win64
; build); _GPR_SAVE_SIZE is three 8-byte slots (rbx, r12, and rsi in the win64
; build).
876df3ef80SGreg Tucker_XMM_SAVE_SIZE  equ 10*16
886df3ef80SGreg Tucker_GPR_SAVE_SIZE  equ 8*3
896df3ef80SGreg Tucker_ALIGN_SIZE     equ 0
906df3ef80SGreg Tucker
; Offsets from rsp after the frame is allocated: XMM slots first, GPR slots
; directly after them.
916df3ef80SGreg Tucker_XMM_SAVE       equ 0
926df3ef80SGreg Tucker_GPR_SAVE       equ _XMM_SAVE + _XMM_SAVE_SIZE
936df3ef80SGreg TuckerSTACK_SPACE     equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE
946df3ef80SGreg Tucker
; Token-pastes a and b; used below to generate the skip_0 .. skip_3 labels
; inside the %rep loop.
956df3ef80SGreg Tucker%define APPEND(a,b) a %+ b
966df3ef80SGreg Tucker
97*0a437795SPablo de Lara; ISAL_SHA256_JOB* _sha256_mb_mgr_flush_sse(ISAL_SHA256_MB_JOB_MGR *state)
986df3ef80SGreg Tucker; arg 1 : state (arg1: rdi in the elf64 branch of the register definitions, rcx in the %else branch)
;
; Returns in rax: NULL when num_lanes_inuse is 0; otherwise the job removed
; from one lane, with its status set to ISAL_STS_COMPLETED and eight digest
; dwords copied from the manager's per-lane digest area into the job's result
; digest. The vacated lane index is pushed onto unused_lanes as a 4-bit value
; and num_lanes_inuse is decremented.
993c423071SPablo de Laramk_global _sha256_mb_mgr_flush_sse, function, internal
1003c423071SPablo de Lara_sha256_mb_mgr_flush_sse:
1017f8ce0f8SJohn Kariuki	endbranch
1026df3ef80SGreg Tucker
	; Allocate the save area and preserve the registers restored at "return".
	; rbx is reused below (lane_data / unused_lanes). r12 is not otherwise
	; referenced in this routine; presumably a callee clobbers it - confirm.
1036df3ef80SGreg Tucker	sub     rsp, STACK_SPACE
1046df3ef80SGreg Tucker	mov     [rsp + _GPR_SAVE + 8*0], rbx
1056df3ef80SGreg Tucker	mov     [rsp + _GPR_SAVE + 8*1], r12
1066df3ef80SGreg Tucker%ifidn __OUTPUT_FORMAT__, win64
	; win64 only: rsi (used as idx in the %else register set) and xmm6-xmm15
	; are callee-saved in the Microsoft x64 convention, so keep copies.
1076df3ef80SGreg Tucker	mov     [rsp + _GPR_SAVE + 8*2], rsi
1086df3ef80SGreg Tucker	movdqa  [rsp + _XMM_SAVE + 16*0], xmm6
1096df3ef80SGreg Tucker	movdqa  [rsp + _XMM_SAVE + 16*1], xmm7
1106df3ef80SGreg Tucker	movdqa  [rsp + _XMM_SAVE + 16*2], xmm8
1116df3ef80SGreg Tucker	movdqa  [rsp + _XMM_SAVE + 16*3], xmm9
1126df3ef80SGreg Tucker	movdqa  [rsp + _XMM_SAVE + 16*4], xmm10
1136df3ef80SGreg Tucker	movdqa  [rsp + _XMM_SAVE + 16*5], xmm11
1146df3ef80SGreg Tucker	movdqa  [rsp + _XMM_SAVE + 16*6], xmm12
1156df3ef80SGreg Tucker	movdqa  [rsp + _XMM_SAVE + 16*7], xmm13
1166df3ef80SGreg Tucker	movdqa  [rsp + _XMM_SAVE + 16*8], xmm14
1176df3ef80SGreg Tucker	movdqa  [rsp + _XMM_SAVE + 16*9], xmm15
1186df3ef80SGreg Tucker%endif
1196df3ef80SGreg Tucker
120246221b0SXiaodong Liu	; use num_lanes_inuse to judge all lanes are empty
121246221b0SXiaodong Liu	cmp	dword [state + _num_lanes_inuse], 0
122246221b0SXiaodong Liu	jz	return_null	; nothing queued: return NULL through the shared epilogue
1236df3ef80SGreg Tucker
1246df3ef80SGreg Tucker	; find a lane with a non-null job
	; idx starts at 0; each cmovne overrides the previous choice, so the
	; highest-numbered occupied lane among 1..3 wins, and idx stays 0 when
	; none of lanes 1..3 holds a job.
1256df3ef80SGreg Tucker	xor     idx, idx
1266df3ef80SGreg Tucker	cmp     qword [state + _ldata + 1 * _LANE_DATA_size + _job_in_lane], 0
1276df3ef80SGreg Tucker	cmovne  idx, [one]
1286df3ef80SGreg Tucker	cmp     qword [state + _ldata + 2 * _LANE_DATA_size + _job_in_lane], 0
1296df3ef80SGreg Tucker	cmovne  idx, [two]
1306df3ef80SGreg Tucker	cmp     qword [state + _ldata + 3 * _LANE_DATA_size + _job_in_lane], 0
1316df3ef80SGreg Tucker	cmovne  idx, [three]
1326df3ef80SGreg Tucker
1336df3ef80SGreg Tucker	; copy idx to empty lanes
1346df3ef80SGreg Tuckercopy_lane_data:
1356df3ef80SGreg Tucker	mov     tmp, [state + _args + _data_ptr + 8*idx]	; tmp = data pointer of the chosen lane
1366df3ef80SGreg Tucker
	; Unrolled over lanes 0..3: a lane whose job pointer is null receives the
	; chosen lane's data pointer and a length word of 0xFFFFFFFF (the largest
	; unsigned dword), so it never compares below another lane in the
	; unsigned minimum search that follows. Occupied lanes are skipped.
1376df3ef80SGreg Tucker%assign I 0
1386df3ef80SGreg Tucker%rep 4
1396df3ef80SGreg Tucker	cmp     qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
1406df3ef80SGreg Tucker	jne     APPEND(skip_,I)
1416df3ef80SGreg Tucker	mov     [state + _args + _data_ptr + 8*I], tmp
1426df3ef80SGreg Tucker	mov     dword [state + _lens + 4*I], 0xFFFFFFFF
1436df3ef80SGreg TuckerAPPEND(skip_,I):
1446df3ef80SGreg Tucker%assign I (I+1)
1456df3ef80SGreg Tucker%endrep
1466df3ef80SGreg Tucker
1476df3ef80SGreg Tucker	; Find min length
	; The four length words are loaded as dwords and compared unsigned
	; (cmovb); idx ends up holding the smallest one.
1486df3ef80SGreg Tucker	mov     DWORD(lens0), [state + _lens + 0*4]
1496df3ef80SGreg Tucker	mov     idx, lens0
1506df3ef80SGreg Tucker	mov     DWORD(lens1), [state + _lens + 1*4]
1516df3ef80SGreg Tucker	cmp     lens1, idx
1526df3ef80SGreg Tucker	cmovb   idx, lens1
1536df3ef80SGreg Tucker	mov     DWORD(lens2), [state + _lens + 2*4]
1546df3ef80SGreg Tucker	cmp     lens2, idx
1556df3ef80SGreg Tucker	cmovb   idx, lens2
1566df3ef80SGreg Tucker	mov     DWORD(lens3), [state + _lens + 3*4]
1576df3ef80SGreg Tucker	cmp     lens3, idx
1586df3ef80SGreg Tucker	cmovb   idx, lens3
	; Split the minimum word: its low 4 bits become idx (used as the lane
	; index from here on), the remaining bits stay in len2. The flags from
	; "and len2, ~0xF" drive the jz: when that part is zero no hashing call
	; is made.
1596df3ef80SGreg Tucker	mov     len2, idx
1606df3ef80SGreg Tucker	and     idx, 0xF
1616df3ef80SGreg Tucker	and     len2, ~0xF
1626df3ef80SGreg Tucker	jz      len_is_0
1636df3ef80SGreg Tucker
164246221b0SXiaodong Liu	; compare with sha-sb threshold, if num_lanes_inuse <= threshold, using sb func
165246221b0SXiaodong Liu	cmp	dword [state + _num_lanes_inuse], SHA256_SB_THRESHOLD_SSE
166246221b0SXiaodong Liu	ja	mb_processing	; unsigned "above": more lanes in use than the threshold
167246221b0SXiaodong Liu
168246221b0SXiaodong Liu	; lensN-len2=idx
	; Only the chosen lane's length word is rewritten on this path: storing
	; idx alone leaves just the lane index in it (all higher bits zero).
	; The other lanes' words in memory are left as they were.
169246221b0SXiaodong Liu	shr     len2, 4
170246221b0SXiaodong Liu	mov     [state + _lens + idx*4], DWORD(idx)
171246221b0SXiaodong Liu	mov	r10, idx
172246221b0SXiaodong Liu	or	r10, 0x1000	; sse has 4 lanes *4, r10b is idx, r10b2 is 16
173246221b0SXiaodong Liu	; "state" and "args" are the same address, arg1
174246221b0SXiaodong Liu	; len is arg2, idx and nlane in r10
175246221b0SXiaodong Liu	call    sha256_opt_x1
176246221b0SXiaodong Liu	; state and idx are intact
177246221b0SXiaodong Liu	jmp	len_is_0
178246221b0SXiaodong Liu
179246221b0SXiaodong Liumb_processing:
180246221b0SXiaodong Liu
	; Subtract the common amount from every lane's length word. len2 has its
	; low 4 bits cleared, so the low 4 bits of each word are unchanged by
	; the subtraction. The shifted len2 is then passed in arg2.
1816df3ef80SGreg Tucker	sub     lens0, len2
1826df3ef80SGreg Tucker	sub     lens1, len2
1836df3ef80SGreg Tucker	sub     lens2, len2
1846df3ef80SGreg Tucker	sub     lens3, len2
1856df3ef80SGreg Tucker	shr     len2, 4
1866df3ef80SGreg Tucker	mov     [state + _lens + 0*4], DWORD(lens0)
1876df3ef80SGreg Tucker	mov     [state + _lens + 1*4], DWORD(lens1)
1886df3ef80SGreg Tucker	mov     [state + _lens + 2*4], DWORD(lens2)
1896df3ef80SGreg Tucker	mov     [state + _lens + 3*4], DWORD(lens3)
1906df3ef80SGreg Tucker
1916df3ef80SGreg Tucker	; "state" and "args" are the same address, arg1
1926df3ef80SGreg Tucker	; len is arg2
1936df3ef80SGreg Tucker	call     sha256_mb_x4_sse
1946df3ef80SGreg Tucker	; state and idx are intact
1956df3ef80SGreg Tucker
1966df3ef80SGreg Tuckerlen_is_0:
1976df3ef80SGreg Tucker	; process completed job "idx"
	; lane_data = state + _ldata + idx * _LANE_DATA_size
1986df3ef80SGreg Tucker	imul    lane_data, idx, _LANE_DATA_size
1996df3ef80SGreg Tucker	lea     lane_data, [state + _ldata + lane_data]
2006df3ef80SGreg Tucker
	; Take the job out of the lane (rax now holds the return value) and
	; mark it completed.
2016df3ef80SGreg Tucker	mov     job_rax, [lane_data + _job_in_lane]
2026df3ef80SGreg Tucker	mov     qword [lane_data + _job_in_lane], 0
2038cb7fe78SPablo de Lara	mov     dword [job_rax + _status], ISAL_STS_COMPLETED
	; unused_lanes aliases rbx, so lane_data is overwritten here. The freed
	; lane index is pushed as the new lowest 4 bits of unused_lanes.
2046df3ef80SGreg Tucker	mov     unused_lanes, [state + _unused_lanes]
2056df3ef80SGreg Tucker	shl     unused_lanes, 4
2066df3ef80SGreg Tucker	or      unused_lanes, idx
2076df3ef80SGreg Tucker	mov     [state + _unused_lanes], unused_lanes
2086df3ef80SGreg Tucker
209246221b0SXiaodong Liu	sub     dword [state + _num_lanes_inuse], 1
210246221b0SXiaodong Liu
	; Gather this lane's eight digest dwords. As addressed here, dword k of
	; lane idx sits at _args_digest + 16*k + 4*idx. Dwords 0-3 go to xmm0,
	; dwords 4-7 to xmm1.
2116df3ef80SGreg Tucker	movd    xmm0, [state + _args_digest + 4*idx + 0*16]
2126df3ef80SGreg Tucker	pinsrd  xmm0, [state + _args_digest + 4*idx + 1*16], 1
2136df3ef80SGreg Tucker	pinsrd  xmm0, [state + _args_digest + 4*idx + 2*16], 2
2146df3ef80SGreg Tucker	pinsrd  xmm0, [state + _args_digest + 4*idx + 3*16], 3
2156df3ef80SGreg Tucker	movd    xmm1, [state + _args_digest + 4*idx + 4*16]
2166df3ef80SGreg Tucker	pinsrd  xmm1, [state + _args_digest + 4*idx + 5*16], 1
2176df3ef80SGreg Tucker	pinsrd  xmm1, [state + _args_digest + 4*idx + 6*16], 2
2186df3ef80SGreg Tucker	pinsrd  xmm1, [state + _args_digest + 4*idx + 7*16], 3
2196df3ef80SGreg Tucker
	; movdqa is an aligned store: this assumes job + _result_digest is
	; 16-byte aligned - confirm against the job structure definition.
2206df3ef80SGreg Tucker	movdqa  [job_rax + _result_digest + 0*16], xmm0
2216df3ef80SGreg Tucker	movdqa  [job_rax + _result_digest + 1*16], xmm1
2226df3ef80SGreg Tucker
	; Shared epilogue: restores everything saved in the prologue and releases
	; the frame. rax (job pointer or NULL) is left untouched.
2236df3ef80SGreg Tuckerreturn:
2246df3ef80SGreg Tucker
2256df3ef80SGreg Tucker%ifidn __OUTPUT_FORMAT__, win64
2266df3ef80SGreg Tucker	movdqa  xmm6, [rsp + _XMM_SAVE + 16*0]
2276df3ef80SGreg Tucker	movdqa  xmm7, [rsp + _XMM_SAVE + 16*1]
2286df3ef80SGreg Tucker	movdqa  xmm8, [rsp + _XMM_SAVE + 16*2]
2296df3ef80SGreg Tucker	movdqa  xmm9, [rsp + _XMM_SAVE + 16*3]
2306df3ef80SGreg Tucker	movdqa  xmm10, [rsp + _XMM_SAVE + 16*4]
2316df3ef80SGreg Tucker	movdqa  xmm11, [rsp + _XMM_SAVE + 16*5]
2326df3ef80SGreg Tucker	movdqa  xmm12, [rsp + _XMM_SAVE + 16*6]
2336df3ef80SGreg Tucker	movdqa  xmm13, [rsp + _XMM_SAVE + 16*7]
2346df3ef80SGreg Tucker	movdqa  xmm14, [rsp + _XMM_SAVE + 16*8]
2356df3ef80SGreg Tucker	movdqa  xmm15, [rsp + _XMM_SAVE + 16*9]
2366df3ef80SGreg Tucker	mov     rsi, [rsp + _GPR_SAVE + 8*2]
2376df3ef80SGreg Tucker%endif
2386df3ef80SGreg Tucker	mov     rbx, [rsp + _GPR_SAVE + 8*0]
2396df3ef80SGreg Tucker	mov     r12, [rsp + _GPR_SAVE + 8*1]
2406df3ef80SGreg Tucker	add     rsp, STACK_SPACE
2416df3ef80SGreg Tucker
2426df3ef80SGreg Tucker	ret
2436df3ef80SGreg Tucker
	; Reached when num_lanes_inuse is 0: return NULL via the shared epilogue.
2446df3ef80SGreg Tuckerreturn_null:
2456df3ef80SGreg Tucker	xor     job_rax, job_rax
2466df3ef80SGreg Tucker	jmp     return
2476df3ef80SGreg Tucker
; Qword constants read as memory operands by the cmovne instructions that pick
; the initial lane index in _sha256_mb_mgr_flush_sse. The code in this file
; only reads them.
2486df3ef80SGreg Tuckersection .data align=16
2496df3ef80SGreg Tucker
2506df3ef80SGreg Tuckeralign 16
2516df3ef80SGreg Tuckerone:    dq  1
2526df3ef80SGreg Tuckertwo:    dq  2
2536df3ef80SGreg Tuckerthree:  dq  3
2546df3ef80SGreg Tucker
255