xref: /openbsd-src/lib/libcrypto/sha/asm/sha1-ppc.pl (revision a9dd6dd9c022ae208649d051ce3d85279e05f25b)
1f1535dc8Sdjm#!/usr/bin/env perl
2f1535dc8Sdjm
3f1535dc8Sdjm# ====================================================================
4f1535dc8Sdjm# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5f1535dc8Sdjm# project. The module is, however, dual licensed under OpenSSL and
6f1535dc8Sdjm# CRYPTOGAMS licenses depending on where you obtain it. For further
7f1535dc8Sdjm# details see http://www.openssl.org/~appro/cryptogams/.
8f1535dc8Sdjm# ====================================================================
9f1535dc8Sdjm
# I let the hardware handle unaligned input(*), except on page boundaries
# (see below for details). Otherwise this is a straightforward
# implementation with the X vector kept in the register bank. The module
# is big-endian [which is not a big deal, as there are no little-endian
# targets left around].
#
# (*) Does this mean that this module is inappropriate for PPC403? Does
#     anybody know if pre-POWER3 can sustain unaligned loads?
17f1535dc8Sdjm
18f1535dc8Sdjm# 			-m64	-m32
19f1535dc8Sdjm# ----------------------------------
20f1535dc8Sdjm# PPC970,gcc-4.0.0	+76%	+59%
21f1535dc8Sdjm# Power6,xlc-7		+68%	+33%
22f1535dc8Sdjm
# The first command-line argument names the target flavour. It is matched
# against /64/ first and /32/ second, and is later handed unchanged to
# ppc-xlate.pl. Anything matching neither pattern is rejected.
$flavour = shift;

my $is64 = ($flavour =~ /64/);
die "nonsense $flavour" unless ($is64 || $flavour =~ /32/);

# Word-size dependent parameters:
#   $SIZE_T  size in bytes of one stack slot / pointer-sized register
#   $LRSAVE  offset added to $FRAME to reach the slot where the link
#            register is stored (i.e. a slot above this routine's frame)
#   $UCMP    unsigned compare mnemonic for pointer-sized operands
#   $STU     "store with update" mnemonic used to allocate the frame
#   $POP     pointer-sized load mnemonic
#   $PUSH    pointer-sized store mnemonic
$SIZE_T = $is64 ? 8 : 4;
$LRSAVE = $is64 ? 2*$SIZE_T : $SIZE_T;
($UCMP,$STU,$POP,$PUSH) = $is64 ? ("cmpld","stdu","ld","std")
				: ("cmplw","stwu","lwz","stw");
40f1535dc8Sdjm
# Find ppc-xlate.pl, first next to this script and then in
# ../../perlasm/ relative to it. $dir keeps its trailing path separator;
# it is the empty string when $0 carries no directory part (the match
# result is tested explicitly so a stale $1 can never leak in).
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

# Everything printed to STDOUT from here on is piped through the
# translator, which receives the flavour and the output file name.
#
# The previous form, `open STDOUT,"...".shift || die ...`, attached the
# `|| die` to `shift` (|| binds tighter than the list operator), so it
# only died when the output argument was missing/false and never noticed
# a failed open. Both conditions are now checked: the first keeps the
# historical behaviour for a missing argument, the second uses the
# low-precedence `or` so that it tests open's return value.
my $output = shift || die "can't call $xlate: $!";
open STDOUT,"| $^X $xlate $flavour $output" or die "can't call $xlate: $!";
47f1535dc8Sdjm
# Stack frame geometry (all offsets relative to the new stack pointer):
#   [0, $LOCALS)                      six pointer-sized slots at the bottom
#                                     (presumably the ABI linkage area --
#                                     confirm against the target ABI)
#   [$LOCALS, $LOCALS+64)             64-byte scratch buffer used to re-align
#                                     an input block that straddles a page
#   $FRAME-18*$SIZE_T                 spill slot for the input pointer
#   [$FRAME-17*$SIZE_T, $FRAME)       save area for r15..r31
# 6 + 18 slots plus the 64-byte buffer give the 24*$SIZE_T+64 total.
$LOCALS = $SIZE_T*6;
$FRAME  = 64 + $SIZE_T*24;

# Fixed-role registers.
($K,$sp,$toc,$ctx,$inp,$num) = map("r$_", 0..5);
($t1,$t0) = ("r6","r15");	# scratch for the round function

# Working variables of the compression function; $T is the spare sixth
# register that lets the round bodies rotate roles instead of moving data.
($A,$B,$C,$D,$E,$T) = map("r$_", 7..12);
@V = ($A,$B,$C,$D,$E,$T);

# 16-word message schedule, kept entirely in r16..r31.
@X = map("r$_", 16..31);
70f1535dc8Sdjm
# Append the code for one round in the range 0..19 to $code.
#
# Arguments: round number followed by six register names (a,b,c,d,e and
# the spare register f). The emitted code computes
#     f = K + e + rotl(a,5) + X[i] + ((b & c) | (d & ~b))
#     b = rotl(b,30)
# and leaves rotl(a,5) in e as a by-product. The caller rotates its
# register list after every round, so f takes over the role of "a".
#
# Rounds 0..14 also fetch the next input word into X[i+1] (round 0 first
# fetches X[0]); rounds 15..19 instead update the schedule in place:
#     X[j] = rotl(X[j] ^ X[j+2] ^ X[j+8] ^ X[j+13], 1)   (indices mod 16)
# The schedule instructions are interleaved with the round arithmetic.
sub BODY_00_19 {
my ($i,$a,$b,$c,$d,$e,$f)=@_;
my $j=$i+1;
# Register names for X[i], X[j], X[j+2], X[j+8], X[j+13], indices mod 16.
my ($Xcur,$Xnxt,$Xp2,$Xp8,$Xp13)=map($X[$_%16], $i,$j,$j+2,$j+8,$j+13);

if ($i==0) {
$code.=<<___;
	lwz	$Xcur,`$i*4`($inp)
___
}

if ($i<15) {
$code.=<<___;
	lwz	$Xnxt,`$j*4`($inp)
	add	$f,$K,$e
	rotlwi	$e,$a,5
	add	$f,$f,$Xcur
	and	$t0,$c,$b
	add	$f,$f,$e
	andc	$t1,$d,$b
	rotlwi	$b,$b,30
	or	$t0,$t0,$t1
	add	$f,$f,$t0
___
} else {
$code.=<<___;
	add	$f,$K,$e
	rotlwi	$e,$a,5
	xor	$Xnxt,$Xnxt,$Xp2
	add	$f,$f,$Xcur
	and	$t0,$c,$b
	xor	$Xnxt,$Xnxt,$Xp8
	add	$f,$f,$e
	andc	$t1,$d,$b
	rotlwi	$b,$b,30
	or	$t0,$t0,$t1
	xor	$Xnxt,$Xnxt,$Xp13
	add	$f,$f,$t0
	rotlwi	$Xnxt,$Xnxt,1
___
}
}
105f1535dc8Sdjm
# Append the code for one "parity" round to $code; the driver uses this
# body for rounds 20..39 and again for rounds 60..79.
#
# Arguments: round number followed by six register names (a,b,c,d,e and
# the spare register f). The emitted code computes
#     f = K + e + rotl(a,5) + X[i] + (b ^ c ^ d)
#     b = rotl(b,30)
# and leaves rotl(a,5) in e.
#
# Every round before 79 also advances the message schedule:
#     X[j] = rotl(X[j] ^ X[j+2] ^ X[j+8] ^ X[j+13], 1)   (indices mod 16)
# Round 79 has no further schedule word to produce; its free slots are
# used to load the five state words from the context into r16..r20.
sub BODY_20_39 {
my ($i,$a,$b,$c,$d,$e,$f)=@_;
my $j=$i+1;
# Register names for X[i], X[j], X[j+2], X[j+8], X[j+13], indices mod 16.
my ($Xcur,$Xnxt,$Xp2,$Xp8,$Xp13)=map($X[$_%16], $i,$j,$j+2,$j+8,$j+13);

if ($i<79) {
$code.=<<___;
	add	$f,$K,$e
	rotlwi	$e,$a,5
	xor	$Xnxt,$Xnxt,$Xp2
	add	$f,$f,$Xcur
	xor	$t0,$b,$c
	xor	$Xnxt,$Xnxt,$Xp8
	add	$f,$f,$e
	rotlwi	$b,$b,30
	xor	$t0,$t0,$d
	xor	$Xnxt,$Xnxt,$Xp13
	add	$f,$f,$t0
	rotlwi	$Xnxt,$Xnxt,1
___
} elsif ($i==79) {
$code.=<<___;
	add	$f,$K,$e
	rotlwi	$e,$a,5
	lwz	r16,0($ctx)
	add	$f,$f,$Xcur
	xor	$t0,$b,$c
	lwz	r17,4($ctx)
	add	$f,$f,$e
	rotlwi	$b,$b,30
	lwz	r18,8($ctx)
	xor	$t0,$t0,$d
	lwz	r19,12($ctx)
	add	$f,$f,$t0
	lwz	r20,16($ctx)
___
}
}
139f1535dc8Sdjm
# Append the code for one round in the range 40..59 to $code.
#
# Arguments: round number followed by six register names (a,b,c,d,e and
# the spare register f). The emitted code computes
#     f = K + e + rotl(a,5) + X[i] + ((b & c) | ((b | c) & d))
#     b = rotl(b,30)
# and leaves rotl(a,5) in e. Interleaved with that, the message schedule
# is advanced:
#     X[j] = rotl(X[j] ^ X[j+2] ^ X[j+8] ^ X[j+13], 1)   (indices mod 16)
sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e,$f)=@_;
my $j=$i+1;
# Register names for X[i], X[j], X[j+2], X[j+8], X[j+13], indices mod 16.
my ($Xcur,$Xnxt,$Xp2,$Xp8,$Xp13)=map($X[$_%16], $i,$j,$j+2,$j+8,$j+13);

$code.=<<___;
	add	$f,$K,$e
	rotlwi	$e,$a,5
	xor	$Xnxt,$Xnxt,$Xp2
	add	$f,$f,$Xcur
	and	$t0,$b,$c
	xor	$Xnxt,$Xnxt,$Xp8
	add	$f,$f,$e
	or	$t1,$b,$c
	rotlwi	$b,$b,30
	xor	$Xnxt,$Xnxt,$Xp13
	and	$t1,$t1,$d
	or	$t0,$t0,$t1
	rotlwi	$Xnxt,$Xnxt,1
	add	$f,$f,$t0
___
}
160f1535dc8Sdjm
# Public entry point .sha1_block_data_order.
#
# On entry $ctx points at five 32-bit state words (offsets 0..16), $inp at
# the input data and $num holds the number of 64-byte blocks to process.
#
# Outline of the emitted code:
#   * allocate a $FRAME-byte frame, save r15..r31 in its top 17 slots and
#     the link register at $FRAME+$LRSAVE;
#   * load the state into A..E;
#   * if $inp is 4-byte aligned, hash all blocks in one call of
#     Lsha1_block_private (block count passed in CTR);
#   * otherwise hash only the blocks that lie entirely before the next
#     4096-byte boundary, copy the block that straddles the boundary
#     byte-by-byte into the scratch buffer at sp+$LOCALS, hash that copy,
#     and repeat until $num is exhausted;
#   * restore the saved registers, release the frame and return.
#
# The register save/restore lines are generated below rather than spelled
# out; rN lives at offset $FRAME-$SIZE_T*(32-N).
my @nv_gprs  = (15..31);
my $gpr_slot = sub { $FRAME-$SIZE_T*(32-$_[0]) };

$code=<<___;
.machine	"any"
.text

.globl	.sha1_block_data_order
.align	4
.sha1_block_data_order:
	$STU	$sp,-$FRAME($sp)
	mflr	r0
___
foreach my $r (@nv_gprs) {
	$code.="\t$PUSH\tr$r,".$gpr_slot->($r)."($sp)\n";
}
$code.=<<___;
	$PUSH	r0,`$FRAME+$LRSAVE`($sp)
	lwz	$A,0($ctx)
	lwz	$B,4($ctx)
	lwz	$C,8($ctx)
	lwz	$D,12($ctx)
	lwz	$E,16($ctx)
	andi.	r0,$inp,3
	bne	Lunaligned
Laligned:
	mtctr	$num
	bl	Lsha1_block_private
	b	Ldone

; PowerPC specification allows an implementation to be ill-behaved
; upon unaligned access which crosses page boundary. "Better safe
; than sorry" principle makes me treat it specially. But I don't
; look for particular offending word, but rather for 64-byte input
; block which crosses the boundary. Once found that block is aligned
; and hashed separately...
.align	4
Lunaligned:
	subfic	$t1,$inp,4096
	andi.	$t1,$t1,4095	; distance to closest page boundary
	srwi.	$t1,$t1,6	; t1/=64
	beq	Lcross_page
	$UCMP	$num,$t1
	ble-	Laligned	; didn't cross the page boundary
	mtctr	$t1
	subfc	$num,$t1,$num
	bl	Lsha1_block_private
Lcross_page:
	li	$t1,16
	mtctr	$t1
	addi	r20,$sp,$LOCALS	; spot within the frame
Lmemcpy:
	lbz	r16,0($inp)
	lbz	r17,1($inp)
	lbz	r18,2($inp)
	lbz	r19,3($inp)
	addi	$inp,$inp,4
	stb	r16,0(r20)
	stb	r17,1(r20)
	stb	r18,2(r20)
	stb	r19,3(r20)
	addi	r20,r20,4
	bdnz	Lmemcpy

	$PUSH	$inp,`$FRAME-$SIZE_T*18`($sp)
	li	$t1,1
	addi	$inp,$sp,$LOCALS
	mtctr	$t1
	bl	Lsha1_block_private
	$POP	$inp,`$FRAME-$SIZE_T*18`($sp)
	addic.	$num,$num,-1
	bne-	Lunaligned

Ldone:
	$POP	r0,`$FRAME+$LRSAVE`($sp)
___
foreach my $r (@nv_gprs) {
	$code.="\t$POP\tr$r,".$gpr_slot->($r)."($sp)\n";
}
$code.=<<___;
	mtlr	r0
	addi	$sp,$sp,$FRAME
	blr
___
266f1535dc8Sdjm
# Lsha1_block_private is an internal routine with a tailored calling
# interface: on entry the SHA-1 state is already held in A..E, $inp points
# at the data and the count register holds the number of 64-byte chunks
# to digest. For every chunk it runs the 80 unrolled rounds, adds the
# result to the state read from $ctx, writes the new state back to $ctx,
# copies it into A..E for the next chunk and advances $inp by 64 bytes.
$code.=<<___;
.align	4
Lsha1_block_private:
___

# One entry per group of 20 rounds:
#   [ upper half of K, lower half of K, first round of the NEXT group,
#     generator for a single round ]
my @stages=(
	[ "0x5a82","0x7999",20,\&BODY_00_19 ],	# K_00_19
	[ "0x6ed9","0xeba1",40,\&BODY_20_39 ],	# K_20_39
	[ "0x8f1b","0xbcdc",60,\&BODY_40_59 ],	# K_40_59
	[ "0xca62","0xc1d6",80,\&BODY_20_39 ],	# K_60_79
);

$i=0;
foreach my $stage (@stages) {
	my ($k_hi,$k_lo,$stop,$emit_round)=@$stage;

	# load the round constant for this group
	$code.="\tlis\t$K,$k_hi\n\tori\t$K,$K,$k_lo\n";

	while ($i<$stop) {
		$emit_round->($i,@V);
		# Rotate the register roles: the register that just received
		# the round result becomes "a" for the next round.
		unshift(@V,pop(@V));
		$i++;
	}
}

# @V has six entries and was rotated 80 times, i.e. by two positions net,
# so the five working values now sit in E,T,A,B,C. r16..r20 hold the
# state words that round 79 loaded from $ctx.
$code.=<<___;
	add	r16,r16,$E
	add	r17,r17,$T
	add	r18,r18,$A
	add	r19,r19,$B
	add	r20,r20,$C
	stw	r16,0($ctx)
	mr	$A,r16
	stw	r17,4($ctx)
	mr	$B,r17
	stw	r18,8($ctx)
	mr	$C,r18
	stw	r19,12($ctx)
	mr	$D,r19
	stw	r20,16($ctx)
	mr	$E,r20
	addi	$inp,$inp,`16*4`
	bdnz-	Lsha1_block_private
	blr
___
315f1535dc8Sdjm
# Replace every `...` span in the generated text with the value of the
# Perl expression it contains (used for the computed frame offsets and
# displacements), then send the result through the translator pipe.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;

# STDOUT is a pipe to ppc-xlate.pl. close() flushes the buffered output
# and waits for the child, returning false on a write error or when the
# child exits with a non-zero status (in which case $! is 0 and the
# status is in $?). Previously the result was ignored, so a failing
# translator went unnoticed and could leave a truncated output file.
close STDOUT or die "error closing STDOUT: ".
		    ($! != 0 ? "$!" : "translator exited with status $?");
319