xref: /openbsd-src/lib/libcrypto/sha/asm/sha512-parisc.pl (revision f2da64fbbbf1b03f09f390ab01267c93dfd77c4c)
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# SHA256/512 block procedure for PA-RISC.
11
12# June 2009.
13#
14# SHA256 performance is >75% better than gcc 3.2 generated code on
15# PA-7100LC. Compared to code generated by vendor compiler this
16# implementation is almost 70% faster in 64-bit build, but delivers
17# virtually same performance in 32-bit build on PA-8600.
18#
19# SHA512 performance is >2.9x better than gcc 3.2 generated code on
20# PA-7100LC, a PA-RISC 1.1 processor. The implementation then detects
21# whether the code is executing on a PA-RISC 2.0 processor and, if so,
22# switches to the 64-bit code path, delivering adequate performance even
23# in a "blended" 32-bit build. The 64-bit code is, however, not any
24# faster than code generated by the vendor compiler on PA-8600...
25#
26# Special thanks to polarhome.com for providing HP-UX account.
27
# Command line: <flavour> <output>
#   flavour - ABI selector; a value containing "64" selects the 64-bit ABI
#   output  - name of the file to generate; a name containing "512"
#             selects SHA-512, anything else selects SHA-256
$flavour = shift;
$output = shift;
# Redirect STDOUT to the output file. The three-argument form keeps the
# file name from being parsed as part of the mode, and a failure to open
# the file is now fatal instead of being ignored. When no output name is
# given the inherited STDOUT is left alone.
if (defined($output) && $output ne "") {
	open(STDOUT,">",$output) || die "can't open $output: $!";
}
31
# ABI-dependent parameters, chosen by $flavour: assembler level, size of a
# pointer/saved register, frame marker size, offset of the saved return
# pointer, and the mnemonics used to push/pop registers in the prologue
# and epilogue. A flavour containing "64" selects the 64-bit set.
($LEVEL, $SIZE_T, $FRAME_MARKER, $SAVED_RP, $PUSH, $PUSHMA, $POP, $POPMB) =
    ($flavour =~ /64/)
	? ("2.0W", 8, 80, 16, "std", "std,ma", "ldd", "ldd,mb")
	: ("1.0",  4, 48, 20, "stw", "stwm",   "ldw", "ldwm");
51
# Hash-dependent parameters. An output name containing "512" generates
# sha512_block_data_order, anything else sha256_block_data_order.
#   $SZ          - size of one hash word in bytes
#   @Sigma0/1    - rotation counts of the round functions
#   @sigma0/1    - rotation/shift counts of the message schedule
#   $rounds      - number of rounds (and of table entries)
#   $LAST10BITS  - low 10 bits of the last table entry, used as end marker
#   $LD/$LDM/$ST - load, load-and-modify and store mnemonics for one word
my $is_sha512 = ($output =~ /512/);

$func       = $is_sha512 ? "sha512_block_data_order" : "sha256_block_data_order";
$SZ         = $is_sha512 ? 8 : 4;
@Sigma0     = $is_sha512 ? (28,34,39) : ( 2,13,22);
@Sigma1     = $is_sha512 ? (14,18,41) : ( 6,11,25);
@sigma0     = $is_sha512 ? ( 1, 8, 7) : ( 7,18, 3);
@sigma1     = $is_sha512 ? (19,61, 6) : (17,19,10);
$rounds     = $is_sha512 ? 80 : 64;
$LAST10BITS = $is_sha512 ? 0x017 : 0x0f2;
$LD         = $is_sha512 ? "ldd" : "ldw";
$LDM        = $is_sha512 ? "ldd,ma" : "ldwm";
$ST         = $is_sha512 ? "std" : "stw";
77
# Stack frame layout. $FRAME starts as the space for the 16 saved
# registers plus the frame marker and is then extended by the size of the
# local-variable area; $XOFF ends up as the offset used (negated, relative
# to %sp) to address those locals.
78$FRAME=16*$SIZE_T+$FRAME_MARKER;# 16 saved regs + frame marker
79				#                 [+ argument transfer]
80$XOFF=16*$SZ+32;		# local variables
81$FRAME+=$XOFF;
82$XOFF+=$FRAME_MARKER;		# distance between %sp and local variables
83
# Argument registers. Each one is reused below as a scratch register
# ($a0, $a1, $t0), which is why the generated prologue stores the three
# arguments in the frame and the loop tail reloads them.
84$ctx="%r26";	# zapped by $a0
85$inp="%r25";	# zapped by $a1
86$num="%r24";	# zapped by $t0
87
# Scratch registers and the pointer into the constant table.
88$a0 ="%r26";
89$a1 ="%r25";
90$t0 ="%r24";
91$t1 ="%r29";
92$Tbl="%r31";
93
# Registers holding the eight working variables a..h.
94@V=($A,$B,$C,$D,$E,$F,$G,$H)=("%r17","%r18","%r19","%r20","%r21","%r22","%r23","%r28");
95
# Registers holding the 16-word message block. The 17th entry ($inp) only
# receives the extra word that is loaded when the input is not aligned and
# has to be shifted into place with _align.
96@X=("%r1", "%r2", "%r3", "%r4", "%r5", "%r6", "%r7", "%r8",
97    "%r9", "%r10","%r11","%r12","%r13","%r14","%r15","%r16",$inp);
98
# Append the code for one round (rounds 0..15) of the register-based code
# path to $code.
#   $i       - round index; selects the message word @X[$i%16]
#   $a..$h   - registers currently playing the roles of a..h
# The emitted code adds $t1 to h first, i.e. it expects the round constant
# to have been loaded into $t1 beforehand; for $i<15 it loads the next
# constant from $Tbl into $t1 itself. _ror is a made-up instruction that
# is translated by the post-processing loop at the end of this script, as
# are the `...` expressions.
99sub ROUND_00_15 {
100my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
101$code.=<<___;
102	_ror	$e,$Sigma1[0],$a0
103	and	$f,$e,$t0
104	_ror	$e,$Sigma1[1],$a1
105	addl	$t1,$h,$h
106	andcm	$g,$e,$t1
107	xor	$a1,$a0,$a0
108	_ror	$a1,`$Sigma1[2]-$Sigma1[1]`,$a1
109	or	$t0,$t1,$t1		; Ch(e,f,g)
110	addl	@X[$i%16],$h,$h
111	xor	$a0,$a1,$a1		; Sigma1(e)
112	addl	$t1,$h,$h
113	_ror	$a,$Sigma0[0],$a0
114	addl	$a1,$h,$h
115
116	_ror	$a,$Sigma0[1],$a1
117	and	$a,$b,$t0
118	and	$a,$c,$t1
119	xor	$a1,$a0,$a0
120	_ror	$a1,`$Sigma0[2]-$Sigma0[1]`,$a1
121	xor	$t1,$t0,$t0
122	and	$b,$c,$t1
123	xor	$a0,$a1,$a1		; Sigma0(a)
124	addl	$h,$d,$d
125	xor	$t1,$t0,$t0		; Maj(a,b,c)
126	`"$LDM	$SZ($Tbl),$t1" if ($i<15)`
127	addl	$a1,$h,$h
128	addl	$t0,$h,$h
129
130___
131}
132
# Append the code for one round with index >= 16 to $code.
#   $i       - round index (16..31 in the unrolled loop body); it is
#              reduced by 16 to index the message registers
#   $a..$h   - registers currently playing the roles of a..h
# The emitted code first updates the message word in place,
#   X[i] += sigma0(X[i+1]) + X[i+9] + sigma1(X[i+14])   (indices mod 16),
# and loads the next round constant into $t1. For the last round of the
# unrolled body ($i==15 after the adjustment) it also emits a comparison
# of the low 10 bits of that constant with $LAST10BITS and a conditional
# increment of $Tbl that signals the end of the table. The round proper
# is then emitted by ROUND_00_15, called with $i+16 so that it does not
# load a constant of its own.
133sub ROUND_16_xx {
134my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
135$i-=16;
136$code.=<<___;
137	_ror	@X[($i+1)%16],$sigma0[0],$a0
138	_ror	@X[($i+1)%16],$sigma0[1],$a1
139	addl	@X[($i+9)%16],@X[$i],@X[$i]
140	_ror	@X[($i+14)%16],$sigma1[0],$t0
141	_ror	@X[($i+14)%16],$sigma1[1],$t1
142	xor	$a1,$a0,$a0
143	_shr	@X[($i+1)%16],$sigma0[2],$a1
144	xor	$t1,$t0,$t0
145	_shr	@X[($i+14)%16],$sigma1[2],$t1
146	xor	$a1,$a0,$a0		; sigma0(X[(i+1)&0x0f])
147	xor	$t1,$t0,$t0		; sigma1(X[(i+14)&0x0f])
148	$LDM	$SZ($Tbl),$t1
149	addl	$a0,@X[$i],@X[$i]
150	addl	$t0,@X[$i],@X[$i]
151___
152$code.=<<___ if ($i==15);
153	extru	$t1,31,10,$a1
154	comiclr,<> $LAST10BITS,$a1,%r0
155	ldo	1($Tbl),$Tbl		; signal end of $Tbl
156___
157&ROUND_00_15($i+16,$a,$b,$c,$d,$e,$f,$g,$h);
158}
159
# Start the output: assembler level, section selection and the label of
# the 64-byte aligned round constant table.
160$code=<<___;
161	.LEVEL	$LEVEL
162#if 0
163	.SPACE	\$TEXT\$
164	.SUBSPA	\$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
165#else
166	.text
167#endif
168
169	.ALIGN	64
170L\$table
171___
# Round constants for the 8-byte word size (SHA-512); every constant is
# written as two 32-bit words, upper half first.
172$code.=<<___ if ($SZ==8);
173	.WORD	0x428a2f98,0xd728ae22,0x71374491,0x23ef65cd
174	.WORD	0xb5c0fbcf,0xec4d3b2f,0xe9b5dba5,0x8189dbbc
175	.WORD	0x3956c25b,0xf348b538,0x59f111f1,0xb605d019
176	.WORD	0x923f82a4,0xaf194f9b,0xab1c5ed5,0xda6d8118
177	.WORD	0xd807aa98,0xa3030242,0x12835b01,0x45706fbe
178	.WORD	0x243185be,0x4ee4b28c,0x550c7dc3,0xd5ffb4e2
179	.WORD	0x72be5d74,0xf27b896f,0x80deb1fe,0x3b1696b1
180	.WORD	0x9bdc06a7,0x25c71235,0xc19bf174,0xcf692694
181	.WORD	0xe49b69c1,0x9ef14ad2,0xefbe4786,0x384f25e3
182	.WORD	0x0fc19dc6,0x8b8cd5b5,0x240ca1cc,0x77ac9c65
183	.WORD	0x2de92c6f,0x592b0275,0x4a7484aa,0x6ea6e483
184	.WORD	0x5cb0a9dc,0xbd41fbd4,0x76f988da,0x831153b5
185	.WORD	0x983e5152,0xee66dfab,0xa831c66d,0x2db43210
186	.WORD	0xb00327c8,0x98fb213f,0xbf597fc7,0xbeef0ee4
187	.WORD	0xc6e00bf3,0x3da88fc2,0xd5a79147,0x930aa725
188	.WORD	0x06ca6351,0xe003826f,0x14292967,0x0a0e6e70
189	.WORD	0x27b70a85,0x46d22ffc,0x2e1b2138,0x5c26c926
190	.WORD	0x4d2c6dfc,0x5ac42aed,0x53380d13,0x9d95b3df
191	.WORD	0x650a7354,0x8baf63de,0x766a0abb,0x3c77b2a8
192	.WORD	0x81c2c92e,0x47edaee6,0x92722c85,0x1482353b
193	.WORD	0xa2bfe8a1,0x4cf10364,0xa81a664b,0xbc423001
194	.WORD	0xc24b8b70,0xd0f89791,0xc76c51a3,0x0654be30
195	.WORD	0xd192e819,0xd6ef5218,0xd6990624,0x5565a910
196	.WORD	0xf40e3585,0x5771202a,0x106aa070,0x32bbd1b8
197	.WORD	0x19a4c116,0xb8d2d0c8,0x1e376c08,0x5141ab53
198	.WORD	0x2748774c,0xdf8eeb99,0x34b0bcb5,0xe19b48a8
199	.WORD	0x391c0cb3,0xc5c95a63,0x4ed8aa4a,0xe3418acb
200	.WORD	0x5b9cca4f,0x7763e373,0x682e6ff3,0xd6b2b8a3
201	.WORD	0x748f82ee,0x5defb2fc,0x78a5636f,0x43172f60
202	.WORD	0x84c87814,0xa1f0ab72,0x8cc70208,0x1a6439ec
203	.WORD	0x90befffa,0x23631e28,0xa4506ceb,0xde82bde9
204	.WORD	0xbef9a3f7,0xb2c67915,0xc67178f2,0xe372532b
205	.WORD	0xca273ece,0xea26619c,0xd186b8c7,0x21c0c207
206	.WORD	0xeada7dd6,0xcde0eb1e,0xf57d4f7f,0xee6ed178
207	.WORD	0x06f067aa,0x72176fba,0x0a637dc5,0xa2c898a6
208	.WORD	0x113f9804,0xbef90dae,0x1b710b35,0x131c471b
209	.WORD	0x28db77f5,0x23047d84,0x32caab7b,0x40c72493
210	.WORD	0x3c9ebe0a,0x15c9bebc,0x431d67c4,0x9c100d4c
211	.WORD	0x4cc5d4be,0xcb3e42b6,0x597f299c,0xfc657e2a
212	.WORD	0x5fcb6fab,0x3ad6faec,0x6c44198c,0x4a475817
213___
# Round constants for the 4-byte word size (SHA-256).
214$code.=<<___ if ($SZ==4);
215	.WORD	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
216	.WORD	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
217	.WORD	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
218	.WORD	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
219	.WORD	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
220	.WORD	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
221	.WORD	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
222	.WORD	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
223	.WORD	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
224	.WORD	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
225	.WORD	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
226	.WORD	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
227	.WORD	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
228	.WORD	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
229	.WORD	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
230	.WORD	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
231___
# Function entry. The emitted prologue saves %r2 and %r3..%r18, turns the
# block count in $num into a pointer to the end of the input, stores the
# three arguments in the frame (the argument registers are reused as
# scratch) and computes the address of L$table relative to L$pic.
232$code.=<<___;
233
234	.EXPORT	$func,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR
235	.ALIGN	64
236$func
237	.PROC
238	.CALLINFO	FRAME=`$FRAME-16*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=18
239	.ENTRY
240	$PUSH	%r2,-$SAVED_RP(%sp)	; standard prologue
241	$PUSHMA	%r3,$FRAME(%sp)
242	$PUSH	%r4,`-$FRAME+1*$SIZE_T`(%sp)
243	$PUSH	%r5,`-$FRAME+2*$SIZE_T`(%sp)
244	$PUSH	%r6,`-$FRAME+3*$SIZE_T`(%sp)
245	$PUSH	%r7,`-$FRAME+4*$SIZE_T`(%sp)
246	$PUSH	%r8,`-$FRAME+5*$SIZE_T`(%sp)
247	$PUSH	%r9,`-$FRAME+6*$SIZE_T`(%sp)
248	$PUSH	%r10,`-$FRAME+7*$SIZE_T`(%sp)
249	$PUSH	%r11,`-$FRAME+8*$SIZE_T`(%sp)
250	$PUSH	%r12,`-$FRAME+9*$SIZE_T`(%sp)
251	$PUSH	%r13,`-$FRAME+10*$SIZE_T`(%sp)
252	$PUSH	%r14,`-$FRAME+11*$SIZE_T`(%sp)
253	$PUSH	%r15,`-$FRAME+12*$SIZE_T`(%sp)
254	$PUSH	%r16,`-$FRAME+13*$SIZE_T`(%sp)
255	$PUSH	%r17,`-$FRAME+14*$SIZE_T`(%sp)
256	$PUSH	%r18,`-$FRAME+15*$SIZE_T`(%sp)
257
258	_shl	$num,`log(16*$SZ)/log(2)`,$num
259	addl	$inp,$num,$num		; $num to point at the end of $inp
260
261	$PUSH	$num,`-$FRAME_MARKER-4*$SIZE_T`(%sp)	; save arguments
262	$PUSH	$inp,`-$FRAME_MARKER-3*$SIZE_T`(%sp)
263	$PUSH	$ctx,`-$FRAME_MARKER-2*$SIZE_T`(%sp)
264
265	blr	%r0,$Tbl
266	ldi	3,$t1
267L\$pic
268	andcm	$Tbl,$t1,$Tbl		; wipe privilege level
269	ldo	L\$table-L\$pic($Tbl),$Tbl
270___
# SHA-512 in a 32-bit build only: open a preprocessor conditional that is
# closed by the "#endif" emitted after the L$parisc1 label, so that with
# __OpenBSD__ defined everything from here up to and including that label
# is dropped from the output.
# NOTE(review): presumably this makes OpenBSD builds always use the
# 32-bit code that follows L$parisc1 - confirm against the build.
271$code.=<<___ if ($SZ==8 && $SIZE_T==4);
272#ifndef __OpenBSD__
273___
# Run-time processor check for the same configuration; the branch to
# L$parisc1 selects the 32-bit-only SHA-512 code.
274$code.=<<___ if ($SZ==8 && $SIZE_T==4);
275	ldi	31,$t1
276	mtctl	$t1,%cr11
277	extrd,u,*= $t1,%sar,1,$t1	; executes on PA-RISC 1.0
278	b	L\$parisc1
279	nop
280___
# Main code path, working on full-width registers. Load a..h from the
# context and set %sar (%cr11) from the low bits of $inp; the value is
# used by the _align sequence below when the input is not aligned.
# At L$oop the first round constant is loaded into $t1 and $t0 becomes the
# aligned input pointer.
281$code.=<<___;
282	$LD	`0*$SZ`($ctx),$A	; load context
283	$LD	`1*$SZ`($ctx),$B
284	$LD	`2*$SZ`($ctx),$C
285	$LD	`3*$SZ`($ctx),$D
286	$LD	`4*$SZ`($ctx),$E
287	$LD	`5*$SZ`($ctx),$F
288	$LD	`6*$SZ`($ctx),$G
289	$LD	`7*$SZ`($ctx),$H
290
291	extru	$inp,31,`log($SZ)/log(2)`,$t0
292	sh3addl	$t0,%r0,$t0
293	subi	`8*$SZ`,$t0,$t0
294	mtctl	$t0,%cr11		; load %sar with align factor
295
296L\$oop
297	ldi	`$SZ-1`,$t0
298	$LDM	$SZ($Tbl),$t1
299	andcm	$inp,$t0,$t0		; align $inp
300___
301	for ($i=0;$i<15;$i++) {		# load input block
302	$code.="\t$LD	`$SZ*$i`($t0),@X[$i]\n";		}
# The 16th word is loaded in the delay slot of the branch; the 17th word
# and the _align sequence are only executed for unaligned input.
303$code.=<<___;
304	cmpb,*=	$inp,$t0,L\$aligned
305	$LD	`$SZ*15`($t0),@X[15]
306	$LD	`$SZ*16`($t0),@X[16]
307___
308	for ($i=0;$i<16;$i++) {		# align data
309	$code.="\t_align	@X[$i],@X[$i+1],@X[$i]\n";	}
310$code.=<<___;
311L\$aligned
312	nop	; otherwise /usr/ccs/bin/as is confused by below .WORD
313___
314
# Rounds 0..15. @V is rotated by one position after every round, so the
# registers change roles instead of the values being moved.
315for($i=0;$i<16;$i++)	{ &ROUND_00_15($i,@V); unshift(@V,pop(@V)); }
316$code.=<<___;
317L\$rounds
318	nop	; otherwise /usr/ccs/bin/as is confused by below .WORD
319___
# Loop body of 16 rounds with message schedule update; the emitted code
# repeats it until the end of the constant table has been signalled.
320for(;$i<32;$i++)	{ &ROUND_16_xx($i,@V); unshift(@V,pop(@V)); }
# Loop tail: reload the saved arguments, rewind $Tbl, add the previous
# context to a..h and store it back, advance $inp and continue at L$oop
# while it differs from the end pointer in $num.
321$code.=<<___;
322	bb,>=	$Tbl,31,L\$rounds	; end of $Tbl signalled?
323	nop
324
325	$POP	`-$FRAME_MARKER-2*$SIZE_T`(%sp),$ctx	; restore arguments
326	$POP	`-$FRAME_MARKER-3*$SIZE_T`(%sp),$inp
327	$POP	`-$FRAME_MARKER-4*$SIZE_T`(%sp),$num
328	ldo	`-$rounds*$SZ-1`($Tbl),$Tbl		; rewind $Tbl
329
330	$LD	`0*$SZ`($ctx),@X[0]	; load context
331	$LD	`1*$SZ`($ctx),@X[1]
332	$LD	`2*$SZ`($ctx),@X[2]
333	$LD	`3*$SZ`($ctx),@X[3]
334	$LD	`4*$SZ`($ctx),@X[4]
335	$LD	`5*$SZ`($ctx),@X[5]
336	addl	@X[0],$A,$A
337	$LD	`6*$SZ`($ctx),@X[6]
338	addl	@X[1],$B,$B
339	$LD	`7*$SZ`($ctx),@X[7]
340	ldo	`16*$SZ`($inp),$inp	; advance $inp
341
342	$ST	$A,`0*$SZ`($ctx)	; save context
343	addl	@X[2],$C,$C
344	$ST	$B,`1*$SZ`($ctx)
345	addl	@X[3],$D,$D
346	$ST	$C,`2*$SZ`($ctx)
347	addl	@X[4],$E,$E
348	$ST	$D,`3*$SZ`($ctx)
349	addl	@X[5],$F,$F
350	$ST	$E,`4*$SZ`($ctx)
351	addl	@X[6],$G,$G
352	$ST	$F,`5*$SZ`($ctx)
353	addl	@X[7],$H,$H
354	$ST	$G,`6*$SZ`($ctx)
355	$ST	$H,`7*$SZ`($ctx)
356
357	cmpb,*<>,n $inp,$num,L\$oop
358	$PUSH	$inp,`-$FRAME_MARKER-3*$SIZE_T`(%sp)	; save $inp
359___
360if ($SZ==8 && $SIZE_T==4)	# SHA512 for 32-bit PA-RISC 1.0
361{{
# End of the full-width code path: jump over the 32-bit-only code to the
# common epilogue, then open the L$parisc1 entry point.
362$code.=<<___;
363	b	L\$done
364	nop
365
366	.ALIGN	64
367L\$parisc1
368___
# Closes the "#ifndef __OpenBSD__" emitted before the processor check.
369$code.=<<___ if ($SZ==8 && $SIZE_T==4);
370#endif
371___
372
# Register allocation for the 32-bit-only SHA-512 code: each of a..h is a
# hi/lo pair of 32-bit registers, and there are four address/scratch and
# four temporary registers.
373@V=(  $Ahi,  $Alo,  $Bhi,  $Blo,  $Chi,  $Clo,  $Dhi,  $Dlo,
374      $Ehi,  $Elo,  $Fhi,  $Flo,  $Ghi,  $Glo,  $Hhi,  $Hlo) =
375   ( "%r1", "%r2", "%r3", "%r4", "%r5", "%r6", "%r7", "%r8",
376     "%r9","%r10","%r11","%r12","%r13","%r14","%r15","%r16");
377$a0 ="%r17";
378$a1 ="%r18";
379$a2 ="%r19";
380$a3 ="%r20";
381$t0 ="%r21";
382$t1 ="%r22";
383$t2 ="%r28";
384$t3 ="%r29";
385$Tbl="%r31";
386
# Two hi/lo pairs: the current message word and the next one. The round
# generators rotate this list by one pair per round.
387@X=("%r23","%r24","%r25","%r26");	# zaps $num,$inp,$ctx
388
# Append the code for one SHA-512 round of the 32-bit-only code path to
# $code. 64-bit values are kept as hi/lo register pairs and added with
# add/addc; rotations are built from pairs of shd instructions.
#   $i          - round index
#   $ahi..$hlo  - the 16 registers currently holding a..h (hi, lo order)
#   $flag       - true when called from ROUND_16_xx_pa1
# Without $flag the next message word X[i+1] is preloaded from the local
# area on the stack into the second pair of @X. After X[i] has been added
# to h, the first pair is reused to fetch the round constant from $Tbl.
# With $flag set and $i==15 the low 10 bits of that constant are compared
# with $LAST10BITS to decide whether to branch back to L$rounds_pa1.
# Finally @X is rotated by one pair so that the preloaded word becomes the
# current one for the next round.
389sub ROUND_00_15_pa1 {
390my ($i,$ahi,$alo,$bhi,$blo,$chi,$clo,$dhi,$dlo,
391       $ehi,$elo,$fhi,$flo,$ghi,$glo,$hhi,$hlo,$flag)=@_;
392my ($Xhi,$Xlo,$Xnhi,$Xnlo) = @X;
393
394$code.=<<___ if (!$flag);
395	ldw	`-$XOFF+8*(($i+1)%16)`(%sp),$Xnhi
396	ldw	`-$XOFF+8*(($i+1)%16)+4`(%sp),$Xnlo	; load X[i+1]
397___
398$code.=<<___;
399	shd	$ehi,$elo,$Sigma1[0],$t0
400	 add	$Xlo,$hlo,$hlo
401	shd	$elo,$ehi,$Sigma1[0],$t1
402	 addc	$Xhi,$hhi,$hhi		; h += X[i]
403	shd	$ehi,$elo,$Sigma1[1],$t2
404	 ldwm	8($Tbl),$Xhi
405	shd	$elo,$ehi,$Sigma1[1],$t3
406	 ldw	-4($Tbl),$Xlo		; load K[i]
407	xor	$t2,$t0,$t0
408	xor	$t3,$t1,$t1
409	 and	$flo,$elo,$a0
410	 and	$fhi,$ehi,$a1
411	shd	$ehi,$elo,$Sigma1[2],$t2
412	 andcm	$glo,$elo,$a2
413	shd	$elo,$ehi,$Sigma1[2],$t3
414	 andcm	$ghi,$ehi,$a3
415	xor	$t2,$t0,$t0
416	xor	$t3,$t1,$t1		; Sigma1(e)
417	add	$Xlo,$hlo,$hlo
418	 xor	$a2,$a0,$a0
419	addc	$Xhi,$hhi,$hhi		; h += K[i]
420	 xor	$a3,$a1,$a1		; Ch(e,f,g)
421
422	 add	$t0,$hlo,$hlo
423	shd	$ahi,$alo,$Sigma0[0],$t0
424	 addc	$t1,$hhi,$hhi		; h += Sigma1(e)
425	shd	$alo,$ahi,$Sigma0[0],$t1
426	 add	$a0,$hlo,$hlo
427	shd	$ahi,$alo,$Sigma0[1],$t2
428	 addc	$a1,$hhi,$hhi		; h += Ch(e,f,g)
429	shd	$alo,$ahi,$Sigma0[1],$t3
430
431	xor	$t2,$t0,$t0
432	xor	$t3,$t1,$t1
433	shd	$ahi,$alo,$Sigma0[2],$t2
434	and	$alo,$blo,$a0
435	shd	$alo,$ahi,$Sigma0[2],$t3
436	and	$ahi,$bhi,$a1
437	xor	$t2,$t0,$t0
438	xor	$t3,$t1,$t1		; Sigma0(a)
439
440	and	$alo,$clo,$a2
441	and	$ahi,$chi,$a3
442	xor	$a2,$a0,$a0
443	 add	$hlo,$dlo,$dlo
444	xor	$a3,$a1,$a1
445	 addc	$hhi,$dhi,$dhi		; d += h
446	and	$blo,$clo,$a2
447	 add	$t0,$hlo,$hlo
448	and	$bhi,$chi,$a3
449	 addc	$t1,$hhi,$hhi		; h += Sigma0(a)
450	xor	$a2,$a0,$a0
451	 add	$a0,$hlo,$hlo
452	xor	$a3,$a1,$a1		; Maj(a,b,c)
453	 addc	$a1,$hhi,$hhi		; h += Maj(a,b,c)
454
455___
456$code.=<<___ if ($i==15 && $flag);
457	extru	$Xlo,31,10,$Xlo
458	comiclr,= $LAST10BITS,$Xlo,%r0
459	b	L\$rounds_pa1
460	nop
461___
462push(@X,shift(@X)); push(@X,shift(@X));
463}
464
# Append the code for one SHA-512 round with index >= 16 of the
# 32-bit-only code path to $code.
#   first argument - round index (16..31 in the unrolled loop body); it is
#                    reduced by 16 to address the 16-entry X[] area
#   remaining      - the 16 registers currently holding a..h (hi, lo order)
# The emitted code preloads X[i+1] into the second pair of @X, loads
# X[i+9] and X[i+14] from the stack, computes
#   X[i] += sigma0(X[i+1]) + X[i+9] + sigma1(X[i+14])
# on hi/lo register pairs and stores the updated X[i] back to the stack.
# The round itself is then emitted by ROUND_00_15_pa1 with its flag
# argument set, so that it neither preloads X[i+1] again nor omits the
# end-of-table check in the last round of the body.
#
# The assembly comments on the two final xor instructions used to read
# "sigma0(X[i+1)&0x0f])" and "sigma0(X[i+14)&0x0f])"; they now carry
# balanced parentheses and name sigma1 for the X[i+14] term.
sub ROUND_16_xx_pa1 {
my ($Xhi,$Xlo,$Xnhi,$Xnlo) = @X;
my ($i)=shift;
$i-=16;
$code.=<<___;
	ldw	`-$XOFF+8*(($i+1)%16)`(%sp),$Xnhi
	ldw	`-$XOFF+8*(($i+1)%16)+4`(%sp),$Xnlo	; load X[i+1]
	ldw	`-$XOFF+8*(($i+9)%16)`(%sp),$a1
	ldw	`-$XOFF+8*(($i+9)%16)+4`(%sp),$a0	; load X[i+9]
	ldw	`-$XOFF+8*(($i+14)%16)`(%sp),$a3
	ldw	`-$XOFF+8*(($i+14)%16)+4`(%sp),$a2	; load X[i+14]
	shd	$Xnhi,$Xnlo,$sigma0[0],$t0
	shd	$Xnlo,$Xnhi,$sigma0[0],$t1
	 add	$a0,$Xlo,$Xlo
	shd	$Xnhi,$Xnlo,$sigma0[1],$t2
	 addc	$a1,$Xhi,$Xhi
	shd	$Xnlo,$Xnhi,$sigma0[1],$t3
	xor	$t2,$t0,$t0
	shd	$Xnhi,$Xnlo,$sigma0[2],$t2
	xor	$t3,$t1,$t1
	extru	$Xnhi,`31-$sigma0[2]`,`32-$sigma0[2]`,$t3
	xor	$t2,$t0,$t0
	 shd	$a3,$a2,$sigma1[0],$a0
	xor	$t3,$t1,$t1		; sigma0(X[(i+1)&0x0f])
	 shd	$a2,$a3,$sigma1[0],$a1
	add	$t0,$Xlo,$Xlo
	 shd	$a3,$a2,$sigma1[1],$t2
	addc	$t1,$Xhi,$Xhi
	 shd	$a2,$a3,$sigma1[1],$t3
	xor	$t2,$a0,$a0
	shd	$a3,$a2,$sigma1[2],$t2
	xor	$t3,$a1,$a1
	extru	$a3,`31-$sigma1[2]`,`32-$sigma1[2]`,$t3
	xor	$t2,$a0,$a0
	xor	$t3,$a1,$a1		; sigma1(X[(i+14)&0x0f])
	add	$a0,$Xlo,$Xlo
	addc	$a1,$Xhi,$Xhi

	stw	$Xhi,`-$XOFF+8*($i%16)`(%sp)
	stw	$Xlo,`-$XOFF+8*($i%16)+4`(%sp)
___
&ROUND_00_15_pa1($i,@_,1);
}
# 32-bit-only SHA-512 code: load the context as sixteen 32-bit words (hi
# word first for each of a..h), set %sar from the low two bits of $inp,
# and start the per-block loop. For unaligned input $inp is rounded down
# and the block is read through vshd, which combines neighbouring words;
# this variant reads 33 words. Either way the 32 words of the block end
# up in the local X[] area on the stack, and the first message word stays
# in $X[0]/$X[1].
508$code.=<<___;
509	ldw	`0*4`($ctx),$Ahi		; load context
510	ldw	`1*4`($ctx),$Alo
511	ldw	`2*4`($ctx),$Bhi
512	ldw	`3*4`($ctx),$Blo
513	ldw	`4*4`($ctx),$Chi
514	ldw	`5*4`($ctx),$Clo
515	ldw	`6*4`($ctx),$Dhi
516	ldw	`7*4`($ctx),$Dlo
517	ldw	`8*4`($ctx),$Ehi
518	ldw	`9*4`($ctx),$Elo
519	ldw	`10*4`($ctx),$Fhi
520	ldw	`11*4`($ctx),$Flo
521	ldw	`12*4`($ctx),$Ghi
522	ldw	`13*4`($ctx),$Glo
523	ldw	`14*4`($ctx),$Hhi
524	ldw	`15*4`($ctx),$Hlo
525
526	extru	$inp,31,2,$t0
527	sh3addl	$t0,%r0,$t0
528	subi	32,$t0,$t0
529	mtctl	$t0,%cr11		; load %sar with align factor
530
531L\$oop_pa1
532	extru	$inp,31,2,$a3
533	comib,=	0,$a3,L\$aligned_pa1
534	sub	$inp,$a3,$inp
535
536	ldw	`0*4`($inp),$X[0]
537	ldw	`1*4`($inp),$X[1]
538	ldw	`2*4`($inp),$t2
539	ldw	`3*4`($inp),$t3
540	ldw	`4*4`($inp),$a0
541	ldw	`5*4`($inp),$a1
542	ldw	`6*4`($inp),$a2
543	ldw	`7*4`($inp),$a3
544	vshd	$X[0],$X[1],$X[0]
545	vshd	$X[1],$t2,$X[1]
546	stw	$X[0],`-$XOFF+0*4`(%sp)
547	ldw	`8*4`($inp),$t0
548	vshd	$t2,$t3,$t2
549	stw	$X[1],`-$XOFF+1*4`(%sp)
550	ldw	`9*4`($inp),$t1
551	vshd	$t3,$a0,$t3
552___
# Unaligned case, software pipelined over a rotating set of eight
# registers: store a finished word, reload the freed register with the
# word eight positions ahead, and shift the next pair. The second loop
# drains the pipeline once nothing is left to load.
553{
554my @t=($t2,$t3,$a0,$a1,$a2,$a3,$t0,$t1);
555for ($i=2;$i<=(128/4-8);$i++) {
556$code.=<<___;
557	stw	$t[0],`-$XOFF+$i*4`(%sp)
558	ldw	`(8+$i)*4`($inp),$t[0]
559	vshd	$t[1],$t[2],$t[1]
560___
561push(@t,shift(@t));
562}
563for (;$i<(128/4-1);$i++) {
564$code.=<<___;
565	stw	$t[0],`-$XOFF+$i*4`(%sp)
566	vshd	$t[1],$t[2],$t[1]
567___
568push(@t,shift(@t));
569}
570$code.=<<___;
571	b	L\$collected_pa1
572	stw	$t[0],`-$XOFF+$i*4`(%sp)
573
574___
575}
# Aligned case: the same copy to the stack without the vshd steps.
576$code.=<<___;
577L\$aligned_pa1
578	ldw	`0*4`($inp),$X[0]
579	ldw	`1*4`($inp),$X[1]
580	ldw	`2*4`($inp),$t2
581	ldw	`3*4`($inp),$t3
582	ldw	`4*4`($inp),$a0
583	ldw	`5*4`($inp),$a1
584	ldw	`6*4`($inp),$a2
585	ldw	`7*4`($inp),$a3
586	stw	$X[0],`-$XOFF+0*4`(%sp)
587	ldw	`8*4`($inp),$t0
588	stw	$X[1],`-$XOFF+1*4`(%sp)
589	ldw	`9*4`($inp),$t1
590___
591{
592my @t=($t2,$t3,$a0,$a1,$a2,$a3,$t0,$t1);
593for ($i=2;$i<(128/4-8);$i++) {
594$code.=<<___;
595	stw	$t[0],`-$XOFF+$i*4`(%sp)
596	ldw	`(8+$i)*4`($inp),$t[0]
597___
598push(@t,shift(@t));
599}
600for (;$i<128/4;$i++) {
601$code.=<<___;
602	stw	$t[0],`-$XOFF+$i*4`(%sp)
603___
604push(@t,shift(@t));
605}
606$code.="L\$collected_pa1\n";
607}
608
# Rounds 0..15 followed by the 16-round loop body for the 32-bit-only
# code. @V is rotated by two entries (one hi/lo pair) after every round.
609for($i=0;$i<16;$i++)	{ &ROUND_00_15_pa1($i,@V); unshift(@V,pop(@V)); unshift(@V,pop(@V)); }
610$code.="L\$rounds_pa1\n";
611for(;$i<32;$i++)	{ &ROUND_16_xx_pa1($i,@V); unshift(@V,pop(@V)); unshift(@V,pop(@V)); }
612
# Loop tail: reload the saved arguments, rewind $Tbl, add the previous
# context to a..h with add/addc on the lo/hi halves, store the result,
# advance $inp and either finish at L$done or save $inp and loop.
613$code.=<<___;
614	$POP	`-$FRAME_MARKER-2*$SIZE_T`(%sp),$ctx	; restore arguments
615	$POP	`-$FRAME_MARKER-3*$SIZE_T`(%sp),$inp
616	$POP	`-$FRAME_MARKER-4*$SIZE_T`(%sp),$num
617	ldo	`-$rounds*$SZ`($Tbl),$Tbl		; rewind $Tbl
618
619	ldw	`0*4`($ctx),$t1		; update context
620	ldw	`1*4`($ctx),$t0
621	ldw	`2*4`($ctx),$t3
622	ldw	`3*4`($ctx),$t2
623	ldw	`4*4`($ctx),$a1
624	ldw	`5*4`($ctx),$a0
625	ldw	`6*4`($ctx),$a3
626	add	$t0,$Alo,$Alo
627	ldw	`7*4`($ctx),$a2
628	addc	$t1,$Ahi,$Ahi
629	ldw	`8*4`($ctx),$t1
630	add	$t2,$Blo,$Blo
631	ldw	`9*4`($ctx),$t0
632	addc	$t3,$Bhi,$Bhi
633	ldw	`10*4`($ctx),$t3
634	add	$a0,$Clo,$Clo
635	ldw	`11*4`($ctx),$t2
636	addc	$a1,$Chi,$Chi
637	ldw	`12*4`($ctx),$a1
638	add	$a2,$Dlo,$Dlo
639	ldw	`13*4`($ctx),$a0
640	addc	$a3,$Dhi,$Dhi
641	ldw	`14*4`($ctx),$a3
642	add	$t0,$Elo,$Elo
643	ldw	`15*4`($ctx),$a2
644	addc	$t1,$Ehi,$Ehi
645	stw	$Ahi,`0*4`($ctx)
646	add	$t2,$Flo,$Flo
647	stw	$Alo,`1*4`($ctx)
648	addc	$t3,$Fhi,$Fhi
649	stw	$Bhi,`2*4`($ctx)
650	add	$a0,$Glo,$Glo
651	stw	$Blo,`3*4`($ctx)
652	addc	$a1,$Ghi,$Ghi
653	stw	$Chi,`4*4`($ctx)
654	add	$a2,$Hlo,$Hlo
655	stw	$Clo,`5*4`($ctx)
656	addc	$a3,$Hhi,$Hhi
657	stw	$Dhi,`6*4`($ctx)
658	ldo	`16*$SZ`($inp),$inp	; advance $inp
659	stw	$Dlo,`7*4`($ctx)
660	stw	$Ehi,`8*4`($ctx)
661	stw	$Elo,`9*4`($ctx)
662	stw	$Fhi,`10*4`($ctx)
663	stw	$Flo,`11*4`($ctx)
664	stw	$Ghi,`12*4`($ctx)
665	stw	$Glo,`13*4`($ctx)
666	stw	$Hhi,`14*4`($ctx)
667	comb,=	$inp,$num,L\$done
668	stw	$Hlo,`15*4`($ctx)
669	b	L\$oop_pa1
670	$PUSH	$inp,`-$FRAME_MARKER-3*$SIZE_T`(%sp)	; save $inp
671L\$done
672___
673}}
674$code.=<<___;
675	$POP	`-$FRAME-$SAVED_RP`(%sp),%r2		; standard epilogue
676	$POP	`-$FRAME+1*$SIZE_T`(%sp),%r4
677	$POP	`-$FRAME+2*$SIZE_T`(%sp),%r5
678	$POP	`-$FRAME+3*$SIZE_T`(%sp),%r6
679	$POP	`-$FRAME+4*$SIZE_T`(%sp),%r7
680	$POP	`-$FRAME+5*$SIZE_T`(%sp),%r8
681	$POP	`-$FRAME+6*$SIZE_T`(%sp),%r9
682	$POP	`-$FRAME+7*$SIZE_T`(%sp),%r10
683	$POP	`-$FRAME+8*$SIZE_T`(%sp),%r11
684	$POP	`-$FRAME+9*$SIZE_T`(%sp),%r12
685	$POP	`-$FRAME+10*$SIZE_T`(%sp),%r13
686	$POP	`-$FRAME+11*$SIZE_T`(%sp),%r14
687	$POP	`-$FRAME+12*$SIZE_T`(%sp),%r15
688	$POP	`-$FRAME+13*$SIZE_T`(%sp),%r16
689	$POP	`-$FRAME+14*$SIZE_T`(%sp),%r17
690	$POP	`-$FRAME+15*$SIZE_T`(%sp),%r18
691	bv	(%r2)
692	.EXIT
693	$POPMB	-$FRAME(%sp),%r3
694	.PROCEND
695
696	.data
697	.STRINGZ "SHA`64*$SZ` block transform for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
698___
699
700# Explicitly encode PA-RISC 2.0 instructions used in this module, so
701# that it can be compiled with .LEVEL 1.0. It should be noted that I
702# wouldn't have to do this, if GNU assembler understood .ALLOW 2.0
703# directive...
704
# Encoder for "ldd": turns the instruction into a raw .WORD so that it can
# be assembled under .LEVEL 1.0.
#   $mod  - completer text that followed the mnemonic ("" or e.g. ",ma")
#   $args - operand text of the form "disp(%rB),%rT"
# Returns the replacement line, with the original instruction appended as
# a comment. Operands of any other shape are passed through as a plain
# "ldd" line.
my $ldd = sub {
    my ($mod,$args) = @_;
    my $text = "ldd$mod\t$args";

    # format 3 suffices
    my ($disp,$base,$target) =
	($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/)
	    or return "\t".$text;

    my $word = (0x14<<26) | ($base<<21) | ($target<<16);
    $word |= ($disp&0x1FF8)<<1;		# displacement bits 3..12
    $word |= ($disp>>13)&1;		# displacement bit 13 goes to bit 0
    $word |= (1<<3) if ($mod =~ /^,m/);
    $word |= (1<<2) if ($mod =~ /^,mb/);

    return sprintf("\t.WORD\t0x%08x\t; %s",$word,$text);
};
717
# Encoder for "std": turns the instruction into a raw .WORD so that it can
# be assembled under .LEVEL 1.0.
#   $mod  - completer text that followed the mnemonic; it is only echoed
#           in the comment and does not influence the encoding
#   $args - operand text of the form "%rS,disp(%rB)"
# Returns the replacement line, with the original instruction appended as
# a comment. Operands of any other shape are passed through as a plain
# "std" line.
my $std = sub {
    my ($mod,$args) = @_;
    my $text = "std$mod\t$args";

    # format 3 suffices
    my ($source,$disp,$base) =
	($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/)
	    or return "\t".$text;

    my $word = (0x1c<<26) | ($base<<21) | ($source<<16);
    $word |= ($disp&0x1FF8)<<1;		# displacement bits 3..12
    $word |= ($disp>>13)&1;		# displacement bit 13 goes to bit 0

    return sprintf("\t.WORD\t0x%08x\t; %s",$word,$text);
};
728
# Encoder for "extrd": turns the instruction into a raw .WORD so that it
# can be assembled under .LEVEL 1.0. Two operand shapes are recognised:
#   "%rS,pos,len,%rT"   - fixed position (format 15)
#   "%rS,%sar,len,%rT"  - position taken from %sar (format 12)
#   $mod  - completer text that followed the mnemonic; for the %sar form a
#           completer matching /,\**=/ sets one additional opcode bit
#   $args - operand text
# Returns the replacement line, with the original instruction appended as
# a comment. Operands of any other shape are passed through as a plain
# "extrd" line.
my $extrd = sub {
    my ($mod,$args) = @_;
    my $text = "extrd$mod\t$args";

    # I only have ",u" completer, it's implicitly encoded...
    if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) {	# format 15
	my ($source,$pos,$width,$target) = ($1,$2,$3,$4);
	my $len  = 32-$width;
	my $word = (0x36<<26) | ($source<<21) | ($target<<16);
	$word |= (($pos&0x20)<<6) | (($pos&0x1f)<<5);		# encode pos
	$word |= (($len&0x20)<<7) | ($len&0x1f);		# encode len
	return sprintf("\t.WORD\t0x%08x\t; %s",$word,$text);
    }

    if ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/) {	# format 12
	my ($source,$width,$target) = ($1,$2,$3);
	my $len  = 32-$width;
	my $word = (0x34<<26) | ($source<<21) | ($target<<16) | (2<<11) | (1<<9);
	$word |= (($len&0x20)<<3) | ($len&0x1f);		# encode len
	$word |= (1<<13) if ($mod =~ /,\**=/);
	return sprintf("\t.WORD\t0x%08x\t; %s",$word,$text);
    }

    return "\t".$text;
};
750
# Encoder for "shrpd": turns the instruction into a raw .WORD so that it
# can be assembled under .LEVEL 1.0. Two operand shapes are recognised:
#   "%rA,%rB,sa,%rT"    - fixed shift amount (format 14)
#   "%rA,%rB,%sar,%rT"  - shift amount taken from %sar (format 11)
#   $mod  - completer text that followed the mnemonic; it is only echoed
#           in the comment
#   $args - operand text
# Returns the replacement line, with the original instruction appended as
# a comment. Operands of any other shape are passed through as a plain
# "shrpd" line.
my $shrpd = sub {
    my ($mod,$args) = @_;
    my $text = "shrpd$mod\t$args";

    if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/) {	# format 14
	my ($first,$second,$shift,$target) = ($1,$2,$3,$4);
	my $cpos = 63-$shift;
	my $word = (0x34<<26) | ($second<<21) | ($first<<16) | (1<<10);
	$word |= $target;
	$word |= (($cpos&0x20)<<6) | (($cpos&0x1f)<<5);		# encode sa
	return sprintf("\t.WORD\t0x%08x\t; %s",$word,$text);
    }

    if ($args =~ /%r([0-9]+),%r([0-9]+),%sar,%r([0-9]+)/) {	# format 11
	my ($first,$second,$target) = ($1,$2,$3);
	my $word = (0x34<<26) | ($second<<21) | ($first<<16) | (1<<9);
	$word |= $target;
	return sprintf("\t.WORD\t0x%08x\t; %s",$word,$text);
    }

    return "\t".$text;
};
767
# Produce the output text for one instruction.
#   $mnemonic - instruction name (lower-case letters only)
#   $mod      - completer text that directly followed the mnemonic
#   $args     - operand text
# Instructions that have an encoder above (ldd, std, extrd, shrpd) are
# handed to it; everything else is returned as "\tmnemonic+mod\targs".
#
# The encoders are looked up in an explicit table. Earlier the mnemonic
# was turned into a variable name with a string eval, which would have
# picked up any scalar of that name that happened to hold a code
# reference.
sub assemble {
  my ($mnemonic,$mod,$args)=@_;
  my %encoder = (
	ldd	=> $ldd,
	std	=> $std,
	extrd	=> $extrd,
	shrpd	=> $shrpd,
  );
  my $opcode = $encoder{$mnemonic};

    ref($opcode) eq 'CODE' ? &$opcode($mod,$args) : "\t$mnemonic$mod\t$args";
}
774
# Post-process the accumulated template line by line and print it.
#  - `...` spans are evaluated as Perl expressions.
#  - At most one of the following rewrites is applied (they are chained
#    with "or"): shd with a shift count above 31 gets its register
#    operands swapped and the count reduced by 32; the made-up
#    instructions _ror, _shr, _align and _shl are replaced by real ones
#    according to $SZ or $SIZE_T.
#  - In 32-bit builds every instruction line is passed through assemble()
#    and "cmpb,*" becomes "comb,".
#  - In 64-bit builds "bv" becomes "bve".
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/ge;

	s/shd\s+(%r[0-9]+),(%r[0-9]+),([0-9]+)/
		$3>31 ? sprintf("shd\t%$2,%$1,%d",$3-32)	# rotation for >=32
		:       sprintf("shd\t%$1,%$2,%d",$3)/e			or
	# translate made up instructions: _ror, _shr, _align, _shl
	s/_ror(\s+)(%r[0-9]+),/
		($SZ==4 ? "shd" : "shrpd")."$1$2,$2,"/e			or

	s/_shr(\s+%r[0-9]+),([0-9]+),/
		$SZ==4 ? sprintf("extru%s,%d,%d,",$1,31-$2,32-$2)
		:        sprintf("extrd,u%s,%d,%d,",$1,63-$2,64-$2)/e	or

	s/_align(\s+%r[0-9]+,%r[0-9]+),/
		($SZ==4 ? "vshd$1," : "shrpd$1,%sar,")/e		or

	s/_shl(\s+%r[0-9]+),([0-9]+),/
		$SIZE_T==4 ? sprintf("zdep%s,%d,%d,",$1,31-$2,32-$2)
		:            sprintf("depd,z%s,%d,%d,",$1,63-$2,64-$2)/e;

	s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e if ($SIZE_T==4);

	s/cmpb,\*/comb,/ if ($SIZE_T==4);

	s/\bbv\b/bve/    if ($SIZE_T==8);

	print $_,"\n";
}

# Buffered write errors (e.g. a full disk) only surface when the handle is
# closed, so a failing close must not go unnoticed.
close(STDOUT) or die "error closing STDOUT: $!";

806