#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# SHA256 performance improvement over compiler generated code varies
# from 40% for Sun C [32-bit build] to 70% for gcc [3.3, 64-bit
# build]. Just as in the SHA1 module, I aim to ensure scalability on
# UltraSPARC T1 by packing X[16] into 8 64-bit registers.

# SHA512 on pre-T1 UltraSPARC.
#
# Performance is >75% better than 64-bit code generated by Sun C and
# over 2x better than 32-bit code. X[16] resides on the stack, but
# access to it is scheduled for L2 latency and staged through the 32
# least significant bits of %l0-%l7. The latter is done to achieve
# 32-/64-bit ABI duality. Nevertheless it's ~40% faster than SHA256,
# which is pretty good [the optimal coefficient is 50%].
#
# SHA512 on UltraSPARC T1.
#
# It's not any faster than 64-bit code generated by Sun C 5.8. This is
# because the 64-bit code generator has the advantage of using 64-bit
# loads(*) to access X[16], which I consciously traded for 32-/64-bit
# ABI duality [as per above]. But it surpasses 32-bit Sun C generated
# code by 60%, not to mention that it doesn't suffer from severe decay
# when running four times as many threads as there are physical cores,
# and that it leaves gcc [3.4] behind by a factor of over 4x! Compared
# to SHA256, single-thread performance is only 10% better, but overall
# throughput at the maximum number of threads for a given CPU exceeds
# the corresponding SHA256 figure by 30% [again, the optimal
# coefficient is 50%].
#
# (*)	Unlike on pre-T1 UltraSPARC, loads on T1 are executed strictly
#	in-order, i.e. a load instruction has to complete before the
#	next instruction in the given thread is executed, even if the
#	latter does not depend on the load result! This means that on
#	T1 two 32-bit loads are always slower than one 64-bit load.
#	Once again this is unlike pre-T1 UltraSPARC, where, if
#	scheduled appropriately, 2x32-bit loads can be as fast as
#	1x64-bit ones.

$bits=32;
for (@ARGV)	{ $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
if ($bits==64)	{ $bias=2047; $frame=192; }
else		{ $bias=0;    $frame=112; }
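# Note: 2047 is the stack bias mandated by the 64-bit SPARC V9 ABI (every
# %sp-relative access below therefore adds $bias), while $frame reserves
# the ABI-required register save area plus some scratch in either mode.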

$output=shift;
open STDOUT,">$output" or die "can't open $output: $!";

if ($output =~ /512/) {
	$label="512";
	$SZ=8;
	$LD="ldx";		# load from memory
	$ST="stx";		# store to memory
	$SLL="sllx";		# shift left logical
	$SRL="srlx";		# shift right logical
	@Sigma0=(28,34,39);
	@Sigma1=(14,18,41);
	@sigma0=( 7, 1, 8);	# right shift first
	@sigma1=( 6,19,61);	# right shift first
	$lastK=0x817;
	$rounds=80;
	$align=4;

	$locals=16*$SZ;		# X[16]

	$A="%o0";
	$B="%o1";
	$C="%o2";
	$D="%o3";
	$E="%o4";
	$F="%o5";
	$G="%g1";
	$H="%o7";
	@V=($A,$B,$C,$D,$E,$F,$G,$H);
} else {
	$label="256";
	$SZ=4;
	$LD="ld";		# load from memory
	$ST="st";		# store to memory
	$SLL="sll";		# shift left logical
	$SRL="srl";		# shift right logical
	@Sigma0=( 2,13,22);
	@Sigma1=( 6,11,25);
	@sigma0=( 3, 7,18);	# right shift first
	@sigma1=(10,17,19);	# right shift first
	$lastK=0x8f2;
	$rounds=64;
	$align=8;

	$locals=0;		# X[16] is register resident
	@X=("%o0","%o1","%o2","%o3","%o4","%o5","%g1","%o7");

	$A="%l0";
	$B="%l1";
	$C="%l2";
	$D="%l3";
	$E="%l4";
	$F="%l5";
	$G="%l6";
	$H="%l7";
	@V=($A,$B,$C,$D,$E,$F,$G,$H);
}
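
# For reference, a minimal Perl sketch (assuming 64-bit integers and
# 0 < $n < 64) of the identity the round code below relies on: a rotate
# is an SRL/SLL pair whose results occupy disjoint bit positions, so they
# may be combined with "or" or, as below, folded into the "xor" chains:
#
#	sub ROTR64 { my ($x,$n)=@_; ($x >> $n) | ($x << (64 - $n)) }
#	# e.g. Sigma0_512($a) == ROTR64($a,28)^ROTR64($a,34)^ROTR64($a,39)
#
# (the 32-bit variant additionally masks the left shift to 32 bits).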
$T1="%g2";
$tmp0="%g3";
$tmp1="%g4";
$tmp2="%g5";

$ctx="%i0";
$inp="%i1";
$len="%i2";
$Ktbl="%i3";
$tmp31="%i4";
$tmp32="%i5";

########### SHA256
$Xload = sub {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;

    if ($i==0) {
$code.=<<___;
	ldx	[$inp+0],@X[0]
	ldx	[$inp+16],@X[2]
	ldx	[$inp+32],@X[4]
	ldx	[$inp+48],@X[6]
	ldx	[$inp+8],@X[1]
	ldx	[$inp+24],@X[3]
	subcc	%g0,$tmp31,$tmp32 ! should be 64-$tmp31, but -$tmp31 works too
	ldx	[$inp+40],@X[5]
	bz,pt	%icc,.Laligned
	ldx	[$inp+56],@X[7]

	sllx	@X[0],$tmp31,@X[0]
	ldx	[$inp+64],$T1
___
for($j=0;$j<7;$j++)
{   $code.=<<___;
	srlx	@X[$j+1],$tmp32,$tmp1
	sllx	@X[$j+1],$tmp31,@X[$j+1]
	or	$tmp1,@X[$j],@X[$j]
___
}
$code.=<<___;
	srlx	$T1,$tmp32,$T1
	or	$T1,@X[7],@X[7]
.Laligned:
___
    }

    if ($i&1) {
	$code.="\tadd	@X[$i/2],$h,$T1\n";
    } else {
	$code.="\tsrlx	@X[$i/2],32,$T1\n\tadd	$h,$T1,$T1\n";
    }
} if ($SZ==4);
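
# Packing note: each @X register holds two big-endian 32-bit message words,
# word 2k in the upper half of @X[k] (hence the srlx by 32 for even $i) and
# word 2k+1 in the lower half (used directly for odd $i); only the low 32
# bits of $T1 matter to the 32-bit round arithmetic that follows.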

########### SHA512
$Xload = sub {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
my @pair=("%l".eval(($i*2)%8),"%l".eval(($i*2)%8+1),"%l".eval((($i+1)*2)%8));

$code.=<<___ if ($i==0);
	ld	[$inp+0],%l0
	ld	[$inp+4],%l1
	ld	[$inp+8],%l2
	ld	[$inp+12],%l3
	ld	[$inp+16],%l4
	ld	[$inp+20],%l5
	ld	[$inp+24],%l6
	ld	[$inp+28],%l7
___
$code.=<<___ if ($i<15);
	sllx	@pair[1],$tmp31,$tmp2	! Xload($i)
	add	$tmp31,32,$tmp0
	sllx	@pair[0],$tmp0,$tmp1
	`"ld	[$inp+".eval(32+0+$i*8)."],@pair[0]"	if ($i<12)`
	srlx	@pair[2],$tmp32,@pair[1]
	or	$tmp1,$tmp2,$tmp2
	or	@pair[1],$tmp2,$tmp2
	`"ld	[$inp+".eval(32+4+$i*8)."],@pair[1]"	if ($i<12)`
	add	$h,$tmp2,$T1
	$ST	$tmp2,[%sp+`$bias+$frame+$i*$SZ`]
___
$code.=<<___ if ($i==12);
	brnz,a	$tmp31,.+8
	ld	[$inp+128],%l0
___
$code.=<<___ if ($i==15);
	ld	[%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+0`],%l2
	sllx	@pair[1],$tmp31,$tmp2	! Xload($i)
	add	$tmp31,32,$tmp0
	ld	[%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+4`],%l3
	sllx	@pair[0],$tmp0,$tmp1
	ld	[%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+0`],%l4
	srlx	@pair[2],$tmp32,@pair[1]
	or	$tmp1,$tmp2,$tmp2
	ld	[%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+4`],%l5
	or	@pair[1],$tmp2,$tmp2
	ld	[%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+0`],%l6
	add	$h,$tmp2,$T1
	$ST	$tmp2,[%sp+`$bias+$frame+$i*$SZ`]
	ld	[%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+4`],%l7
	ld	[%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+0`],%l0
	ld	[%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+4`],%l1
___
} if ($SZ==8);
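
# Staging note: here X[16] lives on the stack and every 64-bit word travels
# as two 32-bit halves through %l0-%l7, which is what buys the 32-/64-bit
# ABI duality described in the header, at the cost of the sllx/srlx/or
# merging visible above.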

########### common
sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;

    if ($i<16) {
	&$Xload(@_);
    } else {
	$code.="\tadd	$h,$T1,$T1\n";
    }

$code.=<<___;
	$SRL	$e,@Sigma1[0],$h	!! $i
	xor	$f,$g,$tmp2
	$SLL	$e,`$SZ*8-@Sigma1[2]`,$tmp1
	and	$e,$tmp2,$tmp2
	$SRL	$e,@Sigma1[1],$tmp0
	xor	$tmp1,$h,$h
	$SLL	$e,`$SZ*8-@Sigma1[1]`,$tmp1
	xor	$tmp0,$h,$h
	$SRL	$e,@Sigma1[2],$tmp0
	xor	$tmp1,$h,$h
	$SLL	$e,`$SZ*8-@Sigma1[0]`,$tmp1
	xor	$tmp0,$h,$h
	xor	$g,$tmp2,$tmp2		! Ch(e,f,g)
	xor	$tmp1,$h,$tmp0		! Sigma1(e)

	$SRL	$a,@Sigma0[0],$h
	add	$tmp2,$T1,$T1
	$LD	[$Ktbl+`$i*$SZ`],$tmp2	! K[$i]
	$SLL	$a,`$SZ*8-@Sigma0[2]`,$tmp1
	add	$tmp0,$T1,$T1
	$SRL	$a,@Sigma0[1],$tmp0
	xor	$tmp1,$h,$h
	$SLL	$a,`$SZ*8-@Sigma0[1]`,$tmp1
	xor	$tmp0,$h,$h
	$SRL	$a,@Sigma0[2],$tmp0
	xor	$tmp1,$h,$h
	$SLL	$a,`$SZ*8-@Sigma0[0]`,$tmp1
	xor	$tmp0,$h,$h
	xor	$tmp1,$h,$h		! Sigma0(a)

	or	$a,$b,$tmp0
	and	$a,$b,$tmp1
	and	$c,$tmp0,$tmp0
	or	$tmp0,$tmp1,$tmp1	! Maj(a,b,c)
	add	$tmp2,$T1,$T1		! +=K[$i]
	add	$tmp1,$h,$h

	add	$T1,$d,$d
	add	$T1,$h,$h
___
}
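
# For reference, the round emitted above is the standard SHA-2 round, a
# sketch in word arithmetic mod 2^(8*$SZ) ($T1 arrives holding h+X[i]):
#
#	T1 += Sigma1(e) + Ch(e,f,g) + K[i];
#	h   = Sigma0(a) + Maj(a,b,c);
#	d  += T1;
#	h  += T1;
#
# with Ch(e,f,g) = ((f^g)&e)^g and Maj(a,b,c) = (a&b)|((a|b)&c), which are
# exactly the xor/and/or sequences annotated in the template.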

########### SHA256
$BODY_16_XX = sub {
my $i=@_[0];
my $xi;

    if ($i&1) {
	$xi=$tmp32;
	$code.="\tsrlx	@X[(($i+1)/2)%8],32,$xi\n";
    } else {
	$xi=@X[(($i+1)/2)%8];
    }
$code.=<<___;
	srl	$xi,@sigma0[0],$T1		!! Xupdate($i)
	sll	$xi,`32-@sigma0[2]`,$tmp1
	srl	$xi,@sigma0[1],$tmp0
	xor	$tmp1,$T1,$T1
	sll	$tmp1,`@sigma0[2]-@sigma0[1]`,$tmp1
	xor	$tmp0,$T1,$T1
	srl	$xi,@sigma0[2],$tmp0
	xor	$tmp1,$T1,$T1
___
    if ($i&1) {
	$xi=@X[(($i+14)/2)%8];
    } else {
	$xi=$tmp32;
	$code.="\tsrlx	@X[(($i+14)/2)%8],32,$xi\n";
    }
$code.=<<___;
	srl	$xi,@sigma1[0],$tmp2
	xor	$tmp0,$T1,$T1			! T1=sigma0(X[i+1])
	sll	$xi,`32-@sigma1[2]`,$tmp1
	srl	$xi,@sigma1[1],$tmp0
	xor	$tmp1,$tmp2,$tmp2
	sll	$tmp1,`@sigma1[2]-@sigma1[1]`,$tmp1
	xor	$tmp0,$tmp2,$tmp2
	srl	$xi,@sigma1[2],$tmp0
	xor	$tmp1,$tmp2,$tmp2
___
    if ($i&1) {
	$xi=@X[($i/2)%8];
$code.=<<___;
	srlx	@X[(($i+9)/2)%8],32,$tmp1	! X[i+9]
	xor	$tmp0,$tmp2,$tmp2		! sigma1(X[i+14])
	srl	@X[($i/2)%8],0,$tmp0
	add	$tmp2,$tmp1,$tmp1
	add	$xi,$T1,$T1			! +=X[i]
	xor	$tmp0,@X[($i/2)%8],@X[($i/2)%8]
	add	$tmp1,$T1,$T1

	srl	$T1,0,$T1
	or	$T1,@X[($i/2)%8],@X[($i/2)%8]
___
    } else {
	$xi=@X[(($i+9)/2)%8];
$code.=<<___;
	srlx	@X[($i/2)%8],32,$tmp1		! X[i]
	xor	$tmp0,$tmp2,$tmp2		! sigma1(X[i+14])
	add	$xi,$T1,$T1			! +=X[i+9]
	add	$tmp2,$tmp1,$tmp1
	srl	@X[($i/2)%8],0,@X[($i/2)%8]
	add	$tmp1,$T1,$T1

	sllx	$T1,32,$tmp0
	or	$tmp0,@X[($i/2)%8],@X[($i/2)%8]
___
    }
    &BODY_00_15(@_);
} if ($SZ==4);
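
# The recurrence computed above is the standard message schedule, kept in
# a sliding window of 16 words (indices mod 16):
#
#	X[i%16] += sigma0(X[(i+1)%16]) + X[(i+9)%16] + sigma1(X[(i+14)%16])
#
# i.e. X[i] = X[i-16] + sigma0(X[i-15]) + X[i-7] + sigma1(X[i-2]) in the
# flat indexing of FIPS 180, with the even/odd cases selecting the high
# or low half of the packed @X registers.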

########### SHA512
$BODY_16_XX = sub {
my $i=@_[0];
my @pair=("%l".eval(($i*2)%8),"%l".eval(($i*2)%8+1));

$code.=<<___;
	sllx	%l2,32,$tmp0		!! Xupdate($i)
	or	%l3,$tmp0,$tmp0

	srlx	$tmp0,@sigma0[0],$T1
	ld	[%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+0`],%l2
	sllx	$tmp0,`64-@sigma0[2]`,$tmp1
	ld	[%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+4`],%l3
	srlx	$tmp0,@sigma0[1],$tmp0
	xor	$tmp1,$T1,$T1
	sllx	$tmp1,`@sigma0[2]-@sigma0[1]`,$tmp1
	xor	$tmp0,$T1,$T1
	srlx	$tmp0,`@sigma0[2]-@sigma0[1]`,$tmp0
	xor	$tmp1,$T1,$T1
	sllx	%l6,32,$tmp2
	xor	$tmp0,$T1,$T1		! sigma0(X[$i+1])
	or	%l7,$tmp2,$tmp2

	srlx	$tmp2,@sigma1[0],$tmp1
	ld	[%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+0`],%l6
	sllx	$tmp2,`64-@sigma1[2]`,$tmp0
	ld	[%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+4`],%l7
	srlx	$tmp2,@sigma1[1],$tmp2
	xor	$tmp0,$tmp1,$tmp1
	sllx	$tmp0,`@sigma1[2]-@sigma1[1]`,$tmp0
	xor	$tmp2,$tmp1,$tmp1
	srlx	$tmp2,`@sigma1[2]-@sigma1[1]`,$tmp2
	xor	$tmp0,$tmp1,$tmp1
	sllx	%l4,32,$tmp0
	xor	$tmp2,$tmp1,$tmp1	! sigma1(X[$i+14])
	ld	[%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+0`],%l4
	or	%l5,$tmp0,$tmp0
	ld	[%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+4`],%l5

	sllx	%l0,32,$tmp2
	add	$tmp1,$T1,$T1
	ld	[%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+0`],%l0
	or	%l1,$tmp2,$tmp2
	add	$tmp0,$T1,$T1		! +=X[$i+9]
	ld	[%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+4`],%l1
	add	$tmp2,$T1,$T1		! +=X[$i]
	$ST	$T1,[%sp+`$bias+$frame+($i%16)*$SZ`]
___
    &BODY_00_15(@_);
} if ($SZ==8);

$code.=<<___ if ($bits==64);
.register	%g2,#scratch
.register	%g3,#scratch
___
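
# %g2 and %g3 are application registers under the 64-bit SPARC ABI; the
# .register directives declare them as scratch so the assembler accepts
# their use here.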
$code.=<<___;
.section	".rodata",#alloc

.align	64
K${label}:
.type	K${label},#object
___
if ($SZ==4) {
$code.=<<___;
	.long	0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
	.long	0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
	.long	0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
	.long	0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
	.long	0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
	.long	0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
	.long	0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
	.long	0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
	.long	0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
	.long	0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
	.long	0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
	.long	0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
	.long	0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
	.long	0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
	.long	0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
	.long	0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
___
} else {
$code.=<<___;
	.long	0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd
	.long	0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc
	.long	0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019
	.long	0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118
	.long	0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe
	.long	0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2
	.long	0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1
	.long	0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694
	.long	0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3
	.long	0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65
	.long	0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483
	.long	0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5
	.long	0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210
	.long	0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4
	.long	0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725
	.long	0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70
	.long	0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926
	.long	0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df
	.long	0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8
	.long	0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b
	.long	0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001
	.long	0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30
	.long	0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910
	.long	0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8
	.long	0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53
	.long	0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8
	.long	0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb
	.long	0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3
	.long	0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60
	.long	0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec
	.long	0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9
	.long	0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b
	.long	0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207
	.long	0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178
	.long	0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6
	.long	0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b
	.long	0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493
	.long	0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c
	.long	0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a
	.long	0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817
___
}
$code.=<<___;
.size	K${label},.-K${label}

.section	".text",#alloc,#execinstr
.globl	sha${label}_block_data_order
sha${label}_block_data_order:
	save	%sp,`-$frame-$locals`,%sp
#ifdef __PIC__
	sethi	%hi(_GLOBAL_OFFSET_TABLE_-4), %o5
	rd	%pc, %o4
	or	%o5, %lo(_GLOBAL_OFFSET_TABLE_+4), %o5
	add	%o5, %o4, %o5
#endif
	and	$inp,`$align-1`,$tmp31
	sllx	$len,`log(16*$SZ)/log(2)`,$len
	andn	$inp,`$align-1`,$inp
	sll	$tmp31,3,$tmp31
	add	$inp,$len,$len
___
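
# Alignment strategy: $tmp31 now holds the input's misalignment in bits
# (misaligned bytes times 8), $inp has been rounded down to an aligned
# address, and $len has been turned from a block count into the
# end-of-input pointer; the Xload code re-merges adjacent aligned words
# with sllx/srlx/or to reconstruct the unaligned byte stream.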
$code.=<<___ if ($SZ==8); # SHA512
	mov	32,$tmp32
	sub	$tmp32,$tmp31,$tmp32
___
$code.=<<___;
#ifdef __PIC__
	set	K${label}, $Ktbl
	ldx	[$Ktbl+%o5], $Ktbl
#else
	set	K${label}, $Ktbl
#endif

	$LD	[$ctx+`0*$SZ`],$A
	$LD	[$ctx+`1*$SZ`],$B
	$LD	[$ctx+`2*$SZ`],$C
	$LD	[$ctx+`3*$SZ`],$D
	$LD	[$ctx+`4*$SZ`],$E
	$LD	[$ctx+`5*$SZ`],$F
	$LD	[$ctx+`6*$SZ`],$G
	$LD	[$ctx+`7*$SZ`],$H

.Lloop:
___
for ($i=0;$i<16;$i++)	{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=".L16_xx:\n";
for (;$i<32;$i++)	{ &$BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	and	$tmp2,0xfff,$tmp2
	cmp	$tmp2,$lastK
	bne	.L16_xx
	add	$Ktbl,`16*$SZ`,$Ktbl	! Ktbl+=16

___
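
# Loop-control trick: instead of a round counter, the low 12 bits of the K
# value fetched in the most recent round ($tmp2) are compared against
# $lastK; 0x8f2 and 0x817 match the final constants 0xc67178f2 (SHA-256)
# and 0x6c44198c4a475817 (SHA-512), so the K pointer itself serves as the
# induction variable.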
$code.=<<___ if ($SZ==4); # SHA256
	$LD	[$ctx+`0*$SZ`],@X[0]
	$LD	[$ctx+`1*$SZ`],@X[1]
	$LD	[$ctx+`2*$SZ`],@X[2]
	$LD	[$ctx+`3*$SZ`],@X[3]
	$LD	[$ctx+`4*$SZ`],@X[4]
	$LD	[$ctx+`5*$SZ`],@X[5]
	$LD	[$ctx+`6*$SZ`],@X[6]
	$LD	[$ctx+`7*$SZ`],@X[7]

	add	$A,@X[0],$A
	$ST	$A,[$ctx+`0*$SZ`]
	add	$B,@X[1],$B
	$ST	$B,[$ctx+`1*$SZ`]
	add	$C,@X[2],$C
	$ST	$C,[$ctx+`2*$SZ`]
	add	$D,@X[3],$D
	$ST	$D,[$ctx+`3*$SZ`]
	add	$E,@X[4],$E
	$ST	$E,[$ctx+`4*$SZ`]
	add	$F,@X[5],$F
	$ST	$F,[$ctx+`5*$SZ`]
	add	$G,@X[6],$G
	$ST	$G,[$ctx+`6*$SZ`]
	add	$H,@X[7],$H
	$ST	$H,[$ctx+`7*$SZ`]
___
$code.=<<___ if ($SZ==8); # SHA512
	ld	[$ctx+`0*$SZ+0`],%l0
	ld	[$ctx+`0*$SZ+4`],%l1
	ld	[$ctx+`1*$SZ+0`],%l2
	ld	[$ctx+`1*$SZ+4`],%l3
	ld	[$ctx+`2*$SZ+0`],%l4
	ld	[$ctx+`2*$SZ+4`],%l5
	ld	[$ctx+`3*$SZ+0`],%l6

	sllx	%l0,32,$tmp0
	ld	[$ctx+`3*$SZ+4`],%l7
	sllx	%l2,32,$tmp1
	or	%l1,$tmp0,$tmp0
	or	%l3,$tmp1,$tmp1
	add	$tmp0,$A,$A
	add	$tmp1,$B,$B
	$ST	$A,[$ctx+`0*$SZ`]
	sllx	%l4,32,$tmp2
	$ST	$B,[$ctx+`1*$SZ`]
	sllx	%l6,32,$T1
	or	%l5,$tmp2,$tmp2
	or	%l7,$T1,$T1
	add	$tmp2,$C,$C
	$ST	$C,[$ctx+`2*$SZ`]
	add	$T1,$D,$D
	$ST	$D,[$ctx+`3*$SZ`]

	ld	[$ctx+`4*$SZ+0`],%l0
	ld	[$ctx+`4*$SZ+4`],%l1
	ld	[$ctx+`5*$SZ+0`],%l2
	ld	[$ctx+`5*$SZ+4`],%l3
	ld	[$ctx+`6*$SZ+0`],%l4
	ld	[$ctx+`6*$SZ+4`],%l5
	ld	[$ctx+`7*$SZ+0`],%l6

	sllx	%l0,32,$tmp0
	ld	[$ctx+`7*$SZ+4`],%l7
	sllx	%l2,32,$tmp1
	or	%l1,$tmp0,$tmp0
	or	%l3,$tmp1,$tmp1
	add	$tmp0,$E,$E
	add	$tmp1,$F,$F
	$ST	$E,[$ctx+`4*$SZ`]
	sllx	%l4,32,$tmp2
	$ST	$F,[$ctx+`5*$SZ`]
	sllx	%l6,32,$T1
	or	%l5,$tmp2,$tmp2
	or	%l7,$T1,$T1
	add	$tmp2,$G,$G
	$ST	$G,[$ctx+`6*$SZ`]
	add	$T1,$H,$H
	$ST	$H,[$ctx+`7*$SZ`]
___
$code.=<<___;
	add	$inp,`16*$SZ`,$inp		! advance inp
	cmp	$inp,$len
	bne	`$bits==64?"%xcc":"%icc"`,.Lloop
	sub	$Ktbl,`($rounds-16)*$SZ`,$Ktbl	! rewind Ktbl

	ret
	restore
.type	sha${label}_block_data_order,#function
.size	sha${label}_block_data_order,(.-sha${label}_block_data_order)
___

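# CRYPTOGAMS convention: backtick-quoted fragments in the templates above
# are Perl expressions (offsets, shift counts, conditional instructions);
# the substitution below evaluates each of them before the finished
# assembly is printed.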
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT;