#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# SHA256 performance improvement over compiler generated code varies
# from 40% for Sun C [32-bit build] to 70% for gcc [3.3, 64-bit
# build]. Just like in the SHA1 module I aim to ensure scalability on
# UltraSPARC T1 by packing X[16] into 8 64-bit registers.

# SHA512 on pre-T1 UltraSPARC.
#
# Performance is >75% better than 64-bit code generated by Sun C and
# more than 2x better than 32-bit code. X[16] resides on stack, but
# access to it is scheduled for L2 latency and staged through 32
# least significant bits of %l0-%l7. The latter is done to achieve
# 32-/64-bit ABI duality. Nevertheless it's ~40% faster than SHA256,
# which is pretty good [optimal coefficient is 50%].
#
# SHA512 on UltraSPARC T1.
#
# It's not any faster than 64-bit code generated by Sun C 5.8. This is
# because the 64-bit code generator has the advantage of using 64-bit
# loads(*) to access X[16], which I consciously traded for 32-/64-bit
# ABI duality [as per above]. But it surpasses 32-bit Sun C generated
# code by 60%, not to mention that it doesn't suffer from severe decay
# when running four times as many threads as there are physical cores,
# and that it leaves gcc [3.4] behind by a factor of over 4! Compared
# to SHA256, single thread performance is only 10% better, but overall
# throughput at the maximum number of threads for a given CPU exceeds
# that of SHA256 by 30% [again, optimal coefficient is 50%].
#
# (*) Unlike on pre-T1 UltraSPARC, loads on T1 are executed strictly
#     in-order, i.e. a load instruction has to complete before the
#     next instruction in the given thread is executed, even if the
#     latter does not depend on the load result! This means that on T1
#     two 32-bit loads are always slower than one 64-bit load. Once
#     again this is unlike pre-T1 UltraSPARC, where, if scheduled
#     appropriately, 2x32-bit loads can be as fast as 1x64-bit ones.
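
# A recap of how the code below is parameterized, with a hypothetical
# invocation for illustration [the script file name is an assumption]:
# the SHA512 flavour is emitted when the output file name matches
# /512/, SHA256 otherwise, and the 64-bit ABI is assumed whenever the
# remaining arguments contain -m64 or -xarch=v9; the first argument is
# taken as the output file, e.g.
#
#	perl sha512-sparcv9.pl sha512.S -m64	# 64-bit ABI, SHA512 code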

$bits=32;
for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
if ($bits==64) { $bias=2047; $frame=192; }
else { $bias=0; $frame=112; }

$output=shift;
open STDOUT,">$output" or die "can't open $output: $!";

if ($output =~ /512/) {
	$label="512";
	$SZ=8;
	$LD="ldx";	# load from memory
	$ST="stx";	# store to memory
	$SLL="sllx";	# shift left logical
	$SRL="srlx";	# shift right logical
	@Sigma0=(28,34,39);
	@Sigma1=(14,18,41);
	@sigma0=( 7, 1, 8);	# right shift first
	@sigma1=( 6,19,61);	# right shift first
	$lastK=0x817;
	$rounds=80;
	$align=4;

	$locals=16*$SZ;	# X[16]

	$A="%o0";
	$B="%o1";
	$C="%o2";
	$D="%o3";
	$E="%o4";
	$F="%o5";
	$G="%g1";
	$H="%o7";
	@V=($A,$B,$C,$D,$E,$F,$G,$H);
} else {
	$label="256";
	$SZ=4;
	$LD="ld";	# load from memory
	$ST="st";	# store to memory
	$SLL="sll";	# shift left logical
	$SRL="srl";	# shift right logical
	@Sigma0=( 2,13,22);
	@Sigma1=( 6,11,25);
	@sigma0=( 3, 7,18);	# right shift first
	@sigma1=(10,17,19);	# right shift first
	$lastK=0x8f2;
	$rounds=64;
	$align=8;

	$locals=0;	# X[16] is register resident
	@X=("%o0","%o1","%o2","%o3","%o4","%o5","%g1","%o7");

	$A="%l0";
	$B="%l1";
	$C="%l2";
	$D="%l3";
	$E="%l4";
	$F="%l5";
	$G="%l6";
	$H="%l7";
	@V=($A,$B,$C,$D,$E,$F,$G,$H);
}
$T1="%g2";
$tmp0="%g3";
$tmp1="%g4";
$tmp2="%g5";

$ctx="%i0";
$inp="%i1";
$len="%i2";
$Ktbl="%i3";
$tmp31="%i4";
$tmp32="%i5";
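
# In the SHA256 path the whole message schedule is register resident:
# X[16] is packed two 32-bit words per 64-bit register in @X[0..7],
# even-indexed word in the upper half and odd-indexed word in the
# lower half [input is big-endian]. Garbage that 64-bit arithmetic
# leaves in the unused upper halves is harmless, as results are
# truncated back to 32 bits by the srl/st instructions.
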
########### SHA256
$Xload = sub {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;

    if ($i==0) {
$code.=<<___;
	ldx	[$inp+0],@X[0]
	ldx	[$inp+16],@X[2]
	ldx	[$inp+32],@X[4]
	ldx	[$inp+48],@X[6]
	ldx	[$inp+8],@X[1]
	ldx	[$inp+24],@X[3]
	subcc	%g0,$tmp31,$tmp32 ! should be 64-$tmp31, but -$tmp31 works too [shift counts are taken mod 64]
	ldx	[$inp+40],@X[5]
	bz,pt	%icc,.Laligned
	ldx	[$inp+56],@X[7]

	sllx	@X[0],$tmp31,@X[0]
	ldx	[$inp+64],$T1
___
for($j=0;$j<7;$j++)
{   $code.=<<___;
	srlx	@X[$j+1],$tmp32,$tmp1
	sllx	@X[$j+1],$tmp31,@X[$j+1]
	or	$tmp1,@X[$j],@X[$j]
___
}
$code.=<<___;
	srlx	$T1,$tmp32,$T1
	or	$T1,@X[7],@X[7]
.Laligned:
___
    }

    if ($i&1) {
	$code.="\tadd\t@X[$i/2],$h,$T1\n";
    } else {
	$code.="\tsrlx\t@X[$i/2],32,$T1\n\tadd\t$h,$T1,$T1\n";
    }
} if ($SZ==4);

########### SHA512
$Xload = sub {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
my @pair=("%l".eval(($i*2)%8),"%l".eval(($i*2)%8+1),"%l".eval((($i+1)*2)%8));

$code.=<<___ if ($i==0);
	ld	[$inp+0],%l0
	ld	[$inp+4],%l1
	ld	[$inp+8],%l2
	ld	[$inp+12],%l3
	ld	[$inp+16],%l4
	ld	[$inp+20],%l5
	ld	[$inp+24],%l6
	ld	[$inp+28],%l7
___
$code.=<<___ if ($i<15);
	sllx	@pair[1],$tmp31,$tmp2	! Xload($i)
	add	$tmp31,32,$tmp0
	sllx	@pair[0],$tmp0,$tmp1
	`"ld	[$inp+".eval(32+0+$i*8)."],@pair[0]"	if ($i<12)`
	srlx	@pair[2],$tmp32,@pair[1]
	or	$tmp1,$tmp2,$tmp2
	or	@pair[1],$tmp2,$tmp2
	`"ld	[$inp+".eval(32+4+$i*8)."],@pair[1]"	if ($i<12)`
	add	$h,$tmp2,$T1
	$ST	$tmp2,[%sp+`$bias+$frame+$i*$SZ`]
___
$code.=<<___ if ($i==12);
	brnz,a	$tmp31,.+8
	ld	[$inp+128],%l0
___
$code.=<<___ if ($i==15);
	ld	[%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+0`],%l2
	sllx	@pair[1],$tmp31,$tmp2	! Xload($i)
	add	$tmp31,32,$tmp0
	ld	[%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+4`],%l3
	sllx	@pair[0],$tmp0,$tmp1
	ld	[%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+0`],%l4
	srlx	@pair[2],$tmp32,@pair[1]
	or	$tmp1,$tmp2,$tmp2
	ld	[%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+4`],%l5
	or	@pair[1],$tmp2,$tmp2
	ld	[%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+0`],%l6
	add	$h,$tmp2,$T1
	$ST	$tmp2,[%sp+`$bias+$frame+$i*$SZ`]
	ld	[%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+4`],%l7
	ld	[%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+0`],%l0
	ld	[%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+4`],%l1
___
} if ($SZ==8);
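
# A note on rotations in BODY_00_15 below: SPARC has no rotate
# instruction, so each ROTR(x,n) required by the Sigma/sigma functions
# is synthesized from a pair of shifts on the W-bit word, W=$SZ*8,
# schematically [t0, t1 denoting scratch registers]:
#
#	$SRL	x,n,t0		! x>>n
#	$SLL	x,W-n,t1	! x<<(W-n)
#	xor	t1,t0,t0	! ROTR(x,n)
#
# Accordingly @Sigma0/@Sigma1 hold rotate amounts, while the first
# element of @sigma0/@sigma1 is the plain right-shift amount of the
# corresponding sigma function ["right shift first" above].
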
########### common
sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;

    if ($i<16) {
	&$Xload(@_);
    } else {
	$code.="\tadd\t$h,$T1,$T1\n";
    }

$code.=<<___;
	$SRL	$e,@Sigma1[0],$h	!! $i
	xor	$f,$g,$tmp2
	$SLL	$e,`$SZ*8-@Sigma1[2]`,$tmp1
	and	$e,$tmp2,$tmp2
	$SRL	$e,@Sigma1[1],$tmp0
	xor	$tmp1,$h,$h
	$SLL	$e,`$SZ*8-@Sigma1[1]`,$tmp1
	xor	$tmp0,$h,$h
	$SRL	$e,@Sigma1[2],$tmp0
	xor	$tmp1,$h,$h
	$SLL	$e,`$SZ*8-@Sigma1[0]`,$tmp1
	xor	$tmp0,$h,$h
	xor	$g,$tmp2,$tmp2		! Ch(e,f,g)
	xor	$tmp1,$h,$tmp0		! Sigma1(e)

	$SRL	$a,@Sigma0[0],$h
	add	$tmp2,$T1,$T1
	$LD	[$Ktbl+`$i*$SZ`],$tmp2	! K[$i]
	$SLL	$a,`$SZ*8-@Sigma0[2]`,$tmp1
	add	$tmp0,$T1,$T1
	$SRL	$a,@Sigma0[1],$tmp0
	xor	$tmp1,$h,$h
	$SLL	$a,`$SZ*8-@Sigma0[1]`,$tmp1
	xor	$tmp0,$h,$h
	$SRL	$a,@Sigma0[2],$tmp0
	xor	$tmp1,$h,$h
	$SLL	$a,`$SZ*8-@Sigma0[0]`,$tmp1
	xor	$tmp0,$h,$h
	xor	$tmp1,$h,$h		! Sigma0(a)

	or	$a,$b,$tmp0
	and	$a,$b,$tmp1
	and	$c,$tmp0,$tmp0
	or	$tmp0,$tmp1,$tmp1	! Maj(a,b,c)
	add	$tmp2,$T1,$T1		! +=K[$i]
	add	$tmp1,$h,$h

	add	$T1,$d,$d
	add	$T1,$h,$h
___
}

########### SHA256
$BODY_16_XX = sub {
my $i=@_[0];
my $xi;

    if ($i&1) {
	$xi=$tmp32;
	$code.="\tsrlx\t@X[(($i+1)/2)%8],32,$xi\n";
    } else {
	$xi=@X[(($i+1)/2)%8];
    }
$code.=<<___;
	srl	$xi,@sigma0[0],$T1		!! Xupdate($i)
	sll	$xi,`32-@sigma0[2]`,$tmp1
	srl	$xi,@sigma0[1],$tmp0
	xor	$tmp1,$T1,$T1
	sll	$tmp1,`@sigma0[2]-@sigma0[1]`,$tmp1
	xor	$tmp0,$T1,$T1
	srl	$xi,@sigma0[2],$tmp0
	xor	$tmp1,$T1,$T1
___
    if ($i&1) {
	$xi=@X[(($i+14)/2)%8];
    } else {
	$xi=$tmp32;
	$code.="\tsrlx\t@X[(($i+14)/2)%8],32,$xi\n";
    }
$code.=<<___;
	srl	$xi,@sigma1[0],$tmp2
	xor	$tmp0,$T1,$T1			! T1=sigma0(X[i+1])
	sll	$xi,`32-@sigma1[2]`,$tmp1
	srl	$xi,@sigma1[1],$tmp0
	xor	$tmp1,$tmp2,$tmp2
	sll	$tmp1,`@sigma1[2]-@sigma1[1]`,$tmp1
	xor	$tmp0,$tmp2,$tmp2
	srl	$xi,@sigma1[2],$tmp0
	xor	$tmp1,$tmp2,$tmp2
___
    if ($i&1) {
	$xi=@X[($i/2)%8];
$code.=<<___;
	srlx	@X[(($i+9)/2)%8],32,$tmp1	! X[i+9]
	xor	$tmp0,$tmp2,$tmp2		! sigma1(X[i+14])
	srl	@X[($i/2)%8],0,$tmp0
	add	$tmp2,$tmp1,$tmp1
	add	$xi,$T1,$T1			! +=X[i]
	xor	$tmp0,@X[($i/2)%8],@X[($i/2)%8]
	add	$tmp1,$T1,$T1

	srl	$T1,0,$T1
	or	$T1,@X[($i/2)%8],@X[($i/2)%8]
___
    } else {
	$xi=@X[(($i+9)/2)%8];
$code.=<<___;
	srlx	@X[($i/2)%8],32,$tmp1		! X[i]
	xor	$tmp0,$tmp2,$tmp2		! sigma1(X[i+14])
	add	$xi,$T1,$T1			! +=X[i+9]
	add	$tmp2,$tmp1,$tmp1
	srl	@X[($i/2)%8],0,@X[($i/2)%8]
	add	$tmp1,$T1,$T1

	sllx	$T1,32,$tmp0
	or	$tmp0,@X[($i/2)%8],@X[($i/2)%8]
___
    }
    &BODY_00_15(@_);
} if ($SZ==4);
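
# In the SHA512 flavour below, X[16] lives on stack and is accessed
# exclusively with 32-bit loads and stores for the sake of 32-/64-bit
# ABI duality. %l0-%l7 stage, as hi/lo 32-bit halves, the four
# schedule words the current round needs [X[i], X[i+1], X[i+9] and
# X[i+14]], and as each pair is consumed it is reloaded with the
# corresponding word for the next round, early enough to hide load
# latency.
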
########### SHA512
$BODY_16_XX = sub {
my $i=@_[0];
my @pair=("%l".eval(($i*2)%8),"%l".eval(($i*2)%8+1));

$code.=<<___;
	sllx	%l2,32,$tmp0		!! Xupdate($i)
	or	%l3,$tmp0,$tmp0

	srlx	$tmp0,@sigma0[0],$T1
	ld	[%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+0`],%l2
	sllx	$tmp0,`64-@sigma0[2]`,$tmp1
	ld	[%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+4`],%l3
	srlx	$tmp0,@sigma0[1],$tmp0
	xor	$tmp1,$T1,$T1
	sllx	$tmp1,`@sigma0[2]-@sigma0[1]`,$tmp1
	xor	$tmp0,$T1,$T1
	srlx	$tmp0,`@sigma0[2]-@sigma0[1]`,$tmp0
	xor	$tmp1,$T1,$T1
	sllx	%l6,32,$tmp2
	xor	$tmp0,$T1,$T1		! sigma0(X[$i+1])
	or	%l7,$tmp2,$tmp2

	srlx	$tmp2,@sigma1[0],$tmp1
	ld	[%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+0`],%l6
	sllx	$tmp2,`64-@sigma1[2]`,$tmp0
	ld	[%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+4`],%l7
	srlx	$tmp2,@sigma1[1],$tmp2
	xor	$tmp0,$tmp1,$tmp1
	sllx	$tmp0,`@sigma1[2]-@sigma1[1]`,$tmp0
	xor	$tmp2,$tmp1,$tmp1
	srlx	$tmp2,`@sigma1[2]-@sigma1[1]`,$tmp2
	xor	$tmp0,$tmp1,$tmp1
	sllx	%l4,32,$tmp0
	xor	$tmp2,$tmp1,$tmp1	! sigma1(X[$i+14])
	ld	[%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+0`],%l4
	or	%l5,$tmp0,$tmp0
	ld	[%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+4`],%l5

	sllx	%l0,32,$tmp2
	add	$tmp1,$T1,$T1
	ld	[%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+0`],%l0
	or	%l1,$tmp2,$tmp2
	add	$tmp0,$T1,$T1		! +=X[$i+9]
	ld	[%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+4`],%l1
	add	$tmp2,$T1,$T1		! +=X[$i]
	$ST	$T1,[%sp+`$bias+$frame+($i%16)*$SZ`]
___
	&BODY_00_15(@_);
} if ($SZ==8);

$code.=<<___ if ($bits==64);
.register	%g2,#scratch
.register	%g3,#scratch
___
$code.=<<___;
.section	".rodata",#alloc

.align	64
K${label}:
.type	K${label},#object
___
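
# K[0..$rounds-1]: first 32 [SHA256] or 64 [SHA512] bits of the
# fractional parts of the cube roots of the first 64 [80] primes.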
if ($SZ==4) {
$code.=<<___;
	.long	0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
	.long	0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
	.long	0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
	.long	0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
	.long	0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
	.long	0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
	.long	0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
	.long	0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
	.long	0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
	.long	0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
	.long	0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
	.long	0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
	.long	0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
	.long	0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
	.long	0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
	.long	0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
___
} else {
$code.=<<___;
	.long	0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd
	.long	0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc
	.long	0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019
	.long	0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118
	.long	0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe
	.long	0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2
	.long	0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1
	.long	0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694
	.long	0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3
	.long	0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65
	.long	0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483
	.long	0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5
	.long	0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210
	.long	0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4
	.long	0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725
	.long	0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70
	.long	0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926
	.long	0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df
	.long	0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8
	.long	0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b
	.long	0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001
	.long	0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30
	.long	0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910
	.long	0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8
	.long	0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53
	.long	0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8
	.long	0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb
	.long	0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3
	.long	0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60
	.long	0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec
	.long	0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9
	.long	0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b
	.long	0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207
	.long	0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178
	.long	0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6
	.long	0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b
	.long	0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493
	.long	0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c
	.long	0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a
	.long	0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817
___
}
$code.=<<___;
.size	K${label},.-K${label}

.section	".text",#alloc,#execinstr
.globl	sha${label}_block_data_order
sha${label}_block_data_order:
	save	%sp,`-$frame-$locals`,%sp
#ifdef __PIC__
	sethi	%hi(_GLOBAL_OFFSET_TABLE_-4), %o5
	rd	%pc, %o4
	or	%o5, %lo(_GLOBAL_OFFSET_TABLE_+4), %o5
	add	%o5, %o4, %o5
#endif
	and	$inp,`$align-1`,$tmp31
	sllx	$len,`log(16*$SZ)/log(2)`,$len
	andn	$inp,`$align-1`,$inp
	sll	$tmp31,3,$tmp31
	add	$inp,$len,$len
___
$code.=<<___ if ($SZ==8); # SHA512
	mov	32,$tmp32
	sub	$tmp32,$tmp31,$tmp32
___
$code.=<<___;
#ifdef __PIC__
	set	K${label}, $Ktbl
	ldx	[$Ktbl+%o5], $Ktbl
#else
	set	K${label}, $Ktbl
#endif

	$LD	[$ctx+`0*$SZ`],$A
	$LD	[$ctx+`1*$SZ`],$B
	$LD	[$ctx+`2*$SZ`],$C
	$LD	[$ctx+`3*$SZ`],$D
	$LD	[$ctx+`4*$SZ`],$E
	$LD	[$ctx+`5*$SZ`],$F
	$LD	[$ctx+`6*$SZ`],$G
	$LD	[$ctx+`7*$SZ`],$H

.Lloop:
___
# Rounds 0-15 and 16-31 are unrolled; at run time the .L16_xx segment
# is re-executed until the K table is exhausted, which the code below
# detects by comparing the low 12 bits of the last K[i] fetched in
# BODY_00_15 [still in $tmp2] against $lastK. The add in the branch
# delay slot advances $Ktbl to the next 16 constants.
for ($i=0;$i<16;$i++)	{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=".L16_xx:\n";
for (;$i<32;$i++)	{ &$BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	and	$tmp2,0xfff,$tmp2
	cmp	$tmp2,$lastK
	bne	.L16_xx
	add	$Ktbl,`16*$SZ`,$Ktbl	! Ktbl+=16

___
$code.=<<___ if ($SZ==4); # SHA256
	$LD	[$ctx+`0*$SZ`],@X[0]
	$LD	[$ctx+`1*$SZ`],@X[1]
	$LD	[$ctx+`2*$SZ`],@X[2]
	$LD	[$ctx+`3*$SZ`],@X[3]
	$LD	[$ctx+`4*$SZ`],@X[4]
	$LD	[$ctx+`5*$SZ`],@X[5]
	$LD	[$ctx+`6*$SZ`],@X[6]
	$LD	[$ctx+`7*$SZ`],@X[7]

	add	$A,@X[0],$A
	$ST	$A,[$ctx+`0*$SZ`]
	add	$B,@X[1],$B
	$ST	$B,[$ctx+`1*$SZ`]
	add	$C,@X[2],$C
	$ST	$C,[$ctx+`2*$SZ`]
	add	$D,@X[3],$D
	$ST	$D,[$ctx+`3*$SZ`]
	add	$E,@X[4],$E
	$ST	$E,[$ctx+`4*$SZ`]
	add	$F,@X[5],$F
	$ST	$F,[$ctx+`5*$SZ`]
	add	$G,@X[6],$G
	$ST	$G,[$ctx+`6*$SZ`]
	add	$H,@X[7],$H
	$ST	$H,[$ctx+`7*$SZ`]
___
$code.=<<___ if ($SZ==8); # SHA512
	ld	[$ctx+`0*$SZ+0`],%l0
	ld	[$ctx+`0*$SZ+4`],%l1
	ld	[$ctx+`1*$SZ+0`],%l2
	ld	[$ctx+`1*$SZ+4`],%l3
	ld	[$ctx+`2*$SZ+0`],%l4
	ld	[$ctx+`2*$SZ+4`],%l5
	ld	[$ctx+`3*$SZ+0`],%l6

	sllx	%l0,32,$tmp0
	ld	[$ctx+`3*$SZ+4`],%l7
	sllx	%l2,32,$tmp1
	or	%l1,$tmp0,$tmp0
	or	%l3,$tmp1,$tmp1
	add	$tmp0,$A,$A
	add	$tmp1,$B,$B
	$ST	$A,[$ctx+`0*$SZ`]
	sllx	%l4,32,$tmp2
	$ST	$B,[$ctx+`1*$SZ`]
	sllx	%l6,32,$T1
	or	%l5,$tmp2,$tmp2
	or	%l7,$T1,$T1
	add	$tmp2,$C,$C
	$ST	$C,[$ctx+`2*$SZ`]
	add	$T1,$D,$D
	$ST	$D,[$ctx+`3*$SZ`]

	ld	[$ctx+`4*$SZ+0`],%l0
	ld	[$ctx+`4*$SZ+4`],%l1
	ld	[$ctx+`5*$SZ+0`],%l2
	ld	[$ctx+`5*$SZ+4`],%l3
	ld	[$ctx+`6*$SZ+0`],%l4
	ld	[$ctx+`6*$SZ+4`],%l5
	ld	[$ctx+`7*$SZ+0`],%l6

	sllx	%l0,32,$tmp0
	ld	[$ctx+`7*$SZ+4`],%l7
	sllx	%l2,32,$tmp1
	or	%l1,$tmp0,$tmp0
	or	%l3,$tmp1,$tmp1
	add	$tmp0,$E,$E
	add	$tmp1,$F,$F
	$ST	$E,[$ctx+`4*$SZ`]
	sllx	%l4,32,$tmp2
	$ST	$F,[$ctx+`5*$SZ`]
	sllx	%l6,32,$T1
	or	%l5,$tmp2,$tmp2
	or	%l7,$T1,$T1
	add	$tmp2,$G,$G
	$ST	$G,[$ctx+`6*$SZ`]
	add	$T1,$H,$H
	$ST	$H,[$ctx+`7*$SZ`]
___
$code.=<<___;
	add	$inp,`16*$SZ`,$inp		! advance inp
	cmp	$inp,$len
	bne	`$bits==64?"%xcc":"%icc"`,.Lloop
	sub	$Ktbl,`($rounds-16)*$SZ`,$Ktbl	! rewind Ktbl

	ret
	restore
.type	sha${label}_block_data_order,#function
.size	sha${label}_block_data_order,(.-sha${label}_block_data_order)
___

# Evaluate the remaining `...` expressions embedded in the assembly
# text [computed offsets and shift amounts] and emit the result.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT;