#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# SHA256/512 block procedure for PA-RISC.

# June 2009.
#
# SHA256 performance is >75% better than gcc 3.2 generated code on
# PA-7100LC. Compared to code generated by vendor compiler this
# implementation is almost 70% faster in 64-bit build, but delivers
# virtually same performance in 32-bit build on PA-8600.
#
# SHA512 performance is >2.9x better than gcc 3.2 generated code on
# PA-7100LC, PA-RISC 1.1 processor. Then implementation detects if the
# code is executed on PA-RISC 2.0 processor and switches to 64-bit
# code path delivering adequate performance even in "blended" 32-bit
# build. Though 64-bit code is not any faster than code generated by
# vendor compiler on PA-8600...
#
# Special thanks to polarhome.com for providing HP-UX account.

# Usage: perl <this script> <flavour> [<output>]
#
#   <flavour>  a flavour matching /64/ selects the 64-bit (2.0W) ABI,
#              anything else the 32-bit one;
#   <output>   file to write the assembly to; a name matching /512/
#              selects SHA-512, anything else SHA-256. When omitted the
#              assembly goes to the inherited STDOUT.

$flavour = shift;
$output = shift;

# Fail loudly instead of silently generating nothing: a failed open on
# STDOUT leaves the handle closed and every later print is lost.
if (defined($output) && length($output)) {
	open STDOUT,'>',$output or die "can't open $output: $!";
}

# ABI dependent parameters: register width, frame layout and the
# mnemonics used to save/restore registers.
if ($flavour =~ /64/) {
	$LEVEL		="2.0W";
	$SIZE_T		=8;
	$FRAME_MARKER	=80;
	$SAVED_RP	=16;
	$PUSH		="std";
	$PUSHMA		="std,ma";
	$POP		="ldd";
	$POPMB		="ldd,mb";
} else {
	$LEVEL		="1.0";
	$SIZE_T		=4;
	$FRAME_MARKER	=48;
	$SAVED_RP	=20;
	$PUSH		="stw";
	$PUSHMA		="stwm";
	$POP		="ldw";
	$POPMB		="ldwm";
}

# Algorithm dependent parameters. $LAST10BITS is the low 10 bits of the
# last constant in the K table; the generated code compares each loaded
# constant against it to detect the end of the table.
if ($output =~ /512/) {
	$func="sha512_block_data_order";
	$SZ=8;
	@Sigma0=(28,34,39);
	@Sigma1=(14,18,41);
	@sigma0=(1,  8, 7);
	@sigma1=(19,61, 6);
	$rounds=80;
	$LAST10BITS=0x017;
	$LD="ldd";
	$LDM="ldd,ma";
	$ST="std";
} else {
	$func="sha256_block_data_order";
	$SZ=4;
	@Sigma0=( 2,13,22);
	@Sigma1=( 6,11,25);
	@sigma0=( 7,18, 3);
	@sigma1=(17,19,10);
	$rounds=64;
	$LAST10BITS=0x0f2;
	$LD="ldw";
	$LDM="ldwm";
	$ST="stw";
}

$FRAME=16*$SIZE_T+$FRAME_MARKER;# 16 saved regs + frame marker
				#                 [+ argument transfer]
$XOFF=16*$SZ+32;		# local variables
$FRAME+=$XOFF;
$XOFF+=$FRAME_MARKER;		# distance between %sp and local variables

$ctx="%r26";	# zapped by $a0
$inp="%r25";	# zapped by $a1
$num="%r24";	# zapped by $t0

$a0 ="%r26";
$a1 ="%r25";
$t0 ="%r24";
$t1 ="%r29";
$Tbl="%r31";

@V=($A,$B,$C,$D,$E,$F,$G,$H)=("%r17","%r18","%r19","%r20","%r21","%r22","%r23","%r28");

@X=("%r1", "%r2", "%r3", "%r4", "%r5", "%r6", "%r7", "%r8",
    "%r9", "%r10","%r11","%r12","%r13","%r14","%r15","%r16",$inp);

# Append one round to $code for the code path that keeps every working
# variable in a single register. $i is the round number, the remaining
# arguments are the registers currently holding a..h. On entry $t1 is
# expected to hold K[i]; for $i<15 the next constant is fetched here.
sub ROUND_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
$code.=<<___;
	_ror	$e,$Sigma1[0],$a0
	and	$f,$e,$t0
	_ror	$e,$Sigma1[1],$a1
	addl	$t1,$h,$h
	andcm	$g,$e,$t1
	xor	$a1,$a0,$a0
	_ror	$a1,`$Sigma1[2]-$Sigma1[1]`,$a1
	or	$t0,$t1,$t1		; Ch(e,f,g)
	addl	@X[$i%16],$h,$h
	xor	$a0,$a1,$a1		; Sigma1(e)
	addl	$t1,$h,$h
	_ror	$a,$Sigma0[0],$a0
	addl	$a1,$h,$h

	_ror	$a,$Sigma0[1],$a1
	and	$a,$b,$t0
	and	$a,$c,$t1
	xor	$a1,$a0,$a0
	_ror	$a1,`$Sigma0[2]-$Sigma0[1]`,$a1
	xor	$t1,$t0,$t0
	and	$b,$c,$t1
	xor	$a0,$a1,$a1		; Sigma0(a)
	addl	$h,$d,$d
	xor	$t1,$t0,$t0		; Maj(a,b,c)
	`"$LDM	$SZ($Tbl),$t1" if ($i<15)`
	addl	$a1,$h,$h
	addl	$t0,$h,$h

___
}

# Append the message schedule update for X[i%16] followed by the round
# itself. On the 16th call of a pass ($i%16==15) the freshly loaded
# constant is checked against $LAST10BITS and, if it is the last one,
# $Tbl is bumped by 1 so that its low bit signals the end of the table.
sub ROUND_16_xx {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
$i-=16;
$code.=<<___;
	_ror	@X[($i+1)%16],$sigma0[0],$a0
	_ror	@X[($i+1)%16],$sigma0[1],$a1
	addl	@X[($i+9)%16],@X[$i],@X[$i]
	_ror	@X[($i+14)%16],$sigma1[0],$t0
	_ror	@X[($i+14)%16],$sigma1[1],$t1
	xor	$a1,$a0,$a0
	_shr	@X[($i+1)%16],$sigma0[2],$a1
	xor	$t1,$t0,$t0
	_shr	@X[($i+14)%16],$sigma1[2],$t1
	xor	$a1,$a0,$a0		; sigma0(X[(i+1)&0x0f])
	xor	$t1,$t0,$t0		; sigma1(X[(i+14)&0x0f])
	$LDM	$SZ($Tbl),$t1
	addl	$a0,@X[$i],@X[$i]
	addl	$t0,@X[$i],@X[$i]
___
$code.=<<___ if ($i==15);
	extru	$t1,31,10,$a1
	comiclr,<> $LAST10BITS,$a1,%r0
	ldo	1($Tbl),$Tbl		; signal end of $Tbl
___
ROUND_00_15($i+16,$a,$b,$c,$d,$e,$f,$g,$h);
}

# Section selection and the table of round constants.
$code=<<___;
	.LEVEL	$LEVEL
#if 0
	.SPACE	\$TEXT\$
	.SUBSPA	\$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
#else
	.text
#endif

	.ALIGN	64
L\$table
___
$code.=<<___ if ($SZ==8);
	.WORD	0x428a2f98,0xd728ae22,0x71374491,0x23ef65cd
	.WORD	0xb5c0fbcf,0xec4d3b2f,0xe9b5dba5,0x8189dbbc
	.WORD	0x3956c25b,0xf348b538,0x59f111f1,0xb605d019
	.WORD	0x923f82a4,0xaf194f9b,0xab1c5ed5,0xda6d8118
	.WORD	0xd807aa98,0xa3030242,0x12835b01,0x45706fbe
	.WORD	0x243185be,0x4ee4b28c,0x550c7dc3,0xd5ffb4e2
	.WORD	0x72be5d74,0xf27b896f,0x80deb1fe,0x3b1696b1
	.WORD	0x9bdc06a7,0x25c71235,0xc19bf174,0xcf692694
	.WORD	0xe49b69c1,0x9ef14ad2,0xefbe4786,0x384f25e3
	.WORD	0x0fc19dc6,0x8b8cd5b5,0x240ca1cc,0x77ac9c65
	.WORD	0x2de92c6f,0x592b0275,0x4a7484aa,0x6ea6e483
	.WORD	0x5cb0a9dc,0xbd41fbd4,0x76f988da,0x831153b5
	.WORD	0x983e5152,0xee66dfab,0xa831c66d,0x2db43210
	.WORD	0xb00327c8,0x98fb213f,0xbf597fc7,0xbeef0ee4
	.WORD	0xc6e00bf3,0x3da88fc2,0xd5a79147,0x930aa725
	.WORD	0x06ca6351,0xe003826f,0x14292967,0x0a0e6e70
	.WORD	0x27b70a85,0x46d22ffc,0x2e1b2138,0x5c26c926
	.WORD	0x4d2c6dfc,0x5ac42aed,0x53380d13,0x9d95b3df
	.WORD	0x650a7354,0x8baf63de,0x766a0abb,0x3c77b2a8
	.WORD	0x81c2c92e,0x47edaee6,0x92722c85,0x1482353b
	.WORD	0xa2bfe8a1,0x4cf10364,0xa81a664b,0xbc423001
	.WORD	0xc24b8b70,0xd0f89791,0xc76c51a3,0x0654be30
	.WORD	0xd192e819,0xd6ef5218,0xd6990624,0x5565a910
	.WORD	0xf40e3585,0x5771202a,0x106aa070,0x32bbd1b8
	.WORD	0x19a4c116,0xb8d2d0c8,0x1e376c08,0x5141ab53
	.WORD	0x2748774c,0xdf8eeb99,0x34b0bcb5,0xe19b48a8
	.WORD	0x391c0cb3,0xc5c95a63,0x4ed8aa4a,0xe3418acb
	.WORD	0x5b9cca4f,0x7763e373,0x682e6ff3,0xd6b2b8a3
	.WORD	0x748f82ee,0x5defb2fc,0x78a5636f,0x43172f60
	.WORD	0x84c87814,0xa1f0ab72,0x8cc70208,0x1a6439ec
	.WORD	0x90befffa,0x23631e28,0xa4506ceb,0xde82bde9
	.WORD	0xbef9a3f7,0xb2c67915,0xc67178f2,0xe372532b
	.WORD	0xca273ece,0xea26619c,0xd186b8c7,0x21c0c207
	.WORD	0xeada7dd6,0xcde0eb1e,0xf57d4f7f,0xee6ed178
	.WORD	0x06f067aa,0x72176fba,0x0a637dc5,0xa2c898a6
	.WORD	0x113f9804,0xbef90dae,0x1b710b35,0x131c471b
	.WORD	0x28db77f5,0x23047d84,0x32caab7b,0x40c72493
	.WORD	0x3c9ebe0a,0x15c9bebc,0x431d67c4,0x9c100d4c
	.WORD	0x4cc5d4be,0xcb3e42b6,0x597f299c,0xfc657e2a
	.WORD	0x5fcb6fab,0x3ad6faec,0x6c44198c,0x4a475817
___
$code.=<<___ if ($SZ==4);
	.WORD	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.WORD	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.WORD	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.WORD	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.WORD	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.WORD	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.WORD	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.WORD	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.WORD	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.WORD	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.WORD	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.WORD	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.WORD	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.WORD	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.WORD	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.WORD	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
___

# Function prologue: save registers, turn $num into the end-of-input
# pointer, stash the arguments and compute the address of the K table.
$code.=<<___;

	.EXPORT	$func,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR
	.ALIGN	64
$func
	.PROC
	.CALLINFO	FRAME=`$FRAME-16*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=18
	.ENTRY
	$PUSH	%r2,-$SAVED_RP(%sp)	; standard prologue
	$PUSHMA	%r3,$FRAME(%sp)
	$PUSH	%r4,`-$FRAME+1*$SIZE_T`(%sp)
	$PUSH	%r5,`-$FRAME+2*$SIZE_T`(%sp)
	$PUSH	%r6,`-$FRAME+3*$SIZE_T`(%sp)
	$PUSH	%r7,`-$FRAME+4*$SIZE_T`(%sp)
	$PUSH	%r8,`-$FRAME+5*$SIZE_T`(%sp)
	$PUSH	%r9,`-$FRAME+6*$SIZE_T`(%sp)
	$PUSH	%r10,`-$FRAME+7*$SIZE_T`(%sp)
	$PUSH	%r11,`-$FRAME+8*$SIZE_T`(%sp)
	$PUSH	%r12,`-$FRAME+9*$SIZE_T`(%sp)
	$PUSH	%r13,`-$FRAME+10*$SIZE_T`(%sp)
	$PUSH	%r14,`-$FRAME+11*$SIZE_T`(%sp)
	$PUSH	%r15,`-$FRAME+12*$SIZE_T`(%sp)
	$PUSH	%r16,`-$FRAME+13*$SIZE_T`(%sp)
	$PUSH	%r17,`-$FRAME+14*$SIZE_T`(%sp)
	$PUSH	%r18,`-$FRAME+15*$SIZE_T`(%sp)

	_shl	$num,`log(16*$SZ)/log(2)`,$num
	addl	$inp,$num,$num		; $num to point at the end of $inp

	$PUSH	$num,`-$FRAME_MARKER-4*$SIZE_T`(%sp)	; save arguments
	$PUSH	$inp,`-$FRAME_MARKER-3*$SIZE_T`(%sp)
	$PUSH	$ctx,`-$FRAME_MARKER-2*$SIZE_T`(%sp)

	blr	%r0,$Tbl
	ldi	3,$t1
L\$pic
	andcm	$Tbl,$t1,$Tbl		; wipe privilege level
	ldo	L\$table-L\$pic($Tbl),$Tbl
___

# SHA-512 in a 32-bit build: unless preprocessed for OpenBSD, emit the
# run-time processor check that branches to the 32-bit-only code at
# L$parisc1. With __OpenBSD__ defined everything from here up to and
# including the L$parisc1 label is preprocessed away, so only the
# PA-RISC 1.x code remains.
$code.=<<___ if ($SZ==8 && $SIZE_T==4);
#ifndef __OpenBSD__
___
$code.=<<___ if ($SZ==8 && $SIZE_T==4);
	ldi	31,$t1
	mtctl	$t1,%cr11
	extrd,u,*= $t1,%sar,1,$t1	; executes on PA-RISC 1.0
	b	L\$parisc1
	nop
___

# Main code path: one register per working variable.
$code.=<<___;
	$LD	`0*$SZ`($ctx),$A	; load context
	$LD	`1*$SZ`($ctx),$B
	$LD	`2*$SZ`($ctx),$C
	$LD	`3*$SZ`($ctx),$D
	$LD	`4*$SZ`($ctx),$E
	$LD	`5*$SZ`($ctx),$F
	$LD	`6*$SZ`($ctx),$G
	$LD	`7*$SZ`($ctx),$H

	extru	$inp,31,`log($SZ)/log(2)`,$t0
	sh3addl	$t0,%r0,$t0
	subi	`8*$SZ`,$t0,$t0
	mtctl	$t0,%cr11		; load %sar with align factor

L\$oop
	ldi	`$SZ-1`,$t0
	$LDM	$SZ($Tbl),$t1
	andcm	$inp,$t0,$t0		; align $inp
___
	for ($i=0;$i<15;$i++) {		# load input block
	$code.="\t$LD	`$SZ*$i`($t0),@X[$i]\n";		}
$code.=<<___;
	cmpb,*=	$inp,$t0,L\$aligned
	$LD	`$SZ*15`($t0),@X[15]
	$LD	`$SZ*16`($t0),@X[16]
___
	for ($i=0;$i<16;$i++) {		# align data
	$code.="\t_align	@X[$i],@X[$i+1],@X[$i]\n";	}
$code.=<<___;
L\$aligned
	nop	; otherwise /usr/ccs/bin/as is confused by below .WORD
___

# @V is rotated by one register after every round so that the same
# round body can be reused with the roles of a..h shifted.
for($i=0;$i<16;$i++)	{ ROUND_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
L\$rounds
	nop	; otherwise /usr/ccs/bin/as is confused by below .WORD
___
for(;$i<32;$i++)	{ ROUND_16_xx($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	bb,>=	$Tbl,31,L\$rounds	; end of $Tbl signalled?
	nop

	$POP	`-$FRAME_MARKER-2*$SIZE_T`(%sp),$ctx	; restore arguments
	$POP	`-$FRAME_MARKER-3*$SIZE_T`(%sp),$inp
	$POP	`-$FRAME_MARKER-4*$SIZE_T`(%sp),$num
	ldo	`-$rounds*$SZ-1`($Tbl),$Tbl		; rewind $Tbl

	$LD	`0*$SZ`($ctx),@X[0]	; load context
	$LD	`1*$SZ`($ctx),@X[1]
	$LD	`2*$SZ`($ctx),@X[2]
	$LD	`3*$SZ`($ctx),@X[3]
	$LD	`4*$SZ`($ctx),@X[4]
	$LD	`5*$SZ`($ctx),@X[5]
	addl	@X[0],$A,$A
	$LD	`6*$SZ`($ctx),@X[6]
	addl	@X[1],$B,$B
	$LD	`7*$SZ`($ctx),@X[7]
	ldo	`16*$SZ`($inp),$inp	; advance $inp

	$ST	$A,`0*$SZ`($ctx)	; save context
	addl	@X[2],$C,$C
	$ST	$B,`1*$SZ`($ctx)
	addl	@X[3],$D,$D
	$ST	$C,`2*$SZ`($ctx)
	addl	@X[4],$E,$E
	$ST	$D,`3*$SZ`($ctx)
	addl	@X[5],$F,$F
	$ST	$E,`4*$SZ`($ctx)
	addl	@X[6],$G,$G
	$ST	$F,`5*$SZ`($ctx)
	addl	@X[7],$H,$H
	$ST	$G,`6*$SZ`($ctx)
	$ST	$H,`7*$SZ`($ctx)

	cmpb,*<>,n $inp,$num,L\$oop
	$PUSH	$inp,`-$FRAME_MARKER-3*$SIZE_T`(%sp)	; save $inp
___
if ($SZ==8 && $SIZE_T==4)	# SHA512 for 32-bit PA-RISC 1.0
{{
$code.=<<___;
	b	L\$done
	nop

	.ALIGN	64
L\$parisc1
___
$code.=<<___ if ($SZ==8 && $SIZE_T==4);
#endif
___

# Register allocation for the 32-bit-only SHA-512 code: every 64-bit
# working variable lives in a hi/lo register pair and the message
# schedule is kept on the stack at -$XOFF(%sp).
@V=(  $Ahi,  $Alo,  $Bhi,  $Blo,  $Chi,  $Clo,  $Dhi,  $Dlo,
      $Ehi,  $Elo,  $Fhi,  $Flo,  $Ghi,  $Glo,  $Hhi,  $Hlo) =
   ( "%r1", "%r2", "%r3", "%r4", "%r5", "%r6", "%r7", "%r8",
     "%r9","%r10","%r11","%r12","%r13","%r14","%r15","%r16");
$a0 ="%r17";
$a1 ="%r18";
$a2 ="%r19";
$a3 ="%r20";
$t0 ="%r21";
$t1 ="%r22";
$t2 ="%r28";
$t3 ="%r29";
$Tbl="%r31";

@X=("%r23","%r24","%r25","%r26");	# zaps $num,$inp,$ctx

# Append one round using hi/lo register pairs. $i is the round number
# within the current pass, followed by the 16 registers holding a..h.
# $flag is set when called from ROUND_16_xx_pa1, which has already
# loaded X[i+1]; in that case the end-of-table test is emitted on the
# last round of the pass. @X is rotated by one pair on return so that
# the "next" pair becomes the current one.
sub ROUND_00_15_pa1 {
my ($i,$ahi,$alo,$bhi,$blo,$chi,$clo,$dhi,$dlo,
       $ehi,$elo,$fhi,$flo,$ghi,$glo,$hhi,$hlo,$flag)=@_;
my ($Xhi,$Xlo,$Xnhi,$Xnlo) = @X;

$code.=<<___ if (!$flag);
	ldw	`-$XOFF+8*(($i+1)%16)`(%sp),$Xnhi
	ldw	`-$XOFF+8*(($i+1)%16)+4`(%sp),$Xnlo	; load X[i+1]
___
$code.=<<___;
	shd	$ehi,$elo,$Sigma1[0],$t0
	add	$Xlo,$hlo,$hlo
	shd	$elo,$ehi,$Sigma1[0],$t1
	addc	$Xhi,$hhi,$hhi		; h += X[i]
	shd	$ehi,$elo,$Sigma1[1],$t2
	ldwm	8($Tbl),$Xhi
	shd	$elo,$ehi,$Sigma1[1],$t3
	ldw	-4($Tbl),$Xlo		; load K[i]
	xor	$t2,$t0,$t0
	xor	$t3,$t1,$t1
	and	$flo,$elo,$a0
	and	$fhi,$ehi,$a1
	shd	$ehi,$elo,$Sigma1[2],$t2
	andcm	$glo,$elo,$a2
	shd	$elo,$ehi,$Sigma1[2],$t3
	andcm	$ghi,$ehi,$a3
	xor	$t2,$t0,$t0
	xor	$t3,$t1,$t1		; Sigma1(e)
	add	$Xlo,$hlo,$hlo
	xor	$a2,$a0,$a0
	addc	$Xhi,$hhi,$hhi		; h += K[i]
	xor	$a3,$a1,$a1		; Ch(e,f,g)

	add	$t0,$hlo,$hlo
	shd	$ahi,$alo,$Sigma0[0],$t0
	addc	$t1,$hhi,$hhi		; h += Sigma1(e)
	shd	$alo,$ahi,$Sigma0[0],$t1
	add	$a0,$hlo,$hlo
	shd	$ahi,$alo,$Sigma0[1],$t2
	addc	$a1,$hhi,$hhi		; h += Ch(e,f,g)
	shd	$alo,$ahi,$Sigma0[1],$t3

	xor	$t2,$t0,$t0
	xor	$t3,$t1,$t1
	shd	$ahi,$alo,$Sigma0[2],$t2
	and	$alo,$blo,$a0
	shd	$alo,$ahi,$Sigma0[2],$t3
	and	$ahi,$bhi,$a1
	xor	$t2,$t0,$t0
	xor	$t3,$t1,$t1		; Sigma0(a)

	and	$alo,$clo,$a2
	and	$ahi,$chi,$a3
	xor	$a2,$a0,$a0
	add	$hlo,$dlo,$dlo
	xor	$a3,$a1,$a1
	addc	$hhi,$dhi,$dhi		; d += h
	and	$blo,$clo,$a2
	add	$t0,$hlo,$hlo
	and	$bhi,$chi,$a3
	addc	$t1,$hhi,$hhi		; h += Sigma0(a)
	xor	$a2,$a0,$a0
	add	$a0,$hlo,$hlo
	xor	$a3,$a1,$a1		; Maj(a,b,c)
	addc	$a1,$hhi,$hhi		; h += Maj(a,b,c)

___
$code.=<<___ if ($i==15 && $flag);
	extru	$Xlo,31,10,$Xlo
	comiclr,= $LAST10BITS,$Xlo,%r0
	b	L\$rounds_pa1
	nop
___
push(@X,shift(@X)); push(@X,shift(@X));
}

# Append the message schedule update for X[i%16] (hi/lo pairs, kept on
# the stack) followed by the round itself. The first argument is the
# round number, the rest are the registers holding a..h.
sub ROUND_16_xx_pa1 {
my ($Xhi,$Xlo,$Xnhi,$Xnlo) = @X;
my ($i)=shift;
$i-=16;
$code.=<<___;
	ldw	`-$XOFF+8*(($i+1)%16)`(%sp),$Xnhi
	ldw	`-$XOFF+8*(($i+1)%16)+4`(%sp),$Xnlo	; load X[i+1]
	ldw	`-$XOFF+8*(($i+9)%16)`(%sp),$a1
	ldw	`-$XOFF+8*(($i+9)%16)+4`(%sp),$a0	; load X[i+9]
	ldw	`-$XOFF+8*(($i+14)%16)`(%sp),$a3
	ldw	`-$XOFF+8*(($i+14)%16)+4`(%sp),$a2	; load X[i+14]
	shd	$Xnhi,$Xnlo,$sigma0[0],$t0
	shd	$Xnlo,$Xnhi,$sigma0[0],$t1
	add	$a0,$Xlo,$Xlo
	shd	$Xnhi,$Xnlo,$sigma0[1],$t2
	addc	$a1,$Xhi,$Xhi
	shd	$Xnlo,$Xnhi,$sigma0[1],$t3
	xor	$t2,$t0,$t0
	shd	$Xnhi,$Xnlo,$sigma0[2],$t2
	xor	$t3,$t1,$t1
	extru	$Xnhi,`31-$sigma0[2]`,`32-$sigma0[2]`,$t3
	xor	$t2,$t0,$t0
	shd	$a3,$a2,$sigma1[0],$a0
	xor	$t3,$t1,$t1		; sigma0(X[(i+1)&0x0f])
	shd	$a2,$a3,$sigma1[0],$a1
	add	$t0,$Xlo,$Xlo
	shd	$a3,$a2,$sigma1[1],$t2
	addc	$t1,$Xhi,$Xhi
	shd	$a2,$a3,$sigma1[1],$t3
	xor	$t2,$a0,$a0
	shd	$a3,$a2,$sigma1[2],$t2
	xor	$t3,$a1,$a1
	extru	$a3,`31-$sigma1[2]`,`32-$sigma1[2]`,$t3
	xor	$t2,$a0,$a0
	xor	$t3,$a1,$a1		; sigma1(X[(i+14)&0x0f])
	add	$a0,$Xlo,$Xlo
	addc	$a1,$Xhi,$Xhi

	stw	$Xhi,`-$XOFF+8*($i%16)`(%sp)
	stw	$Xlo,`-$XOFF+8*($i%16)+4`(%sp)
___
ROUND_00_15_pa1($i,@_,1);
}

# Unaligned input: 33 words are loaded from the aligned-down address
# and each pair of neighbours is merged with vshd before being stored
# to the stack. The prelude below prepares words 0..2 only; every loop
# iteration stores word $i and prepares word $i+1 from the raw values
# in $t[1] and $t[2]. (Pre-shifting $t3 here as well would make the
# first loop iteration shift it a second time and corrupt word 3.)
$code.=<<___;
	ldw	`0*4`($ctx),$Ahi		; load context
	ldw	`1*4`($ctx),$Alo
	ldw	`2*4`($ctx),$Bhi
	ldw	`3*4`($ctx),$Blo
	ldw	`4*4`($ctx),$Chi
	ldw	`5*4`($ctx),$Clo
	ldw	`6*4`($ctx),$Dhi
	ldw	`7*4`($ctx),$Dlo
	ldw	`8*4`($ctx),$Ehi
	ldw	`9*4`($ctx),$Elo
	ldw	`10*4`($ctx),$Fhi
	ldw	`11*4`($ctx),$Flo
	ldw	`12*4`($ctx),$Ghi
	ldw	`13*4`($ctx),$Glo
	ldw	`14*4`($ctx),$Hhi
	ldw	`15*4`($ctx),$Hlo

	extru	$inp,31,2,$t0
	sh3addl	$t0,%r0,$t0
	subi	32,$t0,$t0
	mtctl	$t0,%cr11		; load %sar with align factor

L\$oop_pa1
	extru	$inp,31,2,$a3
	comib,=	0,$a3,L\$aligned_pa1
	sub	$inp,$a3,$inp

	ldw	`0*4`($inp),$X[0]
	ldw	`1*4`($inp),$X[1]
	ldw	`2*4`($inp),$t2
	ldw	`3*4`($inp),$t3
	ldw	`4*4`($inp),$a0
	ldw	`5*4`($inp),$a1
	ldw	`6*4`($inp),$a2
	ldw	`7*4`($inp),$a3
	vshd	$X[0],$X[1],$X[0]
	vshd	$X[1],$t2,$X[1]
	stw	$X[0],`-$XOFF+0*4`(%sp)
	ldw	`8*4`($inp),$t0
	vshd	$t2,$t3,$t2
	stw	$X[1],`-$XOFF+1*4`(%sp)
	ldw	`9*4`($inp),$t1
___
{
my @t=($t2,$t3,$a0,$a1,$a2,$a3,$t0,$t1);
# "<=" on purpose: the last iteration loads word 32, which is needed
# to assemble word 31 of an unaligned block.
for ($i=2;$i<=(128/4-8);$i++) {
$code.=<<___;
	stw	$t[0],`-$XOFF+$i*4`(%sp)
	ldw	`(8+$i)*4`($inp),$t[0]
	vshd	$t[1],$t[2],$t[1]
___
push(@t,shift(@t));
}
for (;$i<(128/4-1);$i++) {
$code.=<<___;
	stw	$t[0],`-$XOFF+$i*4`(%sp)
	vshd	$t[1],$t[2],$t[1]
___
push(@t,shift(@t));
}
$code.=<<___;
	b	L\$collected_pa1
	stw	$t[0],`-$XOFF+$i*4`(%sp)

___
}

# Aligned input: plain copy of the 32 input words to the stack.
$code.=<<___;
L\$aligned_pa1
	ldw	`0*4`($inp),$X[0]
	ldw	`1*4`($inp),$X[1]
	ldw	`2*4`($inp),$t2
	ldw	`3*4`($inp),$t3
	ldw	`4*4`($inp),$a0
	ldw	`5*4`($inp),$a1
	ldw	`6*4`($inp),$a2
	ldw	`7*4`($inp),$a3
	stw	$X[0],`-$XOFF+0*4`(%sp)
	ldw	`8*4`($inp),$t0
	stw	$X[1],`-$XOFF+1*4`(%sp)
	ldw	`9*4`($inp),$t1
___
{
my @t=($t2,$t3,$a0,$a1,$a2,$a3,$t0,$t1);
for ($i=2;$i<(128/4-8);$i++) {
$code.=<<___;
	stw	$t[0],`-$XOFF+$i*4`(%sp)
	ldw	`(8+$i)*4`($inp),$t[0]
___
push(@t,shift(@t));
}
for (;$i<128/4;$i++) {
$code.=<<___;
	stw	$t[0],`-$XOFF+$i*4`(%sp)
___
push(@t,shift(@t));
}
$code.="L\$collected_pa1\n";
}

# @V is rotated by one hi/lo pair after every round.
for($i=0;$i<16;$i++)	{ ROUND_00_15_pa1($i,@V); unshift(@V,pop(@V)); unshift(@V,pop(@V)); }
$code.="L\$rounds_pa1\n";
for(;$i<32;$i++)	{ ROUND_16_xx_pa1($i,@V); unshift(@V,pop(@V)); unshift(@V,pop(@V)); }

$code.=<<___;
	$POP	`-$FRAME_MARKER-2*$SIZE_T`(%sp),$ctx	; restore arguments
	$POP	`-$FRAME_MARKER-3*$SIZE_T`(%sp),$inp
	$POP	`-$FRAME_MARKER-4*$SIZE_T`(%sp),$num
	ldo	`-$rounds*$SZ`($Tbl),$Tbl		; rewind $Tbl

	ldw	`0*4`($ctx),$t1		; update context
	ldw	`1*4`($ctx),$t0
	ldw	`2*4`($ctx),$t3
	ldw	`3*4`($ctx),$t2
	ldw	`4*4`($ctx),$a1
	ldw	`5*4`($ctx),$a0
	ldw	`6*4`($ctx),$a3
	add	$t0,$Alo,$Alo
	ldw	`7*4`($ctx),$a2
	addc	$t1,$Ahi,$Ahi
	ldw	`8*4`($ctx),$t1
	add	$t2,$Blo,$Blo
	ldw	`9*4`($ctx),$t0
	addc	$t3,$Bhi,$Bhi
	ldw	`10*4`($ctx),$t3
	add	$a0,$Clo,$Clo
	ldw	`11*4`($ctx),$t2
	addc	$a1,$Chi,$Chi
	ldw	`12*4`($ctx),$a1
	add	$a2,$Dlo,$Dlo
	ldw	`13*4`($ctx),$a0
	addc	$a3,$Dhi,$Dhi
	ldw	`14*4`($ctx),$a3
	add	$t0,$Elo,$Elo
	ldw	`15*4`($ctx),$a2
	addc	$t1,$Ehi,$Ehi
	stw	$Ahi,`0*4`($ctx)
	add	$t2,$Flo,$Flo
	stw	$Alo,`1*4`($ctx)
	addc	$t3,$Fhi,$Fhi
	stw	$Bhi,`2*4`($ctx)
	add	$a0,$Glo,$Glo
	stw	$Blo,`3*4`($ctx)
	addc	$a1,$Ghi,$Ghi
	stw	$Chi,`4*4`($ctx)
	add	$a2,$Hlo,$Hlo
	stw	$Clo,`5*4`($ctx)
	addc	$a3,$Hhi,$Hhi
	stw	$Dhi,`6*4`($ctx)
	ldo	`16*$SZ`($inp),$inp	; advance $inp
	stw	$Dlo,`7*4`($ctx)
	stw	$Ehi,`8*4`($ctx)
	stw	$Elo,`9*4`($ctx)
	stw	$Fhi,`10*4`($ctx)
	stw	$Flo,`11*4`($ctx)
	stw	$Ghi,`12*4`($ctx)
	stw	$Glo,`13*4`($ctx)
	stw	$Hhi,`14*4`($ctx)
	comb,=	$inp,$num,L\$done
	stw	$Hlo,`15*4`($ctx)
	b	L\$oop_pa1
	$PUSH	$inp,`-$FRAME_MARKER-3*$SIZE_T`(%sp)	; save $inp
L\$done
___
}}

# Function epilogue and identification string.
$code.=<<___;
	$POP	`-$FRAME-$SAVED_RP`(%sp),%r2		; standard epilogue
	$POP	`-$FRAME+1*$SIZE_T`(%sp),%r4
	$POP	`-$FRAME+2*$SIZE_T`(%sp),%r5
	$POP	`-$FRAME+3*$SIZE_T`(%sp),%r6
	$POP	`-$FRAME+4*$SIZE_T`(%sp),%r7
	$POP	`-$FRAME+5*$SIZE_T`(%sp),%r8
	$POP	`-$FRAME+6*$SIZE_T`(%sp),%r9
	$POP	`-$FRAME+7*$SIZE_T`(%sp),%r10
	$POP	`-$FRAME+8*$SIZE_T`(%sp),%r11
	$POP	`-$FRAME+9*$SIZE_T`(%sp),%r12
	$POP	`-$FRAME+10*$SIZE_T`(%sp),%r13
	$POP	`-$FRAME+11*$SIZE_T`(%sp),%r14
	$POP	`-$FRAME+12*$SIZE_T`(%sp),%r15
	$POP	`-$FRAME+13*$SIZE_T`(%sp),%r16
	$POP	`-$FRAME+14*$SIZE_T`(%sp),%r17
	$POP	`-$FRAME+15*$SIZE_T`(%sp),%r18
	bv	(%r2)
	.EXIT
	$POPMB	-$FRAME(%sp),%r3
	.PROCEND

	.data
	.STRINGZ "SHA`64*$SZ` block transform for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
___

# Explicitly encode PA-RISC 2.0 instructions used in this module, so
# that it can be compiled with .LEVEL 1.0. It should be noted that I
# wouldn't have to do this, if GNU assembler understood .ALLOW 2.0
# directive...
#
# Each encoder takes the completer string ($mod, e.g. ",ma") and the
# operand string ($args) and returns either a ".WORD" line carrying the
# encoded instruction (with the original text as a comment) or, when
# the operands are not in a recognised form, the instruction unchanged.

my $ldd = sub {
  my ($mod,$args) = @_;
  my $orig = "ldd$mod\t$args";

    if ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 3 suffices
    {	my $opcode=(0x14<<26)|($2<<21)|($3<<16)|(($1&0x1FF8)<<1)|(($1>>13)&1);
	$opcode|=(1<<3) if ($mod =~ /^,m/);
	$opcode|=(1<<2) if ($mod =~ /^,mb/);
	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
    }
    else { "\t".$orig; }
};

my $std = sub {
  my ($mod,$args) = @_;
  my $orig = "std$mod\t$args";

    if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/) # format 3 suffices
    {	my $opcode=(0x1c<<26)|($3<<21)|($1<<16)|(($2&0x1FF8)<<1)|(($2>>13)&1);
	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
    }
    else { "\t".$orig; }
};

my $extrd = sub {
  my ($mod,$args) = @_;
  my $orig = "extrd$mod\t$args";

    # I only have ",u" completer, it's implicitly encoded...
    if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/)	# format 15
    {	my $opcode=(0x36<<26)|($1<<21)|($4<<16);
	my $len=32-$3;
	$opcode |= (($2&0x20)<<6)|(($2&0x1f)<<5);		# encode pos
	$opcode |= (($len&0x20)<<7)|($len&0x1f);		# encode len
	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
    }
    elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/)	# format 12
    {	my $opcode=(0x34<<26)|($1<<21)|($3<<16)|(2<<11)|(1<<9);
	my $len=32-$2;
	$opcode |= (($len&0x20)<<3)|($len&0x1f);		# encode len
	$opcode |= (1<<13) if ($mod =~ /,\**=/);
	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
    }
    else { "\t".$orig; }
};

my $shrpd = sub {
  my ($mod,$args) = @_;
  my $orig = "shrpd$mod\t$args";

    if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/)	# format 14
    {	my $opcode=(0x34<<26)|($2<<21)|($1<<16)|(1<<10)|$4;
	my $cpos=63-$3;
	$opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5);		# encode sa
	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
    }
    elsif ($args =~ /%r([0-9]+),%r([0-9]+),%sar,%r([0-9]+)/)	# format 11
    {	sprintf "\t.WORD\t0x%08x\t; %s",
		(0x34<<26)|($2<<21)|($1<<16)|(1<<9)|$3,$orig;
    }
    else { "\t".$orig; }
};

# Mnemonics that have to be hand-encoded, mapped to their encoders.
my %encoder_for = (
	ldd	=> $ldd,
	std	=> $std,
	extrd	=> $extrd,
	shrpd	=> $shrpd,
);

# Return the assembly line for one instruction: hand-encoded if the
# mnemonic has an encoder above, otherwise re-assembled verbatim from
# mnemonic, completer and operands.
sub assemble {
  my ($mnemonic,$mod,$args)=@_;
  my $encoder = $encoder_for{$mnemonic};

    return ref($encoder) eq 'CODE' ? $encoder->($mod,$args)
				   : "\t$mnemonic$mod\t$args";
}

# Post-process the generated text line by line: evaluate `...`
# expressions, expand the made up instructions for the selected word
# size, hand-encode 2.0 instructions in 32-bit builds and print.
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/ge;

	s/shd\s+(%r[0-9]+),(%r[0-9]+),([0-9]+)/
		$3>31 ? sprintf("shd\t%$2,%$1,%d",$3-32)	# rotation for >=32
		:       sprintf("shd\t%$1,%$2,%d",$3)/e			or
	# translate made up instructions: _ror, _shr, _align, _shl
	s/_ror(\s+)(%r[0-9]+),/
		($SZ==4 ? "shd" : "shrpd")."$1$2,$2,"/e			or

	s/_shr(\s+%r[0-9]+),([0-9]+),/
		$SZ==4 ? sprintf("extru%s,%d,%d,",$1,31-$2,32-$2)
		:        sprintf("extrd,u%s,%d,%d,",$1,63-$2,64-$2)/e	or

	s/_align(\s+%r[0-9]+,%r[0-9]+),/
		($SZ==4 ? "vshd$1," : "shrpd$1,%sar,")/e		or

	s/_shl(\s+%r[0-9]+),([0-9]+),/
		$SIZE_T==4 ? sprintf("zdep%s,%d,%d,",$1,31-$2,32-$2)
		:            sprintf("depd,z%s,%d,%d,",$1,63-$2,64-$2)/e;

	s/^\s+([a-z]+)([\S]*)\s+([\S]*)/assemble($1,$2,$3)/e if ($SIZE_T==4);

	s/cmpb,\*/comb,/ if ($SIZE_T==4);

	s/\bbv\b/bve/    if ($SIZE_T==8);

	print $_,"\n";
}

# Buffered write errors (e.g. a full disk) only surface at close time.
close STDOUT or die "error closing STDOUT: $!";