#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# I let hardware handle unaligned input(*), except on page boundaries
# (see below for details). Otherwise straightforward implementation
# with X vector in register bank. The module is big-endian [which is
# not big deal as there're no little-endian targets left around].
#
# (*) this means that this module is inappropriate for PPC403? Does
#     anybody know if pre-POWER3 can sustain unaligned load?

# 			-m64	-m32
# ----------------------------------
# PPC970,gcc-4.0.0	+76%	+59%
# Power6,xlc-7		+68%	+33%

# Usage: perl <this script> <flavour> [<output>]
#
# <flavour> must contain "64" or "32"; it selects the pointer size and
# the matching load/store/compare mnemonics. Both arguments are handed
# on to ppc-xlate.pl, through which the generated code is piped.

$flavour = shift;

if ($flavour =~ /64/) {
	$SIZE_T	=8;
	$LRSAVE	=2*$SIZE_T;
	$UCMP	="cmpld";
	$STU	="stdu";
	$POP	="ld";
	$PUSH	="std";
} elsif ($flavour =~ /32/) {
	$SIZE_T	=4;
	$LRSAVE	=$SIZE_T;
	$UCMP	="cmplw";
	$STU	="stwu";
	$POP	="lwz";
	$PUSH	="stw";
} else { die "nonsense $flavour"; }

# Look for ppc-xlate.pl next to this script, then in ../../perlasm/.
# When $0 carries no directory component the prefix is empty, i.e. the
# lookup is relative to the current directory.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

# Low-precedence "or" is required here: with "||" the die would attach
# to the (always true) command string and a failed open would go
# unnoticed.
$output = shift;
open STDOUT,"| $^X $xlate $flavour $output" or die "can't call $xlate: $!";

# Stack frame layout used by the generated code:
#   $LOCALS .. $LOCALS+63	64-byte bounce buffer for a block that
#				straddles a page boundary
#   $FRAME-$SIZE_T*18		saved input pointer
#   $FRAME-$SIZE_T*17 .. -1	saved r15-r31
#   $FRAME+$LRSAVE		saved link register
$FRAME=24*$SIZE_T+64;
$LOCALS=6*$SIZE_T;

$K  ="r0";
$sp ="r1";
$toc="r2";
$ctx="r3";
$inp="r4";
$num="r5";
$t0 ="r15";
$t1 ="r6";

$A  ="r7";
$B  ="r8";
$C  ="r9";
$D  ="r10";
$E  ="r11";
$T  ="r12";

# @V holds the working variables; it is rotated after every round so
# that register roles move instead of register contents. @X is the
# 16-word message schedule kept in registers.
@V=($A,$B,$C,$D,$E,$T);
@X=("r16","r17","r18","r19","r20","r21","r22","r23",
    "r24","r25","r26","r27","r28","r29","r30","r31");

# Emit round $i of rounds 0..19, using (b & c) | (~b & d) as the round
# function. Arguments are the round number followed by the registers
# currently playing the roles a, b, c, d, e and the scratch/result f.
# For $i<15 the next input word is loaded alongside the round; from
# $i==15 on, the next schedule word X[j%16] is computed in place as
# rotl(X[j] ^ X[j+2] ^ X[j+8] ^ X[j+13], 1).
sub BODY_00_19 {
my ($i,$a,$b,$c,$d,$e,$f)=@_;
my $j=$i+1;
$code.=<<___ if ($i==0);
	lwz	@X[$i],`$i*4`($inp)
___
$code.=<<___ if ($i<15);
	lwz	@X[$j],`$j*4`($inp)
	add	$f,$K,$e
	rotlwi	$e,$a,5
	add	$f,$f,@X[$i]
	and	$t0,$c,$b
	add	$f,$f,$e
	andc	$t1,$d,$b
	rotlwi	$b,$b,30
	or	$t0,$t0,$t1
	add	$f,$f,$t0
___
$code.=<<___ if ($i>=15);
	add	$f,$K,$e
	rotlwi	$e,$a,5
	xor	@X[$j%16],@X[$j%16],@X[($j+2)%16]
	add	$f,$f,@X[$i%16]
	and	$t0,$c,$b
	xor	@X[$j%16],@X[$j%16],@X[($j+8)%16]
	add	$f,$f,$e
	andc	$t1,$d,$b
	rotlwi	$b,$b,30
	or	$t0,$t0,$t1
	xor	@X[$j%16],@X[$j%16],@X[($j+13)%16]
	add	$f,$f,$t0
	rotlwi	@X[$j%16],@X[$j%16],1
___
}

# Emit round $i using b ^ c ^ d as the round function; called for
# rounds 20..39 and again for rounds 60..79. Same argument convention
# as BODY_00_19. The very last round ($i==79) has no further schedule
# word to compute and instead loads the five state words from the
# context into r16-r20, ready to be added to the working variables.
sub BODY_20_39 {
my ($i,$a,$b,$c,$d,$e,$f)=@_;
my $j=$i+1;
$code.=<<___ if ($i<79);
	add	$f,$K,$e
	rotlwi	$e,$a,5
	xor	@X[$j%16],@X[$j%16],@X[($j+2)%16]
	add	$f,$f,@X[$i%16]
	xor	$t0,$b,$c
	xor	@X[$j%16],@X[$j%16],@X[($j+8)%16]
	add	$f,$f,$e
	rotlwi	$b,$b,30
	xor	$t0,$t0,$d
	xor	@X[$j%16],@X[$j%16],@X[($j+13)%16]
	add	$f,$f,$t0
	rotlwi	@X[$j%16],@X[$j%16],1
___
$code.=<<___ if ($i==79);
	add	$f,$K,$e
	rotlwi	$e,$a,5
	lwz	r16,0($ctx)
	add	$f,$f,@X[$i%16]
	xor	$t0,$b,$c
	lwz	r17,4($ctx)
	add	$f,$f,$e
	rotlwi	$b,$b,30
	lwz	r18,8($ctx)
	xor	$t0,$t0,$d
	lwz	r19,12($ctx)
	add	$f,$f,$t0
	lwz	r20,16($ctx)
___
}

# Emit round $i of rounds 40..59, using (b & c) | ((b | c) & d) as the
# round function. Same argument convention as BODY_00_19.
sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e,$f)=@_;
my $j=$i+1;
$code.=<<___;
	add	$f,$K,$e
	rotlwi	$e,$a,5
	xor	@X[$j%16],@X[$j%16],@X[($j+2)%16]
	add	$f,$f,@X[$i%16]
	and	$t0,$b,$c
	xor	@X[$j%16],@X[$j%16],@X[($j+8)%16]
	add	$f,$f,$e
	or	$t1,$b,$c
	rotlwi	$b,$b,30
	xor	@X[$j%16],@X[$j%16],@X[($j+13)%16]
	and	$t1,$t1,$d
	or	$t0,$t0,$t1
	rotlwi	@X[$j%16],@X[$j%16],1
	add	$f,$f,$t0
___
}

# Public entry point. It sets up the frame, saves r15-r31 and the link
# register, preloads the five state words from the context and then
# drives Lsha1_block_private, either directly (aligned input, or
# unaligned input that stays within a page) or one bounce-buffered
# block at a time when a 64-byte block crosses a 4096-byte boundary.
$code=<<___;
.machine	"any"
.text

.globl	.sha1_block_data_order
.align	4
.sha1_block_data_order:
	$STU	$sp,-$FRAME($sp)
	mflr	r0
	$PUSH	r15,`$FRAME-$SIZE_T*17`($sp)
	$PUSH	r16,`$FRAME-$SIZE_T*16`($sp)
	$PUSH	r17,`$FRAME-$SIZE_T*15`($sp)
	$PUSH	r18,`$FRAME-$SIZE_T*14`($sp)
	$PUSH	r19,`$FRAME-$SIZE_T*13`($sp)
	$PUSH	r20,`$FRAME-$SIZE_T*12`($sp)
	$PUSH	r21,`$FRAME-$SIZE_T*11`($sp)
	$PUSH	r22,`$FRAME-$SIZE_T*10`($sp)
	$PUSH	r23,`$FRAME-$SIZE_T*9`($sp)
	$PUSH	r24,`$FRAME-$SIZE_T*8`($sp)
	$PUSH	r25,`$FRAME-$SIZE_T*7`($sp)
	$PUSH	r26,`$FRAME-$SIZE_T*6`($sp)
	$PUSH	r27,`$FRAME-$SIZE_T*5`($sp)
	$PUSH	r28,`$FRAME-$SIZE_T*4`($sp)
	$PUSH	r29,`$FRAME-$SIZE_T*3`($sp)
	$PUSH	r30,`$FRAME-$SIZE_T*2`($sp)
	$PUSH	r31,`$FRAME-$SIZE_T*1`($sp)
	$PUSH	r0,`$FRAME+$LRSAVE`($sp)
	lwz	$A,0($ctx)
	lwz	$B,4($ctx)
	lwz	$C,8($ctx)
	lwz	$D,12($ctx)
	lwz	$E,16($ctx)
	andi.	r0,$inp,3
	bne	Lunaligned
Laligned:
	mtctr	$num
	bl	Lsha1_block_private
	b	Ldone

; PowerPC specification allows an implementation to be ill-behaved
; upon unaligned access which crosses page boundary. "Better safe
; than sorry" principle makes me treat it specially. But I don't
; look for particular offending word, but rather for 64-byte input
; block which crosses the boundary. Once found that block is aligned
; and hashed separately...
.align	4
Lunaligned:
	subfic	$t1,$inp,4096
	andi.	$t1,$t1,4095	; distance to closest page boundary
	srwi.	$t1,$t1,6	; t1/=64
	beq	Lcross_page
	$UCMP	$num,$t1
	ble-	Laligned	; didn't cross the page boundary
	mtctr	$t1
	subfc	$num,$t1,$num
	bl	Lsha1_block_private
Lcross_page:
	li	$t1,16
	mtctr	$t1
	addi	r20,$sp,$LOCALS	; spot within the frame
Lmemcpy:
	lbz	r16,0($inp)
	lbz	r17,1($inp)
	lbz	r18,2($inp)
	lbz	r19,3($inp)
	addi	$inp,$inp,4
	stb	r16,0(r20)
	stb	r17,1(r20)
	stb	r18,2(r20)
	stb	r19,3(r20)
	addi	r20,r20,4
	bdnz	Lmemcpy

	$PUSH	$inp,`$FRAME-$SIZE_T*18`($sp)
	li	$t1,1
	addi	$inp,$sp,$LOCALS
	mtctr	$t1
	bl	Lsha1_block_private
	$POP	$inp,`$FRAME-$SIZE_T*18`($sp)
	addic.	$num,$num,-1
	bne-	Lunaligned

Ldone:
	$POP	r0,`$FRAME+$LRSAVE`($sp)
	$POP	r15,`$FRAME-$SIZE_T*17`($sp)
	$POP	r16,`$FRAME-$SIZE_T*16`($sp)
	$POP	r17,`$FRAME-$SIZE_T*15`($sp)
	$POP	r18,`$FRAME-$SIZE_T*14`($sp)
	$POP	r19,`$FRAME-$SIZE_T*13`($sp)
	$POP	r20,`$FRAME-$SIZE_T*12`($sp)
	$POP	r21,`$FRAME-$SIZE_T*11`($sp)
	$POP	r22,`$FRAME-$SIZE_T*10`($sp)
	$POP	r23,`$FRAME-$SIZE_T*9`($sp)
	$POP	r24,`$FRAME-$SIZE_T*8`($sp)
	$POP	r25,`$FRAME-$SIZE_T*7`($sp)
	$POP	r26,`$FRAME-$SIZE_T*6`($sp)
	$POP	r27,`$FRAME-$SIZE_T*5`($sp)
	$POP	r28,`$FRAME-$SIZE_T*4`($sp)
	$POP	r29,`$FRAME-$SIZE_T*3`($sp)
	$POP	r30,`$FRAME-$SIZE_T*2`($sp)
	$POP	r31,`$FRAME-$SIZE_T*1`($sp)
	mtlr	r0
	addi	$sp,$sp,$FRAME
	blr
___

# This is private block function, which uses tailored calling
# interface, namely upon entry SHA_CTX is pre-loaded to given
# registers and counter register contains amount of chunks to
# digest...
$code.=<<___;
.align	4
Lsha1_block_private:
___
$code.=<<___;		# load K_00_19
	lis	$K,0x5a82
	ori	$K,$K,0x7999
___
# $i deliberately carries over from one loop to the next so the four
# loops together cover rounds 0..79. Rotating @V after each round moves
# the register roles rather than the register contents.
for($i=0;$i<20;$i++)	{ BODY_00_19($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;		# load K_20_39
	lis	$K,0x6ed9
	ori	$K,$K,0xeba1
___
for(;$i<40;$i++)	{ BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;		# load K_40_59
	lis	$K,0x8f1b
	ori	$K,$K,0xbcdc
___
for(;$i<60;$i++)	{ BODY_40_59($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;		# load K_60_79
	lis	$K,0xca62
	ori	$K,$K,0xc1d6
___
for(;$i<80;$i++)	{ BODY_20_39($i,@V); unshift(@V,pop(@V)); }
# After 80 rotations of the six-entry @V the results sit in
# E, T, A, B, C (in that order); add them to the state words that
# round 79 loaded into r16-r20, store the new state and move it back
# into A..E for the next block.
$code.=<<___;
	add	r16,r16,$E
	add	r17,r17,$T
	add	r18,r18,$A
	add	r19,r19,$B
	add	r20,r20,$C
	stw	r16,0($ctx)
	mr	$A,r16
	stw	r17,4($ctx)
	mr	$B,r17
	stw	r18,8($ctx)
	mr	$C,r18
	stw	r19,12($ctx)
	mr	$D,r19
	stw	r20,16($ctx)
	mr	$E,r20
	addi	$inp,$inp,`16*4`
	bdnz-	Lsha1_block_private
	blr
___

# Evaluate the `...` arithmetic placeholders left in the heredocs, then
# hand the result to the translator.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
# Closing a pipe waits for the child, so this is where a failure of
# the translator becomes visible; don't let it pass silently.
close STDOUT or die "error closing STDOUT: $!";