#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# March 2010
#
# The module implements the "4-bit" GCM GHASH function and the
# underlying single multiplication operation in GF(2^128). "4-bit"
# means that it uses a 256-byte per-key table [+128 bytes of shared
# table]. Streamed GHASH performance was measured to be 6.67 cycles
# per processed byte on Itanium 2, which is >90% better than Microsoft
# compiler-generated code. For reference, the sha1-ia64.pl module
# processes one byte in 5.7 cycles. On Itanium GHASH should run at
# ~8.5 cycles per byte.

# September 2010
#
# It was originally thought that implementing the "528B" variant on
# Itanium 2 made less sense, for the following reason. Because the
# number of functional units is naturally limited, it appeared
# impossible to implement the "528B" loop in 4 cycles, only in 5,
# which would cap the theoretical performance improvement at about
# 20%. But occasionally you prove yourself wrong:-) I figured out a
# way to fold a couple of instructions and freed yet another
# instruction slot by unrolling the loop... The resulting performance
# is 4.45 cycles per processed byte, 50% better than the "256B"
# version. On the original Itanium performance should remain the same
# as the "256B" version, i.e. ~8.5 cycles.

$output=shift and (open STDOUT,">$output" or die "can't open $output: $!");

if ($^O eq "hpux") {
    $ADDP="addp4";
    for (@ARGV) { $ADDP="add" if (/[\+DD|\-mlp]64/); }
} else { $ADDP="add"; }
for (@ARGV) {	$big_endian=1 if (/\-DB_ENDIAN/);
		$big_endian=0 if (/\-DL_ENDIAN/);  }
if (!defined($big_endian))
	    {	$big_endian=(unpack('L',pack('N',1))==1);  }

sub loop() {
my $label=shift;
my ($p16,$p17)=(shift)?("p63","p63"):("p16","p17");	# mask references to inp

# The loop is scheduled for 6 ticks on Itanium 2 and 8 on Itanium,
# i.e. in a scalable manner;-) Naturally assuming data is in the L1
# cache...
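# For orientation, the loop below is a software-pipelined rendition of
# the textbook "4-bit" recurrence; one step of it, in rough C-like
# pseudocode (an illustrative sketch only, nothing here is emitted),
# looks like:
#
#	rem  = Z.lo & 0xf;
#	Z.lo = (Z.hi<<60)|(Z.lo>>4);
#	Z.hi = (Z.hi>>4) ^ rem_4bit[rem];	// rem_4bit[] entries are pre-shifted into the top bits
#	Z.hi ^= Htable[nibble].hi;
#	Z.lo ^= Htable[nibble].lo;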
# Special note about the 'dep' instruction, which is used to construct
# &rem_4bit[Zlo&0xf]. It works because rem_4bit is aligned on a
# 128-byte boundary, so the lower 7 bits of its address are guaranteed
# to be zero.
$code.=<<___;
$label:
{ .mfi;	(p18)	ld8	Hlo=[Hi[1]],-8
	(p19)	dep	rem=Zlo,rem_4bitp,3,4	}
{ .mfi;	(p19)	xor	Zhi=Zhi,Hhi
	($p17)	xor	xi[1]=xi[1],in[1]	};;
{ .mfi;	(p18)	ld8	Hhi=[Hi[1]]
	(p19)	shrp	Zlo=Zhi,Zlo,4		}
{ .mfi;	(p19)	ld8	rem=[rem]
	(p18)	and	Hi[1]=mask0xf0,xi[2]	};;
{ .mmi;	($p16)	ld1	in[0]=[inp],-1
	(p18)	xor	Zlo=Zlo,Hlo
	(p19)	shr.u	Zhi=Zhi,4		}
{ .mib;	(p19)	xor	Hhi=Hhi,rem
	(p18)	add	Hi[1]=Htbl,Hi[1]	};;

{ .mfi;	(p18)	ld8	Hlo=[Hi[1]],-8
	(p18)	dep	rem=Zlo,rem_4bitp,3,4	}
{ .mfi;	(p17)	shladd	Hi[0]=xi[1],4,r0
	(p18)	xor	Zhi=Zhi,Hhi		};;
{ .mfi;	(p18)	ld8	Hhi=[Hi[1]]
	(p18)	shrp	Zlo=Zhi,Zlo,4		}
{ .mfi;	(p18)	ld8	rem=[rem]
	(p17)	and	Hi[0]=mask0xf0,Hi[0]	};;
{ .mmi;	(p16)	ld1	xi[0]=[Xi],-1
	(p18)	xor	Zlo=Zlo,Hlo
	(p18)	shr.u	Zhi=Zhi,4		}
{ .mib;	(p18)	xor	Hhi=Hhi,rem
	(p17)	add	Hi[0]=Htbl,Hi[0]
	br.ctop.sptk	$label			};;
___
}

$code=<<___;
.explicit
.text

prevfs=r2;	prevlc=r3;	prevpr=r8;
mask0xf0=r21;
rem=r22;	rem_4bitp=r23;
Xi=r24;		Htbl=r25;
inp=r26;	end=r27;
Hhi=r28;	Hlo=r29;
Zhi=r30;	Zlo=r31;

.align	128
.skip	16					// aligns loop body
.global	gcm_gmult_4bit#
.proc	gcm_gmult_4bit#
gcm_gmult_4bit:
	.prologue
{ .mmi;	.save	ar.pfs,prevfs
	alloc	prevfs=ar.pfs,2,6,0,8
	$ADDP	Xi=15,in0			// &Xi[15]
	mov	rem_4bitp=ip		}
{ .mii;	$ADDP	Htbl=8,in1			// &Htbl[0].lo
	.save	ar.lc,prevlc
	mov	prevlc=ar.lc
	.save	pr,prevpr
	mov	prevpr=pr		};;

	.body
	.rotr	in[3],xi[3],Hi[2]

{ .mib;	ld1	xi[2]=[Xi],-1			// Xi[15]
	mov	mask0xf0=0xf0
	brp.loop.imp	.Loop1,.Lend1-16};;
{ .mmi;	ld1	xi[1]=[Xi],-1			// Xi[14]
	};;
{ .mii;	shladd	Hi[1]=xi[2],4,r0
	mov	pr.rot=0x7<<16
	mov	ar.lc=13		};;
{ .mii;	and	Hi[1]=mask0xf0,Hi[1]
	mov	ar.ec=3
	xor	Zlo=Zlo,Zlo		};;
{ .mii;	add	Hi[1]=Htbl,Hi[1]			// &Htbl[nlo].lo
	add	rem_4bitp=rem_4bit#-gcm_gmult_4bit#,rem_4bitp
	xor	Zhi=Zhi,Zhi		};;
___
	&loop	(".Loop1",1);
$code.=<<___;
.Lend1:
{ .mib;	xor	Zhi=Zhi,Hhi		};;	// modulo-scheduling artefact
{ .mib;	mux1	Zlo=Zlo,\@rev		};;
{ .mib;	mux1	Zhi=Zhi,\@rev		};;
{ .mmi;	add	Hlo=9,Xi;;			// ;; is here to prevent
	add	Hhi=1,Xi	};;		// pipeline flush on Itanium
{ .mib;	st8	[Hlo]=Zlo
	mov	pr=prevpr,0x1ffff	};;
{ .mib;	st8	[Hhi]=Zhi
	mov	ar.lc=prevlc
	br.ret.sptk.many	b0	};;
.endp	gcm_gmult_4bit#
___

######################################################################
# "528B" (well, "512B" actually) streamed GHASH
#
$Xip="in0";
$Htbl="in1";
$inp="in2";
$len="in3";
$rem_8bit="loc0";
$mask0xff="loc1";
($sum,$rum) = $big_endian ? ("nop.m","nop.m") : ("sum","rum");
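# Note on $sum/$rum: bit 1 of the IA-64 user mask is the big-endian bit
# (um.be), so "sum 1<<1" switches data accesses to big-endian for the
# duration of gcm_ghash_4bit and "rum 1<<1" switches back; on a build
# that is big-endian to begin with (-DB_ENDIAN) both are replaced with
# nop.m.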
("nop.m","nop.m") : ("sum","rum"); 161*ebfedea0SLionel Sambuc 162*ebfedea0SLionel Sambucsub load_htable() { 163*ebfedea0SLionel Sambuc for (my $i=0;$i<8;$i++) { 164*ebfedea0SLionel Sambuc $code.=<<___; 165*ebfedea0SLionel Sambuc{ .mmi; ld8 r`16+2*$i+1`=[r8],16 // Htable[$i].hi 166*ebfedea0SLionel Sambuc ld8 r`16+2*$i`=[r9],16 } // Htable[$i].lo 167*ebfedea0SLionel Sambuc{ .mmi; ldf8 f`32+2*$i+1`=[r10],16 // Htable[`8+$i`].hi 168*ebfedea0SLionel Sambuc ldf8 f`32+2*$i`=[r11],16 // Htable[`8+$i`].lo 169*ebfedea0SLionel Sambuc___ 170*ebfedea0SLionel Sambuc $code.=shift if (($i+$#_)==7); 171*ebfedea0SLionel Sambuc $code.="\t};;\n" 172*ebfedea0SLionel Sambuc } 173*ebfedea0SLionel Sambuc} 174*ebfedea0SLionel Sambuc 175*ebfedea0SLionel Sambuc$code.=<<___; 176*ebfedea0SLionel Sambucprevsp=r3; 177*ebfedea0SLionel Sambuc 178*ebfedea0SLionel Sambuc.align 32 179*ebfedea0SLionel Sambuc.skip 16 // aligns loop body 180*ebfedea0SLionel Sambuc.global gcm_ghash_4bit# 181*ebfedea0SLionel Sambuc.proc gcm_ghash_4bit# 182*ebfedea0SLionel Sambucgcm_ghash_4bit: 183*ebfedea0SLionel Sambuc .prologue 184*ebfedea0SLionel Sambuc{ .mmi; .save ar.pfs,prevfs 185*ebfedea0SLionel Sambuc alloc prevfs=ar.pfs,4,2,0,0 186*ebfedea0SLionel Sambuc .vframe prevsp 187*ebfedea0SLionel Sambuc mov prevsp=sp 188*ebfedea0SLionel Sambuc mov $rem_8bit=ip };; 189*ebfedea0SLionel Sambuc .body 190*ebfedea0SLionel Sambuc{ .mfi; $ADDP r8=0+0,$Htbl 191*ebfedea0SLionel Sambuc $ADDP r9=0+8,$Htbl } 192*ebfedea0SLionel Sambuc{ .mfi; $ADDP r10=128+0,$Htbl 193*ebfedea0SLionel Sambuc $ADDP r11=128+8,$Htbl };; 194*ebfedea0SLionel Sambuc___ 195*ebfedea0SLionel Sambuc &load_htable( 196*ebfedea0SLionel Sambuc " $ADDP $Xip=15,$Xip", # &Xi[15] 197*ebfedea0SLionel Sambuc " $ADDP $len=$len,$inp", # &inp[len] 198*ebfedea0SLionel Sambuc " $ADDP $inp=15,$inp", # &inp[15] 199*ebfedea0SLionel Sambuc " mov $mask0xff=0xff", 200*ebfedea0SLionel Sambuc " add sp=-512,sp", 201*ebfedea0SLionel Sambuc " andcm sp=sp,$mask0xff", # align stack frame 202*ebfedea0SLionel Sambuc " add r14=0,sp", 203*ebfedea0SLionel Sambuc " add r15=8,sp"); 204*ebfedea0SLionel Sambuc$code.=<<___; 205*ebfedea0SLionel Sambuc{ .mmi; $sum 1<<1 // go big-endian 206*ebfedea0SLionel Sambuc add r8=256+0,sp 207*ebfedea0SLionel Sambuc add r9=256+8,sp } 208*ebfedea0SLionel Sambuc{ .mmi; add r10=256+128+0,sp 209*ebfedea0SLionel Sambuc add r11=256+128+8,sp 210*ebfedea0SLionel Sambuc add $len=-17,$len };; 211*ebfedea0SLionel Sambuc___ 212*ebfedea0SLionel Sambucfor($i=0;$i<8;$i++) { # generate first half of Hshr4[] 213*ebfedea0SLionel Sambucmy ($rlo,$rhi)=("r".eval(16+2*$i),"r".eval(16+2*$i+1)); 214*ebfedea0SLionel Sambuc$code.=<<___; 215*ebfedea0SLionel Sambuc{ .mmi; st8 [r8]=$rlo,16 // Htable[$i].lo 216*ebfedea0SLionel Sambuc st8 [r9]=$rhi,16 // Htable[$i].hi 217*ebfedea0SLionel Sambuc shrp $rlo=$rhi,$rlo,4 }//;; 218*ebfedea0SLionel Sambuc{ .mmi; stf8 [r10]=f`32+2*$i`,16 // Htable[`8+$i`].lo 219*ebfedea0SLionel Sambuc stf8 [r11]=f`32+2*$i+1`,16 // Htable[`8+$i`].hi 220*ebfedea0SLionel Sambuc shr.u $rhi=$rhi,4 };; 221*ebfedea0SLionel Sambuc{ .mmi; st8 [r14]=$rlo,16 // Htable[$i].lo>>4 222*ebfedea0SLionel Sambuc st8 [r15]=$rhi,16 }//;; // Htable[$i].hi>>4 223*ebfedea0SLionel Sambuc___ 224*ebfedea0SLionel Sambuc} 225*ebfedea0SLionel Sambuc$code.=<<___; 226*ebfedea0SLionel Sambuc{ .mmi; ld8 r16=[r8],16 // Htable[8].lo 227*ebfedea0SLionel Sambuc ld8 r17=[r9],16 };; // Htable[8].hi 228*ebfedea0SLionel Sambuc{ .mmi; ld8 r18=[r8],16 // Htable[9].lo 229*ebfedea0SLionel Sambuc ld8 r19=[r9],16 } // Htable[9].hi 
{ .mmi;	rum	1<<5			// clear um.mfh
	shrp	r16=r17,r16,4	};;
___
for($i=0;$i<6;$i++) {	# generate second half of Hshr4[]
$code.=<<___;
{ .mmi;	ld8	r`20+2*$i`=[r8],16	// Htable[`10+$i`].lo
	ld8	r`20+2*$i+1`=[r9],16	// Htable[`10+$i`].hi
	shr.u	r`16+2*$i+1`=r`16+2*$i+1`,4	};;
{ .mmi;	st8	[r14]=r`16+2*$i`,16	// Htable[`8+$i`].lo>>4
	st8	[r15]=r`16+2*$i+1`,16	// Htable[`8+$i`].hi>>4
	shrp	r`18+2*$i`=r`18+2*$i+1`,r`18+2*$i`,4	}
___
}
$code.=<<___;
{ .mmi;	shr.u	r`16+2*$i+1`=r`16+2*$i+1`,4	};;
{ .mmi;	st8	[r14]=r`16+2*$i`,16	// Htable[`8+$i`].lo>>4
	st8	[r15]=r`16+2*$i+1`,16	// Htable[`8+$i`].hi>>4
	shrp	r`18+2*$i`=r`18+2*$i+1`,r`18+2*$i`,4	}
{ .mmi;	add	$Htbl=256,sp		// &Htable[0]
	add	$rem_8bit=rem_8bit#-gcm_ghash_4bit#,$rem_8bit
	shr.u	r`18+2*$i+1`=r`18+2*$i+1`,4	};;
{ .mmi;	st8	[r14]=r`18+2*$i`	// Htable[`8+$i`].lo>>4
	st8	[r15]=r`18+2*$i+1`	}	// Htable[`8+$i`].hi>>4
___

$in="r15";
@xi=("r16","r17");
@rem=("r18","r19");
($Alo,$Ahi,$Blo,$Bhi,$Zlo,$Zhi)=("r20","r21","r22","r23","r24","r25");
($Atbl,$Btbl)=("r26","r27");

$code.=<<___;	# (p16)
{ .mmi;	ld1	$in=[$inp],-1		//(p16) *inp--
	ld1	$xi[0]=[$Xip],-1	//(p16) *Xi--
	cmp.eq	p0,p6=r0,r0	};;	// clear p6
___
push (@xi,shift(@xi)); push (@rem,shift(@rem));	# "rotate" registers

$code.=<<___;	# (p16),(p17)
{ .mmi;	ld1	$xi[0]=[$Xip],-1	//(p16) *Xi--
	xor	$xi[1]=$xi[1],$in	};;	//(p17) xi=$xi[i]^inp[i]
{ .mii;	ld1	$in=[$inp],-1		//(p16) *inp--
	dep	$Atbl=$xi[1],$Htbl,4,4	//(p17) &Htable[nlo].lo
	and	$xi[1]=-16,$xi[1]	};;	//(p17) nhi=xi&0xf0
.align	32
.LOOP:
{ .mmi;
(p6)	st8	[$Xip]=$Zhi,13
	xor	$Zlo=$Zlo,$Zlo
	add	$Btbl=$xi[1],$Htbl	};;	//(p17) &Htable[nhi].lo
___
push (@xi,shift(@xi)); push (@rem,shift(@rem));	# "rotate" registers

$code.=<<___;	# (p16),(p17),(p18)
{ .mmi;	ld8	$Alo=[$Atbl],8		//(p18) Htable[nlo].lo,&Htable[nlo].hi
	ld8	$rem[0]=[$Btbl],-256	//(p18) Htable[nhi].lo,&Hshr4[nhi].lo
	xor	$xi[1]=$xi[1],$in	};;	//(p17) xi=$xi[i]^inp[i]
{ .mfi;	ld8	$Ahi=[$Atbl]		//(p18) Htable[nlo].hi
	dep	$Atbl=$xi[1],$Htbl,4,4	}	//(p17) &Htable[nlo].lo
{ .mfi;	shladd	$rem[0]=$rem[0],4,r0	//(p18) Htable[nhi].lo<<4
	xor	$Zlo=$Zlo,$Alo	};;	//(p18) Z.lo^=Htable[nlo].lo
{ .mmi;	ld8	$Blo=[$Btbl],8		//(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
	ld1	$in=[$inp],-1	}	//(p16) *inp--
{ .mmi;	xor	$rem[0]=$rem[0],$Zlo	//(p18) Z.lo^(Htable[nhi].lo<<4)
	mov	$Zhi=$Ahi		//(p18) Z.hi^=Htable[nlo].hi
	and	$xi[1]=-16,$xi[1]	};;	//(p17) nhi=xi&0xf0
{ .mmi;	ld8	$Bhi=[$Btbl]		//(p18) Hshr4[nhi].hi
	ld1	$xi[0]=[$Xip],-1	//(p16) *Xi--
	shrp	$Zlo=$Zhi,$Zlo,8	}	//(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
{ .mmi;	and	$rem[0]=$rem[0],$mask0xff	//(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
	add	$Btbl=$xi[1],$Htbl	};;	//(p17) &Htable[nhi]
___
push (@xi,shift(@xi)); push (@rem,shift(@rem));	# "rotate" registers

for ($i=1;$i<14;$i++) {
# The fragments above and below are derived from this one by removing
# the unsuitable (p??) instructions.
$code.=<<___;	# (p16),(p17),(p18),(p19)
{ .mmi;	ld8	$Alo=[$Atbl],8		//(p18) Htable[nlo].lo,&Htable[nlo].hi
	ld8	$rem[0]=[$Btbl],-256	//(p18) Htable[nhi].lo,&Hshr4[nhi].lo
	shr.u	$Zhi=$Zhi,8	}	//(p19) Z.hi>>=8
{ .mmi;	shladd	$rem[1]=$rem[1],1,$rem_8bit	//(p19) &rem_8bit[rem]
	xor	$Zlo=$Zlo,$Blo		//(p19) Z.lo^=Hshr4[nhi].lo
	xor	$xi[1]=$xi[1],$in	};;	//(p17) xi=$xi[i]^inp[i]
{ .mmi;	ld8	$Ahi=[$Atbl]		//(p18) Htable[nlo].hi
	ld2	$rem[1]=[$rem[1]]	//(p19) rem_8bit[rem]
	dep	$Atbl=$xi[1],$Htbl,4,4	}	//(p17) &Htable[nlo].lo
{ .mmi;	shladd	$rem[0]=$rem[0],4,r0	//(p18) Htable[nhi].lo<<4
	xor	$Zlo=$Zlo,$Alo		//(p18) Z.lo^=Htable[nlo].lo
	xor	$Zhi=$Zhi,$Bhi	};;	//(p19) Z.hi^=Hshr4[nhi].hi
{ .mmi;	ld8	$Blo=[$Btbl],8		//(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
	ld1	$in=[$inp],-1		//(p16) *inp--
	shl	$rem[1]=$rem[1],48	}	//(p19) rem_8bit[rem]<<48
{ .mmi;	xor	$rem[0]=$rem[0],$Zlo	//(p18) Z.lo^(Htable[nhi].lo<<4)
	xor	$Zhi=$Zhi,$Ahi		//(p18) Z.hi^=Htable[nlo].hi
	and	$xi[1]=-16,$xi[1]	};;	//(p17) nhi=xi&0xf0
{ .mmi;	ld8	$Bhi=[$Btbl]		//(p18) Hshr4[nhi].hi
	ld1	$xi[0]=[$Xip],-1	//(p16) *Xi--
	shrp	$Zlo=$Zhi,$Zlo,8	}	//(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
{ .mmi;	and	$rem[0]=$rem[0],$mask0xff	//(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
	xor	$Zhi=$Zhi,$rem[1]	//(p19) Z.hi^=rem_8bit[rem]<<48
	add	$Btbl=$xi[1],$Htbl	};;	//(p17) &Htable[nhi]
___
push (@xi,shift(@xi)); push (@rem,shift(@rem));	# "rotate" registers
}

$code.=<<___;	# (p17),(p18),(p19)
{ .mmi;	ld8	$Alo=[$Atbl],8		//(p18) Htable[nlo].lo,&Htable[nlo].hi
	ld8	$rem[0]=[$Btbl],-256	//(p18) Htable[nhi].lo,&Hshr4[nhi].lo
	shr.u	$Zhi=$Zhi,8	}	//(p19) Z.hi>>=8
{ .mmi;	shladd	$rem[1]=$rem[1],1,$rem_8bit	//(p19) &rem_8bit[rem]
	xor	$Zlo=$Zlo,$Blo		//(p19) Z.lo^=Hshr4[nhi].lo
	xor	$xi[1]=$xi[1],$in	};;	//(p17) xi=$xi[i]^inp[i]
{ .mmi;	ld8	$Ahi=[$Atbl]		//(p18) Htable[nlo].hi
	ld2	$rem[1]=[$rem[1]]	//(p19) rem_8bit[rem]
	dep	$Atbl=$xi[1],$Htbl,4,4	};;	//(p17) &Htable[nlo].lo
{ .mmi;	shladd	$rem[0]=$rem[0],4,r0	//(p18) Htable[nhi].lo<<4
	xor	$Zlo=$Zlo,$Alo		//(p18) Z.lo^=Htable[nlo].lo
	xor	$Zhi=$Zhi,$Bhi	};;	//(p19) Z.hi^=Hshr4[nhi].hi
{ .mmi;	ld8	$Blo=[$Btbl],8		//(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
	shl	$rem[1]=$rem[1],48	}	//(p19) rem_8bit[rem]<<48
{ .mmi;	xor	$rem[0]=$rem[0],$Zlo	//(p18) Z.lo^(Htable[nhi].lo<<4)
	xor	$Zhi=$Zhi,$Ahi		//(p18) Z.hi^=Htable[nlo].hi
	and	$xi[1]=-16,$xi[1]	};;	//(p17) nhi=xi&0xf0
{ .mmi;	ld8	$Bhi=[$Btbl]		//(p18) Hshr4[nhi].hi
	shrp	$Zlo=$Zhi,$Zlo,8	}	//(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
{ .mmi;	and	$rem[0]=$rem[0],$mask0xff	//(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
	xor	$Zhi=$Zhi,$rem[1]	//(p19) Z.hi^=rem_8bit[rem]<<48
	add	$Btbl=$xi[1],$Htbl	};;	//(p17) &Htable[nhi]
___
push (@xi,shift(@xi)); push (@rem,shift(@rem));	# "rotate" registers

$code.=<<___;	# (p18),(p19)
{ .mfi;	ld8	$Alo=[$Atbl],8		//(p18) Htable[nlo].lo,&Htable[nlo].hi
	shr.u	$Zhi=$Zhi,8	}	//(p19) Z.hi>>=8
{ .mfi;	shladd	$rem[1]=$rem[1],1,$rem_8bit	//(p19) &rem_8bit[rem]
	xor	$Zlo=$Zlo,$Blo	};;	//(p19) Z.lo^=Hshr4[nhi].lo
{ .mfi;	ld8	$Ahi=[$Atbl]		//(p18) Htable[nlo].hi
	xor	$Zlo=$Zlo,$Alo	}	//(p18) Z.lo^=Htable[nlo].lo
{ .mfi;	ld2	$rem[1]=[$rem[1]]	//(p19) rem_8bit[rem]
	xor	$Zhi=$Zhi,$Bhi	};;	//(p19) Z.hi^=Hshr4[nhi].hi
{ .mfi;	ld8	$Blo=[$Btbl],8		//(p18) Htable[nhi].lo,&Htable[nhi].hi
	shl	$rem[1]=$rem[1],48	}	//(p19) rem_8bit[rem]<<48
{ .mfi;	shladd	$rem[0]=$Zlo,4,r0	//(p18) Z.lo<<4
	xor	$Zhi=$Zhi,$Ahi	};;	//(p18) Z.hi^=Htable[nlo].hi
{ .mfi;	ld8	$Bhi=[$Btbl]		//(p18) Htable[nhi].hi
	shrp	$Zlo=$Zhi,$Zlo,4	}	//(p18) Z.lo=(Z.hi<<60)|(Z.lo>>4)
{ .mfi;	and	$rem[0]=$rem[0],$mask0xff	//(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
	xor	$Zhi=$Zhi,$rem[1]	};;	//(p19) Z.hi^=rem_8bit[rem]<<48
___
push (@xi,shift(@xi)); push (@rem,shift(@rem));	# "rotate" registers

$code.=<<___;	# (p19)
{ .mmi;	cmp.ltu	p6,p0=$inp,$len
	add	$inp=32,$inp
	shr.u	$Zhi=$Zhi,4	}	//(p19) Z.hi>>=4
{ .mmi;	shladd	$rem[1]=$rem[1],1,$rem_8bit	//(p19) &rem_8bit[rem]
	xor	$Zlo=$Zlo,$Blo		//(p19) Z.lo^=Hshr4[nhi].lo
	add	$Xip=9,$Xip	};;	// &Xi.lo
{ .mmi;	ld2	$rem[1]=[$rem[1]]	//(p19) rem_8bit[rem]
(p6)	ld1	$in=[$inp],-1		//[p16] *inp--
(p6)	extr.u	$xi[1]=$Zlo,8,8	}	//[p17] Xi[14]
{ .mmi;	xor	$Zhi=$Zhi,$Bhi		//(p19) Z.hi^=Hshr4[nhi].hi
(p6)	and	$xi[0]=$Zlo,$mask0xff	};;	//[p16] Xi[15]
{ .mmi;	st8	[$Xip]=$Zlo,-8
(p6)	xor	$xi[0]=$xi[0],$in	//[p17] xi=$xi[i]^inp[i]
	shl	$rem[1]=$rem[1],48	};;	//(p19) rem_8bit[rem]<<48
{ .mmi;
(p6)	ld1	$in=[$inp],-1		//[p16] *inp--
	xor	$Zhi=$Zhi,$rem[1]	//(p19) Z.hi^=rem_8bit[rem]<<48
(p6)	dep	$Atbl=$xi[0],$Htbl,4,4	}	//[p17] &Htable[nlo].lo
{ .mib;
(p6)	and	$xi[0]=-16,$xi[0]	//[p17] nhi=xi&0xf0
(p6)	br.cond.dptk.many	.LOOP	};;

{ .mib;	st8	[$Xip]=$Zhi	};;
{ .mib;	$rum	1<<1			// return to little-endian
	.restore	sp
	mov	sp=prevsp
	br.ret.sptk.many	b0	};;
.endp	gcm_ghash_4bit#
___
$code.=<<___;
.align	128
.type	rem_4bit#,\@object
rem_4bit:
	data8	0x0000<<48, 0x1C20<<48, 0x3840<<48, 0x2460<<48
	data8	0x7080<<48, 0x6CA0<<48, 0x48C0<<48, 0x54E0<<48
	data8	0xE100<<48, 0xFD20<<48, 0xD940<<48, 0xC560<<48
	data8	0x9180<<48, 0x8DA0<<48, 0xA9C0<<48, 0xB5E0<<48
.size	rem_4bit#,128
.type	rem_8bit#,\@object
rem_8bit:
	data1	0x00,0x00, 0x01,0xC2, 0x03,0x84, 0x02,0x46, 0x07,0x08, 0x06,0xCA, 0x04,0x8C, 0x05,0x4E
	data1	0x0E,0x10, 0x0F,0xD2, 0x0D,0x94, 0x0C,0x56, 0x09,0x18, 0x08,0xDA, 0x0A,0x9C, 0x0B,0x5E
	data1	0x1C,0x20, 0x1D,0xE2, 0x1F,0xA4, 0x1E,0x66, 0x1B,0x28, 0x1A,0xEA, 0x18,0xAC, 0x19,0x6E
	data1	0x12,0x30, 0x13,0xF2, 0x11,0xB4, 0x10,0x76, 0x15,0x38, 0x14,0xFA, 0x16,0xBC, 0x17,0x7E
	data1	0x38,0x40, 0x39,0x82, 0x3B,0xC4, 0x3A,0x06, 0x3F,0x48, 0x3E,0x8A, 0x3C,0xCC, 0x3D,0x0E
	data1	0x36,0x50, 0x37,0x92, 0x35,0xD4, 0x34,0x16, 0x31,0x58, 0x30,0x9A, 0x32,0xDC, 0x33,0x1E
	data1	0x24,0x60, 0x25,0xA2, 0x27,0xE4, 0x26,0x26, 0x23,0x68, 0x22,0xAA, 0x20,0xEC, 0x21,0x2E
	data1	0x2A,0x70, 0x2B,0xB2, 0x29,0xF4, 0x28,0x36, 0x2D,0x78, 0x2C,0xBA, 0x2E,0xFC, 0x2F,0x3E
	data1	0x70,0x80, 0x71,0x42, 0x73,0x04, 0x72,0xC6, 0x77,0x88, 0x76,0x4A, 0x74,0x0C, 0x75,0xCE
	data1	0x7E,0x90, 0x7F,0x52, 0x7D,0x14, 0x7C,0xD6, 0x79,0x98, 0x78,0x5A, 0x7A,0x1C, 0x7B,0xDE
	data1	0x6C,0xA0, 0x6D,0x62, 0x6F,0x24, 0x6E,0xE6, 0x6B,0xA8, 0x6A,0x6A, 0x68,0x2C, 0x69,0xEE
	data1	0x62,0xB0, 0x63,0x72, 0x61,0x34, 0x60,0xF6, 0x65,0xB8, 0x64,0x7A, 0x66,0x3C, 0x67,0xFE
	data1	0x48,0xC0, 0x49,0x02, 0x4B,0x44, 0x4A,0x86, 0x4F,0xC8, 0x4E,0x0A, 0x4C,0x4C, 0x4D,0x8E
	data1	0x46,0xD0, 0x47,0x12, 0x45,0x54, 0x44,0x96, 0x41,0xD8, 0x40,0x1A, 0x42,0x5C, 0x43,0x9E
	data1	0x54,0xE0, 0x55,0x22, 0x57,0x64, 0x56,0xA6, 0x53,0xE8, 0x52,0x2A, 0x50,0x6C, 0x51,0xAE
	data1	0x5A,0xF0, 0x5B,0x32, 0x59,0x74, 0x58,0xB6, 0x5D,0xF8, 0x5C,0x3A, 0x5E,0x7C, 0x5F,0xBE
	data1	0xE1,0x00, 0xE0,0xC2, 0xE2,0x84, 0xE3,0x46, 0xE6,0x08, 0xE7,0xCA, 0xE5,0x8C, 0xE4,0x4E
	data1	0xEF,0x10, 0xEE,0xD2, 0xEC,0x94, 0xED,0x56, 0xE8,0x18, 0xE9,0xDA, 0xEB,0x9C, 0xEA,0x5E
	data1	0xFD,0x20, 0xFC,0xE2, 0xFE,0xA4, 0xFF,0x66, 0xFA,0x28, 0xFB,0xEA, 0xF9,0xAC, 0xF8,0x6E
	data1	0xF3,0x30, 0xF2,0xF2, 0xF0,0xB4, 0xF1,0x76, 0xF4,0x38, 0xF5,0xFA, 0xF7,0xBC, 0xF6,0x7E
	data1	0xD9,0x40, 0xD8,0x82, 0xDA,0xC4, 0xDB,0x06, 0xDE,0x48, 0xDF,0x8A, 0xDD,0xCC, 0xDC,0x0E
	data1	0xD7,0x50, 0xD6,0x92, 0xD4,0xD4, 0xD5,0x16, 0xD0,0x58, 0xD1,0x9A, 0xD3,0xDC, 0xD2,0x1E
	data1	0xC5,0x60, 0xC4,0xA2, 0xC6,0xE4, 0xC7,0x26, 0xC2,0x68, 0xC3,0xAA, 0xC1,0xEC, 0xC0,0x2E
	data1	0xCB,0x70, 0xCA,0xB2, 0xC8,0xF4, 0xC9,0x36, 0xCC,0x78, 0xCD,0xBA, 0xCF,0xFC, 0xCE,0x3E
	data1	0x91,0x80, 0x90,0x42, 0x92,0x04, 0x93,0xC6, 0x96,0x88, 0x97,0x4A, 0x95,0x0C, 0x94,0xCE
	data1	0x9F,0x90, 0x9E,0x52, 0x9C,0x14, 0x9D,0xD6, 0x98,0x98, 0x99,0x5A, 0x9B,0x1C, 0x9A,0xDE
	data1	0x8D,0xA0, 0x8C,0x62, 0x8E,0x24, 0x8F,0xE6, 0x8A,0xA8, 0x8B,0x6A, 0x89,0x2C, 0x88,0xEE
	data1	0x83,0xB0, 0x82,0x72, 0x80,0x34, 0x81,0xF6, 0x84,0xB8, 0x85,0x7A, 0x87,0x3C, 0x86,0xFE
	data1	0xA9,0xC0, 0xA8,0x02, 0xAA,0x44, 0xAB,0x86, 0xAE,0xC8, 0xAF,0x0A, 0xAD,0x4C, 0xAC,0x8E
	data1	0xA7,0xD0, 0xA6,0x12, 0xA4,0x54, 0xA5,0x96, 0xA0,0xD8, 0xA1,0x1A, 0xA3,0x5C, 0xA2,0x9E
	data1	0xB5,0xE0, 0xB4,0x22, 0xB6,0x64, 0xB7,0xA6, 0xB2,0xE8, 0xB3,0x2A, 0xB1,0x6C, 0xB0,0xAE
	data1	0xBB,0xF0, 0xBA,0x32, 0xB8,0x74, 0xB9,0xB6, 0xBC,0xF8, 0xBD,0x3A, 0xBF,0x7C, 0xBE,0xBE
.size	rem_8bit#,512
stringz	"GHASH for IA64, CRYPTOGAMS by <appro\@openssl.org>"
___

$code =~ s/mux1(\s+)\S+\@rev/nop.i$1 0x0/gm	if ($big_endian);
$code =~ s/\`([^\`]*)\`/eval $1/gem;

print $code;
close STDOUT;