#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# March 2010
#
# The module implements the "4-bit" GCM GHASH function and the
# underlying single multiplication operation in GF(2^128). "4-bit"
# means that it uses a 256-byte per-key table [plus a 128-byte shared
# table]. Streamed GHASH performance was measured at 6.67 cycles per
# processed byte on Itanium 2, which is >90% better than Microsoft
# compiler-generated code. For reference, the sha1-ia64.pl module
# processes one byte in 5.7 cycles. On the original Itanium GHASH
# should run at ~8.5 cycles per byte.
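#
# For orientation, a rough sketch of the plain 4-bit algorithm that the
# code below schedules (an informal paraphrase of the C reference
# gcm_gmult_4bit in crypto/modes/gcm128.c, not the exact instruction
# flow used here):
#
#	Z = Htable[Xi[15] & 0xf]		# low nibble of last byte
#	for each remaining nibble, from the high nibble of Xi[15]
#	down to the low nibble of Xi[0]:
#		rem   = Z.lo & 0xf
#		Z   >>= 4			# 128-bit shift
#		Z.hi ^= rem_4bit[rem]		# entries pre-shifted <<48
#		Z    ^= Htable[nibble]
#	Xi = byte-reversed Z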

# September 2010
#
# It was originally thought that implementing the "528B" variant on
# Itanium 2 made little sense, for the following reason: because the
# number of functional units is naturally limited, it appeared
# impossible to implement the "528B" loop in 4 cycles, only in 5, which
# would cap the theoretical improvement at 20%. But occasionally you
# prove yourself wrong:-) I figured out a way to fold a couple of
# instructions and freed yet another instruction slot by unrolling the
# loop... The resulting performance is 4.45 cycles per processed byte,
# 50% better than the "256B" version. On the original Itanium
# performance should remain the same as with the "256B" version,
# i.e. ~8.5 cycles.

$output=shift and (open STDOUT,">$output" or die "can't open $output: $!");

if ($^O eq "hpux") {
    $ADDP="addp4";
    for (@ARGV) { $ADDP="add" if (/[\+DD|\-mlp]64/); }
} else { $ADDP="add"; }
for (@ARGV)  {  $big_endian=1 if (/\-DB_ENDIAN/);
                $big_endian=0 if (/\-DL_ENDIAN/);  }
if (!defined($big_endian))
             {  $big_endian=(unpack('L',pack('N',1))==1);  }

sub loop() {
my $label=shift;
my ($p16,$p17)=(shift)?("p63","p63"):("p16","p17"); # mask references to inp

# The loop is scheduled for 6 ticks on Itanium 2 and 8 on Itanium, i.e.
# in a scalable manner;-) Naturally assuming data is in the L1 cache...
# A special note about the 'dep' instruction, which is used to
# construct &rem_4bit[Zlo&0xf]: it works because rem_4bit is aligned on
# a 128-byte boundary, so the lower 7 bits of its address are
# guaranteed to be zero.
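# Concretely, the emitted
#
#	dep	rem=Zlo,rem_4bitp,3,4
#
# deposits the low 4 bits of Zlo into bits 3..6 of rem_4bitp, i.e.
# rem = rem_4bitp + (Zlo&0xf)*8 = &rem_4bit[Zlo&0xf]; the addition is
# exact only because those low address bits are known to be zero.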
$code.=<<___;
$label:
{ .mfi;	(p18)	ld8	Hlo=[Hi[1]],-8
	(p19)	dep	rem=Zlo,rem_4bitp,3,4	}
{ .mfi;	(p19)	xor	Zhi=Zhi,Hhi
	($p17)	xor	xi[1]=xi[1],in[1]	};;
{ .mfi;	(p18)	ld8	Hhi=[Hi[1]]
	(p19)	shrp	Zlo=Zhi,Zlo,4		}
{ .mfi;	(p19)	ld8	rem=[rem]
	(p18)	and	Hi[1]=mask0xf0,xi[2]	};;
{ .mmi;	($p16)	ld1	in[0]=[inp],-1
	(p18)	xor	Zlo=Zlo,Hlo
	(p19)	shr.u	Zhi=Zhi,4		}
{ .mib;	(p19)	xor	Hhi=Hhi,rem
	(p18)	add	Hi[1]=Htbl,Hi[1]	};;

{ .mfi;	(p18)	ld8	Hlo=[Hi[1]],-8
	(p18)	dep	rem=Zlo,rem_4bitp,3,4	}
{ .mfi;	(p17)	shladd	Hi[0]=xi[1],4,r0
	(p18)	xor	Zhi=Zhi,Hhi		};;
{ .mfi;	(p18)	ld8	Hhi=[Hi[1]]
	(p18)	shrp	Zlo=Zhi,Zlo,4		}
{ .mfi;	(p18)	ld8	rem=[rem]
	(p17)	and	Hi[0]=mask0xf0,Hi[0]	};;
{ .mmi;	(p16)	ld1	xi[0]=[Xi],-1
	(p18)	xor	Zlo=Zlo,Hlo
	(p18)	shr.u	Zhi=Zhi,4		}
{ .mib;	(p18)	xor	Hhi=Hhi,rem
	(p17)	add	Hi[0]=Htbl,Hi[0]
	br.ctop.sptk	$label			};;
___
}

$code=<<___;
.explicit
.text

prevfs=r2;	prevlc=r3;	prevpr=r8;
mask0xf0=r21;
rem=r22;	rem_4bitp=r23;
Xi=r24;		Htbl=r25;
inp=r26;	end=r27;
Hhi=r28;	Hlo=r29;
Zhi=r30;	Zlo=r31;

.align	128
.skip	16					// aligns loop body
.global	gcm_gmult_4bit#
.proc	gcm_gmult_4bit#
gcm_gmult_4bit:
	.prologue
{ .mmi;	.save	ar.pfs,prevfs
	alloc	prevfs=ar.pfs,2,6,0,8
	$ADDP	Xi=15,in0			// &Xi[15]
	mov	rem_4bitp=ip		}
{ .mii;	$ADDP	Htbl=8,in1			// &Htbl[0].lo
	.save	ar.lc,prevlc
	mov	prevlc=ar.lc
	.save	pr,prevpr
	mov	prevpr=pr		};;

	.body
	.rotr	in[3],xi[3],Hi[2]

{ .mib;	ld1	xi[2]=[Xi],-1			// Xi[15]
	mov	mask0xf0=0xf0
	brp.loop.imp	.Loop1,.Lend1-16};;
{ .mmi;	ld1	xi[1]=[Xi],-1			// Xi[14]
					};;
{ .mii;	shladd	Hi[1]=xi[2],4,r0
	mov	pr.rot=0x7<<16
	mov	ar.lc=13		};;
{ .mii;	and	Hi[1]=mask0xf0,Hi[1]
	mov	ar.ec=3
	xor	Zlo=Zlo,Zlo		};;
{ .mii;	add	Hi[1]=Htbl,Hi[1]		// &Htbl[nlo].lo
	add	rem_4bitp=rem_4bit#-gcm_gmult_4bit#,rem_4bitp
	xor	Zhi=Zhi,Zhi		};;
___
	&loop	(".Loop1",1);
$code.=<<___;
.Lend1:
{ .mib;	xor	Zhi=Zhi,Hhi		};;	// modulo-scheduling artefact
{ .mib;	mux1	Zlo=Zlo,\@rev		};;
{ .mib;	mux1	Zhi=Zhi,\@rev		};;
{ .mmi;	add	Hlo=9,Xi;;			// ;; is here to prevent
	add	Hhi=1,Xi		};;	// pipeline flush on Itanium
{ .mib;	st8	[Hlo]=Zlo
	mov	pr=prevpr,0x1ffff	};;
{ .mib;	st8	[Hhi]=Zhi
	mov	ar.lc=prevlc
	br.ret.sptk.many	b0	};;
.endp	gcm_gmult_4bit#
___

######################################################################
# "528B" (well, "512B" actually) streamed GHASH
#
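# The "528B" tag follows the naming used by other GHASH modules; what
# is actually kept per key on the stack here is 512 bytes: a copy of
# Htable[16] at sp+256 and Hshr4[16], i.e. every Htable entry shifted
# right by 4 bits, at sp+0, with the 512-byte rem_8bit table shared.
# A rough per-byte sketch of what the software-pipelined .LOOP below
# computes (informal, condensed from the inline comments; the real
# code spreads this over the (p16)-(p19) stages):
#
#	xi    = Xi[i] ^ inp[i]
#	nlo   = xi & 0x0f;  nhi = xi >> 4
#	Z    ^= Htable[nlo]
#	rem   = (Z.lo ^ (Htable[nhi].lo << 4)) & 0xff
#	Z     = (Z >> 8) ^ Hshr4[nhi]	# Hshr4[nhi] == Htable[nhi]>>4
#	Z.hi ^= rem_8bit[rem] << 48	# 8-bit reduction step
#
# so both nibbles of every input byte are folded in per iteration with
# a single table-driven reduction.
#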
$Xip="in0";
$Htbl="in1";
$inp="in2";
$len="in3";
$rem_8bit="loc0";
$mask0xff="loc1";
($sum,$rum) = $big_endian ? ("nop.m","nop.m") : ("sum","rum");

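# load_htable() emits the bundles that pull Htable[0..7] into r16..r31
# (lo in the even, hi in the odd register of each pair) and
# Htable[8..15] into f32..f47, folding one caller-supplied setup
# instruction per iteration into the spare slots (it is called with
# eight such instructions below).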
sub load_htable() {
    for (my $i=0;$i<8;$i++) {
	$code.=<<___;
{ .mmi;	ld8	r`16+2*$i+1`=[r8],16		// Htable[$i].hi
	ld8	r`16+2*$i`=[r9],16	}	// Htable[$i].lo
{ .mmi;	ldf8	f`32+2*$i+1`=[r10],16		// Htable[`8+$i`].hi
	ldf8	f`32+2*$i`=[r11],16		// Htable[`8+$i`].lo
___
	$code.=shift	if (($i+$#_)==7);
	$code.="\t};;\n"
    }
}

$code.=<<___;
prevsp=r3;

.align	32
.skip	16					// aligns loop body
.global	gcm_ghash_4bit#
.proc	gcm_ghash_4bit#
gcm_ghash_4bit:
	.prologue
{ .mmi;	.save	ar.pfs,prevfs
	alloc	prevfs=ar.pfs,4,2,0,0
	.vframe	prevsp
	mov	prevsp=sp
	mov	$rem_8bit=ip		};;
	.body
{ .mfi;	$ADDP	r8=0+0,$Htbl
	$ADDP	r9=0+8,$Htbl		}
{ .mfi;	$ADDP	r10=128+0,$Htbl
	$ADDP	r11=128+8,$Htbl		};;
___
	&load_htable(
	"	$ADDP	$Xip=15,$Xip",		# &Xi[15]
	"	$ADDP	$len=$len,$inp",	# &inp[len]
	"	$ADDP	$inp=15,$inp",		# &inp[15]
	"	mov	$mask0xff=0xff",
	"	add	sp=-512,sp",
	"	andcm	sp=sp,$mask0xff",	# align stack frame
	"	add	r14=0,sp",
	"	add	r15=8,sp");
$code.=<<___;
{ .mmi;	$sum	1<<1				// go big-endian
	add	r8=256+0,sp
	add	r9=256+8,sp		}
{ .mmi;	add	r10=256+128+0,sp
	add	r11=256+128+8,sp
	add	$len=-17,$len		};;
___
for($i=0;$i<8;$i++) {	# generate first half of Hshr4[]
my ($rlo,$rhi)=("r".eval(16+2*$i),"r".eval(16+2*$i+1));
$code.=<<___;
{ .mmi;	st8	[r8]=$rlo,16			// Htable[$i].lo
	st8	[r9]=$rhi,16			// Htable[$i].hi
	shrp	$rlo=$rhi,$rlo,4	}//;;
{ .mmi;	stf8	[r10]=f`32+2*$i`,16		// Htable[`8+$i`].lo
	stf8	[r11]=f`32+2*$i+1`,16		// Htable[`8+$i`].hi
	shr.u	$rhi=$rhi,4		};;
{ .mmi;	st8	[r14]=$rlo,16			// Htable[$i].lo>>4
	st8	[r15]=$rhi,16		}//;;	// Htable[$i].hi>>4
___
}
$code.=<<___;
{ .mmi;	ld8	r16=[r8],16			// Htable[8].lo
	ld8	r17=[r9],16		};;	// Htable[8].hi
{ .mmi;	ld8	r18=[r8],16			// Htable[9].lo
	ld8	r19=[r9],16		}	// Htable[9].hi
{ .mmi;	rum	1<<5				// clear um.mfh
	shrp	r16=r17,r16,4		};;
___
for($i=0;$i<6;$i++) {	# generate second half of Hshr4[]
$code.=<<___;
{ .mmi;	ld8	r`20+2*$i`=[r8],16		// Htable[`10+$i`].lo
	ld8	r`20+2*$i+1`=[r9],16		// Htable[`10+$i`].hi
	shr.u	r`16+2*$i+1`=r`16+2*$i+1`,4	};;
{ .mmi;	st8	[r14]=r`16+2*$i`,16		// Htable[`8+$i`].lo>>4
	st8	[r15]=r`16+2*$i+1`,16		// Htable[`8+$i`].hi>>4
	shrp	r`18+2*$i`=r`18+2*$i+1`,r`18+2*$i`,4	}
___
}
$code.=<<___;
{ .mmi;	shr.u	r`16+2*$i+1`=r`16+2*$i+1`,4	};;
{ .mmi;	st8	[r14]=r`16+2*$i`,16		// Htable[`8+$i`].lo>>4
	st8	[r15]=r`16+2*$i+1`,16		// Htable[`8+$i`].hi>>4
	shrp	r`18+2*$i`=r`18+2*$i+1`,r`18+2*$i`,4	}
{ .mmi;	add	$Htbl=256,sp			// &Htable[0]
	add	$rem_8bit=rem_8bit#-gcm_ghash_4bit#,$rem_8bit
	shr.u	r`18+2*$i+1`=r`18+2*$i+1`,4	};;
{ .mmi;	st8	[r14]=r`18+2*$i`		// Htable[`8+$i`].lo>>4
	st8	[r15]=r`18+2*$i+1`	}	// Htable[`8+$i`].hi>>4
___

$in="r15";
@xi=("r16","r17");
@rem=("r18","r19");
($Alo,$Ahi,$Blo,$Bhi,$Zlo,$Zhi)=("r20","r21","r22","r23","r24","r25");
($Atbl,$Btbl)=("r26","r27");

$code.=<<___;	# (p16)
{ .mmi;	ld1	$in=[$inp],-1			//(p16) *inp--
	ld1	$xi[0]=[$Xip],-1		//(p16) *Xi--
	cmp.eq	p0,p6=r0,r0		};;	//	clear p6
___
push (@xi,shift(@xi)); push (@rem,shift(@rem));	# "rotate" registers

$code.=<<___;	# (p16),(p17)
{ .mmi;	ld1	$xi[0]=[$Xip],-1		//(p16) *Xi--
	xor	$xi[1]=$xi[1],$in	};;	//(p17) xi=$xi[i]^inp[i]
{ .mii;	ld1	$in=[$inp],-1			//(p16) *inp--
	dep	$Atbl=$xi[1],$Htbl,4,4		//(p17) &Htable[nlo].lo
	and	$xi[1]=-16,$xi[1]	};;	//(p17) nhi=xi&0xf0
.align	32
.LOOP:
{ .mmi;
(p6)	st8	[$Xip]=$Zhi,13
	xor	$Zlo=$Zlo,$Zlo
	add	$Btbl=$xi[1],$Htbl	};;	//(p17) &Htable[nhi].lo
___
push (@xi,shift(@xi)); push (@rem,shift(@rem));	# "rotate" registers

$code.=<<___;	# (p16),(p17),(p18)
{ .mmi;	ld8	$Alo=[$Atbl],8			//(p18) Htable[nlo].lo,&Htable[nlo].hi
	ld8	$rem[0]=[$Btbl],-256		//(p18) Htable[nhi].lo,&Hshr4[nhi].lo
	xor	$xi[1]=$xi[1],$in	};;	//(p17) xi=$xi[i]^inp[i]
{ .mfi;	ld8	$Ahi=[$Atbl]			//(p18) Htable[nlo].hi
	dep	$Atbl=$xi[1],$Htbl,4,4	}	//(p17) &Htable[nlo].lo
{ .mfi;	shladd	$rem[0]=$rem[0],4,r0		//(p18) Htable[nhi].lo<<4
	xor	$Zlo=$Zlo,$Alo		};;	//(p18) Z.lo^=Htable[nlo].lo
{ .mmi;	ld8	$Blo=[$Btbl],8			//(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
	ld1	$in=[$inp],-1		}	//(p16) *inp--
{ .mmi;	xor	$rem[0]=$rem[0],$Zlo		//(p18) Z.lo^(Htable[nhi].lo<<4)
	mov	$Zhi=$Ahi			//(p18) Z.hi^=Htable[nlo].hi
	and	$xi[1]=-16,$xi[1]	};;	//(p17) nhi=xi&0xf0
{ .mmi;	ld8	$Bhi=[$Btbl]			//(p18) Hshr4[nhi].hi
	ld1	$xi[0]=[$Xip],-1		//(p16) *Xi--
	shrp	$Zlo=$Zhi,$Zlo,8	}	//(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
{ .mmi;	and	$rem[0]=$rem[0],$mask0xff	//(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
	add	$Btbl=$xi[1],$Htbl	};;	//(p17) &Htable[nhi]
___
push (@xi,shift(@xi)); push (@rem,shift(@rem));	# "rotate" registers

for ($i=1;$i<14;$i++) {
# The fragments above and below are derived from this one by removing
# the (p??) instructions that are unsuitable there.
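# Roughly: (p16) fetches the next inp/Xi bytes, (p17) xors them and
# forms the Htable/Hshr4 pointers, (p18) accumulates Htable[nlo] and
# computes rem, and (p19) folds in Hshr4[nhi] and the rem_8bit
# reduction.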
$code.=<<___;	# (p16),(p17),(p18),(p19)
{ .mmi;	ld8	$Alo=[$Atbl],8			//(p18) Htable[nlo].lo,&Htable[nlo].hi
	ld8	$rem[0]=[$Btbl],-256		//(p18) Htable[nhi].lo,&Hshr4[nhi].lo
	shr.u	$Zhi=$Zhi,8		}	//(p19) Z.hi>>=8
{ .mmi;	shladd	$rem[1]=$rem[1],1,$rem_8bit	//(p19) &rem_8bit[rem]
	xor	$Zlo=$Zlo,$Blo			//(p19) Z.lo^=Hshr4[nhi].lo
	xor	$xi[1]=$xi[1],$in	};;	//(p17) xi=$xi[i]^inp[i]
{ .mmi;	ld8	$Ahi=[$Atbl]			//(p18) Htable[nlo].hi
	ld2	$rem[1]=[$rem[1]]		//(p19) rem_8bit[rem]
	dep	$Atbl=$xi[1],$Htbl,4,4	}	//(p17) &Htable[nlo].lo
{ .mmi;	shladd	$rem[0]=$rem[0],4,r0		//(p18) Htable[nhi].lo<<4
	xor	$Zlo=$Zlo,$Alo			//(p18) Z.lo^=Htable[nlo].lo
	xor	$Zhi=$Zhi,$Bhi		};;	//(p19) Z.hi^=Hshr4[nhi].hi
{ .mmi;	ld8	$Blo=[$Btbl],8			//(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
	ld1	$in=[$inp],-1			//(p16) *inp--
	shl	$rem[1]=$rem[1],48	}	//(p19) rem_8bit[rem]<<48
{ .mmi;	xor	$rem[0]=$rem[0],$Zlo		//(p18) Z.lo^(Htable[nhi].lo<<4)
	xor	$Zhi=$Zhi,$Ahi			//(p18) Z.hi^=Htable[nlo].hi
	and	$xi[1]=-16,$xi[1]	};;	//(p17) nhi=xi&0xf0
{ .mmi;	ld8	$Bhi=[$Btbl]			//(p18) Hshr4[nhi].hi
	ld1	$xi[0]=[$Xip],-1		//(p16) *Xi--
	shrp	$Zlo=$Zhi,$Zlo,8	}	//(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
{ .mmi;	and	$rem[0]=$rem[0],$mask0xff	//(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
	xor	$Zhi=$Zhi,$rem[1]		//(p19) Z.hi^=rem_8bit[rem]<<48
	add	$Btbl=$xi[1],$Htbl	};;	//(p17) &Htable[nhi]
___
push (@xi,shift(@xi)); push (@rem,shift(@rem));	# "rotate" registers
}

$code.=<<___;	# (p17),(p18),(p19)
{ .mmi;	ld8	$Alo=[$Atbl],8			//(p18) Htable[nlo].lo,&Htable[nlo].hi
	ld8	$rem[0]=[$Btbl],-256		//(p18) Htable[nhi].lo,&Hshr4[nhi].lo
	shr.u	$Zhi=$Zhi,8		}	//(p19) Z.hi>>=8
{ .mmi;	shladd	$rem[1]=$rem[1],1,$rem_8bit	//(p19) &rem_8bit[rem]
	xor	$Zlo=$Zlo,$Blo			//(p19) Z.lo^=Hshr4[nhi].lo
	xor	$xi[1]=$xi[1],$in	};;	//(p17) xi=$xi[i]^inp[i]
{ .mmi;	ld8	$Ahi=[$Atbl]			//(p18) Htable[nlo].hi
	ld2	$rem[1]=[$rem[1]]		//(p19) rem_8bit[rem]
	dep	$Atbl=$xi[1],$Htbl,4,4	};;	//(p17) &Htable[nlo].lo
{ .mmi;	shladd	$rem[0]=$rem[0],4,r0		//(p18) Htable[nhi].lo<<4
	xor	$Zlo=$Zlo,$Alo			//(p18) Z.lo^=Htable[nlo].lo
	xor	$Zhi=$Zhi,$Bhi		};;	//(p19) Z.hi^=Hshr4[nhi].hi
{ .mmi;	ld8	$Blo=[$Btbl],8			//(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
	shl	$rem[1]=$rem[1],48	}	//(p19) rem_8bit[rem]<<48
{ .mmi;	xor	$rem[0]=$rem[0],$Zlo		//(p18) Z.lo^(Htable[nhi].lo<<4)
	xor	$Zhi=$Zhi,$Ahi			//(p18) Z.hi^=Htable[nlo].hi
	and	$xi[1]=-16,$xi[1]	};;	//(p17) nhi=xi&0xf0
{ .mmi;	ld8	$Bhi=[$Btbl]			//(p18) Hshr4[nhi].hi
	shrp	$Zlo=$Zhi,$Zlo,8	}	//(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
{ .mmi;	and	$rem[0]=$rem[0],$mask0xff	//(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
	xor	$Zhi=$Zhi,$rem[1]		//(p19) Z.hi^=rem_8bit[rem]<<48
	add	$Btbl=$xi[1],$Htbl	};;	//(p17) &Htable[nhi]
___
push (@xi,shift(@xi)); push (@rem,shift(@rem));	# "rotate" registers

$code.=<<___;	# (p18),(p19)
{ .mfi;	ld8	$Alo=[$Atbl],8			//(p18) Htable[nlo].lo,&Htable[nlo].hi
	shr.u	$Zhi=$Zhi,8		}	//(p19) Z.hi>>=8
{ .mfi;	shladd	$rem[1]=$rem[1],1,$rem_8bit	//(p19) &rem_8bit[rem]
	xor	$Zlo=$Zlo,$Blo		};;	//(p19) Z.lo^=Hshr4[nhi].lo
{ .mfi;	ld8	$Ahi=[$Atbl]			//(p18) Htable[nlo].hi
	xor	$Zlo=$Zlo,$Alo		}	//(p18) Z.lo^=Htable[nlo].lo
{ .mfi;	ld2	$rem[1]=[$rem[1]]		//(p19) rem_8bit[rem]
	xor	$Zhi=$Zhi,$Bhi		};;	//(p19) Z.hi^=Hshr4[nhi].hi
{ .mfi;	ld8	$Blo=[$Btbl],8			//(p18) Htable[nhi].lo,&Htable[nhi].hi
	shl	$rem[1]=$rem[1],48	}	//(p19) rem_8bit[rem]<<48
{ .mfi;	shladd	$rem[0]=$Zlo,4,r0		//(p18) Z.lo<<4
	xor	$Zhi=$Zhi,$Ahi		};;	//(p18) Z.hi^=Htable[nlo].hi
{ .mfi;	ld8	$Bhi=[$Btbl]			//(p18) Htable[nhi].hi
	shrp	$Zlo=$Zhi,$Zlo,4	}	//(p18) Z.lo=(Z.hi<<60)|(Z.lo>>4)
{ .mfi;	and	$rem[0]=$rem[0],$mask0xff	//(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
	xor	$Zhi=$Zhi,$rem[1]	};;	//(p19) Z.hi^=rem_8bit[rem]<<48
___
push (@xi,shift(@xi)); push (@rem,shift(@rem));	# "rotate" registers

$code.=<<___;	# (p19)
{ .mmi;	cmp.ltu	p6,p0=$inp,$len
	add	$inp=32,$inp
	shr.u	$Zhi=$Zhi,4		}	//(p19) Z.hi>>=4
{ .mmi;	shladd	$rem[1]=$rem[1],1,$rem_8bit	//(p19) &rem_8bit[rem]
	xor	$Zlo=$Zlo,$Blo			//(p19) Z.lo^=Hshr4[nhi].lo
	add	$Xip=9,$Xip		};;	//	&Xi.lo
{ .mmi;	ld2	$rem[1]=[$rem[1]]		//(p19) rem_8bit[rem]
(p6)	ld1	$in=[$inp],-1			//[p16] *inp--
(p6)	extr.u	$xi[1]=$Zlo,8,8		}	//[p17] Xi[14]
{ .mmi;	xor	$Zhi=$Zhi,$Bhi			//(p19) Z.hi^=Hshr4[nhi].hi
(p6)	and	$xi[0]=$Zlo,$mask0xff	};;	//[p16] Xi[15]
{ .mmi;	st8	[$Xip]=$Zlo,-8
(p6)	xor	$xi[0]=$xi[0],$in		//[p17] xi=$xi[i]^inp[i]
	shl	$rem[1]=$rem[1],48	};;	//(p19) rem_8bit[rem]<<48
{ .mmi;
(p6)	ld1	$in=[$inp],-1			//[p16] *inp--
	xor	$Zhi=$Zhi,$rem[1]		//(p19) Z.hi^=rem_8bit[rem]<<48
(p6)	dep	$Atbl=$xi[0],$Htbl,4,4	}	//[p17] &Htable[nlo].lo
{ .mib;
(p6)	and	$xi[0]=-16,$xi[0]		//[p17] nhi=xi&0xf0
(p6)	br.cond.dptk.many	.LOOP	};;

{ .mib;	st8	[$Xip]=$Zhi		};;
{ .mib;	$rum	1<<1				// return to little-endian
	.restore	sp
	mov	sp=prevsp
	br.ret.sptk.many	b0	};;
.endp	gcm_ghash_4bit#
___
$code.=<<___;
.align	128
.type	rem_4bit#,\@object
rem_4bit:
        data8	0x0000<<48, 0x1C20<<48, 0x3840<<48, 0x2460<<48
        data8	0x7080<<48, 0x6CA0<<48, 0x48C0<<48, 0x54E0<<48
        data8	0xE100<<48, 0xFD20<<48, 0xD940<<48, 0xC560<<48
        data8	0x9180<<48, 0x8DA0<<48, 0xA9C0<<48, 0xB5E0<<48
.size	rem_4bit#,128
.type	rem_8bit#,\@object
rem_8bit:
	data1	0x00,0x00, 0x01,0xC2, 0x03,0x84, 0x02,0x46, 0x07,0x08, 0x06,0xCA, 0x04,0x8C, 0x05,0x4E
	data1	0x0E,0x10, 0x0F,0xD2, 0x0D,0x94, 0x0C,0x56, 0x09,0x18, 0x08,0xDA, 0x0A,0x9C, 0x0B,0x5E
	data1	0x1C,0x20, 0x1D,0xE2, 0x1F,0xA4, 0x1E,0x66, 0x1B,0x28, 0x1A,0xEA, 0x18,0xAC, 0x19,0x6E
	data1	0x12,0x30, 0x13,0xF2, 0x11,0xB4, 0x10,0x76, 0x15,0x38, 0x14,0xFA, 0x16,0xBC, 0x17,0x7E
	data1	0x38,0x40, 0x39,0x82, 0x3B,0xC4, 0x3A,0x06, 0x3F,0x48, 0x3E,0x8A, 0x3C,0xCC, 0x3D,0x0E
	data1	0x36,0x50, 0x37,0x92, 0x35,0xD4, 0x34,0x16, 0x31,0x58, 0x30,0x9A, 0x32,0xDC, 0x33,0x1E
	data1	0x24,0x60, 0x25,0xA2, 0x27,0xE4, 0x26,0x26, 0x23,0x68, 0x22,0xAA, 0x20,0xEC, 0x21,0x2E
	data1	0x2A,0x70, 0x2B,0xB2, 0x29,0xF4, 0x28,0x36, 0x2D,0x78, 0x2C,0xBA, 0x2E,0xFC, 0x2F,0x3E
	data1	0x70,0x80, 0x71,0x42, 0x73,0x04, 0x72,0xC6, 0x77,0x88, 0x76,0x4A, 0x74,0x0C, 0x75,0xCE
	data1	0x7E,0x90, 0x7F,0x52, 0x7D,0x14, 0x7C,0xD6, 0x79,0x98, 0x78,0x5A, 0x7A,0x1C, 0x7B,0xDE
	data1	0x6C,0xA0, 0x6D,0x62, 0x6F,0x24, 0x6E,0xE6, 0x6B,0xA8, 0x6A,0x6A, 0x68,0x2C, 0x69,0xEE
	data1	0x62,0xB0, 0x63,0x72, 0x61,0x34, 0x60,0xF6, 0x65,0xB8, 0x64,0x7A, 0x66,0x3C, 0x67,0xFE
	data1	0x48,0xC0, 0x49,0x02, 0x4B,0x44, 0x4A,0x86, 0x4F,0xC8, 0x4E,0x0A, 0x4C,0x4C, 0x4D,0x8E
	data1	0x46,0xD0, 0x47,0x12, 0x45,0x54, 0x44,0x96, 0x41,0xD8, 0x40,0x1A, 0x42,0x5C, 0x43,0x9E
	data1	0x54,0xE0, 0x55,0x22, 0x57,0x64, 0x56,0xA6, 0x53,0xE8, 0x52,0x2A, 0x50,0x6C, 0x51,0xAE
	data1	0x5A,0xF0, 0x5B,0x32, 0x59,0x74, 0x58,0xB6, 0x5D,0xF8, 0x5C,0x3A, 0x5E,0x7C, 0x5F,0xBE
	data1	0xE1,0x00, 0xE0,0xC2, 0xE2,0x84, 0xE3,0x46, 0xE6,0x08, 0xE7,0xCA, 0xE5,0x8C, 0xE4,0x4E
	data1	0xEF,0x10, 0xEE,0xD2, 0xEC,0x94, 0xED,0x56, 0xE8,0x18, 0xE9,0xDA, 0xEB,0x9C, 0xEA,0x5E
	data1	0xFD,0x20, 0xFC,0xE2, 0xFE,0xA4, 0xFF,0x66, 0xFA,0x28, 0xFB,0xEA, 0xF9,0xAC, 0xF8,0x6E
	data1	0xF3,0x30, 0xF2,0xF2, 0xF0,0xB4, 0xF1,0x76, 0xF4,0x38, 0xF5,0xFA, 0xF7,0xBC, 0xF6,0x7E
	data1	0xD9,0x40, 0xD8,0x82, 0xDA,0xC4, 0xDB,0x06, 0xDE,0x48, 0xDF,0x8A, 0xDD,0xCC, 0xDC,0x0E
	data1	0xD7,0x50, 0xD6,0x92, 0xD4,0xD4, 0xD5,0x16, 0xD0,0x58, 0xD1,0x9A, 0xD3,0xDC, 0xD2,0x1E
	data1	0xC5,0x60, 0xC4,0xA2, 0xC6,0xE4, 0xC7,0x26, 0xC2,0x68, 0xC3,0xAA, 0xC1,0xEC, 0xC0,0x2E
	data1	0xCB,0x70, 0xCA,0xB2, 0xC8,0xF4, 0xC9,0x36, 0xCC,0x78, 0xCD,0xBA, 0xCF,0xFC, 0xCE,0x3E
	data1	0x91,0x80, 0x90,0x42, 0x92,0x04, 0x93,0xC6, 0x96,0x88, 0x97,0x4A, 0x95,0x0C, 0x94,0xCE
	data1	0x9F,0x90, 0x9E,0x52, 0x9C,0x14, 0x9D,0xD6, 0x98,0x98, 0x99,0x5A, 0x9B,0x1C, 0x9A,0xDE
	data1	0x8D,0xA0, 0x8C,0x62, 0x8E,0x24, 0x8F,0xE6, 0x8A,0xA8, 0x8B,0x6A, 0x89,0x2C, 0x88,0xEE
	data1	0x83,0xB0, 0x82,0x72, 0x80,0x34, 0x81,0xF6, 0x84,0xB8, 0x85,0x7A, 0x87,0x3C, 0x86,0xFE
	data1	0xA9,0xC0, 0xA8,0x02, 0xAA,0x44, 0xAB,0x86, 0xAE,0xC8, 0xAF,0x0A, 0xAD,0x4C, 0xAC,0x8E
	data1	0xA7,0xD0, 0xA6,0x12, 0xA4,0x54, 0xA5,0x96, 0xA0,0xD8, 0xA1,0x1A, 0xA3,0x5C, 0xA2,0x9E
	data1	0xB5,0xE0, 0xB4,0x22, 0xB6,0x64, 0xB7,0xA6, 0xB2,0xE8, 0xB3,0x2A, 0xB1,0x6C, 0xB0,0xAE
	data1	0xBB,0xF0, 0xBA,0x32, 0xB8,0x74, 0xB9,0xB6, 0xBC,0xF8, 0xBD,0x3A, 0xBF,0x7C, 0xBE,0xBE
.size	rem_8bit#,512
stringz	"GHASH for IA64, CRYPTOGAMS by <appro\@openssl.org>"
___

$code =~ s/mux1(\s+)\S+\@rev/nop.i$1 0x0/gm      if ($big_endian);
$code =~ s/\`([^\`]*)\`/eval $1/gem;

print $code;
close STDOUT;