xref: /openbsd-src/lib/libcrypto/modes/asm/ghash-parisc.pl (revision f2da64fbbbf1b03f09f390ab01267c93dfd77c4c)
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# April 2010
11#
12# The module implements "4-bit" GCM GHASH function and underlying
13# single multiplication operation in GF(2^128). "4-bit" means that it
14# uses 256 bytes per-key table [+128 bytes shared table]. On PA-7100LC
15# it processes one byte in 19.6 cycles, which is more than twice as
16# fast as code generated by gcc 3.2. PA-RISC 2.0 loop is scheduled for
17# 8 cycles, but measured performance on PA-8600 system is ~9 cycles per
18# processed byte. This is ~2.2x faster than 64-bit code generated by
19# vendor compiler (which used to be very hard to beat:-).
20#
21# Special thanks to polarhome.com for providing HP-UX account.
22
# Command line: <flavour> [<output file>]
#   flavour - a string containing "64" selects the 64-bit variant further
#             down; anything else selects the 32-bit one.
#   output  - optional file that receives the generated assembly.
$flavour = shift;
$output  = shift;

# Redirect STDOUT only when an output file was actually named, so that
# "script flavour > file" keeps working.  The three-argument form keeps
# characters in the file name from being interpreted as an open mode, and
# a failure to create the file is fatal instead of silently ignored.
if (defined $output && length $output) {
    open STDOUT, '>', $output
        or die "can't open $output: $!";
}
26
# ABI-dependent parameters.  A flavour containing "64" selects the 64-bit
# column, anything else (including no flavour at all) the 32-bit one.
my %abi = ($flavour =~ /64/)
    ? ( LEVEL        => "2.0W",
        SIZE_T       => 8,
        FRAME_MARKER => 80,
        SAVED_RP     => 16,
        PUSH         => "std",
        PUSHMA       => "std,ma",
        POP          => "ldd",
        POPMB        => "ldd,mb",
        NREGS        => 6 )
    : ( LEVEL        => "1.0",	#"\n\t.ALLOW\t2.0";
        SIZE_T       => 4,
        FRAME_MARKER => 48,
        SAVED_RP     => 20,
        PUSH         => "stw",
        PUSHMA       => "stwm",
        POP          => "ldw",
        POPMB        => "ldwm",
        NREGS        => 11 );

# Publish the selected column through the global names used by the
# assembly templates below.
($LEVEL, $SIZE_T, $FRAME_MARKER, $SAVED_RP,
 $PUSH,  $PUSHMA, $POP,          $POPMB,    $NREGS) =
    @abi{qw(LEVEL SIZE_T FRAME_MARKER SAVED_RP
            PUSH  PUSHMA POP          POPMB    NREGS)};

# Frame size: ten register-sized save slots on top of the frame marker
# (saved regs + frame marker [+ argument transfer]).
$FRAME = $FRAME_MARKER + 10*$SIZE_T;
51
################# volatile registers
# argument block
($Xi, $Htbl, $inp, $len) = ("%r26", "%r25", "%r24", "%r23");
# variables; $Hhh deliberately shares its register with $Htbl
$Hhh = $Htbl;
($Hll, $Zhh, $Zll, $cnt) = ("%r22", "%r21", "%r20", "%r19");
($rem_4bit, $rem, $mask0xf0) = ("%r28", "%r29", "%r31");

################# preserved registers
($Thh, $Tll, $nlo, $nhi, $byte) = map { "%r$_" } (1 .. 5);

# The extra word-sized halves are only named when $SIZE_T is 4; they
# occupy %r6 through %r11.
($Zhl, $Zlh, $Hhl, $Hlh, $Thl, $Tlh) = map { "%r$_" } (6 .. 11)
    if ($SIZE_T==4);

$rem2 = "%r6";	# used in PA-RISC 2.0 code
81
82$code.=<<___;
83	.LEVEL	$LEVEL
84#if 0
85	.SPACE	\$TEXT\$
86	.SUBSPA	\$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
87#else
88	.text
89#endif
90
91	.EXPORT	gcm_gmult_4bit,ENTRY,ARGW0=GR,ARGW1=GR
92	.ALIGN	64
93gcm_gmult_4bit
94	.PROC
95	.CALLINFO	FRAME=`$FRAME-10*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=$NREGS
96	.ENTRY
97	$PUSH	%r2,-$SAVED_RP(%sp)	; standard prologue
98	$PUSHMA	%r3,$FRAME(%sp)
99	$PUSH	%r4,`-$FRAME+1*$SIZE_T`(%sp)
100	$PUSH	%r5,`-$FRAME+2*$SIZE_T`(%sp)
101	$PUSH	%r6,`-$FRAME+3*$SIZE_T`(%sp)
102___
103$code.=<<___ if ($SIZE_T==4);
104	$PUSH	%r7,`-$FRAME+4*$SIZE_T`(%sp)
105	$PUSH	%r8,`-$FRAME+5*$SIZE_T`(%sp)
106	$PUSH	%r9,`-$FRAME+6*$SIZE_T`(%sp)
107	$PUSH	%r10,`-$FRAME+7*$SIZE_T`(%sp)
108	$PUSH	%r11,`-$FRAME+8*$SIZE_T`(%sp)
109___
110$code.=<<___;
111	blr	%r0,$rem_4bit
112	ldi	3,$rem
113L\$pic_gmult
114	andcm	$rem_4bit,$rem,$rem_4bit
115	addl	$inp,$len,$len
116	ldo	L\$rem_4bit-L\$pic_gmult($rem_4bit),$rem_4bit
117	ldi	0xf0,$mask0xf0
118___
119$code.=<<___ if ($SIZE_T==4);
120#ifndef __OpenBSD__
121	ldi	31,$rem
122	mtctl	$rem,%cr11
123	extrd,u,*= $rem,%sar,1,$rem	; executes on PA-RISC 1.0
124	b	L\$parisc1_gmult
125	nop
126___
127
128$code.=<<___;
129	ldb	15($Xi),$nlo
130	ldo	8($Htbl),$Hll
131
132	and	$mask0xf0,$nlo,$nhi
133	depd,z	$nlo,59,4,$nlo
134
135	ldd	$nlo($Hll),$Zll
136	ldd	$nlo($Hhh),$Zhh
137
138	depd,z	$Zll,60,4,$rem
139	shrpd	$Zhh,$Zll,4,$Zll
140	extrd,u	$Zhh,59,60,$Zhh
141	ldb	14($Xi),$nlo
142
143	ldd	$nhi($Hll),$Tll
144	ldd	$nhi($Hhh),$Thh
145	and	$mask0xf0,$nlo,$nhi
146	depd,z	$nlo,59,4,$nlo
147
148	xor	$Tll,$Zll,$Zll
149	xor	$Thh,$Zhh,$Zhh
150	ldd	$rem($rem_4bit),$rem
151	b	L\$oop_gmult_pa2
152	ldi	13,$cnt
153
154	.ALIGN	8
155L\$oop_gmult_pa2
156	xor	$rem,$Zhh,$Zhh		; moved here to work around gas bug
157	depd,z	$Zll,60,4,$rem
158
159	shrpd	$Zhh,$Zll,4,$Zll
160	extrd,u	$Zhh,59,60,$Zhh
161	ldd	$nlo($Hll),$Tll
162	ldd	$nlo($Hhh),$Thh
163
164	xor	$Tll,$Zll,$Zll
165	xor	$Thh,$Zhh,$Zhh
166	ldd	$rem($rem_4bit),$rem
167
168	xor	$rem,$Zhh,$Zhh
169	depd,z	$Zll,60,4,$rem
170	ldbx	$cnt($Xi),$nlo
171
172	shrpd	$Zhh,$Zll,4,$Zll
173	extrd,u	$Zhh,59,60,$Zhh
174	ldd	$nhi($Hll),$Tll
175	ldd	$nhi($Hhh),$Thh
176
177	and	$mask0xf0,$nlo,$nhi
178	depd,z	$nlo,59,4,$nlo
179	ldd	$rem($rem_4bit),$rem
180
181	xor	$Tll,$Zll,$Zll
182	addib,uv -1,$cnt,L\$oop_gmult_pa2
183	xor	$Thh,$Zhh,$Zhh
184
185	xor	$rem,$Zhh,$Zhh
186	depd,z	$Zll,60,4,$rem
187
188	shrpd	$Zhh,$Zll,4,$Zll
189	extrd,u	$Zhh,59,60,$Zhh
190	ldd	$nlo($Hll),$Tll
191	ldd	$nlo($Hhh),$Thh
192
193	xor	$Tll,$Zll,$Zll
194	xor	$Thh,$Zhh,$Zhh
195	ldd	$rem($rem_4bit),$rem
196
197	xor	$rem,$Zhh,$Zhh
198	depd,z	$Zll,60,4,$rem
199
200	shrpd	$Zhh,$Zll,4,$Zll
201	extrd,u	$Zhh,59,60,$Zhh
202	ldd	$nhi($Hll),$Tll
203	ldd	$nhi($Hhh),$Thh
204
205	xor	$Tll,$Zll,$Zll
206	xor	$Thh,$Zhh,$Zhh
207	ldd	$rem($rem_4bit),$rem
208
209	xor	$rem,$Zhh,$Zhh
210	std	$Zll,8($Xi)
211	std	$Zhh,0($Xi)
212___
213
214$code.=<<___ if ($SIZE_T==4);
215	b	L\$done_gmult
216	nop
217
218L\$parisc1_gmult
219#endif
220	ldb	15($Xi),$nlo
221	ldo	12($Htbl),$Hll
222	ldo	8($Htbl),$Hlh
223	ldo	4($Htbl),$Hhl
224
225	and	$mask0xf0,$nlo,$nhi
226	zdep	$nlo,27,4,$nlo
227
228	ldwx	$nlo($Hll),$Zll
229	ldwx	$nlo($Hlh),$Zlh
230	ldwx	$nlo($Hhl),$Zhl
231	ldwx	$nlo($Hhh),$Zhh
232	zdep	$Zll,28,4,$rem
233	ldb	14($Xi),$nlo
234	ldwx	$rem($rem_4bit),$rem
235	shrpw	$Zlh,$Zll,4,$Zll
236	ldwx	$nhi($Hll),$Tll
237	shrpw	$Zhl,$Zlh,4,$Zlh
238	ldwx	$nhi($Hlh),$Tlh
239	shrpw	$Zhh,$Zhl,4,$Zhl
240	ldwx	$nhi($Hhl),$Thl
241	extru	$Zhh,27,28,$Zhh
242	ldwx	$nhi($Hhh),$Thh
243	xor	$rem,$Zhh,$Zhh
244	and	$mask0xf0,$nlo,$nhi
245	zdep	$nlo,27,4,$nlo
246
247	xor	$Tll,$Zll,$Zll
248	ldwx	$nlo($Hll),$Tll
249	xor	$Tlh,$Zlh,$Zlh
250	ldwx	$nlo($Hlh),$Tlh
251	xor	$Thl,$Zhl,$Zhl
252	b	L\$oop_gmult_pa1
253	ldi	13,$cnt
254
255	.ALIGN	8
256L\$oop_gmult_pa1
257	zdep	$Zll,28,4,$rem
258	ldwx	$nlo($Hhl),$Thl
259	xor	$Thh,$Zhh,$Zhh
260	ldwx	$rem($rem_4bit),$rem
261	shrpw	$Zlh,$Zll,4,$Zll
262	ldwx	$nlo($Hhh),$Thh
263	shrpw	$Zhl,$Zlh,4,$Zlh
264	ldbx	$cnt($Xi),$nlo
265	xor	$Tll,$Zll,$Zll
266	ldwx	$nhi($Hll),$Tll
267	shrpw	$Zhh,$Zhl,4,$Zhl
268	xor	$Tlh,$Zlh,$Zlh
269	ldwx	$nhi($Hlh),$Tlh
270	extru	$Zhh,27,28,$Zhh
271	xor	$Thl,$Zhl,$Zhl
272	ldwx	$nhi($Hhl),$Thl
273	xor	$rem,$Zhh,$Zhh
274	zdep	$Zll,28,4,$rem
275	xor	$Thh,$Zhh,$Zhh
276	ldwx	$nhi($Hhh),$Thh
277	shrpw	$Zlh,$Zll,4,$Zll
278	ldwx	$rem($rem_4bit),$rem
279	shrpw	$Zhl,$Zlh,4,$Zlh
280	shrpw	$Zhh,$Zhl,4,$Zhl
281	and	$mask0xf0,$nlo,$nhi
282	extru	$Zhh,27,28,$Zhh
283	zdep	$nlo,27,4,$nlo
284	xor	$Tll,$Zll,$Zll
285	ldwx	$nlo($Hll),$Tll
286	xor	$Tlh,$Zlh,$Zlh
287	ldwx	$nlo($Hlh),$Tlh
288	xor	$rem,$Zhh,$Zhh
289	addib,uv -1,$cnt,L\$oop_gmult_pa1
290	xor	$Thl,$Zhl,$Zhl
291
292	zdep	$Zll,28,4,$rem
293	ldwx	$nlo($Hhl),$Thl
294	xor	$Thh,$Zhh,$Zhh
295	ldwx	$rem($rem_4bit),$rem
296	shrpw	$Zlh,$Zll,4,$Zll
297	ldwx	$nlo($Hhh),$Thh
298	shrpw	$Zhl,$Zlh,4,$Zlh
299	xor	$Tll,$Zll,$Zll
300	ldwx	$nhi($Hll),$Tll
301	shrpw	$Zhh,$Zhl,4,$Zhl
302	xor	$Tlh,$Zlh,$Zlh
303	ldwx	$nhi($Hlh),$Tlh
304	extru	$Zhh,27,28,$Zhh
305	xor	$rem,$Zhh,$Zhh
306	xor	$Thl,$Zhl,$Zhl
307	ldwx	$nhi($Hhl),$Thl
308	xor	$Thh,$Zhh,$Zhh
309	ldwx	$nhi($Hhh),$Thh
310	zdep	$Zll,28,4,$rem
311	ldwx	$rem($rem_4bit),$rem
312	shrpw	$Zlh,$Zll,4,$Zll
313	shrpw	$Zhl,$Zlh,4,$Zlh
314	shrpw	$Zhh,$Zhl,4,$Zhl
315	extru	$Zhh,27,28,$Zhh
316	xor	$Tll,$Zll,$Zll
317	xor	$Tlh,$Zlh,$Zlh
318	xor	$rem,$Zhh,$Zhh
319	stw	$Zll,12($Xi)
320	xor	$Thl,$Zhl,$Zhl
321	stw	$Zlh,8($Xi)
322	xor	$Thh,$Zhh,$Zhh
323	stw	$Zhl,4($Xi)
324	stw	$Zhh,0($Xi)
325___
326$code.=<<___;
327L\$done_gmult
328	$POP	`-$FRAME-$SAVED_RP`(%sp),%r2		; standard epilogue
329	$POP	`-$FRAME+1*$SIZE_T`(%sp),%r4
330	$POP	`-$FRAME+2*$SIZE_T`(%sp),%r5
331	$POP	`-$FRAME+3*$SIZE_T`(%sp),%r6
332___
333$code.=<<___ if ($SIZE_T==4);
334	$POP	`-$FRAME+4*$SIZE_T`(%sp),%r7
335	$POP	`-$FRAME+5*$SIZE_T`(%sp),%r8
336	$POP	`-$FRAME+6*$SIZE_T`(%sp),%r9
337	$POP	`-$FRAME+7*$SIZE_T`(%sp),%r10
338	$POP	`-$FRAME+8*$SIZE_T`(%sp),%r11
339___
340$code.=<<___;
341	bv	(%r2)
342	.EXIT
343	$POPMB	-$FRAME(%sp),%r3
344	.PROCEND
345
346	.EXPORT	gcm_ghash_4bit,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
347	.ALIGN	64
348gcm_ghash_4bit
349	.PROC
350	.CALLINFO	FRAME=`$FRAME-10*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=11
351	.ENTRY
352	$PUSH	%r2,-$SAVED_RP(%sp)	; standard prologue
353	$PUSHMA	%r3,$FRAME(%sp)
354	$PUSH	%r4,`-$FRAME+1*$SIZE_T`(%sp)
355	$PUSH	%r5,`-$FRAME+2*$SIZE_T`(%sp)
356	$PUSH	%r6,`-$FRAME+3*$SIZE_T`(%sp)
357___
358$code.=<<___ if ($SIZE_T==4);
359	$PUSH	%r7,`-$FRAME+4*$SIZE_T`(%sp)
360	$PUSH	%r8,`-$FRAME+5*$SIZE_T`(%sp)
361	$PUSH	%r9,`-$FRAME+6*$SIZE_T`(%sp)
362	$PUSH	%r10,`-$FRAME+7*$SIZE_T`(%sp)
363	$PUSH	%r11,`-$FRAME+8*$SIZE_T`(%sp)
364___
365$code.=<<___;
366	blr	%r0,$rem_4bit
367	ldi	3,$rem
368L\$pic_ghash
369	andcm	$rem_4bit,$rem,$rem_4bit
370	addl	$inp,$len,$len
371	ldo	L\$rem_4bit-L\$pic_ghash($rem_4bit),$rem_4bit
372	ldi	0xf0,$mask0xf0
373___
374$code.=<<___ if ($SIZE_T==4);
375#ifndef __OpenBSD__
376	ldi	31,$rem
377	mtctl	$rem,%cr11
378	extrd,u,*= $rem,%sar,1,$rem	; executes on PA-RISC 1.0
379	b	L\$parisc1_ghash
380	nop
381___
382
383$code.=<<___;
384	ldb	15($Xi),$nlo
385	ldo	8($Htbl),$Hll
386
387L\$outer_ghash_pa2
388	ldb	15($inp),$nhi
389	xor	$nhi,$nlo,$nlo
390	and	$mask0xf0,$nlo,$nhi
391	depd,z	$nlo,59,4,$nlo
392
393	ldd	$nlo($Hll),$Zll
394	ldd	$nlo($Hhh),$Zhh
395
396	depd,z	$Zll,60,4,$rem
397	shrpd	$Zhh,$Zll,4,$Zll
398	extrd,u	$Zhh,59,60,$Zhh
399	ldb	14($Xi),$nlo
400	ldb	14($inp),$byte
401
402	ldd	$nhi($Hll),$Tll
403	ldd	$nhi($Hhh),$Thh
404	xor	$byte,$nlo,$nlo
405	and	$mask0xf0,$nlo,$nhi
406	depd,z	$nlo,59,4,$nlo
407
408	xor	$Tll,$Zll,$Zll
409	xor	$Thh,$Zhh,$Zhh
410	ldd	$rem($rem_4bit),$rem
411	b	L\$oop_ghash_pa2
412	ldi	13,$cnt
413
414	.ALIGN	8
415L\$oop_ghash_pa2
416	xor	$rem,$Zhh,$Zhh		; moved here to work around gas bug
417	depd,z	$Zll,60,4,$rem2
418
419	shrpd	$Zhh,$Zll,4,$Zll
420	extrd,u	$Zhh,59,60,$Zhh
421	ldd	$nlo($Hll),$Tll
422	ldd	$nlo($Hhh),$Thh
423
424	xor	$Tll,$Zll,$Zll
425	xor	$Thh,$Zhh,$Zhh
426	ldbx	$cnt($Xi),$nlo
427	ldbx	$cnt($inp),$byte
428
429	depd,z	$Zll,60,4,$rem
430	shrpd	$Zhh,$Zll,4,$Zll
431	ldd	$rem2($rem_4bit),$rem2
432
433	xor	$rem2,$Zhh,$Zhh
434	xor	$byte,$nlo,$nlo
435	ldd	$nhi($Hll),$Tll
436	ldd	$nhi($Hhh),$Thh
437
438	and	$mask0xf0,$nlo,$nhi
439	depd,z	$nlo,59,4,$nlo
440
441	extrd,u	$Zhh,59,60,$Zhh
442	xor	$Tll,$Zll,$Zll
443
444	ldd	$rem($rem_4bit),$rem
445	addib,uv -1,$cnt,L\$oop_ghash_pa2
446	xor	$Thh,$Zhh,$Zhh
447
448	xor	$rem,$Zhh,$Zhh
449	depd,z	$Zll,60,4,$rem2
450
451	shrpd	$Zhh,$Zll,4,$Zll
452	extrd,u	$Zhh,59,60,$Zhh
453	ldd	$nlo($Hll),$Tll
454	ldd	$nlo($Hhh),$Thh
455
456	xor	$Tll,$Zll,$Zll
457	xor	$Thh,$Zhh,$Zhh
458
459	depd,z	$Zll,60,4,$rem
460	shrpd	$Zhh,$Zll,4,$Zll
461	ldd	$rem2($rem_4bit),$rem2
462
463	xor	$rem2,$Zhh,$Zhh
464	ldd	$nhi($Hll),$Tll
465	ldd	$nhi($Hhh),$Thh
466
467	extrd,u	$Zhh,59,60,$Zhh
468	xor	$Tll,$Zll,$Zll
469	xor	$Thh,$Zhh,$Zhh
470	ldd	$rem($rem_4bit),$rem
471
472	xor	$rem,$Zhh,$Zhh
473	std	$Zll,8($Xi)
474	ldo	16($inp),$inp
475	std	$Zhh,0($Xi)
476	cmpb,*<> $inp,$len,L\$outer_ghash_pa2
477	copy	$Zll,$nlo
478___
479
480$code.=<<___ if ($SIZE_T==4);
481	b	L\$done_ghash
482	nop
483
484L\$parisc1_ghash
485#endif
486	ldb	15($Xi),$nlo
487	ldo	12($Htbl),$Hll
488	ldo	8($Htbl),$Hlh
489	ldo	4($Htbl),$Hhl
490
491L\$outer_ghash_pa1
492	ldb	15($inp),$byte
493	xor	$byte,$nlo,$nlo
494	and	$mask0xf0,$nlo,$nhi
495	zdep	$nlo,27,4,$nlo
496
497	ldwx	$nlo($Hll),$Zll
498	ldwx	$nlo($Hlh),$Zlh
499	ldwx	$nlo($Hhl),$Zhl
500	ldwx	$nlo($Hhh),$Zhh
501	zdep	$Zll,28,4,$rem
502	ldb	14($Xi),$nlo
503	ldb	14($inp),$byte
504	ldwx	$rem($rem_4bit),$rem
505	shrpw	$Zlh,$Zll,4,$Zll
506	ldwx	$nhi($Hll),$Tll
507	shrpw	$Zhl,$Zlh,4,$Zlh
508	ldwx	$nhi($Hlh),$Tlh
509	shrpw	$Zhh,$Zhl,4,$Zhl
510	ldwx	$nhi($Hhl),$Thl
511	extru	$Zhh,27,28,$Zhh
512	ldwx	$nhi($Hhh),$Thh
513	xor	$byte,$nlo,$nlo
514	xor	$rem,$Zhh,$Zhh
515	and	$mask0xf0,$nlo,$nhi
516	zdep	$nlo,27,4,$nlo
517
518	xor	$Tll,$Zll,$Zll
519	ldwx	$nlo($Hll),$Tll
520	xor	$Tlh,$Zlh,$Zlh
521	ldwx	$nlo($Hlh),$Tlh
522	xor	$Thl,$Zhl,$Zhl
523	b	L\$oop_ghash_pa1
524	ldi	13,$cnt
525
526	.ALIGN	8
527L\$oop_ghash_pa1
528	zdep	$Zll,28,4,$rem
529	ldwx	$nlo($Hhl),$Thl
530	xor	$Thh,$Zhh,$Zhh
531	ldwx	$rem($rem_4bit),$rem
532	shrpw	$Zlh,$Zll,4,$Zll
533	ldwx	$nlo($Hhh),$Thh
534	shrpw	$Zhl,$Zlh,4,$Zlh
535	ldbx	$cnt($Xi),$nlo
536	xor	$Tll,$Zll,$Zll
537	ldwx	$nhi($Hll),$Tll
538	shrpw	$Zhh,$Zhl,4,$Zhl
539	ldbx	$cnt($inp),$byte
540	xor	$Tlh,$Zlh,$Zlh
541	ldwx	$nhi($Hlh),$Tlh
542	extru	$Zhh,27,28,$Zhh
543	xor	$Thl,$Zhl,$Zhl
544	ldwx	$nhi($Hhl),$Thl
545	xor	$rem,$Zhh,$Zhh
546	zdep	$Zll,28,4,$rem
547	xor	$Thh,$Zhh,$Zhh
548	ldwx	$nhi($Hhh),$Thh
549	shrpw	$Zlh,$Zll,4,$Zll
550	ldwx	$rem($rem_4bit),$rem
551	shrpw	$Zhl,$Zlh,4,$Zlh
552	xor	$byte,$nlo,$nlo
553	shrpw	$Zhh,$Zhl,4,$Zhl
554	and	$mask0xf0,$nlo,$nhi
555	extru	$Zhh,27,28,$Zhh
556	zdep	$nlo,27,4,$nlo
557	xor	$Tll,$Zll,$Zll
558	ldwx	$nlo($Hll),$Tll
559	xor	$Tlh,$Zlh,$Zlh
560	ldwx	$nlo($Hlh),$Tlh
561	xor	$rem,$Zhh,$Zhh
562	addib,uv -1,$cnt,L\$oop_ghash_pa1
563	xor	$Thl,$Zhl,$Zhl
564
565	zdep	$Zll,28,4,$rem
566	ldwx	$nlo($Hhl),$Thl
567	xor	$Thh,$Zhh,$Zhh
568	ldwx	$rem($rem_4bit),$rem
569	shrpw	$Zlh,$Zll,4,$Zll
570	ldwx	$nlo($Hhh),$Thh
571	shrpw	$Zhl,$Zlh,4,$Zlh
572	xor	$Tll,$Zll,$Zll
573	ldwx	$nhi($Hll),$Tll
574	shrpw	$Zhh,$Zhl,4,$Zhl
575	xor	$Tlh,$Zlh,$Zlh
576	ldwx	$nhi($Hlh),$Tlh
577	extru	$Zhh,27,28,$Zhh
578	xor	$rem,$Zhh,$Zhh
579	xor	$Thl,$Zhl,$Zhl
580	ldwx	$nhi($Hhl),$Thl
581	xor	$Thh,$Zhh,$Zhh
582	ldwx	$nhi($Hhh),$Thh
583	zdep	$Zll,28,4,$rem
584	ldwx	$rem($rem_4bit),$rem
585	shrpw	$Zlh,$Zll,4,$Zll
586	shrpw	$Zhl,$Zlh,4,$Zlh
587	shrpw	$Zhh,$Zhl,4,$Zhl
588	extru	$Zhh,27,28,$Zhh
589	xor	$Tll,$Zll,$Zll
590	xor	$Tlh,$Zlh,$Zlh
591	xor	$rem,$Zhh,$Zhh
592	stw	$Zll,12($Xi)
593	xor	$Thl,$Zhl,$Zhl
594	stw	$Zlh,8($Xi)
595	xor	$Thh,$Zhh,$Zhh
596	stw	$Zhl,4($Xi)
597	ldo	16($inp),$inp
598	stw	$Zhh,0($Xi)
599	comb,<>	$inp,$len,L\$outer_ghash_pa1
600	copy	$Zll,$nlo
601___
602$code.=<<___;
603L\$done_ghash
604	$POP	`-$FRAME-$SAVED_RP`(%sp),%r2		; standard epilogue
605	$POP	`-$FRAME+1*$SIZE_T`(%sp),%r4
606	$POP	`-$FRAME+2*$SIZE_T`(%sp),%r5
607	$POP	`-$FRAME+3*$SIZE_T`(%sp),%r6
608___
609$code.=<<___ if ($SIZE_T==4);
610	$POP	`-$FRAME+4*$SIZE_T`(%sp),%r7
611	$POP	`-$FRAME+5*$SIZE_T`(%sp),%r8
612	$POP	`-$FRAME+6*$SIZE_T`(%sp),%r9
613	$POP	`-$FRAME+7*$SIZE_T`(%sp),%r10
614	$POP	`-$FRAME+8*$SIZE_T`(%sp),%r11
615___
616$code.=<<___;
617	bv	(%r2)
618	.EXIT
619	$POPMB	-$FRAME(%sp),%r3
620	.PROCEND
621
622	.ALIGN	64
623L\$rem_4bit
624	.WORD	`0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0
625	.WORD	`0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0
626	.WORD	`0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0
627	.WORD	`0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0
628
629	.data
630	.STRINGZ "GHASH for PA-RISC, GRYPTOGAMS by <appro\@openssl.org>"
631	.ALIGN	64
632___
633
634# Explicitly encode PA-RISC 2.0 instructions used in this module, so
635# that it can be compiled with .LEVEL 1.0. It should be noted that I
636# wouldn't have to do this, if GNU assembler understood .ALLOW 2.0
637# directive...
638
# Encode an "ldd" instruction as a raw .WORD.
#
#   $mod  - completer text that followed the mnemonic ("" or e.g. ",mb")
#   $args - operand text, e.g. "%r3(%r22),%r20" or "8(%r25),%r22"
#
# Returns the replacement assembly line.  Operands that cannot be encoded
# here are returned as the plain textual instruction, leaving it to the
# assembler to accept or reject them.
my $ldd = sub {
    my ($mod, $args) = @_;
    my $orig = "ldd$mod\t$args";

    # format 4: index register + base register
    if ($args =~ /%r([0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) {
        my $opcode = (0x03<<26)|($2<<21)|($1<<16)|(3<<6)|$3;
        return sprintf "\t.WORD\t0x%08x\t; %s", $opcode, $orig;
    }

    # format 5: short displacement + base register
    if (my ($disp, $base, $target) =
            $args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) {
        # Only four magnitude bits and one sign bit are encoded below, so
        # anything outside -16..15 would silently turn into a different
        # displacement.  Such operands are passed through untouched.
        if ($disp >= -16 && $disp <= 15) {
            my $opcode = (0x03<<26)|($base<<21)|(1<<12)|(3<<6)|$target;
            $opcode |= (($disp&0xF)<<17)|(($disp&0x10)<<12);	# encode offset
            $opcode |= (1<<5)  if ($mod =~ /^,m/);
            $opcode |= (1<<13) if ($mod =~ /^,mb/);
            return sprintf "\t.WORD\t0x%08x\t; %s", $opcode, $orig;
        }
    }

    return "\t".$orig;
};
656
# Encode a "std" instruction as a raw .WORD.
#
#   $mod  - completer text that followed the mnemonic (only echoed in the
#           trailing comment, it does not influence the encoding)
#   $args - operand text, e.g. "%r20,8(%r26)"
#
# Returns the replacement assembly line.  Operands that cannot be encoded
# here are returned as the plain textual instruction.
my $std = sub {
    my ($mod, $args) = @_;
    my $orig = "std$mod\t$args";

    # format 3 suffices
    if (my ($src, $disp, $base) =
            $args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/) {
        # The encoding keeps displacement bits 3..12 plus a sign bit taken
        # from bit 13.  A displacement that is not a multiple of 8, or that
        # does not fit in 14 signed bits, would lose bits and silently
        # address a different location, so it is passed through instead.
        if ($disp % 8 == 0 && $disp >= -8192 && $disp <= 8191) {
            my $opcode = (0x1c<<26)|($base<<21)|($src<<16)
                       | (($disp&0x1FF8)<<1)|(($disp>>13)&1);
            return sprintf "\t.WORD\t0x%08x\t; %s", $opcode, $orig;
        }
    }

    return "\t".$orig;
};
# Encode an "extrd" instruction as a raw .WORD.
#
#   $mod  - completer text that followed the mnemonic, e.g. ",u" or ",u,*="
#   $args - operand text, either "%rS,pos,len,%rT" or "%rS,%sar,len,%rT"
#
# Returns the replacement assembly line; unrecognised operands come back
# as the plain textual instruction.
my $extrd = sub {
    my ($mod, $args) = @_;
    my $text = "extrd$mod\t$args";

    # I only have ",u" completer, it's implicitly encoded...

    # format 15: fixed bit position
    if (my ($src, $pos, $width, $dst) =
            $args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) {
        my $clen = 32 - $width;
        my $word = (0x36<<26) | ($src<<21) | ($dst<<16);
        $word |= (($pos&0x20)<<6)  | (($pos&0x1f)<<5);	# encode pos
        $word |= (($clen&0x20)<<7) | ($clen&0x1f);	# encode len
        return sprintf "\t.WORD\t0x%08x\t; %s", $word, $text;
    }

    # format 12: bit position taken from %sar
    if (my ($src, $width, $dst) =
            $args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/) {
        my $clen = 32 - $width;
        my $word = (0x34<<26) | ($src<<21) | ($dst<<16) | (2<<11) | (1<<9);
        $word |= (($clen&0x20)<<3) | ($clen&0x1f);	# encode len
        # a "=" condition (with or without "*") sets one extra bit
        $word |= (1<<13) if ($mod =~ /,\**=/);
        return sprintf "\t.WORD\t0x%08x\t; %s", $word, $text;
    }

    return "\t".$text;
};
689
# Encode a "shrpd" instruction as a raw .WORD.
#
#   $mod  - completer text that followed the mnemonic (only echoed in the
#           trailing comment)
#   $args - operand text, either "%rA,%rB,sa,%rT" or "%rA,%rB,%sar,%rT"
#
# Returns the replacement assembly line; unrecognised operands come back
# as the plain textual instruction.
my $shrpd = sub {
    my ($mod, $args) = @_;
    my $text = "shrpd$mod\t$args";

    # format 14: immediate shift amount
    if (my ($hi, $lo, $shift, $dst) =
            $args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/) {
        my $cpos = 63 - $shift;
        my $word = (0x34<<26) | ($lo<<21) | ($hi<<16) | (1<<10) | $dst;
        $word |= (($cpos&0x20)<<6) | (($cpos&0x1f)<<5);	# encode sa
        return sprintf "\t.WORD\t0x%08x\t; %s", $word, $text;
    }

    # format 11: shift amount taken from %sar
    if (my ($hi, $lo, $dst) =
            $args =~ /%r([0-9]+),%r([0-9]+),%sar,%r([0-9]+)/) {
        my $word = (0x34<<26) | ($lo<<21) | ($hi<<16) | (1<<9) | $dst;
        return sprintf "\t.WORD\t0x%08x\t; %s", $word, $text;
    }

    return "\t".$text;
};
706
# Encode a "depd" instruction as a raw .WORD.
#
#   $mod  - completer text that followed the mnemonic (only echoed in the
#           trailing comment)
#   $args - operand text of the form "%rS,pos,len,%rT"
#
# Returns the replacement assembly line; unrecognised operands come back
# as the plain textual instruction.
my $depd = sub {
    my ($mod, $args) = @_;
    my $text = "depd$mod\t$args";

    # I only have ",z" completer, it's implicitly encoded...
    my ($src, $pos, $width, $dst) =
        $args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/
        or return "\t".$text;

    # format 16
    my $cpos = 63 - $pos;
    my $clen = 32 - $width;
    my $word = (0x3c<<26) | ($dst<<21) | ($src<<16);
    $word |= (($cpos&0x20)<<6) | (($cpos&0x1f)<<5);	# encode pos
    $word |= (($clen&0x20)<<7) | ($clen&0x1f);		# encode len
    return sprintf "\t.WORD\t0x%08x\t; %s", $word, $text;
};
722
# Translate one instruction into its output form.
#
#   $mnemonic - instruction name, e.g. "ldd" or "xor"
#   $mod      - completer text directly following the mnemonic ("" if none)
#   $args     - operand text
#
# Mnemonics that have a hand-written encoder are handed to it; everything
# else is re-emitted as "\t<mnemonic><mod>\t<args>".
sub assemble {
    my ($mnemonic, $mod, $args) = @_;

    # Explicit dispatch table instead of looking the encoder up by
    # evaluating the mnemonic as a variable name: no string eval, and a
    # mnemonic can never resolve to an unrelated variable.
    my %encoder_for = (
        ldd   => $ldd,
        std   => $std,
        extrd => $extrd,
        shrpd => $shrpd,
        depd  => $depd,
    );
    my $encoder = $encoder_for{$mnemonic};

    return ref($encoder) eq 'CODE'
        ? $encoder->($mod, $args)
        : "\t$mnemonic$mod\t$args";
}
729
# Post-process the accumulated template one line at a time and print it:
#   - every `expr` span is replaced by the value of the Perl expression;
#   - when $SIZE_T is 4 each instruction line goes through assemble(),
#     "cmpb,*" becomes "comb," and the first remaining ",*" loses its "*";
#   - when $SIZE_T is 8 "bv" becomes "bve".
foreach my $line (split("\n", $code)) {
    $line =~ s{\`([^\`]*)\`}{
        my $expr  = $1;
        my $value = eval $expr;
        # A broken expression used to vanish silently and leave an empty
        # operand in the generated assembly; stop instead.
        die "cannot evaluate `$expr`: $@" if $@;
        $value;
    }ge;

    if ($SIZE_T==4) {
        $line =~ s/^\s+([a-z]+)([\S]*)\s+([\S]*)/assemble($1,$2,$3)/e;
        $line =~ s/cmpb,\*/comb,/;
        $line =~ s/,\*/,/;
    }
    $line =~ s/\bbv\b/bve/ if ($SIZE_T==8);

    print $line, "\n";
}

# Buffered write errors (for example a full disk) only surface at close
# time, so the result of close must be checked.
close STDOUT or die "error closing STDOUT: $!";
742