xref: /openbsd-src/lib/libcrypto/modes/asm/ghash-sparcv9.pl (revision 9484a439e95949b9d8685055f43a77c2ab822582)
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# March 2010
11#
12# The module implements "4-bit" GCM GHASH function and underlying
13# single multiplication operation in GF(2^128). "4-bit" means that it
14# uses 256 bytes per-key table [+128 bytes shared table]. Performance
15# results are for streamed GHASH subroutine on UltraSPARC pre-Tx CPU
16# and are expressed in cycles per processed byte, less is better:
17#
18#		gcc 3.3.x	cc 5.2		this assembler
19#
20# 32-bit build	81.4		43.3		12.6	(+546%/+244%)
21# 64-bit build	20.2		21.2		12.6	(+60%/+68%)
22#
23# Here is data collected on UltraSPARC T1 system running Linux:
24#
25#		gcc 4.4.1			this assembler
26#
27# 32-bit build	566				50	(+1000%)
28# 64-bit build	56				50	(+12%)
29#
30# I don't quite understand why difference between 32-bit and 64-bit
31# compiler-generated code is so big. Compilers *were* instructed to
32# generate code for UltraSPARC and should have used 64-bit registers
33# for Z vector (see C code) even in 32-bit build... Oh well, it only
34# means more impressive improvement coefficients for this assembler
35# module;-) Loops are aggressively modulo-scheduled in respect to
36# references to input data and Z.hi updates to achieve 12 cycles
37# timing. To anchor to something else, sha1-sparcv9.pl spends 11.6
38# cycles to process one byte on UltraSPARC pre-Tx CPU and ~24 on T1.
39
# Select ABI-dependent parameters from the assembler flags passed on the
# command line: -m64 or -xarch=v9 means the 64-bit SPARC ABI, which uses
# a 2047-byte stack bias and a larger minimum register-window frame.
$bits=32;
for (@ARGV)     { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
if ($bits==64)  { $bias=2047; $frame=192; }
else            { $bias=0;    $frame=112; }

# Redirect STDOUT to the requested output file.  Dying on failure avoids
# silently discarding the generated assembler when the path is bad.
$output=shift;
if (defined $output) {
	open STDOUT,">$output" or die "can't open $output: $!";
}
47
# Symbolic names for the SPARC registers used below.  Register windows:
# the six %o registers carry the 64-bit working values, the eight %l
# registers small values and pointers, and %i0-%i3 the input arguments.
($Zhi,$Zlo,$Thi,$Tlo,$rem,$tmp)=map("%o$_",(0..5));	# 64-bit values

($nhi,$nlo,$xi0,$xi1,$rem_4bit,$remi,$Htblo,$cnt)=map("%l$_",(0..7));	# small values and pointers

($Xi,$Htbl,$inp,$len)=map("%i$_",(0..3));	# input argument block
# Emit gcm_ghash_4bit(Xi, Htable, inp, len): streamed GHASH over len
# bytes of input using the "4-bit" table method.  The loop is
# modulo-scheduled (see header comment), so instruction order is load-
# latency-critical; do not reorder.  Comments below use the SPARC
# assembler comment character.
$code.=<<___;
.section	".rodata",#alloc

! 16-entry table of reduction constants for the 4-bit method, each
! pre-shifted into the top half of a 32-bit word and stored as a pair
! of .long so every entry occupies 8 bytes (indexed by remi = nibble*8).
.align	64
rem_4bit:
	.long	`0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0
	.long	`0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0
	.long	`0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0
	.long	`0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0
.type	rem_4bit,#object
.size	rem_4bit,(.-rem_4bit)

.section	".text",#alloc,#execinstr
! void gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16],
!                     const u8 *inp, size_t len);
! Xi is updated in place; len is assumed to be a multiple of 16.
.globl	gcm_ghash_4bit
.align	32
gcm_ghash_4bit:
	save	%sp,-$frame,%sp
#ifdef __PIC__
	sethi	%hi(_GLOBAL_OFFSET_TABLE_-4), $tmp
	rd	%pc, $rem
	or	$tmp, %lo(_GLOBAL_OFFSET_TABLE_+4), $tmp
	add	$tmp, $rem, $tmp
#endif

	ldub	[$inp+15],$nlo
	ldub	[$Xi+15],$xi0
	ldub	[$Xi+14],$xi1
	add	$len,$inp,$len
	add	$Htbl,8,$Htblo

! Resolve the address of rem_4bit; in PIC builds it is fetched through
! the global offset table computed above.
#ifdef __PIC__
	set	rem_4bit, $rem_4bit
	ldx	[$rem_4bit+$tmp], $rem_4bit
#else
	set	rem_4bit, $rem_4bit
#endif

! One iteration per 16-byte input block.  Bytes are consumed from
! offset 15 down to 0; the two least-significant bytes of the working
! value are processed in the prologue and epilogue around the inner
! loop, the remaining ones (counter 13..1) inside it.
.Louter:
	xor	$xi0,$nlo,$nlo
	and	$nlo,0xf0,$nhi
	and	$nlo,0x0f,$nlo
	sll	$nlo,4,$nlo
	ldx	[$Htblo+$nlo],$Zlo
	ldx	[$Htbl+$nlo],$Zhi

	ldub	[$inp+14],$nlo

	ldx	[$Htblo+$nhi],$Tlo
	and	$Zlo,0xf,$remi
	ldx	[$Htbl+$nhi],$Thi
	sll	$remi,3,$remi
	ldx	[$rem_4bit+$remi],$rem
	srlx	$Zlo,4,$Zlo
	mov	13,$cnt
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo

	xor	$xi1,$nlo,$nlo
	and	$Zlo,0xf,$remi
	and	$nlo,0xf0,$nhi
	and	$nlo,0x0f,$nlo
	ba	.Lghash_inner
	sll	$nlo,4,$nlo
! Inner loop: two table lookups (low and high nibble) plus reduction
! per byte, with the next byte loads interleaved into the shift/xor
! chain of the current one.
.align	32
.Lghash_inner:
	ldx	[$Htblo+$nlo],$Tlo
	sll	$remi,3,$remi
	xor	$Thi,$Zhi,$Zhi
	ldx	[$Htbl+$nlo],$Thi
	srlx	$Zlo,4,$Zlo
	xor	$rem,$Zhi,$Zhi
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	ldub	[$inp+$cnt],$nlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	ldub	[$Xi+$cnt],$xi1
	xor	$Thi,$Zhi,$Zhi
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$xi1,$nlo,$nlo
	srlx	$Zhi,4,$Zhi
	and	$nlo,0xf0,$nhi
	addcc	$cnt,-1,$cnt
	xor	$Zlo,$tmp,$Zlo
	and	$nlo,0x0f,$nlo
	xor	$Tlo,$Zlo,$Zlo
	sll	$nlo,4,$nlo
	blu	.Lghash_inner
	and	$Zlo,0xf,$remi

! Epilogue for byte 0: finish its low nibble ...
	ldx	[$Htblo+$nlo],$Tlo
	sll	$remi,3,$remi
	xor	$Thi,$Zhi,$Zhi
	ldx	[$Htbl+$nlo],$Thi
	srlx	$Zlo,4,$Zlo
	xor	$rem,$Zhi,$Zhi
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi

	add	$inp,16,$inp
	cmp	$inp,$len
	be,pn	`$bits==64?"%xcc":"%icc"`,.Ldone
	and	$Zlo,0xf,$remi

! ... then the high nibble, store the updated hash value and loop to
! the next block; the two low bytes of the new value are extracted for
! the next outer iteration.
	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	ldub	[$inp+15],$nlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	stx	$Zlo,[$Xi+8]
	xor	$rem,$Zhi,$Zhi
	stx	$Zhi,[$Xi]
	srl	$Zlo,8,$xi1
	and	$Zlo,0xff,$xi0
	ba	.Louter
	and	$xi1,0xff,$xi1
! Last block: same high-nibble finish as above, but store and return.
.align	32
.Ldone:
	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	stx	$Zlo,[$Xi+8]
	xor	$rem,$Zhi,$Zhi
	stx	$Zhi,[$Xi]

	ret
	restore
.type	gcm_ghash_4bit,#function
.size	gcm_ghash_4bit,(.-gcm_ghash_4bit)
___
229
undef $inp;	# gcm_gmult_4bit takes no inp/len arguments; undefine so
undef $len;	# an accidental reference in the heredoc below expands empty

# Emit gcm_gmult_4bit(Xi, Htable): a single GF(2^128) multiplication of
# Xi by H, i.e. the same 16-byte transform as one gcm_ghash_4bit block
# but reading bytes from Xi itself instead of xoring in input data.
$code.=<<___;
! void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16]);
! Xi is updated in place.
.globl	gcm_gmult_4bit
.align	32
gcm_gmult_4bit:
	save	%sp,-$frame,%sp
#ifdef __PIC__
	sethi	%hi(_GLOBAL_OFFSET_TABLE_-4), $tmp
	rd	%pc, $rem
	or	$tmp, %lo(_GLOBAL_OFFSET_TABLE_+4), $tmp
	add	$tmp, $rem, $tmp
#endif

	ldub	[$Xi+15],$nlo
	add	$Htbl,8,$Htblo

! Resolve the address of rem_4bit; in PIC builds it is fetched through
! the global offset table computed above.
#ifdef __PIC__
	set	rem_4bit, $rem_4bit
	ldx	[$rem_4bit+$tmp], $rem_4bit
#else
	set	rem_4bit, $rem_4bit
#endif

! Prologue: process byte 15 of Xi and set up for byte 14.
	and	$nlo,0xf0,$nhi
	and	$nlo,0x0f,$nlo
	sll	$nlo,4,$nlo
	ldx	[$Htblo+$nlo],$Zlo
	ldx	[$Htbl+$nlo],$Zhi

	ldub	[$Xi+14],$nlo

	ldx	[$Htblo+$nhi],$Tlo
	and	$Zlo,0xf,$remi
	ldx	[$Htbl+$nhi],$Thi
	sll	$remi,3,$remi
	ldx	[$rem_4bit+$remi],$rem
	srlx	$Zlo,4,$Zlo
	mov	13,$cnt
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo

	and	$Zlo,0xf,$remi
	and	$nlo,0xf0,$nhi
	and	$nlo,0x0f,$nlo
	ba	.Lgmult_inner
	sll	$nlo,4,$nlo
! Inner loop over bytes 13..1 of Xi, modulo-scheduled like the ghash
! inner loop.
.align	32
.Lgmult_inner:
	ldx	[$Htblo+$nlo],$Tlo
	sll	$remi,3,$remi
	xor	$Thi,$Zhi,$Zhi
	ldx	[$Htbl+$nlo],$Thi
	srlx	$Zlo,4,$Zlo
	xor	$rem,$Zhi,$Zhi
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	ldub	[$Xi+$cnt],$nlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	srlx	$Zhi,4,$Zhi
	and	$nlo,0xf0,$nhi
	addcc	$cnt,-1,$cnt
	xor	$Zlo,$tmp,$Zlo
	and	$nlo,0x0f,$nlo
	xor	$Tlo,$Zlo,$Zlo
	sll	$nlo,4,$nlo
	blu	.Lgmult_inner
	and	$Zlo,0xf,$remi

! Epilogue: finish byte 0 (low nibble, then high nibble) and store the
! result back into Xi.
	ldx	[$Htblo+$nlo],$Tlo
	sll	$remi,3,$remi
	xor	$Thi,$Zhi,$Zhi
	ldx	[$Htbl+$nlo],$Thi
	srlx	$Zlo,4,$Zlo
	xor	$rem,$Zhi,$Zhi
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	stx	$Zlo,[$Xi+8]
	xor	$rem,$Zhi,$Zhi
	stx	$Zhi,[$Xi]

	ret
	restore
.type	gcm_gmult_4bit,#function
.size	gcm_gmult_4bit,(.-gcm_gmult_4bit)
___
348
# Expand every backtick-quoted expression in the accumulated assembly
# (compile-time arithmetic such as shifted table constants and the
# 32/64-bit condition-code selector) and emit the result.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
# Checking close catches deferred write errors (e.g. full disk) that an
# unchecked close would silently drop, leaving a truncated .s file.
close STDOUT or die "error closing STDOUT: $!";
352