xref: /onnv-gate/usr/src/common/openssl/crypto/des/asm/des_enc.m4 (revision 2139:6243c3338933)
1*2139Sjp161948!  des_enc.m4
2*2139Sjp161948!  des_enc.S  (generated from des_enc.m4)
3*2139Sjp161948!
4*2139Sjp161948!  UltraSPARC assembler version of the LibDES/SSLeay/OpenSSL des_enc.c file.
5*2139Sjp161948!
6*2139Sjp161948!  Version 1.0. 32-bit version.
7*2139Sjp161948!
8*2139Sjp161948!  June 8, 2000.
9*2139Sjp161948!
10*2139Sjp161948!  Version 2.0. 32/64-bit, PIC-ification, blended CPU adaptation
11*2139Sjp161948!		by Andy Polyakov.
12*2139Sjp161948!
13*2139Sjp161948!  January 1, 2003.
14*2139Sjp161948!
15*2139Sjp161948!  Assembler version: Copyright Svend Olaf Mikkelsen.
16*2139Sjp161948!
17*2139Sjp161948!  Original C code: Copyright Eric A. Young.
18*2139Sjp161948!
19*2139Sjp161948!  This code can be freely used by LibDES/SSLeay/OpenSSL users.
20*2139Sjp161948!
21*2139Sjp161948!  The LibDES/SSLeay/OpenSSL copyright notices must be respected.
22*2139Sjp161948!
23*2139Sjp161948!  This version can be redistributed.
24*2139Sjp161948!
25*2139Sjp161948!  To expand the m4 macros: m4 -B 8192 des_enc.m4 > des_enc.S
26*2139Sjp161948!
27*2139Sjp161948!  Global registers 1 to 5 are used. This is the same as done by the
28*2139Sjp161948!  cc compiler. The UltraSPARC load/store little endian feature is used.
29*2139Sjp161948!
30*2139Sjp161948!  Instruction grouping often refers to one CPU cycle.
31*2139Sjp161948!
32*2139Sjp161948!  Assemble through gcc: gcc -c -mcpu=ultrasparc -o des_enc.o des_enc.S
33*2139Sjp161948!
34*2139Sjp161948!  Assemble through cc:  cc -c -xarch=v8plusa -o des_enc.o des_enc.S
35*2139Sjp161948!
36*2139Sjp161948!  Performance improvement according to './apps/openssl speed des'
37*2139Sjp161948!
38*2139Sjp161948!	32-bit build:
39*2139Sjp161948!		23%  faster than cc-5.2 -xarch=v8plus -xO5
40*2139Sjp161948!		115% faster than gcc-3.2.1 -m32 -mcpu=ultrasparc -O5
41*2139Sjp161948!	64-bit build:
42*2139Sjp161948!		50%  faster than cc-5.2 -xarch=v9 -xO5
43*2139Sjp161948!		100% faster than gcc-3.2.1 -m64 -mcpu=ultrasparc -O5
44*2139Sjp161948!
45*2139Sjp161948
46*2139Sjp161948.ident "des_enc.m4 2.1"
47*2139Sjp161948
! ABI selection: ABI64 is set when the compiler reports a 64-bit SPARC
! target (Sun cc with __sparcv9, or gcc with __arch64__).
48*2139Sjp161948#if defined(__SUNPRO_C) && defined(__sparcv9)
49*2139Sjp161948# define ABI64  /* They've said -xarch=v9 at command line */
50*2139Sjp161948#elif defined(__GNUC__) && defined(__arch64__)
51*2139Sjp161948# define ABI64  /* They've said -m64 at command line */
52*2139Sjp161948#endif
53*2139Sjp161948
! Per-ABI constants. The 64-bit branch declares %g2 and %g3 as scratch,
! selects ldx/stx for pointer loads and stores, uses 8-byte argument
! slots and turns the UltraSPARC-specific code paths on when they are
! not already on. The 32-bit branch uses ld/st and 4-byte argument slots.
! NOTE(review): the frame size, stack bias and first-argument offset are
! presumably the values required by the respective SPARC ABI. Their users
! are outside the section reviewed here, so confirm before changing them.
54*2139Sjp161948#ifdef ABI64
55*2139Sjp161948  .register	%g2,#scratch
56*2139Sjp161948  .register	%g3,#scratch
57*2139Sjp161948# define	FRAME	-192
58*2139Sjp161948# define	BIAS	2047
59*2139Sjp161948# define	LDPTR	ldx
60*2139Sjp161948# define	STPTR	stx
61*2139Sjp161948# define	ARG0	128
62*2139Sjp161948# define	ARGSZ	8
63*2139Sjp161948# ifndef OPENSSL_SYSNAME_ULTRASPARC
64*2139Sjp161948# define OPENSSL_SYSNAME_ULTRASPARC
65*2139Sjp161948# endif
66*2139Sjp161948#else
67*2139Sjp161948# define	FRAME	-96
68*2139Sjp161948# define	BIAS	0
69*2139Sjp161948# define	LDPTR	ld
70*2139Sjp161948# define	STPTR	st
71*2139Sjp161948# define	ARG0	68
72*2139Sjp161948# define	ARGSZ	4
73*2139Sjp161948#endif
74*2139Sjp161948
! Loop count constant.
! NOTE(review): the macros reviewed here read their loop counter from the
! constant table instead, so the user of this value is outside this section.
75*2139Sjp161948#define LOOPS 7
76*2139Sjp161948
! Symbolic names for the global registers %g0 to %g5.
77*2139Sjp161948#define global0 %g0
78*2139Sjp161948#define global1 %g1
79*2139Sjp161948#define global2 %g2
80*2139Sjp161948#define global3 %g3
81*2139Sjp161948#define global4 %g4
82*2139Sjp161948#define global5 %g5
83*2139Sjp161948
! Symbolic names for the local registers.
! NOTE(review): the last two aliases are crossed relative to the register
! numbers (the name ending in 7 maps to %l6 and the name ending in 6 maps
! to %l7). Presumably intentional, so do not reorder without checking
! every user.
84*2139Sjp161948#define local0 %l0
85*2139Sjp161948#define local1 %l1
86*2139Sjp161948#define local2 %l2
87*2139Sjp161948#define local3 %l3
88*2139Sjp161948#define local4 %l4
89*2139Sjp161948#define local5 %l5
90*2139Sjp161948#define local7 %l6
91*2139Sjp161948#define local6 %l7
92*2139Sjp161948
! Symbolic names for the in registers %i0 to %i7.
93*2139Sjp161948#define in0 %i0
94*2139Sjp161948#define in1 %i1
95*2139Sjp161948#define in2 %i2
96*2139Sjp161948#define in3 %i3
97*2139Sjp161948#define in4 %i4
98*2139Sjp161948#define in5 %i5
99*2139Sjp161948#define in6 %i6
100*2139Sjp161948#define in7 %i7
101*2139Sjp161948
! Symbolic names for the out registers %o0 to %o7.
102*2139Sjp161948#define out0 %o0
103*2139Sjp161948#define out1 %o1
104*2139Sjp161948#define out2 %o2
105*2139Sjp161948#define out3 %o3
106*2139Sjp161948#define out4 %o4
107*2139Sjp161948#define out5 %o5
108*2139Sjp161948#define out6 %o6
109*2139Sjp161948#define out7 %o7
110*2139Sjp161948
! Alias for the store-byte mnemonic.
111*2139Sjp161948#define stub stb
112*2139Sjp161948
! From the next line on, m4 quoting uses curly braces. Comments placed
! inside macro bodies must therefore keep braces balanced.
113*2139Sjp161948changequote({,})
114*2139Sjp161948
115*2139Sjp161948
116*2139Sjp161948! Macro definitions:
117*2139Sjp161948
118*2139Sjp161948
119*2139Sjp161948! {ip_macro}
120*2139Sjp161948!
121*2139Sjp161948! The logic used in initial and final permutations is the same as in
122*2139Sjp161948! the C code. The permutations are done with a clever shift, xor and
123*2139Sjp161948! and-with-mask technique.
124*2139Sjp161948!
125*2139Sjp161948! The macro also loads address sbox 2 to 5 to global 2 to 5 (relative to
126*2139Sjp161948! sbox 1 in global 1), address sbox 6 to local6, and address sbox 8 to out3.
127*2139Sjp161948!
128*2139Sjp161948! Rotates the halves 3 left to bring the sbox bits in convenient positions.
129*2139Sjp161948!
130*2139Sjp161948! Loads key first round from address in parameter 5 to out0, out1.
131*2139Sjp161948!
132*2139Sjp161948! After the original LibDES initial permutation, the resulting left
133*2139Sjp161948! is in the variable initially used for right and vice versa. The macro
134*2139Sjp161948! implements the possibility to keep the halves in the original registers.
135*2139Sjp161948!
136*2139Sjp161948! parameter 1  left
137*2139Sjp161948! parameter 2  right
138*2139Sjp161948! parameter 3  result left (modify in first round)
139*2139Sjp161948! parameter 4  result right (use in first round)
140*2139Sjp161948! parameter 5  key address
141*2139Sjp161948! parameter 6  1/2 for include encryption/decryption
142*2139Sjp161948! parameter 7  1 for move in1 to in3
143*2139Sjp161948! parameter 8  1 for move in3 to in4, 2 for move in4 to in3
144*2139Sjp161948! parameter 9  1 for load ks3 and ks2 to in4 and in3
145*2139Sjp161948
146*2139Sjp161948define(ip_macro, {
147*2139Sjp161948
148*2139Sjp161948! {ip_macro}
149*2139Sjp161948! $1 $2 $4 $3 $5 $6 $7 $8 $9
!
! Five swap steps (srl by 4, 16, 2, 8 and 1, each with its own mask) are
! applied to the two halves, then both halves are rotated left by 3.
! The masks for four of the steps and the loop counter are read from the
! table addressed by out2. global1 is used as the sbox base and is not
! set here (the sethi/or that would set it are replaced by nop).
150*2139Sjp161948
151*2139Sjp161948	ld	[out2+256], local1	! mask for the srl 4 step
152*2139Sjp161948	srl	$2, 4, local4
153*2139Sjp161948
154*2139Sjp161948	xor	local4, $1, local4
155*2139Sjp161948	ifelse($7,1,{mov in1, in3},{nop})
156*2139Sjp161948
157*2139Sjp161948	ld	[out2+260], local2	! mask for the srl 16 step
158*2139Sjp161948	and	local4, local1, local4
159*2139Sjp161948	ifelse($8,1,{mov in3, in4},{})
160*2139Sjp161948	ifelse($8,2,{mov in4, in3},{})
161*2139Sjp161948
162*2139Sjp161948	ld	[out2+280], out4          ! loop counter
163*2139Sjp161948	sll	local4, 4, local1
164*2139Sjp161948	xor	$1, local4, $1
165*2139Sjp161948
166*2139Sjp161948	ld	[out2+264], local3	! mask for the srl 2 step
167*2139Sjp161948	srl	$1, 16, local4
168*2139Sjp161948	xor	$2, local1, $2
169*2139Sjp161948
170*2139Sjp161948	ifelse($9,1,{LDPTR	KS3, in4},{})
171*2139Sjp161948	xor	local4, $2, local4
172*2139Sjp161948	nop	!sethi	%hi(DES_SPtrans), global1 ! sbox addr
173*2139Sjp161948
174*2139Sjp161948	ifelse($9,1,{LDPTR	KS2, in3},{})
175*2139Sjp161948	and	local4, local2, local4
176*2139Sjp161948	nop	!or	global1, %lo(DES_SPtrans), global1   ! sbox addr
177*2139Sjp161948
178*2139Sjp161948	sll	local4, 16, local1
179*2139Sjp161948	xor	$2, local4, $2
180*2139Sjp161948
181*2139Sjp161948	srl	$2, 2, local4
182*2139Sjp161948	xor	$1, local1, $1
183*2139Sjp161948
184*2139Sjp161948	sethi	%hi(16711680), local5	! upper part of 0x00ff00ff
185*2139Sjp161948	xor	local4, $1, local4
186*2139Sjp161948
187*2139Sjp161948	and	local4, local3, local4
188*2139Sjp161948	or	local5, 255, local5	! local5 is now 0x00ff00ff, mask for the srl 8 step
189*2139Sjp161948
190*2139Sjp161948	sll	local4, 2, local2
191*2139Sjp161948	xor	$1, local4, $1
192*2139Sjp161948
193*2139Sjp161948	srl	$1, 8, local4
194*2139Sjp161948	xor	$2, local2, $2
195*2139Sjp161948
196*2139Sjp161948	xor	local4, $2, local4
197*2139Sjp161948	add	global1, 768, global4	! address sbox 4
198*2139Sjp161948
199*2139Sjp161948	and	local4, local5, local4
200*2139Sjp161948	add	global1, 1024, global5	! address sbox 5
201*2139Sjp161948
202*2139Sjp161948	ld	[out2+272], local7	! mask for the srl 1 step
203*2139Sjp161948	sll	local4, 8, local1
204*2139Sjp161948	xor	$2, local4, $2
205*2139Sjp161948
206*2139Sjp161948	srl	$2, 1, local4
207*2139Sjp161948	xor	$1, local1, $1
208*2139Sjp161948
209*2139Sjp161948	ld	[$5], out0                ! key 7531
210*2139Sjp161948	xor	local4, $1, local4
211*2139Sjp161948	add	global1, 256, global2	! address sbox 2
212*2139Sjp161948
213*2139Sjp161948	ld	[$5+4], out1              ! key 8642
214*2139Sjp161948	and	local4, local7, local4
215*2139Sjp161948	add	global1, 512, global3	! address sbox 3
216*2139Sjp161948
217*2139Sjp161948	sll	local4, 1, local1
218*2139Sjp161948	xor	$1, local4, $1
219*2139Sjp161948
220*2139Sjp161948	sll	$1, 3, local3
221*2139Sjp161948	xor	$2, local1, $2
222*2139Sjp161948
223*2139Sjp161948	sll	$2, 3, local2
224*2139Sjp161948	add	global1, 1280, local6     ! address sbox 6
225*2139Sjp161948
226*2139Sjp161948	srl	$1, 29, local4
227*2139Sjp161948	add	global1, 1792, out3       ! address sbox 8
228*2139Sjp161948
229*2139Sjp161948	srl	$2, 29, local1
230*2139Sjp161948	or	local4, local3, $4	! result right is the first half rotated left 3
231*2139Sjp161948
232*2139Sjp161948	or	local2, local1, $3	! result left is the second half rotated left 3
233*2139Sjp161948
234*2139Sjp161948	ifelse($6, 1, {
235*2139Sjp161948
236*2139Sjp161948		ld	[out2+284], local5     ! 0x0000FC00 used in the rounds
237*2139Sjp161948		or	local2, local1, $3
238*2139Sjp161948		xor	$4, out0, local1
239*2139Sjp161948
240*2139Sjp161948		call .des_enc.1
241*2139Sjp161948		and	local1, 252, local1	! delay slot of the call
242*2139Sjp161948
243*2139Sjp161948	},{})
244*2139Sjp161948
245*2139Sjp161948	ifelse($6, 2, {
246*2139Sjp161948
247*2139Sjp161948		ld	[out2+284], local5     ! 0x0000FC00 used in the rounds
248*2139Sjp161948		or	local2, local1, $3
249*2139Sjp161948		xor	$4, out0, local1
250*2139Sjp161948
251*2139Sjp161948		call .des_dec.1
252*2139Sjp161948		and	local1, 252, local1	! delay slot of the call
253*2139Sjp161948
254*2139Sjp161948	},{})
255*2139Sjp161948})
256*2139Sjp161948
257*2139Sjp161948
258*2139Sjp161948! {rounds_macro}
259*2139Sjp161948!
260*2139Sjp161948! The logic used in the DES rounds is the same as in the C code,
261*2139Sjp161948! except that calculations for sbox 1 and sbox 5 begin before
262*2139Sjp161948! the previous round is finished.
263*2139Sjp161948!
264*2139Sjp161948! In each round one half (work) is modified based on key and the
265*2139Sjp161948! other half (use).
266*2139Sjp161948!
267*2139Sjp161948! In this version we do two rounds in a loop repeated 7 times
268*2139Sjp161948! and two rounds separately.
269*2139Sjp161948!
270*2139Sjp161948! One half has the bits for the sboxes in the following positions:
271*2139Sjp161948!
272*2139Sjp161948!	777777xx555555xx333333xx111111xx
273*2139Sjp161948!
274*2139Sjp161948!	88xx666666xx444444xx222222xx8888
275*2139Sjp161948!
276*2139Sjp161948! The bits for each sbox are xor-ed with the key bits for that box.
277*2139Sjp161948! The above xx bits are cleared, and the result used for lookup in
278*2139Sjp161948! the sbox table. Each sbox entry contains the 4 output bits permuted
279*2139Sjp161948! into 32 bits according to the P permutation.
280*2139Sjp161948!
281*2139Sjp161948! In the description of DES, left and right are switched after
282*2139Sjp161948! each round, except after last round. In this code the original
283*2139Sjp161948! left and right are kept in the same register in all rounds, meaning
284*2139Sjp161948! that after the 16 rounds the result for right is in the register
285*2139Sjp161948! originally used for left.
286*2139Sjp161948!
287*2139Sjp161948! parameter 1  first work (left in first round)
288*2139Sjp161948! parameter 2  first use (right in first round)
289*2139Sjp161948! parameter 3  enc/dec  1/-1
290*2139Sjp161948! parameter 4  loop label
291*2139Sjp161948! parameter 5  key address register
292*2139Sjp161948! parameter 6  optional address for key next encryption/decryption
293*2139Sjp161948! parameter 7  not empty for include retl
294*2139Sjp161948!
295*2139Sjp161948! also compares in2 to 8
296*2139Sjp161948
297*2139Sjp161948define(rounds_macro, {
298*2139Sjp161948
299*2139Sjp161948! {rounds_macro}
300*2139Sjp161948! $1 $2 $3 $4 $5 $6 $7 $8 $9
!
! Structure: a prologue computes the sbox 1 offset for the first round and
! branches to the 32-byte aligned loop label. Each pass of the loop does
! two rounds and decrements out4 (subcc ... bne). After the loop two more
! rounds are emitted straight-line. local7 and out0/out1 carry the round
! keys, which are read at multiples of 8 bytes from the key pointer in the
! direction selected by the enc/dec parameter.
301*2139Sjp161948
302*2139Sjp161948	xor	$2, out0, local1
303*2139Sjp161948
304*2139Sjp161948	ld	[out2+284], local5        ! 0x0000FC00
305*2139Sjp161948	ba	$4
306*2139Sjp161948	and	local1, 252, local1	! delay slot of ba: sbox 1 offset
307*2139Sjp161948
308*2139Sjp161948	.align 32
309*2139Sjp161948
310*2139Sjp161948$4:
311*2139Sjp161948	! local6 is address sbox 6
312*2139Sjp161948	! out3   is address sbox 8
313*2139Sjp161948	! out4   is loop counter
314*2139Sjp161948
315*2139Sjp161948	ld	[global1+local1], local1
316*2139Sjp161948	xor	$2, out1, out1            ! 8642
317*2139Sjp161948	xor	$2, out0, out0            ! 7531
318*2139Sjp161948	fmovs	%f0, %f0                  ! FP register self-move, used for alignment
319*2139Sjp161948
320*2139Sjp161948	srl	out1, 4, local0           ! rotate 4 right
321*2139Sjp161948	and	out0, local5, local3      ! 3
322*2139Sjp161948	fmovs	%f0, %f0
323*2139Sjp161948
324*2139Sjp161948	ld	[$5+$3*8], local7         ! key 7531 next round
325*2139Sjp161948	srl	local3, 8, local3         ! 3
326*2139Sjp161948	and	local0, 252, local2       ! 2
327*2139Sjp161948	fmovs	%f0, %f0
328*2139Sjp161948
329*2139Sjp161948	ld	[global3+local3],local3   ! 3
330*2139Sjp161948	sll	out1, 28, out1            ! rotate
331*2139Sjp161948	xor	$1, local1, $1            ! 1 finished, local1 now sbox 7
332*2139Sjp161948
333*2139Sjp161948	ld	[global2+local2], local2  ! 2
334*2139Sjp161948	srl	out0, 24, local1          ! 7
335*2139Sjp161948	or	out1, local0, out1        ! rotate
336*2139Sjp161948
337*2139Sjp161948	ldub	[out2+local1], local1     ! 7 (and 0xFC)
338*2139Sjp161948	srl	out1, 24, local0          ! 8
339*2139Sjp161948	and	out1, local5, local4      ! 4
340*2139Sjp161948
341*2139Sjp161948	ldub	[out2+local0], local0     ! 8 (and 0xFC)
342*2139Sjp161948	srl	local4, 8, local4         ! 4
343*2139Sjp161948	xor	$1, local2, $1            ! 2 finished local2 now sbox 6
344*2139Sjp161948
345*2139Sjp161948	ld	[global4+local4],local4   ! 4
346*2139Sjp161948	srl	out1, 16, local2          ! 6
347*2139Sjp161948	xor	$1, local3, $1            ! 3 finished local3 now sbox 5
348*2139Sjp161948
349*2139Sjp161948	ld	[out3+local0],local0      ! 8
350*2139Sjp161948	and	local2, 252, local2       ! 6
351*2139Sjp161948	add	global1, 1536, local5     ! address sbox 7
352*2139Sjp161948
353*2139Sjp161948	ld	[local6+local2], local2   ! 6
354*2139Sjp161948	srl	out0, 16, local3          ! 5
355*2139Sjp161948	xor	$1, local4, $1            ! 4 finished
356*2139Sjp161948
357*2139Sjp161948	ld	[local5+local1],local1    ! 7
358*2139Sjp161948	and	local3, 252, local3       ! 5
359*2139Sjp161948	xor	$1, local0, $1            ! 8 finished
360*2139Sjp161948
361*2139Sjp161948	ld	[global5+local3],local3   ! 5
362*2139Sjp161948	xor	$1, local2, $1            ! 6 finished
363*2139Sjp161948	subcc	out4, 1, out4
364*2139Sjp161948
365*2139Sjp161948	ld	[$5+$3*8+4], out0         ! key 8642 next round
366*2139Sjp161948	xor	$1, local7, local2        ! sbox 5 next round
367*2139Sjp161948	xor	$1, local1, $1            ! 7 finished
368*2139Sjp161948
369*2139Sjp161948	srl	local2, 16, local2        ! sbox 5 next round
370*2139Sjp161948	xor	$1, local3, $1            ! 5 finished
371*2139Sjp161948
372*2139Sjp161948	ld	[$5+$3*16+4], out1        ! key 8642 next round again
373*2139Sjp161948	and	local2, 252, local2       ! sbox5 next round
374*2139Sjp161948! next round
375*2139Sjp161948	xor	$1, local7, local7        ! 7531
376*2139Sjp161948
377*2139Sjp161948	ld	[global5+local2], local2  ! 5
378*2139Sjp161948	srl	local7, 24, local3        ! 7
379*2139Sjp161948	xor	$1, out0, out0            ! 8642
380*2139Sjp161948
381*2139Sjp161948	ldub	[out2+local3], local3     ! 7 (and 0xFC)
382*2139Sjp161948	srl	out0, 4, local0           ! rotate 4 right
383*2139Sjp161948	and	local7, 252, local1       ! 1
384*2139Sjp161948
385*2139Sjp161948	sll	out0, 28, out0            ! rotate
386*2139Sjp161948	xor	$2, local2, $2            ! 5 finished local2 used
387*2139Sjp161948
388*2139Sjp161948	srl	local0, 8, local4         ! 4
389*2139Sjp161948	and	local0, 252, local2       ! 2
390*2139Sjp161948	ld	[local5+local3], local3   ! 7
391*2139Sjp161948
392*2139Sjp161948	srl	local0, 16, local5        ! 6
393*2139Sjp161948	or	out0, local0, out0        ! rotate
394*2139Sjp161948	ld	[global2+local2], local2  ! 2
395*2139Sjp161948
396*2139Sjp161948	srl	out0, 24, local0
397*2139Sjp161948	ld	[$5+$3*16], out0          ! key 7531 next round
398*2139Sjp161948	and	local4, 252, local4	  ! 4
399*2139Sjp161948
400*2139Sjp161948	and	local5, 252, local5       ! 6
401*2139Sjp161948	ld	[global4+local4], local4  ! 4
402*2139Sjp161948	xor	$2, local3, $2            ! 7 finished local3 used
403*2139Sjp161948
404*2139Sjp161948	and	local0, 252, local0       ! 8
405*2139Sjp161948	ld	[local6+local5], local5   ! 6
406*2139Sjp161948	xor	$2, local2, $2            ! 2 finished local2 now sbox 3
407*2139Sjp161948
408*2139Sjp161948	srl	local7, 8, local2         ! 3 start
409*2139Sjp161948	ld	[out3+local0], local0     ! 8
410*2139Sjp161948	xor	$2, local4, $2            ! 4 finished
411*2139Sjp161948
412*2139Sjp161948	and	local2, 252, local2       ! 3
413*2139Sjp161948	ld	[global1+local1], local1  ! 1
414*2139Sjp161948	xor	$2, local5, $2            ! 6 finished local5 used
415*2139Sjp161948
416*2139Sjp161948	ld	[global3+local2], local2  ! 3
417*2139Sjp161948	xor	$2, local0, $2            ! 8 finished
418*2139Sjp161948	add	$5, $3*16, $5             ! enc add 16, dec add -16 to key pointer (two rounds, 8 key bytes each)
419*2139Sjp161948
420*2139Sjp161948	ld	[out2+284], local5        ! 0x0000FC00
421*2139Sjp161948	xor	$2, out0, local4          ! sbox 1 next round
422*2139Sjp161948	xor	$2, local1, $2            ! 1 finished
423*2139Sjp161948
424*2139Sjp161948	xor	$2, local2, $2            ! 3 finished
425*2139Sjp161948#ifdef OPENSSL_SYSNAME_ULTRASPARC
426*2139Sjp161948	bne,pt	%icc, $4
427*2139Sjp161948#else
428*2139Sjp161948	bne	$4
429*2139Sjp161948#endif
430*2139Sjp161948	and	local4, 252, local1       ! sbox 1 next round
431*2139Sjp161948
432*2139Sjp161948! two rounds more:
433*2139Sjp161948
434*2139Sjp161948	ld	[global1+local1], local1
435*2139Sjp161948	xor	$2, out1, out1
436*2139Sjp161948	xor	$2, out0, out0
437*2139Sjp161948
438*2139Sjp161948	srl	out1, 4, local0           ! rotate
439*2139Sjp161948	and	out0, local5, local3
440*2139Sjp161948
441*2139Sjp161948	ld	[$5+$3*8], local7         ! key 7531
442*2139Sjp161948	srl	local3, 8, local3
443*2139Sjp161948	and	local0, 252, local2
444*2139Sjp161948
445*2139Sjp161948	ld	[global3+local3],local3
446*2139Sjp161948	sll	out1, 28, out1            ! rotate
447*2139Sjp161948	xor	$1, local1, $1            ! 1 finished, local1 now sbox 7
448*2139Sjp161948
449*2139Sjp161948	ld	[global2+local2], local2
450*2139Sjp161948	srl	out0, 24, local1
451*2139Sjp161948	or	out1, local0, out1        ! rotate
452*2139Sjp161948
453*2139Sjp161948	ldub	[out2+local1], local1
454*2139Sjp161948	srl	out1, 24, local0
455*2139Sjp161948	and	out1, local5, local4
456*2139Sjp161948
457*2139Sjp161948	ldub	[out2+local0], local0
458*2139Sjp161948	srl	local4, 8, local4
459*2139Sjp161948	xor	$1, local2, $1            ! 2 finished local2 now sbox 6
460*2139Sjp161948
461*2139Sjp161948	ld	[global4+local4],local4
462*2139Sjp161948	srl	out1, 16, local2
463*2139Sjp161948	xor	$1, local3, $1            ! 3 finished local3 now sbox 5
464*2139Sjp161948
465*2139Sjp161948	ld	[out3+local0],local0
466*2139Sjp161948	and	local2, 252, local2
467*2139Sjp161948	add	global1, 1536, local5     ! address sbox 7
468*2139Sjp161948
469*2139Sjp161948	ld	[local6+local2], local2
470*2139Sjp161948	srl	out0, 16, local3
471*2139Sjp161948	xor	$1, local4, $1            ! 4 finished
472*2139Sjp161948
473*2139Sjp161948	ld	[local5+local1],local1
474*2139Sjp161948	and	local3, 252, local3
475*2139Sjp161948	xor	$1, local0, $1
476*2139Sjp161948
477*2139Sjp161948	ld	[global5+local3],local3
478*2139Sjp161948	xor	$1, local2, $1            ! 6 finished
479*2139Sjp161948	cmp	in2, 8	! no later instruction in this macro changes the condition codes
480*2139Sjp161948
481*2139Sjp161948	ifelse($6,{}, {}, {ld	[out2+280], out4})  ! loop counter
482*2139Sjp161948	xor	$1, local7, local2        ! sbox 5 next round
483*2139Sjp161948	xor	$1, local1, $1            ! 7 finished
484*2139Sjp161948
485*2139Sjp161948	ld	[$5+$3*8+4], out0
486*2139Sjp161948	srl	local2, 16, local2        ! sbox 5 next round
487*2139Sjp161948	xor	$1, local3, $1            ! 5 finished
488*2139Sjp161948
489*2139Sjp161948	and	local2, 252, local2
490*2139Sjp161948! next round (two rounds more)
491*2139Sjp161948	xor	$1, local7, local7        ! 7531
492*2139Sjp161948
493*2139Sjp161948	ld	[global5+local2], local2
494*2139Sjp161948	srl	local7, 24, local3
495*2139Sjp161948	xor	$1, out0, out0            ! 8642
496*2139Sjp161948
497*2139Sjp161948	ldub	[out2+local3], local3
498*2139Sjp161948	srl	out0, 4, local0           ! rotate
499*2139Sjp161948	and	local7, 252, local1
500*2139Sjp161948
501*2139Sjp161948	sll	out0, 28, out0            ! rotate
502*2139Sjp161948	xor	$2, local2, $2            ! 5 finished local2 used
503*2139Sjp161948
504*2139Sjp161948	srl	local0, 8, local4
505*2139Sjp161948	and	local0, 252, local2
506*2139Sjp161948	ld	[local5+local3], local3
507*2139Sjp161948
508*2139Sjp161948	srl	local0, 16, local5
509*2139Sjp161948	or	out0, local0, out0        ! rotate
510*2139Sjp161948	ld	[global2+local2], local2
511*2139Sjp161948
512*2139Sjp161948	srl	out0, 24, local0
513*2139Sjp161948	ifelse($6,{}, {}, {ld	[$6], out0})   ! key next encryption/decryption
514*2139Sjp161948	and	local4, 252, local4
515*2139Sjp161948
516*2139Sjp161948	and	local5, 252, local5
517*2139Sjp161948	ld	[global4+local4], local4
518*2139Sjp161948	xor	$2, local3, $2            ! 7 finished local3 used
519*2139Sjp161948
520*2139Sjp161948	and	local0, 252, local0
521*2139Sjp161948	ld	[local6+local5], local5
522*2139Sjp161948	xor	$2, local2, $2            ! 2 finished local2 now sbox 3
523*2139Sjp161948
524*2139Sjp161948	srl	local7, 8, local2         ! 3 start
525*2139Sjp161948	ld	[out3+local0], local0
526*2139Sjp161948	xor	$2, local4, $2
527*2139Sjp161948
528*2139Sjp161948	and	local2, 252, local2
529*2139Sjp161948	ld	[global1+local1], local1
530*2139Sjp161948	xor	$2, local5, $2            ! 6 finished local5 used
531*2139Sjp161948
532*2139Sjp161948	ld	[global3+local2], local2
533*2139Sjp161948	srl	$1, 3, local3	! first work half moved 3 right, left in local3 for the code that follows
534*2139Sjp161948	xor	$2, local0, $2
535*2139Sjp161948
536*2139Sjp161948	ifelse($6,{}, {}, {ld	[$6+4], out1}) ! key next encryption/decryption
537*2139Sjp161948	sll	$1, 29, local4	! first work half moved 29 left, left in local4 for the code that follows
538*2139Sjp161948	xor	$2, local1, $2
539*2139Sjp161948
540*2139Sjp161948	ifelse($7,{}, {}, {retl})
541*2139Sjp161948	xor	$2, local2, $2	! when retl is emitted this is its delay slot
542*2139Sjp161948})
543*2139Sjp161948
544*2139Sjp161948
545*2139Sjp161948! {fp_macro}
546*2139Sjp161948!
547*2139Sjp161948!  parameter 1   right (original left)
548*2139Sjp161948!  parameter 2   left (original right)
549*2139Sjp161948!  parameter 3   1 for optional store to [in0]
550*2139Sjp161948!  parameter 4   1 for load input/output address to local5/7
551*2139Sjp161948!
552*2139Sjp161948!  The final permutation logic switches the halves, meaning that
553*2139Sjp161948!  left and right end up in the registers originally used.
554*2139Sjp161948
555*2139Sjp161948define(fp_macro, {
556*2139Sjp161948
557*2139Sjp161948! {fp_macro}
558*2139Sjp161948! $1 $2 $3 $4 $5 $6 $7 $8 $9
!
! Undoes the rotate by 3, then applies five swap steps with srl amounts
! 1, 8, 2, 16 and 4 and the masks 0x55555555, 0x00ff00ff, 0x33333333,
! 0x0000ffff and 0x0f0f0f0f, all built here with sethi/or.
! local1 to local4 are used as scratch. The optional pointer loads write
! local5 and local7. INPUT and OUTPUT are not set in this part of the file.
559*2139Sjp161948
560*2139Sjp161948	! initially undo the rotate 3 left done after initial permutation
561*2139Sjp161948	! original left is received shifted 3 right and 29 left in local3/4
562*2139Sjp161948
563*2139Sjp161948	sll	$2, 29, local1
564*2139Sjp161948	or	local3, local4, $1
565*2139Sjp161948
566*2139Sjp161948	srl	$2, 3, $2
567*2139Sjp161948	sethi	%hi(0x55555555), local2
568*2139Sjp161948
569*2139Sjp161948	or	$2, local1, $2
570*2139Sjp161948	or	local2, %lo(0x55555555), local2
571*2139Sjp161948
572*2139Sjp161948	srl	$2, 1, local3	! swap step 1: srl 1, mask 0x55555555
573*2139Sjp161948	sethi	%hi(0x00ff00ff), local1
574*2139Sjp161948	xor	local3, $1, local3
575*2139Sjp161948	or	local1, %lo(0x00ff00ff), local1
576*2139Sjp161948	and	local3, local2, local3
577*2139Sjp161948	sethi	%hi(0x33333333), local4
578*2139Sjp161948	sll	local3, 1, local2
579*2139Sjp161948
580*2139Sjp161948	xor	$1, local3, $1
581*2139Sjp161948
582*2139Sjp161948	srl	$1, 8, local3	! swap step 2: srl 8, mask 0x00ff00ff
583*2139Sjp161948	xor	$2, local2, $2
584*2139Sjp161948	xor	local3, $2, local3
585*2139Sjp161948	or	local4, %lo(0x33333333), local4
586*2139Sjp161948	and	local3, local1, local3
587*2139Sjp161948	sethi	%hi(0x0000ffff), local1
588*2139Sjp161948	sll	local3, 8, local2
589*2139Sjp161948
590*2139Sjp161948	xor	$2, local3, $2
591*2139Sjp161948
592*2139Sjp161948	srl	$2, 2, local3	! swap step 3: srl 2, mask 0x33333333
593*2139Sjp161948	xor	$1, local2, $1
594*2139Sjp161948	xor	local3, $1, local3
595*2139Sjp161948	or	local1, %lo(0x0000ffff), local1
596*2139Sjp161948	and	local3, local4, local3
597*2139Sjp161948	sethi	%hi(0x0f0f0f0f), local4
598*2139Sjp161948	sll	local3, 2, local2
599*2139Sjp161948
600*2139Sjp161948	ifelse($4,1, {LDPTR INPUT, local5})
601*2139Sjp161948	xor	$1, local3, $1
602*2139Sjp161948
603*2139Sjp161948	ifelse($4,1, {LDPTR OUTPUT, local7})
604*2139Sjp161948	srl	$1, 16, local3	! swap step 4: srl 16, mask 0x0000ffff
605*2139Sjp161948	xor	$2, local2, $2
606*2139Sjp161948	xor	local3, $2, local3
607*2139Sjp161948	or	local4, %lo(0x0f0f0f0f), local4
608*2139Sjp161948	and	local3, local1, local3
609*2139Sjp161948	sll	local3, 16, local2
610*2139Sjp161948
611*2139Sjp161948	xor	$2, local3, local1	! intermediate kept in local1, written back by the last xor
612*2139Sjp161948
613*2139Sjp161948	srl	local1, 4, local3	! swap step 5: srl 4, mask 0x0f0f0f0f
614*2139Sjp161948	xor	$1, local2, $1
615*2139Sjp161948	xor	local3, $1, local3
616*2139Sjp161948	and	local3, local4, local3
617*2139Sjp161948	sll	local3, 4, local2
618*2139Sjp161948
619*2139Sjp161948	xor	$1, local3, $1
620*2139Sjp161948
621*2139Sjp161948	! optional store:
622*2139Sjp161948
623*2139Sjp161948	ifelse($3,1, {st $1, [in0]})
624*2139Sjp161948
625*2139Sjp161948	xor	local1, local2, $2
626*2139Sjp161948
627*2139Sjp161948	ifelse($3,1, {st $2, [in0+4]})
628*2139Sjp161948
629*2139Sjp161948})
630*2139Sjp161948
631*2139Sjp161948
632*2139Sjp161948! {fp_ip_macro}
633*2139Sjp161948!
634*2139Sjp161948! Does initial permutation for next block mixed with
635*2139Sjp161948! final permutation for current block.
636*2139Sjp161948!
637*2139Sjp161948! parameter 1   original left
638*2139Sjp161948! parameter 2   original right
639*2139Sjp161948! parameter 3   left ip
640*2139Sjp161948! parameter 4   right ip
641*2139Sjp161948! parameter 5   1: load ks1/ks2 to in3/in4, add 120 to in4
642*2139Sjp161948!                2: mov in4 to in3
643*2139Sjp161948!
644*2139Sjp161948! also adds -8 to length in2 and loads loop counter to out4
645*2139Sjp161948
646*2139Sjp161948define(fp_ip_macro, {
647*2139Sjp161948
648*2139Sjp161948! {fp_ip_macro}
649*2139Sjp161948! $1 $2 $3 $4 $5 $6 $7 $8 $9
!
! Two instruction streams are interleaved. The final permutation stream
! works on parameters 1 and 2 with out4 and local3 as scratch, and the
! initial permutation stream works on parameters 3 and 4 with local0 and
! local7 as scratch. The swap masks are read from the table addressed by
! out2 at offsets 256, 260, 264, 268 and 272.
! The value that becomes parameter 2 is taken from out5, which is
! hard-wired here and is not a macro parameter.
650*2139Sjp161948
651*2139Sjp161948	define({temp1},{out4})
652*2139Sjp161948	define({temp2},{local3})
653*2139Sjp161948
654*2139Sjp161948	define({ip1},{local1})
655*2139Sjp161948	define({ip2},{local2})
656*2139Sjp161948	define({ip4},{local4})
657*2139Sjp161948	define({ip5},{local5})
658*2139Sjp161948
659*2139Sjp161948	! $1 in local3, local4
660*2139Sjp161948
661*2139Sjp161948	ld	[out2+256], ip1	! mask used by both srl 4 steps
662*2139Sjp161948	sll	out5, 29, temp1
663*2139Sjp161948	or	local3, local4, $1
664*2139Sjp161948
665*2139Sjp161948	srl	out5, 3, $2
666*2139Sjp161948	ifelse($5,2,{mov in4, in3})
667*2139Sjp161948
668*2139Sjp161948	ld	[out2+272], ip5	! mask used by both srl 1 steps
669*2139Sjp161948	srl	$4, 4, local0
670*2139Sjp161948	or	$2, temp1, $2
671*2139Sjp161948
672*2139Sjp161948	srl	$2, 1, temp1
673*2139Sjp161948	xor	temp1, $1, temp1
674*2139Sjp161948
675*2139Sjp161948	and	temp1, ip5, temp1
676*2139Sjp161948	xor	local0, $3, local0
677*2139Sjp161948
678*2139Sjp161948	sll	temp1, 1, temp2
679*2139Sjp161948	xor	$1, temp1, $1
680*2139Sjp161948
681*2139Sjp161948	and	local0, ip1, local0
682*2139Sjp161948	add	in2, -8, in2	! length counter minus one 8-byte block
683*2139Sjp161948
684*2139Sjp161948	sll	local0, 4, local7
685*2139Sjp161948	xor	$3, local0, $3
686*2139Sjp161948
687*2139Sjp161948	ld	[out2+268], ip4	! mask used by both srl 8 steps
688*2139Sjp161948	srl	$1, 8, temp1
689*2139Sjp161948	xor	$2, temp2, $2
690*2139Sjp161948	ld	[out2+260], ip2	! mask used by both srl 16 steps
691*2139Sjp161948	srl	$3, 16, local0
692*2139Sjp161948	xor	$4, local7, $4
693*2139Sjp161948	xor	temp1, $2, temp1
694*2139Sjp161948	xor	local0, $4, local0
695*2139Sjp161948	and	temp1, ip4, temp1
696*2139Sjp161948	and	local0, ip2, local0
697*2139Sjp161948	sll	temp1, 8, temp2
698*2139Sjp161948	xor	$2, temp1, $2
699*2139Sjp161948	sll	local0, 16, local7
700*2139Sjp161948	xor	$4, local0, $4
701*2139Sjp161948
702*2139Sjp161948	srl	$2, 2, temp1
703*2139Sjp161948	xor	$1, temp2, $1
704*2139Sjp161948
705*2139Sjp161948	ld	[out2+264], temp2         ! ip3
706*2139Sjp161948	srl	$4, 2, local0
707*2139Sjp161948	xor	$3, local7, $3
708*2139Sjp161948	xor	temp1, $1, temp1
709*2139Sjp161948	xor	local0, $3, local0
710*2139Sjp161948	and	temp1, temp2, temp1
711*2139Sjp161948	and	local0, temp2, local0
712*2139Sjp161948	sll	temp1, 2, temp2
713*2139Sjp161948	xor	$1, temp1, $1
714*2139Sjp161948	sll	local0, 2, local7
715*2139Sjp161948	xor	$3, local0, $3
716*2139Sjp161948
717*2139Sjp161948	srl	$1, 16, temp1
718*2139Sjp161948	xor	$2, temp2, $2
719*2139Sjp161948	srl	$3, 8, local0
720*2139Sjp161948	xor	$4, local7, $4
721*2139Sjp161948	xor	temp1, $2, temp1
722*2139Sjp161948	xor	local0, $4, local0
723*2139Sjp161948	and	temp1, ip2, temp1
724*2139Sjp161948	and	local0, ip4, local0
725*2139Sjp161948	sll	temp1, 16, temp2
726*2139Sjp161948	xor	$2, temp1, local4	! local4 is free from here: the srl 8 mask is not used again
727*2139Sjp161948	sll	local0, 8, local7
728*2139Sjp161948	xor	$4, local0, $4
729*2139Sjp161948
730*2139Sjp161948	srl	$4, 1, local0
731*2139Sjp161948	xor	$3, local7, $3
732*2139Sjp161948
733*2139Sjp161948	srl	local4, 4, temp1
734*2139Sjp161948	xor	local0, $3, local0
735*2139Sjp161948
736*2139Sjp161948	xor	$1, temp2, $1
737*2139Sjp161948	and	local0, ip5, local0
738*2139Sjp161948
739*2139Sjp161948	sll	local0, 1, local7
740*2139Sjp161948	xor	temp1, $1, temp1
741*2139Sjp161948
742*2139Sjp161948	xor	$3, local0, $3
743*2139Sjp161948	xor	$4, local7, $4
744*2139Sjp161948
745*2139Sjp161948	sll	$3, 3, local5	! local5 is free from here: the srl 1 mask is not used again
746*2139Sjp161948	and	temp1, ip1, temp1
747*2139Sjp161948
748*2139Sjp161948	sll	temp1, 4, temp2
749*2139Sjp161948	xor	$1, temp1, $1
750*2139Sjp161948
751*2139Sjp161948	ifelse($5,1,{LDPTR	KS2, in4})
752*2139Sjp161948	sll	$4, 3, local2
753*2139Sjp161948	xor	local4, temp2, $2
754*2139Sjp161948
755*2139Sjp161948	! reload since used as temporary:
756*2139Sjp161948
757*2139Sjp161948	ld	[out2+280], out4          ! loop counter
758*2139Sjp161948
759*2139Sjp161948	srl	$3, 29, local0
760*2139Sjp161948	ifelse($5,1,{add in4, 120, in4})
761*2139Sjp161948
762*2139Sjp161948	ifelse($5,1,{LDPTR	KS1, in3})
763*2139Sjp161948	srl	$4, 29, local7
764*2139Sjp161948
765*2139Sjp161948	or	local0, local5, $4	! rotate left 3 of the other ip half
766*2139Sjp161948	or	local2, local7, $3	! rotate left 3 of the other ip half
767*2139Sjp161948
768*2139Sjp161948})
769*2139Sjp161948
770*2139Sjp161948
771*2139Sjp161948
772*2139Sjp161948! {load_little_endian}
773*2139Sjp161948!
774*2139Sjp161948! parameter 1  address
775*2139Sjp161948! parameter 2  destination left
776*2139Sjp161948! parameter 3  destination right
777*2139Sjp161948! parameter 4  temporary
778*2139Sjp161948! parameter 5  label
779*2139Sjp161948
780*2139Sjp161948define(load_little_endian, {
781*2139Sjp161948
782*2139Sjp161948! {load_little_endian}
783*2139Sjp161948! $1 $2 $3 $4 $5 $6 $7 $8 $9
!
! Reads two 32-bit words from the address parameter so that the byte at
! the lowest address ends up least significant in each destination.
! UltraSPARC builds take a fast path with two lda instructions (address
! space 0x88) when the address is a multiple of 4. All other cases use the
! ldub sequence at the first emitted label. The second emitted label
! (the label parameter with an a appended) marks the common exit.
! The address register is left unchanged. The temporary is overwritten.
784*2139Sjp161948
785*2139Sjp161948	! first in memory to rightmost in register
786*2139Sjp161948
787*2139Sjp161948#ifdef OPENSSL_SYSNAME_ULTRASPARC
788*2139Sjp161948	andcc	$1, 3, global0	! test the two low address bits, result discarded
789*2139Sjp161948	bne,pn	%icc, $5	! not 4-byte aligned: use the ldub sequence
790*2139Sjp161948	nop
791*2139Sjp161948
792*2139Sjp161948	lda	[$1] 0x88, $2
793*2139Sjp161948	add	$1, 4, $4
794*2139Sjp161948
795*2139Sjp161948	ba,pt	%icc, $5a
796*2139Sjp161948	lda	[$4] 0x88, $3	! delay slot of ba: second word
797*2139Sjp161948#endif
798*2139Sjp161948
799*2139Sjp161948$5:
800*2139Sjp161948	ldub	[$1+3], $2	! byte 3 becomes the most significant byte
801*2139Sjp161948
802*2139Sjp161948	ldub	[$1+2], $4
803*2139Sjp161948	sll	$2, 8, $2
804*2139Sjp161948	or	$2, $4, $2
805*2139Sjp161948
806*2139Sjp161948	ldub	[$1+1], $4
807*2139Sjp161948	sll	$2, 8, $2
808*2139Sjp161948	or	$2, $4, $2
809*2139Sjp161948
810*2139Sjp161948	ldub	[$1+0], $4
811*2139Sjp161948	sll	$2, 8, $2
812*2139Sjp161948	or	$2, $4, $2
813*2139Sjp161948
814*2139Sjp161948
815*2139Sjp161948	ldub	[$1+3+4], $3	! same sequence for the second word
816*2139Sjp161948
817*2139Sjp161948	ldub	[$1+2+4], $4
818*2139Sjp161948	sll	$3, 8, $3
819*2139Sjp161948	or	$3, $4, $3
820*2139Sjp161948
821*2139Sjp161948	ldub	[$1+1+4], $4
822*2139Sjp161948	sll	$3, 8, $3
823*2139Sjp161948	or	$3, $4, $3
824*2139Sjp161948
825*2139Sjp161948	ldub	[$1+0+4], $4
826*2139Sjp161948	sll	$3, 8, $3
827*2139Sjp161948	or	$3, $4, $3
828*2139Sjp161948$5a:
829*2139Sjp161948
830*2139Sjp161948})
831*2139Sjp161948
832*2139Sjp161948
833*2139Sjp161948! {load_little_endian_inc}
834*2139Sjp161948!
835*2139Sjp161948! parameter 1  address
836*2139Sjp161948! parameter 2  destination left
837*2139Sjp161948! parameter 3  destination right
838*2139Sjp161948! parameter 4  temporary
839*2139Sjp161948! parameter 5  label
840*2139Sjp161948!
841*2139Sjp161948! adds 8 to address
842*2139Sjp161948
! Emits code that reads the eight bytes at the address held in parameter 1
! as two little-endian 32-bit values (bytes 0-3 into parameter 2, bytes
! 4-7 into parameter 3) and leaves the address register advanced by 8.
! Parameter 4 is a scratch register used by the byte-wise path only.
! Parameter 5 is used as the label of the byte-wise path and, with an "a"
! appended, as the exit label, so it has to be unique for each expansion.
! With OPENSSL_SYSNAME_ULTRASPARC the expansion first tests the low two
! address bits with andcc, which overwrites the integer condition codes.
! NOTE(review): ASI 0x88 is presumably the little-endian load/store
! feature mentioned in the file header - confirm against the V9 manual.
843*2139Sjp161948define(load_little_endian_inc, {
844*2139Sjp161948
845*2139Sjp161948! {load_little_endian_inc}
846*2139Sjp161948! $1 $2 $3 $4 $5 $6 $7 $8 $9
847*2139Sjp161948
848*2139Sjp161948	! first in memory to rightmost in register
849*2139Sjp161948
850*2139Sjp161948#ifdef OPENSSL_SYSNAME_ULTRASPARC
	! fast path: taken only when the address is 4-byte aligned
851*2139Sjp161948	andcc	$1, 3, global0
852*2139Sjp161948	bne,pn	%icc, $5
853*2139Sjp161948	nop
854*2139Sjp161948
	! the address register itself is stepped by 4 after each word,
	! the second step sits in the delay slot of the exit branch
855*2139Sjp161948	lda	[$1] 0x88, $2
856*2139Sjp161948	add	$1, 4, $1
857*2139Sjp161948
858*2139Sjp161948	lda	[$1] 0x88, $3
859*2139Sjp161948	ba,pt	%icc, $5a
860*2139Sjp161948	add	$1, 4, $1
861*2139Sjp161948#endif
862*2139Sjp161948
	! byte-wise path, highest byte of each word first
863*2139Sjp161948$5:
864*2139Sjp161948	ldub	[$1+3], $2
865*2139Sjp161948
866*2139Sjp161948	ldub	[$1+2], $4
867*2139Sjp161948	sll	$2, 8, $2
868*2139Sjp161948	or	$2, $4, $2
869*2139Sjp161948
870*2139Sjp161948	ldub	[$1+1], $4
871*2139Sjp161948	sll	$2, 8, $2
872*2139Sjp161948	or	$2, $4, $2
873*2139Sjp161948
874*2139Sjp161948	ldub	[$1+0], $4
875*2139Sjp161948	sll	$2, 8, $2
876*2139Sjp161948	or	$2, $4, $2
877*2139Sjp161948
	! the address is advanced by 8 here, so the remaining offsets
	! are written with -8 to keep pointing into the same block
878*2139Sjp161948	ldub	[$1+3+4], $3
879*2139Sjp161948	add	$1, 8, $1
880*2139Sjp161948
881*2139Sjp161948	ldub	[$1+2+4-8], $4
882*2139Sjp161948	sll	$3, 8, $3
883*2139Sjp161948	or	$3, $4, $3
884*2139Sjp161948
885*2139Sjp161948	ldub	[$1+1+4-8], $4
886*2139Sjp161948	sll	$3, 8, $3
887*2139Sjp161948	or	$3, $4, $3
888*2139Sjp161948
889*2139Sjp161948	ldub	[$1+0+4-8], $4
890*2139Sjp161948	sll	$3, 8, $3
891*2139Sjp161948	or	$3, $4, $3
892*2139Sjp161948$5a:
893*2139Sjp161948
894*2139Sjp161948})
895*2139Sjp161948
896*2139Sjp161948
897*2139Sjp161948! {load_n_bytes}
898*2139Sjp161948!
899*2139Sjp161948! Loads 1 to 7 bytes little endian
900*2139Sjp161948! Remaining bytes are zeroed.
901*2139Sjp161948!
902*2139Sjp161948! parameter 1  address
903*2139Sjp161948! parameter 2  length
904*2139Sjp161948! parameter 3  destination register left
905*2139Sjp161948! parameter 4  destination register right
906*2139Sjp161948! parameter 5  temp
907*2139Sjp161948! parameter 6  temp2
908*2139Sjp161948! parameter 7  label
909*2139Sjp161948! parameter 8  return label
910*2139Sjp161948
! Emits a position-independent computed jump that loads the first N bytes
! (N = parameter 2) from the address in parameter 1 and leaves all other
! bits of both destination registers zero.
! Bytes 0-3 are merged into parameter 4 at bit positions 0, 8, 16 and 24;
! bytes 4-6 are merged into parameter 3 at bit positions 0, 8 and 16.
! Parameter 5 and parameter 6 are scratch registers; %o7 is overwritten
! by the call instruction.
! Parameter 7 is a label prefix: the expansion creates the labels
! prefix.0 to prefix.7 and prefix.jmp.table, so it has to be unique.
! Parameter 8 is the label that is branched to when loading is done.
! The table entry for a length of 0 holds 0, which would jump back to the
! first instruction of the expansion; the caller visible in this file
! branches around the macro when no bytes remain.
911*2139Sjp161948define(load_n_bytes, {
912*2139Sjp161948
913*2139Sjp161948! {load_n_bytes}
914*2139Sjp161948! $1 $2 $5 $6 $7 $8 $7 $8 $9
915*2139Sjp161948
	! the call only captures the address of this label in %o7;
	! its delay slot turns the length into a byte offset (4 per entry)
916*2139Sjp161948$7.0:	call	.+8
917*2139Sjp161948	sll	$2, 2, $6
918*2139Sjp161948
	! address of the jump table, computed relative to the label above
919*2139Sjp161948	add	%o7,$7.jmp.table-$7.0,$5
920*2139Sjp161948
921*2139Sjp161948	add	$5, $6, $5
922*2139Sjp161948	mov	0, $4
923*2139Sjp161948
	! table entries are distances from the first label to each entry point
924*2139Sjp161948	ld	[$5], $5
925*2139Sjp161948
926*2139Sjp161948	jmp	%o7+$5
927*2139Sjp161948	mov	0, $3
928*2139Sjp161948
	! entry point N falls through all lower entry points, so entering
	! at N loads bytes N-1 down to 0
929*2139Sjp161948$7.7:
930*2139Sjp161948	ldub	[$1+6], $5
931*2139Sjp161948	sll	$5, 16, $5
932*2139Sjp161948	or	$3, $5, $3
933*2139Sjp161948$7.6:
934*2139Sjp161948	ldub	[$1+5], $5
935*2139Sjp161948	sll	$5, 8, $5
936*2139Sjp161948	or	$3, $5, $3
937*2139Sjp161948$7.5:
938*2139Sjp161948	ldub	[$1+4], $5
939*2139Sjp161948	or	$3, $5, $3
940*2139Sjp161948$7.4:
941*2139Sjp161948	ldub	[$1+3], $5
942*2139Sjp161948	sll	$5, 24, $5
943*2139Sjp161948	or	$4, $5, $4
944*2139Sjp161948$7.3:
945*2139Sjp161948	ldub	[$1+2], $5
946*2139Sjp161948	sll	$5, 16, $5
947*2139Sjp161948	or	$4, $5, $4
948*2139Sjp161948$7.2:
949*2139Sjp161948	ldub	[$1+1], $5
950*2139Sjp161948	sll	$5, 8, $5
951*2139Sjp161948	or	$4, $5, $4
	! byte 0 is merged in the delay slot of the branch to the return label
952*2139Sjp161948$7.1:
953*2139Sjp161948	ldub	[$1+0], $5
954*2139Sjp161948	ba	$8
955*2139Sjp161948	or	$4, $5, $4
956*2139Sjp161948
957*2139Sjp161948	.align 4
958*2139Sjp161948
959*2139Sjp161948$7.jmp.table:
960*2139Sjp161948	.word	0
961*2139Sjp161948	.word	$7.1-$7.0
962*2139Sjp161948	.word	$7.2-$7.0
963*2139Sjp161948	.word	$7.3-$7.0
964*2139Sjp161948	.word	$7.4-$7.0
965*2139Sjp161948	.word	$7.5-$7.0
966*2139Sjp161948	.word	$7.6-$7.0
967*2139Sjp161948	.word	$7.7-$7.0
968*2139Sjp161948})
969*2139Sjp161948
970*2139Sjp161948
971*2139Sjp161948! {store_little_endian}
972*2139Sjp161948!
973*2139Sjp161948! parameter 1  address
974*2139Sjp161948! parameter 2  source left
975*2139Sjp161948! parameter 3  source right
976*2139Sjp161948! parameter 4  temporary
977*2139Sjp161948
! Emits code that writes two 32-bit values as eight little-endian bytes
! to the address held in parameter 1: parameter 2 supplies bytes 0-3 and
! parameter 3 supplies bytes 4-7, least significant byte at the lowest
! address. Parameter 4 is a scratch register overwritten on every path.
! A fifth parameter, not listed in the header above, is used as the label
! of the byte-wise path and, with an "a" appended, as the exit label; it
! has to be unique for each expansion.
! With OPENSSL_SYSNAME_ULTRASPARC the expansion first tests the low two
! address bits with andcc, which overwrites the integer condition codes.
! NOTE(review): ASI 0x88 is presumably the little-endian load/store
! feature mentioned in the file header - confirm against the V9 manual.
978*2139Sjp161948define(store_little_endian, {
979*2139Sjp161948
980*2139Sjp161948! {store_little_endian}
981*2139Sjp161948! $1 $2 $3 $4 $5 $6 $7 $8 $9
982*2139Sjp161948
983*2139Sjp161948	! rightmost in register to first in memory
984*2139Sjp161948
985*2139Sjp161948#ifdef OPENSSL_SYSNAME_ULTRASPARC
	! fast path: taken only when the address is 4-byte aligned
986*2139Sjp161948	andcc	$1, 3, global0
987*2139Sjp161948	bne,pn	%icc, $5
988*2139Sjp161948	nop
989*2139Sjp161948
990*2139Sjp161948	sta	$2, [$1] 0x88
991*2139Sjp161948	add	$1, 4, $4
992*2139Sjp161948
	! the second word is stored in the delay slot of the exit branch
993*2139Sjp161948	ba,pt	%icc, $5a
994*2139Sjp161948	sta	$3, [$4] 0x88
995*2139Sjp161948#endif
996*2139Sjp161948
	! byte-wise path: each byte is isolated in the scratch register
	! and stored on its own, lowest byte first
997*2139Sjp161948$5:
998*2139Sjp161948	and	$2, 255, $4
999*2139Sjp161948	stub	$4, [$1+0]
1000*2139Sjp161948
1001*2139Sjp161948	srl	$2, 8, $4
1002*2139Sjp161948	and	$4, 255, $4
1003*2139Sjp161948	stub	$4, [$1+1]
1004*2139Sjp161948
1005*2139Sjp161948	srl	$2, 16, $4
1006*2139Sjp161948	and	$4, 255, $4
1007*2139Sjp161948	stub	$4, [$1+2]
1008*2139Sjp161948
1009*2139Sjp161948	srl	$2, 24, $4
1010*2139Sjp161948	stub	$4, [$1+3]
1011*2139Sjp161948
1012*2139Sjp161948
	! same sequence for the second value into bytes 4-7
1013*2139Sjp161948	and	$3, 255, $4
1014*2139Sjp161948	stub	$4, [$1+0+4]
1015*2139Sjp161948
1016*2139Sjp161948	srl	$3, 8, $4
1017*2139Sjp161948	and	$4, 255, $4
1018*2139Sjp161948	stub	$4, [$1+1+4]
1019*2139Sjp161948
1020*2139Sjp161948	srl	$3, 16, $4
1021*2139Sjp161948	and	$4, 255, $4
1022*2139Sjp161948	stub	$4, [$1+2+4]
1023*2139Sjp161948
1024*2139Sjp161948	srl	$3, 24, $4
1025*2139Sjp161948	stub	$4, [$1+3+4]
1026*2139Sjp161948
1027*2139Sjp161948$5a:
1028*2139Sjp161948
1029*2139Sjp161948})
1030*2139Sjp161948
1031*2139Sjp161948
1032*2139Sjp161948! {store_n_bytes}
1033*2139Sjp161948!
1034*2139Sjp161948! Stores 1 to 7 bytes little endian
1035*2139Sjp161948!
1036*2139Sjp161948! parameter 1  address
1037*2139Sjp161948! parameter 2  length
1038*2139Sjp161948! parameter 3  source register left
1039*2139Sjp161948! parameter 4  source register right
1040*2139Sjp161948! parameter 5  temp
1041*2139Sjp161948! parameter 6  temp2
1042*2139Sjp161948! parameter 7  label
1043*2139Sjp161948! parameter 8  return label
1044*2139Sjp161948
! Emits a position-independent computed jump that stores the first N
! bytes (N = parameter 2) to the address in parameter 1.
! Parameter 4 supplies bytes 0-3 (bit positions 0, 8, 16 and 24) and
! parameter 3 supplies bytes 4-6 (bit positions 0, 8 and 16).
! Parameter 5 and parameter 6 are scratch registers; %o7 is overwritten
! by the call instruction.
! Parameter 7 is a label prefix: the expansion creates the labels
! prefix.0 to prefix.7 and prefix.jmp.table, so it has to be unique.
! Parameter 8 is the label that is branched to when storing is done.
! The table entry for a length of 0 holds 0, which would jump back to the
! first instruction of the expansion.
! NOTE(review): no caller is visible in this chunk - confirm that every
! caller guarantees a length of 1 to 7.
1045*2139Sjp161948define(store_n_bytes, {
1046*2139Sjp161948
1047*2139Sjp161948! {store_n_bytes}
1048*2139Sjp161948! $1 $2 $5 $6 $7 $8 $7 $8 $9
1049*2139Sjp161948
	! the call only captures the address of this label in %o7;
	! its delay slot turns the length into a byte offset (4 per entry)
1050*2139Sjp161948$7.0:	call	.+8
1051*2139Sjp161948	sll	$2, 2, $6
1052*2139Sjp161948
	! address of the jump table, computed relative to the label above
1053*2139Sjp161948	add	%o7,$7.jmp.table-$7.0,$5
1054*2139Sjp161948
1055*2139Sjp161948	add	$5, $6, $5
1056*2139Sjp161948
	! table entries are distances from the first label to each entry point
1057*2139Sjp161948	ld	[$5], $5
1058*2139Sjp161948
1059*2139Sjp161948	jmp	%o7+$5
1060*2139Sjp161948	nop
1061*2139Sjp161948
	! entry point N falls through all lower entry points, so entering
	! at N stores bytes N-1 down to 0
1062*2139Sjp161948$7.7:
1063*2139Sjp161948	srl	$3, 16, $5
1064*2139Sjp161948	and	$5, 0xff, $5
1065*2139Sjp161948	stub	$5, [$1+6]
1066*2139Sjp161948$7.6:
1067*2139Sjp161948	srl	$3, 8, $5
1068*2139Sjp161948	and	$5, 0xff, $5
1069*2139Sjp161948	stub	$5, [$1+5]
1070*2139Sjp161948$7.5:
1071*2139Sjp161948	and	$3, 0xff, $5
1072*2139Sjp161948	stub	$5, [$1+4]
1073*2139Sjp161948$7.4:
1074*2139Sjp161948	srl	$4, 24, $5
1075*2139Sjp161948	stub	$5, [$1+3]
1076*2139Sjp161948$7.3:
1077*2139Sjp161948	srl	$4, 16, $5
1078*2139Sjp161948	and	$5, 0xff, $5
1079*2139Sjp161948	stub	$5, [$1+2]
1080*2139Sjp161948$7.2:
1081*2139Sjp161948	srl	$4, 8, $5
1082*2139Sjp161948	and	$5, 0xff, $5
1083*2139Sjp161948	stub	$5, [$1+1]
	! byte 0 is stored in the delay slot of the branch to the return label
1084*2139Sjp161948$7.1:
1085*2139Sjp161948	and	$4, 0xff, $5
1086*2139Sjp161948
1087*2139Sjp161948
1088*2139Sjp161948	ba	$8
1089*2139Sjp161948	stub	$5, [$1]
1090*2139Sjp161948
1091*2139Sjp161948	.align 4
1092*2139Sjp161948
1093*2139Sjp161948$7.jmp.table:
1094*2139Sjp161948
1095*2139Sjp161948	.word	0
1096*2139Sjp161948	.word	$7.1-$7.0
1097*2139Sjp161948	.word	$7.2-$7.0
1098*2139Sjp161948	.word	$7.3-$7.0
1099*2139Sjp161948	.word	$7.4-$7.0
1100*2139Sjp161948	.word	$7.5-$7.0
1101*2139Sjp161948	.word	$7.6-$7.0
1102*2139Sjp161948	.word	$7.7-$7.0
1103*2139Sjp161948})
1104*2139Sjp161948
1105*2139Sjp161948
! Constant that {register_init} writes to the registers; it is 1 here.
1106*2139Sjp161948define(testvalue,{1})
1107*2139Sjp161948
! {register_init}
!
! Test helper: builds the constant above in {local0} with a sethi/or pair
! and copies it into each of up to eight registers given as arguments
! (empty arguments are skipped), and then unconditionally into the
! local, out and global working registers listed at the end of the body.
! No use of this macro is visible in this part of the file.
1108*2139Sjp161948define(register_init, {
1109*2139Sjp161948
1110*2139Sjp161948! For test purposes:
1111*2139Sjp161948
1112*2139Sjp161948	sethi	%hi(testvalue), local0
1113*2139Sjp161948	or	local0, %lo(testvalue), local0
1114*2139Sjp161948
	! one optional register per macro argument
1115*2139Sjp161948	ifelse($1,{},{}, {mov	local0, $1})
1116*2139Sjp161948	ifelse($2,{},{}, {mov	local0, $2})
1117*2139Sjp161948	ifelse($3,{},{}, {mov	local0, $3})
1118*2139Sjp161948	ifelse($4,{},{}, {mov	local0, $4})
1119*2139Sjp161948	ifelse($5,{},{}, {mov	local0, $5})
1120*2139Sjp161948	ifelse($6,{},{}, {mov	local0, $6})
1121*2139Sjp161948	ifelse($7,{},{}, {mov	local0, $7})
1122*2139Sjp161948	ifelse($8,{},{}, {mov	local0, $8})
1123*2139Sjp161948
	! fixed set of working registers
1124*2139Sjp161948	mov	local0, local1
1125*2139Sjp161948	mov	local0, local2
1126*2139Sjp161948	mov	local0, local3
1127*2139Sjp161948	mov	local0, local4
1128*2139Sjp161948	mov	local0, local5
1129*2139Sjp161948	mov	local0, local7
1130*2139Sjp161948	mov	local0, local6
1131*2139Sjp161948	mov	local0, out0
1132*2139Sjp161948	mov	local0, out1
1133*2139Sjp161948	mov	local0, out2
1134*2139Sjp161948	mov	local0, out3
1135*2139Sjp161948	mov	local0, out4
1136*2139Sjp161948	mov	local0, out5
1137*2139Sjp161948	mov	local0, global1
1138*2139Sjp161948	mov	local0, global2
1139*2139Sjp161948	mov	local0, global3
1140*2139Sjp161948	mov	local0, global4
1141*2139Sjp161948	mov	local0, global5
1142*2139Sjp161948
1143*2139Sjp161948})
1144*2139Sjp161948
! Start of the code section. The two local routines below are shared
! round loops reached with a call instruction from the public functions;
! they are not declared global.
! NOTE(review): the last macro argument, retl, is presumably emitted as
! the return instruction of each routine - the macro body is outside this
! part of the file, confirm there.
1145*2139Sjp161948.section	".text"
1146*2139Sjp161948
1147*2139Sjp161948	.align 32
1148*2139Sjp161948
! Encryption rounds. The callers visible in this file set up the first
! round key words in out0/out1, the loop counter in out4 and the value
! %hi(0x0000FC00) in local5 before calling.
1149*2139Sjp161948.des_enc:
1150*2139Sjp161948
1151*2139Sjp161948	! key address in3
1152*2139Sjp161948	! loads key next encryption/decryption first round from [in4]
1153*2139Sjp161948
1154*2139Sjp161948	rounds_macro(in5, out5, 1, .des_enc.1, in3, in4, retl)
1155*2139Sjp161948
1156*2139Sjp161948
1157*2139Sjp161948	.align 32
1158*2139Sjp161948
! Decryption rounds: same macro with the two data registers exchanged,
! a direction argument of -1 and the roles of in3 and in4 exchanged.
1159*2139Sjp161948.des_dec:
1160*2139Sjp161948
1161*2139Sjp161948	! implemented with out5 as first parameter to avoid
1162*2139Sjp161948	! register exchange in ede modes
1163*2139Sjp161948
1164*2139Sjp161948	! key address in4
1165*2139Sjp161948	! loads key next encryption/decryption first round from [in3]
1166*2139Sjp161948
1167*2139Sjp161948	rounds_macro(out5, in5, -1, .des_dec.1, in4, in3, retl)
1168*2139Sjp161948
1169*2139Sjp161948
1170*2139Sjp161948
1171*2139Sjp161948! void DES_encrypt1(data, ks, enc)
1172*2139Sjp161948! *******************************
1173*2139Sjp161948
! Public entry point. Register use as visible below:
!   in0  pointer to the two 32-bit data words, read at entry and, per the
!        comments on the final permutation macro, written back at exit
!   in1  key schedule address
!   in2  direction flag, 0 selects the decryption path
! The whole routine runs in its own register window (save ... restore).
1174*2139Sjp161948	.align 32
1175*2139Sjp161948	.global DES_encrypt1
1176*2139Sjp161948	.type	 DES_encrypt1,#function
1177*2139Sjp161948
1178*2139Sjp161948DES_encrypt1:
1179*2139Sjp161948
1180*2139Sjp161948	save	%sp, FRAME, %sp
1181*2139Sjp161948
	! position-independent setup: the delay slot passes the distance
	! from the call instruction to the helper in out0
1182*2139Sjp161948	call	.PIC.me.up
1183*2139Sjp161948	mov	.PIC.me.up-(.-4),out0
1184*2139Sjp161948
1185*2139Sjp161948	ld	[in0], in5                ! left
1186*2139Sjp161948	cmp	in2, 0                    ! enc
1187*2139Sjp161948
	! the load of the second data word sits in the branch delay slot,
	! so it is executed on both paths
1188*2139Sjp161948#ifdef OPENSSL_SYSNAME_ULTRASPARC
1189*2139Sjp161948	be,pn	%icc, .encrypt.dec        ! enc/dec
1190*2139Sjp161948#else
1191*2139Sjp161948	be	.encrypt.dec
1192*2139Sjp161948#endif
1193*2139Sjp161948	ld	[in0+4], out5             ! right
1194*2139Sjp161948
1195*2139Sjp161948	! parameter 6  1/2 for include encryption/decryption
1196*2139Sjp161948	! parameter 7  1 for move in1 to in3
1197*2139Sjp161948	! parameter 8  1 for move in3 to in4, 2 for move in4 to in3
1198*2139Sjp161948
	! encryption path: initial permutation, rounds expanded in line,
	! final permutation with store
1199*2139Sjp161948	ip_macro(in5, out5, in5, out5, in3, 0, 1, 1)
1200*2139Sjp161948
1201*2139Sjp161948	rounds_macro(in5, out5, 1, .des_encrypt1.1, in3, in4) ! in4 not used
1202*2139Sjp161948
1203*2139Sjp161948	fp_macro(in5, out5, 1)            ! 1 for store to [in0]
1204*2139Sjp161948
1205*2139Sjp161948	ret
1206*2139Sjp161948	restore
1207*2139Sjp161948
	! decryption path: the rounds are requested through parameter 6 of
	! the initial permutation macro instead of a separate expansion
1208*2139Sjp161948.encrypt.dec:
1209*2139Sjp161948
1210*2139Sjp161948	add	in1, 120, in3             ! use last subkey for first round
1211*2139Sjp161948
1212*2139Sjp161948	! parameter 6  1/2 for include encryption/decryption
1213*2139Sjp161948	! parameter 7  1 for move in1 to in3
1214*2139Sjp161948	! parameter 8  1 for move in3 to in4, 2 for move in4 to in3
1215*2139Sjp161948
1216*2139Sjp161948	ip_macro(in5, out5, out5, in5, in4, 2, 0, 1) ! include dec,  ks in4
1217*2139Sjp161948
1218*2139Sjp161948	fp_macro(out5, in5, 1)            ! 1 for store to [in0]
1219*2139Sjp161948
1220*2139Sjp161948	ret
1221*2139Sjp161948	restore
1222*2139Sjp161948
1223*2139Sjp161948.DES_encrypt1.end:
1224*2139Sjp161948	.size	 DES_encrypt1,.DES_encrypt1.end-DES_encrypt1
1225*2139Sjp161948
1226*2139Sjp161948
1227*2139Sjp161948! void DES_encrypt2(data, ks, enc)
1228*2139Sjp161948!*********************************
1229*2139Sjp161948
1230*2139Sjp161948	! encrypts/decrypts without initial/final permutation
1231*2139Sjp161948
! Public entry point. Register use as visible below:
!   in0  pointer to the two 32-bit data words (read at entry, written
!        at exit)
!   in1  key schedule address
!   in2  direction flag, 0 selects the decryption path
! Both data words are rotated left by 3 bits before the rounds and
! rotated right by 3 bits afterwards. in0 and in1 are reused as scratch
! registers after the rounds, which is why in0 is parked in the stack
! frame and reloaded before the stores.
1232*2139Sjp161948	.align 32
1233*2139Sjp161948	.global DES_encrypt2
1234*2139Sjp161948	.type	 DES_encrypt2,#function
1235*2139Sjp161948
1236*2139Sjp161948DES_encrypt2:
1237*2139Sjp161948
1238*2139Sjp161948	save	%sp, FRAME, %sp
1239*2139Sjp161948
	! position-independent setup: the delay slot passes the distance
	! from the call instruction to the helper in out0; the helper
	! returns the table base that the adds below start from in global1
1240*2139Sjp161948	call	.PIC.me.up
1241*2139Sjp161948	mov	.PIC.me.up-(.-4),out0
1242*2139Sjp161948
1243*2139Sjp161948	! Set sbox address 1 to 6 and rotate halfs 3 left
1244*2139Sjp161948	! Errors caught by destest? Yes. Still? *NO*
1245*2139Sjp161948
1246*2139Sjp161948	!sethi	%hi(DES_SPtrans), global1 ! address sbox 1
1247*2139Sjp161948
1248*2139Sjp161948	!or	global1, %lo(DES_SPtrans), global1  ! sbox 1
1249*2139Sjp161948
1250*2139Sjp161948	add	global1, 256, global2     ! sbox 2
1251*2139Sjp161948	add	global1, 512, global3     ! sbox 3
1252*2139Sjp161948
1253*2139Sjp161948	ld	[in0], out5               ! right
1254*2139Sjp161948	add	global1, 768, global4     ! sbox 4
1255*2139Sjp161948	add	global1, 1024, global5    ! sbox 5
1256*2139Sjp161948
1257*2139Sjp161948	ld	[in0+4], in5              ! left
1258*2139Sjp161948	add	global1, 1280, local6     ! sbox 6
1259*2139Sjp161948	add	global1, 1792, out3       ! sbox 8
1260*2139Sjp161948
1261*2139Sjp161948	! rotate
1262*2139Sjp161948
	! rotate left by 3: low 29 bits moved up, top 3 bits moved down,
	! the two parts do not overlap so add acts as or
1263*2139Sjp161948	sll	in5, 3, local5
1264*2139Sjp161948	mov	in1, in3                  ! key address to in3
1265*2139Sjp161948
1266*2139Sjp161948	sll	out5, 3, local7
1267*2139Sjp161948	srl	in5, 29, in5
1268*2139Sjp161948
1269*2139Sjp161948	srl	out5, 29, out5
1270*2139Sjp161948	add	in5, local5, in5
1271*2139Sjp161948
1272*2139Sjp161948	add	out5, local7, out5
1273*2139Sjp161948	cmp	in2, 0
1274*2139Sjp161948
1275*2139Sjp161948	! we use our own stackframe
1276*2139Sjp161948
	! the store in the delay slot is executed on both paths
1277*2139Sjp161948#ifdef OPENSSL_SYSNAME_ULTRASPARC
1278*2139Sjp161948	be,pn	%icc, .encrypt2.dec       ! decryption
1279*2139Sjp161948#else
1280*2139Sjp161948	be	.encrypt2.dec
1281*2139Sjp161948#endif
1282*2139Sjp161948	STPTR	in0, [%sp+BIAS+ARG0+0*ARGSZ]
1283*2139Sjp161948
	! encryption path: preload first round key, loop counter and mask
	! constant, then run the shared round loop
1284*2139Sjp161948	ld	[in3], out0               ! key 7531 first round
1285*2139Sjp161948	mov	LOOPS, out4               ! loop counter
1286*2139Sjp161948
1287*2139Sjp161948	ld	[in3+4], out1             ! key 8642 first round
1288*2139Sjp161948	sethi	%hi(0x0000FC00), local5
1289*2139Sjp161948
1290*2139Sjp161948	call .des_enc
1291*2139Sjp161948	mov	in3, in4
1292*2139Sjp161948
1293*2139Sjp161948	! rotate
	! rotate right by 3, with in0 and in1 as scratch; in0 is reloaded
	! from the stack frame before it is used as the store address
1294*2139Sjp161948	sll	in5, 29, in0
1295*2139Sjp161948	srl	in5, 3, in5
1296*2139Sjp161948	sll	out5, 29, in1
1297*2139Sjp161948	add	in5, in0, in5
1298*2139Sjp161948	srl	out5, 3, out5
1299*2139Sjp161948	LDPTR	[%sp+BIAS+ARG0+0*ARGSZ], in0
1300*2139Sjp161948	add	out5, in1, out5
1301*2139Sjp161948	st	in5, [in0]
1302*2139Sjp161948	st	out5, [in0+4]
1303*2139Sjp161948
1304*2139Sjp161948	ret
1305*2139Sjp161948	restore
1306*2139Sjp161948
1307*2139Sjp161948
	! decryption path: key pointer starts 120 bytes into the schedule
	! and the two data registers are exchanged before the round loop
1308*2139Sjp161948.encrypt2.dec:
1309*2139Sjp161948
1310*2139Sjp161948	add in3, 120, in4
1311*2139Sjp161948
1312*2139Sjp161948	ld	[in4], out0               ! key 7531 first round
1313*2139Sjp161948	mov	LOOPS, out4               ! loop counter
1314*2139Sjp161948
1315*2139Sjp161948	ld	[in4+4], out1             ! key 8642 first round
1316*2139Sjp161948	sethi	%hi(0x0000FC00), local5
1317*2139Sjp161948
1318*2139Sjp161948	mov	in5, local1               ! left expected in out5
1319*2139Sjp161948	mov	out5, in5
1320*2139Sjp161948
	! the exchange is completed in the delay slot of the call
1321*2139Sjp161948	call .des_dec
1322*2139Sjp161948	mov	local1, out5
1323*2139Sjp161948
1324*2139Sjp161948.encrypt2.finish:
1325*2139Sjp161948
1326*2139Sjp161948	! rotate
	! same rotate right by 3 as on the encryption path; the two
	! registers are stored in the opposite order here
1327*2139Sjp161948	sll	in5, 29, in0
1328*2139Sjp161948	srl	in5, 3, in5
1329*2139Sjp161948	sll	out5, 29, in1
1330*2139Sjp161948	add	in5, in0, in5
1331*2139Sjp161948	srl	out5, 3, out5
1332*2139Sjp161948	LDPTR	[%sp+BIAS+ARG0+0*ARGSZ], in0
1333*2139Sjp161948	add	out5, in1, out5
1334*2139Sjp161948	st	out5, [in0]
1335*2139Sjp161948	st	in5, [in0+4]
1336*2139Sjp161948
1337*2139Sjp161948	ret
1338*2139Sjp161948	restore
1339*2139Sjp161948
1340*2139Sjp161948.DES_encrypt2.end:
1341*2139Sjp161948	.size	 DES_encrypt2, .DES_encrypt2.end-DES_encrypt2
1342*2139Sjp161948
1343*2139Sjp161948
1344*2139Sjp161948! void DES_encrypt3(data, ks1, ks2, ks3)
1345*2139Sjp161948! **************************************
1346*2139Sjp161948
! Public entry point. Register use as visible below:
!   in0  pointer to the two 32-bit data words
!   in1  ks1, in2  ks2, in3  ks3 (argument order of the signature above)
! Order of operations: the first pass is requested through parameter 6 of
! the initial permutation macro (ks1 is moved to in3 by parameter 7),
! then the shared decryption rounds run with the key pointer prepared
! from ks2 + 120, then the shared encryption rounds run with ks3.
1347*2139Sjp161948	.align 32
1348*2139Sjp161948	.global DES_encrypt3
1349*2139Sjp161948	.type	 DES_encrypt3,#function
1350*2139Sjp161948
1351*2139Sjp161948DES_encrypt3:
1352*2139Sjp161948
1353*2139Sjp161948	save	%sp, FRAME, %sp
1354*2139Sjp161948
	! position-independent setup: the delay slot passes the distance
	! from the call instruction to the helper in out0
1355*2139Sjp161948	call	.PIC.me.up
1356*2139Sjp161948	mov	.PIC.me.up-(.-4),out0
1357*2139Sjp161948
1358*2139Sjp161948	ld	[in0], in5                ! left
1359*2139Sjp161948	add	in2, 120, in4             ! ks2
1360*2139Sjp161948
	! in3 is about to be overwritten with ks1 by the macro below,
	! so ks3 is kept in in2, which is free after the add above
1361*2139Sjp161948	ld	[in0+4], out5             ! right
1362*2139Sjp161948	mov	in3, in2                  ! save ks3
1363*2139Sjp161948
1364*2139Sjp161948	! parameter 6  1/2 for include encryption/decryption
1365*2139Sjp161948	! parameter 7  1 for mov in1 to in3
1366*2139Sjp161948	! parameter 8  1 for mov in3 to in4
1367*2139Sjp161948	! parameter 9  1 for load ks3 and ks2 to in4 and in3
1368*2139Sjp161948
1369*2139Sjp161948	ip_macro(in5, out5, in5, out5, in3, 1, 1, 0, 0)
1370*2139Sjp161948
	! the delay slot restores ks3 into in3 for the pass after this one
1371*2139Sjp161948	call	.des_dec
1372*2139Sjp161948	mov	in2, in3                  ! preload ks3
1373*2139Sjp161948
1374*2139Sjp161948	call	.des_enc
1375*2139Sjp161948	nop
1376*2139Sjp161948
1377*2139Sjp161948	fp_macro(in5, out5, 1)
1378*2139Sjp161948
1379*2139Sjp161948	ret
1380*2139Sjp161948	restore
1381*2139Sjp161948
1382*2139Sjp161948.DES_encrypt3.end:
1383*2139Sjp161948	.size	 DES_encrypt3,.DES_encrypt3.end-DES_encrypt3
1384*2139Sjp161948
1385*2139Sjp161948
1386*2139Sjp161948! void DES_decrypt3(data, ks1, ks2, ks3)
1387*2139Sjp161948! **************************************
1388*2139Sjp161948
! Public entry point. Register use as visible below:
!   in0  pointer to the two 32-bit data words
!   in1  ks1, in2  ks2, in3  ks3 (argument order of the signature above)
! Order of operations: the first pass is requested through parameter 6 of
! the initial permutation macro with the key pointer prepared from
! ks3 + 120, then the shared encryption rounds run with ks2, then the
! shared decryption rounds run with the key pointer prepared from
! ks1 + 120.
1389*2139Sjp161948	.align 32
1390*2139Sjp161948	.global DES_decrypt3
1391*2139Sjp161948	.type	 DES_decrypt3,#function
1392*2139Sjp161948
1393*2139Sjp161948DES_decrypt3:
1394*2139Sjp161948
1395*2139Sjp161948	save	%sp, FRAME, %sp
1396*2139Sjp161948
	! position-independent setup: the delay slot passes the distance
	! from the call instruction to the helper in out0
1397*2139Sjp161948	call	.PIC.me.up
1398*2139Sjp161948	mov	.PIC.me.up-(.-4),out0
1399*2139Sjp161948
1400*2139Sjp161948	ld	[in0], in5                ! left
1401*2139Sjp161948	add	in3, 120, in4             ! ks3
1402*2139Sjp161948
1403*2139Sjp161948	ld	[in0+4], out5             ! right
1404*2139Sjp161948	mov	in2, in3                  ! ks2
1405*2139Sjp161948
1406*2139Sjp161948	! parameter 6  1/2 for include encryption/decryption
1407*2139Sjp161948	! parameter 7  1 for mov in1 to in3
1408*2139Sjp161948	! parameter 8  1 for mov in3 to in4
1409*2139Sjp161948	! parameter 9  1 for load ks3 and ks2 to in4 and in3
1410*2139Sjp161948
1411*2139Sjp161948	ip_macro(in5, out5, out5, in5, in4, 2, 0, 0, 0)
1412*2139Sjp161948
	! the delay slot prepares the key pointer for the pass after this one
1413*2139Sjp161948	call	.des_enc
1414*2139Sjp161948	add	in1, 120, in4             ! preload ks1
1415*2139Sjp161948
1416*2139Sjp161948	call	.des_dec
1417*2139Sjp161948	nop
1418*2139Sjp161948
1419*2139Sjp161948	fp_macro(out5, in5, 1)
1420*2139Sjp161948
1421*2139Sjp161948	ret
1422*2139Sjp161948	restore
1423*2139Sjp161948
1424*2139Sjp161948.DES_decrypt3.end:
1425*2139Sjp161948	.size	 DES_decrypt3,.DES_decrypt3.end-DES_decrypt3
1426*2139Sjp161948
! Constant block addressed through out2 (set up by .PIC.me.up):
!   offset   0-255  byte table, entry i holds i with the low two bits
!                   cleared (i AND 0xFC)
!   offset 256-272  five 32-bit masks for the initial/final permutation
!   offset 276      zero
!   offset 280      loop counter value
!   offset 284      the constant 0x0000FC00
! The declared object size is 288 bytes: 256 table bytes plus eight
! 32-bit words. The previous value of 284 left out the last word.
! The word that follows the object holds the 32-bit displacement of the
! S-box table and is read by .PIC.me.up.
1427*2139Sjp161948	.align	256
1428*2139Sjp161948	.type	 .des_and,#object
1429*2139Sjp161948	.size	 .des_and,288
1430*2139Sjp161948
1431*2139Sjp161948.des_and:
1432*2139Sjp161948
1433*2139Sjp161948! This table is used for AND 0xFC when it is known that register
1434*2139Sjp161948! bits 8-31 are zero. Makes it possible to do three arithmetic
1435*2139Sjp161948! operations in one cycle.
1436*2139Sjp161948
1437*2139Sjp161948	.byte  0, 0, 0, 0, 4, 4, 4, 4
1438*2139Sjp161948	.byte  8, 8, 8, 8, 12, 12, 12, 12
1439*2139Sjp161948	.byte  16, 16, 16, 16, 20, 20, 20, 20
1440*2139Sjp161948	.byte  24, 24, 24, 24, 28, 28, 28, 28
1441*2139Sjp161948	.byte  32, 32, 32, 32, 36, 36, 36, 36
1442*2139Sjp161948	.byte  40, 40, 40, 40, 44, 44, 44, 44
1443*2139Sjp161948	.byte  48, 48, 48, 48, 52, 52, 52, 52
1444*2139Sjp161948	.byte  56, 56, 56, 56, 60, 60, 60, 60
1445*2139Sjp161948	.byte  64, 64, 64, 64, 68, 68, 68, 68
1446*2139Sjp161948	.byte  72, 72, 72, 72, 76, 76, 76, 76
1447*2139Sjp161948	.byte  80, 80, 80, 80, 84, 84, 84, 84
1448*2139Sjp161948	.byte  88, 88, 88, 88, 92, 92, 92, 92
1449*2139Sjp161948	.byte  96, 96, 96, 96, 100, 100, 100, 100
1450*2139Sjp161948	.byte  104, 104, 104, 104, 108, 108, 108, 108
1451*2139Sjp161948	.byte  112, 112, 112, 112, 116, 116, 116, 116
1452*2139Sjp161948	.byte  120, 120, 120, 120, 124, 124, 124, 124
1453*2139Sjp161948	.byte  128, 128, 128, 128, 132, 132, 132, 132
1454*2139Sjp161948	.byte  136, 136, 136, 136, 140, 140, 140, 140
1455*2139Sjp161948	.byte  144, 144, 144, 144, 148, 148, 148, 148
1456*2139Sjp161948	.byte  152, 152, 152, 152, 156, 156, 156, 156
1457*2139Sjp161948	.byte  160, 160, 160, 160, 164, 164, 164, 164
1458*2139Sjp161948	.byte  168, 168, 168, 168, 172, 172, 172, 172
1459*2139Sjp161948	.byte  176, 176, 176, 176, 180, 180, 180, 180
1460*2139Sjp161948	.byte  184, 184, 184, 184, 188, 188, 188, 188
1461*2139Sjp161948	.byte  192, 192, 192, 192, 196, 196, 196, 196
1462*2139Sjp161948	.byte  200, 200, 200, 200, 204, 204, 204, 204
1463*2139Sjp161948	.byte  208, 208, 208, 208, 212, 212, 212, 212
1464*2139Sjp161948	.byte  216, 216, 216, 216, 220, 220, 220, 220
1465*2139Sjp161948	.byte  224, 224, 224, 224, 228, 228, 228, 228
1466*2139Sjp161948	.byte  232, 232, 232, 232, 236, 236, 236, 236
1467*2139Sjp161948	.byte  240, 240, 240, 240, 244, 244, 244, 244
1468*2139Sjp161948	.byte  248, 248, 248, 248, 252, 252, 252, 252
1469*2139Sjp161948
1470*2139Sjp161948	! 5 numbers for initial/final permutation
1471*2139Sjp161948
1472*2139Sjp161948	.word   0x0f0f0f0f                ! offset 256
1473*2139Sjp161948	.word	0x0000ffff                ! 260
1474*2139Sjp161948	.word	0x33333333                ! 264
1475*2139Sjp161948	.word	0x00ff00ff                ! 268
1476*2139Sjp161948	.word	0x55555555                ! 272
1477*2139Sjp161948
1478*2139Sjp161948	.word	0                         ! 276
1479*2139Sjp161948	.word	LOOPS                     ! 280
1480*2139Sjp161948	.word	0x0000FC00                ! 284
1481*2139Sjp161948.PIC.DES_SPtrans:
1482*2139Sjp161948	.word	%r_disp32(DES_SPtrans)
1483*2139Sjp161948
1484*2139Sjp161948! input:	out0	offset between .PIC.me.up and caller
1485*2139Sjp161948! output:	out0	pointer to .PIC.me.up
1486*2139Sjp161948!		out2	pointer to .des_and
1487*2139Sjp161948!		global1	pointer to DES_SPtrans
! Leaf helper reached with a call instruction from every public entry
! point; the caller fills out0 in the delay slot of that call. It runs in
! the register window of the caller and returns with retl.
! Only the first arm of the conditional below is assembled, because the
! condition is the constant 1; the other arm is kept as reference.
! NOTE(review): the displacement word is fetched with a 32-bit ld and
! then added to a full-width address. Confirm that this is safe on 64-bit
! builds when DES_SPtrans is placed below this code.
1488*2139Sjp161948	.align	32
1489*2139Sjp161948.PIC.me.up:
1490*2139Sjp161948	add	out0,%o7,out0			! pointer to .PIC.me.up
1491*2139Sjp161948#if 1
	! global1 = stored displacement + offset of the displacement word
	!           + address of this routine
1492*2139Sjp161948	ld	[out0+(.PIC.DES_SPtrans-.PIC.me.up)],global1
1493*2139Sjp161948	add	global1,(.PIC.DES_SPtrans-.PIC.me.up),global1
1494*2139Sjp161948	add	global1,out0,global1
1495*2139Sjp161948#else
1496*2139Sjp161948# ifdef OPENSSL_PIC
1497*2139Sjp161948	! In case anybody wonders why this code is same for both ABI.
1498*2139Sjp161948	! To start with it is not. Do note LDPTR below. But of course
1499*2139Sjp161948	! you must be wondering why the rest of it does not contain
1500*2139Sjp161948	! things like %hh, %hm and %lm. Well, those are needed only
1501*2139Sjp161948	! if OpenSSL library *itself* will become larger than 4GB,
1502*2139Sjp161948	! which is not going to happen any time soon.
1503*2139Sjp161948	sethi	%hi(DES_SPtrans),global1
1504*2139Sjp161948	or	global1,%lo(DES_SPtrans),global1
1505*2139Sjp161948	sethi	%hi(_GLOBAL_OFFSET_TABLE_-(.PIC.me.up-.)),out2
1506*2139Sjp161948	add	global1,out0,global1
1507*2139Sjp161948	add	out2,%lo(_GLOBAL_OFFSET_TABLE_-(.PIC.me.up-.)),out2
1508*2139Sjp161948	LDPTR	[out2+global1],global1
1509*2139Sjp161948# elif 0
1510*2139Sjp161948	setn	DES_SPtrans,out2,global1	! synthetic instruction !
1511*2139Sjp161948# elif defined(ABI64)
1512*2139Sjp161948	sethi	%hh(DES_SPtrans),out2
1513*2139Sjp161948	or	out2,%hm(DES_SPtrans),out2
1514*2139Sjp161948	sethi	%lm(DES_SPtrans),global1
1515*2139Sjp161948	or	global1,%lo(DES_SPtrans),global1
1516*2139Sjp161948	sllx	out2,32,out2
1517*2139Sjp161948	or	out2,global1,global1
1518*2139Sjp161948# else
1519*2139Sjp161948	sethi	%hi(DES_SPtrans),global1
1520*2139Sjp161948	or	global1,%lo(DES_SPtrans),global1
1521*2139Sjp161948# endif
1522*2139Sjp161948#endif
	! the constant block address is formed in the delay slot of the return
1523*2139Sjp161948	retl
1524*2139Sjp161948	add	out0,.des_and-.PIC.me.up,out2
1525*2139Sjp161948
1526*2139Sjp161948! void DES_ncbc_encrypt(input, output, length, schedule, ivec, enc)
1527*2139Sjp161948! *****************************************************************
1528*2139Sjp161948
1529*2139Sjp161948
1530*2139Sjp161948	.align 32
1531*2139Sjp161948	.global DES_ncbc_encrypt
1532*2139Sjp161948	.type	 DES_ncbc_encrypt,#function
1533*2139Sjp161948
1534*2139Sjp161948DES_ncbc_encrypt:
1535*2139Sjp161948
1536*2139Sjp161948	save	%sp, FRAME, %sp
1537*2139Sjp161948
1538*2139Sjp161948	define({INPUT},  { [%sp+BIAS+ARG0+0*ARGSZ] })
1539*2139Sjp161948	define({OUTPUT}, { [%sp+BIAS+ARG0+1*ARGSZ] })
1540*2139Sjp161948	define({IVEC},   { [%sp+BIAS+ARG0+4*ARGSZ] })
1541*2139Sjp161948
1542*2139Sjp161948	call	.PIC.me.up
1543*2139Sjp161948	mov	.PIC.me.up-(.-4),out0
1544*2139Sjp161948
1545*2139Sjp161948	cmp	in5, 0                    ! enc
1546*2139Sjp161948
1547*2139Sjp161948#ifdef OPENSSL_SYSNAME_ULTRASPARC
1548*2139Sjp161948	be,pn	%icc, .ncbc.dec
1549*2139Sjp161948#else
1550*2139Sjp161948	be	.ncbc.dec
1551*2139Sjp161948#endif
1552*2139Sjp161948	STPTR	in4, IVEC
1553*2139Sjp161948
1554*2139Sjp161948	! addr  left  right  temp  label
1555*2139Sjp161948	load_little_endian(in4, in5, out5, local3, .LLE1)  ! iv
1556*2139Sjp161948
1557*2139Sjp161948	addcc	in2, -8, in2              ! bytes missing when first block done
1558*2139Sjp161948
1559*2139Sjp161948#ifdef OPENSSL_SYSNAME_ULTRASPARC
1560*2139Sjp161948	bl,pn	%icc, .ncbc.enc.seven.or.less
1561*2139Sjp161948#else
1562*2139Sjp161948	bl	.ncbc.enc.seven.or.less
1563*2139Sjp161948#endif
1564*2139Sjp161948	mov	in3, in4                  ! schedule
1565*2139Sjp161948
1566*2139Sjp161948.ncbc.enc.next.block:
1567*2139Sjp161948
1568*2139Sjp161948	load_little_endian(in0, out4, global4, local3, .LLE2)  ! block
1569*2139Sjp161948
1570*2139Sjp161948.ncbc.enc.next.block_1:
1571*2139Sjp161948
1572*2139Sjp161948	xor	in5, out4, in5            ! iv xor
1573*2139Sjp161948	xor	out5, global4, out5       ! iv xor
1574*2139Sjp161948
1575*2139Sjp161948	! parameter 8  1 for move in3 to in4, 2 for move in4 to in3
1576*2139Sjp161948	ip_macro(in5, out5, in5, out5, in3, 0, 0, 2)
1577*2139Sjp161948
1578*2139Sjp161948.ncbc.enc.next.block_2:
1579*2139Sjp161948
1580*2139Sjp161948!//	call .des_enc                     ! compares in2 to 8
1581*2139Sjp161948!	rounds inlined for alignment purposes
1582*2139Sjp161948
1583*2139Sjp161948	add	global1, 768, global4     ! address sbox 4 since register used below
1584*2139Sjp161948
1585*2139Sjp161948	rounds_macro(in5, out5, 1, .ncbc.enc.1, in3, in4) ! include encryption  ks in3
1586*2139Sjp161948
1587*2139Sjp161948#ifdef OPENSSL_SYSNAME_ULTRASPARC
1588*2139Sjp161948	bl,pn	%icc, .ncbc.enc.next.block_fp
1589*2139Sjp161948#else
1590*2139Sjp161948	bl	.ncbc.enc.next.block_fp
1591*2139Sjp161948#endif
1592*2139Sjp161948	add	in0, 8, in0               ! input address
1593*2139Sjp161948
1594*2139Sjp161948	! If 8 or more bytes are to be encrypted after this block,
1595*2139Sjp161948	! we combine final permutation for this block with initial
1596*2139Sjp161948	! permutation for next block. Load next block:
1597*2139Sjp161948
1598*2139Sjp161948	load_little_endian(in0, global3, global4, local5, .LLE12)
1599*2139Sjp161948
1600*2139Sjp161948	!  parameter 1   original left
1601*2139Sjp161948	!  parameter 2   original right
1602*2139Sjp161948	!  parameter 3   left ip
1603*2139Sjp161948	!  parameter 4   right ip
1604*2139Sjp161948	!  parameter 5   1: load ks1/ks2 to in3/in4, add 120 to in4
1605*2139Sjp161948	!                2: mov in4 to in3
1606*2139Sjp161948	!
1607*2139Sjp161948	! also adds -8 to length in2 and loads loop counter to out4
1608*2139Sjp161948
1609*2139Sjp161948	fp_ip_macro(out0, out1, global3, global4, 2)
1610*2139Sjp161948
1611*2139Sjp161948	store_little_endian(in1, out0, out1, local3, .SLE10)  ! block
1612*2139Sjp161948
1613*2139Sjp161948	ld	[in3], out0               ! key 7531 first round next block
1614*2139Sjp161948	mov 	in5, local1
1615*2139Sjp161948	xor	global3, out5, in5        ! iv xor next block
1616*2139Sjp161948
1617*2139Sjp161948	ld	[in3+4], out1             ! key 8642
1618*2139Sjp161948	add	global1, 512, global3     ! address sbox 3 since register used
1619*2139Sjp161948	xor	global4, local1, out5     ! iv xor next block
1620*2139Sjp161948
1621*2139Sjp161948	ba	.ncbc.enc.next.block_2
1622*2139Sjp161948	add	in1, 8, in1               ! output adress
1623*2139Sjp161948
1624*2139Sjp161948.ncbc.enc.next.block_fp:
1625*2139Sjp161948
1626*2139Sjp161948	fp_macro(in5, out5)
1627*2139Sjp161948
1628*2139Sjp161948	store_little_endian(in1, in5, out5, local3, .SLE1)  ! block
1629*2139Sjp161948
1630*2139Sjp161948	addcc   in2, -8, in2              ! bytes missing when next block done
1631*2139Sjp161948
1632*2139Sjp161948#ifdef OPENSSL_SYSNAME_ULTRASPARC
1633*2139Sjp161948	bpos,pt	%icc, .ncbc.enc.next.block  ! also jumps if 0
1634*2139Sjp161948#else
1635*2139Sjp161948	bpos	.ncbc.enc.next.block
1636*2139Sjp161948#endif
1637*2139Sjp161948	add	in1, 8, in1
1638*2139Sjp161948
1639*2139Sjp161948.ncbc.enc.seven.or.less:
1640*2139Sjp161948
1641*2139Sjp161948	cmp	in2, -8
1642*2139Sjp161948
1643*2139Sjp161948#ifdef OPENSSL_SYSNAME_ULTRASPARC
1644*2139Sjp161948	ble,pt	%icc, .ncbc.enc.finish
1645*2139Sjp161948#else
1646*2139Sjp161948	ble	.ncbc.enc.finish
1647*2139Sjp161948#endif
1648*2139Sjp161948	nop
1649*2139Sjp161948
1650*2139Sjp161948	add	in2, 8, local1            ! bytes to load
1651*2139Sjp161948
1652*2139Sjp161948	! addr, length, dest left, dest right, temp, temp2, label, ret label
1653*2139Sjp161948	load_n_bytes(in0, local1, global4, out4, local2, local3, .LNB1, .ncbc.enc.next.block_1)
1654*2139Sjp161948
1655*2139Sjp161948	! Loads 1 to 7 bytes little endian to global4, out4
1656*2139Sjp161948
1657*2139Sjp161948
1658*2139Sjp161948.ncbc.enc.finish:
1659*2139Sjp161948
1660*2139Sjp161948	LDPTR	IVEC, local4
1661*2139Sjp161948	store_little_endian(local4, in5, out5, local5, .SLE2)  ! ivec
1662*2139Sjp161948
1663*2139Sjp161948	ret
1664*2139Sjp161948	restore
1665*2139Sjp161948
1666*2139Sjp161948
1667*2139Sjp161948.ncbc.dec:
1668*2139Sjp161948
1669*2139Sjp161948	STPTR	in0, INPUT
1670*2139Sjp161948	cmp	in2, 0                    ! length
1671*2139Sjp161948	add	in3, 120, in3
1672*2139Sjp161948
1673*2139Sjp161948	LDPTR	IVEC, local7              ! ivec
1674*2139Sjp161948#ifdef OPENSSL_SYSNAME_ULTRASPARC
1675*2139Sjp161948	ble,pn	%icc, .ncbc.dec.finish
1676*2139Sjp161948#else
1677*2139Sjp161948	ble	.ncbc.dec.finish
1678*2139Sjp161948#endif
1679*2139Sjp161948	mov	in3, in4                  ! schedule
1680*2139Sjp161948
1681*2139Sjp161948	STPTR	in1, OUTPUT
1682*2139Sjp161948	mov	in0, local5               ! input
1683*2139Sjp161948
1684*2139Sjp161948	load_little_endian(local7, in0, in1, local3, .LLE3)   ! ivec
1685*2139Sjp161948
1686*2139Sjp161948.ncbc.dec.next.block:
1687*2139Sjp161948
1688*2139Sjp161948	load_little_endian(local5, in5, out5, local3, .LLE4)  ! block
1689*2139Sjp161948
1690*2139Sjp161948	! parameter 6  1/2 for include encryption/decryption
1691*2139Sjp161948	! parameter 7  1 for mov in1 to in3
1692*2139Sjp161948	! parameter 8  1 for mov in3 to in4
1693*2139Sjp161948
1694*2139Sjp161948	ip_macro(in5, out5, out5, in5, in4, 2, 0, 1) ! include decryption  ks in4
1695*2139Sjp161948
1696*2139Sjp161948	fp_macro(out5, in5, 0, 1) ! 1 for input and output address to local5/7
1697*2139Sjp161948
1698*2139Sjp161948	! in2 is bytes left to be stored
1699*2139Sjp161948	! in2 is compared to 8 in the rounds
1700*2139Sjp161948
1701*2139Sjp161948	xor	out5, in0, out4           ! iv xor
1702*2139Sjp161948#ifdef OPENSSL_SYSNAME_ULTRASPARC
1703*2139Sjp161948	bl,pn	%icc, .ncbc.dec.seven.or.less
1704*2139Sjp161948#else
1705*2139Sjp161948	bl	.ncbc.dec.seven.or.less
1706*2139Sjp161948#endif
1707*2139Sjp161948	xor	in5, in1, global4         ! iv xor
1708*2139Sjp161948
1709*2139Sjp161948	! Load ivec next block now, since input and output address might be the same.
1710*2139Sjp161948
1711*2139Sjp161948	load_little_endian_inc(local5, in0, in1, local3, .LLE5)  ! iv
1712*2139Sjp161948
1713*2139Sjp161948	store_little_endian(local7, out4, global4, local3, .SLE3)
1714*2139Sjp161948
1715*2139Sjp161948	STPTR	local5, INPUT
1716*2139Sjp161948	add	local7, 8, local7
1717*2139Sjp161948	addcc   in2, -8, in2
1718*2139Sjp161948
1719*2139Sjp161948#ifdef OPENSSL_SYSNAME_ULTRASPARC
1720*2139Sjp161948	bg,pt	%icc, .ncbc.dec.next.block
1721*2139Sjp161948#else
1722*2139Sjp161948	bg	.ncbc.dec.next.block
1723*2139Sjp161948#endif
1724*2139Sjp161948	STPTR	local7, OUTPUT
1725*2139Sjp161948
1726*2139Sjp161948
1727*2139Sjp161948.ncbc.dec.store.iv:
1728*2139Sjp161948
1729*2139Sjp161948	LDPTR	IVEC, local4              ! ivec
1730*2139Sjp161948	store_little_endian(local4, in0, in1, local5, .SLE4)
1731*2139Sjp161948
1732*2139Sjp161948.ncbc.dec.finish:
1733*2139Sjp161948
1734*2139Sjp161948	ret
1735*2139Sjp161948	restore
1736*2139Sjp161948
1737*2139Sjp161948.ncbc.dec.seven.or.less:
1738*2139Sjp161948
1739*2139Sjp161948	load_little_endian_inc(local5, in0, in1, local3, .LLE13)     ! ivec
1740*2139Sjp161948
1741*2139Sjp161948	store_n_bytes(local7, in2, global4, out4, local3, local4, .SNB1, .ncbc.dec.store.iv)
1742*2139Sjp161948
1743*2139Sjp161948
1744*2139Sjp161948.DES_ncbc_encrypt.end:
1745*2139Sjp161948	.size	 DES_ncbc_encrypt, .DES_ncbc_encrypt.end-DES_ncbc_encrypt
1746*2139Sjp161948
1747*2139Sjp161948
1748*2139Sjp161948! void DES_ede3_cbc_encrypt(input, output, length, ks1, ks2, ks3, ivec, enc)
1749*2139Sjp161948! **************************************************************************
1750*2139Sjp161948! Triple DES in CBC mode. enc == 0 selects the decrypt path at .ede3.dec,
1751*2139Sjp161948! any other value the encrypt path. ivec and enc are read from the stack.
1752*2139Sjp161948	.align 32
1753*2139Sjp161948	.global DES_ede3_cbc_encrypt
1754*2139Sjp161948	.type	 DES_ede3_cbc_encrypt,#function
1755*2139Sjp161948
1756*2139Sjp161948DES_ede3_cbc_encrypt:
1757*2139Sjp161948
1758*2139Sjp161948	save	%sp, FRAME, %sp
1759*2139Sjp161948	! Stack slots of this frame that keep the three key schedule pointers:
1760*2139Sjp161948	define({KS1}, { [%sp+BIAS+ARG0+3*ARGSZ] })
1761*2139Sjp161948	define({KS2}, { [%sp+BIAS+ARG0+4*ARGSZ] })
1762*2139Sjp161948	define({KS3}, { [%sp+BIAS+ARG0+5*ARGSZ] })
1763*2139Sjp161948
1764*2139Sjp161948	call	.PIC.me.up
1765*2139Sjp161948	mov	.PIC.me.up-(.-4),out0
1766*2139Sjp161948
1767*2139Sjp161948	LDPTR	[%fp+BIAS+ARG0+7*ARGSZ], local3          ! enc
1768*2139Sjp161948	LDPTR	[%fp+BIAS+ARG0+6*ARGSZ], local4          ! ivec
1769*2139Sjp161948	cmp	local3, 0                 ! enc
1770*2139Sjp161948
1771*2139Sjp161948#ifdef OPENSSL_SYSNAME_ULTRASPARC
1772*2139Sjp161948	be,pn	%icc, .ede3.dec
1773*2139Sjp161948#else
1774*2139Sjp161948	be	.ede3.dec
1775*2139Sjp161948#endif
1776*2139Sjp161948	STPTR	in4, KS2
1777*2139Sjp161948	! The store above sits in the delay slot, so ks2 is saved on both paths.
1778*2139Sjp161948	STPTR	in5, KS3
1779*2139Sjp161948
1780*2139Sjp161948	load_little_endian(local4, in5, out5, local3, .LLE6)  ! ivec
1781*2139Sjp161948
1782*2139Sjp161948	addcc	in2, -8, in2              ! bytes missing after next block
1783*2139Sjp161948
1784*2139Sjp161948#ifdef OPENSSL_SYSNAME_ULTRASPARC
1785*2139Sjp161948	bl,pn	%icc,  .ede3.enc.seven.or.less
1786*2139Sjp161948#else
1787*2139Sjp161948	bl	.ede3.enc.seven.or.less
1788*2139Sjp161948#endif
1789*2139Sjp161948	STPTR	in3, KS1
1790*2139Sjp161948
1791*2139Sjp161948.ede3.enc.next.block:
1792*2139Sjp161948
1793*2139Sjp161948	load_little_endian(in0, out4, global4, local3, .LLE7)
1794*2139Sjp161948
1795*2139Sjp161948.ede3.enc.next.block_1:
1796*2139Sjp161948
1797*2139Sjp161948	LDPTR	KS2, in4
1798*2139Sjp161948	xor	in5, out4, in5            ! iv xor
1799*2139Sjp161948	xor	out5, global4, out5       ! iv xor
1800*2139Sjp161948
1801*2139Sjp161948	LDPTR	KS1, in3
1802*2139Sjp161948	add	in4, 120, in4             ! for decryption we use last subkey first
1803*2139Sjp161948	nop
1804*2139Sjp161948
1805*2139Sjp161948	ip_macro(in5, out5, in5, out5, in3)
1806*2139Sjp161948
1807*2139Sjp161948.ede3.enc.next.block_2:
1808*2139Sjp161948	! Encrypt with ks1, decrypt with ks2, then encrypt with ks3.
1809*2139Sjp161948	call .des_enc                     ! ks1 in3
1810*2139Sjp161948	nop
1811*2139Sjp161948
1812*2139Sjp161948	call .des_dec                     ! ks2 in4
1813*2139Sjp161948	LDPTR	KS3, in3
1814*2139Sjp161948
1815*2139Sjp161948	call .des_enc                     ! ks3 in3  compares in2 to 8
1816*2139Sjp161948	nop
1817*2139Sjp161948
1818*2139Sjp161948#ifdef OPENSSL_SYSNAME_ULTRASPARC
1819*2139Sjp161948	bl,pn	%icc, .ede3.enc.next.block_fp
1820*2139Sjp161948#else
1821*2139Sjp161948	bl	.ede3.enc.next.block_fp
1822*2139Sjp161948#endif
1823*2139Sjp161948	add	in0, 8, in0
1824*2139Sjp161948
1825*2139Sjp161948	! If 8 or more bytes are to be encrypted after this block,
1826*2139Sjp161948	! we combine final permutation for this block with initial
1827*2139Sjp161948	! permutation for next block. Load next block:
1828*2139Sjp161948
1829*2139Sjp161948	load_little_endian(in0, global3, global4, local5, .LLE11)
1830*2139Sjp161948
1831*2139Sjp161948	!  parameter 1   original left
1832*2139Sjp161948	!  parameter 2   original right
1833*2139Sjp161948	!  parameter 3   left ip
1834*2139Sjp161948	!  parameter 4   right ip
1835*2139Sjp161948	!  parameter 5   1: load ks1/ks2 to in3/in4, add 120 to in4
1836*2139Sjp161948	!                2: mov in4 to in3
1837*2139Sjp161948	!
1838*2139Sjp161948	! also adds -8 to length in2 and loads loop counter to out4
1839*2139Sjp161948
1840*2139Sjp161948	fp_ip_macro(out0, out1, global3, global4, 1)
1841*2139Sjp161948
1842*2139Sjp161948	store_little_endian(in1, out0, out1, local3, .SLE9)  ! block
1843*2139Sjp161948
1844*2139Sjp161948	mov 	in5, local1
1845*2139Sjp161948	xor	global3, out5, in5        ! iv xor next block
1846*2139Sjp161948
1847*2139Sjp161948	ld	[in3], out0               ! key 7531
1848*2139Sjp161948	add	global1, 512, global3     ! address sbox 3
1849*2139Sjp161948	xor	global4, local1, out5     ! iv xor next block
1850*2139Sjp161948
1851*2139Sjp161948	ld	[in3+4], out1             ! key 8642
1852*2139Sjp161948	add	global1, 768, global4     ! address sbox 4
1853*2139Sjp161948	ba	.ede3.enc.next.block_2
1854*2139Sjp161948	add	in1, 8, in1
1855*2139Sjp161948
1856*2139Sjp161948.ede3.enc.next.block_fp:
1857*2139Sjp161948	! Fewer than 8 bytes follow: final permutation only, then store the block.
1858*2139Sjp161948	fp_macro(in5, out5)
1859*2139Sjp161948
1860*2139Sjp161948	store_little_endian(in1, in5, out5, local3, .SLE5)  ! block
1861*2139Sjp161948
1862*2139Sjp161948	addcc   in2, -8, in2              ! bytes missing when next block done
1863*2139Sjp161948
1864*2139Sjp161948#ifdef OPENSSL_SYSNAME_ULTRASPARC
1865*2139Sjp161948	bpos,pt	%icc, .ede3.enc.next.block
1866*2139Sjp161948#else
1867*2139Sjp161948	bpos	.ede3.enc.next.block
1868*2139Sjp161948#endif
1869*2139Sjp161948	add	in1, 8, in1
1870*2139Sjp161948
1871*2139Sjp161948.ede3.enc.seven.or.less:
1872*2139Sjp161948	! in2 is the remaining byte count minus 8 here, so -8 or less means done.
1873*2139Sjp161948	cmp	in2, -8
1874*2139Sjp161948
1875*2139Sjp161948#ifdef OPENSSL_SYSNAME_ULTRASPARC
1876*2139Sjp161948	ble,pt	%icc, .ede3.enc.finish
1877*2139Sjp161948#else
1878*2139Sjp161948	ble	.ede3.enc.finish
1879*2139Sjp161948#endif
1880*2139Sjp161948	nop
1881*2139Sjp161948
1882*2139Sjp161948	add	in2, 8, local1            ! bytes to load
1883*2139Sjp161948
1884*2139Sjp161948	! addr, length, dest left, dest right, temp, temp2, label, ret label
1885*2139Sjp161948	load_n_bytes(in0, local1, global4, out4, local2, local3, .LNB2, .ede3.enc.next.block_1)
1886*2139Sjp161948
1887*2139Sjp161948.ede3.enc.finish:
1888*2139Sjp161948	! Write the chaining value held in in5/out5 back through the ivec pointer.
1889*2139Sjp161948	LDPTR	[%fp+BIAS+ARG0+6*ARGSZ], local4          ! ivec
1890*2139Sjp161948	store_little_endian(local4, in5, out5, local5, .SLE6)  ! ivec
1891*2139Sjp161948
1892*2139Sjp161948	ret
1893*2139Sjp161948	restore
1894*2139Sjp161948
1895*2139Sjp161948.ede3.dec:
1896*2139Sjp161948	! Decrypt path: ks1 and ks3 get 120 added, decryption uses last subkey first.
1897*2139Sjp161948	STPTR	in0, INPUT
1898*2139Sjp161948	add	in5, 120, in5
1899*2139Sjp161948
1900*2139Sjp161948	STPTR	in1, OUTPUT
1901*2139Sjp161948	mov	in0, local5
1902*2139Sjp161948	add	in3, 120, in3
1903*2139Sjp161948
1904*2139Sjp161948	STPTR	in3, KS1
1905*2139Sjp161948	cmp	in2, 0
1906*2139Sjp161948	! A length of zero or less is the unlikely case, so predict not taken.
1907*2139Sjp161948#ifdef OPENSSL_SYSNAME_ULTRASPARC
1908*2139Sjp161948	ble,pn	%icc, .ede3.dec.finish
1909*2139Sjp161948#else
1910*2139Sjp161948	ble	.ede3.dec.finish
1911*2139Sjp161948#endif
1912*2139Sjp161948	STPTR	in5, KS3
1913*2139Sjp161948
1914*2139Sjp161948	LDPTR	[%fp+BIAS+ARG0+6*ARGSZ], local7          ! iv
1915*2139Sjp161948	load_little_endian(local7, in0, in1, local3, .LLE8)
1916*2139Sjp161948
1917*2139Sjp161948.ede3.dec.next.block:
1918*2139Sjp161948
1919*2139Sjp161948	load_little_endian(local5, in5, out5, local3, .LLE9)
1920*2139Sjp161948
1921*2139Sjp161948	! parameter 6  1/2 for include encryption/decryption
1922*2139Sjp161948	! parameter 7  1 for mov in1 to in3
1923*2139Sjp161948	! parameter 8  1 for mov in3 to in4
1924*2139Sjp161948	! parameter 9  1 for load ks3 and ks2 to in4 and in3
1925*2139Sjp161948
1926*2139Sjp161948	ip_macro(in5, out5, out5, in5, in4, 2, 0, 0, 1) ! inc .des_dec ks3 in4
1927*2139Sjp161948
1928*2139Sjp161948	call .des_enc                     ! ks2 in3
1929*2139Sjp161948	LDPTR	KS1, in4
1930*2139Sjp161948
1931*2139Sjp161948	call .des_dec                     ! ks1 in4
1932*2139Sjp161948	nop
1933*2139Sjp161948
1934*2139Sjp161948	fp_macro(out5, in5, 0, 1)   ! 1 for input and output address local5/7
1935*2139Sjp161948
1936*2139Sjp161948	! in2 is bytes left to be stored
1937*2139Sjp161948	! in2 is compared to 8 in the rounds
1938*2139Sjp161948
1939*2139Sjp161948	xor	out5, in0, out4
1940*2139Sjp161948#ifdef OPENSSL_SYSNAME_ULTRASPARC
1941*2139Sjp161948	bl,pn	%icc, .ede3.dec.seven.or.less
1942*2139Sjp161948#else
1943*2139Sjp161948	bl	.ede3.dec.seven.or.less
1944*2139Sjp161948#endif
1945*2139Sjp161948	xor	in5, in1, global4
1946*2139Sjp161948	! The next iv is loaded before the store below overwrites the output block.
1947*2139Sjp161948	load_little_endian_inc(local5, in0, in1, local3, .LLE10)   ! iv next block
1948*2139Sjp161948
1949*2139Sjp161948	store_little_endian(local7, out4, global4, local3, .SLE7)  ! block
1950*2139Sjp161948
1951*2139Sjp161948	STPTR	local5, INPUT
1952*2139Sjp161948	addcc   in2, -8, in2
1953*2139Sjp161948	add	local7, 8, local7
1954*2139Sjp161948
1955*2139Sjp161948#ifdef OPENSSL_SYSNAME_ULTRASPARC
1956*2139Sjp161948	bg,pt	%icc, .ede3.dec.next.block
1957*2139Sjp161948#else
1958*2139Sjp161948	bg	.ede3.dec.next.block
1959*2139Sjp161948#endif
1960*2139Sjp161948	STPTR	local7, OUTPUT
1961*2139Sjp161948
1962*2139Sjp161948.ede3.dec.store.iv:
1963*2139Sjp161948
1964*2139Sjp161948	LDPTR	[%fp+BIAS+ARG0+6*ARGSZ], local4          ! ivec
1965*2139Sjp161948	store_little_endian(local4, in0, in1, local5, .SLE8)  ! ivec
1966*2139Sjp161948
1967*2139Sjp161948.ede3.dec.finish:
1968*2139Sjp161948
1969*2139Sjp161948	ret
1970*2139Sjp161948	restore
1971*2139Sjp161948
1972*2139Sjp161948.ede3.dec.seven.or.less:
1973*2139Sjp161948	! Fewer than 8 bytes are left to store: fetch the next iv, store in2 bytes.
1974*2139Sjp161948	load_little_endian_inc(local5, in0, in1, local3, .LLE14)     ! iv
1975*2139Sjp161948
1976*2139Sjp161948	store_n_bytes(local7, in2, global4, out4, local3, local4, .SNB2, .ede3.dec.store.iv)
1977*2139Sjp161948
1978*2139Sjp161948
1979*2139Sjp161948.DES_ede3_cbc_encrypt.end:
1980*2139Sjp161948	.size	 DES_ede3_cbc_encrypt,.DES_ede3_cbc_encrypt.end-DES_ede3_cbc_encrypt
1981