@ Source: /minix3/crypto/external/bsd/openssl/lib/libcrypto/arch/arm/armv4-mont.S (revision 0a6a1f1d05b60e214de2f05a7310ddd1f0e590e7)
#include "arm_arch.h"
#include "arm_asm.h"

.text
.code	32

#if __ARM_MAX_ARCH__>=7
@ Link-time distance from bn_mul_mont to the OPENSSL_armcap_P word.
@ bn_mul_mont adds this value to its own run-time address (taken with
@ adr) to reach OPENSSL_armcap_P, so no absolute address is embedded
@ in the code.
.align	5
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-bn_mul_mont
#endif
12*0a6a1f1dSLionel Sambuc
@-----------------------------------------------------------------------
@ bn_mul_mont: Montgomery multiplication on vectors of 32-bit words.
@
@ In:    r0       = rp, destination vector (num words)
@        r1       = ap, r2 = bp, r3 = np (num words each)
@        [sp]     = pointer to n0
@        [sp,#4]  = num
@        (presumably the C-level bn_mul_mont(rp,ap,bp,np,n0,num);
@         confirm against the C declaration)
@ Out:   r0 = 0 if num < 2 (nothing is pushed, read or written),
@        r0 = 1 after the integer path has filled rp[0..num-1].
@        If num is a multiple of 8 and bit 0 of OPENSSL_armcap_P is
@        set, control is tail-branched to bn_mul8x_mont_neon with the
@        original arguments, and that routine returns to the caller.
@ Saves: r4-r12 and lr are pushed and restored.
@ Clobb: r0-r3, ip, flags.
@ Stack: 2 + 10 words of saved registers plus a (num+1)-word scratch
@        vector tp[]; tp[0..num-1] is overwritten before returning.
@        The two incoming stack argument slots are reused: the n0 slot
@        receives the value *n0, the num slot receives &bp[num-1].
@
@ While r0 = &tp[num-1], the frame is addressed as
@        [r0,#12*4] saved rp        [r0,#13*4] saved / running bp
@        [r0,#14*4] n0 slot         [r0,#15*4] num slot
@
@ Register use in the loops: r1 = ap cursor, r3 = np cursor,
@ r4 = tp cursor (or bp cursor between passes), r2 = bp[i],
@ r5 = ap[j], r6 = np[j], r7 = tp[j+1], r8 = n0 and then tp[0]*n0,
@ r10:r11 = ap*bp accumulator, r12:r14 = np*n0 accumulator.
@-----------------------------------------------------------------------
.global	bn_mul_mont
.type	bn_mul_mont,%function

.align	5
bn_mul_mont:
	ldr	ip,[sp,#4]		@ load num
	@ Reject num < 2 before anything else.  Doing this ahead of the
	@ NEON dispatch matters: 0 and negative multiples of 8 also
	@ satisfy the "num % 8 == 0" test below and must never reach the
	@ NEON routine, whose loop counters assume num >= 8.
	cmp	ip,#2
	movlt	r0,#0
	blt	.Labrt			@ nothing pushed yet, plain return
	stmdb	sp!,{r0,r2}		@ sp points at argument block
#if __ARM_MAX_ARCH__>=7
	tst	ip,#7
	bne	.Lialu			@ num not a multiple of 8
	adr	r0,bn_mul_mont
	ldr	r2,.LOPENSSL_armcap
	ldr	r0,[r0,r2]		@ r0 = OPENSSL_armcap_P
	tst	r0,#1			@ NEON available?
	ldmia	sp, {r0,r2}		@ reload rp and bp, flags untouched
	beq	.Lialu
	add	sp,sp,#8		@ drop {r0,r2}, sp is as on entry
	b	bn_mul8x_mont_neon
.align	4
.Lialu:
#endif
	mov	r0,ip			@ load num

	stmdb	sp!,{r4-r12,lr}		@ save 10 registers

	mov	r0,r0,lsl#2		@ rescale r0 for byte count
	sub	sp,sp,r0		@ alloca(4*num)
	sub	sp,sp,#4		@ +extra dword
	sub	r0,r0,#4		@ "num=num-1"
	add	r4,r2,r0		@ &bp[num-1]

	add	r0,sp,r0		@ r0 to point at &tp[num-1]
	ldr	r8,[r0,#14*4]		@ &n0
	ldr	r2,[r2]			@ bp[0]
	ldr	r5,[r1],#4		@ ap[0],ap++
	ldr	r6,[r3],#4		@ np[0],np++
	ldr	r8,[r8]			@ *n0
	str	r4,[r0,#15*4]		@ save &bp[num-1], outer-loop end mark

	umull	r10,r11,r5,r2		@ ap[0]*bp[0]
	str	r8,[r0,#14*4]		@ save n0 value
	mul	r8,r10,r8		@ "tp[0]"*n0
	mov	r12,#0
	umlal	r10,r12,r6,r8		@ np[0]*n0+"t[0]"
	mov	r4,sp

	@ First pass (i = 0): tp[] = (ap[]*bp[0] + np[]*m) >> 32.
.L1st:
	ldr	r5,[r1],#4		@ ap[j],ap++
	mov	r10,r11
	ldr	r6,[r3],#4		@ np[j],np++
	mov	r11,#0
	umlal	r10,r11,r5,r2		@ ap[j]*bp[0]
	mov	r14,#0
	umlal	r12,r14,r6,r8		@ np[j]*n0
	adds	r12,r12,r10
	str	r12,[r4],#4		@ tp[j-1]=,tp++
	adc	r12,r14,#0
	cmp	r4,r0
	bne	.L1st

	adds	r12,r12,r11
	ldr	r4,[r0,#13*4]		@ restore bp
	mov	r14,#0
	ldr	r8,[r0,#14*4]		@ restore n0
	adc	r14,r14,#0
	str	r12,[r0]		@ tp[num-1]=
	str	r14,[r0,#4]		@ tp[num]=

	@ Remaining passes (i = 1..num-1), each folding bp[i] into tp[].
.Louter:
	sub	r7,r0,sp		@ "original" r0-1 value
	sub	r1,r1,r7		@ "rewind" ap to &ap[1]
	ldr	r2,[r4,#4]!		@ *(++bp)
	sub	r3,r3,r7		@ "rewind" np to &np[1]
	ldr	r5,[r1,#-4]		@ ap[0]
	ldr	r10,[sp]		@ tp[0]
	ldr	r6,[r3,#-4]		@ np[0]
	ldr	r7,[sp,#4]		@ tp[1]

	mov	r11,#0
	umlal	r10,r11,r5,r2		@ ap[0]*bp[i]+tp[0]
	str	r4,[r0,#13*4]		@ save bp
	mul	r8,r10,r8
	mov	r12,#0
	umlal	r10,r12,r6,r8		@ np[0]*n0+"tp[0]"
	mov	r4,sp

.Linner:
	ldr	r5,[r1],#4		@ ap[j],ap++
	adds	r10,r11,r7		@ +=tp[j]
	ldr	r6,[r3],#4		@ np[j],np++
	mov	r11,#0
	umlal	r10,r11,r5,r2		@ ap[j]*bp[i]
	mov	r14,#0
	umlal	r12,r14,r6,r8		@ np[j]*n0
	adc	r11,r11,#0		@ carry of the adds above, still live
	ldr	r7,[r4,#8]		@ tp[j+1]
	adds	r12,r12,r10
	str	r12,[r4],#4		@ tp[j-1]=,tp++
	adc	r12,r14,#0
	cmp	r4,r0
	bne	.Linner

	adds	r12,r12,r11
	mov	r14,#0
	ldr	r4,[r0,#13*4]		@ restore bp
	adc	r14,r14,#0
	ldr	r8,[r0,#14*4]		@ restore n0
	adds	r12,r12,r7
	ldr	r7,[r0,#15*4]		@ restore &bp[num-1]
	adc	r14,r14,#0
	str	r12,[r0]		@ tp[num-1]=
	str	r14,[r0,#4]		@ tp[num]=

	cmp	r4,r7			@ was that bp[num-1]?
	bne	.Louter

	ldr	r2,[r0,#12*4]		@ pull rp
	add	r0,r0,#4		@ r0 to point at &tp[num]
	sub	r5,r0,sp		@ "original" num value
	mov	r4,sp			@ "rewind" r4
	mov	r1,r4			@ "borrow" r1
	sub	r3,r3,r5		@ "rewind" r3 to &np[0]

	@ rp[] = tp[] - np[].  sbcs subtracts the inverse of C, so the
	@ chain is started with C = 1, meaning no borrow is pending.
	subs	r7,r7,r7		@ r7 = 0, C = 1
.Lsub:	ldr	r7,[r4],#4
	ldr	r6,[r3],#4
	sbcs	r7,r7,r6		@ tp[j]-np[j]
	str	r7,[r2],#4		@ rp[j]=
	teq	r4,r0			@ preserve carry
	bne	.Lsub
	sbcs	r14,r14,#0		@ upmost carry: tp[num] - borrow
	mov	r4,sp			@ "rewind" r4
	sub	r2,r2,r5		@ "rewind" r2

	@ Branch-free source select; r14 is used as a mask and is
	@ expected to be either 0 or all ones at this point.
	and	r1,r4,r14
	bic	r3,r2,r14
	orr	r1,r1,r3		@ ap=borrow?tp:rp

.Lcopy:	ldr	r7,[r1],#4		@ copy or in-place refresh
	str	sp,[r4],#4		@ zap tp
	str	r7,[r2],#4
	cmp	r4,r0
	bne	.Lcopy

	add	sp,r0,#4		@ skip over tp[num+1]
	ldmia	sp!,{r4-r12,lr}		@ restore registers
	add	sp,sp,#2*4		@ skip over {r0,r2}
	mov	r0,#1
.Labrt:
#if __ARM_ARCH__>=5
	RET				@ .word	0xe12fff1e
#else
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
#endif
.size	bn_mul_mont,.-bn_mul_mont
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

@-----------------------------------------------------------------------
@ bn_mul8x_mont_neon: NEON Montgomery multiplication, 8 words per step.
@ Not exported; reached by a tail branch from bn_mul_mont, which only
@ comes here when num is a non-zero multiple of 8.
@
@ In:    r0       = rp, destination vector (num words)
@        r1       = ap, r2 = bp, r3 = np (num 32-bit words each)
@        [sp]     = pointer to n0
@        [sp,#4]  = num
@ Out:   r0 = 1; rp[0..num-1] holds the result.
@ Saves: r4-r11 and d8-d15 are pushed and restored.
@ Clobb: r0-r3, ip, q0-q3, q8-q15, flags.
@ Stack: at least 16*num+16 bytes of scratch below the saved
@        registers, rounded down to a 64-byte boundary and wiped
@        with zeros before returning.
@
@ Register use: r4 = n0 pointer, r5 = num, r6 = scratch read cursor,
@ r7 = scratch write cursor, r8 = inner counter, r9 = outer counter,
@ ip = entry sp.  d0-d3 = eight words of ap, d4-d7 = eight words of
@ np, d28 = bp[i] and d29 = per-pass multiplier, each split into
@ 16-bit halves by vzip.16 against a zero register, d30 = n0,
@ d8/q4 = zero, d10/q5 = carry scratch, q6-q13 = eight accumulators.
@-----------------------------------------------------------------------
.type	bn_mul8x_mont_neon,%function
.align	5
bn_mul8x_mont_neon:
	mov	ip,sp
	stmdb	sp!,{r4-r11}
	vstmdb	sp!,{d8-d15}		@ ABI specification says so
	ldmia	ip,{r4-r5}		@ load rest of parameter block

	sub		r7,sp,#16
	vld1.32		{d28[0]}, [r2,:32]!
	sub		r7,r7,r5,lsl#4
	vld1.32		{d0-d3},  [r1]!		@ cannot specify :32 :-(
	and		r7,r7,#-64
	vld1.32		{d30[0]}, [r4,:32]
	mov		sp,r7			@ alloca
	veor		d8,d8,d8
	subs		r8,r5,#8		@ flags consumed by bne below
	vzip.16		d28,d8

	vmull.u32	q6,d28,d0[0]
	vmull.u32	q7,d28,d0[1]
	vmull.u32	q8,d28,d1[0]
	vshl.i64	d10,d13,#16
	vmull.u32	q9,d28,d1[1]

	vadd.u64	d10,d10,d12
	veor		d8,d8,d8
	vmul.u32	d29,d10,d30

	vmull.u32	q10,d28,d2[0]
	vld1.32		{d4-d7}, [r3]!
	vmull.u32	q11,d28,d2[1]
	vmull.u32	q12,d28,d3[0]
	vzip.16		d29,d8
	vmull.u32	q13,d28,d3[1]

	bne	.LNEON_1st		@ num != 8

	@ special case for num=8, everything is in register bank...

	vmlal.u32	q6,d29,d4[0]
	sub		r9,r5,#1
	vmlal.u32	q7,d29,d4[1]
	vmlal.u32	q8,d29,d5[0]
	vmlal.u32	q9,d29,d5[1]

	vmlal.u32	q10,d29,d6[0]
	vmov		q5,q6
	vmlal.u32	q11,d29,d6[1]
	vmov		q6,q7
	vmlal.u32	q12,d29,d7[0]
	vmov		q7,q8
	vmlal.u32	q13,d29,d7[1]
	vmov		q8,q9
	vmov		q9,q10
	vshr.u64	d10,d10,#16
	vmov		q10,q11
	vmov		q11,q12
	vadd.u64	d10,d10,d11
	vmov		q12,q13
	veor		q13,q13
	vshr.u64	d10,d10,#16

	b	.LNEON_outer8

.align	4
.LNEON_outer8:
	vld1.32		{d28[0]}, [r2,:32]!
	veor		d8,d8,d8
	vzip.16		d28,d8
	vadd.u64	d12,d12,d10

	vmlal.u32	q6,d28,d0[0]
	vmlal.u32	q7,d28,d0[1]
	vmlal.u32	q8,d28,d1[0]
	vshl.i64	d10,d13,#16
	vmlal.u32	q9,d28,d1[1]

	vadd.u64	d10,d10,d12
	veor		d8,d8,d8
	subs		r9,r9,#1		@ flags consumed by bne below
	vmul.u32	d29,d10,d30

	vmlal.u32	q10,d28,d2[0]
	vmlal.u32	q11,d28,d2[1]
	vmlal.u32	q12,d28,d3[0]
	vzip.16		d29,d8
	vmlal.u32	q13,d28,d3[1]

	vmlal.u32	q6,d29,d4[0]
	vmlal.u32	q7,d29,d4[1]
	vmlal.u32	q8,d29,d5[0]
	vmlal.u32	q9,d29,d5[1]

	vmlal.u32	q10,d29,d6[0]
	vmov		q5,q6
	vmlal.u32	q11,d29,d6[1]
	vmov		q6,q7
	vmlal.u32	q12,d29,d7[0]
	vmov		q7,q8
	vmlal.u32	q13,d29,d7[1]
	vmov		q8,q9
	vmov		q9,q10
	vshr.u64	d10,d10,#16
	vmov		q10,q11
	vmov		q11,q12
	vadd.u64	d10,d10,d11
	vmov		q12,q13
	veor		q13,q13
	vshr.u64	d10,d10,#16

	bne	.LNEON_outer8

	vadd.u64	d12,d12,d10
	mov		r7,sp
	vshr.u64	d10,d12,#16
	mov		r8,r5
	vadd.u64	d13,d13,d10
	add		r6,sp,#16
	vshr.u64	d10,d13,#16
	vzip.16		d12,d13

	b	.LNEON_tail2

	@ General case, num > 8: first pass over ap[]/np[] in chunks of
	@ eight words, spilling the accumulators to the scratch area.
.align	4
.LNEON_1st:
	vmlal.u32	q6,d29,d4[0]
	vld1.32		{d0-d3}, [r1]!
	vmlal.u32	q7,d29,d4[1]
	subs		r8,r8,#8		@ flags consumed by bne below
	vmlal.u32	q8,d29,d5[0]
	vmlal.u32	q9,d29,d5[1]

	vmlal.u32	q10,d29,d6[0]
	vld1.32		{d4-d5}, [r3]!
	vmlal.u32	q11,d29,d6[1]
	vst1.64		{q6-q7}, [r7,:256]!
	vmlal.u32	q12,d29,d7[0]
	vmlal.u32	q13,d29,d7[1]
	vst1.64		{q8-q9}, [r7,:256]!

	vmull.u32	q6,d28,d0[0]
	vld1.32		{d6-d7}, [r3]!
	vmull.u32	q7,d28,d0[1]
	vst1.64		{q10-q11}, [r7,:256]!
	vmull.u32	q8,d28,d1[0]
	vmull.u32	q9,d28,d1[1]
	vst1.64		{q12-q13}, [r7,:256]!

	vmull.u32	q10,d28,d2[0]
	vmull.u32	q11,d28,d2[1]
	vmull.u32	q12,d28,d3[0]
	vmull.u32	q13,d28,d3[1]

	bne	.LNEON_1st

	vmlal.u32	q6,d29,d4[0]
	add		r6,sp,#16
	vmlal.u32	q7,d29,d4[1]
	sub		r1,r1,r5,lsl#2		@ rewind r1
	vmlal.u32	q8,d29,d5[0]
	vld1.64		{q5}, [sp,:128]
	vmlal.u32	q9,d29,d5[1]
	sub		r9,r5,#1

	vmlal.u32	q10,d29,d6[0]
	vst1.64		{q6-q7}, [r7,:256]!
	vmlal.u32	q11,d29,d6[1]
	vshr.u64	d10,d10,#16
	vld1.64		{q6},       [r6, :128]!
	vmlal.u32	q12,d29,d7[0]
	vst1.64		{q8-q9}, [r7,:256]!
	vmlal.u32	q13,d29,d7[1]

	vst1.64		{q10-q11}, [r7,:256]!
	vadd.u64	d10,d10,d11
	veor		q4,q4,q4
	vst1.64		{q12-q13}, [r7,:256]!
	vld1.64		{q7-q8}, [r6, :256]!
	vst1.64		{q4},          [r7,:128]
	vshr.u64	d10,d10,#16

	b		.LNEON_outer

	@ One pass per remaining word of bp[]; r9 counts the passes.
.align	4
.LNEON_outer:
	vld1.32		{d28[0]}, [r2,:32]!
	sub		r3,r3,r5,lsl#2		@ rewind r3
	vld1.32		{d0-d3},  [r1]!
	veor		d8,d8,d8
	mov		r7,sp
	vzip.16		d28,d8
	sub		r8,r5,#8
	vadd.u64	d12,d12,d10

	vmlal.u32	q6,d28,d0[0]
	vld1.64		{q9-q10},[r6,:256]!
	vmlal.u32	q7,d28,d0[1]
	vmlal.u32	q8,d28,d1[0]
	vld1.64		{q11-q12},[r6,:256]!
	vmlal.u32	q9,d28,d1[1]

	vshl.i64	d10,d13,#16
	veor		d8,d8,d8
	vadd.u64	d10,d10,d12
	vld1.64		{q13},[r6,:128]!
	vmul.u32	d29,d10,d30

	vmlal.u32	q10,d28,d2[0]
	vld1.32		{d4-d7}, [r3]!
	vmlal.u32	q11,d28,d2[1]
	vmlal.u32	q12,d28,d3[0]
	vzip.16		d29,d8
	vmlal.u32	q13,d28,d3[1]

.LNEON_inner:
	vmlal.u32	q6,d29,d4[0]
	vld1.32		{d0-d3}, [r1]!
	vmlal.u32	q7,d29,d4[1]
	subs		r8,r8,#8		@ flags consumed by bne below
	vmlal.u32	q8,d29,d5[0]
	vmlal.u32	q9,d29,d5[1]
	vst1.64		{q6-q7}, [r7,:256]!

	vmlal.u32	q10,d29,d6[0]
	vld1.64		{q6},       [r6, :128]!
	vmlal.u32	q11,d29,d6[1]
	vst1.64		{q8-q9}, [r7,:256]!
	vmlal.u32	q12,d29,d7[0]
	vld1.64		{q7-q8}, [r6, :256]!
	vmlal.u32	q13,d29,d7[1]
	vst1.64		{q10-q11}, [r7,:256]!

	vmlal.u32	q6,d28,d0[0]
	vld1.64		{q9-q10}, [r6, :256]!
	vmlal.u32	q7,d28,d0[1]
	vst1.64		{q12-q13}, [r7,:256]!
	vmlal.u32	q8,d28,d1[0]
	vld1.64		{q11-q12}, [r6, :256]!
	vmlal.u32	q9,d28,d1[1]
	vld1.32		{d4-d7}, [r3]!

	vmlal.u32	q10,d28,d2[0]
	vld1.64		{q13},       [r6, :128]!
	vmlal.u32	q11,d28,d2[1]
	vmlal.u32	q12,d28,d3[0]
	vmlal.u32	q13,d28,d3[1]

	bne	.LNEON_inner

	vmlal.u32	q6,d29,d4[0]
	add		r6,sp,#16
	vmlal.u32	q7,d29,d4[1]
	sub		r1,r1,r5,lsl#2		@ rewind r1
	vmlal.u32	q8,d29,d5[0]
	vld1.64		{q5}, [sp,:128]
	vmlal.u32	q9,d29,d5[1]
	subs		r9,r9,#1		@ flags consumed by bne below

	vmlal.u32	q10,d29,d6[0]
	vst1.64		{q6-q7}, [r7,:256]!
	vmlal.u32	q11,d29,d6[1]
	vld1.64		{q6},       [r6, :128]!
	vshr.u64	d10,d10,#16
	vst1.64		{q8-q9}, [r7,:256]!
	vmlal.u32	q12,d29,d7[0]
	vld1.64		{q7-q8}, [r6, :256]!
	vmlal.u32	q13,d29,d7[1]

	vst1.64		{q10-q11}, [r7,:256]!
	vadd.u64	d10,d10,d11
	vst1.64		{q12-q13}, [r7,:256]!
	vshr.u64	d10,d10,#16

	bne	.LNEON_outer

	mov		r7,sp
	mov		r8,r5

	@ Carry propagation: each 64-bit accumulator lane is reduced in
	@ 16-bit steps and one 32-bit word is stored per accumulator,
	@ eight words per trip; d10 carries into the next lane.
.LNEON_tail:
	vadd.u64	d12,d12,d10
	vld1.64		{q9-q10}, [r6, :256]!
	vshr.u64	d10,d12,#16
	vadd.u64	d13,d13,d10
	vld1.64		{q11-q12}, [r6, :256]!
	vshr.u64	d10,d13,#16
	vld1.64		{q13},       [r6, :128]!
	vzip.16		d12,d13

.LNEON_tail2:
	vadd.u64	d14,d14,d10
	vst1.32		{d12[0]}, [r7, :32]!
	vshr.u64	d10,d14,#16
	vadd.u64	d15,d15,d10
	vshr.u64	d10,d15,#16
	vzip.16		d14,d15

	vadd.u64	d16,d16,d10
	vst1.32		{d14[0]}, [r7, :32]!
	vshr.u64	d10,d16,#16
	vadd.u64	d17,d17,d10
	vshr.u64	d10,d17,#16
	vzip.16		d16,d17

	vadd.u64	d18,d18,d10
	vst1.32		{d16[0]}, [r7, :32]!
	vshr.u64	d10,d18,#16
	vadd.u64	d19,d19,d10
	vshr.u64	d10,d19,#16
	vzip.16		d18,d19

	vadd.u64	d20,d20,d10
	vst1.32		{d18[0]}, [r7, :32]!
	vshr.u64	d10,d20,#16
	vadd.u64	d21,d21,d10
	vshr.u64	d10,d21,#16
	vzip.16		d20,d21

	vadd.u64	d22,d22,d10
	vst1.32		{d20[0]}, [r7, :32]!
	vshr.u64	d10,d22,#16
	vadd.u64	d23,d23,d10
	vshr.u64	d10,d23,#16
	vzip.16		d22,d23

	vadd.u64	d24,d24,d10
	vst1.32		{d22[0]}, [r7, :32]!
	vshr.u64	d10,d24,#16
	vadd.u64	d25,d25,d10
	vld1.64		{q6}, [r6, :128]!
	vshr.u64	d10,d25,#16
	vzip.16		d24,d25

	vadd.u64	d26,d26,d10
	vst1.32		{d24[0]}, [r7, :32]!
	vshr.u64	d10,d26,#16
	vadd.u64	d27,d27,d10
	vld1.64		{q7-q8},	[r6, :256]!
	vshr.u64	d10,d27,#16
	vzip.16		d26,d27
	subs		r8,r8,#8
	vst1.32		{d26[0]}, [r7, :32]!

	bne	.LNEON_tail

	vst1.32	{d10[0]}, [r7, :32]		@ top-most bit
	sub	r3,r3,r5,lsl#2			@ rewind r3
	subs	r1,sp,#0			@ r1 = sp; also leaves C = 1,
						@ i.e. no borrow for sbcs
	add	r2,sp,r5,lsl#2			@ r2 = end of the num result words

	@ rp[] = tp[] - np[], four words per trip.  r4-r7 are reused as
	@ data registers from here on, so num is no longer held in r5.
.LNEON_sub:
	ldmia	r1!, {r4-r7}
	ldmia	r3!, {r8-r11}
	sbcs	r8, r4,r8
	sbcs	r9, r5,r9
	sbcs	r10,r6,r10
	sbcs	r11,r7,r11
	teq	r1,r2				@ preserves carry
	stmia	r0!, {r8-r11}
	bne	.LNEON_sub

	ldr	r10, [r1]				@ load top-most bit
	veor	q0,q0,q0
	sub	r11,r2,sp				@ this is num*4
	veor	q1,q1,q1
	mov	r1,sp
	sub	r0,r0,r11				@ rewind r0
	mov	r3,r2				@ second 3/4th of frame
	sbcs	r10,r10,#0				@ result is carry flag

	@ Eight words per trip: where the subtraction borrowed (carry
	@ clear) the tp[] words replace rp[], otherwise rp[] is written
	@ back unchanged.  The scratch area is overwritten with zeros.
.LNEON_copy_n_zap:
	ldmia	r1!, {r4-r7}
	ldmia	r0,  {r8-r11}
	movcc	r8, r4
	vst1.64	{q0-q1}, [r3,:256]!			@ wipe
	movcc	r9, r5
	movcc	r10,r6
	vst1.64	{q0-q1}, [r3,:256]!			@ wipe
	movcc	r11,r7
	ldmia	r1, {r4-r7}
	stmia	r0!, {r8-r11}
	sub	r1,r1,#16
	ldmia	r0, {r8-r11}
	movcc	r8, r4
	vst1.64	{q0-q1}, [r1,:256]!			@ wipe
	movcc	r9, r5
	movcc	r10,r6
	vst1.64	{q0-q1}, [r3,:256]!			@ wipe
	movcc	r11,r7
	teq	r1,r2				@ preserves carry
	stmia	r0!, {r8-r11}
	bne	.LNEON_copy_n_zap

	sub	sp,ip,#96			@ 96 = 8*4 (r4-r11) + 8*8 (d8-d15)
	vldmia	sp!,{d8-d15}
	ldmia	sp!,{r4-r11}
	mov	r0,#1				@ same success value as the integer path
	RET						@ .word	0xe12fff1e
.size	bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
#endif
@ Identification string, emitted in the current section right after
@ the code.
.asciz	"Montgomery multiplication for ARMv4/NEON, CRYPTOGAMS by <appro@openssl.org>"
.align	2
#if __ARM_MAX_ARCH__>=7
@ Capability word tested by bn_mul_mont (bit 0 selects the NEON path).
@ Declared as a 4-byte, 4-byte-aligned common symbol so that it merges
@ with the definition supplied elsewhere in the library.
.comm	OPENSSL_armcap_P,4,4
#endif