xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/arm/v5/mod_1_2.asm (revision 7e30e94394d0994ab9534f68a8f91665045c91ce)
1dnl  ARM mpn_mod_1s_2p
2
3dnl  Contributed to the GNU project by Torbjorn Granlund.
4
5dnl  Copyright 2012 Free Software Foundation, Inc.
6
7dnl  This file is part of the GNU MP Library.
8
9dnl  The GNU MP Library is free software; you can redistribute it and/or modify
10dnl  it under the terms of the GNU Lesser General Public License as published
11dnl  by the Free Software Foundation; either version 3 of the License, or (at
12dnl  your option) any later version.
13
14dnl  The GNU MP Library is distributed in the hope that it will be useful, but
15dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
16dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
17dnl  License for more details.
18
19dnl  You should have received a copy of the GNU Lesser General Public License
20dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
21
22include(`../config.m4')
23
24C	     cycles/limb
25C StrongARM	 ?
26C XScale	 ?
27C Cortex-A8	 ?
28C Cortex-A9	 4.25
29C Cortex-A15	 3
30
C Symbolic names for the four argument registers of mpn_mod_1s_2p.  The
C function body below uses ap and n by name; the divisor and the cps
C pointer are referenced there directly as r2 and r3.
31define(`ap', `r0')
32define(`n',  `r1')
33define(`d',  `r2')
34define(`cps',`r3')
35
36ASM_START()
C mpn_mod_1s_2p
C
C Folds the n limbs ending at ap + 4*n, two limbs per step starting from
C the most significant end, into a two-limb value by multiplying with the
C constants B1, B2, B3 from the cps table, then reduces that value with a
C single multiply-by-inverse division step.
C
C In:   r0 = ap   pointer to the least significant limb
C       r1 = n    limb count; n = 0 is not handled (it would take the even
C                 path and read two limbs below ap)
C       r2 = d    divisor.  NOTE(review): appears to be expected already
C                 shifted left by cnt, since the result is shifted right by
C                 cnt before returning - confirm against callers.
C       r3 = cps  table of five words: [0] multiplier for the division step
C                 (the mpn_invert_limb result stored by mpn_mod_1s_2p_cps),
C                 [1] cnt, [2] B1, [3] B2, [4] B3
C Out:  r0 = value left after the division step, shifted right by cnt
C Uses: r4-r10 (saved and restored here), r12 and the flags
37PROLOGUE(mpn_mod_1s_2p)
38	push	{r4-r10}
39	tst	n, #1			C parity of n, consumed by the beq below
40	add	r7, r3, #8		C r7 = address of cps[2]
41	ldmia	r7, {r7, r8, r12}	C load B1, B2, B3
42	add	ap, ap, n, lsl #2	C put ap at operand end
43	beq	L(evn)
44
C Odd n: n == 1 is handled at L(1).  Otherwise peel the top three limbs
C (r4 = lowest address, r6 = middle, r9 = highest) and fold them into the
C two-limb accumulator r10:r4 = r4 + r6*B1 + r9*B2.
45L(odd):	subs	n, n, #1
46	beq	L(1)
47	ldmdb	ap!, {r4,r6,r9}
48	mov	r10, #0
49	umlal	r4, r10, r6, r7
50	umlal	r4, r10, r9, r8
51	b	L(com)
52
C Even n: the top two limbs form the initial accumulator r10:r4 directly
C (r4 = lower address, r10 = higher address).
53L(evn):	ldmdb	ap!, {r4,r10}
C If no limbs remain (n - 2 == 0) go straight to the final fold with the
C accumulator in r10:r4; otherwise load the next pair (r5 = lower,
C r6 = higher) and enter the loop at its counter update.
54L(com):	subs	n, n, #2
55	ble	L(end)
56	ldmdb	ap!, {r5,r6}
57	b	L(mid)
58
C Main loop, unrolled twice; the accumulator alternates between r10:r4 and
C r9:r5.  Each half computes
C   new = low_limb + high_limb*B1 + acc_low*B2 + acc_high*B3
C and loads the following limb pair in the middle of the multiply chain.
C On entry to L(top): accumulator r10:r4, current limb pair r5 (low), r6.
59L(top):	mov	r9, #0
60	umlal	r5, r9, r6, r7		C B1
61	umlal	r5, r9, r4, r8		C B2
62	ldmdb	ap!, {r4,r6}
63	umlal	r5, r9, r10, r12	C B3
C Flags here are still those of the subs at L(mid): n == 0 means the pair
C just loaded into r4, r6 is the last one; it is folded at L(xit).
64	ble	L(xit)
65	mov	r10, #0
66	umlal	r4, r10, r6, r7		C B1
67	umlal	r4, r10, r5, r8		C B2
68	ldmdb	ap!, {r5,r6}
69	umlal	r4, r10, r9, r12	C B3
70L(mid):	subs	n, n, #4
71	bge	L(top)
72
C Loop exit with n < 0: the last pair is in r5, r6 and the accumulator in
C r10:r4.  Fold once more into r9:r5 and move the low limb to r4.
73	mov	r9, #0
74	umlal	r5, r9, r6, r7		C B1
75	umlal	r5, r9, r4, r8		C B2
76	umlal	r5, r9, r10, r12	C B3
77	mov	r4, r5
78
C Final fold.  The two-limb accumulator has its low limb in r4; its high
C limb is in r9 on the fall-through path (flags LT) and in r10 when
C arriving from L(com) or L(xit) (n == 0, flags GE), hence the movge.
79L(end):	movge	   r9, r10		C executed when arriving via L(com) or L(xit), not on fall-through
80	ldr	r6, [r3, #4]		C cps[1] = cnt
81	mov	r5, #0
82	umlal	r4, r5, r9, r7		C r5:r4 = r4 + r9*B1
83	mov	r7, r5, lsl r6		C r7 = high limb << cnt
C Shift the two-limb value left by cnt into r8 (high) : r9 (low).  For
C cnt == 0 the shift amount in r1 is 32; a register-specified LSR by 32
C yields 0 on ARM, so r8 is then just r7.
84L(x):	rsb	r1, r6, #32
85	orr	r8, r7, r4, lsr r1
86	mov	r9, r4, lsl r6
C Division step using the multiplier cps[0]:
C   r1:r12 = r8 * cps[0]
C   r4     = r12 + r9               (low word, carry out kept)
C   r1     = r1 + r8 + 1 + carry    (quotient estimate)
C   r9     = r9 - d * r1            (candidate remainder, mod 2^32)
C followed by two conditional corrections by d.
87	ldr	r5, [r3, #0]
88	add	r0, r8, #1
89	umull	r12, r1, r8, r5
90	adds	r4, r12, r9
91	adc	r1, r1, r0
92	mul	r5, r2, r1
93	sub	r9, r9, r5
94	cmp	r9, r4
95	addhi	r9, r9, r2		C candidate > r4 (unsigned): add d back
96	cmp	r2, r9
97	subls	r9, r9, r2		C d <= candidate: subtract d
98	mov	r0, r9, lsr r6		C return value = r9 >> cnt
99	pop	{r4-r10}
100	bx	r14
101
C Last limb pair (r4 low, r6 high) loaded with n == 0 inside the loop:
C fold it with the accumulator r9:r5 into r10:r4 and finish at L(end).
102L(xit):	mov	r10, #0
103	umlal	r4, r10, r6, r7		C B1
104	umlal	r4, r10, r5, r8		C B2
105	umlal	r4, r10, r9, r12	C B3
106	b	L(end)
107
C n == 1: no folding needed.  ap was advanced by one limb above, so
C [ap, #-4] is the only limb.  Enter the shift/divide code with the high
C part r7 = 0.
108L(1):	ldr	r6, [r3, #4]		C cps[1] = cnt
109	ldr	r4, [ap, #-4]		C ap[0]
110	mov	r7, #0
111	b	L(x)
112EPILOGUE()
113
C mpn_mod_1s_2p_cps
C
C Fills the five-word table that mpn_mod_1s_2p reads through its cps
C argument.
C
C In:   r0 = cps  pointer to the five-word table to fill
C       r1 = b    divisor.  NOTE(review): b == 0 is presumably excluded by
C                 callers (clz would give 32) - confirm.
C Out:  cps[0] = value returned by mpn_invert_limb for b << cnt
C       cps[1] = cnt, the number of leading zero bits of b
C       cps[2], cps[3], cps[4] = the three constants computed below, each
C                 shifted right by cnt (read as B1, B2, B3 by mpn_mod_1s_2p)
C Uses: r4-r8 (saved and restored here), r0-r3, r12, flags, and whatever
C       mpn_invert_limb clobbers
114PROLOGUE(mpn_mod_1s_2p_cps)
115	push	{r4-r8, r14}		C r14 saved because of the bl below
116	clz	r4, r1			C r4 = cnt
117	mov	r5, r1, lsl r4		C b <<= cnt
118	mov	r6, r0			C r6 = cps
C r4, r5 and r6 must survive the call; this relies on mpn_invert_limb
C preserving them (they are callee-saved in the ARM procedure call
C standard).
119	mov	r0, r5
120	bl	mpn_invert_limb		C r0 = inverse of the shifted b
C r3 = (inverse >> (32 - cnt)) | (1 << cnt).  For cnt == 0 the shift amount
C is 32; a register-specified LSR by 32 yields 0 on ARM, leaving r3 = 1.
121	rsb	r3, r4, #32
122	mov	r3, r0, lsr r3
123	mov	r2, #1
124	orr	r3, r3, r2, lsl r4
C First constant (unshifted): r2 = -b * r3, low 32 bits.
125	rsb	r1, r5, #0
126	mul	r2, r1, r3
C Second constant (unshifted), derived from r2 with the inverse:
C   r12:r3 = r2 * inverse;  r1 = b * ~(r2 + r12);  if r1 > r3 then r1 += b
127	umull	r3, r12, r2, r0
128	add	r12, r2, r12
129	mvn	r12, r12
130	mul	r1, r5, r12
131	cmp	r1, r3
132	addhi	r1, r1, r5
C Third constant (unshifted): the same step applied to r1, result in r3.
133	umull	r12, r7, r1, r0
134	add	r7, r1, r7
135	mvn	r7, r7
136	mul	r3, r5, r7
137	cmp	r3, r12
138	addhi	r3, r3, r5
C Shift the three constants right by cnt and store the table in order:
C inverse, cnt, first, second, third.
139	mov	r5, r2, lsr r4
140	mov	r7, r1, lsr r4
141	mov	r8, r3, lsr r4
142	stmia	r6, {r0,r4,r5,r7,r8}	C fill cps
143	pop	{r4-r8, pc}		C restore and return
144EPILOGUE()
145