xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/arm/v6/addmul_3.asm (revision 72c7faa4dbb41dbb0238d6b4a109da0d4b236dd4)
1dnl  ARM mpn_addmul_3.
2
3dnl  Contributed to the GNU project by Torbjörn Granlund.
4
5dnl  Copyright 2013 Free Software Foundation, Inc.
6
7dnl  This file is part of the GNU MP Library.
8dnl
9dnl  The GNU MP Library is free software; you can redistribute it and/or modify
10dnl  it under the terms of either:
11dnl
12dnl    * the GNU Lesser General Public License as published by the Free
13dnl      Software Foundation; either version 3 of the License, or (at your
14dnl      option) any later version.
15dnl
16dnl  or
17dnl
18dnl    * the GNU General Public License as published by the Free Software
19dnl      Foundation; either version 2 of the License, or (at your option) any
20dnl      later version.
21dnl
22dnl  or both in parallel, as here.
23dnl
24dnl  The GNU MP Library is distributed in the hope that it will be useful, but
25dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
27dnl  for more details.
28dnl
29dnl  You should have received copies of the GNU General Public License and the
30dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
31dnl  see https://www.gnu.org/licenses/.
32
33include(`../config.m4')
34
35C	     cycles/limb
36C StrongARM:	 -
37C XScale	 -
38C ARM11		 4.33
39C Cortex-A5	 3.28
40C Cortex-A7	 3.25
41C Cortex-A8	 3.17
42C Cortex-A9	 2.125
43C Cortex-A15	 2
44C Cortex-A17	 2.11
45C Cortex-A53	 4.18
46
47C TODO
48C  * Use a fast path for n <= KARATSUBA_MUL_THRESHOLD using a jump table,
49C    avoiding the current multiply.
50C  * Start the first multiply or multiplies early.
51
C Register allocation.
C   rp, up, n, vp  the four incoming arguments; the function body uses r0-r3
C                  directly at entry without loading them from anywhere
C   v0, v1, v2     the three limbs fetched from vp by the ldm at function entry
C   u0, u1         current and next limb fetched from up; u0 shares r3 with vp,
C                  which is only used by that ldm, and u1 uses r14, which the
C                  prologue pushes
C   w0, w1, w2     limbs of rp being accumulated; their roles rotate from one
C                  loop stage to the next
C   cy0, cy1, cy2  carry limbs, one per multiplier limb, cleared at entry
52define(`rp',`r0')
53define(`up',`r1')
54define(`n', `r2')
55define(`vp',`r3')
56
57define(`v0',`r4')  define(`v1',`r5')  define(`v2',`r6')
58define(`u0',`r3')  define(`u1',`r14')
59define(`w0',`r7')  define(`w1',`r8')  define(`w2',`r9')
60define(`cy0',`r10')  define(`cy1',`r11') define(`cy2',`r12')
61
62
C mpn_addmul_3 (rp, up, n, vp)
C
C Multiplies the n-limb operand at up by the three limbs at vp and adds the
C product into the limbs at rp.  As coded below, n limbs are read from rp,
C n+2 limbs are written back to rp, and the one remaining most significant
C limb is returned in r0.
C
C The work is done by umaal, which computes RdHi:RdLo = Rn * Rm + RdLo + RdHi.
C One umaal therefore multiplies a u limb by a v limb and adds both the rp
C limb held in a w register and the incoming carry limb held in a cy register,
C leaving the low half in w and the outgoing carry in cy.
C
C NOTE(review): every entry path runs at least one loop stage plus the
C two-limb tail at L(end), and the entries for n mod 6 = 1 and 2 run five and
C six stages before the tail, so this appears to require n >= 3 -- confirm
C against the callers.
63ASM_START()
64PROLOGUE(mpn_addmul_3)
65	push	{ r4-r11, r14 }		C r14 is reused as u1; the saved copy is popped into pc on exit
66
67	ldr	w0, =0xaaaaaaab		C 3^{-1} mod 2^32
68	ldm	vp, { v0,v1,v2 }	C fetch the three multiplier limbs; r3 is free for u0 afterwards
69	mov	cy0, #0
70	mov	cy1, #0
71	mov	cy2, #0
72
C Dispatch on n mod 6 without a division.  Multiplying n by 3^{-1} mod 2^32
C gives floor(n/3) plus (n mod 3) * 0xaaaaaaab, so for moderate n bits 31:30
C are 00, 10 or 01 for n mod 3 = 0, 1 or 2, while bit 0 is still the parity
C of n because the inverse is odd.  The mask keeps exactly those three bits.
C The ror by 28 (a rotate left by 4) moves bits 30, 31 and 0 to bits 2, 3 and
C 4, which yields the byte offset 4 * (bit30 + 2*bit31 + 4*bit0) into the
C seven-entry table:
C   0 -> b0, 1 -> b2, 2 -> b4, 3 -> not produced, 4 -> b3, 5 -> b5, 6 -> b1
C In ARM state pc reads as the address of the current instruction plus 8,
C which is why a nop sits between the dispatching instruction and the table.
C The PIC variant jumps into a table of branch instructions; the non-PIC
C variant loads the target address from a table of words.
C n is also biased by -3 here so that the subs/bge pair at L(lo2) can close
C the loop with a plain sign test.
73C Tricky n mod 6
74	mul	w0, w0, n		C n * 3^{-1} mod 2^32
75	and	w0, w0, #0xc0000001	C pseudo-CRT mod 3,2
76	sub	n, n, #3
77ifdef(`PIC',`
78	add	pc, pc, w0, ror $28
79	nop
80	b	L(b0)
81	b	L(b2)
82	b	L(b4)
83	.word	0xe7f000f0	C udf
84	b	L(b3)
85	b	L(b5)
86	b	L(b1)
87',`
88	ldr	pc, [pc, w0, ror $28]
89	nop
90	.word	L(b0), L(b2), L(b4), 0, L(b3), L(b5), L(b1)
91')
92
C Entry stubs, one per residue of n mod 6.  Each stub biases rp and/or up so
C that the fixed offsets used at its loop entry point address the first
C limbs, then preloads the two rp limbs and the one up limb which that entry
C point expects to be live already.
93L(b5):	add	up, up, #-8
94	ldr	w1, [rp, #0]
95	ldr	w2, [rp, #4]
96	ldr	u1, [up, #8]
97	b	L(lo5)
98
99L(b4):	add	rp, rp, #-4
100	add	up, up, #-12
101	ldr	w2, [rp, #4]
102	ldr	w0, [rp, #8]
103	ldr	u0, [up, #12]
104	b	L(lo4)
105
106L(b3):	add	rp, rp, #-8
107	add	up, up, #-16
108	ldr	w0, [rp, #8]
109	ldr	w1, [rp, #12]
110	ldr	u1, [up, #16]
111	b	L(lo3)
112
113L(b1):	add	rp, rp, #8
114	ldr	w2, [rp, #-8]
115	ldr	w0, [rp, #-4]
116	ldr	u1, [up, #0]
117	b	L(lo1)
118
119L(b0):	add	rp, rp, #4
120	add	up, up, #-4
121	ldr	w0, [rp, #-4]
122	ldr	w1, [rp, #0]
123	ldr	u0, [up, #4]
124	b	L(lo0)
125
126L(b2):	add	rp, rp, #12
127	add	up, up, #4
128	ldr	w1, [rp, #-12]
129	ldr	w2, [rp, #-8]
130	ldr	u0, [up, #-4]
C No branch here: this stub falls through into L(top) and always runs one
C full pass of the loop before the tail.
131
C Main loop, six u limbs per pass.  Each six-instruction stage multiplies one
C u limb by v0, v1 and v2 and adds the products into three consecutive w
C limbs, with cy0, cy1 and cy2 carrying the high halves into the next stage.
C Interleaved with the multiplies, a stage loads the rp limb two places above
C the one it completes, loads the u limb for the next stage, and stores the
C completed limb.  The w and u registers rotate roles from stage to stage.
132	ALIGN(16)
133L(top):	ldr	w0, [rp, #-4]		C rp limb two places above w1
134	umaal	w1, cy0, u0, v0		C w1 is complete after this
135	ldr	u1, [up, #0]		C u limb for the next stage
136	umaal	w2, cy1, u0, v1
137	str	w1, [rp, #-12]		C store the completed limb
138	umaal	w0, cy2, u0, v2
C The remaining stages repeat the same pattern with the roles rotated.
139L(lo1):	ldr	w1, [rp, #0]
140	umaal	w2, cy0, u1, v0
141	ldr	u0, [up, #4]
142	umaal	w0, cy1, u1, v1
143	str	w2, [rp, #-8]
144	umaal	w1, cy2, u1, v2
145L(lo0):	ldr	w2, [rp, #4]
146	umaal	w0, cy0, u0, v0
147	ldr	u1, [up, #8]
148	umaal	w1, cy1, u0, v1
149	str	w0, [rp, #-4]
150	umaal	w2, cy2, u0, v2
151L(lo5):	ldr	w0, [rp, #8]
152	umaal	w1, cy0, u1, v0
153	ldr	u0, [up, #12]
154	umaal	w2, cy1, u1, v1
155	str	w1, [rp, #0]
156	umaal	w0, cy2, u1, v2
157L(lo4):	ldr	w1, [rp, #12]
158	umaal	w2, cy0, u0, v0
159	ldr	u1, [up, #16]
160	umaal	w0, cy1, u0, v1
161	str	w2, [rp, #4]
162	umaal	w1, cy2, u0, v2
163L(lo3):	ldr	w2, [rp, #16]
164	umaal	w0, cy0, u1, v0
165	ldr	u0, [up, #20]		C first u limb for L(top) or L(end)
166	umaal	w1, cy1, u1, v1
167	str	w0, [rp, #8]
168	umaal	w2, cy2, u1, v2
C Loop control.  No branch in this routine targets L(lo2).  n was biased by -3
C at entry, so the result stays non-negative exactly while another full pass
C followed by the two-limb tail is still needed.
169L(lo2):	subs	n, n, #6
170	add	up, up, #24
171	add	rp, rp, #24
172	bge	L(top)
173
C Tail: the last two u limbs (u0 already loaded, u1 loaded here).  No further
C rp limbs are read: the limb just above the n input limbs starts from zero,
C and the topmost partial product is accumulated into cy1 and cy2 directly.
C The adds/adcs/adc chain then folds the three outstanding carries into the
C top two stored limbs and the returned limb.
174L(end):	umaal	w1, cy0, u0, v0
175	ldr	u1, [up, #0]		C last u limb
176	umaal	w2, cy1, u0, v1
177	str	w1, [rp, #-12]
178	mov	w0, #0			C no rp limb is read at this position
179	umaal	w0, cy2, u0, v2
180	umaal	w2, cy0, u1, v0
181	umaal	w0, cy1, u1, v1
182	str	w2, [rp, #-8]
183	umaal	cy1, cy2, u1, v2	C cy2:cy1 = u1 * v2 + cy1 + cy2
184	adds	w0, w0, cy0
185	str	w0, [rp, #-4]
186	adcs	w1, cy1, #0
187	str	w1, [rp, #0]
188	adc	r0, cy2, #0		C return value; r0 was rp, which is no longer needed
189
190	pop	{ r4-r11, pc }		C restore r4-r11 and return via the saved r14
191EPILOGUE()
192