xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/arm/v6/addmul_2.asm (revision 6a493d6bc668897c91594964a732d38505b70cbb)
1dnl  ARM mpn_addmul_2.
2
3dnl  Contributed to the GNU project by Torbjorn Granlund.
4
5dnl  Copyright 2012 Free Software Foundation, Inc.
6
7dnl  This file is part of the GNU MP Library.
8
9dnl  The GNU MP Library is free software; you can redistribute it and/or modify
10dnl  it under the terms of the GNU Lesser General Public License as published
11dnl  by the Free Software Foundation; either version 3 of the License, or (at
12dnl  your option) any later version.
13
14dnl  The GNU MP Library is distributed in the hope that it will be useful, but
15dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
16dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
17dnl  License for more details.
18
19dnl  You should have received a copy of the GNU Lesser General Public License
20dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
21
22include(`../config.m4')
23
24C	     cycles/limb
25C StrongARM:	 -
26C XScale	 -
27C Cortex-A8	 ?
28C Cortex-A9	 2.38
29C Cortex-A15	 2.5
30
31C TODO
32C  * Consider using more registers for the r[] loads, allowing better load-use
33C    scheduling for a 6% speedup (on A9).  Free: r10, r11, r14
34
C Register aliases used by mpn_addmul_2.
C
C Operands as the routine reads them on entry (r0-r3 are used without
C being initialised by this file):
35define(`rp',`r0')
36define(`up',`r1')
37define(`n', `r2')
38define(`vp',`r3')
39
C v0/v1 receive the two words loaded from vp.  u0/u1 take turns holding the
C word most recently loaded from up.  Note that u0 is r3, the same register
C as vp: the routine loads v0/v1 through vp first and only afterwards
C writes u0, so vp is not available once the limb loop has started.
40define(`v0',`r6')
41define(`v1',`r7')
42define(`u0',`r3')
43define(`u1',`r9')
44
C cya and cyb are both zeroed at function entry.  cya is the second
C (high-half) operand of every umaal that multiplies by v0, cyb of every
C umaal that multiplies by v1.
45define(`cya',`r8')
46define(`cyb',`r12')
47
48
C mpn_addmul_2 -- multiply the n-word operand at up by the two words at vp
C and add the product into the n words at rp.
C
C In:   rp (r0), up (r1), n (r2), vp (r3); see the register defines above.
C Out:  the low n+1 words of {rp,n} + {up,n} * {vp,2} are stored to
C       rp[0..n] (rp[n] is written but never read); the most significant
C       word is returned in r0.
C Regs: r4-r9 are saved and restored here.  r0-r3, r12 and the flags are
C       modified.  r14 is only used for the return.
C
C umaal lo, hi, a, b computes the 64-bit value a*b + lo + hi and writes
C the low half to lo and the high half to hi.  Two such chains run in
C parallel: one multiplying by v0 with carry word cya, one multiplying by
C v1 with carry word cyb.
C
C NOTE(review): the smallest n that the dispatch below handles as intended
C appears to be 2.  With n = 1 the L(fi1) path runs three limb steps in the
C loop body plus the two in L(end).  Presumably callers guarantee n >= 2 --
C confirm against the callers.
49ASM_START()
50PROLOGUE(mpn_addmul_2)
C Save the registers used for the accumulators (r4, r5), v0/v1 (r6, r7),
C cya (r8) and u1 (r9).
51	push	{ r4, r5, r6, r7, r8, r9 }
52
C v0 = vp[0], v1 = vp[1].  After this load r3 is free to be reused as u0.
C Both carry words start at zero.
53	ldm	vp, { v0, v1 }
54	mov	cya, #0
55	mov	cyb, #0
56
C Dispatch on the two low bits of n to choose the entry point into the
C 4-way unrolled loop.  Bit 0 clear (even n) goes to L(evn).
57	tst	n, #1
58	beq	L(evn)
C Odd n: preload r5 = rp[0], u0 = up[0], r4 = rp[1], then test bit 1 of n.
59L(odd):	ldr	r5, [rp, #0]
60	ldr	u0, [up, #0]
61	ldr	r4, [rp, #4]
62	tst	n, #2
63	beq	L(fi1)
C n mod 4 == 3: bias up and rp so that the fixed offsets used at L(lo3)
C address up[1], rp[0] and rp[2].
64L(fi3):	sub	up, up, #12
65	sub	rp, rp, #16
66	b	L(lo3)
C n mod 4 == 1: reduce n by one for the loop-bottom test and bias up and rp
C so that the offsets used at L(lo1) address up[1], rp[0] and rp[2].
67L(fi1):	sub	n, n, #1
68	sub	up, up, #4
69	sub	rp, rp, #8
70	b	L(lo1)
C Even n: preload r4 = rp[0], u1 = up[0], r5 = rp[1] (r4/r5 and u0/u1 have
C the opposite roles from the odd case), then test bit 1 of n.
71L(evn):	ldr	r4, [rp, #0]
72	ldr	u1, [up, #0]
73	ldr	r5, [rp, #4]
74	tst	n, #2
75	bne	L(fi2)
C n mod 4 == 0: bias up and rp so that the offsets used at L(lo0) address
C up[1], rp[0] and rp[2].
76L(fi0):	sub	up, up, #8
77	sub	rp, rp, #12
78	b	L(lo0)
C n mod 4 == 2: take 2 off n; the flags from subs survive the following
C sub (which does not set flags).  If the result is zero or borrowed
C (n <= 2) only the wind-down code is needed, otherwise fall into L(top).
79L(fi2):	subs	n, n, #2
80	sub	rp, rp, #4
81	bls	L(end)
82
C Main loop, four limb steps per iteration.  Each step, for the current
C word u of up, does
C     lo:cya = lo + cya + u*v0    and stores lo as a finished word of rp,
C     hi:cyb = hi + cyb + u*v1    and keeps hi in its register, where it
C                                 is the lo accumulator of the next step.
C r4 and r5 swap the lo/hi roles from step to step and u0/u1 alternate.
C The load of the next word of up and of the rp word two positions ahead
C are interleaved with the multiplies.
83	ALIGN(16)
C Step 1 (entry for n mod 4 == 2): u = u1, lo = r4, hi = r5.
84L(top):	ldr	u0, [up, #4]
85	umaal	r4, cya, u1, v0
86	str	r4, [rp, #4]
87	ldr	r4, [rp, #12]
88	umaal	r5, cyb, u1, v1
C Step 2 (entry for n mod 4 == 1): u = u0, lo = r5, hi = r4.
89L(lo1):	ldr	u1, [up, #8]
90	umaal	r5, cya, u0, v0
91	str	r5, [rp, #8]
92	ldr	r5, [rp, #16]
93	umaal	r4, cyb, u0, v1
C Step 3 (entry for n mod 4 == 0): u = u1, lo = r4, hi = r5.
94L(lo0):	ldr	u0, [up, #12]
95	umaal	r4, cya, u1, v0
96	str	r4, [rp, #12]
97	ldr	r4, [rp, #20]
98	umaal	r5, cyb, u1, v1
C Step 4 (entry for n mod 4 == 3): u = u0, lo = r5, hi = r4.  The load
C from up and the store to rp use pre-indexed writeback, which advances
C both pointers by 16 bytes (four words) per iteration.
99L(lo3):	ldr	u1, [up, #16]!
100	umaal	r5, cya, u0, v0
101	str	r5, [rp, #16]!
102	ldr	r5, [rp, #8]
103	umaal	r4, cyb, u0, v1
C Count down by four; loop again only while the result is nonzero with no
C borrow (unsigned higher).
104	subs	n, n, #4
105	bhi	L(top)
106
C Wind-down: the last two limb steps.  u1 is already loaded and u0 is
C fetched here.  No further rp words are loaded.  In the final step the v1
C product is accumulated into the carry words themselves,
C cyb:cya = cya + cyb + u0*v1, giving the two most significant words of
C the result.  The last three stores go to consecutive words of rp, with
C cya in the highest one.
107L(end):	umaal	r4, cya, u1, v0
108	ldr	u0, [up, #4]
109	umaal	r5, cyb, u1, v1
110	str	r4, [rp, #4]
111	umaal	r5, cya, u0, v0
112	umaal	cya, cyb, u0, v1
113	str	r5, [rp, #8]
114	str	cya, [rp, #12]
C Return value: the most significant word (cyb) in r0.
115	mov	r0, cyb
116
C Restore the saved registers and return to the address in r14.
117	pop	{ r4, r5, r6, r7, r8, r9 }
118	bx	r14
119EPILOGUE()
120