xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/powerpc64/mode64/aorsmul_1.asm (revision 49d8c9ecf4abd21261269266ef64939f71b3cd09)
1dnl  PowerPC-64 mpn_addmul_1 and mpn_submul_1.
2
3dnl  Copyright 1999, 2000, 2001, 2003, 2004, 2005, 2006, 2010, 2011, 2012
4dnl  Free Software Foundation, Inc.
5
6dnl  This file is part of the GNU MP Library.
7
8dnl  The GNU MP Library is free software; you can redistribute it and/or modify
9dnl  it under the terms of the GNU Lesser General Public License as published
10dnl  by the Free Software Foundation; either version 3 of the License, or (at
11dnl  your option) any later version.
12
13dnl  The GNU MP Library is distributed in the hope that it will be useful, but
14dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
16dnl  License for more details.
17
18dnl  You should have received a copy of the GNU Lesser General Public License
19dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
20
21include(`../config.m4')
22
23C               mpn_addmul_1    mpn_submul_1
24C               cycles/limb     cycles/limb
25C POWER3/PPC630   6-18             6-18
26C POWER4/PPC970    8                8.3
27C POWER5           8                8.25
28C POWER6          16.25            16.75
29C POWER7           3.77             4.9
30
31C TODO
32C  * Try to reduce the number of needed live registers
33C  * Add support for _1c entry points
34
35C INPUT PARAMETERS
C Symbolic names for the four argument registers.  Once the count has
C been copied to ctr the code reuses r5 as a scratch register, and it
C names the multiplier register as r6 directly rather than as vl.
36define(`rp', `r3')	C limbs loaded, combined with the product and stored back
37define(`up', `r4')	C source limbs, only ever loaded
38define(`n',  `r5')	C limb count
39define(`vl', `r6')	C multiplier limb
40
C Operation selection.  Each block below takes effect only when its
C OPERATION symbol is defined.
C NOTE(review): presumably exactly one of the two symbols is defined per
C assembly of this file - confirm against the build rules.
C   ADDSUB   carry-setting op used on the first limb of a chain
C            (addc for addmul, subfc for submul)
C   ADDSUBC  carry-consuming op used on the following limbs
C            (adde for addmul, subfe for submul)
C   func     name of the entry point being assembled
C   func_nc  carry-in variant name, defined but not referenced below
C   SM(x)    expands to x for submul and to nothing for addmul
41ifdef(`OPERATION_addmul_1',`
42  define(ADDSUBC,	adde)
43  define(ADDSUB,	addc)
44  define(func,		mpn_addmul_1)
45  define(func_nc,	mpn_addmul_1c)	C FIXME: not really supported
46  define(SM,		`')
47')
48ifdef(`OPERATION_submul_1',`
49  define(ADDSUBC,	subfe)
50  define(ADDSUB,	subfc)
51  define(func,		mpn_submul_1)
52  define(func_nc,	mpn_submul_1c)	C FIXME: not really supported
53  define(SM,		`$1')
54')
55
C NOTE(review): the line below presumably tells the build which entry
C points this one source file can provide - confirm.
56MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
57
58ASM_START()
C ---------------------------------------------------------------------
C func(rp, up, n, vl)
C
C Multiplies the n limbs at up by the single limb in r6 and adds the
C product to (addmul) or subtracts it from (submul) the n limbs at rp,
C storing the result back at rp.  Limbs are 64 bits wide (ld/std with
C 8-byte strides).  The value returned in r3 is the final high product
C limb plus the last carry or borrow.
C
C Structure: the entry code dispatches on n mod 4.
C   n mod 4 == 3   L(b11) handles 1 limb, then joins the loop at L(bot)
C   n mod 4 == 0   L(b00) handles 2 limbs, then joins at L(bot)
C   n mod 4 == 1   L(b01) finishes n == 1 on its own, otherwise
C                  L(gt1) handles 3 limbs and joins at L(bot)
C   n mod 4 == 2   L(b10) handles no limb and falls into L(top)
C L(top) handles 4 limbs per pass and L(end) always handles the last 2.
C ctr is loaded with (n + 3) / 4.
C
C Carry convention: every block leaves CA = carry (addmul) or
C CA = borrow (submul) for the adde that starts the next block.  The
C loads, stores, multiplies and addi in between do not touch CA.
C subfc/subfe leave CA = 1 when there is NO borrow, so submul builds
C invert it with the SM pair: subfe r11,r11,r11 gives r11 = CA - 1
C (0 or -1), and addic r11,r11,1 then sets CA exactly when r11 was -1.
C
C Register use:
C   r6             multiplier
C   r9, r27        up limbs preloaded for the next pair of multiplies
C   r12            carry limb handed from one block to the next
C   r28-r31        limbs loaded from rp
C   r0, r5, r7, r8, r10, r11   product limbs and scratch; in submul
C                  builds r11 is overwritten by the SM lines after the
C                  last store of a block
C   r27-r31        saved below r1 on entry, reloaded at L(end)
C
C There is no n == 0 path: n == 0 would branch to L(b00), which
C processes limbs unconditionally.
C NOTE(review): the register saves are made below r1 without moving r1;
C this presumably relies on the ABI protecting that area - confirm.
C ---------------------------------------------------------------------
59PROLOGUE(func)
60	std	r31, -8(r1)	C save r31 (interleaved with the count setup)
61	rldicl.	r0, n, 0,62	C r0 = n & 3, set cr0
62	std	r30, -16(r1)	C save r30
63	cmpdi	cr6, r0, 2	C cr6 = (n & 3) compared with 2
64	std	r29, -24(r1)	C save r29
65	addi	n, n, 3		C compute count...
66	std	r28, -32(r1)	C save r28
67	srdi	n, n, 2		C ...for ctr
68	std	r27, -40(r1)	C save r27
69	mtctr	n		C copy count into ctr
70	beq	cr0, L(b00)	C n mod 4 == 0
71	blt	cr6, L(b01)	C n mod 4 == 1 (below 2 and not 0)
72	beq	cr6, L(b10)	C n mod 4 == 2
73
C n mod 4 == 3: handle one limb, leaving its high half in r12 and the
C carry in CA, then preload the next two up limbs for L(top)/L(end).
74L(b11):	ld	r9, 0(up)
75	ld	r28, 0(rp)
76	mulld	r0, r9, r6	C r0 = low half of up[0] * r6
77	mulhdu	r12, r9, r6	C r12 = high half, becomes the carry limb
78	ADDSUB	r0, r0, r28	C combine with rp[0], start the carry chain
79	std	r0, 0(rp)
80	addi	rp, rp, 8
81	ld	r9, 8(up)	C preload next pair
82	ld	r27, 16(up)
83	addi	up, up, 24	C skip the limb done plus the two preloaded
C submul: capture CA in r11; the addic at L(bot) completes the inversion
84SM(`	subfe	r11, r11, r11 ')
85	b	L(bot)
86
87	ALIGN(16)
C n mod 4 == 0: handle two limbs.  The two products are chained into
C r0, r7 and the carry limb r12 before being combined with rp.
88L(b00):	ld	r9, 0(up)
89	ld	r27, 8(up)
90	ld	r28, 0(rp)
91	ld	r29, 8(rp)
92	mulld	r0, r9, r6
93	mulhdu	r5, r9, r6
94	mulld	r7, r27, r6
95	mulhdu	r8, r27, r6
96	addc	r7, r7, r5	C low1 + high0, starts a fresh carry chain
97	addze	r12, r8		C carry limb = high1 + carry
98	ADDSUB	r0, r0, r28	C combine with rp[0]
99	std	r0, 0(rp)
100	ADDSUBC	r7, r7, r29	C combine with rp[1], consuming carry
101	std	r7, 8(rp)
102	addi	rp, rp, 16
103	ld	r9, 16(up)	C preload next pair
104	ld	r27, 24(up)
105	addi	up, up, 32	C skip the two limbs done plus the two preloaded
C submul: capture CA in r11; the addic at L(bot) completes the inversion
106SM(`	subfe	r11, r11, r11 ')
107	b	L(bot)
108
109	ALIGN(16)
C n mod 4 == 1.  bdnz decrements ctr; a nonzero result means n > 1.
C Otherwise n == 1: handle the single limb and return at once.  This
C path writes none of r27-r31, so it returns without reloading them.
110L(b01):	bdnz	L(gt1)
111	ld	r9, 0(up)
112	ld	r11, 0(rp)
113	mulld	r0, r9, r6
114	mulhdu	r8, r9, r6
115	ADDSUB	r0, r0, r11	C combine with rp[0]
116	std	r0, 0(rp)
C submul: turn the borrow into CA for the addze below
117SM(`	subfe	r11, r11, r11 ')
118SM(`	addic	r11, r11, 1 ')
119	addze	r3, r8		C return high half + carry or borrow
120	blr
C n mod 4 == 1 and n > 1: handle three limbs, then join the loop.
121L(gt1):	ld	r9, 0(up)
122	ld	r27, 8(up)
123	mulld	r0, r9, r6
124	mulhdu	r5, r9, r6
125	mulld	r7, r27, r6
126	mulhdu	r8, r27, r6
127	ld	r9, 16(up)
128	ld	r28, 0(rp)
129	ld	r29, 8(rp)
130	ld	r30, 16(rp)
131	mulld	r11, r9, r6
132	mulhdu	r10, r9, r6
133	addc	r7, r7, r5	C low1 + high0, starts a fresh carry chain
134	adde	r11, r11, r8	C low2 + high1 + carry
135	addze	r12, r10	C carry limb = high2 + carry
136	ADDSUB	r0, r0, r28	C combine with rp[0]
137	std	r0, 0(rp)
138	ADDSUBC	r7, r7, r29	C combine with rp[1]
139	std	r7, 8(rp)
140	ADDSUBC	r11, r11, r30	C combine with rp[2]
141	std	r11, 16(rp)
142	addi	rp, rp, 24
143	ld	r9, 24(up)	C preload next pair
144	ld	r27, 32(up)
145	addi	up, up, 40	C skip the three limbs done plus the two preloaded
C submul: capture CA in r11; the addic at L(bot) completes the inversion
146SM(`	subfe	r11, r11, r11 ')
147	b	L(bot)
148
C n mod 4 == 2: nothing to handle up front.  Clear CA and the carry
C limb, preload the first pair, and go straight to L(end) when n == 2.
149L(b10):	addic	r0, r0, 0	C adding 0 cannot carry, so this clears CA
150	li	r12, 0		C cy_limb = 0
151	ld	r9, 0(up)
152	ld	r27, 8(up)
153	bdz	L(end)		C decrement ctr; zero means only two limbs
154	addi	up, up, 16
155
156	ALIGN(16)
C Main loop, four limbs per pass.  On entry r9 and r27 hold the next
C two up limbs, r12 the incoming carry limb and CA the incoming carry.
C The pass loads two more up limbs for its own use and preloads the
C two after those into r9 and r27 for the next pass or for L(end).
157L(top):	mulld	r0, r9, r6
158	mulhdu	r5, r9, r6	C 9
159	mulld	r7, r27, r6
160	mulhdu	r8, r27, r6	C 27
161	ld	r9, 0(up)
162	ld	r28, 0(rp)
163	ld	r27, 8(up)
164	ld	r29, 8(rp)
165	adde	r0, r0, r12	C 0 12
166	adde	r7, r7, r5	C 5 7
167	mulld	r5, r9, r6
168	mulhdu	r10, r9, r6	C 9
169	mulld	r11, r27, r6
170	mulhdu	r12, r27, r6	C 27
171	ld	r9, 16(up)
172	ld	r30, 16(rp)
173	ld	r27, 24(up)
174	ld	r31, 24(rp)
175	adde	r5, r5, r8	C 8 5
176	adde	r11, r11, r10	C 10 11
177	addze	r12, r12	C 12
178	ADDSUB	r0, r0, r28	C 0 28
179	std	r0, 0(rp)	C 0
180	ADDSUBC	r7, r7, r29	C 7 29
181	std	r7, 8(rp)	C 7
182	ADDSUBC	r5, r5, r30	C 5 30
183	std	r5, 16(rp)	C 5
184	ADDSUBC	r11, r11, r31	C 11 31
185	std	r11, 24(rp)	C 11
186	addi	up, up, 32
C submul: capture CA in r11 (r11 has already been stored above)
187SM(`	subfe	r11, r11, r11 ')
188	addi	rp, rp, 32
C Shared loop control.  The lead-in blocks jump here so that submul
C builds complete the borrow-to-CA inversion before the next adde.
189L(bot):
190SM(`	addic	r11, r11, 1 ')
191	bdnz	L(top)
192
C Wind-down: handle the last two limbs, already preloaded in r9 and
C r27, form the return value and reload the saved registers.
193L(end):	mulld	r0, r9, r6
194	mulhdu	r5, r9, r6
195	mulld	r7, r27, r6
196	mulhdu	r8, r27, r6
197	ld	r28, 0(rp)
198	ld	r29, 8(rp)
199	adde	r0, r0, r12	C low0 + incoming carry limb + CA
200	adde	r7, r7, r5	C low1 + high0 + carry
201	addze	r8, r8		C final high limb + carry
202	ADDSUB	r0, r0, r28	C combine with rp[0]
203	std	r0, 0(rp)
204	ADDSUBC	r7, r7, r29	C combine with rp[1]
205	std	r7, 8(rp)
C submul: turn the borrow into CA for the addze below
206SM(`	subfe	r11, r11, r11 ')
207SM(`	addic	r11, r11, 1 ')
208	addze	r3, r8		C return final high limb + carry or borrow
209	ld	r31, -8(r1)	C reload the registers saved on entry
210	ld	r30, -16(r1)
211	ld	r29, -24(r1)
212	ld	r28, -32(r1)
213	ld	r27, -40(r1)
214	blr
215EPILOGUE()
216