dnl  xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/powerpc64/mode64/aorsmul_1.asm (revision 4ac76180e904e771b9d522c7e57296d371f06499)
dnl  PowerPC-64 mpn_addmul_1 and mpn_submul_1.

dnl  Copyright 1999-2001, 2003-2006, 2010-2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C                   mpn_addmul_1    mpn_submul_1
C                   cycles/limb     cycles/limb
C POWER3/PPC630		6-18		6-18
C POWER4/PPC970		 8		 8.3
C POWER5		 8		 8.25
C POWER6		16.25		16.75
C POWER7		 3.77		 4.9

C TODO
C  * Try to reduce the number of needed live registers
C  * Add support for _1c entry points

C INPUT PARAMETERS
C Symbolic names for the argument registers.  The routine below uses rp, up
C and n through these names but spells the multiplier as r6 directly rather
C than as vl.
define(`rp', `r3')
define(`up', `r4')
define(`n',  `r5')
define(`vl', `r6')

C Operation selection.  Each ifdef below defines the same five macros for one
C of the two operations:
C   ADDSUB   - carry-producing op that starts a chain (addc or subfc)
C   ADDSUBC  - carry-consuming and carry-producing op that continues a chain
C              (adde or subfe)
C   func     - name of the generated entry point
C   func_nc  - name reserved for a carry-in variant; no such entry point is
C              emitted by this file
C   SM       - wraps instructions needed only for submul: expands to its
C              argument for submul and to nothing for addmul
C If neither OPERATION_addmul_1 nor OPERATION_submul_1 is defined, none of
C these macros gets a definition here.
ifdef(`OPERATION_addmul_1',`
  define(ADDSUBC,	adde)
  define(ADDSUB,	addc)
  define(func,		mpn_addmul_1)
  define(func_nc,	mpn_addmul_1c)	C FIXME: not really supported
  define(SM,		`')
')
ifdef(`OPERATION_submul_1',`
  define(ADDSUBC,	subfe)
  define(ADDSUB,	subfc)
  define(func,		mpn_submul_1)
  define(func_nc,	mpn_submul_1c)	C FIXME: not really supported
  define(SM,		`$1')
')

MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)

ASM_START()
C -----------------------------------------------------------------------------
C func (mpn_addmul_1 or mpn_submul_1, depending on the OPERATION_* symbol)
C
C For each of the n limbs: multiply the limb read from up by the multiplier
C in r6, then add the product to (addmul) or subtract it from (submul) the
C limb read from rp, and store the low 64 bits back to rp.  The high product
C limb plus the final carry or borrow is returned in r3.
C
C In:   r3 = rp   destination pointer, read and written
C       r4 = up   source pointer, read only
C       r5 = n    limb count; must be at least 1 (n = 0 would dispatch to
C                 L(b00) and process limbs that do not exist)
C       r6        multiplier limb
C Out:  r3        carry-out (addmul) or borrow-out (submul) limb
C
C Register roles inside the routine:
C   ctr           (n + 3) / 4, counts the remaining passes through L(bot)
C   r9, r27       source limbs preloaded for the next pair of products
C   r28-r31       destination limbs
C   r0, r7, r5, r11   low product halves, then the results to store
C   r5, r8, r10, r12  high product halves; r12 carries the pending high limb
C                     from one group of limbs into the next
C   r11           submul only: scratch used to turn CA into a borrow flag
C   CA            carry (addmul) or borrow (submul) pending between groups
C r5 is reused as a product register once n has been copied into ctr.
C
C Carry protocol: every entry path arrives at L(bot), or L(top)/L(end) from
C L(b10), with r12 = pending high limb and CA = pending carry.  For submul the
C subtract chain leaves CA = 1 for "no borrow", so each chain is followed by
C   subfe r11, r11, r11   (r11 = 0 if no borrow, -1 if borrow)
C   addic r11, r11, 1     (CA  = 1 exactly when there was a borrow)
C which lets the next adde/addze add the borrow into the product chain.
C
C r27-r31 are saved at negative offsets from r1 without moving r1.
C NOTE(review): this presumably relies on the ABI letting a leaf routine use
C that area below the stack pointer; confirm against the target ABI.
C -----------------------------------------------------------------------------
PROLOGUE(func)
C Register saves are interleaved with the dispatch arithmetic.
	std	r31, -8(r1)
	rldicl.	r0, n, 0,62	C r0 = n & 3, set cr0
	std	r30, -16(r1)
	cmpdi	cr6, r0, 2	C cr6 = (n & 3) compared with 2
	std	r29, -24(r1)
	addi	n, n, 3		C compute count...
	std	r28, -32(r1)
	srdi	n, n, 2		C ...for ctr
	std	r27, -40(r1)
	mtctr	n		C copy count into ctr
	beq	cr0, L(b00)	C n mod 4 = 0
	blt	cr6, L(b01)	C n mod 4 = 1
	beq	cr6, L(b10)	C n mod 4 = 2
				C otherwise n mod 4 = 3, fall through

C n mod 4 = 3: handle one limb here, preload the next two source limbs and
C join the loop at L(bot).
L(b11):	ld	r9, 0(up)
	ld	r28, 0(rp)
	mulld	r0, r9, r6
	mulhdu	r12, r9, r6	C high half is the pending high limb
	ADDSUB	r0, r0, r28	C starts the chain, sets CA
	std	r0, 0(rp)
	addi	rp, rp, 8
	ld	r9, 8(up)
	ld	r27, 16(up)
	addi	up, up, 24
C submul: stage the borrow in r11; the addic at L(bot) completes it.
SM(`	subfe	r11, r11, r11 ')
	b	L(bot)

C n mod 4 = 0: handle two limbs here, preload the next two source limbs and
C join the loop at L(bot).
	ALIGN(16)
L(b00):	ld	r9, 0(up)
	ld	r27, 8(up)
	ld	r28, 0(rp)
	ld	r29, 8(rp)
	mulld	r0, r9, r6
	mulhdu	r5, r9, r6
	mulld	r7, r27, r6
	mulhdu	r8, r27, r6
	addc	r7, r7, r5	C low of 2nd product + high of 1st
	addze	r12, r8		C pending high limb = high of 2nd + carry
	ADDSUB	r0, r0, r28
	std	r0, 0(rp)
	ADDSUBC	r7, r7, r29
	std	r7, 8(rp)
	addi	rp, rp, 16
	ld	r9, 16(up)
	ld	r27, 24(up)
	addi	up, up, 32
SM(`	subfe	r11, r11, r11 ')
	b	L(bot)

C n mod 4 = 1.  The bdnz decrements ctr: if it reaches zero then n = 1 and the
C single limb is handled right here.  r27-r31 are not written on this path,
C so the routine returns without reloading them.
	ALIGN(16)
L(b01):	bdnz	L(gt1)
	ld	r9, 0(up)
	ld	r11, 0(rp)
	mulld	r0, r9, r6
	mulhdu	r8, r9, r6
	ADDSUB	r0, r0, r11
	std	r0, 0(rp)
SM(`	subfe	r11, r11, r11 ')
SM(`	addic	r11, r11, 1 ')
	addze	r3, r8		C return high limb + carry/borrow
	blr
C n mod 4 = 1 with n > 1: handle three limbs here, preload the next two source
C limbs and join the loop at L(bot).
L(gt1):	ld	r9, 0(up)
	ld	r27, 8(up)
	mulld	r0, r9, r6
	mulhdu	r5, r9, r6
	mulld	r7, r27, r6
	mulhdu	r8, r27, r6
	ld	r9, 16(up)
	ld	r28, 0(rp)
	ld	r29, 8(rp)
	ld	r30, 16(rp)
	mulld	r11, r9, r6
	mulhdu	r10, r9, r6
	addc	r7, r7, r5
	adde	r11, r11, r8
	addze	r12, r10	C pending high limb
	ADDSUB	r0, r0, r28
	std	r0, 0(rp)
	ADDSUBC	r7, r7, r29
	std	r7, 8(rp)
	ADDSUBC	r11, r11, r30
	std	r11, 16(rp)
	addi	rp, rp, 24
	ld	r9, 24(up)
	ld	r27, 32(up)
	addi	up, up, 40
SM(`	subfe	r11, r11, r11 ')
	b	L(bot)

C n mod 4 = 2: nothing is processed up front.  Clear CA and the pending high
C limb, preload two source limbs, then either go straight to the two-limb
C tail (n = 2) or fall into the loop.
L(b10):	addic	r0, r0, 0	C adding 0 cannot carry: CA = 0
	li	r12, 0		C cy_limb = 0
	ld	r9, 0(up)
	ld	r27, 8(up)
	bdz	L(end)
	addi	up, up, 16

C Main loop, four limbs per pass.  On entry r9 and r27 hold the first two
C source limbs of the group and up already points past them; the loop loads
C the other two and then preloads r9 and r27 for the next group or the tail.
C The trailing "C <regs>" notes list the registers each instruction uses.
	ALIGN(16)
L(top):	mulld	r0, r9, r6
	mulhdu	r5, r9, r6	C 9
	mulld	r7, r27, r6
	mulhdu	r8, r27, r6	C 27
	ld	r9, 0(up)
	ld	r28, 0(rp)
	ld	r27, 8(up)
	ld	r29, 8(rp)
	adde	r0, r0, r12	C 0 12
	adde	r7, r7, r5	C 5 7
	mulld	r5, r9, r6
	mulhdu	r10, r9, r6	C 9
	mulld	r11, r27, r6
	mulhdu	r12, r27, r6	C 27
	ld	r9, 16(up)
	ld	r30, 16(rp)
	ld	r27, 24(up)
	ld	r31, 24(rp)
	adde	r5, r5, r8	C 8 5
	adde	r11, r11, r10	C 10 11
	addze	r12, r12	C 12
	ADDSUB	r0, r0, r28	C 0 28
	std	r0, 0(rp)	C 0
	ADDSUBC	r7, r7, r29	C 7 29
	std	r7, 8(rp)	C 7
	ADDSUBC	r5, r5, r30	C 5 30
	std	r5, 16(rp)	C 5
	ADDSUBC	r11, r11, r31	C 11 31
	std	r11, 24(rp)	C 11
	addi	up, up, 32
SM(`	subfe	r11, r11, r11 ')
	addi	rp, rp, 32
C Common join point for all entry paths except L(b10).  For submul the addic
C finishes converting the staged r11 into CA = borrow.
L(bot):
SM(`	addic	r11, r11, 1 ')
	bdnz	L(top)

C Tail: the last two limbs, already preloaded in r9 and r27.
L(end):	mulld	r0, r9, r6
	mulhdu	r5, r9, r6
	mulld	r7, r27, r6
	mulhdu	r8, r27, r6
	ld	r28, 0(rp)
	ld	r29, 8(rp)
	adde	r0, r0, r12
	adde	r7, r7, r5
	addze	r8, r8
	ADDSUB	r0, r0, r28
	std	r0, 0(rp)
	ADDSUBC	r7, r7, r29
	std	r7, 8(rp)
SM(`	subfe	r11, r11, r11 ')
SM(`	addic	r11, r11, 1 ')
	addze	r3, r8		C return high limb + carry/borrow
C Reload the registers saved on entry.
	ld	r31, -8(r1)
	ld	r30, -16(r1)
	ld	r29, -24(r1)
	ld	r28, -32(r1)
	ld	r27, -40(r1)
	blr
EPILOGUE()