dnl  PowerPC-64 mpn_divrem_1 -- Divide an mpn number by an unnormalized limb.

dnl  Copyright 2003, 2004, 2005, 2007, 2008 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C			    cycles/limb
C			norm	unorm	frac
C POWER3/PPC630		16-34	16-34	~11
C POWER4/PPC970		 29		 19
C POWER5		 29	 29	~20

C INPUT PARAMETERS
C qp  = r3
C fn  = r4
C up  = r5
C un  = r6
C d   = r7

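C In C terms this is the GMP entry point (prototype as in gmp.h)
C
C   mp_limb_t mpn_divrem_1 (mp_ptr qp, mp_size_t fn,
C                           mp_srcptr up, mp_size_t un, mp_limb_t d);
C
C i.e. divide {up,un} by d, store the un integer quotient limbs at qp+fn,
C develop fn further fraction limbs at qp, and return the remainder.
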
C The branch used in the frac code is not very predictable, so the cycle
C count wobbles somewhat.  With the alternative branch-free code, things
C run considerably slower on POWER4/PPC970 and POWER5.

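C Both variants are kept below, selected by ifelse(0,1,...) in the two
C fraction loops.  The branchy form currently enabled is, roughly, in C:
C
C   if (r >= ql)          /* unsigned; ql is the low product limb */
C     { r += d; q -= 1; }
C
C while the disabled alternative applies the same adjustment with a mask.
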
C TODO: Add a preinv entry point.


ASM_START()

EXTERN_FUNC(mpn_invert_limb)

PROLOGUE(mpn_divrem_1)

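C Save the non-volatile registers we use (r25-r31) plus LR and CR, copy
C the arguments into non-volatile registers, set the default return value
C (the remainder, r31/r3) to 0, and allocate a stack frame.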
	mfcr	r12
	add.	r10, r6, r4
	std	r25, -56(r1)
	mr	r25, r4
	mflr	r0
	std	r26, -48(r1)
	mr	r26, r5
	std	r28, -32(r1)
	mr	r28, r6
	std	r29, -24(r1)
	mr	r29, r3
	li	r3, 0
	std	r30, -16(r1)
	mr	r30, r7
	std	r31, -8(r1)
	li	r31, 0
	std	r27, -40(r1)
	std	r0, 16(r1)
	stw	r12, 8(r1)
	stdu	r1, -176(r1)
	beq-	cr0, L(1)
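C un+fn == 0 was handled above (return 0).  Point r29 at the most
C significant quotient limb, then dispatch: d with its high bit set goes
C to the normalized code at L(162); un == 0 skips straight to L(71).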
	cmpdi	cr7, r7, 0
	sldi	r0, r10, 3
	add	r11, r0, r29
	addi	r29, r11, -8
	blt-	cr7, L(162)
	cmpdi	cr4, r6, 0
	beq+	cr4, L(71)
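C Unnormalized d, un > 0: if the most significant dividend limb is already
C smaller than d it becomes the initial remainder and the corresponding
C quotient limb is simply 0.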
L(163):
	sldi	r9, r6, 3
	add	r9, r9, r5
	ld	r7, -8(r9)
	cmpld	cr7, r7, r30
	bge-	cr7, L(71)
	cmpdi	cr7, r10, 1
	li	r0, 0
	mr	r31, r7
	std	r0, -8(r11)
	addi	r29, r29, -8
	mr	r3, r7
	beq-	cr7, L(1)
	addi	r28, r6, -1
	cmpdi	cr4, r28, 0
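C Shift d (and the partial remainder) left by the number of leading zeros
C of d, then get a reciprocal of the normalized d from mpn_invert_limb
C (returned in r3).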
L(71):
	cntlzd	r27, r30
	sld	r30, r30, r27
	sld	r31, r31, r27
	mr	r3, r30
	CALL(	mpn_invert_limb)
	nop
	beq-	cr4, L(110)
	sldi	r9, r28, 3
	addic.	r6, r28, -2
	add	r9, r9, r26
	subfic	r5, r27, 64
	ld	r8, -8(r9)
	srd	r0, r8, r5
	or	r31, r31, r0
	sld	r7, r8, r27
	blt-	cr0, L(154)
	addi	r28, r28, -1
	mtctr	r28
	sldi	r6, r6, 3
	ALIGN(16)
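C Main quotient loop, unnormalized case.  Each iteration shifts the next
C dividend limb into place, forms a quotient estimate from the reciprocal,
C computes r = nl - qh*d, and applies the adjustment steps (the rare
C second adjustment is out of line at L(164)).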
L(uloop):
	addi	r11, r31, 1
	ldx	r8, r26, r6
	mulld	r0, r31, r3
	mulhdu	r10, r31, r3
	addi	r6, r6, -8
	srd	r9, r8, r5
	or	r9, r7, r9
	addc	r0, r0, r9
	adde	r10, r10, r11
	mulld	r31, r10, r30
	subf	r31, r31, r9
	subfc	r0, r0, r31	C r >= ql
	subfe	r0, r0, r0	C r0 = -(r >= ql)
	not	r7, r0
	add	r10, r7, r10	C qh -= (r >= ql)
	andc	r0, r30, r0
	add	r31, r31, r0
	cmpld	cr7, r31, r30
	bge-	cr7, L(164)
L(123):
	std	r10, 0(r29)
	addi	r29, r29, -8
	sld	r7, r8, r27
	bdnz	L(uloop)
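C Last step of the integer part: process the low bits of the least
C significant dividend limb, which are already shifted into r7.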
L(154):
	addi	r11, r31, 1
	nop
	mulld	r0, r31, r3
	mulhdu	r8, r31, r3
	addc	r0, r0, r7
	adde	r8, r8, r11
	mulld	r31, r8, r30
	subf	r31, r31, r7
	subfc	r0, r0, r31	C r >= ql
	subfe	r0, r0, r0	C r0 = -(r >= ql)
	not	r7, r0
	add	r8, r7, r8	C qh -= (r >= ql)
	andc	r0, r30, r0
	add	r31, r31, r0
	cmpld	cr7, r31, r30
	bge-	cr7, L(165)
L(134):
	std	r8, 0(r29)
	addi	r29, r29, -8
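C Develop the fn fraction limbs: each step divides (r, 0) by d using the
C reciprocal; r9 is set to -d so that q*d can be subtracted from a zero
C limb with a single mulld.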
L(110):
	addic.	r0, r25, -1
	blt-	cr0, L(156)
	mtctr	r25
	neg	r9, r30
	ALIGN(16)
L(ufloop):
	addi	r11, r31, 1
	nop
	mulld	r7, r3, r31
	mulhdu	r10, r3, r31
	add	r10, r10, r11
	mulld	r31, r9, r10
ifelse(0,1,`
	subfc	r0, r7, r31
	subfe	r0, r0, r0	C r0 = -(r >= ql)
	not	r7, r0
	add	r10, r7, r10	C qh -= (r >= ql)
	andc	r0, r30, r0
	add	r31, r31, r0
',`
	cmpld	cr7, r31, r7
	blt	cr7, L(29)
	add	r31, r30, r31
	addi	r10, r10, -1
L(29):
')
	std	r10, 0(r29)
	addi	r29, r29, -8
	bdnz	L(ufloop)
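C Shift the remainder back down by the normalization count before
C returning it.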
L(156):
	srd	r3, r31, r27
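C Common exit: release the stack frame, restore LR, CR and the saved
C registers, and return with the remainder in r3.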
L(1):
	addi	r1, r1, 176
	ld	r0, 16(r1)
	lwz	r12, 8(r1)
	mtlr	r0
	ld	r25, -56(r1)
	ld	r26, -48(r1)
	mtcrf	8, r12
	ld	r27, -40(r1)
	ld	r28, -32(r1)
	ld	r29, -24(r1)
	ld	r30, -16(r1)
	ld	r31, -8(r1)
	blr
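C d is already normalized.  For un > 0 the most significant quotient limb
C is (up[un-1] >= d), and d is conditionally subtracted from that limb to
C form the initial remainder.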
L(162):
	cmpdi	cr7, r6, 0
	beq-	cr7, L(8)
	sldi	r9, r6, 3
	addi	r29, r29, -8
	add	r9, r9, r5
	addi	r28, r6, -1
	ld	r31, -8(r9)
	subfc	r9, r7, r31
	li	r9, 0
	adde	r9, r9, r9
	neg	r0, r9
	std	r9, -8(r11)
	and	r0, r0, r7
	subf	r31, r0, r31
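C Get a reciprocal of d (no shifting needed here) and set up the
C normalized main loop.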
L(8):
L(10):
	mr	r3, r30
	CALL(	mpn_invert_limb)
	nop
	addic.	r6, r28, -1
	blt-	cr0, L(150)
	mtctr	r28
	sldi	r6, r6, 3
	ALIGN(16)
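C Main quotient loop, normalized case: the same per-limb step as L(uloop)
C but with the dividend limbs used unshifted.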
L(nloop):
	addi	r11, r31, 1
	ldx	r8, r26, r6
	mulld	r0, r31, r3
	addi	r6, r6, -8
	mulhdu	r10, r31, r3
	addc	r7, r0, r8
	adde	r10, r10, r11
	mulld	r31, r10, r30
	subf	r31, r31, r8	C r = nl - qh * d
	subfc	r0, r7, r31	C r >= ql
	subfe	r0, r0, r0	C r0 = -(r >= ql)
	not	r7, r0
	add	r10, r7, r10	C qh -= (r >= ql)
	andc	r0, r30, r0
	add	r31, r31, r0
	cmpld	cr7, r31, r30
	bge-	cr7, L(167)
L(51):
	std	r10, 0(r29)
	addi	r29, r29, -8
	bdnz	L(nloop)

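C Develop the fn fraction limbs for the normalized case, as in L(110)
C and L(ufloop) above.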
L(150):
	addic.	r9, r25, -1
	blt-	cr0, L(152)
	mtctr	r25
	neg	r9, r30
	ALIGN(16)
L(nfloop):
	addi	r11, r31, 1
	nop
	mulld	r7, r3, r31
	mulhdu	r10, r3, r31
	add	r10, r10, r11
	mulld	r31, r9, r10
ifelse(0,1,`
	subfc	r0, r7, r31
	subfe	r0, r0, r0	C r0 = -(r >= ql)
	not	r7, r0
	add	r10, r7, r10	C qh -= (r >= ql)
	andc	r0, r30, r0
	add	r31, r31, r0
',`
	cmpld	cr7, r31, r7
	blt	cr7, L(28)
	add	r31, r30, r31
	addi	r10, r10, -1
L(28):
')
	std	r10, 0(r29)
	addi	r29, r29, -8
	bdnz	L(nfloop)
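C Exit for the normalized path: the remainder needs no unshifting here.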
L(152):
	addi	r1, r1, 176
	mr	r3, r31
	ld	r0, 16(r1)
	lwz	r12, 8(r1)
	mtlr	r0
	ld	r25, -56(r1)
	ld	r26, -48(r1)
	mtcrf	8, r12
	ld	r27, -40(r1)
	ld	r28, -32(r1)
	ld	r29, -24(r1)
	ld	r30, -16(r1)
	ld	r31, -8(r1)
	blr
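C Out-of-line second adjustment for the rare case r >= d after the first
C fixup: subtract d once more and increment the quotient limb.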
L(164):
	subf	r31, r30, r31
	addi	r10, r10, 1
	b	L(123)
L(167):
	subf	r31, r30, r31
	addi	r10, r10, 1
	b	L(51)
L(165):
	subf	r31, r30, r31
	addi	r8, r8, 1
	b	L(134)
EPILOGUE()
