dnl  xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/powerpc64/mode64/divrem_1.asm (revision d909946ca08dceb44d7d0f22ec9488679695d976)
dnl  PowerPC-64 mpn_divrem_1 -- Divide an mpn number by an unnormalized limb.

dnl  Copyright 2003, 2004, 2005, 2007, 2008, 2010, 2012 Free Software
dnl  Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C                           cycles/limb
C                       norm    unorm   frac
C POWER3/PPC630         16-34   16-34   ~11   outdated figures
C POWER4/PPC970          28      28      19
C POWER5                 29      29     ~19
C POWER6                 49      59     ~42
C POWER7                 24.5    23     ~14

C INPUT PARAMETERS
C qp  = r3
C fn  = r4
C up  = r5
C un  = r6
C d   = r7

C We use a not very predictable branch in the frac code, therefore the cycle
C count wobbles somewhat.  With the alternative branch-free code, things run
C considerably slower on POWER4/PPC970 and POWER5.

C Add preinv entry point.


ASM_START()

EXTERN_FUNC(mpn_invert_limb)
C mpn_divrem_1(qp, fn, up, un, d):
C Divide {up,un} (plus fn implicit zero fraction limbs) by the single limb d,
C storing un+fn quotient limbs at qp (high to low) and returning the remainder
C (shifted back down) in r3.  Uses mpn_invert_limb to get a reciprocal of the
C normalized divisor, then estimates each quotient limb with mulld/mulhdu and
C corrects it with a conditional adjustment.
C
C Register usage after the prologue (visible in the code below):
C   r25  fn  (copy of r4)
C   r26  up  (copy of r5)
C   r28  un  (copy of r6), decremented as the integer-part loop count
C   r29  quotient store pointer; starts at qp + (un+fn)*8 - 8, moves downward
C   r30  d, left-shifted by cntlzd(d) once normalized
C   r31  running (normalized) remainder
C   r27  normalization shift count, cntlzd(d)
C   r3   reciprocal returned by mpn_invert_limb; finally the remainder result
C Callee-saved r25..r31 are spilled at negative offsets from the incoming r1
C before the stdu creates the 176-byte frame; LR and CR are saved in the
C caller's frame at 16(r1) and 8(r1).
PROLOGUE(mpn_divrem_1)

	mfcr	r12
	add.	r10, r6, r4		C r10 = un + fn, sets cr0
	std	r25, -56(r1)
	mr	r25, r4
	mflr	r0
	std	r26, -48(r1)
	mr	r26, r5
	std	r28, -32(r1)
	mr	r28, r6
	std	r29, -24(r1)
	mr	r29, r3
	li	r3, 0			C default return value 0
	std	r30, -16(r1)
	mr	r30, r7
	std	r31, -8(r1)
	li	r31, 0			C remainder starts at 0
	std	r27, -40(r1)
	std	r0, 16(r1)		C save LR
	stw	r12, 8(r1)		C save CR
	stdu	r1, -176(r1)
	beq-	cr0, L(1)		C un + fn == 0: nothing to do
	cmpdi	cr7, r7, 0		C signed test: d's high bit set?
	sldi	r0, r10, 3
	add	r11, r0, r29
	addi	r29, r11, -8		C r29 = qp + (un+fn)*8 - 8
	blt-	cr7, L(162)		C d already normalized -> norm path
	cmpdi	cr4, r6, 0
	beq+	cr4, L(71)		C un == 0: straight to invert
L(163):
C Unnormalized path: if the top limb is already < d it becomes the initial
C remainder and yields a zero quotient limb.
	sldi	r9, r6, 3
	add	r9, r9, r5
	ld	r7, -8(r9)		C r7 = up[un-1]
	cmpld	cr7, r7, r30
	bge-	cr7, L(71)
	cmpdi	cr7, r10, 1
	li	r0, 0
	mr	r31, r7			C remainder = top limb
	std	r0, -8(r11)		C top quotient limb = 0
	addi	r29, r29, -8
	mr	r3, r7
	beq-	cr7, L(1)		C that was the only limb
	addi	r28, r6, -1
	cmpdi	cr4, r28, 0
L(71):
C Normalize d and the partial remainder, then get the reciprocal.
	cntlzd	r27, r30
	sld	r30, r30, r27
	sld	r31, r31, r27
	mr	r3, r30
	CALL(	mpn_invert_limb)
	nop
	beq-	cr4, L(110)		C no integer limbs left -> fraction part
	sldi	r9, r28, 3
	addic.	r6, r28, -2		C byte index of next lower limb, sets cr0
	add	r9, r9, r26
	subfic	r5, r27, 64		C r5 = 64 - shift
	ld	r8, -8(r9)		C r8 = up[un-1]
	srd	r0, r8, r5
	or	r31, r31, r0		C fold high bits into remainder
	sld	r7, r8, r27		C r7 = low bits, carried into next step
	blt-	cr0, L(154)		C only one limb -> final step
	addi	r28, r28, -1
	mtctr	r28
	sldi	r6, r6, 3
	ALIGN(16)
C Main unnormalized loop: each iteration forms the next normalized dividend
C limb from two source limbs, estimates the quotient limb via the reciprocal,
C and adjusts branch-free, with a rare extra fix-up at L(164).
L(uloop):
	ldx	r8, r26, r6
	nop
	mulld	r0, r31, r3
	mulhdu	r10, r31, r3
	addi	r11, r31, 1
	srd	r9, r8, r5
	addi	r6, r6, -8
	or	r9, r7, r9		C next normalized dividend limb
	addc	r0, r0, r9
	adde	r10, r10, r11		C r10 = candidate quotient limb qh
	mulld	r31, r10, r30
	subf	r31, r31, r9		C candidate remainder
	subfc	r0, r31, r0	C r <= ql
	subfe	r0, r0, r0	C r0 = -(r <= ql)
	and	r9, r30, r0
	add	r31, r31, r9		C conditionally add back d
	add	r10, r0, r10	C qh -= (r >= ql)
	cmpld	cr7, r31, r30
	bge-	cr7, L(164)		C rare second adjustment
L(123):
	std	r10, 0(r29)
	addi	r29, r29, -8
	sld	r7, r8, r27
	bdnz	L(uloop)
L(154):
C Final integer limb of the unnormalized path (no incoming low bits to merge).
	addi	r11, r31, 1
	nop
	mulld	r0, r31, r3
	mulhdu	r8, r31, r3
	addc	r0, r0, r7
	adde	r8, r8, r11
	mulld	r31, r8, r30
	subf	r31, r31, r7
	subfc	r0, r0, r31	C r >= ql
	subfe	r0, r0, r0	C r0 = -(r >= ql)
	not	r7, r0
	add	r8, r7, r8	C qh -= (r >= ql)
	andc	r0, r30, r0
	add	r31, r31, r0
	cmpld	cr7, r31, r30
	bge-	cr7, L(165)
L(134):
	std	r8, 0(r29)
	addi	r29, r29, -8
L(110):
C Fraction loop: fn further quotient limbs from a zero dividend tail.
	addic.	r0, r25, -1
	blt-	cr0, L(156)		C fn == 0
	mtctr	r25
	neg	r9, r30
	ALIGN(16)
L(ufloop):
	addi	r11, r31, 1
	nop
	mulld	r0, r3, r31
	mulhdu	r10, r3, r31
	add	r10, r10, r11
	mulld	r31, r9, r10
ifelse(0,1,`
	subfc	r0, r0, r31
	subfe	r0, r0, r0	C r0 = -(r >= ql)
	not	r7, r0
	add	r10, r7, r10	C qh -= (r >= ql)
	andc	r0, r30, r0
	add	r31, r31, r0
',`
C Branchy adjustment variant; see the cycle-count note at the top of the file.
	cmpld	cr7, r31, r0
	blt	cr7, L(29)
	add	r31, r30, r31
	addi	r10, r10, -1
L(29):
')
	std	r10, 0(r29)
	addi	r29, r29, -8
	bdnz	L(ufloop)
L(156):
	srd	r3, r31, r27		C un-normalize the remainder for return
L(1):
C Epilogue: tear down the frame, restore CR, LR and r25..r31.
	addi	r1, r1, 176
	ld	r0, 16(r1)
	lwz	r12, 8(r1)
	mtlr	r0
	ld	r25, -56(r1)
	ld	r26, -48(r1)
	mtcrf	8, r12
	ld	r27, -40(r1)
	ld	r28, -32(r1)
	ld	r29, -24(r1)
	ld	r30, -16(r1)
	ld	r31, -8(r1)
	blr
L(162):
C Normalized path: d's high bit is set, so no shifting is needed.  If the top
C limb is >= d, subtract d once and emit a leading quotient limb of 0 or 1.
	cmpdi	cr7, r6, 0
	beq-	cr7, L(8)
	sldi	r9, r6, 3
	addi	r29, r29, -8
	add	r9, r9, r5
	addi	r28, r6, -1
	ld	r31, -8(r9)		C r31 = up[un-1]
	subfc	r9, r7, r31
	li	r9, 0
	adde	r9, r9, r9		C r9 = (up[un-1] >= d)
	neg	r0, r9
	std	r9, -8(r11)		C top quotient limb
	and	r0, r0, r7
	subf	r31, r0, r31		C conditionally subtract d
L(8):
	mr	r3, r30
	CALL(	mpn_invert_limb)
	li	r27, 0			C shift count 0 on this path
	addic.	r6, r28, -1
	blt-	cr0, L(110)
	mtctr	r28
	sldi	r6, r6, 3
	ALIGN(16)
C Main normalized loop: same reciprocal-based estimate, dividend limbs used
C directly since no shifting is required.
L(nloop):
	addi	r11, r31, 1
	ldx	r8, r26, r6
	mulld	r0, r31, r3
	mulhdu	r10, r31, r3
	addi	r6, r6, -8
	addc	r0, r0, r8
	adde	r10, r10, r11
	mulld	r31, r10, r30
	subf	r31, r31, r8	C r = nl - qh * d
	subfc	r0, r31, r0	C r <= ql
	subfe	r0, r0, r0	C r0 = -(r <= ql)
	and	r9, r30, r0
	add	r31, r31, r9
	add	r10, r0, r10	C qh -= (r >= ql)
	cmpld	cr7, r31, r30
	bge-	cr7, L(167)
L(51):
	std	r10, 0(r29)
	addi	r29, r29, -8
	bdnz	L(nloop)
	b	L(110)

C Out-of-line second-adjustment blocks for the rare r >= d case: subtract d
C once more, bump the quotient limb, and resume the loop.
L(164):
	subf	r31, r30, r31
	addi	r10, r10, 1
	b	L(123)
L(167):
	subf	r31, r30, r31
	addi	r10, r10, 1
	b	L(51)
L(165):
	subf	r31, r30, r31
	addi	r8, r8, 1
	b	L(134)
EPILOGUE()
266