xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/powerpc64/mode64/divrem_1.asm (revision 4c3eb207d36f67d31994830c0a694161fc1ca39b)
dnl  PowerPC-64 mpn_divrem_1 -- Divide an mpn number by an unnormalized limb.

dnl  Copyright 2003-2005, 2007, 2008, 2010, 2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.
include(`../config.m4')

C                           cycles/limb
C                       norm    unorm   frac
C POWER3/PPC630         16-34   16-34   ~11   outdated figures
C POWER4/PPC970          28      28      19
C POWER5                 29      29     ~19
C POWER6                 49      59     ~42
C POWER7                 24.5    23     ~14

C INPUT PARAMETERS
C qp  = r3      quotient limb array, un+fn limbs are written
C fn  = r4      number of fraction quotient limbs to develop
C up  = r5      dividend limb array
C un  = r6      number of dividend limbs
C d   = r7      divisor limb (need not be normalized)

C We use a not very predictable branch in the frac code, therefore the cycle
C count wobbles somewhat.  With the alternative branch-free code, things run
C considerably slower on POWER4/PPC970 and POWER5.

C Add preinv entry point.


ASM_START()

EXTERN_FUNC(mpn_invert_limb)

PROLOGUE(mpn_divrem_1,toc)

C Register roles after the prologue:
C   r25 = fn            r26 = up           r28 = un (adjusted downwards)
C   r29 = store pointer into qp (quotient limbs are stored from the most
C         significant end downwards)
C   r30 = divisor d (shifted left by r27 on the unnorm path)
C   r31 = running (partial) remainder      r27 = normalization shift count
C   r3  = return value; after CALL(mpn_invert_limb) it holds the 2/1 limb
C         inverse of the normalized divisor
C cr4 records "un == 0" and must survive the mpn_invert_limb call; cr2-cr4
C are nonvolatile CR fields in the ELF ABI, and the caller's CR is saved
C in r12 / restored via mtcrf below.
	mfcr	r12
	add.	r10, r6, r4		C r10 = un + fn = total quotient limbs
	std	r25, -56(r1)
	mr	r25, r4
	mflr	r0
	std	r26, -48(r1)
	mr	r26, r5
	std	r28, -32(r1)
	mr	r28, r6
	std	r29, -24(r1)
	mr	r29, r3
	li	r3, 0			C default return value (remainder) = 0
	std	r30, -16(r1)
	mr	r30, r7
	std	r31, -8(r1)
	li	r31, 0			C partial remainder starts at 0
	std	r27, -40(r1)
	std	r0, 16(r1)		C save LR
	stw	r12, 8(r1)		C save CR
	stdu	r1, -176(r1)
	beq-	cr0, L(1)		C un + fn == 0: nothing to do
	cmpdi	cr7, r7, 0		C d's high bit set (already normalized)?
	sldi	r0, r10, 3
	add	r11, r0, r29		C r11 = qp + 8*(un+fn), one past the end
	addi	r29, r11, -8		C r29 = &qp[un+fn-1]
	blt-	cr7, L(162)		C take the normalized-divisor path
	cmpdi	cr4, r6, 0		C un == 0?  (kept in nonvolatile cr4)
	beq+	cr4, L(71)
L(163):
C Unnormalized divisor, un > 0: if the most significant dividend limb is
C already < d, the top quotient limb is 0 and that limb seeds the remainder.
	sldi	r9, r6, 3
	add	r9, r9, r5
	ld	r7, -8(r9)		C r7 = up[un-1]
	cmpld	cr7, r7, r30
	bge-	cr7, L(71)		C up[un-1] >= d: no shortcut
	cmpdi	cr7, r10, 1
	li	r0, 0
	mr	r31, r7			C remainder = up[un-1]
	std	r0, -8(r11)		C most significant quotient limb = 0
	addi	r29, r29, -8
	mr	r3, r7
	beq-	cr7, L(1)		C only one quotient limb: done
	addi	r28, r6, -1		C un--
	cmpdi	cr4, r28, 0		C re-record un == 0 in cr4
L(71):
C Normalize: shift d (and the partial remainder) left so d's high bit is
C set, then compute the limb inverse used for division by multiplication.
	cntlzd	r27, r30
	sld	r30, r30, r27
	sld	r31, r31, r27
	mr	r3, r30
	CALL(	mpn_invert_limb)
	beq-	cr4, L(110)		C no integer limbs left: fraction part
	sldi	r9, r28, 3
	addic.	r6, r28, -2
	add	r9, r9, r26
	subfic	r5, r27, 64		C r5 = 64 - shift count
	ld	r8, -8(r9)		C r8 = current top dividend limb
	srd	r0, r8, r5
	or	r31, r31, r0		C fold its high bits into the remainder
	sld	r7, r8, r27		C r7 = low bits, merged with next limb
	blt-	cr0, L(154)		C just one limb: skip the main loop
	addi	r28, r28, -1
	mtctr	r28			C loop count = un - 1
	sldi	r6, r6, 3		C byte offset of next dividend limb
	ALIGN(16)
C Main loop, unnormalized divisor: divide one (shifted) dividend limb per
C iteration.  qh is estimated as the high product of the remainder and the
C limb inverse, then the remainder is corrected by a conditional add-back
C of d (branch-free) plus a rare final adjustment at L(164).
L(uloop):
	ldx	r8, r26, r6
	nop
	mulld	r0, r31, r3
	mulhdu	r10, r31, r3
	addi	r11, r31, 1
	srd	r9, r8, r5
	addi	r6, r6, -8
	or	r9, r7, r9		C r9 = next shifted dividend limb
	addc	r0, r0, r9
	adde	r10, r10, r11		C r10 = candidate quotient limb qh
	mulld	r31, r10, r30
	subf	r31, r31, r9		C r31 = candidate remainder
	subfc	r0, r31, r0	C r <= ql
	subfe	r0, r0, r0	C r0 = -(r <= ql)
	and	r9, r30, r0
	add	r31, r31, r9		C conditionally add d back
	add	r10, r0, r10	C qh -= (r >= ql)
	cmpld	cr7, r31, r30
	bge-	cr7, L(164)		C rare: remainder still >= d
L(123):
	std	r10, 0(r29)		C store quotient limb, moving down
	addi	r29, r29, -8
	sld	r7, r8, r27
	bdnz	L(uloop)
C Wind-down: divide the final (lowest) shifted integer limb.
L(154):
	addi	r11, r31, 1
	nop
	mulld	r0, r31, r3
	mulhdu	r8, r31, r3
	addc	r0, r0, r7
	adde	r8, r8, r11
	mulld	r31, r8, r30
	subf	r31, r31, r7
	subfc	r0, r0, r31	C r >= ql
	subfe	r0, r0, r0	C r0 = -(r >= ql)
	not	r7, r0
	add	r8, r7, r8	C qh -= (r >= ql)
	andc	r0, r30, r0
	add	r31, r31, r0
	cmpld	cr7, r31, r30
	bge-	cr7, L(165)		C rare: remainder still >= d
L(134):
	std	r8, 0(r29)
	addi	r29, r29, -8
C Fraction part: develop fn further quotient limbs from the remainder alone
C (the incoming dividend limbs are implicitly zero here).
L(110):
	addic.	r0, r25, -1
	blt-	cr0, L(156)		C fn == 0: skip fraction loop
	mtctr	r25
	neg	r9, r30			C r9 = -d, for computing r31 = -qh*d mod B
	ALIGN(16)
L(ufloop):
	addi	r11, r31, 1
	nop
	mulld	r0, r3, r31
	mulhdu	r10, r3, r31
	add	r10, r10, r11		C r10 = candidate quotient limb qh
	mulld	r31, r9, r10		C r31 = new remainder (low limb of -qh*d)
C The ifelse selects between a branch-free adjustment (disabled) and the
C branchy variant actually used; see the cycle-count comment at the top.
ifelse(0,1,`
	subfc	r0, r0, r31
	subfe	r0, r0, r0	C r0 = -(r >= ql)
	not	r7, r0
	add	r10, r7, r10	C qh -= (r >= ql)
	andc	r0, r30, r0
	add	r31, r31, r0
',`
	cmpld	cr7, r31, r0
	blt	cr7, L(29)
	add	r31, r30, r31		C add d back
	addi	r10, r10, -1		C and decrement the quotient limb
L(29):
')
	std	r10, 0(r29)
	addi	r29, r29, -8
	bdnz	L(ufloop)
L(156):
	srd	r3, r31, r27		C undo normalization; return remainder
L(1):
	addi	r1, r1, 176
	ld	r0, 16(r1)
	lwz	r12, 8(r1)
	mtlr	r0
	ld	r25, -56(r1)
	ld	r26, -48(r1)
	mtcrf	8, r12			C restore nonvolatile field cr4
	ld	r27, -40(r1)
	ld	r28, -32(r1)
	ld	r29, -24(r1)
	ld	r30, -16(r1)
	ld	r31, -8(r1)
	blr
C Normalized-divisor path (d already has its high bit set, shift count 0).
C The top quotient limb is 1 iff up[un-1] >= d, computed branch-free from
C the carry of the subtraction; d is subtracted from the limb in that case.
L(162):
	cmpdi	cr7, r6, 0
	beq-	cr7, L(8)		C un == 0: straight to the inverse
	sldi	r9, r6, 3
	addi	r29, r29, -8
	add	r9, r9, r5
	addi	r28, r6, -1
	ld	r31, -8(r9)		C r31 = up[un-1]
	subfc	r9, r7, r31		C carry = (up[un-1] >= d)
	li	r9, 0
	adde	r9, r9, r9		C r9 = 0 or 1
	neg	r0, r9
	std	r9, -8(r11)		C top quotient limb = 0 or 1
	and	r0, r0, r7
	subf	r31, r0, r31		C r31 -= d when the limb was >= d
L(8):
	mr	r3, r30
	CALL(	mpn_invert_limb)
	li	r27, 0			C shift count = 0 on this path
	addic.	r6, r28, -1
	blt-	cr0, L(110)		C no integer limbs: fraction part
	mtctr	r28
	sldi	r6, r6, 3
	ALIGN(16)
C Main loop, normalized divisor: same scheme as L(uloop) but without the
C limb-merging shifts.
L(nloop):
	addi	r11, r31, 1
	ldx	r8, r26, r6
	mulld	r0, r31, r3
	mulhdu	r10, r31, r3
	addi	r6, r6, -8
	addc	r0, r0, r8
	adde	r10, r10, r11
	mulld	r31, r10, r30
	subf	r31, r31, r8	C r = nl - qh * d
	subfc	r0, r31, r0	C r <= ql
	subfe	r0, r0, r0	C r0 = -(r <= ql)
	and	r9, r30, r0
	add	r31, r31, r9
	add	r10, r0, r10	C qh -= (r >= ql)
	cmpld	cr7, r31, r30
	bge-	cr7, L(167)		C rare: remainder still >= d
L(51):
	std	r10, 0(r29)
	addi	r29, r29, -8
	bdnz	L(nloop)
	b	L(110)

C Out-of-line final adjustments (remainder >= d): subtract d once and bump
C the corresponding quotient limb, then rejoin the loop.
L(164):
	subf	r31, r30, r31
	addi	r10, r10, 1
	b	L(123)
L(167):
	subf	r31, r30, r31
	addi	r10, r10, 1
	b	L(51)
L(165):
	subf	r31, r30, r31
	addi	r8, r8, 1
	b	L(134)
EPILOGUE()