dnl  PPC-32 mpn_divrem_2 -- Divide an mpn number by a normalized 2-limb number.

dnl  Copyright 2007, 2008, 2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C		cycles/limb
C		norm	frac
C 7410		~36.5	~36.5
C 744x, 745x	 29	 29

C INPUT PARAMETERS
C qp  = r3
C fn  = r4
C up  = r5
C un  = r6
C d   = r7
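
C The C prototype (cf. gmp-impl.h, where fn is called qxn) is
C   mp_limb_t mpn_divrem_2 (mp_ptr qp, mp_size_t fn,
C                           mp_ptr up, mp_size_t un, mp_srcptr dp);
C with un >= 2 and the divisor normalized (msb of dp[1] set).  It
C stores un + fn - 2 quotient limbs at qp, returns the most
C significant quotient limb (0 or 1), and leaves the 2-limb remainder
C at {up, 2}.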

C TODO
C  * Decrease register usage.
C  * Make sure mul operands are optimal for early-out.
C  * Check that things work well for a shared library build.
C  * Write an invert_limb, perhaps inline, perhaps as a private call.  Or at
C    least vastly improve the current __udiv_qrnnd_c based code.


ASM_START()
PROLOGUE(mpn_divrem_2)
	stwu	r1, -32(r1)
	slwi	r0, r6, 2
	add	r5, r5, r0
	stmw	r28, 8(r1)
	addi	r29, r5, -8		C up = up_param + un - 2
	lwz	r10, 4(r7)
	lwz	r12, 4(r29)
	addi	r8, r3, -12
	lwz	r7, 0(r7)
	cmplw	cr7, r12, r10
	lwz	r28, 0(r29)
	blt-	cr7, L(2)
	bgt+	cr7, L(4)
	cmplw	cr7, r28, r7
	blt-	cr7, L(2)
L(4):	subfc	r28, r7, r28
	subfe	r12, r10, r12
	li	r3, 1
	b	L(6)
L(2):	li	r3, 0
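
C r3, the function's return value, is the most significant quotient
C limb: it is 1 exactly when the top two dividend limbs (n1:n0) are
C >= the divisor (d1:d0), in which case the divisor was subtracted
C from them above.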

L(6):	add	r0, r4, r6
	addic.	r30, r0, -2
	ble-	cr0, L(ret)

	slwi	r9, r0, 2
	add	r8, r8, r9		C rp += un + fn
	mtctr	r30

C Compute di from d1
	srwi	r11, r10, 16
	nor	r0, r10, r10
	divwu	r31, r0, r11
	rlwinm	r5, r10, 0, 16, 31
	mullw	r9, r11, r31
	mullw	r6, r5, r31
	subf	r0, r9, r0
	slwi	r0, r0, 16
	ori	r0, r0, 65535
	cmplw	cr7, r0, r6
	bge-	cr7, L(9)
	add	r0, r0, r10
	cmplw	cr7, r0, r10
	cmplw	cr6, r6, r0
	addi	r31, r31, -1		C q1--
	crorc	28, 28, 25
	bc+	12, 28, L(9)
	addi	r31, r31, -1		C q1--
	add	r0, r0, r10
L(9):	subf	r0, r6, r0
	divwu	r6, r0, r11
	mullw	r9, r11, r6
	mullw	r11, r5, r6
	subf	r0, r9, r0
	slwi	r0, r0, 16
	ori	r0, r0, 65535
	cmplw	cr7, r0, r11
	bge-	cr7, L(13)
	add	r0, r0, r10
	cmplw	cr7, r0, r10
	cmplw	cr6, r11, r0
	addi	r6, r6, -1		C q0--
	crorc	28, 28, 25
	bc+	12, 28, L(13)
C	add	r0, r0, r10		C final remainder
	addi	r6, r6, -1		C q0--
L(13):	rlwimi	r6, r31, 16, 0, 15	C assemble final quotient
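
C The sequence above computes di = floor((B^2-1)/d1) - B (B = 2^32),
C i.e. udiv_qrnnd (di, dummy, ~d1, ~0, d1), by schoolbook division
C with 16-bit digits a la longlong.h's __udiv_qrnnd_c.  One digit
C step, as an illustrative C sketch (names are not from this file):
C
C   q1 = ~d1 / (d1 >> 16);              /* d1 has its msb set */
C   r  = ((~d1 - q1 * (d1 >> 16)) << 16) | 0xffff;
C   m  = q1 * (d1 & 0xffff);
C   if (r < m)
C     {
C       q1--, r += d1;
C       if (r >= d1 && r < m)           /* r += d1 did not wrap */
C         q1--, r += d1;
C     }
C   r -= m;
C
C The second digit q0 repeats the pattern on r; di = q1 << 16 | q0.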

C Adjust di by including d0
	mullw	r9, r10, r6		C t0 = LO(di * d1)
	addc	r11, r9, r7
	subfe	r0, r1, r1
	mulhwu	r9, r6, r7		C s1 = HI(di * d0)
	addc	r9, r11, r9
	addze.	r0, r0
	blt	cr0, L(17)
L(18):	subfc	r9, r10, r9
	addi	r6, r6, -1
	addme.	r0, r0
	bge+	cr0, L(18)
L(17):
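
C This turns the 2/1 inverse of d1 into a 3/2 inverse of the pair
C (d1,d0), i.e. di = floor((B^3-1)/(d1*B+d0)) - B, decrementing di
C while LO(di*d1) + d0 + HI(di*d0) carries out of a limb.  Compare
C the invert_pi1 logic in gmp-impl.h, roughly (illustrative C):
C
C   p = d1 * di + d0;
C   if (p < d0)                         /* carry */
C     { di--; if (p >= d1) { di--; p -= d1; } p -= d1; }
C   umul_ppmm (t1, t0, d0, di);         /* t1:t0 = d0 * di */
C   p += t1;
C   if (p < t1)                         /* carry */
C     { di--; if (p > d1 || (p == d1 && t0 >= d0)) di--; }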

C r0  r3  r4  r5  r6  r7  r8  r9 r10 r11 r12 r28 r29 r30 r31
C     msl         di  d0  qp     d1          fn  up  un
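
C The loop below produces one quotient limb per iteration with a 3/2
C division step in the style of Moller and Granlund, "Improved
C division by invariant integers" (cf. udiv_qr_3by2 in gmp-impl.h).
C Roughly, as an illustrative C sketch:
C
C   umul_ppmm (q, ql, n2, di);          /* q:ql = di * n2 */
C   add_ssaaaa (q, ql, q, ql, n2, n1);  /* candidate quotient q */
C   r1 = n1 - d1 * q;                   /* remainder cand. n1:n0 */
C   sub_ddmmss (r1, r0, r1, n0, d1, d0);
C   umul_ppmm (t1, t0, d0, q);
C   sub_ddmmss (r1, r0, r1, r0, t1, t0);
C   q++;
C   if (r1 >= ql)                       /* add-back; branch-free */
C     { q--; add_ssaaaa (r1, r0, r1, r0, d1, d0); }  /* in the paper */
C   if (r1 >= d1)                       /* rare, see L(fix) below */
C     ...
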
L(loop):
	mullw	r0, r12, r6		C q0 = LO(n2 * di)
	cmpw	cr7, r30, r4
	addc	r31, r0, r28		C q0 += n1
	mulhwu	r9, r12, r6		C q  = HI(n2 * di)
	adde	r12, r9, r12		C q  += n2
	addi	r30, r30, -1
	mullw	r0, r10, r12		C d1 * q
	li	r9, 0
	subf	r0, r0, r28		C n1 -= d1 * q
	addi	r5, r12, 1
	ble-	cr7, L(23)
	lwzu	r9, -4(r29)
L(23):	mullw	r11, r12, r7		C t0 = LO(d0 * q)
	subfc	r28, r7, r9		C n0 -= d0
	subfe	r0, r10, r0		C n1 -= d1
	mulhwu	r12, r12, r7		C t1 = HI(d0 * q)
	subfc	r28, r11, r28		C n0 -= t0
	subfe	r12, r12, r0		C n1 -= t1
	cmplw	cr7, r12, r31
	blt+	cr7, L(24)
	addc	r28, r28, r7
	adde	r12, r12, r10
	addi	r5, r5, -1
L(24):	cmplw	cr7, r12, r10
	bge-	cr7, L(fix)
L(bck):	stw	r5, 0(r8)
	addi	r8, r8, -4
	bdnz	L(loop)

L(ret):	stw	r28, 0(r29)
	stw	r12, 4(r29)
	lmw	r28, 8(r1)
	addi	r1, r1, 32
	blr

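C Rare fix-up, entered with cr7 recording n1 >= d1: unless the 2-limb
C remainder (n1:n0) is in fact below the divisor (d1:d0), subtract the
C divisor once more and increment the quotient limb.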
L(fix):	cmplw	cr6, r28, r7
	bgt+	cr7, L(28)
	blt-	cr6, L(bck)
L(28):	subfc	r28, r7, r28
	subfe	r12, r10, r12
	addi	r5, r5, 1
	b	L(bck)
EPILOGUE()