dnl  PPC-32 mpn_divrem_2 -- Divide an mpn number by a normalized 2-limb number.

dnl  Copyright 2007, 2008, 2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C		cycles/limb
C		norm	frac
C 7410		~36.5	~36.5
C 744x, 745x	 29	 29

C INPUT PARAMETERS
C qp  = r3
C fn  = r4
C up  = r5
C un  = r6
C d   = r7
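
C The C-level interface, as declared in GMP's internal headers (parameter
C names here are only illustrative), is roughly
C   mp_limb_t mpn_divrem_2 (mp_ptr qp, mp_size_t fn, mp_ptr up,
C                           mp_size_t un, mp_srcptr dp)
C It divides {up,un} extended by fn zero fraction limbs by the normalized
C two-limb divisor {dp,2}, stores un - 2 + fn quotient limbs at qp, leaves
C the two-limb remainder in up[0] and up[1], and returns the most
C significant quotient limb in r3.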

C TODO
C  * Decrease register usage.
C  * Make sure mul operands are optimal for early-out.
C  * Check that things work well for a shared library build.
C  * Write an invert_limb, perhaps inline, perhaps as a private call.  Or at
C    least vastly improve the current __udiv_qrnnd_c based code.


ASM_START()
PROLOGUE(mpn_divrem_2)
	stwu	r1, -32(r1)
	slwi	r0, r6, 2
	add	r5, r5, r0
	stmw	r28, 8(r1)
	addi	r29, r5, -8		C up = up_param + un - 2
	lwz	r10, 4(r7)
	lwz	r12, 4(r29)
	addi	r8, r3, -12
	lwz	r7, 0(r7)
	cmplw	cr7, r12, r10
	lwz	r28, 0(r29)
	blt-	cr7, L(2)
	bgt+	cr7, L(4)
	cmplw	cr7, r28, r7
	blt-	cr7, L(2)
L(4):	subfc	r28, r7, r28
	subfe	r12, r10, r12
	li	r3, 1
	b	L(6)
L(2):	li	r3, 0

L(6):	add	r0, r4, r6
	addic.	r30, r0, -2
	ble-	cr0, L(ret)

	slwi	r9, r0, 2
	add	r8, r8, r9		C rp += un + fn
	mtctr	r30

C Compute di from d1
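C di = floor((B^2 - 1) / d1) - B, with B = 2^32, i.e. the value an
C invert_limb call would produce for the normalized d1.  Since 32-bit
C PowerPC has no 64/32-bit divide, it is computed 16 bits at a time with
C divwu, along the lines of the __udiv_qrnnd_c macro mentioned in the TODO
C above.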
	srwi	r11, r10, 16
	nor	r0, r10, r10
	divwu	r31, r0, r11
	rlwinm	r5, r10, 0, 16, 31
	mullw	r9, r11, r31
	mullw	r6, r5, r31
	subf	r0, r9, r0
	slwi	r0, r0, 16
	ori	r0, r0, 65535
	cmplw	cr7, r0, r6
	bge-	cr7, L(9)
	add	r0, r0, r10
	cmplw	cr7, r0, r10
	cmplw	cr6, r6, r0
	addi	r31, r31, -1		C q1--
	crorc	28, 28, 25
	bc+	12, 28, L(9)
	addi	r31, r31, -1		C q1--
	add	r0, r0, r10
L(9):	subf	r0, r6, r0
	divwu	r6, r0, r11
	mullw	r9, r11, r6
	mullw	r11, r5, r6
	subf	r0, r9, r0
	slwi	r0, r0, 16
	ori	r0, r0, 65535
	cmplw	cr7, r0, r11
	bge-	cr7, L(13)
	add	r0, r0, r10
	cmplw	cr7, r0, r10
	cmplw	cr6, r11, r0
	addi	r6, r6, -1		C q0--
	crorc	28, 28, 25
	bc+	12, 28, L(13)
C	add	r0, r0, r10		C final remainder
	addi	r6, r6, -1		C q0--
L(13):	rlwimi	r6, r31, 16, 0, 15	C assemble final quotient

C Adjust di by including d0
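C Fold the low divisor limb into di: form LO(d1*di) + d0 + HI(d0*di) and,
C while that sum (carries included) is still >= B, subtract d1 and decrement
C di.  The result serves as an inverse of the whole two-limb divisor
C {d0,d1} rather than of d1 alone, roughly the role of the d0 adjustment in
C GMP's generic invert_pi1.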
	mullw	r9, r10, r6		C t0 = LO(di * d1)
	addc	r11, r9, r7
	subfe	r0, r1, r1
	mulhwu	r9, r6, r7		C s1 = HI(di * d0)
	addc	r9, r11, r9
	addze.	r0, r0
	blt	cr0, L(17)
L(18):	subfc	r9, r10, r9
	addi	r6, r6, -1
	addme.	r0, r0
	bge+	cr0, L(18)
L(17):

C r0  r3  r4  r5  r6  r7  r8  r9 r10 r11 r12 r28 r29 r30 r31
C     msl         di  d0  qp     d1          fn  up  un
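C Main loop, one quotient limb per iteration: r12:r31 = n2 * di + (n2:n1)
C gives the quotient estimate q in r12, the provisional limb q + 1 is kept
C in r5, and (q + 1) * {d0,d1} is subtracted from the top dividend limbs.
C The compare against r31 decides whether the divisor must be added back
C (estimate one too large), and L(fix) handles the rare case needing one
C more subtraction.  Once the counter r30 has dropped to fn, zero fraction
C limbs (li r9, 0) are used instead of loading from up.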
L(loop):
	mullw	r0, r12, r6		C q0 = LO(n2 * di)
	cmpw	cr7, r30, r4
	addc	r31, r0, r28		C q0 += n1
	mulhwu	r9, r12, r6		C q  = HI(n2 * di)
	adde	r12, r9, r12		C q  += n2
	addi	r30, r30, -1
	mullw	r0, r10, r12		C d1 * q
	li	r9, 0
	subf	r0, r0, r28		C n1 -= d1 * q
	addi	r5, r12, 1
	ble-	cr7, L(23)
	lwzu	r9, -4(r29)
L(23):	mullw	r11, r12, r7		C t0 = LO(d0 * q)
	subfc	r28, r7, r9		C n0 -= d0
	subfe	r0, r10, r0		C n1 -= d1
	mulhwu	r12, r12, r7		C t1 = HI(d0 * q)
	subfc	r28, r11, r28		C n0 -= t0
	subfe	r12, r12, r0		C n1 -= t1
	cmplw	cr7, r12, r31
	blt+	cr7, L(24)
	addc	r28, r28, r7
	adde	r12, r12, r10
	addi	r5, r5, -1
L(24):	cmplw	cr7, r12, r10
	bge-	cr7, L(fix)
L(bck):	stw	r5, 0(r8)
	addi	r8, r8, -4
	bdnz	L(loop)

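C Store the two remainder limbs over the least significant dividend limbs
C (r29 has walked down to up[0] by this point) and return; r3 still holds
C the most significant quotient limb set before the loop.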
L(ret):	stw	r28, 0(r29)
	stw	r12, 4(r29)
	lmw	r28, 8(r1)
	addi	r1, r1, 32
	blr

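C Rare correction path: the partial remainder is still >= {d0,d1} (cr7
C compares the high limbs, cr6 breaks a tie on the low limbs), so subtract
C the divisor once more and increment the quotient limb.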
L(fix):	cmplw	cr6, r28, r7
	bgt+	cr7, L(28)
	blt-	cr6, L(bck)
L(28):	subfc	r28, r7, r28
	subfe	r12, r10, r12
	addi	r5, r5, 1
	b	L(bck)
EPILOGUE()