1 /* UltraSparc 64 mpn_divrem_1 -- mpn by limb division.
2
3 Copyright 1991, 1993, 1994, 1996, 1998-2001, 2003 Free Software Foundation,
4 Inc.
5
6 This file is part of the GNU MP Library.
7
8 The GNU MP Library is free software; you can redistribute it and/or modify
9 it under the terms of either:
10
11 * the GNU Lesser General Public License as published by the Free
12 Software Foundation; either version 3 of the License, or (at your
13 option) any later version.
14
15 or
16
17 * the GNU General Public License as published by the Free Software
18 Foundation; either version 2 of the License, or (at your option) any
19 later version.
20
21 or both in parallel, as here.
22
23 The GNU MP Library is distributed in the hope that it will be useful, but
24 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
25 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
26 for more details.
27
28 You should have received copies of the GNU General Public License and the
29 GNU Lesser General Public License along with the GNU MP Library. If not,
30 see https://www.gnu.org/licenses/. */
31
32 #include "gmp-impl.h"
33 #include "longlong.h"
34
35 #include "mpn/sparc64/sparc64.h"
36
37
38 /* 64-bit divisor 32-bit divisor
39 cycles/limb cycles/limb
40 (approx) (approx)
41 integer fraction integer fraction
42 Ultrasparc 2i: 160 160 122 96
43 */
44
45
46 /* 32-bit divisors are treated in special case code. This requires 4 mulx
47 per limb instead of 8 in the general case.
48
49 For big endian systems we need HALF_ENDIAN_ADJ included in the src[i]
50 addressing, to get the two halves of each limb read in the correct order.
51 This is kept in an adj variable. Doing that measures about 4 c/l faster
52 than just writing HALF_ENDIAN_ADJ(i) in the integer loop. The latter
53 shouldn't be 6 cycles worth of work, but perhaps it doesn't schedule well
54 (on gcc 3.2.1 at least). The fraction loop doesn't seem affected, but we
55 still use a variable since that ought to work out best. */
56
/* Divide {ap_limbptr, size_limbs} by d_limb.  The size_limbs integer
   quotient limbs are stored at qp_limbptr+xsize_limbs, and a further
   xsize_limbs "fraction" quotient limbs (developed from zero limbs below
   the radix point) are stored at qp_limbptr.  Returns the remainder,
   0 <= remainder < d_limb.

   Two code paths: divisors fitting in 32 bits are processed in 32-bit
   halves using invert_half_limb/udiv_qrnnd_half_preinv (from
   mpn/sparc64/sparc64.h), needing 4 mulx per limb instead of 8; larger
   divisors go through the generic invert_limb/udiv_qrnnd_preinv scheme.  */
mp_limb_t
mpn_divrem_1 (mp_ptr qp_limbptr, mp_size_t xsize_limbs,
              mp_srcptr ap_limbptr, mp_size_t size_limbs, mp_limb_t d_limb)
{
  mp_size_t  total_size_limbs;
  mp_size_t  i;

  ASSERT (xsize_limbs >= 0);
  ASSERT (size_limbs >= 0);
  ASSERT (d_limb != 0);
  /* FIXME: What's the correct overlap rule when xsize!=0? */
  ASSERT (MPN_SAME_OR_SEPARATE_P (qp_limbptr + xsize_limbs,
                                  ap_limbptr, size_limbs));

  total_size_limbs = size_limbs + xsize_limbs;
  if (UNLIKELY (total_size_limbs == 0))
    return 0;

  /* udivx is good for total_size==1, and no need to bother checking
     limb<divisor, since if that's likely the caller should check */
  if (UNLIKELY (total_size_limbs == 1))
    {
      mp_limb_t  a, q;
      /* size_limbs==0 means the single quotient limb is a fraction limb
         developed from a zero numerator limb.  */
      a = (LIKELY (size_limbs != 0) ? ap_limbptr[0] : 0);
      q = a / d_limb;
      qp_limbptr[0] = q;
      return a - q*d_limb;
    }

  if (d_limb <= CNST_LIMB(0xFFFFFFFF))
    {
      /* 32-bit divisor: work on the operands as arrays of 32-bit halves.
         All sizes below are counted in halves, hence the doubling.  */
      mp_size_t  size, xsize, total_size, adj;
      unsigned   *qp, n1, n0, q, r, nshift, norm_rmask;
      mp_limb_t  dinv_limb;
      const unsigned *ap;
      int        norm, norm_rshift;

      size = 2 * size_limbs;
      xsize = 2 * xsize_limbs;
      total_size = size + xsize;

      ap = (unsigned *) ap_limbptr;
      qp = (unsigned *) qp_limbptr;

      /* Integer quotient halves go above the fraction halves.  */
      qp += xsize;
      r = 0;        /* initial remainder */

      if (LIKELY (size != 0))
        {
          /* HALF_ENDIAN_ADJ (from sparc64.h) corrects the half-index so the
             two 32-bit halves of each 64-bit limb are read in value order on
             big endian systems; see the comment above this function.  */
          n1 = ap[size-1 + HALF_ENDIAN_ADJ(1)];

          /* If the length of the source is uniformly distributed, then
             there's a 50% chance of the high 32-bits being zero, which we
             can skip.  */
          if (n1 == 0)
            {
              n1 = ap[size-2 + HALF_ENDIAN_ADJ(0)];
              total_size--;
              size--;
              ASSERT (size > 0); /* because always even */
              qp[size + HALF_ENDIAN_ADJ(1)] = 0;
            }

          /* Skip a division if high < divisor (high quotient 0).  Testing
             here before normalizing will still skip as often as
             possible.  */
          if (n1 < d_limb)
            {
              r = n1;
              size--;
              qp[size + HALF_ENDIAN_ADJ(size)] = 0;
              total_size--;
              if (total_size == 0)
                return r;
            }
        }

      /* Normalize the divisor so its high bit (bit 31) is set, and shift
         the partial remainder up to match.  norm is the shift count within
         the 32-bit half, hence the -32 after counting on the full limb.  */
      count_leading_zeros_32 (norm, d_limb);
      norm -= 32;
      d_limb <<= norm;
      r <<= norm;

      /* norm_rmask guards the right shift: when norm==0, norm_rshift is 32,
         which would be an undefined shift on a 32-bit value, so the mask
         forces that contribution to zero instead.  */
      norm_rshift = 32 - norm;
      norm_rmask = (norm == 0 ? 0 : 0xFFFFFFFF);

      invert_half_limb (dinv_limb, d_limb);

      if (LIKELY (size != 0))
        {
          /* Integer part: divide halves from most to least significant.
             adj alternates sign each step, tracking HALF_ENDIAN_ADJ of the
             current index; kept in a variable since that measured faster
             than recomputing the macro (see comment above the function).  */
          i = size - 1;
          adj = HALF_ENDIAN_ADJ (i);
          n1 = ap[i + adj];
          adj = -adj;
          r |= ((n1 >> norm_rshift) & norm_rmask);
          for ( ; i > 0; i--)
            {
              n0 = ap[i-1 + adj];
              adj = -adj;
              /* nshift is the next normalized dividend half: low bits of n1
                 joined with the high bits of n0.  */
              nshift = (n1 << norm) | ((n0 >> norm_rshift) & norm_rmask);
              udiv_qrnnd_half_preinv (q, r, r, nshift, d_limb, dinv_limb);
              qp[i + adj] = q;
              n1 = n0;
            }
          nshift = n1 << norm;
          udiv_qrnnd_half_preinv (q, r, r, nshift, d_limb, dinv_limb);
          qp[0 + HALF_ENDIAN_ADJ(0)] = q;
        }
      /* Fraction part: continue the division with zero dividend halves.  */
      qp -= xsize;
      adj = HALF_ENDIAN_ADJ (0);
      for (i = xsize-1; i >= 0; i--)
        {
          udiv_qrnnd_half_preinv (q, r, r, 0, d_limb, dinv_limb);
          adj = -adj;
          qp[i + adj] = q;
        }

      /* Undo the normalization to get the true remainder.  */
      return r >> norm;
    }
  else
    {
      /* Full 64-bit divisor: generic normalize + preinverted-division loop. */
      mp_srcptr  ap;
      mp_ptr     qp;
      mp_size_t  size, xsize, total_size;
      mp_limb_t  d, n1, n0, q, r, dinv, nshift, norm_rmask;
      int        norm, norm_rshift;

      ap = ap_limbptr;
      qp = qp_limbptr;
      size = size_limbs;
      xsize = xsize_limbs;
      total_size = total_size_limbs;
      d = d_limb;

      qp += total_size;  /* above high limb */
      r = 0;             /* initial remainder */

      if (LIKELY (size != 0))
        {
          /* Skip a division if high < divisor (high quotient 0).  Testing
             here before normalizing will still skip as often as
             possible.  */
          n1 = ap[size-1];
          if (n1 < d)
            {
              r = n1;
              *--qp = 0;
              total_size--;
              if (total_size == 0)
                return r;
              size--;
            }
        }

      /* Normalize so the divisor's high bit is set; shift the partial
         remainder to match.  */
      count_leading_zeros (norm, d);
      d <<= norm;
      r <<= norm;

      /* As in the 32-bit path, norm_rmask zeroes the ">> norm_rshift"
         contribution when norm==0, where the shift count would equal the
         limb width and be undefined.  */
      norm_rshift = GMP_LIMB_BITS - norm;
      norm_rmask = (norm == 0 ? 0 : ~CNST_LIMB(0));

      invert_limb (dinv, d);

      if (LIKELY (size != 0))
        {
          /* Integer part: high to low, feeding normalized two-limb
             dividends to the preinverted division.  */
          n1 = ap[size-1];
          r |= ((n1 >> norm_rshift) & norm_rmask);
          for (i = size-2; i >= 0; i--)
            {
              n0 = ap[i];
              nshift = (n1 << norm) | ((n0 >> norm_rshift) & norm_rmask);
              udiv_qrnnd_preinv (q, r, r, nshift, d, dinv);
              *--qp = q;
              n1 = n0;
            }
          nshift = n1 << norm;
          udiv_qrnnd_preinv (q, r, r, nshift, d, dinv);
          *--qp = q;
        }
      /* Fraction part: continue dividing with zero limbs.  */
      for (i = 0; i < xsize; i++)
        {
          udiv_qrnnd_preinv (q, r, r, CNST_LIMB(0), d, dinv);
          *--qp = q;
        }
      /* Undo the normalization to get the true remainder.  */
      return r >> norm;
    }
}
243