/* mpn_mu_divappr_q, mpn_preinv_mu_divappr_q.

   Compute Q = floor(N / D) + e.  N is nn limbs, D is dn limbs and must be
   normalized, and Q must be nn-dn limbs, 0 <= e <= 4.  The requirement that Q
   is nn-dn limbs (and not nn-dn+1 limbs) was put in place in order to allow us
   to let N be unmodified during the operation.

   Contributed to the GNU project by Torbjorn Granlund.

   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.

Copyright 2005-2007, 2009, 2010 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */


/*
   The idea of the algorithm used herein is to compute a smaller inverted value
   than used in the standard Barrett algorithm, and thus save time in the
   Newton iterations, and pay just a small price when using the inverted value
   for developing quotient bits.  This algorithm was presented at ICMS 2006.
*/
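
/* Illustration (added, not from the original source): the same idea in
   decimal.  To divide by D = 7, precompute the scaled reciprocal
   I = floor(10^2 / 7) = 14 and develop quotient digits by multiplying:
   for a remainder chunk r = 55, floor(r * I / 10^2) = floor(770 / 100) = 7,
   which matches floor(55 / 7) = 7.  A shorter I makes the Newton iteration
   that computes it cheaper, at the price of developing fewer quotient limbs
   per multiplication; that is the trade-off taken below.  */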

/* CAUTION: This code and the code in mu_div_qr.c should be edited in sync.

   Things to work on:

   * The itch/scratch scheme perhaps isn't such a good idea as it once seemed,
     as demonstrated by the fact that the mpn_invertappr function's scratch
     needs mean that we need to keep a large allocation long after it is
     needed.  Things are made worse by mpn_mul_fft not accepting any scratch
     parameter, which means we'll have a large memory hole while in
     mpn_mul_fft.  In general, a peak scratch need at the beginning of a
     function isn't well handled by the itch/scratch scheme.
*/

#ifdef STAT
#undef STAT
#define STAT(x) x
#else
#define STAT(x)
#endif

#include <stdlib.h>		/* for NULL */
#include "gmp-impl.h"

static mp_limb_t mpn_preinv_mu_divappr_q (mp_ptr, mp_srcptr, mp_size_t,
                                          mp_srcptr, mp_size_t, mp_srcptr,
                                          mp_size_t, mp_ptr);
static mp_size_t mpn_mu_divappr_q_choose_in (mp_size_t, mp_size_t, int);

mp_limb_t
mpn_mu_divappr_q (mp_ptr qp,
                  mp_srcptr np,
                  mp_size_t nn,
                  mp_srcptr dp,
                  mp_size_t dn,
                  mp_ptr scratch)
{
  mp_size_t qn, in;
  mp_limb_t cy, qh;
  mp_ptr ip, tp;

  ASSERT (dn > 1);

  qn = nn - dn;

  /* If Q is smaller than D, truncate operands.  */
  if (qn + 1 < dn)
    {
      np += dn - (qn + 1);
      nn -= dn - (qn + 1);
      dp += dn - (qn + 1);
      dn = qn + 1;
    }
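  /* Example (illustrative numbers, not from the source): nn = 12, dn = 9
     gives qn = 3; since qn + 1 = 4 < 9 we drop the low 5 limbs of both
     operands, leaving nn = 7, dn = 4.  The quotient size qn = nn - dn = 3
     is unchanged, and the discarded low limbs affect only the error term e
     allowed by this function.  */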

  /* Compute the inverse size.  */
  in = mpn_mu_divappr_q_choose_in (qn, dn, 0);
  ASSERT (in <= dn);

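  /* Added commentary: the enabled method below inverts the high in+1 limbs
     of D after incrementing them (for dn == in, a low limb of 1 is appended
     instead).  This makes the inverted value a lower bound on the true
     scaled inverse, so quotient blocks developed from it err on the small
     side, which the adjustment loop in the main routine corrects with
     conditional subtractions.  If the increment carries out of the top
     limb, the inverse's fractional part is exactly zero, whence the
     MPN_ZERO.  */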
#if 1
  /* This alternative inverse computation method gets slightly more accurate
     results.  FIXMEs: (1) Temp allocation needs not analysed (2) itch function
     not adapted (3) mpn_invertappr scratch needs not met.  */
  ip = scratch;
  tp = scratch + in + 1;

  /* compute an approximate inverse on (in+1) limbs */
  if (dn == in)
    {
      MPN_COPY (tp + 1, dp, in);
      tp[0] = 1;
      mpn_invertappr (ip, tp, in + 1, tp + in + 1);
      MPN_COPY_INCR (ip, ip + 1, in);
    }
  else
    {
      cy = mpn_add_1 (tp, dp + dn - (in + 1), in + 1, 1);
      if (UNLIKELY (cy != 0))
        MPN_ZERO (ip, in);
      else
        {
          mpn_invertappr (ip, tp, in + 1, tp + in + 1);
          MPN_COPY_INCR (ip, ip + 1, in);
        }
    }
#else
  /* This older inverse computation method gets slightly worse results than
     the one above.  */
  ip = scratch;
  tp = scratch + in;

  /* Compute inverse of D to in+1 limbs, then round to 'in' limbs.  Ideally
     the inversion function should do this automatically.  */
  if (dn == in)
    {
      tp[in + 1] = 0;
      MPN_COPY (tp + in + 2, dp, in);
      mpn_invertappr (tp, tp + in + 1, in + 1, NULL);
    }
  else
    {
      mpn_invertappr (tp, dp + dn - (in + 1), in + 1, NULL);
    }
  cy = mpn_sub_1 (tp, tp, in + 1, GMP_NUMB_HIGHBIT);
  if (UNLIKELY (cy != 0))
    MPN_ZERO (tp + 1, in);
  MPN_COPY (ip, tp + 1, in);
#endif

  qh = mpn_preinv_mu_divappr_q (qp, np, nn, dp, dn, ip, in, scratch + in);

  return qh;
}

static mp_limb_t
mpn_preinv_mu_divappr_q (mp_ptr qp,
                         mp_srcptr np,
                         mp_size_t nn,
                         mp_srcptr dp,
                         mp_size_t dn,
                         mp_srcptr ip,
                         mp_size_t in,
                         mp_ptr scratch)
{
  mp_size_t qn;
  mp_limb_t cy, cx, qh;
  mp_limb_t r;
  mp_size_t tn, wn;

#define rp           scratch
#define tp           (scratch + dn)
#define scratch_out  (scratch + dn + tn)

  qn = nn - dn;

  np += qn;
  qp += qn;

  qh = mpn_cmp (np, dp, dn) >= 0;
  if (qh != 0)
    mpn_sub_n (rp, np, dp, dn);
  else
    MPN_COPY (rp, np, dn);

  if (qn == 0)
    return qh;			/* Degenerate use.  Should we allow this?  */

  while (qn > 0)
    {
      if (qn < in)
        {
          ip += in - qn;
          in = qn;
        }
      np -= in;
      qp -= in;

      /* Compute the next block of quotient limbs by multiplying the inverse I
         by the upper part of the partial remainder R.  */
      mpn_mul_n (tp, rp + dn - in, ip, in);		/* mulhi */
      cy = mpn_add_n (qp, tp + in, rp + dn - in, in);	/* I's msb implicit */
      ASSERT_ALWAYS (cy == 0);
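      /* Added note: the most significant limb of the (in+1)-limb inverse is
         an implicit 1, so the full product is tp + R_high * B^in, B being
         the limb base.  The addition above folds that implicit term in; its
         high in limbs, now in qp, form the candidate quotient block.  */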

      qn -= in;
      if (qn == 0)
        break;

      /* Compute the product of the quotient block and the divisor D, to be
         subtracted from the partial remainder combined with new limbs from
         the dividend N.  We only really need the low dn limbs.  */

      if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
        mpn_mul (tp, dp, dn, qp, in);	/* dn+in limbs, high 'in' cancels */
      else
        {
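          /* Added commentary: mpn_mulmod_bnm1 computes the product mod
             B^tn - 1, so when dn + in > tn the top wn limbs of the true
             product wrap around and are added into the low limbs.  Those
             top limbs agree, to within one unit, with the top limbs of R,
             since the quotient block was derived from them; subtracting
             R's top wn limbs therefore undoes the wrap, and the cx/cy step
             below corrects the possible off-by-one.  */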
          tn = mpn_mulmod_bnm1_next_size (dn + 1);
          mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, in, scratch_out);
          wn = dn + in - tn;		/* number of wrapped limbs */
          if (wn > 0)
            {
              cy = mpn_sub_n (tp, tp, rp + dn - wn, wn);
              cy = mpn_sub_1 (tp + wn, tp + wn, tn - wn, cy);
              cx = mpn_cmp (rp + dn - in, tp + dn, tn - dn) < 0;
              ASSERT_ALWAYS (cx >= cy);
              mpn_incr_u (tp, cx - cy);
            }
        }

      r = rp[dn - in] - tp[dn];
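      /* Added note: r is the high limb of the partial remainder minus the
         corresponding limb of the product; together with the borrow cy from
         the subtraction below, it counts how many further subtractions of D
         the adjustment loop has to make.  */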

      /* Subtract the product from the partial remainder combined with new
         limbs from the dividend N, generating a new partial remainder R.  */
      if (dn != in)
        {
          cy = mpn_sub_n (tp, np, tp, in);	/* get next 'in' limbs from N */
          cy = mpn_sub_nc (tp + in, rp, tp + in, dn - in, cy);
          MPN_COPY (rp, tp, dn);		/* FIXME: try to avoid this */
        }
      else
        {
          cy = mpn_sub_n (rp, np, tp, in);	/* get next 'in' limbs from N */
        }

      STAT (int i; int err = 0;
            static int errarr[5]; static int err_rec; static int tot);

      /* Check the remainder R and adjust the quotient as needed.  */
      r -= cy;
      while (r != 0)
        {
          /* We loop 0 times with about 69% probability, 1 time with about
             31% probability, 2 times with about 0.6% probability, if the
             inverse is computed as recommended.  */
          mpn_incr_u (qp, 1);
          cy = mpn_sub_n (rp, rp, dp, dn);
          r -= cy;
          STAT (err++);
        }
      if (mpn_cmp (rp, dp, dn) >= 0)
        {
          /* This is executed with about 76% probability.  */
          mpn_incr_u (qp, 1);
          cy = mpn_sub_n (rp, rp, dp, dn);
          STAT (err++);
        }

      STAT (
            tot++;
            errarr[err]++;
            if (err > err_rec)
              err_rec = err;
            if (tot % 0x10000 == 0)
              {
                for (i = 0; i <= err_rec; i++)
                  printf (" %d(%.1f%%)", errarr[i], 100.0*errarr[i]/tot);
                printf ("\n");
              }
            );
    }

  /* FIXME: We should perhaps be somewhat more elegant in our rounding of the
     quotient.  For now, just make sure the returned quotient is >= the real
     quotient; add 3 with saturating arithmetic.  */
  qn = nn - dn;
  cy = mpn_add_1 (qp, qp, qn, 3);
  if (cy != 0)
    {
      if (qh != 0)
        {
          /* Return a quotient of just 1-bits, with qh set.  */
          mp_size_t i;
          for (i = 0; i < qn; i++)
            qp[i] = GMP_NUMB_MAX;
        }
      else
        {
          /* Propagate carry into qh.  */
          qh = 1;
        }
    }

  return qh;
}

/* In case k=0 (automatic choice), we distinguish 3 cases:
   (a) dn < qn:         in = ceil(qn / ceil(qn/dn))
   (b) dn/3 < qn <= dn: in = ceil(qn / 2)
   (c) qn < dn/3:       in = qn
   In all cases we have in <= dn.
 */
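/* Worked examples (illustrative, not from the source):
   (a) qn = 10, dn = 4:  b = ceil(10/4) = 3 blocks, in = ceil(10/3) = 4;
   (b) qn = 7,  dn = 10: since dn/3 < qn <= dn, in = ceil(7/2) = 4;
   (c) qn = 2,  dn = 10: in = qn = 2.  */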
static mp_size_t
mpn_mu_divappr_q_choose_in (mp_size_t qn, mp_size_t dn, int k)
{
  mp_size_t in;

  if (k == 0)
    {
      mp_size_t b;
      if (qn > dn)
        {
          /* Compute an inverse size that is a nice partition of the
             quotient.  */
          b = (qn - 1) / dn + 1;	/* ceil(qn/dn), number of blocks */
          in = (qn - 1) / b + 1;	/* ceil(qn/b) = ceil(qn / ceil(qn/dn)) */
        }
      else if (3 * qn > dn)
        {
          in = (qn - 1) / 2 + 1;	/* b = 2 */
        }
      else
        {
          in = (qn - 1) / 1 + 1;	/* b = 1 */
        }
    }
  else
    {
      mp_size_t xn;
      xn = MIN (dn, qn);
      in = (xn - 1) / k + 1;
    }

  return in;
}

mp_size_t
mpn_mu_divappr_q_itch (mp_size_t nn, mp_size_t dn, int mua_k)
{
  mp_size_t qn, in, itch_local, itch_out, itch_invapp;

  qn = nn - dn;
  if (qn + 1 < dn)
    {
      dn = qn + 1;
    }
  in = mpn_mu_divappr_q_choose_in (qn, dn, mua_k);

  itch_local = mpn_mulmod_bnm1_next_size (dn + 1);
  itch_out = mpn_mulmod_bnm1_itch (itch_local, dn, in);
  itch_invapp = mpn_invertappr_itch (in + 1) + in + 2;	/* 3in + 4 */

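  /* Added note: the layout matches the main routine: in limbs for the
     inverse I, then the larger of the division scratch (rp: dn limbs,
     tp: itch_local limbs, scratch_out: itch_out limbs) and the
     mpn_invertappr temporaries.  */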
  ASSERT (dn + itch_local + itch_out >= itch_invapp);
  return in + MAX (dn + itch_local + itch_out, itch_invapp);
}
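
/* Illustrative usage sketch, not part of the library.  The wrapper name and
   the choice of sizing qp to qn = nn - dn limbs are the only assumptions;
   the TMP_* allocation macros (from gmp-impl.h) and both entry points are
   the ones used above.  Enable by removing the #if 0 guard.  */
#if 0
static mp_limb_t
example_mu_divappr (mp_ptr qp,			/* quotient, nn - dn limbs */
                    mp_srcptr np, mp_size_t nn,
                    mp_srcptr dp, mp_size_t dn)	/* dp must be normalized */
{
  mp_limb_t qh;
  mp_ptr scratch;
  TMP_DECL;

  TMP_MARK;
  /* One allocation covering the peak scratch need, per the itch function.  */
  scratch = TMP_ALLOC_LIMBS (mpn_mu_divappr_q_itch (nn, dn, 0));
  qh = mpn_mu_divappr_q (qp, np, nn, dp, dn, scratch);
  TMP_FREE;
  return qh;	/* quotient high limb; Q = floor(N/D) + e, 0 <= e <= 4 */
}
#endif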