/* mpn_mu_divappr_q, mpn_preinv_mu_divappr_q.

   Compute Q = floor(N / D) + e, where N is nn limbs, D is dn limbs and must
   be normalized, Q is nn-dn limbs, and 0 <= e <= 4.  The requirement that Q
   be nn-dn limbs (and not nn-dn+1 limbs) was put in place to allow N to
   remain unmodified during the operation.
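
   For example, with nn = 7 and dn = 3, the 4 quotient limbs written to qp
   (together with the returned high limb) represent a value Q satisfying
   floor(N/D) <= Q <= floor(N/D) + 4.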

   Contributed to the GNU project by Torbjorn Granlund.

   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.

Copyright 2005-2007, 2009, 2010 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */


/*
   The idea of the algorithm used herein is to compute a smaller inverted
   value than used in the standard Barrett algorithm, and thus save time in
   the Newton iterations, and pay just a small price when using the inverted
   value for developing quotient bits.  This algorithm was presented at
   ICMS 2006.
*/
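
/* Informally, each pass of the main loop in mpn_preinv_mu_divappr_q below
   develops the next 'in' quotient limbs as

     q = hi (R_high * I) + R_high

   where R_high is the top 'in' limbs of the partial remainder and I is the
   'in'-limb approximate inverse with its (implicit) most significant bit
   stripped; adding R_high back accounts for that implicit bit.  This is the
   mpn_mul_n/mpn_add_n pair in the loop.  */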

/* CAUTION: This code and the code in mu_div_qr.c should be edited in sync.

 Things to work on:

  * The itch/scratch scheme is perhaps not as good an idea as it once seemed:
    mpn_invertappr's scratch needs force us to keep a large allocation long
    after it is needed.  Things are made worse by mpn_mul_fft not accepting
    any scratch parameter, which means we'll have a large memory hole while
    in mpn_mul_fft.  In general, a peak scratch need at the beginning of a
    function isn't well handled by the itch/scratch scheme.
*/

#ifdef STAT
#undef STAT
#define STAT(x) x
#else
#define STAT(x)
#endif

#include <stdlib.h>		/* for NULL */
#include "gmp-impl.h"

static mp_limb_t mpn_preinv_mu_divappr_q (mp_ptr, mp_srcptr, mp_size_t,
			 mp_srcptr, mp_size_t, mp_srcptr, mp_size_t, mp_ptr);
static mp_size_t mpn_mu_divappr_q_choose_in (mp_size_t, mp_size_t, int);

mp_limb_t
mpn_mu_divappr_q (mp_ptr qp,
		  mp_srcptr np,
		  mp_size_t nn,
		  mp_srcptr dp,
		  mp_size_t dn,
		  mp_ptr scratch)
{
  mp_size_t qn, in;
  mp_limb_t cy, qh;
  mp_ptr ip, tp;

  ASSERT (dn > 1);

  qn = nn - dn;

  /* If Q is smaller than D, truncate operands.  */
  if (qn + 1 < dn)
    {
      np += dn - (qn + 1);
      nn -= dn - (qn + 1);
      dp += dn - (qn + 1);
      dn = qn + 1;
    }
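
  /* Example: nn = 10, dn = 8 gives qn = 2; the low 5 limbs of N and D are
     then ignored, and we continue with nn = 5, dn = 3.  */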

  /* Compute the inverse size.  */
  in = mpn_mu_divappr_q_choose_in (qn, dn, 0);
  ASSERT (in <= dn);

#if 1
  /* This alternative inverse computation method gets slightly more accurate
     results.  FIXMEs: (1) temp allocation needs not analysed, (2) itch
     function not adapted, (3) mpn_invertappr scratch needs not met.  */
  ip = scratch;
  tp = scratch + in + 1;

  /* Compute an approximate inverse on (in+1) limbs.  */
  if (dn == in)
    {
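      /* D is exactly 'in' limbs: invert the (in+1)-limb value B*D + 1.
	 The low 1 limb makes the inverted value slightly larger than B*D,
	 so the computed inverse cannot exceed the true one; it is then
	 truncated to 'in' limbs.  */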
      MPN_COPY (tp + 1, dp, in);
      tp[0] = 1;
      mpn_invertappr (ip, tp, in + 1, tp + in + 1);
      MPN_COPY_INCR (ip, ip + 1, in);
    }
  else
    {
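      /* Invert the top in+1 limbs of D, incremented by 1 to account for
	 D's ignored low limbs, so that the computed inverse never exceeds
	 the true one.  If the increment carries out, those limbs were all
	 ones, and the inverse is exactly B^in, i.e. just the implicit msb;
	 hence ip is zeroed.  */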
      cy = mpn_add_1 (tp, dp + dn - (in + 1), in + 1, 1);
      if (UNLIKELY (cy != 0))
	MPN_ZERO (ip, in);
      else
	{
	  mpn_invertappr (ip, tp, in + 1, tp + in + 1);
	  MPN_COPY_INCR (ip, ip + 1, in);
	}
    }
#else
  /* This older inverse computation method gets slightly worse results than
     the one above.  */
  ip = scratch;
  tp = scratch + in;

  /* Compute the inverse of D to in+1 limbs, then round it to 'in' limbs.
     Ideally the inversion function should do this automatically.  */
  if (dn == in)
    {
      tp[in + 1] = 0;
      MPN_COPY (tp + in + 2, dp, in);
      mpn_invertappr (tp, tp + in + 1, in + 1, NULL);
    }
  else
    {
      mpn_invertappr (tp, dp + dn - (in + 1), in + 1, NULL);
    }
  cy = mpn_sub_1 (tp, tp, in + 1, GMP_NUMB_HIGHBIT);
  if (UNLIKELY (cy != 0))
    MPN_ZERO (tp + 1, in);
  MPN_COPY (ip, tp + 1, in);
#endif

  qh = mpn_preinv_mu_divappr_q (qp, np, nn, dp, dn, ip, in, scratch + in);

  return qh;
}
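
/* A minimal usage sketch (illustrative only; these are internal interfaces
   and may change).  Callers size the scratch space with the itch function,
   here assuming a TMP_MARK/TMP_FREE region is in effect:

     mp_size_t itch = mpn_mu_divappr_q_itch (nn, dn, 0);
     mp_ptr scratch = TMP_ALLOC_LIMBS (itch);
     mp_limb_t qh = mpn_mu_divappr_q (qp, np, nn, dp, dn, scratch);

   qp must provide room for nn-dn limbs; the return value qh is the high
   quotient limb.  */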

static mp_limb_t
mpn_preinv_mu_divappr_q (mp_ptr qp,
			 mp_srcptr np,
			 mp_size_t nn,
			 mp_srcptr dp,
			 mp_size_t dn,
			 mp_srcptr ip,
			 mp_size_t in,
			 mp_ptr scratch)
{
  mp_size_t qn;
  mp_limb_t cy, cx, qh;
  mp_limb_t r;
  mp_size_t tn, wn;

#define rp           scratch
#define tp           (scratch + dn)
#define scratch_out  (scratch + dn + tn)

  qn = nn - dn;

  np += qn;
  qp += qn;

  qh = mpn_cmp (np, dp, dn) >= 0;
  if (qh != 0)
    mpn_sub_n (rp, np, dp, dn);
  else
    MPN_COPY (rp, np, dn);

  if (qn == 0)
    return qh;			/* Degenerate use.  Should we allow this? */

  while (qn > 0)
    {
      if (qn < in)
	{
	  ip += in - qn;
	  in = qn;
	}
      np -= in;
      qp -= in;

      /* Compute the next block of quotient limbs by multiplying the inverse
	 I by the upper part of the partial remainder R.  */
      mpn_mul_n (tp, rp + dn - in, ip, in);		/* mulhi  */
      cy = mpn_add_n (qp, tp + in, rp + dn - in, in);	/* I's msb implicit */
      ASSERT_ALWAYS (cy == 0);

      qn -= in;
      if (qn == 0)
	break;

      /* Compute the product of the quotient block and the divisor D, to be
	 subtracted from the partial remainder combined with new limbs from
	 the dividend N.  We only really need the low dn limbs.  */

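      /* Two strategies: a plain full product for small sizes, or for large
	 ones a wraparound product mod B^tn - 1, from which the true low
	 limbs are recovered using the fact that the high 'in' product limbs
	 cancel against the partial remainder.  */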
      if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
	mpn_mul (tp, dp, dn, qp, in);		/* dn+in limbs, high 'in' cancels */
      else
	{
	  tn = mpn_mulmod_bnm1_next_size (dn + 1);
	  mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, in, scratch_out);
	  wn = dn + in - tn;			/* number of wrapped limbs */
	  if (wn > 0)
	    {
	      cy = mpn_sub_n (tp, tp, rp + dn - wn, wn);
	      cy = mpn_sub_1 (tp + wn, tp + wn, tn - wn, cy);
	      cx = mpn_cmp (rp + dn - in, tp + dn, tn - dn) < 0;
	      ASSERT_ALWAYS (cx >= cy);
	      mpn_incr_u (tp, cx - cy);
	    }
	}

      r = rp[dn - in] - tp[dn];

      /* Subtract the product from the partial remainder combined with new
	 limbs from the dividend N, generating a new partial remainder R.  */
      if (dn != in)
	{
	  cy = mpn_sub_n (tp, np, tp, in);	/* get next 'in' limbs from N */
	  cy = mpn_sub_nc (tp + in, rp, tp + in, dn - in, cy);
	  MPN_COPY (rp, tp, dn);		/* FIXME: try to avoid this */
	}
      else
	{
	  cy = mpn_sub_n (rp, np, tp, in);	/* get next 'in' limbs from N */
	}

      STAT (int i; int err = 0;
	    static int errarr[5]; static int err_rec; static int tot);

      /* Check the remainder R and adjust the quotient as needed.  */
      r -= cy;
      while (r != 0)
	{
	  /* We loop 0 times with about 69% probability, 1 time with about
	     31% probability, and 2 times with about 0.6% probability, if the
	     inverse is computed as recommended.  */
	  mpn_incr_u (qp, 1);
	  cy = mpn_sub_n (rp, rp, dp, dn);
	  r -= cy;
	  STAT (err++);
	}
      if (mpn_cmp (rp, dp, dn) >= 0)
	{
	  /* This is executed with about 76% probability.  */
	  mpn_incr_u (qp, 1);
	  cy = mpn_sub_n (rp, rp, dp, dn);
	  STAT (err++);
	}

      STAT (
	    tot++;
	    errarr[err]++;
	    if (err > err_rec)
	      err_rec = err;
	    if (tot % 0x10000 == 0)
	      {
		for (i = 0; i <= err_rec; i++)
		  printf ("  %d(%.1f%%)", errarr[i], 100.0*errarr[i]/tot);
		printf ("\n");
	      }
	    );
    }

  /* FIXME: We should perhaps be somewhat more elegant in our rounding of the
     quotient.  For now, just make sure the returned quotient is >= the real
     quotient; add 3 with saturating arithmetic.  */
  qn = nn - dn;
  cy += mpn_add_1 (qp, qp, qn, 3);
  if (cy != 0)
    {
      if (qh != 0)
	{
	  /* Return a quotient of just 1-bits, with qh set.  */
	  mp_size_t i;
	  for (i = 0; i < qn; i++)
	    qp[i] = GMP_NUMB_MAX;
	}
      else
	{
	  /* Propagate carry into qh.  */
	  qh = 1;
	}
    }

  return qh;
}

/* In case k = 0 (automatic choice), we distinguish 3 cases:
   (a) dn < qn:          in = ceil(qn / ceil(qn/dn))
   (b) dn/3 < qn <= dn:  in = ceil(qn / 2)
   (c) qn <= dn/3:       in = qn
   In all cases we have in <= dn.
 */
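/* Example: qn = 10, dn = 4 is case (a): b = ceil(10/4) = 3 blocks, so
   in = ceil(10/3) = 4; qn = 3, dn = 4 is case (b): in = ceil(3/2) = 2;
   and qn = 1, dn = 4 is case (c): in = 1.  */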
static mp_size_t
mpn_mu_divappr_q_choose_in (mp_size_t qn, mp_size_t dn, int k)
{
  mp_size_t in;

  if (k == 0)
    {
      mp_size_t b;
      if (qn > dn)
	{
	  /* Compute an inverse size that is a nice partition of the
	     quotient.  */
	  b = (qn - 1) / dn + 1;	/* ceil(qn/dn), number of blocks */
	  in = (qn - 1) / b + 1;	/* ceil(qn/b) = ceil(qn / ceil(qn/dn)) */
	}
      else if (3 * qn > dn)
	{
	  in = (qn - 1) / 2 + 1;	/* b = 2 */
	}
      else
	{
	  in = (qn - 1) / 1 + 1;	/* b = 1 */
	}
    }
  else
    {
      mp_size_t xn;
      xn = MIN (dn, qn);
      in = (xn - 1) / k + 1;
    }

  return in;
}

mp_size_t
mpn_mu_divappr_q_itch (mp_size_t nn, mp_size_t dn, int mua_k)
{
  mp_size_t qn, in, itch_local, itch_out, itch_invapp;

  qn = nn - dn;
  if (qn + 1 < dn)
    {
      dn = qn + 1;
    }
  in = mpn_mu_divappr_q_choose_in (qn, dn, mua_k);

  itch_local = mpn_mulmod_bnm1_next_size (dn + 1);
  itch_out = mpn_mulmod_bnm1_itch (itch_local, dn, in);
  itch_invapp = mpn_invertappr_itch (in + 1) + in + 2; /* 3in + 4 */
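  /* (mpn_invertappr_itch (n) is 2n limbs, so the line above totals
     2(in+1) + in + 2 = 3in + 4 limbs.)  */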

  ASSERT (dn + itch_local + itch_out >= itch_invapp);
  return in + MAX (dn + itch_local + itch_out, itch_invapp);
}
369