186d7f5d3SJohn Marino /* mpn_mu_divappr_q, mpn_preinv_mu_divappr_q.
286d7f5d3SJohn Marino
386d7f5d3SJohn Marino Compute Q = floor(N / D) + e. N is nn limbs, D is dn limbs and must be
486d7f5d3SJohn Marino normalized, and Q must be nn-dn limbs, 0 <= e <= 4. The requirement that Q
586d7f5d3SJohn Marino is nn-dn limbs (and not nn-dn+1 limbs) was put in place in order to allow us
686d7f5d3SJohn Marino to let N be unmodified during the operation.
786d7f5d3SJohn Marino
886d7f5d3SJohn Marino Contributed to the GNU project by Torbjorn Granlund.
986d7f5d3SJohn Marino
1086d7f5d3SJohn Marino THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY
1186d7f5d3SJohn Marino SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
1286d7f5d3SJohn Marino GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.
1386d7f5d3SJohn Marino
1486d7f5d3SJohn Marino Copyright 2005, 2006, 2007, 2009, 2010 Free Software Foundation, Inc.
1586d7f5d3SJohn Marino
1686d7f5d3SJohn Marino This file is part of the GNU MP Library.
1786d7f5d3SJohn Marino
1886d7f5d3SJohn Marino The GNU MP Library is free software; you can redistribute it and/or modify
1986d7f5d3SJohn Marino it under the terms of the GNU Lesser General Public License as published by
2086d7f5d3SJohn Marino the Free Software Foundation; either version 3 of the License, or (at your
2186d7f5d3SJohn Marino option) any later version.
2286d7f5d3SJohn Marino
2386d7f5d3SJohn Marino The GNU MP Library is distributed in the hope that it will be useful, but
2486d7f5d3SJohn Marino WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
2586d7f5d3SJohn Marino or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
2686d7f5d3SJohn Marino License for more details.
2786d7f5d3SJohn Marino
2886d7f5d3SJohn Marino You should have received a copy of the GNU Lesser General Public License
2986d7f5d3SJohn Marino along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
3086d7f5d3SJohn Marino
3186d7f5d3SJohn Marino
3286d7f5d3SJohn Marino /*
3386d7f5d3SJohn Marino The idea of the algorithm used herein is to compute a smaller inverted value
3486d7f5d3SJohn Marino than used in the standard Barrett algorithm, and thus save time in the
3586d7f5d3SJohn Marino Newton iterations, and pay just a small price when using the inverted value
3686d7f5d3SJohn Marino for developing quotient bits. This algorithm was presented at ICMS 2006.
3786d7f5d3SJohn Marino */
3886d7f5d3SJohn Marino
3986d7f5d3SJohn Marino /* CAUTION: This code and the code in mu_div_qr.c should be edited in sync.
4086d7f5d3SJohn Marino
4186d7f5d3SJohn Marino Things to work on:
4286d7f5d3SJohn Marino
4386d7f5d3SJohn Marino * The itch/scratch scheme isn't perhaps such a good idea as it once seemed,
4486d7f5d3SJohn Marino demonstrated by the fact that the mpn_invertappr function's scratch needs
4586d7f5d3SJohn Marino mean that we need to keep a large allocation long after it is needed.
4686d7f5d3SJohn Marino Things are worse as mpn_mul_fft does not accept any scratch parameter,
4786d7f5d3SJohn Marino which means we'll have a large memory hole while in mpn_mul_fft. In
4886d7f5d3SJohn Marino general, a peak scratch need in the beginning of a function isn't
4986d7f5d3SJohn Marino well-handled by the itch/scratch scheme.
5086d7f5d3SJohn Marino */
5186d7f5d3SJohn Marino
5286d7f5d3SJohn Marino #ifdef STAT
5386d7f5d3SJohn Marino #undef STAT
5486d7f5d3SJohn Marino #define STAT(x) x
5586d7f5d3SJohn Marino #else
5686d7f5d3SJohn Marino #define STAT(x)
5786d7f5d3SJohn Marino #endif
5886d7f5d3SJohn Marino
5986d7f5d3SJohn Marino #include <stdlib.h> /* for NULL */
6086d7f5d3SJohn Marino #include "gmp.h"
6186d7f5d3SJohn Marino #include "gmp-impl.h"
6286d7f5d3SJohn Marino
6386d7f5d3SJohn Marino
6486d7f5d3SJohn Marino mp_limb_t
mpn_mu_divappr_q(mp_ptr qp,mp_srcptr np,mp_size_t nn,mp_srcptr dp,mp_size_t dn,mp_ptr scratch)6586d7f5d3SJohn Marino mpn_mu_divappr_q (mp_ptr qp,
6686d7f5d3SJohn Marino mp_srcptr np,
6786d7f5d3SJohn Marino mp_size_t nn,
6886d7f5d3SJohn Marino mp_srcptr dp,
6986d7f5d3SJohn Marino mp_size_t dn,
7086d7f5d3SJohn Marino mp_ptr scratch)
7186d7f5d3SJohn Marino {
7286d7f5d3SJohn Marino mp_size_t qn, in;
7386d7f5d3SJohn Marino mp_limb_t cy, qh;
7486d7f5d3SJohn Marino mp_ptr ip, tp;
7586d7f5d3SJohn Marino
7686d7f5d3SJohn Marino ASSERT (dn > 1);
7786d7f5d3SJohn Marino
7886d7f5d3SJohn Marino qn = nn - dn;
7986d7f5d3SJohn Marino
8086d7f5d3SJohn Marino /* If Q is smaller than D, truncate operands. */
8186d7f5d3SJohn Marino if (qn + 1 < dn)
8286d7f5d3SJohn Marino {
8386d7f5d3SJohn Marino np += dn - (qn + 1);
8486d7f5d3SJohn Marino nn -= dn - (qn + 1);
8586d7f5d3SJohn Marino dp += dn - (qn + 1);
8686d7f5d3SJohn Marino dn = qn + 1;
8786d7f5d3SJohn Marino }
8886d7f5d3SJohn Marino
8986d7f5d3SJohn Marino /* Compute the inverse size. */
9086d7f5d3SJohn Marino in = mpn_mu_divappr_q_choose_in (qn, dn, 0);
9186d7f5d3SJohn Marino ASSERT (in <= dn);
9286d7f5d3SJohn Marino
9386d7f5d3SJohn Marino #if 1
9486d7f5d3SJohn Marino /* This alternative inverse computation method gets slightly more accurate
9586d7f5d3SJohn Marino results. FIXMEs: (1) Temp allocation needs not analysed (2) itch function
9686d7f5d3SJohn Marino not adapted (3) mpn_invertappr scratch needs not met. */
9786d7f5d3SJohn Marino ip = scratch;
9886d7f5d3SJohn Marino tp = scratch + in + 1;
9986d7f5d3SJohn Marino
10086d7f5d3SJohn Marino /* compute an approximate inverse on (in+1) limbs */
10186d7f5d3SJohn Marino if (dn == in)
10286d7f5d3SJohn Marino {
10386d7f5d3SJohn Marino MPN_COPY (tp + 1, dp, in);
10486d7f5d3SJohn Marino tp[0] = 1;
10586d7f5d3SJohn Marino mpn_invertappr (ip, tp, in + 1, NULL);
10686d7f5d3SJohn Marino MPN_COPY_INCR (ip, ip + 1, in);
10786d7f5d3SJohn Marino }
10886d7f5d3SJohn Marino else
10986d7f5d3SJohn Marino {
11086d7f5d3SJohn Marino cy = mpn_add_1 (tp, dp + dn - (in + 1), in + 1, 1);
11186d7f5d3SJohn Marino if (UNLIKELY (cy != 0))
11286d7f5d3SJohn Marino MPN_ZERO (ip, in);
11386d7f5d3SJohn Marino else
11486d7f5d3SJohn Marino {
11586d7f5d3SJohn Marino mpn_invertappr (ip, tp, in + 1, NULL);
11686d7f5d3SJohn Marino MPN_COPY_INCR (ip, ip + 1, in);
11786d7f5d3SJohn Marino }
11886d7f5d3SJohn Marino }
11986d7f5d3SJohn Marino #else
12086d7f5d3SJohn Marino /* This older inverse computation method gets slightly worse results than the
12186d7f5d3SJohn Marino one above. */
12286d7f5d3SJohn Marino ip = scratch;
12386d7f5d3SJohn Marino tp = scratch + in;
12486d7f5d3SJohn Marino
12586d7f5d3SJohn Marino /* Compute inverse of D to in+1 limbs, then round to 'in' limbs. Ideally the
12686d7f5d3SJohn Marino inversion function should do this automatically. */
12786d7f5d3SJohn Marino if (dn == in)
12886d7f5d3SJohn Marino {
12986d7f5d3SJohn Marino tp[in + 1] = 0;
13086d7f5d3SJohn Marino MPN_COPY (tp + in + 2, dp, in);
13186d7f5d3SJohn Marino mpn_invertappr (tp, tp + in + 1, in + 1, NULL);
13286d7f5d3SJohn Marino }
13386d7f5d3SJohn Marino else
13486d7f5d3SJohn Marino {
13586d7f5d3SJohn Marino mpn_invertappr (tp, dp + dn - (in + 1), in + 1, NULL);
13686d7f5d3SJohn Marino }
13786d7f5d3SJohn Marino cy = mpn_sub_1 (tp, tp, in + 1, GMP_NUMB_HIGHBIT);
13886d7f5d3SJohn Marino if (UNLIKELY (cy != 0))
13986d7f5d3SJohn Marino MPN_ZERO (tp + 1, in);
14086d7f5d3SJohn Marino MPN_COPY (ip, tp + 1, in);
14186d7f5d3SJohn Marino #endif
14286d7f5d3SJohn Marino
14386d7f5d3SJohn Marino qh = mpn_preinv_mu_divappr_q (qp, np, nn, dp, dn, ip, in, scratch + in);
14486d7f5d3SJohn Marino
14586d7f5d3SJohn Marino return qh;
14686d7f5d3SJohn Marino }
14786d7f5d3SJohn Marino
14886d7f5d3SJohn Marino mp_limb_t
mpn_preinv_mu_divappr_q(mp_ptr qp,mp_srcptr np,mp_size_t nn,mp_srcptr dp,mp_size_t dn,mp_srcptr ip,mp_size_t in,mp_ptr scratch)14986d7f5d3SJohn Marino mpn_preinv_mu_divappr_q (mp_ptr qp,
15086d7f5d3SJohn Marino mp_srcptr np,
15186d7f5d3SJohn Marino mp_size_t nn,
15286d7f5d3SJohn Marino mp_srcptr dp,
15386d7f5d3SJohn Marino mp_size_t dn,
15486d7f5d3SJohn Marino mp_srcptr ip,
15586d7f5d3SJohn Marino mp_size_t in,
15686d7f5d3SJohn Marino mp_ptr scratch)
15786d7f5d3SJohn Marino {
15886d7f5d3SJohn Marino mp_size_t qn;
15986d7f5d3SJohn Marino mp_limb_t cy, cx, qh;
16086d7f5d3SJohn Marino mp_limb_t r;
16186d7f5d3SJohn Marino mp_size_t tn, wn;
16286d7f5d3SJohn Marino
16386d7f5d3SJohn Marino #define rp scratch
16486d7f5d3SJohn Marino #define tp (scratch + dn)
16586d7f5d3SJohn Marino #define scratch_out (scratch + dn + tn)
16686d7f5d3SJohn Marino
16786d7f5d3SJohn Marino qn = nn - dn;
16886d7f5d3SJohn Marino
16986d7f5d3SJohn Marino np += qn;
17086d7f5d3SJohn Marino qp += qn;
17186d7f5d3SJohn Marino
17286d7f5d3SJohn Marino qh = mpn_cmp (np, dp, dn) >= 0;
17386d7f5d3SJohn Marino if (qh != 0)
17486d7f5d3SJohn Marino mpn_sub_n (rp, np, dp, dn);
17586d7f5d3SJohn Marino else
17686d7f5d3SJohn Marino MPN_COPY (rp, np, dn);
17786d7f5d3SJohn Marino
17886d7f5d3SJohn Marino if (qn == 0)
17986d7f5d3SJohn Marino return qh; /* Degenerate use. Should we allow this? */
18086d7f5d3SJohn Marino
18186d7f5d3SJohn Marino while (qn > 0)
18286d7f5d3SJohn Marino {
18386d7f5d3SJohn Marino if (qn < in)
18486d7f5d3SJohn Marino {
18586d7f5d3SJohn Marino ip += in - qn;
18686d7f5d3SJohn Marino in = qn;
18786d7f5d3SJohn Marino }
18886d7f5d3SJohn Marino np -= in;
18986d7f5d3SJohn Marino qp -= in;
19086d7f5d3SJohn Marino
19186d7f5d3SJohn Marino /* Compute the next block of quotient limbs by multiplying the inverse I
19286d7f5d3SJohn Marino by the upper part of the partial remainder R. */
19386d7f5d3SJohn Marino mpn_mul_n (tp, rp + dn - in, ip, in); /* mulhi */
19486d7f5d3SJohn Marino cy = mpn_add_n (qp, tp + in, rp + dn - in, in); /* I's msb implicit */
19586d7f5d3SJohn Marino ASSERT_ALWAYS (cy == 0);
19686d7f5d3SJohn Marino
19786d7f5d3SJohn Marino qn -= in;
19886d7f5d3SJohn Marino if (qn == 0)
19986d7f5d3SJohn Marino break;
20086d7f5d3SJohn Marino
20186d7f5d3SJohn Marino /* Compute the product of the quotient block and the divisor D, to be
20286d7f5d3SJohn Marino subtracted from the partial remainder combined with new limbs from the
20386d7f5d3SJohn Marino dividend N. We only really need the low dn limbs. */
20486d7f5d3SJohn Marino
20586d7f5d3SJohn Marino if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
20686d7f5d3SJohn Marino mpn_mul (tp, dp, dn, qp, in); /* dn+in limbs, high 'in' cancels */
20786d7f5d3SJohn Marino else
20886d7f5d3SJohn Marino {
20986d7f5d3SJohn Marino tn = mpn_mulmod_bnm1_next_size (dn + 1);
21086d7f5d3SJohn Marino mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, in, scratch_out);
21186d7f5d3SJohn Marino wn = dn + in - tn; /* number of wrapped limbs */
21286d7f5d3SJohn Marino if (wn > 0)
21386d7f5d3SJohn Marino {
21486d7f5d3SJohn Marino cy = mpn_sub_n (tp, tp, rp + dn - wn, wn);
21586d7f5d3SJohn Marino cy = mpn_sub_1 (tp + wn, tp + wn, tn - wn, cy);
21686d7f5d3SJohn Marino cx = mpn_cmp (rp + dn - in, tp + dn, tn - dn) < 0;
21786d7f5d3SJohn Marino ASSERT_ALWAYS (cx >= cy);
21886d7f5d3SJohn Marino mpn_incr_u (tp, cx - cy);
21986d7f5d3SJohn Marino }
22086d7f5d3SJohn Marino }
22186d7f5d3SJohn Marino
22286d7f5d3SJohn Marino r = rp[dn - in] - tp[dn];
22386d7f5d3SJohn Marino
22486d7f5d3SJohn Marino /* Subtract the product from the partial remainder combined with new
22586d7f5d3SJohn Marino limbs from the dividend N, generating a new partial remainder R. */
22686d7f5d3SJohn Marino if (dn != in)
22786d7f5d3SJohn Marino {
22886d7f5d3SJohn Marino cy = mpn_sub_n (tp, np, tp, in); /* get next 'in' limbs from N */
22986d7f5d3SJohn Marino cy = mpn_sub_nc (tp + in, rp, tp + in, dn - in, cy);
23086d7f5d3SJohn Marino MPN_COPY (rp, tp, dn); /* FIXME: try to avoid this */
23186d7f5d3SJohn Marino }
23286d7f5d3SJohn Marino else
23386d7f5d3SJohn Marino {
23486d7f5d3SJohn Marino cy = mpn_sub_n (rp, np, tp, in); /* get next 'in' limbs from N */
23586d7f5d3SJohn Marino }
23686d7f5d3SJohn Marino
23786d7f5d3SJohn Marino STAT (int i; int err = 0;
23886d7f5d3SJohn Marino static int errarr[5]; static int err_rec; static int tot);
23986d7f5d3SJohn Marino
24086d7f5d3SJohn Marino /* Check the remainder R and adjust the quotient as needed. */
24186d7f5d3SJohn Marino r -= cy;
24286d7f5d3SJohn Marino while (r != 0)
24386d7f5d3SJohn Marino {
24486d7f5d3SJohn Marino /* We loop 0 times with about 69% probability, 1 time with about 31%
24586d7f5d3SJohn Marino probability, 2 times with about 0.6% probability, if inverse is
24686d7f5d3SJohn Marino computed as recommended. */
24786d7f5d3SJohn Marino mpn_incr_u (qp, 1);
24886d7f5d3SJohn Marino cy = mpn_sub_n (rp, rp, dp, dn);
24986d7f5d3SJohn Marino r -= cy;
25086d7f5d3SJohn Marino STAT (err++);
25186d7f5d3SJohn Marino }
25286d7f5d3SJohn Marino if (mpn_cmp (rp, dp, dn) >= 0)
25386d7f5d3SJohn Marino {
25486d7f5d3SJohn Marino /* This is executed with about 76% probability. */
25586d7f5d3SJohn Marino mpn_incr_u (qp, 1);
25686d7f5d3SJohn Marino cy = mpn_sub_n (rp, rp, dp, dn);
25786d7f5d3SJohn Marino STAT (err++);
25886d7f5d3SJohn Marino }
25986d7f5d3SJohn Marino
26086d7f5d3SJohn Marino STAT (
26186d7f5d3SJohn Marino tot++;
26286d7f5d3SJohn Marino errarr[err]++;
26386d7f5d3SJohn Marino if (err > err_rec)
26486d7f5d3SJohn Marino err_rec = err;
26586d7f5d3SJohn Marino if (tot % 0x10000 == 0)
26686d7f5d3SJohn Marino {
26786d7f5d3SJohn Marino for (i = 0; i <= err_rec; i++)
26886d7f5d3SJohn Marino printf (" %d(%.1f%%)", errarr[i], 100.0*errarr[i]/tot);
26986d7f5d3SJohn Marino printf ("\n");
27086d7f5d3SJohn Marino }
27186d7f5d3SJohn Marino );
27286d7f5d3SJohn Marino }
27386d7f5d3SJohn Marino
27486d7f5d3SJohn Marino /* FIXME: We should perhaps be somewhat more elegant in our rounding of the
27586d7f5d3SJohn Marino quotient. For now, just make sure the returned quotient is >= the real
27686d7f5d3SJohn Marino quotient; add 3 with saturating arithmetic. */
27786d7f5d3SJohn Marino qn = nn - dn;
27886d7f5d3SJohn Marino cy += mpn_add_1 (qp, qp, qn, 3);
27986d7f5d3SJohn Marino if (cy != 0)
28086d7f5d3SJohn Marino {
28186d7f5d3SJohn Marino if (qh != 0)
28286d7f5d3SJohn Marino {
28386d7f5d3SJohn Marino /* Return a quotient of just 1-bits, with qh set. */
28486d7f5d3SJohn Marino mp_size_t i;
28586d7f5d3SJohn Marino for (i = 0; i < qn; i++)
28686d7f5d3SJohn Marino qp[i] = GMP_NUMB_MAX;
28786d7f5d3SJohn Marino }
28886d7f5d3SJohn Marino else
28986d7f5d3SJohn Marino {
29086d7f5d3SJohn Marino /* Propagate carry into qh. */
29186d7f5d3SJohn Marino qh = 1;
29286d7f5d3SJohn Marino }
29386d7f5d3SJohn Marino }
29486d7f5d3SJohn Marino
29586d7f5d3SJohn Marino return qh;
29686d7f5d3SJohn Marino }
29786d7f5d3SJohn Marino
29886d7f5d3SJohn Marino /* In case k=0 (automatic choice), we distinguish 3 cases:
29986d7f5d3SJohn Marino (a) dn < qn: in = ceil(qn / ceil(qn/dn))
30086d7f5d3SJohn Marino (b) dn/3 < qn <= dn: in = ceil(qn / 2)
30186d7f5d3SJohn Marino (c) qn < dn/3: in = qn
30286d7f5d3SJohn Marino In all cases we have in <= dn.
30386d7f5d3SJohn Marino */
30486d7f5d3SJohn Marino mp_size_t
mpn_mu_divappr_q_choose_in(mp_size_t qn,mp_size_t dn,int k)30586d7f5d3SJohn Marino mpn_mu_divappr_q_choose_in (mp_size_t qn, mp_size_t dn, int k)
30686d7f5d3SJohn Marino {
30786d7f5d3SJohn Marino mp_size_t in;
30886d7f5d3SJohn Marino
30986d7f5d3SJohn Marino if (k == 0)
31086d7f5d3SJohn Marino {
31186d7f5d3SJohn Marino mp_size_t b;
31286d7f5d3SJohn Marino if (qn > dn)
31386d7f5d3SJohn Marino {
31486d7f5d3SJohn Marino /* Compute an inverse size that is a nice partition of the quotient. */
31586d7f5d3SJohn Marino b = (qn - 1) / dn + 1; /* ceil(qn/dn), number of blocks */
31686d7f5d3SJohn Marino in = (qn - 1) / b + 1; /* ceil(qn/b) = ceil(qn / ceil(qn/dn)) */
31786d7f5d3SJohn Marino }
31886d7f5d3SJohn Marino else if (3 * qn > dn)
31986d7f5d3SJohn Marino {
32086d7f5d3SJohn Marino in = (qn - 1) / 2 + 1; /* b = 2 */
32186d7f5d3SJohn Marino }
32286d7f5d3SJohn Marino else
32386d7f5d3SJohn Marino {
32486d7f5d3SJohn Marino in = (qn - 1) / 1 + 1; /* b = 1 */
32586d7f5d3SJohn Marino }
32686d7f5d3SJohn Marino }
32786d7f5d3SJohn Marino else
32886d7f5d3SJohn Marino {
32986d7f5d3SJohn Marino mp_size_t xn;
33086d7f5d3SJohn Marino xn = MIN (dn, qn);
33186d7f5d3SJohn Marino in = (xn - 1) / k + 1;
33286d7f5d3SJohn Marino }
33386d7f5d3SJohn Marino
33486d7f5d3SJohn Marino return in;
33586d7f5d3SJohn Marino }
33686d7f5d3SJohn Marino
33786d7f5d3SJohn Marino mp_size_t
mpn_mu_divappr_q_itch(mp_size_t nn,mp_size_t dn,int mua_k)33886d7f5d3SJohn Marino mpn_mu_divappr_q_itch (mp_size_t nn, mp_size_t dn, int mua_k)
33986d7f5d3SJohn Marino {
34086d7f5d3SJohn Marino mp_size_t qn, in, itch_local, itch_out;
34186d7f5d3SJohn Marino
34286d7f5d3SJohn Marino qn = nn - dn;
34386d7f5d3SJohn Marino if (qn + 1 < dn)
34486d7f5d3SJohn Marino {
34586d7f5d3SJohn Marino dn = qn + 1;
34686d7f5d3SJohn Marino }
34786d7f5d3SJohn Marino in = mpn_mu_divappr_q_choose_in (qn, dn, mua_k);
34886d7f5d3SJohn Marino
34986d7f5d3SJohn Marino itch_local = mpn_mulmod_bnm1_next_size (dn + 1);
35086d7f5d3SJohn Marino itch_out = mpn_mulmod_bnm1_itch (itch_local, dn, in);
35186d7f5d3SJohn Marino return in + dn + itch_local + itch_out;
35286d7f5d3SJohn Marino }
353