186d7f5d3SJohn Marino /* mpn_mu_div_q.
286d7f5d3SJohn Marino
386d7f5d3SJohn Marino Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
486d7f5d3SJohn Marino
586d7f5d3SJohn Marino THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY
686d7f5d3SJohn Marino SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
786d7f5d3SJohn Marino GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.
886d7f5d3SJohn Marino
986d7f5d3SJohn Marino Copyright 2005, 2006, 2007, 2009, 2010 Free Software Foundation, Inc.
1086d7f5d3SJohn Marino
1186d7f5d3SJohn Marino This file is part of the GNU MP Library.
1286d7f5d3SJohn Marino
1386d7f5d3SJohn Marino The GNU MP Library is free software; you can redistribute it and/or modify
1486d7f5d3SJohn Marino it under the terms of the GNU Lesser General Public License as published by
1586d7f5d3SJohn Marino the Free Software Foundation; either version 3 of the License, or (at your
1686d7f5d3SJohn Marino option) any later version.
1786d7f5d3SJohn Marino
1886d7f5d3SJohn Marino The GNU MP Library is distributed in the hope that it will be useful, but
1986d7f5d3SJohn Marino WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
2086d7f5d3SJohn Marino or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
2186d7f5d3SJohn Marino License for more details.
2286d7f5d3SJohn Marino
2386d7f5d3SJohn Marino You should have received a copy of the GNU Lesser General Public License
2486d7f5d3SJohn Marino along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
2586d7f5d3SJohn Marino
2686d7f5d3SJohn Marino
2786d7f5d3SJohn Marino /*
2886d7f5d3SJohn Marino The idea of the algorithm used herein is to compute a smaller inverted value
2986d7f5d3SJohn Marino than used in the standard Barrett algorithm, and thus save time in the
3086d7f5d3SJohn Marino Newton iterations, and pay just a small price when using the inverted value
3186d7f5d3SJohn Marino for developing quotient bits. This algorithm was presented at ICMS 2006.
3286d7f5d3SJohn Marino */
3386d7f5d3SJohn Marino
3486d7f5d3SJohn Marino /*
3586d7f5d3SJohn Marino Things to work on:
3686d7f5d3SJohn Marino
3786d7f5d3SJohn Marino 1. This is a rudimentary implementation of mpn_mu_div_q. The algorithm is
3886d7f5d3SJohn Marino probably close to optimal, except when mpn_mu_divappr_q fails.
3986d7f5d3SJohn Marino
4086d7f5d3SJohn Marino An alternative which could be considered for much simpler code for the
4186d7f5d3SJohn Marino complex qn>=dn arm would be to allocate a temporary nn+1 limb buffer, then
4286d7f5d3SJohn Marino simply call mpn_mu_divappr_q. Such a temporary allocation is
4386d7f5d3SJohn Marino unfortunately very large.
4486d7f5d3SJohn Marino
4586d7f5d3SJohn Marino 2. We used to fall back to mpn_mu_div_qr when we detect a possible
4686d7f5d3SJohn Marino mpn_mu_divappr_q rounding problem, now we multiply and compare.
4786d7f5d3SJohn Marino Unfortunately, since mpn_mu_divappr_q does not return the partial
4886d7f5d3SJohn Marino remainder, this also doesn't become optimal. A mpn_mu_divappr_qr could
4986d7f5d3SJohn Marino solve that.
5086d7f5d3SJohn Marino
5186d7f5d3SJohn Marino 3. The allocations done here should be made from the scratch area, which
5286d7f5d3SJohn Marino then would need to be amended.
5386d7f5d3SJohn Marino */
5486d7f5d3SJohn Marino
5586d7f5d3SJohn Marino #include <stdlib.h> /* for NULL */
5686d7f5d3SJohn Marino #include "gmp.h"
5786d7f5d3SJohn Marino #include "gmp-impl.h"
5886d7f5d3SJohn Marino
5986d7f5d3SJohn Marino
6086d7f5d3SJohn Marino mp_limb_t
mpn_mu_div_q(mp_ptr qp,mp_srcptr np,mp_size_t nn,mp_srcptr dp,mp_size_t dn,mp_ptr scratch)6186d7f5d3SJohn Marino mpn_mu_div_q (mp_ptr qp,
6286d7f5d3SJohn Marino mp_srcptr np, mp_size_t nn,
6386d7f5d3SJohn Marino mp_srcptr dp, mp_size_t dn,
6486d7f5d3SJohn Marino mp_ptr scratch)
6586d7f5d3SJohn Marino {
6686d7f5d3SJohn Marino mp_ptr tp, rp, ip, this_ip;
6786d7f5d3SJohn Marino mp_size_t qn, in, this_in;
6886d7f5d3SJohn Marino mp_limb_t cy, qh;
6986d7f5d3SJohn Marino TMP_DECL;
7086d7f5d3SJohn Marino
7186d7f5d3SJohn Marino TMP_MARK;
7286d7f5d3SJohn Marino
7386d7f5d3SJohn Marino qn = nn - dn;
7486d7f5d3SJohn Marino
7586d7f5d3SJohn Marino tp = TMP_BALLOC_LIMBS (qn + 1);
7686d7f5d3SJohn Marino
7786d7f5d3SJohn Marino if (qn >= dn) /* nn >= 2*dn + 1 */
7886d7f5d3SJohn Marino {
7986d7f5d3SJohn Marino /* Find max inverse size needed by the two preinv calls. FIXME: This is
8086d7f5d3SJohn Marino not optimal, it underestimates the invariance. */
8186d7f5d3SJohn Marino if (dn != qn)
8286d7f5d3SJohn Marino {
8386d7f5d3SJohn Marino mp_size_t in1, in2;
8486d7f5d3SJohn Marino
8586d7f5d3SJohn Marino in1 = mpn_mu_div_qr_choose_in (qn - dn, dn, 0);
8686d7f5d3SJohn Marino in2 = mpn_mu_divappr_q_choose_in (dn + 1, dn, 0);
8786d7f5d3SJohn Marino in = MAX (in1, in2);
8886d7f5d3SJohn Marino }
8986d7f5d3SJohn Marino else
9086d7f5d3SJohn Marino {
9186d7f5d3SJohn Marino in = mpn_mu_divappr_q_choose_in (dn + 1, dn, 0);
9286d7f5d3SJohn Marino }
9386d7f5d3SJohn Marino
9486d7f5d3SJohn Marino ip = TMP_BALLOC_LIMBS (in + 1);
9586d7f5d3SJohn Marino
9686d7f5d3SJohn Marino if (dn == in)
9786d7f5d3SJohn Marino {
9886d7f5d3SJohn Marino MPN_COPY (scratch + 1, dp, in);
9986d7f5d3SJohn Marino scratch[0] = 1;
10086d7f5d3SJohn Marino mpn_invertappr (ip, scratch, in + 1, NULL);
10186d7f5d3SJohn Marino MPN_COPY_INCR (ip, ip + 1, in);
10286d7f5d3SJohn Marino }
10386d7f5d3SJohn Marino else
10486d7f5d3SJohn Marino {
10586d7f5d3SJohn Marino cy = mpn_add_1 (scratch, dp + dn - (in + 1), in + 1, 1);
10686d7f5d3SJohn Marino if (UNLIKELY (cy != 0))
10786d7f5d3SJohn Marino MPN_ZERO (ip, in);
10886d7f5d3SJohn Marino else
10986d7f5d3SJohn Marino {
11086d7f5d3SJohn Marino mpn_invertappr (ip, scratch, in + 1, NULL);
11186d7f5d3SJohn Marino MPN_COPY_INCR (ip, ip + 1, in);
11286d7f5d3SJohn Marino }
11386d7f5d3SJohn Marino }
11486d7f5d3SJohn Marino
11586d7f5d3SJohn Marino /* |_______________________| dividend
11686d7f5d3SJohn Marino |________| divisor */
11786d7f5d3SJohn Marino rp = TMP_BALLOC_LIMBS (2 * dn + 1);
11886d7f5d3SJohn Marino
11986d7f5d3SJohn Marino this_in = mpn_mu_div_qr_choose_in (qn - dn, dn, 0);
12086d7f5d3SJohn Marino this_ip = ip + in - this_in;
12186d7f5d3SJohn Marino qh = mpn_preinv_mu_div_qr (tp + dn + 1, rp + dn + 1, np + dn, qn, dp, dn,
12286d7f5d3SJohn Marino this_ip, this_in, scratch);
12386d7f5d3SJohn Marino
12486d7f5d3SJohn Marino MPN_COPY (rp + 1, np, dn);
12586d7f5d3SJohn Marino rp[0] = 0;
12686d7f5d3SJohn Marino this_in = mpn_mu_divappr_q_choose_in (dn + 1, dn, 0);
12786d7f5d3SJohn Marino this_ip = ip + in - this_in;
12886d7f5d3SJohn Marino cy = mpn_preinv_mu_divappr_q (tp, rp, 2 * dn + 1, dp, dn,
12986d7f5d3SJohn Marino this_ip, this_in, scratch);
13086d7f5d3SJohn Marino
13186d7f5d3SJohn Marino if (UNLIKELY (cy != 0))
13286d7f5d3SJohn Marino {
13386d7f5d3SJohn Marino /* Since the partial remainder fed to mpn_preinv_mu_divappr_q was
13486d7f5d3SJohn Marino canonically reduced, replace the returned value of B^(qn-dn)+eps
13586d7f5d3SJohn Marino by the largest possible value. */
13686d7f5d3SJohn Marino mp_size_t i;
13786d7f5d3SJohn Marino for (i = 0; i < dn + 1; i++)
13886d7f5d3SJohn Marino tp[i] = GMP_NUMB_MAX;
13986d7f5d3SJohn Marino }
14086d7f5d3SJohn Marino
14186d7f5d3SJohn Marino /* The max error of mpn_mu_divappr_q is +4. If the low quotient limb is
14286d7f5d3SJohn Marino greater than the max error, we cannot trust the quotient. */
14386d7f5d3SJohn Marino if (tp[0] > 4)
14486d7f5d3SJohn Marino {
14586d7f5d3SJohn Marino MPN_COPY (qp, tp + 1, qn);
14686d7f5d3SJohn Marino }
14786d7f5d3SJohn Marino else
14886d7f5d3SJohn Marino {
14986d7f5d3SJohn Marino mp_limb_t cy;
15086d7f5d3SJohn Marino mp_ptr pp;
15186d7f5d3SJohn Marino
15286d7f5d3SJohn Marino /* FIXME: can we use already allocated space? */
15386d7f5d3SJohn Marino pp = TMP_BALLOC_LIMBS (nn);
15486d7f5d3SJohn Marino mpn_mul (pp, tp + 1, qn, dp, dn);
15586d7f5d3SJohn Marino
15686d7f5d3SJohn Marino cy = (qh != 0) ? mpn_add_n (pp + qn, pp + qn, dp, dn) : 0;
15786d7f5d3SJohn Marino
15886d7f5d3SJohn Marino if (cy || mpn_cmp (pp, np, nn) > 0) /* At most is wrong by one, no cycle. */
15986d7f5d3SJohn Marino qh -= mpn_sub_1 (qp, tp + 1, qn, 1);
16086d7f5d3SJohn Marino else /* Same as above */
16186d7f5d3SJohn Marino MPN_COPY (qp, tp + 1, qn);
16286d7f5d3SJohn Marino }
16386d7f5d3SJohn Marino }
16486d7f5d3SJohn Marino else
16586d7f5d3SJohn Marino {
16686d7f5d3SJohn Marino /* |_______________________| dividend
16786d7f5d3SJohn Marino |________________| divisor */
16886d7f5d3SJohn Marino
16986d7f5d3SJohn Marino /* FIXME: When nn = 2dn-1, qn becomes dn-1, and the numerator size passed
17086d7f5d3SJohn Marino here becomes 2dn, i.e., more than nn. This shouldn't hurt, since only
17186d7f5d3SJohn Marino the most significant dn-1 limbs will actually be read, but it is not
17286d7f5d3SJohn Marino pretty. */
17386d7f5d3SJohn Marino
17486d7f5d3SJohn Marino qh = mpn_mu_divappr_q (tp, np + nn - (2 * qn + 2), 2 * qn + 2,
17586d7f5d3SJohn Marino dp + dn - (qn + 1), qn + 1, scratch);
17686d7f5d3SJohn Marino
17786d7f5d3SJohn Marino /* The max error of mpn_mu_divappr_q is +4, but we get an additional
17886d7f5d3SJohn Marino error from the divisor truncation. */
17986d7f5d3SJohn Marino if (tp[0] > 6)
18086d7f5d3SJohn Marino {
18186d7f5d3SJohn Marino MPN_COPY (qp, tp + 1, qn);
18286d7f5d3SJohn Marino }
18386d7f5d3SJohn Marino else
18486d7f5d3SJohn Marino {
18586d7f5d3SJohn Marino mp_limb_t cy;
18686d7f5d3SJohn Marino
18786d7f5d3SJohn Marino /* FIXME: a shorter product should be enough; we may use already
18886d7f5d3SJohn Marino allocated space... */
18986d7f5d3SJohn Marino rp = TMP_BALLOC_LIMBS (nn);
19086d7f5d3SJohn Marino mpn_mul (rp, dp, dn, tp + 1, qn);
19186d7f5d3SJohn Marino
19286d7f5d3SJohn Marino cy = (qh != 0) ? mpn_add_n (rp + qn, rp + qn, dp, dn) : 0;
19386d7f5d3SJohn Marino
19486d7f5d3SJohn Marino if (cy || mpn_cmp (rp, np, nn) > 0) /* At most is wrong by one, no cycle. */
19586d7f5d3SJohn Marino qh -= mpn_sub_1 (qp, tp + 1, qn, 1);
19686d7f5d3SJohn Marino else /* Same as above */
19786d7f5d3SJohn Marino MPN_COPY (qp, tp + 1, qn);
19886d7f5d3SJohn Marino }
19986d7f5d3SJohn Marino }
20086d7f5d3SJohn Marino
20186d7f5d3SJohn Marino TMP_FREE;
20286d7f5d3SJohn Marino return qh;
20386d7f5d3SJohn Marino }
20486d7f5d3SJohn Marino
20586d7f5d3SJohn Marino mp_size_t
mpn_mu_div_q_itch(mp_size_t nn,mp_size_t dn,int mua_k)20686d7f5d3SJohn Marino mpn_mu_div_q_itch (mp_size_t nn, mp_size_t dn, int mua_k)
20786d7f5d3SJohn Marino {
20886d7f5d3SJohn Marino mp_size_t qn, itch1, itch2;
20986d7f5d3SJohn Marino
21086d7f5d3SJohn Marino qn = nn - dn;
21186d7f5d3SJohn Marino if (qn >= dn)
21286d7f5d3SJohn Marino {
21386d7f5d3SJohn Marino itch1 = mpn_mu_div_qr_itch (qn, dn, mua_k);
21486d7f5d3SJohn Marino itch2 = mpn_mu_divappr_q_itch (2 * dn + 1, dn, mua_k);
21586d7f5d3SJohn Marino return MAX (itch1, itch2);
21686d7f5d3SJohn Marino }
21786d7f5d3SJohn Marino else
21886d7f5d3SJohn Marino {
21986d7f5d3SJohn Marino itch1 = mpn_mu_divappr_q_itch (2 * qn + 2, qn + 1, mua_k);
22086d7f5d3SJohn Marino return itch1;
22186d7f5d3SJohn Marino }
22286d7f5d3SJohn Marino }
223