/* mpn_mu_bdiv_qr(qp,rp,np,nn,dp,dn,tp) -- Compute {np,nn} / {dp,dn} mod B^qn,
   where qn = nn-dn, storing the result in {qp,qn}.  Overlap allowed between Q
   and N; all other overlap disallowed.

   Contributed to the GNU project by Torbjorn Granlund.

   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.

Copyright 2005, 2006, 2007, 2009, 2010 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at your
option) any later version.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */


/*
   The idea of the algorithm used herein is to compute a smaller inverted value
   than used in the standard Barrett algorithm, and thus save time in the
   Newton iterations, and pay just a small price when using the inverted value
   for developing quotient bits.  This algorithm was presented at ICMS 2006.
*/

#include "gmp.h"
#include "gmp-impl.h"


4086d7f5d3SJohn Marino /* N = {np,nn}
4186d7f5d3SJohn Marino D = {dp,dn}
4286d7f5d3SJohn Marino
4386d7f5d3SJohn Marino Requirements: N >= D
4486d7f5d3SJohn Marino D >= 1
4586d7f5d3SJohn Marino D odd
4686d7f5d3SJohn Marino dn >= 2
4786d7f5d3SJohn Marino nn >= 2
4886d7f5d3SJohn Marino scratch space as determined by mpn_mu_bdiv_qr_itch(nn,dn).
4986d7f5d3SJohn Marino
5086d7f5d3SJohn Marino Write quotient to Q = {qp,nn-dn}.
5186d7f5d3SJohn Marino
5286d7f5d3SJohn Marino FIXME: When iterating, perhaps do the small step before loop, not after.
5386d7f5d3SJohn Marino FIXME: Try to avoid the scalar divisions when computing inverse size.
5486d7f5d3SJohn Marino FIXME: Trim allocation for (qn > dn) case, 3*dn might be possible. In
5586d7f5d3SJohn Marino particular, when dn==in, tp and rp could use the same space.
5686d7f5d3SJohn Marino */
5786d7f5d3SJohn Marino mp_limb_t
mpn_mu_bdiv_qr(mp_ptr qp,mp_ptr rp,mp_srcptr np,mp_size_t nn,mp_srcptr dp,mp_size_t dn,mp_ptr scratch)5886d7f5d3SJohn Marino mpn_mu_bdiv_qr (mp_ptr qp,
5986d7f5d3SJohn Marino mp_ptr rp,
6086d7f5d3SJohn Marino mp_srcptr np, mp_size_t nn,
6186d7f5d3SJohn Marino mp_srcptr dp, mp_size_t dn,
6286d7f5d3SJohn Marino mp_ptr scratch)
6386d7f5d3SJohn Marino {
6486d7f5d3SJohn Marino mp_size_t qn;
6586d7f5d3SJohn Marino mp_size_t in;
6686d7f5d3SJohn Marino mp_limb_t cy, c0;
6786d7f5d3SJohn Marino int k;
6886d7f5d3SJohn Marino mp_size_t tn, wn;
6986d7f5d3SJohn Marino mp_size_t i;
7086d7f5d3SJohn Marino
7186d7f5d3SJohn Marino qn = nn - dn;
7286d7f5d3SJohn Marino
7386d7f5d3SJohn Marino ASSERT (dn >= 2);
7486d7f5d3SJohn Marino ASSERT (qn >= 2);
7586d7f5d3SJohn Marino
7686d7f5d3SJohn Marino if (qn > dn)
7786d7f5d3SJohn Marino {
7886d7f5d3SJohn Marino mp_size_t b;
7986d7f5d3SJohn Marino
8086d7f5d3SJohn Marino /* |_______________________| dividend
8186d7f5d3SJohn Marino |________| divisor */
8286d7f5d3SJohn Marino
8386d7f5d3SJohn Marino #define ip scratch /* in */
8486d7f5d3SJohn Marino #define tp (scratch + in) /* dn+in or next_size(dn) or rest >= binvert_itch(in) */
8586d7f5d3SJohn Marino #define scratch_out (scratch + in + tn)/* mulmod_bnm1_itch(next_size(dn)) */
8686d7f5d3SJohn Marino
8786d7f5d3SJohn Marino /* Compute an inverse size that is a nice partition of the quotient. */
8886d7f5d3SJohn Marino b = (qn - 1) / dn + 1; /* ceil(qn/dn), number of blocks */
8986d7f5d3SJohn Marino in = (qn - 1) / b + 1; /* ceil(qn/b) = ceil(qn / ceil(qn/dn)) */
9086d7f5d3SJohn Marino
9186d7f5d3SJohn Marino /* Some notes on allocation:
9286d7f5d3SJohn Marino
9386d7f5d3SJohn Marino When in = dn, R dies when mpn_mullo returns, if in < dn the low in
9486d7f5d3SJohn Marino limbs of R dies at that point. We could save memory by letting T live
9586d7f5d3SJohn Marino just under R, and let the upper part of T expand into R. These changes
9686d7f5d3SJohn Marino should reduce itch to perhaps 3dn.
9786d7f5d3SJohn Marino */
9886d7f5d3SJohn Marino
9986d7f5d3SJohn Marino mpn_binvert (ip, dp, in, tp);
10086d7f5d3SJohn Marino
10186d7f5d3SJohn Marino MPN_COPY (rp, np, dn);
10286d7f5d3SJohn Marino np += dn;
10386d7f5d3SJohn Marino cy = 0;
10486d7f5d3SJohn Marino
10586d7f5d3SJohn Marino while (qn > in)
10686d7f5d3SJohn Marino {
10786d7f5d3SJohn Marino mpn_mullo_n (qp, rp, ip, in);
10886d7f5d3SJohn Marino
10986d7f5d3SJohn Marino if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
11086d7f5d3SJohn Marino mpn_mul (tp, dp, dn, qp, in); /* mulhi, need tp[dn+in-1...in] */
11186d7f5d3SJohn Marino else
11286d7f5d3SJohn Marino {
11386d7f5d3SJohn Marino tn = mpn_mulmod_bnm1_next_size (dn);
11486d7f5d3SJohn Marino mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, in, scratch_out);
11586d7f5d3SJohn Marino wn = dn + in - tn; /* number of wrapped limbs */
11686d7f5d3SJohn Marino if (wn > 0)
11786d7f5d3SJohn Marino {
11886d7f5d3SJohn Marino c0 = mpn_sub_n (tp + tn, tp, rp, wn);
11986d7f5d3SJohn Marino mpn_decr_u (tp + wn, c0);
12086d7f5d3SJohn Marino }
12186d7f5d3SJohn Marino }
12286d7f5d3SJohn Marino
12386d7f5d3SJohn Marino qp += in;
12486d7f5d3SJohn Marino qn -= in;
12586d7f5d3SJohn Marino
12686d7f5d3SJohn Marino if (dn != in)
12786d7f5d3SJohn Marino {
12886d7f5d3SJohn Marino /* Subtract tp[dn-1...in] from partial remainder. */
12986d7f5d3SJohn Marino cy += mpn_sub_n (rp, rp + in, tp + in, dn - in);
13086d7f5d3SJohn Marino if (cy == 2)
13186d7f5d3SJohn Marino {
13286d7f5d3SJohn Marino mpn_incr_u (tp + dn, 1);
13386d7f5d3SJohn Marino cy = 1;
13486d7f5d3SJohn Marino }
13586d7f5d3SJohn Marino }
13686d7f5d3SJohn Marino /* Subtract tp[dn+in-1...dn] from dividend. */
13786d7f5d3SJohn Marino cy = mpn_sub_nc (rp + dn - in, np, tp + dn, in, cy);
13886d7f5d3SJohn Marino np += in;
13986d7f5d3SJohn Marino }
14086d7f5d3SJohn Marino
14186d7f5d3SJohn Marino /* Generate last qn limbs. */
14286d7f5d3SJohn Marino mpn_mullo_n (qp, rp, ip, qn);
14386d7f5d3SJohn Marino
14486d7f5d3SJohn Marino if (BELOW_THRESHOLD (qn, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
14586d7f5d3SJohn Marino mpn_mul (tp, dp, dn, qp, qn); /* mulhi, need tp[qn+in-1...in] */
14686d7f5d3SJohn Marino else
14786d7f5d3SJohn Marino {
14886d7f5d3SJohn Marino tn = mpn_mulmod_bnm1_next_size (dn);
14986d7f5d3SJohn Marino mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, qn, scratch_out);
15086d7f5d3SJohn Marino wn = dn + qn - tn; /* number of wrapped limbs */
15186d7f5d3SJohn Marino if (wn > 0)
15286d7f5d3SJohn Marino {
15386d7f5d3SJohn Marino c0 = mpn_sub_n (tp + tn, tp, rp, wn);
15486d7f5d3SJohn Marino mpn_decr_u (tp + wn, c0);
15586d7f5d3SJohn Marino }
15686d7f5d3SJohn Marino }
15786d7f5d3SJohn Marino
15886d7f5d3SJohn Marino if (dn != qn)
15986d7f5d3SJohn Marino {
16086d7f5d3SJohn Marino cy += mpn_sub_n (rp, rp + qn, tp + qn, dn - qn);
16186d7f5d3SJohn Marino if (cy == 2)
16286d7f5d3SJohn Marino {
16386d7f5d3SJohn Marino mpn_incr_u (tp + dn, 1);
16486d7f5d3SJohn Marino cy = 1;
16586d7f5d3SJohn Marino }
16686d7f5d3SJohn Marino }
16786d7f5d3SJohn Marino return mpn_sub_nc (rp + dn - qn, np, tp + dn, qn, cy);
16886d7f5d3SJohn Marino
16986d7f5d3SJohn Marino #undef ip
17086d7f5d3SJohn Marino #undef tp
17186d7f5d3SJohn Marino #undef scratch_out
17286d7f5d3SJohn Marino }
17386d7f5d3SJohn Marino else
17486d7f5d3SJohn Marino {
17586d7f5d3SJohn Marino /* |_______________________| dividend
17686d7f5d3SJohn Marino |________________| divisor */
17786d7f5d3SJohn Marino
17886d7f5d3SJohn Marino #define ip scratch /* in */
17986d7f5d3SJohn Marino #define tp (scratch + in) /* dn+in or next_size(dn) or rest >= binvert_itch(in) */
18086d7f5d3SJohn Marino #define scratch_out (scratch + in + tn)/* mulmod_bnm1_itch(next_size(dn)) */
18186d7f5d3SJohn Marino
18286d7f5d3SJohn Marino /* Compute half-sized inverse. */
18386d7f5d3SJohn Marino in = qn - (qn >> 1);
18486d7f5d3SJohn Marino
18586d7f5d3SJohn Marino mpn_binvert (ip, dp, in, tp);
18686d7f5d3SJohn Marino
18786d7f5d3SJohn Marino mpn_mullo_n (qp, np, ip, in); /* low `in' quotient limbs */
18886d7f5d3SJohn Marino
18986d7f5d3SJohn Marino if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
19086d7f5d3SJohn Marino mpn_mul (tp, dp, dn, qp, in); /* mulhigh */
19186d7f5d3SJohn Marino else
19286d7f5d3SJohn Marino {
19386d7f5d3SJohn Marino tn = mpn_mulmod_bnm1_next_size (dn);
19486d7f5d3SJohn Marino mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, in, scratch_out);
19586d7f5d3SJohn Marino wn = dn + in - tn; /* number of wrapped limbs */
19686d7f5d3SJohn Marino if (wn > 0)
19786d7f5d3SJohn Marino {
19886d7f5d3SJohn Marino c0 = mpn_sub_n (tp + tn, tp, np, wn);
19986d7f5d3SJohn Marino mpn_decr_u (tp + wn, c0);
20086d7f5d3SJohn Marino }
20186d7f5d3SJohn Marino }
20286d7f5d3SJohn Marino
20386d7f5d3SJohn Marino qp += in;
20486d7f5d3SJohn Marino qn -= in;
20586d7f5d3SJohn Marino
20686d7f5d3SJohn Marino cy = mpn_sub_n (rp, np + in, tp + in, dn);
20786d7f5d3SJohn Marino mpn_mullo_n (qp, rp, ip, qn); /* high qn quotient limbs */
20886d7f5d3SJohn Marino
20986d7f5d3SJohn Marino if (BELOW_THRESHOLD (qn, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
21086d7f5d3SJohn Marino mpn_mul (tp, dp, dn, qp, qn); /* mulhigh */
21186d7f5d3SJohn Marino else
21286d7f5d3SJohn Marino {
21386d7f5d3SJohn Marino tn = mpn_mulmod_bnm1_next_size (dn);
21486d7f5d3SJohn Marino mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, qn, scratch_out);
21586d7f5d3SJohn Marino wn = dn + qn - tn; /* number of wrapped limbs */
21686d7f5d3SJohn Marino if (wn > 0)
21786d7f5d3SJohn Marino {
21886d7f5d3SJohn Marino c0 = mpn_sub_n (tp + tn, tp, rp, wn);
21986d7f5d3SJohn Marino mpn_decr_u (tp + wn, c0);
22086d7f5d3SJohn Marino }
22186d7f5d3SJohn Marino }
22286d7f5d3SJohn Marino
22386d7f5d3SJohn Marino cy += mpn_sub_n (rp, rp + qn, tp + qn, dn - qn);
22486d7f5d3SJohn Marino if (cy == 2)
22586d7f5d3SJohn Marino {
22686d7f5d3SJohn Marino mpn_incr_u (tp + dn, 1);
22786d7f5d3SJohn Marino cy = 1;
22886d7f5d3SJohn Marino }
22986d7f5d3SJohn Marino return mpn_sub_nc (rp + dn - qn, np + dn + in, tp + dn, qn, cy);
23086d7f5d3SJohn Marino
23186d7f5d3SJohn Marino #undef ip
23286d7f5d3SJohn Marino #undef tp
23386d7f5d3SJohn Marino #undef scratch_out
23486d7f5d3SJohn Marino }
23586d7f5d3SJohn Marino }
23686d7f5d3SJohn Marino
23786d7f5d3SJohn Marino mp_size_t
mpn_mu_bdiv_qr_itch(mp_size_t nn,mp_size_t dn)23886d7f5d3SJohn Marino mpn_mu_bdiv_qr_itch (mp_size_t nn, mp_size_t dn)
23986d7f5d3SJohn Marino {
24086d7f5d3SJohn Marino mp_size_t qn, in, tn, itch_binvert, itch_out, itches;
24186d7f5d3SJohn Marino mp_size_t b;
24286d7f5d3SJohn Marino
24386d7f5d3SJohn Marino qn = nn - dn;
24486d7f5d3SJohn Marino
24586d7f5d3SJohn Marino if (qn > dn)
24686d7f5d3SJohn Marino {
24786d7f5d3SJohn Marino b = (qn - 1) / dn + 1; /* ceil(qn/dn), number of blocks */
24886d7f5d3SJohn Marino in = (qn - 1) / b + 1; /* ceil(qn/b) = ceil(qn / ceil(qn/dn)) */
24986d7f5d3SJohn Marino if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
25086d7f5d3SJohn Marino {
25186d7f5d3SJohn Marino tn = dn + in;
25286d7f5d3SJohn Marino itch_out = 0;
25386d7f5d3SJohn Marino }
25486d7f5d3SJohn Marino else
25586d7f5d3SJohn Marino {
25686d7f5d3SJohn Marino tn = mpn_mulmod_bnm1_next_size (dn);
25786d7f5d3SJohn Marino itch_out = mpn_mulmod_bnm1_itch (tn, dn, in);
25886d7f5d3SJohn Marino }
25986d7f5d3SJohn Marino itch_binvert = mpn_binvert_itch (in);
26086d7f5d3SJohn Marino itches = tn + itch_out;
26186d7f5d3SJohn Marino return in + MAX (itches, itch_binvert);
26286d7f5d3SJohn Marino }
26386d7f5d3SJohn Marino else
26486d7f5d3SJohn Marino {
26586d7f5d3SJohn Marino in = qn - (qn >> 1);
26686d7f5d3SJohn Marino if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
26786d7f5d3SJohn Marino {
26886d7f5d3SJohn Marino tn = dn + in;
26986d7f5d3SJohn Marino itch_out = 0;
27086d7f5d3SJohn Marino }
27186d7f5d3SJohn Marino else
27286d7f5d3SJohn Marino {
27386d7f5d3SJohn Marino tn = mpn_mulmod_bnm1_next_size (dn);
27486d7f5d3SJohn Marino itch_out = mpn_mulmod_bnm1_itch (tn, dn, in);
27586d7f5d3SJohn Marino }
27686d7f5d3SJohn Marino }
27786d7f5d3SJohn Marino itch_binvert = mpn_binvert_itch (in);
27886d7f5d3SJohn Marino itches = tn + itch_out;
27986d7f5d3SJohn Marino return in + MAX (itches, itch_binvert);
28086d7f5d3SJohn Marino }
281