xref: /dflybsd-src/contrib/gmp/mpn/generic/mu_bdiv_qr.c (revision 86d7f5d305c6adaa56ff4582ece9859d73106103)
/* mpn_mu_bdiv_qr(qp,rp,np,nn,dp,dn,tp) -- Compute {np,nn} / {dp,dn} mod B^qn,
   where qn = nn-dn, storing the quotient in {qp,qn} and the remainder in
   {rp,dn}.  Overlap allowed between Q and N; all other overlap disallowed.
486d7f5d3SJohn Marino 
586d7f5d3SJohn Marino    Contributed to the GNU project by Torbjorn Granlund.
686d7f5d3SJohn Marino 
786d7f5d3SJohn Marino    THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
886d7f5d3SJohn Marino    SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
986d7f5d3SJohn Marino    GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.
1086d7f5d3SJohn Marino 
1186d7f5d3SJohn Marino Copyright 2005, 2006, 2007, 2009, 2010 Free Software Foundation, Inc.
1286d7f5d3SJohn Marino 
1386d7f5d3SJohn Marino This file is part of the GNU MP Library.
1486d7f5d3SJohn Marino 
1586d7f5d3SJohn Marino The GNU MP Library is free software; you can redistribute it and/or modify
1686d7f5d3SJohn Marino it under the terms of the GNU Lesser General Public License as published by
1786d7f5d3SJohn Marino the Free Software Foundation; either version 3 of the License, or (at your
1886d7f5d3SJohn Marino option) any later version.
1986d7f5d3SJohn Marino 
2086d7f5d3SJohn Marino The GNU MP Library is distributed in the hope that it will be useful, but
2186d7f5d3SJohn Marino WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
2286d7f5d3SJohn Marino or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
2386d7f5d3SJohn Marino License for more details.
2486d7f5d3SJohn Marino 
2586d7f5d3SJohn Marino You should have received a copy of the GNU Lesser General Public License
2686d7f5d3SJohn Marino along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
2786d7f5d3SJohn Marino 
2886d7f5d3SJohn Marino 
2986d7f5d3SJohn Marino /*
3086d7f5d3SJohn Marino    The idea of the algorithm used herein is to compute a smaller inverted value
3186d7f5d3SJohn Marino    than used in the standard Barrett algorithm, and thus save time in the
3286d7f5d3SJohn Marino    Newton iterations, and pay just a small price when using the inverted value
3386d7f5d3SJohn Marino    for developing quotient bits.  This algorithm was presented at ICMS 2006.
3486d7f5d3SJohn Marino */
3586d7f5d3SJohn Marino 
3686d7f5d3SJohn Marino #include "gmp.h"
3786d7f5d3SJohn Marino #include "gmp-impl.h"
3886d7f5d3SJohn Marino 
3986d7f5d3SJohn Marino 
4086d7f5d3SJohn Marino /* N = {np,nn}
4186d7f5d3SJohn Marino    D = {dp,dn}
4286d7f5d3SJohn Marino 
4386d7f5d3SJohn Marino    Requirements: N >= D
4486d7f5d3SJohn Marino 		 D >= 1
4586d7f5d3SJohn Marino 		 D odd
4686d7f5d3SJohn Marino 		 dn >= 2
4786d7f5d3SJohn Marino 		 nn >= 2
4886d7f5d3SJohn Marino 		 scratch space as determined by mpn_mu_bdiv_qr_itch(nn,dn).
4986d7f5d3SJohn Marino 
5086d7f5d3SJohn Marino    Write quotient to Q = {qp,nn-dn}.
5186d7f5d3SJohn Marino 
5286d7f5d3SJohn Marino    FIXME: When iterating, perhaps do the small step before loop, not after.
5386d7f5d3SJohn Marino    FIXME: Try to avoid the scalar divisions when computing inverse size.
5486d7f5d3SJohn Marino    FIXME: Trim allocation for (qn > dn) case, 3*dn might be possible.  In
5586d7f5d3SJohn Marino 	  particular, when dn==in, tp and rp could use the same space.
5686d7f5d3SJohn Marino */
5786d7f5d3SJohn Marino mp_limb_t
mpn_mu_bdiv_qr(mp_ptr qp,mp_ptr rp,mp_srcptr np,mp_size_t nn,mp_srcptr dp,mp_size_t dn,mp_ptr scratch)5886d7f5d3SJohn Marino mpn_mu_bdiv_qr (mp_ptr qp,
5986d7f5d3SJohn Marino 		mp_ptr rp,
6086d7f5d3SJohn Marino 		mp_srcptr np, mp_size_t nn,
6186d7f5d3SJohn Marino 		mp_srcptr dp, mp_size_t dn,
6286d7f5d3SJohn Marino 		mp_ptr scratch)
6386d7f5d3SJohn Marino {
6486d7f5d3SJohn Marino   mp_size_t qn;
6586d7f5d3SJohn Marino   mp_size_t in;
6686d7f5d3SJohn Marino   mp_limb_t cy, c0;
6786d7f5d3SJohn Marino   int k;
6886d7f5d3SJohn Marino   mp_size_t tn, wn;
6986d7f5d3SJohn Marino   mp_size_t i;
7086d7f5d3SJohn Marino 
7186d7f5d3SJohn Marino   qn = nn - dn;
7286d7f5d3SJohn Marino 
7386d7f5d3SJohn Marino   ASSERT (dn >= 2);
7486d7f5d3SJohn Marino   ASSERT (qn >= 2);
7586d7f5d3SJohn Marino 
7686d7f5d3SJohn Marino   if (qn > dn)
7786d7f5d3SJohn Marino     {
7886d7f5d3SJohn Marino       mp_size_t b;
7986d7f5d3SJohn Marino 
8086d7f5d3SJohn Marino       /* |_______________________|   dividend
8186d7f5d3SJohn Marino 			|________|   divisor  */
8286d7f5d3SJohn Marino 
8386d7f5d3SJohn Marino #define ip           scratch		/* in */
8486d7f5d3SJohn Marino #define tp           (scratch + in)	/* dn+in or next_size(dn) or rest >= binvert_itch(in) */
8586d7f5d3SJohn Marino #define scratch_out  (scratch + in + tn)/* mulmod_bnm1_itch(next_size(dn)) */
8686d7f5d3SJohn Marino 
8786d7f5d3SJohn Marino       /* Compute an inverse size that is a nice partition of the quotient.  */
8886d7f5d3SJohn Marino       b = (qn - 1) / dn + 1;	/* ceil(qn/dn), number of blocks */
8986d7f5d3SJohn Marino       in = (qn - 1) / b + 1;	/* ceil(qn/b) = ceil(qn / ceil(qn/dn)) */
9086d7f5d3SJohn Marino 
9186d7f5d3SJohn Marino       /* Some notes on allocation:
9286d7f5d3SJohn Marino 
9386d7f5d3SJohn Marino 	 When in = dn, R dies when mpn_mullo returns, if in < dn the low in
9486d7f5d3SJohn Marino 	 limbs of R dies at that point.  We could save memory by letting T live
9586d7f5d3SJohn Marino 	 just under R, and let the upper part of T expand into R. These changes
9686d7f5d3SJohn Marino 	 should reduce itch to perhaps 3dn.
9786d7f5d3SJohn Marino        */
9886d7f5d3SJohn Marino 
9986d7f5d3SJohn Marino       mpn_binvert (ip, dp, in, tp);
10086d7f5d3SJohn Marino 
10186d7f5d3SJohn Marino       MPN_COPY (rp, np, dn);
10286d7f5d3SJohn Marino       np += dn;
10386d7f5d3SJohn Marino       cy = 0;
10486d7f5d3SJohn Marino 
10586d7f5d3SJohn Marino       while (qn > in)
10686d7f5d3SJohn Marino 	{
10786d7f5d3SJohn Marino 	  mpn_mullo_n (qp, rp, ip, in);
10886d7f5d3SJohn Marino 
10986d7f5d3SJohn Marino 	  if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
11086d7f5d3SJohn Marino 	    mpn_mul (tp, dp, dn, qp, in);	/* mulhi, need tp[dn+in-1...in] */
11186d7f5d3SJohn Marino 	  else
11286d7f5d3SJohn Marino 	    {
11386d7f5d3SJohn Marino 	      tn = mpn_mulmod_bnm1_next_size (dn);
11486d7f5d3SJohn Marino 	      mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, in, scratch_out);
11586d7f5d3SJohn Marino 	      wn = dn + in - tn;		/* number of wrapped limbs */
11686d7f5d3SJohn Marino 	      if (wn > 0)
11786d7f5d3SJohn Marino 		{
11886d7f5d3SJohn Marino 		  c0 = mpn_sub_n (tp + tn, tp, rp, wn);
11986d7f5d3SJohn Marino 		  mpn_decr_u (tp + wn, c0);
12086d7f5d3SJohn Marino 		}
12186d7f5d3SJohn Marino 	    }
12286d7f5d3SJohn Marino 
12386d7f5d3SJohn Marino 	  qp += in;
12486d7f5d3SJohn Marino 	  qn -= in;
12586d7f5d3SJohn Marino 
12686d7f5d3SJohn Marino 	  if (dn != in)
12786d7f5d3SJohn Marino 	    {
12886d7f5d3SJohn Marino 	      /* Subtract tp[dn-1...in] from partial remainder.  */
12986d7f5d3SJohn Marino 	      cy += mpn_sub_n (rp, rp + in, tp + in, dn - in);
13086d7f5d3SJohn Marino 	      if (cy == 2)
13186d7f5d3SJohn Marino 		{
13286d7f5d3SJohn Marino 		  mpn_incr_u (tp + dn, 1);
13386d7f5d3SJohn Marino 		  cy = 1;
13486d7f5d3SJohn Marino 		}
13586d7f5d3SJohn Marino 	    }
13686d7f5d3SJohn Marino 	  /* Subtract tp[dn+in-1...dn] from dividend.  */
13786d7f5d3SJohn Marino 	  cy = mpn_sub_nc (rp + dn - in, np, tp + dn, in, cy);
13886d7f5d3SJohn Marino 	  np += in;
13986d7f5d3SJohn Marino 	}
14086d7f5d3SJohn Marino 
14186d7f5d3SJohn Marino       /* Generate last qn limbs.  */
14286d7f5d3SJohn Marino       mpn_mullo_n (qp, rp, ip, qn);
14386d7f5d3SJohn Marino 
14486d7f5d3SJohn Marino       if (BELOW_THRESHOLD (qn, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
14586d7f5d3SJohn Marino 	mpn_mul (tp, dp, dn, qp, qn);		/* mulhi, need tp[qn+in-1...in] */
14686d7f5d3SJohn Marino       else
14786d7f5d3SJohn Marino 	{
14886d7f5d3SJohn Marino 	  tn = mpn_mulmod_bnm1_next_size (dn);
14986d7f5d3SJohn Marino 	  mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, qn, scratch_out);
15086d7f5d3SJohn Marino 	  wn = dn + qn - tn;			/* number of wrapped limbs */
15186d7f5d3SJohn Marino 	  if (wn > 0)
15286d7f5d3SJohn Marino 	    {
15386d7f5d3SJohn Marino 	      c0 = mpn_sub_n (tp + tn, tp, rp, wn);
15486d7f5d3SJohn Marino 	      mpn_decr_u (tp + wn, c0);
15586d7f5d3SJohn Marino 	    }
15686d7f5d3SJohn Marino 	}
15786d7f5d3SJohn Marino 
15886d7f5d3SJohn Marino       if (dn != qn)
15986d7f5d3SJohn Marino 	{
16086d7f5d3SJohn Marino 	  cy += mpn_sub_n (rp, rp + qn, tp + qn, dn - qn);
16186d7f5d3SJohn Marino 	  if (cy == 2)
16286d7f5d3SJohn Marino 	    {
16386d7f5d3SJohn Marino 	      mpn_incr_u (tp + dn, 1);
16486d7f5d3SJohn Marino 	      cy = 1;
16586d7f5d3SJohn Marino 	    }
16686d7f5d3SJohn Marino 	}
16786d7f5d3SJohn Marino       return mpn_sub_nc (rp + dn - qn, np, tp + dn, qn, cy);
16886d7f5d3SJohn Marino 
16986d7f5d3SJohn Marino #undef ip
17086d7f5d3SJohn Marino #undef tp
17186d7f5d3SJohn Marino #undef scratch_out
17286d7f5d3SJohn Marino     }
17386d7f5d3SJohn Marino   else
17486d7f5d3SJohn Marino     {
17586d7f5d3SJohn Marino       /* |_______________________|   dividend
17686d7f5d3SJohn Marino 		|________________|   divisor  */
17786d7f5d3SJohn Marino 
17886d7f5d3SJohn Marino #define ip           scratch		/* in */
17986d7f5d3SJohn Marino #define tp           (scratch + in)	/* dn+in or next_size(dn) or rest >= binvert_itch(in) */
18086d7f5d3SJohn Marino #define scratch_out  (scratch + in + tn)/* mulmod_bnm1_itch(next_size(dn)) */
18186d7f5d3SJohn Marino 
18286d7f5d3SJohn Marino       /* Compute half-sized inverse.  */
18386d7f5d3SJohn Marino       in = qn - (qn >> 1);
18486d7f5d3SJohn Marino 
18586d7f5d3SJohn Marino       mpn_binvert (ip, dp, in, tp);
18686d7f5d3SJohn Marino 
18786d7f5d3SJohn Marino       mpn_mullo_n (qp, np, ip, in);		/* low `in' quotient limbs */
18886d7f5d3SJohn Marino 
18986d7f5d3SJohn Marino       if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
19086d7f5d3SJohn Marino 	mpn_mul (tp, dp, dn, qp, in);		/* mulhigh */
19186d7f5d3SJohn Marino       else
19286d7f5d3SJohn Marino 	{
19386d7f5d3SJohn Marino 	  tn = mpn_mulmod_bnm1_next_size (dn);
19486d7f5d3SJohn Marino 	  mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, in, scratch_out);
19586d7f5d3SJohn Marino 	  wn = dn + in - tn;			/* number of wrapped limbs */
19686d7f5d3SJohn Marino 	  if (wn > 0)
19786d7f5d3SJohn Marino 	    {
19886d7f5d3SJohn Marino 	      c0 = mpn_sub_n (tp + tn, tp, np, wn);
19986d7f5d3SJohn Marino 	      mpn_decr_u (tp + wn, c0);
20086d7f5d3SJohn Marino 	    }
20186d7f5d3SJohn Marino 	}
20286d7f5d3SJohn Marino 
20386d7f5d3SJohn Marino       qp += in;
20486d7f5d3SJohn Marino       qn -= in;
20586d7f5d3SJohn Marino 
20686d7f5d3SJohn Marino       cy = mpn_sub_n (rp, np + in, tp + in, dn);
20786d7f5d3SJohn Marino       mpn_mullo_n (qp, rp, ip, qn);		/* high qn quotient limbs */
20886d7f5d3SJohn Marino 
20986d7f5d3SJohn Marino       if (BELOW_THRESHOLD (qn, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
21086d7f5d3SJohn Marino 	mpn_mul (tp, dp, dn, qp, qn);		/* mulhigh */
21186d7f5d3SJohn Marino       else
21286d7f5d3SJohn Marino 	{
21386d7f5d3SJohn Marino 	  tn = mpn_mulmod_bnm1_next_size (dn);
21486d7f5d3SJohn Marino 	  mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, qn, scratch_out);
21586d7f5d3SJohn Marino 	  wn = dn + qn - tn;			/* number of wrapped limbs */
21686d7f5d3SJohn Marino 	  if (wn > 0)
21786d7f5d3SJohn Marino 	    {
21886d7f5d3SJohn Marino 	      c0 = mpn_sub_n (tp + tn, tp, rp, wn);
21986d7f5d3SJohn Marino 	      mpn_decr_u (tp + wn, c0);
22086d7f5d3SJohn Marino 	    }
22186d7f5d3SJohn Marino 	}
22286d7f5d3SJohn Marino 
22386d7f5d3SJohn Marino       cy += mpn_sub_n (rp, rp + qn, tp + qn, dn - qn);
22486d7f5d3SJohn Marino       if (cy == 2)
22586d7f5d3SJohn Marino 	{
22686d7f5d3SJohn Marino 	  mpn_incr_u (tp + dn, 1);
22786d7f5d3SJohn Marino 	  cy = 1;
22886d7f5d3SJohn Marino 	}
22986d7f5d3SJohn Marino       return mpn_sub_nc (rp + dn - qn, np + dn + in, tp + dn, qn, cy);
23086d7f5d3SJohn Marino 
23186d7f5d3SJohn Marino #undef ip
23286d7f5d3SJohn Marino #undef tp
23386d7f5d3SJohn Marino #undef scratch_out
23486d7f5d3SJohn Marino     }
23586d7f5d3SJohn Marino }
23686d7f5d3SJohn Marino 
23786d7f5d3SJohn Marino mp_size_t
mpn_mu_bdiv_qr_itch(mp_size_t nn,mp_size_t dn)23886d7f5d3SJohn Marino mpn_mu_bdiv_qr_itch (mp_size_t nn, mp_size_t dn)
23986d7f5d3SJohn Marino {
24086d7f5d3SJohn Marino   mp_size_t qn, in, tn, itch_binvert, itch_out, itches;
24186d7f5d3SJohn Marino   mp_size_t b;
24286d7f5d3SJohn Marino 
24386d7f5d3SJohn Marino   qn = nn - dn;
24486d7f5d3SJohn Marino 
24586d7f5d3SJohn Marino   if (qn > dn)
24686d7f5d3SJohn Marino     {
24786d7f5d3SJohn Marino       b = (qn - 1) / dn + 1;	/* ceil(qn/dn), number of blocks */
24886d7f5d3SJohn Marino       in = (qn - 1) / b + 1;	/* ceil(qn/b) = ceil(qn / ceil(qn/dn)) */
24986d7f5d3SJohn Marino       if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
25086d7f5d3SJohn Marino 	{
25186d7f5d3SJohn Marino 	  tn = dn + in;
25286d7f5d3SJohn Marino 	  itch_out = 0;
25386d7f5d3SJohn Marino 	}
25486d7f5d3SJohn Marino       else
25586d7f5d3SJohn Marino 	{
25686d7f5d3SJohn Marino 	  tn = mpn_mulmod_bnm1_next_size (dn);
25786d7f5d3SJohn Marino 	  itch_out = mpn_mulmod_bnm1_itch (tn, dn, in);
25886d7f5d3SJohn Marino 	}
25986d7f5d3SJohn Marino       itch_binvert = mpn_binvert_itch (in);
26086d7f5d3SJohn Marino       itches = tn + itch_out;
26186d7f5d3SJohn Marino       return in + MAX (itches, itch_binvert);
26286d7f5d3SJohn Marino     }
26386d7f5d3SJohn Marino   else
26486d7f5d3SJohn Marino     {
26586d7f5d3SJohn Marino       in = qn - (qn >> 1);
26686d7f5d3SJohn Marino       if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
26786d7f5d3SJohn Marino 	{
26886d7f5d3SJohn Marino 	  tn = dn + in;
26986d7f5d3SJohn Marino 	  itch_out = 0;
27086d7f5d3SJohn Marino 	}
27186d7f5d3SJohn Marino       else
27286d7f5d3SJohn Marino 	{
27386d7f5d3SJohn Marino 	  tn = mpn_mulmod_bnm1_next_size (dn);
27486d7f5d3SJohn Marino 	  itch_out = mpn_mulmod_bnm1_itch (tn, dn, in);
27586d7f5d3SJohn Marino 	}
27686d7f5d3SJohn Marino     }
27786d7f5d3SJohn Marino   itch_binvert = mpn_binvert_itch (in);
27886d7f5d3SJohn Marino   itches = tn + itch_out;
27986d7f5d3SJohn Marino   return in + MAX (itches, itch_binvert);
28086d7f5d3SJohn Marino }
281