mpfr/src/mulders.c

/* Mulders' MulHigh function (short product)

Copyright 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013 Free Software Foundation, Inc.
Contributed by the AriC and Caramel projects, INRIA.

This file is part of the GNU MPFR Library.

The GNU MPFR Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at your
option) any later version.

The GNU MPFR Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MPFR Library; see the file COPYING.LESSER.  If not, see
http://www.gnu.org/licenses/ or write to the Free Software Foundation, Inc.,
51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */
4a238c70SJohn Marino
/* References:
   [1] Short Division of Long Integers, David Harvey and Paul Zimmermann,
       Proceedings of the 20th Symposium on Computer Arithmetic (ARITH-20),
       July 25-27, 2011, pages 7-14.
*/
4a238c70SJohn Marino
4a238c70SJohn Marino#define MPFR_NEED_LONGLONG_H
4a238c70SJohn Marino#include "mpfr-impl.h"
4a238c70SJohn Marino
/* Fallback value, used only when MUL_FFT_THRESHOLD has not already been
   defined before this point.  Above this size the short products below
   switch to a full mpn_mul_n product. */
#if !defined (MUL_FFT_THRESHOLD)
# define MUL_FFT_THRESHOLD 8448
#endif
4a238c70SJohn Marino
4a238c70SJohn Marino/* Don't use MPFR_MULHIGH_SIZE since it is handled by tuneup */
4a238c70SJohn Marino#ifdef MPFR_MULHIGH_TAB_SIZE
4a238c70SJohn Marinostatic short mulhigh_ktab[MPFR_MULHIGH_TAB_SIZE];
4a238c70SJohn Marino#else
4a238c70SJohn Marinostatic short mulhigh_ktab[] = {MPFR_MULHIGH_TAB};
4a238c70SJohn Marino#define MPFR_MULHIGH_TAB_SIZE \
4a238c70SJohn Marino  ((mp_size_t) (sizeof(mulhigh_ktab) / sizeof(mulhigh_ktab[0])))
4a238c70SJohn Marino#endif
4a238c70SJohn Marino
4a238c70SJohn Marino/* Put in  rp[n..2n-1] an approximation of the n high limbs
4a238c70SJohn Marino   of {up, n} * {vp, n}. The error is less than n ulps of rp[n] (and the
4a238c70SJohn Marino   approximation is always less or equal to the truncated full product).
4a238c70SJohn Marino   Assume 2n limbs are allocated at rp.
4a238c70SJohn Marino
4a238c70SJohn Marino   Implements Algorithm ShortMulNaive from [1].
4a238c70SJohn Marino*/
4a238c70SJohn Marinostatic void
4a238c70SJohn Marinompfr_mulhigh_n_basecase (mpfr_limb_ptr rp, mpfr_limb_srcptr up,
4a238c70SJohn Marino                         mpfr_limb_srcptr vp, mp_size_t n)
4a238c70SJohn Marino{
4a238c70SJohn Marino  mp_size_t i;
4a238c70SJohn Marino
4a238c70SJohn Marino  rp += n - 1;
4a238c70SJohn Marino  umul_ppmm (rp[1], rp[0], up[n-1], vp[0]); /* we neglect up[0..n-2]*vp[0],
4a238c70SJohn Marino                                               which is less than B^n */
4a238c70SJohn Marino  for (i = 1 ; i < n ; i++)
4a238c70SJohn Marino    /* here, we neglect up[0..n-i-2] * vp[i], which is less than B^n too */
4a238c70SJohn Marino    rp[i + 1] = mpn_addmul_1 (rp, up + (n - i - 1), i + 1, vp[i]);
4a238c70SJohn Marino  /* in total, we neglect less than n*B^n, i.e., n ulps of rp[n]. */
4a238c70SJohn Marino}
4a238c70SJohn Marino
4a238c70SJohn Marino/* Put in  rp[0..n] the n+1 low limbs of {up, n} * {vp, n}.
4a238c70SJohn Marino   Assume 2n limbs are allocated at rp. */
4a238c70SJohn Marinostatic void
4a238c70SJohn Marinompfr_mullow_n_basecase (mpfr_limb_ptr rp, mpfr_limb_srcptr up,
4a238c70SJohn Marino                        mpfr_limb_srcptr vp, mp_size_t n)
4a238c70SJohn Marino{
4a238c70SJohn Marino  mp_size_t i;
4a238c70SJohn Marino
4a238c70SJohn Marino  rp[n] = mpn_mul_1 (rp, up, n, vp[0]);
4a238c70SJohn Marino  for (i = 1 ; i < n ; i++)
4a238c70SJohn Marino    mpn_addmul_1 (rp + i, up, n - i + 1, vp[i]);
4a238c70SJohn Marino}
4a238c70SJohn Marino
4a238c70SJohn Marino/* Put in  rp[n..2n-1] an approximation of the n high limbs
4a238c70SJohn Marino   of {np, n} * {mp, n}. The error is less than n ulps of rp[n] (and the
4a238c70SJohn Marino   approximation is always less or equal to the truncated full product).
4a238c70SJohn Marino
4a238c70SJohn Marino   Implements Algorithm ShortMul from [1].
4a238c70SJohn Marino*/
4a238c70SJohn Marinovoid
4a238c70SJohn Marinompfr_mulhigh_n (mpfr_limb_ptr rp, mpfr_limb_srcptr np, mpfr_limb_srcptr mp,
4a238c70SJohn Marino                mp_size_t n)
4a238c70SJohn Marino{
4a238c70SJohn Marino  mp_size_t k;
4a238c70SJohn Marino
4a238c70SJohn Marino  MPFR_ASSERTN (MPFR_MULHIGH_TAB_SIZE >= 8); /* so that 3*(n/4) > n/2 */
4a238c70SJohn Marino  k = MPFR_LIKELY (n < MPFR_MULHIGH_TAB_SIZE) ? mulhigh_ktab[n] : 3*(n/4);
4a238c70SJohn Marino  /* Algorithm ShortMul from [1] requires k >= (n+3)/2, which translates
4a238c70SJohn Marino     into k >= (n+4)/2 in the C language. */
4a238c70SJohn Marino  MPFR_ASSERTD (k == -1 || k == 0 || (k >= (n+4)/2 && k < n));
4a238c70SJohn Marino  if (k < 0)
4a238c70SJohn Marino    mpn_mul_basecase (rp, np, n, mp, n); /* result is exact, no error */
4a238c70SJohn Marino  else if (k == 0)
4a238c70SJohn Marino    mpfr_mulhigh_n_basecase (rp, np, mp, n); /* basecase error < n ulps */
4a238c70SJohn Marino  else if (n > MUL_FFT_THRESHOLD)
4a238c70SJohn Marino    mpn_mul_n (rp, np, mp, n); /* result is exact, no error */
4a238c70SJohn Marino  else
4a238c70SJohn Marino    {
4a238c70SJohn Marino      mp_size_t l = n - k;
4a238c70SJohn Marino      mp_limb_t cy;
4a238c70SJohn Marino
4a238c70SJohn Marino      mpn_mul_n (rp + 2 * l, np + l, mp + l, k); /* fills rp[2l..2n-1] */
4a238c70SJohn Marino      mpfr_mulhigh_n (rp, np + k, mp, l);        /* fills rp[l-1..2l-1] */
4a238c70SJohn Marino      cy = mpn_add_n (rp + n - 1, rp + n - 1, rp + l - 1, l + 1);
4a238c70SJohn Marino      mpfr_mulhigh_n (rp, np, mp + k, l);        /* fills rp[l-1..2l-1] */
4a238c70SJohn Marino      cy += mpn_add_n (rp + n - 1, rp + n - 1, rp + l - 1, l + 1);
4a238c70SJohn Marino      mpn_add_1 (rp + n + l, rp + n + l, k, cy); /* propagate carry */
4a238c70SJohn Marino    }
4a238c70SJohn Marino}
4a238c70SJohn Marino
4a238c70SJohn Marino/* Put in  rp[0..n] the n+1 low limbs of {np, n} * {mp, n}.
4a238c70SJohn Marino   Assume 2n limbs are allocated at rp. */
4a238c70SJohn Marinovoid
4a238c70SJohn Marinompfr_mullow_n (mpfr_limb_ptr rp, mpfr_limb_srcptr np, mpfr_limb_srcptr mp,
4a238c70SJohn Marino               mp_size_t n)
4a238c70SJohn Marino{
4a238c70SJohn Marino  mp_size_t k;
4a238c70SJohn Marino
4a238c70SJohn Marino  MPFR_ASSERTN (MPFR_MULHIGH_TAB_SIZE >= 8); /* so that 3*(n/4) > n/2 */
4a238c70SJohn Marino  k = MPFR_LIKELY (n < MPFR_MULHIGH_TAB_SIZE) ? mulhigh_ktab[n] : 3*(n/4);
4a238c70SJohn Marino  MPFR_ASSERTD (k == -1 || k == 0 || (2 * k >= n && k < n));
4a238c70SJohn Marino  if (k < 0)
4a238c70SJohn Marino    mpn_mul_basecase (rp, np, n, mp, n);
4a238c70SJohn Marino  else if (k == 0)
4a238c70SJohn Marino    mpfr_mullow_n_basecase (rp, np, mp, n);
4a238c70SJohn Marino  else if (n > MUL_FFT_THRESHOLD)
4a238c70SJohn Marino    mpn_mul_n (rp, np, mp, n);
4a238c70SJohn Marino  else
4a238c70SJohn Marino    {
4a238c70SJohn Marino      mp_size_t l = n - k;
4a238c70SJohn Marino
4a238c70SJohn Marino      mpn_mul_n (rp, np, mp, k);                      /* fills rp[0..2k] */
4a238c70SJohn Marino      mpfr_mullow_n (rp + n, np + k, mp, l);          /* fills rp[n..n+2l] */
4a238c70SJohn Marino      mpn_add_n (rp + k, rp + k, rp + n, l + 1);
4a238c70SJohn Marino      mpfr_mullow_n (rp + n, np, mp + k, l);          /* fills rp[n..n+2l] */
4a238c70SJohn Marino      mpn_add_n (rp + k, rp + k, rp + n, l + 1);
4a238c70SJohn Marino    }
4a238c70SJohn Marino}
4a238c70SJohn Marino
4a238c70SJohn Marino#ifdef MPFR_SQRHIGH_TAB_SIZE
4a238c70SJohn Marinostatic short sqrhigh_ktab[MPFR_SQRHIGH_TAB_SIZE];
4a238c70SJohn Marino#else
4a238c70SJohn Marinostatic short sqrhigh_ktab[] = {MPFR_SQRHIGH_TAB};
4a238c70SJohn Marino#define MPFR_SQRHIGH_TAB_SIZE (sizeof(sqrhigh_ktab) / sizeof(sqrhigh_ktab[0]))
4a238c70SJohn Marino#endif
4a238c70SJohn Marino
4a238c70SJohn Marino/* Put in  rp[n..2n-1] an approximation of the n high limbs
4a238c70SJohn Marino   of {np, n}^2. The error is less than n ulps of rp[n]. */
4a238c70SJohn Marinovoid
4a238c70SJohn Marinompfr_sqrhigh_n (mpfr_limb_ptr rp, mpfr_limb_srcptr np, mp_size_t n)
4a238c70SJohn Marino{
4a238c70SJohn Marino  mp_size_t k;
4a238c70SJohn Marino
4a238c70SJohn Marino  MPFR_ASSERTN (MPFR_SQRHIGH_TAB_SIZE > 2); /* ensures k < n */
4a238c70SJohn Marino  k = MPFR_LIKELY (n < MPFR_SQRHIGH_TAB_SIZE) ? sqrhigh_ktab[n]
4a238c70SJohn Marino    : (n+4)/2; /* ensures that k >= (n+3)/2 */
4a238c70SJohn Marino  MPFR_ASSERTD (k == -1 || k == 0 || (k >= (n+4)/2 && k < n));
4a238c70SJohn Marino  if (k < 0)
4a238c70SJohn Marino    /* we can't use mpn_sqr_basecase here, since it requires
4a238c70SJohn Marino       n <= SQR_KARATSUBA_THRESHOLD, where SQR_KARATSUBA_THRESHOLD
4a238c70SJohn Marino       is not exported by GMP */
4a238c70SJohn Marino    mpn_sqr_n (rp, np, n);
4a238c70SJohn Marino  else if (k == 0)
4a238c70SJohn Marino    mpfr_mulhigh_n_basecase (rp, np, np, n);
4a238c70SJohn Marino  else
4a238c70SJohn Marino    {
4a238c70SJohn Marino      mp_size_t l = n - k;
4a238c70SJohn Marino      mp_limb_t cy;
4a238c70SJohn Marino
4a238c70SJohn Marino      mpn_sqr_n (rp + 2 * l, np + l, k);          /* fills rp[2l..2n-1] */
4a238c70SJohn Marino      mpfr_mulhigh_n (rp, np, np + k, l);         /* fills rp[l-1..2l-1] */
4a238c70SJohn Marino      /* {rp+n-1,l+1} += 2 * {rp+l-1,l+1} */
4a238c70SJohn Marino      cy = mpn_lshift (rp + l - 1, rp + l - 1, l + 1, 1);
4a238c70SJohn Marino      cy += mpn_add_n (rp + n - 1, rp + n - 1, rp + l - 1, l + 1);
4a238c70SJohn Marino      mpn_add_1 (rp + n + l, rp + n + l, k, cy); /* propagate carry */
4a238c70SJohn Marino    }
4a238c70SJohn Marino}
4a238c70SJohn Marino
4a238c70SJohn Marino#ifdef MPFR_DIVHIGH_TAB_SIZE
4a238c70SJohn Marinostatic short divhigh_ktab[MPFR_DIVHIGH_TAB_SIZE];
4a238c70SJohn Marino#else
4a238c70SJohn Marinostatic short divhigh_ktab[] = {MPFR_DIVHIGH_TAB};
4a238c70SJohn Marino#define MPFR_DIVHIGH_TAB_SIZE (sizeof(divhigh_ktab) / sizeof(divhigh_ktab[0]))
4a238c70SJohn Marino#endif
4a238c70SJohn Marino
/* With a GMP build (__GMPFR_GMP_H__ not defined), the pre-inverse type
   used by the division code below is GMP's own gmp_pi1_t. */
#if !defined (__GMPFR_GMP_H__)
# define mpfr_pi1_t gmp_pi1_t
#endif
4a238c70SJohn Marino
4a238c70SJohn Marino#if !(defined(WANT_GMP_INTERNALS) && defined(HAVE___GMPN_SBPI1_DIVAPPR_Q))
4a238c70SJohn Marino/* Put in Q={qp, n} an approximation of N={np, 2*n} divided by D={dp, n},
4a238c70SJohn Marino   with the most significant limb of the quotient as return value (0 or 1).
4a238c70SJohn Marino   Assumes the most significant bit of D is set. Clobbers N.
4a238c70SJohn Marino
4a238c70SJohn Marino   The approximate quotient Q satisfies - 2(n-1) < N/D - Q <= 4.
4a238c70SJohn Marino*/
4a238c70SJohn Marinostatic mp_limb_t
4a238c70SJohn Marinompfr_divhigh_n_basecase (mpfr_limb_ptr qp, mpfr_limb_ptr np,
4a238c70SJohn Marino                         mpfr_limb_srcptr dp, mp_size_t n)
4a238c70SJohn Marino{
4a238c70SJohn Marino  mp_limb_t qh, d1, d0, dinv, q2, q1, q0;
4a238c70SJohn Marino  mpfr_pi1_t dinv2;
4a238c70SJohn Marino
4a238c70SJohn Marino  np += n;
4a238c70SJohn Marino
4a238c70SJohn Marino  if ((qh = (mpn_cmp (np, dp, n) >= 0)))
4a238c70SJohn Marino    mpn_sub_n (np, np, dp, n);
4a238c70SJohn Marino
4a238c70SJohn Marino  /* now {np, n} is less than D={dp, n}, which implies np[n-1] <= dp[n-1] */
4a238c70SJohn Marino
4a238c70SJohn Marino  d1 = dp[n - 1];
4a238c70SJohn Marino
4a238c70SJohn Marino  if (n == 1)
4a238c70SJohn Marino    {
4a238c70SJohn Marino      invert_limb (dinv, d1);
4a238c70SJohn Marino      umul_ppmm (q1, q0, np[0], dinv);
4a238c70SJohn Marino      qp[0] = np[0] + q1;
4a238c70SJohn Marino      return qh;
4a238c70SJohn Marino    }
4a238c70SJohn Marino
4a238c70SJohn Marino  /* now n >= 2 */
4a238c70SJohn Marino  d0 = dp[n - 2];
4a238c70SJohn Marino  invert_pi1 (dinv2, d1, d0);
4a238c70SJohn Marino  /* dinv2.inv32 = floor ((B^3 - 1) / (d0 + d1 B)) - B */
4a238c70SJohn Marino  while (n > 1)
4a238c70SJohn Marino    {
4a238c70SJohn Marino      /* Invariant: it remains to reduce n limbs from N (in addition to the
4a238c70SJohn Marino         initial low n limbs).
4a238c70SJohn Marino         Since n >= 2 here, necessarily we had n >= 2 initially, which means
4a238c70SJohn Marino         that in addition to the limb np[n-1] to reduce, we have at least 2
4a238c70SJohn Marino         extra limbs, thus accessing np[n-3] is valid. */
4a238c70SJohn Marino
4a238c70SJohn Marino      /* warning: we can have np[n-1]=d1 and np[n-2]=d0, but since {np,n} < D,
4a238c70SJohn Marino         the largest possible partial quotient is B-1 */
4a238c70SJohn Marino      if (MPFR_UNLIKELY(np[n - 1] == d1 && np[n - 2] == d0))
4a238c70SJohn Marino        q2 = ~ (mp_limb_t) 0;
4a238c70SJohn Marino      else
4a238c70SJohn Marino        udiv_qr_3by2 (q2, q1, q0, np[n - 1], np[n - 2], np[n - 3],
4a238c70SJohn Marino                      d1, d0, dinv2.inv32);
4a238c70SJohn Marino      /* since q2 = floor((np[n-1]*B^2+np[n-2]*B+np[n-3])/(d1*B+d0)),
4a238c70SJohn Marino         we have q2 <= (np[n-1]*B^2+np[n-2]*B+np[n-3])/(d1*B+d0),
4a238c70SJohn Marino         thus np[n-1]*B^2+np[n-2]*B+np[n-3] >= q2*(d1*B+d0)
4a238c70SJohn Marino         and {np-1, n} >= q2*D - q2*B^(n-2) >= q2*D - B^(n-1)
4a238c70SJohn Marino         thus {np-1, n} - (q2-1)*D >= D - B^(n-1) >= 0
4a238c70SJohn Marino         which proves that at most one correction is needed */
4a238c70SJohn Marino      q0 = mpn_submul_1 (np - 1, dp, n, q2);
4a238c70SJohn Marino      if (MPFR_UNLIKELY(q0 > np[n - 1]))
4a238c70SJohn Marino        {
4a238c70SJohn Marino          mpn_add_n (np - 1, np - 1, dp, n);
4a238c70SJohn Marino          q2 --;
4a238c70SJohn Marino        }
4a238c70SJohn Marino      qp[--n] = q2;
4a238c70SJohn Marino      dp ++;
4a238c70SJohn Marino    }
4a238c70SJohn Marino
4a238c70SJohn Marino  /* we have B+dinv2 = floor((B^3-1)/(d1*B+d0)) < B^2/d1
4a238c70SJohn Marino     q1 = floor(np[0]*(B+dinv2)/B) <= floor(np[0]*B/d1)
4a238c70SJohn Marino        <= floor((np[0]*B+np[1])/d1)
4a238c70SJohn Marino     thus q1 is not larger than the true quotient.
4a238c70SJohn Marino     q1 > np[0]*(B+dinv2)/B - 1 > np[0]*(B^3-1)/(d1*B+d0)/B - 2
4a238c70SJohn Marino     For d1*B+d0 <> B^2/2, we have B+dinv2 = floor(B^3/(d1*B+d0))
4a238c70SJohn Marino     thus q1 > np[0]*B^2/(d1*B+d0) - 2, i.e.,
4a238c70SJohn Marino     (d1*B+d0)*q1 > np[0]*B^2 - 2*(d1*B+d0)
4a238c70SJohn Marino     d1*B*q1 > np[0]*B^2 - 2*d1*B - 2*d0 - d0*q1 >= np[0]*B^2 - 2*d1*B - B^2
4a238c70SJohn Marino     thus q1 > np[0]*B/d1 - 2 - B/d1 > np[0]*B/d1 - 4.
4a238c70SJohn Marino
4a238c70SJohn Marino     For d1*B+d0 = B^2/2, dinv2 = B-1 thus q1 > np[0]*(2B-1)/B - 1 >
4a238c70SJohn Marino     np[0]*B/d1 - 2.
4a238c70SJohn Marino
4a238c70SJohn Marino     In all cases, if q = floor((np[0]*B+np[1])/d1), we have:
4a238c70SJohn Marino     q - 4 <= q1 <= q
4a238c70SJohn Marino  */
4a238c70SJohn Marino  umul_ppmm (q1, q0, np[0], dinv2.inv32);
4a238c70SJohn Marino  qp[0] = np[0] + q1;
4a238c70SJohn Marino
4a238c70SJohn Marino  return qh;
4a238c70SJohn Marino}
4a238c70SJohn Marino#endif
4a238c70SJohn Marino
/* Put in {qp, n} an approximation of N={np, 2*n} divided by D={dp, n},
   with the most significant limb of the quotient as return value (0 or 1).
   Assumes the most significant bit of D is set. Clobbers N.

   This implements the ShortDiv algorithm from reference [1].
*/
4a238c70SJohn Marino#if 1
4a238c70SJohn Marinomp_limb_t
4a238c70SJohn Marinompfr_divhigh_n (mpfr_limb_ptr qp, mpfr_limb_ptr np, mpfr_limb_ptr dp,
4a238c70SJohn Marino                mp_size_t n)
4a238c70SJohn Marino{
4a238c70SJohn Marino  mp_size_t k, l;
4a238c70SJohn Marino  mp_limb_t qh, cy;
4a238c70SJohn Marino  mpfr_limb_ptr tp;
4a238c70SJohn Marino  MPFR_TMP_DECL(marker);
4a238c70SJohn Marino
4a238c70SJohn Marino  MPFR_ASSERTN (MPFR_MULHIGH_TAB_SIZE >= 15); /* so that 2*(n/3) >= (n+4)/2 */
4a238c70SJohn Marino  k = MPFR_LIKELY (n < MPFR_DIVHIGH_TAB_SIZE) ? divhigh_ktab[n] : 2*(n/3);
4a238c70SJohn Marino
4a238c70SJohn Marino  if (k == 0)
4a238c70SJohn Marino#if defined(WANT_GMP_INTERNALS) && defined(HAVE___GMPN_SBPI1_DIVAPPR_Q)
4a238c70SJohn Marino  {
4a238c70SJohn Marino    mpfr_pi1_t dinv2;
4a238c70SJohn Marino    invert_pi1 (dinv2, dp[n - 1], dp[n - 2]);
4a238c70SJohn Marino    return __gmpn_sbpi1_divappr_q (qp, np, n + n, dp, n, dinv2.inv32);
4a238c70SJohn Marino  }
4a238c70SJohn Marino#else /* use our own code for base-case short division */
4a238c70SJohn Marino    return mpfr_divhigh_n_basecase (qp, np, dp, n);
4a238c70SJohn Marino#endif
4a238c70SJohn Marino  else if (k == n)
4a238c70SJohn Marino    /* for k=n, we use a division with remainder (mpn_divrem),
4a238c70SJohn Marino     which computes the exact quotient */
4a238c70SJohn Marino    return mpn_divrem (qp, 0, np, 2 * n, dp, n);
4a238c70SJohn Marino
4a238c70SJohn Marino  MPFR_ASSERTD ((n+4)/2 <= k && k < n); /* bounds from [1] */
4a238c70SJohn Marino  MPFR_TMP_MARK (marker);
4a238c70SJohn Marino  l = n - k;
4a238c70SJohn Marino  /* first divide the most significant 2k limbs from N by the most significant
4a238c70SJohn Marino     k limbs of D */
4a238c70SJohn Marino  qh = mpn_divrem (qp + l, 0, np + 2 * l, 2 * k, dp + l, k); /* exact */
4a238c70SJohn Marino
4a238c70SJohn Marino  /* it remains {np,2l+k} = {np,n+l} as remainder */
4a238c70SJohn Marino
4a238c70SJohn Marino  /* now we have to subtract high(Q1)*D0 where Q1=qh*B^k+{qp+l,k} and
4a238c70SJohn Marino     D0={dp,l} */
4a238c70SJohn Marino  tp = MPFR_TMP_LIMBS_ALLOC (2 * l);
4a238c70SJohn Marino  mpfr_mulhigh_n (tp, qp + k, dp, l);
4a238c70SJohn Marino  /* we are only interested in the upper l limbs from {tp,2l} */
4a238c70SJohn Marino  cy = mpn_sub_n (np + n, np + n, tp + l, l);
4a238c70SJohn Marino  if (qh)
4a238c70SJohn Marino    cy += mpn_sub_n (np + n, np + n, dp, l);
4a238c70SJohn Marino  while (cy > 0) /* Q1 was too large: subtract 1 to Q1 and add D to np+l */
4a238c70SJohn Marino    {
4a238c70SJohn Marino      qh -= mpn_sub_1 (qp + l, qp + l, k, MPFR_LIMB_ONE);
4a238c70SJohn Marino      cy -= mpn_add_n (np + l, np + l, dp, n);
4a238c70SJohn Marino    }
4a238c70SJohn Marino
4a238c70SJohn Marino  /* now it remains {np,n+l} to divide by D */
4a238c70SJohn Marino  cy = mpfr_divhigh_n (qp, np + k, dp + k, l);
4a238c70SJohn Marino  qh += mpn_add_1 (qp + l, qp + l, k, cy);
4a238c70SJohn Marino  MPFR_TMP_FREE(marker);
4a238c70SJohn Marino
4a238c70SJohn Marino  return qh;
4a238c70SJohn Marino}
4a238c70SJohn Marino#else /* below is the FoldDiv(K) algorithm from [1] */
4a238c70SJohn Marinomp_limb_t
4a238c70SJohn Marinompfr_divhigh_n (mpfr_limb_ptr qp, mpfr_limb_ptr np, mpfr_limb_ptr dp,
4a238c70SJohn Marino                mp_size_t n)
4a238c70SJohn Marino{
4a238c70SJohn Marino  mp_size_t k, r;
4a238c70SJohn Marino  mpfr_limb_ptr ip, tp, up;
4a238c70SJohn Marino  mp_limb_t qh = 0, cy, cc;
4a238c70SJohn Marino  int count;
4a238c70SJohn Marino  MPFR_TMP_DECL(marker);
4a238c70SJohn Marino
4a238c70SJohn Marino#define K 3
4a238c70SJohn Marino  if (n < K)
4a238c70SJohn Marino    return mpn_divrem (qp, 0, np, 2 * n, dp, n);
4a238c70SJohn Marino
4a238c70SJohn Marino  k = (n - 1) / K + 1; /* ceil(n/K) */
4a238c70SJohn Marino
4a238c70SJohn Marino  MPFR_TMP_MARK (marker);
4a238c70SJohn Marino  ip = MPFR_TMP_LIMBS_ALLOC (k + 1);
4a238c70SJohn Marino  tp = MPFR_TMP_LIMBS_ALLOC (n + k);
4a238c70SJohn Marino  up = MPFR_TMP_LIMBS_ALLOC (2 * (k + 1));
4a238c70SJohn Marino  mpn_invert (ip, dp + n - (k + 1), k + 1, NULL); /* takes about 13% for n=1000 */
4a238c70SJohn Marino  /* {ip, k+1} = floor((B^(2k+2)-1)/D - B^(k+1) where D = {dp+n-(k+1),k+1} */
4a238c70SJohn Marino  for (r = n, cc = 0UL; r > 0;)
4a238c70SJohn Marino    {
4a238c70SJohn Marino      /* cc is the carry at np[n+r] */
4a238c70SJohn Marino      MPFR_ASSERTD(cc <= 1);
4a238c70SJohn Marino      /* FIXME: why can we have cc as large as say 8? */
4a238c70SJohn Marino      count = 0;
4a238c70SJohn Marino      while (cc > 0)
4a238c70SJohn Marino        {
4a238c70SJohn Marino          count ++;
4a238c70SJohn Marino          MPFR_ASSERTD(count <= 1);
4a238c70SJohn Marino          /* subtract {dp+n-r,r} from {np+n,r} */
4a238c70SJohn Marino          cc -= mpn_sub_n (np + n, np + n, dp + n - r, r);
4a238c70SJohn Marino          /* add 1 at qp[r] */
4a238c70SJohn Marino          qh += mpn_add_1 (qp + r, qp + r, n - r, 1UL);
4a238c70SJohn Marino        }
4a238c70SJohn Marino      /* it remains r limbs to reduce, i.e., the remainder is {np, n+r} */
4a238c70SJohn Marino      if (r < k)
4a238c70SJohn Marino        {
4a238c70SJohn Marino          ip += k - r;
4a238c70SJohn Marino          k = r;
4a238c70SJohn Marino        }
4a238c70SJohn Marino      /* now r >= k */
4a238c70SJohn Marino      /* qp + r - 2 * k -> up */
4a238c70SJohn Marino      mpfr_mulhigh_n (up, np + n + r - (k + 1), ip, k + 1);
4a238c70SJohn Marino      /* take into account the term B^k in the inverse: B^k * {np+n+r-k, k} */
4a238c70SJohn Marino      cy = mpn_add_n (qp + r - k, up + k + 2, np + n + r - k, k);
4a238c70SJohn Marino      /* since we need only r limbs of tp (below), it suffices to consider
4a238c70SJohn Marino         r high limbs of dp */
4a238c70SJohn Marino      if (r > k)
4a238c70SJohn Marino        {
4a238c70SJohn Marino#if 0
4a238c70SJohn Marino          mpn_mul (tp, dp + n - r, r, qp + r - k, k);
4a238c70SJohn Marino#else  /* use a short product for the low k x k limbs */
4a238c70SJohn Marino          /* we know the upper k limbs of the r-limb product cancel with the
4a238c70SJohn Marino             remainder, thus we only need to compute the low r-k limbs */
4a238c70SJohn Marino          if (r - k >= k)
4a238c70SJohn Marino            mpn_mul (tp + k, dp + n - r + k, r - k, qp + r - k, k);
4a238c70SJohn Marino          else /* r-k < k */
4a238c70SJohn Marino            {
4a238c70SJohn Marino/* #define LOW */
4a238c70SJohn Marino#ifndef LOW
4a238c70SJohn Marino              mpn_mul (tp + k, qp + r - k, k, dp + n - r + k, r - k);
4a238c70SJohn Marino#else
4a238c70SJohn Marino              mpfr_mullow_n_basecase (tp + k, qp + r - k, dp + n - r + k, r - k);
4a238c70SJohn Marino              /* take into account qp[2r-2k] * dp[n - r + k] */
4a238c70SJohn Marino              tp[r] += qp[2*r-2*k] * dp[n - r + k];
4a238c70SJohn Marino#endif
4a238c70SJohn Marino              /* tp[k..r] is filled */
4a238c70SJohn Marino            }
4a238c70SJohn Marino#if 0
4a238c70SJohn Marino          mpfr_mulhigh_n (up, dp + n - r, qp + r - k, k);
4a238c70SJohn Marino#else /* compute one more limb. FIXME: we could add one limb of dp in the
4a238c70SJohn Marino         above, to save one mpn_addmul_1 call */
4a238c70SJohn Marino          mpfr_mulhigh_n (up, dp + n - r, qp + r - k, k - 1); /* {up,2k-2} */
4a238c70SJohn Marino          /* add {qp + r - k, k - 1} * dp[n-r+k-1] */
4a238c70SJohn Marino          up[2*k-2] = mpn_addmul_1 (up + k - 1, qp + r - k, k-1, dp[n-r+k-1]);
4a238c70SJohn Marino          /* add {dp+n-r, k} * qp[r-1] */
4a238c70SJohn Marino          up[2*k-1] = mpn_addmul_1 (up + k - 1, dp + n - r, k, qp[r-1]);
4a238c70SJohn Marino#endif
4a238c70SJohn Marino#ifndef LOW
4a238c70SJohn Marino          cc = mpn_add_n (tp + k, tp + k, up + k, k);
4a238c70SJohn Marino          mpn_add_1 (tp + 2 * k, tp + 2 * k, r - k, cc);
4a238c70SJohn Marino#else
4a238c70SJohn Marino          /* update tp[k..r] */
4a238c70SJohn Marino          if (r - k + 1 <= k)
4a238c70SJohn Marino            mpn_add_n (tp + k, tp + k, up + k, r - k + 1);
4a238c70SJohn Marino          else /* r - k >= k */
4a238c70SJohn Marino            {
4a238c70SJohn Marino              cc = mpn_add_n (tp + k, tp + k, up + k, k);
4a238c70SJohn Marino              mpn_add_1 (tp + 2 * k, tp + 2 * k, r - 2 * k + 1, cc);
4a238c70SJohn Marino            }
4a238c70SJohn Marino#endif
4a238c70SJohn Marino#endif
4a238c70SJohn Marino        }
4a238c70SJohn Marino      else /* last step: since we only want the quotient, no need to update,
4a238c70SJohn Marino              just propagate the carry cy */
4a238c70SJohn Marino        {
4a238c70SJohn Marino          MPFR_ASSERTD(r < n);
4a238c70SJohn Marino          if (cy > 0)
4a238c70SJohn Marino            qh += mpn_add_1 (qp + r, qp + r, n - r, cy);
4a238c70SJohn Marino          break;
4a238c70SJohn Marino        }
4a238c70SJohn Marino      /* subtract {tp, n+k} from {np+r-k, n+k}; however we only want to
4a238c70SJohn Marino         update {np+n, n} */
4a238c70SJohn Marino      /* we should have tp[r] = np[n+r-k] up to 1 */
4a238c70SJohn Marino      MPFR_ASSERTD(tp[r] == np[n + r - k] || tp[r] + 1 == np[n + r - k]);
4a238c70SJohn Marino#ifndef LOW
4a238c70SJohn Marino      cc = mpn_sub_n (np + n - 1, np + n - 1, tp + k - 1, r + 1); /* borrow at np[n+r] */
4a238c70SJohn Marino#else
4a238c70SJohn Marino      cc = mpn_sub_n (np + n - 1, np + n - 1, tp + k - 1, r - k + 2);
4a238c70SJohn Marino#endif
4a238c70SJohn Marino      /* if cy = 1, subtract {dp, n} from {np+r, n}, thus
4a238c70SJohn Marino         {dp+n-r,r} from {np+n,r} */
4a238c70SJohn Marino      if (cy)
4a238c70SJohn Marino        {
4a238c70SJohn Marino          if (r < n)
4a238c70SJohn Marino            cc += mpn_sub_n (np + n - 1, np + n - 1, dp + n - r - 1, r + 1);
4a238c70SJohn Marino          else
4a238c70SJohn Marino            cc += mpn_sub_n (np + n, np + n, dp + n - r, r);
4a238c70SJohn Marino          /* propagate cy */
4a238c70SJohn Marino          if (r == n)
4a238c70SJohn Marino            qh = cy;
4a238c70SJohn Marino          else
4a238c70SJohn Marino            qh += mpn_add_1 (qp + r, qp + r, n - r, cy);
4a238c70SJohn Marino        }
4a238c70SJohn Marino      /* cc is the borrow at np[n+r] */
4a238c70SJohn Marino      count = 0;
4a238c70SJohn Marino      while (cc > 0) /* quotient was too large */
4a238c70SJohn Marino        {
4a238c70SJohn Marino          count++;
4a238c70SJohn Marino          MPFR_ASSERTD (count <= 1);
4a238c70SJohn Marino          cy = mpn_add_n (np + n, np + n, dp + n - (r - k), r - k);
4a238c70SJohn Marino          cc -= mpn_add_1 (np + n + r - k, np + n + r - k, k, cy);
4a238c70SJohn Marino          qh -= mpn_sub_1 (qp + r - k, qp + r - k, n - (r - k), 1UL);
4a238c70SJohn Marino        }
4a238c70SJohn Marino      r -= k;
4a238c70SJohn Marino      cc = np[n + r];
4a238c70SJohn Marino    }
4a238c70SJohn Marino  MPFR_TMP_FREE(marker);
4a238c70SJohn Marino
4a238c70SJohn Marino  return qh;
4a238c70SJohn Marino}
4a238c70SJohn Marino#endif