xref: /dflybsd-src/contrib/mpfr/src/mulders.c (revision 2786097444a0124b5d33763854de247e230c6629)
14a238c70SJohn Marino /* Mulders' MulHigh function (short product)
24a238c70SJohn Marino 
3*ab6d115fSJohn Marino Copyright 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013 Free Software Foundation, Inc.
4*ab6d115fSJohn Marino Contributed by the AriC and Caramel projects, INRIA.
54a238c70SJohn Marino 
64a238c70SJohn Marino This file is part of the GNU MPFR Library.
74a238c70SJohn Marino 
84a238c70SJohn Marino The GNU MPFR Library is free software; you can redistribute it and/or modify
94a238c70SJohn Marino it under the terms of the GNU Lesser General Public License as published by
104a238c70SJohn Marino the Free Software Foundation; either version 3 of the License, or (at your
114a238c70SJohn Marino option) any later version.
124a238c70SJohn Marino 
134a238c70SJohn Marino The GNU MPFR Library is distributed in the hope that it will be useful, but
144a238c70SJohn Marino WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
154a238c70SJohn Marino or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
164a238c70SJohn Marino License for more details.
174a238c70SJohn Marino 
184a238c70SJohn Marino You should have received a copy of the GNU Lesser General Public License
194a238c70SJohn Marino along with the GNU MPFR Library; see the file COPYING.LESSER.  If not, see
204a238c70SJohn Marino http://www.gnu.org/licenses/ or write to the Free Software Foundation, Inc.,
214a238c70SJohn Marino 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */
224a238c70SJohn Marino 
234a238c70SJohn Marino /* References:
244a238c70SJohn Marino    [1] Short Division of Long Integers, David Harvey and Paul Zimmermann,
254a238c70SJohn Marino        Proceedings of the 20th Symposium on Computer Arithmetic (ARITH-20),
264a238c70SJohn Marino        July 25-27, 2011, pages 7-14.
274a238c70SJohn Marino */
284a238c70SJohn Marino 
294a238c70SJohn Marino #define MPFR_NEED_LONGLONG_H
304a238c70SJohn Marino #include "mpfr-impl.h"
314a238c70SJohn Marino 
324a238c70SJohn Marino #ifndef MUL_FFT_THRESHOLD
334a238c70SJohn Marino #define MUL_FFT_THRESHOLD 8448
344a238c70SJohn Marino #endif
354a238c70SJohn Marino 
364a238c70SJohn Marino /* Don't use MPFR_MULHIGH_SIZE since it is handled by tuneup */
374a238c70SJohn Marino #ifdef MPFR_MULHIGH_TAB_SIZE
384a238c70SJohn Marino static short mulhigh_ktab[MPFR_MULHIGH_TAB_SIZE];
394a238c70SJohn Marino #else
404a238c70SJohn Marino static short mulhigh_ktab[] = {MPFR_MULHIGH_TAB};
414a238c70SJohn Marino #define MPFR_MULHIGH_TAB_SIZE \
424a238c70SJohn Marino   ((mp_size_t) (sizeof(mulhigh_ktab) / sizeof(mulhigh_ktab[0])))
434a238c70SJohn Marino #endif
444a238c70SJohn Marino 
454a238c70SJohn Marino /* Put in  rp[n..2n-1] an approximation of the n high limbs
464a238c70SJohn Marino    of {up, n} * {vp, n}. The error is less than n ulps of rp[n] (and the
474a238c70SJohn Marino    approximation is always less or equal to the truncated full product).
484a238c70SJohn Marino    Assume 2n limbs are allocated at rp.
494a238c70SJohn Marino 
504a238c70SJohn Marino    Implements Algorithm ShortMulNaive from [1].
514a238c70SJohn Marino */
524a238c70SJohn Marino static void
mpfr_mulhigh_n_basecase(mpfr_limb_ptr rp,mpfr_limb_srcptr up,mpfr_limb_srcptr vp,mp_size_t n)534a238c70SJohn Marino mpfr_mulhigh_n_basecase (mpfr_limb_ptr rp, mpfr_limb_srcptr up,
544a238c70SJohn Marino                          mpfr_limb_srcptr vp, mp_size_t n)
554a238c70SJohn Marino {
564a238c70SJohn Marino   mp_size_t i;
574a238c70SJohn Marino 
584a238c70SJohn Marino   rp += n - 1;
594a238c70SJohn Marino   umul_ppmm (rp[1], rp[0], up[n-1], vp[0]); /* we neglect up[0..n-2]*vp[0],
604a238c70SJohn Marino                                                which is less than B^n */
614a238c70SJohn Marino   for (i = 1 ; i < n ; i++)
624a238c70SJohn Marino     /* here, we neglect up[0..n-i-2] * vp[i], which is less than B^n too */
634a238c70SJohn Marino     rp[i + 1] = mpn_addmul_1 (rp, up + (n - i - 1), i + 1, vp[i]);
644a238c70SJohn Marino   /* in total, we neglect less than n*B^n, i.e., n ulps of rp[n]. */
654a238c70SJohn Marino }
664a238c70SJohn Marino 
674a238c70SJohn Marino /* Put in  rp[0..n] the n+1 low limbs of {up, n} * {vp, n}.
684a238c70SJohn Marino    Assume 2n limbs are allocated at rp. */
694a238c70SJohn Marino static void
mpfr_mullow_n_basecase(mpfr_limb_ptr rp,mpfr_limb_srcptr up,mpfr_limb_srcptr vp,mp_size_t n)704a238c70SJohn Marino mpfr_mullow_n_basecase (mpfr_limb_ptr rp, mpfr_limb_srcptr up,
714a238c70SJohn Marino                         mpfr_limb_srcptr vp, mp_size_t n)
724a238c70SJohn Marino {
734a238c70SJohn Marino   mp_size_t i;
744a238c70SJohn Marino 
754a238c70SJohn Marino   rp[n] = mpn_mul_1 (rp, up, n, vp[0]);
764a238c70SJohn Marino   for (i = 1 ; i < n ; i++)
774a238c70SJohn Marino     mpn_addmul_1 (rp + i, up, n - i + 1, vp[i]);
784a238c70SJohn Marino }
794a238c70SJohn Marino 
804a238c70SJohn Marino /* Put in  rp[n..2n-1] an approximation of the n high limbs
814a238c70SJohn Marino    of {np, n} * {mp, n}. The error is less than n ulps of rp[n] (and the
824a238c70SJohn Marino    approximation is always less or equal to the truncated full product).
834a238c70SJohn Marino 
844a238c70SJohn Marino    Implements Algorithm ShortMul from [1].
854a238c70SJohn Marino */
864a238c70SJohn Marino void
mpfr_mulhigh_n(mpfr_limb_ptr rp,mpfr_limb_srcptr np,mpfr_limb_srcptr mp,mp_size_t n)874a238c70SJohn Marino mpfr_mulhigh_n (mpfr_limb_ptr rp, mpfr_limb_srcptr np, mpfr_limb_srcptr mp,
884a238c70SJohn Marino                 mp_size_t n)
894a238c70SJohn Marino {
904a238c70SJohn Marino   mp_size_t k;
914a238c70SJohn Marino 
924a238c70SJohn Marino   MPFR_ASSERTN (MPFR_MULHIGH_TAB_SIZE >= 8); /* so that 3*(n/4) > n/2 */
934a238c70SJohn Marino   k = MPFR_LIKELY (n < MPFR_MULHIGH_TAB_SIZE) ? mulhigh_ktab[n] : 3*(n/4);
944a238c70SJohn Marino   /* Algorithm ShortMul from [1] requires k >= (n+3)/2, which translates
954a238c70SJohn Marino      into k >= (n+4)/2 in the C language. */
964a238c70SJohn Marino   MPFR_ASSERTD (k == -1 || k == 0 || (k >= (n+4)/2 && k < n));
974a238c70SJohn Marino   if (k < 0)
984a238c70SJohn Marino     mpn_mul_basecase (rp, np, n, mp, n); /* result is exact, no error */
994a238c70SJohn Marino   else if (k == 0)
1004a238c70SJohn Marino     mpfr_mulhigh_n_basecase (rp, np, mp, n); /* basecase error < n ulps */
1014a238c70SJohn Marino   else if (n > MUL_FFT_THRESHOLD)
1024a238c70SJohn Marino     mpn_mul_n (rp, np, mp, n); /* result is exact, no error */
1034a238c70SJohn Marino   else
1044a238c70SJohn Marino     {
1054a238c70SJohn Marino       mp_size_t l = n - k;
1064a238c70SJohn Marino       mp_limb_t cy;
1074a238c70SJohn Marino 
1084a238c70SJohn Marino       mpn_mul_n (rp + 2 * l, np + l, mp + l, k); /* fills rp[2l..2n-1] */
1094a238c70SJohn Marino       mpfr_mulhigh_n (rp, np + k, mp, l);        /* fills rp[l-1..2l-1] */
1104a238c70SJohn Marino       cy = mpn_add_n (rp + n - 1, rp + n - 1, rp + l - 1, l + 1);
1114a238c70SJohn Marino       mpfr_mulhigh_n (rp, np, mp + k, l);        /* fills rp[l-1..2l-1] */
1124a238c70SJohn Marino       cy += mpn_add_n (rp + n - 1, rp + n - 1, rp + l - 1, l + 1);
1134a238c70SJohn Marino       mpn_add_1 (rp + n + l, rp + n + l, k, cy); /* propagate carry */
1144a238c70SJohn Marino     }
1154a238c70SJohn Marino }
1164a238c70SJohn Marino 
1174a238c70SJohn Marino /* Put in  rp[0..n] the n+1 low limbs of {np, n} * {mp, n}.
1184a238c70SJohn Marino    Assume 2n limbs are allocated at rp. */
1194a238c70SJohn Marino void
mpfr_mullow_n(mpfr_limb_ptr rp,mpfr_limb_srcptr np,mpfr_limb_srcptr mp,mp_size_t n)1204a238c70SJohn Marino mpfr_mullow_n (mpfr_limb_ptr rp, mpfr_limb_srcptr np, mpfr_limb_srcptr mp,
1214a238c70SJohn Marino                mp_size_t n)
1224a238c70SJohn Marino {
1234a238c70SJohn Marino   mp_size_t k;
1244a238c70SJohn Marino 
1254a238c70SJohn Marino   MPFR_ASSERTN (MPFR_MULHIGH_TAB_SIZE >= 8); /* so that 3*(n/4) > n/2 */
1264a238c70SJohn Marino   k = MPFR_LIKELY (n < MPFR_MULHIGH_TAB_SIZE) ? mulhigh_ktab[n] : 3*(n/4);
1274a238c70SJohn Marino   MPFR_ASSERTD (k == -1 || k == 0 || (2 * k >= n && k < n));
1284a238c70SJohn Marino   if (k < 0)
1294a238c70SJohn Marino     mpn_mul_basecase (rp, np, n, mp, n);
1304a238c70SJohn Marino   else if (k == 0)
1314a238c70SJohn Marino     mpfr_mullow_n_basecase (rp, np, mp, n);
1324a238c70SJohn Marino   else if (n > MUL_FFT_THRESHOLD)
1334a238c70SJohn Marino     mpn_mul_n (rp, np, mp, n);
1344a238c70SJohn Marino   else
1354a238c70SJohn Marino     {
1364a238c70SJohn Marino       mp_size_t l = n - k;
1374a238c70SJohn Marino 
1384a238c70SJohn Marino       mpn_mul_n (rp, np, mp, k);                      /* fills rp[0..2k] */
1394a238c70SJohn Marino       mpfr_mullow_n (rp + n, np + k, mp, l);          /* fills rp[n..n+2l] */
1404a238c70SJohn Marino       mpn_add_n (rp + k, rp + k, rp + n, l + 1);
1414a238c70SJohn Marino       mpfr_mullow_n (rp + n, np, mp + k, l);          /* fills rp[n..n+2l] */
1424a238c70SJohn Marino       mpn_add_n (rp + k, rp + k, rp + n, l + 1);
1434a238c70SJohn Marino     }
1444a238c70SJohn Marino }
1454a238c70SJohn Marino 
1464a238c70SJohn Marino #ifdef MPFR_SQRHIGH_TAB_SIZE
1474a238c70SJohn Marino static short sqrhigh_ktab[MPFR_SQRHIGH_TAB_SIZE];
1484a238c70SJohn Marino #else
1494a238c70SJohn Marino static short sqrhigh_ktab[] = {MPFR_SQRHIGH_TAB};
1504a238c70SJohn Marino #define MPFR_SQRHIGH_TAB_SIZE (sizeof(sqrhigh_ktab) / sizeof(sqrhigh_ktab[0]))
1514a238c70SJohn Marino #endif
1524a238c70SJohn Marino 
1534a238c70SJohn Marino /* Put in  rp[n..2n-1] an approximation of the n high limbs
1544a238c70SJohn Marino    of {np, n}^2. The error is less than n ulps of rp[n]. */
1554a238c70SJohn Marino void
mpfr_sqrhigh_n(mpfr_limb_ptr rp,mpfr_limb_srcptr np,mp_size_t n)1564a238c70SJohn Marino mpfr_sqrhigh_n (mpfr_limb_ptr rp, mpfr_limb_srcptr np, mp_size_t n)
1574a238c70SJohn Marino {
1584a238c70SJohn Marino   mp_size_t k;
1594a238c70SJohn Marino 
1604a238c70SJohn Marino   MPFR_ASSERTN (MPFR_SQRHIGH_TAB_SIZE > 2); /* ensures k < n */
1614a238c70SJohn Marino   k = MPFR_LIKELY (n < MPFR_SQRHIGH_TAB_SIZE) ? sqrhigh_ktab[n]
1624a238c70SJohn Marino     : (n+4)/2; /* ensures that k >= (n+3)/2 */
1634a238c70SJohn Marino   MPFR_ASSERTD (k == -1 || k == 0 || (k >= (n+4)/2 && k < n));
1644a238c70SJohn Marino   if (k < 0)
1654a238c70SJohn Marino     /* we can't use mpn_sqr_basecase here, since it requires
1664a238c70SJohn Marino        n <= SQR_KARATSUBA_THRESHOLD, where SQR_KARATSUBA_THRESHOLD
1674a238c70SJohn Marino        is not exported by GMP */
1684a238c70SJohn Marino     mpn_sqr_n (rp, np, n);
1694a238c70SJohn Marino   else if (k == 0)
1704a238c70SJohn Marino     mpfr_mulhigh_n_basecase (rp, np, np, n);
1714a238c70SJohn Marino   else
1724a238c70SJohn Marino     {
1734a238c70SJohn Marino       mp_size_t l = n - k;
1744a238c70SJohn Marino       mp_limb_t cy;
1754a238c70SJohn Marino 
1764a238c70SJohn Marino       mpn_sqr_n (rp + 2 * l, np + l, k);          /* fills rp[2l..2n-1] */
1774a238c70SJohn Marino       mpfr_mulhigh_n (rp, np, np + k, l);         /* fills rp[l-1..2l-1] */
1784a238c70SJohn Marino       /* {rp+n-1,l+1} += 2 * {rp+l-1,l+1} */
1794a238c70SJohn Marino       cy = mpn_lshift (rp + l - 1, rp + l - 1, l + 1, 1);
1804a238c70SJohn Marino       cy += mpn_add_n (rp + n - 1, rp + n - 1, rp + l - 1, l + 1);
1814a238c70SJohn Marino       mpn_add_1 (rp + n + l, rp + n + l, k, cy); /* propagate carry */
1824a238c70SJohn Marino     }
1834a238c70SJohn Marino }
1844a238c70SJohn Marino 
1854a238c70SJohn Marino #ifdef MPFR_DIVHIGH_TAB_SIZE
1864a238c70SJohn Marino static short divhigh_ktab[MPFR_DIVHIGH_TAB_SIZE];
1874a238c70SJohn Marino #else
1884a238c70SJohn Marino static short divhigh_ktab[] = {MPFR_DIVHIGH_TAB};
1894a238c70SJohn Marino #define MPFR_DIVHIGH_TAB_SIZE (sizeof(divhigh_ktab) / sizeof(divhigh_ktab[0]))
1904a238c70SJohn Marino #endif
1914a238c70SJohn Marino 
1924a238c70SJohn Marino #ifndef __GMPFR_GMP_H__
1934a238c70SJohn Marino #define mpfr_pi1_t gmp_pi1_t /* with a GMP build */
1944a238c70SJohn Marino #endif
1954a238c70SJohn Marino 
1964a238c70SJohn Marino #if !(defined(WANT_GMP_INTERNALS) && defined(HAVE___GMPN_SBPI1_DIVAPPR_Q))
1974a238c70SJohn Marino /* Put in Q={qp, n} an approximation of N={np, 2*n} divided by D={dp, n},
1984a238c70SJohn Marino    with the most significant limb of the quotient as return value (0 or 1).
1994a238c70SJohn Marino    Assumes the most significant bit of D is set. Clobbers N.
2004a238c70SJohn Marino 
2014a238c70SJohn Marino    The approximate quotient Q satisfies - 2(n-1) < N/D - Q <= 4.
2024a238c70SJohn Marino */
2034a238c70SJohn Marino static mp_limb_t
mpfr_divhigh_n_basecase(mpfr_limb_ptr qp,mpfr_limb_ptr np,mpfr_limb_srcptr dp,mp_size_t n)2044a238c70SJohn Marino mpfr_divhigh_n_basecase (mpfr_limb_ptr qp, mpfr_limb_ptr np,
2054a238c70SJohn Marino                          mpfr_limb_srcptr dp, mp_size_t n)
2064a238c70SJohn Marino {
2074a238c70SJohn Marino   mp_limb_t qh, d1, d0, dinv, q2, q1, q0;
2084a238c70SJohn Marino   mpfr_pi1_t dinv2;
2094a238c70SJohn Marino 
2104a238c70SJohn Marino   np += n;
2114a238c70SJohn Marino 
2124a238c70SJohn Marino   if ((qh = (mpn_cmp (np, dp, n) >= 0)))
2134a238c70SJohn Marino     mpn_sub_n (np, np, dp, n);
2144a238c70SJohn Marino 
2154a238c70SJohn Marino   /* now {np, n} is less than D={dp, n}, which implies np[n-1] <= dp[n-1] */
2164a238c70SJohn Marino 
2174a238c70SJohn Marino   d1 = dp[n - 1];
2184a238c70SJohn Marino 
2194a238c70SJohn Marino   if (n == 1)
2204a238c70SJohn Marino     {
2214a238c70SJohn Marino       invert_limb (dinv, d1);
2224a238c70SJohn Marino       umul_ppmm (q1, q0, np[0], dinv);
2234a238c70SJohn Marino       qp[0] = np[0] + q1;
2244a238c70SJohn Marino       return qh;
2254a238c70SJohn Marino     }
2264a238c70SJohn Marino 
2274a238c70SJohn Marino   /* now n >= 2 */
2284a238c70SJohn Marino   d0 = dp[n - 2];
2294a238c70SJohn Marino   invert_pi1 (dinv2, d1, d0);
2304a238c70SJohn Marino   /* dinv2.inv32 = floor ((B^3 - 1) / (d0 + d1 B)) - B */
2314a238c70SJohn Marino   while (n > 1)
2324a238c70SJohn Marino     {
2334a238c70SJohn Marino       /* Invariant: it remains to reduce n limbs from N (in addition to the
2344a238c70SJohn Marino          initial low n limbs).
2354a238c70SJohn Marino          Since n >= 2 here, necessarily we had n >= 2 initially, which means
2364a238c70SJohn Marino          that in addition to the limb np[n-1] to reduce, we have at least 2
2374a238c70SJohn Marino          extra limbs, thus accessing np[n-3] is valid. */
2384a238c70SJohn Marino 
2394a238c70SJohn Marino       /* warning: we can have np[n-1]=d1 and np[n-2]=d0, but since {np,n} < D,
2404a238c70SJohn Marino          the largest possible partial quotient is B-1 */
2414a238c70SJohn Marino       if (MPFR_UNLIKELY(np[n - 1] == d1 && np[n - 2] == d0))
2424a238c70SJohn Marino         q2 = ~ (mp_limb_t) 0;
2434a238c70SJohn Marino       else
2444a238c70SJohn Marino         udiv_qr_3by2 (q2, q1, q0, np[n - 1], np[n - 2], np[n - 3],
2454a238c70SJohn Marino                       d1, d0, dinv2.inv32);
2464a238c70SJohn Marino       /* since q2 = floor((np[n-1]*B^2+np[n-2]*B+np[n-3])/(d1*B+d0)),
2474a238c70SJohn Marino          we have q2 <= (np[n-1]*B^2+np[n-2]*B+np[n-3])/(d1*B+d0),
2484a238c70SJohn Marino          thus np[n-1]*B^2+np[n-2]*B+np[n-3] >= q2*(d1*B+d0)
2494a238c70SJohn Marino          and {np-1, n} >= q2*D - q2*B^(n-2) >= q2*D - B^(n-1)
2504a238c70SJohn Marino          thus {np-1, n} - (q2-1)*D >= D - B^(n-1) >= 0
2514a238c70SJohn Marino          which proves that at most one correction is needed */
2524a238c70SJohn Marino       q0 = mpn_submul_1 (np - 1, dp, n, q2);
2534a238c70SJohn Marino       if (MPFR_UNLIKELY(q0 > np[n - 1]))
2544a238c70SJohn Marino         {
2554a238c70SJohn Marino           mpn_add_n (np - 1, np - 1, dp, n);
2564a238c70SJohn Marino           q2 --;
2574a238c70SJohn Marino         }
2584a238c70SJohn Marino       qp[--n] = q2;
2594a238c70SJohn Marino       dp ++;
2604a238c70SJohn Marino     }
2614a238c70SJohn Marino 
2624a238c70SJohn Marino   /* we have B+dinv2 = floor((B^3-1)/(d1*B+d0)) < B^2/d1
2634a238c70SJohn Marino      q1 = floor(np[0]*(B+dinv2)/B) <= floor(np[0]*B/d1)
2644a238c70SJohn Marino         <= floor((np[0]*B+np[1])/d1)
2654a238c70SJohn Marino      thus q1 is not larger than the true quotient.
2664a238c70SJohn Marino      q1 > np[0]*(B+dinv2)/B - 1 > np[0]*(B^3-1)/(d1*B+d0)/B - 2
2674a238c70SJohn Marino      For d1*B+d0 <> B^2/2, we have B+dinv2 = floor(B^3/(d1*B+d0))
2684a238c70SJohn Marino      thus q1 > np[0]*B^2/(d1*B+d0) - 2, i.e.,
2694a238c70SJohn Marino      (d1*B+d0)*q1 > np[0]*B^2 - 2*(d1*B+d0)
2704a238c70SJohn Marino      d1*B*q1 > np[0]*B^2 - 2*d1*B - 2*d0 - d0*q1 >= np[0]*B^2 - 2*d1*B - B^2
2714a238c70SJohn Marino      thus q1 > np[0]*B/d1 - 2 - B/d1 > np[0]*B/d1 - 4.
2724a238c70SJohn Marino 
2734a238c70SJohn Marino      For d1*B+d0 = B^2/2, dinv2 = B-1 thus q1 > np[0]*(2B-1)/B - 1 >
2744a238c70SJohn Marino      np[0]*B/d1 - 2.
2754a238c70SJohn Marino 
2764a238c70SJohn Marino      In all cases, if q = floor((np[0]*B+np[1])/d1), we have:
2774a238c70SJohn Marino      q - 4 <= q1 <= q
2784a238c70SJohn Marino   */
2794a238c70SJohn Marino   umul_ppmm (q1, q0, np[0], dinv2.inv32);
2804a238c70SJohn Marino   qp[0] = np[0] + q1;
2814a238c70SJohn Marino 
2824a238c70SJohn Marino   return qh;
2834a238c70SJohn Marino }
2844a238c70SJohn Marino #endif
2854a238c70SJohn Marino 
2864a238c70SJohn Marino /* Put in {qp, n} an approximation of N={np, 2*n} divided by D={dp, n},
2874a238c70SJohn Marino    with the most significant limb of the quotient as return value (0 or 1).
2884a238c70SJohn Marino    Assumes the most significant bit of D is set. Clobbers N.
2894a238c70SJohn Marino 
2904a238c70SJohn Marino    This implements the ShortDiv algorithm from reference [1].
2914a238c70SJohn Marino */
2924a238c70SJohn Marino #if 1
2934a238c70SJohn Marino mp_limb_t
mpfr_divhigh_n(mpfr_limb_ptr qp,mpfr_limb_ptr np,mpfr_limb_ptr dp,mp_size_t n)2944a238c70SJohn Marino mpfr_divhigh_n (mpfr_limb_ptr qp, mpfr_limb_ptr np, mpfr_limb_ptr dp,
2954a238c70SJohn Marino                 mp_size_t n)
2964a238c70SJohn Marino {
2974a238c70SJohn Marino   mp_size_t k, l;
2984a238c70SJohn Marino   mp_limb_t qh, cy;
2994a238c70SJohn Marino   mpfr_limb_ptr tp;
3004a238c70SJohn Marino   MPFR_TMP_DECL(marker);
3014a238c70SJohn Marino 
3024a238c70SJohn Marino   MPFR_ASSERTN (MPFR_MULHIGH_TAB_SIZE >= 15); /* so that 2*(n/3) >= (n+4)/2 */
3034a238c70SJohn Marino   k = MPFR_LIKELY (n < MPFR_DIVHIGH_TAB_SIZE) ? divhigh_ktab[n] : 2*(n/3);
3044a238c70SJohn Marino 
3054a238c70SJohn Marino   if (k == 0)
3064a238c70SJohn Marino #if defined(WANT_GMP_INTERNALS) && defined(HAVE___GMPN_SBPI1_DIVAPPR_Q)
3074a238c70SJohn Marino   {
3084a238c70SJohn Marino     mpfr_pi1_t dinv2;
3094a238c70SJohn Marino     invert_pi1 (dinv2, dp[n - 1], dp[n - 2]);
3104a238c70SJohn Marino     return __gmpn_sbpi1_divappr_q (qp, np, n + n, dp, n, dinv2.inv32);
3114a238c70SJohn Marino   }
3124a238c70SJohn Marino #else /* use our own code for base-case short division */
3134a238c70SJohn Marino     return mpfr_divhigh_n_basecase (qp, np, dp, n);
3144a238c70SJohn Marino #endif
3154a238c70SJohn Marino   else if (k == n)
3164a238c70SJohn Marino     /* for k=n, we use a division with remainder (mpn_divrem),
3174a238c70SJohn Marino      which computes the exact quotient */
3184a238c70SJohn Marino     return mpn_divrem (qp, 0, np, 2 * n, dp, n);
3194a238c70SJohn Marino 
3204a238c70SJohn Marino   MPFR_ASSERTD ((n+4)/2 <= k && k < n); /* bounds from [1] */
3214a238c70SJohn Marino   MPFR_TMP_MARK (marker);
3224a238c70SJohn Marino   l = n - k;
3234a238c70SJohn Marino   /* first divide the most significant 2k limbs from N by the most significant
3244a238c70SJohn Marino      k limbs of D */
3254a238c70SJohn Marino   qh = mpn_divrem (qp + l, 0, np + 2 * l, 2 * k, dp + l, k); /* exact */
3264a238c70SJohn Marino 
3274a238c70SJohn Marino   /* it remains {np,2l+k} = {np,n+l} as remainder */
3284a238c70SJohn Marino 
3294a238c70SJohn Marino   /* now we have to subtract high(Q1)*D0 where Q1=qh*B^k+{qp+l,k} and
3304a238c70SJohn Marino      D0={dp,l} */
3314a238c70SJohn Marino   tp = MPFR_TMP_LIMBS_ALLOC (2 * l);
3324a238c70SJohn Marino   mpfr_mulhigh_n (tp, qp + k, dp, l);
3334a238c70SJohn Marino   /* we are only interested in the upper l limbs from {tp,2l} */
3344a238c70SJohn Marino   cy = mpn_sub_n (np + n, np + n, tp + l, l);
3354a238c70SJohn Marino   if (qh)
3364a238c70SJohn Marino     cy += mpn_sub_n (np + n, np + n, dp, l);
3374a238c70SJohn Marino   while (cy > 0) /* Q1 was too large: subtract 1 to Q1 and add D to np+l */
3384a238c70SJohn Marino     {
3394a238c70SJohn Marino       qh -= mpn_sub_1 (qp + l, qp + l, k, MPFR_LIMB_ONE);
3404a238c70SJohn Marino       cy -= mpn_add_n (np + l, np + l, dp, n);
3414a238c70SJohn Marino     }
3424a238c70SJohn Marino 
3434a238c70SJohn Marino   /* now it remains {np,n+l} to divide by D */
3444a238c70SJohn Marino   cy = mpfr_divhigh_n (qp, np + k, dp + k, l);
3454a238c70SJohn Marino   qh += mpn_add_1 (qp + l, qp + l, k, cy);
3464a238c70SJohn Marino   MPFR_TMP_FREE(marker);
3474a238c70SJohn Marino 
3484a238c70SJohn Marino   return qh;
3494a238c70SJohn Marino }
3504a238c70SJohn Marino #else /* below is the FoldDiv(K) algorithm from [1] */
3514a238c70SJohn Marino mp_limb_t
mpfr_divhigh_n(mpfr_limb_ptr qp,mpfr_limb_ptr np,mpfr_limb_ptr dp,mp_size_t n)3524a238c70SJohn Marino mpfr_divhigh_n (mpfr_limb_ptr qp, mpfr_limb_ptr np, mpfr_limb_ptr dp,
3534a238c70SJohn Marino                 mp_size_t n)
3544a238c70SJohn Marino {
3554a238c70SJohn Marino   mp_size_t k, r;
3564a238c70SJohn Marino   mpfr_limb_ptr ip, tp, up;
3574a238c70SJohn Marino   mp_limb_t qh = 0, cy, cc;
3584a238c70SJohn Marino   int count;
3594a238c70SJohn Marino   MPFR_TMP_DECL(marker);
3604a238c70SJohn Marino 
3614a238c70SJohn Marino #define K 3
3624a238c70SJohn Marino   if (n < K)
3634a238c70SJohn Marino     return mpn_divrem (qp, 0, np, 2 * n, dp, n);
3644a238c70SJohn Marino 
3654a238c70SJohn Marino   k = (n - 1) / K + 1; /* ceil(n/K) */
3664a238c70SJohn Marino 
3674a238c70SJohn Marino   MPFR_TMP_MARK (marker);
3684a238c70SJohn Marino   ip = MPFR_TMP_LIMBS_ALLOC (k + 1);
3694a238c70SJohn Marino   tp = MPFR_TMP_LIMBS_ALLOC (n + k);
3704a238c70SJohn Marino   up = MPFR_TMP_LIMBS_ALLOC (2 * (k + 1));
3714a238c70SJohn Marino   mpn_invert (ip, dp + n - (k + 1), k + 1, NULL); /* takes about 13% for n=1000 */
3724a238c70SJohn Marino   /* {ip, k+1} = floor((B^(2k+2)-1)/D - B^(k+1) where D = {dp+n-(k+1),k+1} */
3734a238c70SJohn Marino   for (r = n, cc = 0UL; r > 0;)
3744a238c70SJohn Marino     {
3754a238c70SJohn Marino       /* cc is the carry at np[n+r] */
3764a238c70SJohn Marino       MPFR_ASSERTD(cc <= 1);
3774a238c70SJohn Marino       /* FIXME: why can we have cc as large as say 8? */
3784a238c70SJohn Marino       count = 0;
3794a238c70SJohn Marino       while (cc > 0)
3804a238c70SJohn Marino         {
3814a238c70SJohn Marino           count ++;
3824a238c70SJohn Marino           MPFR_ASSERTD(count <= 1);
3834a238c70SJohn Marino           /* subtract {dp+n-r,r} from {np+n,r} */
3844a238c70SJohn Marino           cc -= mpn_sub_n (np + n, np + n, dp + n - r, r);
3854a238c70SJohn Marino           /* add 1 at qp[r] */
3864a238c70SJohn Marino           qh += mpn_add_1 (qp + r, qp + r, n - r, 1UL);
3874a238c70SJohn Marino         }
3884a238c70SJohn Marino       /* it remains r limbs to reduce, i.e., the remainder is {np, n+r} */
3894a238c70SJohn Marino       if (r < k)
3904a238c70SJohn Marino         {
3914a238c70SJohn Marino           ip += k - r;
3924a238c70SJohn Marino           k = r;
3934a238c70SJohn Marino         }
3944a238c70SJohn Marino       /* now r >= k */
3954a238c70SJohn Marino       /* qp + r - 2 * k -> up */
3964a238c70SJohn Marino       mpfr_mulhigh_n (up, np + n + r - (k + 1), ip, k + 1);
3974a238c70SJohn Marino       /* take into account the term B^k in the inverse: B^k * {np+n+r-k, k} */
3984a238c70SJohn Marino       cy = mpn_add_n (qp + r - k, up + k + 2, np + n + r - k, k);
3994a238c70SJohn Marino       /* since we need only r limbs of tp (below), it suffices to consider
4004a238c70SJohn Marino          r high limbs of dp */
4014a238c70SJohn Marino       if (r > k)
4024a238c70SJohn Marino         {
4034a238c70SJohn Marino #if 0
4044a238c70SJohn Marino           mpn_mul (tp, dp + n - r, r, qp + r - k, k);
4054a238c70SJohn Marino #else  /* use a short product for the low k x k limbs */
4064a238c70SJohn Marino           /* we know the upper k limbs of the r-limb product cancel with the
4074a238c70SJohn Marino              remainder, thus we only need to compute the low r-k limbs */
4084a238c70SJohn Marino           if (r - k >= k)
4094a238c70SJohn Marino             mpn_mul (tp + k, dp + n - r + k, r - k, qp + r - k, k);
4104a238c70SJohn Marino           else /* r-k < k */
4114a238c70SJohn Marino             {
4124a238c70SJohn Marino /* #define LOW */
4134a238c70SJohn Marino #ifndef LOW
4144a238c70SJohn Marino               mpn_mul (tp + k, qp + r - k, k, dp + n - r + k, r - k);
4154a238c70SJohn Marino #else
4164a238c70SJohn Marino               mpfr_mullow_n_basecase (tp + k, qp + r - k, dp + n - r + k, r - k);
4174a238c70SJohn Marino               /* take into account qp[2r-2k] * dp[n - r + k] */
4184a238c70SJohn Marino               tp[r] += qp[2*r-2*k] * dp[n - r + k];
4194a238c70SJohn Marino #endif
4204a238c70SJohn Marino               /* tp[k..r] is filled */
4214a238c70SJohn Marino             }
4224a238c70SJohn Marino #if 0
4234a238c70SJohn Marino           mpfr_mulhigh_n (up, dp + n - r, qp + r - k, k);
4244a238c70SJohn Marino #else /* compute one more limb. FIXME: we could add one limb of dp in the
4254a238c70SJohn Marino          above, to save one mpn_addmul_1 call */
4264a238c70SJohn Marino           mpfr_mulhigh_n (up, dp + n - r, qp + r - k, k - 1); /* {up,2k-2} */
4274a238c70SJohn Marino           /* add {qp + r - k, k - 1} * dp[n-r+k-1] */
4284a238c70SJohn Marino           up[2*k-2] = mpn_addmul_1 (up + k - 1, qp + r - k, k-1, dp[n-r+k-1]);
4294a238c70SJohn Marino           /* add {dp+n-r, k} * qp[r-1] */
4304a238c70SJohn Marino           up[2*k-1] = mpn_addmul_1 (up + k - 1, dp + n - r, k, qp[r-1]);
4314a238c70SJohn Marino #endif
4324a238c70SJohn Marino #ifndef LOW
4334a238c70SJohn Marino           cc = mpn_add_n (tp + k, tp + k, up + k, k);
4344a238c70SJohn Marino           mpn_add_1 (tp + 2 * k, tp + 2 * k, r - k, cc);
4354a238c70SJohn Marino #else
4364a238c70SJohn Marino           /* update tp[k..r] */
4374a238c70SJohn Marino           if (r - k + 1 <= k)
4384a238c70SJohn Marino             mpn_add_n (tp + k, tp + k, up + k, r - k + 1);
4394a238c70SJohn Marino           else /* r - k >= k */
4404a238c70SJohn Marino             {
4414a238c70SJohn Marino               cc = mpn_add_n (tp + k, tp + k, up + k, k);
4424a238c70SJohn Marino               mpn_add_1 (tp + 2 * k, tp + 2 * k, r - 2 * k + 1, cc);
4434a238c70SJohn Marino             }
4444a238c70SJohn Marino #endif
4454a238c70SJohn Marino #endif
4464a238c70SJohn Marino         }
4474a238c70SJohn Marino       else /* last step: since we only want the quotient, no need to update,
4484a238c70SJohn Marino               just propagate the carry cy */
4494a238c70SJohn Marino         {
4504a238c70SJohn Marino           MPFR_ASSERTD(r < n);
4514a238c70SJohn Marino           if (cy > 0)
4524a238c70SJohn Marino             qh += mpn_add_1 (qp + r, qp + r, n - r, cy);
4534a238c70SJohn Marino           break;
4544a238c70SJohn Marino         }
4554a238c70SJohn Marino       /* subtract {tp, n+k} from {np+r-k, n+k}; however we only want to
4564a238c70SJohn Marino          update {np+n, n} */
4574a238c70SJohn Marino       /* we should have tp[r] = np[n+r-k] up to 1 */
4584a238c70SJohn Marino       MPFR_ASSERTD(tp[r] == np[n + r - k] || tp[r] + 1 == np[n + r - k]);
4594a238c70SJohn Marino #ifndef LOW
4604a238c70SJohn Marino       cc = mpn_sub_n (np + n - 1, np + n - 1, tp + k - 1, r + 1); /* borrow at np[n+r] */
4614a238c70SJohn Marino #else
4624a238c70SJohn Marino       cc = mpn_sub_n (np + n - 1, np + n - 1, tp + k - 1, r - k + 2);
4634a238c70SJohn Marino #endif
4644a238c70SJohn Marino       /* if cy = 1, subtract {dp, n} from {np+r, n}, thus
4654a238c70SJohn Marino          {dp+n-r,r} from {np+n,r} */
4664a238c70SJohn Marino       if (cy)
4674a238c70SJohn Marino         {
4684a238c70SJohn Marino           if (r < n)
4694a238c70SJohn Marino             cc += mpn_sub_n (np + n - 1, np + n - 1, dp + n - r - 1, r + 1);
4704a238c70SJohn Marino           else
4714a238c70SJohn Marino             cc += mpn_sub_n (np + n, np + n, dp + n - r, r);
4724a238c70SJohn Marino           /* propagate cy */
4734a238c70SJohn Marino           if (r == n)
4744a238c70SJohn Marino             qh = cy;
4754a238c70SJohn Marino           else
4764a238c70SJohn Marino             qh += mpn_add_1 (qp + r, qp + r, n - r, cy);
4774a238c70SJohn Marino         }
4784a238c70SJohn Marino       /* cc is the borrow at np[n+r] */
4794a238c70SJohn Marino       count = 0;
4804a238c70SJohn Marino       while (cc > 0) /* quotient was too large */
4814a238c70SJohn Marino         {
4824a238c70SJohn Marino           count++;
4834a238c70SJohn Marino           MPFR_ASSERTD (count <= 1);
4844a238c70SJohn Marino           cy = mpn_add_n (np + n, np + n, dp + n - (r - k), r - k);
4854a238c70SJohn Marino           cc -= mpn_add_1 (np + n + r - k, np + n + r - k, k, cy);
4864a238c70SJohn Marino           qh -= mpn_sub_1 (qp + r - k, qp + r - k, n - (r - k), 1UL);
4874a238c70SJohn Marino         }
4884a238c70SJohn Marino       r -= k;
4894a238c70SJohn Marino       cc = np[n + r];
4904a238c70SJohn Marino     }
4914a238c70SJohn Marino   MPFR_TMP_FREE(marker);
4924a238c70SJohn Marino 
4934a238c70SJohn Marino   return qh;
4944a238c70SJohn Marino }
4954a238c70SJohn Marino #endif
496