151c586b8Smrg /* mpn_toom_interpolate_8pts -- Interpolate for toom54, 63, 72.
251c586b8Smrg
351c586b8Smrg Contributed to the GNU project by Marco Bodrato.
451c586b8Smrg
551c586b8Smrg THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY
651c586b8Smrg SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
751c586b8Smrg GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
851c586b8Smrg
9dab47db4Smrg Copyright 2009, 2011, 2012 Free Software Foundation, Inc.
1051c586b8Smrg
1151c586b8Smrg This file is part of the GNU MP Library.
1251c586b8Smrg
1351c586b8Smrg The GNU MP Library is free software; you can redistribute it and/or modify
14ce543368Smrg it under the terms of either:
15ce543368Smrg
16ce543368Smrg * the GNU Lesser General Public License as published by the Free
17ce543368Smrg Software Foundation; either version 3 of the License, or (at your
1851c586b8Smrg option) any later version.
1951c586b8Smrg
20ce543368Smrg or
21ce543368Smrg
22ce543368Smrg * the GNU General Public License as published by the Free Software
23ce543368Smrg Foundation; either version 2 of the License, or (at your option) any
24ce543368Smrg later version.
25ce543368Smrg
26ce543368Smrg or both in parallel, as here.
27ce543368Smrg
2851c586b8Smrg The GNU MP Library is distributed in the hope that it will be useful, but
2951c586b8Smrg WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
30ce543368Smrg or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
31ce543368Smrg for more details.
3251c586b8Smrg
33ce543368Smrg You should have received copies of the GNU General Public License and the
34ce543368Smrg GNU Lesser General Public License along with the GNU MP Library. If not,
35ce543368Smrg see https://www.gnu.org/licenses/. */
3651c586b8Smrg
3751c586b8Smrg #include "gmp-impl.h"
3851c586b8Smrg
3951c586b8Smrg #define BINVERT_3 MODLIMB_INVERSE_3
4051c586b8Smrg
4151c586b8Smrg #define BINVERT_15 \
4251c586b8Smrg ((((GMP_NUMB_MAX >> (GMP_NUMB_BITS % 4)) / 15) * 14 * 16 & GMP_NUMB_MAX) + 15)
4351c586b8Smrg
4451c586b8Smrg #define BINVERT_45 ((BINVERT_15 * BINVERT_3) & GMP_NUMB_MASK)
4551c586b8Smrg
4651c586b8Smrg #ifndef mpn_divexact_by3
4751c586b8Smrg #if HAVE_NATIVE_mpn_pi1_bdiv_q_1
4851c586b8Smrg #define mpn_divexact_by3(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,3,BINVERT_3,0)
4951c586b8Smrg #else
5051c586b8Smrg #define mpn_divexact_by3(dst,src,size) mpn_divexact_1(dst,src,size,3)
5151c586b8Smrg #endif
5251c586b8Smrg #endif
5351c586b8Smrg
5451c586b8Smrg #ifndef mpn_divexact_by45
5551c586b8Smrg #if GMP_NUMB_BITS % 12 == 0
5651c586b8Smrg #define mpn_divexact_by45(dst,src,size) \
5751c586b8Smrg (63 & 19 * mpn_bdiv_dbm1 (dst, src, size, __GMP_CAST (mp_limb_t, GMP_NUMB_MASK / 45)))
5851c586b8Smrg #else
5951c586b8Smrg #if HAVE_NATIVE_mpn_pi1_bdiv_q_1
6051c586b8Smrg #define mpn_divexact_by45(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,45,BINVERT_45,0)
6151c586b8Smrg #else
6251c586b8Smrg #define mpn_divexact_by45(dst,src,size) mpn_divexact_1(dst,src,size,45)
6351c586b8Smrg #endif
6451c586b8Smrg #endif
6551c586b8Smrg #endif
6651c586b8Smrg
67dab47db4Smrg #if HAVE_NATIVE_mpn_sublsh2_n_ip1
68dab47db4Smrg #define DO_mpn_sublsh2_n(dst,src,n,ws) mpn_sublsh2_n_ip1(dst,src,n)
69dab47db4Smrg #else
70dab47db4Smrg #define DO_mpn_sublsh2_n(dst,src,n,ws) DO_mpn_sublsh_n(dst,src,n,2,ws)
71dab47db4Smrg #endif
72dab47db4Smrg
7351c586b8Smrg #if HAVE_NATIVE_mpn_sublsh_n
74dab47db4Smrg #define DO_mpn_sublsh_n(dst,src,n,s,ws) mpn_sublsh_n (dst,dst,src,n,s)
7551c586b8Smrg #else
7651c586b8Smrg static mp_limb_t
DO_mpn_sublsh_n(mp_ptr dst,mp_srcptr src,mp_size_t n,unsigned int s,mp_ptr ws)7751c586b8Smrg DO_mpn_sublsh_n (mp_ptr dst, mp_srcptr src, mp_size_t n, unsigned int s, mp_ptr ws)
7851c586b8Smrg {
79dab47db4Smrg #if USE_MUL_1 && 0
8051c586b8Smrg return mpn_submul_1(dst,src,n,CNST_LIMB(1) <<(s));
8151c586b8Smrg #else
8251c586b8Smrg mp_limb_t __cy;
8351c586b8Smrg __cy = mpn_lshift (ws,src,n,s);
8451c586b8Smrg return __cy + mpn_sub_n (dst,dst,ws,n);
8551c586b8Smrg #endif
8651c586b8Smrg }
8751c586b8Smrg #endif
8851c586b8Smrg
8951c586b8Smrg
9051c586b8Smrg #if HAVE_NATIVE_mpn_subrsh
9151c586b8Smrg #define DO_mpn_subrsh(dst,nd,src,ns,s,ws) mpn_subrsh (dst,nd,src,ns,s)
9251c586b8Smrg #else
9351c586b8Smrg /* This is not a correct definition, it assumes no carry */
9451c586b8Smrg #define DO_mpn_subrsh(dst,nd,src,ns,s,ws) \
9551c586b8Smrg do { \
9651c586b8Smrg mp_limb_t __cy; \
9751c586b8Smrg MPN_DECR_U (dst, nd, src[0] >> s); \
9851c586b8Smrg __cy = DO_mpn_sublsh_n (dst, src + 1, ns - 1, GMP_NUMB_BITS - s, ws); \
9951c586b8Smrg MPN_DECR_U (dst + ns - 1, nd - ns + 1, __cy); \
10051c586b8Smrg } while (0)
10151c586b8Smrg #endif
10251c586b8Smrg
10351c586b8Smrg /* Interpolation for Toom-4.5 (or Toom-4), using the evaluation
10451c586b8Smrg points: infinity(4.5 only), 4, -4, 2, -2, 1, -1, 0. More precisely,
10551c586b8Smrg we want to compute f(2^(GMP_NUMB_BITS * n)) for a polynomial f of
10651c586b8Smrg degree 7 (or 6), given the 8 (rsp. 7) values:
10751c586b8Smrg
10851c586b8Smrg r1 = limit at infinity of f(x) / x^7,
10951c586b8Smrg r2 = f(4),
11051c586b8Smrg r3 = f(-4),
11151c586b8Smrg r4 = f(2),
11251c586b8Smrg r5 = f(-2),
11351c586b8Smrg r6 = f(1),
11451c586b8Smrg r7 = f(-1),
11551c586b8Smrg r8 = f(0).
11651c586b8Smrg
11751c586b8Smrg All couples of the form f(n),f(-n) must be already mixed with
11851c586b8Smrg toom_couple_handling(f(n),...,f(-n),...)
11951c586b8Smrg
12051c586b8Smrg The result is stored in {pp, spt + 7*n (or 6*n)}.
12151c586b8Smrg At entry, r8 is stored at {pp, 2n},
12251c586b8Smrg r5 is stored at {pp + 3n, 3n + 1}.
12351c586b8Smrg
12451c586b8Smrg The other values are 2n+... limbs each (with most significant limbs small).
12551c586b8Smrg
12651c586b8Smrg All intermediate results are positive.
12751c586b8Smrg Inputs are destroyed.
12851c586b8Smrg */
12951c586b8Smrg
13051c586b8Smrg void
mpn_toom_interpolate_8pts(mp_ptr pp,mp_size_t n,mp_ptr r3,mp_ptr r7,mp_size_t spt,mp_ptr ws)13151c586b8Smrg mpn_toom_interpolate_8pts (mp_ptr pp, mp_size_t n,
13251c586b8Smrg mp_ptr r3, mp_ptr r7,
13351c586b8Smrg mp_size_t spt, mp_ptr ws)
13451c586b8Smrg {
13551c586b8Smrg mp_limb_signed_t cy;
13651c586b8Smrg mp_ptr r5, r1;
13751c586b8Smrg r5 = (pp + 3 * n); /* 3n+1 */
13851c586b8Smrg r1 = (pp + 7 * n); /* spt */
13951c586b8Smrg
14051c586b8Smrg /******************************* interpolation *****************************/
14151c586b8Smrg
14251c586b8Smrg DO_mpn_subrsh(r3+n, 2 * n + 1, pp, 2 * n, 4, ws);
14351c586b8Smrg cy = DO_mpn_sublsh_n (r3, r1, spt, 12, ws);
14451c586b8Smrg MPN_DECR_U (r3 + spt, 3 * n + 1 - spt, cy);
14551c586b8Smrg
14651c586b8Smrg DO_mpn_subrsh(r5+n, 2 * n + 1, pp, 2 * n, 2, ws);
14751c586b8Smrg cy = DO_mpn_sublsh_n (r5, r1, spt, 6, ws);
14851c586b8Smrg MPN_DECR_U (r5 + spt, 3 * n + 1 - spt, cy);
14951c586b8Smrg
15051c586b8Smrg r7[3*n] -= mpn_sub_n (r7+n, r7+n, pp, 2 * n);
15151c586b8Smrg cy = mpn_sub_n (r7, r7, r1, spt);
15251c586b8Smrg MPN_DECR_U (r7 + spt, 3 * n + 1 - spt, cy);
15351c586b8Smrg
15451c586b8Smrg ASSERT_NOCARRY(mpn_sub_n (r3, r3, r5, 3 * n + 1));
15551c586b8Smrg ASSERT_NOCARRY(mpn_rshift(r3, r3, 3 * n + 1, 2));
15651c586b8Smrg
15751c586b8Smrg ASSERT_NOCARRY(mpn_sub_n (r5, r5, r7, 3 * n + 1));
15851c586b8Smrg
15951c586b8Smrg ASSERT_NOCARRY(mpn_sub_n (r3, r3, r5, 3 * n + 1));
16051c586b8Smrg
16151c586b8Smrg mpn_divexact_by45 (r3, r3, 3 * n + 1);
16251c586b8Smrg
16351c586b8Smrg ASSERT_NOCARRY(mpn_divexact_by3 (r5, r5, 3 * n + 1));
16451c586b8Smrg
165dab47db4Smrg ASSERT_NOCARRY(DO_mpn_sublsh2_n (r5, r3, 3 * n + 1, ws));
16651c586b8Smrg
16751c586b8Smrg /* last interpolation steps... */
16851c586b8Smrg /* ... are mixed with recomposition */
16951c586b8Smrg
17051c586b8Smrg /***************************** recomposition *******************************/
17151c586b8Smrg /*
17251c586b8Smrg pp[] prior to operations:
17351c586b8Smrg |_H r1|_L r1|____||_H r5|_M_r5|_L r5|_____|_H r8|_L r8|pp
17451c586b8Smrg
17551c586b8Smrg summation scheme for remaining operations:
17651c586b8Smrg |____8|n___7|n___6|n___5|n___4|n___3|n___2|n____|n____|pp
17751c586b8Smrg |_H r1|_L r1|____||_H*r5|_M r5|_L r5|_____|_H_r8|_L r8|pp
17851c586b8Smrg ||_H r3|_M r3|_L*r3|
17951c586b8Smrg ||_H_r7|_M_r7|_L_r7|
18051c586b8Smrg ||-H r3|-M r3|-L*r3|
18151c586b8Smrg ||-H*r5|-M_r5|-L_r5|
18251c586b8Smrg */
18351c586b8Smrg
18451c586b8Smrg cy = mpn_add_n (pp + n, pp + n, r7, n); /* Hr8+Lr7-Lr5 */
18551c586b8Smrg cy-= mpn_sub_n (pp + n, pp + n, r5, n);
186*72c7faa4Smrg if (cy > 0) {
187*72c7faa4Smrg MPN_INCR_U (r7 + n, 2*n + 1, 1);
188*72c7faa4Smrg cy = 0;
189*72c7faa4Smrg }
19051c586b8Smrg
191*72c7faa4Smrg cy = mpn_sub_nc (pp + 2*n, r7 + n, r5 + n, n, -cy); /* Mr7-Mr5 */
19251c586b8Smrg MPN_DECR_U (r7 + 2*n, n + 1, cy);
19351c586b8Smrg
19451c586b8Smrg cy = mpn_add_n (pp + 3*n, r5, r7+ 2*n, n+1); /* Hr7+Lr5 */
19551c586b8Smrg r5[3*n]+= mpn_add_n (r5 + 2*n, r5 + 2*n, r3, n); /* Hr5+Lr3 */
19651c586b8Smrg cy-= mpn_sub_n (pp + 3*n, pp + 3*n, r5 + 2*n, n+1); /* Hr7-Hr5+Lr5-Lr3 */
19751c586b8Smrg if (UNLIKELY(0 > cy))
19851c586b8Smrg MPN_DECR_U (r5 + n + 1, 2*n, 1);
19951c586b8Smrg else
20051c586b8Smrg MPN_INCR_U (r5 + n + 1, 2*n, cy);
20151c586b8Smrg
20251c586b8Smrg ASSERT_NOCARRY(mpn_sub_n(pp + 4*n, r5 + n, r3 + n, 2*n +1)); /* Mr5-Mr3,Hr5-Hr3 */
20351c586b8Smrg
20451c586b8Smrg cy = mpn_add_1 (pp + 6*n, r3 + n, n, pp[6*n]);
20551c586b8Smrg MPN_INCR_U (r3 + 2*n, n + 1, cy);
206dab47db4Smrg cy = mpn_add_n (pp + 7*n, pp + 7*n, r3 + 2*n, n);
207dab47db4Smrg if (LIKELY(spt != n))
208dab47db4Smrg MPN_INCR_U (pp + 8*n, spt - n, cy + r3[3*n]);
209dab47db4Smrg else
210ce543368Smrg ASSERT (r3[3*n] + cy == 0);
21151c586b8Smrg }
212