xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/generic/toom_interpolate_8pts.c (revision 72c7faa4dbb41dbb0238d6b4a109da0d4b236dd4)
151c586b8Smrg /* mpn_toom_interpolate_8pts -- Interpolate for toom54, 63, 72.
251c586b8Smrg 
351c586b8Smrg    Contributed to the GNU project by Marco Bodrato.
451c586b8Smrg 
551c586b8Smrg    THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
651c586b8Smrg    SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
751c586b8Smrg    GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
851c586b8Smrg 
9dab47db4Smrg Copyright 2009, 2011, 2012 Free Software Foundation, Inc.
1051c586b8Smrg 
1151c586b8Smrg This file is part of the GNU MP Library.
1251c586b8Smrg 
1351c586b8Smrg The GNU MP Library is free software; you can redistribute it and/or modify
14ce543368Smrg it under the terms of either:
15ce543368Smrg 
16ce543368Smrg   * the GNU Lesser General Public License as published by the Free
17ce543368Smrg     Software Foundation; either version 3 of the License, or (at your
1851c586b8Smrg     option) any later version.
1951c586b8Smrg 
20ce543368Smrg or
21ce543368Smrg 
22ce543368Smrg   * the GNU General Public License as published by the Free Software
23ce543368Smrg     Foundation; either version 2 of the License, or (at your option) any
24ce543368Smrg     later version.
25ce543368Smrg 
26ce543368Smrg or both in parallel, as here.
27ce543368Smrg 
2851c586b8Smrg The GNU MP Library is distributed in the hope that it will be useful, but
2951c586b8Smrg WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
30ce543368Smrg or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
31ce543368Smrg for more details.
3251c586b8Smrg 
33ce543368Smrg You should have received copies of the GNU General Public License and the
34ce543368Smrg GNU Lesser General Public License along with the GNU MP Library.  If not,
35ce543368Smrg see https://www.gnu.org/licenses/.  */
3651c586b8Smrg 
3751c586b8Smrg #include "gmp-impl.h"
3851c586b8Smrg 
3951c586b8Smrg #define BINVERT_3 MODLIMB_INVERSE_3
4051c586b8Smrg 
4151c586b8Smrg #define BINVERT_15 \
4251c586b8Smrg   ((((GMP_NUMB_MAX >> (GMP_NUMB_BITS % 4)) / 15) * 14 * 16 & GMP_NUMB_MAX) + 15)
4351c586b8Smrg 
4451c586b8Smrg #define BINVERT_45 ((BINVERT_15 * BINVERT_3) & GMP_NUMB_MASK)
4551c586b8Smrg 
4651c586b8Smrg #ifndef mpn_divexact_by3
4751c586b8Smrg #if HAVE_NATIVE_mpn_pi1_bdiv_q_1
4851c586b8Smrg #define mpn_divexact_by3(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,3,BINVERT_3,0)
4951c586b8Smrg #else
5051c586b8Smrg #define mpn_divexact_by3(dst,src,size) mpn_divexact_1(dst,src,size,3)
5151c586b8Smrg #endif
5251c586b8Smrg #endif
5351c586b8Smrg 
5451c586b8Smrg #ifndef mpn_divexact_by45
5551c586b8Smrg #if GMP_NUMB_BITS % 12 == 0
5651c586b8Smrg #define mpn_divexact_by45(dst,src,size) \
5751c586b8Smrg   (63 & 19 * mpn_bdiv_dbm1 (dst, src, size, __GMP_CAST (mp_limb_t, GMP_NUMB_MASK / 45)))
5851c586b8Smrg #else
5951c586b8Smrg #if HAVE_NATIVE_mpn_pi1_bdiv_q_1
6051c586b8Smrg #define mpn_divexact_by45(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,45,BINVERT_45,0)
6151c586b8Smrg #else
6251c586b8Smrg #define mpn_divexact_by45(dst,src,size) mpn_divexact_1(dst,src,size,45)
6351c586b8Smrg #endif
6451c586b8Smrg #endif
6551c586b8Smrg #endif
6651c586b8Smrg 
67dab47db4Smrg #if HAVE_NATIVE_mpn_sublsh2_n_ip1
68dab47db4Smrg #define DO_mpn_sublsh2_n(dst,src,n,ws) mpn_sublsh2_n_ip1(dst,src,n)
69dab47db4Smrg #else
70dab47db4Smrg #define DO_mpn_sublsh2_n(dst,src,n,ws) DO_mpn_sublsh_n(dst,src,n,2,ws)
71dab47db4Smrg #endif
72dab47db4Smrg 
7351c586b8Smrg #if HAVE_NATIVE_mpn_sublsh_n
74dab47db4Smrg #define DO_mpn_sublsh_n(dst,src,n,s,ws) mpn_sublsh_n (dst,dst,src,n,s)
7551c586b8Smrg #else
7651c586b8Smrg static mp_limb_t
DO_mpn_sublsh_n(mp_ptr dst,mp_srcptr src,mp_size_t n,unsigned int s,mp_ptr ws)7751c586b8Smrg DO_mpn_sublsh_n (mp_ptr dst, mp_srcptr src, mp_size_t n, unsigned int s, mp_ptr ws)
7851c586b8Smrg {
79dab47db4Smrg #if USE_MUL_1 && 0
8051c586b8Smrg   return mpn_submul_1(dst,src,n,CNST_LIMB(1) <<(s));
8151c586b8Smrg #else
8251c586b8Smrg   mp_limb_t __cy;
8351c586b8Smrg   __cy = mpn_lshift (ws,src,n,s);
8451c586b8Smrg   return __cy + mpn_sub_n (dst,dst,ws,n);
8551c586b8Smrg #endif
8651c586b8Smrg }
8751c586b8Smrg #endif
8851c586b8Smrg 
8951c586b8Smrg 
9051c586b8Smrg #if HAVE_NATIVE_mpn_subrsh
9151c586b8Smrg #define DO_mpn_subrsh(dst,nd,src,ns,s,ws) mpn_subrsh (dst,nd,src,ns,s)
9251c586b8Smrg #else
9351c586b8Smrg /* This is not a correct definition, it assumes no carry */
9451c586b8Smrg #define DO_mpn_subrsh(dst,nd,src,ns,s,ws)				\
9551c586b8Smrg do {									\
9651c586b8Smrg   mp_limb_t __cy;							\
9751c586b8Smrg   MPN_DECR_U (dst, nd, src[0] >> s);					\
9851c586b8Smrg   __cy = DO_mpn_sublsh_n (dst, src + 1, ns - 1, GMP_NUMB_BITS - s, ws);	\
9951c586b8Smrg   MPN_DECR_U (dst + ns - 1, nd - ns + 1, __cy);				\
10051c586b8Smrg } while (0)
10151c586b8Smrg #endif
10251c586b8Smrg 
10351c586b8Smrg /* Interpolation for Toom-4.5 (or Toom-4), using the evaluation
10451c586b8Smrg    points: infinity(4.5 only), 4, -4, 2, -2, 1, -1, 0. More precisely,
10551c586b8Smrg    we want to compute f(2^(GMP_NUMB_BITS * n)) for a polynomial f of
10651c586b8Smrg    degree 7 (or 6), given the 8 (rsp. 7) values:
10751c586b8Smrg 
10851c586b8Smrg      r1 = limit at infinity of f(x) / x^7,
10951c586b8Smrg      r2 = f(4),
11051c586b8Smrg      r3 = f(-4),
11151c586b8Smrg      r4 = f(2),
11251c586b8Smrg      r5 = f(-2),
11351c586b8Smrg      r6 = f(1),
11451c586b8Smrg      r7 = f(-1),
11551c586b8Smrg      r8 = f(0).
11651c586b8Smrg 
11751c586b8Smrg    All couples of the form f(n),f(-n) must be already mixed with
11851c586b8Smrg    toom_couple_handling(f(n),...,f(-n),...)
11951c586b8Smrg 
12051c586b8Smrg    The result is stored in {pp, spt + 7*n (or 6*n)}.
12151c586b8Smrg    At entry, r8 is stored at {pp, 2n},
12251c586b8Smrg    r5 is stored at {pp + 3n, 3n + 1}.
12351c586b8Smrg 
12451c586b8Smrg    The other values are 2n+... limbs each (with most significant limbs small).
12551c586b8Smrg 
12651c586b8Smrg    All intermediate results are positive.
12751c586b8Smrg    Inputs are destroyed.
12851c586b8Smrg */
12951c586b8Smrg 
13051c586b8Smrg void
mpn_toom_interpolate_8pts(mp_ptr pp,mp_size_t n,mp_ptr r3,mp_ptr r7,mp_size_t spt,mp_ptr ws)13151c586b8Smrg mpn_toom_interpolate_8pts (mp_ptr pp, mp_size_t n,
13251c586b8Smrg 			   mp_ptr r3, mp_ptr r7,
13351c586b8Smrg 			   mp_size_t spt, mp_ptr ws)
13451c586b8Smrg {
13551c586b8Smrg   mp_limb_signed_t cy;
13651c586b8Smrg   mp_ptr r5, r1;
13751c586b8Smrg   r5 = (pp + 3 * n);			/* 3n+1 */
13851c586b8Smrg   r1 = (pp + 7 * n);			/* spt */
13951c586b8Smrg 
14051c586b8Smrg   /******************************* interpolation *****************************/
14151c586b8Smrg 
14251c586b8Smrg   DO_mpn_subrsh(r3+n, 2 * n + 1, pp, 2 * n, 4, ws);
14351c586b8Smrg   cy = DO_mpn_sublsh_n (r3, r1, spt, 12, ws);
14451c586b8Smrg   MPN_DECR_U (r3 + spt, 3 * n + 1 - spt, cy);
14551c586b8Smrg 
14651c586b8Smrg   DO_mpn_subrsh(r5+n, 2 * n + 1, pp, 2 * n, 2, ws);
14751c586b8Smrg   cy = DO_mpn_sublsh_n (r5, r1, spt, 6, ws);
14851c586b8Smrg   MPN_DECR_U (r5 + spt, 3 * n + 1 - spt, cy);
14951c586b8Smrg 
15051c586b8Smrg   r7[3*n] -= mpn_sub_n (r7+n, r7+n, pp, 2 * n);
15151c586b8Smrg   cy = mpn_sub_n (r7, r7, r1, spt);
15251c586b8Smrg   MPN_DECR_U (r7 + spt, 3 * n + 1 - spt, cy);
15351c586b8Smrg 
15451c586b8Smrg   ASSERT_NOCARRY(mpn_sub_n (r3, r3, r5, 3 * n + 1));
15551c586b8Smrg   ASSERT_NOCARRY(mpn_rshift(r3, r3, 3 * n + 1, 2));
15651c586b8Smrg 
15751c586b8Smrg   ASSERT_NOCARRY(mpn_sub_n (r5, r5, r7, 3 * n + 1));
15851c586b8Smrg 
15951c586b8Smrg   ASSERT_NOCARRY(mpn_sub_n (r3, r3, r5, 3 * n + 1));
16051c586b8Smrg 
16151c586b8Smrg   mpn_divexact_by45 (r3, r3, 3 * n + 1);
16251c586b8Smrg 
16351c586b8Smrg   ASSERT_NOCARRY(mpn_divexact_by3 (r5, r5, 3 * n + 1));
16451c586b8Smrg 
165dab47db4Smrg   ASSERT_NOCARRY(DO_mpn_sublsh2_n (r5, r3, 3 * n + 1, ws));
16651c586b8Smrg 
16751c586b8Smrg   /* last interpolation steps... */
16851c586b8Smrg   /* ... are mixed with recomposition */
16951c586b8Smrg 
17051c586b8Smrg   /***************************** recomposition *******************************/
17151c586b8Smrg   /*
17251c586b8Smrg     pp[] prior to operations:
17351c586b8Smrg      |_H r1|_L r1|____||_H r5|_M_r5|_L r5|_____|_H r8|_L r8|pp
17451c586b8Smrg 
17551c586b8Smrg     summation scheme for remaining operations:
17651c586b8Smrg      |____8|n___7|n___6|n___5|n___4|n___3|n___2|n____|n____|pp
17751c586b8Smrg      |_H r1|_L r1|____||_H*r5|_M r5|_L r5|_____|_H_r8|_L r8|pp
17851c586b8Smrg 	  ||_H r3|_M r3|_L*r3|
17951c586b8Smrg 				  ||_H_r7|_M_r7|_L_r7|
18051c586b8Smrg 		      ||-H r3|-M r3|-L*r3|
18151c586b8Smrg 				  ||-H*r5|-M_r5|-L_r5|
18251c586b8Smrg   */
18351c586b8Smrg 
18451c586b8Smrg   cy = mpn_add_n (pp + n, pp + n, r7, n); /* Hr8+Lr7-Lr5 */
18551c586b8Smrg   cy-= mpn_sub_n (pp + n, pp + n, r5, n);
186*72c7faa4Smrg   if (cy > 0) {
187*72c7faa4Smrg     MPN_INCR_U (r7 + n, 2*n + 1, 1);
188*72c7faa4Smrg     cy = 0;
189*72c7faa4Smrg   }
19051c586b8Smrg 
191*72c7faa4Smrg   cy = mpn_sub_nc (pp + 2*n, r7 + n, r5 + n, n, -cy); /* Mr7-Mr5 */
19251c586b8Smrg   MPN_DECR_U (r7 + 2*n, n + 1, cy);
19351c586b8Smrg 
19451c586b8Smrg   cy = mpn_add_n (pp + 3*n, r5, r7+ 2*n, n+1); /* Hr7+Lr5 */
19551c586b8Smrg   r5[3*n]+= mpn_add_n (r5 + 2*n, r5 + 2*n, r3, n); /* Hr5+Lr3 */
19651c586b8Smrg   cy-= mpn_sub_n (pp + 3*n, pp + 3*n, r5 + 2*n, n+1); /* Hr7-Hr5+Lr5-Lr3 */
19751c586b8Smrg   if (UNLIKELY(0 > cy))
19851c586b8Smrg     MPN_DECR_U (r5 + n + 1, 2*n, 1);
19951c586b8Smrg   else
20051c586b8Smrg     MPN_INCR_U (r5 + n + 1, 2*n, cy);
20151c586b8Smrg 
20251c586b8Smrg   ASSERT_NOCARRY(mpn_sub_n(pp + 4*n, r5 + n, r3 + n, 2*n +1)); /* Mr5-Mr3,Hr5-Hr3 */
20351c586b8Smrg 
20451c586b8Smrg   cy = mpn_add_1 (pp + 6*n, r3 + n, n, pp[6*n]);
20551c586b8Smrg   MPN_INCR_U (r3 + 2*n, n + 1, cy);
206dab47db4Smrg   cy = mpn_add_n (pp + 7*n, pp + 7*n, r3 + 2*n, n);
207dab47db4Smrg   if (LIKELY(spt != n))
208dab47db4Smrg     MPN_INCR_U (pp + 8*n, spt - n, cy + r3[3*n]);
209dab47db4Smrg   else
210ce543368Smrg     ASSERT (r3[3*n] + cy == 0);
21151c586b8Smrg }
212