xref: /dflybsd-src/contrib/gmp/mpn/generic/mul_fft.c (revision 86d7f5d305c6adaa56ff4582ece9859d73106103)
186d7f5d3SJohn Marino /* Schoenhage's fast multiplication modulo 2^N+1.
286d7f5d3SJohn Marino 
386d7f5d3SJohn Marino    Contributed by Paul Zimmermann.
486d7f5d3SJohn Marino 
586d7f5d3SJohn Marino    THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
686d7f5d3SJohn Marino    SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
786d7f5d3SJohn Marino    GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
886d7f5d3SJohn Marino 
986d7f5d3SJohn Marino Copyright 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008,
1086d7f5d3SJohn Marino 2009, 2010 Free Software Foundation, Inc.
1186d7f5d3SJohn Marino 
1286d7f5d3SJohn Marino This file is part of the GNU MP Library.
1386d7f5d3SJohn Marino 
1486d7f5d3SJohn Marino The GNU MP Library is free software; you can redistribute it and/or modify
1586d7f5d3SJohn Marino it under the terms of the GNU Lesser General Public License as published by
1686d7f5d3SJohn Marino the Free Software Foundation; either version 3 of the License, or (at your
1786d7f5d3SJohn Marino option) any later version.
1886d7f5d3SJohn Marino 
1986d7f5d3SJohn Marino The GNU MP Library is distributed in the hope that it will be useful, but
2086d7f5d3SJohn Marino WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
2186d7f5d3SJohn Marino or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
2286d7f5d3SJohn Marino License for more details.
2386d7f5d3SJohn Marino 
2486d7f5d3SJohn Marino You should have received a copy of the GNU Lesser General Public License
2586d7f5d3SJohn Marino along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
2686d7f5d3SJohn Marino 
2786d7f5d3SJohn Marino 
2886d7f5d3SJohn Marino /* References:
2986d7f5d3SJohn Marino 
3086d7f5d3SJohn Marino    Schnelle Multiplikation grosser Zahlen, by Arnold Schoenhage and Volker
3186d7f5d3SJohn Marino    Strassen, Computing 7, p. 281-292, 1971.
3286d7f5d3SJohn Marino 
3386d7f5d3SJohn Marino    Asymptotically fast algorithms for the numerical multiplication and division
3486d7f5d3SJohn Marino    of polynomials with complex coefficients, by Arnold Schoenhage, Computer
3586d7f5d3SJohn Marino    Algebra, EUROCAM'82, LNCS 144, p. 3-15, 1982.
3686d7f5d3SJohn Marino 
3786d7f5d3SJohn Marino    Tapes versus Pointers, a study in implementing fast algorithms, by Arnold
3886d7f5d3SJohn Marino    Schoenhage, Bulletin of the EATCS, 30, p. 23-32, 1986.
3986d7f5d3SJohn Marino 
4086d7f5d3SJohn Marino    TODO:
4186d7f5d3SJohn Marino 
4286d7f5d3SJohn Marino    Implement some of the tricks published at ISSAC'2007 by Gaudry, Kruppa, and
4386d7f5d3SJohn Marino    Zimmermann.
4486d7f5d3SJohn Marino 
4586d7f5d3SJohn Marino    It might be possible to avoid a small number of MPN_COPYs by using a
4686d7f5d3SJohn Marino    rotating temporary or two.
4786d7f5d3SJohn Marino 
4886d7f5d3SJohn Marino    Cleanup and simplify the code!
4986d7f5d3SJohn Marino */
5086d7f5d3SJohn Marino 
5186d7f5d3SJohn Marino #ifdef TRACE
5286d7f5d3SJohn Marino #undef TRACE
5386d7f5d3SJohn Marino #define TRACE(x) x
5486d7f5d3SJohn Marino #include <stdio.h>
5586d7f5d3SJohn Marino #else
5686d7f5d3SJohn Marino #define TRACE(x)
5786d7f5d3SJohn Marino #endif
5886d7f5d3SJohn Marino 
5986d7f5d3SJohn Marino #include "gmp.h"
6086d7f5d3SJohn Marino #include "gmp-impl.h"
6186d7f5d3SJohn Marino 
6286d7f5d3SJohn Marino #ifdef WANT_ADDSUB
6386d7f5d3SJohn Marino #include "generic/add_n_sub_n.c"
6486d7f5d3SJohn Marino #define HAVE_NATIVE_mpn_add_n_sub_n 1
6586d7f5d3SJohn Marino #endif
6686d7f5d3SJohn Marino 
6786d7f5d3SJohn Marino static mp_limb_t mpn_mul_fft_internal
6886d7f5d3SJohn Marino __GMP_PROTO ((mp_ptr, mp_size_t, int, mp_ptr *, mp_ptr *,
6986d7f5d3SJohn Marino 	      mp_ptr, mp_ptr, mp_size_t, mp_size_t, mp_size_t, int **, mp_ptr, int));
7086d7f5d3SJohn Marino static void mpn_mul_fft_decompose
7186d7f5d3SJohn Marino __GMP_PROTO ((mp_ptr, mp_ptr *, int, int, mp_srcptr, mp_size_t, int, int, mp_ptr));
7286d7f5d3SJohn Marino 
7386d7f5d3SJohn Marino 
7486d7f5d3SJohn Marino /* Find the best k to use for a mod 2^(m*GMP_NUMB_BITS)+1 FFT for m >= n.
7586d7f5d3SJohn Marino    We have sqr=0 if for a multiply, sqr=1 for a square.
7686d7f5d3SJohn Marino    There are three generations of this code; we keep the old ones as long as
7786d7f5d3SJohn Marino    some gmp-mparam.h is not updated.  */
7886d7f5d3SJohn Marino 
7986d7f5d3SJohn Marino 
8086d7f5d3SJohn Marino /*****************************************************************************/
8186d7f5d3SJohn Marino 
8286d7f5d3SJohn Marino #if TUNE_PROGRAM_BUILD || (defined (MUL_FFT_TABLE3) && defined (SQR_FFT_TABLE3))
8386d7f5d3SJohn Marino 
8486d7f5d3SJohn Marino #ifndef FFT_TABLE3_SIZE		/* When tuning, this is define in gmp-impl.h */
8586d7f5d3SJohn Marino #if defined (MUL_FFT_TABLE3_SIZE) && defined (SQR_FFT_TABLE3_SIZE)
8686d7f5d3SJohn Marino #if MUL_FFT_TABLE3_SIZE > SQR_FFT_TABLE3_SIZE
8786d7f5d3SJohn Marino #define FFT_TABLE3_SIZE MUL_FFT_TABLE3_SIZE
8886d7f5d3SJohn Marino #else
8986d7f5d3SJohn Marino #define FFT_TABLE3_SIZE SQR_FFT_TABLE3_SIZE
9086d7f5d3SJohn Marino #endif
9186d7f5d3SJohn Marino #endif
9286d7f5d3SJohn Marino #endif
9386d7f5d3SJohn Marino 
9486d7f5d3SJohn Marino #ifndef FFT_TABLE3_SIZE
9586d7f5d3SJohn Marino #define FFT_TABLE3_SIZE 200
9686d7f5d3SJohn Marino #endif
9786d7f5d3SJohn Marino 
9886d7f5d3SJohn Marino FFT_TABLE_ATTRS struct fft_table_nk mpn_fft_table3[2][FFT_TABLE3_SIZE] =
9986d7f5d3SJohn Marino {
10086d7f5d3SJohn Marino   MUL_FFT_TABLE3,
10186d7f5d3SJohn Marino   SQR_FFT_TABLE3
10286d7f5d3SJohn Marino };
10386d7f5d3SJohn Marino 
10486d7f5d3SJohn Marino int
mpn_fft_best_k(mp_size_t n,int sqr)10586d7f5d3SJohn Marino mpn_fft_best_k (mp_size_t n, int sqr)
10686d7f5d3SJohn Marino {
10786d7f5d3SJohn Marino   FFT_TABLE_ATTRS struct fft_table_nk *fft_tab, *tab;
10886d7f5d3SJohn Marino   mp_size_t tab_n, thres;
10986d7f5d3SJohn Marino   int last_k;
11086d7f5d3SJohn Marino 
11186d7f5d3SJohn Marino   fft_tab = mpn_fft_table3[sqr];
11286d7f5d3SJohn Marino   last_k = fft_tab->k;
11386d7f5d3SJohn Marino   for (tab = fft_tab + 1; ; tab++)
11486d7f5d3SJohn Marino     {
11586d7f5d3SJohn Marino       tab_n = tab->n;
11686d7f5d3SJohn Marino       thres = tab_n << last_k;
11786d7f5d3SJohn Marino       if (n <= thres)
11886d7f5d3SJohn Marino 	break;
11986d7f5d3SJohn Marino       last_k = tab->k;
12086d7f5d3SJohn Marino     }
12186d7f5d3SJohn Marino   return last_k;
12286d7f5d3SJohn Marino }
12386d7f5d3SJohn Marino 
12486d7f5d3SJohn Marino #define MPN_FFT_BEST_READY 1
12586d7f5d3SJohn Marino #endif
12686d7f5d3SJohn Marino 
12786d7f5d3SJohn Marino /*****************************************************************************/
12886d7f5d3SJohn Marino 
12986d7f5d3SJohn Marino #if ! defined (MPN_FFT_BEST_READY)
13086d7f5d3SJohn Marino FFT_TABLE_ATTRS mp_size_t mpn_fft_table[2][MPN_FFT_TABLE_SIZE] =
13186d7f5d3SJohn Marino {
13286d7f5d3SJohn Marino   MUL_FFT_TABLE,
13386d7f5d3SJohn Marino   SQR_FFT_TABLE
13486d7f5d3SJohn Marino };
13586d7f5d3SJohn Marino 
13686d7f5d3SJohn Marino int
mpn_fft_best_k(mp_size_t n,int sqr)13786d7f5d3SJohn Marino mpn_fft_best_k (mp_size_t n, int sqr)
13886d7f5d3SJohn Marino {
13986d7f5d3SJohn Marino   int i;
14086d7f5d3SJohn Marino 
14186d7f5d3SJohn Marino   for (i = 0; mpn_fft_table[sqr][i] != 0; i++)
14286d7f5d3SJohn Marino     if (n < mpn_fft_table[sqr][i])
14386d7f5d3SJohn Marino       return i + FFT_FIRST_K;
14486d7f5d3SJohn Marino 
14586d7f5d3SJohn Marino   /* treat 4*last as one further entry */
14686d7f5d3SJohn Marino   if (i == 0 || n < 4 * mpn_fft_table[sqr][i - 1])
14786d7f5d3SJohn Marino     return i + FFT_FIRST_K;
14886d7f5d3SJohn Marino   else
14986d7f5d3SJohn Marino     return i + FFT_FIRST_K + 1;
15086d7f5d3SJohn Marino }
15186d7f5d3SJohn Marino #endif
15286d7f5d3SJohn Marino 
15386d7f5d3SJohn Marino /*****************************************************************************/
15486d7f5d3SJohn Marino 
15586d7f5d3SJohn Marino 
15686d7f5d3SJohn Marino /* Returns smallest possible number of limbs >= pl for a fft of size 2^k,
15786d7f5d3SJohn Marino    i.e. smallest multiple of 2^k >= pl.
15886d7f5d3SJohn Marino 
15986d7f5d3SJohn Marino    Don't declare static: needed by tuneup.
16086d7f5d3SJohn Marino */
16186d7f5d3SJohn Marino 
16286d7f5d3SJohn Marino mp_size_t
mpn_fft_next_size(mp_size_t pl,int k)16386d7f5d3SJohn Marino mpn_fft_next_size (mp_size_t pl, int k)
16486d7f5d3SJohn Marino {
16586d7f5d3SJohn Marino   pl = 1 + ((pl - 1) >> k); /* ceil (pl/2^k) */
16686d7f5d3SJohn Marino   return pl << k;
16786d7f5d3SJohn Marino }
16886d7f5d3SJohn Marino 
16986d7f5d3SJohn Marino 
17086d7f5d3SJohn Marino /* Initialize l[i][j] with bitrev(j) */
17186d7f5d3SJohn Marino static void
mpn_fft_initl(int ** l,int k)17286d7f5d3SJohn Marino mpn_fft_initl (int **l, int k)
17386d7f5d3SJohn Marino {
17486d7f5d3SJohn Marino   int i, j, K;
17586d7f5d3SJohn Marino   int *li;
17686d7f5d3SJohn Marino 
17786d7f5d3SJohn Marino   l[0][0] = 0;
17886d7f5d3SJohn Marino   for (i = 1, K = 1; i <= k; i++, K *= 2)
17986d7f5d3SJohn Marino     {
18086d7f5d3SJohn Marino       li = l[i];
18186d7f5d3SJohn Marino       for (j = 0; j < K; j++)
18286d7f5d3SJohn Marino 	{
18386d7f5d3SJohn Marino 	  li[j] = 2 * l[i - 1][j];
18486d7f5d3SJohn Marino 	  li[K + j] = 1 + li[j];
18586d7f5d3SJohn Marino 	}
18686d7f5d3SJohn Marino     }
18786d7f5d3SJohn Marino }
18886d7f5d3SJohn Marino 
18986d7f5d3SJohn Marino 
19086d7f5d3SJohn Marino /* r <- a*2^d mod 2^(n*GMP_NUMB_BITS)+1 with a = {a, n+1}
19186d7f5d3SJohn Marino    Assumes a is semi-normalized, i.e. a[n] <= 1.
19286d7f5d3SJohn Marino    r and a must have n+1 limbs, and not overlap.
19386d7f5d3SJohn Marino */
19486d7f5d3SJohn Marino static void
mpn_fft_mul_2exp_modF(mp_ptr r,mp_srcptr a,unsigned int d,mp_size_t n)19586d7f5d3SJohn Marino mpn_fft_mul_2exp_modF (mp_ptr r, mp_srcptr a, unsigned int d, mp_size_t n)
19686d7f5d3SJohn Marino {
19786d7f5d3SJohn Marino   int sh;
19886d7f5d3SJohn Marino   mp_limb_t cc, rd;
19986d7f5d3SJohn Marino 
20086d7f5d3SJohn Marino   sh = d % GMP_NUMB_BITS;
20186d7f5d3SJohn Marino   d /= GMP_NUMB_BITS;
20286d7f5d3SJohn Marino 
20386d7f5d3SJohn Marino   if (d >= n)			/* negate */
20486d7f5d3SJohn Marino     {
20586d7f5d3SJohn Marino       /* r[0..d-1]  <-- lshift(a[n-d]..a[n-1], sh)
20686d7f5d3SJohn Marino 	 r[d..n-1]  <-- -lshift(a[0]..a[n-d-1],  sh) */
20786d7f5d3SJohn Marino 
20886d7f5d3SJohn Marino       d -= n;
20986d7f5d3SJohn Marino       if (sh != 0)
21086d7f5d3SJohn Marino 	{
21186d7f5d3SJohn Marino 	  /* no out shift below since a[n] <= 1 */
21286d7f5d3SJohn Marino 	  mpn_lshift (r, a + n - d, d + 1, sh);
21386d7f5d3SJohn Marino 	  rd = r[d];
21486d7f5d3SJohn Marino 	  cc = mpn_lshiftc (r + d, a, n - d, sh);
21586d7f5d3SJohn Marino 	}
21686d7f5d3SJohn Marino       else
21786d7f5d3SJohn Marino 	{
21886d7f5d3SJohn Marino 	  MPN_COPY (r, a + n - d, d);
21986d7f5d3SJohn Marino 	  rd = a[n];
22086d7f5d3SJohn Marino 	  mpn_com (r + d, a, n - d);
22186d7f5d3SJohn Marino 	  cc = 0;
22286d7f5d3SJohn Marino 	}
22386d7f5d3SJohn Marino 
22486d7f5d3SJohn Marino       /* add cc to r[0], and add rd to r[d] */
22586d7f5d3SJohn Marino 
22686d7f5d3SJohn Marino       /* now add 1 in r[d], subtract 1 in r[n], i.e. add 1 in r[0] */
22786d7f5d3SJohn Marino 
22886d7f5d3SJohn Marino       r[n] = 0;
22986d7f5d3SJohn Marino       /* cc < 2^sh <= 2^(GMP_NUMB_BITS-1) thus no overflow here */
23086d7f5d3SJohn Marino       cc++;
23186d7f5d3SJohn Marino       mpn_incr_u (r, cc);
23286d7f5d3SJohn Marino 
23386d7f5d3SJohn Marino       rd++;
23486d7f5d3SJohn Marino       /* rd might overflow when sh=GMP_NUMB_BITS-1 */
23586d7f5d3SJohn Marino       cc = (rd == 0) ? 1 : rd;
23686d7f5d3SJohn Marino       r = r + d + (rd == 0);
23786d7f5d3SJohn Marino       mpn_incr_u (r, cc);
23886d7f5d3SJohn Marino     }
23986d7f5d3SJohn Marino   else
24086d7f5d3SJohn Marino     {
24186d7f5d3SJohn Marino       /* r[0..d-1]  <-- -lshift(a[n-d]..a[n-1], sh)
24286d7f5d3SJohn Marino 	 r[d..n-1]  <-- lshift(a[0]..a[n-d-1],  sh)  */
24386d7f5d3SJohn Marino       if (sh != 0)
24486d7f5d3SJohn Marino 	{
24586d7f5d3SJohn Marino 	  /* no out bits below since a[n] <= 1 */
24686d7f5d3SJohn Marino 	  mpn_lshiftc (r, a + n - d, d + 1, sh);
24786d7f5d3SJohn Marino 	  rd = ~r[d];
24886d7f5d3SJohn Marino 	  /* {r, d+1} = {a+n-d, d+1} << sh */
24986d7f5d3SJohn Marino 	  cc = mpn_lshift (r + d, a, n - d, sh); /* {r+d, n-d} = {a, n-d}<<sh */
25086d7f5d3SJohn Marino 	}
25186d7f5d3SJohn Marino       else
25286d7f5d3SJohn Marino 	{
25386d7f5d3SJohn Marino 	  /* r[d] is not used below, but we save a test for d=0 */
25486d7f5d3SJohn Marino 	  mpn_com (r, a + n - d, d + 1);
25586d7f5d3SJohn Marino 	  rd = a[n];
25686d7f5d3SJohn Marino 	  MPN_COPY (r + d, a, n - d);
25786d7f5d3SJohn Marino 	  cc = 0;
25886d7f5d3SJohn Marino 	}
25986d7f5d3SJohn Marino 
26086d7f5d3SJohn Marino       /* now complement {r, d}, subtract cc from r[0], subtract rd from r[d] */
26186d7f5d3SJohn Marino 
26286d7f5d3SJohn Marino       /* if d=0 we just have r[0]=a[n] << sh */
26386d7f5d3SJohn Marino       if (d != 0)
26486d7f5d3SJohn Marino 	{
26586d7f5d3SJohn Marino 	  /* now add 1 in r[0], subtract 1 in r[d] */
26686d7f5d3SJohn Marino 	  if (cc-- == 0) /* then add 1 to r[0] */
26786d7f5d3SJohn Marino 	    cc = mpn_add_1 (r, r, n, CNST_LIMB(1));
26886d7f5d3SJohn Marino 	  cc = mpn_sub_1 (r, r, d, cc) + 1;
26986d7f5d3SJohn Marino 	  /* add 1 to cc instead of rd since rd might overflow */
27086d7f5d3SJohn Marino 	}
27186d7f5d3SJohn Marino 
27286d7f5d3SJohn Marino       /* now subtract cc and rd from r[d..n] */
27386d7f5d3SJohn Marino 
27486d7f5d3SJohn Marino       r[n] = -mpn_sub_1 (r + d, r + d, n - d, cc);
27586d7f5d3SJohn Marino       r[n] -= mpn_sub_1 (r + d, r + d, n - d, rd);
27686d7f5d3SJohn Marino       if (r[n] & GMP_LIMB_HIGHBIT)
27786d7f5d3SJohn Marino 	r[n] = mpn_add_1 (r, r, n, CNST_LIMB(1));
27886d7f5d3SJohn Marino     }
27986d7f5d3SJohn Marino }
28086d7f5d3SJohn Marino 
28186d7f5d3SJohn Marino 
28286d7f5d3SJohn Marino /* r <- a+b mod 2^(n*GMP_NUMB_BITS)+1.
28386d7f5d3SJohn Marino    Assumes a and b are semi-normalized.
28486d7f5d3SJohn Marino */
28586d7f5d3SJohn Marino static inline void
mpn_fft_add_modF(mp_ptr r,mp_srcptr a,mp_srcptr b,int n)28686d7f5d3SJohn Marino mpn_fft_add_modF (mp_ptr r, mp_srcptr a, mp_srcptr b, int n)
28786d7f5d3SJohn Marino {
28886d7f5d3SJohn Marino   mp_limb_t c, x;
28986d7f5d3SJohn Marino 
29086d7f5d3SJohn Marino   c = a[n] + b[n] + mpn_add_n (r, a, b, n);
29186d7f5d3SJohn Marino   /* 0 <= c <= 3 */
29286d7f5d3SJohn Marino 
29386d7f5d3SJohn Marino #if 1
29486d7f5d3SJohn Marino   /* GCC 4.1 outsmarts most expressions here, and generates a 50% branch.  The
29586d7f5d3SJohn Marino      result is slower code, of course.  But the following outsmarts GCC.  */
29686d7f5d3SJohn Marino   x = (c - 1) & -(c != 0);
29786d7f5d3SJohn Marino   r[n] = c - x;
29886d7f5d3SJohn Marino   MPN_DECR_U (r, n + 1, x);
29986d7f5d3SJohn Marino #endif
30086d7f5d3SJohn Marino #if 0
30186d7f5d3SJohn Marino   if (c > 1)
30286d7f5d3SJohn Marino     {
30386d7f5d3SJohn Marino       r[n] = 1;                       /* r[n] - c = 1 */
30486d7f5d3SJohn Marino       MPN_DECR_U (r, n + 1, c - 1);
30586d7f5d3SJohn Marino     }
30686d7f5d3SJohn Marino   else
30786d7f5d3SJohn Marino     {
30886d7f5d3SJohn Marino       r[n] = c;
30986d7f5d3SJohn Marino     }
31086d7f5d3SJohn Marino #endif
31186d7f5d3SJohn Marino }
31286d7f5d3SJohn Marino 
31386d7f5d3SJohn Marino /* r <- a-b mod 2^(n*GMP_NUMB_BITS)+1.
31486d7f5d3SJohn Marino    Assumes a and b are semi-normalized.
31586d7f5d3SJohn Marino */
31686d7f5d3SJohn Marino static inline void
mpn_fft_sub_modF(mp_ptr r,mp_srcptr a,mp_srcptr b,int n)31786d7f5d3SJohn Marino mpn_fft_sub_modF (mp_ptr r, mp_srcptr a, mp_srcptr b, int n)
31886d7f5d3SJohn Marino {
31986d7f5d3SJohn Marino   mp_limb_t c, x;
32086d7f5d3SJohn Marino 
32186d7f5d3SJohn Marino   c = a[n] - b[n] - mpn_sub_n (r, a, b, n);
32286d7f5d3SJohn Marino   /* -2 <= c <= 1 */
32386d7f5d3SJohn Marino 
32486d7f5d3SJohn Marino #if 1
32586d7f5d3SJohn Marino   /* GCC 4.1 outsmarts most expressions here, and generates a 50% branch.  The
32686d7f5d3SJohn Marino      result is slower code, of course.  But the following outsmarts GCC.  */
32786d7f5d3SJohn Marino   x = (-c) & -((c & GMP_LIMB_HIGHBIT) != 0);
32886d7f5d3SJohn Marino   r[n] = x + c;
32986d7f5d3SJohn Marino   MPN_INCR_U (r, n + 1, x);
33086d7f5d3SJohn Marino #endif
33186d7f5d3SJohn Marino #if 0
33286d7f5d3SJohn Marino   if ((c & GMP_LIMB_HIGHBIT) != 0)
33386d7f5d3SJohn Marino     {
33486d7f5d3SJohn Marino       r[n] = 0;
33586d7f5d3SJohn Marino       MPN_INCR_U (r, n + 1, -c);
33686d7f5d3SJohn Marino     }
33786d7f5d3SJohn Marino   else
33886d7f5d3SJohn Marino     {
33986d7f5d3SJohn Marino       r[n] = c;
34086d7f5d3SJohn Marino     }
34186d7f5d3SJohn Marino #endif
34286d7f5d3SJohn Marino }
34386d7f5d3SJohn Marino 
34486d7f5d3SJohn Marino /* input: A[0] ... A[inc*(K-1)] are residues mod 2^N+1 where
34586d7f5d3SJohn Marino 	  N=n*GMP_NUMB_BITS, and 2^omega is a primitive root mod 2^N+1
34686d7f5d3SJohn Marino    output: A[inc*l[k][i]] <- \sum (2^omega)^(ij) A[inc*j] mod 2^N+1 */
34786d7f5d3SJohn Marino 
34886d7f5d3SJohn Marino static void
mpn_fft_fft(mp_ptr * Ap,mp_size_t K,int ** ll,mp_size_t omega,mp_size_t n,mp_size_t inc,mp_ptr tp)34986d7f5d3SJohn Marino mpn_fft_fft (mp_ptr *Ap, mp_size_t K, int **ll,
35086d7f5d3SJohn Marino 	     mp_size_t omega, mp_size_t n, mp_size_t inc, mp_ptr tp)
35186d7f5d3SJohn Marino {
35286d7f5d3SJohn Marino   if (K == 2)
35386d7f5d3SJohn Marino     {
35486d7f5d3SJohn Marino       mp_limb_t cy;
35586d7f5d3SJohn Marino #if HAVE_NATIVE_mpn_add_n_sub_n
35686d7f5d3SJohn Marino       cy = mpn_add_n_sub_n (Ap[0], Ap[inc], Ap[0], Ap[inc], n + 1) & 1;
35786d7f5d3SJohn Marino #else
35886d7f5d3SJohn Marino       MPN_COPY (tp, Ap[0], n + 1);
35986d7f5d3SJohn Marino       mpn_add_n (Ap[0], Ap[0], Ap[inc], n + 1);
36086d7f5d3SJohn Marino       cy = mpn_sub_n (Ap[inc], tp, Ap[inc], n + 1);
36186d7f5d3SJohn Marino #endif
36286d7f5d3SJohn Marino       if (Ap[0][n] > 1) /* can be 2 or 3 */
36386d7f5d3SJohn Marino 	Ap[0][n] = 1 - mpn_sub_1 (Ap[0], Ap[0], n, Ap[0][n] - 1);
36486d7f5d3SJohn Marino       if (cy) /* Ap[inc][n] can be -1 or -2 */
36586d7f5d3SJohn Marino 	Ap[inc][n] = mpn_add_1 (Ap[inc], Ap[inc], n, ~Ap[inc][n] + 1);
36686d7f5d3SJohn Marino     }
36786d7f5d3SJohn Marino   else
36886d7f5d3SJohn Marino     {
36986d7f5d3SJohn Marino       int j;
37086d7f5d3SJohn Marino       int *lk = *ll;
37186d7f5d3SJohn Marino 
37286d7f5d3SJohn Marino       mpn_fft_fft (Ap,     K >> 1, ll-1, 2 * omega, n, inc * 2, tp);
37386d7f5d3SJohn Marino       mpn_fft_fft (Ap+inc, K >> 1, ll-1, 2 * omega, n, inc * 2, tp);
37486d7f5d3SJohn Marino       /* A[2*j*inc]   <- A[2*j*inc] + omega^l[k][2*j*inc] A[(2j+1)inc]
37586d7f5d3SJohn Marino 	 A[(2j+1)inc] <- A[2*j*inc] + omega^l[k][(2j+1)inc] A[(2j+1)inc] */
37686d7f5d3SJohn Marino       for (j = 0; j < (K >> 1); j++, lk += 2, Ap += 2 * inc)
37786d7f5d3SJohn Marino 	{
37886d7f5d3SJohn Marino 	  /* Ap[inc] <- Ap[0] + Ap[inc] * 2^(lk[1] * omega)
37986d7f5d3SJohn Marino 	     Ap[0]   <- Ap[0] + Ap[inc] * 2^(lk[0] * omega) */
38086d7f5d3SJohn Marino 	  mpn_fft_mul_2exp_modF (tp, Ap[inc], lk[0] * omega, n);
38186d7f5d3SJohn Marino 	  mpn_fft_sub_modF (Ap[inc], Ap[0], tp, n);
38286d7f5d3SJohn Marino 	  mpn_fft_add_modF (Ap[0],   Ap[0], tp, n);
38386d7f5d3SJohn Marino 	}
38486d7f5d3SJohn Marino     }
38586d7f5d3SJohn Marino }
38686d7f5d3SJohn Marino 
38786d7f5d3SJohn Marino /* input: A[0] ... A[inc*(K-1)] are residues mod 2^N+1 where
38886d7f5d3SJohn Marino 	  N=n*GMP_NUMB_BITS, and 2^omega is a primitive root mod 2^N+1
38986d7f5d3SJohn Marino    output: A[inc*l[k][i]] <- \sum (2^omega)^(ij) A[inc*j] mod 2^N+1
39086d7f5d3SJohn Marino    tp must have space for 2*(n+1) limbs.
39186d7f5d3SJohn Marino */
39286d7f5d3SJohn Marino 
39386d7f5d3SJohn Marino 
39486d7f5d3SJohn Marino /* Given ap[0..n] with ap[n]<=1, reduce it modulo 2^(n*GMP_NUMB_BITS)+1,
39586d7f5d3SJohn Marino    by subtracting that modulus if necessary.
39686d7f5d3SJohn Marino 
39786d7f5d3SJohn Marino    If ap[0..n] is exactly 2^(n*GMP_NUMB_BITS) then mpn_sub_1 produces a
39886d7f5d3SJohn Marino    borrow and the limbs must be zeroed out again.  This will occur very
39986d7f5d3SJohn Marino    infrequently.  */
40086d7f5d3SJohn Marino 
40186d7f5d3SJohn Marino static inline void
mpn_fft_normalize(mp_ptr ap,mp_size_t n)40286d7f5d3SJohn Marino mpn_fft_normalize (mp_ptr ap, mp_size_t n)
40386d7f5d3SJohn Marino {
40486d7f5d3SJohn Marino   if (ap[n] != 0)
40586d7f5d3SJohn Marino     {
40686d7f5d3SJohn Marino       MPN_DECR_U (ap, n + 1, CNST_LIMB(1));
40786d7f5d3SJohn Marino       if (ap[n] == 0)
40886d7f5d3SJohn Marino 	{
40986d7f5d3SJohn Marino 	  /* This happens with very low probability; we have yet to trigger it,
41086d7f5d3SJohn Marino 	     and thereby make sure this code is correct.  */
41186d7f5d3SJohn Marino 	  MPN_ZERO (ap, n);
41286d7f5d3SJohn Marino 	  ap[n] = 1;
41386d7f5d3SJohn Marino 	}
41486d7f5d3SJohn Marino       else
41586d7f5d3SJohn Marino 	ap[n] = 0;
41686d7f5d3SJohn Marino     }
41786d7f5d3SJohn Marino }
41886d7f5d3SJohn Marino 
41986d7f5d3SJohn Marino /* a[i] <- a[i]*b[i] mod 2^(n*GMP_NUMB_BITS)+1 for 0 <= i < K */
42086d7f5d3SJohn Marino static void
mpn_fft_mul_modF_K(mp_ptr * ap,mp_ptr * bp,mp_size_t n,int K)42186d7f5d3SJohn Marino mpn_fft_mul_modF_K (mp_ptr *ap, mp_ptr *bp, mp_size_t n, int K)
42286d7f5d3SJohn Marino {
42386d7f5d3SJohn Marino   int i;
42486d7f5d3SJohn Marino   int sqr = (ap == bp);
42586d7f5d3SJohn Marino   TMP_DECL;
42686d7f5d3SJohn Marino 
42786d7f5d3SJohn Marino   TMP_MARK;
42886d7f5d3SJohn Marino 
42986d7f5d3SJohn Marino   if (n >= (sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD))
43086d7f5d3SJohn Marino     {
43186d7f5d3SJohn Marino       int k, K2, nprime2, Nprime2, M2, maxLK, l, Mp2;
43286d7f5d3SJohn Marino       int **fft_l;
43386d7f5d3SJohn Marino       mp_ptr *Ap, *Bp, A, B, T;
43486d7f5d3SJohn Marino 
43586d7f5d3SJohn Marino       k = mpn_fft_best_k (n, sqr);
43686d7f5d3SJohn Marino       K2 = 1 << k;
43786d7f5d3SJohn Marino       ASSERT_ALWAYS((n & (K2 - 1)) == 0);
43886d7f5d3SJohn Marino       maxLK = (K2 > GMP_NUMB_BITS) ? K2 : GMP_NUMB_BITS;
43986d7f5d3SJohn Marino       M2 = n * GMP_NUMB_BITS >> k;
44086d7f5d3SJohn Marino       l = n >> k;
44186d7f5d3SJohn Marino       Nprime2 = ((2 * M2 + k + 2 + maxLK) / maxLK) * maxLK;
44286d7f5d3SJohn Marino       /* Nprime2 = ceil((2*M2+k+3)/maxLK)*maxLK*/
44386d7f5d3SJohn Marino       nprime2 = Nprime2 / GMP_NUMB_BITS;
44486d7f5d3SJohn Marino 
44586d7f5d3SJohn Marino       /* we should ensure that nprime2 is a multiple of the next K */
44686d7f5d3SJohn Marino       if (nprime2 >= (sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD))
44786d7f5d3SJohn Marino 	{
44886d7f5d3SJohn Marino 	  unsigned long K3;
44986d7f5d3SJohn Marino 	  for (;;)
45086d7f5d3SJohn Marino 	    {
45186d7f5d3SJohn Marino 	      K3 = 1L << mpn_fft_best_k (nprime2, sqr);
45286d7f5d3SJohn Marino 	      if ((nprime2 & (K3 - 1)) == 0)
45386d7f5d3SJohn Marino 		break;
45486d7f5d3SJohn Marino 	      nprime2 = (nprime2 + K3 - 1) & -K3;
45586d7f5d3SJohn Marino 	      Nprime2 = nprime2 * GMP_LIMB_BITS;
45686d7f5d3SJohn Marino 	      /* warning: since nprime2 changed, K3 may change too! */
45786d7f5d3SJohn Marino 	    }
45886d7f5d3SJohn Marino 	}
45986d7f5d3SJohn Marino       ASSERT_ALWAYS(nprime2 < n); /* otherwise we'll loop */
46086d7f5d3SJohn Marino 
46186d7f5d3SJohn Marino       Mp2 = Nprime2 >> k;
46286d7f5d3SJohn Marino 
46386d7f5d3SJohn Marino       Ap = TMP_ALLOC_MP_PTRS (K2);
46486d7f5d3SJohn Marino       Bp = TMP_ALLOC_MP_PTRS (K2);
46586d7f5d3SJohn Marino       A = TMP_ALLOC_LIMBS (2 * (nprime2 + 1) << k);
46686d7f5d3SJohn Marino       T = TMP_ALLOC_LIMBS (2 * (nprime2 + 1));
46786d7f5d3SJohn Marino       B = A + ((nprime2 + 1) << k);
46886d7f5d3SJohn Marino       fft_l = TMP_ALLOC_TYPE (k + 1, int *);
46986d7f5d3SJohn Marino       for (i = 0; i <= k; i++)
47086d7f5d3SJohn Marino 	fft_l[i] = TMP_ALLOC_TYPE (1<<i, int);
47186d7f5d3SJohn Marino       mpn_fft_initl (fft_l, k);
47286d7f5d3SJohn Marino 
47386d7f5d3SJohn Marino       TRACE (printf ("recurse: %ldx%ld limbs -> %d times %dx%d (%1.2f)\n", n,
47486d7f5d3SJohn Marino 		    n, K2, nprime2, nprime2, 2.0*(double)n/nprime2/K2));
47586d7f5d3SJohn Marino       for (i = 0; i < K; i++, ap++, bp++)
47686d7f5d3SJohn Marino 	{
47786d7f5d3SJohn Marino 	  mp_limb_t cy;
47886d7f5d3SJohn Marino 	  mpn_fft_normalize (*ap, n);
47986d7f5d3SJohn Marino 	  if (!sqr)
48086d7f5d3SJohn Marino 	    mpn_fft_normalize (*bp, n);
48186d7f5d3SJohn Marino 
48286d7f5d3SJohn Marino 	  mpn_mul_fft_decompose (A, Ap, K2, nprime2, *ap, (l << k) + 1, l, Mp2, T);
48386d7f5d3SJohn Marino 	  if (!sqr)
48486d7f5d3SJohn Marino 	    mpn_mul_fft_decompose (B, Bp, K2, nprime2, *bp, (l << k) + 1, l, Mp2, T);
48586d7f5d3SJohn Marino 
48686d7f5d3SJohn Marino 	  cy = mpn_mul_fft_internal (*ap, n, k, Ap, Bp, A, B, nprime2,
48786d7f5d3SJohn Marino 				     l, Mp2, fft_l, T, sqr);
48886d7f5d3SJohn Marino 	  (*ap)[n] = cy;
48986d7f5d3SJohn Marino 	}
49086d7f5d3SJohn Marino     }
49186d7f5d3SJohn Marino   else
49286d7f5d3SJohn Marino     {
49386d7f5d3SJohn Marino       mp_ptr a, b, tp, tpn;
49486d7f5d3SJohn Marino       mp_limb_t cc;
49586d7f5d3SJohn Marino       int n2 = 2 * n;
49686d7f5d3SJohn Marino       tp = TMP_ALLOC_LIMBS (n2);
49786d7f5d3SJohn Marino       tpn = tp + n;
49886d7f5d3SJohn Marino       TRACE (printf ("  mpn_mul_n %d of %ld limbs\n", K, n));
49986d7f5d3SJohn Marino       for (i = 0; i < K; i++)
50086d7f5d3SJohn Marino 	{
50186d7f5d3SJohn Marino 	  a = *ap++;
50286d7f5d3SJohn Marino 	  b = *bp++;
50386d7f5d3SJohn Marino 	  if (sqr)
50486d7f5d3SJohn Marino 	    mpn_sqr (tp, a, n);
50586d7f5d3SJohn Marino 	  else
50686d7f5d3SJohn Marino 	    mpn_mul_n (tp, b, a, n);
50786d7f5d3SJohn Marino 	  if (a[n] != 0)
50886d7f5d3SJohn Marino 	    cc = mpn_add_n (tpn, tpn, b, n);
50986d7f5d3SJohn Marino 	  else
51086d7f5d3SJohn Marino 	    cc = 0;
51186d7f5d3SJohn Marino 	  if (b[n] != 0)
51286d7f5d3SJohn Marino 	    cc += mpn_add_n (tpn, tpn, a, n) + a[n];
51386d7f5d3SJohn Marino 	  if (cc != 0)
51486d7f5d3SJohn Marino 	    {
51586d7f5d3SJohn Marino 	      /* FIXME: use MPN_INCR_U here, since carry is not expected.  */
51686d7f5d3SJohn Marino 	      cc = mpn_add_1 (tp, tp, n2, cc);
51786d7f5d3SJohn Marino 	      ASSERT (cc == 0);
51886d7f5d3SJohn Marino 	    }
51986d7f5d3SJohn Marino 	  a[n] = mpn_sub_n (a, tp, tpn, n) && mpn_add_1 (a, a, n, CNST_LIMB(1));
52086d7f5d3SJohn Marino 	}
52186d7f5d3SJohn Marino     }
52286d7f5d3SJohn Marino   TMP_FREE;
52386d7f5d3SJohn Marino }
52486d7f5d3SJohn Marino 
52586d7f5d3SJohn Marino 
52686d7f5d3SJohn Marino /* input: A^[l[k][0]] A^[l[k][1]] ... A^[l[k][K-1]]
52786d7f5d3SJohn Marino    output: K*A[0] K*A[K-1] ... K*A[1].
52886d7f5d3SJohn Marino    Assumes the Ap[] are pseudo-normalized, i.e. 0 <= Ap[][n] <= 1.
52986d7f5d3SJohn Marino    This condition is also fulfilled at exit.
53086d7f5d3SJohn Marino */
53186d7f5d3SJohn Marino static void
mpn_fft_fftinv(mp_ptr * Ap,int K,mp_size_t omega,mp_size_t n,mp_ptr tp)53286d7f5d3SJohn Marino mpn_fft_fftinv (mp_ptr *Ap, int K, mp_size_t omega, mp_size_t n, mp_ptr tp)
53386d7f5d3SJohn Marino {
53486d7f5d3SJohn Marino   if (K == 2)
53586d7f5d3SJohn Marino     {
53686d7f5d3SJohn Marino       mp_limb_t cy;
53786d7f5d3SJohn Marino #if HAVE_NATIVE_mpn_add_n_sub_n
53886d7f5d3SJohn Marino       cy = mpn_add_n_sub_n (Ap[0], Ap[1], Ap[0], Ap[1], n + 1) & 1;
53986d7f5d3SJohn Marino #else
54086d7f5d3SJohn Marino       MPN_COPY (tp, Ap[0], n + 1);
54186d7f5d3SJohn Marino       mpn_add_n (Ap[0], Ap[0], Ap[1], n + 1);
54286d7f5d3SJohn Marino       cy = mpn_sub_n (Ap[1], tp, Ap[1], n + 1);
54386d7f5d3SJohn Marino #endif
54486d7f5d3SJohn Marino       if (Ap[0][n] > 1) /* can be 2 or 3 */
54586d7f5d3SJohn Marino 	Ap[0][n] = 1 - mpn_sub_1 (Ap[0], Ap[0], n, Ap[0][n] - 1);
54686d7f5d3SJohn Marino       if (cy) /* Ap[1][n] can be -1 or -2 */
54786d7f5d3SJohn Marino 	Ap[1][n] = mpn_add_1 (Ap[1], Ap[1], n, ~Ap[1][n] + 1);
54886d7f5d3SJohn Marino     }
54986d7f5d3SJohn Marino   else
55086d7f5d3SJohn Marino     {
55186d7f5d3SJohn Marino       int j, K2 = K >> 1;
55286d7f5d3SJohn Marino 
55386d7f5d3SJohn Marino       mpn_fft_fftinv (Ap,      K2, 2 * omega, n, tp);
55486d7f5d3SJohn Marino       mpn_fft_fftinv (Ap + K2, K2, 2 * omega, n, tp);
55586d7f5d3SJohn Marino       /* A[j]     <- A[j] + omega^j A[j+K/2]
55686d7f5d3SJohn Marino 	 A[j+K/2] <- A[j] + omega^(j+K/2) A[j+K/2] */
55786d7f5d3SJohn Marino       for (j = 0; j < K2; j++, Ap++)
55886d7f5d3SJohn Marino 	{
55986d7f5d3SJohn Marino 	  /* Ap[K2] <- Ap[0] + Ap[K2] * 2^((j + K2) * omega)
56086d7f5d3SJohn Marino 	     Ap[0]  <- Ap[0] + Ap[K2] * 2^(j * omega) */
56186d7f5d3SJohn Marino 	  mpn_fft_mul_2exp_modF (tp, Ap[K2], j * omega, n);
56286d7f5d3SJohn Marino 	  mpn_fft_sub_modF (Ap[K2], Ap[0], tp, n);
56386d7f5d3SJohn Marino 	  mpn_fft_add_modF (Ap[0],  Ap[0], tp, n);
56486d7f5d3SJohn Marino 	}
56586d7f5d3SJohn Marino     }
56686d7f5d3SJohn Marino }
56786d7f5d3SJohn Marino 
56886d7f5d3SJohn Marino 
56986d7f5d3SJohn Marino /* R <- A/2^k mod 2^(n*GMP_NUMB_BITS)+1 */
57086d7f5d3SJohn Marino static void
mpn_fft_div_2exp_modF(mp_ptr r,mp_srcptr a,int k,mp_size_t n)57186d7f5d3SJohn Marino mpn_fft_div_2exp_modF (mp_ptr r, mp_srcptr a, int k, mp_size_t n)
57286d7f5d3SJohn Marino {
57386d7f5d3SJohn Marino   int i;
57486d7f5d3SJohn Marino 
57586d7f5d3SJohn Marino   ASSERT (r != a);
57686d7f5d3SJohn Marino   i = 2 * n * GMP_NUMB_BITS - k;
57786d7f5d3SJohn Marino   mpn_fft_mul_2exp_modF (r, a, i, n);
57886d7f5d3SJohn Marino   /* 1/2^k = 2^(2nL-k) mod 2^(n*GMP_NUMB_BITS)+1 */
57986d7f5d3SJohn Marino   /* normalize so that R < 2^(n*GMP_NUMB_BITS)+1 */
58086d7f5d3SJohn Marino   mpn_fft_normalize (r, n);
58186d7f5d3SJohn Marino }
58286d7f5d3SJohn Marino 
58386d7f5d3SJohn Marino 
58486d7f5d3SJohn Marino /* {rp,n} <- {ap,an} mod 2^(n*GMP_NUMB_BITS)+1, n <= an <= 3*n.
58586d7f5d3SJohn Marino    Returns carry out, i.e. 1 iff {ap,an} = -1 mod 2^(n*GMP_NUMB_BITS)+1,
58686d7f5d3SJohn Marino    then {rp,n}=0.
58786d7f5d3SJohn Marino */
58886d7f5d3SJohn Marino static int
mpn_fft_norm_modF(mp_ptr rp,mp_size_t n,mp_ptr ap,mp_size_t an)58986d7f5d3SJohn Marino mpn_fft_norm_modF (mp_ptr rp, mp_size_t n, mp_ptr ap, mp_size_t an)
59086d7f5d3SJohn Marino {
59186d7f5d3SJohn Marino   mp_size_t l;
59286d7f5d3SJohn Marino   long int m;
59386d7f5d3SJohn Marino   mp_limb_t cc;
59486d7f5d3SJohn Marino   int rpn;
59586d7f5d3SJohn Marino 
59686d7f5d3SJohn Marino   ASSERT ((n <= an) && (an <= 3 * n));
59786d7f5d3SJohn Marino   m = an - 2 * n;
59886d7f5d3SJohn Marino   if (m > 0)
59986d7f5d3SJohn Marino     {
60086d7f5d3SJohn Marino       l = n;
60186d7f5d3SJohn Marino       /* add {ap, m} and {ap+2n, m} in {rp, m} */
60286d7f5d3SJohn Marino       cc = mpn_add_n (rp, ap, ap + 2 * n, m);
60386d7f5d3SJohn Marino       /* copy {ap+m, n-m} to {rp+m, n-m} */
60486d7f5d3SJohn Marino       rpn = mpn_add_1 (rp + m, ap + m, n - m, cc);
60586d7f5d3SJohn Marino     }
60686d7f5d3SJohn Marino   else
60786d7f5d3SJohn Marino     {
60886d7f5d3SJohn Marino       l = an - n; /* l <= n */
60986d7f5d3SJohn Marino       MPN_COPY (rp, ap, n);
61086d7f5d3SJohn Marino       rpn = 0;
61186d7f5d3SJohn Marino     }
61286d7f5d3SJohn Marino 
61386d7f5d3SJohn Marino   /* remains to subtract {ap+n, l} from {rp, n+1} */
61486d7f5d3SJohn Marino   cc = mpn_sub_n (rp, rp, ap + n, l);
61586d7f5d3SJohn Marino   rpn -= mpn_sub_1 (rp + l, rp + l, n - l, cc);
61686d7f5d3SJohn Marino   if (rpn < 0) /* necessarily rpn = -1 */
61786d7f5d3SJohn Marino     rpn = mpn_add_1 (rp, rp, n, CNST_LIMB(1));
61886d7f5d3SJohn Marino   return rpn;
61986d7f5d3SJohn Marino }
62086d7f5d3SJohn Marino 
62186d7f5d3SJohn Marino /* store in A[0..nprime] the first M bits from {n, nl},
62286d7f5d3SJohn Marino    in A[nprime+1..] the following M bits, ...
62386d7f5d3SJohn Marino    Assumes M is a multiple of GMP_NUMB_BITS (M = l * GMP_NUMB_BITS).
62486d7f5d3SJohn Marino    T must have space for at least (nprime + 1) limbs.
62586d7f5d3SJohn Marino    We must have nl <= 2*K*l.
62686d7f5d3SJohn Marino */
62786d7f5d3SJohn Marino static void
mpn_mul_fft_decompose(mp_ptr A,mp_ptr * Ap,int K,int nprime,mp_srcptr n,mp_size_t nl,int l,int Mp,mp_ptr T)62886d7f5d3SJohn Marino mpn_mul_fft_decompose (mp_ptr A, mp_ptr *Ap, int K, int nprime, mp_srcptr n,
62986d7f5d3SJohn Marino 		       mp_size_t nl, int l, int Mp, mp_ptr T)
63086d7f5d3SJohn Marino {
63186d7f5d3SJohn Marino   int i, j;
63286d7f5d3SJohn Marino   mp_ptr tmp;
63386d7f5d3SJohn Marino   mp_size_t Kl = K * l;
63486d7f5d3SJohn Marino   TMP_DECL;
63586d7f5d3SJohn Marino   TMP_MARK;
63686d7f5d3SJohn Marino 
63786d7f5d3SJohn Marino   if (nl > Kl) /* normalize {n, nl} mod 2^(Kl*GMP_NUMB_BITS)+1 */
63886d7f5d3SJohn Marino     {
63986d7f5d3SJohn Marino       mp_size_t dif = nl - Kl;
64086d7f5d3SJohn Marino       mp_limb_signed_t cy;
64186d7f5d3SJohn Marino 
64286d7f5d3SJohn Marino       tmp = TMP_ALLOC_LIMBS(Kl + 1);
64386d7f5d3SJohn Marino 
64486d7f5d3SJohn Marino       if (dif > Kl)
64586d7f5d3SJohn Marino 	{
64686d7f5d3SJohn Marino 	  int subp = 0;
64786d7f5d3SJohn Marino 
64886d7f5d3SJohn Marino 	  cy = mpn_sub_n (tmp, n, n + Kl, Kl);
64986d7f5d3SJohn Marino 	  n += 2 * Kl;
65086d7f5d3SJohn Marino 	  dif -= Kl;
65186d7f5d3SJohn Marino 
65286d7f5d3SJohn Marino 	  /* now dif > 0 */
65386d7f5d3SJohn Marino 	  while (dif > Kl)
65486d7f5d3SJohn Marino 	    {
65586d7f5d3SJohn Marino 	      if (subp)
65686d7f5d3SJohn Marino 		cy += mpn_sub_n (tmp, tmp, n, Kl);
65786d7f5d3SJohn Marino 	      else
65886d7f5d3SJohn Marino 		cy -= mpn_add_n (tmp, tmp, n, Kl);
65986d7f5d3SJohn Marino 	      subp ^= 1;
66086d7f5d3SJohn Marino 	      n += Kl;
66186d7f5d3SJohn Marino 	      dif -= Kl;
66286d7f5d3SJohn Marino 	    }
66386d7f5d3SJohn Marino 	  /* now dif <= Kl */
66486d7f5d3SJohn Marino 	  if (subp)
66586d7f5d3SJohn Marino 	    cy += mpn_sub (tmp, tmp, Kl, n, dif);
66686d7f5d3SJohn Marino 	  else
66786d7f5d3SJohn Marino 	    cy -= mpn_add (tmp, tmp, Kl, n, dif);
66886d7f5d3SJohn Marino 	  if (cy >= 0)
66986d7f5d3SJohn Marino 	    cy = mpn_add_1 (tmp, tmp, Kl, cy);
67086d7f5d3SJohn Marino 	  else
67186d7f5d3SJohn Marino 	    cy = mpn_sub_1 (tmp, tmp, Kl, -cy);
67286d7f5d3SJohn Marino 	}
67386d7f5d3SJohn Marino       else /* dif <= Kl, i.e. nl <= 2 * Kl */
67486d7f5d3SJohn Marino 	{
67586d7f5d3SJohn Marino 	  cy = mpn_sub (tmp, n, Kl, n + Kl, dif);
67686d7f5d3SJohn Marino 	  cy = mpn_add_1 (tmp, tmp, Kl, cy);
67786d7f5d3SJohn Marino 	}
67886d7f5d3SJohn Marino       tmp[Kl] = cy;
67986d7f5d3SJohn Marino       nl = Kl + 1;
68086d7f5d3SJohn Marino       n = tmp;
68186d7f5d3SJohn Marino     }
68286d7f5d3SJohn Marino   for (i = 0; i < K; i++)
68386d7f5d3SJohn Marino     {
68486d7f5d3SJohn Marino       Ap[i] = A;
68586d7f5d3SJohn Marino       /* store the next M bits of n into A[0..nprime] */
68686d7f5d3SJohn Marino       if (nl > 0) /* nl is the number of remaining limbs */
68786d7f5d3SJohn Marino 	{
68886d7f5d3SJohn Marino 	  j = (l <= nl && i < K - 1) ? l : nl; /* store j next limbs */
68986d7f5d3SJohn Marino 	  nl -= j;
69086d7f5d3SJohn Marino 	  MPN_COPY (T, n, j);
69186d7f5d3SJohn Marino 	  MPN_ZERO (T + j, nprime + 1 - j);
69286d7f5d3SJohn Marino 	  n += l;
69386d7f5d3SJohn Marino 	  mpn_fft_mul_2exp_modF (A, T, i * Mp, nprime);
69486d7f5d3SJohn Marino 	}
69586d7f5d3SJohn Marino       else
69686d7f5d3SJohn Marino 	MPN_ZERO (A, nprime + 1);
69786d7f5d3SJohn Marino       A += nprime + 1;
69886d7f5d3SJohn Marino     }
69986d7f5d3SJohn Marino   ASSERT_ALWAYS (nl == 0);
70086d7f5d3SJohn Marino   TMP_FREE;
70186d7f5d3SJohn Marino }
70286d7f5d3SJohn Marino 
70386d7f5d3SJohn Marino /* op <- n*m mod 2^N+1 with fft of size 2^k where N=pl*GMP_NUMB_BITS
70486d7f5d3SJohn Marino    op is pl limbs, its high bit is returned.
70586d7f5d3SJohn Marino    One must have pl = mpn_fft_next_size (pl, k).
70686d7f5d3SJohn Marino    T must have space for 2 * (nprime + 1) limbs.
70786d7f5d3SJohn Marino */
70886d7f5d3SJohn Marino 
70986d7f5d3SJohn Marino static mp_limb_t
mpn_mul_fft_internal(mp_ptr op,mp_size_t pl,int k,mp_ptr * Ap,mp_ptr * Bp,mp_ptr A,mp_ptr B,mp_size_t nprime,mp_size_t l,mp_size_t Mp,int ** fft_l,mp_ptr T,int sqr)71086d7f5d3SJohn Marino mpn_mul_fft_internal (mp_ptr op, mp_size_t pl, int k,
71186d7f5d3SJohn Marino 		      mp_ptr *Ap, mp_ptr *Bp, mp_ptr A, mp_ptr B,
71286d7f5d3SJohn Marino 		      mp_size_t nprime, mp_size_t l, mp_size_t Mp,
71386d7f5d3SJohn Marino 		      int **fft_l, mp_ptr T, int sqr)
71486d7f5d3SJohn Marino {
71586d7f5d3SJohn Marino   int K, i, pla, lo, sh, j;
71686d7f5d3SJohn Marino   mp_ptr p;
71786d7f5d3SJohn Marino   mp_limb_t cc;
71886d7f5d3SJohn Marino 
71986d7f5d3SJohn Marino   K = 1 << k;
72086d7f5d3SJohn Marino 
72186d7f5d3SJohn Marino   /* direct fft's */
72286d7f5d3SJohn Marino   mpn_fft_fft (Ap, K, fft_l + k, 2 * Mp, nprime, 1, T);
72386d7f5d3SJohn Marino   if (!sqr)
72486d7f5d3SJohn Marino     mpn_fft_fft (Bp, K, fft_l + k, 2 * Mp, nprime, 1, T);
72586d7f5d3SJohn Marino 
72686d7f5d3SJohn Marino   /* term to term multiplications */
72786d7f5d3SJohn Marino   mpn_fft_mul_modF_K (Ap, sqr ? Ap : Bp, nprime, K);
72886d7f5d3SJohn Marino 
72986d7f5d3SJohn Marino   /* inverse fft's */
73086d7f5d3SJohn Marino   mpn_fft_fftinv (Ap, K, 2 * Mp, nprime, T);
73186d7f5d3SJohn Marino 
73286d7f5d3SJohn Marino   /* division of terms after inverse fft */
73386d7f5d3SJohn Marino   Bp[0] = T + nprime + 1;
73486d7f5d3SJohn Marino   mpn_fft_div_2exp_modF (Bp[0], Ap[0], k, nprime);
73586d7f5d3SJohn Marino   for (i = 1; i < K; i++)
73686d7f5d3SJohn Marino     {
73786d7f5d3SJohn Marino       Bp[i] = Ap[i - 1];
73886d7f5d3SJohn Marino       mpn_fft_div_2exp_modF (Bp[i], Ap[i], k + (K - i) * Mp, nprime);
73986d7f5d3SJohn Marino     }
74086d7f5d3SJohn Marino 
74186d7f5d3SJohn Marino   /* addition of terms in result p */
74286d7f5d3SJohn Marino   MPN_ZERO (T, nprime + 1);
74386d7f5d3SJohn Marino   pla = l * (K - 1) + nprime + 1; /* number of required limbs for p */
74486d7f5d3SJohn Marino   p = B; /* B has K*(n' + 1) limbs, which is >= pla, i.e. enough */
74586d7f5d3SJohn Marino   MPN_ZERO (p, pla);
74686d7f5d3SJohn Marino   cc = 0; /* will accumulate the (signed) carry at p[pla] */
74786d7f5d3SJohn Marino   for (i = K - 1, lo = l * i + nprime,sh = l * i; i >= 0; i--,lo -= l,sh -= l)
74886d7f5d3SJohn Marino     {
74986d7f5d3SJohn Marino       mp_ptr n = p + sh;
75086d7f5d3SJohn Marino 
75186d7f5d3SJohn Marino       j = (K - i) & (K - 1);
75286d7f5d3SJohn Marino 
75386d7f5d3SJohn Marino       if (mpn_add_n (n, n, Bp[j], nprime + 1))
75486d7f5d3SJohn Marino 	cc += mpn_add_1 (n + nprime + 1, n + nprime + 1,
75586d7f5d3SJohn Marino 			  pla - sh - nprime - 1, CNST_LIMB(1));
75686d7f5d3SJohn Marino       T[2 * l] = i + 1; /* T = (i + 1)*2^(2*M) */
75786d7f5d3SJohn Marino       if (mpn_cmp (Bp[j], T, nprime + 1) > 0)
75886d7f5d3SJohn Marino 	{ /* subtract 2^N'+1 */
75986d7f5d3SJohn Marino 	  cc -= mpn_sub_1 (n, n, pla - sh, CNST_LIMB(1));
76086d7f5d3SJohn Marino 	  cc -= mpn_sub_1 (p + lo, p + lo, pla - lo, CNST_LIMB(1));
76186d7f5d3SJohn Marino 	}
76286d7f5d3SJohn Marino     }
76386d7f5d3SJohn Marino   if (cc == -CNST_LIMB(1))
76486d7f5d3SJohn Marino     {
76586d7f5d3SJohn Marino       if ((cc = mpn_add_1 (p + pla - pl, p + pla - pl, pl, CNST_LIMB(1))))
76686d7f5d3SJohn Marino 	{
76786d7f5d3SJohn Marino 	  /* p[pla-pl]...p[pla-1] are all zero */
76886d7f5d3SJohn Marino 	  mpn_sub_1 (p + pla - pl - 1, p + pla - pl - 1, pl + 1, CNST_LIMB(1));
76986d7f5d3SJohn Marino 	  mpn_sub_1 (p + pla - 1, p + pla - 1, 1, CNST_LIMB(1));
77086d7f5d3SJohn Marino 	}
77186d7f5d3SJohn Marino     }
77286d7f5d3SJohn Marino   else if (cc == 1)
77386d7f5d3SJohn Marino     {
77486d7f5d3SJohn Marino       if (pla >= 2 * pl)
77586d7f5d3SJohn Marino 	{
77686d7f5d3SJohn Marino 	  while ((cc = mpn_add_1 (p + pla - 2 * pl, p + pla - 2 * pl, 2 * pl, cc)))
77786d7f5d3SJohn Marino 	    ;
77886d7f5d3SJohn Marino 	}
77986d7f5d3SJohn Marino       else
78086d7f5d3SJohn Marino 	{
78186d7f5d3SJohn Marino 	  cc = mpn_sub_1 (p + pla - pl, p + pla - pl, pl, cc);
78286d7f5d3SJohn Marino 	  ASSERT (cc == 0);
78386d7f5d3SJohn Marino 	}
78486d7f5d3SJohn Marino     }
78586d7f5d3SJohn Marino   else
78686d7f5d3SJohn Marino     ASSERT (cc == 0);
78786d7f5d3SJohn Marino 
78886d7f5d3SJohn Marino   /* here p < 2^(2M) [K 2^(M(K-1)) + (K-1) 2^(M(K-2)) + ... ]
78986d7f5d3SJohn Marino      < K 2^(2M) [2^(M(K-1)) + 2^(M(K-2)) + ... ]
79086d7f5d3SJohn Marino      < K 2^(2M) 2^(M(K-1))*2 = 2^(M*K+M+k+1) */
79186d7f5d3SJohn Marino   return mpn_fft_norm_modF (op, pl, p, pla);
79286d7f5d3SJohn Marino }
79386d7f5d3SJohn Marino 
79486d7f5d3SJohn Marino /* return the lcm of a and 2^k */
79586d7f5d3SJohn Marino static unsigned long int
mpn_mul_fft_lcm(unsigned long int a,unsigned int k)79686d7f5d3SJohn Marino mpn_mul_fft_lcm (unsigned long int a, unsigned int k)
79786d7f5d3SJohn Marino {
79886d7f5d3SJohn Marino   unsigned long int l = k;
79986d7f5d3SJohn Marino 
80086d7f5d3SJohn Marino   while (a % 2 == 0 && k > 0)
80186d7f5d3SJohn Marino     {
80286d7f5d3SJohn Marino       a >>= 1;
80386d7f5d3SJohn Marino       k --;
80486d7f5d3SJohn Marino     }
80586d7f5d3SJohn Marino   return a << l;
80686d7f5d3SJohn Marino }
80786d7f5d3SJohn Marino 
80886d7f5d3SJohn Marino 
80986d7f5d3SJohn Marino mp_limb_t
mpn_mul_fft(mp_ptr op,mp_size_t pl,mp_srcptr n,mp_size_t nl,mp_srcptr m,mp_size_t ml,int k)81086d7f5d3SJohn Marino mpn_mul_fft (mp_ptr op, mp_size_t pl,
81186d7f5d3SJohn Marino 	     mp_srcptr n, mp_size_t nl,
81286d7f5d3SJohn Marino 	     mp_srcptr m, mp_size_t ml,
81386d7f5d3SJohn Marino 	     int k)
81486d7f5d3SJohn Marino {
81586d7f5d3SJohn Marino   int K, maxLK, i;
81686d7f5d3SJohn Marino   mp_size_t N, Nprime, nprime, M, Mp, l;
81786d7f5d3SJohn Marino   mp_ptr *Ap, *Bp, A, T, B;
81886d7f5d3SJohn Marino   int **fft_l;
81986d7f5d3SJohn Marino   int sqr = (n == m && nl == ml);
82086d7f5d3SJohn Marino   mp_limb_t h;
82186d7f5d3SJohn Marino   TMP_DECL;
82286d7f5d3SJohn Marino 
82386d7f5d3SJohn Marino   TRACE (printf ("\nmpn_mul_fft pl=%ld nl=%ld ml=%ld k=%d\n", pl, nl, ml, k));
82486d7f5d3SJohn Marino   ASSERT_ALWAYS (mpn_fft_next_size (pl, k) == pl);
82586d7f5d3SJohn Marino 
82686d7f5d3SJohn Marino   TMP_MARK;
82786d7f5d3SJohn Marino   N = pl * GMP_NUMB_BITS;
82886d7f5d3SJohn Marino   fft_l = TMP_ALLOC_TYPE (k + 1, int *);
82986d7f5d3SJohn Marino   for (i = 0; i <= k; i++)
83086d7f5d3SJohn Marino     fft_l[i] = TMP_ALLOC_TYPE (1 << i, int);
83186d7f5d3SJohn Marino   mpn_fft_initl (fft_l, k);
83286d7f5d3SJohn Marino   K = 1 << k;
83386d7f5d3SJohn Marino   M = N >> k;	/* N = 2^k M */
83486d7f5d3SJohn Marino   l = 1 + (M - 1) / GMP_NUMB_BITS;
83586d7f5d3SJohn Marino   maxLK = mpn_mul_fft_lcm ((unsigned long) GMP_NUMB_BITS, k); /* lcm (GMP_NUMB_BITS, 2^k) */
83686d7f5d3SJohn Marino 
83786d7f5d3SJohn Marino   Nprime = (1 + (2 * M + k + 2) / maxLK) * maxLK;
83886d7f5d3SJohn Marino   /* Nprime = ceil((2*M+k+3)/maxLK)*maxLK; */
83986d7f5d3SJohn Marino   nprime = Nprime / GMP_NUMB_BITS;
84086d7f5d3SJohn Marino   TRACE (printf ("N=%ld K=%d, M=%ld, l=%ld, maxLK=%d, Np=%ld, np=%ld\n",
84186d7f5d3SJohn Marino 		 N, K, M, l, maxLK, Nprime, nprime));
84286d7f5d3SJohn Marino   /* we should ensure that recursively, nprime is a multiple of the next K */
84386d7f5d3SJohn Marino   if (nprime >= (sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD))
84486d7f5d3SJohn Marino     {
84586d7f5d3SJohn Marino       unsigned long K2;
84686d7f5d3SJohn Marino       for (;;)
84786d7f5d3SJohn Marino 	{
84886d7f5d3SJohn Marino 	  K2 = 1L << mpn_fft_best_k (nprime, sqr);
84986d7f5d3SJohn Marino 	  if ((nprime & (K2 - 1)) == 0)
85086d7f5d3SJohn Marino 	    break;
85186d7f5d3SJohn Marino 	  nprime = (nprime + K2 - 1) & -K2;
85286d7f5d3SJohn Marino 	  Nprime = nprime * GMP_LIMB_BITS;
85386d7f5d3SJohn Marino 	  /* warning: since nprime changed, K2 may change too! */
85486d7f5d3SJohn Marino 	}
85586d7f5d3SJohn Marino       TRACE (printf ("new maxLK=%d, Np=%ld, np=%ld\n", maxLK, Nprime, nprime));
85686d7f5d3SJohn Marino     }
85786d7f5d3SJohn Marino   ASSERT_ALWAYS (nprime < pl); /* otherwise we'll loop */
85886d7f5d3SJohn Marino 
85986d7f5d3SJohn Marino   T = TMP_ALLOC_LIMBS (2 * (nprime + 1));
86086d7f5d3SJohn Marino   Mp = Nprime >> k;
86186d7f5d3SJohn Marino 
86286d7f5d3SJohn Marino   TRACE (printf ("%ldx%ld limbs -> %d times %ldx%ld limbs (%1.2f)\n",
86386d7f5d3SJohn Marino 		pl, pl, K, nprime, nprime, 2.0 * (double) N / Nprime / K);
86486d7f5d3SJohn Marino 	 printf ("   temp space %ld\n", 2 * K * (nprime + 1)));
86586d7f5d3SJohn Marino 
86686d7f5d3SJohn Marino   A = TMP_ALLOC_LIMBS (K * (nprime + 1));
86786d7f5d3SJohn Marino   Ap = TMP_ALLOC_MP_PTRS (K);
86886d7f5d3SJohn Marino   mpn_mul_fft_decompose (A, Ap, K, nprime, n, nl, l, Mp, T);
86986d7f5d3SJohn Marino   if (sqr)
87086d7f5d3SJohn Marino     {
87186d7f5d3SJohn Marino       mp_size_t pla;
87286d7f5d3SJohn Marino       pla = l * (K - 1) + nprime + 1; /* number of required limbs for p */
87386d7f5d3SJohn Marino       B = TMP_ALLOC_LIMBS (pla);
87486d7f5d3SJohn Marino       Bp = TMP_ALLOC_MP_PTRS (K);
87586d7f5d3SJohn Marino     }
87686d7f5d3SJohn Marino   else
87786d7f5d3SJohn Marino     {
87886d7f5d3SJohn Marino       B = TMP_ALLOC_LIMBS (K * (nprime + 1));
87986d7f5d3SJohn Marino       Bp = TMP_ALLOC_MP_PTRS (K);
88086d7f5d3SJohn Marino       mpn_mul_fft_decompose (B, Bp, K, nprime, m, ml, l, Mp, T);
88186d7f5d3SJohn Marino     }
88286d7f5d3SJohn Marino   h = mpn_mul_fft_internal (op, pl, k, Ap, Bp, A, B, nprime, l, Mp, fft_l, T, sqr);
88386d7f5d3SJohn Marino 
88486d7f5d3SJohn Marino   TMP_FREE;
88586d7f5d3SJohn Marino   return h;
88686d7f5d3SJohn Marino }
88786d7f5d3SJohn Marino 
88886d7f5d3SJohn Marino #if WANT_OLD_FFT_FULL
88986d7f5d3SJohn Marino /* multiply {n, nl} by {m, ml}, and put the result in {op, nl+ml} */
89086d7f5d3SJohn Marino void
mpn_mul_fft_full(mp_ptr op,mp_srcptr n,mp_size_t nl,mp_srcptr m,mp_size_t ml)89186d7f5d3SJohn Marino mpn_mul_fft_full (mp_ptr op,
89286d7f5d3SJohn Marino 		  mp_srcptr n, mp_size_t nl,
89386d7f5d3SJohn Marino 		  mp_srcptr m, mp_size_t ml)
89486d7f5d3SJohn Marino {
89586d7f5d3SJohn Marino   mp_ptr pad_op;
89686d7f5d3SJohn Marino   mp_size_t pl, pl2, pl3, l;
89786d7f5d3SJohn Marino   int k2, k3;
89886d7f5d3SJohn Marino   int sqr = (n == m && nl == ml);
89986d7f5d3SJohn Marino   int cc, c2, oldcc;
90086d7f5d3SJohn Marino 
90186d7f5d3SJohn Marino   pl = nl + ml; /* total number of limbs of the result */
90286d7f5d3SJohn Marino 
90386d7f5d3SJohn Marino   /* perform a fft mod 2^(2N)+1 and one mod 2^(3N)+1.
90486d7f5d3SJohn Marino      We must have pl3 = 3/2 * pl2, with pl2 a multiple of 2^k2, and
90586d7f5d3SJohn Marino      pl3 a multiple of 2^k3. Since k3 >= k2, both are multiples of 2^k2,
90686d7f5d3SJohn Marino      and pl2 must be an even multiple of 2^k2. Thus (pl2,pl3) =
90786d7f5d3SJohn Marino      (2*j*2^k2,3*j*2^k2), which works for 3*j <= pl/2^k2 <= 5*j.
90886d7f5d3SJohn Marino      We need that consecutive intervals overlap, i.e. 5*j >= 3*(j+1),
90986d7f5d3SJohn Marino      which requires j>=2. Thus this scheme requires pl >= 6 * 2^FFT_FIRST_K. */
91086d7f5d3SJohn Marino 
91186d7f5d3SJohn Marino   /*  ASSERT_ALWAYS(pl >= 6 * (1 << FFT_FIRST_K)); */
91286d7f5d3SJohn Marino 
91386d7f5d3SJohn Marino   pl2 = (2 * pl - 1) / 5; /* ceil (2pl/5) - 1 */
91486d7f5d3SJohn Marino   do
91586d7f5d3SJohn Marino     {
91686d7f5d3SJohn Marino       pl2++;
91786d7f5d3SJohn Marino       k2 = mpn_fft_best_k (pl2, sqr); /* best fft size for pl2 limbs */
91886d7f5d3SJohn Marino       pl2 = mpn_fft_next_size (pl2, k2);
91986d7f5d3SJohn Marino       pl3 = 3 * pl2 / 2; /* since k>=FFT_FIRST_K=4, pl2 is a multiple of 2^4,
92086d7f5d3SJohn Marino 			    thus pl2 / 2 is exact */
92186d7f5d3SJohn Marino       k3 = mpn_fft_best_k (pl3, sqr);
92286d7f5d3SJohn Marino     }
92386d7f5d3SJohn Marino   while (mpn_fft_next_size (pl3, k3) != pl3);
92486d7f5d3SJohn Marino 
92586d7f5d3SJohn Marino   TRACE (printf ("mpn_mul_fft_full nl=%ld ml=%ld -> pl2=%ld pl3=%ld k=%d\n",
92686d7f5d3SJohn Marino 		 nl, ml, pl2, pl3, k2));
92786d7f5d3SJohn Marino 
92886d7f5d3SJohn Marino   ASSERT_ALWAYS(pl3 <= pl);
92986d7f5d3SJohn Marino   cc = mpn_mul_fft (op, pl3, n, nl, m, ml, k3);     /* mu */
93086d7f5d3SJohn Marino   ASSERT(cc == 0);
93186d7f5d3SJohn Marino   pad_op = __GMP_ALLOCATE_FUNC_LIMBS (pl2);
93286d7f5d3SJohn Marino   cc = mpn_mul_fft (pad_op, pl2, n, nl, m, ml, k2); /* lambda */
93386d7f5d3SJohn Marino   cc = -cc + mpn_sub_n (pad_op, pad_op, op, pl2);    /* lambda - low(mu) */
93486d7f5d3SJohn Marino   /* 0 <= cc <= 1 */
93586d7f5d3SJohn Marino   ASSERT(0 <= cc && cc <= 1);
93686d7f5d3SJohn Marino   l = pl3 - pl2; /* l = pl2 / 2 since pl3 = 3/2 * pl2 */
93786d7f5d3SJohn Marino   c2 = mpn_add_n (pad_op, pad_op, op + pl2, l);
93886d7f5d3SJohn Marino   cc = mpn_add_1 (pad_op + l, pad_op + l, l, (mp_limb_t) c2) - cc;
93986d7f5d3SJohn Marino   ASSERT(-1 <= cc && cc <= 1);
94086d7f5d3SJohn Marino   if (cc < 0)
94186d7f5d3SJohn Marino     cc = mpn_add_1 (pad_op, pad_op, pl2, (mp_limb_t) -cc);
94286d7f5d3SJohn Marino   ASSERT(0 <= cc && cc <= 1);
94386d7f5d3SJohn Marino   /* now lambda-mu = {pad_op, pl2} - cc mod 2^(pl2*GMP_NUMB_BITS)+1 */
94486d7f5d3SJohn Marino   oldcc = cc;
94586d7f5d3SJohn Marino #if HAVE_NATIVE_mpn_add_n_sub_n
94686d7f5d3SJohn Marino   c2 = mpn_add_n_sub_n (pad_op + l, pad_op, pad_op, pad_op + l, l);
94786d7f5d3SJohn Marino   /* c2 & 1 is the borrow, c2 & 2 is the carry */
94886d7f5d3SJohn Marino   cc += c2 >> 1; /* carry out from high <- low + high */
94986d7f5d3SJohn Marino   c2 = c2 & 1; /* borrow out from low <- low - high */
95086d7f5d3SJohn Marino #else
95186d7f5d3SJohn Marino   {
95286d7f5d3SJohn Marino     mp_ptr tmp;
95386d7f5d3SJohn Marino     TMP_DECL;
95486d7f5d3SJohn Marino 
95586d7f5d3SJohn Marino     TMP_MARK;
95686d7f5d3SJohn Marino     tmp = TMP_ALLOC_LIMBS (l);
95786d7f5d3SJohn Marino     MPN_COPY (tmp, pad_op, l);
95886d7f5d3SJohn Marino     c2 = mpn_sub_n (pad_op,      pad_op, pad_op + l, l);
95986d7f5d3SJohn Marino     cc += mpn_add_n (pad_op + l, tmp,    pad_op + l, l);
96086d7f5d3SJohn Marino     TMP_FREE;
96186d7f5d3SJohn Marino   }
96286d7f5d3SJohn Marino #endif
96386d7f5d3SJohn Marino   c2 += oldcc;
96486d7f5d3SJohn Marino   /* first normalize {pad_op, pl2} before dividing by 2: c2 is the borrow
96586d7f5d3SJohn Marino      at pad_op + l, cc is the carry at pad_op + pl2 */
96686d7f5d3SJohn Marino   /* 0 <= cc <= 2 */
96786d7f5d3SJohn Marino   cc -= mpn_sub_1 (pad_op + l, pad_op + l, l, (mp_limb_t) c2);
96886d7f5d3SJohn Marino   /* -1 <= cc <= 2 */
96986d7f5d3SJohn Marino   if (cc > 0)
97086d7f5d3SJohn Marino     cc = -mpn_sub_1 (pad_op, pad_op, pl2, (mp_limb_t) cc);
97186d7f5d3SJohn Marino   /* now -1 <= cc <= 0 */
97286d7f5d3SJohn Marino   if (cc < 0)
97386d7f5d3SJohn Marino     cc = mpn_add_1 (pad_op, pad_op, pl2, (mp_limb_t) -cc);
97486d7f5d3SJohn Marino   /* now {pad_op, pl2} is normalized, with 0 <= cc <= 1 */
97586d7f5d3SJohn Marino   if (pad_op[0] & 1) /* if odd, add 2^(pl2*GMP_NUMB_BITS)+1 */
97686d7f5d3SJohn Marino     cc += 1 + mpn_add_1 (pad_op, pad_op, pl2, CNST_LIMB(1));
97786d7f5d3SJohn Marino   /* now 0 <= cc <= 2, but cc=2 cannot occur since it would give a carry
97886d7f5d3SJohn Marino      out below */
97986d7f5d3SJohn Marino   mpn_rshift (pad_op, pad_op, pl2, 1); /* divide by two */
98086d7f5d3SJohn Marino   if (cc) /* then cc=1 */
98186d7f5d3SJohn Marino     pad_op [pl2 - 1] |= (mp_limb_t) 1 << (GMP_NUMB_BITS - 1);
98286d7f5d3SJohn Marino   /* now {pad_op,pl2}-cc = (lambda-mu)/(1-2^(l*GMP_NUMB_BITS))
98386d7f5d3SJohn Marino      mod 2^(pl2*GMP_NUMB_BITS) + 1 */
98486d7f5d3SJohn Marino   c2 = mpn_add_n (op, op, pad_op, pl2); /* no need to add cc (is 0) */
98586d7f5d3SJohn Marino   /* since pl2+pl3 >= pl, necessary the extra limbs (including cc) are zero */
98686d7f5d3SJohn Marino   MPN_COPY (op + pl3, pad_op, pl - pl3);
98786d7f5d3SJohn Marino   ASSERT_MPN_ZERO_P (pad_op + pl - pl3, pl2 + pl3 - pl);
98886d7f5d3SJohn Marino   __GMP_FREE_FUNC_LIMBS (pad_op, pl2);
98986d7f5d3SJohn Marino   /* since the final result has at most pl limbs, no carry out below */
99086d7f5d3SJohn Marino   mpn_add_1 (op + pl2, op + pl2, pl - pl2, (mp_limb_t) c2);
99186d7f5d3SJohn Marino }
99286d7f5d3SJohn Marino #endif
993