xref: /dflybsd-src/contrib/gmp/mpn/generic/toom3_sqr.c (revision 86d7f5d305c6adaa56ff4582ece9859d73106103)
/* mpn_toom3_sqr -- Square {ap,an}.

   Contributed to the GNU project by Torbjorn Granlund.
   Additional improvements by Marco Bodrato.

   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

Copyright 2006, 2007, 2008, 2009, 2010 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at your
option) any later version.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
2686d7f5d3SJohn Marino 
2786d7f5d3SJohn Marino 
2886d7f5d3SJohn Marino #include "gmp.h"
2986d7f5d3SJohn Marino #include "gmp-impl.h"
3086d7f5d3SJohn Marino 
/* Evaluate in: -1, 0, +1, +2, +inf

  <-s--><--n--><--n-->
   ____ ______ ______
  |_a2_|___a1_|___a0_|

  v0  =  a0         ^2 #   A(0)^2
  v1  = (a0+ a1+ a2)^2 #   A(1)^2    ah  <= 2
  vm1 = (a0- a1+ a2)^2 #  A(-1)^2   |ah| <= 1
  v2  = (a0+2a1+4a2)^2 #   A(2)^2    ah  <= 6
  vinf=          a2 ^2 # A(inf)^2
*/
4386d7f5d3SJohn Marino 
/* Compile-time gates for the branches of TOOM3_SQR_REC.

   In a tuning build (TUNE_PROGRAM_BUILD) both are 1, so every branch stays
   selectable.  Otherwise they are derived from the squaring thresholds:
   MAYBE_sqr_basecase is non-zero only when SQR_TOOM3_THRESHOLD is below
   3 * SQR_TOOM2_THRESHOLD, and MAYBE_sqr_toom3 only when
   SQR_TOOM4_THRESHOLD is at least 3 * SQR_TOOM3_THRESHOLD.
   NOTE(review): the factor 3 presumably reflects that the recursive calls
   operate on roughly an/3 limbs -- confirm against the callers' size
   ranges.  */
#if TUNE_PROGRAM_BUILD
#define MAYBE_sqr_basecase 1
#define MAYBE_sqr_toom3   1
#else
#define MAYBE_sqr_basecase						\
  (SQR_TOOM3_THRESHOLD < 3 * SQR_TOOM2_THRESHOLD)
#define MAYBE_sqr_toom3							\
  (SQR_TOOM4_THRESHOLD >= 3 * SQR_TOOM3_THRESHOLD)
#endif
5386d7f5d3SJohn Marino 
/* Square the n-limb operand {a,n} into p, picking the routine by size:

     - mpn_sqr_basecase when MAYBE_sqr_basecase is non-zero and n is below
       SQR_TOOM2_THRESHOLD;
     - mpn_toom2_sqr when MAYBE_sqr_toom3 is zero, or n is below
       SQR_TOOM3_THRESHOLD;
     - mpn_toom3_sqr (recursively) otherwise.

   ws is the scratch pointer forwarded to the Toom routines; the basecase
   routine is called without it.  The arguments are expanded more than once,
   so they must be free of side effects.  */
#define TOOM3_SQR_REC(p, a, n, ws)					\
  do {									\
    if (MAYBE_sqr_basecase						\
	&& BELOW_THRESHOLD (n, SQR_TOOM2_THRESHOLD))			\
      mpn_sqr_basecase (p, a, n);					\
    else if (! MAYBE_sqr_toom3						\
	     || BELOW_THRESHOLD (n, SQR_TOOM3_THRESHOLD))		\
      mpn_toom2_sqr (p, a, n, ws);					\
    else								\
      mpn_toom3_sqr (p, a, n, ws);					\
  } while (0)
6586d7f5d3SJohn Marino 
6686d7f5d3SJohn Marino void
mpn_toom3_sqr(mp_ptr pp,mp_srcptr ap,mp_size_t an,mp_ptr scratch)6786d7f5d3SJohn Marino mpn_toom3_sqr (mp_ptr pp,
6886d7f5d3SJohn Marino 	       mp_srcptr ap, mp_size_t an,
6986d7f5d3SJohn Marino 	       mp_ptr scratch)
7086d7f5d3SJohn Marino {
7186d7f5d3SJohn Marino   mp_size_t n, s;
7286d7f5d3SJohn Marino   mp_limb_t cy, vinf0;
7386d7f5d3SJohn Marino   mp_ptr gp;
7486d7f5d3SJohn Marino   mp_ptr as1, asm1, as2;
7586d7f5d3SJohn Marino 
7686d7f5d3SJohn Marino #define a0  ap
7786d7f5d3SJohn Marino #define a1  (ap + n)
7886d7f5d3SJohn Marino #define a2  (ap + 2*n)
7986d7f5d3SJohn Marino 
8086d7f5d3SJohn Marino   n = (an + 2) / (size_t) 3;
8186d7f5d3SJohn Marino 
8286d7f5d3SJohn Marino   s = an - 2 * n;
8386d7f5d3SJohn Marino 
8486d7f5d3SJohn Marino   ASSERT (0 < s && s <= n);
8586d7f5d3SJohn Marino 
8686d7f5d3SJohn Marino   as1 = scratch + 4 * n + 4;
8786d7f5d3SJohn Marino   asm1 = scratch + 2 * n + 2;
8886d7f5d3SJohn Marino   as2 = pp + n + 1;
8986d7f5d3SJohn Marino 
9086d7f5d3SJohn Marino   gp = scratch;
9186d7f5d3SJohn Marino 
9286d7f5d3SJohn Marino   /* Compute as1 and asm1.  */
9386d7f5d3SJohn Marino   cy = mpn_add (gp, a0, n, a2, s);
9486d7f5d3SJohn Marino #if HAVE_NATIVE_mpn_add_n_sub_n
9586d7f5d3SJohn Marino   if (cy == 0 && mpn_cmp (gp, a1, n) < 0)
9686d7f5d3SJohn Marino     {
9786d7f5d3SJohn Marino       cy = mpn_add_n_sub_n (as1, asm1, a1, gp, n);
9886d7f5d3SJohn Marino       as1[n] = cy >> 1;
9986d7f5d3SJohn Marino       asm1[n] = 0;
10086d7f5d3SJohn Marino     }
10186d7f5d3SJohn Marino   else
10286d7f5d3SJohn Marino     {
10386d7f5d3SJohn Marino       mp_limb_t cy2;
10486d7f5d3SJohn Marino       cy2 = mpn_add_n_sub_n (as1, asm1, gp, a1, n);
10586d7f5d3SJohn Marino       as1[n] = cy + (cy2 >> 1);
10686d7f5d3SJohn Marino       asm1[n] = cy - (cy2 & 1);
10786d7f5d3SJohn Marino     }
10886d7f5d3SJohn Marino #else
10986d7f5d3SJohn Marino   as1[n] = cy + mpn_add_n (as1, gp, a1, n);
11086d7f5d3SJohn Marino   if (cy == 0 && mpn_cmp (gp, a1, n) < 0)
11186d7f5d3SJohn Marino     {
11286d7f5d3SJohn Marino       mpn_sub_n (asm1, a1, gp, n);
11386d7f5d3SJohn Marino       asm1[n] = 0;
11486d7f5d3SJohn Marino     }
11586d7f5d3SJohn Marino   else
11686d7f5d3SJohn Marino     {
11786d7f5d3SJohn Marino       cy -= mpn_sub_n (asm1, gp, a1, n);
11886d7f5d3SJohn Marino       asm1[n] = cy;
11986d7f5d3SJohn Marino     }
12086d7f5d3SJohn Marino #endif
12186d7f5d3SJohn Marino 
12286d7f5d3SJohn Marino   /* Compute as2.  */
12386d7f5d3SJohn Marino #if HAVE_NATIVE_mpn_rsblsh1_n
12486d7f5d3SJohn Marino   cy = mpn_add_n (as2, a2, as1, s);
12586d7f5d3SJohn Marino   if (s != n)
12686d7f5d3SJohn Marino     cy = mpn_add_1 (as2 + s, as1 + s, n - s, cy);
12786d7f5d3SJohn Marino   cy += as1[n];
12886d7f5d3SJohn Marino   cy = 2 * cy + mpn_rsblsh1_n (as2, a0, as2, n);
12986d7f5d3SJohn Marino #else
13086d7f5d3SJohn Marino #if HAVE_NATIVE_mpn_addlsh1_n
13186d7f5d3SJohn Marino   cy  = mpn_addlsh1_n (as2, a1, a2, s);
13286d7f5d3SJohn Marino   if (s != n)
13386d7f5d3SJohn Marino     cy = mpn_add_1 (as2 + s, a1 + s, n - s, cy);
13486d7f5d3SJohn Marino   cy = 2 * cy + mpn_addlsh1_n (as2, a0, as2, n);
13586d7f5d3SJohn Marino #else
13686d7f5d3SJohn Marino   cy = mpn_add_n (as2, a2, as1, s);
13786d7f5d3SJohn Marino   if (s != n)
13886d7f5d3SJohn Marino     cy = mpn_add_1 (as2 + s, as1 + s, n - s, cy);
13986d7f5d3SJohn Marino   cy += as1[n];
14086d7f5d3SJohn Marino   cy = 2 * cy + mpn_lshift (as2, as2, n, 1);
14186d7f5d3SJohn Marino   cy -= mpn_sub_n (as2, as2, a0, n);
14286d7f5d3SJohn Marino #endif
14386d7f5d3SJohn Marino #endif
14486d7f5d3SJohn Marino   as2[n] = cy;
14586d7f5d3SJohn Marino 
14686d7f5d3SJohn Marino   ASSERT (as1[n] <= 2);
14786d7f5d3SJohn Marino   ASSERT (asm1[n] <= 1);
14886d7f5d3SJohn Marino 
14986d7f5d3SJohn Marino #define v0    pp				/* 2n */
15086d7f5d3SJohn Marino #define v1    (pp + 2 * n)			/* 2n+1 */
15186d7f5d3SJohn Marino #define vinf  (pp + 4 * n)			/* s+s */
15286d7f5d3SJohn Marino #define vm1   scratch				/* 2n+1 */
15386d7f5d3SJohn Marino #define v2    (scratch + 2 * n + 1)		/* 2n+2 */
15486d7f5d3SJohn Marino #define scratch_out  (scratch + 5 * n + 5)
15586d7f5d3SJohn Marino 
15686d7f5d3SJohn Marino   /* vm1, 2n+1 limbs */
15786d7f5d3SJohn Marino #ifdef SMALLER_RECURSION
15886d7f5d3SJohn Marino   TOOM3_SQR_REC (vm1, asm1, n, scratch_out);
15986d7f5d3SJohn Marino   cy = 0;
16086d7f5d3SJohn Marino   if (asm1[n] != 0)
16186d7f5d3SJohn Marino     cy = asm1[n] + mpn_add_n (vm1 + n, vm1 + n, asm1, n);
16286d7f5d3SJohn Marino   if (asm1[n] != 0)
16386d7f5d3SJohn Marino     cy += mpn_add_n (vm1 + n, vm1 + n, asm1, n);
16486d7f5d3SJohn Marino   vm1[2 * n] = cy;
16586d7f5d3SJohn Marino #else
16686d7f5d3SJohn Marino   TOOM3_SQR_REC (vm1, asm1, n + 1, scratch_out);
16786d7f5d3SJohn Marino #endif
16886d7f5d3SJohn Marino 
16986d7f5d3SJohn Marino   TOOM3_SQR_REC (v2, as2, n + 1, scratch_out);	/* v2, 2n+1 limbs */
17086d7f5d3SJohn Marino 
17186d7f5d3SJohn Marino   TOOM3_SQR_REC (vinf, a2, s, scratch_out);	/* vinf, s+s limbs */
17286d7f5d3SJohn Marino 
17386d7f5d3SJohn Marino   vinf0 = vinf[0];				/* v1 overlaps with this */
17486d7f5d3SJohn Marino 
17586d7f5d3SJohn Marino #ifdef SMALLER_RECURSION
17686d7f5d3SJohn Marino   /* v1, 2n+1 limbs */
17786d7f5d3SJohn Marino   TOOM3_SQR_REC (v1, as1, n, scratch_out);
17886d7f5d3SJohn Marino   if (as1[n] == 1)
17986d7f5d3SJohn Marino     {
18086d7f5d3SJohn Marino       cy = as1[n] + mpn_add_n (v1 + n, v1 + n, as1, n);
18186d7f5d3SJohn Marino     }
18286d7f5d3SJohn Marino   else if (as1[n] != 0)
18386d7f5d3SJohn Marino     {
18486d7f5d3SJohn Marino #if HAVE_NATIVE_mpn_addlsh1_n
18586d7f5d3SJohn Marino       cy = 2 * as1[n] + mpn_addlsh1_n (v1 + n, v1 + n, as1, n);
18686d7f5d3SJohn Marino #else
18786d7f5d3SJohn Marino       cy = 2 * as1[n] + mpn_addmul_1 (v1 + n, as1, n, CNST_LIMB(2));
18886d7f5d3SJohn Marino #endif
18986d7f5d3SJohn Marino     }
19086d7f5d3SJohn Marino   else
19186d7f5d3SJohn Marino     cy = 0;
19286d7f5d3SJohn Marino   if (as1[n] == 1)
19386d7f5d3SJohn Marino     {
19486d7f5d3SJohn Marino       cy += mpn_add_n (v1 + n, v1 + n, as1, n);
19586d7f5d3SJohn Marino     }
19686d7f5d3SJohn Marino   else if (as1[n] != 0)
19786d7f5d3SJohn Marino     {
19886d7f5d3SJohn Marino #if HAVE_NATIVE_mpn_addlsh1_n
19986d7f5d3SJohn Marino       cy += mpn_addlsh1_n (v1 + n, v1 + n, as1, n);
20086d7f5d3SJohn Marino #else
20186d7f5d3SJohn Marino       cy += mpn_addmul_1 (v1 + n, as1, n, CNST_LIMB(2));
20286d7f5d3SJohn Marino #endif
20386d7f5d3SJohn Marino     }
20486d7f5d3SJohn Marino   v1[2 * n] = cy;
20586d7f5d3SJohn Marino #else
20686d7f5d3SJohn Marino   cy = vinf[1];
20786d7f5d3SJohn Marino   TOOM3_SQR_REC (v1, as1, n + 1, scratch_out);
20886d7f5d3SJohn Marino   vinf[1] = cy;
20986d7f5d3SJohn Marino #endif
21086d7f5d3SJohn Marino 
21186d7f5d3SJohn Marino   TOOM3_SQR_REC (v0, ap, n, scratch_out);	/* v0, 2n limbs */
21286d7f5d3SJohn Marino 
21386d7f5d3SJohn Marino   mpn_toom_interpolate_5pts (pp, v2, vm1, n, s + s, 0, vinf0);
21486d7f5d3SJohn Marino }
215