186d7f5d3SJohn Marino /* mpn_toom4_sqr -- Square {ap,an}.
286d7f5d3SJohn Marino
386d7f5d3SJohn Marino Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
486d7f5d3SJohn Marino
586d7f5d3SJohn Marino THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY
686d7f5d3SJohn Marino SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
786d7f5d3SJohn Marino GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
886d7f5d3SJohn Marino
986d7f5d3SJohn Marino Copyright 2006, 2007, 2008, 2009, 2010 Free Software Foundation, Inc.
1086d7f5d3SJohn Marino
1186d7f5d3SJohn Marino This file is part of the GNU MP Library.
1286d7f5d3SJohn Marino
1386d7f5d3SJohn Marino The GNU MP Library is free software; you can redistribute it and/or modify
1486d7f5d3SJohn Marino it under the terms of the GNU Lesser General Public License as published by
1586d7f5d3SJohn Marino the Free Software Foundation; either version 3 of the License, or (at your
1686d7f5d3SJohn Marino option) any later version.
1786d7f5d3SJohn Marino
1886d7f5d3SJohn Marino The GNU MP Library is distributed in the hope that it will be useful, but
1986d7f5d3SJohn Marino WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
2086d7f5d3SJohn Marino or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
2186d7f5d3SJohn Marino License for more details.
2286d7f5d3SJohn Marino
2386d7f5d3SJohn Marino You should have received a copy of the GNU Lesser General Public License
2486d7f5d3SJohn Marino along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
2586d7f5d3SJohn Marino
2686d7f5d3SJohn Marino
2786d7f5d3SJohn Marino #include "gmp.h"
2886d7f5d3SJohn Marino #include "gmp-impl.h"
2986d7f5d3SJohn Marino
3086d7f5d3SJohn Marino /* Evaluate in: -1, -1/2, 0, +1/2, +1, +2, +inf
3186d7f5d3SJohn Marino
3286d7f5d3SJohn Marino <-s--><--n--><--n--><--n-->
3386d7f5d3SJohn Marino ____ ______ ______ ______
3486d7f5d3SJohn Marino |_a3_|___a2_|___a1_|___a0_|
3586d7f5d3SJohn Marino
3686d7f5d3SJohn Marino v0 = a0 ^2 # A(0)^2
3786d7f5d3SJohn Marino v1 = ( a0+ a1+ a2+ a3)^2 # A(1)^2 ah <= 3
3886d7f5d3SJohn Marino vm1 = ( a0- a1+ a2- a3)^2 # A(-1)^2 |ah| <= 1
3986d7f5d3SJohn Marino v2 = ( a0+2a1+4a2+8a3)^2 # A(2)^2 ah <= 14
4086d7f5d3SJohn Marino vh = (8a0+4a1+2a2+ a3)^2 # A(1/2)^2 ah <= 14
4186d7f5d3SJohn Marino vmh = (8a0-4a1+2a2- a3)^2 # A(-1/2)^2 -4<=ah<=9
4286d7f5d3SJohn Marino vinf= a3 ^2 # A(inf)^2
4386d7f5d3SJohn Marino */
4486d7f5d3SJohn Marino
4586d7f5d3SJohn Marino #if TUNE_PROGRAM_BUILD
4686d7f5d3SJohn Marino #define MAYBE_sqr_basecase 1
4786d7f5d3SJohn Marino #define MAYBE_sqr_toom2 1
4886d7f5d3SJohn Marino #define MAYBE_sqr_toom4 1
4986d7f5d3SJohn Marino #else
5086d7f5d3SJohn Marino #define MAYBE_sqr_basecase \
5186d7f5d3SJohn Marino (SQR_TOOM4_THRESHOLD < 4 * SQR_TOOM2_THRESHOLD)
5286d7f5d3SJohn Marino #define MAYBE_sqr_toom2 \
5386d7f5d3SJohn Marino (SQR_TOOM4_THRESHOLD < 4 * SQR_TOOM3_THRESHOLD)
5486d7f5d3SJohn Marino #define MAYBE_sqr_toom4 \
5586d7f5d3SJohn Marino (SQR_FFT_THRESHOLD >= 4 * SQR_TOOM4_THRESHOLD)
5686d7f5d3SJohn Marino #endif
5786d7f5d3SJohn Marino
5886d7f5d3SJohn Marino #define TOOM4_SQR_REC(p, a, n, ws) \
5986d7f5d3SJohn Marino do { \
6086d7f5d3SJohn Marino if (MAYBE_sqr_basecase \
6186d7f5d3SJohn Marino && BELOW_THRESHOLD (n, SQR_TOOM2_THRESHOLD)) \
6286d7f5d3SJohn Marino mpn_sqr_basecase (p, a, n); \
6386d7f5d3SJohn Marino else if (MAYBE_sqr_toom2 \
6486d7f5d3SJohn Marino && BELOW_THRESHOLD (n, SQR_TOOM3_THRESHOLD)) \
6586d7f5d3SJohn Marino mpn_toom2_sqr (p, a, n, ws); \
6686d7f5d3SJohn Marino else if (! MAYBE_sqr_toom4 \
6786d7f5d3SJohn Marino || BELOW_THRESHOLD (n, SQR_TOOM4_THRESHOLD)) \
6886d7f5d3SJohn Marino mpn_toom3_sqr (p, a, n, ws); \
6986d7f5d3SJohn Marino else \
7086d7f5d3SJohn Marino mpn_toom4_sqr (p, a, n, ws); \
7186d7f5d3SJohn Marino } while (0)
7286d7f5d3SJohn Marino
7386d7f5d3SJohn Marino void
mpn_toom4_sqr(mp_ptr pp,mp_srcptr ap,mp_size_t an,mp_ptr scratch)7486d7f5d3SJohn Marino mpn_toom4_sqr (mp_ptr pp,
7586d7f5d3SJohn Marino mp_srcptr ap, mp_size_t an,
7686d7f5d3SJohn Marino mp_ptr scratch)
7786d7f5d3SJohn Marino {
7886d7f5d3SJohn Marino mp_size_t n, s;
7986d7f5d3SJohn Marino mp_limb_t cy;
8086d7f5d3SJohn Marino
8186d7f5d3SJohn Marino #define a0 ap
8286d7f5d3SJohn Marino #define a1 (ap + n)
8386d7f5d3SJohn Marino #define a2 (ap + 2*n)
8486d7f5d3SJohn Marino #define a3 (ap + 3*n)
8586d7f5d3SJohn Marino
8686d7f5d3SJohn Marino n = (an + 3) >> 2;
8786d7f5d3SJohn Marino
8886d7f5d3SJohn Marino s = an - 3 * n;
8986d7f5d3SJohn Marino
9086d7f5d3SJohn Marino ASSERT (0 < s && s <= n);
9186d7f5d3SJohn Marino
9286d7f5d3SJohn Marino /* NOTE: The multiplications to v2, vm2, vh and vm1 overwrites the
9386d7f5d3SJohn Marino * following limb, so these must be computed in order, and we need a
9486d7f5d3SJohn Marino * one limb gap to tp. */
9586d7f5d3SJohn Marino #define v0 pp /* 2n */
9686d7f5d3SJohn Marino #define v1 (pp + 2 * n) /* 2n+1 */
9786d7f5d3SJohn Marino #define vinf (pp + 6 * n) /* s+t */
9886d7f5d3SJohn Marino #define v2 scratch /* 2n+1 */
9986d7f5d3SJohn Marino #define vm2 (scratch + 2 * n + 1) /* 2n+1 */
10086d7f5d3SJohn Marino #define vh (scratch + 4 * n + 2) /* 2n+1 */
10186d7f5d3SJohn Marino #define vm1 (scratch + 6 * n + 3) /* 2n+1 */
10286d7f5d3SJohn Marino #define tp (scratch + 8*n + 5)
10386d7f5d3SJohn Marino
10486d7f5d3SJohn Marino /* No overlap with v1 */
10586d7f5d3SJohn Marino #define apx pp /* n+1 */
10686d7f5d3SJohn Marino #define amx (pp + 4*n + 2) /* n+1 */
10786d7f5d3SJohn Marino
10886d7f5d3SJohn Marino /* Total scratch need: 8*n + 5 + scratch for recursive calls. This
10986d7f5d3SJohn Marino gives roughly 32 n/3 + log term. */
11086d7f5d3SJohn Marino
11186d7f5d3SJohn Marino /* Compute apx = a0 + 2 a1 + 4 a2 + 8 a3 and amx = a0 - 2 a1 + 4 a2 - 8 a3. */
11286d7f5d3SJohn Marino mpn_toom_eval_dgr3_pm2 (apx, amx, ap, n, s, tp);
11386d7f5d3SJohn Marino
11486d7f5d3SJohn Marino TOOM4_SQR_REC (v2, apx, n + 1, tp); /* v2, 2n+1 limbs */
11586d7f5d3SJohn Marino TOOM4_SQR_REC (vm2, amx, n + 1, tp); /* vm2, 2n+1 limbs */
11686d7f5d3SJohn Marino
11786d7f5d3SJohn Marino /* Compute apx = 8 a0 + 4 a1 + 2 a2 + a3 = (((2*a0 + a1) * 2 + a2) * 2 + a3 */
11886d7f5d3SJohn Marino #if HAVE_NATIVE_mpn_addlsh1_n
11986d7f5d3SJohn Marino cy = mpn_addlsh1_n (apx, a1, a0, n);
12086d7f5d3SJohn Marino cy = 2*cy + mpn_addlsh1_n (apx, a2, apx, n);
12186d7f5d3SJohn Marino if (s < n)
12286d7f5d3SJohn Marino {
12386d7f5d3SJohn Marino mp_limb_t cy2;
12486d7f5d3SJohn Marino cy2 = mpn_addlsh1_n (apx, a3, apx, s);
12586d7f5d3SJohn Marino apx[n] = 2*cy + mpn_lshift (apx + s, apx + s, n - s, 1);
12686d7f5d3SJohn Marino MPN_INCR_U (apx + s, n+1-s, cy2);
12786d7f5d3SJohn Marino }
12886d7f5d3SJohn Marino else
12986d7f5d3SJohn Marino apx[n] = 2*cy + mpn_addlsh1_n (apx, a3, apx, n);
13086d7f5d3SJohn Marino #else
13186d7f5d3SJohn Marino cy = mpn_lshift (apx, a0, n, 1);
13286d7f5d3SJohn Marino cy += mpn_add_n (apx, apx, a1, n);
13386d7f5d3SJohn Marino cy = 2*cy + mpn_lshift (apx, apx, n, 1);
13486d7f5d3SJohn Marino cy += mpn_add_n (apx, apx, a2, n);
13586d7f5d3SJohn Marino cy = 2*cy + mpn_lshift (apx, apx, n, 1);
13686d7f5d3SJohn Marino apx[n] = cy + mpn_add (apx, apx, n, a3, s);
13786d7f5d3SJohn Marino #endif
13886d7f5d3SJohn Marino
13986d7f5d3SJohn Marino ASSERT (apx[n] < 15);
14086d7f5d3SJohn Marino
14186d7f5d3SJohn Marino TOOM4_SQR_REC (vh, apx, n + 1, tp); /* vh, 2n+1 limbs */
14286d7f5d3SJohn Marino
14386d7f5d3SJohn Marino /* Compute apx = a0 + a1 + a2 + a3 and amx = a0 - a1 + a2 - a3. */
14486d7f5d3SJohn Marino mpn_toom_eval_dgr3_pm1 (apx, amx, ap, n, s, tp);
14586d7f5d3SJohn Marino
14686d7f5d3SJohn Marino TOOM4_SQR_REC (v1, apx, n + 1, tp); /* v1, 2n+1 limbs */
14786d7f5d3SJohn Marino TOOM4_SQR_REC (vm1, amx, n + 1, tp); /* vm1, 2n+1 limbs */
14886d7f5d3SJohn Marino
14986d7f5d3SJohn Marino TOOM4_SQR_REC (v0, a0, n, tp);
15086d7f5d3SJohn Marino TOOM4_SQR_REC (vinf, a3, s, tp); /* vinf, 2s limbs */
15186d7f5d3SJohn Marino
15286d7f5d3SJohn Marino mpn_toom_interpolate_7pts (pp, n, 0, vm2, vm1, v2, vh, 2*s, tp);
15386d7f5d3SJohn Marino }
154