186d7f5d3SJohn Marino /* mpn_toom44_mul -- Multiply {ap,an} and {bp,bn} where an and bn are close in
286d7f5d3SJohn Marino size. Or more accurately, bn <= an < (4/3)bn.
386d7f5d3SJohn Marino
486d7f5d3SJohn Marino Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
586d7f5d3SJohn Marino
686d7f5d3SJohn Marino THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY
786d7f5d3SJohn Marino SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
886d7f5d3SJohn Marino GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
986d7f5d3SJohn Marino
1086d7f5d3SJohn Marino Copyright 2006, 2007, 2008 Free Software Foundation, Inc.
1186d7f5d3SJohn Marino
1286d7f5d3SJohn Marino This file is part of the GNU MP Library.
1386d7f5d3SJohn Marino
1486d7f5d3SJohn Marino The GNU MP Library is free software; you can redistribute it and/or modify
1586d7f5d3SJohn Marino it under the terms of the GNU Lesser General Public License as published by
1686d7f5d3SJohn Marino the Free Software Foundation; either version 3 of the License, or (at your
1786d7f5d3SJohn Marino option) any later version.
1886d7f5d3SJohn Marino
1986d7f5d3SJohn Marino The GNU MP Library is distributed in the hope that it will be useful, but
2086d7f5d3SJohn Marino WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
2186d7f5d3SJohn Marino or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
2286d7f5d3SJohn Marino License for more details.
2386d7f5d3SJohn Marino
2486d7f5d3SJohn Marino You should have received a copy of the GNU Lesser General Public License
2586d7f5d3SJohn Marino along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
2686d7f5d3SJohn Marino
2786d7f5d3SJohn Marino
2886d7f5d3SJohn Marino #include "gmp.h"
2986d7f5d3SJohn Marino #include "gmp-impl.h"
3086d7f5d3SJohn Marino
3186d7f5d3SJohn Marino /* Evaluate in: 0, +1, -1, +2, -2, 1/2, +inf
3286d7f5d3SJohn Marino
3386d7f5d3SJohn Marino <-s--><--n--><--n--><--n-->
3486d7f5d3SJohn Marino ____ ______ ______ ______
3586d7f5d3SJohn Marino |_a3_|___a2_|___a1_|___a0_|
3686d7f5d3SJohn Marino |b3_|___b2_|___b1_|___b0_|
3786d7f5d3SJohn Marino <-t-><--n--><--n--><--n-->
3886d7f5d3SJohn Marino
3986d7f5d3SJohn Marino v0 = a0 * b0 # A(0)*B(0)
4086d7f5d3SJohn Marino v1 = ( a0+ a1+ a2+ a3)*( b0+ b1+ b2+ b3) # A(1)*B(1) ah <= 3 bh <= 3
4186d7f5d3SJohn Marino vm1 = ( a0- a1+ a2- a3)*( b0- b1+ b2- b3) # A(-1)*B(-1) |ah| <= 1 |bh| <= 1
4286d7f5d3SJohn Marino v2 = ( a0+2a1+4a2+8a3)*( b0+2b1+4b2+8b3) # A(2)*B(2) ah <= 14 bh <= 14
4386d7f5d3SJohn Marino vm2 = ( a0-2a1+4a2-8a3)*( b0-2b1+4b2-8b3) # A(2)*B(2) ah <= 9 |bh| <= 9
4486d7f5d3SJohn Marino vh = (8a0+4a1+2a2+ a3)*(8b0+4b1+2b2+ b3) # A(1/2)*B(1/2) ah <= 14 bh <= 14
4586d7f5d3SJohn Marino vinf= a3 * b2 # A(inf)*B(inf)
4686d7f5d3SJohn Marino */
4786d7f5d3SJohn Marino
4886d7f5d3SJohn Marino #if TUNE_PROGRAM_BUILD
4986d7f5d3SJohn Marino #define MAYBE_mul_basecase 1
5086d7f5d3SJohn Marino #define MAYBE_mul_toom22 1
5186d7f5d3SJohn Marino #define MAYBE_mul_toom44 1
5286d7f5d3SJohn Marino #else
5386d7f5d3SJohn Marino #define MAYBE_mul_basecase \
5486d7f5d3SJohn Marino (MUL_TOOM44_THRESHOLD < 4 * MUL_TOOM22_THRESHOLD)
5586d7f5d3SJohn Marino #define MAYBE_mul_toom22 \
5686d7f5d3SJohn Marino (MUL_TOOM44_THRESHOLD < 4 * MUL_TOOM33_THRESHOLD)
5786d7f5d3SJohn Marino #define MAYBE_mul_toom44 \
5886d7f5d3SJohn Marino (MUL_FFT_THRESHOLD >= 4 * MUL_TOOM44_THRESHOLD)
5986d7f5d3SJohn Marino #endif
6086d7f5d3SJohn Marino
6186d7f5d3SJohn Marino #define TOOM44_MUL_N_REC(p, a, b, n, ws) \
6286d7f5d3SJohn Marino do { \
6386d7f5d3SJohn Marino if (MAYBE_mul_basecase \
6486d7f5d3SJohn Marino && BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD)) \
6586d7f5d3SJohn Marino mpn_mul_basecase (p, a, n, b, n); \
6686d7f5d3SJohn Marino else if (MAYBE_mul_toom22 \
6786d7f5d3SJohn Marino && BELOW_THRESHOLD (n, MUL_TOOM33_THRESHOLD)) \
6886d7f5d3SJohn Marino mpn_toom22_mul (p, a, n, b, n, ws); \
6986d7f5d3SJohn Marino else if (! MAYBE_mul_toom44 \
7086d7f5d3SJohn Marino || BELOW_THRESHOLD (n, MUL_TOOM44_THRESHOLD)) \
7186d7f5d3SJohn Marino mpn_toom33_mul (p, a, n, b, n, ws); \
7286d7f5d3SJohn Marino else \
7386d7f5d3SJohn Marino mpn_toom44_mul (p, a, n, b, n, ws); \
7486d7f5d3SJohn Marino } while (0)
7586d7f5d3SJohn Marino
7686d7f5d3SJohn Marino /* Use of scratch space. In the product area, we store
7786d7f5d3SJohn Marino
7886d7f5d3SJohn Marino ___________________
7986d7f5d3SJohn Marino |vinf|____|_v1_|_v0_|
8086d7f5d3SJohn Marino s+t 2n-1 2n+1 2n
8186d7f5d3SJohn Marino
8286d7f5d3SJohn Marino The other recursive products, vm1, v2, vm2, vh are stored in the
8386d7f5d3SJohn Marino scratch area. When computing them, we use the product area for
8486d7f5d3SJohn Marino intermediate values.
8586d7f5d3SJohn Marino
8686d7f5d3SJohn Marino Next, we compute v1. We can store the intermediate factors at v0
8786d7f5d3SJohn Marino and at vh + 2n + 2.
8886d7f5d3SJohn Marino
8986d7f5d3SJohn Marino Finally, for v0 and vinf, factors are parts of the input operands,
9086d7f5d3SJohn Marino and we need scratch space only for the recursive multiplication.
9186d7f5d3SJohn Marino
9286d7f5d3SJohn Marino In all, if S(an) is the scratch need, the needed space is bounded by
9386d7f5d3SJohn Marino
9486d7f5d3SJohn Marino S(an) <= 4 (2*ceil(an/4) + 1) + 1 + S(ceil(an/4) + 1)
9586d7f5d3SJohn Marino
9686d7f5d3SJohn Marino which should give S(n) = 8 n/3 + c log(n) for some constant c.
9786d7f5d3SJohn Marino */
9886d7f5d3SJohn Marino
9986d7f5d3SJohn Marino void
mpn_toom44_mul(mp_ptr pp,mp_srcptr ap,mp_size_t an,mp_srcptr bp,mp_size_t bn,mp_ptr scratch)10086d7f5d3SJohn Marino mpn_toom44_mul (mp_ptr pp,
10186d7f5d3SJohn Marino mp_srcptr ap, mp_size_t an,
10286d7f5d3SJohn Marino mp_srcptr bp, mp_size_t bn,
10386d7f5d3SJohn Marino mp_ptr scratch)
10486d7f5d3SJohn Marino {
10586d7f5d3SJohn Marino mp_size_t n, s, t;
10686d7f5d3SJohn Marino mp_limb_t cy;
10786d7f5d3SJohn Marino enum toom7_flags flags;
10886d7f5d3SJohn Marino
10986d7f5d3SJohn Marino #define a0 ap
11086d7f5d3SJohn Marino #define a1 (ap + n)
11186d7f5d3SJohn Marino #define a2 (ap + 2*n)
11286d7f5d3SJohn Marino #define a3 (ap + 3*n)
11386d7f5d3SJohn Marino #define b0 bp
11486d7f5d3SJohn Marino #define b1 (bp + n)
11586d7f5d3SJohn Marino #define b2 (bp + 2*n)
11686d7f5d3SJohn Marino #define b3 (bp + 3*n)
11786d7f5d3SJohn Marino
11886d7f5d3SJohn Marino ASSERT (an >= bn);
11986d7f5d3SJohn Marino
12086d7f5d3SJohn Marino n = (an + 3) >> 2;
12186d7f5d3SJohn Marino
12286d7f5d3SJohn Marino s = an - 3 * n;
12386d7f5d3SJohn Marino t = bn - 3 * n;
12486d7f5d3SJohn Marino
12586d7f5d3SJohn Marino ASSERT (0 < s && s <= n);
12686d7f5d3SJohn Marino ASSERT (0 < t && t <= n);
12786d7f5d3SJohn Marino ASSERT (s >= t);
12886d7f5d3SJohn Marino
12986d7f5d3SJohn Marino /* NOTE: The multiplications to v2, vm2, vh and vm1 overwrites the
13086d7f5d3SJohn Marino * following limb, so these must be computed in order, and we need a
13186d7f5d3SJohn Marino * one limb gap to tp. */
13286d7f5d3SJohn Marino #define v0 pp /* 2n */
13386d7f5d3SJohn Marino #define v1 (pp + 2 * n) /* 2n+1 */
13486d7f5d3SJohn Marino #define vinf (pp + 6 * n) /* s+t */
13586d7f5d3SJohn Marino #define v2 scratch /* 2n+1 */
13686d7f5d3SJohn Marino #define vm2 (scratch + 2 * n + 1) /* 2n+1 */
13786d7f5d3SJohn Marino #define vh (scratch + 4 * n + 2) /* 2n+1 */
13886d7f5d3SJohn Marino #define vm1 (scratch + 6 * n + 3) /* 2n+1 */
13986d7f5d3SJohn Marino #define tp (scratch + 8*n + 5)
14086d7f5d3SJohn Marino
14186d7f5d3SJohn Marino /* apx and bpx must not overlap with v1 */
14286d7f5d3SJohn Marino #define apx pp /* n+1 */
14386d7f5d3SJohn Marino #define amx (pp + n + 1) /* n+1 */
14486d7f5d3SJohn Marino #define bmx (pp + 2*n + 2) /* n+1 */
14586d7f5d3SJohn Marino #define bpx (pp + 4*n + 2) /* n+1 */
14686d7f5d3SJohn Marino
14786d7f5d3SJohn Marino /* Total scratch need: 8*n + 5 + scratch for recursive calls. This
14886d7f5d3SJohn Marino gives roughly 32 n/3 + log term. */
14986d7f5d3SJohn Marino
15086d7f5d3SJohn Marino /* Compute apx = a0 + 2 a1 + 4 a2 + 8 a3 and amx = a0 - 2 a1 + 4 a2 - 8 a3. */
15186d7f5d3SJohn Marino flags = toom7_w1_neg & mpn_toom_eval_dgr3_pm2 (apx, amx, ap, n, s, tp);
15286d7f5d3SJohn Marino
15386d7f5d3SJohn Marino /* Compute bpx = b0 + 2 b1 + 4 b2 + 8 b3 and bmx = b0 - 2 b1 + 4 b2 - 8 b3. */
15486d7f5d3SJohn Marino flags ^= toom7_w1_neg & mpn_toom_eval_dgr3_pm2 (bpx, bmx, bp, n, t, tp);
15586d7f5d3SJohn Marino
15686d7f5d3SJohn Marino TOOM44_MUL_N_REC (v2, apx, bpx, n + 1, tp); /* v2, 2n+1 limbs */
15786d7f5d3SJohn Marino TOOM44_MUL_N_REC (vm2, amx, bmx, n + 1, tp); /* vm2, 2n+1 limbs */
15886d7f5d3SJohn Marino
15986d7f5d3SJohn Marino /* Compute apx = 8 a0 + 4 a1 + 2 a2 + a3 = (((2*a0 + a1) * 2 + a2) * 2 + a3 */
16086d7f5d3SJohn Marino #if HAVE_NATIVE_mpn_addlsh1_n
16186d7f5d3SJohn Marino cy = mpn_addlsh1_n (apx, a1, a0, n);
16286d7f5d3SJohn Marino cy = 2*cy + mpn_addlsh1_n (apx, a2, apx, n);
16386d7f5d3SJohn Marino if (s < n)
16486d7f5d3SJohn Marino {
16586d7f5d3SJohn Marino mp_limb_t cy2;
16686d7f5d3SJohn Marino cy2 = mpn_addlsh1_n (apx, a3, apx, s);
16786d7f5d3SJohn Marino apx[n] = 2*cy + mpn_lshift (apx + s, apx + s, n - s, 1);
16886d7f5d3SJohn Marino MPN_INCR_U (apx + s, n+1-s, cy2);
16986d7f5d3SJohn Marino }
17086d7f5d3SJohn Marino else
17186d7f5d3SJohn Marino apx[n] = 2*cy + mpn_addlsh1_n (apx, a3, apx, n);
17286d7f5d3SJohn Marino #else
17386d7f5d3SJohn Marino cy = mpn_lshift (apx, a0, n, 1);
17486d7f5d3SJohn Marino cy += mpn_add_n (apx, apx, a1, n);
17586d7f5d3SJohn Marino cy = 2*cy + mpn_lshift (apx, apx, n, 1);
17686d7f5d3SJohn Marino cy += mpn_add_n (apx, apx, a2, n);
17786d7f5d3SJohn Marino cy = 2*cy + mpn_lshift (apx, apx, n, 1);
17886d7f5d3SJohn Marino apx[n] = cy + mpn_add (apx, apx, n, a3, s);
17986d7f5d3SJohn Marino #endif
18086d7f5d3SJohn Marino
18186d7f5d3SJohn Marino /* Compute bpx = 8 b0 + 4 b1 + 2 b2 + b3 = (((2*b0 + b1) * 2 + b2) * 2 + b3 */
18286d7f5d3SJohn Marino #if HAVE_NATIVE_mpn_addlsh1_n
18386d7f5d3SJohn Marino cy = mpn_addlsh1_n (bpx, b1, b0, n);
18486d7f5d3SJohn Marino cy = 2*cy + mpn_addlsh1_n (bpx, b2, bpx, n);
18586d7f5d3SJohn Marino if (t < n)
18686d7f5d3SJohn Marino {
18786d7f5d3SJohn Marino mp_limb_t cy2;
18886d7f5d3SJohn Marino cy2 = mpn_addlsh1_n (bpx, b3, bpx, t);
18986d7f5d3SJohn Marino bpx[n] = 2*cy + mpn_lshift (bpx + t, bpx + t, n - t, 1);
19086d7f5d3SJohn Marino MPN_INCR_U (bpx + t, n+1-t, cy2);
19186d7f5d3SJohn Marino }
19286d7f5d3SJohn Marino else
19386d7f5d3SJohn Marino bpx[n] = 2*cy + mpn_addlsh1_n (bpx, b3, bpx, n);
19486d7f5d3SJohn Marino #else
19586d7f5d3SJohn Marino cy = mpn_lshift (bpx, b0, n, 1);
19686d7f5d3SJohn Marino cy += mpn_add_n (bpx, bpx, b1, n);
19786d7f5d3SJohn Marino cy = 2*cy + mpn_lshift (bpx, bpx, n, 1);
19886d7f5d3SJohn Marino cy += mpn_add_n (bpx, bpx, b2, n);
19986d7f5d3SJohn Marino cy = 2*cy + mpn_lshift (bpx, bpx, n, 1);
20086d7f5d3SJohn Marino bpx[n] = cy + mpn_add (bpx, bpx, n, b3, t);
20186d7f5d3SJohn Marino #endif
20286d7f5d3SJohn Marino
20386d7f5d3SJohn Marino ASSERT (apx[n] < 15);
20486d7f5d3SJohn Marino ASSERT (bpx[n] < 15);
20586d7f5d3SJohn Marino
20686d7f5d3SJohn Marino TOOM44_MUL_N_REC (vh, apx, bpx, n + 1, tp); /* vh, 2n+1 limbs */
20786d7f5d3SJohn Marino
20886d7f5d3SJohn Marino /* Compute apx = a0 + a1 + a2 + a3 and amx = a0 - a1 + a2 - a3. */
20986d7f5d3SJohn Marino flags |= toom7_w3_neg & mpn_toom_eval_dgr3_pm1 (apx, amx, ap, n, s, tp);
21086d7f5d3SJohn Marino
21186d7f5d3SJohn Marino /* Compute bpx = b0 + b1 + b2 + b3 bnd bmx = b0 - b1 + b2 - b3. */
21286d7f5d3SJohn Marino flags ^= toom7_w3_neg & mpn_toom_eval_dgr3_pm1 (bpx, bmx, bp, n, t, tp);
21386d7f5d3SJohn Marino
21486d7f5d3SJohn Marino TOOM44_MUL_N_REC (vm1, amx, bmx, n + 1, tp); /* vm1, 2n+1 limbs */
21586d7f5d3SJohn Marino /* Clobbers amx, bmx. */
21686d7f5d3SJohn Marino TOOM44_MUL_N_REC (v1, apx, bpx, n + 1, tp); /* v1, 2n+1 limbs */
21786d7f5d3SJohn Marino
21886d7f5d3SJohn Marino TOOM44_MUL_N_REC (v0, a0, b0, n, tp);
21986d7f5d3SJohn Marino if (s > t)
22086d7f5d3SJohn Marino mpn_mul (vinf, a3, s, b3, t);
22186d7f5d3SJohn Marino else
22286d7f5d3SJohn Marino TOOM44_MUL_N_REC (vinf, a3, b3, s, tp); /* vinf, s+t limbs */
22386d7f5d3SJohn Marino
22486d7f5d3SJohn Marino mpn_toom_interpolate_7pts (pp, n, flags, vm2, vm1, v2, vh, s + t, tp);
22586d7f5d3SJohn Marino }
226