/* mpn_toom33_mul -- Multiply {ap,an} and {bp,bn} where an and bn are close in
   size.  Or more accurately, bn <= an < (3/2)bn.

   Contributed to the GNU project by Torbjorn Granlund.
   Additional improvements by Marco Bodrato.

   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

Copyright 2006, 2007, 2008, 2010 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at your
option) any later version.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */


#include "gmp.h"
#include "gmp-impl.h"

/* Evaluate in: -1, 0, +1, +2, +inf

  <-s--><--n--><--n-->
   ____ ______ ______
  |_a2_|___a1_|___a0_|
   |b2_|___b1_|___b0_|
   <-t-><--n--><--n-->

  v0  =  a0         * b0          #   A(0)*B(0)
  v1  = (a0+ a1+ a2)*(b0+ b1+ b2) #   A(1)*B(1)      ah  <= 2  bh <= 2
  vm1 = (a0- a1+ a2)*(b0- b1+ b2) #  A(-1)*B(-1)    |ah| <= 1  bh <= 1
  v2  = (a0+2a1+4a2)*(b0+2b1+4b2) #   A(2)*B(2)      ah  <= 6  bh <= 6
  vinf=          a2 *         b2  # A(inf)*B(inf)
*/

/* Compile-time switches consulted by TOOM33_MUL_N_REC to decide which
   recursive multipliers can be reached.  In a tune build both are forced
   to 1 so that every branch stays available; otherwise they are derived
   from the relationship between the multiplication threshold constants.  */
#if TUNE_PROGRAM_BUILD
#define MAYBE_mul_basecase 1
#define MAYBE_mul_toom33   1
#else
#define MAYBE_mul_basecase \
  (MUL_TOOM33_THRESHOLD < 3 * MUL_TOOM22_THRESHOLD)
#define MAYBE_mul_toom33 \
  (MUL_TOOM44_THRESHOLD >= 3 * MUL_TOOM33_THRESHOLD)
#endif

/* FIXME: TOOM33_MUL_N_REC is not quite right for a balanced
   multiplication at the infinity point. We may have
   MAYBE_mul_basecase == 0, and still get s just below
   MUL_TOOM22_THRESHOLD. If MUL_TOOM33_THRESHOLD == 7, we can even get
   s == 1 and mpn_toom22_mul will crash.
*/

/* Balanced n x n limb multiply {p,2n} = {a,n} * {b,n} used for the point
   products.  Dispatches on n:
     - mpn_mul_basecase when that branch is enabled and n is below
       MUL_TOOM22_THRESHOLD;
     - mpn_toom22_mul when recursion into toom33 is disabled or n is below
       MUL_TOOM33_THRESHOLD;
     - mpn_toom33_mul (recursively) otherwise.
   ws is the scratch area handed to the toom routines; the basecase branch
   does not use it.  */
#define TOOM33_MUL_N_REC(p, a, b, n, ws)                                \
  do {                                                                  \
    if (MAYBE_mul_basecase                                              \
        && BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD))                   \
      mpn_mul_basecase (p, a, n, b, n);                                 \
    else if (! MAYBE_mul_toom33                                         \
             || BELOW_THRESHOLD (n, MUL_TOOM33_THRESHOLD))              \
      mpn_toom22_mul (p, a, n, b, n, ws);                               \
    else                                                                \
      mpn_toom33_mul (p, a, n, b, n, ws);                               \
  } while (0)

7686d7f5d3SJohn Marino void
mpn_toom33_mul(mp_ptr pp,mp_srcptr ap,mp_size_t an,mp_srcptr bp,mp_size_t bn,mp_ptr scratch)7786d7f5d3SJohn Marino mpn_toom33_mul (mp_ptr pp,
7886d7f5d3SJohn Marino mp_srcptr ap, mp_size_t an,
7986d7f5d3SJohn Marino mp_srcptr bp, mp_size_t bn,
8086d7f5d3SJohn Marino mp_ptr scratch)
8186d7f5d3SJohn Marino {
8286d7f5d3SJohn Marino mp_size_t n, s, t;
8386d7f5d3SJohn Marino int vm1_neg;
8486d7f5d3SJohn Marino mp_limb_t cy, vinf0;
8586d7f5d3SJohn Marino mp_ptr gp;
8686d7f5d3SJohn Marino mp_ptr as1, asm1, as2;
8786d7f5d3SJohn Marino mp_ptr bs1, bsm1, bs2;
8886d7f5d3SJohn Marino
8986d7f5d3SJohn Marino #define a0 ap
9086d7f5d3SJohn Marino #define a1 (ap + n)
9186d7f5d3SJohn Marino #define a2 (ap + 2*n)
9286d7f5d3SJohn Marino #define b0 bp
9386d7f5d3SJohn Marino #define b1 (bp + n)
9486d7f5d3SJohn Marino #define b2 (bp + 2*n)
9586d7f5d3SJohn Marino
9686d7f5d3SJohn Marino n = (an + 2) / (size_t) 3;
9786d7f5d3SJohn Marino
9886d7f5d3SJohn Marino s = an - 2 * n;
9986d7f5d3SJohn Marino t = bn - 2 * n;
10086d7f5d3SJohn Marino
10186d7f5d3SJohn Marino ASSERT (an >= bn);
10286d7f5d3SJohn Marino
10386d7f5d3SJohn Marino ASSERT (0 < s && s <= n);
10486d7f5d3SJohn Marino ASSERT (0 < t && t <= n);
10586d7f5d3SJohn Marino
10686d7f5d3SJohn Marino as1 = scratch + 4 * n + 4;
10786d7f5d3SJohn Marino asm1 = scratch + 2 * n + 2;
10886d7f5d3SJohn Marino as2 = pp + n + 1;
10986d7f5d3SJohn Marino
11086d7f5d3SJohn Marino bs1 = pp;
11186d7f5d3SJohn Marino bsm1 = scratch + 3 * n + 3; /* we need 4n+4 <= 4n+s+t */
11286d7f5d3SJohn Marino bs2 = pp + 2 * n + 2;
11386d7f5d3SJohn Marino
11486d7f5d3SJohn Marino gp = scratch;
11586d7f5d3SJohn Marino
11686d7f5d3SJohn Marino vm1_neg = 0;
11786d7f5d3SJohn Marino
11886d7f5d3SJohn Marino /* Compute as1 and asm1. */
11986d7f5d3SJohn Marino cy = mpn_add (gp, a0, n, a2, s);
12086d7f5d3SJohn Marino #if HAVE_NATIVE_mpn_add_n_sub_n
12186d7f5d3SJohn Marino if (cy == 0 && mpn_cmp (gp, a1, n) < 0)
12286d7f5d3SJohn Marino {
12386d7f5d3SJohn Marino cy = mpn_add_n_sub_n (as1, asm1, a1, gp, n);
12486d7f5d3SJohn Marino as1[n] = cy >> 1;
12586d7f5d3SJohn Marino asm1[n] = 0;
12686d7f5d3SJohn Marino vm1_neg = 1;
12786d7f5d3SJohn Marino }
12886d7f5d3SJohn Marino else
12986d7f5d3SJohn Marino {
13086d7f5d3SJohn Marino mp_limb_t cy2;
13186d7f5d3SJohn Marino cy2 = mpn_add_n_sub_n (as1, asm1, gp, a1, n);
13286d7f5d3SJohn Marino as1[n] = cy + (cy2 >> 1);
13386d7f5d3SJohn Marino asm1[n] = cy - (cy2 & 1);
13486d7f5d3SJohn Marino }
13586d7f5d3SJohn Marino #else
13686d7f5d3SJohn Marino as1[n] = cy + mpn_add_n (as1, gp, a1, n);
13786d7f5d3SJohn Marino if (cy == 0 && mpn_cmp (gp, a1, n) < 0)
13886d7f5d3SJohn Marino {
13986d7f5d3SJohn Marino mpn_sub_n (asm1, a1, gp, n);
14086d7f5d3SJohn Marino asm1[n] = 0;
14186d7f5d3SJohn Marino vm1_neg = 1;
14286d7f5d3SJohn Marino }
14386d7f5d3SJohn Marino else
14486d7f5d3SJohn Marino {
14586d7f5d3SJohn Marino cy -= mpn_sub_n (asm1, gp, a1, n);
14686d7f5d3SJohn Marino asm1[n] = cy;
14786d7f5d3SJohn Marino }
14886d7f5d3SJohn Marino #endif
14986d7f5d3SJohn Marino
15086d7f5d3SJohn Marino /* Compute as2. */
15186d7f5d3SJohn Marino #if HAVE_NATIVE_mpn_rsblsh1_n
15286d7f5d3SJohn Marino cy = mpn_add_n (as2, a2, as1, s);
15386d7f5d3SJohn Marino if (s != n)
15486d7f5d3SJohn Marino cy = mpn_add_1 (as2 + s, as1 + s, n - s, cy);
15586d7f5d3SJohn Marino cy += as1[n];
15686d7f5d3SJohn Marino cy = 2 * cy + mpn_rsblsh1_n (as2, a0, as2, n);
15786d7f5d3SJohn Marino #else
15886d7f5d3SJohn Marino #if HAVE_NATIVE_mpn_addlsh1_n
15986d7f5d3SJohn Marino cy = mpn_addlsh1_n (as2, a1, a2, s);
16086d7f5d3SJohn Marino if (s != n)
16186d7f5d3SJohn Marino cy = mpn_add_1 (as2 + s, a1 + s, n - s, cy);
16286d7f5d3SJohn Marino cy = 2 * cy + mpn_addlsh1_n (as2, a0, as2, n);
16386d7f5d3SJohn Marino #else
16486d7f5d3SJohn Marino cy = mpn_add_n (as2, a2, as1, s);
16586d7f5d3SJohn Marino if (s != n)
16686d7f5d3SJohn Marino cy = mpn_add_1 (as2 + s, as1 + s, n - s, cy);
16786d7f5d3SJohn Marino cy += as1[n];
16886d7f5d3SJohn Marino cy = 2 * cy + mpn_lshift (as2, as2, n, 1);
16986d7f5d3SJohn Marino cy -= mpn_sub_n (as2, as2, a0, n);
17086d7f5d3SJohn Marino #endif
17186d7f5d3SJohn Marino #endif
17286d7f5d3SJohn Marino as2[n] = cy;
17386d7f5d3SJohn Marino
17486d7f5d3SJohn Marino /* Compute bs1 and bsm1. */
17586d7f5d3SJohn Marino cy = mpn_add (gp, b0, n, b2, t);
17686d7f5d3SJohn Marino #if HAVE_NATIVE_mpn_add_n_sub_n
17786d7f5d3SJohn Marino if (cy == 0 && mpn_cmp (gp, b1, n) < 0)
17886d7f5d3SJohn Marino {
17986d7f5d3SJohn Marino cy = mpn_add_n_sub_n (bs1, bsm1, b1, gp, n);
18086d7f5d3SJohn Marino bs1[n] = cy >> 1;
18186d7f5d3SJohn Marino bsm1[n] = 0;
18286d7f5d3SJohn Marino vm1_neg ^= 1;
18386d7f5d3SJohn Marino }
18486d7f5d3SJohn Marino else
18586d7f5d3SJohn Marino {
18686d7f5d3SJohn Marino mp_limb_t cy2;
18786d7f5d3SJohn Marino cy2 = mpn_add_n_sub_n (bs1, bsm1, gp, b1, n);
18886d7f5d3SJohn Marino bs1[n] = cy + (cy2 >> 1);
18986d7f5d3SJohn Marino bsm1[n] = cy - (cy2 & 1);
19086d7f5d3SJohn Marino }
19186d7f5d3SJohn Marino #else
19286d7f5d3SJohn Marino bs1[n] = cy + mpn_add_n (bs1, gp, b1, n);
19386d7f5d3SJohn Marino if (cy == 0 && mpn_cmp (gp, b1, n) < 0)
19486d7f5d3SJohn Marino {
19586d7f5d3SJohn Marino mpn_sub_n (bsm1, b1, gp, n);
19686d7f5d3SJohn Marino bsm1[n] = 0;
19786d7f5d3SJohn Marino vm1_neg ^= 1;
19886d7f5d3SJohn Marino }
19986d7f5d3SJohn Marino else
20086d7f5d3SJohn Marino {
20186d7f5d3SJohn Marino cy -= mpn_sub_n (bsm1, gp, b1, n);
20286d7f5d3SJohn Marino bsm1[n] = cy;
20386d7f5d3SJohn Marino }
20486d7f5d3SJohn Marino #endif
20586d7f5d3SJohn Marino
20686d7f5d3SJohn Marino /* Compute bs2. */
20786d7f5d3SJohn Marino #if HAVE_NATIVE_mpn_rsblsh1_n
20886d7f5d3SJohn Marino cy = mpn_add_n (bs2, b2, bs1, t);
20986d7f5d3SJohn Marino if (t != n)
21086d7f5d3SJohn Marino cy = mpn_add_1 (bs2 + t, bs1 + t, n - t, cy);
21186d7f5d3SJohn Marino cy += bs1[n];
21286d7f5d3SJohn Marino cy = 2 * cy + mpn_rsblsh1_n (bs2, b0, bs2, n);
21386d7f5d3SJohn Marino #else
21486d7f5d3SJohn Marino #if HAVE_NATIVE_mpn_addlsh1_n
21586d7f5d3SJohn Marino cy = mpn_addlsh1_n (bs2, b1, b2, t);
21686d7f5d3SJohn Marino if (t != n)
21786d7f5d3SJohn Marino cy = mpn_add_1 (bs2 + t, b1 + t, n - t, cy);
21886d7f5d3SJohn Marino cy = 2 * cy + mpn_addlsh1_n (bs2, b0, bs2, n);
21986d7f5d3SJohn Marino #else
22086d7f5d3SJohn Marino cy = mpn_add_n (bs2, bs1, b2, t);
22186d7f5d3SJohn Marino if (t != n)
22286d7f5d3SJohn Marino cy = mpn_add_1 (bs2 + t, bs1 + t, n - t, cy);
22386d7f5d3SJohn Marino cy += bs1[n];
22486d7f5d3SJohn Marino cy = 2 * cy + mpn_lshift (bs2, bs2, n, 1);
22586d7f5d3SJohn Marino cy -= mpn_sub_n (bs2, bs2, b0, n);
22686d7f5d3SJohn Marino #endif
22786d7f5d3SJohn Marino #endif
22886d7f5d3SJohn Marino bs2[n] = cy;
22986d7f5d3SJohn Marino
23086d7f5d3SJohn Marino ASSERT (as1[n] <= 2);
23186d7f5d3SJohn Marino ASSERT (bs1[n] <= 2);
23286d7f5d3SJohn Marino ASSERT (asm1[n] <= 1);
23386d7f5d3SJohn Marino ASSERT (bsm1[n] <= 1);
23486d7f5d3SJohn Marino ASSERT (as2[n] <= 6);
23586d7f5d3SJohn Marino ASSERT (bs2[n] <= 6);
23686d7f5d3SJohn Marino
23786d7f5d3SJohn Marino #define v0 pp /* 2n */
23886d7f5d3SJohn Marino #define v1 (pp + 2 * n) /* 2n+1 */
23986d7f5d3SJohn Marino #define vinf (pp + 4 * n) /* s+t */
24086d7f5d3SJohn Marino #define vm1 scratch /* 2n+1 */
24186d7f5d3SJohn Marino #define v2 (scratch + 2 * n + 1) /* 2n+2 */
24286d7f5d3SJohn Marino #define scratch_out (scratch + 5 * n + 5)
24386d7f5d3SJohn Marino
24486d7f5d3SJohn Marino /* vm1, 2n+1 limbs */
24586d7f5d3SJohn Marino #ifdef SMALLER_RECURSION
24686d7f5d3SJohn Marino TOOM33_MUL_N_REC (vm1, asm1, bsm1, n, scratch_out);
24786d7f5d3SJohn Marino cy = 0;
24886d7f5d3SJohn Marino if (asm1[n] != 0)
24986d7f5d3SJohn Marino cy = bsm1[n] + mpn_add_n (vm1 + n, vm1 + n, bsm1, n);
25086d7f5d3SJohn Marino if (bsm1[n] != 0)
25186d7f5d3SJohn Marino cy += mpn_add_n (vm1 + n, vm1 + n, asm1, n);
25286d7f5d3SJohn Marino vm1[2 * n] = cy;
25386d7f5d3SJohn Marino #else
25486d7f5d3SJohn Marino TOOM33_MUL_N_REC (vm1, asm1, bsm1, n + 1, scratch_out);
25586d7f5d3SJohn Marino #endif
25686d7f5d3SJohn Marino
25786d7f5d3SJohn Marino TOOM33_MUL_N_REC (v2, as2, bs2, n + 1, scratch_out); /* v2, 2n+1 limbs */
25886d7f5d3SJohn Marino
25986d7f5d3SJohn Marino /* vinf, s+t limbs */
26086d7f5d3SJohn Marino if (s > t) mpn_mul (vinf, a2, s, b2, t);
26186d7f5d3SJohn Marino else TOOM33_MUL_N_REC (vinf, a2, b2, s, scratch_out);
26286d7f5d3SJohn Marino
26386d7f5d3SJohn Marino vinf0 = vinf[0]; /* v1 overlaps with this */
26486d7f5d3SJohn Marino
26586d7f5d3SJohn Marino #ifdef SMALLER_RECURSION
26686d7f5d3SJohn Marino /* v1, 2n+1 limbs */
26786d7f5d3SJohn Marino TOOM33_MUL_N_REC (v1, as1, bs1, n, scratch_out);
26886d7f5d3SJohn Marino if (as1[n] == 1)
26986d7f5d3SJohn Marino {
27086d7f5d3SJohn Marino cy = bs1[n] + mpn_add_n (v1 + n, v1 + n, bs1, n);
27186d7f5d3SJohn Marino }
27286d7f5d3SJohn Marino else if (as1[n] != 0)
27386d7f5d3SJohn Marino {
27486d7f5d3SJohn Marino #if HAVE_NATIVE_mpn_addlsh1_n
27586d7f5d3SJohn Marino cy = 2 * bs1[n] + mpn_addlsh1_n (v1 + n, v1 + n, bs1, n);
27686d7f5d3SJohn Marino #else
27786d7f5d3SJohn Marino cy = 2 * bs1[n] + mpn_addmul_1 (v1 + n, bs1, n, CNST_LIMB(2));
27886d7f5d3SJohn Marino #endif
27986d7f5d3SJohn Marino }
28086d7f5d3SJohn Marino else
28186d7f5d3SJohn Marino cy = 0;
28286d7f5d3SJohn Marino if (bs1[n] == 1)
28386d7f5d3SJohn Marino {
28486d7f5d3SJohn Marino cy += mpn_add_n (v1 + n, v1 + n, as1, n);
28586d7f5d3SJohn Marino }
28686d7f5d3SJohn Marino else if (bs1[n] != 0)
28786d7f5d3SJohn Marino {
28886d7f5d3SJohn Marino #if HAVE_NATIVE_mpn_addlsh1_n
28986d7f5d3SJohn Marino cy += mpn_addlsh1_n (v1 + n, v1 + n, as1, n);
29086d7f5d3SJohn Marino #else
29186d7f5d3SJohn Marino cy += mpn_addmul_1 (v1 + n, as1, n, CNST_LIMB(2));
29286d7f5d3SJohn Marino #endif
29386d7f5d3SJohn Marino }
29486d7f5d3SJohn Marino v1[2 * n] = cy;
29586d7f5d3SJohn Marino #else
29686d7f5d3SJohn Marino cy = vinf[1];
29786d7f5d3SJohn Marino TOOM33_MUL_N_REC (v1, as1, bs1, n + 1, scratch_out);
29886d7f5d3SJohn Marino vinf[1] = cy;
29986d7f5d3SJohn Marino #endif
30086d7f5d3SJohn Marino
30186d7f5d3SJohn Marino TOOM33_MUL_N_REC (v0, ap, bp, n, scratch_out); /* v0, 2n limbs */
30286d7f5d3SJohn Marino
30386d7f5d3SJohn Marino mpn_toom_interpolate_5pts (pp, v2, vm1, n, s + t, vm1_neg, vinf0);
30486d7f5d3SJohn Marino }
305