/* mpn_toom32_mul -- Multiply {ap,an} and {bp,bn} where an is nominally 1.5
   times as large as bn.  Or more accurately, bn < an < 3bn.

   Contributed to the GNU project by Torbjorn Granlund.
   Improvements by Marco Bodrato and Niels Möller.

   The idea of applying toom to unbalanced multiplication is due to Marco
   Bodrato and Alberto Zanoni.

   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

Copyright 2006, 2007, 2008, 2009, 2010 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at your
option) any later version.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */


#include "gmp.h"
#include "gmp-impl.h"

/* Evaluate in: -1, 0, +1, +inf

  <-s-><--n--><--n-->
   ___ ______ ______
  |a2_|___a1_|___a0_|
       |_b1_|___b0_|
       <-t--><--n-->

  v0  =  a0         * b0      #   A(0)*B(0)
  v1  = (a0+ a1+ a2)*(b0+ b1) #   A(1)*B(1)      ah  <= 2  bh <= 1
  vm1 = (a0- a1+ a2)*(b0- b1) #  A(-1)*B(-1)    |ah| <= 1  bh = 0
  vinf=          a2 * b1      # A(inf)*B(inf)
*/
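
/* Recombination used below: writing the product as
   A*B = x0 + x1*B^n + x2*B^(2n) + x3*B^(3n), with x0 = a0*b0,
   x1 = a1*b0 + a0*b1, x2 = a2*b0 + a1*b1 and x3 = a2*b1, we have
   x0 = v0, x3 = vinf, x0 + x2 = (v1 + vm1)/2 (taking vm1 with its sign)
   and x1 + x3 = v1 - (x0 + x2).  */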

#define TOOM32_MUL_N_REC(p, a, b, n, ws)	\
  do {						\
    mpn_mul_n (p, a, b, n);			\
  } while (0)
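/* For now TOOM32_MUL_N_REC is plain mpn_mul_n; the ws (scratch) argument is
   unused, presumably left as a hook for a tuned recursive multiplication.  */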

void
mpn_toom32_mul (mp_ptr pp,
                mp_srcptr ap, mp_size_t an,
                mp_srcptr bp, mp_size_t bn,
                mp_ptr scratch)
{
  mp_size_t n, s, t;
  int vm1_neg;
  mp_limb_t cy;
  int hi;
  mp_limb_t ap1_hi, bp1_hi;

#define a0  ap
#define a1  (ap + n)
#define a2  (ap + 2 * n)
#define b0  bp
#define b1  (bp + n)

  /* Required, to ensure that s + t >= n. */
  ASSERT (bn + 2 <= an && an + 6 <= 3*bn);

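  /* n = max (ceil (an/3), ceil (bn/2)): a splits into two low blocks of n
     limbs plus a high block of s limbs, b into one low block of n limbs
     plus a high block of t limbs.  */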
  n = 1 + (2 * an >= 3 * bn ? (an - 1) / (size_t) 3 : (bn - 1) >> 1);

  s = an - 2 * n;
  t = bn - n;

  ASSERT (0 < s && s <= n);
  ASSERT (0 < t && t <= n);
  ASSERT (s + t >= n);

  /* Product area of size an + bn = 3*n + s + t >= 4*n + 2. */
#define ap1 (pp)		/* n, most significant limb in ap1_hi */
#define bp1 (pp + n)		/* n, most significant bit in bp1_hi */
#define am1 (pp + 2*n)		/* n, most significant bit in hi */
#define bm1 (pp + 3*n)		/* n */
#define v1 (scratch)		/* 2n + 1 */
#define vm1 (pp)		/* 2n + 1 */
#define scratch_out (scratch + 2*n + 1) /* Currently unused. */

  /* Scratch need: 2*n + 1 + scratch for the recursive multiplications. */

  /* FIXME: Keep v1[2*n] and vm1[2*n] in scalar variables? */

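  /* All of ap1, bp1, am1, bm1 and vm1 live in the product area pp; each
     region is overwritten only after the values stored there have been
     consumed.  */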
  /* Compute ap1 = a0 + a1 + a2, am1 = a0 - a1 + a2 */
  ap1_hi = mpn_add (ap1, a0, n, a2, s);
#if HAVE_NATIVE_mpn_add_n_sub_n
  if (ap1_hi == 0 && mpn_cmp (ap1, a1, n) < 0)
    {
      ap1_hi = mpn_add_n_sub_n (ap1, am1, a1, ap1, n) >> 1;
      hi = 0;
      vm1_neg = 1;
    }
  else
    {
      cy = mpn_add_n_sub_n (ap1, am1, ap1, a1, n);
      hi = ap1_hi - (cy & 1);
      ap1_hi += (cy >> 1);
      vm1_neg = 0;
    }
#else
  if (ap1_hi == 0 && mpn_cmp (ap1, a1, n) < 0)
    {
      ASSERT_NOCARRY (mpn_sub_n (am1, a1, ap1, n));
      hi = 0;
      vm1_neg = 1;
    }
  else
    {
      hi = ap1_hi - mpn_sub_n (am1, ap1, a1, n);
      vm1_neg = 0;
    }
  ap1_hi += mpn_add_n (ap1, ap1, a1, n);
#endif
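  /* Now ap1 + ap1_hi*B^n = a0 + a1 + a2 (ap1_hi <= 2), and
     am1 + hi*B^n = |a0 - a1 + a2| (hi is 0 or 1), with the sign of A(-1)
     recorded in vm1_neg.  */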

  /* Compute bp1 = b0 + b1 and bm1 = b0 - b1. */
  if (t == n)
    {
#if HAVE_NATIVE_mpn_add_n_sub_n
      if (mpn_cmp (b0, b1, n) < 0)
        {
          cy = mpn_add_n_sub_n (bp1, bm1, b1, b0, n);
          vm1_neg ^= 1;
        }
      else
        {
          cy = mpn_add_n_sub_n (bp1, bm1, b0, b1, n);
        }
      bp1_hi = cy >> 1;
#else
      bp1_hi = mpn_add_n (bp1, b0, b1, n);

      if (mpn_cmp (b0, b1, n) < 0)
        {
          ASSERT_NOCARRY (mpn_sub_n (bm1, b1, b0, n));
          vm1_neg ^= 1;
        }
      else
        {
          ASSERT_NOCARRY (mpn_sub_n (bm1, b0, b1, n));
        }
#endif
    }
  else
    {
      /* FIXME: Should still use mpn_add_n_sub_n for the main part. */
      bp1_hi = mpn_add (bp1, b0, n, b1, t);

      if (mpn_zero_p (b0 + t, n - t) && mpn_cmp (b0, b1, t) < 0)
        {
          ASSERT_NOCARRY (mpn_sub_n (bm1, b1, b0, t));
          MPN_ZERO (bm1 + t, n - t);
          vm1_neg ^= 1;
        }
      else
        {
          ASSERT_NOCARRY (mpn_sub (bm1, b0, n, b1, t));
        }
    }
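  /* Now bp1 + bp1_hi*B^n = b0 + b1 (bp1_hi <= 1) and bm1 = |b0 - b1|; the
     sign of B(-1) has been folded into vm1_neg, so vm1_neg gives the sign
     of A(-1)*B(-1).  Next form v1 = A(1)*B(1) from the n x n product,
     correcting for the high limbs ap1_hi and bp1_hi.  */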

  TOOM32_MUL_N_REC (v1, ap1, bp1, n, scratch_out);
  if (ap1_hi == 1)
    {
      cy = bp1_hi + mpn_add_n (v1 + n, v1 + n, bp1, n);
    }
  else if (ap1_hi == 2)
    {
#if HAVE_NATIVE_mpn_addlsh1_n
      cy = 2 * bp1_hi + mpn_addlsh1_n (v1 + n, v1 + n, bp1, n);
#else
      cy = 2 * bp1_hi + mpn_addmul_1 (v1 + n, bp1, n, CNST_LIMB(2));
#endif
    }
  else
    cy = 0;
  if (bp1_hi != 0)
    cy += mpn_add_n (v1 + n, v1 + n, ap1, n);
  v1[2 * n] = cy;

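  /* vm1 = |A(-1)| * |B(-1)|, correcting for am1's high limb hi; the sign of
     the product is vm1_neg.  */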
  TOOM32_MUL_N_REC (vm1, am1, bm1, n, scratch_out);
  if (hi)
    hi = mpn_add_n (vm1+n, vm1+n, bm1, n);

  vm1[2*n] = hi;

  /* v1 <-- (v1 + vm1) / 2 = x0 + x2 */
  if (vm1_neg)
    {
#if HAVE_NATIVE_mpn_rsh1sub_n
      mpn_rsh1sub_n (v1, v1, vm1, 2*n+1);
#else
      mpn_sub_n (v1, v1, vm1, 2*n+1);
      ASSERT_NOCARRY (mpn_rshift (v1, v1, 2*n+1, 1));
#endif
    }
  else
    {
#if HAVE_NATIVE_mpn_rsh1add_n
      mpn_rsh1add_n (v1, v1, vm1, 2*n+1);
#else
      mpn_add_n (v1, v1, vm1, 2*n+1);
      ASSERT_NOCARRY (mpn_rshift (v1, v1, 2*n+1, 1));
#endif
    }

  /* We get x1 + x3 = (x0 + x2) - (x0 - x1 + x2 - x3), and hence

     y = x1 + x3 + (x0 + x2) * B
       = (x0 + x2) * B + (x0 + x2) - vm1.

     y is 3*n + 1 limbs, y = y0 + y1 B + y2 B^2.  We store them as
     follows: y0 at scratch, y1 at pp + 2*n, and y2 at scratch + n
     (already in place, except for carry propagation).

     We thus add

      B^3  B^2   B    1
       |    |    |    |
      +-----+----+
    + |  x0 + x2 |
      +----+-----+----+
    +      |  x0 + x2 |
           +----------+
    -      |   vm1    |
    --+----++----+----+-
      | y2  | y1 | y0 |
      +-----+----+----+

     Since we store y0 at the same location as the low half of x0 + x2, we
     need to do the middle sum first. */

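  /* The middle sum y1 = low (x0 + x2) + high (x0 + x2) goes to pp + 2*n,
     which overwrites vm1[2*n]; that limb is therefore saved in hi first.
     The carry and the top limb of x0 + x2 are folded into y2 at
     scratch + n.  */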
  hi = vm1[2*n];
  cy = mpn_add_n (pp + 2*n, v1, v1 + n, n);
  MPN_INCR_U (v1 + n, n + 1, cy + v1[2*n]);

  /* FIXME: Can we get rid of this second vm1_neg conditional by
     swapping the location of +1 and -1 values? */
  if (vm1_neg)
    {
      cy = mpn_add_n (v1, v1, vm1, n);
      hi += mpn_add_nc (pp + 2*n, pp + 2*n, vm1 + n, n, cy);
      MPN_INCR_U (v1 + n, n+1, hi);
    }
  else
    {
      cy = mpn_sub_n (v1, v1, vm1, n);
      hi += mpn_sub_nc (pp + 2*n, pp + 2*n, vm1 + n, n, cy);
      MPN_DECR_U (v1 + n, n+1, hi);
    }

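  /* vm1 has now been consumed, so the low part of the product area can
     receive v0 = a0 * b0, while pp + 3*n receives the unbalanced product
     vinf = a2 * b1.  */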
  TOOM32_MUL_N_REC (pp, a0, b0, n, scratch_out);
  /* vinf, s+t limbs.  Use mpn_mul for now, to handle unbalanced operands */
  if (s > t)  mpn_mul (pp+3*n, a2, s, b1, t);
  else        mpn_mul (pp+3*n, b1, t, a2, s);

  /* Remaining interpolation.

     y * B + x0 + x3 B^3 - x0 B^2 - x3 B
     = (x1 + x3) B + (x0 + x2) B^2 + x0 + x3 B^3 - x0 B^2 - x3 B
     = y0 B + y1 B^2 + y2 B^3 + Lx0 + H x0 B
       + L x3 B^3 + H x3 B^4 - Lx0 B^2 - H x0 B^3 - L x3 B - H x3 B^2
     = L x0 + (y0 + H x0 - L x3) B + (y1 - L x0 - H x3) B^2
       + (y2 - (H x0 - L x3)) B^3 + H x3 B^4

        B^4       B^3       B^2        B         1
  |         |         |         |         |         |
    +-------+                   +---------+---------+
    |  Hx3  |                   | Hx0-Lx3 |   Lx0   |
    +-------+---------+---------+---------+---------+
            |    y2   |    y1   |   y0    |
            ++--------+---------+---------+
           - | Hx0-Lx3 |  - Lx0 |
             +---------+--------+
                       | - Hx3  |
                       +--------+

     We must take into account the carry from Hx0 - Lx3.
  */

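  /* At this point pp holds Lx0, Hx0, y1, Lx3, Hx3 (at pp, pp + n, pp + 2*n,
     pp + 3*n and pp + 4*n respectively), while scratch holds y0 and y2 (at
     scratch and scratch + n, with y2's top limb in scratch[2*n]).  The code
     below forms Hx0 - Lx3 in place at pp + n, subtracts Lx0 from y1, writes
     y2 - (Hx0 - Lx3) to pp + 3*n, adds y0 at position B, and finally
     subtracts Hx3, tracking the accumulated carries and borrows in hi.  */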
  cy = mpn_sub_n (pp + n, pp + n, pp+3*n, n);
  hi = scratch[2*n] + cy;

  cy = mpn_sub_nc (pp + 2*n, pp + 2*n, pp, n, cy);
  hi -= mpn_sub_nc (pp + 3*n, scratch + n, pp + n, n, cy);

  hi += mpn_add (pp + n, pp + n, 3*n, scratch, n);

  /* FIXME: Is support for s + t == n needed? */
  if (LIKELY (s + t > n))
    {
      hi -= mpn_sub (pp + 2*n, pp + 2*n, 2*n, pp + 4*n, s+t-n);

      if (hi < 0)
        MPN_DECR_U (pp + 4*n, s+t-n, -hi);
      else
        MPN_INCR_U (pp + 4*n, s+t-n, hi);
    }
  else
    ASSERT (hi == 0);
}