/* mpn_toom52_mul -- Multiply {ap,an} and {bp,bn} where an is nominally 5/2
   times as large as bn.  Or more accurately, 2 bn < an < 5 bn.

   Contributed to the GNU project by Marco Bodrato.

   The idea of applying toom to unbalanced multiplication is due to Marco
   Bodrato and Alberto Zanoni.

   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

Copyright 2009 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at your
option) any later version.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */


#include "gmp.h"
#include "gmp-impl.h"

/* Evaluate in: -2, -1, 0, +1, +2, +inf

  <-s-><--n--><--n--><--n--><--n-->
   ___ ______ ______ ______ ______
  |a4_|___a3_|___a2_|___a1_|___a0_|
                        |b1|___b0_|
                        <t-><--n-->

  v0  =  a0                  * b0      #   A(0)*B(0)
  v1  = (a0+ a1+ a2+ a3+  a4)*(b0+ b1) #   A(1)*B(1)      ah  <= 4   bh <= 1
  vm1 = (a0- a1+ a2- a3+  a4)*(b0- b1) #  A(-1)*B(-1)    |ah| <= 2   bh  = 0
  v2  = (a0+2a1+4a2+8a3+16a4)*(b0+2b1) #   A(2)*B(2)      ah  <= 30  bh <= 2
  vm2 = (a0-2a1+4a2-8a3+16a4)*(b0-2b1) #  A(-2)*B(-2)    |ah| <= 20 |bh|<= 1
  vinf=                   a4 *     b1  # A(inf)*B(inf)

  Some slight optimizations in evaluation are taken from the paper:
  "Towards Optimal Toom-Cook Multiplication for Univariate and
  Multivariate Polynomials in Characteristic 2 and 0."
*/

5486d7f5d3SJohn Marino void
mpn_toom52_mul(mp_ptr pp,mp_srcptr ap,mp_size_t an,mp_srcptr bp,mp_size_t bn,mp_ptr scratch)5586d7f5d3SJohn Marino mpn_toom52_mul (mp_ptr pp,
5686d7f5d3SJohn Marino mp_srcptr ap, mp_size_t an,
5786d7f5d3SJohn Marino mp_srcptr bp, mp_size_t bn, mp_ptr scratch)
5886d7f5d3SJohn Marino {
5986d7f5d3SJohn Marino mp_size_t n, s, t;
6086d7f5d3SJohn Marino enum toom6_flags flags;
6186d7f5d3SJohn Marino
6286d7f5d3SJohn Marino #define a0 ap
6386d7f5d3SJohn Marino #define a1 (ap + n)
6486d7f5d3SJohn Marino #define a2 (ap + 2 * n)
6586d7f5d3SJohn Marino #define a3 (ap + 3 * n)
6686d7f5d3SJohn Marino #define a4 (ap + 4 * n)
6786d7f5d3SJohn Marino #define b0 bp
6886d7f5d3SJohn Marino #define b1 (bp + n)
6986d7f5d3SJohn Marino
7086d7f5d3SJohn Marino n = 1 + (2 * an >= 5 * bn ? (an - 1) / (size_t) 5 : (bn - 1) >> 1);
7186d7f5d3SJohn Marino
7286d7f5d3SJohn Marino s = an - 4 * n;
7386d7f5d3SJohn Marino t = bn - n;
7486d7f5d3SJohn Marino
7586d7f5d3SJohn Marino ASSERT (0 < s && s <= n);
7686d7f5d3SJohn Marino ASSERT (0 < t && t <= n);
7786d7f5d3SJohn Marino
7886d7f5d3SJohn Marino /* Ensures that 5 values of n+1 limbs each fits in the product area.
7986d7f5d3SJohn Marino Borderline cases are an = 32, bn = 8, n = 7, and an = 36, bn = 9,
8086d7f5d3SJohn Marino n = 8. */
8186d7f5d3SJohn Marino ASSERT (s+t >= 5);
8286d7f5d3SJohn Marino
8386d7f5d3SJohn Marino #define v0 pp /* 2n */
8486d7f5d3SJohn Marino #define vm1 (scratch) /* 2n+1 */
8586d7f5d3SJohn Marino #define v1 (pp + 2 * n) /* 2n+1 */
8686d7f5d3SJohn Marino #define vm2 (scratch + 2 * n + 1) /* 2n+1 */
8786d7f5d3SJohn Marino #define v2 (scratch + 4 * n + 2) /* 2n+1 */
8886d7f5d3SJohn Marino #define vinf (pp + 5 * n) /* s+t */
8986d7f5d3SJohn Marino #define bs1 pp /* n+1 */
9086d7f5d3SJohn Marino #define bsm1 (scratch + 2 * n + 2) /* n */
9186d7f5d3SJohn Marino #define asm1 (scratch + 3 * n + 3) /* n+1 */
9286d7f5d3SJohn Marino #define asm2 (scratch + 4 * n + 4) /* n+1 */
9386d7f5d3SJohn Marino #define bsm2 (pp + n + 1) /* n+1 */
9486d7f5d3SJohn Marino #define bs2 (pp + 2 * n + 2) /* n+1 */
9586d7f5d3SJohn Marino #define as2 (pp + 3 * n + 3) /* n+1 */
9686d7f5d3SJohn Marino #define as1 (pp + 4 * n + 4) /* n+1 */
9786d7f5d3SJohn Marino
9886d7f5d3SJohn Marino /* Scratch need is 6 * n + 3 + 1. We need one extra limb, because
9986d7f5d3SJohn Marino products will overwrite 2n+2 limbs. */
10086d7f5d3SJohn Marino
10186d7f5d3SJohn Marino #define a0a2 scratch
10286d7f5d3SJohn Marino #define a1a3 asm1
10386d7f5d3SJohn Marino
10486d7f5d3SJohn Marino /* Compute as2 and asm2. */
10586d7f5d3SJohn Marino flags = toom6_vm2_neg & mpn_toom_eval_pm2 (as2, asm2, 4, ap, n, s, a1a3);
10686d7f5d3SJohn Marino
10786d7f5d3SJohn Marino /* Compute bs1 and bsm1. */
10886d7f5d3SJohn Marino if (t == n)
10986d7f5d3SJohn Marino {
11086d7f5d3SJohn Marino #if HAVE_NATIVE_mpn_add_n_sub_n
11186d7f5d3SJohn Marino mp_limb_t cy;
11286d7f5d3SJohn Marino
11386d7f5d3SJohn Marino if (mpn_cmp (b0, b1, n) < 0)
11486d7f5d3SJohn Marino {
11586d7f5d3SJohn Marino cy = mpn_add_n_sub_n (bs1, bsm1, b1, b0, n);
11686d7f5d3SJohn Marino flags ^= toom6_vm1_neg;
11786d7f5d3SJohn Marino }
11886d7f5d3SJohn Marino else
11986d7f5d3SJohn Marino {
12086d7f5d3SJohn Marino cy = mpn_add_n_sub_n (bs1, bsm1, b0, b1, n);
12186d7f5d3SJohn Marino }
12286d7f5d3SJohn Marino bs1[n] = cy >> 1;
12386d7f5d3SJohn Marino #else
12486d7f5d3SJohn Marino bs1[n] = mpn_add_n (bs1, b0, b1, n);
12586d7f5d3SJohn Marino if (mpn_cmp (b0, b1, n) < 0)
12686d7f5d3SJohn Marino {
12786d7f5d3SJohn Marino mpn_sub_n (bsm1, b1, b0, n);
12886d7f5d3SJohn Marino flags ^= toom6_vm1_neg;
12986d7f5d3SJohn Marino }
13086d7f5d3SJohn Marino else
13186d7f5d3SJohn Marino {
13286d7f5d3SJohn Marino mpn_sub_n (bsm1, b0, b1, n);
13386d7f5d3SJohn Marino }
13486d7f5d3SJohn Marino #endif
13586d7f5d3SJohn Marino }
13686d7f5d3SJohn Marino else
13786d7f5d3SJohn Marino {
13886d7f5d3SJohn Marino bs1[n] = mpn_add (bs1, b0, n, b1, t);
13986d7f5d3SJohn Marino if (mpn_zero_p (b0 + t, n - t) && mpn_cmp (b0, b1, t) < 0)
14086d7f5d3SJohn Marino {
14186d7f5d3SJohn Marino mpn_sub_n (bsm1, b1, b0, t);
14286d7f5d3SJohn Marino MPN_ZERO (bsm1 + t, n - t);
14386d7f5d3SJohn Marino flags ^= toom6_vm1_neg;
14486d7f5d3SJohn Marino }
14586d7f5d3SJohn Marino else
14686d7f5d3SJohn Marino {
14786d7f5d3SJohn Marino mpn_sub (bsm1, b0, n, b1, t);
14886d7f5d3SJohn Marino }
14986d7f5d3SJohn Marino }
15086d7f5d3SJohn Marino
15186d7f5d3SJohn Marino /* Compute bs2 and bsm2, recycling bs1 and bsm1. bs2=bs1+b1; bsm2=bsm1-b1 */
15286d7f5d3SJohn Marino mpn_add (bs2, bs1, n+1, b1, t);
15386d7f5d3SJohn Marino if (flags & toom6_vm1_neg )
15486d7f5d3SJohn Marino {
15586d7f5d3SJohn Marino bsm2[n] = mpn_add (bsm2, bsm1, n, b1, t);
15686d7f5d3SJohn Marino flags ^= toom6_vm2_neg;
15786d7f5d3SJohn Marino }
15886d7f5d3SJohn Marino else
15986d7f5d3SJohn Marino {
16086d7f5d3SJohn Marino bsm2[n] = 0;
16186d7f5d3SJohn Marino if (t == n)
16286d7f5d3SJohn Marino {
16386d7f5d3SJohn Marino if (mpn_cmp (bsm1, b1, n) < 0)
16486d7f5d3SJohn Marino {
16586d7f5d3SJohn Marino mpn_sub_n (bsm2, b1, bsm1, n);
16686d7f5d3SJohn Marino flags ^= toom6_vm2_neg;
16786d7f5d3SJohn Marino }
16886d7f5d3SJohn Marino else
16986d7f5d3SJohn Marino {
17086d7f5d3SJohn Marino mpn_sub_n (bsm2, bsm1, b1, n);
17186d7f5d3SJohn Marino }
17286d7f5d3SJohn Marino }
17386d7f5d3SJohn Marino else
17486d7f5d3SJohn Marino {
17586d7f5d3SJohn Marino if (mpn_zero_p (bsm1 + t, n - t) && mpn_cmp (bsm1, b1, t) < 0)
17686d7f5d3SJohn Marino {
17786d7f5d3SJohn Marino mpn_sub_n (bsm2, b1, bsm1, t);
17886d7f5d3SJohn Marino MPN_ZERO (bsm2 + t, n - t);
17986d7f5d3SJohn Marino flags ^= toom6_vm2_neg;
18086d7f5d3SJohn Marino }
18186d7f5d3SJohn Marino else
18286d7f5d3SJohn Marino {
18386d7f5d3SJohn Marino mpn_sub (bsm2, bsm1, n, b1, t);
18486d7f5d3SJohn Marino }
18586d7f5d3SJohn Marino }
18686d7f5d3SJohn Marino }
18786d7f5d3SJohn Marino
18886d7f5d3SJohn Marino /* Compute as1 and asm1. */
18986d7f5d3SJohn Marino flags ^= toom6_vm1_neg & mpn_toom_eval_pm1 (as1, asm1, 4, ap, n, s, a0a2);
19086d7f5d3SJohn Marino
19186d7f5d3SJohn Marino ASSERT (as1[n] <= 4);
19286d7f5d3SJohn Marino ASSERT (bs1[n] <= 1);
19386d7f5d3SJohn Marino ASSERT (asm1[n] <= 2);
19486d7f5d3SJohn Marino /* ASSERT (bsm1[n] <= 1); */
19586d7f5d3SJohn Marino ASSERT (as2[n] <=30);
19686d7f5d3SJohn Marino ASSERT (bs2[n] <= 2);
19786d7f5d3SJohn Marino ASSERT (asm2[n] <= 20);
19886d7f5d3SJohn Marino ASSERT (bsm2[n] <= 1);
19986d7f5d3SJohn Marino
20086d7f5d3SJohn Marino /* vm1, 2n+1 limbs */
20186d7f5d3SJohn Marino mpn_mul (vm1, asm1, n+1, bsm1, n); /* W4 */
20286d7f5d3SJohn Marino
20386d7f5d3SJohn Marino /* vm2, 2n+1 limbs */
20486d7f5d3SJohn Marino mpn_mul_n (vm2, asm2, bsm2, n+1); /* W2 */
20586d7f5d3SJohn Marino
20686d7f5d3SJohn Marino /* v2, 2n+1 limbs */
20786d7f5d3SJohn Marino mpn_mul_n (v2, as2, bs2, n+1); /* W1 */
20886d7f5d3SJohn Marino
20986d7f5d3SJohn Marino /* v1, 2n+1 limbs */
21086d7f5d3SJohn Marino mpn_mul_n (v1, as1, bs1, n+1); /* W3 */
21186d7f5d3SJohn Marino
21286d7f5d3SJohn Marino /* vinf, s+t limbs */ /* W0 */
21386d7f5d3SJohn Marino if (s > t) mpn_mul (vinf, a4, s, b1, t);
21486d7f5d3SJohn Marino else mpn_mul (vinf, b1, t, a4, s);
21586d7f5d3SJohn Marino
21686d7f5d3SJohn Marino /* v0, 2n limbs */
21786d7f5d3SJohn Marino mpn_mul_n (v0, ap, bp, n); /* W5 */
21886d7f5d3SJohn Marino
21986d7f5d3SJohn Marino mpn_toom_interpolate_6pts (pp, n, flags, vm1, vm2, v2, t + s);
22086d7f5d3SJohn Marino
22186d7f5d3SJohn Marino #undef v0
22286d7f5d3SJohn Marino #undef vm1
22386d7f5d3SJohn Marino #undef v1
22486d7f5d3SJohn Marino #undef vm2
22586d7f5d3SJohn Marino #undef v2
22686d7f5d3SJohn Marino #undef vinf
22786d7f5d3SJohn Marino #undef bs1
22886d7f5d3SJohn Marino #undef bs2
22986d7f5d3SJohn Marino #undef bsm1
23086d7f5d3SJohn Marino #undef bsm2
23186d7f5d3SJohn Marino #undef asm1
23286d7f5d3SJohn Marino #undef asm2
23386d7f5d3SJohn Marino #undef as1
23486d7f5d3SJohn Marino #undef as2
23586d7f5d3SJohn Marino #undef a0a2
23686d7f5d3SJohn Marino #undef b0b2
23786d7f5d3SJohn Marino #undef a1a3
23886d7f5d3SJohn Marino #undef a0
23986d7f5d3SJohn Marino #undef a1
24086d7f5d3SJohn Marino #undef a2
24186d7f5d3SJohn Marino #undef a3
24286d7f5d3SJohn Marino #undef b0
24386d7f5d3SJohn Marino #undef b1
24486d7f5d3SJohn Marino #undef b2
24586d7f5d3SJohn Marino
24686d7f5d3SJohn Marino }
247