xref: /dflybsd-src/contrib/gmp/mpn/generic/toom42_mul.c (revision 86d7f5d305c6adaa56ff4582ece9859d73106103)
/* mpn_toom42_mul -- Multiply {ap,an} and {bp,bn} where an is nominally twice
   as large as bn.  Or more accurately, (3/2)bn < an < 4bn.

   Contributed to the GNU project by Torbjorn Granlund.
   Additional improvements by Marco Bodrato.

   The idea of applying toom to unbalanced multiplication is due to Marco
   Bodrato and Alberto Zanoni.

   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

Copyright 2006, 2007, 2008 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at your
option) any later version.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
3086d7f5d3SJohn Marino 
3186d7f5d3SJohn Marino 
3286d7f5d3SJohn Marino #include "gmp.h"
3386d7f5d3SJohn Marino #include "gmp-impl.h"
3486d7f5d3SJohn Marino 
/* Evaluate in: -1, 0, +1, +2, +inf

  <-s-><--n--><--n--><--n-->
   ___ ______ ______ ______
  |a3_|___a2_|___a1_|___a0_|
               |_b1_|___b0_|
               <-t--><--n-->

  v0  =  a0             * b0      #   A(0)*B(0)
  v1  = (a0+ a1+ a2+ a3)*(b0+ b1) #   A(1)*B(1)      ah  <= 3  bh <= 1
  vm1 = (a0- a1+ a2- a3)*(b0- b1) #  A(-1)*B(-1)    |ah| <= 1  bh  = 0
  v2  = (a0+2a1+4a2+8a3)*(b0+2b1) #   A(2)*B(2)      ah  <= 14 bh <= 2
  vinf=              a3 *     b1  # A(inf)*B(inf)
*/
4986d7f5d3SJohn Marino 
/* Point-wise multiplication used by mpn_toom42_mul: multiply the n-limb
   operands at a and b with mpn_mul_n and store the result at p.  The
   workspace argument ws is accepted but currently unused.  */
#define TOOM42_MUL_N_REC(p, a, b, n, ws)                                \
  do {                                                                  \
    mpn_mul_n (p, a, b, n);                                             \
  } while (0)
5486d7f5d3SJohn Marino 
5586d7f5d3SJohn Marino void
mpn_toom42_mul(mp_ptr pp,mp_srcptr ap,mp_size_t an,mp_srcptr bp,mp_size_t bn,mp_ptr scratch)5686d7f5d3SJohn Marino mpn_toom42_mul (mp_ptr pp,
5786d7f5d3SJohn Marino 		mp_srcptr ap, mp_size_t an,
5886d7f5d3SJohn Marino 		mp_srcptr bp, mp_size_t bn,
5986d7f5d3SJohn Marino 		mp_ptr scratch)
6086d7f5d3SJohn Marino {
6186d7f5d3SJohn Marino   mp_size_t n, s, t;
6286d7f5d3SJohn Marino   int vm1_neg;
6386d7f5d3SJohn Marino   mp_limb_t cy, vinf0;
6486d7f5d3SJohn Marino   mp_ptr a0_a2, a1_a3;
6586d7f5d3SJohn Marino   mp_ptr as1, asm1, as2;
6686d7f5d3SJohn Marino   mp_ptr bs1, bsm1, bs2;
6786d7f5d3SJohn Marino   TMP_DECL;
6886d7f5d3SJohn Marino 
6986d7f5d3SJohn Marino #define a0  ap
7086d7f5d3SJohn Marino #define a1  (ap + n)
7186d7f5d3SJohn Marino #define a2  (ap + 2*n)
7286d7f5d3SJohn Marino #define a3  (ap + 3*n)
7386d7f5d3SJohn Marino #define b0  bp
7486d7f5d3SJohn Marino #define b1  (bp + n)
7586d7f5d3SJohn Marino 
7686d7f5d3SJohn Marino   n = an >= 2 * bn ? (an + 3) >> 2 : (bn + 1) >> 1;
7786d7f5d3SJohn Marino 
7886d7f5d3SJohn Marino   s = an - 3 * n;
7986d7f5d3SJohn Marino   t = bn - n;
8086d7f5d3SJohn Marino 
8186d7f5d3SJohn Marino   ASSERT (0 < s && s <= n);
8286d7f5d3SJohn Marino   ASSERT (0 < t && t <= n);
8386d7f5d3SJohn Marino 
8486d7f5d3SJohn Marino   TMP_MARK;
8586d7f5d3SJohn Marino 
8686d7f5d3SJohn Marino   as1 = TMP_SALLOC_LIMBS (n + 1);
8786d7f5d3SJohn Marino   asm1 = TMP_SALLOC_LIMBS (n + 1);
8886d7f5d3SJohn Marino   as2 = TMP_SALLOC_LIMBS (n + 1);
8986d7f5d3SJohn Marino 
9086d7f5d3SJohn Marino   bs1 = TMP_SALLOC_LIMBS (n + 1);
9186d7f5d3SJohn Marino   bsm1 = TMP_SALLOC_LIMBS (n);
9286d7f5d3SJohn Marino   bs2 = TMP_SALLOC_LIMBS (n + 1);
9386d7f5d3SJohn Marino 
9486d7f5d3SJohn Marino   a0_a2 = pp;
9586d7f5d3SJohn Marino   a1_a3 = pp + n + 1;
9686d7f5d3SJohn Marino 
9786d7f5d3SJohn Marino   /* Compute as1 and asm1.  */
9886d7f5d3SJohn Marino   vm1_neg = mpn_toom_eval_dgr3_pm1 (as1, asm1, ap, n, s, a0_a2) & 1;
9986d7f5d3SJohn Marino 
10086d7f5d3SJohn Marino   /* Compute as2.  */
10186d7f5d3SJohn Marino #if HAVE_NATIVE_mpn_addlsh1_n
10286d7f5d3SJohn Marino   cy  = mpn_addlsh1_n (as2, a2, a3, s);
10386d7f5d3SJohn Marino   if (s != n)
10486d7f5d3SJohn Marino     cy = mpn_add_1 (as2 + s, a2 + s, n - s, cy);
10586d7f5d3SJohn Marino   cy = 2 * cy + mpn_addlsh1_n (as2, a1, as2, n);
10686d7f5d3SJohn Marino   cy = 2 * cy + mpn_addlsh1_n (as2, a0, as2, n);
10786d7f5d3SJohn Marino #else
10886d7f5d3SJohn Marino   cy  = mpn_lshift (as2, a3, s, 1);
10986d7f5d3SJohn Marino   cy += mpn_add_n (as2, a2, as2, s);
11086d7f5d3SJohn Marino   if (s != n)
11186d7f5d3SJohn Marino     cy = mpn_add_1 (as2 + s, a2 + s, n - s, cy);
11286d7f5d3SJohn Marino   cy = 2 * cy + mpn_lshift (as2, as2, n, 1);
11386d7f5d3SJohn Marino   cy += mpn_add_n (as2, a1, as2, n);
11486d7f5d3SJohn Marino   cy = 2 * cy + mpn_lshift (as2, as2, n, 1);
11586d7f5d3SJohn Marino   cy += mpn_add_n (as2, a0, as2, n);
11686d7f5d3SJohn Marino #endif
11786d7f5d3SJohn Marino   as2[n] = cy;
11886d7f5d3SJohn Marino 
11986d7f5d3SJohn Marino   /* Compute bs1 and bsm1.  */
12086d7f5d3SJohn Marino   if (t == n)
12186d7f5d3SJohn Marino     {
12286d7f5d3SJohn Marino #if HAVE_NATIVE_mpn_add_n_sub_n
12386d7f5d3SJohn Marino       if (mpn_cmp (b0, b1, n) < 0)
12486d7f5d3SJohn Marino 	{
12586d7f5d3SJohn Marino 	  cy = mpn_add_n_sub_n (bs1, bsm1, b1, b0, n);
12686d7f5d3SJohn Marino 	  vm1_neg ^= 1;
12786d7f5d3SJohn Marino 	}
12886d7f5d3SJohn Marino       else
12986d7f5d3SJohn Marino 	{
13086d7f5d3SJohn Marino 	  cy = mpn_add_n_sub_n (bs1, bsm1, b0, b1, n);
13186d7f5d3SJohn Marino 	}
13286d7f5d3SJohn Marino       bs1[n] = cy >> 1;
13386d7f5d3SJohn Marino #else
13486d7f5d3SJohn Marino       bs1[n] = mpn_add_n (bs1, b0, b1, n);
13586d7f5d3SJohn Marino 
13686d7f5d3SJohn Marino       if (mpn_cmp (b0, b1, n) < 0)
13786d7f5d3SJohn Marino 	{
13886d7f5d3SJohn Marino 	  mpn_sub_n (bsm1, b1, b0, n);
13986d7f5d3SJohn Marino 	  vm1_neg ^= 1;
14086d7f5d3SJohn Marino 	}
14186d7f5d3SJohn Marino       else
14286d7f5d3SJohn Marino 	{
14386d7f5d3SJohn Marino 	  mpn_sub_n (bsm1, b0, b1, n);
14486d7f5d3SJohn Marino 	}
14586d7f5d3SJohn Marino #endif
14686d7f5d3SJohn Marino     }
14786d7f5d3SJohn Marino   else
14886d7f5d3SJohn Marino     {
14986d7f5d3SJohn Marino       bs1[n] = mpn_add (bs1, b0, n, b1, t);
15086d7f5d3SJohn Marino 
15186d7f5d3SJohn Marino       if (mpn_zero_p (b0 + t, n - t) && mpn_cmp (b0, b1, t) < 0)
15286d7f5d3SJohn Marino 	{
15386d7f5d3SJohn Marino 	  mpn_sub_n (bsm1, b1, b0, t);
15486d7f5d3SJohn Marino 	  MPN_ZERO (bsm1 + t, n - t);
15586d7f5d3SJohn Marino 	  vm1_neg ^= 1;
15686d7f5d3SJohn Marino 	}
15786d7f5d3SJohn Marino       else
15886d7f5d3SJohn Marino 	{
15986d7f5d3SJohn Marino 	  mpn_sub (bsm1, b0, n, b1, t);
16086d7f5d3SJohn Marino 	}
16186d7f5d3SJohn Marino     }
16286d7f5d3SJohn Marino 
16386d7f5d3SJohn Marino   /* Compute bs2, recycling bs1. bs2=bs1+b1  */
16486d7f5d3SJohn Marino   mpn_add (bs2, bs1, n + 1, b1, t);
16586d7f5d3SJohn Marino 
16686d7f5d3SJohn Marino   ASSERT (as1[n] <= 3);
16786d7f5d3SJohn Marino   ASSERT (bs1[n] <= 1);
16886d7f5d3SJohn Marino   ASSERT (asm1[n] <= 1);
16986d7f5d3SJohn Marino /*ASSERT (bsm1[n] == 0);*/
17086d7f5d3SJohn Marino   ASSERT (as2[n] <= 14);
17186d7f5d3SJohn Marino   ASSERT (bs2[n] <= 2);
17286d7f5d3SJohn Marino 
17386d7f5d3SJohn Marino #define v0    pp				/* 2n */
17486d7f5d3SJohn Marino #define v1    (pp + 2 * n)			/* 2n+1 */
17586d7f5d3SJohn Marino #define vinf  (pp + 4 * n)			/* s+t */
17686d7f5d3SJohn Marino #define vm1   scratch				/* 2n+1 */
17786d7f5d3SJohn Marino #define v2    (scratch + 2 * n + 1)		/* 2n+2 */
17886d7f5d3SJohn Marino #define scratch_out	scratch + 4 * n + 4	/* Currently unused. */
17986d7f5d3SJohn Marino 
18086d7f5d3SJohn Marino   /* vm1, 2n+1 limbs */
18186d7f5d3SJohn Marino   TOOM42_MUL_N_REC (vm1, asm1, bsm1, n, scratch_out);
18286d7f5d3SJohn Marino   cy = 0;
18386d7f5d3SJohn Marino   if (asm1[n] != 0)
18486d7f5d3SJohn Marino     cy = mpn_add_n (vm1 + n, vm1 + n, bsm1, n);
18586d7f5d3SJohn Marino   vm1[2 * n] = cy;
18686d7f5d3SJohn Marino 
18786d7f5d3SJohn Marino   TOOM42_MUL_N_REC (v2, as2, bs2, n + 1, scratch_out);	/* v2, 2n+1 limbs */
18886d7f5d3SJohn Marino 
18986d7f5d3SJohn Marino   /* vinf, s+t limbs */
19086d7f5d3SJohn Marino   if (s > t)  mpn_mul (vinf, a3, s, b1, t);
19186d7f5d3SJohn Marino   else        mpn_mul (vinf, b1, t, a3, s);
19286d7f5d3SJohn Marino 
19386d7f5d3SJohn Marino   vinf0 = vinf[0];				/* v1 overlaps with this */
19486d7f5d3SJohn Marino 
19586d7f5d3SJohn Marino   /* v1, 2n+1 limbs */
19686d7f5d3SJohn Marino   TOOM42_MUL_N_REC (v1, as1, bs1, n, scratch_out);
19786d7f5d3SJohn Marino   if (as1[n] == 1)
19886d7f5d3SJohn Marino     {
19986d7f5d3SJohn Marino       cy = bs1[n] + mpn_add_n (v1 + n, v1 + n, bs1, n);
20086d7f5d3SJohn Marino     }
20186d7f5d3SJohn Marino   else if (as1[n] == 2)
20286d7f5d3SJohn Marino     {
20386d7f5d3SJohn Marino #if HAVE_NATIVE_mpn_addlsh1_n
20486d7f5d3SJohn Marino       cy = 2 * bs1[n] + mpn_addlsh1_n (v1 + n, v1 + n, bs1, n);
20586d7f5d3SJohn Marino #else
20686d7f5d3SJohn Marino       cy = 2 * bs1[n] + mpn_addmul_1 (v1 + n, bs1, n, CNST_LIMB(2));
20786d7f5d3SJohn Marino #endif
20886d7f5d3SJohn Marino     }
20986d7f5d3SJohn Marino   else if (as1[n] == 3)
21086d7f5d3SJohn Marino     {
21186d7f5d3SJohn Marino       cy = 3 * bs1[n] + mpn_addmul_1 (v1 + n, bs1, n, CNST_LIMB(3));
21286d7f5d3SJohn Marino     }
21386d7f5d3SJohn Marino   else
21486d7f5d3SJohn Marino     cy = 0;
21586d7f5d3SJohn Marino   if (bs1[n] != 0)
21686d7f5d3SJohn Marino     cy += mpn_add_n (v1 + n, v1 + n, as1, n);
21786d7f5d3SJohn Marino   v1[2 * n] = cy;
21886d7f5d3SJohn Marino 
21986d7f5d3SJohn Marino   TOOM42_MUL_N_REC (v0, ap, bp, n, scratch_out);	/* v0, 2n limbs */
22086d7f5d3SJohn Marino 
22186d7f5d3SJohn Marino   mpn_toom_interpolate_5pts (pp, v2, vm1, n, s + t, vm1_neg, vinf0);
22286d7f5d3SJohn Marino 
22386d7f5d3SJohn Marino   TMP_FREE;
22486d7f5d3SJohn Marino }
225