xref: /dflybsd-src/contrib/gmp/mpn/generic/toom62_mul.c (revision 86d7f5d305c6adaa56ff4582ece9859d73106103)
/* mpn_toom62_mul -- Multiply {ap,an} and {bp,bn} where an is nominally 3 times
   as large as bn.  Or more accurately, (5/2)bn < an < 6bn.

   Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.

   The idea of applying toom to unbalanced multiplication is due to Marco
   Bodrato and Alberto Zanoni.

   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

Copyright 2006, 2007, 2008 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at your
option) any later version.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
2986d7f5d3SJohn Marino 
3086d7f5d3SJohn Marino 
3186d7f5d3SJohn Marino #include "gmp.h"
3286d7f5d3SJohn Marino #include "gmp-impl.h"
3386d7f5d3SJohn Marino 
/* Evaluate in:
   0, +1, -1, +2, -2, 1/2, +inf

  <-s-><--n--><--n--><--n--><--n--><--n-->
   ___ ______ ______ ______ ______ ______
  |a5_|___a4_|___a3_|___a2_|___a1_|___a0_|
			     |_b1_|___b0_|
			     <-t--><--n-->

  v0  =    a0                       *   b0      #    A(0)*B(0)
  v1  = (  a0+  a1+ a2+ a3+  a4+  a5)*( b0+ b1) #    A(1)*B(1)      ah  <= 5   bh <= 1
  vm1 = (  a0-  a1+ a2- a3+  a4-  a5)*( b0- b1) #   A(-1)*B(-1)    |ah| <= 2   bh  = 0
  v2  = (  a0+ 2a1+4a2+8a3+16a4+32a5)*( b0+2b1) #    A(2)*B(2)      ah  <= 62  bh <= 2
  vm2 = (  a0- 2a1+4a2-8a3+16a4-32a5)*( b0-2b1) #   A(-2)*B(-2)    -41<=ah<=20 -1<=bh<=0
  vh  = (32a0+16a1+8a2+4a3+ 2a4+  a5)*(2b0+ b1) #  A(1/2)*B(1/2)    ah  <= 62  bh <= 2
  vinf=                           a5 *      b1  #  A(inf)*B(inf)
*/
5186d7f5d3SJohn Marino 
5286d7f5d3SJohn Marino void
mpn_toom62_mul(mp_ptr pp,mp_srcptr ap,mp_size_t an,mp_srcptr bp,mp_size_t bn,mp_ptr scratch)5386d7f5d3SJohn Marino mpn_toom62_mul (mp_ptr pp,
5486d7f5d3SJohn Marino 		mp_srcptr ap, mp_size_t an,
5586d7f5d3SJohn Marino 		mp_srcptr bp, mp_size_t bn,
5686d7f5d3SJohn Marino 		mp_ptr scratch)
5786d7f5d3SJohn Marino {
5886d7f5d3SJohn Marino   mp_size_t n, s, t;
5986d7f5d3SJohn Marino   mp_limb_t cy;
6086d7f5d3SJohn Marino   mp_ptr as1, asm1, as2, asm2, ash;
6186d7f5d3SJohn Marino   mp_ptr bs1, bsm1, bs2, bsm2, bsh;
6286d7f5d3SJohn Marino   mp_ptr gp;
6386d7f5d3SJohn Marino   enum toom7_flags aflags, bflags;
6486d7f5d3SJohn Marino   TMP_DECL;
6586d7f5d3SJohn Marino 
6686d7f5d3SJohn Marino #define a0  ap
6786d7f5d3SJohn Marino #define a1  (ap + n)
6886d7f5d3SJohn Marino #define a2  (ap + 2*n)
6986d7f5d3SJohn Marino #define a3  (ap + 3*n)
7086d7f5d3SJohn Marino #define a4  (ap + 4*n)
7186d7f5d3SJohn Marino #define a5  (ap + 5*n)
7286d7f5d3SJohn Marino #define b0  bp
7386d7f5d3SJohn Marino #define b1  (bp + n)
7486d7f5d3SJohn Marino 
7586d7f5d3SJohn Marino   n = 1 + (an >= 3 * bn ? (an - 1) / (size_t) 6 : (bn - 1) >> 1);
7686d7f5d3SJohn Marino 
7786d7f5d3SJohn Marino   s = an - 5 * n;
7886d7f5d3SJohn Marino   t = bn - n;
7986d7f5d3SJohn Marino 
8086d7f5d3SJohn Marino   ASSERT (0 < s && s <= n);
8186d7f5d3SJohn Marino   ASSERT (0 < t && t <= n);
8286d7f5d3SJohn Marino 
8386d7f5d3SJohn Marino   TMP_MARK;
8486d7f5d3SJohn Marino 
8586d7f5d3SJohn Marino   as1 = TMP_SALLOC_LIMBS (n + 1);
8686d7f5d3SJohn Marino   asm1 = TMP_SALLOC_LIMBS (n + 1);
8786d7f5d3SJohn Marino   as2 = TMP_SALLOC_LIMBS (n + 1);
8886d7f5d3SJohn Marino   asm2 = TMP_SALLOC_LIMBS (n + 1);
8986d7f5d3SJohn Marino   ash = TMP_SALLOC_LIMBS (n + 1);
9086d7f5d3SJohn Marino 
9186d7f5d3SJohn Marino   bs1 = TMP_SALLOC_LIMBS (n + 1);
9286d7f5d3SJohn Marino   bsm1 = TMP_SALLOC_LIMBS (n);
9386d7f5d3SJohn Marino   bs2 = TMP_SALLOC_LIMBS (n + 1);
9486d7f5d3SJohn Marino   bsm2 = TMP_SALLOC_LIMBS (n + 1);
9586d7f5d3SJohn Marino   bsh = TMP_SALLOC_LIMBS (n + 1);
9686d7f5d3SJohn Marino 
9786d7f5d3SJohn Marino   gp = pp;
9886d7f5d3SJohn Marino 
9986d7f5d3SJohn Marino   /* Compute as1 and asm1.  */
10086d7f5d3SJohn Marino   aflags = toom7_w3_neg & mpn_toom_eval_pm1 (as1, asm1, 5, ap, n, s, gp);
10186d7f5d3SJohn Marino 
10286d7f5d3SJohn Marino   /* Compute as2 and asm2. */
10386d7f5d3SJohn Marino   aflags |= toom7_w1_neg & mpn_toom_eval_pm2 (as2, asm2, 5, ap, n, s, gp);
10486d7f5d3SJohn Marino 
10586d7f5d3SJohn Marino   /* Compute ash = 32 a0 + 16 a1 + 8 a2 + 4 a3 + 2 a4 + a5
10686d7f5d3SJohn Marino      = 2*(2*(2*(2*(2*a0 + a1) + a2) + a3) + a4) + a5  */
10786d7f5d3SJohn Marino 
10886d7f5d3SJohn Marino #if HAVE_NATIVE_mpn_addlsh1_n
10986d7f5d3SJohn Marino   cy = mpn_addlsh1_n (ash, a1, a0, n);
11086d7f5d3SJohn Marino   cy = 2*cy + mpn_addlsh1_n (ash, a2, ash, n);
11186d7f5d3SJohn Marino   cy = 2*cy + mpn_addlsh1_n (ash, a3, ash, n);
11286d7f5d3SJohn Marino   cy = 2*cy + mpn_addlsh1_n (ash, a4, ash, n);
11386d7f5d3SJohn Marino   if (s < n)
11486d7f5d3SJohn Marino     {
11586d7f5d3SJohn Marino       mp_limb_t cy2;
11686d7f5d3SJohn Marino       cy2 = mpn_addlsh1_n (ash, a5, ash, s);
11786d7f5d3SJohn Marino       ash[n] = 2*cy + mpn_lshift (ash + s, ash + s, n - s, 1);
11886d7f5d3SJohn Marino       MPN_INCR_U (ash + s, n+1-s, cy2);
11986d7f5d3SJohn Marino     }
12086d7f5d3SJohn Marino   else
12186d7f5d3SJohn Marino     ash[n] = 2*cy + mpn_addlsh1_n (ash, a5, ash, n);
12286d7f5d3SJohn Marino #else
12386d7f5d3SJohn Marino   cy = mpn_lshift (ash, a0, n, 1);
12486d7f5d3SJohn Marino   cy += mpn_add_n (ash, ash, a1, n);
12586d7f5d3SJohn Marino   cy = 2*cy + mpn_lshift (ash, ash, n, 1);
12686d7f5d3SJohn Marino   cy += mpn_add_n (ash, ash, a2, n);
12786d7f5d3SJohn Marino   cy = 2*cy + mpn_lshift (ash, ash, n, 1);
12886d7f5d3SJohn Marino   cy += mpn_add_n (ash, ash, a3, n);
12986d7f5d3SJohn Marino   cy = 2*cy + mpn_lshift (ash, ash, n, 1);
13086d7f5d3SJohn Marino   cy += mpn_add_n (ash, ash, a4, n);
13186d7f5d3SJohn Marino   cy = 2*cy + mpn_lshift (ash, ash, n, 1);
13286d7f5d3SJohn Marino   ash[n] = cy + mpn_add (ash, ash, n, a5, s);
13386d7f5d3SJohn Marino #endif
13486d7f5d3SJohn Marino 
13586d7f5d3SJohn Marino   /* Compute bs1 and bsm1.  */
13686d7f5d3SJohn Marino   if (t == n)
13786d7f5d3SJohn Marino     {
13886d7f5d3SJohn Marino #if HAVE_NATIVE_mpn_add_n_sub_n
13986d7f5d3SJohn Marino       if (mpn_cmp (b0, b1, n) < 0)
14086d7f5d3SJohn Marino 	{
14186d7f5d3SJohn Marino 	  cy = mpn_add_n_sub_n (bs1, bsm1, b1, b0, n);
14286d7f5d3SJohn Marino 	  bflags = toom7_w3_neg;
14386d7f5d3SJohn Marino 	}
14486d7f5d3SJohn Marino       else
14586d7f5d3SJohn Marino 	{
14686d7f5d3SJohn Marino 	  cy = mpn_add_n_sub_n (bs1, bsm1, b0, b1, n);
14786d7f5d3SJohn Marino 	  bflags = 0;
14886d7f5d3SJohn Marino 	}
14986d7f5d3SJohn Marino       bs1[n] = cy >> 1;
15086d7f5d3SJohn Marino #else
15186d7f5d3SJohn Marino       bs1[n] = mpn_add_n (bs1, b0, b1, n);
15286d7f5d3SJohn Marino       if (mpn_cmp (b0, b1, n) < 0)
15386d7f5d3SJohn Marino 	{
15486d7f5d3SJohn Marino 	  mpn_sub_n (bsm1, b1, b0, n);
15586d7f5d3SJohn Marino 	  bflags = toom7_w3_neg;
15686d7f5d3SJohn Marino 	}
15786d7f5d3SJohn Marino       else
15886d7f5d3SJohn Marino 	{
15986d7f5d3SJohn Marino 	  mpn_sub_n (bsm1, b0, b1, n);
16086d7f5d3SJohn Marino 	  bflags = 0;
16186d7f5d3SJohn Marino 	}
16286d7f5d3SJohn Marino #endif
16386d7f5d3SJohn Marino     }
16486d7f5d3SJohn Marino   else
16586d7f5d3SJohn Marino     {
16686d7f5d3SJohn Marino       bs1[n] = mpn_add (bs1, b0, n, b1, t);
16786d7f5d3SJohn Marino       if (mpn_zero_p (b0 + t, n - t) && mpn_cmp (b0, b1, t) < 0)
16886d7f5d3SJohn Marino 	{
16986d7f5d3SJohn Marino 	  mpn_sub_n (bsm1, b1, b0, t);
17086d7f5d3SJohn Marino 	  MPN_ZERO (bsm1 + t, n - t);
17186d7f5d3SJohn Marino 	  bflags = toom7_w3_neg;
17286d7f5d3SJohn Marino 	}
17386d7f5d3SJohn Marino       else
17486d7f5d3SJohn Marino 	{
17586d7f5d3SJohn Marino 	  mpn_sub (bsm1, b0, n, b1, t);
17686d7f5d3SJohn Marino 	  bflags = 0;
17786d7f5d3SJohn Marino 	}
17886d7f5d3SJohn Marino     }
17986d7f5d3SJohn Marino 
18086d7f5d3SJohn Marino   /* Compute bs2 and bsm2. Recycling bs1 and bsm1; bs2=bs1+b1, bsm2 =
18186d7f5d3SJohn Marino      bsm1 - b1 */
18286d7f5d3SJohn Marino   mpn_add (bs2, bs1, n + 1, b1, t);
18386d7f5d3SJohn Marino   if (bflags & toom7_w3_neg)
18486d7f5d3SJohn Marino     {
18586d7f5d3SJohn Marino       bsm2[n] = mpn_add (bsm2, bsm1, n, b1, t);
18686d7f5d3SJohn Marino       bflags |= toom7_w1_neg;
18786d7f5d3SJohn Marino     }
18886d7f5d3SJohn Marino   else
18986d7f5d3SJohn Marino     {
19086d7f5d3SJohn Marino       /* FIXME: Simplify this logic? */
19186d7f5d3SJohn Marino       if (t < n)
19286d7f5d3SJohn Marino 	{
19386d7f5d3SJohn Marino 	  if (mpn_zero_p (bsm1 + t, n - t) && mpn_cmp (bsm1, b1, t) < 0)
19486d7f5d3SJohn Marino 	    {
19586d7f5d3SJohn Marino 	      ASSERT_NOCARRY (mpn_sub_n (bsm2, b1, bsm1, t));
19686d7f5d3SJohn Marino 	      MPN_ZERO (bsm2 + t, n + 1 - t);
19786d7f5d3SJohn Marino 	      bflags |= toom7_w1_neg;
19886d7f5d3SJohn Marino 	    }
19986d7f5d3SJohn Marino 	  else
20086d7f5d3SJohn Marino 	    {
20186d7f5d3SJohn Marino 	      ASSERT_NOCARRY (mpn_sub (bsm2, bsm1, n, b1, t));
20286d7f5d3SJohn Marino 	      bsm2[n] = 0;
20386d7f5d3SJohn Marino 	    }
20486d7f5d3SJohn Marino 	}
20586d7f5d3SJohn Marino       else
20686d7f5d3SJohn Marino 	{
20786d7f5d3SJohn Marino 	  if (mpn_cmp (bsm1, b1, n) < 0)
20886d7f5d3SJohn Marino 	    {
20986d7f5d3SJohn Marino 	      ASSERT_NOCARRY (mpn_sub_n (bsm2, b1, bsm1, n));
21086d7f5d3SJohn Marino 	      bflags |= toom7_w1_neg;
21186d7f5d3SJohn Marino 	    }
21286d7f5d3SJohn Marino 	  else
21386d7f5d3SJohn Marino 	    {
21486d7f5d3SJohn Marino 	      ASSERT_NOCARRY (mpn_sub (bsm2, bsm1, n, b1, n));
21586d7f5d3SJohn Marino 	    }
21686d7f5d3SJohn Marino 	  bsm2[n] = 0;
21786d7f5d3SJohn Marino 	}
21886d7f5d3SJohn Marino     }
21986d7f5d3SJohn Marino 
22086d7f5d3SJohn Marino   /* Compute bsh, recycling bs1 and bsm1. bsh=bs1+b0;  */
22186d7f5d3SJohn Marino   mpn_add (bsh, bs1, n + 1, b0, n);
22286d7f5d3SJohn Marino 
22386d7f5d3SJohn Marino   ASSERT (as1[n] <= 5);
22486d7f5d3SJohn Marino   ASSERT (bs1[n] <= 1);
22586d7f5d3SJohn Marino   ASSERT (asm1[n] <= 2);
22686d7f5d3SJohn Marino   ASSERT (as2[n] <= 62);
22786d7f5d3SJohn Marino   ASSERT (bs2[n] <= 2);
22886d7f5d3SJohn Marino   ASSERT (asm2[n] <= 41);
22986d7f5d3SJohn Marino   ASSERT (bsm2[n] <= 1);
23086d7f5d3SJohn Marino   ASSERT (ash[n] <= 62);
23186d7f5d3SJohn Marino   ASSERT (bsh[n] <= 2);
23286d7f5d3SJohn Marino 
23386d7f5d3SJohn Marino #define v0    pp				/* 2n */
23486d7f5d3SJohn Marino #define v1    (pp + 2 * n)			/* 2n+1 */
23586d7f5d3SJohn Marino #define vinf  (pp + 6 * n)			/* s+t */
23686d7f5d3SJohn Marino #define v2    scratch				/* 2n+1 */
23786d7f5d3SJohn Marino #define vm2   (scratch + 2 * n + 1)		/* 2n+1 */
23886d7f5d3SJohn Marino #define vh    (scratch + 4 * n + 2)		/* 2n+1 */
23986d7f5d3SJohn Marino #define vm1   (scratch + 6 * n + 3)		/* 2n+1 */
24086d7f5d3SJohn Marino #define scratch_out (scratch + 8 * n + 4)		/* 2n+1 */
24186d7f5d3SJohn Marino   /* Total scratch need: 10*n+5 */
24286d7f5d3SJohn Marino 
24386d7f5d3SJohn Marino   /* Must be in allocation order, as they overwrite one limb beyond
24486d7f5d3SJohn Marino    * 2n+1. */
24586d7f5d3SJohn Marino   mpn_mul_n (v2, as2, bs2, n + 1);		/* v2, 2n+1 limbs */
24686d7f5d3SJohn Marino   mpn_mul_n (vm2, asm2, bsm2, n + 1);		/* vm2, 2n+1 limbs */
24786d7f5d3SJohn Marino   mpn_mul_n (vh, ash, bsh, n + 1);		/* vh, 2n+1 limbs */
24886d7f5d3SJohn Marino 
24986d7f5d3SJohn Marino   /* vm1, 2n+1 limbs */
25086d7f5d3SJohn Marino   mpn_mul_n (vm1, asm1, bsm1, n);
25186d7f5d3SJohn Marino   cy = 0;
25286d7f5d3SJohn Marino   if (asm1[n] == 1)
25386d7f5d3SJohn Marino     {
25486d7f5d3SJohn Marino       cy = mpn_add_n (vm1 + n, vm1 + n, bsm1, n);
25586d7f5d3SJohn Marino     }
25686d7f5d3SJohn Marino   else if (asm1[n] == 2)
25786d7f5d3SJohn Marino     {
25886d7f5d3SJohn Marino #if HAVE_NATIVE_mpn_addlsh1_n
25986d7f5d3SJohn Marino       cy = mpn_addlsh1_n (vm1 + n, vm1 + n, bsm1, n);
26086d7f5d3SJohn Marino #else
26186d7f5d3SJohn Marino       cy = mpn_addmul_1 (vm1 + n, bsm1, n, CNST_LIMB(2));
26286d7f5d3SJohn Marino #endif
26386d7f5d3SJohn Marino     }
26486d7f5d3SJohn Marino   vm1[2 * n] = cy;
26586d7f5d3SJohn Marino 
26686d7f5d3SJohn Marino   /* v1, 2n+1 limbs */
26786d7f5d3SJohn Marino   mpn_mul_n (v1, as1, bs1, n);
26886d7f5d3SJohn Marino   if (as1[n] == 1)
26986d7f5d3SJohn Marino     {
27086d7f5d3SJohn Marino       cy = bs1[n] + mpn_add_n (v1 + n, v1 + n, bs1, n);
27186d7f5d3SJohn Marino     }
27286d7f5d3SJohn Marino   else if (as1[n] == 2)
27386d7f5d3SJohn Marino     {
27486d7f5d3SJohn Marino #if HAVE_NATIVE_mpn_addlsh1_n
27586d7f5d3SJohn Marino       cy = 2 * bs1[n] + mpn_addlsh1_n (v1 + n, v1 + n, bs1, n);
27686d7f5d3SJohn Marino #else
27786d7f5d3SJohn Marino       cy = 2 * bs1[n] + mpn_addmul_1 (v1 + n, bs1, n, CNST_LIMB(2));
27886d7f5d3SJohn Marino #endif
27986d7f5d3SJohn Marino     }
28086d7f5d3SJohn Marino   else if (as1[n] != 0)
28186d7f5d3SJohn Marino     {
28286d7f5d3SJohn Marino       cy = as1[n] * bs1[n] + mpn_addmul_1 (v1 + n, bs1, n, as1[n]);
28386d7f5d3SJohn Marino     }
28486d7f5d3SJohn Marino   else
28586d7f5d3SJohn Marino     cy = 0;
28686d7f5d3SJohn Marino   if (bs1[n] != 0)
28786d7f5d3SJohn Marino     cy += mpn_add_n (v1 + n, v1 + n, as1, n);
28886d7f5d3SJohn Marino   v1[2 * n] = cy;
28986d7f5d3SJohn Marino 
29086d7f5d3SJohn Marino   mpn_mul_n (v0, a0, b0, n);			/* v0, 2n limbs */
29186d7f5d3SJohn Marino 
29286d7f5d3SJohn Marino   /* vinf, s+t limbs */
29386d7f5d3SJohn Marino   if (s > t)  mpn_mul (vinf, a5, s, b1, t);
29486d7f5d3SJohn Marino   else        mpn_mul (vinf, b1, t, a5, s);
29586d7f5d3SJohn Marino 
29686d7f5d3SJohn Marino   mpn_toom_interpolate_7pts (pp, n, aflags ^ bflags,
29786d7f5d3SJohn Marino 			     vm2, vm1, v2, vh, s + t, scratch_out);
29886d7f5d3SJohn Marino 
29986d7f5d3SJohn Marino   TMP_FREE;
30086d7f5d3SJohn Marino }
301