xref: /dflybsd-src/contrib/gmp/mpn/generic/toom44_mul.c (revision 86d7f5d305c6adaa56ff4582ece9859d73106103)
186d7f5d3SJohn Marino /* mpn_toom44_mul -- Multiply {ap,an} and {bp,bn} where an and bn are close in
286d7f5d3SJohn Marino    size.  Or more accurately, bn <= an < (4/3)bn.
386d7f5d3SJohn Marino 
486d7f5d3SJohn Marino    Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
586d7f5d3SJohn Marino 
686d7f5d3SJohn Marino    THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
786d7f5d3SJohn Marino    SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
886d7f5d3SJohn Marino    GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
986d7f5d3SJohn Marino 
1086d7f5d3SJohn Marino Copyright 2006, 2007, 2008 Free Software Foundation, Inc.
1186d7f5d3SJohn Marino 
1286d7f5d3SJohn Marino This file is part of the GNU MP Library.
1386d7f5d3SJohn Marino 
1486d7f5d3SJohn Marino The GNU MP Library is free software; you can redistribute it and/or modify
1586d7f5d3SJohn Marino it under the terms of the GNU Lesser General Public License as published by
1686d7f5d3SJohn Marino the Free Software Foundation; either version 3 of the License, or (at your
1786d7f5d3SJohn Marino option) any later version.
1886d7f5d3SJohn Marino 
1986d7f5d3SJohn Marino The GNU MP Library is distributed in the hope that it will be useful, but
2086d7f5d3SJohn Marino WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
2186d7f5d3SJohn Marino or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
2286d7f5d3SJohn Marino License for more details.
2386d7f5d3SJohn Marino 
2486d7f5d3SJohn Marino You should have received a copy of the GNU Lesser General Public License
2586d7f5d3SJohn Marino along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
2686d7f5d3SJohn Marino 
2786d7f5d3SJohn Marino 
2886d7f5d3SJohn Marino #include "gmp.h"
2986d7f5d3SJohn Marino #include "gmp-impl.h"
3086d7f5d3SJohn Marino 
3186d7f5d3SJohn Marino /* Evaluate in: 0, +1, -1, +2, -2, 1/2, +inf
3286d7f5d3SJohn Marino 
3386d7f5d3SJohn Marino   <-s--><--n--><--n--><--n-->
3486d7f5d3SJohn Marino    ____ ______ ______ ______
3586d7f5d3SJohn Marino   |_a3_|___a2_|___a1_|___a0_|
3686d7f5d3SJohn Marino    |b3_|___b2_|___b1_|___b0_|
3786d7f5d3SJohn Marino    <-t-><--n--><--n--><--n-->
3886d7f5d3SJohn Marino 
3986d7f5d3SJohn Marino   v0  =   a0             *  b0              #    A(0)*B(0)
4086d7f5d3SJohn Marino   v1  = ( a0+ a1+ a2+ a3)*( b0+ b1+ b2+ b3) #    A(1)*B(1)      ah  <= 3   bh  <= 3
4186d7f5d3SJohn Marino   vm1 = ( a0- a1+ a2- a3)*( b0- b1+ b2- b3) #   A(-1)*B(-1)    |ah| <= 1  |bh| <= 1
4286d7f5d3SJohn Marino   v2  = ( a0+2a1+4a2+8a3)*( b0+2b1+4b2+8b3) #    A(2)*B(2)      ah  <= 14  bh  <= 14
4386d7f5d3SJohn Marino   vm2 = ( a0-2a1+4a2-8a3)*( b0-2b1+4b2-8b3) #    A(2)*B(2)      ah  <= 9  |bh| <= 9
4486d7f5d3SJohn Marino   vh  = (8a0+4a1+2a2+ a3)*(8b0+4b1+2b2+ b3) #  A(1/2)*B(1/2)    ah  <= 14  bh  <= 14
4586d7f5d3SJohn Marino   vinf=               a3 *          b2      #  A(inf)*B(inf)
4686d7f5d3SJohn Marino */
4786d7f5d3SJohn Marino 
4886d7f5d3SJohn Marino #if TUNE_PROGRAM_BUILD
4986d7f5d3SJohn Marino #define MAYBE_mul_basecase 1
5086d7f5d3SJohn Marino #define MAYBE_mul_toom22   1
5186d7f5d3SJohn Marino #define MAYBE_mul_toom44   1
5286d7f5d3SJohn Marino #else
5386d7f5d3SJohn Marino #define MAYBE_mul_basecase						\
5486d7f5d3SJohn Marino   (MUL_TOOM44_THRESHOLD < 4 * MUL_TOOM22_THRESHOLD)
5586d7f5d3SJohn Marino #define MAYBE_mul_toom22						\
5686d7f5d3SJohn Marino   (MUL_TOOM44_THRESHOLD < 4 * MUL_TOOM33_THRESHOLD)
5786d7f5d3SJohn Marino #define MAYBE_mul_toom44						\
5886d7f5d3SJohn Marino   (MUL_FFT_THRESHOLD >= 4 * MUL_TOOM44_THRESHOLD)
5986d7f5d3SJohn Marino #endif
6086d7f5d3SJohn Marino 
6186d7f5d3SJohn Marino #define TOOM44_MUL_N_REC(p, a, b, n, ws)				\
6286d7f5d3SJohn Marino   do {									\
6386d7f5d3SJohn Marino     if (MAYBE_mul_basecase						\
6486d7f5d3SJohn Marino 	&& BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD))			\
6586d7f5d3SJohn Marino       mpn_mul_basecase (p, a, n, b, n);					\
6686d7f5d3SJohn Marino     else if (MAYBE_mul_toom22						\
6786d7f5d3SJohn Marino 	     && BELOW_THRESHOLD (n, MUL_TOOM33_THRESHOLD))		\
6886d7f5d3SJohn Marino       mpn_toom22_mul (p, a, n, b, n, ws);				\
6986d7f5d3SJohn Marino     else if (! MAYBE_mul_toom44						\
7086d7f5d3SJohn Marino 	     || BELOW_THRESHOLD (n, MUL_TOOM44_THRESHOLD))		\
7186d7f5d3SJohn Marino       mpn_toom33_mul (p, a, n, b, n, ws);				\
7286d7f5d3SJohn Marino     else								\
7386d7f5d3SJohn Marino       mpn_toom44_mul (p, a, n, b, n, ws);				\
7486d7f5d3SJohn Marino   } while (0)
7586d7f5d3SJohn Marino 
7686d7f5d3SJohn Marino /* Use of scratch space. In the product area, we store
7786d7f5d3SJohn Marino 
7886d7f5d3SJohn Marino       ___________________
7986d7f5d3SJohn Marino      |vinf|____|_v1_|_v0_|
8086d7f5d3SJohn Marino       s+t  2n-1 2n+1  2n
8186d7f5d3SJohn Marino 
8286d7f5d3SJohn Marino    The other recursive products, vm1, v2, vm2, vh are stored in the
8386d7f5d3SJohn Marino    scratch area. When computing them, we use the product area for
8486d7f5d3SJohn Marino    intermediate values.
8586d7f5d3SJohn Marino 
8686d7f5d3SJohn Marino    Next, we compute v1. We can store the intermediate factors at v0
8786d7f5d3SJohn Marino    and at vh + 2n + 2.
8886d7f5d3SJohn Marino 
8986d7f5d3SJohn Marino    Finally, for v0 and vinf, factors are parts of the input operands,
9086d7f5d3SJohn Marino    and we need scratch space only for the recursive multiplication.
9186d7f5d3SJohn Marino 
9286d7f5d3SJohn Marino    In all, if S(an) is the scratch need, the needed space is bounded by
9386d7f5d3SJohn Marino 
9486d7f5d3SJohn Marino      S(an) <= 4 (2*ceil(an/4) + 1) + 1 + S(ceil(an/4) + 1)
9586d7f5d3SJohn Marino 
9686d7f5d3SJohn Marino    which should give S(n) = 8 n/3 + c log(n) for some constant c.
9786d7f5d3SJohn Marino */
9886d7f5d3SJohn Marino 
9986d7f5d3SJohn Marino void
mpn_toom44_mul(mp_ptr pp,mp_srcptr ap,mp_size_t an,mp_srcptr bp,mp_size_t bn,mp_ptr scratch)10086d7f5d3SJohn Marino mpn_toom44_mul (mp_ptr pp,
10186d7f5d3SJohn Marino 		mp_srcptr ap, mp_size_t an,
10286d7f5d3SJohn Marino 		mp_srcptr bp, mp_size_t bn,
10386d7f5d3SJohn Marino 		mp_ptr scratch)
10486d7f5d3SJohn Marino {
10586d7f5d3SJohn Marino   mp_size_t n, s, t;
10686d7f5d3SJohn Marino   mp_limb_t cy;
10786d7f5d3SJohn Marino   enum toom7_flags flags;
10886d7f5d3SJohn Marino 
10986d7f5d3SJohn Marino #define a0  ap
11086d7f5d3SJohn Marino #define a1  (ap + n)
11186d7f5d3SJohn Marino #define a2  (ap + 2*n)
11286d7f5d3SJohn Marino #define a3  (ap + 3*n)
11386d7f5d3SJohn Marino #define b0  bp
11486d7f5d3SJohn Marino #define b1  (bp + n)
11586d7f5d3SJohn Marino #define b2  (bp + 2*n)
11686d7f5d3SJohn Marino #define b3  (bp + 3*n)
11786d7f5d3SJohn Marino 
11886d7f5d3SJohn Marino   ASSERT (an >= bn);
11986d7f5d3SJohn Marino 
12086d7f5d3SJohn Marino   n = (an + 3) >> 2;
12186d7f5d3SJohn Marino 
12286d7f5d3SJohn Marino   s = an - 3 * n;
12386d7f5d3SJohn Marino   t = bn - 3 * n;
12486d7f5d3SJohn Marino 
12586d7f5d3SJohn Marino   ASSERT (0 < s && s <= n);
12686d7f5d3SJohn Marino   ASSERT (0 < t && t <= n);
12786d7f5d3SJohn Marino   ASSERT (s >= t);
12886d7f5d3SJohn Marino 
12986d7f5d3SJohn Marino   /* NOTE: The multiplications to v2, vm2, vh and vm1 overwrites the
13086d7f5d3SJohn Marino    * following limb, so these must be computed in order, and we need a
13186d7f5d3SJohn Marino    * one limb gap to tp. */
13286d7f5d3SJohn Marino #define v0    pp				/* 2n */
13386d7f5d3SJohn Marino #define v1    (pp + 2 * n)			/* 2n+1 */
13486d7f5d3SJohn Marino #define vinf  (pp + 6 * n)			/* s+t */
13586d7f5d3SJohn Marino #define v2    scratch				/* 2n+1 */
13686d7f5d3SJohn Marino #define vm2   (scratch + 2 * n + 1)		/* 2n+1 */
13786d7f5d3SJohn Marino #define vh    (scratch + 4 * n + 2)		/* 2n+1 */
13886d7f5d3SJohn Marino #define vm1   (scratch + 6 * n + 3)		/* 2n+1 */
13986d7f5d3SJohn Marino #define tp (scratch + 8*n + 5)
14086d7f5d3SJohn Marino 
14186d7f5d3SJohn Marino   /* apx and bpx must not overlap with v1 */
14286d7f5d3SJohn Marino #define apx   pp				/* n+1 */
14386d7f5d3SJohn Marino #define amx   (pp + n + 1)			/* n+1 */
14486d7f5d3SJohn Marino #define bmx   (pp + 2*n + 2)			/* n+1 */
14586d7f5d3SJohn Marino #define bpx   (pp + 4*n + 2)			/* n+1 */
14686d7f5d3SJohn Marino 
14786d7f5d3SJohn Marino   /* Total scratch need: 8*n + 5 + scratch for recursive calls. This
14886d7f5d3SJohn Marino      gives roughly 32 n/3 + log term. */
14986d7f5d3SJohn Marino 
15086d7f5d3SJohn Marino   /* Compute apx = a0 + 2 a1 + 4 a2 + 8 a3 and amx = a0 - 2 a1 + 4 a2 - 8 a3.  */
15186d7f5d3SJohn Marino   flags = toom7_w1_neg & mpn_toom_eval_dgr3_pm2 (apx, amx, ap, n, s, tp);
15286d7f5d3SJohn Marino 
15386d7f5d3SJohn Marino   /* Compute bpx = b0 + 2 b1 + 4 b2 + 8 b3 and bmx = b0 - 2 b1 + 4 b2 - 8 b3.  */
15486d7f5d3SJohn Marino   flags ^= toom7_w1_neg & mpn_toom_eval_dgr3_pm2 (bpx, bmx, bp, n, t, tp);
15586d7f5d3SJohn Marino 
15686d7f5d3SJohn Marino   TOOM44_MUL_N_REC (v2, apx, bpx, n + 1, tp);	/* v2,  2n+1 limbs */
15786d7f5d3SJohn Marino   TOOM44_MUL_N_REC (vm2, amx, bmx, n + 1, tp);	/* vm2,  2n+1 limbs */
15886d7f5d3SJohn Marino 
15986d7f5d3SJohn Marino   /* Compute apx = 8 a0 + 4 a1 + 2 a2 + a3 = (((2*a0 + a1) * 2 + a2) * 2 + a3 */
16086d7f5d3SJohn Marino #if HAVE_NATIVE_mpn_addlsh1_n
16186d7f5d3SJohn Marino   cy = mpn_addlsh1_n (apx, a1, a0, n);
16286d7f5d3SJohn Marino   cy = 2*cy + mpn_addlsh1_n (apx, a2, apx, n);
16386d7f5d3SJohn Marino   if (s < n)
16486d7f5d3SJohn Marino     {
16586d7f5d3SJohn Marino       mp_limb_t cy2;
16686d7f5d3SJohn Marino       cy2 = mpn_addlsh1_n (apx, a3, apx, s);
16786d7f5d3SJohn Marino       apx[n] = 2*cy + mpn_lshift (apx + s, apx + s, n - s, 1);
16886d7f5d3SJohn Marino       MPN_INCR_U (apx + s, n+1-s, cy2);
16986d7f5d3SJohn Marino     }
17086d7f5d3SJohn Marino   else
17186d7f5d3SJohn Marino     apx[n] = 2*cy + mpn_addlsh1_n (apx, a3, apx, n);
17286d7f5d3SJohn Marino #else
17386d7f5d3SJohn Marino   cy = mpn_lshift (apx, a0, n, 1);
17486d7f5d3SJohn Marino   cy += mpn_add_n (apx, apx, a1, n);
17586d7f5d3SJohn Marino   cy = 2*cy + mpn_lshift (apx, apx, n, 1);
17686d7f5d3SJohn Marino   cy += mpn_add_n (apx, apx, a2, n);
17786d7f5d3SJohn Marino   cy = 2*cy + mpn_lshift (apx, apx, n, 1);
17886d7f5d3SJohn Marino   apx[n] = cy + mpn_add (apx, apx, n, a3, s);
17986d7f5d3SJohn Marino #endif
18086d7f5d3SJohn Marino 
18186d7f5d3SJohn Marino   /* Compute bpx = 8 b0 + 4 b1 + 2 b2 + b3 = (((2*b0 + b1) * 2 + b2) * 2 + b3 */
18286d7f5d3SJohn Marino #if HAVE_NATIVE_mpn_addlsh1_n
18386d7f5d3SJohn Marino   cy = mpn_addlsh1_n (bpx, b1, b0, n);
18486d7f5d3SJohn Marino   cy = 2*cy + mpn_addlsh1_n (bpx, b2, bpx, n);
18586d7f5d3SJohn Marino   if (t < n)
18686d7f5d3SJohn Marino     {
18786d7f5d3SJohn Marino       mp_limb_t cy2;
18886d7f5d3SJohn Marino       cy2 = mpn_addlsh1_n (bpx, b3, bpx, t);
18986d7f5d3SJohn Marino       bpx[n] = 2*cy + mpn_lshift (bpx + t, bpx + t, n - t, 1);
19086d7f5d3SJohn Marino       MPN_INCR_U (bpx + t, n+1-t, cy2);
19186d7f5d3SJohn Marino     }
19286d7f5d3SJohn Marino   else
19386d7f5d3SJohn Marino     bpx[n] = 2*cy + mpn_addlsh1_n (bpx, b3, bpx, n);
19486d7f5d3SJohn Marino #else
19586d7f5d3SJohn Marino   cy = mpn_lshift (bpx, b0, n, 1);
19686d7f5d3SJohn Marino   cy += mpn_add_n (bpx, bpx, b1, n);
19786d7f5d3SJohn Marino   cy = 2*cy + mpn_lshift (bpx, bpx, n, 1);
19886d7f5d3SJohn Marino   cy += mpn_add_n (bpx, bpx, b2, n);
19986d7f5d3SJohn Marino   cy = 2*cy + mpn_lshift (bpx, bpx, n, 1);
20086d7f5d3SJohn Marino   bpx[n] = cy + mpn_add (bpx, bpx, n, b3, t);
20186d7f5d3SJohn Marino #endif
20286d7f5d3SJohn Marino 
20386d7f5d3SJohn Marino   ASSERT (apx[n] < 15);
20486d7f5d3SJohn Marino   ASSERT (bpx[n] < 15);
20586d7f5d3SJohn Marino 
20686d7f5d3SJohn Marino   TOOM44_MUL_N_REC (vh, apx, bpx, n + 1, tp);	/* vh,  2n+1 limbs */
20786d7f5d3SJohn Marino 
20886d7f5d3SJohn Marino   /* Compute apx = a0 + a1 + a2 + a3 and amx = a0 - a1 + a2 - a3.  */
20986d7f5d3SJohn Marino   flags |= toom7_w3_neg & mpn_toom_eval_dgr3_pm1 (apx, amx, ap, n, s, tp);
21086d7f5d3SJohn Marino 
21186d7f5d3SJohn Marino   /* Compute bpx = b0 + b1 + b2 + b3 bnd bmx = b0 - b1 + b2 - b3.  */
21286d7f5d3SJohn Marino   flags ^= toom7_w3_neg & mpn_toom_eval_dgr3_pm1 (bpx, bmx, bp, n, t, tp);
21386d7f5d3SJohn Marino 
21486d7f5d3SJohn Marino   TOOM44_MUL_N_REC (vm1, amx, bmx, n + 1, tp);	/* vm1,  2n+1 limbs */
21586d7f5d3SJohn Marino   /* Clobbers amx, bmx. */
21686d7f5d3SJohn Marino   TOOM44_MUL_N_REC (v1, apx, bpx, n + 1, tp);	/* v1,  2n+1 limbs */
21786d7f5d3SJohn Marino 
21886d7f5d3SJohn Marino   TOOM44_MUL_N_REC (v0, a0, b0, n, tp);
21986d7f5d3SJohn Marino   if (s > t)
22086d7f5d3SJohn Marino     mpn_mul (vinf, a3, s, b3, t);
22186d7f5d3SJohn Marino   else
22286d7f5d3SJohn Marino     TOOM44_MUL_N_REC (vinf, a3, b3, s, tp);	/* vinf, s+t limbs */
22386d7f5d3SJohn Marino 
22486d7f5d3SJohn Marino   mpn_toom_interpolate_7pts (pp, n, flags, vm2, vm1, v2, vh, s + t, tp);
22586d7f5d3SJohn Marino }
226