xref: /dflybsd-src/contrib/gmp/mpn/generic/toom44_mul.c (revision 86d7f5d305c6adaa56ff4582ece9859d73106103)
1*86d7f5d3SJohn Marino /* mpn_toom44_mul -- Multiply {ap,an} and {bp,bn} where an and bn are close in
2*86d7f5d3SJohn Marino    size.  Or more accurately, bn <= an < (4/3)bn.
3*86d7f5d3SJohn Marino 
4*86d7f5d3SJohn Marino    Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
5*86d7f5d3SJohn Marino 
6*86d7f5d3SJohn Marino    THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
7*86d7f5d3SJohn Marino    SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
8*86d7f5d3SJohn Marino    GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
9*86d7f5d3SJohn Marino 
10*86d7f5d3SJohn Marino Copyright 2006, 2007, 2008 Free Software Foundation, Inc.
11*86d7f5d3SJohn Marino 
12*86d7f5d3SJohn Marino This file is part of the GNU MP Library.
13*86d7f5d3SJohn Marino 
14*86d7f5d3SJohn Marino The GNU MP Library is free software; you can redistribute it and/or modify
15*86d7f5d3SJohn Marino it under the terms of the GNU Lesser General Public License as published by
16*86d7f5d3SJohn Marino the Free Software Foundation; either version 3 of the License, or (at your
17*86d7f5d3SJohn Marino option) any later version.
18*86d7f5d3SJohn Marino 
19*86d7f5d3SJohn Marino The GNU MP Library is distributed in the hope that it will be useful, but
20*86d7f5d3SJohn Marino WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
21*86d7f5d3SJohn Marino or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
22*86d7f5d3SJohn Marino License for more details.
23*86d7f5d3SJohn Marino 
24*86d7f5d3SJohn Marino You should have received a copy of the GNU Lesser General Public License
25*86d7f5d3SJohn Marino along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
26*86d7f5d3SJohn Marino 
27*86d7f5d3SJohn Marino 
28*86d7f5d3SJohn Marino #include "gmp.h"
29*86d7f5d3SJohn Marino #include "gmp-impl.h"
30*86d7f5d3SJohn Marino 
31*86d7f5d3SJohn Marino /* Evaluate in: 0, +1, -1, +2, -2, 1/2, +inf
32*86d7f5d3SJohn Marino 
33*86d7f5d3SJohn Marino   <-s--><--n--><--n--><--n-->
34*86d7f5d3SJohn Marino    ____ ______ ______ ______
35*86d7f5d3SJohn Marino   |_a3_|___a2_|___a1_|___a0_|
36*86d7f5d3SJohn Marino    |b3_|___b2_|___b1_|___b0_|
37*86d7f5d3SJohn Marino    <-t-><--n--><--n--><--n-->
38*86d7f5d3SJohn Marino 
39*86d7f5d3SJohn Marino   v0  =   a0             *  b0              #    A(0)*B(0)
40*86d7f5d3SJohn Marino   v1  = ( a0+ a1+ a2+ a3)*( b0+ b1+ b2+ b3) #    A(1)*B(1)      ah  <= 3   bh  <= 3
41*86d7f5d3SJohn Marino   vm1 = ( a0- a1+ a2- a3)*( b0- b1+ b2- b3) #   A(-1)*B(-1)    |ah| <= 1  |bh| <= 1
42*86d7f5d3SJohn Marino   v2  = ( a0+2a1+4a2+8a3)*( b0+2b1+4b2+8b3) #    A(2)*B(2)      ah  <= 14  bh  <= 14
43*86d7f5d3SJohn Marino   vm2 = ( a0-2a1+4a2-8a3)*( b0-2b1+4b2-8b3) #    A(2)*B(2)      ah  <= 9  |bh| <= 9
44*86d7f5d3SJohn Marino   vh  = (8a0+4a1+2a2+ a3)*(8b0+4b1+2b2+ b3) #  A(1/2)*B(1/2)    ah  <= 14  bh  <= 14
45*86d7f5d3SJohn Marino   vinf=               a3 *          b2      #  A(inf)*B(inf)
46*86d7f5d3SJohn Marino */
47*86d7f5d3SJohn Marino 
48*86d7f5d3SJohn Marino #if TUNE_PROGRAM_BUILD
49*86d7f5d3SJohn Marino #define MAYBE_mul_basecase 1
50*86d7f5d3SJohn Marino #define MAYBE_mul_toom22   1
51*86d7f5d3SJohn Marino #define MAYBE_mul_toom44   1
52*86d7f5d3SJohn Marino #else
53*86d7f5d3SJohn Marino #define MAYBE_mul_basecase						\
54*86d7f5d3SJohn Marino   (MUL_TOOM44_THRESHOLD < 4 * MUL_TOOM22_THRESHOLD)
55*86d7f5d3SJohn Marino #define MAYBE_mul_toom22						\
56*86d7f5d3SJohn Marino   (MUL_TOOM44_THRESHOLD < 4 * MUL_TOOM33_THRESHOLD)
57*86d7f5d3SJohn Marino #define MAYBE_mul_toom44						\
58*86d7f5d3SJohn Marino   (MUL_FFT_THRESHOLD >= 4 * MUL_TOOM44_THRESHOLD)
59*86d7f5d3SJohn Marino #endif
60*86d7f5d3SJohn Marino 
61*86d7f5d3SJohn Marino #define TOOM44_MUL_N_REC(p, a, b, n, ws)				\
62*86d7f5d3SJohn Marino   do {									\
63*86d7f5d3SJohn Marino     if (MAYBE_mul_basecase						\
64*86d7f5d3SJohn Marino 	&& BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD))			\
65*86d7f5d3SJohn Marino       mpn_mul_basecase (p, a, n, b, n);					\
66*86d7f5d3SJohn Marino     else if (MAYBE_mul_toom22						\
67*86d7f5d3SJohn Marino 	     && BELOW_THRESHOLD (n, MUL_TOOM33_THRESHOLD))		\
68*86d7f5d3SJohn Marino       mpn_toom22_mul (p, a, n, b, n, ws);				\
69*86d7f5d3SJohn Marino     else if (! MAYBE_mul_toom44						\
70*86d7f5d3SJohn Marino 	     || BELOW_THRESHOLD (n, MUL_TOOM44_THRESHOLD))		\
71*86d7f5d3SJohn Marino       mpn_toom33_mul (p, a, n, b, n, ws);				\
72*86d7f5d3SJohn Marino     else								\
73*86d7f5d3SJohn Marino       mpn_toom44_mul (p, a, n, b, n, ws);				\
74*86d7f5d3SJohn Marino   } while (0)
75*86d7f5d3SJohn Marino 
76*86d7f5d3SJohn Marino /* Use of scratch space. In the product area, we store
77*86d7f5d3SJohn Marino 
78*86d7f5d3SJohn Marino       ___________________
79*86d7f5d3SJohn Marino      |vinf|____|_v1_|_v0_|
80*86d7f5d3SJohn Marino       s+t  2n-1 2n+1  2n
81*86d7f5d3SJohn Marino 
82*86d7f5d3SJohn Marino    The other recursive products, vm1, v2, vm2, vh are stored in the
83*86d7f5d3SJohn Marino    scratch area. When computing them, we use the product area for
84*86d7f5d3SJohn Marino    intermediate values.
85*86d7f5d3SJohn Marino 
86*86d7f5d3SJohn Marino    Next, we compute v1. We can store the intermediate factors at v0
87*86d7f5d3SJohn Marino    and at vh + 2n + 2.
88*86d7f5d3SJohn Marino 
89*86d7f5d3SJohn Marino    Finally, for v0 and vinf, factors are parts of the input operands,
90*86d7f5d3SJohn Marino    and we need scratch space only for the recursive multiplication.
91*86d7f5d3SJohn Marino 
92*86d7f5d3SJohn Marino    In all, if S(an) is the scratch need, the needed space is bounded by
93*86d7f5d3SJohn Marino 
94*86d7f5d3SJohn Marino      S(an) <= 4 (2*ceil(an/4) + 1) + 1 + S(ceil(an/4) + 1)
95*86d7f5d3SJohn Marino 
96*86d7f5d3SJohn Marino    which should give S(n) = 8 n/3 + c log(n) for some constant c.
97*86d7f5d3SJohn Marino */
98*86d7f5d3SJohn Marino 
99*86d7f5d3SJohn Marino void
mpn_toom44_mul(mp_ptr pp,mp_srcptr ap,mp_size_t an,mp_srcptr bp,mp_size_t bn,mp_ptr scratch)100*86d7f5d3SJohn Marino mpn_toom44_mul (mp_ptr pp,
101*86d7f5d3SJohn Marino 		mp_srcptr ap, mp_size_t an,
102*86d7f5d3SJohn Marino 		mp_srcptr bp, mp_size_t bn,
103*86d7f5d3SJohn Marino 		mp_ptr scratch)
104*86d7f5d3SJohn Marino {
105*86d7f5d3SJohn Marino   mp_size_t n, s, t;
106*86d7f5d3SJohn Marino   mp_limb_t cy;
107*86d7f5d3SJohn Marino   enum toom7_flags flags;
108*86d7f5d3SJohn Marino 
109*86d7f5d3SJohn Marino #define a0  ap
110*86d7f5d3SJohn Marino #define a1  (ap + n)
111*86d7f5d3SJohn Marino #define a2  (ap + 2*n)
112*86d7f5d3SJohn Marino #define a3  (ap + 3*n)
113*86d7f5d3SJohn Marino #define b0  bp
114*86d7f5d3SJohn Marino #define b1  (bp + n)
115*86d7f5d3SJohn Marino #define b2  (bp + 2*n)
116*86d7f5d3SJohn Marino #define b3  (bp + 3*n)
117*86d7f5d3SJohn Marino 
118*86d7f5d3SJohn Marino   ASSERT (an >= bn);
119*86d7f5d3SJohn Marino 
120*86d7f5d3SJohn Marino   n = (an + 3) >> 2;
121*86d7f5d3SJohn Marino 
122*86d7f5d3SJohn Marino   s = an - 3 * n;
123*86d7f5d3SJohn Marino   t = bn - 3 * n;
124*86d7f5d3SJohn Marino 
125*86d7f5d3SJohn Marino   ASSERT (0 < s && s <= n);
126*86d7f5d3SJohn Marino   ASSERT (0 < t && t <= n);
127*86d7f5d3SJohn Marino   ASSERT (s >= t);
128*86d7f5d3SJohn Marino 
129*86d7f5d3SJohn Marino   /* NOTE: The multiplications to v2, vm2, vh and vm1 overwrites the
130*86d7f5d3SJohn Marino    * following limb, so these must be computed in order, and we need a
131*86d7f5d3SJohn Marino    * one limb gap to tp. */
132*86d7f5d3SJohn Marino #define v0    pp				/* 2n */
133*86d7f5d3SJohn Marino #define v1    (pp + 2 * n)			/* 2n+1 */
134*86d7f5d3SJohn Marino #define vinf  (pp + 6 * n)			/* s+t */
135*86d7f5d3SJohn Marino #define v2    scratch				/* 2n+1 */
136*86d7f5d3SJohn Marino #define vm2   (scratch + 2 * n + 1)		/* 2n+1 */
137*86d7f5d3SJohn Marino #define vh    (scratch + 4 * n + 2)		/* 2n+1 */
138*86d7f5d3SJohn Marino #define vm1   (scratch + 6 * n + 3)		/* 2n+1 */
139*86d7f5d3SJohn Marino #define tp (scratch + 8*n + 5)
140*86d7f5d3SJohn Marino 
141*86d7f5d3SJohn Marino   /* apx and bpx must not overlap with v1 */
142*86d7f5d3SJohn Marino #define apx   pp				/* n+1 */
143*86d7f5d3SJohn Marino #define amx   (pp + n + 1)			/* n+1 */
144*86d7f5d3SJohn Marino #define bmx   (pp + 2*n + 2)			/* n+1 */
145*86d7f5d3SJohn Marino #define bpx   (pp + 4*n + 2)			/* n+1 */
146*86d7f5d3SJohn Marino 
147*86d7f5d3SJohn Marino   /* Total scratch need: 8*n + 5 + scratch for recursive calls. This
148*86d7f5d3SJohn Marino      gives roughly 32 n/3 + log term. */
149*86d7f5d3SJohn Marino 
150*86d7f5d3SJohn Marino   /* Compute apx = a0 + 2 a1 + 4 a2 + 8 a3 and amx = a0 - 2 a1 + 4 a2 - 8 a3.  */
151*86d7f5d3SJohn Marino   flags = toom7_w1_neg & mpn_toom_eval_dgr3_pm2 (apx, amx, ap, n, s, tp);
152*86d7f5d3SJohn Marino 
153*86d7f5d3SJohn Marino   /* Compute bpx = b0 + 2 b1 + 4 b2 + 8 b3 and bmx = b0 - 2 b1 + 4 b2 - 8 b3.  */
154*86d7f5d3SJohn Marino   flags ^= toom7_w1_neg & mpn_toom_eval_dgr3_pm2 (bpx, bmx, bp, n, t, tp);
155*86d7f5d3SJohn Marino 
156*86d7f5d3SJohn Marino   TOOM44_MUL_N_REC (v2, apx, bpx, n + 1, tp);	/* v2,  2n+1 limbs */
157*86d7f5d3SJohn Marino   TOOM44_MUL_N_REC (vm2, amx, bmx, n + 1, tp);	/* vm2,  2n+1 limbs */
158*86d7f5d3SJohn Marino 
159*86d7f5d3SJohn Marino   /* Compute apx = 8 a0 + 4 a1 + 2 a2 + a3 = (((2*a0 + a1) * 2 + a2) * 2 + a3 */
160*86d7f5d3SJohn Marino #if HAVE_NATIVE_mpn_addlsh1_n
161*86d7f5d3SJohn Marino   cy = mpn_addlsh1_n (apx, a1, a0, n);
162*86d7f5d3SJohn Marino   cy = 2*cy + mpn_addlsh1_n (apx, a2, apx, n);
163*86d7f5d3SJohn Marino   if (s < n)
164*86d7f5d3SJohn Marino     {
165*86d7f5d3SJohn Marino       mp_limb_t cy2;
166*86d7f5d3SJohn Marino       cy2 = mpn_addlsh1_n (apx, a3, apx, s);
167*86d7f5d3SJohn Marino       apx[n] = 2*cy + mpn_lshift (apx + s, apx + s, n - s, 1);
168*86d7f5d3SJohn Marino       MPN_INCR_U (apx + s, n+1-s, cy2);
169*86d7f5d3SJohn Marino     }
170*86d7f5d3SJohn Marino   else
171*86d7f5d3SJohn Marino     apx[n] = 2*cy + mpn_addlsh1_n (apx, a3, apx, n);
172*86d7f5d3SJohn Marino #else
173*86d7f5d3SJohn Marino   cy = mpn_lshift (apx, a0, n, 1);
174*86d7f5d3SJohn Marino   cy += mpn_add_n (apx, apx, a1, n);
175*86d7f5d3SJohn Marino   cy = 2*cy + mpn_lshift (apx, apx, n, 1);
176*86d7f5d3SJohn Marino   cy += mpn_add_n (apx, apx, a2, n);
177*86d7f5d3SJohn Marino   cy = 2*cy + mpn_lshift (apx, apx, n, 1);
178*86d7f5d3SJohn Marino   apx[n] = cy + mpn_add (apx, apx, n, a3, s);
179*86d7f5d3SJohn Marino #endif
180*86d7f5d3SJohn Marino 
181*86d7f5d3SJohn Marino   /* Compute bpx = 8 b0 + 4 b1 + 2 b2 + b3 = (((2*b0 + b1) * 2 + b2) * 2 + b3 */
182*86d7f5d3SJohn Marino #if HAVE_NATIVE_mpn_addlsh1_n
183*86d7f5d3SJohn Marino   cy = mpn_addlsh1_n (bpx, b1, b0, n);
184*86d7f5d3SJohn Marino   cy = 2*cy + mpn_addlsh1_n (bpx, b2, bpx, n);
185*86d7f5d3SJohn Marino   if (t < n)
186*86d7f5d3SJohn Marino     {
187*86d7f5d3SJohn Marino       mp_limb_t cy2;
188*86d7f5d3SJohn Marino       cy2 = mpn_addlsh1_n (bpx, b3, bpx, t);
189*86d7f5d3SJohn Marino       bpx[n] = 2*cy + mpn_lshift (bpx + t, bpx + t, n - t, 1);
190*86d7f5d3SJohn Marino       MPN_INCR_U (bpx + t, n+1-t, cy2);
191*86d7f5d3SJohn Marino     }
192*86d7f5d3SJohn Marino   else
193*86d7f5d3SJohn Marino     bpx[n] = 2*cy + mpn_addlsh1_n (bpx, b3, bpx, n);
194*86d7f5d3SJohn Marino #else
195*86d7f5d3SJohn Marino   cy = mpn_lshift (bpx, b0, n, 1);
196*86d7f5d3SJohn Marino   cy += mpn_add_n (bpx, bpx, b1, n);
197*86d7f5d3SJohn Marino   cy = 2*cy + mpn_lshift (bpx, bpx, n, 1);
198*86d7f5d3SJohn Marino   cy += mpn_add_n (bpx, bpx, b2, n);
199*86d7f5d3SJohn Marino   cy = 2*cy + mpn_lshift (bpx, bpx, n, 1);
200*86d7f5d3SJohn Marino   bpx[n] = cy + mpn_add (bpx, bpx, n, b3, t);
201*86d7f5d3SJohn Marino #endif
202*86d7f5d3SJohn Marino 
203*86d7f5d3SJohn Marino   ASSERT (apx[n] < 15);
204*86d7f5d3SJohn Marino   ASSERT (bpx[n] < 15);
205*86d7f5d3SJohn Marino 
206*86d7f5d3SJohn Marino   TOOM44_MUL_N_REC (vh, apx, bpx, n + 1, tp);	/* vh,  2n+1 limbs */
207*86d7f5d3SJohn Marino 
208*86d7f5d3SJohn Marino   /* Compute apx = a0 + a1 + a2 + a3 and amx = a0 - a1 + a2 - a3.  */
209*86d7f5d3SJohn Marino   flags |= toom7_w3_neg & mpn_toom_eval_dgr3_pm1 (apx, amx, ap, n, s, tp);
210*86d7f5d3SJohn Marino 
211*86d7f5d3SJohn Marino   /* Compute bpx = b0 + b1 + b2 + b3 bnd bmx = b0 - b1 + b2 - b3.  */
212*86d7f5d3SJohn Marino   flags ^= toom7_w3_neg & mpn_toom_eval_dgr3_pm1 (bpx, bmx, bp, n, t, tp);
213*86d7f5d3SJohn Marino 
214*86d7f5d3SJohn Marino   TOOM44_MUL_N_REC (vm1, amx, bmx, n + 1, tp);	/* vm1,  2n+1 limbs */
215*86d7f5d3SJohn Marino   /* Clobbers amx, bmx. */
216*86d7f5d3SJohn Marino   TOOM44_MUL_N_REC (v1, apx, bpx, n + 1, tp);	/* v1,  2n+1 limbs */
217*86d7f5d3SJohn Marino 
218*86d7f5d3SJohn Marino   TOOM44_MUL_N_REC (v0, a0, b0, n, tp);
219*86d7f5d3SJohn Marino   if (s > t)
220*86d7f5d3SJohn Marino     mpn_mul (vinf, a3, s, b3, t);
221*86d7f5d3SJohn Marino   else
222*86d7f5d3SJohn Marino     TOOM44_MUL_N_REC (vinf, a3, b3, s, tp);	/* vinf, s+t limbs */
223*86d7f5d3SJohn Marino 
224*86d7f5d3SJohn Marino   mpn_toom_interpolate_7pts (pp, n, flags, vm2, vm1, v2, vh, s + t, tp);
225*86d7f5d3SJohn Marino }
226