/* mpn_toom33_mul -- Multiply {ap,an} and {bp,bn} where an and bn are close in
   size.  Or more accurately, bn <= an < (3/2)bn.

   Contributed to the GNU project by Torbjorn Granlund.
   Additional improvements by Marco Bodrato.

   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

Copyright 2006, 2007, 2008, 2010, 2012 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at your
option) any later version.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */


#include "gmp.h"
#include "gmp-impl.h"

/* Evaluate in: -1, 0, +1, +2, +inf

  <-s--><--n--><--n-->
   ____ ______ ______
  |_a2_|___a1_|___a0_|
   |b2_|___b1_|___b0_|
   <-t-><--n--><--n-->

  v0  =  a0         * b0          #   A(0)*B(0)
  v1  = (a0+ a1+ a2)*(b0+ b1+ b2) #   A(1)*B(1)      ah  <= 2  bh <= 2
  vm1 = (a0- a1+ a2)*(b0- b1+ b2) #  A(-1)*B(-1)    |ah| <= 1 |bh| <= 1
  v2  = (a0+2a1+4a2)*(b0+2b1+4b2) #   A(2)*B(2)      ah  <= 6  bh <= 6
  vinf=          a2 *         b2  # A(inf)*B(inf)
*/
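
/* A small illustrative example (one decimal digit per piece rather than n
   limbs): with a2,a1,a0 = 3,2,1 and b2,b1,b0 = 6,5,4, i.e. A(x) = 3x^2+2x+1
   and B(x) = 6x^2+5x+4,

     v0   = 1*4                =   4
     v1   = (1+2+3)*(4+5+6)    =  90
     vm1  = (1-2+3)*(4-5+6)    =  10
     v2   = (1+4+12)*(4+10+24) = 17*38 = 646
     vinf = 3*6                =  18

   and interpolation recovers A(x)*B(x) = 18x^4+27x^3+28x^2+13x+4, whose
   values at 0, 1, -1, 2 and "infinity" are exactly these five numbers.  */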

#if TUNE_PROGRAM_BUILD || WANT_FAT_BINARY
#define MAYBE_mul_basecase 1
#define MAYBE_mul_toom33   1
#else
#define MAYBE_mul_basecase						\
  (MUL_TOOM33_THRESHOLD < 3 * MUL_TOOM22_THRESHOLD)
#define MAYBE_mul_toom33						\
  (MUL_TOOM44_THRESHOLD >= 3 * MUL_TOOM33_THRESHOLD)
#endif

/* FIXME: TOOM33_MUL_N_REC is not quite right for a balanced
   multiplication at the infinity point. We may have
   MAYBE_mul_basecase == 0, and still get s just below
   MUL_TOOM22_THRESHOLD. If MUL_TOOM33_THRESHOLD == 7, we can even get
   s == 1 and mpn_toom22_mul will crash.
*/

#define TOOM33_MUL_N_REC(p, a, b, n, ws)				\
  do {									\
    if (MAYBE_mul_basecase						\
	&& BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD))			\
      mpn_mul_basecase (p, a, n, b, n);					\
    else if (! MAYBE_mul_toom33						\
	     || BELOW_THRESHOLD (n, MUL_TOOM33_THRESHOLD))		\
      mpn_toom22_mul (p, a, n, b, n, ws);				\
    else								\
      mpn_toom33_mul (p, a, n, b, n, ws);				\
  } while (0)
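
/* Note: this function is normally reached via mpn_mul once the operand sizes
   pass MUL_TOOM33_THRESHOLD; a direct caller must supply scratch space of
   mpn_toom33_mul_itch (an, bn) limbs (see gmp-impl.h).  A minimal sketch of a
   direct call, assuming the usual TMP_* allocation helpers from gmp-impl.h:

     TMP_DECL;
     mp_ptr ws;
     TMP_MARK;
     ws = TMP_ALLOC_LIMBS (mpn_toom33_mul_itch (an, bn));
     mpn_toom33_mul (pp, ap, an, bp, bn, ws);
     TMP_FREE;
*/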

void
mpn_toom33_mul (mp_ptr pp,
		mp_srcptr ap, mp_size_t an,
		mp_srcptr bp, mp_size_t bn,
		mp_ptr scratch)
{
  const int __gmpn_cpuvec_initialized = 1;
  mp_size_t n, s, t;
  int vm1_neg;
  mp_limb_t cy, vinf0;
  mp_ptr gp;
  mp_ptr as1, asm1, as2;
  mp_ptr bs1, bsm1, bs2;

#define a0  ap
#define a1  (ap + n)
#define a2  (ap + 2*n)
#define b0  bp
#define b1  (bp + n)
#define b2  (bp + 2*n)

  n = (an + 2) / (size_t) 3;

  s = an - 2 * n;
  t = bn - 2 * n;

  ASSERT (an >= bn);

  ASSERT (0 < s && s <= n);
  ASSERT (0 < t && t <= n);

  as1  = scratch + 4 * n + 4;
  asm1 = scratch + 2 * n + 2;
  as2 = pp + n + 1;

  bs1 = pp;
  bsm1 = scratch + 3 * n + 3; /* we need 4n+4 <= 4n+s+t */
  bs2 = pp + 2 * n + 2;

  gp = scratch;

  vm1_neg = 0;

  /* Compute as1 and asm1.  */
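  /* as1 holds A(1) = (a0 + a2) + a1 and asm1 holds |A(-1)| = |(a0 + a2) - a1|;
     the sign of A(-1) is recorded in vm1_neg rather than being kept in asm1.  */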
  cy = mpn_add (gp, a0, n, a2, s);
#if HAVE_NATIVE_mpn_add_n_sub_n
  if (cy == 0 && mpn_cmp (gp, a1, n) < 0)
    {
      cy = mpn_add_n_sub_n (as1, asm1, a1, gp, n);
      as1[n] = cy >> 1;
      asm1[n] = 0;
      vm1_neg = 1;
    }
  else
    {
      mp_limb_t cy2;
      cy2 = mpn_add_n_sub_n (as1, asm1, gp, a1, n);
      as1[n] = cy + (cy2 >> 1);
      asm1[n] = cy - (cy2 & 1);
    }
#else
  as1[n] = cy + mpn_add_n (as1, gp, a1, n);
  if (cy == 0 && mpn_cmp (gp, a1, n) < 0)
    {
      mpn_sub_n (asm1, a1, gp, n);
      asm1[n] = 0;
      vm1_neg = 1;
    }
  else
    {
      cy -= mpn_sub_n (asm1, gp, a1, n);
      asm1[n] = cy;
    }
#endif

  /* Compute as2.  */
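  /* as2 ends up holding A(2) = a0 + 2*a1 + 4*a2, formed either as
     2*(as1 + a2) - a0 or as a0 + 2*(a1 + 2*a2), depending on which native
     primitives are available; the top limb lands in as2[n] and is at most 6.  */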
#if HAVE_NATIVE_mpn_rsblsh1_n
  cy = mpn_add_n (as2, a2, as1, s);
  if (s != n)
    cy = mpn_add_1 (as2 + s, as1 + s, n - s, cy);
  cy += as1[n];
  cy = 2 * cy + mpn_rsblsh1_n (as2, a0, as2, n);
#else
#if HAVE_NATIVE_mpn_addlsh1_n
  cy  = mpn_addlsh1_n (as2, a1, a2, s);
  if (s != n)
    cy = mpn_add_1 (as2 + s, a1 + s, n - s, cy);
  cy = 2 * cy + mpn_addlsh1_n (as2, a0, as2, n);
#else
  cy = mpn_add_n (as2, a2, as1, s);
  if (s != n)
    cy = mpn_add_1 (as2 + s, as1 + s, n - s, cy);
  cy += as1[n];
  cy = 2 * cy + mpn_lshift (as2, as2, n, 1);
  cy -= mpn_sub_n (as2, as2, a0, n);
#endif
#endif
  as2[n] = cy;

  /* Compute bs1 and bsm1.  */
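  /* The B operand is evaluated exactly like A above; the sign of B(-1) is
     xored into vm1_neg, since vm1 = A(-1)*B(-1) is negative exactly when the
     two signs differ.  */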
  cy = mpn_add (gp, b0, n, b2, t);
#if HAVE_NATIVE_mpn_add_n_sub_n
  if (cy == 0 && mpn_cmp (gp, b1, n) < 0)
    {
      cy = mpn_add_n_sub_n (bs1, bsm1, b1, gp, n);
      bs1[n] = cy >> 1;
      bsm1[n] = 0;
      vm1_neg ^= 1;
    }
  else
    {
      mp_limb_t cy2;
      cy2 = mpn_add_n_sub_n (bs1, bsm1, gp, b1, n);
      bs1[n] = cy + (cy2 >> 1);
      bsm1[n] = cy - (cy2 & 1);
    }
#else
  bs1[n] = cy + mpn_add_n (bs1, gp, b1, n);
  if (cy == 0 && mpn_cmp (gp, b1, n) < 0)
    {
      mpn_sub_n (bsm1, b1, gp, n);
      bsm1[n] = 0;
      vm1_neg ^= 1;
    }
  else
    {
      cy -= mpn_sub_n (bsm1, gp, b1, n);
      bsm1[n] = cy;
    }
#endif

  /* Compute bs2.  */
#if HAVE_NATIVE_mpn_rsblsh1_n
  cy = mpn_add_n (bs2, b2, bs1, t);
  if (t != n)
    cy = mpn_add_1 (bs2 + t, bs1 + t, n - t, cy);
  cy += bs1[n];
  cy = 2 * cy + mpn_rsblsh1_n (bs2, b0, bs2, n);
#else
#if HAVE_NATIVE_mpn_addlsh1_n
  cy  = mpn_addlsh1_n (bs2, b1, b2, t);
  if (t != n)
    cy = mpn_add_1 (bs2 + t, b1 + t, n - t, cy);
  cy = 2 * cy + mpn_addlsh1_n (bs2, b0, bs2, n);
#else
  cy  = mpn_add_n (bs2, bs1, b2, t);
  if (t != n)
    cy = mpn_add_1 (bs2 + t, bs1 + t, n - t, cy);
  cy += bs1[n];
  cy = 2 * cy + mpn_lshift (bs2, bs2, n, 1);
  cy -= mpn_sub_n (bs2, bs2, b0, n);
#endif
#endif
  bs2[n] = cy;

  ASSERT (as1[n] <= 2);
  ASSERT (bs1[n] <= 2);
  ASSERT (asm1[n] <= 1);
  ASSERT (bsm1[n] <= 1);
  ASSERT (as2[n] <= 6);
  ASSERT (bs2[n] <= 6);

#define v0    pp				/* 2n */
#define v1    (pp + 2 * n)			/* 2n+1 */
#define vinf  (pp + 4 * n)			/* s+t */
#define vm1   scratch				/* 2n+1 */
#define v2    (scratch + 2 * n + 1)		/* 2n+2 */
#define scratch_out  (scratch + 5 * n + 5)
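
/* v0, v1 and vinf are built directly in pp, vm1 and v2 in scratch.  Since
   bs1, as2 and bs2 also live in pp, and asm1/bsm1 overlap v2 in scratch, the
   five products below are formed in the order vm1, v2, vinf, v1, v0, so that
   each evaluation vector is consumed before its space is reused.  */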

  /* vm1, 2n+1 limbs */
#ifdef SMALLER_RECURSION
  TOOM33_MUL_N_REC (vm1, asm1, bsm1, n, scratch_out);
  cy = 0;
  if (asm1[n] != 0)
    cy = bsm1[n] + mpn_add_n (vm1 + n, vm1 + n, bsm1, n);
  if (bsm1[n] != 0)
    cy += mpn_add_n (vm1 + n, vm1 + n, asm1, n);
  vm1[2 * n] = cy;
#else
  TOOM33_MUL_N_REC (vm1, asm1, bsm1, n + 1, scratch_out);
#endif
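  /* With SMALLER_RECURSION defined, the recursive product above (and v1
     below) uses only n limbs, and the small top limbs asm1[n] and bsm1[n]
     are folded in by hand; otherwise an (n+1)-limb product is formed
     directly.  */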

  TOOM33_MUL_N_REC (v2, as2, bs2, n + 1, scratch_out);	/* v2, 2n+1 limbs */

  /* vinf, s+t limbs */
  if (s > t)  mpn_mul (vinf, a2, s, b2, t);
  else        TOOM33_MUL_N_REC (vinf, a2, b2, s, scratch_out);

  vinf0 = vinf[0];				/* v1 overlaps with this */

#ifdef SMALLER_RECURSION
  /* v1, 2n+1 limbs */
  TOOM33_MUL_N_REC (v1, as1, bs1, n, scratch_out);
  if (as1[n] == 1)
    {
      cy = bs1[n] + mpn_add_n (v1 + n, v1 + n, bs1, n);
    }
  else if (as1[n] != 0)
    {
#if HAVE_NATIVE_mpn_addlsh1_n
      cy = 2 * bs1[n] + mpn_addlsh1_n (v1 + n, v1 + n, bs1, n);
#else
      cy = 2 * bs1[n] + mpn_addmul_1 (v1 + n, bs1, n, CNST_LIMB(2));
#endif
    }
  else
    cy = 0;
  if (bs1[n] == 1)
    {
      cy += mpn_add_n (v1 + n, v1 + n, as1, n);
    }
  else if (bs1[n] != 0)
    {
#if HAVE_NATIVE_mpn_addlsh1_n
      cy += mpn_addlsh1_n (v1 + n, v1 + n, as1, n);
#else
      cy += mpn_addmul_1 (v1 + n, as1, n, CNST_LIMB(2));
#endif
    }
  v1[2 * n] = cy;
#else
  cy = vinf[1];
  TOOM33_MUL_N_REC (v1, as1, bs1, n + 1, scratch_out);
  vinf[1] = cy;
#endif

  TOOM33_MUL_N_REC (v0, ap, bp, n, scratch_out);	/* v0, 2n limbs */

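  /* The five point values determine the coefficients c0..c4 of A(x)*B(x):
     c0 = v0, c4 = vinf, c2 = (v1 + vm1)/2 - v0 - vinf, and c1, c3 follow
     from (v1 - vm1)/2 and v2.  The concrete sequence of additions, shifts
     and the exact division by 3 lives in mpn_toom_interpolate_5pts.  */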
  mpn_toom_interpolate_5pts (pp, v2, vm1, n, s + t, vm1_neg, vinf0);
}