/* mpn_toom43_mul -- Multiply {ap,an} and {bp,bn} where an is nominally 4/3
   times as large as bn.  Or more accurately, bn < an < 2 bn.

   Contributed to the GNU project by Marco Bodrato.

   The idea of applying toom to unbalanced multiplication is due to Marco
   Bodrato and Alberto Zanoni.

   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

   Copyright 2009 Free Software Foundation, Inc.

   This file is part of the GNU MP Library.

   The GNU MP Library is free software; you can redistribute it and/or modify
   it under the terms of the GNU Lesser General Public License as published by
   the Free Software Foundation; either version 3 of the License, or (at your
   option) any later version.

   The GNU MP Library is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
   License for more details.

   You should have received a copy of the GNU Lesser General Public License
   along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */


#include "gmp.h"
#include "gmp-impl.h"

/* Evaluate in: -2, -1, 0, +1, +2, +inf

  <-s-><--n--><--n--><--n-->
   ___ ______ ______ ______
  |a3_|___a2_|___a1_|___a0_|
        |_b2_|___b1_|___b0_|
        <-t--><--n--><--n-->

  v0   =  a0             * b0           #   A(0)*B(0)
  v1   = (a0+ a1+ a2+ a3)*(b0+ b1+ b2)  #   A(1)*B(1)     ah  <= 3   bh  <= 2
  vm1  = (a0- a1+ a2- a3)*(b0- b1+ b2)  #  A(-1)*B(-1)   |ah| <= 1  |bh| <= 1
  v2   = (a0+2a1+4a2+8a3)*(b0+2b1+4b2)  #   A(2)*B(2)     ah  <= 14  bh  <= 6
  vm2  = (a0-2a1+4a2-8a3)*(b0-2b1+4b2)  #  A(-2)*B(-2)   |ah| <= 9  |bh| <= 4
  vinf =  a3             * b2           # A(inf)*B(inf)
*/
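
/* Illustration (not part of the GMP sources): on toy single-digit
   coefficients, the six evaluations listed above determine all six
   coefficients of the product polynomial.  A minimal sketch, assuming
   plain C arithmetic with operands small enough not to overflow; the
   function name is hypothetical.  Enable the block and call it from a
   test harness to check the identities.  */
#if 0
#include <assert.h>

static void
toom43_eval_demo (void)
{
  long a[4] = {2, 3, 5, 7};		/* A(x) = 2 + 3x + 5x^2 + 7x^3 */
  long b[3] = {1, 4, 6};		/* B(x) = 1 + 4x + 6x^2 */
  long c[6] = {0, 0, 0, 0, 0, 0};	/* C(x) = A(x)*B(x), degree 5 */
  long pts[5] = {0, 1, -1, 2, -2};
  int i, j;

  /* Schoolbook reference product.  */
  for (i = 0; i < 4; i++)
    for (j = 0; j < 3; j++)
      c[i + j] += a[i] * b[j];

  /* At each finite evaluation point, A(x)*B(x) == C(x); together with
     the point at infinity that gives six equations for the six unknown
     coefficients of C.  */
  for (i = 0; i < 5; i++)
    {
      long x = pts[i], ax = 0, bx = 0, cx = 0, p = 1;
      for (j = 0; j < 4; j++) { ax += a[j] * p; p *= x; }
      for (j = 0, p = 1; j < 3; j++) { bx += b[j] * p; p *= x; }
      for (j = 0, p = 1; j < 6; j++) { cx += c[j] * p; p *= x; }
      assert (ax * bx == cx);
    }
  assert (a[3] * b[2] == c[5]);		/* the evaluation at infinity */
}
#endif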
void
mpn_toom43_mul (mp_ptr pp,
		mp_srcptr ap, mp_size_t an,
		mp_srcptr bp, mp_size_t bn, mp_ptr scratch)
{
  mp_size_t n, s, t;
  enum toom6_flags flags;
  mp_limb_t cy;

#define a0  ap
#define a1  (ap + n)
#define a2  (ap + 2 * n)
#define a3  (ap + 3 * n)
#define b0  bp
#define b1  (bp + n)
#define b2  (bp + 2 * n)
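
  /* Split ap into four pieces and bp into three, each of n limbs, with
     top pieces of s resp. t limbs; n = max (ceil (an/4), ceil (bn/3)).  */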
  n = 1 + (3 * an >= 4 * bn ? (an - 1) >> 2 : (bn - 1) / (size_t) 3);

  s = an - 3 * n;
  t = bn - 2 * n;

  ASSERT (0 < s && s <= n);
  ASSERT (0 < t && t <= n);

  /* This is true whenever an >= 25 or bn >= 19, I think.  It
     guarantees that we can fit 5 values of size n+1 in the product
     area.  */
  ASSERT (s + t >= 5);

#define v0    pp				/* 2n */
#define vm1   (scratch)				/* 2n+1 */
#define v1    (pp + 2 * n)			/* 2n+1 */
#define vm2   (scratch + 2 * n + 1)		/* 2n+1 */
#define v2    (scratch + 4 * n + 2)		/* 2n+1 */
#define vinf  (pp + 5 * n)			/* s+t */
#define bs1   pp				/* n+1 */
#define bsm1  (scratch + 2 * n + 2)		/* n+1 */
#define asm1  (scratch + 3 * n + 3)		/* n+1 */
#define asm2  (scratch + 4 * n + 4)		/* n+1 */
#define bsm2  (pp + n + 1)			/* n+1 */
#define bs2   (pp + 2 * n + 2)			/* n+1 */
#define as2   (pp + 3 * n + 3)			/* n+1 */
#define as1   (pp + 4 * n + 4)			/* n+1 */

  /* Total scratch need is 6 * n + 3 + 1; we allocate one extra
     limb, because products will overwrite 2n+2 limbs.  */

#define a0a2  scratch
#define b0b2  scratch
#define a1a3  asm1
#define b1d   bsm1
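
  /* The evaluation temporaries alias storage that is dead at the time
     they are used: a0a2 and b0b2 occupy the low scratch limbs before
     vm1 is computed there, and a1a3 and b1d reuse the asm1 and bsm1
     areas before those evaluations are formed.  */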

  /* Compute as2 and asm2.  */
  flags = toom6_vm2_neg & mpn_toom_eval_dgr3_pm2 (as2, asm2, ap, n, s, a1a3);

  /* Compute bs2 and bsm2.  */
  b1d[n] = mpn_lshift (b1d, b1, n, 1);			/*       2b1 */
  cy  = mpn_lshift (b0b2, b2, t, 2);			/*       4b2 */
  cy += mpn_add_n (b0b2, b0b2, b0, t);			/* 4b2 + b0  */
  if (t != n)
    cy = mpn_add_1 (b0b2 + t, b0 + t, n - t, cy);
  b0b2[n] = cy;
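
  /* bs2 = (4b2 + b0) + 2b1, bsm2 = |(4b2 + b0) - 2b1|; the
     toom6_vm2_neg bit in flags tracks the sign of A(-2)*B(-2).  */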
#if HAVE_NATIVE_mpn_add_n_sub_n
  if (mpn_cmp (b0b2, b1d, n+1) < 0)
    {
      mpn_add_n_sub_n (bs2, bsm2, b1d, b0b2, n+1);
      flags ^= toom6_vm2_neg;
    }
  else
    {
      mpn_add_n_sub_n (bs2, bsm2, b0b2, b1d, n+1);
    }
#else
  mpn_add_n (bs2, b0b2, b1d, n+1);
  if (mpn_cmp (b0b2, b1d, n+1) < 0)
    {
      mpn_sub_n (bsm2, b1d, b0b2, n+1);
      flags ^= toom6_vm2_neg;
    }
  else
    {
      mpn_sub_n (bsm2, b0b2, b1d, n+1);
    }
#endif

  /* Compute as1 and asm1.  */
  flags ^= toom6_vm1_neg & mpn_toom_eval_dgr3_pm1 (as1, asm1, ap, n, s, a0a2);

  /* Compute bs1 and bsm1.  */
  bsm1[n] = mpn_add (bsm1, b0, n, b2, t);
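  /* bs1 = (b0 + b2) + b1, bsm1 = |(b0 + b2) - b1|; the toom6_vm1_neg
     bit in flags tracks the sign of A(-1)*B(-1).  */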
#if HAVE_NATIVE_mpn_add_n_sub_n
  if (bsm1[n] == 0 && mpn_cmp (bsm1, b1, n) < 0)
    {
      cy = mpn_add_n_sub_n (bs1, bsm1, b1, bsm1, n);
      bs1[n] = cy >> 1;
      flags ^= toom6_vm1_neg;
    }
  else
    {
      cy = mpn_add_n_sub_n (bs1, bsm1, bsm1, b1, n);
      bs1[n] = bsm1[n] + (cy >> 1);
      bsm1[n] -= cy & 1;
    }
#else
  bs1[n] = bsm1[n] + mpn_add_n (bs1, bsm1, b1, n);
  if (bsm1[n] == 0 && mpn_cmp (bsm1, b1, n) < 0)
    {
      mpn_sub_n (bsm1, b1, bsm1, n);
      flags ^= toom6_vm1_neg;
    }
  else
    {
      bsm1[n] -= mpn_sub_n (bsm1, bsm1, b1, n);
    }
#endif

  ASSERT (as1[n] <= 3);
  ASSERT (bs1[n] <= 2);
  ASSERT (asm1[n] <= 1);
  ASSERT (bsm1[n] <= 1);
  ASSERT (as2[n] <= 14);
  ASSERT (bs2[n] <= 6);
  ASSERT (asm2[n] <= 9);
  ASSERT (bsm2[n] <= 4);

  /* vm1, 2n+1 limbs */
  mpn_mul_n (vm1, asm1, bsm1, n+1);  /* W4 */

  /* vm2, 2n+1 limbs */
  mpn_mul_n (vm2, asm2, bsm2, n+1);  /* W2 */

  /* v2, 2n+1 limbs */
  mpn_mul_n (v2, as2, bs2, n+1);  /* W1 */

  /* v1, 2n+1 limbs */
  mpn_mul_n (v1, as1, bs1, n+1);  /* W3 */

  /* vinf, s+t limbs */  /* W0 */
  if (s > t) mpn_mul (vinf, a3, s, b2, t);
  else       mpn_mul (vinf, b2, t, a3, s);

  /* v0, 2n limbs */
  mpn_mul_n (v0, ap, bp, n);  /* W5 */

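  /* Interpolate: combine the six point values into the coefficients
     W5..W0 of the product.  v0, v1 and vinf already sit in their final
     positions in pp; vm1, vm2 and v2 are passed from scratch, and s+t
     is the length of vinf.  */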
  mpn_toom_interpolate_6pts (pp, n, flags, vm1, vm2, v2, t + s);

#undef v0
#undef vm1
#undef v1
#undef vm2
#undef v2
#undef vinf
#undef bs1
#undef bs2
#undef bsm1
#undef bsm2
#undef asm1
#undef asm2
/* #undef as1 */
/* #undef as2 */
#undef a0a2
#undef b0b2
#undef a1a3
#undef b1d
#undef a0
#undef a1
#undef a2
#undef a3
#undef b0
#undef b1
#undef b2
}
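
/* Usage sketch (not part of the GMP sources): this routine is normally
   reached through mpn_mul's algorithm selection, but it can be called
   directly.  A minimal example, assuming mpn_toom43_mul_itch() from
   gmp-impl.h sizes the scratch block; the demo function name and the
   operand sizes are illustrative, chosen so that bn < an < 2 bn.  */
#if 0
static void
toom43_usage_demo (void)
{
  mp_size_t an = 40, bn = 30;		/* an is about 4/3 of bn */
  mp_limb_t ap[40], bp[30], pp[40 + 30];
  mp_ptr scratch;
  TMP_DECL;

  TMP_MARK;
  scratch = TMP_ALLOC_LIMBS (mpn_toom43_mul_itch (an, bn));

  mpn_random (ap, an);
  mpn_random (bp, bn);
  mpn_toom43_mul (pp, ap, an, bp, bn, scratch);	/* pp gets an+bn limbs */

  TMP_FREE;
}
#endif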