1 /* mpn_toom43_mul -- Multiply {ap,an} and {bp,bn} where an is nominally 4/3
2 times as large as bn. Or more accurately, bn < an < 2 bn.
3
4 Contributed to the GNU project by Marco Bodrato.
5
6 The idea of applying toom to unbalanced multiplication is due to Marco
7 Bodrato and Alberto Zanoni.
8
9 THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY
10 SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
11 GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
12
13 Copyright 2009 Free Software Foundation, Inc.
14
15 This file is part of the GNU MP Library.
16
17 The GNU MP Library is free software; you can redistribute it and/or modify
18 it under the terms of either:
19
20 * the GNU Lesser General Public License as published by the Free
21 Software Foundation; either version 3 of the License, or (at your
22 option) any later version.
23
24 or
25
26 * the GNU General Public License as published by the Free Software
27 Foundation; either version 2 of the License, or (at your option) any
28 later version.
29
30 or both in parallel, as here.
31
32 The GNU MP Library is distributed in the hope that it will be useful, but
33 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
34 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
35 for more details.
36
37 You should have received copies of the GNU General Public License and the
38 GNU Lesser General Public License along with the GNU MP Library. If not,
39 see https://www.gnu.org/licenses/. */
40
41
42 #include "gmp-impl.h"
43
44 /* Evaluate in: -2, -1, 0, +1, +2, +inf
45
46 <-s-><--n--><--n--><--n-->
47 ___ ______ ______ ______
48 |a3_|___a2_|___a1_|___a0_|
49 |_b2_|___b1_|___b0_|
50 <-t--><--n--><--n-->
51
52 v0 = a0 * b0 # A(0)*B(0)
53 v1 = (a0+ a1+ a2+ a3)*(b0+ b1+ b2) # A(1)*B(1) ah <= 3 bh <= 2
54 vm1 = (a0- a1+ a2- a3)*(b0- b1+ b2) # A(-1)*B(-1) |ah| <= 1 |bh|<= 1
55 v2 = (a0+2a1+4a2+8a3)*(b0+2b1+4b2) # A(2)*B(2) ah <= 14 bh <= 6
56 vm2 = (a0-2a1+4a2-8a3)*(b0-2b1+4b2) # A(-2)*B(-2) |ah| <= 9 |bh|<= 4
57 vinf= a3 * b2 # A(inf)*B(inf)
58 */
59
60 void
mpn_toom43_mul(mp_ptr pp,mp_srcptr ap,mp_size_t an,mp_srcptr bp,mp_size_t bn,mp_ptr scratch)61 mpn_toom43_mul (mp_ptr pp,
62 mp_srcptr ap, mp_size_t an,
63 mp_srcptr bp, mp_size_t bn, mp_ptr scratch)
64 {
65 mp_size_t n, s, t;
66 enum toom6_flags flags;
67 mp_limb_t cy;
68
69 #define a0 ap
70 #define a1 (ap + n)
71 #define a2 (ap + 2 * n)
72 #define a3 (ap + 3 * n)
73 #define b0 bp
74 #define b1 (bp + n)
75 #define b2 (bp + 2 * n)
76
77 n = 1 + (3 * an >= 4 * bn ? (an - 1) >> 2 : (bn - 1) / (size_t) 3);
78
79 s = an - 3 * n;
80 t = bn - 2 * n;
81
82 ASSERT (0 < s && s <= n);
83 ASSERT (0 < t && t <= n);
84
85 /* This is true whenever an >= 25 or bn >= 19, I think. It
86 guarantees that we can fit 5 values of size n+1 in the product
87 area. */
88 ASSERT (s+t >= 5);
89
90 #define v0 pp /* 2n */
91 #define vm1 (scratch) /* 2n+1 */
92 #define v1 (pp + 2*n) /* 2n+1 */
93 #define vm2 (scratch + 2 * n + 1) /* 2n+1 */
94 #define v2 (scratch + 4 * n + 2) /* 2n+1 */
95 #define vinf (pp + 5 * n) /* s+t */
96 #define bs1 pp /* n+1 */
97 #define bsm1 (scratch + 2 * n + 2) /* n+1 */
98 #define asm1 (scratch + 3 * n + 3) /* n+1 */
99 #define asm2 (scratch + 4 * n + 4) /* n+1 */
100 #define bsm2 (pp + n + 1) /* n+1 */
101 #define bs2 (pp + 2 * n + 2) /* n+1 */
102 #define as2 (pp + 3 * n + 3) /* n+1 */
103 #define as1 (pp + 4 * n + 4) /* n+1 */
104
105 /* Total sccratch need is 6 * n + 3 + 1; we allocate one extra
106 limb, because products will overwrite 2n+2 limbs. */
107
108 #define a0a2 scratch
109 #define b0b2 scratch
110 #define a1a3 asm1
111 #define b1d bsm1
112
113 /* Compute as2 and asm2. */
114 flags = (enum toom6_flags) (toom6_vm2_neg & mpn_toom_eval_dgr3_pm2 (as2, asm2, ap, n, s, a1a3));
115
116 /* Compute bs2 and bsm2. */
117 b1d[n] = mpn_lshift (b1d, b1, n, 1); /* 2b1 */
118 cy = mpn_lshift (b0b2, b2, t, 2); /* 4b2 */
119 cy += mpn_add_n (b0b2, b0b2, b0, t); /* 4b2 + b0 */
120 if (t != n)
121 cy = mpn_add_1 (b0b2 + t, b0 + t, n - t, cy);
122 b0b2[n] = cy;
123
124 #if HAVE_NATIVE_mpn_add_n_sub_n
125 if (mpn_cmp (b0b2, b1d, n+1) < 0)
126 {
127 mpn_add_n_sub_n (bs2, bsm2, b1d, b0b2, n+1);
128 flags = (enum toom6_flags) (flags ^ toom6_vm2_neg);
129 }
130 else
131 {
132 mpn_add_n_sub_n (bs2, bsm2, b0b2, b1d, n+1);
133 }
134 #else
135 mpn_add_n (bs2, b0b2, b1d, n+1);
136 if (mpn_cmp (b0b2, b1d, n+1) < 0)
137 {
138 mpn_sub_n (bsm2, b1d, b0b2, n+1);
139 flags = (enum toom6_flags) (flags ^ toom6_vm2_neg);
140 }
141 else
142 {
143 mpn_sub_n (bsm2, b0b2, b1d, n+1);
144 }
145 #endif
146
147 /* Compute as1 and asm1. */
148 flags = (enum toom6_flags) (flags ^ (toom6_vm1_neg & mpn_toom_eval_dgr3_pm1 (as1, asm1, ap, n, s, a0a2)));
149
150 /* Compute bs1 and bsm1. */
151 bsm1[n] = mpn_add (bsm1, b0, n, b2, t);
152 #if HAVE_NATIVE_mpn_add_n_sub_n
153 if (bsm1[n] == 0 && mpn_cmp (bsm1, b1, n) < 0)
154 {
155 cy = mpn_add_n_sub_n (bs1, bsm1, b1, bsm1, n);
156 bs1[n] = cy >> 1;
157 flags = (enum toom6_flags) (flags ^ toom6_vm1_neg);
158 }
159 else
160 {
161 cy = mpn_add_n_sub_n (bs1, bsm1, bsm1, b1, n);
162 bs1[n] = bsm1[n] + (cy >> 1);
163 bsm1[n]-= cy & 1;
164 }
165 #else
166 bs1[n] = bsm1[n] + mpn_add_n (bs1, bsm1, b1, n);
167 if (bsm1[n] == 0 && mpn_cmp (bsm1, b1, n) < 0)
168 {
169 mpn_sub_n (bsm1, b1, bsm1, n);
170 flags = (enum toom6_flags) (flags ^ toom6_vm1_neg);
171 }
172 else
173 {
174 bsm1[n] -= mpn_sub_n (bsm1, bsm1, b1, n);
175 }
176 #endif
177
178 ASSERT (as1[n] <= 3);
179 ASSERT (bs1[n] <= 2);
180 ASSERT (asm1[n] <= 1);
181 ASSERT (bsm1[n] <= 1);
182 ASSERT (as2[n] <=14);
183 ASSERT (bs2[n] <= 6);
184 ASSERT (asm2[n] <= 9);
185 ASSERT (bsm2[n] <= 4);
186
187 /* vm1, 2n+1 limbs */
188 mpn_mul_n (vm1, asm1, bsm1, n+1); /* W4 */
189
190 /* vm2, 2n+1 limbs */
191 mpn_mul_n (vm2, asm2, bsm2, n+1); /* W2 */
192
193 /* v2, 2n+1 limbs */
194 mpn_mul_n (v2, as2, bs2, n+1); /* W1 */
195
196 /* v1, 2n+1 limbs */
197 mpn_mul_n (v1, as1, bs1, n+1); /* W3 */
198
199 /* vinf, s+t limbs */ /* W0 */
200 if (s > t) mpn_mul (vinf, a3, s, b2, t);
201 else mpn_mul (vinf, b2, t, a3, s);
202
203 /* v0, 2n limbs */
204 mpn_mul_n (v0, ap, bp, n); /* W5 */
205
206 mpn_toom_interpolate_6pts (pp, n, flags, vm1, vm2, v2, t + s);
207
208 #undef v0
209 #undef vm1
210 #undef v1
211 #undef vm2
212 #undef v2
213 #undef vinf
214 #undef bs1
215 #undef bs2
216 #undef bsm1
217 #undef bsm2
218 #undef asm1
219 #undef asm2
220 /* #undef as1 */
221 /* #undef as2 */
222 #undef a0a2
223 #undef b0b2
224 #undef a1a3
225 #undef b1d
226 #undef a0
227 #undef a1
228 #undef a2
229 #undef a3
230 #undef b0
231 #undef b1
232 #undef b2
233 }
234