1 /* matrix22_mul.c.
2
3 Contributed by Niels Möller and Marco Bodrato.
4
5 THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY
6 SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
7 GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
8
9 Copyright 2003-2005, 2008, 2009 Free Software Foundation, Inc.
10
11 This file is part of the GNU MP Library.
12
13 The GNU MP Library is free software; you can redistribute it and/or modify
14 it under the terms of either:
15
16 * the GNU Lesser General Public License as published by the Free
17 Software Foundation; either version 3 of the License, or (at your
18 option) any later version.
19
20 or
21
22 * the GNU General Public License as published by the Free Software
23 Foundation; either version 2 of the License, or (at your option) any
24 later version.
25
26 or both in parallel, as here.
27
28 The GNU MP Library is distributed in the hope that it will be useful, but
29 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
30 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
31 for more details.
32
33 You should have received copies of the GNU General Public License and the
34 GNU Lesser General Public License along with the GNU MP Library. If not,
35 see https://www.gnu.org/licenses/. */
36
37 #include "gmp-impl.h"
38 #include "longlong.h"
39
40 #define MUL(rp, ap, an, bp, bn) do { \
41 if (an >= bn) \
42 mpn_mul (rp, ap, an, bp, bn); \
43 else \
44 mpn_mul (rp, bp, bn, ap, an); \
45 } while (0)
46
47 /* Inputs are unsigned. */
48 static int
abs_sub_n(mp_ptr rp,mp_srcptr ap,mp_srcptr bp,mp_size_t n)49 abs_sub_n (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t n)
50 {
51 int c;
52 MPN_CMP (c, ap, bp, n);
53 if (c >= 0)
54 {
55 mpn_sub_n (rp, ap, bp, n);
56 return 0;
57 }
58 else
59 {
60 mpn_sub_n (rp, bp, ap, n);
61 return 1;
62 }
63 }
64
65 static int
add_signed_n(mp_ptr rp,mp_srcptr ap,int as,mp_srcptr bp,int bs,mp_size_t n)66 add_signed_n (mp_ptr rp,
67 mp_srcptr ap, int as, mp_srcptr bp, int bs, mp_size_t n)
68 {
69 if (as != bs)
70 return as ^ abs_sub_n (rp, ap, bp, n);
71 else
72 {
73 ASSERT_NOCARRY (mpn_add_n (rp, ap, bp, n));
74 return as;
75 }
76 }
77
78 mp_size_t
mpn_matrix22_mul_itch(mp_size_t rn,mp_size_t mn)79 mpn_matrix22_mul_itch (mp_size_t rn, mp_size_t mn)
80 {
81 if (BELOW_THRESHOLD (rn, MATRIX22_STRASSEN_THRESHOLD)
82 || BELOW_THRESHOLD (mn, MATRIX22_STRASSEN_THRESHOLD))
83 return 3*rn + 2*mn;
84 else
85 return 3*(rn + mn) + 5;
86 }
87
88 /* Algorithm:
89
90 / s0 \ / 1 0 0 0 \ / r0 \
91 | s1 | | 0 1 0 1 | | r1 |
92 | s2 | | 0 0 -1 1 | | r2 |
93 | s3 | = | 0 1 -1 1 | \ r3 /
94 | s4 | | -1 1 -1 1 |
95 | s5 | | 0 1 0 0 |
96 \ s6 / \ 0 0 1 0 /
97
98 / t0 \ / 1 0 0 0 \ / m0 \
99 | t1 | | 0 1 0 1 | | m1 |
100 | t2 | | 0 0 -1 1 | | m2 |
101 | t3 | = | 0 1 -1 1 | \ m3 /
102 | t4 | | -1 1 -1 1 |
103 | t5 | | 0 1 0 0 |
104 \ t6 / \ 0 0 1 0 /
105
106 Note: the two matrices above are the same, but s_i and t_i are used
107 in the same product, only for i<4, see "A Strassen-like Matrix
108 Multiplication suited for squaring and higher power computation" by
109 M. Bodrato, in Proceedings of ISSAC 2010.
110
111 / r0 \ / 1 0 0 0 0 1 0 \ / s0*t0 \
112 | r1 | = | 0 0 -1 1 -1 1 0 | | s1*t1 |
113 | r2 | | 0 1 0 -1 0 -1 -1 | | s2*t2 |
114 \ r3 / \ 0 1 1 -1 0 -1 0 / | s3*t3 |
115 | s4*t5 |
116 | s5*t6 |
117 \ s6*t4 /
118
119 The scheduling uses two temporaries U0 and U1 to store products, and
120 two, S0 and T0, to store combinations of entries of the two
121 operands.
122 */
123
124 /* Computes R = R * M. Elements are numbers R = (r0, r1; r2, r3).
125 *
126 * Resulting elements are of size up to rn + mn + 1.
127 *
128 * Temporary storage: 3 rn + 3 mn + 5. */
129 static void
mpn_matrix22_mul_strassen(mp_ptr r0,mp_ptr r1,mp_ptr r2,mp_ptr r3,mp_size_t rn,mp_srcptr m0,mp_srcptr m1,mp_srcptr m2,mp_srcptr m3,mp_size_t mn,mp_ptr tp)130 mpn_matrix22_mul_strassen (mp_ptr r0, mp_ptr r1, mp_ptr r2, mp_ptr r3, mp_size_t rn,
131 mp_srcptr m0, mp_srcptr m1, mp_srcptr m2, mp_srcptr m3, mp_size_t mn,
132 mp_ptr tp)
133 {
134 mp_ptr s0, t0, u0, u1;
135 int r1s, r3s, s0s, t0s, u1s;
136 s0 = tp; tp += rn + 1;
137 t0 = tp; tp += mn + 1;
138 u0 = tp; tp += rn + mn + 1;
139 u1 = tp; /* rn + mn + 2 */
140
141 MUL (u0, r1, rn, m2, mn); /* u5 = s5 * t6 */
142 r3s = abs_sub_n (r3, r3, r2, rn); /* r3 - r2 */
143 if (r3s)
144 {
145 r1s = abs_sub_n (r1, r1, r3, rn);
146 r1[rn] = 0;
147 }
148 else
149 {
150 r1[rn] = mpn_add_n (r1, r1, r3, rn);
151 r1s = 0; /* r1 - r2 + r3 */
152 }
153 if (r1s)
154 {
155 s0[rn] = mpn_add_n (s0, r1, r0, rn);
156 s0s = 0;
157 }
158 else if (r1[rn] != 0)
159 {
160 s0[rn] = r1[rn] - mpn_sub_n (s0, r1, r0, rn);
161 s0s = 1; /* s4 = -r0 + r1 - r2 + r3 */
162 /* Reverse sign! */
163 }
164 else
165 {
166 s0s = abs_sub_n (s0, r0, r1, rn);
167 s0[rn] = 0;
168 }
169 MUL (u1, r0, rn, m0, mn); /* u0 = s0 * t0 */
170 r0[rn+mn] = mpn_add_n (r0, u0, u1, rn + mn);
171 ASSERT (r0[rn+mn] < 2); /* u0 + u5 */
172
173 t0s = abs_sub_n (t0, m3, m2, mn);
174 u1s = r3s^t0s^1; /* Reverse sign! */
175 MUL (u1, r3, rn, t0, mn); /* u2 = s2 * t2 */
176 u1[rn+mn] = 0;
177 if (t0s)
178 {
179 t0s = abs_sub_n (t0, m1, t0, mn);
180 t0[mn] = 0;
181 }
182 else
183 {
184 t0[mn] = mpn_add_n (t0, t0, m1, mn);
185 }
186
187 /* FIXME: Could be simplified if we had space for rn + mn + 2 limbs
188 at r3. I'd expect that for matrices of random size, the high
189 words t0[mn] and r1[rn] are non-zero with a pretty small
190 probability. If that can be confirmed this should be done as an
191 unconditional rn x (mn+1) followed by an if (UNLIKELY (r1[rn]))
192 add_n. */
193 if (t0[mn] != 0)
194 {
195 MUL (r3, r1, rn, t0, mn + 1); /* u3 = s3 * t3 */
196 ASSERT (r1[rn] < 2);
197 if (r1[rn] != 0)
198 mpn_add_n (r3 + rn, r3 + rn, t0, mn + 1);
199 }
200 else
201 {
202 MUL (r3, r1, rn + 1, t0, mn);
203 }
204
205 ASSERT (r3[rn+mn] < 4);
206
207 u0[rn+mn] = 0;
208 if (r1s^t0s)
209 {
210 r3s = abs_sub_n (r3, u0, r3, rn + mn + 1);
211 }
212 else
213 {
214 ASSERT_NOCARRY (mpn_add_n (r3, r3, u0, rn + mn + 1));
215 r3s = 0; /* u3 + u5 */
216 }
217
218 if (t0s)
219 {
220 t0[mn] = mpn_add_n (t0, t0, m0, mn);
221 }
222 else if (t0[mn] != 0)
223 {
224 t0[mn] -= mpn_sub_n (t0, t0, m0, mn);
225 }
226 else
227 {
228 t0s = abs_sub_n (t0, t0, m0, mn);
229 }
230 MUL (u0, r2, rn, t0, mn + 1); /* u6 = s6 * t4 */
231 ASSERT (u0[rn+mn] < 2);
232 if (r1s)
233 {
234 ASSERT_NOCARRY (mpn_sub_n (r1, r2, r1, rn));
235 }
236 else
237 {
238 r1[rn] += mpn_add_n (r1, r1, r2, rn);
239 }
240 rn++;
241 t0s = add_signed_n (r2, r3, r3s, u0, t0s, rn + mn);
242 /* u3 + u5 + u6 */
243 ASSERT (r2[rn+mn-1] < 4);
244 r3s = add_signed_n (r3, r3, r3s, u1, u1s, rn + mn);
245 /* -u2 + u3 + u5 */
246 ASSERT (r3[rn+mn-1] < 3);
247 MUL (u0, s0, rn, m1, mn); /* u4 = s4 * t5 */
248 ASSERT (u0[rn+mn-1] < 2);
249 t0[mn] = mpn_add_n (t0, m3, m1, mn);
250 MUL (u1, r1, rn, t0, mn + 1); /* u1 = s1 * t1 */
251 mn += rn;
252 ASSERT (u1[mn-1] < 4);
253 ASSERT (u1[mn] == 0);
254 ASSERT_NOCARRY (add_signed_n (r1, r3, r3s, u0, s0s, mn));
255 /* -u2 + u3 - u4 + u5 */
256 ASSERT (r1[mn-1] < 2);
257 if (r3s)
258 {
259 ASSERT_NOCARRY (mpn_add_n (r3, u1, r3, mn));
260 }
261 else
262 {
263 ASSERT_NOCARRY (mpn_sub_n (r3, u1, r3, mn));
264 /* u1 + u2 - u3 - u5 */
265 }
266 ASSERT (r3[mn-1] < 2);
267 if (t0s)
268 {
269 ASSERT_NOCARRY (mpn_add_n (r2, u1, r2, mn));
270 }
271 else
272 {
273 ASSERT_NOCARRY (mpn_sub_n (r2, u1, r2, mn));
274 /* u1 - u3 - u5 - u6 */
275 }
276 ASSERT (r2[mn-1] < 2);
277 }
278
279 void
mpn_matrix22_mul(mp_ptr r0,mp_ptr r1,mp_ptr r2,mp_ptr r3,mp_size_t rn,mp_srcptr m0,mp_srcptr m1,mp_srcptr m2,mp_srcptr m3,mp_size_t mn,mp_ptr tp)280 mpn_matrix22_mul (mp_ptr r0, mp_ptr r1, mp_ptr r2, mp_ptr r3, mp_size_t rn,
281 mp_srcptr m0, mp_srcptr m1, mp_srcptr m2, mp_srcptr m3, mp_size_t mn,
282 mp_ptr tp)
283 {
284 if (BELOW_THRESHOLD (rn, MATRIX22_STRASSEN_THRESHOLD)
285 || BELOW_THRESHOLD (mn, MATRIX22_STRASSEN_THRESHOLD))
286 {
287 mp_ptr p0, p1;
288 unsigned i;
289
290 /* Temporary storage: 3 rn + 2 mn */
291 p0 = tp + rn;
292 p1 = p0 + rn + mn;
293
294 for (i = 0; i < 2; i++)
295 {
296 MPN_COPY (tp, r0, rn);
297
298 if (rn >= mn)
299 {
300 mpn_mul (p0, r0, rn, m0, mn);
301 mpn_mul (p1, r1, rn, m3, mn);
302 mpn_mul (r0, r1, rn, m2, mn);
303 mpn_mul (r1, tp, rn, m1, mn);
304 }
305 else
306 {
307 mpn_mul (p0, m0, mn, r0, rn);
308 mpn_mul (p1, m3, mn, r1, rn);
309 mpn_mul (r0, m2, mn, r1, rn);
310 mpn_mul (r1, m1, mn, tp, rn);
311 }
312 r0[rn+mn] = mpn_add_n (r0, r0, p0, rn + mn);
313 r1[rn+mn] = mpn_add_n (r1, r1, p1, rn + mn);
314
315 r0 = r2; r1 = r3;
316 }
317 }
318 else
319 mpn_matrix22_mul_strassen (r0, r1, r2, r3, rn,
320 m0, m1, m2, m3, mn, tp);
321 }
322