xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/generic/matrix22_mul.c (revision 72c7faa4dbb41dbb0238d6b4a109da0d4b236dd4)
1 /* matrix22_mul.c.
2 
3    Contributed by Niels Möller and Marco Bodrato.
4 
5    THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
6    SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
7    GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
8 
9 Copyright 2003-2005, 2008, 2009 Free Software Foundation, Inc.
10 
11 This file is part of the GNU MP Library.
12 
13 The GNU MP Library is free software; you can redistribute it and/or modify
14 it under the terms of either:
15 
16   * the GNU Lesser General Public License as published by the Free
17     Software Foundation; either version 3 of the License, or (at your
18     option) any later version.
19 
20 or
21 
22   * the GNU General Public License as published by the Free Software
23     Foundation; either version 2 of the License, or (at your option) any
24     later version.
25 
26 or both in parallel, as here.
27 
28 The GNU MP Library is distributed in the hope that it will be useful, but
29 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
30 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
31 for more details.
32 
33 You should have received copies of the GNU General Public License and the
34 GNU Lesser General Public License along with the GNU MP Library.  If not,
35 see https://www.gnu.org/licenses/.  */
36 
37 #include "gmp-impl.h"
38 #include "longlong.h"
39 
40 #define MUL(rp, ap, an, bp, bn) do {		\
41   if (an >= bn)					\
42     mpn_mul (rp, ap, an, bp, bn);		\
43   else						\
44     mpn_mul (rp, bp, bn, ap, an);		\
45 } while (0)
46 
47 /* Inputs are unsigned. */
48 static int
abs_sub_n(mp_ptr rp,mp_srcptr ap,mp_srcptr bp,mp_size_t n)49 abs_sub_n (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t n)
50 {
51   int c;
52   MPN_CMP (c, ap, bp, n);
53   if (c >= 0)
54     {
55       mpn_sub_n (rp, ap, bp, n);
56       return 0;
57     }
58   else
59     {
60       mpn_sub_n (rp, bp, ap, n);
61       return 1;
62     }
63 }
64 
65 static int
add_signed_n(mp_ptr rp,mp_srcptr ap,int as,mp_srcptr bp,int bs,mp_size_t n)66 add_signed_n (mp_ptr rp,
67 	      mp_srcptr ap, int as, mp_srcptr bp, int bs, mp_size_t n)
68 {
69   if (as != bs)
70     return as ^ abs_sub_n (rp, ap, bp, n);
71   else
72     {
73       ASSERT_NOCARRY (mpn_add_n (rp, ap, bp, n));
74       return as;
75     }
76 }
77 
78 mp_size_t
mpn_matrix22_mul_itch(mp_size_t rn,mp_size_t mn)79 mpn_matrix22_mul_itch (mp_size_t rn, mp_size_t mn)
80 {
81   if (BELOW_THRESHOLD (rn, MATRIX22_STRASSEN_THRESHOLD)
82       || BELOW_THRESHOLD (mn, MATRIX22_STRASSEN_THRESHOLD))
83     return 3*rn + 2*mn;
84   else
85     return 3*(rn + mn) + 5;
86 }
87 
88 /* Algorithm:
89 
90     / s0 \   /  1  0  0  0 \ / r0 \
91     | s1 |   |  0  1  0  1 | | r1 |
92     | s2 |   |  0  0 -1  1 | | r2 |
93     | s3 | = |  0  1 -1  1 | \ r3 /
94     | s4 |   | -1  1 -1  1 |
95     | s5 |   |  0  1  0  0 |
96     \ s6 /   \  0  0  1  0 /
97 
98     / t0 \   /  1  0  0  0 \ / m0 \
99     | t1 |   |  0  1  0  1 | | m1 |
100     | t2 |   |  0  0 -1  1 | | m2 |
101     | t3 | = |  0  1 -1  1 | \ m3 /
102     | t4 |   | -1  1 -1  1 |
103     | t5 |   |  0  1  0  0 |
104     \ t6 /   \  0  0  1  0 /
105 
106   Note: the two matrices above are the same, but s_i and t_i are used
107   in the same product, only for i<4, see "A Strassen-like Matrix
108   Multiplication suited for squaring and higher power computation" by
109   M. Bodrato, in Proceedings of ISSAC 2010.
110 
111     / r0 \   / 1 0  0  0  0  1  0 \ / s0*t0 \
112     | r1 | = | 0 0 -1  1 -1  1  0 | | s1*t1 |
113     | r2 |   | 0 1  0 -1  0 -1 -1 | | s2*t2 |
114     \ r3 /   \ 0 1  1 -1  0 -1  0 / | s3*t3 |
115 				    | s4*t5 |
116 				    | s5*t6 |
117 				    \ s6*t4 /
118 
119   The scheduling uses two temporaries U0 and U1 to store products, and
120   two, S0 and T0, to store combinations of entries of the two
121   operands.
122 */
123 
124 /* Computes R = R * M. Elements are numbers R = (r0, r1; r2, r3).
125  *
126  * Resulting elements are of size up to rn + mn + 1.
127  *
128  * Temporary storage: 3 rn + 3 mn + 5. */
129 static void
mpn_matrix22_mul_strassen(mp_ptr r0,mp_ptr r1,mp_ptr r2,mp_ptr r3,mp_size_t rn,mp_srcptr m0,mp_srcptr m1,mp_srcptr m2,mp_srcptr m3,mp_size_t mn,mp_ptr tp)130 mpn_matrix22_mul_strassen (mp_ptr r0, mp_ptr r1, mp_ptr r2, mp_ptr r3, mp_size_t rn,
131 			   mp_srcptr m0, mp_srcptr m1, mp_srcptr m2, mp_srcptr m3, mp_size_t mn,
132 			   mp_ptr tp)
133 {
134   mp_ptr s0, t0, u0, u1;
135   int r1s, r3s, s0s, t0s, u1s;
136   s0 = tp; tp += rn + 1;
137   t0 = tp; tp += mn + 1;
138   u0 = tp; tp += rn + mn + 1;
139   u1 = tp; /* rn + mn + 2 */
140 
141   MUL (u0, r1, rn, m2, mn);		/* u5 = s5 * t6 */
142   r3s = abs_sub_n (r3, r3, r2, rn);	/* r3 - r2 */
143   if (r3s)
144     {
145       r1s = abs_sub_n (r1, r1, r3, rn);
146       r1[rn] = 0;
147     }
148   else
149     {
150       r1[rn] = mpn_add_n (r1, r1, r3, rn);
151       r1s = 0;				/* r1 - r2 + r3  */
152     }
153   if (r1s)
154     {
155       s0[rn] = mpn_add_n (s0, r1, r0, rn);
156       s0s = 0;
157     }
158   else if (r1[rn] != 0)
159     {
160       s0[rn] = r1[rn] - mpn_sub_n (s0, r1, r0, rn);
161       s0s = 1;				/* s4 = -r0 + r1 - r2 + r3 */
162 					/* Reverse sign! */
163     }
164   else
165     {
166       s0s = abs_sub_n (s0, r0, r1, rn);
167       s0[rn] = 0;
168     }
169   MUL (u1, r0, rn, m0, mn);		/* u0 = s0 * t0 */
170   r0[rn+mn] = mpn_add_n (r0, u0, u1, rn + mn);
171   ASSERT (r0[rn+mn] < 2);		/* u0 + u5 */
172 
173   t0s = abs_sub_n (t0, m3, m2, mn);
174   u1s = r3s^t0s^1;			/* Reverse sign! */
175   MUL (u1, r3, rn, t0, mn);		/* u2 = s2 * t2 */
176   u1[rn+mn] = 0;
177   if (t0s)
178     {
179       t0s = abs_sub_n (t0, m1, t0, mn);
180       t0[mn] = 0;
181     }
182   else
183     {
184       t0[mn] = mpn_add_n (t0, t0, m1, mn);
185     }
186 
187   /* FIXME: Could be simplified if we had space for rn + mn + 2 limbs
188      at r3. I'd expect that for matrices of random size, the high
189      words t0[mn] and r1[rn] are non-zero with a pretty small
190      probability. If that can be confirmed this should be done as an
191      unconditional rn x (mn+1) followed by an if (UNLIKELY (r1[rn]))
192      add_n. */
193   if (t0[mn] != 0)
194     {
195       MUL (r3, r1, rn, t0, mn + 1);	/* u3 = s3 * t3 */
196       ASSERT (r1[rn] < 2);
197       if (r1[rn] != 0)
198 	mpn_add_n (r3 + rn, r3 + rn, t0, mn + 1);
199     }
200   else
201     {
202       MUL (r3, r1, rn + 1, t0, mn);
203     }
204 
205   ASSERT (r3[rn+mn] < 4);
206 
207   u0[rn+mn] = 0;
208   if (r1s^t0s)
209     {
210       r3s = abs_sub_n (r3, u0, r3, rn + mn + 1);
211     }
212   else
213     {
214       ASSERT_NOCARRY (mpn_add_n (r3, r3, u0, rn + mn + 1));
215       r3s = 0;				/* u3 + u5 */
216     }
217 
218   if (t0s)
219     {
220       t0[mn] = mpn_add_n (t0, t0, m0, mn);
221     }
222   else if (t0[mn] != 0)
223     {
224       t0[mn] -= mpn_sub_n (t0, t0, m0, mn);
225     }
226   else
227     {
228       t0s = abs_sub_n (t0, t0, m0, mn);
229     }
230   MUL (u0, r2, rn, t0, mn + 1);		/* u6 = s6 * t4 */
231   ASSERT (u0[rn+mn] < 2);
232   if (r1s)
233     {
234       ASSERT_NOCARRY (mpn_sub_n (r1, r2, r1, rn));
235     }
236   else
237     {
238       r1[rn] += mpn_add_n (r1, r1, r2, rn);
239     }
240   rn++;
241   t0s = add_signed_n (r2, r3, r3s, u0, t0s, rn + mn);
242 					/* u3 + u5 + u6 */
243   ASSERT (r2[rn+mn-1] < 4);
244   r3s = add_signed_n (r3, r3, r3s, u1, u1s, rn + mn);
245 					/* -u2 + u3 + u5  */
246   ASSERT (r3[rn+mn-1] < 3);
247   MUL (u0, s0, rn, m1, mn);		/* u4 = s4 * t5 */
248   ASSERT (u0[rn+mn-1] < 2);
249   t0[mn] = mpn_add_n (t0, m3, m1, mn);
250   MUL (u1, r1, rn, t0, mn + 1);		/* u1 = s1 * t1 */
251   mn += rn;
252   ASSERT (u1[mn-1] < 4);
253   ASSERT (u1[mn] == 0);
254   ASSERT_NOCARRY (add_signed_n (r1, r3, r3s, u0, s0s, mn));
255 					/* -u2 + u3 - u4 + u5  */
256   ASSERT (r1[mn-1] < 2);
257   if (r3s)
258     {
259       ASSERT_NOCARRY (mpn_add_n (r3, u1, r3, mn));
260     }
261   else
262     {
263       ASSERT_NOCARRY (mpn_sub_n (r3, u1, r3, mn));
264 					/* u1 + u2 - u3 - u5  */
265     }
266   ASSERT (r3[mn-1] < 2);
267   if (t0s)
268     {
269       ASSERT_NOCARRY (mpn_add_n (r2, u1, r2, mn));
270     }
271   else
272     {
273       ASSERT_NOCARRY (mpn_sub_n (r2, u1, r2, mn));
274 					/* u1 - u3 - u5 - u6  */
275     }
276   ASSERT (r2[mn-1] < 2);
277 }
278 
279 void
mpn_matrix22_mul(mp_ptr r0,mp_ptr r1,mp_ptr r2,mp_ptr r3,mp_size_t rn,mp_srcptr m0,mp_srcptr m1,mp_srcptr m2,mp_srcptr m3,mp_size_t mn,mp_ptr tp)280 mpn_matrix22_mul (mp_ptr r0, mp_ptr r1, mp_ptr r2, mp_ptr r3, mp_size_t rn,
281 		  mp_srcptr m0, mp_srcptr m1, mp_srcptr m2, mp_srcptr m3, mp_size_t mn,
282 		  mp_ptr tp)
283 {
284   if (BELOW_THRESHOLD (rn, MATRIX22_STRASSEN_THRESHOLD)
285       || BELOW_THRESHOLD (mn, MATRIX22_STRASSEN_THRESHOLD))
286     {
287       mp_ptr p0, p1;
288       unsigned i;
289 
290       /* Temporary storage: 3 rn + 2 mn */
291       p0 = tp + rn;
292       p1 = p0 + rn + mn;
293 
294       for (i = 0; i < 2; i++)
295 	{
296 	  MPN_COPY (tp, r0, rn);
297 
298 	  if (rn >= mn)
299 	    {
300 	      mpn_mul (p0, r0, rn, m0, mn);
301 	      mpn_mul (p1, r1, rn, m3, mn);
302 	      mpn_mul (r0, r1, rn, m2, mn);
303 	      mpn_mul (r1, tp, rn, m1, mn);
304 	    }
305 	  else
306 	    {
307 	      mpn_mul (p0, m0, mn, r0, rn);
308 	      mpn_mul (p1, m3, mn, r1, rn);
309 	      mpn_mul (r0, m2, mn, r1, rn);
310 	      mpn_mul (r1, m1, mn, tp, rn);
311 	    }
312 	  r0[rn+mn] = mpn_add_n (r0, r0, p0, rn + mn);
313 	  r1[rn+mn] = mpn_add_n (r1, r1, p1, rn + mn);
314 
315 	  r0 = r2; r1 = r3;
316 	}
317     }
318   else
319     mpn_matrix22_mul_strassen (r0, r1, r2, r3, rn,
320 			       m0, m1, m2, m3, mn, tp);
321 }
322