1 /* matrix22_mul.c. 2 3 Contributed by Niels Möller and Marco Bodrato. 4 5 THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY 6 SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST 7 GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. 8 9 Copyright 2003-2005, 2008, 2009 Free Software Foundation, Inc. 10 11 This file is part of the GNU MP Library. 12 13 The GNU MP Library is free software; you can redistribute it and/or modify 14 it under the terms of either: 15 16 * the GNU Lesser General Public License as published by the Free 17 Software Foundation; either version 3 of the License, or (at your 18 option) any later version. 19 20 or 21 22 * the GNU General Public License as published by the Free Software 23 Foundation; either version 2 of the License, or (at your option) any 24 later version. 25 26 or both in parallel, as here. 27 28 The GNU MP Library is distributed in the hope that it will be useful, but 29 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 30 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 31 for more details. 32 33 You should have received copies of the GNU General Public License and the 34 GNU Lesser General Public License along with the GNU MP Library. If not, 35 see https://www.gnu.org/licenses/. */ 36 37 #include "gmp-impl.h" 38 #include "longlong.h" 39 40 #define MUL(rp, ap, an, bp, bn) do { \ 41 if (an >= bn) \ 42 mpn_mul (rp, ap, an, bp, bn); \ 43 else \ 44 mpn_mul (rp, bp, bn, ap, an); \ 45 } while (0) 46 47 /* Inputs are unsigned. */ 48 static int 49 abs_sub_n (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t n) 50 { 51 int c; 52 MPN_CMP (c, ap, bp, n); 53 if (c >= 0) 54 { 55 mpn_sub_n (rp, ap, bp, n); 56 return 0; 57 } 58 else 59 { 60 mpn_sub_n (rp, bp, ap, n); 61 return 1; 62 } 63 } 64 65 static int 66 add_signed_n (mp_ptr rp, 67 mp_srcptr ap, int as, mp_srcptr bp, int bs, mp_size_t n) 68 { 69 if (as != bs) 70 return as ^ abs_sub_n (rp, ap, bp, n); 71 else 72 { 73 ASSERT_NOCARRY (mpn_add_n (rp, ap, bp, n)); 74 return as; 75 } 76 } 77 78 mp_size_t 79 mpn_matrix22_mul_itch (mp_size_t rn, mp_size_t mn) 80 { 81 if (BELOW_THRESHOLD (rn, MATRIX22_STRASSEN_THRESHOLD) 82 || BELOW_THRESHOLD (mn, MATRIX22_STRASSEN_THRESHOLD)) 83 return 3*rn + 2*mn; 84 else 85 return 3*(rn + mn) + 5; 86 } 87 88 /* Algorithm: 89 90 / s0 \ / 1 0 0 0 \ / r0 \ 91 | s1 | | 0 1 0 1 | | r1 | 92 | s2 | | 0 0 -1 1 | | r2 | 93 | s3 | = | 0 1 -1 1 | \ r3 / 94 | s4 | | -1 1 -1 1 | 95 | s5 | | 0 1 0 0 | 96 \ s6 / \ 0 0 1 0 / 97 98 / t0 \ / 1 0 0 0 \ / m0 \ 99 | t1 | | 0 1 0 1 | | m1 | 100 | t2 | | 0 0 -1 1 | | m2 | 101 | t3 | = | 0 1 -1 1 | \ m3 / 102 | t4 | | -1 1 -1 1 | 103 | t5 | | 0 1 0 0 | 104 \ t6 / \ 0 0 1 0 / 105 106 Note: the two matrices above are the same, but s_i and t_i are used 107 in the same product, only for i<4, see "A Strassen-like Matrix 108 Multiplication suited for squaring and higher power computation" by 109 M. Bodrato, in Proceedings of ISSAC 2010. 110 111 / r0 \ / 1 0 0 0 0 1 0 \ / s0*t0 \ 112 | r1 | = | 0 0 -1 1 -1 1 0 | | s1*t1 | 113 | r2 | | 0 1 0 -1 0 -1 -1 | | s2*t2 | 114 \ r3 / \ 0 1 1 -1 0 -1 0 / | s3*t3 | 115 | s4*t5 | 116 | s5*t6 | 117 \ s6*t4 / 118 119 The scheduling uses two temporaries U0 and U1 to store products, and 120 two, S0 and T0, to store combinations of entries of the two 121 operands. 122 */ 123 124 /* Computes R = R * M. Elements are numbers R = (r0, r1; r2, r3). 125 * 126 * Resulting elements are of size up to rn + mn + 1. 127 * 128 * Temporary storage: 3 rn + 3 mn + 5. */ 129 static void 130 mpn_matrix22_mul_strassen (mp_ptr r0, mp_ptr r1, mp_ptr r2, mp_ptr r3, mp_size_t rn, 131 mp_srcptr m0, mp_srcptr m1, mp_srcptr m2, mp_srcptr m3, mp_size_t mn, 132 mp_ptr tp) 133 { 134 mp_ptr s0, t0, u0, u1; 135 int r1s, r3s, s0s, t0s, u1s; 136 s0 = tp; tp += rn + 1; 137 t0 = tp; tp += mn + 1; 138 u0 = tp; tp += rn + mn + 1; 139 u1 = tp; /* rn + mn + 2 */ 140 141 MUL (u0, r1, rn, m2, mn); /* u5 = s5 * t6 */ 142 r3s = abs_sub_n (r3, r3, r2, rn); /* r3 - r2 */ 143 if (r3s) 144 { 145 r1s = abs_sub_n (r1, r1, r3, rn); 146 r1[rn] = 0; 147 } 148 else 149 { 150 r1[rn] = mpn_add_n (r1, r1, r3, rn); 151 r1s = 0; /* r1 - r2 + r3 */ 152 } 153 if (r1s) 154 { 155 s0[rn] = mpn_add_n (s0, r1, r0, rn); 156 s0s = 0; 157 } 158 else if (r1[rn] != 0) 159 { 160 s0[rn] = r1[rn] - mpn_sub_n (s0, r1, r0, rn); 161 s0s = 1; /* s4 = -r0 + r1 - r2 + r3 */ 162 /* Reverse sign! */ 163 } 164 else 165 { 166 s0s = abs_sub_n (s0, r0, r1, rn); 167 s0[rn] = 0; 168 } 169 MUL (u1, r0, rn, m0, mn); /* u0 = s0 * t0 */ 170 r0[rn+mn] = mpn_add_n (r0, u0, u1, rn + mn); 171 ASSERT (r0[rn+mn] < 2); /* u0 + u5 */ 172 173 t0s = abs_sub_n (t0, m3, m2, mn); 174 u1s = r3s^t0s^1; /* Reverse sign! */ 175 MUL (u1, r3, rn, t0, mn); /* u2 = s2 * t2 */ 176 u1[rn+mn] = 0; 177 if (t0s) 178 { 179 t0s = abs_sub_n (t0, m1, t0, mn); 180 t0[mn] = 0; 181 } 182 else 183 { 184 t0[mn] = mpn_add_n (t0, t0, m1, mn); 185 } 186 187 /* FIXME: Could be simplified if we had space for rn + mn + 2 limbs 188 at r3. I'd expect that for matrices of random size, the high 189 words t0[mn] and r1[rn] are non-zero with a pretty small 190 probability. If that can be confirmed this should be done as an 191 unconditional rn x (mn+1) followed by an if (UNLIKELY (r1[rn])) 192 add_n. */ 193 if (t0[mn] != 0) 194 { 195 MUL (r3, r1, rn, t0, mn + 1); /* u3 = s3 * t3 */ 196 ASSERT (r1[rn] < 2); 197 if (r1[rn] != 0) 198 mpn_add_n (r3 + rn, r3 + rn, t0, mn + 1); 199 } 200 else 201 { 202 MUL (r3, r1, rn + 1, t0, mn); 203 } 204 205 ASSERT (r3[rn+mn] < 4); 206 207 u0[rn+mn] = 0; 208 if (r1s^t0s) 209 { 210 r3s = abs_sub_n (r3, u0, r3, rn + mn + 1); 211 } 212 else 213 { 214 ASSERT_NOCARRY (mpn_add_n (r3, r3, u0, rn + mn + 1)); 215 r3s = 0; /* u3 + u5 */ 216 } 217 218 if (t0s) 219 { 220 t0[mn] = mpn_add_n (t0, t0, m0, mn); 221 } 222 else if (t0[mn] != 0) 223 { 224 t0[mn] -= mpn_sub_n (t0, t0, m0, mn); 225 } 226 else 227 { 228 t0s = abs_sub_n (t0, t0, m0, mn); 229 } 230 MUL (u0, r2, rn, t0, mn + 1); /* u6 = s6 * t4 */ 231 ASSERT (u0[rn+mn] < 2); 232 if (r1s) 233 { 234 ASSERT_NOCARRY (mpn_sub_n (r1, r2, r1, rn)); 235 } 236 else 237 { 238 r1[rn] += mpn_add_n (r1, r1, r2, rn); 239 } 240 rn++; 241 t0s = add_signed_n (r2, r3, r3s, u0, t0s, rn + mn); 242 /* u3 + u5 + u6 */ 243 ASSERT (r2[rn+mn-1] < 4); 244 r3s = add_signed_n (r3, r3, r3s, u1, u1s, rn + mn); 245 /* -u2 + u3 + u5 */ 246 ASSERT (r3[rn+mn-1] < 3); 247 MUL (u0, s0, rn, m1, mn); /* u4 = s4 * t5 */ 248 ASSERT (u0[rn+mn-1] < 2); 249 t0[mn] = mpn_add_n (t0, m3, m1, mn); 250 MUL (u1, r1, rn, t0, mn + 1); /* u1 = s1 * t1 */ 251 mn += rn; 252 ASSERT (u1[mn-1] < 4); 253 ASSERT (u1[mn] == 0); 254 ASSERT_NOCARRY (add_signed_n (r1, r3, r3s, u0, s0s, mn)); 255 /* -u2 + u3 - u4 + u5 */ 256 ASSERT (r1[mn-1] < 2); 257 if (r3s) 258 { 259 ASSERT_NOCARRY (mpn_add_n (r3, u1, r3, mn)); 260 } 261 else 262 { 263 ASSERT_NOCARRY (mpn_sub_n (r3, u1, r3, mn)); 264 /* u1 + u2 - u3 - u5 */ 265 } 266 ASSERT (r3[mn-1] < 2); 267 if (t0s) 268 { 269 ASSERT_NOCARRY (mpn_add_n (r2, u1, r2, mn)); 270 } 271 else 272 { 273 ASSERT_NOCARRY (mpn_sub_n (r2, u1, r2, mn)); 274 /* u1 - u3 - u5 - u6 */ 275 } 276 ASSERT (r2[mn-1] < 2); 277 } 278 279 void 280 mpn_matrix22_mul (mp_ptr r0, mp_ptr r1, mp_ptr r2, mp_ptr r3, mp_size_t rn, 281 mp_srcptr m0, mp_srcptr m1, mp_srcptr m2, mp_srcptr m3, mp_size_t mn, 282 mp_ptr tp) 283 { 284 if (BELOW_THRESHOLD (rn, MATRIX22_STRASSEN_THRESHOLD) 285 || BELOW_THRESHOLD (mn, MATRIX22_STRASSEN_THRESHOLD)) 286 { 287 mp_ptr p0, p1; 288 unsigned i; 289 290 /* Temporary storage: 3 rn + 2 mn */ 291 p0 = tp + rn; 292 p1 = p0 + rn + mn; 293 294 for (i = 0; i < 2; i++) 295 { 296 MPN_COPY (tp, r0, rn); 297 298 if (rn >= mn) 299 { 300 mpn_mul (p0, r0, rn, m0, mn); 301 mpn_mul (p1, r1, rn, m3, mn); 302 mpn_mul (r0, r1, rn, m2, mn); 303 mpn_mul (r1, tp, rn, m1, mn); 304 } 305 else 306 { 307 mpn_mul (p0, m0, mn, r0, rn); 308 mpn_mul (p1, m3, mn, r1, rn); 309 mpn_mul (r0, m2, mn, r1, rn); 310 mpn_mul (r1, m1, mn, tp, rn); 311 } 312 r0[rn+mn] = mpn_add_n (r0, r0, p0, rn + mn); 313 r1[rn+mn] = mpn_add_n (r1, r1, p1, rn + mn); 314 315 r0 = r2; r1 = r3; 316 } 317 } 318 else 319 mpn_matrix22_mul_strassen (r0, r1, r2, r3, rn, 320 m0, m1, m2, m3, mn, tp); 321 } 322