/* mpn_toom33_mul -- Multiply {ap,an} and {bp,bn} where an and bn are close in
   size.  Or more accurately, bn <= an < (3/2)bn.

   Contributed to the GNU project by Torbjorn Granlund.
   Additional improvements by Marco Bodrato.

   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

Copyright 2006, 2007, 2008, 2010, 2012 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at your
option) any later version.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */


#include "gmp.h"
#include "gmp-impl.h"

/* Evaluate in: -1, 0, +1, +2, +inf

  <-s--><--n--><--n-->
   ____ ______ ______
  |_a2_|___a1_|___a0_|
   |b2_|___b1_|___b0_|
   <-t-><--n--><--n-->

  v0  =  a0         * b0          #   A(0)*B(0)
  v1  = (a0+ a1+ a2)*(b0+ b1+ b2) #   A(1)*B(1)      ah  <= 2  bh <= 2
  vm1 = (a0- a1+ a2)*(b0- b1+ b2) #  A(-1)*B(-1)    |ah| <= 1  bh <= 1
  v2  = (a0+2a1+4a2)*(b0+2b1+4b2) #   A(2)*B(2)      ah  <= 6  bh <= 6
  vinf=          a2 *         b2  # A(inf)*B(inf)
*/
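/* For reference: writing A(x) = a2*x^2 + a1*x + a0 and B(x) = b2*x^2 + b1*x + b0,
   the product C(x) = A(x)*B(x) = c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0 is pinned
   down by the five evaluations above:

     v0   = c0
     v1   = c4 + c3 + c2 + c1 + c0
     vm1  = c4 - c3 + c2 - c1 + c0
     v2   = 16*c4 + 8*c3 + 4*c2 + 2*c1 + c0
     vinf = c4

   mpn_toom_interpolate_5pts solves this system.  One possible back
   substitution (illustrative; not necessarily the exact sequence used there):

     c0 = v0,  c4 = vinf,
     t = (v2 - vm1)/3,  u = (v1 - vm1)/2,  w = v1 - v0,
     c3 = (t - w)/2 - 2*vinf,  c2 = w - u - vinf,  c1 = u - c3.  */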
#if TUNE_PROGRAM_BUILD || WANT_FAT_BINARY
#define MAYBE_mul_basecase 1
#define MAYBE_mul_toom33 1
#else
#define MAYBE_mul_basecase						\
  (MUL_TOOM33_THRESHOLD < 3 * MUL_TOOM22_THRESHOLD)
#define MAYBE_mul_toom33						\
  (MUL_TOOM44_THRESHOLD >= 3 * MUL_TOOM33_THRESHOLD)
#endif

/* FIXME: TOOM33_MUL_N_REC is not quite right for a balanced
   multiplication at the infinity point.  We may have
   MAYBE_mul_basecase == 0, and still get s just below
   MUL_TOOM22_THRESHOLD.  If MUL_TOOM33_THRESHOLD == 7, we can even get
   s == 1 and mpn_toom22_mul will crash.
*/

#define TOOM33_MUL_N_REC(p, a, b, n, ws)				\
  do {									\
    if (MAYBE_mul_basecase						\
	&& BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD))			\
      mpn_mul_basecase (p, a, n, b, n);					\
    else if (! MAYBE_mul_toom33						\
	     || BELOW_THRESHOLD (n, MUL_TOOM33_THRESHOLD))		\
      mpn_toom22_mul (p, a, n, b, n, ws);				\
    else								\
      mpn_toom33_mul (p, a, n, b, n, ws);				\
  } while (0)

void
mpn_toom33_mul (mp_ptr pp,
		mp_srcptr ap, mp_size_t an,
		mp_srcptr bp, mp_size_t bn,
		mp_ptr scratch)
{
  const int __gmpn_cpuvec_initialized = 1;
  mp_size_t n, s, t;
  int vm1_neg;
  mp_limb_t cy, vinf0;
  mp_ptr gp;
  mp_ptr as1, asm1, as2;
  mp_ptr bs1, bsm1, bs2;

#define a0  ap
#define a1  (ap + n)
#define a2  (ap + 2*n)
#define b0  bp
#define b1  (bp + n)
#define b2  (bp + 2*n)

  n = (an + 2) / (size_t) 3;

  s = an - 2 * n;
  t = bn - 2 * n;

  ASSERT (an >= bn);

  ASSERT (0 < s && s <= n);
  ASSERT (0 < t && t <= n);

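  /* The six evaluation operands are n+1 limbs each.  They are carved out of
     scratch and out of parts of the product area pp that are not yet needed:
     asm1, bsm1 and as1 live in scratch (at offsets 2n+2, 3n+3 and 4n+4),
     while bs1, as2 and bs2 live in pp (at offsets 0, n+1 and 2n+2).  gp is an
     n-limb temporary at the start of scratch, later reused for vm1.  */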
  as1  = scratch + 4 * n + 4;
  asm1 = scratch + 2 * n + 2;
  as2 = pp + n + 1;

  bs1 = pp;
  bsm1 = scratch + 3 * n + 3;	/* we need 4n+4 <= 4n+s+t */
  bs2 = pp + 2 * n + 2;

  gp = scratch;

  vm1_neg = 0;

  /* Compute as1 and asm1.  */
  cy = mpn_add (gp, a0, n, a2, s);
#if HAVE_NATIVE_mpn_add_n_sub_n
  if (cy == 0 && mpn_cmp (gp, a1, n) < 0)
    {
      cy = mpn_add_n_sub_n (as1, asm1, a1, gp, n);
      as1[n] = cy >> 1;
      asm1[n] = 0;
      vm1_neg = 1;
    }
  else
    {
      mp_limb_t cy2;
      cy2 = mpn_add_n_sub_n (as1, asm1, gp, a1, n);
      as1[n] = cy + (cy2 >> 1);
      asm1[n] = cy - (cy2 & 1);
    }
#else
  as1[n] = cy + mpn_add_n (as1, gp, a1, n);
  if (cy == 0 && mpn_cmp (gp, a1, n) < 0)
    {
      mpn_sub_n (asm1, a1, gp, n);
      asm1[n] = 0;
      vm1_neg = 1;
    }
  else
    {
      cy -= mpn_sub_n (asm1, gp, a1, n);
      asm1[n] = cy;
    }
#endif

  /* Compute as2.  */
#if HAVE_NATIVE_mpn_rsblsh1_n
  cy = mpn_add_n (as2, a2, as1, s);
  if (s != n)
    cy = mpn_add_1 (as2 + s, as1 + s, n - s, cy);
  cy += as1[n];
  cy = 2 * cy + mpn_rsblsh1_n (as2, a0, as2, n);
#else
#if HAVE_NATIVE_mpn_addlsh1_n
  cy = mpn_addlsh1_n (as2, a1, a2, s);
  if (s != n)
    cy = mpn_add_1 (as2 + s, a1 + s, n - s, cy);
  cy = 2 * cy + mpn_addlsh1_n (as2, a0, as2, n);
#else
  cy = mpn_add_n (as2, a2, as1, s);
  if (s != n)
    cy = mpn_add_1 (as2 + s, as1 + s, n - s, cy);
  cy += as1[n];
  cy = 2 * cy + mpn_lshift (as2, as2, n, 1);
  cy -= mpn_sub_n (as2, as2, a0, n);
#endif
#endif
  as2[n] = cy;

  /* Compute bs1 and bsm1.  */
  cy = mpn_add (gp, b0, n, b2, t);
#if HAVE_NATIVE_mpn_add_n_sub_n
  if (cy == 0 && mpn_cmp (gp, b1, n) < 0)
    {
      cy = mpn_add_n_sub_n (bs1, bsm1, b1, gp, n);
      bs1[n] = cy >> 1;
      bsm1[n] = 0;
      vm1_neg ^= 1;
    }
  else
    {
      mp_limb_t cy2;
      cy2 = mpn_add_n_sub_n (bs1, bsm1, gp, b1, n);
      bs1[n] = cy + (cy2 >> 1);
      bsm1[n] = cy - (cy2 & 1);
    }
#else
  bs1[n] = cy + mpn_add_n (bs1, gp, b1, n);
  if (cy == 0 && mpn_cmp (gp, b1, n) < 0)
    {
      mpn_sub_n (bsm1, b1, gp, n);
      bsm1[n] = 0;
      vm1_neg ^= 1;
    }
  else
    {
      cy -= mpn_sub_n (bsm1, gp, b1, n);
      bsm1[n] = cy;
    }
#endif

  /* Compute bs2.  */
#if HAVE_NATIVE_mpn_rsblsh1_n
  cy = mpn_add_n (bs2, b2, bs1, t);
  if (t != n)
    cy = mpn_add_1 (bs2 + t, bs1 + t, n - t, cy);
  cy += bs1[n];
  cy = 2 * cy + mpn_rsblsh1_n (bs2, b0, bs2, n);
#else
#if HAVE_NATIVE_mpn_addlsh1_n
  cy = mpn_addlsh1_n (bs2, b1, b2, t);
  if (t != n)
    cy = mpn_add_1 (bs2 + t, b1 + t, n - t, cy);
  cy = 2 * cy + mpn_addlsh1_n (bs2, b0, bs2, n);
#else
  cy = mpn_add_n (bs2, bs1, b2, t);
  if (t != n)
    cy = mpn_add_1 (bs2 + t, bs1 + t, n - t, cy);
  cy += bs1[n];
  cy = 2 * cy + mpn_lshift (bs2, bs2, n, 1);
  cy -= mpn_sub_n (bs2, bs2, b0, n);
#endif
#endif
  bs2[n] = cy;

  ASSERT (as1[n] <= 2);
  ASSERT (bs1[n] <= 2);
  ASSERT (asm1[n] <= 1);
  ASSERT (bsm1[n] <= 1);
  ASSERT (as2[n] <= 6);
  ASSERT (bs2[n] <= 6);

#define v0    pp				/* 2n */
#define v1    (pp + 2 * n)			/* 2n+1 */
#define vinf  (pp + 4 * n)			/* s+t */
#define vm1   scratch				/* 2n+1 */
#define v2    (scratch + 2 * n + 1)		/* 2n+2 */
#define scratch_out  (scratch + 5 * n + 5)

  /* vm1, 2n+1 limbs */
#ifdef SMALLER_RECURSION
  TOOM33_MUL_N_REC (vm1, asm1, bsm1, n, scratch_out);
  cy = 0;
  if (asm1[n] != 0)
    cy = bsm1[n] + mpn_add_n (vm1 + n, vm1 + n, bsm1, n);
  if (bsm1[n] != 0)
    cy += mpn_add_n (vm1 + n, vm1 + n, asm1, n);
  vm1[2 * n] = cy;
#else
  TOOM33_MUL_N_REC (vm1, asm1, bsm1, n + 1, scratch_out);
#endif

  TOOM33_MUL_N_REC (v2, as2, bs2, n + 1, scratch_out);	/* v2, 2n+1 limbs */

  /* vinf, s+t limbs */
  if (s > t)  mpn_mul (vinf, a2, s, b2, t);
  else        TOOM33_MUL_N_REC (vinf, a2, b2, s, scratch_out);

  vinf0 = vinf[0];				/* v1 overlaps with this */

#ifdef SMALLER_RECURSION
  /* v1, 2n+1 limbs */
  TOOM33_MUL_N_REC (v1, as1, bs1, n, scratch_out);
  if (as1[n] == 1)
    {
      cy = bs1[n] + mpn_add_n (v1 + n, v1 + n, bs1, n);
    }
  else if (as1[n] != 0)
    {
#if HAVE_NATIVE_mpn_addlsh1_n
      cy = 2 * bs1[n] + mpn_addlsh1_n (v1 + n, v1 + n, bs1, n);
#else
      cy = 2 * bs1[n] + mpn_addmul_1 (v1 + n, bs1, n, CNST_LIMB(2));
#endif
    }
  else
    cy = 0;
  if (bs1[n] == 1)
    {
      cy += mpn_add_n (v1 + n, v1 + n, as1, n);
    }
  else if (bs1[n] != 0)
    {
#if HAVE_NATIVE_mpn_addlsh1_n
      cy += mpn_addlsh1_n (v1 + n, v1 + n, as1, n);
#else
      cy += mpn_addmul_1 (v1 + n, as1, n, CNST_LIMB(2));
#endif
    }
  v1[2 * n] = cy;
#else
  cy = vinf[1];
  TOOM33_MUL_N_REC (v1, as1, bs1, n + 1, scratch_out);
  vinf[1] = cy;
#endif

  TOOM33_MUL_N_REC (v0, ap, bp, n, scratch_out);	/* v0, 2n limbs */

  mpn_toom_interpolate_5pts (pp, v2, vm1, n, s + t, vm1_neg, vinf0);
}
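
/* Usage sketch (illustrative only): this function is normally reached through
   mpn_mul_n / mpn_mul, which select Toom-3 once the operand sizes pass
   MUL_TOOM33_THRESHOLD and supply scratch space, roughly along these lines:

     mp_ptr scratch;
     TMP_DECL;
     TMP_MARK;
     scratch = TMP_ALLOC_LIMBS (mpn_toom33_mul_itch (an, bn));
     mpn_toom33_mul (pp, ap, an, bp, bn, scratch);
     TMP_FREE;

   where mpn_toom33_mul_itch (from gmp-impl.h) gives the required scratch size
   in limbs.  See mpn_mul_n and mpn_mul for the actual dispatch logic.  */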