1 /* mpn_toom33_mul -- Multiply {ap,an} and {p,bn} where an and bn are close in 2 size. Or more accurately, bn <= an < (3/2)bn. 3 4 Contributed to the GNU project by Torbjorn Granlund. 5 Additional improvements by Marco Bodrato. 6 7 THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY 8 SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST 9 GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. 10 11 Copyright 2006-2008, 2010, 2012, 2015 Free Software Foundation, Inc. 12 13 This file is part of the GNU MP Library. 14 15 The GNU MP Library is free software; you can redistribute it and/or modify 16 it under the terms of either: 17 18 * the GNU Lesser General Public License as published by the Free 19 Software Foundation; either version 3 of the License, or (at your 20 option) any later version. 21 22 or 23 24 * the GNU General Public License as published by the Free Software 25 Foundation; either version 2 of the License, or (at your option) any 26 later version. 27 28 or both in parallel, as here. 29 30 The GNU MP Library is distributed in the hope that it will be useful, but 31 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 32 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 33 for more details. 34 35 You should have received copies of the GNU General Public License and the 36 GNU Lesser General Public License along with the GNU MP Library. If not, 37 see https://www.gnu.org/licenses/. */ 38 39 40 #include "gmp-impl.h" 41 42 /* Evaluate in: -1, 0, +1, +2, +inf 43 44 <-s--><--n--><--n--> 45 ____ ______ ______ 46 |_a2_|___a1_|___a0_| 47 |b2_|___b1_|___b0_| 48 <-t-><--n--><--n--> 49 50 v0 = a0 * b0 # A(0)*B(0) 51 v1 = (a0+ a1+ a2)*(b0+ b1+ b2) # A(1)*B(1) ah <= 2 bh <= 2 52 vm1 = (a0- a1+ a2)*(b0- b1+ b2) # A(-1)*B(-1) |ah| <= 1 bh <= 1 53 v2 = (a0+2a1+4a2)*(b0+2b1+4b2) # A(2)*B(2) ah <= 6 bh <= 6 54 vinf= a2 * b2 # A(inf)*B(inf) 55 */ 56 57 #if TUNE_PROGRAM_BUILD || WANT_FAT_BINARY 58 #define MAYBE_mul_basecase 1 59 #define MAYBE_mul_toom33 1 60 #else 61 #define MAYBE_mul_basecase \ 62 (MUL_TOOM33_THRESHOLD < 3 * MUL_TOOM22_THRESHOLD) 63 #define MAYBE_mul_toom33 \ 64 (MUL_TOOM44_THRESHOLD >= 3 * MUL_TOOM33_THRESHOLD) 65 #endif 66 67 /* FIXME: TOOM33_MUL_N_REC is not quite right for a balanced 68 multiplication at the infinity point. We may have 69 MAYBE_mul_basecase == 0, and still get s just below 70 MUL_TOOM22_THRESHOLD. If MUL_TOOM33_THRESHOLD == 7, we can even get 71 s == 1 and mpn_toom22_mul will crash. 72 */ 73 74 #define TOOM33_MUL_N_REC(p, a, b, n, ws) \ 75 do { \ 76 if (MAYBE_mul_basecase \ 77 && BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD)) \ 78 mpn_mul_basecase (p, a, n, b, n); \ 79 else if (! MAYBE_mul_toom33 \ 80 || BELOW_THRESHOLD (n, MUL_TOOM33_THRESHOLD)) \ 81 mpn_toom22_mul (p, a, n, b, n, ws); \ 82 else \ 83 mpn_toom33_mul (p, a, n, b, n, ws); \ 84 } while (0) 85 86 void 87 mpn_toom33_mul (mp_ptr pp, 88 mp_srcptr ap, mp_size_t an, 89 mp_srcptr bp, mp_size_t bn, 90 mp_ptr scratch) 91 { 92 const int __gmpn_cpuvec_initialized = 1; 93 mp_size_t n, s, t; 94 int vm1_neg; 95 mp_limb_t cy, vinf0; 96 mp_ptr gp; 97 mp_ptr as1, asm1, as2; 98 mp_ptr bs1, bsm1, bs2; 99 100 #define a0 ap 101 #define a1 (ap + n) 102 #define a2 (ap + 2*n) 103 #define b0 bp 104 #define b1 (bp + n) 105 #define b2 (bp + 2*n) 106 107 n = (an + 2) / (size_t) 3; 108 109 s = an - 2 * n; 110 t = bn - 2 * n; 111 112 ASSERT (an >= bn); 113 114 ASSERT (0 < s && s <= n); 115 ASSERT (0 < t && t <= n); 116 117 as1 = scratch + 4 * n + 4; 118 asm1 = scratch + 2 * n + 2; 119 as2 = pp + n + 1; 120 121 bs1 = pp; 122 bsm1 = scratch + 3 * n + 3; /* we need 4n+4 <= 4n+s+t */ 123 bs2 = pp + 2 * n + 2; 124 125 gp = scratch; 126 127 vm1_neg = 0; 128 129 /* Compute as1 and asm1. */ 130 cy = mpn_add (gp, a0, n, a2, s); 131 #if HAVE_NATIVE_mpn_add_n_sub_n 132 if (cy == 0 && mpn_cmp (gp, a1, n) < 0) 133 { 134 cy = mpn_add_n_sub_n (as1, asm1, a1, gp, n); 135 as1[n] = cy >> 1; 136 asm1[n] = 0; 137 vm1_neg = 1; 138 } 139 else 140 { 141 mp_limb_t cy2; 142 cy2 = mpn_add_n_sub_n (as1, asm1, gp, a1, n); 143 as1[n] = cy + (cy2 >> 1); 144 asm1[n] = cy - (cy2 & 1); 145 } 146 #else 147 as1[n] = cy + mpn_add_n (as1, gp, a1, n); 148 if (cy == 0 && mpn_cmp (gp, a1, n) < 0) 149 { 150 mpn_sub_n (asm1, a1, gp, n); 151 asm1[n] = 0; 152 vm1_neg = 1; 153 } 154 else 155 { 156 cy -= mpn_sub_n (asm1, gp, a1, n); 157 asm1[n] = cy; 158 } 159 #endif 160 161 /* Compute as2. */ 162 #if HAVE_NATIVE_mpn_rsblsh1_n 163 cy = mpn_add_n (as2, a2, as1, s); 164 if (s != n) 165 cy = mpn_add_1 (as2 + s, as1 + s, n - s, cy); 166 cy += as1[n]; 167 cy = 2 * cy + mpn_rsblsh1_n (as2, a0, as2, n); 168 #else 169 #if HAVE_NATIVE_mpn_addlsh1_n 170 cy = mpn_addlsh1_n (as2, a1, a2, s); 171 if (s != n) 172 cy = mpn_add_1 (as2 + s, a1 + s, n - s, cy); 173 cy = 2 * cy + mpn_addlsh1_n (as2, a0, as2, n); 174 #else 175 cy = mpn_add_n (as2, a2, as1, s); 176 if (s != n) 177 cy = mpn_add_1 (as2 + s, as1 + s, n - s, cy); 178 cy += as1[n]; 179 cy = 2 * cy + mpn_lshift (as2, as2, n, 1); 180 cy -= mpn_sub_n (as2, as2, a0, n); 181 #endif 182 #endif 183 as2[n] = cy; 184 185 /* Compute bs1 and bsm1. */ 186 cy = mpn_add (gp, b0, n, b2, t); 187 #if HAVE_NATIVE_mpn_add_n_sub_n 188 if (cy == 0 && mpn_cmp (gp, b1, n) < 0) 189 { 190 cy = mpn_add_n_sub_n (bs1, bsm1, b1, gp, n); 191 bs1[n] = cy >> 1; 192 bsm1[n] = 0; 193 vm1_neg ^= 1; 194 } 195 else 196 { 197 mp_limb_t cy2; 198 cy2 = mpn_add_n_sub_n (bs1, bsm1, gp, b1, n); 199 bs1[n] = cy + (cy2 >> 1); 200 bsm1[n] = cy - (cy2 & 1); 201 } 202 #else 203 bs1[n] = cy + mpn_add_n (bs1, gp, b1, n); 204 if (cy == 0 && mpn_cmp (gp, b1, n) < 0) 205 { 206 mpn_sub_n (bsm1, b1, gp, n); 207 bsm1[n] = 0; 208 vm1_neg ^= 1; 209 } 210 else 211 { 212 cy -= mpn_sub_n (bsm1, gp, b1, n); 213 bsm1[n] = cy; 214 } 215 #endif 216 217 /* Compute bs2. */ 218 #if HAVE_NATIVE_mpn_rsblsh1_n 219 cy = mpn_add_n (bs2, b2, bs1, t); 220 if (t != n) 221 cy = mpn_add_1 (bs2 + t, bs1 + t, n - t, cy); 222 cy += bs1[n]; 223 cy = 2 * cy + mpn_rsblsh1_n (bs2, b0, bs2, n); 224 #else 225 #if HAVE_NATIVE_mpn_addlsh1_n 226 cy = mpn_addlsh1_n (bs2, b1, b2, t); 227 if (t != n) 228 cy = mpn_add_1 (bs2 + t, b1 + t, n - t, cy); 229 cy = 2 * cy + mpn_addlsh1_n (bs2, b0, bs2, n); 230 #else 231 cy = mpn_add_n (bs2, bs1, b2, t); 232 if (t != n) 233 cy = mpn_add_1 (bs2 + t, bs1 + t, n - t, cy); 234 cy += bs1[n]; 235 cy = 2 * cy + mpn_lshift (bs2, bs2, n, 1); 236 cy -= mpn_sub_n (bs2, bs2, b0, n); 237 #endif 238 #endif 239 bs2[n] = cy; 240 241 ASSERT (as1[n] <= 2); 242 ASSERT (bs1[n] <= 2); 243 ASSERT (asm1[n] <= 1); 244 ASSERT (bsm1[n] <= 1); 245 ASSERT (as2[n] <= 6); 246 ASSERT (bs2[n] <= 6); 247 248 #define v0 pp /* 2n */ 249 #define v1 (pp + 2 * n) /* 2n+1 */ 250 #define vinf (pp + 4 * n) /* s+t */ 251 #define vm1 scratch /* 2n+1 */ 252 #define v2 (scratch + 2 * n + 1) /* 2n+2 */ 253 #define scratch_out (scratch + 5 * n + 5) 254 255 /* vm1, 2n+1 limbs */ 256 #ifdef SMALLER_RECURSION 257 TOOM33_MUL_N_REC (vm1, asm1, bsm1, n, scratch_out); 258 cy = 0; 259 if (asm1[n] != 0) 260 cy = bsm1[n] + mpn_add_n (vm1 + n, vm1 + n, bsm1, n); 261 if (bsm1[n] != 0) 262 cy += mpn_add_n (vm1 + n, vm1 + n, asm1, n); 263 vm1[2 * n] = cy; 264 #else 265 TOOM33_MUL_N_REC (vm1, asm1, bsm1, n + 1, scratch_out); 266 #endif 267 268 TOOM33_MUL_N_REC (v2, as2, bs2, n + 1, scratch_out); /* v2, 2n+1 limbs */ 269 270 /* vinf, s+t limbs */ 271 if (s > t) mpn_mul (vinf, a2, s, b2, t); 272 else TOOM33_MUL_N_REC (vinf, a2, b2, s, scratch_out); 273 274 vinf0 = vinf[0]; /* v1 overlaps with this */ 275 276 #ifdef SMALLER_RECURSION 277 /* v1, 2n+1 limbs */ 278 TOOM33_MUL_N_REC (v1, as1, bs1, n, scratch_out); 279 if (as1[n] == 1) 280 { 281 cy = bs1[n] + mpn_add_n (v1 + n, v1 + n, bs1, n); 282 } 283 else if (as1[n] != 0) 284 { 285 #if HAVE_NATIVE_mpn_addlsh1_n_ip1 286 cy = 2 * bs1[n] + mpn_addlsh1_n_ip1 (v1 + n, bs1, n); 287 #else 288 cy = 2 * bs1[n] + mpn_addmul_1 (v1 + n, bs1, n, CNST_LIMB(2)); 289 #endif 290 } 291 else 292 cy = 0; 293 if (bs1[n] == 1) 294 { 295 cy += mpn_add_n (v1 + n, v1 + n, as1, n); 296 } 297 else if (bs1[n] != 0) 298 { 299 #if HAVE_NATIVE_mpn_addlsh1_n_ip1 300 cy += mpn_addlsh1_n_ip1 (v1 + n, as1, n); 301 #else 302 cy += mpn_addmul_1 (v1 + n, as1, n, CNST_LIMB(2)); 303 #endif 304 } 305 v1[2 * n] = cy; 306 #else 307 cy = vinf[1]; 308 TOOM33_MUL_N_REC (v1, as1, bs1, n + 1, scratch_out); 309 vinf[1] = cy; 310 #endif 311 312 TOOM33_MUL_N_REC (v0, ap, bp, n, scratch_out); /* v0, 2n limbs */ 313 314 mpn_toom_interpolate_5pts (pp, v2, vm1, n, s + t, vm1_neg, vinf0); 315 } 316