/* mpn_toom53_mul -- Multiply {ap,an} and {bp,bn} where an is nominally 5/3
   times as large as bn.  Or more accurately, (4/3)bn < an < (5/2)bn.

   Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.

   The idea of applying toom to unbalanced multiplication is due to Marco
   Bodrato and Alberto Zanoni.

   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

Copyright 2006-2008, 2012, 2014, 2015 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */


#include "gmp-impl.h"

/* Evaluate in: 0, +1, -1, +2, -2, 1/2, +inf

  <-s-><--n--><--n--><--n--><--n-->
   ___ ______ ______ ______ ______
  |a4_|___a3_|___a2_|___a1_|___a0_|
               |__b2|___b1_|___b0_|
               <-t--><--n--><--n-->

  v0  =    a0                   *  b0          #    A(0)*B(0)
  v1  = (  a0+ a1+ a2+ a3+  a4) * ( b0+ b1+ b2) #    A(1)*B(1)      ah <= 4    bh <= 2
  vm1 = (  a0- a1+ a2- a3+  a4) * ( b0- b1+ b2) #   A(-1)*B(-1)    |ah| <= 2   bh <= 1
  v2  = (  a0+2a1+4a2+8a3+16a4) * ( b0+2b1+4b2) #    A(2)*B(2)      ah <= 30   bh <= 6
  vm2 = (  a0-2a1+4a2-8a3+16a4) * ( b0-2b1+4b2) #   A(-2)*B(-2)   -9<=ah<=20  -1<=bh<=4
  vh  = (16a0+8a1+4a2+2a3+  a4) * (4b0+2b1+ b2) #  A(1/2)*B(1/2)    ah <= 30   bh <= 6
  vinf =                     a4 *           b2  #  A(inf)*B(inf)
*/

void
mpn_toom53_mul (mp_ptr pp,
                mp_srcptr ap, mp_size_t an,
                mp_srcptr bp, mp_size_t bn,
                mp_ptr scratch)
{
  mp_size_t n, s, t;
  mp_limb_t cy;
  mp_ptr gp;
  mp_ptr as1, asm1, as2, asm2, ash;
  mp_ptr bs1, bsm1, bs2, bsm2, bsh;
  mp_ptr tmp;
  enum toom7_flags flags;
  TMP_DECL;

#define a0  ap
#define a1  (ap + n)
#define a2  (ap + 2*n)
#define a3  (ap + 3*n)
#define a4  (ap + 4*n)
#define b0  bp
#define b1  (bp + n)
#define b2  (bp + 2*n)

  n = 1 + (3 * an >= 5 * bn ? (an - 1) / (size_t) 5 : (bn - 1) / (size_t) 3);

  s = an - 4 * n;
  t = bn - 2 * n;

  ASSERT (0 < s && s <= n);
  ASSERT (0 < t && t <= n);

  TMP_MARK;

  tmp = TMP_ALLOC_LIMBS (10 * (n + 1));
  as1  = tmp; tmp += n + 1;
  asm1 = tmp; tmp += n + 1;
  as2  = tmp; tmp += n + 1;
  asm2 = tmp; tmp += n + 1;
  ash  = tmp; tmp += n + 1;
  bs1  = tmp; tmp += n + 1;
  bsm1 = tmp; tmp += n + 1;
  bs2  = tmp; tmp += n + 1;
  bsm2 = tmp; tmp += n + 1;
  bsh  = tmp; tmp += n + 1;

  gp = pp;

  /* Compute as1 and asm1.  */
  flags = (enum toom7_flags) (toom7_w3_neg & mpn_toom_eval_pm1 (as1, asm1, 4, ap, n, s, gp));
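
  /* Editorial note: mpn_toom_eval_pm1 stores A(1) at as1 and |A(-1)| at asm1,
     and returns a mask that is nonzero exactly when A(-1) is negative, so
     ANDing it with toom7_w3_neg records that sign in flags.  mpn_toom_eval_pm2
     below does the same for A(2), |A(-2)| and toom7_w1_neg.  The B-side signs
     are folded into the same bits further down, so each bit ends up giving
     the sign of the corresponding product.  */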

  /* Compute as2 and asm2.  */
  flags = (enum toom7_flags) (flags | (toom7_w1_neg & mpn_toom_eval_pm2 (as2, asm2, 4, ap, n, s, gp)));

  /* Compute ash = 16 a0 + 8 a1 + 4 a2 + 2 a3 + a4
     = 2*(2*(2*(2*a0 + a1) + a2) + a3) + a4  */
#if HAVE_NATIVE_mpn_addlsh1_n
  cy = mpn_addlsh1_n (ash, a1, a0, n);
  cy = 2*cy + mpn_addlsh1_n (ash, a2, ash, n);
  cy = 2*cy + mpn_addlsh1_n (ash, a3, ash, n);
  if (s < n)
    {
      mp_limb_t cy2;
      cy2 = mpn_addlsh1_n (ash, a4, ash, s);
      ash[n] = 2*cy + mpn_lshift (ash + s, ash + s, n - s, 1);
      MPN_INCR_U (ash + s, n+1-s, cy2);
    }
  else
    ash[n] = 2*cy + mpn_addlsh1_n (ash, a4, ash, n);
#else
  cy = mpn_lshift (ash, a0, n, 1);
  cy += mpn_add_n (ash, ash, a1, n);
  cy = 2*cy + mpn_lshift (ash, ash, n, 1);
  cy += mpn_add_n (ash, ash, a2, n);
  cy = 2*cy + mpn_lshift (ash, ash, n, 1);
  cy += mpn_add_n (ash, ash, a3, n);
  cy = 2*cy + mpn_lshift (ash, ash, n, 1);
  ash[n] = cy + mpn_add (ash, ash, n, a4, s);
#endif

  /* Compute bs1 and bsm1.  */
  bs1[n] = mpn_add (bs1, b0, n, b2, t);		/* b0 + b2 */
#if HAVE_NATIVE_mpn_add_n_sub_n
  if (bs1[n] == 0 && mpn_cmp (bs1, b1, n) < 0)
    {
      bs1[n] = mpn_add_n_sub_n (bs1, bsm1, b1, bs1, n) >> 1;
      bsm1[n] = 0;
      flags = (enum toom7_flags) (flags ^ toom7_w3_neg);
    }
  else
    {
      cy = mpn_add_n_sub_n (bs1, bsm1, bs1, b1, n);
      bsm1[n] = bs1[n] - (cy & 1);
      bs1[n] += (cy >> 1);
    }
#else
  if (bs1[n] == 0 && mpn_cmp (bs1, b1, n) < 0)
    {
      mpn_sub_n (bsm1, b1, bs1, n);
      bsm1[n] = 0;
      flags = (enum toom7_flags) (flags ^ toom7_w3_neg);
    }
  else
    {
      bsm1[n] = bs1[n] - mpn_sub_n (bsm1, bs1, b1, n);
    }
  bs1[n] += mpn_add_n (bs1, bs1, b1, n);	/* b0+b1+b2 */
#endif

  /* Compute bs2 and bsm2.  */
#if HAVE_NATIVE_mpn_addlsh_n || HAVE_NATIVE_mpn_addlsh2_n
#if HAVE_NATIVE_mpn_addlsh2_n
  cy = mpn_addlsh2_n (bs2, b0, b2, t);
#else /* HAVE_NATIVE_mpn_addlsh_n */
  cy = mpn_addlsh_n (bs2, b0, b2, t, 2);
#endif
  if (t < n)
    cy = mpn_add_1 (bs2 + t, b0 + t, n - t, cy);
  bs2[n] = cy;
#else
  cy = mpn_lshift (gp, b2, t, 2);
  bs2[n] = mpn_add (bs2, b0, n, gp, t);
  MPN_INCR_U (bs2 + t, n+1-t, cy);
#endif

  gp[n] = mpn_lshift (gp, b1, n, 1);

#if HAVE_NATIVE_mpn_add_n_sub_n
  if (mpn_cmp (bs2, gp, n+1) < 0)
    {
      ASSERT_NOCARRY (mpn_add_n_sub_n (bs2, bsm2, gp, bs2, n+1));
      flags = (enum toom7_flags) (flags ^ toom7_w1_neg);
    }
  else
    {
      ASSERT_NOCARRY (mpn_add_n_sub_n (bs2, bsm2, bs2, gp, n+1));
    }
#else
  if (mpn_cmp (bs2, gp, n+1) < 0)
    {
      ASSERT_NOCARRY (mpn_sub_n (bsm2, gp, bs2, n+1));
      flags = (enum toom7_flags) (flags ^ toom7_w1_neg);
    }
  else
    {
      ASSERT_NOCARRY (mpn_sub_n (bsm2, bs2, gp, n+1));
    }
  mpn_add_n (bs2, bs2, gp, n+1);
#endif
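
  /* Editorial note: at this point bs2 = B(2) = b0 + 2*b1 + 4*b2 and
     bsm2 = |B(-2)| = |b0 - 2*b1 + 4*b2| (gp = pp was used as temporary space
     for 2*b1).  toom7_w1_neg was toggled above whenever B(-2) is negative, so
     it is now set exactly when vm2 = A(-2)*B(-2) is negative.  */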
  /* Compute bsh = 4 b0 + 2 b1 + b2 = 2*(2*b0 + b1) + b2.  */
#if HAVE_NATIVE_mpn_addlsh1_n
  cy = mpn_addlsh1_n (bsh, b1, b0, n);
  if (t < n)
    {
      mp_limb_t cy2;
      cy2 = mpn_addlsh1_n (bsh, b2, bsh, t);
      bsh[n] = 2*cy + mpn_lshift (bsh + t, bsh + t, n - t, 1);
      MPN_INCR_U (bsh + t, n+1-t, cy2);
    }
  else
    bsh[n] = 2*cy + mpn_addlsh1_n (bsh, b2, bsh, n);
#else
  cy = mpn_lshift (bsh, b0, n, 1);
  cy += mpn_add_n (bsh, bsh, b1, n);
  cy = 2*cy + mpn_lshift (bsh, bsh, n, 1);
  bsh[n] = cy + mpn_add (bsh, bsh, n, b2, t);
#endif

  ASSERT (as1[n] <= 4);
  ASSERT (bs1[n] <= 2);
  ASSERT (asm1[n] <= 2);
  ASSERT (bsm1[n] <= 1);
  ASSERT (as2[n] <= 30);
  ASSERT (bs2[n] <= 6);
  ASSERT (asm2[n] <= 20);
  ASSERT (bsm2[n] <= 4);
  ASSERT (ash[n] <= 30);
  ASSERT (bsh[n] <= 6);

#define v0    pp				/* 2n */
#define v1    (pp + 2 * n)			/* 2n+1 */
#define vinf  (pp + 6 * n)			/* s+t */
#define v2    scratch				/* 2n+1 */
#define vm2   (scratch + 2 * n + 1)		/* 2n+1 */
#define vh    (scratch + 4 * n + 2)		/* 2n+1 */
#define vm1   (scratch + 6 * n + 3)		/* 2n+1 */
#define scratch_out  (scratch + 8 * n + 4)	/* 2n+1 */
	 /* Total scratch need: 10*n+5 */

  /* Must be in allocation order, as they overwrite one limb beyond
   * 2n+1. */
  mpn_mul_n (v2, as2, bs2, n + 1);		/* v2, 2n+1 limbs */
  mpn_mul_n (vm2, asm2, bsm2, n + 1);		/* vm2, 2n+1 limbs */
  mpn_mul_n (vh, ash, bsh, n + 1);		/* vh, 2n+1 limbs */

  /* vm1, 2n+1 limbs */
#ifdef SMALLER_RECURSION
  mpn_mul_n (vm1, asm1, bsm1, n);
  if (asm1[n] == 1)
    {
      cy = bsm1[n] + mpn_add_n (vm1 + n, vm1 + n, bsm1, n);
    }
  else if (asm1[n] == 2)
    {
#if HAVE_NATIVE_mpn_addlsh1_n_ip1
      cy = 2 * bsm1[n] + mpn_addlsh1_n_ip1 (vm1 + n, bsm1, n);
#else
      cy = 2 * bsm1[n] + mpn_addmul_1 (vm1 + n, bsm1, n, CNST_LIMB(2));
#endif
    }
  else
    cy = 0;
  if (bsm1[n] != 0)
    cy += mpn_add_n (vm1 + n, vm1 + n, asm1, n);
  vm1[2 * n] = cy;
#else /* SMALLER_RECURSION */
  vm1[2 * n] = 0;
  mpn_mul_n (vm1, asm1, bsm1, n + ((asm1[n] | bsm1[n]) != 0));
#endif /* SMALLER_RECURSION */

  /* v1, 2n+1 limbs */
#ifdef SMALLER_RECURSION
  mpn_mul_n (v1, as1, bs1, n);
  if (as1[n] == 1)
    {
      cy = bs1[n] + mpn_add_n (v1 + n, v1 + n, bs1, n);
    }
  else if (as1[n] == 2)
    {
#if HAVE_NATIVE_mpn_addlsh1_n_ip1
      cy = 2 * bs1[n] + mpn_addlsh1_n_ip1 (v1 + n, bs1, n);
#else
      cy = 2 * bs1[n] + mpn_addmul_1 (v1 + n, bs1, n, CNST_LIMB(2));
#endif
    }
  else if (as1[n] != 0)
    {
      cy = as1[n] * bs1[n] + mpn_addmul_1 (v1 + n, bs1, n, as1[n]);
    }
  else
    cy = 0;
  if (bs1[n] == 1)
    {
      cy += mpn_add_n (v1 + n, v1 + n, as1, n);
    }
  else if (bs1[n] == 2)
    {
#if HAVE_NATIVE_mpn_addlsh1_n_ip1
      cy += mpn_addlsh1_n_ip1 (v1 + n, as1, n);
#else
      cy += mpn_addmul_1 (v1 + n, as1, n, CNST_LIMB(2));
#endif
    }
  v1[2 * n] = cy;
#else /* SMALLER_RECURSION */
  v1[2 * n] = 0;
  mpn_mul_n (v1, as1, bs1, n + ((as1[n] | bs1[n]) != 0));
#endif /* SMALLER_RECURSION */

  mpn_mul_n (v0, a0, b0, n);			/* v0, 2n limbs */

  /* vinf, s+t limbs */
  if (s > t)  mpn_mul (vinf, a4, s, b2, t);
  else        mpn_mul (vinf, b2, t, a4, s);

  mpn_toom_interpolate_7pts (pp, n, flags, vm2, vm1, v2, vh, s + t,
			     scratch_out);

  TMP_FREE;
}
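
/* Worked example of the size split above (an illustrative sketch, not
   exercised by this file): for an = 50 and bn = 30 we have 3*an >= 5*bn,
   hence n = 1 + (an - 1)/5 = 10, s = an - 4*n = 10 and t = bn - 2*n = 10,
   and the scratch argument must provide the 10*n + 5 = 105 limbs accounted
   for above.  Callers are expected to reach this code only through
   documented interfaces such as mpn_mul, which size the scratch area
   accordingly (e.g. via the mpn_toom53_mul_itch estimate in gmp-impl.h).  */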