/* mpn_toom62_mul -- Multiply {ap,an} and {bp,bn} where an is nominally 3 times
   as large as bn.  Or more accurately, (5/2)bn < an < 6bn.

   Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.

   The idea of applying toom to unbalanced multiplication is due to Marco
   Bodrato and Alberto Zanoni.

   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

   Copyright 2006-2008, 2012 Free Software Foundation, Inc.

   This file is part of the GNU MP Library.

   The GNU MP Library is free software; you can redistribute it and/or modify
   it under the terms of either:

     * the GNU Lesser General Public License as published by the Free
       Software Foundation; either version 3 of the License, or (at your
       option) any later version.

   or

     * the GNU General Public License as published by the Free Software
       Foundation; either version 2 of the License, or (at your option) any
       later version.

   or both in parallel, as here.

   The GNU MP Library is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
   for more details.

   You should have received copies of the GNU General Public License and the
   GNU Lesser General Public License along with the GNU MP Library.  If not,
   see https://www.gnu.org/licenses/.  */


#include "gmp-impl.h"

/* Evaluate in:
   0, +1, -1, +2, -2, 1/2, +inf

  <-s-><--n--><--n--><--n--><--n--><--n-->
   ___ ______ ______ ______ ______ ______
  |a5_|___a4_|___a3_|___a2_|___a1_|___a0_|
                             |_b1_|___b0_|
                             <-t--><--n-->

  v0  =    a0                        *  b0      #   A(0)*B(0)
  v1  = (  a0+  a1+ a2+ a3+  a4+  a5)*( b0+ b1) #   A(1)*B(1)      ah  <= 5   bh <= 1
  vm1 = (  a0-  a1+ a2- a3+  a4-  a5)*( b0- b1) #  A(-1)*B(-1)    |ah| <= 2   bh  = 0
  v2  = (  a0+ 2a1+4a2+8a3+16a4+32a5)*( b0+2b1) #   A(2)*B(2)      ah  <= 62  bh <= 2
  vm2 = (  a0- 2a1+4a2-8a3+16a4-32a5)*( b0-2b1) #  A(-2)*B(-2)  -41<=ah<=20  -1<=bh<=0
  vh  = (32a0+16a1+8a2+4a3+ 2a4+  a5)*(2b0+ b1) # A(1/2)*B(1/2)    ah  <= 62  bh <= 2
  vinf=                            a5 *     b1  # A(inf)*B(inf)
*/

void
mpn_toom62_mul (mp_ptr pp,
		mp_srcptr ap, mp_size_t an,
		mp_srcptr bp, mp_size_t bn,
		mp_ptr scratch)
{
  mp_size_t n, s, t;
  mp_limb_t cy;
  mp_ptr as1, asm1, as2, asm2, ash;
  mp_ptr bs1, bsm1, bs2, bsm2, bsh;
  mp_ptr gp;
  enum toom7_flags aflags, bflags;
  TMP_DECL;

#define a0  ap
#define a1  (ap + n)
#define a2  (ap + 2*n)
#define a3  (ap + 3*n)
#define a4  (ap + 4*n)
#define a5  (ap + 5*n)
#define b0  bp
#define b1  (bp + n)

  n = 1 + (an >= 3 * bn ? (an - 1) / (size_t) 6 : (bn - 1) >> 1);

  s = an - 5 * n;
  t = bn - n;

  ASSERT (0 < s && s <= n);
  ASSERT (0 < t && t <= n);

  TMP_MARK;

  as1 = TMP_SALLOC_LIMBS (n + 1);
  asm1 = TMP_SALLOC_LIMBS (n + 1);
  as2 = TMP_SALLOC_LIMBS (n + 1);
  asm2 = TMP_SALLOC_LIMBS (n + 1);
  ash = TMP_SALLOC_LIMBS (n + 1);

  bs1 = TMP_SALLOC_LIMBS (n + 1);
  bsm1 = TMP_SALLOC_LIMBS (n);
  bs2 = TMP_SALLOC_LIMBS (n + 1);
  bsm2 = TMP_SALLOC_LIMBS (n + 1);
  bsh = TMP_SALLOC_LIMBS (n + 1);

  gp = pp;

  /* Compute as1 and asm1.  */
  aflags = (enum toom7_flags) (toom7_w3_neg & mpn_toom_eval_pm1 (as1, asm1, 5, ap, n, s, gp));
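  /* mpn_toom_eval_pm1 stores A(1) in as1 and |A(-1)| in asm1, and returns a
     mask that is nonzero exactly when A(-1) is negative; the AND above
     therefore sets toom7_w3_neg in aflags precisely in that case.  The +2/-2
     evaluation below does the same for A(-2) and toom7_w1_neg.  XORing these
     bits with the corresponding B flags at the interpolation call gives the
     signs of vm1 = A(-1)*B(-1) and vm2 = A(-2)*B(-2).  */
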
  /* Compute as2 and asm2.  */
  aflags = (enum toom7_flags) (aflags | (toom7_w1_neg & mpn_toom_eval_pm2 (as2, asm2, 5, ap, n, s, gp)));

  /* Compute ash = 32 a0 + 16 a1 + 8 a2 + 4 a3 + 2 a4 + a5
     = 2*(2*(2*(2*(2*a0 + a1) + a2) + a3) + a4) + a5  */

#if HAVE_NATIVE_mpn_addlsh1_n
  cy = mpn_addlsh1_n (ash, a1, a0, n);
  cy = 2*cy + mpn_addlsh1_n (ash, a2, ash, n);
  cy = 2*cy + mpn_addlsh1_n (ash, a3, ash, n);
  cy = 2*cy + mpn_addlsh1_n (ash, a4, ash, n);
  if (s < n)
    {
      mp_limb_t cy2;
      cy2 = mpn_addlsh1_n (ash, a5, ash, s);
      ash[n] = 2*cy + mpn_lshift (ash + s, ash + s, n - s, 1);
      MPN_INCR_U (ash + s, n+1-s, cy2);
    }
  else
    ash[n] = 2*cy + mpn_addlsh1_n (ash, a5, ash, n);
#else
  cy = mpn_lshift (ash, a0, n, 1);
  cy += mpn_add_n (ash, ash, a1, n);
  cy = 2*cy + mpn_lshift (ash, ash, n, 1);
  cy += mpn_add_n (ash, ash, a2, n);
  cy = 2*cy + mpn_lshift (ash, ash, n, 1);
  cy += mpn_add_n (ash, ash, a3, n);
  cy = 2*cy + mpn_lshift (ash, ash, n, 1);
  cy += mpn_add_n (ash, ash, a4, n);
  cy = 2*cy + mpn_lshift (ash, ash, n, 1);
  ash[n] = cy + mpn_add (ash, ash, n, a5, s);
#endif

  /* Compute bs1 and bsm1.  */
  if (t == n)
    {
#if HAVE_NATIVE_mpn_add_n_sub_n
      if (mpn_cmp (b0, b1, n) < 0)
	{
	  cy = mpn_add_n_sub_n (bs1, bsm1, b1, b0, n);
	  bflags = toom7_w3_neg;
	}
      else
	{
	  cy = mpn_add_n_sub_n (bs1, bsm1, b0, b1, n);
	  bflags = (enum toom7_flags) 0;
	}
      bs1[n] = cy >> 1;
#else
      bs1[n] = mpn_add_n (bs1, b0, b1, n);
      if (mpn_cmp (b0, b1, n) < 0)
	{
	  mpn_sub_n (bsm1, b1, b0, n);
	  bflags = toom7_w3_neg;
	}
      else
	{
	  mpn_sub_n (bsm1, b0, b1, n);
	  bflags = (enum toom7_flags) 0;
	}
#endif
    }
  else
    {
      bs1[n] = mpn_add (bs1, b0, n, b1, t);
      if (mpn_zero_p (b0 + t, n - t) && mpn_cmp (b0, b1, t) < 0)
	{
	  mpn_sub_n (bsm1, b1, b0, t);
	  MPN_ZERO (bsm1 + t, n - t);
	  bflags = toom7_w3_neg;
	}
      else
	{
	  mpn_sub (bsm1, b0, n, b1, t);
	  bflags = (enum toom7_flags) 0;
	}
    }

  /* Compute bs2 and bsm2.  Recycling bs1 and bsm1; bs2 = bs1 + b1,
     bsm2 = bsm1 - b1  */
  mpn_add (bs2, bs1, n + 1, b1, t);
  if (bflags & toom7_w3_neg)
    {
      bsm2[n] = mpn_add (bsm2, bsm1, n, b1, t);
      bflags = (enum toom7_flags) (bflags | toom7_w1_neg);
    }
  else
    {
      /* FIXME: Simplify this logic?  */
      if (t < n)
	{
	  if (mpn_zero_p (bsm1 + t, n - t) && mpn_cmp (bsm1, b1, t) < 0)
	    {
	      ASSERT_NOCARRY (mpn_sub_n (bsm2, b1, bsm1, t));
	      MPN_ZERO (bsm2 + t, n + 1 - t);
	      bflags = (enum toom7_flags) (bflags | toom7_w1_neg);
	    }
	  else
	    {
	      ASSERT_NOCARRY (mpn_sub (bsm2, bsm1, n, b1, t));
	      bsm2[n] = 0;
	    }
	}
      else
	{
	  if (mpn_cmp (bsm1, b1, n) < 0)
	    {
	      ASSERT_NOCARRY (mpn_sub_n (bsm2, b1, bsm1, n));
	      bflags = (enum toom7_flags) (bflags | toom7_w1_neg);
	    }
	  else
	    {
	      ASSERT_NOCARRY (mpn_sub_n (bsm2, bsm1, b1, n));
	    }
	  bsm2[n] = 0;
	}
    }
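  /* The identities used above: B(2) = b0 + 2 b1 = (b0 + b1) + b1 = bs1 + b1,
     and B(-2) = b0 - 2 b1 = (b0 - b1) - b1.  Since bsm1 holds |b0 - b1|,
     bsm2 = bsm1 + b1 when b0 < b1 (then B(-1) and B(-2) are both negative,
     so toom7_w1_neg joins toom7_w3_neg), and bsm2 = |bsm1 - b1| otherwise,
     with toom7_w1_neg recording the sign change when b1 > bsm1.  */
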
  /* Compute bsh, recycling bs1.  bsh = bs1 + b0  */
  bsh[n] = bs1[n] + mpn_add_n (bsh, bs1, b0, n);

  ASSERT (as1[n] <= 5);
  ASSERT (bs1[n] <= 1);
  ASSERT (asm1[n] <= 2);
  ASSERT (as2[n] <= 62);
  ASSERT (bs2[n] <= 2);
  ASSERT (asm2[n] <= 41);
  ASSERT (bsm2[n] <= 1);
  ASSERT (ash[n] <= 62);
  ASSERT (bsh[n] <= 2);

#define v0    pp				/* 2n */
#define v1    (pp + 2 * n)			/* 2n+1 */
#define vinf  (pp + 6 * n)			/* s+t */
#define v2    scratch				/* 2n+1 */
#define vm2   (scratch + 2 * n + 1)		/* 2n+1 */
#define vh    (scratch + 4 * n + 2)		/* 2n+1 */
#define vm1   (scratch + 6 * n + 3)		/* 2n+1 */
#define scratch_out  (scratch + 8 * n + 4)	/* 2n+1 */
	 /* Total scratch need: 10*n+5 */

  /* Must be in allocation order, as they overwrite one limb beyond
   * 2n+1.  */
  mpn_mul_n (v2, as2, bs2, n + 1);		/* v2, 2n+1 limbs */
  mpn_mul_n (vm2, asm2, bsm2, n + 1);		/* vm2, 2n+1 limbs */
  mpn_mul_n (vh, ash, bsh, n + 1);		/* vh, 2n+1 limbs */

  /* vm1, 2n+1 limbs */
  mpn_mul_n (vm1, asm1, bsm1, n);
  cy = 0;
  if (asm1[n] == 1)
    {
      cy = mpn_add_n (vm1 + n, vm1 + n, bsm1, n);
    }
  else if (asm1[n] == 2)
    {
#if HAVE_NATIVE_mpn_addlsh1_n
      cy = mpn_addlsh1_n (vm1 + n, vm1 + n, bsm1, n);
#else
      cy = mpn_addmul_1 (vm1 + n, bsm1, n, CNST_LIMB(2));
#endif
    }
  vm1[2 * n] = cy;

  /* v1, 2n+1 limbs */
  mpn_mul_n (v1, as1, bs1, n);
  if (as1[n] == 1)
    {
      cy = bs1[n] + mpn_add_n (v1 + n, v1 + n, bs1, n);
    }
  else if (as1[n] == 2)
    {
#if HAVE_NATIVE_mpn_addlsh1_n
      cy = 2 * bs1[n] + mpn_addlsh1_n (v1 + n, v1 + n, bs1, n);
#else
      cy = 2 * bs1[n] + mpn_addmul_1 (v1 + n, bs1, n, CNST_LIMB(2));
#endif
    }
  else if (as1[n] != 0)
    {
      cy = as1[n] * bs1[n] + mpn_addmul_1 (v1 + n, bs1, n, as1[n]);
    }
  else
    cy = 0;
  if (bs1[n] != 0)
    cy += mpn_add_n (v1 + n, v1 + n, as1, n);
  v1[2 * n] = cy;

  mpn_mul_n (v0, a0, b0, n);			/* v0, 2n limbs */

  /* vinf, s+t limbs */
  if (s > t)  mpn_mul (vinf, a5, s, b1, t);
  else        mpn_mul (vinf, b1, t, a5, s);

  mpn_toom_interpolate_7pts (pp, n, (enum toom7_flags) (aflags ^ bflags),
			     vm2, vm1, v2, vh, s + t, scratch_out);

  TMP_FREE;
}
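
/* Interpolation sketch, for the mpn_toom_interpolate_7pts call above.
   Writing C(x) = A(x)*B(x) = c6*x^6 + ... + c1*x + c0, the seven values
   computed in this file are
     v0   = C(0)      =    c0
     v1   = C(1)      =    c0 +   c1 +   c2 +  c3 +   c4 +   c5 +   c6
     vm1  = C(-1)     =    c0 -   c1 +   c2 -  c3 +   c4 -   c5 +   c6
     v2   = C(2)      =    c0 +  2c1 +  4c2 + 8c3 + 16c4 + 32c5 + 64c6
     vm2  = C(-2)     =    c0 -  2c1 +  4c2 - 8c3 + 16c4 - 32c5 + 64c6
     vh   = 64*C(1/2) =  64c0 + 32c1 + 16c2 + 8c3 +  4c4 +  2c5 +   c6
     vinf = C(inf)    =    c6
   which determine c0, ..., c6 uniquely.  The interpolation routine recovers
   them and accumulates ci at limb offset i*n in pp, giving the an + bn limb
   product.  */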