/* mpn_toom3_sqr -- Square {ap,an}.

   Contributed to the GNU project by Torbjorn Granlund.
   Additional improvements by Marco Bodrato.

   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

   Copyright 2006, 2007, 2008, 2009, 2010, 2012 Free Software Foundation, Inc.

   This file is part of the GNU MP Library.

   The GNU MP Library is free software; you can redistribute it and/or modify
   it under the terms of the GNU Lesser General Public License as published by
   the Free Software Foundation; either version 3 of the License, or (at your
   option) any later version.

   The GNU MP Library is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
   License for more details.

   You should have received a copy of the GNU Lesser General Public License
   along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */


#include "gmp.h"
#include "gmp-impl.h"

/* Evaluate in: -1, 0, +1, +2, +inf

  <-s--><--n--><--n-->
   ____ ______ ______
  |_a2_|___a1_|___a0_|

  v0   =  a0         ^2  #   A(0)^2
  v1   = (a0+ a1+ a2)^2  #   A(1)^2    ah  <= 2
  vm1  = (a0- a1+ a2)^2  #  A(-1)^2   |ah| <= 1
  v2   = (a0+2a1+4a2)^2  #   A(2)^2    ah  <= 6
  vinf =          a2 ^2  # A(inf)^2
*/

#if TUNE_PROGRAM_BUILD || WANT_FAT_BINARY
#define MAYBE_sqr_basecase 1
#define MAYBE_sqr_toom3 1
#else
#define MAYBE_sqr_basecase					\
  (SQR_TOOM3_THRESHOLD < 3 * SQR_TOOM2_THRESHOLD)
#define MAYBE_sqr_toom3						\
  (SQR_TOOM4_THRESHOLD >= 3 * SQR_TOOM3_THRESHOLD)
#endif

#define TOOM3_SQR_REC(p, a, n, ws)				\
  do {								\
    if (MAYBE_sqr_basecase					\
	&& BELOW_THRESHOLD (n, SQR_TOOM2_THRESHOLD))		\
      mpn_sqr_basecase (p, a, n);				\
    else if (! MAYBE_sqr_toom3					\
	     || BELOW_THRESHOLD (n, SQR_TOOM3_THRESHOLD))	\
      mpn_toom2_sqr (p, a, n, ws);				\
    else							\
      mpn_toom3_sqr (p, a, n, ws);				\
  } while (0)

void
mpn_toom3_sqr (mp_ptr pp,
	       mp_srcptr ap, mp_size_t an,
	       mp_ptr scratch)
{
  const int __gmpn_cpuvec_initialized = 1;
  mp_size_t n, s;
  mp_limb_t cy, vinf0;
  mp_ptr gp;
  mp_ptr as1, asm1, as2;

#define a0  ap
#define a1  (ap + n)
#define a2  (ap + 2*n)

  n = (an + 2) / (size_t) 3;

  s = an - 2 * n;

  ASSERT (0 < s && s <= n);

  as1  = scratch + 4 * n + 4;
  asm1 = scratch + 2 * n + 2;
  as2  = pp + n + 1;

  gp = scratch;

  /* Compute as1 and asm1.  */
  cy = mpn_add (gp, a0, n, a2, s);
#if HAVE_NATIVE_mpn_add_n_sub_n
  if (cy == 0 && mpn_cmp (gp, a1, n) < 0)
    {
      cy = mpn_add_n_sub_n (as1, asm1, a1, gp, n);
      as1[n] = cy >> 1;
      asm1[n] = 0;
    }
  else
    {
      mp_limb_t cy2;
      cy2 = mpn_add_n_sub_n (as1, asm1, gp, a1, n);
      as1[n] = cy + (cy2 >> 1);
      asm1[n] = cy - (cy2 & 1);
    }
#else
  as1[n] = cy + mpn_add_n (as1, gp, a1, n);
  if (cy == 0 && mpn_cmp (gp, a1, n) < 0)
    {
      mpn_sub_n (asm1, a1, gp, n);
      asm1[n] = 0;
    }
  else
    {
      cy -= mpn_sub_n (asm1, gp, a1, n);
      asm1[n] = cy;
    }
#endif

  /* Compute as2.  */
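  /* as2 = A(2) = a0 + 2*a1 + 4*a2.  The branches below reach it from values
     already at hand: either as 2*(as1 + a2) - a0, reusing as1 = a0 + a1 + a2
     computed above, or as a0 + 2*(a1 + 2*a2) when mpn_addlsh1_n is available.
     The top limb of the evaluation ends up in as2[n].  */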
#if HAVE_NATIVE_mpn_rsblsh1_n
  cy = mpn_add_n (as2, a2, as1, s);
  if (s != n)
    cy = mpn_add_1 (as2 + s, as1 + s, n - s, cy);
  cy += as1[n];
  cy = 2 * cy + mpn_rsblsh1_n (as2, a0, as2, n);
#else
#if HAVE_NATIVE_mpn_addlsh1_n
  cy  = mpn_addlsh1_n (as2, a1, a2, s);
  if (s != n)
    cy = mpn_add_1 (as2 + s, a1 + s, n - s, cy);
  cy = 2 * cy + mpn_addlsh1_n (as2, a0, as2, n);
#else
  cy = mpn_add_n (as2, a2, as1, s);
  if (s != n)
    cy = mpn_add_1 (as2 + s, as1 + s, n - s, cy);
  cy += as1[n];
  cy = 2 * cy + mpn_lshift (as2, as2, n, 1);
  cy -= mpn_sub_n (as2, as2, a0, n);
#endif
#endif
  as2[n] = cy;

  ASSERT (as1[n] <= 2);
  ASSERT (asm1[n] <= 1);

#define v0    pp				/* 2n */
#define v1    (pp + 2 * n)			/* 2n+1 */
#define vinf  (pp + 4 * n)			/* s+s */
#define vm1   scratch				/* 2n+1 */
#define v2    (scratch + 2 * n + 1)		/* 2n+2 */
#define scratch_out  (scratch + 5 * n + 5)

  /* vm1, 2n+1 limbs */
#ifdef SMALLER_RECURSION
  TOOM3_SQR_REC (vm1, asm1, n, scratch_out);
  cy = 0;
  if (asm1[n] != 0)
    cy = asm1[n] + mpn_add_n (vm1 + n, vm1 + n, asm1, n);
  if (asm1[n] != 0)
    cy += mpn_add_n (vm1 + n, vm1 + n, asm1, n);
  vm1[2 * n] = cy;
#else
  TOOM3_SQR_REC (vm1, asm1, n + 1, scratch_out);
#endif

  TOOM3_SQR_REC (v2, as2, n + 1, scratch_out);	/* v2, 2n+1 limbs */

  TOOM3_SQR_REC (vinf, a2, s, scratch_out);	/* vinf, s+s limbs */

  vinf0 = vinf[0];				/* v1 overlaps with this */

#ifdef SMALLER_RECURSION
  /* v1, 2n+1 limbs */
  TOOM3_SQR_REC (v1, as1, n, scratch_out);
  if (as1[n] == 1)
    {
      cy = as1[n] + mpn_add_n (v1 + n, v1 + n, as1, n);
    }
  else if (as1[n] != 0)
    {
#if HAVE_NATIVE_mpn_addlsh1_n
      cy = 2 * as1[n] + mpn_addlsh1_n (v1 + n, v1 + n, as1, n);
#else
      cy = 2 * as1[n] + mpn_addmul_1 (v1 + n, as1, n, CNST_LIMB(2));
#endif
    }
  else
    cy = 0;
  if (as1[n] == 1)
    {
      cy += mpn_add_n (v1 + n, v1 + n, as1, n);
    }
  else if (as1[n] != 0)
    {
#if HAVE_NATIVE_mpn_addlsh1_n
      cy += mpn_addlsh1_n (v1 + n, v1 + n, as1, n);
#else
      cy += mpn_addmul_1 (v1 + n, as1, n, CNST_LIMB(2));
#endif
    }
  v1[2 * n] = cy;
#else
  cy = vinf[1];
  TOOM3_SQR_REC (v1, as1, n + 1, scratch_out);
  vinf[1] = cy;
#endif

  TOOM3_SQR_REC (v0, ap, n, scratch_out);	/* v0, 2n limbs */

  mpn_toom_interpolate_5pts (pp, v2, vm1, n, s + s, 0, vinf0);
}