1 /* hgcd2.c 2 3 THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY 4 SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST 5 GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. 6 7 Copyright 1996, 1998, 2000-2004, 2008, 2012, 2019 Free Software Foundation, 8 Inc. 9 10 This file is part of the GNU MP Library. 11 12 The GNU MP Library is free software; you can redistribute it and/or modify 13 it under the terms of either: 14 15 * the GNU Lesser General Public License as published by the Free 16 Software Foundation; either version 3 of the License, or (at your 17 option) any later version. 18 19 or 20 21 * the GNU General Public License as published by the Free Software 22 Foundation; either version 2 of the License, or (at your option) any 23 later version. 24 25 or both in parallel, as here. 26 27 The GNU MP Library is distributed in the hope that it will be useful, but 28 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 29 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 30 for more details. 31 32 You should have received copies of the GNU General Public License and the 33 GNU Lesser General Public License along with the GNU MP Library. If not, 34 see https://www.gnu.org/licenses/. */ 35 36 #include "gmp-impl.h" 37 #include "longlong.h" 38 39 #ifndef HGCD2_DIV1_METHOD 40 #define HGCD2_DIV1_METHOD 3 41 #endif 42 43 #ifndef HGCD2_DIV2_METHOD 44 #define HGCD2_DIV2_METHOD 2 45 #endif 46 47 #if GMP_NAIL_BITS != 0 48 #error Nails not implemented 49 #endif 50 51 #if HAVE_NATIVE_mpn_div_11 52 53 #define div1 mpn_div_11 54 /* Single-limb division optimized for small quotients. 55 Returned value holds d0 = r, d1 = q. */ 56 mp_double_limb_t div1 (mp_limb_t, mp_limb_t); 57 58 #elif HGCD2_DIV1_METHOD == 1 59 60 static inline mp_double_limb_t 61 div1 (mp_limb_t n0, mp_limb_t d0) 62 { 63 mp_double_limb_t res; 64 res.d1 = n0 / d0; 65 res.d0 = n0 - res.d1 * d0; 66 67 return res; 68 } 69 70 #elif HGCD2_DIV1_METHOD == 2 71 72 static mp_double_limb_t 73 div1 (mp_limb_t n0, mp_limb_t d0) 74 { 75 mp_double_limb_t res; 76 int ncnt, dcnt, cnt; 77 mp_limb_t q; 78 mp_limb_t mask; 79 80 ASSERT (n0 >= d0); 81 82 count_leading_zeros (ncnt, n0); 83 count_leading_zeros (dcnt, d0); 84 cnt = dcnt - ncnt; 85 86 d0 <<= cnt; 87 88 q = -(mp_limb_t) (n0 >= d0); 89 n0 -= d0 & q; 90 d0 >>= 1; 91 q = -q; 92 93 while (--cnt >= 0) 94 { 95 mask = -(mp_limb_t) (n0 >= d0); 96 n0 -= d0 & mask; 97 d0 >>= 1; 98 q = (q << 1) - mask; 99 } 100 101 res.d0 = n0; 102 res.d1 = q; 103 return res; 104 } 105 106 #elif HGCD2_DIV1_METHOD == 3 107 108 static inline mp_double_limb_t 109 div1 (mp_limb_t n0, mp_limb_t d0) 110 { 111 mp_double_limb_t res; 112 if (UNLIKELY ((d0 >> (GMP_LIMB_BITS - 3)) != 0) 113 || UNLIKELY (n0 >= (d0 << 3))) 114 { 115 res.d1 = n0 / d0; 116 res.d0 = n0 - res.d1 * d0; 117 } 118 else 119 { 120 mp_limb_t q, mask; 121 122 d0 <<= 2; 123 124 mask = -(mp_limb_t) (n0 >= d0); 125 n0 -= d0 & mask; 126 q = 4 & mask; 127 128 d0 >>= 1; 129 mask = -(mp_limb_t) (n0 >= d0); 130 n0 -= d0 & mask; 131 q += 2 & mask; 132 133 d0 >>= 1; 134 mask = -(mp_limb_t) (n0 >= d0); 135 n0 -= d0 & mask; 136 q -= mask; 137 138 res.d0 = n0; 139 res.d1 = q; 140 } 141 return res; 142 } 143 144 #elif HGCD2_DIV1_METHOD == 4 145 146 /* Table quotients. We extract the NBITS most significant bits of the 147 numerator limb, and the corresponding bits from the divisor limb, and use 148 these to form an index into the table. This method is probably only useful 149 for short pipelines with slow multiplication. 150 151 Possible improvements: 152 153 * Perhaps extract the highest NBITS of the divisor instead of the same bits 154 as from the numerator. That would require another count_leading_zeros, 155 and a post-multiply shift of the quotient. 156 157 * Compress tables? Their values are tiny, and there are lots of zero 158 entries (which are never used). 159 160 * Round the table entries more cleverly? 161 */ 162 163 #ifndef NBITS 164 #define NBITS 5 165 #endif 166 167 #if NBITS == 5 168 /* This needs full division about 13.2% of the time. */ 169 static const unsigned char tab[512] = { 170 17, 9, 5,4,3,2,2,2,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 171 18, 9, 6,4,3,2,2,2,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 172 19,10, 6,4,3,3,2,2,2,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0, 173 20,10, 6,5,3,3,2,2,2,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0, 174 21,11, 7,5,4,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0, 175 22,11, 7,5,4,3,3,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0, 176 23,12, 7,5,4,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0, 177 24,12, 8,6,4,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0, 178 25,13, 8,6,5,4,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0, 179 26,13, 8,6,5,4,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0, 180 27,14, 9,6,5,4,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0, 181 28,14, 9,7,5,4,3,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0, 182 29,15,10,7,5,4,4,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0, 183 30,15,10,7,6,5,4,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0, 184 31,16,10,7,6,5,4,3,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0, 185 32,16,11,8,6,5,4,3,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1 186 }; 187 #elif NBITS == 6 188 /* This needs full division about 9.8% of the time. */ 189 static const unsigned char tab[2048] = { 190 33,17,11, 8, 6, 5,4,4,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 191 1, 0, 0, 0, 0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 192 34,17,11, 8, 6, 5,4,4,3,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 193 1, 1, 0, 0, 0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 194 35,18,12, 9, 7, 5,5,4,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 195 1, 1, 1, 0, 0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 196 36,18,12, 9, 7, 6,5,4,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 197 1, 1, 1, 1, 0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 198 37,19,13, 9, 7, 6,5,4,4,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 199 1, 1, 1, 1, 1, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 200 38,19,13, 9, 7, 6,5,4,4,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 201 1, 1, 1, 1, 1, 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 202 39,20,13,10, 7, 6,5,4,4,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1, 203 1, 1, 1, 1, 1, 1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 204 40,20,14,10, 8, 6,5,5,4,3,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1, 205 1, 1, 1, 1, 1, 1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 206 41,21,14,10, 8, 6,5,5,4,4,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1, 207 1, 1, 1, 1, 1, 1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 208 42,21,14,10, 8, 7,6,5,4,4,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1, 209 1, 1, 1, 1, 1, 1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 210 43,22,15,11, 8, 7,6,5,4,4,3,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1, 211 1, 1, 1, 1, 1, 1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 212 44,22,15,11, 9, 7,6,5,4,4,3,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1, 213 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 214 45,23,15,11, 9, 7,6,5,5,4,4,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1, 215 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 216 46,23,16,11, 9, 7,6,5,5,4,4,3,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1, 217 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 218 47,24,16,12, 9, 7,6,5,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1, 219 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 220 48,24,16,12, 9, 8,6,6,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1, 221 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 222 49,25,17,12,10, 8,7,6,5,4,4,4,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1, 223 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 224 50,25,17,13,10, 8,7,6,5,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1, 225 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 226 51,26,18,13,10, 8,7,6,5,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1, 227 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0, 228 52,26,18,13,10, 8,7,6,5,5,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1, 229 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0, 230 53,27,18,13,10, 9,7,6,5,5,4,4,4,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1, 231 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0, 232 54,27,19,14,11, 9,7,6,6,5,4,4,4,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1, 233 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0, 234 55,28,19,14,11, 9,7,6,6,5,5,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1, 235 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0, 236 56,28,19,14,11, 9,8,7,6,5,5,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1, 237 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0, 238 57,29,20,14,11, 9,8,7,6,5,5,4,4,4,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,1,1, 239 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0, 240 58,29,20,15,11, 9,8,7,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1, 241 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0, 242 59,30,20,15,12,10,8,7,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,1, 243 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0, 244 60,30,21,15,12,10,8,7,6,6,5,5,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,1, 245 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0, 246 61,31,21,15,12,10,8,7,6,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1, 247 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0, 248 62,31,22,16,12,10,9,7,6,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1, 249 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0, 250 63,32,22,16,13,10,9,7,7,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,2,1, 251 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0, 252 64,32,22,16,13,10,9,8,7,6,5,5,4,4,4,3,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1, 253 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1 254 }; 255 #else 256 #error No table for provided NBITS 257 #endif 258 259 /* Doing tabp with a #define makes compiler warnings about pointing outside an 260 object go away. We used to define this as a variable. It is not clear if 261 e.g. (vector[100] - 10) + 10 is well- defined as per the C standard; 262 (vector[100] + 10) - 10 surely is and there is no sequence point so the 263 expressions should be equivalent. To make this safe, we might want to 264 define tabp as a macro with the index as an argument. Depending on the 265 platform, relocs might allow for assembly-time or linker-time resolution to 266 take place. */ 267 #define tabp (tab - (1 << (NBITS - 1) << NBITS)) 268 269 static inline mp_double_limb_t 270 div1 (mp_limb_t n0, mp_limb_t d0) 271 { 272 int ncnt; 273 size_t nbi, dbi; 274 mp_limb_t q0; 275 mp_limb_t r0; 276 mp_limb_t mask; 277 mp_double_limb_t res; 278 279 ASSERT (n0 >= d0); /* Actually only msb position is critical. */ 280 281 count_leading_zeros (ncnt, n0); 282 nbi = n0 << ncnt >> (GMP_LIMB_BITS - NBITS); 283 dbi = d0 << ncnt >> (GMP_LIMB_BITS - NBITS); 284 285 q0 = tabp[(nbi << NBITS) + dbi]; 286 r0 = n0 - q0 * d0; 287 mask = -(mp_limb_t) (r0 >= d0); 288 q0 -= mask; 289 r0 -= d0 & mask; 290 291 if (UNLIKELY (r0 >= d0)) 292 { 293 q0 = n0 / d0; 294 r0 = n0 - q0 * d0; 295 } 296 297 res.d1 = q0; 298 res.d0 = r0; 299 return res; 300 } 301 302 #elif HGCD2_DIV1_METHOD == 5 303 304 /* Table inverses of divisors. We don't bother with suppressing the msb from 305 the tables. We index with the NBITS most significant divisor bits, 306 including the always-set highest bit, but use addressing trickery via tabp 307 to suppress it. 308 309 Possible improvements: 310 311 * Do first multiply using 32-bit operations on 64-bit computers. At least 312 on most Arm64 cores, that uses 3 times less resources. It also saves on 313 many x86-64 processors. 314 */ 315 316 #ifndef NBITS 317 #define NBITS 7 318 #endif 319 320 #if NBITS == 5 321 /* This needs full division about 1.63% of the time. */ 322 static const unsigned char tab[16] = { 323 63, 59, 55, 52, 50, 47, 45, 43, 41, 39, 38, 36, 35, 34, 33, 32 324 }; 325 #elif NBITS == 6 326 /* This needs full division about 0.93% of the time. */ 327 static const unsigned char tab[32] = { 328 127,123,119,116,112,109,106,104,101, 98, 96, 94, 92, 90, 88, 86, 329 84, 82, 80, 79, 77, 76, 74, 73, 72, 70, 69, 68, 67, 66, 65, 64 330 }; 331 #elif NBITS == 7 332 /* This needs full division about 0.49% of the time. */ 333 static const unsigned char tab[64] = { 334 255,251,247,243,239,236,233,229,226,223,220,217,214,211,209,206, 335 203,201,198,196,194,191,189,187,185,183,181,179,177,175,173,171, 336 169,167,166,164,162,161,159,158,156,155,153,152,150,149,147,146, 337 145,143,142,141,140,139,137,136,135,134,133,132,131,130,129,128 338 }; 339 #elif NBITS == 8 340 /* This needs full division about 0.26% of the time. */ 341 static const unsigned short tab[128] = { 342 511,507,503,499,495,491,488,484,480,477,473,470,467,463,460,457, 343 454,450,447,444,441,438,435,433,430,427,424,421,419,416,413,411, 344 408,406,403,401,398,396,393,391,389,386,384,382,380,377,375,373, 345 371,369,367,365,363,361,359,357,355,353,351,349,347,345,343,342, 346 340,338,336,335,333,331,329,328,326,325,323,321,320,318,317,315, 347 314,312,311,309,308,306,305,303,302,301,299,298,296,295,294,292, 348 291,290,288,287,286,285,283,282,281,280,279,277,276,275,274,273, 349 272,270,269,268,267,266,265,264,263,262,261,260,259,258,257,256 350 }; 351 #else 352 #error No table for provided NBITS 353 #endif 354 355 /* Doing tabp with a #define makes compiler warnings about pointing outside an 356 object go away. We used to define this as a variable. It is not clear if 357 e.g. (vector[100] - 10) + 10 is well- defined as per the C standard; 358 (vector[100] + 10) - 10 surely is and there is no sequence point so the 359 expressions should be equivalent. To make this safe, we might want to 360 define tabp as a macro with the index as an argument. Depending on the 361 platform, relocs might allow for assembly-time or linker-time resolution to 362 take place. */ 363 #define tabp (tab - (1 << (NBITS - 1))) 364 365 static inline mp_double_limb_t 366 div1 (mp_limb_t n0, mp_limb_t d0) 367 { 368 int ncnt, dcnt; 369 size_t dbi; 370 mp_limb_t inv; 371 mp_limb_t q0; 372 mp_limb_t r0; 373 mp_limb_t mask; 374 mp_double_limb_t res; 375 376 count_leading_zeros (ncnt, n0); 377 count_leading_zeros (dcnt, d0); 378 379 dbi = d0 << dcnt >> (GMP_LIMB_BITS - NBITS); 380 inv = tabp[dbi]; 381 q0 = ((n0 << ncnt) >> (NBITS + 1)) * inv >> (GMP_LIMB_BITS - 1 + ncnt - dcnt); 382 r0 = n0 - q0 * d0; 383 mask = -(mp_limb_t) (r0 >= d0); 384 q0 -= mask; 385 r0 -= d0 & mask; 386 387 if (UNLIKELY (r0 >= d0)) 388 { 389 q0 = n0 / d0; 390 r0 = n0 - q0 * d0; 391 } 392 393 res.d1 = q0; 394 res.d0 = r0; 395 return res; 396 } 397 398 #else 399 #error Unknown HGCD2_DIV1_METHOD 400 #endif 401 402 #if HAVE_NATIVE_mpn_div_22 403 404 #define div2 mpn_div_22 405 /* Two-limb division optimized for small quotients. */ 406 mp_limb_t div2 (mp_ptr, mp_limb_t, mp_limb_t, mp_limb_t, mp_limb_t); 407 408 #elif HGCD2_DIV2_METHOD == 1 409 410 static mp_limb_t 411 div2 (mp_ptr rp, 412 mp_limb_t n1, mp_limb_t n0, 413 mp_limb_t d1, mp_limb_t d0) 414 { 415 mp_double_limb_t rq = div1 (n1, d1); 416 if (UNLIKELY (rq.d1 > d1)) 417 { 418 mp_limb_t n2, q, t1, t0; 419 int c; 420 421 /* Normalize */ 422 count_leading_zeros (c, d1); 423 ASSERT (c > 0); 424 425 n2 = n1 >> (GMP_LIMB_BITS - c); 426 n1 = (n1 << c) | (n0 >> (GMP_LIMB_BITS - c)); 427 n0 <<= c; 428 d1 = (d1 << c) | (d0 >> (GMP_LIMB_BITS - c)); 429 d0 <<= c; 430 431 udiv_qrnnd (q, n1, n2, n1, d1); 432 umul_ppmm (t1, t0, q, d0); 433 if (t1 > n1 || (t1 == n1 && t0 > n0)) 434 { 435 ASSERT (q > 0); 436 q--; 437 sub_ddmmss (t1, t0, t1, t0, d1, d0); 438 } 439 sub_ddmmss (n1, n0, n1, n0, t1, t0); 440 441 /* Undo normalization */ 442 rp[0] = (n0 >> c) | (n1 << (GMP_LIMB_BITS - c)); 443 rp[1] = n1 >> c; 444 445 return q; 446 } 447 else 448 { 449 mp_limb_t q, t1, t0; 450 n1 = rq.d0; 451 q = rq.d1; 452 umul_ppmm (t1, t0, q, d0); 453 if (UNLIKELY (t1 >= n1) && (t1 > n1 || t0 > n0)) 454 { 455 ASSERT (q > 0); 456 q--; 457 sub_ddmmss (t1, t0, t1, t0, d1, d0); 458 } 459 sub_ddmmss (rp[1], rp[0], n1, n0, t1, t0); 460 return q; 461 } 462 } 463 464 #elif HGCD2_DIV2_METHOD == 2 465 466 /* Bit-wise div2. Relies on fast count_leading_zeros. */ 467 static mp_limb_t 468 div2 (mp_ptr rp, 469 mp_limb_t n1, mp_limb_t n0, 470 mp_limb_t d1, mp_limb_t d0) 471 { 472 mp_limb_t q = 0; 473 int ncnt; 474 int dcnt; 475 476 count_leading_zeros (ncnt, n1); 477 count_leading_zeros (dcnt, d1); 478 dcnt -= ncnt; 479 480 d1 = (d1 << dcnt) + (d0 >> 1 >> (GMP_LIMB_BITS - 1 - dcnt)); 481 d0 <<= dcnt; 482 483 do 484 { 485 mp_limb_t mask; 486 q <<= 1; 487 if (UNLIKELY (n1 == d1)) 488 mask = -(n0 >= d0); 489 else 490 mask = -(n1 > d1); 491 492 q -= mask; 493 494 sub_ddmmss (n1, n0, n1, n0, mask & d1, mask & d0); 495 496 d0 = (d1 << (GMP_LIMB_BITS - 1)) | (d0 >> 1); 497 d1 = d1 >> 1; 498 } 499 while (dcnt--); 500 501 rp[0] = n0; 502 rp[1] = n1; 503 504 return q; 505 } 506 #else 507 #error Unknown HGCD2_DIV2_METHOD 508 #endif 509 510 /* Reduces a,b until |a-b| (almost) fits in one limb + 1 bit. Constructs 511 matrix M. Returns 1 if we make progress, i.e. can perform at least 512 one subtraction. Otherwise returns zero. */ 513 514 /* FIXME: Possible optimizations: 515 516 The div2 function starts with checking the most significant bit of 517 the numerator. We can maintained normalized operands here, call 518 hgcd with normalized operands only, which should make the code 519 simpler and possibly faster. 520 521 Experiment with table lookups on the most significant bits. 522 523 This function is also a candidate for assembler implementation. 524 */ 525 int 526 mpn_hgcd2 (mp_limb_t ah, mp_limb_t al, mp_limb_t bh, mp_limb_t bl, 527 struct hgcd_matrix1 *M) 528 { 529 mp_limb_t u00, u01, u10, u11; 530 531 if (ah < 2 || bh < 2) 532 return 0; 533 534 if (ah > bh || (ah == bh && al > bl)) 535 { 536 sub_ddmmss (ah, al, ah, al, bh, bl); 537 if (ah < 2) 538 return 0; 539 540 u00 = u01 = u11 = 1; 541 u10 = 0; 542 } 543 else 544 { 545 sub_ddmmss (bh, bl, bh, bl, ah, al); 546 if (bh < 2) 547 return 0; 548 549 u00 = u10 = u11 = 1; 550 u01 = 0; 551 } 552 553 if (ah < bh) 554 goto subtract_a; 555 556 for (;;) 557 { 558 ASSERT (ah >= bh); 559 if (ah == bh) 560 goto done; 561 562 if (ah < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2))) 563 { 564 ah = (ah << (GMP_LIMB_BITS / 2) ) + (al >> (GMP_LIMB_BITS / 2)); 565 bh = (bh << (GMP_LIMB_BITS / 2) ) + (bl >> (GMP_LIMB_BITS / 2)); 566 567 break; 568 } 569 570 /* Subtract a -= q b, and multiply M from the right by (1 q ; 0 571 1), affecting the second column of M. */ 572 ASSERT (ah > bh); 573 sub_ddmmss (ah, al, ah, al, bh, bl); 574 575 if (ah < 2) 576 goto done; 577 578 if (ah <= bh) 579 { 580 /* Use q = 1 */ 581 u01 += u00; 582 u11 += u10; 583 } 584 else 585 { 586 mp_limb_t r[2]; 587 mp_limb_t q = div2 (r, ah, al, bh, bl); 588 al = r[0]; ah = r[1]; 589 if (ah < 2) 590 { 591 /* A is too small, but q is correct. */ 592 u01 += q * u00; 593 u11 += q * u10; 594 goto done; 595 } 596 q++; 597 u01 += q * u00; 598 u11 += q * u10; 599 } 600 subtract_a: 601 ASSERT (bh >= ah); 602 if (ah == bh) 603 goto done; 604 605 if (bh < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2))) 606 { 607 ah = (ah << (GMP_LIMB_BITS / 2) ) + (al >> (GMP_LIMB_BITS / 2)); 608 bh = (bh << (GMP_LIMB_BITS / 2) ) + (bl >> (GMP_LIMB_BITS / 2)); 609 610 goto subtract_a1; 611 } 612 613 /* Subtract b -= q a, and multiply M from the right by (1 0 ; q 614 1), affecting the first column of M. */ 615 sub_ddmmss (bh, bl, bh, bl, ah, al); 616 617 if (bh < 2) 618 goto done; 619 620 if (bh <= ah) 621 { 622 /* Use q = 1 */ 623 u00 += u01; 624 u10 += u11; 625 } 626 else 627 { 628 mp_limb_t r[2]; 629 mp_limb_t q = div2 (r, bh, bl, ah, al); 630 bl = r[0]; bh = r[1]; 631 if (bh < 2) 632 { 633 /* B is too small, but q is correct. */ 634 u00 += q * u01; 635 u10 += q * u11; 636 goto done; 637 } 638 q++; 639 u00 += q * u01; 640 u10 += q * u11; 641 } 642 } 643 644 /* NOTE: Since we discard the least significant half limb, we don't get a 645 truly maximal M (corresponding to |a - b| < 2^{GMP_LIMB_BITS +1}). */ 646 /* Single precision loop */ 647 for (;;) 648 { 649 ASSERT (ah >= bh); 650 651 ah -= bh; 652 if (ah < (CNST_LIMB (1) << (GMP_LIMB_BITS / 2 + 1))) 653 break; 654 655 if (ah <= bh) 656 { 657 /* Use q = 1 */ 658 u01 += u00; 659 u11 += u10; 660 } 661 else 662 { 663 mp_double_limb_t rq = div1 (ah, bh); 664 mp_limb_t q = rq.d1; 665 ah = rq.d0; 666 667 if (ah < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2 + 1))) 668 { 669 /* A is too small, but q is correct. */ 670 u01 += q * u00; 671 u11 += q * u10; 672 break; 673 } 674 q++; 675 u01 += q * u00; 676 u11 += q * u10; 677 } 678 subtract_a1: 679 ASSERT (bh >= ah); 680 681 bh -= ah; 682 if (bh < (CNST_LIMB (1) << (GMP_LIMB_BITS / 2 + 1))) 683 break; 684 685 if (bh <= ah) 686 { 687 /* Use q = 1 */ 688 u00 += u01; 689 u10 += u11; 690 } 691 else 692 { 693 mp_double_limb_t rq = div1 (bh, ah); 694 mp_limb_t q = rq.d1; 695 bh = rq.d0; 696 697 if (bh < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2 + 1))) 698 { 699 /* B is too small, but q is correct. */ 700 u00 += q * u01; 701 u10 += q * u11; 702 break; 703 } 704 q++; 705 u00 += q * u01; 706 u10 += q * u11; 707 } 708 } 709 710 done: 711 M->u[0][0] = u00; M->u[0][1] = u01; 712 M->u[1][0] = u10; M->u[1][1] = u11; 713 714 return 1; 715 } 716 717 /* Sets (r;b) = (a;b) M, with M = (u00, u01; u10, u11). Vector must 718 * have space for n + 1 limbs. Uses three buffers to avoid a copy*/ 719 mp_size_t 720 mpn_hgcd_mul_matrix1_vector (const struct hgcd_matrix1 *M, 721 mp_ptr rp, mp_srcptr ap, mp_ptr bp, mp_size_t n) 722 { 723 mp_limb_t ah, bh; 724 725 /* Compute (r,b) <-- (u00 a + u10 b, u01 a + u11 b) as 726 727 r = u00 * a 728 r += u10 * b 729 b *= u11 730 b += u01 * a 731 */ 732 733 #if HAVE_NATIVE_mpn_addaddmul_1msb0 734 ah = mpn_addaddmul_1msb0 (rp, ap, bp, n, M->u[0][0], M->u[1][0]); 735 bh = mpn_addaddmul_1msb0 (bp, bp, ap, n, M->u[1][1], M->u[0][1]); 736 #else 737 ah = mpn_mul_1 (rp, ap, n, M->u[0][0]); 738 ah += mpn_addmul_1 (rp, bp, n, M->u[1][0]); 739 740 bh = mpn_mul_1 (bp, bp, n, M->u[1][1]); 741 bh += mpn_addmul_1 (bp, ap, n, M->u[0][1]); 742 #endif 743 rp[n] = ah; 744 bp[n] = bh; 745 746 n += (ah | bh) > 0; 747 return n; 748 } 749