/* mpn_div_qr_2 -- Divide natural numbers, producing both remainder and
   quotient.  The divisor is two limbs.

   Contributed to the GNU project by Torbjorn Granlund and Niels Möller

   THIS FILE CONTAINS INTERNAL FUNCTIONS WITH MUTABLE INTERFACES.  IT IS
   ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS
   ALMOST GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP
   RELEASE.

Copyright 1993, 1994, 1995, 1996, 1999, 2000, 2001, 2002, 2011 Free Software
Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at your
option) any later version.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */

#include "gmp.h"
#include "gmp-impl.h"
#include "longlong.h"

#ifndef DIV_QR_2_PI2_THRESHOLD
/* Disabled unless explicitly tuned.  */
#define DIV_QR_2_PI2_THRESHOLD MP_LIMB_T_MAX
#endif

#ifndef SANITY_CHECK
#define SANITY_CHECK 0
#endif

/* Define some longlong.h-style macros, but for wider operations.
 * add_sssaaaa is like longlong.h's add_ssaaaa, but propagates the
   carry-out into an additional sum operand.
 * add_csaac accepts two addends and a carry in, and generates a sum
   and a carry out.  A little like a "full adder".
 */
#if defined (__GNUC__) && ! defined (__INTEL_COMPILER)

#if (defined (__i386__) || defined (__i486__)) && W_TYPE_SIZE == 32
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0) \
  __asm__ ("add\t%7, %k2\n\tadc\t%5, %k1\n\tadc\t$0, %k0" \
	   : "=r" (s2), "=&r" (s1), "=&r" (s0) \
	   : "0" ((USItype)(s2)), \
	     "1" ((USItype)(a1)), "g" ((USItype)(b1)), \
	     "%2" ((USItype)(a0)), "g" ((USItype)(b0)))
#define add_csaac(co, s, a, b, ci) \
  __asm__ ("bt\t$0, %2\n\tadc\t%5, %k1\n\tadc\t%k0, %k0" \
	   : "=r" (co), "=r" (s) \
	   : "rm" ((USItype)(ci)), "0" (CNST_LIMB(0)), \
	     "%1" ((USItype)(a)), "g" ((USItype)(b)))
#endif

#if defined (__amd64__) && W_TYPE_SIZE == 64
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0) \
  __asm__ ("add\t%7, %q2\n\tadc\t%5, %q1\n\tadc\t$0, %q0" \
	   : "=r" (s2), "=&r" (s1), "=&r" (s0) \
	   : "0" ((UDItype)(s2)), \
	     "1" ((UDItype)(a1)), "rme" ((UDItype)(b1)), \
	     "%2" ((UDItype)(a0)), "rme" ((UDItype)(b0)))
#define add_csaac(co, s, a, b, ci) \
  __asm__ ("bt\t$0, %2\n\tadc\t%5, %q1\n\tadc\t%q0, %q0" \
	   : "=r" (co), "=r" (s) \
	   : "rm" ((UDItype)(ci)), "0" (CNST_LIMB(0)), \
	     "%1" ((UDItype)(a)), "g" ((UDItype)(b)))
#endif

#if HAVE_HOST_CPU_FAMILY_powerpc && !defined (_LONG_LONG_LIMB)
/* This works fine for 32-bit and 64-bit limbs, except for 64-bit limbs with a
   processor running in 32-bit mode, since the carry flag then gets the 32-bit
   carry.  */
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0) \
  __asm__ ("add%I7c\t%2,%6,%7\n\tadde\t%1,%4,%5\n\taddze\t%0,%0" \
	   : "=r" (s2), "=&r" (s1), "=&r" (s0) \
	   : "r" (s2), "r" (a1), "r" (b1), "%r" (a0), "rI" (b0))
#endif

#endif /* __GNUC__ */

#ifndef add_sssaaaa
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0) \
  do { \
    UWtype __s0, __s1, __c0, __c1; \
    __s0 = (a0) + (b0); \
    __s1 = (a1) + (b1); \
    __c0 = __s0 < (a0); \
    __c1 = __s1 < (a1); \
    (s0) = __s0; \
    __s1 = __s1 + __c0; \
    (s1) = __s1; \
    (s2) += __c1 + (__s1 < __c0); \
  } while (0)
#endif

#ifndef add_csaac
#define add_csaac(co, s, a, b, ci) \
  do { \
    UWtype __s, __c; \
    __s = (a) + (b); \
    __c = __s < (a); \
    __s = __s + (ci); \
    (s) = __s; \
    (co) = __c + (__s < (ci)); \
  } while (0)
#endif

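/* For reference, the semantics of the two helper macros above (as computed
   by the generic fallbacks and matched by the asm variants), with
   B = 2^GMP_LIMB_BITS:

     add_sssaaaa (s2,s1,s0, a1,a0, b1,b0):
       <s2,s1,s0> <- s2 * B^2 + <a1,a0> + <b1,b0>
     add_csaac (co, s, a, b, ci):
       co * B + s <- a + b + ci          (ci is 0 or 1)

   For example, add_csaac with a = B-1, b = 0, ci = 1 yields s = 0 and
   co = 1, since (B-1) + 0 + 1 = 1 * B + 0.  */
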
/* Typically used with r1, r0 the same as n3, n2.  Other types of overlap
   between inputs and outputs are not supported. */
#define udiv_qr_4by2(q1,q0, r1,r0, n3,n2,n1,n0, d1,d0, di1,di0) \
  do { \
    mp_limb_t _q3, _q2a, _q2, _q1, _q2c, _q1c, _q1d, _q0; \
    mp_limb_t _t1, _t0; \
    mp_limb_t _c, _mask; \
 \
    umul_ppmm (_q3,_q2a, n3, di1); \
    umul_ppmm (_q2,_q1, n2, di1); \
    umul_ppmm (_q2c,_q1c, n3, di0); \
    add_sssaaaa (_q3,_q2,_q1, _q2,_q1, _q2c,_q1c); \
    umul_ppmm (_q1d,_q0, n2, di0); \
    add_sssaaaa (_q3,_q2,_q1, _q2,_q1, _q2a,_q1d); \
 \
    add_ssaaaa (r1, r0, n3, n2, CNST_LIMB(0), CNST_LIMB(1)); \
    /* FIXME: combine as in x86_64 asm */ \
 \
    /* [q3,q2,q1,q0] += [n3,n2,n1,n0] */ \
    add_csaac (_c, _q0, _q0, n0, CNST_LIMB(0)); \
    add_csaac (_c, _q1, _q1, n1, _c); \
    add_csaac (_c, _q2, _q2, r0, _c); \
    _q3 = _q3 + r1 + _c; \
 \
    umul_ppmm (_t1,_t0, _q2, d0); \
    _t1 += _q2 * d1 + _q3 * d0; \
 \
    sub_ddmmss (r1, r0, n1, n0, _t1, _t0); \
 \
    _mask = -(mp_limb_t) (r1 >= _q1 & (r1 > _q1 | r0 >= _q0)); /* (r1,r0) >= (q1,q0) */ \
    add_ssaaaa (r1, r0, r1, r0, d1 & _mask, d0 & _mask); \
    sub_ddmmss (_q3, _q2, _q3, _q2, CNST_LIMB(0), -_mask); \
 \
    if (UNLIKELY (r1 >= d1)) \
      { \
	if (r1 > d1 || r0 >= d0) \
	  { \
	    sub_ddmmss (r1, r0, r1, r0, d1, d0); \
	    add_ssaaaa (_q3, _q2, _q3, _q2, CNST_LIMB(0), CNST_LIMB(1)); \
	  } \
      } \
    (q1) = _q3; \
    (q0) = _q2; \
  } while (0)

static void
invert_4by2 (mp_ptr di, mp_limb_t d1, mp_limb_t d0)
{
  mp_limb_t v1, v0, p1, t1, t0, p0, mask;
  invert_limb (v1, d1);
  p1 = d1 * v1;
  /* <1, v1> * d1 = <B-1, p1> */
  p1 += d0;
  if (p1 < d0)
    {
      v1--;
      mask = -(mp_limb_t) (p1 >= d1);
      p1 -= d1;
      v1 += mask;
      p1 -= mask & d1;
    }
  /* <1, v1> * d1 + d0 = <B-1, p1> */
  umul_ppmm (t1, p0, d0, v1);
  p1 += t1;
  if (p1 < t1)
    {
      if (UNLIKELY (p1 >= d1))
	{
	  if (p1 > d1 || p0 >= d0)
	    {
	      sub_ddmmss (p1, p0, p1, p0, d1, d0);
	      v1--;
	    }
	}
      sub_ddmmss (p1, p0, p1, p0, d1, d0);
      v1--;
    }
  /* Now v1 is the 3/2 inverse, <1, v1> * <d1, d0> = <B-1, p1, p0>,
   * with <p1, p0> + <d1, d0> >= B^2.
   *
   * The 4/2 inverse is (B^4 - 1) / <d1, d0> = <1, v1, v0>.  The
   * partial remainder after <1, v1> is
   *
   *   B^4 - 1 - B <1, v1> <d1, d0> = <B-1, B-1, B-1, B-1> - <B-1, p1, p0, 0>
   *                                = <~p1, ~p0, B-1>
   */
  udiv_qr_3by2 (v0, t1, t0, ~p1, ~p0, MP_LIMB_T_MAX, d1, d0, v1);
  di[0] = v0;
  di[1] = v1;

#if SANITY_CHECK
  {
    mp_limb_t tp[4];
    mp_limb_t dp[2];
    dp[0] = d0;
    dp[1] = d1;
    mpn_mul_n (tp, dp, di, 2);
    ASSERT_ALWAYS (mpn_add_n (tp+2, tp+2, dp, 2) == 0);
    ASSERT_ALWAYS (tp[2] == MP_LIMB_T_MAX);
    ASSERT_ALWAYS (tp[3] == MP_LIMB_T_MAX);
    ASSERT_ALWAYS (mpn_add_n (tp, tp, dp, 2) == 1);
  }
#endif
}

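/* Divide {np,nn} by the normalized divisor <d1,d0> (d1 has its most
   significant bit set), using the two-limb inverse <di1,di0> computed by
   invert_4by2 above.  Writes the nn-2 low quotient limbs to {qp, nn-2},
   stores the two-limb remainder in rp[1], rp[0], and returns the high
   quotient limb.  Numerator limbs are consumed two at a time via
   udiv_qr_4by2, with a final udiv_qr_3by2 step when nn is odd.  */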
static mp_limb_t
mpn_div_qr_2n_pi2 (mp_ptr qp, mp_ptr rp, mp_srcptr np, mp_size_t nn,
		   mp_limb_t d1, mp_limb_t d0, mp_limb_t di1, mp_limb_t di0)
{
  mp_limb_t qh;
  mp_size_t i;
  mp_limb_t r1, r0;

  ASSERT (nn >= 2);
  ASSERT (d1 & GMP_NUMB_HIGHBIT);

  r1 = np[nn-1];
  r0 = np[nn-2];

  qh = 0;
  if (r1 >= d1 && (r1 > d1 || r0 >= d0))
    {
#if GMP_NAIL_BITS == 0
      sub_ddmmss (r1, r0, r1, r0, d1, d0);
#else
      r0 = r0 - d0;
      r1 = r1 - d1 - (r0 >> (GMP_LIMB_BITS - 1));
      r0 &= GMP_NUMB_MASK;
#endif
      qh = 1;
    }

  for (i = nn - 2; i >= 2; i -= 2)
    {
      mp_limb_t n1, n0, q1, q0;
      n1 = np[i-1];
      n0 = np[i-2];
      udiv_qr_4by2 (q1, q0, r1, r0, r1, r0, n1, n0, d1, d0, di1, di0);
      qp[i-1] = q1;
      qp[i-2] = q0;
    }

  if (i > 0)
    {
      mp_limb_t q;
      udiv_qr_3by2 (q, r1, r0, r1, r0, np[0], d1, d0, di1);
      qp[0] = q;
    }
  rp[1] = r1;
  rp[0] = r0;

  return qh;
}


/* Divide num {np,nn} by den {dp,2} and write the nn-2 least
   significant quotient limbs at qp and the 2-limb remainder at rp.
   Return the most significant limb of the quotient.

   Preconditions:
   1. qp must either not overlap with the input operands at all, or
      qp >= np + 2 must hold true.  (This means that it's possible to put
      the quotient in the high part of {np,nn}, right above the remainder.)
   2. nn >= 2.  */

mp_limb_t
mpn_div_qr_2 (mp_ptr qp, mp_ptr rp, mp_srcptr np, mp_size_t nn,
	      mp_srcptr dp)
{
  mp_limb_t d1;
  mp_limb_t d0;
  gmp_pi1_t dinv;

  ASSERT (nn >= 2);
  ASSERT (! MPN_OVERLAP_P (qp, nn-2, np, nn) || qp >= np + 2);
  ASSERT_MPN (np, nn);
  ASSERT_MPN (dp, 2);

  d1 = dp[1]; d0 = dp[0];

  ASSERT (d1 > 0);

  if (UNLIKELY (d1 & GMP_NUMB_HIGHBIT))
    {
      if (BELOW_THRESHOLD (nn, DIV_QR_2_PI2_THRESHOLD))
	{
	  gmp_pi1_t dinv;
	  invert_pi1 (dinv, d1, d0);
	  return mpn_div_qr_2n_pi1 (qp, rp, np, nn, d1, d0, dinv.inv32);
	}
      else
	{
	  mp_limb_t di[2];
	  invert_4by2 (di, d1, d0);
	  return mpn_div_qr_2n_pi2 (qp, rp, np, nn, d1, d0, di[1], di[0]);
	}
    }
  else
    {
      int shift;
      count_leading_zeros (shift, d1);
      d1 = (d1 << shift) | (d0 >> (GMP_LIMB_BITS - shift));
      d0 <<= shift;
      invert_pi1 (dinv, d1, d0);
      return mpn_div_qr_2u_pi1 (qp, rp, np, nn, d1, d0, shift, dinv.inv32);
    }
}
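
/* Usage sketch (illustrative only; as noted at the top of this file, these
   are internal interfaces).  Dividing a 4-limb numerator {np,4} by a 2-limb
   divisor {dp,2} with dp[1] != 0, and qp either not overlapping np or
   satisfying qp >= np + 2:

     mp_limb_t qp[2], rp[2];
     mp_limb_t qh = mpn_div_qr_2 (qp, rp, np, 4, dp);

   On return, <qh, qp[1], qp[0]> is the quotient and <rp[1], rp[0]> the
   remainder.  */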