/* mpn_div_qr_2 -- Divide natural numbers, producing both remainder and
   quotient.  The divisor is two limbs.

   Contributed to the GNU project by Torbjorn Granlund and Niels Möller

   THIS FILE CONTAINS INTERNAL FUNCTIONS WITH MUTABLE INTERFACES.  IT IS ONLY
   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.


Copyright 1993-1996, 1999-2002, 2011, 2017 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */

#include "gmp-impl.h"
#include "longlong.h"

#ifndef DIV_QR_2_PI2_THRESHOLD
/* Disabled unless explicitly tuned.  */
#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX
#endif
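
/* Example (hypothetical value): a tuned build can enable the pi2 path by
   defining the threshold in the per-CPU gmp-mparam.h, which gmp-impl.h
   includes before this point, e.g.

     #define DIV_QR_2_PI2_THRESHOLD  380

   so that BELOW_THRESHOLD (nn, DIV_QR_2_PI2_THRESHOLD) fails for large nn
   and mpn_div_qr_2n_pi2 is used instead of the pi1 code.  */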

#ifndef SANITY_CHECK
#define SANITY_CHECK 0
#endif

/* Define some longlong.h-style macros, but for wider operations.
 * add_sssaaaa is like longlong.h's add_ssaaaa but propagating carry-out into
   an additional sum operand.
 * add_csaac accepts two addends and a carry in, and generates a sum and a
   carry out.  A little like a "full adder".
 */
#if defined (__GNUC__) && ! defined (__INTEL_COMPILER) && ! defined (NO_ASM)

#if HAVE_HOST_CPU_FAMILY_x86 && W_TYPE_SIZE == 32
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)                         \
  __asm__ ("add\t%7, %k2\n\tadc\t%5, %k1\n\tadc\t$0, %k0"               \
           : "=r" (s2), "=&r" (s1), "=&r" (s0)                          \
           : "0" ((USItype)(s2)),                                       \
             "1" ((USItype)(a1)), "g" ((USItype)(b1)),                  \
             "%2" ((USItype)(a0)), "g" ((USItype)(b0)))
#endif

#if defined (__amd64__) && W_TYPE_SIZE == 64
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)                         \
  __asm__ ("add\t%7, %q2\n\tadc\t%5, %q1\n\tadc\t$0, %q0"               \
           : "=r" (s2), "=&r" (s1), "=&r" (s0)                          \
           : "0" ((UDItype)(s2)),                                       \
             "1" ((UDItype)(a1)), "rme" ((UDItype)(b1)),                \
             "%2" ((UDItype)(a0)), "rme" ((UDItype)(b0)))
#endif

#if defined (__aarch64__) && W_TYPE_SIZE == 64
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)                         \
  __asm__ ("adds\t%2, %x6, %7\n\tadcs\t%1, %x4, %x5\n\tadc\t%0, %3, xzr"\
           : "=r" (s2), "=&r" (s1), "=&r" (s0)                          \
           : "rZ" (s2), "%rZ" (a1), "rZ" (b1), "%rZ" (a0), "rI" (b0)    \
             __CLOBBER_CC)
#endif

#if HAVE_HOST_CPU_FAMILY_powerpc && !defined (_LONG_LONG_LIMB)
/* This works fine for 32-bit and 64-bit limbs, except for 64-bit limbs with a
   processor running in 32-bit mode, since the carry flag then gets the 32-bit
   carry.  */
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)                         \
  __asm__ ("add%I7c\t%2,%6,%7\n\tadde\t%1,%4,%5\n\taddze\t%0,%3"        \
           : "=r" (s2), "=&r" (s1), "=&r" (s0)                          \
           : "r" (s2), "r" (a1), "r" (b1), "%r" (a0), "rI" (b0)         \
             __CLOBBER_CC)
#endif

#endif /* __GNUC__ */

#ifndef add_sssaaaa
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)                         \
  do {                                                                  \
    UWtype __s0, __s1, __c0, __c1;                                      \
    __s0 = (a0) + (b0);                                                 \
    __s1 = (a1) + (b1);                                                 \
    __c0 = __s0 < (a0);                                                 \
    __c1 = __s1 < (a1);                                                 \
    (s0) = __s0;                                                        \
    __s1 = __s1 + __c0;                                                 \
    (s1) = __s1;                                                        \
    (s2) += __c1 + (__s1 < __c0);                                       \
  } while (0)
#endif
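
/* A minimal sketch (not used by the library, assumes no nail bits) of what
   add_sssaaaa computes: the double-limb sum <a1,a0> + <b1,b0> goes to
   <s1,s0> and the carry out of the high limb is added into s2, so that
   afterwards  s2*B^2 + <s1,s0> == s2_in*B^2 + <a1,a0> + <b1,b0>,  with
   B = 2^GMP_LIMB_BITS.  The example is compiled out.  */
#if 0
static void
add_sssaaaa_example (void)
{
  mp_limb_t s2, s1, s0;
  s2 = 0;
  /* <MP_LIMB_T_MAX, MP_LIMB_T_MAX> + <0, 1> = B^2, so the carry reaches s2.  */
  add_sssaaaa (s2, s1, s0,
               MP_LIMB_T_MAX, MP_LIMB_T_MAX, CNST_LIMB (0), CNST_LIMB (1));
  ASSERT_ALWAYS (s2 == 1 && s1 == 0 && s0 == 0);
}
#endif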

/* Typically used with r1, r0 same as n3, n2.  Other types of overlap
   between inputs and outputs are not supported.  */
#define udiv_qr_4by2(q1,q0, r1,r0, n3,n2,n1,n0, d1,d0, di1,di0)         \
  do {                                                                  \
    mp_limb_t _q3, _q2a, _q2, _q1, _q2c, _q1c, _q1d, _q0;               \
    mp_limb_t _t1, _t0;                                                 \
    mp_limb_t _mask;                                                    \
                                                                        \
    /* [q3,q2,q1,q0] = [n3,n2]*[di1,di0] + [n3,n2,n1,n0] + [0,1,0,0] */ \
    umul_ppmm (_q2,_q1, n2, di1);                                       \
    umul_ppmm (_q3,_q2a, n3, di1);                                      \
    ++_q2;   /* _q2 cannot overflow */                                  \
    add_ssaaaa (_q3,_q2, _q3,_q2, n3,_q2a);                             \
    umul_ppmm (_q2c,_q1c, n3, di0);                                     \
    add_sssaaaa (_q3,_q2,_q1, _q2,_q1, n2,_q1c);                        \
    umul_ppmm (_q1d,_q0, n2, di0);                                      \
    add_sssaaaa (_q2c,_q1,_q0, _q1,_q0, n1,n0); /* _q2c cannot overflow */ \
    add_sssaaaa (_q3,_q2,_q1, _q2,_q1, _q2c,_q1d);                      \
                                                                        \
    umul_ppmm (_t1,_t0, _q2, d0);                                       \
    _t1 += _q2 * d1 + _q3 * d0;                                         \
                                                                        \
    sub_ddmmss (r1, r0, n1, n0, _t1, _t0);                              \
                                                                        \
    _mask = -(mp_limb_t) ((r1 >= _q1) & ((r1 > _q1) | (r0 >= _q0))); /* (r1,r0) >= (q1,q0) */ \
    add_ssaaaa (r1, r0, r1, r0, d1 & _mask, d0 & _mask);                \
    sub_ddmmss (_q3, _q2, _q3, _q2, CNST_LIMB(0), -_mask);              \
                                                                        \
    if (UNLIKELY (r1 >= d1))                                            \
      {                                                                 \
        if (r1 > d1 || r0 >= d0)                                        \
          {                                                             \
            sub_ddmmss (r1, r0, r1, r0, d1, d0);                        \
            add_ssaaaa (_q3, _q2, _q3, _q2, CNST_LIMB(0), CNST_LIMB(1));\
          }                                                             \
      }                                                                 \
    (q1) = _q3;                                                         \
    (q0) = _q2;                                                         \
  } while (0)
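
/* Sketch of the contract (assuming d1 has its high bit set, <n3,n2> is
   already reduced below <d1,d0>, and <di1,di0> is the two-limb inverse
   computed by invert_4by2 below, i.e. <1,di1,di0> = floor ((B^4 - 1) /
   <d1,d0>)): udiv_qr_4by2 produces <q1,q0> and <r1,r0> with

     <n3,n2,n1,n0> = <q1,q0> * <d1,d0> + <r1,r0>,   0 <= <r1,r0> < <d1,d0>.

   The main loop of mpn_div_qr_2n_pi2 below invokes it as

     udiv_qr_4by2 (q1, q0, r1, r0,  r1, r0, n1, n0,  d1, d0, di1, di0);

   feeding the running remainder back in as the two high numerator limbs.  */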

static void
invert_4by2 (mp_ptr di, mp_limb_t d1, mp_limb_t d0)
{
  mp_limb_t v1, v0, p1, t1, t0, p0, mask;
  invert_limb (v1, d1);
  p1 = d1 * v1;
  /* <1, v1> * d1 = <B-1, p1> */
  p1 += d0;
  if (p1 < d0)
    {
      v1--;
      mask = -(mp_limb_t) (p1 >= d1);
      p1 -= d1;
      v1 += mask;
      p1 -= mask & d1;
    }
  /* <1, v1> * d1 + d0 = <B-1, p1> */
  umul_ppmm (t1, p0, d0, v1);
  p1 += t1;
  if (p1 < t1)
    {
      if (UNLIKELY (p1 >= d1))
        {
          if (p1 > d1 || p0 >= d0)
            {
              sub_ddmmss (p1, p0, p1, p0, d1, d0);
              v1--;
            }
        }
      sub_ddmmss (p1, p0, p1, p0, d1, d0);
      v1--;
    }
  /* Now v1 is the 3/2 inverse, <1, v1> * <d1, d0> = <B-1, p1, p0>,
   * with <p1, p0> + <d1, d0> >= B^2.
   *
   * The 4/2 inverse is (B^4 - 1) / <d1, d0> = <1, v1, v0>.  The
   * partial remainder after <1, v1> is
   *
   * B^4 - 1 - B <1, v1> <d1, d0> = <B-1, B-1, B-1, B-1> - <B-1, p1, p0, 0>
   *                              = <~p1, ~p0, B-1>
   */
  udiv_qr_3by2 (v0, t1, t0, ~p1, ~p0, MP_LIMB_T_MAX, d1, d0, v1);
  di[0] = v0;
  di[1] = v1;

#if SANITY_CHECK
  {
    mp_limb_t tp[4];
    mp_limb_t dp[2];
    dp[0] = d0;
    dp[1] = d1;
    mpn_mul_n (tp, dp, di, 2);
    ASSERT_ALWAYS (mpn_add_n (tp+2, tp+2, dp, 2) == 0);
    ASSERT_ALWAYS (tp[2] == MP_LIMB_T_MAX);
    ASSERT_ALWAYS (tp[3] == MP_LIMB_T_MAX);
    ASSERT_ALWAYS (mpn_add_n (tp, tp, dp, 2) == 1);
  }
#endif
}
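
/* Divide {np, nn} by the normalized two-limb divisor <d1, d0>, using the
   two-limb inverse <di1, di0> computed by invert_4by2.  Write the nn-2 low
   quotient limbs to {qp, nn-2} and the remainder to {rp, 2}; return the most
   significant quotient limb.  The main loop produces two quotient limbs per
   iteration; a final udiv_qr_3by2 step handles the leftover limb when nn is
   odd.  */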
static mp_limb_t
mpn_div_qr_2n_pi2 (mp_ptr qp, mp_ptr rp, mp_srcptr np, mp_size_t nn,
                   mp_limb_t d1, mp_limb_t d0, mp_limb_t di1, mp_limb_t di0)
{
  mp_limb_t qh;
  mp_size_t i;
  mp_limb_t r1, r0;

  ASSERT (nn >= 2);
  ASSERT (d1 & GMP_NUMB_HIGHBIT);

  r1 = np[nn-1];
  r0 = np[nn-2];

  qh = 0;
  if (r1 >= d1 && (r1 > d1 || r0 >= d0))
    {
#if GMP_NAIL_BITS == 0
      sub_ddmmss (r1, r0, r1, r0, d1, d0);
#else
      r0 = r0 - d0;
      r1 = r1 - d1 - (r0 >> GMP_LIMB_BITS - 1);
      r0 &= GMP_NUMB_MASK;
#endif
      qh = 1;
    }

  for (i = nn - 2; i >= 2; i -= 2)
    {
      mp_limb_t n1, n0, q1, q0;
      n1 = np[i-1];
      n0 = np[i-2];
      udiv_qr_4by2 (q1, q0, r1, r0, r1, r0, n1, n0, d1, d0, di1, di0);
      qp[i-1] = q1;
      qp[i-2] = q0;
    }

  if (i > 0)
    {
      mp_limb_t q;
      udiv_qr_3by2 (q, r1, r0, r1, r0, np[0], d1, d0, di1);
      qp[0] = q;
    }
  rp[1] = r1;
  rp[0] = r0;

  return qh;
}


/* Divide num {np,nn} by den {dp,2} and write the nn-2 least
   significant quotient limbs at qp and the 2-limb remainder at rp.
   Return the most significant limb of the quotient.

   Preconditions:
   1. qp must either not overlap with the other operands at all, or
      qp >= np + 2 must hold true.  (This means that it's possible to put
      the quotient in the high part of {np,nn}, right above the remainder.)
   2. nn >= 2.  */

mp_limb_t
mpn_div_qr_2 (mp_ptr qp, mp_ptr rp, mp_srcptr np, mp_size_t nn,
              mp_srcptr dp)
{
  mp_limb_t d1;
  mp_limb_t d0;
  gmp_pi1_t dinv;

  ASSERT (nn >= 2);
  ASSERT (! MPN_OVERLAP_P (qp, nn-2, np, nn) || qp >= np + 2);
  ASSERT_MPN (np, nn);
  ASSERT_MPN (dp, 2);

  d1 = dp[1]; d0 = dp[0];

  ASSERT (d1 > 0);

  if (UNLIKELY (d1 & GMP_NUMB_HIGHBIT))
    {
      if (BELOW_THRESHOLD (nn, DIV_QR_2_PI2_THRESHOLD))
        {
          gmp_pi1_t dinv;
          invert_pi1 (dinv, d1, d0);
          return mpn_div_qr_2n_pi1 (qp, rp, np, nn, d1, d0, dinv.inv32);
        }
      else
        {
          mp_limb_t di[2];
          invert_4by2 (di, d1, d0);
          return mpn_div_qr_2n_pi2 (qp, rp, np, nn, d1, d0, di[1], di[0]);
        }
    }
  else
    {
      int shift;
      count_leading_zeros (shift, d1);
      d1 = (d1 << shift) | (d0 >> (GMP_LIMB_BITS - shift));
      d0 <<= shift;
      invert_pi1 (dinv, d1, d0);
      return mpn_div_qr_2u_pi1 (qp, rp, np, nn, d1, d0, shift, dinv.inv32);
    }
}
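
/* Usage sketch (illustrative only, not part of the library; mpn_div_qr_2 is
   an internal function whose interface may change): dividing a 5-limb
   numerator {np,5} by a 2-limb divisor {dp,2}.  */
#if 0
static void
div_qr_2_example (mp_srcptr np, mp_srcptr dp, mp_ptr qp, mp_ptr rp)
{
  mp_limb_t qh;
  /* Requires dp[1] != 0, and {qp,3} must not overlap {np,5} (or qp >= np + 2).  */
  qh = mpn_div_qr_2 (qp, rp, np, 5, dp);
  /* Now {np,5} = (qh*B^3 + {qp,3}) * {dp,2} + {rp,2}, with {rp,2} < {dp,2}.  */
}
#endif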