xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/generic/div_qr_2.c (revision 479d8f7d843cc1b22d497efdf1f27a50ee8418d4)
/* mpn_div_qr_2 -- Divide natural numbers, producing both remainder and
   quotient.  The divisor is two limbs.

   Contributed to the GNU project by Torbjorn Granlund and Niels Möller

   THIS FILE CONTAINS INTERNAL FUNCTIONS WITH MUTABLE INTERFACES.  IT IS
   ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS
   ALMOST GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP
   RELEASE.


Copyright 1993, 1994, 1995, 1996, 1999, 2000, 2001, 2002, 2011 Free Software
Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at your
option) any later version.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */

#include "gmp.h"
#include "gmp-impl.h"
#include "longlong.h"

#ifndef DIV_QR_2_PI2_THRESHOLD
/* Disabled unless explicitly tuned. */
#define DIV_QR_2_PI2_THRESHOLD MP_LIMB_T_MAX
#endif

#ifndef SANITY_CHECK
#define SANITY_CHECK 0
#endif

/* Define some longlong.h-style macros, but for wider operations.
   * add_sssaaaa is like longlong.h's add_ssaaaa but propagates the
     carry-out into an additional sum operand.
   * add_csaac accepts two addends and a carry in, and generates a sum
     and a carry out.  A little like a "full adder".
*/
#if defined (__GNUC__)  && ! defined (__INTEL_COMPILER)

#if (defined (__i386__) || defined (__i486__)) && W_TYPE_SIZE == 32
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)				\
  __asm__ ("add\t%7, %k2\n\tadc\t%5, %k1\n\tadc\t$0, %k0"		\
	   : "=r" (s2), "=&r" (s1), "=&r" (s0)				\
	   : "0"  ((USItype)(s2)),					\
	     "1"  ((USItype)(a1)), "g" ((USItype)(b1)),			\
	     "%2" ((USItype)(a0)), "g" ((USItype)(b0)))
#define add_csaac(co, s, a, b, ci)					\
  __asm__ ("bt\t$0, %2\n\tadc\t%5, %k1\n\tadc\t%k0, %k0"		\
	   : "=r" (co), "=r" (s)					\
	   : "rm"  ((USItype)(ci)), "0" (CNST_LIMB(0)),			\
	     "%1" ((USItype)(a)), "g" ((USItype)(b)))
#endif

#if defined (__amd64__) && W_TYPE_SIZE == 64
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)				\
  __asm__ ("add\t%7, %q2\n\tadc\t%5, %q1\n\tadc\t$0, %q0"		\
	   : "=r" (s2), "=&r" (s1), "=&r" (s0)				\
	   : "0"  ((UDItype)(s2)),					\
	     "1"  ((UDItype)(a1)), "rme" ((UDItype)(b1)),		\
	     "%2" ((UDItype)(a0)), "rme" ((UDItype)(b0)))
#define add_csaac(co, s, a, b, ci)					\
  __asm__ ("bt\t$0, %2\n\tadc\t%5, %q1\n\tadc\t%q0, %q0"		\
	   : "=r" (co), "=r" (s)					\
	   : "rm"  ((UDItype)(ci)), "0" (CNST_LIMB(0)),			\
	     "%1" ((UDItype)(a)), "g" ((UDItype)(b)))
#endif

#if HAVE_HOST_CPU_FAMILY_powerpc && !defined (_LONG_LONG_LIMB)
/* This works fine for 32-bit and 64-bit limbs, except for 64-bit limbs with a
   processor running in 32-bit mode, since the carry flag then gets the 32-bit
   carry.  */
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)				\
  __asm__ ("add%I7c\t%2,%6,%7\n\tadde\t%1,%4,%5\n\taddze\t%0,%0"	\
	   : "=r" (s2), "=&r" (s1), "=&r" (s0)				\
	   : "r"  (s2), "r"  (a1), "r" (b1), "%r" (a0), "rI" (b0))
#endif

#endif /* __GNUC__ */

#ifndef add_sssaaaa
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)				\
  do {									\
    UWtype __s0, __s1, __c0, __c1;					\
    __s0 = (a0) + (b0);							\
    __s1 = (a1) + (b1);							\
    __c0 = __s0 < (a0);							\
    __c1 = __s1 < (a1);							\
    (s0) = __s0;							\
    __s1 = __s1 + __c0;							\
    (s1) = __s1;							\
    (s2) += __c1 + (__s1 < __c0);					\
  } while (0)
#endif

#ifndef add_csaac
#define add_csaac(co, s, a, b, ci)					\
  do {									\
    UWtype __s, __c;							\
    __s = (a) + (b);							\
    __c = __s < (a);							\
    __s = __s + (ci);							\
    (s) = __s;								\
    (co) = __c + (__s < (ci));						\
  } while (0)
#endif
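
/* Illustrative sketch (hypothetical helper, kept under "#if 0" so it is
   never compiled): the macros above behave as

     add_sssaaaa:  <s2,s1,s0> <- <s2,a1,a0> + <b1,b0>   (carry folded into s2)
     add_csaac:    <co,s>     <- a + b + ci             (one-limb full adder)

   and the checks below spell out that carry behaviour for the generic C
   versions.  */
#if 0
static void
example_add_sssaaaa_csaac (void)
{
  mp_limb_t s2, s1, s0, co, s;

  /* <0, B-1, B-1> + <1, 1>: both limb additions carry, giving <1, 1, 0>.  */
  s2 = 0;
  add_sssaaaa (s2, s1, s0, MP_LIMB_T_MAX, MP_LIMB_T_MAX,
	       CNST_LIMB(1), CNST_LIMB(1));
  ASSERT_ALWAYS (s2 == 1 && s1 == 1 && s0 == 0);

  /* (B-1) + 0 + carry-in 1 wraps to 0 and produces a carry out.  */
  add_csaac (co, s, MP_LIMB_T_MAX, CNST_LIMB(0), CNST_LIMB(1));
  ASSERT_ALWAYS (co == 1 && s == 0);
}
#endif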

/* Typically used with r1, r0 same as n3, n2. Other types of overlap
   between inputs and outputs not supported. */
#define udiv_qr_4by2(q1,q0, r1,r0, n3,n2,n1,n0, d1,d0, di1,di0)		\
  do {									\
    mp_limb_t _q3, _q2a, _q2, _q1, _q2c, _q1c, _q1d, _q0;		\
    mp_limb_t _t1, _t0;							\
    mp_limb_t _c, _mask;						\
									\
    umul_ppmm (_q3,_q2a, n3, di1);					\
    umul_ppmm (_q2,_q1, n2, di1);					\
    umul_ppmm (_q2c,_q1c, n3, di0);					\
    add_sssaaaa (_q3,_q2,_q1, _q2,_q1, _q2c,_q1c);			\
    umul_ppmm (_q1d,_q0, n2, di0);					\
    add_sssaaaa (_q3,_q2,_q1, _q2,_q1, _q2a,_q1d);			\
									\
    add_ssaaaa (r1, r0, n3, n2, CNST_LIMB(0), CNST_LIMB(1));		\
    				  /* FIXME: combine as in x86_64 asm */	\
									\
    /* [q3,q2,q1,q0] += [r1,r0,n1,n0], i.e. [n3,n2,n1,n0] + [0,1,0,0] */ \
    add_csaac (_c, _q0, _q0, n0, CNST_LIMB(0));				\
    add_csaac (_c, _q1, _q1, n1, _c);					\
    add_csaac (_c, _q2, _q2, r0, _c);					\
    _q3 = _q3 + r1 + _c;						\
									\
    umul_ppmm (_t1,_t0, _q2, d0);					\
    _t1 += _q2 * d1 + _q3 * d0;						\
									\
    sub_ddmmss (r1, r0, n1, n0, _t1, _t0);				\
									\
    _mask = -(mp_limb_t) (r1 >= _q1 & (r1 > _q1 | r0 >= _q0));  /* (r1,r0) >= (q1,q0) */  \
    add_ssaaaa (r1, r0, r1, r0, d1 & _mask, d0 & _mask);		\
    sub_ddmmss (_q3, _q2, _q3, _q2, CNST_LIMB(0), -_mask);		\
									\
    if (UNLIKELY (r1 >= d1))						\
      {									\
	if (r1 > d1 || r0 >= d0)					\
	  {								\
	    sub_ddmmss (r1, r0, r1, r0, d1, d0);			\
	    add_ssaaaa (_q3, _q2, _q3, _q2, CNST_LIMB(0), CNST_LIMB(1));\
	  }								\
      }									\
    (q1) = _q3;								\
    (q0) = _q2;								\
  } while (0)
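
/* A hedged sketch of the contract of udiv_qr_4by2: for a normalized divisor
   <d1,d0> (high bit of d1 set), a numerator with <n3,n2> < <d1,d0>, and
   <di1,di0> produced by invert_4by2 below, it delivers <q1,q0> and <r1,r0>
   with

     <n3,n2,n1,n0> = <q1,q0> * <d1,d0> + <r1,r0>,   <r1,r0> < <d1,d0>.

   The hypothetical checker below (never compiled, "#if 0") restates that
   identity with mpn_mul_n and mpn_add.  */
#if 0
static void
check_udiv_qr_4by2 (mp_limb_t n3, mp_limb_t n2, mp_limb_t n1, mp_limb_t n0,
		    mp_limb_t d1, mp_limb_t d0, mp_limb_t di1, mp_limb_t di0)
{
  mp_limb_t q1, q0, r1, r0;
  mp_limb_t qp[2], dp[2], rp[2], tp[4];

  udiv_qr_4by2 (q1, q0, r1, r0, n3, n2, n1, n0, d1, d0, di1, di0);

  qp[0] = q0; qp[1] = q1;
  dp[0] = d0; dp[1] = d1;
  rp[0] = r0; rp[1] = r1;
  mpn_mul_n (tp, qp, dp, 2);				/* tp = q * d, four limbs */
  ASSERT_ALWAYS (mpn_add (tp, tp, 4, rp, 2) == 0);	/* tp += r, no carry out */
  ASSERT_ALWAYS (tp[0] == n0 && tp[1] == n1 && tp[2] == n2 && tp[3] == n3);
  ASSERT_ALWAYS (r1 < d1 || (r1 == d1 && r0 < d0));	/* remainder < divisor */
}
#endif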

static void
invert_4by2 (mp_ptr di, mp_limb_t d1, mp_limb_t d0)
{
  mp_limb_t v1, v0, p1, t1, t0, p0, mask;
  invert_limb (v1, d1);
  p1 = d1 * v1;
  /* <1, v1> * d1 = <B-1, p1> */
  p1 += d0;
  if (p1 < d0)
    {
      v1--;
      mask = -(mp_limb_t) (p1 >= d1);
      p1 -= d1;
      v1 += mask;
      p1 -= mask & d1;
    }
  /* <1, v1> * d1 + d0 = <B-1, p1> */
  umul_ppmm (t1, p0, d0, v1);
  p1 += t1;
  if (p1 < t1)
    {
      if (UNLIKELY (p1 >= d1))
	{
	  if (p1 > d1 || p0 >= d0)
	    {
	      sub_ddmmss (p1, p0, p1, p0, d1, d0);
	      v1--;
	    }
	}
      sub_ddmmss (p1, p0, p1, p0, d1, d0);
      v1--;
    }
  /* Now v1 is the 3/2 inverse, <1, v1> * <d1, d0> = <B-1, p1, p0>,
   * with <p1, p0> + <d1, d0> >= B^2.
   *
   * The 4/2 inverse is (B^4 - 1) / <d1, d0> = <1, v1, v0>. The
   * partial remainder after <1, v1> is
   *
   * B^4 - 1 - B <1, v1> <d1, d0> = <B-1, B-1, B-1, B-1> - <B-1, p1, p0, 0>
   *                              = <~p1, ~p0, B-1>
   */
  udiv_qr_3by2 (v0, t1, t0, ~p1, ~p0, MP_LIMB_T_MAX, d1, d0, v1);
  di[0] = v0;
  di[1] = v1;

#if SANITY_CHECK
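  /* This block checks the defining property of the 4/2 inverse: the product
     <1, di[1], di[0]> * <d1, d0>, with the implicit leading 1, lies in
     [B^4 - <d1, d0>, B^4 - 1], i.e.
     <1, di[1], di[0]> = floor ((B^4 - 1) / <d1, d0>).  */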
  {
    mp_limb_t tp[4];
    mp_limb_t dp[2];
    dp[0] = d0;
    dp[1] = d1;
    mpn_mul_n (tp, dp, di, 2);
    ASSERT_ALWAYS (mpn_add_n (tp+2, tp+2, dp, 2) == 0);
    ASSERT_ALWAYS (tp[2] == MP_LIMB_T_MAX);
    ASSERT_ALWAYS (tp[3] == MP_LIMB_T_MAX);
    ASSERT_ALWAYS (mpn_add_n (tp, tp, dp, 2) == 1);
  }
#endif
}

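/* Divide {np,nn} by the normalized two-limb divisor <d1,d0> (the high bit
   of d1 must be set), using the 4/2 inverse <di1,di0> computed by
   invert_4by2 above.  The main loop consumes two numerator limbs per
   iteration via udiv_qr_4by2, with a final 3/2 step when nn is odd.  The
   nn-2 low quotient limbs are written at qp, the remainder at {rp,2}, and
   the most significant quotient limb (0 or 1) is returned.  */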
static mp_limb_t
mpn_div_qr_2n_pi2 (mp_ptr qp, mp_ptr rp, mp_srcptr np, mp_size_t nn,
		   mp_limb_t d1, mp_limb_t d0, mp_limb_t di1, mp_limb_t di0)
{
  mp_limb_t qh;
  mp_size_t i;
  mp_limb_t r1, r0;

  ASSERT (nn >= 2);
  ASSERT (d1 & GMP_NUMB_HIGHBIT);

  r1 = np[nn-1];
  r0 = np[nn-2];

  qh = 0;
  if (r1 >= d1 && (r1 > d1 || r0 >= d0))
    {
#if GMP_NAIL_BITS == 0
      sub_ddmmss (r1, r0, r1, r0, d1, d0);
#else
      r0 = r0 - d0;
      r1 = r1 - d1 - (r0 >> GMP_LIMB_BITS - 1);
      r0 &= GMP_NUMB_MASK;
#endif
      qh = 1;
    }

  for (i = nn - 2; i >= 2; i -= 2)
    {
      mp_limb_t n1, n0, q1, q0;
      n1 = np[i-1];
      n0 = np[i-2];
      udiv_qr_4by2 (q1, q0, r1, r0, r1, r0, n1, n0, d1, d0, di1, di0);
      qp[i-1] = q1;
      qp[i-2] = q0;
    }

  if (i > 0)
    {
      mp_limb_t q;
      udiv_qr_3by2 (q, r1, r0, r1, r0, np[0], d1, d0, di1);
      qp[0] = q;
    }
  rp[1] = r1;
  rp[0] = r0;

  return qh;
}


/* Divide num {np,nn} by den {dp,2} and write the nn-2 least
   significant quotient limbs at qp and the 2-limb remainder at rp.
   Return the most significant limb of the quotient.

   Preconditions:
   1. qp must either not overlap with the input operands at all, or
      qp >= np + 2 must hold true.  (This means that it's possible to put
      the quotient in the high part of {np,nn}, right above the remainder.)
   2. nn >= 2.  */

mp_limb_t
mpn_div_qr_2 (mp_ptr qp, mp_ptr rp, mp_srcptr np, mp_size_t nn,
	      mp_srcptr dp)
{
  mp_limb_t d1;
  mp_limb_t d0;
  gmp_pi1_t dinv;

  ASSERT (nn >= 2);
  ASSERT (! MPN_OVERLAP_P (qp, nn-2, np, nn) || qp >= np + 2);
  ASSERT_MPN (np, nn);
  ASSERT_MPN (dp, 2);

  d1 = dp[1]; d0 = dp[0];

  ASSERT (d1 > 0);

  if (UNLIKELY (d1 & GMP_NUMB_HIGHBIT))
    {
      if (BELOW_THRESHOLD (nn, DIV_QR_2_PI2_THRESHOLD))
	{
	  gmp_pi1_t dinv;
	  invert_pi1 (dinv, d1, d0);
	  return mpn_div_qr_2n_pi1 (qp, rp, np, nn, d1, d0, dinv.inv32);
	}
      else
	{
	  mp_limb_t di[2];
	  invert_4by2 (di, d1, d0);
	  return mpn_div_qr_2n_pi2 (qp, rp, np, nn, d1, d0, di[1], di[0]);
	}
    }
  else
    {
      int shift;
      count_leading_zeros (shift, d1);
      d1 = (d1 << shift) | (d0 >> (GMP_LIMB_BITS - shift));
      d0 <<= shift;
      invert_pi1 (dinv, d1, d0);
      return mpn_div_qr_2u_pi1 (qp, rp, np, nn, d1, d0, shift, dinv.inv32);
    }
}
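
/* Usage sketch (illustrative only, never compiled): divide the 4-limb
   number {np,4} by the 2-limb divisor {dp,2}.  The two low quotient limbs
   are written to qp[0..1], the return value is the high quotient limb, and
   the remainder ends up in rp[0..1].  */
#if 0
static void
example_mpn_div_qr_2 (void)
{
  mp_limb_t np[4] = { 1, 2, 3, 4 };
  mp_limb_t dp[2] = { 5, 6 };		/* dp[1] != 0, as required */
  mp_limb_t qp[3], rp[2];
  mp_limb_t qh;

  qh = mpn_div_qr_2 (qp, rp, np, 4, dp);
  qp[2] = qh;				/* full quotient is {qp,3} */
}
#endif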