/* mpn_div_qr_2 -- Divide natural numbers, producing both remainder and
   quotient.  The divisor is two limbs.

   Contributed to the GNU project by Torbjorn Granlund and Niels Möller

   THIS FILE CONTAINS INTERNAL FUNCTIONS WITH MUTABLE INTERFACES.  IT IS ONLY
   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.


Copyright 1993-1996, 1999-2002, 2011, 2017 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */

#include "gmp-impl.h"
#include "longlong.h"

#ifndef DIV_QR_2_PI2_THRESHOLD
/* Disabled unless explicitly tuned. */
#define DIV_QR_2_PI2_THRESHOLD MP_LIMB_T_MAX
#endif
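/* The threshold selects, for a normalized divisor, between the 3/2-inverse
   code path (mpn_div_qr_2n_pi1) used for operand sizes below it and the
   4/2-inverse path (mpn_div_qr_2n_pi2) used at or above it; the default of
   MP_LIMB_T_MAX therefore leaves the latter path unused unless a tuned value
   is supplied.  */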

#ifndef SANITY_CHECK
#define SANITY_CHECK 0
#endif
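/* When SANITY_CHECK is defined to a nonzero value, invert_4by2 below
   cross-checks the inverse it computes using ASSERT_ALWAYS.  */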

/* Define some longlong.h-style macros, but for wider operations.
   * add_sssaaaa is like longlong.h's add_ssaaaa but propagating carry-out into
     an additional sum operand.
   * add_csaac accepts two addends and a carry in, and generates a sum and a
     carry out.  A little like a "full adder".  (Only add_sssaaaa is defined
     in this file.)
*/
#if defined (__GNUC__)  && ! defined (__INTEL_COMPILER) && ! defined (NO_ASM)

#if HAVE_HOST_CPU_FAMILY_x86 && W_TYPE_SIZE == 32
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)				\
  __asm__ ("add\t%7, %k2\n\tadc\t%5, %k1\n\tadc\t$0, %k0"		\
	   : "=r" (s2), "=&r" (s1), "=&r" (s0)				\
	   : "0"  ((USItype)(s2)),					\
	     "1"  ((USItype)(a1)), "g" ((USItype)(b1)),			\
	     "%2" ((USItype)(a0)), "g" ((USItype)(b0)))
#endif

#if defined (__amd64__) && W_TYPE_SIZE == 64
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)				\
  __asm__ ("add\t%7, %q2\n\tadc\t%5, %q1\n\tadc\t$0, %q0"		\
	   : "=r" (s2), "=&r" (s1), "=&r" (s0)				\
	   : "0"  ((UDItype)(s2)),					\
	     "1"  ((UDItype)(a1)), "rme" ((UDItype)(b1)),		\
	     "%2" ((UDItype)(a0)), "rme" ((UDItype)(b0)))
#endif

#if defined (__aarch64__) && W_TYPE_SIZE == 64
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)				\
  __asm__ ("adds\t%2, %x6, %7\n\tadcs\t%1, %x4, %x5\n\tadc\t%0, %3, xzr"\
	   : "=r" (s2), "=&r" (s1), "=&r" (s0)				\
	   : "rZ" (s2), "%rZ"  (a1), "rZ" (b1), "%rZ" (a0), "rI" (b0)	\
	     __CLOBBER_CC)
#endif

#if HAVE_HOST_CPU_FAMILY_powerpc && !defined (_LONG_LONG_LIMB)
/* This works fine for 32-bit and 64-bit limbs, except for 64-bit limbs with a
   processor running in 32-bit mode, since the carry flag then gets the 32-bit
   carry.  */
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)				\
  __asm__ ("add%I7c\t%2,%6,%7\n\tadde\t%1,%4,%5\n\taddze\t%0,%3"	\
	   : "=r" (s2), "=&r" (s1), "=&r" (s0)				\
	   : "r"  (s2), "r"  (a1), "r" (b1), "%r" (a0), "rI" (b0)	\
	     __CLOBBER_CC)
#endif

#endif /* __GNUC__ */

#ifndef add_sssaaaa
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)				\
  do {									\
    UWtype __s0, __s1, __c0, __c1;					\
    __s0 = (a0) + (b0);							\
    __s1 = (a1) + (b1);							\
    __c0 = __s0 < (a0);							\
    __c1 = __s1 < (a1);							\
    (s0) = __s0;							\
    __s1 = __s1 + __c0;							\
    (s1) = __s1;							\
    (s2) += __c1 + (__s1 < __c0);					\
  } while (0)
#endif
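/* In all variants above, the net effect of add_sssaaaa (s2,s1,s0, a1,a0, b1,b0)
   is

     <s2, s1, s0> := s2 * B^2 + <a1, a0> + <b1, b0>,  B = 2^GMP_LIMB_BITS,

   i.e. the two-limb sum <a1, a0> + <b1, b0> is written to <s1, s0> and its
   carry-out is accumulated into s2.  A minimal illustration (hypothetical
   values, not part of the library): starting from s2 = 0,

     add_sssaaaa (s2, s1, s0, CNST_LIMB(1), ~CNST_LIMB(0),
		  CNST_LIMB(0), CNST_LIMB(1));

   adds <1, B-1> and <0, 1>, leaving <s2, s1, s0> = <0, 2, 0>.  */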

/* Typically used with r1, r0 same as n3, n2. Other types of overlap
   between inputs and outputs are not supported. */
#define udiv_qr_4by2(q1,q0, r1,r0, n3,n2,n1,n0, d1,d0, di1,di0)		\
  do {									\
    mp_limb_t _q3, _q2a, _q2, _q1, _q2c, _q1c, _q1d, _q0;		\
    mp_limb_t _t1, _t0;							\
    mp_limb_t _mask;							\
									\
    /* [q3,q2,q1,q0] = [n3,n2]*[di1,di0] + [n3,n2,n1,n0] + [0,1,0,0] */	\
    umul_ppmm (_q2,_q1, n2, di1);					\
    umul_ppmm (_q3,_q2a, n3, di1);					\
    ++_q2;	/* _q2 cannot overflow */				\
    add_ssaaaa (_q3,_q2, _q3,_q2, n3,_q2a);				\
    umul_ppmm (_q2c,_q1c, n3, di0);					\
    add_sssaaaa (_q3,_q2,_q1, _q2,_q1, n2,_q1c);			\
    umul_ppmm (_q1d,_q0, n2, di0);					\
    add_sssaaaa (_q2c,_q1,_q0, _q1,_q0, n1,n0); /* _q2c cannot overflow */ \
    add_sssaaaa (_q3,_q2,_q1, _q2,_q1, _q2c,_q1d);			\
									\
    umul_ppmm (_t1,_t0, _q2, d0);					\
    _t1 += _q2 * d1 + _q3 * d0;						\
									\
    sub_ddmmss (r1, r0, n1, n0, _t1, _t0);				\
									\
    _mask = -(mp_limb_t) ((r1 >= _q1) & ((r1 > _q1) | (r0 >= _q0)));  /* (r1,r0) >= (q1,q0) */  \
    add_ssaaaa (r1, r0, r1, r0, d1 & _mask, d0 & _mask);		\
    sub_ddmmss (_q3, _q2, _q3, _q2, CNST_LIMB(0), -_mask);		\
									\
    if (UNLIKELY (r1 >= d1))						\
      {									\
	if (r1 > d1 || r0 >= d0)					\
	  {								\
	    sub_ddmmss (r1, r0, r1, r0, d1, d0);			\
	    add_ssaaaa (_q3, _q2, _q3, _q2, CNST_LIMB(0), CNST_LIMB(1));\
	  }								\
      }									\
    (q1) = _q3;								\
    (q0) = _q2;								\
  } while (0)
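/* In udiv_qr_4by2 above, with B = 2^GMP_LIMB_BITS, the quotient candidate
   <_q3, _q2> is the high half of

     <n3, n2> * <di1, di0> + <n3, n2, n1, n0> + <0, 1, 0, 0>
       = <n3, n2> * <1, di1, di0> + <n1, n0> + B^2,

   and since <1, di1, di0> = floor ((B^4 - 1) / <d1, d0>) (see invert_4by2
   below), it differs from the true quotient floor (<n3,n2,n1,n0> / <d1,d0>)
   by at most one.  The masked addition of <d1, d0> and the final unlikely
   adjustment then bring the remainder into [0, <d1, d0>) and fix up the
   quotient accordingly.  As used in this file, <n3, n2> is always a remainder
   from the previous step, hence < <d1, d0>, so the quotient fits in two
   limbs.  (Editorial summary of the code above.)  */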

static void
invert_4by2 (mp_ptr di, mp_limb_t d1, mp_limb_t d0)
{
  mp_limb_t v1, v0, p1, t1, t0, p0, mask;
  invert_limb (v1, d1);
  p1 = d1 * v1;
  /* <1, v1> * d1 = <B-1, p1> */
  p1 += d0;
  if (p1 < d0)
    {
      v1--;
      mask = -(mp_limb_t) (p1 >= d1);
      p1 -= d1;
      v1 += mask;
      p1 -= mask & d1;
    }
  /* <1, v1> * d1 + d0 = <B-1, p1> */
  umul_ppmm (t1, p0, d0, v1);
  p1 += t1;
  if (p1 < t1)
    {
      if (UNLIKELY (p1 >= d1))
	{
	  if (p1 > d1 || p0 >= d0)
	    {
	      sub_ddmmss (p1, p0, p1, p0, d1, d0);
	      v1--;
	    }
	}
      sub_ddmmss (p1, p0, p1, p0, d1, d0);
      v1--;
    }
  /* Now v1 is the 3/2 inverse, <1, v1> * <d1, d0> = <B-1, p1, p0>,
   * with <p1, p0> + <d1, d0> >= B^2.
   *
   * The 4/2 inverse is (B^4 - 1) / <d1, d0> = <1, v1, v0>. The
   * partial remainder after <1, v1> is
   *
   * B^4 - 1 - B <1, v1> <d1, d0> = <B-1, B-1, B-1, B-1> - <B-1, p1, p0, 0>
   *                              = <~p1, ~p0, B-1>
   */
  udiv_qr_3by2 (v0, t1, t0, ~p1, ~p0, MP_LIMB_T_MAX, d1, d0, v1);
  di[0] = v0;
  di[1] = v1;

#if SANITY_CHECK
  {
    mp_limb_t tp[4];
    mp_limb_t dp[2];
    dp[0] = d0;
    dp[1] = d1;
    mpn_mul_n (tp, dp, di, 2);
    ASSERT_ALWAYS (mpn_add_n (tp+2, tp+2, dp, 2) == 0);
    ASSERT_ALWAYS (tp[2] == MP_LIMB_T_MAX);
    ASSERT_ALWAYS (tp[3] == MP_LIMB_T_MAX);
    ASSERT_ALWAYS (mpn_add_n (tp, tp, dp, 2) == 1);
  }
#endif
}

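/* Divide {np,nn} by the normalized divisor <d1,d0> (d1 must have its most
   significant bit set), given the 4/2 inverse <di1,di0> computed by
   invert_4by2 above, so that <1, di1, di0> = floor ((B^4 - 1) / <d1, d0>).
   The main loop consumes two dividend limbs per iteration using udiv_qr_4by2;
   a single leftover limb (odd nn) is handled with udiv_qr_3by2.  The nn-2
   least significant quotient limbs are written to qp, the 2-limb remainder to
   rp, and the most significant quotient limb is returned.  */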
static mp_limb_t
mpn_div_qr_2n_pi2 (mp_ptr qp, mp_ptr rp, mp_srcptr np, mp_size_t nn,
		   mp_limb_t d1, mp_limb_t d0, mp_limb_t di1, mp_limb_t di0)
{
  mp_limb_t qh;
  mp_size_t i;
  mp_limb_t r1, r0;

  ASSERT (nn >= 2);
  ASSERT (d1 & GMP_NUMB_HIGHBIT);

  r1 = np[nn-1];
  r0 = np[nn-2];

  qh = 0;
  if (r1 >= d1 && (r1 > d1 || r0 >= d0))
    {
#if GMP_NAIL_BITS == 0
      sub_ddmmss (r1, r0, r1, r0, d1, d0);
#else
      r0 = r0 - d0;
      r1 = r1 - d1 - (r0 >> GMP_LIMB_BITS - 1);
      r0 &= GMP_NUMB_MASK;
#endif
      qh = 1;
    }

  for (i = nn - 2; i >= 2; i -= 2)
    {
      mp_limb_t n1, n0, q1, q0;
      n1 = np[i-1];
      n0 = np[i-2];
      udiv_qr_4by2 (q1, q0, r1, r0, r1, r0, n1, n0, d1, d0, di1, di0);
      qp[i-1] = q1;
      qp[i-2] = q0;
    }

  if (i > 0)
    {
      mp_limb_t q;
      udiv_qr_3by2 (q, r1, r0, r1, r0, np[0], d1, d0, di1);
      qp[0] = q;
    }
  rp[1] = r1;
  rp[0] = r0;

  return qh;
}


/* Divide num {np,nn} by den {dp,2} and write the nn-2 least
   significant quotient limbs at qp and the 2-limb remainder at rp.
   Return the most significant limb of the quotient.

   Preconditions:
   1. qp must either not overlap with the other operands at all, or
      qp >= np + 2 must hold true.  (This means that it's possible to put
      the quotient in the high part of {np,nn}, right above the remainder.)
   2. nn >= 2.  */
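/* A minimal calling sketch (illustrative only, not part of the library):
   dividing a 4-limb number {np,4} by a 2-limb divisor {dp,2}, dp[1] != 0,
   with qp not overlapping np:

     mp_limb_t qp[2], rp[2], qh;
     qh = mpn_div_qr_2 (qp, rp, np, 4, dp);

   afterwards the quotient is qh * B^2 + <qp[1], qp[0]> (B = 2^GMP_LIMB_BITS,
   qh possibly zero) and the remainder is <rp[1], rp[0]>.  */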

mp_limb_t
mpn_div_qr_2 (mp_ptr qp, mp_ptr rp, mp_srcptr np, mp_size_t nn,
	      mp_srcptr dp)
{
  mp_limb_t d1;
  mp_limb_t d0;
  gmp_pi1_t dinv;

  ASSERT (nn >= 2);
  ASSERT (! MPN_OVERLAP_P (qp, nn-2, np, nn) || qp >= np + 2);
  ASSERT_MPN (np, nn);
  ASSERT_MPN (dp, 2);

  d1 = dp[1]; d0 = dp[0];

  ASSERT (d1 > 0);

  if (UNLIKELY (d1 & GMP_NUMB_HIGHBIT))
    {
      if (BELOW_THRESHOLD (nn, DIV_QR_2_PI2_THRESHOLD))
	{
	  invert_pi1 (dinv, d1, d0);
	  return mpn_div_qr_2n_pi1 (qp, rp, np, nn, d1, d0, dinv.inv32);
	}
      else
	{
	  mp_limb_t di[2];
	  invert_4by2 (di, d1, d0);
	  return mpn_div_qr_2n_pi2 (qp, rp, np, nn, d1, d0, di[1], di[0]);
	}
    }
  else
    {
      int shift;
      count_leading_zeros (shift, d1);
      d1 = (d1 << shift) | (d0 >> (GMP_LIMB_BITS - shift));
      d0 <<= shift;
      invert_pi1 (dinv, d1, d0);
      return mpn_div_qr_2u_pi1 (qp, rp, np, nn, d1, d0, shift, dinv.inv32);
    }
}