/* mpn_div_q -- division for arbitrary size operands.

   Contributed to the GNU project by Torbjorn Granlund.

   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.

Copyright 2009, 2010, 2015, 2018 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */

#include "gmp-impl.h"
#include "longlong.h"


/* Compute Q = N/D with truncation.
     N = {np,nn}
     D = {dp,dn}
     Q = {qp,nn-dn+1}
     T = {scratch,nn+1} is scratch space
   N and D are both untouched by the computation.
   N and T may overlap; pass the same space if N is irrelevant after the call,
   but note that T needs one limb more than N.

   Operand requirements:
     N >= D > 0
     dp[dn-1] != 0
     No overlap between the N, D, and Q areas.

   This division function does not clobber its input operands, since it is
   intended to support average-O(qn) division, and for that to be effective,
   it cannot require callers to copy an O(nn) operand.

   If a caller does not care about the value of {np,nn+1} after calling this
   function, it should pass np also for the scratch argument.  This function
   will then save some time and space by avoiding allocation and copying.
   (FIXME: Is this a good design?  We only really save any copying for
   already-normalised divisors, which should be rare.  It also prevents us
   from reasonably asking for all the scratch space we need.)

   We write nn-dn+1 limbs for the quotient, but return void.  Why not return
   the most significant quotient limb?  Look at the 4 main code blocks below
   (consisting of an outer if-else where each arm contains an if-else).  It
   is tricky for the first code block, since the mpn_*_div_q calls will
   typically generate all nn-dn+1 quotient limbs and return 0 or 1.  I don't
   see how to fix that unless we generate the most significant quotient limb
   here, before calling mpn_*_div_q, or put the quotient in a temporary area.
   Since this is a critical division case (the SB sub-case in particular),
   copying is not a good idea.

   It might make sense to split the if-else parts of the (qn + FUDGE
   >= dn) blocks into separate functions, since we could promise quite
   different things to callers in these two cases.  The 'then' case
   benefits from np=scratch, and it could perhaps even tolerate qp=np,
   saving some headache for many callers.

   FIXME: Scratch allocation leaves a lot to be desired.  E.g., for MU-sized
   operands, we do not reuse the huge scratch for adjustments.  This can be a
   serious waste of memory for the largest operands.
*/
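
/* Illustrative call sketch (not part of GMP; for exposition only).  A
   caller with operands {np,nn} and {dp,dn} meeting the requirements above
   might invoke this function roughly as follows, assuming a TMP allocation
   context is open:

     mp_size_t qn = nn - dn + 1;
     mp_ptr qp = TMP_ALLOC_LIMBS (qn);
     mp_ptr tp = TMP_ALLOC_LIMBS (nn + 1);
     mpn_div_q (qp, np, nn, dp, dn, tp);

   A caller that no longer needs {np,nn+1} could instead pass np itself as
   the scratch argument, provided one limb of space follows it.  */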

/* FUDGE determines when to try getting an approximate quotient from the
   upper parts of the dividend and divisor, then adjust.  N.B. FUDGE must
   be >= 2 for the code to be correct.  */
#define FUDGE 5			/* FIXME: tune this */
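
/* Worked example (illustrative numbers): nn = 1000, dn = 600 gives
   qn = 401.  Since qn + FUDGE = 406 < dn, we take the second main branch
   below and compute an approximate quotient from the high 2*qn+1 = 803
   limbs of the dividend rather than dividing all 1000 limbs.  */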

#define DC_DIV_Q_THRESHOLD      DC_DIVAPPR_Q_THRESHOLD
#define MU_DIV_Q_THRESHOLD      MU_DIVAPPR_Q_THRESHOLD
#define MUPI_DIV_Q_THRESHOLD  MUPI_DIVAPPR_Q_THRESHOLD
#ifndef MUPI_DIVAPPR_Q_THRESHOLD
#define MUPI_DIVAPPR_Q_THRESHOLD  MUPI_DIV_QR_THRESHOLD
#endif

void
mpn_div_q (mp_ptr qp,
	   mp_srcptr np, mp_size_t nn,
	   mp_srcptr dp, mp_size_t dn, mp_ptr scratch)
{
  mp_ptr new_dp, new_np, tp, rp;
  mp_limb_t cy, dh, qh;
  mp_size_t new_nn, qn;
  gmp_pi1_t dinv;
  int cnt;
  TMP_DECL;
  TMP_MARK;

  ASSERT (nn >= dn);
  ASSERT (dn > 0);
  ASSERT (dp[dn - 1] != 0);
  ASSERT (! MPN_OVERLAP_P (qp, nn - dn + 1, np, nn));
  ASSERT (! MPN_OVERLAP_P (qp, nn - dn + 1, dp, dn));
  ASSERT (MPN_SAME_OR_SEPARATE_P (np, scratch, nn));

  ASSERT_ALWAYS (FUDGE >= 2);

  dh = dp[dn - 1];
  if (dn == 1)
    {
      mpn_divrem_1 (qp, 0L, np, nn, dh);
      return;
    }

  qn = nn - dn + 1;		/* Quotient size, high limb might be zero */

  if (qn + FUDGE >= dn)
    {
      /* |________________________|
                          |_______|  */
      new_np = scratch;

      if (LIKELY ((dh & GMP_NUMB_HIGHBIT) == 0))
	{
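	  /* D's high bit is clear, so normalise: shift both N and D left by
	     cnt bits so that the divisor's most significant bit becomes set,
	     as the quotient routines below require.  */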
	  count_leading_zeros (cnt, dh);

	  cy = mpn_lshift (new_np, np, nn, cnt);
	  new_np[nn] = cy;
	  new_nn = nn + (cy != 0);

	  new_dp = TMP_ALLOC_LIMBS (dn);
	  mpn_lshift (new_dp, dp, dn, cnt);

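	  /* Dispatch on operand sizes: mpn_divrem_2 for two-limb divisors,
	     schoolbook (sbpi1) below the divide-and-conquer threshold,
	     divide-and-conquer (dcpi1) in the middle range, and the
	     inverse-based MU algorithm for the largest operands.  */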
	  if (dn == 2)
	    {
	      qh = mpn_divrem_2 (qp, 0L, new_np, new_nn, new_dp);
	    }
	  else if (BELOW_THRESHOLD (dn, DC_DIV_Q_THRESHOLD) ||
		   BELOW_THRESHOLD (new_nn - dn, DC_DIV_Q_THRESHOLD))
	    {
	      invert_pi1 (dinv, new_dp[dn - 1], new_dp[dn - 2]);
	      qh = mpn_sbpi1_div_q (qp, new_np, new_nn, new_dp, dn, dinv.inv32);
	    }
	  else if (BELOW_THRESHOLD (dn, MUPI_DIV_Q_THRESHOLD) ||   /* fast condition */
		   BELOW_THRESHOLD (nn, 2 * MU_DIV_Q_THRESHOLD) || /* fast condition */
		   (double) (2 * (MU_DIV_Q_THRESHOLD - MUPI_DIV_Q_THRESHOLD)) * dn /* slow... */
		   + (double) MUPI_DIV_Q_THRESHOLD * nn > (double) dn * nn)   /* ...condition */
	    {
	      invert_pi1 (dinv, new_dp[dn - 1], new_dp[dn - 2]);
	      qh = mpn_dcpi1_div_q (qp, new_np, new_nn, new_dp, dn, &dinv);
	    }
	  else
	    {
	      mp_size_t itch = mpn_mu_div_q_itch (new_nn, dn, 0);
	      mp_ptr scratch = TMP_ALLOC_LIMBS (itch);
	      qh = mpn_mu_div_q (qp, new_np, new_nn, new_dp, dn, scratch);
	    }
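	  /* If the left shift overflowed into a carry limb, new_nn exceeded
	     nn and the division above already produced all qn quotient
	     limbs, so qh must be zero; otherwise qh is the still-unwritten
	     most significant quotient limb.  */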
	  if (cy == 0)
	    qp[qn - 1] = qh;
	  else
	    ASSERT (qh == 0);
	}
      else  /* divisor is already normalised */
	{
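	  /* The quotient routines below clobber the dividend area, so copy
	     N into the scratch space unless the caller already passed np as
	     scratch.  */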
	  if (new_np != np)
	    MPN_COPY (new_np, np, nn);

	  if (dn == 2)
	    {
	      qh = mpn_divrem_2 (qp, 0L, new_np, nn, dp);
	    }
	  else if (BELOW_THRESHOLD (dn, DC_DIV_Q_THRESHOLD) ||
		   BELOW_THRESHOLD (nn - dn, DC_DIV_Q_THRESHOLD))
	    {
	      invert_pi1 (dinv, dh, dp[dn - 2]);
	      qh = mpn_sbpi1_div_q (qp, new_np, nn, dp, dn, dinv.inv32);
	    }
	  else if (BELOW_THRESHOLD (dn, MUPI_DIV_Q_THRESHOLD) ||   /* fast condition */
		   BELOW_THRESHOLD (nn, 2 * MU_DIV_Q_THRESHOLD) || /* fast condition */
		   (double) (2 * (MU_DIV_Q_THRESHOLD - MUPI_DIV_Q_THRESHOLD)) * dn /* slow... */
		   + (double) MUPI_DIV_Q_THRESHOLD * nn > (double) dn * nn)   /* ...condition */
	    {
	      invert_pi1 (dinv, dh, dp[dn - 2]);
	      qh = mpn_dcpi1_div_q (qp, new_np, nn, dp, dn, &dinv);
	    }
	  else
	    {
	      mp_size_t itch = mpn_mu_div_q_itch (nn, dn, 0);
	      mp_ptr scratch = TMP_ALLOC_LIMBS (itch);
	      qh = mpn_mu_div_q (qp, np, nn, dp, dn, scratch);
	    }
	  qp[nn - dn] = qh;
	}
    }
  else
    {
      /* |________________________|
                |_________________|  */
      tp = TMP_ALLOC_LIMBS (qn + 1);

      new_np = scratch;
      new_nn = 2 * qn + 1;
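      /* Only the high 2*qn+1 limbs of N can affect the quotient beyond the
	 small divappr error that is corrected below, so we divide that
	 prefix of N by the high qn+1 limbs of D and adjust afterwards.  */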
      if (new_np == np)
	/* We need {np,nn} to remain untouched until the final adjustment, so
	   we need to allocate separate space for new_np.  */
	new_np = TMP_ALLOC_LIMBS (new_nn + 1);

      if (LIKELY ((dh & GMP_NUMB_HIGHBIT) == 0))
	{
	  count_leading_zeros (cnt, dh);

	  cy = mpn_lshift (new_np, np + nn - new_nn, new_nn, cnt);
	  new_np[new_nn] = cy;

	  new_nn += (cy != 0);

	  new_dp = TMP_ALLOC_LIMBS (qn + 1);
	  mpn_lshift (new_dp, dp + dn - (qn + 1), qn + 1, cnt);
	  new_dp[0] |= dp[dn - (qn + 1) - 1] >> (GMP_NUMB_BITS - cnt);
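	  /* The OR above pulls in the top cnt bits of the limb just below
	     the kept divisor prefix, so that {new_dp,qn+1} holds exactly
	     the top qn+1 limbs of the shifted divisor.  */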

	  if (qn + 1 == 2)
	    {
	      qh = mpn_divrem_2 (tp, 0L, new_np, new_nn, new_dp);
	    }
	  else if (BELOW_THRESHOLD (qn, DC_DIVAPPR_Q_THRESHOLD - 1))
	    {
	      invert_pi1 (dinv, new_dp[qn], new_dp[qn - 1]);
	      qh = mpn_sbpi1_divappr_q (tp, new_np, new_nn, new_dp, qn + 1, dinv.inv32);
	    }
	  else if (BELOW_THRESHOLD (qn, MU_DIVAPPR_Q_THRESHOLD - 1))
	    {
	      invert_pi1 (dinv, new_dp[qn], new_dp[qn - 1]);
	      qh = mpn_dcpi1_divappr_q (tp, new_np, new_nn, new_dp, qn + 1, &dinv);
	    }
	  else
	    {
	      mp_size_t itch = mpn_mu_divappr_q_itch (new_nn, qn + 1, 0);
	      mp_ptr scratch = TMP_ALLOC_LIMBS (itch);
	      qh = mpn_mu_divappr_q (tp, new_np, new_nn, new_dp, qn + 1, scratch);
	    }
	  if (cy == 0)
	    tp[qn] = qh;
	  else if (UNLIKELY (qh != 0))
	    {
	      /* This happens only when the quotient is close to B^n and
		 mpn_*_divappr_q returned B^n.  */
	      mp_size_t i, n;
	      n = new_nn - (qn + 1);
	      for (i = 0; i < n; i++)
		tp[i] = GMP_NUMB_MAX;
	      qh = 0;		/* currently ignored */
	    }
	}
      else  /* divisor is already normalised */
	{
	  MPN_COPY (new_np, np + nn - new_nn, new_nn); /* pointless if MU will be used */

	  new_dp = (mp_ptr) dp + dn - (qn + 1);

	  if (qn + 1 == 2)
	    {
	      qh = mpn_divrem_2 (tp, 0L, new_np, new_nn, new_dp);
	    }
	  else if (BELOW_THRESHOLD (qn, DC_DIVAPPR_Q_THRESHOLD - 1))
	    {
	      invert_pi1 (dinv, dh, new_dp[qn - 1]);
	      qh = mpn_sbpi1_divappr_q (tp, new_np, new_nn, new_dp, qn + 1, dinv.inv32);
	    }
	  else if (BELOW_THRESHOLD (qn, MU_DIVAPPR_Q_THRESHOLD - 1))
	    {
	      invert_pi1 (dinv, dh, new_dp[qn - 1]);
	      qh = mpn_dcpi1_divappr_q (tp, new_np, new_nn, new_dp, qn + 1, &dinv);
	    }
	  else
	    {
	      mp_size_t itch = mpn_mu_divappr_q_itch (new_nn, qn + 1, 0);
	      mp_ptr scratch = TMP_ALLOC_LIMBS (itch);
	      qh = mpn_mu_divappr_q (tp, new_np, new_nn, new_dp, qn + 1, scratch);
	    }
	  tp[qn] = qh;
	}

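      /* {tp+1,qn} is now either the exact quotient or one too large: the
	 divappr functions overshoot by at most a few units in the extra
	 low limb tp[0] (whence the tp[0] <= 4 test below).  If tp[0]
	 exceeds that bound, no borrow can reach the limbs we return;
	 otherwise multiply back and compare with N, decrementing Q on
	 overshoot.  */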
      MPN_COPY (qp, tp + 1, qn);
      if (tp[0] <= 4)
	{
	  mp_size_t rn;

	  rp = TMP_ALLOC_LIMBS (dn + qn);
	  mpn_mul (rp, dp, dn, tp + 1, qn);
	  rn = dn + qn;
	  rn -= rp[rn - 1] == 0;

	  if (rn > nn || mpn_cmp (np, rp, nn) < 0)
	    MPN_DECR_U (qp, qn, 1);
	}
    }

  TMP_FREE;
}