/* mpn_mu_bdiv_qr(qp,rp,np,nn,dp,dn,tp) -- Compute {np,nn} / {dp,dn} mod B^qn,
   where qn = nn-dn, storing the result in {qp,qn}.  Overlap allowed between Q
   and N; all other overlap disallowed.

   Contributed to the GNU project by Torbjorn Granlund.

   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.

Copyright 2005-2007, 2009, 2010, 2012, 2017 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */


/*
   The idea of the algorithm used herein is to compute a smaller inverted
   value than is used in the standard Barrett algorithm, and thus to save
   time in the Newton iterations, while paying only a small price when
   using the inverted value to develop quotient bits.  This algorithm was
   presented at ICMS 2006.
*/

#include "gmp-impl.h"


/* N = {np,nn}
   D = {dp,dn}

   Requirements: N >= D
		 D >= 1
		 D odd
		 dn >= 2
		 nn >= dn + 2
		 scratch space as determined by mpn_mu_bdiv_qr_itch(nn,dn).

   Write quotient to Q = {qp,nn-dn}.

   FIXME: When iterating, perhaps do the small step before loop, not after.
   FIXME: Try to avoid the scalar divisions when computing inverse size.
   FIXME: Trim allocation for (qn > dn) case, 3*dn might be possible.  In
	  particular, when dn==in, tp and rp could use the same space.
*/
static mp_limb_t
mpn_mu_bdiv_qr_old (mp_ptr qp,
		    mp_ptr rp,
		    mp_srcptr np, mp_size_t nn,
		    mp_srcptr dp, mp_size_t dn,
		    mp_ptr scratch)
{
  mp_size_t qn;
  mp_size_t in;
  mp_limb_t cy, c0;
  mp_size_t tn, wn;

  qn = nn - dn;

  ASSERT (dn >= 2);
  ASSERT (qn >= 2);

  if (qn > dn)
    {
      mp_size_t b;

      /* |_______________________|   dividend
			|________|   divisor  */

#define ip           scratch		/* in */
#define tp           (scratch + in)	/* dn+in or next_size(dn) or rest >= binvert_itch(in) */
#define scratch_out  (scratch + in + tn) /* mulmod_bnm1_itch(next_size(dn)) */

      /* Compute an inverse size that is a nice partition of the quotient.  */
      b = (qn - 1) / dn + 1;	/* ceil(qn/dn), number of blocks */
      in = (qn - 1) / b + 1;	/* ceil(qn/b) = ceil(qn / ceil(qn/dn)) */
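      /* For example, qn = 10 and dn = 4 give b = 3 and in = 4; in general
	 in <= dn, so the quotient is developed in blocks of at most dn
	 limbs.  */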

      /* Some notes on allocation:

	 When in = dn, R dies when mpn_mullo returns; if in < dn, the low in
	 limbs of R die at that point.  We could save memory by letting T live
	 just under R, and letting the upper part of T expand into R.  These
	 changes should reduce itch to perhaps 3dn.
       */

      mpn_binvert (ip, dp, in, tp);

      MPN_COPY (rp, np, dn);
      np += dn;
      cy = 0;

      while (qn > in)
	{
	  mpn_mullo_n (qp, rp, ip, in);

	  if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
	    mpn_mul (tp, dp, dn, qp, in);	/* mulhi, need tp[dn+in-1...in] */
	  else
	    {
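	      /* Here the full product dp*qp has dn+in limbs, but
		 mpn_mulmod_bnm1 computes it only mod B^tn - 1, so its top
		 wn limbs wrap around into the low limbs.  Since D*Q == R
		 mod B^in, the true low limbs are already in rp; subtracting
		 them recovers the wrapped top limbs into tp[tn..tn+wn-1],
		 with the borrow propagated from position wn.  */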
	      tn = mpn_mulmod_bnm1_next_size (dn);
	      mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, in, scratch_out);
	      wn = dn + in - tn;		/* number of wrapped limbs */
	      if (wn > 0)
		{
		  c0 = mpn_sub_n (tp + tn, tp, rp, wn);
		  mpn_decr_u (tp + wn, c0);
		}
	    }

	  qp += in;
	  qn -= in;

	  if (dn != in)
	    {
	      /* Subtract tp[dn-1...in] from partial remainder.  */
	      cy += mpn_sub_n (rp, rp + in, tp + in, dn - in);
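	      /* cy can reach 2 here: one borrow carried in from the
		 mpn_sub_nc at the end of the previous iteration and one
		 from the subtraction above.  mpn_sub_nc accepts a carry-in
		 of at most 1, so fold the extra unit into the subtrahend
		 at tp[dn].  */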
	      if (cy == 2)
		{
		  mpn_incr_u (tp + dn, 1);
		  cy = 1;
		}
	    }
	  /* Subtract tp[dn+in-1...dn] from dividend.  */
	  cy = mpn_sub_nc (rp + dn - in, np, tp + dn, in, cy);
	  np += in;
	}

      /* Generate last qn limbs.  */
      mpn_mullo_n (qp, rp, ip, qn);

      if (BELOW_THRESHOLD (qn, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
	mpn_mul (tp, dp, dn, qp, qn);		/* mulhi, need tp[qn+in-1...in] */
      else
	{
	  tn = mpn_mulmod_bnm1_next_size (dn);
	  mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, qn, scratch_out);
	  wn = dn + qn - tn;			/* number of wrapped limbs */
	  if (wn > 0)
	    {
	      c0 = mpn_sub_n (tp + tn, tp, rp, wn);
	      mpn_decr_u (tp + wn, c0);
	    }
	}

      if (dn != qn)
	{
	  cy += mpn_sub_n (rp, rp + qn, tp + qn, dn - qn);
	  if (cy == 2)
	    {
	      mpn_incr_u (tp + dn, 1);
	      cy = 1;
	    }
	}
      return mpn_sub_nc (rp + dn - qn, np, tp + dn, qn, cy);

#undef ip
#undef tp
#undef scratch_out
    }
  else
    {
      /* |_______________________|   dividend
		|________________|   divisor  */

#define ip           scratch		/* in */
#define tp           (scratch + in)	/* dn+in or next_size(dn) or rest >= binvert_itch(in) */
#define scratch_out  (scratch + in + tn) /* mulmod_bnm1_itch(next_size(dn)) */

      /* Compute half-sized inverse.  */
      in = qn - (qn >> 1);
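      /* in = ceil(qn/2), so qn - in = floor(qn/2) <= in and the same
	 inverse serves both mullo steps below; e.g. qn = 7 gives in = 4
	 and leaves 3 high quotient limbs.  */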

      mpn_binvert (ip, dp, in, tp);

      mpn_mullo_n (qp, np, ip, in);		/* low `in' quotient limbs */

      if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
	mpn_mul (tp, dp, dn, qp, in);		/* mulhigh */
      else
	{
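	  /* Same wrapped-limb recovery as in the qn > dn branch, except
	     that the known low product limbs come from np.  */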
	  tn = mpn_mulmod_bnm1_next_size (dn);
	  mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, in, scratch_out);
	  wn = dn + in - tn;			/* number of wrapped limbs */
	  if (wn > 0)
	    {
	      c0 = mpn_sub_n (tp + tn, tp, np, wn);
	      mpn_decr_u (tp + wn, c0);
	    }
	}

      qp += in;
      qn -= in;

      cy = mpn_sub_n (rp, np + in, tp + in, dn);
      mpn_mullo_n (qp, rp, ip, qn);		/* high qn quotient limbs */

      if (BELOW_THRESHOLD (qn, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
	mpn_mul (tp, dp, dn, qp, qn);		/* mulhigh */
      else
	{
	  tn = mpn_mulmod_bnm1_next_size (dn);
	  mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, qn, scratch_out);
	  wn = dn + qn - tn;			/* number of wrapped limbs */
	  if (wn > 0)
	    {
	      c0 = mpn_sub_n (tp + tn, tp, rp, wn);
	      mpn_decr_u (tp + wn, c0);
	    }
	}

      cy += mpn_sub_n (rp, rp + qn, tp + qn, dn - qn);
      if (cy == 2)
	{
	  mpn_incr_u (tp + dn, 1);
	  cy = 1;
	}
      return mpn_sub_nc (rp + dn - qn, np + dn + in, tp + dn, qn, cy);

#undef ip
#undef tp
#undef scratch_out
    }
}

mp_limb_t
mpn_mu_bdiv_qr (mp_ptr qp,
		mp_ptr rp,
		mp_srcptr np, mp_size_t nn,
		mp_srcptr dp, mp_size_t dn,
		mp_ptr scratch)
{
  mp_limb_t cy = mpn_mu_bdiv_qr_old (qp, rp, np, nn, dp, dn, scratch);

  /* R' B^{qn} = U - Q' D
   *
   * Q = B^{qn} - Q' (assuming Q' != 0)
   *
   * R B^{qn} = U + Q D = U + B^{qn} D - Q' D
   *          = B^{qn} (D + R')
   */

  if (UNLIKELY (!mpn_neg (qp, qp, nn - dn)))
    {
      /* Zero quotient: Q' == 0, i.e. B^{qn} divides U, so R = R' and no
	 adjustment is needed.  */
      ASSERT (cy == 0);
      return 0;
    }
  else
    {
      mp_limb_t cy2 = mpn_add_n (rp, rp, dp, dn);
      ASSERT (cy2 >= cy);

      return cy2 - cy;
    }
}


mp_size_t
mpn_mu_bdiv_qr_itch (mp_size_t nn, mp_size_t dn)
{
  mp_size_t qn, in, tn, itch_binvert, itch_out, itches;
  mp_size_t b;

  ASSERT_ALWAYS (DC_BDIV_Q_THRESHOLD < MU_BDIV_Q_THRESHOLD);

  qn = nn - dn;

  if (qn > dn)
    {
      b = (qn - 1) / dn + 1;	/* ceil(qn/dn), number of blocks */
      in = (qn - 1) / b + 1;	/* ceil(qn/b) = ceil(qn / ceil(qn/dn)) */
    }
  else
    {
      in = qn - (qn >> 1);
    }

  if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
    {
      tn = dn + in;
      itch_out = 0;
    }
  else
    {
      tn = mpn_mulmod_bnm1_next_size (dn);
      itch_out = mpn_mulmod_bnm1_itch (tn, dn, in);
    }

  itch_binvert = mpn_binvert_itch (in);
  itches = tn + itch_out;
  return in + MAX (itches, itch_binvert);
}
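
/* Usage sketch (illustrative only; these functions are internal and
   reachable only through documented interfaces).  Assuming gmp-impl.h's
   TMP allocation macros and operands satisfying the requirements above
   (D odd, dn >= 2, nn >= dn + 2), a caller might do:

     mp_limb_t rh;
     mp_ptr scratch;
     TMP_DECL;

     TMP_MARK;
     scratch = TMP_ALLOC_LIMBS (mpn_mu_bdiv_qr_itch (nn, dn));
     rh = mpn_mu_bdiv_qr (qp, rp, np, nn, dp, dn, scratch);
     TMP_FREE;
*/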