1 /* mpn_mu_bdiv_qr(qp,rp,np,nn,dp,dn,tp) -- Compute {np,nn} / {dp,dn} mod B^qn,
2 where qn = nn-dn, storing the result in {qp,qn}. Overlap allowed between Q
3 and N; all other overlap disallowed.
4
5 Contributed to the GNU project by Torbjorn Granlund.
6
7 THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY
8 SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
9 GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.
10
11 Copyright 2005-2007, 2009, 2010, 2012, 2017 Free Software Foundation, Inc.
12
13 This file is part of the GNU MP Library.
14
15 The GNU MP Library is free software; you can redistribute it and/or modify
16 it under the terms of either:
17
18 * the GNU Lesser General Public License as published by the Free
19 Software Foundation; either version 3 of the License, or (at your
20 option) any later version.
21
22 or
23
24 * the GNU General Public License as published by the Free Software
25 Foundation; either version 2 of the License, or (at your option) any
26 later version.
27
28 or both in parallel, as here.
29
30 The GNU MP Library is distributed in the hope that it will be useful, but
31 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
32 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
33 for more details.
34
35 You should have received copies of the GNU General Public License and the
36 GNU Lesser General Public License along with the GNU MP Library. If not,
37 see https://www.gnu.org/licenses/. */
38
39
40 /*
41 The idea of the algorithm used herein is to compute a smaller inverted value
42 than used in the standard Barrett algorithm, and thus save time in the
43 Newton iterations, and pay just a small price when using the inverted value
44 for developing quotient bits. This algorithm was presented at ICMS 2006.
45 */
46
47 #include "gmp-impl.h"
48
49
50 /* N = {np,nn}
51 D = {dp,dn}
52
53 Requirements: N >= D
54 D >= 1
55 D odd
56 dn >= 2
57 nn >= 2
58 scratch space as determined by mpn_mu_bdiv_qr_itch(nn,dn).
59
60 Write quotient to Q = {qp,nn-dn}.
61
62 FIXME: When iterating, perhaps do the small step before loop, not after.
63 FIXME: Try to avoid the scalar divisions when computing inverse size.
64 FIXME: Trim allocation for (qn > dn) case, 3*dn might be possible. In
65 particular, when dn==in, tp and rp could use the same space.
66 */
/* Core routine behind mpn_mu_bdiv_qr: block-wise 2-adic (Hensel) division
   using a precomputed inverse of the low limbs of D.

   Computes Q' = {qp,qn} and R' = {rp,dn} such that U - Q' D = R' B^qn,
   where qn = nn - dn (see the sign-fixup wrapper mpn_mu_bdiv_qr below for
   this convention).  The return value is the borrow out of the final
   subtraction, i.e. the high correction limb of R'.  */
static mp_limb_t
mpn_mu_bdiv_qr_old (mp_ptr qp,
		    mp_ptr rp,
		    mp_srcptr np, mp_size_t nn,
		    mp_srcptr dp, mp_size_t dn,
		    mp_ptr scratch)
{
  mp_size_t qn;
  mp_size_t in;			/* size of the precomputed inverse */
  mp_limb_t cy, c0;
  mp_size_t tn, wn;

  qn = nn - dn;

  ASSERT (dn >= 2);
  ASSERT (qn >= 2);

  if (qn > dn)
    {
      mp_size_t b;

      /* |_______________________|   dividend
			|________|   divisor  */

#define ip           scratch		/* in */
#define tp           (scratch + in)	/* dn+in or next_size(dn) or rest >= binvert_itch(in) */
#define scratch_out  (scratch + in + tn) /* mulmod_bnm1_itch(next_size(dn)) */

      /* Compute an inverse size that is a nice partition of the quotient.  */
      b = (qn - 1) / dn + 1;	/* ceil(qn/dn), number of blocks */
      in = (qn - 1) / b + 1;	/* ceil(qn/b) = ceil(qn / ceil(qn/dn)) */

      /* Some notes on allocation:

	 When in = dn, R dies when mpn_mullo returns, if in < dn the low in
	 limbs of R dies at that point.  We could save memory by letting T live
	 just under R, and let the upper part of T expand into R.  These changes
	 should reduce itch to perhaps 3dn.
       */

      /* {ip,in} <- inverse of the low in limbs of D, mod B^in.  */
      mpn_binvert (ip, dp, in, tp);

      MPN_COPY (rp, np, dn);
      np += dn;
      cy = 0;

      /* Develop in quotient limbs per iteration until fewer than in
	 remain; the tail is handled after the loop.  */
      while (qn > in)
	{
	  /* Next quotient block: Q_i = R * D^{-1} mod B^in.  */
	  mpn_mullo_n (qp, rp, ip, in);

	  if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
	    mpn_mul (tp, dp, dn, qp, in);	/* mulhi, need tp[dn+in-1...in] */
	  else
	    {
	      /* Compute D * Q_i mod B^tn - 1.  Since tn >= dn, only the
		 low wn = dn+in-tn product limbs wrapped around; the true
		 low product limbs equal {rp,in} by construction of Q_i,
		 so the wrapped-off high limbs are recovered by
		 subtracting them back out.  */
	      tn = mpn_mulmod_bnm1_next_size (dn);
	      mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, in, scratch_out);
	      wn = dn + in - tn;		/* number of wrapped limbs */
	      if (wn > 0)
		{
		  c0 = mpn_sub_n (tp + tn, tp, rp, wn);
		  mpn_decr_u (tp + wn, c0);
		}
	    }

	  qp += in;
	  qn -= in;

	  if (dn != in)
	    {
	      /* Subtract tp[dn-1...in] from partial remainder.  */
	      cy += mpn_sub_n (rp, rp + in, tp + in, dn - in);
	      if (cy == 2)
		{
		  /* Two borrows accumulated in cy; fold one into the
		     product's high limbs so a single borrow remains for
		     the mpn_sub_nc below.  */
		  mpn_incr_u (tp + dn, 1);
		  cy = 1;
		}
	    }
	  /* Subtract tp[dn+in-1...dn] from dividend.  */
	  cy = mpn_sub_nc (rp + dn - in, np, tp + dn, in, cy);
	  np += in;
	}

      /* Generate last qn limbs.  */
      mpn_mullo_n (qp, rp, ip, qn);

      if (BELOW_THRESHOLD (qn, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
	mpn_mul (tp, dp, dn, qp, qn);	/* mulhi, need tp[qn+in-1...in] */
      else
	{
	  /* Same wrap-around recovery as in the loop above.  */
	  tn = mpn_mulmod_bnm1_next_size (dn);
	  mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, qn, scratch_out);
	  wn = dn + qn - tn;		/* number of wrapped limbs */
	  if (wn > 0)
	    {
	      c0 = mpn_sub_n (tp + tn, tp, rp, wn);
	      mpn_decr_u (tp + wn, c0);
	    }
	}

      if (dn != qn)
	{
	  cy += mpn_sub_n (rp, rp + qn, tp + qn, dn - qn);
	  if (cy == 2)
	    {
	      /* As above: fold one of two borrows into the product.  */
	      mpn_incr_u (tp + dn, 1);
	      cy = 1;
	    }
	}
      /* Final subtraction; the borrow out is the correction limb of R'.  */
      return mpn_sub_nc (rp + dn - qn, np, tp + dn, qn, cy);

#undef ip
#undef tp
#undef scratch_out
    }
  else
    {
      /* |_______________________|   dividend
		 |________________|   divisor  */

#define ip           scratch		/* in */
#define tp           (scratch + in)	/* dn+in or next_size(dn) or rest >= binvert_itch(in) */
#define scratch_out  (scratch + in + tn) /* mulmod_bnm1_itch(next_size(dn)) */

      /* Compute half-sized inverse.  */
      in = qn - (qn >> 1);	/* ceil(qn/2) */

      mpn_binvert (ip, dp, in, tp);

      mpn_mullo_n (qp, np, ip, in);		/* low `in' quotient limbs */

      if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
	mpn_mul (tp, dp, dn, qp, in);		/* mulhigh */
      else
	{
	  /* Wrap-around recovery; here the true low product limbs equal
	     {np,wn}, since this first quotient block came straight from N.  */
	  tn = mpn_mulmod_bnm1_next_size (dn);
	  mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, in, scratch_out);
	  wn = dn + in - tn;		/* number of wrapped limbs */
	  if (wn > 0)
	    {
	      c0 = mpn_sub_n (tp + tn, tp, np, wn);
	      mpn_decr_u (tp + wn, c0);
	    }
	}

      qp += in;
      qn -= in;

      /* Partial remainder after the first quotient block.  */
      cy = mpn_sub_n (rp, np + in, tp + in, dn);
      mpn_mullo_n (qp, rp, ip, qn);		/* high qn quotient limbs */

      if (BELOW_THRESHOLD (qn, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
	mpn_mul (tp, dp, dn, qp, qn);		/* mulhigh */
      else
	{
	  /* Wrap-around recovery against the partial remainder {rp,wn}.  */
	  tn = mpn_mulmod_bnm1_next_size (dn);
	  mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, qn, scratch_out);
	  wn = dn + qn - tn;		/* number of wrapped limbs */
	  if (wn > 0)
	    {
	      c0 = mpn_sub_n (tp + tn, tp, rp, wn);
	      mpn_decr_u (tp + wn, c0);
	    }
	}

      cy += mpn_sub_n (rp, rp + qn, tp + qn, dn - qn);
      if (cy == 2)
	{
	  /* Fold one of two accumulated borrows into the product.  */
	  mpn_incr_u (tp + dn, 1);
	  cy = 1;
	}
      /* Final subtraction; the borrow out is the correction limb of R'.  */
      return mpn_sub_nc (rp + dn - qn, np + dn + in, tp + dn, qn, cy);

#undef ip
#undef tp
#undef scratch_out
    }
}
244
245 mp_limb_t
mpn_mu_bdiv_qr(mp_ptr qp,mp_ptr rp,mp_srcptr np,mp_size_t nn,mp_srcptr dp,mp_size_t dn,mp_ptr scratch)246 mpn_mu_bdiv_qr (mp_ptr qp,
247 mp_ptr rp,
248 mp_srcptr np, mp_size_t nn,
249 mp_srcptr dp, mp_size_t dn,
250 mp_ptr scratch)
251 {
252 mp_limb_t cy = mpn_mu_bdiv_qr_old (qp, rp, np, nn, dp, dn, scratch);
253
254 /* R' B^{qn} = U - Q' D
255 *
256 * Q = B^{qn} - Q' (assuming Q' != 0)
257 *
258 * R B^{qn} = U + Q D = U + B^{qn} D - Q' D
259 * = B^{qn} D + R'
260 */
261
262 if (UNLIKELY (!mpn_neg (qp, qp, nn - dn)))
263 {
264 /* Zero quotient. */
265 ASSERT (cy == 0);
266 return 0;
267 }
268 else
269 {
270 mp_limb_t cy2 = mpn_add_n (rp, rp, dp, dn);
271 ASSERT (cy2 >= cy);
272
273 return cy2 - cy;
274 }
275 }
276
277
278 mp_size_t
mpn_mu_bdiv_qr_itch(mp_size_t nn,mp_size_t dn)279 mpn_mu_bdiv_qr_itch (mp_size_t nn, mp_size_t dn)
280 {
281 mp_size_t qn, in, tn, itch_binvert, itch_out, itches;
282 mp_size_t b;
283
284 ASSERT_ALWAYS (DC_BDIV_Q_THRESHOLD < MU_BDIV_Q_THRESHOLD);
285
286 qn = nn - dn;
287
288 if (qn > dn)
289 {
290 b = (qn - 1) / dn + 1; /* ceil(qn/dn), number of blocks */
291 in = (qn - 1) / b + 1; /* ceil(qn/b) = ceil(qn / ceil(qn/dn)) */
292 }
293 else
294 {
295 in = qn - (qn >> 1);
296 }
297
298 if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
299 {
300 tn = dn + in;
301 itch_out = 0;
302 }
303 else
304 {
305 tn = mpn_mulmod_bnm1_next_size (dn);
306 itch_out = mpn_mulmod_bnm1_itch (tn, dn, in);
307 }
308
309 itch_binvert = mpn_binvert_itch (in);
310 itches = tn + itch_out;
311 return in + MAX (itches, itch_binvert);
312 }
313