1 /* Mulders' short product, square and division.
2
3 Copyright 2005-2023 Free Software Foundation, Inc.
4 Contributed by the AriC and Caramba projects, INRIA.
5
6 This file is part of the GNU MPFR Library.
7
8 The GNU MPFR Library is free software; you can redistribute it and/or modify
9 it under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 3 of the License, or (at your
11 option) any later version.
12
13 The GNU MPFR Library is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
16 License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with the GNU MPFR Library; see the file COPYING.LESSER. If not, see
20 https://www.gnu.org/licenses/ or write to the Free Software Foundation, Inc.,
21 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */
22
23 /* References:
24 [1] Short Division of Long Integers, David Harvey and Paul Zimmermann,
25 Proceedings of the 20th Symposium on Computer Arithmetic (ARITH-20),
26 July 25-27, 2011, pages 7-14.
27 */
28
29 #define MPFR_NEED_LONGLONG_H
30 #include "mpfr-impl.h"
31
32 /* Don't use MPFR_MULHIGH_SIZE since it is handled by tuneup */
33 #ifdef MPFR_MULHIGH_TAB_SIZE
34 static short mulhigh_ktab[MPFR_MULHIGH_TAB_SIZE];
35 #else
36 static short mulhigh_ktab[] = {MPFR_MULHIGH_TAB};
37 #define MPFR_MULHIGH_TAB_SIZE (numberof_const (mulhigh_ktab))
38 #endif
39
40 /* Put in rp[n..2n-1] an approximation of the n high limbs
41 of {up, n} * {vp, n}. The error is less than n ulps of rp[n] (and the
42 approximation is always less or equal to the truncated full product).
43 Assume 2n limbs are allocated at rp.
44
45 Implements Algorithm ShortMulNaive from [1].
46 */
47 static void
mpfr_mulhigh_n_basecase(mpfr_limb_ptr rp,mpfr_limb_srcptr up,mpfr_limb_srcptr vp,mp_size_t n)48 mpfr_mulhigh_n_basecase (mpfr_limb_ptr rp, mpfr_limb_srcptr up,
49 mpfr_limb_srcptr vp, mp_size_t n)
50 {
51 mp_size_t i;
52
53 rp += n - 1;
54 umul_ppmm (rp[1], rp[0], up[n-1], vp[0]); /* we neglect up[0..n-2]*vp[0],
55 which is less than B^n */
56 for (i = 1 ; i < n ; i++)
57 /* here, we neglect up[0..n-i-2] * vp[i], which is less than B^n too */
58 rp[i + 1] = mpn_addmul_1 (rp, up + (n - i - 1), i + 1, vp[i]);
59 /* in total, we neglect less than n*B^n, i.e., n ulps of rp[n]. */
60 }
61
62 /* Put in rp[n..2n-1] an approximation of the n high limbs
63 of {np, n} * {mp, n}. The error is less than n ulps of rp[n] (and the
64 approximation is always less or equal to the truncated full product).
65
66 Implements Algorithm ShortMul from [1].
67 */
68 void
mpfr_mulhigh_n(mpfr_limb_ptr rp,mpfr_limb_srcptr np,mpfr_limb_srcptr mp,mp_size_t n)69 mpfr_mulhigh_n (mpfr_limb_ptr rp, mpfr_limb_srcptr np, mpfr_limb_srcptr mp,
70 mp_size_t n)
71 {
72 mp_size_t k;
73
74 MPFR_STAT_STATIC_ASSERT (MPFR_MULHIGH_TAB_SIZE >= 8); /* so that 3*(n/4) > n/2 */
75 k = MPFR_LIKELY (n < MPFR_MULHIGH_TAB_SIZE) ? mulhigh_ktab[n] : 3*(n/4);
76 /* Algorithm ShortMul from [1] requires k >= (n+3)/2, which translates
77 into k >= (n+4)/2 in the C language. */
78 MPFR_ASSERTD (k == -1 || k == 0 || (k >= (n+4)/2 && k < n));
79 if (k < 0)
80 mpn_mul_basecase (rp, np, n, mp, n); /* result is exact, no error */
81 else if (k == 0)
82 mpfr_mulhigh_n_basecase (rp, np, mp, n); /* basecase error < n ulps */
83 else if (n > MUL_FFT_THRESHOLD)
84 mpn_mul_n (rp, np, mp, n); /* result is exact, no error */
85 else
86 {
87 mp_size_t l = n - k;
88 mp_limb_t cy;
89
90 mpn_mul_n (rp + 2 * l, np + l, mp + l, k); /* fills rp[2l..2n-1] */
91 mpfr_mulhigh_n (rp, np + k, mp, l); /* fills rp[l-1..2l-1] */
92 cy = mpn_add_n (rp + n - 1, rp + n - 1, rp + l - 1, l + 1);
93 mpfr_mulhigh_n (rp, np, mp + k, l); /* fills rp[l-1..2l-1] */
94 cy += mpn_add_n (rp + n - 1, rp + n - 1, rp + l - 1, l + 1);
95 mpn_add_1 (rp + n + l, rp + n + l, k, cy); /* propagate carry */
96 }
97 }
98
99 #ifdef MPFR_SQRHIGH_TAB_SIZE
100 static short sqrhigh_ktab[MPFR_SQRHIGH_TAB_SIZE];
101 #else
102 static short sqrhigh_ktab[] = {MPFR_SQRHIGH_TAB};
103 #define MPFR_SQRHIGH_TAB_SIZE (numberof_const (sqrhigh_ktab))
104 #endif
105
106 /* Put in rp[n..2n-1] an approximation of the n high limbs
107 of {np, n}^2. The error is less than n ulps of rp[n]. */
108 void
mpfr_sqrhigh_n(mpfr_limb_ptr rp,mpfr_limb_srcptr np,mp_size_t n)109 mpfr_sqrhigh_n (mpfr_limb_ptr rp, mpfr_limb_srcptr np, mp_size_t n)
110 {
111 mp_size_t k;
112
113 MPFR_STAT_STATIC_ASSERT (MPFR_SQRHIGH_TAB_SIZE > 2); /* ensures k < n */
114 k = MPFR_LIKELY (n < MPFR_SQRHIGH_TAB_SIZE) ? sqrhigh_ktab[n]
115 : (n+4)/2; /* ensures that k >= (n+3)/2 */
116 MPFR_ASSERTD (k == -1 || k == 0 || (k >= (n+4)/2 && k < n));
117 if (k < 0)
118 /* we can't use mpn_sqr_basecase here, since it requires
119 n <= SQR_KARATSUBA_THRESHOLD, where SQR_KARATSUBA_THRESHOLD
120 is not exported by GMP */
121 mpn_sqr (rp, np, n);
122 else if (k == 0)
123 mpfr_mulhigh_n_basecase (rp, np, np, n);
124 else
125 {
126 mp_size_t l = n - k;
127 mp_limb_t cy;
128
129 mpn_sqr (rp + 2 * l, np + l, k); /* fills rp[2l..2n-1] */
130 mpfr_mulhigh_n (rp, np, np + k, l); /* fills rp[l-1..2l-1] */
131 /* {rp+n-1,l+1} += 2 * {rp+l-1,l+1} */
132 cy = mpn_lshift (rp + l - 1, rp + l - 1, l + 1, 1);
133 cy += mpn_add_n (rp + n - 1, rp + n - 1, rp + l - 1, l + 1);
134 mpn_add_1 (rp + n + l, rp + n + l, k, cy); /* propagate carry */
135 }
136 }
137
138 #ifdef MPFR_DIVHIGH_TAB_SIZE
139 static short divhigh_ktab[MPFR_DIVHIGH_TAB_SIZE];
140 #else
141 static short divhigh_ktab[] = {MPFR_DIVHIGH_TAB};
142 #define MPFR_DIVHIGH_TAB_SIZE (numberof_const (divhigh_ktab))
143 #endif
144
145 /* Put in Q={qp, n} an approximation of N={np, 2*n} divided by D={dp, n},
146 with the most significant limb of the quotient as return value (0 or 1).
147 Assumes the most significant bit of D is set. Clobbers N.
148
149 The approximate quotient Q satisfies - 2(n-1) < N/D - Q <= 4.
150
151 Assumes n >= 2.
152 */
153 static mp_limb_t
mpfr_divhigh_n_basecase(mpfr_limb_ptr qp,mpfr_limb_ptr np,mpfr_limb_srcptr dp,mp_size_t n)154 mpfr_divhigh_n_basecase (mpfr_limb_ptr qp, mpfr_limb_ptr np,
155 mpfr_limb_srcptr dp, mp_size_t n)
156 {
157 mp_limb_t qh, d1, d0, q2, q1, q0;
158 mpfr_pi1_t dinv2;
159
160 MPFR_ASSERTD(n >= 2);
161
162 np += n;
163
164 if ((qh = (mpn_cmp (np, dp, n) >= 0)))
165 mpn_sub_n (np, np, dp, n);
166
167 /* now {np, n} is less than D={dp, n}, which implies np[n-1] <= dp[n-1] */
168
169 d1 = dp[n - 1];
170
171 /* we assumed n >= 2 */
172 d0 = dp[n - 2];
173 invert_pi1 (dinv2, d1, d0);
174 /* dinv2.inv32 = floor ((B^3 - 1) / (d0 + d1 B)) - B */
175 while (n > 1)
176 {
177 /* Invariant: it remains to reduce n limbs from N (in addition to the
178 initial low n limbs).
179 Since n >= 2 here, necessarily we had n >= 2 initially, which means
180 that in addition to the limb np[n-1] to reduce, we have at least 2
181 extra limbs, thus accessing np[n-3] is valid. */
182
183 /* Warning: we can have np[n-1]>d1 or (np[n-1]=d1 and np[n-2]>=d0) here,
184 since we truncate the divisor at each step, but since {np,n} < D
185 originally, the largest possible partial quotient is B-1. */
186 if (MPFR_UNLIKELY(np[n-1] > d1 || (np[n-1] == d1 && np[n-2] >= d0)))
187 q2 = MPFR_LIMB_MAX;
188 else
189 udiv_qr_3by2 (q2, q1, q0, np[n - 1], np[n - 2], np[n - 3],
190 d1, d0, dinv2.inv32);
191 /* since q2 = floor((np[n-1]*B^2+np[n-2]*B+np[n-3])/(d1*B+d0)),
192 we have q2 <= (np[n-1]*B^2+np[n-2]*B+np[n-3])/(d1*B+d0),
193 thus np[n-1]*B^2+np[n-2]*B+np[n-3] >= q2*(d1*B+d0)
194 and {np-1, n} >= q2*D - q2*B^(n-2) >= q2*D - B^(n-1)
195 thus {np-1, n} - (q2-1)*D >= D - B^(n-1) >= 0
196 which proves that at most one correction is needed */
197 q0 = mpn_submul_1 (np - 1, dp, n, q2);
198 if (MPFR_UNLIKELY(q0 > np[n - 1]))
199 {
200 mpn_add_n (np - 1, np - 1, dp, n);
201 q2 --;
202 }
203 qp[--n] = q2;
204 dp ++;
205 }
206
207 /* we have B+dinv2 = floor((B^3-1)/(d1*B+d0)) < B^2/d1
208 q1 = floor(np[0]*(B+dinv2)/B) <= floor(np[0]*B/d1)
209 <= floor((np[0]*B+np[1])/d1)
210 thus q1 is not larger than the true quotient.
211 q1 > np[0]*(B+dinv2)/B - 1 > np[0]*(B^3-1)/(d1*B+d0)/B - 2
212 For d1*B+d0 <> B^2/2, we have B+dinv2 = floor(B^3/(d1*B+d0))
213 thus q1 > np[0]*B^2/(d1*B+d0) - 2, i.e.,
214 (d1*B+d0)*q1 > np[0]*B^2 - 2*(d1*B+d0)
215 d1*B*q1 > np[0]*B^2 - 2*d1*B - 2*d0 - d0*q1 >= np[0]*B^2 - 2*d1*B - B^2
216 thus q1 > np[0]*B/d1 - 2 - B/d1 > np[0]*B/d1 - 4.
217
218 For d1*B+d0 = B^2/2, dinv2 = B-1 thus q1 > np[0]*(2B-1)/B - 1 >
219 np[0]*B/d1 - 2.
220
221 In all cases, if q = floor((np[0]*B+np[1])/d1), we have:
222 q - 4 <= q1 <= q
223 */
224 umul_ppmm (q1, q0, np[0], dinv2.inv32);
225 qp[0] = np[0] + q1;
226
227 return qh;
228 }
229
230 /* Put in {qp, n} an approximation of N={np, 2*n} divided by D={dp, n},
231 with the most significant limb of the quotient as return value (0 or 1).
232 Assumes the most significant bit of D is set. Clobbers N.
233
234 This implements the ShortDiv algorithm from reference [1].
235
236 Assumes n >= 2 (which should be fulfilled also in the recursive calls).
237 */
238 mp_limb_t
mpfr_divhigh_n(mpfr_limb_ptr qp,mpfr_limb_ptr np,mpfr_limb_ptr dp,mp_size_t n)239 mpfr_divhigh_n (mpfr_limb_ptr qp, mpfr_limb_ptr np, mpfr_limb_ptr dp,
240 mp_size_t n)
241 {
242 mp_size_t k, l;
243 mp_limb_t qh, cy;
244 mpfr_limb_ptr tp;
245 MPFR_TMP_DECL(marker);
246
247 MPFR_STAT_STATIC_ASSERT (MPFR_DIVHIGH_TAB_SIZE >= 15); /* so that 2*(n/3) >= (n+4)/2 */
248 MPFR_ASSERTD(n >= 2);
249 k = MPFR_LIKELY (n < MPFR_DIVHIGH_TAB_SIZE) ? divhigh_ktab[n] : 2*(n/3);
250
251 if (k == 0)
252 {
253 #if defined(WANT_GMP_INTERNALS) && defined(HAVE___GMPN_SBPI1_DIVAPPR_Q)
254 mpfr_pi1_t dinv2;
255 invert_pi1 (dinv2, dp[n - 1], dp[n - 2]);
256 if (n > 2) /* sbpi1_divappr_q wants n > 2 */
257 return __gmpn_sbpi1_divappr_q (qp, np, n + n, dp, n, dinv2.inv32);
258 else
259 return mpfr_divhigh_n_basecase (qp, np, dp, n);
260 #else /* use our own code for base-case short division */
261 return mpfr_divhigh_n_basecase (qp, np, dp, n);
262 #endif
263 }
264
265 /* Check the bounds from [1]. In addition, we forbid k=n-1, which would
266 give l=1 in the recursive call. It follows n >= 5. */
267 MPFR_ASSERTD ((n+4)/2 <= k && k < n-1);
268
269 MPFR_TMP_MARK (marker);
270 l = n - k;
271 /* first divide the most significant 2k limbs from N by the most significant
272 k limbs of D */
273 qh = mpn_divrem (qp + l, 0, np + 2 * l, 2 * k, dp + l, k); /* exact */
274
275 /* it remains {np,2l+k} = {np,n+l} as remainder */
276
277 /* now we have to subtract high(Q1)*D0 where Q1=qh*B^k+{qp+l,k} and
278 D0={dp,l} */
279 tp = MPFR_TMP_LIMBS_ALLOC (2 * l);
280 mpfr_mulhigh_n (tp, qp + k, dp, l);
281 /* we are only interested in the upper l limbs from {tp,2l} */
282 cy = mpn_sub_n (np + n, np + n, tp + l, l);
283 if (qh)
284 cy += mpn_sub_n (np + n, np + n, dp, l);
285 while (cy > 0) /* Q1 was too large: subtract 1 to Q1 and add D to np+l */
286 {
287 qh -= mpn_sub_1 (qp + l, qp + l, k, MPFR_LIMB_ONE);
288 cy -= mpn_add_n (np + l, np + l, dp, n);
289 }
290
291 /* now it remains {np,n+l} to divide by D */
292 cy = mpfr_divhigh_n (qp, np + k, dp + k, l);
293 qh += mpn_add_1 (qp + l, qp + l, k, cy);
294 MPFR_TMP_FREE(marker);
295
296 return qh;
297 }
298