xref: /netbsd-src/external/lgpl3/mpfr/dist/src/mulders.c (revision ba125506a622fe649968631a56eba5d42ff57863)
1 /* Mulders' short product, square and division.
2 
3 Copyright 2005-2023 Free Software Foundation, Inc.
4 Contributed by the AriC and Caramba projects, INRIA.
5 
6 This file is part of the GNU MPFR Library.
7 
8 The GNU MPFR Library is free software; you can redistribute it and/or modify
9 it under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 3 of the License, or (at your
11 option) any later version.
12 
13 The GNU MPFR Library is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
16 License for more details.
17 
18 You should have received a copy of the GNU Lesser General Public License
19 along with the GNU MPFR Library; see the file COPYING.LESSER.  If not, see
20 https://www.gnu.org/licenses/ or write to the Free Software Foundation, Inc.,
21 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */
22 
23 /* References:
24    [1] Short Division of Long Integers, David Harvey and Paul Zimmermann,
25        Proceedings of the 20th Symposium on Computer Arithmetic (ARITH-20),
26        July 25-27, 2011, pages 7-14.
27 */
28 
29 #define MPFR_NEED_LONGLONG_H
30 #include "mpfr-impl.h"
31 
32 /* Don't use MPFR_MULHIGH_SIZE since it is handled by tuneup */
33 #ifdef MPFR_MULHIGH_TAB_SIZE
34 static short mulhigh_ktab[MPFR_MULHIGH_TAB_SIZE];
35 #else
36 static short mulhigh_ktab[] = {MPFR_MULHIGH_TAB};
37 #define MPFR_MULHIGH_TAB_SIZE (numberof_const (mulhigh_ktab))
38 #endif
39 
40 /* Put in  rp[n..2n-1] an approximation of the n high limbs
41    of {up, n} * {vp, n}. The error is less than n ulps of rp[n] (and the
42    approximation is always less or equal to the truncated full product).
43    Assume 2n limbs are allocated at rp.
44 
45    Implements Algorithm ShortMulNaive from [1].
46 */
47 static void
mpfr_mulhigh_n_basecase(mpfr_limb_ptr rp,mpfr_limb_srcptr up,mpfr_limb_srcptr vp,mp_size_t n)48 mpfr_mulhigh_n_basecase (mpfr_limb_ptr rp, mpfr_limb_srcptr up,
49                          mpfr_limb_srcptr vp, mp_size_t n)
50 {
51   mp_size_t i;
52 
53   rp += n - 1;
54   umul_ppmm (rp[1], rp[0], up[n-1], vp[0]); /* we neglect up[0..n-2]*vp[0],
55                                                which is less than B^n */
56   for (i = 1 ; i < n ; i++)
57     /* here, we neglect up[0..n-i-2] * vp[i], which is less than B^n too */
58     rp[i + 1] = mpn_addmul_1 (rp, up + (n - i - 1), i + 1, vp[i]);
59   /* in total, we neglect less than n*B^n, i.e., n ulps of rp[n]. */
60 }
61 
62 /* Put in  rp[n..2n-1] an approximation of the n high limbs
63    of {np, n} * {mp, n}. The error is less than n ulps of rp[n] (and the
64    approximation is always less or equal to the truncated full product).
65 
66    Implements Algorithm ShortMul from [1].
67 */
68 void
mpfr_mulhigh_n(mpfr_limb_ptr rp,mpfr_limb_srcptr np,mpfr_limb_srcptr mp,mp_size_t n)69 mpfr_mulhigh_n (mpfr_limb_ptr rp, mpfr_limb_srcptr np, mpfr_limb_srcptr mp,
70                 mp_size_t n)
71 {
72   mp_size_t k;
73 
74   MPFR_STAT_STATIC_ASSERT (MPFR_MULHIGH_TAB_SIZE >= 8); /* so that 3*(n/4) > n/2 */
75   k = MPFR_LIKELY (n < MPFR_MULHIGH_TAB_SIZE) ? mulhigh_ktab[n] : 3*(n/4);
76   /* Algorithm ShortMul from [1] requires k >= (n+3)/2, which translates
77      into k >= (n+4)/2 in the C language. */
78   MPFR_ASSERTD (k == -1 || k == 0 || (k >= (n+4)/2 && k < n));
79   if (k < 0)
80     mpn_mul_basecase (rp, np, n, mp, n); /* result is exact, no error */
81   else if (k == 0)
82     mpfr_mulhigh_n_basecase (rp, np, mp, n); /* basecase error < n ulps */
83   else if (n > MUL_FFT_THRESHOLD)
84     mpn_mul_n (rp, np, mp, n); /* result is exact, no error */
85   else
86     {
87       mp_size_t l = n - k;
88       mp_limb_t cy;
89 
90       mpn_mul_n (rp + 2 * l, np + l, mp + l, k); /* fills rp[2l..2n-1] */
91       mpfr_mulhigh_n (rp, np + k, mp, l);        /* fills rp[l-1..2l-1] */
92       cy = mpn_add_n (rp + n - 1, rp + n - 1, rp + l - 1, l + 1);
93       mpfr_mulhigh_n (rp, np, mp + k, l);        /* fills rp[l-1..2l-1] */
94       cy += mpn_add_n (rp + n - 1, rp + n - 1, rp + l - 1, l + 1);
95       mpn_add_1 (rp + n + l, rp + n + l, k, cy); /* propagate carry */
96     }
97 }
98 
99 #ifdef MPFR_SQRHIGH_TAB_SIZE
100 static short sqrhigh_ktab[MPFR_SQRHIGH_TAB_SIZE];
101 #else
102 static short sqrhigh_ktab[] = {MPFR_SQRHIGH_TAB};
103 #define MPFR_SQRHIGH_TAB_SIZE (numberof_const (sqrhigh_ktab))
104 #endif
105 
106 /* Put in  rp[n..2n-1] an approximation of the n high limbs
107    of {np, n}^2. The error is less than n ulps of rp[n]. */
108 void
mpfr_sqrhigh_n(mpfr_limb_ptr rp,mpfr_limb_srcptr np,mp_size_t n)109 mpfr_sqrhigh_n (mpfr_limb_ptr rp, mpfr_limb_srcptr np, mp_size_t n)
110 {
111   mp_size_t k;
112 
113   MPFR_STAT_STATIC_ASSERT (MPFR_SQRHIGH_TAB_SIZE > 2); /* ensures k < n */
114   k = MPFR_LIKELY (n < MPFR_SQRHIGH_TAB_SIZE) ? sqrhigh_ktab[n]
115     : (n+4)/2; /* ensures that k >= (n+3)/2 */
116   MPFR_ASSERTD (k == -1 || k == 0 || (k >= (n+4)/2 && k < n));
117   if (k < 0)
118     /* we can't use mpn_sqr_basecase here, since it requires
119        n <= SQR_KARATSUBA_THRESHOLD, where SQR_KARATSUBA_THRESHOLD
120        is not exported by GMP */
121     mpn_sqr (rp, np, n);
122   else if (k == 0)
123     mpfr_mulhigh_n_basecase (rp, np, np, n);
124   else
125     {
126       mp_size_t l = n - k;
127       mp_limb_t cy;
128 
129       mpn_sqr (rp + 2 * l, np + l, k);            /* fills rp[2l..2n-1] */
130       mpfr_mulhigh_n (rp, np, np + k, l);         /* fills rp[l-1..2l-1] */
131       /* {rp+n-1,l+1} += 2 * {rp+l-1,l+1} */
132       cy = mpn_lshift (rp + l - 1, rp + l - 1, l + 1, 1);
133       cy += mpn_add_n (rp + n - 1, rp + n - 1, rp + l - 1, l + 1);
134       mpn_add_1 (rp + n + l, rp + n + l, k, cy); /* propagate carry */
135     }
136 }
137 
138 #ifdef MPFR_DIVHIGH_TAB_SIZE
139 static short divhigh_ktab[MPFR_DIVHIGH_TAB_SIZE];
140 #else
141 static short divhigh_ktab[] = {MPFR_DIVHIGH_TAB};
142 #define MPFR_DIVHIGH_TAB_SIZE (numberof_const (divhigh_ktab))
143 #endif
144 
145 /* Put in Q={qp, n} an approximation of N={np, 2*n} divided by D={dp, n},
146    with the most significant limb of the quotient as return value (0 or 1).
147    Assumes the most significant bit of D is set. Clobbers N.
148 
149    The approximate quotient Q satisfies - 2(n-1) < N/D - Q <= 4.
150 
151    Assumes n >= 2.
152 */
153 static mp_limb_t
mpfr_divhigh_n_basecase(mpfr_limb_ptr qp,mpfr_limb_ptr np,mpfr_limb_srcptr dp,mp_size_t n)154 mpfr_divhigh_n_basecase (mpfr_limb_ptr qp, mpfr_limb_ptr np,
155                          mpfr_limb_srcptr dp, mp_size_t n)
156 {
157   mp_limb_t qh, d1, d0, q2, q1, q0;
158   mpfr_pi1_t dinv2;
159 
160   MPFR_ASSERTD(n >= 2);
161 
162   np += n;
163 
164   if ((qh = (mpn_cmp (np, dp, n) >= 0)))
165     mpn_sub_n (np, np, dp, n);
166 
167   /* now {np, n} is less than D={dp, n}, which implies np[n-1] <= dp[n-1] */
168 
169   d1 = dp[n - 1];
170 
171   /* we assumed n >= 2 */
172   d0 = dp[n - 2];
173   invert_pi1 (dinv2, d1, d0);
174   /* dinv2.inv32 = floor ((B^3 - 1) / (d0 + d1 B)) - B */
175   while (n > 1)
176     {
177       /* Invariant: it remains to reduce n limbs from N (in addition to the
178          initial low n limbs).
179          Since n >= 2 here, necessarily we had n >= 2 initially, which means
180          that in addition to the limb np[n-1] to reduce, we have at least 2
181          extra limbs, thus accessing np[n-3] is valid. */
182 
183       /* Warning: we can have np[n-1]>d1 or (np[n-1]=d1 and np[n-2]>=d0) here,
184          since we truncate the divisor at each step, but since {np,n} < D
185          originally, the largest possible partial quotient is B-1. */
186       if (MPFR_UNLIKELY(np[n-1] > d1 || (np[n-1] == d1 && np[n-2] >= d0)))
187         q2 = MPFR_LIMB_MAX;
188       else
189         udiv_qr_3by2 (q2, q1, q0, np[n - 1], np[n - 2], np[n - 3],
190                       d1, d0, dinv2.inv32);
191       /* since q2 = floor((np[n-1]*B^2+np[n-2]*B+np[n-3])/(d1*B+d0)),
192          we have q2 <= (np[n-1]*B^2+np[n-2]*B+np[n-3])/(d1*B+d0),
193          thus np[n-1]*B^2+np[n-2]*B+np[n-3] >= q2*(d1*B+d0)
194          and {np-1, n} >= q2*D - q2*B^(n-2) >= q2*D - B^(n-1)
195          thus {np-1, n} - (q2-1)*D >= D - B^(n-1) >= 0
196          which proves that at most one correction is needed */
197       q0 = mpn_submul_1 (np - 1, dp, n, q2);
198       if (MPFR_UNLIKELY(q0 > np[n - 1]))
199         {
200           mpn_add_n (np - 1, np - 1, dp, n);
201           q2 --;
202         }
203       qp[--n] = q2;
204       dp ++;
205     }
206 
207   /* we have B+dinv2 = floor((B^3-1)/(d1*B+d0)) < B^2/d1
208      q1 = floor(np[0]*(B+dinv2)/B) <= floor(np[0]*B/d1)
209         <= floor((np[0]*B+np[1])/d1)
210      thus q1 is not larger than the true quotient.
211      q1 > np[0]*(B+dinv2)/B - 1 > np[0]*(B^3-1)/(d1*B+d0)/B - 2
212      For d1*B+d0 <> B^2/2, we have B+dinv2 = floor(B^3/(d1*B+d0))
213      thus q1 > np[0]*B^2/(d1*B+d0) - 2, i.e.,
214      (d1*B+d0)*q1 > np[0]*B^2 - 2*(d1*B+d0)
215      d1*B*q1 > np[0]*B^2 - 2*d1*B - 2*d0 - d0*q1 >= np[0]*B^2 - 2*d1*B - B^2
216      thus q1 > np[0]*B/d1 - 2 - B/d1 > np[0]*B/d1 - 4.
217 
218      For d1*B+d0 = B^2/2, dinv2 = B-1 thus q1 > np[0]*(2B-1)/B - 1 >
219      np[0]*B/d1 - 2.
220 
221      In all cases, if q = floor((np[0]*B+np[1])/d1), we have:
222      q - 4 <= q1 <= q
223   */
224   umul_ppmm (q1, q0, np[0], dinv2.inv32);
225   qp[0] = np[0] + q1;
226 
227   return qh;
228 }
229 
230 /* Put in {qp, n} an approximation of N={np, 2*n} divided by D={dp, n},
231    with the most significant limb of the quotient as return value (0 or 1).
232    Assumes the most significant bit of D is set. Clobbers N.
233 
234    This implements the ShortDiv algorithm from reference [1].
235 
236    Assumes n >= 2 (which should be fulfilled also in the recursive calls).
237 */
238 mp_limb_t
mpfr_divhigh_n(mpfr_limb_ptr qp,mpfr_limb_ptr np,mpfr_limb_ptr dp,mp_size_t n)239 mpfr_divhigh_n (mpfr_limb_ptr qp, mpfr_limb_ptr np, mpfr_limb_ptr dp,
240                 mp_size_t n)
241 {
242   mp_size_t k, l;
243   mp_limb_t qh, cy;
244   mpfr_limb_ptr tp;
245   MPFR_TMP_DECL(marker);
246 
247   MPFR_STAT_STATIC_ASSERT (MPFR_DIVHIGH_TAB_SIZE >= 15); /* so that 2*(n/3) >= (n+4)/2 */
248   MPFR_ASSERTD(n >= 2);
249   k = MPFR_LIKELY (n < MPFR_DIVHIGH_TAB_SIZE) ? divhigh_ktab[n] : 2*(n/3);
250 
251   if (k == 0)
252     {
253 #if defined(WANT_GMP_INTERNALS) && defined(HAVE___GMPN_SBPI1_DIVAPPR_Q)
254       mpfr_pi1_t dinv2;
255       invert_pi1 (dinv2, dp[n - 1], dp[n - 2]);
256       if (n > 2) /* sbpi1_divappr_q wants n > 2 */
257         return __gmpn_sbpi1_divappr_q (qp, np, n + n, dp, n, dinv2.inv32);
258       else
259         return mpfr_divhigh_n_basecase (qp, np, dp, n);
260 #else /* use our own code for base-case short division */
261       return mpfr_divhigh_n_basecase (qp, np, dp, n);
262 #endif
263     }
264 
265   /* Check the bounds from [1]. In addition, we forbid k=n-1, which would
266      give l=1 in the recursive call. It follows n >= 5. */
267   MPFR_ASSERTD ((n+4)/2 <= k && k < n-1);
268 
269   MPFR_TMP_MARK (marker);
270   l = n - k;
271   /* first divide the most significant 2k limbs from N by the most significant
272      k limbs of D */
273   qh = mpn_divrem (qp + l, 0, np + 2 * l, 2 * k, dp + l, k); /* exact */
274 
275   /* it remains {np,2l+k} = {np,n+l} as remainder */
276 
277   /* now we have to subtract high(Q1)*D0 where Q1=qh*B^k+{qp+l,k} and
278      D0={dp,l} */
279   tp = MPFR_TMP_LIMBS_ALLOC (2 * l);
280   mpfr_mulhigh_n (tp, qp + k, dp, l);
281   /* we are only interested in the upper l limbs from {tp,2l} */
282   cy = mpn_sub_n (np + n, np + n, tp + l, l);
283   if (qh)
284     cy += mpn_sub_n (np + n, np + n, dp, l);
285   while (cy > 0) /* Q1 was too large: subtract 1 to Q1 and add D to np+l */
286     {
287       qh -= mpn_sub_1 (qp + l, qp + l, k, MPFR_LIMB_ONE);
288       cy -= mpn_add_n (np + l, np + l, dp, n);
289     }
290 
291   /* now it remains {np,n+l} to divide by D */
292   cy = mpfr_divhigh_n (qp, np + k, dp + k, l);
293   qh += mpn_add_1 (qp + l, qp + l, k, cy);
294   MPFR_TMP_FREE(marker);
295 
296   return qh;
297 }
298