/* mpfr_sqr -- Floating-point square

Copyright 2004-2023 Free Software Foundation, Inc.
Contributed by the AriC and Caramba projects, INRIA.

This file is part of the GNU MPFR Library.

The GNU MPFR Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at your
option) any later version.

The GNU MPFR Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MPFR Library; see the file COPYING.LESSER.  If not, see
https://www.gnu.org/licenses/ or write to the Free Software Foundation, Inc.,
51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */

#define MPFR_NEED_LONGLONG_H
#include "mpfr-impl.h"

#if !defined(MPFR_GENERIC_ABI) && (GMP_NUMB_BITS == 32 || GMP_NUMB_BITS == 64)

/* Special code for prec(a) < GMP_NUMB_BITS and prec(b) <= GMP_NUMB_BITS.
   Note: this function was copied from mpfr_mul_1 in file mul.c, thus any
   change here should be done also in mpfr_mul_1.
   Although this function works as soon as prec(a) < GMP_NUMB_BITS and
   prec(b) <= GMP_NUMB_BITS, we use it for prec(a)=prec(b) < GMP_NUMB_BITS. */
static int
mpfr_sqr_1 (mpfr_ptr a, mpfr_srcptr b, mpfr_rnd_t rnd_mode, mpfr_prec_t p)
{
  mp_limb_t a0;
  mpfr_limb_ptr ap = MPFR_MANT(a);
  mp_limb_t b0 = MPFR_MANT(b)[0];
  mpfr_exp_t ax;
  mpfr_prec_t sh = GMP_NUMB_BITS - p;
  mp_limb_t rb, sb, mask = MPFR_LIMB_MASK(sh);

  /* When prec(b) <= GMP_NUMB_BITS / 2, we could replace umul_ppmm
     by a limb multiplication as follows, but we assume umul_ppmm is as fast
     as a limb multiplication on modern processors:
      a0 = (b0 >> (GMP_NUMB_BITS / 2)) * (b0 >> (GMP_NUMB_BITS / 2));
      sb = 0;
  */
  ax = MPFR_GET_EXP(b) * 2;
  umul_ppmm (a0, sb, b0, b0);
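  /* Since b0 >= 2^(GMP_NUMB_BITS-1), we have a0 >= 2^(GMP_NUMB_BITS-2):
     at most one normalization shift is needed below. */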
  if (a0 < MPFR_LIMB_HIGHBIT)
    {
      ax --;
      a0 = (a0 << 1) | (sb >> (GMP_NUMB_BITS - 1));
      sb <<= 1;
    }
  rb = a0 & (MPFR_LIMB_ONE << (sh - 1));
  sb |= (a0 & mask) ^ rb;
  ap[0] = a0 & ~mask;
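  /* Worked example (illustration only): for GMP_NUMB_BITS = 64 and p = 53,
     sh = 11 and mask = 0x7ff; rb is bit 10 of a0 (the first bit below the
     p-bit significand), and sb collects the 10 bits below rb together with
     the low product limb already in sb. */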

  MPFR_SIGN(a) = MPFR_SIGN_POS;

  /* rounding */
  if (MPFR_UNLIKELY(ax > __gmpfr_emax))
    return mpfr_overflow (a, rnd_mode, MPFR_SIGN_POS);

  /* Warning: underflow should be checked *after* rounding, thus when rounding
     away and when a > 0.111...111*2^(emin-1), or when rounding to nearest and
     a >= 0.111...111[1]*2^(emin-1), there is no underflow. */
  if (MPFR_UNLIKELY(ax < __gmpfr_emin))
    {
      /* Note: for emin=2*k+1, a >= 0.111...111*2^(emin-1) is not possible,
         i.e., a >= (1 - 2^(-p))*2^(2k), since we need a = b^2 with EXP(b)=k,
         and the largest such b, namely (1 - 2^(-p))*2^k, satisfies
         b^2 < (1 - 2^(-p))*2^(2k).
         For emin=2*k, it is only possible for some values of p: it is not
         possible for p=53, because the largest significand is 6369051672525772
         but its square has only 52 leading ones. For p=24 it is possible,
         with b = 11863283, whose square has 24 leading ones. */
      if (ax == __gmpfr_emin - 1 && ap[0] == ~mask &&
          ((rnd_mode == MPFR_RNDN && rb) ||
           (MPFR_IS_LIKE_RNDA (rnd_mode, 0) && (rb | sb))))
        goto rounding; /* no underflow */
      /* For RNDN, mpfr_underflow always rounds away, thus for |a| <= 2^(emin-2)
         we have to change to RNDZ. This corresponds to:
         (a) either ax < emin - 1
         (b) or ax = emin - 1 and ap[0] = 1000....000 and rb = sb = 0 */
      if (rnd_mode == MPFR_RNDN &&
          (ax < __gmpfr_emin - 1 ||
           (ap[0] == MPFR_LIMB_HIGHBIT && (rb | sb) == 0)))
        rnd_mode = MPFR_RNDZ;
      return mpfr_underflow (a, rnd_mode, MPFR_SIGN_POS);
    }

 rounding:
  MPFR_EXP (a) = ax; /* Don't use MPFR_SET_EXP since ax might be < __gmpfr_emin
                        in the cases "goto rounding" above. */
  if ((rb == 0 && sb == 0) || rnd_mode == MPFR_RNDF)
    {
      MPFR_ASSERTD(ax >= __gmpfr_emin);
      MPFR_RET (0);
    }
  else if (rnd_mode == MPFR_RNDN)
    {
      if (rb == 0 || (sb == 0 && (ap[0] & (MPFR_LIMB_ONE << sh)) == 0))
        goto truncate;
      else
        goto add_one_ulp;
    }
  else if (MPFR_IS_LIKE_RNDZ (rnd_mode, 0))
    {
    truncate:
      MPFR_ASSERTD(ax >= __gmpfr_emin);
      MPFR_RET(-MPFR_SIGN_POS);
    }
  else /* round away from zero */
    {
    add_one_ulp:
      ap[0] += MPFR_LIMB_ONE << sh;
      if (ap[0] == 0)
        {
          ap[0] = MPFR_LIMB_HIGHBIT;
          if (MPFR_UNLIKELY(ax + 1 > __gmpfr_emax))
            return mpfr_overflow (a, rnd_mode, MPFR_SIGN_POS);
          MPFR_ASSERTD(ax + 1 <= __gmpfr_emax);
          MPFR_ASSERTD(ax + 1 >= __gmpfr_emin);
          MPFR_SET_EXP (a, ax + 1);
        }
      MPFR_RET(MPFR_SIGN_POS);
    }
}

/* special code for PREC(a) = GMP_NUMB_BITS */
static int
mpfr_sqr_1n (mpfr_ptr a, mpfr_srcptr b, mpfr_rnd_t rnd_mode)
{
  mp_limb_t a0;
  mpfr_limb_ptr ap = MPFR_MANT(a);
  mp_limb_t b0 = MPFR_MANT(b)[0];
  mpfr_exp_t ax;
  mp_limb_t rb, sb;

  ax = MPFR_GET_EXP(b) * 2;
  umul_ppmm (a0, sb, b0, b0);
  if (a0 < MPFR_LIMB_HIGHBIT)
    {
      ax --;
      a0 = (a0 << 1) | (sb >> (GMP_NUMB_BITS - 1));
      sb <<= 1;
    }
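  /* Here p = GMP_NUMB_BITS, so the round bit is the most significant bit
     of the low product limb and the sticky bits are the remaining ones. */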
  rb = sb & MPFR_LIMB_HIGHBIT;
  sb = sb & ~MPFR_LIMB_HIGHBIT;
  ap[0] = a0;

  MPFR_SIGN(a) = MPFR_SIGN_POS;

  /* rounding */
  if (MPFR_UNLIKELY(ax > __gmpfr_emax))
    return mpfr_overflow (a, rnd_mode, MPFR_SIGN_POS);

  /* Warning: underflow should be checked *after* rounding, thus when rounding
     away and when a > 0.111...111*2^(emin-1), or when rounding to nearest and
     a >= 0.111...111[1]*2^(emin-1), there is no underflow. */
  if (MPFR_UNLIKELY(ax < __gmpfr_emin))
    {
      /* As seen in mpfr_mul_1, we cannot have a0 = 111...111 here if there
         was not an exponent decrease (ax--) above.
         In the case of an exponent decrease:
         - For GMP_NUMB_BITS=32, a0 = 111...111 is not possible since the
           largest b0 such that b0^2 < 2^(2*32-1) is b0=3037000499, but
           its square has only 30 leading ones.
         - For GMP_NUMB_BITS=64, a0 = 111...111 is possible: the largest b0
           is 13043817825332782212, and its square has 64 leading ones; but
           since the next bit is rb=0, for RNDN, we always have an underflow.
         For the test below, note that a is positive.
      */
      if (ax == __gmpfr_emin - 1 && ap[0] == MPFR_LIMB_MAX &&
          MPFR_IS_LIKE_RNDA (rnd_mode, 0))
        goto rounding; /* no underflow */
      /* For RNDN, mpfr_underflow always rounds away, thus for |a| <= 2^(emin-2)
         we have to change to RNDZ. This corresponds to:
         (a) either ax < emin - 1
         (b) or ax = emin - 1 and ap[0] = 1000....000 and rb = sb = 0 */
      if (rnd_mode == MPFR_RNDN &&
          (ax < __gmpfr_emin - 1 ||
           (ap[0] == MPFR_LIMB_HIGHBIT && (rb | sb) == 0)))
        rnd_mode = MPFR_RNDZ;
      return mpfr_underflow (a, rnd_mode, MPFR_SIGN_POS);
    }

 rounding:
  MPFR_EXP (a) = ax; /* Don't use MPFR_SET_EXP since ax might be < __gmpfr_emin
                        in the cases "goto rounding" above. */
  if ((rb == 0 && sb == 0) || rnd_mode == MPFR_RNDF)
    {
      MPFR_ASSERTD(ax >= __gmpfr_emin);
      MPFR_RET (0);
    }
  else if (rnd_mode == MPFR_RNDN)
    {
      if (rb == 0 || (sb == 0 && (ap[0] & MPFR_LIMB_ONE) == 0))
        goto truncate;
      else
        goto add_one_ulp;
    }
  else if (MPFR_IS_LIKE_RNDZ (rnd_mode, 0))
    {
    truncate:
      MPFR_ASSERTD(ax >= __gmpfr_emin);
      MPFR_RET(-MPFR_SIGN_POS);
    }
  else /* round away from zero */
    {
    add_one_ulp:
      ap[0] += MPFR_LIMB_ONE;
      if (ap[0] == 0)
        {
          ap[0] = MPFR_LIMB_HIGHBIT;
          if (MPFR_UNLIKELY(ax + 1 > __gmpfr_emax))
            return mpfr_overflow (a, rnd_mode, MPFR_SIGN_POS);
          MPFR_ASSERTD(ax + 1 <= __gmpfr_emax);
          MPFR_ASSERTD(ax + 1 >= __gmpfr_emin);
          MPFR_SET_EXP (a, ax + 1);
        }
      MPFR_RET(MPFR_SIGN_POS);
    }
}

/* Special code for GMP_NUMB_BITS < prec(a) < 2*GMP_NUMB_BITS and
   GMP_NUMB_BITS < prec(b) <= 2*GMP_NUMB_BITS.
   Note: this function was copied and optimized from mpfr_mul_2 in file mul.c,
   thus any change here should be done also in mpfr_mul_2, if applicable. */
static int
mpfr_sqr_2 (mpfr_ptr a, mpfr_srcptr b, mpfr_rnd_t rnd_mode, mpfr_prec_t p)
{
  mp_limb_t h, l, u, v;
  mpfr_limb_ptr ap = MPFR_MANT(a);
  mpfr_exp_t ax = 2 * MPFR_GET_EXP(b);
  mpfr_prec_t sh = 2 * GMP_NUMB_BITS - p;
  mp_limb_t rb, sb, sb2, mask = MPFR_LIMB_MASK(sh);
  mp_limb_t *bp = MPFR_MANT(b);

  /* we store the 4-limb square in h=ap[1], l=ap[0], sb=ap[-1], sb2=ap[-2] */
  umul_ppmm (h, l, bp[1], bp[1]);
  umul_ppmm (u, v, bp[1], bp[0]);
  l += u << 1;
  h += (l < (u << 1)) + (u >> (GMP_NUMB_BITS - 1));

  /* now the full square is {h, l, 2*v + high(b0^2), low(b0^2)},
     where the lower part contributes less than 3 ulps to {h, l} */
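  /* Derivation (illustration only): writing B = 2^GMP_NUMB_BITS and
     b = b1*B + b0 with b1 = bp[1] and b0 = bp[0], we have
       b^2 = b1^2*B^2 + 2*b1*b0*B + b0^2,
     where b1^2 = {h, l} and b1*b0 = {u, v} before the additions above.
     The part 2*v*B + b0^2 ignored in the fast path is less than 3*B^2,
     i.e., less than 3 ulps of the 2-limb value {h, l}. */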

  /* If h has its most significant bit set and the low sh-1 bits of l are not
     000...000 nor 111...111 nor 111...110, then we can round correctly;
     if h has zero as most significant bit, we have to shift left h and l,
     thus if the low sh-2 bits are not 000...000 nor 111...111 nor 111...110,
     then we can round correctly. To avoid an extra test we consider the latter
     case (if we can round, we can also round in the former case).
     For sh <= 3, we have mask <= 7, thus (mask>>2) <= 1, and the approximation
     cannot be enough. */
  if (MPFR_LIKELY(((l + 2) & (mask >> 2)) > 2))
    sb = sb2 = 1; /* result cannot be exact in that case */
  else
    {
      mp_limb_t carry1, carry2;

      umul_ppmm (sb, sb2, bp[0], bp[0]);
      /* the full product is {h, l, sb + 2*v, sb2} */
      ADD_LIMB (sb, v, carry1);
      ADD_LIMB (l, carry1, carry2);
      h += carry2;
      ADD_LIMB (sb, v, carry1);
      ADD_LIMB (l, carry1, carry2);
      h += carry2;
    }
  if (h < MPFR_LIMB_HIGHBIT)
    {
      ax --;
      h = (h << 1) | (l >> (GMP_NUMB_BITS - 1));
      l = (l << 1) | (sb >> (GMP_NUMB_BITS - 1));
      sb <<= 1;
      /* no need to shift sb2 since we only want to know if it is zero or not */
    }
  ap[1] = h;
  rb = l & (MPFR_LIMB_ONE << (sh - 1));
  sb |= ((l & mask) ^ rb) | sb2;
  ap[0] = l & ~mask;

  MPFR_SIGN(a) = MPFR_SIGN_POS;

  /* rounding */
  if (MPFR_UNLIKELY(ax > __gmpfr_emax))
    return mpfr_overflow (a, rnd_mode, MPFR_SIGN_POS);

  /* Warning: underflow should be checked *after* rounding, thus when rounding
     away and when a > 0.111...111*2^(emin-1), or when rounding to nearest and
     a >= 0.111...111[1]*2^(emin-1), there is no underflow. */
  if (MPFR_UNLIKELY(ax < __gmpfr_emin))
    {
      /* Note: like for mpfr_sqr_1, the case
         0.111...111*2^(emin-1) < a < 2^(emin-1) is not possible when emin is
         odd, since (modulo a shift) this would imply 1-2^(-p) < a = b^2 < 1,
         and this is not possible with 1-2^(-p) <= b < 1.
         For emin even, it is possible for some values of p, for example for
         p=69 with b=417402170410649030795*2^k. */
      if (ax == __gmpfr_emin - 1 &&
          ap[1] == MPFR_LIMB_MAX &&
          ap[0] == ~mask &&
          ((rnd_mode == MPFR_RNDN && rb) ||
           (MPFR_IS_LIKE_RNDA (rnd_mode, 0) && (rb | sb))))
        goto rounding; /* no underflow */
      /* for RNDN, mpfr_underflow always rounds away, thus for
         |a| <= 2^(emin-2) we have to change to RNDZ */
      if (rnd_mode == MPFR_RNDN &&
          (ax < __gmpfr_emin - 1 ||
           (ap[1] == MPFR_LIMB_HIGHBIT && ap[0] == 0 && (rb | sb) == 0)))
        rnd_mode = MPFR_RNDZ;
      return mpfr_underflow (a, rnd_mode, MPFR_SIGN_POS);
    }

 rounding:
  MPFR_EXP (a) = ax; /* Don't use MPFR_SET_EXP since ax might be < __gmpfr_emin
                        in the cases "goto rounding" above. */
  if ((rb == 0 && sb == 0) || rnd_mode == MPFR_RNDF)
    {
      MPFR_ASSERTD(ax >= __gmpfr_emin);
      MPFR_RET (0);
    }
  else if (rnd_mode == MPFR_RNDN)
    {
      if (rb == 0 || (sb == 0 && (ap[0] & (MPFR_LIMB_ONE << sh)) == 0))
        goto truncate;
      else
        goto add_one_ulp;
    }
  else if (MPFR_IS_LIKE_RNDZ (rnd_mode, 0))
    {
    truncate:
      MPFR_ASSERTD(ax >= __gmpfr_emin);
      MPFR_RET(-MPFR_SIGN_POS);
    }
  else /* round away from zero */
    {
    add_one_ulp:
      ap[0] += MPFR_LIMB_ONE << sh;
      ap[1] += (ap[0] == 0);
      if (ap[1] == 0)
        {
          ap[1] = MPFR_LIMB_HIGHBIT;
          if (MPFR_UNLIKELY(ax + 1 > __gmpfr_emax))
            return mpfr_overflow (a, rnd_mode, MPFR_SIGN_POS);
          MPFR_ASSERTD(ax + 1 <= __gmpfr_emax);
          MPFR_ASSERTD(ax + 1 >= __gmpfr_emin);
          MPFR_SET_EXP (a, ax + 1);
        }
      MPFR_RET(MPFR_SIGN_POS);
    }
}

/* Special code for 2*GMP_NUMB_BITS < prec(a) < 3*GMP_NUMB_BITS and
   2*GMP_NUMB_BITS < prec(b) <= 3*GMP_NUMB_BITS. */
static int
mpfr_sqr_3 (mpfr_ptr a, mpfr_srcptr b, mpfr_rnd_t rnd_mode, mpfr_prec_t p)
{
  mp_limb_t a0, a1, a2, h, l;
  mpfr_limb_ptr ap = MPFR_MANT(a);
  mpfr_exp_t ax = 2 * MPFR_GET_EXP(b);
  mpfr_prec_t sh = 3 * GMP_NUMB_BITS - p;
  mp_limb_t rb, sb, sb2, mask = MPFR_LIMB_MASK(sh);
  mp_limb_t *bp = MPFR_MANT(b);

  /* we store the upper 3-limb product in a2, a1, a0:
     b2^2, 2*b2*b1, 2*b2*b0+b1^2 */
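  /* Full expansion (illustration only): with B = 2^GMP_NUMB_BITS and
     b = b2*B^2 + b1*B + b0,
       b^2 = b2^2*B^4 + 2*b2*b1*B^3 + (2*b2*b0 + b1^2)*B^2
             + 2*b1*b0*B + b0^2.
     The terms through B^2 are accumulated below (keeping only their
     contribution to the top three limbs); the dropped low limbs and the
     two lowest terms are covered by the 5-ulp error bound stated further
     down. */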

  /* first compute b2*b1 and b2*b0, which will be shifted by 1 */
  umul_ppmm (a1, a0, bp[2], bp[1]);
  umul_ppmm (h, l, bp[2], bp[0]);
  a0 += h;
  a1 += (a0 < h);
  /* now a1, a0 contain b2*b1 + floor(b2*b0/B): there can be no overflow
     since b2*b1*B + b2*b0 <= b2*(b1*B+b0) <= b2*(B^2-1) < B^3 */

  /* multiply a2, a1, a0 by 2 */
  a2 = a1 >> (GMP_NUMB_BITS - 1);
  a1 = (a1 << 1) | (a0 >> (GMP_NUMB_BITS - 1));
  a0 = (a0 << 1);

  /* add b2^2 */
  umul_ppmm (h, l, bp[2], bp[2]);
  a1 += l;
  a2 += h + (a1 < l);

  /* add b1^2 */
  umul_ppmm (h, l, bp[1], bp[1]);
  a0 += h;
  a1 += (a0 < h);
  a2 += (a1 == 0 && a0 < h);

  /* Now the approximate product {a2, a1, a0} has an error of less than
     5 ulps (3 ulps for the ignored low limbs of 2*b2*b0+b1^2,
     plus 2 ulps for the ignored 2*b1*b0 (plus b0^2)).
     Since we might shift by 1 bit, we make sure the low sh-2 bits of a0
     are not 0, -1, -2, -3 or -4. */

  if (MPFR_LIKELY(((a0 + 4) & (mask >> 2)) > 4))
    sb = sb2 = 1; /* result cannot be exact in that case */
  else
    {
      mp_limb_t t[6];
      mpn_sqr (t, bp, 3);
      a2 = t[5];
      a1 = t[4];
      a0 = t[3];
      sb = t[2];
      sb2 = t[1] | t[0];
    }
  if (a2 < MPFR_LIMB_HIGHBIT)
    {
      ax --;
      a2 = (a2 << 1) | (a1 >> (GMP_NUMB_BITS - 1));
      a1 = (a1 << 1) | (a0 >> (GMP_NUMB_BITS - 1));
      a0 = (a0 << 1) | (sb >> (GMP_NUMB_BITS - 1));
      sb <<= 1;
      /* no need to shift sb2: we only need to know if it is zero or not */
    }
  ap[2] = a2;
  ap[1] = a1;
  rb = a0 & (MPFR_LIMB_ONE << (sh - 1));
  sb |= ((a0 & mask) ^ rb) | sb2;
  ap[0] = a0 & ~mask;

  MPFR_SIGN(a) = MPFR_SIGN_POS;

  /* rounding */
  if (MPFR_UNLIKELY(ax > __gmpfr_emax))
    return mpfr_overflow (a, rnd_mode, MPFR_SIGN_POS);

  /* Warning: underflow should be checked *after* rounding, thus when rounding
     away and when a > 0.111...111*2^(emin-1), or when rounding to nearest and
     a >= 0.111...111[1]*2^(emin-1), there is no underflow. */
  if (MPFR_UNLIKELY(ax < __gmpfr_emin))
    {
      if (ax == __gmpfr_emin - 1 &&
          ap[2] == MPFR_LIMB_MAX &&
          ap[1] == MPFR_LIMB_MAX &&
          ap[0] == ~mask &&
          ((rnd_mode == MPFR_RNDN && rb) ||
           (MPFR_IS_LIKE_RNDA (rnd_mode, 0) && (rb | sb))))
        goto rounding; /* no underflow */
      /* for RNDN, mpfr_underflow always rounds away, thus for |a| <= 2^(emin-2)
         we have to change to RNDZ */
      if (rnd_mode == MPFR_RNDN &&
          (ax < __gmpfr_emin - 1 ||
           (ap[2] == MPFR_LIMB_HIGHBIT && ap[1] == 0 && ap[0] == 0
            && (rb | sb) == 0)))
        rnd_mode = MPFR_RNDZ;
      return mpfr_underflow (a, rnd_mode, MPFR_SIGN_POS);
    }

 rounding:
  MPFR_EXP (a) = ax; /* Don't use MPFR_SET_EXP since ax might be < __gmpfr_emin
                        in the cases "goto rounding" above. */
  if ((rb == 0 && sb == 0) || rnd_mode == MPFR_RNDF)
    {
      MPFR_ASSERTD(ax >= __gmpfr_emin);
      MPFR_RET (0);
    }
  else if (rnd_mode == MPFR_RNDN)
    {
      if (rb == 0 || (sb == 0 && (ap[0] & (MPFR_LIMB_ONE << sh)) == 0))
        goto truncate;
      else
        goto add_one_ulp;
    }
  else if (MPFR_IS_LIKE_RNDZ (rnd_mode, 0))
    {
    truncate:
      MPFR_ASSERTD(ax >= __gmpfr_emin);
      MPFR_RET(-MPFR_SIGN_POS);
    }
  else /* round away from zero */
    {
    add_one_ulp:
      ap[0] += MPFR_LIMB_ONE << sh;
      ap[1] += (ap[0] == 0);
      ap[2] += (ap[1] == 0) && (ap[0] == 0);
      if (ap[2] == 0)
        {
          ap[2] = MPFR_LIMB_HIGHBIT;
          if (MPFR_UNLIKELY(ax + 1 > __gmpfr_emax))
            return mpfr_overflow (a, rnd_mode, MPFR_SIGN_POS);
          MPFR_ASSERTD(ax + 1 <= __gmpfr_emax);
          MPFR_ASSERTD(ax + 1 >= __gmpfr_emin);
          MPFR_SET_EXP (a, ax + 1);
        }
      MPFR_RET(MPFR_SIGN_POS);
    }
}

#endif /* !defined(MPFR_GENERIC_ABI) && ... */

/* Note: mpfr_sqr will call mpfr_mul if bn > MPFR_SQR_THRESHOLD,
   in order to use Mulders' mulhigh, which is handled only here
   to avoid partial code duplication. There is some overhead due
   to the additional tests, but the slowdown should not be noticeable
   as this code is not executed for very small precisions. */

int
mpfr_sqr (mpfr_ptr a, mpfr_srcptr b, mpfr_rnd_t rnd_mode)
{
  int cc, inexact;
  mpfr_exp_t ax;
  mp_limb_t *tmp;
  mp_limb_t b1;
  mpfr_prec_t aq, bq;
  mp_size_t bn, tn;
  MPFR_TMP_DECL(marker);

  MPFR_LOG_FUNC
    (("x[%Pd]=%.*Rg rnd=%d", mpfr_get_prec (b), mpfr_log_prec, b, rnd_mode),
     ("y[%Pd]=%.*Rg inexact=%d",
      mpfr_get_prec (a), mpfr_log_prec, a, inexact));

  /* deal with special cases */
  if (MPFR_UNLIKELY(MPFR_IS_SINGULAR(b)))
    {
      if (MPFR_IS_NAN(b))
        {
          MPFR_SET_NAN(a);
          MPFR_RET_NAN;
        }
      MPFR_SET_POS (a);
      if (MPFR_IS_INF(b))
        MPFR_SET_INF(a);
      else
        ( MPFR_ASSERTD(MPFR_IS_ZERO(b)), MPFR_SET_ZERO(a) );
      MPFR_RET(0);
    }
  aq = MPFR_GET_PREC(a);
  bq = MPFR_GET_PREC(b);

#if !defined(MPFR_GENERIC_ABI) && (GMP_NUMB_BITS == 32 || GMP_NUMB_BITS == 64)
  if (aq == bq)
    {
      if (aq < GMP_NUMB_BITS)
        return mpfr_sqr_1 (a, b, rnd_mode, aq);

      if (GMP_NUMB_BITS < aq && aq < 2 * GMP_NUMB_BITS)
        return mpfr_sqr_2 (a, b, rnd_mode, aq);

      if (aq == GMP_NUMB_BITS)
        return mpfr_sqr_1n (a, b, rnd_mode);

      if (2 * GMP_NUMB_BITS < aq && aq < 3 * GMP_NUMB_BITS)
        return mpfr_sqr_3 (a, b, rnd_mode, aq);
    }
#endif

  ax = 2 * MPFR_GET_EXP (b);
  MPFR_ASSERTN (2 * (mpfr_uprec_t) bq <= MPFR_PREC_MAX);

  bn = MPFR_LIMB_SIZE (b); /* number of limbs of b */
  tn = MPFR_PREC2LIMBS (2 * bq); /* number of limbs of square,
                                    2*bn or 2*bn-1 */
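  /* Example (illustration only), with GMP_NUMB_BITS = 64: bq = 65 gives
     bn = 2 and tn = MPFR_PREC2LIMBS(130) = 3 = 2*bn-1, whereas bq = 128
     gives bn = 2 and tn = 4 = 2*bn. */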

  if (MPFR_UNLIKELY(bn > MPFR_SQR_THRESHOLD))
    /* the following line should not be replaced by mpfr_sqr,
       otherwise we'll get an infinite loop! */
    return mpfr_mul (a, b, b, rnd_mode);

  MPFR_TMP_MARK(marker);
  tmp = MPFR_TMP_LIMBS_ALLOC (2 * bn);

  /* square the mantissa in temporarily allocated space */
  mpn_sqr (tmp, MPFR_MANT(b), bn);
  b1 = tmp[2 * bn - 1];

  /* now tmp[0]..tmp[2*bn-1] contains the square of the mantissa of b,
     with tmp[2*bn-1]>=2^(GMP_NUMB_BITS-2) */
  b1 >>= GMP_NUMB_BITS - 1; /* msb of the square */

  /* if the mantissa of b is uniformly distributed in ]1/2, 1], then its
     square is in ]1/4, 1/2] with probability sqrt(2)-1 ~ 0.414
     and in [1/2, 1] with probability 2-sqrt(2) ~ 0.586 */
  tmp += 2 * bn - tn; /* +0 or +1 */
  if (MPFR_UNLIKELY(b1 == 0))
    mpn_lshift (tmp, tmp, tn, 1); /* the tn limbs fit in the 2*bn allocated
                                     limbs, so there is no buffer overrun */

  cc = mpfr_round_raw (MPFR_MANT (a), tmp, 2 * bq, 0, aq, rnd_mode, &inexact);
  /* cc = 1 ==> result is a power of two */
  if (MPFR_UNLIKELY(cc))
    MPFR_MANT(a)[MPFR_LIMB_SIZE(a)-1] = MPFR_LIMB_HIGHBIT;

  MPFR_TMP_FREE(marker);
  {
    mpfr_exp_t ax2 = ax + ((int) b1 - 1 + cc);
    if (MPFR_UNLIKELY( ax2 > __gmpfr_emax))
      return mpfr_overflow (a, rnd_mode, MPFR_SIGN_POS);
    if (MPFR_UNLIKELY( ax2 < __gmpfr_emin))
      {
        /* In the round-to-nearest mode, if the exponent of the exact
           result (i.e. before rounding, i.e. without taking cc into account)
           is < __gmpfr_emin - 1, or the exact result is a power of 2 (i.e. if
           b is a power of 2), then round to zero. */
        if (rnd_mode == MPFR_RNDN &&
            (ax + (mpfr_exp_t) b1 < __gmpfr_emin || mpfr_powerof2_raw (b)))
          rnd_mode = MPFR_RNDZ;
        return mpfr_underflow (a, rnd_mode, MPFR_SIGN_POS);
      }
    MPFR_SET_EXP (a, ax2);
    MPFR_SET_POS (a);
  }
  MPFR_RET (inexact);
}
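
/* Usage sketch (editorial illustration, not part of the library): a minimal
   caller of the public mpfr_sqr entry point, assuming the MPFR headers are
   installed and the program is linked with -lmpfr -lgmp:

     #include <stdio.h>
     #include <mpfr.h>

     int main (void)
     {
       mpfr_t x, y;
       mpfr_init2 (x, 53);                     // operand with 53-bit precision
       mpfr_init2 (y, 53);                     // result with the same precision
       mpfr_set_ui (x, 3, MPFR_RNDN);          // x = 3
       int inex = mpfr_sqr (y, x, MPFR_RNDN);  // y = x^2, correctly rounded
       mpfr_printf ("y = %.10Rg, inexact = %d\n", y, inex);
       mpfr_clear (x);
       mpfr_clear (y);
       return 0;
     }
*/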