/* mpfr_rec_sqrt -- inverse square root

Copyright 2008, 2009, 2010, 2011, 2012, 2013 Free Software Foundation, Inc.
Contributed by the AriC and Caramel projects, INRIA.

This file is part of the GNU MPFR Library.

The GNU MPFR Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at your
option) any later version.

The GNU MPFR Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MPFR Library; see the file COPYING.LESSER. If not, see
http://www.gnu.org/licenses/ or write to the Free Software Foundation, Inc.,
51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */

#include <stdio.h>
#include <stdlib.h>

#define MPFR_NEED_LONGLONG_H /* for umul_ppmm */
#include "mpfr-impl.h"

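/* LIMB_SIZE(x): number of limbs needed to hold x bits,
   i.e., ceil(x/GMP_NUMB_BITS), assuming x >= 1. */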
#define LIMB_SIZE(x) ((((x)-1)>>MPFR_LOG2_GMP_NUMB_BITS) + 1)

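/* MPFR_COM_N(x,y,n): store in {x, n} the one's complement of {y, n}. */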
#define MPFR_COM_N(x,y,n) \
  { \
    mp_size_t i; \
    for (i = 0; i < n; i++) \
      *((x)+i) = ~*((y)+i); \
  }

/* Put in X a p-bit approximation of 1/sqrt(A),
   where X = {x, n}/B^n, n = ceil(p/GMP_NUMB_BITS),
   A = 2^(1+as)*{a, an}/B^an, as is 0 or 1, an = ceil(ap/GMP_NUMB_BITS),
   where B = 2^GMP_NUMB_BITS.

   We have 1 <= A < 4 and 1/2 <= X < 1.

   The error in the approximate result with respect to the true
   value 1/sqrt(A) is bounded by 1 ulp(X), i.e., 2^{-p} since 1/2 <= X < 1.

   Note: x and a are left-aligned, i.e., the most significant bit of
   a[an-1] is set, and so is the most significant bit of the output x[n-1].

   If p is not a multiple of GMP_NUMB_BITS, the extra low bits of the input
   A are taken into account to compute the approximation of 1/sqrt(A), but
   whether or not they are zero, the error between X and 1/sqrt(A) is bounded
   by 1 ulp(X) [in precision p].
   The extra low bits of the output X (if p is not a multiple of
   GMP_NUMB_BITS) are set to 0.

   Assumptions:
   (1) A should be normalized, i.e., the most significant bit of a[an-1]
       should be 1. If as=0, we have 1 <= A < 2; if as=1, we have 2 <= A < 4.
   (2) p >= 12
   (3) {a, an} and {x, n} should not overlap
   (4) GMP_NUMB_BITS >= 12 and is even

   Note: this routine is much more efficient when ap is small compared to p,
   including the case where ap <= GMP_NUMB_BITS, thus it can be used to
   implement an efficient mpfr_rec_sqrt_ui function.

   References:
   [1] Modern Computer Algebra, Richard Brent and Paul Zimmermann,
       http://www.loria.fr/~zimmerma/mca/pub226.html
*/
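/* Illustration of the interface above (a sketch, not part of the library):
   with GMP_NUMB_BITS = 64, a 53-bit approximation of 1/sqrt(2) could be
   obtained as follows. Since A = 2^(1+as)*{a, 1}/B = 2 when
   a[0] = MPFR_LIMB_HIGHBIT and as = 1:

     mp_limb_t a0 = MPFR_LIMB_HIGHBIT, x0;
     mpfr_mpn_rec_sqrt (&x0, 53, &a0, 64, 1);

   Afterwards x0/2^64 approximates 1/sqrt(2) = 0.70710678... within 2^{-53},
   with the 11 low bits of x0 set to 0. */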
static void
mpfr_mpn_rec_sqrt (mpfr_limb_ptr x, mpfr_prec_t p,
                   mpfr_limb_srcptr a, mpfr_prec_t ap, int as)
{
  /* the following T1 and T2 are bipartite tables giving initial
     approximation for the inverse square root, with 13-bit input split in
     5+4+4, and 11-bit output. More precisely, if 2048 <= i < 8192,
     with i = a*2^8 + b*2^4 + c, we use for approximation of
     2048/sqrt(i/2048) the value x = T1[16*(a-8)+b] + T2[16*(a-8)+c].
     The largest error is obtained for i = 2054, where x = 2044,
     and 2048/sqrt(i/2048) = 2045.006576...
  */
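  /* Worked instance of the lookup (illustration only): i = 2054 splits as
     a = 8, b = 0, c = 6 (2054 = 8*2^8 + 0*2^4 + 6), so the approximation is
     T1[16*(8-8)+0] + T2[16*(8-8)+6] = 2040 + 4 = 2044, to be compared with
     2048/sqrt(2054/2048) = 2045.006..., the largest error over the table. */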
  static short int T1[384] = {
    2040, 2033, 2025, 2017, 2009, 2002, 1994, 1987, 1980, 1972, 1965, 1958, 1951,
    1944, 1938, 1931, /* a=8 */
    1925, 1918, 1912, 1905, 1899, 1892, 1886, 1880, 1874, 1867, 1861, 1855, 1849,
    1844, 1838, 1832, /* a=9 */
    1827, 1821, 1815, 1810, 1804, 1799, 1793, 1788, 1783, 1777, 1772, 1767, 1762,
    1757, 1752, 1747, /* a=10 */
    1742, 1737, 1733, 1728, 1723, 1718, 1713, 1709, 1704, 1699, 1695, 1690, 1686,
    1681, 1677, 1673, /* a=11 */
    1669, 1664, 1660, 1656, 1652, 1647, 1643, 1639, 1635, 1631, 1627, 1623, 1619,
    1615, 1611, 1607, /* a=12 */
    1603, 1600, 1596, 1592, 1588, 1585, 1581, 1577, 1574, 1570, 1566, 1563, 1559,
    1556, 1552, 1549, /* a=13 */
    1545, 1542, 1538, 1535, 1532, 1528, 1525, 1522, 1518, 1515, 1512, 1509, 1505,
    1502, 1499, 1496, /* a=14 */
    1493, 1490, 1487, 1484, 1481, 1478, 1475, 1472, 1469, 1466, 1463, 1460, 1457,
    1454, 1451, 1449, /* a=15 */
    1446, 1443, 1440, 1438, 1435, 1432, 1429, 1427, 1424, 1421, 1419, 1416, 1413,
    1411, 1408, 1405, /* a=16 */
    1403, 1400, 1398, 1395, 1393, 1390, 1388, 1385, 1383, 1380, 1378, 1375, 1373,
    1371, 1368, 1366, /* a=17 */
    1363, 1360, 1358, 1356, 1353, 1351, 1349, 1346, 1344, 1342, 1340, 1337, 1335,
    1333, 1331, 1329, /* a=18 */
    1327, 1325, 1323, 1321, 1319, 1316, 1314, 1312, 1310, 1308, 1306, 1304, 1302,
    1300, 1298, 1296, /* a=19 */
    1294, 1292, 1290, 1288, 1286, 1284, 1282, 1280, 1278, 1276, 1274, 1272, 1270,
    1268, 1266, 1265, /* a=20 */
    1263, 1261, 1259, 1257, 1255, 1253, 1251, 1250, 1248, 1246, 1244, 1242, 1241,
    1239, 1237, 1235, /* a=21 */
    1234, 1232, 1230, 1229, 1227, 1225, 1223, 1222, 1220, 1218, 1217, 1215, 1213,
    1212, 1210, 1208, /* a=22 */
    1206, 1204, 1203, 1201, 1199, 1198, 1196, 1195, 1193, 1191, 1190, 1188, 1187,
    1185, 1184, 1182, /* a=23 */
    1181, 1180, 1178, 1177, 1175, 1174, 1172, 1171, 1169, 1168, 1166, 1165, 1163,
    1162, 1160, 1159, /* a=24 */
    1157, 1156, 1154, 1153, 1151, 1150, 1149, 1147, 1146, 1144, 1143, 1142, 1140,
    1139, 1137, 1136, /* a=25 */
    1135, 1133, 1132, 1131, 1129, 1128, 1127, 1125, 1124, 1123, 1121, 1120, 1119,
    1117, 1116, 1115, /* a=26 */
    1114, 1113, 1111, 1110, 1109, 1108, 1106, 1105, 1104, 1103, 1101, 1100, 1099,
    1098, 1096, 1095, /* a=27 */
    1093, 1092, 1091, 1090, 1089, 1087, 1086, 1085, 1084, 1083, 1081, 1080, 1079,
    1078, 1077, 1076, /* a=28 */
    1075, 1073, 1072, 1071, 1070, 1069, 1068, 1067, 1065, 1064, 1063, 1062, 1061,
    1060, 1059, 1058, /* a=29 */
    1057, 1056, 1055, 1054, 1052, 1051, 1050, 1049, 1048, 1047, 1046, 1045, 1044,
    1043, 1042, 1041, /* a=30 */
    1040, 1039, 1038, 1037, 1036, 1035, 1034, 1033, 1032, 1031, 1030, 1029, 1028,
    1027, 1026, 1025 /* a=31 */
  };
  static unsigned char T2[384] = {
    7, 7, 6, 6, 5, 5, 4, 4, 4, 3, 3, 2, 2, 1, 1, 0, /* a=8 */
    6, 5, 5, 5, 4, 4, 3, 3, 3, 2, 2, 2, 1, 1, 0, 0, /* a=9 */
    5, 5, 4, 4, 4, 3, 3, 3, 2, 2, 2, 1, 1, 1, 0, 0, /* a=10 */
    4, 4, 3, 3, 3, 3, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, /* a=11 */
    3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, /* a=12 */
    3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, /* a=13 */
    3, 3, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 0, 0, 0, 0, /* a=14 */
    2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, /* a=15 */
    2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, /* a=16 */
    2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, /* a=17 */
    3, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 0, /* a=18 */
    2, 2, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, /* a=19 */
    1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, /* a=20 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, /* a=21 */
    1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* a=22 */
    2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, /* a=23 */
    1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* a=24 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, /* a=25 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, /* a=26 */
    1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* a=27 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, /* a=28 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, /* a=29 */
    1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* a=30 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* a=31 */
  };
  mp_size_t n = LIMB_SIZE(p);   /* number of limbs of X */
  mp_size_t an = LIMB_SIZE(ap); /* number of limbs of A */

  /* A should be normalized */
  MPFR_ASSERTD((a[an - 1] & MPFR_LIMB_HIGHBIT) != 0);
  /* We should have enough bits in one limb and GMP_NUMB_BITS should be even.
     Since that does not depend on MPFR, we always check this. */
  MPFR_ASSERTN((GMP_NUMB_BITS >= 12) && ((GMP_NUMB_BITS & 1) == 0));
  /* {a, an} and {x, n} should not overlap */
  MPFR_ASSERTD((a + an <= x) || (x + n <= a));
  MPFR_ASSERTD(p >= 11);

  if (MPFR_UNLIKELY(an > n)) /* we can cut the input to n limbs */
    {
      a += an - n;
      an = n;
    }

  if (p == 11) /* should happen only from recursive calls */
    {
      unsigned long i, ab, ac;
      mp_limb_t t;

      /* take the 12+as most significant bits of A */
      i = a[an - 1] >> (GMP_NUMB_BITS - (12 + as));
      /* if one wants faithful rounding for p=11, replace #if 0 by #if 1 */
      ab = i >> 4;
      ac = (ab & 0x3F0) | (i & 0x0F);
      t = (mp_limb_t) T1[ab - 0x80] + (mp_limb_t) T2[ac - 0x80];
      x[0] = t << (GMP_NUMB_BITS - p);
    }
  else /* p >= 12 */
    {
      mpfr_prec_t h, pl;
      mpfr_limb_ptr r, s, t, u;
      mp_size_t xn, rn, th, ln, tn, sn, ahn, un;
      mp_limb_t neg, cy, cu;
      MPFR_TMP_DECL(marker);

      /* compared to Algorithm 3.9 of [1], we have {a, an} = A/2 if as=0,
         and A/4 if as=1. */

      /* h = max(11, ceil((p+3)/2)) is the bitsize of the recursive call */
      h = (p < 18) ? 11 : (p >> 1) + 2;

      xn = LIMB_SIZE(h);     /* limb size of the recursive Xh */
      rn = LIMB_SIZE(2 * h); /* a priori limb size of Xh^2 */
      ln = n - xn;           /* remaining limbs to be computed */

      /* Since |Xh - A^{-1/2}| <= 2^{-h}, then by multiplying by Xh + A^{-1/2}
         we get |Xh^2 - 1/A| <= 2^{-h+1}, thus |A*Xh^2 - 1| <= 2^{-h+3},
         thus the h-3 most significant bits of t should be zero,
         which is in fact h+1+as-3 because of the normalization of A.
         This corresponds to th=floor((h+1+as-3)/GMP_NUMB_BITS) limbs.

         More precisely we have |Xh^2 - 1/A| <= 2^{-h} * (Xh + A^{-1/2})
         <= 2^{-h} * (2 A^{-1/2} + 2^{-h}) <= 2.001 * 2^{-h} * A^{-1/2}
         since A < 4 and h >= 11, thus
         |A*Xh^2 - 1| <= 2.001 * 2^{-h} * A^{1/2} <= 1.001 * 2^{2-h}.
         This is sufficient to prove that the upper limb of {t,tn} below is
         less than 0.501 * 2^GMP_NUMB_BITS, thus cu = 0 below.
      */
      th = (h + 1 + as - 3) >> MPFR_LOG2_GMP_NUMB_BITS;
      tn = LIMB_SIZE(2 * h + 1 + as);
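      /* Worked instance of the sizes above (illustration only): for p = 100,
         GMP_NUMB_BITS = 64 and as = 0, we get h = 52, n = 2, xn = 1, rn = 2,
         ln = 1, th = (52+1-3)>>6 = 0 and tn = LIMB_SIZE(105) = 2. */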

      /* we need h+1+as bits of a */
      ahn = LIMB_SIZE(h + 1 + as); /* number of high limbs of A
                                      needed for the recursive call */
      if (MPFR_UNLIKELY(ahn > an))
        ahn = an;
      mpfr_mpn_rec_sqrt (x + ln, h, a + an - ahn, ahn * GMP_NUMB_BITS, as);
      /* the h most significant bits of X are set, X has ceil(h/GMP_NUMB_BITS)
         limbs, the low (-h) % GMP_NUMB_BITS bits are zero */

      /* compared to Algorithm 3.9 of [1], we have {x+ln,xn} = X_h */

      MPFR_TMP_MARK (marker);
      /* first step: square X in r, result is exact */
      un = xn + (tn - th);
      /* We use the same temporary buffer to store r and u: r needs 2*xn
         limbs where u needs xn+(tn-th) limbs. Since tn can store at least
         2h bits, and th at most h bits, then tn-th can store at least h bits,
         thus tn - th >= xn, and reserving the space for u is enough. */
      MPFR_ASSERTD(2 * xn <= un);
      u = r = MPFR_TMP_LIMBS_ALLOC (un);
      if (2 * h <= GMP_NUMB_BITS) /* xn=rn=1, and since p <= 2h-3, n=1,
                                     thus ln = 0 */
        {
          MPFR_ASSERTD(ln == 0);
          cy = x[0] >> (GMP_NUMB_BITS >> 1);
          r ++;
          r[0] = cy * cy;
        }
      else if (xn == 1) /* xn=1, rn=2 */
        umul_ppmm(r[1], r[0], x[ln], x[ln]);
      else
        {
          mpn_mul_n (r, x + ln, x + ln, xn);
          /* we have {r, 2*xn} = X_h^2 */
          if (rn < 2 * xn)
            r ++;
        }
      /* now the 2h most significant bits of {r, rn} contain X^2, r has rn
         limbs, and the low (-2h) % GMP_NUMB_BITS bits are zero */

      /* Second step: s <- A * (r^2), and truncate the low ap bits,
         i.e., at weight 2^{-2h} (s is aligned to the low significant bits)
      */
      sn = an + rn;
      s = MPFR_TMP_LIMBS_ALLOC (sn);
      if (rn == 1) /* rn=1 implies n=1, since rn*GMP_NUMB_BITS >= 2h,
                      and 2h >= p+3 */
        {
          /* necessarily p <= GMP_NUMB_BITS-3: we can ignore the two low
             bits from A */
          /* since n=1, and we ensured an <= n, we also have an=1 */
          MPFR_ASSERTD(an == 1);
          umul_ppmm (s[1], s[0], r[0], a[0]);
        }
      else
        {
          /* we have p <= n * GMP_NUMB_BITS
             2h <= rn * GMP_NUMB_BITS with p+3 <= 2h <= p+4
             thus n <= rn <= n + 1 */
          MPFR_ASSERTD(rn <= n + 1);
          /* since we ensured an <= n, we have an <= rn */
          MPFR_ASSERTD(an <= rn);
          mpn_mul (s, r, rn, a, an);
          /* s should be near B^sn/2^(1+as), thus s[sn-1] is either
             100000... or 011111... if as=0, or
             010000... or 001111... if as=1.
             We ignore the bits of s after the first 2h+1+as ones.
             We have {s, rn+an} = A*X_h^2/2 if as=0, A*X_h^2/4 if as=1. */
        }

      /* We ignore the bits of s after the first 2h+1+as ones: s has rn + an
         limbs, where rn = LIMBS(2h), an=LIMBS(a), and tn = LIMBS(2h+1+as). */
      t = s + sn - tn; /* pointer to low limb of the high part of t */
      /* the upper h-3 bits of 1-t should be zero,
         where 1 corresponds to the most significant bit of t[tn-1] if as=0,
         and to the 2nd most significant bit of t[tn-1] if as=1 */

      /* compute t <- 1 - t, which is B^tn - {t, tn+1},
         with rounding toward -Inf, i.e., rounding the input t toward +Inf.
         We could only modify the low tn - th limbs from t, but it gives only
         a small speedup, and would make the code more complex.
      */
      neg = t[tn - 1] & (MPFR_LIMB_HIGHBIT >> as);
      if (neg == 0) /* Ax^2 < 1: we have t = th + eps, where 0 <= eps < ulp(th)
                       is the part truncated above, thus 1 - t rounded to -Inf
                       is 1 - th - ulp(th) */
        {
          /* since the 1+as most significant bits of t are zero, set them
             to 1 before the one-complement */
          t[tn - 1] |= MPFR_LIMB_HIGHBIT | (MPFR_LIMB_HIGHBIT >> as);
          MPFR_COM_N (t, t, tn);
          /* we should add 1 here to get 1-th complement, and subtract 1 for
             -ulp(th), thus we do nothing */
        }
      else /* negative case: we want 1 - t rounded toward -Inf, i.e.,
              th + eps rounded toward +Inf, which is th + ulp(th):
              we discard the bit corresponding to 1,
              and we add 1 to the least significant bit of t */
        {
          t[tn - 1] ^= neg;
          mpn_add_1 (t, t, tn, 1);
        }
      tn -= th; /* we know at least th = floor((h+1+as-3)/GMP_NUMB_BITS) of
                   the high limbs of {t, tn} are zero */

      /* tn = rn - th, where rn * GMP_NUMB_BITS >= 2*h and
         th * GMP_NUMB_BITS <= h+1+as-3, thus tn > 0 */
      MPFR_ASSERTD(tn > 0);

      /* u <- x * t, where {t, tn} contains at least h+3 bits,
         and {x, xn} contains h bits, thus tn >= xn */
      MPFR_ASSERTD(tn >= xn);
      if (tn == 1) /* necessarily xn=1 */
        umul_ppmm (u[1], u[0], t[0], x[ln]);
      else
        mpn_mul (u, t, tn, x + ln, xn);

      /* we have {u, tn+xn} = T_l X_h/2 if as=0, T_l X_h/4 if as=1 */

      /* we have already discarded the upper th high limbs of t, thus we only
         have to consider the upper n - th limbs of u */
      un = n - th; /* un cannot be zero, since p <= n*GMP_NUMB_BITS,
                      h = ceil((p+3)/2) <= (p+4)/2,
                      th*GMP_NUMB_BITS <= h-1 <= p/2+1,
                      thus (n-th)*GMP_NUMB_BITS >= p/2-1.
                   */
      MPFR_ASSERTD(un > 0);
      u += (tn + xn) - un; /* xn + tn - un = xn + (original_tn - th) - (n - th)
                                           = xn + original_tn - n
                              = LIMBS(h) + LIMBS(2h+1+as) - LIMBS(p) > 0
                              since 2h >= p+3 */
      MPFR_ASSERTD(tn + xn > un); /* will allow to access u[-1] below */

      /* In case as=0, u contains |x*(1-Ax^2)/2|, which is exactly what we
         need to add or subtract.
         In case as=1, u contains |x*(1-Ax^2)/4|, thus we need to multiply
         u by 2. */

      if (as == 1)
        /* shift on un+1 limbs to get most significant bit of u[-1] into
           least significant bit of u[0] */
        mpn_lshift (u - 1, u - 1, un + 1, 1);

      /* now {u,un} represents U / 2 from Algorithm 3.9 */

      pl = n * GMP_NUMB_BITS - p; /* low bits from x */
      /* We want that the low pl bits are zero after rounding to nearest,
         thus we round u to nearest at bit pl-1 of u[0] */
      if (pl > 0)
        {
          cu = mpn_add_1 (u, u, un, u[0] & (MPFR_LIMB_ONE << (pl - 1)));
          /* mask bits 0..pl-1 of u[0] */
          u[0] &= ~MPFR_LIMB_MASK(pl);
        }
      else /* round bit is in u[-1] */
        cu = mpn_add_1 (u, u, un, u[-1] >> (GMP_NUMB_BITS - 1));
      MPFR_ASSERTN(cu == 0);

      /* We already have filled {x + ln, xn = n - ln}, and we want to add or
         subtract {u, un} at position x.
         un = n - th, where th contains <= h+1+as-3 <= h-1 bits
         ln = n - xn, where xn contains >= h bits
         thus un > ln.
         Warning: ln might be zero.
      */
      MPFR_ASSERTD(un > ln);
      /* we can have un = ln + 2, for example with GMP_NUMB_BITS=32 and
         p=62, as=0, then h=33, n=2, th=0, xn=2, thus un=2 and ln=0. */
      MPFR_ASSERTD(un == ln + 1 || un == ln + 2);
      /* the high un-ln limbs of u will overlap the low part of {x+ln,xn},
         we need to add or subtract the overlapping part {u + ln, un - ln} */
      /* Warning! th may be 0, in which case the mpn_add_1 and mpn_sub_1
         below (with size = th) mustn't be used. */
      if (neg == 0)
        {
          if (ln > 0)
            MPN_COPY (x, u, ln);
          cy = mpn_add (x + ln, x + ln, xn, u + ln, un - ln);
          /* cy is the carry at x + (ln + xn) = x + n */
        }
      else /* negative case */
        {
          /* subtract {u+ln, un-ln} from {x+ln,un} */
          cy = mpn_sub (x + ln, x + ln, xn, u + ln, un - ln);
          /* cy is the borrow at x + (ln + xn) = x + n */

          /* cy cannot be non-zero, since the most significant bit of Xh is 1,
             and the correction is bounded by 2^{-h+3} */
          MPFR_ASSERTD(cy == 0);
          if (ln > 0)
            {
              MPFR_COM_N (x, u, ln);
              /* we must add one for the 2-complement ... */
              cy = mpn_add_1 (x, x, n, MPFR_LIMB_ONE);
              /* ... and subtract 1 at x[ln], where n = ln + xn */
              cy -= mpn_sub_1 (x + ln, x + ln, xn, MPFR_LIMB_ONE);
            }
        }

      /* cy can be 1 when A=1, i.e., {a, n} = B^n. In that case we should
         have X = B^n, and setting X to 1-2^{-p} satisfies the error bound
         of 1 ulp. */
      if (MPFR_UNLIKELY(cy != 0))
        {
          cy -= mpn_sub_1 (x, x, n, MPFR_LIMB_ONE << pl);
          MPFR_ASSERTD(cy == 0);
        }

      MPFR_TMP_FREE (marker);
    }
}

int
mpfr_rec_sqrt (mpfr_ptr r, mpfr_srcptr u, mpfr_rnd_t rnd_mode)
{
  mpfr_prec_t rp, up, wp;
  mp_size_t rn, wn;
  int s, cy, inex;
  mpfr_limb_ptr x;
  MPFR_TMP_DECL(marker);

  MPFR_LOG_FUNC
    (("x[%Pu]=%.*Rg rnd=%d", mpfr_get_prec (u), mpfr_log_prec, u, rnd_mode),
     ("y[%Pu]=%.*Rg inexact=%d", mpfr_get_prec (r), mpfr_log_prec, r, inex));

  /* special values */
  if (MPFR_UNLIKELY(MPFR_IS_SINGULAR(u)))
    {
      if (MPFR_IS_NAN(u))
        {
          MPFR_SET_NAN(r);
          MPFR_RET_NAN;
        }
      else if (MPFR_IS_ZERO(u)) /* 1/sqrt(+0) = 1/sqrt(-0) = +Inf */
        {
          /* 0+ or 0- */
          MPFR_SET_INF(r);
          MPFR_SET_POS(r);
          mpfr_set_divby0 ();
          MPFR_RET(0); /* Inf is exact */
        }
      else
        {
          MPFR_ASSERTD(MPFR_IS_INF(u));
          /* 1/sqrt(-Inf) = NAN */
          if (MPFR_IS_NEG(u))
            {
              MPFR_SET_NAN(r);
              MPFR_RET_NAN;
            }
          /* 1/sqrt(+Inf) = +0 */
          MPFR_SET_POS(r);
          MPFR_SET_ZERO(r);
          MPFR_RET(0);
        }
    }

  /* if u < 0, 1/sqrt(u) is NaN */
  if (MPFR_UNLIKELY(MPFR_IS_NEG(u)))
    {
      MPFR_SET_NAN(r);
      MPFR_RET_NAN;
    }

  MPFR_SET_POS(r);

  rp = MPFR_PREC(r); /* output precision */
  up = MPFR_PREC(u); /* input precision */
  wp = rp + 11;      /* initial working precision */

  /* Let u = U*2^e, where e = EXP(u), and 1/2 <= U < 1.
     If e is even, we compute an approximation X of (4U)^{-1/2},
     and the result is X*2^(-(e-2)/2) [case s=1].
     If e is odd, we compute an approximation X of (2U)^{-1/2},
     and the result is X*2^(-(e-1)/2) [case s=0]. */
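  /* Worked instance (illustration only): for u = 3 we have U = 3/4 and
     e = 2 (even), so s = 1, X approximates (4*3/4)^{-1/2} = 1/sqrt(3)
     = 0.577... in [1/2, 1), and the result is X*2^0, i.e., 1/sqrt(3),
     consistent with EXP(r) = -(e - 1 - s)/2 = 0 computed below. */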

  /* parity of the exponent of u */
  s = 1 - ((mpfr_uexp_t) MPFR_GET_EXP (u) & 1);

  rn = LIMB_SIZE(rp);

  /* for the first iteration, if rp + 11 fits into rn limbs, we round wp up
     to a full limb to maximize the chance of rounding, while avoiding the
     allocation of extra space */
  wp = rp + 11;
  if (wp < rn * GMP_NUMB_BITS)
    wp = rn * GMP_NUMB_BITS;
  for (;;)
    {
      MPFR_TMP_MARK (marker);
      wn = LIMB_SIZE(wp);
      if (r == u || wn > rn) /* out of place, i.e., we cannot write to r */
        x = MPFR_TMP_LIMBS_ALLOC (wn);
      else
        x = MPFR_MANT(r);
      mpfr_mpn_rec_sqrt (x, wp, MPFR_MANT(u), up, s);
      /* If the input was not truncated, the error is at most one ulp;
         if the input was truncated, the error is at most two ulps
         (see algorithms.tex). */
      if (MPFR_LIKELY (mpfr_round_p (x, wn, wp - (wp < up),
                                     rp + (rnd_mode == MPFR_RNDN))))
        break;

      /* We detect only now the exact case where u=2^(2e), to avoid
         slowing down the average case. This can happen only when the
         mantissa is exactly 1/2 and the exponent is odd. */
      if (s == 0 && mpfr_cmp_ui_2exp (u, 1, MPFR_EXP(u) - 1) == 0)
        {
          mpfr_prec_t pl = wn * GMP_NUMB_BITS - wp;

          /* we should have x=111...111 */
          mpn_add_1 (x, x, wn, MPFR_LIMB_ONE << pl);
          x[wn - 1] = MPFR_LIMB_HIGHBIT;
          s += 2;
          break; /* go through */
        }
      MPFR_TMP_FREE(marker);

      wp += GMP_NUMB_BITS;
    }
  cy = mpfr_round_raw (MPFR_MANT(r), x, wp, 0, rp, rnd_mode, &inex);
  MPFR_EXP(r) = - (MPFR_EXP(u) - 1 - s) / 2;
  if (MPFR_UNLIKELY(cy != 0))
    {
      MPFR_EXP(r) ++;
      MPFR_MANT(r)[rn - 1] = MPFR_LIMB_HIGHBIT;
    }
  MPFR_TMP_FREE(marker);
  return mpfr_check_range (r, inex, rnd_mode);
}