xref: /dflybsd-src/contrib/mpfr/src/rec_sqrt.c (revision 2786097444a0124b5d33763854de247e230c6629)
14a238c70SJohn Marino /* mpfr_rec_sqrt -- inverse square root
24a238c70SJohn Marino 
3*ab6d115fSJohn Marino Copyright 2008, 2009, 2010, 2011, 2012, 2013 Free Software Foundation, Inc.
4*ab6d115fSJohn Marino Contributed by the AriC and Caramel projects, INRIA.
54a238c70SJohn Marino 
64a238c70SJohn Marino This file is part of the GNU MPFR Library.
74a238c70SJohn Marino 
84a238c70SJohn Marino The GNU MPFR Library is free software; you can redistribute it and/or modify
94a238c70SJohn Marino it under the terms of the GNU Lesser General Public License as published by
104a238c70SJohn Marino the Free Software Foundation; either version 3 of the License, or (at your
114a238c70SJohn Marino option) any later version.
124a238c70SJohn Marino 
134a238c70SJohn Marino The GNU MPFR Library is distributed in the hope that it will be useful, but
144a238c70SJohn Marino WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
154a238c70SJohn Marino or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
164a238c70SJohn Marino License for more details.
174a238c70SJohn Marino 
184a238c70SJohn Marino You should have received a copy of the GNU Lesser General Public License
194a238c70SJohn Marino along with the GNU MPFR Library; see the file COPYING.LESSER.  If not, see
204a238c70SJohn Marino http://www.gnu.org/licenses/ or write to the Free Software Foundation, Inc.,
214a238c70SJohn Marino 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */
224a238c70SJohn Marino 
234a238c70SJohn Marino #include <stdio.h>
244a238c70SJohn Marino #include <stdlib.h>
254a238c70SJohn Marino 
264a238c70SJohn Marino #define MPFR_NEED_LONGLONG_H /* for umul_ppmm */
274a238c70SJohn Marino #include "mpfr-impl.h"
284a238c70SJohn Marino 
/* LIMB_SIZE(x): number of limbs needed to store an x-bit value,
   i.e., ceil(x/GMP_NUMB_BITS); x must be >= 1. */
#define LIMB_SIZE(x) ((((x)-1)>>MPFR_LOG2_GMP_NUMB_BITS) + 1)

/* MPFR_COM_N(x,y,n): set {x,n} to the one's complement of {y,n}.
   x and y may be equal (used below with t,t). The arguments are
   evaluated more than once, so they must be side-effect free. */
#define MPFR_COM_N(x,y,n)                               \
  {                                                     \
    mp_size_t i;                                        \
    for (i = 0; i < n; i++)                             \
      *((x)+i) = ~*((y)+i);                             \
  }
374a238c70SJohn Marino 
384a238c70SJohn Marino /* Put in X a p-bit approximation of 1/sqrt(A),
394a238c70SJohn Marino    where X = {x, n}/B^n, n = ceil(p/GMP_NUMB_BITS),
404a238c70SJohn Marino    A = 2^(1+as)*{a, an}/B^an, as is 0 or 1, an = ceil(ap/GMP_NUMB_BITS),
414a238c70SJohn Marino    where B = 2^GMP_NUMB_BITS.
424a238c70SJohn Marino 
434a238c70SJohn Marino    We have 1 <= A < 4 and 1/2 <= X < 1.
444a238c70SJohn Marino 
454a238c70SJohn Marino    The error in the approximate result with respect to the true
464a238c70SJohn Marino    value 1/sqrt(A) is bounded by 1 ulp(X), i.e., 2^{-p} since 1/2 <= X < 1.
474a238c70SJohn Marino 
484a238c70SJohn Marino    Note: x and a are left-aligned, i.e., the most significant bit of
494a238c70SJohn Marino    a[an-1] is set, and so is the most significant bit of the output x[n-1].
504a238c70SJohn Marino 
514a238c70SJohn Marino    If p is not a multiple of GMP_NUMB_BITS, the extra low bits of the input
524a238c70SJohn Marino    A are taken into account to compute the approximation of 1/sqrt(A), but
534a238c70SJohn Marino    whether or not they are zero, the error between X and 1/sqrt(A) is bounded
544a238c70SJohn Marino    by 1 ulp(X) [in precision p].
554a238c70SJohn Marino    The extra low bits of the output X (if p is not a multiple of GMP_NUMB_BITS)
564a238c70SJohn Marino    are set to 0.
574a238c70SJohn Marino 
584a238c70SJohn Marino    Assumptions:
594a238c70SJohn Marino    (1) A should be normalized, i.e., the most significant bit of a[an-1]
604a238c70SJohn Marino        should be 1. If as=0, we have 1 <= A < 2; if as=1, we have 2 <= A < 4.
614a238c70SJohn Marino    (2) p >= 12
624a238c70SJohn Marino    (3) {a, an} and {x, n} should not overlap
634a238c70SJohn Marino    (4) GMP_NUMB_BITS >= 12 and is even
644a238c70SJohn Marino 
654a238c70SJohn Marino    Note: this routine is much more efficient when ap is small compared to p,
664a238c70SJohn Marino    including the case where ap <= GMP_NUMB_BITS, thus it can be used to
674a238c70SJohn Marino    implement an efficient mpfr_rec_sqrt_ui function.
684a238c70SJohn Marino 
694a238c70SJohn Marino    References:
704a238c70SJohn Marino    [1] Modern Computer Algebra, Richard Brent and Paul Zimmermann,
714a238c70SJohn Marino    http://www.loria.fr/~zimmerma/mca/pub226.html
724a238c70SJohn Marino */
/* Core limb-level routine: computes a p-bit approximation X = {x, n} of
   1/sqrt(A) by a Newton iteration on limbs; see the block comment above
   for the full contract (normalization of A, 1 ulp error bound,
   assumptions (1)-(4)). */
static void
mpfr_mpn_rec_sqrt (mpfr_limb_ptr x, mpfr_prec_t p,
                   mpfr_limb_srcptr a, mpfr_prec_t ap, int as)

{
  /* the following T1 and T2 are bipartite tables giving initial
     approximation for the inverse square root, with 13-bit input split in
     5+4+4, and 11-bit output. More precisely, if 2048 <= i < 8192,
     with i = a*2^8 + b*2^4 + c, we use for approximation of
     2048/sqrt(i/2048) the value x = T1[16*(a-8)+b] + T2[16*(a-8)+c].
     The largest error is obtained for i = 2054, where x = 2044,
     and 2048/sqrt(i/2048) = 2045.006576...
  */
  static short int T1[384] = {
2040, 2033, 2025, 2017, 2009, 2002, 1994, 1987, 1980, 1972, 1965, 1958, 1951,
1944, 1938, 1931, /* a=8 */
1925, 1918, 1912, 1905, 1899, 1892, 1886, 1880, 1874, 1867, 1861, 1855, 1849,
1844, 1838, 1832, /* a=9 */
1827, 1821, 1815, 1810, 1804, 1799, 1793, 1788, 1783, 1777, 1772, 1767, 1762,
1757, 1752, 1747, /* a=10 */
1742, 1737, 1733, 1728, 1723, 1718, 1713, 1709, 1704, 1699, 1695, 1690, 1686,
1681, 1677, 1673, /* a=11 */
1669, 1664, 1660, 1656, 1652, 1647, 1643, 1639, 1635, 1631, 1627, 1623, 1619,
1615, 1611, 1607, /* a=12 */
1603, 1600, 1596, 1592, 1588, 1585, 1581, 1577, 1574, 1570, 1566, 1563, 1559,
1556, 1552, 1549, /* a=13 */
1545, 1542, 1538, 1535, 1532, 1528, 1525, 1522, 1518, 1515, 1512, 1509, 1505,
1502, 1499, 1496, /* a=14 */
1493, 1490, 1487, 1484, 1481, 1478, 1475, 1472, 1469, 1466, 1463, 1460, 1457,
1454, 1451, 1449, /* a=15 */
1446, 1443, 1440, 1438, 1435, 1432, 1429, 1427, 1424, 1421, 1419, 1416, 1413,
1411, 1408, 1405, /* a=16 */
1403, 1400, 1398, 1395, 1393, 1390, 1388, 1385, 1383, 1380, 1378, 1375, 1373,
1371, 1368, 1366, /* a=17 */
1363, 1360, 1358, 1356, 1353, 1351, 1349, 1346, 1344, 1342, 1340, 1337, 1335,
1333, 1331, 1329, /* a=18 */
1327, 1325, 1323, 1321, 1319, 1316, 1314, 1312, 1310, 1308, 1306, 1304, 1302,
1300, 1298, 1296, /* a=19 */
1294, 1292, 1290, 1288, 1286, 1284, 1282, 1280, 1278, 1276, 1274, 1272, 1270,
1268, 1266, 1265, /* a=20 */
1263, 1261, 1259, 1257, 1255, 1253, 1251, 1250, 1248, 1246, 1244, 1242, 1241,
1239, 1237, 1235, /* a=21 */
1234, 1232, 1230, 1229, 1227, 1225, 1223, 1222, 1220, 1218, 1217, 1215, 1213,
1212, 1210, 1208, /* a=22 */
1206, 1204, 1203, 1201, 1199, 1198, 1196, 1195, 1193, 1191, 1190, 1188, 1187,
1185, 1184, 1182, /* a=23 */
1181, 1180, 1178, 1177, 1175, 1174, 1172, 1171, 1169, 1168, 1166, 1165, 1163,
1162, 1160, 1159, /* a=24 */
1157, 1156, 1154, 1153, 1151, 1150, 1149, 1147, 1146, 1144, 1143, 1142, 1140,
1139, 1137, 1136, /* a=25 */
1135, 1133, 1132, 1131, 1129, 1128, 1127, 1125, 1124, 1123, 1121, 1120, 1119,
1117, 1116, 1115, /* a=26 */
1114, 1113, 1111, 1110, 1109, 1108, 1106, 1105, 1104, 1103, 1101, 1100, 1099,
1098, 1096, 1095, /* a=27 */
1093, 1092, 1091, 1090, 1089, 1087, 1086, 1085, 1084, 1083, 1081, 1080, 1079,
1078, 1077, 1076, /* a=28 */
1075, 1073, 1072, 1071, 1070, 1069, 1068, 1067, 1065, 1064, 1063, 1062, 1061,
1060, 1059, 1058, /* a=29 */
1057, 1056, 1055, 1054, 1052, 1051, 1050, 1049, 1048, 1047, 1046, 1045, 1044,
1043, 1042, 1041, /* a=30 */
1040, 1039, 1038, 1037, 1036, 1035, 1034, 1033, 1032, 1031, 1030, 1029, 1028,
1027, 1026, 1025 /* a=31 */
};
  static unsigned char T2[384] = {
    7, 7, 6, 6, 5, 5, 4, 4, 4, 3, 3, 2, 2, 1, 1, 0, /* a=8 */
    6, 5, 5, 5, 4, 4, 3, 3, 3, 2, 2, 2, 1, 1, 0, 0, /* a=9 */
    5, 5, 4, 4, 4, 3, 3, 3, 2, 2, 2, 1, 1, 1, 0, 0, /* a=10 */
    4, 4, 3, 3, 3, 3, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, /* a=11 */
    3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, /* a=12 */
    3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, /* a=13 */
    3, 3, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 0, 0, 0, 0, /* a=14 */
    2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, /* a=15 */
    2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, /* a=16 */
    2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, /* a=17 */
    3, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 0, /* a=18 */
    2, 2, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, /* a=19 */
    1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, /* a=20 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, /* a=21 */
    1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* a=22 */
    2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, /* a=23 */
    1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* a=24 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, /* a=25 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, /* a=26 */
    1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* a=27 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, /* a=28 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, /* a=29 */
    1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* a=30 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* a=31 */
};
  mp_size_t n = LIMB_SIZE(p);   /* number of limbs of X */
  mp_size_t an = LIMB_SIZE(ap); /* number of limbs of A */

  /* A should be normalized */
  MPFR_ASSERTD((a[an - 1] & MPFR_LIMB_HIGHBIT) != 0);
  /* We should have enough bits in one limb and GMP_NUMB_BITS should be even.
     Since that does not depend on MPFR, we always check this. */
  MPFR_ASSERTN((GMP_NUMB_BITS >= 12) && ((GMP_NUMB_BITS & 1) == 0));
  /* {a, an} and {x, n} should not overlap */
  MPFR_ASSERTD((a + an <= x) || (x + n <= a));
  /* p == 11 is accepted here (base case of the recursion below), even
     though the documented assumption for external callers is p >= 12 */
  MPFR_ASSERTD(p >= 11);

  if (MPFR_UNLIKELY(an > n)) /* we can cut the input to n limbs */
    {
      a += an - n;
      an = n;
    }

  if (p == 11) /* should happen only from recursive calls */
    {
      unsigned long i, ab, ac;
      mp_limb_t t;

      /* take the 12+as most significant bits of A */
      i = a[an - 1] >> (GMP_NUMB_BITS - (12 + as));
      /* if one wants faithful rounding for p=11, replace #if 0 by #if 1 */
      /* NOTE(review): the "#if 0" block referred to above is not present
         in this revision -- presumably removed; confirm against upstream
         MPFR sources. */
      ab = i >> 4;
      ac = (ab & 0x3F0) | (i & 0x0F);
      /* bipartite table lookup: 11-bit approximation of 1/sqrt */
      t = (mp_limb_t) T1[ab - 0x80] + (mp_limb_t) T2[ac - 0x80];
      /* left-align the 11-bit result in the single output limb */
      x[0] = t << (GMP_NUMB_BITS - p);
    }
  else /* p >= 12 */
    {
      mpfr_prec_t h, pl;
      mpfr_limb_ptr r, s, t, u;
      mp_size_t xn, rn, th, ln, tn, sn, ahn, un;
      mp_limb_t neg, cy, cu;
      MPFR_TMP_DECL(marker);

      /* compared to Algorithm 3.9 of [1], we have {a, an} = A/2 if as=0,
         and A/4 if as=1. */

      /* h = max(11, ceil((p+3)/2)) is the bitsize of the recursive call */
      h = (p < 18) ? 11 : (p >> 1) + 2;

      xn = LIMB_SIZE(h);       /* limb size of the recursive Xh */
      rn = LIMB_SIZE(2 * h);   /* a priori limb size of Xh^2 */
      ln = n - xn;             /* remaining limbs to be computed */

      /* Since |Xh - A^{-1/2}| <= 2^{-h}, then by multiplying by Xh + A^{-1/2}
         we get |Xh^2 - 1/A| <= 2^{-h+1}, thus |A*Xh^2 - 1| <= 2^{-h+3},
         thus the h-3 most significant bits of t should be zero,
         which is in fact h+1+as-3 because of the normalization of A.
         This corresponds to th=floor((h+1+as-3)/GMP_NUMB_BITS) limbs.

         More precisely we have |Xh^2 - 1/A| <= 2^{-h} * (Xh + A^{-1/2})
         <= 2^{-h} * (2 A^{-1/2} + 2^{-h}) <= 2.001 * 2^{-h} * A^{-1/2}
         since A < 4 and h >= 11, thus
         |A*Xh^2 - 1| <= 2.001 * 2^{-h} * A^{1/2} <= 1.001 * 2^{2-h}.
         This is sufficient to prove that the upper limb of {t,tn} below is
         less than 0.501 * 2^GMP_NUMB_BITS, thus cu = 0 below.
      */
      th = (h + 1 + as - 3) >> MPFR_LOG2_GMP_NUMB_BITS;
      tn = LIMB_SIZE(2 * h + 1 + as);

      /* we need h+1+as bits of a */
      ahn = LIMB_SIZE(h + 1 + as); /* number of high limbs of A
                                      needed for the recursive call*/
      if (MPFR_UNLIKELY(ahn > an))
        ahn = an;
      /* recursive call: compute Xh, the h-bit approximation, into the
         high limbs of x */
      mpfr_mpn_rec_sqrt (x + ln, h, a + an - ahn, ahn * GMP_NUMB_BITS, as);
      /* the h most significant bits of X are set, X has ceil(h/GMP_NUMB_BITS)
         limbs, the low (-h) % GMP_NUMB_BITS bits are zero */

      /* compared to Algorithm 3.9 of [1], we have {x+ln,xn} = X_h */

      MPFR_TMP_MARK (marker);
      /* first step: square X in r, result is exact */
      un = xn + (tn - th);
      /* We use the same temporary buffer to store r and u: r needs 2*xn
         limbs where u needs xn+(tn-th) limbs. Since tn can store at least
         2h bits, and th at most h bits, then tn-th can store at least h bits,
         thus tn - th >= xn, and reserving the space for u is enough. */
      MPFR_ASSERTD(2 * xn <= un);
      u = r = MPFR_TMP_LIMBS_ALLOC (un);
      if (2 * h <= GMP_NUMB_BITS) /* xn=rn=1, and since p <= 2h-3, n=1,
                                     thus ln = 0 */
        {
          MPFR_ASSERTD(ln == 0);
          cy = x[0] >> (GMP_NUMB_BITS >> 1);
          r ++;
          r[0] = cy * cy;
        }
      else if (xn == 1) /* xn=1, rn=2 */
        umul_ppmm(r[1], r[0], x[ln], x[ln]);
      else
        {
          mpn_mul_n (r, x + ln, x + ln, xn);
          /* we have {r, 2*xn} = X_h^2 */
          if (rn < 2 * xn)
            r ++;
        }
      /* now the 2h most significant bits of {r, rn} contains X^2, r has rn
         limbs, and the low (-2h) % GMP_NUMB_BITS bits are zero */

      /* Second step: s <- A * (r^2), and truncate the low ap bits,
         i.e., at weight 2^{-2h} (s is aligned to the low significant bits)
       */
      sn = an + rn;
      s = MPFR_TMP_LIMBS_ALLOC (sn);
      if (rn == 1) /* rn=1 implies n=1, since rn*GMP_NUMB_BITS >= 2h,
                           and 2h >= p+3 */
        {
          /* necessarily p <= GMP_NUMB_BITS-3: we can ignore the two low
             bits from A */
          /* since n=1, and we ensured an <= n, we also have an=1 */
          MPFR_ASSERTD(an == 1);
          umul_ppmm (s[1], s[0], r[0], a[0]);
        }
      else
        {
          /* we have p <= n * GMP_NUMB_BITS
             2h <= rn * GMP_NUMB_BITS with p+3 <= 2h <= p+4
             thus n <= rn <= n + 1 */
          MPFR_ASSERTD(rn <= n + 1);
          /* since we ensured an <= n, we have an <= rn */
          MPFR_ASSERTD(an <= rn);
          mpn_mul (s, r, rn, a, an);
          /* s should be near B^sn/2^(1+as), thus s[sn-1] is either
             100000... or 011111... if as=0, or
             010000... or 001111... if as=1.
             We ignore the bits of s after the first 2h+1+as ones.
             We have {s, rn+an} = A*X_h^2/2 if as=0, A*X_h^2/4 if as=1. */
        }

      /* We ignore the bits of s after the first 2h+1+as ones: s has rn + an
         limbs, where rn = LIMBS(2h), an=LIMBS(a), and tn = LIMBS(2h+1+as). */
      t = s + sn - tn; /* pointer to low limb of the high part of t */
      /* the upper h-3 bits of 1-t should be zero,
         where 1 corresponds to the most significant bit of t[tn-1] if as=0,
         and to the 2nd most significant bit of t[tn-1] if as=1 */

      /* compute t <- 1 - t, which is B^tn - {t, tn+1},
         with rounding toward -Inf, i.e., rounding the input t toward +Inf.
         We could only modify the low tn - th limbs from t, but it gives only
         a small speedup, and would make the code more complex.
      */
      neg = t[tn - 1] & (MPFR_LIMB_HIGHBIT >> as);
      if (neg == 0) /* Ax^2 < 1: we have t = th + eps, where 0 <= eps < ulp(th)
                       is the part truncated above, thus 1 - t rounded to -Inf
                       is 1 - th - ulp(th) */
        {
          /* since the 1+as most significant bits of t are zero, set them
             to 1 before the one-complement */
          t[tn - 1] |= MPFR_LIMB_HIGHBIT | (MPFR_LIMB_HIGHBIT >> as);
          MPFR_COM_N (t, t, tn);
          /* we should add 1 here to get 1-th complement, and subtract 1 for
             -ulp(th), thus we do nothing */
        }
      else /* negative case: we want 1 - t rounded toward -Inf, i.e.,
              th + eps rounded toward +Inf, which is th + ulp(th):
              we discard the bit corresponding to 1,
              and we add 1 to the least significant bit of t */
        {
          t[tn - 1] ^= neg;
          mpn_add_1 (t, t, tn, 1);
        }
      tn -= th; /* we know at least th = floor((h+1+as-3)/GMP_NUMB_LIMBS) of
                   the high limbs of {t, tn} are zero */

      /* tn = rn - th, where rn * GMP_NUMB_BITS >= 2*h and
         th * GMP_NUMB_BITS <= h+1+as-3, thus tn > 0 */
      MPFR_ASSERTD(tn > 0);

      /* u <- x * t, where {t, tn} contains at least h+3 bits,
         and {x, xn} contains h bits, thus tn >= xn */
      MPFR_ASSERTD(tn >= xn);
      if (tn == 1) /* necessarily xn=1 */
        umul_ppmm (u[1], u[0], t[0], x[ln]);
      else
        mpn_mul (u, t, tn, x + ln, xn);

      /* we have {u, tn+xn} = T_l X_h/2 if as=0, T_l X_h/4 if as=1 */

      /* we have already discarded the upper th high limbs of t, thus we only
         have to consider the upper n - th limbs of u */
      un = n - th; /* un cannot be zero, since p <= n*GMP_NUMB_BITS,
                      h = ceil((p+3)/2) <= (p+4)/2,
                      th*GMP_NUMB_BITS <= h-1 <= p/2+1,
                      thus (n-th)*GMP_NUMB_BITS >= p/2-1.
                   */
      MPFR_ASSERTD(un > 0);
      u += (tn + xn) - un; /* xn + tn - un = xn + (original_tn - th) - (n - th)
                                           = xn + original_tn - n
                              = LIMBS(h) + LIMBS(2h+1+as) - LIMBS(p) > 0
                              since 2h >= p+3 */
      MPFR_ASSERTD(tn + xn > un); /* will allow to access u[-1] below */

      /* In case as=0, u contains |x*(1-Ax^2)/2|, which is exactly what we
         need to add or subtract.
         In case as=1, u contains |x*(1-Ax^2)/4|, thus we need to multiply
         u by 2. */

      if (as == 1)
        /* shift on un+1 limbs to get most significant bit of u[-1] into
           least significant bit of u[0] */
        mpn_lshift (u - 1, u - 1, un + 1, 1);

      /* now {u,un} represents U / 2 from Algorithm 3.9 */

      pl = n * GMP_NUMB_BITS - p;       /* low bits from x */
      /* We want that the low pl bits are zero after rounding to nearest,
         thus we round u to nearest at bit pl-1 of u[0] */
      if (pl > 0)
        {
          cu = mpn_add_1 (u, u, un, u[0] & (MPFR_LIMB_ONE << (pl - 1)));
          /* mask bits 0..pl-1 of u[0] */
          u[0] &= ~MPFR_LIMB_MASK(pl);
        }
      else /* round bit is in u[-1] */
        cu = mpn_add_1 (u, u, un, u[-1] >> (GMP_NUMB_BITS - 1));
      /* cu = 0 is guaranteed by the error analysis above (upper limb of
         {t,tn} below 0.501 * 2^GMP_NUMB_BITS) */
      MPFR_ASSERTN(cu == 0);

      /* We already have filled {x + ln, xn = n - ln}, and we want to add or
         subtract {u, un} at position x.
         un = n - th, where th contains <= h+1+as-3<=h-1 bits
         ln = n - xn, where xn contains >= h bits
         thus un > ln.
         Warning: ln might be zero.
      */
      MPFR_ASSERTD(un > ln);
      /* we can have un = ln + 2, for example with GMP_NUMB_BITS=32 and
         p=62, as=0, then h=33, n=2, th=0, xn=2, thus un=2 and ln=0. */
      MPFR_ASSERTD(un == ln + 1 || un == ln + 2);
      /* the high un-ln limbs of u will overlap the low part of {x+ln,xn},
         we need to add or subtract the overlapping part {u + ln, un - ln} */
      /* Warning! th may be 0, in which case the mpn_add_1 and mpn_sub_1
         below (with size = th) mustn't be used. */
      if (neg == 0)
        {
          if (ln > 0)
            MPN_COPY (x, u, ln);
          cy = mpn_add (x + ln, x + ln, xn, u + ln, un - ln);
          /* cy is the carry at x + (ln + xn) = x + n */
        }
      else /* negative case */
        {
          /* subtract {u+ln, un-ln} from {x+ln,un} */
          cy = mpn_sub (x + ln, x + ln, xn, u + ln, un - ln);
          /* cy is the borrow at x + (ln + xn) = x + n */

          /* cy cannot be non-zero, since the most significant bit of Xh is 1,
             and the correction is bounded by 2^{-h+3} */
          MPFR_ASSERTD(cy == 0);
          if (ln > 0)
            {
              MPFR_COM_N (x, u, ln);
              /* we must add one for the 2-complement ... */
              cy = mpn_add_1 (x, x, n, MPFR_LIMB_ONE);
              /* ... and subtract 1 at x[ln], where n = ln + xn */
              cy -= mpn_sub_1 (x + ln, x + ln, xn, MPFR_LIMB_ONE);
            }
        }

      /* cy can be 1 when A=1, i.e., {a, n} = B^n. In that case we should
         have X = B^n, and setting X to 1-2^{-p} satisfies the error bound
         of 1 ulp. */
      if (MPFR_UNLIKELY(cy != 0))
        {
          cy -= mpn_sub_1 (x, x, n, MPFR_LIMB_ONE << pl);
          MPFR_ASSERTD(cy == 0);
        }

      MPFR_TMP_FREE (marker);
    }
}
4384a238c70SJohn Marino 
4394a238c70SJohn Marino int
mpfr_rec_sqrt (mpfr_ptr r, mpfr_srcptr u, mpfr_rnd_t rnd_mode)
{
  /* Set r to 1/sqrt(u), rounded in direction rnd_mode.
     Returns the ternary inexact value, as filtered through
     mpfr_check_range (which handles exponent-range overflow/underflow
     of the computed result). */
  mpfr_prec_t rp, up, wp;  /* output, input and working precisions */
  mp_size_t rn, wn;        /* limb counts for output and working buffers */
  int s, cy, inex;
  mpfr_limb_ptr x;         /* working approximation of the result */
  MPFR_TMP_DECL(marker);

  MPFR_LOG_FUNC
    (("x[%Pu]=%.*Rg rnd=%d", mpfr_get_prec (u), mpfr_log_prec, u, rnd_mode),
     ("y[%Pu]=%.*Rg inexact=%d", mpfr_get_prec (r), mpfr_log_prec, r, inex));

  /* special values */
  if (MPFR_UNLIKELY(MPFR_IS_SINGULAR(u)))
    {
      if (MPFR_IS_NAN(u))
        {
          MPFR_SET_NAN(r);
          MPFR_RET_NAN;
        }
      else if (MPFR_IS_ZERO(u)) /* 1/sqrt(+0) = 1/sqrt(-0) = +Inf */
        {
          /* 0+ or 0- */
          MPFR_SET_INF(r);
          MPFR_SET_POS(r);
          mpfr_set_divby0 ();
          MPFR_RET(0); /* Inf is exact */
        }
      else
        {
          MPFR_ASSERTD(MPFR_IS_INF(u));
          /* 1/sqrt(-Inf) = NAN */
          if (MPFR_IS_NEG(u))
            {
              MPFR_SET_NAN(r);
              MPFR_RET_NAN;
            }
          /* 1/sqrt(+Inf) = +0 */
          MPFR_SET_POS(r);
          MPFR_SET_ZERO(r);
          MPFR_RET(0);
        }
    }

  /* if u < 0, 1/sqrt(u) is NaN */
  if (MPFR_UNLIKELY(MPFR_IS_NEG(u)))
    {
      MPFR_SET_NAN(r);
      MPFR_RET_NAN;
    }

  /* now u > 0, hence the result is positive */
  MPFR_SET_POS(r);

  rp = MPFR_PREC(r); /* output precision */
  up = MPFR_PREC(u); /* input precision */

  /* Let u = U*2^e, where e = EXP(u), and 1/2 <= U < 1.
     If e is even, we compute an approximation of X of (4U)^{-1/2},
     and the result is X*2^(-(e-2)/2) [case s=1].
     If e is odd, we compute an approximation of X of (2U)^{-1/2},
     and the result is X*2^(-(e-1)/2) [case s=0]. */

  /* parity of the exponent of u */
  s = 1 - ((mpfr_uexp_t) MPFR_GET_EXP (u) & 1);

  rn = LIMB_SIZE(rp);

  /* Initial working precision is rp plus 11 guard bits; for the first
     iteration, if rp + 11 fits into rn limbs, we round up to a full limb
     to maximize the chance of correct rounding, while avoiding the
     allocation of extra space.
     (A single assignment here: the earlier duplicate "wp = rp + 11;" was
     a dead store and has been removed.) */
  wp = rp + 11;
  if (wp < rn * GMP_NUMB_BITS)
    wp = rn * GMP_NUMB_BITS;
  /* Ziv-style loop: compute a wp-bit approximation; if correct rounding
     to rp bits cannot be guaranteed, retry with one more limb of working
     precision. */
  for (;;)
    {
      MPFR_TMP_MARK (marker);
      wn = LIMB_SIZE(wp);
      if (r == u || wn > rn) /* out of place, i.e., we cannot write to r */
        x = MPFR_TMP_LIMBS_ALLOC (wn);
      else
        x = MPFR_MANT(r);
      mpfr_mpn_rec_sqrt (x, wp, MPFR_MANT(u), up, s);
      /* If the input was not truncated, the error is at most one ulp;
         if the input was truncated, the error is at most two ulps
         (see algorithms.tex). */
      if (MPFR_LIKELY (mpfr_round_p (x, wn, wp - (wp < up),
                                     rp + (rnd_mode == MPFR_RNDN))))
        break;

      /* We detect only now the exact case where u=2^(2e), to avoid
         slowing down the average case. This can happen only when the
         mantissa is exactly 1/2 and the exponent is odd. */
      if (s == 0 && mpfr_cmp_ui_2exp (u, 1, MPFR_EXP(u) - 1) == 0)
        {
          mpfr_prec_t pl = wn * GMP_NUMB_BITS - wp;

          /* we should have x=111...111; adding one ulp (of the wp-bit
             value) propagates a carry through the whole buffer, then the
             top limb is restored to represent exactly 1/2 */
          mpn_add_1 (x, x, wn, MPFR_LIMB_ONE << pl);
          x[wn - 1] = MPFR_LIMB_HIGHBIT;
          s += 2; /* compensates the halved mantissa in the exponent below */
          break; /* go through */
        }
      MPFR_TMP_FREE(marker);

      wp += GMP_NUMB_BITS;
    }
  /* Round the wp-bit approximation x to rp bits into r's mantissa; cy is
     the carry out of rounding (the rounded mantissa overflowed to 1). */
  cy = mpfr_round_raw (MPFR_MANT(r), x, wp, 0, rp, rnd_mode, &inex);
  MPFR_EXP(r) = - (MPFR_EXP(u) - 1 - s) / 2;
  if (MPFR_UNLIKELY(cy != 0))
    {
      /* rounding produced a power of two: bump the exponent and set the
         mantissa to 1/2 (top bit only) */
      MPFR_EXP(r) ++;
      MPFR_MANT(r)[rn - 1] = MPFR_LIMB_HIGHBIT;
    }
  MPFR_TMP_FREE(marker);
  return mpfr_check_range (r, inex, rnd_mode);
}
557