/* __gmpfr_invert_limb -- implement GMP's invert_limb (which is not in GMP API)

Copyright 2016-2023 Free Software Foundation, Inc.
Contributed by the AriC and Caramba projects, INRIA.

This file is part of the GNU MPFR Library.

The GNU MPFR Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at your
option) any later version.

The GNU MPFR Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MPFR Library; see the file COPYING.LESSER.  If not, see
https://www.gnu.org/licenses/ or write to the Free Software Foundation, Inc.,
51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */

#define MPFR_NEED_LONGLONG_H
#include "mpfr-impl.h"

/* for now, we only provide __gmpfr_invert_limb for 64-bit and 32-bit limbs */
#if GMP_NUMB_BITS == 64

/* for 256 <= d9 < 512, invert_limb_table[d9-256] = floor((2^19-3*2^8)/d9) */
static const unsigned short invert_limb_table[256] =
  { 2045, 2037, 2029, 2021, 2013, 2005, 1998, 1990,
    1983, 1975, 1968, 1960, 1953, 1946, 1938, 1931,
    1924, 1917, 1910, 1903, 1896, 1889, 1883, 1876,
    1869, 1863, 1856, 1849, 1843, 1836, 1830, 1824,
    1817, 1811, 1805, 1799, 1792, 1786, 1780, 1774,
    1768, 1762, 1756, 1750, 1745, 1739, 1733, 1727,
    1722, 1716, 1710, 1705, 1699, 1694, 1688, 1683,
    1677, 1672, 1667, 1661, 1656, 1651, 1646, 1641,
    1636, 1630, 1625, 1620, 1615, 1610, 1605, 1600,
    1596, 1591, 1586, 1581, 1576, 1572, 1567, 1562,
    1558, 1553, 1548, 1544, 1539, 1535, 1530, 1526,
    1521, 1517, 1513, 1508, 1504, 1500, 1495, 1491,
    1487, 1483, 1478, 1474, 1470, 1466, 1462, 1458,
    1454, 1450, 1446, 1442, 1438, 1434, 1430, 1426,
    1422, 1418, 1414, 1411, 1407, 1403, 1399, 1396,
    1392, 1388, 1384, 1381, 1377, 1374, 1370, 1366,
    1363, 1359, 1356, 1352, 1349, 1345, 1342, 1338,
    1335, 1332, 1328, 1325, 1322, 1318, 1315, 1312,
    1308, 1305, 1302, 1299, 1295, 1292, 1289, 1286,
    1283, 1280, 1276, 1273, 1270, 1267, 1264, 1261,
    1258, 1255, 1252, 1249, 1246, 1243, 1240, 1237,
    1234, 1231, 1228, 1226, 1223, 1220, 1217, 1214,
    1211, 1209, 1206, 1203, 1200, 1197, 1195, 1192,
    1189, 1187, 1184, 1181, 1179, 1176, 1173, 1171,
    1168, 1165, 1163, 1160, 1158, 1155, 1153, 1150,
    1148, 1145, 1143, 1140, 1138, 1135, 1133, 1130,
    1128, 1125, 1123, 1121, 1118, 1116, 1113, 1111,
    1109, 1106, 1104, 1102, 1099, 1097, 1095, 1092,
    1090, 1088, 1086, 1083, 1081, 1079, 1077, 1074,
    1072, 1070, 1068, 1066, 1064, 1061, 1059, 1057,
    1055, 1053, 1051, 1049, 1047, 1044, 1042, 1040,
    1038, 1036, 1034, 1032, 1030, 1028, 1026, 1024 };
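
/* Illustrative sketch (not compiled, not part of MPFR): the table above can
   be regenerated from the formula floor((2^19-3*2^8)/d9) given in the
   comment; the helper name below is hypothetical.  */
#if 0
static void
regen_invert_limb_table (unsigned short t[256])
{
  unsigned long d9;
  for (d9 = 256; d9 < 512; d9++)
    t[d9 - 256] = (unsigned short) (((1UL << 19) - (3UL << 8)) / d9);
}
#endif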

/* for 256 <= d9 < 512, invert_limb_table2[d9-256] = floor((2^19-3*2^8)/d9)^2
 *
 * Note: This table requires 4182025 to be representable in unsigned int,
 * thus disallows 16-bit int, for instance. However, this code is under
 * "#if GMP_NUMB_BITS == 64", and a system whose int is smaller than 32 bits
 * should rather use GMP_NUMB_BITS == 32 (or 16 if supported).
 * This constraint is checked with MPFR_STAT_STATIC_ASSERT below.
 */
static const unsigned int invert_limb_table2[256] =
  { 4182025, 4149369, 4116841, 4084441, 4052169, 4020025, 3992004, 3960100, 3932289,
    3900625, 3873024, 3841600, 3814209, 3786916, 3755844, 3728761, 3701776, 3674889,
    3648100, 3621409, 3594816, 3568321, 3545689, 3519376, 3493161, 3470769, 3444736,
    3418801, 3396649, 3370896, 3348900, 3326976, 3301489, 3279721, 3258025, 3236401,
    3211264, 3189796, 3168400, 3147076, 3125824, 3104644, 3083536, 3062500, 3045025,
    3024121, 3003289, 2982529, 2965284, 2944656, 2924100, 2907025, 2886601, 2869636,
    2849344, 2832489, 2812329, 2795584, 2778889, 2758921, 2742336, 2725801, 2709316,
    2692881, 2676496, 2656900, 2640625, 2624400, 2608225, 2592100, 2576025, 2560000,
    2547216, 2531281, 2515396, 2499561, 2483776, 2471184, 2455489, 2439844, 2427364,
    2411809, 2396304, 2383936, 2368521, 2356225, 2340900, 2328676, 2313441, 2301289,
    2289169, 2274064, 2262016, 2250000, 2235025, 2223081, 2211169, 2199289, 2184484,
    2172676, 2160900, 2149156, 2137444, 2125764, 2114116, 2102500, 2090916, 2079364,
    2067844, 2056356, 2044900, 2033476, 2022084, 2010724, 1999396, 1990921, 1979649,
    1968409, 1957201, 1948816, 1937664, 1926544, 1915456, 1907161, 1896129, 1887876,
    1876900, 1865956, 1857769, 1846881, 1838736, 1827904, 1819801, 1809025, 1800964,
    1790244, 1782225, 1774224, 1763584, 1755625, 1747684, 1737124, 1729225, 1721344,
    1710864, 1703025, 1695204, 1687401, 1677025, 1669264, 1661521, 1653796, 1646089,
    1638400, 1628176, 1620529, 1612900, 1605289, 1597696, 1590121, 1582564, 1575025,
    1567504, 1560001, 1552516, 1545049, 1537600, 1530169, 1522756, 1515361, 1507984,
    1503076, 1495729, 1488400, 1481089, 1473796, 1466521, 1461681, 1454436, 1447209,
    1440000, 1432809, 1428025, 1420864, 1413721, 1408969, 1401856, 1394761, 1390041,
    1382976, 1375929, 1371241, 1364224, 1357225, 1352569, 1345600, 1340964, 1334025,
    1329409, 1322500, 1317904, 1311025, 1306449, 1299600, 1295044, 1288225, 1283689,
    1276900, 1272384, 1265625, 1261129, 1256641, 1249924, 1245456, 1238769, 1234321,
    1229881, 1223236, 1218816, 1214404, 1207801, 1203409, 1199025, 1192464, 1188100,
    1183744, 1179396, 1172889, 1168561, 1164241, 1159929, 1153476, 1149184, 1144900,
    1140624, 1136356, 1132096, 1125721, 1121481, 1117249, 1113025, 1108809, 1104601,
    1100401, 1096209, 1089936, 1085764, 1081600, 1077444, 1073296, 1069156, 1065024,
    1060900, 1056784, 1052676, 1048576 };
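
/* Illustrative sketch (not compiled, not part of MPFR): each entry of
   invert_limb_table2 is exactly the square of the corresponding entry of
   invert_limb_table, the largest being 2045^2 = 4182025, whence the
   requirement on unsigned int noted above; the helper name is hypothetical. */
#if 0
static void
check_invert_limb_tables (void)
{
  unsigned int i;
  for (i = 0; i < 256; i++)
    MPFR_ASSERTD ((unsigned int) invert_limb_table[i] * invert_limb_table[i]
                  == invert_limb_table2[i]);
}
#endif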

/* Implements Algorithm 2 from "Improved Division by Invariant Integers",
   Niels Möller and Torbjörn Granlund, IEEE Transactions on Computers,
   volume 60, number 2, pages 165-175, 2011. */
#define __gmpfr_invert_limb(r, d)                                       \
    do {                                                                \
      mp_limb_t _d, _d0, _i, _d40, _d63, _v0, _v1, _v2, _e, _v3, _h, _l; \
      MPFR_STAT_STATIC_ASSERT (4182025 <= UINT_MAX);                    \
      _d = (d);                                                         \
      _i = (_d >> 55) - 256; /* i = d9 - 256 */                         \
      /* the shift by 11 is for free since it is hidden in the */       \
      /* invert_limb_table2[_i] * _d40 multiplication latency */        \
      _v0 = (mp_limb_t) invert_limb_table[_i] << 11;                    \
      _d40 = (_d >> 24) + 1;                                            \
      _v1 = _v0 - ((invert_limb_table2[_i] * _d40) >> 40) - 1;          \
      _v2 = (_v1 << 13) +                                               \
        ((_v1 * ((MPFR_LIMB_ONE << 60) - _v1 * _d40)) >> 47);           \
      _d0 = _d & 1;                                                     \
      _d63 = ((_d - 1) >> 1) + 1;                                       \
      _e = - _v2 * _d63 + ((_v2 & -_d0) >> 1);                          \
      umul_hi (_h, _v2, _e);                                            \
      _v3 = (_v2 << 31) + (_h >> 1);                                    \
      umul_ppmm (_h, _l, _v3, _d);                                      \
      /* v3 is too small iff (h+d)*2^64+l+d < 2^128 */                  \
      add_ssaaaa (_h, _l, _h, _l, _d, _d);                              \
      MPFR_ASSERTD (_h == MPFR_LIMB_ZERO || -_h == MPFR_LIMB_ONE);      \
      (r) = _v3 - _h;                                                   \
    } while (0)
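
/* Illustrative sketch (not compiled, not part of MPFR): on a compiler that
   provides unsigned __int128, the result of __gmpfr_invert_limb can be
   checked against its defining formula r = floor((2^128-1)/d) - 2^64, valid
   for any normalized d (most significant bit set); the helper name is
   hypothetical.  */
#if 0
static void
check_invert_limb (mp_limb_t d)
{
  mp_limb_t r;
  unsigned __int128 ref;

  MPFR_ASSERTD ((d >> (GMP_NUMB_BITS - 1)) == 1); /* d must be normalized */
  __gmpfr_invert_limb (r, d);
  ref = ~(unsigned __int128) 0 / d - ((unsigned __int128) 1 << 64);
  MPFR_ASSERTD (r == (mp_limb_t) ref);
}
#endif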

/* Same algorithm, but stop at v3 and return it (skipping the final
   adjustment); v3 satisfies v3 <= invert_limb (d) <= v3 + 1. */
#define __gmpfr_invert_limb_approx(r, d)                                \
    do {                                                                \
      mp_limb_t _d, _d0, _i, _d40, _d63, _v0, _v1, _v2, _e, _h;         \
      MPFR_STAT_STATIC_ASSERT (4182025 <= UINT_MAX);                    \
      _d = (d);                                                         \
      _i = (_d >> 55) - 256; /* i = d9 - 256 */                         \
      _v0 = (mp_limb_t) invert_limb_table[_i] << 11;                    \
      _d40 = (_d >> 24) + 1;                                            \
      _v1 = _v0 - ((invert_limb_table2[_i] * _d40) >> 40) - 1;          \
      _v2 = (_v1 << 13) +                                               \
        ((_v1 * ((MPFR_LIMB_ONE << 60) - _v1 * _d40)) >> 47);           \
      _d0 = _d & 1;                                                     \
      _d63 = ((_d - 1) >> 1) + 1;                                       \
      _e = - _v2 * _d63 + ((_v2 & -_d0) >> 1);                          \
      umul_hi (_h, _v2, _e);                                            \
      (r) = (_v2 << 31) + (_h >> 1);                                    \
    } while (0)
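
/* Illustrative sketch (not compiled, not part of MPFR): the approximate
   variant may be one below the exact reciprocal, i.e. it returns r with
   r <= invert_limb (d) <= r + 1, which can be checked as follows
   (helper name hypothetical).  */
#if 0
static void
check_invert_limb_approx (mp_limb_t d)
{
  mp_limb_t r, rexact;

  __gmpfr_invert_limb_approx (r, d);
  __gmpfr_invert_limb (rexact, d);
  MPFR_ASSERTD (r == rexact || r + 1 == rexact);
}
#endif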

#elif GMP_NUMB_BITS == 32

/* for 512 <= d10 < 1024, invert_limb_table[d10-512] = floor((2^24-2^14+2^9)/d10) */
static const unsigned short invert_limb_table[512] =
  { 32737, 32673, 32609, 32546, 32483, 32420, 32357, 32295, 32233, 32171,
    32109, 32048, 31987, 31926, 31865, 31805, 31744, 31684, 31625, 31565,
    31506, 31447, 31388, 31329, 31271, 31212, 31154, 31097, 31039, 30982,
    30924, 30868, 30811, 30754, 30698, 30642, 30586, 30530, 30475, 30419,
    30364, 30309, 30255, 30200, 30146, 30092, 30038, 29984, 29930, 29877,
    29824, 29771, 29718, 29666, 29613, 29561, 29509, 29457, 29405, 29354,
    29303, 29251, 29200, 29150, 29099, 29049, 28998, 28948, 28898, 28849,
    28799, 28750, 28700, 28651, 28602, 28554, 28505, 28457, 28409, 28360,
    28313, 28265, 28217, 28170, 28123, 28075, 28029, 27982, 27935, 27889,
    27842, 27796, 27750, 27704, 27658, 27613, 27568, 27522, 27477, 27432,
    27387, 27343, 27298, 27254, 27209, 27165, 27121, 27078, 27034, 26990,
    26947, 26904, 26861, 26818, 26775, 26732, 26690, 26647, 26605, 26563,
    26521, 26479, 26437, 26395, 26354, 26312, 26271, 26230, 26189, 26148,
    26108, 26067, 26026, 25986, 25946, 25906, 25866, 25826, 25786, 25747,
    25707, 25668, 25628, 25589, 25550, 25511, 25473, 25434, 25395, 25357,
    25319, 25281, 25242, 25205, 25167, 25129, 25091, 25054, 25016, 24979,
    24942, 24905, 24868, 24831, 24794, 24758, 24721, 24685, 24649, 24612,
    24576, 24540, 24504, 24469, 24433, 24397, 24362, 24327, 24291, 24256,
    24221, 24186, 24151, 24117, 24082, 24047, 24013, 23979, 23944, 23910,
    23876, 23842, 23808, 23774, 23741, 23707, 23674, 23640, 23607, 23574,
    23541, 23508, 23475, 23442, 23409, 23377, 23344, 23312, 23279, 23247,
    23215, 23183, 23151, 23119, 23087, 23055, 23023, 22992, 22960, 22929,
    22898, 22866, 22835, 22804, 22773, 22742, 22711, 22681, 22650, 22619,
    22589, 22559, 22528, 22498, 22468, 22438, 22408, 22378, 22348, 22318,
    22289, 22259, 22229, 22200, 22171, 22141, 22112, 22083, 22054, 22025,
    21996, 21967, 21938, 21910, 21881, 21853, 21824, 21796, 21767, 21739,
    21711, 21683, 21655, 21627, 21599, 21571, 21544, 21516, 21488, 21461,
    21433, 21406, 21379, 21352, 21324, 21297, 21270, 21243, 21216, 21190,
    21163, 21136, 21110, 21083, 21056, 21030, 21004, 20977, 20951, 20925,
    20899, 20873, 20847, 20821, 20795, 20769, 20744, 20718, 20693, 20667,
    20642, 20616, 20591, 20566, 20540, 20515, 20490, 20465, 20440, 20415,
    20390, 20366, 20341, 20316, 20292, 20267, 20243, 20218, 20194, 20170,
    20145, 20121, 20097, 20073, 20049, 20025, 20001, 19977, 19953, 19930,
    19906, 19882, 19859, 19835, 19812, 19789, 19765, 19742, 19719, 19696,
    19672, 19649, 19626, 19603, 19581, 19558, 19535, 19512, 19489, 19467,
    19444, 19422, 19399, 19377, 19354, 19332, 19310, 19288, 19265, 19243,
    19221, 19199, 19177, 19155, 19133, 19112, 19090, 19068, 19046, 19025,
    19003, 18982, 18960, 18939, 18917, 18896, 18875, 18854, 18832, 18811,
    18790, 18769, 18748, 18727, 18706, 18686, 18665, 18644, 18623, 18603,
    18582, 18561, 18541, 18520, 18500, 18479, 18459, 18439, 18419, 18398,
    18378, 18358, 18338, 18318, 18298, 18278, 18258, 18238, 18218, 18199,
    18179, 18159, 18139, 18120, 18100, 18081, 18061, 18042, 18022, 18003,
    17984, 17964, 17945, 17926, 17907, 17888, 17869, 17850, 17831, 17812,
    17793, 17774, 17755, 17736, 17718, 17699, 17680, 17662, 17643, 17624,
    17606, 17587, 17569, 17551, 17532, 17514, 17496, 17477, 17459, 17441,
    17423, 17405, 17387, 17369, 17351, 17333, 17315, 17297, 17279, 17261,
    17244, 17226, 17208, 17191, 17173, 17155, 17138, 17120, 17103, 17085,
    17068, 17051, 17033, 17016, 16999, 16982, 16964, 16947, 16930, 16913,
    16896, 16879, 16862, 16845, 16828, 16811, 16794, 16778, 16761, 16744,
    16727, 16711, 16694, 16677, 16661, 16644, 16628, 16611, 16595, 16578,
    16562, 16546, 16529, 16513, 16497, 16481, 16464, 16448, 16432, 16416,
    16400, 16384 };
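
/* Illustrative sketch (not compiled, not part of MPFR): the table above can
   be regenerated from the formula floor((2^24-2^14+2^9)/d10) given in the
   comment; the helper name below is hypothetical.  */
#if 0
static void
regen_invert_limb_table32 (unsigned short t[512])
{
  unsigned long d10;
  for (d10 = 512; d10 < 1024; d10++)
    t[d10 - 512] = (unsigned short)
      (((1UL << 24) - (1UL << 14) + (1UL << 9)) / d10);
}
#endif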

/* Implements Algorithm 3 from "Improved Division by Invariant Integers",
   Niels Möller and Torbjörn Granlund, IEEE Transactions on Computers,
   volume 60, number 2, pages 165-175, 2011. */
#define __gmpfr_invert_limb(r, d)                                       \
    do {                                                                \
      mp_limb_t _d, _d0, _d10, _d21, _d31, _v0, _v1, _v2, _e, _h, _l;   \
      _d = (d);                                                         \
      _d0 = _d & 1;                                                     \
      _d10 = _d >> 22;                                                  \
      _d21 = (_d >> 11) + 1;                                            \
      _d31 = ((_d - 1) >> 1) + 1;                                       \
      _v0 = invert_limb_table[_d10 - 512];                              \
      umul_ppmm (_h, _l, _v0 * _v0, _d21);                              \
      _v1 = (_v0 << 4) - _h - 1;                                        \
      _e = - _v1 * _d31 + ((_v1 & - _d0) >> 1);                         \
      umul_ppmm (_h, _l, _v1, _e);                                      \
      _v2 = (_v1 << 15) + (_h >> 1);                                    \
      umul_ppmm (_h, _l, _v2, _d);                                      \
      /* v2 is too small iff (h+d)*2^32+l+d < 2^64 */                   \
      add_ssaaaa (_h, _l, _h, _l, _d, _d);                              \
      MPFR_ASSERTD (_h == MPFR_LIMB_ZERO || -_h == MPFR_LIMB_ONE);      \
      (r) = _v2 - _h;                                                   \
    } while (0)
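
/* Illustrative sketch (not compiled, not part of MPFR): assuming a 64-bit
   unsigned long long, the result can be checked against the defining formula
   r = floor((2^64-1)/d) - 2^32, valid for any normalized 32-bit d (most
   significant bit set); the helper name is hypothetical.  */
#if 0
static void
check_invert_limb (mp_limb_t d)
{
  mp_limb_t r;
  unsigned long long ref;

  MPFR_ASSERTD ((d >> (GMP_NUMB_BITS - 1)) == 1); /* d must be normalized */
  __gmpfr_invert_limb (r, d);
  ref = ~0ULL / d - (1ULL << 32);
  MPFR_ASSERTD (r == (mp_limb_t) ref);
}
#endif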

#endif /* GMP_NUMB_BITS == 64 or 32 */