/* __gmpfr_invert_limb -- implement GMP's invert_limb (which is not in GMP API)

Copyright 2016-2023 Free Software Foundation, Inc.
Contributed by the AriC and Caramba projects, INRIA.

This file is part of the GNU MPFR Library.

The GNU MPFR Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at your
option) any later version.

The GNU MPFR Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MPFR Library; see the file COPYING.LESSER.  If not, see
https://www.gnu.org/licenses/ or write to the Free Software Foundation, Inc.,
51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */

#define MPFR_NEED_LONGLONG_H
#include "mpfr-impl.h"

/* __gmpfr_invert_limb is provided below for 64-bit and 32-bit limbs;
   __gmpfr_invert_limb_approx is only provided for 64-bit limbs */
#if GMP_NUMB_BITS == 64

/* for 256 <= d9 < 512, invert_limb_table[d9-256] = floor((2^19-3*2^8)/d9) */
static const unsigned short invert_limb_table[256] =
  { 2045, 2037, 2029, 2021, 2013, 2005, 1998, 1990,
    1983, 1975, 1968, 1960, 1953, 1946, 1938, 1931,
    1924, 1917, 1910, 1903, 1896, 1889, 1883, 1876,
    1869, 1863, 1856, 1849, 1843, 1836, 1830, 1824,
    1817, 1811, 1805, 1799, 1792, 1786, 1780, 1774,
    1768, 1762, 1756, 1750, 1745, 1739, 1733, 1727,
    1722, 1716, 1710, 1705, 1699, 1694, 1688, 1683,
    1677, 1672, 1667, 1661, 1656, 1651, 1646, 1641,
    1636, 1630, 1625, 1620, 1615, 1610, 1605, 1600,
    1596, 1591, 1586, 1581, 1576, 1572, 1567, 1562,
    1558, 1553, 1548, 1544, 1539, 1535, 1530, 1526,
    1521, 1517, 1513, 1508, 1504, 1500, 1495, 1491,
    1487, 1483, 1478, 1474, 1470, 1466, 1462, 1458,
    1454, 1450, 1446, 1442, 1438, 1434, 1430, 1426,
    1422, 1418, 1414, 1411, 1407, 1403, 1399, 1396,
    1392, 1388, 1384, 1381, 1377, 1374, 1370, 1366,
    1363, 1359, 1356, 1352, 1349, 1345, 1342, 1338,
    1335, 1332, 1328, 1325, 1322, 1318, 1315, 1312,
    1308, 1305, 1302, 1299, 1295, 1292, 1289, 1286,
    1283, 1280, 1276, 1273, 1270, 1267, 1264, 1261,
    1258, 1255, 1252, 1249, 1246, 1243, 1240, 1237,
    1234, 1231, 1228, 1226, 1223, 1220, 1217, 1214,
    1211, 1209, 1206, 1203, 1200, 1197, 1195, 1192,
    1189, 1187, 1184, 1181, 1179, 1176, 1173, 1171,
    1168, 1165, 1163, 1160, 1158, 1155, 1153, 1150,
    1148, 1145, 1143, 1140, 1138, 1135, 1133, 1130,
    1128, 1125, 1123, 1121, 1118, 1116, 1113, 1111,
    1109, 1106, 1104, 1102, 1099, 1097, 1095, 1092,
    1090, 1088, 1086, 1083, 1081, 1079, 1077, 1074,
    1072, 1070, 1068, 1066, 1064, 1061, 1059, 1057,
    1055, 1053, 1051, 1049, 1047, 1044, 1042, 1040,
    1038, 1036, 1034, 1032, 1030, 1028, 1026, 1024 };
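
/* Illustrative check of the formula above (not used by the code):
   2^19 - 3*2^8 = 523520, so for d9 = 256 the entry is
   floor(523520/256) = 2045 (the first value above) and for d9 = 511 it is
   floor(523520/511) = 1024 (the last value above). */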

/* for 256 <= d9 < 512, invert_limb_table2[d9-256] = floor((2^19-3*2^8)/d9)^2
 *
 * Note: This table requires 4182025 to be representable in an unsigned int,
 * which rules out 16-bit int, for instance. However, this code is under
 * "#if GMP_NUMB_BITS == 64", and a system whose int is smaller than 32 bits
 * should rather use GMP_NUMB_BITS == 32 (or 16 if supported).
 * This constraint is checked with MPFR_STAT_STATIC_ASSERT below.
 */
static const unsigned int invert_limb_table2[256] =
  { 4182025, 4149369, 4116841, 4084441, 4052169, 4020025, 3992004, 3960100, 3932289,
    3900625, 3873024, 3841600, 3814209, 3786916, 3755844, 3728761, 3701776, 3674889,
    3648100, 3621409, 3594816, 3568321, 3545689, 3519376, 3493161, 3470769, 3444736,
    3418801, 3396649, 3370896, 3348900, 3326976, 3301489, 3279721, 3258025, 3236401,
    3211264, 3189796, 3168400, 3147076, 3125824, 3104644, 3083536, 3062500, 3045025,
    3024121, 3003289, 2982529, 2965284, 2944656, 2924100, 2907025, 2886601, 2869636,
    2849344, 2832489, 2812329, 2795584, 2778889, 2758921, 2742336, 2725801, 2709316,
    2692881, 2676496, 2656900, 2640625, 2624400, 2608225, 2592100, 2576025, 2560000,
    2547216, 2531281, 2515396, 2499561, 2483776, 2471184, 2455489, 2439844, 2427364,
    2411809, 2396304, 2383936, 2368521, 2356225, 2340900, 2328676, 2313441, 2301289,
    2289169, 2274064, 2262016, 2250000, 2235025, 2223081, 2211169, 2199289, 2184484,
    2172676, 2160900, 2149156, 2137444, 2125764, 2114116, 2102500, 2090916, 2079364,
    2067844, 2056356, 2044900, 2033476, 2022084, 2010724, 1999396, 1990921, 1979649,
    1968409, 1957201, 1948816, 1937664, 1926544, 1915456, 1907161, 1896129, 1887876,
    1876900, 1865956, 1857769, 1846881, 1838736, 1827904, 1819801, 1809025, 1800964,
    1790244, 1782225, 1774224, 1763584, 1755625, 1747684, 1737124, 1729225, 1721344,
    1710864, 1703025, 1695204, 1687401, 1677025, 1669264, 1661521, 1653796, 1646089,
    1638400, 1628176, 1620529, 1612900, 1605289, 1597696, 1590121, 1582564, 1575025,
    1567504, 1560001, 1552516, 1545049, 1537600, 1530169, 1522756, 1515361, 1507984,
    1503076, 1495729, 1488400, 1481089, 1473796, 1466521, 1461681, 1454436, 1447209,
    1440000, 1432809, 1428025, 1420864, 1413721, 1408969, 1401856, 1394761, 1390041,
    1382976, 1375929, 1371241, 1364224, 1357225, 1352569, 1345600, 1340964, 1334025,
    1329409, 1322500, 1317904, 1311025, 1306449, 1299600, 1295044, 1288225, 1283689,
    1276900, 1272384, 1265625, 1261129, 1256641, 1249924, 1245456, 1238769, 1234321,
    1229881, 1223236, 1218816, 1214404, 1207801, 1203409, 1199025, 1192464, 1188100,
    1183744, 1179396, 1172889, 1168561, 1164241, 1159929, 1153476, 1149184, 1144900,
    1140624, 1136356, 1132096, 1125721, 1121481, 1117249, 1113025, 1108809, 1104601,
    1100401, 1096209, 1089936, 1085764, 1081600, 1077444, 1073296, 1069156, 1065024,
    1060900, 1056784, 1052676, 1048576 };
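
/* Illustrative check (not used by the code): each entry is the square of the
   corresponding invert_limb_table entry, e.g. 2045^2 = 4182025 (the first
   value, which is also the constant checked by MPFR_STAT_STATIC_ASSERT below)
   and 1024^2 = 1048576 (the last value). */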

/* Implements Algorithm 2 from "Improved Division by Invariant Integers",
   Niels Möller and Torbjörn Granlund, IEEE Transactions on Computers,
   volume 60, number 2, pages 165-175, 2011. */
#define __gmpfr_invert_limb(r, d)                                       \
    do {                                                                \
      mp_limb_t _d, _d0, _i, _d40, _d63, _v0, _v1, _v2, _e, _v3, _h, _l; \
      MPFR_STAT_STATIC_ASSERT (4182025 <= UINT_MAX);                    \
      _d = (d);                                                         \
      _i = (_d >> 55) - 256; /* i = d9 - 256 */                         \
      /* the shift by 11 is for free since it is hidden in the */       \
      /* invert_limb_table2[_i] * _d40 multiplication latency */        \
      _v0 = (mp_limb_t) invert_limb_table[_i] << 11;                    \
      _d40 = (_d >> 24) + 1;                                            \
      _v1 = _v0 - ((invert_limb_table2[_i] * _d40) >> 40) - 1;          \
      _v2 = (_v1 << 13) +                                               \
        ((_v1 * ((MPFR_LIMB_ONE << 60) - _v1 * _d40)) >> 47);           \
      _d0 = _d & 1;                                                     \
      _d63 = ((_d - 1) >> 1) + 1;                                       \
      _e = - _v2 * _d63 + ((_v2 & -_d0) >> 1);                          \
      umul_hi (_h, _v2, _e);                                            \
      _v3 = (_v2 << 31) + (_h >> 1);                                    \
      umul_ppmm (_h, _l, _v3, _d);                                      \
      /* v3 is too small iff (h+d)*2^64+l+d < 2^128 */                  \
      add_ssaaaa (_h, _l, _h, _l, _d, _d);                              \
      MPFR_ASSERTD (_h == MPFR_LIMB_ZERO || -_h == MPFR_LIMB_ONE);      \
      (r) = _v3 - _h;                                                   \
    } while (0)
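
/* Usage sketch (illustrative only, not used by MPFR itself): for a limb d
   whose most significant bit is set, the macro computes the same value as
   GMP's invert_limb, namely r = floor((2^128 - 1)/d) - 2^64.  For example,
   assuming MPFR_LIMB_HIGHBIT from mpfr-impl.h:

     mp_limb_t d = MPFR_LIMB_HIGHBIT | 12345;   -- any d with bit 63 set
     mp_limb_t r;
     __gmpfr_invert_limb (r, d);
*/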

/* same algorithm, but returns the value v3, which is such that
   v3 <= invert_limb (d) <= v3 + 1 */
#define __gmpfr_invert_limb_approx(r, d)                                \
    do {                                                                \
      mp_limb_t _d, _d0, _i, _d40, _d63, _v0, _v1, _v2, _e, _h;         \
      MPFR_STAT_STATIC_ASSERT (4182025 <= UINT_MAX);                    \
      _d = (d);                                                         \
      _i = (_d >> 55) - 256; /* i = d9 - 256 */                         \
      _v0 = (mp_limb_t) invert_limb_table[_i] << 11;                    \
      _d40 = (_d >> 24) + 1;                                            \
      _v1 = _v0 - ((invert_limb_table2[_i] * _d40) >> 40) - 1;          \
      _v2 = (_v1 << 13) +                                               \
        ((_v1 * ((MPFR_LIMB_ONE << 60) - _v1 * _d40)) >> 47);           \
      _d0 = _d & 1;                                                     \
      _d63 = ((_d - 1) >> 1) + 1;                                       \
      _e = - _v2 * _d63 + ((_v2 & -_d0) >> 1);                          \
      umul_hi (_h, _v2, _e);                                            \
      (r) = (_v2 << 31) + (_h >> 1);                                    \
    } while (0)
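
/* Sketch (illustrative only, not used by MPFR itself): a caller needing the
   exact reciprocal can apply the same final correction as in
   __gmpfr_invert_limb above; given a normalized limb d (bit 63 set):

     mp_limb_t v3, h, l;
     __gmpfr_invert_limb_approx (v3, d);
     umul_ppmm (h, l, v3, d);
     add_ssaaaa (h, l, h, l, d, d);
     v3 -= h;   -- h is 0 or -1 here, so this adds 0 or 1 to v3
*/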

#elif GMP_NUMB_BITS == 32

/* for 512 <= d10 < 1024, invert_limb_table[d10-512] = floor((2^24-2^14+2^9)/d10) */
static const unsigned short invert_limb_table[512] =
  { 32737, 32673, 32609, 32546, 32483, 32420, 32357, 32295, 32233, 32171,
    32109, 32048, 31987, 31926, 31865, 31805, 31744, 31684, 31625, 31565,
    31506, 31447, 31388, 31329, 31271, 31212, 31154, 31097, 31039, 30982,
    30924, 30868, 30811, 30754, 30698, 30642, 30586, 30530, 30475, 30419,
    30364, 30309, 30255, 30200, 30146, 30092, 30038, 29984, 29930, 29877,
    29824, 29771, 29718, 29666, 29613, 29561, 29509, 29457, 29405, 29354,
    29303, 29251, 29200, 29150, 29099, 29049, 28998, 28948, 28898, 28849,
    28799, 28750, 28700, 28651, 28602, 28554, 28505, 28457, 28409, 28360,
    28313, 28265, 28217, 28170, 28123, 28075, 28029, 27982, 27935, 27889,
    27842, 27796, 27750, 27704, 27658, 27613, 27568, 27522, 27477, 27432,
    27387, 27343, 27298, 27254, 27209, 27165, 27121, 27078, 27034, 26990,
    26947, 26904, 26861, 26818, 26775, 26732, 26690, 26647, 26605, 26563,
    26521, 26479, 26437, 26395, 26354, 26312, 26271, 26230, 26189, 26148,
    26108, 26067, 26026, 25986, 25946, 25906, 25866, 25826, 25786, 25747,
    25707, 25668, 25628, 25589, 25550, 25511, 25473, 25434, 25395, 25357,
    25319, 25281, 25242, 25205, 25167, 25129, 25091, 25054, 25016, 24979,
    24942, 24905, 24868, 24831, 24794, 24758, 24721, 24685, 24649, 24612,
    24576, 24540, 24504, 24469, 24433, 24397, 24362, 24327, 24291, 24256,
    24221, 24186, 24151, 24117, 24082, 24047, 24013, 23979, 23944, 23910,
    23876, 23842, 23808, 23774, 23741, 23707, 23674, 23640, 23607, 23574,
    23541, 23508, 23475, 23442, 23409, 23377, 23344, 23312, 23279, 23247,
    23215, 23183, 23151, 23119, 23087, 23055, 23023, 22992, 22960, 22929,
    22898, 22866, 22835, 22804, 22773, 22742, 22711, 22681, 22650, 22619,
    22589, 22559, 22528, 22498, 22468, 22438, 22408, 22378, 22348, 22318,
    22289, 22259, 22229, 22200, 22171, 22141, 22112, 22083, 22054, 22025,
    21996, 21967, 21938, 21910, 21881, 21853, 21824, 21796, 21767, 21739,
    21711, 21683, 21655, 21627, 21599, 21571, 21544, 21516, 21488, 21461,
    21433, 21406, 21379, 21352, 21324, 21297, 21270, 21243, 21216, 21190,
    21163, 21136, 21110, 21083, 21056, 21030, 21004, 20977, 20951, 20925,
    20899, 20873, 20847, 20821, 20795, 20769, 20744, 20718, 20693, 20667,
    20642, 20616, 20591, 20566, 20540, 20515, 20490, 20465, 20440, 20415,
    20390, 20366, 20341, 20316, 20292, 20267, 20243, 20218, 20194, 20170,
    20145, 20121, 20097, 20073, 20049, 20025, 20001, 19977, 19953, 19930,
    19906, 19882, 19859, 19835, 19812, 19789, 19765, 19742, 19719, 19696,
    19672, 19649, 19626, 19603, 19581, 19558, 19535, 19512, 19489, 19467,
    19444, 19422, 19399, 19377, 19354, 19332, 19310, 19288, 19265, 19243,
    19221, 19199, 19177, 19155, 19133, 19112, 19090, 19068, 19046, 19025,
    19003, 18982, 18960, 18939, 18917, 18896, 18875, 18854, 18832, 18811,
    18790, 18769, 18748, 18727, 18706, 18686, 18665, 18644, 18623, 18603,
    18582, 18561, 18541, 18520, 18500, 18479, 18459, 18439, 18419, 18398,
    18378, 18358, 18338, 18318, 18298, 18278, 18258, 18238, 18218, 18199,
    18179, 18159, 18139, 18120, 18100, 18081, 18061, 18042, 18022, 18003,
    17984, 17964, 17945, 17926, 17907, 17888, 17869, 17850, 17831, 17812,
    17793, 17774, 17755, 17736, 17718, 17699, 17680, 17662, 17643, 17624,
    17606, 17587, 17569, 17551, 17532, 17514, 17496, 17477, 17459, 17441,
    17423, 17405, 17387, 17369, 17351, 17333, 17315, 17297, 17279, 17261,
    17244, 17226, 17208, 17191, 17173, 17155, 17138, 17120, 17103, 17085,
    17068, 17051, 17033, 17016, 16999, 16982, 16964, 16947, 16930, 16913,
    16896, 16879, 16862, 16845, 16828, 16811, 16794, 16778, 16761, 16744,
    16727, 16711, 16694, 16677, 16661, 16644, 16628, 16611, 16595, 16578,
    16562, 16546, 16529, 16513, 16497, 16481, 16464, 16448, 16432, 16416,
    16400, 16384 };
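
/* Illustrative check of the formula above (not used by the code):
   2^24 - 2^14 + 2^9 = 16761344, so for d10 = 512 the entry is
   floor(16761344/512) = 32737 (the first value above) and for d10 = 1023 it
   is floor(16761344/1023) = 16384 (the last value above). */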

/* Implements Algorithm 3 from "Improved Division by Invariant Integers",
   Niels Möller and Torbjörn Granlund, IEEE Transactions on Computers,
   volume 60, number 2, pages 165-175, 2011. */
#define __gmpfr_invert_limb(r, d)                                       \
    do {                                                                \
      mp_limb_t _d, _d0, _d10, _d21, _d31, _v0, _v1, _v2, _e, _h, _l;   \
      _d = (d);                                                         \
      _d0 = _d & 1;                                                     \
      _d10 = _d >> 22;                                                  \
      _d21 = (_d >> 11) + 1;                                            \
      _d31 = ((_d - 1) >> 1) + 1;                                       \
      _v0 = invert_limb_table[_d10 - 512];                              \
      umul_ppmm (_h, _l, _v0 * _v0, _d21);                              \
      _v1 = (_v0 << 4) - _h - 1;                                        \
      _e = - _v1 * _d31 + ((_v1 & - _d0) >> 1);                         \
      umul_ppmm (_h, _l, _v1, _e);                                      \
      _v2 = (_v1 << 15) + (_h >> 1);                                    \
      umul_ppmm (_h, _l, _v2, _d);                                      \
      /* v2 is too small iff (h+d)*2^32+l+d < 2^64 */                   \
      add_ssaaaa (_h, _l, _h, _l, _d, _d);                              \
      MPFR_ASSERTD (_h == MPFR_LIMB_ZERO || -_h == MPFR_LIMB_ONE);      \
      (r) = _v2 - _h;                                                   \
    } while (0)
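
/* Note (illustrative only, not used by MPFR itself): as in the 64-bit case,
   for a limb d with its most significant bit (bit 31) set, this computes
   r = floor((2^64 - 1)/d) - 2^32, i.e. the value GMP's invert_limb(d)
   would return. */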

#endif /* GMP_NUMB_BITS == 64 or 32 */