/* __gmpfr_invert_limb -- implement GMP's invert_limb (which is not in GMP API)

Copyright 2016-2023 Free Software Foundation, Inc.
Contributed by the AriC and Caramba projects, INRIA.

This file is part of the GNU MPFR Library.

The GNU MPFR Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at your
option) any later version.

The GNU MPFR Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MPFR Library; see the file COPYING.LESSER. If not, see
https://www.gnu.org/licenses/ or write to the Free Software Foundation, Inc.,
51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
*/ 22 23#define MPFR_NEED_LONGLONG_H 24#include "mpfr-impl.h" 25 26/* for now, we only provide __gmpfr_invert_limb for 64-bit limb */ 27#if GMP_NUMB_BITS == 64 28 29/* for 256 <= d9 < 512, invert_limb_table[d9-256] = floor((2^19-3*2^8)/d9) */ 30static const unsigned short invert_limb_table[256] = 31 { 2045, 2037, 2029, 2021, 2013, 2005, 1998, 1990, 32 1983, 1975, 1968, 1960, 1953, 1946, 1938, 1931, 33 1924, 1917, 1910, 1903, 1896, 1889, 1883, 1876, 34 1869, 1863, 1856, 1849, 1843, 1836, 1830, 1824, 35 1817, 1811, 1805, 1799, 1792, 1786, 1780, 1774, 36 1768, 1762, 1756, 1750, 1745, 1739, 1733, 1727, 37 1722, 1716, 1710, 1705, 1699, 1694, 1688, 1683, 38 1677, 1672, 1667, 1661, 1656, 1651, 1646, 1641, 39 1636, 1630, 1625, 1620, 1615, 1610, 1605, 1600, 40 1596, 1591, 1586, 1581, 1576, 1572, 1567, 1562, 41 1558, 1553, 1548, 1544, 1539, 1535, 1530, 1526, 42 1521, 1517, 1513, 1508, 1504, 1500, 1495, 1491, 43 1487, 1483, 1478, 1474, 1470, 1466, 1462, 1458, 44 1454, 1450, 1446, 1442, 1438, 1434, 1430, 1426, 45 1422, 1418, 1414, 1411, 1407, 1403, 1399, 1396, 46 1392, 1388, 1384, 1381, 1377, 1374, 1370, 1366, 47 1363, 1359, 1356, 1352, 1349, 1345, 1342, 1338, 48 1335, 1332, 1328, 1325, 1322, 1318, 1315, 1312, 49 1308, 1305, 1302, 1299, 1295, 1292, 1289, 1286, 50 1283, 1280, 1276, 1273, 1270, 1267, 1264, 1261, 51 1258, 1255, 1252, 1249, 1246, 1243, 1240, 1237, 52 1234, 1231, 1228, 1226, 1223, 1220, 1217, 1214, 53 1211, 1209, 1206, 1203, 1200, 1197, 1195, 1192, 54 1189, 1187, 1184, 1181, 1179, 1176, 1173, 1171, 55 1168, 1165, 1163, 1160, 1158, 1155, 1153, 1150, 56 1148, 1145, 1143, 1140, 1138, 1135, 1133, 1130, 57 1128, 1125, 1123, 1121, 1118, 1116, 1113, 1111, 58 1109, 1106, 1104, 1102, 1099, 1097, 1095, 1092, 59 1090, 1088, 1086, 1083, 1081, 1079, 1077, 1074, 60 1072, 1070, 1068, 1066, 1064, 1061, 1059, 1057, 61 1055, 1053, 1051, 1049, 1047, 1044, 1042, 1040, 62 1038, 1036, 1034, 1032, 1030, 1028, 1026, 1024 }; 63 64/* for 256 <= d9 < 512, invert_limb_table2[d9-256] = 
floor((2^19-3*2^8)/d9)^2 65 * 66 * Note: This table requires 4182025 to be representable in unsigned int, 67 * thus disallows 16-bit int, for instance. However, this code is under 68 * "#if GMP_NUMB_BITS == 64", and a system with int smaller than 32 bits 69 * should better be used with GMP_NUMB_BITS == 32 (or 16 if supported). 70 * This constraint is checked with MPFR_STAT_STATIC_ASSERT below. 71 */ 72static const unsigned int invert_limb_table2[256] = 73 { 4182025, 4149369, 4116841, 4084441, 4052169, 4020025, 3992004, 3960100, 3932289, 74 3900625, 3873024, 3841600, 3814209, 3786916, 3755844, 3728761, 3701776, 3674889, 75 3648100, 3621409, 3594816, 3568321, 3545689, 3519376, 3493161, 3470769, 3444736, 76 3418801, 3396649, 3370896, 3348900, 3326976, 3301489, 3279721, 3258025, 3236401, 77 3211264, 3189796, 3168400, 3147076, 3125824, 3104644, 3083536, 3062500, 3045025, 78 3024121, 3003289, 2982529, 2965284, 2944656, 2924100, 2907025, 2886601, 2869636, 79 2849344, 2832489, 2812329, 2795584, 2778889, 2758921, 2742336, 2725801, 2709316, 80 2692881, 2676496, 2656900, 2640625, 2624400, 2608225, 2592100, 2576025, 2560000, 81 2547216, 2531281, 2515396, 2499561, 2483776, 2471184, 2455489, 2439844, 2427364, 82 2411809, 2396304, 2383936, 2368521, 2356225, 2340900, 2328676, 2313441, 2301289, 83 2289169, 2274064, 2262016, 2250000, 2235025, 2223081, 2211169, 2199289, 2184484, 84 2172676, 2160900, 2149156, 2137444, 2125764, 2114116, 2102500, 2090916, 2079364, 85 2067844, 2056356, 2044900, 2033476, 2022084, 2010724, 1999396, 1990921, 1979649, 86 1968409, 1957201, 1948816, 1937664, 1926544, 1915456, 1907161, 1896129, 1887876, 87 1876900, 1865956, 1857769, 1846881, 1838736, 1827904, 1819801, 1809025, 1800964, 88 1790244, 1782225, 1774224, 1763584, 1755625, 1747684, 1737124, 1729225, 1721344, 89 1710864, 1703025, 1695204, 1687401, 1677025, 1669264, 1661521, 1653796, 1646089, 90 1638400, 1628176, 1620529, 1612900, 1605289, 1597696, 1590121, 1582564, 1575025, 91 1567504, 1560001, 
1552516, 1545049, 1537600, 1530169, 1522756, 1515361, 1507984, 92 1503076, 1495729, 1488400, 1481089, 1473796, 1466521, 1461681, 1454436, 1447209, 93 1440000, 1432809, 1428025, 1420864, 1413721, 1408969, 1401856, 1394761, 1390041, 94 1382976, 1375929, 1371241, 1364224, 1357225, 1352569, 1345600, 1340964, 1334025, 95 1329409, 1322500, 1317904, 1311025, 1306449, 1299600, 1295044, 1288225, 1283689, 96 1276900, 1272384, 1265625, 1261129, 1256641, 1249924, 1245456, 1238769, 1234321, 97 1229881, 1223236, 1218816, 1214404, 1207801, 1203409, 1199025, 1192464, 1188100, 98 1183744, 1179396, 1172889, 1168561, 1164241, 1159929, 1153476, 1149184, 1144900, 99 1140624, 1136356, 1132096, 1125721, 1121481, 1117249, 1113025, 1108809, 1104601, 100 1100401, 1096209, 1089936, 1085764, 1081600, 1077444, 1073296, 1069156, 1065024, 101 1060900, 1056784, 1052676, 1048576 }; 102 103/* Implements Algorithm 2 from "Improved Division by Invariant Integers", 104 Niels M��ller and Torbj��rn Granlund, IEEE Transactions on Computers, 105 volume 60, number 2, pages 165-175, 2011. 
*/ 106#define __gmpfr_invert_limb(r, d) \ 107 do { \ 108 mp_limb_t _d, _d0, _i, _d40, _d63, _v0, _v1, _v2, _e, _v3, _h, _l; \ 109 MPFR_STAT_STATIC_ASSERT (4182025 <= UINT_MAX); \ 110 _d = (d); \ 111 _i = (_d >> 55) - 256; /* i = d9 - 256 */ \ 112 /* the shift by 11 is for free since it is hidden in the */ \ 113 /* invert_limb_table2[_i] * _d40 multiplication latency */ \ 114 _v0 = (mp_limb_t) invert_limb_table[_i] << 11; \ 115 _d40 = (_d >> 24) + 1; \ 116 _v1 = _v0 - ((invert_limb_table2[_i] * _d40) >> 40) - 1; \ 117 _v2 = (_v1 << 13) + \ 118 ((_v1 * ((MPFR_LIMB_ONE << 60) - _v1 * _d40)) >> 47); \ 119 _d0 = _d & 1; \ 120 _d63 = ((_d - 1) >> 1) + 1; \ 121 _e = - _v2 * _d63 + ((_v2 & -_d0) >> 1); \ 122 umul_hi (_h, _v2, _e); \ 123 _v3 = (_v2 << 31) + (_h >> 1); \ 124 umul_ppmm (_h, _l, _v3, _d); \ 125 /* v3 is too small iff (h+d)*2^64+l+d < 2^128 */ \ 126 add_ssaaaa(_h, _l, _h, _l, _d, _d); \ 127 MPFR_ASSERTD(_h == MPFR_LIMB_ZERO || -_h == MPFR_LIMB_ONE); \ 128 (r) = _v3 - _h; \ 129 } while (0) 130 131/* same algorithm, but return the value v3, which is such that 132 v3 <= invert_limb (d) <= v3 + 1 */ 133#define __gmpfr_invert_limb_approx(r, d) \ 134 do { \ 135 mp_limb_t _d, _d0, _i, _d40, _d63, _v0, _v1, _v2, _e, _h; \ 136 MPFR_STAT_STATIC_ASSERT (4182025 <= UINT_MAX); \ 137 _d = (d); \ 138 _i = (_d >> 55) - 256; /* i = d9 - 256 */ \ 139 _v0 = (mp_limb_t) invert_limb_table[_i] << 11; \ 140 _d40 = (_d >> 24) + 1; \ 141 _v1 = _v0 - ((invert_limb_table2[_i] * _d40) >> 40) - 1; \ 142 _v2 = (_v1 << 13) + \ 143 ((_v1 * ((MPFR_LIMB_ONE << 60) - _v1 * _d40)) >> 47); \ 144 _d0 = _d & 1; \ 145 _d63 = ((_d - 1) >> 1) + 1; \ 146 _e = - _v2 * _d63 + ((_v2 & -_d0) >> 1); \ 147 umul_hi (_h, _v2, _e); \ 148 (r) = (_v2 << 31) + (_h >> 1); \ 149 } while (0) 150 151#elif GMP_NUMB_BITS == 32 152 153/* for 512 <= d10 < 1024, l[d10-512] = floor((2^24-2^14+2^9)/d10) */ 154static const unsigned short invert_limb_table[512] = 155 { 32737, 32673, 32609, 32546, 32483, 32420, 32357, 32295, 
32233, 32171, 156 32109, 32048, 31987, 31926, 31865, 31805, 31744, 31684, 31625, 31565, 157 31506, 31447, 31388, 31329, 31271, 31212, 31154, 31097, 31039, 30982, 158 30924, 30868, 30811, 30754, 30698, 30642, 30586, 30530, 30475, 30419, 159 30364, 30309, 30255, 30200, 30146, 30092, 30038, 29984, 29930, 29877, 160 29824, 29771, 29718, 29666, 29613, 29561, 29509, 29457, 29405, 29354, 161 29303, 29251, 29200, 29150, 29099, 29049, 28998, 28948, 28898, 28849, 162 28799, 28750, 28700, 28651, 28602, 28554, 28505, 28457, 28409, 28360, 163 28313, 28265, 28217, 28170, 28123, 28075, 28029, 27982, 27935, 27889, 164 27842, 27796, 27750, 27704, 27658, 27613, 27568, 27522, 27477, 27432, 165 27387, 27343, 27298, 27254, 27209, 27165, 27121, 27078, 27034, 26990, 166 26947, 26904, 26861, 26818, 26775, 26732, 26690, 26647, 26605, 26563, 167 26521, 26479, 26437, 26395, 26354, 26312, 26271, 26230, 26189, 26148, 168 26108, 26067, 26026, 25986, 25946, 25906, 25866, 25826, 25786, 25747, 169 25707, 25668, 25628, 25589, 25550, 25511, 25473, 25434, 25395, 25357, 170 25319, 25281, 25242, 25205, 25167, 25129, 25091, 25054, 25016, 24979, 171 24942, 24905, 24868, 24831, 24794, 24758, 24721, 24685, 24649, 24612, 172 24576, 24540, 24504, 24469, 24433, 24397, 24362, 24327, 24291, 24256, 173 24221, 24186, 24151, 24117, 24082, 24047, 24013, 23979, 23944, 23910, 174 23876, 23842, 23808, 23774, 23741, 23707, 23674, 23640, 23607, 23574, 175 23541, 23508, 23475, 23442, 23409, 23377, 23344, 23312, 23279, 23247, 176 23215, 23183, 23151, 23119, 23087, 23055, 23023, 22992, 22960, 22929, 177 22898, 22866, 22835, 22804, 22773, 22742, 22711, 22681, 22650, 22619, 178 22589, 22559, 22528, 22498, 22468, 22438, 22408, 22378, 22348, 22318, 179 22289, 22259, 22229, 22200, 22171, 22141, 22112, 22083, 22054, 22025, 180 21996, 21967, 21938, 21910, 21881, 21853, 21824, 21796, 21767, 21739, 181 21711, 21683, 21655, 21627, 21599, 21571, 21544, 21516, 21488, 21461, 182 21433, 21406, 21379, 21352, 21324, 21297, 21270, 21243, 
21216, 21190, 183 21163, 21136, 21110, 21083, 21056, 21030, 21004, 20977, 20951, 20925, 184 20899, 20873, 20847, 20821, 20795, 20769, 20744, 20718, 20693, 20667, 185 20642, 20616, 20591, 20566, 20540, 20515, 20490, 20465, 20440, 20415, 186 20390, 20366, 20341, 20316, 20292, 20267, 20243, 20218, 20194, 20170, 187 20145, 20121, 20097, 20073, 20049, 20025, 20001, 19977, 19953, 19930, 188 19906, 19882, 19859, 19835, 19812, 19789, 19765, 19742, 19719, 19696, 189 19672, 19649, 19626, 19603, 19581, 19558, 19535, 19512, 19489, 19467, 190 19444, 19422, 19399, 19377, 19354, 19332, 19310, 19288, 19265, 19243, 191 19221, 19199, 19177, 19155, 19133, 19112, 19090, 19068, 19046, 19025, 192 19003, 18982, 18960, 18939, 18917, 18896, 18875, 18854, 18832, 18811, 193 18790, 18769, 18748, 18727, 18706, 18686, 18665, 18644, 18623, 18603, 194 18582, 18561, 18541, 18520, 18500, 18479, 18459, 18439, 18419, 18398, 195 18378, 18358, 18338, 18318, 18298, 18278, 18258, 18238, 18218, 18199, 196 18179, 18159, 18139, 18120, 18100, 18081, 18061, 18042, 18022, 18003, 197 17984, 17964, 17945, 17926, 17907, 17888, 17869, 17850, 17831, 17812, 198 17793, 17774, 17755, 17736, 17718, 17699, 17680, 17662, 17643, 17624, 199 17606, 17587, 17569, 17551, 17532, 17514, 17496, 17477, 17459, 17441, 200 17423, 17405, 17387, 17369, 17351, 17333, 17315, 17297, 17279, 17261, 201 17244, 17226, 17208, 17191, 17173, 17155, 17138, 17120, 17103, 17085, 202 17068, 17051, 17033, 17016, 16999, 16982, 16964, 16947, 16930, 16913, 203 16896, 16879, 16862, 16845, 16828, 16811, 16794, 16778, 16761, 16744, 204 16727, 16711, 16694, 16677, 16661, 16644, 16628, 16611, 16595, 16578, 205 16562, 16546, 16529, 16513, 16497, 16481, 16464, 16448, 16432, 16416, 206 16400, 16384 }; 207 208/* Implements Algorithm 3 from "Improved Division by Invariant Integers", 209 Niels M��ller and Torbj��rn Granlund, IEEE Transactions on Computers, 210 volume 60, number 2, pages 165-175, 2011. 
*/ 211#define __gmpfr_invert_limb(r, d) \ 212 do { \ 213 mp_limb_t _d, _d0, _d10, _d21, _d31, _v0, _v1, _v2, _e, _h, _l; \ 214 _d = (d); \ 215 _d0 = _d & 1; \ 216 _d10 = _d >> 22; \ 217 _d21 = (_d >> 11) + 1; \ 218 _d31 = ((_d - 1) >> 1) + 1; \ 219 _v0 = invert_limb_table[_d10 - 512]; \ 220 umul_ppmm (_h, _l, _v0 * _v0, _d21); \ 221 _v1 = (_v0 << 4) - _h - 1; \ 222 _e = - _v1 * _d31 + ((_v1 & - _d0) >> 1); \ 223 umul_ppmm (_h, _l, _v1, _e); \ 224 _v2 = (_v1 << 15) + (_h >> 1); \ 225 umul_ppmm (_h, _l, _v2, _d); \ 226 /* v2 is too small iff (h+d)*2^32+l+d < 2^64 */ \ 227 add_ssaaaa(_h, _l, _h, _l, _d, _d); \ 228 MPFR_ASSERTD(_h == MPFR_LIMB_ZERO || -_h == MPFR_LIMB_ONE); \ 229 (r) = _v2 - _h; \ 230 } while (0) 231 232#endif /* GMP_NUMB_BITS == 64 or 32 */ 233