/* Copyright (C) 2007-2020 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

Under Section 7 of GPL version 3, you are granted additional
permissions described in the GCC Runtime Library Exception, version
3.1, as published by the Free Software Foundation.

You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
<http://www.gnu.org/licenses/>.  */

#ifndef _SQRT_MACROS_H_
#define _SQRT_MACROS_H_

#define FENCE __fence

#if DOUBLE_EXTENDED_ON

extern BINARY80 SQRT80 (BINARY80);


34__BID_INLINE__ UINT64
35short_sqrt128 (UINT128 A10) {
36  BINARY80 lx, ly, l64;
37  int_float f64;
38
39  // 2^64
40  f64.i = 0x5f800000;
41  l64 = (BINARY80) f64.d;
42  lx = (BINARY80) A10.w[1] * l64 + (BINARY80) A10.w[0];
43  ly = SQRT80 (lx);
44  return (UINT64) ly;
45}


48__BID_INLINE__ void
49long_sqrt128 (UINT128 * pCS, UINT256 C256) {
50  UINT256 C4;
51  UINT128 CS;
52  UINT64 X;
53  SINT64 SE;
54  BINARY80 l64, lm64, l128, lxL, lx, ly, lS, lSH, lSL, lE, l3, l2,
55    l1, l0, lp, lCl;
56  int_float fx, f64, fm64;
57  int *ple = (int *) &lx;
58
59  // 2^64
60  f64.i = 0x5f800000;
61  l64 = (BINARY80) f64.d;
62
63  l128 = l64 * l64;
64  lx = l3 = (BINARY80) C256.w[3] * l64 * l128;
65  l2 = (BINARY80) C256.w[2] * l128;
66  lx = FENCE (lx + l2);
67  l1 = (BINARY80) C256.w[1] * l64;
68  lx = FENCE (lx + l1);
69  l0 = (BINARY80) C256.w[0];
70  lx = FENCE (lx + l0);
71  // sqrt(C256)
72  lS = SQRT80 (lx);
73
74  // get coefficient
75  // 2^(-64)
76  fm64.i = 0x1f800000;
77  lm64 = (BINARY80) fm64.d;
78  CS.w[1] = (UINT64) (lS * lm64);
79  CS.w[0] = (UINT64) (lS - (BINARY80) CS.w[1] * l64);
80
81  ///////////////////////////////////////
82  //  CAUTION!
83  //  little endian code only
84  //  add solution for big endian
85  //////////////////////////////////////
86  lSH = lS;
87  *((UINT64 *) & lSH) &= 0xffffffff00000000ull;
88
89  // correction for C256 rounding
90  lCl = FENCE (l3 - lx);
91  lCl = FENCE (lCl + l2);
92  lCl = FENCE (lCl + l1);
93  lCl = FENCE (lCl + l0);
94
95  lSL = lS - lSH;
96
97  //////////////////////////////////////////
98  //   Watch for compiler re-ordering
99  //
100  /////////////////////////////////////////
101  // C256-S^2
102  lxL = FENCE (lx - lSH * lSH);
103  lp = lSH * lSL;
104  lp += lp;
105  lxL = FENCE (lxL - lp);
106  lSL *= lSL;
107  lxL = FENCE (lxL - lSL);
108  lCl += lxL;
109
110  // correction term
111  lE = lCl / (lS + lS);
112
113  // get low part of coefficient
114  X = CS.w[0];
115  if (lCl >= 0) {
116    SE = (SINT64) (lE);
117    CS.w[0] += SE;
118    if (CS.w[0] < X)
119      CS.w[1]++;
120  } else {
121    SE = (SINT64) (-lE);
122    CS.w[0] -= SE;
123    if (CS.w[0] > X)
124      CS.w[1]--;
125  }
126
127  pCS->w[0] = CS.w[0];
128  pCS->w[1] = CS.w[1];
129}

#else

extern double sqrt (double);

/* Integer square root of the 128-bit value A10, returned as a 64-bit
   integer rounded to nearest.  Fallback path when no 80-bit binary
   type is available: a double-precision 1/sqrt estimate RS is refined
   by one first-order correction carried out in 192/256-bit integer
   arithmetic.  */
__BID_INLINE__ UINT64
short_sqrt128 (UINT128 A10) {
  UINT256 ARS, ARS0, AE0, AE, S;

  UINT64 MY, ES, CY;
  double lx, l64;
  int_double f64, ly;
  int ey, k;

  // 2^64 (bit pattern of the double value 2^64)
  f64.i = 0x43f0000000000000ull;
  l64 = f64.d;
  // A10 as a rounded double: w[1]*2^64 + w[0]
  lx = (double) A10.w[1] * l64 + (double) A10.w[0];
  // initial reciprocal square root estimate RS ~= 1/sqrt(A10)
  ly.d = 1.0 / sqrt (lx);

  // split the estimate: MY = 53-bit significand with the implicit bit
  // set, ey = negated unbiased exponent, so RS = MY * 2^(-ey-52)
  // (assumes ly.d is positive and normal, i.e. A10 != 0 — confirm
  // callers guarantee this)
  MY = (ly.i & 0x000fffffffffffffull) | 0x0010000000000000ull;
  ey = 0x3ff - (ly.i >> 52);

  // A10*RS^2
  // ARS0 = A10*MY (192 bits), ARS = A10*MY^2 (256 bits), so
  // ARS = 2^(2*ey+104) * A10*RS^2 = 2^(2*ey+104) * (1 + eps)
  __mul_64x128_to_192 (ARS0, MY, A10);
  __mul_64x192_to_256 (ARS, MY, ARS0);

  // shr by 2*ey+40, to get a 64-bit value
  // (the leading 1 of 1+eps falls off the top of the 64-bit word,
  // leaving eps scaled by 2^64)
  k = (ey << 1) + 104 - 64;
  if (k >= 128) {
    if (k > 128)
      ES = (ARS.w[2] >> (k - 128)) | (ARS.w[3] << (192 - k));
    else
      ES = ARS.w[2];
  } else {
    if (k >= 64) {
      ARS.w[0] = ARS.w[1];
      ARS.w[1] = ARS.w[2];
      k -= 64;
    }
    if (k) {
      __shr_128 (ARS, ARS, k);
    }
    ES = ARS.w[0];
  }

  // signed halve: ES = eps/2, since sqrt(1+eps) ~= 1 + eps/2
  ES = ((SINT64) ES) >> 1;

  if (((SINT64) ES) < 0) {
    ES = -ES;

    // A*RS*eps (scaled by 2^64)
    __mul_64x192_to_256 (AE0, ES, ARS0);

    AE.w[0] = AE0.w[1];
    AE.w[1] = AE0.w[2];
    AE.w[2] = AE0.w[3];

    // eps < 0: S = A10*RS*(1 + |eps|/2) ~= sqrt(A10), still scaled
    __add_carry_out (S.w[0], CY, ARS0.w[0], AE.w[0]);
    __add_carry_in_out (S.w[1], CY, ARS0.w[1], AE.w[1], CY);
    S.w[2] = ARS0.w[2] + AE.w[2] + CY;
  } else {
    // A*RS*eps (scaled by 2^64)
    __mul_64x192_to_256 (AE0, ES, ARS0);

    AE.w[0] = AE0.w[1];
    AE.w[1] = AE0.w[2];
    AE.w[2] = AE0.w[3];

    // eps >= 0: S = A10*RS*(1 - eps/2) ~= sqrt(A10), still scaled
    __sub_borrow_out (S.w[0], CY, ARS0.w[0], AE.w[0]);
    __sub_borrow_in_out (S.w[1], CY, ARS0.w[1], AE.w[1], CY);
    S.w[2] = ARS0.w[2] - AE.w[2] - CY;
  }

  // undo the 2^(ey+52) scaling, keeping one extra bit for rounding
  // NOTE(review): if k >= 128 the trailing "k -= 64" below leaves k
  // negative before __shr_128 — presumably ey is bounded so that
  // k < 128 always holds here; verify against callers
  k = ey + 51;

  if (k >= 64) {
    if (k >= 128) {
      S.w[0] = S.w[2];
      S.w[1] = 0;
      k -= 128;
    } else {
      S.w[0] = S.w[1];
      S.w[1] = S.w[2];
    }
    k -= 64;
  }
  if (k) {
    __shr_128 (S, S, k);
  }


  // round to nearest: add the guard bit, then drop it
  return (UINT64) ((S.w[0] + 1) >> 1);

}



/* 128-bit integer square root of the 256-bit value C256, stored in
   *pCS, rounded to nearest.  Fallback path when no 80-bit binary type
   is available: a double-precision 1/sqrt estimate RS is refined with
   first- and second-order correction terms in up to 512-bit integer
   arithmetic.  */
__BID_INLINE__ void
long_sqrt128 (UINT128 * pCS, UINT256 C256) {
  UINT512 ARS0, ARS;
  UINT256 ARS00, AE, AE2, S;
  UINT128 ES, ES2, ARS1;
  UINT64 ES32, CY, MY;
  double l64, l128, lx, l2, l1, l0;
  int_double f64, ly;
  int ey, k, k2;

  // 2^64 (bit pattern of the double value 2^64)
  f64.i = 0x43f0000000000000ull;
  l64 = f64.d;

  l128 = l64 * l64;
  // C256 as a rounded double:
  // w[3]*2^192 + w[2]*2^128 + w[1]*2^64 + w[0]
  // (FENCE keeps the additions in this exact order)
  lx = (double) C256.w[3] * l64 * l128;
  l2 = (double) C256.w[2] * l128;
  lx = FENCE (lx + l2);
  l1 = (double) C256.w[1] * l64;
  lx = FENCE (lx + l1);
  l0 = (double) C256.w[0];
  lx = FENCE (lx + l0);
  // sqrt(C256)
  ly.d = 1.0 / sqrt (lx);

  // MY = 53-bit significand (implicit bit set), ey = negated unbiased
  // exponent, so RS = MY * 2^(-ey-52) ~= 1/sqrt(C256)
  MY = (ly.i & 0x000fffffffffffffull) | 0x0010000000000000ull;
  ey = 0x3ff - (ly.i >> 52);

  // A10*RS^2, scaled by 2^(2*ey+104)
  // ARS0 = C256*MY (320 bits), ARS = C256*MY^2 (384 bits)
  __mul_64x256_to_320 (ARS0, MY, C256);
  __mul_64x320_to_384 (ARS, MY, ARS0);

  // shr by k=(2*ey+104)-128
  // expect k is in the range (192, 256) if result in [10^33, 10^34)
  // apply an additional signed shift by 1 at the same time (to get eps=eps0/2)
  k = (ey << 1) + 104 - 128 - 192;
  k2 = 64 - k;
  ES.w[0] = (ARS.w[3] >> (k + 1)) | (ARS.w[4] << (k2 - 1));
  ES.w[1] = (ARS.w[4] >> k) | (ARS.w[5] << k2);
  ES.w[1] = ((SINT64) ES.w[1]) >> 1;

  // A*RS >> 192 (for error term computation)
  ARS1.w[0] = ARS0.w[3];
  ARS1.w[1] = ARS0.w[4];

  // A*RS>>64
  ARS00.w[0] = ARS0.w[1];
  ARS00.w[1] = ARS0.w[2];
  ARS00.w[2] = ARS0.w[3];
  ARS00.w[3] = ARS0.w[4];

  if (((SINT64) ES.w[1]) < 0) {
    // eps < 0: negate the 128-bit eps (two's complement) ...
    ES.w[0] = -ES.w[0];
    ES.w[1] = -ES.w[1];
    if (ES.w[0])
      ES.w[1]--;

    // A*RS*eps
    __mul_128x128_to_256 (AE, ES, ARS1);

    // ... then add the first-order term: S = A*RS*(1 + |eps|)
    __add_carry_out (S.w[0], CY, ARS00.w[0], AE.w[0]);
    __add_carry_in_out (S.w[1], CY, ARS00.w[1], AE.w[1], CY);
    __add_carry_in_out (S.w[2], CY, ARS00.w[2], AE.w[2], CY);
    S.w[3] = ARS00.w[3] + AE.w[3] + CY;
  } else {
    // A*RS*eps
    __mul_128x128_to_256 (AE, ES, ARS1);

    // subtract the first-order term: S = A*RS*(1 - eps)
    __sub_borrow_out (S.w[0], CY, ARS00.w[0], AE.w[0]);
    __sub_borrow_in_out (S.w[1], CY, ARS00.w[1], AE.w[1], CY);
    __sub_borrow_in_out (S.w[2], CY, ARS00.w[2], AE.w[2], CY);
    S.w[3] = ARS00.w[3] - AE.w[3] - CY;
  }

  // 3/2*eps^2, scaled by 2^128
  // (second-order term of (1+eps0)^(-1/2) with eps=eps0/2; only the
  // high word of eps contributes at this precision)
  ES32 = ES.w[1] + (ES.w[1] >> 1);
  __mul_64x64_to_128 (ES2, ES32, ES.w[1]);
  // A*RS*3/2*eps^2
  __mul_128x128_to_256 (AE2, ES2, ARS1);

  // result, scaled by 2^(ey+52-64)
  __add_carry_out (S.w[0], CY, S.w[0], AE2.w[0]);
  __add_carry_in_out (S.w[1], CY, S.w[1], AE2.w[1], CY);
  __add_carry_in_out (S.w[2], CY, S.w[2], AE2.w[2], CY);
  S.w[3] = S.w[3] + AE2.w[3] + CY;

  // k in (0, 64)
  // (a shift count of 0 or 64 would be undefined; relies on the
  // coefficient range noted above — TODO confirm the ey bounds)
  k = ey + 51 - 128;
  k2 = 64 - k;
  S.w[0] = (S.w[1] >> k) | (S.w[2] << k2);
  S.w[1] = (S.w[2] >> k) | (S.w[3] << k2);

  // round to nearest
  S.w[0]++;
  if (!S.w[0])
    S.w[1]++;

  // drop the guard bit and store the 128-bit coefficient
  pCS->w[0] = (S.w[1] << 63) | (S.w[0] >> 1);
  pCS->w[1] = S.w[1] >> 1;

}

#endif
#endif