1169689Skan/* Software floating-point emulation.
2169689Skan   Basic two-word fraction declaration and manipulation.
3171825Skan   Copyright (C) 1997,1998,1999,2006,2007 Free Software Foundation, Inc.
4169689Skan   This file is part of the GNU C Library.
5169689Skan   Contributed by Richard Henderson (rth@cygnus.com),
6169689Skan		  Jakub Jelinek (jj@ultra.linux.cz),
7169689Skan		  David S. Miller (davem@redhat.com) and
8169689Skan		  Peter Maydell (pmaydell@chiark.greenend.org.uk).
9169689Skan
10169689Skan   The GNU C Library is free software; you can redistribute it and/or
11169689Skan   modify it under the terms of the GNU Lesser General Public
12169689Skan   License as published by the Free Software Foundation; either
13169689Skan   version 2.1 of the License, or (at your option) any later version.
14169689Skan
15169689Skan   In addition to the permissions in the GNU Lesser General Public
16169689Skan   License, the Free Software Foundation gives you unlimited
17169689Skan   permission to link the compiled version of this file into
18169689Skan   combinations with other programs, and to distribute those
19169689Skan   combinations without any restriction coming from the use of this
20169689Skan   file.  (The Lesser General Public License restrictions do apply in
21169689Skan   other respects; for example, they cover modification of the file,
22169689Skan   and distribution when not linked into a combine executable.)
23169689Skan
24169689Skan   The GNU C Library is distributed in the hope that it will be useful,
25169689Skan   but WITHOUT ANY WARRANTY; without even the implied warranty of
26169689Skan   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
27169689Skan   Lesser General Public License for more details.
28169689Skan
29169689Skan   You should have received a copy of the GNU Lesser General Public
30169689Skan   License along with the GNU C Library; if not, write to the Free
31169689Skan   Software Foundation, 51 Franklin Street, Fifth Floor, Boston,
32169689Skan   MA 02110-1301, USA.  */
33169689Skan
34169689Skan#define _FP_FRAC_DECL_2(X)	_FP_W_TYPE X##_f0, X##_f1
35169689Skan#define _FP_FRAC_COPY_2(D,S)	(D##_f0 = S##_f0, D##_f1 = S##_f1)
36169689Skan#define _FP_FRAC_SET_2(X,I)	__FP_FRAC_SET_2(X, I)
37169689Skan#define _FP_FRAC_HIGH_2(X)	(X##_f1)
38169689Skan#define _FP_FRAC_LOW_2(X)	(X##_f0)
39169689Skan#define _FP_FRAC_WORD_2(X,w)	(X##_f##w)
40169689Skan
41169689Skan#define _FP_FRAC_SLL_2(X,N)						    \
42169689Skan(void)(((N) < _FP_W_TYPE_SIZE)						    \
43169689Skan       ? ({								    \
44169689Skan	    if (__builtin_constant_p(N) && (N) == 1)			    \
45169689Skan	      {								    \
46169689Skan		X##_f1 = X##_f1 + X##_f1 + (((_FP_WS_TYPE)(X##_f0)) < 0);   \
47169689Skan		X##_f0 += X##_f0;					    \
48169689Skan	      }								    \
49169689Skan	    else							    \
50169689Skan	      {								    \
51169689Skan		X##_f1 = X##_f1 << (N) | X##_f0 >> (_FP_W_TYPE_SIZE - (N)); \
52169689Skan		X##_f0 <<= (N);						    \
53169689Skan	      }								    \
54169689Skan	    0;								    \
55169689Skan	  })								    \
56169689Skan       : ({								    \
57169689Skan	    X##_f1 = X##_f0 << ((N) - _FP_W_TYPE_SIZE);			    \
58169689Skan	    X##_f0 = 0;							    \
59169689Skan	  }))
60169689Skan
61169689Skan
62169689Skan#define _FP_FRAC_SRL_2(X,N)						\
63169689Skan(void)(((N) < _FP_W_TYPE_SIZE)						\
64169689Skan       ? ({								\
65169689Skan	    X##_f0 = X##_f0 >> (N) | X##_f1 << (_FP_W_TYPE_SIZE - (N));	\
66169689Skan	    X##_f1 >>= (N);						\
67169689Skan	  })								\
68169689Skan       : ({								\
69169689Skan	    X##_f0 = X##_f1 >> ((N) - _FP_W_TYPE_SIZE);			\
70169689Skan	    X##_f1 = 0;							\
71169689Skan	  }))
72169689Skan
73169689Skan/* Right shift with sticky-lsb.  */
74169689Skan#define _FP_FRAC_SRST_2(X,S, N,sz)					  \
75169689Skan(void)(((N) < _FP_W_TYPE_SIZE)						  \
76169689Skan       ? ({								  \
77169689Skan	    S = (__builtin_constant_p(N) && (N) == 1			  \
78169689Skan		 ? X##_f0 & 1						  \
79169689Skan		 : (X##_f0 << (_FP_W_TYPE_SIZE - (N))) != 0);		  \
80169689Skan	    X##_f0 = (X##_f1 << (_FP_W_TYPE_SIZE - (N)) | X##_f0 >> (N)); \
81169689Skan	    X##_f1 >>= (N);						  \
82169689Skan	  })								  \
83169689Skan       : ({								  \
84169689Skan	    S = ((((N) == _FP_W_TYPE_SIZE				  \
85169689Skan		   ? 0							  \
86169689Skan		   : (X##_f1 << (2*_FP_W_TYPE_SIZE - (N))))		  \
87169689Skan		  | X##_f0) != 0);					  \
88169689Skan	    X##_f0 = (X##_f1 >> ((N) - _FP_W_TYPE_SIZE));		  \
89169689Skan	    X##_f1 = 0;							  \
90169689Skan	  }))
91169689Skan
92169689Skan#define _FP_FRAC_SRS_2(X,N,sz)						  \
93169689Skan(void)(((N) < _FP_W_TYPE_SIZE)						  \
94169689Skan       ? ({								  \
95169689Skan	    X##_f0 = (X##_f1 << (_FP_W_TYPE_SIZE - (N)) | X##_f0 >> (N) | \
96169689Skan		      (__builtin_constant_p(N) && (N) == 1		  \
97169689Skan		       ? X##_f0 & 1					  \
98169689Skan		       : (X##_f0 << (_FP_W_TYPE_SIZE - (N))) != 0));	  \
99169689Skan	    X##_f1 >>= (N);						  \
100169689Skan	  })								  \
101169689Skan       : ({								  \
102169689Skan	    X##_f0 = (X##_f1 >> ((N) - _FP_W_TYPE_SIZE) |		  \
103169689Skan		      ((((N) == _FP_W_TYPE_SIZE				  \
104169689Skan			 ? 0						  \
105169689Skan			 : (X##_f1 << (2*_FP_W_TYPE_SIZE - (N))))	  \
106169689Skan			| X##_f0) != 0));				  \
107169689Skan	    X##_f1 = 0;							  \
108169689Skan	  }))
109169689Skan
110169689Skan#define _FP_FRAC_ADDI_2(X,I)	\
111169689Skan  __FP_FRAC_ADDI_2(X##_f1, X##_f0, I)
112169689Skan
113169689Skan#define _FP_FRAC_ADD_2(R,X,Y)	\
114169689Skan  __FP_FRAC_ADD_2(R##_f1, R##_f0, X##_f1, X##_f0, Y##_f1, Y##_f0)
115169689Skan
116169689Skan#define _FP_FRAC_SUB_2(R,X,Y)	\
117169689Skan  __FP_FRAC_SUB_2(R##_f1, R##_f0, X##_f1, X##_f0, Y##_f1, Y##_f0)
118169689Skan
119169689Skan#define _FP_FRAC_DEC_2(X,Y)	\
120169689Skan  __FP_FRAC_DEC_2(X##_f1, X##_f0, Y##_f1, Y##_f0)
121169689Skan
122169689Skan#define _FP_FRAC_CLZ_2(R,X)	\
123169689Skan  do {				\
124169689Skan    if (X##_f1)			\
125169689Skan      __FP_CLZ(R,X##_f1);	\
126169689Skan    else 			\
127169689Skan    {				\
128169689Skan      __FP_CLZ(R,X##_f0);	\
129169689Skan      R += _FP_W_TYPE_SIZE;	\
130169689Skan    }				\
131169689Skan  } while(0)
132169689Skan
133169689Skan/* Predicates */
134169689Skan#define _FP_FRAC_NEGP_2(X)	((_FP_WS_TYPE)X##_f1 < 0)
135169689Skan#define _FP_FRAC_ZEROP_2(X)	((X##_f1 | X##_f0) == 0)
136169689Skan#define _FP_FRAC_OVERP_2(fs,X)	(_FP_FRAC_HIGH_##fs(X) & _FP_OVERFLOW_##fs)
137169689Skan#define _FP_FRAC_CLEAR_OVERP_2(fs,X)	(_FP_FRAC_HIGH_##fs(X) &= ~_FP_OVERFLOW_##fs)
138169689Skan#define _FP_FRAC_EQ_2(X, Y)	(X##_f1 == Y##_f1 && X##_f0 == Y##_f0)
139169689Skan#define _FP_FRAC_GT_2(X, Y)	\
140169689Skan  (X##_f1 > Y##_f1 || (X##_f1 == Y##_f1 && X##_f0 > Y##_f0))
141169689Skan#define _FP_FRAC_GE_2(X, Y)	\
142169689Skan  (X##_f1 > Y##_f1 || (X##_f1 == Y##_f1 && X##_f0 >= Y##_f0))
143169689Skan
144169689Skan#define _FP_ZEROFRAC_2		0, 0
145169689Skan#define _FP_MINFRAC_2		0, 1
146169689Skan#define _FP_MAXFRAC_2		(~(_FP_WS_TYPE)0), (~(_FP_WS_TYPE)0)
147169689Skan
148169689Skan/*
149169689Skan * Internals
150169689Skan */
151169689Skan
152169689Skan#define __FP_FRAC_SET_2(X,I1,I0)	(X##_f0 = I0, X##_f1 = I1)
153169689Skan
154169689Skan#define __FP_CLZ_2(R, xh, xl)	\
155169689Skan  do {				\
156169689Skan    if (xh)			\
157169689Skan      __FP_CLZ(R,xh);		\
158169689Skan    else 			\
159169689Skan    {				\
160169689Skan      __FP_CLZ(R,xl);		\
161169689Skan      R += _FP_W_TYPE_SIZE;	\
162169689Skan    }				\
163169689Skan  } while(0)
164169689Skan
165169689Skan#if 0
166169689Skan
167169689Skan#ifndef __FP_FRAC_ADDI_2
168169689Skan#define __FP_FRAC_ADDI_2(xh, xl, i)	\
169169689Skan  (xh += ((xl += i) < i))
170169689Skan#endif
171169689Skan#ifndef __FP_FRAC_ADD_2
172169689Skan#define __FP_FRAC_ADD_2(rh, rl, xh, xl, yh, yl)	\
173169689Skan  (rh = xh + yh + ((rl = xl + yl) < xl))
174169689Skan#endif
175169689Skan#ifndef __FP_FRAC_SUB_2
176169689Skan#define __FP_FRAC_SUB_2(rh, rl, xh, xl, yh, yl)	\
177169689Skan  (rh = xh - yh - ((rl = xl - yl) > xl))
178169689Skan#endif
179169689Skan#ifndef __FP_FRAC_DEC_2
180169689Skan#define __FP_FRAC_DEC_2(xh, xl, yh, yl)	\
181169689Skan  do {					\
182169689Skan    UWtype _t = xl;			\
183169689Skan    xh -= yh + ((xl -= yl) > _t);	\
184169689Skan  } while (0)
185169689Skan#endif
186169689Skan
187169689Skan#else
188169689Skan
189169689Skan#undef __FP_FRAC_ADDI_2
190169689Skan#define __FP_FRAC_ADDI_2(xh, xl, i)	add_ssaaaa(xh, xl, xh, xl, 0, i)
191169689Skan#undef __FP_FRAC_ADD_2
192169689Skan#define __FP_FRAC_ADD_2			add_ssaaaa
193169689Skan#undef __FP_FRAC_SUB_2
194169689Skan#define __FP_FRAC_SUB_2			sub_ddmmss
195169689Skan#undef __FP_FRAC_DEC_2
196169689Skan#define __FP_FRAC_DEC_2(xh, xl, yh, yl)	sub_ddmmss(xh, xl, xh, xl, yh, yl)
197169689Skan
198169689Skan#endif
199169689Skan
200169689Skan/*
201169689Skan * Unpack the raw bits of a native fp value.  Do not classify or
202169689Skan * normalize the data.
203169689Skan */
204169689Skan
205169689Skan#define _FP_UNPACK_RAW_2(fs, X, val)			\
206169689Skan  do {							\
207169689Skan    union _FP_UNION_##fs _flo; _flo.flt = (val);	\
208169689Skan							\
209169689Skan    X##_f0 = _flo.bits.frac0;				\
210169689Skan    X##_f1 = _flo.bits.frac1;				\
211169689Skan    X##_e  = _flo.bits.exp;				\
212169689Skan    X##_s  = _flo.bits.sign;				\
213169689Skan  } while (0)
214169689Skan
215169689Skan#define _FP_UNPACK_RAW_2_P(fs, X, val)			\
216169689Skan  do {							\
217169689Skan    union _FP_UNION_##fs *_flo =			\
218169689Skan      (union _FP_UNION_##fs *)(val);			\
219169689Skan							\
220169689Skan    X##_f0 = _flo->bits.frac0;				\
221169689Skan    X##_f1 = _flo->bits.frac1;				\
222169689Skan    X##_e  = _flo->bits.exp;				\
223169689Skan    X##_s  = _flo->bits.sign;				\
224169689Skan  } while (0)
225169689Skan
226169689Skan
227169689Skan/*
228169689Skan * Repack the raw bits of a native fp value.
229169689Skan */
230169689Skan
231169689Skan#define _FP_PACK_RAW_2(fs, val, X)			\
232169689Skan  do {							\
233169689Skan    union _FP_UNION_##fs _flo;				\
234169689Skan							\
235169689Skan    _flo.bits.frac0 = X##_f0;				\
236169689Skan    _flo.bits.frac1 = X##_f1;				\
237169689Skan    _flo.bits.exp   = X##_e;				\
238169689Skan    _flo.bits.sign  = X##_s;				\
239169689Skan							\
240169689Skan    (val) = _flo.flt;					\
241169689Skan  } while (0)
242169689Skan
243169689Skan#define _FP_PACK_RAW_2_P(fs, val, X)			\
244169689Skan  do {							\
245169689Skan    union _FP_UNION_##fs *_flo =			\
246169689Skan      (union _FP_UNION_##fs *)(val);			\
247169689Skan							\
248169689Skan    _flo->bits.frac0 = X##_f0;				\
249169689Skan    _flo->bits.frac1 = X##_f1;				\
250169689Skan    _flo->bits.exp   = X##_e;				\
251169689Skan    _flo->bits.sign  = X##_s;				\
252169689Skan  } while (0)
253169689Skan
254169689Skan
255169689Skan/*
256169689Skan * Multiplication algorithms:
257169689Skan */
258169689Skan
259169689Skan/* Given a 1W * 1W => 2W primitive, do the extended multiplication.  */
260169689Skan
261169689Skan#define _FP_MUL_MEAT_2_wide(wfracbits, R, X, Y, doit)			\
262169689Skan  do {									\
263169689Skan    _FP_FRAC_DECL_4(_z); _FP_FRAC_DECL_2(_b); _FP_FRAC_DECL_2(_c);	\
264169689Skan									\
265169689Skan    doit(_FP_FRAC_WORD_4(_z,1), _FP_FRAC_WORD_4(_z,0), X##_f0, Y##_f0);	\
266169689Skan    doit(_b_f1, _b_f0, X##_f0, Y##_f1);					\
267169689Skan    doit(_c_f1, _c_f0, X##_f1, Y##_f0);					\
268169689Skan    doit(_FP_FRAC_WORD_4(_z,3), _FP_FRAC_WORD_4(_z,2), X##_f1, Y##_f1);	\
269169689Skan									\
270169689Skan    __FP_FRAC_ADD_3(_FP_FRAC_WORD_4(_z,3),_FP_FRAC_WORD_4(_z,2),	\
271169689Skan		    _FP_FRAC_WORD_4(_z,1), 0, _b_f1, _b_f0,		\
272169689Skan		    _FP_FRAC_WORD_4(_z,3),_FP_FRAC_WORD_4(_z,2),	\
273169689Skan		    _FP_FRAC_WORD_4(_z,1));				\
274169689Skan    __FP_FRAC_ADD_3(_FP_FRAC_WORD_4(_z,3),_FP_FRAC_WORD_4(_z,2),	\
275169689Skan		    _FP_FRAC_WORD_4(_z,1), 0, _c_f1, _c_f0,		\
276169689Skan		    _FP_FRAC_WORD_4(_z,3),_FP_FRAC_WORD_4(_z,2),	\
277169689Skan		    _FP_FRAC_WORD_4(_z,1));				\
278169689Skan									\
279169689Skan    /* Normalize since we know where the msb of the multiplicands	\
280169689Skan       were (bit B), we know that the msb of the of the product is	\
281169689Skan       at either 2B or 2B-1.  */					\
282169689Skan    _FP_FRAC_SRS_4(_z, wfracbits-1, 2*wfracbits);			\
283169689Skan    R##_f0 = _FP_FRAC_WORD_4(_z,0);					\
284169689Skan    R##_f1 = _FP_FRAC_WORD_4(_z,1);					\
285169689Skan  } while (0)
286169689Skan
287169689Skan/* Given a 1W * 1W => 2W primitive, do the extended multiplication.
288169689Skan   Do only 3 multiplications instead of four. This one is for machines
289169689Skan   where multiplication is much more expensive than subtraction.  */
290169689Skan
291169689Skan#define _FP_MUL_MEAT_2_wide_3mul(wfracbits, R, X, Y, doit)		\
292169689Skan  do {									\
293169689Skan    _FP_FRAC_DECL_4(_z); _FP_FRAC_DECL_2(_b); _FP_FRAC_DECL_2(_c);	\
294169689Skan    _FP_W_TYPE _d;							\
295169689Skan    int _c1, _c2;							\
296169689Skan									\
297169689Skan    _b_f0 = X##_f0 + X##_f1;						\
298169689Skan    _c1 = _b_f0 < X##_f0;						\
299169689Skan    _b_f1 = Y##_f0 + Y##_f1;						\
300169689Skan    _c2 = _b_f1 < Y##_f0;						\
301169689Skan    doit(_d, _FP_FRAC_WORD_4(_z,0), X##_f0, Y##_f0);			\
302169689Skan    doit(_FP_FRAC_WORD_4(_z,2), _FP_FRAC_WORD_4(_z,1), _b_f0, _b_f1);	\
303169689Skan    doit(_c_f1, _c_f0, X##_f1, Y##_f1);					\
304169689Skan									\
305169689Skan    _b_f0 &= -_c2;							\
306169689Skan    _b_f1 &= -_c1;							\
307169689Skan    __FP_FRAC_ADD_3(_FP_FRAC_WORD_4(_z,3),_FP_FRAC_WORD_4(_z,2),	\
308169689Skan		    _FP_FRAC_WORD_4(_z,1), (_c1 & _c2), 0, _d,		\
309169689Skan		    0, _FP_FRAC_WORD_4(_z,2), _FP_FRAC_WORD_4(_z,1));	\
310169689Skan    __FP_FRAC_ADDI_2(_FP_FRAC_WORD_4(_z,3),_FP_FRAC_WORD_4(_z,2),	\
311169689Skan		     _b_f0);						\
312169689Skan    __FP_FRAC_ADDI_2(_FP_FRAC_WORD_4(_z,3),_FP_FRAC_WORD_4(_z,2),	\
313169689Skan		     _b_f1);						\
314169689Skan    __FP_FRAC_DEC_3(_FP_FRAC_WORD_4(_z,3),_FP_FRAC_WORD_4(_z,2),	\
315169689Skan		    _FP_FRAC_WORD_4(_z,1),				\
316169689Skan		    0, _d, _FP_FRAC_WORD_4(_z,0));			\
317169689Skan    __FP_FRAC_DEC_3(_FP_FRAC_WORD_4(_z,3),_FP_FRAC_WORD_4(_z,2),	\
318169689Skan		    _FP_FRAC_WORD_4(_z,1), 0, _c_f1, _c_f0);		\
319169689Skan    __FP_FRAC_ADD_2(_FP_FRAC_WORD_4(_z,3), _FP_FRAC_WORD_4(_z,2),	\
320169689Skan		    _c_f1, _c_f0,					\
321169689Skan		    _FP_FRAC_WORD_4(_z,3), _FP_FRAC_WORD_4(_z,2));	\
322169689Skan									\
323169689Skan    /* Normalize since we know where the msb of the multiplicands	\
324169689Skan       were (bit B), we know that the msb of the of the product is	\
325169689Skan       at either 2B or 2B-1.  */					\
326169689Skan    _FP_FRAC_SRS_4(_z, wfracbits-1, 2*wfracbits);			\
327169689Skan    R##_f0 = _FP_FRAC_WORD_4(_z,0);					\
328169689Skan    R##_f1 = _FP_FRAC_WORD_4(_z,1);					\
329169689Skan  } while (0)
330169689Skan
331169689Skan#define _FP_MUL_MEAT_2_gmp(wfracbits, R, X, Y)				\
332169689Skan  do {									\
333169689Skan    _FP_FRAC_DECL_4(_z);						\
334169689Skan    _FP_W_TYPE _x[2], _y[2];						\
335169689Skan    _x[0] = X##_f0; _x[1] = X##_f1;					\
336169689Skan    _y[0] = Y##_f0; _y[1] = Y##_f1;					\
337169689Skan									\
338169689Skan    mpn_mul_n(_z_f, _x, _y, 2);						\
339169689Skan									\
340169689Skan    /* Normalize since we know where the msb of the multiplicands	\
341169689Skan       were (bit B), we know that the msb of the of the product is	\
342169689Skan       at either 2B or 2B-1.  */					\
343169689Skan    _FP_FRAC_SRS_4(_z, wfracbits-1, 2*wfracbits);			\
344169689Skan    R##_f0 = _z_f[0];							\
345169689Skan    R##_f1 = _z_f[1];							\
346169689Skan  } while (0)
347169689Skan
348169689Skan/* Do at most 120x120=240 bits multiplication using double floating
349169689Skan   point multiplication.  This is useful if floating point
350169689Skan   multiplication has much bigger throughput than integer multiply.
351169689Skan   It is supposed to work for _FP_W_TYPE_SIZE 64 and wfracbits
352169689Skan   between 106 and 120 only.
353169689Skan   Caller guarantees that X and Y has (1LLL << (wfracbits - 1)) set.
354169689Skan   SETFETZ is a macro which will disable all FPU exceptions and set rounding
355169689Skan   towards zero,  RESETFE should optionally reset it back.  */
356169689Skan
357169689Skan#define _FP_MUL_MEAT_2_120_240_double(wfracbits, R, X, Y, setfetz, resetfe)	\
358169689Skan  do {										\
359169689Skan    static const double _const[] = {						\
360169689Skan      /* 2^-24 */ 5.9604644775390625e-08,					\
361169689Skan      /* 2^-48 */ 3.5527136788005009e-15,					\
362169689Skan      /* 2^-72 */ 2.1175823681357508e-22,					\
363169689Skan      /* 2^-96 */ 1.2621774483536189e-29,					\
364169689Skan      /* 2^28 */ 2.68435456e+08,						\
365169689Skan      /* 2^4 */ 1.600000e+01,							\
366169689Skan      /* 2^-20 */ 9.5367431640625e-07,						\
367169689Skan      /* 2^-44 */ 5.6843418860808015e-14,					\
368169689Skan      /* 2^-68 */ 3.3881317890172014e-21,					\
369169689Skan      /* 2^-92 */ 2.0194839173657902e-28,					\
370169689Skan      /* 2^-116 */ 1.2037062152420224e-35};					\
371169689Skan    double _a240, _b240, _c240, _d240, _e240, _f240, 				\
372169689Skan	   _g240, _h240, _i240, _j240, _k240;					\
373169689Skan    union { double d; UDItype i; } _l240, _m240, _n240, _o240,			\
374169689Skan				   _p240, _q240, _r240, _s240;			\
375169689Skan    UDItype _t240, _u240, _v240, _w240, _x240, _y240 = 0;			\
376169689Skan										\
377169689Skan    if (wfracbits < 106 || wfracbits > 120)					\
378169689Skan      abort();									\
379169689Skan										\
380169689Skan    setfetz;									\
381169689Skan										\
382169689Skan    _e240 = (double)(long)(X##_f0 & 0xffffff);					\
383169689Skan    _j240 = (double)(long)(Y##_f0 & 0xffffff);					\
384169689Skan    _d240 = (double)(long)((X##_f0 >> 24) & 0xffffff);				\
385169689Skan    _i240 = (double)(long)((Y##_f0 >> 24) & 0xffffff);				\
386169689Skan    _c240 = (double)(long)(((X##_f1 << 16) & 0xffffff) | (X##_f0 >> 48));	\
387169689Skan    _h240 = (double)(long)(((Y##_f1 << 16) & 0xffffff) | (Y##_f0 >> 48));	\
388169689Skan    _b240 = (double)(long)((X##_f1 >> 8) & 0xffffff);				\
389169689Skan    _g240 = (double)(long)((Y##_f1 >> 8) & 0xffffff);				\
390169689Skan    _a240 = (double)(long)(X##_f1 >> 32);					\
391169689Skan    _f240 = (double)(long)(Y##_f1 >> 32);					\
392169689Skan    _e240 *= _const[3];								\
393169689Skan    _j240 *= _const[3];								\
394169689Skan    _d240 *= _const[2];								\
395169689Skan    _i240 *= _const[2];								\
396169689Skan    _c240 *= _const[1];								\
397169689Skan    _h240 *= _const[1];								\
398169689Skan    _b240 *= _const[0];								\
399169689Skan    _g240 *= _const[0];								\
400169689Skan    _s240.d =							      _e240*_j240;\
401169689Skan    _r240.d =						_d240*_j240 + _e240*_i240;\
402169689Skan    _q240.d =				  _c240*_j240 + _d240*_i240 + _e240*_h240;\
403169689Skan    _p240.d =		    _b240*_j240 + _c240*_i240 + _d240*_h240 + _e240*_g240;\
404169689Skan    _o240.d = _a240*_j240 + _b240*_i240 + _c240*_h240 + _d240*_g240 + _e240*_f240;\
405169689Skan    _n240.d = _a240*_i240 + _b240*_h240 + _c240*_g240 + _d240*_f240;		\
406169689Skan    _m240.d = _a240*_h240 + _b240*_g240 + _c240*_f240;				\
407169689Skan    _l240.d = _a240*_g240 + _b240*_f240;					\
408169689Skan    _k240 =   _a240*_f240;							\
409169689Skan    _r240.d += _s240.d;								\
410169689Skan    _q240.d += _r240.d;								\
411169689Skan    _p240.d += _q240.d;								\
412169689Skan    _o240.d += _p240.d;								\
413169689Skan    _n240.d += _o240.d;								\
414169689Skan    _m240.d += _n240.d;								\
415169689Skan    _l240.d += _m240.d;								\
416169689Skan    _k240 += _l240.d;								\
417169689Skan    _s240.d -= ((_const[10]+_s240.d)-_const[10]);				\
418169689Skan    _r240.d -= ((_const[9]+_r240.d)-_const[9]);					\
419169689Skan    _q240.d -= ((_const[8]+_q240.d)-_const[8]);					\
420169689Skan    _p240.d -= ((_const[7]+_p240.d)-_const[7]);					\
421169689Skan    _o240.d += _const[7];							\
422169689Skan    _n240.d += _const[6];							\
423169689Skan    _m240.d += _const[5];							\
424169689Skan    _l240.d += _const[4];							\
425169689Skan    if (_s240.d != 0.0) _y240 = 1;						\
426169689Skan    if (_r240.d != 0.0) _y240 = 1;						\
427169689Skan    if (_q240.d != 0.0) _y240 = 1;						\
428169689Skan    if (_p240.d != 0.0) _y240 = 1;						\
429169689Skan    _t240 = (DItype)_k240;							\
430169689Skan    _u240 = _l240.i;								\
431169689Skan    _v240 = _m240.i;								\
432169689Skan    _w240 = _n240.i;								\
433169689Skan    _x240 = _o240.i;								\
434169689Skan    R##_f1 = (_t240 << (128 - (wfracbits - 1)))					\
435169689Skan	     | ((_u240 & 0xffffff) >> ((wfracbits - 1) - 104));			\
436169689Skan    R##_f0 = ((_u240 & 0xffffff) << (168 - (wfracbits - 1)))			\
437169689Skan    	     | ((_v240 & 0xffffff) << (144 - (wfracbits - 1)))			\
438169689Skan    	     | ((_w240 & 0xffffff) << (120 - (wfracbits - 1)))			\
439169689Skan    	     | ((_x240 & 0xffffff) >> ((wfracbits - 1) - 96))			\
440169689Skan    	     | _y240;								\
441169689Skan    resetfe;									\
442169689Skan  } while (0)
443169689Skan
444169689Skan/*
445169689Skan * Division algorithms:
446169689Skan */
447169689Skan
448169689Skan#define _FP_DIV_MEAT_2_udiv(fs, R, X, Y)				\
449169689Skan  do {									\
450169689Skan    _FP_W_TYPE _n_f2, _n_f1, _n_f0, _r_f1, _r_f0, _m_f1, _m_f0;		\
451169689Skan    if (_FP_FRAC_GT_2(X, Y))						\
452169689Skan      {									\
453169689Skan	_n_f2 = X##_f1 >> 1;						\
454169689Skan	_n_f1 = X##_f1 << (_FP_W_TYPE_SIZE - 1) | X##_f0 >> 1;		\
455169689Skan	_n_f0 = X##_f0 << (_FP_W_TYPE_SIZE - 1);			\
456169689Skan      }									\
457169689Skan    else								\
458169689Skan      {									\
459169689Skan	R##_e--;							\
460169689Skan	_n_f2 = X##_f1;							\
461169689Skan	_n_f1 = X##_f0;							\
462169689Skan	_n_f0 = 0;							\
463169689Skan      }									\
464169689Skan									\
465169689Skan    /* Normalize, i.e. make the most significant bit of the 		\
466169689Skan       denominator set. */						\
467169689Skan    _FP_FRAC_SLL_2(Y, _FP_WFRACXBITS_##fs);				\
468169689Skan									\
469169689Skan    udiv_qrnnd(R##_f1, _r_f1, _n_f2, _n_f1, Y##_f1);			\
470169689Skan    umul_ppmm(_m_f1, _m_f0, R##_f1, Y##_f0);				\
471169689Skan    _r_f0 = _n_f0;							\
472169689Skan    if (_FP_FRAC_GT_2(_m, _r))						\
473169689Skan      {									\
474169689Skan	R##_f1--;							\
475169689Skan	_FP_FRAC_ADD_2(_r, Y, _r);					\
476169689Skan	if (_FP_FRAC_GE_2(_r, Y) && _FP_FRAC_GT_2(_m, _r))		\
477169689Skan	  {								\
478169689Skan	    R##_f1--;							\
479169689Skan	    _FP_FRAC_ADD_2(_r, Y, _r);					\
480169689Skan	  }								\
481169689Skan      }									\
482169689Skan    _FP_FRAC_DEC_2(_r, _m);						\
483169689Skan									\
484169689Skan    if (_r_f1 == Y##_f1)						\
485169689Skan      {									\
486169689Skan	/* This is a special case, not an optimization			\
487169689Skan	   (_r/Y##_f1 would not fit into UWtype).			\
488169689Skan	   As _r is guaranteed to be < Y,  R##_f0 can be either		\
489169689Skan	   (UWtype)-1 or (UWtype)-2.  But as we know what kind		\
490169689Skan	   of bits it is (sticky, guard, round),  we don't care.	\
491169689Skan	   We also don't care what the reminder is,  because the	\
492169689Skan	   guard bit will be set anyway.  -jj */			\
493169689Skan	R##_f0 = -1;							\
494169689Skan      }									\
495169689Skan    else								\
496169689Skan      {									\
497169689Skan	udiv_qrnnd(R##_f0, _r_f1, _r_f1, _r_f0, Y##_f1);		\
498169689Skan	umul_ppmm(_m_f1, _m_f0, R##_f0, Y##_f0);			\
499169689Skan	_r_f0 = 0;							\
500169689Skan	if (_FP_FRAC_GT_2(_m, _r))					\
501169689Skan	  {								\
502169689Skan	    R##_f0--;							\
503169689Skan	    _FP_FRAC_ADD_2(_r, Y, _r);					\
504169689Skan	    if (_FP_FRAC_GE_2(_r, Y) && _FP_FRAC_GT_2(_m, _r))		\
505169689Skan	      {								\
506169689Skan		R##_f0--;						\
507169689Skan		_FP_FRAC_ADD_2(_r, Y, _r);				\
508169689Skan	      }								\
509169689Skan	  }								\
510169689Skan	if (!_FP_FRAC_EQ_2(_r, _m))					\
511169689Skan	  R##_f0 |= _FP_WORK_STICKY;					\
512169689Skan      }									\
513169689Skan  } while (0)
514169689Skan
515169689Skan
516169689Skan#define _FP_DIV_MEAT_2_gmp(fs, R, X, Y)					\
517169689Skan  do {									\
518169689Skan    _FP_W_TYPE _x[4], _y[2], _z[4];					\
519169689Skan    _y[0] = Y##_f0; _y[1] = Y##_f1;					\
520169689Skan    _x[0] = _x[3] = 0;							\
521169689Skan    if (_FP_FRAC_GT_2(X, Y))						\
522169689Skan      {									\
523169689Skan	R##_e++;							\
524169689Skan	_x[1] = (X##_f0 << (_FP_WFRACBITS_##fs-1 - _FP_W_TYPE_SIZE) |	\
525169689Skan		 X##_f1 >> (_FP_W_TYPE_SIZE -				\
526169689Skan			    (_FP_WFRACBITS_##fs-1 - _FP_W_TYPE_SIZE)));	\
527169689Skan	_x[2] = X##_f1 << (_FP_WFRACBITS_##fs-1 - _FP_W_TYPE_SIZE);	\
528169689Skan      }									\
529169689Skan    else								\
530169689Skan      {									\
531169689Skan	_x[1] = (X##_f0 << (_FP_WFRACBITS_##fs - _FP_W_TYPE_SIZE) |	\
532169689Skan		 X##_f1 >> (_FP_W_TYPE_SIZE -				\
533169689Skan			    (_FP_WFRACBITS_##fs - _FP_W_TYPE_SIZE)));	\
534169689Skan	_x[2] = X##_f1 << (_FP_WFRACBITS_##fs - _FP_W_TYPE_SIZE);	\
535169689Skan      }									\
536169689Skan									\
537169689Skan    (void) mpn_divrem (_z, 0, _x, 4, _y, 2);				\
538169689Skan    R##_f1 = _z[1];							\
539169689Skan    R##_f0 = _z[0] | ((_x[0] | _x[1]) != 0);				\
540169689Skan  } while (0)
541169689Skan
542169689Skan
543169689Skan/*
544169689Skan * Square root algorithms:
545169689Skan * We have just one right now, maybe Newton approximation
546169689Skan * should be added for those machines where division is fast.
547169689Skan */
548169689Skan
549169689Skan#define _FP_SQRT_MEAT_2(R, S, T, X, q)			\
550169689Skan  do {							\
551169689Skan    while (q)						\
552169689Skan      {							\
553169689Skan	T##_f1 = S##_f1 + q;				\
554169689Skan	if (T##_f1 <= X##_f1)				\
555169689Skan	  {						\
556169689Skan	    S##_f1 = T##_f1 + q;			\
557169689Skan	    X##_f1 -= T##_f1;				\
558169689Skan	    R##_f1 += q;				\
559169689Skan	  }						\
560169689Skan	_FP_FRAC_SLL_2(X, 1);				\
561169689Skan	q >>= 1;					\
562169689Skan      }							\
563169689Skan    q = (_FP_W_TYPE)1 << (_FP_W_TYPE_SIZE - 1);		\
564169689Skan    while (q != _FP_WORK_ROUND)				\
565169689Skan      {							\
566169689Skan	T##_f0 = S##_f0 + q;				\
567169689Skan	T##_f1 = S##_f1;				\
568169689Skan	if (T##_f1 < X##_f1 || 				\
569169689Skan	    (T##_f1 == X##_f1 && T##_f0 <= X##_f0))	\
570169689Skan	  {						\
571169689Skan	    S##_f0 = T##_f0 + q;			\
572169689Skan	    S##_f1 += (T##_f0 > S##_f0);		\
573169689Skan	    _FP_FRAC_DEC_2(X, T);			\
574169689Skan	    R##_f0 += q;				\
575169689Skan	  }						\
576169689Skan	_FP_FRAC_SLL_2(X, 1);				\
577169689Skan	q >>= 1;					\
578169689Skan      }							\
579169689Skan    if (X##_f0 | X##_f1)				\
580169689Skan      {							\
581169689Skan	if (S##_f1 < X##_f1 || 				\
582169689Skan	    (S##_f1 == X##_f1 && S##_f0 < X##_f0))	\
583169689Skan	  R##_f0 |= _FP_WORK_ROUND;			\
584169689Skan	R##_f0 |= _FP_WORK_STICKY;			\
585169689Skan      }							\
586169689Skan  } while (0)
587169689Skan
588169689Skan
589169689Skan/*
590169689Skan * Assembly/disassembly for converting to/from integral types.
591169689Skan * No shifting or overflow handled here.
592169689Skan */
593169689Skan
594169689Skan#define _FP_FRAC_ASSEMBLE_2(r, X, rsize)	\
595169689Skan(void)((rsize <= _FP_W_TYPE_SIZE)		\
596169689Skan       ? ({ r = X##_f0; })			\
597169689Skan       : ({					\
598169689Skan	    r = X##_f1;				\
599169689Skan	    r <<= _FP_W_TYPE_SIZE;		\
600169689Skan	    r += X##_f0;			\
601169689Skan	  }))
602169689Skan
603169689Skan#define _FP_FRAC_DISASSEMBLE_2(X, r, rsize)				\
604169689Skan  do {									\
605169689Skan    X##_f0 = r;								\
606169689Skan    X##_f1 = (rsize <= _FP_W_TYPE_SIZE ? 0 : r >> _FP_W_TYPE_SIZE);	\
607169689Skan  } while (0)
608169689Skan
609169689Skan/*
610169689Skan * Convert FP values between word sizes
611169689Skan */
612169689Skan
613169689Skan#define _FP_FRAC_COPY_1_2(D, S)		(D##_f = S##_f0)
614169689Skan
615169689Skan#define _FP_FRAC_COPY_2_1(D, S)		((D##_f0 = S##_f), (D##_f1 = 0))
616171825Skan
617171825Skan#define _FP_FRAC_COPY_2_2(D,S)		_FP_FRAC_COPY_2(D,S)
618