1/* Software floating-point emulation.
2   Basic two-word fraction declaration and manipulation.
3   Copyright (C) 1997,1998,1999,2006,2007 Free Software Foundation, Inc.
4   This file is part of the GNU C Library.
5   Contributed by Richard Henderson (rth@cygnus.com),
6		  Jakub Jelinek (jj@ultra.linux.cz),
7		  David S. Miller (davem@redhat.com) and
8		  Peter Maydell (pmaydell@chiark.greenend.org.uk).
9
10   The GNU C Library is free software; you can redistribute it and/or
11   modify it under the terms of the GNU Lesser General Public
12   License as published by the Free Software Foundation; either
13   version 2.1 of the License, or (at your option) any later version.
14
15   In addition to the permissions in the GNU Lesser General Public
16   License, the Free Software Foundation gives you unlimited
17   permission to link the compiled version of this file into
18   combinations with other programs, and to distribute those
19   combinations without any restriction coming from the use of this
20   file.  (The Lesser General Public License restrictions do apply in
21   other respects; for example, they cover modification of the file,
22   and distribution when not linked into a combine executable.)
23
24   The GNU C Library is distributed in the hope that it will be useful,
25   but WITHOUT ANY WARRANTY; without even the implied warranty of
26   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
27   Lesser General Public License for more details.
28
29   You should have received a copy of the GNU Lesser General Public
30   License along with the GNU C Library; if not, write to the Free
31   Software Foundation, 51 Franklin Street, Fifth Floor, Boston,
32   MA 02110-1301, USA.  */
33
34#define _FP_FRAC_DECL_2(X)	_FP_W_TYPE X##_f0, X##_f1
35#define _FP_FRAC_COPY_2(D,S)	(D##_f0 = S##_f0, D##_f1 = S##_f1)
36#define _FP_FRAC_SET_2(X,I)	__FP_FRAC_SET_2(X, I)
37#define _FP_FRAC_HIGH_2(X)	(X##_f1)
38#define _FP_FRAC_LOW_2(X)	(X##_f0)
39#define _FP_FRAC_WORD_2(X,w)	(X##_f##w)
40
41#define _FP_FRAC_SLL_2(X,N)						    \
42(void)(((N) < _FP_W_TYPE_SIZE)						    \
43       ? ({								    \
44	    if (__builtin_constant_p(N) && (N) == 1)			    \
45	      {								    \
46		X##_f1 = X##_f1 + X##_f1 + (((_FP_WS_TYPE)(X##_f0)) < 0);   \
47		X##_f0 += X##_f0;					    \
48	      }								    \
49	    else							    \
50	      {								    \
51		X##_f1 = X##_f1 << (N) | X##_f0 >> (_FP_W_TYPE_SIZE - (N)); \
52		X##_f0 <<= (N);						    \
53	      }								    \
54	    0;								    \
55	  })								    \
56       : ({								    \
57	    X##_f1 = X##_f0 << ((N) - _FP_W_TYPE_SIZE);			    \
58	    X##_f0 = 0;							    \
59	  }))
60
61
62#define _FP_FRAC_SRL_2(X,N)						\
63(void)(((N) < _FP_W_TYPE_SIZE)						\
64       ? ({								\
65	    X##_f0 = X##_f0 >> (N) | X##_f1 << (_FP_W_TYPE_SIZE - (N));	\
66	    X##_f1 >>= (N);						\
67	  })								\
68       : ({								\
69	    X##_f0 = X##_f1 >> ((N) - _FP_W_TYPE_SIZE);			\
70	    X##_f1 = 0;							\
71	  }))
72
73/* Right shift with sticky-lsb.  */
74#define _FP_FRAC_SRST_2(X,S, N,sz)					  \
75(void)(((N) < _FP_W_TYPE_SIZE)						  \
76       ? ({								  \
77	    S = (__builtin_constant_p(N) && (N) == 1			  \
78		 ? X##_f0 & 1						  \
79		 : (X##_f0 << (_FP_W_TYPE_SIZE - (N))) != 0);		  \
80	    X##_f0 = (X##_f1 << (_FP_W_TYPE_SIZE - (N)) | X##_f0 >> (N)); \
81	    X##_f1 >>= (N);						  \
82	  })								  \
83       : ({								  \
84	    S = ((((N) == _FP_W_TYPE_SIZE				  \
85		   ? 0							  \
86		   : (X##_f1 << (2*_FP_W_TYPE_SIZE - (N))))		  \
87		  | X##_f0) != 0);					  \
88	    X##_f0 = (X##_f1 >> ((N) - _FP_W_TYPE_SIZE));		  \
89	    X##_f1 = 0;							  \
90	  }))
91
92#define _FP_FRAC_SRS_2(X,N,sz)						  \
93(void)(((N) < _FP_W_TYPE_SIZE)						  \
94       ? ({								  \
95	    X##_f0 = (X##_f1 << (_FP_W_TYPE_SIZE - (N)) | X##_f0 >> (N) | \
96		      (__builtin_constant_p(N) && (N) == 1		  \
97		       ? X##_f0 & 1					  \
98		       : (X##_f0 << (_FP_W_TYPE_SIZE - (N))) != 0));	  \
99	    X##_f1 >>= (N);						  \
100	  })								  \
101       : ({								  \
102	    X##_f0 = (X##_f1 >> ((N) - _FP_W_TYPE_SIZE) |		  \
103		      ((((N) == _FP_W_TYPE_SIZE				  \
104			 ? 0						  \
105			 : (X##_f1 << (2*_FP_W_TYPE_SIZE - (N))))	  \
106			| X##_f0) != 0));				  \
107	    X##_f1 = 0;							  \
108	  }))
109
110#define _FP_FRAC_ADDI_2(X,I)	\
111  __FP_FRAC_ADDI_2(X##_f1, X##_f0, I)
112
113#define _FP_FRAC_ADD_2(R,X,Y)	\
114  __FP_FRAC_ADD_2(R##_f1, R##_f0, X##_f1, X##_f0, Y##_f1, Y##_f0)
115
116#define _FP_FRAC_SUB_2(R,X,Y)	\
117  __FP_FRAC_SUB_2(R##_f1, R##_f0, X##_f1, X##_f0, Y##_f1, Y##_f0)
118
119#define _FP_FRAC_DEC_2(X,Y)	\
120  __FP_FRAC_DEC_2(X##_f1, X##_f0, Y##_f1, Y##_f0)
121
122#define _FP_FRAC_CLZ_2(R,X)	\
123  do {				\
124    if (X##_f1)			\
125      __FP_CLZ(R,X##_f1);	\
126    else 			\
127    {				\
128      __FP_CLZ(R,X##_f0);	\
129      R += _FP_W_TYPE_SIZE;	\
130    }				\
131  } while(0)
132
133/* Predicates */
134#define _FP_FRAC_NEGP_2(X)	((_FP_WS_TYPE)X##_f1 < 0)
135#define _FP_FRAC_ZEROP_2(X)	((X##_f1 | X##_f0) == 0)
136#define _FP_FRAC_OVERP_2(fs,X)	(_FP_FRAC_HIGH_##fs(X) & _FP_OVERFLOW_##fs)
137#define _FP_FRAC_CLEAR_OVERP_2(fs,X)	(_FP_FRAC_HIGH_##fs(X) &= ~_FP_OVERFLOW_##fs)
138#define _FP_FRAC_EQ_2(X, Y)	(X##_f1 == Y##_f1 && X##_f0 == Y##_f0)
139#define _FP_FRAC_GT_2(X, Y)	\
140  (X##_f1 > Y##_f1 || (X##_f1 == Y##_f1 && X##_f0 > Y##_f0))
141#define _FP_FRAC_GE_2(X, Y)	\
142  (X##_f1 > Y##_f1 || (X##_f1 == Y##_f1 && X##_f0 >= Y##_f0))
143
144#define _FP_ZEROFRAC_2		0, 0
145#define _FP_MINFRAC_2		0, 1
146#define _FP_MAXFRAC_2		(~(_FP_WS_TYPE)0), (~(_FP_WS_TYPE)0)
147
148/*
149 * Internals
150 */
151
152#define __FP_FRAC_SET_2(X,I1,I0)	(X##_f0 = I0, X##_f1 = I1)
153
154#define __FP_CLZ_2(R, xh, xl)	\
155  do {				\
156    if (xh)			\
157      __FP_CLZ(R,xh);		\
158    else 			\
159    {				\
160      __FP_CLZ(R,xl);		\
161      R += _FP_W_TYPE_SIZE;	\
162    }				\
163  } while(0)
164
165#if 0
166
167#ifndef __FP_FRAC_ADDI_2
168#define __FP_FRAC_ADDI_2(xh, xl, i)	\
169  (xh += ((xl += i) < i))
170#endif
171#ifndef __FP_FRAC_ADD_2
172#define __FP_FRAC_ADD_2(rh, rl, xh, xl, yh, yl)	\
173  (rh = xh + yh + ((rl = xl + yl) < xl))
174#endif
175#ifndef __FP_FRAC_SUB_2
176#define __FP_FRAC_SUB_2(rh, rl, xh, xl, yh, yl)	\
177  (rh = xh - yh - ((rl = xl - yl) > xl))
178#endif
179#ifndef __FP_FRAC_DEC_2
180#define __FP_FRAC_DEC_2(xh, xl, yh, yl)	\
181  do {					\
182    UWtype _t = xl;			\
183    xh -= yh + ((xl -= yl) > _t);	\
184  } while (0)
185#endif
186
187#else
188
189#undef __FP_FRAC_ADDI_2
190#define __FP_FRAC_ADDI_2(xh, xl, i)	add_ssaaaa(xh, xl, xh, xl, 0, i)
191#undef __FP_FRAC_ADD_2
192#define __FP_FRAC_ADD_2			add_ssaaaa
193#undef __FP_FRAC_SUB_2
194#define __FP_FRAC_SUB_2			sub_ddmmss
195#undef __FP_FRAC_DEC_2
196#define __FP_FRAC_DEC_2(xh, xl, yh, yl)	sub_ddmmss(xh, xl, xh, xl, yh, yl)
197
198#endif
199
200/*
201 * Unpack the raw bits of a native fp value.  Do not classify or
202 * normalize the data.
203 */
204
205#define _FP_UNPACK_RAW_2(fs, X, val)			\
206  do {							\
207    union _FP_UNION_##fs _flo; _flo.flt = (val);	\
208							\
209    X##_f0 = _flo.bits.frac0;				\
210    X##_f1 = _flo.bits.frac1;				\
211    X##_e  = _flo.bits.exp;				\
212    X##_s  = _flo.bits.sign;				\
213  } while (0)
214
215#define _FP_UNPACK_RAW_2_P(fs, X, val)			\
216  do {							\
217    union _FP_UNION_##fs *_flo =			\
218      (union _FP_UNION_##fs *)(val);			\
219							\
220    X##_f0 = _flo->bits.frac0;				\
221    X##_f1 = _flo->bits.frac1;				\
222    X##_e  = _flo->bits.exp;				\
223    X##_s  = _flo->bits.sign;				\
224  } while (0)
225
226
227/*
228 * Repack the raw bits of a native fp value.
229 */
230
231#define _FP_PACK_RAW_2(fs, val, X)			\
232  do {							\
233    union _FP_UNION_##fs _flo;				\
234							\
235    _flo.bits.frac0 = X##_f0;				\
236    _flo.bits.frac1 = X##_f1;				\
237    _flo.bits.exp   = X##_e;				\
238    _flo.bits.sign  = X##_s;				\
239							\
240    (val) = _flo.flt;					\
241  } while (0)
242
243#define _FP_PACK_RAW_2_P(fs, val, X)			\
244  do {							\
245    union _FP_UNION_##fs *_flo =			\
246      (union _FP_UNION_##fs *)(val);			\
247							\
248    _flo->bits.frac0 = X##_f0;				\
249    _flo->bits.frac1 = X##_f1;				\
250    _flo->bits.exp   = X##_e;				\
251    _flo->bits.sign  = X##_s;				\
252  } while (0)
253
254
255/*
256 * Multiplication algorithms:
257 */
258
259/* Given a 1W * 1W => 2W primitive, do the extended multiplication.  */
260
261#define _FP_MUL_MEAT_2_wide(wfracbits, R, X, Y, doit)			\
262  do {									\
263    _FP_FRAC_DECL_4(_z); _FP_FRAC_DECL_2(_b); _FP_FRAC_DECL_2(_c);	\
264									\
265    doit(_FP_FRAC_WORD_4(_z,1), _FP_FRAC_WORD_4(_z,0), X##_f0, Y##_f0);	\
266    doit(_b_f1, _b_f0, X##_f0, Y##_f1);					\
267    doit(_c_f1, _c_f0, X##_f1, Y##_f0);					\
268    doit(_FP_FRAC_WORD_4(_z,3), _FP_FRAC_WORD_4(_z,2), X##_f1, Y##_f1);	\
269									\
270    __FP_FRAC_ADD_3(_FP_FRAC_WORD_4(_z,3),_FP_FRAC_WORD_4(_z,2),	\
271		    _FP_FRAC_WORD_4(_z,1), 0, _b_f1, _b_f0,		\
272		    _FP_FRAC_WORD_4(_z,3),_FP_FRAC_WORD_4(_z,2),	\
273		    _FP_FRAC_WORD_4(_z,1));				\
274    __FP_FRAC_ADD_3(_FP_FRAC_WORD_4(_z,3),_FP_FRAC_WORD_4(_z,2),	\
275		    _FP_FRAC_WORD_4(_z,1), 0, _c_f1, _c_f0,		\
276		    _FP_FRAC_WORD_4(_z,3),_FP_FRAC_WORD_4(_z,2),	\
277		    _FP_FRAC_WORD_4(_z,1));				\
278									\
279    /* Normalize since we know where the msb of the multiplicands	\
280       were (bit B), we know that the msb of the of the product is	\
281       at either 2B or 2B-1.  */					\
282    _FP_FRAC_SRS_4(_z, wfracbits-1, 2*wfracbits);			\
283    R##_f0 = _FP_FRAC_WORD_4(_z,0);					\
284    R##_f1 = _FP_FRAC_WORD_4(_z,1);					\
285  } while (0)
286
287/* Given a 1W * 1W => 2W primitive, do the extended multiplication.
288   Do only 3 multiplications instead of four. This one is for machines
289   where multiplication is much more expensive than subtraction.  */
290
291#define _FP_MUL_MEAT_2_wide_3mul(wfracbits, R, X, Y, doit)		\
292  do {									\
293    _FP_FRAC_DECL_4(_z); _FP_FRAC_DECL_2(_b); _FP_FRAC_DECL_2(_c);	\
294    _FP_W_TYPE _d;							\
295    int _c1, _c2;							\
296									\
297    _b_f0 = X##_f0 + X##_f1;						\
298    _c1 = _b_f0 < X##_f0;						\
299    _b_f1 = Y##_f0 + Y##_f1;						\
300    _c2 = _b_f1 < Y##_f0;						\
301    doit(_d, _FP_FRAC_WORD_4(_z,0), X##_f0, Y##_f0);			\
302    doit(_FP_FRAC_WORD_4(_z,2), _FP_FRAC_WORD_4(_z,1), _b_f0, _b_f1);	\
303    doit(_c_f1, _c_f0, X##_f1, Y##_f1);					\
304									\
305    _b_f0 &= -_c2;							\
306    _b_f1 &= -_c1;							\
307    __FP_FRAC_ADD_3(_FP_FRAC_WORD_4(_z,3),_FP_FRAC_WORD_4(_z,2),	\
308		    _FP_FRAC_WORD_4(_z,1), (_c1 & _c2), 0, _d,		\
309		    0, _FP_FRAC_WORD_4(_z,2), _FP_FRAC_WORD_4(_z,1));	\
310    __FP_FRAC_ADDI_2(_FP_FRAC_WORD_4(_z,3),_FP_FRAC_WORD_4(_z,2),	\
311		     _b_f0);						\
312    __FP_FRAC_ADDI_2(_FP_FRAC_WORD_4(_z,3),_FP_FRAC_WORD_4(_z,2),	\
313		     _b_f1);						\
314    __FP_FRAC_DEC_3(_FP_FRAC_WORD_4(_z,3),_FP_FRAC_WORD_4(_z,2),	\
315		    _FP_FRAC_WORD_4(_z,1),				\
316		    0, _d, _FP_FRAC_WORD_4(_z,0));			\
317    __FP_FRAC_DEC_3(_FP_FRAC_WORD_4(_z,3),_FP_FRAC_WORD_4(_z,2),	\
318		    _FP_FRAC_WORD_4(_z,1), 0, _c_f1, _c_f0);		\
319    __FP_FRAC_ADD_2(_FP_FRAC_WORD_4(_z,3), _FP_FRAC_WORD_4(_z,2),	\
320		    _c_f1, _c_f0,					\
321		    _FP_FRAC_WORD_4(_z,3), _FP_FRAC_WORD_4(_z,2));	\
322									\
323    /* Normalize since we know where the msb of the multiplicands	\
324       were (bit B), we know that the msb of the of the product is	\
325       at either 2B or 2B-1.  */					\
326    _FP_FRAC_SRS_4(_z, wfracbits-1, 2*wfracbits);			\
327    R##_f0 = _FP_FRAC_WORD_4(_z,0);					\
328    R##_f1 = _FP_FRAC_WORD_4(_z,1);					\
329  } while (0)
330
331#define _FP_MUL_MEAT_2_gmp(wfracbits, R, X, Y)				\
332  do {									\
333    _FP_FRAC_DECL_4(_z);						\
334    _FP_W_TYPE _x[2], _y[2];						\
335    _x[0] = X##_f0; _x[1] = X##_f1;					\
336    _y[0] = Y##_f0; _y[1] = Y##_f1;					\
337									\
338    mpn_mul_n(_z_f, _x, _y, 2);						\
339									\
340    /* Normalize since we know where the msb of the multiplicands	\
341       were (bit B), we know that the msb of the of the product is	\
342       at either 2B or 2B-1.  */					\
343    _FP_FRAC_SRS_4(_z, wfracbits-1, 2*wfracbits);			\
344    R##_f0 = _z_f[0];							\
345    R##_f1 = _z_f[1];							\
346  } while (0)
347
348/* Do at most 120x120=240 bits multiplication using double floating
349   point multiplication.  This is useful if floating point
350   multiplication has much bigger throughput than integer multiply.
351   It is supposed to work for _FP_W_TYPE_SIZE 64 and wfracbits
352   between 106 and 120 only.
353   Caller guarantees that X and Y has (1LLL << (wfracbits - 1)) set.
354   SETFETZ is a macro which will disable all FPU exceptions and set rounding
355   towards zero,  RESETFE should optionally reset it back.  */
356
357#define _FP_MUL_MEAT_2_120_240_double(wfracbits, R, X, Y, setfetz, resetfe)	\
358  do {										\
359    static const double _const[] = {						\
360      /* 2^-24 */ 5.9604644775390625e-08,					\
361      /* 2^-48 */ 3.5527136788005009e-15,					\
362      /* 2^-72 */ 2.1175823681357508e-22,					\
363      /* 2^-96 */ 1.2621774483536189e-29,					\
364      /* 2^28 */ 2.68435456e+08,						\
365      /* 2^4 */ 1.600000e+01,							\
366      /* 2^-20 */ 9.5367431640625e-07,						\
367      /* 2^-44 */ 5.6843418860808015e-14,					\
368      /* 2^-68 */ 3.3881317890172014e-21,					\
369      /* 2^-92 */ 2.0194839173657902e-28,					\
370      /* 2^-116 */ 1.2037062152420224e-35};					\
371    double _a240, _b240, _c240, _d240, _e240, _f240, 				\
372	   _g240, _h240, _i240, _j240, _k240;					\
373    union { double d; UDItype i; } _l240, _m240, _n240, _o240,			\
374				   _p240, _q240, _r240, _s240;			\
375    UDItype _t240, _u240, _v240, _w240, _x240, _y240 = 0;			\
376										\
377    if (wfracbits < 106 || wfracbits > 120)					\
378      abort();									\
379										\
380    setfetz;									\
381										\
382    _e240 = (double)(long)(X##_f0 & 0xffffff);					\
383    _j240 = (double)(long)(Y##_f0 & 0xffffff);					\
384    _d240 = (double)(long)((X##_f0 >> 24) & 0xffffff);				\
385    _i240 = (double)(long)((Y##_f0 >> 24) & 0xffffff);				\
386    _c240 = (double)(long)(((X##_f1 << 16) & 0xffffff) | (X##_f0 >> 48));	\
387    _h240 = (double)(long)(((Y##_f1 << 16) & 0xffffff) | (Y##_f0 >> 48));	\
388    _b240 = (double)(long)((X##_f1 >> 8) & 0xffffff);				\
389    _g240 = (double)(long)((Y##_f1 >> 8) & 0xffffff);				\
390    _a240 = (double)(long)(X##_f1 >> 32);					\
391    _f240 = (double)(long)(Y##_f1 >> 32);					\
392    _e240 *= _const[3];								\
393    _j240 *= _const[3];								\
394    _d240 *= _const[2];								\
395    _i240 *= _const[2];								\
396    _c240 *= _const[1];								\
397    _h240 *= _const[1];								\
398    _b240 *= _const[0];								\
399    _g240 *= _const[0];								\
400    _s240.d =							      _e240*_j240;\
401    _r240.d =						_d240*_j240 + _e240*_i240;\
402    _q240.d =				  _c240*_j240 + _d240*_i240 + _e240*_h240;\
403    _p240.d =		    _b240*_j240 + _c240*_i240 + _d240*_h240 + _e240*_g240;\
404    _o240.d = _a240*_j240 + _b240*_i240 + _c240*_h240 + _d240*_g240 + _e240*_f240;\
405    _n240.d = _a240*_i240 + _b240*_h240 + _c240*_g240 + _d240*_f240;		\
406    _m240.d = _a240*_h240 + _b240*_g240 + _c240*_f240;				\
407    _l240.d = _a240*_g240 + _b240*_f240;					\
408    _k240 =   _a240*_f240;							\
409    _r240.d += _s240.d;								\
410    _q240.d += _r240.d;								\
411    _p240.d += _q240.d;								\
412    _o240.d += _p240.d;								\
413    _n240.d += _o240.d;								\
414    _m240.d += _n240.d;								\
415    _l240.d += _m240.d;								\
416    _k240 += _l240.d;								\
417    _s240.d -= ((_const[10]+_s240.d)-_const[10]);				\
418    _r240.d -= ((_const[9]+_r240.d)-_const[9]);					\
419    _q240.d -= ((_const[8]+_q240.d)-_const[8]);					\
420    _p240.d -= ((_const[7]+_p240.d)-_const[7]);					\
421    _o240.d += _const[7];							\
422    _n240.d += _const[6];							\
423    _m240.d += _const[5];							\
424    _l240.d += _const[4];							\
425    if (_s240.d != 0.0) _y240 = 1;						\
426    if (_r240.d != 0.0) _y240 = 1;						\
427    if (_q240.d != 0.0) _y240 = 1;						\
428    if (_p240.d != 0.0) _y240 = 1;						\
429    _t240 = (DItype)_k240;							\
430    _u240 = _l240.i;								\
431    _v240 = _m240.i;								\
432    _w240 = _n240.i;								\
433    _x240 = _o240.i;								\
434    R##_f1 = (_t240 << (128 - (wfracbits - 1)))					\
435	     | ((_u240 & 0xffffff) >> ((wfracbits - 1) - 104));			\
436    R##_f0 = ((_u240 & 0xffffff) << (168 - (wfracbits - 1)))			\
437    	     | ((_v240 & 0xffffff) << (144 - (wfracbits - 1)))			\
438    	     | ((_w240 & 0xffffff) << (120 - (wfracbits - 1)))			\
439    	     | ((_x240 & 0xffffff) >> ((wfracbits - 1) - 96))			\
440    	     | _y240;								\
441    resetfe;									\
442  } while (0)
443
444/*
445 * Division algorithms:
446 */
447
448#define _FP_DIV_MEAT_2_udiv(fs, R, X, Y)				\
449  do {									\
450    _FP_W_TYPE _n_f2, _n_f1, _n_f0, _r_f1, _r_f0, _m_f1, _m_f0;		\
451    if (_FP_FRAC_GT_2(X, Y))						\
452      {									\
453	_n_f2 = X##_f1 >> 1;						\
454	_n_f1 = X##_f1 << (_FP_W_TYPE_SIZE - 1) | X##_f0 >> 1;		\
455	_n_f0 = X##_f0 << (_FP_W_TYPE_SIZE - 1);			\
456      }									\
457    else								\
458      {									\
459	R##_e--;							\
460	_n_f2 = X##_f1;							\
461	_n_f1 = X##_f0;							\
462	_n_f0 = 0;							\
463      }									\
464									\
465    /* Normalize, i.e. make the most significant bit of the 		\
466       denominator set. */						\
467    _FP_FRAC_SLL_2(Y, _FP_WFRACXBITS_##fs);				\
468									\
469    udiv_qrnnd(R##_f1, _r_f1, _n_f2, _n_f1, Y##_f1);			\
470    umul_ppmm(_m_f1, _m_f0, R##_f1, Y##_f0);				\
471    _r_f0 = _n_f0;							\
472    if (_FP_FRAC_GT_2(_m, _r))						\
473      {									\
474	R##_f1--;							\
475	_FP_FRAC_ADD_2(_r, Y, _r);					\
476	if (_FP_FRAC_GE_2(_r, Y) && _FP_FRAC_GT_2(_m, _r))		\
477	  {								\
478	    R##_f1--;							\
479	    _FP_FRAC_ADD_2(_r, Y, _r);					\
480	  }								\
481      }									\
482    _FP_FRAC_DEC_2(_r, _m);						\
483									\
484    if (_r_f1 == Y##_f1)						\
485      {									\
486	/* This is a special case, not an optimization			\
487	   (_r/Y##_f1 would not fit into UWtype).			\
488	   As _r is guaranteed to be < Y,  R##_f0 can be either		\
489	   (UWtype)-1 or (UWtype)-2.  But as we know what kind		\
490	   of bits it is (sticky, guard, round),  we don't care.	\
491	   We also don't care what the reminder is,  because the	\
492	   guard bit will be set anyway.  -jj */			\
493	R##_f0 = -1;							\
494      }									\
495    else								\
496      {									\
497	udiv_qrnnd(R##_f0, _r_f1, _r_f1, _r_f0, Y##_f1);		\
498	umul_ppmm(_m_f1, _m_f0, R##_f0, Y##_f0);			\
499	_r_f0 = 0;							\
500	if (_FP_FRAC_GT_2(_m, _r))					\
501	  {								\
502	    R##_f0--;							\
503	    _FP_FRAC_ADD_2(_r, Y, _r);					\
504	    if (_FP_FRAC_GE_2(_r, Y) && _FP_FRAC_GT_2(_m, _r))		\
505	      {								\
506		R##_f0--;						\
507		_FP_FRAC_ADD_2(_r, Y, _r);				\
508	      }								\
509	  }								\
510	if (!_FP_FRAC_EQ_2(_r, _m))					\
511	  R##_f0 |= _FP_WORK_STICKY;					\
512      }									\
513  } while (0)
514
515
516#define _FP_DIV_MEAT_2_gmp(fs, R, X, Y)					\
517  do {									\
518    _FP_W_TYPE _x[4], _y[2], _z[4];					\
519    _y[0] = Y##_f0; _y[1] = Y##_f1;					\
520    _x[0] = _x[3] = 0;							\
521    if (_FP_FRAC_GT_2(X, Y))						\
522      {									\
523	R##_e++;							\
524	_x[1] = (X##_f0 << (_FP_WFRACBITS_##fs-1 - _FP_W_TYPE_SIZE) |	\
525		 X##_f1 >> (_FP_W_TYPE_SIZE -				\
526			    (_FP_WFRACBITS_##fs-1 - _FP_W_TYPE_SIZE)));	\
527	_x[2] = X##_f1 << (_FP_WFRACBITS_##fs-1 - _FP_W_TYPE_SIZE);	\
528      }									\
529    else								\
530      {									\
531	_x[1] = (X##_f0 << (_FP_WFRACBITS_##fs - _FP_W_TYPE_SIZE) |	\
532		 X##_f1 >> (_FP_W_TYPE_SIZE -				\
533			    (_FP_WFRACBITS_##fs - _FP_W_TYPE_SIZE)));	\
534	_x[2] = X##_f1 << (_FP_WFRACBITS_##fs - _FP_W_TYPE_SIZE);	\
535      }									\
536									\
537    (void) mpn_divrem (_z, 0, _x, 4, _y, 2);				\
538    R##_f1 = _z[1];							\
539    R##_f0 = _z[0] | ((_x[0] | _x[1]) != 0);				\
540  } while (0)
541
542
543/*
544 * Square root algorithms:
545 * We have just one right now, maybe Newton approximation
546 * should be added for those machines where division is fast.
547 */
548
549#define _FP_SQRT_MEAT_2(R, S, T, X, q)			\
550  do {							\
551    while (q)						\
552      {							\
553	T##_f1 = S##_f1 + q;				\
554	if (T##_f1 <= X##_f1)				\
555	  {						\
556	    S##_f1 = T##_f1 + q;			\
557	    X##_f1 -= T##_f1;				\
558	    R##_f1 += q;				\
559	  }						\
560	_FP_FRAC_SLL_2(X, 1);				\
561	q >>= 1;					\
562      }							\
563    q = (_FP_W_TYPE)1 << (_FP_W_TYPE_SIZE - 1);		\
564    while (q != _FP_WORK_ROUND)				\
565      {							\
566	T##_f0 = S##_f0 + q;				\
567	T##_f1 = S##_f1;				\
568	if (T##_f1 < X##_f1 || 				\
569	    (T##_f1 == X##_f1 && T##_f0 <= X##_f0))	\
570	  {						\
571	    S##_f0 = T##_f0 + q;			\
572	    S##_f1 += (T##_f0 > S##_f0);		\
573	    _FP_FRAC_DEC_2(X, T);			\
574	    R##_f0 += q;				\
575	  }						\
576	_FP_FRAC_SLL_2(X, 1);				\
577	q >>= 1;					\
578      }							\
579    if (X##_f0 | X##_f1)				\
580      {							\
581	if (S##_f1 < X##_f1 || 				\
582	    (S##_f1 == X##_f1 && S##_f0 < X##_f0))	\
583	  R##_f0 |= _FP_WORK_ROUND;			\
584	R##_f0 |= _FP_WORK_STICKY;			\
585      }							\
586  } while (0)
587
588
589/*
590 * Assembly/disassembly for converting to/from integral types.
591 * No shifting or overflow handled here.
592 */
593
594#define _FP_FRAC_ASSEMBLE_2(r, X, rsize)	\
595(void)((rsize <= _FP_W_TYPE_SIZE)		\
596       ? ({ r = X##_f0; })			\
597       : ({					\
598	    r = X##_f1;				\
599	    r <<= _FP_W_TYPE_SIZE;		\
600	    r += X##_f0;			\
601	  }))
602
603#define _FP_FRAC_DISASSEMBLE_2(X, r, rsize)				\
604  do {									\
605    X##_f0 = r;								\
606    X##_f1 = (rsize <= _FP_W_TYPE_SIZE ? 0 : r >> _FP_W_TYPE_SIZE);	\
607  } while (0)
608
609/*
610 * Convert FP values between word sizes
611 */
612
613#define _FP_FRAC_COPY_1_2(D, S)		(D##_f = S##_f0)
614
615#define _FP_FRAC_COPY_2_1(D, S)		((D##_f0 = S##_f), (D##_f1 = 0))
616
617#define _FP_FRAC_COPY_2_2(D,S)		_FP_FRAC_COPY_2(D,S)
618