1/* Copyright (C) 2007-2020 Free Software Foundation, Inc.
2
3This file is part of GCC.
4
5GCC is free software; you can redistribute it and/or modify it under
6the terms of the GNU General Public License as published by the Free
7Software Foundation; either version 3, or (at your option) any later
8version.
9
10GCC is distributed in the hope that it will be useful, but WITHOUT ANY
11WARRANTY; without even the implied warranty of MERCHANTABILITY or
12FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
13for more details.
14
15Under Section 7 of GPL version 3, you are granted additional
16permissions described in the GCC Runtime Library Exception, version
173.1, as published by the Free Software Foundation.
18
19You should have received a copy of the GNU General Public License and
20a copy of the GCC Runtime Library Exception along with this program;
21see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
22<http://www.gnu.org/licenses/>.  */
23
24#include "bid_internal.h"
25
26
#if DECIMAL_CALL_BY_REFERENCE
/* bid64dq_add: add a 64-bit BID decimal (x) to a 128-bit BID decimal (y),
   returning a 64-bit BID decimal.  The narrow operand is widened to the
   128-bit format and the work is delegated to bid64qq_add, which performs
   the single rounding to 64-bit precision.  */
void
bid64dq_add (UINT64 * pres, UINT64 * px, UINT128 * py
	     _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
	     _EXC_INFO_PARAM) {
  UINT64 x = *px;
#if !DECIMAL_GLOBAL_ROUNDING
  unsigned int rnd_mode = *prnd_mode;	/* local copy of the rounding mode */
#endif
#else
/* Call-by-value variant of the same operation.  */
UINT64
bid64dq_add (UINT64 x, UINT128 y
	     _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
	     _EXC_INFO_PARAM) {
#endif
  UINT64 res = 0xbaddbaddbaddbaddull;	/* canary; overwritten below */
  UINT128 x1;	/* x widened to the 128-bit format */

#if DECIMAL_CALL_BY_REFERENCE
  /* widen the 64-bit operand to 128 bits, then add in 128-bit form */
  bid64_to_bid128 (&x1, &x _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
  bid64qq_add (&res, &x1, py
	       _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
	       _EXC_INFO_ARG);
#else
  x1 = bid64_to_bid128 (x _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
  res = bid64qq_add (x1, y
		     _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
		     _EXC_INFO_ARG);
#endif
  BID_RETURN (res);
}
58
59
#if DECIMAL_CALL_BY_REFERENCE
/* bid64qd_add: add a 128-bit BID decimal (x) to a 64-bit BID decimal (y),
   returning a 64-bit BID decimal.  The 64-bit operand is widened to the
   128-bit format and the work is delegated to bid64qq_add.  */
void
bid64qd_add (UINT64 * pres, UINT128 * px, UINT64 * py
	     _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
	     _EXC_INFO_PARAM) {
  UINT64 y = *py;
#if !DECIMAL_GLOBAL_ROUNDING
  unsigned int rnd_mode = *prnd_mode;	/* local copy of the rounding mode */
#endif
#else
/* Call-by-value variant of the same operation.  */
UINT64
bid64qd_add (UINT128 x, UINT64 y
	     _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
	     _EXC_INFO_PARAM) {
#endif
  UINT64 res = 0xbaddbaddbaddbaddull;	/* canary; overwritten below */
  UINT128 y1;	/* y widened to the 128-bit format */

#if DECIMAL_CALL_BY_REFERENCE
  /* widen the 64-bit operand to 128 bits, then add in 128-bit form */
  bid64_to_bid128 (&y1, &y _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
  bid64qq_add (&res, px, &y1
	       _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
	       _EXC_INFO_ARG);
#else
  y1 = bid64_to_bid128 (y _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
  res = bid64qq_add (x, y1
		     _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
		     _EXC_INFO_ARG);
#endif
  BID_RETURN (res);
}
91
92
#if DECIMAL_CALL_BY_REFERENCE
/* bid64qq_add: add two 128-bit BID decimals and return a 64-bit BID
   decimal.  Implemented as fma (1, x, y) via bid64qqq_fma so that the
   fma routine performs the single final rounding to 64-bit precision.  */
void
bid64qq_add (UINT64 * pres, UINT128 * px, UINT128 * py
	     _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
	     _EXC_INFO_PARAM) {
  UINT128 x = *px, y = *py;
#if !DECIMAL_GLOBAL_ROUNDING
  unsigned int rnd_mode = *prnd_mode;	/* local copy of the rounding mode */
#endif
#else
/* Call-by-value variant of the same operation.  */
UINT64
bid64qq_add (UINT128 x, UINT128 y
	     _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
	     _EXC_INFO_PARAM) {
#endif

  /* BID128 constant +1: coefficient 1, biased exponent 0x1820 = 6176
     (the bias), i.e. 1 * 10^0; used as the fma multiplicand.  */
  UINT128 one = { {0x0000000000000001ull, 0x3040000000000000ull}
  };
  UINT64 res = 0xbaddbaddbaddbaddull;	/* canary; overwritten by the fma */

  BID_SWAP128 (one);	/* put the literal in the platform's word order */
#if DECIMAL_CALL_BY_REFERENCE
  /* res = 1 * x + y, rounded once to 64-bit decimal */
  bid64qqq_fma (&res, &one, &x, &y
		_RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
		_EXC_INFO_ARG);
#else
  res = bid64qqq_fma (one, x, y
		      _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
		      _EXC_INFO_ARG);
#endif
  BID_RETURN (res);
}
125
126
#if DECIMAL_CALL_BY_REFERENCE
/* bid128dd_add: add two 64-bit BID decimals and return a 128-bit BID
   decimal.  Both operands are widened to the 128-bit format and the
   work is delegated to bid128_add.  */
void
bid128dd_add (UINT128 * pres, UINT64 * px, UINT64 * py
	      _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
	      _EXC_INFO_PARAM) {
  UINT64 x = *px, y = *py;
#if !DECIMAL_GLOBAL_ROUNDING
  unsigned int rnd_mode = *prnd_mode;	/* local copy of the rounding mode */
#endif
#else
/* Call-by-value variant of the same operation.  */
UINT128
bid128dd_add (UINT64 x, UINT64 y
	      _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
	      _EXC_INFO_PARAM) {
#endif
  UINT128 res = { {0xbaddbaddbaddbaddull, 0xbaddbaddbaddbaddull}
  };	/* canary; overwritten below */
  UINT128 x1, y1;	/* operands widened to the 128-bit format */

#if DECIMAL_CALL_BY_REFERENCE
  /* widen both 64-bit operands, then add in 128-bit form */
  bid64_to_bid128 (&x1, &x _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
  bid64_to_bid128 (&y1, &y _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
  bid128_add (&res, &x1, &y1
	      _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
	      _EXC_INFO_ARG);
#else
  x1 = bid64_to_bid128 (x _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
  y1 = bid64_to_bid128 (y _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
  res = bid128_add (x1, y1
		    _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
		    _EXC_INFO_ARG);
#endif
  BID_RETURN (res);
}
161
162
#if DECIMAL_CALL_BY_REFERENCE
/* bid128dq_add: add a 64-bit BID decimal (x) to a 128-bit BID decimal (y),
   returning a 128-bit BID decimal.  The 64-bit operand is widened to the
   128-bit format and the work is delegated to bid128_add.  */
void
bid128dq_add (UINT128 * pres, UINT64 * px, UINT128 * py
	      _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
	      _EXC_INFO_PARAM) {
  UINT64 x = *px;
#if !DECIMAL_GLOBAL_ROUNDING
  unsigned int rnd_mode = *prnd_mode;	/* local copy of the rounding mode */
#endif
#else
/* Call-by-value variant of the same operation.  */
UINT128
bid128dq_add (UINT64 x, UINT128 y
	      _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
	      _EXC_INFO_PARAM) {
#endif
  UINT128 res = { {0xbaddbaddbaddbaddull, 0xbaddbaddbaddbaddull}
  };	/* canary; overwritten below */
  UINT128 x1;	/* x widened to the 128-bit format */

#if DECIMAL_CALL_BY_REFERENCE
  /* widen the 64-bit operand, then add in 128-bit form */
  bid64_to_bid128 (&x1, &x _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
  bid128_add (&res, &x1, py
	      _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
	      _EXC_INFO_ARG);
#else
  x1 = bid64_to_bid128 (x _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
  res = bid128_add (x1, y
		    _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
		    _EXC_INFO_ARG);
#endif
  BID_RETURN (res);
}
195
196
#if DECIMAL_CALL_BY_REFERENCE
/* bid128qd_add: add a 128-bit BID decimal (x) to a 64-bit BID decimal (y),
   returning a 128-bit BID decimal.  The 64-bit operand is widened to the
   128-bit format and the work is delegated to bid128_add.  */
void
bid128qd_add (UINT128 * pres, UINT128 * px, UINT64 * py
	      _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
	      _EXC_INFO_PARAM) {
  UINT64 y = *py;
#if !DECIMAL_GLOBAL_ROUNDING
  unsigned int rnd_mode = *prnd_mode;	/* local copy of the rounding mode */
#endif
#else
/* Call-by-value variant of the same operation.  */
UINT128
bid128qd_add (UINT128 x, UINT64 y
	      _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
	      _EXC_INFO_PARAM) {
#endif
  UINT128 res = { {0xbaddbaddbaddbaddull, 0xbaddbaddbaddbaddull}
  };	/* canary; overwritten below */
  UINT128 y1;	/* y widened to the 128-bit format */

#if DECIMAL_CALL_BY_REFERENCE
  /* widen the 64-bit operand, then add in 128-bit form */
  bid64_to_bid128 (&y1, &y _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
  bid128_add (&res, px, &y1
	      _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
	      _EXC_INFO_ARG);
#else
  y1 = bid64_to_bid128 (y _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
  res = bid128_add (x, y1
		    _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
		    _EXC_INFO_ARG);
#endif
  BID_RETURN (res);
}
229
230
231// bid128_add stands for bid128qq_add
232
233
234/*****************************************************************************
235 *  BID64/BID128 sub
236 ****************************************************************************/
237
#if DECIMAL_CALL_BY_REFERENCE
/* bid64dq_sub: subtract a 128-bit BID decimal (y) from a 64-bit BID
   decimal (x), returning a 64-bit BID decimal.  The narrow operand is
   widened to the 128-bit format and the work is delegated to
   bid64qq_sub.  */
void
bid64dq_sub (UINT64 * pres, UINT64 * px, UINT128 * py
	     _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
	     _EXC_INFO_PARAM) {
  UINT64 x = *px;
#if !DECIMAL_GLOBAL_ROUNDING
  unsigned int rnd_mode = *prnd_mode;	/* local copy of the rounding mode */
#endif
#else
/* Call-by-value variant of the same operation.  */
UINT64
bid64dq_sub (UINT64 x, UINT128 y
	     _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
	     _EXC_INFO_PARAM) {
#endif
  UINT64 res = 0xbaddbaddbaddbaddull;	/* canary; overwritten below */
  UINT128 x1;	/* x widened to the 128-bit format */

#if DECIMAL_CALL_BY_REFERENCE
  /* widen the 64-bit operand to 128 bits, then subtract in 128-bit form */
  bid64_to_bid128 (&x1, &x _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
  bid64qq_sub (&res, &x1, py
	       _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
	       _EXC_INFO_ARG);
#else
  x1 = bid64_to_bid128 (x _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
  res = bid64qq_sub (x1, y
		     _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
		     _EXC_INFO_ARG);
#endif
  BID_RETURN (res);
}
269
270
#if DECIMAL_CALL_BY_REFERENCE
/* bid64qd_sub: subtract a 64-bit BID decimal (y) from a 128-bit BID
   decimal (x), returning a 64-bit BID decimal.  The 64-bit operand is
   widened to the 128-bit format and the work is delegated to
   bid64qq_sub.  */
void
bid64qd_sub (UINT64 * pres, UINT128 * px, UINT64 * py
	     _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
	     _EXC_INFO_PARAM) {
  UINT64 y = *py;
#if !DECIMAL_GLOBAL_ROUNDING
  unsigned int rnd_mode = *prnd_mode;	/* local copy of the rounding mode */
#endif
#else
/* Call-by-value variant of the same operation.  */
UINT64
bid64qd_sub (UINT128 x, UINT64 y
	     _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
	     _EXC_INFO_PARAM) {
#endif
  UINT64 res = 0xbaddbaddbaddbaddull;	/* canary; overwritten below */
  UINT128 y1;	/* y widened to the 128-bit format */

#if DECIMAL_CALL_BY_REFERENCE
  /* widen the 64-bit operand to 128 bits, then subtract in 128-bit form */
  bid64_to_bid128 (&y1, &y _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
  bid64qq_sub (&res, px, &y1
	       _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
	       _EXC_INFO_ARG);
#else
  y1 = bid64_to_bid128 (y _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
  res = bid64qq_sub (x, y1
		     _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
		     _EXC_INFO_ARG);
#endif
  BID_RETURN (res);
}
302
303
#if DECIMAL_CALL_BY_REFERENCE
/* bid64qq_sub: subtract two 128-bit BID decimals, returning a 64-bit BID
   decimal.  Implemented by flipping the sign of y (unless y is NaN) and
   computing fma (1, x, -y) via bid64qqq_fma, which performs the single
   final rounding to 64-bit precision.  */
void
bid64qq_sub (UINT64 * pres, UINT128 * px, UINT128 * py
	     _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
	     _EXC_INFO_PARAM) {
  UINT128 x = *px, y = *py;
#if !DECIMAL_GLOBAL_ROUNDING
  unsigned int rnd_mode = *prnd_mode;	/* local copy of the rounding mode */
#endif
#else
/* Call-by-value variant of the same operation.  */
UINT64
bid64qq_sub (UINT128 x, UINT128 y
	     _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
	     _EXC_INFO_PARAM) {
#endif

  /* BID128 constant +1: coefficient 1, biased exponent 0x1820 = 6176
     (the bias), i.e. 1 * 10^0; used as the fma multiplicand.  */
  UINT128 one = { {0x0000000000000001ull, 0x3040000000000000ull}
  };
  UINT64 res = 0xbaddbaddbaddbaddull;	/* canary; overwritten by the fma */
  UINT64 y_sign;

  BID_SWAP128 (one);	/* put the literal in the platform's word order */
  /* Negate y so the subtraction becomes an addition; NaNs are left
     untouched so the fma sees the original NaN operand.  */
  if ((y.w[HIGH_128W] & MASK_NAN) != MASK_NAN) {	// y is not NAN
    // change its sign
    y_sign = y.w[HIGH_128W] & MASK_SIGN;	// 0 for positive, MASK_SIGN for negative
    if (y_sign)
      y.w[HIGH_128W] = y.w[HIGH_128W] & 0x7fffffffffffffffull;	// clear the sign bit
    else
      y.w[HIGH_128W] = y.w[HIGH_128W] | 0x8000000000000000ull;	// set the sign bit
  }
#if DECIMAL_CALL_BY_REFERENCE
  /* res = 1 * x + (-y), rounded once to 64-bit decimal */
  bid64qqq_fma (&res, &one, &x, &y
		_RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
		_EXC_INFO_ARG);
#else
  res = bid64qqq_fma (one, x, y
		      _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
		      _EXC_INFO_ARG);
#endif
  BID_RETURN (res);
}
345
346
#if DECIMAL_CALL_BY_REFERENCE
/* bid128dd_sub: subtract two 64-bit BID decimals, returning a 128-bit BID
   decimal.  Both operands are widened to the 128-bit format and the work
   is delegated to bid128_sub.  */
void
bid128dd_sub (UINT128 * pres, UINT64 * px, UINT64 * py
	      _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
	      _EXC_INFO_PARAM) {
  UINT64 x = *px, y = *py;
#if !DECIMAL_GLOBAL_ROUNDING
  unsigned int rnd_mode = *prnd_mode;	/* local copy of the rounding mode */
#endif
#else
/* Call-by-value variant of the same operation.  */
UINT128
bid128dd_sub (UINT64 x, UINT64 y
	      _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
	      _EXC_INFO_PARAM) {
#endif
  UINT128 res = { {0xbaddbaddbaddbaddull, 0xbaddbaddbaddbaddull}
  };	/* canary; overwritten below */
  UINT128 x1, y1;	/* operands widened to the 128-bit format */

#if DECIMAL_CALL_BY_REFERENCE
  /* widen both 64-bit operands, then subtract in 128-bit form */
  bid64_to_bid128 (&x1, &x _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
  bid64_to_bid128 (&y1, &y _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
  bid128_sub (&res, &x1, &y1
	      _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
	      _EXC_INFO_ARG);
#else
  x1 = bid64_to_bid128 (x _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
  y1 = bid64_to_bid128 (y _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
  res = bid128_sub (x1, y1
		    _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
		    _EXC_INFO_ARG);
#endif
  BID_RETURN (res);
}
381
382
#if DECIMAL_CALL_BY_REFERENCE
/* bid128dq_sub: subtract a 128-bit BID decimal (y) from a 64-bit BID
   decimal (x), returning a 128-bit BID decimal.  The 64-bit operand is
   widened to the 128-bit format and the work is delegated to
   bid128_sub.  */
void
bid128dq_sub (UINT128 * pres, UINT64 * px, UINT128 * py
	      _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
	      _EXC_INFO_PARAM) {
  UINT64 x = *px;
#if !DECIMAL_GLOBAL_ROUNDING
  unsigned int rnd_mode = *prnd_mode;	/* local copy of the rounding mode */
#endif
#else
/* Call-by-value variant of the same operation.  */
UINT128
bid128dq_sub (UINT64 x, UINT128 y
	      _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
	      _EXC_INFO_PARAM) {
#endif
  UINT128 res = { {0xbaddbaddbaddbaddull, 0xbaddbaddbaddbaddull}
  };	/* canary; overwritten below */
  UINT128 x1;	/* x widened to the 128-bit format */

#if DECIMAL_CALL_BY_REFERENCE
  /* widen the 64-bit operand, then subtract in 128-bit form */
  bid64_to_bid128 (&x1, &x _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
  bid128_sub (&res, &x1, py
	      _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
	      _EXC_INFO_ARG);
#else
  x1 = bid64_to_bid128 (x _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
  res = bid128_sub (x1, y
		    _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
		    _EXC_INFO_ARG);
#endif
  BID_RETURN (res);
}
415
416
#if DECIMAL_CALL_BY_REFERENCE
/* bid128qd_sub: subtract a 64-bit BID decimal (y) from a 128-bit BID
   decimal (x), returning a 128-bit BID decimal.  The 64-bit operand is
   widened to the 128-bit format and the work is delegated to
   bid128_sub.  */
void
bid128qd_sub (UINT128 * pres, UINT128 * px, UINT64 * py
	      _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
	      _EXC_INFO_PARAM) {
  UINT64 y = *py;
#if !DECIMAL_GLOBAL_ROUNDING
  unsigned int rnd_mode = *prnd_mode;	/* local copy of the rounding mode */
#endif
#else
/* Call-by-value variant of the same operation.  */
UINT128
bid128qd_sub (UINT128 x, UINT64 y
	      _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
	      _EXC_INFO_PARAM) {
#endif
  UINT128 res = { {0xbaddbaddbaddbaddull, 0xbaddbaddbaddbaddull}
  };	/* canary; overwritten below */
  UINT128 y1;	/* y widened to the 128-bit format */

#if DECIMAL_CALL_BY_REFERENCE
  /* widen the 64-bit operand, then subtract in 128-bit form */
  bid64_to_bid128 (&y1, &y _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
  bid128_sub (&res, px, &y1
	      _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
	      _EXC_INFO_ARG);
#else
  y1 = bid64_to_bid128 (y _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
  res = bid128_sub (x, y1
		    _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
		    _EXC_INFO_ARG);
#endif
  BID_RETURN (res);
}
449
450#if DECIMAL_CALL_BY_REFERENCE
451void
452bid128_add (UINT128 * pres, UINT128 * px, UINT128 * py
453	    _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
454	    _EXC_INFO_PARAM) {
455  UINT128 x = *px, y = *py;
456#if !DECIMAL_GLOBAL_ROUNDING
457  unsigned int rnd_mode = *prnd_mode;
458#endif
459#else
460UINT128
461bid128_add (UINT128 x, UINT128 y
462	    _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
463	    _EXC_INFO_PARAM) {
464#endif
465  UINT128 res = { {0xbaddbaddbaddbaddull, 0xbaddbaddbaddbaddull}
466  };
467  UINT64 x_sign, y_sign, tmp_sign;
468  UINT64 x_exp, y_exp, tmp_exp;	// e1 = x_exp, e2 = y_exp
469  UINT64 C1_hi, C2_hi, tmp_signif_hi;
470  UINT64 C1_lo, C2_lo, tmp_signif_lo;
471  // Note: C1.w[1], C1.w[0] represent C1_hi, C1_lo (all UINT64)
472  // Note: C2.w[1], C2.w[0] represent C2_hi, C2_lo (all UINT64)
473  UINT64 tmp64, tmp64A, tmp64B;
474  BID_UI64DOUBLE tmp1, tmp2;
475  int x_nr_bits, y_nr_bits;
476  int q1, q2, delta, scale, x1, ind, shift, tmp_inexact = 0;
477  UINT64 halfulp64;
478  UINT128 halfulp128;
479  UINT128 C1, C2;
480  UINT128 ten2m1;
481  UINT128 highf2star;		// top 128 bits in f2*; low 128 bits in R256[1], R256[0]
482  UINT256 P256, Q256, R256;
483  int is_inexact = 0, is_midpoint_lt_even = 0, is_midpoint_gt_even = 0;
484  int is_inexact_lt_midpoint = 0, is_inexact_gt_midpoint = 0;
485  int second_pass = 0;
486
487  BID_SWAP128 (x);
488  BID_SWAP128 (y);
489  x_sign = x.w[1] & MASK_SIGN;	// 0 for positive, MASK_SIGN for negative
490  y_sign = y.w[1] & MASK_SIGN;	// 0 for positive, MASK_SIGN for negative
491
492  // check for NaN or Infinity
493  if (((x.w[1] & MASK_SPECIAL) == MASK_SPECIAL)
494      || ((y.w[1] & MASK_SPECIAL) == MASK_SPECIAL)) {
495    // x is special or y is special
496    if ((x.w[1] & MASK_NAN) == MASK_NAN) {	// x is NAN
497      // check first for non-canonical NaN payload
498      if (((x.w[1] & 0x00003fffffffffffull) > 0x0000314dc6448d93ull) ||
499	  (((x.w[1] & 0x00003fffffffffffull) == 0x0000314dc6448d93ull)
500	   && (x.w[0] > 0x38c15b09ffffffffull))) {
501	x.w[1] = x.w[1] & 0xffffc00000000000ull;
502	x.w[0] = 0x0ull;
503      }
504      if ((x.w[1] & MASK_SNAN) == MASK_SNAN) {	// x is SNAN
505	// set invalid flag
506	*pfpsf |= INVALID_EXCEPTION;
507	// return quiet (x)
508	res.w[1] = x.w[1] & 0xfc003fffffffffffull;
509	// clear out also G[6]-G[16]
510	res.w[0] = x.w[0];
511      } else {	// x is QNaN
512	// return x
513	res.w[1] = x.w[1] & 0xfc003fffffffffffull;
514	// clear out G[6]-G[16]
515	res.w[0] = x.w[0];
516	// if y = SNaN signal invalid exception
517	if ((y.w[1] & MASK_SNAN) == MASK_SNAN) {
518	  // set invalid flag
519	  *pfpsf |= INVALID_EXCEPTION;
520	}
521      }
522      BID_SWAP128 (res);
523      BID_RETURN (res);
524    } else if ((y.w[1] & MASK_NAN) == MASK_NAN) {	// y is NAN
525      // check first for non-canonical NaN payload
526      if (((y.w[1] & 0x00003fffffffffffull) > 0x0000314dc6448d93ull) ||
527	  (((y.w[1] & 0x00003fffffffffffull) == 0x0000314dc6448d93ull)
528	   && (y.w[0] > 0x38c15b09ffffffffull))) {
529	y.w[1] = y.w[1] & 0xffffc00000000000ull;
530	y.w[0] = 0x0ull;
531      }
532      if ((y.w[1] & MASK_SNAN) == MASK_SNAN) {	// y is SNAN
533	// set invalid flag
534	*pfpsf |= INVALID_EXCEPTION;
535	// return quiet (y)
536	res.w[1] = y.w[1] & 0xfc003fffffffffffull;
537	// clear out also G[6]-G[16]
538	res.w[0] = y.w[0];
539      } else {	// y is QNaN
540	// return y
541	res.w[1] = y.w[1] & 0xfc003fffffffffffull;
542	// clear out G[6]-G[16]
543	res.w[0] = y.w[0];
544      }
545      BID_SWAP128 (res);
546      BID_RETURN (res);
547    } else {	// neither x not y is NaN; at least one is infinity
548      if ((x.w[1] & MASK_ANY_INF) == MASK_INF) {	// x is infinity
549	if ((y.w[1] & MASK_ANY_INF) == MASK_INF) {	// y is infinity
550	  // if same sign, return either of them
551	  if ((x.w[1] & MASK_SIGN) == (y.w[1] & MASK_SIGN)) {
552	    res.w[1] = x_sign | MASK_INF;
553	    res.w[0] = 0x0ull;
554	  } else {	// x and y are infinities of opposite signs
555	    // set invalid flag
556	    *pfpsf |= INVALID_EXCEPTION;
557	    // return QNaN Indefinite
558	    res.w[1] = 0x7c00000000000000ull;
559	    res.w[0] = 0x0000000000000000ull;
560	  }
561	} else {	// y is 0 or finite
562	  // return x
563	  res.w[1] = x_sign | MASK_INF;
564	  res.w[0] = 0x0ull;
565	}
566      } else {	// x is not NaN or infinity, so y must be infinity
567	res.w[1] = y_sign | MASK_INF;
568	res.w[0] = 0x0ull;
569      }
570      BID_SWAP128 (res);
571      BID_RETURN (res);
572    }
573  }
574  // unpack the arguments
575
576  // unpack x
577  C1_hi = x.w[1] & MASK_COEFF;
578  C1_lo = x.w[0];
579  // test for non-canonical values:
580  // - values whose encoding begins with x00, x01, or x10 and whose
581  //   coefficient is larger than 10^34 -1, or
582  // - values whose encoding begins with x1100, x1101, x1110 (if NaNs
583  //   and infinitis were eliminated already this test is reduced to
584  //   checking for x10x)
585
586  // x is not infinity; check for non-canonical values - treated as zero
587  if ((x.w[1] & 0x6000000000000000ull) == 0x6000000000000000ull) {
588    // G0_G1=11; non-canonical
589    x_exp = (x.w[1] << 2) & MASK_EXP;	// biased and shifted left 49 bits
590    C1_hi = 0;	// significand high
591    C1_lo = 0;	// significand low
592  } else {	// G0_G1 != 11
593    x_exp = x.w[1] & MASK_EXP;	// biased and shifted left 49 bits
594    if (C1_hi > 0x0001ed09bead87c0ull ||
595	(C1_hi == 0x0001ed09bead87c0ull
596	 && C1_lo > 0x378d8e63ffffffffull)) {
597      // x is non-canonical if coefficient is larger than 10^34 -1
598      C1_hi = 0;
599      C1_lo = 0;
600    } else {	// canonical
601      ;
602    }
603  }
604
605  // unpack y
606  C2_hi = y.w[1] & MASK_COEFF;
607  C2_lo = y.w[0];
608  // y is not infinity; check for non-canonical values - treated as zero
609  if ((y.w[1] & 0x6000000000000000ull) == 0x6000000000000000ull) {
610    // G0_G1=11; non-canonical
611    y_exp = (y.w[1] << 2) & MASK_EXP;	// biased and shifted left 49 bits
612    C2_hi = 0;	// significand high
613    C2_lo = 0;	// significand low
614  } else {	// G0_G1 != 11
615    y_exp = y.w[1] & MASK_EXP;	// biased and shifted left 49 bits
616    if (C2_hi > 0x0001ed09bead87c0ull ||
617	(C2_hi == 0x0001ed09bead87c0ull
618	 && C2_lo > 0x378d8e63ffffffffull)) {
619      // y is non-canonical if coefficient is larger than 10^34 -1
620      C2_hi = 0;
621      C2_lo = 0;
622    } else {	// canonical
623      ;
624    }
625  }
626
627  if ((C1_hi == 0x0ull) && (C1_lo == 0x0ull)) {
628    // x is 0 and y is not special
629    // if y is 0 return 0 with the smaller exponent
630    if ((C2_hi == 0x0ull) && (C2_lo == 0x0ull)) {
631      if (x_exp < y_exp)
632	res.w[1] = x_exp;
633      else
634	res.w[1] = y_exp;
635      if (x_sign && y_sign)
636	res.w[1] = res.w[1] | x_sign;	// both negative
637      else if (rnd_mode == ROUNDING_DOWN && x_sign != y_sign)
638	res.w[1] = res.w[1] | 0x8000000000000000ull;	// -0
639      // else; // res = +0
640      res.w[0] = 0;
641    } else {
642      // for 0 + y return y, with the preferred exponent
643      if (y_exp <= x_exp) {
644	res.w[1] = y.w[1];
645	res.w[0] = y.w[0];
646      } else {	// if y_exp > x_exp
647	// return (C2 * 10^scale) * 10^(y_exp - scale)
648	// where scale = min (P34-q2, y_exp-x_exp)
649	// determine q2 = nr. of decimal digits in y
650	//  determine first the nr. of bits in y (y_nr_bits)
651
652	if (C2_hi == 0) {	// y_bits is the nr. of bits in C2_lo
653	  if (C2_lo >= 0x0020000000000000ull) {	// y >= 2^53
654	    // split the 64-bit value in two 32-bit halves to avoid
655	    // rounding errors
656	    if (C2_lo >= 0x0000000100000000ull) {	// y >= 2^32
657	      tmp2.d = (double) (C2_lo >> 32);	// exact conversion
658	      y_nr_bits =
659		32 +
660		((((unsigned int) (tmp2.ui64 >> 52)) & 0x7ff) - 0x3ff);
661	    } else {	// y < 2^32
662	      tmp2.d = (double) (C2_lo);	// exact conversion
663	      y_nr_bits =
664		((((unsigned int) (tmp2.ui64 >> 52)) & 0x7ff) - 0x3ff);
665	    }
666	  } else {	// if y < 2^53
667	    tmp2.d = (double) C2_lo;	// exact conversion
668	    y_nr_bits =
669	      ((((unsigned int) (tmp2.ui64 >> 52)) & 0x7ff) - 0x3ff);
670	  }
671	} else {	// C2_hi != 0 => nr. bits = 64 + nr_bits (C2_hi)
672	  tmp2.d = (double) C2_hi;	// exact conversion
673	  y_nr_bits =
674	    64 + ((((unsigned int) (tmp2.ui64 >> 52)) & 0x7ff) - 0x3ff);
675	}
676	q2 = nr_digits[y_nr_bits].digits;
677	if (q2 == 0) {
678	  q2 = nr_digits[y_nr_bits].digits1;
679	  if (C2_hi > nr_digits[y_nr_bits].threshold_hi ||
680	      (C2_hi == nr_digits[y_nr_bits].threshold_hi &&
681	       C2_lo >= nr_digits[y_nr_bits].threshold_lo))
682	    q2++;
683	}
684	// return (C2 * 10^scale) * 10^(y_exp - scale)
685	// where scale = min (P34-q2, y_exp-x_exp)
686	scale = P34 - q2;
687	ind = (y_exp - x_exp) >> 49;
688	if (ind < scale)
689	  scale = ind;
690	if (scale == 0) {
691	  res.w[1] = y.w[1];
692	  res.w[0] = y.w[0];
693	} else if (q2 <= 19) {	// y fits in 64 bits
694	  if (scale <= 19) {	// 10^scale fits in 64 bits
695	    // 64 x 64 C2_lo * ten2k64[scale]
696	    __mul_64x64_to_128MACH (res, C2_lo, ten2k64[scale]);
697	  } else {	// 10^scale fits in 128 bits
698	    // 64 x 128 C2_lo * ten2k128[scale - 20]
699	    __mul_128x64_to_128 (res, C2_lo, ten2k128[scale - 20]);
700	  }
701	} else {	// y fits in 128 bits, but 10^scale must fit in 64 bits
702	  // 64 x 128 ten2k64[scale] * C2
703	  C2.w[1] = C2_hi;
704	  C2.w[0] = C2_lo;
705	  __mul_128x64_to_128 (res, ten2k64[scale], C2);
706	}
707	// subtract scale from the exponent
708	y_exp = y_exp - ((UINT64) scale << 49);
709	res.w[1] = res.w[1] | y_sign | y_exp;
710      }
711    }
712    BID_SWAP128 (res);
713    BID_RETURN (res);
714  } else if ((C2_hi == 0x0ull) && (C2_lo == 0x0ull)) {
715    // y is 0 and x is not special, and not zero
716    // for x + 0 return x, with the preferred exponent
717    if (x_exp <= y_exp) {
718      res.w[1] = x.w[1];
719      res.w[0] = x.w[0];
720    } else {	// if x_exp > y_exp
721      // return (C1 * 10^scale) * 10^(x_exp - scale)
722      // where scale = min (P34-q1, x_exp-y_exp)
723      // determine q1 = nr. of decimal digits in x
724      //  determine first the nr. of bits in x
725      if (C1_hi == 0) {	// x_bits is the nr. of bits in C1_lo
726	if (C1_lo >= 0x0020000000000000ull) {	// x >= 2^53
727	  // split the 64-bit value in two 32-bit halves to avoid
728	  // rounding errors
729	  if (C1_lo >= 0x0000000100000000ull) {	// x >= 2^32
730	    tmp1.d = (double) (C1_lo >> 32);	// exact conversion
731	    x_nr_bits =
732	      32 + ((((unsigned int) (tmp1.ui64 >> 52)) & 0x7ff) -
733		    0x3ff);
734	  } else {	// x < 2^32
735	    tmp1.d = (double) (C1_lo);	// exact conversion
736	    x_nr_bits =
737	      ((((unsigned int) (tmp1.ui64 >> 52)) & 0x7ff) - 0x3ff);
738	  }
739	} else {	// if x < 2^53
740	  tmp1.d = (double) C1_lo;	// exact conversion
741	  x_nr_bits =
742	    ((((unsigned int) (tmp1.ui64 >> 52)) & 0x7ff) - 0x3ff);
743	}
744      } else {	// C1_hi != 0 => nr. bits = 64 + nr_bits (C1_hi)
745	tmp1.d = (double) C1_hi;	// exact conversion
746	x_nr_bits =
747	  64 + ((((unsigned int) (tmp1.ui64 >> 52)) & 0x7ff) - 0x3ff);
748      }
749      q1 = nr_digits[x_nr_bits].digits;
750      if (q1 == 0) {
751	q1 = nr_digits[x_nr_bits].digits1;
752	if (C1_hi > nr_digits[x_nr_bits].threshold_hi ||
753	    (C1_hi == nr_digits[x_nr_bits].threshold_hi &&
754	     C1_lo >= nr_digits[x_nr_bits].threshold_lo))
755	  q1++;
756      }
757      // return (C1 * 10^scale) * 10^(x_exp - scale)
758      // where scale = min (P34-q1, x_exp-y_exp)
759      scale = P34 - q1;
760      ind = (x_exp - y_exp) >> 49;
761      if (ind < scale)
762	scale = ind;
763      if (scale == 0) {
764	res.w[1] = x.w[1];
765	res.w[0] = x.w[0];
766      } else if (q1 <= 19) {	// x fits in 64 bits
767	if (scale <= 19) {	// 10^scale fits in 64 bits
768	  // 64 x 64 C1_lo * ten2k64[scale]
769	  __mul_64x64_to_128MACH (res, C1_lo, ten2k64[scale]);
770	} else {	// 10^scale fits in 128 bits
771	  // 64 x 128 C1_lo * ten2k128[scale - 20]
772	  __mul_128x64_to_128 (res, C1_lo, ten2k128[scale - 20]);
773	}
774      } else {	// x fits in 128 bits, but 10^scale must fit in 64 bits
775	// 64 x 128 ten2k64[scale] * C1
776	C1.w[1] = C1_hi;
777	C1.w[0] = C1_lo;
778	__mul_128x64_to_128 (res, ten2k64[scale], C1);
779      }
780      // subtract scale from the exponent
781      x_exp = x_exp - ((UINT64) scale << 49);
782      res.w[1] = res.w[1] | x_sign | x_exp;
783    }
784    BID_SWAP128 (res);
785    BID_RETURN (res);
786  } else {	// x and y are not canonical, not special, and are not zero
787    // note that the result may still be zero, and then it has to have the
788    // preferred exponent
789    if (x_exp < y_exp) {	// if exp_x < exp_y then swap x and y
790      tmp_sign = x_sign;
791      tmp_exp = x_exp;
792      tmp_signif_hi = C1_hi;
793      tmp_signif_lo = C1_lo;
794      x_sign = y_sign;
795      x_exp = y_exp;
796      C1_hi = C2_hi;
797      C1_lo = C2_lo;
798      y_sign = tmp_sign;
799      y_exp = tmp_exp;
800      C2_hi = tmp_signif_hi;
801      C2_lo = tmp_signif_lo;
802    }
803    // q1 = nr. of decimal digits in x
804    //  determine first the nr. of bits in x
805    if (C1_hi == 0) {	// x_bits is the nr. of bits in C1_lo
806      if (C1_lo >= 0x0020000000000000ull) {	// x >= 2^53
807	//split the 64-bit value in two 32-bit halves to avoid rounding errors
808	if (C1_lo >= 0x0000000100000000ull) {	// x >= 2^32
809	  tmp1.d = (double) (C1_lo >> 32);	// exact conversion
810	  x_nr_bits =
811	    32 + ((((unsigned int) (tmp1.ui64 >> 52)) & 0x7ff) - 0x3ff);
812	} else {	// x < 2^32
813	  tmp1.d = (double) (C1_lo);	// exact conversion
814	  x_nr_bits =
815	    ((((unsigned int) (tmp1.ui64 >> 52)) & 0x7ff) - 0x3ff);
816	}
817      } else {	// if x < 2^53
818	tmp1.d = (double) C1_lo;	// exact conversion
819	x_nr_bits =
820	  ((((unsigned int) (tmp1.ui64 >> 52)) & 0x7ff) - 0x3ff);
821      }
822    } else {	// C1_hi != 0 => nr. bits = 64 + nr_bits (C1_hi)
823      tmp1.d = (double) C1_hi;	// exact conversion
824      x_nr_bits =
825	64 + ((((unsigned int) (tmp1.ui64 >> 52)) & 0x7ff) - 0x3ff);
826    }
827
828    q1 = nr_digits[x_nr_bits].digits;
829    if (q1 == 0) {
830      q1 = nr_digits[x_nr_bits].digits1;
831      if (C1_hi > nr_digits[x_nr_bits].threshold_hi ||
832	  (C1_hi == nr_digits[x_nr_bits].threshold_hi &&
833	   C1_lo >= nr_digits[x_nr_bits].threshold_lo))
834	q1++;
835    }
836    // q2 = nr. of decimal digits in y
837    //  determine first the nr. of bits in y (y_nr_bits)
838    if (C2_hi == 0) {	// y_bits is the nr. of bits in C2_lo
839      if (C2_lo >= 0x0020000000000000ull) {	// y >= 2^53
840	//split the 64-bit value in two 32-bit halves to avoid rounding errors
841	if (C2_lo >= 0x0000000100000000ull) {	// y >= 2^32
842	  tmp2.d = (double) (C2_lo >> 32);	// exact conversion
843	  y_nr_bits =
844	    32 + ((((unsigned int) (tmp2.ui64 >> 52)) & 0x7ff) - 0x3ff);
845	} else {	// y < 2^32
846	  tmp2.d = (double) (C2_lo);	// exact conversion
847	  y_nr_bits =
848	    ((((unsigned int) (tmp2.ui64 >> 52)) & 0x7ff) - 0x3ff);
849	}
850      } else {	// if y < 2^53
851	tmp2.d = (double) C2_lo;	// exact conversion
852	y_nr_bits =
853	  ((((unsigned int) (tmp2.ui64 >> 52)) & 0x7ff) - 0x3ff);
854      }
855    } else {	// C2_hi != 0 => nr. bits = 64 + nr_bits (C2_hi)
856      tmp2.d = (double) C2_hi;	// exact conversion
857      y_nr_bits =
858	64 + ((((unsigned int) (tmp2.ui64 >> 52)) & 0x7ff) - 0x3ff);
859    }
860
861    q2 = nr_digits[y_nr_bits].digits;
862    if (q2 == 0) {
863      q2 = nr_digits[y_nr_bits].digits1;
864      if (C2_hi > nr_digits[y_nr_bits].threshold_hi ||
865	  (C2_hi == nr_digits[y_nr_bits].threshold_hi &&
866	   C2_lo >= nr_digits[y_nr_bits].threshold_lo))
867	q2++;
868    }
869
870    delta = q1 + (int) (x_exp >> 49) - q2 - (int) (y_exp >> 49);
871
872    if (delta >= P34) {
873      // round the result directly because 0 < C2 < ulp (C1 * 10^(x_exp-e2))
874      // n = C1 * 10^e1 or n = C1 +/- 10^(q1-P34)) * 10^e1
875      // the result is inexact; the preferred exponent is the least possible
876
877      if (delta >= P34 + 1) {
878	// for RN the result is the operand with the larger magnitude,
879	// possibly scaled up by 10^(P34-q1)
880	// an overflow cannot occur in this case (rounding to nearest)
881	if (q1 < P34) {	// scale C1 up by 10^(P34-q1)
882	  // Note: because delta >= P34+1 it is certain that
883	  //     x_exp - ((UINT64)scale << 49) will stay above e_min
884	  scale = P34 - q1;
885	  if (q1 <= 19) {	// C1 fits in 64 bits
886	    // 1 <= q1 <= 19 => 15 <= scale <= 33
887	    if (scale <= 19) {	// 10^scale fits in 64 bits
888	      __mul_64x64_to_128MACH (C1, ten2k64[scale], C1_lo);
889	    } else {	// if 20 <= scale <= 33
890	      // C1 * 10^scale = (C1 * 10^(scale-19)) * 10^19 where
891	      // (C1 * 10^(scale-19)) fits in 64 bits
892	      C1_lo = C1_lo * ten2k64[scale - 19];
893	      __mul_64x64_to_128MACH (C1, ten2k64[19], C1_lo);
894	    }
895	  } else {	//if 20 <= q1 <= 33=P34-1 then C1 fits only in 128 bits
896	    // => 1 <= P34 - q1 <= 14 so 10^(P34-q1) fits in 64 bits
897	    C1.w[1] = C1_hi;
898	    C1.w[0] = C1_lo;
899	    // C1 = ten2k64[P34 - q1] * C1
900	    __mul_128x64_to_128 (C1, ten2k64[P34 - q1], C1);
901	  }
902	  x_exp = x_exp - ((UINT64) scale << 49);
903	  C1_hi = C1.w[1];
904	  C1_lo = C1.w[0];
905	}
906	// some special cases arise: if delta = P34 + 1 and C1 = 10^(P34-1)
907	// (after scaling) and x_sign != y_sign and C2 > 5*10^(q2-1) =>
908	// subtract 1 ulp
909	// Note: do this only for rounding to nearest; for other rounding
910	// modes the correction will be applied next
911	if ((rnd_mode == ROUNDING_TO_NEAREST
912	     || rnd_mode == ROUNDING_TIES_AWAY) && delta == (P34 + 1)
913	    && C1_hi == 0x0000314dc6448d93ull
914	    && C1_lo == 0x38c15b0a00000000ull && x_sign != y_sign
915	    && ((q2 <= 19 && C2_lo > midpoint64[q2 - 1]) || (q2 >= 20
916							     && (C2_hi >
917								 midpoint128
918								 [q2 -
919								  20].
920								 w[1]
921								 ||
922								 (C2_hi
923								  ==
924								  midpoint128
925								  [q2 -
926								   20].
927								  w[1]
928								  &&
929								  C2_lo
930								  >
931								  midpoint128
932								  [q2 -
933								   20].
934								  w
935								  [0])))))
936	{
937	  // C1 = 10^34 - 1 and decrement x_exp by 1 (no underflow possible)
938	  C1_hi = 0x0001ed09bead87c0ull;
939	  C1_lo = 0x378d8e63ffffffffull;
940	  x_exp = x_exp - EXP_P1;
941	}
942	if (rnd_mode != ROUNDING_TO_NEAREST) {
943	  if ((rnd_mode == ROUNDING_DOWN && x_sign && y_sign) ||
944	      (rnd_mode == ROUNDING_UP && !x_sign && !y_sign)) {
945	    // add 1 ulp and then check for overflow
946	    C1_lo = C1_lo + 1;
947	    if (C1_lo == 0) {	// rounding overflow in the low 64 bits
948	      C1_hi = C1_hi + 1;
949	    }
950	    if (C1_hi == 0x0001ed09bead87c0ull
951		&& C1_lo == 0x378d8e6400000000ull) {
952	      // C1 = 10^34 => rounding overflow
953	      C1_hi = 0x0000314dc6448d93ull;
954	      C1_lo = 0x38c15b0a00000000ull;	// 10^33
955	      x_exp = x_exp + EXP_P1;
956	      if (x_exp == EXP_MAX_P1) {	// overflow
957		C1_hi = 0x7800000000000000ull;	// +inf
958		C1_lo = 0x0ull;
959		x_exp = 0;	// x_sign is preserved
960		// set overflow flag (the inexact flag was set too)
961		*pfpsf |= OVERFLOW_EXCEPTION;
962	      }
963	    }
964	  } else if ((rnd_mode == ROUNDING_DOWN && !x_sign && y_sign) ||
965		     (rnd_mode == ROUNDING_UP && x_sign && !y_sign) ||
966		     (rnd_mode == ROUNDING_TO_ZERO
967		      && x_sign != y_sign)) {
968	    // subtract 1 ulp from C1
969	    // Note: because delta >= P34 + 1 the result cannot be zero
970	    C1_lo = C1_lo - 1;
971	    if (C1_lo == 0xffffffffffffffffull)
972	      C1_hi = C1_hi - 1;
973	    // if the coefficient is 10^33 - 1 then make it 10^34 - 1 and
974	    // decrease the exponent by 1 (because delta >= P34 + 1 the
975	    // exponent will not become less than e_min)
976	    // 10^33 - 1 = 0x0000314dc6448d9338c15b09ffffffff
977	    // 10^34 - 1 = 0x0001ed09bead87c0378d8e63ffffffff
978	    if (C1_hi == 0x0000314dc6448d93ull
979		&& C1_lo == 0x38c15b09ffffffffull) {
980	      // make C1 = 10^34  - 1
981	      C1_hi = 0x0001ed09bead87c0ull;
982	      C1_lo = 0x378d8e63ffffffffull;
983	      x_exp = x_exp - EXP_P1;
984	    }
985	  } else {
986	    ;	// the result is already correct
987	  }
988	}
989	// set the inexact flag
990	*pfpsf |= INEXACT_EXCEPTION;
991	// assemble the result
992	res.w[1] = x_sign | x_exp | C1_hi;
993	res.w[0] = C1_lo;
994      } else {	// delta = P34
995	// in most cases, the smaller operand may be < or = or > 1/2 ulp of the
996	// larger operand
997	// however, the case C1 = 10^(q1-1) and x_sign != y_sign is special due
998	// to accuracy loss after subtraction, and will be treated separately
999	if (x_sign == y_sign || (q1 <= 20
1000				 && (C1_hi != 0
1001				     || C1_lo != ten2k64[q1 - 1]))
1002	    || (q1 >= 21 && (C1_hi != ten2k128[q1 - 21].w[1]
1003			     || C1_lo != ten2k128[q1 - 21].w[0]))) {
1004	  // if x_sign == y_sign or C1 != 10^(q1-1)
1005	  // compare C2 with 1/2 ulp = 5 * 10^(q2-1), the latter read from table
1006	  // Note: cases q1<=19 and q1>=20 can be coalesced at some latency cost
1007	  if (q2 <= 19) {	// C2 and 5*10^(q2-1) both fit in 64 bits
1008	    halfulp64 = midpoint64[q2 - 1];	// 5 * 10^(q2-1)
1009	    if (C2_lo < halfulp64) {	// n2 < 1/2 ulp (n1)
1010	      // for RN the result is the operand with the larger magnitude,
1011	      // possibly scaled up by 10^(P34-q1)
1012	      // an overflow cannot occur in this case (rounding to nearest)
1013	      if (q1 < P34) {	// scale C1 up by 10^(P34-q1)
1014		// Note: because delta = P34 it is certain that
1015		//     x_exp - ((UINT64)scale << 49) will stay above e_min
1016		scale = P34 - q1;
1017		if (q1 <= 19) {	// C1 fits in 64 bits
1018		  // 1 <= q1 <= 19 => 15 <= scale <= 33
1019		  if (scale <= 19) {	// 10^scale fits in 64 bits
1020		    __mul_64x64_to_128MACH (C1, ten2k64[scale], C1_lo);
1021		  } else {	// if 20 <= scale <= 33
1022		    // C1 * 10^scale = (C1 * 10^(scale-19)) * 10^19 where
1023		    // (C1 * 10^(scale-19)) fits in 64 bits
1024		    C1_lo = C1_lo * ten2k64[scale - 19];
1025		    __mul_64x64_to_128MACH (C1, ten2k64[19], C1_lo);
1026		  }
1027		} else {	//if 20 <= q1 <= 33=P34-1 then C1 fits only in 128 bits
1028		  // => 1 <= P34 - q1 <= 14 so 10^(P34-q1) fits in 64 bits
1029		  C1.w[1] = C1_hi;
1030		  C1.w[0] = C1_lo;
1031		  // C1 = ten2k64[P34 - q1] * C1
1032		  __mul_128x64_to_128 (C1, ten2k64[P34 - q1], C1);
1033		}
1034		x_exp = x_exp - ((UINT64) scale << 49);
1035		C1_hi = C1.w[1];
1036		C1_lo = C1.w[0];
1037	      }
1038	      if (rnd_mode != ROUNDING_TO_NEAREST) {
1039		if ((rnd_mode == ROUNDING_DOWN && x_sign && y_sign) ||
1040		    (rnd_mode == ROUNDING_UP && !x_sign && !y_sign)) {
1041		  // add 1 ulp and then check for overflow
1042		  C1_lo = C1_lo + 1;
1043		  if (C1_lo == 0) {	// rounding overflow in the low 64 bits
1044		    C1_hi = C1_hi + 1;
1045		  }
1046		  if (C1_hi == 0x0001ed09bead87c0ull
1047		      && C1_lo == 0x378d8e6400000000ull) {
1048		    // C1 = 10^34 => rounding overflow
1049		    C1_hi = 0x0000314dc6448d93ull;
1050		    C1_lo = 0x38c15b0a00000000ull;	// 10^33
1051		    x_exp = x_exp + EXP_P1;
1052		    if (x_exp == EXP_MAX_P1) {	// overflow
1053		      C1_hi = 0x7800000000000000ull;	// +inf
1054		      C1_lo = 0x0ull;
1055		      x_exp = 0;	// x_sign is preserved
1056		      // set overflow flag (the inexact flag was set too)
1057		      *pfpsf |= OVERFLOW_EXCEPTION;
1058		    }
1059		  }
1060		} else
1061		  if ((rnd_mode == ROUNDING_DOWN && !x_sign && y_sign)
1062		      || (rnd_mode == ROUNDING_UP && x_sign && !y_sign)
1063		      || (rnd_mode == ROUNDING_TO_ZERO
1064			  && x_sign != y_sign)) {
1065		  // subtract 1 ulp from C1
1066		  // Note: because delta = P34 the result cannot be zero
1067		  C1_lo = C1_lo - 1;
1068		  if (C1_lo == 0xffffffffffffffffull)
1069		    C1_hi = C1_hi - 1;
1070		  // if the coefficient is 10^33-1 then make it 10^34-1 and
1071		  // decrease the exponent by 1 (because delta = P34 the
1072		  // exponent will not become less than e_min)
1073		  // 10^33 - 1 = 0x0000314dc6448d9338c15b09ffffffff
1074		  // 10^34 - 1 = 0x0001ed09bead87c0378d8e63ffffffff
1075		  if (C1_hi == 0x0000314dc6448d93ull
1076		      && C1_lo == 0x38c15b09ffffffffull) {
1077		    // make C1 = 10^34  - 1
1078		    C1_hi = 0x0001ed09bead87c0ull;
1079		    C1_lo = 0x378d8e63ffffffffull;
1080		    x_exp = x_exp - EXP_P1;
1081		  }
1082		} else {
1083		  ;	// the result is already correct
1084		}
1085	      }
1086	      // set the inexact flag
1087	      *pfpsf |= INEXACT_EXCEPTION;
1088	      // assemble the result
1089	      res.w[1] = x_sign | x_exp | C1_hi;
1090	      res.w[0] = C1_lo;
1091	    } else if ((C2_lo == halfulp64)
1092		       && (q1 < P34 || ((C1_lo & 0x1) == 0))) {
1093	      // n2 = 1/2 ulp (n1) and C1 is even
1094	      // the result is the operand with the larger magnitude,
1095	      // possibly scaled up by 10^(P34-q1)
1096	      // an overflow cannot occur in this case (rounding to nearest)
1097	      if (q1 < P34) {	// scale C1 up by 10^(P34-q1)
1098		// Note: because delta = P34 it is certain that
1099		//     x_exp - ((UINT64)scale << 49) will stay above e_min
1100		scale = P34 - q1;
1101		if (q1 <= 19) {	// C1 fits in 64 bits
1102		  // 1 <= q1 <= 19 => 15 <= scale <= 33
1103		  if (scale <= 19) {	// 10^scale fits in 64 bits
1104		    __mul_64x64_to_128MACH (C1, ten2k64[scale], C1_lo);
1105		  } else {	// if 20 <= scale <= 33
1106		    // C1 * 10^scale = (C1 * 10^(scale-19)) * 10^19 where
1107		    // (C1 * 10^(scale-19)) fits in 64 bits
1108		    C1_lo = C1_lo * ten2k64[scale - 19];
1109		    __mul_64x64_to_128MACH (C1, ten2k64[19], C1_lo);
1110		  }
1111		} else {	//if 20 <= q1 <= 33=P34-1 then C1 fits only in 128 bits
1112		  // => 1 <= P34 - q1 <= 14 so 10^(P34-q1) fits in 64 bits
1113		  C1.w[1] = C1_hi;
1114		  C1.w[0] = C1_lo;
1115		  // C1 = ten2k64[P34 - q1] * C1
1116		  __mul_128x64_to_128 (C1, ten2k64[P34 - q1], C1);
1117		}
1118		x_exp = x_exp - ((UINT64) scale << 49);
1119		C1_hi = C1.w[1];
1120		C1_lo = C1.w[0];
1121	      }
1122	      if ((rnd_mode == ROUNDING_TO_NEAREST && x_sign == y_sign
1123		   && (C1_lo & 0x01)) || (rnd_mode == ROUNDING_TIES_AWAY
1124					  && x_sign == y_sign)
1125		  || (rnd_mode == ROUNDING_UP && !x_sign && !y_sign)
1126		  || (rnd_mode == ROUNDING_DOWN && x_sign && y_sign)) {
1127		// add 1 ulp and then check for overflow
1128		C1_lo = C1_lo + 1;
1129		if (C1_lo == 0) {	// rounding overflow in the low 64 bits
1130		  C1_hi = C1_hi + 1;
1131		}
1132		if (C1_hi == 0x0001ed09bead87c0ull
1133		    && C1_lo == 0x378d8e6400000000ull) {
1134		  // C1 = 10^34 => rounding overflow
1135		  C1_hi = 0x0000314dc6448d93ull;
1136		  C1_lo = 0x38c15b0a00000000ull;	// 10^33
1137		  x_exp = x_exp + EXP_P1;
1138		  if (x_exp == EXP_MAX_P1) {	// overflow
1139		    C1_hi = 0x7800000000000000ull;	// +inf
1140		    C1_lo = 0x0ull;
1141		    x_exp = 0;	// x_sign is preserved
1142		    // set overflow flag (the inexact flag was set too)
1143		    *pfpsf |= OVERFLOW_EXCEPTION;
1144		  }
1145		}
1146	      } else
1147		if ((rnd_mode == ROUNDING_TO_NEAREST && x_sign != y_sign
1148		     && (C1_lo & 0x01)) || (rnd_mode == ROUNDING_DOWN
1149					    && !x_sign && y_sign)
1150		    || (rnd_mode == ROUNDING_UP && x_sign && !y_sign)
1151		    || (rnd_mode == ROUNDING_TO_ZERO
1152			&& x_sign != y_sign)) {
1153		// subtract 1 ulp from C1
1154		// Note: because delta = P34 the result cannot be zero
1155		C1_lo = C1_lo - 1;
1156		if (C1_lo == 0xffffffffffffffffull)
1157		  C1_hi = C1_hi - 1;
1158		// if the coefficient is 10^33 - 1 then make it 10^34 - 1
1159		// and decrease the exponent by 1 (because delta = P34
1160		// the exponent will not become less than e_min)
1161		// 10^33 - 1 = 0x0000314dc6448d9338c15b09ffffffff
1162		// 10^34 - 1 = 0x0001ed09bead87c0378d8e63ffffffff
1163		if (C1_hi == 0x0000314dc6448d93ull
1164		    && C1_lo == 0x38c15b09ffffffffull) {
1165		  // make C1 = 10^34  - 1
1166		  C1_hi = 0x0001ed09bead87c0ull;
1167		  C1_lo = 0x378d8e63ffffffffull;
1168		  x_exp = x_exp - EXP_P1;
1169		}
1170	      } else {
1171		;	// the result is already correct
1172	      }
1173	      // set the inexact flag
1174	      *pfpsf |= INEXACT_EXCEPTION;
1175	      // assemble the result
1176	      res.w[1] = x_sign | x_exp | C1_hi;
1177	      res.w[0] = C1_lo;
1178	    } else {	// if C2_lo > halfulp64 ||
1179	      // (C2_lo == halfulp64 && q1 == P34 && ((C1_lo & 0x1) == 1)), i.e.
1180	      // 1/2 ulp(n1) < n2 < 1 ulp(n1) or n2 = 1/2 ulp(n1) and C1 odd
1181	      // res = x+1 ulp if n1*n2 > 0 and res = x-1 ulp if n1*n2 < 0
1182	      if (q1 < P34) {	// then 1 ulp = 10^(e1+q1-P34) < 10^e1
1183		// Note: if (q1 == P34) then 1 ulp = 10^(e1+q1-P34) = 10^e1
1184		// because q1 < P34 we must first replace C1 by
1185		// C1 * 10^(P34-q1), and must decrease the exponent by
1186		// (P34-q1) (it will still be at least e_min)
1187		scale = P34 - q1;
1188		if (q1 <= 19) {	// C1 fits in 64 bits
1189		  // 1 <= q1 <= 19 => 15 <= scale <= 33
1190		  if (scale <= 19) {	// 10^scale fits in 64 bits
1191		    __mul_64x64_to_128MACH (C1, ten2k64[scale], C1_lo);
1192		  } else {	// if 20 <= scale <= 33
1193		    // C1 * 10^scale = (C1 * 10^(scale-19)) * 10^19 where
1194		    // (C1 * 10^(scale-19)) fits in 64 bits
1195		    C1_lo = C1_lo * ten2k64[scale - 19];
1196		    __mul_64x64_to_128MACH (C1, ten2k64[19], C1_lo);
1197		  }
1198		} else {	//if 20 <= q1 <= 33=P34-1 then C1 fits only in 128 bits
1199		  // => 1 <= P34 - q1 <= 14 so 10^(P34-q1) fits in 64 bits
1200		  C1.w[1] = C1_hi;
1201		  C1.w[0] = C1_lo;
1202		  // C1 = ten2k64[P34 - q1] * C1
1203		  __mul_128x64_to_128 (C1, ten2k64[P34 - q1], C1);
1204		}
1205		x_exp = x_exp - ((UINT64) scale << 49);
1206		C1_hi = C1.w[1];
1207		C1_lo = C1.w[0];
1208		// check for rounding overflow
1209		if (C1_hi == 0x0001ed09bead87c0ull
1210		    && C1_lo == 0x378d8e6400000000ull) {
1211		  // C1 = 10^34 => rounding overflow
1212		  C1_hi = 0x0000314dc6448d93ull;
1213		  C1_lo = 0x38c15b0a00000000ull;	// 10^33
1214		  x_exp = x_exp + EXP_P1;
1215		}
1216	      }
1217	      if ((rnd_mode == ROUNDING_TO_NEAREST && x_sign != y_sign)
1218		  || (rnd_mode == ROUNDING_TIES_AWAY && x_sign != y_sign
1219		      && C2_lo != halfulp64)
1220		  || (rnd_mode == ROUNDING_DOWN && !x_sign && y_sign)
1221		  || (rnd_mode == ROUNDING_UP && x_sign && !y_sign)
1222		  || (rnd_mode == ROUNDING_TO_ZERO
1223		      && x_sign != y_sign)) {
1224		// the result is x - 1
1225		// for RN n1 * n2 < 0; underflow not possible
1226		C1_lo = C1_lo - 1;
1227		if (C1_lo == 0xffffffffffffffffull)
1228		  C1_hi--;
1229		// check if we crossed into the lower decade
1230		if (C1_hi == 0x0000314dc6448d93ull && C1_lo == 0x38c15b09ffffffffull) {	// 10^33 - 1
1231		  C1_hi = 0x0001ed09bead87c0ull;	// 10^34 - 1
1232		  C1_lo = 0x378d8e63ffffffffull;
1233		  x_exp = x_exp - EXP_P1;	// no underflow, because n1 >> n2
1234		}
1235	      } else
1236		if ((rnd_mode == ROUNDING_TO_NEAREST
1237		     && x_sign == y_sign)
1238		    || (rnd_mode == ROUNDING_TIES_AWAY
1239			&& x_sign == y_sign)
1240		    || (rnd_mode == ROUNDING_DOWN && x_sign && y_sign)
1241		    || (rnd_mode == ROUNDING_UP && !x_sign
1242			&& !y_sign)) {
1243		// the result is x + 1
1244		// for RN x_sign = y_sign, i.e. n1*n2 > 0
1245		C1_lo = C1_lo + 1;
1246		if (C1_lo == 0) {	// rounding overflow in the low 64 bits
1247		  C1_hi = C1_hi + 1;
1248		}
1249		if (C1_hi == 0x0001ed09bead87c0ull
1250		    && C1_lo == 0x378d8e6400000000ull) {
1251		  // C1 = 10^34 => rounding overflow
1252		  C1_hi = 0x0000314dc6448d93ull;
1253		  C1_lo = 0x38c15b0a00000000ull;	// 10^33
1254		  x_exp = x_exp + EXP_P1;
1255		  if (x_exp == EXP_MAX_P1) {	// overflow
1256		    C1_hi = 0x7800000000000000ull;	// +inf
1257		    C1_lo = 0x0ull;
1258		    x_exp = 0;	// x_sign is preserved
1259		    // set the overflow flag
1260		    *pfpsf |= OVERFLOW_EXCEPTION;
1261		  }
1262		}
1263	      } else {
1264		;	// the result is x
1265	      }
1266	      // set the inexact flag
1267	      *pfpsf |= INEXACT_EXCEPTION;
1268	      // assemble the result
1269	      res.w[1] = x_sign | x_exp | C1_hi;
1270	      res.w[0] = C1_lo;
1271	    }
1272	  } else {	// if q2 >= 20 then 5*10^(q2-1) and C2 (the latter in
1273	    // most cases) fit only in more than 64 bits
1274	    halfulp128 = midpoint128[q2 - 20];	// 5 * 10^(q2-1)
1275	    if ((C2_hi < halfulp128.w[1])
1276		|| (C2_hi == halfulp128.w[1]
1277		    && C2_lo < halfulp128.w[0])) {
1278	      // n2 < 1/2 ulp (n1)
1279	      // the result is the operand with the larger magnitude,
1280	      // possibly scaled up by 10^(P34-q1)
1281	      // an overflow cannot occur in this case (rounding to nearest)
1282	      if (q1 < P34) {	// scale C1 up by 10^(P34-q1)
1283		// Note: because delta = P34 it is certain that
1284		//     x_exp - ((UINT64)scale << 49) will stay above e_min
1285		scale = P34 - q1;
1286		if (q1 <= 19) {	// C1 fits in 64 bits
1287		  // 1 <= q1 <= 19 => 15 <= scale <= 33
1288		  if (scale <= 19) {	// 10^scale fits in 64 bits
1289		    __mul_64x64_to_128MACH (C1, ten2k64[scale], C1_lo);
1290		  } else {	// if 20 <= scale <= 33
1291		    // C1 * 10^scale = (C1 * 10^(scale-19)) * 10^19 where
1292		    // (C1 * 10^(scale-19)) fits in 64 bits
1293		    C1_lo = C1_lo * ten2k64[scale - 19];
1294		    __mul_64x64_to_128MACH (C1, ten2k64[19], C1_lo);
1295		  }
1296		} else {	//if 20 <= q1 <= 33=P34-1 then C1 fits only in 128 bits
1297		  // => 1 <= P34 - q1 <= 14 so 10^(P34-q1) fits in 64 bits
1298		  C1.w[1] = C1_hi;
1299		  C1.w[0] = C1_lo;
1300		  // C1 = ten2k64[P34 - q1] * C1
1301		  __mul_128x64_to_128 (C1, ten2k64[P34 - q1], C1);
1302		}
1303		C1_hi = C1.w[1];
1304		C1_lo = C1.w[0];
1305		x_exp = x_exp - ((UINT64) scale << 49);
1306	      }
1307	      if (rnd_mode != ROUNDING_TO_NEAREST) {
1308		if ((rnd_mode == ROUNDING_DOWN && x_sign && y_sign) ||
1309		    (rnd_mode == ROUNDING_UP && !x_sign && !y_sign)) {
1310		  // add 1 ulp and then check for overflow
1311		  C1_lo = C1_lo + 1;
1312		  if (C1_lo == 0) {	// rounding overflow in the low 64 bits
1313		    C1_hi = C1_hi + 1;
1314		  }
1315		  if (C1_hi == 0x0001ed09bead87c0ull
1316		      && C1_lo == 0x378d8e6400000000ull) {
1317		    // C1 = 10^34 => rounding overflow
1318		    C1_hi = 0x0000314dc6448d93ull;
1319		    C1_lo = 0x38c15b0a00000000ull;	// 10^33
1320		    x_exp = x_exp + EXP_P1;
1321		    if (x_exp == EXP_MAX_P1) {	// overflow
1322		      C1_hi = 0x7800000000000000ull;	// +inf
1323		      C1_lo = 0x0ull;
1324		      x_exp = 0;	// x_sign is preserved
1325		      // set overflow flag (the inexact flag was set too)
1326		      *pfpsf |= OVERFLOW_EXCEPTION;
1327		    }
1328		  }
1329		} else
1330		  if ((rnd_mode == ROUNDING_DOWN && !x_sign && y_sign)
1331		      || (rnd_mode == ROUNDING_UP && x_sign && !y_sign)
1332		      || (rnd_mode == ROUNDING_TO_ZERO
1333			  && x_sign != y_sign)) {
1334		  // subtract 1 ulp from C1
1335		  // Note: because delta = P34 the result cannot be zero
1336		  C1_lo = C1_lo - 1;
1337		  if (C1_lo == 0xffffffffffffffffull)
1338		    C1_hi = C1_hi - 1;
1339		  // if the coefficient is 10^33-1 then make it 10^34-1 and
1340		  // decrease the exponent by 1 (because delta = P34 the
1341		  // exponent will not become less than e_min)
1342		  // 10^33 - 1 = 0x0000314dc6448d9338c15b09ffffffff
1343		  // 10^34 - 1 = 0x0001ed09bead87c0378d8e63ffffffff
1344		  if (C1_hi == 0x0000314dc6448d93ull
1345		      && C1_lo == 0x38c15b09ffffffffull) {
1346		    // make C1 = 10^34  - 1
1347		    C1_hi = 0x0001ed09bead87c0ull;
1348		    C1_lo = 0x378d8e63ffffffffull;
1349		    x_exp = x_exp - EXP_P1;
1350		  }
1351		} else {
1352		  ;	// the result is already correct
1353		}
1354	      }
1355	      // set the inexact flag
1356	      *pfpsf |= INEXACT_EXCEPTION;
1357	      // assemble the result
1358	      res.w[1] = x_sign | x_exp | C1_hi;
1359	      res.w[0] = C1_lo;
1360	    } else if ((C2_hi == halfulp128.w[1]
1361			&& C2_lo == halfulp128.w[0])
1362		       && (q1 < P34 || ((C1_lo & 0x1) == 0))) {
1363	      // midpoint & lsb in C1 is 0
1364	      // n2 = 1/2 ulp (n1) and C1 is even
1365	      // the result is the operand with the larger magnitude,
1366	      // possibly scaled up by 10^(P34-q1)
1367	      // an overflow cannot occur in this case (rounding to nearest)
1368	      if (q1 < P34) {	// scale C1 up by 10^(P34-q1)
1369		// Note: because delta = P34 it is certain that
1370		//     x_exp - ((UINT64)scale << 49) will stay above e_min
1371		scale = P34 - q1;
1372		if (q1 <= 19) {	// C1 fits in 64 bits
1373		  // 1 <= q1 <= 19 => 15 <= scale <= 33
1374		  if (scale <= 19) {	// 10^scale fits in 64 bits
1375		    __mul_64x64_to_128MACH (C1, ten2k64[scale], C1_lo);
1376		  } else {	// if 20 <= scale <= 33
1377		    // C1 * 10^scale = (C1 * 10^(scale-19)) * 10^19 where
1378		    // (C1 * 10^(scale-19)) fits in 64 bits
1379		    C1_lo = C1_lo * ten2k64[scale - 19];
1380		    __mul_64x64_to_128MACH (C1, ten2k64[19], C1_lo);
1381		  }
1382		} else {	//if 20 <= q1 <= 33=P34-1 then C1 fits only in 128 bits
1383		  // => 1 <= P34 - q1 <= 14 so 10^(P34-q1) fits in 64 bits
1384		  C1.w[1] = C1_hi;
1385		  C1.w[0] = C1_lo;
1386		  // C1 = ten2k64[P34 - q1] * C1
1387		  __mul_128x64_to_128 (C1, ten2k64[P34 - q1], C1);
1388		}
1389		x_exp = x_exp - ((UINT64) scale << 49);
1390		C1_hi = C1.w[1];
1391		C1_lo = C1.w[0];
1392	      }
1393	      if (rnd_mode != ROUNDING_TO_NEAREST) {
1394		if ((rnd_mode == ROUNDING_TIES_AWAY && x_sign == y_sign)
1395		    || (rnd_mode == ROUNDING_UP && !y_sign)) {
1396		  // add 1 ulp and then check for overflow
1397		  C1_lo = C1_lo + 1;
1398		  if (C1_lo == 0) {	// rounding overflow in the low 64 bits
1399		    C1_hi = C1_hi + 1;
1400		  }
1401		  if (C1_hi == 0x0001ed09bead87c0ull
1402		      && C1_lo == 0x378d8e6400000000ull) {
1403		    // C1 = 10^34 => rounding overflow
1404		    C1_hi = 0x0000314dc6448d93ull;
1405		    C1_lo = 0x38c15b0a00000000ull;	// 10^33
1406		    x_exp = x_exp + EXP_P1;
1407		    if (x_exp == EXP_MAX_P1) {	// overflow
1408		      C1_hi = 0x7800000000000000ull;	// +inf
1409		      C1_lo = 0x0ull;
1410		      x_exp = 0;	// x_sign is preserved
1411		      // set overflow flag (the inexact flag was set too)
1412		      *pfpsf |= OVERFLOW_EXCEPTION;
1413		    }
1414		  }
1415		} else if ((rnd_mode == ROUNDING_DOWN && y_sign)
1416			   || (rnd_mode == ROUNDING_TO_ZERO
1417			       && x_sign != y_sign)) {
1418		  // subtract 1 ulp from C1
1419		  // Note: because delta = P34 the result cannot be zero
1420		  C1_lo = C1_lo - 1;
1421		  if (C1_lo == 0xffffffffffffffffull)
1422		    C1_hi = C1_hi - 1;
1423		  // if the coefficient is 10^33 - 1 then make it 10^34 - 1
1424		  // and decrease the exponent by 1 (because delta >= P34 + 1
1425		  // the exponent will not become less than e_min)
1426		  // 10^33 - 1 = 0x0000314dc6448d9338c15b09ffffffff
1427		  // 10^34 - 1 = 0x0001ed09bead87c0378d8e63ffffffff
1428		  if (C1_hi == 0x0000314dc6448d93ull
1429		      && C1_lo == 0x38c15b09ffffffffull) {
1430		    // make C1 = 10^34  - 1
1431		    C1_hi = 0x0001ed09bead87c0ull;
1432		    C1_lo = 0x378d8e63ffffffffull;
1433		    x_exp = x_exp - EXP_P1;
1434		  }
1435		} else {
1436		  ;	// the result is already correct
1437		}
1438	      }
1439	      // set the inexact flag
1440	      *pfpsf |= INEXACT_EXCEPTION;
1441	      // assemble the result
1442	      res.w[1] = x_sign | x_exp | C1_hi;
1443	      res.w[0] = C1_lo;
1444	    } else {	// if C2 > halfulp128 ||
1445	      // (C2 == halfulp128 && q1 == P34 && ((C1 & 0x1) == 1)), i.e.
1446	      // 1/2 ulp(n1) < n2 < 1 ulp(n1) or n2 = 1/2 ulp(n1) and C1 odd
1447	      // res = x+1 ulp if n1*n2 > 0 and res = x-1 ulp if n1*n2 < 0
1448	      if (q1 < P34) {	// then 1 ulp = 10^(e1+q1-P34) < 10^e1
1449		// Note: if (q1 == P34) then 1 ulp = 10^(e1+q1-P34) = 10^e1
1450		// because q1 < P34 we must first replace C1 by C1*10^(P34-q1),
1451		// and must decrease the exponent by (P34-q1) (it will still be
1452		// at least e_min)
1453		scale = P34 - q1;
1454		if (q1 <= 19) {	// C1 fits in 64 bits
1455		  // 1 <= q1 <= 19 => 15 <= scale <= 33
1456		  if (scale <= 19) {	// 10^scale fits in 64 bits
1457		    __mul_64x64_to_128MACH (C1, ten2k64[scale], C1_lo);
1458		  } else {	// if 20 <= scale <= 33
1459		    // C1 * 10^scale = (C1 * 10^(scale-19)) * 10^19 where
1460		    // (C1 * 10^(scale-19)) fits in 64 bits
1461		    C1_lo = C1_lo * ten2k64[scale - 19];
1462		    __mul_64x64_to_128MACH (C1, ten2k64[19], C1_lo);
1463		  }
1464		} else {	//if 20 <= q1 <= 33=P34-1 then C1 fits only in 128 bits
1465		  // => 1 <= P34 - q1 <= 14 so 10^(P34-q1) fits in 64 bits
1466		  C1.w[1] = C1_hi;
1467		  C1.w[0] = C1_lo;
1468		  // C1 = ten2k64[P34 - q1] * C1
1469		  __mul_128x64_to_128 (C1, ten2k64[P34 - q1], C1);
1470		}
1471		C1_hi = C1.w[1];
1472		C1_lo = C1.w[0];
1473		x_exp = x_exp - ((UINT64) scale << 49);
1474	      }
1475	      if ((rnd_mode == ROUNDING_TO_NEAREST && x_sign != y_sign)
1476		  || (rnd_mode == ROUNDING_TIES_AWAY && x_sign != y_sign
1477		      && (C2_hi != halfulp128.w[1]
1478			  || C2_lo != halfulp128.w[0]))
1479		  || (rnd_mode == ROUNDING_DOWN && !x_sign && y_sign)
1480		  || (rnd_mode == ROUNDING_UP && x_sign && !y_sign)
1481		  || (rnd_mode == ROUNDING_TO_ZERO
1482		      && x_sign != y_sign)) {
1483		// the result is x - 1
1484		// for RN n1 * n2 < 0; underflow not possible
1485		C1_lo = C1_lo - 1;
1486		if (C1_lo == 0xffffffffffffffffull)
1487		  C1_hi--;
1488		// check if we crossed into the lower decade
1489		if (C1_hi == 0x0000314dc6448d93ull && C1_lo == 0x38c15b09ffffffffull) {	// 10^33 - 1
1490		  C1_hi = 0x0001ed09bead87c0ull;	// 10^34 - 1
1491		  C1_lo = 0x378d8e63ffffffffull;
1492		  x_exp = x_exp - EXP_P1;	// no underflow, because n1 >> n2
1493		}
1494	      } else
1495		if ((rnd_mode == ROUNDING_TO_NEAREST
1496		     && x_sign == y_sign)
1497		    || (rnd_mode == ROUNDING_TIES_AWAY
1498			&& x_sign == y_sign)
1499		    || (rnd_mode == ROUNDING_DOWN && x_sign && y_sign)
1500		    || (rnd_mode == ROUNDING_UP && !x_sign
1501			&& !y_sign)) {
1502		// the result is x + 1
1503		// for RN x_sign = y_sign, i.e. n1*n2 > 0
1504		C1_lo = C1_lo + 1;
1505		if (C1_lo == 0) {	// rounding overflow in the low 64 bits
1506		  C1_hi = C1_hi + 1;
1507		}
1508		if (C1_hi == 0x0001ed09bead87c0ull
1509		    && C1_lo == 0x378d8e6400000000ull) {
1510		  // C1 = 10^34 => rounding overflow
1511		  C1_hi = 0x0000314dc6448d93ull;
1512		  C1_lo = 0x38c15b0a00000000ull;	// 10^33
1513		  x_exp = x_exp + EXP_P1;
1514		  if (x_exp == EXP_MAX_P1) {	// overflow
1515		    C1_hi = 0x7800000000000000ull;	// +inf
1516		    C1_lo = 0x0ull;
1517		    x_exp = 0;	// x_sign is preserved
1518		    // set the overflow flag
1519		    *pfpsf |= OVERFLOW_EXCEPTION;
1520		  }
1521		}
1522	      } else {
1523		;	// the result is x
1524	      }
1525	      // set the inexact flag
1526	      *pfpsf |= INEXACT_EXCEPTION;
1527	      // assemble the result
1528	      res.w[1] = x_sign | x_exp | C1_hi;
1529	      res.w[0] = C1_lo;
1530	    }
1531	  }	// end q1 >= 20
1532	  // end case where C1 != 10^(q1-1)
1533	} else {	// C1 = 10^(q1-1) and x_sign != y_sign
1534	  // instead of C' = (C1 * 10^(e1-e2) + C2)rnd,P34
1535	  // calculate C' = C1 * 10^(e1-e2-x1) + (C2 * 10^(-x1))rnd,P34
1536	  // where x1 = q2 - 1, 0 <= x1 <= P34 - 1
1537	  // Because C1 = 10^(q1-1) and x_sign != y_sign, C' will have P34
1538	  // digits and n = C' * 10^(e2+x1)
1539	  // If the result has P34+1 digits, redo the steps above with x1+1
1540	  // If the result has P34-1 digits or less, redo the steps above with
1541	  // x1-1 but only if initially x1 >= 1
1542	  // NOTE: these two steps can be improved, e.g we could guess if
1543	  // P34+1 or P34-1 digits will be obtained by adding/subtracting
1544	  // just the top 64 bits of the two operands
1545	  // The result cannot be zero, and it cannot overflow
1546	  x1 = q2 - 1;	// 0 <= x1 <= P34-1
1547	  // Calculate C1 * 10^(e1-e2-x1) where 1 <= e1-e2-x1 <= P34
1548	  // scale = (int)(e1 >> 49) - (int)(e2 >> 49) - x1; 1 <= scale <= P34
1549	  scale = P34 - q1 + 1;	// scale=e1-e2-x1 = P34+1-q1; 1<=scale<=P34
1550	  // either C1 or 10^(e1-e2-x1) may not fit is 64 bits,
1551	  // but their product fits with certainty in 128 bits
1552	  if (scale >= 20) {	//10^(e1-e2-x1) doesn't fit in 64 bits, but C1 does
1553	    __mul_128x64_to_128 (C1, C1_lo, ten2k128[scale - 20]);
1554	  } else {	// if (scale >= 1
1555	    // if 1 <= scale <= 19 then 10^(e1-e2-x1) fits in 64 bits
1556	    if (q1 <= 19) {	// C1 fits in 64 bits
1557	      __mul_64x64_to_128MACH (C1, C1_lo, ten2k64[scale]);
1558	    } else {	// q1 >= 20
1559	      C1.w[1] = C1_hi;
1560	      C1.w[0] = C1_lo;
1561	      __mul_128x64_to_128 (C1, ten2k64[scale], C1);
1562	    }
1563	  }
1564	  tmp64 = C1.w[0];	// C1.w[1], C1.w[0] contains C1 * 10^(e1-e2-x1)
1565
1566	  // now round C2 to q2-x1 = 1 decimal digit
1567	  // C2' = C2 + 1/2 * 10^x1 = C2 + 5 * 10^(x1-1)
1568	  ind = x1 - 1;	// -1 <= ind <= P34 - 2
1569	  if (ind >= 0) {	// if (x1 >= 1)
1570	    C2.w[0] = C2_lo;
1571	    C2.w[1] = C2_hi;
1572	    if (ind <= 18) {
1573	      C2.w[0] = C2.w[0] + midpoint64[ind];
1574	      if (C2.w[0] < C2_lo)
1575		C2.w[1]++;
1576	    } else {	// 19 <= ind <= 32
1577	      C2.w[0] = C2.w[0] + midpoint128[ind - 19].w[0];
1578	      C2.w[1] = C2.w[1] + midpoint128[ind - 19].w[1];
1579	      if (C2.w[0] < C2_lo)
1580		C2.w[1]++;
1581	    }
1582	    // the approximation of 10^(-x1) was rounded up to 118 bits
1583	    __mul_128x128_to_256 (R256, C2, ten2mk128[ind]);	// R256 = C2*, f2*
1584	    // calculate C2* and f2*
1585	    // C2* is actually floor(C2*) in this case
1586	    // C2* and f2* need shifting and masking, as shown by
1587	    // shiftright128[] and maskhigh128[]
1588	    // the top Ex bits of 10^(-x1) are T* = ten2mk128trunc[ind], e.g.
1589	    // if x1=1, T*=ten2mk128trunc[0]=0x19999999999999999999999999999999
1590	    // if (0 < f2* < 10^(-x1)) then
1591	    //   if floor(C1+C2*) is even then C2* = floor(C2*) - logical right
1592	    //       shift; C2* has p decimal digits, correct by Prop. 1)
1593	    //   else if floor(C1+C2*) is odd C2* = floor(C2*)-1 (logical right
1594	    //       shift; C2* has p decimal digits, correct by Pr. 1)
1595	    // else
1596	    //   C2* = floor(C2*) (logical right shift; C has p decimal digits,
1597	    //       correct by Property 1)
1598	    // n = C2* * 10^(e2+x1)
1599
1600	    if (ind <= 2) {
1601	      highf2star.w[1] = 0x0;
1602	      highf2star.w[0] = 0x0;	// low f2* ok
1603	    } else if (ind <= 21) {
1604	      highf2star.w[1] = 0x0;
1605	      highf2star.w[0] = R256.w[2] & maskhigh128[ind];	// low f2* ok
1606	    } else {
1607	      highf2star.w[1] = R256.w[3] & maskhigh128[ind];
1608	      highf2star.w[0] = R256.w[2];	// low f2* is ok
1609	    }
1610	    // shift right C2* by Ex-128 = shiftright128[ind]
1611	    if (ind >= 3) {
1612	      shift = shiftright128[ind];
1613	      if (shift < 64) {	// 3 <= shift <= 63
1614		R256.w[2] =
1615		  (R256.w[2] >> shift) | (R256.w[3] << (64 - shift));
1616		R256.w[3] = (R256.w[3] >> shift);
1617	      } else {	// 66 <= shift <= 102
1618		R256.w[2] = (R256.w[3] >> (shift - 64));
1619		R256.w[3] = 0x0ULL;
1620	      }
1621	    }
1622	    // redundant
1623	    is_inexact_lt_midpoint = 0;
1624	    is_inexact_gt_midpoint = 0;
1625	    is_midpoint_lt_even = 0;
1626	    is_midpoint_gt_even = 0;
1627	    // determine inexactness of the rounding of C2*
1628	    // (cannot be followed by a second rounding)
1629	    // if (0 < f2* - 1/2 < 10^(-x1)) then
1630	    //   the result is exact
1631	    // else (if f2* - 1/2 > T* then)
1632	    //   the result is inexact
1633	    if (ind <= 2) {
1634	      if (R256.w[1] > 0x8000000000000000ull ||
1635		  (R256.w[1] == 0x8000000000000000ull
1636		   && R256.w[0] > 0x0ull)) {
1637		// f2* > 1/2 and the result may be exact
1638		tmp64A = R256.w[1] - 0x8000000000000000ull;	// f* - 1/2
1639		if ((tmp64A > ten2mk128trunc[ind].w[1]
1640		     || (tmp64A == ten2mk128trunc[ind].w[1]
1641			 && R256.w[0] >= ten2mk128trunc[ind].w[0]))) {
1642		  // set the inexact flag
1643		  *pfpsf |= INEXACT_EXCEPTION;
1644		  // this rounding is applied to C2 only!
1645		  // x_sign != y_sign
1646		  is_inexact_gt_midpoint = 1;
1647		}	// else the result is exact
1648		// rounding down, unless a midpoint in [ODD, EVEN]
1649	      } else {	// the result is inexact; f2* <= 1/2
1650		// set the inexact flag
1651		*pfpsf |= INEXACT_EXCEPTION;
1652		// this rounding is applied to C2 only!
1653		// x_sign != y_sign
1654		is_inexact_lt_midpoint = 1;
1655	      }
1656	    } else if (ind <= 21) {	// if 3 <= ind <= 21
1657	      if (highf2star.w[1] > 0x0 || (highf2star.w[1] == 0x0
1658					    && highf2star.w[0] >
1659					    onehalf128[ind])
1660		  || (highf2star.w[1] == 0x0
1661		      && highf2star.w[0] == onehalf128[ind]
1662		      && (R256.w[1] || R256.w[0]))) {
1663		// f2* > 1/2 and the result may be exact
1664		// Calculate f2* - 1/2
1665		tmp64A = highf2star.w[0] - onehalf128[ind];
1666		tmp64B = highf2star.w[1];
1667		if (tmp64A > highf2star.w[0])
1668		  tmp64B--;
1669		if (tmp64B || tmp64A
1670		    || R256.w[1] > ten2mk128trunc[ind].w[1]
1671		    || (R256.w[1] == ten2mk128trunc[ind].w[1]
1672			&& R256.w[0] > ten2mk128trunc[ind].w[0])) {
1673		  // set the inexact flag
1674		  *pfpsf |= INEXACT_EXCEPTION;
1675		  // this rounding is applied to C2 only!
1676		  // x_sign != y_sign
1677		  is_inexact_gt_midpoint = 1;
1678		}	// else the result is exact
1679	      } else {	// the result is inexact; f2* <= 1/2
1680		// set the inexact flag
1681		*pfpsf |= INEXACT_EXCEPTION;
1682		// this rounding is applied to C2 only!
1683		// x_sign != y_sign
1684		is_inexact_lt_midpoint = 1;
1685	      }
1686	    } else {	// if 22 <= ind <= 33
1687	      if (highf2star.w[1] > onehalf128[ind]
1688		  || (highf2star.w[1] == onehalf128[ind]
1689		      && (highf2star.w[0] || R256.w[1]
1690			  || R256.w[0]))) {
1691		// f2* > 1/2 and the result may be exact
1692		// Calculate f2* - 1/2
1693		// tmp64A = highf2star.w[0];
1694		tmp64B = highf2star.w[1] - onehalf128[ind];
1695		if (tmp64B || highf2star.w[0]
1696		    || R256.w[1] > ten2mk128trunc[ind].w[1]
1697		    || (R256.w[1] == ten2mk128trunc[ind].w[1]
1698			&& R256.w[0] > ten2mk128trunc[ind].w[0])) {
1699		  // set the inexact flag
1700		  *pfpsf |= INEXACT_EXCEPTION;
1701		  // this rounding is applied to C2 only!
1702		  // x_sign != y_sign
1703		  is_inexact_gt_midpoint = 1;
1704		}	// else the result is exact
1705	      } else {	// the result is inexact; f2* <= 1/2
1706		// set the inexact flag
1707		*pfpsf |= INEXACT_EXCEPTION;
1708		// this rounding is applied to C2 only!
1709		// x_sign != y_sign
1710		is_inexact_lt_midpoint = 1;
1711	      }
1712	    }
1713	    // check for midpoints after determining inexactness
1714	    if ((R256.w[1] || R256.w[0]) && (highf2star.w[1] == 0)
1715		&& (highf2star.w[0] == 0)
1716		&& (R256.w[1] < ten2mk128trunc[ind].w[1]
1717		    || (R256.w[1] == ten2mk128trunc[ind].w[1]
1718			&& R256.w[0] <= ten2mk128trunc[ind].w[0]))) {
1719	      // the result is a midpoint
1720	      if ((tmp64 + R256.w[2]) & 0x01) {	// MP in [EVEN, ODD]
1721		// if floor(C2*) is odd C = floor(C2*) - 1; the result may be 0
1722		R256.w[2]--;
1723		if (R256.w[2] == 0xffffffffffffffffull)
1724		  R256.w[3]--;
1725		// this rounding is applied to C2 only!
1726		// x_sign != y_sign
1727		is_midpoint_lt_even = 1;
1728		is_inexact_lt_midpoint = 0;
1729		is_inexact_gt_midpoint = 0;
1730	      } else {
1731		// else MP in [ODD, EVEN]
1732		// this rounding is applied to C2 only!
1733		// x_sign != y_sign
1734		is_midpoint_gt_even = 1;
1735		is_inexact_lt_midpoint = 0;
1736		is_inexact_gt_midpoint = 0;
1737	      }
1738	    }
1739	  } else {	// if (ind == -1) only when x1 = 0
1740	    R256.w[2] = C2_lo;
1741	    R256.w[3] = C2_hi;
1742	    is_midpoint_lt_even = 0;
1743	    is_midpoint_gt_even = 0;
1744	    is_inexact_lt_midpoint = 0;
1745	    is_inexact_gt_midpoint = 0;
1746	  }
1747	  // and now subtract C1 * 10^(e1-e2-x1) - (C2 * 10^(-x1))rnd,P34
1748	  // because x_sign != y_sign this last operation is exact
1749	  C1.w[0] = C1.w[0] - R256.w[2];
1750	  C1.w[1] = C1.w[1] - R256.w[3];
1751	  if (C1.w[0] > tmp64)
1752	    C1.w[1]--;	// borrow
1753	  if (C1.w[1] >= 0x8000000000000000ull) {	// negative coefficient!
1754	    C1.w[0] = ~C1.w[0];
1755	    C1.w[0]++;
1756	    C1.w[1] = ~C1.w[1];
1757	    if (C1.w[0] == 0x0)
1758	      C1.w[1]++;
1759	    tmp_sign = y_sign;	// the result will have the sign of y
1760	  } else {
1761	    tmp_sign = x_sign;
1762	  }
1763	  // the difference has exactly P34 digits
1764	  x_sign = tmp_sign;
1765	  if (x1 >= 1)
1766	    y_exp = y_exp + ((UINT64) x1 << 49);
1767	  C1_hi = C1.w[1];
1768	  C1_lo = C1.w[0];
1769	  // general correction from RN to RA, RM, RP, RZ; result uses y_exp
1770	  if (rnd_mode != ROUNDING_TO_NEAREST) {
1771	    if ((!x_sign
1772		 && ((rnd_mode == ROUNDING_UP && is_inexact_lt_midpoint)
1773		     ||
1774		     ((rnd_mode == ROUNDING_TIES_AWAY
1775		       || rnd_mode == ROUNDING_UP)
1776		      && is_midpoint_gt_even))) || (x_sign
1777						    &&
1778						    ((rnd_mode ==
1779						      ROUNDING_DOWN
1780						      &&
1781						      is_inexact_lt_midpoint)
1782						     ||
1783						     ((rnd_mode ==
1784						       ROUNDING_TIES_AWAY
1785						       || rnd_mode ==
1786						       ROUNDING_DOWN)
1787						      &&
1788						      is_midpoint_gt_even))))
1789	    {
1790	      // C1 = C1 + 1
1791	      C1_lo = C1_lo + 1;
1792	      if (C1_lo == 0) {	// rounding overflow in the low 64 bits
1793		C1_hi = C1_hi + 1;
1794	      }
1795	      if (C1_hi == 0x0001ed09bead87c0ull
1796		  && C1_lo == 0x378d8e6400000000ull) {
1797		// C1 = 10^34 => rounding overflow
1798		C1_hi = 0x0000314dc6448d93ull;
1799		C1_lo = 0x38c15b0a00000000ull;	// 10^33
1800		y_exp = y_exp + EXP_P1;
1801	      }
1802	    } else if ((is_midpoint_lt_even || is_inexact_gt_midpoint)
1803		       &&
1804		       ((x_sign
1805			 && (rnd_mode == ROUNDING_UP
1806			     || rnd_mode == ROUNDING_TO_ZERO))
1807			|| (!x_sign
1808			    && (rnd_mode == ROUNDING_DOWN
1809				|| rnd_mode == ROUNDING_TO_ZERO)))) {
1810	      // C1 = C1 - 1
1811	      C1_lo = C1_lo - 1;
1812	      if (C1_lo == 0xffffffffffffffffull)
1813		C1_hi--;
1814	      // check if we crossed into the lower decade
1815	      if (C1_hi == 0x0000314dc6448d93ull && C1_lo == 0x38c15b09ffffffffull) {	// 10^33 - 1
1816		C1_hi = 0x0001ed09bead87c0ull;	// 10^34 - 1
1817		C1_lo = 0x378d8e63ffffffffull;
1818		y_exp = y_exp - EXP_P1;
1819		// no underflow, because delta + q2 >= P34 + 1
1820	      }
1821	    } else {
1822	      ;	// exact, the result is already correct
1823	    }
1824	  }
1825	  // assemble the result
1826	  res.w[1] = x_sign | y_exp | C1_hi;
1827	  res.w[0] = C1_lo;
1828	}
1829      }	// end delta = P34
1830    } else {	// if (|delta| <= P34 - 1)
1831      if (delta >= 0) {	// if (0 <= delta <= P34 - 1)
1832	if (delta <= P34 - 1 - q2) {
1833	  // calculate C' directly; the result is exact
1834	  // in this case 1<=q1<=P34-1, 1<=q2<=P34-1 and 0 <= e1-e2 <= P34-2
1835	  // The coefficient of the result is C1 * 10^(e1-e2) + C2 and the
1836	  // exponent is e2; either C1 or 10^(e1-e2) may not fit in 64 bits,
1837	  // but their product fits with certainty in 128 bits (actually in 113)
1838	  scale = delta - q1 + q2;	// scale = (int)(e1 >> 49) - (int)(e2 >> 49)
1839
1840	  if (scale >= 20) {	// 10^(e1-e2) does not fit in 64 bits, but C1 does
1841	    __mul_128x64_to_128 (C1, C1_lo, ten2k128[scale - 20]);
1842	    C1_hi = C1.w[1];
1843	    C1_lo = C1.w[0];
1844	  } else if (scale >= 1) {
1845	    // if 1 <= scale <= 19 then 10^(e1-e2) fits in 64 bits
1846	    if (q1 <= 19) {	// C1 fits in 64 bits
1847	      __mul_64x64_to_128MACH (C1, C1_lo, ten2k64[scale]);
1848	    } else {	// q1 >= 20
1849	      C1.w[1] = C1_hi;
1850	      C1.w[0] = C1_lo;
1851	      __mul_128x64_to_128 (C1, ten2k64[scale], C1);
1852	    }
1853	    C1_hi = C1.w[1];
1854	    C1_lo = C1.w[0];
1855	  } else {	// if (scale == 0) C1 is unchanged
1856	    C1.w[0] = C1_lo;	// C1.w[1] = C1_hi;
1857	  }
1858	  // now add C2
1859	  if (x_sign == y_sign) {
1860	    // the result cannot overflow
1861	    C1_lo = C1_lo + C2_lo;
1862	    C1_hi = C1_hi + C2_hi;
1863	    if (C1_lo < C1.w[0])
1864	      C1_hi++;
1865	  } else {	// if x_sign != y_sign
1866	    C1_lo = C1_lo - C2_lo;
1867	    C1_hi = C1_hi - C2_hi;
1868	    if (C1_lo > C1.w[0])
1869	      C1_hi--;
1870	    // the result can be zero, but it cannot overflow
1871	    if (C1_lo == 0 && C1_hi == 0) {
1872	      // assemble the result
1873	      if (x_exp < y_exp)
1874		res.w[1] = x_exp;
1875	      else
1876		res.w[1] = y_exp;
1877	      res.w[0] = 0;
1878	      if (rnd_mode == ROUNDING_DOWN) {
1879		res.w[1] |= 0x8000000000000000ull;
1880	      }
1881	      BID_SWAP128 (res);
1882	      BID_RETURN (res);
1883	    }
1884	    if (C1_hi >= 0x8000000000000000ull) {	// negative coefficient!
1885	      C1_lo = ~C1_lo;
1886	      C1_lo++;
1887	      C1_hi = ~C1_hi;
1888	      if (C1_lo == 0x0)
1889		C1_hi++;
1890	      x_sign = y_sign;	// the result will have the sign of y
1891	    }
1892	  }
1893	  // assemble the result
1894	  res.w[1] = x_sign | y_exp | C1_hi;
1895	  res.w[0] = C1_lo;
1896	} else if (delta == P34 - q2) {
1897	  // calculate C' directly; the result may be inexact if it requires
1898	  // P34+1 decimal digits; in this case the 'cutoff' point for addition
1899	  // is at the position of the lsb of C2, so 0 <= e1-e2 <= P34-1
1900	  // The coefficient of the result is C1 * 10^(e1-e2) + C2 and the
1901	  // exponent is e2; either C1 or 10^(e1-e2) may not fit in 64 bits,
1902	  // but their product fits with certainty in 128 bits (actually in 113)
1903	  scale = delta - q1 + q2;	// scale = (int)(e1 >> 49) - (int)(e2 >> 49)
1904	  if (scale >= 20) {	// 10^(e1-e2) does not fit in 64 bits, but C1 does
1905	    __mul_128x64_to_128 (C1, C1_lo, ten2k128[scale - 20]);
1906	  } else if (scale >= 1) {
1907	    // if 1 <= scale <= 19 then 10^(e1-e2) fits in 64 bits
1908	    if (q1 <= 19) {	// C1 fits in 64 bits
1909	      __mul_64x64_to_128MACH (C1, C1_lo, ten2k64[scale]);
1910	    } else {	// q1 >= 20
1911	      C1.w[1] = C1_hi;
1912	      C1.w[0] = C1_lo;
1913	      __mul_128x64_to_128 (C1, ten2k64[scale], C1);
1914	    }
1915	  } else {	// if (scale == 0) C1 is unchanged
1916	    C1.w[1] = C1_hi;
1917	    C1.w[0] = C1_lo;	// only the low part is necessary
1918	  }
1919	  C1_hi = C1.w[1];
1920	  C1_lo = C1.w[0];
1921	  // now add C2
1922	  if (x_sign == y_sign) {
1923	    // the result can overflow!
1924	    C1_lo = C1_lo + C2_lo;
1925	    C1_hi = C1_hi + C2_hi;
1926	    if (C1_lo < C1.w[0])
1927	      C1_hi++;
1928	    // test for overflow, possible only when C1 >= 10^34
1929	    if (C1_hi > 0x0001ed09bead87c0ull || (C1_hi == 0x0001ed09bead87c0ull && C1_lo >= 0x378d8e6400000000ull)) {	// C1 >= 10^34
1930	      // in this case q = P34 + 1 and x = q - P34 = 1, so multiply
1931	      // C'' = C'+ 5 = C1 + 5 by k1 ~ 10^(-1) calculated for P34 + 1
1932	      // decimal digits
1933	      // Calculate C'' = C' + 1/2 * 10^x
1934	      if (C1_lo >= 0xfffffffffffffffbull) {	// low half add has carry
1935		C1_lo = C1_lo + 5;
1936		C1_hi = C1_hi + 1;
1937	      } else {
1938		C1_lo = C1_lo + 5;
1939	      }
1940	      // the approximation of 10^(-1) was rounded up to 118 bits
1941	      // 10^(-1) =~ 33333333333333333333333333333400 * 2^-129
1942	      // 10^(-1) =~ 19999999999999999999999999999a00 * 2^-128
1943	      C1.w[1] = C1_hi;
1944	      C1.w[0] = C1_lo;	// C''
1945	      ten2m1.w[1] = 0x1999999999999999ull;
1946	      ten2m1.w[0] = 0x9999999999999a00ull;
1947	      __mul_128x128_to_256 (P256, C1, ten2m1);	// P256 = C*, f*
1948	      // C* is actually floor(C*) in this case
1949	      // the top Ex = 128 bits of 10^(-1) are
1950	      // T* = 0x00199999999999999999999999999999
1951	      // if (0 < f* < 10^(-x)) then
1952	      //   if floor(C*) is even then C = floor(C*) - logical right
1953	      //       shift; C has p decimal digits, correct by Prop. 1)
1954	      //   else if floor(C*) is odd C = floor(C*) - 1 (logical right
1955	      //       shift; C has p decimal digits, correct by Pr. 1)
1956	      // else
1957	      //   C = floor(C*) (logical right shift; C has p decimal digits,
1958	      //       correct by Property 1)
1959	      // n = C * 10^(e2+x)
1960	      if ((P256.w[1] || P256.w[0])
1961		  && (P256.w[1] < 0x1999999999999999ull
1962		      || (P256.w[1] == 0x1999999999999999ull
1963			  && P256.w[0] <= 0x9999999999999999ull))) {
1964		// the result is a midpoint
1965		if (P256.w[2] & 0x01) {
1966		  is_midpoint_gt_even = 1;
1967		  // if floor(C*) is odd C = floor(C*) - 1; the result is not 0
1968		  P256.w[2]--;
1969		  if (P256.w[2] == 0xffffffffffffffffull)
1970		    P256.w[3]--;
1971		} else {
1972		  is_midpoint_lt_even = 1;
1973		}
1974	      }
1975	      // n = Cstar * 10^(e2+1)
1976	      y_exp = y_exp + EXP_P1;
1977	      // C* != 10^P because C* has P34 digits
1978	      // check for overflow
1979	      if (y_exp == EXP_MAX_P1
1980		  && (rnd_mode == ROUNDING_TO_NEAREST
1981		      || rnd_mode == ROUNDING_TIES_AWAY)) {
1982		// overflow for RN
1983		res.w[1] = x_sign | 0x7800000000000000ull;	// +/-inf
1984		res.w[0] = 0x0ull;
1985		// set the inexact flag
1986		*pfpsf |= INEXACT_EXCEPTION;
1987		// set the overflow flag
1988		*pfpsf |= OVERFLOW_EXCEPTION;
1989		BID_SWAP128 (res);
1990		BID_RETURN (res);
1991	      }
1992	      // if (0 < f* - 1/2 < 10^(-x)) then
1993	      //   the result of the addition is exact
1994	      // else
1995	      //   the result of the addition is inexact
1996	      if (P256.w[1] > 0x8000000000000000ull || (P256.w[1] == 0x8000000000000000ull && P256.w[0] > 0x0ull)) {	// the result may be exact
1997		tmp64 = P256.w[1] - 0x8000000000000000ull;	// f* - 1/2
1998		if ((tmp64 > 0x1999999999999999ull
1999		     || (tmp64 == 0x1999999999999999ull
2000			 && P256.w[0] >= 0x9999999999999999ull))) {
2001		  // set the inexact flag
2002		  *pfpsf |= INEXACT_EXCEPTION;
2003		  is_inexact = 1;
2004		}	// else the result is exact
2005	      } else {	// the result is inexact
2006		// set the inexact flag
2007		*pfpsf |= INEXACT_EXCEPTION;
2008		is_inexact = 1;
2009	      }
2010	      C1_hi = P256.w[3];
2011	      C1_lo = P256.w[2];
2012	      if (!is_midpoint_gt_even && !is_midpoint_lt_even) {
2013		is_inexact_lt_midpoint = is_inexact
2014		  && (P256.w[1] & 0x8000000000000000ull);
2015		is_inexact_gt_midpoint = is_inexact
2016		  && !(P256.w[1] & 0x8000000000000000ull);
2017	      }
2018	      // general correction from RN to RA, RM, RP, RZ;
2019	      // result uses y_exp
2020	      if (rnd_mode != ROUNDING_TO_NEAREST) {
2021		if ((!x_sign
2022		     &&
2023		     ((rnd_mode == ROUNDING_UP
2024		       && is_inexact_lt_midpoint)
2025		      ||
2026		      ((rnd_mode == ROUNDING_TIES_AWAY
2027			|| rnd_mode == ROUNDING_UP)
2028		       && is_midpoint_gt_even))) || (x_sign
2029						     &&
2030						     ((rnd_mode ==
2031						       ROUNDING_DOWN
2032						       &&
2033						       is_inexact_lt_midpoint)
2034						      ||
2035						      ((rnd_mode ==
2036							ROUNDING_TIES_AWAY
2037							|| rnd_mode ==
2038							ROUNDING_DOWN)
2039						       &&
2040						       is_midpoint_gt_even))))
2041		{
2042		  // C1 = C1 + 1
2043		  C1_lo = C1_lo + 1;
2044		  if (C1_lo == 0) {	// rounding overflow in the low 64 bits
2045		    C1_hi = C1_hi + 1;
2046		  }
2047		  if (C1_hi == 0x0001ed09bead87c0ull
2048		      && C1_lo == 0x378d8e6400000000ull) {
2049		    // C1 = 10^34 => rounding overflow
2050		    C1_hi = 0x0000314dc6448d93ull;
2051		    C1_lo = 0x38c15b0a00000000ull;	// 10^33
2052		    y_exp = y_exp + EXP_P1;
2053		  }
2054		} else
2055		  if ((is_midpoint_lt_even || is_inexact_gt_midpoint)
2056		      &&
2057		      ((x_sign
2058			&& (rnd_mode == ROUNDING_UP
2059			    || rnd_mode == ROUNDING_TO_ZERO))
2060		       || (!x_sign
2061			   && (rnd_mode == ROUNDING_DOWN
2062			       || rnd_mode == ROUNDING_TO_ZERO)))) {
2063		  // C1 = C1 - 1
2064		  C1_lo = C1_lo - 1;
2065		  if (C1_lo == 0xffffffffffffffffull)
2066		    C1_hi--;
2067		  // check if we crossed into the lower decade
2068		  if (C1_hi == 0x0000314dc6448d93ull && C1_lo == 0x38c15b09ffffffffull) {	// 10^33 - 1
2069		    C1_hi = 0x0001ed09bead87c0ull;	// 10^34 - 1
2070		    C1_lo = 0x378d8e63ffffffffull;
2071		    y_exp = y_exp - EXP_P1;
2072		    // no underflow, because delta + q2 >= P34 + 1
2073		  }
2074		} else {
2075		  ;	// exact, the result is already correct
2076		}
2077		// in all cases check for overflow (RN and RA solved already)
2078		if (y_exp == EXP_MAX_P1) {	// overflow
2079		  if ((rnd_mode == ROUNDING_DOWN && x_sign) ||	// RM and res < 0
2080		      (rnd_mode == ROUNDING_UP && !x_sign)) {	// RP and res > 0
2081		    C1_hi = 0x7800000000000000ull;	// +inf
2082		    C1_lo = 0x0ull;
2083		  } else {	// RM and res > 0, RP and res < 0, or RZ
2084		    C1_hi = 0x5fffed09bead87c0ull;
2085		    C1_lo = 0x378d8e63ffffffffull;
2086		  }
2087		  y_exp = 0;	// x_sign is preserved
2088		  // set the inexact flag (in case the exact addition was exact)
2089		  *pfpsf |= INEXACT_EXCEPTION;
2090		  // set the overflow flag
2091		  *pfpsf |= OVERFLOW_EXCEPTION;
2092		}
2093	      }
2094	    }	// else if (C1 < 10^34) then C1 is the coeff.; the result is exact
2095	  } else {	// if x_sign != y_sign the result is exact
2096	    C1_lo = C1_lo - C2_lo;
2097	    C1_hi = C1_hi - C2_hi;
2098	    if (C1_lo > C1.w[0])
2099	      C1_hi--;
2100	    // the result can be zero, but it cannot overflow
2101	    if (C1_lo == 0 && C1_hi == 0) {
2102	      // assemble the result
2103	      if (x_exp < y_exp)
2104		res.w[1] = x_exp;
2105	      else
2106		res.w[1] = y_exp;
2107	      res.w[0] = 0;
2108	      if (rnd_mode == ROUNDING_DOWN) {
2109		res.w[1] |= 0x8000000000000000ull;
2110	      }
2111	      BID_SWAP128 (res);
2112	      BID_RETURN (res);
2113	    }
2114	    if (C1_hi >= 0x8000000000000000ull) {	// negative coefficient!
2115	      C1_lo = ~C1_lo;
2116	      C1_lo++;
2117	      C1_hi = ~C1_hi;
2118	      if (C1_lo == 0x0)
2119		C1_hi++;
2120	      x_sign = y_sign;	// the result will have the sign of y
2121	    }
2122	  }
2123	  // assemble the result
2124	  res.w[1] = x_sign | y_exp | C1_hi;
2125	  res.w[0] = C1_lo;
2126	} else {	// if (delta >= P34 + 1 - q2)
2127	  // instead of C' = (C1 * 10^(e1-e2) + C2)rnd,P34
2128	  // calculate C' = C1 * 10^(e1-e2-x1) + (C2 * 10^(-x1))rnd,P34
2129	  // where x1 = q1 + e1 - e2 - P34, 1 <= x1 <= P34 - 1
2130	  // In most cases C' will have P34 digits, and n = C' * 10^(e2+x1)
2131	  // If the result has P34+1 digits, redo the steps above with x1+1
2132	  // If the result has P34-1 digits or less, redo the steps above with
2133	  // x1-1 but only if initially x1 >= 1
2134	  // NOTE: these two steps can be improved, e.g we could guess if
2135	  // P34+1 or P34-1 digits will be obtained by adding/subtracting just
2136	  // the top 64 bits of the two operands
2137	  // The result cannot be zero, but it can overflow
2138	  x1 = delta + q2 - P34;	// 1 <= x1 <= P34-1
2139	roundC2:
2140	  // Calculate C1 * 10^(e1-e2-x1) where 0 <= e1-e2-x1 <= P34 - 1
2141	  // scale = (int)(e1 >> 49) - (int)(e2 >> 49) - x1; 0 <= scale <= P34-1
2142	  scale = delta - q1 + q2 - x1;	// scale = e1 - e2 - x1 = P34 - q1
2143	  // either C1 or 10^(e1-e2-x1) may not fit in 64 bits,
2144	  // but their product fits with certainty in 128 bits (actually in 113)
2145	  if (scale >= 20) {	//10^(e1-e2-x1) doesn't fit in 64 bits, but C1 does
2146	    __mul_128x64_to_128 (C1, C1_lo, ten2k128[scale - 20]);
2147	  } else if (scale >= 1) {
2148	    // if 1 <= scale <= 19 then 10^(e1-e2-x1) fits in 64 bits
2149	    if (q1 <= 19) {	// C1 fits in 64 bits
2150	      __mul_64x64_to_128MACH (C1, C1_lo, ten2k64[scale]);
2151	    } else {	// q1 >= 20
2152	      C1.w[1] = C1_hi;
2153	      C1.w[0] = C1_lo;
2154	      __mul_128x64_to_128 (C1, ten2k64[scale], C1);
2155	    }
2156	  } else {	// if (scale == 0) C1 is unchanged
2157	    C1.w[1] = C1_hi;
2158	    C1.w[0] = C1_lo;
2159	  }
2160	  tmp64 = C1.w[0];	// C1.w[1], C1.w[0] contains C1 * 10^(e1-e2-x1)
2161
2162	  // now round C2 to q2-x1 decimal digits, where 1<=x1<=q2-1<=P34-1
2163	  // (but if we got here a second time after x1 = x1 - 1, then
2164	  // x1 >= 0; note that for x1 = 0 C2 is unchanged)
2165	  // C2' = C2 + 1/2 * 10^x1 = C2 + 5 * 10^(x1-1)
2166	  ind = x1 - 1;	// 0 <= ind <= q2-2<=P34-2=32; but note that if x1 = 0
2167	  // during a second pass, then ind = -1
2168	  if (ind >= 0) {	// if (x1 >= 1)
2169	    C2.w[0] = C2_lo;
2170	    C2.w[1] = C2_hi;
2171	    if (ind <= 18) {
2172	      C2.w[0] = C2.w[0] + midpoint64[ind];
2173	      if (C2.w[0] < C2_lo)
2174		C2.w[1]++;
2175	    } else {	// 19 <= ind <= 32
2176	      C2.w[0] = C2.w[0] + midpoint128[ind - 19].w[0];
2177	      C2.w[1] = C2.w[1] + midpoint128[ind - 19].w[1];
2178	      if (C2.w[0] < C2_lo)
2179		C2.w[1]++;
2180	    }
2181	    // the approximation of 10^(-x1) was rounded up to 118 bits
2182	    __mul_128x128_to_256 (R256, C2, ten2mk128[ind]);	// R256 = C2*, f2*
2183	    // calculate C2* and f2*
2184	    // C2* is actually floor(C2*) in this case
2185	    // C2* and f2* need shifting and masking, as shown by
2186	    // shiftright128[] and maskhigh128[]
2187	    // the top Ex bits of 10^(-x1) are T* = ten2mk128trunc[ind], e.g.
2188	    // if x1=1, T*=ten2mk128trunc[0]=0x19999999999999999999999999999999
2189	    // if (0 < f2* < 10^(-x1)) then
2190	    //   if floor(C1+C2*) is even then C2* = floor(C2*) - logical right
2191	    //       shift; C2* has p decimal digits, correct by Prop. 1)
2192	    //   else if floor(C1+C2*) is odd C2* = floor(C2*)-1 (logical right
2193	    //       shift; C2* has p decimal digits, correct by Pr. 1)
2194	    // else
2195	    //   C2* = floor(C2*) (logical right shift; C has p decimal digits,
2196	    //       correct by Property 1)
2197	    // n = C2* * 10^(e2+x1)
2198
2199	    if (ind <= 2) {
2200	      highf2star.w[1] = 0x0;
2201	      highf2star.w[0] = 0x0;	// low f2* ok
2202	    } else if (ind <= 21) {
2203	      highf2star.w[1] = 0x0;
2204	      highf2star.w[0] = R256.w[2] & maskhigh128[ind];	// low f2* ok
2205	    } else {
2206	      highf2star.w[1] = R256.w[3] & maskhigh128[ind];
2207	      highf2star.w[0] = R256.w[2];	// low f2* is ok
2208	    }
2209	    // shift right C2* by Ex-128 = shiftright128[ind]
2210	    if (ind >= 3) {
2211	      shift = shiftright128[ind];
2212	      if (shift < 64) {	// 3 <= shift <= 63
2213		R256.w[2] =
2214		  (R256.w[2] >> shift) | (R256.w[3] << (64 - shift));
2215		R256.w[3] = (R256.w[3] >> shift);
2216	      } else {	// 66 <= shift <= 102
2217		R256.w[2] = (R256.w[3] >> (shift - 64));
2218		R256.w[3] = 0x0ULL;
2219	      }
2220	    }
2221	    if (second_pass) {
2222	      is_inexact_lt_midpoint = 0;
2223	      is_inexact_gt_midpoint = 0;
2224	      is_midpoint_lt_even = 0;
2225	      is_midpoint_gt_even = 0;
2226	    }
2227	    // determine inexactness of the rounding of C2* (this may be
2228	    // followed by a second rounding only if we get P34+1
2229	    // decimal digits)
2230	    // if (0 < f2* - 1/2 < 10^(-x1)) then
2231	    //   the result is exact
2232	    // else (if f2* - 1/2 > T* then)
2233	    //   the result is inexact
2234	    if (ind <= 2) {
2235	      if (R256.w[1] > 0x8000000000000000ull ||
2236		  (R256.w[1] == 0x8000000000000000ull
2237		   && R256.w[0] > 0x0ull)) {
2238		// f2* > 1/2 and the result may be exact
2239		tmp64A = R256.w[1] - 0x8000000000000000ull;	// f* - 1/2
2240		if ((tmp64A > ten2mk128trunc[ind].w[1]
2241		     || (tmp64A == ten2mk128trunc[ind].w[1]
2242			 && R256.w[0] >= ten2mk128trunc[ind].w[0]))) {
2243		  // set the inexact flag
2244		  // *pfpsf |= INEXACT_EXCEPTION;
2245		  tmp_inexact = 1;	// may be set again during a second pass
2246		  // this rounding is applied to C2 only!
2247		  if (x_sign == y_sign)
2248		    is_inexact_lt_midpoint = 1;
2249		  else	// if (x_sign != y_sign)
2250		    is_inexact_gt_midpoint = 1;
2251		}	// else the result is exact
2252		// rounding down, unless a midpoint in [ODD, EVEN]
2253	      } else {	// the result is inexact; f2* <= 1/2
2254		// set the inexact flag
2255		// *pfpsf |= INEXACT_EXCEPTION;
2256		tmp_inexact = 1;	// just in case we will round a second time
2257		// rounding up, unless a midpoint in [EVEN, ODD]
2258		// this rounding is applied to C2 only!
2259		if (x_sign == y_sign)
2260		  is_inexact_gt_midpoint = 1;
2261		else	// if (x_sign != y_sign)
2262		  is_inexact_lt_midpoint = 1;
2263	      }
2264	    } else if (ind <= 21) {	// if 3 <= ind <= 21
2265	      if (highf2star.w[1] > 0x0 || (highf2star.w[1] == 0x0
2266					    && highf2star.w[0] >
2267					    onehalf128[ind])
2268		  || (highf2star.w[1] == 0x0
2269		      && highf2star.w[0] == onehalf128[ind]
2270		      && (R256.w[1] || R256.w[0]))) {
2271		// f2* > 1/2 and the result may be exact
2272		// Calculate f2* - 1/2
2273		tmp64A = highf2star.w[0] - onehalf128[ind];
2274		tmp64B = highf2star.w[1];
2275		if (tmp64A > highf2star.w[0])
2276		  tmp64B--;
2277		if (tmp64B || tmp64A
2278		    || R256.w[1] > ten2mk128trunc[ind].w[1]
2279		    || (R256.w[1] == ten2mk128trunc[ind].w[1]
2280			&& R256.w[0] > ten2mk128trunc[ind].w[0])) {
2281		  // set the inexact flag
2282		  // *pfpsf |= INEXACT_EXCEPTION;
2283		  tmp_inexact = 1;	// may be set again during a second pass
2284		  // this rounding is applied to C2 only!
2285		  if (x_sign == y_sign)
2286		    is_inexact_lt_midpoint = 1;
2287		  else	// if (x_sign != y_sign)
2288		    is_inexact_gt_midpoint = 1;
2289		}	// else the result is exact
2290	      } else {	// the result is inexact; f2* <= 1/2
2291		// set the inexact flag
2292		// *pfpsf |= INEXACT_EXCEPTION;
2293		tmp_inexact = 1;	// may be set again during a second pass
2294		// rounding up, unless a midpoint in [EVEN, ODD]
2295		// this rounding is applied to C2 only!
2296		if (x_sign == y_sign)
2297		  is_inexact_gt_midpoint = 1;
2298		else	// if (x_sign != y_sign)
2299		  is_inexact_lt_midpoint = 1;
2300	      }
2301	    } else {	// if 22 <= ind <= 33
2302	      if (highf2star.w[1] > onehalf128[ind]
2303		  || (highf2star.w[1] == onehalf128[ind]
2304		      && (highf2star.w[0] || R256.w[1]
2305			  || R256.w[0]))) {
2306		// f2* > 1/2 and the result may be exact
2307		// Calculate f2* - 1/2
2308		// tmp64A = highf2star.w[0];
2309		tmp64B = highf2star.w[1] - onehalf128[ind];
2310		if (tmp64B || highf2star.w[0]
2311		    || R256.w[1] > ten2mk128trunc[ind].w[1]
2312		    || (R256.w[1] == ten2mk128trunc[ind].w[1]
2313			&& R256.w[0] > ten2mk128trunc[ind].w[0])) {
2314		  // set the inexact flag
2315		  // *pfpsf |= INEXACT_EXCEPTION;
2316		  tmp_inexact = 1;	// may be set again during a second pass
2317		  // this rounding is applied to C2 only!
2318		  if (x_sign == y_sign)
2319		    is_inexact_lt_midpoint = 1;
2320		  else	// if (x_sign != y_sign)
2321		    is_inexact_gt_midpoint = 1;
2322		}	// else the result is exact
2323	      } else {	// the result is inexact; f2* <= 1/2
2324		// set the inexact flag
2325		// *pfpsf |= INEXACT_EXCEPTION;
2326		tmp_inexact = 1;	// may be set again during a second pass
2327		// rounding up, unless a midpoint in [EVEN, ODD]
2328		// this rounding is applied to C2 only!
2329		if (x_sign == y_sign)
2330		  is_inexact_gt_midpoint = 1;
2331		else	// if (x_sign != y_sign)
2332		  is_inexact_lt_midpoint = 1;
2333	      }
2334	    }
2335	    // check for midpoints
2336	    if ((R256.w[1] || R256.w[0]) && (highf2star.w[1] == 0)
2337		&& (highf2star.w[0] == 0)
2338		&& (R256.w[1] < ten2mk128trunc[ind].w[1]
2339		    || (R256.w[1] == ten2mk128trunc[ind].w[1]
2340			&& R256.w[0] <= ten2mk128trunc[ind].w[0]))) {
2341	      // the result is a midpoint
2342	      if ((tmp64 + R256.w[2]) & 0x01) {	// MP in [EVEN, ODD]
2343		// if floor(C2*) is odd C = floor(C2*) - 1; the result may be 0
2344		R256.w[2]--;
2345		if (R256.w[2] == 0xffffffffffffffffull)
2346		  R256.w[3]--;
2347		// this rounding is applied to C2 only!
2348		if (x_sign == y_sign)
2349		  is_midpoint_gt_even = 1;
2350		else	// if (x_sign != y_sign)
2351		  is_midpoint_lt_even = 1;
2352		is_inexact_lt_midpoint = 0;
2353		is_inexact_gt_midpoint = 0;
2354	      } else {
2355		// else MP in [ODD, EVEN]
2356		// this rounding is applied to C2 only!
2357		if (x_sign == y_sign)
2358		  is_midpoint_lt_even = 1;
2359		else	// if (x_sign != y_sign)
2360		  is_midpoint_gt_even = 1;
2361		is_inexact_lt_midpoint = 0;
2362		is_inexact_gt_midpoint = 0;
2363	      }
2364	    }
2365	    // end if (ind >= 0)
2366	  } else {	// if (ind == -1); only during a 2nd pass, and when x1 = 0
2367	    R256.w[2] = C2_lo;
2368	    R256.w[3] = C2_hi;
2369	    tmp_inexact = 0;
2370	    // to correct a possible setting to 1 from 1st pass
2371	    if (second_pass) {
2372	      is_midpoint_lt_even = 0;
2373	      is_midpoint_gt_even = 0;
2374	      is_inexact_lt_midpoint = 0;
2375	      is_inexact_gt_midpoint = 0;
2376	    }
2377	  }
2378	  // and now add/subtract C1 * 10^(e1-e2-x1) +/- (C2 * 10^(-x1))rnd,P34
2379	  if (x_sign == y_sign) {	// addition; could overflow
2380	    // no second pass is possible this way (only for x_sign != y_sign)
2381	    C1.w[0] = C1.w[0] + R256.w[2];
2382	    C1.w[1] = C1.w[1] + R256.w[3];
2383	    if (C1.w[0] < tmp64)
2384	      C1.w[1]++;	// carry
2385	    // if the sum has P34+1 digits, i.e. C1>=10^34 redo the calculation
2386	    // with x1=x1+1
2387	    if (C1.w[1] > 0x0001ed09bead87c0ull || (C1.w[1] == 0x0001ed09bead87c0ull && C1.w[0] >= 0x378d8e6400000000ull)) {	// C1 >= 10^34
2388	      // chop off one more digit from the sum, but make sure there is
2389	      // no double-rounding error (see table - double rounding logic)
2390	      // now round C1 from P34+1 to P34 decimal digits
2391	      // C1' = C1 + 1/2 * 10 = C1 + 5
2392	      if (C1.w[0] >= 0xfffffffffffffffbull) {	// low half add has carry
2393		C1.w[0] = C1.w[0] + 5;
2394		C1.w[1] = C1.w[1] + 1;
2395	      } else {
2396		C1.w[0] = C1.w[0] + 5;
2397	      }
2398	      // the approximation of 10^(-1) was rounded up to 118 bits
2399	      __mul_128x128_to_256 (Q256, C1, ten2mk128[0]);	// Q256 = C1*, f1*
2400	      // C1* is actually floor(C1*) in this case
2401	      // the top 128 bits of 10^(-1) are
2402	      // T* = ten2mk128trunc[0]=0x19999999999999999999999999999999
2403	      // if (0 < f1* < 10^(-1)) then
2404	      //   if floor(C1*) is even then C1* = floor(C1*) - logical right
2405	      //       shift; C1* has p decimal digits, correct by Prop. 1)
2406	      //   else if floor(C1*) is odd C1* = floor(C1*) - 1 (logical right
2407	      //       shift; C1* has p decimal digits, correct by Pr. 1)
2408	      // else
2409	      //   C1* = floor(C1*) (logical right shift; C has p decimal digits
2410	      //       correct by Property 1)
2411	      // n = C1* * 10^(e2+x1+1)
2412	      if ((Q256.w[1] || Q256.w[0])
2413		  && (Q256.w[1] < ten2mk128trunc[0].w[1]
2414		      || (Q256.w[1] == ten2mk128trunc[0].w[1]
2415			  && Q256.w[0] <= ten2mk128trunc[0].w[0]))) {
2416		// the result is a midpoint
2417		if (is_inexact_lt_midpoint) {	// for the 1st rounding
2418		  is_inexact_gt_midpoint = 1;
2419		  is_inexact_lt_midpoint = 0;
2420		  is_midpoint_gt_even = 0;
2421		  is_midpoint_lt_even = 0;
2422		} else if (is_inexact_gt_midpoint) {	// for the 1st rounding
2423		  Q256.w[2]--;
2424		  if (Q256.w[2] == 0xffffffffffffffffull)
2425		    Q256.w[3]--;
2426		  is_inexact_gt_midpoint = 0;
2427		  is_inexact_lt_midpoint = 1;
2428		  is_midpoint_gt_even = 0;
2429		  is_midpoint_lt_even = 0;
2430		} else if (is_midpoint_gt_even) {	// for the 1st rounding
2431		  // Note: cannot have is_midpoint_lt_even
2432		  is_inexact_gt_midpoint = 0;
2433		  is_inexact_lt_midpoint = 1;
2434		  is_midpoint_gt_even = 0;
2435		  is_midpoint_lt_even = 0;
2436		} else {	// the first rounding must have been exact
2437		  if (Q256.w[2] & 0x01) {	// MP in [EVEN, ODD]
2438		    // the truncated result is correct
2439		    Q256.w[2]--;
2440		    if (Q256.w[2] == 0xffffffffffffffffull)
2441		      Q256.w[3]--;
2442		    is_inexact_gt_midpoint = 0;
2443		    is_inexact_lt_midpoint = 0;
2444		    is_midpoint_gt_even = 1;
2445		    is_midpoint_lt_even = 0;
2446		  } else {	// MP in [ODD, EVEN]
2447		    is_inexact_gt_midpoint = 0;
2448		    is_inexact_lt_midpoint = 0;
2449		    is_midpoint_gt_even = 0;
2450		    is_midpoint_lt_even = 1;
2451		  }
2452		}
2453		tmp_inexact = 1;	// in all cases
2454	      } else {	// the result is not a midpoint
2455		// determine inexactness of the rounding of C1 (the sum C1+C2*)
2456		// if (0 < f1* - 1/2 < 10^(-1)) then
2457		//   the result is exact
2458		// else (if f1* - 1/2 > T* then)
2459		//   the result is inexact
2460		// ind = 0
2461		if (Q256.w[1] > 0x8000000000000000ull
2462		    || (Q256.w[1] == 0x8000000000000000ull
2463			&& Q256.w[0] > 0x0ull)) {
2464		  // f1* > 1/2 and the result may be exact
2465		  Q256.w[1] = Q256.w[1] - 0x8000000000000000ull;	// f1* - 1/2
2466		  if ((Q256.w[1] > ten2mk128trunc[0].w[1]
2467		       || (Q256.w[1] == ten2mk128trunc[0].w[1]
2468			   && Q256.w[0] > ten2mk128trunc[0].w[0]))) {
2469		    is_inexact_gt_midpoint = 0;
2470		    is_inexact_lt_midpoint = 1;
2471		    is_midpoint_gt_even = 0;
2472		    is_midpoint_lt_even = 0;
2473		    // set the inexact flag
2474		    tmp_inexact = 1;
2475		    // *pfpsf |= INEXACT_EXCEPTION;
2476		  } else {	// else the result is exact for the 2nd rounding
2477		    if (tmp_inexact) {	// if the previous rounding was inexact
2478		      if (is_midpoint_lt_even) {
2479			is_inexact_gt_midpoint = 1;
2480			is_midpoint_lt_even = 0;
2481		      } else if (is_midpoint_gt_even) {
2482			is_inexact_lt_midpoint = 1;
2483			is_midpoint_gt_even = 0;
2484		      } else {
2485			;	// no change
2486		      }
2487		    }
2488		  }
2489		  // rounding down, unless a midpoint in [ODD, EVEN]
2490		} else {	// the result is inexact; f1* <= 1/2
2491		  is_inexact_gt_midpoint = 1;
2492		  is_inexact_lt_midpoint = 0;
2493		  is_midpoint_gt_even = 0;
2494		  is_midpoint_lt_even = 0;
2495		  // set the inexact flag
2496		  tmp_inexact = 1;
2497		  // *pfpsf |= INEXACT_EXCEPTION;
2498		}
2499	      }	// end 'the result is not a midpoint'
2500	      // n = C1 * 10^(e2+x1)
2501	      C1.w[1] = Q256.w[3];
2502	      C1.w[0] = Q256.w[2];
2503	      y_exp = y_exp + ((UINT64) (x1 + 1) << 49);
2504	    } else {	// C1 < 10^34
2505	      // C1.w[1] and C1.w[0] already set
2506	      // n = C1 * 10^(e2+x1)
2507	      y_exp = y_exp + ((UINT64) x1 << 49);
2508	    }
2509	    // check for overflow
2510	    if (y_exp == EXP_MAX_P1
2511		&& (rnd_mode == ROUNDING_TO_NEAREST
2512		    || rnd_mode == ROUNDING_TIES_AWAY)) {
2513	      res.w[1] = 0x7800000000000000ull | x_sign;	// +/-inf
2514	      res.w[0] = 0x0ull;
2515	      // set the inexact flag
2516	      *pfpsf |= INEXACT_EXCEPTION;
2517	      // set the overflow flag
2518	      *pfpsf |= OVERFLOW_EXCEPTION;
2519	      BID_SWAP128 (res);
2520	      BID_RETURN (res);
2521	    }	// else no overflow
2522	  } else {	// if x_sign != y_sign the result of this subtract. is exact
2523	    C1.w[0] = C1.w[0] - R256.w[2];
2524	    C1.w[1] = C1.w[1] - R256.w[3];
2525	    if (C1.w[0] > tmp64)
2526	      C1.w[1]--;	// borrow
2527	    if (C1.w[1] >= 0x8000000000000000ull) {	// negative coefficient!
2528	      C1.w[0] = ~C1.w[0];
2529	      C1.w[0]++;
2530	      C1.w[1] = ~C1.w[1];
2531	      if (C1.w[0] == 0x0)
2532		C1.w[1]++;
2533	      tmp_sign = y_sign;
2534	      // the result will have the sign of y if last rnd
2535	    } else {
2536	      tmp_sign = x_sign;
2537	    }
2538	    // if the difference has P34-1 digits or less, i.e. C1 < 10^33 then
2539	    //   redo the calculation with x1=x1-1;
2540	    // redo the calculation also if C1 = 10^33 and
2541	    //   (is_inexact_gt_midpoint or is_midpoint_lt_even);
2542	    //   (the last part should have really been
2543	    //   (is_inexact_lt_midpoint or is_midpoint_gt_even) from
2544	    //    the rounding of C2, but the position flags have been reversed)
2545	    // 10^33 = 0x0000314dc6448d93 0x38c15b0a00000000
2546	    if ((C1.w[1] < 0x0000314dc6448d93ull || (C1.w[1] == 0x0000314dc6448d93ull && C1.w[0] < 0x38c15b0a00000000ull)) || (C1.w[1] == 0x0000314dc6448d93ull && C1.w[0] == 0x38c15b0a00000000ull && (is_inexact_gt_midpoint || is_midpoint_lt_even))) {	// C1=10^33
2547	      x1 = x1 - 1;	// x1 >= 0
2548	      if (x1 >= 0) {
2549		// clear position flags and tmp_inexact
2550		is_midpoint_lt_even = 0;
2551		is_midpoint_gt_even = 0;
2552		is_inexact_lt_midpoint = 0;
2553		is_inexact_gt_midpoint = 0;
2554		tmp_inexact = 0;
2555		second_pass = 1;
2556		goto roundC2;	// else result has less than P34 digits
2557	      }
2558	    }
2559	    // if the coefficient of the result is 10^34 it means that this
2560	    // must be the second pass, and we are done
2561	    if (C1.w[1] == 0x0001ed09bead87c0ull && C1.w[0] == 0x378d8e6400000000ull) {	// if  C1 = 10^34
2562	      C1.w[1] = 0x0000314dc6448d93ull;	// C1 = 10^33
2563	      C1.w[0] = 0x38c15b0a00000000ull;
2564	      y_exp = y_exp + ((UINT64) 1 << 49);
2565	    }
2566	    x_sign = tmp_sign;
2567	    if (x1 >= 1)
2568	      y_exp = y_exp + ((UINT64) x1 << 49);
2569	    // x1 = -1 is possible at the end of a second pass when the
2570	    // first pass started with x1 = 1
2571	  }
2572	  C1_hi = C1.w[1];
2573	  C1_lo = C1.w[0];
2574	  // general correction from RN to RA, RM, RP, RZ; result uses y_exp
2575	  if (rnd_mode != ROUNDING_TO_NEAREST) {
2576	    if ((!x_sign
2577		 && ((rnd_mode == ROUNDING_UP && is_inexact_lt_midpoint)
2578		     ||
2579		     ((rnd_mode == ROUNDING_TIES_AWAY
2580		       || rnd_mode == ROUNDING_UP)
2581		      && is_midpoint_gt_even))) || (x_sign
2582						    &&
2583						    ((rnd_mode ==
2584						      ROUNDING_DOWN
2585						      &&
2586						      is_inexact_lt_midpoint)
2587						     ||
2588						     ((rnd_mode ==
2589						       ROUNDING_TIES_AWAY
2590						       || rnd_mode ==
2591						       ROUNDING_DOWN)
2592						      &&
2593						      is_midpoint_gt_even))))
2594	    {
2595	      // C1 = C1 + 1
2596	      C1_lo = C1_lo + 1;
2597	      if (C1_lo == 0) {	// rounding overflow in the low 64 bits
2598		C1_hi = C1_hi + 1;
2599	      }
2600	      if (C1_hi == 0x0001ed09bead87c0ull
2601		  && C1_lo == 0x378d8e6400000000ull) {
2602		// C1 = 10^34 => rounding overflow
2603		C1_hi = 0x0000314dc6448d93ull;
2604		C1_lo = 0x38c15b0a00000000ull;	// 10^33
2605		y_exp = y_exp + EXP_P1;
2606	      }
2607	    } else if ((is_midpoint_lt_even || is_inexact_gt_midpoint)
2608		       &&
2609		       ((x_sign
2610			 && (rnd_mode == ROUNDING_UP
2611			     || rnd_mode == ROUNDING_TO_ZERO))
2612			|| (!x_sign
2613			    && (rnd_mode == ROUNDING_DOWN
2614				|| rnd_mode == ROUNDING_TO_ZERO)))) {
2615	      // C1 = C1 - 1
2616	      C1_lo = C1_lo - 1;
2617	      if (C1_lo == 0xffffffffffffffffull)
2618		C1_hi--;
2619	      // check if we crossed into the lower decade
2620	      if (C1_hi == 0x0000314dc6448d93ull && C1_lo == 0x38c15b09ffffffffull) {	// 10^33 - 1
2621		C1_hi = 0x0001ed09bead87c0ull;	// 10^34 - 1
2622		C1_lo = 0x378d8e63ffffffffull;
2623		y_exp = y_exp - EXP_P1;
2624		// no underflow, because delta + q2 >= P34 + 1
2625	      }
2626	    } else {
2627	      ;	// exact, the result is already correct
2628	    }
2629	    // in all cases check for overflow (RN and RA solved already)
2630	    if (y_exp == EXP_MAX_P1) {	// overflow
2631	      if ((rnd_mode == ROUNDING_DOWN && x_sign) ||	// RM and res < 0
2632		  (rnd_mode == ROUNDING_UP && !x_sign)) {	// RP and res > 0
2633		C1_hi = 0x7800000000000000ull;	// +inf
2634		C1_lo = 0x0ull;
2635	      } else {	// RM and res > 0, RP and res < 0, or RZ
2636		C1_hi = 0x5fffed09bead87c0ull;
2637		C1_lo = 0x378d8e63ffffffffull;
2638	      }
2639	      y_exp = 0;	// x_sign is preserved
2640	      // set the inexact flag (in case the exact addition was exact)
2641	      *pfpsf |= INEXACT_EXCEPTION;
2642	      // set the overflow flag
2643	      *pfpsf |= OVERFLOW_EXCEPTION;
2644	    }
2645	  }
2646	  // assemble the result
2647	  res.w[1] = x_sign | y_exp | C1_hi;
2648	  res.w[0] = C1_lo;
2649	  if (tmp_inexact)
2650	    *pfpsf |= INEXACT_EXCEPTION;
2651	}
2652      } else {	// if (-P34 + 1 <= delta <= -1) <=> 1 <= -delta <= P34 - 1
2653	// NOTE: the following, up to "} else { // if x_sign != y_sign
2654	// the result is exact" is identical to "else if (delta == P34 - q2) {"
2655	// from above; also, the code is not symmetric: a+b and b+a may take
2656	// different paths (need to unify eventually!)
2657	// calculate C' = C2 + C1 * 10^(e1-e2) directly; the result may be
2658	// inexact if it requires P34 + 1 decimal digits; in either case the
2659	// 'cutoff' point for addition is at the position of the lsb of C2
2660	// The coefficient of the result is C1 * 10^(e1-e2) + C2 and the
	// exponent is e2; either C1 or 10^(e1-e2) may not fit in 64 bits,
2662	// but their product fits with certainty in 128 bits (actually in 113)
2663	// Note that 0 <= e1 - e2 <= P34 - 2
2664	//   -P34 + 1 <= delta <= -1 <=> -P34 + 1 <= delta <= -1 <=>
2665	//   -P34 + 1 <= q1 + e1 - q2 - e2 <= -1 <=>
2666	//   q2 - q1 - P34 + 1 <= e1 - e2 <= q2 - q1 - 1 <=>
2667	//   1 - P34 - P34 + 1 <= e1-e2 <= P34 - 1 - 1 => 0 <= e1-e2 <= P34 - 2
2668	scale = delta - q1 + q2;	// scale = (int)(e1 >> 49) - (int)(e2 >> 49)
2669	if (scale >= 20) {	// 10^(e1-e2) does not fit in 64 bits, but C1 does
2670	  __mul_128x64_to_128 (C1, C1_lo, ten2k128[scale - 20]);
2671	} else if (scale >= 1) {
2672	  // if 1 <= scale <= 19 then 10^(e1-e2) fits in 64 bits
2673	  if (q1 <= 19) {	// C1 fits in 64 bits
2674	    __mul_64x64_to_128MACH (C1, C1_lo, ten2k64[scale]);
2675	  } else {	// q1 >= 20
2676	    C1.w[1] = C1_hi;
2677	    C1.w[0] = C1_lo;
2678	    __mul_128x64_to_128 (C1, ten2k64[scale], C1);
2679	  }
2680	} else {	// if (scale == 0) C1 is unchanged
2681	  C1.w[1] = C1_hi;
2682	  C1.w[0] = C1_lo;	// only the low part is necessary
2683	}
2684	C1_hi = C1.w[1];
2685	C1_lo = C1.w[0];
2686	// now add C2
2687	if (x_sign == y_sign) {
2688	  // the result can overflow!
2689	  C1_lo = C1_lo + C2_lo;
2690	  C1_hi = C1_hi + C2_hi;
2691	  if (C1_lo < C1.w[0])
2692	    C1_hi++;
2693	  // test for overflow, possible only when C1 >= 10^34
2694	  if (C1_hi > 0x0001ed09bead87c0ull || (C1_hi == 0x0001ed09bead87c0ull && C1_lo >= 0x378d8e6400000000ull)) {	// C1 >= 10^34
2695	    // in this case q = P34 + 1 and x = q - P34 = 1, so multiply
2696	    // C'' = C'+ 5 = C1 + 5 by k1 ~ 10^(-1) calculated for P34 + 1
2697	    // decimal digits
2698	    // Calculate C'' = C' + 1/2 * 10^x
2699	    if (C1_lo >= 0xfffffffffffffffbull) {	// low half add has carry
2700	      C1_lo = C1_lo + 5;
2701	      C1_hi = C1_hi + 1;
2702	    } else {
2703	      C1_lo = C1_lo + 5;
2704	    }
2705	    // the approximation of 10^(-1) was rounded up to 118 bits
2706	    // 10^(-1) =~ 33333333333333333333333333333400 * 2^-129
2707	    // 10^(-1) =~ 19999999999999999999999999999a00 * 2^-128
2708	    C1.w[1] = C1_hi;
2709	    C1.w[0] = C1_lo;	// C''
2710	    ten2m1.w[1] = 0x1999999999999999ull;
2711	    ten2m1.w[0] = 0x9999999999999a00ull;
2712	    __mul_128x128_to_256 (P256, C1, ten2m1);	// P256 = C*, f*
2713	    // C* is actually floor(C*) in this case
2714	    // the top Ex = 128 bits of 10^(-1) are
2715	    // T* = 0x00199999999999999999999999999999
2716	    // if (0 < f* < 10^(-x)) then
2717	    //   if floor(C*) is even then C = floor(C*) - logical right
2718	    //       shift; C has p decimal digits, correct by Prop. 1)
2719	    //   else if floor(C*) is odd C = floor(C*) - 1 (logical right
2720	    //       shift; C has p decimal digits, correct by Pr. 1)
2721	    // else
2722	    //   C = floor(C*) (logical right shift; C has p decimal digits,
2723	    //       correct by Property 1)
2724	    // n = C * 10^(e2+x)
2725	    if ((P256.w[1] || P256.w[0])
2726		&& (P256.w[1] < 0x1999999999999999ull
2727		    || (P256.w[1] == 0x1999999999999999ull
2728			&& P256.w[0] <= 0x9999999999999999ull))) {
2729	      // the result is a midpoint
2730	      if (P256.w[2] & 0x01) {
2731		is_midpoint_gt_even = 1;
2732		// if floor(C*) is odd C = floor(C*) - 1; the result is not 0
2733		P256.w[2]--;
2734		if (P256.w[2] == 0xffffffffffffffffull)
2735		  P256.w[3]--;
2736	      } else {
2737		is_midpoint_lt_even = 1;
2738	      }
2739	    }
2740	    // n = Cstar * 10^(e2+1)
2741	    y_exp = y_exp + EXP_P1;
2742	    // C* != 10^P34 because C* has P34 digits
2743	    // check for overflow
2744	    if (y_exp == EXP_MAX_P1
2745		&& (rnd_mode == ROUNDING_TO_NEAREST
2746		    || rnd_mode == ROUNDING_TIES_AWAY)) {
2747	      // overflow for RN
2748	      res.w[1] = x_sign | 0x7800000000000000ull;	// +/-inf
2749	      res.w[0] = 0x0ull;
2750	      // set the inexact flag
2751	      *pfpsf |= INEXACT_EXCEPTION;
2752	      // set the overflow flag
2753	      *pfpsf |= OVERFLOW_EXCEPTION;
2754	      BID_SWAP128 (res);
2755	      BID_RETURN (res);
2756	    }
2757	    // if (0 < f* - 1/2 < 10^(-x)) then
2758	    //   the result of the addition is exact
2759	    // else
2760	    //   the result of the addition is inexact
2761	    if (P256.w[1] > 0x8000000000000000ull || (P256.w[1] == 0x8000000000000000ull && P256.w[0] > 0x0ull)) {	// the result may be exact
2762	      tmp64 = P256.w[1] - 0x8000000000000000ull;	// f* - 1/2
2763	      if ((tmp64 > 0x1999999999999999ull
2764		   || (tmp64 == 0x1999999999999999ull
2765		       && P256.w[0] >= 0x9999999999999999ull))) {
2766		// set the inexact flag
2767		*pfpsf |= INEXACT_EXCEPTION;
2768		is_inexact = 1;
2769	      }	// else the result is exact
2770	    } else {	// the result is inexact
2771	      // set the inexact flag
2772	      *pfpsf |= INEXACT_EXCEPTION;
2773	      is_inexact = 1;
2774	    }
2775	    C1_hi = P256.w[3];
2776	    C1_lo = P256.w[2];
2777	    if (!is_midpoint_gt_even && !is_midpoint_lt_even) {
2778	      is_inexact_lt_midpoint = is_inexact
2779		&& (P256.w[1] & 0x8000000000000000ull);
2780	      is_inexact_gt_midpoint = is_inexact
2781		&& !(P256.w[1] & 0x8000000000000000ull);
2782	    }
2783	    // general correction from RN to RA, RM, RP, RZ; result uses y_exp
2784	    if (rnd_mode != ROUNDING_TO_NEAREST) {
2785	      if ((!x_sign
2786		   && ((rnd_mode == ROUNDING_UP
2787			&& is_inexact_lt_midpoint)
2788		       || ((rnd_mode == ROUNDING_TIES_AWAY
2789			    || rnd_mode == ROUNDING_UP)
2790			   && is_midpoint_gt_even))) || (x_sign
2791							 &&
2792							 ((rnd_mode ==
2793							   ROUNDING_DOWN
2794							   &&
2795							   is_inexact_lt_midpoint)
2796							  ||
2797							  ((rnd_mode ==
2798							    ROUNDING_TIES_AWAY
2799							    || rnd_mode
2800							    ==
2801							    ROUNDING_DOWN)
2802							   &&
2803							   is_midpoint_gt_even))))
2804	      {
2805		// C1 = C1 + 1
2806		C1_lo = C1_lo + 1;
2807		if (C1_lo == 0) {	// rounding overflow in the low 64 bits
2808		  C1_hi = C1_hi + 1;
2809		}
2810		if (C1_hi == 0x0001ed09bead87c0ull
2811		    && C1_lo == 0x378d8e6400000000ull) {
2812		  // C1 = 10^34 => rounding overflow
2813		  C1_hi = 0x0000314dc6448d93ull;
2814		  C1_lo = 0x38c15b0a00000000ull;	// 10^33
2815		  y_exp = y_exp + EXP_P1;
2816		}
2817	      } else
2818		if ((is_midpoint_lt_even || is_inexact_gt_midpoint) &&
2819		    ((x_sign && (rnd_mode == ROUNDING_UP ||
2820				 rnd_mode == ROUNDING_TO_ZERO)) ||
2821		     (!x_sign && (rnd_mode == ROUNDING_DOWN ||
2822				  rnd_mode == ROUNDING_TO_ZERO)))) {
2823		// C1 = C1 - 1
2824		C1_lo = C1_lo - 1;
2825		if (C1_lo == 0xffffffffffffffffull)
2826		  C1_hi--;
2827		// check if we crossed into the lower decade
2828		if (C1_hi == 0x0000314dc6448d93ull && C1_lo == 0x38c15b09ffffffffull) {	// 10^33 - 1
2829		  C1_hi = 0x0001ed09bead87c0ull;	// 10^34 - 1
2830		  C1_lo = 0x378d8e63ffffffffull;
2831		  y_exp = y_exp - EXP_P1;
2832		  // no underflow, because delta + q2 >= P34 + 1
2833		}
2834	      } else {
2835		;	// exact, the result is already correct
2836	      }
2837	      // in all cases check for overflow (RN and RA solved already)
2838	      if (y_exp == EXP_MAX_P1) {	// overflow
2839		if ((rnd_mode == ROUNDING_DOWN && x_sign) ||	// RM and res < 0
2840		    (rnd_mode == ROUNDING_UP && !x_sign)) {	// RP and res > 0
2841		  C1_hi = 0x7800000000000000ull;	// +inf
2842		  C1_lo = 0x0ull;
2843		} else {	// RM and res > 0, RP and res < 0, or RZ
2844		  C1_hi = 0x5fffed09bead87c0ull;
2845		  C1_lo = 0x378d8e63ffffffffull;
2846		}
2847		y_exp = 0;	// x_sign is preserved
2848		// set the inexact flag (in case the exact addition was exact)
2849		*pfpsf |= INEXACT_EXCEPTION;
2850		// set the overflow flag
2851		*pfpsf |= OVERFLOW_EXCEPTION;
2852	      }
2853	    }
2854	  }	// else if (C1 < 10^34) then C1 is the coeff.; the result is exact
2855	  // assemble the result
2856	  res.w[1] = x_sign | y_exp | C1_hi;
2857	  res.w[0] = C1_lo;
2858	} else {	// if x_sign != y_sign the result is exact
2859	  C1_lo = C2_lo - C1_lo;
2860	  C1_hi = C2_hi - C1_hi;
2861	  if (C1_lo > C2_lo)
2862	    C1_hi--;
2863	  if (C1_hi >= 0x8000000000000000ull) {	// negative coefficient!
2864	    C1_lo = ~C1_lo;
2865	    C1_lo++;
2866	    C1_hi = ~C1_hi;
2867	    if (C1_lo == 0x0)
2868	      C1_hi++;
2869	    x_sign = y_sign;	// the result will have the sign of y
2870	  }
2871	  // the result can be zero, but it cannot overflow
2872	  if (C1_lo == 0 && C1_hi == 0) {
2873	    // assemble the result
2874	    if (x_exp < y_exp)
2875	      res.w[1] = x_exp;
2876	    else
2877	      res.w[1] = y_exp;
2878	    res.w[0] = 0;
2879	    if (rnd_mode == ROUNDING_DOWN) {
2880	      res.w[1] |= 0x8000000000000000ull;
2881	    }
2882	    BID_SWAP128 (res);
2883	    BID_RETURN (res);
2884	  }
2885	  // assemble the result
2886	  res.w[1] = y_sign | y_exp | C1_hi;
2887	  res.w[0] = C1_lo;
2888	}
2889      }
2890    }
2891    BID_SWAP128 (res);
2892    BID_RETURN (res)
2893  }
2894}
2895
2896
2897
2898// bid128_sub stands for bid128qq_sub
2899
2900/*****************************************************************************
2901 *  BID128 sub
2902 ****************************************************************************/
2903
2904#if DECIMAL_CALL_BY_REFERENCE
2905void
2906bid128_sub (UINT128 * pres, UINT128 * px, UINT128 * py
2907	    _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
2908	    _EXC_INFO_PARAM) {
2909  UINT128 x = *px, y = *py;
2910#if !DECIMAL_GLOBAL_ROUNDING
2911  unsigned int rnd_mode = *prnd_mode;
2912#endif
2913#else
2914UINT128
2915bid128_sub (UINT128 x, UINT128 y
2916	    _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
2917	    _EXC_INFO_PARAM) {
2918#endif
2919
2920  UINT128 res;
2921  UINT64 y_sign;
2922
2923  if ((y.w[HIGH_128W] & MASK_NAN) != MASK_NAN) {	// y is not NAN
2924    // change its sign
2925    y_sign = y.w[HIGH_128W] & MASK_SIGN;	// 0 for positive, MASK_SIGN for negative
2926    if (y_sign)
2927      y.w[HIGH_128W] = y.w[HIGH_128W] & 0x7fffffffffffffffull;
2928    else
2929      y.w[HIGH_128W] = y.w[HIGH_128W] | 0x8000000000000000ull;
2930  }
2931#if DECIMAL_CALL_BY_REFERENCE
2932  bid128_add (&res, &x, &y
2933	      _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
2934	      _EXC_INFO_ARG);
2935#else
2936  res = bid128_add (x, y
2937		    _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
2938		    _EXC_INFO_ARG);
2939#endif
2940  BID_RETURN (res);
2941}
2942