1/* longlong.h -- definitions for mixed size 32/64 bit arithmetic.
2
3Copyright 1991, 1992, 1993, 1994, 1996, 1997, 1999, 2000, 2001, 2002, 2003,
42004, 2005, 2007, 2008, 2009, 2010, 2011, 2012, 2013 Free Software Foundation, Inc.
5
6This file is free software; you can redistribute it and/or modify it under the
7terms of the GNU Lesser General Public License as published by the Free
8Software Foundation; either version 3 of the License, or (at your option) any
9later version.
10
11This file is distributed in the hope that it will be useful, but WITHOUT ANY
12WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
13PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
14details.
15
16You should have received a copy of the GNU Lesser General Public License
17along with this file.  If not, see http://www.gnu.org/licenses/.  */
18
19/* You have to define the following before including this file:
20
21   UWtype -- An unsigned type, default type for operations (typically a "word")
22   UHWtype -- An unsigned type, at least half the size of UWtype.
   UDWtype -- An unsigned type, at least twice as large as UWtype
24   W_TYPE_SIZE -- size in bits of UWtype
25
26   SItype, USItype -- Signed and unsigned 32 bit types.
27   DItype, UDItype -- Signed and unsigned 64 bit types.
28
29   On a 32 bit machine UWtype should typically be USItype;
30   on a 64 bit machine, UWtype should typically be UDItype.
31
32   CAUTION!  Using this file outside of GMP is not safe.  You need to include
33   gmp.h and gmp-impl.h, or certain things might not work as expected.
34*/
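
/* For illustration only (a hypothetical sketch, not part of this file's
   interface): definitions of the kind gmp.h and gmp-impl.h normally provide
   before this file is included, shown here for a 32-bit limb:

     typedef int                 SItype;
     typedef unsigned int        USItype;
     typedef long long           DItype;
     typedef unsigned long long  UDItype;

     typedef USItype         UWtype;    // one "word" (limb)
     typedef unsigned short  UHWtype;   // at least half the size of UWtype
     typedef UDItype         UDWtype;   // at least twice the size of UWtype
     #define W_TYPE_SIZE 32
*/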
35
36#define __BITS4 (W_TYPE_SIZE / 4)
37#define __ll_B ((UWtype) 1 << (W_TYPE_SIZE / 2))
38#define __ll_lowpart(t) ((UWtype) (t) & (__ll_B - 1))
39#define __ll_highpart(t) ((UWtype) (t) >> (W_TYPE_SIZE / 2))
40
41/* This is used to make sure no undesirable sharing between different libraries
42   that use this file takes place.  */
43#ifndef __MPN
44#define __MPN(x) __##x
45#endif
46
47#ifndef _PROTO
48#if (__STDC__-0) || defined (__cplusplus)
49#define _PROTO(x) x
50#else
51#define _PROTO(x) ()
52#endif
53#endif
54
55/* Define auxiliary asm macros.
56
57   1) umul_ppmm(high_prod, low_prod, multiplier, multiplicand) multiplies two
58   UWtype integers MULTIPLIER and MULTIPLICAND, and generates a two UWtype
59   word product in HIGH_PROD and LOW_PROD.
60
61   2) __umulsidi3(a,b) multiplies two UWtype integers A and B, and returns a
62   UDWtype product.  This is just a variant of umul_ppmm.
63
64   3) udiv_qrnnd(quotient, remainder, high_numerator, low_numerator,
65   denominator) divides a UDWtype, composed by the UWtype integers
66   HIGH_NUMERATOR and LOW_NUMERATOR, by DENOMINATOR and places the quotient
   in QUOTIENT and the remainder in REMAINDER.  HIGH_NUMERATOR must be less
   than DENOMINATOR for correct operation.  If, in addition, the macro
   requires the most significant bit of DENOMINATOR to be 1, then the
   pre-processor symbol UDIV_NEEDS_NORMALIZATION is defined to 1.
71
72   4) sdiv_qrnnd(quotient, remainder, high_numerator, low_numerator,
73   denominator).  Like udiv_qrnnd but the numbers are signed.  The quotient
74   is rounded toward 0.
75
76   5) count_leading_zeros(count, x) counts the number of zero-bits from the
77   msb to the first non-zero bit in the UWtype X.  This is the number of
78   steps X needs to be shifted left to set the msb.  Undefined for X == 0,
79   unless the symbol COUNT_LEADING_ZEROS_0 is defined to some value.
80
81   6) count_trailing_zeros(count, x) like count_leading_zeros, but counts
82   from the least significant end.
83
   7) add_ssaaaa(high_sum, low_sum, high_addend_1, low_addend_1,
   high_addend_2, low_addend_2) adds two two-word UWtype integers, composed by
   HIGH_ADDEND_1 and LOW_ADDEND_1, and HIGH_ADDEND_2 and LOW_ADDEND_2
   respectively.  The result is placed in HIGH_SUM and LOW_SUM.  Overflow
   (i.e. carry out) is not stored anywhere, and is lost.

   8) sub_ddmmss(high_difference, low_difference, high_minuend, low_minuend,
   high_subtrahend, low_subtrahend) subtracts one two-word UWtype integer
   from another, composed by HIGH_MINUEND and LOW_MINUEND, and HIGH_SUBTRAHEND
   and LOW_SUBTRAHEND respectively.  The result is placed in HIGH_DIFFERENCE
   and LOW_DIFFERENCE.  Overflow (i.e. borrow out) is not stored anywhere,
   and is lost.
96
97   If any of these macros are left undefined for a particular CPU,
98   C macros are used.
99
100
101   Notes:
102
103   For add_ssaaaa the two high and two low addends can both commute, but
104   unfortunately gcc only supports one "%" commutative in each asm block.
105   This has always been so but is only documented in recent versions
106   (eg. pre-release 3.3).  Having two or more "%"s can cause an internal
107   compiler error in certain rare circumstances.
108
109   Apparently it was only the last "%" that was ever actually respected, so
110   the code has been updated to leave just that.  Clearly there's a free
111   choice whether high or low should get it, if there's a reason to favour
112   one over the other.  Also obviously when the constraints on the two
113   operands are identical there's no benefit to the reloader in any "%" at
114   all.
115
116   */
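
/* Example (illustration only, assuming the definitions described above are
   in place): forming a two-word product and then dividing it again,

     UWtype hi, lo, q, r;
     umul_ppmm (hi, lo, (UWtype) 123, (UWtype) 456);   // {hi,lo} = 123 * 456
     udiv_qrnnd (q, r, hi, lo, (UWtype) 789);          // requires hi < 789

   bearing in mind that udiv_qrnnd may additionally require a normalized
   (high-bit-set) divisor when UDIV_NEEDS_NORMALIZATION is 1.  */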
117
118/* The CPUs come in alphabetical order below.
119
120   Please add support for more CPUs here, or improve the current support
121   for the CPUs below!  */
122
123
124/* count_leading_zeros_gcc_clz is count_leading_zeros implemented with gcc
125   3.4 __builtin_clzl or __builtin_clzll, according to our limb size.
126   Similarly count_trailing_zeros_gcc_ctz using __builtin_ctzl or
127   __builtin_ctzll.
128
   These builtins are only used where we have checked what code comes out;
   on some chips they're merely libgcc calls, in which case we instead want
   an inline (either asm or generic C).
132
133   These builtins are better than an asm block of the same insn, since an
134   asm block doesn't give gcc any information about scheduling or resource
135   usage.  We keep an asm block for use on prior versions of gcc though.
136
137   For reference, __builtin_ffs existed in gcc prior to __builtin_clz, but
138   it's not used (for count_leading_zeros) because it generally gives extra
139   code to ensure the result is 0 when the input is 0, which we don't need
140   or want.  */
141
142#ifdef _LONG_LONG_LIMB
143#define count_leading_zeros_gcc_clz(count,x)    \
144  do {                                          \
145    ASSERT ((x) != 0);                          \
146    (count) = __builtin_clzll (x);              \
147  } while (0)
148#else
149#define count_leading_zeros_gcc_clz(count,x)    \
150  do {                                          \
151    ASSERT ((x) != 0);                          \
152    (count) = __builtin_clzl (x);               \
153  } while (0)
154#endif
155
156#ifdef _LONG_LONG_LIMB
157#define count_trailing_zeros_gcc_ctz(count,x)   \
158  do {                                          \
159    ASSERT ((x) != 0);                          \
160    (count) = __builtin_ctzll (x);              \
161  } while (0)
162#else
163#define count_trailing_zeros_gcc_ctz(count,x)   \
164  do {                                          \
165    ASSERT ((x) != 0);                          \
166    (count) = __builtin_ctzl (x);               \
167  } while (0)
168#endif
169
/* Note: the following FIXME comes from GMP, thus it does not make sense to
   try to resolve it in MPFR. */
172/* FIXME: The macros using external routines like __MPN(count_leading_zeros)
173   don't need to be under !NO_ASM */
174#if ! defined (NO_ASM)
175
176#if defined (__alpha) && W_TYPE_SIZE == 64
177/* Most alpha-based machines, except Cray systems. */
178#if defined (__GNUC__)
179#if __GMP_GNUC_PREREQ (3,3)
180#define umul_ppmm(ph, pl, m0, m1) \
181  do {									\
182    UDItype __m0 = (m0), __m1 = (m1);					\
183    (ph) = __builtin_alpha_umulh (__m0, __m1);				\
184    (pl) = __m0 * __m1;							\
185  } while (0)
186#else
187#define umul_ppmm(ph, pl, m0, m1) \
188  do {									\
189    UDItype __m0 = (m0), __m1 = (m1);					\
190    __asm__ ("umulh %r1,%2,%0"						\
191	     : "=r" (ph)						\
192	     : "%rJ" (m0), "rI" (m1));					\
193    (pl) = __m0 * __m1;							\
194  } while (0)
195#endif
196#define UMUL_TIME 18
197#else /* ! __GNUC__ */
198#include <machine/builtins.h>
199#define umul_ppmm(ph, pl, m0, m1) \
200  do {									\
201    UDItype __m0 = (m0), __m1 = (m1);					\
    (ph) = __UMULH (__m0, __m1);					\
203    (pl) = __m0 * __m1;							\
204  } while (0)
205#endif
206#ifndef LONGLONG_STANDALONE
207#define udiv_qrnnd(q, r, n1, n0, d) \
208  do { UWtype __di;							\
209    __di = __MPN(invert_limb) (d);					\
210    udiv_qrnnd_preinv (q, r, n1, n0, d, __di);				\
211  } while (0)
212#define UDIV_PREINV_ALWAYS  1
213#define UDIV_NEEDS_NORMALIZATION 1
214#define UDIV_TIME 220
215#endif /* LONGLONG_STANDALONE */
216
217/* clz_tab is required in all configurations, since mpn/alpha/cntlz.asm
218   always goes into libgmp.so, even when not actually used.  */
219#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
220
221#if defined (__GNUC__) && HAVE_HOST_CPU_alpha_CIX
222#define count_leading_zeros(COUNT,X) \
223  __asm__("ctlz %1,%0" : "=r"(COUNT) : "r"(X))
224#define count_trailing_zeros(COUNT,X) \
225  __asm__("cttz %1,%0" : "=r"(COUNT) : "r"(X))
226#endif /* clz/ctz using cix */
227
228#if ! defined (count_leading_zeros)                             \
229  && defined (__GNUC__) && ! defined (LONGLONG_STANDALONE)
230/* ALPHA_CMPBGE_0 gives "cmpbge $31,src,dst", ie. test src bytes == 0.
231   "$31" is written explicitly in the asm, since an "r" constraint won't
232   select reg 31.  There seems no need to worry about "r31" syntax for cray,
233   since gcc itself (pre-release 3.4) emits just $31 in various places.  */
234#define ALPHA_CMPBGE_0(dst, src)                                        \
235  do { asm ("cmpbge $31, %1, %0" : "=r" (dst) : "r" (src)); } while (0)
236/* Zero bytes are turned into bits with cmpbge, a __clz_tab lookup counts
237   them, locating the highest non-zero byte.  A second __clz_tab lookup
238   counts the leading zero bits in that byte, giving the result.  */
239#define count_leading_zeros(count, x)                                   \
240  do {                                                                  \
241    UWtype  __clz__b, __clz__c, __clz__x = (x);                         \
242    ALPHA_CMPBGE_0 (__clz__b,  __clz__x);           /* zero bytes */    \
243    __clz__b = __clz_tab [(__clz__b >> 1) ^ 0x7F];  /* 8 to 1 byte */   \
244    __clz__b = __clz__b * 8 - 7;                    /* 57 to 1 shift */ \
245    __clz__x >>= __clz__b;                                              \
246    __clz__c = __clz_tab [__clz__x];                /* 8 to 1 bit */    \
247    __clz__b = 65 - __clz__b;                                           \
248    (count) = __clz__b - __clz__c;                                      \
249  } while (0)
250#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
251#endif /* clz using cmpbge */
252
253#if ! defined (count_leading_zeros) && ! defined (LONGLONG_STANDALONE)
254#if HAVE_ATTRIBUTE_CONST
255long __MPN(count_leading_zeros) _PROTO ((UDItype)) __attribute__ ((const));
256#else
257long __MPN(count_leading_zeros) _PROTO ((UDItype));
258#endif
259#define count_leading_zeros(count, x) \
260  ((count) = __MPN(count_leading_zeros) (x))
261#endif /* clz using mpn */
262#endif /* __alpha */
263
264#if defined (_CRAY) && W_TYPE_SIZE == 64
265#include <intrinsics.h>
266#define UDIV_PREINV_ALWAYS  1
267#define UDIV_NEEDS_NORMALIZATION 1
268#define UDIV_TIME 220
269long __MPN(count_leading_zeros) _PROTO ((UDItype));
270#define count_leading_zeros(count, x) \
271  ((count) = _leadz ((UWtype) (x)))
272#if defined (_CRAYIEEE)		/* I.e., Cray T90/ieee, T3D, and T3E */
273#define umul_ppmm(ph, pl, m0, m1) \
274  do {									\
275    UDItype __m0 = (m0), __m1 = (m1);					\
    (ph) = _int_mult_upper (__m0, __m1);				\
277    (pl) = __m0 * __m1;							\
278  } while (0)
279#ifndef LONGLONG_STANDALONE
280#define udiv_qrnnd(q, r, n1, n0, d) \
281  do { UWtype __di;							\
282    __di = __MPN(invert_limb) (d);					\
283    udiv_qrnnd_preinv (q, r, n1, n0, d, __di);				\
284  } while (0)
285#endif /* LONGLONG_STANDALONE */
286#endif /* _CRAYIEEE */
287#endif /* _CRAY */
288
289#if defined (__ia64) && W_TYPE_SIZE == 64
290/* This form encourages gcc (pre-release 3.4 at least) to emit predicated
291   "sub r=r,r" and "sub r=r,r,1", giving a 2 cycle latency.  The generic
292   code using "al<bl" arithmetically comes out making an actual 0 or 1 in a
293   register, which takes an extra cycle.  */
294#define sub_ddmmss(sh, sl, ah, al, bh, bl)      \
295  do {                                          \
296    UWtype __x;                                 \
297    __x = (al) - (bl);                          \
298    if ((al) < (bl))                            \
299      (sh) = (ah) - (bh) - 1;                   \
300    else                                        \
301      (sh) = (ah) - (bh);                       \
302    (sl) = __x;                                 \
303  } while (0)
304#if defined (__GNUC__) && ! defined (__INTEL_COMPILER)
305/* Do both product parts in assembly, since that gives better code with
306   all gcc versions.  Some callers will just use the upper part, and in
307   that situation we waste an instruction, but not any cycles.  */
308#define umul_ppmm(ph, pl, m0, m1) \
309    __asm__ ("xma.hu %0 = %2, %3, f0\n\txma.l %1 = %2, %3, f0"		\
310	     : "=&f" (ph), "=f" (pl)					\
311	     : "f" (m0), "f" (m1))
312#define UMUL_TIME 14
313#define count_leading_zeros(count, x) \
314  do {									\
315    UWtype _x = (x), _y, _a, _c;					\
316    __asm__ ("mux1 %0 = %1, @rev" : "=r" (_y) : "r" (_x));		\
317    __asm__ ("czx1.l %0 = %1" : "=r" (_a) : "r" (-_y | _y));		\
318    _c = (_a - 1) << 3;							\
319    _x >>= _c;								\
320    if (_x >= 1 << 4)							\
321      _x >>= 4, _c += 4;						\
322    if (_x >= 1 << 2)							\
323      _x >>= 2, _c += 2;						\
324    _c += _x >> 1;							\
325    (count) =  W_TYPE_SIZE - 1 - _c;					\
326  } while (0)
327/* similar to what gcc does for __builtin_ffs, but 0 based rather than 1
328   based, and we don't need a special case for x==0 here */
329#define count_trailing_zeros(count, x)					\
330  do {									\
331    UWtype __ctz_x = (x);						\
332    __asm__ ("popcnt %0 = %1"						\
333	     : "=r" (count)						\
334	     : "r" ((__ctz_x-1) & ~__ctz_x));				\
335  } while (0)
336#endif
337#if defined (__INTEL_COMPILER)
338#include <ia64intrin.h>
339#define umul_ppmm(ph, pl, m0, m1)					\
340  do {									\
341    UWtype _m0 = (m0), _m1 = (m1);					\
342    ph = _m64_xmahu (_m0, _m1, 0);					\
343    pl = _m0 * _m1;							\
344  } while (0)
345#endif
346#ifndef LONGLONG_STANDALONE
347#define udiv_qrnnd(q, r, n1, n0, d) \
348  do { UWtype __di;							\
349    __di = __MPN(invert_limb) (d);					\
350    udiv_qrnnd_preinv (q, r, n1, n0, d, __di);				\
351  } while (0)
352#define UDIV_PREINV_ALWAYS  1
353#define UDIV_NEEDS_NORMALIZATION 1
354#endif
355#define UDIV_TIME 220
356#endif
357
358
359#if defined (__GNUC__)
360
361/* We sometimes need to clobber "cc" with gcc2, but that would not be
362   understood by gcc1.  Use cpp to avoid major code duplication.  */
363#if __GNUC__ < 2
364#define __CLOBBER_CC
365#define __AND_CLOBBER_CC
366#else /* __GNUC__ >= 2 */
367#define __CLOBBER_CC : "cc"
368#define __AND_CLOBBER_CC , "cc"
369#endif /* __GNUC__ < 2 */
370
371#if (defined (__a29k__) || defined (_AM29K)) && W_TYPE_SIZE == 32
372#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
373  __asm__ ("add %1,%4,%5\n\taddc %0,%2,%3"				\
374	   : "=r" (sh), "=&r" (sl)					\
375	   : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl))
376#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
377  __asm__ ("sub %1,%4,%5\n\tsubc %0,%2,%3"				\
378	   : "=r" (sh), "=&r" (sl)					\
379	   : "r" (ah), "rI" (bh), "r" (al), "rI" (bl))
380#define umul_ppmm(xh, xl, m0, m1) \
381  do {									\
382    USItype __m0 = (m0), __m1 = (m1);					\
383    __asm__ ("multiplu %0,%1,%2"					\
384	     : "=r" (xl)						\
385	     : "r" (__m0), "r" (__m1));					\
386    __asm__ ("multmu %0,%1,%2"						\
387	     : "=r" (xh)						\
388	     : "r" (__m0), "r" (__m1));					\
389  } while (0)
390#define udiv_qrnnd(q, r, n1, n0, d) \
391  __asm__ ("dividu %0,%3,%4"						\
392	   : "=r" (q), "=q" (r)						\
393	   : "1" (n1), "r" (n0), "r" (d))
394#define count_leading_zeros(count, x) \
395    __asm__ ("clz %0,%1"						\
396	     : "=r" (count)						\
397	     : "r" (x))
398#define COUNT_LEADING_ZEROS_0 32
399#endif /* __a29k__ */
400
401#if defined (__arc__)
402#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
403  __asm__ ("add.f\t%1, %4, %5\n\tadc\t%0, %2, %3"			\
404	   : "=r" (sh),							\
405	     "=&r" (sl)							\
406	   : "r"  ((USItype) (ah)),					\
407	     "rIJ" ((USItype) (bh)),					\
408	     "%r" ((USItype) (al)),					\
409	     "rIJ" ((USItype) (bl)))
410#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
411  __asm__ ("sub.f\t%1, %4, %5\n\tsbc\t%0, %2, %3"			\
412	   : "=r" (sh),							\
413	     "=&r" (sl)							\
414	   : "r" ((USItype) (ah)),					\
415	     "rIJ" ((USItype) (bh)),					\
416	     "r" ((USItype) (al)),					\
417	     "rIJ" ((USItype) (bl)))
418#endif
419
420#if defined (__arm__) && W_TYPE_SIZE == 32
421#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
422  __asm__ ("adds\t%1, %4, %5\n\tadc\t%0, %2, %3"			\
423	   : "=r" (sh), "=&r" (sl)					\
424	   : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl) __CLOBBER_CC)
425#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
426  do {									\
427    if (__builtin_constant_p (al))					\
428      {									\
429	if (__builtin_constant_p (ah))					\
430	  __asm__ ("rsbs\t%1, %5, %4\n\trsc\t%0, %3, %2"		\
431		   : "=r" (sh), "=&r" (sl)				\
432		   : "rI" (ah), "r" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
433	else								\
434	  __asm__ ("rsbs\t%1, %5, %4\n\tsbc\t%0, %2, %3"		\
435		   : "=r" (sh), "=&r" (sl)				\
436		   : "r" (ah), "rI" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
437      }									\
438    else if (__builtin_constant_p (ah))					\
439      {									\
440	if (__builtin_constant_p (bl))					\
441	  __asm__ ("subs\t%1, %4, %5\n\trsc\t%0, %3, %2"		\
442		   : "=r" (sh), "=&r" (sl)				\
443		   : "rI" (ah), "r" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
444	else								\
445	  __asm__ ("rsbs\t%1, %5, %4\n\trsc\t%0, %3, %2"		\
446		   : "=r" (sh), "=&r" (sl)				\
447		   : "rI" (ah), "r" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
448      }									\
449    else if (__builtin_constant_p (bl))					\
450      {									\
451	if (__builtin_constant_p (bh))					\
452	  __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3"		\
453		   : "=r" (sh), "=&r" (sl)				\
454		   : "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
455	else								\
456	  __asm__ ("subs\t%1, %4, %5\n\trsc\t%0, %3, %2"		\
457		   : "=r" (sh), "=&r" (sl)				\
458		   : "rI" (ah), "r" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
459      }									\
460    else /* only bh might be a constant */				\
461      __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3"			\
462	       : "=r" (sh), "=&r" (sl)					\
463	       : "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC);\
464    } while (0)
465#if 1 || defined (__arm_m__)	/* `M' series has widening multiply support */
466#define umul_ppmm(xh, xl, a, b) \
467  __asm__ ("umull %0,%1,%2,%3" : "=&r" (xl), "=&r" (xh) : "r" (a), "r" (b))
468#define UMUL_TIME 5
469#define smul_ppmm(xh, xl, a, b) \
470  __asm__ ("smull %0,%1,%2,%3" : "=&r" (xl), "=&r" (xh) : "r" (a), "r" (b))
471#ifndef LONGLONG_STANDALONE
472#define udiv_qrnnd(q, r, n1, n0, d) \
473  do { UWtype __di;							\
474    __di = __MPN(invert_limb) (d);					\
475    udiv_qrnnd_preinv (q, r, n1, n0, d, __di);				\
476  } while (0)
477#define UDIV_PREINV_ALWAYS  1
478#define UDIV_NEEDS_NORMALIZATION 1
479#define UDIV_TIME 70
480#endif /* LONGLONG_STANDALONE */
481#else
482#define umul_ppmm(xh, xl, a, b) \
483  __asm__ ("%@ Inlined umul_ppmm\n"					\
484"	mov	%|r0, %2, lsr #16\n"					\
485"	mov	%|r2, %3, lsr #16\n"					\
486"	bic	%|r1, %2, %|r0, lsl #16\n"				\
487"	bic	%|r2, %3, %|r2, lsl #16\n"				\
488"	mul	%1, %|r1, %|r2\n"					\
489"	mul	%|r2, %|r0, %|r2\n"					\
490"	mul	%|r1, %0, %|r1\n"					\
491"	mul	%0, %|r0, %0\n"						\
492"	adds	%|r1, %|r2, %|r1\n"					\
493"	addcs	%0, %0, #65536\n"					\
494"	adds	%1, %1, %|r1, lsl #16\n"				\
495"	adc	%0, %0, %|r1, lsr #16"					\
496	   : "=&r" (xh), "=r" (xl)					\
497	   : "r" (a), "r" (b)						\
498	   : "r0", "r1", "r2")
499#define UMUL_TIME 20
500#ifndef LONGLONG_STANDALONE
501#define udiv_qrnnd(q, r, n1, n0, d) \
502  do { UWtype __r;							\
503    (q) = __MPN(udiv_qrnnd) (&__r, (n1), (n0), (d));			\
504    (r) = __r;								\
505  } while (0)
506extern UWtype __MPN(udiv_qrnnd) _PROTO ((UWtype *, UWtype, UWtype, UWtype));
507#define UDIV_TIME 200
508#endif /* LONGLONG_STANDALONE */
509#endif
510#endif /* __arm__ */
511
512#if defined (__clipper__) && W_TYPE_SIZE == 32
513#define umul_ppmm(w1, w0, u, v) \
514  ({union {UDItype __ll;						\
515	   struct {USItype __l, __h;} __i;				\
516	  } __x;							\
517  __asm__ ("mulwux %2,%0"						\
518	   : "=r" (__x.__ll)						\
519	   : "%0" ((USItype)(u)), "r" ((USItype)(v)));			\
520  (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
521#define smul_ppmm(w1, w0, u, v) \
522  ({union {DItype __ll;							\
523	   struct {SItype __l, __h;} __i;				\
524	  } __x;							\
525  __asm__ ("mulwx %2,%0"						\
526	   : "=r" (__x.__ll)						\
527	   : "%0" ((SItype)(u)), "r" ((SItype)(v)));			\
528  (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
529#define __umulsidi3(u, v) \
530  ({UDItype __w;							\
531    __asm__ ("mulwux %2,%0"						\
532	     : "=r" (__w) : "%0" ((USItype)(u)), "r" ((USItype)(v)));	\
533    __w; })
534#endif /* __clipper__ */
535
536/* Fujitsu vector computers.  */
537#if defined (__uxp__) && W_TYPE_SIZE == 32
538#define umul_ppmm(ph, pl, u, v) \
539  do {									\
540    union {UDItype __ll;						\
541	   struct {USItype __h, __l;} __i;				\
542	  } __x;							\
543    __asm__ ("mult.lu %1,%2,%0"	: "=r" (__x.__ll) : "%r" (u), "rK" (v));\
544    (ph) = __x.__i.__h;							\
545    (pl) = __x.__i.__l;							\
546  } while (0)
547#define smul_ppmm(ph, pl, u, v) \
548  do {									\
549    union {UDItype __ll;						\
550	   struct {USItype __h, __l;} __i;				\
551	  } __x;							\
552    __asm__ ("mult.l %1,%2,%0" : "=r" (__x.__ll) : "%r" (u), "rK" (v));	\
553    (ph) = __x.__i.__h;							\
554    (pl) = __x.__i.__l;							\
555  } while (0)
556#endif
557
558#if defined (__gmicro__) && W_TYPE_SIZE == 32
559#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
560  __asm__ ("add.w %5,%1\n\taddx %3,%0"					\
561	   : "=g" (sh), "=&g" (sl)					\
562	   : "0"  ((USItype)(ah)), "g" ((USItype)(bh)),			\
563	     "%1" ((USItype)(al)), "g" ((USItype)(bl)))
564#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
565  __asm__ ("sub.w %5,%1\n\tsubx %3,%0"					\
566	   : "=g" (sh), "=&g" (sl)					\
567	   : "0" ((USItype)(ah)), "g" ((USItype)(bh)),			\
568	     "1" ((USItype)(al)), "g" ((USItype)(bl)))
569#define umul_ppmm(ph, pl, m0, m1) \
570  __asm__ ("mulx %3,%0,%1"						\
571	   : "=g" (ph), "=r" (pl)					\
572	   : "%0" ((USItype)(m0)), "g" ((USItype)(m1)))
573#define udiv_qrnnd(q, r, nh, nl, d) \
574  __asm__ ("divx %4,%0,%1"						\
575	   : "=g" (q), "=r" (r)						\
576	   : "1" ((USItype)(nh)), "0" ((USItype)(nl)), "g" ((USItype)(d)))
577#define count_leading_zeros(count, x) \
578  __asm__ ("bsch/1 %1,%0"						\
579	   : "=g" (count) : "g" ((USItype)(x)), "0" ((USItype)0))
580#endif
581
582#if defined (__hppa) && W_TYPE_SIZE == 32
583#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
584  __asm__ ("add%I5 %5,%r4,%1\n\taddc %r2,%r3,%0"			\
585	   : "=r" (sh), "=&r" (sl)					\
586	   : "rM" (ah), "rM" (bh), "%rM" (al), "rI" (bl))
587#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
588  __asm__ ("sub%I4 %4,%r5,%1\n\tsubb %r2,%r3,%0"			\
589	   : "=r" (sh), "=&r" (sl)					\
590	   : "rM" (ah), "rM" (bh), "rI" (al), "rM" (bl))
591#if defined (_PA_RISC1_1)
592#define umul_ppmm(wh, wl, u, v) \
593  do {									\
594    union {UDItype __ll;						\
595	   struct {USItype __h, __l;} __i;				\
596	  } __x;							\
597    __asm__ ("xmpyu %1,%2,%0" : "=*f" (__x.__ll) : "*f" (u), "*f" (v));	\
598    (wh) = __x.__i.__h;							\
599    (wl) = __x.__i.__l;							\
600  } while (0)
601#define UMUL_TIME 8
602#define UDIV_TIME 60
603#else
604#define UMUL_TIME 40
605#define UDIV_TIME 80
606#endif
607#define count_leading_zeros(count, x) \
608  do {									\
609    USItype __tmp;							\
610    __asm__ (								\
611       "ldi		1,%0\n"						\
612"	extru,=		%1,15,16,%%r0	; Bits 31..16 zero?\n"		\
613"	extru,tr	%1,15,16,%1	; No.  Shift down, skip add.\n"	\
614"	ldo		16(%0),%0	; Yes.  Perform add.\n"		\
615"	extru,=		%1,23,8,%%r0	; Bits 15..8 zero?\n"		\
616"	extru,tr	%1,23,8,%1	; No.  Shift down, skip add.\n"	\
617"	ldo		8(%0),%0	; Yes.  Perform add.\n"		\
618"	extru,=		%1,27,4,%%r0	; Bits 7..4 zero?\n"		\
619"	extru,tr	%1,27,4,%1	; No.  Shift down, skip add.\n"	\
620"	ldo		4(%0),%0	; Yes.  Perform add.\n"		\
621"	extru,=		%1,29,2,%%r0	; Bits 3..2 zero?\n"		\
622"	extru,tr	%1,29,2,%1	; No.  Shift down, skip add.\n"	\
623"	ldo		2(%0),%0	; Yes.  Perform add.\n"		\
624"	extru		%1,30,1,%1	; Extract bit 1.\n"		\
625"	sub		%0,%1,%0	; Subtract it.\n"		\
626	: "=r" (count), "=r" (__tmp) : "1" (x));			\
627  } while (0)
628#endif /* hppa */
629
/* These macros are for ABI=2.0w.  In ABI=2.0n they can't be used, since GCC
   (3.2) puts long longs into two adjacent 32-bit registers.  Presumably this
   is just a case of no direct support for 2.0n but treating it like 1.0. */
633#if defined (__hppa) && W_TYPE_SIZE == 64 && ! defined (_LONG_LONG_LIMB)
634#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
635  __asm__ ("add%I5 %5,%r4,%1\n\tadd,dc %r2,%r3,%0"			\
636	   : "=r" (sh), "=&r" (sl)					\
637	   : "rM" (ah), "rM" (bh), "%rM" (al), "rI" (bl))
638#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
639  __asm__ ("sub%I4 %4,%r5,%1\n\tsub,db %r2,%r3,%0"			\
640	   : "=r" (sh), "=&r" (sl)					\
641	   : "rM" (ah), "rM" (bh), "rI" (al), "rM" (bl))
642#endif /* hppa */
643
644#if (defined (__i370__) || defined (__s390__) || defined (__mvs__)) && W_TYPE_SIZE == 32
645#define smul_ppmm(xh, xl, m0, m1) \
646  do {									\
647    union {DItype __ll;							\
648	   struct {USItype __h, __l;} __i;				\
649	  } __x;							\
650    __asm__ ("lr %N0,%1\n\tmr %0,%2"					\
651	     : "=&r" (__x.__ll)						\
652	     : "r" (m0), "r" (m1));					\
653    (xh) = __x.__i.__h; (xl) = __x.__i.__l;				\
654  } while (0)
655#define sdiv_qrnnd(q, r, n1, n0, d) \
656  do {									\
657    union {DItype __ll;							\
658	   struct {USItype __h, __l;} __i;				\
659	  } __x;							\
660    __x.__i.__h = n1; __x.__i.__l = n0;					\
661    __asm__ ("dr %0,%2"							\
662	     : "=r" (__x.__ll)						\
663	     : "0" (__x.__ll), "r" (d));				\
664    (q) = __x.__i.__l; (r) = __x.__i.__h;				\
665  } while (0)
666#endif
667
668#if (defined (__i386__) || defined (__i486__)) && W_TYPE_SIZE == 32
669#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
670  __asm__ ("addl %5,%k1\n\tadcl %3,%k0"					\
671	   : "=r" (sh), "=&r" (sl)					\
672	   : "0"  ((USItype)(ah)), "g" ((USItype)(bh)),			\
673	     "%1" ((USItype)(al)), "g" ((USItype)(bl)))
674#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
675  __asm__ ("subl %5,%k1\n\tsbbl %3,%k0"					\
676	   : "=r" (sh), "=&r" (sl)					\
677	   : "0" ((USItype)(ah)), "g" ((USItype)(bh)),			\
678	     "1" ((USItype)(al)), "g" ((USItype)(bl)))
679#define umul_ppmm(w1, w0, u, v) \
680  __asm__ ("mull %3"							\
681	   : "=a" (w0), "=d" (w1)					\
682	   : "%0" ((USItype)(u)), "rm" ((USItype)(v)))
683#define udiv_qrnnd(q, r, n1, n0, dx) /* d renamed to dx avoiding "=d" */\
684  __asm__ ("divl %4"		     /* stringification in K&R C */	\
685	   : "=a" (q), "=d" (r)						\
686	   : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "rm" ((USItype)(dx)))
687
688#if HAVE_HOST_CPU_i586 || HAVE_HOST_CPU_pentium || HAVE_HOST_CPU_pentiummmx
689/* Pentium bsrl takes between 10 and 72 cycles depending where the most
690   significant 1 bit is, hence the use of the following alternatives.  bsfl
691   is slow too, between 18 and 42 depending where the least significant 1
692   bit is, so let the generic count_trailing_zeros below make use of the
693   count_leading_zeros here too.  */
694
695#if HAVE_HOST_CPU_pentiummmx && ! defined (LONGLONG_STANDALONE)
696/* The following should be a fixed 14 or 15 cycles, but possibly plus an L1
697   cache miss reading from __clz_tab.  For P55 it's favoured over the float
698   below so as to avoid mixing MMX and x87, since the penalty for switching
699   between the two is about 100 cycles.
700
701   The asm block sets __shift to -3 if the high 24 bits are clear, -2 for
702   16, -1 for 8, or 0 otherwise.  This could be written equivalently as
703   follows, but as of gcc 2.95.2 it results in conditional jumps.
704
705       __shift = -(__n < 0x1000000);
706       __shift -= (__n < 0x10000);
707       __shift -= (__n < 0x100);
708
709   The middle two sbbl and cmpl's pair, and with luck something gcc
710   generates might pair with the first cmpl and the last sbbl.  The "32+1"
711   constant could be folded into __clz_tab[], but it doesn't seem worth
712   making a different table just for that.  */
713
714#define count_leading_zeros(c,n)					\
715  do {									\
716    USItype  __n = (n);							\
717    USItype  __shift;							\
718    __asm__ ("cmpl  $0x1000000, %1\n"					\
719	     "sbbl  %0, %0\n"						\
720	     "cmpl  $0x10000, %1\n"					\
721	     "sbbl  $0, %0\n"						\
722	     "cmpl  $0x100, %1\n"					\
723	     "sbbl  $0, %0\n"						\
724	     : "=&r" (__shift) : "r"  (__n));				\
725    __shift = __shift*8 + 24 + 1;					\
726    (c) = 32 + 1 - __shift - __clz_tab[__n >> __shift];			\
727  } while (0)
728#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
729#define COUNT_LEADING_ZEROS_0   31   /* n==0 indistinguishable from n==1 */
730
#else /* ! pentiummmx || LONGLONG_STANDALONE */
732/* The following should be a fixed 14 cycles or so.  Some scheduling
733   opportunities should be available between the float load/store too.  This
734   sort of code is used in gcc 3 for __builtin_ffs (with "n&-n") and is
735   apparently suggested by the Intel optimizing manual (don't know exactly
736   where).  gcc 2.95 or up will be best for this, so the "double" is
737   correctly aligned on the stack.  */
738#define count_leading_zeros(c,n)					\
739  do {									\
740    union {								\
741      double    d;							\
742      unsigned  a[2];							\
743    } __u;								\
744    ASSERT ((n) != 0);							\
745    __u.d = (UWtype) (n);						\
746    (c) = 0x3FF + 31 - (__u.a[1] >> 20);				\
747  } while (0)
748#define COUNT_LEADING_ZEROS_0   (0x3FF + 31)
#endif /* pentiummmx */
750
751#else /* ! pentium */
752
753#if __GMP_GNUC_PREREQ (3,4)  /* using bsrl */
754#define count_leading_zeros(count,x)  count_leading_zeros_gcc_clz(count,x)
755#endif /* gcc clz */
756
757/* On P6, gcc prior to 3.0 generates a partial register stall for
758   __cbtmp^31, due to using "xorb $31" instead of "xorl $31", the former
759   being 1 code byte smaller.  "31-__cbtmp" is a workaround, probably at the
760   cost of one extra instruction.  Do this for "i386" too, since that means
761   generic x86.  */
762#if ! defined (count_leading_zeros) && __GNUC__ < 3                     \
763  && (HAVE_HOST_CPU_i386						\
764      || HAVE_HOST_CPU_i686						\
765      || HAVE_HOST_CPU_pentiumpro					\
766      || HAVE_HOST_CPU_pentium2						\
767      || HAVE_HOST_CPU_pentium3)
768#define count_leading_zeros(count, x)					\
769  do {									\
770    USItype __cbtmp;							\
771    ASSERT ((x) != 0);							\
772    __asm__ ("bsrl %1,%0" : "=r" (__cbtmp) : "rm" ((USItype)(x)));	\
773    (count) = 31 - __cbtmp;						\
774  } while (0)
775#endif /* gcc<3 asm bsrl */
776
777#ifndef count_leading_zeros
778#define count_leading_zeros(count, x)					\
779  do {									\
780    USItype __cbtmp;							\
781    ASSERT ((x) != 0);							\
782    __asm__ ("bsrl %1,%0" : "=r" (__cbtmp) : "rm" ((USItype)(x)));	\
783    (count) = __cbtmp ^ 31;						\
784  } while (0)
785#endif /* asm bsrl */
786
787#if __GMP_GNUC_PREREQ (3,4)  /* using bsfl */
788#define count_trailing_zeros(count,x)  count_trailing_zeros_gcc_ctz(count,x)
789#endif /* gcc ctz */
790
791#ifndef count_trailing_zeros
792#define count_trailing_zeros(count, x)					\
793  do {									\
794    ASSERT ((x) != 0);							\
795    __asm__ ("bsfl %1,%k0" : "=r" (count) : "rm" ((USItype)(x)));	\
796  } while (0)
797#endif /* asm bsfl */
798
799#endif /* ! pentium */
800
801#ifndef UMUL_TIME
802#define UMUL_TIME 10
803#endif
804#ifndef UDIV_TIME
805#define UDIV_TIME 40
806#endif
807#endif /* 80x86 */
808
809#if defined (__amd64__) && W_TYPE_SIZE == 64
810#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
811  __asm__ ("addq %5,%q1\n\tadcq %3,%q0"					\
812	   : "=r" (sh), "=&r" (sl)					\
813	   : "0"  ((UDItype)(ah)), "rme" ((UDItype)(bh)),		\
814	     "%1" ((UDItype)(al)), "rme" ((UDItype)(bl)))
815#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
816  __asm__ ("subq %5,%q1\n\tsbbq %3,%q0"					\
817	   : "=r" (sh), "=&r" (sl)					\
818	   : "0" ((UDItype)(ah)), "rme" ((UDItype)(bh)),		\
819	     "1" ((UDItype)(al)), "rme" ((UDItype)(bl)))
820#define umul_ppmm(w1, w0, u, v) \
821  __asm__ ("mulq %3"							\
822	   : "=a" (w0), "=d" (w1)					\
823	   : "%0" ((UDItype)(u)), "rm" ((UDItype)(v)))
824#define udiv_qrnnd(q, r, n1, n0, dx) /* d renamed to dx avoiding "=d" */\
825  __asm__ ("divq %4"		     /* stringification in K&R C */	\
826	   : "=a" (q), "=d" (r)						\
827	   : "0" ((UDItype)(n0)), "1" ((UDItype)(n1)), "rm" ((UDItype)(dx)))
828/* bsrq destination must be a 64-bit register, hence UDItype for __cbtmp. */
829#define count_leading_zeros(count, x)					\
830  do {									\
831    UDItype __cbtmp;							\
832    ASSERT ((x) != 0);							\
833    __asm__ ("bsrq %1,%0" : "=r" (__cbtmp) : "rm" ((UDItype)(x)));	\
834    (count) = __cbtmp ^ 63;						\
835  } while (0)
836/* bsfq destination must be a 64-bit register, "%q0" forces this in case
837   count is only an int. */
838#define count_trailing_zeros(count, x)					\
839  do {									\
840    ASSERT ((x) != 0);							\
841    __asm__ ("bsfq %1,%q0" : "=r" (count) : "rm" ((UDItype)(x)));	\
842  } while (0)
843#endif /* x86_64 */
844
845#if defined (__i860__) && W_TYPE_SIZE == 32
#define rshift_rhlc(r,h,l,c) \
  __asm__ ("shr %3,r0,r0\;shrd %1,%2,%0"				\
	   : "=r" (r) : "r" (h), "r" (l), "rn" (c))
849#endif /* i860 */
850
851#if defined (__i960__) && W_TYPE_SIZE == 32
852#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
853  __asm__ ("cmpo 1,0\;addc %5,%4,%1\;addc %3,%2,%0"			\
854	   : "=r" (sh), "=&r" (sl)					\
855	   : "dI" (ah), "dI" (bh), "%dI" (al), "dI" (bl))
856#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
857  __asm__ ("cmpo 0,0\;subc %5,%4,%1\;subc %3,%2,%0"			\
858	   : "=r" (sh), "=&r" (sl)					\
859	   : "dI" (ah), "dI" (bh), "dI" (al), "dI" (bl))
860#define umul_ppmm(w1, w0, u, v) \
861  ({union {UDItype __ll;						\
862	   struct {USItype __l, __h;} __i;				\
863	  } __x;							\
864  __asm__ ("emul %2,%1,%0"						\
865	   : "=d" (__x.__ll) : "%dI" (u), "dI" (v));			\
866  (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
867#define __umulsidi3(u, v) \
868  ({UDItype __w;							\
869    __asm__ ("emul %2,%1,%0" : "=d" (__w) : "%dI" (u), "dI" (v));	\
870    __w; })
871#define udiv_qrnnd(q, r, nh, nl, d) \
872  do {									\
873    union {UDItype __ll;						\
874	   struct {USItype __l, __h;} __i;				\
	  } __nn, __rq;							\
876    __nn.__i.__h = (nh); __nn.__i.__l = (nl);				\
877    __asm__ ("ediv %d,%n,%0"						\
878	   : "=d" (__rq.__ll) : "dI" (__nn.__ll), "dI" (d));		\
879    (r) = __rq.__i.__l; (q) = __rq.__i.__h;				\
880  } while (0)
881#define count_leading_zeros(count, x) \
882  do {									\
883    USItype __cbtmp;							\
884    __asm__ ("scanbit %1,%0" : "=r" (__cbtmp) : "r" (x));		\
885    (count) = __cbtmp ^ 31;						\
886  } while (0)
887#define COUNT_LEADING_ZEROS_0 (-32) /* sic */
888#if defined (__i960mx)		/* what is the proper symbol to test??? */
889#define rshift_rhlc(r,h,l,c) \
890  do {									\
891    union {UDItype __ll;						\
892	   struct {USItype __l, __h;} __i;				\
893	  } __nn;							\
894    __nn.__i.__h = (h); __nn.__i.__l = (l);				\
895    __asm__ ("shre %2,%1,%0" : "=d" (r) : "dI" (__nn.__ll), "dI" (c));	\
  } while (0)
897#endif /* i960mx */
898#endif /* i960 */
899
900#if (defined (__mc68000__) || defined (__mc68020__) || defined(mc68020) \
901     || defined (__m68k__) || defined (__mc5200__) || defined (__mc5206e__) \
902     || defined (__mc5307__)) && W_TYPE_SIZE == 32
903#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
904  __asm__ ("add%.l %5,%1\n\taddx%.l %3,%0"				\
905	   : "=d" (sh), "=&d" (sl)					\
906	   : "0"  ((USItype)(ah)), "d" ((USItype)(bh)),			\
907	     "%1" ((USItype)(al)), "g" ((USItype)(bl)))
908#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
909  __asm__ ("sub%.l %5,%1\n\tsubx%.l %3,%0"				\
910	   : "=d" (sh), "=&d" (sl)					\
911	   : "0" ((USItype)(ah)), "d" ((USItype)(bh)),			\
912	     "1" ((USItype)(al)), "g" ((USItype)(bl)))
913/* The '020, '030, '040 and CPU32 have 32x32->64 and 64/32->32q-32r.  */
914#if defined (__mc68020__) || defined(mc68020) \
915     || defined (__mc68030__) || defined (mc68030) \
916     || defined (__mc68040__) || defined (mc68040) \
917     || defined (__mcpu32__) || defined (mcpu32) \
918     || defined (__NeXT__)
919#define umul_ppmm(w1, w0, u, v) \
920  __asm__ ("mulu%.l %3,%1:%0"						\
921	   : "=d" (w0), "=d" (w1)					\
922	   : "%0" ((USItype)(u)), "dmi" ((USItype)(v)))
923#define UMUL_TIME 45
924#define udiv_qrnnd(q, r, n1, n0, d) \
925  __asm__ ("divu%.l %4,%1:%0"						\
926	   : "=d" (q), "=d" (r)						\
927	   : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "dmi" ((USItype)(d)))
928#define UDIV_TIME 90
929#define sdiv_qrnnd(q, r, n1, n0, d) \
930  __asm__ ("divs%.l %4,%1:%0"						\
931	   : "=d" (q), "=d" (r)						\
932	   : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "dmi" ((USItype)(d)))
933#else /* for other 68k family members use 16x16->32 multiplication */
934#define umul_ppmm(xh, xl, a, b) \
935  do { USItype __umul_tmp1, __umul_tmp2;				\
936	__asm__ ("| Inlined umul_ppmm\n"				\
937"	move%.l	%5,%3\n"						\
938"	move%.l	%2,%0\n"						\
939"	move%.w	%3,%1\n"						\
940"	swap	%3\n"							\
941"	swap	%0\n"							\
942"	mulu%.w	%2,%1\n"						\
943"	mulu%.w	%3,%0\n"						\
944"	mulu%.w	%2,%3\n"						\
945"	swap	%2\n"							\
946"	mulu%.w	%5,%2\n"						\
947"	add%.l	%3,%2\n"						\
948"	jcc	1f\n"							\
949"	add%.l	%#0x10000,%0\n"						\
950"1:	move%.l	%2,%3\n"						\
951"	clr%.w	%2\n"							\
952"	swap	%2\n"							\
953"	swap	%3\n"							\
954"	clr%.w	%3\n"							\
955"	add%.l	%3,%1\n"						\
956"	addx%.l	%2,%0\n"						\
957"	| End inlined umul_ppmm"					\
958	      : "=&d" (xh), "=&d" (xl),					\
959		"=d" (__umul_tmp1), "=&d" (__umul_tmp2)			\
960	      : "%2" ((USItype)(a)), "d" ((USItype)(b)));		\
961  } while (0)
962#define UMUL_TIME 100
963#define UDIV_TIME 400
964#endif /* not mc68020 */
965/* The '020, '030, '040 and '060 have bitfield insns.
966   GCC 3.4 defines __mc68020__ when in CPU32 mode, check for __mcpu32__ to
967   exclude bfffo on that chip (bitfield insns not available).  */
968#if (defined (__mc68020__) || defined (mc68020)    \
969     || defined (__mc68030__) || defined (mc68030) \
970     || defined (__mc68040__) || defined (mc68040) \
971     || defined (__mc68060__) || defined (mc68060) \
972     || defined (__NeXT__))                        \
973  && ! defined (__mcpu32__)
974#define count_leading_zeros(count, x) \
975  __asm__ ("bfffo %1{%b2:%b2},%0"					\
976	   : "=d" (count)						\
977	   : "od" ((USItype) (x)), "n" (0))
978#define COUNT_LEADING_ZEROS_0 32
979#endif
980#endif /* mc68000 */
981
982#if defined (__m88000__) && W_TYPE_SIZE == 32
983#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
984  __asm__ ("addu.co %1,%r4,%r5\n\taddu.ci %0,%r2,%r3"			\
985	   : "=r" (sh), "=&r" (sl)					\
986	   : "rJ" (ah), "rJ" (bh), "%rJ" (al), "rJ" (bl))
987#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
988  __asm__ ("subu.co %1,%r4,%r5\n\tsubu.ci %0,%r2,%r3"			\
989	   : "=r" (sh), "=&r" (sl)					\
990	   : "rJ" (ah), "rJ" (bh), "rJ" (al), "rJ" (bl))
991#define count_leading_zeros(count, x) \
992  do {									\
993    USItype __cbtmp;							\
994    __asm__ ("ff1 %0,%1" : "=r" (__cbtmp) : "r" (x));			\
995    (count) = __cbtmp ^ 31;						\
996  } while (0)
997#define COUNT_LEADING_ZEROS_0 63 /* sic */
998#if defined (__m88110__)
999#define umul_ppmm(wh, wl, u, v) \
1000  do {									\
1001    union {UDItype __ll;						\
1002	   struct {USItype __h, __l;} __i;				\
1003	  } __x;							\
1004    __asm__ ("mulu.d %0,%1,%2" : "=r" (__x.__ll) : "r" (u), "r" (v));	\
1005    (wh) = __x.__i.__h;							\
1006    (wl) = __x.__i.__l;							\
1007  } while (0)
1008#define udiv_qrnnd(q, r, n1, n0, d) \
1009  ({union {UDItype __ll;						\
1010	   struct {USItype __h, __l;} __i;				\
1011	  } __x, __q;							\
1012  __x.__i.__h = (n1); __x.__i.__l = (n0);				\
1013  __asm__ ("divu.d %0,%1,%2"						\
1014	   : "=r" (__q.__ll) : "r" (__x.__ll), "r" (d));		\
  (r) = (n0) - __q.__i.__l * (d); (q) = __q.__i.__l; })
1016#define UMUL_TIME 5
1017#define UDIV_TIME 25
1018#else
1019#define UMUL_TIME 17
1020#define UDIV_TIME 150
1021#endif /* __m88110__ */
1022#endif /* __m88000__ */
1023
1024#if defined (__mips) && W_TYPE_SIZE == 32
1025#if __GMP_GNUC_PREREQ (4,4)
1026#define umul_ppmm(w1, w0, u, v) \
1027  do {									\
1028    UDItype __ll = (UDItype)(u) * (v);					\
1029    w1 = __ll >> 32;							\
1030    w0 = __ll;								\
1031  } while (0)
1032#endif
1033#if !defined (umul_ppmm) && __GMP_GNUC_PREREQ (2,7)
1034#define umul_ppmm(w1, w0, u, v) \
1035  __asm__ ("multu %2,%3" : "=l" (w0), "=h" (w1) : "d" (u), "d" (v))
1036#endif
1037#if !defined (umul_ppmm)
1038#define umul_ppmm(w1, w0, u, v) \
1039  __asm__ ("multu %2,%3\n\tmflo %0\n\tmfhi %1"				\
1040	   : "=d" (w0), "=d" (w1) : "d" (u), "d" (v))
1041#endif
1042#define UMUL_TIME 10
1043#define UDIV_TIME 100
1044#endif /* __mips */
1045
1046#if (defined (__mips) && __mips >= 3) && W_TYPE_SIZE == 64
1047#if __GMP_GNUC_PREREQ (4,4)
1048#define umul_ppmm(w1, w0, u, v) \
1049  do {									\
1050    typedef unsigned int __ll_UTItype __attribute__((mode(TI)));	\
1051    __ll_UTItype __ll = (__ll_UTItype)(u) * (v);			\
1052    w1 = __ll >> 64;							\
1053    w0 = __ll;								\
1054  } while (0)
1055#endif
1056#if !defined (umul_ppmm) && __GMP_GNUC_PREREQ (2,7)
1057#define umul_ppmm(w1, w0, u, v) \
1058  __asm__ ("dmultu %2,%3" : "=l" (w0), "=h" (w1) : "d" (u), "d" (v))
1059#endif
1060#if !defined (umul_ppmm)
1061#define umul_ppmm(w1, w0, u, v) \
1062  __asm__ ("dmultu %2,%3\n\tmflo %0\n\tmfhi %1"				\
1063	   : "=d" (w0), "=d" (w1) : "d" (u), "d" (v))
1064#endif
1065#define UMUL_TIME 20
1066#define UDIV_TIME 140
1067#endif /* __mips */
1068
1069#if defined (__mmix__) && W_TYPE_SIZE == 64
1070#define umul_ppmm(w1, w0, u, v) \
1071  __asm__ ("MULU %0,%2,%3" : "=r" (w0), "=z" (w1) : "r" (u), "r" (v))
1072#endif
1073
1074#if defined (__ns32000__) && W_TYPE_SIZE == 32
1075#define umul_ppmm(w1, w0, u, v) \
1076  ({union {UDItype __ll;						\
1077	   struct {USItype __l, __h;} __i;				\
1078	  } __x;							\
1079  __asm__ ("meid %2,%0"							\
1080	   : "=g" (__x.__ll)						\
1081	   : "%0" ((USItype)(u)), "g" ((USItype)(v)));			\
1082  (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
1083#define __umulsidi3(u, v) \
1084  ({UDItype __w;							\
1085    __asm__ ("meid %2,%0"						\
1086	     : "=g" (__w)						\
1087	     : "%0" ((USItype)(u)), "g" ((USItype)(v)));		\
1088    __w; })
1089#define udiv_qrnnd(q, r, n1, n0, d) \
1090  ({union {UDItype __ll;						\
1091	   struct {USItype __l, __h;} __i;				\
1092	  } __x;							\
1093  __x.__i.__h = (n1); __x.__i.__l = (n0);				\
1094  __asm__ ("deid %2,%0"							\
1095	   : "=g" (__x.__ll)						\
1096	   : "0" (__x.__ll), "g" ((USItype)(d)));			\
1097  (r) = __x.__i.__l; (q) = __x.__i.__h; })
1098#define count_trailing_zeros(count,x) \
1099  do {									\
1100    __asm__ ("ffsd	%2,%0"						\
1101	     : "=r" (count)						\
1102	     : "0" ((USItype) 0), "r" ((USItype) (x)));			\
1103  } while (0)
1104#endif /* __ns32000__ */
1105
1106/* In the past we had a block of various #defines tested
1107       _ARCH_PPC    - AIX
1108       _ARCH_PWR    - AIX
1109       __powerpc__  - gcc
1110       __POWERPC__  - BEOS
1111       __ppc__      - Darwin
1112       PPC          - old gcc, GNU/Linux, SysV
1113   The plain PPC test was not good for vxWorks, since PPC is defined on all
1114   CPUs there (eg. m68k too), as a constant one is expected to compare
1115   CPU_FAMILY against.
1116
1117   At any rate, this was pretty unattractive and a bit fragile.  The use of
1118   HAVE_HOST_CPU_FAMILY is designed to cut through it all and be sure of
1119   getting the desired effect.
1120
1121   ENHANCE-ME: We should test _IBMR2 here when we add assembly support for
1122   the system vendor compilers.  (Is that vendor compilers with inline asm,
1123   or what?)  */
1124
1125#if (HAVE_HOST_CPU_FAMILY_power || HAVE_HOST_CPU_FAMILY_powerpc)        \
1126  && W_TYPE_SIZE == 32
1127#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1128  do {									\
1129    if (__builtin_constant_p (bh) && (bh) == 0)				\
1130      __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{aze|addze} %0,%2"		\
1131	     : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\
1132    else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0)		\
1133      __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{ame|addme} %0,%2"		\
1134	     : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\
1135    else								\
1136      __asm__ ("{a%I5|add%I5c} %1,%4,%5\n\t{ae|adde} %0,%2,%3"		\
1137	     : "=r" (sh), "=&r" (sl)					\
1138	     : "r" (ah), "r" (bh), "%r" (al), "rI" (bl));		\
1139  } while (0)
1140#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1141  do {									\
1142    if (__builtin_constant_p (ah) && (ah) == 0)				\
1143      __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfze|subfze} %0,%2"	\
1144	       : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\
1145    else if (__builtin_constant_p (ah) && (ah) == ~(USItype) 0)		\
1146      __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfme|subfme} %0,%2"	\
1147	       : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\
1148    else if (__builtin_constant_p (bh) && (bh) == 0)			\
1149      __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{ame|addme} %0,%2"		\
1150	       : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\
1151    else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0)		\
1152      __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{aze|addze} %0,%2"		\
1153	       : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\
1154    else								\
1155      __asm__ ("{sf%I4|subf%I4c} %1,%5,%4\n\t{sfe|subfe} %0,%3,%2"	\
1156	       : "=r" (sh), "=&r" (sl)					\
1157	       : "r" (ah), "r" (bh), "rI" (al), "r" (bl));		\
1158  } while (0)
1159#define count_leading_zeros(count, x) \
1160  __asm__ ("{cntlz|cntlzw} %0,%1" : "=r" (count) : "r" (x))
1161#define COUNT_LEADING_ZEROS_0 32
1162#if HAVE_HOST_CPU_FAMILY_powerpc
1163#if __GMP_GNUC_PREREQ (4,4)
1164#define umul_ppmm(w1, w0, u, v) \
1165  do {									\
1166    UDItype __ll = (UDItype)(u) * (v);					\
1167    w1 = __ll >> 32;							\
1168    w0 = __ll;								\
1169  } while (0)
1170#endif
1171#if !defined (umul_ppmm)
1172#define umul_ppmm(ph, pl, m0, m1) \
1173  do {									\
1174    USItype __m0 = (m0), __m1 = (m1);					\
1175    __asm__ ("mulhwu %0,%1,%2" : "=r" (ph) : "%r" (m0), "r" (m1));	\
1176    (pl) = __m0 * __m1;							\
1177  } while (0)
1178#endif
1179#define UMUL_TIME 15
1180#define smul_ppmm(ph, pl, m0, m1) \
1181  do {									\
1182    SItype __m0 = (m0), __m1 = (m1);					\
1183    __asm__ ("mulhw %0,%1,%2" : "=r" (ph) : "%r" (m0), "r" (m1));	\
1184    (pl) = __m0 * __m1;							\
1185  } while (0)
1186#define SMUL_TIME 14
1187#define UDIV_TIME 120
1188#else
1189#define UMUL_TIME 8
1190#define smul_ppmm(xh, xl, m0, m1) \
1191  __asm__ ("mul %0,%2,%3" : "=r" (xh), "=q" (xl) : "r" (m0), "r" (m1))
1192#define SMUL_TIME 4
1193#define sdiv_qrnnd(q, r, nh, nl, d) \
1194  __asm__ ("div %0,%2,%4" : "=r" (q), "=q" (r) : "r" (nh), "1" (nl), "r" (d))
1195#define UDIV_TIME 100
1196#endif
1197#endif /* 32-bit POWER architecture variants.  */
1198
1199/* We should test _IBMR2 here when we add assembly support for the system
1200   vendor compilers.  */
1201#if HAVE_HOST_CPU_FAMILY_powerpc && W_TYPE_SIZE == 64
1202#if !defined (_LONG_LONG_LIMB)
1203/* _LONG_LONG_LIMB is ABI=mode32 where adde operates on 32-bit values.  So
1204   use adde etc only when not _LONG_LONG_LIMB.  */
1205#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1206  do {									\
1207    if (__builtin_constant_p (bh) && (bh) == 0)				\
1208      __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{aze|addze} %0,%2"		\
1209	     : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\
1210    else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0)		\
1211      __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{ame|addme} %0,%2"		\
1212	     : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\
1213    else								\
1214      __asm__ ("{a%I5|add%I5c} %1,%4,%5\n\t{ae|adde} %0,%2,%3"		\
1215	     : "=r" (sh), "=&r" (sl)					\
1216	     : "r" (ah), "r" (bh), "%r" (al), "rI" (bl));		\
1217  } while (0)
1218/* We use "*rI" for the constant operand here, since with just "I", gcc barfs.
1219   This might seem strange, but gcc folds away the dead code late.  */
1220#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1221  do {									      \
1222    if (__builtin_constant_p (bl) && bl > -0x8000 && bl <= 0x8000) {	      \
1223	if (__builtin_constant_p (ah) && (ah) == 0)			      \
1224	  __asm__ ("{ai|addic} %1,%3,%4\n\t{sfze|subfze} %0,%2"		      \
1225		   : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "*rI" (-bl)); \
1226	else if (__builtin_constant_p (ah) && (ah) == ~(UDItype) 0)	      \
1227	  __asm__ ("{ai|addic} %1,%3,%4\n\t{sfme|subfme} %0,%2"		      \
1228		   : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "*rI" (-bl)); \
1229	else if (__builtin_constant_p (bh) && (bh) == 0)		      \
1230	  __asm__ ("{ai|addic} %1,%3,%4\n\t{ame|addme} %0,%2"		      \
1231		   : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "*rI" (-bl)); \
1232	else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0)	      \
1233	  __asm__ ("{ai|addic} %1,%3,%4\n\t{aze|addze} %0,%2"		      \
1234		   : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "*rI" (-bl)); \
1235	else								      \
1236	  __asm__ ("{ai|addic} %1,%4,%5\n\t{sfe|subfe} %0,%3,%2"	      \
1237		   : "=r" (sh), "=&r" (sl)				      \
1238		   : "r" (ah), "r" (bh), "rI" (al), "*rI" (-bl));	      \
1239      } else {								      \
1240	if (__builtin_constant_p (ah) && (ah) == 0)			      \
1241	  __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfze|subfze} %0,%2"	      \
1242		   : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));  \
1243	else if (__builtin_constant_p (ah) && (ah) == ~(UDItype) 0)	      \
1244	  __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfme|subfme} %0,%2"	      \
1245		   : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));  \
1246	else if (__builtin_constant_p (bh) && (bh) == 0)		      \
1247	  __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{ame|addme} %0,%2"	      \
1248		   : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));  \
1249	else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0)	      \
1250	  __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{aze|addze} %0,%2"	      \
1251		   : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));  \
1252	else								      \
1253	  __asm__ ("{sf%I4|subf%I4c} %1,%5,%4\n\t{sfe|subfe} %0,%3,%2"	      \
1254		   : "=r" (sh), "=&r" (sl)				      \
1255		   : "r" (ah), "r" (bh), "rI" (al), "r" (bl));		      \
1256      }									      \
1257  } while (0)
1258#endif /* ! _LONG_LONG_LIMB */
1259#define count_leading_zeros(count, x) \
1260  __asm__ ("cntlzd %0,%1" : "=r" (count) : "r" (x))
1261#define COUNT_LEADING_ZEROS_0 64
1262#if __GMP_GNUC_PREREQ (4,4)
1263#define umul_ppmm(w1, w0, u, v) \
1264  do {									\
1265    typedef unsigned int __ll_UTItype __attribute__((mode(TI)));	\
1266    __ll_UTItype __ll = (__ll_UTItype)(u) * (v);			\
1267    w1 = __ll >> 64;							\
1268    w0 = __ll;								\
1269  } while (0)
1270#endif
1271#if !defined (umul_ppmm)
1272#define umul_ppmm(ph, pl, m0, m1) \
1273  do {									\
1274    UDItype __m0 = (m0), __m1 = (m1);					\
1275    __asm__ ("mulhdu %0,%1,%2" : "=r" (ph) : "%r" (m0), "r" (m1));	\
1276    (pl) = __m0 * __m1;							\
1277  } while (0)
1278#endif
1279#define UMUL_TIME 15
1280#define smul_ppmm(ph, pl, m0, m1) \
1281  do {									\
1282    DItype __m0 = (m0), __m1 = (m1);					\
1283    __asm__ ("mulhd %0,%1,%2" : "=r" (ph) : "%r" (m0), "r" (m1));	\
1284    (pl) = __m0 * __m1;							\
1285  } while (0)
1286#define SMUL_TIME 14  /* ??? */
1287#define UDIV_TIME 120 /* ??? */
1288#endif /* 64-bit PowerPC.  */
1289
1290#if defined (__pyr__) && W_TYPE_SIZE == 32
1291#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1292  __asm__ ("addw %5,%1\n\taddwc %3,%0"					\
1293	   : "=r" (sh), "=&r" (sl)					\
1294	   : "0"  ((USItype)(ah)), "g" ((USItype)(bh)),			\
1295	     "%1" ((USItype)(al)), "g" ((USItype)(bl)))
1296#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1297  __asm__ ("subw %5,%1\n\tsubwb %3,%0"					\
1298	   : "=r" (sh), "=&r" (sl)					\
1299	   : "0" ((USItype)(ah)), "g" ((USItype)(bh)),			\
1300	     "1" ((USItype)(al)), "g" ((USItype)(bl)))
1301/* This insn works on Pyramids with AP, XP, or MI CPUs, but not with SP.  */
1302#define umul_ppmm(w1, w0, u, v) \
1303  ({union {UDItype __ll;						\
1304	   struct {USItype __h, __l;} __i;				\
1305	  } __x;							\
1306  __asm__ ("movw %1,%R0\n\tuemul %2,%0"					\
1307	   : "=&r" (__x.__ll)						\
1308	   : "g" ((USItype) (u)), "g" ((USItype)(v)));			\
1309  (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
1310#endif /* __pyr__ */
1311
1312#if defined (__ibm032__) /* RT/ROMP */  && W_TYPE_SIZE == 32
1313#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1314  __asm__ ("a %1,%5\n\tae %0,%3"					\
1315	   : "=r" (sh), "=&r" (sl)					\
1316	   : "0"  ((USItype)(ah)), "r" ((USItype)(bh)),			\
1317	     "%1" ((USItype)(al)), "r" ((USItype)(bl)))
1318#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1319  __asm__ ("s %1,%5\n\tse %0,%3"					\
1320	   : "=r" (sh), "=&r" (sl)					\
1321	   : "0" ((USItype)(ah)), "r" ((USItype)(bh)),			\
1322	     "1" ((USItype)(al)), "r" ((USItype)(bl)))
1323#define smul_ppmm(ph, pl, m0, m1) \
1324  __asm__ (								\
1325       "s	r2,r2\n"						\
1326"	mts r10,%2\n"							\
1327"	m	r2,%3\n"						\
1328"	m	r2,%3\n"						\
1329"	m	r2,%3\n"						\
1330"	m	r2,%3\n"						\
1331"	m	r2,%3\n"						\
1332"	m	r2,%3\n"						\
1333"	m	r2,%3\n"						\
1334"	m	r2,%3\n"						\
1335"	m	r2,%3\n"						\
1336"	m	r2,%3\n"						\
1337"	m	r2,%3\n"						\
1338"	m	r2,%3\n"						\
1339"	m	r2,%3\n"						\
1340"	m	r2,%3\n"						\
1341"	m	r2,%3\n"						\
1342"	m	r2,%3\n"						\
1343"	cas	%0,r2,r0\n"						\
1344"	mfs	r10,%1"							\
1345	   : "=r" (ph), "=r" (pl)					\
1346	   : "%r" ((USItype)(m0)), "r" ((USItype)(m1))			\
1347	   : "r2")
1348#define UMUL_TIME 20
1349#define UDIV_TIME 200
1350#define count_leading_zeros(count, x) \
1351  do {									\
1352    if ((x) >= 0x10000)							\
1353      __asm__ ("clz	%0,%1"						\
1354	       : "=r" (count) : "r" ((USItype)(x) >> 16));		\
1355    else								\
1356      {									\
1357	__asm__ ("clz	%0,%1"						\
1358		 : "=r" (count) : "r" ((USItype)(x)));			\
1359	(count) += 16;							\
1360      }									\
1361  } while (0)
1362#endif /* RT/ROMP */
1363
1364#if defined (__sh2__) && W_TYPE_SIZE == 32
1365#define umul_ppmm(w1, w0, u, v) \
1366  __asm__ ("dmulu.l %2,%3\n\tsts macl,%1\n\tsts mach,%0"		\
1367	   : "=r" (w1), "=r" (w0) : "r" (u), "r" (v) : "macl", "mach")
1368#define UMUL_TIME 5
1369#endif
1370
1371#if defined (__sparc__) && W_TYPE_SIZE == 32
1372#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1373  __asm__ ("addcc %r4,%5,%1\n\taddx %r2,%3,%0"				\
1374	   : "=r" (sh), "=&r" (sl)					\
1375	   : "rJ" (ah), "rI" (bh),"%rJ" (al), "rI" (bl)			\
1376	   __CLOBBER_CC)
1377#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1378  __asm__ ("subcc %r4,%5,%1\n\tsubx %r2,%3,%0"				\
1379	   : "=r" (sh), "=&r" (sl)					\
1380	   : "rJ" (ah), "rI" (bh), "rJ" (al), "rI" (bl)	\
1381	   __CLOBBER_CC)
/* Note: the following FIXME comes from GMP, thus it does not make sense to
   try to resolve it in MPFR. */
1384/* FIXME: When gcc -mcpu=v9 is used on solaris, gcc/config/sol2-sld-64.h
1385   doesn't define anything to indicate that to us, it only sets __sparcv8. */
1386#if defined (__sparc_v9__) || defined (__sparcv9)
1387/* Perhaps we should use floating-point operations here?  */
1388#if 0
1389/* Triggers a bug making mpz/tests/t-gcd.c fail.
1390   Perhaps we simply need explicitly zero-extend the inputs?  */
1391#define umul_ppmm(w1, w0, u, v) \
1392  __asm__ ("mulx %2,%3,%%g1; srl %%g1,0,%1; srlx %%g1,32,%0" :		\
1393	   "=r" (w1), "=r" (w0) : "r" (u), "r" (v) : "g1")
1394#else
1395/* Use v8 umul until above bug is fixed.  */
1396#define umul_ppmm(w1, w0, u, v) \
1397  __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
1398#endif
/* Use a plain v8 divide for v9.  */
#define udiv_qrnnd(q, r, n1, n0, d) \
  do {									\
    USItype __q;							\
    __asm__ ("mov %1,%%y;nop;nop;nop;udiv %2,%3,%0"			\
	     : "=r" (__q) : "r" (n1), "r" (n0), "r" (d));		\
    (r) = (n0) - __q * (d);						\
    (q) = __q;								\
  } while (0)
#else
#if defined (__sparc_v8__)   /* gcc normal */				\
  || defined (__sparcv8)     /* gcc solaris */				\
  || HAVE_HOST_CPU_supersparc
/* Don't match immediate range because, 1) it is not often useful,
   2) the 'I' flag thinks of the range as a 13 bit signed interval,
   while we want to match a 13 bit interval, sign extended to 32 bits,
   but INTERPRETED AS UNSIGNED.  */
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
#define UMUL_TIME 5

#if HAVE_HOST_CPU_supersparc
#define UDIV_TIME 60		/* SuperSPARC timing */
#else
/* Don't use this on SuperSPARC because its udiv only handles 53 bit
   dividends and will trap to the kernel for the rest. */
#define udiv_qrnnd(q, r, n1, n0, d) \
  do {									\
    USItype __q;							\
    __asm__ ("mov %1,%%y;nop;nop;nop;udiv %2,%3,%0"			\
	     : "=r" (__q) : "r" (n1), "r" (n0), "r" (d));		\
    (r) = (n0) - __q * (d);						\
    (q) = __q;								\
  } while (0)
#define UDIV_TIME 25
#endif /* HAVE_HOST_CPU_supersparc */

#else /* ! __sparc_v8__ */
#if defined (__sparclite__)
/* This has hardware multiply but not divide.  It also has two additional
   instructions scan (ffs from high bit) and divscc.  */
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
#define UMUL_TIME 5
#define udiv_qrnnd(q, r, n1, n0, d) \
  __asm__ ("! Inlined udiv_qrnnd\n"					\
"	wr	%%g0,%2,%%y	! Not a delayed write for sparclite\n"	\
"	tst	%%g0\n"							\
"	divscc	%3,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%0\n"						\
"	rd	%%y,%1\n"						\
"	bl,a 1f\n"							\
"	add	%1,%4,%1\n"						\
"1:	! End of inline udiv_qrnnd"					\
	   : "=r" (q), "=r" (r) : "r" (n1), "r" (n0), "rI" (d)		\
	   : "%g1" __AND_CLOBBER_CC)
#define UDIV_TIME 37
#define count_leading_zeros(count, x) \
  __asm__ ("scan %1,1,%0" : "=r" (count) : "r" (x))
/* Early sparclites return 63 for an argument of 0, but they warn that future
   implementations might change this.  Therefore, leave COUNT_LEADING_ZEROS_0
   undefined.  */
#endif /* __sparclite__ */
#endif /* __sparc_v8__ */
#endif /* __sparc_v9__ */
/* Default to sparc v7 versions of umul_ppmm and udiv_qrnnd.  */
#ifndef umul_ppmm
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("! Inlined umul_ppmm\n"					\
"	wr	%%g0,%2,%%y	! SPARC has 0-3 delay insn after a wr\n" \
"	sra	%3,31,%%g2	! Don't move this insn\n"		\
"	and	%2,%%g2,%%g2	! Don't move this insn\n"		\
"	andcc	%%g0,0,%%g1	! Don't move this insn\n"		\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,0,%%g1\n"						\
"	add	%%g1,%%g2,%0\n"						\
"	rd	%%y,%1"							\
	   : "=r" (w1), "=r" (w0) : "%rI" (u), "r" (v)			\
	   : "%g1", "%g2" __AND_CLOBBER_CC)
#define UMUL_TIME 39		/* 39 instructions */
#endif
#ifndef udiv_qrnnd
#ifndef LONGLONG_STANDALONE
#define udiv_qrnnd(q, r, n1, n0, d) \
  do { UWtype __r;							\
    (q) = __MPN(udiv_qrnnd) (&__r, (n1), (n0), (d));			\
    (r) = __r;								\
  } while (0)
extern UWtype __MPN(udiv_qrnnd) _PROTO ((UWtype *, UWtype, UWtype, UWtype));
#ifndef UDIV_TIME
#define UDIV_TIME 140
#endif
#endif /* LONGLONG_STANDALONE */
#endif /* udiv_qrnnd */
#endif /* __sparc__ */

#if defined (__sparc__) && W_TYPE_SIZE == 64
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ (								\
       "addcc	%r4,%5,%1\n"						\
      "	addccc	%r6,%7,%%g0\n"						\
      "	addc	%r2,%3,%0"						\
	  : "=r" (sh), "=&r" (sl)					\
	  : "rJ" (ah), "rI" (bh), "%rJ" (al), "rI" (bl),		\
	    "%rJ" ((al) >> 32), "rI" ((bl) >> 32)			\
	   __CLOBBER_CC)
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ (								\
       "subcc	%r4,%5,%1\n"						\
      "	subccc	%r6,%7,%%g0\n"						\
      "	subc	%r2,%3,%0"						\
	  : "=r" (sh), "=&r" (sl)					\
	  : "rJ" (ah), "rI" (bh), "rJ" (al), "rI" (bl),			\
	    "rJ" ((al) >> 32), "rI" ((bl) >> 32)			\
	   __CLOBBER_CC)
#endif

#if defined (__vax__) && W_TYPE_SIZE == 32
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("addl2 %5,%1\n\tadwc %3,%0"					\
	   : "=g" (sh), "=&g" (sl)					\
	   : "0"  ((USItype)(ah)), "g" ((USItype)(bh)),			\
	     "%1" ((USItype)(al)), "g" ((USItype)(bl)))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("subl2 %5,%1\n\tsbwc %3,%0"					\
	   : "=g" (sh), "=&g" (sl)					\
	   : "0" ((USItype)(ah)), "g" ((USItype)(bh)),			\
	     "1" ((USItype)(al)), "g" ((USItype)(bl)))
#define smul_ppmm(xh, xl, m0, m1) \
  do {									\
    union {UDItype __ll;						\
	   struct {USItype __l, __h;} __i;				\
	  } __x;							\
    USItype __m0 = (m0), __m1 = (m1);					\
    __asm__ ("emul %1,%2,$0,%0"						\
	     : "=g" (__x.__ll) : "g" (__m0), "g" (__m1));		\
    (xh) = __x.__i.__h; (xl) = __x.__i.__l;				\
  } while (0)
#define sdiv_qrnnd(q, r, n1, n0, d) \
  do {									\
    union {DItype __ll;							\
	   struct {SItype __l, __h;} __i;				\
	  } __x;							\
    __x.__i.__h = n1; __x.__i.__l = n0;					\
    __asm__ ("ediv %3,%2,%0,%1"						\
	     : "=g" (q), "=g" (r) : "g" (__x.__ll), "g" (d));		\
  } while (0)
#if 0
/* Note: the following FIXME comes from GMP, thus it does not make sense to
   try to resolve it in MPFR. */
/* FIXME: This instruction appears to be unimplemented on some systems (vax
   8800 maybe). */
#define count_trailing_zeros(count,x)					\
  do {									\
    __asm__ ("ffs 0, 31, %1, %0"					\
	     : "=g" (count)						\
	     : "g" ((USItype) (x)));					\
  } while (0)
#endif
#endif /* __vax__ */

#if defined (__z8000__) && W_TYPE_SIZE == 16
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("add	%H1,%H5\n\tadc	%H0,%H3"				\
	   : "=r" (sh), "=&r" (sl)					\
	   : "0"  ((unsigned int)(ah)), "r" ((unsigned int)(bh)),	\
	     "%1" ((unsigned int)(al)), "rQR" ((unsigned int)(bl)))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("sub	%H1,%H5\n\tsbc	%H0,%H3"				\
	   : "=r" (sh), "=&r" (sl)					\
	   : "0" ((unsigned int)(ah)), "r" ((unsigned int)(bh)),	\
	     "1" ((unsigned int)(al)), "rQR" ((unsigned int)(bl)))
#define umul_ppmm(xh, xl, m0, m1) \
  do {									\
    union {long int __ll;						\
	   struct {unsigned int __h, __l;} __i;				\
	  } __x;							\
    unsigned int __m0 = (m0), __m1 = (m1);				\
    __asm__ ("mult	%S0,%H3"					\
	     : "=r" (__x.__i.__h), "=r" (__x.__i.__l)			\
	     : "%1" (m0), "rQR" (m1));					\
    (xh) = __x.__i.__h; (xl) = __x.__i.__l;				\
    (xh) += ((((signed int) __m0 >> 15) & __m1)				\
	     + (((signed int) __m1 >> 15) & __m0));			\
  } while (0)
#endif /* __z8000__ */

#endif /* __GNUC__ */

#endif /* NO_ASM */


#if !defined (umul_ppmm) && defined (__umulsidi3)
#define umul_ppmm(ph, pl, m0, m1) \
  {									\
    UDWtype __ll = __umulsidi3 (m0, m1);				\
    ph = (UWtype) (__ll >> W_TYPE_SIZE);				\
    pl = (UWtype) __ll;							\
  }
#endif

#if !defined (__umulsidi3)
#define __umulsidi3(u, v) \
  ({UWtype __hi, __lo;							\
    umul_ppmm (__hi, __lo, u, v);					\
    ((UDWtype) __hi << W_TYPE_SIZE) | __lo; })
#endif
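
/* Purely illustrative sketch (never compiled, hence the "#if 0"): how
   umul_ppmm and __umulsidi3 relate.  The high and low words produced by
   umul_ppmm recombine into the double-word value returned by __umulsidi3.
   The function name is hypothetical; UWtype, UDWtype and W_TYPE_SIZE are
   assumed to be defined by the includer as described at the top of this
   file.  */
#if 0
static void
example_umulsidi3_roundtrip (UWtype u, UWtype v)
{
  UWtype __hi, __lo;
  UDWtype __prod;

  umul_ppmm (__hi, __lo, u, v);     /* product as a (high, low) word pair */
  __prod = __umulsidi3 (u, v);      /* same product as one double word */

  ASSERT (__hi == (UWtype) (__prod >> W_TYPE_SIZE));
  ASSERT (__lo == (UWtype) __prod);
}
#endif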


/* Use mpn_umul_ppmm or mpn_udiv_qrnnd functions, if they exist.  The "_r"
   forms have "reversed" arguments, meaning the pointer is last, which
   sometimes allows better parameter passing, in particular on 64-bit
   hppa. */

#define mpn_umul_ppmm  __MPN(umul_ppmm)
extern UWtype mpn_umul_ppmm _PROTO ((UWtype *, UWtype, UWtype));

#if ! defined (umul_ppmm) && HAVE_NATIVE_mpn_umul_ppmm  \
  && ! defined (LONGLONG_STANDALONE)
#define umul_ppmm(wh, wl, u, v)						      \
  do {									      \
    UWtype __umul_ppmm__p0;						      \
    (wh) = mpn_umul_ppmm (&__umul_ppmm__p0, (UWtype) (u), (UWtype) (v));      \
    (wl) = __umul_ppmm__p0;						      \
  } while (0)
#endif

#define mpn_umul_ppmm_r  __MPN(umul_ppmm_r)
extern UWtype mpn_umul_ppmm_r _PROTO ((UWtype, UWtype, UWtype *));

#if ! defined (umul_ppmm) && HAVE_NATIVE_mpn_umul_ppmm_r	\
  && ! defined (LONGLONG_STANDALONE)
#define umul_ppmm(wh, wl, u, v)						      \
  do {									      \
    UWtype __umul_ppmm__p0;						      \
    (wh) = mpn_umul_ppmm_r ((UWtype) (u), (UWtype) (v), &__umul_ppmm__p0);    \
    (wl) = __umul_ppmm__p0;						      \
  } while (0)
#endif
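
/* Purely illustrative sketch (never compiled): the two calling conventions
   described above.  mpn_umul_ppmm takes the pointer to the low word first,
   while the "_r" form takes it last.  The function name is hypothetical, and
   the calls are only meaningful when the corresponding HAVE_NATIVE_mpn_*
   symbols are set by the includer.  */
#if 0
static void
example_umul_ppmm_conventions (UWtype u, UWtype v)
{
  UWtype lo1, lo2, hi1, hi2;

  hi1 = mpn_umul_ppmm (&lo1, u, v);      /* pointer argument first */
  hi2 = mpn_umul_ppmm_r (u, v, &lo2);    /* pointer argument last ("reversed") */

  ASSERT (hi1 == hi2 && lo1 == lo2);
}
#endif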

#define mpn_udiv_qrnnd  __MPN(udiv_qrnnd)
extern UWtype mpn_udiv_qrnnd _PROTO ((UWtype *, UWtype, UWtype, UWtype));

#if ! defined (udiv_qrnnd) && HAVE_NATIVE_mpn_udiv_qrnnd	\
  && ! defined (LONGLONG_STANDALONE)
#define udiv_qrnnd(q, r, n1, n0, d)					\
  do {									\
    UWtype __udiv_qrnnd__r;						\
    (q) = mpn_udiv_qrnnd (&__udiv_qrnnd__r,				\
			  (UWtype) (n1), (UWtype) (n0), (UWtype) d);	\
    (r) = __udiv_qrnnd__r;						\
  } while (0)
#endif

#define mpn_udiv_qrnnd_r  __MPN(udiv_qrnnd_r)
extern UWtype mpn_udiv_qrnnd_r _PROTO ((UWtype, UWtype, UWtype, UWtype *));

#if ! defined (udiv_qrnnd) && HAVE_NATIVE_mpn_udiv_qrnnd_r	\
  && ! defined (LONGLONG_STANDALONE)
#define udiv_qrnnd(q, r, n1, n0, d)					\
  do {									\
    UWtype __udiv_qrnnd__r;						\
    (q) = mpn_udiv_qrnnd_r ((UWtype) (n1), (UWtype) (n0), (UWtype) d,	\
			    &__udiv_qrnnd__r);				\
    (r) = __udiv_qrnnd__r;						\
  } while (0)
#endif


/* If this machine has no inline assembler, use C macros.  */

#if !defined (add_ssaaaa)
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  do {									\
    UWtype __x;								\
    __x = (al) + (bl);							\
    (sh) = (ah) + (bh) + (__x < (al));					\
    (sl) = __x;								\
  } while (0)
#endif

#if !defined (sub_ddmmss)
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  do {									\
    UWtype __x;								\
    __x = (al) - (bl);							\
    (sh) = (ah) - (bh) - ((al) < (bl));					\
    (sl) = __x;								\
  } while (0)
#endif
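
/* Purely illustrative sketch (never compiled): using the two macros above to
   add and subtract double-word numbers held as (high, low) UWtype pairs.  In
   the C fallbacks, (__x < (al)) detects the carry out of the low-word
   addition and ((al) < (bl)) detects the borrow.  The function name is
   hypothetical.  */
#if 0
static void
example_double_word_add_sub (UWtype ah, UWtype al, UWtype bh, UWtype bl)
{
  UWtype sh, sl, dh, dl;

  add_ssaaaa (sh, sl, ah, al, bh, bl);   /* (sh,sl) = (ah,al) + (bh,bl) */
  sub_ddmmss (dh, dl, sh, sl, bh, bl);   /* (dh,dl) = (sh,sl) - (bh,bl) */

  /* Adding and then subtracting (bh,bl) gives back (ah,al), modulo 2^(2*W).  */
  ASSERT (dh == ah && dl == al);
}
#endif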

/* If we lack umul_ppmm but have smul_ppmm, define umul_ppmm in terms of
   smul_ppmm.  */
#if !defined (umul_ppmm) && defined (smul_ppmm)
#define umul_ppmm(w1, w0, u, v)						\
  do {									\
    UWtype __w1;							\
    UWtype __xm0 = (u), __xm1 = (v);					\
    smul_ppmm (__w1, w0, __xm0, __xm1);					\
    (w1) = __w1 + (-(__xm0 >> (W_TYPE_SIZE - 1)) & __xm1)		\
		+ (-(__xm1 >> (W_TYPE_SIZE - 1)) & __xm0);		\
  } while (0)
#endif
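
/* The correction above uses the identity, modulo 2^(2*W_TYPE_SIZE):
   u * v  =  (signed)u * (signed)v
             + 2^W_TYPE_SIZE * ((u >> (W_TYPE_SIZE-1)) ? v : 0)
             + 2^W_TYPE_SIZE * ((v >> (W_TYPE_SIZE-1)) ? u : 0),
   so only the high word needs adjusting.  A standalone check of this at a
   hypothetical 16-bit word size using exact 32-bit arithmetic; the function
   name is illustrative only and the block is never compiled.  */
#if 0
#include <stdint.h>
#include <assert.h>

static void
example_umul_from_smul_correction (uint16_t u, uint16_t v)
{
  int32_t  sprod = (int32_t) (int16_t) u * (int32_t) (int16_t) v;
  uint32_t uprod = (uint32_t) u * v;
  uint16_t w1 = (uint16_t) ((uint32_t) sprod >> 16);  /* signed high word */

  /* Add v back if u's sign bit was set, and u back if v's sign bit was set.  */
  w1 += (uint16_t) (-(u >> 15) & v);
  w1 += (uint16_t) (-(v >> 15) & u);

  assert (w1 == (uint16_t) (uprod >> 16));        /* high words now agree */
  assert ((uint16_t) sprod == (uint16_t) uprod);  /* low words always agree */
}
#endif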

/* If we still don't have umul_ppmm, define it using plain C.

   For reference, when this code is used for squaring (i.e. u and v identical
   expressions), gcc recognises __x1 and __x2 are the same and generates 3
   multiplies, not 4.  The subsequent additions could be optimized a bit,
   but the only place GMP currently uses such a square is mpn_sqr_basecase,
   and chips obliged to use this generic C umul will have plenty of worse
   performance problems than a couple of extra instructions on the diagonal
   of sqr_basecase.  */

#if !defined (umul_ppmm)
#define umul_ppmm(w1, w0, u, v)						\
  do {									\
    UWtype __x0, __x1, __x2, __x3;					\
    UHWtype __ul, __vl, __uh, __vh;					\
    UWtype __u = (u), __v = (v);					\
									\
    __ul = __ll_lowpart (__u);						\
    __uh = __ll_highpart (__u);						\
    __vl = __ll_lowpart (__v);						\
    __vh = __ll_highpart (__v);						\
									\
    __x0 = (UWtype) __ul * __vl;					\
    __x1 = (UWtype) __ul * __vh;					\
    __x2 = (UWtype) __uh * __vl;					\
    __x3 = (UWtype) __uh * __vh;					\
									\
    __x1 += __ll_highpart (__x0);/* this can't give carry */		\
    __x1 += __x2;		/* but this indeed can */		\
    if (__x1 < __x2)		/* did we get it? */			\
      __x3 += __ll_B;		/* yes, add it in the proper pos. */	\
									\
    (w1) = __x3 + __ll_highpart (__x1);					\
    (w0) = (__x1 << W_TYPE_SIZE/2) + __ll_lowpart (__x0);		\
  } while (0)
#endif
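
/* The macro above is the classical half-word schoolbook multiplication:
   split each operand into high and low halves, form the four partial
   products, and fold the single possible carry of the middle sum into the
   high product.  A standalone sketch of the same scheme at a hypothetical
   32-bit word size (16-bit halves), checked against a native 64-bit product;
   the function name is illustrative only and the block is never compiled.  */
#if 0
#include <stdint.h>
#include <assert.h>

static void
example_schoolbook_umul (uint32_t u, uint32_t v)
{
  uint32_t ul = u & 0xffff, uh = u >> 16;   /* low/high halves of u */
  uint32_t vl = v & 0xffff, vh = v >> 16;   /* low/high halves of v */
  uint32_t x0, x1, x2, x3, w1, w0;

  x0 = ul * vl;                             /* low  * low  */
  x1 = ul * vh;                             /* low  * high */
  x2 = uh * vl;                             /* high * low  */
  x3 = uh * vh;                             /* high * high */

  x1 += x0 >> 16;                           /* cannot overflow */
  x1 += x2;                                 /* but this can */
  if (x1 < x2)                              /* carry out of the middle sum */
    x3 += (uint32_t) 1 << 16;               /* goes into the high product */

  w1 = x3 + (x1 >> 16);
  w0 = (x1 << 16) + (x0 & 0xffff);

  assert (((uint64_t) w1 << 32 | w0) == (uint64_t) u * v);
}
#endif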

/* If we don't have smul_ppmm, define it using umul_ppmm (which surely will
   exist in one form or another).  */
#if !defined (smul_ppmm)
#define smul_ppmm(w1, w0, u, v)						\
  do {									\
    UWtype __w1;							\
    UWtype __xm0 = (u), __xm1 = (v);					\
    umul_ppmm (__w1, w0, __xm0, __xm1);					\
    (w1) = __w1 - (-(__xm0 >> (W_TYPE_SIZE - 1)) & __xm1)		\
		- (-(__xm1 >> (W_TYPE_SIZE - 1)) & __xm0);		\
  } while (0)
#endif
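
/* Mirror of the previous correction: subtracting the same two terms from the
   unsigned high word recovers the signed (two's complement) high word.  A
   minimal standalone check at a hypothetical 16-bit word size; illustrative
   only, never compiled.  */
#if 0
#include <stdint.h>
#include <assert.h>

static void
example_smul_from_umul_correction (uint16_t u, uint16_t v)
{
  int32_t  sprod = (int32_t) (int16_t) u * (int32_t) (int16_t) v;
  uint16_t w1 = (uint16_t) (((uint32_t) u * v) >> 16); /* unsigned high word */

  w1 -= (uint16_t) (-(u >> 15) & v);   /* undo u's sign contribution */
  w1 -= (uint16_t) (-(v >> 15) & u);   /* undo v's sign contribution */

  assert (w1 == (uint16_t) ((uint32_t) sprod >> 16));
}
#endif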

/* Define this unconditionally, so it can be used for debugging.  */
#define __udiv_qrnnd_c(q, r, n1, n0, d) \
  do {									\
    UWtype __d1, __d0, __q1, __q0, __r1, __r0, __m;			\
									\
    ASSERT ((d) != 0);							\
    ASSERT ((n1) < (d));						\
									\
    __d1 = __ll_highpart (d);						\
    __d0 = __ll_lowpart (d);						\
									\
    __q1 = (n1) / __d1;							\
    __r1 = (n1) - __q1 * __d1;						\
    __m = __q1 * __d0;							\
    __r1 = __r1 * __ll_B | __ll_highpart (n0);				\
    if (__r1 < __m)							\
      {									\
	__q1--, __r1 += (d);						\
	if (__r1 >= (d)) /* i.e. we didn't get carry when adding to __r1 */\
	  if (__r1 < __m)						\
	    __q1--, __r1 += (d);					\
      }									\
    __r1 -= __m;							\
									\
    __q0 = __r1 / __d1;							\
    __r0 = __r1 - __q0 * __d1;						\
    __m = __q0 * __d0;							\
    __r0 = __r0 * __ll_B | __ll_lowpart (n0);				\
    if (__r0 < __m)							\
      {									\
	__q0--, __r0 += (d);						\
	if (__r0 >= (d))						\
	  if (__r0 < __m)						\
	    __q0--, __r0 += (d);					\
      }									\
    __r0 -= __m;							\
									\
    (q) = __q1 * __ll_B | __q0;						\
    (r) = __r0;								\
  } while (0)
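
/* Purely illustrative sketch (never compiled): exercising __udiv_qrnnd_c and
   checking its result with the other macros in this file.  The macro divides
   in two half-word steps, estimating each quotient half from the high half
   of the divisor and adjusting the estimate downward at most twice.  The
   function name is hypothetical.  */
#if 0
static void
example_udiv_qrnnd_c (UWtype n1, UWtype n0, UWtype d)
{
  UWtype q, r, ph, pl;

  /* Preconditions: d != 0 and normalized (most significant bit set), and
     n1 < d so that the quotient fits in a single word.  */
  ASSERT (d != 0 && (d >> (W_TYPE_SIZE - 1)) != 0 && n1 < d);

  __udiv_qrnnd_c (q, r, n1, n0, d);

  /* q * d + r must reconstruct the two-word numerator (n1,n0).  */
  umul_ppmm (ph, pl, q, d);
  add_ssaaaa (ph, pl, ph, pl, 0, r);
  ASSERT (ph == n1 && pl == n0 && r < d);
}
#endif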

/* If the processor has no udiv_qrnnd but sdiv_qrnnd, go through
   __udiv_w_sdiv (defined in libgcc or elsewhere).  */
#if !defined (udiv_qrnnd) && defined (sdiv_qrnnd)
#define udiv_qrnnd(q, r, nh, nl, d) \
  do {									\
    UWtype __r;								\
    (q) = __MPN(udiv_w_sdiv) (&__r, nh, nl, d);				\
    (r) = __r;								\
  } while (0)
#endif

/* If udiv_qrnnd was not defined for this processor, use __udiv_qrnnd_c.  */
#if !defined (udiv_qrnnd)
#define UDIV_NEEDS_NORMALIZATION 1
#define udiv_qrnnd __udiv_qrnnd_c
#endif
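
/* Purely illustrative sketch (never compiled): how a caller is expected to
   honour UDIV_NEEDS_NORMALIZATION.  Shift the divisor and the numerator left
   until the divisor's most significant bit is set, divide, then shift the
   remainder back; the quotient is unaffected.  The function name is
   hypothetical, and n1 < d is assumed as required by udiv_qrnnd.  */
#if 0
static void
example_normalizing_caller (UWtype *qp, UWtype *rp,
			    UWtype n1, UWtype n0, UWtype d)
{
  UWtype q, r;

  if (UDIV_NEEDS_NORMALIZATION && (d >> (W_TYPE_SIZE - 1)) == 0)
    {
      int cnt;
      count_leading_zeros (cnt, d);
      d <<= cnt;                                        /* normalize divisor */
      n1 = (n1 << cnt) | (n0 >> (W_TYPE_SIZE - cnt));   /* shift numerator */
      n0 <<= cnt;
      udiv_qrnnd (q, r, n1, n0, d);
      r >>= cnt;                                        /* undo the shift */
    }
  else
    udiv_qrnnd (q, r, n1, n0, d);

  *qp = q;
  *rp = r;
}
#endif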

#if !defined (count_leading_zeros)
#define count_leading_zeros(count, x) \
  do {									\
    UWtype __xr = (x);							\
    UWtype __a;								\
									\
    if (W_TYPE_SIZE == 32)						\
      {									\
	__a = __xr < ((UWtype) 1 << 2*__BITS4)				\
	  ? (__xr < ((UWtype) 1 << __BITS4) ? 1 : __BITS4 + 1)		\
	  : (__xr < ((UWtype) 1 << 3*__BITS4) ? 2*__BITS4 + 1		\
	  : 3*__BITS4 + 1);						\
      }									\
    else								\
      {									\
	for (__a = W_TYPE_SIZE - 8; __a > 0; __a -= 8)			\
	  if (((__xr >> __a) & 0xff) != 0)				\
	    break;							\
	++__a;								\
      }									\
									\
    (count) = W_TYPE_SIZE + 1 - __a - __clz_tab[__xr >> __a];		\
  } while (0)
/* This version gives a well-defined value for zero. */
#define COUNT_LEADING_ZEROS_0 (W_TYPE_SIZE - 1)
#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
#endif
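
/* Purely illustrative sketch (never compiled): the defining property of
   count_leading_zeros.  The generic version above narrows x to an 8-bit
   window and finishes with a __clz_tab lookup, but any definition must pass
   the checks below for nonzero x.  The function name is hypothetical.  */
#if 0
static void
example_count_leading_zeros (UWtype x)
{
  int c;

  ASSERT (x != 0);   /* undefined for 0 unless COUNT_LEADING_ZEROS_0 is set */
  count_leading_zeros (c, x);

  /* c is the number of leading zero bits: shifting left by c brings the
     highest set bit of x to the most significant position.  */
  ASSERT (((x << c) >> (W_TYPE_SIZE - 1)) == 1);
  ASSERT (c == 0 || (x >> (W_TYPE_SIZE - c)) == 0);
}
#endif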

/* clz_tab needed by mpn/x86/pentium/mod_1.asm in a fat binary */
#if HAVE_HOST_CPU_FAMILY_x86 && WANT_FAT_BINARY
#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
#endif

#ifdef COUNT_LEADING_ZEROS_NEED_CLZ_TAB
# ifdef MPFR_HAVE_GMP_IMPL
    extern const unsigned char __GMP_DECLSPEC __clz_tab[128];
# else
    extern const unsigned char __clz_tab[128];
# endif
#endif

#if !defined (count_trailing_zeros)
/* Define count_trailing_zeros using count_leading_zeros.  The latter might be
   defined in asm, but if it is not, the C version above is good enough.  */
#define count_trailing_zeros(count, x) \
  do {									\
    UWtype __ctz_x = (x);						\
    UWtype __ctz_c;							\
    ASSERT (__ctz_x != 0);						\
    count_leading_zeros (__ctz_c, __ctz_x & -__ctz_x);			\
    (count) = W_TYPE_SIZE - 1 - __ctz_c;				\
  } while (0)
#endif
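
/* The definition above relies on x & -x isolating the lowest set bit of x:
   in two's complement, all bits below that bit are zero in both x and -x,
   the bit itself is set in both, and every bit above differs, so the AND
   clears it.  The leading-zero count of that single bit then gives its
   position from the bottom.  A purely illustrative check, never compiled;
   the function name is hypothetical.  */
#if 0
static void
example_count_trailing_zeros (UWtype x)
{
  UWtype lsb = x & -x;    /* isolate the lowest set bit */
  int c;

  ASSERT (x != 0);
  count_trailing_zeros (c, x);

  ASSERT (lsb == (UWtype) 1 << c);               /* bit c is the lowest set bit */
  ASSERT ((x & (((UWtype) 1 << c) - 1)) == 0);   /* all lower bits are zero */
}
#endif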

#ifndef UDIV_NEEDS_NORMALIZATION
#define UDIV_NEEDS_NORMALIZATION 0
#endif

/* Whether udiv_qrnnd is actually implemented with udiv_qrnnd_preinv, and
   hence whether the latter should always be used.  */
#ifndef UDIV_PREINV_ALWAYS
#define UDIV_PREINV_ALWAYS 0
#endif

/* Give defaults for UMUL_TIME and UDIV_TIME.  */
#ifndef UMUL_TIME
#define UMUL_TIME 1
#endif

#ifndef UDIV_TIME
#define UDIV_TIME UMUL_TIME
#endif
