1 /* longlong.h -- definitions for mixed size 32/64 bit arithmetic.
2 
3 Copyright 1991-1994, 1996, 1997, 1999-2005, 2007-2009, 2011-2016 Free Software
4 Foundation, Inc.
5 
6 This file is part of the GNU MP Library.
7 
8 The GNU MP Library is free software; you can redistribute it and/or modify
9 it under the terms of either:
10 
11   * the GNU Lesser General Public License as published by the Free
12     Software Foundation; either version 3 of the License, or (at your
13     option) any later version.
14 
15 or
16 
17   * the GNU General Public License as published by the Free Software
18     Foundation; either version 2 of the License, or (at your option) any
19     later version.
20 
21 or both in parallel, as here.
22 
23 The GNU MP Library is distributed in the hope that it will be useful, but
24 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
25 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
26 for more details.
27 
28 You should have received copies of the GNU General Public License and the
29 GNU Lesser General Public License along with the GNU MP Library.  If not,
30 see https://www.gnu.org/licenses/.  */
31 
32 /* You have to define the following before including this file:
33 
34    UWtype -- An unsigned type, default type for operations (typically a "word")
35    UHWtype -- An unsigned type, at least half the size of UWtype
36    UDWtype -- An unsigned type, at least twice as large as UWtype
37    W_TYPE_SIZE -- size in bits of UWtype
38 
39    SItype, USItype -- Signed and unsigned 32 bit types
40    DItype, UDItype -- Signed and unsigned 64 bit types
41 
42    On a 32 bit machine UWtype should typically be USItype;
43    on a 64 bit machine, UWtype should typically be UDItype.
44 
45    Optionally, define:
46 
47    LONGLONG_STANDALONE -- Avoid code that needs machine-dependent support files
48    NO_ASM -- Disable inline asm
49 
50 
51    CAUTION!  Using this version of longlong.h outside of GMP is not safe.  You
52    need to include gmp.h and gmp-impl.h, or certain things might not work as
53    expected.
54 */
55 
56 #define __BITS4 (W_TYPE_SIZE / 4)
57 #define __ll_B ((UWtype) 1 << (W_TYPE_SIZE / 2))
58 #define __ll_lowpart(t) ((UWtype) (t) & (__ll_B - 1))
59 #define __ll_highpart(t) ((UWtype) (t) >> (W_TYPE_SIZE / 2))
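
/* With W_TYPE_SIZE == 32, for example, __ll_B is 2^16, __ll_lowpart(t) is
   the low 16 bits of t and __ll_highpart(t) the high 16 bits; full-word
   operations can then be assembled from such half-word pieces.  */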
60 
61 /* This is used to make sure that no undesirable sharing takes place between
62    different libraries that use this file.  */
63 #ifndef __MPN
64 #define __MPN(x) __##x
65 #endif
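
/* For instance, __MPN(invert_limb) expands to __invert_limb with the
   fallback above, while gmp.h installs its own prefixed form (__gmpn_ in
   current releases), so each library gets distinct symbol names.  */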
66 
67 /* Define auxiliary asm macros.
68 
69    1) umul_ppmm(high_prod, low_prod, multiplier, multiplicand) multiplies two
70    UWtype integers MULTIPLIER and MULTIPLICAND, and generates a two UWtype
71    word product in HIGH_PROD and LOW_PROD.
72 
73    2) __umulsidi3(a,b) multiplies two UWtype integers A and B, and returns a
74    UDWtype product.  This is just a variant of umul_ppmm.
75 
76    3) udiv_qrnnd(quotient, remainder, high_numerator, low_numerator,
77    denominator) divides a UDWtype, composed by the UWtype integers
78    HIGH_NUMERATOR and LOW_NUMERATOR, by DENOMINATOR and places the quotient
79    in QUOTIENT and the remainder in REMAINDER.  HIGH_NUMERATOR must be less
80    than DENOMINATOR for correct operation.  If, in addition, the most
81    than DENOMINATOR for correct operation.  If, in addition, the
82    implementation requires the most significant bit of DENOMINATOR to be 1,
83    then the pre-processor symbol UDIV_NEEDS_NORMALIZATION is defined to 1.
84    4) sdiv_qrnnd(quotient, remainder, high_numerator, low_numerator,
85    denominator).  Like udiv_qrnnd but the numbers are signed.  The quotient
86    is rounded towards 0.
87 
88    5) count_leading_zeros(count, x) counts the number of zero-bits from the
89    msb to the first non-zero bit in the UWtype X.  This is the number of
90    steps X needs to be shifted left to set the msb.  Undefined for X == 0,
91    unless the symbol COUNT_LEADING_ZEROS_0 is defined to some value.
92 
93    6) count_trailing_zeros(count, x) like count_leading_zeros, but counts
94    from the least significant end.
95 
96    7) add_ssaaaa(high_sum, low_sum, high_addend_1, low_addend_1,
97    high_addend_2, low_addend_2) adds two two-word UWtype integers, composed by
98    HIGH_ADDEND_1 and LOW_ADDEND_1, and HIGH_ADDEND_2 and LOW_ADDEND_2
99    respectively.  The result is placed in HIGH_SUM and LOW_SUM.  Overflow
100    (i.e. carry out) is not stored anywhere, and is lost.
101 
102    8) sub_ddmmss(high_difference, low_difference, high_minuend, low_minuend,
103    high_subtrahend, low_subtrahend) subtracts two two-word UWtype integers,
104    composed by HIGH_MINUEND and LOW_MINUEND, and HIGH_SUBTRAHEND and
105    LOW_SUBTRAHEND respectively.  The result is placed in HIGH_DIFFERENCE
106    and LOW_DIFFERENCE.  Overflow (i.e. carry out) is not stored anywhere,
107    and is lost.
108 
109    If any of these macros are left undefined for a particular CPU,
110    C macros are used; an illustrative sketch follows this comment block.
111 
112 
113    Notes:
114 
115    For add_ssaaaa the two high and two low addends can both commute, but
116    unfortunately gcc only supports one "%" commutative in each asm block.
117    This has always been so but is only documented in recent versions
118    (eg. pre-release 3.3).  Having two or more "%"s can cause an internal
119    compiler error in certain rare circumstances.
120 
121    Apparently it was only the last "%" that was ever actually respected, so
122    the code has been updated to leave just that.  Clearly there's a free
123    choice whether high or low should get it, if there's a reason to favour
124    one over the other.  Also obviously when the constraints on the two
125    operands are identical there's no benefit to the reloader in any "%" at
126    all.
127 
128    */
129 
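/* Illustration only: a minimal generic-style sketch of two of the
   operations documented above, in the spirit of the plain C fallbacks
   defined later in this file (which are what actually gets used when no
   asm version applies).  Names carrying a _sketch suffix are hypothetical
   and not part of GMP.  umul_ppmm_sketch splits each operand with
   __ll_highpart/__ll_lowpart and sums the four half-word products;
   udiv_2by1_sketch shows the caller side of UDIV_NEEDS_NORMALIZATION:
   shift divisor and dividend left until the divisor's top bit is set,
   divide, then shift the remainder back down.  */
#if 0
#define umul_ppmm_sketch(w1, w0, u, v)					\
  do {									\
    UWtype __u = (u), __v = (v);					\
    UWtype __ul = __ll_lowpart (__u), __uh = __ll_highpart (__u);	\
    UWtype __vl = __ll_lowpart (__v), __vh = __ll_highpart (__v);	\
    UWtype __x0 = __ul * __vl;						\
    UWtype __x1 = __ul * __vh;						\
    UWtype __x2 = __uh * __vl;						\
    UWtype __x3 = __uh * __vh;						\
    __x1 += __ll_highpart (__x0);  /* cannot carry */			\
    __x1 += __x2;		   /* but this can */			\
    if (__x1 < __x2)		   /* carry has weight __ll_B in w1 */	\
      __x3 += __ll_B;							\
    (w1) = __x3 + __ll_highpart (__x1);					\
    (w0) = (__x1 << W_TYPE_SIZE / 2) + __ll_lowpart (__x0);		\
  } while (0)

static inline UWtype
udiv_2by1_sketch (UWtype *rp, UWtype n1, UWtype n0, UWtype d)
{
  /* Divide the two-word value {n1,n0} (with n1 < d) by d; the quotient is
     returned and the remainder stored through rp.  Normalize first, as
     udiv_qrnnd may require when UDIV_NEEDS_NORMALIZATION is 1.  */
  UWtype q, r;
  int cnt;
  count_leading_zeros (cnt, d);
  if (cnt != 0)
    {
      d <<= cnt;
      n1 = (n1 << cnt) | (n0 >> (W_TYPE_SIZE - cnt));
      n0 <<= cnt;
    }
  udiv_qrnnd (q, r, n1, n0, d);
  *rp = r >> cnt;		/* un-normalize the remainder */
  return q;
}
#endif
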
130 /* The CPUs come in alphabetical order below.
131 
132    Please add support for more CPUs here, or improve the current support
133    for the CPUs below!  */
134 
135 
136 /* count_leading_zeros_gcc_clz is count_leading_zeros implemented with gcc
137    3.4 __builtin_clzl or __builtin_clzll, according to our limb size.
138    Similarly count_trailing_zeros_gcc_ctz using __builtin_ctzl or
139    __builtin_ctzll.
140 
141    These builtins are only used where we have checked what code comes out;
142    on some chips they're merely libgcc calls, in which case we instead want
143    an inline (either asm or generic C).
144 
145    These builtins are better than an asm block of the same insn, since an
146    asm block doesn't give gcc any information about scheduling or resource
147    usage.  We keep an asm block for use on prior versions of gcc though.
148 
149    For reference, __builtin_ffs existed in gcc prior to __builtin_clz, but
150    it's not used (for count_leading_zeros) because it generally gives extra
151    code to ensure the result is 0 when the input is 0, which we don't need
152    or want.  */
153 
154 #ifdef _LONG_LONG_LIMB
155 #define count_leading_zeros_gcc_clz(count,x)	\
156   do {						\
157     ASSERT ((x) != 0);				\
158     (count) = __builtin_clzll (x);		\
159   } while (0)
160 #else
161 #define count_leading_zeros_gcc_clz(count,x)	\
162   do {						\
163     ASSERT ((x) != 0);				\
164     (count) = __builtin_clzl (x);		\
165   } while (0)
166 #endif
167 
168 #ifdef _LONG_LONG_LIMB
169 #define count_trailing_zeros_gcc_ctz(count,x)	\
170   do {						\
171     ASSERT ((x) != 0);				\
172     (count) = __builtin_ctzll (x);		\
173   } while (0)
174 #else
175 #define count_trailing_zeros_gcc_ctz(count,x)	\
176   do {						\
177     ASSERT ((x) != 0);				\
178     (count) = __builtin_ctzl (x);		\
179   } while (0)
180 #endif
181 
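/* Several of the CPU sections below set COUNT_LEADING_ZEROS_NEED_CLZ_TAB,
   meaning they rely on the byte-indexed __clz_tab table; as used by that
   code, __clz_tab[v] is the bit width of v plus one, with __clz_tab[0] == 1.
   The following is a rough illustration only (an assumption about the
   generic style, not the macro this file actually defines) of a plain C
   count_leading_zeros built on that table: scan down a byte at a time,
   then finish with one table lookup.  */
#if 0
#define count_leading_zeros_sketch(count, x)				\
  do {									\
    UWtype __xr = (x), __a;						\
    ASSERT (__xr != 0);							\
    for (__a = W_TYPE_SIZE - 8; __a > 0; __a -= 8)			\
      if ((__xr >> __a) != 0)						\
	break;								\
    (count) = W_TYPE_SIZE + 1 - __a - __clz_tab[__xr >> __a];		\
  } while (0)
#endif
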
182 
183 /* FIXME: The macros using external routines like __MPN(count_leading_zeros)
184    don't need to be under !NO_ASM */
185 #if ! defined (NO_ASM)
186 
187 #if defined (__alpha) && W_TYPE_SIZE == 64
188 /* Most alpha-based machines, except Cray systems. */
189 #if defined (__GNUC__)
190 #if __GMP_GNUC_PREREQ (3,3)
191 #define umul_ppmm(ph, pl, m0, m1) \
192   do {									\
193     UDItype __m0 = (m0), __m1 = (m1);					\
194     (ph) = __builtin_alpha_umulh (__m0, __m1);				\
195     (pl) = __m0 * __m1;							\
196   } while (0)
197 #else
198 #define umul_ppmm(ph, pl, m0, m1) \
199   do {									\
200     UDItype __m0 = (m0), __m1 = (m1);					\
201     __asm__ ("umulh %r1,%2,%0"						\
202 	     : "=r" (ph)						\
203 	     : "%rJ" (__m0), "rI" (__m1));				\
204     (pl) = __m0 * __m1;							\
205   } while (0)
206 #endif
207 #define UMUL_TIME 18
208 #else /* ! __GNUC__ */
209 #include <machine/builtins.h>
210 #define umul_ppmm(ph, pl, m0, m1) \
211   do {									\
212     UDItype __m0 = (m0), __m1 = (m1);					\
213     (ph) = __UMULH (__m0, __m1);					\
214     (pl) = __m0 * __m1;							\
215   } while (0)
216 #endif
217 #ifndef LONGLONG_STANDALONE
218 #define udiv_qrnnd(q, r, n1, n0, d) \
219   do { UWtype __di;							\
220     __di = __MPN(invert_limb) (d);					\
221     udiv_qrnnd_preinv (q, r, n1, n0, d, __di);				\
222   } while (0)
223 #define UDIV_PREINV_ALWAYS  1
224 #define UDIV_NEEDS_NORMALIZATION 1
225 #define UDIV_TIME 220
226 #endif /* LONGLONG_STANDALONE */
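/* The pattern above relies on support from the rest of GMP (hence the
   LONGLONG_STANDALONE guard): __MPN(invert_limb) computes a fixed-point
   reciprocal of the normalized divisor once, and udiv_qrnnd_preinv (from
   gmp-impl.h) then performs each 2x1 limb division with multiplications
   by that reciprocal instead of a hardware divide instruction.  */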
227 
228 /* clz_tab is required in all configurations, since mpn/alpha/cntlz.asm
229    always goes into libgmp.so, even when not actually used.  */
230 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
231 
232 #if defined (__GNUC__) && HAVE_HOST_CPU_alpha_CIX
233 #define count_leading_zeros(COUNT,X) \
234   __asm__("ctlz %1,%0" : "=r"(COUNT) : "r"(X))
235 #define count_trailing_zeros(COUNT,X) \
236   __asm__("cttz %1,%0" : "=r"(COUNT) : "r"(X))
237 #endif /* clz/ctz using cix */
238 
239 #if ! defined (count_leading_zeros)				\
240   && defined (__GNUC__) && ! defined (LONGLONG_STANDALONE)
241 /* ALPHA_CMPBGE_0 gives "cmpbge $31,src,dst", ie. test src bytes == 0.
242    "$31" is written explicitly in the asm, since an "r" constraint won't
243    select reg 31.  There seems no need to worry about "r31" syntax for cray,
244    since gcc itself (pre-release 3.4) emits just $31 in various places.	 */
245 #define ALPHA_CMPBGE_0(dst, src)					\
246   do { asm ("cmpbge $31, %1, %0" : "=r" (dst) : "r" (src)); } while (0)
247 /* Zero bytes are turned into bits with cmpbge; a __clz_tab lookup counts
248    them, locating the highest non-zero byte.  A second __clz_tab lookup
249    counts the leading zero bits in that byte, giving the result.  */
250 #define count_leading_zeros(count, x)					\
251   do {									\
252     UWtype  __clz__b, __clz__c, __clz__x = (x);				\
253     ALPHA_CMPBGE_0 (__clz__b,  __clz__x);	    /* zero bytes */	\
254     __clz__b = __clz_tab [(__clz__b >> 1) ^ 0x7F];  /* 8 to 1 byte */	\
255     __clz__b = __clz__b * 8 - 7;		    /* 57 to 1 shift */ \
256     __clz__x >>= __clz__b;						\
257     __clz__c = __clz_tab [__clz__x];		    /* 8 to 1 bit */	\
258     __clz__b = 65 - __clz__b;						\
259     (count) = __clz__b - __clz__c;					\
260   } while (0)
261 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
262 #endif /* clz using cmpbge */
263 
264 #if ! defined (count_leading_zeros) && ! defined (LONGLONG_STANDALONE)
265 #if HAVE_ATTRIBUTE_CONST
266 long __MPN(count_leading_zeros) (UDItype) __attribute__ ((const));
267 #else
268 long __MPN(count_leading_zeros) (UDItype);
269 #endif
270 #define count_leading_zeros(count, x) \
271   ((count) = __MPN(count_leading_zeros) (x))
272 #endif /* clz using mpn */
273 #endif /* __alpha */
274 
275 #if defined (__AVR) && W_TYPE_SIZE == 8
276 #define umul_ppmm(ph, pl, m0, m1) \
277   do {									\
278     unsigned short __p = (unsigned short) (m0) * (m1);			\
279     (ph) = __p >> 8;							\
280     (pl) = __p;								\
281   } while (0)
282 #endif /* AVR */
283 
284 #if defined (_CRAY) && W_TYPE_SIZE == 64
285 #include <intrinsics.h>
286 #define UDIV_PREINV_ALWAYS  1
287 #define UDIV_NEEDS_NORMALIZATION 1
288 #define UDIV_TIME 220
289 long __MPN(count_leading_zeros) (UDItype);
290 #define count_leading_zeros(count, x) \
291   ((count) = _leadz ((UWtype) (x)))
292 #if defined (_CRAYIEEE)		/* I.e., Cray T90/ieee, T3D, and T3E */
293 #define umul_ppmm(ph, pl, m0, m1) \
294   do {									\
295     UDItype __m0 = (m0), __m1 = (m1);					\
296     (ph) = _int_mult_upper (__m0, __m1);				\
297     (pl) = __m0 * __m1;							\
298   } while (0)
299 #ifndef LONGLONG_STANDALONE
300 #define udiv_qrnnd(q, r, n1, n0, d) \
301   do { UWtype __di;							\
302     __di = __MPN(invert_limb) (d);					\
303     udiv_qrnnd_preinv (q, r, n1, n0, d, __di);				\
304   } while (0)
305 #endif /* LONGLONG_STANDALONE */
306 #endif /* _CRAYIEEE */
307 #endif /* _CRAY */
308 
309 #if defined (__ia64) && W_TYPE_SIZE == 64
310 /* This form encourages gcc (pre-release 3.4 at least) to emit predicated
311    "sub r=r,r" and "sub r=r,r,1", giving a 2 cycle latency.  The generic
312    code using "al<bl" arithmetically comes out making an actual 0 or 1 in a
313    code, which computes "al<bl" arithmetically, comes out making an actual
314    0 or 1 in a register, which takes an extra cycle.  */
315   do {						\
316     UWtype __x;					\
317     __x = (al) - (bl);				\
318     if ((al) < (bl))				\
319       (sh) = (ah) - (bh) - 1;			\
320     else					\
321       (sh) = (ah) - (bh);			\
322     (sl) = __x;					\
323   } while (0)
324 #if defined (__GNUC__) && ! defined (__INTEL_COMPILER)
325 /* Do both product parts in assembly, since that gives better code with
326    all gcc versions.  Some callers will just use the upper part, and in
327    that situation we waste an instruction, but not any cycles.  */
328 #define umul_ppmm(ph, pl, m0, m1) \
329     __asm__ ("xma.hu %0 = %2, %3, f0\n\txma.l %1 = %2, %3, f0"		\
330 	     : "=&f" (ph), "=f" (pl)					\
331 	     : "f" (m0), "f" (m1))
332 #define UMUL_TIME 14
333 #define count_leading_zeros(count, x) \
334   do {									\
335     UWtype _x = (x), _y, _a, _c;					\
336     __asm__ ("mux1 %0 = %1, @rev" : "=r" (_y) : "r" (_x));		\
337     __asm__ ("czx1.l %0 = %1" : "=r" (_a) : "r" (-_y | _y));		\
338     _c = (_a - 1) << 3;							\
339     _x >>= _c;								\
340     if (_x >= 1 << 4)							\
341       _x >>= 4, _c += 4;						\
342     if (_x >= 1 << 2)							\
343       _x >>= 2, _c += 2;						\
344     _c += _x >> 1;							\
345     (count) =  W_TYPE_SIZE - 1 - _c;					\
346   } while (0)
347 /* similar to what gcc does for __builtin_ffs, but 0 based rather than 1
348    based, and we don't need a special case for x==0 here */
349 #define count_trailing_zeros(count, x)					\
350   do {									\
351     UWtype __ctz_x = (x);						\
352     __asm__ ("popcnt %0 = %1"						\
353 	     : "=r" (count)						\
354 	     : "r" ((__ctz_x-1) & ~__ctz_x));				\
355   } while (0)
356 #endif
357 #if defined (__INTEL_COMPILER)
358 #include <ia64intrin.h>
359 #define umul_ppmm(ph, pl, m0, m1)					\
360   do {									\
361     UWtype __m0 = (m0), __m1 = (m1);					\
362     ph = _m64_xmahu (__m0, __m1, 0);					\
363     pl = __m0 * __m1;							\
364   } while (0)
365 #endif
366 #ifndef LONGLONG_STANDALONE
367 #define udiv_qrnnd(q, r, n1, n0, d) \
368   do { UWtype __di;							\
369     __di = __MPN(invert_limb) (d);					\
370     udiv_qrnnd_preinv (q, r, n1, n0, d, __di);				\
371   } while (0)
372 #define UDIV_PREINV_ALWAYS  1
373 #define UDIV_NEEDS_NORMALIZATION 1
374 #endif
375 #define UDIV_TIME 220
376 #endif
377 
378 
379 #if defined (__GNUC__)
380 
381 /* We sometimes need to clobber "cc" with gcc2, but that would not be
382    understood by gcc1.  Use cpp to avoid major code duplication.  */
383 #if __GNUC__ < 2
384 #define __CLOBBER_CC
385 #define __AND_CLOBBER_CC
386 #else /* __GNUC__ >= 2 */
387 #define __CLOBBER_CC : "cc"
388 #define __AND_CLOBBER_CC , "cc"
389 #endif /* __GNUC__ < 2 */
390 
391 #if (defined (__a29k__) || defined (_AM29K)) && W_TYPE_SIZE == 32
392 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
393   __asm__ ("add %1,%4,%5\n\taddc %0,%2,%3"				\
394 	   : "=r" (sh), "=&r" (sl)					\
395 	   : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl))
396 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
397   __asm__ ("sub %1,%4,%5\n\tsubc %0,%2,%3"				\
398 	   : "=r" (sh), "=&r" (sl)					\
399 	   : "r" (ah), "rI" (bh), "r" (al), "rI" (bl))
400 #define umul_ppmm(xh, xl, m0, m1) \
401   do {									\
402     USItype __m0 = (m0), __m1 = (m1);					\
403     __asm__ ("multiplu %0,%1,%2"					\
404 	     : "=r" (xl)						\
405 	     : "r" (__m0), "r" (__m1));					\
406     __asm__ ("multmu %0,%1,%2"						\
407 	     : "=r" (xh)						\
408 	     : "r" (__m0), "r" (__m1));					\
409   } while (0)
410 #define udiv_qrnnd(q, r, n1, n0, d) \
411   __asm__ ("dividu %0,%3,%4"						\
412 	   : "=r" (q), "=q" (r)						\
413 	   : "1" (n1), "r" (n0), "r" (d))
414 #define count_leading_zeros(count, x) \
415     __asm__ ("clz %0,%1"						\
416 	     : "=r" (count)						\
417 	     : "r" (x))
418 #define COUNT_LEADING_ZEROS_0 32
419 #endif /* __a29k__ */
420 
421 #if defined (__arc__)
422 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
423   __asm__ ("add.f\t%1, %4, %5\n\tadc\t%0, %2, %3"			\
424 	   : "=r" (sh),							\
425 	     "=&r" (sl)							\
426 	   : "r"  ((USItype) (ah)),					\
427 	     "rICal" ((USItype) (bh)),					\
428 	     "%r" ((USItype) (al)),					\
429 	     "rICal" ((USItype) (bl)))
430 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
431   __asm__ ("sub.f\t%1, %4, %5\n\tsbc\t%0, %2, %3"			\
432 	   : "=r" (sh),							\
433 	     "=&r" (sl)							\
434 	   : "r" ((USItype) (ah)),					\
435 	     "rICal" ((USItype) (bh)),					\
436 	     "r" ((USItype) (al)),					\
437 	     "rICal" ((USItype) (bl)))
438 #endif
439 
440 #if defined (__arm__) && (defined (__thumb2__) || !defined (__thumb__)) \
441     && W_TYPE_SIZE == 32
442 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
443   __asm__ ("adds\t%1, %4, %5\n\tadc\t%0, %2, %3"			\
444 	   : "=r" (sh), "=&r" (sl)					\
445 	   : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl) __CLOBBER_CC)
446 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
447   do {									\
448     if (__builtin_constant_p (al))					\
449       {									\
450 	if (__builtin_constant_p (ah))					\
451 	  __asm__ ("rsbs\t%1, %5, %4\n\trsc\t%0, %3, %2"		\
452 		   : "=r" (sh), "=&r" (sl)				\
453 		   : "rI" (ah), "r" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
454 	else								\
455 	  __asm__ ("rsbs\t%1, %5, %4\n\tsbc\t%0, %2, %3"		\
456 		   : "=r" (sh), "=&r" (sl)				\
457 		   : "r" (ah), "rI" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
458       }									\
459     else if (__builtin_constant_p (ah))					\
460       {									\
461 	if (__builtin_constant_p (bl))					\
462 	  __asm__ ("subs\t%1, %4, %5\n\trsc\t%0, %3, %2"		\
463 		   : "=r" (sh), "=&r" (sl)				\
464 		   : "rI" (ah), "r" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
465 	else								\
466 	  __asm__ ("rsbs\t%1, %5, %4\n\trsc\t%0, %3, %2"		\
467 		   : "=r" (sh), "=&r" (sl)				\
468 		   : "rI" (ah), "r" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
469       }									\
470     else if (__builtin_constant_p (bl))					\
471       {									\
472 	if (__builtin_constant_p (bh))					\
473 	  __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3"		\
474 		   : "=r" (sh), "=&r" (sl)				\
475 		   : "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
476 	else								\
477 	  __asm__ ("subs\t%1, %4, %5\n\trsc\t%0, %3, %2"		\
478 		   : "=r" (sh), "=&r" (sl)				\
479 		   : "rI" (ah), "r" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
480       }									\
481     else /* only bh might be a constant */				\
482       __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3"			\
483 	       : "=r" (sh), "=&r" (sl)					\
484 	       : "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC);\
485     } while (0)
486 #if defined (__ARM_ARCH_2__) || defined (__ARM_ARCH_2A__) \
487     || defined (__ARM_ARCH_3__)
488 #define umul_ppmm(xh, xl, a, b)						\
489   do {									\
490     register USItype __t0, __t1, __t2;					\
491     __asm__ ("%@ Inlined umul_ppmm\n"					\
492 	   "	mov	%2, %5, lsr #16\n"				\
493 	   "	mov	%0, %6, lsr #16\n"				\
494 	   "	bic	%3, %5, %2, lsl #16\n"				\
495 	   "	bic	%4, %6, %0, lsl #16\n"				\
496 	   "	mul	%1, %3, %4\n"					\
497 	   "	mul	%4, %2, %4\n"					\
498 	   "	mul	%3, %0, %3\n"					\
499 	   "	mul	%0, %2, %0\n"					\
500 	   "	adds	%3, %4, %3\n"					\
501 	   "	addcs	%0, %0, #65536\n"				\
502 	   "	adds	%1, %1, %3, lsl #16\n"				\
503 	   "	adc	%0, %0, %3, lsr #16"				\
504 	   : "=&r" ((USItype) (xh)), "=r" ((USItype) (xl)),		\
505 	     "=&r" (__t0), "=&r" (__t1), "=r" (__t2)			\
506 	   : "r" ((USItype) (a)), "r" ((USItype) (b)) __CLOBBER_CC);	\
507   } while (0)
508 #define UMUL_TIME 20
509 #ifndef LONGLONG_STANDALONE
510 #define udiv_qrnnd(q, r, n1, n0, d) \
511   do { UWtype __r;							\
512     (q) = __MPN(udiv_qrnnd) (&__r, (n1), (n0), (d));			\
513     (r) = __r;								\
514   } while (0)
515 extern UWtype __MPN(udiv_qrnnd) (UWtype *, UWtype, UWtype, UWtype);
516 #define UDIV_TIME 200
517 #endif /* LONGLONG_STANDALONE */
518 #else /* ARMv4 or newer */
519 #define umul_ppmm(xh, xl, a, b) \
520   __asm__ ("umull %0,%1,%2,%3" : "=&r" (xl), "=&r" (xh) : "r" (a), "r" (b))
521 #define UMUL_TIME 5
522 #define smul_ppmm(xh, xl, a, b) \
523   __asm__ ("smull %0,%1,%2,%3" : "=&r" (xl), "=&r" (xh) : "r" (a), "r" (b))
524 #ifndef LONGLONG_STANDALONE
525 #define udiv_qrnnd(q, r, n1, n0, d) \
526   do { UWtype __di;							\
527     __di = __MPN(invert_limb) (d);					\
528     udiv_qrnnd_preinv (q, r, n1, n0, d, __di);				\
529   } while (0)
530 #define UDIV_PREINV_ALWAYS  1
531 #define UDIV_NEEDS_NORMALIZATION 1
532 #define UDIV_TIME 70
533 #endif /* LONGLONG_STANDALONE */
534 #endif /* defined(__ARM_ARCH_2__) ... */
535 #define count_leading_zeros(count, x)  count_leading_zeros_gcc_clz(count, x)
536 #define count_trailing_zeros(count, x)  count_trailing_zeros_gcc_ctz(count, x)
537 #define COUNT_LEADING_ZEROS_0 32
538 #endif /* __arm__ */
539 
540 #if defined (__aarch64__) && W_TYPE_SIZE == 64
541 /* FIXME: Extend the immediate range for the low word by using both
542    ADDS and SUBS, since they set carry in the same way.  */
543 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
544   __asm__ ("adds\t%1, %x4, %5\n\tadc\t%0, %x2, %x3"			\
545 	   : "=r" (sh), "=&r" (sl)					\
546 	   : "rZ" ((UDItype)(ah)), "rZ" ((UDItype)(bh)),		\
547 	     "%r" ((UDItype)(al)), "rI" ((UDItype)(bl)) __CLOBBER_CC)
548 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
549   __asm__ ("subs\t%1, %x4, %5\n\tsbc\t%0, %x2, %x3"			\
550 	   : "=r,r" (sh), "=&r,&r" (sl)					\
551 	   : "rZ,rZ" ((UDItype)(ah)), "rZ,rZ" ((UDItype)(bh)),		\
552 	     "r,Z"   ((UDItype)(al)), "rI,r"  ((UDItype)(bl)) __CLOBBER_CC)
553 #define umul_ppmm(ph, pl, m0, m1) \
554   do {									\
555     UDItype __m0 = (m0), __m1 = (m1);					\
556     __asm__ ("umulh\t%0, %1, %2" : "=r" (ph) : "r" (__m0), "r" (__m1));	\
557     (pl) = __m0 * __m1;							\
558   } while (0)
559 #define count_leading_zeros(count, x)  count_leading_zeros_gcc_clz(count, x)
560 #define count_trailing_zeros(count, x)  count_trailing_zeros_gcc_ctz(count, x)
561 #define COUNT_LEADING_ZEROS_0 64
562 #endif /* __aarch64__ */
563 
564 #if defined (__clipper__) && W_TYPE_SIZE == 32
565 #define umul_ppmm(w1, w0, u, v) \
566   ({union {UDItype __ll;						\
567 	   struct {USItype __l, __h;} __i;				\
568 	  } __x;							\
569   __asm__ ("mulwux %2,%0"						\
570 	   : "=r" (__x.__ll)						\
571 	   : "%0" ((USItype)(u)), "r" ((USItype)(v)));			\
572   (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
573 #define smul_ppmm(w1, w0, u, v) \
574   ({union {DItype __ll;							\
575 	   struct {SItype __l, __h;} __i;				\
576 	  } __x;							\
577   __asm__ ("mulwx %2,%0"						\
578 	   : "=r" (__x.__ll)						\
579 	   : "%0" ((SItype)(u)), "r" ((SItype)(v)));			\
580   (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
581 #define __umulsidi3(u, v) \
582   ({UDItype __w;							\
583     __asm__ ("mulwux %2,%0"						\
584 	     : "=r" (__w) : "%0" ((USItype)(u)), "r" ((USItype)(v)));	\
585     __w; })
586 #endif /* __clipper__ */
587 
588 /* Fujitsu vector computers.  */
589 #if defined (__uxp__) && W_TYPE_SIZE == 32
590 #define umul_ppmm(ph, pl, u, v) \
591   do {									\
592     union {UDItype __ll;						\
593 	   struct {USItype __h, __l;} __i;				\
594 	  } __x;							\
595     __asm__ ("mult.lu %1,%2,%0"	: "=r" (__x.__ll) : "%r" (u), "rK" (v));\
596     (ph) = __x.__i.__h;							\
597     (pl) = __x.__i.__l;							\
598   } while (0)
599 #define smul_ppmm(ph, pl, u, v) \
600   do {									\
601     union {UDItype __ll;						\
602 	   struct {USItype __h, __l;} __i;				\
603 	  } __x;							\
604     __asm__ ("mult.l %1,%2,%0" : "=r" (__x.__ll) : "%r" (u), "rK" (v));	\
605     (ph) = __x.__i.__h;							\
606     (pl) = __x.__i.__l;							\
607   } while (0)
608 #endif
609 
610 #if defined (__gmicro__) && W_TYPE_SIZE == 32
611 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
612   __asm__ ("add.w %5,%1\n\taddx %3,%0"					\
613 	   : "=g" (sh), "=&g" (sl)					\
614 	   : "0"  ((USItype)(ah)), "g" ((USItype)(bh)),			\
615 	     "%1" ((USItype)(al)), "g" ((USItype)(bl)))
616 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
617   __asm__ ("sub.w %5,%1\n\tsubx %3,%0"					\
618 	   : "=g" (sh), "=&g" (sl)					\
619 	   : "0" ((USItype)(ah)), "g" ((USItype)(bh)),			\
620 	     "1" ((USItype)(al)), "g" ((USItype)(bl)))
621 #define umul_ppmm(ph, pl, m0, m1) \
622   __asm__ ("mulx %3,%0,%1"						\
623 	   : "=g" (ph), "=r" (pl)					\
624 	   : "%0" ((USItype)(m0)), "g" ((USItype)(m1)))
625 #define udiv_qrnnd(q, r, nh, nl, d) \
626   __asm__ ("divx %4,%0,%1"						\
627 	   : "=g" (q), "=r" (r)						\
628 	   : "1" ((USItype)(nh)), "0" ((USItype)(nl)), "g" ((USItype)(d)))
629 #define count_leading_zeros(count, x) \
630   __asm__ ("bsch/1 %1,%0"						\
631 	   : "=g" (count) : "g" ((USItype)(x)), "0" ((USItype)0))
632 #endif
633 
634 #if defined (__hppa) && W_TYPE_SIZE == 32
635 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
636   __asm__ ("add%I5 %5,%r4,%1\n\taddc %r2,%r3,%0"			\
637 	   : "=r" (sh), "=&r" (sl)					\
638 	   : "rM" (ah), "rM" (bh), "%rM" (al), "rI" (bl))
639 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
640   __asm__ ("sub%I4 %4,%r5,%1\n\tsubb %r2,%r3,%0"			\
641 	   : "=r" (sh), "=&r" (sl)					\
642 	   : "rM" (ah), "rM" (bh), "rI" (al), "rM" (bl))
643 #if defined (_PA_RISC1_1)
644 #define umul_ppmm(wh, wl, u, v) \
645   do {									\
646     union {UDItype __ll;						\
647 	   struct {USItype __h, __l;} __i;				\
648 	  } __x;							\
649     __asm__ ("xmpyu %1,%2,%0" : "=*f" (__x.__ll) : "*f" (u), "*f" (v));	\
650     (wh) = __x.__i.__h;							\
651     (wl) = __x.__i.__l;							\
652   } while (0)
653 #define UMUL_TIME 8
654 #define UDIV_TIME 60
655 #else
656 #define UMUL_TIME 40
657 #define UDIV_TIME 80
658 #endif
659 #define count_leading_zeros(count, x) \
660   do {									\
661     USItype __tmp;							\
662     __asm__ (								\
663        "ldi		1,%0\n"						\
664 "	extru,=		%1,15,16,%%r0	; Bits 31..16 zero?\n"		\
665 "	extru,tr	%1,15,16,%1	; No.  Shift down, skip add.\n"	\
666 "	ldo		16(%0),%0	; Yes.  Perform add.\n"		\
667 "	extru,=		%1,23,8,%%r0	; Bits 15..8 zero?\n"		\
668 "	extru,tr	%1,23,8,%1	; No.  Shift down, skip add.\n"	\
669 "	ldo		8(%0),%0	; Yes.  Perform add.\n"		\
670 "	extru,=		%1,27,4,%%r0	; Bits 7..4 zero?\n"		\
671 "	extru,tr	%1,27,4,%1	; No.  Shift down, skip add.\n"	\
672 "	ldo		4(%0),%0	; Yes.  Perform add.\n"		\
673 "	extru,=		%1,29,2,%%r0	; Bits 3..2 zero?\n"		\
674 "	extru,tr	%1,29,2,%1	; No.  Shift down, skip add.\n"	\
675 "	ldo		2(%0),%0	; Yes.  Perform add.\n"		\
676 "	extru		%1,30,1,%1	; Extract bit 1.\n"		\
677 "	sub		%0,%1,%0	; Subtract it.\n"		\
678 	: "=r" (count), "=r" (__tmp) : "1" (x));			\
679   } while (0)
680 #endif /* hppa */
681 
682 /* These macros are for ABI=2.0w.  In ABI=2.0n they can't be used, since GCC
683    (3.2) puts a long long into two adjacent 32-bit registers.  Presumably this
684    is just a case of no direct support for 2.0n but treating it like 1.0. */
685 #if defined (__hppa) && W_TYPE_SIZE == 64 && ! defined (_LONG_LONG_LIMB)
686 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
687   __asm__ ("add%I5 %5,%r4,%1\n\tadd,dc %r2,%r3,%0"			\
688 	   : "=r" (sh), "=&r" (sl)					\
689 	   : "rM" (ah), "rM" (bh), "%rM" (al), "rI" (bl))
690 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
691   __asm__ ("sub%I4 %4,%r5,%1\n\tsub,db %r2,%r3,%0"			\
692 	   : "=r" (sh), "=&r" (sl)					\
693 	   : "rM" (ah), "rM" (bh), "rI" (al), "rM" (bl))
694 #endif /* hppa */
695 
696 #if (defined (__i370__) || defined (__s390__) || defined (__mvs__)) && W_TYPE_SIZE == 32
697 #if defined (__zarch__) || defined (HAVE_HOST_CPU_s390_zarch)
698 #define add_ssaaaa(sh, sl, ah, al, bh, bl)				\
699   do {									\
700 /*  if (__builtin_constant_p (bl))					\
701       __asm__ ("alfi\t%1,%o5\n\talcr\t%0,%3"				\
702 	       : "=r" (sh), "=&r" (sl)					\
703 	       : "0"  (ah), "r" (bh), "%1" (al), "n" (bl) __CLOBBER_CC);\
704     else								\
705 */    __asm__ ("alr\t%1,%5\n\talcr\t%0,%3"				\
706 	       : "=r" (sh), "=&r" (sl)					\
707 	       : "0"  (ah), "r" (bh), "%1" (al), "r" (bl) __CLOBBER_CC);	\
708   } while (0)
709 #define sub_ddmmss(sh, sl, ah, al, bh, bl)				\
710   do {									\
711 /*  if (__builtin_constant_p (bl))					\
712       __asm__ ("slfi\t%1,%o5\n\tslbr\t%0,%3"				\
713 	       : "=r" (sh), "=&r" (sl)					\
714 	       : "0" (ah), "r" (bh), "1" (al), "n" (bl) __CLOBBER_CC);	\
715     else								\
716 */    __asm__ ("slr\t%1,%5\n\tslbr\t%0,%3"				\
717 	       : "=r" (sh), "=&r" (sl)					\
718 	       : "0" (ah), "r" (bh), "1" (al), "r" (bl) __CLOBBER_CC);	\
719   } while (0)
720 #if __GMP_GNUC_PREREQ (4,5)
721 #define umul_ppmm(xh, xl, m0, m1)					\
722   do {									\
723     union {UDItype __ll;						\
724 	   struct {USItype __h, __l;} __i;				\
725 	  } __x;							\
726     __x.__ll = (UDItype) (m0) * (UDItype) (m1);				\
727     (xh) = __x.__i.__h; (xl) = __x.__i.__l;				\
728   } while (0)
729 #else
730 #if 0
731 /* FIXME: this fails if gcc knows about the 64-bit registers.  Use only
732    with a new enough processor pretending we have 32-bit registers.  */
733 #define umul_ppmm(xh, xl, m0, m1)					\
734   do {									\
735     union {UDItype __ll;						\
736 	   struct {USItype __h, __l;} __i;				\
737 	  } __x;							\
738     __asm__ ("mlr\t%0,%2"						\
739 	     : "=r" (__x.__ll)						\
740 	     : "%0" (m0), "r" (m1));					\
741     (xh) = __x.__i.__h; (xl) = __x.__i.__l;				\
742   } while (0)
743 #else
744 #define umul_ppmm(xh, xl, m0, m1)					\
745   do {									\
746   /* When we have 64-bit regs and gcc is aware of that, we cannot simply use
747      DImode for the product, since that would be allocated to a single 64-bit
748      register, whereas mlr uses the low 32-bits of an even-odd register pair.
749   */									\
750     register USItype __r0 __asm__ ("0");				\
751     register USItype __r1 __asm__ ("1") = (m0);				\
752     __asm__ ("mlr\t%0,%3"						\
753 	     : "=r" (__r0), "=r" (__r1)					\
754 	     : "r" (__r1), "r" (m1));					\
755     (xh) = __r0; (xl) = __r1;						\
756   } while (0)
757 #endif /* if 0 */
758 #endif
759 #if 0
760 /* FIXME: this fails if gcc knows about the 64-bit registers.  Use only
761    with a new enough processor pretending we have 32-bit registers.  */
762 #define udiv_qrnnd(q, r, n1, n0, d)					\
763   do {									\
764     union {UDItype __ll;						\
765 	   struct {USItype __h, __l;} __i;				\
766 	  } __x;							\
767     __x.__i.__h = n1; __x.__i.__l = n0;					\
768     __asm__ ("dlr\t%0,%2"						\
769 	     : "=r" (__x.__ll)						\
770 	     : "0" (__x.__ll), "r" (d));				\
771     (q) = __x.__i.__l; (r) = __x.__i.__h;				\
772   } while (0)
773 #else
774 #define udiv_qrnnd(q, r, n1, n0, d)					\
775   do {									\
776     register USItype __r0 __asm__ ("0") = (n1);				\
777     register USItype __r1 __asm__ ("1") = (n0);				\
778     __asm__ ("dlr\t%0,%4"						\
779 	     : "=r" (__r0), "=r" (__r1)					\
780 	     : "r" (__r0), "r" (__r1), "r" (d));			\
781     (q) = __r1; (r) = __r0;						\
782   } while (0)
783 #endif /* if 0 */
784 #else /* if __zarch__ */
785 /* FIXME: this fails if gcc knows about the 64-bit registers.  */
786 #define smul_ppmm(xh, xl, m0, m1)					\
787   do {									\
788     union {DItype __ll;							\
789 	   struct {USItype __h, __l;} __i;				\
790 	  } __x;							\
791     __asm__ ("mr\t%0,%2"						\
792 	     : "=r" (__x.__ll)						\
793 	     : "%0" (m0), "r" (m1));					\
794     (xh) = __x.__i.__h; (xl) = __x.__i.__l;				\
795   } while (0)
796 /* FIXME: this fails if gcc knows about the 64-bit registers.  */
797 #define sdiv_qrnnd(q, r, n1, n0, d)					\
798   do {									\
799     union {DItype __ll;							\
800 	   struct {USItype __h, __l;} __i;				\
801 	  } __x;							\
802     __x.__i.__h = n1; __x.__i.__l = n0;					\
803     __asm__ ("dr\t%0,%2"						\
804 	     : "=r" (__x.__ll)						\
805 	     : "0" (__x.__ll), "r" (d));				\
806     (q) = __x.__i.__l; (r) = __x.__i.__h;				\
807   } while (0)
808 #endif /* if __zarch__ */
809 #endif
810 
811 #if defined (__s390x__) && W_TYPE_SIZE == 64
812 /* We need to cast operands with register constraints, otherwise their types
813    will be assumed to be SImode by gcc.  For these machines, such operations
814    will insert a value into the low 32 bits, and leave the high 32 bits with
815    garbage.  */
816 #define add_ssaaaa(sh, sl, ah, al, bh, bl)				\
817   do {									\
818     __asm__ ("algr\t%1,%5\n\talcgr\t%0,%3"				\
819 	       : "=r" (sh), "=&r" (sl)					\
820 	       : "0"  ((UDItype)(ah)), "r" ((UDItype)(bh)),		\
821 		 "%1" ((UDItype)(al)), "r" ((UDItype)(bl)) __CLOBBER_CC); \
822   } while (0)
823 #define sub_ddmmss(sh, sl, ah, al, bh, bl)				\
824   do {									\
825     __asm__ ("slgr\t%1,%5\n\tslbgr\t%0,%3"				\
826 	     : "=r" (sh), "=&r" (sl)					\
827 	     : "0" ((UDItype)(ah)), "r" ((UDItype)(bh)),		\
828 	       "1" ((UDItype)(al)), "r" ((UDItype)(bl)) __CLOBBER_CC);	\
829   } while (0)
830 #define umul_ppmm(xh, xl, m0, m1)					\
831   do {									\
832     union {unsigned int __attribute__ ((mode(TI))) __ll;		\
833 	   struct {UDItype __h, __l;} __i;				\
834 	  } __x;							\
835     __asm__ ("mlgr\t%0,%2"						\
836 	     : "=r" (__x.__ll)						\
837 	     : "%0" ((UDItype)(m0)), "r" ((UDItype)(m1)));		\
838     (xh) = __x.__i.__h; (xl) = __x.__i.__l;				\
839   } while (0)
840 #define udiv_qrnnd(q, r, n1, n0, d)					\
841   do {									\
842     union {unsigned int __attribute__ ((mode(TI))) __ll;		\
843 	   struct {UDItype __h, __l;} __i;				\
844 	  } __x;							\
845     __x.__i.__h = n1; __x.__i.__l = n0;					\
846     __asm__ ("dlgr\t%0,%2"						\
847 	     : "=r" (__x.__ll)						\
848 	     : "0" (__x.__ll), "r" ((UDItype)(d)));			\
849     (q) = __x.__i.__l; (r) = __x.__i.__h;				\
850   } while (0)
851 #if 0 /* FIXME: Enable for z10 (?) */
852 #define count_leading_zeros(cnt, x)					\
853   do {									\
854     union {unsigned int __attribute__ ((mode(TI))) __ll;		\
855 	   struct {UDItype __h, __l;} __i;				\
856 	  } __clr_cnt;							\
857     __asm__ ("flogr\t%0,%1"						\
858 	     : "=r" (__clr_cnt.__ll)					\
859 	     : "r" (x) __CLOBBER_CC);					\
860     (cnt) = __clr_cnt.__i.__h;						\
861   } while (0)
862 #endif
863 #endif
864 
865 /* On x86 and x86_64, every asm implicitly clobbers "flags" and "fpsr",
866    so we don't need __CLOBBER_CC.  */
867 #if (defined (__i386__) || defined (__i486__)) && W_TYPE_SIZE == 32
868 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
869   __asm__ ("addl %5,%k1\n\tadcl %3,%k0"					\
870 	   : "=r" (sh), "=&r" (sl)					\
871 	   : "0"  ((USItype)(ah)), "g" ((USItype)(bh)),			\
872 	     "%1" ((USItype)(al)), "g" ((USItype)(bl)))
873 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
874   __asm__ ("subl %5,%k1\n\tsbbl %3,%k0"					\
875 	   : "=r" (sh), "=&r" (sl)					\
876 	   : "0" ((USItype)(ah)), "g" ((USItype)(bh)),			\
877 	     "1" ((USItype)(al)), "g" ((USItype)(bl)))
878 #define umul_ppmm(w1, w0, u, v) \
879   __asm__ ("mull %3"							\
880 	   : "=a" (w0), "=d" (w1)					\
881 	   : "%0" ((USItype)(u)), "rm" ((USItype)(v)))
882 #define udiv_qrnnd(q, r, n1, n0, dx) /* d renamed to dx avoiding "=d" */\
883   __asm__ ("divl %4"		     /* stringification in K&R C */	\
884 	   : "=a" (q), "=d" (r)						\
885 	   : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "rm" ((USItype)(dx)))
886 
887 #if HAVE_HOST_CPU_i586 || HAVE_HOST_CPU_pentium || HAVE_HOST_CPU_pentiummmx
888 /* Pentium bsrl takes between 10 and 72 cycles depending where the most
889    significant 1 bit is, hence the use of the following alternatives.  bsfl
890    is slow too, between 18 and 42 depending where the least significant 1
891    bit is, so let the generic count_trailing_zeros below make use of the
892    count_leading_zeros here too.  */
893 
894 #if HAVE_HOST_CPU_pentiummmx && ! defined (LONGLONG_STANDALONE)
895 /* The following should be a fixed 14 or 15 cycles, but possibly plus an L1
896    cache miss reading from __clz_tab.  For P55 it's favoured over the float
897    below so as to avoid mixing MMX and x87, since the penalty for switching
898    between the two is about 100 cycles.
899 
900    The asm block sets __shift to -3 if the high 24 bits are clear, -2 for
901    16, -1 for 8, or 0 otherwise.  This could be written equivalently as
902    follows, but as of gcc 2.95.2 it results in conditional jumps.
903 
904        __shift = -(__n < 0x1000000);
905        __shift -= (__n < 0x10000);
906        __shift -= (__n < 0x100);
907 
908    The middle two sbbl and cmpl's pair, and with luck something gcc
909    generates might pair with the first cmpl and the last sbbl.  The "32+1"
910    constant could be folded into __clz_tab[], but it doesn't seem worth
911    making a different table just for that.  */
912 
913 #define count_leading_zeros(c,n)					\
914   do {									\
915     USItype  __n = (n);							\
916     USItype  __shift;							\
917     __asm__ ("cmpl  $0x1000000, %1\n"					\
918 	     "sbbl  %0, %0\n"						\
919 	     "cmpl  $0x10000, %1\n"					\
920 	     "sbbl  $0, %0\n"						\
921 	     "cmpl  $0x100, %1\n"					\
922 	     "sbbl  $0, %0\n"						\
923 	     : "=&r" (__shift) : "r"  (__n));				\
924     __shift = __shift*8 + 24 + 1;					\
925     (c) = 32 + 1 - __shift - __clz_tab[__n >> __shift];			\
926   } while (0)
927 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
928 #define COUNT_LEADING_ZEROS_0   31   /* n==0 indistinguishable from n==1 */
929 
930 #else /* ! pentiummmx || LONGLONG_STANDALONE */
931 /* The following should be a fixed 14 cycles or so.  Some scheduling
932    opportunities should be available between the float load/store too.  This
933    sort of code is used in gcc 3 for __builtin_ffs (with "n&-n") and is
934    apparently suggested by the Intel optimizing manual (don't know exactly
935    where).  gcc 2.95 or up will be best for this, since it keeps the
936    "double" correctly aligned on the stack.  */
937 #define count_leading_zeros(c,n)					\
938   do {									\
939     union {								\
940       double    d;							\
941       unsigned  a[2];							\
942     } __u;								\
943     ASSERT ((n) != 0);							\
944     __u.d = (UWtype) (n);						\
945     (c) = 0x3FF + 31 - (__u.a[1] >> 20);				\
946   } while (0)
947 #define COUNT_LEADING_ZEROS_0   (0x3FF + 31)
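/* Worked example for the float trick above: converting a 32-bit n to
   "double" is exact, so the biased exponent extracted by "__u.a[1] >> 20"
   (a[1] is the high word on little-endian x86; the sign bit is 0) equals
   0x3FF + b, where b is the position of the highest set bit of n.  Hence
   n == 1 gives c == 31 and n == 0x80000000 gives c == 0.  For n == 0 the
   value converts to 0.0 with a zero exponent field, which is why
   COUNT_LEADING_ZEROS_0 is the otherwise odd-looking 0x3FF + 31.  */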
948 #endif /* pentiummmx */
949 
950 #else /* ! pentium */
951 
952 #if __GMP_GNUC_PREREQ (3,4)  /* using bsrl */
953 #define count_leading_zeros(count,x)  count_leading_zeros_gcc_clz(count,x)
954 #endif /* gcc clz */
955 
956 /* On P6, gcc prior to 3.0 generates a partial register stall for
957    __cbtmp^31, due to using "xorb $31" instead of "xorl $31", the former
958    being 1 code byte smaller.  "31-__cbtmp" is a workaround, probably at the
959    cost of one extra instruction.  Do this for "i386" too, since that means
960    generic x86.  */
961 #if ! defined (count_leading_zeros) && __GNUC__ < 3			\
962   && (HAVE_HOST_CPU_i386						\
963       || HAVE_HOST_CPU_i686						\
964       || HAVE_HOST_CPU_pentiumpro					\
965       || HAVE_HOST_CPU_pentium2						\
966       || HAVE_HOST_CPU_pentium3)
967 #define count_leading_zeros(count, x)					\
968   do {									\
969     USItype __cbtmp;							\
970     ASSERT ((x) != 0);							\
971     __asm__ ("bsrl %1,%0" : "=r" (__cbtmp) : "rm" ((USItype)(x)));	\
972     (count) = 31 - __cbtmp;						\
973   } while (0)
974 #endif /* gcc<3 asm bsrl */
975 
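/* bsrl yields the bit number (0..31) of the highest set bit of a nonzero
   operand, so the leading zero count is 31 minus that value; for values in
   0..31, "__cbtmp ^ 31" computes the same result, and the partial register
   stall described above only afflicts that form with gcc older than 3.  */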
976 #ifndef count_leading_zeros
977 #define count_leading_zeros(count, x)					\
978   do {									\
979     USItype __cbtmp;							\
980     ASSERT ((x) != 0);							\
981     __asm__ ("bsrl %1,%0" : "=r" (__cbtmp) : "rm" ((USItype)(x)));	\
982     (count) = __cbtmp ^ 31;						\
983   } while (0)
984 #endif /* asm bsrl */
985 
986 #if __GMP_GNUC_PREREQ (3,4)  /* using bsfl */
987 #define count_trailing_zeros(count,x)  count_trailing_zeros_gcc_ctz(count,x)
988 #endif /* gcc ctz */
989 
990 #ifndef count_trailing_zeros
991 #define count_trailing_zeros(count, x)					\
992   do {									\
993     ASSERT ((x) != 0);							\
994     __asm__ ("bsfl %1,%k0" : "=r" (count) : "rm" ((USItype)(x)));	\
995   } while (0)
996 #endif /* asm bsfl */
997 
998 #endif /* ! pentium */
999 
1000 #ifndef UMUL_TIME
1001 #define UMUL_TIME 10
1002 #endif
1003 #ifndef UDIV_TIME
1004 #define UDIV_TIME 40
1005 #endif
1006 #endif /* 80x86 */
1007 
1008 #if defined (__amd64__) && W_TYPE_SIZE == 64
1009 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1010   __asm__ ("addq %5,%q1\n\tadcq %3,%q0"					\
1011 	   : "=r" (sh), "=&r" (sl)					\
1012 	   : "0"  ((UDItype)(ah)), "rme" ((UDItype)(bh)),		\
1013 	     "%1" ((UDItype)(al)), "rme" ((UDItype)(bl)))
1014 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1015   __asm__ ("subq %5,%q1\n\tsbbq %3,%q0"					\
1016 	   : "=r" (sh), "=&r" (sl)					\
1017 	   : "0" ((UDItype)(ah)), "rme" ((UDItype)(bh)),		\
1018 	     "1" ((UDItype)(al)), "rme" ((UDItype)(bl)))
1019 #define umul_ppmm(w1, w0, u, v) \
1020   __asm__ ("mulq %3"							\
1021 	   : "=a" (w0), "=d" (w1)					\
1022 	   : "%0" ((UDItype)(u)), "rm" ((UDItype)(v)))
1023 #define udiv_qrnnd(q, r, n1, n0, dx) /* d renamed to dx avoiding "=d" */\
1024   __asm__ ("divq %4"		     /* stringification in K&R C */	\
1025 	   : "=a" (q), "=d" (r)						\
1026 	   : "0" ((UDItype)(n0)), "1" ((UDItype)(n1)), "rm" ((UDItype)(dx)))
1027 /* bsrq destination must be a 64-bit register, hence UDItype for __cbtmp. */
1028 #define count_leading_zeros(count, x)					\
1029   do {									\
1030     UDItype __cbtmp;							\
1031     ASSERT ((x) != 0);							\
1032     __asm__ ("bsrq %1,%0" : "=r" (__cbtmp) : "rm" ((UDItype)(x)));	\
1033     (count) = __cbtmp ^ 63;						\
1034   } while (0)
1035 /* bsfq destination must be a 64-bit register, "%q0" forces this in case
1036    count is only an int. */
1037 #define count_trailing_zeros(count, x)					\
1038   do {									\
1039     ASSERT ((x) != 0);							\
1040     __asm__ ("bsfq %1,%q0" : "=r" (count) : "rm" ((UDItype)(x)));	\
1041   } while (0)
1042 #endif /* __amd64__ */
1043 
1044 #if defined (__i860__) && W_TYPE_SIZE == 32
1045 #define rshift_rhlc(r,h,l,c) \
1046   __asm__ ("shr %3,r0,r0\;shrd %1,%2,%0"				\
1047 	   : "=r" (r) : "r" (h), "r" (l), "rn" (c))
1048 #endif /* i860 */
1049 
1050 #if defined (__i960__) && W_TYPE_SIZE == 32
1051 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1052   __asm__ ("cmpo 1,0\;addc %5,%4,%1\;addc %3,%2,%0"			\
1053 	   : "=r" (sh), "=&r" (sl)					\
1054 	   : "dI" (ah), "dI" (bh), "%dI" (al), "dI" (bl))
1055 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1056   __asm__ ("cmpo 0,0\;subc %5,%4,%1\;subc %3,%2,%0"			\
1057 	   : "=r" (sh), "=&r" (sl)					\
1058 	   : "dI" (ah), "dI" (bh), "dI" (al), "dI" (bl))
1059 #define umul_ppmm(w1, w0, u, v) \
1060   ({union {UDItype __ll;						\
1061 	   struct {USItype __l, __h;} __i;				\
1062 	  } __x;							\
1063   __asm__ ("emul %2,%1,%0"						\
1064 	   : "=d" (__x.__ll) : "%dI" (u), "dI" (v));			\
1065   (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
1066 #define __umulsidi3(u, v) \
1067   ({UDItype __w;							\
1068     __asm__ ("emul %2,%1,%0" : "=d" (__w) : "%dI" (u), "dI" (v));	\
1069     __w; })
1070 #define udiv_qrnnd(q, r, nh, nl, d) \
1071   do {									\
1072     union {UDItype __ll;						\
1073 	   struct {USItype __l, __h;} __i;				\
1074 	  } __nn, __rq;							\
1075     __nn.__i.__h = (nh); __nn.__i.__l = (nl);				\
1076     __asm__ ("ediv %2,%1,%0"						\
1077 	   : "=d" (__rq.__ll) : "dI" (__nn.__ll), "dI" (d));		\
1078     (r) = __rq.__i.__l; (q) = __rq.__i.__h;				\
1079   } while (0)
1080 #define count_leading_zeros(count, x) \
1081   do {									\
1082     USItype __cbtmp;							\
1083     __asm__ ("scanbit %1,%0" : "=r" (__cbtmp) : "r" (x));		\
1084     (count) = __cbtmp ^ 31;						\
1085   } while (0)
1086 #define COUNT_LEADING_ZEROS_0 (-32) /* sic */
1087 #if defined (__i960mx)		/* what is the proper symbol to test??? */
1088 #define rshift_rhlc(r,h,l,c) \
1089   do {									\
1090     union {UDItype __ll;						\
1091 	   struct {USItype __l, __h;} __i;				\
1092 	  } __nn;							\
1093     __nn.__i.__h = (h); __nn.__i.__l = (l);				\
1094     __asm__ ("shre %2,%1,%0" : "=d" (r) : "dI" (__nn.__ll), "dI" (c));	\
1095   } while (0)
1096 #endif /* i960mx */
1097 #endif /* i960 */
1098 
1099 #if (defined (__mc68000__) || defined (__mc68020__) || defined(mc68020) \
1100      || defined (__m68k__) || defined (__mc5200__) || defined (__mc5206e__) \
1101      || defined (__mc5307__)) && W_TYPE_SIZE == 32
1102 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1103   __asm__ ("add%.l %5,%1\n\taddx%.l %3,%0"				\
1104 	   : "=d" (sh), "=&d" (sl)					\
1105 	   : "0"  ((USItype)(ah)), "d" ((USItype)(bh)),			\
1106 	     "%1" ((USItype)(al)), "g" ((USItype)(bl)))
1107 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1108   __asm__ ("sub%.l %5,%1\n\tsubx%.l %3,%0"				\
1109 	   : "=d" (sh), "=&d" (sl)					\
1110 	   : "0" ((USItype)(ah)), "d" ((USItype)(bh)),			\
1111 	     "1" ((USItype)(al)), "g" ((USItype)(bl)))
1112 /* The '020, '030, '040 and CPU32 have 32x32->64 and 64/32->32q-32r.  */
1113 #if defined (__mc68020__) || defined(mc68020) \
1114      || defined (__mc68030__) || defined (mc68030) \
1115      || defined (__mc68040__) || defined (mc68040) \
1116      || defined (__mcpu32__) || defined (mcpu32) \
1117      || defined (__NeXT__)
1118 #define umul_ppmm(w1, w0, u, v) \
1119   __asm__ ("mulu%.l %3,%1:%0"						\
1120 	   : "=d" (w0), "=d" (w1)					\
1121 	   : "%0" ((USItype)(u)), "dmi" ((USItype)(v)))
1122 #define UMUL_TIME 45
1123 #define udiv_qrnnd(q, r, n1, n0, d) \
1124   __asm__ ("divu%.l %4,%1:%0"						\
1125 	   : "=d" (q), "=d" (r)						\
1126 	   : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "dmi" ((USItype)(d)))
1127 #define UDIV_TIME 90
1128 #define sdiv_qrnnd(q, r, n1, n0, d) \
1129   __asm__ ("divs%.l %4,%1:%0"						\
1130 	   : "=d" (q), "=d" (r)						\
1131 	   : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "dmi" ((USItype)(d)))
1132 #else /* for other 68k family members use 16x16->32 multiplication */
1133 #define umul_ppmm(xh, xl, a, b) \
1134   do { USItype __umul_tmp1, __umul_tmp2;				\
1135 	__asm__ ("| Inlined umul_ppmm\n"				\
1136 "	move%.l	%5,%3\n"						\
1137 "	move%.l	%2,%0\n"						\
1138 "	move%.w	%3,%1\n"						\
1139 "	swap	%3\n"							\
1140 "	swap	%0\n"							\
1141 "	mulu%.w	%2,%1\n"						\
1142 "	mulu%.w	%3,%0\n"						\
1143 "	mulu%.w	%2,%3\n"						\
1144 "	swap	%2\n"							\
1145 "	mulu%.w	%5,%2\n"						\
1146 "	add%.l	%3,%2\n"						\
1147 "	jcc	1f\n"							\
1148 "	add%.l	%#0x10000,%0\n"						\
1149 "1:	move%.l	%2,%3\n"						\
1150 "	clr%.w	%2\n"							\
1151 "	swap	%2\n"							\
1152 "	swap	%3\n"							\
1153 "	clr%.w	%3\n"							\
1154 "	add%.l	%3,%1\n"						\
1155 "	addx%.l	%2,%0\n"						\
1156 "	| End inlined umul_ppmm"					\
1157 	      : "=&d" (xh), "=&d" (xl),					\
1158 		"=d" (__umul_tmp1), "=&d" (__umul_tmp2)			\
1159 	      : "%2" ((USItype)(a)), "d" ((USItype)(b)));		\
1160   } while (0)
1161 #define UMUL_TIME 100
1162 #define UDIV_TIME 400
1163 #endif /* not mc68020 */
1164 /* The '020, '030, '040 and '060 have bitfield insns.
1165    GCC 3.4 defines __mc68020__ when in CPU32 mode; check for __mcpu32__ to
1166    exclude bfffo on that chip (bitfield insns not available).  */
1167 #if (defined (__mc68020__) || defined (mc68020)    \
1168      || defined (__mc68030__) || defined (mc68030) \
1169      || defined (__mc68040__) || defined (mc68040) \
1170      || defined (__mc68060__) || defined (mc68060) \
1171      || defined (__NeXT__))			   \
1172   && ! defined (__mcpu32__)
1173 #define count_leading_zeros(count, x) \
1174   __asm__ ("bfffo %1{%b2:%b2},%0"					\
1175 	   : "=d" (count)						\
1176 	   : "od" ((USItype) (x)), "n" (0))
1177 #define COUNT_LEADING_ZEROS_0 32
1178 #endif
1179 #endif /* mc68000 */
1180 
1181 #if defined (__m88000__) && W_TYPE_SIZE == 32
1182 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1183   __asm__ ("addu.co %1,%r4,%r5\n\taddu.ci %0,%r2,%r3"			\
1184 	   : "=r" (sh), "=&r" (sl)					\
1185 	   : "rJ" (ah), "rJ" (bh), "%rJ" (al), "rJ" (bl))
1186 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1187   __asm__ ("subu.co %1,%r4,%r5\n\tsubu.ci %0,%r2,%r3"			\
1188 	   : "=r" (sh), "=&r" (sl)					\
1189 	   : "rJ" (ah), "rJ" (bh), "rJ" (al), "rJ" (bl))
1190 #define count_leading_zeros(count, x) \
1191   do {									\
1192     USItype __cbtmp;							\
1193     __asm__ ("ff1 %0,%1" : "=r" (__cbtmp) : "r" (x));			\
1194     (count) = __cbtmp ^ 31;						\
1195   } while (0)
1196 #define COUNT_LEADING_ZEROS_0 63 /* sic */
1197 #if defined (__m88110__)
1198 #define umul_ppmm(wh, wl, u, v) \
1199   do {									\
1200     union {UDItype __ll;						\
1201 	   struct {USItype __h, __l;} __i;				\
1202 	  } __x;							\
1203     __asm__ ("mulu.d %0,%1,%2" : "=r" (__x.__ll) : "r" (u), "r" (v));	\
1204     (wh) = __x.__i.__h;							\
1205     (wl) = __x.__i.__l;							\
1206   } while (0)
1207 #define udiv_qrnnd(q, r, n1, n0, d) \
1208   ({union {UDItype __ll;						\
1209 	   struct {USItype __h, __l;} __i;				\
1210 	  } __x, __q;							\
1211   __x.__i.__h = (n1); __x.__i.__l = (n0);				\
1212   __asm__ ("divu.d %0,%1,%2"						\
1213 	   : "=r" (__q.__ll) : "r" (__x.__ll), "r" (d));		\
1214   (r) = (n0) - __q.__i.__l * (d); (q) = __q.__i.__l; })
1215 #define UMUL_TIME 5
1216 #define UDIV_TIME 25
1217 #else
1218 #define UMUL_TIME 17
1219 #define UDIV_TIME 150
1220 #endif /* __m88110__ */
1221 #endif /* __m88000__ */
1222 
1223 #if defined (__mips) && W_TYPE_SIZE == 32
1224 #if __GMP_GNUC_PREREQ (4,4) || defined(__clang__)
1225 #define umul_ppmm(w1, w0, u, v) \
1226   do {									\
1227     UDItype __ll = (UDItype)(u) * (v);					\
1228     w1 = __ll >> 32;							\
1229     w0 = __ll;								\
1230   } while (0)
1231 #endif
1232 #if !defined (umul_ppmm) && __GMP_GNUC_PREREQ (2,7) && !defined (__clang__)
1233 #define umul_ppmm(w1, w0, u, v) \
1234   __asm__ ("multu %2,%3" : "=l" (w0), "=h" (w1) : "d" (u), "d" (v))
1235 #endif
1236 #if !defined (umul_ppmm)
1237 #define umul_ppmm(w1, w0, u, v) \
1238   __asm__ ("multu %2,%3\n\tmflo %0\n\tmfhi %1"				\
1239 	   : "=d" (w0), "=d" (w1) : "d" (u), "d" (v))
1240 #endif
1241 #define UMUL_TIME 10
1242 #define UDIV_TIME 100
1243 #endif /* __mips */
1244 
1245 #if (defined (__mips) && __mips >= 3) && W_TYPE_SIZE == 64
1246 #if __GMP_GNUC_PREREQ (4,4) || defined(__clang__)
1247 #define umul_ppmm(w1, w0, u, v) \
1248   do {									\
1249     typedef unsigned int __ll_UTItype __attribute__((mode(TI)));	\
1250     __ll_UTItype __ll = (__ll_UTItype)(u) * (v);			\
1251     w1 = __ll >> 64;							\
1252     w0 = __ll;								\
1253   } while (0)
1254 #endif
1255 #if !defined (umul_ppmm) && __GMP_GNUC_PREREQ (2,7) && !defined (__clang__)
1256 #define umul_ppmm(w1, w0, u, v) \
1257   __asm__ ("dmultu %2,%3"						\
1258 	   : "=l" (w0), "=h" (w1)					\
1259 	   : "d" ((UDItype)(u)), "d" ((UDItype)(v)))
1260 #endif
1261 #if !defined (umul_ppmm)
1262 #define umul_ppmm(w1, w0, u, v) \
1263   __asm__ ("dmultu %2,%3\n\tmflo %0\n\tmfhi %1"				\
1264 	   : "=d" (w0), "=d" (w1)					\
1265 	   : "d" ((UDItype)(u)), "d" ((UDItype)(v)))
1266 #endif
1267 #define UMUL_TIME 20
1268 #define UDIV_TIME 140
1269 #endif /* __mips */
1270 
1271 #if defined (__mmix__) && W_TYPE_SIZE == 64
1272 #define umul_ppmm(w1, w0, u, v) \
1273   __asm__ ("MULU %0,%2,%3" : "=r" (w0), "=z" (w1) : "r" (u), "r" (v))
1274 #endif
1275 
1276 #if defined (__ns32000__) && W_TYPE_SIZE == 32
1277 #define umul_ppmm(w1, w0, u, v) \
1278   ({union {UDItype __ll;						\
1279 	   struct {USItype __l, __h;} __i;				\
1280 	  } __x;							\
1281   __asm__ ("meid %2,%0"							\
1282 	   : "=g" (__x.__ll)						\
1283 	   : "%0" ((USItype)(u)), "g" ((USItype)(v)));			\
1284   (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
1285 #define __umulsidi3(u, v) \
1286   ({UDItype __w;							\
1287     __asm__ ("meid %2,%0"						\
1288 	     : "=g" (__w)						\
1289 	     : "%0" ((USItype)(u)), "g" ((USItype)(v)));		\
1290     __w; })
1291 #define udiv_qrnnd(q, r, n1, n0, d) \
1292   ({union {UDItype __ll;						\
1293 	   struct {USItype __l, __h;} __i;				\
1294 	  } __x;							\
1295   __x.__i.__h = (n1); __x.__i.__l = (n0);				\
1296   __asm__ ("deid %2,%0"							\
1297 	   : "=g" (__x.__ll)						\
1298 	   : "0" (__x.__ll), "g" ((USItype)(d)));			\
1299   (r) = __x.__i.__l; (q) = __x.__i.__h; })
1300 #define count_trailing_zeros(count,x) \
1301   do {									\
1302     __asm__ ("ffsd	%2,%0"						\
1303 	     : "=r" (count)						\
1304 	     : "0" ((USItype) 0), "r" ((USItype) (x)));			\
1305   } while (0)
1306 #endif /* __ns32000__ */
1307 
1308 /* In the past we had a block of various #defines tested
1309        _ARCH_PPC    - AIX
1310        _ARCH_PWR    - AIX
1311        __powerpc__  - gcc
1312        __POWERPC__  - BEOS
1313        __ppc__      - Darwin
1314        PPC          - old gcc, GNU/Linux, SysV
1315    The plain PPC test was not good for vxWorks, since PPC is defined on all
1316    CPUs there (eg. m68k too), as a constant that one is expected to compare
1317    CPU_FAMILY against.
1318 
1319    At any rate, this was pretty unattractive and a bit fragile.  The use of
1320    HAVE_HOST_CPU_FAMILY is designed to cut through it all and be sure of
1321    getting the desired effect.
1322 
1323    ENHANCE-ME: We should test _IBMR2 here when we add assembly support for
1324    the system vendor compilers.  (Is that vendor compilers with inline asm,
1325    or what?)  */
1326 
1327 #if (HAVE_HOST_CPU_FAMILY_power || HAVE_HOST_CPU_FAMILY_powerpc)	\
1328   && W_TYPE_SIZE == 32
1329 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1330   do {									\
1331     if (__builtin_constant_p (bh) && (bh) == 0)				\
1332       __asm__ ("add%I4c %1,%3,%4\n\taddze %0,%2"			\
1333 	     : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));	\
1334     else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0)		\
1335       __asm__ ("add%I4c %1,%3,%4\n\taddme %0,%2"			\
1336 	     : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));	\
1337     else								\
1338       __asm__ ("add%I5c %1,%4,%5\n\tadde %0,%2,%3"			\
1339 	     : "=r" (sh), "=&r" (sl)					\
1340 	     : "r" (ah), "r" (bh), "%r" (al), "rI" (bl));		\
1341   } while (0)
1342 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1343   do {									\
1344     if (__builtin_constant_p (ah) && (ah) == 0)				\
1345       __asm__ ("subf%I3c %1,%4,%3\n\tsubfze %0,%2"			\
1346 	       : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\
1347     else if (__builtin_constant_p (ah) && (ah) == ~(USItype) 0)		\
1348       __asm__ ("subf%I3c %1,%4,%3\n\tsubfme %0,%2"			\
1349 	       : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\
1350     else if (__builtin_constant_p (bh) && (bh) == 0)			\
1351       __asm__ ("subf%I3c %1,%4,%3\n\taddme %0,%2"			\
1352 	       : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\
1353     else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0)		\
1354       __asm__ ("subf%I3c %1,%4,%3\n\taddze %0,%2"			\
1355 	       : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\
1356     else								\
1357       __asm__ ("subf%I4c %1,%5,%4\n\tsubfe %0,%3,%2"			\
1358 	       : "=r" (sh), "=&r" (sl)					\
1359 	       : "r" (ah), "r" (bh), "rI" (al), "r" (bl));		\
1360   } while (0)
1361 #define count_leading_zeros(count, x) \
1362   __asm__ ("cntlzw %0,%1" : "=r" (count) : "r" (x))
1363 #define COUNT_LEADING_ZEROS_0 32
1364 #if HAVE_HOST_CPU_FAMILY_powerpc
1365 #if __GMP_GNUC_PREREQ (4,4) || defined(__clang__)
1366 #define umul_ppmm(w1, w0, u, v) \
1367   do {									\
1368     UDItype __ll = (UDItype)(u) * (v);					\
1369     w1 = __ll >> 32;							\
1370     w0 = __ll;								\
1371   } while (0)
1372 #endif
1373 #if !defined (umul_ppmm)
1374 #define umul_ppmm(ph, pl, m0, m1) \
1375   do {									\
1376     USItype __m0 = (m0), __m1 = (m1);					\
1377     __asm__ ("mulhwu %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1));	\
1378     (pl) = __m0 * __m1;							\
1379   } while (0)
1380 #endif
1381 #define UMUL_TIME 15
1382 #define smul_ppmm(ph, pl, m0, m1) \
1383   do {									\
1384     SItype __m0 = (m0), __m1 = (m1);					\
1385     __asm__ ("mulhw %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1));	\
1386     (pl) = __m0 * __m1;							\
1387   } while (0)
1388 #define SMUL_TIME 14
1389 #define UDIV_TIME 120
1390 #else
1391 #define UMUL_TIME 8
1392 #define smul_ppmm(xh, xl, m0, m1) \
1393   __asm__ ("mul %0,%2,%3" : "=r" (xh), "=q" (xl) : "r" (m0), "r" (m1))
1394 #define SMUL_TIME 4
1395 #define sdiv_qrnnd(q, r, nh, nl, d) \
1396   __asm__ ("div %0,%2,%4" : "=r" (q), "=q" (r) : "r" (nh), "1" (nl), "r" (d))
1397 #define UDIV_TIME 100
1398 #endif
1399 #endif /* 32-bit POWER architecture variants.  */
1400 
1401 /* We should test _IBMR2 here when we add assembly support for the system
1402    vendor compilers.  */
1403 #if HAVE_HOST_CPU_FAMILY_powerpc && W_TYPE_SIZE == 64
1404 #if !defined (_LONG_LONG_LIMB)
1405 /* _LONG_LONG_LIMB is ABI=mode32, where adde operates on 32-bit values.  So
1406    use adde etc only when not _LONG_LONG_LIMB.  */
1407 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1408   do {									\
1409     if (__builtin_constant_p (bh) && (bh) == 0)				\
1410       __asm__ ("add%I4c %1,%3,%4\n\taddze %0,%2"			\
1411 	       : "=r" (sh), "=&r" (sl)					\
1412 	       : "r"  ((UDItype)(ah)),					\
1413 		 "%r" ((UDItype)(al)), "rI" ((UDItype)(bl)));		\
1414     else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0)		\
1415       __asm__ ("add%I4c %1,%3,%4\n\taddme %0,%2"			\
1416 	       : "=r" (sh), "=&r" (sl)					\
1417 	       : "r"  ((UDItype)(ah)),					\
1418 		 "%r" ((UDItype)(al)), "rI" ((UDItype)(bl)));		\
1419     else								\
1420       __asm__ ("add%I5c %1,%4,%5\n\tadde %0,%2,%3"			\
1421 	       : "=r" (sh), "=&r" (sl)					\
1422 	       : "r"  ((UDItype)(ah)), "r"  ((UDItype)(bh)),		\
1423 		 "%r" ((UDItype)(al)), "rI" ((UDItype)(bl)));		\
1424   } while (0)
1425 /* We use "*rI" for the constant operand here, since with just "I", gcc barfs.
1426    This might seem strange, but gcc folds away the dead code late.  */
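/* The addic cases below rely on the PowerPC carry convention for subtraction:
   for a nonzero constant bl whose negation fits the signed 16-bit immediate
   range (which is what the bl > -0x8000 && bl <= 0x8000 test ensures),
   "addic %1,al,-bl" computes al - bl and leaves CA set exactly when there is
   no borrow, the same convention subfc would establish, so the second
   instruction can consume CA with subfze/subfme/addme/addze/subfe just as in
   the general case.  */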
1427 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1428   do {									\
1429     if (__builtin_constant_p (bl) && bl > -0x8000 && bl <= 0x8000) {	\
1430 	if (__builtin_constant_p (ah) && (ah) == 0)			\
1431 	  __asm__ ("addic %1,%3,%4\n\tsubfze %0,%2"			\
1432 		   : "=r" (sh), "=&r" (sl)				\
1433 		   :                       "r" ((UDItype)(bh)),		\
1434 		     "rI" ((UDItype)(al)), "*rI" (-((UDItype)(bl))));	\
1435 	else if (__builtin_constant_p (ah) && (ah) == ~(UDItype) 0)	\
1436 	  __asm__ ("addic %1,%3,%4\n\tsubfme %0,%2"			\
1437 		   : "=r" (sh), "=&r" (sl)				\
1438 		   :                       "r" ((UDItype)(bh)),		\
1439 		     "rI" ((UDItype)(al)), "*rI" (-((UDItype)(bl))));	\
1440 	else if (__builtin_constant_p (bh) && (bh) == 0)		\
1441 	  __asm__ ("addic %1,%3,%4\n\taddme %0,%2"			\
1442 		   : "=r" (sh), "=&r" (sl)				\
1443 		   : "r"  ((UDItype)(ah)),				\
1444 		     "rI" ((UDItype)(al)), "*rI" (-((UDItype)(bl))));	\
1445 	else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0)	\
1446 	  __asm__ ("addic %1,%3,%4\n\taddze %0,%2"			\
1447 		   : "=r" (sh), "=&r" (sl)				\
1448 		   : "r"  ((UDItype)(ah)),				\
1449 		     "rI" ((UDItype)(al)), "*rI" (-((UDItype)(bl))));	\
1450 	else								\
1451 	  __asm__ ("addic %1,%4,%5\n\tsubfe %0,%3,%2"			\
1452 		   : "=r" (sh), "=&r" (sl)				\
1453 		   : "r"  ((UDItype)(ah)), "r" ((UDItype)(bh)),		\
1454 		     "rI" ((UDItype)(al)), "*rI" (-((UDItype)(bl))));	\
1455     } else {								\
1456 	if (__builtin_constant_p (ah) && (ah) == 0)			\
1457 	  __asm__ ("subf%I3c %1,%4,%3\n\tsubfze %0,%2"			\
1458 		   : "=r" (sh), "=&r" (sl)				\
1459 		   :                       "r" ((UDItype)(bh)),		\
1460 		     "rI" ((UDItype)(al)), "r" ((UDItype)(bl)));	\
1461 	else if (__builtin_constant_p (ah) && (ah) == ~(UDItype) 0)	\
1462 	  __asm__ ("subf%I3c %1,%4,%3\n\tsubfme %0,%2"			\
1463 		   : "=r" (sh), "=&r" (sl)				\
1464 		   :                       "r" ((UDItype)(bh)),		\
1465 		     "rI" ((UDItype)(al)), "r" ((UDItype)(bl)));	\
1466 	else if (__builtin_constant_p (bh) && (bh) == 0)		\
1467 	  __asm__ ("subf%I3c %1,%4,%3\n\taddme %0,%2"			\
1468 		   : "=r" (sh), "=&r" (sl)				\
1469 		   : "r"  ((UDItype)(ah)),				\
1470 		     "rI" ((UDItype)(al)), "r" ((UDItype)(bl)));	\
1471 	else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0)	\
1472 	  __asm__ ("subf%I3c %1,%4,%3\n\taddze %0,%2"			\
1473 		   : "=r" (sh), "=&r" (sl)				\
1474 		   : "r"  ((UDItype)(ah)),				\
1475 		     "rI" ((UDItype)(al)), "r" ((UDItype)(bl)));	\
1476 	else								\
1477 	  __asm__ ("subf%I4c %1,%5,%4\n\tsubfe %0,%3,%2"		\
1478 		   : "=r" (sh), "=&r" (sl)				\
1479 		   : "r"  ((UDItype)(ah)), "r" ((UDItype)(bh)),		\
1480 		     "rI" ((UDItype)(al)), "r" ((UDItype)(bl)));	\
1481     }									\
1482   } while (0)
1483 #endif /* ! _LONG_LONG_LIMB */
1484 #define count_leading_zeros(count, x) \
1485   __asm__ ("cntlzd %0,%1" : "=r" (count) : "r" (x))
1486 #define COUNT_LEADING_ZEROS_0 64
1487 #if 0 && __GMP_GNUC_PREREQ (4,4) /* Disable, this results in libcalls! */
1488 #define umul_ppmm(w1, w0, u, v) \
1489   do {									\
1490     typedef unsigned int __ll_UTItype __attribute__((mode(TI)));	\
1491     __ll_UTItype __ll = (__ll_UTItype)(u) * (v);			\
1492     w1 = __ll >> 64;							\
1493     w0 = __ll;								\
1494   } while (0)
1495 #endif
1496 #if !defined (umul_ppmm)
1497 #define umul_ppmm(ph, pl, m0, m1) \
1498   do {									\
1499     UDItype __m0 = (m0), __m1 = (m1);					\
1500     __asm__ ("mulhdu %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1));	\
1501     (pl) = __m0 * __m1;							\
1502   } while (0)
1503 #endif
1504 #define UMUL_TIME 15
1505 #define smul_ppmm(ph, pl, m0, m1) \
1506   do {									\
1507     DItype __m0 = (m0), __m1 = (m1);					\
1508     __asm__ ("mulhd %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1));	\
1509     (pl) = __m0 * __m1;							\
1510   } while (0)
1511 #define SMUL_TIME 14  /* ??? */
1512 #define UDIV_TIME 120 /* ??? */
1513 #endif /* 64-bit PowerPC.  */
1514 
1515 #if defined (__pyr__) && W_TYPE_SIZE == 32
1516 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1517   __asm__ ("addw %5,%1\n\taddwc %3,%0"					\
1518 	   : "=r" (sh), "=&r" (sl)					\
1519 	   : "0"  ((USItype)(ah)), "g" ((USItype)(bh)),			\
1520 	     "%1" ((USItype)(al)), "g" ((USItype)(bl)))
1521 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1522   __asm__ ("subw %5,%1\n\tsubwb %3,%0"					\
1523 	   : "=r" (sh), "=&r" (sl)					\
1524 	   : "0" ((USItype)(ah)), "g" ((USItype)(bh)),			\
1525 	     "1" ((USItype)(al)), "g" ((USItype)(bl)))
1526 /* This insn works on Pyramids with AP, XP, or MI CPUs, but not with SP.  */
1527 #define umul_ppmm(w1, w0, u, v) \
1528   ({union {UDItype __ll;						\
1529 	   struct {USItype __h, __l;} __i;				\
1530 	  } __x;							\
1531   __asm__ ("movw %1,%R0\n\tuemul %2,%0"					\
1532 	   : "=&r" (__x.__ll)						\
1533 	   : "g" ((USItype) (u)), "g" ((USItype)(v)));			\
1534   (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
1535 #endif /* __pyr__ */
1536 
1537 #if defined (__ibm032__) /* RT/ROMP */  && W_TYPE_SIZE == 32
1538 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1539   __asm__ ("a %1,%5\n\tae %0,%3"					\
1540 	   : "=r" (sh), "=&r" (sl)					\
1541 	   : "0"  ((USItype)(ah)), "r" ((USItype)(bh)),			\
1542 	     "%1" ((USItype)(al)), "r" ((USItype)(bl)))
1543 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1544   __asm__ ("s %1,%5\n\tse %0,%3"					\
1545 	   : "=r" (sh), "=&r" (sl)					\
1546 	   : "0" ((USItype)(ah)), "r" ((USItype)(bh)),			\
1547 	     "1" ((USItype)(al)), "r" ((USItype)(bl)))
1548 #define smul_ppmm(ph, pl, m0, m1) \
1549   __asm__ (								\
1550        "s	r2,r2\n"						\
1551 "	mts r10,%2\n"							\
1552 "	m	r2,%3\n"						\
1553 "	m	r2,%3\n"						\
1554 "	m	r2,%3\n"						\
1555 "	m	r2,%3\n"						\
1556 "	m	r2,%3\n"						\
1557 "	m	r2,%3\n"						\
1558 "	m	r2,%3\n"						\
1559 "	m	r2,%3\n"						\
1560 "	m	r2,%3\n"						\
1561 "	m	r2,%3\n"						\
1562 "	m	r2,%3\n"						\
1563 "	m	r2,%3\n"						\
1564 "	m	r2,%3\n"						\
1565 "	m	r2,%3\n"						\
1566 "	m	r2,%3\n"						\
1567 "	m	r2,%3\n"						\
1568 "	cas	%0,r2,r0\n"						\
1569 "	mfs	r10,%1"							\
1570 	   : "=r" (ph), "=r" (pl)					\
1571 	   : "%r" ((USItype)(m0)), "r" ((USItype)(m1))			\
1572 	   : "r2")
1573 #define UMUL_TIME 20
1574 #define UDIV_TIME 200
1575 #define count_leading_zeros(count, x) \
1576   do {									\
1577     if ((x) >= 0x10000)							\
1578       __asm__ ("clz	%0,%1"						\
1579 	       : "=r" (count) : "r" ((USItype)(x) >> 16));		\
1580     else								\
1581       {									\
1582 	__asm__ ("clz	%0,%1"						\
1583 		 : "=r" (count) : "r" ((USItype)(x)));			\
1584 	(count) += 16;							\
1585       }									\
1586   } while (0)
1587 #endif /* RT/ROMP */
1588 
1589 #if (defined (__SH2__) || defined (__SH3__) || defined (__SH4__)) && W_TYPE_SIZE == 32
1590 #define umul_ppmm(w1, w0, u, v) \
1591   __asm__ ("dmulu.l %2,%3\n\tsts macl,%1\n\tsts mach,%0"		\
1592 	   : "=r" (w1), "=r" (w0) : "r" (u), "r" (v) : "macl", "mach")
1593 #define UMUL_TIME 5
1594 #endif
1595 
1596 #if defined (__sparc__) && W_TYPE_SIZE == 32
1597 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1598   __asm__ ("addcc %r4,%5,%1\n\taddx %r2,%3,%0"				\
1599 	   : "=r" (sh), "=&r" (sl)					\
1600 	   : "rJ" (ah), "rI" (bh),"%rJ" (al), "rI" (bl)			\
1601 	   __CLOBBER_CC)
1602 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1603   __asm__ ("subcc %r4,%5,%1\n\tsubx %r2,%3,%0"				\
1604 	   : "=r" (sh), "=&r" (sl)					\
1605 	   : "rJ" (ah), "rI" (bh), "rJ" (al), "rI" (bl)	\
1606 	   __CLOBBER_CC)
1607 /* FIXME: When gcc -mcpu=v9 is used on Solaris, gcc/config/sol2-sld-64.h
1608    doesn't define anything to indicate that to us; it only sets __sparcv8. */
1609 #if defined (__sparc_v9__) || defined (__sparcv9)
1610 /* Perhaps we should use floating-point operations here?  */
1611 #if 0
1612 /* Triggers a bug making mpz/tests/t-gcd.c fail.
1613    Perhaps we simply need to zero-extend the inputs explicitly?  */
1614 #define umul_ppmm(w1, w0, u, v) \
1615   __asm__ ("mulx %2,%3,%%g1; srl %%g1,0,%1; srlx %%g1,32,%0" :		\
1616 	   "=r" (w1), "=r" (w0) : "r" (u), "r" (v) : "g1")
1617 #else
1618 /* Use v8 umul until above bug is fixed.  */
1619 #define umul_ppmm(w1, w0, u, v) \
1620   __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
1621 #endif
1622 /* Use a plain v8 divide for v9.  */
1623 #define udiv_qrnnd(q, r, n1, n0, d) \
1624   do {									\
1625     USItype __q;							\
1626     __asm__ ("mov %1,%%y;nop;nop;nop;udiv %2,%3,%0"			\
1627 	     : "=r" (__q) : "r" (n1), "r" (n0), "r" (d));		\
1628     (r) = (n0) - __q * (d);						\
1629     (q) = __q;								\
1630   } while (0)
1631 #else
1632 #if defined (__sparc_v8__)   /* gcc normal */				\
1633   || defined (__sparcv8)     /* gcc solaris */				\
1634   || HAVE_HOST_CPU_supersparc
1635 /* Don't match the immediate range because (1) it is not often useful, and
1636    (2) the 'I' constraint treats the range as a 13-bit signed interval,
1637    while we want to match a 13-bit interval, sign extended to 32 bits,
1638    but INTERPRETED AS UNSIGNED.  */
1639 #define umul_ppmm(w1, w0, u, v) \
1640   __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
1641 #define UMUL_TIME 5
1642 
1643 #if HAVE_HOST_CPU_supersparc
1644 #define UDIV_TIME 60		/* SuperSPARC timing */
1645 #else
1646 /* Don't use this on SuperSPARC because its udiv only handles 53 bit
1647    dividends and will trap to the kernel for the rest. */
1648 #define udiv_qrnnd(q, r, n1, n0, d) \
1649   do {									\
1650     USItype __q;							\
1651     __asm__ ("mov %1,%%y;nop;nop;nop;udiv %2,%3,%0"			\
1652 	     : "=r" (__q) : "r" (n1), "r" (n0), "r" (d));		\
1653     (r) = (n0) - __q * (d);						\
1654     (q) = __q;								\
1655   } while (0)
1656 #define UDIV_TIME 25
1657 #endif /* HAVE_HOST_CPU_supersparc */
1658 
1659 #else /* ! __sparc_v8__ */
1660 #if defined (__sparclite__)
1661 /* This has hardware multiply but not divide.  It also has two additional
1662    instructions, scan (ffs from high bit) and divscc.  */
1663 #define umul_ppmm(w1, w0, u, v) \
1664   __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
1665 #define UMUL_TIME 5
1666 #define udiv_qrnnd(q, r, n1, n0, d) \
1667   __asm__ ("! Inlined udiv_qrnnd\n"					\
1668 "	wr	%%g0,%2,%%y	! Not a delayed write for sparclite\n"	\
1669 "	tst	%%g0\n"							\
1670 "	divscc	%3,%4,%%g1\n"						\
1671 "	divscc	%%g1,%4,%%g1\n"						\
1672 "	divscc	%%g1,%4,%%g1\n"						\
1673 "	divscc	%%g1,%4,%%g1\n"						\
1674 "	divscc	%%g1,%4,%%g1\n"						\
1675 "	divscc	%%g1,%4,%%g1\n"						\
1676 "	divscc	%%g1,%4,%%g1\n"						\
1677 "	divscc	%%g1,%4,%%g1\n"						\
1678 "	divscc	%%g1,%4,%%g1\n"						\
1679 "	divscc	%%g1,%4,%%g1\n"						\
1680 "	divscc	%%g1,%4,%%g1\n"						\
1681 "	divscc	%%g1,%4,%%g1\n"						\
1682 "	divscc	%%g1,%4,%%g1\n"						\
1683 "	divscc	%%g1,%4,%%g1\n"						\
1684 "	divscc	%%g1,%4,%%g1\n"						\
1685 "	divscc	%%g1,%4,%%g1\n"						\
1686 "	divscc	%%g1,%4,%%g1\n"						\
1687 "	divscc	%%g1,%4,%%g1\n"						\
1688 "	divscc	%%g1,%4,%%g1\n"						\
1689 "	divscc	%%g1,%4,%%g1\n"						\
1690 "	divscc	%%g1,%4,%%g1\n"						\
1691 "	divscc	%%g1,%4,%%g1\n"						\
1692 "	divscc	%%g1,%4,%%g1\n"						\
1693 "	divscc	%%g1,%4,%%g1\n"						\
1694 "	divscc	%%g1,%4,%%g1\n"						\
1695 "	divscc	%%g1,%4,%%g1\n"						\
1696 "	divscc	%%g1,%4,%%g1\n"						\
1697 "	divscc	%%g1,%4,%%g1\n"						\
1698 "	divscc	%%g1,%4,%%g1\n"						\
1699 "	divscc	%%g1,%4,%%g1\n"						\
1700 "	divscc	%%g1,%4,%%g1\n"						\
1701 "	divscc	%%g1,%4,%0\n"						\
1702 "	rd	%%y,%1\n"						\
1703 "	bl,a 1f\n"							\
1704 "	add	%1,%4,%1\n"						\
1705 "1:	! End of inline udiv_qrnnd"					\
1706 	   : "=r" (q), "=r" (r) : "r" (n1), "r" (n0), "rI" (d)		\
1707 	   : "%g1" __AND_CLOBBER_CC)
1708 #define UDIV_TIME 37
1709 #define count_leading_zeros(count, x) \
1710   __asm__ ("scan %1,1,%0" : "=r" (count) : "r" (x))
1711 /* Early sparclites return 63 for an argument of 0, but they warn that future
1712    implementations might change this.  Therefore, leave COUNT_LEADING_ZEROS_0
1713    undefined.  */
1714 #endif /* __sparclite__ */
1715 #endif /* __sparc_v8__ */
1716 #endif /* __sparc_v9__ */
1717 /* Default to sparc v7 versions of umul_ppmm and udiv_qrnnd.  */
1718 #ifndef umul_ppmm
1719 #define umul_ppmm(w1, w0, u, v) \
1720   __asm__ ("! Inlined umul_ppmm\n"					\
1721 "	wr	%%g0,%2,%%y	! SPARC has 0-3 delay insn after a wr\n" \
1722 "	sra	%3,31,%%g2	! Don't move this insn\n"		\
1723 "	and	%2,%%g2,%%g2	! Don't move this insn\n"		\
1724 "	andcc	%%g0,0,%%g1	! Don't move this insn\n"		\
1725 "	mulscc	%%g1,%3,%%g1\n"						\
1726 "	mulscc	%%g1,%3,%%g1\n"						\
1727 "	mulscc	%%g1,%3,%%g1\n"						\
1728 "	mulscc	%%g1,%3,%%g1\n"						\
1729 "	mulscc	%%g1,%3,%%g1\n"						\
1730 "	mulscc	%%g1,%3,%%g1\n"						\
1731 "	mulscc	%%g1,%3,%%g1\n"						\
1732 "	mulscc	%%g1,%3,%%g1\n"						\
1733 "	mulscc	%%g1,%3,%%g1\n"						\
1734 "	mulscc	%%g1,%3,%%g1\n"						\
1735 "	mulscc	%%g1,%3,%%g1\n"						\
1736 "	mulscc	%%g1,%3,%%g1\n"						\
1737 "	mulscc	%%g1,%3,%%g1\n"						\
1738 "	mulscc	%%g1,%3,%%g1\n"						\
1739 "	mulscc	%%g1,%3,%%g1\n"						\
1740 "	mulscc	%%g1,%3,%%g1\n"						\
1741 "	mulscc	%%g1,%3,%%g1\n"						\
1742 "	mulscc	%%g1,%3,%%g1\n"						\
1743 "	mulscc	%%g1,%3,%%g1\n"						\
1744 "	mulscc	%%g1,%3,%%g1\n"						\
1745 "	mulscc	%%g1,%3,%%g1\n"						\
1746 "	mulscc	%%g1,%3,%%g1\n"						\
1747 "	mulscc	%%g1,%3,%%g1\n"						\
1748 "	mulscc	%%g1,%3,%%g1\n"						\
1749 "	mulscc	%%g1,%3,%%g1\n"						\
1750 "	mulscc	%%g1,%3,%%g1\n"						\
1751 "	mulscc	%%g1,%3,%%g1\n"						\
1752 "	mulscc	%%g1,%3,%%g1\n"						\
1753 "	mulscc	%%g1,%3,%%g1\n"						\
1754 "	mulscc	%%g1,%3,%%g1\n"						\
1755 "	mulscc	%%g1,%3,%%g1\n"						\
1756 "	mulscc	%%g1,%3,%%g1\n"						\
1757 "	mulscc	%%g1,0,%%g1\n"						\
1758 "	add	%%g1,%%g2,%0\n"						\
1759 "	rd	%%y,%1"							\
1760 	   : "=r" (w1), "=r" (w0) : "%rI" (u), "r" (v)			\
1761 	   : "%g1", "%g2" __AND_CLOBBER_CC)
1762 #define UMUL_TIME 39		/* 39 instructions */
1763 #endif
1764 #ifndef udiv_qrnnd
1765 #ifndef LONGLONG_STANDALONE
1766 #define udiv_qrnnd(q, r, n1, n0, d) \
1767   do { UWtype __r;							\
1768     (q) = __MPN(udiv_qrnnd) (&__r, (n1), (n0), (d));			\
1769     (r) = __r;								\
1770   } while (0)
1771 extern UWtype __MPN(udiv_qrnnd) (UWtype *, UWtype, UWtype, UWtype);
1772 #ifndef UDIV_TIME
1773 #define UDIV_TIME 140
1774 #endif
1775 #endif /* LONGLONG_STANDALONE */
1776 #endif /* udiv_qrnnd */
1777 #endif /* __sparc__ */
1778 
1779 #if defined (__sparc__) && W_TYPE_SIZE == 64
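/* Before VIS3 there is no add-with-carry that consumes the 64-bit (xcc)
   carry, only the 32-bit (icc) one.  The macros below therefore regenerate
   the carry in icc: addcc forms the full low-word sum and the carry out of
   bit 31, addccc then re-adds the upper halves of the low words (plus that
   carry) into %g0 so that icc holds the carry out of bit 63, which the final
   addc applies to the high words; subcc/subccc/subc do the same for borrows. */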
1780 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1781   __asm__ (								\
1782        "addcc	%r4,%5,%1\n"						\
1783       "	addccc	%r6,%7,%%g0\n"						\
1784       "	addc	%r2,%3,%0"						\
1785        : "=r" (sh), "=&r" (sl)						\
1786        : "rJ"  ((UDItype)(ah)), "rI" ((UDItype)(bh)),			\
1787 	 "%rJ" ((UDItype)(al)), "rI" ((UDItype)(bl)),			\
1788 	 "%rJ" ((UDItype)(al) >> 32), "rI" ((UDItype)(bl) >> 32)	\
1789 	   __CLOBBER_CC)
1790 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1791   __asm__ (								\
1792        "subcc	%r4,%5,%1\n"						\
1793       "	subccc	%r6,%7,%%g0\n"						\
1794       "	subc	%r2,%3,%0"						\
1795        : "=r" (sh), "=&r" (sl)						\
1796        : "rJ" ((UDItype)(ah)), "rI" ((UDItype)(bh)),			\
1797 	 "rJ" ((UDItype)(al)), "rI" ((UDItype)(bl)),			\
1798 	 "rJ" ((UDItype)(al) >> 32), "rI" ((UDItype)(bl) >> 32)		\
1799 	   __CLOBBER_CC)
1800 #if __VIS__ >= 0x300
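/* VIS3 provides addxc, which consumes the xcc carry directly, so the
   half-word carry regeneration above is unnecessary; umulxhi gives the high
   64 bits of the product and lzd the leading-zero count in one instruction. */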
1801 #undef add_ssaaaa
1802 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1803   __asm__ (								\
1804        "addcc	%r4, %5, %1\n"						\
1805       "	addxc	%r2, %r3, %0"						\
1806 	  : "=r" (sh), "=&r" (sl)					\
1807        : "rJ"  ((UDItype)(ah)), "rJ" ((UDItype)(bh)),			\
1808 	 "%rJ" ((UDItype)(al)), "rI" ((UDItype)(bl)) __CLOBBER_CC)
1809 #define umul_ppmm(ph, pl, m0, m1) \
1810   do {									\
1811     UDItype __m0 = (m0), __m1 = (m1);					\
1812     (pl) = __m0 * __m1;							\
1813     __asm__ ("umulxhi\t%2, %1, %0"					\
1814 	     : "=r" (ph)						\
1815 	     : "%r" (__m0), "r" (__m1));				\
1816   } while (0)
1817 #define count_leading_zeros(count, x) \
1818   __asm__ ("lzd\t%1,%0" : "=r" (count) : "r" (x))
1819 /* Needed by count_leading_zeros_32 in sparc64.h.  */
1820 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
1821 #endif
1822 #endif
1823 
1824 #if (defined (__vax) || defined (__vax__)) && W_TYPE_SIZE == 32
1825 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1826   __asm__ ("addl2 %5,%1\n\tadwc %3,%0"					\
1827 	   : "=g" (sh), "=&g" (sl)					\
1828 	   : "0"  ((USItype)(ah)), "g" ((USItype)(bh)),			\
1829 	     "%1" ((USItype)(al)), "g" ((USItype)(bl)))
1830 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1831   __asm__ ("subl2 %5,%1\n\tsbwc %3,%0"					\
1832 	   : "=g" (sh), "=&g" (sl)					\
1833 	   : "0" ((USItype)(ah)), "g" ((USItype)(bh)),			\
1834 	     "1" ((USItype)(al)), "g" ((USItype)(bl)))
1835 #define smul_ppmm(xh, xl, m0, m1) \
1836   do {									\
1837     union {UDItype __ll;						\
1838 	   struct {USItype __l, __h;} __i;				\
1839 	  } __x;							\
1840     USItype __m0 = (m0), __m1 = (m1);					\
1841     __asm__ ("emul %1,%2,$0,%0"						\
1842 	     : "=g" (__x.__ll) : "g" (__m0), "g" (__m1));		\
1843     (xh) = __x.__i.__h; (xl) = __x.__i.__l;				\
1844   } while (0)
1845 #define sdiv_qrnnd(q, r, n1, n0, d) \
1846   do {									\
1847     union {DItype __ll;							\
1848 	   struct {SItype __l, __h;} __i;				\
1849 	  } __x;							\
1850     __x.__i.__h = n1; __x.__i.__l = n0;					\
1851     __asm__ ("ediv %3,%2,%0,%1"						\
1852 	     : "=g" (q), "=g" (r) : "g" (__x.__ll), "g" (d));		\
1853   } while (0)
1854 #if 0
1855 /* FIXME: This instruction appears to be unimplemented on some systems (vax
1856    8800 maybe). */
1857 #define count_trailing_zeros(count,x)					\
1858   do {									\
1859     __asm__ ("ffs 0, 31, %1, %0"					\
1860 	     : "=g" (count)						\
1861 	     : "g" ((USItype) (x)));					\
1862   } while (0)
1863 #endif
1864 #endif /* vax */
1865 
1866 #if defined (__z8000__) && W_TYPE_SIZE == 16
1867 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1868   __asm__ ("add	%H1,%H5\n\tadc	%H0,%H3"				\
1869 	   : "=r" (sh), "=&r" (sl)					\
1870 	   : "0"  ((unsigned int)(ah)), "r" ((unsigned int)(bh)),	\
1871 	     "%1" ((unsigned int)(al)), "rQR" ((unsigned int)(bl)))
1872 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1873   __asm__ ("sub	%H1,%H5\n\tsbc	%H0,%H3"				\
1874 	   : "=r" (sh), "=&r" (sl)					\
1875 	   : "0" ((unsigned int)(ah)), "r" ((unsigned int)(bh)),	\
1876 	     "1" ((unsigned int)(al)), "rQR" ((unsigned int)(bl)))
1877 #define umul_ppmm(xh, xl, m0, m1) \
1878   do {									\
1879     union {long int __ll;						\
1880 	   struct {unsigned int __h, __l;} __i;				\
1881 	  } __x;							\
1882     unsigned int __m0 = (m0), __m1 = (m1);				\
1883     __asm__ ("mult	%S0,%H3"					\
1884 	     : "=r" (__x.__i.__h), "=r" (__x.__i.__l)			\
1885 	     : "%1" (m0), "rQR" (m1));					\
1886     (xh) = __x.__i.__h; (xl) = __x.__i.__l;				\
1887     (xh) += ((((signed int) __m0 >> 15) & __m1)				\
1888 	     + (((signed int) __m1 >> 15) & __m0));			\
1889   } while (0)
1890 #endif /* __z8000__ */
1891 
1892 #endif /* __GNUC__ */
1893 
1894 #endif /* NO_ASM */
1895 
1896 
1897 /* FIXME: "sidi" here is highly doubtful; it should sometimes be "diti".  */
1898 #if !defined (umul_ppmm) && defined (__umulsidi3)
1899 #define umul_ppmm(ph, pl, m0, m1) \
1900   do {									\
1901     UDWtype __ll = __umulsidi3 (m0, m1);				\
1902     ph = (UWtype) (__ll >> W_TYPE_SIZE);				\
1903     pl = (UWtype) __ll;							\
1904   } while (0)
1905 #endif
1906 
1907 #if !defined (__umulsidi3)
1908 #define __umulsidi3(u, v) \
1909   ({UWtype __hi, __lo;							\
1910     umul_ppmm (__hi, __lo, u, v);					\
1911     ((UDWtype) __hi << W_TYPE_SIZE) | __lo; })
1912 #endif
1913 
1914 
1915 #if defined (__cplusplus)
1916 #define __longlong_h_C "C"
1917 #else
1918 #define __longlong_h_C
1919 #endif
1920 
1921 /* Use mpn_umul_ppmm or mpn_udiv_qrnnd functions, if they exist.  The "_r"
1922    forms have "reversed" arguments, meaning the pointer is last, which
1923    sometimes allows better parameter passing, in particular on 64-bit
1924    hppa. */
1925 
1926 #define mpn_umul_ppmm  __MPN(umul_ppmm)
1927 extern __longlong_h_C UWtype mpn_umul_ppmm (UWtype *, UWtype, UWtype);
1928 
1929 #if ! defined (umul_ppmm) && HAVE_NATIVE_mpn_umul_ppmm  \
1930   && ! defined (LONGLONG_STANDALONE)
1931 #define umul_ppmm(wh, wl, u, v)						\
1932   do {									\
1933     UWtype __umul_ppmm__p0;						\
1934     (wh) = mpn_umul_ppmm (&__umul_ppmm__p0, (UWtype) (u), (UWtype) (v));\
1935     (wl) = __umul_ppmm__p0;						\
1936   } while (0)
1937 #endif
1938 
1939 #define mpn_umul_ppmm_r  __MPN(umul_ppmm_r)
1940 extern __longlong_h_C UWtype mpn_umul_ppmm_r (UWtype, UWtype, UWtype *);
1941 
1942 #if ! defined (umul_ppmm) && HAVE_NATIVE_mpn_umul_ppmm_r	\
1943   && ! defined (LONGLONG_STANDALONE)
1944 #define umul_ppmm(wh, wl, u, v)						\
1945   do {									\
1946     UWtype __umul_p0;							\
1947     (wh) = mpn_umul_ppmm_r ((UWtype) (u), (UWtype) (v), &__umul_p0);	\
1948     (wl) = __umul_p0;							\
1949   } while (0)
1950 #endif
1951 
1952 #define mpn_udiv_qrnnd  __MPN(udiv_qrnnd)
1953 extern __longlong_h_C UWtype mpn_udiv_qrnnd (UWtype *, UWtype, UWtype, UWtype);
1954 
1955 #if ! defined (udiv_qrnnd) && HAVE_NATIVE_mpn_udiv_qrnnd	\
1956   && ! defined (LONGLONG_STANDALONE)
1957 #define udiv_qrnnd(q, r, n1, n0, d)					\
1958   do {									\
1959     UWtype __udiv_qrnnd_r;						\
1960     (q) = mpn_udiv_qrnnd (&__udiv_qrnnd_r,				\
1961 			  (UWtype) (n1), (UWtype) (n0), (UWtype) d);	\
1962     (r) = __udiv_qrnnd_r;						\
1963   } while (0)
1964 #endif
1965 
1966 #define mpn_udiv_qrnnd_r  __MPN(udiv_qrnnd_r)
1967 extern __longlong_h_C UWtype mpn_udiv_qrnnd_r (UWtype, UWtype, UWtype, UWtype *);
1968 
1969 #if ! defined (udiv_qrnnd) && HAVE_NATIVE_mpn_udiv_qrnnd_r	\
1970   && ! defined (LONGLONG_STANDALONE)
1971 #define udiv_qrnnd(q, r, n1, n0, d)					\
1972   do {									\
1973     UWtype __udiv_qrnnd_r;						\
1974     (q) = mpn_udiv_qrnnd_r ((UWtype) (n1), (UWtype) (n0), (UWtype) d,	\
1975 			    &__udiv_qrnnd_r);				\
1976     (r) = __udiv_qrnnd_r;						\
1977   } while (0)
1978 #endif
1979 
1980 
1981 /* If this machine has no inline assembler, use C macros.  */
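/* In these generic versions the low word is computed modulo 2^W_TYPE_SIZE;
   the carry out of the low-word addition is recovered with the unsigned
   wrap-around test (__x < (al)), and the borrow of the low-word subtraction
   with ((al) < (bl)).  */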
1982 
1983 #if !defined (add_ssaaaa)
1984 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1985   do {									\
1986     UWtype __x;								\
1987     __x = (al) + (bl);							\
1988     (sh) = (ah) + (bh) + (__x < (al));					\
1989     (sl) = __x;								\
1990   } while (0)
1991 #endif
1992 
1993 #if !defined (sub_ddmmss)
1994 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1995   do {									\
1996     UWtype __x;								\
1997     __x = (al) - (bl);							\
1998     (sh) = (ah) - (bh) - ((al) < (bl));					\
1999     (sl) = __x;								\
2000   } while (0)
2001 #endif
2002 
2003 /* If we lack umul_ppmm but have smul_ppmm, define umul_ppmm in terms of
2004    smul_ppmm.  */
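/* Why the correction works: writing u = su + B*um and v = sv + B*vm, where
   B = 2^W_TYPE_SIZE, su/sv are the signed interpretations and um/vm the top
   bits of u and v, the signed and unsigned products have the same low word,
   and the unsigned high word exceeds the signed one by um*v + vm*u (mod B).
   The -(x >> (W_TYPE_SIZE - 1)) masks below add exactly those two terms.  */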
2005 #if !defined (umul_ppmm) && defined (smul_ppmm)
2006 #define umul_ppmm(w1, w0, u, v)						\
2007   do {									\
2008     UWtype __w1;							\
2009     UWtype __xm0 = (u), __xm1 = (v);					\
2010     smul_ppmm (__w1, w0, __xm0, __xm1);					\
2011     (w1) = __w1 + (-(__xm0 >> (W_TYPE_SIZE - 1)) & __xm1)		\
2012 		+ (-(__xm1 >> (W_TYPE_SIZE - 1)) & __xm0);		\
2013   } while (0)
2014 #endif
2015 
2016 /* If we still don't have umul_ppmm, define it using plain C.
2017 
2018    For reference, when this code is used for squaring (i.e. u and v identical
2019    expressions), gcc recognises that __x1 and __x2 are the same and generates
2020    3 multiplies, not 4.  The subsequent additions could be optimized a bit,
2021    but the only place GMP currently uses such a square is mpn_sqr_basecase,
2022    and chips obliged to use this generic C umul will have plenty of worse
2023    performance problems than a couple of extra instructions on the diagonal
2024    of sqr_basecase.  */
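/* Concretely, with B = 2^(W_TYPE_SIZE/2) the macro below splits
       u = __uh*B + __ul,   v = __vh*B + __vl
   and accumulates the four half-word products
       u*v = __uh*__vh*B^2 + (__uh*__vl + __ul*__vh)*B + __ul*__vl,
   checking explicitly for the carry that can occur when the two middle
   products are summed.  */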
2025 
2026 #if !defined (umul_ppmm)
2027 #define umul_ppmm(w1, w0, u, v)						\
2028   do {									\
2029     UWtype __x0, __x1, __x2, __x3;					\
2030     UHWtype __ul, __vl, __uh, __vh;					\
2031     UWtype __u = (u), __v = (v);					\
2032 									\
2033     __ul = __ll_lowpart (__u);						\
2034     __uh = __ll_highpart (__u);						\
2035     __vl = __ll_lowpart (__v);						\
2036     __vh = __ll_highpart (__v);						\
2037 									\
2038     __x0 = (UWtype) __ul * __vl;					\
2039     __x1 = (UWtype) __ul * __vh;					\
2040     __x2 = (UWtype) __uh * __vl;					\
2041     __x3 = (UWtype) __uh * __vh;					\
2042 									\
2043     __x1 += __ll_highpart (__x0);/* this can't give carry */		\
2044     __x1 += __x2;		/* but this indeed can */		\
2045     if (__x1 < __x2)		/* did we get it? */			\
2046       __x3 += __ll_B;		/* yes, add it in the proper pos. */	\
2047 									\
2048     (w1) = __x3 + __ll_highpart (__x1);					\
2049     (w0) = (__x1 << W_TYPE_SIZE/2) + __ll_lowpart (__x0);		\
2050   } while (0)
2051 #endif
2052 
2053 /* If we don't have smul_ppmm, define it using umul_ppmm (which surely will
2054    exist in one form or another).  */
2055 #if !defined (smul_ppmm)
2056 #define smul_ppmm(w1, w0, u, v)						\
2057   do {									\
2058     UWtype __w1;							\
2059     UWtype __xm0 = (u), __xm1 = (v);					\
2060     umul_ppmm (__w1, w0, __xm0, __xm1);					\
2061     (w1) = __w1 - (-(__xm0 >> (W_TYPE_SIZE - 1)) & __xm1)		\
2062 		- (-(__xm1 >> (W_TYPE_SIZE - 1)) & __xm0);		\
2063   } while (0)
2064 #endif
2065 
2066 /* Define this unconditionally, so it can be used for debugging.  */
2067 #define __udiv_qrnnd_c(q, r, n1, n0, d) \
2068   do {									\
2069     UWtype __d1, __d0, __q1, __q0, __r1, __r0, __m;			\
2070 									\
2071     ASSERT ((d) != 0);							\
2072     ASSERT ((n1) < (d));						\
2073 									\
2074     __d1 = __ll_highpart (d);						\
2075     __d0 = __ll_lowpart (d);						\
2076 									\
2077     __q1 = (n1) / __d1;							\
2078     __r1 = (n1) - __q1 * __d1;						\
2079     __m = __q1 * __d0;							\
2080     __r1 = __r1 * __ll_B | __ll_highpart (n0);				\
2081     if (__r1 < __m)							\
2082       {									\
2083 	__q1--, __r1 += (d);						\
2084 	if (__r1 >= (d)) /* i.e. we didn't get carry when adding to __r1 */\
2085 	  if (__r1 < __m)						\
2086 	    __q1--, __r1 += (d);					\
2087       }									\
2088     __r1 -= __m;							\
2089 									\
2090     __q0 = __r1 / __d1;							\
2091     __r0 = __r1  - __q0 * __d1;						\
2092     __m = __q0 * __d0;							\
2093     __r0 = __r0 * __ll_B | __ll_lowpart (n0);				\
2094     if (__r0 < __m)							\
2095       {									\
2096 	__q0--, __r0 += (d);						\
2097 	if (__r0 >= (d))						\
2098 	  if (__r0 < __m)						\
2099 	    __q0--, __r0 += (d);					\
2100       }									\
2101     __r0 -= __m;							\
2102 									\
2103     (q) = __q1 * __ll_B | __q0;						\
2104     (r) = __r0;								\
2105   } while (0)
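/* A sketch of typical use, for illustration only (q, r, n1, n0, d and cnt are
   the caller's variables): __udiv_qrnnd_c wants a normalized divisor, i.e.
   one with its most significant bit set, and n1 < d.  A caller that cannot
   guarantee normalization shifts first and adjusts the remainder afterwards:

     UWtype q, r, cnt;
     ASSERT (d != 0 && n1 < d);
     count_leading_zeros (cnt, d);
     if (cnt != 0)
       {
         d = d << cnt;
         n1 = (n1 << cnt) | (n0 >> (W_TYPE_SIZE - cnt));
         n0 = n0 << cnt;
       }
     __udiv_qrnnd_c (q, r, n1, n0, d);
     r = r >> cnt;

   The quotient is unchanged by the normalization; only the remainder has to
   be shifted back.  This is what UDIV_NEEDS_NORMALIZATION (set to 1 further
   down, where this macro becomes the udiv_qrnnd fallback) tells callers.  */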
2106 
2107 /* If the processor has no udiv_qrnnd but sdiv_qrnnd, go through
2108    __udiv_w_sdiv (defined in libgcc or elsewhere).  */
2109 #if !defined (udiv_qrnnd) && defined (sdiv_qrnnd) \
2110   && ! defined (LONGLONG_STANDALONE)
2111 #define udiv_qrnnd(q, r, nh, nl, d) \
2112   do {									\
2113     UWtype __r;								\
2114     (q) = __MPN(udiv_w_sdiv) (&__r, nh, nl, d);				\
2115     (r) = __r;								\
2116   } while (0)
2117 __GMP_DECLSPEC UWtype __MPN(udiv_w_sdiv) (UWtype *, UWtype, UWtype, UWtype);
2118 #endif
2119 
2120 /* If udiv_qrnnd was not defined for this processor, use __udiv_qrnnd_c.  */
2121 #if !defined (udiv_qrnnd)
2122 #define UDIV_NEEDS_NORMALIZATION 1
2123 #define udiv_qrnnd __udiv_qrnnd_c
2124 #endif
2125 
2126 #if !defined (count_leading_zeros)
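/* In the fallback below, __a ends up as one more than the bit offset of the
   highest nonzero 8-bit chunk of x (found by comparisons when W_TYPE_SIZE is
   32, by a byte scan otherwise).  For the final formula to yield the
   leading-zero count, __clz_tab[y] must equal the bit length of y plus one,
   with __clz_tab[0] == 1.  */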
2127 #define count_leading_zeros(count, x) \
2128   do {									\
2129     UWtype __xr = (x);							\
2130     UWtype __a;								\
2131 									\
2132     if (W_TYPE_SIZE == 32)						\
2133       {									\
2134 	__a = __xr < ((UWtype) 1 << 2*__BITS4)				\
2135 	  ? (__xr < ((UWtype) 1 << __BITS4) ? 1 : __BITS4 + 1)		\
2136 	  : (__xr < ((UWtype) 1 << 3*__BITS4) ? 2*__BITS4 + 1		\
2137 	  : 3*__BITS4 + 1);						\
2138       }									\
2139     else								\
2140       {									\
2141 	for (__a = W_TYPE_SIZE - 8; __a > 0; __a -= 8)			\
2142 	  if (((__xr >> __a) & 0xff) != 0)				\
2143 	    break;							\
2144 	++__a;								\
2145       }									\
2146 									\
2147     (count) = W_TYPE_SIZE + 1 - __a - __clz_tab[__xr >> __a];		\
2148   } while (0)
2149 /* This version gives a well-defined value for zero. */
2150 #define COUNT_LEADING_ZEROS_0 (W_TYPE_SIZE - 1)
2151 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
2152 #define COUNT_LEADING_ZEROS_SLOW
2153 #endif
2154 
2155 /* clz_tab needed by mpn/x86/pentium/mod_1.asm in a fat binary */
2156 #if HAVE_HOST_CPU_FAMILY_x86 && WANT_FAT_BINARY
2157 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
2158 #endif
2159 
2160 #ifdef COUNT_LEADING_ZEROS_NEED_CLZ_TAB
2161 extern const unsigned char __GMP_DECLSPEC __clz_tab[129];
2162 #endif
2163 
2164 #if !defined (count_trailing_zeros)
2165 #if !defined (COUNT_LEADING_ZEROS_SLOW)
2166 /* Define count_trailing_zeros using an asm count_leading_zeros.  */
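/* __ctz_x & -__ctz_x isolates the lowest set bit of __ctz_x, say 2^k; its
   leading-zero count is W_TYPE_SIZE - 1 - k, so subtracting that from
   W_TYPE_SIZE - 1 recovers k, the number of trailing zeros.  */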
2167 #define count_trailing_zeros(count, x)					\
2168   do {									\
2169     UWtype __ctz_x = (x);						\
2170     UWtype __ctz_c;							\
2171     ASSERT (__ctz_x != 0);						\
2172     count_leading_zeros (__ctz_c, __ctz_x & -__ctz_x);			\
2173     (count) = W_TYPE_SIZE - 1 - __ctz_c;				\
2174   } while (0)
2175 #else
2176 /* Define count_trailing_zeros in plain C, assuming small counts are common.
2177    We use clz_tab without ado, since the C count_leading_zeros above will have
2178    pulled it in.  */
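/* Isolating the lowest set bit 2^k and looking it up gives __clz_tab[2^k],
   which is k + 2 (bit length plus one), hence the "- 2" in the common
   byte-0 case; the loop handles whole low zero bytes by starting __ctz_c at
   8 - 2 and stepping it by 8 for each byte skipped.  */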
2179 #define count_trailing_zeros(count, x)					\
2180   do {									\
2181     UWtype __ctz_x = (x);						\
2182     int __ctz_c;							\
2183 									\
2184     if (LIKELY ((__ctz_x & 0xff) != 0))					\
2185       (count) = __clz_tab[__ctz_x & -__ctz_x] - 2;			\
2186     else								\
2187       {									\
2188 	for (__ctz_c = 8 - 2; __ctz_c < W_TYPE_SIZE - 2; __ctz_c += 8)	\
2189 	  {								\
2190 	    __ctz_x >>= 8;						\
2191 	    if (LIKELY ((__ctz_x & 0xff) != 0))				\
2192 	      break;							\
2193 	  }								\
2194 									\
2195 	(count) = __ctz_c + __clz_tab[__ctz_x & -__ctz_x];		\
2196       }									\
2197   } while (0)
2198 #endif
2199 #endif
2200 
2201 #ifndef UDIV_NEEDS_NORMALIZATION
2202 #define UDIV_NEEDS_NORMALIZATION 0
2203 #endif
2204 
2205 /* Whether udiv_qrnnd is actually implemented with udiv_qrnnd_preinv, and
2206    hence whether the latter should always be used.  */
2207 #ifndef UDIV_PREINV_ALWAYS
2208 #define UDIV_PREINV_ALWAYS 0
2209 #endif
2210 
2211 /* Give defaults for UMUL_TIME and UDIV_TIME.  */
2212 #ifndef UMUL_TIME
2213 #define UMUL_TIME 1
2214 #endif
2215 
2216 #ifndef UDIV_TIME
2217 #define UDIV_TIME UMUL_TIME
2218 #endif
2219