/* longlong.h -- definitions for mixed size 32/64 bit arithmetic.

Copyright 1991, 1992, 1993, 1994, 1996, 1997, 1999, 2000, 2001, 2002, 2003,
2004, 2005, 2007, 2008, 2009, 2011, 2012 Free Software Foundation, Inc.

This file is free software; you can redistribute it and/or modify it under the
terms of the GNU Lesser General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your option) any
later version.

This file is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
details.

You should have received a copy of the GNU Lesser General Public License
along with this file.  If not, see http://www.gnu.org/licenses/.  */

/* You have to define the following before including this file:

   UWtype -- An unsigned type, default type for operations (typically a "word")
   UHWtype -- An unsigned type, at least half the size of UWtype
   UDWtype -- An unsigned type, at least twice as large as UWtype
   W_TYPE_SIZE -- size in bits of UWtype

   SItype, USItype -- Signed and unsigned 32 bit types
   DItype, UDItype -- Signed and unsigned 64 bit types

   On a 32 bit machine UWtype should typically be USItype;
   on a 64 bit machine, UWtype should typically be UDItype.

   Optionally, define:

   LONGLONG_STANDALONE -- Avoid code that needs machine-dependent support files
   NO_ASM -- Disable inline asm


   CAUTION!  Using this version of longlong.h outside of GMP is not safe.  You
   need to include gmp.h and gmp-impl.h, or certain things might not work as
   expected.
*/

#define __BITS4 (W_TYPE_SIZE / 4)
#define __ll_B ((UWtype) 1 << (W_TYPE_SIZE / 2))
#define __ll_lowpart(t) ((UWtype) (t) & (__ll_B - 1))
#define __ll_highpart(t) ((UWtype) (t) >> (W_TYPE_SIZE / 2))

/* This is used to make sure no undesirable sharing between different libraries
   that use this file takes place.  */
#ifndef __MPN
#define __MPN(x) __##x
#endif

/* Define auxiliary asm macros.

   1) umul_ppmm(high_prod, low_prod, multiplier, multiplicand) multiplies two
   UWtype integers MULTIPLIER and MULTIPLICAND, and generates a two UWtype
   word product in HIGH_PROD and LOW_PROD.

   2) __umulsidi3(a,b) multiplies two UWtype integers A and B, and returns a
   UDWtype product.  This is just a variant of umul_ppmm.

   3) udiv_qrnnd(quotient, remainder, high_numerator, low_numerator,
   denominator) divides a UDWtype, composed of the UWtype integers
   HIGH_NUMERATOR and LOW_NUMERATOR, by DENOMINATOR and places the quotient
   in QUOTIENT and the remainder in REMAINDER.  HIGH_NUMERATOR must be less
   than DENOMINATOR for correct operation.  If the macro additionally
   requires the most significant bit of DENOMINATOR to be 1, the
   pre-processor symbol UDIV_NEEDS_NORMALIZATION is defined to 1.

   4) sdiv_qrnnd(quotient, remainder, high_numerator, low_numerator,
   denominator).  Like udiv_qrnnd but the numbers are signed.  The quotient
   is rounded towards 0.

   5) count_leading_zeros(count, x) counts the number of zero-bits from the
   msb to the first non-zero bit in the UWtype X.  This is the number of
   steps X needs to be shifted left to set the msb.  Undefined for X == 0,
   unless the symbol COUNT_LEADING_ZEROS_0 is defined to some value.

   6) count_trailing_zeros(count, x) like count_leading_zeros, but counts
   from the least significant end.

   7) add_ssaaaa(high_sum, low_sum, high_addend_1, low_addend_1,
   high_addend_2, low_addend_2) adds two two-word UWtype integers, composed
   of HIGH_ADDEND_1 and LOW_ADDEND_1, and HIGH_ADDEND_2 and LOW_ADDEND_2
   respectively.  The result is placed in HIGH_SUM and LOW_SUM.  Overflow
   (i.e. carry out) is not stored anywhere, and is lost.

   8) sub_ddmmss(high_difference, low_difference, high_minuend, low_minuend,
   high_subtrahend, low_subtrahend) subtracts two two-word UWtype integers,
   composed of HIGH_MINUEND and LOW_MINUEND, and HIGH_SUBTRAHEND and
   LOW_SUBTRAHEND respectively.  The result is placed in HIGH_DIFFERENCE
   and LOW_DIFFERENCE.  Overflow (i.e. borrow out) is not stored anywhere,
   and is lost.

   If any of these macros are left undefined for a particular CPU,
   C macros are used.


   Notes:

   For add_ssaaaa the two high and two low addends can both commute, but
   unfortunately gcc only supports one "%" commutative in each asm block.
   This has always been so but is only documented in recent versions
   (eg. pre-release 3.3).  Having two or more "%"s can cause an internal
   compiler error in certain rare circumstances.

   Apparently it was only the last "%" that was ever actually respected, so
   the code has been updated to leave just that.  Clearly there's a free
   choice whether high or low should get it, if there's a reason to favour
   one over the other.  Also obviously when the constraints on the two
   operands are identical there's no benefit to the reloader in any "%" at
   all.

*/

/* The CPUs come in alphabetical order below.

   Please add support for more CPUs here, or improve the current support
   for the CPUs below!  */


/* count_leading_zeros_gcc_clz is count_leading_zeros implemented with the
   gcc 3.4 builtin __builtin_clzl or __builtin_clzll, according to our limb
   size.  Similarly count_trailing_zeros_gcc_ctz uses __builtin_ctzl or
   __builtin_ctzll.

   These builtins are only used after checking what code they produce; on
   some chips they are merely libgcc calls, in which case we instead want
   an inline version (either asm or generic C).

   These builtins are better than an asm block of the same insn, since an
   asm block doesn't give gcc any information about scheduling or resource
   usage.  We keep an asm block for use with earlier gcc versions though.

   For reference, __builtin_ffs existed in gcc prior to __builtin_clz, but
   it's not used (for count_leading_zeros) because it generally gives extra
   code to ensure the result is 0 when the input is 0, which we don't need
   or want.
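
   As a quick illustration of the intended contract (not part of the macro
   definitions below), on a machine with W_TYPE_SIZE == 64 and a non-zero
   argument one would get for example

     count_leading_zeros_gcc_clz (c, (UWtype) 1 << 40);     c becomes 23
     count_trailing_zeros_gcc_ctz (c, (UWtype) 0x28);       c becomes 3

   Both remain undefined for a zero argument, matching the contract of the
   underlying __builtin_clz / __builtin_ctz builtins and the ASSERTs in the
   definitions below.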
*/ 140 141 #ifdef _LONG_LONG_LIMB 142 #define count_leading_zeros_gcc_clz(count,x) \ 143 do { \ 144 ASSERT ((x) != 0); \ 145 (count) = __builtin_clzll (x); \ 146 } while (0) 147 #else 148 #define count_leading_zeros_gcc_clz(count,x) \ 149 do { \ 150 ASSERT ((x) != 0); \ 151 (count) = __builtin_clzl (x); \ 152 } while (0) 153 #endif 154 155 #ifdef _LONG_LONG_LIMB 156 #define count_trailing_zeros_gcc_ctz(count,x) \ 157 do { \ 158 ASSERT ((x) != 0); \ 159 (count) = __builtin_ctzll (x); \ 160 } while (0) 161 #else 162 #define count_trailing_zeros_gcc_ctz(count,x) \ 163 do { \ 164 ASSERT ((x) != 0); \ 165 (count) = __builtin_ctzl (x); \ 166 } while (0) 167 #endif 168 169 170 /* FIXME: The macros using external routines like __MPN(count_leading_zeros) 171 don't need to be under !NO_ASM */ 172 #if ! defined (NO_ASM) 173 174 #if defined (__alpha) && W_TYPE_SIZE == 64 175 /* Most alpha-based machines, except Cray systems. */ 176 #if defined (__GNUC__) 177 #if __GMP_GNUC_PREREQ (3,3) 178 #define umul_ppmm(ph, pl, m0, m1) \ 179 do { \ 180 UDItype __m0 = (m0), __m1 = (m1); \ 181 (ph) = __builtin_alpha_umulh (__m0, __m1); \ 182 (pl) = __m0 * __m1; \ 183 } while (0) 184 #else 185 #define umul_ppmm(ph, pl, m0, m1) \ 186 do { \ 187 UDItype __m0 = (m0), __m1 = (m1); \ 188 __asm__ ("umulh %r1,%2,%0" \ 189 : "=r" (ph) \ 190 : "%rJ" (m0), "rI" (m1)); \ 191 (pl) = __m0 * __m1; \ 192 } while (0) 193 #endif 194 #define UMUL_TIME 18 195 #else /* ! __GNUC__ */ 196 #include <machine/builtins.h> 197 #define umul_ppmm(ph, pl, m0, m1) \ 198 do { \ 199 UDItype __m0 = (m0), __m1 = (m1); \ 200 (ph) = __UMULH (m0, m1); \ 201 (pl) = __m0 * __m1; \ 202 } while (0) 203 #endif 204 #ifndef LONGLONG_STANDALONE 205 #define udiv_qrnnd(q, r, n1, n0, d) \ 206 do { UWtype __di; \ 207 __di = __MPN(invert_limb) (d); \ 208 udiv_qrnnd_preinv (q, r, n1, n0, d, __di); \ 209 } while (0) 210 #define UDIV_PREINV_ALWAYS 1 211 #define UDIV_NEEDS_NORMALIZATION 1 212 #define UDIV_TIME 220 213 #endif /* LONGLONG_STANDALONE */ 214 215 /* clz_tab is required in all configurations, since mpn/alpha/cntlz.asm 216 always goes into libgmp.so, even when not actually used. */ 217 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB 218 219 #if defined (__GNUC__) && HAVE_HOST_CPU_alpha_CIX 220 #define count_leading_zeros(COUNT,X) \ 221 __asm__("ctlz %1,%0" : "=r"(COUNT) : "r"(X)) 222 #define count_trailing_zeros(COUNT,X) \ 223 __asm__("cttz %1,%0" : "=r"(COUNT) : "r"(X)) 224 #endif /* clz/ctz using cix */ 225 226 #if ! defined (count_leading_zeros) \ 227 && defined (__GNUC__) && ! defined (LONGLONG_STANDALONE) 228 /* ALPHA_CMPBGE_0 gives "cmpbge $31,src,dst", ie. test src bytes == 0. 229 "$31" is written explicitly in the asm, since an "r" constraint won't 230 select reg 31. There seems no need to worry about "r31" syntax for cray, 231 since gcc itself (pre-release 3.4) emits just $31 in various places. */ 232 #define ALPHA_CMPBGE_0(dst, src) \ 233 do { asm ("cmpbge $31, %1, %0" : "=r" (dst) : "r" (src)); } while (0) 234 /* Zero bytes are turned into bits with cmpbge, a __clz_tab lookup counts 235 them, locating the highest non-zero byte. A second __clz_tab lookup 236 counts the leading zero bits in that byte, giving the result. 
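
   (For reference, "cmpbge $31,src,dst" compares each byte of register 31,
   i.e. zero, against the corresponding byte of src, setting bit i of dst
   exactly when byte i of src is zero.  For instance, an illustrative
   src = 0x0000000012000000 gives dst = 0xf7, with only bit 3 clear since
   only byte 3 is non-zero; the two table lookups below then turn that mask
   into the leading zero count.)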
*/ 237 #define count_leading_zeros(count, x) \ 238 do { \ 239 UWtype __clz__b, __clz__c, __clz__x = (x); \ 240 ALPHA_CMPBGE_0 (__clz__b, __clz__x); /* zero bytes */ \ 241 __clz__b = __clz_tab [(__clz__b >> 1) ^ 0x7F]; /* 8 to 1 byte */ \ 242 __clz__b = __clz__b * 8 - 7; /* 57 to 1 shift */ \ 243 __clz__x >>= __clz__b; \ 244 __clz__c = __clz_tab [__clz__x]; /* 8 to 1 bit */ \ 245 __clz__b = 65 - __clz__b; \ 246 (count) = __clz__b - __clz__c; \ 247 } while (0) 248 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB 249 #endif /* clz using cmpbge */ 250 251 #if ! defined (count_leading_zeros) && ! defined (LONGLONG_STANDALONE) 252 #if HAVE_ATTRIBUTE_CONST 253 long __MPN(count_leading_zeros) (UDItype) __attribute__ ((const)); 254 #else 255 long __MPN(count_leading_zeros) (UDItype); 256 #endif 257 #define count_leading_zeros(count, x) \ 258 ((count) = __MPN(count_leading_zeros) (x)) 259 #endif /* clz using mpn */ 260 #endif /* __alpha */ 261 262 #if defined (__AVR) && W_TYPE_SIZE == 8 263 #define umul_ppmm(ph, pl, m0, m1) \ 264 do { \ 265 unsigned short __p = (unsigned short) (m0) * (m1); \ 266 (ph) = __p >> 8; \ 267 (pl) = __p; \ 268 } while (0) 269 #endif /* AVR */ 270 271 #if defined (_CRAY) && W_TYPE_SIZE == 64 272 #include <intrinsics.h> 273 #define UDIV_PREINV_ALWAYS 1 274 #define UDIV_NEEDS_NORMALIZATION 1 275 #define UDIV_TIME 220 276 long __MPN(count_leading_zeros) (UDItype); 277 #define count_leading_zeros(count, x) \ 278 ((count) = _leadz ((UWtype) (x))) 279 #if defined (_CRAYIEEE) /* I.e., Cray T90/ieee, T3D, and T3E */ 280 #define umul_ppmm(ph, pl, m0, m1) \ 281 do { \ 282 UDItype __m0 = (m0), __m1 = (m1); \ 283 (ph) = _int_mult_upper (m0, m1); \ 284 (pl) = __m0 * __m1; \ 285 } while (0) 286 #ifndef LONGLONG_STANDALONE 287 #define udiv_qrnnd(q, r, n1, n0, d) \ 288 do { UWtype __di; \ 289 __di = __MPN(invert_limb) (d); \ 290 udiv_qrnnd_preinv (q, r, n1, n0, d, __di); \ 291 } while (0) 292 #endif /* LONGLONG_STANDALONE */ 293 #endif /* _CRAYIEEE */ 294 #endif /* _CRAY */ 295 296 #if defined (__ia64) && W_TYPE_SIZE == 64 297 /* This form encourages gcc (pre-release 3.4 at least) to emit predicated 298 "sub r=r,r" and "sub r=r,r,1", giving a 2 cycle latency. The generic 299 code using "al<bl" arithmetically comes out making an actual 0 or 1 in a 300 register, which takes an extra cycle. */ 301 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ 302 do { \ 303 UWtype __x; \ 304 __x = (al) - (bl); \ 305 if ((al) < (bl)) \ 306 (sh) = (ah) - (bh) - 1; \ 307 else \ 308 (sh) = (ah) - (bh); \ 309 (sl) = __x; \ 310 } while (0) 311 #if defined (__GNUC__) && ! defined (__INTEL_COMPILER) 312 /* Do both product parts in assembly, since that gives better code with 313 all gcc versions. Some callers will just use the upper part, and in 314 that situation we waste an instruction, but not any cycles. 
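
   Viewed in plain C, the pair produced here is simply the double-word
   product split in two; using the UDWtype required at the top of this file
   (for W_TYPE_SIZE == 64 that means a 128-bit type such as gcc's unsigned
   __int128, an assumption about the compiler) the same values could be
   written as

     UDWtype __p = (UDWtype) (m0) * (m1);
     (ph) = (UWtype) (__p >> W_TYPE_SIZE);
     (pl) = (UWtype) __p;

   The asm form is preferred since, as noted above, it gives better code
   with all gcc versions and does not rely on such a type being available.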
*/ 315 #define umul_ppmm(ph, pl, m0, m1) \ 316 __asm__ ("xma.hu %0 = %2, %3, f0\n\txma.l %1 = %2, %3, f0" \ 317 : "=&f" (ph), "=f" (pl) \ 318 : "f" (m0), "f" (m1)) 319 #define UMUL_TIME 14 320 #define count_leading_zeros(count, x) \ 321 do { \ 322 UWtype _x = (x), _y, _a, _c; \ 323 __asm__ ("mux1 %0 = %1, @rev" : "=r" (_y) : "r" (_x)); \ 324 __asm__ ("czx1.l %0 = %1" : "=r" (_a) : "r" (-_y | _y)); \ 325 _c = (_a - 1) << 3; \ 326 _x >>= _c; \ 327 if (_x >= 1 << 4) \ 328 _x >>= 4, _c += 4; \ 329 if (_x >= 1 << 2) \ 330 _x >>= 2, _c += 2; \ 331 _c += _x >> 1; \ 332 (count) = W_TYPE_SIZE - 1 - _c; \ 333 } while (0) 334 /* similar to what gcc does for __builtin_ffs, but 0 based rather than 1 335 based, and we don't need a special case for x==0 here */ 336 #define count_trailing_zeros(count, x) \ 337 do { \ 338 UWtype __ctz_x = (x); \ 339 __asm__ ("popcnt %0 = %1" \ 340 : "=r" (count) \ 341 : "r" ((__ctz_x-1) & ~__ctz_x)); \ 342 } while (0) 343 #endif 344 #if defined (__INTEL_COMPILER) 345 #include <ia64intrin.h> 346 #define umul_ppmm(ph, pl, m0, m1) \ 347 do { \ 348 UWtype _m0 = (m0), _m1 = (m1); \ 349 ph = _m64_xmahu (_m0, _m1, 0); \ 350 pl = _m0 * _m1; \ 351 } while (0) 352 #endif 353 #ifndef LONGLONG_STANDALONE 354 #define udiv_qrnnd(q, r, n1, n0, d) \ 355 do { UWtype __di; \ 356 __di = __MPN(invert_limb) (d); \ 357 udiv_qrnnd_preinv (q, r, n1, n0, d, __di); \ 358 } while (0) 359 #define UDIV_PREINV_ALWAYS 1 360 #define UDIV_NEEDS_NORMALIZATION 1 361 #endif 362 #define UDIV_TIME 220 363 #endif 364 365 366 #if defined (__GNUC__) 367 368 /* We sometimes need to clobber "cc" with gcc2, but that would not be 369 understood by gcc1. Use cpp to avoid major code duplication. */ 370 #if __GNUC__ < 2 371 #define __CLOBBER_CC 372 #define __AND_CLOBBER_CC 373 #else /* __GNUC__ >= 2 */ 374 #define __CLOBBER_CC : "cc" 375 #define __AND_CLOBBER_CC , "cc" 376 #endif /* __GNUC__ < 2 */ 377 378 #if (defined (__a29k__) || defined (_AM29K)) && W_TYPE_SIZE == 32 379 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ 380 __asm__ ("add %1,%4,%5\n\taddc %0,%2,%3" \ 381 : "=r" (sh), "=&r" (sl) \ 382 : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl)) 383 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ 384 __asm__ ("sub %1,%4,%5\n\tsubc %0,%2,%3" \ 385 : "=r" (sh), "=&r" (sl) \ 386 : "r" (ah), "rI" (bh), "r" (al), "rI" (bl)) 387 #define umul_ppmm(xh, xl, m0, m1) \ 388 do { \ 389 USItype __m0 = (m0), __m1 = (m1); \ 390 __asm__ ("multiplu %0,%1,%2" \ 391 : "=r" (xl) \ 392 : "r" (__m0), "r" (__m1)); \ 393 __asm__ ("multmu %0,%1,%2" \ 394 : "=r" (xh) \ 395 : "r" (__m0), "r" (__m1)); \ 396 } while (0) 397 #define udiv_qrnnd(q, r, n1, n0, d) \ 398 __asm__ ("dividu %0,%3,%4" \ 399 : "=r" (q), "=q" (r) \ 400 : "1" (n1), "r" (n0), "r" (d)) 401 #define count_leading_zeros(count, x) \ 402 __asm__ ("clz %0,%1" \ 403 : "=r" (count) \ 404 : "r" (x)) 405 #define COUNT_LEADING_ZEROS_0 32 406 #endif /* __a29k__ */ 407 408 #if defined (__arc__) 409 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ 410 __asm__ ("add.f\t%1, %4, %5\n\tadc\t%0, %2, %3" \ 411 : "=r" (sh), \ 412 "=&r" (sl) \ 413 : "r" ((USItype) (ah)), \ 414 "rIJ" ((USItype) (bh)), \ 415 "%r" ((USItype) (al)), \ 416 "rIJ" ((USItype) (bl))) 417 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ 418 __asm__ ("sub.f\t%1, %4, %5\n\tsbc\t%0, %2, %3" \ 419 : "=r" (sh), \ 420 "=&r" (sl) \ 421 : "r" ((USItype) (ah)), \ 422 "rIJ" ((USItype) (bh)), \ 423 "r" ((USItype) (al)), \ 424 "rIJ" ((USItype) (bl))) 425 #endif 426 427 #if defined (__arm__) && !defined (__thumb__) && W_TYPE_SIZE == 32 428 #define 
add_ssaaaa(sh, sl, ah, al, bh, bl) \ 429 __asm__ ("adds\t%1, %4, %5\n\tadc\t%0, %2, %3" \ 430 : "=r" (sh), "=&r" (sl) \ 431 : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl) __CLOBBER_CC) 432 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ 433 do { \ 434 if (__builtin_constant_p (al)) \ 435 { \ 436 if (__builtin_constant_p (ah)) \ 437 __asm__ ("rsbs\t%1, %5, %4\n\trsc\t%0, %3, %2" \ 438 : "=r" (sh), "=&r" (sl) \ 439 : "rI" (ah), "r" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \ 440 else \ 441 __asm__ ("rsbs\t%1, %5, %4\n\tsbc\t%0, %2, %3" \ 442 : "=r" (sh), "=&r" (sl) \ 443 : "r" (ah), "rI" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \ 444 } \ 445 else if (__builtin_constant_p (ah)) \ 446 { \ 447 if (__builtin_constant_p (bl)) \ 448 __asm__ ("subs\t%1, %4, %5\n\trsc\t%0, %3, %2" \ 449 : "=r" (sh), "=&r" (sl) \ 450 : "rI" (ah), "r" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \ 451 else \ 452 __asm__ ("rsbs\t%1, %5, %4\n\trsc\t%0, %3, %2" \ 453 : "=r" (sh), "=&r" (sl) \ 454 : "rI" (ah), "r" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \ 455 } \ 456 else if (__builtin_constant_p (bl)) \ 457 { \ 458 if (__builtin_constant_p (bh)) \ 459 __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3" \ 460 : "=r" (sh), "=&r" (sl) \ 461 : "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \ 462 else \ 463 __asm__ ("subs\t%1, %4, %5\n\trsc\t%0, %3, %2" \ 464 : "=r" (sh), "=&r" (sl) \ 465 : "rI" (ah), "r" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \ 466 } \ 467 else /* only bh might be a constant */ \ 468 __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3" \ 469 : "=r" (sh), "=&r" (sl) \ 470 : "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC);\ 471 } while (0) 472 #if 1 || defined (__arm_m__) /* `M' series has widening multiply support */ 473 #define umul_ppmm(xh, xl, a, b) \ 474 __asm__ ("umull %0,%1,%2,%3" : "=&r" (xl), "=&r" (xh) : "r" (a), "r" (b)) 475 #define UMUL_TIME 5 476 #define smul_ppmm(xh, xl, a, b) \ 477 __asm__ ("smull %0,%1,%2,%3" : "=&r" (xl), "=&r" (xh) : "r" (a), "r" (b)) 478 #ifndef LONGLONG_STANDALONE 479 #define udiv_qrnnd(q, r, n1, n0, d) \ 480 do { UWtype __di; \ 481 __di = __MPN(invert_limb) (d); \ 482 udiv_qrnnd_preinv (q, r, n1, n0, d, __di); \ 483 } while (0) 484 #define UDIV_PREINV_ALWAYS 1 485 #define UDIV_NEEDS_NORMALIZATION 1 486 #define UDIV_TIME 70 487 #endif /* LONGLONG_STANDALONE */ 488 #else 489 #define umul_ppmm(xh, xl, a, b) \ 490 __asm__ ("%@ Inlined umul_ppmm\n" \ 491 " mov %|r0, %2, lsr #16\n" \ 492 " mov %|r2, %3, lsr #16\n" \ 493 " bic %|r1, %2, %|r0, lsl #16\n" \ 494 " bic %|r2, %3, %|r2, lsl #16\n" \ 495 " mul %1, %|r1, %|r2\n" \ 496 " mul %|r2, %|r0, %|r2\n" \ 497 " mul %|r1, %0, %|r1\n" \ 498 " mul %0, %|r0, %0\n" \ 499 " adds %|r1, %|r2, %|r1\n" \ 500 " addcs %0, %0, #65536\n" \ 501 " adds %1, %1, %|r1, lsl #16\n" \ 502 " adc %0, %0, %|r1, lsr #16" \ 503 : "=&r" (xh), "=r" (xl) \ 504 : "r" (a), "r" (b) \ 505 : "r0", "r1", "r2") 506 #define UMUL_TIME 20 507 #ifndef LONGLONG_STANDALONE 508 #define udiv_qrnnd(q, r, n1, n0, d) \ 509 do { UWtype __r; \ 510 (q) = __MPN(udiv_qrnnd) (&__r, (n1), (n0), (d)); \ 511 (r) = __r; \ 512 } while (0) 513 extern UWtype __MPN(udiv_qrnnd) (UWtype *, UWtype, UWtype, UWtype); 514 #define UDIV_TIME 200 515 #endif /* LONGLONG_STANDALONE */ 516 #endif 517 /* This is a bizarre test, but GCC doesn't define useful common symbol. 
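
   (On the architecture versions listed below the clz instruction is also
   defined for a zero operand, returning 32, which is what allows
   COUNT_LEADING_ZEROS_0 to be set to 32; for example count_leading_zeros
   then gives 31 for x == 1 and 32 for x == 0.)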
*/ 518 #if defined (__ARM_ARCH_5__) || defined (__ARM_ARCH_5T__) || \ 519 defined (__ARM_ARCH_5E__) || defined (__ARM_ARCH_5TE__)|| \ 520 defined (__ARM_ARCH_6__) || defined (__ARM_ARCH_6J__) || \ 521 defined (__ARM_ARCH_6K__) || defined (__ARM_ARCH_6Z__) || \ 522 defined (__ARM_ARCH_6ZK__)|| defined (__ARM_ARCH_6T2__)|| \ 523 defined (__ARM_ARCH_6M__) || defined (__ARM_ARCH_7__) || \ 524 defined (__ARM_ARCH_7A__) || defined (__ARM_ARCH_7R__) || \ 525 defined (__ARM_ARCH_7M__) || defined (__ARM_ARCH_7EM__) 526 #define count_leading_zeros(count, x) \ 527 __asm__ ("clz\t%0, %1" : "=r" (count) : "r" (x)) 528 #define COUNT_LEADING_ZEROS_0 32 529 #endif 530 #endif /* __arm__ */ 531 532 #if defined (__aarch64__) && W_TYPE_SIZE == 64 533 /* FIXME: Extend the immediate range for the low word by using both 534 ADDS and SUBS, since they set carry in the same way. */ 535 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ 536 __asm__ ("adds\t%1, %x4, %5\n\tadc\t%0, %x2, %x3" \ 537 : "=r" (sh), "=&r" (sl) \ 538 : "rZ" (ah), "rZ" (bh), "%r" (al), "rI" (bl) __CLOBBER_CC) 539 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ 540 __asm__ ("subs\t%1, %x4, %5\n\tsbc\t%0, %x2, %x3" \ 541 : "=r,r" (sh), "=&r,&r" (sl) \ 542 : "rZ,rZ" (ah), "rZ,rZ" (bh), "r,Z" (al), "rI,r" (bl) __CLOBBER_CC) 543 #define umul_ppmm(ph, pl, m0, m1) \ 544 do { \ 545 UDItype __m0 = (m0), __m1 = (m1); \ 546 __asm__ ("umulh\t%0, %1, %2" : "=r" (ph) : "r" (m0), "r" (m1)); \ 547 (pl) = __m0 * __m1; \ 548 } while (0) 549 #define count_leading_zeros(count, x) \ 550 __asm__ ("clz\t%0, %1" : "=r" (count) : "r" (x)) 551 #define COUNT_LEADING_ZEROS_0 64 552 #endif /* __aarch64__ */ 553 554 #if defined (__clipper__) && W_TYPE_SIZE == 32 555 #define umul_ppmm(w1, w0, u, v) \ 556 ({union {UDItype __ll; \ 557 struct {USItype __l, __h;} __i; \ 558 } __x; \ 559 __asm__ ("mulwux %2,%0" \ 560 : "=r" (__x.__ll) \ 561 : "%0" ((USItype)(u)), "r" ((USItype)(v))); \ 562 (w1) = __x.__i.__h; (w0) = __x.__i.__l;}) 563 #define smul_ppmm(w1, w0, u, v) \ 564 ({union {DItype __ll; \ 565 struct {SItype __l, __h;} __i; \ 566 } __x; \ 567 __asm__ ("mulwx %2,%0" \ 568 : "=r" (__x.__ll) \ 569 : "%0" ((SItype)(u)), "r" ((SItype)(v))); \ 570 (w1) = __x.__i.__h; (w0) = __x.__i.__l;}) 571 #define __umulsidi3(u, v) \ 572 ({UDItype __w; \ 573 __asm__ ("mulwux %2,%0" \ 574 : "=r" (__w) : "%0" ((USItype)(u)), "r" ((USItype)(v))); \ 575 __w; }) 576 #endif /* __clipper__ */ 577 578 /* Fujitsu vector computers. 
*/ 579 #if defined (__uxp__) && W_TYPE_SIZE == 32 580 #define umul_ppmm(ph, pl, u, v) \ 581 do { \ 582 union {UDItype __ll; \ 583 struct {USItype __h, __l;} __i; \ 584 } __x; \ 585 __asm__ ("mult.lu %1,%2,%0" : "=r" (__x.__ll) : "%r" (u), "rK" (v));\ 586 (ph) = __x.__i.__h; \ 587 (pl) = __x.__i.__l; \ 588 } while (0) 589 #define smul_ppmm(ph, pl, u, v) \ 590 do { \ 591 union {UDItype __ll; \ 592 struct {USItype __h, __l;} __i; \ 593 } __x; \ 594 __asm__ ("mult.l %1,%2,%0" : "=r" (__x.__ll) : "%r" (u), "rK" (v)); \ 595 (ph) = __x.__i.__h; \ 596 (pl) = __x.__i.__l; \ 597 } while (0) 598 #endif 599 600 #if defined (__gmicro__) && W_TYPE_SIZE == 32 601 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ 602 __asm__ ("add.w %5,%1\n\taddx %3,%0" \ 603 : "=g" (sh), "=&g" (sl) \ 604 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \ 605 "%1" ((USItype)(al)), "g" ((USItype)(bl))) 606 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ 607 __asm__ ("sub.w %5,%1\n\tsubx %3,%0" \ 608 : "=g" (sh), "=&g" (sl) \ 609 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \ 610 "1" ((USItype)(al)), "g" ((USItype)(bl))) 611 #define umul_ppmm(ph, pl, m0, m1) \ 612 __asm__ ("mulx %3,%0,%1" \ 613 : "=g" (ph), "=r" (pl) \ 614 : "%0" ((USItype)(m0)), "g" ((USItype)(m1))) 615 #define udiv_qrnnd(q, r, nh, nl, d) \ 616 __asm__ ("divx %4,%0,%1" \ 617 : "=g" (q), "=r" (r) \ 618 : "1" ((USItype)(nh)), "0" ((USItype)(nl)), "g" ((USItype)(d))) 619 #define count_leading_zeros(count, x) \ 620 __asm__ ("bsch/1 %1,%0" \ 621 : "=g" (count) : "g" ((USItype)(x)), "0" ((USItype)0)) 622 #endif 623 624 #if defined (__hppa) && W_TYPE_SIZE == 32 625 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ 626 __asm__ ("add%I5 %5,%r4,%1\n\taddc %r2,%r3,%0" \ 627 : "=r" (sh), "=&r" (sl) \ 628 : "rM" (ah), "rM" (bh), "%rM" (al), "rI" (bl)) 629 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ 630 __asm__ ("sub%I4 %4,%r5,%1\n\tsubb %r2,%r3,%0" \ 631 : "=r" (sh), "=&r" (sl) \ 632 : "rM" (ah), "rM" (bh), "rI" (al), "rM" (bl)) 633 #if defined (_PA_RISC1_1) 634 #define umul_ppmm(wh, wl, u, v) \ 635 do { \ 636 union {UDItype __ll; \ 637 struct {USItype __h, __l;} __i; \ 638 } __x; \ 639 __asm__ ("xmpyu %1,%2,%0" : "=*f" (__x.__ll) : "*f" (u), "*f" (v)); \ 640 (wh) = __x.__i.__h; \ 641 (wl) = __x.__i.__l; \ 642 } while (0) 643 #define UMUL_TIME 8 644 #define UDIV_TIME 60 645 #else 646 #define UMUL_TIME 40 647 #define UDIV_TIME 80 648 #endif 649 #define count_leading_zeros(count, x) \ 650 do { \ 651 USItype __tmp; \ 652 __asm__ ( \ 653 "ldi 1,%0\n" \ 654 " extru,= %1,15,16,%%r0 ; Bits 31..16 zero?\n" \ 655 " extru,tr %1,15,16,%1 ; No. Shift down, skip add.\n" \ 656 " ldo 16(%0),%0 ; Yes. Perform add.\n" \ 657 " extru,= %1,23,8,%%r0 ; Bits 15..8 zero?\n" \ 658 " extru,tr %1,23,8,%1 ; No. Shift down, skip add.\n" \ 659 " ldo 8(%0),%0 ; Yes. Perform add.\n" \ 660 " extru,= %1,27,4,%%r0 ; Bits 7..4 zero?\n" \ 661 " extru,tr %1,27,4,%1 ; No. Shift down, skip add.\n" \ 662 " ldo 4(%0),%0 ; Yes. Perform add.\n" \ 663 " extru,= %1,29,2,%%r0 ; Bits 3..2 zero?\n" \ 664 " extru,tr %1,29,2,%1 ; No. Shift down, skip add.\n" \ 665 " ldo 2(%0),%0 ; Yes. Perform add.\n" \ 666 " extru %1,30,1,%1 ; Extract bit 1.\n" \ 667 " sub %0,%1,%0 ; Subtract it.\n" \ 668 : "=r" (count), "=r" (__tmp) : "1" (x)); \ 669 } while (0) 670 #endif /* hppa */ 671 672 /* These macros are for ABI=2.0w. In ABI=2.0n they can't be used, since GCC 673 (3.2) puts longlong into two adjacent 32-bit registers. Presumably this 674 is just a case of no direct support for 2.0n but treating it like 1.0. 
*/ 675 #if defined (__hppa) && W_TYPE_SIZE == 64 && ! defined (_LONG_LONG_LIMB) 676 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ 677 __asm__ ("add%I5 %5,%r4,%1\n\tadd,dc %r2,%r3,%0" \ 678 : "=r" (sh), "=&r" (sl) \ 679 : "rM" (ah), "rM" (bh), "%rM" (al), "rI" (bl)) 680 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ 681 __asm__ ("sub%I4 %4,%r5,%1\n\tsub,db %r2,%r3,%0" \ 682 : "=r" (sh), "=&r" (sl) \ 683 : "rM" (ah), "rM" (bh), "rI" (al), "rM" (bl)) 684 #endif /* hppa */ 685 686 #if (defined (__i370__) || defined (__s390__) || defined (__mvs__)) && W_TYPE_SIZE == 32 687 #if defined (__zarch__) || defined (HAVE_HOST_CPU_s390_zarch) 688 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ 689 do { \ 690 /* if (__builtin_constant_p (bl)) \ 691 __asm__ ("alfi\t%1,%o5\n\talcr\t%0,%3" \ 692 : "=r" (sh), "=&r" (sl) \ 693 : "0" (ah), "r" (bh), "%1" (al), "n" (bl) __CLOBBER_CC);\ 694 else \ 695 */ __asm__ ("alr\t%1,%5\n\talcr\t%0,%3" \ 696 : "=r" (sh), "=&r" (sl) \ 697 : "0" (ah), "r" (bh), "%1" (al), "r" (bl)__CLOBBER_CC); \ 698 } while (0) 699 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ 700 do { \ 701 /* if (__builtin_constant_p (bl)) \ 702 __asm__ ("slfi\t%1,%o5\n\tslbr\t%0,%3" \ 703 : "=r" (sh), "=&r" (sl) \ 704 : "0" (ah), "r" (bh), "1" (al), "n" (bl) __CLOBBER_CC); \ 705 else \ 706 */ __asm__ ("slr\t%1,%5\n\tslbr\t%0,%3" \ 707 : "=r" (sh), "=&r" (sl) \ 708 : "0" (ah), "r" (bh), "1" (al), "r" (bl) __CLOBBER_CC); \ 709 } while (0) 710 #if __GMP_GNUC_PREREQ (4,5) 711 #define umul_ppmm(xh, xl, m0, m1) \ 712 do { \ 713 union {UDItype __ll; \ 714 struct {USItype __h, __l;} __i; \ 715 } __x; \ 716 __x.__ll = (UDItype) (m0) * (UDItype) (m1); \ 717 (xh) = __x.__i.__h; (xl) = __x.__i.__l; \ 718 } while (0) 719 #else 720 #if 0 721 /* FIXME: this fails if gcc knows about the 64-bit registers. Use only 722 with a new enough processor pretending we have 32-bit registers. */ 723 #define umul_ppmm(xh, xl, m0, m1) \ 724 do { \ 725 union {UDItype __ll; \ 726 struct {USItype __h, __l;} __i; \ 727 } __x; \ 728 __asm__ ("mlr\t%0,%2" \ 729 : "=r" (__x.__ll) \ 730 : "%0" (m0), "r" (m1)); \ 731 (xh) = __x.__i.__h; (xl) = __x.__i.__l; \ 732 } while (0) 733 #else 734 #define umul_ppmm(xh, xl, m0, m1) \ 735 do { \ 736 /* When we have 64-bit regs and gcc is aware of that, we cannot simply use 737 DImode for the product, since that would be allocated to a single 64-bit 738 register, whereas mlr uses the low 32-bits of an even-odd register pair. 739 */ \ 740 register USItype __r0 __asm__ ("0"); \ 741 register USItype __r1 __asm__ ("1") = (m0); \ 742 __asm__ ("mlr\t%0,%3" \ 743 : "=r" (__r0), "=r" (__r1) \ 744 : "r" (__r1), "r" (m1)); \ 745 (xh) = __r0; (xl) = __r1; \ 746 } while (0) 747 #endif /* if 0 */ 748 #endif 749 #if 0 750 /* FIXME: this fails if gcc knows about the 64-bit registers. Use only 751 with a new enough processor pretending we have 32-bit registers. 
*/ 752 #define udiv_qrnnd(q, r, n1, n0, d) \ 753 do { \ 754 union {UDItype __ll; \ 755 struct {USItype __h, __l;} __i; \ 756 } __x; \ 757 __x.__i.__h = n1; __x.__i.__l = n0; \ 758 __asm__ ("dlr\t%0,%2" \ 759 : "=r" (__x.__ll) \ 760 : "0" (__x.__ll), "r" (d)); \ 761 (q) = __x.__i.__l; (r) = __x.__i.__h; \ 762 } while (0) 763 #else 764 #define udiv_qrnnd(q, r, n1, n0, d) \ 765 do { \ 766 register USItype __r0 __asm__ ("0") = (n1); \ 767 register USItype __r1 __asm__ ("1") = (n0); \ 768 __asm__ ("dlr\t%0,%4" \ 769 : "=r" (__r0), "=r" (__r1) \ 770 : "r" (__r0), "r" (__r1), "r" (d)); \ 771 (q) = __r1; (r) = __r0; \ 772 } while (0) 773 #endif /* if 0 */ 774 #else /* if __zarch__ */ 775 /* FIXME: this fails if gcc knows about the 64-bit registers. */ 776 #define smul_ppmm(xh, xl, m0, m1) \ 777 do { \ 778 union {DItype __ll; \ 779 struct {USItype __h, __l;} __i; \ 780 } __x; \ 781 __asm__ ("mr\t%0,%2" \ 782 : "=r" (__x.__ll) \ 783 : "%0" (m0), "r" (m1)); \ 784 (xh) = __x.__i.__h; (xl) = __x.__i.__l; \ 785 } while (0) 786 /* FIXME: this fails if gcc knows about the 64-bit registers. */ 787 #define sdiv_qrnnd(q, r, n1, n0, d) \ 788 do { \ 789 union {DItype __ll; \ 790 struct {USItype __h, __l;} __i; \ 791 } __x; \ 792 __x.__i.__h = n1; __x.__i.__l = n0; \ 793 __asm__ ("dr\t%0,%2" \ 794 : "=r" (__x.__ll) \ 795 : "0" (__x.__ll), "r" (d)); \ 796 (q) = __x.__i.__l; (r) = __x.__i.__h; \ 797 } while (0) 798 #endif /* if __zarch__ */ 799 #endif 800 801 #if defined (__s390x__) && W_TYPE_SIZE == 64 802 /* We need to cast operands with register constraints, otherwise their types 803 will be assumed to be SImode by gcc. For these machines, such operations 804 will insert a value into the low 32 bits, and leave the high 32 bits with 805 garbage. */ 806 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ 807 do { \ 808 __asm__ ("algr\t%1,%5\n\talcgr\t%0,%3" \ 809 : "=r" (sh), "=&r" (sl) \ 810 : "0" ((UDItype)(ah)), "r" ((UDItype)(bh)), \ 811 "%1" ((UDItype)(al)), "r" ((UDItype)(bl)) __CLOBBER_CC); \ 812 } while (0) 813 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ 814 do { \ 815 __asm__ ("slgr\t%1,%5\n\tslbgr\t%0,%3" \ 816 : "=r" (sh), "=&r" (sl) \ 817 : "0" ((UDItype)(ah)), "r" ((UDItype)(bh)), \ 818 "1" ((UDItype)(al)), "r" ((UDItype)(bl)) __CLOBBER_CC); \ 819 } while (0) 820 #define umul_ppmm(xh, xl, m0, m1) \ 821 do { \ 822 union {unsigned int __attribute__ ((mode(TI))) __ll; \ 823 struct {UDItype __h, __l;} __i; \ 824 } __x; \ 825 __asm__ ("mlgr\t%0,%2" \ 826 : "=r" (__x.__ll) \ 827 : "%0" ((UDItype)(m0)), "r" ((UDItype)(m1))); \ 828 (xh) = __x.__i.__h; (xl) = __x.__i.__l; \ 829 } while (0) 830 #define udiv_qrnnd(q, r, n1, n0, d) \ 831 do { \ 832 union {unsigned int __attribute__ ((mode(TI))) __ll; \ 833 struct {UDItype __h, __l;} __i; \ 834 } __x; \ 835 __x.__i.__h = n1; __x.__i.__l = n0; \ 836 __asm__ ("dlgr\t%0,%2" \ 837 : "=r" (__x.__ll) \ 838 : "0" (__x.__ll), "r" ((UDItype)(d))); \ 839 (q) = __x.__i.__l; (r) = __x.__i.__h; \ 840 } while (0) 841 #if 0 /* FIXME: Enable for z10 (?) 
*/ 842 #define count_leading_zeros(cnt, x) \ 843 do { \ 844 union {unsigned int __attribute__ ((mode(TI))) __ll; \ 845 struct {UDItype __h, __l;} __i; \ 846 } __clr_cnt; \ 847 __asm__ ("flogr\t%0,%1" \ 848 : "=r" (__clr_cnt.__ll) \ 849 : "r" (x) __CLOBBER_CC); \ 850 (cnt) = __clr_cnt.__i.__h; \ 851 } while (0) 852 #endif 853 #endif 854 855 #if (defined (__i386__) || defined (__i486__)) && W_TYPE_SIZE == 32 856 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ 857 __asm__ ("addl %5,%k1\n\tadcl %3,%k0" \ 858 : "=r" (sh), "=&r" (sl) \ 859 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \ 860 "%1" ((USItype)(al)), "g" ((USItype)(bl))) 861 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ 862 __asm__ ("subl %5,%k1\n\tsbbl %3,%k0" \ 863 : "=r" (sh), "=&r" (sl) \ 864 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \ 865 "1" ((USItype)(al)), "g" ((USItype)(bl))) 866 #define umul_ppmm(w1, w0, u, v) \ 867 __asm__ ("mull %3" \ 868 : "=a" (w0), "=d" (w1) \ 869 : "%0" ((USItype)(u)), "rm" ((USItype)(v))) 870 #define udiv_qrnnd(q, r, n1, n0, dx) /* d renamed to dx avoiding "=d" */\ 871 __asm__ ("divl %4" /* stringification in K&R C */ \ 872 : "=a" (q), "=d" (r) \ 873 : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "rm" ((USItype)(dx))) 874 875 #if HAVE_HOST_CPU_i586 || HAVE_HOST_CPU_pentium || HAVE_HOST_CPU_pentiummmx 876 /* Pentium bsrl takes between 10 and 72 cycles depending where the most 877 significant 1 bit is, hence the use of the following alternatives. bsfl 878 is slow too, between 18 and 42 depending where the least significant 1 879 bit is, so let the generic count_trailing_zeros below make use of the 880 count_leading_zeros here too. */ 881 882 #if HAVE_HOST_CPU_pentiummmx && ! defined (LONGLONG_STANDALONE) 883 /* The following should be a fixed 14 or 15 cycles, but possibly plus an L1 884 cache miss reading from __clz_tab. For P55 it's favoured over the float 885 below so as to avoid mixing MMX and x87, since the penalty for switching 886 between the two is about 100 cycles. 887 888 The asm block sets __shift to -3 if the high 24 bits are clear, -2 for 889 16, -1 for 8, or 0 otherwise. This could be written equivalently as 890 follows, but as of gcc 2.95.2 it results in conditional jumps. 891 892 __shift = -(__n < 0x1000000); 893 __shift -= (__n < 0x10000); 894 __shift -= (__n < 0x100); 895 896 The middle two sbbl and cmpl's pair, and with luck something gcc 897 generates might pair with the first cmpl and the last sbbl. The "32+1" 898 constant could be folded into __clz_tab[], but it doesn't seem worth 899 making a different table just for that. */ 900 901 #define count_leading_zeros(c,n) \ 902 do { \ 903 USItype __n = (n); \ 904 USItype __shift; \ 905 __asm__ ("cmpl $0x1000000, %1\n" \ 906 "sbbl %0, %0\n" \ 907 "cmpl $0x10000, %1\n" \ 908 "sbbl $0, %0\n" \ 909 "cmpl $0x100, %1\n" \ 910 "sbbl $0, %0\n" \ 911 : "=&r" (__shift) : "r" (__n)); \ 912 __shift = __shift*8 + 24 + 1; \ 913 (c) = 32 + 1 - __shift - __clz_tab[__n >> __shift]; \ 914 } while (0) 915 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB 916 #define COUNT_LEADING_ZEROS_0 31 /* n==0 indistinguishable from n==1 */ 917 918 #else /* ! pentiummmx || LONGLONG_STANDALONE */ 919 /* The following should be a fixed 14 cycles or so. Some scheduling 920 opportunities should be available between the float load/store too. This 921 sort of code is used in gcc 3 for __builtin_ffs (with "n&-n") and is 922 apparently suggested by the Intel optimizing manual (don't know exactly 923 where). 
gcc 2.95 or up will be best for this, so the "double" is 924 correctly aligned on the stack. */ 925 #define count_leading_zeros(c,n) \ 926 do { \ 927 union { \ 928 double d; \ 929 unsigned a[2]; \ 930 } __u; \ 931 ASSERT ((n) != 0); \ 932 __u.d = (UWtype) (n); \ 933 (c) = 0x3FF + 31 - (__u.a[1] >> 20); \ 934 } while (0) 935 #define COUNT_LEADING_ZEROS_0 (0x3FF + 31) 936 #endif /* pentiummx */ 937 938 #else /* ! pentium */ 939 940 #if __GMP_GNUC_PREREQ (3,4) /* using bsrl */ 941 #define count_leading_zeros(count,x) count_leading_zeros_gcc_clz(count,x) 942 #endif /* gcc clz */ 943 944 /* On P6, gcc prior to 3.0 generates a partial register stall for 945 __cbtmp^31, due to using "xorb $31" instead of "xorl $31", the former 946 being 1 code byte smaller. "31-__cbtmp" is a workaround, probably at the 947 cost of one extra instruction. Do this for "i386" too, since that means 948 generic x86. */ 949 #if ! defined (count_leading_zeros) && __GNUC__ < 3 \ 950 && (HAVE_HOST_CPU_i386 \ 951 || HAVE_HOST_CPU_i686 \ 952 || HAVE_HOST_CPU_pentiumpro \ 953 || HAVE_HOST_CPU_pentium2 \ 954 || HAVE_HOST_CPU_pentium3) 955 #define count_leading_zeros(count, x) \ 956 do { \ 957 USItype __cbtmp; \ 958 ASSERT ((x) != 0); \ 959 __asm__ ("bsrl %1,%0" : "=r" (__cbtmp) : "rm" ((USItype)(x))); \ 960 (count) = 31 - __cbtmp; \ 961 } while (0) 962 #endif /* gcc<3 asm bsrl */ 963 964 #ifndef count_leading_zeros 965 #define count_leading_zeros(count, x) \ 966 do { \ 967 USItype __cbtmp; \ 968 ASSERT ((x) != 0); \ 969 __asm__ ("bsrl %1,%0" : "=r" (__cbtmp) : "rm" ((USItype)(x))); \ 970 (count) = __cbtmp ^ 31; \ 971 } while (0) 972 #endif /* asm bsrl */ 973 974 #if __GMP_GNUC_PREREQ (3,4) /* using bsfl */ 975 #define count_trailing_zeros(count,x) count_trailing_zeros_gcc_ctz(count,x) 976 #endif /* gcc ctz */ 977 978 #ifndef count_trailing_zeros 979 #define count_trailing_zeros(count, x) \ 980 do { \ 981 ASSERT ((x) != 0); \ 982 __asm__ ("bsfl %1,%k0" : "=r" (count) : "rm" ((USItype)(x))); \ 983 } while (0) 984 #endif /* asm bsfl */ 985 986 #endif /* ! pentium */ 987 988 #ifndef UMUL_TIME 989 #define UMUL_TIME 10 990 #endif 991 #ifndef UDIV_TIME 992 #define UDIV_TIME 40 993 #endif 994 #endif /* 80x86 */ 995 996 #if defined (__amd64__) && W_TYPE_SIZE == 64 997 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ 998 __asm__ ("addq %5,%q1\n\tadcq %3,%q0" \ 999 : "=r" (sh), "=&r" (sl) \ 1000 : "0" ((UDItype)(ah)), "rme" ((UDItype)(bh)), \ 1001 "%1" ((UDItype)(al)), "rme" ((UDItype)(bl))) 1002 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ 1003 __asm__ ("subq %5,%q1\n\tsbbq %3,%q0" \ 1004 : "=r" (sh), "=&r" (sl) \ 1005 : "0" ((UDItype)(ah)), "rme" ((UDItype)(bh)), \ 1006 "1" ((UDItype)(al)), "rme" ((UDItype)(bl))) 1007 #define umul_ppmm(w1, w0, u, v) \ 1008 __asm__ ("mulq %3" \ 1009 : "=a" (w0), "=d" (w1) \ 1010 : "%0" ((UDItype)(u)), "rm" ((UDItype)(v))) 1011 #define udiv_qrnnd(q, r, n1, n0, dx) /* d renamed to dx avoiding "=d" */\ 1012 __asm__ ("divq %4" /* stringification in K&R C */ \ 1013 : "=a" (q), "=d" (r) \ 1014 : "0" ((UDItype)(n0)), "1" ((UDItype)(n1)), "rm" ((UDItype)(dx))) 1015 /* bsrq destination must be a 64-bit register, hence UDItype for __cbtmp. */ 1016 #define count_leading_zeros(count, x) \ 1017 do { \ 1018 UDItype __cbtmp; \ 1019 ASSERT ((x) != 0); \ 1020 __asm__ ("bsrq %1,%0" : "=r" (__cbtmp) : "rm" ((UDItype)(x))); \ 1021 (count) = __cbtmp ^ 63; \ 1022 } while (0) 1023 /* bsfq destination must be a 64-bit register, "%q0" forces this in case 1024 count is only an int. 
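
   For example, bsfq on x = 0x28 (binary 101000) yields 3.  With a zero
   source the result register is architecturally undefined (the instruction
   just sets ZF), which is why the ASSERT below insists on x != 0.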
*/ 1025 #define count_trailing_zeros(count, x) \ 1026 do { \ 1027 ASSERT ((x) != 0); \ 1028 __asm__ ("bsfq %1,%q0" : "=r" (count) : "rm" ((UDItype)(x))); \ 1029 } while (0) 1030 #endif /* x86_64 */ 1031 1032 #if defined (__i860__) && W_TYPE_SIZE == 32 1033 #define rshift_rhlc(r,h,l,c) \ 1034 __asm__ ("shr %3,r0,r0\;shrd %1,%2,%0" \ 1035 "=r" (r) : "r" (h), "r" (l), "rn" (c)) 1036 #endif /* i860 */ 1037 1038 #if defined (__i960__) && W_TYPE_SIZE == 32 1039 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ 1040 __asm__ ("cmpo 1,0\;addc %5,%4,%1\;addc %3,%2,%0" \ 1041 : "=r" (sh), "=&r" (sl) \ 1042 : "dI" (ah), "dI" (bh), "%dI" (al), "dI" (bl)) 1043 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ 1044 __asm__ ("cmpo 0,0\;subc %5,%4,%1\;subc %3,%2,%0" \ 1045 : "=r" (sh), "=&r" (sl) \ 1046 : "dI" (ah), "dI" (bh), "dI" (al), "dI" (bl)) 1047 #define umul_ppmm(w1, w0, u, v) \ 1048 ({union {UDItype __ll; \ 1049 struct {USItype __l, __h;} __i; \ 1050 } __x; \ 1051 __asm__ ("emul %2,%1,%0" \ 1052 : "=d" (__x.__ll) : "%dI" (u), "dI" (v)); \ 1053 (w1) = __x.__i.__h; (w0) = __x.__i.__l;}) 1054 #define __umulsidi3(u, v) \ 1055 ({UDItype __w; \ 1056 __asm__ ("emul %2,%1,%0" : "=d" (__w) : "%dI" (u), "dI" (v)); \ 1057 __w; }) 1058 #define udiv_qrnnd(q, r, nh, nl, d) \ 1059 do { \ 1060 union {UDItype __ll; \ 1061 struct {USItype __l, __h;} __i; \ 1062 } __nn; \ 1063 __nn.__i.__h = (nh); __nn.__i.__l = (nl); \ 1064 __asm__ ("ediv %d,%n,%0" \ 1065 : "=d" (__rq.__ll) : "dI" (__nn.__ll), "dI" (d)); \ 1066 (r) = __rq.__i.__l; (q) = __rq.__i.__h; \ 1067 } while (0) 1068 #define count_leading_zeros(count, x) \ 1069 do { \ 1070 USItype __cbtmp; \ 1071 __asm__ ("scanbit %1,%0" : "=r" (__cbtmp) : "r" (x)); \ 1072 (count) = __cbtmp ^ 31; \ 1073 } while (0) 1074 #define COUNT_LEADING_ZEROS_0 (-32) /* sic */ 1075 #if defined (__i960mx) /* what is the proper symbol to test??? */ 1076 #define rshift_rhlc(r,h,l,c) \ 1077 do { \ 1078 union {UDItype __ll; \ 1079 struct {USItype __l, __h;} __i; \ 1080 } __nn; \ 1081 __nn.__i.__h = (h); __nn.__i.__l = (l); \ 1082 __asm__ ("shre %2,%1,%0" : "=d" (r) : "dI" (__nn.__ll), "dI" (c)); \ 1083 } 1084 #endif /* i960mx */ 1085 #endif /* i960 */ 1086 1087 #if (defined (__mc68000__) || defined (__mc68020__) || defined(mc68020) \ 1088 || defined (__m68k__) || defined (__mc5200__) || defined (__mc5206e__) \ 1089 || defined (__mc5307__)) && W_TYPE_SIZE == 32 1090 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ 1091 __asm__ ("add%.l %5,%1\n\taddx%.l %3,%0" \ 1092 : "=d" (sh), "=&d" (sl) \ 1093 : "0" ((USItype)(ah)), "d" ((USItype)(bh)), \ 1094 "%1" ((USItype)(al)), "g" ((USItype)(bl))) 1095 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ 1096 __asm__ ("sub%.l %5,%1\n\tsubx%.l %3,%0" \ 1097 : "=d" (sh), "=&d" (sl) \ 1098 : "0" ((USItype)(ah)), "d" ((USItype)(bh)), \ 1099 "1" ((USItype)(al)), "g" ((USItype)(bl))) 1100 /* The '020, '030, '040 and CPU32 have 32x32->64 and 64/32->32q-32r. 
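
   In C terms the mulu.l form below computes (w1:w0) = (UDItype) u * v, and
   divu.l divides the 64-bit value n1:n0 by the 32-bit d, giving a 32-bit
   quotient and remainder.  The 16x16->32 fallback further down implements
   the usual schoolbook scheme in asm; purely as an illustration of what it
   computes, the equivalent portable C would read roughly

     USItype __ul = (a) & 0xffff, __uh = (a) >> 16;
     USItype __vl = (b) & 0xffff, __vh = (b) >> 16;
     USItype __x0 = __ul * __vl;
     USItype __x1 = __ul * __vh;
     USItype __x2 = __uh * __vl;
     USItype __x3 = __uh * __vh;
     __x1 += __x0 >> 16;
     __x1 += __x2;
     if (__x1 < __x2)
       __x3 += (USItype) 1 << 16;
     (xh) = __x3 + (__x1 >> 16);
     (xl) = (__x1 << 16) + (__x0 & 0xffff);

   The first addition into __x1 cannot overflow; the second can, and the
   conditional adds the lost 2^16 back into the high word.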
*/ 1101 #if defined (__mc68020__) || defined(mc68020) \ 1102 || defined (__mc68030__) || defined (mc68030) \ 1103 || defined (__mc68040__) || defined (mc68040) \ 1104 || defined (__mcpu32__) || defined (mcpu32) \ 1105 || defined (__NeXT__) 1106 #define umul_ppmm(w1, w0, u, v) \ 1107 __asm__ ("mulu%.l %3,%1:%0" \ 1108 : "=d" (w0), "=d" (w1) \ 1109 : "%0" ((USItype)(u)), "dmi" ((USItype)(v))) 1110 #define UMUL_TIME 45 1111 #define udiv_qrnnd(q, r, n1, n0, d) \ 1112 __asm__ ("divu%.l %4,%1:%0" \ 1113 : "=d" (q), "=d" (r) \ 1114 : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "dmi" ((USItype)(d))) 1115 #define UDIV_TIME 90 1116 #define sdiv_qrnnd(q, r, n1, n0, d) \ 1117 __asm__ ("divs%.l %4,%1:%0" \ 1118 : "=d" (q), "=d" (r) \ 1119 : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "dmi" ((USItype)(d))) 1120 #else /* for other 68k family members use 16x16->32 multiplication */ 1121 #define umul_ppmm(xh, xl, a, b) \ 1122 do { USItype __umul_tmp1, __umul_tmp2; \ 1123 __asm__ ("| Inlined umul_ppmm\n" \ 1124 " move%.l %5,%3\n" \ 1125 " move%.l %2,%0\n" \ 1126 " move%.w %3,%1\n" \ 1127 " swap %3\n" \ 1128 " swap %0\n" \ 1129 " mulu%.w %2,%1\n" \ 1130 " mulu%.w %3,%0\n" \ 1131 " mulu%.w %2,%3\n" \ 1132 " swap %2\n" \ 1133 " mulu%.w %5,%2\n" \ 1134 " add%.l %3,%2\n" \ 1135 " jcc 1f\n" \ 1136 " add%.l %#0x10000,%0\n" \ 1137 "1: move%.l %2,%3\n" \ 1138 " clr%.w %2\n" \ 1139 " swap %2\n" \ 1140 " swap %3\n" \ 1141 " clr%.w %3\n" \ 1142 " add%.l %3,%1\n" \ 1143 " addx%.l %2,%0\n" \ 1144 " | End inlined umul_ppmm" \ 1145 : "=&d" (xh), "=&d" (xl), \ 1146 "=d" (__umul_tmp1), "=&d" (__umul_tmp2) \ 1147 : "%2" ((USItype)(a)), "d" ((USItype)(b))); \ 1148 } while (0) 1149 #define UMUL_TIME 100 1150 #define UDIV_TIME 400 1151 #endif /* not mc68020 */ 1152 /* The '020, '030, '040 and '060 have bitfield insns. 1153 GCC 3.4 defines __mc68020__ when in CPU32 mode, check for __mcpu32__ to 1154 exclude bfffo on that chip (bitfield insns not available). */ 1155 #if (defined (__mc68020__) || defined (mc68020) \ 1156 || defined (__mc68030__) || defined (mc68030) \ 1157 || defined (__mc68040__) || defined (mc68040) \ 1158 || defined (__mc68060__) || defined (mc68060) \ 1159 || defined (__NeXT__)) \ 1160 && ! 
defined (__mcpu32__) 1161 #define count_leading_zeros(count, x) \ 1162 __asm__ ("bfffo %1{%b2:%b2},%0" \ 1163 : "=d" (count) \ 1164 : "od" ((USItype) (x)), "n" (0)) 1165 #define COUNT_LEADING_ZEROS_0 32 1166 #endif 1167 #endif /* mc68000 */ 1168 1169 #if defined (__m88000__) && W_TYPE_SIZE == 32 1170 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ 1171 __asm__ ("addu.co %1,%r4,%r5\n\taddu.ci %0,%r2,%r3" \ 1172 : "=r" (sh), "=&r" (sl) \ 1173 : "rJ" (ah), "rJ" (bh), "%rJ" (al), "rJ" (bl)) 1174 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ 1175 __asm__ ("subu.co %1,%r4,%r5\n\tsubu.ci %0,%r2,%r3" \ 1176 : "=r" (sh), "=&r" (sl) \ 1177 : "rJ" (ah), "rJ" (bh), "rJ" (al), "rJ" (bl)) 1178 #define count_leading_zeros(count, x) \ 1179 do { \ 1180 USItype __cbtmp; \ 1181 __asm__ ("ff1 %0,%1" : "=r" (__cbtmp) : "r" (x)); \ 1182 (count) = __cbtmp ^ 31; \ 1183 } while (0) 1184 #define COUNT_LEADING_ZEROS_0 63 /* sic */ 1185 #if defined (__m88110__) 1186 #define umul_ppmm(wh, wl, u, v) \ 1187 do { \ 1188 union {UDItype __ll; \ 1189 struct {USItype __h, __l;} __i; \ 1190 } __x; \ 1191 __asm__ ("mulu.d %0,%1,%2" : "=r" (__x.__ll) : "r" (u), "r" (v)); \ 1192 (wh) = __x.__i.__h; \ 1193 (wl) = __x.__i.__l; \ 1194 } while (0) 1195 #define udiv_qrnnd(q, r, n1, n0, d) \ 1196 ({union {UDItype __ll; \ 1197 struct {USItype __h, __l;} __i; \ 1198 } __x, __q; \ 1199 __x.__i.__h = (n1); __x.__i.__l = (n0); \ 1200 __asm__ ("divu.d %0,%1,%2" \ 1201 : "=r" (__q.__ll) : "r" (__x.__ll), "r" (d)); \ 1202 (r) = (n0) - __q.__l * (d); (q) = __q.__l; }) 1203 #define UMUL_TIME 5 1204 #define UDIV_TIME 25 1205 #else 1206 #define UMUL_TIME 17 1207 #define UDIV_TIME 150 1208 #endif /* __m88110__ */ 1209 #endif /* __m88000__ */ 1210 1211 #if defined (__mips) && W_TYPE_SIZE == 32 1212 #if __GMP_GNUC_PREREQ (4,4) || defined(__clang__) 1213 #define umul_ppmm(w1, w0, u, v) \ 1214 do { \ 1215 UDItype __ll = (UDItype)(u) * (v); \ 1216 w1 = __ll >> 32; \ 1217 w0 = __ll; \ 1218 } while (0) 1219 #endif 1220 #if !defined (umul_ppmm) && __GMP_GNUC_PREREQ (2,7) 1221 #define umul_ppmm(w1, w0, u, v) \ 1222 __asm__ ("multu %2,%3" : "=l" (w0), "=h" (w1) : "d" (u), "d" (v)) 1223 #endif 1224 #if !defined (umul_ppmm) 1225 #define umul_ppmm(w1, w0, u, v) \ 1226 __asm__ ("multu %2,%3\n\tmflo %0\n\tmfhi %1" \ 1227 : "=d" (w0), "=d" (w1) : "d" (u), "d" (v)) 1228 #endif 1229 #define UMUL_TIME 10 1230 #define UDIV_TIME 100 1231 #endif /* __mips */ 1232 1233 #if (defined (__mips) && __mips >= 3) && W_TYPE_SIZE == 64 1234 #if __GMP_GNUC_PREREQ (4,4) || defined(__clang__) 1235 #define umul_ppmm(w1, w0, u, v) \ 1236 do { \ 1237 typedef unsigned int __ll_UTItype __attribute__((mode(TI))); \ 1238 __ll_UTItype __ll = (__ll_UTItype)(u) * (v); \ 1239 w1 = __ll >> 64; \ 1240 w0 = __ll; \ 1241 } while (0) 1242 #endif 1243 #if !defined (umul_ppmm) && __GMP_GNUC_PREREQ (2,7) 1244 #define umul_ppmm(w1, w0, u, v) \ 1245 __asm__ ("dmultu %2,%3" : "=l" (w0), "=h" (w1) : "d" (u), "d" (v)) 1246 #endif 1247 #if !defined (umul_ppmm) 1248 #define umul_ppmm(w1, w0, u, v) \ 1249 __asm__ ("dmultu %2,%3\n\tmflo %0\n\tmfhi %1" \ 1250 : "=d" (w0), "=d" (w1) : "d" (u), "d" (v)) 1251 #endif 1252 #define UMUL_TIME 20 1253 #define UDIV_TIME 140 1254 #endif /* __mips */ 1255 1256 #if defined (__mmix__) && W_TYPE_SIZE == 64 1257 #define umul_ppmm(w1, w0, u, v) \ 1258 __asm__ ("MULU %0,%2,%3" : "=r" (w0), "=z" (w1) : "r" (u), "r" (v)) 1259 #endif 1260 1261 #if defined (__ns32000__) && W_TYPE_SIZE == 32 1262 #define umul_ppmm(w1, w0, u, v) \ 1263 ({union {UDItype __ll; \ 1264 struct {USItype __l, 
__h;} __i; \ 1265 } __x; \ 1266 __asm__ ("meid %2,%0" \ 1267 : "=g" (__x.__ll) \ 1268 : "%0" ((USItype)(u)), "g" ((USItype)(v))); \ 1269 (w1) = __x.__i.__h; (w0) = __x.__i.__l;}) 1270 #define __umulsidi3(u, v) \ 1271 ({UDItype __w; \ 1272 __asm__ ("meid %2,%0" \ 1273 : "=g" (__w) \ 1274 : "%0" ((USItype)(u)), "g" ((USItype)(v))); \ 1275 __w; }) 1276 #define udiv_qrnnd(q, r, n1, n0, d) \ 1277 ({union {UDItype __ll; \ 1278 struct {USItype __l, __h;} __i; \ 1279 } __x; \ 1280 __x.__i.__h = (n1); __x.__i.__l = (n0); \ 1281 __asm__ ("deid %2,%0" \ 1282 : "=g" (__x.__ll) \ 1283 : "0" (__x.__ll), "g" ((USItype)(d))); \ 1284 (r) = __x.__i.__l; (q) = __x.__i.__h; }) 1285 #define count_trailing_zeros(count,x) \ 1286 do { \ 1287 __asm__ ("ffsd %2,%0" \ 1288 : "=r" (count) \ 1289 : "0" ((USItype) 0), "r" ((USItype) (x))); \ 1290 } while (0) 1291 #endif /* __ns32000__ */ 1292 1293 /* In the past we had a block of various #defines tested 1294 _ARCH_PPC - AIX 1295 _ARCH_PWR - AIX 1296 __powerpc__ - gcc 1297 __POWERPC__ - BEOS 1298 __ppc__ - Darwin 1299 PPC - old gcc, GNU/Linux, SysV 1300 The plain PPC test was not good for vxWorks, since PPC is defined on all 1301 CPUs there (eg. m68k too), as a constant one is expected to compare 1302 CPU_FAMILY against. 1303 1304 At any rate, this was pretty unattractive and a bit fragile. The use of 1305 HAVE_HOST_CPU_FAMILY is designed to cut through it all and be sure of 1306 getting the desired effect. 1307 1308 ENHANCE-ME: We should test _IBMR2 here when we add assembly support for 1309 the system vendor compilers. (Is that vendor compilers with inline asm, 1310 or what?) */ 1311 1312 #if (HAVE_HOST_CPU_FAMILY_power || HAVE_HOST_CPU_FAMILY_powerpc) \ 1313 && W_TYPE_SIZE == 32 1314 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ 1315 do { \ 1316 if (__builtin_constant_p (bh) && (bh) == 0) \ 1317 __asm__ ("add%I4c %1,%3,%4\n\taddze %0,%2" \ 1318 : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\ 1319 else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0) \ 1320 __asm__ ("add%I4c %1,%3,%4\n\taddme %0,%2" \ 1321 : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\ 1322 else \ 1323 __asm__ ("add%I5c %1,%4,%5\n\tadde %0,%2,%3" \ 1324 : "=r" (sh), "=&r" (sl) \ 1325 : "r" (ah), "r" (bh), "%r" (al), "rI" (bl)); \ 1326 } while (0) 1327 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ 1328 do { \ 1329 if (__builtin_constant_p (ah) && (ah) == 0) \ 1330 __asm__ ("subf%I3c %1,%4,%3\n\tsubfze %0,%2" \ 1331 : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\ 1332 else if (__builtin_constant_p (ah) && (ah) == ~(USItype) 0) \ 1333 __asm__ ("subf%I3c %1,%4,%3\n\tsubfme %0,%2" \ 1334 : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\ 1335 else if (__builtin_constant_p (bh) && (bh) == 0) \ 1336 __asm__ ("subf%I3c %1,%4,%3\n\taddme %0,%2" \ 1337 : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\ 1338 else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0) \ 1339 __asm__ ("subf%I3c %1,%4,%3\n\taddze %0,%2" \ 1340 : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\ 1341 else \ 1342 __asm__ ("subf%I4c %1,%5,%4\n\tsubfe %0,%3,%2" \ 1343 : "=r" (sh), "=&r" (sl) \ 1344 : "r" (ah), "r" (bh), "rI" (al), "r" (bl)); \ 1345 } while (0) 1346 #define count_leading_zeros(count, x) \ 1347 __asm__ ("cntlzw %0,%1" : "=r" (count) : "r" (x)) 1348 #define COUNT_LEADING_ZEROS_0 32 1349 #if HAVE_HOST_CPU_FAMILY_powerpc 1350 #if __GMP_GNUC_PREREQ (4,4) || defined(__clang__) 1351 #define umul_ppmm(w1, w0, u, v) \ 1352 do { \ 1353 UDItype __ll = (UDItype)(u) * 
(v); \ 1354 w1 = __ll >> 32; \ 1355 w0 = __ll; \ 1356 } while (0) 1357 #endif 1358 #if !defined (umul_ppmm) 1359 #define umul_ppmm(ph, pl, m0, m1) \ 1360 do { \ 1361 USItype __m0 = (m0), __m1 = (m1); \ 1362 __asm__ ("mulhwu %0,%1,%2" : "=r" (ph) : "%r" (m0), "r" (m1)); \ 1363 (pl) = __m0 * __m1; \ 1364 } while (0) 1365 #endif 1366 #define UMUL_TIME 15 1367 #define smul_ppmm(ph, pl, m0, m1) \ 1368 do { \ 1369 SItype __m0 = (m0), __m1 = (m1); \ 1370 __asm__ ("mulhw %0,%1,%2" : "=r" (ph) : "%r" (m0), "r" (m1)); \ 1371 (pl) = __m0 * __m1; \ 1372 } while (0) 1373 #define SMUL_TIME 14 1374 #define UDIV_TIME 120 1375 #else 1376 #define UMUL_TIME 8 1377 #define smul_ppmm(xh, xl, m0, m1) \ 1378 __asm__ ("mul %0,%2,%3" : "=r" (xh), "=q" (xl) : "r" (m0), "r" (m1)) 1379 #define SMUL_TIME 4 1380 #define sdiv_qrnnd(q, r, nh, nl, d) \ 1381 __asm__ ("div %0,%2,%4" : "=r" (q), "=q" (r) : "r" (nh), "1" (nl), "r" (d)) 1382 #define UDIV_TIME 100 1383 #endif 1384 #endif /* 32-bit POWER architecture variants. */ 1385 1386 /* We should test _IBMR2 here when we add assembly support for the system 1387 vendor compilers. */ 1388 #if HAVE_HOST_CPU_FAMILY_powerpc && W_TYPE_SIZE == 64 1389 #if !defined (_LONG_LONG_LIMB) 1390 /* _LONG_LONG_LIMB is ABI=mode32 where adde operates on 32-bit values. So 1391 use adde etc only when not _LONG_LONG_LIMB. */ 1392 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ 1393 do { \ 1394 if (__builtin_constant_p (bh) && (bh) == 0) \ 1395 __asm__ ("add%I4c %1,%3,%4\n\taddze %0,%2" \ 1396 : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\ 1397 else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0) \ 1398 __asm__ ("add%I4c %1,%3,%4\n\taddme %0,%2" \ 1399 : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\ 1400 else \ 1401 __asm__ ("add%I5c %1,%4,%5\n\tadde %0,%2,%3" \ 1402 : "=r" (sh), "=&r" (sl) \ 1403 : "r" (ah), "r" (bh), "%r" (al), "rI" (bl)); \ 1404 } while (0) 1405 /* We use "*rI" for the constant operand here, since with just "I", gcc barfs. 1406 This might seem strange, but gcc folds away the dead code late. 
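
   All of these asm variants compute the same thing as the plain C
   fallback, namely (sh:sl) = (ah:al) - (bh:bl) with the borrow propagated
   but not returned, i.e. roughly

     (sl) = (al) - (bl);
     (sh) = (ah) - (bh) - ((al) < (bl));

   The many special cases below exist only to pick the carry/borrow
   instruction sequence that best matches which operands are compile-time
   constants.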
*/ 1407 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ 1408 do { \ 1409 if (__builtin_constant_p (bl) && bl > -0x8000 && bl <= 0x8000) { \ 1410 if (__builtin_constant_p (ah) && (ah) == 0) \ 1411 __asm__ ("addic %1,%3,%4\n\tsubfze %0,%2" \ 1412 : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "*rI" (-bl)); \ 1413 else if (__builtin_constant_p (ah) && (ah) == ~(UDItype) 0) \ 1414 __asm__ ("addic %1,%3,%4\n\tsubfme %0,%2" \ 1415 : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "*rI" (-bl)); \ 1416 else if (__builtin_constant_p (bh) && (bh) == 0) \ 1417 __asm__ ("addic %1,%3,%4\n\taddme %0,%2" \ 1418 : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "*rI" (-bl)); \ 1419 else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0) \ 1420 __asm__ ("addic %1,%3,%4\n\taddze %0,%2" \ 1421 : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "*rI" (-bl)); \ 1422 else \ 1423 __asm__ ("addic %1,%4,%5\n\tsubfe %0,%3,%2" \ 1424 : "=r" (sh), "=&r" (sl) \ 1425 : "r" (ah), "r" (bh), "rI" (al), "*rI" (-bl)); \ 1426 } else { \ 1427 if (__builtin_constant_p (ah) && (ah) == 0) \ 1428 __asm__ ("subf%I3c %1,%4,%3\n\tsubfze %0,%2" \ 1429 : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl)); \ 1430 else if (__builtin_constant_p (ah) && (ah) == ~(UDItype) 0) \ 1431 __asm__ ("subf%I3c %1,%4,%3\n\tsubfme %0,%2" \ 1432 : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl)); \ 1433 else if (__builtin_constant_p (bh) && (bh) == 0) \ 1434 __asm__ ("subf%I3c %1,%4,%3\n\taddme %0,%2" \ 1435 : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl)); \ 1436 else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0) \ 1437 __asm__ ("subf%I3c %1,%4,%3\n\taddze %0,%2" \ 1438 : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl)); \ 1439 else \ 1440 __asm__ ("subf%I4c %1,%5,%4\n\tsubfe %0,%3,%2" \ 1441 : "=r" (sh), "=&r" (sl) \ 1442 : "r" (ah), "r" (bh), "rI" (al), "r" (bl)); \ 1443 } \ 1444 } while (0) 1445 #endif /* ! _LONG_LONG_LIMB */ 1446 #define count_leading_zeros(count, x) \ 1447 __asm__ ("cntlzd %0,%1" : "=r" (count) : "r" (x)) 1448 #define COUNT_LEADING_ZEROS_0 64 1449 #if 0 && __GMP_GNUC_PREREQ (4,4) /* Disable, this results in libcalls! */ 1450 #define umul_ppmm(w1, w0, u, v) \ 1451 do { \ 1452 typedef unsigned int __ll_UTItype __attribute__((mode(TI))); \ 1453 __ll_UTItype __ll = (__ll_UTItype)(u) * (v); \ 1454 w1 = __ll >> 64; \ 1455 w0 = __ll; \ 1456 } while (0) 1457 #endif 1458 #if !defined (umul_ppmm) 1459 #define umul_ppmm(ph, pl, m0, m1) \ 1460 do { \ 1461 UDItype __m0 = (m0), __m1 = (m1); \ 1462 __asm__ ("mulhdu %0,%1,%2" : "=r" (ph) : "%r" (m0), "r" (m1)); \ 1463 (pl) = __m0 * __m1; \ 1464 } while (0) 1465 #endif 1466 #define UMUL_TIME 15 1467 #define smul_ppmm(ph, pl, m0, m1) \ 1468 do { \ 1469 DItype __m0 = (m0), __m1 = (m1); \ 1470 __asm__ ("mulhd %0,%1,%2" : "=r" (ph) : "%r" (m0), "r" (m1)); \ 1471 (pl) = __m0 * __m1; \ 1472 } while (0) 1473 #define SMUL_TIME 14 /* ??? */ 1474 #define UDIV_TIME 120 /* ??? */ 1475 #endif /* 64-bit PowerPC. */ 1476 1477 #if defined (__pyr__) && W_TYPE_SIZE == 32 1478 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ 1479 __asm__ ("addw %5,%1\n\taddwc %3,%0" \ 1480 : "=r" (sh), "=&r" (sl) \ 1481 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \ 1482 "%1" ((USItype)(al)), "g" ((USItype)(bl))) 1483 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ 1484 __asm__ ("subw %5,%1\n\tsubwb %3,%0" \ 1485 : "=r" (sh), "=&r" (sl) \ 1486 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \ 1487 "1" ((USItype)(al)), "g" ((USItype)(bl))) 1488 /* This insn works on Pyramids with AP, XP, or MI CPUs, but not with SP. 
*/ 1489 #define umul_ppmm(w1, w0, u, v) \ 1490 ({union {UDItype __ll; \ 1491 struct {USItype __h, __l;} __i; \ 1492 } __x; \ 1493 __asm__ ("movw %1,%R0\n\tuemul %2,%0" \ 1494 : "=&r" (__x.__ll) \ 1495 : "g" ((USItype) (u)), "g" ((USItype)(v))); \ 1496 (w1) = __x.__i.__h; (w0) = __x.__i.__l;}) 1497 #endif /* __pyr__ */ 1498 1499 #if defined (__ibm032__) /* RT/ROMP */ && W_TYPE_SIZE == 32 1500 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ 1501 __asm__ ("a %1,%5\n\tae %0,%3" \ 1502 : "=r" (sh), "=&r" (sl) \ 1503 : "0" ((USItype)(ah)), "r" ((USItype)(bh)), \ 1504 "%1" ((USItype)(al)), "r" ((USItype)(bl))) 1505 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ 1506 __asm__ ("s %1,%5\n\tse %0,%3" \ 1507 : "=r" (sh), "=&r" (sl) \ 1508 : "0" ((USItype)(ah)), "r" ((USItype)(bh)), \ 1509 "1" ((USItype)(al)), "r" ((USItype)(bl))) 1510 #define smul_ppmm(ph, pl, m0, m1) \ 1511 __asm__ ( \ 1512 "s r2,r2\n" \ 1513 " mts r10,%2\n" \ 1514 " m r2,%3\n" \ 1515 " m r2,%3\n" \ 1516 " m r2,%3\n" \ 1517 " m r2,%3\n" \ 1518 " m r2,%3\n" \ 1519 " m r2,%3\n" \ 1520 " m r2,%3\n" \ 1521 " m r2,%3\n" \ 1522 " m r2,%3\n" \ 1523 " m r2,%3\n" \ 1524 " m r2,%3\n" \ 1525 " m r2,%3\n" \ 1526 " m r2,%3\n" \ 1527 " m r2,%3\n" \ 1528 " m r2,%3\n" \ 1529 " m r2,%3\n" \ 1530 " cas %0,r2,r0\n" \ 1531 " mfs r10,%1" \ 1532 : "=r" (ph), "=r" (pl) \ 1533 : "%r" ((USItype)(m0)), "r" ((USItype)(m1)) \ 1534 : "r2") 1535 #define UMUL_TIME 20 1536 #define UDIV_TIME 200 1537 #define count_leading_zeros(count, x) \ 1538 do { \ 1539 if ((x) >= 0x10000) \ 1540 __asm__ ("clz %0,%1" \ 1541 : "=r" (count) : "r" ((USItype)(x) >> 16)); \ 1542 else \ 1543 { \ 1544 __asm__ ("clz %0,%1" \ 1545 : "=r" (count) : "r" ((USItype)(x))); \ 1546 (count) += 16; \ 1547 } \ 1548 } while (0) 1549 #endif /* RT/ROMP */ 1550 1551 #if (defined (__SH2__) || defined (__SH3__) || defined (__SH4__)) && W_TYPE_SIZE == 32 1552 #define umul_ppmm(w1, w0, u, v) \ 1553 __asm__ ("dmulu.l %2,%3\n\tsts macl,%1\n\tsts mach,%0" \ 1554 : "=r" (w1), "=r" (w0) : "r" (u), "r" (v) : "macl", "mach") 1555 #define UMUL_TIME 5 1556 #endif 1557 1558 #if defined (__sparc__) && W_TYPE_SIZE == 32 1559 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ 1560 __asm__ ("addcc %r4,%5,%1\n\taddx %r2,%3,%0" \ 1561 : "=r" (sh), "=&r" (sl) \ 1562 : "rJ" (ah), "rI" (bh),"%rJ" (al), "rI" (bl) \ 1563 __CLOBBER_CC) 1564 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ 1565 __asm__ ("subcc %r4,%5,%1\n\tsubx %r2,%3,%0" \ 1566 : "=r" (sh), "=&r" (sl) \ 1567 : "rJ" (ah), "rI" (bh), "rJ" (al), "rI" (bl) \ 1568 __CLOBBER_CC) 1569 /* FIXME: When gcc -mcpu=v9 is used on solaris, gcc/config/sol2-sld-64.h 1570 doesn't define anything to indicate that to us, it only sets __sparcv8. */ 1571 #if defined (__sparc_v9__) || defined (__sparcv9) 1572 /* Perhaps we should use floating-point operations here? */ 1573 #if 0 1574 /* Triggers a bug making mpz/tests/t-gcd.c fail. 1575 Perhaps we simply need explicitly zero-extend the inputs? */ 1576 #define umul_ppmm(w1, w0, u, v) \ 1577 __asm__ ("mulx %2,%3,%%g1; srl %%g1,0,%1; srlx %%g1,32,%0" : \ 1578 "=r" (w1), "=r" (w0) : "r" (u), "r" (v) : "g1") 1579 #else 1580 /* Use v8 umul until above bug is fixed. */ 1581 #define umul_ppmm(w1, w0, u, v) \ 1582 __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v)) 1583 #endif 1584 /* Use a plain v8 divide for v9. 
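
   The v8 udiv divides the 64-bit value formed by the %y register (high
   word, loaded from n1) and its first operand (n0) by d, producing only
   the 32-bit quotient; the three nops cover the delay after writing %y.
   The remainder is then reconstructed as

     (r) = (n0) - __q * (d);

   which is exact modulo 2^32 since n1*2^32 + n0 == __q*d + r.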
/* Use a plain v8 divide for v9.  */
#define udiv_qrnnd(q, r, n1, n0, d) \
  do { \
    USItype __q; \
    __asm__ ("mov %1,%%y;nop;nop;nop;udiv %2,%3,%0" \
	     : "=r" (__q) : "r" (n1), "r" (n0), "r" (d)); \
    (r) = (n0) - __q * (d); \
    (q) = __q; \
  } while (0)
#else
#if defined (__sparc_v8__)	/* gcc normal */ \
  || defined (__sparcv8)	/* gcc solaris */ \
  || HAVE_HOST_CPU_supersparc
/* Don't match immediate range because, 1) it is not often useful,
   2) the 'I' flag thinks of the range as a 13 bit signed interval,
   while we want to match a 13 bit interval, sign extended to 32 bits,
   but INTERPRETED AS UNSIGNED.  */
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
#define UMUL_TIME 5

#if HAVE_HOST_CPU_supersparc
#define UDIV_TIME 60		/* SuperSPARC timing */
#else
/* Don't use this on SuperSPARC because its udiv only handles 53 bit
   dividends and will trap to the kernel for the rest.  */
#define udiv_qrnnd(q, r, n1, n0, d) \
  do { \
    USItype __q; \
    __asm__ ("mov %1,%%y;nop;nop;nop;udiv %2,%3,%0" \
	     : "=r" (__q) : "r" (n1), "r" (n0), "r" (d)); \
    (r) = (n0) - __q * (d); \
    (q) = __q; \
  } while (0)
#define UDIV_TIME 25
#endif /* HAVE_HOST_CPU_supersparc */

#else /* ! __sparc_v8__ */
#if defined (__sparclite__)
/* This has hardware multiply but not divide.  It also has two additional
   instructions scan (ffs from high bit) and divscc.  */
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
#define UMUL_TIME 5
#define udiv_qrnnd(q, r, n1, n0, d) \
  __asm__ ("! Inlined udiv_qrnnd\n" \
"	wr %%g0,%2,%%y	! Not a delayed write for sparclite\n" \
"	tst %%g0\n" \
"	divscc %3,%4,%%g1\n" \
"	divscc %%g1,%4,%%g1\n" \
"	divscc %%g1,%4,%%g1\n" \
"	divscc %%g1,%4,%%g1\n" \
"	divscc %%g1,%4,%%g1\n" \
"	divscc %%g1,%4,%%g1\n" \
"	divscc %%g1,%4,%%g1\n" \
"	divscc %%g1,%4,%%g1\n" \
"	divscc %%g1,%4,%%g1\n" \
"	divscc %%g1,%4,%%g1\n" \
"	divscc %%g1,%4,%%g1\n" \
"	divscc %%g1,%4,%%g1\n" \
"	divscc %%g1,%4,%%g1\n" \
"	divscc %%g1,%4,%%g1\n" \
"	divscc %%g1,%4,%%g1\n" \
"	divscc %%g1,%4,%%g1\n" \
"	divscc %%g1,%4,%%g1\n" \
"	divscc %%g1,%4,%%g1\n" \
"	divscc %%g1,%4,%%g1\n" \
"	divscc %%g1,%4,%%g1\n" \
"	divscc %%g1,%4,%%g1\n" \
"	divscc %%g1,%4,%%g1\n" \
"	divscc %%g1,%4,%%g1\n" \
"	divscc %%g1,%4,%%g1\n" \
"	divscc %%g1,%4,%%g1\n" \
"	divscc %%g1,%4,%%g1\n" \
"	divscc %%g1,%4,%%g1\n" \
"	divscc %%g1,%4,%%g1\n" \
"	divscc %%g1,%4,%%g1\n" \
"	divscc %%g1,%4,%%g1\n" \
"	divscc %%g1,%4,%0\n" \
"	rd %%y,%1\n" \
"	bl,a 1f\n" \
"	add %1,%4,%1\n" \
"1:	! End of inline udiv_qrnnd" \
	   : "=r" (q), "=r" (r) : "r" (n1), "r" (n0), "rI" (d) \
	   : "%g1" __AND_CLOBBER_CC)
#define UDIV_TIME 37
#define count_leading_zeros(count, x) \
  __asm__ ("scan %1,1,%0" : "=r" (count) : "r" (x))
/* Early sparclites return 63 for an argument of 0, but they warn that future
   implementations might change this.  Therefore, leave COUNT_LEADING_ZEROS_0
   undefined.  */
#endif /* __sparclite__ */
#endif /* __sparc_v8__ */
#endif /* __sparc_v9__ */
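/* Note (added commentary) on the v8/v9 udiv_qrnnd definitions above: the
   hardware udiv leaves only the quotient, so the remainder is recovered as
   (r) = (n0) - __q * (d).  Since n1*2^32 + n0 = __q*d + r exactly, n0 and
   __q*d agree modulo 2^32 up to r, so the 32-bit subtraction wraps around
   to the true remainder 0 <= r < d.  */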
/* Default to sparc v7 versions of umul_ppmm and udiv_qrnnd.  */
#ifndef umul_ppmm
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("! Inlined umul_ppmm\n" \
"	wr %%g0,%2,%%y	! SPARC has 0-3 delay insn after a wr\n" \
"	sra %3,31,%%g2	! Don't move this insn\n" \
"	and %2,%%g2,%%g2	! Don't move this insn\n" \
"	andcc %%g0,0,%%g1	! Don't move this insn\n" \
"	mulscc %%g1,%3,%%g1\n" \
"	mulscc %%g1,%3,%%g1\n" \
"	mulscc %%g1,%3,%%g1\n" \
"	mulscc %%g1,%3,%%g1\n" \
"	mulscc %%g1,%3,%%g1\n" \
"	mulscc %%g1,%3,%%g1\n" \
"	mulscc %%g1,%3,%%g1\n" \
"	mulscc %%g1,%3,%%g1\n" \
"	mulscc %%g1,%3,%%g1\n" \
"	mulscc %%g1,%3,%%g1\n" \
"	mulscc %%g1,%3,%%g1\n" \
"	mulscc %%g1,%3,%%g1\n" \
"	mulscc %%g1,%3,%%g1\n" \
"	mulscc %%g1,%3,%%g1\n" \
"	mulscc %%g1,%3,%%g1\n" \
"	mulscc %%g1,%3,%%g1\n" \
"	mulscc %%g1,%3,%%g1\n" \
"	mulscc %%g1,%3,%%g1\n" \
"	mulscc %%g1,%3,%%g1\n" \
"	mulscc %%g1,%3,%%g1\n" \
"	mulscc %%g1,%3,%%g1\n" \
"	mulscc %%g1,%3,%%g1\n" \
"	mulscc %%g1,%3,%%g1\n" \
"	mulscc %%g1,%3,%%g1\n" \
"	mulscc %%g1,%3,%%g1\n" \
"	mulscc %%g1,%3,%%g1\n" \
"	mulscc %%g1,%3,%%g1\n" \
"	mulscc %%g1,%3,%%g1\n" \
"	mulscc %%g1,%3,%%g1\n" \
"	mulscc %%g1,%3,%%g1\n" \
"	mulscc %%g1,%3,%%g1\n" \
"	mulscc %%g1,%3,%%g1\n" \
"	mulscc %%g1,0,%%g1\n" \
"	add %%g1,%%g2,%0\n" \
"	rd %%y,%1" \
	   : "=r" (w1), "=r" (w0) : "%rI" (u), "r" (v) \
	   : "%g1", "%g2" __AND_CLOBBER_CC)
#define UMUL_TIME 39		/* 39 instructions */
#endif
#ifndef udiv_qrnnd
#ifndef LONGLONG_STANDALONE
#define udiv_qrnnd(q, r, n1, n0, d) \
  do { UWtype __r; \
    (q) = __MPN(udiv_qrnnd) (&__r, (n1), (n0), (d)); \
    (r) = __r; \
  } while (0)
extern UWtype __MPN(udiv_qrnnd) (UWtype *, UWtype, UWtype, UWtype);
#ifndef UDIV_TIME
#define UDIV_TIME 140
#endif
#endif /* LONGLONG_STANDALONE */
#endif /* udiv_qrnnd */
#endif /* __sparc__ */

#if defined (__sparc__) && W_TYPE_SIZE == 64
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ( \
       "addcc %r4,%5,%1\n" \
      "	addccc %r6,%7,%%g0\n" \
      "	addc %r2,%3,%0" \
	  : "=r" (sh), "=&r" (sl) \
	  : "rJ" (ah), "rI" (bh), "%rJ" (al), "rI" (bl), \
	    "%rJ" ((al) >> 32), "rI" ((bl) >> 32) \
	   __CLOBBER_CC)
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ( \
       "subcc %r4,%5,%1\n" \
      "	subccc %r6,%7,%%g0\n" \
      "	subc %r2,%3,%0" \
	  : "=r" (sh), "=&r" (sl) \
	  : "rJ" (ah), "rI" (bh), "rJ" (al), "rI" (bl), \
	    "rJ" ((al) >> 32), "rI" ((bl) >> 32) \
	   __CLOBBER_CC)
#endif

#if (defined (__vax) || defined (__vax__)) && W_TYPE_SIZE == 32
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("addl2 %5,%1\n\tadwc %3,%0" \
	   : "=g" (sh), "=&g" (sl) \
	   : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
	     "%1" ((USItype)(al)), "g" ((USItype)(bl)))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("subl2 %5,%1\n\tsbwc %3,%0" \
	   : "=g" (sh), "=&g" (sl) \
	   : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
	     "1" ((USItype)(al)), "g" ((USItype)(bl)))
#define smul_ppmm(xh, xl, m0, m1) \
  do { \
    union {UDItype __ll; \
	   struct {USItype __l, __h;} __i; \
	  } __x; \
    USItype __m0 = (m0), __m1 = (m1); \
    __asm__ ("emul %1,%2,$0,%0" \
	     : "=g" (__x.__ll) : "g" (__m0), "g" (__m1)); \
    (xh) = __x.__i.__h; (xl) = __x.__i.__l; \
  } while (0)
#define sdiv_qrnnd(q, r, n1, n0, d) \
  do { \
    union {DItype __ll; \
	   struct {SItype __l, __h;} __i; \
	  } __x; \
    __x.__i.__h = n1; __x.__i.__l = n0; \
    __asm__ ("ediv %3,%2,%0,%1" \
	     : "=g" (q), "=g" (r) : "g" (__x.__ll), "g" (d)); \
  } while (0)
#if 0
/* FIXME: This instruction appears to be unimplemented on some systems (vax
   8800 maybe). */
#define count_trailing_zeros(count,x) \
  do { \
    __asm__ ("ffs 0, 31, %1, %0" \
	     : "=g" (count) \
	     : "g" ((USItype) (x))); \
  } while (0)
#endif
#endif /* vax */

#if defined (__z8000__) && W_TYPE_SIZE == 16
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("add %H1,%H5\n\tadc %H0,%H3" \
	   : "=r" (sh), "=&r" (sl) \
	   : "0" ((unsigned int)(ah)), "r" ((unsigned int)(bh)), \
	     "%1" ((unsigned int)(al)), "rQR" ((unsigned int)(bl)))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("sub %H1,%H5\n\tsbc %H0,%H3" \
	   : "=r" (sh), "=&r" (sl) \
	   : "0" ((unsigned int)(ah)), "r" ((unsigned int)(bh)), \
	     "1" ((unsigned int)(al)), "rQR" ((unsigned int)(bl)))
#define umul_ppmm(xh, xl, m0, m1) \
  do { \
    union {long int __ll; \
	   struct {unsigned int __h, __l;} __i; \
	  } __x; \
    unsigned int __m0 = (m0), __m1 = (m1); \
    __asm__ ("mult %S0,%H3" \
	     : "=r" (__x.__i.__h), "=r" (__x.__i.__l) \
	     : "%1" (m0), "rQR" (m1)); \
    (xh) = __x.__i.__h; (xl) = __x.__i.__l; \
    (xh) += ((((signed int) __m0 >> 15) & __m1) \
	     + (((signed int) __m1 >> 15) & __m0)); \
  } while (0)
#endif /* __z8000__ */

#endif /* __GNUC__ */

#endif /* NO_ASM */


/* FIXME: "sidi" here is highly doubtful, should sometimes be "diti".  */
#if !defined (umul_ppmm) && defined (__umulsidi3)
#define umul_ppmm(ph, pl, m0, m1) \
  { \
    UDWtype __ll = __umulsidi3 (m0, m1); \
    ph = (UWtype) (__ll >> W_TYPE_SIZE); \
    pl = (UWtype) __ll; \
  }
#endif

#if !defined (__umulsidi3)
#define __umulsidi3(u, v) \
  ({UWtype __hi, __lo; \
    umul_ppmm (__hi, __lo, u, v); \
    ((UDWtype) __hi << W_TYPE_SIZE) | __lo; })
#endif


/* Use mpn_umul_ppmm or mpn_udiv_qrnnd functions, if they exist.  The "_r"
   forms have "reversed" arguments, meaning the pointer is last, which
   sometimes allows better parameter passing, in particular on 64-bit
   hppa. */

#define mpn_umul_ppmm  __MPN(umul_ppmm)
extern UWtype mpn_umul_ppmm (UWtype *, UWtype, UWtype);

#if ! defined (umul_ppmm) && HAVE_NATIVE_mpn_umul_ppmm \
  && ! defined (LONGLONG_STANDALONE)
#define umul_ppmm(wh, wl, u, v) \
  do { \
    UWtype __umul_ppmm__p0; \
    (wh) = mpn_umul_ppmm (&__umul_ppmm__p0, (UWtype) (u), (UWtype) (v)); \
    (wl) = __umul_ppmm__p0; \
  } while (0)
#endif

#define mpn_umul_ppmm_r  __MPN(umul_ppmm_r)
extern UWtype mpn_umul_ppmm_r (UWtype, UWtype, UWtype *);

#if ! defined (umul_ppmm) && HAVE_NATIVE_mpn_umul_ppmm_r \
  && ! defined (LONGLONG_STANDALONE)
#define umul_ppmm(wh, wl, u, v) \
  do { \
    UWtype __umul_ppmm__p0; \
    (wh) = mpn_umul_ppmm_r ((UWtype) (u), (UWtype) (v), &__umul_ppmm__p0); \
    (wl) = __umul_ppmm__p0; \
  } while (0)
#endif

#define mpn_udiv_qrnnd  __MPN(udiv_qrnnd)
extern UWtype mpn_udiv_qrnnd (UWtype *, UWtype, UWtype, UWtype);

#if ! defined (udiv_qrnnd) && HAVE_NATIVE_mpn_udiv_qrnnd \
  && ! defined (LONGLONG_STANDALONE)
#define udiv_qrnnd(q, r, n1, n0, d) \
  do { \
    UWtype __udiv_qrnnd__r; \
    (q) = mpn_udiv_qrnnd (&__udiv_qrnnd__r, \
			  (UWtype) (n1), (UWtype) (n0), (UWtype) d); \
    (r) = __udiv_qrnnd__r; \
  } while (0)
#endif

#define mpn_udiv_qrnnd_r  __MPN(udiv_qrnnd_r)
extern UWtype mpn_udiv_qrnnd_r (UWtype, UWtype, UWtype, UWtype *);

#if ! defined (udiv_qrnnd) && HAVE_NATIVE_mpn_udiv_qrnnd_r \
  && ! defined (LONGLONG_STANDALONE)
#define udiv_qrnnd(q, r, n1, n0, d) \
  do { \
    UWtype __udiv_qrnnd__r; \
    (q) = mpn_udiv_qrnnd_r ((UWtype) (n1), (UWtype) (n0), (UWtype) d, \
			    &__udiv_qrnnd__r); \
    (r) = __udiv_qrnnd__r; \
  } while (0)
#endif


/* If this machine has no inline assembler, use C macros.  */

#if !defined (add_ssaaaa)
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  do { \
    UWtype __x; \
    __x = (al) + (bl); \
    (sh) = (ah) + (bh) + (__x < (al)); \
    (sl) = __x; \
  } while (0)
#endif

#if !defined (sub_ddmmss)
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  do { \
    UWtype __x; \
    __x = (al) - (bl); \
    (sh) = (ah) - (bh) - ((al) < (bl)); \
    (sl) = __x; \
  } while (0)
#endif

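/* Note (added commentary) on the umul_ppmm/smul_ppmm conversions below: for
   W_TYPE_SIZE-bit words the unsigned and signed products have identical low
   words, and their high words differ, modulo 2^W_TYPE_SIZE, by v whenever
   the sign bit of u is set plus u whenever the sign bit of v is set.  The
   expression -(x >> (W_TYPE_SIZE - 1)) is an all-ones mask exactly when the
   sign bit of x is set, so the two "&" terms add (or subtract) just those
   corrections.  */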
/* If we lack umul_ppmm but have smul_ppmm, define umul_ppmm in terms of
   smul_ppmm.  */
#if !defined (umul_ppmm) && defined (smul_ppmm)
#define umul_ppmm(w1, w0, u, v) \
  do { \
    UWtype __w1; \
    UWtype __xm0 = (u), __xm1 = (v); \
    smul_ppmm (__w1, w0, __xm0, __xm1); \
    (w1) = __w1 + (-(__xm0 >> (W_TYPE_SIZE - 1)) & __xm1) \
		+ (-(__xm1 >> (W_TYPE_SIZE - 1)) & __xm0); \
  } while (0)
#endif

/* If we still don't have umul_ppmm, define it using plain C.

   For reference, when this code is used for squaring (ie. u and v identical
   expressions), gcc recognises __x1 and __x2 are the same and generates 3
   multiplies, not 4.  The subsequent additions could be optimized a bit,
   but the only place GMP currently uses such a square is mpn_sqr_basecase,
   and chips obliged to use this generic C umul will have plenty of worse
   performance problems than a couple of extra instructions on the diagonal
   of sqr_basecase.  */

#if !defined (umul_ppmm)
#define umul_ppmm(w1, w0, u, v) \
  do { \
    UWtype __x0, __x1, __x2, __x3; \
    UHWtype __ul, __vl, __uh, __vh; \
    UWtype __u = (u), __v = (v); \
 \
    __ul = __ll_lowpart (__u); \
    __uh = __ll_highpart (__u); \
    __vl = __ll_lowpart (__v); \
    __vh = __ll_highpart (__v); \
 \
    __x0 = (UWtype) __ul * __vl; \
    __x1 = (UWtype) __ul * __vh; \
    __x2 = (UWtype) __uh * __vl; \
    __x3 = (UWtype) __uh * __vh; \
 \
    __x1 += __ll_highpart (__x0);	/* this can't give carry */ \
    __x1 += __x2;			/* but this indeed can */ \
    if (__x1 < __x2)			/* did we get it? */ \
      __x3 += __ll_B;			/* yes, add it in the proper pos. */ \
 \
    (w1) = __x3 + __ll_highpart (__x1); \
    (w0) = (__x1 << W_TYPE_SIZE/2) + __ll_lowpart (__x0); \
  } while (0)
#endif

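/* Illustrative sketch, not part of longlong.h proper: checking a umul_ppmm
   result against a UDWtype multiply.  As with the add/sub sketch earlier,
   this assumes UDWtype is exactly twice as wide as UWtype.  */
#if 0
static int
__umul_ppmm_selftest (UWtype u, UWtype v)
{
  UWtype __ph, __pl;
  UDWtype __ref = (UDWtype) u * v;

  umul_ppmm (__ph, __pl, u, v);
  return __ph == (UWtype) (__ref >> W_TYPE_SIZE) && __pl == (UWtype) __ref;
}
#endif
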
/* If we don't have smul_ppmm, define it using umul_ppmm (which surely will
   exist in one form or another).  */
#if !defined (smul_ppmm)
#define smul_ppmm(w1, w0, u, v) \
  do { \
    UWtype __w1; \
    UWtype __xm0 = (u), __xm1 = (v); \
    umul_ppmm (__w1, w0, __xm0, __xm1); \
    (w1) = __w1 - (-(__xm0 >> (W_TYPE_SIZE - 1)) & __xm1) \
		- (-(__xm1 >> (W_TYPE_SIZE - 1)) & __xm0); \
  } while (0)
#endif

/* Define this unconditionally, so it can be used for debugging.  */
#define __udiv_qrnnd_c(q, r, n1, n0, d) \
  do { \
    UWtype __d1, __d0, __q1, __q0, __r1, __r0, __m; \
 \
    ASSERT ((d) != 0); \
    ASSERT ((n1) < (d)); \
 \
    __d1 = __ll_highpart (d); \
    __d0 = __ll_lowpart (d); \
 \
    __q1 = (n1) / __d1; \
    __r1 = (n1) - __q1 * __d1; \
    __m = __q1 * __d0; \
    __r1 = __r1 * __ll_B | __ll_highpart (n0); \
    if (__r1 < __m) \
      { \
	__q1--, __r1 += (d); \
	if (__r1 >= (d)) /* i.e. we didn't get carry when adding to __r1 */ \
	  if (__r1 < __m) \
	    __q1--, __r1 += (d); \
      } \
    __r1 -= __m; \
 \
    __q0 = __r1 / __d1; \
    __r0 = __r1 - __q0 * __d1; \
    __m = __q0 * __d0; \
    __r0 = __r0 * __ll_B | __ll_lowpart (n0); \
    if (__r0 < __m) \
      { \
	__q0--, __r0 += (d); \
	if (__r0 >= (d)) \
	  if (__r0 < __m) \
	    __q0--, __r0 += (d); \
      } \
    __r0 -= __m; \
 \
    (q) = __q1 * __ll_B | __q0; \
    (r) = __r0; \
  } while (0)

/* If the processor has no udiv_qrnnd but sdiv_qrnnd, go through
   __udiv_w_sdiv (defined in libgcc or elsewhere).  */
#if !defined (udiv_qrnnd) && defined (sdiv_qrnnd)
#define udiv_qrnnd(q, r, nh, nl, d) \
  do { \
    UWtype __r; \
    (q) = __MPN(udiv_w_sdiv) (&__r, nh, nl, d); \
    (r) = __r; \
  } while (0)
__GMP_DECLSPEC UWtype __MPN(udiv_w_sdiv) (UWtype *, UWtype, UWtype, UWtype);
#endif

/* If udiv_qrnnd was not defined for this processor, use __udiv_qrnnd_c.  */
#if !defined (udiv_qrnnd)
#define UDIV_NEEDS_NORMALIZATION 1
#define udiv_qrnnd __udiv_qrnnd_c
#endif

#if !defined (count_leading_zeros)
#define count_leading_zeros(count, x) \
  do { \
    UWtype __xr = (x); \
    UWtype __a; \
 \
    if (W_TYPE_SIZE == 32) \
      { \
	__a = __xr < ((UWtype) 1 << 2*__BITS4) \
	  ? (__xr < ((UWtype) 1 << __BITS4) ? 1 : __BITS4 + 1) \
	  : (__xr < ((UWtype) 1 << 3*__BITS4) ? 2*__BITS4 + 1 \
	  : 3*__BITS4 + 1); \
      } \
    else \
      { \
	for (__a = W_TYPE_SIZE - 8; __a > 0; __a -= 8) \
	  if (((__xr >> __a) & 0xff) != 0) \
	    break; \
	++__a; \
      } \
 \
    (count) = W_TYPE_SIZE + 1 - __a - __clz_tab[__xr >> __a]; \
  } while (0)
/* This version gives a well-defined value for zero. */
#define COUNT_LEADING_ZEROS_0 (W_TYPE_SIZE - 1)
#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
#define COUNT_LEADING_ZEROS_SLOW
#endif

/* clz_tab needed by mpn/x86/pentium/mod_1.asm in a fat binary */
#if HAVE_HOST_CPU_FAMILY_x86 && WANT_FAT_BINARY
#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
#endif

#ifdef COUNT_LEADING_ZEROS_NEED_CLZ_TAB
extern const unsigned char __GMP_DECLSPEC __clz_tab[129];
#endif

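/* Illustrative sketch, not part of longlong.h proper: the contract of
   count_leading_zeros for nonzero x, checked against a bit-at-a-time
   reference loop.  */
#if 0
static int
__clz_selftest (UWtype x)
{
  int __c;
  int __ref = 0;
  UWtype __t = x;			/* x must be nonzero */

  while ((__t & ((UWtype) 1 << (W_TYPE_SIZE - 1))) == 0)
    {
      __ref++;
      __t <<= 1;
    }
  count_leading_zeros (__c, x);
  return __c == __ref;
}
#endif
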
#if !defined (count_trailing_zeros)
#if !defined (COUNT_LEADING_ZEROS_SLOW)
/* Define count_trailing_zeros using an asm count_leading_zeros.  */
#define count_trailing_zeros(count, x) \
  do { \
    UWtype __ctz_x = (x); \
    UWtype __ctz_c; \
    ASSERT (__ctz_x != 0); \
    count_leading_zeros (__ctz_c, __ctz_x & -__ctz_x); \
    (count) = W_TYPE_SIZE - 1 - __ctz_c; \
  } while (0)
#else
/* Define count_trailing_zeros in plain C, assuming small counts are common.
   We use clz_tab without ado, since the C count_leading_zeros above will have
   pulled it in.  */
#define count_trailing_zeros(count, x) \
  do { \
    UWtype __ctz_x = (x); \
    int __ctz_c; \
 \
    if (LIKELY ((__ctz_x & 0xff) != 0)) \
      (count) = __clz_tab[__ctz_x & -__ctz_x] - 2; \
    else \
      { \
	for (__ctz_c = 8 - 2; __ctz_c < W_TYPE_SIZE - 2; __ctz_c += 8) \
	  { \
	    __ctz_x >>= 8; \
	    if (LIKELY ((__ctz_x & 0xff) != 0)) \
	      break; \
	  } \
 \
	(count) = __ctz_c + __clz_tab[__ctz_x & -__ctz_x]; \
      } \
  } while (0)
#endif
#endif

#ifndef UDIV_NEEDS_NORMALIZATION
#define UDIV_NEEDS_NORMALIZATION 0
#endif

/* Whether udiv_qrnnd is actually implemented with udiv_qrnnd_preinv, in
   which case the latter should always be used.  */
#ifndef UDIV_PREINV_ALWAYS
#define UDIV_PREINV_ALWAYS 0
#endif

/* Give defaults for UMUL_TIME and UDIV_TIME.  */
#ifndef UMUL_TIME
#define UMUL_TIME 1
#endif

#ifndef UDIV_TIME
#define UDIV_TIME UMUL_TIME
#endif
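
/* Illustrative sketch, not part of longlong.h proper: how callers typically
   honour UDIV_NEEDS_NORMALIZATION.  When it is 1, udiv_qrnnd requires the
   divisor to have its most significant bit set, so the operands are shifted
   left by count_leading_zeros (d) beforehand and the remainder shifted back
   afterwards.  Assumes d != 0 and n1 < d.  */
#if 0
static void
__udiv_qrnnd_any (UWtype *qp, UWtype *rp, UWtype n1, UWtype n0, UWtype d)
{
  UWtype __q, __r;
  int __cnt;

  if (UDIV_NEEDS_NORMALIZATION)
    {
      count_leading_zeros (__cnt, d);
      if (__cnt != 0)
	{
	  d = d << __cnt;
	  n1 = (n1 << __cnt) | (n0 >> (W_TYPE_SIZE - __cnt));
	  n0 = n0 << __cnt;
	}
      udiv_qrnnd (__q, __r, n1, n0, d);
      __r >>= __cnt;			/* undo the normalization on r */
    }
  else
    udiv_qrnnd (__q, __r, n1, n0, d);

  *qp = __q;
  *rp = __r;
}
#endif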