1 /*===---- __clang_hip_math.h - Device-side HIP math support ----------------=== 2 * 3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 * See https://llvm.org/LICENSE.txt for license information. 5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 * 7 *===-----------------------------------------------------------------------=== 8 */ 9 #ifndef __CLANG_HIP_MATH_H__ 10 #define __CLANG_HIP_MATH_H__ 11 12 #if !defined(__HIP__) && !defined(__OPENMP_AMDGCN__) 13 #error "This file is for HIP and OpenMP AMDGCN device compilation only." 14 #endif 15 16 // The __CLANG_GPU_DISABLE_MATH_WRAPPERS macro provides a way to let standard 17 // libcalls reach the link step instead of being eagerly replaced. 18 #ifndef __CLANG_GPU_DISABLE_MATH_WRAPPERS 19 20 #if !defined(__HIPCC_RTC__) 21 #include <limits.h> 22 #include <stdint.h> 23 #ifdef __OPENMP_AMDGCN__ 24 #include <omp.h> 25 #endif 26 #endif // !defined(__HIPCC_RTC__) 27 28 #pragma push_macro("__DEVICE__") 29 30 #ifdef __OPENMP_AMDGCN__ 31 #define __DEVICE__ static inline __attribute__((always_inline, nothrow)) 32 #else 33 #define __DEVICE__ static __device__ inline __attribute__((always_inline)) 34 #endif 35 36 // Device library provides fast low precision and slow full-recision 37 // implementations for some functions. Which one gets selected depends on 38 // __CLANG_GPU_APPROX_TRANSCENDENTALS__ which gets defined by clang if 39 // -ffast-math or -fgpu-approx-transcendentals are in effect. 40 #pragma push_macro("__FAST_OR_SLOW") 41 #if defined(__CLANG_GPU_APPROX_TRANSCENDENTALS__) 42 #define __FAST_OR_SLOW(fast, slow) fast 43 #else 44 #define __FAST_OR_SLOW(fast, slow) slow 45 #endif 46 47 // A few functions return bool type starting only in C++11. 48 #pragma push_macro("__RETURN_TYPE") 49 #ifdef __OPENMP_AMDGCN__ 50 #define __RETURN_TYPE int 51 #else 52 #if defined(__cplusplus) 53 #define __RETURN_TYPE bool 54 #else 55 #define __RETURN_TYPE int 56 #endif 57 #endif // __OPENMP_AMDGCN__ 58 59 #if defined (__cplusplus) && __cplusplus < 201103L 60 // emulate static_assert on type sizes 61 template<bool> 62 struct __compare_result{}; 63 template<> 64 struct __compare_result<true> { 65 static const __device__ bool valid; 66 }; 67 68 __DEVICE__ 69 void __suppress_unused_warning(bool b){}; 70 template <unsigned int S, unsigned int T> 71 __DEVICE__ void __static_assert_equal_size() { 72 __suppress_unused_warning(__compare_result<S == T>::valid); 73 } 74 75 #define __static_assert_type_size_equal(A, B) \ 76 __static_assert_equal_size<A,B>() 77 78 #else 79 #define __static_assert_type_size_equal(A,B) \ 80 static_assert((A) == (B), "") 81 82 #endif 83 84 __DEVICE__ 85 uint64_t __make_mantissa_base8(const char *__tagp __attribute__((nonnull))) { 86 uint64_t __r = 0; 87 while (*__tagp != '\0') { 88 char __tmp = *__tagp; 89 90 if (__tmp >= '0' && __tmp <= '7') 91 __r = (__r * 8u) + __tmp - '0'; 92 else 93 return 0; 94 95 ++__tagp; 96 } 97 98 return __r; 99 } 100 101 __DEVICE__ 102 uint64_t __make_mantissa_base10(const char *__tagp __attribute__((nonnull))) { 103 uint64_t __r = 0; 104 while (*__tagp != '\0') { 105 char __tmp = *__tagp; 106 107 if (__tmp >= '0' && __tmp <= '9') 108 __r = (__r * 10u) + __tmp - '0'; 109 else 110 return 0; 111 112 ++__tagp; 113 } 114 115 return __r; 116 } 117 118 __DEVICE__ 119 uint64_t __make_mantissa_base16(const char *__tagp __attribute__((nonnull))) { 120 uint64_t __r = 0; 121 while (*__tagp != '\0') { 122 char __tmp = *__tagp; 123 124 if (__tmp >= '0' && __tmp <= '9') 125 __r = (__r * 16u) + __tmp - '0'; 126 else if (__tmp >= 'a' && __tmp <= 'f') 127 __r = (__r * 16u) + __tmp - 'a' + 10; 128 else if (__tmp >= 'A' && __tmp <= 'F') 129 __r = (__r * 16u) + __tmp - 'A' + 10; 130 else 131 return 0; 132 133 ++__tagp; 134 } 135 136 return __r; 137 } 138 139 __DEVICE__ 140 uint64_t __make_mantissa(const char *__tagp __attribute__((nonnull))) { 141 if (*__tagp == '0') { 142 ++__tagp; 143 144 if (*__tagp == 'x' || *__tagp == 'X') 145 return __make_mantissa_base16(__tagp); 146 else 147 return __make_mantissa_base8(__tagp); 148 } 149 150 return __make_mantissa_base10(__tagp); 151 } 152 153 // BEGIN FLOAT 154 155 // BEGIN INTRINSICS 156 157 __DEVICE__ 158 float __cosf(float __x) { return __ocml_native_cos_f32(__x); } 159 160 __DEVICE__ 161 float __exp10f(float __x) { 162 const float __log2_10 = 0x1.a934f0p+1f; 163 return __builtin_amdgcn_exp2f(__log2_10 * __x); 164 } 165 166 __DEVICE__ 167 float __expf(float __x) { 168 const float __log2_e = 0x1.715476p+0; 169 return __builtin_amdgcn_exp2f(__log2_e * __x); 170 } 171 172 #if defined OCML_BASIC_ROUNDED_OPERATIONS 173 __DEVICE__ 174 float __fadd_rd(float __x, float __y) { return __ocml_add_rtn_f32(__x, __y); } 175 __DEVICE__ 176 float __fadd_rn(float __x, float __y) { return __ocml_add_rte_f32(__x, __y); } 177 __DEVICE__ 178 float __fadd_ru(float __x, float __y) { return __ocml_add_rtp_f32(__x, __y); } 179 __DEVICE__ 180 float __fadd_rz(float __x, float __y) { return __ocml_add_rtz_f32(__x, __y); } 181 #else 182 __DEVICE__ 183 float __fadd_rn(float __x, float __y) { return __x + __y; } 184 #endif 185 186 #if defined OCML_BASIC_ROUNDED_OPERATIONS 187 __DEVICE__ 188 float __fdiv_rd(float __x, float __y) { return __ocml_div_rtn_f32(__x, __y); } 189 __DEVICE__ 190 float __fdiv_rn(float __x, float __y) { return __ocml_div_rte_f32(__x, __y); } 191 __DEVICE__ 192 float __fdiv_ru(float __x, float __y) { return __ocml_div_rtp_f32(__x, __y); } 193 __DEVICE__ 194 float __fdiv_rz(float __x, float __y) { return __ocml_div_rtz_f32(__x, __y); } 195 #else 196 __DEVICE__ 197 float __fdiv_rn(float __x, float __y) { return __x / __y; } 198 #endif 199 200 __DEVICE__ 201 float __fdividef(float __x, float __y) { return __x / __y; } 202 203 #if defined OCML_BASIC_ROUNDED_OPERATIONS 204 __DEVICE__ 205 float __fmaf_rd(float __x, float __y, float __z) { 206 return __ocml_fma_rtn_f32(__x, __y, __z); 207 } 208 __DEVICE__ 209 float __fmaf_rn(float __x, float __y, float __z) { 210 return __ocml_fma_rte_f32(__x, __y, __z); 211 } 212 __DEVICE__ 213 float __fmaf_ru(float __x, float __y, float __z) { 214 return __ocml_fma_rtp_f32(__x, __y, __z); 215 } 216 __DEVICE__ 217 float __fmaf_rz(float __x, float __y, float __z) { 218 return __ocml_fma_rtz_f32(__x, __y, __z); 219 } 220 #else 221 __DEVICE__ 222 float __fmaf_rn(float __x, float __y, float __z) { 223 return __builtin_fmaf(__x, __y, __z); 224 } 225 #endif 226 227 #if defined OCML_BASIC_ROUNDED_OPERATIONS 228 __DEVICE__ 229 float __fmul_rd(float __x, float __y) { return __ocml_mul_rtn_f32(__x, __y); } 230 __DEVICE__ 231 float __fmul_rn(float __x, float __y) { return __ocml_mul_rte_f32(__x, __y); } 232 __DEVICE__ 233 float __fmul_ru(float __x, float __y) { return __ocml_mul_rtp_f32(__x, __y); } 234 __DEVICE__ 235 float __fmul_rz(float __x, float __y) { return __ocml_mul_rtz_f32(__x, __y); } 236 #else 237 __DEVICE__ 238 float __fmul_rn(float __x, float __y) { return __x * __y; } 239 #endif 240 241 #if defined OCML_BASIC_ROUNDED_OPERATIONS 242 __DEVICE__ 243 float __frcp_rd(float __x) { return __ocml_div_rtn_f32(1.0f, __x); } 244 __DEVICE__ 245 float __frcp_rn(float __x) { return __ocml_div_rte_f32(1.0f, __x); } 246 __DEVICE__ 247 float __frcp_ru(float __x) { return __ocml_div_rtp_f32(1.0f, __x); } 248 __DEVICE__ 249 float __frcp_rz(float __x) { return __ocml_div_rtz_f32(1.0f, __x); } 250 #else 251 __DEVICE__ 252 float __frcp_rn(float __x) { return 1.0f / __x; } 253 #endif 254 255 __DEVICE__ 256 float __frsqrt_rn(float __x) { return __builtin_amdgcn_rsqf(__x); } 257 258 #if defined OCML_BASIC_ROUNDED_OPERATIONS 259 __DEVICE__ 260 float __fsqrt_rd(float __x) { return __ocml_sqrt_rtn_f32(__x); } 261 __DEVICE__ 262 float __fsqrt_rn(float __x) { return __ocml_sqrt_rte_f32(__x); } 263 __DEVICE__ 264 float __fsqrt_ru(float __x) { return __ocml_sqrt_rtp_f32(__x); } 265 __DEVICE__ 266 float __fsqrt_rz(float __x) { return __ocml_sqrt_rtz_f32(__x); } 267 #else 268 __DEVICE__ 269 float __fsqrt_rn(float __x) { return __ocml_native_sqrt_f32(__x); } 270 #endif 271 272 #if defined OCML_BASIC_ROUNDED_OPERATIONS 273 __DEVICE__ 274 float __fsub_rd(float __x, float __y) { return __ocml_sub_rtn_f32(__x, __y); } 275 __DEVICE__ 276 float __fsub_rn(float __x, float __y) { return __ocml_sub_rte_f32(__x, __y); } 277 __DEVICE__ 278 float __fsub_ru(float __x, float __y) { return __ocml_sub_rtp_f32(__x, __y); } 279 __DEVICE__ 280 float __fsub_rz(float __x, float __y) { return __ocml_sub_rtz_f32(__x, __y); } 281 #else 282 __DEVICE__ 283 float __fsub_rn(float __x, float __y) { return __x - __y; } 284 #endif 285 286 __DEVICE__ 287 float __log10f(float __x) { return __builtin_log10f(__x); } 288 289 __DEVICE__ 290 float __log2f(float __x) { return __builtin_amdgcn_logf(__x); } 291 292 __DEVICE__ 293 float __logf(float __x) { return __builtin_logf(__x); } 294 295 __DEVICE__ 296 float __powf(float __x, float __y) { return __ocml_pow_f32(__x, __y); } 297 298 __DEVICE__ 299 float __saturatef(float __x) { return (__x < 0) ? 0 : ((__x > 1) ? 1 : __x); } 300 301 __DEVICE__ 302 void __sincosf(float __x, float *__sinptr, float *__cosptr) { 303 *__sinptr = __ocml_native_sin_f32(__x); 304 *__cosptr = __ocml_native_cos_f32(__x); 305 } 306 307 __DEVICE__ 308 float __sinf(float __x) { return __ocml_native_sin_f32(__x); } 309 310 __DEVICE__ 311 float __tanf(float __x) { 312 return __sinf(__x) * __builtin_amdgcn_rcpf(__cosf(__x)); 313 } 314 // END INTRINSICS 315 316 #if defined(__cplusplus) 317 __DEVICE__ 318 int abs(int __x) { 319 return __builtin_abs(__x); 320 } 321 __DEVICE__ 322 long labs(long __x) { 323 return __builtin_labs(__x); 324 } 325 __DEVICE__ 326 long long llabs(long long __x) { 327 return __builtin_llabs(__x); 328 } 329 #endif 330 331 __DEVICE__ 332 float acosf(float __x) { return __ocml_acos_f32(__x); } 333 334 __DEVICE__ 335 float acoshf(float __x) { return __ocml_acosh_f32(__x); } 336 337 __DEVICE__ 338 float asinf(float __x) { return __ocml_asin_f32(__x); } 339 340 __DEVICE__ 341 float asinhf(float __x) { return __ocml_asinh_f32(__x); } 342 343 __DEVICE__ 344 float atan2f(float __x, float __y) { return __ocml_atan2_f32(__x, __y); } 345 346 __DEVICE__ 347 float atanf(float __x) { return __ocml_atan_f32(__x); } 348 349 __DEVICE__ 350 float atanhf(float __x) { return __ocml_atanh_f32(__x); } 351 352 __DEVICE__ 353 float cbrtf(float __x) { return __ocml_cbrt_f32(__x); } 354 355 __DEVICE__ 356 float ceilf(float __x) { return __builtin_ceilf(__x); } 357 358 __DEVICE__ 359 float copysignf(float __x, float __y) { return __builtin_copysignf(__x, __y); } 360 361 __DEVICE__ 362 float cosf(float __x) { return __FAST_OR_SLOW(__cosf, __ocml_cos_f32)(__x); } 363 364 __DEVICE__ 365 float coshf(float __x) { return __ocml_cosh_f32(__x); } 366 367 __DEVICE__ 368 float cospif(float __x) { return __ocml_cospi_f32(__x); } 369 370 __DEVICE__ 371 float cyl_bessel_i0f(float __x) { return __ocml_i0_f32(__x); } 372 373 __DEVICE__ 374 float cyl_bessel_i1f(float __x) { return __ocml_i1_f32(__x); } 375 376 __DEVICE__ 377 float erfcf(float __x) { return __ocml_erfc_f32(__x); } 378 379 __DEVICE__ 380 float erfcinvf(float __x) { return __ocml_erfcinv_f32(__x); } 381 382 __DEVICE__ 383 float erfcxf(float __x) { return __ocml_erfcx_f32(__x); } 384 385 __DEVICE__ 386 float erff(float __x) { return __ocml_erf_f32(__x); } 387 388 __DEVICE__ 389 float erfinvf(float __x) { return __ocml_erfinv_f32(__x); } 390 391 __DEVICE__ 392 float exp10f(float __x) { return __ocml_exp10_f32(__x); } 393 394 __DEVICE__ 395 float exp2f(float __x) { return __builtin_exp2f(__x); } 396 397 __DEVICE__ 398 float expf(float __x) { return __builtin_expf(__x); } 399 400 __DEVICE__ 401 float expm1f(float __x) { return __ocml_expm1_f32(__x); } 402 403 __DEVICE__ 404 float fabsf(float __x) { return __builtin_fabsf(__x); } 405 406 __DEVICE__ 407 float fdimf(float __x, float __y) { return __ocml_fdim_f32(__x, __y); } 408 409 __DEVICE__ 410 float fdividef(float __x, float __y) { return __x / __y; } 411 412 __DEVICE__ 413 float floorf(float __x) { return __builtin_floorf(__x); } 414 415 __DEVICE__ 416 float fmaf(float __x, float __y, float __z) { 417 return __builtin_fmaf(__x, __y, __z); 418 } 419 420 __DEVICE__ 421 float fmaxf(float __x, float __y) { return __builtin_fmaxf(__x, __y); } 422 423 __DEVICE__ 424 float fminf(float __x, float __y) { return __builtin_fminf(__x, __y); } 425 426 __DEVICE__ 427 float fmodf(float __x, float __y) { return __ocml_fmod_f32(__x, __y); } 428 429 __DEVICE__ 430 float frexpf(float __x, int *__nptr) { 431 return __builtin_frexpf(__x, __nptr); 432 } 433 434 __DEVICE__ 435 float hypotf(float __x, float __y) { return __ocml_hypot_f32(__x, __y); } 436 437 __DEVICE__ 438 int ilogbf(float __x) { return __ocml_ilogb_f32(__x); } 439 440 __DEVICE__ 441 __RETURN_TYPE __finitef(float __x) { return __builtin_isfinite(__x); } 442 443 __DEVICE__ 444 __RETURN_TYPE __isinff(float __x) { return __builtin_isinf(__x); } 445 446 __DEVICE__ 447 __RETURN_TYPE __isnanf(float __x) { return __builtin_isnan(__x); } 448 449 __DEVICE__ 450 float j0f(float __x) { return __ocml_j0_f32(__x); } 451 452 __DEVICE__ 453 float j1f(float __x) { return __ocml_j1_f32(__x); } 454 455 __DEVICE__ 456 float jnf(int __n, float __x) { // TODO: we could use Ahmes multiplication 457 // and the Miller & Brown algorithm 458 // for linear recurrences to get O(log n) steps, but it's unclear if 459 // it'd be beneficial in this case. 460 if (__n == 0) 461 return j0f(__x); 462 if (__n == 1) 463 return j1f(__x); 464 465 float __x0 = j0f(__x); 466 float __x1 = j1f(__x); 467 for (int __i = 1; __i < __n; ++__i) { 468 float __x2 = (2 * __i) / __x * __x1 - __x0; 469 __x0 = __x1; 470 __x1 = __x2; 471 } 472 473 return __x1; 474 } 475 476 __DEVICE__ 477 float ldexpf(float __x, int __e) { return __builtin_amdgcn_ldexpf(__x, __e); } 478 479 __DEVICE__ 480 float lgammaf(float __x) { return __ocml_lgamma_f32(__x); } 481 482 __DEVICE__ 483 long long int llrintf(float __x) { return __builtin_rintf(__x); } 484 485 __DEVICE__ 486 long long int llroundf(float __x) { return __builtin_roundf(__x); } 487 488 __DEVICE__ 489 float log10f(float __x) { return __builtin_log10f(__x); } 490 491 __DEVICE__ 492 float log1pf(float __x) { return __ocml_log1p_f32(__x); } 493 494 __DEVICE__ 495 float log2f(float __x) { return __FAST_OR_SLOW(__log2f, __ocml_log2_f32)(__x); } 496 497 __DEVICE__ 498 float logbf(float __x) { return __ocml_logb_f32(__x); } 499 500 __DEVICE__ 501 float logf(float __x) { return __FAST_OR_SLOW(__logf, __ocml_log_f32)(__x); } 502 503 __DEVICE__ 504 long int lrintf(float __x) { return __builtin_rintf(__x); } 505 506 __DEVICE__ 507 long int lroundf(float __x) { return __builtin_roundf(__x); } 508 509 __DEVICE__ 510 float modff(float __x, float *__iptr) { 511 float __tmp; 512 #ifdef __OPENMP_AMDGCN__ 513 #pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc) 514 #endif 515 float __r = 516 __ocml_modf_f32(__x, (__attribute__((address_space(5))) float *)&__tmp); 517 *__iptr = __tmp; 518 return __r; 519 } 520 521 __DEVICE__ 522 float nanf(const char *__tagp __attribute__((nonnull))) { 523 union { 524 float val; 525 struct ieee_float { 526 unsigned int mantissa : 22; 527 unsigned int quiet : 1; 528 unsigned int exponent : 8; 529 unsigned int sign : 1; 530 } bits; 531 } __tmp; 532 __static_assert_type_size_equal(sizeof(__tmp.val), sizeof(__tmp.bits)); 533 534 __tmp.bits.sign = 0u; 535 __tmp.bits.exponent = ~0u; 536 __tmp.bits.quiet = 1u; 537 __tmp.bits.mantissa = __make_mantissa(__tagp); 538 539 return __tmp.val; 540 } 541 542 __DEVICE__ 543 float nearbyintf(float __x) { return __builtin_nearbyintf(__x); } 544 545 __DEVICE__ 546 float nextafterf(float __x, float __y) { 547 return __ocml_nextafter_f32(__x, __y); 548 } 549 550 __DEVICE__ 551 float norm3df(float __x, float __y, float __z) { 552 return __ocml_len3_f32(__x, __y, __z); 553 } 554 555 __DEVICE__ 556 float norm4df(float __x, float __y, float __z, float __w) { 557 return __ocml_len4_f32(__x, __y, __z, __w); 558 } 559 560 __DEVICE__ 561 float normcdff(float __x) { return __ocml_ncdf_f32(__x); } 562 563 __DEVICE__ 564 float normcdfinvf(float __x) { return __ocml_ncdfinv_f32(__x); } 565 566 __DEVICE__ 567 float normf(int __dim, 568 const float *__a) { // TODO: placeholder until OCML adds support. 569 float __r = 0; 570 while (__dim--) { 571 __r += __a[0] * __a[0]; 572 ++__a; 573 } 574 575 return __builtin_sqrtf(__r); 576 } 577 578 __DEVICE__ 579 float powf(float __x, float __y) { return __ocml_pow_f32(__x, __y); } 580 581 __DEVICE__ 582 float powif(float __x, int __y) { return __ocml_pown_f32(__x, __y); } 583 584 __DEVICE__ 585 float rcbrtf(float __x) { return __ocml_rcbrt_f32(__x); } 586 587 __DEVICE__ 588 float remainderf(float __x, float __y) { 589 return __ocml_remainder_f32(__x, __y); 590 } 591 592 __DEVICE__ 593 float remquof(float __x, float __y, int *__quo) { 594 int __tmp; 595 #ifdef __OPENMP_AMDGCN__ 596 #pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc) 597 #endif 598 float __r = __ocml_remquo_f32( 599 __x, __y, (__attribute__((address_space(5))) int *)&__tmp); 600 *__quo = __tmp; 601 602 return __r; 603 } 604 605 __DEVICE__ 606 float rhypotf(float __x, float __y) { return __ocml_rhypot_f32(__x, __y); } 607 608 __DEVICE__ 609 float rintf(float __x) { return __builtin_rintf(__x); } 610 611 __DEVICE__ 612 float rnorm3df(float __x, float __y, float __z) { 613 return __ocml_rlen3_f32(__x, __y, __z); 614 } 615 616 __DEVICE__ 617 float rnorm4df(float __x, float __y, float __z, float __w) { 618 return __ocml_rlen4_f32(__x, __y, __z, __w); 619 } 620 621 __DEVICE__ 622 float rnormf(int __dim, 623 const float *__a) { // TODO: placeholder until OCML adds support. 624 float __r = 0; 625 while (__dim--) { 626 __r += __a[0] * __a[0]; 627 ++__a; 628 } 629 630 return __ocml_rsqrt_f32(__r); 631 } 632 633 __DEVICE__ 634 float roundf(float __x) { return __builtin_roundf(__x); } 635 636 __DEVICE__ 637 float rsqrtf(float __x) { return __ocml_rsqrt_f32(__x); } 638 639 __DEVICE__ 640 float scalblnf(float __x, long int __n) { 641 return (__n < INT_MAX) ? __builtin_amdgcn_ldexpf(__x, __n) 642 : __ocml_scalb_f32(__x, __n); 643 } 644 645 __DEVICE__ 646 float scalbnf(float __x, int __n) { return __builtin_amdgcn_ldexpf(__x, __n); } 647 648 __DEVICE__ 649 __RETURN_TYPE __signbitf(float __x) { return __builtin_signbitf(__x); } 650 651 __DEVICE__ 652 void sincosf(float __x, float *__sinptr, float *__cosptr) { 653 float __tmp; 654 #ifdef __OPENMP_AMDGCN__ 655 #pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc) 656 #endif 657 #ifdef __CLANG_CUDA_APPROX_TRANSCENDENTALS__ 658 __sincosf(__x, __sinptr, __cosptr); 659 #else 660 *__sinptr = 661 __ocml_sincos_f32(__x, (__attribute__((address_space(5))) float *)&__tmp); 662 *__cosptr = __tmp; 663 #endif 664 } 665 666 __DEVICE__ 667 void sincospif(float __x, float *__sinptr, float *__cosptr) { 668 float __tmp; 669 #ifdef __OPENMP_AMDGCN__ 670 #pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc) 671 #endif 672 *__sinptr = __ocml_sincospi_f32( 673 __x, (__attribute__((address_space(5))) float *)&__tmp); 674 *__cosptr = __tmp; 675 } 676 677 __DEVICE__ 678 float sinf(float __x) { return __FAST_OR_SLOW(__sinf, __ocml_sin_f32)(__x); } 679 680 __DEVICE__ 681 float sinhf(float __x) { return __ocml_sinh_f32(__x); } 682 683 __DEVICE__ 684 float sinpif(float __x) { return __ocml_sinpi_f32(__x); } 685 686 __DEVICE__ 687 float sqrtf(float __x) { return __builtin_sqrtf(__x); } 688 689 __DEVICE__ 690 float tanf(float __x) { return __ocml_tan_f32(__x); } 691 692 __DEVICE__ 693 float tanhf(float __x) { return __ocml_tanh_f32(__x); } 694 695 __DEVICE__ 696 float tgammaf(float __x) { return __ocml_tgamma_f32(__x); } 697 698 __DEVICE__ 699 float truncf(float __x) { return __builtin_truncf(__x); } 700 701 __DEVICE__ 702 float y0f(float __x) { return __ocml_y0_f32(__x); } 703 704 __DEVICE__ 705 float y1f(float __x) { return __ocml_y1_f32(__x); } 706 707 __DEVICE__ 708 float ynf(int __n, float __x) { // TODO: we could use Ahmes multiplication 709 // and the Miller & Brown algorithm 710 // for linear recurrences to get O(log n) steps, but it's unclear if 711 // it'd be beneficial in this case. Placeholder until OCML adds 712 // support. 713 if (__n == 0) 714 return y0f(__x); 715 if (__n == 1) 716 return y1f(__x); 717 718 float __x0 = y0f(__x); 719 float __x1 = y1f(__x); 720 for (int __i = 1; __i < __n; ++__i) { 721 float __x2 = (2 * __i) / __x * __x1 - __x0; 722 __x0 = __x1; 723 __x1 = __x2; 724 } 725 726 return __x1; 727 } 728 729 730 // END FLOAT 731 732 // BEGIN DOUBLE 733 __DEVICE__ 734 double acos(double __x) { return __ocml_acos_f64(__x); } 735 736 __DEVICE__ 737 double acosh(double __x) { return __ocml_acosh_f64(__x); } 738 739 __DEVICE__ 740 double asin(double __x) { return __ocml_asin_f64(__x); } 741 742 __DEVICE__ 743 double asinh(double __x) { return __ocml_asinh_f64(__x); } 744 745 __DEVICE__ 746 double atan(double __x) { return __ocml_atan_f64(__x); } 747 748 __DEVICE__ 749 double atan2(double __x, double __y) { return __ocml_atan2_f64(__x, __y); } 750 751 __DEVICE__ 752 double atanh(double __x) { return __ocml_atanh_f64(__x); } 753 754 __DEVICE__ 755 double cbrt(double __x) { return __ocml_cbrt_f64(__x); } 756 757 __DEVICE__ 758 double ceil(double __x) { return __builtin_ceil(__x); } 759 760 __DEVICE__ 761 double copysign(double __x, double __y) { 762 return __builtin_copysign(__x, __y); 763 } 764 765 __DEVICE__ 766 double cos(double __x) { return __ocml_cos_f64(__x); } 767 768 __DEVICE__ 769 double cosh(double __x) { return __ocml_cosh_f64(__x); } 770 771 __DEVICE__ 772 double cospi(double __x) { return __ocml_cospi_f64(__x); } 773 774 __DEVICE__ 775 double cyl_bessel_i0(double __x) { return __ocml_i0_f64(__x); } 776 777 __DEVICE__ 778 double cyl_bessel_i1(double __x) { return __ocml_i1_f64(__x); } 779 780 __DEVICE__ 781 double erf(double __x) { return __ocml_erf_f64(__x); } 782 783 __DEVICE__ 784 double erfc(double __x) { return __ocml_erfc_f64(__x); } 785 786 __DEVICE__ 787 double erfcinv(double __x) { return __ocml_erfcinv_f64(__x); } 788 789 __DEVICE__ 790 double erfcx(double __x) { return __ocml_erfcx_f64(__x); } 791 792 __DEVICE__ 793 double erfinv(double __x) { return __ocml_erfinv_f64(__x); } 794 795 __DEVICE__ 796 double exp(double __x) { return __ocml_exp_f64(__x); } 797 798 __DEVICE__ 799 double exp10(double __x) { return __ocml_exp10_f64(__x); } 800 801 __DEVICE__ 802 double exp2(double __x) { return __ocml_exp2_f64(__x); } 803 804 __DEVICE__ 805 double expm1(double __x) { return __ocml_expm1_f64(__x); } 806 807 __DEVICE__ 808 double fabs(double __x) { return __builtin_fabs(__x); } 809 810 __DEVICE__ 811 double fdim(double __x, double __y) { return __ocml_fdim_f64(__x, __y); } 812 813 __DEVICE__ 814 double floor(double __x) { return __builtin_floor(__x); } 815 816 __DEVICE__ 817 double fma(double __x, double __y, double __z) { 818 return __builtin_fma(__x, __y, __z); 819 } 820 821 __DEVICE__ 822 double fmax(double __x, double __y) { return __builtin_fmax(__x, __y); } 823 824 __DEVICE__ 825 double fmin(double __x, double __y) { return __builtin_fmin(__x, __y); } 826 827 __DEVICE__ 828 double fmod(double __x, double __y) { return __ocml_fmod_f64(__x, __y); } 829 830 __DEVICE__ 831 double frexp(double __x, int *__nptr) { 832 return __builtin_frexp(__x, __nptr); 833 } 834 835 __DEVICE__ 836 double hypot(double __x, double __y) { return __ocml_hypot_f64(__x, __y); } 837 838 __DEVICE__ 839 int ilogb(double __x) { return __ocml_ilogb_f64(__x); } 840 841 __DEVICE__ 842 __RETURN_TYPE __finite(double __x) { return __builtin_isfinite(__x); } 843 844 __DEVICE__ 845 __RETURN_TYPE __isinf(double __x) { return __builtin_isinf(__x); } 846 847 __DEVICE__ 848 __RETURN_TYPE __isnan(double __x) { return __builtin_isnan(__x); } 849 850 __DEVICE__ 851 double j0(double __x) { return __ocml_j0_f64(__x); } 852 853 __DEVICE__ 854 double j1(double __x) { return __ocml_j1_f64(__x); } 855 856 __DEVICE__ 857 double jn(int __n, double __x) { // TODO: we could use Ahmes multiplication 858 // and the Miller & Brown algorithm 859 // for linear recurrences to get O(log n) steps, but it's unclear if 860 // it'd be beneficial in this case. Placeholder until OCML adds 861 // support. 862 if (__n == 0) 863 return j0(__x); 864 if (__n == 1) 865 return j1(__x); 866 867 double __x0 = j0(__x); 868 double __x1 = j1(__x); 869 for (int __i = 1; __i < __n; ++__i) { 870 double __x2 = (2 * __i) / __x * __x1 - __x0; 871 __x0 = __x1; 872 __x1 = __x2; 873 } 874 return __x1; 875 } 876 877 __DEVICE__ 878 double ldexp(double __x, int __e) { return __builtin_amdgcn_ldexp(__x, __e); } 879 880 __DEVICE__ 881 double lgamma(double __x) { return __ocml_lgamma_f64(__x); } 882 883 __DEVICE__ 884 long long int llrint(double __x) { return __builtin_rint(__x); } 885 886 __DEVICE__ 887 long long int llround(double __x) { return __builtin_round(__x); } 888 889 __DEVICE__ 890 double log(double __x) { return __ocml_log_f64(__x); } 891 892 __DEVICE__ 893 double log10(double __x) { return __ocml_log10_f64(__x); } 894 895 __DEVICE__ 896 double log1p(double __x) { return __ocml_log1p_f64(__x); } 897 898 __DEVICE__ 899 double log2(double __x) { return __ocml_log2_f64(__x); } 900 901 __DEVICE__ 902 double logb(double __x) { return __ocml_logb_f64(__x); } 903 904 __DEVICE__ 905 long int lrint(double __x) { return __builtin_rint(__x); } 906 907 __DEVICE__ 908 long int lround(double __x) { return __builtin_round(__x); } 909 910 __DEVICE__ 911 double modf(double __x, double *__iptr) { 912 double __tmp; 913 #ifdef __OPENMP_AMDGCN__ 914 #pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc) 915 #endif 916 double __r = 917 __ocml_modf_f64(__x, (__attribute__((address_space(5))) double *)&__tmp); 918 *__iptr = __tmp; 919 920 return __r; 921 } 922 923 __DEVICE__ 924 double nan(const char *__tagp) { 925 #if !_WIN32 926 union { 927 double val; 928 struct ieee_double { 929 uint64_t mantissa : 51; 930 uint32_t quiet : 1; 931 uint32_t exponent : 11; 932 uint32_t sign : 1; 933 } bits; 934 } __tmp; 935 __static_assert_type_size_equal(sizeof(__tmp.val), sizeof(__tmp.bits)); 936 937 __tmp.bits.sign = 0u; 938 __tmp.bits.exponent = ~0u; 939 __tmp.bits.quiet = 1u; 940 __tmp.bits.mantissa = __make_mantissa(__tagp); 941 942 return __tmp.val; 943 #else 944 __static_assert_type_size_equal(sizeof(uint64_t), sizeof(double)); 945 uint64_t __val = __make_mantissa(__tagp); 946 __val |= 0xFFF << 51; 947 return *reinterpret_cast<double *>(&__val); 948 #endif 949 } 950 951 __DEVICE__ 952 double nearbyint(double __x) { return __builtin_nearbyint(__x); } 953 954 __DEVICE__ 955 double nextafter(double __x, double __y) { 956 return __ocml_nextafter_f64(__x, __y); 957 } 958 959 __DEVICE__ 960 double norm(int __dim, 961 const double *__a) { // TODO: placeholder until OCML adds support. 962 double __r = 0; 963 while (__dim--) { 964 __r += __a[0] * __a[0]; 965 ++__a; 966 } 967 968 return __builtin_sqrt(__r); 969 } 970 971 __DEVICE__ 972 double norm3d(double __x, double __y, double __z) { 973 return __ocml_len3_f64(__x, __y, __z); 974 } 975 976 __DEVICE__ 977 double norm4d(double __x, double __y, double __z, double __w) { 978 return __ocml_len4_f64(__x, __y, __z, __w); 979 } 980 981 __DEVICE__ 982 double normcdf(double __x) { return __ocml_ncdf_f64(__x); } 983 984 __DEVICE__ 985 double normcdfinv(double __x) { return __ocml_ncdfinv_f64(__x); } 986 987 __DEVICE__ 988 double pow(double __x, double __y) { return __ocml_pow_f64(__x, __y); } 989 990 __DEVICE__ 991 double powi(double __x, int __y) { return __ocml_pown_f64(__x, __y); } 992 993 __DEVICE__ 994 double rcbrt(double __x) { return __ocml_rcbrt_f64(__x); } 995 996 __DEVICE__ 997 double remainder(double __x, double __y) { 998 return __ocml_remainder_f64(__x, __y); 999 } 1000 1001 __DEVICE__ 1002 double remquo(double __x, double __y, int *__quo) { 1003 int __tmp; 1004 #ifdef __OPENMP_AMDGCN__ 1005 #pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc) 1006 #endif 1007 double __r = __ocml_remquo_f64( 1008 __x, __y, (__attribute__((address_space(5))) int *)&__tmp); 1009 *__quo = __tmp; 1010 1011 return __r; 1012 } 1013 1014 __DEVICE__ 1015 double rhypot(double __x, double __y) { return __ocml_rhypot_f64(__x, __y); } 1016 1017 __DEVICE__ 1018 double rint(double __x) { return __builtin_rint(__x); } 1019 1020 __DEVICE__ 1021 double rnorm(int __dim, 1022 const double *__a) { // TODO: placeholder until OCML adds support. 1023 double __r = 0; 1024 while (__dim--) { 1025 __r += __a[0] * __a[0]; 1026 ++__a; 1027 } 1028 1029 return __ocml_rsqrt_f64(__r); 1030 } 1031 1032 __DEVICE__ 1033 double rnorm3d(double __x, double __y, double __z) { 1034 return __ocml_rlen3_f64(__x, __y, __z); 1035 } 1036 1037 __DEVICE__ 1038 double rnorm4d(double __x, double __y, double __z, double __w) { 1039 return __ocml_rlen4_f64(__x, __y, __z, __w); 1040 } 1041 1042 __DEVICE__ 1043 double round(double __x) { return __builtin_round(__x); } 1044 1045 __DEVICE__ 1046 double rsqrt(double __x) { return __ocml_rsqrt_f64(__x); } 1047 1048 __DEVICE__ 1049 double scalbln(double __x, long int __n) { 1050 return (__n < INT_MAX) ? __builtin_amdgcn_ldexp(__x, __n) 1051 : __ocml_scalb_f64(__x, __n); 1052 } 1053 __DEVICE__ 1054 double scalbn(double __x, int __n) { return __builtin_amdgcn_ldexp(__x, __n); } 1055 1056 __DEVICE__ 1057 __RETURN_TYPE __signbit(double __x) { return __builtin_signbit(__x); } 1058 1059 __DEVICE__ 1060 double sin(double __x) { return __ocml_sin_f64(__x); } 1061 1062 __DEVICE__ 1063 void sincos(double __x, double *__sinptr, double *__cosptr) { 1064 double __tmp; 1065 #ifdef __OPENMP_AMDGCN__ 1066 #pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc) 1067 #endif 1068 *__sinptr = __ocml_sincos_f64( 1069 __x, (__attribute__((address_space(5))) double *)&__tmp); 1070 *__cosptr = __tmp; 1071 } 1072 1073 __DEVICE__ 1074 void sincospi(double __x, double *__sinptr, double *__cosptr) { 1075 double __tmp; 1076 #ifdef __OPENMP_AMDGCN__ 1077 #pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc) 1078 #endif 1079 *__sinptr = __ocml_sincospi_f64( 1080 __x, (__attribute__((address_space(5))) double *)&__tmp); 1081 *__cosptr = __tmp; 1082 } 1083 1084 __DEVICE__ 1085 double sinh(double __x) { return __ocml_sinh_f64(__x); } 1086 1087 __DEVICE__ 1088 double sinpi(double __x) { return __ocml_sinpi_f64(__x); } 1089 1090 __DEVICE__ 1091 double sqrt(double __x) { return __builtin_sqrt(__x); } 1092 1093 __DEVICE__ 1094 double tan(double __x) { return __ocml_tan_f64(__x); } 1095 1096 __DEVICE__ 1097 double tanh(double __x) { return __ocml_tanh_f64(__x); } 1098 1099 __DEVICE__ 1100 double tgamma(double __x) { return __ocml_tgamma_f64(__x); } 1101 1102 __DEVICE__ 1103 double trunc(double __x) { return __builtin_trunc(__x); } 1104 1105 __DEVICE__ 1106 double y0(double __x) { return __ocml_y0_f64(__x); } 1107 1108 __DEVICE__ 1109 double y1(double __x) { return __ocml_y1_f64(__x); } 1110 1111 __DEVICE__ 1112 double yn(int __n, double __x) { // TODO: we could use Ahmes multiplication 1113 // and the Miller & Brown algorithm 1114 // for linear recurrences to get O(log n) steps, but it's unclear if 1115 // it'd be beneficial in this case. Placeholder until OCML adds 1116 // support. 1117 if (__n == 0) 1118 return y0(__x); 1119 if (__n == 1) 1120 return y1(__x); 1121 1122 double __x0 = y0(__x); 1123 double __x1 = y1(__x); 1124 for (int __i = 1; __i < __n; ++__i) { 1125 double __x2 = (2 * __i) / __x * __x1 - __x0; 1126 __x0 = __x1; 1127 __x1 = __x2; 1128 } 1129 1130 return __x1; 1131 } 1132 1133 // BEGIN INTRINSICS 1134 #if defined OCML_BASIC_ROUNDED_OPERATIONS 1135 __DEVICE__ 1136 double __dadd_rd(double __x, double __y) { 1137 return __ocml_add_rtn_f64(__x, __y); 1138 } 1139 __DEVICE__ 1140 double __dadd_rn(double __x, double __y) { 1141 return __ocml_add_rte_f64(__x, __y); 1142 } 1143 __DEVICE__ 1144 double __dadd_ru(double __x, double __y) { 1145 return __ocml_add_rtp_f64(__x, __y); 1146 } 1147 __DEVICE__ 1148 double __dadd_rz(double __x, double __y) { 1149 return __ocml_add_rtz_f64(__x, __y); 1150 } 1151 #else 1152 __DEVICE__ 1153 double __dadd_rn(double __x, double __y) { return __x + __y; } 1154 #endif 1155 1156 #if defined OCML_BASIC_ROUNDED_OPERATIONS 1157 __DEVICE__ 1158 double __ddiv_rd(double __x, double __y) { 1159 return __ocml_div_rtn_f64(__x, __y); 1160 } 1161 __DEVICE__ 1162 double __ddiv_rn(double __x, double __y) { 1163 return __ocml_div_rte_f64(__x, __y); 1164 } 1165 __DEVICE__ 1166 double __ddiv_ru(double __x, double __y) { 1167 return __ocml_div_rtp_f64(__x, __y); 1168 } 1169 __DEVICE__ 1170 double __ddiv_rz(double __x, double __y) { 1171 return __ocml_div_rtz_f64(__x, __y); 1172 } 1173 #else 1174 __DEVICE__ 1175 double __ddiv_rn(double __x, double __y) { return __x / __y; } 1176 #endif 1177 1178 #if defined OCML_BASIC_ROUNDED_OPERATIONS 1179 __DEVICE__ 1180 double __dmul_rd(double __x, double __y) { 1181 return __ocml_mul_rtn_f64(__x, __y); 1182 } 1183 __DEVICE__ 1184 double __dmul_rn(double __x, double __y) { 1185 return __ocml_mul_rte_f64(__x, __y); 1186 } 1187 __DEVICE__ 1188 double __dmul_ru(double __x, double __y) { 1189 return __ocml_mul_rtp_f64(__x, __y); 1190 } 1191 __DEVICE__ 1192 double __dmul_rz(double __x, double __y) { 1193 return __ocml_mul_rtz_f64(__x, __y); 1194 } 1195 #else 1196 __DEVICE__ 1197 double __dmul_rn(double __x, double __y) { return __x * __y; } 1198 #endif 1199 1200 #if defined OCML_BASIC_ROUNDED_OPERATIONS 1201 __DEVICE__ 1202 double __drcp_rd(double __x) { return __ocml_div_rtn_f64(1.0, __x); } 1203 __DEVICE__ 1204 double __drcp_rn(double __x) { return __ocml_div_rte_f64(1.0, __x); } 1205 __DEVICE__ 1206 double __drcp_ru(double __x) { return __ocml_div_rtp_f64(1.0, __x); } 1207 __DEVICE__ 1208 double __drcp_rz(double __x) { return __ocml_div_rtz_f64(1.0, __x); } 1209 #else 1210 __DEVICE__ 1211 double __drcp_rn(double __x) { return 1.0 / __x; } 1212 #endif 1213 1214 #if defined OCML_BASIC_ROUNDED_OPERATIONS 1215 __DEVICE__ 1216 double __dsqrt_rd(double __x) { return __ocml_sqrt_rtn_f64(__x); } 1217 __DEVICE__ 1218 double __dsqrt_rn(double __x) { return __ocml_sqrt_rte_f64(__x); } 1219 __DEVICE__ 1220 double __dsqrt_ru(double __x) { return __ocml_sqrt_rtp_f64(__x); } 1221 __DEVICE__ 1222 double __dsqrt_rz(double __x) { return __ocml_sqrt_rtz_f64(__x); } 1223 #else 1224 __DEVICE__ 1225 double __dsqrt_rn(double __x) { return __builtin_sqrt(__x); } 1226 #endif 1227 1228 #if defined OCML_BASIC_ROUNDED_OPERATIONS 1229 __DEVICE__ 1230 double __dsub_rd(double __x, double __y) { 1231 return __ocml_sub_rtn_f64(__x, __y); 1232 } 1233 __DEVICE__ 1234 double __dsub_rn(double __x, double __y) { 1235 return __ocml_sub_rte_f64(__x, __y); 1236 } 1237 __DEVICE__ 1238 double __dsub_ru(double __x, double __y) { 1239 return __ocml_sub_rtp_f64(__x, __y); 1240 } 1241 __DEVICE__ 1242 double __dsub_rz(double __x, double __y) { 1243 return __ocml_sub_rtz_f64(__x, __y); 1244 } 1245 #else 1246 __DEVICE__ 1247 double __dsub_rn(double __x, double __y) { return __x - __y; } 1248 #endif 1249 1250 #if defined OCML_BASIC_ROUNDED_OPERATIONS 1251 __DEVICE__ 1252 double __fma_rd(double __x, double __y, double __z) { 1253 return __ocml_fma_rtn_f64(__x, __y, __z); 1254 } 1255 __DEVICE__ 1256 double __fma_rn(double __x, double __y, double __z) { 1257 return __ocml_fma_rte_f64(__x, __y, __z); 1258 } 1259 __DEVICE__ 1260 double __fma_ru(double __x, double __y, double __z) { 1261 return __ocml_fma_rtp_f64(__x, __y, __z); 1262 } 1263 __DEVICE__ 1264 double __fma_rz(double __x, double __y, double __z) { 1265 return __ocml_fma_rtz_f64(__x, __y, __z); 1266 } 1267 #else 1268 __DEVICE__ 1269 double __fma_rn(double __x, double __y, double __z) { 1270 return __builtin_fma(__x, __y, __z); 1271 } 1272 #endif 1273 // END INTRINSICS 1274 // END DOUBLE 1275 1276 // C only macros 1277 #if !defined(__cplusplus) && __STDC_VERSION__ >= 201112L 1278 #define isfinite(__x) _Generic((__x), float : __finitef, double : __finite)(__x) 1279 #define isinf(__x) _Generic((__x), float : __isinff, double : __isinf)(__x) 1280 #define isnan(__x) _Generic((__x), float : __isnanf, double : __isnan)(__x) 1281 #define signbit(__x) \ 1282 _Generic((__x), float : __signbitf, double : __signbit)(__x) 1283 #endif // !defined(__cplusplus) && __STDC_VERSION__ >= 201112L 1284 1285 #if defined(__cplusplus) 1286 template <class T> __DEVICE__ T min(T __arg1, T __arg2) { 1287 return (__arg1 < __arg2) ? __arg1 : __arg2; 1288 } 1289 1290 template <class T> __DEVICE__ T max(T __arg1, T __arg2) { 1291 return (__arg1 > __arg2) ? __arg1 : __arg2; 1292 } 1293 1294 __DEVICE__ int min(int __arg1, int __arg2) { 1295 return (__arg1 < __arg2) ? __arg1 : __arg2; 1296 } 1297 __DEVICE__ int max(int __arg1, int __arg2) { 1298 return (__arg1 > __arg2) ? __arg1 : __arg2; 1299 } 1300 1301 __DEVICE__ 1302 float max(float __x, float __y) { return __builtin_fmaxf(__x, __y); } 1303 1304 __DEVICE__ 1305 double max(double __x, double __y) { return __builtin_fmax(__x, __y); } 1306 1307 __DEVICE__ 1308 float min(float __x, float __y) { return __builtin_fminf(__x, __y); } 1309 1310 __DEVICE__ 1311 double min(double __x, double __y) { return __builtin_fmin(__x, __y); } 1312 1313 #if !defined(__HIPCC_RTC__) && !defined(__OPENMP_AMDGCN__) 1314 __host__ inline static int min(int __arg1, int __arg2) { 1315 return __arg1 < __arg2 ? __arg1 : __arg2; 1316 } 1317 1318 __host__ inline static int max(int __arg1, int __arg2) { 1319 return __arg1 > __arg2 ? __arg1 : __arg2; 1320 } 1321 #endif // !defined(__HIPCC_RTC__) && !defined(__OPENMP_AMDGCN__) 1322 #endif 1323 1324 #pragma pop_macro("__DEVICE__") 1325 #pragma pop_macro("__RETURN_TYPE") 1326 #pragma pop_macro("__FAST_OR_SLOW") 1327 1328 #endif // __CLANG_GPU_DISABLE_MATH_WRAPPERS 1329 #endif // __CLANG_HIP_MATH_H__ 1330