xref: /freebsd-src/contrib/llvm-project/clang/lib/Headers/__clang_cuda_intrinsics.h (revision 0fca6ea1d4eea4c934cfff25ac9ee8ad6fe95583)
10b57cec5SDimitry Andric /*===--- __clang_cuda_intrinsics.h - Device-side CUDA intrinsic wrappers ---===
20b57cec5SDimitry Andric  *
30b57cec5SDimitry Andric  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
40b57cec5SDimitry Andric  * See https://llvm.org/LICENSE.txt for license information.
50b57cec5SDimitry Andric  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
60b57cec5SDimitry Andric  *
70b57cec5SDimitry Andric  *===-----------------------------------------------------------------------===
80b57cec5SDimitry Andric  */
90b57cec5SDimitry Andric #ifndef __CLANG_CUDA_INTRINSICS_H__
100b57cec5SDimitry Andric #define __CLANG_CUDA_INTRINSICS_H__
110b57cec5SDimitry Andric #ifndef __CUDA__
120b57cec5SDimitry Andric #error "This file is for CUDA compilation only."
130b57cec5SDimitry Andric #endif
140b57cec5SDimitry Andric 
150b57cec5SDimitry Andric // sm_30 intrinsics: __shfl_{up,down,xor}.
160b57cec5SDimitry Andric 
170b57cec5SDimitry Andric #define __SM_30_INTRINSICS_H__
180b57cec5SDimitry Andric #define __SM_30_INTRINSICS_HPP__
190b57cec5SDimitry Andric 
200b57cec5SDimitry Andric #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300
210b57cec5SDimitry Andric 
220b57cec5SDimitry Andric #pragma push_macro("__MAKE_SHUFFLES")
230b57cec5SDimitry Andric #define __MAKE_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic, __Mask,    \
240b57cec5SDimitry Andric                         __Type)                                                \
250b57cec5SDimitry Andric   inline __device__ int __FnName(int __val, __Type __offset,                   \
260b57cec5SDimitry Andric                                  int __width = warpSize) {                     \
270b57cec5SDimitry Andric     return __IntIntrinsic(__val, __offset,                                     \
280b57cec5SDimitry Andric                           ((warpSize - __width) << 8) | (__Mask));             \
290b57cec5SDimitry Andric   }                                                                            \
300b57cec5SDimitry Andric   inline __device__ float __FnName(float __val, __Type __offset,               \
310b57cec5SDimitry Andric                                    int __width = warpSize) {                   \
320b57cec5SDimitry Andric     return __FloatIntrinsic(__val, __offset,                                   \
330b57cec5SDimitry Andric                             ((warpSize - __width) << 8) | (__Mask));           \
340b57cec5SDimitry Andric   }                                                                            \
350b57cec5SDimitry Andric   inline __device__ unsigned int __FnName(unsigned int __val, __Type __offset, \
360b57cec5SDimitry Andric                                           int __width = warpSize) {            \
370b57cec5SDimitry Andric     return static_cast<unsigned int>(                                          \
380b57cec5SDimitry Andric         ::__FnName(static_cast<int>(__val), __offset, __width));               \
390b57cec5SDimitry Andric   }                                                                            \
400b57cec5SDimitry Andric   inline __device__ long long __FnName(long long __val, __Type __offset,       \
410b57cec5SDimitry Andric                                        int __width = warpSize) {               \
420b57cec5SDimitry Andric     struct __Bits {                                                            \
430b57cec5SDimitry Andric       int __a, __b;                                                            \
440b57cec5SDimitry Andric     };                                                                         \
450b57cec5SDimitry Andric     _Static_assert(sizeof(__val) == sizeof(__Bits));                           \
460b57cec5SDimitry Andric     _Static_assert(sizeof(__Bits) == 2 * sizeof(int));                         \
470b57cec5SDimitry Andric     __Bits __tmp;                                                              \
4813138422SDimitry Andric     memcpy(&__tmp, &__val, sizeof(__val));                                \
490b57cec5SDimitry Andric     __tmp.__a = ::__FnName(__tmp.__a, __offset, __width);                      \
500b57cec5SDimitry Andric     __tmp.__b = ::__FnName(__tmp.__b, __offset, __width);                      \
510b57cec5SDimitry Andric     long long __ret;                                                           \
520b57cec5SDimitry Andric     memcpy(&__ret, &__tmp, sizeof(__tmp));                                     \
530b57cec5SDimitry Andric     return __ret;                                                              \
540b57cec5SDimitry Andric   }                                                                            \
550b57cec5SDimitry Andric   inline __device__ long __FnName(long __val, __Type __offset,                 \
560b57cec5SDimitry Andric                                   int __width = warpSize) {                    \
570b57cec5SDimitry Andric     _Static_assert(sizeof(long) == sizeof(long long) ||                        \
580b57cec5SDimitry Andric                    sizeof(long) == sizeof(int));                               \
590b57cec5SDimitry Andric     if (sizeof(long) == sizeof(long long)) {                                   \
600b57cec5SDimitry Andric       return static_cast<long>(                                                \
610b57cec5SDimitry Andric           ::__FnName(static_cast<long long>(__val), __offset, __width));       \
620b57cec5SDimitry Andric     } else if (sizeof(long) == sizeof(int)) {                                  \
630b57cec5SDimitry Andric       return static_cast<long>(                                                \
640b57cec5SDimitry Andric           ::__FnName(static_cast<int>(__val), __offset, __width));             \
650b57cec5SDimitry Andric     }                                                                          \
660b57cec5SDimitry Andric   }                                                                            \
670b57cec5SDimitry Andric   inline __device__ unsigned long __FnName(                                    \
680b57cec5SDimitry Andric       unsigned long __val, __Type __offset, int __width = warpSize) {          \
690b57cec5SDimitry Andric     return static_cast<unsigned long>(                                         \
700b57cec5SDimitry Andric         ::__FnName(static_cast<long>(__val), __offset, __width));              \
710b57cec5SDimitry Andric   }                                                                            \
720b57cec5SDimitry Andric   inline __device__ unsigned long long __FnName(                               \
730b57cec5SDimitry Andric       unsigned long long __val, __Type __offset, int __width = warpSize) {     \
74fcaf7f86SDimitry Andric     return static_cast<unsigned long long>(                                    \
75fcaf7f86SDimitry Andric         ::__FnName(static_cast<long long>(__val), __offset, __width));         \
760b57cec5SDimitry Andric   }                                                                            \
770b57cec5SDimitry Andric   inline __device__ double __FnName(double __val, __Type __offset,             \
780b57cec5SDimitry Andric                                     int __width = warpSize) {                  \
790b57cec5SDimitry Andric     long long __tmp;                                                           \
800b57cec5SDimitry Andric     _Static_assert(sizeof(__tmp) == sizeof(__val));                            \
810b57cec5SDimitry Andric     memcpy(&__tmp, &__val, sizeof(__val));                                     \
820b57cec5SDimitry Andric     __tmp = ::__FnName(__tmp, __offset, __width);                              \
830b57cec5SDimitry Andric     double __ret;                                                              \
840b57cec5SDimitry Andric     memcpy(&__ret, &__tmp, sizeof(__ret));                                     \
850b57cec5SDimitry Andric     return __ret;                                                              \
860b57cec5SDimitry Andric   }
870b57cec5SDimitry Andric 
880b57cec5SDimitry Andric __MAKE_SHUFFLES(__shfl, __nvvm_shfl_idx_i32, __nvvm_shfl_idx_f32, 0x1f, int);
890b57cec5SDimitry Andric // We use 0 rather than 31 as our mask, because shfl.up applies to lanes >=
900b57cec5SDimitry Andric // maxLane.
910b57cec5SDimitry Andric __MAKE_SHUFFLES(__shfl_up, __nvvm_shfl_up_i32, __nvvm_shfl_up_f32, 0,
920b57cec5SDimitry Andric                 unsigned int);
930b57cec5SDimitry Andric __MAKE_SHUFFLES(__shfl_down, __nvvm_shfl_down_i32, __nvvm_shfl_down_f32, 0x1f,
940b57cec5SDimitry Andric                 unsigned int);
950b57cec5SDimitry Andric __MAKE_SHUFFLES(__shfl_xor, __nvvm_shfl_bfly_i32, __nvvm_shfl_bfly_f32, 0x1f,
960b57cec5SDimitry Andric                 int);
970b57cec5SDimitry Andric #pragma pop_macro("__MAKE_SHUFFLES")
980b57cec5SDimitry Andric 
990b57cec5SDimitry Andric #endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300
1000b57cec5SDimitry Andric 
1010b57cec5SDimitry Andric #if CUDA_VERSION >= 9000
1020b57cec5SDimitry Andric #if (!defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300)
1030b57cec5SDimitry Andric // __shfl_sync_* variants available in CUDA-9
1040b57cec5SDimitry Andric #pragma push_macro("__MAKE_SYNC_SHUFFLES")
1050b57cec5SDimitry Andric #define __MAKE_SYNC_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic,       \
1060b57cec5SDimitry Andric                              __Mask, __Type)                                   \
1070b57cec5SDimitry Andric   inline __device__ int __FnName(unsigned int __mask, int __val,               \
1080b57cec5SDimitry Andric                                  __Type __offset, int __width = warpSize) {    \
1090b57cec5SDimitry Andric     return __IntIntrinsic(__mask, __val, __offset,                             \
1100b57cec5SDimitry Andric                           ((warpSize - __width) << 8) | (__Mask));             \
1110b57cec5SDimitry Andric   }                                                                            \
1120b57cec5SDimitry Andric   inline __device__ float __FnName(unsigned int __mask, float __val,           \
1130b57cec5SDimitry Andric                                    __Type __offset, int __width = warpSize) {  \
1140b57cec5SDimitry Andric     return __FloatIntrinsic(__mask, __val, __offset,                           \
1150b57cec5SDimitry Andric                             ((warpSize - __width) << 8) | (__Mask));           \
1160b57cec5SDimitry Andric   }                                                                            \
1170b57cec5SDimitry Andric   inline __device__ unsigned int __FnName(unsigned int __mask,                 \
1180b57cec5SDimitry Andric                                           unsigned int __val, __Type __offset, \
1190b57cec5SDimitry Andric                                           int __width = warpSize) {            \
1200b57cec5SDimitry Andric     return static_cast<unsigned int>(                                          \
1210b57cec5SDimitry Andric         ::__FnName(__mask, static_cast<int>(__val), __offset, __width));       \
1220b57cec5SDimitry Andric   }                                                                            \
1230b57cec5SDimitry Andric   inline __device__ long long __FnName(unsigned int __mask, long long __val,   \
1240b57cec5SDimitry Andric                                        __Type __offset,                        \
1250b57cec5SDimitry Andric                                        int __width = warpSize) {               \
1260b57cec5SDimitry Andric     struct __Bits {                                                            \
1270b57cec5SDimitry Andric       int __a, __b;                                                            \
1280b57cec5SDimitry Andric     };                                                                         \
1290b57cec5SDimitry Andric     _Static_assert(sizeof(__val) == sizeof(__Bits));                           \
1300b57cec5SDimitry Andric     _Static_assert(sizeof(__Bits) == 2 * sizeof(int));                         \
1310b57cec5SDimitry Andric     __Bits __tmp;                                                              \
13213138422SDimitry Andric     memcpy(&__tmp, &__val, sizeof(__val));                                     \
1330b57cec5SDimitry Andric     __tmp.__a = ::__FnName(__mask, __tmp.__a, __offset, __width);              \
1340b57cec5SDimitry Andric     __tmp.__b = ::__FnName(__mask, __tmp.__b, __offset, __width);              \
1350b57cec5SDimitry Andric     long long __ret;                                                           \
1360b57cec5SDimitry Andric     memcpy(&__ret, &__tmp, sizeof(__tmp));                                     \
1370b57cec5SDimitry Andric     return __ret;                                                              \
1380b57cec5SDimitry Andric   }                                                                            \
1390b57cec5SDimitry Andric   inline __device__ unsigned long long __FnName(                               \
1400b57cec5SDimitry Andric       unsigned int __mask, unsigned long long __val, __Type __offset,          \
1410b57cec5SDimitry Andric       int __width = warpSize) {                                                \
142fcaf7f86SDimitry Andric     return static_cast<unsigned long long>(                                    \
143fcaf7f86SDimitry Andric         ::__FnName(__mask, static_cast<long long>(__val), __offset, __width)); \
1440b57cec5SDimitry Andric   }                                                                            \
1450b57cec5SDimitry Andric   inline __device__ long __FnName(unsigned int __mask, long __val,             \
1460b57cec5SDimitry Andric                                   __Type __offset, int __width = warpSize) {   \
1470b57cec5SDimitry Andric     _Static_assert(sizeof(long) == sizeof(long long) ||                        \
1480b57cec5SDimitry Andric                    sizeof(long) == sizeof(int));                               \
1490b57cec5SDimitry Andric     if (sizeof(long) == sizeof(long long)) {                                   \
1500b57cec5SDimitry Andric       return static_cast<long>(::__FnName(                                     \
1510b57cec5SDimitry Andric           __mask, static_cast<long long>(__val), __offset, __width));          \
1520b57cec5SDimitry Andric     } else if (sizeof(long) == sizeof(int)) {                                  \
1530b57cec5SDimitry Andric       return static_cast<long>(                                                \
1540b57cec5SDimitry Andric           ::__FnName(__mask, static_cast<int>(__val), __offset, __width));     \
1550b57cec5SDimitry Andric     }                                                                          \
1560b57cec5SDimitry Andric   }                                                                            \
1570b57cec5SDimitry Andric   inline __device__ unsigned long __FnName(                                    \
1580b57cec5SDimitry Andric       unsigned int __mask, unsigned long __val, __Type __offset,               \
1590b57cec5SDimitry Andric       int __width = warpSize) {                                                \
1600b57cec5SDimitry Andric     return static_cast<unsigned long>(                                         \
1610b57cec5SDimitry Andric         ::__FnName(__mask, static_cast<long>(__val), __offset, __width));      \
1620b57cec5SDimitry Andric   }                                                                            \
1630b57cec5SDimitry Andric   inline __device__ double __FnName(unsigned int __mask, double __val,         \
1640b57cec5SDimitry Andric                                     __Type __offset, int __width = warpSize) { \
1650b57cec5SDimitry Andric     long long __tmp;                                                           \
1660b57cec5SDimitry Andric     _Static_assert(sizeof(__tmp) == sizeof(__val));                            \
1670b57cec5SDimitry Andric     memcpy(&__tmp, &__val, sizeof(__val));                                     \
1680b57cec5SDimitry Andric     __tmp = ::__FnName(__mask, __tmp, __offset, __width);                      \
1690b57cec5SDimitry Andric     double __ret;                                                              \
1700b57cec5SDimitry Andric     memcpy(&__ret, &__tmp, sizeof(__ret));                                     \
1710b57cec5SDimitry Andric     return __ret;                                                              \
1720b57cec5SDimitry Andric   }
1730b57cec5SDimitry Andric __MAKE_SYNC_SHUFFLES(__shfl_sync, __nvvm_shfl_sync_idx_i32,
1740b57cec5SDimitry Andric                      __nvvm_shfl_sync_idx_f32, 0x1f, int);
1750b57cec5SDimitry Andric // We use 0 rather than 31 as our mask, because shfl.up applies to lanes >=
1760b57cec5SDimitry Andric // maxLane.
1770b57cec5SDimitry Andric __MAKE_SYNC_SHUFFLES(__shfl_up_sync, __nvvm_shfl_sync_up_i32,
1780b57cec5SDimitry Andric                      __nvvm_shfl_sync_up_f32, 0, unsigned int);
1790b57cec5SDimitry Andric __MAKE_SYNC_SHUFFLES(__shfl_down_sync, __nvvm_shfl_sync_down_i32,
1800b57cec5SDimitry Andric                      __nvvm_shfl_sync_down_f32, 0x1f, unsigned int);
1810b57cec5SDimitry Andric __MAKE_SYNC_SHUFFLES(__shfl_xor_sync, __nvvm_shfl_sync_bfly_i32,
1820b57cec5SDimitry Andric                      __nvvm_shfl_sync_bfly_f32, 0x1f, int);
1830b57cec5SDimitry Andric #pragma pop_macro("__MAKE_SYNC_SHUFFLES")
1840b57cec5SDimitry Andric 
1850b57cec5SDimitry Andric inline __device__ void __syncwarp(unsigned int mask = 0xffffffff) {
1860b57cec5SDimitry Andric   return __nvvm_bar_warp_sync(mask);
1870b57cec5SDimitry Andric }
1880b57cec5SDimitry Andric 
1890b57cec5SDimitry Andric inline __device__ void __barrier_sync(unsigned int id) {
1900b57cec5SDimitry Andric   __nvvm_barrier_sync(id);
1910b57cec5SDimitry Andric }
1920b57cec5SDimitry Andric 
1930b57cec5SDimitry Andric inline __device__ void __barrier_sync_count(unsigned int id,
1940b57cec5SDimitry Andric                                             unsigned int count) {
1950b57cec5SDimitry Andric   __nvvm_barrier_sync_cnt(id, count);
1960b57cec5SDimitry Andric }
1970b57cec5SDimitry Andric 
1980b57cec5SDimitry Andric inline __device__ int __all_sync(unsigned int mask, int pred) {
1990b57cec5SDimitry Andric   return __nvvm_vote_all_sync(mask, pred);
2000b57cec5SDimitry Andric }
2010b57cec5SDimitry Andric 
2020b57cec5SDimitry Andric inline __device__ int __any_sync(unsigned int mask, int pred) {
2030b57cec5SDimitry Andric   return __nvvm_vote_any_sync(mask, pred);
2040b57cec5SDimitry Andric }
2050b57cec5SDimitry Andric 
2060b57cec5SDimitry Andric inline __device__ int __uni_sync(unsigned int mask, int pred) {
2070b57cec5SDimitry Andric   return __nvvm_vote_uni_sync(mask, pred);
2080b57cec5SDimitry Andric }
2090b57cec5SDimitry Andric 
2100b57cec5SDimitry Andric inline __device__ unsigned int __ballot_sync(unsigned int mask, int pred) {
2110b57cec5SDimitry Andric   return __nvvm_vote_ballot_sync(mask, pred);
2120b57cec5SDimitry Andric }
2130b57cec5SDimitry Andric 
214a7dea167SDimitry Andric inline __device__ unsigned int __activemask() {
215a7dea167SDimitry Andric #if CUDA_VERSION < 9020
216a7dea167SDimitry Andric   return __nvvm_vote_ballot(1);
217a7dea167SDimitry Andric #else
218*0fca6ea1SDimitry Andric   return __nvvm_activemask();
219a7dea167SDimitry Andric #endif
220a7dea167SDimitry Andric }
2210b57cec5SDimitry Andric 
2220b57cec5SDimitry Andric inline __device__ unsigned int __fns(unsigned mask, unsigned base, int offset) {
2230b57cec5SDimitry Andric   return __nvvm_fns(mask, base, offset);
2240b57cec5SDimitry Andric }
2250b57cec5SDimitry Andric 
2260b57cec5SDimitry Andric #endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300
2270b57cec5SDimitry Andric 
2280b57cec5SDimitry Andric // Define __match* builtins CUDA-9 headers expect to see.
2290b57cec5SDimitry Andric #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700
2300b57cec5SDimitry Andric inline __device__ unsigned int __match32_any_sync(unsigned int mask,
2310b57cec5SDimitry Andric                                                   unsigned int value) {
2320b57cec5SDimitry Andric   return __nvvm_match_any_sync_i32(mask, value);
2330b57cec5SDimitry Andric }
2340b57cec5SDimitry Andric 
23581ad6265SDimitry Andric inline __device__ unsigned int
2360b57cec5SDimitry Andric __match64_any_sync(unsigned int mask, unsigned long long value) {
2370b57cec5SDimitry Andric   return __nvvm_match_any_sync_i64(mask, value);
2380b57cec5SDimitry Andric }
2390b57cec5SDimitry Andric 
2400b57cec5SDimitry Andric inline __device__ unsigned int
2410b57cec5SDimitry Andric __match32_all_sync(unsigned int mask, unsigned int value, int *pred) {
2420b57cec5SDimitry Andric   return __nvvm_match_all_sync_i32p(mask, value, pred);
2430b57cec5SDimitry Andric }
2440b57cec5SDimitry Andric 
24581ad6265SDimitry Andric inline __device__ unsigned int
2460b57cec5SDimitry Andric __match64_all_sync(unsigned int mask, unsigned long long value, int *pred) {
2470b57cec5SDimitry Andric   return __nvvm_match_all_sync_i64p(mask, value, pred);
2480b57cec5SDimitry Andric }
2490b57cec5SDimitry Andric #include "crt/sm_70_rt.hpp"
2500b57cec5SDimitry Andric 
2510b57cec5SDimitry Andric #endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700
2520b57cec5SDimitry Andric #endif // __CUDA_VERSION >= 9000
2530b57cec5SDimitry Andric 
2540b57cec5SDimitry Andric // sm_32 intrinsics: __ldg and __funnelshift_{l,lc,r,rc}.
2550b57cec5SDimitry Andric 
2560b57cec5SDimitry Andric // Prevent the vanilla sm_32 intrinsics header from being included.
2570b57cec5SDimitry Andric #define __SM_32_INTRINSICS_H__
2580b57cec5SDimitry Andric #define __SM_32_INTRINSICS_HPP__
2590b57cec5SDimitry Andric 
2600b57cec5SDimitry Andric #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320
2610b57cec5SDimitry Andric 
2620b57cec5SDimitry Andric inline __device__ char __ldg(const char *ptr) { return __nvvm_ldg_c(ptr); }
2630b57cec5SDimitry Andric inline __device__ short __ldg(const short *ptr) { return __nvvm_ldg_s(ptr); }
2640b57cec5SDimitry Andric inline __device__ int __ldg(const int *ptr) { return __nvvm_ldg_i(ptr); }
2650b57cec5SDimitry Andric inline __device__ long __ldg(const long *ptr) { return __nvvm_ldg_l(ptr); }
2660b57cec5SDimitry Andric inline __device__ long long __ldg(const long long *ptr) {
2670b57cec5SDimitry Andric   return __nvvm_ldg_ll(ptr);
2680b57cec5SDimitry Andric }
2690b57cec5SDimitry Andric inline __device__ unsigned char __ldg(const unsigned char *ptr) {
2700b57cec5SDimitry Andric   return __nvvm_ldg_uc(ptr);
2710b57cec5SDimitry Andric }
2720b57cec5SDimitry Andric inline __device__ signed char __ldg(const signed char *ptr) {
2730b57cec5SDimitry Andric   return __nvvm_ldg_uc((const unsigned char *)ptr);
2740b57cec5SDimitry Andric }
2750b57cec5SDimitry Andric inline __device__ unsigned short __ldg(const unsigned short *ptr) {
2760b57cec5SDimitry Andric   return __nvvm_ldg_us(ptr);
2770b57cec5SDimitry Andric }
2780b57cec5SDimitry Andric inline __device__ unsigned int __ldg(const unsigned int *ptr) {
2790b57cec5SDimitry Andric   return __nvvm_ldg_ui(ptr);
2800b57cec5SDimitry Andric }
2810b57cec5SDimitry Andric inline __device__ unsigned long __ldg(const unsigned long *ptr) {
2820b57cec5SDimitry Andric   return __nvvm_ldg_ul(ptr);
2830b57cec5SDimitry Andric }
2840b57cec5SDimitry Andric inline __device__ unsigned long long __ldg(const unsigned long long *ptr) {
2850b57cec5SDimitry Andric   return __nvvm_ldg_ull(ptr);
2860b57cec5SDimitry Andric }
2870b57cec5SDimitry Andric inline __device__ float __ldg(const float *ptr) { return __nvvm_ldg_f(ptr); }
2880b57cec5SDimitry Andric inline __device__ double __ldg(const double *ptr) { return __nvvm_ldg_d(ptr); }
2890b57cec5SDimitry Andric 
2900b57cec5SDimitry Andric inline __device__ char2 __ldg(const char2 *ptr) {
2910b57cec5SDimitry Andric   typedef char c2 __attribute__((ext_vector_type(2)));
2920b57cec5SDimitry Andric   // We can assume that ptr is aligned at least to char2's alignment, but the
2930b57cec5SDimitry Andric   // load will assume that ptr is aligned to char2's alignment.  This is only
2940b57cec5SDimitry Andric   // safe if alignof(c2) <= alignof(char2).
2950b57cec5SDimitry Andric   c2 rv = __nvvm_ldg_c2(reinterpret_cast<const c2 *>(ptr));
2960b57cec5SDimitry Andric   char2 ret;
2970b57cec5SDimitry Andric   ret.x = rv[0];
2980b57cec5SDimitry Andric   ret.y = rv[1];
2990b57cec5SDimitry Andric   return ret;
3000b57cec5SDimitry Andric }
3010b57cec5SDimitry Andric inline __device__ char4 __ldg(const char4 *ptr) {
3020b57cec5SDimitry Andric   typedef char c4 __attribute__((ext_vector_type(4)));
3030b57cec5SDimitry Andric   c4 rv = __nvvm_ldg_c4(reinterpret_cast<const c4 *>(ptr));
3040b57cec5SDimitry Andric   char4 ret;
3050b57cec5SDimitry Andric   ret.x = rv[0];
3060b57cec5SDimitry Andric   ret.y = rv[1];
3070b57cec5SDimitry Andric   ret.z = rv[2];
3080b57cec5SDimitry Andric   ret.w = rv[3];
3090b57cec5SDimitry Andric   return ret;
3100b57cec5SDimitry Andric }
3110b57cec5SDimitry Andric inline __device__ short2 __ldg(const short2 *ptr) {
3120b57cec5SDimitry Andric   typedef short s2 __attribute__((ext_vector_type(2)));
3130b57cec5SDimitry Andric   s2 rv = __nvvm_ldg_s2(reinterpret_cast<const s2 *>(ptr));
3140b57cec5SDimitry Andric   short2 ret;
3150b57cec5SDimitry Andric   ret.x = rv[0];
3160b57cec5SDimitry Andric   ret.y = rv[1];
3170b57cec5SDimitry Andric   return ret;
3180b57cec5SDimitry Andric }
3190b57cec5SDimitry Andric inline __device__ short4 __ldg(const short4 *ptr) {
3200b57cec5SDimitry Andric   typedef short s4 __attribute__((ext_vector_type(4)));
3210b57cec5SDimitry Andric   s4 rv = __nvvm_ldg_s4(reinterpret_cast<const s4 *>(ptr));
3220b57cec5SDimitry Andric   short4 ret;
3230b57cec5SDimitry Andric   ret.x = rv[0];
3240b57cec5SDimitry Andric   ret.y = rv[1];
3250b57cec5SDimitry Andric   ret.z = rv[2];
3260b57cec5SDimitry Andric   ret.w = rv[3];
3270b57cec5SDimitry Andric   return ret;
3280b57cec5SDimitry Andric }
3290b57cec5SDimitry Andric inline __device__ int2 __ldg(const int2 *ptr) {
3300b57cec5SDimitry Andric   typedef int i2 __attribute__((ext_vector_type(2)));
3310b57cec5SDimitry Andric   i2 rv = __nvvm_ldg_i2(reinterpret_cast<const i2 *>(ptr));
3320b57cec5SDimitry Andric   int2 ret;
3330b57cec5SDimitry Andric   ret.x = rv[0];
3340b57cec5SDimitry Andric   ret.y = rv[1];
3350b57cec5SDimitry Andric   return ret;
3360b57cec5SDimitry Andric }
3370b57cec5SDimitry Andric inline __device__ int4 __ldg(const int4 *ptr) {
3380b57cec5SDimitry Andric   typedef int i4 __attribute__((ext_vector_type(4)));
3390b57cec5SDimitry Andric   i4 rv = __nvvm_ldg_i4(reinterpret_cast<const i4 *>(ptr));
3400b57cec5SDimitry Andric   int4 ret;
3410b57cec5SDimitry Andric   ret.x = rv[0];
3420b57cec5SDimitry Andric   ret.y = rv[1];
3430b57cec5SDimitry Andric   ret.z = rv[2];
3440b57cec5SDimitry Andric   ret.w = rv[3];
3450b57cec5SDimitry Andric   return ret;
3460b57cec5SDimitry Andric }
3470b57cec5SDimitry Andric inline __device__ longlong2 __ldg(const longlong2 *ptr) {
3480b57cec5SDimitry Andric   typedef long long ll2 __attribute__((ext_vector_type(2)));
3490b57cec5SDimitry Andric   ll2 rv = __nvvm_ldg_ll2(reinterpret_cast<const ll2 *>(ptr));
3500b57cec5SDimitry Andric   longlong2 ret;
3510b57cec5SDimitry Andric   ret.x = rv[0];
3520b57cec5SDimitry Andric   ret.y = rv[1];
3530b57cec5SDimitry Andric   return ret;
3540b57cec5SDimitry Andric }
3550b57cec5SDimitry Andric 
3560b57cec5SDimitry Andric inline __device__ uchar2 __ldg(const uchar2 *ptr) {
3570b57cec5SDimitry Andric   typedef unsigned char uc2 __attribute__((ext_vector_type(2)));
3580b57cec5SDimitry Andric   uc2 rv = __nvvm_ldg_uc2(reinterpret_cast<const uc2 *>(ptr));
3590b57cec5SDimitry Andric   uchar2 ret;
3600b57cec5SDimitry Andric   ret.x = rv[0];
3610b57cec5SDimitry Andric   ret.y = rv[1];
3620b57cec5SDimitry Andric   return ret;
3630b57cec5SDimitry Andric }
3640b57cec5SDimitry Andric inline __device__ uchar4 __ldg(const uchar4 *ptr) {
3650b57cec5SDimitry Andric   typedef unsigned char uc4 __attribute__((ext_vector_type(4)));
3660b57cec5SDimitry Andric   uc4 rv = __nvvm_ldg_uc4(reinterpret_cast<const uc4 *>(ptr));
3670b57cec5SDimitry Andric   uchar4 ret;
3680b57cec5SDimitry Andric   ret.x = rv[0];
3690b57cec5SDimitry Andric   ret.y = rv[1];
3700b57cec5SDimitry Andric   ret.z = rv[2];
3710b57cec5SDimitry Andric   ret.w = rv[3];
3720b57cec5SDimitry Andric   return ret;
3730b57cec5SDimitry Andric }
3740b57cec5SDimitry Andric inline __device__ ushort2 __ldg(const ushort2 *ptr) {
3750b57cec5SDimitry Andric   typedef unsigned short us2 __attribute__((ext_vector_type(2)));
3760b57cec5SDimitry Andric   us2 rv = __nvvm_ldg_us2(reinterpret_cast<const us2 *>(ptr));
3770b57cec5SDimitry Andric   ushort2 ret;
3780b57cec5SDimitry Andric   ret.x = rv[0];
3790b57cec5SDimitry Andric   ret.y = rv[1];
3800b57cec5SDimitry Andric   return ret;
3810b57cec5SDimitry Andric }
3820b57cec5SDimitry Andric inline __device__ ushort4 __ldg(const ushort4 *ptr) {
3830b57cec5SDimitry Andric   typedef unsigned short us4 __attribute__((ext_vector_type(4)));
3840b57cec5SDimitry Andric   us4 rv = __nvvm_ldg_us4(reinterpret_cast<const us4 *>(ptr));
3850b57cec5SDimitry Andric   ushort4 ret;
3860b57cec5SDimitry Andric   ret.x = rv[0];
3870b57cec5SDimitry Andric   ret.y = rv[1];
3880b57cec5SDimitry Andric   ret.z = rv[2];
3890b57cec5SDimitry Andric   ret.w = rv[3];
3900b57cec5SDimitry Andric   return ret;
3910b57cec5SDimitry Andric }
3920b57cec5SDimitry Andric inline __device__ uint2 __ldg(const uint2 *ptr) {
3930b57cec5SDimitry Andric   typedef unsigned int ui2 __attribute__((ext_vector_type(2)));
3940b57cec5SDimitry Andric   ui2 rv = __nvvm_ldg_ui2(reinterpret_cast<const ui2 *>(ptr));
3950b57cec5SDimitry Andric   uint2 ret;
3960b57cec5SDimitry Andric   ret.x = rv[0];
3970b57cec5SDimitry Andric   ret.y = rv[1];
3980b57cec5SDimitry Andric   return ret;
3990b57cec5SDimitry Andric }
4000b57cec5SDimitry Andric inline __device__ uint4 __ldg(const uint4 *ptr) {
4010b57cec5SDimitry Andric   typedef unsigned int ui4 __attribute__((ext_vector_type(4)));
4020b57cec5SDimitry Andric   ui4 rv = __nvvm_ldg_ui4(reinterpret_cast<const ui4 *>(ptr));
4030b57cec5SDimitry Andric   uint4 ret;
4040b57cec5SDimitry Andric   ret.x = rv[0];
4050b57cec5SDimitry Andric   ret.y = rv[1];
4060b57cec5SDimitry Andric   ret.z = rv[2];
4070b57cec5SDimitry Andric   ret.w = rv[3];
4080b57cec5SDimitry Andric   return ret;
4090b57cec5SDimitry Andric }
4100b57cec5SDimitry Andric inline __device__ ulonglong2 __ldg(const ulonglong2 *ptr) {
4110b57cec5SDimitry Andric   typedef unsigned long long ull2 __attribute__((ext_vector_type(2)));
4120b57cec5SDimitry Andric   ull2 rv = __nvvm_ldg_ull2(reinterpret_cast<const ull2 *>(ptr));
4130b57cec5SDimitry Andric   ulonglong2 ret;
4140b57cec5SDimitry Andric   ret.x = rv[0];
4150b57cec5SDimitry Andric   ret.y = rv[1];
4160b57cec5SDimitry Andric   return ret;
4170b57cec5SDimitry Andric }
4180b57cec5SDimitry Andric 
4190b57cec5SDimitry Andric inline __device__ float2 __ldg(const float2 *ptr) {
4200b57cec5SDimitry Andric   typedef float f2 __attribute__((ext_vector_type(2)));
4210b57cec5SDimitry Andric   f2 rv = __nvvm_ldg_f2(reinterpret_cast<const f2 *>(ptr));
4220b57cec5SDimitry Andric   float2 ret;
4230b57cec5SDimitry Andric   ret.x = rv[0];
4240b57cec5SDimitry Andric   ret.y = rv[1];
4250b57cec5SDimitry Andric   return ret;
4260b57cec5SDimitry Andric }
4270b57cec5SDimitry Andric inline __device__ float4 __ldg(const float4 *ptr) {
4280b57cec5SDimitry Andric   typedef float f4 __attribute__((ext_vector_type(4)));
4290b57cec5SDimitry Andric   f4 rv = __nvvm_ldg_f4(reinterpret_cast<const f4 *>(ptr));
4300b57cec5SDimitry Andric   float4 ret;
4310b57cec5SDimitry Andric   ret.x = rv[0];
4320b57cec5SDimitry Andric   ret.y = rv[1];
4330b57cec5SDimitry Andric   ret.z = rv[2];
4340b57cec5SDimitry Andric   ret.w = rv[3];
4350b57cec5SDimitry Andric   return ret;
4360b57cec5SDimitry Andric }
4370b57cec5SDimitry Andric inline __device__ double2 __ldg(const double2 *ptr) {
4380b57cec5SDimitry Andric   typedef double d2 __attribute__((ext_vector_type(2)));
4390b57cec5SDimitry Andric   d2 rv = __nvvm_ldg_d2(reinterpret_cast<const d2 *>(ptr));
4400b57cec5SDimitry Andric   double2 ret;
4410b57cec5SDimitry Andric   ret.x = rv[0];
4420b57cec5SDimitry Andric   ret.y = rv[1];
4430b57cec5SDimitry Andric   return ret;
4440b57cec5SDimitry Andric }
4450b57cec5SDimitry Andric 
4460b57cec5SDimitry Andric // TODO: Implement these as intrinsics, so the backend can work its magic on
4470b57cec5SDimitry Andric // these.  Alternatively, we could implement these as plain C and try to get
4480b57cec5SDimitry Andric // llvm to recognize the relevant patterns.
4490b57cec5SDimitry Andric inline __device__ unsigned __funnelshift_l(unsigned low32, unsigned high32,
4500b57cec5SDimitry Andric                                            unsigned shiftWidth) {
4510b57cec5SDimitry Andric   unsigned result;
4520b57cec5SDimitry Andric   asm("shf.l.wrap.b32 %0, %1, %2, %3;"
4530b57cec5SDimitry Andric       : "=r"(result)
4540b57cec5SDimitry Andric       : "r"(low32), "r"(high32), "r"(shiftWidth));
4550b57cec5SDimitry Andric   return result;
4560b57cec5SDimitry Andric }
4570b57cec5SDimitry Andric inline __device__ unsigned __funnelshift_lc(unsigned low32, unsigned high32,
4580b57cec5SDimitry Andric                                             unsigned shiftWidth) {
4590b57cec5SDimitry Andric   unsigned result;
4600b57cec5SDimitry Andric   asm("shf.l.clamp.b32 %0, %1, %2, %3;"
4610b57cec5SDimitry Andric       : "=r"(result)
4620b57cec5SDimitry Andric       : "r"(low32), "r"(high32), "r"(shiftWidth));
4630b57cec5SDimitry Andric   return result;
4640b57cec5SDimitry Andric }
4650b57cec5SDimitry Andric inline __device__ unsigned __funnelshift_r(unsigned low32, unsigned high32,
4660b57cec5SDimitry Andric                                            unsigned shiftWidth) {
4670b57cec5SDimitry Andric   unsigned result;
4680b57cec5SDimitry Andric   asm("shf.r.wrap.b32 %0, %1, %2, %3;"
4690b57cec5SDimitry Andric       : "=r"(result)
4700b57cec5SDimitry Andric       : "r"(low32), "r"(high32), "r"(shiftWidth));
4710b57cec5SDimitry Andric   return result;
4720b57cec5SDimitry Andric }
4730b57cec5SDimitry Andric inline __device__ unsigned __funnelshift_rc(unsigned low32, unsigned high32,
4740b57cec5SDimitry Andric                                             unsigned shiftWidth) {
4750b57cec5SDimitry Andric   unsigned ret;
4760b57cec5SDimitry Andric   asm("shf.r.clamp.b32 %0, %1, %2, %3;"
4770b57cec5SDimitry Andric       : "=r"(ret)
4780b57cec5SDimitry Andric       : "r"(low32), "r"(high32), "r"(shiftWidth));
4790b57cec5SDimitry Andric   return ret;
4800b57cec5SDimitry Andric }
4810b57cec5SDimitry Andric 
4820b57cec5SDimitry Andric #endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320
4830b57cec5SDimitry Andric 
484349cc55cSDimitry Andric #if CUDA_VERSION >= 11000
485349cc55cSDimitry Andric extern "C" {
486349cc55cSDimitry Andric __device__ inline size_t __nv_cvta_generic_to_global_impl(const void *__ptr) {
487349cc55cSDimitry Andric   return (size_t)(void __attribute__((address_space(1))) *)__ptr;
488349cc55cSDimitry Andric }
489349cc55cSDimitry Andric __device__ inline size_t __nv_cvta_generic_to_shared_impl(const void *__ptr) {
490349cc55cSDimitry Andric   return (size_t)(void __attribute__((address_space(3))) *)__ptr;
491349cc55cSDimitry Andric }
492349cc55cSDimitry Andric __device__ inline size_t __nv_cvta_generic_to_constant_impl(const void *__ptr) {
493349cc55cSDimitry Andric   return (size_t)(void __attribute__((address_space(4))) *)__ptr;
494349cc55cSDimitry Andric }
495349cc55cSDimitry Andric __device__ inline size_t __nv_cvta_generic_to_local_impl(const void *__ptr) {
496349cc55cSDimitry Andric   return (size_t)(void __attribute__((address_space(5))) *)__ptr;
497349cc55cSDimitry Andric }
498349cc55cSDimitry Andric __device__ inline void *__nv_cvta_global_to_generic_impl(size_t __ptr) {
499349cc55cSDimitry Andric   return (void *)(void __attribute__((address_space(1))) *)__ptr;
500349cc55cSDimitry Andric }
501349cc55cSDimitry Andric __device__ inline void *__nv_cvta_shared_to_generic_impl(size_t __ptr) {
502349cc55cSDimitry Andric   return (void *)(void __attribute__((address_space(3))) *)__ptr;
503349cc55cSDimitry Andric }
504349cc55cSDimitry Andric __device__ inline void *__nv_cvta_constant_to_generic_impl(size_t __ptr) {
505349cc55cSDimitry Andric   return (void *)(void __attribute__((address_space(4))) *)__ptr;
506349cc55cSDimitry Andric }
507349cc55cSDimitry Andric __device__ inline void *__nv_cvta_local_to_generic_impl(size_t __ptr) {
508349cc55cSDimitry Andric   return (void *)(void __attribute__((address_space(5))) *)__ptr;
509349cc55cSDimitry Andric }
51056f451bbSDimitry Andric __device__ inline cuuint32_t __nvvm_get_smem_pointer(void *__ptr) {
511349cc55cSDimitry Andric   return __nv_cvta_generic_to_shared_impl(__ptr);
512349cc55cSDimitry Andric }
513349cc55cSDimitry Andric } // extern "C"
51406c3fb27SDimitry Andric 
51506c3fb27SDimitry Andric #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800
51606c3fb27SDimitry Andric __device__ inline unsigned __reduce_add_sync(unsigned __mask,
51706c3fb27SDimitry Andric                                              unsigned __value) {
51806c3fb27SDimitry Andric   return __nvvm_redux_sync_add(__mask, __value);
51906c3fb27SDimitry Andric }
52006c3fb27SDimitry Andric __device__ inline unsigned __reduce_min_sync(unsigned __mask,
52106c3fb27SDimitry Andric                                              unsigned __value) {
52206c3fb27SDimitry Andric   return __nvvm_redux_sync_umin(__mask, __value);
52306c3fb27SDimitry Andric }
52406c3fb27SDimitry Andric __device__ inline unsigned __reduce_max_sync(unsigned __mask,
52506c3fb27SDimitry Andric                                              unsigned __value) {
52606c3fb27SDimitry Andric   return __nvvm_redux_sync_umax(__mask, __value);
52706c3fb27SDimitry Andric }
52806c3fb27SDimitry Andric __device__ inline int __reduce_min_sync(unsigned __mask, int __value) {
52906c3fb27SDimitry Andric   return __nvvm_redux_sync_min(__mask, __value);
53006c3fb27SDimitry Andric }
53106c3fb27SDimitry Andric __device__ inline int __reduce_max_sync(unsigned __mask, int __value) {
53206c3fb27SDimitry Andric   return __nvvm_redux_sync_max(__mask, __value);
53306c3fb27SDimitry Andric }
53406c3fb27SDimitry Andric __device__ inline unsigned __reduce_or_sync(unsigned __mask, unsigned __value) {
53506c3fb27SDimitry Andric   return __nvvm_redux_sync_or(__mask, __value);
53606c3fb27SDimitry Andric }
53706c3fb27SDimitry Andric __device__ inline unsigned __reduce_and_sync(unsigned __mask,
53806c3fb27SDimitry Andric                                              unsigned __value) {
53906c3fb27SDimitry Andric   return __nvvm_redux_sync_and(__mask, __value);
54006c3fb27SDimitry Andric }
54106c3fb27SDimitry Andric __device__ inline unsigned __reduce_xor_sync(unsigned __mask,
54206c3fb27SDimitry Andric                                              unsigned __value) {
54306c3fb27SDimitry Andric   return __nvvm_redux_sync_xor(__mask, __value);
54406c3fb27SDimitry Andric }
54506c3fb27SDimitry Andric 
54606c3fb27SDimitry Andric __device__ inline void __nv_memcpy_async_shared_global_4(void *__dst,
54706c3fb27SDimitry Andric                                                          const void *__src,
54806c3fb27SDimitry Andric                                                          unsigned __src_size) {
54906c3fb27SDimitry Andric   __nvvm_cp_async_ca_shared_global_4(
55006c3fb27SDimitry Andric       (void __attribute__((address_space(3))) *)__dst,
55106c3fb27SDimitry Andric       (const void __attribute__((address_space(1))) *)__src, __src_size);
55206c3fb27SDimitry Andric }
55306c3fb27SDimitry Andric __device__ inline void __nv_memcpy_async_shared_global_8(void *__dst,
55406c3fb27SDimitry Andric                                                          const void *__src,
55506c3fb27SDimitry Andric                                                          unsigned __src_size) {
55606c3fb27SDimitry Andric   __nvvm_cp_async_ca_shared_global_8(
55706c3fb27SDimitry Andric       (void __attribute__((address_space(3))) *)__dst,
55806c3fb27SDimitry Andric       (const void __attribute__((address_space(1))) *)__src, __src_size);
55906c3fb27SDimitry Andric }
56006c3fb27SDimitry Andric __device__ inline void __nv_memcpy_async_shared_global_16(void *__dst,
56106c3fb27SDimitry Andric                                                           const void *__src,
56206c3fb27SDimitry Andric                                                           unsigned __src_size) {
56306c3fb27SDimitry Andric   __nvvm_cp_async_ca_shared_global_16(
56406c3fb27SDimitry Andric       (void __attribute__((address_space(3))) *)__dst,
56506c3fb27SDimitry Andric       (const void __attribute__((address_space(1))) *)__src, __src_size);
56606c3fb27SDimitry Andric }
56706c3fb27SDimitry Andric 
56806c3fb27SDimitry Andric __device__ inline void *
56906c3fb27SDimitry Andric __nv_associate_access_property(const void *__ptr, unsigned long long __prop) {
57006c3fb27SDimitry Andric   // TODO: it appears to provide compiler with some sort of a hint. We do not
57106c3fb27SDimitry Andric   // know what exactly it is supposed to do. However, CUDA headers suggest that
57206c3fb27SDimitry Andric   // just passing through __ptr should not affect correctness. They do so on
57306c3fb27SDimitry Andric   // pre-sm80 GPUs where this builtin is not available.
57406c3fb27SDimitry Andric   return (void*)__ptr;
57506c3fb27SDimitry Andric }
57606c3fb27SDimitry Andric #endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800
57706c3fb27SDimitry Andric 
57806c3fb27SDimitry Andric #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 900
57906c3fb27SDimitry Andric __device__ inline unsigned __isCtaShared(const void *ptr) {
58006c3fb27SDimitry Andric   return __isShared(ptr);
58106c3fb27SDimitry Andric }
58206c3fb27SDimitry Andric 
58306c3fb27SDimitry Andric __device__ inline unsigned __isClusterShared(const void *__ptr) {
58406c3fb27SDimitry Andric   return __nvvm_isspacep_shared_cluster(__ptr);
58506c3fb27SDimitry Andric }
58606c3fb27SDimitry Andric 
58706c3fb27SDimitry Andric __device__ inline void *__cluster_map_shared_rank(const void *__ptr,
58806c3fb27SDimitry Andric                                                   unsigned __rank) {
58906c3fb27SDimitry Andric   return __nvvm_mapa((void *)__ptr, __rank);
59006c3fb27SDimitry Andric }
59106c3fb27SDimitry Andric 
59206c3fb27SDimitry Andric __device__ inline unsigned __cluster_query_shared_rank(const void *__ptr) {
59306c3fb27SDimitry Andric   return __nvvm_getctarank((void *)__ptr);
59406c3fb27SDimitry Andric }
59506c3fb27SDimitry Andric 
59606c3fb27SDimitry Andric __device__ inline uint2
59706c3fb27SDimitry Andric __cluster_map_shared_multicast(const void *__ptr,
59806c3fb27SDimitry Andric                                unsigned int __cluster_cta_mask) {
59906c3fb27SDimitry Andric   return make_uint2((unsigned)__cvta_generic_to_shared(__ptr),
60006c3fb27SDimitry Andric                     __cluster_cta_mask);
60106c3fb27SDimitry Andric }
60206c3fb27SDimitry Andric 
60306c3fb27SDimitry Andric __device__ inline unsigned __clusterDimIsSpecified() {
60406c3fb27SDimitry Andric   return __nvvm_is_explicit_cluster();
60506c3fb27SDimitry Andric }
60606c3fb27SDimitry Andric 
60706c3fb27SDimitry Andric __device__ inline dim3 __clusterDim() {
60806c3fb27SDimitry Andric   return dim3(__nvvm_read_ptx_sreg_cluster_nctaid_x(),
60906c3fb27SDimitry Andric               __nvvm_read_ptx_sreg_cluster_nctaid_y(),
61006c3fb27SDimitry Andric               __nvvm_read_ptx_sreg_cluster_nctaid_z());
61106c3fb27SDimitry Andric }
61206c3fb27SDimitry Andric 
61306c3fb27SDimitry Andric __device__ inline dim3 __clusterRelativeBlockIdx() {
61406c3fb27SDimitry Andric   return dim3(__nvvm_read_ptx_sreg_cluster_ctaid_x(),
61506c3fb27SDimitry Andric               __nvvm_read_ptx_sreg_cluster_ctaid_y(),
61606c3fb27SDimitry Andric               __nvvm_read_ptx_sreg_cluster_ctaid_z());
61706c3fb27SDimitry Andric }
61806c3fb27SDimitry Andric 
61906c3fb27SDimitry Andric __device__ inline dim3 __clusterGridDimInClusters() {
62006c3fb27SDimitry Andric   return dim3(__nvvm_read_ptx_sreg_nclusterid_x(),
62106c3fb27SDimitry Andric               __nvvm_read_ptx_sreg_nclusterid_y(),
62206c3fb27SDimitry Andric               __nvvm_read_ptx_sreg_nclusterid_z());
62306c3fb27SDimitry Andric }
62406c3fb27SDimitry Andric 
62506c3fb27SDimitry Andric __device__ inline dim3 __clusterIdx() {
62606c3fb27SDimitry Andric   return dim3(__nvvm_read_ptx_sreg_clusterid_x(),
62706c3fb27SDimitry Andric               __nvvm_read_ptx_sreg_clusterid_y(),
62806c3fb27SDimitry Andric               __nvvm_read_ptx_sreg_clusterid_z());
62906c3fb27SDimitry Andric }
63006c3fb27SDimitry Andric 
63106c3fb27SDimitry Andric __device__ inline unsigned __clusterRelativeBlockRank() {
63206c3fb27SDimitry Andric   return __nvvm_read_ptx_sreg_cluster_ctarank();
63306c3fb27SDimitry Andric }
63406c3fb27SDimitry Andric 
63506c3fb27SDimitry Andric __device__ inline unsigned __clusterSizeInBlocks() {
63606c3fb27SDimitry Andric   return __nvvm_read_ptx_sreg_cluster_nctarank();
63706c3fb27SDimitry Andric }
63806c3fb27SDimitry Andric 
63906c3fb27SDimitry Andric __device__ inline void __cluster_barrier_arrive() {
64006c3fb27SDimitry Andric   __nvvm_barrier_cluster_arrive();
64106c3fb27SDimitry Andric }
64206c3fb27SDimitry Andric 
64306c3fb27SDimitry Andric __device__ inline void __cluster_barrier_arrive_relaxed() {
64406c3fb27SDimitry Andric   __nvvm_barrier_cluster_arrive_relaxed();
64506c3fb27SDimitry Andric }
64606c3fb27SDimitry Andric 
64706c3fb27SDimitry Andric __device__ inline void __cluster_barrier_wait() {
64806c3fb27SDimitry Andric   __nvvm_barrier_cluster_wait();
64906c3fb27SDimitry Andric }
65006c3fb27SDimitry Andric 
65106c3fb27SDimitry Andric __device__ inline void __threadfence_cluster() { __nvvm_fence_sc_cluster(); }
65206c3fb27SDimitry Andric 
65306c3fb27SDimitry Andric __device__ inline float2 atomicAdd(float2 *__ptr, float2 __val) {
65406c3fb27SDimitry Andric   float2 __ret;
65506c3fb27SDimitry Andric   __asm__("atom.add.v2.f32         {%0, %1}, [%2], {%3, %4};"
65606c3fb27SDimitry Andric           : "=f"(__ret.x), "=f"(__ret.y)
65706c3fb27SDimitry Andric           : "l"(__ptr), "f"(__val.x), "f"(__val.y));
65806c3fb27SDimitry Andric   return __ret;
65906c3fb27SDimitry Andric }
66006c3fb27SDimitry Andric 
66106c3fb27SDimitry Andric __device__ inline float2 atomicAdd_block(float2 *__ptr, float2 __val) {
66206c3fb27SDimitry Andric   float2 __ret;
66306c3fb27SDimitry Andric   __asm__("atom.cta.add.v2.f32         {%0, %1}, [%2], {%3, %4};"
66406c3fb27SDimitry Andric           : "=f"(__ret.x), "=f"(__ret.y)
66506c3fb27SDimitry Andric           : "l"(__ptr), "f"(__val.x), "f"(__val.y));
66606c3fb27SDimitry Andric   return __ret;
66706c3fb27SDimitry Andric }
66806c3fb27SDimitry Andric 
66906c3fb27SDimitry Andric __device__ inline float2 atomicAdd_system(float2 *__ptr, float2 __val) {
67006c3fb27SDimitry Andric   float2 __ret;
67106c3fb27SDimitry Andric   __asm__("atom.sys.add.v2.f32         {%0, %1}, [%2], {%3, %4};"
67206c3fb27SDimitry Andric           : "=f"(__ret.x), "=f"(__ret.y)
67306c3fb27SDimitry Andric           : "l"(__ptr), "f"(__val.x), "f"(__val.y));
67406c3fb27SDimitry Andric   return __ret;
67506c3fb27SDimitry Andric }
67606c3fb27SDimitry Andric 
67706c3fb27SDimitry Andric __device__ inline float4 atomicAdd(float4 *__ptr, float4 __val) {
67806c3fb27SDimitry Andric   float4 __ret;
67906c3fb27SDimitry Andric   __asm__("atom.add.v4.f32         {%0, %1, %2, %3}, [%4], {%5, %6, %7, %8};"
68006c3fb27SDimitry Andric           : "=f"(__ret.x), "=f"(__ret.y), "=f"(__ret.z), "=f"(__ret.w)
68106c3fb27SDimitry Andric           : "l"(__ptr), "f"(__val.x), "f"(__val.y), "f"(__val.z), "f"(__val.w));
68206c3fb27SDimitry Andric   return __ret;
68306c3fb27SDimitry Andric }
68406c3fb27SDimitry Andric 
68506c3fb27SDimitry Andric __device__ inline float4 atomicAdd_block(float4 *__ptr, float4 __val) {
68606c3fb27SDimitry Andric   float4 __ret;
68706c3fb27SDimitry Andric   __asm__(
68806c3fb27SDimitry Andric       "atom.cta.add.v4.f32         {%0, %1, %2, %3}, [%4], {%5, %6, %7, %8};"
68906c3fb27SDimitry Andric       : "=f"(__ret.x), "=f"(__ret.y), "=f"(__ret.z), "=f"(__ret.w)
69006c3fb27SDimitry Andric       : "l"(__ptr), "f"(__val.x), "f"(__val.y), "f"(__val.z), "f"(__val.w));
69106c3fb27SDimitry Andric   return __ret;
69206c3fb27SDimitry Andric }
69306c3fb27SDimitry Andric 
69406c3fb27SDimitry Andric __device__ inline float4 atomicAdd_system(float4 *__ptr, float4 __val) {
69506c3fb27SDimitry Andric   float4 __ret;
69606c3fb27SDimitry Andric   __asm__(
69706c3fb27SDimitry Andric       "atom.sys.add.v4.f32         {%0, %1, %2, %3}, [%4], {%5, %6, %7, %8};"
69806c3fb27SDimitry Andric       : "=f"(__ret.x), "=f"(__ret.y), "=f"(__ret.z), "=f"(__ret.w)
69906c3fb27SDimitry Andric       : "l"(__ptr), "f"(__val.x), "f"(__val.y), "f"(__val.z), "f"(__val.w)
70006c3fb27SDimitry Andric       :);
70106c3fb27SDimitry Andric   return __ret;
70206c3fb27SDimitry Andric }
70306c3fb27SDimitry Andric 
70406c3fb27SDimitry Andric #endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 900
705349cc55cSDimitry Andric #endif // CUDA_VERSION >= 11000
706349cc55cSDimitry Andric 
7070b57cec5SDimitry Andric #endif // defined(__CLANG_CUDA_INTRINSICS_H__)
708