/*===--- __clang_cuda_intrinsics.h - Device-side CUDA intrinsic wrappers ---===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
#ifndef __CLANG_CUDA_INTRINSICS_H__
#define __CLANG_CUDA_INTRINSICS_H__
#ifndef __CUDA__
#error "This file is for CUDA compilation only."
#endif

// sm_30 intrinsics: __shfl_{up,down,xor}.

#define __SM_30_INTRINSICS_H__
#define __SM_30_INTRINSICS_HPP__

#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300

#pragma push_macro("__MAKE_SHUFFLES")
#define __MAKE_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic, __Mask, \
                        __Type) \
  inline __device__ int __FnName(int __val, __Type __offset, \
                                 int __width = warpSize) { \
    return __IntIntrinsic(__val, __offset, \
                          ((warpSize - __width) << 8) | (__Mask)); \
  } \
  inline __device__ float __FnName(float __val, __Type __offset, \
                                   int __width = warpSize) { \
    return __FloatIntrinsic(__val, __offset, \
                            ((warpSize - __width) << 8) | (__Mask)); \
  } \
  inline __device__ unsigned int __FnName(unsigned int __val, __Type __offset, \
                                          int __width = warpSize) { \
    return static_cast<unsigned int>( \
        ::__FnName(static_cast<int>(__val), __offset, __width)); \
  } \
  inline __device__ long long __FnName(long long __val, __Type __offset, \
                                       int __width = warpSize) { \
    struct __Bits { \
      int __a, __b; \
    }; \
    _Static_assert(sizeof(__val) == sizeof(__Bits)); \
    _Static_assert(sizeof(__Bits) == 2 * sizeof(int)); \
    __Bits __tmp; \
    memcpy(&__tmp, &__val, sizeof(__val)); \
    __tmp.__a = ::__FnName(__tmp.__a, __offset, __width); \
    __tmp.__b = ::__FnName(__tmp.__b, __offset, __width); \
    long long __ret; \
    memcpy(&__ret, &__tmp, sizeof(__tmp)); \
    return __ret; \
  } \
  inline __device__ long __FnName(long __val, __Type __offset, \
                                  int __width = warpSize) { \
    _Static_assert(sizeof(long) == sizeof(long long) || \
                   sizeof(long) == sizeof(int)); \
    if (sizeof(long) == sizeof(long long)) { \
      return static_cast<long>( \
          ::__FnName(static_cast<long long>(__val), __offset, __width)); \
    } else if (sizeof(long) == sizeof(int)) { \
      return static_cast<long>( \
          ::__FnName(static_cast<int>(__val), __offset, __width)); \
    } \
  } \
  inline __device__ unsigned long __FnName( \
      unsigned long __val, __Type __offset, int __width = warpSize) { \
    return static_cast<unsigned long>( \
        ::__FnName(static_cast<long>(__val), __offset, __width)); \
  } \
  inline __device__ unsigned long long __FnName( \
      unsigned long long __val, __Type __offset, int __width = warpSize) { \
    return static_cast<unsigned long long>( \
        ::__FnName(static_cast<long long>(__val), __offset, __width)); \
  } \
  inline __device__ double __FnName(double __val, __Type __offset, \
                                    int __width = warpSize) { \
    long long __tmp; \
    _Static_assert(sizeof(__tmp) == sizeof(__val)); \
    memcpy(&__tmp, &__val, sizeof(__val)); \
    __tmp = ::__FnName(__tmp, __offset, __width); \
    double __ret; \
    memcpy(&__ret, &__tmp, sizeof(__ret)); \
    return __ret; \
  }

__MAKE_SHUFFLES(__shfl, __nvvm_shfl_idx_i32, __nvvm_shfl_idx_f32, 0x1f, int);
// We use 0 rather than 31 as our mask, because shfl.up applies to lanes >=
// maxLane.
__MAKE_SHUFFLES(__shfl_up, __nvvm_shfl_up_i32, __nvvm_shfl_up_f32, 0,
                unsigned int);
__MAKE_SHUFFLES(__shfl_down, __nvvm_shfl_down_i32, __nvvm_shfl_down_f32, 0x1f,
                unsigned int);
__MAKE_SHUFFLES(__shfl_xor, __nvvm_shfl_bfly_i32, __nvvm_shfl_bfly_f32, 0x1f,
                int);
#pragma pop_macro("__MAKE_SHUFFLES")

#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300
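
// Illustrative usage sketch (not part of this header): broadcasting a value
// from lane 0 of the warp with the legacy, non-sync shuffle defined above.
// The helper name is hypothetical; on CUDA-9+ the *_sync variants defined
// further below are the preferred interface.
//
//   __device__ int broadcast_from_lane0(int val) {
//     return __shfl(val, 0); // every lane receives lane 0's val
//   }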

#if CUDA_VERSION >= 9000
#if (!defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300)
// __shfl_sync_* variants available in CUDA-9
#pragma push_macro("__MAKE_SYNC_SHUFFLES")
#define __MAKE_SYNC_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic, \
                             __Mask, __Type) \
  inline __device__ int __FnName(unsigned int __mask, int __val, \
                                 __Type __offset, int __width = warpSize) { \
    return __IntIntrinsic(__mask, __val, __offset, \
                          ((warpSize - __width) << 8) | (__Mask)); \
  } \
  inline __device__ float __FnName(unsigned int __mask, float __val, \
                                   __Type __offset, int __width = warpSize) { \
    return __FloatIntrinsic(__mask, __val, __offset, \
                            ((warpSize - __width) << 8) | (__Mask)); \
  } \
  inline __device__ unsigned int __FnName(unsigned int __mask, \
                                          unsigned int __val, __Type __offset, \
                                          int __width = warpSize) { \
    return static_cast<unsigned int>( \
        ::__FnName(__mask, static_cast<int>(__val), __offset, __width)); \
  } \
  inline __device__ long long __FnName(unsigned int __mask, long long __val, \
                                       __Type __offset, \
                                       int __width = warpSize) { \
    struct __Bits { \
      int __a, __b; \
    }; \
    _Static_assert(sizeof(__val) == sizeof(__Bits)); \
    _Static_assert(sizeof(__Bits) == 2 * sizeof(int)); \
    __Bits __tmp; \
    memcpy(&__tmp, &__val, sizeof(__val)); \
    __tmp.__a = ::__FnName(__mask, __tmp.__a, __offset, __width); \
    __tmp.__b = ::__FnName(__mask, __tmp.__b, __offset, __width); \
    long long __ret; \
    memcpy(&__ret, &__tmp, sizeof(__tmp)); \
    return __ret; \
  } \
  inline __device__ unsigned long long __FnName( \
      unsigned int __mask, unsigned long long __val, __Type __offset, \
      int __width = warpSize) { \
    return static_cast<unsigned long long>( \
        ::__FnName(__mask, static_cast<long long>(__val), __offset, __width)); \
  } \
  inline __device__ long __FnName(unsigned int __mask, long __val, \
                                  __Type __offset, int __width = warpSize) { \
    _Static_assert(sizeof(long) == sizeof(long long) || \
                   sizeof(long) == sizeof(int)); \
    if (sizeof(long) == sizeof(long long)) { \
      return static_cast<long>(::__FnName( \
          __mask, static_cast<long long>(__val), __offset, __width)); \
    } else if (sizeof(long) == sizeof(int)) { \
      return static_cast<long>( \
          ::__FnName(__mask, static_cast<int>(__val), __offset, __width)); \
    } \
  } \
  inline __device__ unsigned long __FnName( \
      unsigned int __mask, unsigned long __val, __Type __offset, \
      int __width = warpSize) { \
    return static_cast<unsigned long>( \
        ::__FnName(__mask, static_cast<long>(__val), __offset, __width)); \
  } \
  inline __device__ double __FnName(unsigned int __mask, double __val, \
                                    __Type __offset, int __width = warpSize) { \
    long long __tmp; \
    _Static_assert(sizeof(__tmp) == sizeof(__val)); \
    memcpy(&__tmp, &__val, sizeof(__val)); \
    __tmp = ::__FnName(__mask, __tmp, __offset, __width); \
    double __ret; \
    memcpy(&__ret, &__tmp, sizeof(__ret)); \
    return __ret; \
  }
__MAKE_SYNC_SHUFFLES(__shfl_sync, __nvvm_shfl_sync_idx_i32,
                     __nvvm_shfl_sync_idx_f32, 0x1f, int);
// We use 0 rather than 31 as our mask, because shfl.up applies to lanes >=
// maxLane.
__MAKE_SYNC_SHUFFLES(__shfl_up_sync, __nvvm_shfl_sync_up_i32,
                     __nvvm_shfl_sync_up_f32, 0, unsigned int);
__MAKE_SYNC_SHUFFLES(__shfl_down_sync, __nvvm_shfl_sync_down_i32,
                     __nvvm_shfl_sync_down_f32, 0x1f, unsigned int);
__MAKE_SYNC_SHUFFLES(__shfl_xor_sync, __nvvm_shfl_sync_bfly_i32,
                     __nvvm_shfl_sync_bfly_f32, 0x1f, int);
#pragma pop_macro("__MAKE_SYNC_SHUFFLES")

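// Illustrative usage sketch (not part of this header): a full-warp sum
// reduction built on the __shfl_down_sync wrapper defined above. The helper
// name is hypothetical and the sketch assumes all 32 lanes are active.
//
//   __device__ int warp_reduce_sum(int v) {
//     for (int delta = warpSize / 2; delta > 0; delta /= 2)
//       v += __shfl_down_sync(0xffffffff, v, delta);
//     return v; // lane 0 now holds the warp-wide sum
//   }
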
inline __device__ void __syncwarp(unsigned int mask = 0xffffffff) {
  return __nvvm_bar_warp_sync(mask);
}

inline __device__ void __barrier_sync(unsigned int id) {
  __nvvm_barrier_sync(id);
}

inline __device__ void __barrier_sync_count(unsigned int id,
                                            unsigned int count) {
  __nvvm_barrier_sync_cnt(id, count);
}

inline __device__ int __all_sync(unsigned int mask, int pred) {
  return __nvvm_vote_all_sync(mask, pred);
}

inline __device__ int __any_sync(unsigned int mask, int pred) {
  return __nvvm_vote_any_sync(mask, pred);
}

inline __device__ int __uni_sync(unsigned int mask, int pred) {
  return __nvvm_vote_uni_sync(mask, pred);
}

inline __device__ unsigned int __ballot_sync(unsigned int mask, int pred) {
  return __nvvm_vote_ballot_sync(mask, pred);
}

inline __device__ unsigned int __activemask() {
#if CUDA_VERSION < 9020
  return __nvvm_vote_ballot(1);
#else
  return __nvvm_activemask();
#endif
}

inline __device__ unsigned int __fns(unsigned mask, unsigned base, int offset) {
  return __nvvm_fns(mask, base, offset);
}

#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300

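// Illustrative usage sketch (not part of this header): counting how many
// currently active lanes satisfy a predicate, using __activemask() and
// __ballot_sync() defined above. __popc is the usual CUDA population-count
// device function and is assumed to be available; the helper name is
// hypothetical.
//
//   __device__ int count_active_matches(int pred) {
//     unsigned int mask = __activemask();
//     return __popc(__ballot_sync(mask, pred));
//   }
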
// Define __match* builtins CUDA-9 headers expect to see.
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700
inline __device__ unsigned int __match32_any_sync(unsigned int mask,
                                                  unsigned int value) {
  return __nvvm_match_any_sync_i32(mask, value);
}

inline __device__ unsigned int
__match64_any_sync(unsigned int mask, unsigned long long value) {
  return __nvvm_match_any_sync_i64(mask, value);
}

inline __device__ unsigned int
__match32_all_sync(unsigned int mask, unsigned int value, int *pred) {
  return __nvvm_match_all_sync_i32p(mask, value, pred);
}

inline __device__ unsigned int
__match64_all_sync(unsigned int mask, unsigned long long value, int *pred) {
  return __nvvm_match_all_sync_i64p(mask, value, pred);
}
#include "crt/sm_70_rt.hpp"

#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700
#endif // CUDA_VERSION >= 9000

// sm_32 intrinsics: __ldg and __funnelshift_{l,lc,r,rc}.

// Prevent the vanilla sm_32 intrinsics header from being included.
#define __SM_32_INTRINSICS_H__
#define __SM_32_INTRINSICS_HPP__

#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320

inline __device__ char __ldg(const char *ptr) { return __nvvm_ldg_c(ptr); }
inline __device__ short __ldg(const short *ptr) { return __nvvm_ldg_s(ptr); }
inline __device__ int __ldg(const int *ptr) { return __nvvm_ldg_i(ptr); }
inline __device__ long __ldg(const long *ptr) { return __nvvm_ldg_l(ptr); }
inline __device__ long long __ldg(const long long *ptr) {
  return __nvvm_ldg_ll(ptr);
}
inline __device__ unsigned char __ldg(const unsigned char *ptr) {
  return __nvvm_ldg_uc(ptr);
}
inline __device__ signed char __ldg(const signed char *ptr) {
  return __nvvm_ldg_uc((const unsigned char *)ptr);
}
inline __device__ unsigned short __ldg(const unsigned short *ptr) {
  return __nvvm_ldg_us(ptr);
}
inline __device__ unsigned int __ldg(const unsigned int *ptr) {
  return __nvvm_ldg_ui(ptr);
}
inline __device__ unsigned long __ldg(const unsigned long *ptr) {
  return __nvvm_ldg_ul(ptr);
}
inline __device__ unsigned long long __ldg(const unsigned long long *ptr) {
  return __nvvm_ldg_ull(ptr);
}
inline __device__ float __ldg(const float *ptr) { return __nvvm_ldg_f(ptr); }
inline __device__ double __ldg(const double *ptr) { return __nvvm_ldg_d(ptr); }

inline __device__ char2 __ldg(const char2 *ptr) {
  typedef char c2 __attribute__((ext_vector_type(2)));
  // We can assume that ptr is aligned at least to char2's alignment, but the
  // load will assume that ptr is aligned to c2's alignment. This is only
  // safe if alignof(c2) <= alignof(char2).
  c2 rv = __nvvm_ldg_c2(reinterpret_cast<const c2 *>(ptr));
  char2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}
inline __device__ char4 __ldg(const char4 *ptr) {
  typedef char c4 __attribute__((ext_vector_type(4)));
  c4 rv = __nvvm_ldg_c4(reinterpret_cast<const c4 *>(ptr));
  char4 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  ret.z = rv[2];
  ret.w = rv[3];
  return ret;
}
inline __device__ short2 __ldg(const short2 *ptr) {
  typedef short s2 __attribute__((ext_vector_type(2)));
  s2 rv = __nvvm_ldg_s2(reinterpret_cast<const s2 *>(ptr));
  short2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}
inline __device__ short4 __ldg(const short4 *ptr) {
  typedef short s4 __attribute__((ext_vector_type(4)));
  s4 rv = __nvvm_ldg_s4(reinterpret_cast<const s4 *>(ptr));
  short4 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  ret.z = rv[2];
  ret.w = rv[3];
  return ret;
}
inline __device__ int2 __ldg(const int2 *ptr) {
  typedef int i2 __attribute__((ext_vector_type(2)));
  i2 rv = __nvvm_ldg_i2(reinterpret_cast<const i2 *>(ptr));
  int2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}
inline __device__ int4 __ldg(const int4 *ptr) {
  typedef int i4 __attribute__((ext_vector_type(4)));
  i4 rv = __nvvm_ldg_i4(reinterpret_cast<const i4 *>(ptr));
  int4 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  ret.z = rv[2];
  ret.w = rv[3];
  return ret;
}
inline __device__ longlong2 __ldg(const longlong2 *ptr) {
  typedef long long ll2 __attribute__((ext_vector_type(2)));
  ll2 rv = __nvvm_ldg_ll2(reinterpret_cast<const ll2 *>(ptr));
  longlong2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}

inline __device__ uchar2 __ldg(const uchar2 *ptr) {
  typedef unsigned char uc2 __attribute__((ext_vector_type(2)));
  uc2 rv = __nvvm_ldg_uc2(reinterpret_cast<const uc2 *>(ptr));
  uchar2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}
inline __device__ uchar4 __ldg(const uchar4 *ptr) {
  typedef unsigned char uc4 __attribute__((ext_vector_type(4)));
  uc4 rv = __nvvm_ldg_uc4(reinterpret_cast<const uc4 *>(ptr));
  uchar4 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  ret.z = rv[2];
  ret.w = rv[3];
  return ret;
}
inline __device__ ushort2 __ldg(const ushort2 *ptr) {
  typedef unsigned short us2 __attribute__((ext_vector_type(2)));
  us2 rv = __nvvm_ldg_us2(reinterpret_cast<const us2 *>(ptr));
  ushort2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}
inline __device__ ushort4 __ldg(const ushort4 *ptr) {
  typedef unsigned short us4 __attribute__((ext_vector_type(4)));
  us4 rv = __nvvm_ldg_us4(reinterpret_cast<const us4 *>(ptr));
  ushort4 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  ret.z = rv[2];
  ret.w = rv[3];
  return ret;
}
inline __device__ uint2 __ldg(const uint2 *ptr) {
  typedef unsigned int ui2 __attribute__((ext_vector_type(2)));
  ui2 rv = __nvvm_ldg_ui2(reinterpret_cast<const ui2 *>(ptr));
  uint2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}
inline __device__ uint4 __ldg(const uint4 *ptr) {
  typedef unsigned int ui4 __attribute__((ext_vector_type(4)));
  ui4 rv = __nvvm_ldg_ui4(reinterpret_cast<const ui4 *>(ptr));
  uint4 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  ret.z = rv[2];
  ret.w = rv[3];
  return ret;
}
inline __device__ ulonglong2 __ldg(const ulonglong2 *ptr) {
  typedef unsigned long long ull2 __attribute__((ext_vector_type(2)));
  ull2 rv = __nvvm_ldg_ull2(reinterpret_cast<const ull2 *>(ptr));
  ulonglong2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}

inline __device__ float2 __ldg(const float2 *ptr) {
  typedef float f2 __attribute__((ext_vector_type(2)));
  f2 rv = __nvvm_ldg_f2(reinterpret_cast<const f2 *>(ptr));
  float2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}
inline __device__ float4 __ldg(const float4 *ptr) {
  typedef float f4 __attribute__((ext_vector_type(4)));
  f4 rv = __nvvm_ldg_f4(reinterpret_cast<const f4 *>(ptr));
  float4 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  ret.z = rv[2];
  ret.w = rv[3];
  return ret;
}
inline __device__ double2 __ldg(const double2 *ptr) {
  typedef double d2 __attribute__((ext_vector_type(2)));
  d2 rv = __nvvm_ldg_d2(reinterpret_cast<const d2 *>(ptr));
  double2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}

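// Illustrative usage sketch (not part of this header): __ldg issues a cached,
// read-only load from global memory. The kernel and parameter names below are
// hypothetical; the data must not be written by any thread while the kernel
// runs.
//
//   __global__ void scale(float4 *out, const float4 *in, float s) {
//     int i = blockIdx.x * blockDim.x + threadIdx.x;
//     float4 v = __ldg(&in[i]);
//     out[i] = make_float4(v.x * s, v.y * s, v.z * s, v.w * s);
//   }
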
// TODO: Implement these as intrinsics, so the backend can work its magic on
// these. Alternatively, we could implement these as plain C and try to get
// llvm to recognize the relevant patterns.
inline __device__ unsigned __funnelshift_l(unsigned low32, unsigned high32,
                                           unsigned shiftWidth) {
  unsigned result;
  asm("shf.l.wrap.b32 %0, %1, %2, %3;"
      : "=r"(result)
      : "r"(low32), "r"(high32), "r"(shiftWidth));
  return result;
}
inline __device__ unsigned __funnelshift_lc(unsigned low32, unsigned high32,
                                            unsigned shiftWidth) {
  unsigned result;
  asm("shf.l.clamp.b32 %0, %1, %2, %3;"
      : "=r"(result)
      : "r"(low32), "r"(high32), "r"(shiftWidth));
  return result;
}
inline __device__ unsigned __funnelshift_r(unsigned low32, unsigned high32,
                                           unsigned shiftWidth) {
  unsigned result;
  asm("shf.r.wrap.b32 %0, %1, %2, %3;"
      : "=r"(result)
      : "r"(low32), "r"(high32), "r"(shiftWidth));
  return result;
}
inline __device__ unsigned __funnelshift_rc(unsigned low32, unsigned high32,
                                            unsigned shiftWidth) {
  unsigned ret;
  asm("shf.r.clamp.b32 %0, %1, %2, %3;"
      : "=r"(ret)
      : "r"(low32), "r"(high32), "r"(shiftWidth));
  return ret;
}

#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320

#if CUDA_VERSION >= 11000
extern "C" {
__device__ inline size_t __nv_cvta_generic_to_global_impl(const void *__ptr) {
  return (size_t)(void __attribute__((address_space(1))) *)__ptr;
}
__device__ inline size_t __nv_cvta_generic_to_shared_impl(const void *__ptr) {
  return (size_t)(void __attribute__((address_space(3))) *)__ptr;
}
__device__ inline size_t __nv_cvta_generic_to_constant_impl(const void *__ptr) {
  return (size_t)(void __attribute__((address_space(4))) *)__ptr;
}
__device__ inline size_t __nv_cvta_generic_to_local_impl(const void *__ptr) {
  return (size_t)(void __attribute__((address_space(5))) *)__ptr;
}
__device__ inline void *__nv_cvta_global_to_generic_impl(size_t __ptr) {
  return (void *)(void __attribute__((address_space(1))) *)__ptr;
}
__device__ inline void *__nv_cvta_shared_to_generic_impl(size_t __ptr) {
  return (void *)(void __attribute__((address_space(3))) *)__ptr;
}
__device__ inline void *__nv_cvta_constant_to_generic_impl(size_t __ptr) {
  return (void *)(void __attribute__((address_space(4))) *)__ptr;
}
__device__ inline void *__nv_cvta_local_to_generic_impl(size_t __ptr) {
  return (void *)(void __attribute__((address_space(5))) *)__ptr;
}
__device__ inline cuuint32_t __nvvm_get_smem_pointer(void *__ptr) {
  return __nv_cvta_generic_to_shared_impl(__ptr);
}
} // extern "C"
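
// Illustrative usage sketch (not part of this header): __nvvm_get_smem_pointer
// above yields the 32-bit shared-memory address of a generic pointer, which is
// useful as an operand for hand-written PTX. The buffer name is hypothetical.
//
//   __shared__ int buf[32];
//   cuuint32_t smem_addr = __nvvm_get_smem_pointer(buf);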

#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800
__device__ inline unsigned __reduce_add_sync(unsigned __mask,
                                             unsigned __value) {
  return __nvvm_redux_sync_add(__mask, __value);
}
__device__ inline unsigned __reduce_min_sync(unsigned __mask,
                                             unsigned __value) {
  return __nvvm_redux_sync_umin(__mask, __value);
}
__device__ inline unsigned __reduce_max_sync(unsigned __mask,
                                             unsigned __value) {
  return __nvvm_redux_sync_umax(__mask, __value);
}
__device__ inline int __reduce_min_sync(unsigned __mask, int __value) {
  return __nvvm_redux_sync_min(__mask, __value);
}
__device__ inline int __reduce_max_sync(unsigned __mask, int __value) {
  return __nvvm_redux_sync_max(__mask, __value);
}
__device__ inline unsigned __reduce_or_sync(unsigned __mask, unsigned __value) {
  return __nvvm_redux_sync_or(__mask, __value);
}
__device__ inline unsigned __reduce_and_sync(unsigned __mask,
                                             unsigned __value) {
  return __nvvm_redux_sync_and(__mask, __value);
}
__device__ inline unsigned __reduce_xor_sync(unsigned __mask,
                                             unsigned __value) {
  return __nvvm_redux_sync_xor(__mask, __value);
}

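// Illustrative usage sketch (not part of this header): the sm_80+ redux
// wrappers above reduce one value per participating lane and return the
// result to every lane named in the mask. The variable names are
// hypothetical; every lane in the mask must make the call.
//
//   unsigned mask = __activemask();
//   unsigned warp_total = __reduce_add_sync(mask, per_lane_count);
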
__device__ inline void __nv_memcpy_async_shared_global_4(void *__dst,
                                                         const void *__src,
                                                         unsigned __src_size) {
  __nvvm_cp_async_ca_shared_global_4(
      (void __attribute__((address_space(3))) *)__dst,
      (const void __attribute__((address_space(1))) *)__src, __src_size);
}
__device__ inline void __nv_memcpy_async_shared_global_8(void *__dst,
                                                         const void *__src,
                                                         unsigned __src_size) {
  __nvvm_cp_async_ca_shared_global_8(
      (void __attribute__((address_space(3))) *)__dst,
      (const void __attribute__((address_space(1))) *)__src, __src_size);
}
__device__ inline void __nv_memcpy_async_shared_global_16(void *__dst,
                                                          const void *__src,
                                                          unsigned __src_size) {
  __nvvm_cp_async_ca_shared_global_16(
      (void __attribute__((address_space(3))) *)__dst,
      (const void __attribute__((address_space(1))) *)__src, __src_size);
}

__device__ inline void *
__nv_associate_access_property(const void *__ptr, unsigned long long __prop) {
  // TODO: it appears to provide the compiler with some sort of hint. We do not
  // know what exactly it is supposed to do. However, CUDA headers suggest that
  // just passing through __ptr should not affect correctness. They do so on
  // pre-sm80 GPUs where this builtin is not available.
  return (void *)__ptr;
}
#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800

#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 900
__device__ inline unsigned __isCtaShared(const void *ptr) {
  return __isShared(ptr);
}

__device__ inline unsigned __isClusterShared(const void *__ptr) {
  return __nvvm_isspacep_shared_cluster(__ptr);
}

__device__ inline void *__cluster_map_shared_rank(const void *__ptr,
                                                  unsigned __rank) {
  return __nvvm_mapa((void *)__ptr, __rank);
}

__device__ inline unsigned __cluster_query_shared_rank(const void *__ptr) {
  return __nvvm_getctarank((void *)__ptr);
}

__device__ inline uint2
__cluster_map_shared_multicast(const void *__ptr,
                               unsigned int __cluster_cta_mask) {
  return make_uint2((unsigned)__cvta_generic_to_shared(__ptr),
                    __cluster_cta_mask);
}

__device__ inline unsigned __clusterDimIsSpecified() {
  return __nvvm_is_explicit_cluster();
}

__device__ inline dim3 __clusterDim() {
  return dim3(__nvvm_read_ptx_sreg_cluster_nctaid_x(),
              __nvvm_read_ptx_sreg_cluster_nctaid_y(),
              __nvvm_read_ptx_sreg_cluster_nctaid_z());
}

__device__ inline dim3 __clusterRelativeBlockIdx() {
  return dim3(__nvvm_read_ptx_sreg_cluster_ctaid_x(),
              __nvvm_read_ptx_sreg_cluster_ctaid_y(),
              __nvvm_read_ptx_sreg_cluster_ctaid_z());
}

__device__ inline dim3 __clusterGridDimInClusters() {
  return dim3(__nvvm_read_ptx_sreg_nclusterid_x(),
              __nvvm_read_ptx_sreg_nclusterid_y(),
              __nvvm_read_ptx_sreg_nclusterid_z());
}

__device__ inline dim3 __clusterIdx() {
  return dim3(__nvvm_read_ptx_sreg_clusterid_x(),
              __nvvm_read_ptx_sreg_clusterid_y(),
              __nvvm_read_ptx_sreg_clusterid_z());
}

__device__ inline unsigned __clusterRelativeBlockRank() {
  return __nvvm_read_ptx_sreg_cluster_ctarank();
}

__device__ inline unsigned __clusterSizeInBlocks() {
  return __nvvm_read_ptx_sreg_cluster_nctarank();
}

__device__ inline void __cluster_barrier_arrive() {
  __nvvm_barrier_cluster_arrive();
}

__device__ inline void __cluster_barrier_arrive_relaxed() {
  __nvvm_barrier_cluster_arrive_relaxed();
}

__device__ inline void __cluster_barrier_wait() {
  __nvvm_barrier_cluster_wait();
}

__device__ inline void __threadfence_cluster() { __nvvm_fence_sc_cluster(); }

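// Illustrative usage sketch (not part of this header): the split cluster
// barrier above lets a block signal its arrival, overlap independent work,
// and only then wait for the rest of the cluster. The work named below is
// hypothetical.
//
//   __cluster_barrier_arrive(); // this block's shared data is published
//   /* ... independent work that does not touch remote shared memory ... */
//   __cluster_barrier_wait();   // every block in the cluster has arrived
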
__device__ inline float2 atomicAdd(float2 *__ptr, float2 __val) {
  float2 __ret;
  __asm__("atom.add.v2.f32 {%0, %1}, [%2], {%3, %4};"
          : "=f"(__ret.x), "=f"(__ret.y)
          : "l"(__ptr), "f"(__val.x), "f"(__val.y));
  return __ret;
}

__device__ inline float2 atomicAdd_block(float2 *__ptr, float2 __val) {
  float2 __ret;
  __asm__("atom.cta.add.v2.f32 {%0, %1}, [%2], {%3, %4};"
          : "=f"(__ret.x), "=f"(__ret.y)
          : "l"(__ptr), "f"(__val.x), "f"(__val.y));
  return __ret;
}

__device__ inline float2 atomicAdd_system(float2 *__ptr, float2 __val) {
  float2 __ret;
  __asm__("atom.sys.add.v2.f32 {%0, %1}, [%2], {%3, %4};"
          : "=f"(__ret.x), "=f"(__ret.y)
          : "l"(__ptr), "f"(__val.x), "f"(__val.y));
  return __ret;
}

__device__ inline float4 atomicAdd(float4 *__ptr, float4 __val) {
  float4 __ret;
  __asm__("atom.add.v4.f32 {%0, %1, %2, %3}, [%4], {%5, %6, %7, %8};"
          : "=f"(__ret.x), "=f"(__ret.y), "=f"(__ret.z), "=f"(__ret.w)
          : "l"(__ptr), "f"(__val.x), "f"(__val.y), "f"(__val.z), "f"(__val.w));
  return __ret;
}

__device__ inline float4 atomicAdd_block(float4 *__ptr, float4 __val) {
  float4 __ret;
  __asm__(
      "atom.cta.add.v4.f32 {%0, %1, %2, %3}, [%4], {%5, %6, %7, %8};"
      : "=f"(__ret.x), "=f"(__ret.y), "=f"(__ret.z), "=f"(__ret.w)
      : "l"(__ptr), "f"(__val.x), "f"(__val.y), "f"(__val.z), "f"(__val.w));
  return __ret;
}

__device__ inline float4 atomicAdd_system(float4 *__ptr, float4 __val) {
  float4 __ret;
  __asm__(
      "atom.sys.add.v4.f32 {%0, %1, %2, %3}, [%4], {%5, %6, %7, %8};"
      : "=f"(__ret.x), "=f"(__ret.y), "=f"(__ret.z), "=f"(__ret.w)
      : "l"(__ptr), "f"(__val.x), "f"(__val.y), "f"(__val.z), "f"(__val.w)
      :);
  return __ret;
}

#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 900
#endif // CUDA_VERSION >= 11000

#endif // defined(__CLANG_CUDA_INTRINSICS_H__)