//===-- amdgpuintrin.h - AMDGPU intrinsic functions -----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef __AMDGPUINTRIN_H
#define __AMDGPUINTRIN_H

#ifndef __AMDGPU__
#error "This file is intended for AMDGPU targets or offloading to AMDGPU"
#endif

#include <stdint.h>

#if !defined(__cplusplus)
_Pragma("push_macro(\"bool\")");
#define bool _Bool
#endif

_Pragma("omp begin declare target device_type(nohost)");
_Pragma("omp begin declare variant match(device = {arch(amdgcn)})");

// Fallback so the header is self-contained if _DEFAULT_FN_ATTRS is not
// provided by an including header.
#if !defined(_DEFAULT_FN_ATTRS)
#if defined(__HIP__) || defined(__CUDA__)
#define _DEFAULT_FN_ATTRS __attribute__((device))
#else
#define _DEFAULT_FN_ATTRS
#endif
#endif

// Type aliases to the address spaces used by the AMDGPU backend.
#define __gpu_private __attribute__((address_space(5)))
#define __gpu_constant __attribute__((address_space(4)))
#define __gpu_local __attribute__((address_space(3)))
#define __gpu_global __attribute__((address_space(1)))
#define __gpu_generic __attribute__((address_space(0)))

// Attribute to declare a function as a kernel.
#define __gpu_kernel __attribute__((amdgpu_kernel, visibility("protected")))

// Returns the number of workgroups in the 'x' dimension of the grid.
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_x(void) {
  return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();
}

// Returns the number of workgroups in the 'y' dimension of the grid.
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_y(void) {
  return __builtin_amdgcn_grid_size_y() / __builtin_amdgcn_workgroup_size_y();
}

// Returns the number of workgroups in the 'z' dimension of the grid.
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_z(void) {
  return __builtin_amdgcn_grid_size_z() / __builtin_amdgcn_workgroup_size_z();
}

// Returns the 'x' dimension of the current AMD workgroup's id.
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_x(void) {
  return __builtin_amdgcn_workgroup_id_x();
}

// Returns the 'y' dimension of the current AMD workgroup's id.
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_y(void) {
  return __builtin_amdgcn_workgroup_id_y();
}

// Returns the 'z' dimension of the current AMD workgroup's id.
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_z(void) {
  return __builtin_amdgcn_workgroup_id_z();
}

// Returns the number of workitems in the 'x' dimension.
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_x(void) {
  return __builtin_amdgcn_workgroup_size_x();
}

// Returns the number of workitems in the 'y' dimension.
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_y(void) {
  return __builtin_amdgcn_workgroup_size_y();
}

// Returns the number of workitems in the 'z' dimension.
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_z(void) {
  return __builtin_amdgcn_workgroup_size_z();
}

// Returns the 'x' dimension id of the workitem in the current AMD workgroup.
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_x(void) {
  return __builtin_amdgcn_workitem_id_x();
}
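
// Illustrative sketch, not part of the __gpu_* interface: the block and
// thread queries above compose into a flattened global workitem id in the
// 'x' dimension. The helper name below is hypothetical.
_DEFAULT_FN_ATTRS static __inline__ uint32_t __example_global_thread_id_x(void) {
  return __gpu_block_id_x() * __gpu_num_threads_x() + __gpu_thread_id_x();
}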

// Returns the 'y' dimension id of the workitem in the current AMD workgroup.
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_y(void) {
  return __builtin_amdgcn_workitem_id_y();
}

// Returns the 'z' dimension id of the workitem in the current AMD workgroup.
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_z(void) {
  return __builtin_amdgcn_workitem_id_z();
}

// Returns the size of an AMD wavefront, either 32 or 64 depending on hardware
// and compilation options.
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_lanes(void) {
  return __builtin_amdgcn_wavefrontsize();
}

// Returns the id of the thread inside of an AMD wavefront executing together.
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_lane_id(void) {
  return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));
}

// Returns the bit-mask of active threads in the current wavefront.
_DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_lane_mask(void) {
  return __builtin_amdgcn_read_exec();
}

// Copies the value from the first active thread in the wavefront to the rest.
_DEFAULT_FN_ATTRS static __inline__ uint32_t
__gpu_read_first_lane_u32(uint64_t __lane_mask, uint32_t __x) {
  return __builtin_amdgcn_readfirstlane(__x);
}

// Copies the value from the first active thread in the wavefront to the rest.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x) {
  uint32_t __hi = (uint32_t)(__x >> 32ull);
  uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFF);
  return ((uint64_t)__builtin_amdgcn_readfirstlane(__hi) << 32ull) |
         ((uint64_t)__builtin_amdgcn_readfirstlane(__lo));
}

// Returns a bitmask of the threads in the current wavefront for which \p x is
// true.
_DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_ballot(uint64_t __lane_mask,
                                                          bool __x) {
  // Masking with __lane_mask gives the NVPTX semantics when __lane_mask is a
  // subset of the active threads.
  return __lane_mask & __builtin_amdgcn_ballot_w64(__x);
}

// Waits for all the threads in the block to converge and issues a fence.
_DEFAULT_FN_ATTRS static __inline__ void __gpu_sync_threads(void) {
  __builtin_amdgcn_s_barrier();
  __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup");
}

// Waits for all threads in the wavefront to converge; this is a no-op on
// AMDGPU.
_DEFAULT_FN_ATTRS static __inline__ void __gpu_sync_lane(uint64_t __lane_mask) {
  __builtin_amdgcn_wave_barrier();
}

// Shuffles the lanes inside the wavefront according to the given index.
_DEFAULT_FN_ATTRS static __inline__ uint32_t
__gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x) {
  return __builtin_amdgcn_ds_bpermute(__idx << 2, __x);
}

// Shuffles the lanes inside the wavefront according to the given index.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x) {
  uint32_t __hi = (uint32_t)(__x >> 32ull);
  uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFF);
  return ((uint64_t)__builtin_amdgcn_ds_bpermute(__idx << 2, __hi) << 32ull) |
         ((uint64_t)__builtin_amdgcn_ds_bpermute(__idx << 2, __lo));
}
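
// Illustrative sketch, not part of the __gpu_* interface: a wavefront-wide
// sum built from the shuffle and lane queries above. It assumes every lane in
// the wavefront is active; the helper name is hypothetical.
_DEFAULT_FN_ATTRS static __inline__ uint32_t
__example_lane_sum_u32(uint32_t __x) {
  // Butterfly reduction: halve the stride each step until every lane holds
  // the full sum. The wavefront size is always a power of two (32 or 64).
  for (uint32_t __step = __gpu_num_lanes() / 2; __step > 0; __step /= 2)
    __x += __gpu_shuffle_idx_u32(__gpu_lane_mask(), __gpu_lane_id() ^ __step,
                                 __x);
  return __x;
}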

// Returns true if the flat pointer points to AMDGPU 'shared' memory.
_DEFAULT_FN_ATTRS static __inline__ bool __gpu_is_ptr_local(void *ptr) {
  return __builtin_amdgcn_is_shared((void [[clang::address_space(0)]] *)((
      void [[clang::opencl_generic]] *)ptr));
}

// Returns true if the flat pointer points to AMDGPU 'private' memory.
_DEFAULT_FN_ATTRS static __inline__ bool __gpu_is_ptr_private(void *ptr) {
  return __builtin_amdgcn_is_private((void [[clang::address_space(0)]] *)((
      void [[clang::opencl_generic]] *)ptr));
}

// Terminates execution of the associated wavefront.
_DEFAULT_FN_ATTRS [[noreturn]] static __inline__ void __gpu_exit(void) {
  __builtin_amdgcn_endpgm();
}

// Suspend the thread briefly to assist the scheduler during busy loops.
_DEFAULT_FN_ATTRS static __inline__ void __gpu_thread_suspend(void) {
  __builtin_amdgcn_s_sleep(2);
}

_Pragma("omp end declare variant");
_Pragma("omp end declare target");

#if !defined(__cplusplus)
_Pragma("pop_macro(\"bool\")");
#endif

#endif // __AMDGPUINTRIN_H
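
// Usage sketch (illustrative only; the helper below is a hypothetical example,
// not part of this header): a polite spin-wait that yields to the scheduler
// between polls of a flag by calling __gpu_thread_suspend().
//
//   _DEFAULT_FN_ATTRS static __inline__ void
//   __example_spin_wait(volatile uint32_t *__flag) {
//     while (__atomic_load_n(__flag, __ATOMIC_ACQUIRE) == 0)
//       __gpu_thread_suspend();
//   }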