//===-- Shared memory RPC client / server utilities -------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_LIBC_SHARED_RPC_UTIL_H
#define LLVM_LIBC_SHARED_RPC_UTIL_H

#include <stddef.h>
#include <stdint.h>

#if (defined(__NVPTX__) || defined(__AMDGPU__)) &&                            \
    !((defined(__CUDA__) && !defined(__CUDA_ARCH__)) ||                       \
      (defined(__HIP__) && !defined(__HIP_DEVICE_COMPILE__)))
#include <gpuintrin.h>
#define RPC_TARGET_IS_GPU
#endif

// Workaround for missing __has_builtin in < GCC 10.
#ifndef __has_builtin
#define __has_builtin(x) 0
#endif

#ifndef RPC_ATTRS
#if defined(__CUDA__) || defined(__HIP__)
#define RPC_ATTRS __attribute__((host, device)) inline
#else
#define RPC_ATTRS inline
#endif
#endif

namespace rpc {

template <typename T> struct type_identity {
  using type = T;
};

template <class T, T v> struct type_constant {
  static inline constexpr T value = v;
};

template <class T> struct remove_reference : type_identity<T> {};
template <class T> struct remove_reference<T &> : type_identity<T> {};
template <class T> struct remove_reference<T &&> : type_identity<T> {};

template <class T> struct is_const : type_constant<bool, false> {};
template <class T> struct is_const<const T> : type_constant<bool, true> {};

/// Freestanding implementation of std::move.
template <class T>
RPC_ATTRS constexpr typename remove_reference<T>::type &&move(T &&t) {
  return static_cast<typename remove_reference<T>::type &&>(t);
}

/// Freestanding implementation of std::forward.
template <typename T>
RPC_ATTRS constexpr T &&forward(typename remove_reference<T>::type &value) {
  return static_cast<T &&>(value);
}
template <typename T>
RPC_ATTRS constexpr T &&forward(typename remove_reference<T>::type &&value) {
  return static_cast<T &&>(value);
}

struct in_place_t {
  RPC_ATTRS explicit in_place_t() = default;
};

struct nullopt_t {
  RPC_ATTRS constexpr explicit nullopt_t() = default;
};

constexpr inline in_place_t in_place{};
constexpr inline nullopt_t nullopt{};

/// Freestanding and minimal implementation of std::optional.
template <typename T> class optional {
  template <typename U> struct OptionalStorage {
    union {
      char empty;
      U stored_value;
    };

    bool in_use = false;

    RPC_ATTRS ~OptionalStorage() { reset(); }

    RPC_ATTRS constexpr OptionalStorage() : empty() {}

    template <typename... Args>
    RPC_ATTRS constexpr explicit OptionalStorage(in_place_t, Args &&...args)
        : stored_value(forward<Args>(args)...) {}

    RPC_ATTRS constexpr void reset() {
      if (in_use)
        stored_value.~U();
      in_use = false;
    }
  };

  OptionalStorage<T> storage;

public:
  RPC_ATTRS constexpr optional() = default;
  RPC_ATTRS constexpr optional(nullopt_t) {}

  RPC_ATTRS constexpr optional(const T &t) : storage(in_place, t) {
    storage.in_use = true;
  }
  RPC_ATTRS constexpr optional(const optional &) = default;

  RPC_ATTRS constexpr optional(T &&t) : storage(in_place, move(t)) {
    storage.in_use = true;
  }
  RPC_ATTRS constexpr optional(optional &&O) = default;

  // Assign through a temporary so the defaulted assignment updates both the
  // stored value and the `in_use` flag consistently.
  RPC_ATTRS constexpr optional &operator=(T &&t) {
    *this = optional(move(t));
    return *this;
  }
  RPC_ATTRS constexpr optional &operator=(optional &&) = default;

  RPC_ATTRS constexpr optional &operator=(const T &t) {
    *this = optional(t);
    return *this;
  }
  RPC_ATTRS constexpr optional &operator=(const optional &) = default;

  RPC_ATTRS constexpr void reset() { storage.reset(); }

  RPC_ATTRS constexpr const T &value() const & { return storage.stored_value; }

  RPC_ATTRS constexpr T &value() & { return storage.stored_value; }

  RPC_ATTRS constexpr explicit operator bool() const { return storage.in_use; }
  RPC_ATTRS constexpr bool has_value() const { return storage.in_use; }
  RPC_ATTRS constexpr const T *operator->() const {
    return &storage.stored_value;
  }
  RPC_ATTRS constexpr T *operator->() { return &storage.stored_value; }
  RPC_ATTRS constexpr const T &operator*() const & {
    return storage.stored_value;
  }
  RPC_ATTRS constexpr T &operator*() & { return storage.stored_value; }

  RPC_ATTRS constexpr T &&value() && { return move(storage.stored_value); }
  RPC_ATTRS constexpr T &&operator*() && { return move(storage.stored_value); }
};
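// A minimal usage sketch of the optional above. The `try_recv_opcode` helper
// is hypothetical and shown purely for illustration; it is not part of this
// header.
//
//   RPC_ATTRS rpc::optional<uint32_t> try_recv_opcode(bool ready,
//                                                     uint32_t opcode) {
//     if (!ready)
//       return rpc::nullopt;
//     return opcode; // Implicit conversion through optional(const T &).
//   }
//
//   // if (rpc::optional<uint32_t> op = try_recv_opcode(ready, raw))
//   //   handle(*op);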
/// Suspend the thread briefly to assist the thread scheduler during busy
/// loops.
RPC_ATTRS void sleep_briefly() {
#if __has_builtin(__nvvm_reflect)
  if (__nvvm_reflect("__CUDA_ARCH") >= 700)
    asm("nanosleep.u32 64;" ::: "memory");
#elif __has_builtin(__builtin_amdgcn_s_sleep)
  __builtin_amdgcn_s_sleep(2);
#elif __has_builtin(__builtin_ia32_pause)
  __builtin_ia32_pause();
#elif __has_builtin(__builtin_arm_isb)
  __builtin_arm_isb(0xf);
#else
  // Simply do nothing if sleeping isn't supported on this platform.
#endif
}

/// Conditional to indicate if this process is running on the GPU.
RPC_ATTRS constexpr bool is_process_gpu() {
#ifdef RPC_TARGET_IS_GPU
  return true;
#else
  return false;
#endif
}

/// Wait for all lanes in the group to complete.
RPC_ATTRS void sync_lane([[maybe_unused]] uint64_t lane_mask) {
#ifdef RPC_TARGET_IS_GPU
  return __gpu_sync_lane(lane_mask);
#endif
}

/// Copies the value from the first active thread to the rest.
RPC_ATTRS uint32_t broadcast_value([[maybe_unused]] uint64_t lane_mask,
                                   uint32_t x) {
#ifdef RPC_TARGET_IS_GPU
  return __gpu_read_first_lane_u32(lane_mask, x);
#else
  return x;
#endif
}

/// Returns the number of lanes that participate in the RPC interface.
RPC_ATTRS uint32_t get_num_lanes() {
#ifdef RPC_TARGET_IS_GPU
  return __gpu_num_lanes();
#else
  return 1;
#endif
}

/// Returns a bitmask of the lanes in the warp or wavefront that are currently
/// executing together.
RPC_ATTRS uint64_t get_lane_mask() {
#ifdef RPC_TARGET_IS_GPU
  return __gpu_lane_mask();
#else
  return 1;
#endif
}

/// Returns the id of the thread inside of an AMD wavefront executing together.
RPC_ATTRS uint32_t get_lane_id() {
#ifdef RPC_TARGET_IS_GPU
  return __gpu_lane_id();
#else
  return 0;
#endif
}

/// Conditional that is true for only a single thread in the lane group.
RPC_ATTRS bool is_first_lane([[maybe_unused]] uint64_t lane_mask) {
#ifdef RPC_TARGET_IS_GPU
  return __gpu_is_first_in_lane(lane_mask);
#else
  return true;
#endif
}

/// Returns a bitmask of threads in the current lane group for which \p x is
/// true.
RPC_ATTRS uint64_t ballot([[maybe_unused]] uint64_t lane_mask, bool x) {
#ifdef RPC_TARGET_IS_GPU
  return __gpu_ballot(lane_mask, x);
#else
  return x;
#endif
}

/// Return \p val aligned "upwards" according to \p align.
template <typename V, typename A>
RPC_ATTRS constexpr V align_up(V val, A align) {
  return ((val + V(align) - 1) / V(align)) * V(align);
}
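// Sketch of how the lane helpers above compose. The `uniform_fetch_add`
// function is hypothetical and for illustration only: the first active lane
// performs the update, and the result is broadcast so every lane observes the
// same value. On the CPU each helper degenerates to a single lane.
//
//   RPC_ATTRS uint32_t uniform_fetch_add(uint32_t *counter) {
//     uint64_t mask = rpc::get_lane_mask();
//     uint32_t value = 0;
//     if (rpc::is_first_lane(mask))
//       value = (*counter)++;                    // Only one lane mutates.
//     value = rpc::broadcast_value(mask, value); // Make the result uniform.
//     rpc::sync_lane(mask);
//     return value;
//   }
//
// Likewise, align_up is plain integer arithmetic: align_up(13, 8) == 16 and
// align_up(16, 8) == 16.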
/// Utility to provide a unified interface between the CPU and GPU's memory
/// model. On the GPU stack variables are always private to a lane so we can
/// simply use the variable passed in. On the CPU we need to allocate enough
/// space for the whole lane group and index into it.
template <typename V> RPC_ATTRS V &lane_value(V *val, uint32_t id) {
  if constexpr (is_process_gpu())
    return *val;
  return val[id];
}

/// Advance the \p ptr by \p bytes.
template <typename T, typename U> RPC_ATTRS T *advance(T *ptr, U bytes) {
  if constexpr (is_const<T>::value)
    return reinterpret_cast<T *>(reinterpret_cast<const uint8_t *>(ptr) +
                                 bytes);
  else
    return reinterpret_cast<T *>(reinterpret_cast<uint8_t *>(ptr) + bytes);
}

/// Wrapper around the optimal memory copy implementation for the target.
RPC_ATTRS void rpc_memcpy(void *dst, const void *src, size_t count) {
  __builtin_memcpy(dst, src, count);
}

template <class T> RPC_ATTRS constexpr const T &max(const T &a, const T &b) {
  return (a < b) ? b : a;
}

} // namespace rpc

#endif // LLVM_LIBC_SHARED_RPC_UTIL_H