//===------ State.cpp - OpenMP State & ICV interface ------------- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
//===----------------------------------------------------------------------===//

#include "Shared/Environment.h"

#include "Allocator.h"
#include "Configuration.h"
#include "Debug.h"
#include "DeviceTypes.h"
#include "DeviceUtils.h"
#include "Interface.h"
#include "LibC.h"
#include "Mapping.h"
#include "State.h"
#include "Synchronization.h"

using namespace ompx;

#pragma omp begin declare target device_type(nohost)

/// Memory implementation
///
///{

/// External symbol to access dynamic shared memory.
[[gnu::aligned(
    allocator::ALIGNMENT)]] extern unsigned char DynamicSharedBuffer[];
#pragma omp allocate(DynamicSharedBuffer) allocator(omp_pteam_mem_alloc)

/// The kernel environment passed to the init method by the compiler.
static KernelEnvironmentTy *SHARED(KernelEnvironmentPtr);

/// The kernel launch environment passed as argument to the kernel by the
/// runtime.
static KernelLaunchEnvironmentTy *SHARED(KernelLaunchEnvironmentPtr);

///}
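
/// Note (illustrative, not part of the runtime contract): DynamicSharedBuffer
/// is the storage returned by memory::getDynamicBuffer() below and, through
/// the interface at the end of this file, by
/// llvm_omp_target_dynamic_shared_alloc(). Its size is provisioned by the host
/// runtime at kernel launch (for example via the LIBOMPTARGET_SHARED_MEMORY_SIZE
/// environment variable). A target region could use it as team-local scratch
/// roughly like this; `Scratch` is a hypothetical placeholder:
/// \code
///   int *Scratch = static_cast<int *>(llvm_omp_target_dynamic_shared_alloc());
///   // Every thread of the team sees the same buffer.
/// \endcode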

namespace {

/// Fallback implementations are missing to trigger a link time error.
/// Implementations for new devices, including the host, should go into a
/// dedicated begin/end declare variant.
///
///{
extern "C" {
#ifdef __AMDGPU__

[[gnu::weak]] void *malloc(size_t Size) { return allocator::alloc(Size); }
[[gnu::weak]] void free(void *Ptr) { allocator::free(Ptr); }

#else

[[gnu::weak, gnu::leaf]] void *malloc(size_t Size);
[[gnu::weak, gnu::leaf]] void free(void *Ptr);

#endif
}
///}

/// A "smart" stack in shared memory.
///
/// The stack exposes a malloc/free interface but works like a stack internally.
/// In fact, it is a separate stack *per thread*. That means, each thread must
/// push and pop symmetrically or this breaks, badly. The implementation will
/// (aim to) detect non-lock-step warps and fall back to malloc/free. The same
/// will happen if a thread runs out of memory. The main thread in generic mode
/// is special and is given more memory than the rest.
///
struct SharedMemorySmartStackTy {
  /// Initialize the stack. Must be called by all threads.
  void init(bool IsSPMD);

  /// Allocate \p Bytes on the stack for the encountering thread. Each thread
  /// can call this function.
  void *push(uint64_t Bytes);

  /// Deallocate the last allocation made by the encountering thread and
  /// pointed to by \p Ptr from the stack. Each thread can call this function.
  void pop(void *Ptr, uint64_t Bytes);

private:
  /// Compute the size of the storage space reserved for a thread.
  uint32_t computeThreadStorageTotal() {
    uint32_t NumLanesInBlock = mapping::getNumberOfThreadsInBlock();
    return utils::alignDown((state::SharedScratchpadSize / NumLanesInBlock),
                            allocator::ALIGNMENT);
  }

  /// Return the top address of the thread's data stack, that is the first
  /// address this thread will allocate memory at next.
  void *getThreadDataTop(uint32_t TId) {
    return &Data[computeThreadStorageTotal() * TId + Usage[TId]];
  }

  /// The actual storage, shared among all warps.
  [[gnu::aligned(
      allocator::ALIGNMENT)]] unsigned char Data[state::SharedScratchpadSize];
  [[gnu::aligned(
      allocator::ALIGNMENT)]] unsigned char Usage[mapping::MaxThreadsPerTeam];
};

static_assert(state::SharedScratchpadSize / mapping::MaxThreadsPerTeam <= 256,
              "Shared scratchpad of this size not supported yet.");

/// The allocation of a single shared memory scratchpad.
static SharedMemorySmartStackTy SHARED(SharedMemorySmartStack);

void SharedMemorySmartStackTy::init(bool IsSPMD) {
  Usage[mapping::getThreadIdInBlock()] = 0;
}

void *SharedMemorySmartStackTy::push(uint64_t Bytes) {
  // First align the number of requested bytes.
  /// FIXME: The stack shouldn't require worst-case padding. Alignment needs to
  /// be passed in as an argument and the stack rewritten to support it.
  uint64_t AlignedBytes = utils::alignPtr(Bytes, allocator::ALIGNMENT);

  uint32_t StorageTotal = computeThreadStorageTotal();

  // The main thread in generic mode gets the space of its entire warp as the
  // other threads do not participate in any computation at all.
  if (mapping::isMainThreadInGenericMode())
    StorageTotal *= mapping::getWarpSize();

  int TId = mapping::getThreadIdInBlock();
  if (Usage[TId] + AlignedBytes <= StorageTotal) {
    void *Ptr = getThreadDataTop(TId);
    Usage[TId] += AlignedBytes;
    return Ptr;
  }

  if (config::isDebugMode(DeviceDebugKind::CommonIssues))
    printf("Shared memory stack full, fallback to dynamic allocation of global "
           "memory will negatively impact performance.\n");
  void *GlobalMemory = memory::allocGlobal(
      AlignedBytes, "Slow path shared memory allocation, insufficient "
                    "shared memory stack memory!");
  ASSERT(GlobalMemory != nullptr, "nullptr returned by malloc!");

  return GlobalMemory;
}

void SharedMemorySmartStackTy::pop(void *Ptr, uint64_t Bytes) {
  uint64_t AlignedBytes = utils::alignPtr(Bytes, allocator::ALIGNMENT);
  if (utils::isSharedMemPtr(Ptr)) {
    int TId = mapping::getThreadIdInBlock();
    Usage[TId] -= AlignedBytes;
    return;
  }
  memory::freeGlobal(Ptr, "Slow path shared memory deallocation");
}

} // namespace
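
/// Note (illustrative sketch, not a prescribed API): the smart stack above only
/// works if every thread pairs each allocation with a matching deallocation in
/// reverse order. The compiler-emitted calls defined later in this file follow
/// that discipline; conceptually, with `P` and `Bytes` as placeholders:
/// \code
///   void *P = __kmpc_alloc_shared(Bytes); // push on this thread's partition
///   // ... use P within the same (nested) region ...
///   __kmpc_free_shared(P, Bytes);         // pop, must match the last push
/// \endcode
/// If the per-thread partition is exhausted, push() transparently falls back to
/// global memory, and pop() recognizes such pointers via utils::isSharedMemPtr()
/// and releases them with memory::freeGlobal().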

void *memory::getDynamicBuffer() { return DynamicSharedBuffer; }

void *memory::allocShared(uint64_t Bytes, const char *Reason) {
  return SharedMemorySmartStack.push(Bytes);
}

void memory::freeShared(void *Ptr, uint64_t Bytes, const char *Reason) {
  SharedMemorySmartStack.pop(Ptr, Bytes);
}

void *memory::allocGlobal(uint64_t Bytes, const char *Reason) {
  void *Ptr = malloc(Bytes);
  if (config::isDebugMode(DeviceDebugKind::CommonIssues) && Ptr == nullptr)
    printf("nullptr returned by malloc!\n");
  return Ptr;
}

void memory::freeGlobal(void *Ptr, const char *Reason) { free(Ptr); }

///}

bool state::ICVStateTy::operator==(const ICVStateTy &Other) const {
  return (NThreadsVar == Other.NThreadsVar) & (LevelVar == Other.LevelVar) &
         (ActiveLevelVar == Other.ActiveLevelVar) &
         (MaxActiveLevelsVar == Other.MaxActiveLevelsVar) &
         (RunSchedVar == Other.RunSchedVar) &
         (RunSchedChunkVar == Other.RunSchedChunkVar);
}

void state::ICVStateTy::assertEqual(const ICVStateTy &Other) const {
  ASSERT(NThreadsVar == Other.NThreadsVar, nullptr);
  ASSERT(LevelVar == Other.LevelVar, nullptr);
  ASSERT(ActiveLevelVar == Other.ActiveLevelVar, nullptr);
  ASSERT(MaxActiveLevelsVar == Other.MaxActiveLevelsVar, nullptr);
  ASSERT(RunSchedVar == Other.RunSchedVar, nullptr);
  ASSERT(RunSchedChunkVar == Other.RunSchedChunkVar, nullptr);
}

void state::TeamStateTy::init(bool IsSPMD) {
  ICVState.NThreadsVar = 0;
  ICVState.LevelVar = 0;
  ICVState.ActiveLevelVar = 0;
  ICVState.Padding0Val = 0;
  ICVState.MaxActiveLevelsVar = 1;
  ICVState.RunSchedVar = omp_sched_static;
  ICVState.RunSchedChunkVar = 1;
  ParallelTeamSize = 1;
  HasThreadState = false;
  ParallelRegionFnVar = nullptr;
}

bool state::TeamStateTy::operator==(const TeamStateTy &Other) const {
  return (ICVState == Other.ICVState) &
         (HasThreadState == Other.HasThreadState) &
         (ParallelTeamSize == Other.ParallelTeamSize);
}

void state::TeamStateTy::assertEqual(TeamStateTy &Other) const {
  ICVState.assertEqual(Other.ICVState);
  ASSERT(ParallelTeamSize == Other.ParallelTeamSize, nullptr);
  ASSERT(HasThreadState == Other.HasThreadState, nullptr);
}

state::TeamStateTy SHARED(ompx::state::TeamState);
state::ThreadStateTy **SHARED(ompx::state::ThreadStates);

namespace {

int returnValIfLevelIsActive(int Level, int Val, int DefaultVal,
                             int OutOfBoundsVal = -1) {
  if (Level == 0)
    return DefaultVal;
  int LevelVar = omp_get_level();
  if (OMP_UNLIKELY(Level < 0 || Level > LevelVar))
    return OutOfBoundsVal;
  int ActiveLevel = icv::ActiveLevel;
  if (OMP_UNLIKELY(Level != ActiveLevel))
    return DefaultVal;
  return Val;
}
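
/// Illustrative expectations for the helper above (assuming a single active
/// parallel region, i.e. omp_get_level() == 1 and icv::ActiveLevel == 1):
/// \code
///   omp_get_ancestor_thread_num(0); // DefaultVal: 0, the initial thread
///   omp_get_ancestor_thread_num(1); // Val: mapping::getThreadIdInBlock()
///   omp_get_team_size(2);           // OutOfBoundsVal: -1, no such level
/// \endcode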

} // namespace

void state::init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
                 KernelLaunchEnvironmentTy &KernelLaunchEnvironment) {
  SharedMemorySmartStack.init(IsSPMD);
  if (mapping::isInitialThreadInLevel0(IsSPMD)) {
    TeamState.init(IsSPMD);
    ThreadStates = nullptr;
    KernelEnvironmentPtr = &KernelEnvironment;
    KernelLaunchEnvironmentPtr = &KernelLaunchEnvironment;
  }
}

KernelEnvironmentTy &state::getKernelEnvironment() {
  return *KernelEnvironmentPtr;
}

KernelLaunchEnvironmentTy &state::getKernelLaunchEnvironment() {
  return *KernelLaunchEnvironmentPtr;
}

void state::enterDataEnvironment(IdentTy *Ident) {
  ASSERT(config::mayUseThreadStates(),
         "Thread state modified while explicitly disabled!");
  if (!config::mayUseThreadStates())
    return;

  unsigned TId = mapping::getThreadIdInBlock();
  ThreadStateTy *NewThreadState = static_cast<ThreadStateTy *>(
      memory::allocGlobal(sizeof(ThreadStateTy), "ThreadStates alloc"));
  uintptr_t *ThreadStatesBitsPtr = reinterpret_cast<uintptr_t *>(&ThreadStates);
  if (!atomic::load(ThreadStatesBitsPtr, atomic::seq_cst)) {
    uint32_t Bytes =
        sizeof(ThreadStates[0]) * mapping::getNumberOfThreadsInBlock();
    void *ThreadStatesPtr =
        memory::allocGlobal(Bytes, "Thread state array allocation");
    __builtin_memset(ThreadStatesPtr, 0, Bytes);
    if (!atomic::cas(ThreadStatesBitsPtr, uintptr_t(0),
                     reinterpret_cast<uintptr_t>(ThreadStatesPtr),
                     atomic::seq_cst, atomic::seq_cst))
      memory::freeGlobal(ThreadStatesPtr,
                         "Thread state array allocated multiple times");
    ASSERT(atomic::load(ThreadStatesBitsPtr, atomic::seq_cst),
           "Expected valid thread states bit!");
  }
  NewThreadState->init(ThreadStates[TId]);
  TeamState.HasThreadState = true;
  ThreadStates[TId] = NewThreadState;
}

void state::exitDataEnvironment() {
  ASSERT(config::mayUseThreadStates(),
         "Thread state modified while explicitly disabled!");

  unsigned TId = mapping::getThreadIdInBlock();
  resetStateForThread(TId);
}

void state::resetStateForThread(uint32_t TId) {
  if (!config::mayUseThreadStates())
    return;
  if (OMP_LIKELY(!TeamState.HasThreadState || !ThreadStates[TId]))
    return;

  ThreadStateTy *PreviousThreadState = ThreadStates[TId]->PreviousThreadState;
  memory::freeGlobal(ThreadStates[TId], "ThreadStates dealloc");
  ThreadStates[TId] = PreviousThreadState;
}

void state::runAndCheckState(void(Func(void))) {
  TeamStateTy OldTeamState = TeamState;
  OldTeamState.assertEqual(TeamState);

  Func();

  OldTeamState.assertEqual(TeamState);
}

void state::assumeInitialState(bool IsSPMD) {
  TeamStateTy InitialTeamState;
  InitialTeamState.init(IsSPMD);
  InitialTeamState.assertEqual(TeamState);
  ASSERT(mapping::isSPMDMode() == IsSPMD, nullptr);
}

int state::getEffectivePTeamSize() {
  int PTeamSize = state::ParallelTeamSize;
  return PTeamSize ? PTeamSize : mapping::getMaxTeamThreads();
}

extern "C" {
void omp_set_dynamic(int V) {}

int omp_get_dynamic(void) { return 0; }

void omp_set_num_threads(int V) { icv::NThreads = V; }

int omp_get_max_threads(void) {
  int NT = icv::NThreads;
  return NT > 0 ? NT : mapping::getMaxTeamThreads();
}

int omp_get_level(void) {
  int LevelVar = icv::Level;
  ASSERT(LevelVar >= 0, nullptr);
  return LevelVar;
}

int omp_get_active_level(void) { return !!icv::ActiveLevel; }

int omp_in_parallel(void) { return !!icv::ActiveLevel; }

void omp_get_schedule(omp_sched_t *ScheduleKind, int *ChunkSize) {
  *ScheduleKind = static_cast<omp_sched_t>((int)icv::RunSched);
  *ChunkSize = state::RunSchedChunk;
}

void omp_set_schedule(omp_sched_t ScheduleKind, int ChunkSize) {
  icv::RunSched = (int)ScheduleKind;
  state::RunSchedChunk = ChunkSize;
}

int omp_get_ancestor_thread_num(int Level) {
  return returnValIfLevelIsActive(Level, mapping::getThreadIdInBlock(), 0);
}

int omp_get_thread_num(void) {
  return omp_get_ancestor_thread_num(omp_get_level());
}

int omp_get_team_size(int Level) {
  return returnValIfLevelIsActive(Level, state::getEffectivePTeamSize(), 1);
}

int omp_get_num_threads(void) {
  return omp_get_level() != 1 ? 1 : state::getEffectivePTeamSize();
}

int omp_get_thread_limit(void) { return mapping::getMaxTeamThreads(); }

int omp_get_num_procs(void) { return mapping::getNumberOfProcessorElements(); }

void omp_set_nested(int) {}

int omp_get_nested(void) { return false; }

void omp_set_max_active_levels(int Levels) {
  icv::MaxActiveLevels = Levels > 0 ? 1 : 0;
}

int omp_get_max_active_levels(void) { return icv::MaxActiveLevels; }

omp_proc_bind_t omp_get_proc_bind(void) { return omp_proc_bind_false; }

int omp_get_num_places(void) { return 0; }

int omp_get_place_num_procs(int) { return omp_get_num_procs(); }

void omp_get_place_proc_ids(int, int *) {
  // TODO
}

int omp_get_place_num(void) { return 0; }

int omp_get_partition_num_places(void) { return 0; }

void omp_get_partition_place_nums(int *) {
  // TODO
}

int omp_get_cancellation(void) { return 0; }

void omp_set_default_device(int) {}

int omp_get_default_device(void) { return -1; }

int omp_get_num_devices(void) { return config::getNumDevices(); }

int omp_get_device_num(void) { return config::getDeviceNum(); }

int omp_get_num_teams(void) { return mapping::getNumberOfBlocksInKernel(); }

int omp_get_team_num() { return mapping::getBlockIdInKernel(); }

int omp_get_initial_device(void) { return -1; }

int omp_is_initial_device(void) { return 0; }
}

extern "C" {
[[clang::noinline]] void *__kmpc_alloc_shared(uint64_t Bytes) {
  return memory::allocShared(Bytes, "Frontend alloc shared");
}

[[clang::noinline]] void __kmpc_free_shared(void *Ptr, uint64_t Bytes) {
  memory::freeShared(Ptr, Bytes, "Frontend free shared");
}

void *__kmpc_get_dynamic_shared() { return memory::getDynamicBuffer(); }

void *llvm_omp_target_dynamic_shared_alloc() {
  return __kmpc_get_dynamic_shared();
}

void *llvm_omp_get_dynamic_shared() { return __kmpc_get_dynamic_shared(); }

/// Allocate storage in shared memory to communicate arguments from the main
/// thread to the workers in generic mode. If we exceed
/// NUM_SHARED_VARIABLES_IN_SHARED_MEM we will malloc space for communication.
constexpr uint64_t NUM_SHARED_VARIABLES_IN_SHARED_MEM = 64;

[[clang::loader_uninitialized]] static void
    *SharedMemVariableSharingSpace[NUM_SHARED_VARIABLES_IN_SHARED_MEM];
#pragma omp allocate(SharedMemVariableSharingSpace)                            \
    allocator(omp_pteam_mem_alloc)
[[clang::loader_uninitialized]] static void **SharedMemVariableSharingSpacePtr;
#pragma omp allocate(SharedMemVariableSharingSpacePtr)                         \
    allocator(omp_pteam_mem_alloc)
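
/// Sketch of the sharing protocol implemented below (illustrative ordering,
/// assuming generic mode with one main thread and its workers; `X` and `Y` are
/// hypothetical shared variables):
/// \code
///   // Main thread:
///   void **Args;
///   __kmpc_begin_sharing_variables(&Args, /*nArgs=*/2);
///   Args[0] = &X;
///   Args[1] = &Y;
///   // Workers, after being released by the main thread:
///   void **GlobalArgs;
///   __kmpc_get_shared_variables(&GlobalArgs);
///   // Main thread, once the parallel region has finished:
///   __kmpc_end_sharing_variables();
/// \endcode
/// Requests larger than NUM_SHARED_VARIABLES_IN_SHARED_MEM spill into global
/// memory, which __kmpc_end_sharing_variables() frees again.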

void __kmpc_begin_sharing_variables(void ***GlobalArgs, uint64_t nArgs) {
  if (nArgs <= NUM_SHARED_VARIABLES_IN_SHARED_MEM) {
    SharedMemVariableSharingSpacePtr = &SharedMemVariableSharingSpace[0];
  } else {
    SharedMemVariableSharingSpacePtr = (void **)memory::allocGlobal(
        nArgs * sizeof(void *), "new extended args");
    ASSERT(SharedMemVariableSharingSpacePtr != nullptr,
           "Nullptr returned by malloc!");
  }
  *GlobalArgs = SharedMemVariableSharingSpacePtr;
}

void __kmpc_end_sharing_variables() {
  if (SharedMemVariableSharingSpacePtr != &SharedMemVariableSharingSpace[0])
    memory::freeGlobal(SharedMemVariableSharingSpacePtr, "new extended args");
}

void __kmpc_get_shared_variables(void ***GlobalArgs) {
  *GlobalArgs = SharedMemVariableSharingSpacePtr;
}
}
#pragma omp end declare target