//===------ State.cpp - OpenMP State & ICV interface ------------- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the device-side OpenMP state and internal control
// variable (ICV) interface, together with the shared and global memory
// allocation routines it relies on.
//
//===----------------------------------------------------------------------===//

#include "Shared/Environment.h"

#include "Allocator.h"
#include "Configuration.h"
#include "Debug.h"
#include "DeviceTypes.h"
#include "DeviceUtils.h"
#include "Interface.h"
#include "LibC.h"
#include "Mapping.h"
#include "State.h"
#include "Synchronization.h"

using namespace ompx;

#pragma omp begin declare target device_type(nohost)

/// Memory implementation
///
///{

/// External symbol to access dynamic shared memory.
[[gnu::aligned(
    allocator::ALIGNMENT)]] extern unsigned char DynamicSharedBuffer[];
#pragma omp allocate(DynamicSharedBuffer) allocator(omp_pteam_mem_alloc)
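// Note: DynamicSharedBuffer backs memory::getDynamicBuffer() and, through it,
// the llvm_omp_target_dynamic_shared_alloc() entry point defined below. Its
// size is not fixed here; it is typically configured by the host runtime at
// kernel launch (e.g. via the LIBOMPTARGET_SHARED_MEMORY_SIZE environment
// variable).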

/// The kernel environment passed to the init method by the compiler.
static KernelEnvironmentTy *SHARED(KernelEnvironmentPtr);

/// The kernel launch environment passed as argument to the kernel by the
/// runtime.
static KernelLaunchEnvironmentTy *SHARED(KernelLaunchEnvironmentPtr);

///}

namespace {

/// Fallback implementations are intentionally missing so that their use
/// triggers a link-time error. Implementations for new devices, including the
/// host, should go into a dedicated begin/end declare variant.
///
///{
extern "C" {
#ifdef __AMDGPU__

[[gnu::weak]] void *malloc(size_t Size) { return allocator::alloc(Size); }
[[gnu::weak]] void free(void *Ptr) { allocator::free(Ptr); }

#else

[[gnu::weak, gnu::leaf]] void *malloc(size_t Size);
[[gnu::weak, gnu::leaf]] void free(void *Ptr);

#endif
}
///}

/// A "smart" stack in shared memory.
///
/// The stack exposes a malloc/free interface but works like a stack
/// internally. In fact, it is a separate stack *per thread*. That means each
/// thread must push and pop symmetrically or this breaks, badly. If a thread
/// runs out of its share of the scratchpad, the implementation falls back to
/// malloc/free in global memory. The main thread in generic mode is special
/// and is given the space of its entire warp, since the other threads of that
/// warp do not participate in any computation.
///
struct SharedMemorySmartStackTy {
  /// Initialize the stack. Must be called by all threads.
  void init(bool IsSPMD);

  /// Allocate \p Bytes on the stack for the encountering thread. Each thread
  /// can call this function.
  void *push(uint64_t Bytes);

  /// Deallocate the last allocation made by the encountering thread and
  /// pointed to by \p Ptr from the stack. Each thread can call this function.
  void pop(void *Ptr, uint64_t Bytes);

private:
  /// Compute the size of the storage space reserved for a thread.
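  /// For example (illustrative numbers only): a 16 KiB scratchpad shared by
  /// 256 threads in the block yields 64 bytes per thread, rounded down to the
  /// allocator alignment.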
  uint32_t computeThreadStorageTotal() {
    uint32_t NumLanesInBlock = mapping::getNumberOfThreadsInBlock();
    return utils::alignDown((state::SharedScratchpadSize / NumLanesInBlock),
                            allocator::ALIGNMENT);
  }

  /// Return the top address of the thread's data stack, that is the first
  /// address this thread will allocate memory at next.
  void *getThreadDataTop(uint32_t TId) {
    return &Data[computeThreadStorageTotal() * TId + Usage[TId]];
  }

  /// The actual storage, shared among all warps.
  [[gnu::aligned(
      allocator::ALIGNMENT)]] unsigned char Data[state::SharedScratchpadSize];
  [[gnu::aligned(
      allocator::ALIGNMENT)]] unsigned char Usage[mapping::MaxThreadsPerTeam];
};

static_assert(state::SharedScratchpadSize / mapping::MaxThreadsPerTeam <= 256,
              "Shared scratchpad of this size not supported yet.");
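
// Usage sketch (illustrative only): the runtime reaches this stack through
// memory::allocShared/freeShared and, ultimately, the __kmpc_alloc_shared and
// __kmpc_free_shared entry points below. Pushes and pops must be symmetric
// per thread:
//
//   void *Ptr = SharedMemorySmartStack.push(Bytes); // shared memory, or
//                                                   // global memory if full
//   // ... use Ptr ...
//   SharedMemorySmartStack.pop(Ptr, Bytes);         // must mirror the push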

/// The allocation of a single shared memory scratchpad.
static SharedMemorySmartStackTy SHARED(SharedMemorySmartStack);

void SharedMemorySmartStackTy::init(bool IsSPMD) {
  Usage[mapping::getThreadIdInBlock()] = 0;
}

void *SharedMemorySmartStackTy::push(uint64_t Bytes) {
  // First align the number of requested bytes.
  // FIXME: The stack shouldn't require worst-case padding. Alignment needs to
  // be passed in as an argument and the stack rewritten to support it.
  uint64_t AlignedBytes = utils::alignPtr(Bytes, allocator::ALIGNMENT);

  uint32_t StorageTotal = computeThreadStorageTotal();

  // The main thread in generic mode gets the space of its entire warp as the
  // other threads do not participate in any computation at all.
  if (mapping::isMainThreadInGenericMode())
    StorageTotal *= mapping::getWarpSize();

  int TId = mapping::getThreadIdInBlock();
  if (Usage[TId] + AlignedBytes <= StorageTotal) {
    void *Ptr = getThreadDataTop(TId);
    Usage[TId] += AlignedBytes;
    return Ptr;
  }

  if (config::isDebugMode(DeviceDebugKind::CommonIssues))
    printf("Shared memory stack full, fallback to dynamic allocation of global "
           "memory will negatively impact performance.\n");
  void *GlobalMemory = memory::allocGlobal(
      AlignedBytes, "Slow path shared memory allocation, insufficient "
                    "shared memory stack memory!");
  ASSERT(GlobalMemory != nullptr, "nullptr returned by malloc!");

  return GlobalMemory;
}

void SharedMemorySmartStackTy::pop(void *Ptr, uint64_t Bytes) {
  uint64_t AlignedBytes = utils::alignPtr(Bytes, allocator::ALIGNMENT);
  if (utils::isSharedMemPtr(Ptr)) {
    int TId = mapping::getThreadIdInBlock();
    Usage[TId] -= AlignedBytes;
    return;
  }
  memory::freeGlobal(Ptr, "Slow path shared memory deallocation");
}

} // namespace

void *memory::getDynamicBuffer() { return DynamicSharedBuffer; }

void *memory::allocShared(uint64_t Bytes, const char *Reason) {
  return SharedMemorySmartStack.push(Bytes);
}

void memory::freeShared(void *Ptr, uint64_t Bytes, const char *Reason) {
  SharedMemorySmartStack.pop(Ptr, Bytes);
}

void *memory::allocGlobal(uint64_t Bytes, const char *Reason) {
  void *Ptr = malloc(Bytes);
  if (config::isDebugMode(DeviceDebugKind::CommonIssues) && Ptr == nullptr)
    printf("nullptr returned by malloc!\n");
  return Ptr;
}

void memory::freeGlobal(void *Ptr, const char *Reason) { free(Ptr); }

///}

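// Note: the comparisons below use bitwise '&' rather than '&&', presumably to
// keep the evaluation branch-free (no short-circuiting) on the device.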
bool state::ICVStateTy::operator==(const ICVStateTy &Other) const {
  return (NThreadsVar == Other.NThreadsVar) & (LevelVar == Other.LevelVar) &
         (ActiveLevelVar == Other.ActiveLevelVar) &
         (MaxActiveLevelsVar == Other.MaxActiveLevelsVar) &
         (RunSchedVar == Other.RunSchedVar) &
         (RunSchedChunkVar == Other.RunSchedChunkVar);
}

void state::ICVStateTy::assertEqual(const ICVStateTy &Other) const {
  ASSERT(NThreadsVar == Other.NThreadsVar, nullptr);
  ASSERT(LevelVar == Other.LevelVar, nullptr);
  ASSERT(ActiveLevelVar == Other.ActiveLevelVar, nullptr);
  ASSERT(MaxActiveLevelsVar == Other.MaxActiveLevelsVar, nullptr);
  ASSERT(RunSchedVar == Other.RunSchedVar, nullptr);
  ASSERT(RunSchedChunkVar == Other.RunSchedChunkVar, nullptr);
}

void state::TeamStateTy::init(bool IsSPMD) {
  ICVState.NThreadsVar = 0;
  ICVState.LevelVar = 0;
  ICVState.ActiveLevelVar = 0;
  ICVState.Padding0Val = 0;
  ICVState.MaxActiveLevelsVar = 1;
  ICVState.RunSchedVar = omp_sched_static;
  ICVState.RunSchedChunkVar = 1;
  ParallelTeamSize = 1;
  HasThreadState = false;
  ParallelRegionFnVar = nullptr;
}

bool state::TeamStateTy::operator==(const TeamStateTy &Other) const {
  return (ICVState == Other.ICVState) &
         (HasThreadState == Other.HasThreadState) &
         (ParallelTeamSize == Other.ParallelTeamSize);
}

void state::TeamStateTy::assertEqual(TeamStateTy &Other) const {
  ICVState.assertEqual(Other.ICVState);
  ASSERT(ParallelTeamSize == Other.ParallelTeamSize, nullptr);
  ASSERT(HasThreadState == Other.HasThreadState, nullptr);
}

state::TeamStateTy SHARED(ompx::state::TeamState);
state::ThreadStateTy **SHARED(ompx::state::ThreadStates);

namespace {

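/// Helper for the level-based OpenMP queries below. For example,
/// omp_get_ancestor_thread_num(Level) maps to
/// returnValIfLevelIsActive(Level, mapping::getThreadIdInBlock(), 0): the
/// thread id is returned only if \p Level is the active level; level 0 and
/// inactive levels yield the default value, and out-of-range levels yield -1.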
int returnValIfLevelIsActive(int Level, int Val, int DefaultVal,
                             int OutOfBoundsVal = -1) {
  if (Level == 0)
    return DefaultVal;
  int LevelVar = omp_get_level();
  if (OMP_UNLIKELY(Level < 0 || Level > LevelVar))
    return OutOfBoundsVal;
  int ActiveLevel = icv::ActiveLevel;
  if (OMP_UNLIKELY(Level != ActiveLevel))
    return DefaultVal;
  return Val;
}

} // namespace

void state::init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
                 KernelLaunchEnvironmentTy &KernelLaunchEnvironment) {
  SharedMemorySmartStack.init(IsSPMD);
  if (mapping::isInitialThreadInLevel0(IsSPMD)) {
    TeamState.init(IsSPMD);
    ThreadStates = nullptr;
    KernelEnvironmentPtr = &KernelEnvironment;
    KernelLaunchEnvironmentPtr = &KernelLaunchEnvironment;
  }
}

KernelEnvironmentTy &state::getKernelEnvironment() {
  return *KernelEnvironmentPtr;
}

KernelLaunchEnvironmentTy &state::getKernelLaunchEnvironment() {
  return *KernelLaunchEnvironmentPtr;
}

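/// Note: enterDataEnvironment() and exitDataEnvironment() must be paired per
/// thread. Exiting frees the current thread state and restores the previous
/// one recorded via ThreadStateTy::PreviousThreadState (see
/// resetStateForThread below).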
void state::enterDataEnvironment(IdentTy *Ident) {
  ASSERT(config::mayUseThreadStates(),
         "Thread state modified while explicitly disabled!");
  if (!config::mayUseThreadStates())
    return;

  unsigned TId = mapping::getThreadIdInBlock();
  ThreadStateTy *NewThreadState = static_cast<ThreadStateTy *>(
      memory::allocGlobal(sizeof(ThreadStateTy), "ThreadStates alloc"));
  uintptr_t *ThreadStatesBitsPtr = reinterpret_cast<uintptr_t *>(&ThreadStates);
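  // Lazily allocate the ThreadStates array. Multiple threads may race to do
  // this; the first successful compare-and-swap publishes its array and the
  // losers free their redundant allocation.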
  if (!atomic::load(ThreadStatesBitsPtr, atomic::seq_cst)) {
    uint32_t Bytes =
        sizeof(ThreadStates[0]) * mapping::getNumberOfThreadsInBlock();
    void *ThreadStatesPtr =
        memory::allocGlobal(Bytes, "Thread state array allocation");
    __builtin_memset(ThreadStatesPtr, 0, Bytes);
    if (!atomic::cas(ThreadStatesBitsPtr, uintptr_t(0),
                     reinterpret_cast<uintptr_t>(ThreadStatesPtr),
                     atomic::seq_cst, atomic::seq_cst))
      memory::freeGlobal(ThreadStatesPtr,
                         "Thread state array allocated multiple times");
    ASSERT(atomic::load(ThreadStatesBitsPtr, atomic::seq_cst),
           "Expected valid thread states bit!");
  }
  NewThreadState->init(ThreadStates[TId]);
  TeamState.HasThreadState = true;
  ThreadStates[TId] = NewThreadState;
}

void state::exitDataEnvironment() {
  ASSERT(config::mayUseThreadStates(),
         "Thread state modified while explicitly disabled!");

  unsigned TId = mapping::getThreadIdInBlock();
  resetStateForThread(TId);
}

void state::resetStateForThread(uint32_t TId) {
  if (!config::mayUseThreadStates())
    return;
  if (OMP_LIKELY(!TeamState.HasThreadState || !ThreadStates[TId]))
    return;

  ThreadStateTy *PreviousThreadState = ThreadStates[TId]->PreviousThreadState;
  memory::freeGlobal(ThreadStates[TId], "ThreadStates dealloc");
  ThreadStates[TId] = PreviousThreadState;
}

void state::runAndCheckState(void(Func(void))) {
  TeamStateTy OldTeamState = TeamState;
  OldTeamState.assertEqual(TeamState);

  Func();

  OldTeamState.assertEqual(TeamState);
}

void state::assumeInitialState(bool IsSPMD) {
  TeamStateTy InitialTeamState;
  InitialTeamState.init(IsSPMD);
  InitialTeamState.assertEqual(TeamState);
  ASSERT(mapping::isSPMDMode() == IsSPMD, nullptr);
}

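/// Return the effective parallel team size; a stored ParallelTeamSize of 0 is
/// interpreted as "use the maximum number of threads available to the team".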
int state::getEffectivePTeamSize() {
  int PTeamSize = state::ParallelTeamSize;
  return PTeamSize ? PTeamSize : mapping::getMaxTeamThreads();
}

extern "C" {
void omp_set_dynamic(int V) {}

int omp_get_dynamic(void) { return 0; }

void omp_set_num_threads(int V) { icv::NThreads = V; }

int omp_get_max_threads(void) {
  int NT = icv::NThreads;
  return NT > 0 ? NT : mapping::getMaxTeamThreads();
}

int omp_get_level(void) {
  int LevelVar = icv::Level;
  ASSERT(LevelVar >= 0, nullptr);
  return LevelVar;
}

int omp_get_active_level(void) { return !!icv::ActiveLevel; }

int omp_in_parallel(void) { return !!icv::ActiveLevel; }

void omp_get_schedule(omp_sched_t *ScheduleKind, int *ChunkSize) {
  *ScheduleKind = static_cast<omp_sched_t>((int)icv::RunSched);
  *ChunkSize = state::RunSchedChunk;
}

void omp_set_schedule(omp_sched_t ScheduleKind, int ChunkSize) {
  icv::RunSched = (int)ScheduleKind;
  state::RunSchedChunk = ChunkSize;
}

int omp_get_ancestor_thread_num(int Level) {
  return returnValIfLevelIsActive(Level, mapping::getThreadIdInBlock(), 0);
}

int omp_get_thread_num(void) {
  return omp_get_ancestor_thread_num(omp_get_level());
}

int omp_get_team_size(int Level) {
  return returnValIfLevelIsActive(Level, state::getEffectivePTeamSize(), 1);
}

int omp_get_num_threads(void) {
  return omp_get_level() != 1 ? 1 : state::getEffectivePTeamSize();
}

int omp_get_thread_limit(void) { return mapping::getMaxTeamThreads(); }

int omp_get_num_procs(void) { return mapping::getNumberOfProcessorElements(); }

void omp_set_nested(int) {}

int omp_get_nested(void) { return false; }

void omp_set_max_active_levels(int Levels) {
  icv::MaxActiveLevels = Levels > 0 ? 1 : 0;
}

int omp_get_max_active_levels(void) { return icv::MaxActiveLevels; }

omp_proc_bind_t omp_get_proc_bind(void) { return omp_proc_bind_false; }

int omp_get_num_places(void) { return 0; }

int omp_get_place_num_procs(int) { return omp_get_num_procs(); }

void omp_get_place_proc_ids(int, int *) {
  // TODO
}

int omp_get_place_num(void) { return 0; }

int omp_get_partition_num_places(void) { return 0; }

void omp_get_partition_place_nums(int *) {
  // TODO
}

int omp_get_cancellation(void) { return 0; }

void omp_set_default_device(int) {}

int omp_get_default_device(void) { return -1; }

int omp_get_num_devices(void) { return config::getNumDevices(); }

int omp_get_device_num(void) { return config::getDeviceNum(); }

int omp_get_num_teams(void) { return mapping::getNumberOfBlocksInKernel(); }

int omp_get_team_num() { return mapping::getBlockIdInKernel(); }

int omp_get_initial_device(void) { return -1; }

int omp_is_initial_device(void) { return 0; }
}

extern "C" {
[[clang::noinline]] void *__kmpc_alloc_shared(uint64_t Bytes) {
  return memory::allocShared(Bytes, "Frontend alloc shared");
}

[[clang::noinline]] void __kmpc_free_shared(void *Ptr, uint64_t Bytes) {
  memory::freeShared(Ptr, Bytes, "Frontend free shared");
}

void *__kmpc_get_dynamic_shared() { return memory::getDynamicBuffer(); }

void *llvm_omp_target_dynamic_shared_alloc() {
  return __kmpc_get_dynamic_shared();
}

void *llvm_omp_get_dynamic_shared() { return __kmpc_get_dynamic_shared(); }
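
// A minimal usage sketch (illustrative only; the clause and environment
// variable mentioned here belong to the compiler and host runtime, not to
// this file):
//
//   #pragma omp target ompx_dyn_cgroup_mem(Bytes)
//   {
//     void *Scratch = llvm_omp_target_dynamic_shared_alloc();
//     // ... Bytes bytes of team-local scratch memory ...
//   }
//
// The buffer size is requested at launch, e.g. via the ompx_dyn_cgroup_mem
// clause or the LIBOMPTARGET_SHARED_MEMORY_SIZE environment variable.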

/// Allocate storage in shared memory to communicate arguments from the main
/// thread to the workers in generic mode. If we exceed
/// NUM_SHARED_VARIABLES_IN_SHARED_MEM we will malloc space for communication.
constexpr uint64_t NUM_SHARED_VARIABLES_IN_SHARED_MEM = 64;

[[clang::loader_uninitialized]] static void
    *SharedMemVariableSharingSpace[NUM_SHARED_VARIABLES_IN_SHARED_MEM];
#pragma omp allocate(SharedMemVariableSharingSpace)                            \
    allocator(omp_pteam_mem_alloc)
[[clang::loader_uninitialized]] static void **SharedMemVariableSharingSpacePtr;
#pragma omp allocate(SharedMemVariableSharingSpacePtr)                         \
    allocator(omp_pteam_mem_alloc)

void __kmpc_begin_sharing_variables(void ***GlobalArgs, uint64_t nArgs) {
  if (nArgs <= NUM_SHARED_VARIABLES_IN_SHARED_MEM) {
    SharedMemVariableSharingSpacePtr = &SharedMemVariableSharingSpace[0];
  } else {
    SharedMemVariableSharingSpacePtr = (void **)memory::allocGlobal(
        nArgs * sizeof(void *), "new extended args");
    ASSERT(SharedMemVariableSharingSpacePtr != nullptr,
           "Nullptr returned by malloc!");
  }
  *GlobalArgs = SharedMemVariableSharingSpacePtr;
}

void __kmpc_end_sharing_variables() {
  if (SharedMemVariableSharingSpacePtr != &SharedMemVariableSharingSpace[0])
    memory::freeGlobal(SharedMemVariableSharingSpacePtr, "new extended args");
}

void __kmpc_get_shared_variables(void ***GlobalArgs) {
  *GlobalArgs = SharedMemVariableSharingSpacePtr;
}
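
// A minimal sketch of how these entry points are used around a generic-mode
// parallel region (illustrative only): the main thread publishes the argument
// array, workers fetch it, and the main thread releases it afterwards.
//
//   void **Args;
//   __kmpc_begin_sharing_variables(&Args, /*nArgs=*/2); // main thread
//   Args[0] = &X; Args[1] = &Y;
//   // ... workers: __kmpc_get_shared_variables(&Args); ...
//   __kmpc_end_sharing_variables();                     // main thread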
}
#pragma omp end declare target