//===-------- State.h - OpenMP State & ICV interface ------------- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file declares the interface for the OpenMP device runtime state,
// including team/thread state storage and ICV access.
//
//===----------------------------------------------------------------------===//

#ifndef OMPTARGET_STATE_H
#define OMPTARGET_STATE_H

#include "Shared/Environment.h"

#include "Debug.h"
#include "DeviceTypes.h"
#include "DeviceUtils.h"
#include "Mapping.h"

// Forward declaration.
struct KernelEnvironmentTy;

#pragma omp begin declare target device_type(nohost)

namespace ompx {

namespace memory {

/// Allocate \p Size bytes in shared memory, if possible, for \p Reason.
///
/// Note: See the restrictions on __kmpc_alloc_shared for proper usage.
void *allocShared(uint64_t Size, const char *Reason);

/// Free \p Ptr, allocated via allocShared, for \p Reason.
///
/// Note: See the restrictions on __kmpc_free_shared for proper usage.
void freeShared(void *Ptr, uint64_t Bytes, const char *Reason);

/// Allocate \p Size bytes in global memory, if possible, for \p Reason.
void *allocGlobal(uint64_t Size, const char *Reason);

/// Return a pointer to the dynamic shared memory buffer.
void *getDynamicBuffer();

/// Free \p Ptr, allocated via allocGlobal, for \p Reason.
void freeGlobal(void *Ptr, const char *Reason);
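
/// Illustrative usage sketch (not part of this interface): allocations are
/// released with the matching free routine, and freeShared additionally takes
/// the original allocation size. The byte count and reason strings below are
/// placeholders.
///
/// \code
///   void *TeamBuf = memory::allocShared(256, "reduction scratch");
///   // ... use TeamBuf from the threads of the team ...
///   memory::freeShared(TeamBuf, 256, "reduction scratch");
///
///   void *GlobalBuf = memory::allocGlobal(256, "spilled state");
///   // ... use GlobalBuf ...
///   memory::freeGlobal(GlobalBuf, "spilled state");
/// \endcode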

} // namespace memory

namespace state {

inline constexpr uint32_t SharedScratchpadSize = SHARED_SCRATCHPAD_SIZE;

struct ICVStateTy {
  uint32_t NThreadsVar;        ///< nthreads-var ICV
  uint32_t LevelVar;           ///< levels-var ICV
  uint32_t ActiveLevelVar;     ///< active-levels-var ICV
  uint32_t Padding0Val;        ///< padding slot
  uint32_t MaxActiveLevelsVar; ///< max-active-levels-var ICV
  uint32_t RunSchedVar;        ///< run-sched-var ICV (schedule kind)
  uint32_t RunSchedChunkVar;   ///< run-sched-var ICV (chunk size)

  bool operator==(const ICVStateTy &Other) const;

  void assertEqual(const ICVStateTy &Other) const;
};

struct TeamStateTy {
  void init(bool IsSPMD);

  bool operator==(const TeamStateTy &) const;

  void assertEqual(TeamStateTy &Other) const;

  /// ICVs
  ///
  /// Preallocated storage for ICV values that are used if the threads have not
  /// set a custom default. The latter is supported but unlikely and slow(er).
  ///
  ///{
  ICVStateTy ICVState;
  ///}

  uint32_t ParallelTeamSize;
  uint32_t HasThreadState;
  ParallelRegionFnTy ParallelRegionFnVar;
};

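// Note: TeamState (and ThreadStates below) are placed with the
// omp_pteam_mem_alloc allocator, which is expected to map them to team-local
// memory so that all threads of a team observe the same instance.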
extern TeamStateTy TeamState;
#pragma omp allocate(TeamState) allocator(omp_pteam_mem_alloc)

struct ThreadStateTy {

  /// ICVs have preallocated storage in the TeamStateTy which is used if a
  /// thread has not set a custom value. The latter is supported but unlikely.
  /// When it happens we will allocate dynamic memory to hold the values of all
  /// ICVs. Thus, the first time an ICV is set by a thread we will allocate an
  /// ICV struct to hold them all. This is slower than alternatives but allows
  /// users to pay only for what they use.
  ///
  state::ICVStateTy ICVState;

  ThreadStateTy *PreviousThreadState;

  void init() {
    ICVState = TeamState.ICVState;
    PreviousThreadState = nullptr;
  }

  void init(ThreadStateTy *PreviousTS) {
    ICVState = PreviousTS ? PreviousTS->ICVState : TeamState.ICVState;
    PreviousThreadState = PreviousTS;
  }
};

extern ThreadStateTy **ThreadStates;
#pragma omp allocate(ThreadStates) allocator(omp_pteam_mem_alloc)

/// Initialize the state machinery. Must be called by all threads.
void init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
          KernelLaunchEnvironmentTy &KernelLaunchEnvironment);

/// Return the kernel and kernel launch environment associated with the current
/// kernel. The former is static and contains compile time information that
/// holds for all instances of the kernel. The latter is dynamic and provides
/// per-launch information.
KernelEnvironmentTy &getKernelEnvironment();
KernelLaunchEnvironmentTy &getKernelLaunchEnvironment();

/// TODO
enum ValueKind {
  VK_NThreads,
  VK_Level,
  VK_ActiveLevel,
  VK_MaxActiveLevels,
  VK_RunSched,
  // ---
  VK_RunSchedChunk,
  VK_ParallelRegionFn,
  VK_ParallelTeamSize,
  VK_HasThreadState,
};

/// TODO
void enterDataEnvironment(IdentTy *Ident);

/// TODO
void exitDataEnvironment();

/// TODO
struct DateEnvironmentRAII {
  DateEnvironmentRAII(IdentTy *Ident) { enterDataEnvironment(Ident); }
  ~DateEnvironmentRAII() { exitDataEnvironment(); }
};

/// TODO
void resetStateForThread(uint32_t TId);

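// The two lookup helpers below share one precedence rule: a value is served
// from a thread's private ICVState only if thread states may be used, the team
// has recorded one (HasThreadState), and this thread actually owns one;
// otherwise the preallocated TeamState.ICVState is used. The modify path
// additionally materializes a missing ThreadStateTy on demand once the team
// has thread states.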
// FIXME: https://github.com/llvm/llvm-project/issues/123241.
#define lookupForModify32Impl(Member, Ident, ForceTeamState)                   \
  {                                                                            \
    if (OMP_LIKELY(ForceTeamState || !config::mayUseThreadStates() ||          \
                   !TeamState.HasThreadState))                                 \
      return TeamState.ICVState.Member;                                        \
    uint32_t TId = mapping::getThreadIdInBlock();                              \
    if (OMP_UNLIKELY(!ThreadStates[TId])) {                                    \
      ThreadStates[TId] = reinterpret_cast<ThreadStateTy *>(                   \
          memory::allocGlobal(sizeof(ThreadStateTy),                           \
                              "ICV modification outside data environment"));   \
      ASSERT(ThreadStates[TId] != nullptr, "Nullptr returned by malloc!");     \
      TeamState.HasThreadState = true;                                         \
      ThreadStates[TId]->init();                                               \
    }                                                                          \
    return ThreadStates[TId]->ICVState.Member;                                 \
  }

// FIXME: https://github.com/llvm/llvm-project/issues/123241.
#define lookupImpl(Member, ForceTeamState)                                     \
  {                                                                            \
    auto TId = mapping::getThreadIdInBlock();                                  \
    if (OMP_UNLIKELY(!ForceTeamState && config::mayUseThreadStates() &&        \
                     TeamState.HasThreadState && ThreadStates[TId]))           \
      return ThreadStates[TId]->ICVState.Member;                               \
    return TeamState.ICVState.Member;                                          \
  }

[[gnu::always_inline, gnu::flatten]] inline uint32_t &
lookup32(ValueKind Kind, bool IsReadonly, IdentTy *Ident, bool ForceTeamState) {
  switch (Kind) {
  case state::VK_NThreads:
    if (IsReadonly)
      lookupImpl(NThreadsVar, ForceTeamState);
    lookupForModify32Impl(NThreadsVar, Ident, ForceTeamState);
  case state::VK_Level:
    if (IsReadonly)
      lookupImpl(LevelVar, ForceTeamState);
    lookupForModify32Impl(LevelVar, Ident, ForceTeamState);
  case state::VK_ActiveLevel:
    if (IsReadonly)
      lookupImpl(ActiveLevelVar, ForceTeamState);
    lookupForModify32Impl(ActiveLevelVar, Ident, ForceTeamState);
  case state::VK_MaxActiveLevels:
    if (IsReadonly)
      lookupImpl(MaxActiveLevelsVar, ForceTeamState);
    lookupForModify32Impl(MaxActiveLevelsVar, Ident, ForceTeamState);
  case state::VK_RunSched:
    if (IsReadonly)
      lookupImpl(RunSchedVar, ForceTeamState);
    lookupForModify32Impl(RunSchedVar, Ident, ForceTeamState);
  case state::VK_RunSchedChunk:
    if (IsReadonly)
      lookupImpl(RunSchedChunkVar, ForceTeamState);
    lookupForModify32Impl(RunSchedChunkVar, Ident, ForceTeamState);
  case state::VK_ParallelTeamSize:
    return TeamState.ParallelTeamSize;
  case state::VK_HasThreadState:
    return TeamState.HasThreadState;
  default:
    break;
  }
  __builtin_unreachable();
}

[[gnu::always_inline, gnu::flatten]] inline void *&
lookupPtr(ValueKind Kind, bool IsReadonly, bool ForceTeamState) {
  switch (Kind) {
  case state::VK_ParallelRegionFn:
    return TeamState.ParallelRegionFnVar;
  default:
    break;
  }
  __builtin_unreachable();
}

/// A class without actual state used to provide a nice interface to lookup and
/// update ICV values we can declare in global scope.
template <typename Ty, ValueKind Kind> struct Value {
  [[gnu::flatten, gnu::always_inline]] operator Ty() {
    return lookup(/*IsReadonly=*/true, /*IdentTy=*/nullptr,
                  /*ForceTeamState=*/false);
  }

  [[gnu::flatten, gnu::always_inline]] Value &operator=(const Ty &Other) {
    set(Other, /*IdentTy=*/nullptr);
    return *this;
  }

  [[gnu::flatten, gnu::always_inline]] Value &operator++() {
    inc(1, /*IdentTy=*/nullptr);
    return *this;
  }

  [[gnu::flatten, gnu::always_inline]] Value &operator--() {
    inc(-1, /*IdentTy=*/nullptr);
    return *this;
  }

  [[gnu::flatten, gnu::always_inline]] void
  assert_eq(const Ty &V, IdentTy *Ident = nullptr,
            bool ForceTeamState = false) {
    ASSERT(lookup(/*IsReadonly=*/true, Ident, ForceTeamState) == V, nullptr);
  }

private:
  [[gnu::flatten, gnu::always_inline]] Ty &
  lookup(bool IsReadonly, IdentTy *Ident, bool ForceTeamState) {
    Ty &t = lookup32(Kind, IsReadonly, Ident, ForceTeamState);
    return t;
  }

  [[gnu::flatten, gnu::always_inline]] Ty &inc(int UpdateVal, IdentTy *Ident) {
    return (lookup(/*IsReadonly=*/false, Ident, /*ForceTeamState=*/false) +=
            UpdateVal);
  }

  [[gnu::flatten, gnu::always_inline]] Ty &set(Ty UpdateVal, IdentTy *Ident) {
    return (lookup(/*IsReadonly=*/false, Ident, /*ForceTeamState=*/false) =
                UpdateVal);
  }

  template <typename VTy, typename Ty2> friend struct ValueRAII;
};
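
/// Illustrative usage sketch for the Value proxies declared below and in the
/// icv namespace: reads go through the conversion operator, updates through
/// the assignment and increment operators. The concrete values are arbitrary.
///
/// \code
///   uint32_t N = icv::NThreads; // read, forwards to lookup32(VK_NThreads, ...)
///   icv::NThreads = 128;        // write through operator=
///   ++icv::Level;               // update through operator++
/// \endcode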

/// A class without actual state used to provide a nice interface to lookup and
/// update ICV values we can declare in global scope.
template <typename Ty, ValueKind Kind> struct PtrValue {
  [[gnu::flatten, gnu::always_inline]] operator Ty() {
    return lookup(/*IsReadonly=*/true, /*IdentTy=*/nullptr,
                  /*ForceTeamState=*/false);
  }

  [[gnu::flatten, gnu::always_inline]] PtrValue &operator=(const Ty Other) {
    set(Other);
    return *this;
  }

private:
  Ty &lookup(bool IsReadonly, IdentTy *, bool ForceTeamState) {
    return lookupPtr(Kind, IsReadonly, ForceTeamState);
  }

  Ty &set(Ty UpdateVal) {
    return (lookup(/*IsReadonly=*/false, /*IdentTy=*/nullptr,
                   /*ForceTeamState=*/false) = UpdateVal);
  }

  template <typename VTy, typename Ty2> friend struct ValueRAII;
};

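/// RAII helper that, while \p Active, overwrites a Value/PtrValue with
/// \p NewValue on construction and restores \p OldValue on destruction. The
/// current value is asserted to equal \p OldValue on entry.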
template <typename VTy, typename Ty> struct ValueRAII {
  ValueRAII(VTy &V, Ty NewValue, Ty OldValue, bool Active, IdentTy *Ident,
            bool ForceTeamState = false)
      : Ptr(Active ? &V.lookup(/*IsReadonly=*/false, Ident, ForceTeamState)
                   : (Ty *)utils::UndefPtr),
        Val(OldValue), Active(Active) {
    if (!Active)
      return;
    ASSERT(*Ptr == OldValue, "ValueRAII initialization with wrong old value!");
    *Ptr = NewValue;
  }
  ~ValueRAII() {
    if (Active)
      *Ptr = Val;
  }

private:
  Ty *Ptr;
  Ty Val;
  bool Active;
};
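
/// Illustrative usage sketch: temporarily force a state value for the dynamic
/// extent of a scope. The concrete values and the null Ident are placeholders.
///
/// \code
///   {
///     state::ValueRAII TeamSizeRAII(state::ParallelTeamSize, /*NewValue=*/32u,
///                                   /*OldValue=*/1u, /*Active=*/true,
///                                   /*Ident=*/nullptr);
///     // ... state::ParallelTeamSize reads as 32 in here ...
///   } // the destructor restores the old value (1)
/// \endcode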

/// TODO
inline state::Value<uint32_t, state::VK_RunSchedChunk> RunSchedChunk;

/// TODO
inline state::Value<uint32_t, state::VK_ParallelTeamSize> ParallelTeamSize;

/// TODO
inline state::Value<uint32_t, state::VK_HasThreadState> HasThreadState;

/// TODO
inline state::PtrValue<ParallelRegionFnTy, state::VK_ParallelRegionFn>
    ParallelRegionFn;

void runAndCheckState(void(Func(void)));

void assumeInitialState(bool IsSPMD);

/// Return the value of the ParallelTeamSize ICV.
int getEffectivePTeamSize();

} // namespace state

namespace icv {

/// TODO
inline state::Value<uint32_t, state::VK_NThreads> NThreads;

/// TODO
inline state::Value<uint32_t, state::VK_Level> Level;

/// The `active-level` describes which of the parallel levels counted with the
/// `level-var` is active. There can only be one.
///
/// active-level-var is 1 if ActiveLevelVar is not 0, otherwise it is 0.
inline state::Value<uint32_t, state::VK_ActiveLevel> ActiveLevel;

/// TODO
inline state::Value<uint32_t, state::VK_MaxActiveLevels> MaxActiveLevels;

/// TODO
inline state::Value<uint32_t, state::VK_RunSched> RunSched;

} // namespace icv

} // namespace ompx

#pragma omp end declare target

#endif