xref: /llvm-project/offload/DeviceRTL/src/Parallelism.cpp (revision 2d9f40694324a72c2b7a3d6a9cfcc7ce8069afc1)
1 //===---- Parallelism.cpp - OpenMP GPU parallel implementation ---- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // Parallel implementation in the GPU. Here is the pattern:
10 //
11 //    while (not finished) {
12 //
13 //    if (master) {
14 //      sequential code, decide which par loop to do, or if finished
15 //     __kmpc_kernel_prepare_parallel() // exec by master only
16 //    }
17 //    syncthreads // A
18 //    __kmpc_kernel_parallel() // exec by all
19 //    if (this thread is included in the parallel) {
20 //      switch () for all parallel loops
21 //      __kmpc_kernel_end_parallel() // exec only by threads in parallel
22 //    }
23 //
24 //
25 //    The reason we don't exec end_parallel for the threads not included
26 //    in the parallel loop is that for each barrier in the parallel
27 //    region, these non-included threads will cycle through the
//    syncthread A. Thus they must preserve their current threadId, which
//    is larger than the number of threads in the team.
30 //
31 //    To make a long story short...
32 //
33 //===----------------------------------------------------------------------===//
34 
35 #include "Debug.h"
36 #include "DeviceTypes.h"
37 #include "DeviceUtils.h"
38 #include "Interface.h"
39 #include "LibC.h"
40 #include "Mapping.h"
41 #include "State.h"
42 #include "Synchronization.h"
43 
44 using namespace ompx;
45 
46 #pragma omp begin declare target device_type(nohost)
47 
48 namespace {
49 
50 uint32_t determineNumberOfThreads(int32_t NumThreadsClause) {
51   uint32_t NThreadsICV =
52       NumThreadsClause != -1 ? NumThreadsClause : icv::NThreads;
53   uint32_t NumThreads = mapping::getMaxTeamThreads();
54 
55   if (NThreadsICV != 0 && NThreadsICV < NumThreads)
56     NumThreads = NThreadsICV;
57 
58   // SPMD mode allows any number of threads, for generic mode we round down to a
59   // multiple of WARPSIZE since it is legal to do so in OpenMP.
60   if (mapping::isSPMDMode())
61     return NumThreads;
62 
63   if (NumThreads < mapping::getWarpSize())
64     NumThreads = 1;
65   else
66     NumThreads = (NumThreads & ~((uint32_t)mapping::getWarpSize() - 1));
67 
68   return NumThreads;
69 }
70 
// Invoke an outlined parallel function unwrapping arguments (up to 32).
// The generated header is expanded inside the switch and presumably provides
// one `case N:` per supported argument count, casting `fn` to the matching
// microtask signature; anything not covered falls into `default` and traps.
[[clang::always_inline]] void invokeMicrotask(int32_t global_tid,
                                              int32_t bound_tid, void *fn,
                                              void **args, int64_t nargs) {
  switch (nargs) {
#include "generated_microtask_cases.gen"
  default:
    // Unsupported argument count: report and abort execution loudly rather
    // than silently dropping arguments.
    printf("Too many arguments in kmp_invoke_microtask, aborting execution.\n");
    __builtin_trap();
  }
}
82 
83 } // namespace
84 
85 extern "C" {
86 
// SPMD-mode implementation of a parallel region: every thread of the team is
// already executing, so this only installs the team-wide parallel state,
// runs the outlined function `fn` on the participating threads, and restores
// the previous state. `num_threads` is the num_threads clause value, or -1
// when no clause was present; `args`/`nargs` are forwarded to the microtask.
[[clang::always_inline]] void __kmpc_parallel_spmd(IdentTy *ident,
                                                   int32_t num_threads,
                                                   void *fn, void **args,
                                                   const int64_t nargs) {
  uint32_t TId = mapping::getThreadIdInBlock();
  uint32_t NumThreads = determineNumberOfThreads(num_threads);
  // A parallel team size of 0 encodes "all threads in the team participate".
  uint32_t PTeamSize =
      NumThreads == mapping::getMaxTeamThreads() ? 0 : NumThreads;
  // Avoid the race between the read of the `icv::Level` above and the write
  // below by synchronizing all threads here.
  synchronize::threadsAligned(atomic::seq_cst);
  {
    // Note that the order here is important. `icv::Level` has to be updated
    // last or the other updates will cause a thread specific state to be
    // created.
    // Only the main thread (TId == 0) performs the writes; ForceTeamState
    // directs the updates at the shared team state.
    state::ValueRAII ParallelTeamSizeRAII(state::ParallelTeamSize, PTeamSize,
                                          1u, TId == 0, ident,
                                          /*ForceTeamState=*/true);
    state::ValueRAII ActiveLevelRAII(icv::ActiveLevel, 1u, 0u, TId == 0, ident,
                                     /*ForceTeamState=*/true);
    state::ValueRAII LevelRAII(icv::Level, 1u, 0u, TId == 0, ident,
                               /*ForceTeamState=*/true);

    // Synchronize all threads after the main thread (TId == 0) set up the
    // team state properly.
    synchronize::threadsAligned(atomic::acq_rel);

    state::ParallelTeamSize.assert_eq(PTeamSize, ident,
                                      /*ForceTeamState=*/true);
    icv::ActiveLevel.assert_eq(1u, ident, /*ForceTeamState=*/true);
    icv::Level.assert_eq(1u, ident, /*ForceTeamState=*/true);

    // Ensure we synchronize before we run user code to avoid invalidating the
    // assumptions above.
    synchronize::threadsAligned(atomic::relaxed);

    // Threads outside the parallel team (when PTeamSize != 0) skip the user
    // code and simply wait at the barrier below.
    if (!PTeamSize || TId < PTeamSize)
      invokeMicrotask(TId, 0, fn, args, nargs);

    // Synchronize all threads at the end of a parallel region.
    synchronize::threadsAligned(atomic::seq_cst);
  }

  // Synchronize all threads to make sure every thread exits the scope above;
  // otherwise the following assertions and the assumption in
  // __kmpc_target_deinit may not hold.
  synchronize::threadsAligned(atomic::acq_rel);

  state::ParallelTeamSize.assert_eq(1u, ident, /*ForceTeamState=*/true);
  icv::ActiveLevel.assert_eq(0u, ident, /*ForceTeamState=*/true);
  icv::Level.assert_eq(0u, ident, /*ForceTeamState=*/true);

  // Ensure we synchronize to create an aligned region around the assumptions.
  synchronize::threadsAligned(atomic::relaxed);

  return;
}
144 
// Generic KMP ABI entry point for a `parallel` construct. Serialized cases
// (if(0), an existing thread state, or nested parallelism) run the outlined
// function `fn` inline; SPMD mode forwards to __kmpc_parallel_spmd; generic
// mode shares the arguments with the workers and drives the state machine
// via `wrapper_fn`. The unnamed second parameter and `proc_bind` are unused
// here.
[[clang::always_inline]] void
__kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
                   int32_t num_threads, int proc_bind, void *fn,
                   void *wrapper_fn, void **args, int64_t nargs) {
  uint32_t TId = mapping::getThreadIdInBlock();

  // Assert the parallelism level is zero if disabled by the user.
  ASSERT((config::mayUseNestedParallelism() || icv::Level == 0),
         "nested parallelism while disabled");

  // Handle the serialized case first, same for SPMD/non-SPMD:
  // 1) if-clause(0)
  // 2) parallel in task or other thread state inducing construct
  // 3) nested parallel regions
  if (OMP_UNLIKELY(!if_expr || state::HasThreadState ||
                   (config::mayUseNestedParallelism() && icv::Level))) {
    state::DateEnvironmentRAII DERAII(ident);
    ++icv::Level;
    invokeMicrotask(TId, 0, fn, args, nargs);
    return;
  }

  // From this point forward we know that there is no thread state used.
  ASSERT(state::HasThreadState == false, nullptr);

  if (mapping::isSPMDMode()) {
    // This was moved to its own routine so it could be called directly
    // in certain situations to avoid resource consumption of unused
    // logic in parallel_51.
    __kmpc_parallel_spmd(ident, num_threads, fn, args, nargs);

    return;
  }

  uint32_t NumThreads = determineNumberOfThreads(num_threads);
  uint32_t MaxTeamThreads = mapping::getMaxTeamThreads();
  // A parallel team size of 0 encodes "all threads in the team participate".
  uint32_t PTeamSize = NumThreads == MaxTeamThreads ? 0 : NumThreads;

  // We do *not* create a new data environment because all threads in the team
  // that are active are now running this parallel region. They share the
  // TeamState, which has an increase level-var and potentially active-level
  // set, but they do not have individual ThreadStates yet. If they ever
  // modify the ICVs beyond this point a ThreadStates will be allocated.

  bool IsActiveParallelRegion = NumThreads > 1;
  if (!IsActiveParallelRegion) {
    // Inactive (single-thread) region: run it on this thread without waking
    // the workers.
    state::ValueRAII LevelRAII(icv::Level, 1u, 0u, true, ident);
    invokeMicrotask(TId, 0, fn, args, nargs);
    return;
  }

  // Share the outlined function's arguments with the worker threads. The
  // explicit cases avoid the generic copy loop for small argument counts;
  // each case deliberately falls through to copy the remaining arguments.
  void **GlobalArgs = nullptr;
  if (nargs) {
    __kmpc_begin_sharing_variables(&GlobalArgs, nargs);
    switch (nargs) {
    default:
      for (int I = 0; I < nargs; I++)
        GlobalArgs[I] = args[I];
      break;
    case 16:
      GlobalArgs[15] = args[15];
      [[fallthrough]];
    case 15:
      GlobalArgs[14] = args[14];
      [[fallthrough]];
    case 14:
      GlobalArgs[13] = args[13];
      [[fallthrough]];
    case 13:
      GlobalArgs[12] = args[12];
      [[fallthrough]];
    case 12:
      GlobalArgs[11] = args[11];
      [[fallthrough]];
    case 11:
      GlobalArgs[10] = args[10];
      [[fallthrough]];
    case 10:
      GlobalArgs[9] = args[9];
      [[fallthrough]];
    case 9:
      GlobalArgs[8] = args[8];
      [[fallthrough]];
    case 8:
      GlobalArgs[7] = args[7];
      [[fallthrough]];
    case 7:
      GlobalArgs[6] = args[6];
      [[fallthrough]];
    case 6:
      GlobalArgs[5] = args[5];
      [[fallthrough]];
    case 5:
      GlobalArgs[4] = args[4];
      [[fallthrough]];
    case 4:
      GlobalArgs[3] = args[3];
      [[fallthrough]];
    case 3:
      GlobalArgs[2] = args[2];
      [[fallthrough]];
    case 2:
      GlobalArgs[1] = args[1];
      [[fallthrough]];
    case 1:
      GlobalArgs[0] = args[0];
      [[fallthrough]];
    case 0:
      break;
    }
  }

  {
    // Note that the order here is important. `icv::Level` has to be updated
    // last or the other updates will cause a thread specific state to be
    // created.
    state::ValueRAII ParallelTeamSizeRAII(state::ParallelTeamSize, PTeamSize,
                                          1u, true, ident,
                                          /*ForceTeamState=*/true);
    state::ValueRAII ParallelRegionFnRAII(state::ParallelRegionFn, wrapper_fn,
                                          (void *)nullptr, true, ident,
                                          /*ForceTeamState=*/true);
    state::ValueRAII ActiveLevelRAII(icv::ActiveLevel, 1u, 0u, true, ident,
                                     /*ForceTeamState=*/true);
    state::ValueRAII LevelRAII(icv::Level, 1u, 0u, true, ident,
                               /*ForceTeamState=*/true);

    // Master signals work to activate workers.
    synchronize::threads(atomic::seq_cst);
    // Master waits for workers to signal.
    synchronize::threads(atomic::seq_cst);
  }

  if (nargs)
    __kmpc_end_sharing_variables();
}
281 
282 [[clang::noinline]] bool __kmpc_kernel_parallel(ParallelRegionFnTy *WorkFn) {
283   // Work function and arguments for L1 parallel region.
284   *WorkFn = state::ParallelRegionFn;
285 
286   // If this is the termination signal from the master, quit early.
287   if (!*WorkFn)
288     return false;
289 
290   // Set to true for workers participating in the parallel region.
291   uint32_t TId = mapping::getThreadIdInBlock();
292   bool ThreadIsActive = TId < state::getEffectivePTeamSize();
293   return ThreadIsActive;
294 }
295 
296 [[clang::noinline]] void __kmpc_kernel_end_parallel() {
297   // In case we have modified an ICV for this thread before a ThreadState was
298   // created. We drop it now to not contaminate the next parallel region.
299   ASSERT(!mapping::isSPMDMode(), nullptr);
300   uint32_t TId = mapping::getThreadIdInBlock();
301   state::resetStateForThread(TId);
302   ASSERT(!mapping::isSPMDMode(), nullptr);
303 }
304 
305 uint16_t __kmpc_parallel_level(IdentTy *, uint32_t) { return omp_get_level(); }
306 
307 int32_t __kmpc_global_thread_num(IdentTy *) { return omp_get_thread_num(); }
308 
// Intentionally a no-op on the device: the number of teams and the thread
// limit are fixed by the host at kernel launch.
void __kmpc_push_num_teams(IdentTy *loc, int32_t tid, int32_t num_teams,
                           int32_t thread_limit) {}
311 
312 void __kmpc_push_proc_bind(IdentTy *loc, uint32_t tid, int proc_bind) {}
313 }
314 
315 #pragma omp end declare target
316