//===---- Parallelism.cpp - OpenMP GPU parallel implementation ---- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Parallel implementation on the GPU. Here is the pattern:
//
//    while (not finished) {
//
//      if (master) {
//        sequential code, decide which par loop to do, or if finished
//        __kmpc_kernel_prepare_parallel() // exec by master only
//      }
//      syncthreads // A
//      __kmpc_kernel_parallel() // exec by all
//      if (this thread is included in the parallel) {
//        switch () for all parallel loops
//        __kmpc_kernel_end_parallel() // exec only by threads in parallel
//      }
//    }
//
// The reason we don't exec end_parallel for the threads not included in the
// parallel loop is that for each barrier in the parallel region, these
// non-included threads will cycle through syncthreads A. Thus they must
// preserve their current threadId, which is larger than the number of threads
// in the team.
//
// To make a long story short...
//
//===----------------------------------------------------------------------===//

#include "Debug.h"
#include "DeviceTypes.h"
#include "DeviceUtils.h"
#include "Interface.h"
#include "LibC.h"
#include "Mapping.h"
#include "State.h"
#include "Synchronization.h"

using namespace ompx;

#pragma omp begin declare target device_type(nohost)

namespace {

uint32_t determineNumberOfThreads(int32_t NumThreadsClause) {
  uint32_t NThreadsICV =
      NumThreadsClause != -1 ? NumThreadsClause : icv::NThreads;
  uint32_t NumThreads = mapping::getMaxTeamThreads();

  if (NThreadsICV != 0 && NThreadsICV < NumThreads)
    NumThreads = NThreadsICV;

  // SPMD mode allows any number of threads; for generic mode we round down to
  // a multiple of the warp size since it is legal to do so in OpenMP (e.g.,
  // with a warp size of 32, a request for 47 threads yields 32).
  if (mapping::isSPMDMode())
    return NumThreads;

  if (NumThreads < mapping::getWarpSize())
    NumThreads = 1;
  else
    NumThreads = (NumThreads & ~((uint32_t)mapping::getWarpSize() - 1));

  return NumThreads;
}

// Invoke an outlined parallel function unwrapping arguments (up to 32).
[[clang::always_inline]] void invokeMicrotask(int32_t global_tid,
                                              int32_t bound_tid, void *fn,
                                              void **args, int64_t nargs) {
  switch (nargs) {
#include "generated_microtask_cases.gen"
  default:
    printf("Too many arguments in kmp_invoke_microtask, aborting execution.\n");
    __builtin_trap();
  }
}

} // namespace

extern "C" {

[[clang::always_inline]] void __kmpc_parallel_spmd(IdentTy *ident,
                                                   int32_t num_threads,
                                                   void *fn, void **args,
                                                   const int64_t nargs) {
  uint32_t TId = mapping::getThreadIdInBlock();
  uint32_t NumThreads = determineNumberOfThreads(num_threads);
  // A parallel team size of zero encodes "all threads in the block".
  uint32_t PTeamSize =
      NumThreads == mapping::getMaxTeamThreads() ? 0 : NumThreads;
  // Avoid the race between the read of `icv::Level` above and the write below
  // by synchronizing all threads here.
  synchronize::threadsAligned(atomic::seq_cst);
  {
    // Note that the order here is important. `icv::Level` has to be updated
    // last or the other updates will cause a thread-specific state to be
    // created.
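    // The `state::ValueRAII` helpers install a new value (second argument)
    // for the duration of this scope and restore the given old value (third
    // argument) when the scope is left; the `TId == 0` flag selects the
    // single writing thread. See State.h for the exact semantics.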
    state::ValueRAII ParallelTeamSizeRAII(state::ParallelTeamSize, PTeamSize,
                                          1u, TId == 0, ident,
                                          /*ForceTeamState=*/true);
    state::ValueRAII ActiveLevelRAII(icv::ActiveLevel, 1u, 0u, TId == 0, ident,
                                     /*ForceTeamState=*/true);
    state::ValueRAII LevelRAII(icv::Level, 1u, 0u, TId == 0, ident,
                               /*ForceTeamState=*/true);

    // Synchronize all threads after the main thread (TId == 0) has set up the
    // team state properly.
    synchronize::threadsAligned(atomic::acq_rel);

    state::ParallelTeamSize.assert_eq(PTeamSize, ident,
                                      /*ForceTeamState=*/true);
    icv::ActiveLevel.assert_eq(1u, ident, /*ForceTeamState=*/true);
    icv::Level.assert_eq(1u, ident, /*ForceTeamState=*/true);

    // Ensure we synchronize before we run user code to avoid invalidating the
    // assumptions above.
    synchronize::threadsAligned(atomic::relaxed);

    if (!PTeamSize || TId < PTeamSize)
      invokeMicrotask(TId, 0, fn, args, nargs);

    // Synchronize all threads at the end of a parallel region.
    synchronize::threadsAligned(atomic::seq_cst);
  }

  // Synchronize all threads to make sure every thread exits the scope above;
  // otherwise the following assertions and the assumption in
  // __kmpc_target_deinit may not hold.
  synchronize::threadsAligned(atomic::acq_rel);

  state::ParallelTeamSize.assert_eq(1u, ident, /*ForceTeamState=*/true);
  icv::ActiveLevel.assert_eq(0u, ident, /*ForceTeamState=*/true);
  icv::Level.assert_eq(0u, ident, /*ForceTeamState=*/true);

  // Ensure we synchronize to create an aligned region around the assumptions.
  synchronize::threadsAligned(atomic::relaxed);
}

[[clang::always_inline]] void
__kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
                   int32_t num_threads, int proc_bind, void *fn,
                   void *wrapper_fn, void **args, int64_t nargs) {
  uint32_t TId = mapping::getThreadIdInBlock();

  // Assert that the parallelism level is zero if nested parallelism is
  // disabled by the user.
  ASSERT((config::mayUseNestedParallelism() || icv::Level == 0),
         "nested parallelism while disabled");

  // Handle the serialized case first, the same for SPMD and non-SPMD mode:
  // 1) if-clause(0)
  // 2) parallel in a task or another thread-state-inducing construct
  // 3) nested parallel regions
  if (OMP_UNLIKELY(!if_expr || state::HasThreadState ||
                   (config::mayUseNestedParallelism() && icv::Level))) {
    state::DateEnvironmentRAII DERAII(ident);
    ++icv::Level;
    invokeMicrotask(TId, 0, fn, args, nargs);
    return;
  }

  // From this point forward we know that there is no thread state used.
  ASSERT(state::HasThreadState == false, nullptr);

  if (mapping::isSPMDMode()) {
    // The SPMD path was moved into its own routine so it can be called
    // directly in certain situations, avoiding the resource consumption of
    // the unused generic-mode logic in __kmpc_parallel_51.
    __kmpc_parallel_spmd(ident, num_threads, fn, args, nargs);
    return;
  }

  uint32_t NumThreads = determineNumberOfThreads(num_threads);
  uint32_t MaxTeamThreads = mapping::getMaxTeamThreads();
  uint32_t PTeamSize = NumThreads == MaxTeamThreads ? 0 : NumThreads;

  // We do *not* create a new data environment because all active threads in
  // the team are now running this parallel region. They share the TeamState,
  // which has an increased level-var and potentially the active-level set,
  // but they do not have individual ThreadStates yet. If they ever modify the
  // ICVs beyond this point, a ThreadState will be allocated.
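
  // For an active parallel region, the generic-mode handoff sketched in the
  // file header happens below: the main thread publishes the work function
  // through `state::ParallelRegionFn` and releases the workers, which pick it
  // up in __kmpc_kernel_parallel.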
  bool IsActiveParallelRegion = NumThreads > 1;
  if (!IsActiveParallelRegion) {
    state::ValueRAII LevelRAII(icv::Level, 1u, 0u, true, ident);
    invokeMicrotask(TId, 0, fn, args, nargs);
    return;
  }

  void **GlobalArgs = nullptr;
  if (nargs) {
    __kmpc_begin_sharing_variables(&GlobalArgs, nargs);
    switch (nargs) {
    default:
      for (int I = 0; I < nargs; I++)
        GlobalArgs[I] = args[I];
      break;
    case 16:
      GlobalArgs[15] = args[15];
      [[fallthrough]];
    case 15:
      GlobalArgs[14] = args[14];
      [[fallthrough]];
    case 14:
      GlobalArgs[13] = args[13];
      [[fallthrough]];
    case 13:
      GlobalArgs[12] = args[12];
      [[fallthrough]];
    case 12:
      GlobalArgs[11] = args[11];
      [[fallthrough]];
    case 11:
      GlobalArgs[10] = args[10];
      [[fallthrough]];
    case 10:
      GlobalArgs[9] = args[9];
      [[fallthrough]];
    case 9:
      GlobalArgs[8] = args[8];
      [[fallthrough]];
    case 8:
      GlobalArgs[7] = args[7];
      [[fallthrough]];
    case 7:
      GlobalArgs[6] = args[6];
      [[fallthrough]];
    case 6:
      GlobalArgs[5] = args[5];
      [[fallthrough]];
    case 5:
      GlobalArgs[4] = args[4];
      [[fallthrough]];
    case 4:
      GlobalArgs[3] = args[3];
      [[fallthrough]];
    case 3:
      GlobalArgs[2] = args[2];
      [[fallthrough]];
    case 2:
      GlobalArgs[1] = args[1];
      [[fallthrough]];
    case 1:
      GlobalArgs[0] = args[0];
      [[fallthrough]];
    case 0:
      break;
    }
  }

  {
    // Note that the order here is important. `icv::Level` has to be updated
    // last or the other updates will cause a thread-specific state to be
    // created.
    state::ValueRAII ParallelTeamSizeRAII(state::ParallelTeamSize, PTeamSize,
                                          1u, true, ident,
                                          /*ForceTeamState=*/true);
    state::ValueRAII ParallelRegionFnRAII(state::ParallelRegionFn, wrapper_fn,
                                          (void *)nullptr, true, ident,
                                          /*ForceTeamState=*/true);
    state::ValueRAII ActiveLevelRAII(icv::ActiveLevel, 1u, 0u, true, ident,
                                     /*ForceTeamState=*/true);
    state::ValueRAII LevelRAII(icv::Level, 1u, 0u, true, ident,
                               /*ForceTeamState=*/true);

    // Master signals work to activate workers.
    synchronize::threads(atomic::seq_cst);
    // Master waits for workers to signal.
    synchronize::threads(atomic::seq_cst);
  }

  if (nargs)
    __kmpc_end_sharing_variables();
}

[[clang::noinline]] bool __kmpc_kernel_parallel(ParallelRegionFnTy *WorkFn) {
  // Work function and arguments for L1 parallel region.
  *WorkFn = state::ParallelRegionFn;

  // If this is the termination signal from the master, quit early.
  if (!*WorkFn)
    return false;

  // True for workers participating in the parallel region.
  uint32_t TId = mapping::getThreadIdInBlock();
  bool ThreadIsActive = TId < state::getEffectivePTeamSize();
  return ThreadIsActive;
}

[[clang::noinline]] void __kmpc_kernel_end_parallel() {
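  // This entry point is reached only by threads that participated in a
  // generic-mode parallel region (see the pattern in the file header), hence
  // the asserts that we are not in SPMD mode.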
  ASSERT(!mapping::isSPMDMode(), nullptr);
  uint32_t TId = mapping::getThreadIdInBlock();
  // In case we modified an ICV for this thread before a ThreadState was
  // created, drop it now so it does not contaminate the next parallel region.
  state::resetStateForThread(TId);
  ASSERT(!mapping::isSPMDMode(), nullptr);
}

uint16_t __kmpc_parallel_level(IdentTy *, uint32_t) { return omp_get_level(); }

int32_t __kmpc_global_thread_num(IdentTy *) { return omp_get_thread_num(); }

void __kmpc_push_num_teams(IdentTy *loc, int32_t tid, int32_t num_teams,
                           int32_t thread_limit) {}

void __kmpc_push_proc_bind(IdentTy *loc, uint32_t tid, int proc_bind) {}
}

#pragma omp end declare target
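
//===----------------------------------------------------------------------===//
// Illustrative only (not compiled as part of this file): a minimal sketch of
// how a compiler could lower `#pragma omp parallel` onto __kmpc_parallel_51.
// The outlined function, its wrapper, and the shared variable `x` are
// hypothetical; only the __kmpc_parallel_51 signature above is real.
//
//   static void outlined(int32_t *global_tid, int32_t *bound_tid, int *x) {
//     // body of the parallel region
//   }
//
//   int x = 42;
//   void *args[] = {&x};
//   __kmpc_parallel_51(ident, /*global_tid=*/0, /*if_expr=*/1,
//                      /*num_threads=*/-1, /*proc_bind=*/0, (void *)outlined,
//                      (void *)outlined_wrapper, args, /*nargs=*/1);
//===----------------------------------------------------------------------===//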