//===----- Workshare.cpp -  OpenMP workshare implementation ------ C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains the implementation of the KMPC interface
// for the loop construct plus other worksharing constructs that use the same
// interface as loops.
//
//===----------------------------------------------------------------------===//

#include "Workshare.h"
#include "Debug.h"
#include "DeviceTypes.h"
#include "DeviceUtils.h"
#include "Interface.h"
#include "Mapping.h"
#include "State.h"
#include "Synchronization.h"

using namespace ompx;

// TODO:
struct DynamicScheduleTracker {
  int64_t Chunk;
  int64_t LoopUpperBound;
  int64_t NextLowerBound;
  int64_t Stride;
  kmp_sched_t ScheduleType;
  DynamicScheduleTracker *NextDST;
};

#define ASSERT0(...)

// used by the library for the interface with the app
#define DISPATCH_FINISHED 0
#define DISPATCH_NOTFINISHED 1

// used by dynamic scheduling
#define FINISHED 0
#define NOT_FINISHED 1
#define LAST_CHUNK 2

#pragma omp begin declare target device_type(nohost)

// TODO: This variable is a hack inherited from the old runtime.
static uint64_t SHARED(Cnt);

template <typename T, typename ST> struct omptarget_nvptx_LoopSupport {
  ////////////////////////////////////////////////////////////////////////////////
  // Loop with static scheduling with chunk

  // Generic implementation of OMP loop scheduling with static policy
  /*! \brief Calculate initial bounds for static loop and stride
   *  @param[in] loc location in code of the call (not used here)
   *  @param[in] global_tid global thread id
   *  @param[in] schedtype type of scheduling (see omptarget-nvptx.h)
   *  @param[in] plastiter pointer to last iteration
   *  @param[in,out] pointer to loop lower bound. It will contain the value of
   *  the lower bound of the first chunk
   *  @param[in,out] pointer to loop upper bound. It will contain the value of
   *  the upper bound of the first chunk
   *  @param[in,out] pointer to loop stride. It will contain the value of the
   *  stride between two successive chunks executed by the same thread
   *  @param[in] loop increment bump
   *  @param[in] chunk size
   */

  // helper function for static chunk
  static void ForStaticChunk(int &last, T &lb, T &ub, ST &stride, ST chunk,
                             T entityId, T numberOfEntities) {
    // each thread executes multiple chunks all of the same size, except
    // the last one
    // distance between two successive chunks
    stride = numberOfEntities * chunk;
    lb = lb + entityId * chunk;
    T inputUb = ub;
    ub = lb + chunk - 1; // Clang uses i <= ub
    // Say ub' is the beginning of the last chunk. Then whoever has a
    // lower bound plus a multiple of the increment equal to ub' is
    // the last one.
    T beginningLastChunk = inputUb - (inputUb % chunk);
    last = ((beginningLastChunk - lb) % stride) == 0;
  }
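  // Worked example (illustrative only, not part of the runtime): for a loop
  // with lb = 0, ub = 99, chunk = 10 and numberOfEntities = 4 threads,
  // ForStaticChunk yields stride = 40; thread 1 gets lb = 10, ub = 19 and
  // therefore executes the chunks [10,19], [50,59] and [90,99]. The beginning
  // of the final chunk is 90 = 10 + 2 * 40, so `last` is set for thread 1
  // only.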
  ////////////////////////////////////////////////////////////////////////////////
  // Loop with static scheduling without chunk

  // helper function for static no chunk
  static void ForStaticNoChunk(int &last, T &lb, T &ub, ST &stride, ST &chunk,
                               T entityId, T numberOfEntities) {
    // No chunk size specified. Each thread or warp gets at most one chunk;
    // chunks are all almost of equal size
    T loopSize = ub - lb + 1;

    chunk = loopSize / numberOfEntities;
    T leftOver = loopSize - chunk * numberOfEntities;

    if (entityId < leftOver) {
      chunk++;
      lb = lb + entityId * chunk;
    } else {
      lb = lb + entityId * chunk + leftOver;
    }

    T inputUb = ub;
    ub = lb + chunk - 1; // Clang uses i <= ub
    last = lb <= inputUb && inputUb <= ub;
    stride = loopSize; // make sure we only do 1 chunk per warp
  }
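  // Worked example (illustrative only): for a loop with lb = 0, ub = 9 and
  // numberOfEntities = 4, loopSize = 10, chunk = 2 and leftOver = 2. Threads
  // 0 and 1 get the enlarged chunks [0,2] and [3,5]; threads 2 and 3 get
  // [6,7] and [8,9]. Only thread 3 sees lb <= 9 <= ub, so only it reports
  // `last`, and stride = 10 ensures no thread picks up a second chunk.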
  ////////////////////////////////////////////////////////////////////////////////
  // Support for Static Init

  static void for_static_init(int32_t, int32_t schedtype, int32_t *plastiter,
                              T *plower, T *pupper, ST *pstride, ST chunk,
                              bool IsSPMDExecutionMode) {
    int32_t gtid = omp_get_thread_num();
    int numberOfActiveOMPThreads = omp_get_num_threads();

    // All warps that are in excess of the maximum requested, do
    // not execute the loop
    ASSERT0(LT_FUSSY, gtid < numberOfActiveOMPThreads,
            "current thread is not needed here; error");

    // copy
    int lastiter = 0;
    T lb = *plower;
    T ub = *pupper;
    ST stride = *pstride;

    // init
    switch (SCHEDULE_WITHOUT_MODIFIERS(schedtype)) {
    case kmp_sched_static_chunk: {
      if (chunk > 0) {
        ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid,
                       numberOfActiveOMPThreads);
        break;
      }
      [[fallthrough]];
    } // note: if chunk <=0, use nochunk
    case kmp_sched_static_balanced_chunk: {
      if (chunk > 0) {
        // round up to make sure the chunk is enough to cover all iterations
        T tripCount = ub - lb + 1; // +1 because ub is inclusive
        T span = (tripCount + numberOfActiveOMPThreads - 1) /
                 numberOfActiveOMPThreads;
        // perform chunk adjustment
        chunk = (span + chunk - 1) & ~(chunk - 1);

        ASSERT0(LT_FUSSY, ub >= lb, "ub must be >= lb.");
        T oldUb = ub;
        ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid,
                       numberOfActiveOMPThreads);
        if (ub > oldUb)
          ub = oldUb;
        break;
      }
      [[fallthrough]];
    } // note: if chunk <=0, use nochunk
    case kmp_sched_static_nochunk: {
      ForStaticNoChunk(lastiter, lb, ub, stride, chunk, gtid,
                       numberOfActiveOMPThreads);
      break;
    }
    case kmp_sched_distr_static_chunk: {
      if (chunk > 0) {
        ForStaticChunk(lastiter, lb, ub, stride, chunk, omp_get_team_num(),
                       omp_get_num_teams());
        break;
      }
      [[fallthrough]];
    } // note: if chunk <=0, use nochunk
    case kmp_sched_distr_static_nochunk: {
      ForStaticNoChunk(lastiter, lb, ub, stride, chunk, omp_get_team_num(),
                       omp_get_num_teams());
      break;
    }
    case kmp_sched_distr_static_chunk_sched_static_chunkone: {
      ForStaticChunk(lastiter, lb, ub, stride, chunk,
                     numberOfActiveOMPThreads * omp_get_team_num() + gtid,
                     omp_get_num_teams() * numberOfActiveOMPThreads);
      break;
    }
    default: {
      // ASSERT(LT_FUSSY, 0, "unknown schedtype %d", (int)schedtype);
      ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid,
                     numberOfActiveOMPThreads);
      break;
    }
    }
    // copy back
    *plastiter = lastiter;
    *plower = lb;
    *pupper = ub;
    *pstride = stride;
  }

  ////////////////////////////////////////////////////////////////////////////////
  // Support for dispatch Init

  static int OrderedSchedule(kmp_sched_t schedule) {
    return schedule >= kmp_sched_ordered_first &&
           schedule <= kmp_sched_ordered_last;
  }

  static void dispatch_init(IdentTy *loc, int32_t threadId,
                            kmp_sched_t schedule, T lb, T ub, ST st, ST chunk,
                            DynamicScheduleTracker *DST) {
    int tid = mapping::getThreadIdInBlock();
    T tnum = omp_get_num_threads();
    T tripCount = ub - lb + 1; // +1 because ub is inclusive
    ASSERT0(LT_FUSSY, threadId < tnum,
            "current thread is not needed here; error");

    /* Currently just ignore the monotonic and non-monotonic modifiers
     * (the compiler isn't producing them yet anyway).
     * When it is we'll want to look at them somewhere here and use that
     * information to add to our schedule choice. We shouldn't need to pass
     * them on, they merely affect which schedule we can legally choose for
     * various dynamic cases. (In particular, whether or not a stealing scheme
     * is legal).
     */
    schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);

    // Process schedule.
    if (tnum == 1 || tripCount <= 1 || OrderedSchedule(schedule)) {
      if (OrderedSchedule(schedule))
        __kmpc_barrier(loc, threadId);
      schedule = kmp_sched_static_chunk;
      chunk = tripCount; // one thread gets the whole loop
    } else if (schedule == kmp_sched_runtime) {
      // process runtime
      omp_sched_t rtSched;
      int ChunkInt;
      omp_get_schedule(&rtSched, &ChunkInt);
      chunk = ChunkInt;
      switch (rtSched) {
      case omp_sched_static: {
        if (chunk > 0)
          schedule = kmp_sched_static_chunk;
        else
          schedule = kmp_sched_static_nochunk;
        break;
      }
      case omp_sched_auto: {
        schedule = kmp_sched_static_chunk;
        chunk = 1;
        break;
      }
      case omp_sched_dynamic:
      case omp_sched_guided: {
        schedule = kmp_sched_dynamic;
        break;
      }
      }
    } else if (schedule == kmp_sched_auto) {
      schedule = kmp_sched_static_chunk;
      chunk = 1;
    } else {
      // ASSERT(LT_FUSSY,
      //        schedule == kmp_sched_dynamic || schedule == kmp_sched_guided,
      //        "unknown schedule %d & chunk %lld\n", (int)schedule,
      //        (long long)chunk);
    }

    // init schedules
    if (schedule == kmp_sched_static_chunk) {
      ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value");
      // save sched state
      DST->ScheduleType = schedule;
      // save ub
      DST->LoopUpperBound = ub;
      // compute static chunk
      ST stride;
      int lastiter = 0;
      ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
      // save computed params
      DST->Chunk = chunk;
      DST->NextLowerBound = lb;
      DST->Stride = stride;
    } else if (schedule == kmp_sched_static_balanced_chunk) {
      ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value");
      // save sched state
      DST->ScheduleType = schedule;
      // save ub
      DST->LoopUpperBound = ub;
      // compute static chunk
      ST stride;
      int lastiter = 0;
      // round up to make sure the chunk is enough to cover all iterations
      T span = (tripCount + tnum - 1) / tnum;
      // perform chunk adjustment
      chunk = (span + chunk - 1) & ~(chunk - 1);

      T oldUb = ub;
      ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
      ASSERT0(LT_FUSSY, ub >= lb, "ub must be >= lb.");
      if (ub > oldUb)
        ub = oldUb;
      // save computed params
      DST->Chunk = chunk;
      DST->NextLowerBound = lb;
      DST->Stride = stride;
    } else if (schedule == kmp_sched_static_nochunk) {
      ASSERT0(LT_FUSSY, chunk == 0, "bad chunk value");
      // save sched state
      DST->ScheduleType = schedule;
      // save ub
      DST->LoopUpperBound = ub;
      // compute static chunk
      ST stride;
      int lastiter = 0;
      ForStaticNoChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
      // save computed params
      DST->Chunk = chunk;
      DST->NextLowerBound = lb;
      DST->Stride = stride;
    } else if (schedule == kmp_sched_dynamic || schedule == kmp_sched_guided) {
      // save data
      DST->ScheduleType = schedule;
      if (chunk < 1)
        chunk = 1;
      DST->Chunk = chunk;
      DST->LoopUpperBound = ub;
      DST->NextLowerBound = lb;
      __kmpc_barrier(loc, threadId);
      if (tid == 0) {
        Cnt = 0;
        fence::team(atomic::seq_cst);
      }
      __kmpc_barrier(loc, threadId);
    }
  }
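  // Worked example of the dynamic path (illustrative only): for a loop with
  // lb = 0, ub = 99 and chunk = 4, dispatch_init stores the bounds in the DST
  // and thread 0 resets the team-shared counter Cnt. Each subsequent
  // dispatch_next call grabs a fresh chunk index N from Cnt via NextIter()
  // below and returns the bounds [4 * N, 4 * N + 3]; index 24 yields the
  // trimmed last chunk [96, 99] and indices >= 25 report DISPATCH_FINISHED.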
  ////////////////////////////////////////////////////////////////////////////////
  // Support for dispatch next

  static uint64_t NextIter() {
    __kmpc_impl_lanemask_t active = mapping::activemask();
    uint32_t leader = utils::ffs(active) - 1;
    uint32_t change = utils::popc(active);
    __kmpc_impl_lanemask_t lane_mask_lt = mapping::lanemaskLT();
    unsigned int rank = utils::popc(active & lane_mask_lt);
    uint64_t warp_res = 0;
    if (rank == 0) {
      warp_res = atomic::add(&Cnt, change, atomic::seq_cst);
    }
    warp_res =
        utils::shuffle(active, warp_res, leader, mapping::getWarpSize());
    return warp_res + rank;
  }

  static int DynamicNextChunk(T &lb, T &ub, T chunkSize, T loopLowerBound,
                              T loopUpperBound) {
    T N = NextIter();
    lb = loopLowerBound + N * chunkSize;
    ub = lb + chunkSize - 1; // Clang uses i <= ub

    // 3 result cases:
    //  a. lb and ub < loopUpperBound --> NOT_FINISHED
    //  b. lb < loopUpperBound and ub >= loopUpperBound: last chunk -->
    //     NOT_FINISHED
    //  c. lb and ub >= loopUpperBound: empty chunk --> FINISHED
    // a.
    if (lb <= loopUpperBound && ub < loopUpperBound) {
      return NOT_FINISHED;
    }
    // b.
    if (lb <= loopUpperBound) {
      ub = loopUpperBound;
      return LAST_CHUNK;
    }
    // c. if we are here, we are in case 'c'
    lb = loopUpperBound + 2;
    ub = loopUpperBound + 1;
    return FINISHED;
  }

  static int dispatch_next(IdentTy *loc, int32_t gtid, int32_t *plast,
                           T *plower, T *pupper, ST *pstride,
                           DynamicScheduleTracker *DST) {
    // ID of a thread in its own warp
    // automatically selects thread or warp ID based on selected implementation
    ASSERT0(LT_FUSSY, gtid < omp_get_num_threads(),
            "current thread is not needed here; error");
    // retrieve schedule
    kmp_sched_t schedule = DST->ScheduleType;

    // xxx reduce to one
    if (schedule == kmp_sched_static_chunk ||
        schedule == kmp_sched_static_nochunk) {
      T myLb = DST->NextLowerBound;
      T ub = DST->LoopUpperBound;
      // finished?
      if (myLb > ub) {
        return DISPATCH_FINISHED;
      }
      // not finished, save current bounds
      ST chunk = DST->Chunk;
      *plower = myLb;
      T myUb = myLb + chunk - 1; // Clang uses i <= ub
      if (myUb > ub)
        myUb = ub;
      *pupper = myUb;
      *plast = (int32_t)(myUb == ub);

      // increment next lower bound by the stride
      ST stride = DST->Stride;
      DST->NextLowerBound = myLb + stride;
      return DISPATCH_NOTFINISHED;
    }
    ASSERT0(LT_FUSSY,
            schedule == kmp_sched_dynamic || schedule == kmp_sched_guided,
            "bad sched");
    T myLb, myUb;
    int finished = DynamicNextChunk(myLb, myUb, DST->Chunk,
                                    DST->NextLowerBound, DST->LoopUpperBound);

    if (finished == FINISHED)
      return DISPATCH_FINISHED;

    // not finished (either not finished or last chunk)
    *plast = (int32_t)(finished == LAST_CHUNK);
    *plower = myLb;
    *pupper = myUb;
    *pstride = 1;

    return DISPATCH_NOTFINISHED;
  }

  static void dispatch_fini() {
    // nothing
  }

  ////////////////////////////////////////////////////////////////////////////////
  // end of template class that encapsulates all the helper functions
  ////////////////////////////////////////////////////////////////////////////////
};

////////////////////////////////////////////////////////////////////////////////
// KMP interface implementation (dyn loops)
////////////////////////////////////////////////////////////////////////////////

// TODO: Expand the dispatch API to take a DST pointer which can then be
//       allocated properly without malloc.
// For now, each team will contain an LDS pointer (ThreadDST) to a global array
// of references to the DST structs allocated (in global memory) for each
// thread in the team. The global memory array is allocated during the init
// phase if it was not allocated already and will be deallocated when the
// dispatch phase ends:
//
//  __kmpc_dispatch_init
//
//  ** Dispatch loop **
//
//  __kmpc_dispatch_deinit
//
static DynamicScheduleTracker **SHARED(ThreadDST);
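// Illustration (assumption about typical usage, not enforced here): with two
// nested `schedule(dynamic)` loops, thread i calls pushDST() twice, so
// ThreadDST[i] points at the DST of the inner loop, whose NextDST links back
// to the DST of the enclosing loop; the matching __kmpc_dispatch_deinit calls
// pop them in reverse order.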
// Create a new DST, link the current one, and define the new as current.
static DynamicScheduleTracker *pushDST() {
  int32_t ThreadIndex = mapping::getThreadIdInBlock();
  // Each block will allocate an array of pointers to DST structs. The array
  // is equal in length to the number of threads in that block.
  if (!ThreadDST) {
    // Allocate global memory array of pointers to DST structs:
    if (mapping::isMainThreadInGenericMode() || ThreadIndex == 0)
      ThreadDST = static_cast<DynamicScheduleTracker **>(
          memory::allocGlobal(mapping::getNumberOfThreadsInBlock() *
                                  sizeof(DynamicScheduleTracker *),
                              "new ThreadDST array"));
    synchronize::threads(atomic::seq_cst);

    // Initialize the array pointers:
    ThreadDST[ThreadIndex] = nullptr;
  }

  // Create a DST struct for the current thread:
  DynamicScheduleTracker *NewDST = static_cast<DynamicScheduleTracker *>(
      memory::allocGlobal(sizeof(DynamicScheduleTracker), "new DST"));
  *NewDST = DynamicScheduleTracker({0});

  // Add the new DST struct to the array of DST structs:
  NewDST->NextDST = ThreadDST[ThreadIndex];
  ThreadDST[ThreadIndex] = NewDST;
  return NewDST;
}

// Return the current DST.
static DynamicScheduleTracker *peekDST() {
  return ThreadDST[mapping::getThreadIdInBlock()];
}

// Pop the current DST and restore the last one.
static void popDST() {
  int32_t ThreadIndex = mapping::getThreadIdInBlock();
  DynamicScheduleTracker *CurrentDST = ThreadDST[ThreadIndex];
  DynamicScheduleTracker *OldDST = CurrentDST->NextDST;
  memory::freeGlobal(CurrentDST, "remove DST");
  ThreadDST[ThreadIndex] = OldDST;

  // Check if we need to deallocate the global array. Ensure all threads
  // in the block have finished deallocating the individual DSTs.
  synchronize::threads(atomic::seq_cst);
  if (!ThreadDST[ThreadIndex] && !ThreadIndex) {
    memory::freeGlobal(ThreadDST, "remove ThreadDST array");
    ThreadDST = nullptr;
  }
  synchronize::threads(atomic::seq_cst);
}

void workshare::init(bool IsSPMD) {
  if (mapping::isInitialThreadInLevel0(IsSPMD))
    ThreadDST = nullptr;
}

extern "C" {

// init
void __kmpc_dispatch_init_4(IdentTy *loc, int32_t tid, int32_t schedule,
                            int32_t lb, int32_t ub, int32_t st,
                            int32_t chunk) {
  DynamicScheduleTracker *DST = pushDST();
  omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_init(
      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
}

void __kmpc_dispatch_init_4u(IdentTy *loc, int32_t tid, int32_t schedule,
                             uint32_t lb, uint32_t ub, int32_t st,
                             int32_t chunk) {
  DynamicScheduleTracker *DST = pushDST();
  omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_init(
      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
}

void __kmpc_dispatch_init_8(IdentTy *loc, int32_t tid, int32_t schedule,
                            int64_t lb, int64_t ub, int64_t st,
                            int64_t chunk) {
  DynamicScheduleTracker *DST = pushDST();
  omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_init(
      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
}

void __kmpc_dispatch_init_8u(IdentTy *loc, int32_t tid, int32_t schedule,
                             uint64_t lb, uint64_t ub, int64_t st,
                             int64_t chunk) {
  DynamicScheduleTracker *DST = pushDST();
  omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_init(
      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
}
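// Example of the expected call sequence (typical Clang device lowering,
// sketched here for illustration only) for
//   #pragma omp for schedule(dynamic, 4)
//   for (int i = 0; i < N; ++i) ...
// each thread roughly performs:
//   __kmpc_dispatch_init_4(loc, tid, /*kmp_sched_dynamic*/ schedule, 0, N - 1,
//                          1, 4);
//   while (__kmpc_dispatch_next_4(loc, tid, &last, &lb, &ub, &st))
//     for (int i = lb; i <= ub; ++i) ...
//   __kmpc_dispatch_deinit(loc, tid);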
// next
int __kmpc_dispatch_next_4(IdentTy *loc, int32_t tid, int32_t *p_last,
                           int32_t *p_lb, int32_t *p_ub, int32_t *p_st) {
  DynamicScheduleTracker *DST = peekDST();
  return omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_next(
      loc, tid, p_last, p_lb, p_ub, p_st, DST);
}

int __kmpc_dispatch_next_4u(IdentTy *loc, int32_t tid, int32_t *p_last,
                            uint32_t *p_lb, uint32_t *p_ub, int32_t *p_st) {
  DynamicScheduleTracker *DST = peekDST();
  return omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_next(
      loc, tid, p_last, p_lb, p_ub, p_st, DST);
}

int __kmpc_dispatch_next_8(IdentTy *loc, int32_t tid, int32_t *p_last,
                           int64_t *p_lb, int64_t *p_ub, int64_t *p_st) {
  DynamicScheduleTracker *DST = peekDST();
  return omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_next(
      loc, tid, p_last, p_lb, p_ub, p_st, DST);
}

int __kmpc_dispatch_next_8u(IdentTy *loc, int32_t tid, int32_t *p_last,
                            uint64_t *p_lb, uint64_t *p_ub, int64_t *p_st) {
  DynamicScheduleTracker *DST = peekDST();
  return omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_next(
      loc, tid, p_last, p_lb, p_ub, p_st, DST);
}

// fini
void __kmpc_dispatch_fini_4(IdentTy *loc, int32_t tid) {
  omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_fini();
}

void __kmpc_dispatch_fini_4u(IdentTy *loc, int32_t tid) {
  omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_fini();
}

void __kmpc_dispatch_fini_8(IdentTy *loc, int32_t tid) {
  omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_fini();
}

void __kmpc_dispatch_fini_8u(IdentTy *loc, int32_t tid) {
  omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_fini();
}

// deinit
void __kmpc_dispatch_deinit(IdentTy *loc, int32_t tid) { popDST(); }

////////////////////////////////////////////////////////////////////////////////
// KMP interface implementation (static loops)
////////////////////////////////////////////////////////////////////////////////

void __kmpc_for_static_init_4(IdentTy *loc, int32_t global_tid,
                              int32_t schedtype, int32_t *plastiter,
                              int32_t *plower, int32_t *pupper,
                              int32_t *pstride, int32_t incr, int32_t chunk) {
  omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      mapping::isSPMDMode());
}

void __kmpc_for_static_init_4u(IdentTy *loc, int32_t global_tid,
                               int32_t schedtype, int32_t *plastiter,
                               uint32_t *plower, uint32_t *pupper,
                               int32_t *pstride, int32_t incr, int32_t chunk) {
  omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      mapping::isSPMDMode());
}

void __kmpc_for_static_init_8(IdentTy *loc, int32_t global_tid,
                              int32_t schedtype, int32_t *plastiter,
                              int64_t *plower, int64_t *pupper,
                              int64_t *pstride, int64_t incr, int64_t chunk) {
  omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      mapping::isSPMDMode());
}

void __kmpc_for_static_init_8u(IdentTy *loc, int32_t global_tid,
                               int32_t schedtype, int32_t *plastiter,
                               uint64_t *plower, uint64_t *pupper,
                               int64_t *pstride, int64_t incr, int64_t chunk) {
  omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      mapping::isSPMDMode());
}

void __kmpc_distribute_static_init_4(IdentTy *loc, int32_t global_tid,
                                     int32_t schedtype, int32_t *plastiter,
                                     int32_t *plower, int32_t *pupper,
                                     int32_t *pstride, int32_t incr,
                                     int32_t chunk) {
  omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      mapping::isSPMDMode());
}

void __kmpc_distribute_static_init_4u(IdentTy *loc, int32_t global_tid,
                                      int32_t schedtype, int32_t *plastiter,
                                      uint32_t *plower, uint32_t *pupper,
                                      int32_t *pstride, int32_t incr,
                                      int32_t chunk) {
  omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      mapping::isSPMDMode());
}

void __kmpc_distribute_static_init_8(IdentTy *loc, int32_t global_tid,
                                     int32_t schedtype, int32_t *plastiter,
                                     int64_t *plower, int64_t *pupper,
                                     int64_t *pstride, int64_t incr,
                                     int64_t chunk) {
  omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      mapping::isSPMDMode());
}

void __kmpc_distribute_static_init_8u(IdentTy *loc, int32_t global_tid,
                                      int32_t schedtype, int32_t *plastiter,
                                      uint64_t *plower, uint64_t *pupper,
                                      int64_t *pstride, int64_t incr,
                                      int64_t chunk) {
  omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      mapping::isSPMDMode());
}
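// Example of the expected call pattern (typical Clang device lowering, shown
// for illustration only) for
//   #pragma omp for schedule(static)
//   for (int i = 0; i < N; ++i) ...
// each thread roughly performs:
//   __kmpc_for_static_init_4(loc, tid, /*kmp_sched_static_nochunk*/ schedtype,
//                            &last, &lb, &ub, &stride, 1, 1);
//   for (int i = lb; i <= ub; ++i) ...
//   __kmpc_for_static_fini(loc, tid);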
void __kmpc_for_static_fini(IdentTy *loc, int32_t global_tid) {}

void __kmpc_distribute_static_fini(IdentTy *loc, int32_t global_tid) {}
}

namespace ompx {

/// Helper class to hide the generic loop nest and provide the template
/// argument throughout.
template <typename Ty> class StaticLoopChunker {

  /// Generic loop nest that handles block and/or thread distribution in the
  /// absence of user specified chunk sizes. This implicitly picks a block
  /// chunk size equal to the number of threads in the block and a thread
  /// chunk size equal to one. In contrast to the chunked version we can get
  /// away with a single loop in this case.
  static void NormalizedLoopNestNoChunk(void (*LoopBody)(Ty, void *),
                                        void *Arg, Ty NumBlocks, Ty BId,
                                        Ty NumThreads, Ty TId, Ty NumIters,
                                        bool OneIterationPerThread) {
    Ty KernelIteration = NumBlocks * NumThreads;

    // Start index in the normalized space.
    Ty IV = BId * NumThreads + TId;
    ASSERT(IV >= 0, "Bad index");

    // Cover the entire iteration space, assumptions in the caller might allow
    // to simplify this loop to a conditional.
    if (IV < NumIters) {
      do {

        // Execute the loop body.
        LoopBody(IV, Arg);

        // Every thread executed one block and thread chunk now.
        IV += KernelIteration;

        if (OneIterationPerThread)
          return;

      } while (IV < NumIters);
    }
  }

  /// Generic loop nest that handles block and/or thread distribution in the
  /// presence of user specified chunk sizes (for at least one of them).
  static void NormalizedLoopNestChunked(void (*LoopBody)(Ty, void *),
                                        void *Arg, Ty BlockChunk,
                                        Ty NumBlocks, Ty BId, Ty ThreadChunk,
                                        Ty NumThreads, Ty TId, Ty NumIters,
                                        bool OneIterationPerThread) {
    Ty KernelIteration = NumBlocks * BlockChunk;

    // Start index in the chunked space.
    Ty IV = BId * BlockChunk + TId;
    ASSERT(IV >= 0, "Bad index");

    // Cover the entire iteration space, assumptions in the caller might allow
    // to simplify this loop to a conditional.
    do {

      Ty BlockChunkLeft =
          BlockChunk >= TId * ThreadChunk ? BlockChunk - TId * ThreadChunk : 0;
      Ty ThreadChunkLeft =
          ThreadChunk <= BlockChunkLeft ? ThreadChunk : BlockChunkLeft;

      while (ThreadChunkLeft--) {

        // Given the blocking it's hard to keep track of what to execute.
        if (IV >= NumIters)
          return;

        // Execute the loop body.
        LoopBody(IV, Arg);

        if (OneIterationPerThread)
          return;

        ++IV;
      }

      IV += KernelIteration;

    } while (IV < NumIters);
  }
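  /// Worked example for NormalizedLoopNestNoChunk above (illustrative only):
  /// with NumBlocks = 2, NumThreads = 4 and NumIters = 10, the start index is
  /// IV = BId * 4 + TId and the step is KernelIteration = 8, i.e. a classic
  /// grid-stride loop: block 0 covers iterations 0-3 and 8-9, block 1 covers
  /// iterations 4-7.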
public:
  /// Worksharing `for`-loop.
  static void For(IdentTy *Loc, void (*LoopBody)(Ty, void *), void *Arg,
                  Ty NumIters, Ty NumThreads, Ty ThreadChunk) {
    ASSERT(NumIters >= 0, "Bad iteration count");
    ASSERT(ThreadChunk >= 0, "Bad thread count");

    // All threads need to participate but we don't know if we are in a
    // parallel at all or if the user might have used a `num_threads` clause
    // on the parallel and reduced the number compared to the block size.
    // Since nested parallels are possible too we need to get the thread id
    // from the `omp` getter and not the mapping directly.
    Ty TId = omp_get_thread_num();

    // There are no blocks involved here.
    Ty BlockChunk = 0;
    Ty NumBlocks = 1;
    Ty BId = 0;

    // If the thread chunk is not specified we pick a default now.
    if (ThreadChunk == 0)
      ThreadChunk = 1;

    // If we know we have more threads than iterations we can indicate that to
    // avoid an outer loop.
    bool OneIterationPerThread = false;
    if (config::getAssumeThreadsOversubscription()) {
      ASSERT(NumThreads >= NumIters, "Broken assumption");
      OneIterationPerThread = true;
    }

    if (ThreadChunk != 1)
      NormalizedLoopNestChunked(LoopBody, Arg, BlockChunk, NumBlocks, BId,
                                ThreadChunk, NumThreads, TId, NumIters,
                                OneIterationPerThread);
    else
      NormalizedLoopNestNoChunk(LoopBody, Arg, NumBlocks, BId, NumThreads,
                                TId, NumIters, OneIterationPerThread);
  }

  /// Worksharing `distribute`-loop.
  static void Distribute(IdentTy *Loc, void (*LoopBody)(Ty, void *),
                         void *Arg, Ty NumIters, Ty BlockChunk) {
    ASSERT(icv::Level == 0, "Bad distribute");
    ASSERT(icv::ActiveLevel == 0, "Bad distribute");
    ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute");
    ASSERT(state::ParallelTeamSize == 1, "Bad distribute");

    ASSERT(NumIters >= 0, "Bad iteration count");
    ASSERT(BlockChunk >= 0, "Bad block count");

    // There are no threads involved here.
    Ty ThreadChunk = 0;
    Ty NumThreads = 1;
    Ty TId = 0;
    ASSERT(TId == mapping::getThreadIdInBlock(), "Bad thread id");

    // All teams need to participate.
    Ty NumBlocks = mapping::getNumberOfBlocksInKernel();
    Ty BId = mapping::getBlockIdInKernel();

    // If the block chunk is not specified we pick a default now.
    if (BlockChunk == 0)
      BlockChunk = NumThreads;

    // If we know we have more blocks than iterations we can indicate that to
    // avoid an outer loop.
    bool OneIterationPerThread = false;
    if (config::getAssumeTeamsOversubscription()) {
      ASSERT(NumBlocks >= NumIters, "Broken assumption");
      OneIterationPerThread = true;
    }

    if (BlockChunk != NumThreads)
      NormalizedLoopNestChunked(LoopBody, Arg, BlockChunk, NumBlocks, BId,
                                ThreadChunk, NumThreads, TId, NumIters,
                                OneIterationPerThread);
    else
      NormalizedLoopNestNoChunk(LoopBody, Arg, NumBlocks, BId, NumThreads,
                                TId, NumIters, OneIterationPerThread);

    ASSERT(icv::Level == 0, "Bad distribute");
    ASSERT(icv::ActiveLevel == 0, "Bad distribute");
    ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute");
    ASSERT(state::ParallelTeamSize == 1, "Bad distribute");
  }
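  // Note (assumption about the driving compiler flags, for illustration): the
  // OneIterationPerThread fast path in For/Distribute/DistributeFor is only
  // taken when the corresponding config::getAssume*Oversubscription() query
  // returns true, e.g. when the application was compiled with flags such as
  // -fopenmp-assume-threads-oversubscription or
  // -fopenmp-assume-teams-oversubscription.
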
  /// Worksharing `distribute parallel for`-loop.
  static void DistributeFor(IdentTy *Loc, void (*LoopBody)(Ty, void *),
                            void *Arg, Ty NumIters, Ty NumThreads,
                            Ty BlockChunk, Ty ThreadChunk) {
    ASSERT(icv::Level == 1, "Bad distribute");
    ASSERT(icv::ActiveLevel == 1, "Bad distribute");
    ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute");

    ASSERT(NumIters >= 0, "Bad iteration count");
    ASSERT(BlockChunk >= 0, "Bad block count");
    ASSERT(ThreadChunk >= 0, "Bad thread count");

    // All threads need to participate but the user might have used a
    // `num_threads` clause on the parallel and reduced the number compared to
    // the block size.
    Ty TId = mapping::getThreadIdInBlock();

    // All teams need to participate.
    Ty NumBlocks = mapping::getNumberOfBlocksInKernel();
    Ty BId = mapping::getBlockIdInKernel();

    // If the block chunk is not specified we pick a default now.
    if (BlockChunk == 0)
      BlockChunk = NumThreads;

    // If the thread chunk is not specified we pick a default now.
    if (ThreadChunk == 0)
      ThreadChunk = 1;

    // If we know we have more threads (across all blocks) than iterations we
    // can indicate that to avoid an outer loop.
    bool OneIterationPerThread = false;
    if (config::getAssumeTeamsOversubscription() &&
        config::getAssumeThreadsOversubscription()) {
      OneIterationPerThread = true;
      ASSERT(NumBlocks * NumThreads >= NumIters, "Broken assumption");
    }

    if (BlockChunk != NumThreads || ThreadChunk != 1)
      NormalizedLoopNestChunked(LoopBody, Arg, BlockChunk, NumBlocks, BId,
                                ThreadChunk, NumThreads, TId, NumIters,
                                OneIterationPerThread);
    else
      NormalizedLoopNestNoChunk(LoopBody, Arg, NumBlocks, BId, NumThreads,
                                TId, NumIters, OneIterationPerThread);

    ASSERT(icv::Level == 1, "Bad distribute");
    ASSERT(icv::ActiveLevel == 1, "Bad distribute");
    ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute");
  }
};

} // namespace ompx

#define OMP_LOOP_ENTRY(BW, TY)                                                 \
  [[gnu::flatten, clang::always_inline]] void                                  \
      __kmpc_distribute_for_static_loop##BW(                                   \
          IdentTy *loc, void (*fn)(TY, void *), void *arg, TY num_iters,       \
          TY num_threads, TY block_chunk, TY thread_chunk) {                   \
    ompx::StaticLoopChunker<TY>::DistributeFor(                                \
        loc, fn, arg, num_iters + 1, num_threads, block_chunk, thread_chunk);  \
  }                                                                            \
  [[gnu::flatten, clang::always_inline]] void                                  \
      __kmpc_distribute_static_loop##BW(IdentTy *loc, void (*fn)(TY, void *),  \
                                        void *arg, TY num_iters,               \
                                        TY block_chunk) {                      \
    ompx::StaticLoopChunker<TY>::Distribute(loc, fn, arg, num_iters + 1,       \
                                            block_chunk);                      \
  }                                                                            \
  [[gnu::flatten, clang::always_inline]] void __kmpc_for_static_loop##BW(      \
      IdentTy *loc, void (*fn)(TY, void *), void *arg, TY num_iters,           \
      TY num_threads, TY thread_chunk) {                                       \
    ompx::StaticLoopChunker<TY>::For(loc, fn, arg, num_iters + 1, num_threads, \
                                     thread_chunk);                            \
  }

extern "C" {
OMP_LOOP_ENTRY(_4, int32_t)
OMP_LOOP_ENTRY(_4u, uint32_t)
OMP_LOOP_ENTRY(_8, int64_t)
OMP_LOOP_ENTRY(_8u, uint64_t)
}

#pragma omp end declare target