xref: /freebsd-src/contrib/llvm-project/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp (revision 36b606ae6aa4b24061096ba18582e0a08ccd5dba)
1e8d8bef9SDimitry Andric //===---- CGOpenMPRuntimeGPU.cpp - Interface to OpenMP GPU Runtimes ----===//
2e8d8bef9SDimitry Andric //
3e8d8bef9SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4e8d8bef9SDimitry Andric // See https://llvm.org/LICENSE.txt for license information.
5e8d8bef9SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6e8d8bef9SDimitry Andric //
7e8d8bef9SDimitry Andric //===----------------------------------------------------------------------===//
8e8d8bef9SDimitry Andric //
9e8d8bef9SDimitry Andric // This provides a generalized class for OpenMP runtime code generation
10e8d8bef9SDimitry Andric // specialized by GPU targets NVPTX and AMDGCN.
11e8d8bef9SDimitry Andric //
12e8d8bef9SDimitry Andric //===----------------------------------------------------------------------===//
13e8d8bef9SDimitry Andric 
14e8d8bef9SDimitry Andric #include "CGOpenMPRuntimeGPU.h"
15e8d8bef9SDimitry Andric #include "CodeGenFunction.h"
16e8d8bef9SDimitry Andric #include "clang/AST/Attr.h"
17e8d8bef9SDimitry Andric #include "clang/AST/DeclOpenMP.h"
18bdd1243dSDimitry Andric #include "clang/AST/OpenMPClause.h"
19e8d8bef9SDimitry Andric #include "clang/AST/StmtOpenMP.h"
20e8d8bef9SDimitry Andric #include "clang/AST/StmtVisitor.h"
21e8d8bef9SDimitry Andric #include "clang/Basic/Cuda.h"
22e8d8bef9SDimitry Andric #include "llvm/ADT/SmallPtrSet.h"
23e8d8bef9SDimitry Andric #include "llvm/Frontend/OpenMP/OMPGridValues.h"
24349cc55cSDimitry Andric #include "llvm/Support/MathExtras.h"
25e8d8bef9SDimitry Andric 
26e8d8bef9SDimitry Andric using namespace clang;
27e8d8bef9SDimitry Andric using namespace CodeGen;
28e8d8bef9SDimitry Andric using namespace llvm::omp;
29e8d8bef9SDimitry Andric 
30e8d8bef9SDimitry Andric namespace {
31e8d8bef9SDimitry Andric /// Pre(post)-action for different OpenMP constructs specialized for NVPTX.
32e8d8bef9SDimitry Andric class NVPTXActionTy final : public PrePostActionTy {
33e8d8bef9SDimitry Andric   llvm::FunctionCallee EnterCallee = nullptr;
34e8d8bef9SDimitry Andric   ArrayRef<llvm::Value *> EnterArgs;
35e8d8bef9SDimitry Andric   llvm::FunctionCallee ExitCallee = nullptr;
36e8d8bef9SDimitry Andric   ArrayRef<llvm::Value *> ExitArgs;
37e8d8bef9SDimitry Andric   bool Conditional = false;
38e8d8bef9SDimitry Andric   llvm::BasicBlock *ContBlock = nullptr;
39e8d8bef9SDimitry Andric 
40e8d8bef9SDimitry Andric public:
41e8d8bef9SDimitry Andric   NVPTXActionTy(llvm::FunctionCallee EnterCallee,
42e8d8bef9SDimitry Andric                 ArrayRef<llvm::Value *> EnterArgs,
43e8d8bef9SDimitry Andric                 llvm::FunctionCallee ExitCallee,
44e8d8bef9SDimitry Andric                 ArrayRef<llvm::Value *> ExitArgs, bool Conditional = false)
45e8d8bef9SDimitry Andric       : EnterCallee(EnterCallee), EnterArgs(EnterArgs), ExitCallee(ExitCallee),
46e8d8bef9SDimitry Andric         ExitArgs(ExitArgs), Conditional(Conditional) {}
47e8d8bef9SDimitry Andric   void Enter(CodeGenFunction &CGF) override {
48e8d8bef9SDimitry Andric     llvm::Value *EnterRes = CGF.EmitRuntimeCall(EnterCallee, EnterArgs);
49e8d8bef9SDimitry Andric     if (Conditional) {
50e8d8bef9SDimitry Andric       llvm::Value *CallBool = CGF.Builder.CreateIsNotNull(EnterRes);
51e8d8bef9SDimitry Andric       auto *ThenBlock = CGF.createBasicBlock("omp_if.then");
52e8d8bef9SDimitry Andric       ContBlock = CGF.createBasicBlock("omp_if.end");
53e8d8bef9SDimitry Andric       // Generate the branch (If-stmt)
54e8d8bef9SDimitry Andric       CGF.Builder.CreateCondBr(CallBool, ThenBlock, ContBlock);
55e8d8bef9SDimitry Andric       CGF.EmitBlock(ThenBlock);
56e8d8bef9SDimitry Andric     }
57e8d8bef9SDimitry Andric   }
58e8d8bef9SDimitry Andric   void Done(CodeGenFunction &CGF) {
59e8d8bef9SDimitry Andric     // Emit the rest of blocks/branches
60e8d8bef9SDimitry Andric     CGF.EmitBranch(ContBlock);
61e8d8bef9SDimitry Andric     CGF.EmitBlock(ContBlock, true);
62e8d8bef9SDimitry Andric   }
63e8d8bef9SDimitry Andric   void Exit(CodeGenFunction &CGF) override {
64e8d8bef9SDimitry Andric     CGF.EmitRuntimeCall(ExitCallee, ExitArgs);
65e8d8bef9SDimitry Andric   }
66e8d8bef9SDimitry Andric };
67e8d8bef9SDimitry Andric 
68e8d8bef9SDimitry Andric /// A class to track the execution mode when codegening directives within
69e8d8bef9SDimitry Andric /// a target region. The appropriate mode (SPMD|NON-SPMD) is set on entry
70e8d8bef9SDimitry Andric /// to the target region and used by containing directives such as 'parallel'
71e8d8bef9SDimitry Andric /// to emit optimized code.
72e8d8bef9SDimitry Andric class ExecutionRuntimeModesRAII {
73e8d8bef9SDimitry Andric private:
74e8d8bef9SDimitry Andric   CGOpenMPRuntimeGPU::ExecutionMode SavedExecMode =
75e8d8bef9SDimitry Andric       CGOpenMPRuntimeGPU::EM_Unknown;
76e8d8bef9SDimitry Andric   CGOpenMPRuntimeGPU::ExecutionMode &ExecMode;
77e8d8bef9SDimitry Andric 
78e8d8bef9SDimitry Andric public:
79bdd1243dSDimitry Andric   ExecutionRuntimeModesRAII(CGOpenMPRuntimeGPU::ExecutionMode &ExecMode,
80bdd1243dSDimitry Andric                             CGOpenMPRuntimeGPU::ExecutionMode EntryMode)
81e8d8bef9SDimitry Andric       : ExecMode(ExecMode) {
82e8d8bef9SDimitry Andric     SavedExecMode = ExecMode;
83bdd1243dSDimitry Andric     ExecMode = EntryMode;
84e8d8bef9SDimitry Andric   }
85bdd1243dSDimitry Andric   ~ExecutionRuntimeModesRAII() { ExecMode = SavedExecMode; }
86e8d8bef9SDimitry Andric };
87e8d8bef9SDimitry Andric 
88e8d8bef9SDimitry Andric static const ValueDecl *getPrivateItem(const Expr *RefExpr) {
89e8d8bef9SDimitry Andric   RefExpr = RefExpr->IgnoreParens();
90e8d8bef9SDimitry Andric   if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(RefExpr)) {
91e8d8bef9SDimitry Andric     const Expr *Base = ASE->getBase()->IgnoreParenImpCasts();
92e8d8bef9SDimitry Andric     while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
93e8d8bef9SDimitry Andric       Base = TempASE->getBase()->IgnoreParenImpCasts();
94e8d8bef9SDimitry Andric     RefExpr = Base;
950fca6ea1SDimitry Andric   } else if (auto *OASE = dyn_cast<ArraySectionExpr>(RefExpr)) {
96e8d8bef9SDimitry Andric     const Expr *Base = OASE->getBase()->IgnoreParenImpCasts();
970fca6ea1SDimitry Andric     while (const auto *TempOASE = dyn_cast<ArraySectionExpr>(Base))
98e8d8bef9SDimitry Andric       Base = TempOASE->getBase()->IgnoreParenImpCasts();
99e8d8bef9SDimitry Andric     while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
100e8d8bef9SDimitry Andric       Base = TempASE->getBase()->IgnoreParenImpCasts();
101e8d8bef9SDimitry Andric     RefExpr = Base;
102e8d8bef9SDimitry Andric   }
103e8d8bef9SDimitry Andric   RefExpr = RefExpr->IgnoreParenImpCasts();
104e8d8bef9SDimitry Andric   if (const auto *DE = dyn_cast<DeclRefExpr>(RefExpr))
105e8d8bef9SDimitry Andric     return cast<ValueDecl>(DE->getDecl()->getCanonicalDecl());
106e8d8bef9SDimitry Andric   const auto *ME = cast<MemberExpr>(RefExpr);
107e8d8bef9SDimitry Andric   return cast<ValueDecl>(ME->getMemberDecl()->getCanonicalDecl());
108e8d8bef9SDimitry Andric }
109e8d8bef9SDimitry Andric 
110e8d8bef9SDimitry Andric static RecordDecl *buildRecordForGlobalizedVars(
111e8d8bef9SDimitry Andric     ASTContext &C, ArrayRef<const ValueDecl *> EscapedDecls,
112e8d8bef9SDimitry Andric     ArrayRef<const ValueDecl *> EscapedDeclsForTeams,
113e8d8bef9SDimitry Andric     llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
1145f757f3fSDimitry Andric         &MappedDeclsFields,
1155f757f3fSDimitry Andric     int BufSize) {
116e8d8bef9SDimitry Andric   using VarsDataTy = std::pair<CharUnits /*Align*/, const ValueDecl *>;
117e8d8bef9SDimitry Andric   if (EscapedDecls.empty() && EscapedDeclsForTeams.empty())
118e8d8bef9SDimitry Andric     return nullptr;
119e8d8bef9SDimitry Andric   SmallVector<VarsDataTy, 4> GlobalizedVars;
120e8d8bef9SDimitry Andric   for (const ValueDecl *D : EscapedDecls)
1215f757f3fSDimitry Andric     GlobalizedVars.emplace_back(C.getDeclAlign(D), D);
122e8d8bef9SDimitry Andric   for (const ValueDecl *D : EscapedDeclsForTeams)
123e8d8bef9SDimitry Andric     GlobalizedVars.emplace_back(C.getDeclAlign(D), D);
124e8d8bef9SDimitry Andric 
125e8d8bef9SDimitry Andric   // Build struct _globalized_locals_ty {
1265f757f3fSDimitry Andric   //         /*  globalized vars  */[WarSize] align (decl_align)
127e8d8bef9SDimitry Andric   //         /*  globalized vars  */ for EscapedDeclsForTeams
128e8d8bef9SDimitry Andric   //       };
129e8d8bef9SDimitry Andric   RecordDecl *GlobalizedRD = C.buildImplicitRecord("_globalized_locals_ty");
130e8d8bef9SDimitry Andric   GlobalizedRD->startDefinition();
131e8d8bef9SDimitry Andric   llvm::SmallPtrSet<const ValueDecl *, 16> SingleEscaped(
132e8d8bef9SDimitry Andric       EscapedDeclsForTeams.begin(), EscapedDeclsForTeams.end());
133e8d8bef9SDimitry Andric   for (const auto &Pair : GlobalizedVars) {
134e8d8bef9SDimitry Andric     const ValueDecl *VD = Pair.second;
135e8d8bef9SDimitry Andric     QualType Type = VD->getType();
136e8d8bef9SDimitry Andric     if (Type->isLValueReferenceType())
137e8d8bef9SDimitry Andric       Type = C.getPointerType(Type.getNonReferenceType());
138e8d8bef9SDimitry Andric     else
139e8d8bef9SDimitry Andric       Type = Type.getNonReferenceType();
140e8d8bef9SDimitry Andric     SourceLocation Loc = VD->getLocation();
141e8d8bef9SDimitry Andric     FieldDecl *Field;
142e8d8bef9SDimitry Andric     if (SingleEscaped.count(VD)) {
143e8d8bef9SDimitry Andric       Field = FieldDecl::Create(
144e8d8bef9SDimitry Andric           C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
145e8d8bef9SDimitry Andric           C.getTrivialTypeSourceInfo(Type, SourceLocation()),
146e8d8bef9SDimitry Andric           /*BW=*/nullptr, /*Mutable=*/false,
147e8d8bef9SDimitry Andric           /*InitStyle=*/ICIS_NoInit);
148e8d8bef9SDimitry Andric       Field->setAccess(AS_public);
149e8d8bef9SDimitry Andric       if (VD->hasAttrs()) {
150e8d8bef9SDimitry Andric         for (specific_attr_iterator<AlignedAttr> I(VD->getAttrs().begin()),
151e8d8bef9SDimitry Andric              E(VD->getAttrs().end());
152e8d8bef9SDimitry Andric              I != E; ++I)
153e8d8bef9SDimitry Andric           Field->addAttr(*I);
154e8d8bef9SDimitry Andric       }
155e8d8bef9SDimitry Andric     } else {
1565f757f3fSDimitry Andric       if (BufSize > 1) {
157e8d8bef9SDimitry Andric         llvm::APInt ArraySize(32, BufSize);
1585f757f3fSDimitry Andric         Type = C.getConstantArrayType(Type, ArraySize, nullptr,
1595f757f3fSDimitry Andric                                       ArraySizeModifier::Normal, 0);
1605f757f3fSDimitry Andric       }
161e8d8bef9SDimitry Andric       Field = FieldDecl::Create(
162e8d8bef9SDimitry Andric           C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
163e8d8bef9SDimitry Andric           C.getTrivialTypeSourceInfo(Type, SourceLocation()),
164e8d8bef9SDimitry Andric           /*BW=*/nullptr, /*Mutable=*/false,
165e8d8bef9SDimitry Andric           /*InitStyle=*/ICIS_NoInit);
166e8d8bef9SDimitry Andric       Field->setAccess(AS_public);
1675f757f3fSDimitry Andric       llvm::APInt Align(32, Pair.first.getQuantity());
168e8d8bef9SDimitry Andric       Field->addAttr(AlignedAttr::CreateImplicit(
169e8d8bef9SDimitry Andric           C, /*IsAlignmentExpr=*/true,
170e8d8bef9SDimitry Andric           IntegerLiteral::Create(C, Align,
171e8d8bef9SDimitry Andric                                  C.getIntTypeForBitwidth(32, /*Signed=*/0),
172e8d8bef9SDimitry Andric                                  SourceLocation()),
17306c3fb27SDimitry Andric           {}, AlignedAttr::GNU_aligned));
174e8d8bef9SDimitry Andric     }
175e8d8bef9SDimitry Andric     GlobalizedRD->addDecl(Field);
176e8d8bef9SDimitry Andric     MappedDeclsFields.try_emplace(VD, Field);
177e8d8bef9SDimitry Andric   }
178e8d8bef9SDimitry Andric   GlobalizedRD->completeDefinition();
179e8d8bef9SDimitry Andric   return GlobalizedRD;
180e8d8bef9SDimitry Andric }
181e8d8bef9SDimitry Andric 
182e8d8bef9SDimitry Andric /// Get the list of variables that can escape their declaration context.
183e8d8bef9SDimitry Andric class CheckVarsEscapingDeclContext final
184e8d8bef9SDimitry Andric     : public ConstStmtVisitor<CheckVarsEscapingDeclContext> {
185e8d8bef9SDimitry Andric   CodeGenFunction &CGF;
186e8d8bef9SDimitry Andric   llvm::SetVector<const ValueDecl *> EscapedDecls;
187e8d8bef9SDimitry Andric   llvm::SetVector<const ValueDecl *> EscapedVariableLengthDecls;
18806c3fb27SDimitry Andric   llvm::SetVector<const ValueDecl *> DelayedVariableLengthDecls;
189e8d8bef9SDimitry Andric   llvm::SmallPtrSet<const Decl *, 4> EscapedParameters;
190e8d8bef9SDimitry Andric   RecordDecl *GlobalizedRD = nullptr;
191e8d8bef9SDimitry Andric   llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
192e8d8bef9SDimitry Andric   bool AllEscaped = false;
193e8d8bef9SDimitry Andric   bool IsForCombinedParallelRegion = false;
194e8d8bef9SDimitry Andric 
195e8d8bef9SDimitry Andric   void markAsEscaped(const ValueDecl *VD) {
196e8d8bef9SDimitry Andric     // Do not globalize declare target variables.
197e8d8bef9SDimitry Andric     if (!isa<VarDecl>(VD) ||
198e8d8bef9SDimitry Andric         OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD))
199e8d8bef9SDimitry Andric       return;
200e8d8bef9SDimitry Andric     VD = cast<ValueDecl>(VD->getCanonicalDecl());
201e8d8bef9SDimitry Andric     // Use user-specified allocation.
202e8d8bef9SDimitry Andric     if (VD->hasAttrs() && VD->hasAttr<OMPAllocateDeclAttr>())
203e8d8bef9SDimitry Andric       return;
204e8d8bef9SDimitry Andric     // Variables captured by value must be globalized.
20506c3fb27SDimitry Andric     bool IsCaptured = false;
206e8d8bef9SDimitry Andric     if (auto *CSI = CGF.CapturedStmtInfo) {
207e8d8bef9SDimitry Andric       if (const FieldDecl *FD = CSI->lookup(cast<VarDecl>(VD))) {
208e8d8bef9SDimitry Andric         // Check if need to capture the variable that was already captured by
209e8d8bef9SDimitry Andric         // value in the outer region.
21006c3fb27SDimitry Andric         IsCaptured = true;
211e8d8bef9SDimitry Andric         if (!IsForCombinedParallelRegion) {
212e8d8bef9SDimitry Andric           if (!FD->hasAttrs())
213e8d8bef9SDimitry Andric             return;
214e8d8bef9SDimitry Andric           const auto *Attr = FD->getAttr<OMPCaptureKindAttr>();
215e8d8bef9SDimitry Andric           if (!Attr)
216e8d8bef9SDimitry Andric             return;
217e8d8bef9SDimitry Andric           if (((Attr->getCaptureKind() != OMPC_map) &&
218e8d8bef9SDimitry Andric                !isOpenMPPrivate(Attr->getCaptureKind())) ||
219e8d8bef9SDimitry Andric               ((Attr->getCaptureKind() == OMPC_map) &&
220e8d8bef9SDimitry Andric                !FD->getType()->isAnyPointerType()))
221e8d8bef9SDimitry Andric             return;
222e8d8bef9SDimitry Andric         }
223e8d8bef9SDimitry Andric         if (!FD->getType()->isReferenceType()) {
224e8d8bef9SDimitry Andric           assert(!VD->getType()->isVariablyModifiedType() &&
225e8d8bef9SDimitry Andric                  "Parameter captured by value with variably modified type");
226e8d8bef9SDimitry Andric           EscapedParameters.insert(VD);
227e8d8bef9SDimitry Andric         } else if (!IsForCombinedParallelRegion) {
228e8d8bef9SDimitry Andric           return;
229e8d8bef9SDimitry Andric         }
230e8d8bef9SDimitry Andric       }
231e8d8bef9SDimitry Andric     }
232e8d8bef9SDimitry Andric     if ((!CGF.CapturedStmtInfo ||
233e8d8bef9SDimitry Andric          (IsForCombinedParallelRegion && CGF.CapturedStmtInfo)) &&
234e8d8bef9SDimitry Andric         VD->getType()->isReferenceType())
235e8d8bef9SDimitry Andric       // Do not globalize variables with reference type.
236e8d8bef9SDimitry Andric       return;
23706c3fb27SDimitry Andric     if (VD->getType()->isVariablyModifiedType()) {
23806c3fb27SDimitry Andric       // If not captured at the target region level then mark the escaped
23906c3fb27SDimitry Andric       // variable as delayed.
24006c3fb27SDimitry Andric       if (IsCaptured)
241e8d8bef9SDimitry Andric         EscapedVariableLengthDecls.insert(VD);
242e8d8bef9SDimitry Andric       else
24306c3fb27SDimitry Andric         DelayedVariableLengthDecls.insert(VD);
24406c3fb27SDimitry Andric     } else
245e8d8bef9SDimitry Andric       EscapedDecls.insert(VD);
246e8d8bef9SDimitry Andric   }
247e8d8bef9SDimitry Andric 
248e8d8bef9SDimitry Andric   void VisitValueDecl(const ValueDecl *VD) {
249e8d8bef9SDimitry Andric     if (VD->getType()->isLValueReferenceType())
250e8d8bef9SDimitry Andric       markAsEscaped(VD);
251e8d8bef9SDimitry Andric     if (const auto *VarD = dyn_cast<VarDecl>(VD)) {
252e8d8bef9SDimitry Andric       if (!isa<ParmVarDecl>(VarD) && VarD->hasInit()) {
253e8d8bef9SDimitry Andric         const bool SavedAllEscaped = AllEscaped;
254e8d8bef9SDimitry Andric         AllEscaped = VD->getType()->isLValueReferenceType();
255e8d8bef9SDimitry Andric         Visit(VarD->getInit());
256e8d8bef9SDimitry Andric         AllEscaped = SavedAllEscaped;
257e8d8bef9SDimitry Andric       }
258e8d8bef9SDimitry Andric     }
259e8d8bef9SDimitry Andric   }
260e8d8bef9SDimitry Andric   void VisitOpenMPCapturedStmt(const CapturedStmt *S,
261e8d8bef9SDimitry Andric                                ArrayRef<OMPClause *> Clauses,
262e8d8bef9SDimitry Andric                                bool IsCombinedParallelRegion) {
263e8d8bef9SDimitry Andric     if (!S)
264e8d8bef9SDimitry Andric       return;
265e8d8bef9SDimitry Andric     for (const CapturedStmt::Capture &C : S->captures()) {
266e8d8bef9SDimitry Andric       if (C.capturesVariable() && !C.capturesVariableByCopy()) {
267e8d8bef9SDimitry Andric         const ValueDecl *VD = C.getCapturedVar();
268e8d8bef9SDimitry Andric         bool SavedIsForCombinedParallelRegion = IsForCombinedParallelRegion;
269e8d8bef9SDimitry Andric         if (IsCombinedParallelRegion) {
270e8d8bef9SDimitry Andric           // Check if the variable is privatized in the combined construct and
271e8d8bef9SDimitry Andric           // those private copies must be shared in the inner parallel
272e8d8bef9SDimitry Andric           // directive.
273e8d8bef9SDimitry Andric           IsForCombinedParallelRegion = false;
274e8d8bef9SDimitry Andric           for (const OMPClause *C : Clauses) {
275e8d8bef9SDimitry Andric             if (!isOpenMPPrivate(C->getClauseKind()) ||
276e8d8bef9SDimitry Andric                 C->getClauseKind() == OMPC_reduction ||
277e8d8bef9SDimitry Andric                 C->getClauseKind() == OMPC_linear ||
278e8d8bef9SDimitry Andric                 C->getClauseKind() == OMPC_private)
279e8d8bef9SDimitry Andric               continue;
280e8d8bef9SDimitry Andric             ArrayRef<const Expr *> Vars;
281e8d8bef9SDimitry Andric             if (const auto *PC = dyn_cast<OMPFirstprivateClause>(C))
282e8d8bef9SDimitry Andric               Vars = PC->getVarRefs();
283e8d8bef9SDimitry Andric             else if (const auto *PC = dyn_cast<OMPLastprivateClause>(C))
284e8d8bef9SDimitry Andric               Vars = PC->getVarRefs();
285e8d8bef9SDimitry Andric             else
286e8d8bef9SDimitry Andric               llvm_unreachable("Unexpected clause.");
287e8d8bef9SDimitry Andric             for (const auto *E : Vars) {
288e8d8bef9SDimitry Andric               const Decl *D =
289e8d8bef9SDimitry Andric                   cast<DeclRefExpr>(E)->getDecl()->getCanonicalDecl();
290e8d8bef9SDimitry Andric               if (D == VD->getCanonicalDecl()) {
291e8d8bef9SDimitry Andric                 IsForCombinedParallelRegion = true;
292e8d8bef9SDimitry Andric                 break;
293e8d8bef9SDimitry Andric               }
294e8d8bef9SDimitry Andric             }
295e8d8bef9SDimitry Andric             if (IsForCombinedParallelRegion)
296e8d8bef9SDimitry Andric               break;
297e8d8bef9SDimitry Andric           }
298e8d8bef9SDimitry Andric         }
299e8d8bef9SDimitry Andric         markAsEscaped(VD);
300e8d8bef9SDimitry Andric         if (isa<OMPCapturedExprDecl>(VD))
301e8d8bef9SDimitry Andric           VisitValueDecl(VD);
302e8d8bef9SDimitry Andric         IsForCombinedParallelRegion = SavedIsForCombinedParallelRegion;
303e8d8bef9SDimitry Andric       }
304e8d8bef9SDimitry Andric     }
305e8d8bef9SDimitry Andric   }
306e8d8bef9SDimitry Andric 
307e8d8bef9SDimitry Andric   void buildRecordForGlobalizedVars(bool IsInTTDRegion) {
308e8d8bef9SDimitry Andric     assert(!GlobalizedRD &&
309e8d8bef9SDimitry Andric            "Record for globalized variables is built already.");
310e8d8bef9SDimitry Andric     ArrayRef<const ValueDecl *> EscapedDeclsForParallel, EscapedDeclsForTeams;
311349cc55cSDimitry Andric     unsigned WarpSize = CGF.getTarget().getGridValue().GV_Warp_Size;
312e8d8bef9SDimitry Andric     if (IsInTTDRegion)
313e8d8bef9SDimitry Andric       EscapedDeclsForTeams = EscapedDecls.getArrayRef();
314e8d8bef9SDimitry Andric     else
315e8d8bef9SDimitry Andric       EscapedDeclsForParallel = EscapedDecls.getArrayRef();
316e8d8bef9SDimitry Andric     GlobalizedRD = ::buildRecordForGlobalizedVars(
317e8d8bef9SDimitry Andric         CGF.getContext(), EscapedDeclsForParallel, EscapedDeclsForTeams,
318e8d8bef9SDimitry Andric         MappedDeclsFields, WarpSize);
319e8d8bef9SDimitry Andric   }
320e8d8bef9SDimitry Andric 
321e8d8bef9SDimitry Andric public:
322e8d8bef9SDimitry Andric   CheckVarsEscapingDeclContext(CodeGenFunction &CGF,
323e8d8bef9SDimitry Andric                                ArrayRef<const ValueDecl *> TeamsReductions)
324e8d8bef9SDimitry Andric       : CGF(CGF), EscapedDecls(TeamsReductions.begin(), TeamsReductions.end()) {
325e8d8bef9SDimitry Andric   }
326e8d8bef9SDimitry Andric   virtual ~CheckVarsEscapingDeclContext() = default;
327e8d8bef9SDimitry Andric   void VisitDeclStmt(const DeclStmt *S) {
328e8d8bef9SDimitry Andric     if (!S)
329e8d8bef9SDimitry Andric       return;
330e8d8bef9SDimitry Andric     for (const Decl *D : S->decls())
331e8d8bef9SDimitry Andric       if (const auto *VD = dyn_cast_or_null<ValueDecl>(D))
332e8d8bef9SDimitry Andric         VisitValueDecl(VD);
333e8d8bef9SDimitry Andric   }
334e8d8bef9SDimitry Andric   void VisitOMPExecutableDirective(const OMPExecutableDirective *D) {
335e8d8bef9SDimitry Andric     if (!D)
336e8d8bef9SDimitry Andric       return;
337e8d8bef9SDimitry Andric     if (!D->hasAssociatedStmt())
338e8d8bef9SDimitry Andric       return;
339e8d8bef9SDimitry Andric     if (const auto *S =
340e8d8bef9SDimitry Andric             dyn_cast_or_null<CapturedStmt>(D->getAssociatedStmt())) {
341e8d8bef9SDimitry Andric       // Do not analyze directives that do not actually require capturing,
342e8d8bef9SDimitry Andric       // like `omp for` or `omp simd` directives.
343e8d8bef9SDimitry Andric       llvm::SmallVector<OpenMPDirectiveKind, 4> CaptureRegions;
344e8d8bef9SDimitry Andric       getOpenMPCaptureRegions(CaptureRegions, D->getDirectiveKind());
345e8d8bef9SDimitry Andric       if (CaptureRegions.size() == 1 && CaptureRegions.back() == OMPD_unknown) {
346e8d8bef9SDimitry Andric         VisitStmt(S->getCapturedStmt());
347e8d8bef9SDimitry Andric         return;
348e8d8bef9SDimitry Andric       }
349e8d8bef9SDimitry Andric       VisitOpenMPCapturedStmt(
350e8d8bef9SDimitry Andric           S, D->clauses(),
351e8d8bef9SDimitry Andric           CaptureRegions.back() == OMPD_parallel &&
352e8d8bef9SDimitry Andric               isOpenMPDistributeDirective(D->getDirectiveKind()));
353e8d8bef9SDimitry Andric     }
354e8d8bef9SDimitry Andric   }
355e8d8bef9SDimitry Andric   void VisitCapturedStmt(const CapturedStmt *S) {
356e8d8bef9SDimitry Andric     if (!S)
357e8d8bef9SDimitry Andric       return;
358e8d8bef9SDimitry Andric     for (const CapturedStmt::Capture &C : S->captures()) {
359e8d8bef9SDimitry Andric       if (C.capturesVariable() && !C.capturesVariableByCopy()) {
360e8d8bef9SDimitry Andric         const ValueDecl *VD = C.getCapturedVar();
361e8d8bef9SDimitry Andric         markAsEscaped(VD);
362e8d8bef9SDimitry Andric         if (isa<OMPCapturedExprDecl>(VD))
363e8d8bef9SDimitry Andric           VisitValueDecl(VD);
364e8d8bef9SDimitry Andric       }
365e8d8bef9SDimitry Andric     }
366e8d8bef9SDimitry Andric   }
367e8d8bef9SDimitry Andric   void VisitLambdaExpr(const LambdaExpr *E) {
368e8d8bef9SDimitry Andric     if (!E)
369e8d8bef9SDimitry Andric       return;
370e8d8bef9SDimitry Andric     for (const LambdaCapture &C : E->captures()) {
371e8d8bef9SDimitry Andric       if (C.capturesVariable()) {
372e8d8bef9SDimitry Andric         if (C.getCaptureKind() == LCK_ByRef) {
373e8d8bef9SDimitry Andric           const ValueDecl *VD = C.getCapturedVar();
374e8d8bef9SDimitry Andric           markAsEscaped(VD);
375e8d8bef9SDimitry Andric           if (E->isInitCapture(&C) || isa<OMPCapturedExprDecl>(VD))
376e8d8bef9SDimitry Andric             VisitValueDecl(VD);
377e8d8bef9SDimitry Andric         }
378e8d8bef9SDimitry Andric       }
379e8d8bef9SDimitry Andric     }
380e8d8bef9SDimitry Andric   }
381e8d8bef9SDimitry Andric   void VisitBlockExpr(const BlockExpr *E) {
382e8d8bef9SDimitry Andric     if (!E)
383e8d8bef9SDimitry Andric       return;
384e8d8bef9SDimitry Andric     for (const BlockDecl::Capture &C : E->getBlockDecl()->captures()) {
385e8d8bef9SDimitry Andric       if (C.isByRef()) {
386e8d8bef9SDimitry Andric         const VarDecl *VD = C.getVariable();
387e8d8bef9SDimitry Andric         markAsEscaped(VD);
388e8d8bef9SDimitry Andric         if (isa<OMPCapturedExprDecl>(VD) || VD->isInitCapture())
389e8d8bef9SDimitry Andric           VisitValueDecl(VD);
390e8d8bef9SDimitry Andric       }
391e8d8bef9SDimitry Andric     }
392e8d8bef9SDimitry Andric   }
393e8d8bef9SDimitry Andric   void VisitCallExpr(const CallExpr *E) {
394e8d8bef9SDimitry Andric     if (!E)
395e8d8bef9SDimitry Andric       return;
396e8d8bef9SDimitry Andric     for (const Expr *Arg : E->arguments()) {
397e8d8bef9SDimitry Andric       if (!Arg)
398e8d8bef9SDimitry Andric         continue;
399e8d8bef9SDimitry Andric       if (Arg->isLValue()) {
400e8d8bef9SDimitry Andric         const bool SavedAllEscaped = AllEscaped;
401e8d8bef9SDimitry Andric         AllEscaped = true;
402e8d8bef9SDimitry Andric         Visit(Arg);
403e8d8bef9SDimitry Andric         AllEscaped = SavedAllEscaped;
404e8d8bef9SDimitry Andric       } else {
405e8d8bef9SDimitry Andric         Visit(Arg);
406e8d8bef9SDimitry Andric       }
407e8d8bef9SDimitry Andric     }
408e8d8bef9SDimitry Andric     Visit(E->getCallee());
409e8d8bef9SDimitry Andric   }
410e8d8bef9SDimitry Andric   void VisitDeclRefExpr(const DeclRefExpr *E) {
411e8d8bef9SDimitry Andric     if (!E)
412e8d8bef9SDimitry Andric       return;
413e8d8bef9SDimitry Andric     const ValueDecl *VD = E->getDecl();
414e8d8bef9SDimitry Andric     if (AllEscaped)
415e8d8bef9SDimitry Andric       markAsEscaped(VD);
416e8d8bef9SDimitry Andric     if (isa<OMPCapturedExprDecl>(VD))
417e8d8bef9SDimitry Andric       VisitValueDecl(VD);
418bdd1243dSDimitry Andric     else if (VD->isInitCapture())
419e8d8bef9SDimitry Andric       VisitValueDecl(VD);
420e8d8bef9SDimitry Andric   }
421e8d8bef9SDimitry Andric   void VisitUnaryOperator(const UnaryOperator *E) {
422e8d8bef9SDimitry Andric     if (!E)
423e8d8bef9SDimitry Andric       return;
424e8d8bef9SDimitry Andric     if (E->getOpcode() == UO_AddrOf) {
425e8d8bef9SDimitry Andric       const bool SavedAllEscaped = AllEscaped;
426e8d8bef9SDimitry Andric       AllEscaped = true;
427e8d8bef9SDimitry Andric       Visit(E->getSubExpr());
428e8d8bef9SDimitry Andric       AllEscaped = SavedAllEscaped;
429e8d8bef9SDimitry Andric     } else {
430e8d8bef9SDimitry Andric       Visit(E->getSubExpr());
431e8d8bef9SDimitry Andric     }
432e8d8bef9SDimitry Andric   }
433e8d8bef9SDimitry Andric   void VisitImplicitCastExpr(const ImplicitCastExpr *E) {
434e8d8bef9SDimitry Andric     if (!E)
435e8d8bef9SDimitry Andric       return;
436e8d8bef9SDimitry Andric     if (E->getCastKind() == CK_ArrayToPointerDecay) {
437e8d8bef9SDimitry Andric       const bool SavedAllEscaped = AllEscaped;
438e8d8bef9SDimitry Andric       AllEscaped = true;
439e8d8bef9SDimitry Andric       Visit(E->getSubExpr());
440e8d8bef9SDimitry Andric       AllEscaped = SavedAllEscaped;
441e8d8bef9SDimitry Andric     } else {
442e8d8bef9SDimitry Andric       Visit(E->getSubExpr());
443e8d8bef9SDimitry Andric     }
444e8d8bef9SDimitry Andric   }
445e8d8bef9SDimitry Andric   void VisitExpr(const Expr *E) {
446e8d8bef9SDimitry Andric     if (!E)
447e8d8bef9SDimitry Andric       return;
448e8d8bef9SDimitry Andric     bool SavedAllEscaped = AllEscaped;
449e8d8bef9SDimitry Andric     if (!E->isLValue())
450e8d8bef9SDimitry Andric       AllEscaped = false;
451e8d8bef9SDimitry Andric     for (const Stmt *Child : E->children())
452e8d8bef9SDimitry Andric       if (Child)
453e8d8bef9SDimitry Andric         Visit(Child);
454e8d8bef9SDimitry Andric     AllEscaped = SavedAllEscaped;
455e8d8bef9SDimitry Andric   }
456e8d8bef9SDimitry Andric   void VisitStmt(const Stmt *S) {
457e8d8bef9SDimitry Andric     if (!S)
458e8d8bef9SDimitry Andric       return;
459e8d8bef9SDimitry Andric     for (const Stmt *Child : S->children())
460e8d8bef9SDimitry Andric       if (Child)
461e8d8bef9SDimitry Andric         Visit(Child);
462e8d8bef9SDimitry Andric   }
463e8d8bef9SDimitry Andric 
464e8d8bef9SDimitry Andric   /// Returns the record that handles all the escaped local variables and used
465e8d8bef9SDimitry Andric   /// instead of their original storage.
466e8d8bef9SDimitry Andric   const RecordDecl *getGlobalizedRecord(bool IsInTTDRegion) {
467e8d8bef9SDimitry Andric     if (!GlobalizedRD)
468e8d8bef9SDimitry Andric       buildRecordForGlobalizedVars(IsInTTDRegion);
469e8d8bef9SDimitry Andric     return GlobalizedRD;
470e8d8bef9SDimitry Andric   }
471e8d8bef9SDimitry Andric 
472e8d8bef9SDimitry Andric   /// Returns the field in the globalized record for the escaped variable.
473e8d8bef9SDimitry Andric   const FieldDecl *getFieldForGlobalizedVar(const ValueDecl *VD) const {
474e8d8bef9SDimitry Andric     assert(GlobalizedRD &&
475e8d8bef9SDimitry Andric            "Record for globalized variables must be generated already.");
47606c3fb27SDimitry Andric     return MappedDeclsFields.lookup(VD);
477e8d8bef9SDimitry Andric   }
478e8d8bef9SDimitry Andric 
479e8d8bef9SDimitry Andric   /// Returns the list of the escaped local variables/parameters.
480e8d8bef9SDimitry Andric   ArrayRef<const ValueDecl *> getEscapedDecls() const {
481e8d8bef9SDimitry Andric     return EscapedDecls.getArrayRef();
482e8d8bef9SDimitry Andric   }
483e8d8bef9SDimitry Andric 
484e8d8bef9SDimitry Andric   /// Checks if the escaped local variable is actually a parameter passed by
485e8d8bef9SDimitry Andric   /// value.
486e8d8bef9SDimitry Andric   const llvm::SmallPtrSetImpl<const Decl *> &getEscapedParameters() const {
487e8d8bef9SDimitry Andric     return EscapedParameters;
488e8d8bef9SDimitry Andric   }
489e8d8bef9SDimitry Andric 
490e8d8bef9SDimitry Andric   /// Returns the list of the escaped variables with the variably modified
491e8d8bef9SDimitry Andric   /// types.
492e8d8bef9SDimitry Andric   ArrayRef<const ValueDecl *> getEscapedVariableLengthDecls() const {
493e8d8bef9SDimitry Andric     return EscapedVariableLengthDecls.getArrayRef();
494e8d8bef9SDimitry Andric   }
49506c3fb27SDimitry Andric 
49606c3fb27SDimitry Andric   /// Returns the list of the delayed variables with the variably modified
49706c3fb27SDimitry Andric   /// types.
49806c3fb27SDimitry Andric   ArrayRef<const ValueDecl *> getDelayedVariableLengthDecls() const {
49906c3fb27SDimitry Andric     return DelayedVariableLengthDecls.getArrayRef();
50006c3fb27SDimitry Andric   }
501e8d8bef9SDimitry Andric };
502e8d8bef9SDimitry Andric } // anonymous namespace
503e8d8bef9SDimitry Andric 
504e8d8bef9SDimitry Andric CGOpenMPRuntimeGPU::ExecutionMode
505e8d8bef9SDimitry Andric CGOpenMPRuntimeGPU::getExecutionMode() const {
506e8d8bef9SDimitry Andric   return CurrentExecutionMode;
507e8d8bef9SDimitry Andric }
508e8d8bef9SDimitry Andric 
5095f757f3fSDimitry Andric CGOpenMPRuntimeGPU::DataSharingMode
5105f757f3fSDimitry Andric CGOpenMPRuntimeGPU::getDataSharingMode() const {
5115f757f3fSDimitry Andric   return CurrentDataSharingMode;
512e8d8bef9SDimitry Andric }
513e8d8bef9SDimitry Andric 
514e8d8bef9SDimitry Andric /// Check for inner (nested) SPMD construct, if any
515e8d8bef9SDimitry Andric static bool hasNestedSPMDDirective(ASTContext &Ctx,
516e8d8bef9SDimitry Andric                                    const OMPExecutableDirective &D) {
517e8d8bef9SDimitry Andric   const auto *CS = D.getInnermostCapturedStmt();
518e8d8bef9SDimitry Andric   const auto *Body =
519e8d8bef9SDimitry Andric       CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true);
520e8d8bef9SDimitry Andric   const Stmt *ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
521e8d8bef9SDimitry Andric 
522e8d8bef9SDimitry Andric   if (const auto *NestedDir =
523e8d8bef9SDimitry Andric           dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
524e8d8bef9SDimitry Andric     OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
525e8d8bef9SDimitry Andric     switch (D.getDirectiveKind()) {
526e8d8bef9SDimitry Andric     case OMPD_target:
527e8d8bef9SDimitry Andric       if (isOpenMPParallelDirective(DKind))
528e8d8bef9SDimitry Andric         return true;
529e8d8bef9SDimitry Andric       if (DKind == OMPD_teams) {
530e8d8bef9SDimitry Andric         Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
531e8d8bef9SDimitry Andric             /*IgnoreCaptured=*/true);
532e8d8bef9SDimitry Andric         if (!Body)
533e8d8bef9SDimitry Andric           return false;
534e8d8bef9SDimitry Andric         ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
535e8d8bef9SDimitry Andric         if (const auto *NND =
536e8d8bef9SDimitry Andric                 dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
537e8d8bef9SDimitry Andric           DKind = NND->getDirectiveKind();
538e8d8bef9SDimitry Andric           if (isOpenMPParallelDirective(DKind))
539e8d8bef9SDimitry Andric             return true;
540e8d8bef9SDimitry Andric         }
541e8d8bef9SDimitry Andric       }
542e8d8bef9SDimitry Andric       return false;
543e8d8bef9SDimitry Andric     case OMPD_target_teams:
544e8d8bef9SDimitry Andric       return isOpenMPParallelDirective(DKind);
545e8d8bef9SDimitry Andric     case OMPD_target_simd:
546e8d8bef9SDimitry Andric     case OMPD_target_parallel:
547e8d8bef9SDimitry Andric     case OMPD_target_parallel_for:
548e8d8bef9SDimitry Andric     case OMPD_target_parallel_for_simd:
549e8d8bef9SDimitry Andric     case OMPD_target_teams_distribute:
550e8d8bef9SDimitry Andric     case OMPD_target_teams_distribute_simd:
551e8d8bef9SDimitry Andric     case OMPD_target_teams_distribute_parallel_for:
552e8d8bef9SDimitry Andric     case OMPD_target_teams_distribute_parallel_for_simd:
553e8d8bef9SDimitry Andric     case OMPD_parallel:
554e8d8bef9SDimitry Andric     case OMPD_for:
555e8d8bef9SDimitry Andric     case OMPD_parallel_for:
556e8d8bef9SDimitry Andric     case OMPD_parallel_master:
557e8d8bef9SDimitry Andric     case OMPD_parallel_sections:
558e8d8bef9SDimitry Andric     case OMPD_for_simd:
559e8d8bef9SDimitry Andric     case OMPD_parallel_for_simd:
560e8d8bef9SDimitry Andric     case OMPD_cancel:
561e8d8bef9SDimitry Andric     case OMPD_cancellation_point:
562e8d8bef9SDimitry Andric     case OMPD_ordered:
563e8d8bef9SDimitry Andric     case OMPD_threadprivate:
564e8d8bef9SDimitry Andric     case OMPD_allocate:
565e8d8bef9SDimitry Andric     case OMPD_task:
566e8d8bef9SDimitry Andric     case OMPD_simd:
567e8d8bef9SDimitry Andric     case OMPD_sections:
568e8d8bef9SDimitry Andric     case OMPD_section:
569e8d8bef9SDimitry Andric     case OMPD_single:
570e8d8bef9SDimitry Andric     case OMPD_master:
571e8d8bef9SDimitry Andric     case OMPD_critical:
572e8d8bef9SDimitry Andric     case OMPD_taskyield:
573e8d8bef9SDimitry Andric     case OMPD_barrier:
574e8d8bef9SDimitry Andric     case OMPD_taskwait:
575e8d8bef9SDimitry Andric     case OMPD_taskgroup:
576e8d8bef9SDimitry Andric     case OMPD_atomic:
577e8d8bef9SDimitry Andric     case OMPD_flush:
578e8d8bef9SDimitry Andric     case OMPD_depobj:
579e8d8bef9SDimitry Andric     case OMPD_scan:
580e8d8bef9SDimitry Andric     case OMPD_teams:
581e8d8bef9SDimitry Andric     case OMPD_target_data:
582e8d8bef9SDimitry Andric     case OMPD_target_exit_data:
583e8d8bef9SDimitry Andric     case OMPD_target_enter_data:
584e8d8bef9SDimitry Andric     case OMPD_distribute:
585e8d8bef9SDimitry Andric     case OMPD_distribute_simd:
586e8d8bef9SDimitry Andric     case OMPD_distribute_parallel_for:
587e8d8bef9SDimitry Andric     case OMPD_distribute_parallel_for_simd:
588e8d8bef9SDimitry Andric     case OMPD_teams_distribute:
589e8d8bef9SDimitry Andric     case OMPD_teams_distribute_simd:
590e8d8bef9SDimitry Andric     case OMPD_teams_distribute_parallel_for:
591e8d8bef9SDimitry Andric     case OMPD_teams_distribute_parallel_for_simd:
592e8d8bef9SDimitry Andric     case OMPD_target_update:
593e8d8bef9SDimitry Andric     case OMPD_declare_simd:
594e8d8bef9SDimitry Andric     case OMPD_declare_variant:
595e8d8bef9SDimitry Andric     case OMPD_begin_declare_variant:
596e8d8bef9SDimitry Andric     case OMPD_end_declare_variant:
597e8d8bef9SDimitry Andric     case OMPD_declare_target:
598e8d8bef9SDimitry Andric     case OMPD_end_declare_target:
599e8d8bef9SDimitry Andric     case OMPD_declare_reduction:
600e8d8bef9SDimitry Andric     case OMPD_declare_mapper:
601e8d8bef9SDimitry Andric     case OMPD_taskloop:
602e8d8bef9SDimitry Andric     case OMPD_taskloop_simd:
603e8d8bef9SDimitry Andric     case OMPD_master_taskloop:
604e8d8bef9SDimitry Andric     case OMPD_master_taskloop_simd:
605e8d8bef9SDimitry Andric     case OMPD_parallel_master_taskloop:
606e8d8bef9SDimitry Andric     case OMPD_parallel_master_taskloop_simd:
607e8d8bef9SDimitry Andric     case OMPD_requires:
608e8d8bef9SDimitry Andric     case OMPD_unknown:
609e8d8bef9SDimitry Andric     default:
610e8d8bef9SDimitry Andric       llvm_unreachable("Unexpected directive.");
611e8d8bef9SDimitry Andric     }
612e8d8bef9SDimitry Andric   }
613e8d8bef9SDimitry Andric 
614e8d8bef9SDimitry Andric   return false;
615e8d8bef9SDimitry Andric }
616e8d8bef9SDimitry Andric 
617e8d8bef9SDimitry Andric static bool supportsSPMDExecutionMode(ASTContext &Ctx,
618e8d8bef9SDimitry Andric                                       const OMPExecutableDirective &D) {
619e8d8bef9SDimitry Andric   OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
620e8d8bef9SDimitry Andric   switch (DirectiveKind) {
621e8d8bef9SDimitry Andric   case OMPD_target:
622e8d8bef9SDimitry Andric   case OMPD_target_teams:
623e8d8bef9SDimitry Andric     return hasNestedSPMDDirective(Ctx, D);
62406c3fb27SDimitry Andric   case OMPD_target_parallel_loop:
625e8d8bef9SDimitry Andric   case OMPD_target_parallel:
626e8d8bef9SDimitry Andric   case OMPD_target_parallel_for:
627e8d8bef9SDimitry Andric   case OMPD_target_parallel_for_simd:
628e8d8bef9SDimitry Andric   case OMPD_target_teams_distribute_parallel_for:
629e8d8bef9SDimitry Andric   case OMPD_target_teams_distribute_parallel_for_simd:
630e8d8bef9SDimitry Andric   case OMPD_target_simd:
631e8d8bef9SDimitry Andric   case OMPD_target_teams_distribute_simd:
632e8d8bef9SDimitry Andric     return true;
633e8d8bef9SDimitry Andric   case OMPD_target_teams_distribute:
634e8d8bef9SDimitry Andric     return false;
6350fca6ea1SDimitry Andric   case OMPD_target_teams_loop:
6360fca6ea1SDimitry Andric     // Whether this is true or not depends on how the directive will
6370fca6ea1SDimitry Andric     // eventually be emitted.
6380fca6ea1SDimitry Andric     if (auto *TTLD = dyn_cast<OMPTargetTeamsGenericLoopDirective>(&D))
6390fca6ea1SDimitry Andric       return TTLD->canBeParallelFor();
6400fca6ea1SDimitry Andric     return false;
641e8d8bef9SDimitry Andric   case OMPD_parallel:
642e8d8bef9SDimitry Andric   case OMPD_for:
643e8d8bef9SDimitry Andric   case OMPD_parallel_for:
644e8d8bef9SDimitry Andric   case OMPD_parallel_master:
645e8d8bef9SDimitry Andric   case OMPD_parallel_sections:
646e8d8bef9SDimitry Andric   case OMPD_for_simd:
647e8d8bef9SDimitry Andric   case OMPD_parallel_for_simd:
648e8d8bef9SDimitry Andric   case OMPD_cancel:
649e8d8bef9SDimitry Andric   case OMPD_cancellation_point:
650e8d8bef9SDimitry Andric   case OMPD_ordered:
651e8d8bef9SDimitry Andric   case OMPD_threadprivate:
652e8d8bef9SDimitry Andric   case OMPD_allocate:
653e8d8bef9SDimitry Andric   case OMPD_task:
654e8d8bef9SDimitry Andric   case OMPD_simd:
655e8d8bef9SDimitry Andric   case OMPD_sections:
656e8d8bef9SDimitry Andric   case OMPD_section:
657e8d8bef9SDimitry Andric   case OMPD_single:
658e8d8bef9SDimitry Andric   case OMPD_master:
659e8d8bef9SDimitry Andric   case OMPD_critical:
660e8d8bef9SDimitry Andric   case OMPD_taskyield:
661e8d8bef9SDimitry Andric   case OMPD_barrier:
662e8d8bef9SDimitry Andric   case OMPD_taskwait:
663e8d8bef9SDimitry Andric   case OMPD_taskgroup:
664e8d8bef9SDimitry Andric   case OMPD_atomic:
665e8d8bef9SDimitry Andric   case OMPD_flush:
666e8d8bef9SDimitry Andric   case OMPD_depobj:
667e8d8bef9SDimitry Andric   case OMPD_scan:
668e8d8bef9SDimitry Andric   case OMPD_teams:
669e8d8bef9SDimitry Andric   case OMPD_target_data:
670e8d8bef9SDimitry Andric   case OMPD_target_exit_data:
671e8d8bef9SDimitry Andric   case OMPD_target_enter_data:
672e8d8bef9SDimitry Andric   case OMPD_distribute:
673e8d8bef9SDimitry Andric   case OMPD_distribute_simd:
674e8d8bef9SDimitry Andric   case OMPD_distribute_parallel_for:
675e8d8bef9SDimitry Andric   case OMPD_distribute_parallel_for_simd:
676e8d8bef9SDimitry Andric   case OMPD_teams_distribute:
677e8d8bef9SDimitry Andric   case OMPD_teams_distribute_simd:
678e8d8bef9SDimitry Andric   case OMPD_teams_distribute_parallel_for:
679e8d8bef9SDimitry Andric   case OMPD_teams_distribute_parallel_for_simd:
680e8d8bef9SDimitry Andric   case OMPD_target_update:
681e8d8bef9SDimitry Andric   case OMPD_declare_simd:
682e8d8bef9SDimitry Andric   case OMPD_declare_variant:
683e8d8bef9SDimitry Andric   case OMPD_begin_declare_variant:
684e8d8bef9SDimitry Andric   case OMPD_end_declare_variant:
685e8d8bef9SDimitry Andric   case OMPD_declare_target:
686e8d8bef9SDimitry Andric   case OMPD_end_declare_target:
687e8d8bef9SDimitry Andric   case OMPD_declare_reduction:
688e8d8bef9SDimitry Andric   case OMPD_declare_mapper:
689e8d8bef9SDimitry Andric   case OMPD_taskloop:
690e8d8bef9SDimitry Andric   case OMPD_taskloop_simd:
691e8d8bef9SDimitry Andric   case OMPD_master_taskloop:
692e8d8bef9SDimitry Andric   case OMPD_master_taskloop_simd:
693e8d8bef9SDimitry Andric   case OMPD_parallel_master_taskloop:
694e8d8bef9SDimitry Andric   case OMPD_parallel_master_taskloop_simd:
695e8d8bef9SDimitry Andric   case OMPD_requires:
696e8d8bef9SDimitry Andric   case OMPD_unknown:
697e8d8bef9SDimitry Andric   default:
698e8d8bef9SDimitry Andric     break;
699e8d8bef9SDimitry Andric   }
700e8d8bef9SDimitry Andric   llvm_unreachable(
701e8d8bef9SDimitry Andric       "Unknown programming model for OpenMP directive on NVPTX target.");
702e8d8bef9SDimitry Andric }
703e8d8bef9SDimitry Andric 
704e8d8bef9SDimitry Andric void CGOpenMPRuntimeGPU::emitNonSPMDKernel(const OMPExecutableDirective &D,
705e8d8bef9SDimitry Andric                                              StringRef ParentName,
706e8d8bef9SDimitry Andric                                              llvm::Function *&OutlinedFn,
707e8d8bef9SDimitry Andric                                              llvm::Constant *&OutlinedFnID,
708e8d8bef9SDimitry Andric                                              bool IsOffloadEntry,
709e8d8bef9SDimitry Andric                                              const RegionCodeGenTy &CodeGen) {
710bdd1243dSDimitry Andric   ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode, EM_NonSPMD);
711e8d8bef9SDimitry Andric   EntryFunctionState EST;
712e8d8bef9SDimitry Andric   WrapperFunctionsMap.clear();
713e8d8bef9SDimitry Andric 
7145f757f3fSDimitry Andric   [[maybe_unused]] bool IsBareKernel = D.getSingleClause<OMPXBareClause>();
7155f757f3fSDimitry Andric   assert(!IsBareKernel && "bare kernel should not be at generic mode");
7165f757f3fSDimitry Andric 
717e8d8bef9SDimitry Andric   // Emit target region as a standalone region.
718e8d8bef9SDimitry Andric   class NVPTXPrePostActionTy : public PrePostActionTy {
719e8d8bef9SDimitry Andric     CGOpenMPRuntimeGPU::EntryFunctionState &EST;
7205f757f3fSDimitry Andric     const OMPExecutableDirective &D;
721e8d8bef9SDimitry Andric 
722e8d8bef9SDimitry Andric   public:
7235f757f3fSDimitry Andric     NVPTXPrePostActionTy(CGOpenMPRuntimeGPU::EntryFunctionState &EST,
7245f757f3fSDimitry Andric                          const OMPExecutableDirective &D)
7255f757f3fSDimitry Andric         : EST(EST), D(D) {}
726e8d8bef9SDimitry Andric     void Enter(CodeGenFunction &CGF) override {
7275f757f3fSDimitry Andric       auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
7285f757f3fSDimitry Andric       RT.emitKernelInit(D, CGF, EST, /* IsSPMD */ false);
729e8d8bef9SDimitry Andric       // Skip target region initialization.
730e8d8bef9SDimitry Andric       RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
731e8d8bef9SDimitry Andric     }
732e8d8bef9SDimitry Andric     void Exit(CodeGenFunction &CGF) override {
7335f757f3fSDimitry Andric       auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
734e8d8bef9SDimitry Andric       RT.clearLocThreadIdInsertPt(CGF);
735fe6060f1SDimitry Andric       RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ false);
736e8d8bef9SDimitry Andric     }
7375f757f3fSDimitry Andric   } Action(EST, D);
738e8d8bef9SDimitry Andric   CodeGen.setAction(Action);
739e8d8bef9SDimitry Andric   IsInTTDRegion = true;
740e8d8bef9SDimitry Andric   emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
741e8d8bef9SDimitry Andric                                    IsOffloadEntry, CodeGen);
742e8d8bef9SDimitry Andric   IsInTTDRegion = false;
743e8d8bef9SDimitry Andric }
744e8d8bef9SDimitry Andric 
7455f757f3fSDimitry Andric void CGOpenMPRuntimeGPU::emitKernelInit(const OMPExecutableDirective &D,
7465f757f3fSDimitry Andric                                         CodeGenFunction &CGF,
747fe6060f1SDimitry Andric                                         EntryFunctionState &EST, bool IsSPMD) {
7485f757f3fSDimitry Andric   int32_t MinThreadsVal = 1, MaxThreadsVal = -1, MinTeamsVal = 1,
7495f757f3fSDimitry Andric           MaxTeamsVal = -1;
7505f757f3fSDimitry Andric   computeMinAndMaxThreadsAndTeams(D, CGF, MinThreadsVal, MaxThreadsVal,
7515f757f3fSDimitry Andric                                   MinTeamsVal, MaxTeamsVal);
7525f757f3fSDimitry Andric 
753e8d8bef9SDimitry Andric   CGBuilderTy &Bld = CGF.Builder;
7545f757f3fSDimitry Andric   Bld.restoreIP(OMPBuilder.createTargetInit(
7555f757f3fSDimitry Andric       Bld, IsSPMD, MinThreadsVal, MaxThreadsVal, MinTeamsVal, MaxTeamsVal));
756fe6060f1SDimitry Andric   if (!IsSPMD)
757fe6060f1SDimitry Andric     emitGenericVarsProlog(CGF, EST.Loc);
758e8d8bef9SDimitry Andric }
759e8d8bef9SDimitry Andric 
760fe6060f1SDimitry Andric void CGOpenMPRuntimeGPU::emitKernelDeinit(CodeGenFunction &CGF,
761fe6060f1SDimitry Andric                                           EntryFunctionState &EST,
762fe6060f1SDimitry Andric                                           bool IsSPMD) {
763fe6060f1SDimitry Andric   if (!IsSPMD)
764e8d8bef9SDimitry Andric     emitGenericVarsEpilog(CGF);
765e8d8bef9SDimitry Andric 
7665f757f3fSDimitry Andric   // This is temporary until we remove the fixed sized buffer.
7675f757f3fSDimitry Andric   ASTContext &C = CGM.getContext();
7685f757f3fSDimitry Andric   RecordDecl *StaticRD = C.buildImplicitRecord(
7695f757f3fSDimitry Andric       "_openmp_teams_reduction_type_$_", RecordDecl::TagKind::Union);
7705f757f3fSDimitry Andric   StaticRD->startDefinition();
7715f757f3fSDimitry Andric   for (const RecordDecl *TeamReductionRec : TeamsReductions) {
7725f757f3fSDimitry Andric     QualType RecTy = C.getRecordType(TeamReductionRec);
7735f757f3fSDimitry Andric     auto *Field = FieldDecl::Create(
7745f757f3fSDimitry Andric         C, StaticRD, SourceLocation(), SourceLocation(), nullptr, RecTy,
7755f757f3fSDimitry Andric         C.getTrivialTypeSourceInfo(RecTy, SourceLocation()),
7765f757f3fSDimitry Andric         /*BW=*/nullptr, /*Mutable=*/false,
7775f757f3fSDimitry Andric         /*InitStyle=*/ICIS_NoInit);
7785f757f3fSDimitry Andric     Field->setAccess(AS_public);
7795f757f3fSDimitry Andric     StaticRD->addDecl(Field);
7805f757f3fSDimitry Andric   }
7815f757f3fSDimitry Andric   StaticRD->completeDefinition();
7825f757f3fSDimitry Andric   QualType StaticTy = C.getRecordType(StaticRD);
7835f757f3fSDimitry Andric   llvm::Type *LLVMReductionsBufferTy =
7845f757f3fSDimitry Andric       CGM.getTypes().ConvertTypeForMem(StaticTy);
7855f757f3fSDimitry Andric   const auto &DL = CGM.getModule().getDataLayout();
7865f757f3fSDimitry Andric   uint64_t ReductionDataSize =
7875f757f3fSDimitry Andric       TeamsReductions.empty()
7885f757f3fSDimitry Andric           ? 0
7895f757f3fSDimitry Andric           : DL.getTypeAllocSize(LLVMReductionsBufferTy).getFixedValue();
790fe6060f1SDimitry Andric   CGBuilderTy &Bld = CGF.Builder;
7915f757f3fSDimitry Andric   OMPBuilder.createTargetDeinit(Bld, ReductionDataSize,
7925f757f3fSDimitry Andric                                 C.getLangOpts().OpenMPCUDAReductionBufNum);
7935f757f3fSDimitry Andric   TeamsReductions.clear();
794e8d8bef9SDimitry Andric }
795e8d8bef9SDimitry Andric 
796e8d8bef9SDimitry Andric void CGOpenMPRuntimeGPU::emitSPMDKernel(const OMPExecutableDirective &D,
797e8d8bef9SDimitry Andric                                           StringRef ParentName,
798e8d8bef9SDimitry Andric                                           llvm::Function *&OutlinedFn,
799e8d8bef9SDimitry Andric                                           llvm::Constant *&OutlinedFnID,
800e8d8bef9SDimitry Andric                                           bool IsOffloadEntry,
801e8d8bef9SDimitry Andric                                           const RegionCodeGenTy &CodeGen) {
802bdd1243dSDimitry Andric   ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode, EM_SPMD);
803e8d8bef9SDimitry Andric   EntryFunctionState EST;
804e8d8bef9SDimitry Andric 
8055f757f3fSDimitry Andric   bool IsBareKernel = D.getSingleClause<OMPXBareClause>();
8065f757f3fSDimitry Andric 
807e8d8bef9SDimitry Andric   // Emit target region as a standalone region.
808e8d8bef9SDimitry Andric   class NVPTXPrePostActionTy : public PrePostActionTy {
809e8d8bef9SDimitry Andric     CGOpenMPRuntimeGPU &RT;
810e8d8bef9SDimitry Andric     CGOpenMPRuntimeGPU::EntryFunctionState &EST;
8115f757f3fSDimitry Andric     bool IsBareKernel;
8125f757f3fSDimitry Andric     DataSharingMode Mode;
8135f757f3fSDimitry Andric     const OMPExecutableDirective &D;
814e8d8bef9SDimitry Andric 
815e8d8bef9SDimitry Andric   public:
816e8d8bef9SDimitry Andric     NVPTXPrePostActionTy(CGOpenMPRuntimeGPU &RT,
8175f757f3fSDimitry Andric                          CGOpenMPRuntimeGPU::EntryFunctionState &EST,
8185f757f3fSDimitry Andric                          bool IsBareKernel, const OMPExecutableDirective &D)
8195f757f3fSDimitry Andric         : RT(RT), EST(EST), IsBareKernel(IsBareKernel),
8205f757f3fSDimitry Andric           Mode(RT.CurrentDataSharingMode), D(D) {}
821e8d8bef9SDimitry Andric     void Enter(CodeGenFunction &CGF) override {
8225f757f3fSDimitry Andric       if (IsBareKernel) {
8235f757f3fSDimitry Andric         RT.CurrentDataSharingMode = DataSharingMode::DS_CUDA;
8245f757f3fSDimitry Andric         return;
8255f757f3fSDimitry Andric       }
8265f757f3fSDimitry Andric       RT.emitKernelInit(D, CGF, EST, /* IsSPMD */ true);
827e8d8bef9SDimitry Andric       // Skip target region initialization.
828e8d8bef9SDimitry Andric       RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
829e8d8bef9SDimitry Andric     }
830e8d8bef9SDimitry Andric     void Exit(CodeGenFunction &CGF) override {
8315f757f3fSDimitry Andric       if (IsBareKernel) {
8325f757f3fSDimitry Andric         RT.CurrentDataSharingMode = Mode;
8335f757f3fSDimitry Andric         return;
8345f757f3fSDimitry Andric       }
835e8d8bef9SDimitry Andric       RT.clearLocThreadIdInsertPt(CGF);
836fe6060f1SDimitry Andric       RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ true);
837e8d8bef9SDimitry Andric     }
8385f757f3fSDimitry Andric   } Action(*this, EST, IsBareKernel, D);
839e8d8bef9SDimitry Andric   CodeGen.setAction(Action);
840e8d8bef9SDimitry Andric   IsInTTDRegion = true;
841e8d8bef9SDimitry Andric   emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
842e8d8bef9SDimitry Andric                                    IsOffloadEntry, CodeGen);
843e8d8bef9SDimitry Andric   IsInTTDRegion = false;
844e8d8bef9SDimitry Andric }
845e8d8bef9SDimitry Andric 
846e8d8bef9SDimitry Andric void CGOpenMPRuntimeGPU::emitTargetOutlinedFunction(
847e8d8bef9SDimitry Andric     const OMPExecutableDirective &D, StringRef ParentName,
848e8d8bef9SDimitry Andric     llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
849e8d8bef9SDimitry Andric     bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
850e8d8bef9SDimitry Andric   if (!IsOffloadEntry) // Nothing to do.
851e8d8bef9SDimitry Andric     return;
852e8d8bef9SDimitry Andric 
853e8d8bef9SDimitry Andric   assert(!ParentName.empty() && "Invalid target region parent name!");
854e8d8bef9SDimitry Andric 
855e8d8bef9SDimitry Andric   bool Mode = supportsSPMDExecutionMode(CGM.getContext(), D);
8565f757f3fSDimitry Andric   bool IsBareKernel = D.getSingleClause<OMPXBareClause>();
8575f757f3fSDimitry Andric   if (Mode || IsBareKernel)
858e8d8bef9SDimitry Andric     emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
859e8d8bef9SDimitry Andric                    CodeGen);
860e8d8bef9SDimitry Andric   else
861e8d8bef9SDimitry Andric     emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
862e8d8bef9SDimitry Andric                       CodeGen);
863e8d8bef9SDimitry Andric }
864e8d8bef9SDimitry Andric 
865e8d8bef9SDimitry Andric CGOpenMPRuntimeGPU::CGOpenMPRuntimeGPU(CodeGenModule &CGM)
866bdd1243dSDimitry Andric     : CGOpenMPRuntime(CGM) {
8675f757f3fSDimitry Andric   llvm::OpenMPIRBuilderConfig Config(
8685f757f3fSDimitry Andric       CGM.getLangOpts().OpenMPIsTargetDevice, isGPU(),
8695f757f3fSDimitry Andric       CGM.getLangOpts().OpenMPOffloadMandatory,
8705f757f3fSDimitry Andric       /*HasRequiresReverseOffload*/ false, /*HasRequiresUnifiedAddress*/ false,
8715f757f3fSDimitry Andric       hasRequiresUnifiedSharedMemory(), /*HasRequiresDynamicAllocators*/ false);
872bdd1243dSDimitry Andric   OMPBuilder.setConfig(Config);
873bdd1243dSDimitry Andric 
87406c3fb27SDimitry Andric   if (!CGM.getLangOpts().OpenMPIsTargetDevice)
875349cc55cSDimitry Andric     llvm_unreachable("OpenMP can only handle device code.");
876349cc55cSDimitry Andric 
8775f757f3fSDimitry Andric   if (CGM.getLangOpts().OpenMPCUDAMode)
8785f757f3fSDimitry Andric     CurrentDataSharingMode = CGOpenMPRuntimeGPU::DS_CUDA;
8795f757f3fSDimitry Andric 
880349cc55cSDimitry Andric   llvm::OpenMPIRBuilder &OMPBuilder = getOMPBuilder();
88181ad6265SDimitry Andric   if (CGM.getLangOpts().NoGPULib || CGM.getLangOpts().OMPHostIRFile.empty())
88281ad6265SDimitry Andric     return;
88381ad6265SDimitry Andric 
884349cc55cSDimitry Andric   OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPTargetDebug,
885349cc55cSDimitry Andric                               "__omp_rtl_debug_kind");
886349cc55cSDimitry Andric   OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPTeamSubscription,
887349cc55cSDimitry Andric                               "__omp_rtl_assume_teams_oversubscription");
888349cc55cSDimitry Andric   OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPThreadSubscription,
889349cc55cSDimitry Andric                               "__omp_rtl_assume_threads_oversubscription");
89081ad6265SDimitry Andric   OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPNoThreadState,
89181ad6265SDimitry Andric                               "__omp_rtl_assume_no_thread_state");
892bdd1243dSDimitry Andric   OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPNoNestedParallelism,
893bdd1243dSDimitry Andric                               "__omp_rtl_assume_no_nested_parallelism");
894e8d8bef9SDimitry Andric }
895e8d8bef9SDimitry Andric 
896e8d8bef9SDimitry Andric void CGOpenMPRuntimeGPU::emitProcBindClause(CodeGenFunction &CGF,
897e8d8bef9SDimitry Andric                                               ProcBindKind ProcBind,
898e8d8bef9SDimitry Andric                                               SourceLocation Loc) {
8995f757f3fSDimitry Andric   // Nothing to do.
900e8d8bef9SDimitry Andric }
901e8d8bef9SDimitry Andric 
902e8d8bef9SDimitry Andric void CGOpenMPRuntimeGPU::emitNumThreadsClause(CodeGenFunction &CGF,
903e8d8bef9SDimitry Andric                                                 llvm::Value *NumThreads,
904e8d8bef9SDimitry Andric                                                 SourceLocation Loc) {
9050eae32dcSDimitry Andric   // Nothing to do.
906e8d8bef9SDimitry Andric }
907e8d8bef9SDimitry Andric 
908e8d8bef9SDimitry Andric void CGOpenMPRuntimeGPU::emitNumTeamsClause(CodeGenFunction &CGF,
909e8d8bef9SDimitry Andric                                               const Expr *NumTeams,
910e8d8bef9SDimitry Andric                                               const Expr *ThreadLimit,
911e8d8bef9SDimitry Andric                                               SourceLocation Loc) {}
912e8d8bef9SDimitry Andric 
913e8d8bef9SDimitry Andric llvm::Function *CGOpenMPRuntimeGPU::emitParallelOutlinedFunction(
91406c3fb27SDimitry Andric     CodeGenFunction &CGF, const OMPExecutableDirective &D,
91506c3fb27SDimitry Andric     const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind,
91606c3fb27SDimitry Andric     const RegionCodeGenTy &CodeGen) {
917e8d8bef9SDimitry Andric   // Emit target region as a standalone region.
918e8d8bef9SDimitry Andric   bool PrevIsInTTDRegion = IsInTTDRegion;
919e8d8bef9SDimitry Andric   IsInTTDRegion = false;
920e8d8bef9SDimitry Andric   auto *OutlinedFun =
921e8d8bef9SDimitry Andric       cast<llvm::Function>(CGOpenMPRuntime::emitParallelOutlinedFunction(
92206c3fb27SDimitry Andric           CGF, D, ThreadIDVar, InnermostKind, CodeGen));
923e8d8bef9SDimitry Andric   IsInTTDRegion = PrevIsInTTDRegion;
924bdd1243dSDimitry Andric   if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD) {
925e8d8bef9SDimitry Andric     llvm::Function *WrapperFun =
926e8d8bef9SDimitry Andric         createParallelDataSharingWrapper(OutlinedFun, D);
927e8d8bef9SDimitry Andric     WrapperFunctionsMap[OutlinedFun] = WrapperFun;
928e8d8bef9SDimitry Andric   }
929e8d8bef9SDimitry Andric 
930e8d8bef9SDimitry Andric   return OutlinedFun;
931e8d8bef9SDimitry Andric }
932e8d8bef9SDimitry Andric 
933e8d8bef9SDimitry Andric /// Get list of lastprivate variables from the teams distribute ... or
934e8d8bef9SDimitry Andric /// teams {distribute ...} directives.
935e8d8bef9SDimitry Andric static void
936e8d8bef9SDimitry Andric getDistributeLastprivateVars(ASTContext &Ctx, const OMPExecutableDirective &D,
937e8d8bef9SDimitry Andric                              llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
938e8d8bef9SDimitry Andric   assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
939e8d8bef9SDimitry Andric          "expected teams directive.");
940e8d8bef9SDimitry Andric   const OMPExecutableDirective *Dir = &D;
941e8d8bef9SDimitry Andric   if (!isOpenMPDistributeDirective(D.getDirectiveKind())) {
942e8d8bef9SDimitry Andric     if (const Stmt *S = CGOpenMPRuntime::getSingleCompoundChild(
943e8d8bef9SDimitry Andric             Ctx,
944e8d8bef9SDimitry Andric             D.getInnermostCapturedStmt()->getCapturedStmt()->IgnoreContainers(
945e8d8bef9SDimitry Andric                 /*IgnoreCaptured=*/true))) {
946e8d8bef9SDimitry Andric       Dir = dyn_cast_or_null<OMPExecutableDirective>(S);
947e8d8bef9SDimitry Andric       if (Dir && !isOpenMPDistributeDirective(Dir->getDirectiveKind()))
948e8d8bef9SDimitry Andric         Dir = nullptr;
949e8d8bef9SDimitry Andric     }
950e8d8bef9SDimitry Andric   }
951e8d8bef9SDimitry Andric   if (!Dir)
952e8d8bef9SDimitry Andric     return;
953e8d8bef9SDimitry Andric   for (const auto *C : Dir->getClausesOfKind<OMPLastprivateClause>()) {
954e8d8bef9SDimitry Andric     for (const Expr *E : C->getVarRefs())
955e8d8bef9SDimitry Andric       Vars.push_back(getPrivateItem(E));
956e8d8bef9SDimitry Andric   }
957e8d8bef9SDimitry Andric }
958e8d8bef9SDimitry Andric 
959e8d8bef9SDimitry Andric /// Get list of reduction variables from the teams ... directives.
960e8d8bef9SDimitry Andric static void
961e8d8bef9SDimitry Andric getTeamsReductionVars(ASTContext &Ctx, const OMPExecutableDirective &D,
962e8d8bef9SDimitry Andric                       llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
963e8d8bef9SDimitry Andric   assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
964e8d8bef9SDimitry Andric          "expected teams directive.");
965e8d8bef9SDimitry Andric   for (const auto *C : D.getClausesOfKind<OMPReductionClause>()) {
966e8d8bef9SDimitry Andric     for (const Expr *E : C->privates())
967e8d8bef9SDimitry Andric       Vars.push_back(getPrivateItem(E));
968e8d8bef9SDimitry Andric   }
969e8d8bef9SDimitry Andric }
970e8d8bef9SDimitry Andric 
971e8d8bef9SDimitry Andric llvm::Function *CGOpenMPRuntimeGPU::emitTeamsOutlinedFunction(
97206c3fb27SDimitry Andric     CodeGenFunction &CGF, const OMPExecutableDirective &D,
97306c3fb27SDimitry Andric     const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind,
97406c3fb27SDimitry Andric     const RegionCodeGenTy &CodeGen) {
975e8d8bef9SDimitry Andric   SourceLocation Loc = D.getBeginLoc();
976e8d8bef9SDimitry Andric 
977e8d8bef9SDimitry Andric   const RecordDecl *GlobalizedRD = nullptr;
978e8d8bef9SDimitry Andric   llvm::SmallVector<const ValueDecl *, 4> LastPrivatesReductions;
979e8d8bef9SDimitry Andric   llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
980349cc55cSDimitry Andric   unsigned WarpSize = CGM.getTarget().getGridValue().GV_Warp_Size;
981e8d8bef9SDimitry Andric   // Globalize team reductions variable unconditionally in all modes.
982e8d8bef9SDimitry Andric   if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
983e8d8bef9SDimitry Andric     getTeamsReductionVars(CGM.getContext(), D, LastPrivatesReductions);
984e8d8bef9SDimitry Andric   if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {
985e8d8bef9SDimitry Andric     getDistributeLastprivateVars(CGM.getContext(), D, LastPrivatesReductions);
986e8d8bef9SDimitry Andric     if (!LastPrivatesReductions.empty()) {
987e8d8bef9SDimitry Andric       GlobalizedRD = ::buildRecordForGlobalizedVars(
988bdd1243dSDimitry Andric           CGM.getContext(), std::nullopt, LastPrivatesReductions,
989e8d8bef9SDimitry Andric           MappedDeclsFields, WarpSize);
990e8d8bef9SDimitry Andric     }
991e8d8bef9SDimitry Andric   } else if (!LastPrivatesReductions.empty()) {
992e8d8bef9SDimitry Andric     assert(!TeamAndReductions.first &&
993e8d8bef9SDimitry Andric            "Previous team declaration is not expected.");
994e8d8bef9SDimitry Andric     TeamAndReductions.first = D.getCapturedStmt(OMPD_teams)->getCapturedDecl();
995e8d8bef9SDimitry Andric     std::swap(TeamAndReductions.second, LastPrivatesReductions);
996e8d8bef9SDimitry Andric   }
997e8d8bef9SDimitry Andric 
998e8d8bef9SDimitry Andric   // Emit target region as a standalone region.
999e8d8bef9SDimitry Andric   class NVPTXPrePostActionTy : public PrePostActionTy {
1000e8d8bef9SDimitry Andric     SourceLocation &Loc;
1001e8d8bef9SDimitry Andric     const RecordDecl *GlobalizedRD;
1002e8d8bef9SDimitry Andric     llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
1003e8d8bef9SDimitry Andric         &MappedDeclsFields;
1004e8d8bef9SDimitry Andric 
1005e8d8bef9SDimitry Andric   public:
1006e8d8bef9SDimitry Andric     NVPTXPrePostActionTy(
1007e8d8bef9SDimitry Andric         SourceLocation &Loc, const RecordDecl *GlobalizedRD,
1008e8d8bef9SDimitry Andric         llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
1009e8d8bef9SDimitry Andric             &MappedDeclsFields)
1010e8d8bef9SDimitry Andric         : Loc(Loc), GlobalizedRD(GlobalizedRD),
1011e8d8bef9SDimitry Andric           MappedDeclsFields(MappedDeclsFields) {}
1012e8d8bef9SDimitry Andric     void Enter(CodeGenFunction &CGF) override {
1013e8d8bef9SDimitry Andric       auto &Rt =
1014e8d8bef9SDimitry Andric           static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
1015e8d8bef9SDimitry Andric       if (GlobalizedRD) {
1016e8d8bef9SDimitry Andric         auto I = Rt.FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
1017e8d8bef9SDimitry Andric         I->getSecond().MappedParams =
1018e8d8bef9SDimitry Andric             std::make_unique<CodeGenFunction::OMPMapVars>();
1019e8d8bef9SDimitry Andric         DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
1020e8d8bef9SDimitry Andric         for (const auto &Pair : MappedDeclsFields) {
1021e8d8bef9SDimitry Andric           assert(Pair.getFirst()->isCanonicalDecl() &&
1022e8d8bef9SDimitry Andric                  "Expected canonical declaration");
1023fe6060f1SDimitry Andric           Data.insert(std::make_pair(Pair.getFirst(), MappedVarData()));
1024e8d8bef9SDimitry Andric         }
1025e8d8bef9SDimitry Andric       }
1026e8d8bef9SDimitry Andric       Rt.emitGenericVarsProlog(CGF, Loc);
1027e8d8bef9SDimitry Andric     }
1028e8d8bef9SDimitry Andric     void Exit(CodeGenFunction &CGF) override {
1029e8d8bef9SDimitry Andric       static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime())
1030e8d8bef9SDimitry Andric           .emitGenericVarsEpilog(CGF);
1031e8d8bef9SDimitry Andric     }
1032e8d8bef9SDimitry Andric   } Action(Loc, GlobalizedRD, MappedDeclsFields);
1033e8d8bef9SDimitry Andric   CodeGen.setAction(Action);
1034e8d8bef9SDimitry Andric   llvm::Function *OutlinedFun = CGOpenMPRuntime::emitTeamsOutlinedFunction(
103506c3fb27SDimitry Andric       CGF, D, ThreadIDVar, InnermostKind, CodeGen);
1036e8d8bef9SDimitry Andric 
1037e8d8bef9SDimitry Andric   return OutlinedFun;
1038e8d8bef9SDimitry Andric }
1039e8d8bef9SDimitry Andric 
1040e8d8bef9SDimitry Andric void CGOpenMPRuntimeGPU::emitGenericVarsProlog(CodeGenFunction &CGF,
10415f757f3fSDimitry Andric                                                SourceLocation Loc) {
10425f757f3fSDimitry Andric   if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic)
1043e8d8bef9SDimitry Andric     return;
1044e8d8bef9SDimitry Andric 
1045e8d8bef9SDimitry Andric   CGBuilderTy &Bld = CGF.Builder;
1046e8d8bef9SDimitry Andric 
1047e8d8bef9SDimitry Andric   const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
1048e8d8bef9SDimitry Andric   if (I == FunctionGlobalizedDecls.end())
1049e8d8bef9SDimitry Andric     return;
1050e8d8bef9SDimitry Andric 
1051e8d8bef9SDimitry Andric   for (auto &Rec : I->getSecond().LocalVarData) {
1052fe6060f1SDimitry Andric     const auto *VD = cast<VarDecl>(Rec.first);
1053e8d8bef9SDimitry Andric     bool EscapedParam = I->getSecond().EscapedParameters.count(Rec.first);
1054fe6060f1SDimitry Andric     QualType VarTy = VD->getType();
1055fe6060f1SDimitry Andric 
1056fe6060f1SDimitry Andric     // Get the local allocation of a firstprivate variable before sharing
1057e8d8bef9SDimitry Andric     llvm::Value *ParValue;
1058e8d8bef9SDimitry Andric     if (EscapedParam) {
1059e8d8bef9SDimitry Andric       LValue ParLVal =
1060e8d8bef9SDimitry Andric           CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(VD), VD->getType());
1061e8d8bef9SDimitry Andric       ParValue = CGF.EmitLoadOfScalar(ParLVal, Loc);
1062e8d8bef9SDimitry Andric     }
1063fe6060f1SDimitry Andric 
1064fe6060f1SDimitry Andric     // Allocate space for the variable to be globalized
1065fe6060f1SDimitry Andric     llvm::Value *AllocArgs[] = {CGF.getTypeSize(VD->getType())};
106604eeddc0SDimitry Andric     llvm::CallBase *VoidPtr =
1067fe6060f1SDimitry Andric         CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1068fe6060f1SDimitry Andric                                 CGM.getModule(), OMPRTL___kmpc_alloc_shared),
1069fe6060f1SDimitry Andric                             AllocArgs, VD->getName());
107004eeddc0SDimitry Andric     // FIXME: We should use the variables actual alignment as an argument.
107104eeddc0SDimitry Andric     VoidPtr->addRetAttr(llvm::Attribute::get(
107204eeddc0SDimitry Andric         CGM.getLLVMContext(), llvm::Attribute::Alignment,
107304eeddc0SDimitry Andric         CGM.getContext().getTargetInfo().getNewAlign() / 8));
1074fe6060f1SDimitry Andric 
1075fe6060f1SDimitry Andric     // Cast the void pointer and get the address of the globalized variable.
1076fe6060f1SDimitry Andric     llvm::PointerType *VarPtrTy = CGF.ConvertTypeForMem(VarTy)->getPointerTo();
1077fe6060f1SDimitry Andric     llvm::Value *CastedVoidPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
1078fe6060f1SDimitry Andric         VoidPtr, VarPtrTy, VD->getName() + "_on_stack");
10790fca6ea1SDimitry Andric     LValue VarAddr =
10800fca6ea1SDimitry Andric         CGF.MakeNaturalAlignPointeeRawAddrLValue(CastedVoidPtr, VarTy);
10810fca6ea1SDimitry Andric     Rec.second.PrivateAddr = VarAddr.getAddress();
1082fe6060f1SDimitry Andric     Rec.second.GlobalizedVal = VoidPtr;
1083fe6060f1SDimitry Andric 
1084fe6060f1SDimitry Andric     // Assign the local allocation to the newly globalized location.
1085e8d8bef9SDimitry Andric     if (EscapedParam) {
1086e8d8bef9SDimitry Andric       CGF.EmitStoreOfScalar(ParValue, VarAddr);
10870fca6ea1SDimitry Andric       I->getSecond().MappedParams->setVarAddr(CGF, VD, VarAddr.getAddress());
1088e8d8bef9SDimitry Andric     }
1089fe6060f1SDimitry Andric     if (auto *DI = CGF.getDebugInfo())
1090fe6060f1SDimitry Andric       VoidPtr->setDebugLoc(DI->SourceLocToDebugLoc(VD->getLocation()));
1091e8d8bef9SDimitry Andric   }
109206c3fb27SDimitry Andric 
109306c3fb27SDimitry Andric   for (const auto *ValueD : I->getSecond().EscapedVariableLengthDecls) {
109406c3fb27SDimitry Andric     const auto *VD = cast<VarDecl>(ValueD);
109506c3fb27SDimitry Andric     std::pair<llvm::Value *, llvm::Value *> AddrSizePair =
109606c3fb27SDimitry Andric         getKmpcAllocShared(CGF, VD);
109706c3fb27SDimitry Andric     I->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back(AddrSizePair);
109806c3fb27SDimitry Andric     LValue Base = CGF.MakeAddrLValue(AddrSizePair.first, VD->getType(),
109906c3fb27SDimitry Andric                                      CGM.getContext().getDeclAlign(VD),
110006c3fb27SDimitry Andric                                      AlignmentSource::Decl);
11010fca6ea1SDimitry Andric     I->getSecond().MappedParams->setVarAddr(CGF, VD, Base.getAddress());
110206c3fb27SDimitry Andric   }
110306c3fb27SDimitry Andric   I->getSecond().MappedParams->apply(CGF);
110406c3fb27SDimitry Andric }
110506c3fb27SDimitry Andric 
110606c3fb27SDimitry Andric bool CGOpenMPRuntimeGPU::isDelayedVariableLengthDecl(CodeGenFunction &CGF,
110706c3fb27SDimitry Andric                                                      const VarDecl *VD) const {
110806c3fb27SDimitry Andric   const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
110906c3fb27SDimitry Andric   if (I == FunctionGlobalizedDecls.end())
111006c3fb27SDimitry Andric     return false;
111106c3fb27SDimitry Andric 
111206c3fb27SDimitry Andric   // Check variable declaration is delayed:
111306c3fb27SDimitry Andric   return llvm::is_contained(I->getSecond().DelayedVariableLengthDecls, VD);
111406c3fb27SDimitry Andric }
111506c3fb27SDimitry Andric 
111606c3fb27SDimitry Andric std::pair<llvm::Value *, llvm::Value *>
111706c3fb27SDimitry Andric CGOpenMPRuntimeGPU::getKmpcAllocShared(CodeGenFunction &CGF,
111806c3fb27SDimitry Andric                                        const VarDecl *VD) {
111906c3fb27SDimitry Andric   CGBuilderTy &Bld = CGF.Builder;
112006c3fb27SDimitry Andric 
112106c3fb27SDimitry Andric   // Compute size and alignment.
1122e8d8bef9SDimitry Andric   llvm::Value *Size = CGF.getTypeSize(VD->getType());
1123e8d8bef9SDimitry Andric   CharUnits Align = CGM.getContext().getDeclAlign(VD);
1124e8d8bef9SDimitry Andric   Size = Bld.CreateNUWAdd(
1125e8d8bef9SDimitry Andric       Size, llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity() - 1));
1126e8d8bef9SDimitry Andric   llvm::Value *AlignVal =
1127e8d8bef9SDimitry Andric       llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity());
1128e8d8bef9SDimitry Andric   Size = Bld.CreateUDiv(Size, AlignVal);
1129e8d8bef9SDimitry Andric   Size = Bld.CreateNUWMul(Size, AlignVal);
1130fe6060f1SDimitry Andric 
1131fe6060f1SDimitry Andric   // Allocate space for this VLA object to be globalized.
113206c3fb27SDimitry Andric   llvm::Value *AllocArgs[] = {Size};
113304eeddc0SDimitry Andric   llvm::CallBase *VoidPtr =
1134fe6060f1SDimitry Andric       CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1135fe6060f1SDimitry Andric                               CGM.getModule(), OMPRTL___kmpc_alloc_shared),
1136fe6060f1SDimitry Andric                           AllocArgs, VD->getName());
113706c3fb27SDimitry Andric   VoidPtr->addRetAttr(llvm::Attribute::get(
113806c3fb27SDimitry Andric       CGM.getLLVMContext(), llvm::Attribute::Alignment, Align.getQuantity()));
1139fe6060f1SDimitry Andric 
114006c3fb27SDimitry Andric   return std::make_pair(VoidPtr, Size);
1141e8d8bef9SDimitry Andric }
114206c3fb27SDimitry Andric 
114306c3fb27SDimitry Andric void CGOpenMPRuntimeGPU::getKmpcFreeShared(
114406c3fb27SDimitry Andric     CodeGenFunction &CGF,
114506c3fb27SDimitry Andric     const std::pair<llvm::Value *, llvm::Value *> &AddrSizePair) {
114606c3fb27SDimitry Andric   // Deallocate the memory for each globalized VLA object
114706c3fb27SDimitry Andric   CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
114806c3fb27SDimitry Andric                           CGM.getModule(), OMPRTL___kmpc_free_shared),
114906c3fb27SDimitry Andric                       {AddrSizePair.first, AddrSizePair.second});
1150e8d8bef9SDimitry Andric }
1151e8d8bef9SDimitry Andric 
11525f757f3fSDimitry Andric void CGOpenMPRuntimeGPU::emitGenericVarsEpilog(CodeGenFunction &CGF) {
11535f757f3fSDimitry Andric   if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic)
1154e8d8bef9SDimitry Andric     return;
1155e8d8bef9SDimitry Andric 
1156e8d8bef9SDimitry Andric   const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
1157e8d8bef9SDimitry Andric   if (I != FunctionGlobalizedDecls.end()) {
115806c3fb27SDimitry Andric     // Deallocate the memory for each globalized VLA object that was
115906c3fb27SDimitry Andric     // globalized in the prolog (i.e. emitGenericVarsProlog).
116006c3fb27SDimitry Andric     for (const auto &AddrSizePair :
1161e8d8bef9SDimitry Andric          llvm::reverse(I->getSecond().EscapedVariableLengthDeclsAddrs)) {
1162fe6060f1SDimitry Andric       CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1163fe6060f1SDimitry Andric                               CGM.getModule(), OMPRTL___kmpc_free_shared),
1164fe6060f1SDimitry Andric                           {AddrSizePair.first, AddrSizePair.second});
1165e8d8bef9SDimitry Andric     }
1166fe6060f1SDimitry Andric     // Deallocate the memory for each globalized value
1167fe6060f1SDimitry Andric     for (auto &Rec : llvm::reverse(I->getSecond().LocalVarData)) {
1168fe6060f1SDimitry Andric       const auto *VD = cast<VarDecl>(Rec.first);
1169fe6060f1SDimitry Andric       I->getSecond().MappedParams->restore(CGF);
1170fe6060f1SDimitry Andric 
1171fe6060f1SDimitry Andric       llvm::Value *FreeArgs[] = {Rec.second.GlobalizedVal,
1172fe6060f1SDimitry Andric                                  CGF.getTypeSize(VD->getType())};
1173fe6060f1SDimitry Andric       CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1174fe6060f1SDimitry Andric                               CGM.getModule(), OMPRTL___kmpc_free_shared),
1175fe6060f1SDimitry Andric                           FreeArgs);
1176e8d8bef9SDimitry Andric     }
1177e8d8bef9SDimitry Andric   }
1178e8d8bef9SDimitry Andric }
1179e8d8bef9SDimitry Andric 
1180e8d8bef9SDimitry Andric void CGOpenMPRuntimeGPU::emitTeamsCall(CodeGenFunction &CGF,
1181e8d8bef9SDimitry Andric                                          const OMPExecutableDirective &D,
1182e8d8bef9SDimitry Andric                                          SourceLocation Loc,
1183e8d8bef9SDimitry Andric                                          llvm::Function *OutlinedFn,
1184e8d8bef9SDimitry Andric                                          ArrayRef<llvm::Value *> CapturedVars) {
1185e8d8bef9SDimitry Andric   if (!CGF.HaveInsertPoint())
1186e8d8bef9SDimitry Andric     return;
1187e8d8bef9SDimitry Andric 
11885f757f3fSDimitry Andric   bool IsBareKernel = D.getSingleClause<OMPXBareClause>();
11895f757f3fSDimitry Andric 
11900fca6ea1SDimitry Andric   RawAddress ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
1191e8d8bef9SDimitry Andric                                                          /*Name=*/".zero.addr");
1192349cc55cSDimitry Andric   CGF.Builder.CreateStore(CGF.Builder.getInt32(/*C*/ 0), ZeroAddr);
1193e8d8bef9SDimitry Andric   llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
11945f757f3fSDimitry Andric   // We don't emit any thread id function call in bare kernel, but because the
11955f757f3fSDimitry Andric   // outlined function has a pointer argument, we emit a nullptr here.
11965f757f3fSDimitry Andric   if (IsBareKernel)
11975f757f3fSDimitry Andric     OutlinedFnArgs.push_back(llvm::ConstantPointerNull::get(CGM.VoidPtrTy));
11985f757f3fSDimitry Andric   else
11990fca6ea1SDimitry Andric     OutlinedFnArgs.push_back(emitThreadIDAddress(CGF, Loc).emitRawPointer(CGF));
1200e8d8bef9SDimitry Andric   OutlinedFnArgs.push_back(ZeroAddr.getPointer());
1201e8d8bef9SDimitry Andric   OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
1202e8d8bef9SDimitry Andric   emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
1203e8d8bef9SDimitry Andric }
1204e8d8bef9SDimitry Andric 
1205fe6060f1SDimitry Andric void CGOpenMPRuntimeGPU::emitParallelCall(CodeGenFunction &CGF,
1206fe6060f1SDimitry Andric                                           SourceLocation Loc,
1207fe6060f1SDimitry Andric                                           llvm::Function *OutlinedFn,
1208fe6060f1SDimitry Andric                                           ArrayRef<llvm::Value *> CapturedVars,
12090eae32dcSDimitry Andric                                           const Expr *IfCond,
12100eae32dcSDimitry Andric                                           llvm::Value *NumThreads) {
1211e8d8bef9SDimitry Andric   if (!CGF.HaveInsertPoint())
1212e8d8bef9SDimitry Andric     return;
1213e8d8bef9SDimitry Andric 
12140eae32dcSDimitry Andric   auto &&ParallelGen = [this, Loc, OutlinedFn, CapturedVars, IfCond,
12150eae32dcSDimitry Andric                         NumThreads](CodeGenFunction &CGF,
12160eae32dcSDimitry Andric                                     PrePostActionTy &Action) {
1217e8d8bef9SDimitry Andric     CGBuilderTy &Bld = CGF.Builder;
12180eae32dcSDimitry Andric     llvm::Value *NumThreadsVal = NumThreads;
1219fe6060f1SDimitry Andric     llvm::Function *WFn = WrapperFunctionsMap[OutlinedFn];
1220fe6060f1SDimitry Andric     llvm::Value *ID = llvm::ConstantPointerNull::get(CGM.Int8PtrTy);
1221fe6060f1SDimitry Andric     if (WFn)
1222fe6060f1SDimitry Andric       ID = Bld.CreateBitOrPointerCast(WFn, CGM.Int8PtrTy);
1223fe6060f1SDimitry Andric     llvm::Value *FnPtr = Bld.CreateBitOrPointerCast(OutlinedFn, CGM.Int8PtrTy);
1224e8d8bef9SDimitry Andric 
1225e8d8bef9SDimitry Andric     // Create a private scope that will globalize the arguments
1226e8d8bef9SDimitry Andric     // passed from the outside of the target region.
1227fe6060f1SDimitry Andric     // TODO: Is that needed?
1228e8d8bef9SDimitry Andric     CodeGenFunction::OMPPrivateScope PrivateArgScope(CGF);
1229e8d8bef9SDimitry Andric 
1230fe6060f1SDimitry Andric     Address CapturedVarsAddrs = CGF.CreateDefaultAlignTempAlloca(
1231fe6060f1SDimitry Andric         llvm::ArrayType::get(CGM.VoidPtrTy, CapturedVars.size()),
1232fe6060f1SDimitry Andric         "captured_vars_addrs");
1233e8d8bef9SDimitry Andric     // There's something to share.
1234e8d8bef9SDimitry Andric     if (!CapturedVars.empty()) {
1235e8d8bef9SDimitry Andric       // Prepare for parallel region. Indicate the outlined function.
1236e8d8bef9SDimitry Andric       ASTContext &Ctx = CGF.getContext();
1237fe6060f1SDimitry Andric       unsigned Idx = 0;
1238e8d8bef9SDimitry Andric       for (llvm::Value *V : CapturedVars) {
1239fe6060f1SDimitry Andric         Address Dst = Bld.CreateConstArrayGEP(CapturedVarsAddrs, Idx);
1240e8d8bef9SDimitry Andric         llvm::Value *PtrV;
1241e8d8bef9SDimitry Andric         if (V->getType()->isIntegerTy())
1242e8d8bef9SDimitry Andric           PtrV = Bld.CreateIntToPtr(V, CGF.VoidPtrTy);
1243e8d8bef9SDimitry Andric         else
1244e8d8bef9SDimitry Andric           PtrV = Bld.CreatePointerBitCastOrAddrSpaceCast(V, CGF.VoidPtrTy);
1245e8d8bef9SDimitry Andric         CGF.EmitStoreOfScalar(PtrV, Dst, /*Volatile=*/false,
1246e8d8bef9SDimitry Andric                               Ctx.getPointerType(Ctx.VoidPtrTy));
1247e8d8bef9SDimitry Andric         ++Idx;
1248e8d8bef9SDimitry Andric       }
1249e8d8bef9SDimitry Andric     }
1250e8d8bef9SDimitry Andric 
1251fe6060f1SDimitry Andric     llvm::Value *IfCondVal = nullptr;
1252fe6060f1SDimitry Andric     if (IfCond)
1253fe6060f1SDimitry Andric       IfCondVal = Bld.CreateIntCast(CGF.EvaluateExprAsBool(IfCond), CGF.Int32Ty,
1254fe6060f1SDimitry Andric                                     /* isSigned */ false);
1255fe6060f1SDimitry Andric     else
1256fe6060f1SDimitry Andric       IfCondVal = llvm::ConstantInt::get(CGF.Int32Ty, 1);
1257e8d8bef9SDimitry Andric 
12580eae32dcSDimitry Andric     if (!NumThreadsVal)
12590eae32dcSDimitry Andric       NumThreadsVal = llvm::ConstantInt::get(CGF.Int32Ty, -1);
12600eae32dcSDimitry Andric     else
12610eae32dcSDimitry Andric       NumThreadsVal = Bld.CreateZExtOrTrunc(NumThreadsVal, CGF.Int32Ty),
12620eae32dcSDimitry Andric 
1263fe6060f1SDimitry Andric       assert(IfCondVal && "Expected a value");
1264fe6060f1SDimitry Andric     llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
1265fe6060f1SDimitry Andric     llvm::Value *Args[] = {
1266fe6060f1SDimitry Andric         RTLoc,
1267fe6060f1SDimitry Andric         getThreadID(CGF, Loc),
1268fe6060f1SDimitry Andric         IfCondVal,
12690eae32dcSDimitry Andric         NumThreadsVal,
1270fe6060f1SDimitry Andric         llvm::ConstantInt::get(CGF.Int32Ty, -1),
1271fe6060f1SDimitry Andric         FnPtr,
1272fe6060f1SDimitry Andric         ID,
12730fca6ea1SDimitry Andric         Bld.CreateBitOrPointerCast(CapturedVarsAddrs.emitRawPointer(CGF),
1274fe6060f1SDimitry Andric                                    CGF.VoidPtrPtrTy),
1275fe6060f1SDimitry Andric         llvm::ConstantInt::get(CGM.SizeTy, CapturedVars.size())};
1276e8d8bef9SDimitry Andric     CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1277fe6060f1SDimitry Andric                             CGM.getModule(), OMPRTL___kmpc_parallel_51),
1278e8d8bef9SDimitry Andric                         Args);
1279e8d8bef9SDimitry Andric   };
1280e8d8bef9SDimitry Andric 
1281fe6060f1SDimitry Andric   RegionCodeGenTy RCG(ParallelGen);
1282e8d8bef9SDimitry Andric   RCG(CGF);
1283e8d8bef9SDimitry Andric }
1284e8d8bef9SDimitry Andric 
1285e8d8bef9SDimitry Andric void CGOpenMPRuntimeGPU::syncCTAThreads(CodeGenFunction &CGF) {
1286e8d8bef9SDimitry Andric   // Always emit simple barriers!
1287e8d8bef9SDimitry Andric   if (!CGF.HaveInsertPoint())
1288e8d8bef9SDimitry Andric     return;
1289e8d8bef9SDimitry Andric   // Build call __kmpc_barrier_simple_spmd(nullptr, 0);
1290e8d8bef9SDimitry Andric   // This function does not use parameters, so we can emit just default values.
1291e8d8bef9SDimitry Andric   llvm::Value *Args[] = {
1292e8d8bef9SDimitry Andric       llvm::ConstantPointerNull::get(
1293e8d8bef9SDimitry Andric           cast<llvm::PointerType>(getIdentTyPointerTy())),
1294e8d8bef9SDimitry Andric       llvm::ConstantInt::get(CGF.Int32Ty, /*V=*/0, /*isSigned=*/true)};
1295e8d8bef9SDimitry Andric   CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1296e8d8bef9SDimitry Andric                           CGM.getModule(), OMPRTL___kmpc_barrier_simple_spmd),
1297e8d8bef9SDimitry Andric                       Args);
1298e8d8bef9SDimitry Andric }
1299e8d8bef9SDimitry Andric 
1300e8d8bef9SDimitry Andric void CGOpenMPRuntimeGPU::emitBarrierCall(CodeGenFunction &CGF,
1301e8d8bef9SDimitry Andric                                            SourceLocation Loc,
1302e8d8bef9SDimitry Andric                                            OpenMPDirectiveKind Kind, bool,
1303e8d8bef9SDimitry Andric                                            bool) {
1304e8d8bef9SDimitry Andric   // Always emit simple barriers!
1305e8d8bef9SDimitry Andric   if (!CGF.HaveInsertPoint())
1306e8d8bef9SDimitry Andric     return;
1307e8d8bef9SDimitry Andric   // Build call __kmpc_cancel_barrier(loc, thread_id);
1308e8d8bef9SDimitry Andric   unsigned Flags = getDefaultFlagsForBarriers(Kind);
1309e8d8bef9SDimitry Andric   llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc, Flags),
1310e8d8bef9SDimitry Andric                          getThreadID(CGF, Loc)};
1311e8d8bef9SDimitry Andric 
1312e8d8bef9SDimitry Andric   CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1313e8d8bef9SDimitry Andric                           CGM.getModule(), OMPRTL___kmpc_barrier),
1314e8d8bef9SDimitry Andric                       Args);
1315e8d8bef9SDimitry Andric }
1316e8d8bef9SDimitry Andric 
1317e8d8bef9SDimitry Andric void CGOpenMPRuntimeGPU::emitCriticalRegion(
1318e8d8bef9SDimitry Andric     CodeGenFunction &CGF, StringRef CriticalName,
1319e8d8bef9SDimitry Andric     const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc,
1320e8d8bef9SDimitry Andric     const Expr *Hint) {
1321e8d8bef9SDimitry Andric   llvm::BasicBlock *LoopBB = CGF.createBasicBlock("omp.critical.loop");
1322e8d8bef9SDimitry Andric   llvm::BasicBlock *TestBB = CGF.createBasicBlock("omp.critical.test");
1323e8d8bef9SDimitry Andric   llvm::BasicBlock *SyncBB = CGF.createBasicBlock("omp.critical.sync");
1324e8d8bef9SDimitry Andric   llvm::BasicBlock *BodyBB = CGF.createBasicBlock("omp.critical.body");
1325e8d8bef9SDimitry Andric   llvm::BasicBlock *ExitBB = CGF.createBasicBlock("omp.critical.exit");
1326e8d8bef9SDimitry Andric 
1327e8d8bef9SDimitry Andric   auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
1328e8d8bef9SDimitry Andric 
1329e8d8bef9SDimitry Andric   // Get the mask of active threads in the warp.
1330e8d8bef9SDimitry Andric   llvm::Value *Mask = CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1331e8d8bef9SDimitry Andric       CGM.getModule(), OMPRTL___kmpc_warp_active_thread_mask));
1332e8d8bef9SDimitry Andric   // Fetch team-local id of the thread.
1333e8d8bef9SDimitry Andric   llvm::Value *ThreadID = RT.getGPUThreadID(CGF);
1334e8d8bef9SDimitry Andric 
1335e8d8bef9SDimitry Andric   // Get the width of the team.
1336e8d8bef9SDimitry Andric   llvm::Value *TeamWidth = RT.getGPUNumThreads(CGF);
1337e8d8bef9SDimitry Andric 
1338e8d8bef9SDimitry Andric   // Initialize the counter variable for the loop.
1339e8d8bef9SDimitry Andric   QualType Int32Ty =
1340e8d8bef9SDimitry Andric       CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/0);
1341e8d8bef9SDimitry Andric   Address Counter = CGF.CreateMemTemp(Int32Ty, "critical_counter");
1342e8d8bef9SDimitry Andric   LValue CounterLVal = CGF.MakeAddrLValue(Counter, Int32Ty);
1343e8d8bef9SDimitry Andric   CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.Int32Ty), CounterLVal,
1344e8d8bef9SDimitry Andric                         /*isInit=*/true);
1345e8d8bef9SDimitry Andric 
1346e8d8bef9SDimitry Andric   // Block checks if loop counter exceeds upper bound.
1347e8d8bef9SDimitry Andric   CGF.EmitBlock(LoopBB);
1348e8d8bef9SDimitry Andric   llvm::Value *CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
1349e8d8bef9SDimitry Andric   llvm::Value *CmpLoopBound = CGF.Builder.CreateICmpSLT(CounterVal, TeamWidth);
1350e8d8bef9SDimitry Andric   CGF.Builder.CreateCondBr(CmpLoopBound, TestBB, ExitBB);
1351e8d8bef9SDimitry Andric 
1352e8d8bef9SDimitry Andric   // Block tests which single thread should execute region, and which threads
1353e8d8bef9SDimitry Andric   // should go straight to synchronisation point.
1354e8d8bef9SDimitry Andric   CGF.EmitBlock(TestBB);
1355e8d8bef9SDimitry Andric   CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
1356e8d8bef9SDimitry Andric   llvm::Value *CmpThreadToCounter =
1357e8d8bef9SDimitry Andric       CGF.Builder.CreateICmpEQ(ThreadID, CounterVal);
1358e8d8bef9SDimitry Andric   CGF.Builder.CreateCondBr(CmpThreadToCounter, BodyBB, SyncBB);
1359e8d8bef9SDimitry Andric 
1360e8d8bef9SDimitry Andric   // Block emits the body of the critical region.
1361e8d8bef9SDimitry Andric   CGF.EmitBlock(BodyBB);
1362e8d8bef9SDimitry Andric 
1363e8d8bef9SDimitry Andric   // Output the critical statement.
1364e8d8bef9SDimitry Andric   CGOpenMPRuntime::emitCriticalRegion(CGF, CriticalName, CriticalOpGen, Loc,
1365e8d8bef9SDimitry Andric                                       Hint);
1366e8d8bef9SDimitry Andric 
1367e8d8bef9SDimitry Andric   // After the body surrounded by the critical region, the single executing
1368e8d8bef9SDimitry Andric   // thread will jump to the synchronisation point.
1369e8d8bef9SDimitry Andric   // Block waits for all threads in current team to finish then increments the
1370e8d8bef9SDimitry Andric   // counter variable and returns to the loop.
1371e8d8bef9SDimitry Andric   CGF.EmitBlock(SyncBB);
1372e8d8bef9SDimitry Andric   // Reconverge active threads in the warp.
1373e8d8bef9SDimitry Andric   (void)CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1374e8d8bef9SDimitry Andric                                 CGM.getModule(), OMPRTL___kmpc_syncwarp),
1375e8d8bef9SDimitry Andric                             Mask);
1376e8d8bef9SDimitry Andric 
1377e8d8bef9SDimitry Andric   llvm::Value *IncCounterVal =
1378e8d8bef9SDimitry Andric       CGF.Builder.CreateNSWAdd(CounterVal, CGF.Builder.getInt32(1));
1379e8d8bef9SDimitry Andric   CGF.EmitStoreOfScalar(IncCounterVal, CounterLVal);
1380e8d8bef9SDimitry Andric   CGF.EmitBranch(LoopBB);
1381e8d8bef9SDimitry Andric 
1382e8d8bef9SDimitry Andric   // Block that is reached when  all threads in the team complete the region.
1383e8d8bef9SDimitry Andric   CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
1384e8d8bef9SDimitry Andric }
1385e8d8bef9SDimitry Andric 
1386e8d8bef9SDimitry Andric /// Cast value to the specified type.
1387e8d8bef9SDimitry Andric static llvm::Value *castValueToType(CodeGenFunction &CGF, llvm::Value *Val,
1388e8d8bef9SDimitry Andric                                     QualType ValTy, QualType CastTy,
1389e8d8bef9SDimitry Andric                                     SourceLocation Loc) {
1390e8d8bef9SDimitry Andric   assert(!CGF.getContext().getTypeSizeInChars(CastTy).isZero() &&
1391e8d8bef9SDimitry Andric          "Cast type must sized.");
1392e8d8bef9SDimitry Andric   assert(!CGF.getContext().getTypeSizeInChars(ValTy).isZero() &&
1393e8d8bef9SDimitry Andric          "Val type must sized.");
1394e8d8bef9SDimitry Andric   llvm::Type *LLVMCastTy = CGF.ConvertTypeForMem(CastTy);
1395e8d8bef9SDimitry Andric   if (ValTy == CastTy)
1396e8d8bef9SDimitry Andric     return Val;
1397e8d8bef9SDimitry Andric   if (CGF.getContext().getTypeSizeInChars(ValTy) ==
1398e8d8bef9SDimitry Andric       CGF.getContext().getTypeSizeInChars(CastTy))
1399e8d8bef9SDimitry Andric     return CGF.Builder.CreateBitCast(Val, LLVMCastTy);
1400e8d8bef9SDimitry Andric   if (CastTy->isIntegerType() && ValTy->isIntegerType())
1401e8d8bef9SDimitry Andric     return CGF.Builder.CreateIntCast(Val, LLVMCastTy,
1402e8d8bef9SDimitry Andric                                      CastTy->hasSignedIntegerRepresentation());
1403e8d8bef9SDimitry Andric   Address CastItem = CGF.CreateMemTemp(CastTy);
14045f757f3fSDimitry Andric   Address ValCastItem = CastItem.withElementType(Val->getType());
1405e8d8bef9SDimitry Andric   CGF.EmitStoreOfScalar(Val, ValCastItem, /*Volatile=*/false, ValTy,
1406e8d8bef9SDimitry Andric                         LValueBaseInfo(AlignmentSource::Type),
1407e8d8bef9SDimitry Andric                         TBAAAccessInfo());
1408e8d8bef9SDimitry Andric   return CGF.EmitLoadOfScalar(CastItem, /*Volatile=*/false, CastTy, Loc,
1409e8d8bef9SDimitry Andric                               LValueBaseInfo(AlignmentSource::Type),
1410e8d8bef9SDimitry Andric                               TBAAAccessInfo());
1411e8d8bef9SDimitry Andric }
1412e8d8bef9SDimitry Andric 
1413e8d8bef9SDimitry Andric ///
1414e8d8bef9SDimitry Andric /// Design of OpenMP reductions on the GPU
1415e8d8bef9SDimitry Andric ///
1416e8d8bef9SDimitry Andric /// Consider a typical OpenMP program with one or more reduction
1417e8d8bef9SDimitry Andric /// clauses:
1418e8d8bef9SDimitry Andric ///
1419e8d8bef9SDimitry Andric /// float foo;
1420e8d8bef9SDimitry Andric /// double bar;
1421e8d8bef9SDimitry Andric /// #pragma omp target teams distribute parallel for \
1422e8d8bef9SDimitry Andric ///             reduction(+:foo) reduction(*:bar)
1423e8d8bef9SDimitry Andric /// for (int i = 0; i < N; i++) {
1424e8d8bef9SDimitry Andric ///   foo += A[i]; bar *= B[i];
1425e8d8bef9SDimitry Andric /// }
1426e8d8bef9SDimitry Andric ///
1427e8d8bef9SDimitry Andric /// where 'foo' and 'bar' are reduced across all OpenMP threads in
1428e8d8bef9SDimitry Andric /// all teams.  In our OpenMP implementation on the NVPTX device an
1429e8d8bef9SDimitry Andric /// OpenMP team is mapped to a CUDA threadblock and OpenMP threads
1430e8d8bef9SDimitry Andric /// within a team are mapped to CUDA threads within a threadblock.
1431e8d8bef9SDimitry Andric /// Our goal is to efficiently aggregate values across all OpenMP
1432e8d8bef9SDimitry Andric /// threads such that:
1433e8d8bef9SDimitry Andric ///
1434e8d8bef9SDimitry Andric ///   - the compiler and runtime are logically concise, and
1435e8d8bef9SDimitry Andric ///   - the reduction is performed efficiently in a hierarchical
1436e8d8bef9SDimitry Andric ///     manner as follows: within OpenMP threads in the same warp,
1437e8d8bef9SDimitry Andric ///     across warps in a threadblock, and finally across teams on
1438e8d8bef9SDimitry Andric ///     the NVPTX device.
1439e8d8bef9SDimitry Andric ///
1440e8d8bef9SDimitry Andric /// Introduction to Decoupling
1441e8d8bef9SDimitry Andric ///
1442e8d8bef9SDimitry Andric /// We would like to decouple the compiler and the runtime so that the
1443e8d8bef9SDimitry Andric /// latter is ignorant of the reduction variables (number, data types)
1444e8d8bef9SDimitry Andric /// and the reduction operators.  This allows a simpler interface
1445e8d8bef9SDimitry Andric /// and implementation while still attaining good performance.
1446e8d8bef9SDimitry Andric ///
1447e8d8bef9SDimitry Andric /// Pseudocode for the aforementioned OpenMP program generated by the
1448e8d8bef9SDimitry Andric /// compiler is as follows:
1449e8d8bef9SDimitry Andric ///
1450e8d8bef9SDimitry Andric /// 1. Create private copies of reduction variables on each OpenMP
1451e8d8bef9SDimitry Andric ///    thread: 'foo_private', 'bar_private'
1452e8d8bef9SDimitry Andric /// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned
1453e8d8bef9SDimitry Andric ///    to it and writes the result in 'foo_private' and 'bar_private'
1454e8d8bef9SDimitry Andric ///    respectively.
1455e8d8bef9SDimitry Andric /// 3. Call the OpenMP runtime on the GPU to reduce within a team
1456e8d8bef9SDimitry Andric ///    and store the result on the team master:
1457e8d8bef9SDimitry Andric ///
1458e8d8bef9SDimitry Andric ///     __kmpc_nvptx_parallel_reduce_nowait_v2(...,
1459e8d8bef9SDimitry Andric ///        reduceData, shuffleReduceFn, interWarpCpyFn)
1460e8d8bef9SDimitry Andric ///
1461e8d8bef9SDimitry Andric ///     where:
1462e8d8bef9SDimitry Andric ///       struct ReduceData {
1463e8d8bef9SDimitry Andric ///         double *foo;
1464e8d8bef9SDimitry Andric ///         double *bar;
1465e8d8bef9SDimitry Andric ///       } reduceData
1466e8d8bef9SDimitry Andric ///       reduceData.foo = &foo_private
1467e8d8bef9SDimitry Andric ///       reduceData.bar = &bar_private
1468e8d8bef9SDimitry Andric ///
1469e8d8bef9SDimitry Andric ///     'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two
1470e8d8bef9SDimitry Andric ///     auxiliary functions generated by the compiler that operate on
1471e8d8bef9SDimitry Andric ///     variables of type 'ReduceData'.  They aid the runtime perform
1472e8d8bef9SDimitry Andric ///     algorithmic steps in a data agnostic manner.
1473e8d8bef9SDimitry Andric ///
1474e8d8bef9SDimitry Andric ///     'shuffleReduceFn' is a pointer to a function that reduces data
1475e8d8bef9SDimitry Andric ///     of type 'ReduceData' across two OpenMP threads (lanes) in the
1476e8d8bef9SDimitry Andric ///     same warp.  It takes the following arguments as input:
1477e8d8bef9SDimitry Andric ///
1478e8d8bef9SDimitry Andric ///     a. variable of type 'ReduceData' on the calling lane,
1479e8d8bef9SDimitry Andric ///     b. its lane_id,
1480e8d8bef9SDimitry Andric ///     c. an offset relative to the current lane_id to generate a
1481e8d8bef9SDimitry Andric ///        remote_lane_id.  The remote lane contains the second
1482e8d8bef9SDimitry Andric ///        variable of type 'ReduceData' that is to be reduced.
1483e8d8bef9SDimitry Andric ///     d. an algorithm version parameter determining which reduction
1484e8d8bef9SDimitry Andric ///        algorithm to use.
1485e8d8bef9SDimitry Andric ///
1486e8d8bef9SDimitry Andric ///     'shuffleReduceFn' retrieves data from the remote lane using
1487e8d8bef9SDimitry Andric ///     efficient GPU shuffle intrinsics and reduces, using the
1488e8d8bef9SDimitry Andric ///     algorithm specified by the 4th parameter, the two operands
1489e8d8bef9SDimitry Andric ///     element-wise.  The result is written to the first operand.
1490e8d8bef9SDimitry Andric ///
1491e8d8bef9SDimitry Andric ///     Different reduction algorithms are implemented in different
1492e8d8bef9SDimitry Andric ///     runtime functions, all calling 'shuffleReduceFn' to perform
1493e8d8bef9SDimitry Andric ///     the essential reduction step.  Therefore, based on the 4th
1494e8d8bef9SDimitry Andric ///     parameter, this function behaves slightly differently to
1495e8d8bef9SDimitry Andric ///     cooperate with the runtime to ensure correctness under
1496e8d8bef9SDimitry Andric ///     different circumstances.
1497e8d8bef9SDimitry Andric ///
1498e8d8bef9SDimitry Andric ///     'InterWarpCpyFn' is a pointer to a function that transfers
1499e8d8bef9SDimitry Andric ///     reduced variables across warps.  It tunnels, through CUDA
1500e8d8bef9SDimitry Andric ///     shared memory, the thread-private data of type 'ReduceData'
1501e8d8bef9SDimitry Andric ///     from lane 0 of each warp to a lane in the first warp.
1502e8d8bef9SDimitry Andric /// 4. Call the OpenMP runtime on the GPU to reduce across teams.
1503e8d8bef9SDimitry Andric ///    The last team writes the global reduced value to memory.
1504e8d8bef9SDimitry Andric ///
1505e8d8bef9SDimitry Andric ///     ret = __kmpc_nvptx_teams_reduce_nowait(...,
1506e8d8bef9SDimitry Andric ///             reduceData, shuffleReduceFn, interWarpCpyFn,
1507e8d8bef9SDimitry Andric ///             scratchpadCopyFn, loadAndReduceFn)
1508e8d8bef9SDimitry Andric ///
1509e8d8bef9SDimitry Andric ///     'scratchpadCopyFn' is a helper that stores reduced
1510e8d8bef9SDimitry Andric ///     data from the team master to a scratchpad array in
1511e8d8bef9SDimitry Andric ///     global memory.
1512e8d8bef9SDimitry Andric ///
1513e8d8bef9SDimitry Andric ///     'loadAndReduceFn' is a helper that loads data from
1514e8d8bef9SDimitry Andric ///     the scratchpad array and reduces it with the input
1515e8d8bef9SDimitry Andric ///     operand.
1516e8d8bef9SDimitry Andric ///
1517e8d8bef9SDimitry Andric ///     These compiler generated functions hide address
1518e8d8bef9SDimitry Andric ///     calculation and alignment information from the runtime.
1519e8d8bef9SDimitry Andric /// 5. if ret == 1:
1520e8d8bef9SDimitry Andric ///     The team master of the last team stores the reduced
1521e8d8bef9SDimitry Andric ///     result to the globals in memory.
1522e8d8bef9SDimitry Andric ///     foo += reduceData.foo; bar *= reduceData.bar
1523e8d8bef9SDimitry Andric ///
1524e8d8bef9SDimitry Andric ///
1525e8d8bef9SDimitry Andric /// Warp Reduction Algorithms
1526e8d8bef9SDimitry Andric ///
1527e8d8bef9SDimitry Andric /// On the warp level, we have three algorithms implemented in the
1528e8d8bef9SDimitry Andric /// OpenMP runtime depending on the number of active lanes:
1529e8d8bef9SDimitry Andric ///
1530e8d8bef9SDimitry Andric /// Full Warp Reduction
1531e8d8bef9SDimitry Andric ///
1532e8d8bef9SDimitry Andric /// The reduce algorithm within a warp where all lanes are active
1533e8d8bef9SDimitry Andric /// is implemented in the runtime as follows:
1534e8d8bef9SDimitry Andric ///
1535e8d8bef9SDimitry Andric /// full_warp_reduce(void *reduce_data,
1536e8d8bef9SDimitry Andric ///                  kmp_ShuffleReductFctPtr ShuffleReduceFn) {
1537e8d8bef9SDimitry Andric ///   for (int offset = WARPSIZE/2; offset > 0; offset /= 2)
1538e8d8bef9SDimitry Andric ///     ShuffleReduceFn(reduce_data, 0, offset, 0);
1539e8d8bef9SDimitry Andric /// }
1540e8d8bef9SDimitry Andric ///
1541e8d8bef9SDimitry Andric /// The algorithm completes in log(2, WARPSIZE) steps.
1542e8d8bef9SDimitry Andric ///
1543e8d8bef9SDimitry Andric /// 'ShuffleReduceFn' is used here with lane_id set to 0 because it is
1544e8d8bef9SDimitry Andric /// not used therefore we save instructions by not retrieving lane_id
1545e8d8bef9SDimitry Andric /// from the corresponding special registers.  The 4th parameter, which
1546e8d8bef9SDimitry Andric /// represents the version of the algorithm being used, is set to 0 to
1547e8d8bef9SDimitry Andric /// signify full warp reduction.
1548e8d8bef9SDimitry Andric ///
1549e8d8bef9SDimitry Andric /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
1550e8d8bef9SDimitry Andric ///
1551e8d8bef9SDimitry Andric /// #reduce_elem refers to an element in the local lane's data structure
1552e8d8bef9SDimitry Andric /// #remote_elem is retrieved from a remote lane
1553e8d8bef9SDimitry Andric /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
1554e8d8bef9SDimitry Andric /// reduce_elem = reduce_elem REDUCE_OP remote_elem;
1555e8d8bef9SDimitry Andric ///
1556e8d8bef9SDimitry Andric /// Contiguous Partial Warp Reduction
1557e8d8bef9SDimitry Andric ///
1558e8d8bef9SDimitry Andric /// This reduce algorithm is used within a warp where only the first
1559e8d8bef9SDimitry Andric /// 'n' (n <= WARPSIZE) lanes are active.  It is typically used when the
1560e8d8bef9SDimitry Andric /// number of OpenMP threads in a parallel region is not a multiple of
1561e8d8bef9SDimitry Andric /// WARPSIZE.  The algorithm is implemented in the runtime as follows:
1562e8d8bef9SDimitry Andric ///
1563e8d8bef9SDimitry Andric /// void
1564e8d8bef9SDimitry Andric /// contiguous_partial_reduce(void *reduce_data,
1565e8d8bef9SDimitry Andric ///                           kmp_ShuffleReductFctPtr ShuffleReduceFn,
1566e8d8bef9SDimitry Andric ///                           int size, int lane_id) {
1567e8d8bef9SDimitry Andric ///   int curr_size;
1568e8d8bef9SDimitry Andric ///   int offset;
1569e8d8bef9SDimitry Andric ///   curr_size = size;
1570e8d8bef9SDimitry Andric ///   mask = curr_size/2;
1571e8d8bef9SDimitry Andric ///   while (offset>0) {
1572e8d8bef9SDimitry Andric ///     ShuffleReduceFn(reduce_data, lane_id, offset, 1);
1573e8d8bef9SDimitry Andric ///     curr_size = (curr_size+1)/2;
1574e8d8bef9SDimitry Andric ///     offset = curr_size/2;
1575e8d8bef9SDimitry Andric ///   }
1576e8d8bef9SDimitry Andric /// }
1577e8d8bef9SDimitry Andric ///
1578e8d8bef9SDimitry Andric /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
1579e8d8bef9SDimitry Andric ///
1580e8d8bef9SDimitry Andric /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
1581e8d8bef9SDimitry Andric /// if (lane_id < offset)
1582e8d8bef9SDimitry Andric ///     reduce_elem = reduce_elem REDUCE_OP remote_elem
1583e8d8bef9SDimitry Andric /// else
1584e8d8bef9SDimitry Andric ///     reduce_elem = remote_elem
1585e8d8bef9SDimitry Andric ///
1586e8d8bef9SDimitry Andric /// This algorithm assumes that the data to be reduced are located in a
1587e8d8bef9SDimitry Andric /// contiguous subset of lanes starting from the first.  When there is
1588e8d8bef9SDimitry Andric /// an odd number of active lanes, the data in the last lane is not
1589e8d8bef9SDimitry Andric /// aggregated with any other lane's dat but is instead copied over.
1590e8d8bef9SDimitry Andric ///
1591e8d8bef9SDimitry Andric /// Dispersed Partial Warp Reduction
1592e8d8bef9SDimitry Andric ///
1593e8d8bef9SDimitry Andric /// This algorithm is used within a warp when any discontiguous subset of
1594e8d8bef9SDimitry Andric /// lanes are active.  It is used to implement the reduction operation
1595e8d8bef9SDimitry Andric /// across lanes in an OpenMP simd region or in a nested parallel region.
1596e8d8bef9SDimitry Andric ///
1597e8d8bef9SDimitry Andric /// void
1598e8d8bef9SDimitry Andric /// dispersed_partial_reduce(void *reduce_data,
1599e8d8bef9SDimitry Andric ///                          kmp_ShuffleReductFctPtr ShuffleReduceFn) {
1600e8d8bef9SDimitry Andric ///   int size, remote_id;
1601e8d8bef9SDimitry Andric ///   int logical_lane_id = number_of_active_lanes_before_me() * 2;
1602e8d8bef9SDimitry Andric ///   do {
1603e8d8bef9SDimitry Andric ///       remote_id = next_active_lane_id_right_after_me();
1604e8d8bef9SDimitry Andric ///       # the above function returns 0 of no active lane
1605e8d8bef9SDimitry Andric ///       # is present right after the current lane.
1606e8d8bef9SDimitry Andric ///       size = number_of_active_lanes_in_this_warp();
1607e8d8bef9SDimitry Andric ///       logical_lane_id /= 2;
1608e8d8bef9SDimitry Andric ///       ShuffleReduceFn(reduce_data, logical_lane_id,
1609e8d8bef9SDimitry Andric ///                       remote_id-1-threadIdx.x, 2);
1610e8d8bef9SDimitry Andric ///   } while (logical_lane_id % 2 == 0 && size > 1);
1611e8d8bef9SDimitry Andric /// }
1612e8d8bef9SDimitry Andric ///
1613e8d8bef9SDimitry Andric /// There is no assumption made about the initial state of the reduction.
1614e8d8bef9SDimitry Andric /// Any number of lanes (>=1) could be active at any position.  The reduction
1615e8d8bef9SDimitry Andric /// result is returned in the first active lane.
1616e8d8bef9SDimitry Andric ///
1617e8d8bef9SDimitry Andric /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
1618e8d8bef9SDimitry Andric ///
1619e8d8bef9SDimitry Andric /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
1620e8d8bef9SDimitry Andric /// if (lane_id % 2 == 0 && offset > 0)
1621e8d8bef9SDimitry Andric ///     reduce_elem = reduce_elem REDUCE_OP remote_elem
1622e8d8bef9SDimitry Andric /// else
1623e8d8bef9SDimitry Andric ///     reduce_elem = remote_elem
1624e8d8bef9SDimitry Andric ///
1625e8d8bef9SDimitry Andric ///
1626e8d8bef9SDimitry Andric /// Intra-Team Reduction
1627e8d8bef9SDimitry Andric ///
1628e8d8bef9SDimitry Andric /// This function, as implemented in the runtime call
1629e8d8bef9SDimitry Andric /// '__kmpc_nvptx_parallel_reduce_nowait_v2', aggregates data across OpenMP
1630e8d8bef9SDimitry Andric /// threads in a team.  It first reduces within a warp using the
1631e8d8bef9SDimitry Andric /// aforementioned algorithms.  We then proceed to gather all such
1632e8d8bef9SDimitry Andric /// reduced values at the first warp.
1633e8d8bef9SDimitry Andric ///
1634e8d8bef9SDimitry Andric /// The runtime makes use of the function 'InterWarpCpyFn', which copies
1635e8d8bef9SDimitry Andric /// data from each of the "warp master" (zeroth lane of each warp, where
1636e8d8bef9SDimitry Andric /// warp-reduced data is held) to the zeroth warp.  This step reduces (in
1637e8d8bef9SDimitry Andric /// a mathematical sense) the problem of reduction across warp masters in
1638e8d8bef9SDimitry Andric /// a block to the problem of warp reduction.
1639e8d8bef9SDimitry Andric ///
1640e8d8bef9SDimitry Andric ///
1641e8d8bef9SDimitry Andric /// Inter-Team Reduction
1642e8d8bef9SDimitry Andric ///
1643e8d8bef9SDimitry Andric /// Once a team has reduced its data to a single value, it is stored in
1644e8d8bef9SDimitry Andric /// a global scratchpad array.  Since each team has a distinct slot, this
1645e8d8bef9SDimitry Andric /// can be done without locking.
1646e8d8bef9SDimitry Andric ///
1647e8d8bef9SDimitry Andric /// The last team to write to the scratchpad array proceeds to reduce the
1648e8d8bef9SDimitry Andric /// scratchpad array.  One or more workers in the last team use the helper
1649e8d8bef9SDimitry Andric /// 'loadAndReduceDataFn' to load and reduce values from the array, i.e.,
1650e8d8bef9SDimitry Andric /// the k'th worker reduces every k'th element.
1651e8d8bef9SDimitry Andric ///
1652e8d8bef9SDimitry Andric /// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait_v2' to
1653e8d8bef9SDimitry Andric /// reduce across workers and compute a globally reduced value.
1654e8d8bef9SDimitry Andric ///
1655e8d8bef9SDimitry Andric void CGOpenMPRuntimeGPU::emitReduction(
1656e8d8bef9SDimitry Andric     CodeGenFunction &CGF, SourceLocation Loc, ArrayRef<const Expr *> Privates,
1657e8d8bef9SDimitry Andric     ArrayRef<const Expr *> LHSExprs, ArrayRef<const Expr *> RHSExprs,
1658e8d8bef9SDimitry Andric     ArrayRef<const Expr *> ReductionOps, ReductionOptionsTy Options) {
1659e8d8bef9SDimitry Andric   if (!CGF.HaveInsertPoint())
1660e8d8bef9SDimitry Andric     return;
1661e8d8bef9SDimitry Andric 
1662e8d8bef9SDimitry Andric   bool ParallelReduction = isOpenMPParallelDirective(Options.ReductionKind);
16630fca6ea1SDimitry Andric   bool DistributeReduction = isOpenMPDistributeDirective(Options.ReductionKind);
1664e8d8bef9SDimitry Andric   bool TeamsReduction = isOpenMPTeamsDirective(Options.ReductionKind);
16650fca6ea1SDimitry Andric 
16660fca6ea1SDimitry Andric   ASTContext &C = CGM.getContext();
1667e8d8bef9SDimitry Andric 
1668e8d8bef9SDimitry Andric   if (Options.SimpleReduction) {
1669e8d8bef9SDimitry Andric     assert(!TeamsReduction && !ParallelReduction &&
1670e8d8bef9SDimitry Andric            "Invalid reduction selection in emitReduction.");
16710fca6ea1SDimitry Andric     (void)ParallelReduction;
1672e8d8bef9SDimitry Andric     CGOpenMPRuntime::emitReduction(CGF, Loc, Privates, LHSExprs, RHSExprs,
1673e8d8bef9SDimitry Andric                                    ReductionOps, Options);
1674e8d8bef9SDimitry Andric     return;
1675e8d8bef9SDimitry Andric   }
1676e8d8bef9SDimitry Andric 
16775f757f3fSDimitry Andric   llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> VarFieldMap;
16785f757f3fSDimitry Andric   llvm::SmallVector<const ValueDecl *, 4> PrivatesReductions(Privates.size());
16795f757f3fSDimitry Andric   int Cnt = 0;
16805f757f3fSDimitry Andric   for (const Expr *DRE : Privates) {
16815f757f3fSDimitry Andric     PrivatesReductions[Cnt] = cast<DeclRefExpr>(DRE)->getDecl();
16825f757f3fSDimitry Andric     ++Cnt;
16835f757f3fSDimitry Andric   }
16845f757f3fSDimitry Andric   const RecordDecl *ReductionRec = ::buildRecordForGlobalizedVars(
16855f757f3fSDimitry Andric       CGM.getContext(), PrivatesReductions, std::nullopt, VarFieldMap, 1);
16865f757f3fSDimitry Andric 
16870fca6ea1SDimitry Andric   if (TeamsReduction)
16880fca6ea1SDimitry Andric     TeamsReductions.push_back(ReductionRec);
16890fca6ea1SDimitry Andric 
16900fca6ea1SDimitry Andric   // Source location for the ident struct
1691e8d8bef9SDimitry Andric   llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
1692e8d8bef9SDimitry Andric 
16930fca6ea1SDimitry Andric   using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy;
16940fca6ea1SDimitry Andric   InsertPointTy AllocaIP(CGF.AllocaInsertPt->getParent(),
16950fca6ea1SDimitry Andric                          CGF.AllocaInsertPt->getIterator());
16960fca6ea1SDimitry Andric   InsertPointTy CodeGenIP(CGF.Builder.GetInsertBlock(),
16970fca6ea1SDimitry Andric                           CGF.Builder.GetInsertPoint());
1698*36b606aeSDimitry Andric   llvm::OpenMPIRBuilder::LocationDescription OmpLoc(
1699*36b606aeSDimitry Andric       CodeGenIP, CGF.SourceLocToDebugLoc(Loc));
17000fca6ea1SDimitry Andric   llvm::SmallVector<llvm::OpenMPIRBuilder::ReductionInfo> ReductionInfos;
17010fca6ea1SDimitry Andric 
17020fca6ea1SDimitry Andric   CodeGenFunction::OMPPrivateScope Scope(CGF);
1703e8d8bef9SDimitry Andric   unsigned Idx = 0;
17040fca6ea1SDimitry Andric   for (const Expr *Private : Privates) {
17050fca6ea1SDimitry Andric     llvm::Type *ElementType;
17060fca6ea1SDimitry Andric     llvm::Value *Variable;
17070fca6ea1SDimitry Andric     llvm::Value *PrivateVariable;
17080fca6ea1SDimitry Andric     llvm::OpenMPIRBuilder::ReductionGenAtomicCBTy AtomicReductionGen = nullptr;
17090fca6ea1SDimitry Andric     ElementType = CGF.ConvertTypeForMem(Private->getType());
17100fca6ea1SDimitry Andric     const auto *RHSVar =
17110fca6ea1SDimitry Andric         cast<VarDecl>(cast<DeclRefExpr>(RHSExprs[Idx])->getDecl());
17120fca6ea1SDimitry Andric     PrivateVariable = CGF.GetAddrOfLocalVar(RHSVar).emitRawPointer(CGF);
17130fca6ea1SDimitry Andric     const auto *LHSVar =
17140fca6ea1SDimitry Andric         cast<VarDecl>(cast<DeclRefExpr>(LHSExprs[Idx])->getDecl());
17150fca6ea1SDimitry Andric     Variable = CGF.GetAddrOfLocalVar(LHSVar).emitRawPointer(CGF);
17160fca6ea1SDimitry Andric     llvm::OpenMPIRBuilder::EvalKind EvalKind;
17170fca6ea1SDimitry Andric     switch (CGF.getEvaluationKind(Private->getType())) {
17180fca6ea1SDimitry Andric     case TEK_Scalar:
17190fca6ea1SDimitry Andric       EvalKind = llvm::OpenMPIRBuilder::EvalKind::Scalar;
17200fca6ea1SDimitry Andric       break;
17210fca6ea1SDimitry Andric     case TEK_Complex:
17220fca6ea1SDimitry Andric       EvalKind = llvm::OpenMPIRBuilder::EvalKind::Complex;
17230fca6ea1SDimitry Andric       break;
17240fca6ea1SDimitry Andric     case TEK_Aggregate:
17250fca6ea1SDimitry Andric       EvalKind = llvm::OpenMPIRBuilder::EvalKind::Aggregate;
17260fca6ea1SDimitry Andric       break;
1727e8d8bef9SDimitry Andric     }
17280fca6ea1SDimitry Andric     auto ReductionGen = [&](InsertPointTy CodeGenIP, unsigned I,
17290fca6ea1SDimitry Andric                             llvm::Value **LHSPtr, llvm::Value **RHSPtr,
17300fca6ea1SDimitry Andric                             llvm::Function *NewFunc) {
17310fca6ea1SDimitry Andric       CGF.Builder.restoreIP(CodeGenIP);
17320fca6ea1SDimitry Andric       auto *CurFn = CGF.CurFn;
17330fca6ea1SDimitry Andric       CGF.CurFn = NewFunc;
1734e8d8bef9SDimitry Andric 
17350fca6ea1SDimitry Andric       *LHSPtr = CGF.GetAddrOfLocalVar(
17360fca6ea1SDimitry Andric                        cast<VarDecl>(cast<DeclRefExpr>(LHSExprs[I])->getDecl()))
17370fca6ea1SDimitry Andric                     .emitRawPointer(CGF);
17380fca6ea1SDimitry Andric       *RHSPtr = CGF.GetAddrOfLocalVar(
17390fca6ea1SDimitry Andric                        cast<VarDecl>(cast<DeclRefExpr>(RHSExprs[I])->getDecl()))
17400fca6ea1SDimitry Andric                     .emitRawPointer(CGF);
1741e8d8bef9SDimitry Andric 
17420fca6ea1SDimitry Andric       emitSingleReductionCombiner(CGF, ReductionOps[I], Privates[I],
17430fca6ea1SDimitry Andric                                   cast<DeclRefExpr>(LHSExprs[I]),
17440fca6ea1SDimitry Andric                                   cast<DeclRefExpr>(RHSExprs[I]));
1745e8d8bef9SDimitry Andric 
17460fca6ea1SDimitry Andric       CGF.CurFn = CurFn;
1747e8d8bef9SDimitry Andric 
17480fca6ea1SDimitry Andric       return InsertPointTy(CGF.Builder.GetInsertBlock(),
17490fca6ea1SDimitry Andric                            CGF.Builder.GetInsertPoint());
1750e8d8bef9SDimitry Andric     };
17510fca6ea1SDimitry Andric     ReductionInfos.emplace_back(llvm::OpenMPIRBuilder::ReductionInfo(
17520fca6ea1SDimitry Andric         ElementType, Variable, PrivateVariable, EvalKind,
17530fca6ea1SDimitry Andric         /*ReductionGen=*/nullptr, ReductionGen, AtomicReductionGen));
17540fca6ea1SDimitry Andric     Idx++;
17550fca6ea1SDimitry Andric   }
17560fca6ea1SDimitry Andric 
17570fca6ea1SDimitry Andric   CGF.Builder.restoreIP(OMPBuilder.createReductionsGPU(
17580fca6ea1SDimitry Andric       OmpLoc, AllocaIP, CodeGenIP, ReductionInfos, false, TeamsReduction,
17590fca6ea1SDimitry Andric       DistributeReduction, llvm::OpenMPIRBuilder::ReductionGenCBKind::Clang,
17600fca6ea1SDimitry Andric       CGF.getTarget().getGridValue(), C.getLangOpts().OpenMPCUDAReductionBufNum,
17610fca6ea1SDimitry Andric       RTLoc));
17620fca6ea1SDimitry Andric   return;
1763e8d8bef9SDimitry Andric }
1764e8d8bef9SDimitry Andric 
1765e8d8bef9SDimitry Andric const VarDecl *
1766e8d8bef9SDimitry Andric CGOpenMPRuntimeGPU::translateParameter(const FieldDecl *FD,
1767e8d8bef9SDimitry Andric                                        const VarDecl *NativeParam) const {
1768e8d8bef9SDimitry Andric   if (!NativeParam->getType()->isReferenceType())
1769e8d8bef9SDimitry Andric     return NativeParam;
1770e8d8bef9SDimitry Andric   QualType ArgType = NativeParam->getType();
1771e8d8bef9SDimitry Andric   QualifierCollector QC;
1772e8d8bef9SDimitry Andric   const Type *NonQualTy = QC.strip(ArgType);
1773e8d8bef9SDimitry Andric   QualType PointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
1774e8d8bef9SDimitry Andric   if (const auto *Attr = FD->getAttr<OMPCaptureKindAttr>()) {
1775e8d8bef9SDimitry Andric     if (Attr->getCaptureKind() == OMPC_map) {
1776e8d8bef9SDimitry Andric       PointeeTy = CGM.getContext().getAddrSpaceQualType(PointeeTy,
1777e8d8bef9SDimitry Andric                                                         LangAS::opencl_global);
1778e8d8bef9SDimitry Andric     }
1779e8d8bef9SDimitry Andric   }
1780e8d8bef9SDimitry Andric   ArgType = CGM.getContext().getPointerType(PointeeTy);
1781e8d8bef9SDimitry Andric   QC.addRestrict();
1782e8d8bef9SDimitry Andric   enum { NVPTX_local_addr = 5 };
1783e8d8bef9SDimitry Andric   QC.addAddressSpace(getLangASFromTargetAS(NVPTX_local_addr));
1784e8d8bef9SDimitry Andric   ArgType = QC.apply(CGM.getContext(), ArgType);
1785e8d8bef9SDimitry Andric   if (isa<ImplicitParamDecl>(NativeParam))
1786e8d8bef9SDimitry Andric     return ImplicitParamDecl::Create(
1787e8d8bef9SDimitry Andric         CGM.getContext(), /*DC=*/nullptr, NativeParam->getLocation(),
17885f757f3fSDimitry Andric         NativeParam->getIdentifier(), ArgType, ImplicitParamKind::Other);
1789e8d8bef9SDimitry Andric   return ParmVarDecl::Create(
1790e8d8bef9SDimitry Andric       CGM.getContext(),
1791e8d8bef9SDimitry Andric       const_cast<DeclContext *>(NativeParam->getDeclContext()),
1792e8d8bef9SDimitry Andric       NativeParam->getBeginLoc(), NativeParam->getLocation(),
1793e8d8bef9SDimitry Andric       NativeParam->getIdentifier(), ArgType,
1794e8d8bef9SDimitry Andric       /*TInfo=*/nullptr, SC_None, /*DefArg=*/nullptr);
1795e8d8bef9SDimitry Andric }
1796e8d8bef9SDimitry Andric 
1797e8d8bef9SDimitry Andric Address
1798e8d8bef9SDimitry Andric CGOpenMPRuntimeGPU::getParameterAddress(CodeGenFunction &CGF,
1799e8d8bef9SDimitry Andric                                           const VarDecl *NativeParam,
1800e8d8bef9SDimitry Andric                                           const VarDecl *TargetParam) const {
1801e8d8bef9SDimitry Andric   assert(NativeParam != TargetParam &&
1802e8d8bef9SDimitry Andric          NativeParam->getType()->isReferenceType() &&
1803e8d8bef9SDimitry Andric          "Native arg must not be the same as target arg.");
1804e8d8bef9SDimitry Andric   Address LocalAddr = CGF.GetAddrOfLocalVar(TargetParam);
1805e8d8bef9SDimitry Andric   QualType NativeParamType = NativeParam->getType();
1806e8d8bef9SDimitry Andric   QualifierCollector QC;
1807e8d8bef9SDimitry Andric   const Type *NonQualTy = QC.strip(NativeParamType);
1808e8d8bef9SDimitry Andric   QualType NativePointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
1809e8d8bef9SDimitry Andric   unsigned NativePointeeAddrSpace =
1810bdd1243dSDimitry Andric       CGF.getTypes().getTargetAddressSpace(NativePointeeTy);
1811e8d8bef9SDimitry Andric   QualType TargetTy = TargetParam->getType();
181206c3fb27SDimitry Andric   llvm::Value *TargetAddr = CGF.EmitLoadOfScalar(LocalAddr, /*Volatile=*/false,
181306c3fb27SDimitry Andric                                                  TargetTy, SourceLocation());
18145f757f3fSDimitry Andric   // Cast to native address space.
1815e8d8bef9SDimitry Andric   TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
181606c3fb27SDimitry Andric       TargetAddr,
181706c3fb27SDimitry Andric       llvm::PointerType::get(CGF.getLLVMContext(), NativePointeeAddrSpace));
1818e8d8bef9SDimitry Andric   Address NativeParamAddr = CGF.CreateMemTemp(NativeParamType);
1819e8d8bef9SDimitry Andric   CGF.EmitStoreOfScalar(TargetAddr, NativeParamAddr, /*Volatile=*/false,
1820e8d8bef9SDimitry Andric                         NativeParamType);
1821e8d8bef9SDimitry Andric   return NativeParamAddr;
1822e8d8bef9SDimitry Andric }
1823e8d8bef9SDimitry Andric 
1824e8d8bef9SDimitry Andric void CGOpenMPRuntimeGPU::emitOutlinedFunctionCall(
1825e8d8bef9SDimitry Andric     CodeGenFunction &CGF, SourceLocation Loc, llvm::FunctionCallee OutlinedFn,
1826e8d8bef9SDimitry Andric     ArrayRef<llvm::Value *> Args) const {
1827e8d8bef9SDimitry Andric   SmallVector<llvm::Value *, 4> TargetArgs;
1828e8d8bef9SDimitry Andric   TargetArgs.reserve(Args.size());
1829e8d8bef9SDimitry Andric   auto *FnType = OutlinedFn.getFunctionType();
1830e8d8bef9SDimitry Andric   for (unsigned I = 0, E = Args.size(); I < E; ++I) {
1831e8d8bef9SDimitry Andric     if (FnType->isVarArg() && FnType->getNumParams() <= I) {
1832e8d8bef9SDimitry Andric       TargetArgs.append(std::next(Args.begin(), I), Args.end());
1833e8d8bef9SDimitry Andric       break;
1834e8d8bef9SDimitry Andric     }
1835e8d8bef9SDimitry Andric     llvm::Type *TargetType = FnType->getParamType(I);
1836e8d8bef9SDimitry Andric     llvm::Value *NativeArg = Args[I];
1837e8d8bef9SDimitry Andric     if (!TargetType->isPointerTy()) {
1838e8d8bef9SDimitry Andric       TargetArgs.emplace_back(NativeArg);
1839e8d8bef9SDimitry Andric       continue;
1840e8d8bef9SDimitry Andric     }
1841e8d8bef9SDimitry Andric     TargetArgs.emplace_back(
18425f757f3fSDimitry Andric         CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(NativeArg, TargetType));
1843e8d8bef9SDimitry Andric   }
1844e8d8bef9SDimitry Andric   CGOpenMPRuntime::emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, TargetArgs);
1845e8d8bef9SDimitry Andric }
1846e8d8bef9SDimitry Andric 
1847e8d8bef9SDimitry Andric /// Emit function which wraps the outline parallel region
1848e8d8bef9SDimitry Andric /// and controls the arguments which are passed to this function.
1849e8d8bef9SDimitry Andric /// The wrapper ensures that the outlined function is called
1850e8d8bef9SDimitry Andric /// with the correct arguments when data is shared.
1851e8d8bef9SDimitry Andric llvm::Function *CGOpenMPRuntimeGPU::createParallelDataSharingWrapper(
1852e8d8bef9SDimitry Andric     llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D) {
1853e8d8bef9SDimitry Andric   ASTContext &Ctx = CGM.getContext();
1854e8d8bef9SDimitry Andric   const auto &CS = *D.getCapturedStmt(OMPD_parallel);
1855e8d8bef9SDimitry Andric 
1856e8d8bef9SDimitry Andric   // Create a function that takes as argument the source thread.
1857e8d8bef9SDimitry Andric   FunctionArgList WrapperArgs;
1858e8d8bef9SDimitry Andric   QualType Int16QTy =
1859e8d8bef9SDimitry Andric       Ctx.getIntTypeForBitwidth(/*DestWidth=*/16, /*Signed=*/false);
1860e8d8bef9SDimitry Andric   QualType Int32QTy =
1861e8d8bef9SDimitry Andric       Ctx.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/false);
1862e8d8bef9SDimitry Andric   ImplicitParamDecl ParallelLevelArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
1863e8d8bef9SDimitry Andric                                      /*Id=*/nullptr, Int16QTy,
18645f757f3fSDimitry Andric                                      ImplicitParamKind::Other);
1865e8d8bef9SDimitry Andric   ImplicitParamDecl WrapperArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
1866e8d8bef9SDimitry Andric                                /*Id=*/nullptr, Int32QTy,
18675f757f3fSDimitry Andric                                ImplicitParamKind::Other);
1868e8d8bef9SDimitry Andric   WrapperArgs.emplace_back(&ParallelLevelArg);
1869e8d8bef9SDimitry Andric   WrapperArgs.emplace_back(&WrapperArg);
1870e8d8bef9SDimitry Andric 
1871e8d8bef9SDimitry Andric   const CGFunctionInfo &CGFI =
1872e8d8bef9SDimitry Andric       CGM.getTypes().arrangeBuiltinFunctionDeclaration(Ctx.VoidTy, WrapperArgs);
1873e8d8bef9SDimitry Andric 
1874e8d8bef9SDimitry Andric   auto *Fn = llvm::Function::Create(
1875e8d8bef9SDimitry Andric       CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
1876e8d8bef9SDimitry Andric       Twine(OutlinedParallelFn->getName(), "_wrapper"), &CGM.getModule());
1877fe6060f1SDimitry Andric 
1878fe6060f1SDimitry Andric   // Ensure we do not inline the function. This is trivially true for the ones
1879fe6060f1SDimitry Andric   // passed to __kmpc_fork_call but the ones calles in serialized regions
1880fe6060f1SDimitry Andric   // could be inlined. This is not a perfect but it is closer to the invariant
1881fe6060f1SDimitry Andric   // we want, namely, every data environment starts with a new function.
1882fe6060f1SDimitry Andric   // TODO: We should pass the if condition to the runtime function and do the
1883fe6060f1SDimitry Andric   //       handling there. Much cleaner code.
1884fe6060f1SDimitry Andric   Fn->addFnAttr(llvm::Attribute::NoInline);
1885fe6060f1SDimitry Andric 
1886e8d8bef9SDimitry Andric   CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
1887e8d8bef9SDimitry Andric   Fn->setLinkage(llvm::GlobalValue::InternalLinkage);
1888e8d8bef9SDimitry Andric   Fn->setDoesNotRecurse();
1889e8d8bef9SDimitry Andric 
1890e8d8bef9SDimitry Andric   CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
1891e8d8bef9SDimitry Andric   CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, Fn, CGFI, WrapperArgs,
1892e8d8bef9SDimitry Andric                     D.getBeginLoc(), D.getBeginLoc());
1893e8d8bef9SDimitry Andric 
1894e8d8bef9SDimitry Andric   const auto *RD = CS.getCapturedRecordDecl();
1895e8d8bef9SDimitry Andric   auto CurField = RD->field_begin();
1896e8d8bef9SDimitry Andric 
1897e8d8bef9SDimitry Andric   Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
1898e8d8bef9SDimitry Andric                                                       /*Name=*/".zero.addr");
1899349cc55cSDimitry Andric   CGF.Builder.CreateStore(CGF.Builder.getInt32(/*C*/ 0), ZeroAddr);
1900e8d8bef9SDimitry Andric   // Get the array of arguments.
1901e8d8bef9SDimitry Andric   SmallVector<llvm::Value *, 8> Args;
1902e8d8bef9SDimitry Andric 
19030fca6ea1SDimitry Andric   Args.emplace_back(CGF.GetAddrOfLocalVar(&WrapperArg).emitRawPointer(CGF));
19040fca6ea1SDimitry Andric   Args.emplace_back(ZeroAddr.emitRawPointer(CGF));
1905e8d8bef9SDimitry Andric 
1906e8d8bef9SDimitry Andric   CGBuilderTy &Bld = CGF.Builder;
1907e8d8bef9SDimitry Andric   auto CI = CS.capture_begin();
1908e8d8bef9SDimitry Andric 
1909e8d8bef9SDimitry Andric   // Use global memory for data sharing.
1910e8d8bef9SDimitry Andric   // Handle passing of global args to workers.
19110fca6ea1SDimitry Andric   RawAddress GlobalArgs =
1912e8d8bef9SDimitry Andric       CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "global_args");
1913e8d8bef9SDimitry Andric   llvm::Value *GlobalArgsPtr = GlobalArgs.getPointer();
1914e8d8bef9SDimitry Andric   llvm::Value *DataSharingArgs[] = {GlobalArgsPtr};
1915e8d8bef9SDimitry Andric   CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1916e8d8bef9SDimitry Andric                           CGM.getModule(), OMPRTL___kmpc_get_shared_variables),
1917e8d8bef9SDimitry Andric                       DataSharingArgs);
1918e8d8bef9SDimitry Andric 
1919e8d8bef9SDimitry Andric   // Retrieve the shared variables from the list of references returned
1920e8d8bef9SDimitry Andric   // by the runtime. Pass the variables to the outlined function.
1921e8d8bef9SDimitry Andric   Address SharedArgListAddress = Address::invalid();
1922e8d8bef9SDimitry Andric   if (CS.capture_size() > 0 ||
1923e8d8bef9SDimitry Andric       isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
1924e8d8bef9SDimitry Andric     SharedArgListAddress = CGF.EmitLoadOfPointer(
1925e8d8bef9SDimitry Andric         GlobalArgs, CGF.getContext()
192681ad6265SDimitry Andric                         .getPointerType(CGF.getContext().VoidPtrTy)
1927e8d8bef9SDimitry Andric                         .castAs<PointerType>());
1928e8d8bef9SDimitry Andric   }
1929e8d8bef9SDimitry Andric   unsigned Idx = 0;
1930e8d8bef9SDimitry Andric   if (isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
1931e8d8bef9SDimitry Andric     Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
1932e8d8bef9SDimitry Andric     Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
193381ad6265SDimitry Andric         Src, CGF.SizeTy->getPointerTo(), CGF.SizeTy);
1934e8d8bef9SDimitry Andric     llvm::Value *LB = CGF.EmitLoadOfScalar(
1935e8d8bef9SDimitry Andric         TypedAddress,
1936e8d8bef9SDimitry Andric         /*Volatile=*/false,
1937e8d8bef9SDimitry Andric         CGF.getContext().getPointerType(CGF.getContext().getSizeType()),
1938e8d8bef9SDimitry Andric         cast<OMPLoopDirective>(D).getLowerBoundVariable()->getExprLoc());
1939e8d8bef9SDimitry Andric     Args.emplace_back(LB);
1940e8d8bef9SDimitry Andric     ++Idx;
1941e8d8bef9SDimitry Andric     Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
1942e8d8bef9SDimitry Andric     TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
194381ad6265SDimitry Andric         Src, CGF.SizeTy->getPointerTo(), CGF.SizeTy);
1944e8d8bef9SDimitry Andric     llvm::Value *UB = CGF.EmitLoadOfScalar(
1945e8d8bef9SDimitry Andric         TypedAddress,
1946e8d8bef9SDimitry Andric         /*Volatile=*/false,
1947e8d8bef9SDimitry Andric         CGF.getContext().getPointerType(CGF.getContext().getSizeType()),
1948e8d8bef9SDimitry Andric         cast<OMPLoopDirective>(D).getUpperBoundVariable()->getExprLoc());
1949e8d8bef9SDimitry Andric     Args.emplace_back(UB);
1950e8d8bef9SDimitry Andric     ++Idx;
1951e8d8bef9SDimitry Andric   }
1952e8d8bef9SDimitry Andric   if (CS.capture_size() > 0) {
1953e8d8bef9SDimitry Andric     ASTContext &CGFContext = CGF.getContext();
1954e8d8bef9SDimitry Andric     for (unsigned I = 0, E = CS.capture_size(); I < E; ++I, ++CI, ++CurField) {
1955e8d8bef9SDimitry Andric       QualType ElemTy = CurField->getType();
1956e8d8bef9SDimitry Andric       Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, I + Idx);
1957e8d8bef9SDimitry Andric       Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
195881ad6265SDimitry Andric           Src, CGF.ConvertTypeForMem(CGFContext.getPointerType(ElemTy)),
195981ad6265SDimitry Andric           CGF.ConvertTypeForMem(ElemTy));
1960e8d8bef9SDimitry Andric       llvm::Value *Arg = CGF.EmitLoadOfScalar(TypedAddress,
1961e8d8bef9SDimitry Andric                                               /*Volatile=*/false,
1962e8d8bef9SDimitry Andric                                               CGFContext.getPointerType(ElemTy),
1963e8d8bef9SDimitry Andric                                               CI->getLocation());
1964e8d8bef9SDimitry Andric       if (CI->capturesVariableByCopy() &&
1965e8d8bef9SDimitry Andric           !CI->getCapturedVar()->getType()->isAnyPointerType()) {
1966e8d8bef9SDimitry Andric         Arg = castValueToType(CGF, Arg, ElemTy, CGFContext.getUIntPtrType(),
1967e8d8bef9SDimitry Andric                               CI->getLocation());
1968e8d8bef9SDimitry Andric       }
1969e8d8bef9SDimitry Andric       Args.emplace_back(Arg);
1970e8d8bef9SDimitry Andric     }
1971e8d8bef9SDimitry Andric   }
1972e8d8bef9SDimitry Andric 
1973e8d8bef9SDimitry Andric   emitOutlinedFunctionCall(CGF, D.getBeginLoc(), OutlinedParallelFn, Args);
1974e8d8bef9SDimitry Andric   CGF.FinishFunction();
1975e8d8bef9SDimitry Andric   return Fn;
1976e8d8bef9SDimitry Andric }
1977e8d8bef9SDimitry Andric 
1978e8d8bef9SDimitry Andric void CGOpenMPRuntimeGPU::emitFunctionProlog(CodeGenFunction &CGF,
1979e8d8bef9SDimitry Andric                                               const Decl *D) {
19805f757f3fSDimitry Andric   if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic)
1981e8d8bef9SDimitry Andric     return;
1982e8d8bef9SDimitry Andric 
1983e8d8bef9SDimitry Andric   assert(D && "Expected function or captured|block decl.");
1984e8d8bef9SDimitry Andric   assert(FunctionGlobalizedDecls.count(CGF.CurFn) == 0 &&
1985e8d8bef9SDimitry Andric          "Function is registered already.");
1986e8d8bef9SDimitry Andric   assert((!TeamAndReductions.first || TeamAndReductions.first == D) &&
1987e8d8bef9SDimitry Andric          "Team is set but not processed.");
1988e8d8bef9SDimitry Andric   const Stmt *Body = nullptr;
1989e8d8bef9SDimitry Andric   bool NeedToDelayGlobalization = false;
1990e8d8bef9SDimitry Andric   if (const auto *FD = dyn_cast<FunctionDecl>(D)) {
1991e8d8bef9SDimitry Andric     Body = FD->getBody();
1992e8d8bef9SDimitry Andric   } else if (const auto *BD = dyn_cast<BlockDecl>(D)) {
1993e8d8bef9SDimitry Andric     Body = BD->getBody();
1994e8d8bef9SDimitry Andric   } else if (const auto *CD = dyn_cast<CapturedDecl>(D)) {
1995e8d8bef9SDimitry Andric     Body = CD->getBody();
1996e8d8bef9SDimitry Andric     NeedToDelayGlobalization = CGF.CapturedStmtInfo->getKind() == CR_OpenMP;
1997e8d8bef9SDimitry Andric     if (NeedToDelayGlobalization &&
1998e8d8bef9SDimitry Andric         getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD)
1999e8d8bef9SDimitry Andric       return;
2000e8d8bef9SDimitry Andric   }
2001e8d8bef9SDimitry Andric   if (!Body)
2002e8d8bef9SDimitry Andric     return;
2003e8d8bef9SDimitry Andric   CheckVarsEscapingDeclContext VarChecker(CGF, TeamAndReductions.second);
2004e8d8bef9SDimitry Andric   VarChecker.Visit(Body);
2005e8d8bef9SDimitry Andric   const RecordDecl *GlobalizedVarsRecord =
2006e8d8bef9SDimitry Andric       VarChecker.getGlobalizedRecord(IsInTTDRegion);
2007e8d8bef9SDimitry Andric   TeamAndReductions.first = nullptr;
2008e8d8bef9SDimitry Andric   TeamAndReductions.second.clear();
2009e8d8bef9SDimitry Andric   ArrayRef<const ValueDecl *> EscapedVariableLengthDecls =
2010e8d8bef9SDimitry Andric       VarChecker.getEscapedVariableLengthDecls();
201106c3fb27SDimitry Andric   ArrayRef<const ValueDecl *> DelayedVariableLengthDecls =
201206c3fb27SDimitry Andric       VarChecker.getDelayedVariableLengthDecls();
201306c3fb27SDimitry Andric   if (!GlobalizedVarsRecord && EscapedVariableLengthDecls.empty() &&
201406c3fb27SDimitry Andric       DelayedVariableLengthDecls.empty())
2015e8d8bef9SDimitry Andric     return;
2016e8d8bef9SDimitry Andric   auto I = FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
2017e8d8bef9SDimitry Andric   I->getSecond().MappedParams =
2018e8d8bef9SDimitry Andric       std::make_unique<CodeGenFunction::OMPMapVars>();
2019e8d8bef9SDimitry Andric   I->getSecond().EscapedParameters.insert(
2020e8d8bef9SDimitry Andric       VarChecker.getEscapedParameters().begin(),
2021e8d8bef9SDimitry Andric       VarChecker.getEscapedParameters().end());
2022e8d8bef9SDimitry Andric   I->getSecond().EscapedVariableLengthDecls.append(
2023e8d8bef9SDimitry Andric       EscapedVariableLengthDecls.begin(), EscapedVariableLengthDecls.end());
202406c3fb27SDimitry Andric   I->getSecond().DelayedVariableLengthDecls.append(
202506c3fb27SDimitry Andric       DelayedVariableLengthDecls.begin(), DelayedVariableLengthDecls.end());
2026e8d8bef9SDimitry Andric   DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
2027e8d8bef9SDimitry Andric   for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
2028e8d8bef9SDimitry Andric     assert(VD->isCanonicalDecl() && "Expected canonical declaration");
2029fe6060f1SDimitry Andric     Data.insert(std::make_pair(VD, MappedVarData()));
2030e8d8bef9SDimitry Andric   }
2031e8d8bef9SDimitry Andric   if (!NeedToDelayGlobalization) {
20325f757f3fSDimitry Andric     emitGenericVarsProlog(CGF, D->getBeginLoc());
2033e8d8bef9SDimitry Andric     struct GlobalizationScope final : EHScopeStack::Cleanup {
2034e8d8bef9SDimitry Andric       GlobalizationScope() = default;
2035e8d8bef9SDimitry Andric 
2036e8d8bef9SDimitry Andric       void Emit(CodeGenFunction &CGF, Flags flags) override {
2037e8d8bef9SDimitry Andric         static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime())
20385f757f3fSDimitry Andric             .emitGenericVarsEpilog(CGF);
2039e8d8bef9SDimitry Andric       }
2040e8d8bef9SDimitry Andric     };
2041e8d8bef9SDimitry Andric     CGF.EHStack.pushCleanup<GlobalizationScope>(NormalAndEHCleanup);
2042e8d8bef9SDimitry Andric   }
2043e8d8bef9SDimitry Andric }
2044e8d8bef9SDimitry Andric 
2045e8d8bef9SDimitry Andric Address CGOpenMPRuntimeGPU::getAddressOfLocalVariable(CodeGenFunction &CGF,
2046e8d8bef9SDimitry Andric                                                         const VarDecl *VD) {
2047e8d8bef9SDimitry Andric   if (VD && VD->hasAttr<OMPAllocateDeclAttr>()) {
2048e8d8bef9SDimitry Andric     const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
2049e8d8bef9SDimitry Andric     auto AS = LangAS::Default;
2050e8d8bef9SDimitry Andric     switch (A->getAllocatorType()) {
2051e8d8bef9SDimitry Andric       // Use the default allocator here as by default local vars are
2052e8d8bef9SDimitry Andric       // threadlocal.
2053e8d8bef9SDimitry Andric     case OMPAllocateDeclAttr::OMPNullMemAlloc:
2054e8d8bef9SDimitry Andric     case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
2055e8d8bef9SDimitry Andric     case OMPAllocateDeclAttr::OMPThreadMemAlloc:
2056e8d8bef9SDimitry Andric     case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
2057e8d8bef9SDimitry Andric     case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
2058e8d8bef9SDimitry Andric       // Follow the user decision - use default allocation.
2059e8d8bef9SDimitry Andric       return Address::invalid();
2060e8d8bef9SDimitry Andric     case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
2061e8d8bef9SDimitry Andric       // TODO: implement aupport for user-defined allocators.
2062e8d8bef9SDimitry Andric       return Address::invalid();
2063e8d8bef9SDimitry Andric     case OMPAllocateDeclAttr::OMPConstMemAlloc:
2064e8d8bef9SDimitry Andric       AS = LangAS::cuda_constant;
2065e8d8bef9SDimitry Andric       break;
2066e8d8bef9SDimitry Andric     case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
2067e8d8bef9SDimitry Andric       AS = LangAS::cuda_shared;
2068e8d8bef9SDimitry Andric       break;
2069e8d8bef9SDimitry Andric     case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
2070e8d8bef9SDimitry Andric     case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
2071e8d8bef9SDimitry Andric       break;
2072e8d8bef9SDimitry Andric     }
2073e8d8bef9SDimitry Andric     llvm::Type *VarTy = CGF.ConvertTypeForMem(VD->getType());
2074e8d8bef9SDimitry Andric     auto *GV = new llvm::GlobalVariable(
2075e8d8bef9SDimitry Andric         CGM.getModule(), VarTy, /*isConstant=*/false,
207606c3fb27SDimitry Andric         llvm::GlobalValue::InternalLinkage, llvm::PoisonValue::get(VarTy),
2077e8d8bef9SDimitry Andric         VD->getName(),
2078e8d8bef9SDimitry Andric         /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal,
2079e8d8bef9SDimitry Andric         CGM.getContext().getTargetAddressSpace(AS));
2080e8d8bef9SDimitry Andric     CharUnits Align = CGM.getContext().getDeclAlign(VD);
2081e8d8bef9SDimitry Andric     GV->setAlignment(Align.getAsAlign());
2082e8d8bef9SDimitry Andric     return Address(
2083e8d8bef9SDimitry Andric         CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
2084e8d8bef9SDimitry Andric             GV, VarTy->getPointerTo(CGM.getContext().getTargetAddressSpace(
2085e8d8bef9SDimitry Andric                     VD->getType().getAddressSpace()))),
208681ad6265SDimitry Andric         VarTy, Align);
2087e8d8bef9SDimitry Andric   }
2088e8d8bef9SDimitry Andric 
20895f757f3fSDimitry Andric   if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic)
2090e8d8bef9SDimitry Andric     return Address::invalid();
2091e8d8bef9SDimitry Andric 
2092e8d8bef9SDimitry Andric   VD = VD->getCanonicalDecl();
2093e8d8bef9SDimitry Andric   auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
2094e8d8bef9SDimitry Andric   if (I == FunctionGlobalizedDecls.end())
2095e8d8bef9SDimitry Andric     return Address::invalid();
2096e8d8bef9SDimitry Andric   auto VDI = I->getSecond().LocalVarData.find(VD);
2097e8d8bef9SDimitry Andric   if (VDI != I->getSecond().LocalVarData.end())
2098e8d8bef9SDimitry Andric     return VDI->second.PrivateAddr;
2099e8d8bef9SDimitry Andric   if (VD->hasAttrs()) {
2100e8d8bef9SDimitry Andric     for (specific_attr_iterator<OMPReferencedVarAttr> IT(VD->attr_begin()),
2101e8d8bef9SDimitry Andric          E(VD->attr_end());
2102e8d8bef9SDimitry Andric          IT != E; ++IT) {
2103e8d8bef9SDimitry Andric       auto VDI = I->getSecond().LocalVarData.find(
2104e8d8bef9SDimitry Andric           cast<VarDecl>(cast<DeclRefExpr>(IT->getRef())->getDecl())
2105e8d8bef9SDimitry Andric               ->getCanonicalDecl());
2106e8d8bef9SDimitry Andric       if (VDI != I->getSecond().LocalVarData.end())
2107e8d8bef9SDimitry Andric         return VDI->second.PrivateAddr;
2108e8d8bef9SDimitry Andric     }
2109e8d8bef9SDimitry Andric   }
2110e8d8bef9SDimitry Andric 
2111e8d8bef9SDimitry Andric   return Address::invalid();
2112e8d8bef9SDimitry Andric }
2113e8d8bef9SDimitry Andric 
2114e8d8bef9SDimitry Andric void CGOpenMPRuntimeGPU::functionFinished(CodeGenFunction &CGF) {
2115e8d8bef9SDimitry Andric   FunctionGlobalizedDecls.erase(CGF.CurFn);
2116e8d8bef9SDimitry Andric   CGOpenMPRuntime::functionFinished(CGF);
2117e8d8bef9SDimitry Andric }
2118e8d8bef9SDimitry Andric 
2119e8d8bef9SDimitry Andric void CGOpenMPRuntimeGPU::getDefaultDistScheduleAndChunk(
2120e8d8bef9SDimitry Andric     CodeGenFunction &CGF, const OMPLoopDirective &S,
2121e8d8bef9SDimitry Andric     OpenMPDistScheduleClauseKind &ScheduleKind,
2122e8d8bef9SDimitry Andric     llvm::Value *&Chunk) const {
2123e8d8bef9SDimitry Andric   auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
2124e8d8bef9SDimitry Andric   if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {
2125e8d8bef9SDimitry Andric     ScheduleKind = OMPC_DIST_SCHEDULE_static;
2126e8d8bef9SDimitry Andric     Chunk = CGF.EmitScalarConversion(
2127e8d8bef9SDimitry Andric         RT.getGPUNumThreads(CGF),
2128e8d8bef9SDimitry Andric         CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
2129e8d8bef9SDimitry Andric         S.getIterationVariable()->getType(), S.getBeginLoc());
2130e8d8bef9SDimitry Andric     return;
2131e8d8bef9SDimitry Andric   }
2132e8d8bef9SDimitry Andric   CGOpenMPRuntime::getDefaultDistScheduleAndChunk(
2133e8d8bef9SDimitry Andric       CGF, S, ScheduleKind, Chunk);
2134e8d8bef9SDimitry Andric }
2135e8d8bef9SDimitry Andric 
2136e8d8bef9SDimitry Andric void CGOpenMPRuntimeGPU::getDefaultScheduleAndChunk(
2137e8d8bef9SDimitry Andric     CodeGenFunction &CGF, const OMPLoopDirective &S,
2138e8d8bef9SDimitry Andric     OpenMPScheduleClauseKind &ScheduleKind,
2139e8d8bef9SDimitry Andric     const Expr *&ChunkExpr) const {
2140e8d8bef9SDimitry Andric   ScheduleKind = OMPC_SCHEDULE_static;
2141e8d8bef9SDimitry Andric   // Chunk size is 1 in this case.
2142e8d8bef9SDimitry Andric   llvm::APInt ChunkSize(32, 1);
2143e8d8bef9SDimitry Andric   ChunkExpr = IntegerLiteral::Create(CGF.getContext(), ChunkSize,
2144e8d8bef9SDimitry Andric       CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
2145e8d8bef9SDimitry Andric       SourceLocation());
2146e8d8bef9SDimitry Andric }
2147e8d8bef9SDimitry Andric 
2148e8d8bef9SDimitry Andric void CGOpenMPRuntimeGPU::adjustTargetSpecificDataForLambdas(
2149e8d8bef9SDimitry Andric     CodeGenFunction &CGF, const OMPExecutableDirective &D) const {
2150e8d8bef9SDimitry Andric   assert(isOpenMPTargetExecutionDirective(D.getDirectiveKind()) &&
2151e8d8bef9SDimitry Andric          " Expected target-based directive.");
2152e8d8bef9SDimitry Andric   const CapturedStmt *CS = D.getCapturedStmt(OMPD_target);
2153e8d8bef9SDimitry Andric   for (const CapturedStmt::Capture &C : CS->captures()) {
2154e8d8bef9SDimitry Andric     // Capture variables captured by reference in lambdas for target-based
2155e8d8bef9SDimitry Andric     // directives.
2156e8d8bef9SDimitry Andric     if (!C.capturesVariable())
2157e8d8bef9SDimitry Andric       continue;
2158e8d8bef9SDimitry Andric     const VarDecl *VD = C.getCapturedVar();
2159e8d8bef9SDimitry Andric     const auto *RD = VD->getType()
2160e8d8bef9SDimitry Andric                          .getCanonicalType()
2161e8d8bef9SDimitry Andric                          .getNonReferenceType()
2162e8d8bef9SDimitry Andric                          ->getAsCXXRecordDecl();
2163e8d8bef9SDimitry Andric     if (!RD || !RD->isLambda())
2164e8d8bef9SDimitry Andric       continue;
2165e8d8bef9SDimitry Andric     Address VDAddr = CGF.GetAddrOfLocalVar(VD);
2166e8d8bef9SDimitry Andric     LValue VDLVal;
2167e8d8bef9SDimitry Andric     if (VD->getType().getCanonicalType()->isReferenceType())
2168e8d8bef9SDimitry Andric       VDLVal = CGF.EmitLoadOfReferenceLValue(VDAddr, VD->getType());
2169e8d8bef9SDimitry Andric     else
2170e8d8bef9SDimitry Andric       VDLVal = CGF.MakeAddrLValue(
2171e8d8bef9SDimitry Andric           VDAddr, VD->getType().getCanonicalType().getNonReferenceType());
2172bdd1243dSDimitry Andric     llvm::DenseMap<const ValueDecl *, FieldDecl *> Captures;
2173e8d8bef9SDimitry Andric     FieldDecl *ThisCapture = nullptr;
2174e8d8bef9SDimitry Andric     RD->getCaptureFields(Captures, ThisCapture);
2175e8d8bef9SDimitry Andric     if (ThisCapture && CGF.CapturedStmtInfo->isCXXThisExprCaptured()) {
2176e8d8bef9SDimitry Andric       LValue ThisLVal =
2177e8d8bef9SDimitry Andric           CGF.EmitLValueForFieldInitialization(VDLVal, ThisCapture);
2178e8d8bef9SDimitry Andric       llvm::Value *CXXThis = CGF.LoadCXXThis();
2179e8d8bef9SDimitry Andric       CGF.EmitStoreOfScalar(CXXThis, ThisLVal);
2180e8d8bef9SDimitry Andric     }
2181e8d8bef9SDimitry Andric     for (const LambdaCapture &LC : RD->captures()) {
2182e8d8bef9SDimitry Andric       if (LC.getCaptureKind() != LCK_ByRef)
2183e8d8bef9SDimitry Andric         continue;
2184bdd1243dSDimitry Andric       const ValueDecl *VD = LC.getCapturedVar();
2185bdd1243dSDimitry Andric       // FIXME: For now VD is always a VarDecl because OpenMP does not support
2186bdd1243dSDimitry Andric       //  capturing structured bindings in lambdas yet.
2187bdd1243dSDimitry Andric       if (!CS->capturesVariable(cast<VarDecl>(VD)))
2188e8d8bef9SDimitry Andric         continue;
2189e8d8bef9SDimitry Andric       auto It = Captures.find(VD);
2190e8d8bef9SDimitry Andric       assert(It != Captures.end() && "Found lambda capture without field.");
2191e8d8bef9SDimitry Andric       LValue VarLVal = CGF.EmitLValueForFieldInitialization(VDLVal, It->second);
2192bdd1243dSDimitry Andric       Address VDAddr = CGF.GetAddrOfLocalVar(cast<VarDecl>(VD));
2193e8d8bef9SDimitry Andric       if (VD->getType().getCanonicalType()->isReferenceType())
2194e8d8bef9SDimitry Andric         VDAddr = CGF.EmitLoadOfReferenceLValue(VDAddr,
2195e8d8bef9SDimitry Andric                                                VD->getType().getCanonicalType())
21960fca6ea1SDimitry Andric                      .getAddress();
21970fca6ea1SDimitry Andric       CGF.EmitStoreOfScalar(VDAddr.emitRawPointer(CGF), VarLVal);
2198e8d8bef9SDimitry Andric     }
2199e8d8bef9SDimitry Andric   }
2200e8d8bef9SDimitry Andric }
2201e8d8bef9SDimitry Andric 
2202e8d8bef9SDimitry Andric bool CGOpenMPRuntimeGPU::hasAllocateAttributeForGlobalVar(const VarDecl *VD,
2203e8d8bef9SDimitry Andric                                                             LangAS &AS) {
2204e8d8bef9SDimitry Andric   if (!VD || !VD->hasAttr<OMPAllocateDeclAttr>())
2205e8d8bef9SDimitry Andric     return false;
2206e8d8bef9SDimitry Andric   const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
2207e8d8bef9SDimitry Andric   switch(A->getAllocatorType()) {
2208e8d8bef9SDimitry Andric   case OMPAllocateDeclAttr::OMPNullMemAlloc:
2209e8d8bef9SDimitry Andric   case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
2210e8d8bef9SDimitry Andric   // Not supported, fallback to the default mem space.
2211e8d8bef9SDimitry Andric   case OMPAllocateDeclAttr::OMPThreadMemAlloc:
2212e8d8bef9SDimitry Andric   case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
2213e8d8bef9SDimitry Andric   case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
2214e8d8bef9SDimitry Andric   case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
2215e8d8bef9SDimitry Andric   case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
2216e8d8bef9SDimitry Andric     AS = LangAS::Default;
2217e8d8bef9SDimitry Andric     return true;
2218e8d8bef9SDimitry Andric   case OMPAllocateDeclAttr::OMPConstMemAlloc:
2219e8d8bef9SDimitry Andric     AS = LangAS::cuda_constant;
2220e8d8bef9SDimitry Andric     return true;
2221e8d8bef9SDimitry Andric   case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
2222e8d8bef9SDimitry Andric     AS = LangAS::cuda_shared;
2223e8d8bef9SDimitry Andric     return true;
2224e8d8bef9SDimitry Andric   case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
2225e8d8bef9SDimitry Andric     llvm_unreachable("Expected predefined allocator for the variables with the "
2226e8d8bef9SDimitry Andric                      "static storage.");
2227e8d8bef9SDimitry Andric   }
2228e8d8bef9SDimitry Andric   return false;
2229e8d8bef9SDimitry Andric }
2230e8d8bef9SDimitry Andric 
22310fca6ea1SDimitry Andric // Get current OffloadArch and ignore any unknown values
22320fca6ea1SDimitry Andric static OffloadArch getOffloadArch(CodeGenModule &CGM) {
2233e8d8bef9SDimitry Andric   if (!CGM.getTarget().hasFeature("ptx"))
22340fca6ea1SDimitry Andric     return OffloadArch::UNKNOWN;
2235e8d8bef9SDimitry Andric   for (const auto &Feature : CGM.getTarget().getTargetOpts().FeatureMap) {
2236e8d8bef9SDimitry Andric     if (Feature.getValue()) {
22370fca6ea1SDimitry Andric       OffloadArch Arch = StringToOffloadArch(Feature.getKey());
22380fca6ea1SDimitry Andric       if (Arch != OffloadArch::UNKNOWN)
2239e8d8bef9SDimitry Andric         return Arch;
2240e8d8bef9SDimitry Andric     }
2241e8d8bef9SDimitry Andric   }
22420fca6ea1SDimitry Andric   return OffloadArch::UNKNOWN;
2243e8d8bef9SDimitry Andric }
2244e8d8bef9SDimitry Andric 
2245e8d8bef9SDimitry Andric /// Check to see if target architecture supports unified addressing which is
2246e8d8bef9SDimitry Andric /// a restriction for OpenMP requires clause "unified_shared_memory".
22470fca6ea1SDimitry Andric void CGOpenMPRuntimeGPU::processRequiresDirective(const OMPRequiresDecl *D) {
2248e8d8bef9SDimitry Andric   for (const OMPClause *Clause : D->clauselists()) {
2249e8d8bef9SDimitry Andric     if (Clause->getClauseKind() == OMPC_unified_shared_memory) {
22500fca6ea1SDimitry Andric       OffloadArch Arch = getOffloadArch(CGM);
2251e8d8bef9SDimitry Andric       switch (Arch) {
22520fca6ea1SDimitry Andric       case OffloadArch::SM_20:
22530fca6ea1SDimitry Andric       case OffloadArch::SM_21:
22540fca6ea1SDimitry Andric       case OffloadArch::SM_30:
22550fca6ea1SDimitry Andric       case OffloadArch::SM_32_:
22560fca6ea1SDimitry Andric       case OffloadArch::SM_35:
22570fca6ea1SDimitry Andric       case OffloadArch::SM_37:
22580fca6ea1SDimitry Andric       case OffloadArch::SM_50:
22590fca6ea1SDimitry Andric       case OffloadArch::SM_52:
22600fca6ea1SDimitry Andric       case OffloadArch::SM_53: {
2261e8d8bef9SDimitry Andric         SmallString<256> Buffer;
2262e8d8bef9SDimitry Andric         llvm::raw_svector_ostream Out(Buffer);
22630fca6ea1SDimitry Andric         Out << "Target architecture " << OffloadArchToString(Arch)
2264e8d8bef9SDimitry Andric             << " does not support unified addressing";
2265e8d8bef9SDimitry Andric         CGM.Error(Clause->getBeginLoc(), Out.str());
2266e8d8bef9SDimitry Andric         return;
2267e8d8bef9SDimitry Andric       }
22680fca6ea1SDimitry Andric       case OffloadArch::SM_60:
22690fca6ea1SDimitry Andric       case OffloadArch::SM_61:
22700fca6ea1SDimitry Andric       case OffloadArch::SM_62:
22710fca6ea1SDimitry Andric       case OffloadArch::SM_70:
22720fca6ea1SDimitry Andric       case OffloadArch::SM_72:
22730fca6ea1SDimitry Andric       case OffloadArch::SM_75:
22740fca6ea1SDimitry Andric       case OffloadArch::SM_80:
22750fca6ea1SDimitry Andric       case OffloadArch::SM_86:
22760fca6ea1SDimitry Andric       case OffloadArch::SM_87:
22770fca6ea1SDimitry Andric       case OffloadArch::SM_89:
22780fca6ea1SDimitry Andric       case OffloadArch::SM_90:
22790fca6ea1SDimitry Andric       case OffloadArch::SM_90a:
22800fca6ea1SDimitry Andric       case OffloadArch::GFX600:
22810fca6ea1SDimitry Andric       case OffloadArch::GFX601:
22820fca6ea1SDimitry Andric       case OffloadArch::GFX602:
22830fca6ea1SDimitry Andric       case OffloadArch::GFX700:
22840fca6ea1SDimitry Andric       case OffloadArch::GFX701:
22850fca6ea1SDimitry Andric       case OffloadArch::GFX702:
22860fca6ea1SDimitry Andric       case OffloadArch::GFX703:
22870fca6ea1SDimitry Andric       case OffloadArch::GFX704:
22880fca6ea1SDimitry Andric       case OffloadArch::GFX705:
22890fca6ea1SDimitry Andric       case OffloadArch::GFX801:
22900fca6ea1SDimitry Andric       case OffloadArch::GFX802:
22910fca6ea1SDimitry Andric       case OffloadArch::GFX803:
22920fca6ea1SDimitry Andric       case OffloadArch::GFX805:
22930fca6ea1SDimitry Andric       case OffloadArch::GFX810:
22940fca6ea1SDimitry Andric       case OffloadArch::GFX9_GENERIC:
22950fca6ea1SDimitry Andric       case OffloadArch::GFX900:
22960fca6ea1SDimitry Andric       case OffloadArch::GFX902:
22970fca6ea1SDimitry Andric       case OffloadArch::GFX904:
22980fca6ea1SDimitry Andric       case OffloadArch::GFX906:
22990fca6ea1SDimitry Andric       case OffloadArch::GFX908:
23000fca6ea1SDimitry Andric       case OffloadArch::GFX909:
23010fca6ea1SDimitry Andric       case OffloadArch::GFX90a:
23020fca6ea1SDimitry Andric       case OffloadArch::GFX90c:
23030fca6ea1SDimitry Andric       case OffloadArch::GFX940:
23040fca6ea1SDimitry Andric       case OffloadArch::GFX941:
23050fca6ea1SDimitry Andric       case OffloadArch::GFX942:
23060fca6ea1SDimitry Andric       case OffloadArch::GFX10_1_GENERIC:
23070fca6ea1SDimitry Andric       case OffloadArch::GFX1010:
23080fca6ea1SDimitry Andric       case OffloadArch::GFX1011:
23090fca6ea1SDimitry Andric       case OffloadArch::GFX1012:
23100fca6ea1SDimitry Andric       case OffloadArch::GFX1013:
23110fca6ea1SDimitry Andric       case OffloadArch::GFX10_3_GENERIC:
23120fca6ea1SDimitry Andric       case OffloadArch::GFX1030:
23130fca6ea1SDimitry Andric       case OffloadArch::GFX1031:
23140fca6ea1SDimitry Andric       case OffloadArch::GFX1032:
23150fca6ea1SDimitry Andric       case OffloadArch::GFX1033:
23160fca6ea1SDimitry Andric       case OffloadArch::GFX1034:
23170fca6ea1SDimitry Andric       case OffloadArch::GFX1035:
23180fca6ea1SDimitry Andric       case OffloadArch::GFX1036:
23190fca6ea1SDimitry Andric       case OffloadArch::GFX11_GENERIC:
23200fca6ea1SDimitry Andric       case OffloadArch::GFX1100:
23210fca6ea1SDimitry Andric       case OffloadArch::GFX1101:
23220fca6ea1SDimitry Andric       case OffloadArch::GFX1102:
23230fca6ea1SDimitry Andric       case OffloadArch::GFX1103:
23240fca6ea1SDimitry Andric       case OffloadArch::GFX1150:
23250fca6ea1SDimitry Andric       case OffloadArch::GFX1151:
23260fca6ea1SDimitry Andric       case OffloadArch::GFX1152:
23270fca6ea1SDimitry Andric       case OffloadArch::GFX12_GENERIC:
23280fca6ea1SDimitry Andric       case OffloadArch::GFX1200:
23290fca6ea1SDimitry Andric       case OffloadArch::GFX1201:
23300fca6ea1SDimitry Andric       case OffloadArch::AMDGCNSPIRV:
23310fca6ea1SDimitry Andric       case OffloadArch::Generic:
23320fca6ea1SDimitry Andric       case OffloadArch::UNUSED:
23330fca6ea1SDimitry Andric       case OffloadArch::UNKNOWN:
2334e8d8bef9SDimitry Andric         break;
23350fca6ea1SDimitry Andric       case OffloadArch::LAST:
23360fca6ea1SDimitry Andric         llvm_unreachable("Unexpected GPU arch.");
2337e8d8bef9SDimitry Andric       }
2338e8d8bef9SDimitry Andric     }
2339e8d8bef9SDimitry Andric   }
2340e8d8bef9SDimitry Andric   CGOpenMPRuntime::processRequiresDirective(D);
2341e8d8bef9SDimitry Andric }
2342e8d8bef9SDimitry Andric 
2343349cc55cSDimitry Andric llvm::Value *CGOpenMPRuntimeGPU::getGPUNumThreads(CodeGenFunction &CGF) {
2344349cc55cSDimitry Andric   CGBuilderTy &Bld = CGF.Builder;
2345349cc55cSDimitry Andric   llvm::Module *M = &CGF.CGM.getModule();
2346349cc55cSDimitry Andric   const char *LocSize = "__kmpc_get_hardware_num_threads_in_block";
2347349cc55cSDimitry Andric   llvm::Function *F = M->getFunction(LocSize);
2348349cc55cSDimitry Andric   if (!F) {
2349349cc55cSDimitry Andric     F = llvm::Function::Create(
2350bdd1243dSDimitry Andric         llvm::FunctionType::get(CGF.Int32Ty, std::nullopt, false),
2351349cc55cSDimitry Andric         llvm::GlobalVariable::ExternalLinkage, LocSize, &CGF.CGM.getModule());
2352349cc55cSDimitry Andric   }
2353bdd1243dSDimitry Andric   return Bld.CreateCall(F, std::nullopt, "nvptx_num_threads");
2354349cc55cSDimitry Andric }
2355349cc55cSDimitry Andric 
2356349cc55cSDimitry Andric llvm::Value *CGOpenMPRuntimeGPU::getGPUThreadID(CodeGenFunction &CGF) {
2357349cc55cSDimitry Andric   ArrayRef<llvm::Value *> Args{};
2358349cc55cSDimitry Andric   return CGF.EmitRuntimeCall(
2359349cc55cSDimitry Andric       OMPBuilder.getOrCreateRuntimeFunction(
2360349cc55cSDimitry Andric           CGM.getModule(), OMPRTL___kmpc_get_hardware_thread_id_in_block),
2361349cc55cSDimitry Andric       Args);
2362349cc55cSDimitry Andric }
2363