1e8d8bef9SDimitry Andric //===---- CGOpenMPRuntimeGPU.cpp - Interface to OpenMP GPU Runtimes ----===// 2e8d8bef9SDimitry Andric // 3e8d8bef9SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4e8d8bef9SDimitry Andric // See https://llvm.org/LICENSE.txt for license information. 5e8d8bef9SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6e8d8bef9SDimitry Andric // 7e8d8bef9SDimitry Andric //===----------------------------------------------------------------------===// 8e8d8bef9SDimitry Andric // 9e8d8bef9SDimitry Andric // This provides a generalized class for OpenMP runtime code generation 10e8d8bef9SDimitry Andric // specialized by GPU targets NVPTX and AMDGCN. 11e8d8bef9SDimitry Andric // 12e8d8bef9SDimitry Andric //===----------------------------------------------------------------------===// 13e8d8bef9SDimitry Andric 14e8d8bef9SDimitry Andric #include "CGOpenMPRuntimeGPU.h" 15e8d8bef9SDimitry Andric #include "CodeGenFunction.h" 16e8d8bef9SDimitry Andric #include "clang/AST/Attr.h" 17e8d8bef9SDimitry Andric #include "clang/AST/DeclOpenMP.h" 18bdd1243dSDimitry Andric #include "clang/AST/OpenMPClause.h" 19e8d8bef9SDimitry Andric #include "clang/AST/StmtOpenMP.h" 20e8d8bef9SDimitry Andric #include "clang/AST/StmtVisitor.h" 21e8d8bef9SDimitry Andric #include "clang/Basic/Cuda.h" 22e8d8bef9SDimitry Andric #include "llvm/ADT/SmallPtrSet.h" 23e8d8bef9SDimitry Andric #include "llvm/Frontend/OpenMP/OMPGridValues.h" 24349cc55cSDimitry Andric #include "llvm/Support/MathExtras.h" 25e8d8bef9SDimitry Andric 26e8d8bef9SDimitry Andric using namespace clang; 27e8d8bef9SDimitry Andric using namespace CodeGen; 28e8d8bef9SDimitry Andric using namespace llvm::omp; 29e8d8bef9SDimitry Andric 30e8d8bef9SDimitry Andric namespace { 31e8d8bef9SDimitry Andric /// Pre(post)-action for different OpenMP constructs specialized for NVPTX. 
32e8d8bef9SDimitry Andric class NVPTXActionTy final : public PrePostActionTy { 33e8d8bef9SDimitry Andric llvm::FunctionCallee EnterCallee = nullptr; 34e8d8bef9SDimitry Andric ArrayRef<llvm::Value *> EnterArgs; 35e8d8bef9SDimitry Andric llvm::FunctionCallee ExitCallee = nullptr; 36e8d8bef9SDimitry Andric ArrayRef<llvm::Value *> ExitArgs; 37e8d8bef9SDimitry Andric bool Conditional = false; 38e8d8bef9SDimitry Andric llvm::BasicBlock *ContBlock = nullptr; 39e8d8bef9SDimitry Andric 40e8d8bef9SDimitry Andric public: 41e8d8bef9SDimitry Andric NVPTXActionTy(llvm::FunctionCallee EnterCallee, 42e8d8bef9SDimitry Andric ArrayRef<llvm::Value *> EnterArgs, 43e8d8bef9SDimitry Andric llvm::FunctionCallee ExitCallee, 44e8d8bef9SDimitry Andric ArrayRef<llvm::Value *> ExitArgs, bool Conditional = false) 45e8d8bef9SDimitry Andric : EnterCallee(EnterCallee), EnterArgs(EnterArgs), ExitCallee(ExitCallee), 46e8d8bef9SDimitry Andric ExitArgs(ExitArgs), Conditional(Conditional) {} 47e8d8bef9SDimitry Andric void Enter(CodeGenFunction &CGF) override { 48e8d8bef9SDimitry Andric llvm::Value *EnterRes = CGF.EmitRuntimeCall(EnterCallee, EnterArgs); 49e8d8bef9SDimitry Andric if (Conditional) { 50e8d8bef9SDimitry Andric llvm::Value *CallBool = CGF.Builder.CreateIsNotNull(EnterRes); 51e8d8bef9SDimitry Andric auto *ThenBlock = CGF.createBasicBlock("omp_if.then"); 52e8d8bef9SDimitry Andric ContBlock = CGF.createBasicBlock("omp_if.end"); 53e8d8bef9SDimitry Andric // Generate the branch (If-stmt) 54e8d8bef9SDimitry Andric CGF.Builder.CreateCondBr(CallBool, ThenBlock, ContBlock); 55e8d8bef9SDimitry Andric CGF.EmitBlock(ThenBlock); 56e8d8bef9SDimitry Andric } 57e8d8bef9SDimitry Andric } 58e8d8bef9SDimitry Andric void Done(CodeGenFunction &CGF) { 59e8d8bef9SDimitry Andric // Emit the rest of blocks/branches 60e8d8bef9SDimitry Andric CGF.EmitBranch(ContBlock); 61e8d8bef9SDimitry Andric CGF.EmitBlock(ContBlock, true); 62e8d8bef9SDimitry Andric } 63e8d8bef9SDimitry Andric void Exit(CodeGenFunction &CGF) 
override { 64e8d8bef9SDimitry Andric CGF.EmitRuntimeCall(ExitCallee, ExitArgs); 65e8d8bef9SDimitry Andric } 66e8d8bef9SDimitry Andric }; 67e8d8bef9SDimitry Andric 68e8d8bef9SDimitry Andric /// A class to track the execution mode when codegening directives within 69e8d8bef9SDimitry Andric /// a target region. The appropriate mode (SPMD|NON-SPMD) is set on entry 70e8d8bef9SDimitry Andric /// to the target region and used by containing directives such as 'parallel' 71e8d8bef9SDimitry Andric /// to emit optimized code. 72e8d8bef9SDimitry Andric class ExecutionRuntimeModesRAII { 73e8d8bef9SDimitry Andric private: 74e8d8bef9SDimitry Andric CGOpenMPRuntimeGPU::ExecutionMode SavedExecMode = 75e8d8bef9SDimitry Andric CGOpenMPRuntimeGPU::EM_Unknown; 76e8d8bef9SDimitry Andric CGOpenMPRuntimeGPU::ExecutionMode &ExecMode; 77e8d8bef9SDimitry Andric 78e8d8bef9SDimitry Andric public: 79bdd1243dSDimitry Andric ExecutionRuntimeModesRAII(CGOpenMPRuntimeGPU::ExecutionMode &ExecMode, 80bdd1243dSDimitry Andric CGOpenMPRuntimeGPU::ExecutionMode EntryMode) 81e8d8bef9SDimitry Andric : ExecMode(ExecMode) { 82e8d8bef9SDimitry Andric SavedExecMode = ExecMode; 83bdd1243dSDimitry Andric ExecMode = EntryMode; 84e8d8bef9SDimitry Andric } 85bdd1243dSDimitry Andric ~ExecutionRuntimeModesRAII() { ExecMode = SavedExecMode; } 86e8d8bef9SDimitry Andric }; 87e8d8bef9SDimitry Andric 88e8d8bef9SDimitry Andric static const ValueDecl *getPrivateItem(const Expr *RefExpr) { 89e8d8bef9SDimitry Andric RefExpr = RefExpr->IgnoreParens(); 90e8d8bef9SDimitry Andric if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(RefExpr)) { 91e8d8bef9SDimitry Andric const Expr *Base = ASE->getBase()->IgnoreParenImpCasts(); 92e8d8bef9SDimitry Andric while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base)) 93e8d8bef9SDimitry Andric Base = TempASE->getBase()->IgnoreParenImpCasts(); 94e8d8bef9SDimitry Andric RefExpr = Base; 950fca6ea1SDimitry Andric } else if (auto *OASE = dyn_cast<ArraySectionExpr>(RefExpr)) { 
96e8d8bef9SDimitry Andric const Expr *Base = OASE->getBase()->IgnoreParenImpCasts(); 970fca6ea1SDimitry Andric while (const auto *TempOASE = dyn_cast<ArraySectionExpr>(Base)) 98e8d8bef9SDimitry Andric Base = TempOASE->getBase()->IgnoreParenImpCasts(); 99e8d8bef9SDimitry Andric while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base)) 100e8d8bef9SDimitry Andric Base = TempASE->getBase()->IgnoreParenImpCasts(); 101e8d8bef9SDimitry Andric RefExpr = Base; 102e8d8bef9SDimitry Andric } 103e8d8bef9SDimitry Andric RefExpr = RefExpr->IgnoreParenImpCasts(); 104e8d8bef9SDimitry Andric if (const auto *DE = dyn_cast<DeclRefExpr>(RefExpr)) 105e8d8bef9SDimitry Andric return cast<ValueDecl>(DE->getDecl()->getCanonicalDecl()); 106e8d8bef9SDimitry Andric const auto *ME = cast<MemberExpr>(RefExpr); 107e8d8bef9SDimitry Andric return cast<ValueDecl>(ME->getMemberDecl()->getCanonicalDecl()); 108e8d8bef9SDimitry Andric } 109e8d8bef9SDimitry Andric 110e8d8bef9SDimitry Andric static RecordDecl *buildRecordForGlobalizedVars( 111e8d8bef9SDimitry Andric ASTContext &C, ArrayRef<const ValueDecl *> EscapedDecls, 112e8d8bef9SDimitry Andric ArrayRef<const ValueDecl *> EscapedDeclsForTeams, 113e8d8bef9SDimitry Andric llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> 1145f757f3fSDimitry Andric &MappedDeclsFields, 1155f757f3fSDimitry Andric int BufSize) { 116e8d8bef9SDimitry Andric using VarsDataTy = std::pair<CharUnits /*Align*/, const ValueDecl *>; 117e8d8bef9SDimitry Andric if (EscapedDecls.empty() && EscapedDeclsForTeams.empty()) 118e8d8bef9SDimitry Andric return nullptr; 119e8d8bef9SDimitry Andric SmallVector<VarsDataTy, 4> GlobalizedVars; 120e8d8bef9SDimitry Andric for (const ValueDecl *D : EscapedDecls) 1215f757f3fSDimitry Andric GlobalizedVars.emplace_back(C.getDeclAlign(D), D); 122e8d8bef9SDimitry Andric for (const ValueDecl *D : EscapedDeclsForTeams) 123e8d8bef9SDimitry Andric GlobalizedVars.emplace_back(C.getDeclAlign(D), D); 124e8d8bef9SDimitry Andric 125e8d8bef9SDimitry 
Andric // Build struct _globalized_locals_ty { 1265f757f3fSDimitry Andric // /* globalized vars */[WarSize] align (decl_align) 127e8d8bef9SDimitry Andric // /* globalized vars */ for EscapedDeclsForTeams 128e8d8bef9SDimitry Andric // }; 129e8d8bef9SDimitry Andric RecordDecl *GlobalizedRD = C.buildImplicitRecord("_globalized_locals_ty"); 130e8d8bef9SDimitry Andric GlobalizedRD->startDefinition(); 131e8d8bef9SDimitry Andric llvm::SmallPtrSet<const ValueDecl *, 16> SingleEscaped( 132e8d8bef9SDimitry Andric EscapedDeclsForTeams.begin(), EscapedDeclsForTeams.end()); 133e8d8bef9SDimitry Andric for (const auto &Pair : GlobalizedVars) { 134e8d8bef9SDimitry Andric const ValueDecl *VD = Pair.second; 135e8d8bef9SDimitry Andric QualType Type = VD->getType(); 136e8d8bef9SDimitry Andric if (Type->isLValueReferenceType()) 137e8d8bef9SDimitry Andric Type = C.getPointerType(Type.getNonReferenceType()); 138e8d8bef9SDimitry Andric else 139e8d8bef9SDimitry Andric Type = Type.getNonReferenceType(); 140e8d8bef9SDimitry Andric SourceLocation Loc = VD->getLocation(); 141e8d8bef9SDimitry Andric FieldDecl *Field; 142e8d8bef9SDimitry Andric if (SingleEscaped.count(VD)) { 143e8d8bef9SDimitry Andric Field = FieldDecl::Create( 144e8d8bef9SDimitry Andric C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type, 145e8d8bef9SDimitry Andric C.getTrivialTypeSourceInfo(Type, SourceLocation()), 146e8d8bef9SDimitry Andric /*BW=*/nullptr, /*Mutable=*/false, 147e8d8bef9SDimitry Andric /*InitStyle=*/ICIS_NoInit); 148e8d8bef9SDimitry Andric Field->setAccess(AS_public); 149e8d8bef9SDimitry Andric if (VD->hasAttrs()) { 150e8d8bef9SDimitry Andric for (specific_attr_iterator<AlignedAttr> I(VD->getAttrs().begin()), 151e8d8bef9SDimitry Andric E(VD->getAttrs().end()); 152e8d8bef9SDimitry Andric I != E; ++I) 153e8d8bef9SDimitry Andric Field->addAttr(*I); 154e8d8bef9SDimitry Andric } 155e8d8bef9SDimitry Andric } else { 1565f757f3fSDimitry Andric if (BufSize > 1) { 157e8d8bef9SDimitry Andric llvm::APInt ArraySize(32, 
BufSize); 1585f757f3fSDimitry Andric Type = C.getConstantArrayType(Type, ArraySize, nullptr, 1595f757f3fSDimitry Andric ArraySizeModifier::Normal, 0); 1605f757f3fSDimitry Andric } 161e8d8bef9SDimitry Andric Field = FieldDecl::Create( 162e8d8bef9SDimitry Andric C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type, 163e8d8bef9SDimitry Andric C.getTrivialTypeSourceInfo(Type, SourceLocation()), 164e8d8bef9SDimitry Andric /*BW=*/nullptr, /*Mutable=*/false, 165e8d8bef9SDimitry Andric /*InitStyle=*/ICIS_NoInit); 166e8d8bef9SDimitry Andric Field->setAccess(AS_public); 1675f757f3fSDimitry Andric llvm::APInt Align(32, Pair.first.getQuantity()); 168e8d8bef9SDimitry Andric Field->addAttr(AlignedAttr::CreateImplicit( 169e8d8bef9SDimitry Andric C, /*IsAlignmentExpr=*/true, 170e8d8bef9SDimitry Andric IntegerLiteral::Create(C, Align, 171e8d8bef9SDimitry Andric C.getIntTypeForBitwidth(32, /*Signed=*/0), 172e8d8bef9SDimitry Andric SourceLocation()), 17306c3fb27SDimitry Andric {}, AlignedAttr::GNU_aligned)); 174e8d8bef9SDimitry Andric } 175e8d8bef9SDimitry Andric GlobalizedRD->addDecl(Field); 176e8d8bef9SDimitry Andric MappedDeclsFields.try_emplace(VD, Field); 177e8d8bef9SDimitry Andric } 178e8d8bef9SDimitry Andric GlobalizedRD->completeDefinition(); 179e8d8bef9SDimitry Andric return GlobalizedRD; 180e8d8bef9SDimitry Andric } 181e8d8bef9SDimitry Andric 182e8d8bef9SDimitry Andric /// Get the list of variables that can escape their declaration context. 
183e8d8bef9SDimitry Andric class CheckVarsEscapingDeclContext final 184e8d8bef9SDimitry Andric : public ConstStmtVisitor<CheckVarsEscapingDeclContext> { 185e8d8bef9SDimitry Andric CodeGenFunction &CGF; 186e8d8bef9SDimitry Andric llvm::SetVector<const ValueDecl *> EscapedDecls; 187e8d8bef9SDimitry Andric llvm::SetVector<const ValueDecl *> EscapedVariableLengthDecls; 18806c3fb27SDimitry Andric llvm::SetVector<const ValueDecl *> DelayedVariableLengthDecls; 189e8d8bef9SDimitry Andric llvm::SmallPtrSet<const Decl *, 4> EscapedParameters; 190e8d8bef9SDimitry Andric RecordDecl *GlobalizedRD = nullptr; 191e8d8bef9SDimitry Andric llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields; 192e8d8bef9SDimitry Andric bool AllEscaped = false; 193e8d8bef9SDimitry Andric bool IsForCombinedParallelRegion = false; 194e8d8bef9SDimitry Andric 195e8d8bef9SDimitry Andric void markAsEscaped(const ValueDecl *VD) { 196e8d8bef9SDimitry Andric // Do not globalize declare target variables. 197e8d8bef9SDimitry Andric if (!isa<VarDecl>(VD) || 198e8d8bef9SDimitry Andric OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD)) 199e8d8bef9SDimitry Andric return; 200e8d8bef9SDimitry Andric VD = cast<ValueDecl>(VD->getCanonicalDecl()); 201e8d8bef9SDimitry Andric // Use user-specified allocation. 202e8d8bef9SDimitry Andric if (VD->hasAttrs() && VD->hasAttr<OMPAllocateDeclAttr>()) 203e8d8bef9SDimitry Andric return; 204e8d8bef9SDimitry Andric // Variables captured by value must be globalized. 20506c3fb27SDimitry Andric bool IsCaptured = false; 206e8d8bef9SDimitry Andric if (auto *CSI = CGF.CapturedStmtInfo) { 207e8d8bef9SDimitry Andric if (const FieldDecl *FD = CSI->lookup(cast<VarDecl>(VD))) { 208e8d8bef9SDimitry Andric // Check if need to capture the variable that was already captured by 209e8d8bef9SDimitry Andric // value in the outer region. 
21006c3fb27SDimitry Andric IsCaptured = true; 211e8d8bef9SDimitry Andric if (!IsForCombinedParallelRegion) { 212e8d8bef9SDimitry Andric if (!FD->hasAttrs()) 213e8d8bef9SDimitry Andric return; 214e8d8bef9SDimitry Andric const auto *Attr = FD->getAttr<OMPCaptureKindAttr>(); 215e8d8bef9SDimitry Andric if (!Attr) 216e8d8bef9SDimitry Andric return; 217e8d8bef9SDimitry Andric if (((Attr->getCaptureKind() != OMPC_map) && 218e8d8bef9SDimitry Andric !isOpenMPPrivate(Attr->getCaptureKind())) || 219e8d8bef9SDimitry Andric ((Attr->getCaptureKind() == OMPC_map) && 220e8d8bef9SDimitry Andric !FD->getType()->isAnyPointerType())) 221e8d8bef9SDimitry Andric return; 222e8d8bef9SDimitry Andric } 223e8d8bef9SDimitry Andric if (!FD->getType()->isReferenceType()) { 224e8d8bef9SDimitry Andric assert(!VD->getType()->isVariablyModifiedType() && 225e8d8bef9SDimitry Andric "Parameter captured by value with variably modified type"); 226e8d8bef9SDimitry Andric EscapedParameters.insert(VD); 227e8d8bef9SDimitry Andric } else if (!IsForCombinedParallelRegion) { 228e8d8bef9SDimitry Andric return; 229e8d8bef9SDimitry Andric } 230e8d8bef9SDimitry Andric } 231e8d8bef9SDimitry Andric } 232e8d8bef9SDimitry Andric if ((!CGF.CapturedStmtInfo || 233e8d8bef9SDimitry Andric (IsForCombinedParallelRegion && CGF.CapturedStmtInfo)) && 234e8d8bef9SDimitry Andric VD->getType()->isReferenceType()) 235e8d8bef9SDimitry Andric // Do not globalize variables with reference type. 236e8d8bef9SDimitry Andric return; 23706c3fb27SDimitry Andric if (VD->getType()->isVariablyModifiedType()) { 23806c3fb27SDimitry Andric // If not captured at the target region level then mark the escaped 23906c3fb27SDimitry Andric // variable as delayed. 
24006c3fb27SDimitry Andric if (IsCaptured) 241e8d8bef9SDimitry Andric EscapedVariableLengthDecls.insert(VD); 242e8d8bef9SDimitry Andric else 24306c3fb27SDimitry Andric DelayedVariableLengthDecls.insert(VD); 24406c3fb27SDimitry Andric } else 245e8d8bef9SDimitry Andric EscapedDecls.insert(VD); 246e8d8bef9SDimitry Andric } 247e8d8bef9SDimitry Andric 248e8d8bef9SDimitry Andric void VisitValueDecl(const ValueDecl *VD) { 249e8d8bef9SDimitry Andric if (VD->getType()->isLValueReferenceType()) 250e8d8bef9SDimitry Andric markAsEscaped(VD); 251e8d8bef9SDimitry Andric if (const auto *VarD = dyn_cast<VarDecl>(VD)) { 252e8d8bef9SDimitry Andric if (!isa<ParmVarDecl>(VarD) && VarD->hasInit()) { 253e8d8bef9SDimitry Andric const bool SavedAllEscaped = AllEscaped; 254e8d8bef9SDimitry Andric AllEscaped = VD->getType()->isLValueReferenceType(); 255e8d8bef9SDimitry Andric Visit(VarD->getInit()); 256e8d8bef9SDimitry Andric AllEscaped = SavedAllEscaped; 257e8d8bef9SDimitry Andric } 258e8d8bef9SDimitry Andric } 259e8d8bef9SDimitry Andric } 260e8d8bef9SDimitry Andric void VisitOpenMPCapturedStmt(const CapturedStmt *S, 261e8d8bef9SDimitry Andric ArrayRef<OMPClause *> Clauses, 262e8d8bef9SDimitry Andric bool IsCombinedParallelRegion) { 263e8d8bef9SDimitry Andric if (!S) 264e8d8bef9SDimitry Andric return; 265e8d8bef9SDimitry Andric for (const CapturedStmt::Capture &C : S->captures()) { 266e8d8bef9SDimitry Andric if (C.capturesVariable() && !C.capturesVariableByCopy()) { 267e8d8bef9SDimitry Andric const ValueDecl *VD = C.getCapturedVar(); 268e8d8bef9SDimitry Andric bool SavedIsForCombinedParallelRegion = IsForCombinedParallelRegion; 269e8d8bef9SDimitry Andric if (IsCombinedParallelRegion) { 270e8d8bef9SDimitry Andric // Check if the variable is privatized in the combined construct and 271e8d8bef9SDimitry Andric // those private copies must be shared in the inner parallel 272e8d8bef9SDimitry Andric // directive. 
273e8d8bef9SDimitry Andric IsForCombinedParallelRegion = false; 274e8d8bef9SDimitry Andric for (const OMPClause *C : Clauses) { 275e8d8bef9SDimitry Andric if (!isOpenMPPrivate(C->getClauseKind()) || 276e8d8bef9SDimitry Andric C->getClauseKind() == OMPC_reduction || 277e8d8bef9SDimitry Andric C->getClauseKind() == OMPC_linear || 278e8d8bef9SDimitry Andric C->getClauseKind() == OMPC_private) 279e8d8bef9SDimitry Andric continue; 280e8d8bef9SDimitry Andric ArrayRef<const Expr *> Vars; 281e8d8bef9SDimitry Andric if (const auto *PC = dyn_cast<OMPFirstprivateClause>(C)) 282e8d8bef9SDimitry Andric Vars = PC->getVarRefs(); 283e8d8bef9SDimitry Andric else if (const auto *PC = dyn_cast<OMPLastprivateClause>(C)) 284e8d8bef9SDimitry Andric Vars = PC->getVarRefs(); 285e8d8bef9SDimitry Andric else 286e8d8bef9SDimitry Andric llvm_unreachable("Unexpected clause."); 287e8d8bef9SDimitry Andric for (const auto *E : Vars) { 288e8d8bef9SDimitry Andric const Decl *D = 289e8d8bef9SDimitry Andric cast<DeclRefExpr>(E)->getDecl()->getCanonicalDecl(); 290e8d8bef9SDimitry Andric if (D == VD->getCanonicalDecl()) { 291e8d8bef9SDimitry Andric IsForCombinedParallelRegion = true; 292e8d8bef9SDimitry Andric break; 293e8d8bef9SDimitry Andric } 294e8d8bef9SDimitry Andric } 295e8d8bef9SDimitry Andric if (IsForCombinedParallelRegion) 296e8d8bef9SDimitry Andric break; 297e8d8bef9SDimitry Andric } 298e8d8bef9SDimitry Andric } 299e8d8bef9SDimitry Andric markAsEscaped(VD); 300e8d8bef9SDimitry Andric if (isa<OMPCapturedExprDecl>(VD)) 301e8d8bef9SDimitry Andric VisitValueDecl(VD); 302e8d8bef9SDimitry Andric IsForCombinedParallelRegion = SavedIsForCombinedParallelRegion; 303e8d8bef9SDimitry Andric } 304e8d8bef9SDimitry Andric } 305e8d8bef9SDimitry Andric } 306e8d8bef9SDimitry Andric 307e8d8bef9SDimitry Andric void buildRecordForGlobalizedVars(bool IsInTTDRegion) { 308e8d8bef9SDimitry Andric assert(!GlobalizedRD && 309e8d8bef9SDimitry Andric "Record for globalized variables is built already."); 
310e8d8bef9SDimitry Andric ArrayRef<const ValueDecl *> EscapedDeclsForParallel, EscapedDeclsForTeams; 311349cc55cSDimitry Andric unsigned WarpSize = CGF.getTarget().getGridValue().GV_Warp_Size; 312e8d8bef9SDimitry Andric if (IsInTTDRegion) 313e8d8bef9SDimitry Andric EscapedDeclsForTeams = EscapedDecls.getArrayRef(); 314e8d8bef9SDimitry Andric else 315e8d8bef9SDimitry Andric EscapedDeclsForParallel = EscapedDecls.getArrayRef(); 316e8d8bef9SDimitry Andric GlobalizedRD = ::buildRecordForGlobalizedVars( 317e8d8bef9SDimitry Andric CGF.getContext(), EscapedDeclsForParallel, EscapedDeclsForTeams, 318e8d8bef9SDimitry Andric MappedDeclsFields, WarpSize); 319e8d8bef9SDimitry Andric } 320e8d8bef9SDimitry Andric 321e8d8bef9SDimitry Andric public: 322e8d8bef9SDimitry Andric CheckVarsEscapingDeclContext(CodeGenFunction &CGF, 323e8d8bef9SDimitry Andric ArrayRef<const ValueDecl *> TeamsReductions) 324e8d8bef9SDimitry Andric : CGF(CGF), EscapedDecls(TeamsReductions.begin(), TeamsReductions.end()) { 325e8d8bef9SDimitry Andric } 326e8d8bef9SDimitry Andric virtual ~CheckVarsEscapingDeclContext() = default; 327e8d8bef9SDimitry Andric void VisitDeclStmt(const DeclStmt *S) { 328e8d8bef9SDimitry Andric if (!S) 329e8d8bef9SDimitry Andric return; 330e8d8bef9SDimitry Andric for (const Decl *D : S->decls()) 331e8d8bef9SDimitry Andric if (const auto *VD = dyn_cast_or_null<ValueDecl>(D)) 332e8d8bef9SDimitry Andric VisitValueDecl(VD); 333e8d8bef9SDimitry Andric } 334e8d8bef9SDimitry Andric void VisitOMPExecutableDirective(const OMPExecutableDirective *D) { 335e8d8bef9SDimitry Andric if (!D) 336e8d8bef9SDimitry Andric return; 337e8d8bef9SDimitry Andric if (!D->hasAssociatedStmt()) 338e8d8bef9SDimitry Andric return; 339e8d8bef9SDimitry Andric if (const auto *S = 340e8d8bef9SDimitry Andric dyn_cast_or_null<CapturedStmt>(D->getAssociatedStmt())) { 341e8d8bef9SDimitry Andric // Do not analyze directives that do not actually require capturing, 342e8d8bef9SDimitry Andric // like `omp for` or `omp simd` 
directives. 343e8d8bef9SDimitry Andric llvm::SmallVector<OpenMPDirectiveKind, 4> CaptureRegions; 344e8d8bef9SDimitry Andric getOpenMPCaptureRegions(CaptureRegions, D->getDirectiveKind()); 345e8d8bef9SDimitry Andric if (CaptureRegions.size() == 1 && CaptureRegions.back() == OMPD_unknown) { 346e8d8bef9SDimitry Andric VisitStmt(S->getCapturedStmt()); 347e8d8bef9SDimitry Andric return; 348e8d8bef9SDimitry Andric } 349e8d8bef9SDimitry Andric VisitOpenMPCapturedStmt( 350e8d8bef9SDimitry Andric S, D->clauses(), 351e8d8bef9SDimitry Andric CaptureRegions.back() == OMPD_parallel && 352e8d8bef9SDimitry Andric isOpenMPDistributeDirective(D->getDirectiveKind())); 353e8d8bef9SDimitry Andric } 354e8d8bef9SDimitry Andric } 355e8d8bef9SDimitry Andric void VisitCapturedStmt(const CapturedStmt *S) { 356e8d8bef9SDimitry Andric if (!S) 357e8d8bef9SDimitry Andric return; 358e8d8bef9SDimitry Andric for (const CapturedStmt::Capture &C : S->captures()) { 359e8d8bef9SDimitry Andric if (C.capturesVariable() && !C.capturesVariableByCopy()) { 360e8d8bef9SDimitry Andric const ValueDecl *VD = C.getCapturedVar(); 361e8d8bef9SDimitry Andric markAsEscaped(VD); 362e8d8bef9SDimitry Andric if (isa<OMPCapturedExprDecl>(VD)) 363e8d8bef9SDimitry Andric VisitValueDecl(VD); 364e8d8bef9SDimitry Andric } 365e8d8bef9SDimitry Andric } 366e8d8bef9SDimitry Andric } 367e8d8bef9SDimitry Andric void VisitLambdaExpr(const LambdaExpr *E) { 368e8d8bef9SDimitry Andric if (!E) 369e8d8bef9SDimitry Andric return; 370e8d8bef9SDimitry Andric for (const LambdaCapture &C : E->captures()) { 371e8d8bef9SDimitry Andric if (C.capturesVariable()) { 372e8d8bef9SDimitry Andric if (C.getCaptureKind() == LCK_ByRef) { 373e8d8bef9SDimitry Andric const ValueDecl *VD = C.getCapturedVar(); 374e8d8bef9SDimitry Andric markAsEscaped(VD); 375e8d8bef9SDimitry Andric if (E->isInitCapture(&C) || isa<OMPCapturedExprDecl>(VD)) 376e8d8bef9SDimitry Andric VisitValueDecl(VD); 377e8d8bef9SDimitry Andric } 378e8d8bef9SDimitry Andric } 
379e8d8bef9SDimitry Andric } 380e8d8bef9SDimitry Andric } 381e8d8bef9SDimitry Andric void VisitBlockExpr(const BlockExpr *E) { 382e8d8bef9SDimitry Andric if (!E) 383e8d8bef9SDimitry Andric return; 384e8d8bef9SDimitry Andric for (const BlockDecl::Capture &C : E->getBlockDecl()->captures()) { 385e8d8bef9SDimitry Andric if (C.isByRef()) { 386e8d8bef9SDimitry Andric const VarDecl *VD = C.getVariable(); 387e8d8bef9SDimitry Andric markAsEscaped(VD); 388e8d8bef9SDimitry Andric if (isa<OMPCapturedExprDecl>(VD) || VD->isInitCapture()) 389e8d8bef9SDimitry Andric VisitValueDecl(VD); 390e8d8bef9SDimitry Andric } 391e8d8bef9SDimitry Andric } 392e8d8bef9SDimitry Andric } 393e8d8bef9SDimitry Andric void VisitCallExpr(const CallExpr *E) { 394e8d8bef9SDimitry Andric if (!E) 395e8d8bef9SDimitry Andric return; 396e8d8bef9SDimitry Andric for (const Expr *Arg : E->arguments()) { 397e8d8bef9SDimitry Andric if (!Arg) 398e8d8bef9SDimitry Andric continue; 399e8d8bef9SDimitry Andric if (Arg->isLValue()) { 400e8d8bef9SDimitry Andric const bool SavedAllEscaped = AllEscaped; 401e8d8bef9SDimitry Andric AllEscaped = true; 402e8d8bef9SDimitry Andric Visit(Arg); 403e8d8bef9SDimitry Andric AllEscaped = SavedAllEscaped; 404e8d8bef9SDimitry Andric } else { 405e8d8bef9SDimitry Andric Visit(Arg); 406e8d8bef9SDimitry Andric } 407e8d8bef9SDimitry Andric } 408e8d8bef9SDimitry Andric Visit(E->getCallee()); 409e8d8bef9SDimitry Andric } 410e8d8bef9SDimitry Andric void VisitDeclRefExpr(const DeclRefExpr *E) { 411e8d8bef9SDimitry Andric if (!E) 412e8d8bef9SDimitry Andric return; 413e8d8bef9SDimitry Andric const ValueDecl *VD = E->getDecl(); 414e8d8bef9SDimitry Andric if (AllEscaped) 415e8d8bef9SDimitry Andric markAsEscaped(VD); 416e8d8bef9SDimitry Andric if (isa<OMPCapturedExprDecl>(VD)) 417e8d8bef9SDimitry Andric VisitValueDecl(VD); 418bdd1243dSDimitry Andric else if (VD->isInitCapture()) 419e8d8bef9SDimitry Andric VisitValueDecl(VD); 420e8d8bef9SDimitry Andric } 421e8d8bef9SDimitry Andric void 
VisitUnaryOperator(const UnaryOperator *E) { 422e8d8bef9SDimitry Andric if (!E) 423e8d8bef9SDimitry Andric return; 424e8d8bef9SDimitry Andric if (E->getOpcode() == UO_AddrOf) { 425e8d8bef9SDimitry Andric const bool SavedAllEscaped = AllEscaped; 426e8d8bef9SDimitry Andric AllEscaped = true; 427e8d8bef9SDimitry Andric Visit(E->getSubExpr()); 428e8d8bef9SDimitry Andric AllEscaped = SavedAllEscaped; 429e8d8bef9SDimitry Andric } else { 430e8d8bef9SDimitry Andric Visit(E->getSubExpr()); 431e8d8bef9SDimitry Andric } 432e8d8bef9SDimitry Andric } 433e8d8bef9SDimitry Andric void VisitImplicitCastExpr(const ImplicitCastExpr *E) { 434e8d8bef9SDimitry Andric if (!E) 435e8d8bef9SDimitry Andric return; 436e8d8bef9SDimitry Andric if (E->getCastKind() == CK_ArrayToPointerDecay) { 437e8d8bef9SDimitry Andric const bool SavedAllEscaped = AllEscaped; 438e8d8bef9SDimitry Andric AllEscaped = true; 439e8d8bef9SDimitry Andric Visit(E->getSubExpr()); 440e8d8bef9SDimitry Andric AllEscaped = SavedAllEscaped; 441e8d8bef9SDimitry Andric } else { 442e8d8bef9SDimitry Andric Visit(E->getSubExpr()); 443e8d8bef9SDimitry Andric } 444e8d8bef9SDimitry Andric } 445e8d8bef9SDimitry Andric void VisitExpr(const Expr *E) { 446e8d8bef9SDimitry Andric if (!E) 447e8d8bef9SDimitry Andric return; 448e8d8bef9SDimitry Andric bool SavedAllEscaped = AllEscaped; 449e8d8bef9SDimitry Andric if (!E->isLValue()) 450e8d8bef9SDimitry Andric AllEscaped = false; 451e8d8bef9SDimitry Andric for (const Stmt *Child : E->children()) 452e8d8bef9SDimitry Andric if (Child) 453e8d8bef9SDimitry Andric Visit(Child); 454e8d8bef9SDimitry Andric AllEscaped = SavedAllEscaped; 455e8d8bef9SDimitry Andric } 456e8d8bef9SDimitry Andric void VisitStmt(const Stmt *S) { 457e8d8bef9SDimitry Andric if (!S) 458e8d8bef9SDimitry Andric return; 459e8d8bef9SDimitry Andric for (const Stmt *Child : S->children()) 460e8d8bef9SDimitry Andric if (Child) 461e8d8bef9SDimitry Andric Visit(Child); 462e8d8bef9SDimitry Andric } 463e8d8bef9SDimitry Andric 
464e8d8bef9SDimitry Andric /// Returns the record that handles all the escaped local variables and used 465e8d8bef9SDimitry Andric /// instead of their original storage. 466e8d8bef9SDimitry Andric const RecordDecl *getGlobalizedRecord(bool IsInTTDRegion) { 467e8d8bef9SDimitry Andric if (!GlobalizedRD) 468e8d8bef9SDimitry Andric buildRecordForGlobalizedVars(IsInTTDRegion); 469e8d8bef9SDimitry Andric return GlobalizedRD; 470e8d8bef9SDimitry Andric } 471e8d8bef9SDimitry Andric 472e8d8bef9SDimitry Andric /// Returns the field in the globalized record for the escaped variable. 473e8d8bef9SDimitry Andric const FieldDecl *getFieldForGlobalizedVar(const ValueDecl *VD) const { 474e8d8bef9SDimitry Andric assert(GlobalizedRD && 475e8d8bef9SDimitry Andric "Record for globalized variables must be generated already."); 47606c3fb27SDimitry Andric return MappedDeclsFields.lookup(VD); 477e8d8bef9SDimitry Andric } 478e8d8bef9SDimitry Andric 479e8d8bef9SDimitry Andric /// Returns the list of the escaped local variables/parameters. 480e8d8bef9SDimitry Andric ArrayRef<const ValueDecl *> getEscapedDecls() const { 481e8d8bef9SDimitry Andric return EscapedDecls.getArrayRef(); 482e8d8bef9SDimitry Andric } 483e8d8bef9SDimitry Andric 484e8d8bef9SDimitry Andric /// Checks if the escaped local variable is actually a parameter passed by 485e8d8bef9SDimitry Andric /// value. 486e8d8bef9SDimitry Andric const llvm::SmallPtrSetImpl<const Decl *> &getEscapedParameters() const { 487e8d8bef9SDimitry Andric return EscapedParameters; 488e8d8bef9SDimitry Andric } 489e8d8bef9SDimitry Andric 490e8d8bef9SDimitry Andric /// Returns the list of the escaped variables with the variably modified 491e8d8bef9SDimitry Andric /// types. 
492e8d8bef9SDimitry Andric ArrayRef<const ValueDecl *> getEscapedVariableLengthDecls() const { 493e8d8bef9SDimitry Andric return EscapedVariableLengthDecls.getArrayRef(); 494e8d8bef9SDimitry Andric } 49506c3fb27SDimitry Andric 49606c3fb27SDimitry Andric /// Returns the list of the delayed variables with the variably modified 49706c3fb27SDimitry Andric /// types. 49806c3fb27SDimitry Andric ArrayRef<const ValueDecl *> getDelayedVariableLengthDecls() const { 49906c3fb27SDimitry Andric return DelayedVariableLengthDecls.getArrayRef(); 50006c3fb27SDimitry Andric } 501e8d8bef9SDimitry Andric }; 502e8d8bef9SDimitry Andric } // anonymous namespace 503e8d8bef9SDimitry Andric 504e8d8bef9SDimitry Andric CGOpenMPRuntimeGPU::ExecutionMode 505e8d8bef9SDimitry Andric CGOpenMPRuntimeGPU::getExecutionMode() const { 506e8d8bef9SDimitry Andric return CurrentExecutionMode; 507e8d8bef9SDimitry Andric } 508e8d8bef9SDimitry Andric 5095f757f3fSDimitry Andric CGOpenMPRuntimeGPU::DataSharingMode 5105f757f3fSDimitry Andric CGOpenMPRuntimeGPU::getDataSharingMode() const { 5115f757f3fSDimitry Andric return CurrentDataSharingMode; 512e8d8bef9SDimitry Andric } 513e8d8bef9SDimitry Andric 514e8d8bef9SDimitry Andric /// Check for inner (nested) SPMD construct, if any 515e8d8bef9SDimitry Andric static bool hasNestedSPMDDirective(ASTContext &Ctx, 516e8d8bef9SDimitry Andric const OMPExecutableDirective &D) { 517e8d8bef9SDimitry Andric const auto *CS = D.getInnermostCapturedStmt(); 518e8d8bef9SDimitry Andric const auto *Body = 519e8d8bef9SDimitry Andric CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true); 520e8d8bef9SDimitry Andric const Stmt *ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body); 521e8d8bef9SDimitry Andric 522e8d8bef9SDimitry Andric if (const auto *NestedDir = 523e8d8bef9SDimitry Andric dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) { 524e8d8bef9SDimitry Andric OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind(); 525e8d8bef9SDimitry Andric switch 
(D.getDirectiveKind()) { 526e8d8bef9SDimitry Andric case OMPD_target: 527e8d8bef9SDimitry Andric if (isOpenMPParallelDirective(DKind)) 528e8d8bef9SDimitry Andric return true; 529e8d8bef9SDimitry Andric if (DKind == OMPD_teams) { 530e8d8bef9SDimitry Andric Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers( 531e8d8bef9SDimitry Andric /*IgnoreCaptured=*/true); 532e8d8bef9SDimitry Andric if (!Body) 533e8d8bef9SDimitry Andric return false; 534e8d8bef9SDimitry Andric ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body); 535e8d8bef9SDimitry Andric if (const auto *NND = 536e8d8bef9SDimitry Andric dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) { 537e8d8bef9SDimitry Andric DKind = NND->getDirectiveKind(); 538e8d8bef9SDimitry Andric if (isOpenMPParallelDirective(DKind)) 539e8d8bef9SDimitry Andric return true; 540e8d8bef9SDimitry Andric } 541e8d8bef9SDimitry Andric } 542e8d8bef9SDimitry Andric return false; 543e8d8bef9SDimitry Andric case OMPD_target_teams: 544e8d8bef9SDimitry Andric return isOpenMPParallelDirective(DKind); 545e8d8bef9SDimitry Andric case OMPD_target_simd: 546e8d8bef9SDimitry Andric case OMPD_target_parallel: 547e8d8bef9SDimitry Andric case OMPD_target_parallel_for: 548e8d8bef9SDimitry Andric case OMPD_target_parallel_for_simd: 549e8d8bef9SDimitry Andric case OMPD_target_teams_distribute: 550e8d8bef9SDimitry Andric case OMPD_target_teams_distribute_simd: 551e8d8bef9SDimitry Andric case OMPD_target_teams_distribute_parallel_for: 552e8d8bef9SDimitry Andric case OMPD_target_teams_distribute_parallel_for_simd: 553e8d8bef9SDimitry Andric case OMPD_parallel: 554e8d8bef9SDimitry Andric case OMPD_for: 555e8d8bef9SDimitry Andric case OMPD_parallel_for: 556e8d8bef9SDimitry Andric case OMPD_parallel_master: 557e8d8bef9SDimitry Andric case OMPD_parallel_sections: 558e8d8bef9SDimitry Andric case OMPD_for_simd: 559e8d8bef9SDimitry Andric case OMPD_parallel_for_simd: 560e8d8bef9SDimitry Andric case OMPD_cancel: 561e8d8bef9SDimitry Andric case 
OMPD_cancellation_point: 562e8d8bef9SDimitry Andric case OMPD_ordered: 563e8d8bef9SDimitry Andric case OMPD_threadprivate: 564e8d8bef9SDimitry Andric case OMPD_allocate: 565e8d8bef9SDimitry Andric case OMPD_task: 566e8d8bef9SDimitry Andric case OMPD_simd: 567e8d8bef9SDimitry Andric case OMPD_sections: 568e8d8bef9SDimitry Andric case OMPD_section: 569e8d8bef9SDimitry Andric case OMPD_single: 570e8d8bef9SDimitry Andric case OMPD_master: 571e8d8bef9SDimitry Andric case OMPD_critical: 572e8d8bef9SDimitry Andric case OMPD_taskyield: 573e8d8bef9SDimitry Andric case OMPD_barrier: 574e8d8bef9SDimitry Andric case OMPD_taskwait: 575e8d8bef9SDimitry Andric case OMPD_taskgroup: 576e8d8bef9SDimitry Andric case OMPD_atomic: 577e8d8bef9SDimitry Andric case OMPD_flush: 578e8d8bef9SDimitry Andric case OMPD_depobj: 579e8d8bef9SDimitry Andric case OMPD_scan: 580e8d8bef9SDimitry Andric case OMPD_teams: 581e8d8bef9SDimitry Andric case OMPD_target_data: 582e8d8bef9SDimitry Andric case OMPD_target_exit_data: 583e8d8bef9SDimitry Andric case OMPD_target_enter_data: 584e8d8bef9SDimitry Andric case OMPD_distribute: 585e8d8bef9SDimitry Andric case OMPD_distribute_simd: 586e8d8bef9SDimitry Andric case OMPD_distribute_parallel_for: 587e8d8bef9SDimitry Andric case OMPD_distribute_parallel_for_simd: 588e8d8bef9SDimitry Andric case OMPD_teams_distribute: 589e8d8bef9SDimitry Andric case OMPD_teams_distribute_simd: 590e8d8bef9SDimitry Andric case OMPD_teams_distribute_parallel_for: 591e8d8bef9SDimitry Andric case OMPD_teams_distribute_parallel_for_simd: 592e8d8bef9SDimitry Andric case OMPD_target_update: 593e8d8bef9SDimitry Andric case OMPD_declare_simd: 594e8d8bef9SDimitry Andric case OMPD_declare_variant: 595e8d8bef9SDimitry Andric case OMPD_begin_declare_variant: 596e8d8bef9SDimitry Andric case OMPD_end_declare_variant: 597e8d8bef9SDimitry Andric case OMPD_declare_target: 598e8d8bef9SDimitry Andric case OMPD_end_declare_target: 599e8d8bef9SDimitry Andric case OMPD_declare_reduction: 
600e8d8bef9SDimitry Andric case OMPD_declare_mapper: 601e8d8bef9SDimitry Andric case OMPD_taskloop: 602e8d8bef9SDimitry Andric case OMPD_taskloop_simd: 603e8d8bef9SDimitry Andric case OMPD_master_taskloop: 604e8d8bef9SDimitry Andric case OMPD_master_taskloop_simd: 605e8d8bef9SDimitry Andric case OMPD_parallel_master_taskloop: 606e8d8bef9SDimitry Andric case OMPD_parallel_master_taskloop_simd: 607e8d8bef9SDimitry Andric case OMPD_requires: 608e8d8bef9SDimitry Andric case OMPD_unknown: 609e8d8bef9SDimitry Andric default: 610e8d8bef9SDimitry Andric llvm_unreachable("Unexpected directive."); 611e8d8bef9SDimitry Andric } 612e8d8bef9SDimitry Andric } 613e8d8bef9SDimitry Andric 614e8d8bef9SDimitry Andric return false; 615e8d8bef9SDimitry Andric } 616e8d8bef9SDimitry Andric 617e8d8bef9SDimitry Andric static bool supportsSPMDExecutionMode(ASTContext &Ctx, 618e8d8bef9SDimitry Andric const OMPExecutableDirective &D) { 619e8d8bef9SDimitry Andric OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind(); 620e8d8bef9SDimitry Andric switch (DirectiveKind) { 621e8d8bef9SDimitry Andric case OMPD_target: 622e8d8bef9SDimitry Andric case OMPD_target_teams: 623e8d8bef9SDimitry Andric return hasNestedSPMDDirective(Ctx, D); 62406c3fb27SDimitry Andric case OMPD_target_parallel_loop: 625e8d8bef9SDimitry Andric case OMPD_target_parallel: 626e8d8bef9SDimitry Andric case OMPD_target_parallel_for: 627e8d8bef9SDimitry Andric case OMPD_target_parallel_for_simd: 628e8d8bef9SDimitry Andric case OMPD_target_teams_distribute_parallel_for: 629e8d8bef9SDimitry Andric case OMPD_target_teams_distribute_parallel_for_simd: 630e8d8bef9SDimitry Andric case OMPD_target_simd: 631e8d8bef9SDimitry Andric case OMPD_target_teams_distribute_simd: 632e8d8bef9SDimitry Andric return true; 633e8d8bef9SDimitry Andric case OMPD_target_teams_distribute: 634e8d8bef9SDimitry Andric return false; 6350fca6ea1SDimitry Andric case OMPD_target_teams_loop: 6360fca6ea1SDimitry Andric // Whether this is true or not depends on how 
/// Emit a target region kernel in generic (non-SPMD) mode: the main thread
/// orchestrates the region and worker threads are activated on demand.
///
/// \param D             The target directive being emitted.
/// \param ParentName    Name of the enclosing function, used to derive the
///                      kernel name.
/// \param OutlinedFn    [out] The generated kernel function.
/// \param OutlinedFnID  [out] The offload-entry ID for the kernel.
/// \param IsOffloadEntry True when the kernel must be registered as an
///                      offload entry.
/// \param CodeGen       Callback that emits the region body.
void CGOpenMPRuntimeGPU::emitNonSPMDKernel(const OMPExecutableDirective &D,
                                           StringRef ParentName,
                                           llvm::Function *&OutlinedFn,
                                           llvm::Constant *&OutlinedFnID,
                                           bool IsOffloadEntry,
                                           const RegionCodeGenTy &CodeGen) {
  // Switch to non-SPMD mode for the duration of this kernel's emission.
  ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode, EM_NonSPMD);
  EntryFunctionState EST;
  // Parallel wrappers are collected per kernel; start fresh.
  WrapperFunctionsMap.clear();

  // Bare kernels bypass the runtime entirely and are routed to
  // emitSPMDKernel by emitTargetOutlinedFunction, so none can appear here.
  [[maybe_unused]] bool IsBareKernel = D.getSingleClause<OMPXBareClause>();
  assert(!IsBareKernel && "bare kernel should not be at generic mode");

  // Emit target region as a standalone region.
  class NVPTXPrePostActionTy : public PrePostActionTy {
    CGOpenMPRuntimeGPU::EntryFunctionState &EST;
    const OMPExecutableDirective &D;

  public:
    NVPTXPrePostActionTy(CGOpenMPRuntimeGPU::EntryFunctionState &EST,
                         const OMPExecutableDirective &D)
        : EST(EST), D(D) {}
    void Enter(CodeGenFunction &CGF) override {
      auto &RT =
          static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
      RT.emitKernelInit(D, CGF, EST, /* IsSPMD */ false);
      // Skip target region initialization.
      RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
    }
    void Exit(CodeGenFunction &CGF) override {
      auto &RT =
          static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
      RT.clearLocThreadIdInsertPt(CGF);
      RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ false);
    }
  } Action(EST, D);
  CodeGen.setAction(Action);
  IsInTTDRegion = true;
  emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
                                   IsOffloadEntry, CodeGen);
  IsInTTDRegion = false;
}

/// Emit the kernel initialization sequence: compute thread/team bounds from
/// the directive's clauses and call the OpenMPIRBuilder's target-init
/// helper. In generic mode this additionally emits the data-sharing prolog
/// for escaped variables.
void CGOpenMPRuntimeGPU::emitKernelInit(const OMPExecutableDirective &D,
                                        CodeGenFunction &CGF,
                                        EntryFunctionState &EST, bool IsSPMD) {
  // -1 means "no upper bound known at compile time".
  int32_t MinThreadsVal = 1, MaxThreadsVal = -1, MinTeamsVal = 1,
          MaxTeamsVal = -1;
  computeMinAndMaxThreadsAndTeams(D, CGF, MinThreadsVal, MaxThreadsVal,
                                  MinTeamsVal, MaxTeamsVal);

  CGBuilderTy &Bld = CGF.Builder;
  Bld.restoreIP(OMPBuilder.createTargetInit(
      Bld, IsSPMD, MinThreadsVal, MaxThreadsVal, MinTeamsVal, MaxTeamsVal));
  if (!IsSPMD)
    emitGenericVarsProlog(CGF, EST.Loc);
}

/// Emit the kernel teardown sequence: the generic-mode data-sharing epilog,
/// followed by the OpenMPIRBuilder's target-deinit call which receives the
/// size of the (union of all) teams-reduction records accumulated for this
/// kernel.
void CGOpenMPRuntimeGPU::emitKernelDeinit(CodeGenFunction &CGF,
                                          EntryFunctionState &EST,
                                          bool IsSPMD) {
  if (!IsSPMD)
    emitGenericVarsEpilog(CGF);

  // This is temporary until we remove the fixed sized buffer.
  // Build an implicit union of all teams-reduction record types so its
  // alloc size bounds the per-team reduction buffer.
  ASTContext &C = CGM.getContext();
  RecordDecl *StaticRD = C.buildImplicitRecord(
      "_openmp_teams_reduction_type_$_", RecordDecl::TagKind::Union);
  StaticRD->startDefinition();
  for (const RecordDecl *TeamReductionRec : TeamsReductions) {
    QualType RecTy = C.getRecordType(TeamReductionRec);
    auto *Field = FieldDecl::Create(
        C, StaticRD, SourceLocation(), SourceLocation(), nullptr, RecTy,
        C.getTrivialTypeSourceInfo(RecTy, SourceLocation()),
        /*BW=*/nullptr, /*Mutable=*/false,
        /*InitStyle=*/ICIS_NoInit);
    Field->setAccess(AS_public);
    StaticRD->addDecl(Field);
  }
  StaticRD->completeDefinition();
  QualType StaticTy = C.getRecordType(StaticRD);
  llvm::Type *LLVMReductionsBufferTy =
      CGM.getTypes().ConvertTypeForMem(StaticTy);
  const auto &DL = CGM.getModule().getDataLayout();
  // No reductions => zero-sized buffer request.
  uint64_t ReductionDataSize =
      TeamsReductions.empty()
          ? 0
          : DL.getTypeAllocSize(LLVMReductionsBufferTy).getFixedValue();
  CGBuilderTy &Bld = CGF.Builder;
  OMPBuilder.createTargetDeinit(Bld, ReductionDataSize,
                                C.getLangOpts().OpenMPCUDAReductionBufNum);
  TeamsReductions.clear();
}
0 7895f757f3fSDimitry Andric : DL.getTypeAllocSize(LLVMReductionsBufferTy).getFixedValue(); 790fe6060f1SDimitry Andric CGBuilderTy &Bld = CGF.Builder; 7915f757f3fSDimitry Andric OMPBuilder.createTargetDeinit(Bld, ReductionDataSize, 7925f757f3fSDimitry Andric C.getLangOpts().OpenMPCUDAReductionBufNum); 7935f757f3fSDimitry Andric TeamsReductions.clear(); 794e8d8bef9SDimitry Andric } 795e8d8bef9SDimitry Andric 796e8d8bef9SDimitry Andric void CGOpenMPRuntimeGPU::emitSPMDKernel(const OMPExecutableDirective &D, 797e8d8bef9SDimitry Andric StringRef ParentName, 798e8d8bef9SDimitry Andric llvm::Function *&OutlinedFn, 799e8d8bef9SDimitry Andric llvm::Constant *&OutlinedFnID, 800e8d8bef9SDimitry Andric bool IsOffloadEntry, 801e8d8bef9SDimitry Andric const RegionCodeGenTy &CodeGen) { 802bdd1243dSDimitry Andric ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode, EM_SPMD); 803e8d8bef9SDimitry Andric EntryFunctionState EST; 804e8d8bef9SDimitry Andric 8055f757f3fSDimitry Andric bool IsBareKernel = D.getSingleClause<OMPXBareClause>(); 8065f757f3fSDimitry Andric 807e8d8bef9SDimitry Andric // Emit target region as a standalone region. 
808e8d8bef9SDimitry Andric class NVPTXPrePostActionTy : public PrePostActionTy { 809e8d8bef9SDimitry Andric CGOpenMPRuntimeGPU &RT; 810e8d8bef9SDimitry Andric CGOpenMPRuntimeGPU::EntryFunctionState &EST; 8115f757f3fSDimitry Andric bool IsBareKernel; 8125f757f3fSDimitry Andric DataSharingMode Mode; 8135f757f3fSDimitry Andric const OMPExecutableDirective &D; 814e8d8bef9SDimitry Andric 815e8d8bef9SDimitry Andric public: 816e8d8bef9SDimitry Andric NVPTXPrePostActionTy(CGOpenMPRuntimeGPU &RT, 8175f757f3fSDimitry Andric CGOpenMPRuntimeGPU::EntryFunctionState &EST, 8185f757f3fSDimitry Andric bool IsBareKernel, const OMPExecutableDirective &D) 8195f757f3fSDimitry Andric : RT(RT), EST(EST), IsBareKernel(IsBareKernel), 8205f757f3fSDimitry Andric Mode(RT.CurrentDataSharingMode), D(D) {} 821e8d8bef9SDimitry Andric void Enter(CodeGenFunction &CGF) override { 8225f757f3fSDimitry Andric if (IsBareKernel) { 8235f757f3fSDimitry Andric RT.CurrentDataSharingMode = DataSharingMode::DS_CUDA; 8245f757f3fSDimitry Andric return; 8255f757f3fSDimitry Andric } 8265f757f3fSDimitry Andric RT.emitKernelInit(D, CGF, EST, /* IsSPMD */ true); 827e8d8bef9SDimitry Andric // Skip target region initialization. 
828e8d8bef9SDimitry Andric RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true); 829e8d8bef9SDimitry Andric } 830e8d8bef9SDimitry Andric void Exit(CodeGenFunction &CGF) override { 8315f757f3fSDimitry Andric if (IsBareKernel) { 8325f757f3fSDimitry Andric RT.CurrentDataSharingMode = Mode; 8335f757f3fSDimitry Andric return; 8345f757f3fSDimitry Andric } 835e8d8bef9SDimitry Andric RT.clearLocThreadIdInsertPt(CGF); 836fe6060f1SDimitry Andric RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ true); 837e8d8bef9SDimitry Andric } 8385f757f3fSDimitry Andric } Action(*this, EST, IsBareKernel, D); 839e8d8bef9SDimitry Andric CodeGen.setAction(Action); 840e8d8bef9SDimitry Andric IsInTTDRegion = true; 841e8d8bef9SDimitry Andric emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID, 842e8d8bef9SDimitry Andric IsOffloadEntry, CodeGen); 843e8d8bef9SDimitry Andric IsInTTDRegion = false; 844e8d8bef9SDimitry Andric } 845e8d8bef9SDimitry Andric 846e8d8bef9SDimitry Andric void CGOpenMPRuntimeGPU::emitTargetOutlinedFunction( 847e8d8bef9SDimitry Andric const OMPExecutableDirective &D, StringRef ParentName, 848e8d8bef9SDimitry Andric llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID, 849e8d8bef9SDimitry Andric bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) { 850e8d8bef9SDimitry Andric if (!IsOffloadEntry) // Nothing to do. 
851e8d8bef9SDimitry Andric return; 852e8d8bef9SDimitry Andric 853e8d8bef9SDimitry Andric assert(!ParentName.empty() && "Invalid target region parent name!"); 854e8d8bef9SDimitry Andric 855e8d8bef9SDimitry Andric bool Mode = supportsSPMDExecutionMode(CGM.getContext(), D); 8565f757f3fSDimitry Andric bool IsBareKernel = D.getSingleClause<OMPXBareClause>(); 8575f757f3fSDimitry Andric if (Mode || IsBareKernel) 858e8d8bef9SDimitry Andric emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry, 859e8d8bef9SDimitry Andric CodeGen); 860e8d8bef9SDimitry Andric else 861e8d8bef9SDimitry Andric emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry, 862e8d8bef9SDimitry Andric CodeGen); 863e8d8bef9SDimitry Andric } 864e8d8bef9SDimitry Andric 865e8d8bef9SDimitry Andric CGOpenMPRuntimeGPU::CGOpenMPRuntimeGPU(CodeGenModule &CGM) 866bdd1243dSDimitry Andric : CGOpenMPRuntime(CGM) { 8675f757f3fSDimitry Andric llvm::OpenMPIRBuilderConfig Config( 8685f757f3fSDimitry Andric CGM.getLangOpts().OpenMPIsTargetDevice, isGPU(), 8695f757f3fSDimitry Andric CGM.getLangOpts().OpenMPOffloadMandatory, 8705f757f3fSDimitry Andric /*HasRequiresReverseOffload*/ false, /*HasRequiresUnifiedAddress*/ false, 8715f757f3fSDimitry Andric hasRequiresUnifiedSharedMemory(), /*HasRequiresDynamicAllocators*/ false); 872bdd1243dSDimitry Andric OMPBuilder.setConfig(Config); 873bdd1243dSDimitry Andric 87406c3fb27SDimitry Andric if (!CGM.getLangOpts().OpenMPIsTargetDevice) 875349cc55cSDimitry Andric llvm_unreachable("OpenMP can only handle device code."); 876349cc55cSDimitry Andric 8775f757f3fSDimitry Andric if (CGM.getLangOpts().OpenMPCUDAMode) 8785f757f3fSDimitry Andric CurrentDataSharingMode = CGOpenMPRuntimeGPU::DS_CUDA; 8795f757f3fSDimitry Andric 880349cc55cSDimitry Andric llvm::OpenMPIRBuilder &OMPBuilder = getOMPBuilder(); 88181ad6265SDimitry Andric if (CGM.getLangOpts().NoGPULib || CGM.getLangOpts().OMPHostIRFile.empty()) 88281ad6265SDimitry Andric return; 88381ad6265SDimitry 
Andric 884349cc55cSDimitry Andric OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPTargetDebug, 885349cc55cSDimitry Andric "__omp_rtl_debug_kind"); 886349cc55cSDimitry Andric OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPTeamSubscription, 887349cc55cSDimitry Andric "__omp_rtl_assume_teams_oversubscription"); 888349cc55cSDimitry Andric OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPThreadSubscription, 889349cc55cSDimitry Andric "__omp_rtl_assume_threads_oversubscription"); 89081ad6265SDimitry Andric OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPNoThreadState, 89181ad6265SDimitry Andric "__omp_rtl_assume_no_thread_state"); 892bdd1243dSDimitry Andric OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPNoNestedParallelism, 893bdd1243dSDimitry Andric "__omp_rtl_assume_no_nested_parallelism"); 894e8d8bef9SDimitry Andric } 895e8d8bef9SDimitry Andric 896e8d8bef9SDimitry Andric void CGOpenMPRuntimeGPU::emitProcBindClause(CodeGenFunction &CGF, 897e8d8bef9SDimitry Andric ProcBindKind ProcBind, 898e8d8bef9SDimitry Andric SourceLocation Loc) { 8995f757f3fSDimitry Andric // Nothing to do. 900e8d8bef9SDimitry Andric } 901e8d8bef9SDimitry Andric 902e8d8bef9SDimitry Andric void CGOpenMPRuntimeGPU::emitNumThreadsClause(CodeGenFunction &CGF, 903e8d8bef9SDimitry Andric llvm::Value *NumThreads, 904e8d8bef9SDimitry Andric SourceLocation Loc) { 9050eae32dcSDimitry Andric // Nothing to do. 
906e8d8bef9SDimitry Andric } 907e8d8bef9SDimitry Andric 908e8d8bef9SDimitry Andric void CGOpenMPRuntimeGPU::emitNumTeamsClause(CodeGenFunction &CGF, 909e8d8bef9SDimitry Andric const Expr *NumTeams, 910e8d8bef9SDimitry Andric const Expr *ThreadLimit, 911e8d8bef9SDimitry Andric SourceLocation Loc) {} 912e8d8bef9SDimitry Andric 913e8d8bef9SDimitry Andric llvm::Function *CGOpenMPRuntimeGPU::emitParallelOutlinedFunction( 91406c3fb27SDimitry Andric CodeGenFunction &CGF, const OMPExecutableDirective &D, 91506c3fb27SDimitry Andric const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind, 91606c3fb27SDimitry Andric const RegionCodeGenTy &CodeGen) { 917e8d8bef9SDimitry Andric // Emit target region as a standalone region. 918e8d8bef9SDimitry Andric bool PrevIsInTTDRegion = IsInTTDRegion; 919e8d8bef9SDimitry Andric IsInTTDRegion = false; 920e8d8bef9SDimitry Andric auto *OutlinedFun = 921e8d8bef9SDimitry Andric cast<llvm::Function>(CGOpenMPRuntime::emitParallelOutlinedFunction( 92206c3fb27SDimitry Andric CGF, D, ThreadIDVar, InnermostKind, CodeGen)); 923e8d8bef9SDimitry Andric IsInTTDRegion = PrevIsInTTDRegion; 924bdd1243dSDimitry Andric if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD) { 925e8d8bef9SDimitry Andric llvm::Function *WrapperFun = 926e8d8bef9SDimitry Andric createParallelDataSharingWrapper(OutlinedFun, D); 927e8d8bef9SDimitry Andric WrapperFunctionsMap[OutlinedFun] = WrapperFun; 928e8d8bef9SDimitry Andric } 929e8d8bef9SDimitry Andric 930e8d8bef9SDimitry Andric return OutlinedFun; 931e8d8bef9SDimitry Andric } 932e8d8bef9SDimitry Andric 933e8d8bef9SDimitry Andric /// Get list of lastprivate variables from the teams distribute ... or 934e8d8bef9SDimitry Andric /// teams {distribute ...} directives. 
935e8d8bef9SDimitry Andric static void 936e8d8bef9SDimitry Andric getDistributeLastprivateVars(ASTContext &Ctx, const OMPExecutableDirective &D, 937e8d8bef9SDimitry Andric llvm::SmallVectorImpl<const ValueDecl *> &Vars) { 938e8d8bef9SDimitry Andric assert(isOpenMPTeamsDirective(D.getDirectiveKind()) && 939e8d8bef9SDimitry Andric "expected teams directive."); 940e8d8bef9SDimitry Andric const OMPExecutableDirective *Dir = &D; 941e8d8bef9SDimitry Andric if (!isOpenMPDistributeDirective(D.getDirectiveKind())) { 942e8d8bef9SDimitry Andric if (const Stmt *S = CGOpenMPRuntime::getSingleCompoundChild( 943e8d8bef9SDimitry Andric Ctx, 944e8d8bef9SDimitry Andric D.getInnermostCapturedStmt()->getCapturedStmt()->IgnoreContainers( 945e8d8bef9SDimitry Andric /*IgnoreCaptured=*/true))) { 946e8d8bef9SDimitry Andric Dir = dyn_cast_or_null<OMPExecutableDirective>(S); 947e8d8bef9SDimitry Andric if (Dir && !isOpenMPDistributeDirective(Dir->getDirectiveKind())) 948e8d8bef9SDimitry Andric Dir = nullptr; 949e8d8bef9SDimitry Andric } 950e8d8bef9SDimitry Andric } 951e8d8bef9SDimitry Andric if (!Dir) 952e8d8bef9SDimitry Andric return; 953e8d8bef9SDimitry Andric for (const auto *C : Dir->getClausesOfKind<OMPLastprivateClause>()) { 954e8d8bef9SDimitry Andric for (const Expr *E : C->getVarRefs()) 955e8d8bef9SDimitry Andric Vars.push_back(getPrivateItem(E)); 956e8d8bef9SDimitry Andric } 957e8d8bef9SDimitry Andric } 958e8d8bef9SDimitry Andric 959e8d8bef9SDimitry Andric /// Get list of reduction variables from the teams ... directives. 
/// Collects the private copies of all reduction-clause variables on the
/// teams directive \p D into \p Vars.
static void
getTeamsReductionVars(ASTContext &Ctx, const OMPExecutableDirective &D,
                      llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
  assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
         "expected teams directive.");
  for (const auto *C : D.getClausesOfKind<OMPReductionClause>()) {
    for (const Expr *E : C->privates())
      Vars.push_back(getPrivateItem(E));
  }
}

/// Outline a teams region. Variables that must be visible across the team
/// (teams reductions, and in SPMD mode distribute lastprivates) are
/// globalized: either a record describing them is built here (SPMD) or they
/// are stashed in TeamAndReductions for later processing (generic mode).
llvm::Function *CGOpenMPRuntimeGPU::emitTeamsOutlinedFunction(
    CodeGenFunction &CGF, const OMPExecutableDirective &D,
    const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind,
    const RegionCodeGenTy &CodeGen) {
  SourceLocation Loc = D.getBeginLoc();

  const RecordDecl *GlobalizedRD = nullptr;
  llvm::SmallVector<const ValueDecl *, 4> LastPrivatesReductions;
  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
  unsigned WarpSize = CGM.getTarget().getGridValue().GV_Warp_Size;
  // Globalize team reductions variable unconditionally in all modes.
  if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
    getTeamsReductionVars(CGM.getContext(), D, LastPrivatesReductions);
  if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {
    // SPMD mode: also globalize distribute lastprivates and build the
    // record describing all globalized variables right away.
    getDistributeLastprivateVars(CGM.getContext(), D, LastPrivatesReductions);
    if (!LastPrivatesReductions.empty()) {
      GlobalizedRD = ::buildRecordForGlobalizedVars(
          CGM.getContext(), std::nullopt, LastPrivatesReductions,
          MappedDeclsFields, WarpSize);
    }
  } else if (!LastPrivatesReductions.empty()) {
    // Generic mode: remember the captured teams region and its reduction
    // variables; globalization happens later.
    assert(!TeamAndReductions.first &&
           "Previous team declaration is not expected.");
    TeamAndReductions.first = D.getCapturedStmt(OMPD_teams)->getCapturedDecl();
    std::swap(TeamAndReductions.second, LastPrivatesReductions);
  }

  // Emit target region as a standalone region.
  class NVPTXPrePostActionTy : public PrePostActionTy {
    SourceLocation &Loc;
    const RecordDecl *GlobalizedRD;
    llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
        &MappedDeclsFields;

  public:
    NVPTXPrePostActionTy(
        SourceLocation &Loc, const RecordDecl *GlobalizedRD,
        llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
            &MappedDeclsFields)
        : Loc(Loc), GlobalizedRD(GlobalizedRD),
          MappedDeclsFields(MappedDeclsFields) {}
    void Enter(CodeGenFunction &CGF) override {
      auto &Rt =
          static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
      if (GlobalizedRD) {
        // Register the globalized variables for this function so the
        // data-sharing prolog can allocate and remap them.
        auto I = Rt.FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
        I->getSecond().MappedParams =
            std::make_unique<CodeGenFunction::OMPMapVars>();
        DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
        for (const auto &Pair : MappedDeclsFields) {
          assert(Pair.getFirst()->isCanonicalDecl() &&
                 "Expected canonical declaration");
          Data.insert(std::make_pair(Pair.getFirst(), MappedVarData()));
        }
      }
      Rt.emitGenericVarsProlog(CGF, Loc);
    }
    void Exit(CodeGenFunction &CGF) override {
      static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime())
          .emitGenericVarsEpilog(CGF);
    }
  } Action(Loc, GlobalizedRD, MappedDeclsFields);
  CodeGen.setAction(Action);
  llvm::Function *OutlinedFun = CGOpenMPRuntime::emitTeamsOutlinedFunction(
      CGF, D, ThreadIDVar, InnermostKind, CodeGen);

  return OutlinedFun;
}
/// Data-sharing prolog for generic mode: for every escaped variable recorded
/// for the current function, allocate device-shared storage via
/// __kmpc_alloc_shared and remap the variable to that storage. Escaped
/// parameters additionally have their incoming value copied into the shared
/// allocation. No-op outside DS_Generic mode or when the function has no
/// globalized declarations.
void CGOpenMPRuntimeGPU::emitGenericVarsProlog(CodeGenFunction &CGF,
                                               SourceLocation Loc) {
  if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic)
    return;

  CGBuilderTy &Bld = CGF.Builder;

  const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
  if (I == FunctionGlobalizedDecls.end())
    return;

  for (auto &Rec : I->getSecond().LocalVarData) {
    const auto *VD = cast<VarDecl>(Rec.first);
    bool EscapedParam = I->getSecond().EscapedParameters.count(Rec.first);
    QualType VarTy = VD->getType();

    // Get the local allocation of a firstprivate variable before sharing
    llvm::Value *ParValue;
    if (EscapedParam) {
      LValue ParLVal =
          CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(VD), VD->getType());
      ParValue = CGF.EmitLoadOfScalar(ParLVal, Loc);
    }

    // Allocate space for the variable to be globalized
    llvm::Value *AllocArgs[] = {CGF.getTypeSize(VD->getType())};
    llvm::CallBase *VoidPtr =
        CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                                CGM.getModule(), OMPRTL___kmpc_alloc_shared),
                            AllocArgs, VD->getName());
    // FIXME: We should use the variables actual alignment as an argument.
    VoidPtr->addRetAttr(llvm::Attribute::get(
        CGM.getLLVMContext(), llvm::Attribute::Alignment,
        CGM.getContext().getTargetInfo().getNewAlign() / 8));

    // Cast the void pointer and get the address of the globalized variable.
    llvm::PointerType *VarPtrTy = CGF.ConvertTypeForMem(VarTy)->getPointerTo();
    llvm::Value *CastedVoidPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
        VoidPtr, VarPtrTy, VD->getName() + "_on_stack");
    LValue VarAddr =
        CGF.MakeNaturalAlignPointeeRawAddrLValue(CastedVoidPtr, VarTy);
    // Remember both the remapped address and the raw allocation so the
    // epilog can free it with the matching size.
    Rec.second.PrivateAddr = VarAddr.getAddress();
    Rec.second.GlobalizedVal = VoidPtr;

    // Assign the local allocation to the newly globalized location.
    if (EscapedParam) {
      CGF.EmitStoreOfScalar(ParValue, VarAddr);
      I->getSecond().MappedParams->setVarAddr(CGF, VD, VarAddr.getAddress());
    }
    if (auto *DI = CGF.getDebugInfo())
      VoidPtr->setDebugLoc(DI->SourceLocToDebugLoc(VD->getLocation()));
  }

  // Variably-modified (VLA) escaped declarations get their own sized
  // allocations; record the (address, size) pairs for the epilog.
  for (const auto *ValueD : I->getSecond().EscapedVariableLengthDecls) {
    const auto *VD = cast<VarDecl>(ValueD);
    std::pair<llvm::Value *, llvm::Value *> AddrSizePair =
        getKmpcAllocShared(CGF, VD);
    I->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back(AddrSizePair);
    LValue Base = CGF.MakeAddrLValue(AddrSizePair.first, VD->getType(),
                                     CGM.getContext().getDeclAlign(VD),
                                     AlignmentSource::Decl);
    I->getSecond().MappedParams->setVarAddr(CGF, VD, Base.getAddress());
  }
  I->getSecond().MappedParams->apply(CGF);
}

/// Returns true when \p VD is recorded as a delayed variably-modified
/// declaration for the current function (i.e. its globalization is deferred
/// until its size expression becomes available).
bool CGOpenMPRuntimeGPU::isDelayedVariableLengthDecl(CodeGenFunction &CGF,
                                                     const VarDecl *VD) const {
  const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
  if (I == FunctionGlobalizedDecls.end())
    return false;

  // Check variable declaration is delayed:
  return llvm::is_contained(I->getSecond().DelayedVariableLengthDecls, VD);
}

/// Emits a __kmpc_alloc_shared call for the variably-modified variable
/// \p VD and returns the (address, size) pair. The size is rounded up to a
/// multiple of the declaration's alignment.
std::pair<llvm::Value *, llvm::Value *>
CGOpenMPRuntimeGPU::getKmpcAllocShared(CodeGenFunction &CGF,
                                       const VarDecl *VD) {
  CGBuilderTy &Bld = CGF.Builder;

  // Compute size and alignment.
  llvm::Value *Size = CGF.getTypeSize(VD->getType());
  CharUnits Align = CGM.getContext().getDeclAlign(VD);
  // Round the runtime size up to the next multiple of the alignment:
  // size = ((size + align - 1) / align) * align.
  Size = Bld.CreateNUWAdd(
      Size, llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity() - 1));
  llvm::Value *AlignVal =
      llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity());
  Size = Bld.CreateUDiv(Size, AlignVal);
  Size = Bld.CreateNUWMul(Size, AlignVal);

  // Allocate space for this VLA object to be globalized.
  llvm::Value *AllocArgs[] = {Size};
  llvm::CallBase *VoidPtr =
      CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                              CGM.getModule(), OMPRTL___kmpc_alloc_shared),
                          AllocArgs, VD->getName());
  VoidPtr->addRetAttr(llvm::Attribute::get(
      CGM.getLLVMContext(), llvm::Attribute::Alignment, Align.getQuantity()));

  return std::make_pair(VoidPtr, Size);
}

/// Emits the matching __kmpc_free_shared call for an (address, size) pair
/// previously produced by getKmpcAllocShared.
void CGOpenMPRuntimeGPU::getKmpcFreeShared(
    CodeGenFunction &CGF,
    const std::pair<llvm::Value *, llvm::Value *> &AddrSizePair) {
  // Deallocate the memory for each globalized VLA object
  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                          CGM.getModule(), OMPRTL___kmpc_free_shared),
                      {AddrSizePair.first, AddrSizePair.second});
}
CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( 114806c3fb27SDimitry Andric CGM.getModule(), OMPRTL___kmpc_free_shared), 114906c3fb27SDimitry Andric {AddrSizePair.first, AddrSizePair.second}); 1150e8d8bef9SDimitry Andric } 1151e8d8bef9SDimitry Andric 11525f757f3fSDimitry Andric void CGOpenMPRuntimeGPU::emitGenericVarsEpilog(CodeGenFunction &CGF) { 11535f757f3fSDimitry Andric if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic) 1154e8d8bef9SDimitry Andric return; 1155e8d8bef9SDimitry Andric 1156e8d8bef9SDimitry Andric const auto I = FunctionGlobalizedDecls.find(CGF.CurFn); 1157e8d8bef9SDimitry Andric if (I != FunctionGlobalizedDecls.end()) { 115806c3fb27SDimitry Andric // Deallocate the memory for each globalized VLA object that was 115906c3fb27SDimitry Andric // globalized in the prolog (i.e. emitGenericVarsProlog). 116006c3fb27SDimitry Andric for (const auto &AddrSizePair : 1161e8d8bef9SDimitry Andric llvm::reverse(I->getSecond().EscapedVariableLengthDeclsAddrs)) { 1162fe6060f1SDimitry Andric CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( 1163fe6060f1SDimitry Andric CGM.getModule(), OMPRTL___kmpc_free_shared), 1164fe6060f1SDimitry Andric {AddrSizePair.first, AddrSizePair.second}); 1165e8d8bef9SDimitry Andric } 1166fe6060f1SDimitry Andric // Deallocate the memory for each globalized value 1167fe6060f1SDimitry Andric for (auto &Rec : llvm::reverse(I->getSecond().LocalVarData)) { 1168fe6060f1SDimitry Andric const auto *VD = cast<VarDecl>(Rec.first); 1169fe6060f1SDimitry Andric I->getSecond().MappedParams->restore(CGF); 1170fe6060f1SDimitry Andric 1171fe6060f1SDimitry Andric llvm::Value *FreeArgs[] = {Rec.second.GlobalizedVal, 1172fe6060f1SDimitry Andric CGF.getTypeSize(VD->getType())}; 1173fe6060f1SDimitry Andric CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( 1174fe6060f1SDimitry Andric CGM.getModule(), OMPRTL___kmpc_free_shared), 1175fe6060f1SDimitry Andric FreeArgs); 1176e8d8bef9SDimitry Andric } 1177e8d8bef9SDimitry Andric 
} 1178e8d8bef9SDimitry Andric } 1179e8d8bef9SDimitry Andric 1180e8d8bef9SDimitry Andric void CGOpenMPRuntimeGPU::emitTeamsCall(CodeGenFunction &CGF, 1181e8d8bef9SDimitry Andric const OMPExecutableDirective &D, 1182e8d8bef9SDimitry Andric SourceLocation Loc, 1183e8d8bef9SDimitry Andric llvm::Function *OutlinedFn, 1184e8d8bef9SDimitry Andric ArrayRef<llvm::Value *> CapturedVars) { 1185e8d8bef9SDimitry Andric if (!CGF.HaveInsertPoint()) 1186e8d8bef9SDimitry Andric return; 1187e8d8bef9SDimitry Andric 11885f757f3fSDimitry Andric bool IsBareKernel = D.getSingleClause<OMPXBareClause>(); 11895f757f3fSDimitry Andric 11900fca6ea1SDimitry Andric RawAddress ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty, 1191e8d8bef9SDimitry Andric /*Name=*/".zero.addr"); 1192349cc55cSDimitry Andric CGF.Builder.CreateStore(CGF.Builder.getInt32(/*C*/ 0), ZeroAddr); 1193e8d8bef9SDimitry Andric llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs; 11945f757f3fSDimitry Andric // We don't emit any thread id function call in bare kernel, but because the 11955f757f3fSDimitry Andric // outlined function has a pointer argument, we emit a nullptr here. 
11965f757f3fSDimitry Andric if (IsBareKernel) 11975f757f3fSDimitry Andric OutlinedFnArgs.push_back(llvm::ConstantPointerNull::get(CGM.VoidPtrTy)); 11985f757f3fSDimitry Andric else 11990fca6ea1SDimitry Andric OutlinedFnArgs.push_back(emitThreadIDAddress(CGF, Loc).emitRawPointer(CGF)); 1200e8d8bef9SDimitry Andric OutlinedFnArgs.push_back(ZeroAddr.getPointer()); 1201e8d8bef9SDimitry Andric OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end()); 1202e8d8bef9SDimitry Andric emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs); 1203e8d8bef9SDimitry Andric } 1204e8d8bef9SDimitry Andric 1205fe6060f1SDimitry Andric void CGOpenMPRuntimeGPU::emitParallelCall(CodeGenFunction &CGF, 1206fe6060f1SDimitry Andric SourceLocation Loc, 1207fe6060f1SDimitry Andric llvm::Function *OutlinedFn, 1208fe6060f1SDimitry Andric ArrayRef<llvm::Value *> CapturedVars, 12090eae32dcSDimitry Andric const Expr *IfCond, 12100eae32dcSDimitry Andric llvm::Value *NumThreads) { 1211e8d8bef9SDimitry Andric if (!CGF.HaveInsertPoint()) 1212e8d8bef9SDimitry Andric return; 1213e8d8bef9SDimitry Andric 12140eae32dcSDimitry Andric auto &&ParallelGen = [this, Loc, OutlinedFn, CapturedVars, IfCond, 12150eae32dcSDimitry Andric NumThreads](CodeGenFunction &CGF, 12160eae32dcSDimitry Andric PrePostActionTy &Action) { 1217e8d8bef9SDimitry Andric CGBuilderTy &Bld = CGF.Builder; 12180eae32dcSDimitry Andric llvm::Value *NumThreadsVal = NumThreads; 1219fe6060f1SDimitry Andric llvm::Function *WFn = WrapperFunctionsMap[OutlinedFn]; 1220fe6060f1SDimitry Andric llvm::Value *ID = llvm::ConstantPointerNull::get(CGM.Int8PtrTy); 1221fe6060f1SDimitry Andric if (WFn) 1222fe6060f1SDimitry Andric ID = Bld.CreateBitOrPointerCast(WFn, CGM.Int8PtrTy); 1223fe6060f1SDimitry Andric llvm::Value *FnPtr = Bld.CreateBitOrPointerCast(OutlinedFn, CGM.Int8PtrTy); 1224e8d8bef9SDimitry Andric 1225e8d8bef9SDimitry Andric // Create a private scope that will globalize the arguments 1226e8d8bef9SDimitry Andric // passed from the outside 
of the target region. 1227fe6060f1SDimitry Andric // TODO: Is that needed? 1228e8d8bef9SDimitry Andric CodeGenFunction::OMPPrivateScope PrivateArgScope(CGF); 1229e8d8bef9SDimitry Andric 1230fe6060f1SDimitry Andric Address CapturedVarsAddrs = CGF.CreateDefaultAlignTempAlloca( 1231fe6060f1SDimitry Andric llvm::ArrayType::get(CGM.VoidPtrTy, CapturedVars.size()), 1232fe6060f1SDimitry Andric "captured_vars_addrs"); 1233e8d8bef9SDimitry Andric // There's something to share. 1234e8d8bef9SDimitry Andric if (!CapturedVars.empty()) { 1235e8d8bef9SDimitry Andric // Prepare for parallel region. Indicate the outlined function. 1236e8d8bef9SDimitry Andric ASTContext &Ctx = CGF.getContext(); 1237fe6060f1SDimitry Andric unsigned Idx = 0; 1238e8d8bef9SDimitry Andric for (llvm::Value *V : CapturedVars) { 1239fe6060f1SDimitry Andric Address Dst = Bld.CreateConstArrayGEP(CapturedVarsAddrs, Idx); 1240e8d8bef9SDimitry Andric llvm::Value *PtrV; 1241e8d8bef9SDimitry Andric if (V->getType()->isIntegerTy()) 1242e8d8bef9SDimitry Andric PtrV = Bld.CreateIntToPtr(V, CGF.VoidPtrTy); 1243e8d8bef9SDimitry Andric else 1244e8d8bef9SDimitry Andric PtrV = Bld.CreatePointerBitCastOrAddrSpaceCast(V, CGF.VoidPtrTy); 1245e8d8bef9SDimitry Andric CGF.EmitStoreOfScalar(PtrV, Dst, /*Volatile=*/false, 1246e8d8bef9SDimitry Andric Ctx.getPointerType(Ctx.VoidPtrTy)); 1247e8d8bef9SDimitry Andric ++Idx; 1248e8d8bef9SDimitry Andric } 1249e8d8bef9SDimitry Andric } 1250e8d8bef9SDimitry Andric 1251fe6060f1SDimitry Andric llvm::Value *IfCondVal = nullptr; 1252fe6060f1SDimitry Andric if (IfCond) 1253fe6060f1SDimitry Andric IfCondVal = Bld.CreateIntCast(CGF.EvaluateExprAsBool(IfCond), CGF.Int32Ty, 1254fe6060f1SDimitry Andric /* isSigned */ false); 1255fe6060f1SDimitry Andric else 1256fe6060f1SDimitry Andric IfCondVal = llvm::ConstantInt::get(CGF.Int32Ty, 1); 1257e8d8bef9SDimitry Andric 12580eae32dcSDimitry Andric if (!NumThreadsVal) 12590eae32dcSDimitry Andric NumThreadsVal = llvm::ConstantInt::get(CGF.Int32Ty, -1); 
12600eae32dcSDimitry Andric else 12610eae32dcSDimitry Andric NumThreadsVal = Bld.CreateZExtOrTrunc(NumThreadsVal, CGF.Int32Ty), 12620eae32dcSDimitry Andric 1263fe6060f1SDimitry Andric assert(IfCondVal && "Expected a value"); 1264fe6060f1SDimitry Andric llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc); 1265fe6060f1SDimitry Andric llvm::Value *Args[] = { 1266fe6060f1SDimitry Andric RTLoc, 1267fe6060f1SDimitry Andric getThreadID(CGF, Loc), 1268fe6060f1SDimitry Andric IfCondVal, 12690eae32dcSDimitry Andric NumThreadsVal, 1270fe6060f1SDimitry Andric llvm::ConstantInt::get(CGF.Int32Ty, -1), 1271fe6060f1SDimitry Andric FnPtr, 1272fe6060f1SDimitry Andric ID, 12730fca6ea1SDimitry Andric Bld.CreateBitOrPointerCast(CapturedVarsAddrs.emitRawPointer(CGF), 1274fe6060f1SDimitry Andric CGF.VoidPtrPtrTy), 1275fe6060f1SDimitry Andric llvm::ConstantInt::get(CGM.SizeTy, CapturedVars.size())}; 1276e8d8bef9SDimitry Andric CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( 1277fe6060f1SDimitry Andric CGM.getModule(), OMPRTL___kmpc_parallel_51), 1278e8d8bef9SDimitry Andric Args); 1279e8d8bef9SDimitry Andric }; 1280e8d8bef9SDimitry Andric 1281fe6060f1SDimitry Andric RegionCodeGenTy RCG(ParallelGen); 1282e8d8bef9SDimitry Andric RCG(CGF); 1283e8d8bef9SDimitry Andric } 1284e8d8bef9SDimitry Andric 1285e8d8bef9SDimitry Andric void CGOpenMPRuntimeGPU::syncCTAThreads(CodeGenFunction &CGF) { 1286e8d8bef9SDimitry Andric // Always emit simple barriers! 1287e8d8bef9SDimitry Andric if (!CGF.HaveInsertPoint()) 1288e8d8bef9SDimitry Andric return; 1289e8d8bef9SDimitry Andric // Build call __kmpc_barrier_simple_spmd(nullptr, 0); 1290e8d8bef9SDimitry Andric // This function does not use parameters, so we can emit just default values. 
1291e8d8bef9SDimitry Andric llvm::Value *Args[] = { 1292e8d8bef9SDimitry Andric llvm::ConstantPointerNull::get( 1293e8d8bef9SDimitry Andric cast<llvm::PointerType>(getIdentTyPointerTy())), 1294e8d8bef9SDimitry Andric llvm::ConstantInt::get(CGF.Int32Ty, /*V=*/0, /*isSigned=*/true)}; 1295e8d8bef9SDimitry Andric CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( 1296e8d8bef9SDimitry Andric CGM.getModule(), OMPRTL___kmpc_barrier_simple_spmd), 1297e8d8bef9SDimitry Andric Args); 1298e8d8bef9SDimitry Andric } 1299e8d8bef9SDimitry Andric 1300e8d8bef9SDimitry Andric void CGOpenMPRuntimeGPU::emitBarrierCall(CodeGenFunction &CGF, 1301e8d8bef9SDimitry Andric SourceLocation Loc, 1302e8d8bef9SDimitry Andric OpenMPDirectiveKind Kind, bool, 1303e8d8bef9SDimitry Andric bool) { 1304e8d8bef9SDimitry Andric // Always emit simple barriers! 1305e8d8bef9SDimitry Andric if (!CGF.HaveInsertPoint()) 1306e8d8bef9SDimitry Andric return; 1307e8d8bef9SDimitry Andric // Build call __kmpc_cancel_barrier(loc, thread_id); 1308e8d8bef9SDimitry Andric unsigned Flags = getDefaultFlagsForBarriers(Kind); 1309e8d8bef9SDimitry Andric llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc, Flags), 1310e8d8bef9SDimitry Andric getThreadID(CGF, Loc)}; 1311e8d8bef9SDimitry Andric 1312e8d8bef9SDimitry Andric CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( 1313e8d8bef9SDimitry Andric CGM.getModule(), OMPRTL___kmpc_barrier), 1314e8d8bef9SDimitry Andric Args); 1315e8d8bef9SDimitry Andric } 1316e8d8bef9SDimitry Andric 1317e8d8bef9SDimitry Andric void CGOpenMPRuntimeGPU::emitCriticalRegion( 1318e8d8bef9SDimitry Andric CodeGenFunction &CGF, StringRef CriticalName, 1319e8d8bef9SDimitry Andric const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc, 1320e8d8bef9SDimitry Andric const Expr *Hint) { 1321e8d8bef9SDimitry Andric llvm::BasicBlock *LoopBB = CGF.createBasicBlock("omp.critical.loop"); 1322e8d8bef9SDimitry Andric llvm::BasicBlock *TestBB = CGF.createBasicBlock("omp.critical.test"); 
1323e8d8bef9SDimitry Andric llvm::BasicBlock *SyncBB = CGF.createBasicBlock("omp.critical.sync"); 1324e8d8bef9SDimitry Andric llvm::BasicBlock *BodyBB = CGF.createBasicBlock("omp.critical.body"); 1325e8d8bef9SDimitry Andric llvm::BasicBlock *ExitBB = CGF.createBasicBlock("omp.critical.exit"); 1326e8d8bef9SDimitry Andric 1327e8d8bef9SDimitry Andric auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime()); 1328e8d8bef9SDimitry Andric 1329e8d8bef9SDimitry Andric // Get the mask of active threads in the warp. 1330e8d8bef9SDimitry Andric llvm::Value *Mask = CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( 1331e8d8bef9SDimitry Andric CGM.getModule(), OMPRTL___kmpc_warp_active_thread_mask)); 1332e8d8bef9SDimitry Andric // Fetch team-local id of the thread. 1333e8d8bef9SDimitry Andric llvm::Value *ThreadID = RT.getGPUThreadID(CGF); 1334e8d8bef9SDimitry Andric 1335e8d8bef9SDimitry Andric // Get the width of the team. 1336e8d8bef9SDimitry Andric llvm::Value *TeamWidth = RT.getGPUNumThreads(CGF); 1337e8d8bef9SDimitry Andric 1338e8d8bef9SDimitry Andric // Initialize the counter variable for the loop. 1339e8d8bef9SDimitry Andric QualType Int32Ty = 1340e8d8bef9SDimitry Andric CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/0); 1341e8d8bef9SDimitry Andric Address Counter = CGF.CreateMemTemp(Int32Ty, "critical_counter"); 1342e8d8bef9SDimitry Andric LValue CounterLVal = CGF.MakeAddrLValue(Counter, Int32Ty); 1343e8d8bef9SDimitry Andric CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.Int32Ty), CounterLVal, 1344e8d8bef9SDimitry Andric /*isInit=*/true); 1345e8d8bef9SDimitry Andric 1346e8d8bef9SDimitry Andric // Block checks if loop counter exceeds upper bound. 
1347e8d8bef9SDimitry Andric CGF.EmitBlock(LoopBB); 1348e8d8bef9SDimitry Andric llvm::Value *CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc); 1349e8d8bef9SDimitry Andric llvm::Value *CmpLoopBound = CGF.Builder.CreateICmpSLT(CounterVal, TeamWidth); 1350e8d8bef9SDimitry Andric CGF.Builder.CreateCondBr(CmpLoopBound, TestBB, ExitBB); 1351e8d8bef9SDimitry Andric 1352e8d8bef9SDimitry Andric // Block tests which single thread should execute region, and which threads 1353e8d8bef9SDimitry Andric // should go straight to synchronisation point. 1354e8d8bef9SDimitry Andric CGF.EmitBlock(TestBB); 1355e8d8bef9SDimitry Andric CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc); 1356e8d8bef9SDimitry Andric llvm::Value *CmpThreadToCounter = 1357e8d8bef9SDimitry Andric CGF.Builder.CreateICmpEQ(ThreadID, CounterVal); 1358e8d8bef9SDimitry Andric CGF.Builder.CreateCondBr(CmpThreadToCounter, BodyBB, SyncBB); 1359e8d8bef9SDimitry Andric 1360e8d8bef9SDimitry Andric // Block emits the body of the critical region. 1361e8d8bef9SDimitry Andric CGF.EmitBlock(BodyBB); 1362e8d8bef9SDimitry Andric 1363e8d8bef9SDimitry Andric // Output the critical statement. 1364e8d8bef9SDimitry Andric CGOpenMPRuntime::emitCriticalRegion(CGF, CriticalName, CriticalOpGen, Loc, 1365e8d8bef9SDimitry Andric Hint); 1366e8d8bef9SDimitry Andric 1367e8d8bef9SDimitry Andric // After the body surrounded by the critical region, the single executing 1368e8d8bef9SDimitry Andric // thread will jump to the synchronisation point. 1369e8d8bef9SDimitry Andric // Block waits for all threads in current team to finish then increments the 1370e8d8bef9SDimitry Andric // counter variable and returns to the loop. 1371e8d8bef9SDimitry Andric CGF.EmitBlock(SyncBB); 1372e8d8bef9SDimitry Andric // Reconverge active threads in the warp. 
1373e8d8bef9SDimitry Andric (void)CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( 1374e8d8bef9SDimitry Andric CGM.getModule(), OMPRTL___kmpc_syncwarp), 1375e8d8bef9SDimitry Andric Mask); 1376e8d8bef9SDimitry Andric 1377e8d8bef9SDimitry Andric llvm::Value *IncCounterVal = 1378e8d8bef9SDimitry Andric CGF.Builder.CreateNSWAdd(CounterVal, CGF.Builder.getInt32(1)); 1379e8d8bef9SDimitry Andric CGF.EmitStoreOfScalar(IncCounterVal, CounterLVal); 1380e8d8bef9SDimitry Andric CGF.EmitBranch(LoopBB); 1381e8d8bef9SDimitry Andric 1382e8d8bef9SDimitry Andric // Block that is reached when all threads in the team complete the region. 1383e8d8bef9SDimitry Andric CGF.EmitBlock(ExitBB, /*IsFinished=*/true); 1384e8d8bef9SDimitry Andric } 1385e8d8bef9SDimitry Andric 1386e8d8bef9SDimitry Andric /// Cast value to the specified type. 1387e8d8bef9SDimitry Andric static llvm::Value *castValueToType(CodeGenFunction &CGF, llvm::Value *Val, 1388e8d8bef9SDimitry Andric QualType ValTy, QualType CastTy, 1389e8d8bef9SDimitry Andric SourceLocation Loc) { 1390e8d8bef9SDimitry Andric assert(!CGF.getContext().getTypeSizeInChars(CastTy).isZero() && 1391e8d8bef9SDimitry Andric "Cast type must sized."); 1392e8d8bef9SDimitry Andric assert(!CGF.getContext().getTypeSizeInChars(ValTy).isZero() && 1393e8d8bef9SDimitry Andric "Val type must sized."); 1394e8d8bef9SDimitry Andric llvm::Type *LLVMCastTy = CGF.ConvertTypeForMem(CastTy); 1395e8d8bef9SDimitry Andric if (ValTy == CastTy) 1396e8d8bef9SDimitry Andric return Val; 1397e8d8bef9SDimitry Andric if (CGF.getContext().getTypeSizeInChars(ValTy) == 1398e8d8bef9SDimitry Andric CGF.getContext().getTypeSizeInChars(CastTy)) 1399e8d8bef9SDimitry Andric return CGF.Builder.CreateBitCast(Val, LLVMCastTy); 1400e8d8bef9SDimitry Andric if (CastTy->isIntegerType() && ValTy->isIntegerType()) 1401e8d8bef9SDimitry Andric return CGF.Builder.CreateIntCast(Val, LLVMCastTy, 1402e8d8bef9SDimitry Andric CastTy->hasSignedIntegerRepresentation()); 1403e8d8bef9SDimitry Andric 
Address CastItem = CGF.CreateMemTemp(CastTy); 14045f757f3fSDimitry Andric Address ValCastItem = CastItem.withElementType(Val->getType()); 1405e8d8bef9SDimitry Andric CGF.EmitStoreOfScalar(Val, ValCastItem, /*Volatile=*/false, ValTy, 1406e8d8bef9SDimitry Andric LValueBaseInfo(AlignmentSource::Type), 1407e8d8bef9SDimitry Andric TBAAAccessInfo()); 1408e8d8bef9SDimitry Andric return CGF.EmitLoadOfScalar(CastItem, /*Volatile=*/false, CastTy, Loc, 1409e8d8bef9SDimitry Andric LValueBaseInfo(AlignmentSource::Type), 1410e8d8bef9SDimitry Andric TBAAAccessInfo()); 1411e8d8bef9SDimitry Andric } 1412e8d8bef9SDimitry Andric 1413e8d8bef9SDimitry Andric /// 1414e8d8bef9SDimitry Andric /// Design of OpenMP reductions on the GPU 1415e8d8bef9SDimitry Andric /// 1416e8d8bef9SDimitry Andric /// Consider a typical OpenMP program with one or more reduction 1417e8d8bef9SDimitry Andric /// clauses: 1418e8d8bef9SDimitry Andric /// 1419e8d8bef9SDimitry Andric /// float foo; 1420e8d8bef9SDimitry Andric /// double bar; 1421e8d8bef9SDimitry Andric /// #pragma omp target teams distribute parallel for \ 1422e8d8bef9SDimitry Andric /// reduction(+:foo) reduction(*:bar) 1423e8d8bef9SDimitry Andric /// for (int i = 0; i < N; i++) { 1424e8d8bef9SDimitry Andric /// foo += A[i]; bar *= B[i]; 1425e8d8bef9SDimitry Andric /// } 1426e8d8bef9SDimitry Andric /// 1427e8d8bef9SDimitry Andric /// where 'foo' and 'bar' are reduced across all OpenMP threads in 1428e8d8bef9SDimitry Andric /// all teams. In our OpenMP implementation on the NVPTX device an 1429e8d8bef9SDimitry Andric /// OpenMP team is mapped to a CUDA threadblock and OpenMP threads 1430e8d8bef9SDimitry Andric /// within a team are mapped to CUDA threads within a threadblock. 
1431e8d8bef9SDimitry Andric /// Our goal is to efficiently aggregate values across all OpenMP 1432e8d8bef9SDimitry Andric /// threads such that: 1433e8d8bef9SDimitry Andric /// 1434e8d8bef9SDimitry Andric /// - the compiler and runtime are logically concise, and 1435e8d8bef9SDimitry Andric /// - the reduction is performed efficiently in a hierarchical 1436e8d8bef9SDimitry Andric /// manner as follows: within OpenMP threads in the same warp, 1437e8d8bef9SDimitry Andric /// across warps in a threadblock, and finally across teams on 1438e8d8bef9SDimitry Andric /// the NVPTX device. 1439e8d8bef9SDimitry Andric /// 1440e8d8bef9SDimitry Andric /// Introduction to Decoupling 1441e8d8bef9SDimitry Andric /// 1442e8d8bef9SDimitry Andric /// We would like to decouple the compiler and the runtime so that the 1443e8d8bef9SDimitry Andric /// latter is ignorant of the reduction variables (number, data types) 1444e8d8bef9SDimitry Andric /// and the reduction operators. This allows a simpler interface 1445e8d8bef9SDimitry Andric /// and implementation while still attaining good performance. 1446e8d8bef9SDimitry Andric /// 1447e8d8bef9SDimitry Andric /// Pseudocode for the aforementioned OpenMP program generated by the 1448e8d8bef9SDimitry Andric /// compiler is as follows: 1449e8d8bef9SDimitry Andric /// 1450e8d8bef9SDimitry Andric /// 1. Create private copies of reduction variables on each OpenMP 1451e8d8bef9SDimitry Andric /// thread: 'foo_private', 'bar_private' 1452e8d8bef9SDimitry Andric /// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned 1453e8d8bef9SDimitry Andric /// to it and writes the result in 'foo_private' and 'bar_private' 1454e8d8bef9SDimitry Andric /// respectively. 1455e8d8bef9SDimitry Andric /// 3. 
Call the OpenMP runtime on the GPU to reduce within a team 1456e8d8bef9SDimitry Andric /// and store the result on the team master: 1457e8d8bef9SDimitry Andric /// 1458e8d8bef9SDimitry Andric /// __kmpc_nvptx_parallel_reduce_nowait_v2(..., 1459e8d8bef9SDimitry Andric /// reduceData, shuffleReduceFn, interWarpCpyFn) 1460e8d8bef9SDimitry Andric /// 1461e8d8bef9SDimitry Andric /// where: 1462e8d8bef9SDimitry Andric /// struct ReduceData { 1463e8d8bef9SDimitry Andric /// double *foo; 1464e8d8bef9SDimitry Andric /// double *bar; 1465e8d8bef9SDimitry Andric /// } reduceData 1466e8d8bef9SDimitry Andric /// reduceData.foo = &foo_private 1467e8d8bef9SDimitry Andric /// reduceData.bar = &bar_private 1468e8d8bef9SDimitry Andric /// 1469e8d8bef9SDimitry Andric /// 'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two 1470e8d8bef9SDimitry Andric /// auxiliary functions generated by the compiler that operate on 1471e8d8bef9SDimitry Andric /// variables of type 'ReduceData'. They aid the runtime perform 1472e8d8bef9SDimitry Andric /// algorithmic steps in a data agnostic manner. 1473e8d8bef9SDimitry Andric /// 1474e8d8bef9SDimitry Andric /// 'shuffleReduceFn' is a pointer to a function that reduces data 1475e8d8bef9SDimitry Andric /// of type 'ReduceData' across two OpenMP threads (lanes) in the 1476e8d8bef9SDimitry Andric /// same warp. It takes the following arguments as input: 1477e8d8bef9SDimitry Andric /// 1478e8d8bef9SDimitry Andric /// a. variable of type 'ReduceData' on the calling lane, 1479e8d8bef9SDimitry Andric /// b. its lane_id, 1480e8d8bef9SDimitry Andric /// c. an offset relative to the current lane_id to generate a 1481e8d8bef9SDimitry Andric /// remote_lane_id. The remote lane contains the second 1482e8d8bef9SDimitry Andric /// variable of type 'ReduceData' that is to be reduced. 1483e8d8bef9SDimitry Andric /// d. an algorithm version parameter determining which reduction 1484e8d8bef9SDimitry Andric /// algorithm to use. 
1485e8d8bef9SDimitry Andric /// 1486e8d8bef9SDimitry Andric /// 'shuffleReduceFn' retrieves data from the remote lane using 1487e8d8bef9SDimitry Andric /// efficient GPU shuffle intrinsics and reduces, using the 1488e8d8bef9SDimitry Andric /// algorithm specified by the 4th parameter, the two operands 1489e8d8bef9SDimitry Andric /// element-wise. The result is written to the first operand. 1490e8d8bef9SDimitry Andric /// 1491e8d8bef9SDimitry Andric /// Different reduction algorithms are implemented in different 1492e8d8bef9SDimitry Andric /// runtime functions, all calling 'shuffleReduceFn' to perform 1493e8d8bef9SDimitry Andric /// the essential reduction step. Therefore, based on the 4th 1494e8d8bef9SDimitry Andric /// parameter, this function behaves slightly differently to 1495e8d8bef9SDimitry Andric /// cooperate with the runtime to ensure correctness under 1496e8d8bef9SDimitry Andric /// different circumstances. 1497e8d8bef9SDimitry Andric /// 1498e8d8bef9SDimitry Andric /// 'InterWarpCpyFn' is a pointer to a function that transfers 1499e8d8bef9SDimitry Andric /// reduced variables across warps. It tunnels, through CUDA 1500e8d8bef9SDimitry Andric /// shared memory, the thread-private data of type 'ReduceData' 1501e8d8bef9SDimitry Andric /// from lane 0 of each warp to a lane in the first warp. 1502e8d8bef9SDimitry Andric /// 4. Call the OpenMP runtime on the GPU to reduce across teams. 1503e8d8bef9SDimitry Andric /// The last team writes the global reduced value to memory. 
1504e8d8bef9SDimitry Andric /// 1505e8d8bef9SDimitry Andric /// ret = __kmpc_nvptx_teams_reduce_nowait(..., 1506e8d8bef9SDimitry Andric /// reduceData, shuffleReduceFn, interWarpCpyFn, 1507e8d8bef9SDimitry Andric /// scratchpadCopyFn, loadAndReduceFn) 1508e8d8bef9SDimitry Andric /// 1509e8d8bef9SDimitry Andric /// 'scratchpadCopyFn' is a helper that stores reduced 1510e8d8bef9SDimitry Andric /// data from the team master to a scratchpad array in 1511e8d8bef9SDimitry Andric /// global memory. 1512e8d8bef9SDimitry Andric /// 1513e8d8bef9SDimitry Andric /// 'loadAndReduceFn' is a helper that loads data from 1514e8d8bef9SDimitry Andric /// the scratchpad array and reduces it with the input 1515e8d8bef9SDimitry Andric /// operand. 1516e8d8bef9SDimitry Andric /// 1517e8d8bef9SDimitry Andric /// These compiler generated functions hide address 1518e8d8bef9SDimitry Andric /// calculation and alignment information from the runtime. 1519e8d8bef9SDimitry Andric /// 5. if ret == 1: 1520e8d8bef9SDimitry Andric /// The team master of the last team stores the reduced 1521e8d8bef9SDimitry Andric /// result to the globals in memory. 
1522e8d8bef9SDimitry Andric /// foo += reduceData.foo; bar *= reduceData.bar 1523e8d8bef9SDimitry Andric /// 1524e8d8bef9SDimitry Andric /// 1525e8d8bef9SDimitry Andric /// Warp Reduction Algorithms 1526e8d8bef9SDimitry Andric /// 1527e8d8bef9SDimitry Andric /// On the warp level, we have three algorithms implemented in the 1528e8d8bef9SDimitry Andric /// OpenMP runtime depending on the number of active lanes: 1529e8d8bef9SDimitry Andric /// 1530e8d8bef9SDimitry Andric /// Full Warp Reduction 1531e8d8bef9SDimitry Andric /// 1532e8d8bef9SDimitry Andric /// The reduce algorithm within a warp where all lanes are active 1533e8d8bef9SDimitry Andric /// is implemented in the runtime as follows: 1534e8d8bef9SDimitry Andric /// 1535e8d8bef9SDimitry Andric /// full_warp_reduce(void *reduce_data, 1536e8d8bef9SDimitry Andric /// kmp_ShuffleReductFctPtr ShuffleReduceFn) { 1537e8d8bef9SDimitry Andric /// for (int offset = WARPSIZE/2; offset > 0; offset /= 2) 1538e8d8bef9SDimitry Andric /// ShuffleReduceFn(reduce_data, 0, offset, 0); 1539e8d8bef9SDimitry Andric /// } 1540e8d8bef9SDimitry Andric /// 1541e8d8bef9SDimitry Andric /// The algorithm completes in log(2, WARPSIZE) steps. 1542e8d8bef9SDimitry Andric /// 1543e8d8bef9SDimitry Andric /// 'ShuffleReduceFn' is used here with lane_id set to 0 because it is 1544e8d8bef9SDimitry Andric /// not used therefore we save instructions by not retrieving lane_id 1545e8d8bef9SDimitry Andric /// from the corresponding special registers. The 4th parameter, which 1546e8d8bef9SDimitry Andric /// represents the version of the algorithm being used, is set to 0 to 1547e8d8bef9SDimitry Andric /// signify full warp reduction. 
1548e8d8bef9SDimitry Andric /// 1549e8d8bef9SDimitry Andric /// In this version, 'ShuffleReduceFn' behaves, per element, as follows: 1550e8d8bef9SDimitry Andric /// 1551e8d8bef9SDimitry Andric /// #reduce_elem refers to an element in the local lane's data structure 1552e8d8bef9SDimitry Andric /// #remote_elem is retrieved from a remote lane 1553e8d8bef9SDimitry Andric /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE); 1554e8d8bef9SDimitry Andric /// reduce_elem = reduce_elem REDUCE_OP remote_elem; 1555e8d8bef9SDimitry Andric /// 1556e8d8bef9SDimitry Andric /// Contiguous Partial Warp Reduction 1557e8d8bef9SDimitry Andric /// 1558e8d8bef9SDimitry Andric /// This reduce algorithm is used within a warp where only the first 1559e8d8bef9SDimitry Andric /// 'n' (n <= WARPSIZE) lanes are active. It is typically used when the 1560e8d8bef9SDimitry Andric /// number of OpenMP threads in a parallel region is not a multiple of 1561e8d8bef9SDimitry Andric /// WARPSIZE. The algorithm is implemented in the runtime as follows: 1562e8d8bef9SDimitry Andric /// 1563e8d8bef9SDimitry Andric /// void 1564e8d8bef9SDimitry Andric /// contiguous_partial_reduce(void *reduce_data, 1565e8d8bef9SDimitry Andric /// kmp_ShuffleReductFctPtr ShuffleReduceFn, 1566e8d8bef9SDimitry Andric /// int size, int lane_id) { 1567e8d8bef9SDimitry Andric /// int curr_size; 1568e8d8bef9SDimitry Andric /// int offset; 1569e8d8bef9SDimitry Andric /// curr_size = size; 1570e8d8bef9SDimitry Andric /// mask = curr_size/2; 1571e8d8bef9SDimitry Andric /// while (offset>0) { 1572e8d8bef9SDimitry Andric /// ShuffleReduceFn(reduce_data, lane_id, offset, 1); 1573e8d8bef9SDimitry Andric /// curr_size = (curr_size+1)/2; 1574e8d8bef9SDimitry Andric /// offset = curr_size/2; 1575e8d8bef9SDimitry Andric /// } 1576e8d8bef9SDimitry Andric /// } 1577e8d8bef9SDimitry Andric /// 1578e8d8bef9SDimitry Andric /// In this version, 'ShuffleReduceFn' behaves, per element, as follows: 1579e8d8bef9SDimitry Andric /// 
1580e8d8bef9SDimitry Andric /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
1581e8d8bef9SDimitry Andric /// if (lane_id < offset)
1582e8d8bef9SDimitry Andric /// reduce_elem = reduce_elem REDUCE_OP remote_elem
1583e8d8bef9SDimitry Andric /// else
1584e8d8bef9SDimitry Andric /// reduce_elem = remote_elem
1585e8d8bef9SDimitry Andric ///
1586e8d8bef9SDimitry Andric /// This algorithm assumes that the data to be reduced are located in a
1587e8d8bef9SDimitry Andric /// contiguous subset of lanes starting from the first. When there is
1588e8d8bef9SDimitry Andric /// an odd number of active lanes, the data in the last lane is not
1589e8d8bef9SDimitry Andric /// aggregated with any other lane's data but is instead copied over.
1590e8d8bef9SDimitry Andric ///
1591e8d8bef9SDimitry Andric /// Dispersed Partial Warp Reduction
1592e8d8bef9SDimitry Andric ///
1593e8d8bef9SDimitry Andric /// This algorithm is used within a warp when any discontiguous subset of
1594e8d8bef9SDimitry Andric /// lanes are active. It is used to implement the reduction operation
1595e8d8bef9SDimitry Andric /// across lanes in an OpenMP simd region or in a nested parallel region.
1596e8d8bef9SDimitry Andric ///
1597e8d8bef9SDimitry Andric /// void
1598e8d8bef9SDimitry Andric /// dispersed_partial_reduce(void *reduce_data,
1599e8d8bef9SDimitry Andric /// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
1600e8d8bef9SDimitry Andric /// int size, remote_id;
1601e8d8bef9SDimitry Andric /// int logical_lane_id = number_of_active_lanes_before_me() * 2;
1602e8d8bef9SDimitry Andric /// do {
1603e8d8bef9SDimitry Andric /// remote_id = next_active_lane_id_right_after_me();
1604e8d8bef9SDimitry Andric /// # the above function returns 0 if no active lane
1605e8d8bef9SDimitry Andric /// # is present right after the current lane.
1606e8d8bef9SDimitry Andric /// size = number_of_active_lanes_in_this_warp(); 1607e8d8bef9SDimitry Andric /// logical_lane_id /= 2; 1608e8d8bef9SDimitry Andric /// ShuffleReduceFn(reduce_data, logical_lane_id, 1609e8d8bef9SDimitry Andric /// remote_id-1-threadIdx.x, 2); 1610e8d8bef9SDimitry Andric /// } while (logical_lane_id % 2 == 0 && size > 1); 1611e8d8bef9SDimitry Andric /// } 1612e8d8bef9SDimitry Andric /// 1613e8d8bef9SDimitry Andric /// There is no assumption made about the initial state of the reduction. 1614e8d8bef9SDimitry Andric /// Any number of lanes (>=1) could be active at any position. The reduction 1615e8d8bef9SDimitry Andric /// result is returned in the first active lane. 1616e8d8bef9SDimitry Andric /// 1617e8d8bef9SDimitry Andric /// In this version, 'ShuffleReduceFn' behaves, per element, as follows: 1618e8d8bef9SDimitry Andric /// 1619e8d8bef9SDimitry Andric /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE); 1620e8d8bef9SDimitry Andric /// if (lane_id % 2 == 0 && offset > 0) 1621e8d8bef9SDimitry Andric /// reduce_elem = reduce_elem REDUCE_OP remote_elem 1622e8d8bef9SDimitry Andric /// else 1623e8d8bef9SDimitry Andric /// reduce_elem = remote_elem 1624e8d8bef9SDimitry Andric /// 1625e8d8bef9SDimitry Andric /// 1626e8d8bef9SDimitry Andric /// Intra-Team Reduction 1627e8d8bef9SDimitry Andric /// 1628e8d8bef9SDimitry Andric /// This function, as implemented in the runtime call 1629e8d8bef9SDimitry Andric /// '__kmpc_nvptx_parallel_reduce_nowait_v2', aggregates data across OpenMP 1630e8d8bef9SDimitry Andric /// threads in a team. It first reduces within a warp using the 1631e8d8bef9SDimitry Andric /// aforementioned algorithms. We then proceed to gather all such 1632e8d8bef9SDimitry Andric /// reduced values at the first warp. 
1633e8d8bef9SDimitry Andric /// 1634e8d8bef9SDimitry Andric /// The runtime makes use of the function 'InterWarpCpyFn', which copies 1635e8d8bef9SDimitry Andric /// data from each of the "warp master" (zeroth lane of each warp, where 1636e8d8bef9SDimitry Andric /// warp-reduced data is held) to the zeroth warp. This step reduces (in 1637e8d8bef9SDimitry Andric /// a mathematical sense) the problem of reduction across warp masters in 1638e8d8bef9SDimitry Andric /// a block to the problem of warp reduction. 1639e8d8bef9SDimitry Andric /// 1640e8d8bef9SDimitry Andric /// 1641e8d8bef9SDimitry Andric /// Inter-Team Reduction 1642e8d8bef9SDimitry Andric /// 1643e8d8bef9SDimitry Andric /// Once a team has reduced its data to a single value, it is stored in 1644e8d8bef9SDimitry Andric /// a global scratchpad array. Since each team has a distinct slot, this 1645e8d8bef9SDimitry Andric /// can be done without locking. 1646e8d8bef9SDimitry Andric /// 1647e8d8bef9SDimitry Andric /// The last team to write to the scratchpad array proceeds to reduce the 1648e8d8bef9SDimitry Andric /// scratchpad array. One or more workers in the last team use the helper 1649e8d8bef9SDimitry Andric /// 'loadAndReduceDataFn' to load and reduce values from the array, i.e., 1650e8d8bef9SDimitry Andric /// the k'th worker reduces every k'th element. 1651e8d8bef9SDimitry Andric /// 1652e8d8bef9SDimitry Andric /// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait_v2' to 1653e8d8bef9SDimitry Andric /// reduce across workers and compute a globally reduced value. 
///
/// Emit code for an OpenMP reduction on the GPU. Classifies the reduction
/// (simple/parallel/teams/distribute), gathers per-variable reduction info
/// (element type, LHS/RHS addresses, evaluation kind, combiner callback) and
/// delegates the actual lowering to OMPBuilder.createReductionsGPU.
void CGOpenMPRuntimeGPU::emitReduction(
    CodeGenFunction &CGF, SourceLocation Loc, ArrayRef<const Expr *> Privates,
    ArrayRef<const Expr *> LHSExprs, ArrayRef<const Expr *> RHSExprs,
    ArrayRef<const Expr *> ReductionOps, ReductionOptionsTy Options) {
  // Nothing to do if the current insertion point has been cleared.
  if (!CGF.HaveInsertPoint())
    return;

  // Classify the directive this reduction belongs to; the GPU lowering
  // differs for parallel, teams and distribute reductions.
  bool ParallelReduction = isOpenMPParallelDirective(Options.ReductionKind);
  bool DistributeReduction = isOpenMPDistributeDirective(Options.ReductionKind);
  bool TeamsReduction = isOpenMPTeamsDirective(Options.ReductionKind);

  ASTContext &C = CGM.getContext();

  // Simple (serial) reductions take the generic base-class path; they must
  // not be parallel or teams reductions.
  if (Options.SimpleReduction) {
    assert(!TeamsReduction && !ParallelReduction &&
           "Invalid reduction selection in emitReduction.");
    (void)ParallelReduction;
    CGOpenMPRuntime::emitReduction(CGF, Loc, Privates, LHSExprs, RHSExprs,
                                   ReductionOps, Options);
    return;
  }

  // Build a record type aggregating all privatized reduction variables so
  // they can be globalized as a unit.
  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> VarFieldMap;
  llvm::SmallVector<const ValueDecl *, 4> PrivatesReductions(Privates.size());
  int Cnt = 0;
  for (const Expr *DRE : Privates) {
    PrivatesReductions[Cnt] = cast<DeclRefExpr>(DRE)->getDecl();
    ++Cnt;
  }
  const RecordDecl *ReductionRec = ::buildRecordForGlobalizedVars(
      CGM.getContext(), PrivatesReductions, std::nullopt, VarFieldMap, 1);

  // Teams reductions are remembered so the record can be emitted later.
  if (TeamsReduction)
    TeamsReductions.push_back(ReductionRec);

  // Source location for the ident struct
  llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);

  // Capture the alloca and current insertion points for the IR builder.
  using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy;
  InsertPointTy AllocaIP(CGF.AllocaInsertPt->getParent(),
                         CGF.AllocaInsertPt->getIterator());
  InsertPointTy CodeGenIP(CGF.Builder.GetInsertBlock(),
                          CGF.Builder.GetInsertPoint());
  llvm::OpenMPIRBuilder::LocationDescription OmpLoc(
      CodeGenIP, CGF.SourceLocToDebugLoc(Loc));
  llvm::SmallVector<llvm::OpenMPIRBuilder::ReductionInfo> ReductionInfos;

  // Collect one ReductionInfo per private variable: element type, LHS/RHS
  // pointers, evaluation kind and a callback emitting the combiner.
  CodeGenFunction::OMPPrivateScope Scope(CGF);
  unsigned Idx = 0;
  for (const Expr *Private : Privates) {
    llvm::Type *ElementType;
    llvm::Value *Variable;
    llvm::Value *PrivateVariable;
    llvm::OpenMPIRBuilder::ReductionGenAtomicCBTy AtomicReductionGen = nullptr;
    ElementType = CGF.ConvertTypeForMem(Private->getType());
    const auto *RHSVar =
        cast<VarDecl>(cast<DeclRefExpr>(RHSExprs[Idx])->getDecl());
    PrivateVariable = CGF.GetAddrOfLocalVar(RHSVar).emitRawPointer(CGF);
    const auto *LHSVar =
        cast<VarDecl>(cast<DeclRefExpr>(LHSExprs[Idx])->getDecl());
    Variable = CGF.GetAddrOfLocalVar(LHSVar).emitRawPointer(CGF);
    llvm::OpenMPIRBuilder::EvalKind EvalKind;
    switch (CGF.getEvaluationKind(Private->getType())) {
    case TEK_Scalar:
      EvalKind = llvm::OpenMPIRBuilder::EvalKind::Scalar;
      break;
    case TEK_Complex:
      EvalKind = llvm::OpenMPIRBuilder::EvalKind::Complex;
      break;
    case TEK_Aggregate:
      EvalKind = llvm::OpenMPIRBuilder::EvalKind::Aggregate;
      break;
    }
    // Callback invoked by the IR builder to emit the per-element combiner.
    // It temporarily redirects CGF.CurFn into the builder-created function
    // so local-variable lookups resolve there.
    auto ReductionGen = [&](InsertPointTy CodeGenIP, unsigned I,
                            llvm::Value **LHSPtr, llvm::Value **RHSPtr,
                            llvm::Function *NewFunc) {
      CGF.Builder.restoreIP(CodeGenIP);
      auto *CurFn = CGF.CurFn;
      CGF.CurFn = NewFunc;

      *LHSPtr = CGF.GetAddrOfLocalVar(
                       cast<VarDecl>(cast<DeclRefExpr>(LHSExprs[I])->getDecl()))
                    .emitRawPointer(CGF);
      *RHSPtr = CGF.GetAddrOfLocalVar(
                       cast<VarDecl>(cast<DeclRefExpr>(RHSExprs[I])->getDecl()))
                    .emitRawPointer(CGF);

      emitSingleReductionCombiner(CGF, ReductionOps[I], Privates[I],
                                  cast<DeclRefExpr>(LHSExprs[I]),
                                  cast<DeclRefExpr>(RHSExprs[I]));

      // Restore the original function context before returning.
      CGF.CurFn = CurFn;

      return InsertPointTy(CGF.Builder.GetInsertBlock(),
                           CGF.Builder.GetInsertPoint());
    };
    ReductionInfos.emplace_back(llvm::OpenMPIRBuilder::ReductionInfo(
        ElementType, Variable, PrivateVariable, EvalKind,
        /*ReductionGen=*/nullptr, ReductionGen, AtomicReductionGen));
    Idx++;
  }

  // Hand the collected infos to the IR builder, which emits the actual
  // warp/team/inter-team reduction code described in the comment above.
  CGF.Builder.restoreIP(OMPBuilder.createReductionsGPU(
      OmpLoc, AllocaIP, CodeGenIP, ReductionInfos, false, TeamsReduction,
      DistributeReduction, llvm::OpenMPIRBuilder::ReductionGenCBKind::Clang,
      CGF.getTarget().getGridValue(), C.getLangOpts().OpenMPCUDAReductionBufNum,
      RTLoc));
  return;
}
ArgType = CGM.getContext().getPointerType(PointeeTy); 1781e8d8bef9SDimitry Andric QC.addRestrict(); 1782e8d8bef9SDimitry Andric enum { NVPTX_local_addr = 5 }; 1783e8d8bef9SDimitry Andric QC.addAddressSpace(getLangASFromTargetAS(NVPTX_local_addr)); 1784e8d8bef9SDimitry Andric ArgType = QC.apply(CGM.getContext(), ArgType); 1785e8d8bef9SDimitry Andric if (isa<ImplicitParamDecl>(NativeParam)) 1786e8d8bef9SDimitry Andric return ImplicitParamDecl::Create( 1787e8d8bef9SDimitry Andric CGM.getContext(), /*DC=*/nullptr, NativeParam->getLocation(), 17885f757f3fSDimitry Andric NativeParam->getIdentifier(), ArgType, ImplicitParamKind::Other); 1789e8d8bef9SDimitry Andric return ParmVarDecl::Create( 1790e8d8bef9SDimitry Andric CGM.getContext(), 1791e8d8bef9SDimitry Andric const_cast<DeclContext *>(NativeParam->getDeclContext()), 1792e8d8bef9SDimitry Andric NativeParam->getBeginLoc(), NativeParam->getLocation(), 1793e8d8bef9SDimitry Andric NativeParam->getIdentifier(), ArgType, 1794e8d8bef9SDimitry Andric /*TInfo=*/nullptr, SC_None, /*DefArg=*/nullptr); 1795e8d8bef9SDimitry Andric } 1796e8d8bef9SDimitry Andric 1797e8d8bef9SDimitry Andric Address 1798e8d8bef9SDimitry Andric CGOpenMPRuntimeGPU::getParameterAddress(CodeGenFunction &CGF, 1799e8d8bef9SDimitry Andric const VarDecl *NativeParam, 1800e8d8bef9SDimitry Andric const VarDecl *TargetParam) const { 1801e8d8bef9SDimitry Andric assert(NativeParam != TargetParam && 1802e8d8bef9SDimitry Andric NativeParam->getType()->isReferenceType() && 1803e8d8bef9SDimitry Andric "Native arg must not be the same as target arg."); 1804e8d8bef9SDimitry Andric Address LocalAddr = CGF.GetAddrOfLocalVar(TargetParam); 1805e8d8bef9SDimitry Andric QualType NativeParamType = NativeParam->getType(); 1806e8d8bef9SDimitry Andric QualifierCollector QC; 1807e8d8bef9SDimitry Andric const Type *NonQualTy = QC.strip(NativeParamType); 1808e8d8bef9SDimitry Andric QualType NativePointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType(); 1809e8d8bef9SDimitry 
Andric unsigned NativePointeeAddrSpace = 1810bdd1243dSDimitry Andric CGF.getTypes().getTargetAddressSpace(NativePointeeTy); 1811e8d8bef9SDimitry Andric QualType TargetTy = TargetParam->getType(); 181206c3fb27SDimitry Andric llvm::Value *TargetAddr = CGF.EmitLoadOfScalar(LocalAddr, /*Volatile=*/false, 181306c3fb27SDimitry Andric TargetTy, SourceLocation()); 18145f757f3fSDimitry Andric // Cast to native address space. 1815e8d8bef9SDimitry Andric TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( 181606c3fb27SDimitry Andric TargetAddr, 181706c3fb27SDimitry Andric llvm::PointerType::get(CGF.getLLVMContext(), NativePointeeAddrSpace)); 1818e8d8bef9SDimitry Andric Address NativeParamAddr = CGF.CreateMemTemp(NativeParamType); 1819e8d8bef9SDimitry Andric CGF.EmitStoreOfScalar(TargetAddr, NativeParamAddr, /*Volatile=*/false, 1820e8d8bef9SDimitry Andric NativeParamType); 1821e8d8bef9SDimitry Andric return NativeParamAddr; 1822e8d8bef9SDimitry Andric } 1823e8d8bef9SDimitry Andric 1824e8d8bef9SDimitry Andric void CGOpenMPRuntimeGPU::emitOutlinedFunctionCall( 1825e8d8bef9SDimitry Andric CodeGenFunction &CGF, SourceLocation Loc, llvm::FunctionCallee OutlinedFn, 1826e8d8bef9SDimitry Andric ArrayRef<llvm::Value *> Args) const { 1827e8d8bef9SDimitry Andric SmallVector<llvm::Value *, 4> TargetArgs; 1828e8d8bef9SDimitry Andric TargetArgs.reserve(Args.size()); 1829e8d8bef9SDimitry Andric auto *FnType = OutlinedFn.getFunctionType(); 1830e8d8bef9SDimitry Andric for (unsigned I = 0, E = Args.size(); I < E; ++I) { 1831e8d8bef9SDimitry Andric if (FnType->isVarArg() && FnType->getNumParams() <= I) { 1832e8d8bef9SDimitry Andric TargetArgs.append(std::next(Args.begin(), I), Args.end()); 1833e8d8bef9SDimitry Andric break; 1834e8d8bef9SDimitry Andric } 1835e8d8bef9SDimitry Andric llvm::Type *TargetType = FnType->getParamType(I); 1836e8d8bef9SDimitry Andric llvm::Value *NativeArg = Args[I]; 1837e8d8bef9SDimitry Andric if (!TargetType->isPointerTy()) { 1838e8d8bef9SDimitry Andric 
TargetArgs.emplace_back(NativeArg); 1839e8d8bef9SDimitry Andric continue; 1840e8d8bef9SDimitry Andric } 1841e8d8bef9SDimitry Andric TargetArgs.emplace_back( 18425f757f3fSDimitry Andric CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(NativeArg, TargetType)); 1843e8d8bef9SDimitry Andric } 1844e8d8bef9SDimitry Andric CGOpenMPRuntime::emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, TargetArgs); 1845e8d8bef9SDimitry Andric } 1846e8d8bef9SDimitry Andric 1847e8d8bef9SDimitry Andric /// Emit function which wraps the outline parallel region 1848e8d8bef9SDimitry Andric /// and controls the arguments which are passed to this function. 1849e8d8bef9SDimitry Andric /// The wrapper ensures that the outlined function is called 1850e8d8bef9SDimitry Andric /// with the correct arguments when data is shared. 1851e8d8bef9SDimitry Andric llvm::Function *CGOpenMPRuntimeGPU::createParallelDataSharingWrapper( 1852e8d8bef9SDimitry Andric llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D) { 1853e8d8bef9SDimitry Andric ASTContext &Ctx = CGM.getContext(); 1854e8d8bef9SDimitry Andric const auto &CS = *D.getCapturedStmt(OMPD_parallel); 1855e8d8bef9SDimitry Andric 1856e8d8bef9SDimitry Andric // Create a function that takes as argument the source thread. 
/// Emit function which wraps the outline parallel region
/// and controls the arguments which are passed to this function.
/// The wrapper ensures that the outlined function is called
/// with the correct arguments when data is shared.
llvm::Function *CGOpenMPRuntimeGPU::createParallelDataSharingWrapper(
    llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D) {
  ASTContext &Ctx = CGM.getContext();
  const auto &CS = *D.getCapturedStmt(OMPD_parallel);

  // Create a function that takes as argument the source thread.
  // Signature: void wrapper(uint16 parallel_level, uint32 thread_id).
  FunctionArgList WrapperArgs;
  QualType Int16QTy =
      Ctx.getIntTypeForBitwidth(/*DestWidth=*/16, /*Signed=*/false);
  QualType Int32QTy =
      Ctx.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/false);
  ImplicitParamDecl ParallelLevelArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
                                     /*Id=*/nullptr, Int16QTy,
                                     ImplicitParamKind::Other);
  ImplicitParamDecl WrapperArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
                               /*Id=*/nullptr, Int32QTy,
                               ImplicitParamKind::Other);
  WrapperArgs.emplace_back(&ParallelLevelArg);
  WrapperArgs.emplace_back(&WrapperArg);

  const CGFunctionInfo &CGFI =
      CGM.getTypes().arrangeBuiltinFunctionDeclaration(Ctx.VoidTy, WrapperArgs);

  auto *Fn = llvm::Function::Create(
      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
      Twine(OutlinedParallelFn->getName(), "_wrapper"), &CGM.getModule());

  // Ensure we do not inline the function. This is trivially true for the ones
  // passed to __kmpc_fork_call but the ones called in serialized regions
  // could be inlined. This is not a perfect but it is closer to the invariant
  // we want, namely, every data environment starts with a new function.
  // TODO: We should pass the if condition to the runtime function and do the
  // handling there. Much cleaner code.
  Fn->addFnAttr(llvm::Attribute::NoInline);

  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
  Fn->setLinkage(llvm::GlobalValue::InternalLinkage);
  Fn->setDoesNotRecurse();

  // Start emitting the wrapper body.
  CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
  CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, Fn, CGFI, WrapperArgs,
                    D.getBeginLoc(), D.getBeginLoc());

  const auto *RD = CS.getCapturedRecordDecl();
  auto CurField = RD->field_begin();

  // The outlined function's bound-thread-id argument: a zero-initialized
  // temporary.
  Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
                                                      /*Name=*/".zero.addr");
  CGF.Builder.CreateStore(CGF.Builder.getInt32(/*C*/ 0), ZeroAddr);
  // Get the array of arguments.
  SmallVector<llvm::Value *, 8> Args;

  // First two arguments: global thread id and bound thread id addresses.
  Args.emplace_back(CGF.GetAddrOfLocalVar(&WrapperArg).emitRawPointer(CGF));
  Args.emplace_back(ZeroAddr.emitRawPointer(CGF));

  CGBuilderTy &Bld = CGF.Builder;
  auto CI = CS.capture_begin();

  // Use global memory for data sharing.
  // Handle passing of global args to workers.
  // Ask the runtime for the list of shared variable addresses.
  RawAddress GlobalArgs =
      CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "global_args");
  llvm::Value *GlobalArgsPtr = GlobalArgs.getPointer();
  llvm::Value *DataSharingArgs[] = {GlobalArgsPtr};
  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                          CGM.getModule(), OMPRTL___kmpc_get_shared_variables),
                      DataSharingArgs);

  // Retrieve the shared variables from the list of references returned
  // by the runtime. Pass the variables to the outlined function.
  Address SharedArgListAddress = Address::invalid();
  if (CS.capture_size() > 0 ||
      isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
    SharedArgListAddress = CGF.EmitLoadOfPointer(
        GlobalArgs, CGF.getContext()
                        .getPointerType(CGF.getContext().VoidPtrTy)
                        .castAs<PointerType>());
  }
  unsigned Idx = 0;
  // For loop-bound-sharing directives the first two shared slots are the
  // lower and upper loop bounds.
  if (isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
    Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
    Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
        Src, CGF.SizeTy->getPointerTo(), CGF.SizeTy);
    llvm::Value *LB = CGF.EmitLoadOfScalar(
        TypedAddress,
        /*Volatile=*/false,
        CGF.getContext().getPointerType(CGF.getContext().getSizeType()),
        cast<OMPLoopDirective>(D).getLowerBoundVariable()->getExprLoc());
    Args.emplace_back(LB);
    ++Idx;
    Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
    TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
        Src, CGF.SizeTy->getPointerTo(), CGF.SizeTy);
    llvm::Value *UB = CGF.EmitLoadOfScalar(
        TypedAddress,
        /*Volatile=*/false,
        CGF.getContext().getPointerType(CGF.getContext().getSizeType()),
        cast<OMPLoopDirective>(D).getUpperBoundVariable()->getExprLoc());
    Args.emplace_back(UB);
    ++Idx;
  }
  // Remaining slots hold the captured variables, in capture order.
  if (CS.capture_size() > 0) {
    ASTContext &CGFContext = CGF.getContext();
    for (unsigned I = 0, E = CS.capture_size(); I < E; ++I, ++CI, ++CurField) {
      QualType ElemTy = CurField->getType();
      Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, I + Idx);
      Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
          Src, CGF.ConvertTypeForMem(CGFContext.getPointerType(ElemTy)),
          CGF.ConvertTypeForMem(ElemTy));
      llvm::Value *Arg = CGF.EmitLoadOfScalar(TypedAddress,
                                              /*Volatile=*/false,
                                              CGFContext.getPointerType(ElemTy),
                                              CI->getLocation());
      // By-copy captures of non-pointer values are passed as uintptr-sized
      // values, so widen/convert them here.
      if (CI->capturesVariableByCopy() &&
          !CI->getCapturedVar()->getType()->isAnyPointerType()) {
        Arg = castValueToType(CGF, Arg, ElemTy, CGFContext.getUIntPtrType(),
                              CI->getLocation());
      }
      Args.emplace_back(Arg);
    }
  }

  emitOutlinedFunctionCall(CGF, D.getBeginLoc(), OutlinedParallelFn, Args);
  CGF.FinishFunction();
  return Fn;
}
/// Function prolog for generic data-sharing mode: run the escape analysis
/// over the function body and record which variables must be globalized
/// (moved out of thread-private storage) so other threads can access them.
void CGOpenMPRuntimeGPU::emitFunctionProlog(CodeGenFunction &CGF,
                                            const Decl *D) {
  // Globalization is only required in generic data-sharing mode.
  if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic)
    return;

  assert(D && "Expected function or captured|block decl.");
  assert(FunctionGlobalizedDecls.count(CGF.CurFn) == 0 &&
         "Function is registered already.");
  assert((!TeamAndReductions.first || TeamAndReductions.first == D) &&
         "Team is set but not processed.");
  const Stmt *Body = nullptr;
  bool NeedToDelayGlobalization = false;
  if (const auto *FD = dyn_cast<FunctionDecl>(D)) {
    Body = FD->getBody();
  } else if (const auto *BD = dyn_cast<BlockDecl>(D)) {
    Body = BD->getBody();
  } else if (const auto *CD = dyn_cast<CapturedDecl>(D)) {
    Body = CD->getBody();
    // For OpenMP captured regions, globalization is emitted later; in SPMD
    // mode nothing needs to be recorded at all.
    NeedToDelayGlobalization = CGF.CapturedStmtInfo->getKind() == CR_OpenMP;
    if (NeedToDelayGlobalization &&
        getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD)
      return;
  }
  if (!Body)
    return;
  // Find all variables escaping their declaration context.
  CheckVarsEscapingDeclContext VarChecker(CGF, TeamAndReductions.second);
  VarChecker.Visit(Body);
  const RecordDecl *GlobalizedVarsRecord =
      VarChecker.getGlobalizedRecord(IsInTTDRegion);
  // The pending team/reduction info has now been consumed.
  TeamAndReductions.first = nullptr;
  TeamAndReductions.second.clear();
  ArrayRef<const ValueDecl *> EscapedVariableLengthDecls =
      VarChecker.getEscapedVariableLengthDecls();
  ArrayRef<const ValueDecl *> DelayedVariableLengthDecls =
      VarChecker.getDelayedVariableLengthDecls();
  // Nothing escaped: no globalization bookkeeping needed.
  if (!GlobalizedVarsRecord && EscapedVariableLengthDecls.empty() &&
      DelayedVariableLengthDecls.empty())
    return;
  // Record everything that must be globalized for this function.
  auto I = FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
  I->getSecond().MappedParams =
      std::make_unique<CodeGenFunction::OMPMapVars>();
  I->getSecond().EscapedParameters.insert(
      VarChecker.getEscapedParameters().begin(),
      VarChecker.getEscapedParameters().end());
  I->getSecond().EscapedVariableLengthDecls.append(
      EscapedVariableLengthDecls.begin(), EscapedVariableLengthDecls.end());
  I->getSecond().DelayedVariableLengthDecls.append(
      DelayedVariableLengthDecls.begin(), DelayedVariableLengthDecls.end());
  DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
  for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
    assert(VD->isCanonicalDecl() && "Expected canonical declaration");
    Data.insert(std::make_pair(VD, MappedVarData()));
  }
  if (!NeedToDelayGlobalization) {
    // Emit the globalization prolog now and register a cleanup that emits
    // the matching epilog on both normal and EH exit paths.
    emitGenericVarsProlog(CGF, D->getBeginLoc());
    struct GlobalizationScope final : EHScopeStack::Cleanup {
      GlobalizationScope() = default;

      void Emit(CodeGenFunction &CGF, Flags flags) override {
        static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime())
            .emitGenericVarsEpilog(CGF);
      }
    };
    CGF.EHStack.pushCleanup<GlobalizationScope>(NormalAndEHCleanup);
  }
}
/// Return the address to use for a local variable on the GPU, honoring any
/// 'omp allocate' allocator (which may place the variable in CUDA constant
/// or shared memory) and the globalized-variable map built by
/// emitFunctionProlog. Returns Address::invalid() when default (stack)
/// allocation should be used.
Address CGOpenMPRuntimeGPU::getAddressOfLocalVariable(CodeGenFunction &CGF,
                                                      const VarDecl *VD) {
  if (VD && VD->hasAttr<OMPAllocateDeclAttr>()) {
    const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
    auto AS = LangAS::Default;
    switch (A->getAllocatorType()) {
    // Use the default allocator here as by default local vars are
    // threadlocal.
    case OMPAllocateDeclAttr::OMPNullMemAlloc:
    case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
    case OMPAllocateDeclAttr::OMPThreadMemAlloc:
    case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
    case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
      // Follow the user decision - use default allocation.
      return Address::invalid();
    case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
      // TODO: implement support for user-defined allocators.
      return Address::invalid();
    case OMPAllocateDeclAttr::OMPConstMemAlloc:
      AS = LangAS::cuda_constant;
      break;
    case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
      AS = LangAS::cuda_shared;
      break;
    case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
    case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
      break;
    }
    // Emit the variable as a module-level global in the selected address
    // space and hand back a pointer cast into the variable's declared
    // address space.
    llvm::Type *VarTy = CGF.ConvertTypeForMem(VD->getType());
    auto *GV = new llvm::GlobalVariable(
        CGM.getModule(), VarTy, /*isConstant=*/false,
        llvm::GlobalValue::InternalLinkage, llvm::PoisonValue::get(VarTy),
        VD->getName(),
        /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal,
        CGM.getContext().getTargetAddressSpace(AS));
    CharUnits Align = CGM.getContext().getDeclAlign(VD);
    GV->setAlignment(Align.getAsAlign());
    return Address(
        CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
            GV, VarTy->getPointerTo(CGM.getContext().getTargetAddressSpace(
                    VD->getType().getAddressSpace()))),
        VarTy, Align);
  }

  // Globalized lookups below only apply in generic data-sharing mode.
  if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic)
    return Address::invalid();

  VD = VD->getCanonicalDecl();
  auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
  if (I == FunctionGlobalizedDecls.end())
    return Address::invalid();
  // Direct hit: the variable itself was globalized.
  auto VDI = I->getSecond().LocalVarData.find(VD);
  if (VDI != I->getSecond().LocalVarData.end())
    return VDI->second.PrivateAddr;
  // Otherwise check variables this one references (OMPReferencedVarAttr);
  // the globalized address of a referenced variable is reused.
  if (VD->hasAttrs()) {
    for (specific_attr_iterator<OMPReferencedVarAttr> IT(VD->attr_begin()),
                                                      E(VD->attr_end());
         IT != E; ++IT) {
      auto VDI = I->getSecond().LocalVarData.find(
          cast<VarDecl>(cast<DeclRefExpr>(IT->getRef())->getDecl())
              ->getCanonicalDecl());
      if (VDI != I->getSecond().LocalVarData.end())
        return VDI->second.PrivateAddr;
    }
  }

  return Address::invalid();
}
void CGOpenMPRuntimeGPU::functionFinished(CodeGenFunction &CGF) {
  // Drop this function's globalized-variable bookkeeping, then let the base
  // class perform its own per-function cleanup.
  FunctionGlobalizedDecls.erase(CGF.CurFn);
  CGOpenMPRuntime::functionFinished(CGF);
}
2124e8d8bef9SDimitry Andric if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) { 2125e8d8bef9SDimitry Andric ScheduleKind = OMPC_DIST_SCHEDULE_static; 2126e8d8bef9SDimitry Andric Chunk = CGF.EmitScalarConversion( 2127e8d8bef9SDimitry Andric RT.getGPUNumThreads(CGF), 2128e8d8bef9SDimitry Andric CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0), 2129e8d8bef9SDimitry Andric S.getIterationVariable()->getType(), S.getBeginLoc()); 2130e8d8bef9SDimitry Andric return; 2131e8d8bef9SDimitry Andric } 2132e8d8bef9SDimitry Andric CGOpenMPRuntime::getDefaultDistScheduleAndChunk( 2133e8d8bef9SDimitry Andric CGF, S, ScheduleKind, Chunk); 2134e8d8bef9SDimitry Andric } 2135e8d8bef9SDimitry Andric 2136e8d8bef9SDimitry Andric void CGOpenMPRuntimeGPU::getDefaultScheduleAndChunk( 2137e8d8bef9SDimitry Andric CodeGenFunction &CGF, const OMPLoopDirective &S, 2138e8d8bef9SDimitry Andric OpenMPScheduleClauseKind &ScheduleKind, 2139e8d8bef9SDimitry Andric const Expr *&ChunkExpr) const { 2140e8d8bef9SDimitry Andric ScheduleKind = OMPC_SCHEDULE_static; 2141e8d8bef9SDimitry Andric // Chunk size is 1 in this case. 
2142e8d8bef9SDimitry Andric llvm::APInt ChunkSize(32, 1); 2143e8d8bef9SDimitry Andric ChunkExpr = IntegerLiteral::Create(CGF.getContext(), ChunkSize, 2144e8d8bef9SDimitry Andric CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0), 2145e8d8bef9SDimitry Andric SourceLocation()); 2146e8d8bef9SDimitry Andric } 2147e8d8bef9SDimitry Andric 2148e8d8bef9SDimitry Andric void CGOpenMPRuntimeGPU::adjustTargetSpecificDataForLambdas( 2149e8d8bef9SDimitry Andric CodeGenFunction &CGF, const OMPExecutableDirective &D) const { 2150e8d8bef9SDimitry Andric assert(isOpenMPTargetExecutionDirective(D.getDirectiveKind()) && 2151e8d8bef9SDimitry Andric " Expected target-based directive."); 2152e8d8bef9SDimitry Andric const CapturedStmt *CS = D.getCapturedStmt(OMPD_target); 2153e8d8bef9SDimitry Andric for (const CapturedStmt::Capture &C : CS->captures()) { 2154e8d8bef9SDimitry Andric // Capture variables captured by reference in lambdas for target-based 2155e8d8bef9SDimitry Andric // directives. 2156e8d8bef9SDimitry Andric if (!C.capturesVariable()) 2157e8d8bef9SDimitry Andric continue; 2158e8d8bef9SDimitry Andric const VarDecl *VD = C.getCapturedVar(); 2159e8d8bef9SDimitry Andric const auto *RD = VD->getType() 2160e8d8bef9SDimitry Andric .getCanonicalType() 2161e8d8bef9SDimitry Andric .getNonReferenceType() 2162e8d8bef9SDimitry Andric ->getAsCXXRecordDecl(); 2163e8d8bef9SDimitry Andric if (!RD || !RD->isLambda()) 2164e8d8bef9SDimitry Andric continue; 2165e8d8bef9SDimitry Andric Address VDAddr = CGF.GetAddrOfLocalVar(VD); 2166e8d8bef9SDimitry Andric LValue VDLVal; 2167e8d8bef9SDimitry Andric if (VD->getType().getCanonicalType()->isReferenceType()) 2168e8d8bef9SDimitry Andric VDLVal = CGF.EmitLoadOfReferenceLValue(VDAddr, VD->getType()); 2169e8d8bef9SDimitry Andric else 2170e8d8bef9SDimitry Andric VDLVal = CGF.MakeAddrLValue( 2171e8d8bef9SDimitry Andric VDAddr, VD->getType().getCanonicalType().getNonReferenceType()); 2172bdd1243dSDimitry Andric llvm::DenseMap<const ValueDecl *, 
FieldDecl *> Captures; 2173e8d8bef9SDimitry Andric FieldDecl *ThisCapture = nullptr; 2174e8d8bef9SDimitry Andric RD->getCaptureFields(Captures, ThisCapture); 2175e8d8bef9SDimitry Andric if (ThisCapture && CGF.CapturedStmtInfo->isCXXThisExprCaptured()) { 2176e8d8bef9SDimitry Andric LValue ThisLVal = 2177e8d8bef9SDimitry Andric CGF.EmitLValueForFieldInitialization(VDLVal, ThisCapture); 2178e8d8bef9SDimitry Andric llvm::Value *CXXThis = CGF.LoadCXXThis(); 2179e8d8bef9SDimitry Andric CGF.EmitStoreOfScalar(CXXThis, ThisLVal); 2180e8d8bef9SDimitry Andric } 2181e8d8bef9SDimitry Andric for (const LambdaCapture &LC : RD->captures()) { 2182e8d8bef9SDimitry Andric if (LC.getCaptureKind() != LCK_ByRef) 2183e8d8bef9SDimitry Andric continue; 2184bdd1243dSDimitry Andric const ValueDecl *VD = LC.getCapturedVar(); 2185bdd1243dSDimitry Andric // FIXME: For now VD is always a VarDecl because OpenMP does not support 2186bdd1243dSDimitry Andric // capturing structured bindings in lambdas yet. 2187bdd1243dSDimitry Andric if (!CS->capturesVariable(cast<VarDecl>(VD))) 2188e8d8bef9SDimitry Andric continue; 2189e8d8bef9SDimitry Andric auto It = Captures.find(VD); 2190e8d8bef9SDimitry Andric assert(It != Captures.end() && "Found lambda capture without field."); 2191e8d8bef9SDimitry Andric LValue VarLVal = CGF.EmitLValueForFieldInitialization(VDLVal, It->second); 2192bdd1243dSDimitry Andric Address VDAddr = CGF.GetAddrOfLocalVar(cast<VarDecl>(VD)); 2193e8d8bef9SDimitry Andric if (VD->getType().getCanonicalType()->isReferenceType()) 2194e8d8bef9SDimitry Andric VDAddr = CGF.EmitLoadOfReferenceLValue(VDAddr, 2195e8d8bef9SDimitry Andric VD->getType().getCanonicalType()) 21960fca6ea1SDimitry Andric .getAddress(); 21970fca6ea1SDimitry Andric CGF.EmitStoreOfScalar(VDAddr.emitRawPointer(CGF), VarLVal); 2198e8d8bef9SDimitry Andric } 2199e8d8bef9SDimitry Andric } 2200e8d8bef9SDimitry Andric } 2201e8d8bef9SDimitry Andric 2202e8d8bef9SDimitry Andric bool 
CGOpenMPRuntimeGPU::hasAllocateAttributeForGlobalVar(const VarDecl *VD, 2203e8d8bef9SDimitry Andric LangAS &AS) { 2204e8d8bef9SDimitry Andric if (!VD || !VD->hasAttr<OMPAllocateDeclAttr>()) 2205e8d8bef9SDimitry Andric return false; 2206e8d8bef9SDimitry Andric const auto *A = VD->getAttr<OMPAllocateDeclAttr>(); 2207e8d8bef9SDimitry Andric switch(A->getAllocatorType()) { 2208e8d8bef9SDimitry Andric case OMPAllocateDeclAttr::OMPNullMemAlloc: 2209e8d8bef9SDimitry Andric case OMPAllocateDeclAttr::OMPDefaultMemAlloc: 2210e8d8bef9SDimitry Andric // Not supported, fallback to the default mem space. 2211e8d8bef9SDimitry Andric case OMPAllocateDeclAttr::OMPThreadMemAlloc: 2212e8d8bef9SDimitry Andric case OMPAllocateDeclAttr::OMPLargeCapMemAlloc: 2213e8d8bef9SDimitry Andric case OMPAllocateDeclAttr::OMPCGroupMemAlloc: 2214e8d8bef9SDimitry Andric case OMPAllocateDeclAttr::OMPHighBWMemAlloc: 2215e8d8bef9SDimitry Andric case OMPAllocateDeclAttr::OMPLowLatMemAlloc: 2216e8d8bef9SDimitry Andric AS = LangAS::Default; 2217e8d8bef9SDimitry Andric return true; 2218e8d8bef9SDimitry Andric case OMPAllocateDeclAttr::OMPConstMemAlloc: 2219e8d8bef9SDimitry Andric AS = LangAS::cuda_constant; 2220e8d8bef9SDimitry Andric return true; 2221e8d8bef9SDimitry Andric case OMPAllocateDeclAttr::OMPPTeamMemAlloc: 2222e8d8bef9SDimitry Andric AS = LangAS::cuda_shared; 2223e8d8bef9SDimitry Andric return true; 2224e8d8bef9SDimitry Andric case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc: 2225e8d8bef9SDimitry Andric llvm_unreachable("Expected predefined allocator for the variables with the " 2226e8d8bef9SDimitry Andric "static storage."); 2227e8d8bef9SDimitry Andric } 2228e8d8bef9SDimitry Andric return false; 2229e8d8bef9SDimitry Andric } 2230e8d8bef9SDimitry Andric 22310fca6ea1SDimitry Andric // Get current OffloadArch and ignore any unknown values 22320fca6ea1SDimitry Andric static OffloadArch getOffloadArch(CodeGenModule &CGM) { 2233e8d8bef9SDimitry Andric if (!CGM.getTarget().hasFeature("ptx")) 
22340fca6ea1SDimitry Andric return OffloadArch::UNKNOWN; 2235e8d8bef9SDimitry Andric for (const auto &Feature : CGM.getTarget().getTargetOpts().FeatureMap) { 2236e8d8bef9SDimitry Andric if (Feature.getValue()) { 22370fca6ea1SDimitry Andric OffloadArch Arch = StringToOffloadArch(Feature.getKey()); 22380fca6ea1SDimitry Andric if (Arch != OffloadArch::UNKNOWN) 2239e8d8bef9SDimitry Andric return Arch; 2240e8d8bef9SDimitry Andric } 2241e8d8bef9SDimitry Andric } 22420fca6ea1SDimitry Andric return OffloadArch::UNKNOWN; 2243e8d8bef9SDimitry Andric } 2244e8d8bef9SDimitry Andric 2245e8d8bef9SDimitry Andric /// Check to see if target architecture supports unified addressing which is 2246e8d8bef9SDimitry Andric /// a restriction for OpenMP requires clause "unified_shared_memory". 22470fca6ea1SDimitry Andric void CGOpenMPRuntimeGPU::processRequiresDirective(const OMPRequiresDecl *D) { 2248e8d8bef9SDimitry Andric for (const OMPClause *Clause : D->clauselists()) { 2249e8d8bef9SDimitry Andric if (Clause->getClauseKind() == OMPC_unified_shared_memory) { 22500fca6ea1SDimitry Andric OffloadArch Arch = getOffloadArch(CGM); 2251e8d8bef9SDimitry Andric switch (Arch) { 22520fca6ea1SDimitry Andric case OffloadArch::SM_20: 22530fca6ea1SDimitry Andric case OffloadArch::SM_21: 22540fca6ea1SDimitry Andric case OffloadArch::SM_30: 22550fca6ea1SDimitry Andric case OffloadArch::SM_32_: 22560fca6ea1SDimitry Andric case OffloadArch::SM_35: 22570fca6ea1SDimitry Andric case OffloadArch::SM_37: 22580fca6ea1SDimitry Andric case OffloadArch::SM_50: 22590fca6ea1SDimitry Andric case OffloadArch::SM_52: 22600fca6ea1SDimitry Andric case OffloadArch::SM_53: { 2261e8d8bef9SDimitry Andric SmallString<256> Buffer; 2262e8d8bef9SDimitry Andric llvm::raw_svector_ostream Out(Buffer); 22630fca6ea1SDimitry Andric Out << "Target architecture " << OffloadArchToString(Arch) 2264e8d8bef9SDimitry Andric << " does not support unified addressing"; 2265e8d8bef9SDimitry Andric CGM.Error(Clause->getBeginLoc(), Out.str()); 
        return;
      }
      // All remaining recognized architectures (sm_60+ and the AMDGCN GFX
      // families) are accepted; UNKNOWN/Generic/UNUSED also fall through.
      case OffloadArch::SM_60:
      case OffloadArch::SM_61:
      case OffloadArch::SM_62:
      case OffloadArch::SM_70:
      case OffloadArch::SM_72:
      case OffloadArch::SM_75:
      case OffloadArch::SM_80:
      case OffloadArch::SM_86:
      case OffloadArch::SM_87:
      case OffloadArch::SM_89:
      case OffloadArch::SM_90:
      case OffloadArch::SM_90a:
      case OffloadArch::GFX600:
      case OffloadArch::GFX601:
      case OffloadArch::GFX602:
      case OffloadArch::GFX700:
      case OffloadArch::GFX701:
      case OffloadArch::GFX702:
      case OffloadArch::GFX703:
      case OffloadArch::GFX704:
      case OffloadArch::GFX705:
      case OffloadArch::GFX801:
      case OffloadArch::GFX802:
      case OffloadArch::GFX803:
      case OffloadArch::GFX805:
      case OffloadArch::GFX810:
      case OffloadArch::GFX9_GENERIC:
      case OffloadArch::GFX900:
      case OffloadArch::GFX902:
      case OffloadArch::GFX904:
      case OffloadArch::GFX906:
      case OffloadArch::GFX908:
      case OffloadArch::GFX909:
      case OffloadArch::GFX90a:
      case OffloadArch::GFX90c:
      case OffloadArch::GFX940:
      case OffloadArch::GFX941:
      case OffloadArch::GFX942:
      case OffloadArch::GFX10_1_GENERIC:
      case OffloadArch::GFX1010:
      case OffloadArch::GFX1011:
      case OffloadArch::GFX1012:
      case OffloadArch::GFX1013:
      case OffloadArch::GFX10_3_GENERIC:
      case OffloadArch::GFX1030:
      case OffloadArch::GFX1031:
      case OffloadArch::GFX1032:
      case OffloadArch::GFX1033:
      case OffloadArch::GFX1034:
      case OffloadArch::GFX1035:
      case OffloadArch::GFX1036:
      case OffloadArch::GFX11_GENERIC:
      case OffloadArch::GFX1100:
      case OffloadArch::GFX1101:
      case OffloadArch::GFX1102:
      case OffloadArch::GFX1103:
      case OffloadArch::GFX1150:
      case OffloadArch::GFX1151:
      case OffloadArch::GFX1152:
      case OffloadArch::GFX12_GENERIC:
      case OffloadArch::GFX1200:
      case OffloadArch::GFX1201:
      case OffloadArch::AMDGCNSPIRV:
      case OffloadArch::Generic:
      case OffloadArch::UNUSED:
      case OffloadArch::UNKNOWN:
        break;
      case OffloadArch::LAST:
        llvm_unreachable("Unexpected GPU arch.");
      }
    }
  }
CGOpenMPRuntime::processRequiresDirective(D); 2341e8d8bef9SDimitry Andric } 2342e8d8bef9SDimitry Andric 2343349cc55cSDimitry Andric llvm::Value *CGOpenMPRuntimeGPU::getGPUNumThreads(CodeGenFunction &CGF) { 2344349cc55cSDimitry Andric CGBuilderTy &Bld = CGF.Builder; 2345349cc55cSDimitry Andric llvm::Module *M = &CGF.CGM.getModule(); 2346349cc55cSDimitry Andric const char *LocSize = "__kmpc_get_hardware_num_threads_in_block"; 2347349cc55cSDimitry Andric llvm::Function *F = M->getFunction(LocSize); 2348349cc55cSDimitry Andric if (!F) { 2349349cc55cSDimitry Andric F = llvm::Function::Create( 2350bdd1243dSDimitry Andric llvm::FunctionType::get(CGF.Int32Ty, std::nullopt, false), 2351349cc55cSDimitry Andric llvm::GlobalVariable::ExternalLinkage, LocSize, &CGF.CGM.getModule()); 2352349cc55cSDimitry Andric } 2353bdd1243dSDimitry Andric return Bld.CreateCall(F, std::nullopt, "nvptx_num_threads"); 2354349cc55cSDimitry Andric } 2355349cc55cSDimitry Andric 2356349cc55cSDimitry Andric llvm::Value *CGOpenMPRuntimeGPU::getGPUThreadID(CodeGenFunction &CGF) { 2357349cc55cSDimitry Andric ArrayRef<llvm::Value *> Args{}; 2358349cc55cSDimitry Andric return CGF.EmitRuntimeCall( 2359349cc55cSDimitry Andric OMPBuilder.getOrCreateRuntimeFunction( 2360349cc55cSDimitry Andric CGM.getModule(), OMPRTL___kmpc_get_hardware_thread_id_in_block), 2361349cc55cSDimitry Andric Args); 2362349cc55cSDimitry Andric } 2363