//===---- CGOpenMPRuntimeGPU.cpp - Interface to OpenMP GPU Runtimes ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This provides a generalized class for OpenMP runtime code generation
// specialized by GPU targets NVPTX and AMDGCN.
//
//===----------------------------------------------------------------------===//

#include "CGOpenMPRuntimeGPU.h"
#include "CodeGenFunction.h"
#include "clang/AST/Attr.h"
#include "clang/AST/DeclOpenMP.h"
#include "clang/AST/OpenMPClause.h"
#include "clang/AST/StmtOpenMP.h"
#include "clang/AST/StmtVisitor.h"
#include "clang/Basic/Cuda.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/Frontend/OpenMP/OMPDeviceConstants.h"
#include "llvm/Frontend/OpenMP/OMPGridValues.h"

using namespace clang;
using namespace CodeGen;
using namespace llvm::omp;

namespace {
/// Pre(post)-action for different OpenMP constructs specialized for NVPTX.
class NVPTXActionTy final : public PrePostActionTy {
  llvm::FunctionCallee EnterCallee = nullptr;
  ArrayRef<llvm::Value *> EnterArgs;
  llvm::FunctionCallee ExitCallee = nullptr;
  ArrayRef<llvm::Value *> ExitArgs;
  bool Conditional = false;
  llvm::BasicBlock *ContBlock = nullptr;

public:
  NVPTXActionTy(llvm::FunctionCallee EnterCallee,
                ArrayRef<llvm::Value *> EnterArgs,
                llvm::FunctionCallee ExitCallee,
                ArrayRef<llvm::Value *> ExitArgs, bool Conditional = false)
      : EnterCallee(EnterCallee), EnterArgs(EnterArgs), ExitCallee(ExitCallee),
        ExitArgs(ExitArgs), Conditional(Conditional) {}
  void Enter(CodeGenFunction &CGF) override {
    llvm::Value *EnterRes = CGF.EmitRuntimeCall(EnterCallee, EnterArgs);
    if (Conditional) {
      llvm::Value *CallBool = CGF.Builder.CreateIsNotNull(EnterRes);
      auto *ThenBlock = CGF.createBasicBlock("omp_if.then");
      ContBlock = CGF.createBasicBlock("omp_if.end");
      // Generate the branch (If-stmt)
      CGF.Builder.CreateCondBr(CallBool, ThenBlock, ContBlock);
      CGF.EmitBlock(ThenBlock);
    }
  }
  void Done(CodeGenFunction &CGF) {
    // Emit the rest of blocks/branches
    CGF.EmitBranch(ContBlock);
    CGF.EmitBlock(ContBlock, true);
  }
  void Exit(CodeGenFunction &CGF) override {
    CGF.EmitRuntimeCall(ExitCallee, ExitArgs);
  }
};

/// A class to track the execution mode when codegening directives within
/// a target region. The appropriate mode (SPMD|NON-SPMD) is set on entry
/// to the target region and used by containing directives such as 'parallel'
/// to emit optimized code.
class ExecutionRuntimeModesRAII {
private:
  CGOpenMPRuntimeGPU::ExecutionMode SavedExecMode =
      CGOpenMPRuntimeGPU::EM_Unknown;
  CGOpenMPRuntimeGPU::ExecutionMode &ExecMode;

public:
  ExecutionRuntimeModesRAII(CGOpenMPRuntimeGPU::ExecutionMode &ExecMode,
                            CGOpenMPRuntimeGPU::ExecutionMode EntryMode)
      : ExecMode(ExecMode) {
    SavedExecMode = ExecMode;
    ExecMode = EntryMode;
  }
  ~ExecutionRuntimeModesRAII() { ExecMode = SavedExecMode; }
};

static const ValueDecl *getPrivateItem(const Expr *RefExpr) {
  RefExpr = RefExpr->IgnoreParens();
  if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(RefExpr)) {
    const Expr *Base = ASE->getBase()->IgnoreParenImpCasts();
    while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
      Base = TempASE->getBase()->IgnoreParenImpCasts();
    RefExpr = Base;
  } else if (auto *OASE = dyn_cast<ArraySectionExpr>(RefExpr)) {
    const Expr *Base = OASE->getBase()->IgnoreParenImpCasts();
    while (const auto *TempOASE = dyn_cast<ArraySectionExpr>(Base))
      Base = TempOASE->getBase()->IgnoreParenImpCasts();
    while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
      Base = TempASE->getBase()->IgnoreParenImpCasts();
    RefExpr = Base;
  }
  RefExpr = RefExpr->IgnoreParenImpCasts();
  if (const auto *DE = dyn_cast<DeclRefExpr>(RefExpr))
    return cast<ValueDecl>(DE->getDecl()->getCanonicalDecl());
  const auto *ME = cast<MemberExpr>(RefExpr);
  return cast<ValueDecl>(ME->getMemberDecl()->getCanonicalDecl());
}

static RecordDecl *buildRecordForGlobalizedVars(
    ASTContext &C, ArrayRef<const ValueDecl *> EscapedDecls,
    ArrayRef<const ValueDecl *> EscapedDeclsForTeams,
    llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
        &MappedDeclsFields,
    int BufSize) {
  using VarsDataTy = std::pair<CharUnits /*Align*/, const ValueDecl *>;
  if (EscapedDecls.empty() && EscapedDeclsForTeams.empty())
    return nullptr;
  SmallVector<VarsDataTy, 4> GlobalizedVars;
  for (const ValueDecl *D : EscapedDecls)
    GlobalizedVars.emplace_back(C.getDeclAlign(D), D);
  for (const ValueDecl *D : EscapedDeclsForTeams)
    GlobalizedVars.emplace_back(C.getDeclAlign(D), D);

  // Build struct _globalized_locals_ty {
  //         /* globalized vars */[WarpSize] align (decl_align)
  //         /* globalized vars */ for EscapedDeclsForTeams
  //       };
  RecordDecl *GlobalizedRD = C.buildImplicitRecord("_globalized_locals_ty");
  GlobalizedRD->startDefinition();
  llvm::SmallPtrSet<const ValueDecl *, 16> SingleEscaped(
      EscapedDeclsForTeams.begin(), EscapedDeclsForTeams.end());
  for (const auto &Pair : GlobalizedVars) {
    const ValueDecl *VD = Pair.second;
    QualType Type = VD->getType();
    if (Type->isLValueReferenceType())
      Type = C.getPointerType(Type.getNonReferenceType());
    else
      Type = Type.getNonReferenceType();
    SourceLocation Loc = VD->getLocation();
    FieldDecl *Field;
    if (SingleEscaped.count(VD)) {
      Field = FieldDecl::Create(
          C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
          C.getTrivialTypeSourceInfo(Type, SourceLocation()),
          /*BW=*/nullptr, /*Mutable=*/false,
          /*InitStyle=*/ICIS_NoInit);
      Field->setAccess(AS_public);
      if (VD->hasAttrs()) {
        for (specific_attr_iterator<AlignedAttr> I(VD->getAttrs().begin()),
                                                 E(VD->getAttrs().end());
             I != E; ++I)
          Field->addAttr(*I);
      }
    } else {
      if (BufSize > 1) {
        llvm::APInt ArraySize(32, BufSize);
        Type = C.getConstantArrayType(Type, ArraySize, nullptr,
                                      ArraySizeModifier::Normal, 0);
      }
      Field = FieldDecl::Create(
          C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
          C.getTrivialTypeSourceInfo(Type, SourceLocation()),
          /*BW=*/nullptr, /*Mutable=*/false,
          /*InitStyle=*/ICIS_NoInit);
      Field->setAccess(AS_public);
      llvm::APInt Align(32, Pair.first.getQuantity());
      Field->addAttr(AlignedAttr::CreateImplicit(
          C, /*IsAlignmentExpr=*/true,
          IntegerLiteral::Create(C, Align,
                                 C.getIntTypeForBitwidth(32, /*Signed=*/0),
                                 SourceLocation()),
          {}, AlignedAttr::GNU_aligned));
    }
    GlobalizedRD->addDecl(Field);
    MappedDeclsFields.try_emplace(VD, Field);
  }
  GlobalizedRD->completeDefinition();
  return GlobalizedRD;
}

/// Get the list of variables that can escape their declaration context.
class CheckVarsEscapingDeclContext final
    : public ConstStmtVisitor<CheckVarsEscapingDeclContext> {
  CodeGenFunction &CGF;
  llvm::SetVector<const ValueDecl *> EscapedDecls;
  llvm::SetVector<const ValueDecl *> EscapedVariableLengthDecls;
  llvm::SetVector<const ValueDecl *> DelayedVariableLengthDecls;
  llvm::SmallPtrSet<const Decl *, 4> EscapedParameters;
  RecordDecl *GlobalizedRD = nullptr;
  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
  bool AllEscaped = false;
  bool IsForCombinedParallelRegion = false;

  void markAsEscaped(const ValueDecl *VD) {
    // Do not globalize declare target variables.
    if (!isa<VarDecl>(VD) ||
        OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD))
      return;
    VD = cast<ValueDecl>(VD->getCanonicalDecl());
    // Use user-specified allocation.
    if (VD->hasAttrs() && VD->hasAttr<OMPAllocateDeclAttr>())
      return;
    // Variables captured by value must be globalized.
    bool IsCaptured = false;
    if (auto *CSI = CGF.CapturedStmtInfo) {
      if (const FieldDecl *FD = CSI->lookup(cast<VarDecl>(VD))) {
        // Check if we need to capture the variable that was already captured
        // by value in the outer region.
        IsCaptured = true;
        if (!IsForCombinedParallelRegion) {
          if (!FD->hasAttrs())
            return;
          const auto *Attr = FD->getAttr<OMPCaptureKindAttr>();
          if (!Attr)
            return;
          if (((Attr->getCaptureKind() != OMPC_map) &&
               !isOpenMPPrivate(Attr->getCaptureKind())) ||
              ((Attr->getCaptureKind() == OMPC_map) &&
               !FD->getType()->isAnyPointerType()))
            return;
        }
        if (!FD->getType()->isReferenceType()) {
          assert(!VD->getType()->isVariablyModifiedType() &&
                 "Parameter captured by value with variably modified type");
          EscapedParameters.insert(VD);
        } else if (!IsForCombinedParallelRegion) {
          return;
        }
      }
    }
    if ((!CGF.CapturedStmtInfo ||
         (IsForCombinedParallelRegion && CGF.CapturedStmtInfo)) &&
        VD->getType()->isReferenceType())
      // Do not globalize variables with reference type.
      return;
    if (VD->getType()->isVariablyModifiedType()) {
      // If not captured at the target region level then mark the escaped
      // variable as delayed.
      if (IsCaptured)
        EscapedVariableLengthDecls.insert(VD);
      else
        DelayedVariableLengthDecls.insert(VD);
    } else
      EscapedDecls.insert(VD);
  }

  void VisitValueDecl(const ValueDecl *VD) {
    if (VD->getType()->isLValueReferenceType())
      markAsEscaped(VD);
    if (const auto *VarD = dyn_cast<VarDecl>(VD)) {
      if (!isa<ParmVarDecl>(VarD) && VarD->hasInit()) {
        const bool SavedAllEscaped = AllEscaped;
        AllEscaped = VD->getType()->isLValueReferenceType();
        Visit(VarD->getInit());
        AllEscaped = SavedAllEscaped;
      }
    }
  }
  void VisitOpenMPCapturedStmt(const CapturedStmt *S,
                               ArrayRef<OMPClause *> Clauses,
                               bool IsCombinedParallelRegion) {
    if (!S)
      return;
    for (const CapturedStmt::Capture &C : S->captures()) {
      if (C.capturesVariable() && !C.capturesVariableByCopy()) {
        const ValueDecl *VD = C.getCapturedVar();
        bool SavedIsForCombinedParallelRegion = IsForCombinedParallelRegion;
        if (IsCombinedParallelRegion) {
          // Check if the variable is privatized in the combined construct and
          // those private copies must be shared in the inner parallel
          // directive.
          IsForCombinedParallelRegion = false;
          for (const OMPClause *C : Clauses) {
            if (!isOpenMPPrivate(C->getClauseKind()) ||
                C->getClauseKind() == OMPC_reduction ||
                C->getClauseKind() == OMPC_linear ||
                C->getClauseKind() == OMPC_private)
              continue;
            ArrayRef<const Expr *> Vars;
            if (const auto *PC = dyn_cast<OMPFirstprivateClause>(C))
              Vars = PC->getVarRefs();
            else if (const auto *PC = dyn_cast<OMPLastprivateClause>(C))
              Vars = PC->getVarRefs();
            else
              llvm_unreachable("Unexpected clause.");
            for (const auto *E : Vars) {
              const Decl *D =
                  cast<DeclRefExpr>(E)->getDecl()->getCanonicalDecl();
              if (D == VD->getCanonicalDecl()) {
                IsForCombinedParallelRegion = true;
                break;
              }
            }
            if (IsForCombinedParallelRegion)
              break;
          }
        }
        markAsEscaped(VD);
        if (isa<OMPCapturedExprDecl>(VD))
          VisitValueDecl(VD);
        IsForCombinedParallelRegion = SavedIsForCombinedParallelRegion;
      }
    }
  }

  void buildRecordForGlobalizedVars(bool IsInTTDRegion) {
    assert(!GlobalizedRD &&
           "Record for globalized variables is built already.");
    ArrayRef<const ValueDecl *> EscapedDeclsForParallel, EscapedDeclsForTeams;
    unsigned WarpSize = CGF.getTarget().getGridValue().GV_Warp_Size;
    if (IsInTTDRegion)
      EscapedDeclsForTeams = EscapedDecls.getArrayRef();
    else
      EscapedDeclsForParallel = EscapedDecls.getArrayRef();
    GlobalizedRD = ::buildRecordForGlobalizedVars(
        CGF.getContext(), EscapedDeclsForParallel, EscapedDeclsForTeams,
        MappedDeclsFields, WarpSize);
  }

public:
  CheckVarsEscapingDeclContext(CodeGenFunction &CGF,
                               ArrayRef<const ValueDecl *> TeamsReductions)
      : CGF(CGF), EscapedDecls(TeamsReductions.begin(), TeamsReductions.end()) {
  }
  virtual ~CheckVarsEscapingDeclContext() = default;
  void VisitDeclStmt(const DeclStmt *S) {
    if (!S)
      return;
    for (const Decl *D : S->decls())
      if (const auto *VD = dyn_cast_or_null<ValueDecl>(D))
        VisitValueDecl(VD);
  }
  void VisitOMPExecutableDirective(const OMPExecutableDirective *D) {
    if (!D)
      return;
    if (!D->hasAssociatedStmt())
      return;
    if (const auto *S =
            dyn_cast_or_null<CapturedStmt>(D->getAssociatedStmt())) {
      // Do not analyze directives that do not actually require capturing,
      // like `omp for` or `omp simd` directives.
      llvm::SmallVector<OpenMPDirectiveKind, 4> CaptureRegions;
      getOpenMPCaptureRegions(CaptureRegions, D->getDirectiveKind());
      if (CaptureRegions.size() == 1 && CaptureRegions.back() == OMPD_unknown) {
        VisitStmt(S->getCapturedStmt());
        return;
      }
      VisitOpenMPCapturedStmt(
          S, D->clauses(),
          CaptureRegions.back() == OMPD_parallel &&
              isOpenMPDistributeDirective(D->getDirectiveKind()));
    }
  }
  void VisitCapturedStmt(const CapturedStmt *S) {
    if (!S)
      return;
    for (const CapturedStmt::Capture &C : S->captures()) {
      if (C.capturesVariable() && !C.capturesVariableByCopy()) {
        const ValueDecl *VD = C.getCapturedVar();
        markAsEscaped(VD);
        if (isa<OMPCapturedExprDecl>(VD))
          VisitValueDecl(VD);
      }
    }
  }
  void VisitLambdaExpr(const LambdaExpr *E) {
    if (!E)
      return;
    for (const LambdaCapture &C : E->captures()) {
      if (C.capturesVariable()) {
        if (C.getCaptureKind() == LCK_ByRef) {
          const ValueDecl *VD = C.getCapturedVar();
          markAsEscaped(VD);
          if (E->isInitCapture(&C) || isa<OMPCapturedExprDecl>(VD))
            VisitValueDecl(VD);
        }
      }
    }
  }
  void VisitBlockExpr(const BlockExpr *E) {
    if (!E)
      return;
    for (const BlockDecl::Capture &C : E->getBlockDecl()->captures()) {
      if (C.isByRef()) {
        const VarDecl *VD = C.getVariable();
        markAsEscaped(VD);
        if (isa<OMPCapturedExprDecl>(VD) || VD->isInitCapture())
          VisitValueDecl(VD);
      }
    }
  }
  void VisitCallExpr(const CallExpr *E) {
    if (!E)
      return;
    for (const Expr *Arg : E->arguments()) {
      if (!Arg)
        continue;
      if (Arg->isLValue()) {
        const bool SavedAllEscaped = AllEscaped;
        AllEscaped = true;
        Visit(Arg);
        AllEscaped = SavedAllEscaped;
      } else {
        Visit(Arg);
      }
    }
    Visit(E->getCallee());
  }
  void VisitDeclRefExpr(const DeclRefExpr *E) {
    if (!E)
      return;
    const ValueDecl *VD = E->getDecl();
    if (AllEscaped)
      markAsEscaped(VD);
    if (isa<OMPCapturedExprDecl>(VD))
      VisitValueDecl(VD);
    else if (VD->isInitCapture())
      VisitValueDecl(VD);
  }
  void VisitUnaryOperator(const UnaryOperator *E) {
    if (!E)
      return;
    if (E->getOpcode() == UO_AddrOf) {
      const bool SavedAllEscaped = AllEscaped;
      AllEscaped = true;
      Visit(E->getSubExpr());
      AllEscaped = SavedAllEscaped;
    } else {
      Visit(E->getSubExpr());
    }
  }
  void VisitImplicitCastExpr(const ImplicitCastExpr *E) {
    if (!E)
      return;
    if (E->getCastKind() == CK_ArrayToPointerDecay) {
      const bool SavedAllEscaped = AllEscaped;
      AllEscaped = true;
      Visit(E->getSubExpr());
      AllEscaped = SavedAllEscaped;
    } else {
      Visit(E->getSubExpr());
    }
  }
  void VisitExpr(const Expr *E) {
    if (!E)
      return;
    bool SavedAllEscaped = AllEscaped;
    if (!E->isLValue())
      AllEscaped = false;
    for (const Stmt *Child : E->children())
      if (Child)
        Visit(Child);
    AllEscaped = SavedAllEscaped;
  }
  void VisitStmt(const Stmt *S) {
    if (!S)
      return;
    for (const Stmt *Child : S->children())
      if (Child)
        Visit(Child);
  }

  /// Returns the record that handles all the escaped local variables and is
  /// used instead of their original storage.
  const RecordDecl *getGlobalizedRecord(bool IsInTTDRegion) {
    if (!GlobalizedRD)
      buildRecordForGlobalizedVars(IsInTTDRegion);
    return GlobalizedRD;
  }

  /// Returns the field in the globalized record for the escaped variable.
  const FieldDecl *getFieldForGlobalizedVar(const ValueDecl *VD) const {
    assert(GlobalizedRD &&
           "Record for globalized variables must be generated already.");
    return MappedDeclsFields.lookup(VD);
  }

  /// Returns the list of the escaped local variables/parameters.
  ArrayRef<const ValueDecl *> getEscapedDecls() const {
    return EscapedDecls.getArrayRef();
  }

  /// Returns the set of the escaped local variables that are actually
  /// parameters passed by value.
  const llvm::SmallPtrSetImpl<const Decl *> &getEscapedParameters() const {
    return EscapedParameters;
  }

  /// Returns the list of the escaped variables with the variably modified
  /// types.
  ArrayRef<const ValueDecl *> getEscapedVariableLengthDecls() const {
    return EscapedVariableLengthDecls.getArrayRef();
  }

  /// Returns the list of the delayed variables with the variably modified
  /// types.
  ArrayRef<const ValueDecl *> getDelayedVariableLengthDecls() const {
    return DelayedVariableLengthDecls.getArrayRef();
  }
};
} // anonymous namespace

CGOpenMPRuntimeGPU::ExecutionMode
CGOpenMPRuntimeGPU::getExecutionMode() const {
  return CurrentExecutionMode;
}

CGOpenMPRuntimeGPU::DataSharingMode
CGOpenMPRuntimeGPU::getDataSharingMode() const {
  return CurrentDataSharingMode;
}

/// Check for inner (nested) SPMD construct, if any
static bool hasNestedSPMDDirective(ASTContext &Ctx,
                                   const OMPExecutableDirective &D) {
  const auto *CS = D.getInnermostCapturedStmt();
  const auto *Body =
      CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true);
  const Stmt *ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);

  if (const auto *NestedDir =
          dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
    OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
    switch (D.getDirectiveKind()) {
    case OMPD_target:
      if (isOpenMPParallelDirective(DKind))
        return true;
      if (DKind == OMPD_teams) {
        Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
            /*IgnoreCaptured=*/true);
        if (!Body)
          return false;
        ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
        if (const auto *NND =
                dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
          DKind = NND->getDirectiveKind();
          if (isOpenMPParallelDirective(DKind))
            return true;
        }
      }
      return false;
    case OMPD_target_teams:
      return isOpenMPParallelDirective(DKind);
    case OMPD_target_simd:
    case OMPD_target_parallel:
    case OMPD_target_parallel_for:
    case OMPD_target_parallel_for_simd:
    case OMPD_target_teams_distribute:
    case OMPD_target_teams_distribute_simd:
    case OMPD_target_teams_distribute_parallel_for:
    case OMPD_target_teams_distribute_parallel_for_simd:
    case OMPD_parallel:
    case OMPD_for:
    case OMPD_parallel_for:
    case OMPD_parallel_master:
    case OMPD_parallel_sections:
    case OMPD_for_simd:
    case OMPD_parallel_for_simd:
    case OMPD_cancel:
    case OMPD_cancellation_point:
    case OMPD_ordered:
    case OMPD_threadprivate:
    case OMPD_allocate:
    case OMPD_task:
    case OMPD_simd:
    case OMPD_sections:
    case OMPD_section:
    case OMPD_single:
    case OMPD_master:
    case OMPD_critical:
    case OMPD_taskyield:
    case OMPD_barrier:
    case OMPD_taskwait:
    case OMPD_taskgroup:
    case OMPD_atomic:
    case OMPD_flush:
    case OMPD_depobj:
    case OMPD_scan:
    case OMPD_teams:
    case OMPD_target_data:
    case OMPD_target_exit_data:
    case OMPD_target_enter_data:
    case OMPD_distribute:
    case OMPD_distribute_simd:
    case OMPD_distribute_parallel_for:
    case OMPD_distribute_parallel_for_simd:
    case OMPD_teams_distribute:
    case OMPD_teams_distribute_simd:
    case OMPD_teams_distribute_parallel_for:
    case OMPD_teams_distribute_parallel_for_simd:
    case OMPD_target_update:
    case OMPD_declare_simd:
    case OMPD_declare_variant:
    case OMPD_begin_declare_variant:
    case OMPD_end_declare_variant:
    case OMPD_declare_target:
    case OMPD_end_declare_target:
    case OMPD_declare_reduction:
    case OMPD_declare_mapper:
    case OMPD_taskloop:
    case OMPD_taskloop_simd:
    case OMPD_master_taskloop:
    case OMPD_master_taskloop_simd:
    case OMPD_parallel_master_taskloop:
    case OMPD_parallel_master_taskloop_simd:
    case OMPD_requires:
    case OMPD_unknown:
    default:
      llvm_unreachable("Unexpected directive.");
    }
  }

  return false;
}

static bool supportsSPMDExecutionMode(ASTContext &Ctx,
                                      const OMPExecutableDirective &D) {
  OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
  switch (DirectiveKind) {
  case OMPD_target:
  case OMPD_target_teams:
    return hasNestedSPMDDirective(Ctx, D);
  case OMPD_target_parallel_loop:
  case OMPD_target_parallel:
  case OMPD_target_parallel_for:
  case OMPD_target_parallel_for_simd:
  case OMPD_target_teams_distribute_parallel_for:
  case OMPD_target_teams_distribute_parallel_for_simd:
  case OMPD_target_simd:
  case OMPD_target_teams_distribute_simd:
    return true;
  case OMPD_target_teams_distribute:
    return false;
  case OMPD_target_teams_loop:
    // Whether this is true or not depends on how the directive will
    // eventually be emitted.
    if (auto *TTLD = dyn_cast<OMPTargetTeamsGenericLoopDirective>(&D))
      return TTLD->canBeParallelFor();
    return false;
  case OMPD_parallel:
  case OMPD_for:
  case OMPD_parallel_for:
  case OMPD_parallel_master:
  case OMPD_parallel_sections:
  case OMPD_for_simd:
  case OMPD_parallel_for_simd:
  case OMPD_cancel:
  case OMPD_cancellation_point:
  case OMPD_ordered:
  case OMPD_threadprivate:
  case OMPD_allocate:
  case OMPD_task:
  case OMPD_simd:
  case OMPD_sections:
  case OMPD_section:
  case OMPD_single:
  case OMPD_master:
  case OMPD_critical:
  case OMPD_taskyield:
  case OMPD_barrier:
  case OMPD_taskwait:
  case OMPD_taskgroup:
  case OMPD_atomic:
  case OMPD_flush:
  case OMPD_depobj:
  case OMPD_scan:
  case OMPD_teams:
  case OMPD_target_data:
  case OMPD_target_exit_data:
  case OMPD_target_enter_data:
  case OMPD_distribute:
  case OMPD_distribute_simd:
  case OMPD_distribute_parallel_for:
  case OMPD_distribute_parallel_for_simd:
  case OMPD_teams_distribute:
  case OMPD_teams_distribute_simd:
  case OMPD_teams_distribute_parallel_for:
  case OMPD_teams_distribute_parallel_for_simd:
  case OMPD_target_update:
  case OMPD_declare_simd:
  case OMPD_declare_variant:
  case OMPD_begin_declare_variant:
  case OMPD_end_declare_variant:
  case OMPD_declare_target:
  case OMPD_end_declare_target:
  case OMPD_declare_reduction:
  case OMPD_declare_mapper:
  case OMPD_taskloop:
  case OMPD_taskloop_simd:
  case OMPD_master_taskloop:
  case OMPD_master_taskloop_simd:
  case OMPD_parallel_master_taskloop:
  case OMPD_parallel_master_taskloop_simd:
  case OMPD_requires:
  case OMPD_unknown:
  default:
    break;
  }
  llvm_unreachable(
      "Unknown programming model for OpenMP directive on NVPTX target.");
}

void CGOpenMPRuntimeGPU::emitNonSPMDKernel(const OMPExecutableDirective &D,
                                           StringRef ParentName,
                                           llvm::Function *&OutlinedFn,
                                           llvm::Constant *&OutlinedFnID,
                                           bool IsOffloadEntry,
                                           const RegionCodeGenTy &CodeGen) {
  ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode, EM_NonSPMD);
  EntryFunctionState EST;
  WrapperFunctionsMap.clear();

  [[maybe_unused]] bool IsBareKernel = D.getSingleClause<OMPXBareClause>();
  assert(!IsBareKernel && "bare kernel should not be at generic mode");

  // Emit target region as a standalone region.
  class NVPTXPrePostActionTy : public PrePostActionTy {
    CGOpenMPRuntimeGPU::EntryFunctionState &EST;
    const OMPExecutableDirective &D;

  public:
    NVPTXPrePostActionTy(CGOpenMPRuntimeGPU::EntryFunctionState &EST,
                         const OMPExecutableDirective &D)
        : EST(EST), D(D) {}
    void Enter(CodeGenFunction &CGF) override {
      auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
      RT.emitKernelInit(D, CGF, EST, /* IsSPMD */ false);
      // Skip target region initialization.
      RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
    }
    void Exit(CodeGenFunction &CGF) override {
      auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
      RT.clearLocThreadIdInsertPt(CGF);
      RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ false);
    }
  } Action(EST, D);
  CodeGen.setAction(Action);
  IsInTTDRegion = true;
  emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
                                   IsOffloadEntry, CodeGen);
  IsInTTDRegion = false;
}

void CGOpenMPRuntimeGPU::emitKernelInit(const OMPExecutableDirective &D,
                                        CodeGenFunction &CGF,
                                        EntryFunctionState &EST, bool IsSPMD) {
  llvm::OpenMPIRBuilder::TargetKernelDefaultAttrs Attrs;
  Attrs.ExecFlags =
      IsSPMD ? llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_SPMD
             : llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC;
  computeMinAndMaxThreadsAndTeams(D, CGF, Attrs);

  CGBuilderTy &Bld = CGF.Builder;
  Bld.restoreIP(OMPBuilder.createTargetInit(Bld, Attrs));
  if (!IsSPMD)
    emitGenericVarsProlog(CGF, EST.Loc);
}

void CGOpenMPRuntimeGPU::emitKernelDeinit(CodeGenFunction &CGF,
                                          EntryFunctionState &EST,
                                          bool IsSPMD) {
  if (!IsSPMD)
    emitGenericVarsEpilog(CGF);

  // This is temporary until we remove the fixed sized buffer.
  ASTContext &C = CGM.getContext();
  RecordDecl *StaticRD = C.buildImplicitRecord(
      "_openmp_teams_reduction_type_$_", RecordDecl::TagKind::Union);
  StaticRD->startDefinition();
  for (const RecordDecl *TeamReductionRec : TeamsReductions) {
    QualType RecTy = C.getRecordType(TeamReductionRec);
    auto *Field = FieldDecl::Create(
        C, StaticRD, SourceLocation(), SourceLocation(), nullptr, RecTy,
        C.getTrivialTypeSourceInfo(RecTy, SourceLocation()),
        /*BW=*/nullptr, /*Mutable=*/false,
        /*InitStyle=*/ICIS_NoInit);
    Field->setAccess(AS_public);
    StaticRD->addDecl(Field);
  }
  StaticRD->completeDefinition();
  QualType StaticTy = C.getRecordType(StaticRD);
  llvm::Type *LLVMReductionsBufferTy =
      CGM.getTypes().ConvertTypeForMem(StaticTy);
  const auto &DL = CGM.getModule().getDataLayout();
  uint64_t ReductionDataSize =
      TeamsReductions.empty()
          ? 0
          : DL.getTypeAllocSize(LLVMReductionsBufferTy).getFixedValue();
  CGBuilderTy &Bld = CGF.Builder;
  OMPBuilder.createTargetDeinit(Bld, ReductionDataSize,
                                C.getLangOpts().OpenMPCUDAReductionBufNum);
  TeamsReductions.clear();
}

void CGOpenMPRuntimeGPU::emitSPMDKernel(const OMPExecutableDirective &D,
                                        StringRef ParentName,
                                        llvm::Function *&OutlinedFn,
                                        llvm::Constant *&OutlinedFnID,
                                        bool IsOffloadEntry,
                                        const RegionCodeGenTy &CodeGen) {
  ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode, EM_SPMD);
  EntryFunctionState EST;

  bool IsBareKernel = D.getSingleClause<OMPXBareClause>();

  // Emit target region as a standalone region.
  class NVPTXPrePostActionTy : public PrePostActionTy {
    CGOpenMPRuntimeGPU &RT;
    CGOpenMPRuntimeGPU::EntryFunctionState &EST;
    bool IsBareKernel;
    DataSharingMode Mode;
    const OMPExecutableDirective &D;

  public:
    NVPTXPrePostActionTy(CGOpenMPRuntimeGPU &RT,
                         CGOpenMPRuntimeGPU::EntryFunctionState &EST,
                         bool IsBareKernel, const OMPExecutableDirective &D)
        : RT(RT), EST(EST), IsBareKernel(IsBareKernel),
          Mode(RT.CurrentDataSharingMode), D(D) {}
    void Enter(CodeGenFunction &CGF) override {
      if (IsBareKernel) {
        RT.CurrentDataSharingMode = DataSharingMode::DS_CUDA;
        return;
      }
      RT.emitKernelInit(D, CGF, EST, /* IsSPMD */ true);
      // Skip target region initialization.
      RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
    }
    void Exit(CodeGenFunction &CGF) override {
      if (IsBareKernel) {
        RT.CurrentDataSharingMode = Mode;
        return;
      }
      RT.clearLocThreadIdInsertPt(CGF);
      RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ true);
    }
  } Action(*this, EST, IsBareKernel, D);
  CodeGen.setAction(Action);
  IsInTTDRegion = true;
  emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
                                   IsOffloadEntry, CodeGen);
  IsInTTDRegion = false;
}

void CGOpenMPRuntimeGPU::emitTargetOutlinedFunction(
    const OMPExecutableDirective &D, StringRef ParentName,
    llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
    bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
  if (!IsOffloadEntry) // Nothing to do.
    return;

  assert(!ParentName.empty() && "Invalid target region parent name!");

  bool Mode = supportsSPMDExecutionMode(CGM.getContext(), D);
  bool IsBareKernel = D.getSingleClause<OMPXBareClause>();
  if (Mode || IsBareKernel)
    emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
                   CodeGen);
  else
    emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
                      CodeGen);
}

CGOpenMPRuntimeGPU::CGOpenMPRuntimeGPU(CodeGenModule &CGM)
    : CGOpenMPRuntime(CGM) {
  llvm::OpenMPIRBuilderConfig Config(
      CGM.getLangOpts().OpenMPIsTargetDevice, isGPU(),
      CGM.getLangOpts().OpenMPOffloadMandatory,
      /*HasRequiresReverseOffload*/ false, /*HasRequiresUnifiedAddress*/ false,
      hasRequiresUnifiedSharedMemory(), /*HasRequiresDynamicAllocators*/ false);
  OMPBuilder.setConfig(Config);

  if (!CGM.getLangOpts().OpenMPIsTargetDevice)
    llvm_unreachable("OpenMP can only handle device code.");

  if (CGM.getLangOpts().OpenMPCUDAMode)
    CurrentDataSharingMode = CGOpenMPRuntimeGPU::DS_CUDA;

  llvm::OpenMPIRBuilder &OMPBuilder = getOMPBuilder();
  if (CGM.getLangOpts().NoGPULib || CGM.getLangOpts().OMPHostIRFile.empty())
    return;

  OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPTargetDebug,
                              "__omp_rtl_debug_kind");
  OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPTeamSubscription,
                              "__omp_rtl_assume_teams_oversubscription");
  OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPThreadSubscription,
                              "__omp_rtl_assume_threads_oversubscription");
  OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPNoThreadState,
                              "__omp_rtl_assume_no_thread_state");
  OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPNoNestedParallelism,
                              "__omp_rtl_assume_no_nested_parallelism");
}

void CGOpenMPRuntimeGPU::emitProcBindClause(CodeGenFunction &CGF,
                                            ProcBindKind ProcBind,
                                            SourceLocation Loc) {
  // Nothing to do.
}

void CGOpenMPRuntimeGPU::emitNumThreadsClause(CodeGenFunction &CGF,
                                              llvm::Value *NumThreads,
                                              SourceLocation Loc) {
  // Nothing to do.
}

void CGOpenMPRuntimeGPU::emitNumTeamsClause(CodeGenFunction &CGF,
                                            const Expr *NumTeams,
                                            const Expr *ThreadLimit,
                                            SourceLocation Loc) {}

llvm::Function *CGOpenMPRuntimeGPU::emitParallelOutlinedFunction(
    CodeGenFunction &CGF, const OMPExecutableDirective &D,
    const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind,
    const RegionCodeGenTy &CodeGen) {
  // Emit target region as a standalone region.
  bool PrevIsInTTDRegion = IsInTTDRegion;
  IsInTTDRegion = false;
  auto *OutlinedFun =
      cast<llvm::Function>(CGOpenMPRuntime::emitParallelOutlinedFunction(
          CGF, D, ThreadIDVar, InnermostKind, CodeGen));
  IsInTTDRegion = PrevIsInTTDRegion;
  if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD) {
    llvm::Function *WrapperFun =
        createParallelDataSharingWrapper(OutlinedFun, D);
    WrapperFunctionsMap[OutlinedFun] = WrapperFun;
  }

  return OutlinedFun;
}

/// Get the list of lastprivate variables from the teams distribute ... or
/// teams {distribute ...} directives.
static void
getDistributeLastprivateVars(ASTContext &Ctx, const OMPExecutableDirective &D,
                             llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
  assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
         "expected teams directive.");
  const OMPExecutableDirective *Dir = &D;
  if (!isOpenMPDistributeDirective(D.getDirectiveKind())) {
    if (const Stmt *S = CGOpenMPRuntime::getSingleCompoundChild(
            Ctx,
            D.getInnermostCapturedStmt()->getCapturedStmt()->IgnoreContainers(
                /*IgnoreCaptured=*/true))) {
      Dir = dyn_cast_or_null<OMPExecutableDirective>(S);
      if (Dir && !isOpenMPDistributeDirective(Dir->getDirectiveKind()))
        Dir = nullptr;
    }
  }
  if (!Dir)
    return;
  for (const auto *C : Dir->getClausesOfKind<OMPLastprivateClause>()) {
    for (const Expr *E : C->getVarRefs())
      Vars.push_back(getPrivateItem(E));
  }
}

/// Get the list of reduction variables from the teams ... directives.
static void
getTeamsReductionVars(ASTContext &Ctx, const OMPExecutableDirective &D,
                      llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
  assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
         "expected teams directive.");
  for (const auto *C : D.getClausesOfKind<OMPReductionClause>()) {
    for (const Expr *E : C->privates())
      Vars.push_back(getPrivateItem(E));
  }
}

llvm::Function *CGOpenMPRuntimeGPU::emitTeamsOutlinedFunction(
    CodeGenFunction &CGF, const OMPExecutableDirective &D,
    const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind,
    const RegionCodeGenTy &CodeGen) {
  SourceLocation Loc = D.getBeginLoc();

  const RecordDecl *GlobalizedRD = nullptr;
  llvm::SmallVector<const ValueDecl *, 4> LastPrivatesReductions;
  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
  unsigned WarpSize = CGM.getTarget().getGridValue().GV_Warp_Size;
  // Globalize team reduction variables unconditionally in all modes.
  if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
    getTeamsReductionVars(CGM.getContext(), D, LastPrivatesReductions);
  if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {
    getDistributeLastprivateVars(CGM.getContext(), D, LastPrivatesReductions);
    if (!LastPrivatesReductions.empty()) {
      GlobalizedRD = ::buildRecordForGlobalizedVars(
          CGM.getContext(), {}, LastPrivatesReductions, MappedDeclsFields,
          WarpSize);
    }
  } else if (!LastPrivatesReductions.empty()) {
    assert(!TeamAndReductions.first &&
           "Previous team declaration is not expected.");
    TeamAndReductions.first = D.getCapturedStmt(OMPD_teams)->getCapturedDecl();
    std::swap(TeamAndReductions.second, LastPrivatesReductions);
  }

  // Emit target region as a standalone region.
  class NVPTXPrePostActionTy : public PrePostActionTy {
    SourceLocation &Loc;
    const RecordDecl *GlobalizedRD;
    llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
        &MappedDeclsFields;

  public:
    NVPTXPrePostActionTy(
        SourceLocation &Loc, const RecordDecl *GlobalizedRD,
        llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
            &MappedDeclsFields)
        : Loc(Loc), GlobalizedRD(GlobalizedRD),
          MappedDeclsFields(MappedDeclsFields) {}
    void Enter(CodeGenFunction &CGF) override {
      auto &Rt =
          static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
      if (GlobalizedRD) {
        auto I = Rt.FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
        I->getSecond().MappedParams =
            std::make_unique<CodeGenFunction::OMPMapVars>();
        DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
        for (const auto &Pair : MappedDeclsFields) {
          assert(Pair.getFirst()->isCanonicalDecl() &&
                 "Expected canonical declaration");
          Data.insert(std::make_pair(Pair.getFirst(), MappedVarData()));
        }
      }
      Rt.emitGenericVarsProlog(CGF, Loc);
    }
    void Exit(CodeGenFunction &CGF) override {
      static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime())
          .emitGenericVarsEpilog(CGF);
    }
  } Action(Loc, GlobalizedRD, MappedDeclsFields);
  CodeGen.setAction(Action);
  llvm::Function *OutlinedFun = CGOpenMPRuntime::emitTeamsOutlinedFunction(
      CGF, D, ThreadIDVar, InnermostKind, CodeGen);

  return OutlinedFun;
}

void CGOpenMPRuntimeGPU::emitGenericVarsProlog(CodeGenFunction &CGF,
                                               SourceLocation Loc) {
  if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic)
    return;

  CGBuilderTy &Bld = CGF.Builder;

  const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
  if (I == FunctionGlobalizedDecls.end())
    return;

  for (auto &Rec : I->getSecond().LocalVarData) {
    const auto *VD = cast<VarDecl>(Rec.first);
    bool EscapedParam = I->getSecond().EscapedParameters.count(Rec.first);
    QualType VarTy = VD->getType();

    // Get the local allocation of a firstprivate variable before sharing.
    llvm::Value *ParValue;
    if (EscapedParam) {
      LValue ParLVal =
          CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(VD), VD->getType());
      ParValue = CGF.EmitLoadOfScalar(ParLVal, Loc);
    }

    // Allocate space for the variable to be globalized.
    llvm::Value *AllocArgs[] = {CGF.getTypeSize(VD->getType())};
    llvm::CallBase *VoidPtr =
        CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                                CGM.getModule(), OMPRTL___kmpc_alloc_shared),
                            AllocArgs, VD->getName());
    // FIXME: We should use the variable's actual alignment as an argument.
    VoidPtr->addRetAttr(llvm::Attribute::get(
        CGM.getLLVMContext(), llvm::Attribute::Alignment,
        CGM.getContext().getTargetInfo().getNewAlign() / 8));

    // Cast the void pointer and get the address of the globalized variable.
    llvm::Value *CastedVoidPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
        VoidPtr, Bld.getPtrTy(0), VD->getName() + "_on_stack");
    LValue VarAddr =
        CGF.MakeNaturalAlignPointeeRawAddrLValue(CastedVoidPtr, VarTy);
    Rec.second.PrivateAddr = VarAddr.getAddress();
    Rec.second.GlobalizedVal = VoidPtr;

    // Assign the local allocation to the newly globalized location.
    if (EscapedParam) {
      CGF.EmitStoreOfScalar(ParValue, VarAddr);
      I->getSecond().MappedParams->setVarAddr(CGF, VD, VarAddr.getAddress());
    }
    if (auto *DI = CGF.getDebugInfo())
      VoidPtr->setDebugLoc(DI->SourceLocToDebugLoc(VD->getLocation()));
  }

  for (const auto *ValueD : I->getSecond().EscapedVariableLengthDecls) {
    const auto *VD = cast<VarDecl>(ValueD);
    std::pair<llvm::Value *, llvm::Value *> AddrSizePair =
        getKmpcAllocShared(CGF, VD);
    I->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back(AddrSizePair);
    LValue Base = CGF.MakeAddrLValue(AddrSizePair.first, VD->getType(),
                                     CGM.getContext().getDeclAlign(VD),
                                     AlignmentSource::Decl);
    I->getSecond().MappedParams->setVarAddr(CGF, VD, Base.getAddress());
  }
  I->getSecond().MappedParams->apply(CGF);
}
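
// Illustrative sketch (not the code emitted verbatim): in generic data-sharing
// mode, for a globalized scalar `int x` the prolog above conceptually produces
// the equivalent of the following runtime calls, with the matching release
// performed by emitGenericVarsEpilog below:
//
//   void *x_ptr = __kmpc_alloc_shared(sizeof(int)); // team-visible storage
//   // ... all uses of `x` in the region are redirected through x_ptr ...
//   __kmpc_free_shared(x_ptr, sizeof(int));         // in the epilog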

bool CGOpenMPRuntimeGPU::isDelayedVariableLengthDecl(CodeGenFunction &CGF,
                                                     const VarDecl *VD) const {
  const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
  if (I == FunctionGlobalizedDecls.end())
    return false;

  // Check if the variable declaration is delayed:
  return llvm::is_contained(I->getSecond().DelayedVariableLengthDecls, VD);
}

std::pair<llvm::Value *, llvm::Value *>
CGOpenMPRuntimeGPU::getKmpcAllocShared(CodeGenFunction &CGF,
                                       const VarDecl *VD) {
  CGBuilderTy &Bld = CGF.Builder;

  // Compute size and alignment.
  llvm::Value *Size = CGF.getTypeSize(VD->getType());
  CharUnits Align = CGM.getContext().getDeclAlign(VD);
  Size = Bld.CreateNUWAdd(
      Size, llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity() - 1));
  llvm::Value *AlignVal =
      llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity());
  Size = Bld.CreateUDiv(Size, AlignVal);
  Size = Bld.CreateNUWMul(Size, AlignVal);

  // Allocate space for this VLA object to be globalized.
  llvm::Value *AllocArgs[] = {Size};
  llvm::CallBase *VoidPtr =
      CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                              CGM.getModule(), OMPRTL___kmpc_alloc_shared),
                          AllocArgs, VD->getName());
  VoidPtr->addRetAttr(llvm::Attribute::get(
      CGM.getLLVMContext(), llvm::Attribute::Alignment, Align.getQuantity()));

  return std::make_pair(VoidPtr, Size);
}

void CGOpenMPRuntimeGPU::getKmpcFreeShared(
    CodeGenFunction &CGF,
    const std::pair<llvm::Value *, llvm::Value *> &AddrSizePair) {
  // Deallocate the memory for each globalized VLA object.
  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                          CGM.getModule(), OMPRTL___kmpc_free_shared),
                      {AddrSizePair.first, AddrSizePair.second});
}

void CGOpenMPRuntimeGPU::emitGenericVarsEpilog(CodeGenFunction &CGF) {
  if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic)
    return;

  const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
  if (I != FunctionGlobalizedDecls.end()) {
    // Deallocate the memory for each globalized VLA object that was
    // globalized in the prolog (i.e. emitGenericVarsProlog).
    for (const auto &AddrSizePair :
         llvm::reverse(I->getSecond().EscapedVariableLengthDeclsAddrs)) {
      CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                              CGM.getModule(), OMPRTL___kmpc_free_shared),
                          {AddrSizePair.first, AddrSizePair.second});
    }
    // Deallocate the memory for each globalized value.
    for (auto &Rec : llvm::reverse(I->getSecond().LocalVarData)) {
      const auto *VD = cast<VarDecl>(Rec.first);
      I->getSecond().MappedParams->restore(CGF);

      llvm::Value *FreeArgs[] = {Rec.second.GlobalizedVal,
                                 CGF.getTypeSize(VD->getType())};
      CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                              CGM.getModule(), OMPRTL___kmpc_free_shared),
                          FreeArgs);
    }
  }
}

void CGOpenMPRuntimeGPU::emitTeamsCall(CodeGenFunction &CGF,
                                       const OMPExecutableDirective &D,
                                       SourceLocation Loc,
                                       llvm::Function *OutlinedFn,
                                       ArrayRef<llvm::Value *> CapturedVars) {
  if (!CGF.HaveInsertPoint())
    return;

  bool IsBareKernel = D.getSingleClause<OMPXBareClause>();

  RawAddress ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
                                                         /*Name=*/".zero.addr");
  CGF.Builder.CreateStore(CGF.Builder.getInt32(/*C*/ 0), ZeroAddr);
  llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
  // We don't emit any thread id function call in a bare kernel, but because
  // the outlined function has a pointer argument, we emit a nullptr here.
  if (IsBareKernel)
    OutlinedFnArgs.push_back(llvm::ConstantPointerNull::get(CGM.VoidPtrTy));
  else
    OutlinedFnArgs.push_back(emitThreadIDAddress(CGF, Loc).emitRawPointer(CGF));
  OutlinedFnArgs.push_back(ZeroAddr.getPointer());
  OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
  emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
}

void CGOpenMPRuntimeGPU::emitParallelCall(CodeGenFunction &CGF,
                                          SourceLocation Loc,
                                          llvm::Function *OutlinedFn,
                                          ArrayRef<llvm::Value *> CapturedVars,
                                          const Expr *IfCond,
                                          llvm::Value *NumThreads) {
  if (!CGF.HaveInsertPoint())
    return;

  auto &&ParallelGen = [this, Loc, OutlinedFn, CapturedVars, IfCond,
                        NumThreads](CodeGenFunction &CGF,
                                    PrePostActionTy &Action) {
    CGBuilderTy &Bld = CGF.Builder;
    llvm::Value *NumThreadsVal = NumThreads;
    llvm::Function *WFn = WrapperFunctionsMap[OutlinedFn];
    llvm::Value *ID = llvm::ConstantPointerNull::get(CGM.Int8PtrTy);
    if (WFn)
      ID = Bld.CreateBitOrPointerCast(WFn, CGM.Int8PtrTy);
    llvm::Value *FnPtr = Bld.CreateBitOrPointerCast(OutlinedFn, CGM.Int8PtrTy);

    // Create a private scope that will globalize the arguments
    // passed from the outside of the target region.
    // TODO: Is that needed?
    CodeGenFunction::OMPPrivateScope PrivateArgScope(CGF);

    Address CapturedVarsAddrs = CGF.CreateDefaultAlignTempAlloca(
        llvm::ArrayType::get(CGM.VoidPtrTy, CapturedVars.size()),
        "captured_vars_addrs");
    // There's something to share.
    if (!CapturedVars.empty()) {
      // Prepare for parallel region. Indicate the outlined function.
      ASTContext &Ctx = CGF.getContext();
      unsigned Idx = 0;
      for (llvm::Value *V : CapturedVars) {
        Address Dst = Bld.CreateConstArrayGEP(CapturedVarsAddrs, Idx);
        llvm::Value *PtrV;
        if (V->getType()->isIntegerTy())
          PtrV = Bld.CreateIntToPtr(V, CGF.VoidPtrTy);
        else
          PtrV = Bld.CreatePointerBitCastOrAddrSpaceCast(V, CGF.VoidPtrTy);
        CGF.EmitStoreOfScalar(PtrV, Dst, /*Volatile=*/false,
                              Ctx.getPointerType(Ctx.VoidPtrTy));
        ++Idx;
      }
    }

    llvm::Value *IfCondVal = nullptr;
    if (IfCond)
      IfCondVal = Bld.CreateIntCast(CGF.EvaluateExprAsBool(IfCond), CGF.Int32Ty,
                                    /* isSigned */ false);
    else
      IfCondVal = llvm::ConstantInt::get(CGF.Int32Ty, 1);

    if (!NumThreadsVal)
      NumThreadsVal = llvm::ConstantInt::get(CGF.Int32Ty, -1);
    else
      NumThreadsVal = Bld.CreateZExtOrTrunc(NumThreadsVal, CGF.Int32Ty);

    assert(IfCondVal && "Expected a value");
    llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
    llvm::Value *Args[] = {
        RTLoc,
        getThreadID(CGF, Loc),
        IfCondVal,
        NumThreadsVal,
        llvm::ConstantInt::get(CGF.Int32Ty, -1),
        FnPtr,
        ID,
        Bld.CreateBitOrPointerCast(CapturedVarsAddrs.emitRawPointer(CGF),
                                   CGF.VoidPtrPtrTy),
        llvm::ConstantInt::get(CGM.SizeTy, CapturedVars.size())};
    CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                            CGM.getModule(), OMPRTL___kmpc_parallel_51),
                        Args);
  };

  RegionCodeGenTy RCG(ParallelGen);
  RCG(CGF);
}

void CGOpenMPRuntimeGPU::syncCTAThreads(CodeGenFunction &CGF) {
  // Always emit simple barriers!
  if (!CGF.HaveInsertPoint())
    return;
  // Build call __kmpc_barrier_simple_spmd(nullptr, 0);
  // This function does not use parameters, so we can emit just default values.
  llvm::Value *Args[] = {
      llvm::ConstantPointerNull::get(
          cast<llvm::PointerType>(getIdentTyPointerTy())),
      llvm::ConstantInt::get(CGF.Int32Ty, /*V=*/0, /*isSigned=*/true)};
  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                          CGM.getModule(), OMPRTL___kmpc_barrier_simple_spmd),
                      Args);
}

void CGOpenMPRuntimeGPU::emitBarrierCall(CodeGenFunction &CGF,
                                         SourceLocation Loc,
                                         OpenMPDirectiveKind Kind, bool,
                                         bool) {
  // Always emit simple barriers!
  if (!CGF.HaveInsertPoint())
    return;
  // Build call __kmpc_cancel_barrier(loc, thread_id);
  unsigned Flags = getDefaultFlagsForBarriers(Kind);
  llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc, Flags),
                         getThreadID(CGF, Loc)};

  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                          CGM.getModule(), OMPRTL___kmpc_barrier),
                      Args);
}

void CGOpenMPRuntimeGPU::emitCriticalRegion(
    CodeGenFunction &CGF, StringRef CriticalName,
    const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc,
    const Expr *Hint) {
  llvm::BasicBlock *LoopBB = CGF.createBasicBlock("omp.critical.loop");
  llvm::BasicBlock *TestBB = CGF.createBasicBlock("omp.critical.test");
  llvm::BasicBlock *SyncBB = CGF.createBasicBlock("omp.critical.sync");
  llvm::BasicBlock *BodyBB = CGF.createBasicBlock("omp.critical.body");
  llvm::BasicBlock *ExitBB = CGF.createBasicBlock("omp.critical.exit");

  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());

  // Get the mask of active threads in the warp.
  llvm::Value *Mask = CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
      CGM.getModule(), OMPRTL___kmpc_warp_active_thread_mask));
  // Fetch team-local id of the thread.
  llvm::Value *ThreadID = RT.getGPUThreadID(CGF);

  // Get the width of the team.
  llvm::Value *TeamWidth = RT.getGPUNumThreads(CGF);

  // Initialize the counter variable for the loop.
  QualType Int32Ty =
      CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/0);
  Address Counter = CGF.CreateMemTemp(Int32Ty, "critical_counter");
  LValue CounterLVal = CGF.MakeAddrLValue(Counter, Int32Ty);
  CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.Int32Ty), CounterLVal,
                        /*isInit=*/true);

  // Block checks if loop counter exceeds upper bound.
  CGF.EmitBlock(LoopBB);
  llvm::Value *CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
  llvm::Value *CmpLoopBound = CGF.Builder.CreateICmpSLT(CounterVal, TeamWidth);
  CGF.Builder.CreateCondBr(CmpLoopBound, TestBB, ExitBB);

  // Block tests which single thread should execute the region, and which
  // threads should go straight to the synchronisation point.
  CGF.EmitBlock(TestBB);
  CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
  llvm::Value *CmpThreadToCounter =
      CGF.Builder.CreateICmpEQ(ThreadID, CounterVal);
  CGF.Builder.CreateCondBr(CmpThreadToCounter, BodyBB, SyncBB);

  // Block emits the body of the critical region.
  CGF.EmitBlock(BodyBB);

  // Output the critical statement.
  CGOpenMPRuntime::emitCriticalRegion(CGF, CriticalName, CriticalOpGen, Loc,
                                      Hint);

  // After the body surrounded by the critical region, the single executing
  // thread will jump to the synchronisation point.
  // Block waits for all threads in current team to finish then increments the
  // counter variable and returns to the loop.
  CGF.EmitBlock(SyncBB);
  // Reconverge active threads in the warp.
  (void)CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                                CGM.getModule(), OMPRTL___kmpc_syncwarp),
                            Mask);

  llvm::Value *IncCounterVal =
      CGF.Builder.CreateNSWAdd(CounterVal, CGF.Builder.getInt32(1));
  CGF.EmitStoreOfScalar(IncCounterVal, CounterLVal);
  CGF.EmitBranch(LoopBB);

  // Block that is reached when all threads in the team complete the region.
  CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
}
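
// A rough sketch, in C-like pseudocode rather than the emitted IR, of the
// control flow produced by emitCriticalRegion above. Names such as thread_id
// and team_width are illustrative placeholders for the values returned by
// getGPUThreadID and getGPUNumThreads:
//
//   uint64_t mask = __kmpc_warp_active_thread_mask();
//   for (uint32_t counter = 0; counter < team_width; ++counter) {
//     if (thread_id == counter) {
//       // body of the critical region (emitted via the base class)
//     }
//     __kmpc_syncwarp(mask); // reconverge before the next iteration
//   }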

/// Cast value to the specified type.
static llvm::Value *castValueToType(CodeGenFunction &CGF, llvm::Value *Val,
                                    QualType ValTy, QualType CastTy,
                                    SourceLocation Loc) {
  assert(!CGF.getContext().getTypeSizeInChars(CastTy).isZero() &&
         "Cast type must be sized.");
  assert(!CGF.getContext().getTypeSizeInChars(ValTy).isZero() &&
         "Val type must be sized.");
  llvm::Type *LLVMCastTy = CGF.ConvertTypeForMem(CastTy);
  if (ValTy == CastTy)
    return Val;
  if (CGF.getContext().getTypeSizeInChars(ValTy) ==
      CGF.getContext().getTypeSizeInChars(CastTy))
    return CGF.Builder.CreateBitCast(Val, LLVMCastTy);
  if (CastTy->isIntegerType() && ValTy->isIntegerType())
    return CGF.Builder.CreateIntCast(Val, LLVMCastTy,
                                     CastTy->hasSignedIntegerRepresentation());
  Address CastItem = CGF.CreateMemTemp(CastTy);
  Address ValCastItem = CastItem.withElementType(Val->getType());
  CGF.EmitStoreOfScalar(Val, ValCastItem, /*Volatile=*/false, ValTy,
                        LValueBaseInfo(AlignmentSource::Type),
                        TBAAAccessInfo());
  return CGF.EmitLoadOfScalar(CastItem, /*Volatile=*/false, CastTy, Loc,
                              LValueBaseInfo(AlignmentSource::Type),
                              TBAAAccessInfo());
}

///
/// Design of OpenMP reductions on the GPU
///
/// Consider a typical OpenMP program with one or more reduction
/// clauses:
///
/// float foo;
/// double bar;
/// #pragma omp target teams distribute parallel for \
///             reduction(+:foo) reduction(*:bar)
/// for (int i = 0; i < N; i++) {
///   foo += A[i]; bar *= B[i];
/// }
///
/// where 'foo' and 'bar' are reduced across all OpenMP threads in
/// all teams.  In our OpenMP implementation on the NVPTX device an
/// OpenMP team is mapped to a CUDA threadblock and OpenMP threads
/// within a team are mapped to CUDA threads within a threadblock.
/// Our goal is to efficiently aggregate values across all OpenMP
/// threads such that:
///
///   - the compiler and runtime are logically concise, and
///   - the reduction is performed efficiently in a hierarchical
///     manner as follows: within OpenMP threads in the same warp,
///     across warps in a threadblock, and finally across teams on
///     the NVPTX device.
///
/// Introduction to Decoupling
///
/// We would like to decouple the compiler and the runtime so that the
/// latter is ignorant of the reduction variables (number, data types)
/// and the reduction operators.  This allows a simpler interface
/// and implementation while still attaining good performance.
///
/// Pseudocode for the aforementioned OpenMP program generated by the
/// compiler is as follows:
///
/// 1. Create private copies of reduction variables on each OpenMP
///    thread: 'foo_private', 'bar_private'
/// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned
///    to it and writes the result in 'foo_private' and 'bar_private'
///    respectively.
/// 3. Call the OpenMP runtime on the GPU to reduce within a team
///    and store the result on the team master:
///
///     __kmpc_nvptx_parallel_reduce_nowait_v2(...,
///        reduceData, shuffleReduceFn, interWarpCpyFn)
///
///     where:
///       struct ReduceData {
///         float *foo;
///         double *bar;
///       } reduceData
///       reduceData.foo = &foo_private
///       reduceData.bar = &bar_private
///
///     'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two
///     auxiliary functions generated by the compiler that operate on
///     variables of type 'ReduceData'.  They help the runtime perform
///     algorithmic steps in a data-agnostic manner.
///
///     'shuffleReduceFn' is a pointer to a function that reduces data
///     of type 'ReduceData' across two OpenMP threads (lanes) in the
///     same warp.  It takes the following arguments as input:
///
///     a. variable of type 'ReduceData' on the calling lane,
///     b. its lane_id,
///     c. an offset relative to the current lane_id to generate a
///        remote_lane_id.  The remote lane contains the second
///        variable of type 'ReduceData' that is to be reduced.
///     d. an algorithm version parameter determining which reduction
///        algorithm to use.
///
///     'shuffleReduceFn' retrieves data from the remote lane using
///     efficient GPU shuffle intrinsics and reduces, using the
///     algorithm specified by the 4th parameter, the two operands
///     element-wise.  The result is written to the first operand.
///
///     Different reduction algorithms are implemented in different
///     runtime functions, all calling 'shuffleReduceFn' to perform
///     the essential reduction step.  Therefore, based on the 4th
///     parameter, this function behaves slightly differently to
///     cooperate with the runtime to ensure correctness under
///     different circumstances.
///
///     'InterWarpCpyFn' is a pointer to a function that transfers
///     reduced variables across warps.  It tunnels, through CUDA
///     shared memory, the thread-private data of type 'ReduceData'
///     from lane 0 of each warp to a lane in the first warp.
/// 4. Call the OpenMP runtime on the GPU to reduce across teams.
///    The last team writes the global reduced value to memory.
///
///     ret = __kmpc_nvptx_teams_reduce_nowait(...,
///             reduceData, shuffleReduceFn, interWarpCpyFn,
///             scratchpadCopyFn, loadAndReduceFn)
///
///     'scratchpadCopyFn' is a helper that stores reduced
///     data from the team master to a scratchpad array in
///     global memory.
///
///     'loadAndReduceFn' is a helper that loads data from
///     the scratchpad array and reduces it with the input
///     operand.
///
///     These compiler generated functions hide address
///     calculation and alignment information from the runtime.
/// 5. if ret == 1:
///     The team master of the last team stores the reduced
///     result to the globals in memory.
1521 /// foo += reduceData.foo; bar *= reduceData.bar 1522 /// 1523 /// 1524 /// Warp Reduction Algorithms 1525 /// 1526 /// On the warp level, we have three algorithms implemented in the 1527 /// OpenMP runtime depending on the number of active lanes: 1528 /// 1529 /// Full Warp Reduction 1530 /// 1531 /// The reduce algorithm within a warp where all lanes are active 1532 /// is implemented in the runtime as follows: 1533 /// 1534 /// full_warp_reduce(void *reduce_data, 1535 /// kmp_ShuffleReductFctPtr ShuffleReduceFn) { 1536 /// for (int offset = WARPSIZE/2; offset > 0; offset /= 2) 1537 /// ShuffleReduceFn(reduce_data, 0, offset, 0); 1538 /// } 1539 /// 1540 /// The algorithm completes in log(2, WARPSIZE) steps. 1541 /// 1542 /// 'ShuffleReduceFn' is used here with lane_id set to 0 because it is 1543 /// not used therefore we save instructions by not retrieving lane_id 1544 /// from the corresponding special registers. The 4th parameter, which 1545 /// represents the version of the algorithm being used, is set to 0 to 1546 /// signify full warp reduction. 1547 /// 1548 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows: 1549 /// 1550 /// #reduce_elem refers to an element in the local lane's data structure 1551 /// #remote_elem is retrieved from a remote lane 1552 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE); 1553 /// reduce_elem = reduce_elem REDUCE_OP remote_elem; 1554 /// 1555 /// Contiguous Partial Warp Reduction 1556 /// 1557 /// This reduce algorithm is used within a warp where only the first 1558 /// 'n' (n <= WARPSIZE) lanes are active. It is typically used when the 1559 /// number of OpenMP threads in a parallel region is not a multiple of 1560 /// WARPSIZE. The algorithm is implemented in the runtime as follows: 1561 /// 1562 /// void 1563 /// contiguous_partial_reduce(void *reduce_data, 1564 /// kmp_ShuffleReductFctPtr ShuffleReduceFn, 1565 /// int size, int lane_id) { 1566 /// int curr_size; 1567 /// int offset; 1568 /// curr_size = size; 1569 /// mask = curr_size/2; 1570 /// while (offset>0) { 1571 /// ShuffleReduceFn(reduce_data, lane_id, offset, 1); 1572 /// curr_size = (curr_size+1)/2; 1573 /// offset = curr_size/2; 1574 /// } 1575 /// } 1576 /// 1577 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows: 1578 /// 1579 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE); 1580 /// if (lane_id < offset) 1581 /// reduce_elem = reduce_elem REDUCE_OP remote_elem 1582 /// else 1583 /// reduce_elem = remote_elem 1584 /// 1585 /// This algorithm assumes that the data to be reduced are located in a 1586 /// contiguous subset of lanes starting from the first. When there is 1587 /// an odd number of active lanes, the data in the last lane is not 1588 /// aggregated with any other lane's dat but is instead copied over. 1589 /// 1590 /// Dispersed Partial Warp Reduction 1591 /// 1592 /// This algorithm is used within a warp when any discontiguous subset of 1593 /// lanes are active. It is used to implement the reduction operation 1594 /// across lanes in an OpenMP simd region or in a nested parallel region. 1595 /// 1596 /// void 1597 /// dispersed_partial_reduce(void *reduce_data, 1598 /// kmp_ShuffleReductFctPtr ShuffleReduceFn) { 1599 /// int size, remote_id; 1600 /// int logical_lane_id = number_of_active_lanes_before_me() * 2; 1601 /// do { 1602 /// remote_id = next_active_lane_id_right_after_me(); 1603 /// # the above function returns 0 of no active lane 1604 /// # is present right after the current lane. 
1605 /// size = number_of_active_lanes_in_this_warp(); 1606 /// logical_lane_id /= 2; 1607 /// ShuffleReduceFn(reduce_data, logical_lane_id, 1608 /// remote_id-1-threadIdx.x, 2); 1609 /// } while (logical_lane_id % 2 == 0 && size > 1); 1610 /// } 1611 /// 1612 /// There is no assumption made about the initial state of the reduction. 1613 /// Any number of lanes (>=1) could be active at any position. The reduction 1614 /// result is returned in the first active lane. 1615 /// 1616 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows: 1617 /// 1618 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE); 1619 /// if (lane_id % 2 == 0 && offset > 0) 1620 /// reduce_elem = reduce_elem REDUCE_OP remote_elem 1621 /// else 1622 /// reduce_elem = remote_elem 1623 /// 1624 /// 1625 /// Intra-Team Reduction 1626 /// 1627 /// This function, as implemented in the runtime call 1628 /// '__kmpc_nvptx_parallel_reduce_nowait_v2', aggregates data across OpenMP 1629 /// threads in a team. It first reduces within a warp using the 1630 /// aforementioned algorithms. We then proceed to gather all such 1631 /// reduced values at the first warp. 1632 /// 1633 /// The runtime makes use of the function 'InterWarpCpyFn', which copies 1634 /// data from each of the "warp master" (zeroth lane of each warp, where 1635 /// warp-reduced data is held) to the zeroth warp. This step reduces (in 1636 /// a mathematical sense) the problem of reduction across warp masters in 1637 /// a block to the problem of warp reduction. 1638 /// 1639 /// 1640 /// Inter-Team Reduction 1641 /// 1642 /// Once a team has reduced its data to a single value, it is stored in 1643 /// a global scratchpad array. Since each team has a distinct slot, this 1644 /// can be done without locking. 1645 /// 1646 /// The last team to write to the scratchpad array proceeds to reduce the 1647 /// scratchpad array. One or more workers in the last team use the helper 1648 /// 'loadAndReduceDataFn' to load and reduce values from the array, i.e., 1649 /// the k'th worker reduces every k'th element. 1650 /// 1651 /// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait_v2' to 1652 /// reduce across workers and compute a globally reduced value. 
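///
/// As an illustration only, here is a rough sketch (in the same pseudocode
/// style as the algorithms above, not the exact code this file emits) of
/// what a compiler-generated 'shuffleReduceFn' could look like for the
/// running foo/bar example. 'shuffle_down' and 'WARPSIZE' are the
/// hypothetical helpers already used above, and the 4th parameter selects
/// between the three warp-level schemes:
///
/// void shuffleReduceFn(ReduceData *data, int lane_id,
///                      int offset, int algo_version) {
///   # all lanes participate in the shuffles unconditionally
///   double remote_foo = shuffle_down(*data->foo, offset, WARPSIZE);
///   double remote_bar = shuffle_down(*data->bar, offset, WARPSIZE);
///   bool do_reduce =
///       algo_version == 0 ||                                  # full warp
///       (algo_version == 1 && lane_id < offset) ||            # contiguous
///       (algo_version == 2 && lane_id % 2 == 0 && offset > 0) # dispersed
///   if (do_reduce) {
///     *data->foo += remote_foo; *data->bar *= remote_bar;
///   } else if (algo_version != 0) {
///     *data->foo = remote_foo; *data->bar = remote_bar;
///   }
/// }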
1653 /// 1654 void CGOpenMPRuntimeGPU::emitReduction( 1655 CodeGenFunction &CGF, SourceLocation Loc, ArrayRef<const Expr *> Privates, 1656 ArrayRef<const Expr *> LHSExprs, ArrayRef<const Expr *> RHSExprs, 1657 ArrayRef<const Expr *> ReductionOps, ReductionOptionsTy Options) { 1658 if (!CGF.HaveInsertPoint()) 1659 return; 1660 1661 bool ParallelReduction = isOpenMPParallelDirective(Options.ReductionKind); 1662 bool DistributeReduction = isOpenMPDistributeDirective(Options.ReductionKind); 1663 bool TeamsReduction = isOpenMPTeamsDirective(Options.ReductionKind); 1664 1665 ASTContext &C = CGM.getContext(); 1666 1667 if (Options.SimpleReduction) { 1668 assert(!TeamsReduction && !ParallelReduction && 1669 "Invalid reduction selection in emitReduction."); 1670 (void)ParallelReduction; 1671 CGOpenMPRuntime::emitReduction(CGF, Loc, Privates, LHSExprs, RHSExprs, 1672 ReductionOps, Options); 1673 return; 1674 } 1675 1676 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> VarFieldMap; 1677 llvm::SmallVector<const ValueDecl *, 4> PrivatesReductions(Privates.size()); 1678 int Cnt = 0; 1679 for (const Expr *DRE : Privates) { 1680 PrivatesReductions[Cnt] = cast<DeclRefExpr>(DRE)->getDecl(); 1681 ++Cnt; 1682 } 1683 const RecordDecl *ReductionRec = ::buildRecordForGlobalizedVars( 1684 CGM.getContext(), PrivatesReductions, {}, VarFieldMap, 1); 1685 1686 if (TeamsReduction) 1687 TeamsReductions.push_back(ReductionRec); 1688 1689 // Source location for the ident struct 1690 llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc); 1691 1692 using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy; 1693 InsertPointTy AllocaIP(CGF.AllocaInsertPt->getParent(), 1694 CGF.AllocaInsertPt->getIterator()); 1695 InsertPointTy CodeGenIP(CGF.Builder.GetInsertBlock(), 1696 CGF.Builder.GetInsertPoint()); 1697 llvm::OpenMPIRBuilder::LocationDescription OmpLoc( 1698 CodeGenIP, CGF.SourceLocToDebugLoc(Loc)); 1699 llvm::SmallVector<llvm::OpenMPIRBuilder::ReductionInfo> ReductionInfos; 1700 1701 CodeGenFunction::OMPPrivateScope Scope(CGF); 1702 unsigned Idx = 0; 1703 for (const Expr *Private : Privates) { 1704 llvm::Type *ElementType; 1705 llvm::Value *Variable; 1706 llvm::Value *PrivateVariable; 1707 llvm::OpenMPIRBuilder::ReductionGenAtomicCBTy AtomicReductionGen = nullptr; 1708 ElementType = CGF.ConvertTypeForMem(Private->getType()); 1709 const auto *RHSVar = 1710 cast<VarDecl>(cast<DeclRefExpr>(RHSExprs[Idx])->getDecl()); 1711 PrivateVariable = CGF.GetAddrOfLocalVar(RHSVar).emitRawPointer(CGF); 1712 const auto *LHSVar = 1713 cast<VarDecl>(cast<DeclRefExpr>(LHSExprs[Idx])->getDecl()); 1714 Variable = CGF.GetAddrOfLocalVar(LHSVar).emitRawPointer(CGF); 1715 llvm::OpenMPIRBuilder::EvalKind EvalKind; 1716 switch (CGF.getEvaluationKind(Private->getType())) { 1717 case TEK_Scalar: 1718 EvalKind = llvm::OpenMPIRBuilder::EvalKind::Scalar; 1719 break; 1720 case TEK_Complex: 1721 EvalKind = llvm::OpenMPIRBuilder::EvalKind::Complex; 1722 break; 1723 case TEK_Aggregate: 1724 EvalKind = llvm::OpenMPIRBuilder::EvalKind::Aggregate; 1725 break; 1726 } 1727 auto ReductionGen = [&](InsertPointTy CodeGenIP, unsigned I, 1728 llvm::Value **LHSPtr, llvm::Value **RHSPtr, 1729 llvm::Function *NewFunc) { 1730 CGF.Builder.restoreIP(CodeGenIP); 1731 auto *CurFn = CGF.CurFn; 1732 CGF.CurFn = NewFunc; 1733 1734 *LHSPtr = CGF.GetAddrOfLocalVar( 1735 cast<VarDecl>(cast<DeclRefExpr>(LHSExprs[I])->getDecl())) 1736 .emitRawPointer(CGF); 1737 *RHSPtr = CGF.GetAddrOfLocalVar( 1738 cast<VarDecl>(cast<DeclRefExpr>(RHSExprs[I])->getDecl())) 1739 
.emitRawPointer(CGF); 1740 1741 emitSingleReductionCombiner(CGF, ReductionOps[I], Privates[I], 1742 cast<DeclRefExpr>(LHSExprs[I]), 1743 cast<DeclRefExpr>(RHSExprs[I])); 1744 1745 CGF.CurFn = CurFn; 1746 1747 return InsertPointTy(CGF.Builder.GetInsertBlock(), 1748 CGF.Builder.GetInsertPoint()); 1749 }; 1750 ReductionInfos.emplace_back(llvm::OpenMPIRBuilder::ReductionInfo( 1751 ElementType, Variable, PrivateVariable, EvalKind, 1752 /*ReductionGen=*/nullptr, ReductionGen, AtomicReductionGen)); 1753 Idx++; 1754 } 1755 1756 llvm::OpenMPIRBuilder::InsertPointTy AfterIP = 1757 cantFail(OMPBuilder.createReductionsGPU( 1758 OmpLoc, AllocaIP, CodeGenIP, ReductionInfos, false, TeamsReduction, 1759 DistributeReduction, llvm::OpenMPIRBuilder::ReductionGenCBKind::Clang, 1760 CGF.getTarget().getGridValue(), 1761 C.getLangOpts().OpenMPCUDAReductionBufNum, RTLoc)); 1762 CGF.Builder.restoreIP(AfterIP); 1763 return; 1764 } 1765 1766 const VarDecl * 1767 CGOpenMPRuntimeGPU::translateParameter(const FieldDecl *FD, 1768 const VarDecl *NativeParam) const { 1769 if (!NativeParam->getType()->isReferenceType()) 1770 return NativeParam; 1771 QualType ArgType = NativeParam->getType(); 1772 QualifierCollector QC; 1773 const Type *NonQualTy = QC.strip(ArgType); 1774 QualType PointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType(); 1775 if (const auto *Attr = FD->getAttr<OMPCaptureKindAttr>()) { 1776 if (Attr->getCaptureKind() == OMPC_map) { 1777 PointeeTy = CGM.getContext().getAddrSpaceQualType(PointeeTy, 1778 LangAS::opencl_global); 1779 } 1780 } 1781 ArgType = CGM.getContext().getPointerType(PointeeTy); 1782 QC.addRestrict(); 1783 enum { NVPTX_local_addr = 5 }; 1784 QC.addAddressSpace(getLangASFromTargetAS(NVPTX_local_addr)); 1785 ArgType = QC.apply(CGM.getContext(), ArgType); 1786 if (isa<ImplicitParamDecl>(NativeParam)) 1787 return ImplicitParamDecl::Create( 1788 CGM.getContext(), /*DC=*/nullptr, NativeParam->getLocation(), 1789 NativeParam->getIdentifier(), ArgType, ImplicitParamKind::Other); 1790 return ParmVarDecl::Create( 1791 CGM.getContext(), 1792 const_cast<DeclContext *>(NativeParam->getDeclContext()), 1793 NativeParam->getBeginLoc(), NativeParam->getLocation(), 1794 NativeParam->getIdentifier(), ArgType, 1795 /*TInfo=*/nullptr, SC_None, /*DefArg=*/nullptr); 1796 } 1797 1798 Address 1799 CGOpenMPRuntimeGPU::getParameterAddress(CodeGenFunction &CGF, 1800 const VarDecl *NativeParam, 1801 const VarDecl *TargetParam) const { 1802 assert(NativeParam != TargetParam && 1803 NativeParam->getType()->isReferenceType() && 1804 "Native arg must not be the same as target arg."); 1805 Address LocalAddr = CGF.GetAddrOfLocalVar(TargetParam); 1806 QualType NativeParamType = NativeParam->getType(); 1807 QualifierCollector QC; 1808 const Type *NonQualTy = QC.strip(NativeParamType); 1809 QualType NativePointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType(); 1810 unsigned NativePointeeAddrSpace = 1811 CGF.getTypes().getTargetAddressSpace(NativePointeeTy); 1812 QualType TargetTy = TargetParam->getType(); 1813 llvm::Value *TargetAddr = CGF.EmitLoadOfScalar(LocalAddr, /*Volatile=*/false, 1814 TargetTy, SourceLocation()); 1815 // Cast to native address space. 
  TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
      TargetAddr,
      llvm::PointerType::get(CGF.getLLVMContext(), NativePointeeAddrSpace));
  Address NativeParamAddr = CGF.CreateMemTemp(NativeParamType);
  CGF.EmitStoreOfScalar(TargetAddr, NativeParamAddr, /*Volatile=*/false,
                        NativeParamType);
  return NativeParamAddr;
}

void CGOpenMPRuntimeGPU::emitOutlinedFunctionCall(
    CodeGenFunction &CGF, SourceLocation Loc, llvm::FunctionCallee OutlinedFn,
    ArrayRef<llvm::Value *> Args) const {
  SmallVector<llvm::Value *, 4> TargetArgs;
  TargetArgs.reserve(Args.size());
  auto *FnType = OutlinedFn.getFunctionType();
  for (unsigned I = 0, E = Args.size(); I < E; ++I) {
    if (FnType->isVarArg() && FnType->getNumParams() <= I) {
      TargetArgs.append(std::next(Args.begin(), I), Args.end());
      break;
    }
    llvm::Type *TargetType = FnType->getParamType(I);
    llvm::Value *NativeArg = Args[I];
    if (!TargetType->isPointerTy()) {
      TargetArgs.emplace_back(NativeArg);
      continue;
    }
    TargetArgs.emplace_back(
        CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(NativeArg, TargetType));
  }
  CGOpenMPRuntime::emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, TargetArgs);
}

/// Emit a function that wraps the outlined parallel region
/// and controls the arguments which are passed to this function.
/// The wrapper ensures that the outlined function is called
/// with the correct arguments when data is shared.
llvm::Function *CGOpenMPRuntimeGPU::createParallelDataSharingWrapper(
    llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D) {
  ASTContext &Ctx = CGM.getContext();
  const auto &CS = *D.getCapturedStmt(OMPD_parallel);

  // Create a function that takes as argument the source thread.
  FunctionArgList WrapperArgs;
  QualType Int16QTy =
      Ctx.getIntTypeForBitwidth(/*DestWidth=*/16, /*Signed=*/false);
  QualType Int32QTy =
      Ctx.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/false);
  ImplicitParamDecl ParallelLevelArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
                                     /*Id=*/nullptr, Int16QTy,
                                     ImplicitParamKind::Other);
  ImplicitParamDecl WrapperArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
                               /*Id=*/nullptr, Int32QTy,
                               ImplicitParamKind::Other);
  WrapperArgs.emplace_back(&ParallelLevelArg);
  WrapperArgs.emplace_back(&WrapperArg);

  const CGFunctionInfo &CGFI =
      CGM.getTypes().arrangeBuiltinFunctionDeclaration(Ctx.VoidTy, WrapperArgs);

  auto *Fn = llvm::Function::Create(
      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
      Twine(OutlinedParallelFn->getName(), "_wrapper"), &CGM.getModule());

  // Ensure we do not inline the function. This is trivially true for the ones
  // passed to __kmpc_fork_call, but the ones called in serialized regions
  // could be inlined. This is not perfect, but it is closer to the invariant
  // we want, namely, every data environment starts with a new function.
  // TODO: We should pass the if condition to the runtime function and do the
  // handling there. Much cleaner code.
1885 Fn->addFnAttr(llvm::Attribute::NoInline); 1886 1887 CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI); 1888 Fn->setLinkage(llvm::GlobalValue::InternalLinkage); 1889 Fn->setDoesNotRecurse(); 1890 1891 CodeGenFunction CGF(CGM, /*suppressNewContext=*/true); 1892 CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, Fn, CGFI, WrapperArgs, 1893 D.getBeginLoc(), D.getBeginLoc()); 1894 1895 const auto *RD = CS.getCapturedRecordDecl(); 1896 auto CurField = RD->field_begin(); 1897 1898 Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty, 1899 /*Name=*/".zero.addr"); 1900 CGF.Builder.CreateStore(CGF.Builder.getInt32(/*C*/ 0), ZeroAddr); 1901 // Get the array of arguments. 1902 SmallVector<llvm::Value *, 8> Args; 1903 1904 Args.emplace_back(CGF.GetAddrOfLocalVar(&WrapperArg).emitRawPointer(CGF)); 1905 Args.emplace_back(ZeroAddr.emitRawPointer(CGF)); 1906 1907 CGBuilderTy &Bld = CGF.Builder; 1908 auto CI = CS.capture_begin(); 1909 1910 // Use global memory for data sharing. 1911 // Handle passing of global args to workers. 1912 RawAddress GlobalArgs = 1913 CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "global_args"); 1914 llvm::Value *GlobalArgsPtr = GlobalArgs.getPointer(); 1915 llvm::Value *DataSharingArgs[] = {GlobalArgsPtr}; 1916 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( 1917 CGM.getModule(), OMPRTL___kmpc_get_shared_variables), 1918 DataSharingArgs); 1919 1920 // Retrieve the shared variables from the list of references returned 1921 // by the runtime. Pass the variables to the outlined function. 1922 Address SharedArgListAddress = Address::invalid(); 1923 if (CS.capture_size() > 0 || 1924 isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) { 1925 SharedArgListAddress = CGF.EmitLoadOfPointer( 1926 GlobalArgs, CGF.getContext() 1927 .getPointerType(CGF.getContext().VoidPtrTy) 1928 .castAs<PointerType>()); 1929 } 1930 unsigned Idx = 0; 1931 if (isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) { 1932 Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx); 1933 Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast( 1934 Src, Bld.getPtrTy(0), CGF.SizeTy); 1935 llvm::Value *LB = CGF.EmitLoadOfScalar( 1936 TypedAddress, 1937 /*Volatile=*/false, 1938 CGF.getContext().getPointerType(CGF.getContext().getSizeType()), 1939 cast<OMPLoopDirective>(D).getLowerBoundVariable()->getExprLoc()); 1940 Args.emplace_back(LB); 1941 ++Idx; 1942 Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx); 1943 TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(Src, Bld.getPtrTy(0), 1944 CGF.SizeTy); 1945 llvm::Value *UB = CGF.EmitLoadOfScalar( 1946 TypedAddress, 1947 /*Volatile=*/false, 1948 CGF.getContext().getPointerType(CGF.getContext().getSizeType()), 1949 cast<OMPLoopDirective>(D).getUpperBoundVariable()->getExprLoc()); 1950 Args.emplace_back(UB); 1951 ++Idx; 1952 } 1953 if (CS.capture_size() > 0) { 1954 ASTContext &CGFContext = CGF.getContext(); 1955 for (unsigned I = 0, E = CS.capture_size(); I < E; ++I, ++CI, ++CurField) { 1956 QualType ElemTy = CurField->getType(); 1957 Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, I + Idx); 1958 Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast( 1959 Src, CGF.ConvertTypeForMem(CGFContext.getPointerType(ElemTy)), 1960 CGF.ConvertTypeForMem(ElemTy)); 1961 llvm::Value *Arg = CGF.EmitLoadOfScalar(TypedAddress, 1962 /*Volatile=*/false, 1963 CGFContext.getPointerType(ElemTy), 1964 CI->getLocation()); 1965 if (CI->capturesVariableByCopy() && 1966 
!CI->getCapturedVar()->getType()->isAnyPointerType()) { 1967 Arg = castValueToType(CGF, Arg, ElemTy, CGFContext.getUIntPtrType(), 1968 CI->getLocation()); 1969 } 1970 Args.emplace_back(Arg); 1971 } 1972 } 1973 1974 emitOutlinedFunctionCall(CGF, D.getBeginLoc(), OutlinedParallelFn, Args); 1975 CGF.FinishFunction(); 1976 return Fn; 1977 } 1978 1979 void CGOpenMPRuntimeGPU::emitFunctionProlog(CodeGenFunction &CGF, 1980 const Decl *D) { 1981 if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic) 1982 return; 1983 1984 assert(D && "Expected function or captured|block decl."); 1985 assert(FunctionGlobalizedDecls.count(CGF.CurFn) == 0 && 1986 "Function is registered already."); 1987 assert((!TeamAndReductions.first || TeamAndReductions.first == D) && 1988 "Team is set but not processed."); 1989 const Stmt *Body = nullptr; 1990 bool NeedToDelayGlobalization = false; 1991 if (const auto *FD = dyn_cast<FunctionDecl>(D)) { 1992 Body = FD->getBody(); 1993 } else if (const auto *BD = dyn_cast<BlockDecl>(D)) { 1994 Body = BD->getBody(); 1995 } else if (const auto *CD = dyn_cast<CapturedDecl>(D)) { 1996 Body = CD->getBody(); 1997 NeedToDelayGlobalization = CGF.CapturedStmtInfo->getKind() == CR_OpenMP; 1998 if (NeedToDelayGlobalization && 1999 getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) 2000 return; 2001 } 2002 if (!Body) 2003 return; 2004 CheckVarsEscapingDeclContext VarChecker(CGF, TeamAndReductions.second); 2005 VarChecker.Visit(Body); 2006 const RecordDecl *GlobalizedVarsRecord = 2007 VarChecker.getGlobalizedRecord(IsInTTDRegion); 2008 TeamAndReductions.first = nullptr; 2009 TeamAndReductions.second.clear(); 2010 ArrayRef<const ValueDecl *> EscapedVariableLengthDecls = 2011 VarChecker.getEscapedVariableLengthDecls(); 2012 ArrayRef<const ValueDecl *> DelayedVariableLengthDecls = 2013 VarChecker.getDelayedVariableLengthDecls(); 2014 if (!GlobalizedVarsRecord && EscapedVariableLengthDecls.empty() && 2015 DelayedVariableLengthDecls.empty()) 2016 return; 2017 auto I = FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first; 2018 I->getSecond().MappedParams = 2019 std::make_unique<CodeGenFunction::OMPMapVars>(); 2020 I->getSecond().EscapedParameters.insert( 2021 VarChecker.getEscapedParameters().begin(), 2022 VarChecker.getEscapedParameters().end()); 2023 I->getSecond().EscapedVariableLengthDecls.append( 2024 EscapedVariableLengthDecls.begin(), EscapedVariableLengthDecls.end()); 2025 I->getSecond().DelayedVariableLengthDecls.append( 2026 DelayedVariableLengthDecls.begin(), DelayedVariableLengthDecls.end()); 2027 DeclToAddrMapTy &Data = I->getSecond().LocalVarData; 2028 for (const ValueDecl *VD : VarChecker.getEscapedDecls()) { 2029 assert(VD->isCanonicalDecl() && "Expected canonical declaration"); 2030 Data.insert(std::make_pair(VD, MappedVarData())); 2031 } 2032 if (!NeedToDelayGlobalization) { 2033 emitGenericVarsProlog(CGF, D->getBeginLoc()); 2034 struct GlobalizationScope final : EHScopeStack::Cleanup { 2035 GlobalizationScope() = default; 2036 2037 void Emit(CodeGenFunction &CGF, Flags flags) override { 2038 static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime()) 2039 .emitGenericVarsEpilog(CGF); 2040 } 2041 }; 2042 CGF.EHStack.pushCleanup<GlobalizationScope>(NormalAndEHCleanup); 2043 } 2044 } 2045 2046 Address CGOpenMPRuntimeGPU::getAddressOfLocalVariable(CodeGenFunction &CGF, 2047 const VarDecl *VD) { 2048 if (VD && VD->hasAttr<OMPAllocateDeclAttr>()) { 2049 const auto *A = VD->getAttr<OMPAllocateDeclAttr>(); 2050 auto AS = LangAS::Default; 2051 switch (A->getAllocatorType()) { 2052 
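    // Predefined allocators either keep the default (global) address space
    // or select constant/shared memory below; thread-local and (for now)
    // user-defined allocators bail out so the variable is emitted as an
    // ordinary local.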
    case OMPAllocateDeclAttr::OMPNullMemAlloc:
    case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
    case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
    case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
      break;
    case OMPAllocateDeclAttr::OMPThreadMemAlloc:
      return Address::invalid();
    case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
      // TODO: implement support for user-defined allocators.
      return Address::invalid();
    case OMPAllocateDeclAttr::OMPConstMemAlloc:
      AS = LangAS::cuda_constant;
      break;
    case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
      AS = LangAS::cuda_shared;
      break;
    case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
    case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
      break;
    }
    llvm::Type *VarTy = CGF.ConvertTypeForMem(VD->getType());
    auto *GV = new llvm::GlobalVariable(
        CGM.getModule(), VarTy, /*isConstant=*/false,
        llvm::GlobalValue::InternalLinkage, llvm::PoisonValue::get(VarTy),
        VD->getName(),
        /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal,
        CGM.getContext().getTargetAddressSpace(AS));
    CharUnits Align = CGM.getContext().getDeclAlign(VD);
    GV->setAlignment(Align.getAsAlign());
    return Address(
        CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
            GV, CGF.Builder.getPtrTy(CGM.getContext().getTargetAddressSpace(
                    VD->getType().getAddressSpace()))),
        VarTy, Align);
  }

  if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic)
    return Address::invalid();

  VD = VD->getCanonicalDecl();
  auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
  if (I == FunctionGlobalizedDecls.end())
    return Address::invalid();
  auto VDI = I->getSecond().LocalVarData.find(VD);
  if (VDI != I->getSecond().LocalVarData.end())
    return VDI->second.PrivateAddr;
  if (VD->hasAttrs()) {
    for (specific_attr_iterator<OMPReferencedVarAttr> IT(VD->attr_begin()),
         E(VD->attr_end());
         IT != E; ++IT) {
      auto VDI = I->getSecond().LocalVarData.find(
          cast<VarDecl>(cast<DeclRefExpr>(IT->getRef())->getDecl())
              ->getCanonicalDecl());
      if (VDI != I->getSecond().LocalVarData.end())
        return VDI->second.PrivateAddr;
    }
  }

  return Address::invalid();
}

void CGOpenMPRuntimeGPU::functionFinished(CodeGenFunction &CGF) {
  FunctionGlobalizedDecls.erase(CGF.CurFn);
  CGOpenMPRuntime::functionFinished(CGF);
}

void CGOpenMPRuntimeGPU::getDefaultDistScheduleAndChunk(
    CodeGenFunction &CGF, const OMPLoopDirective &S,
    OpenMPDistScheduleClauseKind &ScheduleKind,
    llvm::Value *&Chunk) const {
  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
  if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {
    ScheduleKind = OMPC_DIST_SCHEDULE_static;
    Chunk = CGF.EmitScalarConversion(
        RT.getGPUNumThreads(CGF),
        CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
        S.getIterationVariable()->getType(), S.getBeginLoc());
    return;
  }
  CGOpenMPRuntime::getDefaultDistScheduleAndChunk(CGF, S, ScheduleKind, Chunk);
}

void CGOpenMPRuntimeGPU::getDefaultScheduleAndChunk(
    CodeGenFunction &CGF, const OMPLoopDirective &S,
    OpenMPScheduleClauseKind &ScheduleKind,
    const Expr *&ChunkExpr) const {
  ScheduleKind = OMPC_SCHEDULE_static;
  // Chunk size is 1 in this case.
2141 llvm::APInt ChunkSize(32, 1); 2142 ChunkExpr = IntegerLiteral::Create(CGF.getContext(), ChunkSize, 2143 CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0), 2144 SourceLocation()); 2145 } 2146 2147 void CGOpenMPRuntimeGPU::adjustTargetSpecificDataForLambdas( 2148 CodeGenFunction &CGF, const OMPExecutableDirective &D) const { 2149 assert(isOpenMPTargetExecutionDirective(D.getDirectiveKind()) && 2150 " Expected target-based directive."); 2151 const CapturedStmt *CS = D.getCapturedStmt(OMPD_target); 2152 for (const CapturedStmt::Capture &C : CS->captures()) { 2153 // Capture variables captured by reference in lambdas for target-based 2154 // directives. 2155 if (!C.capturesVariable()) 2156 continue; 2157 const VarDecl *VD = C.getCapturedVar(); 2158 const auto *RD = VD->getType() 2159 .getCanonicalType() 2160 .getNonReferenceType() 2161 ->getAsCXXRecordDecl(); 2162 if (!RD || !RD->isLambda()) 2163 continue; 2164 Address VDAddr = CGF.GetAddrOfLocalVar(VD); 2165 LValue VDLVal; 2166 if (VD->getType().getCanonicalType()->isReferenceType()) 2167 VDLVal = CGF.EmitLoadOfReferenceLValue(VDAddr, VD->getType()); 2168 else 2169 VDLVal = CGF.MakeAddrLValue( 2170 VDAddr, VD->getType().getCanonicalType().getNonReferenceType()); 2171 llvm::DenseMap<const ValueDecl *, FieldDecl *> Captures; 2172 FieldDecl *ThisCapture = nullptr; 2173 RD->getCaptureFields(Captures, ThisCapture); 2174 if (ThisCapture && CGF.CapturedStmtInfo->isCXXThisExprCaptured()) { 2175 LValue ThisLVal = 2176 CGF.EmitLValueForFieldInitialization(VDLVal, ThisCapture); 2177 llvm::Value *CXXThis = CGF.LoadCXXThis(); 2178 CGF.EmitStoreOfScalar(CXXThis, ThisLVal); 2179 } 2180 for (const LambdaCapture &LC : RD->captures()) { 2181 if (LC.getCaptureKind() != LCK_ByRef) 2182 continue; 2183 const ValueDecl *VD = LC.getCapturedVar(); 2184 // FIXME: For now VD is always a VarDecl because OpenMP does not support 2185 // capturing structured bindings in lambdas yet. 2186 if (!CS->capturesVariable(cast<VarDecl>(VD))) 2187 continue; 2188 auto It = Captures.find(VD); 2189 assert(It != Captures.end() && "Found lambda capture without field."); 2190 LValue VarLVal = CGF.EmitLValueForFieldInitialization(VDLVal, It->second); 2191 Address VDAddr = CGF.GetAddrOfLocalVar(cast<VarDecl>(VD)); 2192 if (VD->getType().getCanonicalType()->isReferenceType()) 2193 VDAddr = CGF.EmitLoadOfReferenceLValue(VDAddr, 2194 VD->getType().getCanonicalType()) 2195 .getAddress(); 2196 CGF.EmitStoreOfScalar(VDAddr.emitRawPointer(CGF), VarLVal); 2197 } 2198 } 2199 } 2200 2201 bool CGOpenMPRuntimeGPU::hasAllocateAttributeForGlobalVar(const VarDecl *VD, 2202 LangAS &AS) { 2203 if (!VD || !VD->hasAttr<OMPAllocateDeclAttr>()) 2204 return false; 2205 const auto *A = VD->getAttr<OMPAllocateDeclAttr>(); 2206 switch(A->getAllocatorType()) { 2207 case OMPAllocateDeclAttr::OMPNullMemAlloc: 2208 case OMPAllocateDeclAttr::OMPDefaultMemAlloc: 2209 // Not supported, fallback to the default mem space. 
2210 case OMPAllocateDeclAttr::OMPLargeCapMemAlloc: 2211 case OMPAllocateDeclAttr::OMPCGroupMemAlloc: 2212 case OMPAllocateDeclAttr::OMPHighBWMemAlloc: 2213 case OMPAllocateDeclAttr::OMPLowLatMemAlloc: 2214 case OMPAllocateDeclAttr::OMPThreadMemAlloc: 2215 AS = LangAS::Default; 2216 return true; 2217 case OMPAllocateDeclAttr::OMPConstMemAlloc: 2218 AS = LangAS::cuda_constant; 2219 return true; 2220 case OMPAllocateDeclAttr::OMPPTeamMemAlloc: 2221 AS = LangAS::cuda_shared; 2222 return true; 2223 case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc: 2224 llvm_unreachable("Expected predefined allocator for the variables with the " 2225 "static storage."); 2226 } 2227 return false; 2228 } 2229 2230 // Get current OffloadArch and ignore any unknown values 2231 static OffloadArch getOffloadArch(CodeGenModule &CGM) { 2232 if (!CGM.getTarget().hasFeature("ptx")) 2233 return OffloadArch::UNKNOWN; 2234 for (const auto &Feature : CGM.getTarget().getTargetOpts().FeatureMap) { 2235 if (Feature.getValue()) { 2236 OffloadArch Arch = StringToOffloadArch(Feature.getKey()); 2237 if (Arch != OffloadArch::UNKNOWN) 2238 return Arch; 2239 } 2240 } 2241 return OffloadArch::UNKNOWN; 2242 } 2243 2244 /// Check to see if target architecture supports unified addressing which is 2245 /// a restriction for OpenMP requires clause "unified_shared_memory". 2246 void CGOpenMPRuntimeGPU::processRequiresDirective(const OMPRequiresDecl *D) { 2247 for (const OMPClause *Clause : D->clauselists()) { 2248 if (Clause->getClauseKind() == OMPC_unified_shared_memory) { 2249 OffloadArch Arch = getOffloadArch(CGM); 2250 switch (Arch) { 2251 case OffloadArch::SM_20: 2252 case OffloadArch::SM_21: 2253 case OffloadArch::SM_30: 2254 case OffloadArch::SM_32_: 2255 case OffloadArch::SM_35: 2256 case OffloadArch::SM_37: 2257 case OffloadArch::SM_50: 2258 case OffloadArch::SM_52: 2259 case OffloadArch::SM_53: { 2260 SmallString<256> Buffer; 2261 llvm::raw_svector_ostream Out(Buffer); 2262 Out << "Target architecture " << OffloadArchToString(Arch) 2263 << " does not support unified addressing"; 2264 CGM.Error(Clause->getBeginLoc(), Out.str()); 2265 return; 2266 } 2267 case OffloadArch::SM_60: 2268 case OffloadArch::SM_61: 2269 case OffloadArch::SM_62: 2270 case OffloadArch::SM_70: 2271 case OffloadArch::SM_72: 2272 case OffloadArch::SM_75: 2273 case OffloadArch::SM_80: 2274 case OffloadArch::SM_86: 2275 case OffloadArch::SM_87: 2276 case OffloadArch::SM_89: 2277 case OffloadArch::SM_90: 2278 case OffloadArch::SM_90a: 2279 case OffloadArch::SM_100: 2280 case OffloadArch::SM_100a: 2281 case OffloadArch::GFX600: 2282 case OffloadArch::GFX601: 2283 case OffloadArch::GFX602: 2284 case OffloadArch::GFX700: 2285 case OffloadArch::GFX701: 2286 case OffloadArch::GFX702: 2287 case OffloadArch::GFX703: 2288 case OffloadArch::GFX704: 2289 case OffloadArch::GFX705: 2290 case OffloadArch::GFX801: 2291 case OffloadArch::GFX802: 2292 case OffloadArch::GFX803: 2293 case OffloadArch::GFX805: 2294 case OffloadArch::GFX810: 2295 case OffloadArch::GFX9_GENERIC: 2296 case OffloadArch::GFX900: 2297 case OffloadArch::GFX902: 2298 case OffloadArch::GFX904: 2299 case OffloadArch::GFX906: 2300 case OffloadArch::GFX908: 2301 case OffloadArch::GFX909: 2302 case OffloadArch::GFX90a: 2303 case OffloadArch::GFX90c: 2304 case OffloadArch::GFX9_4_GENERIC: 2305 case OffloadArch::GFX940: 2306 case OffloadArch::GFX941: 2307 case OffloadArch::GFX942: 2308 case OffloadArch::GFX950: 2309 case OffloadArch::GFX10_1_GENERIC: 2310 case OffloadArch::GFX1010: 2311 case OffloadArch::GFX1011: 
2312 case OffloadArch::GFX1012: 2313 case OffloadArch::GFX1013: 2314 case OffloadArch::GFX10_3_GENERIC: 2315 case OffloadArch::GFX1030: 2316 case OffloadArch::GFX1031: 2317 case OffloadArch::GFX1032: 2318 case OffloadArch::GFX1033: 2319 case OffloadArch::GFX1034: 2320 case OffloadArch::GFX1035: 2321 case OffloadArch::GFX1036: 2322 case OffloadArch::GFX11_GENERIC: 2323 case OffloadArch::GFX1100: 2324 case OffloadArch::GFX1101: 2325 case OffloadArch::GFX1102: 2326 case OffloadArch::GFX1103: 2327 case OffloadArch::GFX1150: 2328 case OffloadArch::GFX1151: 2329 case OffloadArch::GFX1152: 2330 case OffloadArch::GFX1153: 2331 case OffloadArch::GFX12_GENERIC: 2332 case OffloadArch::GFX1200: 2333 case OffloadArch::GFX1201: 2334 case OffloadArch::AMDGCNSPIRV: 2335 case OffloadArch::Generic: 2336 case OffloadArch::UNUSED: 2337 case OffloadArch::UNKNOWN: 2338 break; 2339 case OffloadArch::LAST: 2340 llvm_unreachable("Unexpected GPU arch."); 2341 } 2342 } 2343 } 2344 CGOpenMPRuntime::processRequiresDirective(D); 2345 } 2346 2347 llvm::Value *CGOpenMPRuntimeGPU::getGPUNumThreads(CodeGenFunction &CGF) { 2348 CGBuilderTy &Bld = CGF.Builder; 2349 llvm::Module *M = &CGF.CGM.getModule(); 2350 const char *LocSize = "__kmpc_get_hardware_num_threads_in_block"; 2351 llvm::Function *F = M->getFunction(LocSize); 2352 if (!F) { 2353 F = llvm::Function::Create(llvm::FunctionType::get(CGF.Int32Ty, {}, false), 2354 llvm::GlobalVariable::ExternalLinkage, LocSize, 2355 &CGF.CGM.getModule()); 2356 } 2357 return Bld.CreateCall(F, {}, "nvptx_num_threads"); 2358 } 2359 2360 llvm::Value *CGOpenMPRuntimeGPU::getGPUThreadID(CodeGenFunction &CGF) { 2361 ArrayRef<llvm::Value *> Args{}; 2362 return CGF.EmitRuntimeCall( 2363 OMPBuilder.getOrCreateRuntimeFunction( 2364 CGM.getModule(), OMPRTL___kmpc_get_hardware_thread_id_in_block), 2365 Args); 2366 } 2367