//===- OpenMPIRBuilder.cpp - Builder for LLVM-IR for OpenMP directives ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
///
/// This file implements the OpenMPIRBuilder class, which is used as a
/// convenient way to create LLVM instructions for OpenMP directives.
///
//===----------------------------------------------------------------------===//

#include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/Value.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/CodeExtractor.h"
#include "llvm/Transforms/Utils/LoopPeel.h"
#include "llvm/Transforms/Utils/UnrollLoop.h"

#include <cstdint>

#define DEBUG_TYPE "openmp-ir-builder"

using namespace llvm;
using namespace omp;

static cl::opt<bool>
    OptimisticAttributes("openmp-ir-builder-optimistic-attributes", cl::Hidden,
                         cl::desc("Use optimistic attributes describing "
                                  "'as-if' properties of runtime calls."),
                         cl::init(false));

static cl::opt<double> UnrollThresholdFactor(
    "openmp-ir-builder-unroll-threshold-factor", cl::Hidden,
    cl::desc("Factor for the unroll threshold to account for code "
             "simplifications still taking place"),
    cl::init(1.5));

#ifndef NDEBUG
/// Return whether IP1 and IP2 are ambiguous, i.e., inserting instructions at
/// position IP1 may change the meaning of IP2 or vice versa. This is because
/// an InsertPoint stores the instruction before anything is inserted. For
/// instance, if both point to the same instruction, two IRBuilders alternately
/// creating instructions will cause them to be interleaved.
static bool isConflictIP(IRBuilder<>::InsertPoint IP1,
                         IRBuilder<>::InsertPoint IP2) {
  if (!IP1.isSet() || !IP2.isSet())
    return false;
  return IP1.getBlock() == IP2.getBlock() && IP1.getPoint() == IP2.getPoint();
}

static bool isValidWorkshareLoopScheduleType(OMPScheduleType SchedType) {
  // Valid ordered/unordered and base algorithm combinations.
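  // Note: the monotonicity bits are masked off before the switch; they are
  // validated separately below, so monotonic and nonmonotonic variants of the
  // same schedule hit the same case label here.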
  switch (SchedType & ~OMPScheduleType::MonotonicityMask) {
  case OMPScheduleType::UnorderedStaticChunked:
  case OMPScheduleType::UnorderedStatic:
  case OMPScheduleType::UnorderedDynamicChunked:
  case OMPScheduleType::UnorderedGuidedChunked:
  case OMPScheduleType::UnorderedRuntime:
  case OMPScheduleType::UnorderedAuto:
  case OMPScheduleType::UnorderedTrapezoidal:
  case OMPScheduleType::UnorderedGreedy:
  case OMPScheduleType::UnorderedBalanced:
  case OMPScheduleType::UnorderedGuidedIterativeChunked:
  case OMPScheduleType::UnorderedGuidedAnalyticalChunked:
  case OMPScheduleType::UnorderedSteal:
  case OMPScheduleType::UnorderedStaticBalancedChunked:
  case OMPScheduleType::UnorderedGuidedSimd:
  case OMPScheduleType::UnorderedRuntimeSimd:
  case OMPScheduleType::OrderedStaticChunked:
  case OMPScheduleType::OrderedStatic:
  case OMPScheduleType::OrderedDynamicChunked:
  case OMPScheduleType::OrderedGuidedChunked:
  case OMPScheduleType::OrderedRuntime:
  case OMPScheduleType::OrderedAuto:
  case OMPScheduleType::OrderedTrapezoidal:
  case OMPScheduleType::NomergeUnorderedStaticChunked:
  case OMPScheduleType::NomergeUnorderedStatic:
  case OMPScheduleType::NomergeUnorderedDynamicChunked:
  case OMPScheduleType::NomergeUnorderedGuidedChunked:
  case OMPScheduleType::NomergeUnorderedRuntime:
  case OMPScheduleType::NomergeUnorderedAuto:
  case OMPScheduleType::NomergeUnorderedTrapezoidal:
  case OMPScheduleType::NomergeUnorderedGreedy:
  case OMPScheduleType::NomergeUnorderedBalanced:
  case OMPScheduleType::NomergeUnorderedGuidedIterativeChunked:
  case OMPScheduleType::NomergeUnorderedGuidedAnalyticalChunked:
  case OMPScheduleType::NomergeUnorderedSteal:
  case OMPScheduleType::NomergeOrderedStaticChunked:
  case OMPScheduleType::NomergeOrderedStatic:
  case OMPScheduleType::NomergeOrderedDynamicChunked:
  case OMPScheduleType::NomergeOrderedGuidedChunked:
  case OMPScheduleType::NomergeOrderedRuntime:
  case OMPScheduleType::NomergeOrderedAuto:
  case OMPScheduleType::NomergeOrderedTrapezoidal:
    break;
  default:
    return false;
  }

  // Must not set both monotonicity modifiers at the same time.
  OMPScheduleType MonotonicityFlags =
      SchedType & OMPScheduleType::MonotonicityMask;
  if (MonotonicityFlags == OMPScheduleType::MonotonicityMask)
    return false;

  return true;
}
#endif

/// Determine which scheduling algorithm to use from the schedule clause
/// arguments.
static OMPScheduleType
getOpenMPBaseScheduleType(llvm::omp::ScheduleKind ClauseKind, bool HasChunks,
                          bool HasSimdModifier) {
  // Currently, the default schedule is static.
  switch (ClauseKind) {
  case OMP_SCHEDULE_Default:
  case OMP_SCHEDULE_Static:
    return HasChunks ? OMPScheduleType::BaseStaticChunked
                     : OMPScheduleType::BaseStatic;
  case OMP_SCHEDULE_Dynamic:
    return OMPScheduleType::BaseDynamicChunked;
  case OMP_SCHEDULE_Guided:
    return HasSimdModifier ? OMPScheduleType::BaseGuidedSimd
                           : OMPScheduleType::BaseGuidedChunked;
  case OMP_SCHEDULE_Auto:
    return llvm::omp::OMPScheduleType::BaseAuto;
  case OMP_SCHEDULE_Runtime:
    return HasSimdModifier ? OMPScheduleType::BaseRuntimeSimd
                           : OMPScheduleType::BaseRuntime;
  }
  llvm_unreachable("unhandled schedule clause argument");
}

/// Adds ordering modifier flags to schedule type.
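/// For example (a sketch, assuming the flag encoding from OMPConstants.h):
/// combining BaseDynamicChunked with an ordered clause yields
/// OrderedDynamicChunked, i.e. BaseDynamicChunked | ModifierOrdered.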
static OMPScheduleType
getOpenMPOrderingScheduleType(OMPScheduleType BaseScheduleType,
                              bool HasOrderedClause) {
  assert((BaseScheduleType & OMPScheduleType::ModifierMask) ==
             OMPScheduleType::None &&
         "Must not have ordering or monotonicity flags already set");

  OMPScheduleType OrderingModifier = HasOrderedClause
                                         ? OMPScheduleType::ModifierOrdered
                                         : OMPScheduleType::ModifierUnordered;
  OMPScheduleType OrderingScheduleType = BaseScheduleType | OrderingModifier;

  // The SIMD variants have no ordered counterparts; fall back to the
  // corresponding non-SIMD ordered schedules.
  if (OrderingScheduleType ==
      (OMPScheduleType::BaseGuidedSimd | OMPScheduleType::ModifierOrdered))
    return OMPScheduleType::OrderedGuidedChunked;
  else if (OrderingScheduleType == (OMPScheduleType::BaseRuntimeSimd |
                                    OMPScheduleType::ModifierOrdered))
    return OMPScheduleType::OrderedRuntime;

  return OrderingScheduleType;
}

/// Adds monotonicity modifier flags to schedule type.
static OMPScheduleType
getOpenMPMonotonicityScheduleType(OMPScheduleType ScheduleType,
                                  bool HasSimdModifier, bool HasMonotonic,
                                  bool HasNonmonotonic, bool HasOrderedClause) {
  assert((ScheduleType & OMPScheduleType::MonotonicityMask) ==
             OMPScheduleType::None &&
         "Must not have monotonicity flags already set");
  assert((!HasMonotonic || !HasNonmonotonic) &&
         "Monotonic and Nonmonotonic are mutually exclusive");

  if (HasMonotonic) {
    return ScheduleType | OMPScheduleType::ModifierMonotonic;
  } else if (HasNonmonotonic) {
    return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
  } else {
    // OpenMP 5.1, 2.11.4 Worksharing-Loop Construct, Description.
    // If the static schedule kind is specified or if the ordered clause is
    // specified, and if the nonmonotonic modifier is not specified, the
    // effect is as if the monotonic modifier is specified. Otherwise, unless
    // the monotonic modifier is specified, the effect is as if the
    // nonmonotonic modifier is specified.
    OMPScheduleType BaseScheduleType =
        ScheduleType & ~OMPScheduleType::ModifierMask;
    if ((BaseScheduleType == OMPScheduleType::BaseStatic) ||
        (BaseScheduleType == OMPScheduleType::BaseStaticChunked) ||
        HasOrderedClause) {
      // The OpenMP runtime uses monotonic by default, so there is no need to
      // set the flag.
      return ScheduleType;
    } else {
      return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
    }
  }
}

/// Determine the schedule type using schedule and ordering clause arguments.
static OMPScheduleType
computeOpenMPScheduleType(ScheduleKind ClauseKind, bool HasChunks,
                          bool HasSimdModifier, bool HasMonotonicModifier,
                          bool HasNonmonotonicModifier, bool HasOrderedClause) {
  OMPScheduleType BaseSchedule =
      getOpenMPBaseScheduleType(ClauseKind, HasChunks, HasSimdModifier);
  OMPScheduleType OrderedSchedule =
      getOpenMPOrderingScheduleType(BaseSchedule, HasOrderedClause);
  OMPScheduleType Result = getOpenMPMonotonicityScheduleType(
      OrderedSchedule, HasSimdModifier, HasMonotonicModifier,
      HasNonmonotonicModifier, HasOrderedClause);

  assert(isValidWorkshareLoopScheduleType(Result));
  return Result;
}

/// Make \p Source branch to \p Target.
///
/// Handles two situations:
/// * \p Source already has an unconditional branch.
/// * \p Source is a degenerate block (no terminator because the BB is
///   the current head of the IR construction).
static void redirectTo(BasicBlock *Source, BasicBlock *Target, DebugLoc DL) {
  if (Instruction *Term = Source->getTerminator()) {
    auto *Br = cast<BranchInst>(Term);
    assert(!Br->isConditional() &&
           "BB's terminator must be an unconditional branch (or degenerate)");
    BasicBlock *Succ = Br->getSuccessor(0);
    Succ->removePredecessor(Source, /*KeepOneInputPHIs=*/true);
    Br->setSuccessor(0, Target);
    return;
  }

  auto *NewBr = BranchInst::Create(Target, Source);
  NewBr->setDebugLoc(DL);
}

void llvm::spliceBB(IRBuilderBase::InsertPoint IP, BasicBlock *New,
                    bool CreateBranch) {
  assert(New->getFirstInsertionPt() == New->begin() &&
         "Target BB must not have PHI nodes");

  // Move instructions to new block.
  BasicBlock *Old = IP.getBlock();
  New->getInstList().splice(New->begin(), Old->getInstList(), IP.getPoint(),
                            Old->end());

  if (CreateBranch)
    BranchInst::Create(New, Old);
}

void llvm::spliceBB(IRBuilder<> &Builder, BasicBlock *New, bool CreateBranch) {
  DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
  BasicBlock *Old = Builder.GetInsertBlock();

  spliceBB(Builder.saveIP(), New, CreateBranch);
  if (CreateBranch)
    Builder.SetInsertPoint(Old->getTerminator());
  else
    Builder.SetInsertPoint(Old);

  // SetInsertPoint also updates the Builder's debug location, but we want to
  // keep the one the Builder was configured to use.
  Builder.SetCurrentDebugLocation(DebugLoc);
}

BasicBlock *llvm::splitBB(IRBuilderBase::InsertPoint IP, bool CreateBranch,
                          llvm::Twine Name) {
  BasicBlock *Old = IP.getBlock();
  BasicBlock *New = BasicBlock::Create(
      Old->getContext(), Name.isTriviallyEmpty() ? Old->getName() : Name,
      Old->getParent(), Old->getNextNode());
  spliceBB(IP, New, CreateBranch);
  New->replaceSuccessorsPhiUsesWith(Old, New);
  return New;
}

BasicBlock *llvm::splitBB(IRBuilderBase &Builder, bool CreateBranch,
                          llvm::Twine Name) {
  DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
  BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, Name);
  if (CreateBranch)
    Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
  else
    Builder.SetInsertPoint(Builder.GetInsertBlock());
  // SetInsertPoint also updates the Builder's debug location, but we want to
  // keep the one the Builder was configured to use.
  Builder.SetCurrentDebugLocation(DebugLoc);
  return New;
}

BasicBlock *llvm::splitBB(IRBuilder<> &Builder, bool CreateBranch,
                          llvm::Twine Name) {
  DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
  BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, Name);
  if (CreateBranch)
    Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
  else
    Builder.SetInsertPoint(Builder.GetInsertBlock());
  // SetInsertPoint also updates the Builder's debug location, but we want to
  // keep the one the Builder was configured to use.
  Builder.SetCurrentDebugLocation(DebugLoc);
  return New;
}

BasicBlock *llvm::splitBBWithSuffix(IRBuilderBase &Builder, bool CreateBranch,
                                    llvm::Twine Suffix) {
  BasicBlock *Old = Builder.GetInsertBlock();
  return splitBB(Builder, CreateBranch, Old->getName() + Suffix);
}

void OpenMPIRBuilder::addAttributes(omp::RuntimeFunction FnID, Function &Fn) {
  LLVMContext &Ctx = Fn.getContext();

  // Get the function's current attributes.
  auto Attrs = Fn.getAttributes();
  auto FnAttrs = Attrs.getFnAttrs();
  auto RetAttrs = Attrs.getRetAttrs();
  SmallVector<AttributeSet, 4> ArgAttrs;
  for (size_t ArgNo = 0; ArgNo < Fn.arg_size(); ++ArgNo)
    ArgAttrs.emplace_back(Attrs.getParamAttrs(ArgNo));

#define OMP_ATTRS_SET(VarName, AttrSet) AttributeSet VarName = AttrSet;
#include "llvm/Frontend/OpenMP/OMPKinds.def"

  // Add attributes to the function declaration.
  switch (FnID) {
#define OMP_RTL_ATTRS(Enum, FnAttrSet, RetAttrSet, ArgAttrSets)                \
  case Enum:                                                                   \
    FnAttrs = FnAttrs.addAttributes(Ctx, FnAttrSet);                           \
    RetAttrs = RetAttrs.addAttributes(Ctx, RetAttrSet);                        \
    for (size_t ArgNo = 0; ArgNo < ArgAttrSets.size(); ++ArgNo)                \
      ArgAttrs[ArgNo] =                                                        \
          ArgAttrs[ArgNo].addAttributes(Ctx, ArgAttrSets[ArgNo]);              \
    Fn.setAttributes(AttributeList::get(Ctx, FnAttrs, RetAttrs, ArgAttrs));    \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  default:
    // Attributes are optional.
    break;
  }
}

FunctionCallee
OpenMPIRBuilder::getOrCreateRuntimeFunction(Module &M, RuntimeFunction FnID) {
  FunctionType *FnTy = nullptr;
  Function *Fn = nullptr;

  // Try to find the declaration in the module first.
  switch (FnID) {
#define OMP_RTL(Enum, Str, IsVarArg, ReturnType, ...)                          \
  case Enum:                                                                   \
    FnTy = FunctionType::get(ReturnType, ArrayRef<Type *>{__VA_ARGS__},        \
                             IsVarArg);                                        \
    Fn = M.getFunction(Str);                                                   \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  }

  if (!Fn) {
    // Create a new declaration if we need one.
    switch (FnID) {
#define OMP_RTL(Enum, Str, ...)                                                \
  case Enum:                                                                   \
    Fn = Function::Create(FnTy, GlobalValue::ExternalLinkage, Str, M);         \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
    }

    // Add information if the runtime function takes a callback function.
    if (FnID == OMPRTL___kmpc_fork_call || FnID == OMPRTL___kmpc_fork_teams) {
      if (!Fn->hasMetadata(LLVMContext::MD_callback)) {
        LLVMContext &Ctx = Fn->getContext();
        MDBuilder MDB(Ctx);
        // Annotate the callback behavior of the runtime function:
        //  - The callback callee is argument number 2 (microtask).
        //  - The first two arguments of the callback callee are unknown (-1).
        //  - All variadic arguments to the runtime function are passed to the
        //    callback callee.
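        // The resulting annotation looks roughly like this (a sketch; the
        // metadata node numbers are illustrative):
        //   declare !callback !0 void @__kmpc_fork_call(ptr, i32, ptr, ...)
        //   !0 = !{!1}
        //   !1 = !{i64 2, i64 -1, i64 -1, i1 true}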
        Fn->addMetadata(
            LLVMContext::MD_callback,
            *MDNode::get(Ctx, {MDB.createCallbackEncoding(
                                  2, {-1, -1}, /* VarArgsArePassed */ true)}));
      }
    }

    LLVM_DEBUG(dbgs() << "Created OpenMP runtime function " << Fn->getName()
                      << " with type " << *Fn->getFunctionType() << "\n");
    addAttributes(FnID, *Fn);

  } else {
    LLVM_DEBUG(dbgs() << "Found OpenMP runtime function " << Fn->getName()
                      << " with type " << *Fn->getFunctionType() << "\n");
  }

  assert(Fn && "Failed to create OpenMP runtime function");

  // Cast the function to the expected type if necessary.
  Constant *C = ConstantExpr::getBitCast(Fn, FnTy->getPointerTo());
  return {FnTy, C};
}

Function *OpenMPIRBuilder::getOrCreateRuntimeFunctionPtr(RuntimeFunction FnID) {
  FunctionCallee RTLFn = getOrCreateRuntimeFunction(M, FnID);
  auto *Fn = dyn_cast<llvm::Function>(RTLFn.getCallee());
  assert(Fn && "Failed to create OpenMP runtime function pointer");
  return Fn;
}

void OpenMPIRBuilder::initialize() { initializeTypes(M); }

void OpenMPIRBuilder::finalize(Function *Fn) {
  SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
  SmallVector<BasicBlock *, 32> Blocks;
  SmallVector<OutlineInfo, 16> DeferredOutlines;
  for (OutlineInfo &OI : OutlineInfos) {
    // Skip functions that have not been finalized yet; this may happen with
    // nested function generation.
    if (Fn && OI.getFunction() != Fn) {
      DeferredOutlines.push_back(OI);
      continue;
    }

    ParallelRegionBlockSet.clear();
    Blocks.clear();
    OI.collectBlocks(ParallelRegionBlockSet, Blocks);

    Function *OuterFn = OI.getFunction();
    CodeExtractorAnalysisCache CEAC(*OuterFn);
    CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
                            /* AggregateArgs */ true,
                            /* BlockFrequencyInfo */ nullptr,
                            /* BranchProbabilityInfo */ nullptr,
                            /* AssumptionCache */ nullptr,
                            /* AllowVarArgs */ true,
                            /* AllowAlloca */ true,
                            /* AllocaBlock*/ OI.OuterAllocaBB,
                            /* Suffix */ ".omp_par");

    LLVM_DEBUG(dbgs() << "Before outlining: " << *OuterFn << "\n");
    LLVM_DEBUG(dbgs() << "Entry " << OI.EntryBB->getName()
                      << " Exit: " << OI.ExitBB->getName() << "\n");
    assert(Extractor.isEligible() &&
           "Expected OpenMP outlining to be possible!");

    for (auto *V : OI.ExcludeArgsFromAggregate)
      Extractor.excludeArgFromAggregate(V);

    Function *OutlinedFn = Extractor.extractCodeRegion(CEAC);

    LLVM_DEBUG(dbgs() << "After outlining: " << *OuterFn << "\n");
    LLVM_DEBUG(dbgs() << " Outlined function: " << *OutlinedFn << "\n");
    assert(OutlinedFn->getReturnType()->isVoidTy() &&
           "OpenMP outlined functions should not return a value!");

    // For compatibility with the clang CG we move the outlined function after
    // the one with the parallel region.
    OutlinedFn->removeFromParent();
    M.getFunctionList().insertAfter(OuterFn->getIterator(), OutlinedFn);

    // Remove the artificial entry introduced by the extractor right away; we
    // made our own entry block, after all.
    {
      BasicBlock &ArtificialEntry = OutlinedFn->getEntryBlock();
      assert(ArtificialEntry.getUniqueSuccessor() == OI.EntryBB);
      assert(OI.EntryBB->getUniquePredecessor() == &ArtificialEntry);
      // Move instructions from the to-be-deleted ArtificialEntry to the entry
      // basic block of the parallel region.
      // CodeExtractor generates instructions to unwrap the aggregate argument
      // and may sink allocas/bitcasts for values that are solely used in the
      // outlined region and do not escape.
      assert(!ArtificialEntry.empty() &&
             "Expected instructions to add in the outlined region entry");
      for (BasicBlock::reverse_iterator It = ArtificialEntry.rbegin(),
                                        End = ArtificialEntry.rend();
           It != End;) {
        Instruction &I = *It;
        It++;

        if (I.isTerminator())
          continue;

        I.moveBefore(*OI.EntryBB, OI.EntryBB->getFirstInsertionPt());
      }

      OI.EntryBB->moveBefore(&ArtificialEntry);
      ArtificialEntry.eraseFromParent();
    }
    assert(&OutlinedFn->getEntryBlock() == OI.EntryBB);
    assert(OutlinedFn && OutlinedFn->getNumUses() == 1);

    // Run a user callback, e.g. to add attributes.
    if (OI.PostOutlineCB)
      OI.PostOutlineCB(*OutlinedFn);
  }

  // Remove work items that have been completed.
  OutlineInfos = std::move(DeferredOutlines);
}

OpenMPIRBuilder::~OpenMPIRBuilder() {
  assert(OutlineInfos.empty() && "There must be no outstanding outlinings");
}

GlobalValue *OpenMPIRBuilder::createGlobalFlag(unsigned Value, StringRef Name) {
  IntegerType *I32Ty = Type::getInt32Ty(M.getContext());
  auto *GV =
      new GlobalVariable(M, I32Ty,
                         /* isConstant = */ true, GlobalValue::WeakODRLinkage,
                         ConstantInt::get(I32Ty, Value), Name);
  GV->setVisibility(GlobalValue::HiddenVisibility);

  return GV;
}

Constant *OpenMPIRBuilder::getOrCreateIdent(Constant *SrcLocStr,
                                            uint32_t SrcLocStrSize,
                                            IdentFlag LocFlags,
                                            unsigned Reserve2Flags) {
  // Enable "C-mode".
  LocFlags |= OMP_IDENT_FLAG_KMPC;

  Constant *&Ident =
      IdentMap[{SrcLocStr, uint64_t(LocFlags) << 31 | Reserve2Flags}];
  if (!Ident) {
    Constant *I32Null = ConstantInt::getNullValue(Int32);
    Constant *IdentData[] = {I32Null,
                             ConstantInt::get(Int32, uint32_t(LocFlags)),
                             ConstantInt::get(Int32, Reserve2Flags),
                             ConstantInt::get(Int32, SrcLocStrSize), SrcLocStr};
    Constant *Initializer =
        ConstantStruct::get(OpenMPIRBuilder::Ident, IdentData);

    // Look for an existing encoding of the location + flags; not strictly
    // needed, but it minimizes the difference to the existing solution while
    // we transition.
    for (GlobalVariable &GV : M.getGlobalList())
      if (GV.getValueType() == OpenMPIRBuilder::Ident && GV.hasInitializer())
        if (GV.getInitializer() == Initializer)
          Ident = &GV;

    if (!Ident) {
      auto *GV = new GlobalVariable(
          M, OpenMPIRBuilder::Ident,
          /* isConstant = */ true, GlobalValue::PrivateLinkage, Initializer, "",
          nullptr, GlobalValue::NotThreadLocal,
          M.getDataLayout().getDefaultGlobalsAddressSpace());
      GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
      GV->setAlignment(Align(8));
      Ident = GV;
    }
  }

  return ConstantExpr::getPointerBitCastOrAddrSpaceCast(Ident, IdentPtr);
}

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(StringRef LocStr,
                                                uint32_t &SrcLocStrSize) {
  SrcLocStrSize = LocStr.size();
  Constant *&SrcLocStr = SrcLocStrMap[LocStr];
  if (!SrcLocStr) {
    Constant *Initializer =
        ConstantDataArray::getString(M.getContext(), LocStr);

    // Look for an existing encoding of the location; not strictly needed, but
    // it minimizes the difference to the existing solution while we transition.
    for (GlobalVariable &GV : M.getGlobalList())
      if (GV.isConstant() && GV.hasInitializer() &&
          GV.getInitializer() == Initializer)
        return SrcLocStr = ConstantExpr::getPointerCast(&GV, Int8Ptr);

    SrcLocStr = Builder.CreateGlobalStringPtr(LocStr, /* Name */ "",
                                              /* AddressSpace */ 0, &M);
  }
  return SrcLocStr;
}

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(StringRef FunctionName,
                                                StringRef FileName,
                                                unsigned Line, unsigned Column,
                                                uint32_t &SrcLocStrSize) {
  SmallString<128> Buffer;
  Buffer.push_back(';');
  Buffer.append(FileName);
  Buffer.push_back(';');
  Buffer.append(FunctionName);
  Buffer.push_back(';');
  Buffer.append(std::to_string(Line));
  Buffer.push_back(';');
  Buffer.append(std::to_string(Column));
  Buffer.push_back(';');
  Buffer.push_back(';');
  return getOrCreateSrcLocStr(Buffer.str(), SrcLocStrSize);
}

Constant *
OpenMPIRBuilder::getOrCreateDefaultSrcLocStr(uint32_t &SrcLocStrSize) {
  StringRef UnknownLoc = ";unknown;unknown;0;0;;";
  return getOrCreateSrcLocStr(UnknownLoc, SrcLocStrSize);
}

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(DebugLoc DL,
                                                uint32_t &SrcLocStrSize,
                                                Function *F) {
  DILocation *DIL = DL.get();
  if (!DIL)
    return getOrCreateDefaultSrcLocStr(SrcLocStrSize);
  StringRef FileName = M.getName();
  if (DIFile *DIF = DIL->getFile())
    if (Optional<StringRef> Source = DIF->getSource())
      FileName = *Source;
  StringRef Function = DIL->getScope()->getSubprogram()->getName();
  if (Function.empty() && F)
    Function = F->getName();
  return getOrCreateSrcLocStr(Function, FileName, DIL->getLine(),
                              DIL->getColumn(), SrcLocStrSize);
}

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(const LocationDescription &Loc,
                                                uint32_t &SrcLocStrSize) {
  return getOrCreateSrcLocStr(Loc.DL, SrcLocStrSize,
                              Loc.IP.getBlock()->getParent());
}

Value *OpenMPIRBuilder::getOrCreateThreadID(Value *Ident) {
  return Builder.CreateCall(
      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num), Ident,
      "omp_global_thread_num");
}

OpenMPIRBuilder::InsertPointTy
OpenMPIRBuilder::createBarrier(const LocationDescription &Loc, Directive DK,
                               bool ForceSimpleCall, bool CheckCancelFlag) {
  if (!updateToLocation(Loc))
    return Loc.IP;
  return emitBarrierImpl(Loc, DK, ForceSimpleCall, CheckCancelFlag);
}

OpenMPIRBuilder::InsertPointTy
OpenMPIRBuilder::emitBarrierImpl(const LocationDescription &Loc, Directive Kind,
                                 bool ForceSimpleCall, bool CheckCancelFlag) {
  // Build call __kmpc_cancel_barrier(loc, thread_id) or
  //            __kmpc_barrier(loc, thread_id);

  IdentFlag BarrierLocFlags;
  switch (Kind) {
  case OMPD_for:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_FOR;
    break;
  case OMPD_sections:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SECTIONS;
    break;
  case OMPD_single:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SINGLE;
    break;
  case OMPD_barrier:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_EXPL;
    break;
  default:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL;
    break;
  }

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Args[] = {
      getOrCreateIdent(SrcLocStr, SrcLocStrSize, BarrierLocFlags),
      getOrCreateThreadID(getOrCreateIdent(SrcLocStr, SrcLocStrSize))};

  // If we are in a cancellable parallel region, barriers are cancellation
  // points.
  // TODO: Check why we would force simple calls or ignore the cancel flag.
  bool UseCancelBarrier =
      !ForceSimpleCall && isLastFinalizationInfoCancellable(OMPD_parallel);

  Value *Result =
      Builder.CreateCall(getOrCreateRuntimeFunctionPtr(
                             UseCancelBarrier ? OMPRTL___kmpc_cancel_barrier
                                              : OMPRTL___kmpc_barrier),
                         Args);

  if (UseCancelBarrier && CheckCancelFlag)
    emitCancelationCheckImpl(Result, OMPD_parallel);

  return Builder.saveIP();
}

OpenMPIRBuilder::InsertPointTy
OpenMPIRBuilder::createCancel(const LocationDescription &Loc,
                              Value *IfCondition,
                              omp::Directive CanceledDirective) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  // LLVM utilities prefer blocks with terminators.
  auto *UI = Builder.CreateUnreachable();

  Instruction *ThenTI = UI, *ElseTI = nullptr;
  if (IfCondition)
    SplitBlockAndInsertIfThenElse(IfCondition, UI, &ThenTI, &ElseTI);
  Builder.SetInsertPoint(ThenTI);

  Value *CancelKind = nullptr;
  switch (CanceledDirective) {
#define OMP_CANCEL_KIND(Enum, Str, DirectiveEnum, Value)                       \
  case DirectiveEnum:                                                          \
    CancelKind = Builder.getInt32(Value);                                      \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  default:
    llvm_unreachable("Unknown cancel kind!");
  }

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *Args[] = {Ident, getOrCreateThreadID(Ident), CancelKind};
  Value *Result = Builder.CreateCall(
      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_cancel), Args);
  auto ExitCB = [this, CanceledDirective, Loc](InsertPointTy IP) {
    if (CanceledDirective == OMPD_parallel) {
      IRBuilder<>::InsertPointGuard IPG(Builder);
      Builder.restoreIP(IP);
      createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
                    omp::Directive::OMPD_unknown, /* ForceSimpleCall */ false,
                    /* CheckCancelFlag */ false);
    }
  };

  // The actual cancel logic is shared with others, e.g., cancel_barriers.
  emitCancelationCheckImpl(Result, CanceledDirective, ExitCB);

  // Update the insertion point and remove the terminator we introduced.
  Builder.SetInsertPoint(UI->getParent());
  UI->eraseFromParent();

  return Builder.saveIP();
}

void OpenMPIRBuilder::emitOffloadingEntry(Constant *Addr, StringRef Name,
                                          uint64_t Size, int32_t Flags,
                                          StringRef SectionName) {
  Type *Int8PtrTy = Type::getInt8PtrTy(M.getContext());
  Type *Int32Ty = Type::getInt32Ty(M.getContext());
  Type *SizeTy = M.getDataLayout().getIntPtrType(M.getContext());

  Constant *AddrName = ConstantDataArray::getString(M.getContext(), Name);

  // Create the constant string used to look up the symbol on the device.
  auto *Str =
      new llvm::GlobalVariable(M, AddrName->getType(), /*isConstant=*/true,
                               llvm::GlobalValue::InternalLinkage, AddrName,
                               ".omp_offloading.entry_name");
  Str->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);

  // Construct the offloading entry.
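  // The initializer below mirrors the entry layout expected by the offloading
  // runtime (a sketch):
  //   struct __tgt_offload_entry {
  //     void *addr;      // Address of the symbol (function or global).
  //     char *name;      // Name used for device-side lookup.
  //     size_t size;     // Size in bytes, 0 for functions.
  //     int32_t flags;
  //     int32_t reserved;
  //   };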
  Constant *EntryData[] = {
      ConstantExpr::getPointerBitCastOrAddrSpaceCast(Addr, Int8PtrTy),
      ConstantExpr::getPointerBitCastOrAddrSpaceCast(Str, Int8PtrTy),
      ConstantInt::get(SizeTy, Size),
      ConstantInt::get(Int32Ty, Flags),
      ConstantInt::get(Int32Ty, 0),
  };
  Constant *EntryInitializer =
      ConstantStruct::get(OpenMPIRBuilder::OffloadEntry, EntryData);

  auto *Entry = new GlobalVariable(
      M, OpenMPIRBuilder::OffloadEntry,
      /* isConstant = */ true, GlobalValue::WeakAnyLinkage, EntryInitializer,
      ".omp_offloading.entry." + Name, nullptr, GlobalValue::NotThreadLocal,
      M.getDataLayout().getDefaultGlobalsAddressSpace());

  // The entry has to be created in the section the linker expects it to be in.
  Entry->setSection(SectionName);
  Entry->setAlignment(Align(1));
}

void OpenMPIRBuilder::emitCancelationCheckImpl(Value *CancelFlag,
                                               omp::Directive CanceledDirective,
                                               FinalizeCallbackTy ExitCB) {
  assert(isLastFinalizationInfoCancellable(CanceledDirective) &&
         "Unexpected cancellation!");

  // For a cancel barrier we create two new blocks.
  BasicBlock *BB = Builder.GetInsertBlock();
  BasicBlock *NonCancellationBlock;
  if (Builder.GetInsertPoint() == BB->end()) {
    // TODO: This branch will not be needed once we have moved to the
    // OpenMPIRBuilder codegen completely.
    NonCancellationBlock = BasicBlock::Create(
        BB->getContext(), BB->getName() + ".cont", BB->getParent());
  } else {
    NonCancellationBlock = SplitBlock(BB, &*Builder.GetInsertPoint());
    BB->getTerminator()->eraseFromParent();
    Builder.SetInsertPoint(BB);
  }
  BasicBlock *CancellationBlock = BasicBlock::Create(
      BB->getContext(), BB->getName() + ".cncl", BB->getParent());

  // Jump to them based on the return value.
  Value *Cmp = Builder.CreateIsNull(CancelFlag);
  Builder.CreateCondBr(Cmp, NonCancellationBlock, CancellationBlock,
                       /* TODO weight */ nullptr, nullptr);

  // From the cancellation block we finalize all variables and go to the
  // post finalization block that is known to the FiniCB callback.
  Builder.SetInsertPoint(CancellationBlock);
  if (ExitCB)
    ExitCB(Builder.saveIP());
  auto &FI = FinalizationStack.back();
  FI.FiniCB(Builder.saveIP());

  // The continuation block is where code generation continues.
  Builder.SetInsertPoint(NonCancellationBlock, NonCancellationBlock->begin());
}

IRBuilder<>::InsertPoint OpenMPIRBuilder::createParallel(
    const LocationDescription &Loc, InsertPointTy OuterAllocaIP,
    BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB,
    FinalizeCallbackTy FiniCB, Value *IfCondition, Value *NumThreads,
    omp::ProcBindKind ProcBind, bool IsCancellable) {
  assert(!isConflictIP(Loc.IP, OuterAllocaIP) && "IPs must not be ambiguous");

  if (!updateToLocation(Loc))
    return Loc.IP;

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *ThreadID = getOrCreateThreadID(Ident);

  if (NumThreads) {
    // Build call __kmpc_push_num_threads(&Ident, global_tid, num_threads)
    Value *Args[] = {
        Ident, ThreadID,
        Builder.CreateIntCast(NumThreads, Int32, /*isSigned*/ false)};
    Builder.CreateCall(
        getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_num_threads), Args);
  }

  if (ProcBind != OMP_PROC_BIND_default) {
    // Build call __kmpc_push_proc_bind(&Ident, global_tid, proc_bind)
    Value *Args[] = {
        Ident, ThreadID,
        ConstantInt::get(Int32, unsigned(ProcBind), /*isSigned=*/true)};
    Builder.CreateCall(
        getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_proc_bind), Args);
  }

  BasicBlock *InsertBB = Builder.GetInsertBlock();
  Function *OuterFn = InsertBB->getParent();

  // Save the outer alloca block because the insertion iterator may get
  // invalidated and we still need this later.
  BasicBlock *OuterAllocaBlock = OuterAllocaIP.getBlock();

  // Vector to remember instructions we used only during the modeling but which
  // we want to delete at the end.
  SmallVector<Instruction *, 4> ToBeDeleted;

  // Change the location to the outer alloca insertion point to create and
  // initialize the allocas we pass into the parallel region.
  Builder.restoreIP(OuterAllocaIP);
  AllocaInst *TIDAddr = Builder.CreateAlloca(Int32, nullptr, "tid.addr");
  AllocaInst *ZeroAddr = Builder.CreateAlloca(Int32, nullptr, "zero.addr");

  // If there is an if condition, we actually use the TIDAddr and ZeroAddr in
  // the program; otherwise we only need them for modeling purposes to get the
  // associated arguments in the outlined function. In the former case,
  // initialize the allocas properly; in the latter case, delete them later.
  if (IfCondition) {
    Builder.CreateStore(Constant::getNullValue(Int32), TIDAddr);
    Builder.CreateStore(Constant::getNullValue(Int32), ZeroAddr);
  } else {
    ToBeDeleted.push_back(TIDAddr);
    ToBeDeleted.push_back(ZeroAddr);
  }

  // Create an artificial insertion point that will also ensure the blocks we
  // are about to split are not degenerate.
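  // The `unreachable` below is only a placeholder terminator; it is erased
  // again at the end of this function once the region blocks are wired up.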
  auto *UI = new UnreachableInst(Builder.getContext(), InsertBB);

  Instruction *ThenTI = UI, *ElseTI = nullptr;
  if (IfCondition)
    SplitBlockAndInsertIfThenElse(IfCondition, UI, &ThenTI, &ElseTI);

  BasicBlock *ThenBB = ThenTI->getParent();
  BasicBlock *PRegEntryBB = ThenBB->splitBasicBlock(ThenTI, "omp.par.entry");
  BasicBlock *PRegBodyBB =
      PRegEntryBB->splitBasicBlock(ThenTI, "omp.par.region");
  BasicBlock *PRegPreFiniBB =
      PRegBodyBB->splitBasicBlock(ThenTI, "omp.par.pre_finalize");
  BasicBlock *PRegExitBB =
      PRegPreFiniBB->splitBasicBlock(ThenTI, "omp.par.exit");

  auto FiniCBWrapper = [&](InsertPointTy IP) {
    // Hide "open-ended" blocks from the given FiniCB by setting the right jump
    // target to the region exit block.
    if (IP.getBlock()->end() == IP.getPoint()) {
      IRBuilder<>::InsertPointGuard IPG(Builder);
      Builder.restoreIP(IP);
      Instruction *I = Builder.CreateBr(PRegExitBB);
      IP = InsertPointTy(I->getParent(), I->getIterator());
    }
    assert(IP.getBlock()->getTerminator()->getNumSuccessors() == 1 &&
           IP.getBlock()->getTerminator()->getSuccessor(0) == PRegExitBB &&
           "Unexpected insertion point for finalization call!");
    return FiniCB(IP);
  };

  FinalizationStack.push_back({FiniCBWrapper, OMPD_parallel, IsCancellable});

  // Generate the privatization allocas in the block that will become the entry
  // of the outlined function.
  Builder.SetInsertPoint(PRegEntryBB->getTerminator());
  InsertPointTy InnerAllocaIP = Builder.saveIP();

  AllocaInst *PrivTIDAddr =
      Builder.CreateAlloca(Int32, nullptr, "tid.addr.local");
  Instruction *PrivTID = Builder.CreateLoad(Int32, PrivTIDAddr, "tid");

  // Add some fake uses for OpenMP provided arguments.
  ToBeDeleted.push_back(Builder.CreateLoad(Int32, TIDAddr, "tid.addr.use"));
  Instruction *ZeroAddrUse =
      Builder.CreateLoad(Int32, ZeroAddr, "zero.addr.use");
  ToBeDeleted.push_back(ZeroAddrUse);

  // ThenBB
  //   |
  //   V
  // PRegionEntryBB         <- Privatization allocas are placed here.
  //   |
  //   V
  // PRegionBodyBB          <- BodyGen is invoked here.
  //   |
  //   V
  // PRegPreFiniBB          <- The block we will start finalization from.
  //   |
  //   V
  // PRegionExitBB          <- A common exit to simplify block collection.
  //

  LLVM_DEBUG(dbgs() << "Before body codegen: " << *OuterFn << "\n");

  // Let the caller create the body.
  assert(BodyGenCB && "Expected body generation callback!");
  InsertPointTy CodeGenIP(PRegBodyBB, PRegBodyBB->begin());
  BodyGenCB(InnerAllocaIP, CodeGenIP);

  LLVM_DEBUG(dbgs() << "After body codegen: " << *OuterFn << "\n");

  FunctionCallee RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call);
  if (auto *F = dyn_cast<llvm::Function>(RTLFn.getCallee())) {
    if (!F->hasMetadata(llvm::LLVMContext::MD_callback)) {
      llvm::LLVMContext &Ctx = F->getContext();
      MDBuilder MDB(Ctx);
      // Annotate the callback behavior of the __kmpc_fork_call:
      //  - The callback callee is argument number 2 (microtask).
      //  - The first two arguments of the callback callee are unknown (-1).
      //  - All variadic arguments to the __kmpc_fork_call are passed to the
      //    callback callee.
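      // (This mirrors the !callback annotation also attached in
      // getOrCreateRuntimeFunction above.)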
      F->addMetadata(
          llvm::LLVMContext::MD_callback,
          *llvm::MDNode::get(
              Ctx, {MDB.createCallbackEncoding(2, {-1, -1},
                                               /* VarArgsArePassed */ true)}));
    }
  }

  OutlineInfo OI;
  OI.PostOutlineCB = [=](Function &OutlinedFn) {
    // Add some known attributes.
    OutlinedFn.addParamAttr(0, Attribute::NoAlias);
    OutlinedFn.addParamAttr(1, Attribute::NoAlias);
    OutlinedFn.addFnAttr(Attribute::NoUnwind);
    OutlinedFn.addFnAttr(Attribute::NoRecurse);

    assert(OutlinedFn.arg_size() >= 2 &&
           "Expected at least tid and bounded tid as arguments");
    unsigned NumCapturedVars =
        OutlinedFn.arg_size() - /* tid & bounded tid */ 2;

    CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
    CI->getParent()->setName("omp_parallel");
    Builder.SetInsertPoint(CI);

    // Build call __kmpc_fork_call(Ident, n, microtask, var1, .., varn);
    Value *ForkCallArgs[] = {
        Ident, Builder.getInt32(NumCapturedVars),
        Builder.CreateBitCast(&OutlinedFn, ParallelTaskPtr)};

    SmallVector<Value *, 16> RealArgs;
    RealArgs.append(std::begin(ForkCallArgs), std::end(ForkCallArgs));
    RealArgs.append(CI->arg_begin() + /* tid & bound tid */ 2, CI->arg_end());

    Builder.CreateCall(RTLFn, RealArgs);

    LLVM_DEBUG(dbgs() << "With fork_call placed: "
                      << *Builder.GetInsertBlock()->getParent() << "\n");

    InsertPointTy ExitIP(PRegExitBB, PRegExitBB->end());

    // Initialize the local TID stack location with the argument value.
    Builder.SetInsertPoint(PrivTID);
    Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin();
    Builder.CreateStore(Builder.CreateLoad(Int32, OutlinedAI), PrivTIDAddr);

    // If no "if" clause was present we do not need the call created during
    // outlining; otherwise we reuse it in the serialized parallel region.
    if (!ElseTI) {
      CI->eraseFromParent();
    } else {

      // If an "if" clause was present we are now generating the serialized
      // version into the "else" branch.
      Builder.SetInsertPoint(ElseTI);

      // Build calls __kmpc_serialized_parallel(&Ident, GTid);
      Value *SerializedParallelCallArgs[] = {Ident, ThreadID};
      Builder.CreateCall(
          getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_serialized_parallel),
          SerializedParallelCallArgs);

      // OutlinedFn(&GTid, &zero, CapturedStruct);
      CI->removeFromParent();
      Builder.Insert(CI);

      // __kmpc_end_serialized_parallel(&Ident, GTid);
      Value *EndArgs[] = {Ident, ThreadID};
      Builder.CreateCall(
          getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_serialized_parallel),
          EndArgs);

      LLVM_DEBUG(dbgs() << "With serialized parallel region: "
                        << *Builder.GetInsertBlock()->getParent() << "\n");
    }

    for (Instruction *I : ToBeDeleted)
      I->eraseFromParent();
  };

  // Adjust the finalization stack, verify the adjustment, and call the
  // finalize function one last time to finalize values between the pre-fini
  // block and the exit block if we left the parallel region "the normal way".
  auto FiniInfo = FinalizationStack.pop_back_val();
  (void)FiniInfo;
  assert(FiniInfo.DK == OMPD_parallel &&
         "Unexpected finalization stack state!");

  Instruction *PRegPreFiniTI = PRegPreFiniBB->getTerminator();

  InsertPointTy PreFiniIP(PRegPreFiniBB, PRegPreFiniTI->getIterator());
  FiniCB(PreFiniIP);

  OI.OuterAllocaBB = OuterAllocaBlock;
  OI.EntryBB = PRegEntryBB;
  OI.ExitBB = PRegExitBB;

  SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
  SmallVector<BasicBlock *, 32> Blocks;
  OI.collectBlocks(ParallelRegionBlockSet, Blocks);

  // Ensure a single exit node for the outlined region by creating one.
  // We might have multiple incoming edges to the exit now due to finalizations,
  // e.g., cancel calls that cause the control flow to leave the region.
  BasicBlock *PRegOutlinedExitBB = PRegExitBB;
  PRegExitBB = SplitBlock(PRegExitBB, &*PRegExitBB->getFirstInsertionPt());
  PRegOutlinedExitBB->setName("omp.par.outlined.exit");
  Blocks.push_back(PRegOutlinedExitBB);

  CodeExtractorAnalysisCache CEAC(*OuterFn);
  CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
                          /* AggregateArgs */ false,
                          /* BlockFrequencyInfo */ nullptr,
                          /* BranchProbabilityInfo */ nullptr,
                          /* AssumptionCache */ nullptr,
                          /* AllowVarArgs */ true,
                          /* AllowAlloca */ true,
                          /* AllocationBlock */ OuterAllocaBlock,
                          /* Suffix */ ".omp_par");

  // Find the inputs to and outputs from the code region.
  BasicBlock *CommonExit = nullptr;
  SetVector<Value *> Inputs, Outputs, SinkingCands, HoistingCands;
  Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);
  Extractor.findInputsOutputs(Inputs, Outputs, SinkingCands);

  LLVM_DEBUG(dbgs() << "Before privatization: " << *OuterFn << "\n");

  FunctionCallee TIDRTLFn =
      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num);

  auto PrivHelper = [&](Value &V) {
    if (&V == TIDAddr || &V == ZeroAddr) {
      OI.ExcludeArgsFromAggregate.push_back(&V);
      return;
    }

    SetVector<Use *> Uses;
    for (Use &U : V.uses())
      if (auto *UserI = dyn_cast<Instruction>(U.getUser()))
        if (ParallelRegionBlockSet.count(UserI->getParent()))
          Uses.insert(&U);

    // __kmpc_fork_call expects extra arguments as pointers. If the input
    // already has a pointer type, everything is fine. Otherwise, store the
    // value onto the stack and load it back inside the to-be-outlined region.
    // This ensures that only the pointer is passed to the function.
    // FIXME: if there are more than 15 trailing arguments, they must be
    // additionally packed in a struct.
    Value *Inner = &V;
    if (!V.getType()->isPointerTy()) {
      IRBuilder<>::InsertPointGuard Guard(Builder);
      LLVM_DEBUG(llvm::dbgs() << "Forwarding input as pointer: " << V << "\n");

      Builder.restoreIP(OuterAllocaIP);
      Value *Ptr =
          Builder.CreateAlloca(V.getType(), nullptr, V.getName() + ".reloaded");

      // Store to the stack at the end of the block that currently branches to
      // the entry block of the to-be-outlined region.
      Builder.SetInsertPoint(InsertBB,
                             InsertBB->getTerminator()->getIterator());
      Builder.CreateStore(&V, Ptr);

      // Load back next to the allocations in the to-be-outlined region.
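      // For a scalar %v this produces roughly (a sketch):
      //   %v.reloaded = alloca <ty>            ; at the outer alloca point
      //   store <ty> %v, ptr %v.reloaded       ; before entering the region
      //   %inner = load <ty>, ptr %v.reloaded  ; at the inner alloca point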
      Builder.restoreIP(InnerAllocaIP);
      Inner = Builder.CreateLoad(V.getType(), Ptr);
    }

    Value *ReplacementValue = nullptr;
    CallInst *CI = dyn_cast<CallInst>(&V);
    if (CI && CI->getCalledFunction() == TIDRTLFn.getCallee()) {
      ReplacementValue = PrivTID;
    } else {
      Builder.restoreIP(
          PrivCB(InnerAllocaIP, Builder.saveIP(), V, *Inner, ReplacementValue));
      assert(ReplacementValue &&
             "Expected copy/create callback to set replacement value!");
      if (ReplacementValue == &V)
        return;
    }

    for (Use *UPtr : Uses)
      UPtr->set(ReplacementValue);
  };

  // Reset the inner alloca insertion point as it will be used for loading the
  // values wrapped into pointers before passing them into the to-be-outlined
  // region. Configure it to insert immediately after the fake use of the zero
  // address so that the loads are available in the generated body and the
  // OpenMP-related values (thread ID and zero address pointers) remain leading
  // in the argument list.
  InnerAllocaIP = IRBuilder<>::InsertPoint(
      ZeroAddrUse->getParent(), ZeroAddrUse->getNextNode()->getIterator());

  // Reset the outer alloca insertion point to the entry of the relevant block
  // in case it was invalidated.
  OuterAllocaIP = IRBuilder<>::InsertPoint(
      OuterAllocaBlock, OuterAllocaBlock->getFirstInsertionPt());

  for (Value *Input : Inputs) {
    LLVM_DEBUG(dbgs() << "Captured input: " << *Input << "\n");
    PrivHelper(*Input);
  }
  LLVM_DEBUG({
    for (Value *Output : Outputs)
      LLVM_DEBUG(dbgs() << "Captured output: " << *Output << "\n");
  });
  assert(Outputs.empty() &&
         "OpenMP outlining should not produce live-out values!");

  LLVM_DEBUG(dbgs() << "After privatization: " << *OuterFn << "\n");
  LLVM_DEBUG({
    for (auto *BB : Blocks)
      dbgs() << " PBR: " << BB->getName() << "\n";
  });

  // Register the outlined info.
  addOutlineInfo(std::move(OI));

  InsertPointTy AfterIP(UI->getParent(), UI->getParent()->end());
  UI->eraseFromParent();

  return AfterIP;
}

void OpenMPIRBuilder::emitFlush(const LocationDescription &Loc) {
  // Build call void __kmpc_flush(ident_t *loc)
  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Args[] = {getOrCreateIdent(SrcLocStr, SrcLocStrSize)};

  Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_flush), Args);
}

void OpenMPIRBuilder::createFlush(const LocationDescription &Loc) {
  if (!updateToLocation(Loc))
    return;
  emitFlush(Loc);
}

void OpenMPIRBuilder::emitTaskwaitImpl(const LocationDescription &Loc) {
  // Build call kmp_int32 __kmpc_omp_taskwait(ident_t *loc, kmp_int32
  // global_tid);
  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *Args[] = {Ident, getOrCreateThreadID(Ident)};

  // Ignore the return result until untied tasks are supported.
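  // Emits roughly (a sketch):
  //   call i32 @__kmpc_omp_taskwait(ptr %ident, i32 %thread_id)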
  Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_taskwait),
                     Args);
}

void OpenMPIRBuilder::createTaskwait(const LocationDescription &Loc) {
  if (!updateToLocation(Loc))
    return;
  emitTaskwaitImpl(Loc);
}

void OpenMPIRBuilder::emitTaskyieldImpl(const LocationDescription &Loc) {
  // Build call __kmpc_omp_taskyield(loc, thread_id, 0);
  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Constant *I32Null = ConstantInt::getNullValue(Int32);
  Value *Args[] = {Ident, getOrCreateThreadID(Ident), I32Null};

  Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_taskyield),
                     Args);
}

void OpenMPIRBuilder::createTaskyield(const LocationDescription &Loc) {
  if (!updateToLocation(Loc))
    return;
  emitTaskyieldImpl(Loc);
}

OpenMPIRBuilder::InsertPointTy
OpenMPIRBuilder::createTask(const LocationDescription &Loc,
                            InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB,
                            bool Tied, Value *Final) {
  if (!updateToLocation(Loc))
    return InsertPointTy();

  // The current basic block is split into four basic blocks. After outlining,
  // they will be mapped as follows:
  // ```
  // def current_fn() {
  //   current_basic_block:
  //     br label %task.exit
  //   task.exit:
  //     ; instructions after task
  // }
  // def outlined_fn() {
  //   task.alloca:
  //     br label %task.body
  //   task.body:
  //     ret void
  // }
  // ```
  BasicBlock *TaskExitBB = splitBB(Builder, /*CreateBranch=*/true, "task.exit");
  BasicBlock *TaskBodyBB = splitBB(Builder, /*CreateBranch=*/true, "task.body");
  BasicBlock *TaskAllocaBB =
      splitBB(Builder, /*CreateBranch=*/true, "task.alloca");

  OutlineInfo OI;
  OI.EntryBB = TaskAllocaBB;
  OI.OuterAllocaBB = AllocaIP.getBlock();
  OI.ExitBB = TaskExitBB;
  OI.PostOutlineCB = [this, &Loc, Tied, Final](Function &OutlinedFn) {
    // The input IR here looks like the following:
    // ```
    // func @current_fn() {
    //   outlined_fn(%args)
    // }
    // func @outlined_fn(%args) { ... }
    // ```
    //
    // This is changed to the following:
    //
    // ```
    // func @current_fn() {
    //   runtime_call(..., wrapper_fn, ...)
    // }
    // func @wrapper_fn(..., %args) {
    //   outlined_fn(%args)
    // }
    // func @outlined_fn(%args) { ... }
    // ```

    // The stale call instruction will be replaced with a new call instruction
    // for the runtime call that takes the wrapper function.
    assert(OutlinedFn.getNumUses() == 1 &&
           "there must be a single user for the outlined function");
    CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());

    // HasTaskData is true if any variables are captured in the outlined
    // region, false otherwise.
    bool HasTaskData = StaleCI->arg_size() > 0;
    Builder.SetInsertPoint(StaleCI);

    // Gather the arguments for emitting the runtime call for
    // @__kmpc_omp_task_alloc.
    Function *TaskAllocFn =
        getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc);

    // Arguments - `loc_ref` (Ident) and `gtid` (ThreadID) for the runtime
    // call.
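    // The allocation call built below has the shape (a sketch):
    //   %task = call ptr @__kmpc_omp_task_alloc(ptr %ident, i32 %gtid,
    //               i32 %flags, i64 %sizeof_task, i64 %sizeof_shareds,
    //               ptr %wrapper_fn)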
    uint32_t SrcLocStrSize;
    Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
    Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
    Value *ThreadID = getOrCreateThreadID(Ident);

    // Argument - `flags`
    // Task is tied iff (Flags & 1) == 1.
    // Task is untied iff (Flags & 1) == 0.
    // Task is final iff (Flags & 2) == 2.
    // Task is not final iff (Flags & 2) == 0.
    // TODO: Handle the other flags.
    Value *Flags = Builder.getInt32(Tied);
    if (Final) {
      Value *FinalFlag =
          Builder.CreateSelect(Final, Builder.getInt32(2), Builder.getInt32(0));
      Flags = Builder.CreateOr(FinalFlag, Flags);
    }

    // Argument - `sizeof_kmp_task_t` (TaskSize)
    // TaskSize is the size in bytes of the kmp_task_t data structure,
    // including any private variables accessed in the task.
    Value *TaskSize = Builder.getInt64(0);
    if (HasTaskData) {
      AllocaInst *ArgStructAlloca =
          dyn_cast<AllocaInst>(StaleCI->getArgOperand(0));
      assert(ArgStructAlloca &&
             "Unable to find the alloca instruction corresponding to arguments "
             "for extracted function");
      StructType *ArgStructType =
          dyn_cast<StructType>(ArgStructAlloca->getAllocatedType());
      assert(ArgStructType && "Unable to find struct type corresponding to "
                              "arguments for extracted function");
      TaskSize =
          Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType));
    }

    // TODO: Argument - sizeof_shareds

    // Argument - task_entry (the wrapper function)
    // If the outlined function has some captured variables (i.e. HasTaskData
    // is true), then the wrapper function will have an additional argument
    // (the struct containing captured variables). Otherwise, no such argument
    // will be present.
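    // I.e., the wrapper has one of these two shapes (a sketch):
    //   i32 @<outlined>.wrapper(i32 %gtid)                  ; no captures
    //   i32 @<outlined>.wrapper(i32 %gtid, ptr %task_data)  ; with captures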
    SmallVector<Type *> WrapperArgTys{Builder.getInt32Ty()};
    if (HasTaskData)
      WrapperArgTys.push_back(OutlinedFn.getArg(0)->getType());
    FunctionCallee WrapperFuncVal = M.getOrInsertFunction(
        (Twine(OutlinedFn.getName()) + ".wrapper").str(),
        FunctionType::get(Builder.getInt32Ty(), WrapperArgTys, false));
    Function *WrapperFunc = dyn_cast<Function>(WrapperFuncVal.getCallee());
    PointerType *WrapperFuncBitcastType =
        FunctionType::get(Builder.getInt32Ty(),
                          {Builder.getInt32Ty(), Builder.getInt8PtrTy()}, false)
            ->getPointerTo();
    Value *WrapperFuncBitcast =
        ConstantExpr::getBitCast(WrapperFunc, WrapperFuncBitcastType);

    // Emit the @__kmpc_omp_task_alloc runtime call.
    // The runtime call returns a pointer to an area where the task captured
    // variables must be copied before the task is run (NewTaskData).
    CallInst *NewTaskData = Builder.CreateCall(
        TaskAllocFn,
        {/*loc_ref=*/Ident, /*gtid=*/ThreadID, /*flags=*/Flags,
         /*sizeof_task=*/TaskSize, /*sizeof_shared=*/Builder.getInt64(0),
         /*task_func=*/WrapperFuncBitcast});

    // Copy the arguments for the outlined function.
    if (HasTaskData) {
      Value *TaskData = StaleCI->getArgOperand(0);
      Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
      Builder.CreateMemCpy(NewTaskData, Alignment, TaskData, Alignment,
                           TaskSize);
    }

    // Emit the @__kmpc_omp_task runtime call to spawn the task.
    Function *TaskFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task);
    Builder.CreateCall(TaskFn, {Ident, ThreadID, NewTaskData});

    StaleCI->eraseFromParent();

    // Emit the body for the wrapper function.
    BasicBlock *WrapperEntryBB =
        BasicBlock::Create(M.getContext(), "", WrapperFunc);
    Builder.SetInsertPoint(WrapperEntryBB);
    if (HasTaskData)
      Builder.CreateCall(&OutlinedFn, {WrapperFunc->getArg(1)});
    else
      Builder.CreateCall(&OutlinedFn);
    Builder.CreateRet(Builder.getInt32(0));
  };

  addOutlineInfo(std::move(OI));

  InsertPointTy TaskAllocaIP =
      InsertPointTy(TaskAllocaBB, TaskAllocaBB->begin());
  InsertPointTy TaskBodyIP = InsertPointTy(TaskBodyBB, TaskBodyBB->begin());
  BodyGenCB(TaskAllocaIP, TaskBodyIP);
  Builder.SetInsertPoint(TaskExitBB);

  return Builder.saveIP();
}

OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createSections(
    const LocationDescription &Loc, InsertPointTy AllocaIP,
    ArrayRef<StorableBodyGenCallbackTy> SectionCBs, PrivatizeCallbackTy PrivCB,
    FinalizeCallbackTy FiniCB, bool IsCancellable, bool IsNowait) {
  assert(!isConflictIP(AllocaIP, Loc.IP) && "Dedicated IP allocas required");

  if (!updateToLocation(Loc))
    return Loc.IP;

  auto FiniCBWrapper = [&](InsertPointTy IP) {
    if (IP.getBlock()->end() != IP.getPoint())
      return FiniCB(IP);
    // This must be done; otherwise, any nested constructs using
    // FinalizeOMPRegion will fail because that function requires the
    // Finalization Basic Block to have a terminator, which is already removed
    // by EmitOMPRegionBody.
    // IP is currently at the cancellation block.
    // We need to backtrack to the condition block to fetch
    // the exit block and create a branch from cancellation
    // to the exit block.
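    // I.e. (a sketch): IP sits in a cancellation block whose single
    // predecessor is the switch case block; two predecessors up is the loop
    // condition block, whose second successor is the loop exit we branch to.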
    IRBuilder<>::InsertPointGuard IPG(Builder);
    Builder.restoreIP(IP);
    auto *CaseBB = IP.getBlock()->getSinglePredecessor();
    auto *CondBB = CaseBB->getSinglePredecessor()->getSinglePredecessor();
    auto *ExitBB = CondBB->getTerminator()->getSuccessor(1);
    Instruction *I = Builder.CreateBr(ExitBB);
    IP = InsertPointTy(I->getParent(), I->getIterator());
    return FiniCB(IP);
  };

  FinalizationStack.push_back({FiniCBWrapper, OMPD_sections, IsCancellable});

  // Each section is emitted as a switch case.
  // Each finalization callback is handled from clang.EmitOMPSectionDirective()
  // -> OMP.createSection() which generates the IR for each section.
  // Iterate through all sections and emit a switch construct:
  // switch (IV) {
  //   case 0:
  //     <SectionStmt[0]>;
  //     break;
  //   ...
  //   case <NumSection> - 1:
  //     <SectionStmt[<NumSection> - 1]>;
  //     break;
  // }
  // ...
  // section_loop.after:
  //   <FiniCB>;
  auto LoopBodyGenCB = [&](InsertPointTy CodeGenIP, Value *IndVar) {
    Builder.restoreIP(CodeGenIP);
    BasicBlock *Continue =
        splitBBWithSuffix(Builder, /*CreateBranch=*/false, ".sections.after");
    Function *CurFn = Continue->getParent();
    SwitchInst *SwitchStmt = Builder.CreateSwitch(IndVar, Continue);

    unsigned CaseNumber = 0;
    for (auto SectionCB : SectionCBs) {
      BasicBlock *CaseBB = BasicBlock::Create(
          M.getContext(), "omp_section_loop.body.case", CurFn, Continue);
      SwitchStmt->addCase(Builder.getInt32(CaseNumber), CaseBB);
      Builder.SetInsertPoint(CaseBB);
      BranchInst *CaseEndBr = Builder.CreateBr(Continue);
      SectionCB(InsertPointTy(),
                {CaseEndBr->getParent(), CaseEndBr->getIterator()});
      CaseNumber++;
    }
    // Remove the existing terminator from the body BB since there can be no
    // terminators after a switch/case.
  };
  // Loop body ends here.
  // LowerBound, UpperBound, and Stride for createCanonicalLoop.
  Type *I32Ty = Type::getInt32Ty(M.getContext());
  Value *LB = ConstantInt::get(I32Ty, 0);
  Value *UB = ConstantInt::get(I32Ty, SectionCBs.size());
  Value *ST = ConstantInt::get(I32Ty, 1);
  llvm::CanonicalLoopInfo *LoopInfo = createCanonicalLoop(
      Loc, LoopBodyGenCB, LB, UB, ST, true, false, AllocaIP, "section_loop");
  InsertPointTy AfterIP =
      applyStaticWorkshareLoop(Loc.DL, LoopInfo, AllocaIP, !IsNowait);

  // Apply the finalization callback in LoopAfterBB.
  auto FiniInfo = FinalizationStack.pop_back_val();
  assert(FiniInfo.DK == OMPD_sections &&
         "Unexpected finalization stack state!");
  if (FinalizeCallbackTy &CB = FiniInfo.FiniCB) {
    Builder.restoreIP(AfterIP);
    BasicBlock *FiniBB =
        splitBBWithSuffix(Builder, /*CreateBranch=*/true, "sections.fini");
    CB(Builder.saveIP());
    AfterIP = {FiniBB, FiniBB->begin()};
  }

  return AfterIP;
}

OpenMPIRBuilder::InsertPointTy
OpenMPIRBuilder::createSection(const LocationDescription &Loc,
                               BodyGenCallbackTy BodyGenCB,
                               FinalizeCallbackTy FiniCB) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  auto FiniCBWrapper = [&](InsertPointTy IP) {
    if (IP.getBlock()->end() != IP.getPoint())
      return FiniCB(IP);
    // This must be done; otherwise, any nested constructs using
    // FinalizeOMPRegion will fail because that function requires the
    // Finalization Basic Block to have a terminator, which is already removed
    // by EmitOMPRegionBody.
EmitOMPRegionBody. 1536 // IP is currently at cancelation block. 1537 // We need to backtrack to the condition block to fetch 1538 // the exit block and create a branch from cancelation 1539 // to exit block. 1540 IRBuilder<>::InsertPointGuard IPG(Builder); 1541 Builder.restoreIP(IP); 1542 auto *CaseBB = Loc.IP.getBlock(); 1543 auto *CondBB = CaseBB->getSinglePredecessor()->getSinglePredecessor(); 1544 auto *ExitBB = CondBB->getTerminator()->getSuccessor(1); 1545 Instruction *I = Builder.CreateBr(ExitBB); 1546 IP = InsertPointTy(I->getParent(), I->getIterator()); 1547 return FiniCB(IP); 1548 }; 1549 1550 Directive OMPD = Directive::OMPD_sections; 1551 // Since we are using Finalization Callback here, HasFinalize 1552 // and IsCancellable have to be true 1553 return EmitOMPInlinedRegion(OMPD, nullptr, nullptr, BodyGenCB, FiniCBWrapper, 1554 /*Conditional*/ false, /*hasFinalize*/ true, 1555 /*IsCancellable*/ true); 1556 } 1557 1558 /// Create a function with a unique name and a "void (i8*, i8*)" signature in 1559 /// the given module and return it. 1560 Function *getFreshReductionFunc(Module &M) { 1561 Type *VoidTy = Type::getVoidTy(M.getContext()); 1562 Type *Int8PtrTy = Type::getInt8PtrTy(M.getContext()); 1563 auto *FuncTy = 1564 FunctionType::get(VoidTy, {Int8PtrTy, Int8PtrTy}, /* IsVarArg */ false); 1565 return Function::Create(FuncTy, GlobalVariable::InternalLinkage, 1566 M.getDataLayout().getDefaultGlobalsAddressSpace(), 1567 ".omp.reduction.func", &M); 1568 } 1569 1570 OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductions( 1571 const LocationDescription &Loc, InsertPointTy AllocaIP, 1572 ArrayRef<ReductionInfo> ReductionInfos, bool IsNoWait) { 1573 for (const ReductionInfo &RI : ReductionInfos) { 1574 (void)RI; 1575 assert(RI.Variable && "expected non-null variable"); 1576 assert(RI.PrivateVariable && "expected non-null private variable"); 1577 assert(RI.ReductionGen && "expected non-null reduction generator callback"); 1578 assert(RI.Variable->getType() == RI.PrivateVariable->getType() && 1579 "expected variables and their private equivalents to have the same " 1580 "type"); 1581 assert(RI.Variable->getType()->isPointerTy() && 1582 "expected variables to be pointers"); 1583 } 1584 1585 if (!updateToLocation(Loc)) 1586 return InsertPointTy(); 1587 1588 BasicBlock *InsertBlock = Loc.IP.getBlock(); 1589 BasicBlock *ContinuationBlock = 1590 InsertBlock->splitBasicBlock(Loc.IP.getPoint(), "reduce.finalize"); 1591 InsertBlock->getTerminator()->eraseFromParent(); 1592 1593 // Create and populate array of type-erased pointers to private reduction 1594 // values. 1595 unsigned NumReductions = ReductionInfos.size(); 1596 Type *RedArrayTy = ArrayType::get(Builder.getInt8PtrTy(), NumReductions); 1597 Builder.restoreIP(AllocaIP); 1598 Value *RedArray = Builder.CreateAlloca(RedArrayTy, nullptr, "red.array"); 1599 1600 Builder.SetInsertPoint(InsertBlock, InsertBlock->end()); 1601 1602 for (auto En : enumerate(ReductionInfos)) { 1603 unsigned Index = En.index(); 1604 const ReductionInfo &RI = En.value(); 1605 Value *RedArrayElemPtr = Builder.CreateConstInBoundsGEP2_64( 1606 RedArrayTy, RedArray, 0, Index, "red.array.elem." + Twine(Index)); 1607 Value *Casted = 1608 Builder.CreateBitCast(RI.PrivateVariable, Builder.getInt8PtrTy(), 1609 "private.red.var." + Twine(Index) + ".casted"); 1610 Builder.CreateStore(Casted, RedArrayElemPtr); 1611 } 1612 1613 // Emit a call to the runtime function that orchestrates the reduction. 1614 // Declare the reduction function in the process. 
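// For illustration (a sketch of the control flow emitted below, not extra
// IR): __kmpc_reduce(_nowait) returns 1 if this thread should perform the
// non-atomic reduction, 2 if it should perform the atomic reduction, and
// anything else (the switch default) if it has nothing left to do:
//
//   switch (__kmpc_reduce(...)) {
//   case 1: <non-atomic elementwise reduce>; __kmpc_end_reduce(...); break;
//   case 2: <atomic elementwise reduce>; break;
//   default: break; // fall through to reduce.finalize
//   }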
1615 Function *Func = Builder.GetInsertBlock()->getParent(); 1616 Module *Module = Func->getParent(); 1617 Value *RedArrayPtr = 1618 Builder.CreateBitCast(RedArray, Builder.getInt8PtrTy(), "red.array.ptr"); 1619 uint32_t SrcLocStrSize; 1620 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize); 1621 bool CanGenerateAtomic = 1622 llvm::all_of(ReductionInfos, [](const ReductionInfo &RI) { 1623 return RI.AtomicReductionGen; 1624 }); 1625 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize, 1626 CanGenerateAtomic 1627 ? IdentFlag::OMP_IDENT_FLAG_ATOMIC_REDUCE 1628 : IdentFlag(0)); 1629 Value *ThreadId = getOrCreateThreadID(Ident); 1630 Constant *NumVariables = Builder.getInt32(NumReductions); 1631 const DataLayout &DL = Module->getDataLayout(); 1632 unsigned RedArrayByteSize = DL.getTypeStoreSize(RedArrayTy); 1633 Constant *RedArraySize = Builder.getInt64(RedArrayByteSize); 1634 Function *ReductionFunc = getFreshReductionFunc(*Module); 1635 Value *Lock = getOMPCriticalRegionLock(".reduction"); 1636 Function *ReduceFunc = getOrCreateRuntimeFunctionPtr( 1637 IsNoWait ? RuntimeFunction::OMPRTL___kmpc_reduce_nowait 1638 : RuntimeFunction::OMPRTL___kmpc_reduce); 1639 CallInst *ReduceCall = 1640 Builder.CreateCall(ReduceFunc, 1641 {Ident, ThreadId, NumVariables, RedArraySize, 1642 RedArrayPtr, ReductionFunc, Lock}, 1643 "reduce"); 1644 1645 // Create final reduction entry blocks for the atomic and non-atomic case. 1646 // Emit IR that dispatches control flow to one of the blocks based on the 1647 // reduction supporting the atomic mode. 1648 BasicBlock *NonAtomicRedBlock = 1649 BasicBlock::Create(Module->getContext(), "reduce.switch.nonatomic", Func); 1650 BasicBlock *AtomicRedBlock = 1651 BasicBlock::Create(Module->getContext(), "reduce.switch.atomic", Func); 1652 SwitchInst *Switch = 1653 Builder.CreateSwitch(ReduceCall, ContinuationBlock, /* NumCases */ 2); 1654 Switch->addCase(Builder.getInt32(1), NonAtomicRedBlock); 1655 Switch->addCase(Builder.getInt32(2), AtomicRedBlock); 1656 1657 // Populate the non-atomic reduction using the elementwise reduction function. 1658 // This loads the elements from the global and private variables and reduces 1659 // them before storing back the result to the global variable. 1660 Builder.SetInsertPoint(NonAtomicRedBlock); 1661 for (auto En : enumerate(ReductionInfos)) { 1662 const ReductionInfo &RI = En.value(); 1663 Type *ValueType = RI.ElementType; 1664 Value *RedValue = Builder.CreateLoad(ValueType, RI.Variable, 1665 "red.value." + Twine(En.index())); 1666 Value *PrivateRedValue = 1667 Builder.CreateLoad(ValueType, RI.PrivateVariable, 1668 "red.private.value." + Twine(En.index())); 1669 Value *Reduced; 1670 Builder.restoreIP( 1671 RI.ReductionGen(Builder.saveIP(), RedValue, PrivateRedValue, Reduced)); 1672 if (!Builder.GetInsertBlock()) 1673 return InsertPointTy(); 1674 Builder.CreateStore(Reduced, RI.Variable); 1675 } 1676 Function *EndReduceFunc = getOrCreateRuntimeFunctionPtr( 1677 IsNoWait ? RuntimeFunction::OMPRTL___kmpc_end_reduce_nowait 1678 : RuntimeFunction::OMPRTL___kmpc_end_reduce); 1679 Builder.CreateCall(EndReduceFunc, {Ident, ThreadId, Lock}); 1680 Builder.CreateBr(ContinuationBlock); 1681 1682 // Populate the atomic reduction using the atomic elementwise reduction 1683 // function. There are no loads/stores here because they will be happening 1684 // inside the atomic elementwise reduction. 
1685 Builder.SetInsertPoint(AtomicRedBlock); 1686 if (CanGenerateAtomic) { 1687 for (const ReductionInfo &RI : ReductionInfos) { 1688 Builder.restoreIP(RI.AtomicReductionGen(Builder.saveIP(), RI.ElementType, 1689 RI.Variable, RI.PrivateVariable)); 1690 if (!Builder.GetInsertBlock()) 1691 return InsertPointTy(); 1692 } 1693 Builder.CreateBr(ContinuationBlock); 1694 } else { 1695 Builder.CreateUnreachable(); 1696 } 1697 1698 // Populate the outlined reduction function using the elementwise reduction 1699 // function. Partial values are extracted from the type-erased array of 1700 // pointers to private variables. 1701 BasicBlock *ReductionFuncBlock = 1702 BasicBlock::Create(Module->getContext(), "", ReductionFunc); 1703 Builder.SetInsertPoint(ReductionFuncBlock); 1704 Value *LHSArrayPtr = Builder.CreateBitCast(ReductionFunc->getArg(0), 1705 RedArrayTy->getPointerTo()); 1706 Value *RHSArrayPtr = Builder.CreateBitCast(ReductionFunc->getArg(1), 1707 RedArrayTy->getPointerTo()); 1708 for (auto En : enumerate(ReductionInfos)) { 1709 const ReductionInfo &RI = En.value(); 1710 Value *LHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64( 1711 RedArrayTy, LHSArrayPtr, 0, En.index()); 1712 Value *LHSI8Ptr = Builder.CreateLoad(Builder.getInt8PtrTy(), LHSI8PtrPtr); 1713 Value *LHSPtr = Builder.CreateBitCast(LHSI8Ptr, RI.Variable->getType()); 1714 Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr); 1715 Value *RHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64( 1716 RedArrayTy, RHSArrayPtr, 0, En.index()); 1717 Value *RHSI8Ptr = Builder.CreateLoad(Builder.getInt8PtrTy(), RHSI8PtrPtr); 1718 Value *RHSPtr = 1719 Builder.CreateBitCast(RHSI8Ptr, RI.PrivateVariable->getType()); 1720 Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr); 1721 Value *Reduced; 1722 Builder.restoreIP(RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced)); 1723 if (!Builder.GetInsertBlock()) 1724 return InsertPointTy(); 1725 Builder.CreateStore(Reduced, LHSPtr); 1726 } 1727 Builder.CreateRetVoid(); 1728 1729 Builder.SetInsertPoint(ContinuationBlock); 1730 return Builder.saveIP(); 1731 } 1732 1733 OpenMPIRBuilder::InsertPointTy 1734 OpenMPIRBuilder::createMaster(const LocationDescription &Loc, 1735 BodyGenCallbackTy BodyGenCB, 1736 FinalizeCallbackTy FiniCB) { 1737 1738 if (!updateToLocation(Loc)) 1739 return Loc.IP; 1740 1741 Directive OMPD = Directive::OMPD_master; 1742 uint32_t SrcLocStrSize; 1743 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize); 1744 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize); 1745 Value *ThreadId = getOrCreateThreadID(Ident); 1746 Value *Args[] = {Ident, ThreadId}; 1747 1748 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_master); 1749 Instruction *EntryCall = Builder.CreateCall(EntryRTLFn, Args); 1750 1751 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_master); 1752 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, Args); 1753 1754 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB, 1755 /*Conditional*/ true, /*hasFinalize*/ true); 1756 } 1757 1758 OpenMPIRBuilder::InsertPointTy 1759 OpenMPIRBuilder::createMasked(const LocationDescription &Loc, 1760 BodyGenCallbackTy BodyGenCB, 1761 FinalizeCallbackTy FiniCB, Value *Filter) { 1762 if (!updateToLocation(Loc)) 1763 return Loc.IP; 1764 1765 Directive OMPD = Directive::OMPD_masked; 1766 uint32_t SrcLocStrSize; 1767 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize); 1768 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize); 1769 Value *ThreadId = 
getOrCreateThreadID(Ident); 1770 Value *Args[] = {Ident, ThreadId, Filter}; 1771 Value *ArgsEnd[] = {Ident, ThreadId}; 1772 1773 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_masked); 1774 Instruction *EntryCall = Builder.CreateCall(EntryRTLFn, Args); 1775 1776 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_masked); 1777 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, ArgsEnd); 1778 1779 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB, 1780 /*Conditional*/ true, /*hasFinalize*/ true); 1781 } 1782 1783 CanonicalLoopInfo *OpenMPIRBuilder::createLoopSkeleton( 1784 DebugLoc DL, Value *TripCount, Function *F, BasicBlock *PreInsertBefore, 1785 BasicBlock *PostInsertBefore, const Twine &Name) { 1786 Module *M = F->getParent(); 1787 LLVMContext &Ctx = M->getContext(); 1788 Type *IndVarTy = TripCount->getType(); 1789 1790 // Create the basic block structure. 1791 BasicBlock *Preheader = 1792 BasicBlock::Create(Ctx, "omp_" + Name + ".preheader", F, PreInsertBefore); 1793 BasicBlock *Header = 1794 BasicBlock::Create(Ctx, "omp_" + Name + ".header", F, PreInsertBefore); 1795 BasicBlock *Cond = 1796 BasicBlock::Create(Ctx, "omp_" + Name + ".cond", F, PreInsertBefore); 1797 BasicBlock *Body = 1798 BasicBlock::Create(Ctx, "omp_" + Name + ".body", F, PreInsertBefore); 1799 BasicBlock *Latch = 1800 BasicBlock::Create(Ctx, "omp_" + Name + ".inc", F, PostInsertBefore); 1801 BasicBlock *Exit = 1802 BasicBlock::Create(Ctx, "omp_" + Name + ".exit", F, PostInsertBefore); 1803 BasicBlock *After = 1804 BasicBlock::Create(Ctx, "omp_" + Name + ".after", F, PostInsertBefore); 1805 1806 // Use specified DebugLoc for new instructions. 1807 Builder.SetCurrentDebugLocation(DL); 1808 1809 Builder.SetInsertPoint(Preheader); 1810 Builder.CreateBr(Header); 1811 1812 Builder.SetInsertPoint(Header); 1813 PHINode *IndVarPHI = Builder.CreatePHI(IndVarTy, 2, "omp_" + Name + ".iv"); 1814 IndVarPHI->addIncoming(ConstantInt::get(IndVarTy, 0), Preheader); 1815 Builder.CreateBr(Cond); 1816 1817 Builder.SetInsertPoint(Cond); 1818 Value *Cmp = 1819 Builder.CreateICmpULT(IndVarPHI, TripCount, "omp_" + Name + ".cmp"); 1820 Builder.CreateCondBr(Cmp, Body, Exit); 1821 1822 Builder.SetInsertPoint(Body); 1823 Builder.CreateBr(Latch); 1824 1825 Builder.SetInsertPoint(Latch); 1826 Value *Next = Builder.CreateAdd(IndVarPHI, ConstantInt::get(IndVarTy, 1), 1827 "omp_" + Name + ".next", /*HasNUW=*/true); 1828 Builder.CreateBr(Header); 1829 IndVarPHI->addIncoming(Next, Latch); 1830 1831 Builder.SetInsertPoint(Exit); 1832 Builder.CreateBr(After); 1833 1834 // Remember and return the canonical control flow. 1835 LoopInfos.emplace_front(); 1836 CanonicalLoopInfo *CL = &LoopInfos.front(); 1837 1838 CL->Header = Header; 1839 CL->Cond = Cond; 1840 CL->Latch = Latch; 1841 CL->Exit = Exit; 1842 1843 #ifndef NDEBUG 1844 CL->assertOK(); 1845 #endif 1846 return CL; 1847 } 1848 1849 CanonicalLoopInfo * 1850 OpenMPIRBuilder::createCanonicalLoop(const LocationDescription &Loc, 1851 LoopBodyGenCallbackTy BodyGenCB, 1852 Value *TripCount, const Twine &Name) { 1853 BasicBlock *BB = Loc.IP.getBlock(); 1854 BasicBlock *NextBB = BB->getNextNode(); 1855 1856 CanonicalLoopInfo *CL = createLoopSkeleton(Loc.DL, TripCount, BB->getParent(), 1857 NextBB, NextBB, Name); 1858 BasicBlock *After = CL->getAfter(); 1859 1860 // If location is not set, don't connect the loop. 
1861 if (updateToLocation(Loc)) {
1862 // Split the loop at the insertion point: Branch to the preheader and move
1863 // every following instruction to after the loop (the After BB). Also, the
1864 // new successor is the loop's after block.
1865 spliceBB(Builder, After, /*CreateBranch=*/false);
1866 Builder.CreateBr(CL->getPreheader());
1867 }
1868
1869 // Emit the body content. We do it after connecting the loop to the CFG to
1870 // avoid the callback encountering degenerate BBs.
1871 BodyGenCB(CL->getBodyIP(), CL->getIndVar());
1872
1873 #ifndef NDEBUG
1874 CL->assertOK();
1875 #endif
1876 return CL;
1877 }
1878
1879 CanonicalLoopInfo *OpenMPIRBuilder::createCanonicalLoop(
1880 const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB,
1881 Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop,
1882 InsertPointTy ComputeIP, const Twine &Name) {
1883
1884 // Consider the following difficulties (assuming 8-bit signed integers):
1885 // * Adding \p Step to the loop counter which passes \p Stop may overflow:
1886 // DO I = 1, 100, 50
1887 // * A \p Step of INT_MIN cannot be normalized to a positive direction:
1888 // DO I = 100, 0, -128
1889
1890 // Start, Stop and Step must be of the same integer type.
1891 auto *IndVarTy = cast<IntegerType>(Start->getType());
1892 assert(IndVarTy == Stop->getType() && "Stop type mismatch");
1893 assert(IndVarTy == Step->getType() && "Step type mismatch");
1894
1895 LocationDescription ComputeLoc =
1896 ComputeIP.isSet() ? LocationDescription(ComputeIP, Loc.DL) : Loc;
1897 updateToLocation(ComputeLoc);
1898
1899 ConstantInt *Zero = ConstantInt::get(IndVarTy, 0);
1900 ConstantInt *One = ConstantInt::get(IndVarTy, 1);
1901
1902 // Like Step, but always positive.
1903 Value *Incr = Step;
1904
1905 // Distance between Start and Stop; always positive.
1906 Value *Span;
1907
1908 // Condition checking whether no iterations are executed at all, e.g.,
1909 // because UB < LB.
1910 Value *ZeroCmp;
1911
1912 if (IsSigned) {
1913 // Ensure that the increment is positive. If not, negate it and swap LB and UB.
1914 Value *IsNeg = Builder.CreateICmpSLT(Step, Zero);
1915 Incr = Builder.CreateSelect(IsNeg, Builder.CreateNeg(Step), Step);
1916 Value *LB = Builder.CreateSelect(IsNeg, Stop, Start);
1917 Value *UB = Builder.CreateSelect(IsNeg, Start, Stop);
1918 Span = Builder.CreateSub(UB, LB, "", false, true);
1919 ZeroCmp = Builder.CreateICmp(
1920 InclusiveStop ? CmpInst::ICMP_SLT : CmpInst::ICMP_SLE, UB, LB);
1921 } else {
1922 Span = Builder.CreateSub(Stop, Start, "", true);
1923 ZeroCmp = Builder.CreateICmp(
1924 InclusiveStop ? CmpInst::ICMP_ULT : CmpInst::ICMP_ULE, Stop, Start);
1925 }
1926
1927 Value *CountIfLooping;
1928 if (InclusiveStop) {
1929 CountIfLooping = Builder.CreateAdd(Builder.CreateUDiv(Span, Incr), One);
1930 } else {
1931 // Avoid incrementing past stop since it could overflow.
1932 Value *CountIfTwo = Builder.CreateAdd(
1933 Builder.CreateUDiv(Builder.CreateSub(Span, One), Incr), One);
1934 Value *OneCmp = Builder.CreateICmp(
1935 InclusiveStop ?
CmpInst::ICMP_ULT : CmpInst::ICMP_ULE, Span, Incr); 1936 CountIfLooping = Builder.CreateSelect(OneCmp, One, CountIfTwo); 1937 } 1938 Value *TripCount = Builder.CreateSelect(ZeroCmp, Zero, CountIfLooping, 1939 "omp_" + Name + ".tripcount"); 1940 1941 auto BodyGen = [=](InsertPointTy CodeGenIP, Value *IV) { 1942 Builder.restoreIP(CodeGenIP); 1943 Value *Span = Builder.CreateMul(IV, Step); 1944 Value *IndVar = Builder.CreateAdd(Span, Start); 1945 BodyGenCB(Builder.saveIP(), IndVar); 1946 }; 1947 LocationDescription LoopLoc = ComputeIP.isSet() ? Loc.IP : Builder.saveIP(); 1948 return createCanonicalLoop(LoopLoc, BodyGen, TripCount, Name); 1949 } 1950 1951 // Returns an LLVM function to call for initializing loop bounds using OpenMP 1952 // static scheduling depending on `type`. Only i32 and i64 are supported by the 1953 // runtime. Always interpret integers as unsigned similarly to 1954 // CanonicalLoopInfo. 1955 static FunctionCallee getKmpcForStaticInitForType(Type *Ty, Module &M, 1956 OpenMPIRBuilder &OMPBuilder) { 1957 unsigned Bitwidth = Ty->getIntegerBitWidth(); 1958 if (Bitwidth == 32) 1959 return OMPBuilder.getOrCreateRuntimeFunction( 1960 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_4u); 1961 if (Bitwidth == 64) 1962 return OMPBuilder.getOrCreateRuntimeFunction( 1963 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_8u); 1964 llvm_unreachable("unknown OpenMP loop iterator bitwidth"); 1965 } 1966 1967 OpenMPIRBuilder::InsertPointTy 1968 OpenMPIRBuilder::applyStaticWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI, 1969 InsertPointTy AllocaIP, 1970 bool NeedsBarrier) { 1971 assert(CLI->isValid() && "Requires a valid canonical loop"); 1972 assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) && 1973 "Require dedicated allocate IP"); 1974 1975 // Set up the source location value for OpenMP runtime. 1976 Builder.restoreIP(CLI->getPreheaderIP()); 1977 Builder.SetCurrentDebugLocation(DL); 1978 1979 uint32_t SrcLocStrSize; 1980 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize); 1981 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize); 1982 1983 // Declare useful OpenMP runtime functions. 1984 Value *IV = CLI->getIndVar(); 1985 Type *IVTy = IV->getType(); 1986 FunctionCallee StaticInit = getKmpcForStaticInitForType(IVTy, M, *this); 1987 FunctionCallee StaticFini = 1988 getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini); 1989 1990 // Allocate space for computed loop bounds as expected by the "init" function. 1991 Builder.restoreIP(AllocaIP); 1992 Type *I32Type = Type::getInt32Ty(M.getContext()); 1993 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter"); 1994 Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound"); 1995 Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound"); 1996 Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride"); 1997 1998 // At the end of the preheader, prepare for calling the "init" function by 1999 // storing the current loop bounds into the allocated space. A canonical loop 2000 // always iterates from 0 to trip-count with step 1. Note that "init" expects 2001 // and produces an inclusive upper bound. 
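// As a worked example (hypothetical values, for illustration only): a
// canonical loop with trip count 100 is described to the runtime as
// LB = 0, UB = 99 (inclusive), Stride = 1. After __kmpc_for_static_init
// returns, [LB, UB] holds this thread's sub-range, and the code below
// recomputes the thread-local trip count as UB - LB + 1.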
2002 Builder.SetInsertPoint(CLI->getPreheader()->getTerminator()); 2003 Constant *Zero = ConstantInt::get(IVTy, 0); 2004 Constant *One = ConstantInt::get(IVTy, 1); 2005 Builder.CreateStore(Zero, PLowerBound); 2006 Value *UpperBound = Builder.CreateSub(CLI->getTripCount(), One); 2007 Builder.CreateStore(UpperBound, PUpperBound); 2008 Builder.CreateStore(One, PStride); 2009 2010 Value *ThreadNum = getOrCreateThreadID(SrcLoc); 2011 2012 Constant *SchedulingType = ConstantInt::get( 2013 I32Type, static_cast<int>(OMPScheduleType::UnorderedStatic)); 2014 2015 // Call the "init" function and update the trip count of the loop with the 2016 // value it produced. 2017 Builder.CreateCall(StaticInit, 2018 {SrcLoc, ThreadNum, SchedulingType, PLastIter, PLowerBound, 2019 PUpperBound, PStride, One, Zero}); 2020 Value *LowerBound = Builder.CreateLoad(IVTy, PLowerBound); 2021 Value *InclusiveUpperBound = Builder.CreateLoad(IVTy, PUpperBound); 2022 Value *TripCountMinusOne = Builder.CreateSub(InclusiveUpperBound, LowerBound); 2023 Value *TripCount = Builder.CreateAdd(TripCountMinusOne, One); 2024 CLI->setTripCount(TripCount); 2025 2026 // Update all uses of the induction variable except the one in the condition 2027 // block that compares it with the actual upper bound, and the increment in 2028 // the latch block. 2029 2030 CLI->mapIndVar([&](Instruction *OldIV) -> Value * { 2031 Builder.SetInsertPoint(CLI->getBody(), 2032 CLI->getBody()->getFirstInsertionPt()); 2033 Builder.SetCurrentDebugLocation(DL); 2034 return Builder.CreateAdd(OldIV, LowerBound); 2035 }); 2036 2037 // In the "exit" block, call the "fini" function. 2038 Builder.SetInsertPoint(CLI->getExit(), 2039 CLI->getExit()->getTerminator()->getIterator()); 2040 Builder.CreateCall(StaticFini, {SrcLoc, ThreadNum}); 2041 2042 // Add the barrier if requested. 2043 if (NeedsBarrier) 2044 createBarrier(LocationDescription(Builder.saveIP(), DL), 2045 omp::Directive::OMPD_for, /* ForceSimpleCall */ false, 2046 /* CheckCancelFlag */ false); 2047 2048 InsertPointTy AfterIP = CLI->getAfterIP(); 2049 CLI->invalidate(); 2050 2051 return AfterIP; 2052 } 2053 2054 OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::applyStaticChunkedWorkshareLoop( 2055 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP, 2056 bool NeedsBarrier, Value *ChunkSize) { 2057 assert(CLI->isValid() && "Requires a valid canonical loop"); 2058 assert(ChunkSize && "Chunk size is required"); 2059 2060 LLVMContext &Ctx = CLI->getFunction()->getContext(); 2061 Value *IV = CLI->getIndVar(); 2062 Value *OrigTripCount = CLI->getTripCount(); 2063 Type *IVTy = IV->getType(); 2064 assert(IVTy->getIntegerBitWidth() <= 64 && 2065 "Max supported tripcount bitwidth is 64 bits"); 2066 Type *InternalIVTy = IVTy->getIntegerBitWidth() <= 32 ? Type::getInt32Ty(Ctx) 2067 : Type::getInt64Ty(Ctx); 2068 Type *I32Type = Type::getInt32Ty(M.getContext()); 2069 Constant *Zero = ConstantInt::get(InternalIVTy, 0); 2070 Constant *One = ConstantInt::get(InternalIVTy, 1); 2071 2072 // Declare useful OpenMP runtime functions. 2073 FunctionCallee StaticInit = 2074 getKmpcForStaticInitForType(InternalIVTy, M, *this); 2075 FunctionCallee StaticFini = 2076 getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini); 2077 2078 // Allocate space for computed loop bounds as expected by the "init" function. 
2079 Builder.restoreIP(AllocaIP); 2080 Builder.SetCurrentDebugLocation(DL); 2081 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter"); 2082 Value *PLowerBound = 2083 Builder.CreateAlloca(InternalIVTy, nullptr, "p.lowerbound"); 2084 Value *PUpperBound = 2085 Builder.CreateAlloca(InternalIVTy, nullptr, "p.upperbound"); 2086 Value *PStride = Builder.CreateAlloca(InternalIVTy, nullptr, "p.stride"); 2087 2088 // Set up the source location value for the OpenMP runtime. 2089 Builder.restoreIP(CLI->getPreheaderIP()); 2090 Builder.SetCurrentDebugLocation(DL); 2091 2092 // TODO: Detect overflow in ubsan or max-out with current tripcount. 2093 Value *CastedChunkSize = 2094 Builder.CreateZExtOrTrunc(ChunkSize, InternalIVTy, "chunksize"); 2095 Value *CastedTripCount = 2096 Builder.CreateZExt(OrigTripCount, InternalIVTy, "tripcount"); 2097 2098 Constant *SchedulingType = ConstantInt::get( 2099 I32Type, static_cast<int>(OMPScheduleType::UnorderedStaticChunked)); 2100 Builder.CreateStore(Zero, PLowerBound); 2101 Value *OrigUpperBound = Builder.CreateSub(CastedTripCount, One); 2102 Builder.CreateStore(OrigUpperBound, PUpperBound); 2103 Builder.CreateStore(One, PStride); 2104 2105 // Call the "init" function and update the trip count of the loop with the 2106 // value it produced. 2107 uint32_t SrcLocStrSize; 2108 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize); 2109 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize); 2110 Value *ThreadNum = getOrCreateThreadID(SrcLoc); 2111 Builder.CreateCall(StaticInit, 2112 {/*loc=*/SrcLoc, /*global_tid=*/ThreadNum, 2113 /*schedtype=*/SchedulingType, /*plastiter=*/PLastIter, 2114 /*plower=*/PLowerBound, /*pupper=*/PUpperBound, 2115 /*pstride=*/PStride, /*incr=*/One, 2116 /*chunk=*/CastedChunkSize}); 2117 2118 // Load values written by the "init" function. 2119 Value *FirstChunkStart = 2120 Builder.CreateLoad(InternalIVTy, PLowerBound, "omp_firstchunk.lb"); 2121 Value *FirstChunkStop = 2122 Builder.CreateLoad(InternalIVTy, PUpperBound, "omp_firstchunk.ub"); 2123 Value *FirstChunkEnd = Builder.CreateAdd(FirstChunkStop, One); 2124 Value *ChunkRange = 2125 Builder.CreateSub(FirstChunkEnd, FirstChunkStart, "omp_chunk.range"); 2126 Value *NextChunkStride = 2127 Builder.CreateLoad(InternalIVTy, PStride, "omp_dispatch.stride"); 2128 2129 // Create outer "dispatch" loop for enumerating the chunks. 2130 BasicBlock *DispatchEnter = splitBB(Builder, true); 2131 Value *DispatchCounter; 2132 CanonicalLoopInfo *DispatchCLI = createCanonicalLoop( 2133 {Builder.saveIP(), DL}, 2134 [&](InsertPointTy BodyIP, Value *Counter) { DispatchCounter = Counter; }, 2135 FirstChunkStart, CastedTripCount, NextChunkStride, 2136 /*IsSigned=*/false, /*InclusiveStop=*/false, /*ComputeIP=*/{}, 2137 "dispatch"); 2138 2139 // Remember the BasicBlocks of the dispatch loop we need, then invalidate to 2140 // not have to preserve the canonical invariant. 2141 BasicBlock *DispatchBody = DispatchCLI->getBody(); 2142 BasicBlock *DispatchLatch = DispatchCLI->getLatch(); 2143 BasicBlock *DispatchExit = DispatchCLI->getExit(); 2144 BasicBlock *DispatchAfter = DispatchCLI->getAfter(); 2145 DispatchCLI->invalidate(); 2146 2147 // Rewire the original loop to become the chunk loop inside the dispatch loop. 2148 redirectTo(DispatchAfter, CLI->getAfter(), DL); 2149 redirectTo(CLI->getExit(), DispatchLatch, DL); 2150 redirectTo(DispatchBody, DispatchEnter, DL); 2151 2152 // Prepare the prolog of the chunk loop. 
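// The prolog computes the current chunk's trip count from the values the
// runtime returned; overall, the nest generated above behaves roughly like
// this (a sketch, not literal IR):
//
//   for (chunk_lb = firstchunk.lb; chunk_lb < tripcount;
//        chunk_lb += dispatch.stride)                      // dispatch loop
//     for (iv = 0; iv < min(chunk.range, tripcount - chunk_lb); ++iv)
//       <body>(chunk_lb + iv);                             // chunk loop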
2153 Builder.restoreIP(CLI->getPreheaderIP()); 2154 Builder.SetCurrentDebugLocation(DL); 2155 2156 // Compute the number of iterations of the chunk loop. 2157 Builder.SetInsertPoint(CLI->getPreheader()->getTerminator()); 2158 Value *ChunkEnd = Builder.CreateAdd(DispatchCounter, ChunkRange); 2159 Value *IsLastChunk = 2160 Builder.CreateICmpUGE(ChunkEnd, CastedTripCount, "omp_chunk.is_last"); 2161 Value *CountUntilOrigTripCount = 2162 Builder.CreateSub(CastedTripCount, DispatchCounter); 2163 Value *ChunkTripCount = Builder.CreateSelect( 2164 IsLastChunk, CountUntilOrigTripCount, ChunkRange, "omp_chunk.tripcount"); 2165 Value *BackcastedChunkTC = 2166 Builder.CreateTrunc(ChunkTripCount, IVTy, "omp_chunk.tripcount.trunc"); 2167 CLI->setTripCount(BackcastedChunkTC); 2168 2169 // Update all uses of the induction variable except the one in the condition 2170 // block that compares it with the actual upper bound, and the increment in 2171 // the latch block. 2172 Value *BackcastedDispatchCounter = 2173 Builder.CreateTrunc(DispatchCounter, IVTy, "omp_dispatch.iv.trunc"); 2174 CLI->mapIndVar([&](Instruction *) -> Value * { 2175 Builder.restoreIP(CLI->getBodyIP()); 2176 return Builder.CreateAdd(IV, BackcastedDispatchCounter); 2177 }); 2178 2179 // In the "exit" block, call the "fini" function. 2180 Builder.SetInsertPoint(DispatchExit, DispatchExit->getFirstInsertionPt()); 2181 Builder.CreateCall(StaticFini, {SrcLoc, ThreadNum}); 2182 2183 // Add the barrier if requested. 2184 if (NeedsBarrier) 2185 createBarrier(LocationDescription(Builder.saveIP(), DL), OMPD_for, 2186 /*ForceSimpleCall=*/false, /*CheckCancelFlag=*/false); 2187 2188 #ifndef NDEBUG 2189 // Even though we currently do not support applying additional methods to it, 2190 // the chunk loop should remain a canonical loop. 2191 CLI->assertOK(); 2192 #endif 2193 2194 return {DispatchAfter, DispatchAfter->getFirstInsertionPt()}; 2195 } 2196 2197 OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::applyWorkshareLoop( 2198 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP, 2199 bool NeedsBarrier, llvm::omp::ScheduleKind SchedKind, 2200 llvm::Value *ChunkSize, bool HasSimdModifier, bool HasMonotonicModifier, 2201 bool HasNonmonotonicModifier, bool HasOrderedClause) { 2202 OMPScheduleType EffectiveScheduleType = computeOpenMPScheduleType( 2203 SchedKind, ChunkSize, HasSimdModifier, HasMonotonicModifier, 2204 HasNonmonotonicModifier, HasOrderedClause); 2205 2206 bool IsOrdered = (EffectiveScheduleType & OMPScheduleType::ModifierOrdered) == 2207 OMPScheduleType::ModifierOrdered; 2208 switch (EffectiveScheduleType & ~OMPScheduleType::ModifierMask) { 2209 case OMPScheduleType::BaseStatic: 2210 assert(!ChunkSize && "No chunk size with static-chunked schedule"); 2211 if (IsOrdered) 2212 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType, 2213 NeedsBarrier, ChunkSize); 2214 // FIXME: Monotonicity ignored? 2215 return applyStaticWorkshareLoop(DL, CLI, AllocaIP, NeedsBarrier); 2216 2217 case OMPScheduleType::BaseStaticChunked: 2218 if (IsOrdered) 2219 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType, 2220 NeedsBarrier, ChunkSize); 2221 // FIXME: Monotonicity ignored? 
2222 return applyStaticChunkedWorkshareLoop(DL, CLI, AllocaIP, NeedsBarrier,
2223 ChunkSize);
2224
2225 case OMPScheduleType::BaseRuntime:
2226 case OMPScheduleType::BaseAuto:
2227 case OMPScheduleType::BaseGreedy:
2228 case OMPScheduleType::BaseBalanced:
2229 case OMPScheduleType::BaseSteal:
2230 case OMPScheduleType::BaseGuidedSimd:
2231 case OMPScheduleType::BaseRuntimeSimd:
2232 assert(!ChunkSize &&
2233 "schedule type does not support user-defined chunk sizes");
2234 LLVM_FALLTHROUGH;
2235 case OMPScheduleType::BaseDynamicChunked:
2236 case OMPScheduleType::BaseGuidedChunked:
2237 case OMPScheduleType::BaseGuidedIterativeChunked:
2238 case OMPScheduleType::BaseGuidedAnalyticalChunked:
2239 case OMPScheduleType::BaseStaticBalancedChunked:
2240 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
2241 NeedsBarrier, ChunkSize);
2242
2243 default:
2244 llvm_unreachable("Unknown/unimplemented schedule kind");
2245 }
2246 }
2247
2248 /// Returns an LLVM function to call for initializing loop bounds using OpenMP
2249 /// dynamic scheduling depending on `type`. Only i32 and i64 are supported by
2250 /// the runtime. Always interpret integers as unsigned similarly to
2251 /// CanonicalLoopInfo.
2252 static FunctionCallee
2253 getKmpcForDynamicInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
2254 unsigned Bitwidth = Ty->getIntegerBitWidth();
2255 if (Bitwidth == 32)
2256 return OMPBuilder.getOrCreateRuntimeFunction(
2257 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_4u);
2258 if (Bitwidth == 64)
2259 return OMPBuilder.getOrCreateRuntimeFunction(
2260 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_8u);
2261 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
2262 }
2263
2264 /// Returns an LLVM function to call for fetching the next loop chunk using
2265 /// OpenMP dynamic scheduling, depending on `type`. Only i32 and i64 are
2266 /// supported by the runtime. Always interpret integers as unsigned similarly
2267 /// to CanonicalLoopInfo.
2268 static FunctionCallee
2269 getKmpcForDynamicNextForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
2270 unsigned Bitwidth = Ty->getIntegerBitWidth();
2271 if (Bitwidth == 32)
2272 return OMPBuilder.getOrCreateRuntimeFunction(
2273 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_4u);
2274 if (Bitwidth == 64)
2275 return OMPBuilder.getOrCreateRuntimeFunction(
2276 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_8u);
2277 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
2278 }
2279
2280 /// Returns an LLVM function to call for finalizing the dynamic loop,
2281 /// depending on `type`. Only i32 and i64 are supported by the runtime. Always
2282 /// interpret integers as unsigned similarly to CanonicalLoopInfo.
2283 static FunctionCallee 2284 getKmpcForDynamicFiniForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) { 2285 unsigned Bitwidth = Ty->getIntegerBitWidth(); 2286 if (Bitwidth == 32) 2287 return OMPBuilder.getOrCreateRuntimeFunction( 2288 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_fini_4u); 2289 if (Bitwidth == 64) 2290 return OMPBuilder.getOrCreateRuntimeFunction( 2291 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_fini_8u); 2292 llvm_unreachable("unknown OpenMP loop iterator bitwidth"); 2293 } 2294 2295 OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::applyDynamicWorkshareLoop( 2296 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP, 2297 OMPScheduleType SchedType, bool NeedsBarrier, Value *Chunk) { 2298 assert(CLI->isValid() && "Requires a valid canonical loop"); 2299 assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) && 2300 "Require dedicated allocate IP"); 2301 assert(isValidWorkshareLoopScheduleType(SchedType) && 2302 "Require valid schedule type"); 2303 2304 bool Ordered = (SchedType & OMPScheduleType::ModifierOrdered) == 2305 OMPScheduleType::ModifierOrdered; 2306 2307 // Set up the source location value for OpenMP runtime. 2308 Builder.SetCurrentDebugLocation(DL); 2309 2310 uint32_t SrcLocStrSize; 2311 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize); 2312 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize); 2313 2314 // Declare useful OpenMP runtime functions. 2315 Value *IV = CLI->getIndVar(); 2316 Type *IVTy = IV->getType(); 2317 FunctionCallee DynamicInit = getKmpcForDynamicInitForType(IVTy, M, *this); 2318 FunctionCallee DynamicNext = getKmpcForDynamicNextForType(IVTy, M, *this); 2319 2320 // Allocate space for computed loop bounds as expected by the "init" function. 2321 Builder.restoreIP(AllocaIP); 2322 Type *I32Type = Type::getInt32Ty(M.getContext()); 2323 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter"); 2324 Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound"); 2325 Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound"); 2326 Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride"); 2327 2328 // At the end of the preheader, prepare for calling the "init" function by 2329 // storing the current loop bounds into the allocated space. A canonical loop 2330 // always iterates from 0 to trip-count with step 1. Note that "init" expects 2331 // and produces an inclusive upper bound. 2332 BasicBlock *PreHeader = CLI->getPreheader(); 2333 Builder.SetInsertPoint(PreHeader->getTerminator()); 2334 Constant *One = ConstantInt::get(IVTy, 1); 2335 Builder.CreateStore(One, PLowerBound); 2336 Value *UpperBound = CLI->getTripCount(); 2337 Builder.CreateStore(UpperBound, PUpperBound); 2338 Builder.CreateStore(One, PStride); 2339 2340 BasicBlock *Header = CLI->getHeader(); 2341 BasicBlock *Exit = CLI->getExit(); 2342 BasicBlock *Cond = CLI->getCond(); 2343 BasicBlock *Latch = CLI->getLatch(); 2344 InsertPointTy AfterIP = CLI->getAfterIP(); 2345 2346 // The CLI will be "broken" in the code below, as the loop is no longer 2347 // a valid canonical loop. 2348 2349 if (!Chunk) 2350 Chunk = One; 2351 2352 Value *ThreadNum = getOrCreateThreadID(SrcLoc); 2353 2354 Constant *SchedulingType = 2355 ConstantInt::get(I32Type, static_cast<int>(SchedType)); 2356 2357 // Call the "init" function. 2358 Builder.CreateCall(DynamicInit, 2359 {SrcLoc, ThreadNum, SchedulingType, /* LowerBound */ One, 2360 UpperBound, /* step */ One, Chunk}); 2361 2362 // An outer loop around the existing one. 
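// Conceptually, the rewiring below produces (a sketch, not literal IR;
// [lb, ub] are the inclusive chunk bounds written back by the runtime):
//
//   while (__kmpc_dispatch_next_*(loc, tid, &last, &lb, &ub, &stride))
//     for (iv = lb; iv <= ub; ++iv)   // the original loop, re-bounded
//       <body>(iv);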
2363 BasicBlock *OuterCond = BasicBlock::Create(
2364 PreHeader->getContext(), Twine(PreHeader->getName()) + ".outer.cond",
2365 PreHeader->getParent());
2366 // The result of the "next" call is always 32-bit, so the IVTy constants cannot be reused here; a dedicated i32 zero is needed.
2367 Builder.SetInsertPoint(OuterCond, OuterCond->getFirstInsertionPt());
2368 Value *Res =
2369 Builder.CreateCall(DynamicNext, {SrcLoc, ThreadNum, PLastIter,
2370 PLowerBound, PUpperBound, PStride});
2371 Constant *Zero32 = ConstantInt::get(I32Type, 0);
2372 Value *MoreWork = Builder.CreateCmp(CmpInst::ICMP_NE, Res, Zero32);
2373 Value *LowerBound =
2374 Builder.CreateSub(Builder.CreateLoad(IVTy, PLowerBound), One, "lb");
2375 Builder.CreateCondBr(MoreWork, Header, Exit);
2376
2377 // Change the PHI node in the loop header to use OuterCond rather than the
2378 // preheader, and set the IV to the LowerBound.
2379 Instruction *Phi = &Header->front();
2380 auto *PI = cast<PHINode>(Phi);
2381 PI->setIncomingBlock(0, OuterCond);
2382 PI->setIncomingValue(0, LowerBound);
2383
2384 // Then make the preheader jump to OuterCond.
2385 Instruction *Term = PreHeader->getTerminator();
2386 auto *Br = cast<BranchInst>(Term);
2387 Br->setSuccessor(0, OuterCond);
2388
2389 // Modify the inner condition:
2390 // * Use the UpperBound returned from the DynamicNext call.
2391 // * Jump to the outer loop when done with one of the inner loop's chunks.
2392 Builder.SetInsertPoint(Cond, Cond->getFirstInsertionPt());
2393 UpperBound = Builder.CreateLoad(IVTy, PUpperBound, "ub");
2394 Instruction *Comp = &*Builder.GetInsertPoint();
2395 auto *CI = cast<CmpInst>(Comp);
2396 CI->setOperand(1, UpperBound);
2397 // Redirect the inner exit to branch to the outer condition.
2398 Instruction *Branch = &Cond->back();
2399 auto *BI = cast<BranchInst>(Branch);
2400 assert(BI->getSuccessor(1) == Exit);
2401 BI->setSuccessor(1, OuterCond);
2402
2403 // Call the "fini" function if "ordered" is present in the wsloop directive.
2404 if (Ordered) {
2405 Builder.SetInsertPoint(&Latch->back());
2406 FunctionCallee DynamicFini = getKmpcForDynamicFiniForType(IVTy, M, *this);
2407 Builder.CreateCall(DynamicFini, {SrcLoc, ThreadNum});
2408 }
2409
2410 // Add the barrier if requested.
2411 if (NeedsBarrier) {
2412 Builder.SetInsertPoint(&Exit->back());
2413 createBarrier(LocationDescription(Builder.saveIP(), DL),
2414 omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
2415 /* CheckCancelFlag */ false);
2416 }
2417
2418 CLI->invalidate();
2419 return AfterIP;
2420 }
2421
2422 /// Redirect all edges that branch to \p OldTarget to \p NewTarget. That is,
2423 /// after this, \p OldTarget will be orphaned.
2424 static void redirectAllPredecessorsTo(BasicBlock *OldTarget,
2425 BasicBlock *NewTarget, DebugLoc DL) {
2426 for (BasicBlock *Pred : make_early_inc_range(predecessors(OldTarget)))
2427 redirectTo(Pred, NewTarget, DL);
2428 }
2429
2430 /// Determine which blocks in \p BBs are reachable from outside the set and
2431 /// remove from the function the ones that are not.
2432 static void removeUnusedBlocksFromParent(ArrayRef<BasicBlock *> BBs) {
2433 SmallPtrSet<BasicBlock *, 6> BBsToErase{BBs.begin(), BBs.end()};
2434 auto HasRemainingUses = [&BBsToErase](BasicBlock *BB) {
2435 for (Use &U : BB->uses()) {
2436 auto *UseInst = dyn_cast<Instruction>(U.getUser());
2437 if (!UseInst)
2438 continue;
2439 if (BBsToErase.count(UseInst->getParent()))
2440 continue;
2441 return true;
2442 }
2443 return false;
2444 };
2445
2446 while (true) {
2447 bool Changed = false;
2448 for (BasicBlock *BB : make_early_inc_range(BBsToErase)) {
2449 if (HasRemainingUses(BB)) {
2450 BBsToErase.erase(BB);
2451 Changed = true;
2452 }
2453 }
2454 if (!Changed)
2455 break;
2456 }
2457
2458 SmallVector<BasicBlock *, 7> BBVec(BBsToErase.begin(), BBsToErase.end());
2459 DeleteDeadBlocks(BBVec);
2460 }
2461
2462 CanonicalLoopInfo *
2463 OpenMPIRBuilder::collapseLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops,
2464 InsertPointTy ComputeIP) {
2465 assert(Loops.size() >= 1 && "At least one loop required");
2466 size_t NumLoops = Loops.size();
2467
2468 // Nothing to do if there is already just one loop.
2469 if (NumLoops == 1)
2470 return Loops.front();
2471
2472 CanonicalLoopInfo *Outermost = Loops.front();
2473 CanonicalLoopInfo *Innermost = Loops.back();
2474 BasicBlock *OrigPreheader = Outermost->getPreheader();
2475 BasicBlock *OrigAfter = Outermost->getAfter();
2476 Function *F = OrigPreheader->getParent();
2477
2478 // Loop control blocks that may become orphaned later.
2479 SmallVector<BasicBlock *, 12> OldControlBBs;
2480 OldControlBBs.reserve(6 * Loops.size());
2481 for (CanonicalLoopInfo *Loop : Loops)
2482 Loop->collectControlBlocks(OldControlBBs);
2483
2484 // Set up the IRBuilder for inserting the trip count computation.
2485 Builder.SetCurrentDebugLocation(DL);
2486 if (ComputeIP.isSet())
2487 Builder.restoreIP(ComputeIP);
2488 else
2489 Builder.restoreIP(Outermost->getPreheaderIP());
2490
2491 // Derive the collapsed loop's trip count.
2492 // TODO: Find common/largest indvar type.
2493 Value *CollapsedTripCount = nullptr;
2494 for (CanonicalLoopInfo *L : Loops) {
2495 assert(L->isValid() &&
2496 "All loops to collapse must be valid canonical loops");
2497 Value *OrigTripCount = L->getTripCount();
2498 if (!CollapsedTripCount) {
2499 CollapsedTripCount = OrigTripCount;
2500 continue;
2501 }
2502
2503 // TODO: Enable UndefinedSanitizer to diagnose an overflow here.
2504 CollapsedTripCount = Builder.CreateMul(CollapsedTripCount, OrigTripCount,
2505 {}, /*HasNUW=*/true);
2506 }
2507
2508 // Create the collapsed loop control flow.
2509 CanonicalLoopInfo *Result =
2510 createLoopSkeleton(DL, CollapsedTripCount, F,
2511 OrigPreheader->getNextNode(), OrigAfter, "collapsed");
2512
2513 // Build the collapsed loop body code.
2514 // Start with deriving the input loop induction variables from the collapsed
2515 // one, using a divmod scheme. To preserve the original loops' order, the
2516 // innermost loop uses the least significant bits.
2517 Builder.restoreIP(Result->getBodyIP());
2518
2519 Value *Leftover = Result->getIndVar();
2520 SmallVector<Value *> NewIndVars;
2521 NewIndVars.resize(NumLoops);
2522 for (int i = NumLoops - 1; i >= 1; --i) {
2523 Value *OrigTripCount = Loops[i]->getTripCount();
2524
2525 Value *NewIndVar = Builder.CreateURem(Leftover, OrigTripCount);
2526 NewIndVars[i] = NewIndVar;
2527
2528 Leftover = Builder.CreateUDiv(Leftover, OrigTripCount);
2529 }
2530 // The outermost loop gets all the remaining bits.
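// For example (illustrative only), collapsing two loops with trip counts
// TC0 (outer) and TC1 (inner) derives the original induction variables from
// the collapsed one as:
//   iv1 = collapsed_iv % TC1;  // least significant bits -> innermost loop
//   iv0 = collapsed_iv / TC1;  // remaining bits -> outermost loop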
2531 NewIndVars[0] = Leftover;
2532
2533 // Construct the loop body control flow.
2534 // We progressively construct the branch structure following the direction of
2535 // control flow: the leading in-between code, the loop nest body, the
2536 // trailing in-between code, and finally the rejoin of the collapsed loop's latch.
2537 // ContinueBlock and ContinuePred keep track of the source(s) of the next edge. If
2538 // ContinueBlock is set, continue with that block. If ContinuePred is set, use
2539 // its predecessors as sources.
2540 BasicBlock *ContinueBlock = Result->getBody();
2541 BasicBlock *ContinuePred = nullptr;
2542 auto ContinueWith = [&ContinueBlock, &ContinuePred, DL](BasicBlock *Dest,
2543 BasicBlock *NextSrc) {
2544 if (ContinueBlock)
2545 redirectTo(ContinueBlock, Dest, DL);
2546 else
2547 redirectAllPredecessorsTo(ContinuePred, Dest, DL);
2548
2549 ContinueBlock = nullptr;
2550 ContinuePred = NextSrc;
2551 };
2552
2553 // The code before the nested loop of each level.
2554 // Because we are sinking it into the nest, it will be executed more often
2555 // than in the original loop. More sophisticated schemes could keep track of
2556 // what the in-between code is and instantiate it only once per thread.
2557 for (size_t i = 0; i < NumLoops - 1; ++i)
2558 ContinueWith(Loops[i]->getBody(), Loops[i + 1]->getHeader());
2559
2560 // Connect the loop nest body.
2561 ContinueWith(Innermost->getBody(), Innermost->getLatch());
2562
2563 // The code after the nested loop at each level.
2564 for (size_t i = NumLoops - 1; i > 0; --i)
2565 ContinueWith(Loops[i]->getAfter(), Loops[i - 1]->getLatch());
2566
2567 // Connect the finished loop to the collapsed loop latch.
2568 ContinueWith(Result->getLatch(), nullptr);
2569
2570 // Replace the input loops with the new collapsed loop.
2571 redirectTo(Outermost->getPreheader(), Result->getPreheader(), DL);
2572 redirectTo(Result->getAfter(), Outermost->getAfter(), DL);
2573
2574 // Replace the input loop indvars with the derived ones.
2575 for (size_t i = 0; i < NumLoops; ++i)
2576 Loops[i]->getIndVar()->replaceAllUsesWith(NewIndVars[i]);
2577
2578 // Remove unused parts of the input loops.
2579 removeUnusedBlocksFromParent(OldControlBBs);
2580
2581 for (CanonicalLoopInfo *L : Loops)
2582 L->invalidate();
2583
2584 #ifndef NDEBUG
2585 Result->assertOK();
2586 #endif
2587 return Result;
2588 }
2589
2590 std::vector<CanonicalLoopInfo *>
2591 OpenMPIRBuilder::tileLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops,
2592 ArrayRef<Value *> TileSizes) {
2593 assert(TileSizes.size() == Loops.size() &&
2594 "Must pass as many tile sizes as there are loops");
2595 int NumLoops = Loops.size();
2596 assert(NumLoops >= 1 && "At least one loop to tile required");
2597
2598 CanonicalLoopInfo *OutermostLoop = Loops.front();
2599 CanonicalLoopInfo *InnermostLoop = Loops.back();
2600 Function *F = OutermostLoop->getBody()->getParent();
2601 BasicBlock *InnerEnter = InnermostLoop->getBody();
2602 BasicBlock *InnerLatch = InnermostLoop->getLatch();
2603
2604 // Loop control blocks that may become orphaned later.
2605 SmallVector<BasicBlock *, 12> OldControlBBs;
2606 OldControlBBs.reserve(6 * Loops.size());
2607 for (CanonicalLoopInfo *Loop : Loops)
2608 Loop->collectControlBlocks(OldControlBBs);
2609
2610 // Collect the original trip counts and induction variables to be accessible
2611 // by index. Also, the structure of the original loops is not preserved during
2612 // the construction of the tiled loops, so do it before we scavenge the BBs of
2613 // any original CanonicalLoopInfo.
2614 SmallVector<Value *, 4> OrigTripCounts, OrigIndVars;
2615 for (CanonicalLoopInfo *L : Loops) {
2616 assert(L->isValid() && "All input loops must be valid canonical loops");
2617 OrigTripCounts.push_back(L->getTripCount());
2618 OrigIndVars.push_back(L->getIndVar());
2619 }
2620
2621 // Collect the code between loop headers. These may contain SSA definitions
2622 // that are used in the loop nest body. To be usable within the innermost
2623 // body, these BasicBlocks will be sunk into the loop nest body. That is,
2624 // these instructions may be executed more often than before the tiling.
2625 // TODO: It would be sufficient to only sink them into the body of the
2626 // corresponding tile loop.
2627 SmallVector<std::pair<BasicBlock *, BasicBlock *>, 4> InbetweenCode;
2628 for (int i = 0; i < NumLoops - 1; ++i) {
2629 CanonicalLoopInfo *Surrounding = Loops[i];
2630 CanonicalLoopInfo *Nested = Loops[i + 1];
2631
2632 BasicBlock *EnterBB = Surrounding->getBody();
2633 BasicBlock *ExitBB = Nested->getHeader();
2634 InbetweenCode.emplace_back(EnterBB, ExitBB);
2635 }
2636
2637 // Compute the trip counts of the floor loops.
2638 Builder.SetCurrentDebugLocation(DL);
2639 Builder.restoreIP(OutermostLoop->getPreheaderIP());
2640 SmallVector<Value *, 4> FloorCount, FloorRems;
2641 for (int i = 0; i < NumLoops; ++i) {
2642 Value *TileSize = TileSizes[i];
2643 Value *OrigTripCount = OrigTripCounts[i];
2644 Type *IVType = OrigTripCount->getType();
2645
2646 Value *FloorTripCount = Builder.CreateUDiv(OrigTripCount, TileSize);
2647 Value *FloorTripRem = Builder.CreateURem(OrigTripCount, TileSize);
2648
2649 // 0 if the tile size divides the trip count, 1 otherwise.
2650 // 1 means we need an additional iteration for a partial tile.
2651 //
2652 // Unfortunately we cannot just use the roundup formula
2653 // (tripcount + tilesize - 1) / tilesize
2654 // because the summation might overflow. We do not want to introduce
2655 // undefined behavior where the untiled loop nest had none.
2656 Value *FloorTripOverflow =
2657 Builder.CreateICmpNE(FloorTripRem, ConstantInt::get(IVType, 0));
2658
2659 FloorTripOverflow = Builder.CreateZExt(FloorTripOverflow, IVType);
2660 FloorTripCount =
2661 Builder.CreateAdd(FloorTripCount, FloorTripOverflow,
2662 "omp_floor" + Twine(i) + ".tripcount", true);
2663
2664 // Remember some values for later use.
2665 FloorCount.push_back(FloorTripCount);
2666 FloorRems.push_back(FloorTripRem);
2667 }
2668
2669 // Generate the new loop nest, from the outermost to the innermost.
2670 std::vector<CanonicalLoopInfo *> Result;
2671 Result.reserve(NumLoops * 2);
2672
2673 // The basic block of the surrounding loop that enters the newly generated
2674 // loop nest.
2675 BasicBlock *Enter = OutermostLoop->getPreheader();
2676
2677 // The basic block of the surrounding loop where the inner code should
2678 // continue.
2679 BasicBlock *Continue = OutermostLoop->getAfter();
2680
2681 // Where the next loop basic block should be inserted.
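// For two loops with trip counts TC0, TC1 and tile sizes T0, T1, the nest
// generated below has this shape (an illustrative sketch, not literal IR):
//
//   for (f0 = 0; f0 < ceil(TC0/T0); ++f0)        // floor loops
//     for (f1 = 0; f1 < ceil(TC1/T1); ++f1)
//       for (t0 = 0; t0 < tile_count0; ++t0)     // tile loops
//         for (t1 = 0; t1 < tile_count1; ++t1)
//           <body>(f0*T0 + t0, f1*T1 + t1);
//
// where tile_countI is TI for full tiles and the remainder for a partial tile.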
2682 BasicBlock *OutroInsertBefore = InnermostLoop->getExit();
2683
2684 auto EmbedNewLoop =
2685 [this, DL, F, InnerEnter, &Enter, &Continue, &OutroInsertBefore](
2686 Value *TripCount, const Twine &Name) -> CanonicalLoopInfo * {
2687 CanonicalLoopInfo *EmbeddedLoop = createLoopSkeleton(
2688 DL, TripCount, F, InnerEnter, OutroInsertBefore, Name);
2689 redirectTo(Enter, EmbeddedLoop->getPreheader(), DL);
2690 redirectTo(EmbeddedLoop->getAfter(), Continue, DL);
2691
2692 // Set up the position where the next embedded loop connects to this loop.
2693 Enter = EmbeddedLoop->getBody();
2694 Continue = EmbeddedLoop->getLatch();
2695 OutroInsertBefore = EmbeddedLoop->getLatch();
2696 return EmbeddedLoop;
2697 };
2698
2699 auto EmbedNewLoops = [&Result, &EmbedNewLoop](ArrayRef<Value *> TripCounts,
2700 const Twine &NameBase) {
2701 for (auto P : enumerate(TripCounts)) {
2702 CanonicalLoopInfo *EmbeddedLoop =
2703 EmbedNewLoop(P.value(), NameBase + Twine(P.index()));
2704 Result.push_back(EmbeddedLoop);
2705 }
2706 };
2707
2708 EmbedNewLoops(FloorCount, "floor");
2709
2710 // Within the innermost floor loop, emit the code that computes the tile
2711 // sizes.
2712 Builder.SetInsertPoint(Enter->getTerminator());
2713 SmallVector<Value *, 4> TileCounts;
2714 for (int i = 0; i < NumLoops; ++i) {
2715 CanonicalLoopInfo *FloorLoop = Result[i];
2716 Value *TileSize = TileSizes[i];
2717
2718 Value *FloorIsEpilogue =
2719 Builder.CreateICmpEQ(FloorLoop->getIndVar(), FloorCount[i]);
2720 Value *TileTripCount =
2721 Builder.CreateSelect(FloorIsEpilogue, FloorRems[i], TileSize);
2722
2723 TileCounts.push_back(TileTripCount);
2724 }
2725
2726 // Create the tile loops.
2727 EmbedNewLoops(TileCounts, "tile");
2728
2729 // Insert the in-between code into the body.
2730 BasicBlock *BodyEnter = Enter;
2731 BasicBlock *BodyEntered = nullptr;
2732 for (std::pair<BasicBlock *, BasicBlock *> P : InbetweenCode) {
2733 BasicBlock *EnterBB = P.first;
2734 BasicBlock *ExitBB = P.second;
2735
2736 if (BodyEnter)
2737 redirectTo(BodyEnter, EnterBB, DL);
2738 else
2739 redirectAllPredecessorsTo(BodyEntered, EnterBB, DL);
2740
2741 BodyEnter = nullptr;
2742 BodyEntered = ExitBB;
2743 }
2744
2745 // Append the original loop nest body into the generated loop nest body.
2746 if (BodyEnter)
2747 redirectTo(BodyEnter, InnerEnter, DL);
2748 else
2749 redirectAllPredecessorsTo(BodyEntered, InnerEnter, DL);
2750 redirectAllPredecessorsTo(InnerLatch, Continue, DL);
2751
2752 // Replace the original induction variables with induction variables computed
2753 // from the tile and floor induction variables.
2754 Builder.restoreIP(Result.back()->getBodyIP());
2755 for (int i = 0; i < NumLoops; ++i) {
2756 CanonicalLoopInfo *FloorLoop = Result[i];
2757 CanonicalLoopInfo *TileLoop = Result[NumLoops + i];
2758 Value *OrigIndVar = OrigIndVars[i];
2759 Value *Size = TileSizes[i];
2760
2761 Value *Scale =
2762 Builder.CreateMul(Size, FloorLoop->getIndVar(), {}, /*HasNUW=*/true);
2763 Value *Shift =
2764 Builder.CreateAdd(Scale, TileLoop->getIndVar(), {}, /*HasNUW=*/true);
2765 OrigIndVar->replaceAllUsesWith(Shift);
2766 }
2767
2768 // Remove unused parts of the original loops.
2769 removeUnusedBlocksFromParent(OldControlBBs);
2770
2771 for (CanonicalLoopInfo *L : Loops)
2772 L->invalidate();
2773
2774 #ifndef NDEBUG
2775 for (CanonicalLoopInfo *GenL : Result)
2776 GenL->assertOK();
2777 #endif
2778 return Result;
2779 }
2780
2781 /// Attach loop metadata \p Properties to the loop described by \p Loop.
If the
2782 /// loop already has metadata, the loop properties are appended.
2783 static void addLoopMetadata(CanonicalLoopInfo *Loop,
2784 ArrayRef<Metadata *> Properties) {
2785 assert(Loop->isValid() && "Expecting a valid CanonicalLoopInfo");
2786
2787 // Nothing to do if there are no properties to attach.
2788 if (Properties.empty())
2789 return;
2790
2791 LLVMContext &Ctx = Loop->getFunction()->getContext();
2792 SmallVector<Metadata *> NewLoopProperties;
2793 NewLoopProperties.push_back(nullptr);
2794
2795 // If the loop already has metadata, prepend it to the new metadata.
2796 BasicBlock *Latch = Loop->getLatch();
2797 assert(Latch && "A valid CanonicalLoopInfo must have a unique latch");
2798 MDNode *Existing = Latch->getTerminator()->getMetadata(LLVMContext::MD_loop);
2799 if (Existing)
2800 append_range(NewLoopProperties, drop_begin(Existing->operands(), 1));
2801
2802 append_range(NewLoopProperties, Properties);
2803 MDNode *LoopID = MDNode::getDistinct(Ctx, NewLoopProperties);
2804 LoopID->replaceOperandWith(0, LoopID);
2805
2806 Latch->getTerminator()->setMetadata(LLVMContext::MD_loop, LoopID);
2807 }
2808
2809 /// Attach llvm.access.group metadata to the memref instructions of \p Block.
2810 static void addSimdMetadata(BasicBlock *Block, MDNode *AccessGroup,
2811 LoopInfo &LI) {
2812 for (Instruction &I : *Block) {
2813 if (I.mayReadOrWriteMemory()) {
2814 // TODO: This instruction may already have an access group from
2815 // other pragmas, e.g., #pragma clang loop vectorize. Append
2816 // so that the existing metadata is not overwritten.
2817 I.setMetadata(LLVMContext::MD_access_group, AccessGroup);
2818 }
2819 }
2820 }
2821
2822 void OpenMPIRBuilder::unrollLoopFull(DebugLoc, CanonicalLoopInfo *Loop) {
2823 LLVMContext &Ctx = Builder.getContext();
2824 addLoopMetadata(
2825 Loop, {MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
2826 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.full"))});
2827 }
2828
2829 void OpenMPIRBuilder::unrollLoopHeuristic(DebugLoc, CanonicalLoopInfo *Loop) {
2830 LLVMContext &Ctx = Builder.getContext();
2831 addLoopMetadata(
2832 Loop, {
2833 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
2834 });
2835 }
2836
2837 void OpenMPIRBuilder::applySimd(DebugLoc, CanonicalLoopInfo *CanonicalLoop) {
2838 LLVMContext &Ctx = Builder.getContext();
2839
2840 Function *F = CanonicalLoop->getFunction();
2841
2842 FunctionAnalysisManager FAM;
2843 FAM.registerPass([]() { return DominatorTreeAnalysis(); });
2844 FAM.registerPass([]() { return LoopAnalysis(); });
2845 FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
2846
2847 LoopAnalysis LIA;
2848 LoopInfo &&LI = LIA.run(*F, FAM);
2849
2850 Loop *L = LI.getLoopFor(CanonicalLoop->getHeader());
2851
2852 SmallSet<BasicBlock *, 8> Reachable;
2853
2854 // Get the basic blocks from the loop in which memref instructions
2855 // can be found.
2856 // TODO: Generalize getting all blocks inside a CanonicalLoopInfo,
2857 // preferably without running any passes.
2858 for (BasicBlock *Block : L->getBlocks()) {
2859 if (Block == CanonicalLoop->getCond() ||
2860 Block == CanonicalLoop->getHeader())
2861 continue;
2862 Reachable.insert(Block);
2863 }
2864
2865 // Add access group metadata to memory-access instructions.
2866 MDNode *AccessGroup = MDNode::getDistinct(Ctx, {});
2867 for (BasicBlock *BB : Reachable)
2868 addSimdMetadata(BB, AccessGroup, LI);
2869
2870 // Use the above access group metadata to create loop level
2871 // metadata, which should be distinct for each loop.
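// The resulting annotation looks roughly like this in textual IR (an
// illustrative sketch; the actual metadata numbering will differ):
//
//   br label %omp_loop.header, !llvm.loop !0   ; latch terminator
//   !0 = distinct !{!0, !{!"llvm.loop.parallel_accesses", !1},
//                   !{!"llvm.loop.vectorize.enable", i1 true}}
//   !1 = distinct !{}                          ; the access group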
2872 ConstantAsMetadata *BoolConst =
2873 ConstantAsMetadata::get(ConstantInt::getTrue(Type::getInt1Ty(Ctx)));
2874 // TODO: If the loop has existing parallel access metadata, have
2875 // to combine two lists.
2876 addLoopMetadata(
2877 CanonicalLoop,
2878 {MDNode::get(Ctx, {MDString::get(Ctx, "llvm.loop.parallel_accesses"),
2879 AccessGroup}),
2880 MDNode::get(Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.enable"),
2881 BoolConst})});
2882 }
2883
2884 /// Create the TargetMachine object to query the backend for optimization
2885 /// preferences.
2886 ///
2887 /// Ideally, this would be passed from the front-end to the OpenMPBuilder, but
2888 /// e.g. Clang does not pass it to its CodeGen layer and creates it only when
2889 /// needed for the LLVM pass pipeline. We use some default options to avoid
2890 /// having to pass too many settings from the frontend that probably do not
2891 /// matter.
2892 ///
2893 /// Currently, TargetMachine is only sometimes used by the unrollLoopPartial
2894 /// method. If we are going to use TargetMachine for more purposes, especially
2895 /// those that are sensitive to TargetOptions, RelocModel and CodeModel, it
2896 /// might become worth requiring front-ends to pass on their TargetMachine,
2897 /// or at least cache it between methods. Note that while frontends such as Clang
2898 /// have just a single main TargetMachine per translation unit, "target-cpu" and
2899 /// "target-features" that determine the TargetMachine are per-function and can
2900 /// be overridden using __attribute__((target("OPTIONS"))).
2901 static std::unique_ptr<TargetMachine>
2902 createTargetMachine(Function *F, CodeGenOpt::Level OptLevel) {
2903 Module *M = F->getParent();
2904
2905 StringRef CPU = F->getFnAttribute("target-cpu").getValueAsString();
2906 StringRef Features = F->getFnAttribute("target-features").getValueAsString();
2907 const std::string &Triple = M->getTargetTriple();
2908
2909 std::string Error;
2910 const llvm::Target *TheTarget = TargetRegistry::lookupTarget(Triple, Error);
2911 if (!TheTarget)
2912 return {};
2913
2914 llvm::TargetOptions Options;
2915 return std::unique_ptr<TargetMachine>(TheTarget->createTargetMachine(
2916 Triple, CPU, Features, Options, /*RelocModel=*/None, /*CodeModel=*/None,
2917 OptLevel));
2918 }
2919
2920 /// Heuristically determine the best-performing unroll factor for \p CLI. This
2921 /// depends on the target processor. We are reusing the same heuristics as the
2922 /// LoopUnrollPass.
2923 static int32_t computeHeuristicUnrollFactor(CanonicalLoopInfo *CLI) {
2924 Function *F = CLI->getFunction();
2925
2926 // Assume the user requests the most aggressive unrolling, even if the rest of
2927 // the code is optimized using a lower setting.
  CodeGenOpt::Level OptLevel = CodeGenOpt::Aggressive;
  std::unique_ptr<TargetMachine> TM = createTargetMachine(F, OptLevel);

  FunctionAnalysisManager FAM;
  FAM.registerPass([]() { return TargetLibraryAnalysis(); });
  FAM.registerPass([]() { return AssumptionAnalysis(); });
  FAM.registerPass([]() { return DominatorTreeAnalysis(); });
  FAM.registerPass([]() { return LoopAnalysis(); });
  FAM.registerPass([]() { return ScalarEvolutionAnalysis(); });
  FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
  TargetIRAnalysis TIRA;
  if (TM)
    TIRA = TargetIRAnalysis(
        [&](const Function &F) { return TM->getTargetTransformInfo(F); });
  FAM.registerPass([&]() { return TIRA; });

  TargetIRAnalysis::Result &&TTI = TIRA.run(*F, FAM);
  ScalarEvolutionAnalysis SEA;
  ScalarEvolution &&SE = SEA.run(*F, FAM);
  DominatorTreeAnalysis DTA;
  DominatorTree &&DT = DTA.run(*F, FAM);
  LoopAnalysis LIA;
  LoopInfo &&LI = LIA.run(*F, FAM);
  AssumptionAnalysis ACT;
  AssumptionCache &&AC = ACT.run(*F, FAM);
  OptimizationRemarkEmitter ORE{F};

  Loop *L = LI.getLoopFor(CLI->getHeader());
  assert(L && "Expecting CanonicalLoopInfo to be recognized as a loop");

  TargetTransformInfo::UnrollingPreferences UP =
      gatherUnrollingPreferences(L, SE, TTI,
                                 /*BlockFrequencyInfo=*/nullptr,
                                 /*ProfileSummaryInfo=*/nullptr, ORE, OptLevel,
                                 /*UserThreshold=*/None,
                                 /*UserCount=*/None,
                                 /*UserAllowPartial=*/true,
                                 /*UserAllowRuntime=*/true,
                                 /*UserUpperBound=*/None,
                                 /*UserFullUnrollMaxCount=*/None);

  UP.Force = true;

  // Account for additional optimizations taking place before the
  // LoopUnrollPass would unroll the loop.
  UP.Threshold *= UnrollThresholdFactor;
  UP.PartialThreshold *= UnrollThresholdFactor;

  // Use normal unroll factors even if the rest of the code is optimized for
  // size.
  UP.OptSizeThreshold = UP.Threshold;
  UP.PartialOptSizeThreshold = UP.PartialThreshold;

  LLVM_DEBUG(dbgs() << "Unroll heuristic thresholds:\n"
                    << "  Threshold=" << UP.Threshold << "\n"
                    << "  PartialThreshold=" << UP.PartialThreshold << "\n"
                    << "  OptSizeThreshold=" << UP.OptSizeThreshold << "\n"
                    << "  PartialOptSizeThreshold="
                    << UP.PartialOptSizeThreshold << "\n");

  // Disable peeling.
  TargetTransformInfo::PeelingPreferences PP =
      gatherPeelingPreferences(L, SE, TTI,
                               /*UserAllowPeeling=*/false,
                               /*UserAllowProfileBasedPeeling=*/false,
                               /*UnrollingSpecificValues=*/false);

  SmallPtrSet<const Value *, 32> EphValues;
  CodeMetrics::collectEphemeralValues(L, &AC, EphValues);

  // Assume that reads and writes to stack variables can be eliminated by
  // Mem2Reg, SROA or LICM. That is, don't count them towards the loop body's
  // size.
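  // For instance (illustrative), given
  //   %x = alloca i32          ; in the entry block
  //   ...
  //   %v = load i32, i32* %x   ; inside the loop
  // the load is added to EphValues below and not counted by
  // ApproximateLoopSize, since Mem2Reg would promote %x to a register anyway.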
  for (BasicBlock *BB : L->blocks()) {
    for (Instruction &I : *BB) {
      Value *Ptr;
      if (auto *Load = dyn_cast<LoadInst>(&I)) {
        Ptr = Load->getPointerOperand();
      } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
        Ptr = Store->getPointerOperand();
      } else
        continue;

      Ptr = Ptr->stripPointerCasts();

      if (auto *Alloca = dyn_cast<AllocaInst>(Ptr)) {
        if (Alloca->getParent() == &F->getEntryBlock())
          EphValues.insert(&I);
      }
    }
  }

  unsigned NumInlineCandidates;
  bool NotDuplicatable;
  bool Convergent;
  InstructionCost LoopSizeIC =
      ApproximateLoopSize(L, NumInlineCandidates, NotDuplicatable, Convergent,
                          TTI, EphValues, UP.BEInsns);
  LLVM_DEBUG(dbgs() << "Estimated loop size is " << LoopSizeIC << "\n");

  // The loop is not unrollable if it contains certain instructions.
  if (NotDuplicatable || Convergent || !LoopSizeIC.isValid()) {
    LLVM_DEBUG(dbgs() << "Loop not considered unrollable\n");
    return 1;
  }
  unsigned LoopSize = *LoopSizeIC.getValue();

  // TODO: Determine the trip count of \p CLI if it is constant;
  // computeUnrollCount might be able to use it.
  int TripCount = 0;
  int MaxTripCount = 0;
  bool MaxOrZero = false;
  unsigned TripMultiple = 0;

  bool UseUpperBound = false;
  computeUnrollCount(L, TTI, DT, &LI, SE, EphValues, &ORE, TripCount,
                     MaxTripCount, MaxOrZero, TripMultiple, LoopSize, UP, PP,
                     UseUpperBound);
  unsigned Factor = UP.Count;
  LLVM_DEBUG(dbgs() << "Suggesting unroll factor of " << Factor << "\n");

  // This function returns 1 to signal that the loop should not be unrolled.
  if (Factor == 0)
    return 1;
  return Factor;
}

void OpenMPIRBuilder::unrollLoopPartial(DebugLoc DL, CanonicalLoopInfo *Loop,
                                        int32_t Factor,
                                        CanonicalLoopInfo **UnrolledCLI) {
  assert(Factor >= 0 && "Unroll factor must not be negative");

  Function *F = Loop->getFunction();
  LLVMContext &Ctx = F->getContext();

  // If the unrolled loop is not used for another loop-associated directive,
  // it is sufficient to add metadata for the LoopUnrollPass.
  if (!UnrolledCLI) {
    SmallVector<Metadata *, 2> LoopMetadata;
    LoopMetadata.push_back(
        MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")));

    if (Factor >= 1) {
      ConstantAsMetadata *FactorConst = ConstantAsMetadata::get(
          ConstantInt::get(Type::getInt32Ty(Ctx), APInt(32, Factor)));
      LoopMetadata.push_back(MDNode::get(
          Ctx, {MDString::get(Ctx, "llvm.loop.unroll.count"), FactorConst}));
    }

    addLoopMetadata(Loop, LoopMetadata);
    return;
  }

  // Heuristically determine the unroll factor.
  if (Factor == 0)
    Factor = computeHeuristicUnrollFactor(Loop);

  // No change required with unroll factor 1.
  if (Factor == 1) {
    *UnrolledCLI = Loop;
    return;
  }

  assert(Factor >= 2 &&
         "unrolling only makes sense with a factor of 2 or larger");

  Type *IndVarTy = Loop->getIndVarType();

  // Apply partial unrolling by tiling the loop by the unroll-factor, then
  // fully unroll the inner loop.
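  // Roughly, for Factor == 4 (illustrative), a loop over [0, N) becomes
  //   for (int I = 0; I < N; I += 4)              // floor loop, *UnrolledCLI
  //     for (int J = I; J < min(I + 4, N); ++J)   // tile loop
  //       Body(J);
  // where the inner tile loop is the one annotated with the unroll metadata
  // below.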
  Value *FactorVal =
      ConstantInt::get(IndVarTy, APInt(IndVarTy->getIntegerBitWidth(), Factor,
                                       /*isSigned=*/false));
  std::vector<CanonicalLoopInfo *> LoopNest =
      tileLoops(DL, {Loop}, {FactorVal});
  assert(LoopNest.size() == 2 && "Expect 2 loops after tiling");
  *UnrolledCLI = LoopNest[0];
  CanonicalLoopInfo *InnerLoop = LoopNest[1];

  // LoopUnrollPass can only fully unroll loops with constant trip count.
  // Unroll by the unroll factor with a fallback epilog for the remainder
  // iterations if necessary.
  ConstantAsMetadata *FactorConst = ConstantAsMetadata::get(
      ConstantInt::get(Type::getInt32Ty(Ctx), APInt(32, Factor)));
  addLoopMetadata(
      InnerLoop,
      {MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
       MDNode::get(
           Ctx, {MDString::get(Ctx, "llvm.loop.unroll.count"), FactorConst})});

#ifndef NDEBUG
  (*UnrolledCLI)->assertOK();
#endif
}

OpenMPIRBuilder::InsertPointTy
OpenMPIRBuilder::createCopyPrivate(const LocationDescription &Loc,
                                   llvm::Value *BufSize, llvm::Value *CpyBuf,
                                   llvm::Value *CpyFn, llvm::Value *DidIt) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *ThreadId = getOrCreateThreadID(Ident);

  llvm::Value *DidItLD = Builder.CreateLoad(Builder.getInt32Ty(), DidIt);

  Value *Args[] = {Ident, ThreadId, BufSize, CpyBuf, CpyFn, DidItLD};

  Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_copyprivate);
  Builder.CreateCall(Fn, Args);

  return Builder.saveIP();
}

OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createSingle(
    const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
    FinalizeCallbackTy FiniCB, bool IsNowait, llvm::Value *DidIt) {

  if (!updateToLocation(Loc))
    return Loc.IP;

  // If needed (i.e. not null), initialize `DidIt` with 0.
  if (DidIt) {
    Builder.CreateStore(Builder.getInt32(0), DidIt);
  }

  Directive OMPD = Directive::OMPD_single;
  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *ThreadId = getOrCreateThreadID(Ident);
  Value *Args[] = {Ident, ThreadId};

  Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_single);
  Instruction *EntryCall = Builder.CreateCall(EntryRTLFn, Args);

  Function *ExitRTLFn =
      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_single);
  Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, Args);

  // Generates the following:
  // if (__kmpc_single()) {
  //   ... single region ...
  //   __kmpc_end_single
  // }
  // __kmpc_barrier

  EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
                       /*Conditional*/ true,
                       /*hasFinalize*/ true);
  if (!IsNowait)
    createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
                  omp::Directive::OMPD_unknown, /* ForceSimpleCall */ false,
                  /* CheckCancelFlag */ false);
  return Builder.saveIP();
}

OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createCritical(
    const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
    FinalizeCallbackTy FiniCB, StringRef CriticalName, Value *HintInst) {

  if (!updateToLocation(Loc))
    return Loc.IP;

  Directive OMPD = Directive::OMPD_critical;
  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *ThreadId = getOrCreateThreadID(Ident);
  Value *LockVar = getOMPCriticalRegionLock(CriticalName);
  Value *Args[] = {Ident, ThreadId, LockVar};

  SmallVector<llvm::Value *, 4> EnterArgs(std::begin(Args), std::end(Args));
  Function *RTFn = nullptr;
  if (HintInst) {
    // Add the hint to the entry args and create the call.
    EnterArgs.push_back(HintInst);
    RTFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_critical_with_hint);
  } else {
    RTFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_critical);
  }
  Instruction *EntryCall = Builder.CreateCall(RTFn, EnterArgs);

  Function *ExitRTLFn =
      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_critical);
  Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, Args);

  return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
                              /*Conditional*/ false, /*hasFinalize*/ true);
}

OpenMPIRBuilder::InsertPointTy
OpenMPIRBuilder::createOrderedDepend(const LocationDescription &Loc,
                                     InsertPointTy AllocaIP, unsigned NumLoops,
                                     ArrayRef<llvm::Value *> StoreValues,
                                     const Twine &Name, bool IsDependSource) {
  for (size_t I = 0; I < StoreValues.size(); I++)
    assert(StoreValues[I]->getType()->isIntegerTy(64) &&
           "OpenMP runtime requires depend vec with i64 type");

  if (!updateToLocation(Loc))
    return Loc.IP;

  // Allocate space for the depend vector and generate the alloca instruction.
  auto *ArrI64Ty = ArrayType::get(Int64, NumLoops);
  Builder.restoreIP(AllocaIP);
  AllocaInst *ArgsBase = Builder.CreateAlloca(ArrI64Ty, nullptr, Name);
  ArgsBase->setAlignment(Align(8));
  Builder.restoreIP(Loc.IP);

  // Store the index values with offsets into the depend vector.
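  // For NumLoops == 2 (illustrative), the emitted IR is roughly:
  //   %vec = alloca [2 x i64], align 8
  //   store i64 %iv0, i64* %gep0, align 8
  //   store i64 %iv1, i64* %gep1, align 8
  //   call void @__kmpc_doacross_post(%struct.ident_t* @loc, i32 %tid,
  //                                   i64* %gep0)
  // (or @__kmpc_doacross_wait for a depend(sink : ...) clause).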
  for (unsigned I = 0; I < NumLoops; ++I) {
    Value *DependAddrGEPIter = Builder.CreateInBoundsGEP(
        ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(I)});
    StoreInst *STInst = Builder.CreateStore(StoreValues[I], DependAddrGEPIter);
    STInst->setAlignment(Align(8));
  }

  Value *DependBaseAddrGEP = Builder.CreateInBoundsGEP(
      ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(0)});

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *ThreadId = getOrCreateThreadID(Ident);
  Value *Args[] = {Ident, ThreadId, DependBaseAddrGEP};

  Function *RTLFn = nullptr;
  if (IsDependSource)
    RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_doacross_post);
  else
    RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_doacross_wait);
  Builder.CreateCall(RTLFn, Args);

  return Builder.saveIP();
}

OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createOrderedThreadsSimd(
    const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
    FinalizeCallbackTy FiniCB, bool IsThreads) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  Directive OMPD = Directive::OMPD_ordered;
  Instruction *EntryCall = nullptr;
  Instruction *ExitCall = nullptr;

  if (IsThreads) {
    uint32_t SrcLocStrSize;
    Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
    Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
    Value *ThreadId = getOrCreateThreadID(Ident);
    Value *Args[] = {Ident, ThreadId};

    Function *EntryRTLFn =
        getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_ordered);
    EntryCall = Builder.CreateCall(EntryRTLFn, Args);

    Function *ExitRTLFn =
        getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_ordered);
    ExitCall = Builder.CreateCall(ExitRTLFn, Args);
  }

  return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
                              /*Conditional*/ false, /*hasFinalize*/ true);
}

OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::EmitOMPInlinedRegion(
    Directive OMPD, Instruction *EntryCall, Instruction *ExitCall,
    BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool Conditional,
    bool HasFinalize, bool IsCancellable) {

  if (HasFinalize)
    FinalizationStack.push_back({FiniCB, OMPD, IsCancellable});

  // Create the inlined region's entry and body blocks, in preparation for
  // conditional creation.
  BasicBlock *EntryBB = Builder.GetInsertBlock();
  Instruction *SplitPos = EntryBB->getTerminator();
  if (!isa_and_nonnull<BranchInst>(SplitPos))
    SplitPos = new UnreachableInst(Builder.getContext(), EntryBB);
  BasicBlock *ExitBB = EntryBB->splitBasicBlock(SplitPos, "omp_region.end");
  BasicBlock *FiniBB =
      EntryBB->splitBasicBlock(EntryBB->getTerminator(),
                               "omp_region.finalize");

  Builder.SetInsertPoint(EntryBB->getTerminator());
  emitCommonDirectiveEntry(OMPD, EntryCall, ExitBB, Conditional);

  // Generate the body.
  BodyGenCB(/* AllocaIP */ InsertPointTy(),
            /* CodeGenIP */ Builder.saveIP());

  // Emit the exit call and do any needed finalization.
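  // At this point the region has, as a rough sketch, the following shape
  // (in the Conditional case):
  //
  //   EntryBB:
  //     ...
  //     br i1 %cond, label %omp_region.body, label %omp_region.end
  //   omp_region.body:       ; filled in by BodyGenCB
  //     ...
  //   omp_region.finalize:   ; FiniCB output and ExitCall end up here
  //     ...
  //   omp_region.end: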
  auto FinIP = InsertPointTy(FiniBB, FiniBB->getFirstInsertionPt());
  assert(FiniBB->getTerminator()->getNumSuccessors() == 1 &&
         FiniBB->getTerminator()->getSuccessor(0) == ExitBB &&
         "Unexpected control flow graph state!!");
  emitCommonDirectiveExit(OMPD, FinIP, ExitCall, HasFinalize);
  assert(FiniBB->getUniquePredecessor()->getUniqueSuccessor() == FiniBB &&
         "Unexpected Control Flow State!");
  MergeBlockIntoPredecessor(FiniBB);

  // If we are skipping the region of a non-conditional, remove the exit
  // block, and clear the builder's insertion point.
  assert(SplitPos->getParent() == ExitBB &&
         "Unexpected Insertion point location!");
  auto merged = MergeBlockIntoPredecessor(ExitBB);
  BasicBlock *ExitPredBB = SplitPos->getParent();
  auto InsertBB = merged ? ExitPredBB : ExitBB;
  if (!isa_and_nonnull<BranchInst>(SplitPos))
    SplitPos->eraseFromParent();
  Builder.SetInsertPoint(InsertBB);

  return Builder.saveIP();
}

OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitCommonDirectiveEntry(
    Directive OMPD, Value *EntryCall, BasicBlock *ExitBB, bool Conditional) {
  // If there is nothing to do, return the current insertion point.
  if (!Conditional || !EntryCall)
    return Builder.saveIP();

  BasicBlock *EntryBB = Builder.GetInsertBlock();
  Value *CallBool = Builder.CreateIsNotNull(EntryCall);
  auto *ThenBB = BasicBlock::Create(M.getContext(), "omp_region.body");
  auto *UI = new UnreachableInst(Builder.getContext(), ThenBB);

  // Emit ThenBB and set the Builder's insertion point there for body
  // generation next. Place the block after the current block.
  Function *CurFn = EntryBB->getParent();
  CurFn->getBasicBlockList().insertAfter(EntryBB->getIterator(), ThenBB);

  // Move the entry branch to the end of ThenBB, and replace it with a
  // conditional branch (if-stmt).
  Instruction *EntryBBTI = EntryBB->getTerminator();
  Builder.CreateCondBr(CallBool, ThenBB, ExitBB);
  EntryBBTI->removeFromParent();
  Builder.SetInsertPoint(UI);
  Builder.Insert(EntryBBTI);
  UI->eraseFromParent();
  Builder.SetInsertPoint(ThenBB->getTerminator());

  // Return an insertion point to ExitBB.
  return IRBuilder<>::InsertPoint(ExitBB, ExitBB->getFirstInsertionPt());
}

OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitCommonDirectiveExit(
    omp::Directive OMPD, InsertPointTy FinIP, Instruction *ExitCall,
    bool HasFinalize) {

  Builder.restoreIP(FinIP);

  // If there is finalization to do, emit it before the exit call.
  if (HasFinalize) {
    assert(!FinalizationStack.empty() &&
           "Unexpected finalization stack state!");

    FinalizationInfo Fi = FinalizationStack.pop_back_val();
    assert(Fi.DK == OMPD && "Unexpected Directive for Finalization call!");

    Fi.FiniCB(FinIP);

    BasicBlock *FiniBB = FinIP.getBlock();
    Instruction *FiniBBTI = FiniBB->getTerminator();

    // Set the Builder's IP for call creation.
    Builder.SetInsertPoint(FiniBBTI);
  }

  if (!ExitCall)
    return Builder.saveIP();

  // Place the ExitCall as the last instruction before the finalization block
  // terminator.
  ExitCall->removeFromParent();
  Builder.Insert(ExitCall);

  return IRBuilder<>::InsertPoint(ExitCall->getParent(),
                                  ExitCall->getIterator());
}

OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createCopyinClauseBlocks(
    InsertPointTy IP, Value *MasterAddr, Value *PrivateAddr,
    llvm::IntegerType *IntPtrTy, bool BranchtoEnd) {
  if (!IP.isSet())
    return IP;

  IRBuilder<>::InsertPointGuard IPG(Builder);

  // Creates the following CFG structure:
  //    OMP_Entry : (MasterAddr != PrivateAddr)?
  //       F     T
  //       |      \
  //       |     copyin.not.master
  //       |      /
  //       v     /
  //   copyin.not.master.end
  //       |
  //       v
  //   OMP.Entry.Next

  BasicBlock *OMP_Entry = IP.getBlock();
  Function *CurFn = OMP_Entry->getParent();
  BasicBlock *CopyBegin =
      BasicBlock::Create(M.getContext(), "copyin.not.master", CurFn);
  BasicBlock *CopyEnd = nullptr;

  // If the entry block is terminated, split to preserve the branch to the
  // following basic block (i.e. OMP.Entry.Next); otherwise, leave everything
  // as is.
  if (isa_and_nonnull<BranchInst>(OMP_Entry->getTerminator())) {
    CopyEnd = OMP_Entry->splitBasicBlock(OMP_Entry->getTerminator(),
                                         "copyin.not.master.end");
    OMP_Entry->getTerminator()->eraseFromParent();
  } else {
    CopyEnd =
        BasicBlock::Create(M.getContext(), "copyin.not.master.end", CurFn);
  }

  Builder.SetInsertPoint(OMP_Entry);
  Value *MasterPtr = Builder.CreatePtrToInt(MasterAddr, IntPtrTy);
  Value *PrivatePtr = Builder.CreatePtrToInt(PrivateAddr, IntPtrTy);
  Value *cmp = Builder.CreateICmpNE(MasterPtr, PrivatePtr);
  Builder.CreateCondBr(cmp, CopyBegin, CopyEnd);

  Builder.SetInsertPoint(CopyBegin);
  if (BranchtoEnd)
    Builder.SetInsertPoint(Builder.CreateBr(CopyEnd));

  return Builder.saveIP();
}

CallInst *OpenMPIRBuilder::createOMPAlloc(const LocationDescription &Loc,
                                          Value *Size, Value *Allocator,
                                          std::string Name) {
  IRBuilder<>::InsertPointGuard IPG(Builder);
  Builder.restoreIP(Loc.IP);

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *ThreadId = getOrCreateThreadID(Ident);
  Value *Args[] = {ThreadId, Size, Allocator};

  Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_alloc);

  return Builder.CreateCall(Fn, Args, Name);
}

CallInst *OpenMPIRBuilder::createOMPFree(const LocationDescription &Loc,
                                         Value *Addr, Value *Allocator,
                                         std::string Name) {
  IRBuilder<>::InsertPointGuard IPG(Builder);
  Builder.restoreIP(Loc.IP);

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *ThreadId = getOrCreateThreadID(Ident);
  Value *Args[] = {ThreadId, Addr, Allocator};
  Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_free);
  return Builder.CreateCall(Fn, Args, Name);
}

CallInst *OpenMPIRBuilder::createOMPInteropInit(
    const LocationDescription &Loc, Value *InteropVar,
    omp::OMPInteropType InteropType, Value *Device, Value *NumDependences,
    Value *DependenceAddress, bool HaveNowaitClause) {
  IRBuilder<>::InsertPointGuard IPG(Builder);
  Builder.restoreIP(Loc.IP);

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *ThreadId = getOrCreateThreadID(Ident);
  if (Device == nullptr)
    Device = ConstantInt::get(Int32, -1);
  Constant *InteropTypeVal = ConstantInt::get(Int64, (int)InteropType);
  if (NumDependences == nullptr) {
    NumDependences = ConstantInt::get(Int32, 0);
    PointerType *PointerTypeVar = Type::getInt8PtrTy(M.getContext());
    DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
  }
  Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
  Value *Args[] = {
      Ident,  ThreadId,       InteropVar,        InteropTypeVal,
      Device, NumDependences, DependenceAddress, HaveNowaitClauseVal};

  Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_init);

  return Builder.CreateCall(Fn, Args);
}

CallInst *OpenMPIRBuilder::createOMPInteropDestroy(
    const LocationDescription &Loc, Value *InteropVar, Value *Device,
    Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause) {
  IRBuilder<>::InsertPointGuard IPG(Builder);
  Builder.restoreIP(Loc.IP);

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *ThreadId = getOrCreateThreadID(Ident);
  if (Device == nullptr)
    Device = ConstantInt::get(Int32, -1);
  if (NumDependences == nullptr) {
    NumDependences = ConstantInt::get(Int32, 0);
    PointerType *PointerTypeVar = Type::getInt8PtrTy(M.getContext());
    DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
  }
  Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
  Value *Args[] = {
      Ident,          ThreadId,          InteropVar, Device,
      NumDependences, DependenceAddress, HaveNowaitClauseVal};

  Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_destroy);

  return Builder.CreateCall(Fn, Args);
}

CallInst *OpenMPIRBuilder::createOMPInteropUse(const LocationDescription &Loc,
                                               Value *InteropVar,
                                               Value *Device,
                                               Value *NumDependences,
                                               Value *DependenceAddress,
                                               bool HaveNowaitClause) {
  IRBuilder<>::InsertPointGuard IPG(Builder);
  Builder.restoreIP(Loc.IP);
  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *ThreadId = getOrCreateThreadID(Ident);
  if (Device == nullptr)
    Device = ConstantInt::get(Int32, -1);
  if (NumDependences == nullptr) {
    NumDependences = ConstantInt::get(Int32, 0);
    PointerType *PointerTypeVar = Type::getInt8PtrTy(M.getContext());
    DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
  }
  Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
  Value *Args[] = {
      Ident,          ThreadId,          InteropVar, Device,
      NumDependences, DependenceAddress, HaveNowaitClauseVal};

  Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_use);

  return Builder.CreateCall(Fn, Args);
}

CallInst *OpenMPIRBuilder::createCachedThreadPrivate(
    const LocationDescription &Loc, llvm::Value *Pointer,
    llvm::ConstantInt *Size, const llvm::Twine &Name) {
  IRBuilder<>::InsertPointGuard IPG(Builder);
  Builder.restoreIP(Loc.IP);

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *ThreadId = getOrCreateThreadID(Ident);
  Constant *ThreadPrivateCache =
      getOrCreateOMPInternalVariable(Int8PtrPtr, Name);
  llvm::Value *Args[] = {Ident, ThreadId, Pointer, Size, ThreadPrivateCache};

  Function *Fn =
      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_threadprivate_cached);

  return Builder.CreateCall(Fn, Args);
}

OpenMPIRBuilder::InsertPointTy
OpenMPIRBuilder::createTargetInit(const LocationDescription &Loc, bool IsSPMD,
                                  bool RequiresFullRuntime) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Constant *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  ConstantInt *IsSPMDVal = ConstantInt::getSigned(
      IntegerType::getInt8Ty(Int8->getContext()),
      IsSPMD ?
             OMP_TGT_EXEC_MODE_SPMD : OMP_TGT_EXEC_MODE_GENERIC);
  ConstantInt *UseGenericStateMachine =
      ConstantInt::getBool(Int32->getContext(), !IsSPMD);
  ConstantInt *RequiresFullRuntimeVal =
      ConstantInt::getBool(Int32->getContext(), RequiresFullRuntime);

  Function *Fn = getOrCreateRuntimeFunctionPtr(
      omp::RuntimeFunction::OMPRTL___kmpc_target_init);

  CallInst *ThreadKind = Builder.CreateCall(
      Fn, {Ident, IsSPMDVal, UseGenericStateMachine, RequiresFullRuntimeVal});

  Value *ExecUserCode = Builder.CreateICmpEQ(
      ThreadKind, ConstantInt::get(ThreadKind->getType(), -1),
      "exec_user_code");

  // ThreadKind = __kmpc_target_init(...)
  // if (ThreadKind == -1)
  //   user_code
  // else
  //   return;

  auto *UI = Builder.CreateUnreachable();
  BasicBlock *CheckBB = UI->getParent();
  BasicBlock *UserCodeEntryBB = CheckBB->splitBasicBlock(UI, "user_code.entry");

  BasicBlock *WorkerExitBB = BasicBlock::Create(
      CheckBB->getContext(), "worker.exit", CheckBB->getParent());
  Builder.SetInsertPoint(WorkerExitBB);
  Builder.CreateRetVoid();

  auto *CheckBBTI = CheckBB->getTerminator();
  Builder.SetInsertPoint(CheckBBTI);
  Builder.CreateCondBr(ExecUserCode, UI->getParent(), WorkerExitBB);

  CheckBBTI->eraseFromParent();
  UI->eraseFromParent();

  // Continue in the "user_code" block, see diagram above and in
  // openmp/libomptarget/deviceRTLs/common/include/target.h.
  return InsertPointTy(UserCodeEntryBB, UserCodeEntryBB->getFirstInsertionPt());
}

void OpenMPIRBuilder::createTargetDeinit(const LocationDescription &Loc,
                                         bool IsSPMD,
                                         bool RequiresFullRuntime) {
  if (!updateToLocation(Loc))
    return;

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  ConstantInt *IsSPMDVal = ConstantInt::getSigned(
      IntegerType::getInt8Ty(Int8->getContext()),
      IsSPMD ? OMP_TGT_EXEC_MODE_SPMD : OMP_TGT_EXEC_MODE_GENERIC);
  ConstantInt *RequiresFullRuntimeVal =
      ConstantInt::getBool(Int32->getContext(), RequiresFullRuntime);

  Function *Fn = getOrCreateRuntimeFunctionPtr(
      omp::RuntimeFunction::OMPRTL___kmpc_target_deinit);

  Builder.CreateCall(Fn, {Ident, IsSPMDVal, RequiresFullRuntimeVal});
}

std::string OpenMPIRBuilder::getNameWithSeparators(ArrayRef<StringRef> Parts,
                                                   StringRef FirstSeparator,
                                                   StringRef Separator) {
  SmallString<128> Buffer;
  llvm::raw_svector_ostream OS(Buffer);
  StringRef Sep = FirstSeparator;
  for (StringRef Part : Parts) {
    OS << Sep << Part;
    Sep = Separator;
  }
  return OS.str().str();
}

Constant *OpenMPIRBuilder::getOrCreateOMPInternalVariable(
    llvm::Type *Ty, const llvm::Twine &Name, unsigned AddressSpace) {
  // TODO: Replace the Twine arg with a StringRef to get rid of the conversion
  // logic. However, this is taken from the current implementation in Clang
  // as-is. Since this method is used in many places exclusively for OMP
  // internal use, we will keep it as-is temporarily until we move all users
  // to the builder and then, if possible, fix it everywhere in one go.
  SmallString<256> Buffer;
  llvm::raw_svector_ostream Out(Buffer);
  Out << Name;
  StringRef RuntimeName = Out.str();
  auto &Elem = *InternalVars.try_emplace(RuntimeName, nullptr).first;
  if (Elem.second) {
    assert(cast<PointerType>(Elem.second->getType())
               ->isOpaqueOrPointeeTypeMatches(Ty) &&
           "OMP internal variable has different type than requested");
  } else {
    // TODO: Investigate the appropriate linkage type for the global variable;
    // possibly change it to internal or private, or maybe create different
    // versions of the function for different OMP internal variables.
    Elem.second = new llvm::GlobalVariable(
        M, Ty, /*IsConstant*/ false, llvm::GlobalValue::CommonLinkage,
        llvm::Constant::getNullValue(Ty), Elem.first(),
        /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal,
        AddressSpace);
  }

  return Elem.second;
}

Value *OpenMPIRBuilder::getOMPCriticalRegionLock(StringRef CriticalName) {
  std::string Prefix = Twine("gomp_critical_user_", CriticalName).str();
  std::string Name = getNameWithSeparators({Prefix, "var"}, ".", ".");
  return getOrCreateOMPInternalVariable(KmpCriticalNameTy, Name);
}

GlobalVariable *
OpenMPIRBuilder::createOffloadMaptypes(SmallVectorImpl<uint64_t> &Mappings,
                                       std::string VarName) {
  llvm::Constant *MaptypesArrayInit =
      llvm::ConstantDataArray::get(M.getContext(), Mappings);
  auto *MaptypesArrayGlobal = new llvm::GlobalVariable(
      M, MaptypesArrayInit->getType(),
      /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage,
      MaptypesArrayInit, VarName);
  MaptypesArrayGlobal->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
  return MaptypesArrayGlobal;
}

void OpenMPIRBuilder::createMapperAllocas(const LocationDescription &Loc,
                                          InsertPointTy AllocaIP,
                                          unsigned NumOperands,
                                          struct MapperAllocas &MapperAllocas) {
  if (!updateToLocation(Loc))
    return;

  auto *ArrI8PtrTy = ArrayType::get(Int8Ptr, NumOperands);
  auto *ArrI64Ty = ArrayType::get(Int64, NumOperands);
  Builder.restoreIP(AllocaIP);
  AllocaInst *ArgsBase = Builder.CreateAlloca(ArrI8PtrTy);
  AllocaInst *Args = Builder.CreateAlloca(ArrI8PtrTy);
  AllocaInst *ArgSizes = Builder.CreateAlloca(ArrI64Ty);
  Builder.restoreIP(Loc.IP);
  MapperAllocas.ArgsBase = ArgsBase;
  MapperAllocas.Args = Args;
  MapperAllocas.ArgSizes = ArgSizes;
}

void OpenMPIRBuilder::emitMapperCall(const LocationDescription &Loc,
                                     Function *MapperFunc, Value *SrcLocInfo,
                                     Value *MaptypesArg, Value *MapnamesArg,
                                     struct MapperAllocas &MapperAllocas,
                                     int64_t DeviceID, unsigned NumOperands) {
  if (!updateToLocation(Loc))
    return;

  auto *ArrI8PtrTy = ArrayType::get(Int8Ptr, NumOperands);
  auto *ArrI64Ty = ArrayType::get(Int64, NumOperands);
  Value *ArgsBaseGEP =
      Builder.CreateInBoundsGEP(ArrI8PtrTy, MapperAllocas.ArgsBase,
                                {Builder.getInt32(0), Builder.getInt32(0)});
  Value *ArgsGEP =
      Builder.CreateInBoundsGEP(ArrI8PtrTy, MapperAllocas.Args,
                                {Builder.getInt32(0), Builder.getInt32(0)});
  Value *ArgSizesGEP =
      Builder.CreateInBoundsGEP(ArrI64Ty, MapperAllocas.ArgSizes,
                                {Builder.getInt32(0), Builder.getInt32(0)});
  Value *NullPtr = Constant::getNullValue(Int8Ptr->getPointerTo());
  Builder.CreateCall(MapperFunc,
                     {SrcLocInfo, Builder.getInt64(DeviceID),
                      Builder.getInt32(NumOperands), ArgsBaseGEP, ArgsGEP,
                      ArgSizesGEP, MaptypesArg, MapnamesArg, NullPtr});
}

bool OpenMPIRBuilder::checkAndEmitFlushAfterAtomic(
    const LocationDescription &Loc, llvm::AtomicOrdering AO, AtomicKind AK) {
  assert(!(AO == AtomicOrdering::NotAtomic ||
           AO == llvm::AtomicOrdering::Unordered) &&
         "Unexpected Atomic Ordering.");

  bool Flush = false;
  llvm::AtomicOrdering FlushAO = AtomicOrdering::Monotonic;

  switch (AK) {
  case Read:
    if (AO == AtomicOrdering::Acquire ||
        AO == AtomicOrdering::AcquireRelease ||
        AO == AtomicOrdering::SequentiallyConsistent) {
      FlushAO = AtomicOrdering::Acquire;
      Flush = true;
    }
    break;
  case Write:
  case Compare:
  case Update:
    if (AO == AtomicOrdering::Release ||
        AO == AtomicOrdering::AcquireRelease ||
        AO == AtomicOrdering::SequentiallyConsistent) {
      FlushAO = AtomicOrdering::Release;
      Flush = true;
    }
    break;
  case Capture:
    switch (AO) {
    case AtomicOrdering::Acquire:
      FlushAO = AtomicOrdering::Acquire;
      Flush = true;
      break;
    case AtomicOrdering::Release:
      FlushAO = AtomicOrdering::Release;
      Flush = true;
      break;
    case AtomicOrdering::AcquireRelease:
    case AtomicOrdering::SequentiallyConsistent:
      FlushAO = AtomicOrdering::AcquireRelease;
      Flush = true;
      break;
    default:
      // do nothing - leave silently.
      break;
    }
  }

  if (Flush) {
    // Currently, the flush runtime call does not take a memory ordering
    // argument, so this only resolves which atomic ordering the flush would
    // use and then issues a plain flush call.
    // TODO: pass `FlushAO` after memory ordering support is added.
    (void)FlushAO;
    emitFlush(Loc);
  }

  // For AO == AtomicOrdering::Monotonic and all other case combinations,
  // do nothing.
  return Flush;
}

OpenMPIRBuilder::InsertPointTy
OpenMPIRBuilder::createAtomicRead(const LocationDescription &Loc,
                                  AtomicOpValue &X, AtomicOpValue &V,
                                  AtomicOrdering AO) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  Type *XTy = X.Var->getType();
  assert(XTy->isPointerTy() &&
         "OMP Atomic expects a pointer to target memory");
  Type *XElemTy = X.ElemTy;
  assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
          XElemTy->isPointerTy()) &&
         "OMP atomic read expected a scalar type");

  Value *XRead = nullptr;

  if (XElemTy->isIntegerTy()) {
    LoadInst *XLD =
        Builder.CreateLoad(XElemTy, X.Var, X.IsVolatile, "omp.atomic.read");
    XLD->setAtomic(AO);
    XRead = cast<Value>(XLD);
  } else {
    // We need to bitcast and perform the atomic operation as an integer.
    unsigned Addrspace = cast<PointerType>(XTy)->getAddressSpace();
    IntegerType *IntCastTy =
        IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
    Value *XBCast = Builder.CreateBitCast(
        X.Var, IntCastTy->getPointerTo(Addrspace), "atomic.src.int.cast");
    LoadInst *XLoad =
        Builder.CreateLoad(IntCastTy, XBCast, X.IsVolatile, "omp.atomic.load");
    XLoad->setAtomic(AO);
    if (XElemTy->isFloatingPointTy()) {
      XRead = Builder.CreateBitCast(XLoad, XElemTy, "atomic.flt.cast");
    } else {
      XRead = Builder.CreateIntToPtr(XLoad, XElemTy, "atomic.ptr.cast");
    }
  }
  checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Read);
  Builder.CreateStore(XRead, V.Var, V.IsVolatile);
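  // For reference, for a float X (illustrative), the sequence emitted above
  // is roughly:
  //   %1 = load atomic i32, i32* %x monotonic, align 4  ; via the int cast
  //   %2 = bitcast i32 %1 to float
  //   store float %2, float* %v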
  return Builder.saveIP();
}

OpenMPIRBuilder::InsertPointTy
OpenMPIRBuilder::createAtomicWrite(const LocationDescription &Loc,
                                   AtomicOpValue &X, Value *Expr,
                                   AtomicOrdering AO) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  Type *XTy = X.Var->getType();
  assert(XTy->isPointerTy() &&
         "OMP Atomic expects a pointer to target memory");
  Type *XElemTy = X.ElemTy;
  assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
          XElemTy->isPointerTy()) &&
         "OMP atomic write expected a scalar type");

  if (XElemTy->isIntegerTy()) {
    StoreInst *XSt = Builder.CreateStore(Expr, X.Var, X.IsVolatile);
    XSt->setAtomic(AO);
  } else {
    // We need to bitcast and perform the atomic operation as an integer.
    unsigned Addrspace = cast<PointerType>(XTy)->getAddressSpace();
    IntegerType *IntCastTy =
        IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
    Value *XBCast = Builder.CreateBitCast(
        X.Var, IntCastTy->getPointerTo(Addrspace), "atomic.dst.int.cast");
    Value *ExprCast =
        Builder.CreateBitCast(Expr, IntCastTy, "atomic.src.int.cast");
    StoreInst *XSt = Builder.CreateStore(ExprCast, XBCast, X.IsVolatile);
    XSt->setAtomic(AO);
  }

  checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Write);
  return Builder.saveIP();
}

OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicUpdate(
    const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X,
    Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp,
    AtomicUpdateCallbackTy &UpdateOp, bool IsXBinopExpr) {
  assert(!isConflictIP(Loc.IP, AllocaIP) && "IPs must not be ambiguous");
  if (!updateToLocation(Loc))
    return Loc.IP;

  LLVM_DEBUG({
    Type *XTy = X.Var->getType();
    assert(XTy->isPointerTy() &&
           "OMP Atomic expects a pointer to target memory");
    Type *XElemTy = X.ElemTy;
    assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
            XElemTy->isPointerTy()) &&
           "OMP atomic update expected a scalar type");
    assert((RMWOp != AtomicRMWInst::Max) && (RMWOp != AtomicRMWInst::Min) &&
           (RMWOp != AtomicRMWInst::UMax) && (RMWOp != AtomicRMWInst::UMin) &&
           "OpenMP atomic does not support LT or GT operations");
  });

  emitAtomicUpdate(AllocaIP, X.Var, X.ElemTy, Expr, AO, RMWOp, UpdateOp,
                   X.IsVolatile, IsXBinopExpr);
  checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Update);
  return Builder.saveIP();
}

Value *OpenMPIRBuilder::emitRMWOpAsInstruction(Value *Src1, Value *Src2,
                                               AtomicRMWInst::BinOp RMWOp) {
  switch (RMWOp) {
  case AtomicRMWInst::Add:
    return Builder.CreateAdd(Src1, Src2);
  case AtomicRMWInst::Sub:
    return Builder.CreateSub(Src1, Src2);
  case AtomicRMWInst::And:
    return Builder.CreateAnd(Src1, Src2);
  case AtomicRMWInst::Nand:
    // Nand computes ~(Src1 & Src2), i.e. a bitwise not, not an arithmetic
    // negation.
    return Builder.CreateNot(Builder.CreateAnd(Src1, Src2));
  case AtomicRMWInst::Or:
    return Builder.CreateOr(Src1, Src2);
  case AtomicRMWInst::Xor:
    return Builder.CreateXor(Src1, Src2);
  case AtomicRMWInst::Xchg:
  case AtomicRMWInst::FAdd:
  case AtomicRMWInst::FSub:
  case AtomicRMWInst::BAD_BINOP:
  case AtomicRMWInst::Max:
  case AtomicRMWInst::Min:
  case AtomicRMWInst::UMax:
  case AtomicRMWInst::UMin:
    llvm_unreachable("Unsupported atomic update operation");
  }
  llvm_unreachable("Unsupported atomic update operation");
}
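// For updates that cannot be expressed as a single atomicrmw instruction,
// emitAtomicUpdate below emits a compare-exchange retry loop; as a rough
// sketch (illustrative, for an i32):
//
//   %old = load atomic i32, i32* %x monotonic, align 4
//   br label %cont
// cont:
//   %phi = phi i32 [ %old, %entry ], [ %prev, %cont ]
//   %upd = ... result of UpdateOp(%phi) ...
//   %res = cmpxchg i32* %x, i32 %phi, i32 %upd monotonic monotonic
//   %prev = extractvalue { i32, i1 } %res, 0
//   %ok = extractvalue { i32, i1 } %res, 1
//   br i1 %ok, label %exit, label %cont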
std::pair<Value *, Value *> OpenMPIRBuilder::emitAtomicUpdate(
    InsertPointTy AllocaIP, Value *X, Type *XElemTy, Value *Expr,
    AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp,
    AtomicUpdateCallbackTy &UpdateOp, bool VolatileX, bool IsXBinopExpr) {
  // TODO: handle the case where XElemTy is not byte-sized or not a power of 2
  // or a complex datatype.
  bool emitRMWOp = false;
  switch (RMWOp) {
  case AtomicRMWInst::Add:
  case AtomicRMWInst::And:
  case AtomicRMWInst::Nand:
  case AtomicRMWInst::Or:
  case AtomicRMWInst::Xor:
  case AtomicRMWInst::Xchg:
    emitRMWOp = XElemTy;
    break;
  case AtomicRMWInst::Sub:
    emitRMWOp = (IsXBinopExpr && XElemTy);
    break;
  default:
    emitRMWOp = false;
  }
  emitRMWOp &= XElemTy->isIntegerTy();

  std::pair<Value *, Value *> Res;
  if (emitRMWOp) {
    Res.first = Builder.CreateAtomicRMW(RMWOp, X, Expr, llvm::MaybeAlign(), AO);
    // Not needed except in case of postfix captures. Generate it anyway for
    // consistency with the else part. Will be removed by any DCE pass.
    // AtomicRMWInst::Xchg does not have a corresponding instruction.
    if (RMWOp == AtomicRMWInst::Xchg)
      Res.second = Res.first;
    else
      Res.second = emitRMWOpAsInstruction(Res.first, Expr, RMWOp);
  } else {
    unsigned Addrspace = cast<PointerType>(X->getType())->getAddressSpace();
    IntegerType *IntCastTy =
        IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
    Value *XBCast =
        Builder.CreateBitCast(X, IntCastTy->getPointerTo(Addrspace));
    LoadInst *OldVal =
        Builder.CreateLoad(IntCastTy, XBCast, X->getName() + ".atomic.load");
    OldVal->setAtomic(AO);
    // CurBB
    //  |    /---\
    // ContBB    |
    //  |    \---/
    // ExitBB
    BasicBlock *CurBB = Builder.GetInsertBlock();
    Instruction *CurBBTI = CurBB->getTerminator();
    CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
    BasicBlock *ExitBB =
        CurBB->splitBasicBlock(CurBBTI, X->getName() + ".atomic.exit");
    BasicBlock *ContBB = CurBB->splitBasicBlock(CurBB->getTerminator(),
                                                X->getName() + ".atomic.cont");
    ContBB->getTerminator()->eraseFromParent();
    Builder.restoreIP(AllocaIP);
    AllocaInst *NewAtomicAddr = Builder.CreateAlloca(XElemTy);
    NewAtomicAddr->setName(X->getName() + "x.new.val");
    Builder.SetInsertPoint(ContBB);
    llvm::PHINode *PHI = Builder.CreatePHI(OldVal->getType(), 2);
    PHI->addIncoming(OldVal, CurBB);
    IntegerType *NewAtomicCastTy =
        IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
    bool IsIntTy = XElemTy->isIntegerTy();
    Value *NewAtomicIntAddr =
        (IsIntTy)
            ? NewAtomicAddr
            : Builder.CreateBitCast(NewAtomicAddr,
                                    NewAtomicCastTy->getPointerTo(Addrspace));
    Value *OldExprVal = PHI;
    if (!IsIntTy) {
      if (XElemTy->isFloatingPointTy()) {
        OldExprVal = Builder.CreateBitCast(PHI, XElemTy,
                                           X->getName() + ".atomic.fltCast");
      } else {
        OldExprVal = Builder.CreateIntToPtr(PHI, XElemTy,
                                            X->getName() + ".atomic.ptrCast");
      }
    }

    Value *Upd = UpdateOp(OldExprVal, Builder);
    Builder.CreateStore(Upd, NewAtomicAddr);
    LoadInst *DesiredVal = Builder.CreateLoad(IntCastTy, NewAtomicIntAddr);
    Value *XAddr =
        (IsIntTy)
            ?
              X
            : Builder.CreateBitCast(X, IntCastTy->getPointerTo(Addrspace));
    AtomicOrdering Failure =
        llvm::AtomicCmpXchgInst::getStrongestFailureOrdering(AO);
    AtomicCmpXchgInst *Result = Builder.CreateAtomicCmpXchg(
        XAddr, PHI, DesiredVal, llvm::MaybeAlign(), AO, Failure);
    Result->setVolatile(VolatileX);
    Value *PreviousVal = Builder.CreateExtractValue(Result, /*Idxs=*/0);
    Value *SuccessFailureVal = Builder.CreateExtractValue(Result, /*Idxs=*/1);
    PHI->addIncoming(PreviousVal, Builder.GetInsertBlock());
    Builder.CreateCondBr(SuccessFailureVal, ExitBB, ContBB);

    Res.first = OldExprVal;
    Res.second = Upd;

    // Set the insertion point in the exit block.
    if (UnreachableInst *ExitTI =
            dyn_cast<UnreachableInst>(ExitBB->getTerminator())) {
      CurBBTI->eraseFromParent();
      Builder.SetInsertPoint(ExitBB);
    } else {
      Builder.SetInsertPoint(ExitTI);
    }
  }

  return Res;
}

OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicCapture(
    const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X,
    AtomicOpValue &V, Value *Expr, AtomicOrdering AO,
    AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp,
    bool UpdateExpr, bool IsPostfixUpdate, bool IsXBinopExpr) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  LLVM_DEBUG({
    Type *XTy = X.Var->getType();
    assert(XTy->isPointerTy() &&
           "OMP Atomic expects a pointer to target memory");
    Type *XElemTy = X.ElemTy;
    assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
            XElemTy->isPointerTy()) &&
           "OMP atomic capture expected a scalar type");
    assert((RMWOp != AtomicRMWInst::Max) && (RMWOp != AtomicRMWInst::Min) &&
           "OpenMP atomic does not support LT or GT operations");
  });

  // If UpdateExpr is 'x' updated with some `expr` not based on 'x',
  // 'x' is simply atomically rewritten with 'expr'.
  AtomicRMWInst::BinOp AtomicOp = (UpdateExpr ? RMWOp : AtomicRMWInst::Xchg);
  std::pair<Value *, Value *> Result =
      emitAtomicUpdate(AllocaIP, X.Var, X.ElemTy, Expr, AO, AtomicOp, UpdateOp,
                       X.IsVolatile, IsXBinopExpr);

  Value *CapturedVal = (IsPostfixUpdate ?
                                          Result.first : Result.second);
  Builder.CreateStore(CapturedVal, V.Var, V.IsVolatile);

  checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Capture);
  return Builder.saveIP();
}

OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicCompare(
    const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V,
    AtomicOpValue &R, Value *E, Value *D, AtomicOrdering AO,
    omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate,
    bool IsFailOnly) {

  if (!updateToLocation(Loc))
    return Loc.IP;

  assert(X.Var->getType()->isPointerTy() &&
         "OMP atomic expects a pointer to target memory");
  assert((X.ElemTy->isIntegerTy() || X.ElemTy->isPointerTy()) &&
         "OMP atomic compare expected an integer scalar type");
  // compare capture
  if (V.Var) {
    assert(V.Var->getType()->isPointerTy() &&
           "v.var must be of pointer type");
    assert(V.ElemTy == X.ElemTy && "x and v must be of same type");
  }

  if (Op == OMPAtomicCompareOp::EQ) {
    AtomicOrdering Failure =
        AtomicCmpXchgInst::getStrongestFailureOrdering(AO);
    AtomicCmpXchgInst *Result =
        Builder.CreateAtomicCmpXchg(X.Var, E, D, MaybeAlign(), AO, Failure);
    if (V.Var) {
      Value *OldValue = Builder.CreateExtractValue(Result, /*Idxs=*/0);
      assert(OldValue->getType() == V.ElemTy &&
             "OldValue and V must be of same type");
      if (IsPostfixUpdate) {
        Builder.CreateStore(OldValue, V.Var, V.IsVolatile);
      } else {
        Value *SuccessOrFail = Builder.CreateExtractValue(Result, /*Idxs=*/1);
        if (IsFailOnly) {
          // CurBB----
          //   |     |
          //   v     |
          // ContBB  |
          //   |     |
          //   v     |
          // ExitBB <-
          //
          // where ContBB only contains the store of the old value to 'v'.
          BasicBlock *CurBB = Builder.GetInsertBlock();
          Instruction *CurBBTI = CurBB->getTerminator();
          CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
          BasicBlock *ExitBB = CurBB->splitBasicBlock(
              CurBBTI, X.Var->getName() + ".atomic.exit");
          BasicBlock *ContBB = CurBB->splitBasicBlock(
              CurBB->getTerminator(), X.Var->getName() + ".atomic.cont");
          ContBB->getTerminator()->eraseFromParent();
          CurBB->getTerminator()->eraseFromParent();

          Builder.CreateCondBr(SuccessOrFail, ExitBB, ContBB);

          Builder.SetInsertPoint(ContBB);
          Builder.CreateStore(OldValue, V.Var);
          Builder.CreateBr(ExitBB);

          if (UnreachableInst *ExitTI =
                  dyn_cast<UnreachableInst>(ExitBB->getTerminator())) {
            CurBBTI->eraseFromParent();
            Builder.SetInsertPoint(ExitBB);
          } else {
            Builder.SetInsertPoint(ExitTI);
          }
        } else {
          Value *CapturedValue =
              Builder.CreateSelect(SuccessOrFail, E, OldValue);
          Builder.CreateStore(CapturedValue, V.Var, V.IsVolatile);
        }
      }
    }
    // The comparison result has to be stored.
    if (R.Var) {
      assert(R.Var->getType()->isPointerTy() &&
             "r.var must be of pointer type");
      assert(R.ElemTy->isIntegerTy() && "r must be of integral type");

      Value *SuccessFailureVal =
          Builder.CreateExtractValue(Result, /*Idxs=*/1);
      Value *ResultCast = R.IsSigned
                              ?
                                Builder.CreateSExt(SuccessFailureVal, R.ElemTy)
                              : Builder.CreateZExt(SuccessFailureVal, R.ElemTy);
      Builder.CreateStore(ResultCast, R.Var, R.IsVolatile);
    }
  } else {
    assert((Op == OMPAtomicCompareOp::MAX || Op == OMPAtomicCompareOp::MIN) &&
           "Op should be either max or min at this point");
    assert(!IsFailOnly && "IsFailOnly is only valid when the comparison is ==");

    // Reverse the ordop as the OpenMP forms are different from LLVM forms.
    // Let's take max as an example.
    // OpenMP form:
    //   x = x > expr ? expr : x;
    // LLVM form:
    //   *ptr = *ptr > val ? *ptr : val;
    // We need to transform to LLVM form:
    //   x = x <= expr ? x : expr;
    AtomicRMWInst::BinOp NewOp;
    if (IsXBinopExpr) {
      if (X.IsSigned)
        NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::Min
                                              : AtomicRMWInst::Max;
      else
        NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::UMin
                                              : AtomicRMWInst::UMax;
    } else {
      if (X.IsSigned)
        NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::Max
                                              : AtomicRMWInst::Min;
      else
        NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::UMax
                                              : AtomicRMWInst::UMin;
    }

    AtomicRMWInst *OldValue =
        Builder.CreateAtomicRMW(NewOp, X.Var, E, MaybeAlign(), AO);
    if (V.Var) {
      Value *CapturedValue = nullptr;
      if (IsPostfixUpdate) {
        CapturedValue = OldValue;
      } else {
        CmpInst::Predicate Pred;
        switch (NewOp) {
        case AtomicRMWInst::Max:
          Pred = CmpInst::ICMP_SGT;
          break;
        case AtomicRMWInst::UMax:
          Pred = CmpInst::ICMP_UGT;
          break;
        case AtomicRMWInst::Min:
          Pred = CmpInst::ICMP_SLT;
          break;
        case AtomicRMWInst::UMin:
          Pred = CmpInst::ICMP_ULT;
          break;
        default:
          llvm_unreachable("unexpected comparison op");
        }
        Value *NonAtomicCmp = Builder.CreateCmp(Pred, OldValue, E);
        CapturedValue = Builder.CreateSelect(NonAtomicCmp, E, OldValue);
      }
      Builder.CreateStore(CapturedValue, V.Var, V.IsVolatile);
    }
  }

  checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Compare);

  return Builder.saveIP();
}

GlobalVariable *
OpenMPIRBuilder::createOffloadMapnames(SmallVectorImpl<llvm::Constant *> &Names,
                                       std::string VarName) {
  llvm::Constant *MapNamesArrayInit = llvm::ConstantArray::get(
      llvm::ArrayType::get(
          llvm::Type::getInt8Ty(M.getContext())->getPointerTo(), Names.size()),
      Names);
  auto *MapNamesArrayGlobal = new llvm::GlobalVariable(
      M, MapNamesArrayInit->getType(),
      /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage,
      MapNamesArrayInit, VarName);
  return MapNamesArrayGlobal;
}

// Create all simple and struct types exposed by the runtime and remember
// the llvm::PointerTypes of them for easy access later.
void OpenMPIRBuilder::initializeTypes(Module &M) {
  LLVMContext &Ctx = M.getContext();
  StructType *T;
#define OMP_TYPE(VarName, InitValue) VarName = InitValue;
#define OMP_ARRAY_TYPE(VarName, ElemTy, ArraySize)                             \
  VarName##Ty = ArrayType::get(ElemTy, ArraySize);                             \
  VarName##PtrTy = PointerType::getUnqual(VarName##Ty);
#define OMP_FUNCTION_TYPE(VarName, IsVarArg, ReturnType, ...)                  \
  VarName = FunctionType::get(ReturnType, {__VA_ARGS__}, IsVarArg);            \
  VarName##Ptr = PointerType::getUnqual(VarName);
#define OMP_STRUCT_TYPE(VarName, StructName, ...)                              \
  T = StructType::getTypeByName(Ctx, StructName);                              \
  if (!T)                                                                      \
    T = StructType::create(Ctx, {__VA_ARGS__}, StructName);                    \
  VarName = T;                                                                 \
  VarName##Ptr = PointerType::getUnqual(T);
#include "llvm/Frontend/OpenMP/OMPKinds.def"
}

void OpenMPIRBuilder::OutlineInfo::collectBlocks(
    SmallPtrSetImpl<BasicBlock *> &BlockSet,
    SmallVectorImpl<BasicBlock *> &BlockVector) {
  SmallVector<BasicBlock *, 32> Worklist;
  BlockSet.insert(EntryBB);
  BlockSet.insert(ExitBB);

  Worklist.push_back(EntryBB);
  while (!Worklist.empty()) {
    BasicBlock *BB = Worklist.pop_back_val();
    BlockVector.push_back(BB);
    for (BasicBlock *SuccBB : successors(BB))
      if (BlockSet.insert(SuccBB).second)
        Worklist.push_back(SuccBB);
  }
}

void CanonicalLoopInfo::collectControlBlocks(
    SmallVectorImpl<BasicBlock *> &BBs) {
  // We only count those BBs as control blocks for which we do not need to
  // traverse the CFG, i.e. not the loop body which can contain arbitrary
  // control flow. For consistency, this also means we do not add the Body
  // block, which is just the entry to the body code.
  BBs.reserve(BBs.size() + 6);
  BBs.append({getPreheader(), Header, Cond, Latch, Exit, getAfter()});
}

BasicBlock *CanonicalLoopInfo::getPreheader() const {
  assert(isValid() && "Requires a valid canonical loop");
  for (BasicBlock *Pred : predecessors(Header)) {
    if (Pred != Latch)
      return Pred;
  }
  llvm_unreachable("Missing preheader");
}

void CanonicalLoopInfo::setTripCount(Value *TripCount) {
  assert(isValid() && "Requires a valid canonical loop");

  Instruction *CmpI = &getCond()->front();
  assert(isa<CmpInst>(CmpI) && "First inst must compare IV with TripCount");
  CmpI->setOperand(1, TripCount);

#ifndef NDEBUG
  assertOK();
#endif
}

void CanonicalLoopInfo::mapIndVar(
    llvm::function_ref<Value *(Instruction *)> Updater) {
  assert(isValid() && "Requires a valid canonical loop");

  Instruction *OldIV = getIndVar();

  // Record all uses excluding those introduced by the updater. Uses by the
  // CanonicalLoopInfo itself to keep track of the number of iterations are
  // excluded.
  SmallVector<Use *> ReplaceableUses;
  for (Use &U : OldIV->uses()) {
    auto *User = dyn_cast<Instruction>(U.getUser());
    if (!User)
      continue;
    if (User->getParent() == getCond())
      continue;
    if (User->getParent() == getLatch())
      continue;
    ReplaceableUses.push_back(&U);
  }

  // Run the updater that may introduce new uses.
  Value *NewIV = Updater(OldIV);

  // Replace the old uses with the value returned by the updater.
  for (Use *U : ReplaceableUses)
    U->set(NewIV);

#ifndef NDEBUG
  assertOK();
#endif
}

void CanonicalLoopInfo::assertOK() const {
#ifndef NDEBUG
  // No constraints if this object currently does not describe a loop.
  if (!isValid())
    return;

  BasicBlock *Preheader = getPreheader();
  BasicBlock *Body = getBody();
  BasicBlock *After = getAfter();

  // Verify the standard control flow we use for OpenMP loops.
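  // Schematically, the structure verified below is:
  //
  //   Preheader -> Header -> Cond -+-> Body -> ... -> Latch -> Header
  //                                +-> Exit -> After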
  assert(Preheader);
  assert(isa<BranchInst>(Preheader->getTerminator()) &&
         "Preheader must terminate with unconditional branch");
  assert(Preheader->getSingleSuccessor() == Header &&
         "Preheader must jump to header");

  assert(Header);
  assert(isa<BranchInst>(Header->getTerminator()) &&
         "Header must terminate with unconditional branch");
  assert(Header->getSingleSuccessor() == Cond &&
         "Header must jump to exiting block");

  assert(Cond);
  assert(Cond->getSinglePredecessor() == Header &&
         "Exiting block only reachable from header");

  assert(isa<BranchInst>(Cond->getTerminator()) &&
         "Exiting block must terminate with conditional branch");
  assert(size(successors(Cond)) == 2 &&
         "Exiting block must have two successors");
  assert(cast<BranchInst>(Cond->getTerminator())->getSuccessor(0) == Body &&
         "Exiting block's first successor must jump to the body");
  assert(cast<BranchInst>(Cond->getTerminator())->getSuccessor(1) == Exit &&
         "Exiting block's second successor must exit the loop");

  assert(Body);
  assert(Body->getSinglePredecessor() == Cond &&
         "Body only reachable from exiting block");
  assert(!isa<PHINode>(Body->front()));

  assert(Latch);
  assert(isa<BranchInst>(Latch->getTerminator()) &&
         "Latch must terminate with unconditional branch");
  assert(Latch->getSingleSuccessor() == Header && "Latch must jump to header");
  // TODO: To support simple redirecting of the end of the body code that has
  // multiple exits; introduce another auxiliary basic block like preheader
  // and after.
  assert(Latch->getSinglePredecessor() != nullptr);
  assert(!isa<PHINode>(Latch->front()));

  assert(Exit);
  assert(isa<BranchInst>(Exit->getTerminator()) &&
         "Exit block must terminate with unconditional branch");
  assert(Exit->getSingleSuccessor() == After &&
         "Exit block must jump to after block");

  assert(After);
  assert(After->getSinglePredecessor() == Exit &&
         "After block only reachable from exit block");
  assert(After->empty() || !isa<PHINode>(After->front()));

  Instruction *IndVar = getIndVar();
  assert(IndVar && "Canonical induction variable not found?");
  assert(isa<IntegerType>(IndVar->getType()) &&
         "Induction variable must be an integer");
  assert(cast<PHINode>(IndVar)->getParent() == Header &&
         "Induction variable must be a PHI in the loop header");
  assert(cast<PHINode>(IndVar)->getIncomingBlock(0) == Preheader);
  assert(
      cast<ConstantInt>(cast<PHINode>(IndVar)->getIncomingValue(0))->isZero());
  assert(cast<PHINode>(IndVar)->getIncomingBlock(1) == Latch);

  auto *NextIndVar = cast<PHINode>(IndVar)->getIncomingValue(1);
  assert(cast<Instruction>(NextIndVar)->getParent() == Latch);
  assert(cast<BinaryOperator>(NextIndVar)->getOpcode() == BinaryOperator::Add);
  assert(cast<BinaryOperator>(NextIndVar)->getOperand(0) == IndVar);
  assert(cast<ConstantInt>(cast<BinaryOperator>(NextIndVar)->getOperand(1))
             ->isOne());

  Value *TripCount = getTripCount();
  assert(TripCount && "Loop trip count not found?");
  assert(IndVar->getType() == TripCount->getType() &&
         "Trip count and induction variable must have the same type");

  auto *CmpI = cast<CmpInst>(&Cond->front());
  assert(CmpI->getPredicate() == CmpInst::ICMP_ULT &&
         "Exit condition must be an unsigned less-than comparison");
  assert(CmpI->getOperand(0) ==
             IndVar &&
         "Exit condition must compare the induction variable");
  assert(CmpI->getOperand(1) == TripCount &&
         "Exit condition must compare with the trip count");
#endif
}

void CanonicalLoopInfo::invalidate() {
  Header = nullptr;
  Cond = nullptr;
  Latch = nullptr;
  Exit = nullptr;
}