//===- OpenMPIRBuilder.cpp - Builder for LLVM-IR for OpenMP directives ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
///
/// This file implements the OpenMPIRBuilder class, which is used as a
/// convenient way to create LLVM instructions for OpenMP directives.
///
//===----------------------------------------------------------------------===//

#include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Bitcode/BitcodeReader.h"
#include "llvm/Frontend/Offloading/Utility.h"
#include "llvm/Frontend/OpenMP/OMPGridValues.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DIBuilder.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/PassInstrumentation.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/ReplaceConstant.h"
#include "llvm/IR/Value.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/CodeExtractor.h"
#include "llvm/Transforms/Utils/LoopPeel.h"
#include "llvm/Transforms/Utils/UnrollLoop.h"

#include <cstdint>
#include <optional>

#define DEBUG_TYPE "openmp-ir-builder"

using namespace llvm;
using namespace omp;

static cl::opt<bool>
    OptimisticAttributes("openmp-ir-builder-optimistic-attributes", cl::Hidden,
                         cl::desc("Use optimistic attributes describing "
                                  "'as-if' properties of runtime calls."),
                         cl::init(false));

static cl::opt<double> UnrollThresholdFactor(
    "openmp-ir-builder-unroll-threshold-factor", cl::Hidden,
    cl::desc("Factor for the unroll threshold to account for code "
             "simplifications still taking place"),
    cl::init(1.5));

#ifndef NDEBUG
/// Return whether IP1 and IP2 are ambiguous, i.e. whether inserting
/// instructions at position IP1 may change the meaning of IP2 or vice-versa.
/// This is because an InsertPoint stores the instruction before something is
/// inserted. For instance, if both point to the same instruction, two
/// IRBuilders alternately creating instructions will cause them to be
/// interleaved.
static bool isConflictIP(IRBuilder<>::InsertPoint IP1,
                         IRBuilder<>::InsertPoint IP2) {
  if (!IP1.isSet() || !IP2.isSet())
    return false;
  return IP1.getBlock() == IP2.getBlock() && IP1.getPoint() == IP2.getPoint();
}

static bool isValidWorkshareLoopScheduleType(OMPScheduleType SchedType) {
  // Valid ordered/unordered and base algorithm combinations.
  switch (SchedType & ~OMPScheduleType::MonotonicityMask) {
  case OMPScheduleType::UnorderedStaticChunked:
  case OMPScheduleType::UnorderedStatic:
  case OMPScheduleType::UnorderedDynamicChunked:
  case OMPScheduleType::UnorderedGuidedChunked:
  case OMPScheduleType::UnorderedRuntime:
  case OMPScheduleType::UnorderedAuto:
  case OMPScheduleType::UnorderedTrapezoidal:
  case OMPScheduleType::UnorderedGreedy:
  case OMPScheduleType::UnorderedBalanced:
  case OMPScheduleType::UnorderedGuidedIterativeChunked:
  case OMPScheduleType::UnorderedGuidedAnalyticalChunked:
  case OMPScheduleType::UnorderedSteal:
  case OMPScheduleType::UnorderedStaticBalancedChunked:
  case OMPScheduleType::UnorderedGuidedSimd:
  case OMPScheduleType::UnorderedRuntimeSimd:
  case OMPScheduleType::OrderedStaticChunked:
  case OMPScheduleType::OrderedStatic:
  case OMPScheduleType::OrderedDynamicChunked:
  case OMPScheduleType::OrderedGuidedChunked:
  case OMPScheduleType::OrderedRuntime:
  case OMPScheduleType::OrderedAuto:
  case OMPScheduleType::OrderedTrapezoidal:
  case OMPScheduleType::NomergeUnorderedStaticChunked:
  case OMPScheduleType::NomergeUnorderedStatic:
  case OMPScheduleType::NomergeUnorderedDynamicChunked:
  case OMPScheduleType::NomergeUnorderedGuidedChunked:
  case OMPScheduleType::NomergeUnorderedRuntime:
  case OMPScheduleType::NomergeUnorderedAuto:
  case OMPScheduleType::NomergeUnorderedTrapezoidal:
  case OMPScheduleType::NomergeUnorderedGreedy:
  case OMPScheduleType::NomergeUnorderedBalanced:
  case OMPScheduleType::NomergeUnorderedGuidedIterativeChunked:
  case OMPScheduleType::NomergeUnorderedGuidedAnalyticalChunked:
  case OMPScheduleType::NomergeUnorderedSteal:
  case OMPScheduleType::NomergeOrderedStaticChunked:
  case OMPScheduleType::NomergeOrderedStatic:
  case OMPScheduleType::NomergeOrderedDynamicChunked:
  case OMPScheduleType::NomergeOrderedGuidedChunked:
  case OMPScheduleType::NomergeOrderedRuntime:
  case OMPScheduleType::NomergeOrderedAuto:
  case OMPScheduleType::NomergeOrderedTrapezoidal:
    break;
  default:
    return false;
  }

  // Must not set both monotonicity modifiers at the same time.
  OMPScheduleType MonotonicityFlags =
      SchedType & OMPScheduleType::MonotonicityMask;
  if (MonotonicityFlags == OMPScheduleType::MonotonicityMask)
    return false;

  return true;
}
#endif

static const omp::GV &getGridValue(const Triple &T, Function *Kernel) {
  if (T.isAMDGPU()) {
    StringRef Features =
        Kernel->getFnAttribute("target-features").getValueAsString();
    if (Features.count("+wavefrontsize64"))
      return omp::getAMDGPUGridValues<64>();
    return omp::getAMDGPUGridValues<32>();
  }
  if (T.isNVPTX())
    return omp::NVPTXGridValues;
  llvm_unreachable("No grid value available for this architecture!");
}
/// Determine which scheduling algorithm to use, determined from schedule
/// clause arguments.
static OMPScheduleType
getOpenMPBaseScheduleType(llvm::omp::ScheduleKind ClauseKind, bool HasChunks,
                          bool HasSimdModifier) {
  // Currently, the default schedule is static.
  switch (ClauseKind) {
  case OMP_SCHEDULE_Default:
  case OMP_SCHEDULE_Static:
    return HasChunks ? OMPScheduleType::BaseStaticChunked
                     : OMPScheduleType::BaseStatic;
  case OMP_SCHEDULE_Dynamic:
    return OMPScheduleType::BaseDynamicChunked;
  case OMP_SCHEDULE_Guided:
    return HasSimdModifier ? OMPScheduleType::BaseGuidedSimd
                           : OMPScheduleType::BaseGuidedChunked;
  case OMP_SCHEDULE_Auto:
    return llvm::omp::OMPScheduleType::BaseAuto;
  case OMP_SCHEDULE_Runtime:
    return HasSimdModifier ? OMPScheduleType::BaseRuntimeSimd
                           : OMPScheduleType::BaseRuntime;
  }
  llvm_unreachable("unhandled schedule clause argument");
}

/// Adds ordering modifier flags to schedule type.
static OMPScheduleType
getOpenMPOrderingScheduleType(OMPScheduleType BaseScheduleType,
                              bool HasOrderedClause) {
  assert((BaseScheduleType & OMPScheduleType::ModifierMask) ==
             OMPScheduleType::None &&
         "Must not have ordering or monotonicity flags already set");

  OMPScheduleType OrderingModifier = HasOrderedClause
                                         ? OMPScheduleType::ModifierOrdered
                                         : OMPScheduleType::ModifierUnordered;
  OMPScheduleType OrderingScheduleType = BaseScheduleType | OrderingModifier;

  // Unsupported combinations
  if (OrderingScheduleType ==
      (OMPScheduleType::BaseGuidedSimd | OMPScheduleType::ModifierOrdered))
    return OMPScheduleType::OrderedGuidedChunked;
  else if (OrderingScheduleType == (OMPScheduleType::BaseRuntimeSimd |
                                    OMPScheduleType::ModifierOrdered))
    return OMPScheduleType::OrderedRuntime;

  return OrderingScheduleType;
}

/// Adds monotonicity modifier flags to schedule type.
static OMPScheduleType
getOpenMPMonotonicityScheduleType(OMPScheduleType ScheduleType,
                                  bool HasSimdModifier, bool HasMonotonic,
                                  bool HasNonmonotonic, bool HasOrderedClause) {
  assert((ScheduleType & OMPScheduleType::MonotonicityMask) ==
             OMPScheduleType::None &&
         "Must not have monotonicity flags already set");
  assert((!HasMonotonic || !HasNonmonotonic) &&
         "Monotonic and Nonmonotonic are contradicting each other");

  if (HasMonotonic) {
    return ScheduleType | OMPScheduleType::ModifierMonotonic;
  } else if (HasNonmonotonic) {
    return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
  } else {
    // OpenMP 5.1, 2.11.4 Worksharing-Loop Construct, Description.
    // If the static schedule kind is specified or if the ordered clause is
    // specified, and if the nonmonotonic modifier is not specified, the
    // effect is as if the monotonic modifier is specified. Otherwise, unless
    // the monotonic modifier is specified, the effect is as if the
    // nonmonotonic modifier is specified.
    OMPScheduleType BaseScheduleType =
        ScheduleType & ~OMPScheduleType::ModifierMask;
    if ((BaseScheduleType == OMPScheduleType::BaseStatic) ||
        (BaseScheduleType == OMPScheduleType::BaseStaticChunked) ||
        HasOrderedClause) {
      // Monotonic is used by default in the OpenMP runtime library, so there
      // is no need to set it.
      return ScheduleType;
    } else {
      return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
    }
  }
}
/// Determine the schedule type using schedule and ordering clause arguments.
static OMPScheduleType
computeOpenMPScheduleType(ScheduleKind ClauseKind, bool HasChunks,
                          bool HasSimdModifier, bool HasMonotonicModifier,
                          bool HasNonmonotonicModifier, bool HasOrderedClause) {
  OMPScheduleType BaseSchedule =
      getOpenMPBaseScheduleType(ClauseKind, HasChunks, HasSimdModifier);
  OMPScheduleType OrderedSchedule =
      getOpenMPOrderingScheduleType(BaseSchedule, HasOrderedClause);
  OMPScheduleType Result = getOpenMPMonotonicityScheduleType(
      OrderedSchedule, HasSimdModifier, HasMonotonicModifier,
      HasNonmonotonicModifier, HasOrderedClause);

  assert(isValidWorkshareLoopScheduleType(Result));
  return Result;
}
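// As a worked example of the composition above (not an exhaustive list):
// `schedule(dynamic)` without an ordered clause composes to
// BaseDynamicChunked | ModifierUnordered | ModifierNonmonotonic, i.e. the
// nonmonotonic default mandated by OpenMP 5.1, whereas a plain
// `schedule(static)` composes to BaseStatic | ModifierUnordered and receives
// no monotonicity bit because the runtime already treats static schedules as
// monotonic by default.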
/// Emit an implicit cast to convert \p XRead to the type of variable \p V
static llvm::Value *emitImplicitCast(IRBuilder<> &Builder, llvm::Value *XRead,
                                     llvm::Value *V) {
  // TODO: Add this functionality to the `AtomicInfo` interface
  llvm::Type *XReadType = XRead->getType();
  llvm::Type *VType = V->getType();
  if (llvm::AllocaInst *vAlloca = dyn_cast<llvm::AllocaInst>(V))
    VType = vAlloca->getAllocatedType();

  if (XReadType->isStructTy() && VType->isStructTy())
    // No need to extract or convert. A direct
    // `store` will suffice.
    return XRead;

  if (XReadType->isStructTy())
    XRead = Builder.CreateExtractValue(XRead, /*Idxs=*/0);
  if (VType->isIntegerTy() && XReadType->isFloatingPointTy())
    XRead = Builder.CreateFPToSI(XRead, VType);
  else if (VType->isFloatingPointTy() && XReadType->isIntegerTy())
    XRead = Builder.CreateSIToFP(XRead, VType);
  else if (VType->isIntegerTy() && XReadType->isIntegerTy())
    XRead = Builder.CreateIntCast(XRead, VType, true);
  else if (VType->isFloatingPointTy() && XReadType->isFloatingPointTy())
    XRead = Builder.CreateFPCast(XRead, VType);
  return XRead;
}

/// Make \p Source branch to \p Target.
///
/// Handles two situations:
/// * \p Source already has an unconditional branch.
/// * \p Source is a degenerate block (no terminator because the BB is
///   the current head of the IR construction).
static void redirectTo(BasicBlock *Source, BasicBlock *Target, DebugLoc DL) {
  if (Instruction *Term = Source->getTerminator()) {
    auto *Br = cast<BranchInst>(Term);
    assert(!Br->isConditional() &&
           "BB's terminator must be an unconditional branch (or degenerate)");
    BasicBlock *Succ = Br->getSuccessor(0);
    Succ->removePredecessor(Source, /*KeepOneInputPHIs=*/true);
    Br->setSuccessor(0, Target);
    return;
  }

  auto *NewBr = BranchInst::Create(Target, Source);
  NewBr->setDebugLoc(DL);
}

void llvm::spliceBB(IRBuilderBase::InsertPoint IP, BasicBlock *New,
                    bool CreateBranch) {
  assert(New->getFirstInsertionPt() == New->begin() &&
         "Target BB must not have PHI nodes");

  // Move instructions to new block.
  BasicBlock *Old = IP.getBlock();
  New->splice(New->begin(), Old, IP.getPoint(), Old->end());

  if (CreateBranch)
    BranchInst::Create(New, Old);
}

void llvm::spliceBB(IRBuilder<> &Builder, BasicBlock *New, bool CreateBranch) {
  DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
  BasicBlock *Old = Builder.GetInsertBlock();

  spliceBB(Builder.saveIP(), New, CreateBranch);
  if (CreateBranch)
    Builder.SetInsertPoint(Old->getTerminator());
  else
    Builder.SetInsertPoint(Old);

  // SetInsertPoint also updates the Builder's debug location, but we want to
  // keep the one the Builder was configured to use.
  Builder.SetCurrentDebugLocation(DebugLoc);
}

BasicBlock *llvm::splitBB(IRBuilderBase::InsertPoint IP, bool CreateBranch,
                          llvm::Twine Name) {
  BasicBlock *Old = IP.getBlock();
  BasicBlock *New = BasicBlock::Create(
      Old->getContext(), Name.isTriviallyEmpty() ? Old->getName() : Name,
      Old->getParent(), Old->getNextNode());
  spliceBB(IP, New, CreateBranch);
  New->replaceSuccessorsPhiUsesWith(Old, New);
  return New;
}

BasicBlock *llvm::splitBB(IRBuilderBase &Builder, bool CreateBranch,
                          llvm::Twine Name) {
  DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
  BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, Name);
  if (CreateBranch)
    Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
  else
    Builder.SetInsertPoint(Builder.GetInsertBlock());
  // SetInsertPoint also updates the Builder's debug location, but we want to
  // keep the one the Builder was configured to use.
  Builder.SetCurrentDebugLocation(DebugLoc);
  return New;
}

BasicBlock *llvm::splitBB(IRBuilder<> &Builder, bool CreateBranch,
                          llvm::Twine Name) {
  DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
  BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, Name);
  if (CreateBranch)
    Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
  else
    Builder.SetInsertPoint(Builder.GetInsertBlock());
  // SetInsertPoint also updates the Builder's debug location, but we want to
  // keep the one the Builder was configured to use.
  Builder.SetCurrentDebugLocation(DebugLoc);
  return New;
}

BasicBlock *llvm::splitBBWithSuffix(IRBuilderBase &Builder, bool CreateBranch,
                                    llvm::Twine Suffix) {
  BasicBlock *Old = Builder.GetInsertBlock();
  return splitBB(Builder, CreateBranch, Old->getName() + Suffix);
}
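// A small usage sketch (hypothetical block names; assuming the builder sits
// in a block named "omp.region"):
//   BasicBlock *ContBB = splitBBWithSuffix(Builder, /*CreateBranch=*/true,
//                                          ".cont");
// moves everything after the insert point into a new block "omp.region.cont",
// terminates "omp.region" with an unconditional branch to it, and leaves the
// builder positioned in front of that branch.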
// This function creates a fake integer value and a fake use for the integer
// value. It returns the fake value created. This is useful in modeling the
// extra arguments to the outlined functions.
Value *createFakeIntVal(IRBuilderBase &Builder,
                        OpenMPIRBuilder::InsertPointTy OuterAllocaIP,
                        llvm::SmallVectorImpl<Instruction *> &ToBeDeleted,
                        OpenMPIRBuilder::InsertPointTy InnerAllocaIP,
                        const Twine &Name = "", bool AsPtr = true) {
  Builder.restoreIP(OuterAllocaIP);
  Instruction *FakeVal;
  AllocaInst *FakeValAddr =
      Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, Name + ".addr");
  ToBeDeleted.push_back(FakeValAddr);

  if (AsPtr) {
    FakeVal = FakeValAddr;
  } else {
    FakeVal =
        Builder.CreateLoad(Builder.getInt32Ty(), FakeValAddr, Name + ".val");
    ToBeDeleted.push_back(FakeVal);
  }

  // Generate a fake use of this value
  Builder.restoreIP(InnerAllocaIP);
  Instruction *UseFakeVal;
  if (AsPtr) {
    UseFakeVal =
        Builder.CreateLoad(Builder.getInt32Ty(), FakeVal, Name + ".use");
  } else {
    UseFakeVal =
        cast<BinaryOperator>(Builder.CreateAdd(FakeVal, Builder.getInt32(10)));
  }
  ToBeDeleted.push_back(UseFakeVal);
  return FakeVal;
}
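// For illustration (hypothetical name "gid"): with AsPtr=true,
//   createFakeIntVal(Builder, OuterAllocaIP, ToBeDeleted, InnerAllocaIP,
//                    "gid", /*AsPtr=*/true);
// materializes a "gid.addr" alloca at the outer alloca point and a "gid.use"
// load at the inner one; both land in ToBeDeleted so they are stripped once
// outlining has consumed the extra argument.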
//===----------------------------------------------------------------------===//
// OpenMPIRBuilderConfig
//===----------------------------------------------------------------------===//

namespace {
LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
/// Values for bit flags for marking which requires clauses have been used.
enum OpenMPOffloadingRequiresDirFlags {
  /// flag undefined.
  OMP_REQ_UNDEFINED = 0x000,
  /// no requires directive present.
  OMP_REQ_NONE = 0x001,
  /// reverse_offload clause.
  OMP_REQ_REVERSE_OFFLOAD = 0x002,
  /// unified_address clause.
  OMP_REQ_UNIFIED_ADDRESS = 0x004,
  /// unified_shared_memory clause.
  OMP_REQ_UNIFIED_SHARED_MEMORY = 0x008,
  /// dynamic_allocators clause.
  OMP_REQ_DYNAMIC_ALLOCATORS = 0x010,
  LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/OMP_REQ_DYNAMIC_ALLOCATORS)
};

} // anonymous namespace

OpenMPIRBuilderConfig::OpenMPIRBuilderConfig()
    : RequiresFlags(OMP_REQ_UNDEFINED) {}

OpenMPIRBuilderConfig::OpenMPIRBuilderConfig(
    bool IsTargetDevice, bool IsGPU, bool OpenMPOffloadMandatory,
    bool HasRequiresReverseOffload, bool HasRequiresUnifiedAddress,
    bool HasRequiresUnifiedSharedMemory, bool HasRequiresDynamicAllocators)
    : IsTargetDevice(IsTargetDevice), IsGPU(IsGPU),
      OpenMPOffloadMandatory(OpenMPOffloadMandatory),
      RequiresFlags(OMP_REQ_UNDEFINED) {
  if (HasRequiresReverseOffload)
    RequiresFlags |= OMP_REQ_REVERSE_OFFLOAD;
  if (HasRequiresUnifiedAddress)
    RequiresFlags |= OMP_REQ_UNIFIED_ADDRESS;
  if (HasRequiresUnifiedSharedMemory)
    RequiresFlags |= OMP_REQ_UNIFIED_SHARED_MEMORY;
  if (HasRequiresDynamicAllocators)
    RequiresFlags |= OMP_REQ_DYNAMIC_ALLOCATORS;
}

bool OpenMPIRBuilderConfig::hasRequiresReverseOffload() const {
  return RequiresFlags & OMP_REQ_REVERSE_OFFLOAD;
}

bool OpenMPIRBuilderConfig::hasRequiresUnifiedAddress() const {
  return RequiresFlags & OMP_REQ_UNIFIED_ADDRESS;
}

bool OpenMPIRBuilderConfig::hasRequiresUnifiedSharedMemory() const {
  return RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY;
}

bool OpenMPIRBuilderConfig::hasRequiresDynamicAllocators() const {
  return RequiresFlags & OMP_REQ_DYNAMIC_ALLOCATORS;
}

int64_t OpenMPIRBuilderConfig::getRequiresFlags() const {
  return hasRequiresFlags() ? RequiresFlags
                            : static_cast<int64_t>(OMP_REQ_NONE);
}

void OpenMPIRBuilderConfig::setHasRequiresReverseOffload(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_REVERSE_OFFLOAD;
  else
    RequiresFlags &= ~OMP_REQ_REVERSE_OFFLOAD;
}

void OpenMPIRBuilderConfig::setHasRequiresUnifiedAddress(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_UNIFIED_ADDRESS;
  else
    RequiresFlags &= ~OMP_REQ_UNIFIED_ADDRESS;
}

void OpenMPIRBuilderConfig::setHasRequiresUnifiedSharedMemory(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_UNIFIED_SHARED_MEMORY;
  else
    RequiresFlags &= ~OMP_REQ_UNIFIED_SHARED_MEMORY;
}

void OpenMPIRBuilderConfig::setHasRequiresDynamicAllocators(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_DYNAMIC_ALLOCATORS;
  else
    RequiresFlags &= ~OMP_REQ_DYNAMIC_ALLOCATORS;
}
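// A configuration sketch (values illustrative): a host compilation that saw
// `#pragma omp requires unified_shared_memory` could be set up as
//   OpenMPIRBuilderConfig Config(/*IsTargetDevice=*/false, /*IsGPU=*/false,
//                                /*OpenMPOffloadMandatory=*/false,
//                                /*HasRequiresReverseOffload=*/false,
//                                /*HasRequiresUnifiedAddress=*/false,
//                                /*HasRequiresUnifiedSharedMemory=*/true,
//                                /*HasRequiresDynamicAllocators=*/false);
// after which getRequiresFlags() reports the unified_shared_memory bit
// (0x008) rather than OMP_REQ_NONE.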
//===----------------------------------------------------------------------===//
// OpenMPIRBuilder
//===----------------------------------------------------------------------===//

void OpenMPIRBuilder::getKernelArgsVector(TargetKernelArgs &KernelArgs,
                                          IRBuilderBase &Builder,
                                          SmallVector<Value *> &ArgsVector) {
  Value *Version = Builder.getInt32(OMP_KERNEL_ARG_VERSION);
  Value *PointerNum = Builder.getInt32(KernelArgs.NumTargetItems);
  auto Int32Ty = Type::getInt32Ty(Builder.getContext());
  constexpr const size_t MaxDim = 3;
  Value *ZeroArray = Constant::getNullValue(ArrayType::get(Int32Ty, MaxDim));
  Value *Flags = Builder.getInt64(KernelArgs.HasNoWait);

  assert(!KernelArgs.NumTeams.empty() && !KernelArgs.NumThreads.empty());

  Value *NumTeams3D =
      Builder.CreateInsertValue(ZeroArray, KernelArgs.NumTeams[0], {0});
  Value *NumThreads3D =
      Builder.CreateInsertValue(ZeroArray, KernelArgs.NumThreads[0], {0});
  for (unsigned I :
       seq<unsigned>(1, std::min(KernelArgs.NumTeams.size(), MaxDim)))
    NumTeams3D =
        Builder.CreateInsertValue(NumTeams3D, KernelArgs.NumTeams[I], {I});
  for (unsigned I :
       seq<unsigned>(1, std::min(KernelArgs.NumThreads.size(), MaxDim)))
    NumThreads3D =
        Builder.CreateInsertValue(NumThreads3D, KernelArgs.NumThreads[I], {I});

  ArgsVector = {Version,
                PointerNum,
                KernelArgs.RTArgs.BasePointersArray,
                KernelArgs.RTArgs.PointersArray,
                KernelArgs.RTArgs.SizesArray,
                KernelArgs.RTArgs.MapTypesArray,
                KernelArgs.RTArgs.MapNamesArray,
                KernelArgs.RTArgs.MappersArray,
                KernelArgs.NumIterations,
                Flags,
                NumTeams3D,
                NumThreads3D,
                KernelArgs.DynCGGroupMem};
}
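// For instance, a launch with NumTeams = {%nt} and no second or third
// dimension ends up as (IR sketch, names illustrative):
//   %teams = insertvalue [3 x i32] zeroinitializer, i32 %nt, 0
// so unspecified trailing dimensions stay zero in the [3 x i32] payload.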
void OpenMPIRBuilder::addAttributes(omp::RuntimeFunction FnID, Function &Fn) {
  LLVMContext &Ctx = Fn.getContext();

  // Get the function's current attributes.
  auto Attrs = Fn.getAttributes();
  auto FnAttrs = Attrs.getFnAttrs();
  auto RetAttrs = Attrs.getRetAttrs();
  SmallVector<AttributeSet, 4> ArgAttrs;
  for (size_t ArgNo = 0; ArgNo < Fn.arg_size(); ++ArgNo)
    ArgAttrs.emplace_back(Attrs.getParamAttrs(ArgNo));

  // Add AS to FnAS while taking special care with integer extensions.
  auto addAttrSet = [&](AttributeSet &FnAS, const AttributeSet &AS,
                        bool Param = true) -> void {
    bool HasSignExt = AS.hasAttribute(Attribute::SExt);
    bool HasZeroExt = AS.hasAttribute(Attribute::ZExt);
    if (HasSignExt || HasZeroExt) {
      assert(AS.getNumAttributes() == 1 &&
             "Currently not handling extension attr combined with others.");
      if (Param) {
        if (auto AK = TargetLibraryInfo::getExtAttrForI32Param(T, HasSignExt))
          FnAS = FnAS.addAttribute(Ctx, AK);
      } else if (auto AK =
                     TargetLibraryInfo::getExtAttrForI32Return(T, HasSignExt))
        FnAS = FnAS.addAttribute(Ctx, AK);
    } else {
      FnAS = FnAS.addAttributes(Ctx, AS);
    }
  };

#define OMP_ATTRS_SET(VarName, AttrSet) AttributeSet VarName = AttrSet;
#include "llvm/Frontend/OpenMP/OMPKinds.def"

  // Add attributes to the function declaration.
  switch (FnID) {
#define OMP_RTL_ATTRS(Enum, FnAttrSet, RetAttrSet, ArgAttrSets)                \
  case Enum:                                                                   \
    FnAttrs = FnAttrs.addAttributes(Ctx, FnAttrSet);                           \
    addAttrSet(RetAttrs, RetAttrSet, /*Param*/ false);                         \
    for (size_t ArgNo = 0; ArgNo < ArgAttrSets.size(); ++ArgNo)                \
      addAttrSet(ArgAttrs[ArgNo], ArgAttrSets[ArgNo]);                         \
    Fn.setAttributes(AttributeList::get(Ctx, FnAttrs, RetAttrs, ArgAttrs));    \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  default:
    // Attributes are optional.
    break;
  }
}

FunctionCallee
OpenMPIRBuilder::getOrCreateRuntimeFunction(Module &M, RuntimeFunction FnID) {
  FunctionType *FnTy = nullptr;
  Function *Fn = nullptr;

  // Try to find the declaration in the module first.
  switch (FnID) {
#define OMP_RTL(Enum, Str, IsVarArg, ReturnType, ...)                          \
  case Enum:                                                                   \
    FnTy = FunctionType::get(ReturnType, ArrayRef<Type *>{__VA_ARGS__},        \
                             IsVarArg);                                        \
    Fn = M.getFunction(Str);                                                   \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  }

  if (!Fn) {
    // Create a new declaration if we need one.
    switch (FnID) {
#define OMP_RTL(Enum, Str, ...)                                                \
  case Enum:                                                                   \
    Fn = Function::Create(FnTy, GlobalValue::ExternalLinkage, Str, M);         \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
    }

    // Add information if the runtime function takes a callback function
    if (FnID == OMPRTL___kmpc_fork_call || FnID == OMPRTL___kmpc_fork_teams) {
      if (!Fn->hasMetadata(LLVMContext::MD_callback)) {
        LLVMContext &Ctx = Fn->getContext();
        MDBuilder MDB(Ctx);
        // Annotate the callback behavior of the runtime function:
        //  - The callback callee is argument number 2 (microtask).
        //  - The first two arguments of the callback callee are unknown (-1).
        //  - All variadic arguments to the runtime function are passed to the
        //    callback callee.
        Fn->addMetadata(
            LLVMContext::MD_callback,
            *MDNode::get(Ctx, {MDB.createCallbackEncoding(
                                  2, {-1, -1}, /* VarArgsArePassed */ true)}));
      }
    }

    LLVM_DEBUG(dbgs() << "Created OpenMP runtime function " << Fn->getName()
                      << " with type " << *Fn->getFunctionType() << "\n");
    addAttributes(FnID, *Fn);

  } else {
    LLVM_DEBUG(dbgs() << "Found OpenMP runtime function " << Fn->getName()
                      << " with type " << *Fn->getFunctionType() << "\n");
  }

  assert(Fn && "Failed to create OpenMP runtime function");

  return {FnTy, Fn};
}
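// The resulting declaration carries standard callback metadata, roughly:
//   declare !callback !0 void @__kmpc_fork_call(ptr, i32, ptr, ...)
//   !0 = !{!1}
//   !1 = !{i64 2, i64 -1, i64 -1, i1 true}
// (metadata ids are illustrative; see the LLVM LangRef on '!callback').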
Function *OpenMPIRBuilder::getOrCreateRuntimeFunctionPtr(RuntimeFunction FnID) {
  FunctionCallee RTLFn = getOrCreateRuntimeFunction(M, FnID);
  auto *Fn = dyn_cast<llvm::Function>(RTLFn.getCallee());
  assert(Fn && "Failed to create OpenMP runtime function pointer");
  return Fn;
}

void OpenMPIRBuilder::initialize() { initializeTypes(M); }

static void raiseUserConstantDataAllocasToEntryBlock(IRBuilderBase &Builder,
                                                     Function *Function) {
  BasicBlock &EntryBlock = Function->getEntryBlock();
  BasicBlock::iterator MoveLocInst = EntryBlock.getFirstNonPHIIt();

  // Loop over blocks looking for constant allocas, skipping the entry block
  // as any allocas there are already in the desired location.
  for (auto Block = std::next(Function->begin(), 1); Block != Function->end();
       Block++) {
    for (auto Inst = Block->getReverseIterator()->begin();
         Inst != Block->getReverseIterator()->end();) {
      if (auto *AllocaInst = dyn_cast_if_present<llvm::AllocaInst>(Inst)) {
        Inst++;
        if (!isa<ConstantData>(AllocaInst->getArraySize()))
          continue;
        AllocaInst->moveBeforePreserving(MoveLocInst);
      } else {
        Inst++;
      }
    }
  }
}

void OpenMPIRBuilder::finalize(Function *Fn) {
  SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
  SmallVector<BasicBlock *, 32> Blocks;
  SmallVector<OutlineInfo, 16> DeferredOutlines;
  for (OutlineInfo &OI : OutlineInfos) {
    // Skip functions that have not finalized yet; may happen with nested
    // function generation.
    if (Fn && OI.getFunction() != Fn) {
      DeferredOutlines.push_back(OI);
      continue;
    }

    ParallelRegionBlockSet.clear();
    Blocks.clear();
    OI.collectBlocks(ParallelRegionBlockSet, Blocks);

    Function *OuterFn = OI.getFunction();
    CodeExtractorAnalysisCache CEAC(*OuterFn);
    // If we generate code for the target device, we need to allocate the
    // struct for aggregate params in the device default alloca address space.
    // The OpenMP runtime requires that the params of the extracted functions
    // are passed as zero address space pointers. This flag ensures that
    // CodeExtractor generates correct code for extracted functions
    // which are used by the OpenMP runtime.
    bool ArgsInZeroAddressSpace = Config.isTargetDevice();
    CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
                            /* AggregateArgs */ true,
                            /* BlockFrequencyInfo */ nullptr,
                            /* BranchProbabilityInfo */ nullptr,
                            /* AssumptionCache */ nullptr,
                            /* AllowVarArgs */ true,
                            /* AllowAlloca */ true,
                            /* AllocaBlock*/ OI.OuterAllocaBB,
                            /* Suffix */ ".omp_par", ArgsInZeroAddressSpace);

    LLVM_DEBUG(dbgs() << "Before outlining: " << *OuterFn << "\n");
    LLVM_DEBUG(dbgs() << "Entry " << OI.EntryBB->getName()
                      << " Exit: " << OI.ExitBB->getName() << "\n");
    assert(Extractor.isEligible() &&
           "Expected OpenMP outlining to be possible!");

    for (auto *V : OI.ExcludeArgsFromAggregate)
      Extractor.excludeArgFromAggregate(V);

    Function *OutlinedFn = Extractor.extractCodeRegion(CEAC);

    // Forward target-cpu, target-features attributes to the outlined function.
    auto TargetCpuAttr = OuterFn->getFnAttribute("target-cpu");
    if (TargetCpuAttr.isStringAttribute())
      OutlinedFn->addFnAttr(TargetCpuAttr);

    auto TargetFeaturesAttr = OuterFn->getFnAttribute("target-features");
    if (TargetFeaturesAttr.isStringAttribute())
      OutlinedFn->addFnAttr(TargetFeaturesAttr);

    LLVM_DEBUG(dbgs() << "After outlining: " << *OuterFn << "\n");
    LLVM_DEBUG(dbgs() << " Outlined function: " << *OutlinedFn << "\n");
    assert(OutlinedFn->getReturnType()->isVoidTy() &&
           "OpenMP outlined functions should not return a value!");

    // For compatibility with the clang CG we move the outlined function after
    // the one with the parallel region.
    OutlinedFn->removeFromParent();
    M.getFunctionList().insertAfter(OuterFn->getIterator(), OutlinedFn);

    // Remove the artificial entry introduced by the extractor right away; we
    // made our own entry block after all.
    {
      BasicBlock &ArtificialEntry = OutlinedFn->getEntryBlock();
      assert(ArtificialEntry.getUniqueSuccessor() == OI.EntryBB);
      assert(OI.EntryBB->getUniquePredecessor() == &ArtificialEntry);
      // Move instructions from the to-be-deleted ArtificialEntry to the entry
      // basic block of the parallel region. CodeExtractor generates
      // instructions to unwrap the aggregate argument and may sink
      // allocas/bitcasts for values that are solely used in the outlined
      // region and do not escape.
      assert(!ArtificialEntry.empty() &&
             "Expected instructions to add in the outlined region entry");
      for (BasicBlock::reverse_iterator It = ArtificialEntry.rbegin(),
                                        End = ArtificialEntry.rend();
           It != End;) {
        Instruction &I = *It;
        It++;

        if (I.isTerminator())
          continue;

        I.moveBeforePreserving(*OI.EntryBB, OI.EntryBB->getFirstInsertionPt());
      }

      OI.EntryBB->moveBefore(&ArtificialEntry);
      ArtificialEntry.eraseFromParent();
    }
    assert(&OutlinedFn->getEntryBlock() == OI.EntryBB);
    assert(OutlinedFn && OutlinedFn->getNumUses() == 1);

    // Run a user callback, e.g. to add attributes.
    if (OI.PostOutlineCB)
      OI.PostOutlineCB(*OutlinedFn);
  }

  // Remove work items that have been completed.
  OutlineInfos = std::move(DeferredOutlines);

  // The createTarget functions embed user-written code into the target
  // region, which may inject allocas that need to be moved to the entry
  // block of our target, or we risk malformed optimisations by later passes.
  // This is only relevant for the device pass, which appears to be a little
  // more delicate when it comes to optimisations (however, we do not block
  // on that here; it is up to whoever inserts into the list to do so).
  // This notably has to occur after the OutlinedInfo candidates have been
  // extracted, so that the end product is not implicitly adversely affected
  // by any raises unless intentionally appended to the list.
  // NOTE: This only does so for ConstantData. It could be extended to
  // ConstantExprs with further effort; however, they should largely be
  // folded by the time they get here. Extending it to runtime-defined or
  // read/writeable allocation sizes would be non-trivial (we would need to
  // factor in movement of any stores to variables the allocation size
  // depends on, as well as the usual loads, otherwise it yields the wrong
  // result after movement) and is likely more suitable as an LLVM
  // optimisation pass.
  for (Function *F : ConstantAllocaRaiseCandidates)
    raiseUserConstantDataAllocasToEntryBlock(Builder, F);

  EmitMetadataErrorReportFunctionTy &&ErrorReportFn =
      [](EmitMetadataErrorKind Kind,
         const TargetRegionEntryInfo &EntryInfo) -> void {
    errs() << "Error of kind: " << Kind
           << " when emitting offload entries and metadata during "
              "OMPIRBuilder finalization \n";
  };

  if (!OffloadInfoManager.empty())
    createOffloadEntriesAndInfoMetadata(ErrorReportFn);

  if (Config.EmitLLVMUsedMetaInfo.value_or(false)) {
    std::vector<WeakTrackingVH> LLVMCompilerUsed = {
        M.getGlobalVariable("__openmp_nvptx_data_transfer_temporary_storage")};
    emitUsed("llvm.compiler.used", LLVMCompilerUsed);
  }
}

OpenMPIRBuilder::~OpenMPIRBuilder() {
  assert(OutlineInfos.empty() && "There must be no outstanding outlinings");
}

GlobalValue *OpenMPIRBuilder::createGlobalFlag(unsigned Value, StringRef Name) {
  IntegerType *I32Ty = Type::getInt32Ty(M.getContext());
  auto *GV =
      new GlobalVariable(M, I32Ty,
                         /* isConstant = */ true, GlobalValue::WeakODRLinkage,
                         ConstantInt::get(I32Ty, Value), Name);
  GV->setVisibility(GlobalValue::HiddenVisibility);

  return GV;
}
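// For example (flag name hypothetical), createGlobalFlag(1, "__omp_rtl_flag")
// yields roughly:
//   @__omp_rtl_flag = weak_odr hidden constant i32 1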
void OpenMPIRBuilder::emitUsed(StringRef Name, ArrayRef<WeakTrackingVH> List) {
  if (List.empty())
    return;

  // Convert List to what ConstantArray needs.
  SmallVector<Constant *, 8> UsedArray;
  UsedArray.resize(List.size());
  for (unsigned I = 0, E = List.size(); I != E; ++I)
    UsedArray[I] = ConstantExpr::getPointerBitCastOrAddrSpaceCast(
        cast<Constant>(&*List[I]), Builder.getPtrTy());

  if (UsedArray.empty())
    return;
  ArrayType *ATy = ArrayType::get(Builder.getPtrTy(), UsedArray.size());

  auto *GV = new GlobalVariable(M, ATy, false, GlobalValue::AppendingLinkage,
                                ConstantArray::get(ATy, UsedArray), Name);

  GV->setSection("llvm.metadata");
}

GlobalVariable *
OpenMPIRBuilder::emitKernelExecutionMode(StringRef KernelName,
                                         OMPTgtExecModeFlags Mode) {
  auto *Int8Ty = Builder.getInt8Ty();
  auto *GVMode = new GlobalVariable(
      M, Int8Ty, /*isConstant=*/true, GlobalValue::WeakAnyLinkage,
      ConstantInt::get(Int8Ty, Mode), Twine(KernelName, "_exec_mode"));
  GVMode->setVisibility(GlobalVariable::ProtectedVisibility);
  return GVMode;
}

Constant *OpenMPIRBuilder::getOrCreateIdent(Constant *SrcLocStr,
                                            uint32_t SrcLocStrSize,
                                            IdentFlag LocFlags,
                                            unsigned Reserve2Flags) {
  // Enable "C-mode".
  LocFlags |= OMP_IDENT_FLAG_KMPC;

  Constant *&Ident =
      IdentMap[{SrcLocStr, uint64_t(LocFlags) << 31 | Reserve2Flags}];
  if (!Ident) {
    Constant *I32Null = ConstantInt::getNullValue(Int32);
    Constant *IdentData[] = {I32Null,
                             ConstantInt::get(Int32, uint32_t(LocFlags)),
                             ConstantInt::get(Int32, Reserve2Flags),
                             ConstantInt::get(Int32, SrcLocStrSize), SrcLocStr};
    Constant *Initializer =
        ConstantStruct::get(OpenMPIRBuilder::Ident, IdentData);

    // Look for existing encoding of the location + flags, not needed but
    // minimizes the difference to the existing solution while we transition.
    for (GlobalVariable &GV : M.globals())
      if (GV.getValueType() == OpenMPIRBuilder::Ident && GV.hasInitializer())
        if (GV.getInitializer() == Initializer)
          Ident = &GV;

    if (!Ident) {
      auto *GV = new GlobalVariable(
          M, OpenMPIRBuilder::Ident,
          /* isConstant = */ true, GlobalValue::PrivateLinkage, Initializer, "",
          nullptr, GlobalValue::NotThreadLocal,
          M.getDataLayout().getDefaultGlobalsAddressSpace());
      GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
      GV->setAlignment(Align(8));
      Ident = GV;
    }
  }

  return ConstantExpr::getPointerBitCastOrAddrSpaceCast(Ident, IdentPtr);
}
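// The emitted location roughly looks like (sizes and names illustrative):
//   @0 = private unnamed_addr constant %struct.ident_t
//            { i32 0, i32 2, i32 0, i32 22, ptr @.str }, align 8
// where the second field carries the KMPC flag (0x02) OR'ed with LocFlags and
// @.str is the ";file;function;line;column;;" source-location string.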
Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(StringRef LocStr,
                                                uint32_t &SrcLocStrSize) {
  SrcLocStrSize = LocStr.size();
  Constant *&SrcLocStr = SrcLocStrMap[LocStr];
  if (!SrcLocStr) {
    Constant *Initializer =
        ConstantDataArray::getString(M.getContext(), LocStr);

    // Look for existing encoding of the location, not needed but minimizes the
    // difference to the existing solution while we transition.
    for (GlobalVariable &GV : M.globals())
      if (GV.isConstant() && GV.hasInitializer() &&
          GV.getInitializer() == Initializer)
        return SrcLocStr = ConstantExpr::getPointerCast(&GV, Int8Ptr);

    SrcLocStr = Builder.CreateGlobalString(LocStr, /* Name */ "",
                                           /* AddressSpace */ 0, &M);
  }
  return SrcLocStr;
}

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(StringRef FunctionName,
                                                StringRef FileName,
                                                unsigned Line, unsigned Column,
                                                uint32_t &SrcLocStrSize) {
  SmallString<128> Buffer;
  Buffer.push_back(';');
  Buffer.append(FileName);
  Buffer.push_back(';');
  Buffer.append(FunctionName);
  Buffer.push_back(';');
  Buffer.append(std::to_string(Line));
  Buffer.push_back(';');
  Buffer.append(std::to_string(Column));
  Buffer.push_back(';');
  Buffer.push_back(';');
  return getOrCreateSrcLocStr(Buffer.str(), SrcLocStrSize);
}
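// E.g., function "foo" in "bar.c" at line 3, column 7 encodes as the string
// ";bar.c;foo;3;7;;" (names illustrative; the format follows the Buffer
// construction above).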
Constant *
OpenMPIRBuilder::getOrCreateDefaultSrcLocStr(uint32_t &SrcLocStrSize) {
  StringRef UnknownLoc = ";unknown;unknown;0;0;;";
  return getOrCreateSrcLocStr(UnknownLoc, SrcLocStrSize);
}

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(DebugLoc DL,
                                                uint32_t &SrcLocStrSize,
                                                Function *F) {
  DILocation *DIL = DL.get();
  if (!DIL)
    return getOrCreateDefaultSrcLocStr(SrcLocStrSize);
  StringRef FileName = M.getName();
  if (DIFile *DIF = DIL->getFile())
    if (std::optional<StringRef> Source = DIF->getSource())
      FileName = *Source;
  StringRef Function = DIL->getScope()->getSubprogram()->getName();
  if (Function.empty() && F)
    Function = F->getName();
  return getOrCreateSrcLocStr(Function, FileName, DIL->getLine(),
                              DIL->getColumn(), SrcLocStrSize);
}

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(const LocationDescription &Loc,
                                                uint32_t &SrcLocStrSize) {
  return getOrCreateSrcLocStr(Loc.DL, SrcLocStrSize,
                              Loc.IP.getBlock()->getParent());
}

Value *OpenMPIRBuilder::getOrCreateThreadID(Value *Ident) {
  return Builder.CreateCall(
      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num), Ident,
      "omp_global_thread_num");
}

OpenMPIRBuilder::InsertPointOrErrorTy
OpenMPIRBuilder::createBarrier(const LocationDescription &Loc, Directive Kind,
                               bool ForceSimpleCall, bool CheckCancelFlag) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  // Build call __kmpc_cancel_barrier(loc, thread_id) or
  //            __kmpc_barrier(loc, thread_id);

  IdentFlag BarrierLocFlags;
  switch (Kind) {
  case OMPD_for:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_FOR;
    break;
  case OMPD_sections:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SECTIONS;
    break;
  case OMPD_single:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SINGLE;
    break;
  case OMPD_barrier:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_EXPL;
    break;
  default:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL;
    break;
  }

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Args[] = {
      getOrCreateIdent(SrcLocStr, SrcLocStrSize, BarrierLocFlags),
      getOrCreateThreadID(getOrCreateIdent(SrcLocStr, SrcLocStrSize))};

  // If we are in a cancellable parallel region, barriers are cancellation
  // points.
  // TODO: Check why we would force simple calls or to ignore the cancel flag.
  bool UseCancelBarrier =
      !ForceSimpleCall && isLastFinalizationInfoCancellable(OMPD_parallel);

  Value *Result =
      Builder.CreateCall(getOrCreateRuntimeFunctionPtr(
                             UseCancelBarrier ? OMPRTL___kmpc_cancel_barrier
                                              : OMPRTL___kmpc_barrier),
                         Args);

  if (UseCancelBarrier && CheckCancelFlag)
    if (Error Err = emitCancelationCheckImpl(Result, OMPD_parallel))
      return Err;

  return Builder.saveIP();
}
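// In a cancellable parallel region this emits, roughly (names illustrative):
//   %res = call i32 @__kmpc_cancel_barrier(ptr @loc, i32 %tid)
// followed by the cancellation check; otherwise it is a plain
//   call void @__kmpc_barrier(ptr @loc, i32 %tid)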
OpenMPIRBuilder::InsertPointOrErrorTy
OpenMPIRBuilder::createCancel(const LocationDescription &Loc,
                              Value *IfCondition,
                              omp::Directive CanceledDirective) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  // LLVM utilities like blocks with terminators.
  auto *UI = Builder.CreateUnreachable();

  Instruction *ThenTI = UI, *ElseTI = nullptr;
  if (IfCondition)
    SplitBlockAndInsertIfThenElse(IfCondition, UI, &ThenTI, &ElseTI);
  Builder.SetInsertPoint(ThenTI);

  Value *CancelKind = nullptr;
  switch (CanceledDirective) {
#define OMP_CANCEL_KIND(Enum, Str, DirectiveEnum, Value)                       \
  case DirectiveEnum:                                                          \
    CancelKind = Builder.getInt32(Value);                                      \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  default:
    llvm_unreachable("Unknown cancel kind!");
  }

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *Args[] = {Ident, getOrCreateThreadID(Ident), CancelKind};
  Value *Result = Builder.CreateCall(
      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_cancel), Args);
  auto ExitCB = [this, CanceledDirective, Loc](InsertPointTy IP) -> Error {
    if (CanceledDirective == OMPD_parallel) {
      IRBuilder<>::InsertPointGuard IPG(Builder);
      Builder.restoreIP(IP);
      return createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
                           omp::Directive::OMPD_unknown,
                           /* ForceSimpleCall */ false,
                           /* CheckCancelFlag */ false)
          .takeError();
    }
    return Error::success();
  };

  // The actual cancel logic is shared with others, e.g., cancel_barriers.
  if (Error Err = emitCancelationCheckImpl(Result, CanceledDirective, ExitCB))
    return Err;

  // Update the insertion point and remove the terminator we introduced.
  Builder.SetInsertPoint(UI->getParent());
  UI->eraseFromParent();

  return Builder.saveIP();
}

OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetKernel(
    const LocationDescription &Loc, InsertPointTy AllocaIP, Value *&Return,
    Value *Ident, Value *DeviceID, Value *NumTeams, Value *NumThreads,
    Value *HostPtr, ArrayRef<Value *> KernelArgs) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  Builder.restoreIP(AllocaIP);
  auto *KernelArgsPtr =
      Builder.CreateAlloca(OpenMPIRBuilder::KernelArgs, nullptr, "kernel_args");
  Builder.restoreIP(Loc.IP);

  for (unsigned I = 0, Size = KernelArgs.size(); I != Size; ++I) {
    llvm::Value *Arg =
        Builder.CreateStructGEP(OpenMPIRBuilder::KernelArgs, KernelArgsPtr, I);
    Builder.CreateAlignedStore(
        KernelArgs[I], Arg,
        M.getDataLayout().getPrefTypeAlign(KernelArgs[I]->getType()));
  }

  SmallVector<Value *> OffloadingArgs{Ident,      DeviceID, NumTeams,
                                      NumThreads, HostPtr,  KernelArgsPtr};

  Return = Builder.CreateCall(
      getOrCreateRuntimeFunction(M, OMPRTL___tgt_target_kernel),
      OffloadingArgs);

  return Builder.saveIP();
}
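// The call built above roughly looks like (names illustrative):
//   %rc = call i32 @__tgt_target_kernel(ptr @loc, i64 %device_id,
//                                       i32 %num_teams, i32 %num_threads,
//                                       ptr @region_id, ptr %kernel_args)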
OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitKernelLaunch(
    const LocationDescription &Loc, Value *OutlinedFnID,
    EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args,
    Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP) {

  if (!updateToLocation(Loc))
    return Loc.IP;

  Builder.restoreIP(Loc.IP);
  // On top of the arrays that were filled up, the target offloading call
  // takes as arguments the device id as well as the host pointer. The host
  // pointer is used by the runtime library to identify the current target
  // region, so it only has to be unique and not necessarily point to
  // anything. It could be the pointer to the outlined function that
  // implements the target region, but we aren't using that so that the
  // compiler doesn't need to keep that, and could therefore inline the host
  // function if proven worthwhile during optimization.

  // From this point on, we need to have an ID of the target region defined.
  assert(OutlinedFnID && "Invalid outlined function ID!");
  (void)OutlinedFnID;

  // Return value of the runtime offloading call.
  Value *Return = nullptr;

  // Arguments for the target kernel.
  SmallVector<Value *> ArgsVector;
  getKernelArgsVector(Args, Builder, ArgsVector);

  // The target region is an outlined function launched by the runtime
  // via calls to __tgt_target_kernel().
  //
  // Note that on the host and CPU targets, the runtime implementation of
  // these calls simply call the outlined function without forking threads.
  // The outlined functions themselves have runtime calls to
  // __kmpc_fork_teams() and __kmpc_fork() for this purpose, codegen'd by
  // the compiler in emitTeamsCall() and emitParallelCall().
  //
  // In contrast, on the NVPTX target, the implementation of
  // __tgt_target_teams() launches a GPU kernel with the requested number
  // of teams and threads so no additional calls to the runtime are required.
  // Check the error code and execute the host version if required.
  Builder.restoreIP(emitTargetKernel(
      Builder, AllocaIP, Return, RTLoc, DeviceID, Args.NumTeams.front(),
      Args.NumThreads.front(), OutlinedFnID, ArgsVector));

  BasicBlock *OffloadFailedBlock =
      BasicBlock::Create(Builder.getContext(), "omp_offload.failed");
  BasicBlock *OffloadContBlock =
      BasicBlock::Create(Builder.getContext(), "omp_offload.cont");
  Value *Failed = Builder.CreateIsNotNull(Return);
  Builder.CreateCondBr(Failed, OffloadFailedBlock, OffloadContBlock);

  auto CurFn = Builder.GetInsertBlock()->getParent();
  emitBlock(OffloadFailedBlock, CurFn);
  InsertPointOrErrorTy AfterIP = EmitTargetCallFallbackCB(Builder.saveIP());
  if (!AfterIP)
    return AfterIP.takeError();
  Builder.restoreIP(*AfterIP);
  emitBranch(OffloadContBlock);
  emitBlock(OffloadContBlock, CurFn, /*IsFinished=*/true);
  return Builder.saveIP();
}
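// The emitted control flow is, schematically (value names illustrative):
//   %failed = icmp ne i32 %rc, 0
//   br i1 %failed, label %omp_offload.failed, label %omp_offload.cont
// with the fallback host call emitted into %omp_offload.failed.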
Error OpenMPIRBuilder::emitCancelationCheckImpl(
    Value *CancelFlag, omp::Directive CanceledDirective,
    FinalizeCallbackTy ExitCB) {
  assert(isLastFinalizationInfoCancellable(CanceledDirective) &&
         "Unexpected cancellation!");

  // For a cancel barrier we create two new blocks.
  BasicBlock *BB = Builder.GetInsertBlock();
  BasicBlock *NonCancellationBlock;
  if (Builder.GetInsertPoint() == BB->end()) {
    // TODO: This branch will not be needed once we moved to the
    // OpenMPIRBuilder codegen completely.
    NonCancellationBlock = BasicBlock::Create(
        BB->getContext(), BB->getName() + ".cont", BB->getParent());
  } else {
    NonCancellationBlock = SplitBlock(BB, &*Builder.GetInsertPoint());
    BB->getTerminator()->eraseFromParent();
    Builder.SetInsertPoint(BB);
  }
  BasicBlock *CancellationBlock = BasicBlock::Create(
      BB->getContext(), BB->getName() + ".cncl", BB->getParent());

  // Jump to them based on the return value.
  Value *Cmp = Builder.CreateIsNull(CancelFlag);
  Builder.CreateCondBr(Cmp, NonCancellationBlock, CancellationBlock,
                       /* TODO weight */ nullptr, nullptr);

  // From the cancellation block we finalize all variables and go to the
  // post finalization block that is known to the FiniCB callback.
  Builder.SetInsertPoint(CancellationBlock);
  if (ExitCB)
    if (Error Err = ExitCB(Builder.saveIP()))
      return Err;
  auto &FI = FinalizationStack.back();
  if (Error Err = FI.FiniCB(Builder.saveIP()))
    return Err;

  // The continuation block is where code generation continues.
  Builder.SetInsertPoint(NonCancellationBlock, NonCancellationBlock->begin());
  return Error::success();
}

// Callback used to create OpenMP runtime calls to support the
// omp parallel clause for the device.
// We need to use this callback to replace the call to OutlinedFn in OuterFn
// with a call to the OpenMP DeviceRTL runtime function (kmpc_parallel_51).
static void targetParallelCallback(
    OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn,
    BasicBlock *OuterAllocaBB, Value *Ident, Value *IfCondition,
    Value *NumThreads, Instruction *PrivTID, AllocaInst *PrivTIDAddr,
    Value *ThreadID, const SmallVector<Instruction *, 4> &ToBeDeleted) {
  // Add some known attributes.
  IRBuilder<> &Builder = OMPIRBuilder->Builder;
  OutlinedFn.addParamAttr(0, Attribute::NoAlias);
  OutlinedFn.addParamAttr(1, Attribute::NoAlias);
  OutlinedFn.addParamAttr(0, Attribute::NoUndef);
  OutlinedFn.addParamAttr(1, Attribute::NoUndef);
  OutlinedFn.addFnAttr(Attribute::NoUnwind);

  assert(OutlinedFn.arg_size() >= 2 &&
         "Expected at least tid and bounded tid as arguments");
  unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2;

  CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
  assert(CI && "Expected call instruction to outlined function");
  CI->getParent()->setName("omp_parallel");

  Builder.SetInsertPoint(CI);
  Type *PtrTy = OMPIRBuilder->VoidPtr;
  Value *NullPtrValue = Constant::getNullValue(PtrTy);

  // Add alloca for kernel args
  OpenMPIRBuilder::InsertPointTy CurrentIP = Builder.saveIP();
  Builder.SetInsertPoint(OuterAllocaBB, OuterAllocaBB->getFirstInsertionPt());
  AllocaInst *ArgsAlloca =
      Builder.CreateAlloca(ArrayType::get(PtrTy, NumCapturedVars));
  Value *Args = ArgsAlloca;
  // Add address space cast if array for storing arguments is not allocated
  // in address space 0
  if (ArgsAlloca->getAddressSpace())
    Args = Builder.CreatePointerCast(ArgsAlloca, PtrTy);
  Builder.restoreIP(CurrentIP);

  // Store captured vars which are used by kmpc_parallel_51
  for (unsigned Idx = 0; Idx < NumCapturedVars; Idx++) {
    Value *V = *(CI->arg_begin() + 2 + Idx);
    Value *StoreAddress = Builder.CreateConstInBoundsGEP2_64(
        ArrayType::get(PtrTy, NumCapturedVars), Args, 0, Idx);
    Builder.CreateStore(V, StoreAddress);
  }

  Value *Cond =
      IfCondition ? Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32)
                  : Builder.getInt32(1);

  // Build kmpc_parallel_51 call
  Value *Parallel51CallArgs[] = {
      /* identifier*/ Ident,
      /* global thread num*/ ThreadID,
      /* if expression */ Cond,
      /* number of threads */ NumThreads ? NumThreads : Builder.getInt32(-1),
      /* Proc bind */ Builder.getInt32(-1),
      /* outlined function */
      Builder.CreateBitCast(&OutlinedFn, OMPIRBuilder->ParallelTaskPtr),
      /* wrapper function */ NullPtrValue,
      /* arguments of the outlined function */ Args,
      /* number of arguments */ Builder.getInt64(NumCapturedVars)};

  FunctionCallee RTLFn =
      OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_parallel_51);

  Builder.CreateCall(RTLFn, Parallel51CallArgs);

  LLVM_DEBUG(dbgs() << "With kmpc_parallel_51 placed: "
                    << *Builder.GetInsertBlock()->getParent() << "\n");

  // Initialize the local TID stack location with the argument value.
  Builder.SetInsertPoint(PrivTID);
  Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin();
  Builder.CreateStore(Builder.CreateLoad(OMPIRBuilder->Int32, OutlinedAI),
                      PrivTIDAddr);

  // Remove redundant call to the outlined function.
  CI->eraseFromParent();

  for (Instruction *I : ToBeDeleted) {
    I->eraseFromParent();
  }
}
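// The replacement call built here has the shape (IR sketch, names
// illustrative):
//   call void @__kmpc_parallel_51(ptr %ident, i32 %tid, i32 %if_cond,
//                                 i32 %num_threads, i32 -1 /*proc_bind*/,
//                                 ptr @outlined, ptr null, ptr %args,
//                                 i64 %nargs)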
// Callback used to create OpenMP runtime calls to support the
// omp parallel clause for the host.
// We need to use this callback to replace the call to OutlinedFn in OuterFn
// with a call to the OpenMP host runtime function (__kmpc_fork_call[_if]).
static void
hostParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn,
                     Function *OuterFn, Value *Ident, Value *IfCondition,
                     Instruction *PrivTID, AllocaInst *PrivTIDAddr,
                     const SmallVector<Instruction *, 4> &ToBeDeleted) {
  IRBuilder<> &Builder = OMPIRBuilder->Builder;
  FunctionCallee RTLFn;
  if (IfCondition) {
    RTLFn =
        OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call_if);
  } else {
    RTLFn =
        OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call);
  }
  if (auto *F = dyn_cast<Function>(RTLFn.getCallee())) {
    if (!F->hasMetadata(LLVMContext::MD_callback)) {
      LLVMContext &Ctx = F->getContext();
      MDBuilder MDB(Ctx);
      // Annotate the callback behavior of the __kmpc_fork_call:
      //  - The callback callee is argument number 2 (microtask).
      //  - The first two arguments of the callback callee are unknown (-1).
      //  - All variadic arguments to the __kmpc_fork_call are passed to the
      //    callback callee.
      F->addMetadata(LLVMContext::MD_callback,
                     *MDNode::get(Ctx, {MDB.createCallbackEncoding(
                                           2, {-1, -1},
                                           /* VarArgsArePassed */ true)}));
    }
  }
  // Add some known attributes.
  OutlinedFn.addParamAttr(0, Attribute::NoAlias);
  OutlinedFn.addParamAttr(1, Attribute::NoAlias);
  OutlinedFn.addFnAttr(Attribute::NoUnwind);

  assert(OutlinedFn.arg_size() >= 2 &&
         "Expected at least tid and bounded tid as arguments");
  unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2;

  CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
  CI->getParent()->setName("omp_parallel");
  Builder.SetInsertPoint(CI);

  // Build call __kmpc_fork_call[_if](Ident, n, microtask, var1, .., varn);
  Value *ForkCallArgs[] = {
      Ident, Builder.getInt32(NumCapturedVars),
      Builder.CreateBitCast(&OutlinedFn, OMPIRBuilder->ParallelTaskPtr)};

  SmallVector<Value *, 16> RealArgs;
  RealArgs.append(std::begin(ForkCallArgs), std::end(ForkCallArgs));
  if (IfCondition) {
    Value *Cond = Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32);
    RealArgs.push_back(Cond);
  }
  RealArgs.append(CI->arg_begin() + /* tid & bound tid */ 2, CI->arg_end());

  // __kmpc_fork_call_if always expects a void ptr as the last argument.
  // If there are no arguments, pass a null pointer.
  auto PtrTy = OMPIRBuilder->VoidPtr;
  if (IfCondition && NumCapturedVars == 0) {
    Value *NullPtrValue = Constant::getNullValue(PtrTy);
    RealArgs.push_back(NullPtrValue);
  }
  if (IfCondition && RealArgs.back()->getType() != PtrTy)
    RealArgs.back() = Builder.CreateBitCast(RealArgs.back(), PtrTy);

  Builder.CreateCall(RTLFn, RealArgs);

  LLVM_DEBUG(dbgs() << "With fork_call placed: "
                    << *Builder.GetInsertBlock()->getParent() << "\n");

  // Initialize the local TID stack location with the argument value.
  Builder.SetInsertPoint(PrivTID);
  Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin();
  Builder.CreateStore(Builder.CreateLoad(OMPIRBuilder->Int32, OutlinedAI),
                      PrivTIDAddr);

  // Remove redundant call to the outlined function.
  CI->eraseFromParent();

  for (Instruction *I : ToBeDeleted) {
    I->eraseFromParent();
  }
}
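// With two captured pointers this builds, roughly (names illustrative):
//   call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @loc, i32 2,
//                                                    ptr @foo..omp_par,
//                                                    ptr %a, ptr %b)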
OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createParallel(
    const LocationDescription &Loc, InsertPointTy OuterAllocaIP,
    BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB,
    FinalizeCallbackTy FiniCB, Value *IfCondition, Value *NumThreads,
    omp::ProcBindKind ProcBind, bool IsCancellable) {
  assert(!isConflictIP(Loc.IP, OuterAllocaIP) && "IPs must not be ambiguous");

  if (!updateToLocation(Loc))
    return Loc.IP;

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *ThreadID = getOrCreateThreadID(Ident);
  // If we generate code for the target device, we need to allocate the
  // struct for aggregate params in the device default alloca address space.
  // The OpenMP runtime requires that the params of the extracted functions
  // are passed as zero address space pointers. This flag ensures that
  // extracted function arguments are declared in zero address space.
  bool ArgsInZeroAddressSpace = Config.isTargetDevice();

  // Build call __kmpc_push_num_threads(&Ident, global_tid, num_threads)
  // only if we compile for the host side.
  if (NumThreads && !Config.isTargetDevice()) {
    Value *Args[] = {
        Ident, ThreadID,
        Builder.CreateIntCast(NumThreads, Int32, /*isSigned*/ false)};
    Builder.CreateCall(
        getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_num_threads), Args);
  }

  if (ProcBind != OMP_PROC_BIND_default) {
    // Build call __kmpc_push_proc_bind(&Ident, global_tid, proc_bind)
    Value *Args[] = {
        Ident, ThreadID,
        ConstantInt::get(Int32, unsigned(ProcBind), /*isSigned=*/true)};
    Builder.CreateCall(
        getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_proc_bind), Args);
  }

  BasicBlock *InsertBB = Builder.GetInsertBlock();
  Function *OuterFn = InsertBB->getParent();

  // Save the outer alloca block because the insertion iterator may get
  // invalidated and we still need this later.
  BasicBlock *OuterAllocaBlock = OuterAllocaIP.getBlock();

  // Vector to remember instructions we used only during the modeling but which
  // we want to delete at the end.
  SmallVector<Instruction *, 4> ToBeDeleted;

  // Change the location to the outer alloca insertion point to create and
  // initialize the allocas we pass into the parallel region.
1480 InsertPointTy NewOuter(OuterAllocaBlock, OuterAllocaBlock->begin());
1481 Builder.restoreIP(NewOuter);
1482 AllocaInst *TIDAddrAlloca = Builder.CreateAlloca(Int32, nullptr, "tid.addr");
1483 AllocaInst *ZeroAddrAlloca =
1484 Builder.CreateAlloca(Int32, nullptr, "zero.addr");
1485 Instruction *TIDAddr = TIDAddrAlloca;
1486 Instruction *ZeroAddr = ZeroAddrAlloca;
1487 if (ArgsInZeroAddressSpace && M.getDataLayout().getAllocaAddrSpace() != 0) {
1488 // Add additional casts to enforce pointers in the zero address space.
1489 TIDAddr = new AddrSpaceCastInst(
1490 TIDAddrAlloca, PointerType::get(M.getContext(), 0), "tid.addr.ascast");
1491 TIDAddr->insertAfter(TIDAddrAlloca->getIterator());
1492 ToBeDeleted.push_back(TIDAddr);
1493 ZeroAddr = new AddrSpaceCastInst(ZeroAddrAlloca,
1494 PointerType::get(M.getContext(), 0),
1495 "zero.addr.ascast");
1496 ZeroAddr->insertAfter(ZeroAddrAlloca->getIterator());
1497 ToBeDeleted.push_back(ZeroAddr);
1498 }
1499
1500 // We only need TIDAddr and ZeroAddr for modeling purposes to get the
1501 // associated arguments in the outlined function, so we delete them later.
1502 ToBeDeleted.push_back(TIDAddrAlloca);
1503 ToBeDeleted.push_back(ZeroAddrAlloca);
1504
1505 // Create an artificial insertion point that will also ensure the blocks we
1506 // are about to split are not degenerated.
1507 auto *UI = new UnreachableInst(Builder.getContext(), InsertBB);
1508
1509 BasicBlock *EntryBB = UI->getParent();
1510 BasicBlock *PRegEntryBB = EntryBB->splitBasicBlock(UI, "omp.par.entry");
1511 BasicBlock *PRegBodyBB = PRegEntryBB->splitBasicBlock(UI, "omp.par.region");
1512 BasicBlock *PRegPreFiniBB =
1513 PRegBodyBB->splitBasicBlock(UI, "omp.par.pre_finalize");
1514 BasicBlock *PRegExitBB = PRegPreFiniBB->splitBasicBlock(UI, "omp.par.exit");
1515
1516 auto FiniCBWrapper = [&](InsertPointTy IP) {
1517 // Hide "open-ended" blocks from the given FiniCB by setting the right jump
1518 // target to the region exit block.
1519 if (IP.getBlock()->end() == IP.getPoint()) {
1520 IRBuilder<>::InsertPointGuard IPG(Builder);
1521 Builder.restoreIP(IP);
1522 Instruction *I = Builder.CreateBr(PRegExitBB);
1523 IP = InsertPointTy(I->getParent(), I->getIterator());
1524 }
1525 assert(IP.getBlock()->getTerminator()->getNumSuccessors() == 1 &&
1526 IP.getBlock()->getTerminator()->getSuccessor(0) == PRegExitBB &&
1527 "Unexpected insertion point for finalization call!");
1528 return FiniCB(IP);
1529 };
1530
1531 FinalizationStack.push_back({FiniCBWrapper, OMPD_parallel, IsCancellable});
1532
1533 // Generate the privatization allocas in the block that will become the entry
1534 // of the outlined function.
1535 Builder.SetInsertPoint(PRegEntryBB->getTerminator());
1536 InsertPointTy InnerAllocaIP = Builder.saveIP();
1537
1538 AllocaInst *PrivTIDAddr =
1539 Builder.CreateAlloca(Int32, nullptr, "tid.addr.local");
1540 Instruction *PrivTID = Builder.CreateLoad(Int32, PrivTIDAddr, "tid");
1541
1542 // Add some fake uses for OpenMP provided arguments.
1543 ToBeDeleted.push_back(Builder.CreateLoad(Int32, TIDAddr, "tid.addr.use"));
1544 Instruction *ZeroAddrUse =
1545 Builder.CreateLoad(Int32, ZeroAddr, "zero.addr.use");
1546 ToBeDeleted.push_back(ZeroAddrUse);
1547
1548 // EntryBB
1549 //   |
1550 //   V
1551 // PRegionEntryBB   <- Privatization allocas are placed here.
1552 //   |
1553 //   V
1554 // PRegionBodyBB    <- BodyGen is invoked here.
1555 //   |
1556 //   V
1557 // PRegPreFiniBB    <- The block we will start finalization from.
1558 // | 1559 // V 1560 // PRegionExitBB <- A common exit to simplify block collection. 1561 // 1562 1563 LLVM_DEBUG(dbgs() << "Before body codegen: " << *OuterFn << "\n"); 1564 1565 // Let the caller create the body. 1566 assert(BodyGenCB && "Expected body generation callback!"); 1567 InsertPointTy CodeGenIP(PRegBodyBB, PRegBodyBB->begin()); 1568 if (Error Err = BodyGenCB(InnerAllocaIP, CodeGenIP)) 1569 return Err; 1570 1571 LLVM_DEBUG(dbgs() << "After body codegen: " << *OuterFn << "\n"); 1572 1573 OutlineInfo OI; 1574 if (Config.isTargetDevice()) { 1575 // Generate OpenMP target specific runtime call 1576 OI.PostOutlineCB = [=, ToBeDeletedVec = 1577 std::move(ToBeDeleted)](Function &OutlinedFn) { 1578 targetParallelCallback(this, OutlinedFn, OuterFn, OuterAllocaBlock, Ident, 1579 IfCondition, NumThreads, PrivTID, PrivTIDAddr, 1580 ThreadID, ToBeDeletedVec); 1581 }; 1582 } else { 1583 // Generate OpenMP host runtime call 1584 OI.PostOutlineCB = [=, ToBeDeletedVec = 1585 std::move(ToBeDeleted)](Function &OutlinedFn) { 1586 hostParallelCallback(this, OutlinedFn, OuterFn, Ident, IfCondition, 1587 PrivTID, PrivTIDAddr, ToBeDeletedVec); 1588 }; 1589 } 1590 1591 OI.OuterAllocaBB = OuterAllocaBlock; 1592 OI.EntryBB = PRegEntryBB; 1593 OI.ExitBB = PRegExitBB; 1594 1595 SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet; 1596 SmallVector<BasicBlock *, 32> Blocks; 1597 OI.collectBlocks(ParallelRegionBlockSet, Blocks); 1598 1599 // Ensure a single exit node for the outlined region by creating one. 1600 // We might have multiple incoming edges to the exit now due to finalizations, 1601 // e.g., cancel calls that cause the control flow to leave the region. 1602 BasicBlock *PRegOutlinedExitBB = PRegExitBB; 1603 PRegExitBB = SplitBlock(PRegExitBB, &*PRegExitBB->getFirstInsertionPt()); 1604 PRegOutlinedExitBB->setName("omp.par.outlined.exit"); 1605 Blocks.push_back(PRegOutlinedExitBB); 1606 1607 CodeExtractorAnalysisCache CEAC(*OuterFn); 1608 CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr, 1609 /* AggregateArgs */ false, 1610 /* BlockFrequencyInfo */ nullptr, 1611 /* BranchProbabilityInfo */ nullptr, 1612 /* AssumptionCache */ nullptr, 1613 /* AllowVarArgs */ true, 1614 /* AllowAlloca */ true, 1615 /* AllocationBlock */ OuterAllocaBlock, 1616 /* Suffix */ ".omp_par", ArgsInZeroAddressSpace); 1617 1618 // Find inputs to, outputs from the code region. 1619 BasicBlock *CommonExit = nullptr; 1620 SetVector<Value *> Inputs, Outputs, SinkingCands, HoistingCands; 1621 Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit); 1622 1623 Extractor.findInputsOutputs(Inputs, Outputs, SinkingCands, 1624 /*CollectGlobalInputs=*/true); 1625 1626 Inputs.remove_if([&](Value *I) { 1627 if (auto *GV = dyn_cast_if_present<GlobalVariable>(I)) 1628 return GV->getValueType() == OpenMPIRBuilder::Ident; 1629 1630 return false; 1631 }); 1632 1633 LLVM_DEBUG(dbgs() << "Before privatization: " << *OuterFn << "\n"); 1634 1635 FunctionCallee TIDRTLFn = 1636 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num); 1637 1638 auto PrivHelper = [&](Value &V) -> Error { 1639 if (&V == TIDAddr || &V == ZeroAddr) { 1640 OI.ExcludeArgsFromAggregate.push_back(&V); 1641 return Error::success(); 1642 } 1643 1644 SetVector<Use *> Uses; 1645 for (Use &U : V.uses()) 1646 if (auto *UserI = dyn_cast<Instruction>(U.getUser())) 1647 if (ParallelRegionBlockSet.count(UserI->getParent())) 1648 Uses.insert(&U); 1649 1650 // __kmpc_fork_call expects extra arguments as pointers. 
If the input 1651 // already has a pointer type, everything is fine. Otherwise, store the 1652 // value onto stack and load it back inside the to-be-outlined region. This 1653 // will ensure only the pointer will be passed to the function. 1654 // FIXME: if there are more than 15 trailing arguments, they must be 1655 // additionally packed in a struct. 1656 Value *Inner = &V; 1657 if (!V.getType()->isPointerTy()) { 1658 IRBuilder<>::InsertPointGuard Guard(Builder); 1659 LLVM_DEBUG(llvm::dbgs() << "Forwarding input as pointer: " << V << "\n"); 1660 1661 Builder.restoreIP(OuterAllocaIP); 1662 Value *Ptr = 1663 Builder.CreateAlloca(V.getType(), nullptr, V.getName() + ".reloaded"); 1664 1665 // Store to stack at end of the block that currently branches to the entry 1666 // block of the to-be-outlined region. 1667 Builder.SetInsertPoint(InsertBB, 1668 InsertBB->getTerminator()->getIterator()); 1669 Builder.CreateStore(&V, Ptr); 1670 1671 // Load back next to allocations in the to-be-outlined region. 1672 Builder.restoreIP(InnerAllocaIP); 1673 Inner = Builder.CreateLoad(V.getType(), Ptr); 1674 } 1675 1676 Value *ReplacementValue = nullptr; 1677 CallInst *CI = dyn_cast<CallInst>(&V); 1678 if (CI && CI->getCalledFunction() == TIDRTLFn.getCallee()) { 1679 ReplacementValue = PrivTID; 1680 } else { 1681 InsertPointOrErrorTy AfterIP = 1682 PrivCB(InnerAllocaIP, Builder.saveIP(), V, *Inner, ReplacementValue); 1683 if (!AfterIP) 1684 return AfterIP.takeError(); 1685 Builder.restoreIP(*AfterIP); 1686 InnerAllocaIP = { 1687 InnerAllocaIP.getBlock(), 1688 InnerAllocaIP.getBlock()->getTerminator()->getIterator()}; 1689 1690 assert(ReplacementValue && 1691 "Expected copy/create callback to set replacement value!"); 1692 if (ReplacementValue == &V) 1693 return Error::success(); 1694 } 1695 1696 for (Use *UPtr : Uses) 1697 UPtr->set(ReplacementValue); 1698 1699 return Error::success(); 1700 }; 1701 1702 // Reset the inner alloca insertion as it will be used for loading the values 1703 // wrapped into pointers before passing them into the to-be-outlined region. 1704 // Configure it to insert immediately after the fake use of zero address so 1705 // that they are available in the generated body and so that the 1706 // OpenMP-related values (thread ID and zero address pointers) remain leading 1707 // in the argument list. 1708 InnerAllocaIP = IRBuilder<>::InsertPoint( 1709 ZeroAddrUse->getParent(), ZeroAddrUse->getNextNode()->getIterator()); 1710 1711 // Reset the outer alloca insertion point to the entry of the relevant block 1712 // in case it was invalidated. 1713 OuterAllocaIP = IRBuilder<>::InsertPoint( 1714 OuterAllocaBlock, OuterAllocaBlock->getFirstInsertionPt()); 1715 1716 for (Value *Input : Inputs) { 1717 LLVM_DEBUG(dbgs() << "Captured input: " << *Input << "\n"); 1718 if (Error Err = PrivHelper(*Input)) 1719 return Err; 1720 } 1721 LLVM_DEBUG({ 1722 for (Value *Output : Outputs) 1723 LLVM_DEBUG(dbgs() << "Captured output: " << *Output << "\n"); 1724 }); 1725 assert(Outputs.empty() && 1726 "OpenMP outlining should not produce live-out values!"); 1727 1728 LLVM_DEBUG(dbgs() << "After privatization: " << *OuterFn << "\n"); 1729 LLVM_DEBUG({ 1730 for (auto *BB : Blocks) 1731 dbgs() << " PBR: " << BB->getName() << "\n"; 1732 }); 1733 1734 // Adjust the finalization stack, verify the adjustment, and call the 1735 // finalize function a last time to finalize values between the pre-fini 1736 // block and the exit block if we left the parallel "the normal way". 
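// (A cancellation exit, by contrast, runs its finalization through the
// FiniCBWrapper registered on the finalization stack above and branches to
// the exit block directly, so the pre-finalize call emitted here is not
// reached on that path.)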
1737 auto FiniInfo = FinalizationStack.pop_back_val();
1738 (void)FiniInfo;
1739 assert(FiniInfo.DK == OMPD_parallel &&
1740 "Unexpected finalization stack state!");
1741
1742 Instruction *PRegPreFiniTI = PRegPreFiniBB->getTerminator();
1743
1744 InsertPointTy PreFiniIP(PRegPreFiniBB, PRegPreFiniTI->getIterator());
1745 if (Error Err = FiniCB(PreFiniIP))
1746 return Err;
1747
1748 // Register the outlined info.
1749 addOutlineInfo(std::move(OI));
1750
1751 InsertPointTy AfterIP(UI->getParent(), UI->getParent()->end());
1752 UI->eraseFromParent();
1753
1754 return AfterIP;
1755 }
1756
1757 void OpenMPIRBuilder::emitFlush(const LocationDescription &Loc) {
1758 // Build call void __kmpc_flush(ident_t *loc)
1759 uint32_t SrcLocStrSize;
1760 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1761 Value *Args[] = {getOrCreateIdent(SrcLocStr, SrcLocStrSize)};
1762
1763 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_flush), Args);
1764 }
1765
1766 void OpenMPIRBuilder::createFlush(const LocationDescription &Loc) {
1767 if (!updateToLocation(Loc))
1768 return;
1769 emitFlush(Loc);
1770 }
1771
1772 void OpenMPIRBuilder::emitTaskwaitImpl(const LocationDescription &Loc) {
1773 // Build call kmp_int32 __kmpc_omp_taskwait(ident_t *loc, kmp_int32
1774 // global_tid);
1775 uint32_t SrcLocStrSize;
1776 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1777 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1778 Value *Args[] = {Ident, getOrCreateThreadID(Ident)};
1779
1780 // Ignore return result until untied tasks are supported.
1781 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_taskwait),
1782 Args);
1783 }
1784
1785 void OpenMPIRBuilder::createTaskwait(const LocationDescription &Loc) {
1786 if (!updateToLocation(Loc))
1787 return;
1788 emitTaskwaitImpl(Loc);
1789 }
1790
1791 void OpenMPIRBuilder::emitTaskyieldImpl(const LocationDescription &Loc) {
1792 // Build call __kmpc_omp_taskyield(loc, thread_id, 0);
1793 uint32_t SrcLocStrSize;
1794 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1795 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1796 Constant *I32Null = ConstantInt::getNullValue(Int32);
1797 Value *Args[] = {Ident, getOrCreateThreadID(Ident), I32Null};
1798
1799 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_taskyield),
1800 Args);
1801 }
1802
1803 void OpenMPIRBuilder::createTaskyield(const LocationDescription &Loc) {
1804 if (!updateToLocation(Loc))
1805 return;
1806 emitTaskyieldImpl(Loc);
1807 }
1808
1809 // Processes the dependencies in Dependencies and does the following:
1810 // - Allocates space on the stack for an array of DependInfo objects
1811 // - Populates each DependInfo object with relevant information about
1812 //   the corresponding dependence.
1813 // - All code is inserted in the entry block of the current function.
1814 static Value *emitTaskDependencies(
1815 OpenMPIRBuilder &OMPBuilder,
1816 const SmallVectorImpl<OpenMPIRBuilder::DependData> &Dependencies) {
1817 // Early return if we have no dependencies to process.
1818 if (Dependencies.empty())
1819 return nullptr;
1820
1821 // Given a vector of DependData objects, in this function we create an
1822 // array on the stack that holds kmp_dep_info objects corresponding
1823 // to each dependency. This is then passed to the OpenMP runtime.
1824 // For example, if there are 'n' dependencies then the following pseudo
1825 // code is generated. Assume the first dependence is on a variable 'a'.
1826 //
1827 // \code{c}
1828 // DepArray = alloc(n x sizeof(kmp_depend_info));
1829 // idx = 0;
1830 // DepArray[idx].base_addr = ptrtoint(&a);
1831 // DepArray[idx].len = 8;
1832 // DepArray[idx].flags = Dep.DepKind; /*(See OMPConstants.h for DepKind)*/
1833 // ++idx;
1834 // DepArray[idx].base_addr = ...;
1835 // \endcode
1836
1837 IRBuilderBase &Builder = OMPBuilder.Builder;
1838 Type *DependInfo = OMPBuilder.DependInfo;
1839 Module &M = OMPBuilder.M;
1840
1841 Value *DepArray = nullptr;
1842 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
1843 Builder.SetInsertPoint(
1844 OldIP.getBlock()->getParent()->getEntryBlock().getTerminator());
1845
1846 Type *DepArrayTy = ArrayType::get(DependInfo, Dependencies.size());
1847 DepArray = Builder.CreateAlloca(DepArrayTy, nullptr, ".dep.arr.addr");
1848
1849 for (const auto &[DepIdx, Dep] : enumerate(Dependencies)) {
1850 Value *Base =
1851 Builder.CreateConstInBoundsGEP2_64(DepArrayTy, DepArray, 0, DepIdx);
1852 // Store the pointer to the variable.
1853 Value *Addr = Builder.CreateStructGEP(
1854 DependInfo, Base,
1855 static_cast<unsigned int>(RTLDependInfoFields::BaseAddr));
1856 Value *DepValPtr = Builder.CreatePtrToInt(Dep.DepVal, Builder.getInt64Ty());
1857 Builder.CreateStore(DepValPtr, Addr);
1858 // Store the size of the variable.
1859 Value *Size = Builder.CreateStructGEP(
1860 DependInfo, Base, static_cast<unsigned int>(RTLDependInfoFields::Len));
1861 Builder.CreateStore(
1862 Builder.getInt64(M.getDataLayout().getTypeStoreSize(Dep.DepValueType)),
1863 Size);
1864 // Store the dependency kind.
1865 Value *Flags = Builder.CreateStructGEP(
1866 DependInfo, Base,
1867 static_cast<unsigned int>(RTLDependInfoFields::Flags));
1868 Builder.CreateStore(
1869 ConstantInt::get(Builder.getInt8Ty(),
1870 static_cast<unsigned int>(Dep.DepKind)),
1871 Flags);
1872 }
1873 Builder.restoreIP(OldIP);
1874 return DepArray;
1875 }
1876
1877 OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTask(
1878 const LocationDescription &Loc, InsertPointTy AllocaIP,
1879 BodyGenCallbackTy BodyGenCB, bool Tied, Value *Final, Value *IfCondition,
1880 SmallVector<DependData> Dependencies, bool Mergeable, Value *EventHandle,
1881 Value *Priority) {
1882
1883 if (!updateToLocation(Loc))
1884 return InsertPointTy();
1885
1886 uint32_t SrcLocStrSize;
1887 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1888 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1889 // The current basic block is split into four basic blocks. After outlining,
1890 // they will be mapped as follows:
1891 // ```
1892 // def current_fn() {
1893 //   current_basic_block:
1894 //     br label %task.exit
1895 //   task.exit:
1896 //     ; instructions after task
1897 // }
1898 // def outlined_fn() {
1899 //   task.alloca:
1900 //     br label %task.body
1901 //   task.body:
1902 //     ret void
1903 // }
1904 // ```
1905 BasicBlock *TaskExitBB = splitBB(Builder, /*CreateBranch=*/true, "task.exit");
1906 BasicBlock *TaskBodyBB = splitBB(Builder, /*CreateBranch=*/true, "task.body");
1907 BasicBlock *TaskAllocaBB =
1908 splitBB(Builder, /*CreateBranch=*/true, "task.alloca");
1909
1910 InsertPointTy TaskAllocaIP =
1911 InsertPointTy(TaskAllocaBB, TaskAllocaBB->begin());
1912 InsertPointTy TaskBodyIP = InsertPointTy(TaskBodyBB, TaskBodyBB->begin());
1913 if (Error Err = BodyGenCB(TaskAllocaIP, TaskBodyIP))
1914 return Err;
1915
1916 OutlineInfo OI;
1917 OI.EntryBB = TaskAllocaBB;
1918 OI.OuterAllocaBB = AllocaIP.getBlock();
1919 OI.ExitBB = TaskExitBB;
1920
1921 // Add the thread ID argument.
1922 SmallVector<Instruction *, 4> ToBeDeleted;
1923 OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
1924 Builder, AllocaIP, ToBeDeleted, TaskAllocaIP, "global.tid", false));
1925
1926 OI.PostOutlineCB = [this, Ident, Tied, Final, IfCondition, Dependencies,
1927 Mergeable, Priority, EventHandle, TaskAllocaBB,
1928 ToBeDeleted](Function &OutlinedFn) mutable {
1929 // Replace the stale CI with the appropriate RTL function call.
1930 assert(OutlinedFn.getNumUses() == 1 &&
1931 "there must be a single user for the outlined function");
1932 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
1933
1934 // HasShareds is true if any variables are captured in the outlined region,
1935 // false otherwise.
1936 bool HasShareds = StaleCI->arg_size() > 1;
1937 Builder.SetInsertPoint(StaleCI);
1938
1939 // Gather the arguments for emitting the runtime call for
1940 // @__kmpc_omp_task_alloc
1941 Function *TaskAllocFn =
1942 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc);
1943
1944 // Arguments - `loc_ref` (Ident) and `gtid` (ThreadID) for the runtime
1945 // call.
1946 Value *ThreadID = getOrCreateThreadID(Ident);
1947
1948 // Argument - `flags`
1949 // Task is tied iff (Flags & 1) == 1.
1950 // Task is untied iff (Flags & 1) == 0.
1951 // Task is final iff (Flags & 2) == 2.
1952 // Task is not final iff (Flags & 2) == 0.
1953 // Task is mergeable iff (Flags & 4) == 4.
1954 // Task is not mergeable iff (Flags & 4) == 0.
1955 // Task has priority iff (Flags & 32) == 32.
1956 // Task has no priority iff (Flags & 32) == 0.
1957 // TODO: Handle the other flags.
1958 Value *Flags = Builder.getInt32(Tied);
1959 if (Final) {
1960 Value *FinalFlag =
1961 Builder.CreateSelect(Final, Builder.getInt32(2), Builder.getInt32(0));
1962 Flags = Builder.CreateOr(FinalFlag, Flags);
1963 }
1964
1965 if (Mergeable)
1966 Flags = Builder.CreateOr(Builder.getInt32(4), Flags);
1967 if (Priority)
1968 Flags = Builder.CreateOr(Builder.getInt32(32), Flags);
1969
1970 // Argument - `sizeof_kmp_task_t` (TaskSize)
1971 // TaskSize refers to the size in bytes of the kmp_task_t data structure
1972 // including private vars accessed in the task.
1973 // TODO: add kmp_task_t_with_privates (privates)
1974 Value *TaskSize = Builder.getInt64(
1975 divideCeil(M.getDataLayout().getTypeSizeInBits(Task), 8));
1976
1977 // Argument - `sizeof_shareds` (SharedsSize)
1978 // SharedsSize refers to the shareds array size in the kmp_task_t data
1979 // structure.
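// For instance, for a task that captures an i32 and a pointer, the extractor
// packs them as { i32, ptr }, and on a typical 64-bit target SharedsSize
// would be its store size of 16 bytes; with no captures it stays zero.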
1980 Value *SharedsSize = Builder.getInt64(0); 1981 if (HasShareds) { 1982 AllocaInst *ArgStructAlloca = 1983 dyn_cast<AllocaInst>(StaleCI->getArgOperand(1)); 1984 assert(ArgStructAlloca && 1985 "Unable to find the alloca instruction corresponding to arguments " 1986 "for extracted function"); 1987 StructType *ArgStructType = 1988 dyn_cast<StructType>(ArgStructAlloca->getAllocatedType()); 1989 assert(ArgStructType && "Unable to find struct type corresponding to " 1990 "arguments for extracted function"); 1991 SharedsSize = 1992 Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType)); 1993 } 1994 // Emit the @__kmpc_omp_task_alloc runtime call 1995 // The runtime call returns a pointer to an area where the task captured 1996 // variables must be copied before the task is run (TaskData) 1997 CallInst *TaskData = Builder.CreateCall( 1998 TaskAllocFn, {/*loc_ref=*/Ident, /*gtid=*/ThreadID, /*flags=*/Flags, 1999 /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize, 2000 /*task_func=*/&OutlinedFn}); 2001 2002 // Emit detach clause initialization. 2003 // evt = (typeof(evt))__kmpc_task_allow_completion_event(loc, tid, 2004 // task_descriptor); 2005 if (EventHandle) { 2006 Function *TaskDetachFn = getOrCreateRuntimeFunctionPtr( 2007 OMPRTL___kmpc_task_allow_completion_event); 2008 llvm::Value *EventVal = 2009 Builder.CreateCall(TaskDetachFn, {Ident, ThreadID, TaskData}); 2010 llvm::Value *EventHandleAddr = 2011 Builder.CreatePointerBitCastOrAddrSpaceCast(EventHandle, 2012 Builder.getPtrTy(0)); 2013 EventVal = Builder.CreatePtrToInt(EventVal, Builder.getInt64Ty()); 2014 Builder.CreateStore(EventVal, EventHandleAddr); 2015 } 2016 // Copy the arguments for outlined function 2017 if (HasShareds) { 2018 Value *Shareds = StaleCI->getArgOperand(1); 2019 Align Alignment = TaskData->getPointerAlignment(M.getDataLayout()); 2020 Value *TaskShareds = Builder.CreateLoad(VoidPtr, TaskData); 2021 Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment, 2022 SharedsSize); 2023 } 2024 2025 if (Priority) { 2026 // 2027 // The return type of "__kmpc_omp_task_alloc" is "kmp_task_t *", 2028 // we populate the priority information into the "kmp_task_t" here 2029 // 2030 // The struct "kmp_task_t" definition is available in kmp.h 2031 // kmp_task_t = { shareds, routine, part_id, data1, data2 } 2032 // data2 is used for priority 2033 // 2034 Type *Int32Ty = Builder.getInt32Ty(); 2035 Constant *Zero = ConstantInt::get(Int32Ty, 0); 2036 // kmp_task_t* => { ptr } 2037 Type *TaskPtr = StructType::get(VoidPtr); 2038 Value *TaskGEP = 2039 Builder.CreateInBoundsGEP(TaskPtr, TaskData, {Zero, Zero}); 2040 // kmp_task_t => { ptr, ptr, i32, ptr, ptr } 2041 Type *TaskStructType = StructType::get( 2042 VoidPtr, VoidPtr, Builder.getInt32Ty(), VoidPtr, VoidPtr); 2043 Value *PriorityData = Builder.CreateInBoundsGEP( 2044 TaskStructType, TaskGEP, {Zero, ConstantInt::get(Int32Ty, 4)}); 2045 // kmp_cmplrdata_t => { ptr, ptr } 2046 Type *CmplrStructType = StructType::get(VoidPtr, VoidPtr); 2047 Value *CmplrData = Builder.CreateInBoundsGEP(CmplrStructType, 2048 PriorityData, {Zero, Zero}); 2049 Builder.CreateStore(Priority, CmplrData); 2050 } 2051 2052 Value *DepArray = nullptr; 2053 if (Dependencies.size()) { 2054 InsertPointTy OldIP = Builder.saveIP(); 2055 Builder.SetInsertPoint( 2056 &OldIP.getBlock()->getParent()->getEntryBlock().back()); 2057 2058 Type *DepArrayTy = ArrayType::get(DependInfo, Dependencies.size()); 2059 DepArray = Builder.CreateAlloca(DepArrayTy, nullptr, ".dep.arr.addr"); 2060 2061 unsigned P = 
0; 2062 for (const DependData &Dep : Dependencies) { 2063 Value *Base = 2064 Builder.CreateConstInBoundsGEP2_64(DepArrayTy, DepArray, 0, P); 2065 // Store the pointer to the variable 2066 Value *Addr = Builder.CreateStructGEP( 2067 DependInfo, Base, 2068 static_cast<unsigned int>(RTLDependInfoFields::BaseAddr)); 2069 Value *DepValPtr = 2070 Builder.CreatePtrToInt(Dep.DepVal, Builder.getInt64Ty()); 2071 Builder.CreateStore(DepValPtr, Addr); 2072 // Store the size of the variable 2073 Value *Size = Builder.CreateStructGEP( 2074 DependInfo, Base, 2075 static_cast<unsigned int>(RTLDependInfoFields::Len)); 2076 Builder.CreateStore(Builder.getInt64(M.getDataLayout().getTypeStoreSize( 2077 Dep.DepValueType)), 2078 Size); 2079 // Store the dependency kind 2080 Value *Flags = Builder.CreateStructGEP( 2081 DependInfo, Base, 2082 static_cast<unsigned int>(RTLDependInfoFields::Flags)); 2083 Builder.CreateStore( 2084 ConstantInt::get(Builder.getInt8Ty(), 2085 static_cast<unsigned int>(Dep.DepKind)), 2086 Flags); 2087 ++P; 2088 } 2089 2090 Builder.restoreIP(OldIP); 2091 } 2092 2093 // In the presence of the `if` clause, the following IR is generated: 2094 // ... 2095 // %data = call @__kmpc_omp_task_alloc(...) 2096 // br i1 %if_condition, label %then, label %else 2097 // then: 2098 // call @__kmpc_omp_task(...) 2099 // br label %exit 2100 // else: 2101 // ;; Wait for resolution of dependencies, if any, before 2102 // ;; beginning the task 2103 // call @__kmpc_omp_wait_deps(...) 2104 // call @__kmpc_omp_task_begin_if0(...) 2105 // call @outlined_fn(...) 2106 // call @__kmpc_omp_task_complete_if0(...) 2107 // br label %exit 2108 // exit: 2109 // ... 2110 if (IfCondition) { 2111 // `SplitBlockAndInsertIfThenElse` requires the block to have a 2112 // terminator. 2113 splitBB(Builder, /*CreateBranch=*/true, "if.end"); 2114 Instruction *IfTerminator = 2115 Builder.GetInsertPoint()->getParent()->getTerminator(); 2116 Instruction *ThenTI = IfTerminator, *ElseTI = nullptr; 2117 Builder.SetInsertPoint(IfTerminator); 2118 SplitBlockAndInsertIfThenElse(IfCondition, IfTerminator, &ThenTI, 2119 &ElseTI); 2120 Builder.SetInsertPoint(ElseTI); 2121 2122 if (Dependencies.size()) { 2123 Function *TaskWaitFn = 2124 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_wait_deps); 2125 Builder.CreateCall( 2126 TaskWaitFn, 2127 {Ident, ThreadID, Builder.getInt32(Dependencies.size()), DepArray, 2128 ConstantInt::get(Builder.getInt32Ty(), 0), 2129 ConstantPointerNull::get(PointerType::getUnqual(M.getContext()))}); 2130 } 2131 Function *TaskBeginFn = 2132 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_begin_if0); 2133 Function *TaskCompleteFn = 2134 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_complete_if0); 2135 Builder.CreateCall(TaskBeginFn, {Ident, ThreadID, TaskData}); 2136 CallInst *CI = nullptr; 2137 if (HasShareds) 2138 CI = Builder.CreateCall(&OutlinedFn, {ThreadID, TaskData}); 2139 else 2140 CI = Builder.CreateCall(&OutlinedFn, {ThreadID}); 2141 CI->setDebugLoc(StaleCI->getDebugLoc()); 2142 Builder.CreateCall(TaskCompleteFn, {Ident, ThreadID, TaskData}); 2143 Builder.SetInsertPoint(ThenTI); 2144 } 2145 2146 if (Dependencies.size()) { 2147 Function *TaskFn = 2148 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_with_deps); 2149 Builder.CreateCall( 2150 TaskFn, 2151 {Ident, ThreadID, TaskData, Builder.getInt32(Dependencies.size()), 2152 DepArray, ConstantInt::get(Builder.getInt32Ty(), 0), 2153 ConstantPointerNull::get(PointerType::getUnqual(M.getContext()))}); 2154 2155 } else { 2156 // Emit the 
@__kmpc_omp_task runtime call to spawn the task 2157 Function *TaskFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task); 2158 Builder.CreateCall(TaskFn, {Ident, ThreadID, TaskData}); 2159 } 2160 2161 StaleCI->eraseFromParent(); 2162 2163 Builder.SetInsertPoint(TaskAllocaBB, TaskAllocaBB->begin()); 2164 if (HasShareds) { 2165 LoadInst *Shareds = Builder.CreateLoad(VoidPtr, OutlinedFn.getArg(1)); 2166 OutlinedFn.getArg(1)->replaceUsesWithIf( 2167 Shareds, [Shareds](Use &U) { return U.getUser() != Shareds; }); 2168 } 2169 2170 for (Instruction *I : llvm::reverse(ToBeDeleted)) 2171 I->eraseFromParent(); 2172 }; 2173 2174 addOutlineInfo(std::move(OI)); 2175 Builder.SetInsertPoint(TaskExitBB, TaskExitBB->begin()); 2176 2177 return Builder.saveIP(); 2178 } 2179 2180 OpenMPIRBuilder::InsertPointOrErrorTy 2181 OpenMPIRBuilder::createTaskgroup(const LocationDescription &Loc, 2182 InsertPointTy AllocaIP, 2183 BodyGenCallbackTy BodyGenCB) { 2184 if (!updateToLocation(Loc)) 2185 return InsertPointTy(); 2186 2187 uint32_t SrcLocStrSize; 2188 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize); 2189 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize); 2190 Value *ThreadID = getOrCreateThreadID(Ident); 2191 2192 // Emit the @__kmpc_taskgroup runtime call to start the taskgroup 2193 Function *TaskgroupFn = 2194 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_taskgroup); 2195 Builder.CreateCall(TaskgroupFn, {Ident, ThreadID}); 2196 2197 BasicBlock *TaskgroupExitBB = splitBB(Builder, true, "taskgroup.exit"); 2198 if (Error Err = BodyGenCB(AllocaIP, Builder.saveIP())) 2199 return Err; 2200 2201 Builder.SetInsertPoint(TaskgroupExitBB); 2202 // Emit the @__kmpc_end_taskgroup runtime call to end the taskgroup 2203 Function *EndTaskgroupFn = 2204 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_taskgroup); 2205 Builder.CreateCall(EndTaskgroupFn, {Ident, ThreadID}); 2206 2207 return Builder.saveIP(); 2208 } 2209 2210 OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createSections( 2211 const LocationDescription &Loc, InsertPointTy AllocaIP, 2212 ArrayRef<StorableBodyGenCallbackTy> SectionCBs, PrivatizeCallbackTy PrivCB, 2213 FinalizeCallbackTy FiniCB, bool IsCancellable, bool IsNowait) { 2214 assert(!isConflictIP(AllocaIP, Loc.IP) && "Dedicated IP allocas required"); 2215 2216 if (!updateToLocation(Loc)) 2217 return Loc.IP; 2218 2219 auto FiniCBWrapper = [&](InsertPointTy IP) { 2220 if (IP.getBlock()->end() != IP.getPoint()) 2221 return FiniCB(IP); 2222 // This must be done otherwise any nested constructs using FinalizeOMPRegion 2223 // will fail because that function requires the Finalization Basic Block to 2224 // have a terminator, which is already removed by EmitOMPRegionBody. 2225 // IP is currently at cancelation block. 2226 // We need to backtrack to the condition block to fetch 2227 // the exit block and create a branch from cancelation 2228 // to exit block. 
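// A sketch of the CFG this backtracking assumes (shaped by the switch-based
// sections lowering in createSections):
//   cond.block:  br %cmp, label %body, label %exit  ; successor 1 == exit
//     -> ... -> case.block -> cancellation.block (== IP)
// Walking the single-predecessor links up from the case block recovers the
// condition block, whose second successor is the exit we branch to.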
2229 IRBuilder<>::InsertPointGuard IPG(Builder);
2230 Builder.restoreIP(IP);
2231 auto *CaseBB = IP.getBlock()->getSinglePredecessor();
2232 auto *CondBB = CaseBB->getSinglePredecessor()->getSinglePredecessor();
2233 auto *ExitBB = CondBB->getTerminator()->getSuccessor(1);
2234 Instruction *I = Builder.CreateBr(ExitBB);
2235 IP = InsertPointTy(I->getParent(), I->getIterator());
2236 return FiniCB(IP);
2237 };
2238
2239 FinalizationStack.push_back({FiniCBWrapper, OMPD_sections, IsCancellable});
2240
2241 // Each section is emitted as a switch case.
2242 // Each finalization callback is handled from clang.EmitOMPSectionDirective()
2243 // -> OMP.createSection(), which generates the IR for each section.
2244 // Iterate through all sections and emit a switch construct:
2245 // switch (IV) {
2246 // case 0:
2247 //   <SectionStmt[0]>;
2248 //   break;
2249 // ...
2250 // case <NumSection> - 1:
2251 //   <SectionStmt[<NumSection> - 1]>;
2252 //   break;
2253 // }
2254 // ...
2255 // section_loop.after:
2256 // <FiniCB>;
2257 auto LoopBodyGenCB = [&](InsertPointTy CodeGenIP, Value *IndVar) -> Error {
2258 Builder.restoreIP(CodeGenIP);
2259 BasicBlock *Continue =
2260 splitBBWithSuffix(Builder, /*CreateBranch=*/false, ".sections.after");
2261 Function *CurFn = Continue->getParent();
2262 SwitchInst *SwitchStmt = Builder.CreateSwitch(IndVar, Continue);
2263
2264 unsigned CaseNumber = 0;
2265 for (auto SectionCB : SectionCBs) {
2266 BasicBlock *CaseBB = BasicBlock::Create(
2267 M.getContext(), "omp_section_loop.body.case", CurFn, Continue);
2268 SwitchStmt->addCase(Builder.getInt32(CaseNumber), CaseBB);
2269 Builder.SetInsertPoint(CaseBB);
2270 BranchInst *CaseEndBr = Builder.CreateBr(Continue);
2271 if (Error Err = SectionCB(InsertPointTy(), {CaseEndBr->getParent(),
2272 CaseEndBr->getIterator()}))
2273 return Err;
2274 CaseNumber++;
2275 }
2276 // Remove the existing terminator from the body BB since there can be no
2277 // terminators after a switch/case.
2278 return Error::success();
2279 };
2280 // Loop body ends here.
2281 // LowerBound, UpperBound, and Stride for createCanonicalLoop.
2282 Type *I32Ty = Type::getInt32Ty(M.getContext());
2283 Value *LB = ConstantInt::get(I32Ty, 0);
2284 Value *UB = ConstantInt::get(I32Ty, SectionCBs.size());
2285 Value *ST = ConstantInt::get(I32Ty, 1);
2286 Expected<CanonicalLoopInfo *> LoopInfo = createCanonicalLoop(
2287 Loc, LoopBodyGenCB, LB, UB, ST, true, false, AllocaIP, "section_loop");
2288 if (!LoopInfo)
2289 return LoopInfo.takeError();
2290
2291 InsertPointOrErrorTy WsloopIP =
2292 applyStaticWorkshareLoop(Loc.DL, *LoopInfo, AllocaIP, !IsNowait);
2293 if (!WsloopIP)
2294 return WsloopIP.takeError();
2295 InsertPointTy AfterIP = *WsloopIP;
2296
2297 // Apply the finalization callback in LoopAfterBB.
2298 auto FiniInfo = FinalizationStack.pop_back_val();
2299 assert(FiniInfo.DK == OMPD_sections &&
2300 "Unexpected finalization stack state!");
2301 if (FinalizeCallbackTy &CB = FiniInfo.FiniCB) {
2302 Builder.restoreIP(AfterIP);
2303 BasicBlock *FiniBB =
2304 splitBBWithSuffix(Builder, /*CreateBranch=*/true, "sections.fini");
2305 if (Error Err = CB(Builder.saveIP()))
2306 return Err;
2307 AfterIP = {FiniBB, FiniBB->begin()};
2308 }
2309
2310 return AfterIP;
2311 }
2312
2313 OpenMPIRBuilder::InsertPointOrErrorTy
2314 OpenMPIRBuilder::createSection(const LocationDescription &Loc,
2315 BodyGenCallbackTy BodyGenCB,
2316 FinalizeCallbackTy FiniCB) {
2317 if (!updateToLocation(Loc))
2318 return Loc.IP;
2319
2320 auto FiniCBWrapper = [&](InsertPointTy IP) {
2321 if
(IP.getBlock()->end() != IP.getPoint()) 2322 return FiniCB(IP); 2323 // This must be done otherwise any nested constructs using FinalizeOMPRegion 2324 // will fail because that function requires the Finalization Basic Block to 2325 // have a terminator, which is already removed by EmitOMPRegionBody. 2326 // IP is currently at cancelation block. 2327 // We need to backtrack to the condition block to fetch 2328 // the exit block and create a branch from cancelation 2329 // to exit block. 2330 IRBuilder<>::InsertPointGuard IPG(Builder); 2331 Builder.restoreIP(IP); 2332 auto *CaseBB = Loc.IP.getBlock(); 2333 auto *CondBB = CaseBB->getSinglePredecessor()->getSinglePredecessor(); 2334 auto *ExitBB = CondBB->getTerminator()->getSuccessor(1); 2335 Instruction *I = Builder.CreateBr(ExitBB); 2336 IP = InsertPointTy(I->getParent(), I->getIterator()); 2337 return FiniCB(IP); 2338 }; 2339 2340 Directive OMPD = Directive::OMPD_sections; 2341 // Since we are using Finalization Callback here, HasFinalize 2342 // and IsCancellable have to be true 2343 return EmitOMPInlinedRegion(OMPD, nullptr, nullptr, BodyGenCB, FiniCBWrapper, 2344 /*Conditional*/ false, /*hasFinalize*/ true, 2345 /*IsCancellable*/ true); 2346 } 2347 2348 static OpenMPIRBuilder::InsertPointTy getInsertPointAfterInstr(Instruction *I) { 2349 BasicBlock::iterator IT(I); 2350 IT++; 2351 return OpenMPIRBuilder::InsertPointTy(I->getParent(), IT); 2352 } 2353 2354 Value *OpenMPIRBuilder::getGPUThreadID() { 2355 return Builder.CreateCall( 2356 getOrCreateRuntimeFunction(M, 2357 OMPRTL___kmpc_get_hardware_thread_id_in_block), 2358 {}); 2359 } 2360 2361 Value *OpenMPIRBuilder::getGPUWarpSize() { 2362 return Builder.CreateCall( 2363 getOrCreateRuntimeFunction(M, OMPRTL___kmpc_get_warp_size), {}); 2364 } 2365 2366 Value *OpenMPIRBuilder::getNVPTXWarpID() { 2367 unsigned LaneIDBits = Log2_32(Config.getGridValue().GV_Warp_Size); 2368 return Builder.CreateAShr(getGPUThreadID(), LaneIDBits, "nvptx_warp_id"); 2369 } 2370 2371 Value *OpenMPIRBuilder::getNVPTXLaneID() { 2372 unsigned LaneIDBits = Log2_32(Config.getGridValue().GV_Warp_Size); 2373 assert(LaneIDBits < 32 && "Invalid LaneIDBits size in NVPTX device."); 2374 unsigned LaneIDMask = ~0u >> (32u - LaneIDBits); 2375 return Builder.CreateAnd(getGPUThreadID(), Builder.getInt32(LaneIDMask), 2376 "nvptx_lane_id"); 2377 } 2378 2379 Value *OpenMPIRBuilder::castValueToType(InsertPointTy AllocaIP, Value *From, 2380 Type *ToType) { 2381 Type *FromType = From->getType(); 2382 uint64_t FromSize = M.getDataLayout().getTypeStoreSize(FromType); 2383 uint64_t ToSize = M.getDataLayout().getTypeStoreSize(ToType); 2384 assert(FromSize > 0 && "From size must be greater than zero"); 2385 assert(ToSize > 0 && "To size must be greater than zero"); 2386 if (FromType == ToType) 2387 return From; 2388 if (FromSize == ToSize) 2389 return Builder.CreateBitCast(From, ToType); 2390 if (ToType->isIntegerTy() && FromType->isIntegerTy()) 2391 return Builder.CreateIntCast(From, ToType, /*isSigned*/ true); 2392 InsertPointTy SaveIP = Builder.saveIP(); 2393 Builder.restoreIP(AllocaIP); 2394 Value *CastItem = Builder.CreateAlloca(ToType); 2395 Builder.restoreIP(SaveIP); 2396 2397 Value *ValCastItem = Builder.CreatePointerBitCastOrAddrSpaceCast( 2398 CastItem, Builder.getPtrTy(0)); 2399 Builder.CreateStore(From, ValCastItem); 2400 return Builder.CreateLoad(ToType, CastItem); 2401 } 2402 2403 Value *OpenMPIRBuilder::createRuntimeShuffleFunction(InsertPointTy AllocaIP, 2404 Value *Element, 2405 Type *ElementType, 2406 Value *Offset) { 
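// A note on the mechanics below: elements of up to 4 bytes are shuffled via
// __kmpc_shuffle_int32 and wider ones (at most 8 bytes, per the assert) via
// __kmpc_shuffle_int64; castValueToType first widens the value to the
// matching integer type so the runtime call signature is satisfied.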
2407 uint64_t Size = M.getDataLayout().getTypeStoreSize(ElementType); 2408 assert(Size <= 8 && "Unsupported bitwidth in shuffle instruction"); 2409 2410 // Cast all types to 32- or 64-bit values before calling shuffle routines. 2411 Type *CastTy = Builder.getIntNTy(Size <= 4 ? 32 : 64); 2412 Value *ElemCast = castValueToType(AllocaIP, Element, CastTy); 2413 Value *WarpSize = 2414 Builder.CreateIntCast(getGPUWarpSize(), Builder.getInt16Ty(), true); 2415 Function *ShuffleFunc = getOrCreateRuntimeFunctionPtr( 2416 Size <= 4 ? RuntimeFunction::OMPRTL___kmpc_shuffle_int32 2417 : RuntimeFunction::OMPRTL___kmpc_shuffle_int64); 2418 Value *WarpSizeCast = 2419 Builder.CreateIntCast(WarpSize, Builder.getInt16Ty(), /*isSigned=*/true); 2420 Value *ShuffleCall = 2421 Builder.CreateCall(ShuffleFunc, {ElemCast, Offset, WarpSizeCast}); 2422 return castValueToType(AllocaIP, ShuffleCall, CastTy); 2423 } 2424 2425 void OpenMPIRBuilder::shuffleAndStore(InsertPointTy AllocaIP, Value *SrcAddr, 2426 Value *DstAddr, Type *ElemType, 2427 Value *Offset, Type *ReductionArrayTy) { 2428 uint64_t Size = M.getDataLayout().getTypeStoreSize(ElemType); 2429 // Create the loop over the big sized data. 2430 // ptr = (void*)Elem; 2431 // ptrEnd = (void*) Elem + 1; 2432 // Step = 8; 2433 // while (ptr + Step < ptrEnd) 2434 // shuffle((int64_t)*ptr); 2435 // Step = 4; 2436 // while (ptr + Step < ptrEnd) 2437 // shuffle((int32_t)*ptr); 2438 // ... 2439 Type *IndexTy = Builder.getIndexTy( 2440 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace()); 2441 Value *ElemPtr = DstAddr; 2442 Value *Ptr = SrcAddr; 2443 for (unsigned IntSize = 8; IntSize >= 1; IntSize /= 2) { 2444 if (Size < IntSize) 2445 continue; 2446 Type *IntType = Builder.getIntNTy(IntSize * 8); 2447 Ptr = Builder.CreatePointerBitCastOrAddrSpaceCast( 2448 Ptr, Builder.getPtrTy(0), Ptr->getName() + ".ascast"); 2449 Value *SrcAddrGEP = 2450 Builder.CreateGEP(ElemType, SrcAddr, {ConstantInt::get(IndexTy, 1)}); 2451 ElemPtr = Builder.CreatePointerBitCastOrAddrSpaceCast( 2452 ElemPtr, Builder.getPtrTy(0), ElemPtr->getName() + ".ascast"); 2453 2454 Function *CurFunc = Builder.GetInsertBlock()->getParent(); 2455 if ((Size / IntSize) > 1) { 2456 Value *PtrEnd = Builder.CreatePointerBitCastOrAddrSpaceCast( 2457 SrcAddrGEP, Builder.getPtrTy()); 2458 BasicBlock *PreCondBB = 2459 BasicBlock::Create(M.getContext(), ".shuffle.pre_cond"); 2460 BasicBlock *ThenBB = BasicBlock::Create(M.getContext(), ".shuffle.then"); 2461 BasicBlock *ExitBB = BasicBlock::Create(M.getContext(), ".shuffle.exit"); 2462 BasicBlock *CurrentBB = Builder.GetInsertBlock(); 2463 emitBlock(PreCondBB, CurFunc); 2464 PHINode *PhiSrc = 2465 Builder.CreatePHI(Ptr->getType(), /*NumReservedValues=*/2); 2466 PhiSrc->addIncoming(Ptr, CurrentBB); 2467 PHINode *PhiDest = 2468 Builder.CreatePHI(ElemPtr->getType(), /*NumReservedValues=*/2); 2469 PhiDest->addIncoming(ElemPtr, CurrentBB); 2470 Ptr = PhiSrc; 2471 ElemPtr = PhiDest; 2472 Value *PtrDiff = Builder.CreatePtrDiff( 2473 Builder.getInt8Ty(), PtrEnd, 2474 Builder.CreatePointerBitCastOrAddrSpaceCast(Ptr, Builder.getPtrTy())); 2475 Builder.CreateCondBr( 2476 Builder.CreateICmpSGT(PtrDiff, Builder.getInt64(IntSize - 1)), ThenBB, 2477 ExitBB); 2478 emitBlock(ThenBB, CurFunc); 2479 Value *Res = createRuntimeShuffleFunction( 2480 AllocaIP, 2481 Builder.CreateAlignedLoad( 2482 IntType, Ptr, M.getDataLayout().getPrefTypeAlign(ElemType)), 2483 IntType, Offset); 2484 Builder.CreateAlignedStore(Res, ElemPtr, 2485 M.getDataLayout().getPrefTypeAlign(ElemType)); 
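// Advance the source and destination cursors past the IntSize bytes that
// were just shuffled, then loop back to the precondition check.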
2486 Value *LocalPtr =
2487 Builder.CreateGEP(IntType, Ptr, {ConstantInt::get(IndexTy, 1)});
2488 Value *LocalElemPtr =
2489 Builder.CreateGEP(IntType, ElemPtr, {ConstantInt::get(IndexTy, 1)});
2490 PhiSrc->addIncoming(LocalPtr, ThenBB);
2491 PhiDest->addIncoming(LocalElemPtr, ThenBB);
2492 emitBranch(PreCondBB);
2493 emitBlock(ExitBB, CurFunc);
2494 } else {
2495 Value *Res = createRuntimeShuffleFunction(
2496 AllocaIP, Builder.CreateLoad(IntType, Ptr), IntType, Offset);
2497 if (ElemType->isIntegerTy() && ElemType->getScalarSizeInBits() <
2498 Res->getType()->getScalarSizeInBits())
2499 Res = Builder.CreateTrunc(Res, ElemType);
2500 Builder.CreateStore(Res, ElemPtr);
2501 Ptr = Builder.CreateGEP(IntType, Ptr, {ConstantInt::get(IndexTy, 1)});
2502 ElemPtr =
2503 Builder.CreateGEP(IntType, ElemPtr, {ConstantInt::get(IndexTy, 1)});
2504 }
2505 Size = Size % IntSize;
2506 }
2507 }
2508
2509 void OpenMPIRBuilder::emitReductionListCopy(
2510 InsertPointTy AllocaIP, CopyAction Action, Type *ReductionArrayTy,
2511 ArrayRef<ReductionInfo> ReductionInfos, Value *SrcBase, Value *DestBase,
2512 CopyOptionsTy CopyOptions) {
2513 Type *IndexTy = Builder.getIndexTy(
2514 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
2515 Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
2516
2517 // Iterates, element by element, through the source Reduce list and
2518 // makes a copy.
2519 for (auto En : enumerate(ReductionInfos)) {
2520 const ReductionInfo &RI = En.value();
2521 Value *SrcElementAddr = nullptr;
2522 Value *DestElementAddr = nullptr;
2523 Value *DestElementPtrAddr = nullptr;
2524 // Should we shuffle in an element from a remote lane?
2525 bool ShuffleInElement = false;
2526 // Set to true to update the pointer in the dest Reduce list to a
2527 // newly created element.
2528 bool UpdateDestListPtr = false;
2529
2530 // Step 1.1: Get the address for the src element in the Reduce list.
2531 Value *SrcElementPtrAddr = Builder.CreateInBoundsGEP(
2532 ReductionArrayTy, SrcBase,
2533 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
2534 SrcElementAddr = Builder.CreateLoad(Builder.getPtrTy(), SrcElementPtrAddr);
2535
2536 // Step 1.2: Create a temporary to store the element in the destination
2537 // Reduce list.
2538 DestElementPtrAddr = Builder.CreateInBoundsGEP(
2539 ReductionArrayTy, DestBase,
2540 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
2541 switch (Action) {
2542 case CopyAction::RemoteLaneToThread: {
2543 InsertPointTy CurIP = Builder.saveIP();
2544 Builder.restoreIP(AllocaIP);
2545 AllocaInst *DestAlloca = Builder.CreateAlloca(RI.ElementType, nullptr,
2546 ".omp.reduction.element");
2547 DestAlloca->setAlignment(
2548 M.getDataLayout().getPrefTypeAlign(RI.ElementType));
2549 DestElementAddr = DestAlloca;
2550 DestElementAddr =
2551 Builder.CreateAddrSpaceCast(DestElementAddr, Builder.getPtrTy(),
2552 DestElementAddr->getName() + ".ascast");
2553 Builder.restoreIP(CurIP);
2554 ShuffleInElement = true;
2555 UpdateDestListPtr = true;
2556 break;
2557 }
2558 case CopyAction::ThreadCopy: {
2559 DestElementAddr =
2560 Builder.CreateLoad(Builder.getPtrTy(), DestElementPtrAddr);
2561 break;
2562 }
2563 }
2564
2565 // Now that all active lanes have read the element in the
2566 // Reduce list, shuffle over the value from the remote lane.
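// (When no shuffle is needed, the else branch below performs a plain
// intra-thread copy: scalars via load/store, complex values field by
// field, and aggregates via memcpy.)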
2567 if (ShuffleInElement) { 2568 shuffleAndStore(AllocaIP, SrcElementAddr, DestElementAddr, RI.ElementType, 2569 RemoteLaneOffset, ReductionArrayTy); 2570 } else { 2571 switch (RI.EvaluationKind) { 2572 case EvalKind::Scalar: { 2573 Value *Elem = Builder.CreateLoad(RI.ElementType, SrcElementAddr); 2574 // Store the source element value to the dest element address. 2575 Builder.CreateStore(Elem, DestElementAddr); 2576 break; 2577 } 2578 case EvalKind::Complex: { 2579 Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32( 2580 RI.ElementType, SrcElementAddr, 0, 0, ".realp"); 2581 Value *SrcReal = Builder.CreateLoad( 2582 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real"); 2583 Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32( 2584 RI.ElementType, SrcElementAddr, 0, 1, ".imagp"); 2585 Value *SrcImg = Builder.CreateLoad( 2586 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag"); 2587 2588 Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32( 2589 RI.ElementType, DestElementAddr, 0, 0, ".realp"); 2590 Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32( 2591 RI.ElementType, DestElementAddr, 0, 1, ".imagp"); 2592 Builder.CreateStore(SrcReal, DestRealPtr); 2593 Builder.CreateStore(SrcImg, DestImgPtr); 2594 break; 2595 } 2596 case EvalKind::Aggregate: { 2597 Value *SizeVal = Builder.getInt64( 2598 M.getDataLayout().getTypeStoreSize(RI.ElementType)); 2599 Builder.CreateMemCpy( 2600 DestElementAddr, M.getDataLayout().getPrefTypeAlign(RI.ElementType), 2601 SrcElementAddr, M.getDataLayout().getPrefTypeAlign(RI.ElementType), 2602 SizeVal, false); 2603 break; 2604 } 2605 }; 2606 } 2607 2608 // Step 3.1: Modify reference in dest Reduce list as needed. 2609 // Modifying the reference in Reduce list to point to the newly 2610 // created element. The element is live in the current function 2611 // scope and that of functions it invokes (i.e., reduce_function). 2612 // RemoteReduceData[i] = (void*)&RemoteElem 2613 if (UpdateDestListPtr) { 2614 Value *CastDestAddr = Builder.CreatePointerBitCastOrAddrSpaceCast( 2615 DestElementAddr, Builder.getPtrTy(), 2616 DestElementAddr->getName() + ".ascast"); 2617 Builder.CreateStore(CastDestAddr, DestElementPtrAddr); 2618 } 2619 } 2620 } 2621 2622 Expected<Function *> OpenMPIRBuilder::emitInterWarpCopyFunction( 2623 const LocationDescription &Loc, ArrayRef<ReductionInfo> ReductionInfos, 2624 AttributeList FuncAttrs) { 2625 InsertPointTy SavedIP = Builder.saveIP(); 2626 LLVMContext &Ctx = M.getContext(); 2627 FunctionType *FuncTy = FunctionType::get( 2628 Builder.getVoidTy(), {Builder.getPtrTy(), Builder.getInt32Ty()}, 2629 /* IsVarArg */ false); 2630 Function *WcFunc = 2631 Function::Create(FuncTy, GlobalVariable::InternalLinkage, 2632 "_omp_reduction_inter_warp_copy_func", &M); 2633 WcFunc->setAttributes(FuncAttrs); 2634 WcFunc->addParamAttr(0, Attribute::NoUndef); 2635 WcFunc->addParamAttr(1, Attribute::NoUndef); 2636 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", WcFunc); 2637 Builder.SetInsertPoint(EntryBB); 2638 2639 // ReduceList: thread local Reduce list. 2640 // At the stage of the computation when this function is called, partially 2641 // aggregated values reside in the first lane of every active warp. 2642 Argument *ReduceListArg = WcFunc->getArg(0); 2643 // NumWarps: number of warps active in the parallel region. This could 2644 // be smaller than 32 (max warps in a CTA) for partial block reduction. 
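// In outline, the copy protocol emitted below is: the first lane of each
// warp publishes its partial value into a __shared__ slot indexed by warp
// ID, all threads synchronize on a barrier, and then the first NumWarps
// threads of warp 0 read the slots back into their own reduce lists.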
2645 Argument *NumWarpsArg = WcFunc->getArg(1); 2646 2647 // This array is used as a medium to transfer, one reduce element at a time, 2648 // the data from the first lane of every warp to lanes in the first warp 2649 // in order to perform the final step of a reduction in a parallel region 2650 // (reduction across warps). The array is placed in NVPTX __shared__ memory 2651 // for reduced latency, as well as to have a distinct copy for concurrently 2652 // executing target regions. The array is declared with common linkage so 2653 // as to be shared across compilation units. 2654 StringRef TransferMediumName = 2655 "__openmp_nvptx_data_transfer_temporary_storage"; 2656 GlobalVariable *TransferMedium = M.getGlobalVariable(TransferMediumName); 2657 unsigned WarpSize = Config.getGridValue().GV_Warp_Size; 2658 ArrayType *ArrayTy = ArrayType::get(Builder.getInt32Ty(), WarpSize); 2659 if (!TransferMedium) { 2660 TransferMedium = new GlobalVariable( 2661 M, ArrayTy, /*isConstant=*/false, GlobalVariable::WeakAnyLinkage, 2662 UndefValue::get(ArrayTy), TransferMediumName, 2663 /*InsertBefore=*/nullptr, GlobalVariable::NotThreadLocal, 2664 /*AddressSpace=*/3); 2665 } 2666 2667 // Get the CUDA thread id of the current OpenMP thread on the GPU. 2668 Value *GPUThreadID = getGPUThreadID(); 2669 // nvptx_lane_id = nvptx_id % warpsize 2670 Value *LaneID = getNVPTXLaneID(); 2671 // nvptx_warp_id = nvptx_id / warpsize 2672 Value *WarpID = getNVPTXWarpID(); 2673 2674 InsertPointTy AllocaIP = 2675 InsertPointTy(Builder.GetInsertBlock(), 2676 Builder.GetInsertBlock()->getFirstInsertionPt()); 2677 Type *Arg0Type = ReduceListArg->getType(); 2678 Type *Arg1Type = NumWarpsArg->getType(); 2679 Builder.restoreIP(AllocaIP); 2680 AllocaInst *ReduceListAlloca = Builder.CreateAlloca( 2681 Arg0Type, nullptr, ReduceListArg->getName() + ".addr"); 2682 AllocaInst *NumWarpsAlloca = 2683 Builder.CreateAlloca(Arg1Type, nullptr, NumWarpsArg->getName() + ".addr"); 2684 Value *ReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast( 2685 ReduceListAlloca, Arg0Type, ReduceListAlloca->getName() + ".ascast"); 2686 Value *NumWarpsAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast( 2687 NumWarpsAlloca, Builder.getPtrTy(0), 2688 NumWarpsAlloca->getName() + ".ascast"); 2689 Builder.CreateStore(ReduceListArg, ReduceListAddrCast); 2690 Builder.CreateStore(NumWarpsArg, NumWarpsAddrCast); 2691 AllocaIP = getInsertPointAfterInstr(NumWarpsAlloca); 2692 InsertPointTy CodeGenIP = 2693 getInsertPointAfterInstr(&Builder.GetInsertBlock()->back()); 2694 Builder.restoreIP(CodeGenIP); 2695 2696 Value *ReduceList = 2697 Builder.CreateLoad(Builder.getPtrTy(), ReduceListAddrCast); 2698 2699 for (auto En : enumerate(ReductionInfos)) { 2700 // 2701 // Warp master copies reduce element to transfer medium in __shared__ 2702 // memory. 
2703 // 2704 const ReductionInfo &RI = En.value(); 2705 unsigned RealTySize = M.getDataLayout().getTypeAllocSize(RI.ElementType); 2706 for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /= 2) { 2707 Type *CType = Builder.getIntNTy(TySize * 8); 2708 2709 unsigned NumIters = RealTySize / TySize; 2710 if (NumIters == 0) 2711 continue; 2712 Value *Cnt = nullptr; 2713 Value *CntAddr = nullptr; 2714 BasicBlock *PrecondBB = nullptr; 2715 BasicBlock *ExitBB = nullptr; 2716 if (NumIters > 1) { 2717 CodeGenIP = Builder.saveIP(); 2718 Builder.restoreIP(AllocaIP); 2719 CntAddr = 2720 Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, ".cnt.addr"); 2721 2722 CntAddr = Builder.CreateAddrSpaceCast(CntAddr, Builder.getPtrTy(), 2723 CntAddr->getName() + ".ascast"); 2724 Builder.restoreIP(CodeGenIP); 2725 Builder.CreateStore(Constant::getNullValue(Builder.getInt32Ty()), 2726 CntAddr, 2727 /*Volatile=*/false); 2728 PrecondBB = BasicBlock::Create(Ctx, "precond"); 2729 ExitBB = BasicBlock::Create(Ctx, "exit"); 2730 BasicBlock *BodyBB = BasicBlock::Create(Ctx, "body"); 2731 emitBlock(PrecondBB, Builder.GetInsertBlock()->getParent()); 2732 Cnt = Builder.CreateLoad(Builder.getInt32Ty(), CntAddr, 2733 /*Volatile=*/false); 2734 Value *Cmp = Builder.CreateICmpULT( 2735 Cnt, ConstantInt::get(Builder.getInt32Ty(), NumIters)); 2736 Builder.CreateCondBr(Cmp, BodyBB, ExitBB); 2737 emitBlock(BodyBB, Builder.GetInsertBlock()->getParent()); 2738 } 2739 2740 // kmpc_barrier. 2741 InsertPointOrErrorTy BarrierIP1 = 2742 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL), 2743 omp::Directive::OMPD_unknown, 2744 /* ForceSimpleCall */ false, 2745 /* CheckCancelFlag */ true); 2746 if (!BarrierIP1) 2747 return BarrierIP1.takeError(); 2748 BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then"); 2749 BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else"); 2750 BasicBlock *MergeBB = BasicBlock::Create(Ctx, "ifcont"); 2751 2752 // if (lane_id == 0) 2753 Value *IsWarpMaster = Builder.CreateIsNull(LaneID, "warp_master"); 2754 Builder.CreateCondBr(IsWarpMaster, ThenBB, ElseBB); 2755 emitBlock(ThenBB, Builder.GetInsertBlock()->getParent()); 2756 2757 // Reduce element = LocalReduceList[i] 2758 auto *RedListArrayTy = 2759 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size()); 2760 Type *IndexTy = Builder.getIndexTy( 2761 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace()); 2762 Value *ElemPtrPtr = 2763 Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList, 2764 {ConstantInt::get(IndexTy, 0), 2765 ConstantInt::get(IndexTy, En.index())}); 2766 // elemptr = ((CopyType*)(elemptrptr)) + I 2767 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr); 2768 if (NumIters > 1) 2769 ElemPtr = Builder.CreateGEP(Builder.getInt32Ty(), ElemPtr, Cnt); 2770 2771 // Get pointer to location in transfer medium. 2772 // MediumPtr = &medium[warp_id] 2773 Value *MediumPtr = Builder.CreateInBoundsGEP( 2774 ArrayTy, TransferMedium, {Builder.getInt64(0), WarpID}); 2775 // elem = *elemptr 2776 //*MediumPtr = elem 2777 Value *Elem = Builder.CreateLoad(CType, ElemPtr); 2778 // Store the source element value to the dest element address. 
2779 Builder.CreateStore(Elem, MediumPtr,
2780 /*IsVolatile*/ true);
2781 Builder.CreateBr(MergeBB);
2782
2783 // else
2784 emitBlock(ElseBB, Builder.GetInsertBlock()->getParent());
2785 Builder.CreateBr(MergeBB);
2786
2787 // endif
2788 emitBlock(MergeBB, Builder.GetInsertBlock()->getParent());
2789 InsertPointOrErrorTy BarrierIP2 =
2790 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
2791 omp::Directive::OMPD_unknown,
2792 /* ForceSimpleCall */ false,
2793 /* CheckCancelFlag */ true);
2794 if (!BarrierIP2)
2795 return BarrierIP2.takeError();
2796
2797 // Warp 0 copies the reduce element from the transfer medium.
2798 BasicBlock *W0ThenBB = BasicBlock::Create(Ctx, "then");
2799 BasicBlock *W0ElseBB = BasicBlock::Create(Ctx, "else");
2800 BasicBlock *W0MergeBB = BasicBlock::Create(Ctx, "ifcont");
2801
2802 Value *NumWarpsVal =
2803 Builder.CreateLoad(Builder.getInt32Ty(), NumWarpsAddrCast);
2804 // Up to 32 threads in warp 0 are active.
2805 Value *IsActiveThread =
2806 Builder.CreateICmpULT(GPUThreadID, NumWarpsVal, "is_active_thread");
2807 Builder.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);
2808
2809 emitBlock(W0ThenBB, Builder.GetInsertBlock()->getParent());
2810
2811 // SrcMediumPtr = &medium[tid]
2812 // SrcMediumVal = *SrcMediumPtr
2813 Value *SrcMediumPtrVal = Builder.CreateInBoundsGEP(
2814 ArrayTy, TransferMedium, {Builder.getInt64(0), GPUThreadID});
2815 // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I
2816 Value *TargetElemPtrPtr =
2817 Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList,
2818 {ConstantInt::get(IndexTy, 0),
2819 ConstantInt::get(IndexTy, En.index())});
2820 Value *TargetElemPtrVal =
2821 Builder.CreateLoad(Builder.getPtrTy(), TargetElemPtrPtr);
2822 Value *TargetElemPtr = TargetElemPtrVal;
2823 if (NumIters > 1)
2824 TargetElemPtr =
2825 Builder.CreateGEP(Builder.getInt32Ty(), TargetElemPtr, Cnt);
2826
2827 // *TargetElemPtr = SrcMediumVal;
2828 Value *SrcMediumValue =
2829 Builder.CreateLoad(CType, SrcMediumPtrVal, /*IsVolatile*/ true);
2830 Builder.CreateStore(SrcMediumValue, TargetElemPtr);
2831 Builder.CreateBr(W0MergeBB);
2832
2833 emitBlock(W0ElseBB, Builder.GetInsertBlock()->getParent());
2834 Builder.CreateBr(W0MergeBB);
2835
2836 emitBlock(W0MergeBB, Builder.GetInsertBlock()->getParent());
2837
2838 if (NumIters > 1) {
2839 Cnt = Builder.CreateNSWAdd(
2840 Cnt, ConstantInt::get(Builder.getInt32Ty(), /*V=*/1));
2841 Builder.CreateStore(Cnt, CntAddr, /*Volatile=*/false);
2842
2843 auto *CurFn = Builder.GetInsertBlock()->getParent();
2844 emitBranch(PrecondBB);
2845 emitBlock(ExitBB, CurFn);
2846 }
2847 RealTySize %= TySize;
2848 }
2849 }
2850
2851 Builder.CreateRetVoid();
2852 Builder.restoreIP(SavedIP);
2853
2854 return WcFunc;
2855 }
2856
2857 Function *OpenMPIRBuilder::emitShuffleAndReduceFunction(
2858 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
2859 AttributeList FuncAttrs) {
2860 LLVMContext &Ctx = M.getContext();
2861 FunctionType *FuncTy =
2862 FunctionType::get(Builder.getVoidTy(),
2863 {Builder.getPtrTy(), Builder.getInt16Ty(),
2864 Builder.getInt16Ty(), Builder.getInt16Ty()},
2865 /* IsVarArg */ false);
2866 Function *SarFunc =
2867 Function::Create(FuncTy, GlobalVariable::InternalLinkage,
2868 "_omp_reduction_shuffle_and_reduce_func", &M);
2869 SarFunc->setAttributes(FuncAttrs);
2870 SarFunc->addParamAttr(0, Attribute::NoUndef);
2871 SarFunc->addParamAttr(1, Attribute::NoUndef);
2872 SarFunc->addParamAttr(2, Attribute::NoUndef);
2873 SarFunc->addParamAttr(3, Attribute::NoUndef);
2874 SarFunc->addParamAttr(1,
Function *OpenMPIRBuilder::emitShuffleAndReduceFunction(
    ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
    AttributeList FuncAttrs) {
  LLVMContext &Ctx = M.getContext();
  FunctionType *FuncTy =
      FunctionType::get(Builder.getVoidTy(),
                        {Builder.getPtrTy(), Builder.getInt16Ty(),
                         Builder.getInt16Ty(), Builder.getInt16Ty()},
                        /* IsVarArg */ false);
  Function *SarFunc =
      Function::Create(FuncTy, GlobalVariable::InternalLinkage,
                       "_omp_reduction_shuffle_and_reduce_func", &M);
  SarFunc->setAttributes(FuncAttrs);
  SarFunc->addParamAttr(0, Attribute::NoUndef);
  SarFunc->addParamAttr(1, Attribute::NoUndef);
  SarFunc->addParamAttr(2, Attribute::NoUndef);
  SarFunc->addParamAttr(3, Attribute::NoUndef);
  SarFunc->addParamAttr(1, Attribute::SExt);
  SarFunc->addParamAttr(2, Attribute::SExt);
  SarFunc->addParamAttr(3, Attribute::SExt);
  BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", SarFunc);
  Builder.SetInsertPoint(EntryBB);

  // Thread local Reduce list used to host the values of data to be reduced.
  Argument *ReduceListArg = SarFunc->getArg(0);
  // Current lane id; could be logical.
  Argument *LaneIDArg = SarFunc->getArg(1);
  // Offset of the remote source lane relative to the current lane.
  Argument *RemoteLaneOffsetArg = SarFunc->getArg(2);
  // Algorithm version. This is expected to be known at compile time.
  Argument *AlgoVerArg = SarFunc->getArg(3);

  Type *ReduceListArgType = ReduceListArg->getType();
  Type *LaneIDArgType = LaneIDArg->getType();
  Type *LaneIDArgPtrType = Builder.getPtrTy(0);
  Value *ReduceListAlloca = Builder.CreateAlloca(
      ReduceListArgType, nullptr, ReduceListArg->getName() + ".addr");
  Value *LaneIdAlloca = Builder.CreateAlloca(LaneIDArgType, nullptr,
                                             LaneIDArg->getName() + ".addr");
  Value *RemoteLaneOffsetAlloca = Builder.CreateAlloca(
      LaneIDArgType, nullptr, RemoteLaneOffsetArg->getName() + ".addr");
  Value *AlgoVerAlloca = Builder.CreateAlloca(LaneIDArgType, nullptr,
                                              AlgoVerArg->getName() + ".addr");
  ArrayType *RedListArrayTy =
      ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());

  // Create a local thread-private variable to host the Reduce list
  // from a remote lane.
  Instruction *RemoteReductionListAlloca = Builder.CreateAlloca(
      RedListArrayTy, nullptr, ".omp.reduction.remote_reduce_list");

  Value *ReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
      ReduceListAlloca, ReduceListArgType,
      ReduceListAlloca->getName() + ".ascast");
  Value *LaneIdAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
      LaneIdAlloca, LaneIDArgPtrType, LaneIdAlloca->getName() + ".ascast");
  Value *RemoteLaneOffsetAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
      RemoteLaneOffsetAlloca, LaneIDArgPtrType,
      RemoteLaneOffsetAlloca->getName() + ".ascast");
  Value *AlgoVerAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
      AlgoVerAlloca, LaneIDArgPtrType, AlgoVerAlloca->getName() + ".ascast");
  Value *RemoteListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
      RemoteReductionListAlloca, Builder.getPtrTy(),
      RemoteReductionListAlloca->getName() + ".ascast");

  Builder.CreateStore(ReduceListArg, ReduceListAddrCast);
  Builder.CreateStore(LaneIDArg, LaneIdAddrCast);
  Builder.CreateStore(RemoteLaneOffsetArg, RemoteLaneOffsetAddrCast);
  Builder.CreateStore(AlgoVerArg, AlgoVerAddrCast);

  Value *ReduceList = Builder.CreateLoad(ReduceListArgType, ReduceListAddrCast);
  Value *LaneId = Builder.CreateLoad(LaneIDArgType, LaneIdAddrCast);
  Value *RemoteLaneOffset =
      Builder.CreateLoad(LaneIDArgType, RemoteLaneOffsetAddrCast);
  Value *AlgoVer = Builder.CreateLoad(LaneIDArgType, AlgoVerAddrCast);

  InsertPointTy AllocaIP = getInsertPointAfterInstr(RemoteReductionListAlloca);

  // This loop iterates through the list of reduce elements and copies,
  // element by element, from a remote lane in the warp to RemoteReduceList,
  // hosted on the thread's stack.
  emitReductionListCopy(
      AllocaIP, CopyAction::RemoteLaneToThread, RedListArrayTy, ReductionInfos,
      ReduceList, RemoteListAddrCast, {RemoteLaneOffset, nullptr, nullptr});

  // The actions to be performed on the Remote Reduce list are dependent
  // on the algorithm version.
  //
  // if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
  // LaneId % 2 == 0 && Offset > 0):
  //   do the reduction value aggregation
  //
  // The thread local variable Reduce list is mutated in place to host the
  // reduced data, which is the aggregated value produced from local and
  // remote lanes.
  //
  // Note that AlgoVer is expected to be a constant integer known at compile
  // time.
  // When AlgoVer==0, the first conjunction evaluates to true, making
  // the entire predicate true at compile time.
  // When AlgoVer==1, only the second part of the second conjunction needs to
  // be evaluated at runtime; the other conjunctions evaluate to false at
  // compile time.
  // When AlgoVer==2, only the second part of the third conjunction needs to
  // be evaluated at runtime; the other conjunctions evaluate to false at
  // compile time.
  Value *CondAlgo0 = Builder.CreateIsNull(AlgoVer);
  Value *Algo1 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(1));
  Value *LaneComp = Builder.CreateICmpULT(LaneId, RemoteLaneOffset);
  Value *CondAlgo1 = Builder.CreateAnd(Algo1, LaneComp);
  Value *Algo2 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(2));
  Value *LaneIdAnd1 = Builder.CreateAnd(LaneId, Builder.getInt16(1));
  Value *LaneIdComp = Builder.CreateIsNull(LaneIdAnd1);
  Value *Algo2AndLaneIdComp = Builder.CreateAnd(Algo2, LaneIdComp);
  Value *RemoteOffsetComp =
      Builder.CreateICmpSGT(RemoteLaneOffset, Builder.getInt16(0));
  Value *CondAlgo2 = Builder.CreateAnd(Algo2AndLaneIdComp, RemoteOffsetComp);
  Value *CA0OrCA1 = Builder.CreateOr(CondAlgo0, CondAlgo1);
  Value *CondReduce = Builder.CreateOr(CA0OrCA1, CondAlgo2);

  BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then");
  BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else");
  BasicBlock *MergeBB = BasicBlock::Create(Ctx, "ifcont");

  Builder.CreateCondBr(CondReduce, ThenBB, ElseBB);
  emitBlock(ThenBB, Builder.GetInsertBlock()->getParent());
  Value *LocalReduceListPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
      ReduceList, Builder.getPtrTy());
  Value *RemoteReduceListPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
      RemoteListAddrCast, Builder.getPtrTy());
  Builder.CreateCall(ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr})
      ->addFnAttr(Attribute::NoUnwind);
  Builder.CreateBr(MergeBB);

  emitBlock(ElseBB, Builder.GetInsertBlock()->getParent());
  Builder.CreateBr(MergeBB);

  emitBlock(MergeBB, Builder.GetInsertBlock()->getParent());

  // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
  // Reduce list.
  Algo1 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(1));
  Value *LaneIdGtOffset = Builder.CreateICmpUGE(LaneId, RemoteLaneOffset);
  Value *CondCopy = Builder.CreateAnd(Algo1, LaneIdGtOffset);

  BasicBlock *CpyThenBB = BasicBlock::Create(Ctx, "then");
  BasicBlock *CpyElseBB = BasicBlock::Create(Ctx, "else");
  BasicBlock *CpyMergeBB = BasicBlock::Create(Ctx, "ifcont");
  Builder.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);

  emitBlock(CpyThenBB, Builder.GetInsertBlock()->getParent());
  emitReductionListCopy(AllocaIP, CopyAction::ThreadCopy, RedListArrayTy,
                        ReductionInfos, RemoteListAddrCast, ReduceList);
  Builder.CreateBr(CpyMergeBB);

  emitBlock(CpyElseBB, Builder.GetInsertBlock()->getParent());
  Builder.CreateBr(CpyMergeBB);

  emitBlock(CpyMergeBB, Builder.GetInsertBlock()->getParent());

  Builder.CreateRetVoid();

  return SarFunc;
}
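
// The generated helper copies one thread's private reduce list into a slot of
// the global (team) reduction buffer. Conceptually (a sketch; the parameter
// names are illustrative, the types come from the FunctionType below):
//   void _omp_reduction_list_to_global_copy_func(void *buffer, int32_t idx,
//                                                void *reduce_list);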
Function *OpenMPIRBuilder::emitListToGlobalCopyFunction(
    ArrayRef<ReductionInfo> ReductionInfos, Type *ReductionsBufferTy,
    AttributeList FuncAttrs) {
  OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
  LLVMContext &Ctx = M.getContext();
  FunctionType *FuncTy = FunctionType::get(
      Builder.getVoidTy(),
      {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
      /* IsVarArg */ false);
  Function *LtGCFunc =
      Function::Create(FuncTy, GlobalVariable::InternalLinkage,
                       "_omp_reduction_list_to_global_copy_func", &M);
  LtGCFunc->setAttributes(FuncAttrs);
  LtGCFunc->addParamAttr(0, Attribute::NoUndef);
  LtGCFunc->addParamAttr(1, Attribute::NoUndef);
  LtGCFunc->addParamAttr(2, Attribute::NoUndef);

  BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGCFunc);
  Builder.SetInsertPoint(EntryBlock);

  // Buffer: global reduction buffer.
  Argument *BufferArg = LtGCFunc->getArg(0);
  // Idx: index of the buffer.
  Argument *IdxArg = LtGCFunc->getArg(1);
  // ReduceList: thread local Reduce list.
  Argument *ReduceListArg = LtGCFunc->getArg(2);

  Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
                                                BufferArg->getName() + ".addr");
  Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
                                             IdxArg->getName() + ".addr");
  Value *ReduceListArgAlloca = Builder.CreateAlloca(
      Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
  Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
      BufferArgAlloca, Builder.getPtrTy(),
      BufferArgAlloca->getName() + ".ascast");
  Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
      IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
  Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
      ReduceListArgAlloca, Builder.getPtrTy(),
      ReduceListArgAlloca->getName() + ".ascast");

  Builder.CreateStore(BufferArg, BufferArgAddrCast);
  Builder.CreateStore(IdxArg, IdxArgAddrCast);
  Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);

  Value *LocalReduceList =
      Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
  Value *BufferArgVal =
      Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
  Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
  Type *IndexTy = Builder.getIndexTy(
      M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
  for (auto En : enumerate(ReductionInfos)) {
    const ReductionInfo &RI = En.value();
    auto *RedListArrayTy =
        ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
    // Reduce element = LocalReduceList[i]
    Value *ElemPtrPtr = Builder.CreateInBoundsGEP(
        RedListArrayTy, LocalReduceList,
        {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
    // elemptr = ((CopyType*)(elemptrptr)) + I
    Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);

    // Global = Buffer.VD[Idx];
    Value *BufferVD =
        Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferArgVal, Idxs);
    Value *GlobVal = Builder.CreateConstInBoundsGEP2_32(
        ReductionsBufferTy, BufferVD, 0, En.index());

    switch (RI.EvaluationKind) {
    case EvalKind::Scalar: {
      Value *TargetElement = Builder.CreateLoad(RI.ElementType, ElemPtr);
      Builder.CreateStore(TargetElement, GlobVal);
      break;
    }
    case EvalKind::Complex: {
      Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
          RI.ElementType, ElemPtr, 0, 0, ".realp");
      Value *SrcReal = Builder.CreateLoad(
          RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
      Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
          RI.ElementType, ElemPtr, 0, 1, ".imagp");
      Value *SrcImg = Builder.CreateLoad(
          RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");

      Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
          RI.ElementType, GlobVal, 0, 0, ".realp");
      Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
          RI.ElementType, GlobVal, 0, 1, ".imagp");
      Builder.CreateStore(SrcReal, DestRealPtr);
      Builder.CreateStore(SrcImg, DestImgPtr);
      break;
    }
    case EvalKind::Aggregate: {
      Value *SizeVal =
          Builder.getInt64(M.getDataLayout().getTypeStoreSize(RI.ElementType));
      Builder.CreateMemCpy(
          GlobVal, M.getDataLayout().getPrefTypeAlign(RI.ElementType), ElemPtr,
          M.getDataLayout().getPrefTypeAlign(RI.ElementType), SizeVal, false);
      break;
    }
    }
  }

  Builder.CreateRetVoid();
  Builder.restoreIP(OldIP);
  return LtGCFunc;
}
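
// The generated helper reduces one thread's private reduce list into a slot
// of the global (team) reduction buffer by calling ReduceFn. Conceptually (a
// sketch; the parameter names are illustrative):
//   void _omp_reduction_list_to_global_reduce_func(void *buffer, int32_t idx,
//                                                  void *reduce_list);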
Function *OpenMPIRBuilder::emitListToGlobalReduceFunction(
    ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
    Type *ReductionsBufferTy, AttributeList FuncAttrs) {
  OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
  LLVMContext &Ctx = M.getContext();
  FunctionType *FuncTy = FunctionType::get(
      Builder.getVoidTy(),
      {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
      /* IsVarArg */ false);
  Function *LtGRFunc =
      Function::Create(FuncTy, GlobalVariable::InternalLinkage,
                       "_omp_reduction_list_to_global_reduce_func", &M);
  LtGRFunc->setAttributes(FuncAttrs);
  LtGRFunc->addParamAttr(0, Attribute::NoUndef);
  LtGRFunc->addParamAttr(1, Attribute::NoUndef);
  LtGRFunc->addParamAttr(2, Attribute::NoUndef);

  BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGRFunc);
  Builder.SetInsertPoint(EntryBlock);

  // Buffer: global reduction buffer.
  Argument *BufferArg = LtGRFunc->getArg(0);
  // Idx: index of the buffer.
  Argument *IdxArg = LtGRFunc->getArg(1);
  // ReduceList: thread local Reduce list.
  Argument *ReduceListArg = LtGRFunc->getArg(2);

  Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
                                                BufferArg->getName() + ".addr");
  Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
                                             IdxArg->getName() + ".addr");
  Value *ReduceListArgAlloca = Builder.CreateAlloca(
      Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
  auto *RedListArrayTy =
      ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());

  // 1. Build a list of reduction variables.
  // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
  Value *LocalReduceList =
      Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list");

  Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
      BufferArgAlloca, Builder.getPtrTy(),
      BufferArgAlloca->getName() + ".ascast");
  Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
      IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
  Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
      ReduceListArgAlloca, Builder.getPtrTy(),
      ReduceListArgAlloca->getName() + ".ascast");
  Value *LocalReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
      LocalReduceList, Builder.getPtrTy(),
      LocalReduceList->getName() + ".ascast");

  Builder.CreateStore(BufferArg, BufferArgAddrCast);
  Builder.CreateStore(IdxArg, IdxArgAddrCast);
  Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);

  Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
  Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
  Type *IndexTy = Builder.getIndexTy(
      M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
  for (auto En : enumerate(ReductionInfos)) {
    Value *TargetElementPtrPtr = Builder.CreateInBoundsGEP(
        RedListArrayTy, LocalReduceListAddrCast,
        {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
    Value *BufferVD =
        Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
    // Global = Buffer.VD[Idx];
    Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
        ReductionsBufferTy, BufferVD, 0, En.index());
    Builder.CreateStore(GlobValPtr, TargetElementPtrPtr);
  }

  // Call reduce_function(GlobalReduceList, ReduceList)
  Value *ReduceList =
      Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
  Builder.CreateCall(ReduceFn, {LocalReduceListAddrCast, ReduceList})
      ->addFnAttr(Attribute::NoUnwind);
  Builder.CreateRetVoid();
  Builder.restoreIP(OldIP);
  return LtGRFunc;
}
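
// The generated helper is the inverse of the list-to-global copy above: it
// copies a slot of the global reduction buffer back into one thread's private
// reduce list. Conceptually (a sketch; the parameter names are illustrative):
//   void _omp_reduction_global_to_list_copy_func(void *buffer, int32_t idx,
//                                                void *reduce_list);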
Function *OpenMPIRBuilder::emitGlobalToListCopyFunction(
    ArrayRef<ReductionInfo> ReductionInfos, Type *ReductionsBufferTy,
    AttributeList FuncAttrs) {
  OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
  LLVMContext &Ctx = M.getContext();
  FunctionType *FuncTy = FunctionType::get(
      Builder.getVoidTy(),
      {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
      /* IsVarArg */ false);
  Function *LtGCFunc =
      Function::Create(FuncTy, GlobalVariable::InternalLinkage,
                       "_omp_reduction_global_to_list_copy_func", &M);
  LtGCFunc->setAttributes(FuncAttrs);
  LtGCFunc->addParamAttr(0, Attribute::NoUndef);
  LtGCFunc->addParamAttr(1, Attribute::NoUndef);
  LtGCFunc->addParamAttr(2, Attribute::NoUndef);

  BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGCFunc);
  Builder.SetInsertPoint(EntryBlock);

  // Buffer: global reduction buffer.
  Argument *BufferArg = LtGCFunc->getArg(0);
  // Idx: index of the buffer.
  Argument *IdxArg = LtGCFunc->getArg(1);
  // ReduceList: thread local Reduce list.
  Argument *ReduceListArg = LtGCFunc->getArg(2);

  Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
                                                BufferArg->getName() + ".addr");
  Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
                                             IdxArg->getName() + ".addr");
  Value *ReduceListArgAlloca = Builder.CreateAlloca(
      Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
  Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
      BufferArgAlloca, Builder.getPtrTy(),
      BufferArgAlloca->getName() + ".ascast");
  Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
      IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
  Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
      ReduceListArgAlloca, Builder.getPtrTy(),
      ReduceListArgAlloca->getName() + ".ascast");
  Builder.CreateStore(BufferArg, BufferArgAddrCast);
  Builder.CreateStore(IdxArg, IdxArgAddrCast);
  Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);

  Value *LocalReduceList =
      Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
  Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
  Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
  Type *IndexTy = Builder.getIndexTy(
      M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
  for (auto En : enumerate(ReductionInfos)) {
    const OpenMPIRBuilder::ReductionInfo &RI = En.value();
    auto *RedListArrayTy =
        ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
    // Reduce element = LocalReduceList[i]
    Value *ElemPtrPtr = Builder.CreateInBoundsGEP(
        RedListArrayTy, LocalReduceList,
        {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
    // elemptr = ((CopyType*)(elemptrptr)) + I
    Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
    // Global = Buffer.VD[Idx];
    Value *BufferVD =
        Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
    Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
        ReductionsBufferTy, BufferVD, 0, En.index());

    switch (RI.EvaluationKind) {
    case EvalKind::Scalar: {
      Value *TargetElement = Builder.CreateLoad(RI.ElementType, GlobValPtr);
      Builder.CreateStore(TargetElement, ElemPtr);
      break;
    }
    case EvalKind::Complex: {
      Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
          RI.ElementType, GlobValPtr, 0, 0, ".realp");
      Value *SrcReal = Builder.CreateLoad(
          RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
      Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
          RI.ElementType, GlobValPtr, 0, 1, ".imagp");
      Value *SrcImg = Builder.CreateLoad(
          RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");

      Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
          RI.ElementType, ElemPtr, 0, 0, ".realp");
      Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
          RI.ElementType, ElemPtr, 0, 1, ".imagp");
      Builder.CreateStore(SrcReal, DestRealPtr);
      Builder.CreateStore(SrcImg, DestImgPtr);
      break;
    }
    case EvalKind::Aggregate: {
      Value *SizeVal =
          Builder.getInt64(M.getDataLayout().getTypeStoreSize(RI.ElementType));
      Builder.CreateMemCpy(
          ElemPtr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
          GlobValPtr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
          SizeVal, false);
      break;
    }
    }
  }

  Builder.CreateRetVoid();
  Builder.restoreIP(OldIP);
  return LtGCFunc;
}
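
// The generated helper reduces a slot of the global reduction buffer into one
// thread's private reduce list by calling ReduceFn. Conceptually (a sketch;
// the parameter names are illustrative):
//   void _omp_reduction_global_to_list_reduce_func(void *buffer, int32_t idx,
//                                                  void *reduce_list);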
Function *OpenMPIRBuilder::emitGlobalToListReduceFunction(
    ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
    Type *ReductionsBufferTy, AttributeList FuncAttrs) {
  OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
  LLVMContext &Ctx = M.getContext();
  auto *FuncTy = FunctionType::get(
      Builder.getVoidTy(),
      {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
      /* IsVarArg */ false);
  Function *LtGRFunc =
      Function::Create(FuncTy, GlobalVariable::InternalLinkage,
                       "_omp_reduction_global_to_list_reduce_func", &M);
  LtGRFunc->setAttributes(FuncAttrs);
  LtGRFunc->addParamAttr(0, Attribute::NoUndef);
  LtGRFunc->addParamAttr(1, Attribute::NoUndef);
  LtGRFunc->addParamAttr(2, Attribute::NoUndef);

  BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGRFunc);
  Builder.SetInsertPoint(EntryBlock);

  // Buffer: global reduction buffer.
  Argument *BufferArg = LtGRFunc->getArg(0);
  // Idx: index of the buffer.
  Argument *IdxArg = LtGRFunc->getArg(1);
  // ReduceList: thread local Reduce list.
  Argument *ReduceListArg = LtGRFunc->getArg(2);

  Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
                                                BufferArg->getName() + ".addr");
  Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
                                             IdxArg->getName() + ".addr");
  Value *ReduceListArgAlloca = Builder.CreateAlloca(
      Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
  ArrayType *RedListArrayTy =
      ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());

  // 1. Build a list of reduction variables.
  // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
  Value *LocalReduceList =
      Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list");

  Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
      BufferArgAlloca, Builder.getPtrTy(),
      BufferArgAlloca->getName() + ".ascast");
  Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
      IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
  Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
      ReduceListArgAlloca, Builder.getPtrTy(),
      ReduceListArgAlloca->getName() + ".ascast");
  Value *ReductionList = Builder.CreatePointerBitCastOrAddrSpaceCast(
      LocalReduceList, Builder.getPtrTy(),
      LocalReduceList->getName() + ".ascast");

  Builder.CreateStore(BufferArg, BufferArgAddrCast);
  Builder.CreateStore(IdxArg, IdxArgAddrCast);
  Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);

  Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
  Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
  Type *IndexTy = Builder.getIndexTy(
      M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
  for (auto En : enumerate(ReductionInfos)) {
    Value *TargetElementPtrPtr = Builder.CreateInBoundsGEP(
        RedListArrayTy, ReductionList,
        {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
    // Global = Buffer.VD[Idx];
    Value *BufferVD =
        Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
    Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
        ReductionsBufferTy, BufferVD, 0, En.index());
    Builder.CreateStore(GlobValPtr, TargetElementPtrPtr);
  }

  // Call reduce_function(ReduceList, GlobalReduceList)
  Value *ReduceList =
      Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
  Builder.CreateCall(ReduceFn, {ReduceList, ReductionList})
      ->addFnAttr(Attribute::NoUnwind);
  Builder.CreateRetVoid();
  Builder.restoreIP(OldIP);
  return LtGRFunc;
}
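
// The element-wise reduction function created below takes two type-erased
// reduce lists and combines them in place. Conceptually (a sketch; only the
// void(ptr, ptr) type is fixed, the separator in the name is platform
// specific):
//   void <reducer>.omp.reduction.reduction_func(void *lhs_list,
//                                               void *rhs_list);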
std::string OpenMPIRBuilder::getReductionFuncName(StringRef Name) const {
  std::string Suffix =
      createPlatformSpecificName({"omp", "reduction", "reduction_func"});
  return (Name + Suffix).str();
}

Expected<Function *> OpenMPIRBuilder::createReductionFunction(
    StringRef ReducerName, ArrayRef<ReductionInfo> ReductionInfos,
    ReductionGenCBKind ReductionGenCBKind, AttributeList FuncAttrs) {
  auto *FuncTy = FunctionType::get(Builder.getVoidTy(),
                                   {Builder.getPtrTy(), Builder.getPtrTy()},
                                   /* IsVarArg */ false);
  std::string Name = getReductionFuncName(ReducerName);
  Function *ReductionFunc =
      Function::Create(FuncTy, GlobalVariable::InternalLinkage, Name, &M);
  ReductionFunc->setAttributes(FuncAttrs);
  ReductionFunc->addParamAttr(0, Attribute::NoUndef);
  ReductionFunc->addParamAttr(1, Attribute::NoUndef);
  BasicBlock *EntryBB =
      BasicBlock::Create(M.getContext(), "entry", ReductionFunc);
  Builder.SetInsertPoint(EntryBB);

  // Need to alloca memory here and deal with the pointers before getting the
  // LHS/RHS pointers out.
  Value *LHSArrayPtr = nullptr;
  Value *RHSArrayPtr = nullptr;
  Argument *Arg0 = ReductionFunc->getArg(0);
  Argument *Arg1 = ReductionFunc->getArg(1);
  Type *Arg0Type = Arg0->getType();
  Type *Arg1Type = Arg1->getType();

  Value *LHSAlloca =
      Builder.CreateAlloca(Arg0Type, nullptr, Arg0->getName() + ".addr");
  Value *RHSAlloca =
      Builder.CreateAlloca(Arg1Type, nullptr, Arg1->getName() + ".addr");
  Value *LHSAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
      LHSAlloca, Arg0Type, LHSAlloca->getName() + ".ascast");
  Value *RHSAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
      RHSAlloca, Arg1Type, RHSAlloca->getName() + ".ascast");
  Builder.CreateStore(Arg0, LHSAddrCast);
  Builder.CreateStore(Arg1, RHSAddrCast);
  LHSArrayPtr = Builder.CreateLoad(Arg0Type, LHSAddrCast);
  RHSArrayPtr = Builder.CreateLoad(Arg1Type, RHSAddrCast);

  Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
  Type *IndexTy = Builder.getIndexTy(
      M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
  SmallVector<Value *> LHSPtrs, RHSPtrs;
  for (auto En : enumerate(ReductionInfos)) {
    const ReductionInfo &RI = En.value();
    Value *RHSI8PtrPtr = Builder.CreateInBoundsGEP(
        RedArrayTy, RHSArrayPtr,
        {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
    Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
    Value *RHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
        RHSI8Ptr, RI.PrivateVariable->getType(),
        RHSI8Ptr->getName() + ".ascast");

    Value *LHSI8PtrPtr = Builder.CreateInBoundsGEP(
        RedArrayTy, LHSArrayPtr,
        {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
    Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
    Value *LHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
        LHSI8Ptr, RI.Variable->getType(), LHSI8Ptr->getName() + ".ascast");

    if (ReductionGenCBKind == ReductionGenCBKind::Clang) {
      LHSPtrs.emplace_back(LHSPtr);
      RHSPtrs.emplace_back(RHSPtr);
    } else {
      Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
      Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
      Value *Reduced;
      InsertPointOrErrorTy AfterIP =
          RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced);
      if (!AfterIP)
        return AfterIP.takeError();
      if (!Builder.GetInsertBlock())
        return ReductionFunc;
      Builder.CreateStore(Reduced, LHSPtr);
    }
  }

  if (ReductionGenCBKind == ReductionGenCBKind::Clang)
    for (auto En : enumerate(ReductionInfos)) {
      unsigned Index = En.index();
      const ReductionInfo &RI = En.value();
      Value *LHSFixupPtr, *RHSFixupPtr;
      Builder.restoreIP(RI.ReductionGenClang(
          Builder.saveIP(), Index, &LHSFixupPtr, &RHSFixupPtr, ReductionFunc));

      // Fix the callback-generated code to use the correct Values for the LHS
      // and RHS.
      LHSFixupPtr->replaceUsesWithIf(
          LHSPtrs[Index], [ReductionFunc](const Use &U) {
            return cast<Instruction>(U.getUser())->getParent()->getParent() ==
                   ReductionFunc;
          });
      RHSFixupPtr->replaceUsesWithIf(
          RHSPtrs[Index], [ReductionFunc](const Use &U) {
            return cast<Instruction>(U.getUser())->getParent()->getParent() ==
                   ReductionFunc;
          });
    }

  Builder.CreateRetVoid();
  return ReductionFunc;
}
static void
checkReductionInfos(ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos,
                    bool IsGPU) {
  for (const OpenMPIRBuilder::ReductionInfo &RI : ReductionInfos) {
    (void)RI;
    assert(RI.Variable && "expected non-null variable");
    assert(RI.PrivateVariable && "expected non-null private variable");
    assert((RI.ReductionGen || RI.ReductionGenClang) &&
           "expected non-null reduction generator callback");
    if (!IsGPU) {
      assert(
          RI.Variable->getType() == RI.PrivateVariable->getType() &&
          "expected variables and their private equivalents to have the same "
          "type");
    }
    assert(RI.Variable->getType()->isPointerTy() &&
           "expected variables to be pointers");
  }
}

OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductionsGPU(
    const LocationDescription &Loc, InsertPointTy AllocaIP,
    InsertPointTy CodeGenIP, ArrayRef<ReductionInfo> ReductionInfos,
    bool IsNoWait, bool IsTeamsReduction, bool HasDistribute,
    ReductionGenCBKind ReductionGenCBKind, std::optional<omp::GV> GridValue,
    unsigned ReductionBufNum, Value *SrcLocInfo) {
  if (!updateToLocation(Loc))
    return InsertPointTy();
  Builder.restoreIP(CodeGenIP);
  checkReductionInfos(ReductionInfos, /*IsGPU*/ true);
  LLVMContext &Ctx = M.getContext();

  // Source location for the ident struct.
  if (!SrcLocInfo) {
    uint32_t SrcLocStrSize;
    Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
    SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  }

  if (ReductionInfos.size() == 0)
    return Builder.saveIP();

  Function *CurFunc = Builder.GetInsertBlock()->getParent();
  AttributeList FuncAttrs;
  AttrBuilder AttrBldr(Ctx);
  for (auto Attr : CurFunc->getAttributes().getFnAttrs())
    AttrBldr.addAttribute(Attr);
  AttrBldr.removeAttribute(Attribute::OptimizeNone);
  FuncAttrs = FuncAttrs.addFnAttributes(Ctx, AttrBldr);

  CodeGenIP = Builder.saveIP();
  Expected<Function *> ReductionResult =
      createReductionFunction(Builder.GetInsertBlock()->getParent()->getName(),
                              ReductionInfos, ReductionGenCBKind, FuncAttrs);
  if (!ReductionResult)
    return ReductionResult.takeError();
  Function *ReductionFunc = *ReductionResult;
  Builder.restoreIP(CodeGenIP);

  // Set the grid value in the config, needed for lowering later on.
  if (GridValue.has_value())
    Config.setGridValue(GridValue.value());
  else
    Config.setGridValue(getGridValue(T, ReductionFunc));

  // Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
  //             RedList, shuffle_reduce_func, interwarp_copy_func);
  // or
  // Build res = __kmpc_reduce_teams_nowait_simple(<loc>, <gtid>, <lck>);
  Value *Res;

  // 1. Build a list of reduction variables.
  // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
  auto Size = ReductionInfos.size();
  Type *PtrTy = PointerType::getUnqual(Ctx);
  Type *RedArrayTy = ArrayType::get(PtrTy, Size);
  CodeGenIP = Builder.saveIP();
  Builder.restoreIP(AllocaIP);
  Value *ReductionListAlloca =
      Builder.CreateAlloca(RedArrayTy, nullptr, ".omp.reduction.red_list");
  Value *ReductionList = Builder.CreatePointerBitCastOrAddrSpaceCast(
      ReductionListAlloca, PtrTy, ReductionListAlloca->getName() + ".ascast");
  Builder.restoreIP(CodeGenIP);
  Type *IndexTy = Builder.getIndexTy(
      M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
  for (auto En : enumerate(ReductionInfos)) {
    const ReductionInfo &RI = En.value();
    Value *ElemPtr = Builder.CreateInBoundsGEP(
        RedArrayTy, ReductionList,
        {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
    Value *CastElem =
        Builder.CreatePointerBitCastOrAddrSpaceCast(RI.PrivateVariable, PtrTy);
    Builder.CreateStore(CastElem, ElemPtr);
  }
  CodeGenIP = Builder.saveIP();
  Function *SarFunc =
      emitShuffleAndReduceFunction(ReductionInfos, ReductionFunc, FuncAttrs);
  Expected<Function *> CopyResult =
      emitInterWarpCopyFunction(Loc, ReductionInfos, FuncAttrs);
  if (!CopyResult)
    return CopyResult.takeError();
  Function *WcFunc = *CopyResult;
  Builder.restoreIP(CodeGenIP);

  Value *RL = Builder.CreatePointerBitCastOrAddrSpaceCast(ReductionList, PtrTy);

  unsigned MaxDataSize = 0;
  SmallVector<Type *> ReductionTypeArgs;
  for (auto En : enumerate(ReductionInfos)) {
    auto Size = M.getDataLayout().getTypeStoreSize(En.value().ElementType);
    if (Size > MaxDataSize)
      MaxDataSize = Size;
    ReductionTypeArgs.emplace_back(En.value().ElementType);
  }
  Value *ReductionDataSize =
      Builder.getInt64(MaxDataSize * ReductionInfos.size());
  if (!IsTeamsReduction) {
    Value *SarFuncCast =
        Builder.CreatePointerBitCastOrAddrSpaceCast(SarFunc, PtrTy);
    Value *WcFuncCast =
        Builder.CreatePointerBitCastOrAddrSpaceCast(WcFunc, PtrTy);
    Value *Args[] = {SrcLocInfo, ReductionDataSize, RL, SarFuncCast,
                     WcFuncCast};
    Function *Pv2Ptr = getOrCreateRuntimeFunctionPtr(
        RuntimeFunction::OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2);
    Res = Builder.CreateCall(Pv2Ptr, Args);
  } else {
    CodeGenIP = Builder.saveIP();
    StructType *ReductionsBufferTy = StructType::create(
        Ctx, ReductionTypeArgs, "struct._globalized_locals_ty");
    Function *RedFixedBufferFn = getOrCreateRuntimeFunctionPtr(
        RuntimeFunction::OMPRTL___kmpc_reduction_get_fixed_buffer);
    Function *LtGCFunc = emitListToGlobalCopyFunction(
        ReductionInfos, ReductionsBufferTy, FuncAttrs);
    Function *LtGRFunc = emitListToGlobalReduceFunction(
        ReductionInfos, ReductionFunc, ReductionsBufferTy, FuncAttrs);
    Function *GtLCFunc = emitGlobalToListCopyFunction(
        ReductionInfos, ReductionsBufferTy, FuncAttrs);
    Function *GtLRFunc = emitGlobalToListReduceFunction(
        ReductionInfos, ReductionFunc, ReductionsBufferTy, FuncAttrs);
    Builder.restoreIP(CodeGenIP);

    Value *KernelTeamsReductionPtr = Builder.CreateCall(
        RedFixedBufferFn, {}, "_openmp_teams_reductions_buffer_$_$ptr");

    Value *Args3[] = {SrcLocInfo,
                      KernelTeamsReductionPtr,
                      Builder.getInt32(ReductionBufNum),
                      ReductionDataSize,
                      RL,
                      SarFunc,
                      WcFunc,
                      LtGCFunc,
                      LtGRFunc,
                      GtLCFunc,
                      GtLRFunc};

    Function *TeamsReduceFn = getOrCreateRuntimeFunctionPtr(
        RuntimeFunction::OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2);
    Res = Builder.CreateCall(TeamsReduceFn, Args3);
  }

  // 5. Build if (res == 1)
  BasicBlock *ExitBB = BasicBlock::Create(Ctx, ".omp.reduction.done");
  BasicBlock *ThenBB = BasicBlock::Create(Ctx, ".omp.reduction.then");
  Value *Cond = Builder.CreateICmpEQ(Res, Builder.getInt32(1));
  Builder.CreateCondBr(Cond, ThenBB, ExitBB);

  // 6. Build then branch: where we have reduced values in the master
  //    thread in each team.
  //    __kmpc_end_reduce{_nowait}(<gtid>);
  //    break;
  emitBlock(ThenBB, CurFunc);

  // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
  for (auto En : enumerate(ReductionInfos)) {
    const ReductionInfo &RI = En.value();
    Value *LHS = RI.Variable;
    Value *RHS =
        Builder.CreatePointerBitCastOrAddrSpaceCast(RI.PrivateVariable, PtrTy);

    if (ReductionGenCBKind == ReductionGenCBKind::Clang) {
      Value *LHSPtr, *RHSPtr;
      Builder.restoreIP(RI.ReductionGenClang(Builder.saveIP(), En.index(),
                                             &LHSPtr, &RHSPtr, CurFunc));

      // Fix the callback-generated code to use the correct Values for the LHS
      // and RHS.
      LHSPtr->replaceUsesWithIf(LHS, [ReductionFunc](const Use &U) {
        return cast<Instruction>(U.getUser())->getParent()->getParent() ==
               ReductionFunc;
      });
      RHSPtr->replaceUsesWithIf(RHS, [ReductionFunc](const Use &U) {
        return cast<Instruction>(U.getUser())->getParent()->getParent() ==
               ReductionFunc;
      });
    } else {
      assert(false && "Unhandled ReductionGenCBKind");
    }
  }
  emitBlock(ExitBB, CurFunc);

  Config.setEmitLLVMUsed();

  return Builder.saveIP();
}

static Function *getFreshReductionFunc(Module &M) {
  Type *VoidTy = Type::getVoidTy(M.getContext());
  Type *Int8PtrTy = PointerType::getUnqual(M.getContext());
  auto *FuncTy =
      FunctionType::get(VoidTy, {Int8PtrTy, Int8PtrTy}, /* IsVarArg */ false);
  return Function::Create(FuncTy, GlobalVariable::InternalLinkage,
                          ".omp.reduction.func", &M);
}
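
// The host-side createReductions below emits, in outline (a sketch of the
// generated IR, not a literal listing):
//   %res = call i32 @__kmpc_reduce{_nowait}(loc, tid, n, size, list,
//                                           reduce_func, lck)
//   switch i32 %res, label %reduce.finalize [ i32 1, label %nonatomic
//                                             i32 2, label %atomic ]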
OpenMPIRBuilder::InsertPointOrErrorTy
OpenMPIRBuilder::createReductions(const LocationDescription &Loc,
                                  InsertPointTy AllocaIP,
                                  ArrayRef<ReductionInfo> ReductionInfos,
                                  ArrayRef<bool> IsByRef, bool IsNoWait) {
  assert(ReductionInfos.size() == IsByRef.size());
  for (const ReductionInfo &RI : ReductionInfos) {
    (void)RI;
    assert(RI.Variable && "expected non-null variable");
    assert(RI.PrivateVariable && "expected non-null private variable");
    assert(RI.ReductionGen && "expected non-null reduction generator callback");
    assert(RI.Variable->getType() == RI.PrivateVariable->getType() &&
           "expected variables and their private equivalents to have the same "
           "type");
    assert(RI.Variable->getType()->isPointerTy() &&
           "expected variables to be pointers");
  }

  if (!updateToLocation(Loc))
    return InsertPointTy();

  BasicBlock *InsertBlock = Loc.IP.getBlock();
  BasicBlock *ContinuationBlock =
      InsertBlock->splitBasicBlock(Loc.IP.getPoint(), "reduce.finalize");
  InsertBlock->getTerminator()->eraseFromParent();

  // Create and populate the array of type-erased pointers to private
  // reduction values.
  unsigned NumReductions = ReductionInfos.size();
  Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), NumReductions);
  Builder.SetInsertPoint(AllocaIP.getBlock()->getTerminator());
  Value *RedArray = Builder.CreateAlloca(RedArrayTy, nullptr, "red.array");

  Builder.SetInsertPoint(InsertBlock, InsertBlock->end());

  for (auto En : enumerate(ReductionInfos)) {
    unsigned Index = En.index();
    const ReductionInfo &RI = En.value();
    Value *RedArrayElemPtr = Builder.CreateConstInBoundsGEP2_64(
        RedArrayTy, RedArray, 0, Index, "red.array.elem." + Twine(Index));
    Builder.CreateStore(RI.PrivateVariable, RedArrayElemPtr);
  }

  // Emit a call to the runtime function that orchestrates the reduction.
  // Declare the reduction function in the process.
  Function *Func = Builder.GetInsertBlock()->getParent();
  Module *Module = Func->getParent();
  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  bool CanGenerateAtomic = all_of(ReductionInfos, [](const ReductionInfo &RI) {
    return RI.AtomicReductionGen;
  });
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize,
                                  CanGenerateAtomic
                                      ? IdentFlag::OMP_IDENT_FLAG_ATOMIC_REDUCE
                                      : IdentFlag(0));
  Value *ThreadId = getOrCreateThreadID(Ident);
  Constant *NumVariables = Builder.getInt32(NumReductions);
  const DataLayout &DL = Module->getDataLayout();
  unsigned RedArrayByteSize = DL.getTypeStoreSize(RedArrayTy);
  Constant *RedArraySize = Builder.getInt64(RedArrayByteSize);
  Function *ReductionFunc = getFreshReductionFunc(*Module);
  Value *Lock = getOMPCriticalRegionLock(".reduction");
  Function *ReduceFunc = getOrCreateRuntimeFunctionPtr(
      IsNoWait ? RuntimeFunction::OMPRTL___kmpc_reduce_nowait
               : RuntimeFunction::OMPRTL___kmpc_reduce);
  CallInst *ReduceCall =
      Builder.CreateCall(ReduceFunc,
                         {Ident, ThreadId, NumVariables, RedArraySize, RedArray,
                          ReductionFunc, Lock},
                         "reduce");

  // Create final reduction entry blocks for the atomic and non-atomic case.
  // Emit IR that dispatches control flow to one of the blocks based on the
  // reduction supporting the atomic mode.
  BasicBlock *NonAtomicRedBlock =
      BasicBlock::Create(Module->getContext(), "reduce.switch.nonatomic", Func);
  BasicBlock *AtomicRedBlock =
      BasicBlock::Create(Module->getContext(), "reduce.switch.atomic", Func);
  SwitchInst *Switch =
      Builder.CreateSwitch(ReduceCall, ContinuationBlock, /* NumCases */ 2);
  Switch->addCase(Builder.getInt32(1), NonAtomicRedBlock);
  Switch->addCase(Builder.getInt32(2), AtomicRedBlock);

  // Populate the non-atomic reduction using the elementwise reduction
  // function. This loads the elements from the global and private variables
  // and reduces them before storing back the result to the global variable.
  Builder.SetInsertPoint(NonAtomicRedBlock);
  for (auto En : enumerate(ReductionInfos)) {
    const ReductionInfo &RI = En.value();
    Type *ValueType = RI.ElementType;
    // We have one less load for the by-ref case because that load is now
    // inside of the reduction region.
    Value *RedValue = RI.Variable;
    if (!IsByRef[En.index()]) {
      RedValue = Builder.CreateLoad(ValueType, RI.Variable,
                                    "red.value." + Twine(En.index()));
    }
    Value *PrivateRedValue =
        Builder.CreateLoad(ValueType, RI.PrivateVariable,
                           "red.private.value." + Twine(En.index()));
    Value *Reduced;
    InsertPointOrErrorTy AfterIP =
        RI.ReductionGen(Builder.saveIP(), RedValue, PrivateRedValue, Reduced);
    if (!AfterIP)
      return AfterIP.takeError();
    Builder.restoreIP(*AfterIP);

    if (!Builder.GetInsertBlock())
      return InsertPointTy();
    // For the by-ref case, the load is inside of the reduction region.
    if (!IsByRef[En.index()])
      Builder.CreateStore(Reduced, RI.Variable);
  }
  Function *EndReduceFunc = getOrCreateRuntimeFunctionPtr(
      IsNoWait ? RuntimeFunction::OMPRTL___kmpc_end_reduce_nowait
               : RuntimeFunction::OMPRTL___kmpc_end_reduce);
  Builder.CreateCall(EndReduceFunc, {Ident, ThreadId, Lock});
  Builder.CreateBr(ContinuationBlock);

  // Populate the atomic reduction using the atomic elementwise reduction
  // function. There are no loads/stores here because they will be happening
  // inside the atomic elementwise reduction.
  Builder.SetInsertPoint(AtomicRedBlock);
  if (CanGenerateAtomic && llvm::none_of(IsByRef, [](bool P) { return P; })) {
    for (const ReductionInfo &RI : ReductionInfos) {
      InsertPointOrErrorTy AfterIP = RI.AtomicReductionGen(
          Builder.saveIP(), RI.ElementType, RI.Variable, RI.PrivateVariable);
      if (!AfterIP)
        return AfterIP.takeError();
      Builder.restoreIP(*AfterIP);
      if (!Builder.GetInsertBlock())
        return InsertPointTy();
    }
    Builder.CreateBr(ContinuationBlock);
  } else {
    Builder.CreateUnreachable();
  }

  // Populate the outlined reduction function using the elementwise reduction
  // function. Partial values are extracted from the type-erased array of
  // pointers to private variables.
  BasicBlock *ReductionFuncBlock =
      BasicBlock::Create(Module->getContext(), "", ReductionFunc);
  Builder.SetInsertPoint(ReductionFuncBlock);
  Value *LHSArrayPtr = ReductionFunc->getArg(0);
  Value *RHSArrayPtr = ReductionFunc->getArg(1);

  for (auto En : enumerate(ReductionInfos)) {
    const ReductionInfo &RI = En.value();
    Value *LHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
        RedArrayTy, LHSArrayPtr, 0, En.index());
    Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
    Value *LHSPtr = Builder.CreateBitCast(LHSI8Ptr, RI.Variable->getType());
    Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
    Value *RHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
        RedArrayTy, RHSArrayPtr, 0, En.index());
    Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
    Value *RHSPtr =
        Builder.CreateBitCast(RHSI8Ptr, RI.PrivateVariable->getType());
    Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
    Value *Reduced;
    InsertPointOrErrorTy AfterIP =
        RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced);
    if (!AfterIP)
      return AfterIP.takeError();
    Builder.restoreIP(*AfterIP);
    if (!Builder.GetInsertBlock())
      return InsertPointTy();
    // The store is inside of the reduction region when using by-ref.
    if (!IsByRef[En.index()])
      Builder.CreateStore(Reduced, LHSPtr);
  }
  Builder.CreateRetVoid();

  Builder.SetInsertPoint(ContinuationBlock);
  return Builder.saveIP();
}
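
// createMaster and createMasked below both lower to the inlined-region
// pattern; for master, the generated code is roughly (a sketch of the
// conditional region, not a literal listing):
//   if (__kmpc_master(&loc, tid)) {
//     <body>
//     __kmpc_end_master(&loc, tid);
//   }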
OpenMPIRBuilder::InsertPointOrErrorTy
OpenMPIRBuilder::createMaster(const LocationDescription &Loc,
                              BodyGenCallbackTy BodyGenCB,
                              FinalizeCallbackTy FiniCB) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  Directive OMPD = Directive::OMPD_master;
  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *ThreadId = getOrCreateThreadID(Ident);
  Value *Args[] = {Ident, ThreadId};

  Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_master);
  Instruction *EntryCall = Builder.CreateCall(EntryRTLFn, Args);

  Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_master);
  Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, Args);

  return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
                              /*Conditional*/ true, /*hasFinalize*/ true);
}

OpenMPIRBuilder::InsertPointOrErrorTy
OpenMPIRBuilder::createMasked(const LocationDescription &Loc,
                              BodyGenCallbackTy BodyGenCB,
                              FinalizeCallbackTy FiniCB, Value *Filter) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  Directive OMPD = Directive::OMPD_masked;
  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *ThreadId = getOrCreateThreadID(Ident);
  Value *Args[] = {Ident, ThreadId, Filter};
  Value *ArgsEnd[] = {Ident, ThreadId};

  Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_masked);
  Instruction *EntryCall = Builder.CreateCall(EntryRTLFn, Args);

  Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_masked);
  Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, ArgsEnd);

  return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
                              /*Conditional*/ true, /*hasFinalize*/ true);
}

CanonicalLoopInfo *OpenMPIRBuilder::createLoopSkeleton(
    DebugLoc DL, Value *TripCount, Function *F, BasicBlock *PreInsertBefore,
    BasicBlock *PostInsertBefore, const Twine &Name) {
  Module *M = F->getParent();
  LLVMContext &Ctx = M->getContext();
  Type *IndVarTy = TripCount->getType();

  // Create the basic block structure.
  BasicBlock *Preheader =
      BasicBlock::Create(Ctx, "omp_" + Name + ".preheader", F, PreInsertBefore);
  BasicBlock *Header =
      BasicBlock::Create(Ctx, "omp_" + Name + ".header", F, PreInsertBefore);
  BasicBlock *Cond =
      BasicBlock::Create(Ctx, "omp_" + Name + ".cond", F, PreInsertBefore);
  BasicBlock *Body =
      BasicBlock::Create(Ctx, "omp_" + Name + ".body", F, PreInsertBefore);
  BasicBlock *Latch =
      BasicBlock::Create(Ctx, "omp_" + Name + ".inc", F, PostInsertBefore);
  BasicBlock *Exit =
      BasicBlock::Create(Ctx, "omp_" + Name + ".exit", F, PostInsertBefore);
  BasicBlock *After =
      BasicBlock::Create(Ctx, "omp_" + Name + ".after", F, PostInsertBefore);
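
  // Resulting canonical control flow (a sketch; the branches are created
  // just below):
  //   preheader -> header -> cond --> body -> latch -> header (backedge)
  //                             \--> exit -> after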

  // Use the specified DebugLoc for new instructions.
  Builder.SetCurrentDebugLocation(DL);

  Builder.SetInsertPoint(Preheader);
  Builder.CreateBr(Header);

  Builder.SetInsertPoint(Header);
  PHINode *IndVarPHI = Builder.CreatePHI(IndVarTy, 2, "omp_" + Name + ".iv");
  IndVarPHI->addIncoming(ConstantInt::get(IndVarTy, 0), Preheader);
  Builder.CreateBr(Cond);

  Builder.SetInsertPoint(Cond);
  Value *Cmp =
      Builder.CreateICmpULT(IndVarPHI, TripCount, "omp_" + Name + ".cmp");
  Builder.CreateCondBr(Cmp, Body, Exit);

  Builder.SetInsertPoint(Body);
  Builder.CreateBr(Latch);

  Builder.SetInsertPoint(Latch);
  Value *Next = Builder.CreateAdd(IndVarPHI, ConstantInt::get(IndVarTy, 1),
                                  "omp_" + Name + ".next", /*HasNUW=*/true);
  Builder.CreateBr(Header);
  IndVarPHI->addIncoming(Next, Latch);

  Builder.SetInsertPoint(Exit);
  Builder.CreateBr(After);

  // Remember and return the canonical control flow.
  LoopInfos.emplace_front();
  CanonicalLoopInfo *CL = &LoopInfos.front();

  CL->Header = Header;
  CL->Cond = Cond;
  CL->Latch = Latch;
  CL->Exit = Exit;

#ifndef NDEBUG
  CL->assertOK();
#endif
  return CL;
}

Expected<CanonicalLoopInfo *>
OpenMPIRBuilder::createCanonicalLoop(const LocationDescription &Loc,
                                     LoopBodyGenCallbackTy BodyGenCB,
                                     Value *TripCount, const Twine &Name) {
  BasicBlock *BB = Loc.IP.getBlock();
  BasicBlock *NextBB = BB->getNextNode();

  CanonicalLoopInfo *CL = createLoopSkeleton(Loc.DL, TripCount, BB->getParent(),
                                             NextBB, NextBB, Name);
  BasicBlock *After = CL->getAfter();

  // If the location is not set, don't connect the loop.
  if (updateToLocation(Loc)) {
    // Split the loop at the insertion point: Branch to the preheader and move
    // every following instruction to after the loop (the After BB). Also, the
    // new successor is the loop's after block.
    spliceBB(Builder, After, /*CreateBranch=*/false);
    Builder.CreateBr(CL->getPreheader());
  }

  // Emit the body content. We do it after connecting the loop to the CFG to
  // avoid that the callback encounters degenerate BBs.
  if (Error Err = BodyGenCB(CL->getBodyIP(), CL->getIndVar()))
    return Err;

#ifndef NDEBUG
  CL->assertOK();
#endif
  return CL;
}

Expected<CanonicalLoopInfo *> OpenMPIRBuilder::createCanonicalLoop(
    const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB,
    Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop,
    InsertPointTy ComputeIP, const Twine &Name) {

  // Consider the following difficulties (assuming 8-bit signed integers):
  // * Adding \p Step to the loop counter which passes \p Stop may overflow:
  //     DO I = 1, 100, 50
  // * A \p Step of INT_MIN cannot be normalized to a positive direction:
  //     DO I = 100, 0, -128

  // Start, Stop and Step must be of the same integer type.
  auto *IndVarTy = cast<IntegerType>(Start->getType());
  assert(IndVarTy == Stop->getType() && "Stop type mismatch");
  assert(IndVarTy == Step->getType() && "Step type mismatch");

  LocationDescription ComputeLoc =
      ComputeIP.isSet() ? LocationDescription(ComputeIP, Loc.DL) : Loc;
  updateToLocation(ComputeLoc);

  ConstantInt *Zero = ConstantInt::get(IndVarTy, 0);
  ConstantInt *One = ConstantInt::get(IndVarTy, 1);
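  // Worked example for the trip count computed below (unsigned, exclusive
  // stop; the numbers are illustrative): Start=0, Stop=10, Step=3 gives
  // Span = 10 and CountIfTwo = (10 - 1) / 3 + 1 = 4; since Span > Incr and
  // Stop > Start, TripCount = 4 and the body sees IndVar = 0, 3, 6, 9.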

  // Like Step, but always positive.
  Value *Incr = Step;

  // Distance between Start and Stop; always positive.
  Value *Span;

  // Condition whether no iterations are executed at all, e.g. because UB < LB.
  Value *ZeroCmp;

  if (IsSigned) {
    // Ensure that the increment is positive. If not, negate it and invert LB
    // and UB.
    Value *IsNeg = Builder.CreateICmpSLT(Step, Zero);
    Incr = Builder.CreateSelect(IsNeg, Builder.CreateNeg(Step), Step);
    Value *LB = Builder.CreateSelect(IsNeg, Stop, Start);
    Value *UB = Builder.CreateSelect(IsNeg, Start, Stop);
    Span = Builder.CreateSub(UB, LB, "", false, true);
    ZeroCmp = Builder.CreateICmp(
        InclusiveStop ? CmpInst::ICMP_SLT : CmpInst::ICMP_SLE, UB, LB);
  } else {
    Span = Builder.CreateSub(Stop, Start, "", true);
    ZeroCmp = Builder.CreateICmp(
        InclusiveStop ? CmpInst::ICMP_ULT : CmpInst::ICMP_ULE, Stop, Start);
  }

  Value *CountIfLooping;
  if (InclusiveStop) {
    CountIfLooping = Builder.CreateAdd(Builder.CreateUDiv(Span, Incr), One);
  } else {
    // Avoid incrementing past stop since it could overflow.
    Value *CountIfTwo = Builder.CreateAdd(
        Builder.CreateUDiv(Builder.CreateSub(Span, One), Incr), One);
    Value *OneCmp = Builder.CreateICmp(CmpInst::ICMP_ULE, Span, Incr);
    CountIfLooping = Builder.CreateSelect(OneCmp, One, CountIfTwo);
  }
  Value *TripCount = Builder.CreateSelect(ZeroCmp, Zero, CountIfLooping,
                                          "omp_" + Name + ".tripcount");

  auto BodyGen = [=](InsertPointTy CodeGenIP, Value *IV) {
    Builder.restoreIP(CodeGenIP);
    Value *Span = Builder.CreateMul(IV, Step);
    Value *IndVar = Builder.CreateAdd(Span, Start);
    return BodyGenCB(Builder.saveIP(), IndVar);
  };
  LocationDescription LoopLoc = ComputeIP.isSet() ? Loc.IP : Builder.saveIP();
  return createCanonicalLoop(LoopLoc, BodyGen, TripCount, Name);
}

// Returns an LLVM function to call for initializing loop bounds using OpenMP
// static scheduling depending on `type`. Only i32 and i64 are supported by the
// runtime. Always interpret integers as unsigned similarly to
// CanonicalLoopInfo.
static FunctionCallee getKmpcForStaticInitForType(Type *Ty, Module &M,
                                                  OpenMPIRBuilder &OMPBuilder) {
  unsigned Bitwidth = Ty->getIntegerBitWidth();
  if (Bitwidth == 32)
    return OMPBuilder.getOrCreateRuntimeFunction(
        M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_4u);
  if (Bitwidth == 64)
    return OMPBuilder.getOrCreateRuntimeFunction(
        M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_8u);
  llvm_unreachable("unknown OpenMP loop iterator bitwidth");
}
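
// For reference, the selected entry point has roughly this C signature (a
// sketch of the libomp API; the parameter names are illustrative):
//   void __kmpc_for_static_init_4u(ident_t *loc, int32_t gtid,
//                                  int32_t schedtype, int32_t *plastiter,
//                                  uint32_t *plower, uint32_t *pupper,
//                                  int32_t *pstride, int32_t incr,
//                                  int32_t chunk);
// On return, [*plower, *pupper] is the inclusive range assigned to the
// calling thread.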

OpenMPIRBuilder::InsertPointOrErrorTy
OpenMPIRBuilder::applyStaticWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI,
                                          InsertPointTy AllocaIP,
                                          bool NeedsBarrier) {
  assert(CLI->isValid() && "Requires a valid canonical loop");
  assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) &&
         "Require dedicated allocate IP");

  // Set up the source location value for the OpenMP runtime.
  Builder.restoreIP(CLI->getPreheaderIP());
  Builder.SetCurrentDebugLocation(DL);

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
  Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);

  // Declare useful OpenMP runtime functions.
  Value *IV = CLI->getIndVar();
  Type *IVTy = IV->getType();
  FunctionCallee StaticInit = getKmpcForStaticInitForType(IVTy, M, *this);
  FunctionCallee StaticFini =
      getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini);

  // Allocate space for computed loop bounds as expected by the "init" function.
  Builder.SetInsertPoint(AllocaIP.getBlock()->getFirstNonPHIOrDbgOrAlloca());

  Type *I32Type = Type::getInt32Ty(M.getContext());
  Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
  Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
  Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
  Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride");

  // At the end of the preheader, prepare for calling the "init" function by
  // storing the current loop bounds into the allocated space. A canonical loop
  // always iterates from 0 to trip-count with step 1. Note that "init" expects
  // and produces an inclusive upper bound.
  Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
  Constant *Zero = ConstantInt::get(IVTy, 0);
  Constant *One = ConstantInt::get(IVTy, 1);
  Builder.CreateStore(Zero, PLowerBound);
  Value *UpperBound = Builder.CreateSub(CLI->getTripCount(), One);
  Builder.CreateStore(UpperBound, PUpperBound);
  Builder.CreateStore(One, PStride);

  Value *ThreadNum = getOrCreateThreadID(SrcLoc);

  Constant *SchedulingType = ConstantInt::get(
      I32Type, static_cast<int>(OMPScheduleType::UnorderedStatic));

  // Call the "init" function and update the trip count of the loop with the
  // value it produced.
  Builder.CreateCall(StaticInit,
                     {SrcLoc, ThreadNum, SchedulingType, PLastIter, PLowerBound,
                      PUpperBound, PStride, One, Zero});
  Value *LowerBound = Builder.CreateLoad(IVTy, PLowerBound);
  Value *InclusiveUpperBound = Builder.CreateLoad(IVTy, PUpperBound);
  Value *TripCountMinusOne = Builder.CreateSub(InclusiveUpperBound, LowerBound);
  Value *TripCount = Builder.CreateAdd(TripCountMinusOne, One);
  CLI->setTripCount(TripCount);

  // Update all uses of the induction variable except the one in the condition
  // block that compares it with the actual upper bound, and the increment in
  // the latch block.
  CLI->mapIndVar([&](Instruction *OldIV) -> Value * {
    Builder.SetInsertPoint(CLI->getBody(),
                           CLI->getBody()->getFirstInsertionPt());
    Builder.SetCurrentDebugLocation(DL);
    return Builder.CreateAdd(OldIV, LowerBound);
  });

  // In the "exit" block, call the "fini" function.
  Builder.SetInsertPoint(CLI->getExit(),
                         CLI->getExit()->getTerminator()->getIterator());
  Builder.CreateCall(StaticFini, {SrcLoc, ThreadNum});

  // Add the barrier if requested.
4219 if (NeedsBarrier) { 4220 InsertPointOrErrorTy BarrierIP = 4221 createBarrier(LocationDescription(Builder.saveIP(), DL), 4222 omp::Directive::OMPD_for, /* ForceSimpleCall */ false, 4223 /* CheckCancelFlag */ false); 4224 if (!BarrierIP) 4225 return BarrierIP.takeError(); 4226 } 4227 4228 InsertPointTy AfterIP = CLI->getAfterIP(); 4229 CLI->invalidate(); 4230 4231 return AfterIP; 4232 } 4233 4234 OpenMPIRBuilder::InsertPointOrErrorTy 4235 OpenMPIRBuilder::applyStaticChunkedWorkshareLoop(DebugLoc DL, 4236 CanonicalLoopInfo *CLI, 4237 InsertPointTy AllocaIP, 4238 bool NeedsBarrier, 4239 Value *ChunkSize) { 4240 assert(CLI->isValid() && "Requires a valid canonical loop"); 4241 assert(ChunkSize && "Chunk size is required"); 4242 4243 LLVMContext &Ctx = CLI->getFunction()->getContext(); 4244 Value *IV = CLI->getIndVar(); 4245 Value *OrigTripCount = CLI->getTripCount(); 4246 Type *IVTy = IV->getType(); 4247 assert(IVTy->getIntegerBitWidth() <= 64 && 4248 "Max supported tripcount bitwidth is 64 bits"); 4249 Type *InternalIVTy = IVTy->getIntegerBitWidth() <= 32 ? Type::getInt32Ty(Ctx) 4250 : Type::getInt64Ty(Ctx); 4251 Type *I32Type = Type::getInt32Ty(M.getContext()); 4252 Constant *Zero = ConstantInt::get(InternalIVTy, 0); 4253 Constant *One = ConstantInt::get(InternalIVTy, 1); 4254 4255 // Declare useful OpenMP runtime functions. 4256 FunctionCallee StaticInit = 4257 getKmpcForStaticInitForType(InternalIVTy, M, *this); 4258 FunctionCallee StaticFini = 4259 getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini); 4260 4261 // Allocate space for computed loop bounds as expected by the "init" function. 4262 Builder.restoreIP(AllocaIP); 4263 Builder.SetCurrentDebugLocation(DL); 4264 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter"); 4265 Value *PLowerBound = 4266 Builder.CreateAlloca(InternalIVTy, nullptr, "p.lowerbound"); 4267 Value *PUpperBound = 4268 Builder.CreateAlloca(InternalIVTy, nullptr, "p.upperbound"); 4269 Value *PStride = Builder.CreateAlloca(InternalIVTy, nullptr, "p.stride"); 4270 4271 // Set up the source location value for the OpenMP runtime. 4272 Builder.restoreIP(CLI->getPreheaderIP()); 4273 Builder.SetCurrentDebugLocation(DL); 4274 4275 // TODO: Detect overflow in ubsan or max-out with current tripcount. 4276 Value *CastedChunkSize = 4277 Builder.CreateZExtOrTrunc(ChunkSize, InternalIVTy, "chunksize"); 4278 Value *CastedTripCount = 4279 Builder.CreateZExt(OrigTripCount, InternalIVTy, "tripcount"); 4280 4281 Constant *SchedulingType = ConstantInt::get( 4282 I32Type, static_cast<int>(OMPScheduleType::UnorderedStaticChunked)); 4283 Builder.CreateStore(Zero, PLowerBound); 4284 Value *OrigUpperBound = Builder.CreateSub(CastedTripCount, One); 4285 Builder.CreateStore(OrigUpperBound, PUpperBound); 4286 Builder.CreateStore(One, PStride); 4287 4288 // Call the "init" function and update the trip count of the loop with the 4289 // value it produced. 4290 uint32_t SrcLocStrSize; 4291 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize); 4292 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize); 4293 Value *ThreadNum = getOrCreateThreadID(SrcLoc); 4294 Builder.CreateCall(StaticInit, 4295 {/*loc=*/SrcLoc, /*global_tid=*/ThreadNum, 4296 /*schedtype=*/SchedulingType, /*plastiter=*/PLastIter, 4297 /*plower=*/PLowerBound, /*pupper=*/PUpperBound, 4298 /*pstride=*/PStride, /*incr=*/One, 4299 /*chunk=*/CastedChunkSize}); 4300 4301 // Load values written by the "init" function. 
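  // The code below rewrites the loop into the following shape, in pseudo-code:
  //   for (dispatch = firstchunk.lb; dispatch < tripcount; dispatch += stride)
  //     for (iv = 0; iv < min(chunk.range, tripcount - dispatch); ++iv)
  //       body(dispatch + iv);
  // where the outer "dispatch" loop enumerates the chunks assigned to this
  // thread and the original loop becomes the per-chunk loop.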
4302 Value *FirstChunkStart = 4303 Builder.CreateLoad(InternalIVTy, PLowerBound, "omp_firstchunk.lb"); 4304 Value *FirstChunkStop = 4305 Builder.CreateLoad(InternalIVTy, PUpperBound, "omp_firstchunk.ub"); 4306 Value *FirstChunkEnd = Builder.CreateAdd(FirstChunkStop, One); 4307 Value *ChunkRange = 4308 Builder.CreateSub(FirstChunkEnd, FirstChunkStart, "omp_chunk.range"); 4309 Value *NextChunkStride = 4310 Builder.CreateLoad(InternalIVTy, PStride, "omp_dispatch.stride"); 4311 4312 // Create outer "dispatch" loop for enumerating the chunks. 4313 BasicBlock *DispatchEnter = splitBB(Builder, true); 4314 Value *DispatchCounter; 4315 4316 // It is safe to assume this didn't return an error because the callback 4317 // passed into createCanonicalLoop is the only possible error source, and it 4318 // always returns success. 4319 CanonicalLoopInfo *DispatchCLI = cantFail(createCanonicalLoop( 4320 {Builder.saveIP(), DL}, 4321 [&](InsertPointTy BodyIP, Value *Counter) { 4322 DispatchCounter = Counter; 4323 return Error::success(); 4324 }, 4325 FirstChunkStart, CastedTripCount, NextChunkStride, 4326 /*IsSigned=*/false, /*InclusiveStop=*/false, /*ComputeIP=*/{}, 4327 "dispatch")); 4328 4329 // Remember the BasicBlocks of the dispatch loop we need, then invalidate to 4330 // not have to preserve the canonical invariant. 4331 BasicBlock *DispatchBody = DispatchCLI->getBody(); 4332 BasicBlock *DispatchLatch = DispatchCLI->getLatch(); 4333 BasicBlock *DispatchExit = DispatchCLI->getExit(); 4334 BasicBlock *DispatchAfter = DispatchCLI->getAfter(); 4335 DispatchCLI->invalidate(); 4336 4337 // Rewire the original loop to become the chunk loop inside the dispatch loop. 4338 redirectTo(DispatchAfter, CLI->getAfter(), DL); 4339 redirectTo(CLI->getExit(), DispatchLatch, DL); 4340 redirectTo(DispatchBody, DispatchEnter, DL); 4341 4342 // Prepare the prolog of the chunk loop. 4343 Builder.restoreIP(CLI->getPreheaderIP()); 4344 Builder.SetCurrentDebugLocation(DL); 4345 4346 // Compute the number of iterations of the chunk loop. 4347 Builder.SetInsertPoint(CLI->getPreheader()->getTerminator()); 4348 Value *ChunkEnd = Builder.CreateAdd(DispatchCounter, ChunkRange); 4349 Value *IsLastChunk = 4350 Builder.CreateICmpUGE(ChunkEnd, CastedTripCount, "omp_chunk.is_last"); 4351 Value *CountUntilOrigTripCount = 4352 Builder.CreateSub(CastedTripCount, DispatchCounter); 4353 Value *ChunkTripCount = Builder.CreateSelect( 4354 IsLastChunk, CountUntilOrigTripCount, ChunkRange, "omp_chunk.tripcount"); 4355 Value *BackcastedChunkTC = 4356 Builder.CreateTrunc(ChunkTripCount, IVTy, "omp_chunk.tripcount.trunc"); 4357 CLI->setTripCount(BackcastedChunkTC); 4358 4359 // Update all uses of the induction variable except the one in the condition 4360 // block that compares it with the actual upper bound, and the increment in 4361 // the latch block. 4362 Value *BackcastedDispatchCounter = 4363 Builder.CreateTrunc(DispatchCounter, IVTy, "omp_dispatch.iv.trunc"); 4364 CLI->mapIndVar([&](Instruction *) -> Value * { 4365 Builder.restoreIP(CLI->getBodyIP()); 4366 return Builder.CreateAdd(IV, BackcastedDispatchCounter); 4367 }); 4368 4369 // In the "exit" block, call the "fini" function. 4370 Builder.SetInsertPoint(DispatchExit, DispatchExit->getFirstInsertionPt()); 4371 Builder.CreateCall(StaticFini, {SrcLoc, ThreadNum}); 4372 4373 // Add the barrier if requested. 
4374 if (NeedsBarrier) { 4375 InsertPointOrErrorTy AfterIP = 4376 createBarrier(LocationDescription(Builder.saveIP(), DL), OMPD_for, 4377 /*ForceSimpleCall=*/false, /*CheckCancelFlag=*/false); 4378 if (!AfterIP) 4379 return AfterIP.takeError(); 4380 } 4381 4382 #ifndef NDEBUG 4383 // Even though we currently do not support applying additional methods to it, 4384 // the chunk loop should remain a canonical loop. 4385 CLI->assertOK(); 4386 #endif 4387 4388 return InsertPointTy(DispatchAfter, DispatchAfter->getFirstInsertionPt()); 4389 } 4390 4391 // Returns an LLVM function to call for executing an OpenMP static worksharing 4392 // for loop depending on `type`. Only i32 and i64 are supported by the runtime. 4393 // Always interpret integers as unsigned similarly to CanonicalLoopInfo. 4394 static FunctionCallee 4395 getKmpcForStaticLoopForType(Type *Ty, OpenMPIRBuilder *OMPBuilder, 4396 WorksharingLoopType LoopType) { 4397 unsigned Bitwidth = Ty->getIntegerBitWidth(); 4398 Module &M = OMPBuilder->M; 4399 switch (LoopType) { 4400 case WorksharingLoopType::ForStaticLoop: 4401 if (Bitwidth == 32) 4402 return OMPBuilder->getOrCreateRuntimeFunction( 4403 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_loop_4u); 4404 if (Bitwidth == 64) 4405 return OMPBuilder->getOrCreateRuntimeFunction( 4406 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_loop_8u); 4407 break; 4408 case WorksharingLoopType::DistributeStaticLoop: 4409 if (Bitwidth == 32) 4410 return OMPBuilder->getOrCreateRuntimeFunction( 4411 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_static_loop_4u); 4412 if (Bitwidth == 64) 4413 return OMPBuilder->getOrCreateRuntimeFunction( 4414 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_static_loop_8u); 4415 break; 4416 case WorksharingLoopType::DistributeForStaticLoop: 4417 if (Bitwidth == 32) 4418 return OMPBuilder->getOrCreateRuntimeFunction( 4419 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_for_static_loop_4u); 4420 if (Bitwidth == 64) 4421 return OMPBuilder->getOrCreateRuntimeFunction( 4422 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_for_static_loop_8u); 4423 break; 4424 } 4425 if (Bitwidth != 32 && Bitwidth != 64) { 4426 llvm_unreachable("Unknown OpenMP loop iterator bitwidth"); 4427 } 4428 llvm_unreachable("Unknown type of OpenMP worksharing loop"); 4429 } 4430 4431 // Inserts a call to proper OpenMP Device RTL function which handles 4432 // loop worksharing. 
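// For instance, a ForStaticLoop over an i32 trip count results in a call
// roughly of the form (value names are illustrative):
//   call void @__kmpc_for_static_loop_4u(ptr %ident, ptr %body.fn,
//                                        ptr %body.args, i32 %tripcount,
//                                        i32 %num.threads, i32 0)
// where %num.threads is the result of omp_get_num_threads().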
4433 static void createTargetLoopWorkshareCall(
4434     OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType,
4435     BasicBlock *InsertBlock, Value *Ident, Value *LoopBodyArg,
4436     Type *ParallelTaskPtr, Value *TripCount, Function &LoopBodyFn) {
4437   Type *TripCountTy = TripCount->getType();
4438   Module &M = OMPBuilder->M;
4439   IRBuilder<> &Builder = OMPBuilder->Builder;
4440   FunctionCallee RTLFn =
4441       getKmpcForStaticLoopForType(TripCountTy, OMPBuilder, LoopType);
4442   SmallVector<Value *, 8> RealArgs;
4443   RealArgs.push_back(Ident);
4444   RealArgs.push_back(Builder.CreateBitCast(&LoopBodyFn, ParallelTaskPtr));
4445   RealArgs.push_back(LoopBodyArg);
4446   RealArgs.push_back(TripCount);
4447   if (LoopType == WorksharingLoopType::DistributeStaticLoop) {
4448     RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
4449     Builder.CreateCall(RTLFn, RealArgs);
4450     return;
4451   }
4452   FunctionCallee RTLNumThreads = OMPBuilder->getOrCreateRuntimeFunction(
4453       M, omp::RuntimeFunction::OMPRTL_omp_get_num_threads);
4454   Builder.restoreIP({InsertBlock, std::prev(InsertBlock->end())});
4455   Value *NumThreads = Builder.CreateCall(RTLNumThreads, {});
4456
4457   RealArgs.push_back(
4458       Builder.CreateZExtOrTrunc(NumThreads, TripCountTy, "num.threads.cast"));
4459   RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
4460   if (LoopType == WorksharingLoopType::DistributeForStaticLoop) {
4461     RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
4462   }
4463
4464   Builder.CreateCall(RTLFn, RealArgs);
4465 }
4466
4467 static void
4468 workshareLoopTargetCallback(OpenMPIRBuilder *OMPIRBuilder,
4469                             CanonicalLoopInfo *CLI, Value *Ident,
4470                             Function &OutlinedFn, Type *ParallelTaskPtr,
4471                             const SmallVector<Instruction *, 4> &ToBeDeleted,
4472                             WorksharingLoopType LoopType) {
4473   IRBuilder<> &Builder = OMPIRBuilder->Builder;
4474   BasicBlock *Preheader = CLI->getPreheader();
4475   Value *TripCount = CLI->getTripCount();
4476
4477   // After loop body outlining, the loop body contains only the setup of the
4478   // loop body argument structure and the call to the outlined loop body
4479   // function. First, we need to move the setup of the loop body arguments
4480   // into the loop preheader.
4481   Preheader->splice(std::prev(Preheader->end()), CLI->getBody(),
4482                     CLI->getBody()->begin(), std::prev(CLI->getBody()->end()));
4483
4484   // The next step is to remove the whole loop; we do not need it anymore.
4485   // That is why we make an unconditional branch from the loop preheader to
4486   // the loop exit block.
4487   Builder.restoreIP({Preheader, Preheader->end()});
4488   Builder.SetCurrentDebugLocation(Preheader->getTerminator()->getDebugLoc());
4489   Preheader->getTerminator()->eraseFromParent();
4490   Builder.CreateBr(CLI->getExit());
4491
4492   // Delete the dead loop blocks.
4493   OpenMPIRBuilder::OutlineInfo CleanUpInfo;
4494   SmallPtrSet<BasicBlock *, 32> RegionBlockSet;
4495   SmallVector<BasicBlock *, 32> BlocksToBeRemoved;
4496   CleanUpInfo.EntryBB = CLI->getHeader();
4497   CleanUpInfo.ExitBB = CLI->getExit();
4498   CleanUpInfo.collectBlocks(RegionBlockSet, BlocksToBeRemoved);
4499   DeleteDeadBlocks(BlocksToBeRemoved);
4500
4501   // Find the instruction which corresponds to the loop body argument
4502   // structure and remove the call to the loop body function.
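  // At this point the preheader is expected to end with something like
  // (names are illustrative):
  //   %body.args = alloca %struct.body.args.ty
  //   ...stores into %body.args...
  //   call void @outlined.body(i32 %cnt, ptr %body.args)
  // The struct pointer, if present, is the loop body argument extracted here.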
4503   Value *LoopBodyArg;
4504   User *OutlinedFnUser = OutlinedFn.getUniqueUndroppableUser();
4505   assert(OutlinedFnUser &&
4506          "Expected unique undroppable user of outlined function");
4507   CallInst *OutlinedFnCallInstruction = dyn_cast<CallInst>(OutlinedFnUser);
4508   assert(OutlinedFnCallInstruction && "Expected outlined function call");
4509   assert((OutlinedFnCallInstruction->getParent() == Preheader) &&
4510          "Expected outlined function call to be located in loop preheader");
4511   // Check in case no argument structure has been passed.
4512   if (OutlinedFnCallInstruction->arg_size() > 1)
4513     LoopBodyArg = OutlinedFnCallInstruction->getArgOperand(1);
4514   else
4515     LoopBodyArg = Constant::getNullValue(Builder.getPtrTy());
4516   OutlinedFnCallInstruction->eraseFromParent();
4517
4518   createTargetLoopWorkshareCall(OMPIRBuilder, LoopType, Preheader, Ident,
4519                                 LoopBodyArg, ParallelTaskPtr, TripCount,
4520                                 OutlinedFn);
4521
4522   for (auto &ToBeDeletedItem : ToBeDeleted)
4523     ToBeDeletedItem->eraseFromParent();
4524   CLI->invalidate();
4525 }
4526
4527 OpenMPIRBuilder::InsertPointTy
4528 OpenMPIRBuilder::applyWorkshareLoopTarget(DebugLoc DL, CanonicalLoopInfo *CLI,
4529                                           InsertPointTy AllocaIP,
4530                                           WorksharingLoopType LoopType) {
4531   uint32_t SrcLocStrSize;
4532   Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
4533   Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4534
4535   OutlineInfo OI;
4536   OI.OuterAllocaBB = CLI->getPreheader();
4537   Function *OuterFn = CLI->getPreheader()->getParent();
4538
4539   // Instructions which need to be deleted at the end of code generation.
4540   SmallVector<Instruction *, 4> ToBeDeleted;
4541
4542   OI.OuterAllocaBB = AllocaIP.getBlock();
4543
4544   // Mark the loop body as the region which needs to be extracted.
4545   OI.EntryBB = CLI->getBody();
4546   OI.ExitBB = CLI->getLatch()->splitBasicBlock(CLI->getLatch()->begin(),
4547                                                "omp.prelatch", true);
4548
4549   // Prepare the loop body for extraction.
4550   Builder.restoreIP({CLI->getPreheader(), CLI->getPreheader()->begin()});
4551
4552   // Insert a new loop counter variable which will be used only in the loop
4553   // body.
4554   AllocaInst *NewLoopCnt = Builder.CreateAlloca(CLI->getIndVarType(), 0, "");
4555   Instruction *NewLoopCntLoad =
4556       Builder.CreateLoad(CLI->getIndVarType(), NewLoopCnt);
4557   // The new loop counter instructions are redundant in the loop preheader
4558   // once code generation for the workshare loop is finished, so mark them
4559   // as ready for deletion.
4560   ToBeDeleted.push_back(NewLoopCntLoad);
4561   ToBeDeleted.push_back(NewLoopCnt);
4562
4563   // Analyse the loop body region. Find all input variables which are used
4564   // inside the loop body region.
4565   SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
4566   SmallVector<BasicBlock *, 32> Blocks;
4567   OI.collectBlocks(ParallelRegionBlockSet, Blocks);
4568   SmallVector<BasicBlock *, 32> BlocksT(ParallelRegionBlockSet.begin(),
4569                                         ParallelRegionBlockSet.end());
4570
4571   CodeExtractorAnalysisCache CEAC(*OuterFn);
4572   CodeExtractor Extractor(Blocks,
4573                           /* DominatorTree */ nullptr,
4574                           /* AggregateArgs */ true,
4575                           /* BlockFrequencyInfo */ nullptr,
4576                           /* BranchProbabilityInfo */ nullptr,
4577                           /* AssumptionCache */ nullptr,
4578                           /* AllowVarArgs */ true,
4579                           /* AllowAlloca */ true,
4580                           /* AllocationBlock */ CLI->getPreheader(),
4581                           /* Suffix */ ".omp_wsloop",
4582                           /* AggrArgsIn0AddrSpace */ true);
4583
4584   BasicBlock *CommonExit = nullptr;
4585   SetVector<Value *> Inputs, Outputs, SinkingCands, HoistingCands;
4586
4587   // Find allocas outside the loop body region which are used inside the
4588   // loop body.
4589   Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);
4590
4591   // We need to model the loop body region as the function f(cnt, loop_arg).
4592   // That is why we replace the loop induction variable with the new counter,
4593   // which will be one of the loop body function's arguments.
4594   SmallVector<User *> Users(CLI->getIndVar()->user_begin(),
4595                             CLI->getIndVar()->user_end());
4596   for (auto Use : Users) {
4597     if (Instruction *Inst = dyn_cast<Instruction>(Use)) {
4598       if (ParallelRegionBlockSet.count(Inst->getParent())) {
4599         Inst->replaceUsesOfWith(CLI->getIndVar(), NewLoopCntLoad);
4600       }
4601     }
4602   }
4603   // Make sure that the loop counter variable is not merged into the loop
4604   // body function argument structure and that it is passed separately.
4605   OI.ExcludeArgsFromAggregate.push_back(NewLoopCntLoad);
4606
4607   // The PostOutline CB is invoked when the loop body function is outlined
4608   // and the loop body is replaced by a call to the outlined function. We
4609   // need to add a call to the OpenMP device RTL in the loop preheader; the
4610   // OpenMP device RTL function will handle the loop control logic.
4611   //
4612   OI.PostOutlineCB = [=, ToBeDeletedVec =
4613                              std::move(ToBeDeleted)](Function &OutlinedFn) {
4614     workshareLoopTargetCallback(this, CLI, Ident, OutlinedFn, ParallelTaskPtr,
4615                                 ToBeDeletedVec, LoopType);
4616   };
4617   addOutlineInfo(std::move(OI));
4618   return CLI->getAfterIP();
4619 }
4620
4621 OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::applyWorkshareLoop(
4622     DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
4623     bool NeedsBarrier, omp::ScheduleKind SchedKind, Value *ChunkSize,
4624     bool HasSimdModifier, bool HasMonotonicModifier,
4625     bool HasNonmonotonicModifier, bool HasOrderedClause,
4626     WorksharingLoopType LoopType) {
4627   if (Config.isTargetDevice())
4628     return applyWorkshareLoopTarget(DL, CLI, AllocaIP, LoopType);
4629   OMPScheduleType EffectiveScheduleType = computeOpenMPScheduleType(
4630       SchedKind, ChunkSize, HasSimdModifier, HasMonotonicModifier,
4631       HasNonmonotonicModifier, HasOrderedClause);
4632
4633   bool IsOrdered = (EffectiveScheduleType & OMPScheduleType::ModifierOrdered) ==
4634                    OMPScheduleType::ModifierOrdered;
4635   switch (EffectiveScheduleType & ~OMPScheduleType::ModifierMask) {
4636   case OMPScheduleType::BaseStatic:
4637     assert(!ChunkSize && "No chunk size with static-chunked schedule");
4638     if (IsOrdered)
4639       return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
4640                                        NeedsBarrier, ChunkSize);
4641     // FIXME: Monotonicity ignored?
4642     return applyStaticWorkshareLoop(DL, CLI, AllocaIP, NeedsBarrier);
4643
4644   case OMPScheduleType::BaseStaticChunked:
4645     if (IsOrdered)
4646       return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
4647                                        NeedsBarrier, ChunkSize);
4648     // FIXME: Monotonicity ignored?
4649     return applyStaticChunkedWorkshareLoop(DL, CLI, AllocaIP, NeedsBarrier,
4650                                            ChunkSize);
4651
4652   case OMPScheduleType::BaseRuntime:
4653   case OMPScheduleType::BaseAuto:
4654   case OMPScheduleType::BaseGreedy:
4655   case OMPScheduleType::BaseBalanced:
4656   case OMPScheduleType::BaseSteal:
4657   case OMPScheduleType::BaseGuidedSimd:
4658   case OMPScheduleType::BaseRuntimeSimd:
4659     assert(!ChunkSize &&
4660            "schedule type does not support user-defined chunk sizes");
4661     [[fallthrough]];
4662   case OMPScheduleType::BaseDynamicChunked:
4663   case OMPScheduleType::BaseGuidedChunked:
4664   case OMPScheduleType::BaseGuidedIterativeChunked:
4665   case OMPScheduleType::BaseGuidedAnalyticalChunked:
4666   case OMPScheduleType::BaseStaticBalancedChunked:
4667     return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
4668                                      NeedsBarrier, ChunkSize);
4669
4670   default:
4671     llvm_unreachable("Unknown/unimplemented schedule kind");
4672   }
4673 }
4674
4675 /// Returns an LLVM function to call for initializing loop bounds using OpenMP
4676 /// dynamic scheduling, depending on `type`. Only i32 and i64 are supported by
4677 /// the runtime. Always interpret integers as unsigned similarly to
4678 /// CanonicalLoopInfo.
4679 static FunctionCallee
4680 getKmpcForDynamicInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
4681   unsigned Bitwidth = Ty->getIntegerBitWidth();
4682   if (Bitwidth == 32)
4683     return OMPBuilder.getOrCreateRuntimeFunction(
4684         M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_4u);
4685   if (Bitwidth == 64)
4686     return OMPBuilder.getOrCreateRuntimeFunction(
4687         M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_8u);
4688   llvm_unreachable("unknown OpenMP loop iterator bitwidth");
4689 }
4690
4691 /// Returns an LLVM function to call for fetching the next chunk of a loop
4692 /// under OpenMP dynamic scheduling, depending on `type`. Only i32 and i64 are
4693 /// supported by the runtime. Always interpret integers as unsigned similarly
4694 /// to CanonicalLoopInfo.
4695 static FunctionCallee
4696 getKmpcForDynamicNextForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
4697   unsigned Bitwidth = Ty->getIntegerBitWidth();
4698   if (Bitwidth == 32)
4699     return OMPBuilder.getOrCreateRuntimeFunction(
4700         M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_4u);
4701   if (Bitwidth == 64)
4702     return OMPBuilder.getOrCreateRuntimeFunction(
4703         M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_8u);
4704   llvm_unreachable("unknown OpenMP loop iterator bitwidth");
4705 }
4706
4707 /// Returns an LLVM function to call for finalizing the dynamic loop,
4708 /// depending on `type`. Only i32 and i64 are supported by the runtime. Always
4709 /// interpret integers as unsigned similarly to CanonicalLoopInfo.
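/// For example, an i32 induction variable maps to __kmpc_dispatch_fini_4u and
/// an i64 one to __kmpc_dispatch_fini_8u.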
4710 static FunctionCallee 4711 getKmpcForDynamicFiniForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) { 4712 unsigned Bitwidth = Ty->getIntegerBitWidth(); 4713 if (Bitwidth == 32) 4714 return OMPBuilder.getOrCreateRuntimeFunction( 4715 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_fini_4u); 4716 if (Bitwidth == 64) 4717 return OMPBuilder.getOrCreateRuntimeFunction( 4718 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_fini_8u); 4719 llvm_unreachable("unknown OpenMP loop iterator bitwidth"); 4720 } 4721 4722 OpenMPIRBuilder::InsertPointOrErrorTy 4723 OpenMPIRBuilder::applyDynamicWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI, 4724 InsertPointTy AllocaIP, 4725 OMPScheduleType SchedType, 4726 bool NeedsBarrier, Value *Chunk) { 4727 assert(CLI->isValid() && "Requires a valid canonical loop"); 4728 assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) && 4729 "Require dedicated allocate IP"); 4730 assert(isValidWorkshareLoopScheduleType(SchedType) && 4731 "Require valid schedule type"); 4732 4733 bool Ordered = (SchedType & OMPScheduleType::ModifierOrdered) == 4734 OMPScheduleType::ModifierOrdered; 4735 4736 // Set up the source location value for OpenMP runtime. 4737 Builder.SetCurrentDebugLocation(DL); 4738 4739 uint32_t SrcLocStrSize; 4740 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize); 4741 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize); 4742 4743 // Declare useful OpenMP runtime functions. 4744 Value *IV = CLI->getIndVar(); 4745 Type *IVTy = IV->getType(); 4746 FunctionCallee DynamicInit = getKmpcForDynamicInitForType(IVTy, M, *this); 4747 FunctionCallee DynamicNext = getKmpcForDynamicNextForType(IVTy, M, *this); 4748 4749 // Allocate space for computed loop bounds as expected by the "init" function. 4750 Builder.SetInsertPoint(AllocaIP.getBlock()->getFirstNonPHIOrDbgOrAlloca()); 4751 Type *I32Type = Type::getInt32Ty(M.getContext()); 4752 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter"); 4753 Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound"); 4754 Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound"); 4755 Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride"); 4756 4757 // At the end of the preheader, prepare for calling the "init" function by 4758 // storing the current loop bounds into the allocated space. A canonical loop 4759 // always iterates from 0 to trip-count with step 1. Note that "init" expects 4760 // and produces an inclusive upper bound. 4761 BasicBlock *PreHeader = CLI->getPreheader(); 4762 Builder.SetInsertPoint(PreHeader->getTerminator()); 4763 Constant *One = ConstantInt::get(IVTy, 1); 4764 Builder.CreateStore(One, PLowerBound); 4765 Value *UpperBound = CLI->getTripCount(); 4766 Builder.CreateStore(UpperBound, PUpperBound); 4767 Builder.CreateStore(One, PStride); 4768 4769 BasicBlock *Header = CLI->getHeader(); 4770 BasicBlock *Exit = CLI->getExit(); 4771 BasicBlock *Cond = CLI->getCond(); 4772 BasicBlock *Latch = CLI->getLatch(); 4773 InsertPointTy AfterIP = CLI->getAfterIP(); 4774 4775 // The CLI will be "broken" in the code below, as the loop is no longer 4776 // a valid canonical loop. 4777 4778 if (!Chunk) 4779 Chunk = One; 4780 4781 Value *ThreadNum = getOrCreateThreadID(SrcLoc); 4782 4783 Constant *SchedulingType = 4784 ConstantInt::get(I32Type, static_cast<int>(SchedType)); 4785 4786 // Call the "init" function. 
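  // Roughly, the code emitted below has this shape (pseudo-code, i32 case):
  //   __kmpc_dispatch_init_4u(loc, tid, sched, /*lb=*/1, /*ub=*/tripcount,
  //                           /*st=*/1, chunk);
  //   while (__kmpc_dispatch_next_4u(loc, tid, &last, &lb, &ub, &st))
  //     for (iv = lb - 1; iv < ub; ++iv)
  //       body(iv);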
4787   Builder.CreateCall(DynamicInit,
4788                      {SrcLoc, ThreadNum, SchedulingType, /* LowerBound */ One,
4789                       UpperBound, /* step */ One, Chunk});
4790
4791   // An outer loop around the existing one.
4792   BasicBlock *OuterCond = BasicBlock::Create(
4793       PreHeader->getContext(), Twine(PreHeader->getName()) + ".outer.cond",
4794       PreHeader->getParent());
4795   // This needs to be 32-bit always, so can't use the IVTy Zero above.
4796   Builder.SetInsertPoint(OuterCond, OuterCond->getFirstInsertionPt());
4797   Value *Res =
4798       Builder.CreateCall(DynamicNext, {SrcLoc, ThreadNum, PLastIter,
4799                                        PLowerBound, PUpperBound, PStride});
4800   Constant *Zero32 = ConstantInt::get(I32Type, 0);
4801   Value *MoreWork = Builder.CreateCmp(CmpInst::ICMP_NE, Res, Zero32);
4802   Value *LowerBound =
4803       Builder.CreateSub(Builder.CreateLoad(IVTy, PLowerBound), One, "lb");
4804   Builder.CreateCondBr(MoreWork, Header, Exit);
4805
4806   // Change the PHI node in the loop header to use the outer cond rather than
4807   // the preheader, and set the IV to the LowerBound.
4808   Instruction *Phi = &Header->front();
4809   auto *PI = cast<PHINode>(Phi);
4810   PI->setIncomingBlock(0, OuterCond);
4811   PI->setIncomingValue(0, LowerBound);
4812
4813   // Then set the preheader to jump to the OuterCond.
4814   Instruction *Term = PreHeader->getTerminator();
4815   auto *Br = cast<BranchInst>(Term);
4816   Br->setSuccessor(0, OuterCond);
4817
4818   // Modify the inner condition:
4819   // * Use the UpperBound returned from the DynamicNext call.
4820   // * Jump to the outer loop when done with one of the inner loops.
4821   Builder.SetInsertPoint(Cond, Cond->getFirstInsertionPt());
4822   UpperBound = Builder.CreateLoad(IVTy, PUpperBound, "ub");
4823   Instruction *Comp = &*Builder.GetInsertPoint();
4824   auto *CI = cast<CmpInst>(Comp);
4825   CI->setOperand(1, UpperBound);
4826   // Redirect the inner exit to branch to the outer condition.
4827   Instruction *Branch = &Cond->back();
4828   auto *BI = cast<BranchInst>(Branch);
4829   assert(BI->getSuccessor(1) == Exit);
4830   BI->setSuccessor(1, OuterCond);
4831
4832   // Call the "fini" function if "ordered" is present in the wsloop directive.
4833   if (Ordered) {
4834     Builder.SetInsertPoint(&Latch->back());
4835     FunctionCallee DynamicFini = getKmpcForDynamicFiniForType(IVTy, M, *this);
4836     Builder.CreateCall(DynamicFini, {SrcLoc, ThreadNum});
4837   }
4838
4839   // Add the barrier if requested.
4840   if (NeedsBarrier) {
4841     Builder.SetInsertPoint(&Exit->back());
4842     InsertPointOrErrorTy BarrierIP =
4843         createBarrier(LocationDescription(Builder.saveIP(), DL),
4844                       omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
4845                       /* CheckCancelFlag */ false);
4846     if (!BarrierIP)
4847       return BarrierIP.takeError();
4848   }
4849
4850   CLI->invalidate();
4851   return AfterIP;
4852 }
4853
4854 /// Redirect all edges that branch to \p OldTarget to \p NewTarget. That is,
4855 /// after this \p OldTarget will be orphaned.
4856 static void redirectAllPredecessorsTo(BasicBlock *OldTarget,
4857                                       BasicBlock *NewTarget, DebugLoc DL) {
4858   for (BasicBlock *Pred : make_early_inc_range(predecessors(OldTarget)))
4859     redirectTo(Pred, NewTarget, DL);
4860 }
4861
4862 /// Determine which blocks in \p BBs are reachable from outside and remove from
4863 /// the function the ones that are not.
4864 static void removeUnusedBlocksFromParent(ArrayRef<BasicBlock *> BBs) {
4865   SmallPtrSet<BasicBlock *, 6> BBsToErase{BBs.begin(), BBs.end()};
4866   auto HasRemainingUses = [&BBsToErase](BasicBlock *BB) {
4867     for (Use &U : BB->uses()) {
4868       auto *UseInst = dyn_cast<Instruction>(U.getUser());
4869       if (!UseInst)
4870         continue;
4871       if (BBsToErase.count(UseInst->getParent()))
4872         continue;
4873       return true;
4874     }
4875     return false;
4876   };
4877
4878   while (BBsToErase.remove_if(HasRemainingUses)) {
4879     // Try again if anything was removed.
4880   }
4881
4882   SmallVector<BasicBlock *, 7> BBVec(BBsToErase.begin(), BBsToErase.end());
4883   DeleteDeadBlocks(BBVec);
4884 }
4885
4886 CanonicalLoopInfo *
4887 OpenMPIRBuilder::collapseLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops,
4888                                InsertPointTy ComputeIP) {
4889   assert(Loops.size() >= 1 && "At least one loop required");
4890   size_t NumLoops = Loops.size();
4891
4892   // Nothing to do if there is already just one loop.
4893   if (NumLoops == 1)
4894     return Loops.front();
4895
4896   CanonicalLoopInfo *Outermost = Loops.front();
4897   CanonicalLoopInfo *Innermost = Loops.back();
4898   BasicBlock *OrigPreheader = Outermost->getPreheader();
4899   BasicBlock *OrigAfter = Outermost->getAfter();
4900   Function *F = OrigPreheader->getParent();
4901
4902   // Loop control blocks that may become orphaned later.
4903   SmallVector<BasicBlock *, 12> OldControlBBs;
4904   OldControlBBs.reserve(6 * Loops.size());
4905   for (CanonicalLoopInfo *Loop : Loops)
4906     Loop->collectControlBlocks(OldControlBBs);
4907
4908   // Setup the IRBuilder for inserting the trip count computation.
4909   Builder.SetCurrentDebugLocation(DL);
4910   if (ComputeIP.isSet())
4911     Builder.restoreIP(ComputeIP);
4912   else
4913     Builder.restoreIP(Outermost->getPreheaderIP());
4914
4915   // Derive the collapsed loop's trip count.
4916   // TODO: Find common/largest indvar type.
4917   Value *CollapsedTripCount = nullptr;
4918   for (CanonicalLoopInfo *L : Loops) {
4919     assert(L->isValid() &&
4920            "All loops to collapse must be valid canonical loops");
4921     Value *OrigTripCount = L->getTripCount();
4922     if (!CollapsedTripCount) {
4923       CollapsedTripCount = OrigTripCount;
4924       continue;
4925     }
4926
4927     // TODO: Enable UndefinedSanitizer to diagnose an overflow here.
4928     CollapsedTripCount = Builder.CreateMul(CollapsedTripCount, OrigTripCount,
4929                                            {}, /*HasNUW=*/true);
4930   }
4931
4932   // Create the collapsed loop control flow.
4933   CanonicalLoopInfo *Result =
4934       createLoopSkeleton(DL, CollapsedTripCount, F,
4935                          OrigPreheader->getNextNode(), OrigAfter, "collapsed");
4936
4937   // Build the collapsed loop body code.
4938   // Start with deriving the input loop induction variables from the collapsed
4939   // one, using a divmod scheme. To preserve the original loops' order, the
4940   // innermost loop uses the least significant bits.
4941   Builder.restoreIP(Result->getBodyIP());
4942
4943   Value *Leftover = Result->getIndVar();
4944   SmallVector<Value *> NewIndVars;
4945   NewIndVars.resize(NumLoops);
4946   for (int i = NumLoops - 1; i >= 1; --i) {
4947     Value *OrigTripCount = Loops[i]->getTripCount();
4948
4949     Value *NewIndVar = Builder.CreateURem(Leftover, OrigTripCount);
4950     NewIndVars[i] = NewIndVar;
4951
4952     Leftover = Builder.CreateUDiv(Leftover, OrigTripCount);
4953   }
4954   // Outermost loop gets all the remaining bits.
4955   NewIndVars[0] = Leftover;
4956
4957   // Construct the loop body control flow.
4958   // We progressively construct the branch structure following the direction
4959   // of control flow: from the leading in-between code, through the loop nest
4960   // body and the trailing in-between code, to rejoining the collapsed loop's
4961   // latch. ContinueBlock and ContinuePred keep track of the source(s) of the
4962   // next edge. If ContinueBlock is set, continue with that block. If
4963   // ContinuePred is set, use its predecessors as sources.
4964   BasicBlock *ContinueBlock = Result->getBody();
4965   BasicBlock *ContinuePred = nullptr;
4966   auto ContinueWith = [&ContinueBlock, &ContinuePred, DL](BasicBlock *Dest,
4967                                                           BasicBlock *NextSrc) {
4968     if (ContinueBlock)
4969       redirectTo(ContinueBlock, Dest, DL);
4970     else
4971       redirectAllPredecessorsTo(ContinuePred, Dest, DL);
4972
4973     ContinueBlock = nullptr;
4974     ContinuePred = NextSrc;
4975   };
4976
4977   // The code before the nested loop of each level.
4978   // Because we are sinking it into the nest, it will be executed more often
4979   // than in the original loop. More sophisticated schemes could keep track of
4980   // what the in-between code is and instantiate it only once per thread.
4981   for (size_t i = 0; i < NumLoops - 1; ++i)
4982     ContinueWith(Loops[i]->getBody(), Loops[i + 1]->getHeader());
4983
4984   // Connect the loop nest body.
4985   ContinueWith(Innermost->getBody(), Innermost->getLatch());
4986
4987   // The code after the nested loop at each level.
4988   for (size_t i = NumLoops - 1; i > 0; --i)
4989     ContinueWith(Loops[i]->getAfter(), Loops[i - 1]->getLatch());
4990
4991   // Connect the finished loop to the collapsed loop latch.
4992   ContinueWith(Result->getLatch(), nullptr);
4993
4994   // Replace the input loops with the new collapsed loop.
4995   redirectTo(Outermost->getPreheader(), Result->getPreheader(), DL);
4996   redirectTo(Result->getAfter(), Outermost->getAfter(), DL);
4997
4998   // Replace the input loop indvars with the derived ones.
4999   for (size_t i = 0; i < NumLoops; ++i)
5000     Loops[i]->getIndVar()->replaceAllUsesWith(NewIndVars[i]);
5001
5002   // Remove unused parts of the input loops.
5003   removeUnusedBlocksFromParent(OldControlBBs);
5004
5005   for (CanonicalLoopInfo *L : Loops)
5006     L->invalidate();
5007
5008 #ifndef NDEBUG
5009   Result->assertOK();
5010 #endif
5011   return Result;
5012 }
5013
5014 std::vector<CanonicalLoopInfo *>
5015 OpenMPIRBuilder::tileLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops,
5016                            ArrayRef<Value *> TileSizes) {
5017   assert(TileSizes.size() == Loops.size() &&
5018          "Must pass as many tile sizes as there are loops");
5019   int NumLoops = Loops.size();
5020   assert(NumLoops >= 1 && "At least one loop to tile required");
5021
5022   CanonicalLoopInfo *OutermostLoop = Loops.front();
5023   CanonicalLoopInfo *InnermostLoop = Loops.back();
5024   Function *F = OutermostLoop->getBody()->getParent();
5025   BasicBlock *InnerEnter = InnermostLoop->getBody();
5026   BasicBlock *InnerLatch = InnermostLoop->getLatch();
5027
5028   // Loop control blocks that may become orphaned later.
5029   SmallVector<BasicBlock *, 12> OldControlBBs;
5030   OldControlBBs.reserve(6 * Loops.size());
5031   for (CanonicalLoopInfo *Loop : Loops)
5032     Loop->collectControlBlocks(OldControlBBs);
5033
5034   // Collect original trip counts and induction variables to be accessible by
5035   // index. Also, the structure of the original loops is not preserved during
5036   // the construction of the tiled loops, so do it before we scavenge the BBs
5037   // of any original CanonicalLoopInfo.
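  // As a sketch of the overall transformation: tiling a single loop
  //   for (i = 0; i < N; ++i) body(i);
  // with tile size T conceptually produces
  //   for (f = 0; f < ceil(N / T); ++f)          // "floor" loop
  //     for (t = 0; t < min(T, N - f * T); ++t)  // "tile" loop
  //       body(f * T + t);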
5038   SmallVector<Value *, 4> OrigTripCounts, OrigIndVars;
5039   for (CanonicalLoopInfo *L : Loops) {
5040     assert(L->isValid() && "All input loops must be valid canonical loops");
5041     OrigTripCounts.push_back(L->getTripCount());
5042     OrigIndVars.push_back(L->getIndVar());
5043   }
5044
5045   // Collect the code between loop headers. These may contain SSA definitions
5046   // that are used in the loop nest body. To be usable within the innermost
5047   // body, these BasicBlocks will be sunk into the loop nest body. That is,
5048   // these instructions may be executed more often than before the tiling.
5049   // TODO: It would be sufficient to only sink them into the body of the
5050   // corresponding tile loop.
5051   SmallVector<std::pair<BasicBlock *, BasicBlock *>, 4> InbetweenCode;
5052   for (int i = 0; i < NumLoops - 1; ++i) {
5053     CanonicalLoopInfo *Surrounding = Loops[i];
5054     CanonicalLoopInfo *Nested = Loops[i + 1];
5055
5056     BasicBlock *EnterBB = Surrounding->getBody();
5057     BasicBlock *ExitBB = Nested->getHeader();
5058     InbetweenCode.emplace_back(EnterBB, ExitBB);
5059   }
5060
5061   // Compute the trip counts of the floor loops.
5062   Builder.SetCurrentDebugLocation(DL);
5063   Builder.restoreIP(OutermostLoop->getPreheaderIP());
5064   SmallVector<Value *, 4> FloorCount, FloorRems;
5065   for (int i = 0; i < NumLoops; ++i) {
5066     Value *TileSize = TileSizes[i];
5067     Value *OrigTripCount = OrigTripCounts[i];
5068     Type *IVType = OrigTripCount->getType();
5069
5070     Value *FloorTripCount = Builder.CreateUDiv(OrigTripCount, TileSize);
5071     Value *FloorTripRem = Builder.CreateURem(OrigTripCount, TileSize);
5072
5073     // 0 if the tilesize divides the tripcount, 1 otherwise.
5074     // 1 means we need an additional iteration for a partial tile.
5075     //
5076     // Unfortunately we cannot just use the roundup-formula
5077     //   (tripcount + tilesize - 1) / tilesize
5078     // because the summation might overflow. We do not want to introduce
5079     // undefined behavior when the untiled loop nest did not have any.
5080     Value *FloorTripOverflow =
5081         Builder.CreateICmpNE(FloorTripRem, ConstantInt::get(IVType, 0));
5082
5083     FloorTripOverflow = Builder.CreateZExt(FloorTripOverflow, IVType);
5084     FloorTripCount =
5085         Builder.CreateAdd(FloorTripCount, FloorTripOverflow,
5086                           "omp_floor" + Twine(i) + ".tripcount", true);
5087
5088     // Remember some values for later use.
5089     FloorCount.push_back(FloorTripCount);
5090     FloorRems.push_back(FloorTripRem);
5091   }
5092
5093   // Generate the new loop nest, from the outermost to the innermost.
5094   std::vector<CanonicalLoopInfo *> Result;
5095   Result.reserve(NumLoops * 2);
5096
5097   // The basic block of the surrounding loop that enters the generated loop
5098   // nest.
5099   BasicBlock *Enter = OutermostLoop->getPreheader();
5100
5101   // The basic block of the surrounding loop where the inner code should
5102   // continue.
5103   BasicBlock *Continue = OutermostLoop->getAfter();
5104
5105   // Where the next loop basic block should be inserted.
5106   BasicBlock *OutroInsertBefore = InnermostLoop->getExit();
5107
5108   auto EmbeddNewLoop =
5109       [this, DL, F, InnerEnter, &Enter, &Continue, &OutroInsertBefore](
5110           Value *TripCount, const Twine &Name) -> CanonicalLoopInfo * {
5111     CanonicalLoopInfo *EmbeddedLoop = createLoopSkeleton(
5112         DL, TripCount, F, InnerEnter, OutroInsertBefore, Name);
5113     redirectTo(Enter, EmbeddedLoop->getPreheader(), DL);
5114     redirectTo(EmbeddedLoop->getAfter(), Continue, DL);
5115
5116     // Setup the position where the next embedded loop connects to this loop.
5117 Enter = EmbeddedLoop->getBody(); 5118 Continue = EmbeddedLoop->getLatch(); 5119 OutroInsertBefore = EmbeddedLoop->getLatch(); 5120 return EmbeddedLoop; 5121 }; 5122 5123 auto EmbeddNewLoops = [&Result, &EmbeddNewLoop](ArrayRef<Value *> TripCounts, 5124 const Twine &NameBase) { 5125 for (auto P : enumerate(TripCounts)) { 5126 CanonicalLoopInfo *EmbeddedLoop = 5127 EmbeddNewLoop(P.value(), NameBase + Twine(P.index())); 5128 Result.push_back(EmbeddedLoop); 5129 } 5130 }; 5131 5132 EmbeddNewLoops(FloorCount, "floor"); 5133 5134 // Within the innermost floor loop, emit the code that computes the tile 5135 // sizes. 5136 Builder.SetInsertPoint(Enter->getTerminator()); 5137 SmallVector<Value *, 4> TileCounts; 5138 for (int i = 0; i < NumLoops; ++i) { 5139 CanonicalLoopInfo *FloorLoop = Result[i]; 5140 Value *TileSize = TileSizes[i]; 5141 5142 Value *FloorIsEpilogue = 5143 Builder.CreateICmpEQ(FloorLoop->getIndVar(), FloorCount[i]); 5144 Value *TileTripCount = 5145 Builder.CreateSelect(FloorIsEpilogue, FloorRems[i], TileSize); 5146 5147 TileCounts.push_back(TileTripCount); 5148 } 5149 5150 // Create the tile loops. 5151 EmbeddNewLoops(TileCounts, "tile"); 5152 5153 // Insert the inbetween code into the body. 5154 BasicBlock *BodyEnter = Enter; 5155 BasicBlock *BodyEntered = nullptr; 5156 for (std::pair<BasicBlock *, BasicBlock *> P : InbetweenCode) { 5157 BasicBlock *EnterBB = P.first; 5158 BasicBlock *ExitBB = P.second; 5159 5160 if (BodyEnter) 5161 redirectTo(BodyEnter, EnterBB, DL); 5162 else 5163 redirectAllPredecessorsTo(BodyEntered, EnterBB, DL); 5164 5165 BodyEnter = nullptr; 5166 BodyEntered = ExitBB; 5167 } 5168 5169 // Append the original loop nest body into the generated loop nest body. 5170 if (BodyEnter) 5171 redirectTo(BodyEnter, InnerEnter, DL); 5172 else 5173 redirectAllPredecessorsTo(BodyEntered, InnerEnter, DL); 5174 redirectAllPredecessorsTo(InnerLatch, Continue, DL); 5175 5176 // Replace the original induction variable with an induction variable computed 5177 // from the tile and floor induction variables. 5178 Builder.restoreIP(Result.back()->getBodyIP()); 5179 for (int i = 0; i < NumLoops; ++i) { 5180 CanonicalLoopInfo *FloorLoop = Result[i]; 5181 CanonicalLoopInfo *TileLoop = Result[NumLoops + i]; 5182 Value *OrigIndVar = OrigIndVars[i]; 5183 Value *Size = TileSizes[i]; 5184 5185 Value *Scale = 5186 Builder.CreateMul(Size, FloorLoop->getIndVar(), {}, /*HasNUW=*/true); 5187 Value *Shift = 5188 Builder.CreateAdd(Scale, TileLoop->getIndVar(), {}, /*HasNUW=*/true); 5189 OrigIndVar->replaceAllUsesWith(Shift); 5190 } 5191 5192 // Remove unused parts of the original loops. 5193 removeUnusedBlocksFromParent(OldControlBBs); 5194 5195 for (CanonicalLoopInfo *L : Loops) 5196 L->invalidate(); 5197 5198 #ifndef NDEBUG 5199 for (CanonicalLoopInfo *GenL : Result) 5200 GenL->assertOK(); 5201 #endif 5202 return Result; 5203 } 5204 5205 /// Attach metadata \p Properties to the basic block described by \p BB. If the 5206 /// basic block already has metadata, the basic block properties are appended. 5207 static void addBasicBlockMetadata(BasicBlock *BB, 5208 ArrayRef<Metadata *> Properties) { 5209 // Nothing to do if no property to attach. 5210 if (Properties.empty()) 5211 return; 5212 5213 LLVMContext &Ctx = BB->getContext(); 5214 SmallVector<Metadata *> NewProperties; 5215 NewProperties.push_back(nullptr); 5216 5217 // If the basic block already has metadata, prepend it to the new metadata. 
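  // The end result is the usual self-referential loop-metadata form, e.g.:
  //   br label %header, !llvm.loop !0
  //   !0 = distinct !{!0, !1}
  //   !1 = !{!"llvm.loop.unroll.enable"}
  // where operand 0 of !0 refers back to !0 itself.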
5218 MDNode *Existing = BB->getTerminator()->getMetadata(LLVMContext::MD_loop); 5219 if (Existing) 5220 append_range(NewProperties, drop_begin(Existing->operands(), 1)); 5221 5222 append_range(NewProperties, Properties); 5223 MDNode *BasicBlockID = MDNode::getDistinct(Ctx, NewProperties); 5224 BasicBlockID->replaceOperandWith(0, BasicBlockID); 5225 5226 BB->getTerminator()->setMetadata(LLVMContext::MD_loop, BasicBlockID); 5227 } 5228 5229 /// Attach loop metadata \p Properties to the loop described by \p Loop. If the 5230 /// loop already has metadata, the loop properties are appended. 5231 static void addLoopMetadata(CanonicalLoopInfo *Loop, 5232 ArrayRef<Metadata *> Properties) { 5233 assert(Loop->isValid() && "Expecting a valid CanonicalLoopInfo"); 5234 5235 // Attach metadata to the loop's latch 5236 BasicBlock *Latch = Loop->getLatch(); 5237 assert(Latch && "A valid CanonicalLoopInfo must have a unique latch"); 5238 addBasicBlockMetadata(Latch, Properties); 5239 } 5240 5241 /// Attach llvm.access.group metadata to the memref instructions of \p Block 5242 static void addSimdMetadata(BasicBlock *Block, MDNode *AccessGroup, 5243 LoopInfo &LI) { 5244 for (Instruction &I : *Block) { 5245 if (I.mayReadOrWriteMemory()) { 5246 // TODO: This instruction may already have access group from 5247 // other pragmas e.g. #pragma clang loop vectorize. Append 5248 // so that the existing metadata is not overwritten. 5249 I.setMetadata(LLVMContext::MD_access_group, AccessGroup); 5250 } 5251 } 5252 } 5253 5254 void OpenMPIRBuilder::unrollLoopFull(DebugLoc, CanonicalLoopInfo *Loop) { 5255 LLVMContext &Ctx = Builder.getContext(); 5256 addLoopMetadata( 5257 Loop, {MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")), 5258 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.full"))}); 5259 } 5260 5261 void OpenMPIRBuilder::unrollLoopHeuristic(DebugLoc, CanonicalLoopInfo *Loop) { 5262 LLVMContext &Ctx = Builder.getContext(); 5263 addLoopMetadata( 5264 Loop, { 5265 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")), 5266 }); 5267 } 5268 5269 void OpenMPIRBuilder::createIfVersion(CanonicalLoopInfo *CanonicalLoop, 5270 Value *IfCond, ValueToValueMapTy &VMap, 5271 const Twine &NamePrefix) { 5272 Function *F = CanonicalLoop->getFunction(); 5273 5274 // Define where if branch should be inserted 5275 Instruction *SplitBefore = CanonicalLoop->getPreheader()->getTerminator(); 5276 5277 // TODO: We should not rely on pass manager. Currently we use pass manager 5278 // only for getting llvm::Loop which corresponds to given CanonicalLoopInfo 5279 // object. 
We should have a method which returns all blocks between 5280 // CanonicalLoopInfo::getHeader() and CanonicalLoopInfo::getAfter() 5281 FunctionAnalysisManager FAM; 5282 FAM.registerPass([]() { return DominatorTreeAnalysis(); }); 5283 FAM.registerPass([]() { return LoopAnalysis(); }); 5284 FAM.registerPass([]() { return PassInstrumentationAnalysis(); }); 5285 5286 // Get the loop which needs to be cloned 5287 LoopAnalysis LIA; 5288 LoopInfo &&LI = LIA.run(*F, FAM); 5289 Loop *L = LI.getLoopFor(CanonicalLoop->getHeader()); 5290 5291 // Create additional blocks for the if statement 5292 BasicBlock *Head = SplitBefore->getParent(); 5293 Instruction *HeadOldTerm = Head->getTerminator(); 5294 llvm::LLVMContext &C = Head->getContext(); 5295 llvm::BasicBlock *ThenBlock = llvm::BasicBlock::Create( 5296 C, NamePrefix + ".if.then", Head->getParent(), Head->getNextNode()); 5297 llvm::BasicBlock *ElseBlock = llvm::BasicBlock::Create( 5298 C, NamePrefix + ".if.else", Head->getParent(), CanonicalLoop->getExit()); 5299 5300 // Create if condition branch. 5301 Builder.SetInsertPoint(HeadOldTerm); 5302 Instruction *BrInstr = 5303 Builder.CreateCondBr(IfCond, ThenBlock, /*ifFalse*/ ElseBlock); 5304 InsertPointTy IP{BrInstr->getParent(), ++BrInstr->getIterator()}; 5305 // Then block contains branch to omp loop which needs to be vectorized 5306 spliceBB(IP, ThenBlock, false); 5307 ThenBlock->replaceSuccessorsPhiUsesWith(Head, ThenBlock); 5308 5309 Builder.SetInsertPoint(ElseBlock); 5310 5311 // Clone loop for the else branch 5312 SmallVector<BasicBlock *, 8> NewBlocks; 5313 5314 VMap[CanonicalLoop->getPreheader()] = ElseBlock; 5315 for (BasicBlock *Block : L->getBlocks()) { 5316 BasicBlock *NewBB = CloneBasicBlock(Block, VMap, "", F); 5317 NewBB->moveBefore(CanonicalLoop->getExit()); 5318 VMap[Block] = NewBB; 5319 NewBlocks.push_back(NewBB); 5320 } 5321 remapInstructionsInBlocks(NewBlocks, VMap); 5322 Builder.CreateBr(NewBlocks.front()); 5323 } 5324 5325 unsigned 5326 OpenMPIRBuilder::getOpenMPDefaultSimdAlign(const Triple &TargetTriple, 5327 const StringMap<bool> &Features) { 5328 if (TargetTriple.isX86()) { 5329 if (Features.lookup("avx512f")) 5330 return 512; 5331 else if (Features.lookup("avx")) 5332 return 256; 5333 return 128; 5334 } 5335 if (TargetTriple.isPPC()) 5336 return 128; 5337 if (TargetTriple.isWasm()) 5338 return 128; 5339 return 0; 5340 } 5341 5342 void OpenMPIRBuilder::applySimd(CanonicalLoopInfo *CanonicalLoop, 5343 MapVector<Value *, Value *> AlignedVars, 5344 Value *IfCond, OrderKind Order, 5345 ConstantInt *Simdlen, ConstantInt *Safelen) { 5346 LLVMContext &Ctx = Builder.getContext(); 5347 5348 Function *F = CanonicalLoop->getFunction(); 5349 5350 // TODO: We should not rely on pass manager. Currently we use pass manager 5351 // only for getting llvm::Loop which corresponds to given CanonicalLoopInfo 5352 // object. 
We should have a method which returns all blocks between 5353 // CanonicalLoopInfo::getHeader() and CanonicalLoopInfo::getAfter() 5354 FunctionAnalysisManager FAM; 5355 FAM.registerPass([]() { return DominatorTreeAnalysis(); }); 5356 FAM.registerPass([]() { return LoopAnalysis(); }); 5357 FAM.registerPass([]() { return PassInstrumentationAnalysis(); }); 5358 5359 LoopAnalysis LIA; 5360 LoopInfo &&LI = LIA.run(*F, FAM); 5361 5362 Loop *L = LI.getLoopFor(CanonicalLoop->getHeader()); 5363 if (AlignedVars.size()) { 5364 InsertPointTy IP = Builder.saveIP(); 5365 for (auto &AlignedItem : AlignedVars) { 5366 Value *AlignedPtr = AlignedItem.first; 5367 Value *Alignment = AlignedItem.second; 5368 Instruction *loadInst = dyn_cast<Instruction>(AlignedPtr); 5369 Builder.SetInsertPoint(loadInst->getNextNode()); 5370 Builder.CreateAlignmentAssumption(F->getDataLayout(), AlignedPtr, 5371 Alignment); 5372 } 5373 Builder.restoreIP(IP); 5374 } 5375 5376 if (IfCond) { 5377 ValueToValueMapTy VMap; 5378 createIfVersion(CanonicalLoop, IfCond, VMap, "simd"); 5379 // Add metadata to the cloned loop which disables vectorization 5380 Value *MappedLatch = VMap.lookup(CanonicalLoop->getLatch()); 5381 assert(MappedLatch && 5382 "Cannot find value which corresponds to original loop latch"); 5383 assert(isa<BasicBlock>(MappedLatch) && 5384 "Cannot cast mapped latch block value to BasicBlock"); 5385 BasicBlock *NewLatchBlock = dyn_cast<BasicBlock>(MappedLatch); 5386 ConstantAsMetadata *BoolConst = 5387 ConstantAsMetadata::get(ConstantInt::getFalse(Type::getInt1Ty(Ctx))); 5388 addBasicBlockMetadata( 5389 NewLatchBlock, 5390 {MDNode::get(Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.enable"), 5391 BoolConst})}); 5392 } 5393 5394 SmallSet<BasicBlock *, 8> Reachable; 5395 5396 // Get the basic blocks from the loop in which memref instructions 5397 // can be found. 5398 // TODO: Generalize getting all blocks inside a CanonicalizeLoopInfo, 5399 // preferably without running any passes. 5400 for (BasicBlock *Block : L->getBlocks()) { 5401 if (Block == CanonicalLoop->getCond() || 5402 Block == CanonicalLoop->getHeader()) 5403 continue; 5404 Reachable.insert(Block); 5405 } 5406 5407 SmallVector<Metadata *> LoopMDList; 5408 5409 // In presence of finite 'safelen', it may be unsafe to mark all 5410 // the memory instructions parallel, because loop-carried 5411 // dependences of 'safelen' iterations are possible. 5412 // If clause order(concurrent) is specified then the memory instructions 5413 // are marked parallel even if 'safelen' is finite. 5414 if ((Safelen == nullptr) || (Order == OrderKind::OMP_ORDER_concurrent)) { 5415 // Add access group metadata to memory-access instructions. 5416 MDNode *AccessGroup = MDNode::getDistinct(Ctx, {}); 5417 for (BasicBlock *BB : Reachable) 5418 addSimdMetadata(BB, AccessGroup, LI); 5419 // TODO: If the loop has existing parallel access metadata, have 5420 // to combine two lists. 5421 LoopMDList.push_back(MDNode::get( 5422 Ctx, {MDString::get(Ctx, "llvm.loop.parallel_accesses"), AccessGroup})); 5423 } 5424 5425 // Use the above access group metadata to create loop level 5426 // metadata, which should be distinct for each loop. 
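  // For a loop with order(concurrent) and simdlen(8), for example, the latch
  // terminator ends up with metadata roughly like:
  //   !llvm.loop !0
  //   !0 = distinct !{!0, !{!"llvm.loop.parallel_accesses", !AG},
  //                   !{!"llvm.loop.vectorize.enable", i1 true},
  //                   !{!"llvm.loop.vectorize.width", i32 8}}
  // where !AG is the distinct access group attached above.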
5427   ConstantAsMetadata *BoolConst =
5428       ConstantAsMetadata::get(ConstantInt::getTrue(Type::getInt1Ty(Ctx)));
5429   LoopMDList.push_back(MDNode::get(
5430       Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.enable"), BoolConst}));
5431
5432   if (Simdlen || Safelen) {
5433     // If both simdlen and safelen clauses are specified, the value of the
5434     // simdlen parameter must be less than or equal to the value of the safelen
5435     // parameter. Therefore, use safelen only in the absence of simdlen.
5436     ConstantInt *VectorizeWidth = Simdlen == nullptr ? Safelen : Simdlen;
5437     LoopMDList.push_back(
5438         MDNode::get(Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.width"),
5439                           ConstantAsMetadata::get(VectorizeWidth)}));
5440   }
5441
5442   addLoopMetadata(CanonicalLoop, LoopMDList);
5443 }
5444
5445 /// Create the TargetMachine object to query the backend for optimization
5446 /// preferences.
5447 ///
5448 /// Ideally, this would be passed from the front-end to the OpenMPBuilder, but
5449 /// e.g. Clang does not pass it to its CodeGen layer and creates it only when
5450 /// needed for the LLVM pass pipeline. We use some default options to avoid
5451 /// having to pass too many settings from the frontend that probably do not
5452 /// matter.
5453 ///
5454 /// Currently, TargetMachine is only used sometimes by the unrollLoopPartial
5455 /// method. If we are going to use TargetMachine for more purposes, especially
5456 /// those that are sensitive to TargetOptions, RelocModel and CodeModel, it
5457 /// might be worth requiring front-ends to pass on their TargetMachine, or at
5458 /// least cache it between methods. Note that while front-ends such as Clang
5459 /// have just a single main TargetMachine per translation unit, "target-cpu"
5460 /// and "target-features" that determine the TargetMachine are per-function
5461 /// and can be overridden using __attribute__((target("OPTIONS"))).
5462 static std::unique_ptr<TargetMachine>
5463 createTargetMachine(Function *F, CodeGenOptLevel OptLevel) {
5464   Module *M = F->getParent();
5465
5466   StringRef CPU = F->getFnAttribute("target-cpu").getValueAsString();
5467   StringRef Features = F->getFnAttribute("target-features").getValueAsString();
5468   const std::string &Triple = M->getTargetTriple();
5469
5470   std::string Error;
5471   const llvm::Target *TheTarget = TargetRegistry::lookupTarget(Triple, Error);
5472   if (!TheTarget)
5473     return {};
5474
5475   llvm::TargetOptions Options;
5476   return std::unique_ptr<TargetMachine>(TheTarget->createTargetMachine(
5477       Triple, CPU, Features, Options, /*RelocModel=*/std::nullopt,
5478       /*CodeModel=*/std::nullopt, OptLevel));
5479 }
5480
5481 /// Heuristically determine the best-performing unroll factor for \p CLI. This
5482 /// depends on the target processor. We are re-using the same heuristics as the
5483 /// LoopUnrollPass.
5484 static int32_t computeHeuristicUnrollFactor(CanonicalLoopInfo *CLI) {
5485   Function *F = CLI->getFunction();
5486
5487   // Assume the user requests the most aggressive unrolling, even if the rest
5488   // of the code is optimized using a lower setting.
5489 CodeGenOptLevel OptLevel = CodeGenOptLevel::Aggressive; 5490 std::unique_ptr<TargetMachine> TM = createTargetMachine(F, OptLevel); 5491 5492 FunctionAnalysisManager FAM; 5493 FAM.registerPass([]() { return TargetLibraryAnalysis(); }); 5494 FAM.registerPass([]() { return AssumptionAnalysis(); }); 5495 FAM.registerPass([]() { return DominatorTreeAnalysis(); }); 5496 FAM.registerPass([]() { return LoopAnalysis(); }); 5497 FAM.registerPass([]() { return ScalarEvolutionAnalysis(); }); 5498 FAM.registerPass([]() { return PassInstrumentationAnalysis(); }); 5499 TargetIRAnalysis TIRA; 5500 if (TM) 5501 TIRA = TargetIRAnalysis( 5502 [&](const Function &F) { return TM->getTargetTransformInfo(F); }); 5503 FAM.registerPass([&]() { return TIRA; }); 5504 5505 TargetIRAnalysis::Result &&TTI = TIRA.run(*F, FAM); 5506 ScalarEvolutionAnalysis SEA; 5507 ScalarEvolution &&SE = SEA.run(*F, FAM); 5508 DominatorTreeAnalysis DTA; 5509 DominatorTree &&DT = DTA.run(*F, FAM); 5510 LoopAnalysis LIA; 5511 LoopInfo &&LI = LIA.run(*F, FAM); 5512 AssumptionAnalysis ACT; 5513 AssumptionCache &&AC = ACT.run(*F, FAM); 5514 OptimizationRemarkEmitter ORE{F}; 5515 5516 Loop *L = LI.getLoopFor(CLI->getHeader()); 5517 assert(L && "Expecting CanonicalLoopInfo to be recognized as a loop"); 5518 5519 TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences( 5520 L, SE, TTI, 5521 /*BlockFrequencyInfo=*/nullptr, 5522 /*ProfileSummaryInfo=*/nullptr, ORE, static_cast<int>(OptLevel), 5523 /*UserThreshold=*/std::nullopt, 5524 /*UserCount=*/std::nullopt, 5525 /*UserAllowPartial=*/true, 5526 /*UserAllowRuntime=*/true, 5527 /*UserUpperBound=*/std::nullopt, 5528 /*UserFullUnrollMaxCount=*/std::nullopt); 5529 5530 UP.Force = true; 5531 5532 // Account for additional optimizations taking place before the LoopUnrollPass 5533 // would unroll the loop. 5534 UP.Threshold *= UnrollThresholdFactor; 5535 UP.PartialThreshold *= UnrollThresholdFactor; 5536 5537 // Use normal unroll factors even if the rest of the code is optimized for 5538 // size. 5539 UP.OptSizeThreshold = UP.Threshold; 5540 UP.PartialOptSizeThreshold = UP.PartialThreshold; 5541 5542 LLVM_DEBUG(dbgs() << "Unroll heuristic thresholds:\n" 5543 << " Threshold=" << UP.Threshold << "\n" 5544 << " PartialThreshold=" << UP.PartialThreshold << "\n" 5545 << " OptSizeThreshold=" << UP.OptSizeThreshold << "\n" 5546 << " PartialOptSizeThreshold=" 5547 << UP.PartialOptSizeThreshold << "\n"); 5548 5549 // Disable peeling. 5550 TargetTransformInfo::PeelingPreferences PP = 5551 gatherPeelingPreferences(L, SE, TTI, 5552 /*UserAllowPeeling=*/false, 5553 /*UserAllowProfileBasedPeeling=*/false, 5554 /*UnrollingSpecficValues=*/false); 5555 5556 SmallPtrSet<const Value *, 32> EphValues; 5557 CodeMetrics::collectEphemeralValues(L, &AC, EphValues); 5558 5559 // Assume that reads and writes to stack variables can be eliminated by 5560 // Mem2Reg, SROA or LICM. That is, don't count them towards the loop body's 5561 // size. 
5562 for (BasicBlock *BB : L->blocks()) { 5563 for (Instruction &I : *BB) { 5564 Value *Ptr; 5565 if (auto *Load = dyn_cast<LoadInst>(&I)) { 5566 Ptr = Load->getPointerOperand(); 5567 } else if (auto *Store = dyn_cast<StoreInst>(&I)) { 5568 Ptr = Store->getPointerOperand(); 5569 } else 5570 continue; 5571 5572 Ptr = Ptr->stripPointerCasts(); 5573 5574 if (auto *Alloca = dyn_cast<AllocaInst>(Ptr)) { 5575 if (Alloca->getParent() == &F->getEntryBlock()) 5576 EphValues.insert(&I); 5577 } 5578 } 5579 } 5580 5581 UnrollCostEstimator UCE(L, TTI, EphValues, UP.BEInsns); 5582 5583 // Loop is not unrollable if the loop contains certain instructions. 5584 if (!UCE.canUnroll()) { 5585 LLVM_DEBUG(dbgs() << "Loop not considered unrollable\n"); 5586 return 1; 5587 } 5588 5589 LLVM_DEBUG(dbgs() << "Estimated loop size is " << UCE.getRolledLoopSize() 5590 << "\n"); 5591 5592 // TODO: Determine trip count of \p CLI if constant, computeUnrollCount might 5593 // be able to use it. 5594 int TripCount = 0; 5595 int MaxTripCount = 0; 5596 bool MaxOrZero = false; 5597 unsigned TripMultiple = 0; 5598 5599 bool UseUpperBound = false; 5600 computeUnrollCount(L, TTI, DT, &LI, &AC, SE, EphValues, &ORE, TripCount, 5601 MaxTripCount, MaxOrZero, TripMultiple, UCE, UP, PP, 5602 UseUpperBound); 5603 unsigned Factor = UP.Count; 5604 LLVM_DEBUG(dbgs() << "Suggesting unroll factor of " << Factor << "\n"); 5605 5606 // This function returns 1 to signal to not unroll a loop. 5607 if (Factor == 0) 5608 return 1; 5609 return Factor; 5610 } 5611 5612 void OpenMPIRBuilder::unrollLoopPartial(DebugLoc DL, CanonicalLoopInfo *Loop, 5613 int32_t Factor, 5614 CanonicalLoopInfo **UnrolledCLI) { 5615 assert(Factor >= 0 && "Unroll factor must not be negative"); 5616 5617 Function *F = Loop->getFunction(); 5618 LLVMContext &Ctx = F->getContext(); 5619 5620 // If the unrolled loop is not used for another loop-associated directive, it 5621 // is sufficient to add metadata for the LoopUnrollPass. 5622 if (!UnrolledCLI) { 5623 SmallVector<Metadata *, 2> LoopMetadata; 5624 LoopMetadata.push_back( 5625 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable"))); 5626 5627 if (Factor >= 1) { 5628 ConstantAsMetadata *FactorConst = ConstantAsMetadata::get( 5629 ConstantInt::get(Type::getInt32Ty(Ctx), APInt(32, Factor))); 5630 LoopMetadata.push_back(MDNode::get( 5631 Ctx, {MDString::get(Ctx, "llvm.loop.unroll.count"), FactorConst})); 5632 } 5633 5634 addLoopMetadata(Loop, LoopMetadata); 5635 return; 5636 } 5637 5638 // Heuristically determine the unroll factor. 5639 if (Factor == 0) 5640 Factor = computeHeuristicUnrollFactor(Loop); 5641 5642 // No change required with unroll factor 1. 5643 if (Factor == 1) { 5644 *UnrolledCLI = Loop; 5645 return; 5646 } 5647 5648 assert(Factor >= 2 && 5649 "unrolling only makes sense with a factor of 2 or larger"); 5650 5651 Type *IndVarTy = Loop->getIndVarType(); 5652 5653 // Apply partial unrolling by tiling the loop by the unroll-factor, then fully 5654 // unroll the inner loop. 5655 Value *FactorVal = 5656 ConstantInt::get(IndVarTy, APInt(IndVarTy->getIntegerBitWidth(), Factor, 5657 /*isSigned=*/false)); 5658 std::vector<CanonicalLoopInfo *> LoopNest = 5659 tileLoops(DL, {Loop}, {FactorVal}); 5660 assert(LoopNest.size() == 2 && "Expect 2 loops after tiling"); 5661 *UnrolledCLI = LoopNest[0]; 5662 CanonicalLoopInfo *InnerLoop = LoopNest[1]; 5663 5664 // LoopUnrollPass can only fully unroll loops with constant trip count. 
5665 // Unroll by the unroll factor with a fallback epilog for the remainder 5666 // iterations if necessary. 5667 ConstantAsMetadata *FactorConst = ConstantAsMetadata::get( 5668 ConstantInt::get(Type::getInt32Ty(Ctx), APInt(32, Factor))); 5669 addLoopMetadata( 5670 InnerLoop, 5671 {MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")), 5672 MDNode::get( 5673 Ctx, {MDString::get(Ctx, "llvm.loop.unroll.count"), FactorConst})}); 5674 5675 #ifndef NDEBUG 5676 (*UnrolledCLI)->assertOK(); 5677 #endif 5678 } 5679 5680 OpenMPIRBuilder::InsertPointTy 5681 OpenMPIRBuilder::createCopyPrivate(const LocationDescription &Loc, 5682 llvm::Value *BufSize, llvm::Value *CpyBuf, 5683 llvm::Value *CpyFn, llvm::Value *DidIt) { 5684 if (!updateToLocation(Loc)) 5685 return Loc.IP; 5686 5687 uint32_t SrcLocStrSize; 5688 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize); 5689 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize); 5690 Value *ThreadId = getOrCreateThreadID(Ident); 5691 5692 llvm::Value *DidItLD = Builder.CreateLoad(Builder.getInt32Ty(), DidIt); 5693 5694 Value *Args[] = {Ident, ThreadId, BufSize, CpyBuf, CpyFn, DidItLD}; 5695 5696 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_copyprivate); 5697 Builder.CreateCall(Fn, Args); 5698 5699 return Builder.saveIP(); 5700 } 5701 5702 OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createSingle( 5703 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, 5704 FinalizeCallbackTy FiniCB, bool IsNowait, ArrayRef<llvm::Value *> CPVars, 5705 ArrayRef<llvm::Function *> CPFuncs) { 5706 5707 if (!updateToLocation(Loc)) 5708 return Loc.IP; 5709 5710 // If needed allocate and initialize `DidIt` with 0. 5711 // DidIt: flag variable: 1=single thread; 0=not single thread. 5712 llvm::Value *DidIt = nullptr; 5713 if (!CPVars.empty()) { 5714 DidIt = Builder.CreateAlloca(llvm::Type::getInt32Ty(Builder.getContext())); 5715 Builder.CreateStore(Builder.getInt32(0), DidIt); 5716 } 5717 5718 Directive OMPD = Directive::OMPD_single; 5719 uint32_t SrcLocStrSize; 5720 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize); 5721 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize); 5722 Value *ThreadId = getOrCreateThreadID(Ident); 5723 Value *Args[] = {Ident, ThreadId}; 5724 5725 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_single); 5726 Instruction *EntryCall = Builder.CreateCall(EntryRTLFn, Args); 5727 5728 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_single); 5729 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, Args); 5730 5731 auto FiniCBWrapper = [&](InsertPointTy IP) -> Error { 5732 if (Error Err = FiniCB(IP)) 5733 return Err; 5734 5735 // The thread that executes the single region must set `DidIt` to 1. 5736 // This is used by __kmpc_copyprivate, to know if the caller is the 5737 // single thread or not. 5738 if (DidIt) 5739 Builder.CreateStore(Builder.getInt32(1), DidIt); 5740 5741 return Error::success(); 5742 }; 5743 5744 // generates the following: 5745 // if (__kmpc_single()) { 5746 // .... single region ... 5747 // __kmpc_end_single 5748 // } 5749 // __kmpc_copyprivate 5750 // __kmpc_barrier 5751 5752 InsertPointOrErrorTy AfterIP = 5753 EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCBWrapper, 5754 /*Conditional*/ true, 5755 /*hasFinalize*/ true); 5756 if (!AfterIP) 5757 return AfterIP.takeError(); 5758 5759 if (DidIt) { 5760 for (size_t I = 0, E = CPVars.size(); I < E; ++I) 5761 // NOTE BufSize is currently unused, so just pass 0. 
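      // For each variable this emits roughly the following (illustrative IR;
      // value names are made up):
      //   call void @__kmpc_copyprivate(ptr @ident, i32 %tid, i64 0,
      //                                 ptr %cpvar, ptr @cpy_fn, i32 %did_it)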
5762 createCopyPrivate(LocationDescription(Builder.saveIP(), Loc.DL), 5763 /*BufSize=*/ConstantInt::get(Int64, 0), CPVars[I], 5764 CPFuncs[I], DidIt); 5765 // NOTE __kmpc_copyprivate already inserts a barrier 5766 } else if (!IsNowait) { 5767 InsertPointOrErrorTy AfterIP = 5768 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL), 5769 omp::Directive::OMPD_unknown, /* ForceSimpleCall */ false, 5770 /* CheckCancelFlag */ false); 5771 if (!AfterIP) 5772 return AfterIP.takeError(); 5773 } 5774 return Builder.saveIP(); 5775 } 5776 5777 OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createCritical( 5778 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, 5779 FinalizeCallbackTy FiniCB, StringRef CriticalName, Value *HintInst) { 5780 5781 if (!updateToLocation(Loc)) 5782 return Loc.IP; 5783 5784 Directive OMPD = Directive::OMPD_critical; 5785 uint32_t SrcLocStrSize; 5786 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize); 5787 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize); 5788 Value *ThreadId = getOrCreateThreadID(Ident); 5789 Value *LockVar = getOMPCriticalRegionLock(CriticalName); 5790 Value *Args[] = {Ident, ThreadId, LockVar}; 5791 5792 SmallVector<llvm::Value *, 4> EnterArgs(std::begin(Args), std::end(Args)); 5793 Function *RTFn = nullptr; 5794 if (HintInst) { 5795 // Add Hint to entry Args and create call 5796 EnterArgs.push_back(HintInst); 5797 RTFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_critical_with_hint); 5798 } else { 5799 RTFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_critical); 5800 } 5801 Instruction *EntryCall = Builder.CreateCall(RTFn, EnterArgs); 5802 5803 Function *ExitRTLFn = 5804 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_critical); 5805 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, Args); 5806 5807 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB, 5808 /*Conditional*/ false, /*hasFinalize*/ true); 5809 } 5810 5811 OpenMPIRBuilder::InsertPointTy 5812 OpenMPIRBuilder::createOrderedDepend(const LocationDescription &Loc, 5813 InsertPointTy AllocaIP, unsigned NumLoops, 5814 ArrayRef<llvm::Value *> StoreValues, 5815 const Twine &Name, bool IsDependSource) { 5816 assert( 5817 llvm::all_of(StoreValues, 5818 [](Value *SV) { return SV->getType()->isIntegerTy(64); }) && 5819 "OpenMP runtime requires depend vec with i64 type"); 5820 5821 if (!updateToLocation(Loc)) 5822 return Loc.IP; 5823 5824 // Allocate space for vector and generate alloc instruction. 5825 auto *ArrI64Ty = ArrayType::get(Int64, NumLoops); 5826 Builder.restoreIP(AllocaIP); 5827 AllocaInst *ArgsBase = Builder.CreateAlloca(ArrI64Ty, nullptr, Name); 5828 ArgsBase->setAlignment(Align(8)); 5829 Builder.restoreIP(Loc.IP); 5830 5831 // Store the index value with offset in depend vector. 
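  // For NumLoops == 2, the emitted code looks roughly like this (illustrative
  // IR; value names are made up):
  //   %dep.vec = alloca [2 x i64], align 8   ; at AllocaIP
  //   store i64 %iv0, ptr %dep.vec, align 8
  //   store i64 %iv1, ptr %gep.1, align 8
  //   call void @__kmpc_doacross_post(ptr @ident, i32 %tid, ptr %dep.vec)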
5832 for (unsigned I = 0; I < NumLoops; ++I) { 5833 Value *DependAddrGEPIter = Builder.CreateInBoundsGEP( 5834 ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(I)}); 5835 StoreInst *STInst = Builder.CreateStore(StoreValues[I], DependAddrGEPIter); 5836 STInst->setAlignment(Align(8)); 5837 } 5838 5839 Value *DependBaseAddrGEP = Builder.CreateInBoundsGEP( 5840 ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(0)}); 5841 5842 uint32_t SrcLocStrSize; 5843 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize); 5844 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize); 5845 Value *ThreadId = getOrCreateThreadID(Ident); 5846 Value *Args[] = {Ident, ThreadId, DependBaseAddrGEP}; 5847 5848 Function *RTLFn = nullptr; 5849 if (IsDependSource) 5850 RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_doacross_post); 5851 else 5852 RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_doacross_wait); 5853 Builder.CreateCall(RTLFn, Args); 5854 5855 return Builder.saveIP(); 5856 } 5857 5858 OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createOrderedThreadsSimd( 5859 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, 5860 FinalizeCallbackTy FiniCB, bool IsThreads) { 5861 if (!updateToLocation(Loc)) 5862 return Loc.IP; 5863 5864 Directive OMPD = Directive::OMPD_ordered; 5865 Instruction *EntryCall = nullptr; 5866 Instruction *ExitCall = nullptr; 5867 5868 if (IsThreads) { 5869 uint32_t SrcLocStrSize; 5870 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize); 5871 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize); 5872 Value *ThreadId = getOrCreateThreadID(Ident); 5873 Value *Args[] = {Ident, ThreadId}; 5874 5875 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_ordered); 5876 EntryCall = Builder.CreateCall(EntryRTLFn, Args); 5877 5878 Function *ExitRTLFn = 5879 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_ordered); 5880 ExitCall = Builder.CreateCall(ExitRTLFn, Args); 5881 } 5882 5883 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB, 5884 /*Conditional*/ false, /*hasFinalize*/ true); 5885 } 5886 5887 OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::EmitOMPInlinedRegion( 5888 Directive OMPD, Instruction *EntryCall, Instruction *ExitCall, 5889 BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool Conditional, 5890 bool HasFinalize, bool IsCancellable) { 5891 5892 if (HasFinalize) 5893 FinalizationStack.push_back({FiniCB, OMPD, IsCancellable}); 5894 5895 // Create inlined region's entry and body blocks, in preparation 5896 // for conditional creation 5897 BasicBlock *EntryBB = Builder.GetInsertBlock(); 5898 Instruction *SplitPos = EntryBB->getTerminator(); 5899 if (!isa_and_nonnull<BranchInst>(SplitPos)) 5900 SplitPos = new UnreachableInst(Builder.getContext(), EntryBB); 5901 BasicBlock *ExitBB = EntryBB->splitBasicBlock(SplitPos, "omp_region.end"); 5902 BasicBlock *FiniBB = 5903 EntryBB->splitBasicBlock(EntryBB->getTerminator(), "omp_region.finalize"); 5904 5905 Builder.SetInsertPoint(EntryBB->getTerminator()); 5906 emitCommonDirectiveEntry(OMPD, EntryCall, ExitBB, Conditional); 5907 5908 // generate body 5909 if (Error Err = BodyGenCB(/* AllocaIP */ InsertPointTy(), 5910 /* CodeGenIP */ Builder.saveIP())) 5911 return Err; 5912 5913 // emit exit call and do any needed finalization. 
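  // At this point the region has roughly the following shape (sketch for a
  // conditional directive such as `single`; block names as created above and
  // in emitCommonDirectiveEntry):
  //   entry:
  //     %res = <entry call>
  //     br i1 <%res != 0>, label %omp_region.body, label %omp_region.end
  //   omp_region.body:
  //     <body> ... br label %omp_region.finalize
  //   omp_region.finalize:
  //     ... br label %omp_region.end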
  auto FinIP = InsertPointTy(FiniBB, FiniBB->getFirstInsertionPt());
  assert(FiniBB->getTerminator()->getNumSuccessors() == 1 &&
         FiniBB->getTerminator()->getSuccessor(0) == ExitBB &&
         "Unexpected control flow graph state!!");
  InsertPointOrErrorTy AfterIP =
      emitCommonDirectiveExit(OMPD, FinIP, ExitCall, HasFinalize);
  if (!AfterIP)
    return AfterIP.takeError();
  assert(FiniBB->getUniquePredecessor()->getUniqueSuccessor() == FiniBB &&
         "Unexpected Control Flow State!");
  MergeBlockIntoPredecessor(FiniBB);

  // If we are skipping the region of a non-conditional, remove the exit
  // block, and clear the builder's insertion point.
  assert(SplitPos->getParent() == ExitBB &&
         "Unexpected Insertion point location!");
  auto Merged = MergeBlockIntoPredecessor(ExitBB);
  BasicBlock *ExitPredBB = SplitPos->getParent();
  auto InsertBB = Merged ? ExitPredBB : ExitBB;
  if (!isa_and_nonnull<BranchInst>(SplitPos))
    SplitPos->eraseFromParent();
  Builder.SetInsertPoint(InsertBB);

  return Builder.saveIP();
}

OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitCommonDirectiveEntry(
    Directive OMPD, Value *EntryCall, BasicBlock *ExitBB, bool Conditional) {
  // If there is nothing to do, return the current insertion point.
  if (!Conditional || !EntryCall)
    return Builder.saveIP();

  BasicBlock *EntryBB = Builder.GetInsertBlock();
  Value *CallBool = Builder.CreateIsNotNull(EntryCall);
  auto *ThenBB = BasicBlock::Create(M.getContext(), "omp_region.body");
  auto *UI = new UnreachableInst(Builder.getContext(), ThenBB);

  // Emit ThenBB and set the Builder's insertion point there for
  // body generation next. Place the block after the current block.
  Function *CurFn = EntryBB->getParent();
  CurFn->insert(std::next(EntryBB->getIterator()), ThenBB);

  // Move the entry branch to the end of ThenBB, and replace it with a
  // conditional branch (if-statement).
  Instruction *EntryBBTI = EntryBB->getTerminator();
  Builder.CreateCondBr(CallBool, ThenBB, ExitBB);
  EntryBBTI->removeFromParent();
  Builder.SetInsertPoint(UI);
  Builder.Insert(EntryBBTI);
  UI->eraseFromParent();
  Builder.SetInsertPoint(ThenBB->getTerminator());

  // Return an insertion point to ExitBB.
  return IRBuilder<>::InsertPoint(ExitBB, ExitBB->getFirstInsertionPt());
}

OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitCommonDirectiveExit(
    omp::Directive OMPD, InsertPointTy FinIP, Instruction *ExitCall,
    bool HasFinalize) {

  Builder.restoreIP(FinIP);

  // If there is finalization to do, emit it before the exit call.
  if (HasFinalize) {
    assert(!FinalizationStack.empty() &&
           "Unexpected finalization stack state!");

    FinalizationInfo Fi = FinalizationStack.pop_back_val();
    assert(Fi.DK == OMPD && "Unexpected Directive for Finalization call!");

    if (Error Err = Fi.FiniCB(FinIP))
      return Err;

    BasicBlock *FiniBB = FinIP.getBlock();
    Instruction *FiniBBTI = FiniBB->getTerminator();

    // Set the Builder IP for call creation.
    Builder.SetInsertPoint(FiniBBTI);
  }

  if (!ExitCall)
    return Builder.saveIP();

  // Place the exit call as the last instruction before the finalization block
  // terminator.
  ExitCall->removeFromParent();
  Builder.Insert(ExitCall);

  return IRBuilder<>::InsertPoint(ExitCall->getParent(),
                                  ExitCall->getIterator());
}

OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createCopyinClauseBlocks(
    InsertPointTy IP, Value *MasterAddr, Value *PrivateAddr,
    llvm::IntegerType *IntPtrTy, bool BranchtoEnd) {
  if (!IP.isSet())
    return IP;

  IRBuilder<>::InsertPointGuard IPG(Builder);

  // Creates the following CFG structure:
  //    OMP_Entry : (MasterAddr != PrivateAddr)?
  //          F     T
  //          |      \
  //          |     copyin.not.master
  //          |      /
  //          v     /
  //   copyin.not.master.end
  //          |
  //          v
  //   OMP.Entry.Next

  BasicBlock *OMP_Entry = IP.getBlock();
  Function *CurFn = OMP_Entry->getParent();
  BasicBlock *CopyBegin =
      BasicBlock::Create(M.getContext(), "copyin.not.master", CurFn);
  BasicBlock *CopyEnd = nullptr;

  // If the entry block is terminated, split it to preserve the branch to the
  // following basic block (i.e. OMP.Entry.Next); otherwise, leave everything
  // as is.
6033 if (isa_and_nonnull<BranchInst>(OMP_Entry->getTerminator())) { 6034 CopyEnd = OMP_Entry->splitBasicBlock(OMP_Entry->getTerminator(), 6035 "copyin.not.master.end"); 6036 OMP_Entry->getTerminator()->eraseFromParent(); 6037 } else { 6038 CopyEnd = 6039 BasicBlock::Create(M.getContext(), "copyin.not.master.end", CurFn); 6040 } 6041 6042 Builder.SetInsertPoint(OMP_Entry); 6043 Value *MasterPtr = Builder.CreatePtrToInt(MasterAddr, IntPtrTy); 6044 Value *PrivatePtr = Builder.CreatePtrToInt(PrivateAddr, IntPtrTy); 6045 Value *cmp = Builder.CreateICmpNE(MasterPtr, PrivatePtr); 6046 Builder.CreateCondBr(cmp, CopyBegin, CopyEnd); 6047 6048 Builder.SetInsertPoint(CopyBegin); 6049 if (BranchtoEnd) 6050 Builder.SetInsertPoint(Builder.CreateBr(CopyEnd)); 6051 6052 return Builder.saveIP(); 6053 } 6054 6055 CallInst *OpenMPIRBuilder::createOMPAlloc(const LocationDescription &Loc, 6056 Value *Size, Value *Allocator, 6057 std::string Name) { 6058 IRBuilder<>::InsertPointGuard IPG(Builder); 6059 updateToLocation(Loc); 6060 6061 uint32_t SrcLocStrSize; 6062 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize); 6063 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize); 6064 Value *ThreadId = getOrCreateThreadID(Ident); 6065 Value *Args[] = {ThreadId, Size, Allocator}; 6066 6067 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_alloc); 6068 6069 return Builder.CreateCall(Fn, Args, Name); 6070 } 6071 6072 CallInst *OpenMPIRBuilder::createOMPFree(const LocationDescription &Loc, 6073 Value *Addr, Value *Allocator, 6074 std::string Name) { 6075 IRBuilder<>::InsertPointGuard IPG(Builder); 6076 updateToLocation(Loc); 6077 6078 uint32_t SrcLocStrSize; 6079 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize); 6080 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize); 6081 Value *ThreadId = getOrCreateThreadID(Ident); 6082 Value *Args[] = {ThreadId, Addr, Allocator}; 6083 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_free); 6084 return Builder.CreateCall(Fn, Args, Name); 6085 } 6086 6087 CallInst *OpenMPIRBuilder::createOMPInteropInit( 6088 const LocationDescription &Loc, Value *InteropVar, 6089 omp::OMPInteropType InteropType, Value *Device, Value *NumDependences, 6090 Value *DependenceAddress, bool HaveNowaitClause) { 6091 IRBuilder<>::InsertPointGuard IPG(Builder); 6092 updateToLocation(Loc); 6093 6094 uint32_t SrcLocStrSize; 6095 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize); 6096 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize); 6097 Value *ThreadId = getOrCreateThreadID(Ident); 6098 if (Device == nullptr) 6099 Device = Constant::getAllOnesValue(Int32); 6100 Constant *InteropTypeVal = ConstantInt::get(Int32, (int)InteropType); 6101 if (NumDependences == nullptr) { 6102 NumDependences = ConstantInt::get(Int32, 0); 6103 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext()); 6104 DependenceAddress = ConstantPointerNull::get(PointerTypeVar); 6105 } 6106 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause); 6107 Value *Args[] = { 6108 Ident, ThreadId, InteropVar, InteropTypeVal, 6109 Device, NumDependences, DependenceAddress, HaveNowaitClauseVal}; 6110 6111 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_init); 6112 6113 return Builder.CreateCall(Fn, Args); 6114 } 6115 6116 CallInst *OpenMPIRBuilder::createOMPInteropDestroy( 6117 const LocationDescription &Loc, Value *InteropVar, Value *Device, 6118 Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause) { 6119 
IRBuilder<>::InsertPointGuard IPG(Builder); 6120 updateToLocation(Loc); 6121 6122 uint32_t SrcLocStrSize; 6123 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize); 6124 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize); 6125 Value *ThreadId = getOrCreateThreadID(Ident); 6126 if (Device == nullptr) 6127 Device = Constant::getAllOnesValue(Int32); 6128 if (NumDependences == nullptr) { 6129 NumDependences = ConstantInt::get(Int32, 0); 6130 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext()); 6131 DependenceAddress = ConstantPointerNull::get(PointerTypeVar); 6132 } 6133 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause); 6134 Value *Args[] = { 6135 Ident, ThreadId, InteropVar, Device, 6136 NumDependences, DependenceAddress, HaveNowaitClauseVal}; 6137 6138 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_destroy); 6139 6140 return Builder.CreateCall(Fn, Args); 6141 } 6142 6143 CallInst *OpenMPIRBuilder::createOMPInteropUse(const LocationDescription &Loc, 6144 Value *InteropVar, Value *Device, 6145 Value *NumDependences, 6146 Value *DependenceAddress, 6147 bool HaveNowaitClause) { 6148 IRBuilder<>::InsertPointGuard IPG(Builder); 6149 updateToLocation(Loc); 6150 uint32_t SrcLocStrSize; 6151 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize); 6152 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize); 6153 Value *ThreadId = getOrCreateThreadID(Ident); 6154 if (Device == nullptr) 6155 Device = Constant::getAllOnesValue(Int32); 6156 if (NumDependences == nullptr) { 6157 NumDependences = ConstantInt::get(Int32, 0); 6158 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext()); 6159 DependenceAddress = ConstantPointerNull::get(PointerTypeVar); 6160 } 6161 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause); 6162 Value *Args[] = { 6163 Ident, ThreadId, InteropVar, Device, 6164 NumDependences, DependenceAddress, HaveNowaitClauseVal}; 6165 6166 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_use); 6167 6168 return Builder.CreateCall(Fn, Args); 6169 } 6170 6171 CallInst *OpenMPIRBuilder::createCachedThreadPrivate( 6172 const LocationDescription &Loc, llvm::Value *Pointer, 6173 llvm::ConstantInt *Size, const llvm::Twine &Name) { 6174 IRBuilder<>::InsertPointGuard IPG(Builder); 6175 updateToLocation(Loc); 6176 6177 uint32_t SrcLocStrSize; 6178 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize); 6179 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize); 6180 Value *ThreadId = getOrCreateThreadID(Ident); 6181 Constant *ThreadPrivateCache = 6182 getOrCreateInternalVariable(Int8PtrPtr, Name.str()); 6183 llvm::Value *Args[] = {Ident, ThreadId, Pointer, Size, ThreadPrivateCache}; 6184 6185 Function *Fn = 6186 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_threadprivate_cached); 6187 6188 return Builder.CreateCall(Fn, Args); 6189 } 6190 6191 OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createTargetInit( 6192 const LocationDescription &Loc, 6193 const llvm::OpenMPIRBuilder::TargetKernelDefaultAttrs &Attrs) { 6194 assert(!Attrs.MaxThreads.empty() && !Attrs.MaxTeams.empty() && 6195 "expected num_threads and num_teams to be specified"); 6196 6197 if (!updateToLocation(Loc)) 6198 return Loc.IP; 6199 6200 uint32_t SrcLocStrSize; 6201 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize); 6202 Constant *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize); 6203 Constant *IsSPMDVal = ConstantInt::getSigned(Int8, Attrs.ExecFlags); 6204 Constant 
*UseGenericStateMachineVal = ConstantInt::getSigned( 6205 Int8, Attrs.ExecFlags != omp::OMP_TGT_EXEC_MODE_SPMD); 6206 Constant *MayUseNestedParallelismVal = ConstantInt::getSigned(Int8, true); 6207 Constant *DebugIndentionLevelVal = ConstantInt::getSigned(Int16, 0); 6208 6209 Function *DebugKernelWrapper = Builder.GetInsertBlock()->getParent(); 6210 Function *Kernel = DebugKernelWrapper; 6211 6212 // We need to strip the debug prefix to get the correct kernel name. 6213 StringRef KernelName = Kernel->getName(); 6214 const std::string DebugPrefix = "_debug__"; 6215 if (KernelName.ends_with(DebugPrefix)) { 6216 KernelName = KernelName.drop_back(DebugPrefix.length()); 6217 Kernel = M.getFunction(KernelName); 6218 assert(Kernel && "Expected the real kernel to exist"); 6219 } 6220 6221 // Manifest the launch configuration in the metadata matching the kernel 6222 // environment. 6223 if (Attrs.MinTeams > 1 || Attrs.MaxTeams.front() > 0) 6224 writeTeamsForKernel(T, *Kernel, Attrs.MinTeams, Attrs.MaxTeams.front()); 6225 6226 // If MaxThreads not set, select the maximum between the default workgroup 6227 // size and the MinThreads value. 6228 int32_t MaxThreadsVal = Attrs.MaxThreads.front(); 6229 if (MaxThreadsVal < 0) 6230 MaxThreadsVal = std::max( 6231 int32_t(getGridValue(T, Kernel).GV_Default_WG_Size), Attrs.MinThreads); 6232 6233 if (MaxThreadsVal > 0) 6234 writeThreadBoundsForKernel(T, *Kernel, Attrs.MinThreads, MaxThreadsVal); 6235 6236 Constant *MinThreads = ConstantInt::getSigned(Int32, Attrs.MinThreads); 6237 Constant *MaxThreads = ConstantInt::getSigned(Int32, MaxThreadsVal); 6238 Constant *MinTeams = ConstantInt::getSigned(Int32, Attrs.MinTeams); 6239 Constant *MaxTeams = ConstantInt::getSigned(Int32, Attrs.MaxTeams.front()); 6240 Constant *ReductionDataSize = ConstantInt::getSigned(Int32, 0); 6241 Constant *ReductionBufferLength = ConstantInt::getSigned(Int32, 0); 6242 6243 Function *Fn = getOrCreateRuntimeFunctionPtr( 6244 omp::RuntimeFunction::OMPRTL___kmpc_target_init); 6245 const DataLayout &DL = Fn->getDataLayout(); 6246 6247 Twine DynamicEnvironmentName = KernelName + "_dynamic_environment"; 6248 Constant *DynamicEnvironmentInitializer = 6249 ConstantStruct::get(DynamicEnvironment, {DebugIndentionLevelVal}); 6250 GlobalVariable *DynamicEnvironmentGV = new GlobalVariable( 6251 M, DynamicEnvironment, /*IsConstant=*/false, GlobalValue::WeakODRLinkage, 6252 DynamicEnvironmentInitializer, DynamicEnvironmentName, 6253 /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal, 6254 DL.getDefaultGlobalsAddressSpace()); 6255 DynamicEnvironmentGV->setVisibility(GlobalValue::ProtectedVisibility); 6256 6257 Constant *DynamicEnvironment = 6258 DynamicEnvironmentGV->getType() == DynamicEnvironmentPtr 6259 ? 
DynamicEnvironmentGV 6260 : ConstantExpr::getAddrSpaceCast(DynamicEnvironmentGV, 6261 DynamicEnvironmentPtr); 6262 6263 Constant *ConfigurationEnvironmentInitializer = ConstantStruct::get( 6264 ConfigurationEnvironment, { 6265 UseGenericStateMachineVal, 6266 MayUseNestedParallelismVal, 6267 IsSPMDVal, 6268 MinThreads, 6269 MaxThreads, 6270 MinTeams, 6271 MaxTeams, 6272 ReductionDataSize, 6273 ReductionBufferLength, 6274 }); 6275 Constant *KernelEnvironmentInitializer = ConstantStruct::get( 6276 KernelEnvironment, { 6277 ConfigurationEnvironmentInitializer, 6278 Ident, 6279 DynamicEnvironment, 6280 }); 6281 std::string KernelEnvironmentName = 6282 (KernelName + "_kernel_environment").str(); 6283 GlobalVariable *KernelEnvironmentGV = new GlobalVariable( 6284 M, KernelEnvironment, /*IsConstant=*/true, GlobalValue::WeakODRLinkage, 6285 KernelEnvironmentInitializer, KernelEnvironmentName, 6286 /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal, 6287 DL.getDefaultGlobalsAddressSpace()); 6288 KernelEnvironmentGV->setVisibility(GlobalValue::ProtectedVisibility); 6289 6290 Constant *KernelEnvironment = 6291 KernelEnvironmentGV->getType() == KernelEnvironmentPtr 6292 ? KernelEnvironmentGV 6293 : ConstantExpr::getAddrSpaceCast(KernelEnvironmentGV, 6294 KernelEnvironmentPtr); 6295 Value *KernelLaunchEnvironment = DebugKernelWrapper->getArg(0); 6296 CallInst *ThreadKind = 6297 Builder.CreateCall(Fn, {KernelEnvironment, KernelLaunchEnvironment}); 6298 6299 Value *ExecUserCode = Builder.CreateICmpEQ( 6300 ThreadKind, Constant::getAllOnesValue(ThreadKind->getType()), 6301 "exec_user_code"); 6302 6303 // ThreadKind = __kmpc_target_init(...) 6304 // if (ThreadKind == -1) 6305 // user_code 6306 // else 6307 // return; 6308 6309 auto *UI = Builder.CreateUnreachable(); 6310 BasicBlock *CheckBB = UI->getParent(); 6311 BasicBlock *UserCodeEntryBB = CheckBB->splitBasicBlock(UI, "user_code.entry"); 6312 6313 BasicBlock *WorkerExitBB = BasicBlock::Create( 6314 CheckBB->getContext(), "worker.exit", CheckBB->getParent()); 6315 Builder.SetInsertPoint(WorkerExitBB); 6316 Builder.CreateRetVoid(); 6317 6318 auto *CheckBBTI = CheckBB->getTerminator(); 6319 Builder.SetInsertPoint(CheckBBTI); 6320 Builder.CreateCondBr(ExecUserCode, UI->getParent(), WorkerExitBB); 6321 6322 CheckBBTI->eraseFromParent(); 6323 UI->eraseFromParent(); 6324 6325 // Continue in the "user_code" block, see diagram above and in 6326 // openmp/libomptarget/deviceRTLs/common/include/target.h . 6327 return InsertPointTy(UserCodeEntryBB, UserCodeEntryBB->getFirstInsertionPt()); 6328 } 6329 6330 void OpenMPIRBuilder::createTargetDeinit(const LocationDescription &Loc, 6331 int32_t TeamsReductionDataSize, 6332 int32_t TeamsReductionBufferLength) { 6333 if (!updateToLocation(Loc)) 6334 return; 6335 6336 Function *Fn = getOrCreateRuntimeFunctionPtr( 6337 omp::RuntimeFunction::OMPRTL___kmpc_target_deinit); 6338 6339 Builder.CreateCall(Fn, {}); 6340 6341 if (!TeamsReductionBufferLength || !TeamsReductionDataSize) 6342 return; 6343 6344 Function *Kernel = Builder.GetInsertBlock()->getParent(); 6345 // We need to strip the debug prefix to get the correct kernel name. 
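  // For example, for a hypothetical debug wrapper kernel named
  // `__omp_offloading_xy_foo_debug__`, the name strips to
  // `__omp_offloading_xy_foo`, and the global patched below is
  // `__omp_offloading_xy_foo_kernel_environment` (created by
  // createTargetInit), with the reduction data size and buffer length folded
  // into struct indices {0, 7} and {0, 8}.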
6346 StringRef KernelName = Kernel->getName(); 6347 const std::string DebugPrefix = "_debug__"; 6348 if (KernelName.ends_with(DebugPrefix)) 6349 KernelName = KernelName.drop_back(DebugPrefix.length()); 6350 auto *KernelEnvironmentGV = 6351 M.getNamedGlobal((KernelName + "_kernel_environment").str()); 6352 assert(KernelEnvironmentGV && "Expected kernel environment global\n"); 6353 auto *KernelEnvironmentInitializer = KernelEnvironmentGV->getInitializer(); 6354 auto *NewInitializer = ConstantFoldInsertValueInstruction( 6355 KernelEnvironmentInitializer, 6356 ConstantInt::get(Int32, TeamsReductionDataSize), {0, 7}); 6357 NewInitializer = ConstantFoldInsertValueInstruction( 6358 NewInitializer, ConstantInt::get(Int32, TeamsReductionBufferLength), 6359 {0, 8}); 6360 KernelEnvironmentGV->setInitializer(NewInitializer); 6361 } 6362 6363 static MDNode *getNVPTXMDNode(Function &Kernel, StringRef Name) { 6364 Module &M = *Kernel.getParent(); 6365 NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations"); 6366 for (auto *Op : MD->operands()) { 6367 if (Op->getNumOperands() != 3) 6368 continue; 6369 auto *KernelOp = dyn_cast<ConstantAsMetadata>(Op->getOperand(0)); 6370 if (!KernelOp || KernelOp->getValue() != &Kernel) 6371 continue; 6372 auto *Prop = dyn_cast<MDString>(Op->getOperand(1)); 6373 if (!Prop || Prop->getString() != Name) 6374 continue; 6375 return Op; 6376 } 6377 return nullptr; 6378 } 6379 6380 static void updateNVPTXMetadata(Function &Kernel, StringRef Name, int32_t Value, 6381 bool Min) { 6382 // Update the "maxntidx" metadata for NVIDIA, or add it. 6383 MDNode *ExistingOp = getNVPTXMDNode(Kernel, Name); 6384 if (ExistingOp) { 6385 auto *OldVal = cast<ConstantAsMetadata>(ExistingOp->getOperand(2)); 6386 int32_t OldLimit = cast<ConstantInt>(OldVal->getValue())->getZExtValue(); 6387 ExistingOp->replaceOperandWith( 6388 2, ConstantAsMetadata::get(ConstantInt::get( 6389 OldVal->getValue()->getType(), 6390 Min ? std::min(OldLimit, Value) : std::max(OldLimit, Value)))); 6391 } else { 6392 LLVMContext &Ctx = Kernel.getContext(); 6393 Metadata *MDVals[] = {ConstantAsMetadata::get(&Kernel), 6394 MDString::get(Ctx, Name), 6395 ConstantAsMetadata::get( 6396 ConstantInt::get(Type::getInt32Ty(Ctx), Value))}; 6397 // Append metadata to nvvm.annotations 6398 Module &M = *Kernel.getParent(); 6399 NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations"); 6400 MD->addOperand(MDNode::get(Ctx, MDVals)); 6401 } 6402 } 6403 6404 std::pair<int32_t, int32_t> 6405 OpenMPIRBuilder::readThreadBoundsForKernel(const Triple &T, Function &Kernel) { 6406 int32_t ThreadLimit = 6407 Kernel.getFnAttributeAsParsedInteger("omp_target_thread_limit"); 6408 6409 if (T.isAMDGPU()) { 6410 const auto &Attr = Kernel.getFnAttribute("amdgpu-flat-work-group-size"); 6411 if (!Attr.isValid() || !Attr.isStringAttribute()) 6412 return {0, ThreadLimit}; 6413 auto [LBStr, UBStr] = Attr.getValueAsString().split(','); 6414 int32_t LB, UB; 6415 if (!llvm::to_integer(UBStr, UB, 10)) 6416 return {0, ThreadLimit}; 6417 UB = ThreadLimit ? std::min(ThreadLimit, UB) : UB; 6418 if (!llvm::to_integer(LBStr, LB, 10)) 6419 return {0, UB}; 6420 return {LB, UB}; 6421 } 6422 6423 if (MDNode *ExistingOp = getNVPTXMDNode(Kernel, "maxntidx")) { 6424 auto *OldVal = cast<ConstantAsMetadata>(ExistingOp->getOperand(2)); 6425 int32_t UB = cast<ConstantInt>(OldVal->getValue())->getZExtValue(); 6426 return {0, ThreadLimit ? 
std::min(ThreadLimit, UB) : UB}; 6427 } 6428 return {0, ThreadLimit}; 6429 } 6430 6431 void OpenMPIRBuilder::writeThreadBoundsForKernel(const Triple &T, 6432 Function &Kernel, int32_t LB, 6433 int32_t UB) { 6434 Kernel.addFnAttr("omp_target_thread_limit", std::to_string(UB)); 6435 6436 if (T.isAMDGPU()) { 6437 Kernel.addFnAttr("amdgpu-flat-work-group-size", 6438 llvm::utostr(LB) + "," + llvm::utostr(UB)); 6439 return; 6440 } 6441 6442 updateNVPTXMetadata(Kernel, "maxntidx", UB, true); 6443 } 6444 6445 std::pair<int32_t, int32_t> 6446 OpenMPIRBuilder::readTeamBoundsForKernel(const Triple &, Function &Kernel) { 6447 // TODO: Read from backend annotations if available. 6448 return {0, Kernel.getFnAttributeAsParsedInteger("omp_target_num_teams")}; 6449 } 6450 6451 void OpenMPIRBuilder::writeTeamsForKernel(const Triple &T, Function &Kernel, 6452 int32_t LB, int32_t UB) { 6453 if (T.isNVPTX()) 6454 if (UB > 0) 6455 updateNVPTXMetadata(Kernel, "maxclusterrank", UB, true); 6456 if (T.isAMDGPU()) 6457 Kernel.addFnAttr("amdgpu-max-num-workgroups", llvm::utostr(LB) + ",1,1"); 6458 6459 Kernel.addFnAttr("omp_target_num_teams", std::to_string(LB)); 6460 } 6461 6462 void OpenMPIRBuilder::setOutlinedTargetRegionFunctionAttributes( 6463 Function *OutlinedFn) { 6464 if (Config.isTargetDevice()) { 6465 OutlinedFn->setLinkage(GlobalValue::WeakODRLinkage); 6466 // TODO: Determine if DSO local can be set to true. 6467 OutlinedFn->setDSOLocal(false); 6468 OutlinedFn->setVisibility(GlobalValue::ProtectedVisibility); 6469 if (T.isAMDGCN()) 6470 OutlinedFn->setCallingConv(CallingConv::AMDGPU_KERNEL); 6471 else if (T.isNVPTX()) 6472 OutlinedFn->setCallingConv(CallingConv::PTX_Kernel); 6473 } 6474 } 6475 6476 Constant *OpenMPIRBuilder::createOutlinedFunctionID(Function *OutlinedFn, 6477 StringRef EntryFnIDName) { 6478 if (Config.isTargetDevice()) { 6479 assert(OutlinedFn && "The outlined function must exist if embedded"); 6480 return OutlinedFn; 6481 } 6482 6483 return new GlobalVariable( 6484 M, Builder.getInt8Ty(), /*isConstant=*/true, GlobalValue::WeakAnyLinkage, 6485 Constant::getNullValue(Builder.getInt8Ty()), EntryFnIDName); 6486 } 6487 6488 Constant *OpenMPIRBuilder::createTargetRegionEntryAddr(Function *OutlinedFn, 6489 StringRef EntryFnName) { 6490 if (OutlinedFn) 6491 return OutlinedFn; 6492 6493 assert(!M.getGlobalVariable(EntryFnName, true) && 6494 "Named kernel already exists?"); 6495 return new GlobalVariable( 6496 M, Builder.getInt8Ty(), /*isConstant=*/true, GlobalValue::InternalLinkage, 6497 Constant::getNullValue(Builder.getInt8Ty()), EntryFnName); 6498 } 6499 6500 Error OpenMPIRBuilder::emitTargetRegionFunction( 6501 TargetRegionEntryInfo &EntryInfo, 6502 FunctionGenCallback &GenerateFunctionCallback, bool IsOffloadEntry, 6503 Function *&OutlinedFn, Constant *&OutlinedFnID) { 6504 6505 SmallString<64> EntryFnName; 6506 OffloadInfoManager.getTargetRegionEntryFnName(EntryFnName, EntryInfo); 6507 6508 if (Config.isTargetDevice() || !Config.openMPOffloadMandatory()) { 6509 Expected<Function *> CBResult = GenerateFunctionCallback(EntryFnName); 6510 if (!CBResult) 6511 return CBResult.takeError(); 6512 OutlinedFn = *CBResult; 6513 } else { 6514 OutlinedFn = nullptr; 6515 } 6516 6517 // If this target outline function is not an offload entry, we don't need to 6518 // register it. This may be in the case of a false if clause, or if there are 6519 // no OpenMP targets. 6520 if (!IsOffloadEntry) 6521 return Error::success(); 6522 6523 std::string EntryFnIDName = 6524 Config.isTargetDevice() 6525 ? 
std::string(EntryFnName) 6526 : createPlatformSpecificName({EntryFnName, "region_id"}); 6527 6528 OutlinedFnID = registerTargetRegionFunction(EntryInfo, OutlinedFn, 6529 EntryFnName, EntryFnIDName); 6530 return Error::success(); 6531 } 6532 6533 Constant *OpenMPIRBuilder::registerTargetRegionFunction( 6534 TargetRegionEntryInfo &EntryInfo, Function *OutlinedFn, 6535 StringRef EntryFnName, StringRef EntryFnIDName) { 6536 if (OutlinedFn) 6537 setOutlinedTargetRegionFunctionAttributes(OutlinedFn); 6538 auto OutlinedFnID = createOutlinedFunctionID(OutlinedFn, EntryFnIDName); 6539 auto EntryAddr = createTargetRegionEntryAddr(OutlinedFn, EntryFnName); 6540 OffloadInfoManager.registerTargetRegionEntryInfo( 6541 EntryInfo, EntryAddr, OutlinedFnID, 6542 OffloadEntriesInfoManager::OMPTargetRegionEntryTargetRegion); 6543 return OutlinedFnID; 6544 } 6545 6546 OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTargetData( 6547 const LocationDescription &Loc, InsertPointTy AllocaIP, 6548 InsertPointTy CodeGenIP, Value *DeviceID, Value *IfCond, 6549 TargetDataInfo &Info, GenMapInfoCallbackTy GenMapInfoCB, 6550 omp::RuntimeFunction *MapperFunc, 6551 function_ref<InsertPointOrErrorTy(InsertPointTy CodeGenIP, 6552 BodyGenTy BodyGenType)> 6553 BodyGenCB, 6554 function_ref<void(unsigned int, Value *)> DeviceAddrCB, 6555 function_ref<Value *(unsigned int)> CustomMapperCB, Value *SrcLocInfo) { 6556 if (!updateToLocation(Loc)) 6557 return InsertPointTy(); 6558 6559 Builder.restoreIP(CodeGenIP); 6560 // Disable TargetData CodeGen on Device pass. 6561 if (Config.IsTargetDevice.value_or(false)) { 6562 if (BodyGenCB) { 6563 InsertPointOrErrorTy AfterIP = 6564 BodyGenCB(Builder.saveIP(), BodyGenTy::NoPriv); 6565 if (!AfterIP) 6566 return AfterIP.takeError(); 6567 Builder.restoreIP(*AfterIP); 6568 } 6569 return Builder.saveIP(); 6570 } 6571 6572 bool IsStandAlone = !BodyGenCB; 6573 MapInfosTy *MapInfo; 6574 // Generate the code for the opening of the data environment. Capture all the 6575 // arguments of the runtime call by reference because they are used in the 6576 // closing of the region. 6577 auto BeginThenGen = [&](InsertPointTy AllocaIP, 6578 InsertPointTy CodeGenIP) -> Error { 6579 MapInfo = &GenMapInfoCB(Builder.saveIP()); 6580 emitOffloadingArrays(AllocaIP, Builder.saveIP(), *MapInfo, Info, 6581 /*IsNonContiguous=*/true, DeviceAddrCB, 6582 CustomMapperCB); 6583 6584 TargetDataRTArgs RTArgs; 6585 emitOffloadingArraysArgument(Builder, RTArgs, Info); 6586 6587 // Emit the number of elements in the offloading arrays. 
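    // For a region with three mapped variables, the opening runtime call
    // emitted below looks roughly like this (illustrative IR; value names
    // are made up):
    //   call void @__tgt_target_data_begin_mapper(
    //       ptr @ident, i64 %device_id, i32 3, ptr %baseptrs, ptr %ptrs,
    //       ptr %sizes, ptr %maptypes, ptr %mapnames, ptr %mappers)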
6588 Value *PointerNum = Builder.getInt32(Info.NumberOfPtrs); 6589 6590 // Source location for the ident struct 6591 if (!SrcLocInfo) { 6592 uint32_t SrcLocStrSize; 6593 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize); 6594 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize); 6595 } 6596 6597 SmallVector<llvm::Value *, 13> OffloadingArgs = { 6598 SrcLocInfo, DeviceID, 6599 PointerNum, RTArgs.BasePointersArray, 6600 RTArgs.PointersArray, RTArgs.SizesArray, 6601 RTArgs.MapTypesArray, RTArgs.MapNamesArray, 6602 RTArgs.MappersArray}; 6603 6604 if (IsStandAlone) { 6605 assert(MapperFunc && "MapperFunc missing for standalone target data"); 6606 6607 auto TaskBodyCB = [&](Value *, Value *, 6608 IRBuilderBase::InsertPoint) -> Error { 6609 if (Info.HasNoWait) { 6610 OffloadingArgs.append({llvm::Constant::getNullValue(Int32), 6611 llvm::Constant::getNullValue(VoidPtr), 6612 llvm::Constant::getNullValue(Int32), 6613 llvm::Constant::getNullValue(VoidPtr)}); 6614 } 6615 6616 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(*MapperFunc), 6617 OffloadingArgs); 6618 6619 if (Info.HasNoWait) { 6620 BasicBlock *OffloadContBlock = 6621 BasicBlock::Create(Builder.getContext(), "omp_offload.cont"); 6622 Function *CurFn = Builder.GetInsertBlock()->getParent(); 6623 emitBlock(OffloadContBlock, CurFn, /*IsFinished=*/true); 6624 Builder.restoreIP(Builder.saveIP()); 6625 } 6626 return Error::success(); 6627 }; 6628 6629 bool RequiresOuterTargetTask = Info.HasNoWait; 6630 if (!RequiresOuterTargetTask) 6631 cantFail(TaskBodyCB(/*DeviceID=*/nullptr, /*RTLoc=*/nullptr, 6632 /*TargetTaskAllocaIP=*/{})); 6633 else 6634 cantFail(emitTargetTask(TaskBodyCB, DeviceID, SrcLocInfo, AllocaIP, 6635 /*Dependencies=*/{}, Info.HasNoWait)); 6636 } else { 6637 Function *BeginMapperFunc = getOrCreateRuntimeFunctionPtr( 6638 omp::OMPRTL___tgt_target_data_begin_mapper); 6639 6640 Builder.CreateCall(BeginMapperFunc, OffloadingArgs); 6641 6642 for (auto DeviceMap : Info.DevicePtrInfoMap) { 6643 if (isa<AllocaInst>(DeviceMap.second.second)) { 6644 auto *LI = 6645 Builder.CreateLoad(Builder.getPtrTy(), DeviceMap.second.first); 6646 Builder.CreateStore(LI, DeviceMap.second.second); 6647 } 6648 } 6649 6650 // If device pointer privatization is required, emit the body of the 6651 // region here. It will have to be duplicated: with and without 6652 // privatization. 6653 InsertPointOrErrorTy AfterIP = 6654 BodyGenCB(Builder.saveIP(), BodyGenTy::Priv); 6655 if (!AfterIP) 6656 return AfterIP.takeError(); 6657 Builder.restoreIP(*AfterIP); 6658 } 6659 return Error::success(); 6660 }; 6661 6662 // If we need device pointer privatization, we need to emit the body of the 6663 // region with no privatization in the 'else' branch of the conditional. 6664 // Otherwise, we don't have to do anything. 6665 auto BeginElseGen = [&](InsertPointTy AllocaIP, 6666 InsertPointTy CodeGenIP) -> Error { 6667 InsertPointOrErrorTy AfterIP = 6668 BodyGenCB(Builder.saveIP(), BodyGenTy::DupNoPriv); 6669 if (!AfterIP) 6670 return AfterIP.takeError(); 6671 Builder.restoreIP(*AfterIP); 6672 return Error::success(); 6673 }; 6674 6675 // Generate code for the closing of the data region. 6676 auto EndThenGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) { 6677 TargetDataRTArgs RTArgs; 6678 Info.EmitDebug = !MapInfo->Names.empty(); 6679 emitOffloadingArraysArgument(Builder, RTArgs, Info, /*ForEndCall=*/true); 6680 6681 // Emit the number of elements in the offloading arrays. 
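    // The closing call mirrors the opening one, roughly (illustrative IR):
    //   call void @__tgt_target_data_end_mapper(
    //       ptr @ident, i64 %device_id, i32 3, ptr %baseptrs, ptr %ptrs,
    //       ptr %sizes, ptr %maptypes, ptr %mapnames, ptr %mappers)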
6682 Value *PointerNum = Builder.getInt32(Info.NumberOfPtrs); 6683 6684 // Source location for the ident struct 6685 if (!SrcLocInfo) { 6686 uint32_t SrcLocStrSize; 6687 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize); 6688 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize); 6689 } 6690 6691 Value *OffloadingArgs[] = {SrcLocInfo, DeviceID, 6692 PointerNum, RTArgs.BasePointersArray, 6693 RTArgs.PointersArray, RTArgs.SizesArray, 6694 RTArgs.MapTypesArray, RTArgs.MapNamesArray, 6695 RTArgs.MappersArray}; 6696 Function *EndMapperFunc = 6697 getOrCreateRuntimeFunctionPtr(omp::OMPRTL___tgt_target_data_end_mapper); 6698 6699 Builder.CreateCall(EndMapperFunc, OffloadingArgs); 6700 return Error::success(); 6701 }; 6702 6703 // We don't have to do anything to close the region if the if clause evaluates 6704 // to false. 6705 auto EndElseGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) { 6706 return Error::success(); 6707 }; 6708 6709 Error Err = [&]() -> Error { 6710 if (BodyGenCB) { 6711 Error Err = [&]() { 6712 if (IfCond) 6713 return emitIfClause(IfCond, BeginThenGen, BeginElseGen, AllocaIP); 6714 return BeginThenGen(AllocaIP, Builder.saveIP()); 6715 }(); 6716 6717 if (Err) 6718 return Err; 6719 6720 // If we don't require privatization of device pointers, we emit the body 6721 // in between the runtime calls. This avoids duplicating the body code. 6722 InsertPointOrErrorTy AfterIP = 6723 BodyGenCB(Builder.saveIP(), BodyGenTy::NoPriv); 6724 if (!AfterIP) 6725 return AfterIP.takeError(); 6726 Builder.restoreIP(*AfterIP); 6727 6728 if (IfCond) 6729 return emitIfClause(IfCond, EndThenGen, EndElseGen, AllocaIP); 6730 return EndThenGen(AllocaIP, Builder.saveIP()); 6731 } 6732 if (IfCond) 6733 return emitIfClause(IfCond, BeginThenGen, EndElseGen, AllocaIP); 6734 return BeginThenGen(AllocaIP, Builder.saveIP()); 6735 }(); 6736 6737 if (Err) 6738 return Err; 6739 6740 return Builder.saveIP(); 6741 } 6742 6743 FunctionCallee 6744 OpenMPIRBuilder::createForStaticInitFunction(unsigned IVSize, bool IVSigned, 6745 bool IsGPUDistribute) { 6746 assert((IVSize == 32 || IVSize == 64) && 6747 "IV size is not compatible with the omp runtime"); 6748 RuntimeFunction Name; 6749 if (IsGPUDistribute) 6750 Name = IVSize == 32 6751 ? (IVSigned ? omp::OMPRTL___kmpc_distribute_static_init_4 6752 : omp::OMPRTL___kmpc_distribute_static_init_4u) 6753 : (IVSigned ? omp::OMPRTL___kmpc_distribute_static_init_8 6754 : omp::OMPRTL___kmpc_distribute_static_init_8u); 6755 else 6756 Name = IVSize == 32 ? (IVSigned ? omp::OMPRTL___kmpc_for_static_init_4 6757 : omp::OMPRTL___kmpc_for_static_init_4u) 6758 : (IVSigned ? omp::OMPRTL___kmpc_for_static_init_8 6759 : omp::OMPRTL___kmpc_for_static_init_8u); 6760 6761 return getOrCreateRuntimeFunction(M, Name); 6762 } 6763 6764 FunctionCallee OpenMPIRBuilder::createDispatchInitFunction(unsigned IVSize, 6765 bool IVSigned) { 6766 assert((IVSize == 32 || IVSize == 64) && 6767 "IV size is not compatible with the omp runtime"); 6768 RuntimeFunction Name = IVSize == 32 6769 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_init_4 6770 : omp::OMPRTL___kmpc_dispatch_init_4u) 6771 : (IVSigned ? 
omp::OMPRTL___kmpc_dispatch_init_8 6772 : omp::OMPRTL___kmpc_dispatch_init_8u); 6773 6774 return getOrCreateRuntimeFunction(M, Name); 6775 } 6776 6777 FunctionCallee OpenMPIRBuilder::createDispatchNextFunction(unsigned IVSize, 6778 bool IVSigned) { 6779 assert((IVSize == 32 || IVSize == 64) && 6780 "IV size is not compatible with the omp runtime"); 6781 RuntimeFunction Name = IVSize == 32 6782 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_next_4 6783 : omp::OMPRTL___kmpc_dispatch_next_4u) 6784 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_next_8 6785 : omp::OMPRTL___kmpc_dispatch_next_8u); 6786 6787 return getOrCreateRuntimeFunction(M, Name); 6788 } 6789 6790 FunctionCallee OpenMPIRBuilder::createDispatchFiniFunction(unsigned IVSize, 6791 bool IVSigned) { 6792 assert((IVSize == 32 || IVSize == 64) && 6793 "IV size is not compatible with the omp runtime"); 6794 RuntimeFunction Name = IVSize == 32 6795 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_fini_4 6796 : omp::OMPRTL___kmpc_dispatch_fini_4u) 6797 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_fini_8 6798 : omp::OMPRTL___kmpc_dispatch_fini_8u); 6799 6800 return getOrCreateRuntimeFunction(M, Name); 6801 } 6802 6803 FunctionCallee OpenMPIRBuilder::createDispatchDeinitFunction() { 6804 return getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_dispatch_deinit); 6805 } 6806 6807 static Expected<Function *> createOutlinedFunction( 6808 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, 6809 const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs, 6810 StringRef FuncName, SmallVectorImpl<Value *> &Inputs, 6811 OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc, 6812 OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB) { 6813 SmallVector<Type *> ParameterTypes; 6814 if (OMPBuilder.Config.isTargetDevice()) { 6815 // Add the "implicit" runtime argument we use to provide launch specific 6816 // information for target devices. 6817 auto *Int8PtrTy = PointerType::getUnqual(Builder.getContext()); 6818 ParameterTypes.push_back(Int8PtrTy); 6819 6820 // All parameters to target devices are passed as pointers 6821 // or i64. This assumes 64-bit address spaces/pointers. 6822 for (auto &Arg : Inputs) 6823 ParameterTypes.push_back(Arg->getType()->isPointerTy() 6824 ? Arg->getType() 6825 : Type::getInt64Ty(Builder.getContext())); 6826 } else { 6827 for (auto &Arg : Inputs) 6828 ParameterTypes.push_back(Arg->getType()); 6829 } 6830 6831 auto BB = Builder.GetInsertBlock(); 6832 auto M = BB->getModule(); 6833 auto FuncType = FunctionType::get(Builder.getVoidTy(), ParameterTypes, 6834 /*isVarArg*/ false); 6835 auto Func = 6836 Function::Create(FuncType, GlobalValue::InternalLinkage, FuncName, M); 6837 6838 // Forward target-cpu and target-features function attributes from the 6839 // original function to the new outlined function. 6840 Function *ParentFn = Builder.GetInsertBlock()->getParent(); 6841 6842 auto TargetCpuAttr = ParentFn->getFnAttribute("target-cpu"); 6843 if (TargetCpuAttr.isStringAttribute()) 6844 Func->addFnAttr(TargetCpuAttr); 6845 6846 auto TargetFeaturesAttr = ParentFn->getFnAttribute("target-features"); 6847 if (TargetFeaturesAttr.isStringAttribute()) 6848 Func->addFnAttr(TargetFeaturesAttr); 6849 6850 if (OMPBuilder.Config.isTargetDevice()) { 6851 Value *ExecMode = 6852 OMPBuilder.emitKernelExecutionMode(FuncName, DefaultAttrs.ExecFlags); 6853 OMPBuilder.emitUsed("llvm.compiler.used", {ExecMode}); 6854 } 6855 6856 // Save insert point. 
  IRBuilder<>::InsertPointGuard IPG(Builder);
  // If there's a DISubprogram associated with the current function, then
  // generate one for the outlined function.
  if (Function *ParentFunc = BB->getParent()) {
    if (DISubprogram *SP = ParentFunc->getSubprogram()) {
      DICompileUnit *CU = SP->getUnit();
      DIBuilder DB(*M, true, CU);
      DebugLoc DL = Builder.getCurrentDebugLocation();
      if (DL) {
        // TODO: We are using nullopt for arguments at the moment. This will
        // need to be updated when debug data is being generated for variables.
        DISubroutineType *Ty =
            DB.createSubroutineType(DB.getOrCreateTypeArray({}));
        DISubprogram::DISPFlags SPFlags = DISubprogram::SPFlagDefinition |
                                          DISubprogram::SPFlagOptimized |
                                          DISubprogram::SPFlagLocalToUnit;

        DISubprogram *OutlinedSP = DB.createFunction(
            CU, FuncName, FuncName, SP->getFile(), DL.getLine(), Ty,
            DL.getLine(), DINode::DIFlags::FlagArtificial, SPFlags);

        // Attach subprogram to the function.
        Func->setSubprogram(OutlinedSP);
        // Update the CurrentDebugLocation in the builder so that the right
        // scope is used for things inside the outlined function.
        Builder.SetCurrentDebugLocation(
            DILocation::get(Func->getContext(), DL.getLine(), DL.getCol(),
                            OutlinedSP, DL.getInlinedAt()));
      }
    }
  }

  // Generate the region into the function.
  BasicBlock *EntryBB = BasicBlock::Create(Builder.getContext(), "entry", Func);
  Builder.SetInsertPoint(EntryBB);

  // Insert target init call in the device compilation pass.
  if (OMPBuilder.Config.isTargetDevice())
    Builder.restoreIP(OMPBuilder.createTargetInit(Builder, DefaultAttrs));

  BasicBlock *UserCodeEntryBB = Builder.GetInsertBlock();

  // As we embed the user code in the middle of our target region after we
  // generate entry code, we must move whatever allocas we can into the entry
  // block to avoid possibly breaking optimizations for the device.
  if (OMPBuilder.Config.isTargetDevice())
    OMPBuilder.ConstantAllocaRaiseCandidates.emplace_back(Func);

  // Insert target deinit call in the device compilation pass.
  BasicBlock *OutlinedBodyBB =
      splitBB(Builder, /*CreateBranch=*/true, "outlined.body");
  llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP = CBFunc(
      Builder.saveIP(),
      OpenMPIRBuilder::InsertPointTy(OutlinedBodyBB, OutlinedBodyBB->begin()));
  if (!AfterIP)
    return AfterIP.takeError();
  Builder.restoreIP(*AfterIP);
  if (OMPBuilder.Config.isTargetDevice())
    OMPBuilder.createTargetDeinit(Builder);

  // Insert return instruction.
  Builder.CreateRetVoid();

  // New Alloca IP at entry point of the created device function.
  Builder.SetInsertPoint(EntryBB->getFirstNonPHIIt());
  auto AllocaIP = Builder.saveIP();

  Builder.SetInsertPoint(UserCodeEntryBB->getFirstNonPHIOrDbg());

  // Skip the artificial dyn_ptr on the device.
  const auto &ArgRange =
      OMPBuilder.Config.isTargetDevice()
          ? make_range(Func->arg_begin() + 1, Func->arg_end())
          : Func->args();

  auto ReplaceValue = [](Value *Input, Value *InputCopy, Function *Func) {
    // Things like GEPs can come in the form of Constants. Constants and
    // ConstantExprs do not have access to the knowledge of what they're
    // contained in, so we must dig a little to find an instruction so we
    // can tell if they're used inside of the function we're outlining. We
    // also replace the original constant expression with an equivalent new
    // instruction, since an instruction allows easy modification in the
    // following loop: we then know the constant (now an instruction) is
    // owned by our target function and replaceUsesOfWith can be invoked on
    // it (this does not seem possible with constants). A brand new one also
    // allows us to be cautious, as it is perhaps possible the old expression
    // was used inside of the function but also exists and is used externally
    // (unlikely by the nature of a Constant, but still possible).
    // NOTE: We cannot remove dead constants that have been rewritten to
    // instructions at this stage; we would run the risk of breaking later
    // lowering by doing so, as we could still be in the process of lowering
    // the module from MLIR to LLVM-IR and the MLIR lowering may still require
    // the original constants we have created rewritten versions of.
    if (auto *Const = dyn_cast<Constant>(Input))
      convertUsersOfConstantsToInstructions(Const, Func, false);

    // Collect all the instructions.
    for (User *User : make_early_inc_range(Input->users()))
      if (auto *Instr = dyn_cast<Instruction>(User))
        if (Instr->getFunction() == Func)
          Instr->replaceUsesOfWith(Input, InputCopy);
  };

  SmallVector<std::pair<Value *, Value *>> DeferredReplacement;

  // Rewrite uses of input values to parameters.
  for (auto InArg : zip(Inputs, ArgRange)) {
    Value *Input = std::get<0>(InArg);
    Argument &Arg = std::get<1>(InArg);
    Value *InputCopy = nullptr;

    llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
        ArgAccessorFuncCB(Arg, Input, InputCopy, AllocaIP, Builder.saveIP());
    if (!AfterIP)
      return AfterIP.takeError();
    Builder.restoreIP(*AfterIP);

    // In certain cases a Global may be set up for replacement; however, this
    // Global may be used in multiple arguments to the kernel, just segmented
    // apart. For example, if we have a global array that is sectioned into
    // multiple mappings (technically not legal in OpenMP, but there is a case
    // in Fortran for Common Blocks where this is necessary), we will end up
    // with GEPs into this array inside the kernel that refer to the Global
    // but are, for all intents and purposes, technically separate arguments
    // to the kernel. If we have mapped a segment that requires a GEP into the
    // 0-th index, it will fold into a reference to the Global; if we then
    // encounter this folded GEP during replacement, all of the references to
    // the Global in the kernel will be replaced with the argument we have
    // generated that corresponds to it, including any other GEPs that refer
    // to the Global and that may be other arguments. This would invalidate
    // all of the other preceding mapped arguments that refer to the same
    // global, which may be separate segments. To prevent this, we defer
    // global processing until all other processing has been performed.
    if (llvm::isa<llvm::GlobalValue>(std::get<0>(InArg)) ||
        llvm::isa<llvm::GlobalObject>(std::get<0>(InArg)) ||
        llvm::isa<llvm::GlobalVariable>(std::get<0>(InArg))) {
      DeferredReplacement.push_back(std::make_pair(Input, InputCopy));
      continue;
    }

    ReplaceValue(Input, InputCopy, Func);
  }

  // Replace all of our deferred Input values, currently just Globals.
  for (auto Deferred : DeferredReplacement)
    ReplaceValue(std::get<0>(Deferred), std::get<1>(Deferred), Func);

  return Func;
}

/// Create an entry point for a target task. It'll have the following
/// signature:
/// void @.omp_target_task_proxy_func(i32 %thread.id, ptr %task)
/// This function is called from emitTargetTask once the
/// code to launch the target kernel has been outlined.
static Function *emitTargetTaskProxyFunction(OpenMPIRBuilder &OMPBuilder,
                                             IRBuilderBase &Builder,
                                             CallInst *StaleCI) {
  Module &M = OMPBuilder.M;
  // KernelLaunchFunction is the target launch function, i.e.
  // the function that sets up kernel arguments and calls
  // __tgt_target_kernel to launch the kernel on the device.
  //
  Function *KernelLaunchFunction = StaleCI->getCalledFunction();

  // StaleCI is the CallInst which is the call to the outlined
  // target kernel launch function. If there are values that the
  // outlined function uses then these are aggregated into a structure
  // which is passed as the second argument. If not, then there's
  // only one argument, the threadID. So, StaleCI can be
  //
  // %structArg = alloca { ptr, ptr }, align 8
  // %gep_ = getelementptr { ptr, ptr }, ptr %structArg, i32 0, i32 0
  // store ptr %20, ptr %gep_, align 8
  // %gep_8 = getelementptr { ptr, ptr }, ptr %structArg, i32 0, i32 1
  // store ptr %21, ptr %gep_8, align 8
  // call void @_QQmain..omp_par.1(i32 %global.tid.val6, ptr %structArg)
  //
  // OR
  //
  // call void @_QQmain..omp_par.1(i32 %global.tid.val6)
  OpenMPIRBuilder::InsertPointTy IP(StaleCI->getParent(),
                                    StaleCI->getIterator());
  LLVMContext &Ctx = StaleCI->getParent()->getContext();
  Type *ThreadIDTy = Type::getInt32Ty(Ctx);
  Type *TaskPtrTy = OMPBuilder.TaskPtr;
  Type *TaskTy = OMPBuilder.Task;
  auto ProxyFnTy =
      FunctionType::get(Builder.getVoidTy(), {ThreadIDTy, TaskPtrTy},
                        /* isVarArg */ false);
  auto ProxyFn = Function::Create(ProxyFnTy, GlobalValue::InternalLinkage,
                                  ".omp_target_task_proxy_func",
                                  Builder.GetInsertBlock()->getModule());
  ProxyFn->getArg(0)->setName("thread.id");
  ProxyFn->getArg(1)->setName("task");

  BasicBlock *EntryBB =
      BasicBlock::Create(Builder.getContext(), "entry", ProxyFn);
  Builder.SetInsertPoint(EntryBB);

  bool HasShareds = StaleCI->arg_size() > 1;
  // TODO: This is a temporary assert to prove to ourselves that
  // the outlined target launch function is always going to have
  // at most two arguments if there is any data shared between
  // host and device.
  assert((!HasShareds || (StaleCI->arg_size() == 2)) &&
         "StaleCI with shareds should have exactly two arguments.");
  if (HasShareds) {
    auto *ArgStructAlloca = dyn_cast<AllocaInst>(StaleCI->getArgOperand(1));
    assert(ArgStructAlloca &&
           "Unable to find the alloca instruction corresponding to arguments "
           "for extracted function");
    auto *ArgStructType = cast<StructType>(ArgStructAlloca->getAllocatedType());

    AllocaInst *NewArgStructAlloca =
        Builder.CreateAlloca(ArgStructType, nullptr, "structArg");
    Value *TaskT = ProxyFn->getArg(1);
    Value *ThreadId = ProxyFn->getArg(0);
    Value *SharedsSize =
        Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType));

    Value *Shareds = Builder.CreateStructGEP(TaskTy, TaskT, 0);
    LoadInst *LoadShared =
        Builder.CreateLoad(PointerType::getUnqual(Ctx), Shareds);

    Builder.CreateMemCpy(
        NewArgStructAlloca, NewArgStructAlloca->getAlign(), LoadShared,
        LoadShared->getPointerAlignment(M.getDataLayout()), SharedsSize);

    Builder.CreateCall(KernelLaunchFunction, {ThreadId, NewArgStructAlloca});
  }
  Builder.CreateRetVoid();
  return ProxyFn;
}

static Error emitTargetOutlinedFunction(
    OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, bool IsOffloadEntry,
    TargetRegionEntryInfo &EntryInfo,
    const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs,
    Function *&OutlinedFn, Constant *&OutlinedFnID,
    SmallVectorImpl<Value *> &Inputs,
    OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc,
    OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB) {

  OpenMPIRBuilder::FunctionGenCallback &&GenerateOutlinedFunction =
      [&](StringRef EntryFnName) {
        return createOutlinedFunction(OMPBuilder, Builder, DefaultAttrs,
                                      EntryFnName, Inputs, CBFunc,
                                      ArgAccessorFuncCB);
      };

  return OMPBuilder.emitTargetRegionFunction(
      EntryInfo, GenerateOutlinedFunction, IsOffloadEntry, OutlinedFn,
      OutlinedFnID);
}

OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitTargetTask(
    TargetTaskBodyCallbackTy TaskBodyCB, Value *DeviceID, Value *RTLoc,
    OpenMPIRBuilder::InsertPointTy AllocaIP,
    const SmallVector<llvm::OpenMPIRBuilder::DependData> &Dependencies,
    bool HasNoWait) {

  // The following explains the code-gen scenario for the `target` directive. A
  // similar scenario is followed for other device-related directives (e.g.
  // `target enter data`), since we only need to emit a task that encapsulates
  // the proper runtime call.
  //
  // When we arrive at this function, the target region itself has been
  // outlined into the function OutlinedFn.
  // So at this point, for
  // --------------------------------------------------
  // void user_code_that_offloads(...) {
  //   omp target depend(..) map(from:a) map(to:b, c)
  //     a = b + c
  // }
  //
  // --------------------------------------------------
  //
  // we have
  //
  // --------------------------------------------------
  //
  // void user_code_that_offloads(...) {
  //   %.offload_baseptrs = alloca [3 x ptr], align 8
  //   %.offload_ptrs = alloca [3 x ptr], align 8
  //   %.offload_mappers = alloca [3 x ptr], align 8
  //   ;; target region has been outlined and now we need to
  //   ;; offload to it via a target task.
  // }
  // void outlined_device_function(ptr a, ptr b, ptr c) {
  //   *a = *b + *c
  // }
  //
  // We now have to do the following:
  // (i) Make an offloading call to outlined_device_function using the OpenMP
  //     RTL. See 'kernel_launch_function' in the pseudo code below. This is
  //     emitted by emitKernelLaunch.
  // (ii) Create a task entry point function that calls kernel_launch_function
  //      and is the entry point for the target task. See
  //      '@.omp_target_task_proxy_func' in the pseudocode below.
  // (iii) Create a task with the task entry point created in (ii).
  //
  // That is, we create the following
  //
  // void user_code_that_offloads(...) {
  //   %.offload_baseptrs = alloca [3 x ptr], align 8
  //   %.offload_ptrs = alloca [3 x ptr], align 8
  //   %.offload_mappers = alloca [3 x ptr], align 8
  //
  //   %structArg = alloca { ptr, ptr, ptr }, align 8
  //   %strucArg[0] = %.offload_baseptrs
  //   %strucArg[1] = %.offload_ptrs
  //   %strucArg[2] = %.offload_mappers
  //   proxy_target_task = @__kmpc_omp_task_alloc(...,
  //                                              @.omp_target_task_proxy_func)
  //   memcpy(proxy_target_task->shareds, %structArg, sizeof(structArg))
  //   dependencies_array = ...
  //   ;; if nowait not present
  //   call @__kmpc_omp_wait_deps(..., dependencies_array)
  //   call @__kmpc_omp_task_begin_if0(...)
  //   call @.omp_target_task_proxy_func(i32 thread_id, ptr %proxy_target_task)
  //   call @__kmpc_omp_task_complete_if0(...)
  // }
  //
  // define internal void @.omp_target_task_proxy_func(i32 %thread.id,
  //                                                   ptr %task) {
  //   %structArg = alloca {ptr, ptr, ptr}
  //   %shared_data = load (getelementptr %task, 0, 0)
  //   memcpy(%structArg, %shared_data, sizeof(structArg))
  //   kernel_launch_function(%thread.id, %structArg)
  // }
  //
  // We need the proxy function because the signature of the task entry point
  // expected by kmpc_omp_task is always the same and will be different from
  // that of the kernel_launch function.
  //
  // kernel_launch_function is generated by emitKernelLaunch and has the
  // always_inline attribute.
  // void kernel_launch_function(thread_id,
  //                             structArg) alwaysinline {
  //   %kernel_args = alloca %struct.__tgt_kernel_arguments, align 8
  //   offload_baseptrs = load(getelementptr structArg, 0, 0)
  //   offload_ptrs = load(getelementptr structArg, 0, 1)
  //   offload_mappers = load(getelementptr structArg, 0, 2)
  //   ; setup kernel_args using offload_baseptrs, offload_ptrs and
  //   ; offload_mappers
  //   call i32 @__tgt_target_kernel(...,
  //                                 outlined_device_function,
  //                                 ptr %kernel_args)
  // }
  // void outlined_device_function(ptr a, ptr b, ptr c) {
  //   *a = *b + *c
  // }
  //
  BasicBlock *TargetTaskBodyBB =
      splitBB(Builder, /*CreateBranch=*/true, "target.task.body");
  BasicBlock *TargetTaskAllocaBB =
      splitBB(Builder, /*CreateBranch=*/true, "target.task.alloca");

  InsertPointTy TargetTaskAllocaIP(TargetTaskAllocaBB,
                                   TargetTaskAllocaBB->begin());
  InsertPointTy TargetTaskBodyIP(TargetTaskBodyBB, TargetTaskBodyBB->begin());

  OutlineInfo OI;
  OI.EntryBB = TargetTaskAllocaBB;
  OI.OuterAllocaBB = AllocaIP.getBlock();

  // Add the thread ID argument.
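  // (Here createFakeIntVal materialises a placeholder i32 standing in for
  // the global thread ID, so the outliner turns it into a parameter that is
  // excluded from the shareds aggregate; the helper instructions it creates
  // are recorded in ToBeDeleted and erased again in PostOutlineCB below.)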
  SmallVector<Instruction *, 4> ToBeDeleted;
  OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
      Builder, AllocaIP, ToBeDeleted, TargetTaskAllocaIP, "global.tid", false));

  Builder.restoreIP(TargetTaskBodyIP);

  if (Error Err = TaskBodyCB(DeviceID, RTLoc, TargetTaskAllocaIP))
    return Err;

  OI.ExitBB = Builder.saveIP().getBlock();
  OI.PostOutlineCB = [this, ToBeDeleted, Dependencies, HasNoWait,
                      DeviceID](Function &OutlinedFn) mutable {
    assert(OutlinedFn.getNumUses() == 1 &&
           "there must be a single user for the outlined function");

    CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
    bool HasShareds = StaleCI->arg_size() > 1;

    Function *ProxyFn = emitTargetTaskProxyFunction(*this, Builder, StaleCI);

    LLVM_DEBUG(dbgs() << "Proxy task entry function created: " << *ProxyFn
                      << "\n");

    Builder.SetInsertPoint(StaleCI);

    // Gather the arguments for emitting the runtime call.
    uint32_t SrcLocStrSize;
    Constant *SrcLocStr =
        getOrCreateSrcLocStr(LocationDescription(Builder), SrcLocStrSize);
    Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);

    // @__kmpc_omp_task_alloc or @__kmpc_omp_target_task_alloc
    //
    // If `HasNoWait == true`, we call @__kmpc_omp_target_task_alloc instead,
    // both to provide the DeviceID to the deferred task and because
    // @__kmpc_omp_target_task_alloc creates an untied/async task.
    bool NeedsTargetTask = HasNoWait && DeviceID;
    Function *TaskAllocFn =
        !NeedsTargetTask
            ? getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc)
            : getOrCreateRuntimeFunctionPtr(
                  OMPRTL___kmpc_omp_target_task_alloc);

    // Arguments - `loc_ref` (Ident) and `gtid` (ThreadID) for the task alloc
    // call.
    Value *ThreadID = getOrCreateThreadID(Ident);

    // Argument - `sizeof_kmp_task_t` (TaskSize)
    // TaskSize refers to the size in bytes of the kmp_task_t data structure,
    // including private vars accessed in the task.
    // TODO: add kmp_task_t_with_privates (privates)
    Value *TaskSize =
        Builder.getInt64(M.getDataLayout().getTypeStoreSize(Task));

    // Argument - `sizeof_shareds` (SharedsSize)
    // SharedsSize refers to the shareds array size in the kmp_task_t data
    // structure.
    Value *SharedsSize = Builder.getInt64(0);
    if (HasShareds) {
      auto *ArgStructAlloca = dyn_cast<AllocaInst>(StaleCI->getArgOperand(1));
      assert(ArgStructAlloca &&
             "Unable to find the alloca instruction corresponding to arguments "
             "for extracted function");
      auto *ArgStructType =
          dyn_cast<StructType>(ArgStructAlloca->getAllocatedType());
      assert(ArgStructType && "Unable to find struct type corresponding to "
                              "arguments for extracted function");
      SharedsSize =
          Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType));
    }

    // Argument - `flags`
    // Task is tied iff (Flags & 1) == 1.
    // Task is untied iff (Flags & 1) == 0.
    // Task is final iff (Flags & 2) == 2.
    // Task is not final iff (Flags & 2) == 0.
    // A target task is not final and is untied.
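    // For illustration: Flags = 1 would encode a tied, non-final task and
    // Flags = 3 a tied, final task; the 0 below therefore requests an
    // untied, non-final task.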
7302 Value *Flags = Builder.getInt32(0); 7303 7304 // Emit the @__kmpc_omp_task_alloc runtime call 7305 // The runtime call returns a pointer to an area where the task captured 7306 // variables must be copied before the task is run (TaskData) 7307 CallInst *TaskData = nullptr; 7308 7309 SmallVector<llvm::Value *> TaskAllocArgs = { 7310 /*loc_ref=*/Ident, /*gtid=*/ThreadID, 7311 /*flags=*/Flags, 7312 /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize, 7313 /*task_func=*/ProxyFn}; 7314 7315 if (NeedsTargetTask) { 7316 assert(DeviceID && "Expected non-empty device ID."); 7317 TaskAllocArgs.push_back(DeviceID); 7318 } 7319 7320 TaskData = Builder.CreateCall(TaskAllocFn, TaskAllocArgs); 7321 7322 if (HasShareds) { 7323 Value *Shareds = StaleCI->getArgOperand(1); 7324 Align Alignment = TaskData->getPointerAlignment(M.getDataLayout()); 7325 Value *TaskShareds = Builder.CreateLoad(VoidPtr, TaskData); 7326 Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment, 7327 SharedsSize); 7328 } 7329 7330 Value *DepArray = emitTaskDependencies(*this, Dependencies); 7331 7332 // --------------------------------------------------------------- 7333 // V5.2 13.8 target construct 7334 // If the nowait clause is present, execution of the target task 7335 // may be deferred. If the nowait clause is not present, the target task is 7336 // an included task. 7337 // --------------------------------------------------------------- 7338 // The above means that the lack of a nowait on the target construct 7339 // translates to '#pragma omp task if(0)' 7340 if (!NeedsTargetTask) { 7341 if (DepArray) { 7342 Function *TaskWaitFn = 7343 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_wait_deps); 7344 Builder.CreateCall( 7345 TaskWaitFn, 7346 {/*loc_ref=*/Ident, /*gtid=*/ThreadID, 7347 /*ndeps=*/Builder.getInt32(Dependencies.size()), 7348 /*dep_list=*/DepArray, 7349 /*ndeps_noalias=*/ConstantInt::get(Builder.getInt32Ty(), 0), 7350 /*noalias_dep_list=*/ 7351 ConstantPointerNull::get(PointerType::getUnqual(M.getContext()))}); 7352 } 7353 // Included task. 7354 Function *TaskBeginFn = 7355 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_begin_if0); 7356 Function *TaskCompleteFn = 7357 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_complete_if0); 7358 Builder.CreateCall(TaskBeginFn, {Ident, ThreadID, TaskData}); 7359 CallInst *CI = Builder.CreateCall(ProxyFn, {ThreadID, TaskData}); 7360 CI->setDebugLoc(StaleCI->getDebugLoc()); 7361 Builder.CreateCall(TaskCompleteFn, {Ident, ThreadID, TaskData}); 7362 } else if (DepArray) { 7363 // HasNoWait - meaning the task may be deferred. 
Call 7364 // __kmpc_omp_task_with_deps if there are dependencies, 7365 // else call __kmpc_omp_task 7366 Function *TaskFn = 7367 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_with_deps); 7368 Builder.CreateCall( 7369 TaskFn, 7370 {Ident, ThreadID, TaskData, Builder.getInt32(Dependencies.size()), 7371 DepArray, ConstantInt::get(Builder.getInt32Ty(), 0), 7372 ConstantPointerNull::get(PointerType::getUnqual(M.getContext()))}); 7373 } else { 7374 // Emit the @__kmpc_omp_task runtime call to spawn the task 7375 Function *TaskFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task); 7376 Builder.CreateCall(TaskFn, {Ident, ThreadID, TaskData}); 7377 } 7378 7379 StaleCI->eraseFromParent(); 7380 for (Instruction *I : llvm::reverse(ToBeDeleted)) 7381 I->eraseFromParent(); 7382 }; 7383 addOutlineInfo(std::move(OI)); 7384 7385 LLVM_DEBUG(dbgs() << "Insert block after emitKernelLaunch = \n" 7386 << *(Builder.GetInsertBlock()) << "\n"); 7387 LLVM_DEBUG(dbgs() << "Module after emitKernelLaunch = \n" 7388 << *(Builder.GetInsertBlock()->getParent()->getParent()) 7389 << "\n"); 7390 return Builder.saveIP(); 7391 } 7392 7393 void OpenMPIRBuilder::emitOffloadingArraysAndArgs( 7394 InsertPointTy AllocaIP, InsertPointTy CodeGenIP, TargetDataInfo &Info, 7395 TargetDataRTArgs &RTArgs, MapInfosTy &CombinedInfo, bool IsNonContiguous, 7396 bool ForEndCall, function_ref<void(unsigned int, Value *)> DeviceAddrCB, 7397 function_ref<Value *(unsigned int)> CustomMapperCB) { 7398 emitOffloadingArrays(AllocaIP, CodeGenIP, CombinedInfo, Info, IsNonContiguous, 7399 DeviceAddrCB, CustomMapperCB); 7400 emitOffloadingArraysArgument(Builder, RTArgs, Info, ForEndCall); 7401 } 7402 7403 static void 7404 emitTargetCall(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, 7405 OpenMPIRBuilder::InsertPointTy AllocaIP, 7406 const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs, 7407 const OpenMPIRBuilder::TargetKernelRuntimeAttrs &RuntimeAttrs, 7408 Value *IfCond, Function *OutlinedFn, Constant *OutlinedFnID, 7409 SmallVectorImpl<Value *> &Args, 7410 OpenMPIRBuilder::GenMapInfoCallbackTy GenMapInfoCB, 7411 SmallVector<llvm::OpenMPIRBuilder::DependData> Dependencies = {}, 7412 bool HasNoWait = false) { 7413 // Generate a function call to the host fallback implementation of the target 7414 // region. This is called by the host when no offload entry was generated for 7415 // the target region and when the offloading call fails at runtime. 7416 auto &&EmitTargetCallFallbackCB = [&](OpenMPIRBuilder::InsertPointTy IP) 7417 -> OpenMPIRBuilder::InsertPointOrErrorTy { 7418 Builder.restoreIP(IP); 7419 Builder.CreateCall(OutlinedFn, Args); 7420 return Builder.saveIP(); 7421 }; 7422 7423 bool HasDependencies = Dependencies.size() > 0; 7424 bool RequiresOuterTargetTask = HasNoWait || HasDependencies; 7425 7426 OpenMPIRBuilder::TargetKernelArgs KArgs; 7427 7428 auto TaskBodyCB = 7429 [&](Value *DeviceID, Value *RTLoc, 7430 IRBuilderBase::InsertPoint TargetTaskAllocaIP) -> Error { 7431 // Assume no error was returned because EmitTargetCallFallbackCB doesn't 7432 // produce any. 7433 llvm::OpenMPIRBuilder::InsertPointTy AfterIP = cantFail([&]() { 7434 // emitKernelLaunch makes the necessary runtime call to offload the 7435 // kernel. We then outline all that code into a separate function 7436 // ('kernel_launch_function' in the pseudo code above). 
This function is
      // then called by the target task proxy function (see
      // '@.omp_target_task_proxy_func' in the pseudo code above).
      // '@.omp_target_task_proxy_func' is generated by
      // emitTargetTaskProxyFunction.
      if (OutlinedFnID)
        return OMPBuilder.emitKernelLaunch(Builder, OutlinedFnID,
                                           EmitTargetCallFallbackCB, KArgs,
                                           DeviceID, RTLoc, TargetTaskAllocaIP);
      // When OutlinedFnID is set to nullptr, then it's not an offloading call.
      // In this case, we execute the host implementation directly.
      return EmitTargetCallFallbackCB(OMPBuilder.Builder.saveIP());
    }());

    OMPBuilder.Builder.restoreIP(AfterIP);
    return Error::success();
  };

  auto &&EmitTargetCallElse =
      [&](OpenMPIRBuilder::InsertPointTy AllocaIP,
          OpenMPIRBuilder::InsertPointTy CodeGenIP) -> Error {
    // Assume no error was returned because EmitTargetCallFallbackCB doesn't
    // produce any.
    OpenMPIRBuilder::InsertPointTy AfterIP = cantFail([&]() {
      if (RequiresOuterTargetTask) {
        // Arguments that are intended to be directly forwarded to an
        // emitKernelLaunch call are passed as nullptr, since
        // OutlinedFnID=nullptr results in that call not being done.
        return OMPBuilder.emitTargetTask(TaskBodyCB, /*DeviceID=*/nullptr,
                                         /*RTLoc=*/nullptr, AllocaIP,
                                         Dependencies, HasNoWait);
      }
      return EmitTargetCallFallbackCB(Builder.saveIP());
    }());

    Builder.restoreIP(AfterIP);
    return Error::success();
  };

  auto &&EmitTargetCallThen =
      [&](OpenMPIRBuilder::InsertPointTy AllocaIP,
          OpenMPIRBuilder::InsertPointTy CodeGenIP) -> Error {
    OpenMPIRBuilder::TargetDataInfo Info(
        /*RequiresDevicePointerInfo=*/false,
        /*SeparateBeginEndCalls=*/true);

    OpenMPIRBuilder::MapInfosTy &MapInfo = GenMapInfoCB(Builder.saveIP());
    OpenMPIRBuilder::TargetDataRTArgs RTArgs;
    OMPBuilder.emitOffloadingArraysAndArgs(AllocaIP, Builder.saveIP(), Info,
                                           RTArgs, MapInfo,
                                           /*IsNonContiguous=*/true,
                                           /*ForEndCall=*/false);

    SmallVector<Value *, 3> NumTeamsC;
    for (auto [DefaultVal, RuntimeVal] :
         zip_equal(DefaultAttrs.MaxTeams, RuntimeAttrs.MaxTeams))
      NumTeamsC.push_back(RuntimeVal ? RuntimeVal
                                     : Builder.getInt32(DefaultVal));

    // Calculate number of threads: 0 if no clauses specified, otherwise it is
    // the minimum between optional THREAD_LIMIT and NUM_THREADS clauses.
    auto InitMaxThreadsClause = [&Builder](Value *Clause) {
      if (Clause)
        Clause = Builder.CreateIntCast(Clause, Builder.getInt32Ty(),
                                       /*isSigned=*/false);
      return Clause;
    };
    auto CombineMaxThreadsClauses = [&Builder](Value *Clause, Value *&Result) {
      if (Clause)
        Result =
            Result ? Builder.CreateSelect(Builder.CreateICmpULT(Result, Clause),
                                          Result, Clause)
                   : Clause;
    };

    // If a multi-dimensional THREAD_LIMIT is set, it is the OMPX_BARE case, so
    // the NUM_THREADS clause is overridden by THREAD_LIMIT.
    SmallVector<Value *, 3> NumThreadsC;
    Value *MaxThreadsClause =
        RuntimeAttrs.TeamsThreadLimit.size() == 1
            ? InitMaxThreadsClause(RuntimeAttrs.MaxThreads)
            : nullptr;

    for (auto [TeamsVal, TargetVal] : zip_equal(
             RuntimeAttrs.TeamsThreadLimit, RuntimeAttrs.TargetThreadLimit)) {
      Value *TeamsThreadLimitClause = InitMaxThreadsClause(TeamsVal);
      Value *NumThreads = InitMaxThreadsClause(TargetVal);

      CombineMaxThreadsClauses(TeamsThreadLimitClause, NumThreads);
      CombineMaxThreadsClauses(MaxThreadsClause, NumThreads);

      NumThreadsC.push_back(NumThreads ? NumThreads : Builder.getInt32(0));
    }

    unsigned NumTargetItems = Info.NumberOfPtrs;
    // TODO: Use correct device ID
    Value *DeviceID = Builder.getInt64(OMP_DEVICEID_UNDEF);
    uint32_t SrcLocStrSize;
    Constant *SrcLocStr = OMPBuilder.getOrCreateDefaultSrcLocStr(SrcLocStrSize);
    Value *RTLoc = OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize,
                                               llvm::omp::IdentFlag(0), 0);

    Value *TripCount = RuntimeAttrs.LoopTripCount
                           ? Builder.CreateIntCast(RuntimeAttrs.LoopTripCount,
                                                   Builder.getInt64Ty(),
                                                   /*isSigned=*/false)
                           : Builder.getInt64(0);

    // TODO: Use correct DynCGGroupMem
    Value *DynCGGroupMem = Builder.getInt32(0);

    KArgs = OpenMPIRBuilder::TargetKernelArgs(NumTargetItems, RTArgs, TripCount,
                                              NumTeamsC, NumThreadsC,
                                              DynCGGroupMem, HasNoWait);

    // Assume no error was returned because TaskBodyCB and
    // EmitTargetCallFallbackCB don't produce any.
    OpenMPIRBuilder::InsertPointTy AfterIP = cantFail([&]() {
      // The presence of certain clauses on the target directive requires the
      // explicit generation of the target task.
      if (RequiresOuterTargetTask)
        return OMPBuilder.emitTargetTask(TaskBodyCB, DeviceID, RTLoc, AllocaIP,
                                         Dependencies, HasNoWait);

      return OMPBuilder.emitKernelLaunch(Builder, OutlinedFnID,
                                         EmitTargetCallFallbackCB, KArgs,
                                         DeviceID, RTLoc, AllocaIP);
    }());

    Builder.restoreIP(AfterIP);
    return Error::success();
  };

  // If we don't have an ID for the target region, it means an offload entry
  // wasn't created. In this case we just run the host fallback directly and
  // ignore any potential 'if' clauses.
  if (!OutlinedFnID) {
    cantFail(EmitTargetCallElse(AllocaIP, Builder.saveIP()));
    return;
  }

  // If there's no 'if' clause, only generate the kernel launch code path.
  if (!IfCond) {
    cantFail(EmitTargetCallThen(AllocaIP, Builder.saveIP()));
    return;
  }

  cantFail(OMPBuilder.emitIfClause(IfCond, EmitTargetCallThen,
                                   EmitTargetCallElse, AllocaIP));
}

OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTarget(
    const LocationDescription &Loc, bool IsOffloadEntry, InsertPointTy AllocaIP,
    InsertPointTy CodeGenIP, TargetRegionEntryInfo &EntryInfo,
    const TargetKernelDefaultAttrs &DefaultAttrs,
    const TargetKernelRuntimeAttrs &RuntimeAttrs, Value *IfCond,
    SmallVectorImpl<Value *> &Args, GenMapInfoCallbackTy GenMapInfoCB,
    OpenMPIRBuilder::TargetBodyGenCallbackTy CBFunc,
    OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB,
    SmallVector<DependData> Dependencies, bool HasNowait) {

  if (!updateToLocation(Loc))
    return InsertPointTy();

  Builder.restoreIP(CodeGenIP);

  Function *OutlinedFn;
  Constant *OutlinedFnID = nullptr;
  // The target region is outlined into its own function.
The LLVM IR for 7605 // the target region itself is generated using the callbacks CBFunc 7606 // and ArgAccessorFuncCB 7607 if (Error Err = emitTargetOutlinedFunction( 7608 *this, Builder, IsOffloadEntry, EntryInfo, DefaultAttrs, OutlinedFn, 7609 OutlinedFnID, Args, CBFunc, ArgAccessorFuncCB)) 7610 return Err; 7611 7612 // If we are not on the target device, then we need to generate code 7613 // to make a remote call (offload) to the previously outlined function 7614 // that represents the target region. Do that now. 7615 if (!Config.isTargetDevice()) 7616 emitTargetCall(*this, Builder, AllocaIP, DefaultAttrs, RuntimeAttrs, IfCond, 7617 OutlinedFn, OutlinedFnID, Args, GenMapInfoCB, Dependencies, 7618 HasNowait); 7619 return Builder.saveIP(); 7620 } 7621 7622 std::string OpenMPIRBuilder::getNameWithSeparators(ArrayRef<StringRef> Parts, 7623 StringRef FirstSeparator, 7624 StringRef Separator) { 7625 SmallString<128> Buffer; 7626 llvm::raw_svector_ostream OS(Buffer); 7627 StringRef Sep = FirstSeparator; 7628 for (StringRef Part : Parts) { 7629 OS << Sep << Part; 7630 Sep = Separator; 7631 } 7632 return OS.str().str(); 7633 } 7634 7635 std::string 7636 OpenMPIRBuilder::createPlatformSpecificName(ArrayRef<StringRef> Parts) const { 7637 return OpenMPIRBuilder::getNameWithSeparators(Parts, Config.firstSeparator(), 7638 Config.separator()); 7639 } 7640 7641 GlobalVariable * 7642 OpenMPIRBuilder::getOrCreateInternalVariable(Type *Ty, const StringRef &Name, 7643 unsigned AddressSpace) { 7644 auto &Elem = *InternalVars.try_emplace(Name, nullptr).first; 7645 if (Elem.second) { 7646 assert(Elem.second->getValueType() == Ty && 7647 "OMP internal variable has different type than requested"); 7648 } else { 7649 // TODO: investigate the appropriate linkage type used for the global 7650 // variable for possibly changing that to internal or private, or maybe 7651 // create different versions of the function for different OMP internal 7652 // variables. 7653 auto Linkage = this->M.getTargetTriple().rfind("wasm32") == 0 7654 ? 
GlobalValue::InternalLinkage 7655 : GlobalValue::CommonLinkage; 7656 auto *GV = new GlobalVariable(M, Ty, /*IsConstant=*/false, Linkage, 7657 Constant::getNullValue(Ty), Elem.first(), 7658 /*InsertBefore=*/nullptr, 7659 GlobalValue::NotThreadLocal, AddressSpace); 7660 const DataLayout &DL = M.getDataLayout(); 7661 const llvm::Align TypeAlign = DL.getABITypeAlign(Ty); 7662 const llvm::Align PtrAlign = DL.getPointerABIAlignment(AddressSpace); 7663 GV->setAlignment(std::max(TypeAlign, PtrAlign)); 7664 Elem.second = GV; 7665 } 7666 7667 return Elem.second; 7668 } 7669 7670 Value *OpenMPIRBuilder::getOMPCriticalRegionLock(StringRef CriticalName) { 7671 std::string Prefix = Twine("gomp_critical_user_", CriticalName).str(); 7672 std::string Name = getNameWithSeparators({Prefix, "var"}, ".", "."); 7673 return getOrCreateInternalVariable(KmpCriticalNameTy, Name); 7674 } 7675 7676 Value *OpenMPIRBuilder::getSizeInBytes(Value *BasePtr) { 7677 LLVMContext &Ctx = Builder.getContext(); 7678 Value *Null = 7679 Constant::getNullValue(PointerType::getUnqual(BasePtr->getContext())); 7680 Value *SizeGep = 7681 Builder.CreateGEP(BasePtr->getType(), Null, Builder.getInt32(1)); 7682 Value *SizePtrToInt = Builder.CreatePtrToInt(SizeGep, Type::getInt64Ty(Ctx)); 7683 return SizePtrToInt; 7684 } 7685 7686 GlobalVariable * 7687 OpenMPIRBuilder::createOffloadMaptypes(SmallVectorImpl<uint64_t> &Mappings, 7688 std::string VarName) { 7689 llvm::Constant *MaptypesArrayInit = 7690 llvm::ConstantDataArray::get(M.getContext(), Mappings); 7691 auto *MaptypesArrayGlobal = new llvm::GlobalVariable( 7692 M, MaptypesArrayInit->getType(), 7693 /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, MaptypesArrayInit, 7694 VarName); 7695 MaptypesArrayGlobal->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global); 7696 return MaptypesArrayGlobal; 7697 } 7698 7699 void OpenMPIRBuilder::createMapperAllocas(const LocationDescription &Loc, 7700 InsertPointTy AllocaIP, 7701 unsigned NumOperands, 7702 struct MapperAllocas &MapperAllocas) { 7703 if (!updateToLocation(Loc)) 7704 return; 7705 7706 auto *ArrI8PtrTy = ArrayType::get(Int8Ptr, NumOperands); 7707 auto *ArrI64Ty = ArrayType::get(Int64, NumOperands); 7708 Builder.restoreIP(AllocaIP); 7709 AllocaInst *ArgsBase = Builder.CreateAlloca( 7710 ArrI8PtrTy, /* ArraySize = */ nullptr, ".offload_baseptrs"); 7711 AllocaInst *Args = Builder.CreateAlloca(ArrI8PtrTy, /* ArraySize = */ nullptr, 7712 ".offload_ptrs"); 7713 AllocaInst *ArgSizes = Builder.CreateAlloca( 7714 ArrI64Ty, /* ArraySize = */ nullptr, ".offload_sizes"); 7715 Builder.restoreIP(Loc.IP); 7716 MapperAllocas.ArgsBase = ArgsBase; 7717 MapperAllocas.Args = Args; 7718 MapperAllocas.ArgSizes = ArgSizes; 7719 } 7720 7721 void OpenMPIRBuilder::emitMapperCall(const LocationDescription &Loc, 7722 Function *MapperFunc, Value *SrcLocInfo, 7723 Value *MaptypesArg, Value *MapnamesArg, 7724 struct MapperAllocas &MapperAllocas, 7725 int64_t DeviceID, unsigned NumOperands) { 7726 if (!updateToLocation(Loc)) 7727 return; 7728 7729 auto *ArrI8PtrTy = ArrayType::get(Int8Ptr, NumOperands); 7730 auto *ArrI64Ty = ArrayType::get(Int64, NumOperands); 7731 Value *ArgsBaseGEP = 7732 Builder.CreateInBoundsGEP(ArrI8PtrTy, MapperAllocas.ArgsBase, 7733 {Builder.getInt32(0), Builder.getInt32(0)}); 7734 Value *ArgsGEP = 7735 Builder.CreateInBoundsGEP(ArrI8PtrTy, MapperAllocas.Args, 7736 {Builder.getInt32(0), Builder.getInt32(0)}); 7737 Value *ArgSizesGEP = 7738 Builder.CreateInBoundsGEP(ArrI64Ty, MapperAllocas.ArgSizes, 7739 {Builder.getInt32(0), 
Builder.getInt32(0)}); 7740 Value *NullPtr = 7741 Constant::getNullValue(PointerType::getUnqual(Int8Ptr->getContext())); 7742 Builder.CreateCall(MapperFunc, 7743 {SrcLocInfo, Builder.getInt64(DeviceID), 7744 Builder.getInt32(NumOperands), ArgsBaseGEP, ArgsGEP, 7745 ArgSizesGEP, MaptypesArg, MapnamesArg, NullPtr}); 7746 } 7747 7748 void OpenMPIRBuilder::emitOffloadingArraysArgument(IRBuilderBase &Builder, 7749 TargetDataRTArgs &RTArgs, 7750 TargetDataInfo &Info, 7751 bool ForEndCall) { 7752 assert((!ForEndCall || Info.separateBeginEndCalls()) && 7753 "expected region end call to runtime only when end call is separate"); 7754 auto UnqualPtrTy = PointerType::getUnqual(M.getContext()); 7755 auto VoidPtrTy = UnqualPtrTy; 7756 auto VoidPtrPtrTy = UnqualPtrTy; 7757 auto Int64Ty = Type::getInt64Ty(M.getContext()); 7758 auto Int64PtrTy = UnqualPtrTy; 7759 7760 if (!Info.NumberOfPtrs) { 7761 RTArgs.BasePointersArray = ConstantPointerNull::get(VoidPtrPtrTy); 7762 RTArgs.PointersArray = ConstantPointerNull::get(VoidPtrPtrTy); 7763 RTArgs.SizesArray = ConstantPointerNull::get(Int64PtrTy); 7764 RTArgs.MapTypesArray = ConstantPointerNull::get(Int64PtrTy); 7765 RTArgs.MapNamesArray = ConstantPointerNull::get(VoidPtrPtrTy); 7766 RTArgs.MappersArray = ConstantPointerNull::get(VoidPtrPtrTy); 7767 return; 7768 } 7769 7770 RTArgs.BasePointersArray = Builder.CreateConstInBoundsGEP2_32( 7771 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs), 7772 Info.RTArgs.BasePointersArray, 7773 /*Idx0=*/0, /*Idx1=*/0); 7774 RTArgs.PointersArray = Builder.CreateConstInBoundsGEP2_32( 7775 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs), Info.RTArgs.PointersArray, 7776 /*Idx0=*/0, 7777 /*Idx1=*/0); 7778 RTArgs.SizesArray = Builder.CreateConstInBoundsGEP2_32( 7779 ArrayType::get(Int64Ty, Info.NumberOfPtrs), Info.RTArgs.SizesArray, 7780 /*Idx0=*/0, /*Idx1=*/0); 7781 RTArgs.MapTypesArray = Builder.CreateConstInBoundsGEP2_32( 7782 ArrayType::get(Int64Ty, Info.NumberOfPtrs), 7783 ForEndCall && Info.RTArgs.MapTypesArrayEnd ? Info.RTArgs.MapTypesArrayEnd 7784 : Info.RTArgs.MapTypesArray, 7785 /*Idx0=*/0, 7786 /*Idx1=*/0); 7787 7788 // Only emit the mapper information arrays if debug information is 7789 // requested. 7790 if (!Info.EmitDebug) 7791 RTArgs.MapNamesArray = ConstantPointerNull::get(VoidPtrPtrTy); 7792 else 7793 RTArgs.MapNamesArray = Builder.CreateConstInBoundsGEP2_32( 7794 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs), Info.RTArgs.MapNamesArray, 7795 /*Idx0=*/0, 7796 /*Idx1=*/0); 7797 // If there is no user-defined mapper, set the mapper array to nullptr to 7798 // avoid an unnecessary data privatization 7799 if (!Info.HasMapper) 7800 RTArgs.MappersArray = ConstantPointerNull::get(VoidPtrPtrTy); 7801 else 7802 RTArgs.MappersArray = 7803 Builder.CreatePointerCast(Info.RTArgs.MappersArray, VoidPtrPtrTy); 7804 } 7805 7806 void OpenMPIRBuilder::emitNonContiguousDescriptor(InsertPointTy AllocaIP, 7807 InsertPointTy CodeGenIP, 7808 MapInfosTy &CombinedInfo, 7809 TargetDataInfo &Info) { 7810 MapInfosTy::StructNonContiguousInfo &NonContigInfo = 7811 CombinedInfo.NonContigInfo; 7812 7813 // Build an array of struct descriptor_dim and then assign it to 7814 // offload_args. 
  //
  // struct descriptor_dim {
  //   uint64_t offset;
  //   uint64_t count;
  //   uint64_t stride;
  // };
  Type *Int64Ty = Builder.getInt64Ty();
  StructType *DimTy = StructType::create(
      M.getContext(), ArrayRef<Type *>({Int64Ty, Int64Ty, Int64Ty}),
      "struct.descriptor_dim");

  enum { OffsetFD = 0, CountFD, StrideFD };
  // We need two index variables here since the size of "Dims" is the same as
  // the size of Components; however, the size of offset, count, and stride is
  // equal to the number of base declarations that are non-contiguous.
  for (unsigned I = 0, L = 0, E = NonContigInfo.Dims.size(); I < E; ++I) {
    // Skip emitting IR if the dimension size is 1, since it cannot be
    // non-contiguous.
    if (NonContigInfo.Dims[I] == 1)
      continue;
    Builder.restoreIP(AllocaIP);
    ArrayType *ArrayTy = ArrayType::get(DimTy, NonContigInfo.Dims[I]);
    AllocaInst *DimsAddr =
        Builder.CreateAlloca(ArrayTy, /* ArraySize = */ nullptr, "dims");
    Builder.restoreIP(CodeGenIP);
    for (unsigned II = 0, EE = NonContigInfo.Dims[I]; II < EE; ++II) {
      unsigned RevIdx = EE - II - 1;
      Value *DimsLVal = Builder.CreateInBoundsGEP(
          DimsAddr->getAllocatedType(), DimsAddr,
          {Builder.getInt64(0), Builder.getInt64(II)});
      // Offset
      Value *OffsetLVal = Builder.CreateStructGEP(DimTy, DimsLVal, OffsetFD);
      Builder.CreateAlignedStore(
          NonContigInfo.Offsets[L][RevIdx], OffsetLVal,
          M.getDataLayout().getPrefTypeAlign(OffsetLVal->getType()));
      // Count
      Value *CountLVal = Builder.CreateStructGEP(DimTy, DimsLVal, CountFD);
      Builder.CreateAlignedStore(
          NonContigInfo.Counts[L][RevIdx], CountLVal,
          M.getDataLayout().getPrefTypeAlign(CountLVal->getType()));
      // Stride
      Value *StrideLVal = Builder.CreateStructGEP(DimTy, DimsLVal, StrideFD);
      Builder.CreateAlignedStore(
          NonContigInfo.Strides[L][RevIdx], StrideLVal,
          M.getDataLayout().getPrefTypeAlign(StrideLVal->getType()));
    }
    // args[I] = &dims
    Builder.restoreIP(CodeGenIP);
    Value *DAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(
        DimsAddr, Builder.getPtrTy());
    Value *P = Builder.CreateConstInBoundsGEP2_32(
        ArrayType::get(Builder.getPtrTy(), Info.NumberOfPtrs),
        Info.RTArgs.PointersArray, 0, I);
    Builder.CreateAlignedStore(
        DAddr, P, M.getDataLayout().getPrefTypeAlign(Builder.getPtrTy()));
    ++L;
  }
}

void OpenMPIRBuilder::emitUDMapperArrayInitOrDel(
    Function *MapperFn, Value *MapperHandle, Value *Base, Value *Begin,
    Value *Size, Value *MapType, Value *MapName, TypeSize ElementSize,
    BasicBlock *ExitBB, bool IsInit) {
  StringRef Prefix = IsInit ? ".init" : ".del";

  // Evaluate if this is an array section.
  BasicBlock *BodyBB = BasicBlock::Create(
      M.getContext(), createPlatformSpecificName({"omp.array", Prefix}));
  Value *IsArray =
      Builder.CreateICmpSGT(Size, Builder.getInt64(1), "omp.arrayinit.isarray");
  Value *DeleteBit = Builder.CreateAnd(
      MapType,
      Builder.getInt64(
          static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
              OpenMPOffloadMappingFlags::OMP_MAP_DELETE)));
  Value *DeleteCond;
  Value *Cond;
  if (IsInit) {
    // base != begin?
    Value *BaseIsBegin = Builder.CreateICmpNE(Base, Begin);
    // IsPtrAndObj?
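    // (OMP_MAP_PTR_AND_OBJ marks an entry that maps a pointer together with
    // the object it points to; Base then refers to the pointer itself while
    // Begin refers to the pointee, which is why base != begin above.)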
7896 Value *PtrAndObjBit = Builder.CreateAnd( 7897 MapType, 7898 Builder.getInt64( 7899 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>( 7900 OpenMPOffloadMappingFlags::OMP_MAP_PTR_AND_OBJ))); 7901 PtrAndObjBit = Builder.CreateIsNotNull(PtrAndObjBit); 7902 BaseIsBegin = Builder.CreateAnd(BaseIsBegin, PtrAndObjBit); 7903 Cond = Builder.CreateOr(IsArray, BaseIsBegin); 7904 DeleteCond = Builder.CreateIsNull( 7905 DeleteBit, 7906 createPlatformSpecificName({"omp.array", Prefix, ".delete"})); 7907 } else { 7908 Cond = IsArray; 7909 DeleteCond = Builder.CreateIsNotNull( 7910 DeleteBit, 7911 createPlatformSpecificName({"omp.array", Prefix, ".delete"})); 7912 } 7913 Cond = Builder.CreateAnd(Cond, DeleteCond); 7914 Builder.CreateCondBr(Cond, BodyBB, ExitBB); 7915 7916 emitBlock(BodyBB, MapperFn); 7917 // Get the array size by multiplying element size and element number (i.e., \p 7918 // Size). 7919 Value *ArraySize = Builder.CreateNUWMul(Size, Builder.getInt64(ElementSize)); 7920 // Remove OMP_MAP_TO and OMP_MAP_FROM from the map type, so that it achieves 7921 // memory allocation/deletion purpose only. 7922 Value *MapTypeArg = Builder.CreateAnd( 7923 MapType, 7924 Builder.getInt64( 7925 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>( 7926 OpenMPOffloadMappingFlags::OMP_MAP_TO | 7927 OpenMPOffloadMappingFlags::OMP_MAP_FROM))); 7928 MapTypeArg = Builder.CreateOr( 7929 MapTypeArg, 7930 Builder.getInt64( 7931 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>( 7932 OpenMPOffloadMappingFlags::OMP_MAP_IMPLICIT))); 7933 7934 // Call the runtime API __tgt_push_mapper_component to fill up the runtime 7935 // data structure. 7936 Value *OffloadingArgs[] = {MapperHandle, Base, Begin, 7937 ArraySize, MapTypeArg, MapName}; 7938 Builder.CreateCall( 7939 getOrCreateRuntimeFunction(M, OMPRTL___tgt_push_mapper_component), 7940 OffloadingArgs); 7941 } 7942 7943 Function *OpenMPIRBuilder::emitUserDefinedMapper( 7944 function_ref<MapInfosTy &(InsertPointTy CodeGenIP, llvm::Value *PtrPHI, 7945 llvm::Value *BeginArg)> 7946 GenMapInfoCB, 7947 Type *ElemTy, StringRef FuncName, 7948 function_ref<bool(unsigned int, Function **)> CustomMapperCB) { 7949 SmallVector<Type *> Params; 7950 Params.emplace_back(Builder.getPtrTy()); 7951 Params.emplace_back(Builder.getPtrTy()); 7952 Params.emplace_back(Builder.getPtrTy()); 7953 Params.emplace_back(Builder.getInt64Ty()); 7954 Params.emplace_back(Builder.getInt64Ty()); 7955 Params.emplace_back(Builder.getPtrTy()); 7956 7957 auto *FnTy = 7958 FunctionType::get(Builder.getVoidTy(), Params, /* IsVarArg */ false); 7959 7960 SmallString<64> TyStr; 7961 raw_svector_ostream Out(TyStr); 7962 Function *MapperFn = 7963 Function::Create(FnTy, GlobalValue::InternalLinkage, FuncName, M); 7964 MapperFn->addFnAttr(Attribute::NoInline); 7965 MapperFn->addFnAttr(Attribute::NoUnwind); 7966 MapperFn->addParamAttr(0, Attribute::NoUndef); 7967 MapperFn->addParamAttr(1, Attribute::NoUndef); 7968 MapperFn->addParamAttr(2, Attribute::NoUndef); 7969 MapperFn->addParamAttr(3, Attribute::NoUndef); 7970 MapperFn->addParamAttr(4, Attribute::NoUndef); 7971 MapperFn->addParamAttr(5, Attribute::NoUndef); 7972 7973 // Start the mapper function code generation. 
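  // The generated mapper has roughly the following shape (an illustrative
  // sketch only; names are descriptive, not the emitted identifiers):
  //
  // void .omp_mapper(void *handle, void *base, void *begin,
  //                  int64_t size, int64_t type, void *name) {
  //   // Allocate space for an array section first, unless deleting.
  //   if (size > 1 && !maptype.IsDelete)
  //     __tgt_push_mapper_component(handle, base, begin,
  //                                 size * sizeof(Ty), clearToFrom(type));
  //   // Map each element of the array section.
  //   for (i = 0; i < size; i++)
  //     // For each component specified by this mapper, push it with a map
  //     // type combined per the decay table, or call its child mapper.
  //     __tgt_push_mapper_component(handle, c.base, c.begin, c.size,
  //                                 decay(c.type), c.name);
  //   // Delete the array section at the end, if requested.
  //   if (size > 1 && maptype.IsDelete)
  //     __tgt_push_mapper_component(handle, base, begin,
  //                                 size * sizeof(Ty), clearToFrom(type));
  // }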
7974 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", MapperFn); 7975 auto SavedIP = Builder.saveIP(); 7976 Builder.SetInsertPoint(EntryBB); 7977 7978 Value *MapperHandle = MapperFn->getArg(0); 7979 Value *BaseIn = MapperFn->getArg(1); 7980 Value *BeginIn = MapperFn->getArg(2); 7981 Value *Size = MapperFn->getArg(3); 7982 Value *MapType = MapperFn->getArg(4); 7983 Value *MapName = MapperFn->getArg(5); 7984 7985 // Compute the starting and end addresses of array elements. 7986 // Prepare common arguments for array initiation and deletion. 7987 // Convert the size in bytes into the number of array elements. 7988 TypeSize ElementSize = M.getDataLayout().getTypeStoreSize(ElemTy); 7989 Size = Builder.CreateExactUDiv(Size, Builder.getInt64(ElementSize)); 7990 Value *PtrBegin = Builder.CreateBitCast(BeginIn, Builder.getPtrTy()); 7991 Value *PtrEnd = Builder.CreateGEP(ElemTy, PtrBegin, Size); 7992 7993 // Emit array initiation if this is an array section and \p MapType indicates 7994 // that memory allocation is required. 7995 BasicBlock *HeadBB = BasicBlock::Create(M.getContext(), "omp.arraymap.head"); 7996 emitUDMapperArrayInitOrDel(MapperFn, MapperHandle, BaseIn, BeginIn, Size, 7997 MapType, MapName, ElementSize, HeadBB, 7998 /*IsInit=*/true); 7999 8000 // Emit a for loop to iterate through SizeArg of elements and map all of them. 8001 8002 // Emit the loop header block. 8003 emitBlock(HeadBB, MapperFn); 8004 BasicBlock *BodyBB = BasicBlock::Create(M.getContext(), "omp.arraymap.body"); 8005 BasicBlock *DoneBB = BasicBlock::Create(M.getContext(), "omp.done"); 8006 // Evaluate whether the initial condition is satisfied. 8007 Value *IsEmpty = 8008 Builder.CreateICmpEQ(PtrBegin, PtrEnd, "omp.arraymap.isempty"); 8009 Builder.CreateCondBr(IsEmpty, DoneBB, BodyBB); 8010 8011 // Emit the loop body block. 8012 emitBlock(BodyBB, MapperFn); 8013 BasicBlock *LastBB = BodyBB; 8014 PHINode *PtrPHI = 8015 Builder.CreatePHI(PtrBegin->getType(), 2, "omp.arraymap.ptrcurrent"); 8016 PtrPHI->addIncoming(PtrBegin, HeadBB); 8017 8018 // Get map clause information. Fill up the arrays with all mapped variables. 8019 MapInfosTy &Info = GenMapInfoCB(Builder.saveIP(), PtrPHI, BeginIn); 8020 8021 // Call the runtime API __tgt_mapper_num_components to get the number of 8022 // pre-existing components. 8023 Value *OffloadingArgs[] = {MapperHandle}; 8024 Value *PreviousSize = Builder.CreateCall( 8025 getOrCreateRuntimeFunction(M, OMPRTL___tgt_mapper_num_components), 8026 OffloadingArgs); 8027 Value *ShiftedPreviousSize = 8028 Builder.CreateShl(PreviousSize, Builder.getInt64(getFlagMemberOffset())); 8029 8030 // Fill up the runtime mapper handle for all components. 8031 for (unsigned I = 0; I < Info.BasePointers.size(); ++I) { 8032 Value *CurBaseArg = 8033 Builder.CreateBitCast(Info.BasePointers[I], Builder.getPtrTy()); 8034 Value *CurBeginArg = 8035 Builder.CreateBitCast(Info.Pointers[I], Builder.getPtrTy()); 8036 Value *CurSizeArg = Info.Sizes[I]; 8037 Value *CurNameArg = Info.Names.size() 8038 ? Info.Names[I] 8039 : Constant::getNullValue(Builder.getPtrTy()); 8040 8041 // Extract the MEMBER_OF field from the map type. 8042 Value *OriMapType = Builder.getInt64( 8043 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>( 8044 Info.Types[I])); 8045 Value *MemberMapType = 8046 Builder.CreateNUWAdd(OriMapType, ShiftedPreviousSize); 8047 8048 // Combine the map type inherited from user-defined mapper with that 8049 // specified in the program. 
According to the OMP_MAP_TO and OMP_MAP_FROM 8050 // bits of the \a MapType, which is the input argument of the mapper 8051 // function, the following code will set the OMP_MAP_TO and OMP_MAP_FROM 8052 // bits of MemberMapType. 8053 // [OpenMP 5.0], 1.2.6. map-type decay. 8054 // | alloc | to | from | tofrom | release | delete 8055 // ---------------------------------------------------------- 8056 // alloc | alloc | alloc | alloc | alloc | release | delete 8057 // to | alloc | to | alloc | to | release | delete 8058 // from | alloc | alloc | from | from | release | delete 8059 // tofrom | alloc | to | from | tofrom | release | delete 8060 Value *LeftToFrom = Builder.CreateAnd( 8061 MapType, 8062 Builder.getInt64( 8063 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>( 8064 OpenMPOffloadMappingFlags::OMP_MAP_TO | 8065 OpenMPOffloadMappingFlags::OMP_MAP_FROM))); 8066 BasicBlock *AllocBB = BasicBlock::Create(M.getContext(), "omp.type.alloc"); 8067 BasicBlock *AllocElseBB = 8068 BasicBlock::Create(M.getContext(), "omp.type.alloc.else"); 8069 BasicBlock *ToBB = BasicBlock::Create(M.getContext(), "omp.type.to"); 8070 BasicBlock *ToElseBB = 8071 BasicBlock::Create(M.getContext(), "omp.type.to.else"); 8072 BasicBlock *FromBB = BasicBlock::Create(M.getContext(), "omp.type.from"); 8073 BasicBlock *EndBB = BasicBlock::Create(M.getContext(), "omp.type.end"); 8074 Value *IsAlloc = Builder.CreateIsNull(LeftToFrom); 8075 Builder.CreateCondBr(IsAlloc, AllocBB, AllocElseBB); 8076 // In case of alloc, clear OMP_MAP_TO and OMP_MAP_FROM. 8077 emitBlock(AllocBB, MapperFn); 8078 Value *AllocMapType = Builder.CreateAnd( 8079 MemberMapType, 8080 Builder.getInt64( 8081 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>( 8082 OpenMPOffloadMappingFlags::OMP_MAP_TO | 8083 OpenMPOffloadMappingFlags::OMP_MAP_FROM))); 8084 Builder.CreateBr(EndBB); 8085 emitBlock(AllocElseBB, MapperFn); 8086 Value *IsTo = Builder.CreateICmpEQ( 8087 LeftToFrom, 8088 Builder.getInt64( 8089 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>( 8090 OpenMPOffloadMappingFlags::OMP_MAP_TO))); 8091 Builder.CreateCondBr(IsTo, ToBB, ToElseBB); 8092 // In case of to, clear OMP_MAP_FROM. 8093 emitBlock(ToBB, MapperFn); 8094 Value *ToMapType = Builder.CreateAnd( 8095 MemberMapType, 8096 Builder.getInt64( 8097 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>( 8098 OpenMPOffloadMappingFlags::OMP_MAP_FROM))); 8099 Builder.CreateBr(EndBB); 8100 emitBlock(ToElseBB, MapperFn); 8101 Value *IsFrom = Builder.CreateICmpEQ( 8102 LeftToFrom, 8103 Builder.getInt64( 8104 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>( 8105 OpenMPOffloadMappingFlags::OMP_MAP_FROM))); 8106 Builder.CreateCondBr(IsFrom, FromBB, EndBB); 8107 // In case of from, clear OMP_MAP_TO. 8108 emitBlock(FromBB, MapperFn); 8109 Value *FromMapType = Builder.CreateAnd( 8110 MemberMapType, 8111 Builder.getInt64( 8112 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>( 8113 OpenMPOffloadMappingFlags::OMP_MAP_TO))); 8114 // In case of tofrom, do nothing. 
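    // Worked example from the decay table above: if the inherited MapType
    // carries only OMP_MAP_TO, the member's OMP_MAP_FROM bit is cleared, so
    // a member mapped 'tofrom' decays to 'to' and one mapped 'from' decays
    // to 'alloc'.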
8115 emitBlock(EndBB, MapperFn); 8116 LastBB = EndBB; 8117 PHINode *CurMapType = 8118 Builder.CreatePHI(Builder.getInt64Ty(), 4, "omp.maptype"); 8119 CurMapType->addIncoming(AllocMapType, AllocBB); 8120 CurMapType->addIncoming(ToMapType, ToBB); 8121 CurMapType->addIncoming(FromMapType, FromBB); 8122 CurMapType->addIncoming(MemberMapType, ToElseBB); 8123 8124 Value *OffloadingArgs[] = {MapperHandle, CurBaseArg, CurBeginArg, 8125 CurSizeArg, CurMapType, CurNameArg}; 8126 Function *ChildMapperFn = nullptr; 8127 if (CustomMapperCB && CustomMapperCB(I, &ChildMapperFn)) { 8128 // Call the corresponding mapper function. 8129 Builder.CreateCall(ChildMapperFn, OffloadingArgs)->setDoesNotThrow(); 8130 } else { 8131 // Call the runtime API __tgt_push_mapper_component to fill up the runtime 8132 // data structure. 8133 Builder.CreateCall( 8134 getOrCreateRuntimeFunction(M, OMPRTL___tgt_push_mapper_component), 8135 OffloadingArgs); 8136 } 8137 } 8138 8139 // Update the pointer to point to the next element that needs to be mapped, 8140 // and check whether we have mapped all elements. 8141 Value *PtrNext = Builder.CreateConstGEP1_32(ElemTy, PtrPHI, /*Idx0=*/1, 8142 "omp.arraymap.next"); 8143 PtrPHI->addIncoming(PtrNext, LastBB); 8144 Value *IsDone = Builder.CreateICmpEQ(PtrNext, PtrEnd, "omp.arraymap.isdone"); 8145 BasicBlock *ExitBB = BasicBlock::Create(M.getContext(), "omp.arraymap.exit"); 8146 Builder.CreateCondBr(IsDone, ExitBB, BodyBB); 8147 8148 emitBlock(ExitBB, MapperFn); 8149 // Emit array deletion if this is an array section and \p MapType indicates 8150 // that deletion is required. 8151 emitUDMapperArrayInitOrDel(MapperFn, MapperHandle, BaseIn, BeginIn, Size, 8152 MapType, MapName, ElementSize, DoneBB, 8153 /*IsInit=*/false); 8154 8155 // Emit the function exit block. 8156 emitBlock(DoneBB, MapperFn, /*IsFinished=*/true); 8157 8158 Builder.CreateRetVoid(); 8159 Builder.restoreIP(SavedIP); 8160 return MapperFn; 8161 } 8162 8163 void OpenMPIRBuilder::emitOffloadingArrays( 8164 InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo, 8165 TargetDataInfo &Info, bool IsNonContiguous, 8166 function_ref<void(unsigned int, Value *)> DeviceAddrCB, 8167 function_ref<Value *(unsigned int)> CustomMapperCB) { 8168 8169 // Reset the array information. 8170 Info.clearArrayInfo(); 8171 Info.NumberOfPtrs = CombinedInfo.BasePointers.size(); 8172 8173 if (Info.NumberOfPtrs == 0) 8174 return; 8175 8176 Builder.restoreIP(AllocaIP); 8177 // Detect if we have any capture size requiring runtime evaluation of the 8178 // size so that a constant array could be eventually used. 8179 ArrayType *PointerArrayType = 8180 ArrayType::get(Builder.getPtrTy(), Info.NumberOfPtrs); 8181 8182 Info.RTArgs.BasePointersArray = Builder.CreateAlloca( 8183 PointerArrayType, /* ArraySize = */ nullptr, ".offload_baseptrs"); 8184 8185 Info.RTArgs.PointersArray = Builder.CreateAlloca( 8186 PointerArrayType, /* ArraySize = */ nullptr, ".offload_ptrs"); 8187 AllocaInst *MappersArray = Builder.CreateAlloca( 8188 PointerArrayType, /* ArraySize = */ nullptr, ".offload_mappers"); 8189 Info.RTArgs.MappersArray = MappersArray; 8190 8191 // If we don't have any VLA types or other types that require runtime 8192 // evaluation, we can use a constant array for the map sizes, otherwise we 8193 // need to fill up the arrays as we do for the pointers. 
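  // For instance (an illustrative source-level view): map(to: a[0:8]) with a
  // constant extent produces a compile-time entry in the sizes array, whereas
  // map(to: v[0:n]) marks that entry in RuntimeSizes and requires storing the
  // computed size at run time below.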
8194 Type *Int64Ty = Builder.getInt64Ty(); 8195 SmallVector<Constant *> ConstSizes(CombinedInfo.Sizes.size(), 8196 ConstantInt::get(Int64Ty, 0)); 8197 SmallBitVector RuntimeSizes(CombinedInfo.Sizes.size()); 8198 for (unsigned I = 0, E = CombinedInfo.Sizes.size(); I < E; ++I) { 8199 if (auto *CI = dyn_cast<Constant>(CombinedInfo.Sizes[I])) { 8200 if (!isa<ConstantExpr>(CI) && !isa<GlobalValue>(CI)) { 8201 if (IsNonContiguous && 8202 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>( 8203 CombinedInfo.Types[I] & 8204 OpenMPOffloadMappingFlags::OMP_MAP_NON_CONTIG)) 8205 ConstSizes[I] = 8206 ConstantInt::get(Int64Ty, CombinedInfo.NonContigInfo.Dims[I]); 8207 else 8208 ConstSizes[I] = CI; 8209 continue; 8210 } 8211 } 8212 RuntimeSizes.set(I); 8213 } 8214 8215 if (RuntimeSizes.all()) { 8216 ArrayType *SizeArrayType = ArrayType::get(Int64Ty, Info.NumberOfPtrs); 8217 Info.RTArgs.SizesArray = Builder.CreateAlloca( 8218 SizeArrayType, /* ArraySize = */ nullptr, ".offload_sizes"); 8219 Builder.restoreIP(CodeGenIP); 8220 } else { 8221 auto *SizesArrayInit = ConstantArray::get( 8222 ArrayType::get(Int64Ty, ConstSizes.size()), ConstSizes); 8223 std::string Name = createPlatformSpecificName({"offload_sizes"}); 8224 auto *SizesArrayGbl = 8225 new GlobalVariable(M, SizesArrayInit->getType(), /*isConstant=*/true, 8226 GlobalValue::PrivateLinkage, SizesArrayInit, Name); 8227 SizesArrayGbl->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); 8228 8229 if (!RuntimeSizes.any()) { 8230 Info.RTArgs.SizesArray = SizesArrayGbl; 8231 } else { 8232 unsigned IndexSize = M.getDataLayout().getIndexSizeInBits(0); 8233 Align OffloadSizeAlign = M.getDataLayout().getABIIntegerTypeAlignment(64); 8234 ArrayType *SizeArrayType = ArrayType::get(Int64Ty, Info.NumberOfPtrs); 8235 AllocaInst *Buffer = Builder.CreateAlloca( 8236 SizeArrayType, /* ArraySize = */ nullptr, ".offload_sizes"); 8237 Buffer->setAlignment(OffloadSizeAlign); 8238 Builder.restoreIP(CodeGenIP); 8239 Builder.CreateMemCpy( 8240 Buffer, M.getDataLayout().getPrefTypeAlign(Buffer->getType()), 8241 SizesArrayGbl, OffloadSizeAlign, 8242 Builder.getIntN( 8243 IndexSize, 8244 Buffer->getAllocationSize(M.getDataLayout())->getFixedValue())); 8245 8246 Info.RTArgs.SizesArray = Buffer; 8247 } 8248 Builder.restoreIP(CodeGenIP); 8249 } 8250 8251 // The map types are always constant so we don't need to generate code to 8252 // fill arrays. Instead, we create an array constant. 8253 SmallVector<uint64_t, 4> Mapping; 8254 for (auto mapFlag : CombinedInfo.Types) 8255 Mapping.push_back( 8256 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>( 8257 mapFlag)); 8258 std::string MaptypesName = createPlatformSpecificName({"offload_maptypes"}); 8259 auto *MapTypesArrayGbl = createOffloadMaptypes(Mapping, MaptypesName); 8260 Info.RTArgs.MapTypesArray = MapTypesArrayGbl; 8261 8262 // The information types are only built if provided. 8263 if (!CombinedInfo.Names.empty()) { 8264 std::string MapnamesName = createPlatformSpecificName({"offload_mapnames"}); 8265 auto *MapNamesArrayGbl = 8266 createOffloadMapnames(CombinedInfo.Names, MapnamesName); 8267 Info.RTArgs.MapNamesArray = MapNamesArrayGbl; 8268 Info.EmitDebug = true; 8269 } else { 8270 Info.RTArgs.MapNamesArray = 8271 Constant::getNullValue(PointerType::getUnqual(Builder.getContext())); 8272 Info.EmitDebug = false; 8273 } 8274 8275 // If there's a present map type modifier, it must not be applied to the end 8276 // of a region, so generate a separate map type array in that case. 
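  // For example, map(present, tofrom: x) must verify x's presence when the
  // region is entered, but the matching end-of-region call must not repeat
  // the check, so OMP_MAP_PRESENT is cleared in the end array below.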
8277 if (Info.separateBeginEndCalls()) { 8278 bool EndMapTypesDiffer = false; 8279 for (uint64_t &Type : Mapping) { 8280 if (Type & static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>( 8281 OpenMPOffloadMappingFlags::OMP_MAP_PRESENT)) { 8282 Type &= ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>( 8283 OpenMPOffloadMappingFlags::OMP_MAP_PRESENT); 8284 EndMapTypesDiffer = true; 8285 } 8286 } 8287 if (EndMapTypesDiffer) { 8288 MapTypesArrayGbl = createOffloadMaptypes(Mapping, MaptypesName); 8289 Info.RTArgs.MapTypesArrayEnd = MapTypesArrayGbl; 8290 } 8291 } 8292 8293 PointerType *PtrTy = Builder.getPtrTy(); 8294 for (unsigned I = 0; I < Info.NumberOfPtrs; ++I) { 8295 Value *BPVal = CombinedInfo.BasePointers[I]; 8296 Value *BP = Builder.CreateConstInBoundsGEP2_32( 8297 ArrayType::get(PtrTy, Info.NumberOfPtrs), Info.RTArgs.BasePointersArray, 8298 0, I); 8299 Builder.CreateAlignedStore(BPVal, BP, 8300 M.getDataLayout().getPrefTypeAlign(PtrTy)); 8301 8302 if (Info.requiresDevicePointerInfo()) { 8303 if (CombinedInfo.DevicePointers[I] == DeviceInfoTy::Pointer) { 8304 CodeGenIP = Builder.saveIP(); 8305 Builder.restoreIP(AllocaIP); 8306 Info.DevicePtrInfoMap[BPVal] = {BP, Builder.CreateAlloca(PtrTy)}; 8307 Builder.restoreIP(CodeGenIP); 8308 if (DeviceAddrCB) 8309 DeviceAddrCB(I, Info.DevicePtrInfoMap[BPVal].second); 8310 } else if (CombinedInfo.DevicePointers[I] == DeviceInfoTy::Address) { 8311 Info.DevicePtrInfoMap[BPVal] = {BP, BP}; 8312 if (DeviceAddrCB) 8313 DeviceAddrCB(I, BP); 8314 } 8315 } 8316 8317 Value *PVal = CombinedInfo.Pointers[I]; 8318 Value *P = Builder.CreateConstInBoundsGEP2_32( 8319 ArrayType::get(PtrTy, Info.NumberOfPtrs), Info.RTArgs.PointersArray, 0, 8320 I); 8321 // TODO: Check alignment correct. 8322 Builder.CreateAlignedStore(PVal, P, 8323 M.getDataLayout().getPrefTypeAlign(PtrTy)); 8324 8325 if (RuntimeSizes.test(I)) { 8326 Value *S = Builder.CreateConstInBoundsGEP2_32( 8327 ArrayType::get(Int64Ty, Info.NumberOfPtrs), Info.RTArgs.SizesArray, 8328 /*Idx0=*/0, 8329 /*Idx1=*/I); 8330 Builder.CreateAlignedStore(Builder.CreateIntCast(CombinedInfo.Sizes[I], 8331 Int64Ty, 8332 /*isSigned=*/true), 8333 S, M.getDataLayout().getPrefTypeAlign(PtrTy)); 8334 } 8335 // Fill up the mapper array. 8336 unsigned IndexSize = M.getDataLayout().getIndexSizeInBits(0); 8337 Value *MFunc = ConstantPointerNull::get(PtrTy); 8338 if (CustomMapperCB) 8339 if (Value *CustomMFunc = CustomMapperCB(I)) 8340 MFunc = Builder.CreatePointerCast(CustomMFunc, PtrTy); 8341 Value *MAddr = Builder.CreateInBoundsGEP( 8342 MappersArray->getAllocatedType(), MappersArray, 8343 {Builder.getIntN(IndexSize, 0), Builder.getIntN(IndexSize, I)}); 8344 Builder.CreateAlignedStore( 8345 MFunc, MAddr, M.getDataLayout().getPrefTypeAlign(MAddr->getType())); 8346 } 8347 8348 if (!IsNonContiguous || CombinedInfo.NonContigInfo.Offsets.empty() || 8349 Info.NumberOfPtrs == 0) 8350 return; 8351 emitNonContiguousDescriptor(AllocaIP, CodeGenIP, CombinedInfo, Info); 8352 } 8353 8354 void OpenMPIRBuilder::emitBranch(BasicBlock *Target) { 8355 BasicBlock *CurBB = Builder.GetInsertBlock(); 8356 8357 if (!CurBB || CurBB->getTerminator()) { 8358 // If there is no insert point or the previous block is already 8359 // terminated, don't touch it. 8360 } else { 8361 // Otherwise, create a fall-through branch. 
8362 Builder.CreateBr(Target); 8363 } 8364 8365 Builder.ClearInsertionPoint(); 8366 } 8367 8368 void OpenMPIRBuilder::emitBlock(BasicBlock *BB, Function *CurFn, 8369 bool IsFinished) { 8370 BasicBlock *CurBB = Builder.GetInsertBlock(); 8371 8372 // Fall out of the current block (if necessary). 8373 emitBranch(BB); 8374 8375 if (IsFinished && BB->use_empty()) { 8376 BB->eraseFromParent(); 8377 return; 8378 } 8379 8380 // Place the block after the current block, if possible, or else at 8381 // the end of the function. 8382 if (CurBB && CurBB->getParent()) 8383 CurFn->insert(std::next(CurBB->getIterator()), BB); 8384 else 8385 CurFn->insert(CurFn->end(), BB); 8386 Builder.SetInsertPoint(BB); 8387 } 8388 8389 Error OpenMPIRBuilder::emitIfClause(Value *Cond, BodyGenCallbackTy ThenGen, 8390 BodyGenCallbackTy ElseGen, 8391 InsertPointTy AllocaIP) { 8392 // If the condition constant folds and can be elided, try to avoid emitting 8393 // the condition and the dead arm of the if/else. 8394 if (auto *CI = dyn_cast<ConstantInt>(Cond)) { 8395 auto CondConstant = CI->getSExtValue(); 8396 if (CondConstant) 8397 return ThenGen(AllocaIP, Builder.saveIP()); 8398 8399 return ElseGen(AllocaIP, Builder.saveIP()); 8400 } 8401 8402 Function *CurFn = Builder.GetInsertBlock()->getParent(); 8403 8404 // Otherwise, the condition did not fold, or we couldn't elide it. Just 8405 // emit the conditional branch. 8406 BasicBlock *ThenBlock = BasicBlock::Create(M.getContext(), "omp_if.then"); 8407 BasicBlock *ElseBlock = BasicBlock::Create(M.getContext(), "omp_if.else"); 8408 BasicBlock *ContBlock = BasicBlock::Create(M.getContext(), "omp_if.end"); 8409 Builder.CreateCondBr(Cond, ThenBlock, ElseBlock); 8410 // Emit the 'then' code. 8411 emitBlock(ThenBlock, CurFn); 8412 if (Error Err = ThenGen(AllocaIP, Builder.saveIP())) 8413 return Err; 8414 emitBranch(ContBlock); 8415 // Emit the 'else' code if present. 8416 // There is no need to emit line number for unconditional branch. 8417 emitBlock(ElseBlock, CurFn); 8418 if (Error Err = ElseGen(AllocaIP, Builder.saveIP())) 8419 return Err; 8420 // There is no need to emit line number for unconditional branch. 8421 emitBranch(ContBlock); 8422 // Emit the continuation block for code after the if. 
  emitBlock(ContBlock, CurFn, /*IsFinished=*/true);
  return Error::success();
}

bool OpenMPIRBuilder::checkAndEmitFlushAfterAtomic(
    const LocationDescription &Loc, llvm::AtomicOrdering AO, AtomicKind AK) {
  assert(!(AO == AtomicOrdering::NotAtomic ||
           AO == llvm::AtomicOrdering::Unordered) &&
         "Unexpected Atomic Ordering.");

  bool Flush = false;
  llvm::AtomicOrdering FlushAO = AtomicOrdering::Monotonic;

  switch (AK) {
  case Read:
    if (AO == AtomicOrdering::Acquire || AO == AtomicOrdering::AcquireRelease ||
        AO == AtomicOrdering::SequentiallyConsistent) {
      FlushAO = AtomicOrdering::Acquire;
      Flush = true;
    }
    break;
  case Write:
  case Compare:
  case Update:
    if (AO == AtomicOrdering::Release || AO == AtomicOrdering::AcquireRelease ||
        AO == AtomicOrdering::SequentiallyConsistent) {
      FlushAO = AtomicOrdering::Release;
      Flush = true;
    }
    break;
  case Capture:
    switch (AO) {
    case AtomicOrdering::Acquire:
      FlushAO = AtomicOrdering::Acquire;
      Flush = true;
      break;
    case AtomicOrdering::Release:
      FlushAO = AtomicOrdering::Release;
      Flush = true;
      break;
    case AtomicOrdering::AcquireRelease:
    case AtomicOrdering::SequentiallyConsistent:
      FlushAO = AtomicOrdering::AcquireRelease;
      Flush = true;
      break;
    default:
      // Do nothing - leave silently.
      break;
    }
  }

  if (Flush) {
    // The flush runtime call does not take a memory ordering yet. We still
    // resolve which atomic ordering the flush would need, so it can be passed
    // along once the runtime supports it, but for now we only issue the plain
    // flush call.
    // TODO: pass `FlushAO` after memory ordering support is added
    (void)FlushAO;
    emitFlush(Loc);
  }

  // For AO == AtomicOrdering::Monotonic and all other case combinations,
  // do nothing.
  return Flush;
}

OpenMPIRBuilder::InsertPointTy
OpenMPIRBuilder::createAtomicRead(const LocationDescription &Loc,
                                  AtomicOpValue &X, AtomicOpValue &V,
                                  AtomicOrdering AO) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  assert(X.Var->getType()->isPointerTy() &&
         "OMP Atomic expects a pointer to target memory");
  Type *XElemTy = X.ElemTy;
  assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
          XElemTy->isPointerTy() || XElemTy->isStructTy()) &&
         "OMP atomic read expected a scalar type");

  Value *XRead = nullptr;

  if (XElemTy->isIntegerTy()) {
    LoadInst *XLD =
        Builder.CreateLoad(XElemTy, X.Var, X.IsVolatile, "omp.atomic.read");
    XLD->setAtomic(AO);
    XRead = cast<Value>(XLD);
  } else if (XElemTy->isStructTy()) {
    // FIXME: Add checks to ensure __atomic_load is emitted iff the
    // target does not support `atomicrmw` of the size of the struct.
    LoadInst *OldVal = Builder.CreateLoad(XElemTy, X.Var, "omp.atomic.read");
    OldVal->setAtomic(AO);
    const DataLayout &LoadDL = OldVal->getModule()->getDataLayout();
    unsigned LoadSize =
        LoadDL.getTypeStoreSize(OldVal->getPointerOperand()->getType());
    OpenMPIRBuilder::AtomicInfo atomicInfo(
        &Builder, XElemTy, LoadSize * 8, LoadSize * 8, OldVal->getAlign(),
        OldVal->getAlign(), true /* UseLibcall */, X.Var);
    auto AtomicLoadRes = atomicInfo.EmitAtomicLoadLibcall(AO);
    XRead = AtomicLoadRes.first;
    OldVal->eraseFromParent();
  } else {
    // We need to perform the atomic operation as an integer.
    IntegerType
*IntCastTy = 8526 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits()); 8527 LoadInst *XLoad = 8528 Builder.CreateLoad(IntCastTy, X.Var, X.IsVolatile, "omp.atomic.load"); 8529 XLoad->setAtomic(AO); 8530 if (XElemTy->isFloatingPointTy()) { 8531 XRead = Builder.CreateBitCast(XLoad, XElemTy, "atomic.flt.cast"); 8532 } else { 8533 XRead = Builder.CreateIntToPtr(XLoad, XElemTy, "atomic.ptr.cast"); 8534 } 8535 } 8536 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Read); 8537 if (XRead->getType() != V.Var->getType()) 8538 XRead = emitImplicitCast(Builder, XRead, V.Var); 8539 Builder.CreateStore(XRead, V.Var, V.IsVolatile); 8540 return Builder.saveIP(); 8541 } 8542 8543 OpenMPIRBuilder::InsertPointTy 8544 OpenMPIRBuilder::createAtomicWrite(const LocationDescription &Loc, 8545 AtomicOpValue &X, Value *Expr, 8546 AtomicOrdering AO) { 8547 if (!updateToLocation(Loc)) 8548 return Loc.IP; 8549 8550 assert(X.Var->getType()->isPointerTy() && 8551 "OMP Atomic expects a pointer to target memory"); 8552 Type *XElemTy = X.ElemTy; 8553 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() || 8554 XElemTy->isPointerTy()) && 8555 "OMP atomic write expected a scalar type"); 8556 8557 if (XElemTy->isIntegerTy()) { 8558 StoreInst *XSt = Builder.CreateStore(Expr, X.Var, X.IsVolatile); 8559 XSt->setAtomic(AO); 8560 } else { 8561 // We need to bitcast and perform atomic op as integers 8562 IntegerType *IntCastTy = 8563 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits()); 8564 Value *ExprCast = 8565 Builder.CreateBitCast(Expr, IntCastTy, "atomic.src.int.cast"); 8566 StoreInst *XSt = Builder.CreateStore(ExprCast, X.Var, X.IsVolatile); 8567 XSt->setAtomic(AO); 8568 } 8569 8570 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Write); 8571 return Builder.saveIP(); 8572 } 8573 8574 OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createAtomicUpdate( 8575 const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X, 8576 Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp, 8577 AtomicUpdateCallbackTy &UpdateOp, bool IsXBinopExpr) { 8578 assert(!isConflictIP(Loc.IP, AllocaIP) && "IPs must not be ambiguous"); 8579 if (!updateToLocation(Loc)) 8580 return Loc.IP; 8581 8582 LLVM_DEBUG({ 8583 Type *XTy = X.Var->getType(); 8584 assert(XTy->isPointerTy() && 8585 "OMP Atomic expects a pointer to target memory"); 8586 Type *XElemTy = X.ElemTy; 8587 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() || 8588 XElemTy->isPointerTy()) && 8589 "OMP atomic update expected a scalar type"); 8590 assert((RMWOp != AtomicRMWInst::Max) && (RMWOp != AtomicRMWInst::Min) && 8591 (RMWOp != AtomicRMWInst::UMax) && (RMWOp != AtomicRMWInst::UMin) && 8592 "OpenMP atomic does not support LT or GT operations"); 8593 }); 8594 8595 Expected<std::pair<Value *, Value *>> AtomicResult = 8596 emitAtomicUpdate(AllocaIP, X.Var, X.ElemTy, Expr, AO, RMWOp, UpdateOp, 8597 X.IsVolatile, IsXBinopExpr); 8598 if (!AtomicResult) 8599 return AtomicResult.takeError(); 8600 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Update); 8601 return Builder.saveIP(); 8602 } 8603 8604 // FIXME: Duplicating AtomicExpand 8605 Value *OpenMPIRBuilder::emitRMWOpAsInstruction(Value *Src1, Value *Src2, 8606 AtomicRMWInst::BinOp RMWOp) { 8607 switch (RMWOp) { 8608 case AtomicRMWInst::Add: 8609 return Builder.CreateAdd(Src1, Src2); 8610 case AtomicRMWInst::Sub: 8611 return Builder.CreateSub(Src1, Src2); 8612 case AtomicRMWInst::And: 8613 return Builder.CreateAnd(Src1, Src2); 8614 case AtomicRMWInst::Nand: 8615 return 
Builder.CreateNeg(Builder.CreateAnd(Src1, Src2));
  case AtomicRMWInst::Or:
    return Builder.CreateOr(Src1, Src2);
  case AtomicRMWInst::Xor:
    return Builder.CreateXor(Src1, Src2);
  case AtomicRMWInst::Xchg:
  case AtomicRMWInst::FAdd:
  case AtomicRMWInst::FSub:
  case AtomicRMWInst::BAD_BINOP:
  case AtomicRMWInst::Max:
  case AtomicRMWInst::Min:
  case AtomicRMWInst::UMax:
  case AtomicRMWInst::UMin:
  case AtomicRMWInst::FMax:
  case AtomicRMWInst::FMin:
  case AtomicRMWInst::UIncWrap:
  case AtomicRMWInst::UDecWrap:
  case AtomicRMWInst::USubCond:
  case AtomicRMWInst::USubSat:
    llvm_unreachable("Unsupported atomic update operation");
  }
  llvm_unreachable("Unsupported atomic update operation");
}

Expected<std::pair<Value *, Value *>> OpenMPIRBuilder::emitAtomicUpdate(
    InsertPointTy AllocaIP, Value *X, Type *XElemTy, Value *Expr,
    AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp,
    AtomicUpdateCallbackTy &UpdateOp, bool VolatileX, bool IsXBinopExpr) {
  // TODO: handle the case where XElemTy is not byte-sized or not a power of 2
  // or a complex datatype.
  bool emitRMWOp = false;
  switch (RMWOp) {
  case AtomicRMWInst::Add:
  case AtomicRMWInst::And:
  case AtomicRMWInst::Nand:
  case AtomicRMWInst::Or:
  case AtomicRMWInst::Xor:
  case AtomicRMWInst::Xchg:
    emitRMWOp = XElemTy;
    break;
  case AtomicRMWInst::Sub:
    emitRMWOp = (IsXBinopExpr && XElemTy);
    break;
  default:
    emitRMWOp = false;
  }
  emitRMWOp &= XElemTy->isIntegerTy();

  std::pair<Value *, Value *> Res;
  if (emitRMWOp) {
    Res.first = Builder.CreateAtomicRMW(RMWOp, X, Expr, llvm::MaybeAlign(), AO);
    // Not needed except in case of postfix captures. Generated anyway for
    // consistency with the else part; any DCE pass will remove it.
    // AtomicRMWInst::Xchg does not have a corresponding instruction.
    if (RMWOp == AtomicRMWInst::Xchg)
      Res.second = Res.first;
    else
      Res.second = emitRMWOpAsInstruction(Res.first, Expr, RMWOp);
  } else if (RMWOp == llvm::AtomicRMWInst::BinOp::BAD_BINOP &&
             XElemTy->isStructTy()) {
    LoadInst *OldVal =
        Builder.CreateLoad(XElemTy, X, X->getName() + ".atomic.load");
    OldVal->setAtomic(AO);
    const DataLayout &LoadDL = OldVal->getModule()->getDataLayout();
    unsigned LoadSize =
        LoadDL.getTypeStoreSize(OldVal->getPointerOperand()->getType());

    OpenMPIRBuilder::AtomicInfo atomicInfo(
        &Builder, XElemTy, LoadSize * 8, LoadSize * 8, OldVal->getAlign(),
        OldVal->getAlign(), true /* UseLibcall */, X);
    auto AtomicLoadRes = atomicInfo.EmitAtomicLoadLibcall(AO);
    BasicBlock *CurBB = Builder.GetInsertBlock();
    Instruction *CurBBTI = CurBB->getTerminator();
    CurBBTI = CurBBTI ?
CurBBTI : Builder.CreateUnreachable(); 8689 BasicBlock *ExitBB = 8690 CurBB->splitBasicBlock(CurBBTI, X->getName() + ".atomic.exit"); 8691 BasicBlock *ContBB = CurBB->splitBasicBlock(CurBB->getTerminator(), 8692 X->getName() + ".atomic.cont"); 8693 ContBB->getTerminator()->eraseFromParent(); 8694 Builder.restoreIP(AllocaIP); 8695 AllocaInst *NewAtomicAddr = Builder.CreateAlloca(XElemTy); 8696 NewAtomicAddr->setName(X->getName() + "x.new.val"); 8697 Builder.SetInsertPoint(ContBB); 8698 llvm::PHINode *PHI = Builder.CreatePHI(OldVal->getType(), 2); 8699 PHI->addIncoming(AtomicLoadRes.first, CurBB); 8700 Value *OldExprVal = PHI; 8701 Expected<Value *> CBResult = UpdateOp(OldExprVal, Builder); 8702 if (!CBResult) 8703 return CBResult.takeError(); 8704 Value *Upd = *CBResult; 8705 Builder.CreateStore(Upd, NewAtomicAddr); 8706 AtomicOrdering Failure = 8707 llvm::AtomicCmpXchgInst::getStrongestFailureOrdering(AO); 8708 auto Result = atomicInfo.EmitAtomicCompareExchangeLibcall( 8709 AtomicLoadRes.second, NewAtomicAddr, AO, Failure); 8710 LoadInst *PHILoad = Builder.CreateLoad(XElemTy, Result.first); 8711 PHI->addIncoming(PHILoad, Builder.GetInsertBlock()); 8712 Builder.CreateCondBr(Result.second, ExitBB, ContBB); 8713 OldVal->eraseFromParent(); 8714 Res.first = OldExprVal; 8715 Res.second = Upd; 8716 8717 if (UnreachableInst *ExitTI = 8718 dyn_cast<UnreachableInst>(ExitBB->getTerminator())) { 8719 CurBBTI->eraseFromParent(); 8720 Builder.SetInsertPoint(ExitBB); 8721 } else { 8722 Builder.SetInsertPoint(ExitTI); 8723 } 8724 } else { 8725 IntegerType *IntCastTy = 8726 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits()); 8727 LoadInst *OldVal = 8728 Builder.CreateLoad(IntCastTy, X, X->getName() + ".atomic.load"); 8729 OldVal->setAtomic(AO); 8730 // CurBB 8731 // | /---\ 8732 // ContBB | 8733 // | \---/ 8734 // ExitBB 8735 BasicBlock *CurBB = Builder.GetInsertBlock(); 8736 Instruction *CurBBTI = CurBB->getTerminator(); 8737 CurBBTI = CurBBTI ? 
CurBBTI : Builder.CreateUnreachable(); 8738 BasicBlock *ExitBB = 8739 CurBB->splitBasicBlock(CurBBTI, X->getName() + ".atomic.exit"); 8740 BasicBlock *ContBB = CurBB->splitBasicBlock(CurBB->getTerminator(), 8741 X->getName() + ".atomic.cont"); 8742 ContBB->getTerminator()->eraseFromParent(); 8743 Builder.restoreIP(AllocaIP); 8744 AllocaInst *NewAtomicAddr = Builder.CreateAlloca(XElemTy); 8745 NewAtomicAddr->setName(X->getName() + "x.new.val"); 8746 Builder.SetInsertPoint(ContBB); 8747 llvm::PHINode *PHI = Builder.CreatePHI(OldVal->getType(), 2); 8748 PHI->addIncoming(OldVal, CurBB); 8749 bool IsIntTy = XElemTy->isIntegerTy(); 8750 Value *OldExprVal = PHI; 8751 if (!IsIntTy) { 8752 if (XElemTy->isFloatingPointTy()) { 8753 OldExprVal = Builder.CreateBitCast(PHI, XElemTy, 8754 X->getName() + ".atomic.fltCast"); 8755 } else { 8756 OldExprVal = Builder.CreateIntToPtr(PHI, XElemTy, 8757 X->getName() + ".atomic.ptrCast"); 8758 } 8759 } 8760 8761 Expected<Value *> CBResult = UpdateOp(OldExprVal, Builder); 8762 if (!CBResult) 8763 return CBResult.takeError(); 8764 Value *Upd = *CBResult; 8765 Builder.CreateStore(Upd, NewAtomicAddr); 8766 LoadInst *DesiredVal = Builder.CreateLoad(IntCastTy, NewAtomicAddr); 8767 AtomicOrdering Failure = 8768 llvm::AtomicCmpXchgInst::getStrongestFailureOrdering(AO); 8769 AtomicCmpXchgInst *Result = Builder.CreateAtomicCmpXchg( 8770 X, PHI, DesiredVal, llvm::MaybeAlign(), AO, Failure); 8771 Result->setVolatile(VolatileX); 8772 Value *PreviousVal = Builder.CreateExtractValue(Result, /*Idxs=*/0); 8773 Value *SuccessFailureVal = Builder.CreateExtractValue(Result, /*Idxs=*/1); 8774 PHI->addIncoming(PreviousVal, Builder.GetInsertBlock()); 8775 Builder.CreateCondBr(SuccessFailureVal, ExitBB, ContBB); 8776 8777 Res.first = OldExprVal; 8778 Res.second = Upd; 8779 8780 // set Insertion point in exit block 8781 if (UnreachableInst *ExitTI = 8782 dyn_cast<UnreachableInst>(ExitBB->getTerminator())) { 8783 CurBBTI->eraseFromParent(); 8784 Builder.SetInsertPoint(ExitBB); 8785 } else { 8786 Builder.SetInsertPoint(ExitTI); 8787 } 8788 } 8789 8790 return Res; 8791 } 8792 8793 OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createAtomicCapture( 8794 const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X, 8795 AtomicOpValue &V, Value *Expr, AtomicOrdering AO, 8796 AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp, 8797 bool UpdateExpr, bool IsPostfixUpdate, bool IsXBinopExpr) { 8798 if (!updateToLocation(Loc)) 8799 return Loc.IP; 8800 8801 LLVM_DEBUG({ 8802 Type *XTy = X.Var->getType(); 8803 assert(XTy->isPointerTy() && 8804 "OMP Atomic expects a pointer to target memory"); 8805 Type *XElemTy = X.ElemTy; 8806 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() || 8807 XElemTy->isPointerTy()) && 8808 "OMP atomic capture expected a scalar type"); 8809 assert((RMWOp != AtomicRMWInst::Max) && (RMWOp != AtomicRMWInst::Min) && 8810 "OpenMP atomic does not support LT or GT operations"); 8811 }); 8812 8813 // If UpdateExpr is 'x' updated with some `expr` not based on 'x', 8814 // 'x' is simply atomically rewritten with 'expr'. 8815 AtomicRMWInst::BinOp AtomicOp = (UpdateExpr ? RMWOp : AtomicRMWInst::Xchg); 8816 Expected<std::pair<Value *, Value *>> AtomicResult = 8817 emitAtomicUpdate(AllocaIP, X.Var, X.ElemTy, Expr, AO, AtomicOp, UpdateOp, 8818 X.IsVolatile, IsXBinopExpr); 8819 if (!AtomicResult) 8820 return AtomicResult.takeError(); 8821 Value *CapturedVal = 8822 (IsPostfixUpdate ? 
AtomicResult->first : AtomicResult->second); 8823 if (CapturedVal->getType() != V.Var->getType()) 8824 CapturedVal = emitImplicitCast(Builder, CapturedVal, V.Var); 8825 Builder.CreateStore(CapturedVal, V.Var, V.IsVolatile); 8826 8827 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Capture); 8828 return Builder.saveIP(); 8829 } 8830 8831 OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicCompare( 8832 const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V, 8833 AtomicOpValue &R, Value *E, Value *D, AtomicOrdering AO, 8834 omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate, 8835 bool IsFailOnly) { 8836 8837 AtomicOrdering Failure = AtomicCmpXchgInst::getStrongestFailureOrdering(AO); 8838 return createAtomicCompare(Loc, X, V, R, E, D, AO, Op, IsXBinopExpr, 8839 IsPostfixUpdate, IsFailOnly, Failure); 8840 } 8841 8842 OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicCompare( 8843 const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V, 8844 AtomicOpValue &R, Value *E, Value *D, AtomicOrdering AO, 8845 omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate, 8846 bool IsFailOnly, AtomicOrdering Failure) { 8847 8848 if (!updateToLocation(Loc)) 8849 return Loc.IP; 8850 8851 assert(X.Var->getType()->isPointerTy() && 8852 "OMP atomic expects a pointer to target memory"); 8853 // compare capture 8854 if (V.Var) { 8855 assert(V.Var->getType()->isPointerTy() && "v.var must be of pointer type"); 8856 assert(V.ElemTy == X.ElemTy && "x and v must be of same type"); 8857 } 8858 8859 bool IsInteger = E->getType()->isIntegerTy(); 8860 8861 if (Op == OMPAtomicCompareOp::EQ) { 8862 AtomicCmpXchgInst *Result = nullptr; 8863 if (!IsInteger) { 8864 IntegerType *IntCastTy = 8865 IntegerType::get(M.getContext(), X.ElemTy->getScalarSizeInBits()); 8866 Value *EBCast = Builder.CreateBitCast(E, IntCastTy); 8867 Value *DBCast = Builder.CreateBitCast(D, IntCastTy); 8868 Result = Builder.CreateAtomicCmpXchg(X.Var, EBCast, DBCast, MaybeAlign(), 8869 AO, Failure); 8870 } else { 8871 Result = 8872 Builder.CreateAtomicCmpXchg(X.Var, E, D, MaybeAlign(), AO, Failure); 8873 } 8874 8875 if (V.Var) { 8876 Value *OldValue = Builder.CreateExtractValue(Result, /*Idxs=*/0); 8877 if (!IsInteger) 8878 OldValue = Builder.CreateBitCast(OldValue, X.ElemTy); 8879 assert(OldValue->getType() == V.ElemTy && 8880 "OldValue and V must be of same type"); 8881 if (IsPostfixUpdate) { 8882 Builder.CreateStore(OldValue, V.Var, V.IsVolatile); 8883 } else { 8884 Value *SuccessOrFail = Builder.CreateExtractValue(Result, /*Idxs=*/1); 8885 if (IsFailOnly) { 8886 // CurBB---- 8887 // | | 8888 // v | 8889 // ContBB | 8890 // | | 8891 // v | 8892 // ExitBB <- 8893 // 8894 // where ContBB only contains the store of old value to 'v'. 8895 BasicBlock *CurBB = Builder.GetInsertBlock(); 8896 Instruction *CurBBTI = CurBB->getTerminator(); 8897 CurBBTI = CurBBTI ? 
CurBBTI : Builder.CreateUnreachable(); 8898 BasicBlock *ExitBB = CurBB->splitBasicBlock( 8899 CurBBTI, X.Var->getName() + ".atomic.exit"); 8900 BasicBlock *ContBB = CurBB->splitBasicBlock( 8901 CurBB->getTerminator(), X.Var->getName() + ".atomic.cont"); 8902 ContBB->getTerminator()->eraseFromParent(); 8903 CurBB->getTerminator()->eraseFromParent(); 8904 8905 Builder.CreateCondBr(SuccessOrFail, ExitBB, ContBB); 8906 8907 Builder.SetInsertPoint(ContBB); 8908 Builder.CreateStore(OldValue, V.Var); 8909 Builder.CreateBr(ExitBB); 8910 8911 if (UnreachableInst *ExitTI = 8912 dyn_cast<UnreachableInst>(ExitBB->getTerminator())) { 8913 CurBBTI->eraseFromParent(); 8914 Builder.SetInsertPoint(ExitBB); 8915 } else { 8916 Builder.SetInsertPoint(ExitTI); 8917 } 8918 } else { 8919 Value *CapturedValue = 8920 Builder.CreateSelect(SuccessOrFail, E, OldValue); 8921 Builder.CreateStore(CapturedValue, V.Var, V.IsVolatile); 8922 } 8923 } 8924 } 8925 // The comparison result has to be stored. 8926 if (R.Var) { 8927 assert(R.Var->getType()->isPointerTy() && 8928 "r.var must be of pointer type"); 8929 assert(R.ElemTy->isIntegerTy() && "r must be of integral type"); 8930 8931 Value *SuccessFailureVal = Builder.CreateExtractValue(Result, /*Idxs=*/1); 8932 Value *ResultCast = R.IsSigned 8933 ? Builder.CreateSExt(SuccessFailureVal, R.ElemTy) 8934 : Builder.CreateZExt(SuccessFailureVal, R.ElemTy); 8935 Builder.CreateStore(ResultCast, R.Var, R.IsVolatile); 8936 } 8937 } else { 8938 assert((Op == OMPAtomicCompareOp::MAX || Op == OMPAtomicCompareOp::MIN) && 8939 "Op should be either max or min at this point"); 8940 assert(!IsFailOnly && "IsFailOnly is only valid when the comparison is =="); 8941 8942 // Reverse the ordop as the OpenMP forms are different from LLVM forms. 8943 // Let's take max as example. 8944 // OpenMP form: 8945 // x = x > expr ? expr : x; 8946 // LLVM form: 8947 // *ptr = *ptr > val ? *ptr : val; 8948 // We need to transform to LLVM form. 8949 // x = x <= expr ? x : expr; 8950 AtomicRMWInst::BinOp NewOp; 8951 if (IsXBinopExpr) { 8952 if (IsInteger) { 8953 if (X.IsSigned) 8954 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::Min 8955 : AtomicRMWInst::Max; 8956 else 8957 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::UMin 8958 : AtomicRMWInst::UMax; 8959 } else { 8960 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::FMin 8961 : AtomicRMWInst::FMax; 8962 } 8963 } else { 8964 if (IsInteger) { 8965 if (X.IsSigned) 8966 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::Max 8967 : AtomicRMWInst::Min; 8968 else 8969 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::UMax 8970 : AtomicRMWInst::UMin; 8971 } else { 8972 NewOp = Op == OMPAtomicCompareOp::MAX ? 
AtomicRMWInst::FMax 8973 : AtomicRMWInst::FMin; 8974 } 8975 } 8976 8977 AtomicRMWInst *OldValue = 8978 Builder.CreateAtomicRMW(NewOp, X.Var, E, MaybeAlign(), AO); 8979 if (V.Var) { 8980 Value *CapturedValue = nullptr; 8981 if (IsPostfixUpdate) { 8982 CapturedValue = OldValue; 8983 } else { 8984 CmpInst::Predicate Pred; 8985 switch (NewOp) { 8986 case AtomicRMWInst::Max: 8987 Pred = CmpInst::ICMP_SGT; 8988 break; 8989 case AtomicRMWInst::UMax: 8990 Pred = CmpInst::ICMP_UGT; 8991 break; 8992 case AtomicRMWInst::FMax: 8993 Pred = CmpInst::FCMP_OGT; 8994 break; 8995 case AtomicRMWInst::Min: 8996 Pred = CmpInst::ICMP_SLT; 8997 break; 8998 case AtomicRMWInst::UMin: 8999 Pred = CmpInst::ICMP_ULT; 9000 break; 9001 case AtomicRMWInst::FMin: 9002 Pred = CmpInst::FCMP_OLT; 9003 break; 9004 default: 9005 llvm_unreachable("unexpected comparison op"); 9006 } 9007 Value *NonAtomicCmp = Builder.CreateCmp(Pred, OldValue, E); 9008 CapturedValue = Builder.CreateSelect(NonAtomicCmp, E, OldValue); 9009 } 9010 Builder.CreateStore(CapturedValue, V.Var, V.IsVolatile); 9011 } 9012 } 9013 9014 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Compare); 9015 9016 return Builder.saveIP(); 9017 } 9018 9019 OpenMPIRBuilder::InsertPointOrErrorTy 9020 OpenMPIRBuilder::createTeams(const LocationDescription &Loc, 9021 BodyGenCallbackTy BodyGenCB, Value *NumTeamsLower, 9022 Value *NumTeamsUpper, Value *ThreadLimit, 9023 Value *IfExpr) { 9024 if (!updateToLocation(Loc)) 9025 return InsertPointTy(); 9026 9027 uint32_t SrcLocStrSize; 9028 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize); 9029 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize); 9030 Function *CurrentFunction = Builder.GetInsertBlock()->getParent(); 9031 9032 // Outer allocation basicblock is the entry block of the current function. 9033 BasicBlock &OuterAllocaBB = CurrentFunction->getEntryBlock(); 9034 if (&OuterAllocaBB == Builder.GetInsertBlock()) { 9035 BasicBlock *BodyBB = splitBB(Builder, /*CreateBranch=*/true, "teams.entry"); 9036 Builder.SetInsertPoint(BodyBB, BodyBB->begin()); 9037 } 9038 9039 // The current basic block is split into four basic blocks. After outlining, 9040 // they will be mapped as follows: 9041 // ``` 9042 // def current_fn() { 9043 // current_basic_block: 9044 // br label %teams.exit 9045 // teams.exit: 9046 // ; instructions after teams 9047 // } 9048 // 9049 // def outlined_fn() { 9050 // teams.alloca: 9051 // br label %teams.body 9052 // teams.body: 9053 // ; instructions within teams body 9054 // } 9055 // ``` 9056 BasicBlock *ExitBB = splitBB(Builder, /*CreateBranch=*/true, "teams.exit"); 9057 BasicBlock *BodyBB = splitBB(Builder, /*CreateBranch=*/true, "teams.body"); 9058 BasicBlock *AllocaBB = 9059 splitBB(Builder, /*CreateBranch=*/true, "teams.alloca"); 9060 9061 bool SubClausesPresent = 9062 (NumTeamsLower || NumTeamsUpper || ThreadLimit || IfExpr); 9063 // Push num_teams 9064 if (!Config.isTargetDevice() && SubClausesPresent) { 9065 assert((NumTeamsLower == nullptr || NumTeamsUpper != nullptr) && 9066 "if lowerbound is non-null, then upperbound must also be non-null " 9067 "for bounds on num_teams"); 9068 9069 if (NumTeamsUpper == nullptr) 9070 NumTeamsUpper = Builder.getInt32(0); 9071 9072 if (NumTeamsLower == nullptr) 9073 NumTeamsLower = NumTeamsUpper; 9074 9075 if (IfExpr) { 9076 assert(IfExpr->getType()->isIntegerTy() && 9077 "argument to if clause must be an integer value"); 9078 9079 // upper = ifexpr ? 
upper : 1 9080 if (IfExpr->getType() != Int1) 9081 IfExpr = Builder.CreateICmpNE(IfExpr, 9082 ConstantInt::get(IfExpr->getType(), 0)); 9083 NumTeamsUpper = Builder.CreateSelect( 9084 IfExpr, NumTeamsUpper, Builder.getInt32(1), "numTeamsUpper"); 9085 9086 // lower = ifexpr ? lower : 1 9087 NumTeamsLower = Builder.CreateSelect( 9088 IfExpr, NumTeamsLower, Builder.getInt32(1), "numTeamsLower"); 9089 } 9090 9091 if (ThreadLimit == nullptr) 9092 ThreadLimit = Builder.getInt32(0); 9093 9094 Value *ThreadNum = getOrCreateThreadID(Ident); 9095 Builder.CreateCall( 9096 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_num_teams_51), 9097 {Ident, ThreadNum, NumTeamsLower, NumTeamsUpper, ThreadLimit}); 9098 } 9099 // Generate the body of teams. 9100 InsertPointTy AllocaIP(AllocaBB, AllocaBB->begin()); 9101 InsertPointTy CodeGenIP(BodyBB, BodyBB->begin()); 9102 if (Error Err = BodyGenCB(AllocaIP, CodeGenIP)) 9103 return Err; 9104 9105 OutlineInfo OI; 9106 OI.EntryBB = AllocaBB; 9107 OI.ExitBB = ExitBB; 9108 OI.OuterAllocaBB = &OuterAllocaBB; 9109 9110 // Insert fake values for global tid and bound tid. 9111 SmallVector<Instruction *, 8> ToBeDeleted; 9112 InsertPointTy OuterAllocaIP(&OuterAllocaBB, OuterAllocaBB.begin()); 9113 OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal( 9114 Builder, OuterAllocaIP, ToBeDeleted, AllocaIP, "gid", true)); 9115 OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal( 9116 Builder, OuterAllocaIP, ToBeDeleted, AllocaIP, "tid", true)); 9117 9118 auto HostPostOutlineCB = [this, Ident, 9119 ToBeDeleted](Function &OutlinedFn) mutable { 9120 // The stale call instruction will be replaced with a new call instruction 9121 // for runtime call with the outlined function. 9122 9123 assert(OutlinedFn.getNumUses() == 1 && 9124 "there must be a single user for the outlined function"); 9125 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back()); 9126 ToBeDeleted.push_back(StaleCI); 9127 9128 assert((OutlinedFn.arg_size() == 2 || OutlinedFn.arg_size() == 3) && 9129 "Outlined function must have two or three arguments only"); 9130 9131 bool HasShared = OutlinedFn.arg_size() == 3; 9132 9133 OutlinedFn.getArg(0)->setName("global.tid.ptr"); 9134 OutlinedFn.getArg(1)->setName("bound.tid.ptr"); 9135 if (HasShared) 9136 OutlinedFn.getArg(2)->setName("data"); 9137 9138 // Call to the runtime function for teams in the current function. 
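    // The replacement emitted below has roughly this shape (a sketch; the
    // argument count and the trailing data pointer depend on what the teams
    // body captures):
    //   call void @__kmpc_fork_teams(ptr @ident, i32 <num captured args>,
    //                                ptr @outlined_fn[, ptr %data])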
9139 assert(StaleCI && "Error while outlining - no CallInst user found for the " 9140 "outlined function."); 9141 Builder.SetInsertPoint(StaleCI); 9142 SmallVector<Value *> Args = { 9143 Ident, Builder.getInt32(StaleCI->arg_size() - 2), &OutlinedFn}; 9144 if (HasShared) 9145 Args.push_back(StaleCI->getArgOperand(2)); 9146 Builder.CreateCall(getOrCreateRuntimeFunctionPtr( 9147 omp::RuntimeFunction::OMPRTL___kmpc_fork_teams), 9148 Args); 9149 9150 for (Instruction *I : llvm::reverse(ToBeDeleted)) 9151 I->eraseFromParent(); 9152 }; 9153 9154 if (!Config.isTargetDevice()) 9155 OI.PostOutlineCB = HostPostOutlineCB; 9156 9157 addOutlineInfo(std::move(OI)); 9158 9159 Builder.SetInsertPoint(ExitBB, ExitBB->begin()); 9160 9161 return Builder.saveIP(); 9162 } 9163 9164 GlobalVariable * 9165 OpenMPIRBuilder::createOffloadMapnames(SmallVectorImpl<llvm::Constant *> &Names, 9166 std::string VarName) { 9167 llvm::Constant *MapNamesArrayInit = llvm::ConstantArray::get( 9168 llvm::ArrayType::get(llvm::PointerType::getUnqual(M.getContext()), 9169 Names.size()), 9170 Names); 9171 auto *MapNamesArrayGlobal = new llvm::GlobalVariable( 9172 M, MapNamesArrayInit->getType(), 9173 /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, MapNamesArrayInit, 9174 VarName); 9175 return MapNamesArrayGlobal; 9176 } 9177 9178 // Create all simple and struct types exposed by the runtime and remember 9179 // the llvm::PointerTypes of them for easy access later. 9180 void OpenMPIRBuilder::initializeTypes(Module &M) { 9181 LLVMContext &Ctx = M.getContext(); 9182 StructType *T; 9183 #define OMP_TYPE(VarName, InitValue) VarName = InitValue; 9184 #define OMP_ARRAY_TYPE(VarName, ElemTy, ArraySize) \ 9185 VarName##Ty = ArrayType::get(ElemTy, ArraySize); \ 9186 VarName##PtrTy = PointerType::getUnqual(Ctx); 9187 #define OMP_FUNCTION_TYPE(VarName, IsVarArg, ReturnType, ...) \ 9188 VarName = FunctionType::get(ReturnType, {__VA_ARGS__}, IsVarArg); \ 9189 VarName##Ptr = PointerType::getUnqual(Ctx); 9190 #define OMP_STRUCT_TYPE(VarName, StructName, Packed, ...) \ 9191 T = StructType::getTypeByName(Ctx, StructName); \ 9192 if (!T) \ 9193 T = StructType::create(Ctx, {__VA_ARGS__}, StructName, Packed); \ 9194 VarName = T; \ 9195 VarName##Ptr = PointerType::getUnqual(Ctx); 9196 #include "llvm/Frontend/OpenMP/OMPKinds.def" 9197 } 9198 9199 void OpenMPIRBuilder::OutlineInfo::collectBlocks( 9200 SmallPtrSetImpl<BasicBlock *> &BlockSet, 9201 SmallVectorImpl<BasicBlock *> &BlockVector) { 9202 SmallVector<BasicBlock *, 32> Worklist; 9203 BlockSet.insert(EntryBB); 9204 BlockSet.insert(ExitBB); 9205 9206 Worklist.push_back(EntryBB); 9207 while (!Worklist.empty()) { 9208 BasicBlock *BB = Worklist.pop_back_val(); 9209 BlockVector.push_back(BB); 9210 for (BasicBlock *SuccBB : successors(BB)) 9211 if (BlockSet.insert(SuccBB).second) 9212 Worklist.push_back(SuccBB); 9213 } 9214 } 9215 9216 void OpenMPIRBuilder::createOffloadEntry(Constant *ID, Constant *Addr, 9217 uint64_t Size, int32_t Flags, 9218 GlobalValue::LinkageTypes, 9219 StringRef Name) { 9220 if (!Config.isGPU()) { 9221 llvm::offloading::emitOffloadingEntry( 9222 M, object::OffloadKind::OFK_OpenMP, ID, 9223 Name.empty() ? Addr->getName() : Name, Size, Flags, /*Data=*/0, 9224 "omp_offloading_entries"); 9225 return; 9226 } 9227 // TODO: Add support for global variables on the device after declare target 9228 // support. 9229 Function *Fn = dyn_cast<Function>(Addr); 9230 if (!Fn) 9231 return; 9232 9233 // Add a function attribute for the kernel. 
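  // For example, on an AMDGCN target the kernel entry ends up carrying
  // (illustrative rendering of the attribute group):
  //   attributes #0 = { mustprogress "kernel" "uniform-work-group-size"="true" }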
  Fn->addFnAttr("kernel");
  if (T.isAMDGCN())
    Fn->addFnAttr("uniform-work-group-size", "true");
  Fn->addFnAttr(Attribute::MustProgress);
}

// We only generate metadata for functions that contain target regions.
void OpenMPIRBuilder::createOffloadEntriesAndInfoMetadata(
    EmitMetadataErrorReportFunctionTy &ErrorFn) {

  // If there are no entries, we don't need to do anything.
  if (OffloadInfoManager.empty())
    return;

  LLVMContext &C = M.getContext();
  SmallVector<std::pair<const OffloadEntriesInfoManager::OffloadEntryInfo *,
                        TargetRegionEntryInfo>,
              16>
      OrderedEntries(OffloadInfoManager.size());

  // Auxiliary methods to create metadata values and strings.
  auto &&GetMDInt = [this](unsigned V) {
    return ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), V));
  };

  auto &&GetMDString = [&C](StringRef V) { return MDString::get(C, V); };

  // Create the offloading info metadata node.
  NamedMDNode *MD = M.getOrInsertNamedMetadata("omp_offload.info");
  auto &&TargetRegionMetadataEmitter =
      [&C, MD, &OrderedEntries, &GetMDInt, &GetMDString](
          const TargetRegionEntryInfo &EntryInfo,
          const OffloadEntriesInfoManager::OffloadEntryInfoTargetRegion &E) {
        // Generate metadata for target regions. Each entry of this metadata
        // contains:
        // - Entry 0 -> Kind of this type of metadata (0).
        // - Entry 1 -> Device ID of the file where the entry was identified.
        // - Entry 2 -> File ID of the file where the entry was identified.
        // - Entry 3 -> Mangled name of the function where the entry was
        //   identified.
        // - Entry 4 -> Line in the file where the entry was identified.
        // - Entry 5 -> Count of regions at this DeviceID/FileID/Line.
        // - Entry 6 -> Order in which the entry was created.
        // The first element of the metadata node is the kind.
        Metadata *Ops[] = {
            GetMDInt(E.getKind()),      GetMDInt(EntryInfo.DeviceID),
            GetMDInt(EntryInfo.FileID), GetMDString(EntryInfo.ParentName),
            GetMDInt(EntryInfo.Line),   GetMDInt(EntryInfo.Count),
            GetMDInt(E.getOrder())};

        // Save this entry in the right position of the ordered entries array.
        OrderedEntries[E.getOrder()] = std::make_pair(&E, EntryInfo);

        // Add metadata to the named metadata node.
        MD->addOperand(MDNode::get(C, Ops));
      };

  OffloadInfoManager.actOnTargetRegionEntriesInfo(TargetRegionMetadataEmitter);

  // Create a function that emits metadata for each device global variable
  // entry.
  auto &&DeviceGlobalVarMetadataEmitter =
      [&C, &OrderedEntries, &GetMDInt, &GetMDString, MD](
          StringRef MangledName,
          const OffloadEntriesInfoManager::OffloadEntryInfoDeviceGlobalVar &E) {
        // Generate metadata for global variables. Each entry of this metadata
        // contains:
        // - Entry 0 -> Kind of this type of metadata (1).
        // - Entry 1 -> Mangled name of the variable.
        // - Entry 2 -> Declare target kind.
        // - Entry 3 -> Order in which the entry was created.
        // The first element of the metadata node is the kind.
        Metadata *Ops[] = {GetMDInt(E.getKind()), GetMDString(MangledName),
                           GetMDInt(E.getFlags()), GetMDInt(E.getOrder())};

        // Save this entry in the right position of the ordered entries array.
        TargetRegionEntryInfo varInfo(MangledName, 0, 0, 0);
        OrderedEntries[E.getOrder()] = std::make_pair(&E, varInfo);

        // Add metadata to the named metadata node.
        MD->addOperand(MDNode::get(C, Ops));
      };

  OffloadInfoManager.actOnDeviceGlobalVarEntriesInfo(
      DeviceGlobalVarMetadataEmitter);

  for (const auto &E : OrderedEntries) {
    assert(E.first && "All ordered entries must exist!");
    if (const auto *CE =
            dyn_cast<OffloadEntriesInfoManager::OffloadEntryInfoTargetRegion>(
                E.first)) {
      if (!CE->getID() || !CE->getAddress()) {
        // Do not blame the entry if the parent function is not emitted.
        TargetRegionEntryInfo EntryInfo = E.second;
        StringRef FnName = EntryInfo.ParentName;
        if (!M.getNamedValue(FnName))
          continue;
        ErrorFn(EMIT_MD_TARGET_REGION_ERROR, EntryInfo);
        continue;
      }
      createOffloadEntry(CE->getID(), CE->getAddress(),
                         /*Size=*/0, CE->getFlags(),
                         GlobalValue::WeakAnyLinkage);
    } else if (const auto *CE = dyn_cast<
                   OffloadEntriesInfoManager::OffloadEntryInfoDeviceGlobalVar>(
                   E.first)) {
      OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind Flags =
          static_cast<OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind>(
              CE->getFlags());
      switch (Flags) {
      case OffloadEntriesInfoManager::OMPTargetGlobalVarEntryEnter:
      case OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo:
        if (Config.isTargetDevice() && Config.hasRequiresUnifiedSharedMemory())
          continue;
        if (!CE->getAddress()) {
          ErrorFn(EMIT_MD_DECLARE_TARGET_ERROR, E.second);
          continue;
        }
        // The variable has no definition - no need to add the entry.
        if (CE->getVarSize() == 0)
          continue;
        break;
      case OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink:
        assert(((Config.isTargetDevice() && !CE->getAddress()) ||
                (!Config.isTargetDevice() && CE->getAddress())) &&
               "Declare target link address is set.");
        if (Config.isTargetDevice())
          continue;
        if (!CE->getAddress()) {
          ErrorFn(EMIT_MD_GLOBAL_VAR_LINK_ERROR, TargetRegionEntryInfo());
          continue;
        }
        break;
      default:
        break;
      }

      // Hidden or internal symbols on the device are not externally visible.
      // We should not attempt to register them by creating an offloading
      // entry. Indirect variables are handled separately on the device.
      if (auto *GV = dyn_cast<GlobalValue>(CE->getAddress()))
        if ((GV->hasLocalLinkage() || GV->hasHiddenVisibility()) &&
            Flags != OffloadEntriesInfoManager::OMPTargetGlobalVarEntryIndirect)
          continue;

      // Indirect globals need to use a special name that doesn't match the
      // name of the associated host global.
      if (Flags == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryIndirect)
        createOffloadEntry(CE->getAddress(), CE->getAddress(), CE->getVarSize(),
                           Flags, CE->getLinkage(), CE->getVarName());
      else
        createOffloadEntry(CE->getAddress(), CE->getAddress(), CE->getVarSize(),
                           Flags, CE->getLinkage());

    } else {
      llvm_unreachable("Unsupported entry kind.");
    }
  }

  // Emit requires directive globals to a special entry so the runtime can
  // register them when the device image is loaded.
  // TODO: This reduces the offloading entries to a 32-bit integer. Offloading
  // entries should be redesigned to better suit this use-case.
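  // A sketch of the special entry emitted below (fields follow the
  // emitOffloadingEntry arguments; the rendering is illustrative):
  //   { addr = null, name = "", size = 0,
  //     flags = OMPTargetGlobalRegisterRequires, data = <requires flags> }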
9396 if (Config.hasRequiresFlags() && !Config.isTargetDevice()) 9397 offloading::emitOffloadingEntry( 9398 M, object::OffloadKind::OFK_OpenMP, 9399 Constant::getNullValue(PointerType::getUnqual(M.getContext())), 9400 /*Name=*/"", 9401 /*Size=*/0, OffloadEntriesInfoManager::OMPTargetGlobalRegisterRequires, 9402 Config.getRequiresFlags(), "omp_offloading_entries"); 9403 } 9404 9405 void TargetRegionEntryInfo::getTargetRegionEntryFnName( 9406 SmallVectorImpl<char> &Name, StringRef ParentName, unsigned DeviceID, 9407 unsigned FileID, unsigned Line, unsigned Count) { 9408 raw_svector_ostream OS(Name); 9409 OS << KernelNamePrefix << llvm::format("%x", DeviceID) 9410 << llvm::format("_%x_", FileID) << ParentName << "_l" << Line; 9411 if (Count) 9412 OS << "_" << Count; 9413 } 9414 9415 void OffloadEntriesInfoManager::getTargetRegionEntryFnName( 9416 SmallVectorImpl<char> &Name, const TargetRegionEntryInfo &EntryInfo) { 9417 unsigned NewCount = getTargetRegionEntryInfoCount(EntryInfo); 9418 TargetRegionEntryInfo::getTargetRegionEntryFnName( 9419 Name, EntryInfo.ParentName, EntryInfo.DeviceID, EntryInfo.FileID, 9420 EntryInfo.Line, NewCount); 9421 } 9422 9423 TargetRegionEntryInfo 9424 OpenMPIRBuilder::getTargetEntryUniqueInfo(FileIdentifierInfoCallbackTy CallBack, 9425 StringRef ParentName) { 9426 sys::fs::UniqueID ID; 9427 auto FileIDInfo = CallBack(); 9428 if (auto EC = sys::fs::getUniqueID(std::get<0>(FileIDInfo), ID)) { 9429 report_fatal_error(("Unable to get unique ID for file, during " 9430 "getTargetEntryUniqueInfo, error message: " + 9431 EC.message()) 9432 .c_str()); 9433 } 9434 9435 return TargetRegionEntryInfo(ParentName, ID.getDevice(), ID.getFile(), 9436 std::get<1>(FileIDInfo)); 9437 } 9438 9439 unsigned OpenMPIRBuilder::getFlagMemberOffset() { 9440 unsigned Offset = 0; 9441 for (uint64_t Remain = 9442 static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>( 9443 omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF); 9444 !(Remain & 1); Remain = Remain >> 1) 9445 Offset++; 9446 return Offset; 9447 } 9448 9449 omp::OpenMPOffloadMappingFlags 9450 OpenMPIRBuilder::getMemberOfFlag(unsigned Position) { 9451 // Rotate by getFlagMemberOffset() bits. 9452 return static_cast<omp::OpenMPOffloadMappingFlags>(((uint64_t)Position + 1) 9453 << getFlagMemberOffset()); 9454 } 9455 9456 void OpenMPIRBuilder::setCorrectMemberOfFlag( 9457 omp::OpenMPOffloadMappingFlags &Flags, 9458 omp::OpenMPOffloadMappingFlags MemberOfFlag) { 9459 // If the entry is PTR_AND_OBJ but has not been marked with the special 9460 // placeholder value 0xFFFF in the MEMBER_OF field, then it should not be 9461 // marked as MEMBER_OF. 9462 if (static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>( 9463 Flags & omp::OpenMPOffloadMappingFlags::OMP_MAP_PTR_AND_OBJ) && 9464 static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>( 9465 (Flags & omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF) != 9466 omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF)) 9467 return; 9468 9469 // Reset the placeholder value to prepare the flag for the assignment of the 9470 // proper MEMBER_OF value. 
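  // For example, assuming OMP_MAP_MEMBER_OF occupies the topmost 16 bits of
  // the flag word so that getFlagMemberOffset() is 48, a member at position 2
  // receives getMemberOfFlag(2) == (2 + 1) << 48 once the placeholder below
  // is cleared.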
9471 Flags &= ~omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF; 9472 Flags |= MemberOfFlag; 9473 } 9474 9475 Constant *OpenMPIRBuilder::getAddrOfDeclareTargetVar( 9476 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause, 9477 OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause, 9478 bool IsDeclaration, bool IsExternallyVisible, 9479 TargetRegionEntryInfo EntryInfo, StringRef MangledName, 9480 std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD, 9481 std::vector<Triple> TargetTriple, Type *LlvmPtrTy, 9482 std::function<Constant *()> GlobalInitializer, 9483 std::function<GlobalValue::LinkageTypes()> VariableLinkage) { 9484 // TODO: convert this to utilise the IRBuilder Config rather than 9485 // a passed down argument. 9486 if (OpenMPSIMD) 9487 return nullptr; 9488 9489 if (CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink || 9490 ((CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo || 9491 CaptureClause == 9492 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryEnter) && 9493 Config.hasRequiresUnifiedSharedMemory())) { 9494 SmallString<64> PtrName; 9495 { 9496 raw_svector_ostream OS(PtrName); 9497 OS << MangledName; 9498 if (!IsExternallyVisible) 9499 OS << format("_%x", EntryInfo.FileID); 9500 OS << "_decl_tgt_ref_ptr"; 9501 } 9502 9503 Value *Ptr = M.getNamedValue(PtrName); 9504 9505 if (!Ptr) { 9506 GlobalValue *GlobalValue = M.getNamedValue(MangledName); 9507 Ptr = getOrCreateInternalVariable(LlvmPtrTy, PtrName); 9508 9509 auto *GV = cast<GlobalVariable>(Ptr); 9510 GV->setLinkage(GlobalValue::WeakAnyLinkage); 9511 9512 if (!Config.isTargetDevice()) { 9513 if (GlobalInitializer) 9514 GV->setInitializer(GlobalInitializer()); 9515 else 9516 GV->setInitializer(GlobalValue); 9517 } 9518 9519 registerTargetGlobalVariable( 9520 CaptureClause, DeviceClause, IsDeclaration, IsExternallyVisible, 9521 EntryInfo, MangledName, GeneratedRefs, OpenMPSIMD, TargetTriple, 9522 GlobalInitializer, VariableLinkage, LlvmPtrTy, cast<Constant>(Ptr)); 9523 } 9524 9525 return cast<Constant>(Ptr); 9526 } 9527 9528 return nullptr; 9529 } 9530 9531 void OpenMPIRBuilder::registerTargetGlobalVariable( 9532 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause, 9533 OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause, 9534 bool IsDeclaration, bool IsExternallyVisible, 9535 TargetRegionEntryInfo EntryInfo, StringRef MangledName, 9536 std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD, 9537 std::vector<Triple> TargetTriple, 9538 std::function<Constant *()> GlobalInitializer, 9539 std::function<GlobalValue::LinkageTypes()> VariableLinkage, Type *LlvmPtrTy, 9540 Constant *Addr) { 9541 if (DeviceClause != OffloadEntriesInfoManager::OMPTargetDeviceClauseAny || 9542 (TargetTriple.empty() && !Config.isTargetDevice())) 9543 return; 9544 9545 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind Flags; 9546 StringRef VarName; 9547 int64_t VarSize; 9548 GlobalValue::LinkageTypes Linkage; 9549 9550 if ((CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo || 9551 CaptureClause == 9552 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryEnter) && 9553 !Config.hasRequiresUnifiedSharedMemory()) { 9554 Flags = OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo; 9555 VarName = MangledName; 9556 GlobalValue *LlvmVal = M.getNamedValue(VarName); 9557 9558 if (!IsDeclaration) 9559 VarSize = divideCeil( 9560 M.getDataLayout().getTypeSizeInBits(LlvmVal->getValueType()), 8); 9561 else 9562 VarSize = 0; 
9563 Linkage = (VariableLinkage) ? VariableLinkage() : LlvmVal->getLinkage(); 9564 9565 // This is a workaround carried over from Clang which prevents undesired 9566 // optimisation of internal variables. 9567 if (Config.isTargetDevice() && 9568 (!IsExternallyVisible || Linkage == GlobalValue::LinkOnceODRLinkage)) { 9569 // Do not create a "ref-variable" if the original is not also available 9570 // on the host. 9571 if (!OffloadInfoManager.hasDeviceGlobalVarEntryInfo(VarName)) 9572 return; 9573 9574 std::string RefName = createPlatformSpecificName({VarName, "ref"}); 9575 9576 if (!M.getNamedValue(RefName)) { 9577 Constant *AddrRef = 9578 getOrCreateInternalVariable(Addr->getType(), RefName); 9579 auto *GvAddrRef = cast<GlobalVariable>(AddrRef); 9580 GvAddrRef->setConstant(true); 9581 GvAddrRef->setLinkage(GlobalValue::InternalLinkage); 9582 GvAddrRef->setInitializer(Addr); 9583 GeneratedRefs.push_back(GvAddrRef); 9584 } 9585 } 9586 } else { 9587 if (CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink) 9588 Flags = OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink; 9589 else 9590 Flags = OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo; 9591 9592 if (Config.isTargetDevice()) { 9593 VarName = (Addr) ? Addr->getName() : ""; 9594 Addr = nullptr; 9595 } else { 9596 Addr = getAddrOfDeclareTargetVar( 9597 CaptureClause, DeviceClause, IsDeclaration, IsExternallyVisible, 9598 EntryInfo, MangledName, GeneratedRefs, OpenMPSIMD, TargetTriple, 9599 LlvmPtrTy, GlobalInitializer, VariableLinkage); 9600 VarName = (Addr) ? Addr->getName() : ""; 9601 } 9602 VarSize = M.getDataLayout().getPointerSize(); 9603 Linkage = GlobalValue::WeakAnyLinkage; 9604 } 9605 9606 OffloadInfoManager.registerDeviceGlobalVarEntryInfo(VarName, Addr, VarSize, 9607 Flags, Linkage); 9608 } 9609 9610 /// Loads all the offload entries information from the host IR 9611 /// metadata. 9612 void OpenMPIRBuilder::loadOffloadInfoMetadata(Module &M) { 9613 // If we are in target mode, load the metadata from the host IR. This code has 9614 // to match the metadata creation in createOffloadEntriesAndInfoMetadata(). 
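  //
  // For reference, a target region operand of !omp_offload.info has the shape
  // documented in the emitter above (values illustrative):
  //   !{i32 0, i32 <device-id>, i32 <file-id>, !"<parent-name>", i32 <line>,
  //     i32 <count>, i32 <order>}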
9615 9616 NamedMDNode *MD = M.getNamedMetadata(ompOffloadInfoName); 9617 if (!MD) 9618 return; 9619 9620 for (MDNode *MN : MD->operands()) { 9621 auto &&GetMDInt = [MN](unsigned Idx) { 9622 auto *V = cast<ConstantAsMetadata>(MN->getOperand(Idx)); 9623 return cast<ConstantInt>(V->getValue())->getZExtValue(); 9624 }; 9625 9626 auto &&GetMDString = [MN](unsigned Idx) { 9627 auto *V = cast<MDString>(MN->getOperand(Idx)); 9628 return V->getString(); 9629 }; 9630 9631 switch (GetMDInt(0)) { 9632 default: 9633 llvm_unreachable("Unexpected metadata!"); 9634 break; 9635 case OffloadEntriesInfoManager::OffloadEntryInfo:: 9636 OffloadingEntryInfoTargetRegion: { 9637 TargetRegionEntryInfo EntryInfo(/*ParentName=*/GetMDString(3), 9638 /*DeviceID=*/GetMDInt(1), 9639 /*FileID=*/GetMDInt(2), 9640 /*Line=*/GetMDInt(4), 9641 /*Count=*/GetMDInt(5)); 9642 OffloadInfoManager.initializeTargetRegionEntryInfo(EntryInfo, 9643 /*Order=*/GetMDInt(6)); 9644 break; 9645 } 9646 case OffloadEntriesInfoManager::OffloadEntryInfo:: 9647 OffloadingEntryInfoDeviceGlobalVar: 9648 OffloadInfoManager.initializeDeviceGlobalVarEntryInfo( 9649 /*MangledName=*/GetMDString(1), 9650 static_cast<OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind>( 9651 /*Flags=*/GetMDInt(2)), 9652 /*Order=*/GetMDInt(3)); 9653 break; 9654 } 9655 } 9656 } 9657 9658 void OpenMPIRBuilder::loadOffloadInfoMetadata(StringRef HostFilePath) { 9659 if (HostFilePath.empty()) 9660 return; 9661 9662 auto Buf = MemoryBuffer::getFile(HostFilePath); 9663 if (std::error_code Err = Buf.getError()) { 9664 report_fatal_error(("error opening host file from host file path inside of " 9665 "OpenMPIRBuilder: " + 9666 Err.message()) 9667 .c_str()); 9668 } 9669 9670 LLVMContext Ctx; 9671 auto M = expectedToErrorOrAndEmitErrors( 9672 Ctx, parseBitcodeFile(Buf.get()->getMemBufferRef(), Ctx)); 9673 if (std::error_code Err = M.getError()) { 9674 report_fatal_error( 9675 ("error parsing host file inside of OpenMPIRBuilder: " + Err.message()) 9676 .c_str()); 9677 } 9678 9679 loadOffloadInfoMetadata(*M.get()); 9680 } 9681 9682 //===----------------------------------------------------------------------===// 9683 // OffloadEntriesInfoManager 9684 //===----------------------------------------------------------------------===// 9685 9686 bool OffloadEntriesInfoManager::empty() const { 9687 return OffloadEntriesTargetRegion.empty() && 9688 OffloadEntriesDeviceGlobalVar.empty(); 9689 } 9690 9691 unsigned OffloadEntriesInfoManager::getTargetRegionEntryInfoCount( 9692 const TargetRegionEntryInfo &EntryInfo) const { 9693 auto It = OffloadEntriesTargetRegionCount.find( 9694 getTargetRegionEntryCountKey(EntryInfo)); 9695 if (It == OffloadEntriesTargetRegionCount.end()) 9696 return 0; 9697 return It->second; 9698 } 9699 9700 void OffloadEntriesInfoManager::incrementTargetRegionEntryInfoCount( 9701 const TargetRegionEntryInfo &EntryInfo) { 9702 OffloadEntriesTargetRegionCount[getTargetRegionEntryCountKey(EntryInfo)] = 9703 EntryInfo.Count + 1; 9704 } 9705 9706 /// Initialize target region entry. 
9707 void OffloadEntriesInfoManager::initializeTargetRegionEntryInfo( 9708 const TargetRegionEntryInfo &EntryInfo, unsigned Order) { 9709 OffloadEntriesTargetRegion[EntryInfo] = 9710 OffloadEntryInfoTargetRegion(Order, /*Addr=*/nullptr, /*ID=*/nullptr, 9711 OMPTargetRegionEntryTargetRegion); 9712 ++OffloadingEntriesNum; 9713 } 9714 9715 void OffloadEntriesInfoManager::registerTargetRegionEntryInfo( 9716 TargetRegionEntryInfo EntryInfo, Constant *Addr, Constant *ID, 9717 OMPTargetRegionEntryKind Flags) { 9718 assert(EntryInfo.Count == 0 && "expected default EntryInfo"); 9719 9720 // Update the EntryInfo with the next available count for this location. 9721 EntryInfo.Count = getTargetRegionEntryInfoCount(EntryInfo); 9722 9723 // If we are emitting code for a target, the entry is already initialized, 9724 // only has to be registered. 9725 if (OMPBuilder->Config.isTargetDevice()) { 9726 // This could happen if the device compilation is invoked standalone. 9727 if (!hasTargetRegionEntryInfo(EntryInfo)) { 9728 return; 9729 } 9730 auto &Entry = OffloadEntriesTargetRegion[EntryInfo]; 9731 Entry.setAddress(Addr); 9732 Entry.setID(ID); 9733 Entry.setFlags(Flags); 9734 } else { 9735 if (Flags == OffloadEntriesInfoManager::OMPTargetRegionEntryTargetRegion && 9736 hasTargetRegionEntryInfo(EntryInfo, /*IgnoreAddressId*/ true)) 9737 return; 9738 assert(!hasTargetRegionEntryInfo(EntryInfo) && 9739 "Target region entry already registered!"); 9740 OffloadEntryInfoTargetRegion Entry(OffloadingEntriesNum, Addr, ID, Flags); 9741 OffloadEntriesTargetRegion[EntryInfo] = Entry; 9742 ++OffloadingEntriesNum; 9743 } 9744 incrementTargetRegionEntryInfoCount(EntryInfo); 9745 } 9746 9747 bool OffloadEntriesInfoManager::hasTargetRegionEntryInfo( 9748 TargetRegionEntryInfo EntryInfo, bool IgnoreAddressId) const { 9749 9750 // Update the EntryInfo with the next available count for this location. 9751 EntryInfo.Count = getTargetRegionEntryInfoCount(EntryInfo); 9752 9753 auto It = OffloadEntriesTargetRegion.find(EntryInfo); 9754 if (It == OffloadEntriesTargetRegion.end()) { 9755 return false; 9756 } 9757 // Fail if this entry is already registered. 9758 if (!IgnoreAddressId && (It->second.getAddress() || It->second.getID())) 9759 return false; 9760 return true; 9761 } 9762 9763 void OffloadEntriesInfoManager::actOnTargetRegionEntriesInfo( 9764 const OffloadTargetRegionEntryInfoActTy &Action) { 9765 // Scan all target region entries and perform the provided action. 9766 for (const auto &It : OffloadEntriesTargetRegion) { 9767 Action(It.first, It.second); 9768 } 9769 } 9770 9771 void OffloadEntriesInfoManager::initializeDeviceGlobalVarEntryInfo( 9772 StringRef Name, OMPTargetGlobalVarEntryKind Flags, unsigned Order) { 9773 OffloadEntriesDeviceGlobalVar.try_emplace(Name, Order, Flags); 9774 ++OffloadingEntriesNum; 9775 } 9776 9777 void OffloadEntriesInfoManager::registerDeviceGlobalVarEntryInfo( 9778 StringRef VarName, Constant *Addr, int64_t VarSize, 9779 OMPTargetGlobalVarEntryKind Flags, GlobalValue::LinkageTypes Linkage) { 9780 if (OMPBuilder->Config.isTargetDevice()) { 9781 // This could happen if the device compilation is invoked standalone. 
    if (!hasDeviceGlobalVarEntryInfo(VarName))
      return;
    auto &Entry = OffloadEntriesDeviceGlobalVar[VarName];
    if (Entry.getAddress() && hasDeviceGlobalVarEntryInfo(VarName)) {
      if (Entry.getVarSize() == 0) {
        Entry.setVarSize(VarSize);
        Entry.setLinkage(Linkage);
      }
      return;
    }
    Entry.setVarSize(VarSize);
    Entry.setLinkage(Linkage);
    Entry.setAddress(Addr);
  } else {
    if (hasDeviceGlobalVarEntryInfo(VarName)) {
      auto &Entry = OffloadEntriesDeviceGlobalVar[VarName];
      assert(Entry.isValid() && Entry.getFlags() == Flags &&
             "Entry not initialized!");
      if (Entry.getVarSize() == 0) {
        Entry.setVarSize(VarSize);
        Entry.setLinkage(Linkage);
      }
      return;
    }
    if (Flags == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryIndirect)
      OffloadEntriesDeviceGlobalVar.try_emplace(VarName, OffloadingEntriesNum,
                                                Addr, VarSize, Flags, Linkage,
                                                VarName.str());
    else
      OffloadEntriesDeviceGlobalVar.try_emplace(
          VarName, OffloadingEntriesNum, Addr, VarSize, Flags, Linkage, "");
    ++OffloadingEntriesNum;
  }
}

void OffloadEntriesInfoManager::actOnDeviceGlobalVarEntriesInfo(
    const OffloadDeviceGlobalVarEntryInfoActTy &Action) {
  // Scan all device global variable entries and perform the provided action.
  for (const auto &E : OffloadEntriesDeviceGlobalVar)
    Action(E.getKey(), E.getValue());
}

//===----------------------------------------------------------------------===//
// CanonicalLoopInfo
//===----------------------------------------------------------------------===//

void CanonicalLoopInfo::collectControlBlocks(
    SmallVectorImpl<BasicBlock *> &BBs) {
  // We only count those BBs as control blocks for which we do not need to
  // reverse the CFG, i.e. not the loop body which can contain arbitrary
  // control flow. For consistency, this also means we do not add the Body
  // block, which is just the entry to the body code.
  BBs.reserve(BBs.size() + 6);
  BBs.append({getPreheader(), Header, Cond, Latch, Exit, getAfter()});
}

BasicBlock *CanonicalLoopInfo::getPreheader() const {
  assert(isValid() && "Requires a valid canonical loop");
  for (BasicBlock *Pred : predecessors(Header)) {
    if (Pred != Latch)
      return Pred;
  }
  llvm_unreachable("Missing preheader");
}

void CanonicalLoopInfo::setTripCount(Value *TripCount) {
  assert(isValid() && "Requires a valid canonical loop");

  Instruction *CmpI = &getCond()->front();
  assert(isa<CmpInst>(CmpI) && "First inst must compare IV with TripCount");
  CmpI->setOperand(1, TripCount);

#ifndef NDEBUG
  assertOK();
#endif
}

void CanonicalLoopInfo::mapIndVar(
    llvm::function_ref<Value *(Instruction *)> Updater) {
  assert(isValid() && "Requires a valid canonical loop");

  Instruction *OldIV = getIndVar();

  // Record all uses excluding those introduced by the updater. Uses by the
  // CanonicalLoopInfo itself to keep track of the number of iterations are
  // excluded.
void CanonicalLoopInfo::mapIndVar(
    llvm::function_ref<Value *(Instruction *)> Updater) {
  assert(isValid() && "Requires a valid canonical loop");

  Instruction *OldIV = getIndVar();

  // Record all uses of the induction variable that are to be replaced. Uses
  // by the CanonicalLoopInfo itself to keep track of the number of iterations
  // (those in the Cond and Latch blocks) are excluded, as are uses introduced
  // later by the updater, which do not exist yet at this point.
  SmallVector<Use *> ReplaceableUses;
  for (Use &U : OldIV->uses()) {
    auto *User = dyn_cast<Instruction>(U.getUser());
    if (!User)
      continue;
    if (User->getParent() == getCond())
      continue;
    if (User->getParent() == getLatch())
      continue;
    ReplaceableUses.push_back(&U);
  }

  // Run the updater that may introduce new uses.
  Value *NewIV = Updater(OldIV);

  // Replace the old uses with the value returned by the updater.
  for (Use *U : ReplaceableUses)
    U->set(NewIV);

#ifndef NDEBUG
  assertOK();
#endif
}
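// For illustration only: a hedged sketch of mapIndVar shifting every
// body-visible use of the induction variable by a start offset, roughly what
// lowering a loop with a non-zero lower bound might do. `Builder`, `CLI`,
// and `StartV` are assumptions for this example.
//
// \code
//   CLI->mapIndVar([&](Instruction *OldIV) -> Value * {
//     // Materialize the shifted value at the top of the body, where all
//     // remaining uses of the old induction variable live.
//     Builder.SetInsertPoint(CLI->getBody(),
//                            CLI->getBody()->getFirstInsertionPt());
//     return Builder.CreateAdd(OldIV, StartV, "iv.shifted");
//   });
// \endcode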
void CanonicalLoopInfo::assertOK() const {
#ifndef NDEBUG
  // No constraints if this object currently does not describe a loop.
  if (!isValid())
    return;

  BasicBlock *Preheader = getPreheader();
  BasicBlock *Body = getBody();
  BasicBlock *After = getAfter();

  // Verify standard control-flow we use for OpenMP loops.
  assert(Preheader);
  assert(isa<BranchInst>(Preheader->getTerminator()) &&
         "Preheader must terminate with unconditional branch");
  assert(Preheader->getSingleSuccessor() == Header &&
         "Preheader must jump to header");

  assert(Header);
  assert(isa<BranchInst>(Header->getTerminator()) &&
         "Header must terminate with unconditional branch");
  assert(Header->getSingleSuccessor() == Cond &&
         "Header must jump to exiting block");

  assert(Cond);
  assert(Cond->getSinglePredecessor() == Header &&
         "Exiting block only reachable from header");

  assert(isa<BranchInst>(Cond->getTerminator()) &&
         "Exiting block must terminate with conditional branch");
  assert(size(successors(Cond)) == 2 &&
         "Exiting block must have two successors");
  assert(cast<BranchInst>(Cond->getTerminator())->getSuccessor(0) == Body &&
         "Exiting block's first successor must jump to the body");
  assert(cast<BranchInst>(Cond->getTerminator())->getSuccessor(1) == Exit &&
         "Exiting block's second successor must exit the loop");

  assert(Body);
  assert(Body->getSinglePredecessor() == Cond &&
         "Body only reachable from exiting block");
  assert(!isa<PHINode>(Body->front()));

  assert(Latch);
  assert(isa<BranchInst>(Latch->getTerminator()) &&
         "Latch must terminate with unconditional branch");
  assert(Latch->getSingleSuccessor() == Header && "Latch must jump to header");
  // TODO: To support simple redirecting of the end of body code that has
  // multiple exit edges, introduce another auxiliary basic block like
  // preheader and after.
  assert(Latch->getSinglePredecessor() != nullptr);
  assert(!isa<PHINode>(Latch->front()));

  assert(Exit);
  assert(isa<BranchInst>(Exit->getTerminator()) &&
         "Exit block must terminate with unconditional branch");
  assert(Exit->getSingleSuccessor() == After &&
         "Exit block must jump to after block");

  assert(After);
  assert(After->getSinglePredecessor() == Exit &&
         "After block only reachable from exit block");
  assert(After->empty() || !isa<PHINode>(After->front()));

  Instruction *IndVar = getIndVar();
  assert(IndVar && "Canonical induction variable not found?");
  assert(isa<IntegerType>(IndVar->getType()) &&
         "Induction variable must be an integer");
  assert(cast<PHINode>(IndVar)->getParent() == Header &&
         "Induction variable must be a PHI in the loop header");
  assert(cast<PHINode>(IndVar)->getIncomingBlock(0) == Preheader);
  assert(
      cast<ConstantInt>(cast<PHINode>(IndVar)->getIncomingValue(0))->isZero());
  assert(cast<PHINode>(IndVar)->getIncomingBlock(1) == Latch);

  auto *NextIndVar = cast<PHINode>(IndVar)->getIncomingValue(1);
  assert(cast<Instruction>(NextIndVar)->getParent() == Latch);
  assert(cast<BinaryOperator>(NextIndVar)->getOpcode() == BinaryOperator::Add);
  assert(cast<BinaryOperator>(NextIndVar)->getOperand(0) == IndVar);
  assert(cast<ConstantInt>(cast<BinaryOperator>(NextIndVar)->getOperand(1))
             ->isOne());

  Value *TripCount = getTripCount();
  assert(TripCount && "Loop trip count not found?");
  assert(IndVar->getType() == TripCount->getType() &&
         "Trip count and induction variable must have the same type");

  auto *CmpI = cast<CmpInst>(&Cond->front());
  assert(CmpI->getPredicate() == CmpInst::ICMP_ULT &&
         "Exit condition must be an unsigned less-than comparison");
  assert(CmpI->getOperand(0) == IndVar &&
         "Exit condition must compare the induction variable");
  assert(CmpI->getOperand(1) == TripCount &&
         "Exit condition must compare with the trip count");
#endif
}

void CanonicalLoopInfo::invalidate() {
  Header = nullptr;
  Cond = nullptr;
  Latch = nullptr;
  Exit = nullptr;
}
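// For illustration only: invalidate() is meant to be called by loop
// transformations that consume their input loops. A hedged sketch using
// tileLoops, assuming `OMPBuilder`, `DL`, `CLI`, and `TileSize` exist in the
// caller and that this snapshot's tileLoops invalidates its inputs:
//
// \code
//   std::vector<CanonicalLoopInfo *> Tiled =
//       OMPBuilder.tileLoops(DL, {CLI}, {TileSize});
//   // CLI has been consumed by the transformation; use the returned floor
//   // and tile loops instead.
//   CanonicalLoopInfo *FloorLoop = Tiled[0]; // iterates over tiles
//   CanonicalLoopInfo *TileLoop = Tiled[1];  // iterates within a tile
// \endcode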