1 //===- IR/OpenMPIRBuilder.h - OpenMP encoding builder for LLVM IR - C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file defines the OpenMPIRBuilder class and helpers used as a convenient 10 // way to create LLVM instructions for OpenMP directives. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #ifndef LLVM_FRONTEND_OPENMP_OMPIRBUILDER_H 15 #define LLVM_FRONTEND_OPENMP_OMPIRBUILDER_H 16 17 #include "llvm/Analysis/MemorySSAUpdater.h" 18 #include "llvm/Frontend/Atomic/Atomic.h" 19 #include "llvm/Frontend/OpenMP/OMPConstants.h" 20 #include "llvm/Frontend/OpenMP/OMPGridValues.h" 21 #include "llvm/IR/DebugLoc.h" 22 #include "llvm/IR/IRBuilder.h" 23 #include "llvm/IR/Module.h" 24 #include "llvm/Support/Allocator.h" 25 #include "llvm/TargetParser/Triple.h" 26 #include <forward_list> 27 #include <map> 28 #include <optional> 29 30 namespace llvm { 31 class CanonicalLoopInfo; 32 struct TargetRegionEntryInfo; 33 class OffloadEntriesInfoManager; 34 class OpenMPIRBuilder; 35 36 /// Move the instruction after an InsertPoint to the beginning of another 37 /// BasicBlock. 38 /// 39 /// The instructions after \p IP are moved to the beginning of \p New which must 40 /// not have any PHINodes. If \p CreateBranch is true, a branch instruction to 41 /// \p New will be added such that there is no semantic change. Otherwise, the 42 /// \p IP insert block remains degenerate and it is up to the caller to insert a 43 /// terminator. 44 void spliceBB(IRBuilderBase::InsertPoint IP, BasicBlock *New, 45 bool CreateBranch); 46 47 /// Splice a BasicBlock at an IRBuilder's current insertion point. Its new 48 /// insert location will stick to after the instruction before the insertion 49 /// point (instead of moving with the instruction the InsertPoint stores 50 /// internally). 51 void spliceBB(IRBuilder<> &Builder, BasicBlock *New, bool CreateBranch); 52 53 /// Split a BasicBlock at an InsertPoint, even if the block is degenerate 54 /// (missing the terminator). 55 /// 56 /// llvm::SplitBasicBlock and BasicBlock::splitBasicBlock require a well-formed 57 /// BasicBlock. \p Name is used for the new successor block. If \p CreateBranch 58 /// is true, a branch to the new successor will new created such that 59 /// semantically there is no change; otherwise the block of the insertion point 60 /// remains degenerate and it is the caller's responsibility to insert a 61 /// terminator. Returns the new successor block. 62 BasicBlock *splitBB(IRBuilderBase::InsertPoint IP, bool CreateBranch, 63 llvm::Twine Name = {}); 64 65 /// Split a BasicBlock at \p Builder's insertion point, even if the block is 66 /// degenerate (missing the terminator). Its new insert location will stick to 67 /// after the instruction before the insertion point (instead of moving with the 68 /// instruction the InsertPoint stores internally). 69 BasicBlock *splitBB(IRBuilderBase &Builder, bool CreateBranch, 70 llvm::Twine Name = {}); 71 72 /// Split a BasicBlock at \p Builder's insertion point, even if the block is 73 /// degenerate (missing the terminator). Its new insert location will stick to 74 /// after the instruction before the insertion point (instead of moving with the 75 /// instruction the InsertPoint stores internally). 76 BasicBlock *splitBB(IRBuilder<> &Builder, bool CreateBranch, llvm::Twine Name); 77 78 /// Like splitBB, but reuses the current block's name for the new name. 79 BasicBlock *splitBBWithSuffix(IRBuilderBase &Builder, bool CreateBranch, 80 llvm::Twine Suffix = ".split"); 81 82 /// Captures attributes that affect generating LLVM-IR using the 83 /// OpenMPIRBuilder and related classes. Note that not all attributes are 84 /// required for all classes or functions. In some use cases the configuration 85 /// is not necessary at all, because because the only functions that are called 86 /// are ones that are not dependent on the configuration. 87 class OpenMPIRBuilderConfig { 88 public: 89 /// Flag to define whether to generate code for the role of the OpenMP host 90 /// (if set to false) or device (if set to true) in an offloading context. It 91 /// is set when the -fopenmp-is-target-device compiler frontend option is 92 /// specified. 93 std::optional<bool> IsTargetDevice; 94 95 /// Flag for specifying if the compilation is done for an accelerator. It is 96 /// set according to the architecture of the target triple and currently only 97 /// true when targeting AMDGPU or NVPTX. Today, these targets can only perform 98 /// the role of an OpenMP target device, so `IsTargetDevice` must also be true 99 /// if `IsGPU` is true. This restriction might be lifted if an accelerator- 100 /// like target with the ability to work as the OpenMP host is added, or if 101 /// the capabilities of the currently supported GPU architectures are 102 /// expanded. 103 std::optional<bool> IsGPU; 104 105 /// Flag for specifying if LLVMUsed information should be emitted. 106 std::optional<bool> EmitLLVMUsedMetaInfo; 107 108 /// Flag for specifying if offloading is mandatory. 109 std::optional<bool> OpenMPOffloadMandatory; 110 111 /// First separator used between the initial two parts of a name. 112 std::optional<StringRef> FirstSeparator; 113 /// Separator used between all of the rest consecutive parts of s name 114 std::optional<StringRef> Separator; 115 116 // Grid Value for the GPU target 117 std::optional<omp::GV> GridValue; 118 119 /// When compilation is being done for the OpenMP host (i.e. `IsTargetDevice = 120 /// false`), this contains the list of offloading triples associated, if any. 121 SmallVector<Triple> TargetTriples; 122 123 OpenMPIRBuilderConfig(); 124 OpenMPIRBuilderConfig(bool IsTargetDevice, bool IsGPU, 125 bool OpenMPOffloadMandatory, 126 bool HasRequiresReverseOffload, 127 bool HasRequiresUnifiedAddress, 128 bool HasRequiresUnifiedSharedMemory, 129 bool HasRequiresDynamicAllocators); 130 131 // Getters functions that assert if the required values are not present. 132 bool isTargetDevice() const { 133 assert(IsTargetDevice.has_value() && "IsTargetDevice is not set"); 134 return *IsTargetDevice; 135 } 136 137 bool isGPU() const { 138 assert(IsGPU.has_value() && "IsGPU is not set"); 139 return *IsGPU; 140 } 141 142 bool openMPOffloadMandatory() const { 143 assert(OpenMPOffloadMandatory.has_value() && 144 "OpenMPOffloadMandatory is not set"); 145 return *OpenMPOffloadMandatory; 146 } 147 148 omp::GV getGridValue() const { 149 assert(GridValue.has_value() && "GridValue is not set"); 150 return *GridValue; 151 } 152 153 bool hasRequiresFlags() const { return RequiresFlags; } 154 bool hasRequiresReverseOffload() const; 155 bool hasRequiresUnifiedAddress() const; 156 bool hasRequiresUnifiedSharedMemory() const; 157 bool hasRequiresDynamicAllocators() const; 158 159 /// Returns requires directive clauses as flags compatible with those expected 160 /// by libomptarget. 161 int64_t getRequiresFlags() const; 162 163 // Returns the FirstSeparator if set, otherwise use the default separator 164 // depending on isGPU 165 StringRef firstSeparator() const { 166 if (FirstSeparator.has_value()) 167 return *FirstSeparator; 168 if (isGPU()) 169 return "_"; 170 return "."; 171 } 172 173 // Returns the Separator if set, otherwise use the default separator depending 174 // on isGPU 175 StringRef separator() const { 176 if (Separator.has_value()) 177 return *Separator; 178 if (isGPU()) 179 return "$"; 180 return "."; 181 } 182 183 void setIsTargetDevice(bool Value) { IsTargetDevice = Value; } 184 void setIsGPU(bool Value) { IsGPU = Value; } 185 void setEmitLLVMUsed(bool Value = true) { EmitLLVMUsedMetaInfo = Value; } 186 void setOpenMPOffloadMandatory(bool Value) { OpenMPOffloadMandatory = Value; } 187 void setFirstSeparator(StringRef FS) { FirstSeparator = FS; } 188 void setSeparator(StringRef S) { Separator = S; } 189 void setGridValue(omp::GV G) { GridValue = G; } 190 191 void setHasRequiresReverseOffload(bool Value); 192 void setHasRequiresUnifiedAddress(bool Value); 193 void setHasRequiresUnifiedSharedMemory(bool Value); 194 void setHasRequiresDynamicAllocators(bool Value); 195 196 private: 197 /// Flags for specifying which requires directive clauses are present. 198 int64_t RequiresFlags; 199 }; 200 201 /// Data structure to contain the information needed to uniquely identify 202 /// a target entry. 203 struct TargetRegionEntryInfo { 204 /// The prefix used for kernel names. 205 static constexpr const char *KernelNamePrefix = "__omp_offloading_"; 206 207 std::string ParentName; 208 unsigned DeviceID; 209 unsigned FileID; 210 unsigned Line; 211 unsigned Count; 212 213 TargetRegionEntryInfo() : DeviceID(0), FileID(0), Line(0), Count(0) {} 214 TargetRegionEntryInfo(StringRef ParentName, unsigned DeviceID, 215 unsigned FileID, unsigned Line, unsigned Count = 0) 216 : ParentName(ParentName), DeviceID(DeviceID), FileID(FileID), Line(Line), 217 Count(Count) {} 218 219 static void getTargetRegionEntryFnName(SmallVectorImpl<char> &Name, 220 StringRef ParentName, 221 unsigned DeviceID, unsigned FileID, 222 unsigned Line, unsigned Count); 223 224 bool operator<(const TargetRegionEntryInfo &RHS) const { 225 return std::make_tuple(ParentName, DeviceID, FileID, Line, Count) < 226 std::make_tuple(RHS.ParentName, RHS.DeviceID, RHS.FileID, RHS.Line, 227 RHS.Count); 228 } 229 }; 230 231 /// Class that manages information about offload code regions and data 232 class OffloadEntriesInfoManager { 233 /// Number of entries registered so far. 234 OpenMPIRBuilder *OMPBuilder; 235 unsigned OffloadingEntriesNum = 0; 236 237 public: 238 /// Base class of the entries info. 239 class OffloadEntryInfo { 240 public: 241 /// Kind of a given entry. 242 enum OffloadingEntryInfoKinds : unsigned { 243 /// Entry is a target region. 244 OffloadingEntryInfoTargetRegion = 0, 245 /// Entry is a declare target variable. 246 OffloadingEntryInfoDeviceGlobalVar = 1, 247 /// Invalid entry info. 248 OffloadingEntryInfoInvalid = ~0u 249 }; 250 251 protected: 252 OffloadEntryInfo() = delete; 253 explicit OffloadEntryInfo(OffloadingEntryInfoKinds Kind) : Kind(Kind) {} 254 explicit OffloadEntryInfo(OffloadingEntryInfoKinds Kind, unsigned Order, 255 uint32_t Flags) 256 : Flags(Flags), Order(Order), Kind(Kind) {} 257 ~OffloadEntryInfo() = default; 258 259 public: 260 bool isValid() const { return Order != ~0u; } 261 unsigned getOrder() const { return Order; } 262 OffloadingEntryInfoKinds getKind() const { return Kind; } 263 uint32_t getFlags() const { return Flags; } 264 void setFlags(uint32_t NewFlags) { Flags = NewFlags; } 265 Constant *getAddress() const { return cast_or_null<Constant>(Addr); } 266 void setAddress(Constant *V) { 267 assert(!Addr.pointsToAliveValue() && "Address has been set before!"); 268 Addr = V; 269 } 270 static bool classof(const OffloadEntryInfo *Info) { return true; } 271 272 private: 273 /// Address of the entity that has to be mapped for offloading. 274 WeakTrackingVH Addr; 275 276 /// Flags associated with the device global. 277 uint32_t Flags = 0u; 278 279 /// Order this entry was emitted. 280 unsigned Order = ~0u; 281 282 OffloadingEntryInfoKinds Kind = OffloadingEntryInfoInvalid; 283 }; 284 285 /// Return true if a there are no entries defined. 286 bool empty() const; 287 /// Return number of entries defined so far. 288 unsigned size() const { return OffloadingEntriesNum; } 289 290 OffloadEntriesInfoManager(OpenMPIRBuilder *builder) : OMPBuilder(builder) {} 291 292 // 293 // Target region entries related. 294 // 295 296 /// Kind of the target registry entry. 297 enum OMPTargetRegionEntryKind : uint32_t { 298 /// Mark the entry as target region. 299 OMPTargetRegionEntryTargetRegion = 0x0, 300 }; 301 302 /// Target region entries info. 303 class OffloadEntryInfoTargetRegion final : public OffloadEntryInfo { 304 /// Address that can be used as the ID of the entry. 305 Constant *ID = nullptr; 306 307 public: 308 OffloadEntryInfoTargetRegion() 309 : OffloadEntryInfo(OffloadingEntryInfoTargetRegion) {} 310 explicit OffloadEntryInfoTargetRegion(unsigned Order, Constant *Addr, 311 Constant *ID, 312 OMPTargetRegionEntryKind Flags) 313 : OffloadEntryInfo(OffloadingEntryInfoTargetRegion, Order, Flags), 314 ID(ID) { 315 setAddress(Addr); 316 } 317 318 Constant *getID() const { return ID; } 319 void setID(Constant *V) { 320 assert(!ID && "ID has been set before!"); 321 ID = V; 322 } 323 static bool classof(const OffloadEntryInfo *Info) { 324 return Info->getKind() == OffloadingEntryInfoTargetRegion; 325 } 326 }; 327 328 /// Initialize target region entry. 329 /// This is ONLY needed for DEVICE compilation. 330 void initializeTargetRegionEntryInfo(const TargetRegionEntryInfo &EntryInfo, 331 unsigned Order); 332 /// Register target region entry. 333 void registerTargetRegionEntryInfo(TargetRegionEntryInfo EntryInfo, 334 Constant *Addr, Constant *ID, 335 OMPTargetRegionEntryKind Flags); 336 /// Return true if a target region entry with the provided information 337 /// exists. 338 bool hasTargetRegionEntryInfo(TargetRegionEntryInfo EntryInfo, 339 bool IgnoreAddressId = false) const; 340 341 // Return the Name based on \a EntryInfo using the next available Count. 342 void getTargetRegionEntryFnName(SmallVectorImpl<char> &Name, 343 const TargetRegionEntryInfo &EntryInfo); 344 345 /// brief Applies action \a Action on all registered entries. 346 typedef function_ref<void(const TargetRegionEntryInfo &EntryInfo, 347 const OffloadEntryInfoTargetRegion &)> 348 OffloadTargetRegionEntryInfoActTy; 349 void 350 actOnTargetRegionEntriesInfo(const OffloadTargetRegionEntryInfoActTy &Action); 351 352 // 353 // Device global variable entries related. 354 // 355 356 /// Kind of the global variable entry.. 357 enum OMPTargetGlobalVarEntryKind : uint32_t { 358 /// Mark the entry as a to declare target. 359 OMPTargetGlobalVarEntryTo = 0x0, 360 /// Mark the entry as a to declare target link. 361 OMPTargetGlobalVarEntryLink = 0x1, 362 /// Mark the entry as a declare target enter. 363 OMPTargetGlobalVarEntryEnter = 0x2, 364 /// Mark the entry as having no declare target entry kind. 365 OMPTargetGlobalVarEntryNone = 0x3, 366 /// Mark the entry as a declare target indirect global. 367 OMPTargetGlobalVarEntryIndirect = 0x8, 368 /// Mark the entry as a register requires global. 369 OMPTargetGlobalRegisterRequires = 0x10, 370 }; 371 372 /// Kind of device clause for declare target variables 373 /// and functions 374 /// NOTE: Currently not used as a part of a variable entry 375 /// used for Flang and Clang to interface with the variable 376 /// related registration functions 377 enum OMPTargetDeviceClauseKind : uint32_t { 378 /// The target is marked for all devices 379 OMPTargetDeviceClauseAny = 0x0, 380 /// The target is marked for non-host devices 381 OMPTargetDeviceClauseNoHost = 0x1, 382 /// The target is marked for host devices 383 OMPTargetDeviceClauseHost = 0x2, 384 /// The target is marked as having no clause 385 OMPTargetDeviceClauseNone = 0x3 386 }; 387 388 /// Device global variable entries info. 389 class OffloadEntryInfoDeviceGlobalVar final : public OffloadEntryInfo { 390 /// Type of the global variable. 391 int64_t VarSize; 392 GlobalValue::LinkageTypes Linkage; 393 const std::string VarName; 394 395 public: 396 OffloadEntryInfoDeviceGlobalVar() 397 : OffloadEntryInfo(OffloadingEntryInfoDeviceGlobalVar) {} 398 explicit OffloadEntryInfoDeviceGlobalVar(unsigned Order, 399 OMPTargetGlobalVarEntryKind Flags) 400 : OffloadEntryInfo(OffloadingEntryInfoDeviceGlobalVar, Order, Flags) {} 401 explicit OffloadEntryInfoDeviceGlobalVar(unsigned Order, Constant *Addr, 402 int64_t VarSize, 403 OMPTargetGlobalVarEntryKind Flags, 404 GlobalValue::LinkageTypes Linkage, 405 const std::string &VarName) 406 : OffloadEntryInfo(OffloadingEntryInfoDeviceGlobalVar, Order, Flags), 407 VarSize(VarSize), Linkage(Linkage), VarName(VarName) { 408 setAddress(Addr); 409 } 410 411 int64_t getVarSize() const { return VarSize; } 412 StringRef getVarName() const { return VarName; } 413 void setVarSize(int64_t Size) { VarSize = Size; } 414 GlobalValue::LinkageTypes getLinkage() const { return Linkage; } 415 void setLinkage(GlobalValue::LinkageTypes LT) { Linkage = LT; } 416 static bool classof(const OffloadEntryInfo *Info) { 417 return Info->getKind() == OffloadingEntryInfoDeviceGlobalVar; 418 } 419 }; 420 421 /// Initialize device global variable entry. 422 /// This is ONLY used for DEVICE compilation. 423 void initializeDeviceGlobalVarEntryInfo(StringRef Name, 424 OMPTargetGlobalVarEntryKind Flags, 425 unsigned Order); 426 427 /// Register device global variable entry. 428 void registerDeviceGlobalVarEntryInfo(StringRef VarName, Constant *Addr, 429 int64_t VarSize, 430 OMPTargetGlobalVarEntryKind Flags, 431 GlobalValue::LinkageTypes Linkage); 432 /// Checks if the variable with the given name has been registered already. 433 bool hasDeviceGlobalVarEntryInfo(StringRef VarName) const { 434 return OffloadEntriesDeviceGlobalVar.count(VarName) > 0; 435 } 436 /// Applies action \a Action on all registered entries. 437 typedef function_ref<void(StringRef, const OffloadEntryInfoDeviceGlobalVar &)> 438 OffloadDeviceGlobalVarEntryInfoActTy; 439 void actOnDeviceGlobalVarEntriesInfo( 440 const OffloadDeviceGlobalVarEntryInfoActTy &Action); 441 442 private: 443 /// Return the count of entries at a particular source location. 444 unsigned 445 getTargetRegionEntryInfoCount(const TargetRegionEntryInfo &EntryInfo) const; 446 447 /// Update the count of entries at a particular source location. 448 void 449 incrementTargetRegionEntryInfoCount(const TargetRegionEntryInfo &EntryInfo); 450 451 static TargetRegionEntryInfo 452 getTargetRegionEntryCountKey(const TargetRegionEntryInfo &EntryInfo) { 453 return TargetRegionEntryInfo(EntryInfo.ParentName, EntryInfo.DeviceID, 454 EntryInfo.FileID, EntryInfo.Line, 0); 455 } 456 457 // Count of entries at a location. 458 std::map<TargetRegionEntryInfo, unsigned> OffloadEntriesTargetRegionCount; 459 460 // Storage for target region entries kind. 461 typedef std::map<TargetRegionEntryInfo, OffloadEntryInfoTargetRegion> 462 OffloadEntriesTargetRegionTy; 463 OffloadEntriesTargetRegionTy OffloadEntriesTargetRegion; 464 /// Storage for device global variable entries kind. The storage is to be 465 /// indexed by mangled name. 466 typedef StringMap<OffloadEntryInfoDeviceGlobalVar> 467 OffloadEntriesDeviceGlobalVarTy; 468 OffloadEntriesDeviceGlobalVarTy OffloadEntriesDeviceGlobalVar; 469 }; 470 471 /// An interface to create LLVM-IR for OpenMP directives. 472 /// 473 /// Each OpenMP directive has a corresponding public generator method. 474 class OpenMPIRBuilder { 475 public: 476 /// Create a new OpenMPIRBuilder operating on the given module \p M. This will 477 /// not have an effect on \p M (see initialize) 478 OpenMPIRBuilder(Module &M) 479 : M(M), Builder(M.getContext()), OffloadInfoManager(this), 480 T(Triple(M.getTargetTriple())) {} 481 ~OpenMPIRBuilder(); 482 483 class AtomicInfo : public llvm::AtomicInfo { 484 llvm::Value *AtomicVar; 485 486 public: 487 AtomicInfo(IRBuilder<> *Builder, llvm::Type *Ty, uint64_t AtomicSizeInBits, 488 uint64_t ValueSizeInBits, llvm::Align AtomicAlign, 489 llvm::Align ValueAlign, bool UseLibcall, llvm::Value *AtomicVar) 490 : llvm::AtomicInfo(Builder, Ty, AtomicSizeInBits, ValueSizeInBits, 491 AtomicAlign, ValueAlign, UseLibcall), 492 AtomicVar(AtomicVar) {} 493 494 llvm::Value *getAtomicPointer() const override { return AtomicVar; } 495 void decorateWithTBAA(llvm::Instruction *I) override {} 496 llvm::AllocaInst *CreateAlloca(llvm::Type *Ty, 497 const llvm::Twine &Name) const override { 498 llvm::AllocaInst *allocaInst = Builder->CreateAlloca(Ty); 499 allocaInst->setName(Name); 500 return allocaInst; 501 } 502 }; 503 /// Initialize the internal state, this will put structures types and 504 /// potentially other helpers into the underlying module. Must be called 505 /// before any other method and only once! This internal state includes types 506 /// used in the OpenMPIRBuilder generated from OMPKinds.def. 507 void initialize(); 508 509 void setConfig(OpenMPIRBuilderConfig C) { Config = C; } 510 511 /// Finalize the underlying module, e.g., by outlining regions. 512 /// \param Fn The function to be finalized. If not used, 513 /// all functions are finalized. 514 void finalize(Function *Fn = nullptr); 515 516 /// Add attributes known for \p FnID to \p Fn. 517 void addAttributes(omp::RuntimeFunction FnID, Function &Fn); 518 519 /// Type used throughout for insertion points. 520 using InsertPointTy = IRBuilder<>::InsertPoint; 521 522 /// Type used to represent an insertion point or an error value. 523 using InsertPointOrErrorTy = Expected<InsertPointTy>; 524 525 /// Get the create a name using the platform specific separators. 526 /// \param Parts parts of the final name that needs separation 527 /// The created name has a first separator between the first and second part 528 /// and a second separator between all other parts. 529 /// E.g. with FirstSeparator "$" and Separator "." and 530 /// parts: "p1", "p2", "p3", "p4" 531 /// The resulting name is "p1$p2.p3.p4" 532 /// The separators are retrieved from the OpenMPIRBuilderConfig. 533 std::string createPlatformSpecificName(ArrayRef<StringRef> Parts) const; 534 535 /// Callback type for variable finalization (think destructors). 536 /// 537 /// \param CodeGenIP is the insertion point at which the finalization code 538 /// should be placed. 539 /// 540 /// A finalize callback knows about all objects that need finalization, e.g. 541 /// destruction, when the scope of the currently generated construct is left 542 /// at the time, and location, the callback is invoked. 543 using FinalizeCallbackTy = std::function<Error(InsertPointTy CodeGenIP)>; 544 545 struct FinalizationInfo { 546 /// The finalization callback provided by the last in-flight invocation of 547 /// createXXXX for the directive of kind DK. 548 FinalizeCallbackTy FiniCB; 549 550 /// The directive kind of the innermost directive that has an associated 551 /// region which might require finalization when it is left. 552 omp::Directive DK; 553 554 /// Flag to indicate if the directive is cancellable. 555 bool IsCancellable; 556 }; 557 558 /// Push a finalization callback on the finalization stack. 559 /// 560 /// NOTE: Temporary solution until Clang CG is gone. 561 void pushFinalizationCB(const FinalizationInfo &FI) { 562 FinalizationStack.push_back(FI); 563 } 564 565 /// Pop the last finalization callback from the finalization stack. 566 /// 567 /// NOTE: Temporary solution until Clang CG is gone. 568 void popFinalizationCB() { FinalizationStack.pop_back(); } 569 570 /// Callback type for body (=inner region) code generation 571 /// 572 /// The callback takes code locations as arguments, each describing a 573 /// location where additional instructions can be inserted. 574 /// 575 /// The CodeGenIP may be in the middle of a basic block or point to the end of 576 /// it. The basic block may have a terminator or be degenerate. The callback 577 /// function may just insert instructions at that position, but also split the 578 /// block (without the Before argument of BasicBlock::splitBasicBlock such 579 /// that the identify of the split predecessor block is preserved) and insert 580 /// additional control flow, including branches that do not lead back to what 581 /// follows the CodeGenIP. Note that since the callback is allowed to split 582 /// the block, callers must assume that InsertPoints to positions in the 583 /// BasicBlock after CodeGenIP including CodeGenIP itself are invalidated. If 584 /// such InsertPoints need to be preserved, it can split the block itself 585 /// before calling the callback. 586 /// 587 /// AllocaIP and CodeGenIP must not point to the same position. 588 /// 589 /// \param AllocaIP is the insertion point at which new alloca instructions 590 /// should be placed. The BasicBlock it is pointing to must 591 /// not be split. 592 /// \param CodeGenIP is the insertion point at which the body code should be 593 /// placed. 594 /// 595 /// \return an error, if any were triggered during execution. 596 using BodyGenCallbackTy = 597 function_ref<Error(InsertPointTy AllocaIP, InsertPointTy CodeGenIP)>; 598 599 // This is created primarily for sections construct as llvm::function_ref 600 // (BodyGenCallbackTy) is not storable (as described in the comments of 601 // function_ref class - function_ref contains non-ownable reference 602 // to the callable. 603 /// 604 /// \return an error, if any were triggered during execution. 605 using StorableBodyGenCallbackTy = 606 std::function<Error(InsertPointTy AllocaIP, InsertPointTy CodeGenIP)>; 607 608 /// Callback type for loop body code generation. 609 /// 610 /// \param CodeGenIP is the insertion point where the loop's body code must be 611 /// placed. This will be a dedicated BasicBlock with a 612 /// conditional branch from the loop condition check and 613 /// terminated with an unconditional branch to the loop 614 /// latch. 615 /// \param IndVar is the induction variable usable at the insertion point. 616 /// 617 /// \return an error, if any were triggered during execution. 618 using LoopBodyGenCallbackTy = 619 function_ref<Error(InsertPointTy CodeGenIP, Value *IndVar)>; 620 621 /// Callback type for variable privatization (think copy & default 622 /// constructor). 623 /// 624 /// \param AllocaIP is the insertion point at which new alloca instructions 625 /// should be placed. 626 /// \param CodeGenIP is the insertion point at which the privatization code 627 /// should be placed. 628 /// \param Original The value being copied/created, should not be used in the 629 /// generated IR. 630 /// \param Inner The equivalent of \p Original that should be used in the 631 /// generated IR; this is equal to \p Original if the value is 632 /// a pointer and can thus be passed directly, otherwise it is 633 /// an equivalent but different value. 634 /// \param ReplVal The replacement value, thus a copy or new created version 635 /// of \p Inner. 636 /// 637 /// \returns The new insertion point where code generation continues and 638 /// \p ReplVal the replacement value. 639 using PrivatizeCallbackTy = function_ref<InsertPointOrErrorTy( 640 InsertPointTy AllocaIP, InsertPointTy CodeGenIP, Value &Original, 641 Value &Inner, Value *&ReplVal)>; 642 643 /// Description of a LLVM-IR insertion point (IP) and a debug/source location 644 /// (filename, line, column, ...). 645 struct LocationDescription { 646 LocationDescription(const IRBuilderBase &IRB) 647 : IP(IRB.saveIP()), DL(IRB.getCurrentDebugLocation()) {} 648 LocationDescription(const InsertPointTy &IP) : IP(IP) {} 649 LocationDescription(const InsertPointTy &IP, const DebugLoc &DL) 650 : IP(IP), DL(DL) {} 651 InsertPointTy IP; 652 DebugLoc DL; 653 }; 654 655 /// Emitter methods for OpenMP directives. 656 /// 657 ///{ 658 659 /// Generator for '#omp barrier' 660 /// 661 /// \param Loc The location where the barrier directive was encountered. 662 /// \param Kind The kind of directive that caused the barrier. 663 /// \param ForceSimpleCall Flag to force a simple (=non-cancellation) barrier. 664 /// \param CheckCancelFlag Flag to indicate a cancel barrier return value 665 /// should be checked and acted upon. 666 /// \param ThreadID Optional parameter to pass in any existing ThreadID value. 667 /// 668 /// \returns The insertion point after the barrier. 669 InsertPointOrErrorTy createBarrier(const LocationDescription &Loc, 670 omp::Directive Kind, 671 bool ForceSimpleCall = false, 672 bool CheckCancelFlag = true); 673 674 /// Generator for '#omp cancel' 675 /// 676 /// \param Loc The location where the directive was encountered. 677 /// \param IfCondition The evaluated 'if' clause expression, if any. 678 /// \param CanceledDirective The kind of directive that is cancled. 679 /// 680 /// \returns The insertion point after the barrier. 681 InsertPointOrErrorTy createCancel(const LocationDescription &Loc, 682 Value *IfCondition, 683 omp::Directive CanceledDirective); 684 685 /// Generator for '#omp parallel' 686 /// 687 /// \param Loc The insert and source location description. 688 /// \param AllocaIP The insertion points to be used for alloca instructions. 689 /// \param BodyGenCB Callback that will generate the region code. 690 /// \param PrivCB Callback to copy a given variable (think copy constructor). 691 /// \param FiniCB Callback to finalize variable copies. 692 /// \param IfCondition The evaluated 'if' clause expression, if any. 693 /// \param NumThreads The evaluated 'num_threads' clause expression, if any. 694 /// \param ProcBind The value of the 'proc_bind' clause (see ProcBindKind). 695 /// \param IsCancellable Flag to indicate a cancellable parallel region. 696 /// 697 /// \returns The insertion position *after* the parallel. 698 InsertPointOrErrorTy 699 createParallel(const LocationDescription &Loc, InsertPointTy AllocaIP, 700 BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB, 701 FinalizeCallbackTy FiniCB, Value *IfCondition, 702 Value *NumThreads, omp::ProcBindKind ProcBind, 703 bool IsCancellable); 704 705 /// Generator for the control flow structure of an OpenMP canonical loop. 706 /// 707 /// This generator operates on the logical iteration space of the loop, i.e. 708 /// the caller only has to provide a loop trip count of the loop as defined by 709 /// base language semantics. The trip count is interpreted as an unsigned 710 /// integer. The induction variable passed to \p BodyGenCB will be of the same 711 /// type and run from 0 to \p TripCount - 1. It is up to the callback to 712 /// convert the logical iteration variable to the loop counter variable in the 713 /// loop body. 714 /// 715 /// \param Loc The insert and source location description. The insert 716 /// location can be between two instructions or the end of a 717 /// degenerate block (e.g. a BB under construction). 718 /// \param BodyGenCB Callback that will generate the loop body code. 719 /// \param TripCount Number of iterations the loop body is executed. 720 /// \param Name Base name used to derive BB and instruction names. 721 /// 722 /// \returns An object representing the created control flow structure which 723 /// can be used for loop-associated directives. 724 Expected<CanonicalLoopInfo *> 725 createCanonicalLoop(const LocationDescription &Loc, 726 LoopBodyGenCallbackTy BodyGenCB, Value *TripCount, 727 const Twine &Name = "loop"); 728 729 /// Generator for the control flow structure of an OpenMP canonical loop. 730 /// 731 /// Instead of a logical iteration space, this allows specifying user-defined 732 /// loop counter values using increment, upper- and lower bounds. To 733 /// disambiguate the terminology when counting downwards, instead of lower 734 /// bounds we use \p Start for the loop counter value in the first body 735 /// iteration. 736 /// 737 /// Consider the following limitations: 738 /// 739 /// * A loop counter space over all integer values of its bit-width cannot be 740 /// represented. E.g using uint8_t, its loop trip count of 256 cannot be 741 /// stored into an 8 bit integer): 742 /// 743 /// DO I = 0, 255, 1 744 /// 745 /// * Unsigned wrapping is only supported when wrapping only "once"; E.g. 746 /// effectively counting downwards: 747 /// 748 /// for (uint8_t i = 100u; i > 0; i += 127u) 749 /// 750 /// 751 /// TODO: May need to add additional parameters to represent: 752 /// 753 /// * Allow representing downcounting with unsigned integers. 754 /// 755 /// * Sign of the step and the comparison operator might disagree: 756 /// 757 /// for (int i = 0; i < 42; i -= 1u) 758 /// 759 // 760 /// \param Loc The insert and source location description. 761 /// \param BodyGenCB Callback that will generate the loop body code. 762 /// \param Start Value of the loop counter for the first iterations. 763 /// \param Stop Loop counter values past this will stop the loop. 764 /// \param Step Loop counter increment after each iteration; negative 765 /// means counting down. 766 /// \param IsSigned Whether Start, Stop and Step are signed integers. 767 /// \param InclusiveStop Whether \p Stop itself is a valid value for the loop 768 /// counter. 769 /// \param ComputeIP Insertion point for instructions computing the trip 770 /// count. Can be used to ensure the trip count is available 771 /// at the outermost loop of a loop nest. If not set, 772 /// defaults to the preheader of the generated loop. 773 /// \param Name Base name used to derive BB and instruction names. 774 /// 775 /// \returns An object representing the created control flow structure which 776 /// can be used for loop-associated directives. 777 Expected<CanonicalLoopInfo *> createCanonicalLoop( 778 const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB, 779 Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop, 780 InsertPointTy ComputeIP = {}, const Twine &Name = "loop"); 781 782 /// Collapse a loop nest into a single loop. 783 /// 784 /// Merges loops of a loop nest into a single CanonicalLoopNest representation 785 /// that has the same number of innermost loop iterations as the origin loop 786 /// nest. The induction variables of the input loops are derived from the 787 /// collapsed loop's induction variable. This is intended to be used to 788 /// implement OpenMP's collapse clause. Before applying a directive, 789 /// collapseLoops normalizes a loop nest to contain only a single loop and the 790 /// directive's implementation does not need to handle multiple loops itself. 791 /// This does not remove the need to handle all loop nest handling by 792 /// directives, such as the ordered(<n>) clause or the simd schedule-clause 793 /// modifier of the worksharing-loop directive. 794 /// 795 /// Example: 796 /// \code 797 /// for (int i = 0; i < 7; ++i) // Canonical loop "i" 798 /// for (int j = 0; j < 9; ++j) // Canonical loop "j" 799 /// body(i, j); 800 /// \endcode 801 /// 802 /// After collapsing with Loops={i,j}, the loop is changed to 803 /// \code 804 /// for (int ij = 0; ij < 63; ++ij) { 805 /// int i = ij / 9; 806 /// int j = ij % 9; 807 /// body(i, j); 808 /// } 809 /// \endcode 810 /// 811 /// In the current implementation, the following limitations apply: 812 /// 813 /// * All input loops have an induction variable of the same type. 814 /// 815 /// * The collapsed loop will have the same trip count integer type as the 816 /// input loops. Therefore it is possible that the collapsed loop cannot 817 /// represent all iterations of the input loops. For instance, assuming a 818 /// 32 bit integer type, and two input loops both iterating 2^16 times, the 819 /// theoretical trip count of the collapsed loop would be 2^32 iteration, 820 /// which cannot be represented in an 32-bit integer. Behavior is undefined 821 /// in this case. 822 /// 823 /// * The trip counts of every input loop must be available at \p ComputeIP. 824 /// Non-rectangular loops are not yet supported. 825 /// 826 /// * At each nest level, code between a surrounding loop and its nested loop 827 /// is hoisted into the loop body, and such code will be executed more 828 /// often than before collapsing (or not at all if any inner loop iteration 829 /// has a trip count of 0). This is permitted by the OpenMP specification. 830 /// 831 /// \param DL Debug location for instructions added for collapsing, 832 /// such as instructions to compute/derive the input loop's 833 /// induction variables. 834 /// \param Loops Loops in the loop nest to collapse. Loops are specified 835 /// from outermost-to-innermost and every control flow of a 836 /// loop's body must pass through its directly nested loop. 837 /// \param ComputeIP Where additional instruction that compute the collapsed 838 /// trip count. If not set, defaults to before the generated 839 /// loop. 840 /// 841 /// \returns The CanonicalLoopInfo object representing the collapsed loop. 842 CanonicalLoopInfo *collapseLoops(DebugLoc DL, 843 ArrayRef<CanonicalLoopInfo *> Loops, 844 InsertPointTy ComputeIP); 845 846 /// Get the default alignment value for given target 847 /// 848 /// \param TargetTriple Target triple 849 /// \param Features StringMap which describes extra CPU features 850 static unsigned getOpenMPDefaultSimdAlign(const Triple &TargetTriple, 851 const StringMap<bool> &Features); 852 853 /// Retrieve (or create if non-existent) the address of a declare 854 /// target variable, used in conjunction with registerTargetGlobalVariable 855 /// to create declare target global variables. 856 /// 857 /// \param CaptureClause - enumerator corresponding to the OpenMP capture 858 /// clause used in conjunction with the variable being registered (link, 859 /// to, enter). 860 /// \param DeviceClause - enumerator corresponding to the OpenMP capture 861 /// clause used in conjunction with the variable being registered (nohost, 862 /// host, any) 863 /// \param IsDeclaration - boolean stating if the variable being registered 864 /// is a declaration-only and not a definition 865 /// \param IsExternallyVisible - boolean stating if the variable is externally 866 /// visible 867 /// \param EntryInfo - Unique entry information for the value generated 868 /// using getTargetEntryUniqueInfo, used to name generated pointer references 869 /// to the declare target variable 870 /// \param MangledName - the mangled name of the variable being registered 871 /// \param GeneratedRefs - references generated by invocations of 872 /// registerTargetGlobalVariable invoked from getAddrOfDeclareTargetVar, 873 /// these are required by Clang for book keeping. 874 /// \param OpenMPSIMD - if OpenMP SIMD mode is currently enabled 875 /// \param TargetTriple - The OpenMP device target triple we are compiling 876 /// for 877 /// \param LlvmPtrTy - The type of the variable we are generating or 878 /// retrieving an address for 879 /// \param GlobalInitializer - a lambda function which creates a constant 880 /// used for initializing a pointer reference to the variable in certain 881 /// cases. If a nullptr is passed, it will default to utilising the original 882 /// variable to initialize the pointer reference. 883 /// \param VariableLinkage - a lambda function which returns the variables 884 /// linkage type, if unspecified and a nullptr is given, it will instead 885 /// utilise the linkage stored on the existing global variable in the 886 /// LLVMModule. 887 Constant *getAddrOfDeclareTargetVar( 888 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause, 889 OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause, 890 bool IsDeclaration, bool IsExternallyVisible, 891 TargetRegionEntryInfo EntryInfo, StringRef MangledName, 892 std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD, 893 std::vector<Triple> TargetTriple, Type *LlvmPtrTy, 894 std::function<Constant *()> GlobalInitializer, 895 std::function<GlobalValue::LinkageTypes()> VariableLinkage); 896 897 /// Registers a target variable for device or host. 898 /// 899 /// \param CaptureClause - enumerator corresponding to the OpenMP capture 900 /// clause used in conjunction with the variable being registered (link, 901 /// to, enter). 902 /// \param DeviceClause - enumerator corresponding to the OpenMP capture 903 /// clause used in conjunction with the variable being registered (nohost, 904 /// host, any) 905 /// \param IsDeclaration - boolean stating if the variable being registered 906 /// is a declaration-only and not a definition 907 /// \param IsExternallyVisible - boolean stating if the variable is externally 908 /// visible 909 /// \param EntryInfo - Unique entry information for the value generated 910 /// using getTargetEntryUniqueInfo, used to name generated pointer references 911 /// to the declare target variable 912 /// \param MangledName - the mangled name of the variable being registered 913 /// \param GeneratedRefs - references generated by invocations of 914 /// registerTargetGlobalVariable these are required by Clang for book 915 /// keeping. 916 /// \param OpenMPSIMD - if OpenMP SIMD mode is currently enabled 917 /// \param TargetTriple - The OpenMP device target triple we are compiling 918 /// for 919 /// \param GlobalInitializer - a lambda function which creates a constant 920 /// used for initializing a pointer reference to the variable in certain 921 /// cases. If a nullptr is passed, it will default to utilising the original 922 /// variable to initialize the pointer reference. 923 /// \param VariableLinkage - a lambda function which returns the variables 924 /// linkage type, if unspecified and a nullptr is given, it will instead 925 /// utilise the linkage stored on the existing global variable in the 926 /// LLVMModule. 927 /// \param LlvmPtrTy - The type of the variable we are generating or 928 /// retrieving an address for 929 /// \param Addr - the original llvm value (addr) of the variable to be 930 /// registered 931 void registerTargetGlobalVariable( 932 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause, 933 OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause, 934 bool IsDeclaration, bool IsExternallyVisible, 935 TargetRegionEntryInfo EntryInfo, StringRef MangledName, 936 std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD, 937 std::vector<Triple> TargetTriple, 938 std::function<Constant *()> GlobalInitializer, 939 std::function<GlobalValue::LinkageTypes()> VariableLinkage, 940 Type *LlvmPtrTy, Constant *Addr); 941 942 /// Get the offset of the OMP_MAP_MEMBER_OF field. 943 unsigned getFlagMemberOffset(); 944 945 /// Get OMP_MAP_MEMBER_OF flag with extra bits reserved based on 946 /// the position given. 947 /// \param Position - A value indicating the position of the parent 948 /// of the member in the kernel argument structure, often retrieved 949 /// by the parents position in the combined information vectors used 950 /// to generate the structure itself. Multiple children (member's of) 951 /// with the same parent will use the same returned member flag. 952 omp::OpenMPOffloadMappingFlags getMemberOfFlag(unsigned Position); 953 954 /// Given an initial flag set, this function modifies it to contain 955 /// the passed in MemberOfFlag generated from the getMemberOfFlag 956 /// function. The results are dependent on the existing flag bits 957 /// set in the original flag set. 958 /// \param Flags - The original set of flags to be modified with the 959 /// passed in MemberOfFlag. 960 /// \param MemberOfFlag - A modified OMP_MAP_MEMBER_OF flag, adjusted 961 /// slightly based on the getMemberOfFlag which adjusts the flag bits 962 /// based on the members position in its parent. 963 void setCorrectMemberOfFlag(omp::OpenMPOffloadMappingFlags &Flags, 964 omp::OpenMPOffloadMappingFlags MemberOfFlag); 965 966 private: 967 /// Modifies the canonical loop to be a statically-scheduled workshare loop 968 /// which is executed on the device 969 /// 970 /// This takes a \p CLI representing a canonical loop, such as the one 971 /// created by \see createCanonicalLoop and emits additional instructions to 972 /// turn it into a workshare loop. In particular, it calls to an OpenMP 973 /// runtime function in the preheader to call OpenMP device rtl function 974 /// which handles worksharing of loop body interations. 975 /// 976 /// \param DL Debug location for instructions added for the 977 /// workshare-loop construct itself. 978 /// \param CLI A descriptor of the canonical loop to workshare. 979 /// \param AllocaIP An insertion point for Alloca instructions usable in the 980 /// preheader of the loop. 981 /// \param LoopType Information about type of loop worksharing. 982 /// It corresponds to type of loop workshare OpenMP pragma. 983 /// 984 /// \returns Point where to insert code after the workshare construct. 985 InsertPointTy applyWorkshareLoopTarget(DebugLoc DL, CanonicalLoopInfo *CLI, 986 InsertPointTy AllocaIP, 987 omp::WorksharingLoopType LoopType); 988 989 /// Modifies the canonical loop to be a statically-scheduled workshare loop. 990 /// 991 /// This takes a \p LoopInfo representing a canonical loop, such as the one 992 /// created by \p createCanonicalLoop and emits additional instructions to 993 /// turn it into a workshare loop. In particular, it calls to an OpenMP 994 /// runtime function in the preheader to obtain the loop bounds to be used in 995 /// the current thread, updates the relevant instructions in the canonical 996 /// loop and calls to an OpenMP runtime finalization function after the loop. 997 /// 998 /// \param DL Debug location for instructions added for the 999 /// workshare-loop construct itself. 1000 /// \param CLI A descriptor of the canonical loop to workshare. 1001 /// \param AllocaIP An insertion point for Alloca instructions usable in the 1002 /// preheader of the loop. 1003 /// \param NeedsBarrier Indicates whether a barrier must be inserted after 1004 /// the loop. 1005 /// 1006 /// \returns Point where to insert code after the workshare construct. 1007 InsertPointOrErrorTy applyStaticWorkshareLoop(DebugLoc DL, 1008 CanonicalLoopInfo *CLI, 1009 InsertPointTy AllocaIP, 1010 bool NeedsBarrier); 1011 1012 /// Modifies the canonical loop a statically-scheduled workshare loop with a 1013 /// user-specified chunk size. 1014 /// 1015 /// \param DL Debug location for instructions added for the 1016 /// workshare-loop construct itself. 1017 /// \param CLI A descriptor of the canonical loop to workshare. 1018 /// \param AllocaIP An insertion point for Alloca instructions usable in 1019 /// the preheader of the loop. 1020 /// \param NeedsBarrier Indicates whether a barrier must be inserted after the 1021 /// loop. 1022 /// \param ChunkSize The user-specified chunk size. 1023 /// 1024 /// \returns Point where to insert code after the workshare construct. 1025 InsertPointOrErrorTy applyStaticChunkedWorkshareLoop(DebugLoc DL, 1026 CanonicalLoopInfo *CLI, 1027 InsertPointTy AllocaIP, 1028 bool NeedsBarrier, 1029 Value *ChunkSize); 1030 1031 /// Modifies the canonical loop to be a dynamically-scheduled workshare loop. 1032 /// 1033 /// This takes a \p LoopInfo representing a canonical loop, such as the one 1034 /// created by \p createCanonicalLoop and emits additional instructions to 1035 /// turn it into a workshare loop. In particular, it calls to an OpenMP 1036 /// runtime function in the preheader to obtain, and then in each iteration 1037 /// to update the loop counter. 1038 /// 1039 /// \param DL Debug location for instructions added for the 1040 /// workshare-loop construct itself. 1041 /// \param CLI A descriptor of the canonical loop to workshare. 1042 /// \param AllocaIP An insertion point for Alloca instructions usable in the 1043 /// preheader of the loop. 1044 /// \param SchedType Type of scheduling to be passed to the init function. 1045 /// \param NeedsBarrier Indicates whether a barrier must be insterted after 1046 /// the loop. 1047 /// \param Chunk The size of loop chunk considered as a unit when 1048 /// scheduling. If \p nullptr, defaults to 1. 1049 /// 1050 /// \returns Point where to insert code after the workshare construct. 1051 InsertPointOrErrorTy applyDynamicWorkshareLoop(DebugLoc DL, 1052 CanonicalLoopInfo *CLI, 1053 InsertPointTy AllocaIP, 1054 omp::OMPScheduleType SchedType, 1055 bool NeedsBarrier, 1056 Value *Chunk = nullptr); 1057 1058 /// Create alternative version of the loop to support if clause 1059 /// 1060 /// OpenMP if clause can require to generate second loop. This loop 1061 /// will be executed when if clause condition is not met. createIfVersion 1062 /// adds branch instruction to the copied loop if \p ifCond is not met. 1063 /// 1064 /// \param Loop Original loop which should be versioned. 1065 /// \param IfCond Value which corresponds to if clause condition 1066 /// \param VMap Value to value map to define relation between 1067 /// original and copied loop values and loop blocks. 1068 /// \param NamePrefix Optional name prefix for if.then if.else blocks. 1069 void createIfVersion(CanonicalLoopInfo *Loop, Value *IfCond, 1070 ValueToValueMapTy &VMap, const Twine &NamePrefix = ""); 1071 1072 public: 1073 /// Modifies the canonical loop to be a workshare loop. 1074 /// 1075 /// This takes a \p LoopInfo representing a canonical loop, such as the one 1076 /// created by \p createCanonicalLoop and emits additional instructions to 1077 /// turn it into a workshare loop. In particular, it calls to an OpenMP 1078 /// runtime function in the preheader to obtain the loop bounds to be used in 1079 /// the current thread, updates the relevant instructions in the canonical 1080 /// loop and calls to an OpenMP runtime finalization function after the loop. 1081 /// 1082 /// The concrete transformation is done by applyStaticWorkshareLoop, 1083 /// applyStaticChunkedWorkshareLoop, or applyDynamicWorkshareLoop, depending 1084 /// on the value of \p SchedKind and \p ChunkSize. 1085 /// 1086 /// \param DL Debug location for instructions added for the 1087 /// workshare-loop construct itself. 1088 /// \param CLI A descriptor of the canonical loop to workshare. 1089 /// \param AllocaIP An insertion point for Alloca instructions usable in the 1090 /// preheader of the loop. 1091 /// \param NeedsBarrier Indicates whether a barrier must be insterted after 1092 /// the loop. 1093 /// \param SchedKind Scheduling algorithm to use. 1094 /// \param ChunkSize The chunk size for the inner loop. 1095 /// \param HasSimdModifier Whether the simd modifier is present in the 1096 /// schedule clause. 1097 /// \param HasMonotonicModifier Whether the monotonic modifier is present in 1098 /// the schedule clause. 1099 /// \param HasNonmonotonicModifier Whether the nonmonotonic modifier is 1100 /// present in the schedule clause. 1101 /// \param HasOrderedClause Whether the (parameterless) ordered clause is 1102 /// present. 1103 /// \param LoopType Information about type of loop worksharing. 1104 /// It corresponds to type of loop workshare OpenMP pragma. 1105 /// 1106 /// \returns Point where to insert code after the workshare construct. 1107 InsertPointOrErrorTy applyWorkshareLoop( 1108 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP, 1109 bool NeedsBarrier, 1110 llvm::omp::ScheduleKind SchedKind = llvm::omp::OMP_SCHEDULE_Default, 1111 Value *ChunkSize = nullptr, bool HasSimdModifier = false, 1112 bool HasMonotonicModifier = false, bool HasNonmonotonicModifier = false, 1113 bool HasOrderedClause = false, 1114 omp::WorksharingLoopType LoopType = 1115 omp::WorksharingLoopType::ForStaticLoop); 1116 1117 /// Tile a loop nest. 1118 /// 1119 /// Tiles the loops of \p Loops by the tile sizes in \p TileSizes. Loops in 1120 /// \p/ Loops must be perfectly nested, from outermost to innermost loop 1121 /// (i.e. Loops.front() is the outermost loop). The trip count llvm::Value 1122 /// of every loop and every tile sizes must be usable in the outermost 1123 /// loop's preheader. This implies that the loop nest is rectangular. 1124 /// 1125 /// Example: 1126 /// \code 1127 /// for (int i = 0; i < 15; ++i) // Canonical loop "i" 1128 /// for (int j = 0; j < 14; ++j) // Canonical loop "j" 1129 /// body(i, j); 1130 /// \endcode 1131 /// 1132 /// After tiling with Loops={i,j} and TileSizes={5,7}, the loop is changed to 1133 /// \code 1134 /// for (int i1 = 0; i1 < 3; ++i1) 1135 /// for (int j1 = 0; j1 < 2; ++j1) 1136 /// for (int i2 = 0; i2 < 5; ++i2) 1137 /// for (int j2 = 0; j2 < 7; ++j2) 1138 /// body(i1*3+i2, j1*3+j2); 1139 /// \endcode 1140 /// 1141 /// The returned vector are the loops {i1,j1,i2,j2}. The loops i1 and j1 are 1142 /// referred to the floor, and the loops i2 and j2 are the tiles. Tiling also 1143 /// handles non-constant trip counts, non-constant tile sizes and trip counts 1144 /// that are not multiples of the tile size. In the latter case the tile loop 1145 /// of the last floor-loop iteration will have fewer iterations than specified 1146 /// as its tile size. 1147 /// 1148 /// 1149 /// @param DL Debug location for instructions added by tiling, for 1150 /// instance the floor- and tile trip count computation. 1151 /// @param Loops Loops to tile. The CanonicalLoopInfo objects are 1152 /// invalidated by this method, i.e. should not used after 1153 /// tiling. 1154 /// @param TileSizes For each loop in \p Loops, the tile size for that 1155 /// dimensions. 1156 /// 1157 /// \returns A list of generated loops. Contains twice as many loops as the 1158 /// input loop nest; the first half are the floor loops and the 1159 /// second half are the tile loops. 1160 std::vector<CanonicalLoopInfo *> 1161 tileLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops, 1162 ArrayRef<Value *> TileSizes); 1163 1164 /// Fully unroll a loop. 1165 /// 1166 /// Instead of unrolling the loop immediately (and duplicating its body 1167 /// instructions), it is deferred to LLVM's LoopUnrollPass by adding loop 1168 /// metadata. 1169 /// 1170 /// \param DL Debug location for instructions added by unrolling. 1171 /// \param Loop The loop to unroll. The loop will be invalidated. 1172 void unrollLoopFull(DebugLoc DL, CanonicalLoopInfo *Loop); 1173 1174 /// Fully or partially unroll a loop. How the loop is unrolled is determined 1175 /// using LLVM's LoopUnrollPass. 1176 /// 1177 /// \param DL Debug location for instructions added by unrolling. 1178 /// \param Loop The loop to unroll. The loop will be invalidated. 1179 void unrollLoopHeuristic(DebugLoc DL, CanonicalLoopInfo *Loop); 1180 1181 /// Partially unroll a loop. 1182 /// 1183 /// The CanonicalLoopInfo of the unrolled loop for use with chained 1184 /// loop-associated directive can be requested using \p UnrolledCLI. Not 1185 /// needing the CanonicalLoopInfo allows more efficient code generation by 1186 /// deferring the actual unrolling to the LoopUnrollPass using loop metadata. 1187 /// A loop-associated directive applied to the unrolled loop needs to know the 1188 /// new trip count which means that if using a heuristically determined unroll 1189 /// factor (\p Factor == 0), that factor must be computed immediately. We are 1190 /// using the same logic as the LoopUnrollPass to derived the unroll factor, 1191 /// but which assumes that some canonicalization has taken place (e.g. 1192 /// Mem2Reg, LICM, GVN, Inlining, etc.). That is, the heuristic will perform 1193 /// better when the unrolled loop's CanonicalLoopInfo is not needed. 1194 /// 1195 /// \param DL Debug location for instructions added by unrolling. 1196 /// \param Loop The loop to unroll. The loop will be invalidated. 1197 /// \param Factor The factor to unroll the loop by. A factor of 0 1198 /// indicates that a heuristic should be used to determine 1199 /// the unroll-factor. 1200 /// \param UnrolledCLI If non-null, receives the CanonicalLoopInfo of the 1201 /// partially unrolled loop. Otherwise, uses loop metadata 1202 /// to defer unrolling to the LoopUnrollPass. 1203 void unrollLoopPartial(DebugLoc DL, CanonicalLoopInfo *Loop, int32_t Factor, 1204 CanonicalLoopInfo **UnrolledCLI); 1205 1206 /// Add metadata to simd-ize a loop. If IfCond is not nullptr, the loop 1207 /// is cloned. The metadata which prevents vectorization is added to 1208 /// to the cloned loop. The cloned loop is executed when ifCond is evaluated 1209 /// to false. 1210 /// 1211 /// \param Loop The loop to simd-ize. 1212 /// \param AlignedVars The map which containts pairs of the pointer 1213 /// and its corresponding alignment. 1214 /// \param IfCond The value which corresponds to the if clause 1215 /// condition. 1216 /// \param Order The enum to map order clause. 1217 /// \param Simdlen The Simdlen length to apply to the simd loop. 1218 /// \param Safelen The Safelen length to apply to the simd loop. 1219 void applySimd(CanonicalLoopInfo *Loop, 1220 MapVector<Value *, Value *> AlignedVars, Value *IfCond, 1221 omp::OrderKind Order, ConstantInt *Simdlen, 1222 ConstantInt *Safelen); 1223 1224 /// Generator for '#omp flush' 1225 /// 1226 /// \param Loc The location where the flush directive was encountered 1227 void createFlush(const LocationDescription &Loc); 1228 1229 /// Generator for '#omp taskwait' 1230 /// 1231 /// \param Loc The location where the taskwait directive was encountered. 1232 void createTaskwait(const LocationDescription &Loc); 1233 1234 /// Generator for '#omp taskyield' 1235 /// 1236 /// \param Loc The location where the taskyield directive was encountered. 1237 void createTaskyield(const LocationDescription &Loc); 1238 1239 /// A struct to pack the relevant information for an OpenMP depend clause. 1240 struct DependData { 1241 omp::RTLDependenceKindTy DepKind = omp::RTLDependenceKindTy::DepUnknown; 1242 Type *DepValueType; 1243 Value *DepVal; 1244 explicit DependData() = default; 1245 DependData(omp::RTLDependenceKindTy DepKind, Type *DepValueType, 1246 Value *DepVal) 1247 : DepKind(DepKind), DepValueType(DepValueType), DepVal(DepVal) {} 1248 }; 1249 1250 /// Generator for `#omp task` 1251 /// 1252 /// \param Loc The location where the task construct was encountered. 1253 /// \param AllocaIP The insertion point to be used for alloca instructions. 1254 /// \param BodyGenCB Callback that will generate the region code. 1255 /// \param Tied True if the task is tied, false if the task is untied. 1256 /// \param Final i1 value which is `true` if the task is final, `false` if the 1257 /// task is not final. 1258 /// \param IfCondition i1 value. If it evaluates to `false`, an undeferred 1259 /// task is generated, and the encountering thread must 1260 /// suspend the current task region, for which execution 1261 /// cannot be resumed until execution of the structured 1262 /// block that is associated with the generated task is 1263 /// completed. 1264 /// \param EventHandle If present, signifies the event handle as part of 1265 /// the detach clause 1266 /// \param Mergeable If the given task is `mergeable` 1267 /// \param priority `priority-value' specifies the execution order of the 1268 /// tasks that is generated by the construct 1269 InsertPointOrErrorTy 1270 createTask(const LocationDescription &Loc, InsertPointTy AllocaIP, 1271 BodyGenCallbackTy BodyGenCB, bool Tied = true, 1272 Value *Final = nullptr, Value *IfCondition = nullptr, 1273 SmallVector<DependData> Dependencies = {}, bool Mergeable = false, 1274 Value *EventHandle = nullptr, Value *Priority = nullptr); 1275 1276 /// Generator for the taskgroup construct 1277 /// 1278 /// \param Loc The location where the taskgroup construct was encountered. 1279 /// \param AllocaIP The insertion point to be used for alloca instructions. 1280 /// \param BodyGenCB Callback that will generate the region code. 1281 InsertPointOrErrorTy createTaskgroup(const LocationDescription &Loc, 1282 InsertPointTy AllocaIP, 1283 BodyGenCallbackTy BodyGenCB); 1284 1285 using FileIdentifierInfoCallbackTy = 1286 std::function<std::tuple<std::string, uint64_t>()>; 1287 1288 /// Creates a unique info for a target entry when provided a filename and 1289 /// line number from. 1290 /// 1291 /// \param CallBack A callback function which should return filename the entry 1292 /// resides in as well as the line number for the target entry 1293 /// \param ParentName The name of the parent the target entry resides in, if 1294 /// any. 1295 static TargetRegionEntryInfo 1296 getTargetEntryUniqueInfo(FileIdentifierInfoCallbackTy CallBack, 1297 StringRef ParentName = ""); 1298 1299 /// Enum class for the RedctionGen CallBack type to be used. 1300 enum class ReductionGenCBKind { Clang, MLIR }; 1301 1302 /// ReductionGen CallBack for Clang 1303 /// 1304 /// \param CodeGenIP InsertPoint for CodeGen. 1305 /// \param Index Index of the ReductionInfo to generate code for. 1306 /// \param LHSPtr Optionally used by Clang to return the LHSPtr it used for 1307 /// codegen, used for fixup later. 1308 /// \param RHSPtr Optionally used by Clang to 1309 /// return the RHSPtr it used for codegen, used for fixup later. 1310 /// \param CurFn Optionally used by Clang to pass in the Current Function as 1311 /// Clang context may be old. 1312 using ReductionGenClangCBTy = 1313 std::function<InsertPointTy(InsertPointTy CodeGenIP, unsigned Index, 1314 Value **LHS, Value **RHS, Function *CurFn)>; 1315 1316 /// ReductionGen CallBack for MLIR 1317 /// 1318 /// \param CodeGenIP InsertPoint for CodeGen. 1319 /// \param LHS Pass in the LHS Value to be used for CodeGen. 1320 /// \param RHS Pass in the RHS Value to be used for CodeGen. 1321 using ReductionGenCBTy = std::function<InsertPointOrErrorTy( 1322 InsertPointTy CodeGenIP, Value *LHS, Value *RHS, Value *&Res)>; 1323 1324 /// Functions used to generate atomic reductions. Such functions take two 1325 /// Values representing pointers to LHS and RHS of the reduction, as well as 1326 /// the element type of these pointers. They are expected to atomically 1327 /// update the LHS to the reduced value. 1328 using ReductionGenAtomicCBTy = std::function<InsertPointOrErrorTy( 1329 InsertPointTy, Type *, Value *, Value *)>; 1330 1331 /// Enum class for reduction evaluation types scalar, complex and aggregate. 1332 enum class EvalKind { Scalar, Complex, Aggregate }; 1333 1334 /// Information about an OpenMP reduction. 1335 struct ReductionInfo { 1336 ReductionInfo(Type *ElementType, Value *Variable, Value *PrivateVariable, 1337 EvalKind EvaluationKind, ReductionGenCBTy ReductionGen, 1338 ReductionGenClangCBTy ReductionGenClang, 1339 ReductionGenAtomicCBTy AtomicReductionGen) 1340 : ElementType(ElementType), Variable(Variable), 1341 PrivateVariable(PrivateVariable), EvaluationKind(EvaluationKind), 1342 ReductionGen(ReductionGen), ReductionGenClang(ReductionGenClang), 1343 AtomicReductionGen(AtomicReductionGen) {} 1344 ReductionInfo(Value *PrivateVariable) 1345 : ElementType(nullptr), Variable(nullptr), 1346 PrivateVariable(PrivateVariable), EvaluationKind(EvalKind::Scalar), 1347 ReductionGen(), ReductionGenClang(), AtomicReductionGen() {} 1348 1349 /// Reduction element type, must match pointee type of variable. 1350 Type *ElementType; 1351 1352 /// Reduction variable of pointer type. 1353 Value *Variable; 1354 1355 /// Thread-private partial reduction variable. 1356 Value *PrivateVariable; 1357 1358 /// Reduction evaluation kind - scalar, complex or aggregate. 1359 EvalKind EvaluationKind; 1360 1361 /// Callback for generating the reduction body. The IR produced by this will 1362 /// be used to combine two values in a thread-safe context, e.g., under 1363 /// lock or within the same thread, and therefore need not be atomic. 1364 ReductionGenCBTy ReductionGen; 1365 1366 /// Clang callback for generating the reduction body. The IR produced by 1367 /// this will be used to combine two values in a thread-safe context, e.g., 1368 /// under lock or within the same thread, and therefore need not be atomic. 1369 ReductionGenClangCBTy ReductionGenClang; 1370 1371 /// Callback for generating the atomic reduction body, may be null. The IR 1372 /// produced by this will be used to atomically combine two values during 1373 /// reduction. If null, the implementation will use the non-atomic version 1374 /// along with the appropriate synchronization mechanisms. 1375 ReductionGenAtomicCBTy AtomicReductionGen; 1376 }; 1377 1378 enum class CopyAction : unsigned { 1379 // RemoteLaneToThread: Copy over a Reduce list from a remote lane in 1380 // the warp using shuffle instructions. 1381 RemoteLaneToThread, 1382 // ThreadCopy: Make a copy of a Reduce list on the thread's stack. 1383 ThreadCopy, 1384 }; 1385 1386 struct CopyOptionsTy { 1387 Value *RemoteLaneOffset = nullptr; 1388 Value *ScratchpadIndex = nullptr; 1389 Value *ScratchpadWidth = nullptr; 1390 }; 1391 1392 /// Supporting functions for Reductions CodeGen. 1393 private: 1394 /// Get the id of the current thread on the GPU. 1395 Value *getGPUThreadID(); 1396 1397 /// Get the GPU warp size. 1398 Value *getGPUWarpSize(); 1399 1400 /// Get the id of the warp in the block. 1401 /// We assume that the warp size is 32, which is always the case 1402 /// on the NVPTX device, to generate more efficient code. 1403 Value *getNVPTXWarpID(); 1404 1405 /// Get the id of the current lane in the Warp. 1406 /// We assume that the warp size is 32, which is always the case 1407 /// on the NVPTX device, to generate more efficient code. 1408 Value *getNVPTXLaneID(); 1409 1410 /// Cast value to the specified type. 1411 Value *castValueToType(InsertPointTy AllocaIP, Value *From, Type *ToType); 1412 1413 /// This function creates calls to one of two shuffle functions to copy 1414 /// variables between lanes in a warp. 1415 Value *createRuntimeShuffleFunction(InsertPointTy AllocaIP, Value *Element, 1416 Type *ElementType, Value *Offset); 1417 1418 /// Function to shuffle over the value from the remote lane. 1419 void shuffleAndStore(InsertPointTy AllocaIP, Value *SrcAddr, Value *DstAddr, 1420 Type *ElementType, Value *Offset, 1421 Type *ReductionArrayTy); 1422 1423 /// Emit instructions to copy a Reduce list, which contains partially 1424 /// aggregated values, in the specified direction. 1425 void emitReductionListCopy( 1426 InsertPointTy AllocaIP, CopyAction Action, Type *ReductionArrayTy, 1427 ArrayRef<ReductionInfo> ReductionInfos, Value *SrcBase, Value *DestBase, 1428 CopyOptionsTy CopyOptions = {nullptr, nullptr, nullptr}); 1429 1430 /// Emit a helper that reduces data across two OpenMP threads (lanes) 1431 /// in the same warp. It uses shuffle instructions to copy over data from 1432 /// a remote lane's stack. The reduction algorithm performed is specified 1433 /// by the fourth parameter. 1434 /// 1435 /// Algorithm Versions. 1436 /// Full Warp Reduce (argument value 0): 1437 /// This algorithm assumes that all 32 lanes are active and gathers 1438 /// data from these 32 lanes, producing a single resultant value. 1439 /// Contiguous Partial Warp Reduce (argument value 1): 1440 /// This algorithm assumes that only a *contiguous* subset of lanes 1441 /// are active. This happens for the last warp in a parallel region 1442 /// when the user specified num_threads is not an integer multiple of 1443 /// 32. This contiguous subset always starts with the zeroth lane. 1444 /// Partial Warp Reduce (argument value 2): 1445 /// This algorithm gathers data from any number of lanes at any position. 1446 /// All reduced values are stored in the lowest possible lane. The set 1447 /// of problems every algorithm addresses is a super set of those 1448 /// addressable by algorithms with a lower version number. Overhead 1449 /// increases as algorithm version increases. 1450 /// 1451 /// Terminology 1452 /// Reduce element: 1453 /// Reduce element refers to the individual data field with primitive 1454 /// data types to be combined and reduced across threads. 1455 /// Reduce list: 1456 /// Reduce list refers to a collection of local, thread-private 1457 /// reduce elements. 1458 /// Remote Reduce list: 1459 /// Remote Reduce list refers to a collection of remote (relative to 1460 /// the current thread) reduce elements. 1461 /// 1462 /// We distinguish between three states of threads that are important to 1463 /// the implementation of this function. 1464 /// Alive threads: 1465 /// Threads in a warp executing the SIMT instruction, as distinguished from 1466 /// threads that are inactive due to divergent control flow. 1467 /// Active threads: 1468 /// The minimal set of threads that has to be alive upon entry to this 1469 /// function. The computation is correct iff active threads are alive. 1470 /// Some threads are alive but they are not active because they do not 1471 /// contribute to the computation in any useful manner. Turning them off 1472 /// may introduce control flow overheads without any tangible benefits. 1473 /// Effective threads: 1474 /// In order to comply with the argument requirements of the shuffle 1475 /// function, we must keep all lanes holding data alive. But at most 1476 /// half of them perform value aggregation; we refer to this half of 1477 /// threads as effective. The other half is simply handing off their 1478 /// data. 1479 /// 1480 /// Procedure 1481 /// Value shuffle: 1482 /// In this step active threads transfer data from higher lane positions 1483 /// in the warp to lower lane positions, creating Remote Reduce list. 1484 /// Value aggregation: 1485 /// In this step, effective threads combine their thread local Reduce list 1486 /// with Remote Reduce list and store the result in the thread local 1487 /// Reduce list. 1488 /// Value copy: 1489 /// In this step, we deal with the assumption made by algorithm 2 1490 /// (i.e. contiguity assumption). When we have an odd number of lanes 1491 /// active, say 2k+1, only k threads will be effective and therefore k 1492 /// new values will be produced. However, the Reduce list owned by the 1493 /// (2k+1)th thread is ignored in the value aggregation. Therefore 1494 /// we copy the Reduce list from the (2k+1)th lane to (k+1)th lane so 1495 /// that the contiguity assumption still holds. 1496 /// 1497 /// \param ReductionInfos Array type containing the ReductionOps. 1498 /// \param ReduceFn The reduction function. 1499 /// \param FuncAttrs Optional param to specify any function attributes that 1500 /// need to be copied to the new function. 1501 /// 1502 /// \return The ShuffleAndReduce function. 1503 Function *emitShuffleAndReduceFunction( 1504 ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos, 1505 Function *ReduceFn, AttributeList FuncAttrs); 1506 1507 /// This function emits a helper that gathers Reduce lists from the first 1508 /// lane of every active warp to lanes in the first warp. 1509 /// 1510 /// void inter_warp_copy_func(void* reduce_data, num_warps) 1511 /// shared smem[warp_size]; 1512 /// For all data entries D in reduce_data: 1513 /// sync 1514 /// If (I am the first lane in each warp) 1515 /// Copy my local D to smem[warp_id] 1516 /// sync 1517 /// if (I am the first warp) 1518 /// Copy smem[thread_id] to my local D 1519 /// 1520 /// \param Loc The insert and source location description. 1521 /// \param ReductionInfos Array type containing the ReductionOps. 1522 /// \param FuncAttrs Optional param to specify any function attributes that 1523 /// need to be copied to the new function. 1524 /// 1525 /// \return The InterWarpCopy function. 1526 Expected<Function *> 1527 emitInterWarpCopyFunction(const LocationDescription &Loc, 1528 ArrayRef<ReductionInfo> ReductionInfos, 1529 AttributeList FuncAttrs); 1530 1531 /// This function emits a helper that copies all the reduction variables from 1532 /// the team into the provided global buffer for the reduction variables. 1533 /// 1534 /// void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data) 1535 /// For all data entries D in reduce_data: 1536 /// Copy local D to buffer.D[Idx] 1537 /// 1538 /// \param ReductionInfos Array type containing the ReductionOps. 1539 /// \param ReductionsBufferTy The StructTy for the reductions buffer. 1540 /// \param FuncAttrs Optional param to specify any function attributes that 1541 /// need to be copied to the new function. 1542 /// 1543 /// \return The ListToGlobalCopy function. 1544 Function *emitListToGlobalCopyFunction(ArrayRef<ReductionInfo> ReductionInfos, 1545 Type *ReductionsBufferTy, 1546 AttributeList FuncAttrs); 1547 1548 /// This function emits a helper that copies all the reduction variables from 1549 /// the team into the provided global buffer for the reduction variables. 1550 /// 1551 /// void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data) 1552 /// For all data entries D in reduce_data: 1553 /// Copy buffer.D[Idx] to local D; 1554 /// 1555 /// \param ReductionInfos Array type containing the ReductionOps. 1556 /// \param ReductionsBufferTy The StructTy for the reductions buffer. 1557 /// \param FuncAttrs Optional param to specify any function attributes that 1558 /// need to be copied to the new function. 1559 /// 1560 /// \return The GlobalToList function. 1561 Function *emitGlobalToListCopyFunction(ArrayRef<ReductionInfo> ReductionInfos, 1562 Type *ReductionsBufferTy, 1563 AttributeList FuncAttrs); 1564 1565 /// This function emits a helper that reduces all the reduction variables from 1566 /// the team into the provided global buffer for the reduction variables. 1567 /// 1568 /// void list_to_global_reduce_func(void *buffer, int Idx, void *reduce_data) 1569 /// void *GlobPtrs[]; 1570 /// GlobPtrs[0] = (void*)&buffer.D0[Idx]; 1571 /// ... 1572 /// GlobPtrs[N] = (void*)&buffer.DN[Idx]; 1573 /// reduce_function(GlobPtrs, reduce_data); 1574 /// 1575 /// \param ReductionInfos Array type containing the ReductionOps. 1576 /// \param ReduceFn The reduction function. 1577 /// \param ReductionsBufferTy The StructTy for the reductions buffer. 1578 /// \param FuncAttrs Optional param to specify any function attributes that 1579 /// need to be copied to the new function. 1580 /// 1581 /// \return The ListToGlobalReduce function. 1582 Function * 1583 emitListToGlobalReduceFunction(ArrayRef<ReductionInfo> ReductionInfos, 1584 Function *ReduceFn, Type *ReductionsBufferTy, 1585 AttributeList FuncAttrs); 1586 1587 /// This function emits a helper that reduces all the reduction variables from 1588 /// the team into the provided global buffer for the reduction variables. 1589 /// 1590 /// void global_to_list_reduce_func(void *buffer, int Idx, void *reduce_data) 1591 /// void *GlobPtrs[]; 1592 /// GlobPtrs[0] = (void*)&buffer.D0[Idx]; 1593 /// ... 1594 /// GlobPtrs[N] = (void*)&buffer.DN[Idx]; 1595 /// reduce_function(reduce_data, GlobPtrs); 1596 /// 1597 /// \param ReductionInfos Array type containing the ReductionOps. 1598 /// \param ReduceFn The reduction function. 1599 /// \param ReductionsBufferTy The StructTy for the reductions buffer. 1600 /// \param FuncAttrs Optional param to specify any function attributes that 1601 /// need to be copied to the new function. 1602 /// 1603 /// \return The GlobalToListReduce function. 1604 Function * 1605 emitGlobalToListReduceFunction(ArrayRef<ReductionInfo> ReductionInfos, 1606 Function *ReduceFn, Type *ReductionsBufferTy, 1607 AttributeList FuncAttrs); 1608 1609 /// Get the function name of a reduction function. 1610 std::string getReductionFuncName(StringRef Name) const; 1611 1612 /// Emits reduction function. 1613 /// \param ReducerName Name of the function calling the reduction. 1614 /// \param ReductionInfos Array type containing the ReductionOps. 1615 /// \param ReductionGenCBKind Optional param to specify Clang or MLIR 1616 /// CodeGenCB kind. 1617 /// \param FuncAttrs Optional param to specify any function attributes that 1618 /// need to be copied to the new function. 1619 /// 1620 /// \return The reduction function. 1621 Expected<Function *> createReductionFunction( 1622 StringRef ReducerName, ArrayRef<ReductionInfo> ReductionInfos, 1623 ReductionGenCBKind ReductionGenCBKind = ReductionGenCBKind::MLIR, 1624 AttributeList FuncAttrs = {}); 1625 1626 public: 1627 /// 1628 /// Design of OpenMP reductions on the GPU 1629 /// 1630 /// Consider a typical OpenMP program with one or more reduction 1631 /// clauses: 1632 /// 1633 /// float foo; 1634 /// double bar; 1635 /// #pragma omp target teams distribute parallel for \ 1636 /// reduction(+:foo) reduction(*:bar) 1637 /// for (int i = 0; i < N; i++) { 1638 /// foo += A[i]; bar *= B[i]; 1639 /// } 1640 /// 1641 /// where 'foo' and 'bar' are reduced across all OpenMP threads in 1642 /// all teams. In our OpenMP implementation on the NVPTX device an 1643 /// OpenMP team is mapped to a CUDA threadblock and OpenMP threads 1644 /// within a team are mapped to CUDA threads within a threadblock. 1645 /// Our goal is to efficiently aggregate values across all OpenMP 1646 /// threads such that: 1647 /// 1648 /// - the compiler and runtime are logically concise, and 1649 /// - the reduction is performed efficiently in a hierarchical 1650 /// manner as follows: within OpenMP threads in the same warp, 1651 /// across warps in a threadblock, and finally across teams on 1652 /// the NVPTX device. 1653 /// 1654 /// Introduction to Decoupling 1655 /// 1656 /// We would like to decouple the compiler and the runtime so that the 1657 /// latter is ignorant of the reduction variables (number, data types) 1658 /// and the reduction operators. This allows a simpler interface 1659 /// and implementation while still attaining good performance. 1660 /// 1661 /// Pseudocode for the aforementioned OpenMP program generated by the 1662 /// compiler is as follows: 1663 /// 1664 /// 1. Create private copies of reduction variables on each OpenMP 1665 /// thread: 'foo_private', 'bar_private' 1666 /// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned 1667 /// to it and writes the result in 'foo_private' and 'bar_private' 1668 /// respectively. 1669 /// 3. Call the OpenMP runtime on the GPU to reduce within a team 1670 /// and store the result on the team master: 1671 /// 1672 /// __kmpc_nvptx_parallel_reduce_nowait_v2(..., 1673 /// reduceData, shuffleReduceFn, interWarpCpyFn) 1674 /// 1675 /// where: 1676 /// struct ReduceData { 1677 /// double *foo; 1678 /// double *bar; 1679 /// } reduceData 1680 /// reduceData.foo = &foo_private 1681 /// reduceData.bar = &bar_private 1682 /// 1683 /// 'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two 1684 /// auxiliary functions generated by the compiler that operate on 1685 /// variables of type 'ReduceData'. They aid the runtime perform 1686 /// algorithmic steps in a data agnostic manner. 1687 /// 1688 /// 'shuffleReduceFn' is a pointer to a function that reduces data 1689 /// of type 'ReduceData' across two OpenMP threads (lanes) in the 1690 /// same warp. It takes the following arguments as input: 1691 /// 1692 /// a. variable of type 'ReduceData' on the calling lane, 1693 /// b. its lane_id, 1694 /// c. an offset relative to the current lane_id to generate a 1695 /// remote_lane_id. The remote lane contains the second 1696 /// variable of type 'ReduceData' that is to be reduced. 1697 /// d. an algorithm version parameter determining which reduction 1698 /// algorithm to use. 1699 /// 1700 /// 'shuffleReduceFn' retrieves data from the remote lane using 1701 /// efficient GPU shuffle intrinsics and reduces, using the 1702 /// algorithm specified by the 4th parameter, the two operands 1703 /// element-wise. The result is written to the first operand. 1704 /// 1705 /// Different reduction algorithms are implemented in different 1706 /// runtime functions, all calling 'shuffleReduceFn' to perform 1707 /// the essential reduction step. Therefore, based on the 4th 1708 /// parameter, this function behaves slightly differently to 1709 /// cooperate with the runtime to ensure correctness under 1710 /// different circumstances. 1711 /// 1712 /// 'InterWarpCpyFn' is a pointer to a function that transfers 1713 /// reduced variables across warps. It tunnels, through CUDA 1714 /// shared memory, the thread-private data of type 'ReduceData' 1715 /// from lane 0 of each warp to a lane in the first warp. 1716 /// 4. Call the OpenMP runtime on the GPU to reduce across teams. 1717 /// The last team writes the global reduced value to memory. 1718 /// 1719 /// ret = __kmpc_nvptx_teams_reduce_nowait(..., 1720 /// reduceData, shuffleReduceFn, interWarpCpyFn, 1721 /// scratchpadCopyFn, loadAndReduceFn) 1722 /// 1723 /// 'scratchpadCopyFn' is a helper that stores reduced 1724 /// data from the team master to a scratchpad array in 1725 /// global memory. 1726 /// 1727 /// 'loadAndReduceFn' is a helper that loads data from 1728 /// the scratchpad array and reduces it with the input 1729 /// operand. 1730 /// 1731 /// These compiler generated functions hide address 1732 /// calculation and alignment information from the runtime. 1733 /// 5. if ret == 1: 1734 /// The team master of the last team stores the reduced 1735 /// result to the globals in memory. 1736 /// foo += reduceData.foo; bar *= reduceData.bar 1737 /// 1738 /// 1739 /// Warp Reduction Algorithms 1740 /// 1741 /// On the warp level, we have three algorithms implemented in the 1742 /// OpenMP runtime depending on the number of active lanes: 1743 /// 1744 /// Full Warp Reduction 1745 /// 1746 /// The reduce algorithm within a warp where all lanes are active 1747 /// is implemented in the runtime as follows: 1748 /// 1749 /// full_warp_reduce(void *reduce_data, 1750 /// kmp_ShuffleReductFctPtr ShuffleReduceFn) { 1751 /// for (int offset = WARPSIZE/2; offset > 0; offset /= 2) 1752 /// ShuffleReduceFn(reduce_data, 0, offset, 0); 1753 /// } 1754 /// 1755 /// The algorithm completes in log(2, WARPSIZE) steps. 1756 /// 1757 /// 'ShuffleReduceFn' is used here with lane_id set to 0 because it is 1758 /// not used therefore we save instructions by not retrieving lane_id 1759 /// from the corresponding special registers. The 4th parameter, which 1760 /// represents the version of the algorithm being used, is set to 0 to 1761 /// signify full warp reduction. 1762 /// 1763 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows: 1764 /// 1765 /// #reduce_elem refers to an element in the local lane's data structure 1766 /// #remote_elem is retrieved from a remote lane 1767 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE); 1768 /// reduce_elem = reduce_elem REDUCE_OP remote_elem; 1769 /// 1770 /// Contiguous Partial Warp Reduction 1771 /// 1772 /// This reduce algorithm is used within a warp where only the first 1773 /// 'n' (n <= WARPSIZE) lanes are active. It is typically used when the 1774 /// number of OpenMP threads in a parallel region is not a multiple of 1775 /// WARPSIZE. The algorithm is implemented in the runtime as follows: 1776 /// 1777 /// void 1778 /// contiguous_partial_reduce(void *reduce_data, 1779 /// kmp_ShuffleReductFctPtr ShuffleReduceFn, 1780 /// int size, int lane_id) { 1781 /// int curr_size; 1782 /// int offset; 1783 /// curr_size = size; 1784 /// mask = curr_size/2; 1785 /// while (offset>0) { 1786 /// ShuffleReduceFn(reduce_data, lane_id, offset, 1); 1787 /// curr_size = (curr_size+1)/2; 1788 /// offset = curr_size/2; 1789 /// } 1790 /// } 1791 /// 1792 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows: 1793 /// 1794 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE); 1795 /// if (lane_id < offset) 1796 /// reduce_elem = reduce_elem REDUCE_OP remote_elem 1797 /// else 1798 /// reduce_elem = remote_elem 1799 /// 1800 /// This algorithm assumes that the data to be reduced are located in a 1801 /// contiguous subset of lanes starting from the first. When there is 1802 /// an odd number of active lanes, the data in the last lane is not 1803 /// aggregated with any other lane's dat but is instead copied over. 1804 /// 1805 /// Dispersed Partial Warp Reduction 1806 /// 1807 /// This algorithm is used within a warp when any discontiguous subset of 1808 /// lanes are active. It is used to implement the reduction operation 1809 /// across lanes in an OpenMP simd region or in a nested parallel region. 1810 /// 1811 /// void 1812 /// dispersed_partial_reduce(void *reduce_data, 1813 /// kmp_ShuffleReductFctPtr ShuffleReduceFn) { 1814 /// int size, remote_id; 1815 /// int logical_lane_id = number_of_active_lanes_before_me() * 2; 1816 /// do { 1817 /// remote_id = next_active_lane_id_right_after_me(); 1818 /// # the above function returns 0 of no active lane 1819 /// # is present right after the current lane. 1820 /// size = number_of_active_lanes_in_this_warp(); 1821 /// logical_lane_id /= 2; 1822 /// ShuffleReduceFn(reduce_data, logical_lane_id, 1823 /// remote_id-1-threadIdx.x, 2); 1824 /// } while (logical_lane_id % 2 == 0 && size > 1); 1825 /// } 1826 /// 1827 /// There is no assumption made about the initial state of the reduction. 1828 /// Any number of lanes (>=1) could be active at any position. The reduction 1829 /// result is returned in the first active lane. 1830 /// 1831 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows: 1832 /// 1833 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE); 1834 /// if (lane_id % 2 == 0 && offset > 0) 1835 /// reduce_elem = reduce_elem REDUCE_OP remote_elem 1836 /// else 1837 /// reduce_elem = remote_elem 1838 /// 1839 /// 1840 /// Intra-Team Reduction 1841 /// 1842 /// This function, as implemented in the runtime call 1843 /// '__kmpc_nvptx_parallel_reduce_nowait_v2', aggregates data across OpenMP 1844 /// threads in a team. It first reduces within a warp using the 1845 /// aforementioned algorithms. We then proceed to gather all such 1846 /// reduced values at the first warp. 1847 /// 1848 /// The runtime makes use of the function 'InterWarpCpyFn', which copies 1849 /// data from each of the "warp master" (zeroth lane of each warp, where 1850 /// warp-reduced data is held) to the zeroth warp. This step reduces (in 1851 /// a mathematical sense) the problem of reduction across warp masters in 1852 /// a block to the problem of warp reduction. 1853 /// 1854 /// 1855 /// Inter-Team Reduction 1856 /// 1857 /// Once a team has reduced its data to a single value, it is stored in 1858 /// a global scratchpad array. Since each team has a distinct slot, this 1859 /// can be done without locking. 1860 /// 1861 /// The last team to write to the scratchpad array proceeds to reduce the 1862 /// scratchpad array. One or more workers in the last team use the helper 1863 /// 'loadAndReduceDataFn' to load and reduce values from the array, i.e., 1864 /// the k'th worker reduces every k'th element. 1865 /// 1866 /// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait_v2' to 1867 /// reduce across workers and compute a globally reduced value. 1868 /// 1869 /// \param Loc The location where the reduction was 1870 /// encountered. Must be within the associate 1871 /// directive and after the last local access to the 1872 /// reduction variables. 1873 /// \param AllocaIP An insertion point suitable for allocas usable 1874 /// in reductions. 1875 /// \param CodeGenIP An insertion point suitable for code 1876 /// generation. \param ReductionInfos A list of info on each reduction 1877 /// variable. \param IsNoWait Optional flag set if the reduction is 1878 /// marked as 1879 /// nowait. 1880 /// \param IsTeamsReduction Optional flag set if it is a teams 1881 /// reduction. 1882 /// \param HasDistribute Optional flag set if it is a 1883 /// distribute reduction. 1884 /// \param GridValue Optional GPU grid value. 1885 /// \param ReductionBufNum Optional OpenMPCUDAReductionBufNumValue to be 1886 /// used for teams reduction. 1887 /// \param SrcLocInfo Source location information global. 1888 InsertPointOrErrorTy createReductionsGPU( 1889 const LocationDescription &Loc, InsertPointTy AllocaIP, 1890 InsertPointTy CodeGenIP, ArrayRef<ReductionInfo> ReductionInfos, 1891 bool IsNoWait = false, bool IsTeamsReduction = false, 1892 bool HasDistribute = false, 1893 ReductionGenCBKind ReductionGenCBKind = ReductionGenCBKind::MLIR, 1894 std::optional<omp::GV> GridValue = {}, unsigned ReductionBufNum = 1024, 1895 Value *SrcLocInfo = nullptr); 1896 1897 // TODO: provide atomic and non-atomic reduction generators for reduction 1898 // operators defined by the OpenMP specification. 1899 1900 /// Generator for '#omp reduction'. 1901 /// 1902 /// Emits the IR instructing the runtime to perform the specific kind of 1903 /// reductions. Expects reduction variables to have been privatized and 1904 /// initialized to reduction-neutral values separately. Emits the calls to 1905 /// runtime functions as well as the reduction function and the basic blocks 1906 /// performing the reduction atomically and non-atomically. 1907 /// 1908 /// The code emitted for the following: 1909 /// 1910 /// \code 1911 /// type var_1; 1912 /// type var_2; 1913 /// #pragma omp <directive> reduction(reduction-op:var_1,var_2) 1914 /// /* body */; 1915 /// \endcode 1916 /// 1917 /// corresponds to the following sketch. 1918 /// 1919 /// \code 1920 /// void _outlined_par() { 1921 /// // N is the number of different reductions. 1922 /// void *red_array[] = {privatized_var_1, privatized_var_2, ...}; 1923 /// switch(__kmpc_reduce(..., N, /*size of data in red array*/, red_array, 1924 /// _omp_reduction_func, 1925 /// _gomp_critical_user.reduction.var)) { 1926 /// case 1: { 1927 /// var_1 = var_1 <reduction-op> privatized_var_1; 1928 /// var_2 = var_2 <reduction-op> privatized_var_2; 1929 /// // ... 1930 /// __kmpc_end_reduce(...); 1931 /// break; 1932 /// } 1933 /// case 2: { 1934 /// _Atomic<ReductionOp>(var_1, privatized_var_1); 1935 /// _Atomic<ReductionOp>(var_2, privatized_var_2); 1936 /// // ... 1937 /// break; 1938 /// } 1939 /// default: break; 1940 /// } 1941 /// } 1942 /// 1943 /// void _omp_reduction_func(void **lhs, void **rhs) { 1944 /// *(type *)lhs[0] = *(type *)lhs[0] <reduction-op> *(type *)rhs[0]; 1945 /// *(type *)lhs[1] = *(type *)lhs[1] <reduction-op> *(type *)rhs[1]; 1946 /// // ... 1947 /// } 1948 /// \endcode 1949 /// 1950 /// \param Loc The location where the reduction was 1951 /// encountered. Must be within the associate 1952 /// directive and after the last local access to the 1953 /// reduction variables. 1954 /// \param AllocaIP An insertion point suitable for allocas usable 1955 /// in reductions. 1956 /// \param ReductionInfos A list of info on each reduction variable. 1957 /// \param IsNoWait A flag set if the reduction is marked as nowait. 1958 /// \param IsByRef A flag set if the reduction is using reference 1959 /// or direct value. 1960 InsertPointOrErrorTy createReductions(const LocationDescription &Loc, 1961 InsertPointTy AllocaIP, 1962 ArrayRef<ReductionInfo> ReductionInfos, 1963 ArrayRef<bool> IsByRef, 1964 bool IsNoWait = false); 1965 1966 ///} 1967 1968 /// Return the insertion point used by the underlying IRBuilder. 1969 InsertPointTy getInsertionPoint() { return Builder.saveIP(); } 1970 1971 /// Update the internal location to \p Loc. 1972 bool updateToLocation(const LocationDescription &Loc) { 1973 Builder.restoreIP(Loc.IP); 1974 Builder.SetCurrentDebugLocation(Loc.DL); 1975 return Loc.IP.getBlock() != nullptr; 1976 } 1977 1978 /// Return the function declaration for the runtime function with \p FnID. 1979 FunctionCallee getOrCreateRuntimeFunction(Module &M, 1980 omp::RuntimeFunction FnID); 1981 1982 Function *getOrCreateRuntimeFunctionPtr(omp::RuntimeFunction FnID); 1983 1984 /// Return the (LLVM-IR) string describing the source location \p LocStr. 1985 Constant *getOrCreateSrcLocStr(StringRef LocStr, uint32_t &SrcLocStrSize); 1986 1987 /// Return the (LLVM-IR) string describing the default source location. 1988 Constant *getOrCreateDefaultSrcLocStr(uint32_t &SrcLocStrSize); 1989 1990 /// Return the (LLVM-IR) string describing the source location identified by 1991 /// the arguments. 1992 Constant *getOrCreateSrcLocStr(StringRef FunctionName, StringRef FileName, 1993 unsigned Line, unsigned Column, 1994 uint32_t &SrcLocStrSize); 1995 1996 /// Return the (LLVM-IR) string describing the DebugLoc \p DL. Use \p F as 1997 /// fallback if \p DL does not specify the function name. 1998 Constant *getOrCreateSrcLocStr(DebugLoc DL, uint32_t &SrcLocStrSize, 1999 Function *F = nullptr); 2000 2001 /// Return the (LLVM-IR) string describing the source location \p Loc. 2002 Constant *getOrCreateSrcLocStr(const LocationDescription &Loc, 2003 uint32_t &SrcLocStrSize); 2004 2005 /// Return an ident_t* encoding the source location \p SrcLocStr and \p Flags. 2006 /// TODO: Create a enum class for the Reserve2Flags 2007 Constant *getOrCreateIdent(Constant *SrcLocStr, uint32_t SrcLocStrSize, 2008 omp::IdentFlag Flags = omp::IdentFlag(0), 2009 unsigned Reserve2Flags = 0); 2010 2011 /// Create a hidden global flag \p Name in the module with initial value \p 2012 /// Value. 2013 GlobalValue *createGlobalFlag(unsigned Value, StringRef Name); 2014 2015 /// Emit the llvm.used metadata. 2016 void emitUsed(StringRef Name, ArrayRef<llvm::WeakTrackingVH> List); 2017 2018 /// Emit the kernel execution mode. 2019 GlobalVariable *emitKernelExecutionMode(StringRef KernelName, 2020 omp::OMPTgtExecModeFlags Mode); 2021 2022 /// Generate control flow and cleanup for cancellation. 2023 /// 2024 /// \param CancelFlag Flag indicating if the cancellation is performed. 2025 /// \param CanceledDirective The kind of directive that is cancled. 2026 /// \param ExitCB Extra code to be generated in the exit block. 2027 /// 2028 /// \return an error, if any were triggered during execution. 2029 Error emitCancelationCheckImpl(Value *CancelFlag, 2030 omp::Directive CanceledDirective, 2031 FinalizeCallbackTy ExitCB = {}); 2032 2033 /// Generate a target region entry call. 2034 /// 2035 /// \param Loc The location at which the request originated and is fulfilled. 2036 /// \param AllocaIP The insertion point to be used for alloca instructions. 2037 /// \param Return Return value of the created function returned by reference. 2038 /// \param DeviceID Identifier for the device via the 'device' clause. 2039 /// \param NumTeams Numer of teams for the region via the 'num_teams' clause 2040 /// or 0 if unspecified and -1 if there is no 'teams' clause. 2041 /// \param NumThreads Number of threads via the 'thread_limit' clause. 2042 /// \param HostPtr Pointer to the host-side pointer of the target kernel. 2043 /// \param KernelArgs Array of arguments to the kernel. 2044 InsertPointTy emitTargetKernel(const LocationDescription &Loc, 2045 InsertPointTy AllocaIP, Value *&Return, 2046 Value *Ident, Value *DeviceID, Value *NumTeams, 2047 Value *NumThreads, Value *HostPtr, 2048 ArrayRef<Value *> KernelArgs); 2049 2050 /// Generate a flush runtime call. 2051 /// 2052 /// \param Loc The location at which the request originated and is fulfilled. 2053 void emitFlush(const LocationDescription &Loc); 2054 2055 /// The finalization stack made up of finalize callbacks currently in-flight, 2056 /// wrapped into FinalizationInfo objects that reference also the finalization 2057 /// target block and the kind of cancellable directive. 2058 SmallVector<FinalizationInfo, 8> FinalizationStack; 2059 2060 /// Return true if the last entry in the finalization stack is of kind \p DK 2061 /// and cancellable. 2062 bool isLastFinalizationInfoCancellable(omp::Directive DK) { 2063 return !FinalizationStack.empty() && 2064 FinalizationStack.back().IsCancellable && 2065 FinalizationStack.back().DK == DK; 2066 } 2067 2068 /// Generate a taskwait runtime call. 2069 /// 2070 /// \param Loc The location at which the request originated and is fulfilled. 2071 void emitTaskwaitImpl(const LocationDescription &Loc); 2072 2073 /// Generate a taskyield runtime call. 2074 /// 2075 /// \param Loc The location at which the request originated and is fulfilled. 2076 void emitTaskyieldImpl(const LocationDescription &Loc); 2077 2078 /// Return the current thread ID. 2079 /// 2080 /// \param Ident The ident (ident_t*) describing the query origin. 2081 Value *getOrCreateThreadID(Value *Ident); 2082 2083 /// The OpenMPIRBuilder Configuration 2084 OpenMPIRBuilderConfig Config; 2085 2086 /// The underlying LLVM-IR module 2087 Module &M; 2088 2089 /// The LLVM-IR Builder used to create IR. 2090 IRBuilder<> Builder; 2091 2092 /// Map to remember source location strings 2093 StringMap<Constant *> SrcLocStrMap; 2094 2095 /// Map to remember existing ident_t*. 2096 DenseMap<std::pair<Constant *, uint64_t>, Constant *> IdentMap; 2097 2098 /// Info manager to keep track of target regions. 2099 OffloadEntriesInfoManager OffloadInfoManager; 2100 2101 /// The target triple of the underlying module. 2102 const Triple T; 2103 2104 /// Helper that contains information about regions we need to outline 2105 /// during finalization. 2106 struct OutlineInfo { 2107 using PostOutlineCBTy = std::function<void(Function &)>; 2108 PostOutlineCBTy PostOutlineCB; 2109 BasicBlock *EntryBB, *ExitBB, *OuterAllocaBB; 2110 SmallVector<Value *, 2> ExcludeArgsFromAggregate; 2111 2112 /// Collect all blocks in between EntryBB and ExitBB in both the given 2113 /// vector and set. 2114 void collectBlocks(SmallPtrSetImpl<BasicBlock *> &BlockSet, 2115 SmallVectorImpl<BasicBlock *> &BlockVector); 2116 2117 /// Return the function that contains the region to be outlined. 2118 Function *getFunction() const { return EntryBB->getParent(); } 2119 }; 2120 2121 /// Collection of regions that need to be outlined during finalization. 2122 SmallVector<OutlineInfo, 16> OutlineInfos; 2123 2124 /// A collection of candidate target functions that's constant allocas will 2125 /// attempt to be raised on a call of finalize after all currently enqueued 2126 /// outline info's have been processed. 2127 SmallVector<llvm::Function *, 16> ConstantAllocaRaiseCandidates; 2128 2129 /// Collection of owned canonical loop objects that eventually need to be 2130 /// free'd. 2131 std::forward_list<CanonicalLoopInfo> LoopInfos; 2132 2133 /// Add a new region that will be outlined later. 2134 void addOutlineInfo(OutlineInfo &&OI) { OutlineInfos.emplace_back(OI); } 2135 2136 /// An ordered map of auto-generated variables to their unique names. 2137 /// It stores variables with the following names: 1) ".gomp_critical_user_" + 2138 /// <critical_section_name> + ".var" for "omp critical" directives; 2) 2139 /// <mangled_name_for_global_var> + ".cache." for cache for threadprivate 2140 /// variables. 2141 StringMap<GlobalVariable *, BumpPtrAllocator> InternalVars; 2142 2143 /// Computes the size of type in bytes. 2144 Value *getSizeInBytes(Value *BasePtr); 2145 2146 // Emit a branch from the current block to the Target block only if 2147 // the current block has a terminator. 2148 void emitBranch(BasicBlock *Target); 2149 2150 // If BB has no use then delete it and return. Else place BB after the current 2151 // block, if possible, or else at the end of the function. Also add a branch 2152 // from current block to BB if current block does not have a terminator. 2153 void emitBlock(BasicBlock *BB, Function *CurFn, bool IsFinished = false); 2154 2155 /// Emits code for OpenMP 'if' clause using specified \a BodyGenCallbackTy 2156 /// Here is the logic: 2157 /// if (Cond) { 2158 /// ThenGen(); 2159 /// } else { 2160 /// ElseGen(); 2161 /// } 2162 /// 2163 /// \return an error, if any were triggered during execution. 2164 Error emitIfClause(Value *Cond, BodyGenCallbackTy ThenGen, 2165 BodyGenCallbackTy ElseGen, InsertPointTy AllocaIP = {}); 2166 2167 /// Create the global variable holding the offload mappings information. 2168 GlobalVariable *createOffloadMaptypes(SmallVectorImpl<uint64_t> &Mappings, 2169 std::string VarName); 2170 2171 /// Create the global variable holding the offload names information. 2172 GlobalVariable * 2173 createOffloadMapnames(SmallVectorImpl<llvm::Constant *> &Names, 2174 std::string VarName); 2175 2176 struct MapperAllocas { 2177 AllocaInst *ArgsBase = nullptr; 2178 AllocaInst *Args = nullptr; 2179 AllocaInst *ArgSizes = nullptr; 2180 }; 2181 2182 /// Create the allocas instruction used in call to mapper functions. 2183 void createMapperAllocas(const LocationDescription &Loc, 2184 InsertPointTy AllocaIP, unsigned NumOperands, 2185 struct MapperAllocas &MapperAllocas); 2186 2187 /// Create the call for the target mapper function. 2188 /// \param Loc The source location description. 2189 /// \param MapperFunc Function to be called. 2190 /// \param SrcLocInfo Source location information global. 2191 /// \param MaptypesArg The argument types. 2192 /// \param MapnamesArg The argument names. 2193 /// \param MapperAllocas The AllocaInst used for the call. 2194 /// \param DeviceID Device ID for the call. 2195 /// \param NumOperands Number of operands in the call. 2196 void emitMapperCall(const LocationDescription &Loc, Function *MapperFunc, 2197 Value *SrcLocInfo, Value *MaptypesArg, Value *MapnamesArg, 2198 struct MapperAllocas &MapperAllocas, int64_t DeviceID, 2199 unsigned NumOperands); 2200 2201 /// Container for the arguments used to pass data to the runtime library. 2202 struct TargetDataRTArgs { 2203 /// The array of base pointer passed to the runtime library. 2204 Value *BasePointersArray = nullptr; 2205 /// The array of section pointers passed to the runtime library. 2206 Value *PointersArray = nullptr; 2207 /// The array of sizes passed to the runtime library. 2208 Value *SizesArray = nullptr; 2209 /// The array of map types passed to the runtime library for the beginning 2210 /// of the region or for the entire region if there are no separate map 2211 /// types for the region end. 2212 Value *MapTypesArray = nullptr; 2213 /// The array of map types passed to the runtime library for the end of the 2214 /// region, or nullptr if there are no separate map types for the region 2215 /// end. 2216 Value *MapTypesArrayEnd = nullptr; 2217 /// The array of user-defined mappers passed to the runtime library. 2218 Value *MappersArray = nullptr; 2219 /// The array of original declaration names of mapped pointers sent to the 2220 /// runtime library for debugging 2221 Value *MapNamesArray = nullptr; 2222 2223 explicit TargetDataRTArgs() {} 2224 explicit TargetDataRTArgs(Value *BasePointersArray, Value *PointersArray, 2225 Value *SizesArray, Value *MapTypesArray, 2226 Value *MapTypesArrayEnd, Value *MappersArray, 2227 Value *MapNamesArray) 2228 : BasePointersArray(BasePointersArray), PointersArray(PointersArray), 2229 SizesArray(SizesArray), MapTypesArray(MapTypesArray), 2230 MapTypesArrayEnd(MapTypesArrayEnd), MappersArray(MappersArray), 2231 MapNamesArray(MapNamesArray) {} 2232 }; 2233 2234 /// Container to pass the default attributes with which a kernel must be 2235 /// launched, used to set kernel attributes and populate associated static 2236 /// structures. 2237 /// 2238 /// For max values, < 0 means unset, == 0 means set but unknown at compile 2239 /// time. The number of max values will be 1 except for the case where 2240 /// ompx_bare is set. 2241 struct TargetKernelDefaultAttrs { 2242 omp::OMPTgtExecModeFlags ExecFlags = 2243 omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC; 2244 SmallVector<int32_t, 3> MaxTeams = {-1}; 2245 int32_t MinTeams = 1; 2246 SmallVector<int32_t, 3> MaxThreads = {-1}; 2247 int32_t MinThreads = 1; 2248 }; 2249 2250 /// Container to pass LLVM IR runtime values or constants related to the 2251 /// number of teams and threads with which the kernel must be launched, as 2252 /// well as the trip count of the loop, if it is an SPMD or Generic-SPMD 2253 /// kernel. These must be defined in the host prior to the call to the kernel 2254 /// launch OpenMP RTL function. 2255 struct TargetKernelRuntimeAttrs { 2256 SmallVector<Value *, 3> MaxTeams = {nullptr}; 2257 Value *MinTeams = nullptr; 2258 SmallVector<Value *, 3> TargetThreadLimit = {nullptr}; 2259 SmallVector<Value *, 3> TeamsThreadLimit = {nullptr}; 2260 2261 /// 'parallel' construct 'num_threads' clause value, if present and it is an 2262 /// SPMD kernel. 2263 Value *MaxThreads = nullptr; 2264 2265 /// Total number of iterations of the SPMD or Generic-SPMD kernel or null if 2266 /// it is a generic kernel. 2267 Value *LoopTripCount = nullptr; 2268 }; 2269 2270 /// Data structure that contains the needed information to construct the 2271 /// kernel args vector. 2272 struct TargetKernelArgs { 2273 /// Number of arguments passed to the runtime library. 2274 unsigned NumTargetItems = 0; 2275 /// Arguments passed to the runtime library 2276 TargetDataRTArgs RTArgs; 2277 /// The number of iterations 2278 Value *NumIterations = nullptr; 2279 /// The number of teams. 2280 ArrayRef<Value *> NumTeams; 2281 /// The number of threads. 2282 ArrayRef<Value *> NumThreads; 2283 /// The size of the dynamic shared memory. 2284 Value *DynCGGroupMem = nullptr; 2285 /// True if the kernel has 'no wait' clause. 2286 bool HasNoWait = false; 2287 2288 // Constructors for TargetKernelArgs. 2289 TargetKernelArgs() {} 2290 TargetKernelArgs(unsigned NumTargetItems, TargetDataRTArgs RTArgs, 2291 Value *NumIterations, ArrayRef<Value *> NumTeams, 2292 ArrayRef<Value *> NumThreads, Value *DynCGGroupMem, 2293 bool HasNoWait) 2294 : NumTargetItems(NumTargetItems), RTArgs(RTArgs), 2295 NumIterations(NumIterations), NumTeams(NumTeams), 2296 NumThreads(NumThreads), DynCGGroupMem(DynCGGroupMem), 2297 HasNoWait(HasNoWait) {} 2298 }; 2299 2300 /// Create the kernel args vector used by emitTargetKernel. This function 2301 /// creates various constant values that are used in the resulting args 2302 /// vector. 2303 static void getKernelArgsVector(TargetKernelArgs &KernelArgs, 2304 IRBuilderBase &Builder, 2305 SmallVector<Value *> &ArgsVector); 2306 2307 /// Struct that keeps the information that should be kept throughout 2308 /// a 'target data' region. 2309 class TargetDataInfo { 2310 /// Set to true if device pointer information have to be obtained. 2311 bool RequiresDevicePointerInfo = false; 2312 /// Set to true if Clang emits separate runtime calls for the beginning and 2313 /// end of the region. These calls might have separate map type arrays. 2314 bool SeparateBeginEndCalls = false; 2315 2316 public: 2317 TargetDataRTArgs RTArgs; 2318 2319 SmallMapVector<const Value *, std::pair<Value *, Value *>, 4> 2320 DevicePtrInfoMap; 2321 2322 /// Indicate whether any user-defined mapper exists. 2323 bool HasMapper = false; 2324 /// The total number of pointers passed to the runtime library. 2325 unsigned NumberOfPtrs = 0u; 2326 2327 bool EmitDebug = false; 2328 2329 /// Whether the `target ... data` directive has a `nowait` clause. 2330 bool HasNoWait = false; 2331 2332 explicit TargetDataInfo() {} 2333 explicit TargetDataInfo(bool RequiresDevicePointerInfo, 2334 bool SeparateBeginEndCalls) 2335 : RequiresDevicePointerInfo(RequiresDevicePointerInfo), 2336 SeparateBeginEndCalls(SeparateBeginEndCalls) {} 2337 /// Clear information about the data arrays. 2338 void clearArrayInfo() { 2339 RTArgs = TargetDataRTArgs(); 2340 HasMapper = false; 2341 NumberOfPtrs = 0u; 2342 } 2343 /// Return true if the current target data information has valid arrays. 2344 bool isValid() { 2345 return RTArgs.BasePointersArray && RTArgs.PointersArray && 2346 RTArgs.SizesArray && RTArgs.MapTypesArray && 2347 (!HasMapper || RTArgs.MappersArray) && NumberOfPtrs; 2348 } 2349 bool requiresDevicePointerInfo() { return RequiresDevicePointerInfo; } 2350 bool separateBeginEndCalls() { return SeparateBeginEndCalls; } 2351 }; 2352 2353 enum class DeviceInfoTy { None, Pointer, Address }; 2354 using MapValuesArrayTy = SmallVector<Value *, 4>; 2355 using MapDeviceInfoArrayTy = SmallVector<DeviceInfoTy, 4>; 2356 using MapFlagsArrayTy = SmallVector<omp::OpenMPOffloadMappingFlags, 4>; 2357 using MapNamesArrayTy = SmallVector<Constant *, 4>; 2358 using MapDimArrayTy = SmallVector<uint64_t, 4>; 2359 using MapNonContiguousArrayTy = SmallVector<MapValuesArrayTy, 4>; 2360 2361 /// This structure contains combined information generated for mappable 2362 /// clauses, including base pointers, pointers, sizes, map types, user-defined 2363 /// mappers, and non-contiguous information. 2364 struct MapInfosTy { 2365 struct StructNonContiguousInfo { 2366 bool IsNonContiguous = false; 2367 MapDimArrayTy Dims; 2368 MapNonContiguousArrayTy Offsets; 2369 MapNonContiguousArrayTy Counts; 2370 MapNonContiguousArrayTy Strides; 2371 }; 2372 MapValuesArrayTy BasePointers; 2373 MapValuesArrayTy Pointers; 2374 MapDeviceInfoArrayTy DevicePointers; 2375 MapValuesArrayTy Sizes; 2376 MapFlagsArrayTy Types; 2377 MapNamesArrayTy Names; 2378 StructNonContiguousInfo NonContigInfo; 2379 2380 /// Append arrays in \a CurInfo. 2381 void append(MapInfosTy &CurInfo) { 2382 BasePointers.append(CurInfo.BasePointers.begin(), 2383 CurInfo.BasePointers.end()); 2384 Pointers.append(CurInfo.Pointers.begin(), CurInfo.Pointers.end()); 2385 DevicePointers.append(CurInfo.DevicePointers.begin(), 2386 CurInfo.DevicePointers.end()); 2387 Sizes.append(CurInfo.Sizes.begin(), CurInfo.Sizes.end()); 2388 Types.append(CurInfo.Types.begin(), CurInfo.Types.end()); 2389 Names.append(CurInfo.Names.begin(), CurInfo.Names.end()); 2390 NonContigInfo.Dims.append(CurInfo.NonContigInfo.Dims.begin(), 2391 CurInfo.NonContigInfo.Dims.end()); 2392 NonContigInfo.Offsets.append(CurInfo.NonContigInfo.Offsets.begin(), 2393 CurInfo.NonContigInfo.Offsets.end()); 2394 NonContigInfo.Counts.append(CurInfo.NonContigInfo.Counts.begin(), 2395 CurInfo.NonContigInfo.Counts.end()); 2396 NonContigInfo.Strides.append(CurInfo.NonContigInfo.Strides.begin(), 2397 CurInfo.NonContigInfo.Strides.end()); 2398 } 2399 }; 2400 2401 /// Callback function type for functions emitting the host fallback code that 2402 /// is executed when the kernel launch fails. It takes an insertion point as 2403 /// parameter where the code should be emitted. It returns an insertion point 2404 /// that points right after after the emitted code. 2405 using EmitFallbackCallbackTy = 2406 function_ref<InsertPointOrErrorTy(InsertPointTy)>; 2407 2408 /// Generate a target region entry call and host fallback call. 2409 /// 2410 /// \param Loc The location at which the request originated and is fulfilled. 2411 /// \param OutlinedFnID The ooulined function ID. 2412 /// \param EmitTargetCallFallbackCB Call back function to generate host 2413 /// fallback code. 2414 /// \param Args Data structure holding information about the kernel arguments. 2415 /// \param DeviceID Identifier for the device via the 'device' clause. 2416 /// \param RTLoc Source location identifier 2417 /// \param AllocaIP The insertion point to be used for alloca instructions. 2418 InsertPointOrErrorTy 2419 emitKernelLaunch(const LocationDescription &Loc, Value *OutlinedFnID, 2420 EmitFallbackCallbackTy EmitTargetCallFallbackCB, 2421 TargetKernelArgs &Args, Value *DeviceID, Value *RTLoc, 2422 InsertPointTy AllocaIP); 2423 2424 /// Callback type for generating the bodies of device directives that require 2425 /// outer target tasks (e.g. in case of having `nowait` or `depend` clauses). 2426 /// 2427 /// \param DeviceID The ID of the device on which the target region will 2428 /// execute. 2429 /// \param RTLoc Source location identifier 2430 /// \Param TargetTaskAllocaIP Insertion point for the alloca block of the 2431 /// generated task. 2432 /// 2433 /// \return an error, if any were triggered during execution. 2434 using TargetTaskBodyCallbackTy = 2435 function_ref<Error(Value *DeviceID, Value *RTLoc, 2436 IRBuilderBase::InsertPoint TargetTaskAllocaIP)>; 2437 2438 /// Generate a target-task for the target construct 2439 /// 2440 /// \param TaskBodyCB Callback to generate the actual body of the target task. 2441 /// \param DeviceID Identifier for the device via the 'device' clause. 2442 /// \param RTLoc Source location identifier 2443 /// \param AllocaIP The insertion point to be used for alloca instructions. 2444 /// \param Dependencies Vector of DependData objects holding information of 2445 /// dependencies as specified by the 'depend' clause. 2446 /// \param HasNoWait True if the target construct had 'nowait' on it, false 2447 /// otherwise 2448 InsertPointOrErrorTy emitTargetTask( 2449 TargetTaskBodyCallbackTy TaskBodyCB, Value *DeviceID, Value *RTLoc, 2450 OpenMPIRBuilder::InsertPointTy AllocaIP, 2451 const SmallVector<llvm::OpenMPIRBuilder::DependData> &Dependencies, 2452 bool HasNoWait); 2453 2454 /// Emit the arguments to be passed to the runtime library based on the 2455 /// arrays of base pointers, pointers, sizes, map types, and mappers. If 2456 /// ForEndCall, emit map types to be passed for the end of the region instead 2457 /// of the beginning. 2458 void emitOffloadingArraysArgument(IRBuilderBase &Builder, 2459 OpenMPIRBuilder::TargetDataRTArgs &RTArgs, 2460 OpenMPIRBuilder::TargetDataInfo &Info, 2461 bool ForEndCall = false); 2462 2463 /// Emit an array of struct descriptors to be assigned to the offload args. 2464 void emitNonContiguousDescriptor(InsertPointTy AllocaIP, 2465 InsertPointTy CodeGenIP, 2466 MapInfosTy &CombinedInfo, 2467 TargetDataInfo &Info); 2468 2469 /// Emit the arrays used to pass the captures and map information to the 2470 /// offloading runtime library. If there is no map or capture information, 2471 /// return nullptr by reference. Accepts a reference to a MapInfosTy object 2472 /// that contains information generated for mappable clauses, 2473 /// including base pointers, pointers, sizes, map types, user-defined mappers. 2474 void emitOffloadingArrays( 2475 InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo, 2476 TargetDataInfo &Info, bool IsNonContiguous = false, 2477 function_ref<void(unsigned int, Value *)> DeviceAddrCB = nullptr, 2478 function_ref<Value *(unsigned int)> CustomMapperCB = nullptr); 2479 2480 /// Allocates memory for and populates the arrays required for offloading 2481 /// (offload_{baseptrs|ptrs|mappers|sizes|maptypes|mapnames}). Then, it 2482 /// emits their base addresses as arguments to be passed to the runtime 2483 /// library. In essence, this function is a combination of 2484 /// emitOffloadingArrays and emitOffloadingArraysArgument and should arguably 2485 /// be preferred by clients of OpenMPIRBuilder. 2486 void emitOffloadingArraysAndArgs( 2487 InsertPointTy AllocaIP, InsertPointTy CodeGenIP, TargetDataInfo &Info, 2488 TargetDataRTArgs &RTArgs, MapInfosTy &CombinedInfo, 2489 bool IsNonContiguous = false, bool ForEndCall = false, 2490 function_ref<void(unsigned int, Value *)> DeviceAddrCB = nullptr, 2491 function_ref<Value *(unsigned int)> CustomMapperCB = nullptr); 2492 2493 /// Creates offloading entry for the provided entry ID \a ID, address \a 2494 /// Addr, size \a Size, and flags \a Flags. 2495 void createOffloadEntry(Constant *ID, Constant *Addr, uint64_t Size, 2496 int32_t Flags, GlobalValue::LinkageTypes, 2497 StringRef Name = ""); 2498 2499 /// The kind of errors that can occur when emitting the offload entries and 2500 /// metadata. 2501 enum EmitMetadataErrorKind { 2502 EMIT_MD_TARGET_REGION_ERROR, 2503 EMIT_MD_DECLARE_TARGET_ERROR, 2504 EMIT_MD_GLOBAL_VAR_LINK_ERROR 2505 }; 2506 2507 /// Callback function type 2508 using EmitMetadataErrorReportFunctionTy = 2509 std::function<void(EmitMetadataErrorKind, TargetRegionEntryInfo)>; 2510 2511 // Emit the offloading entries and metadata so that the device codegen side 2512 // can easily figure out what to emit. The produced metadata looks like 2513 // this: 2514 // 2515 // !omp_offload.info = !{!1, ...} 2516 // 2517 // We only generate metadata for function that contain target regions. 2518 void createOffloadEntriesAndInfoMetadata( 2519 EmitMetadataErrorReportFunctionTy &ErrorReportFunction); 2520 2521 public: 2522 /// Generator for __kmpc_copyprivate 2523 /// 2524 /// \param Loc The source location description. 2525 /// \param BufSize Number of elements in the buffer. 2526 /// \param CpyBuf List of pointers to data to be copied. 2527 /// \param CpyFn function to call for copying data. 2528 /// \param DidIt flag variable; 1 for 'single' thread, 0 otherwise. 2529 /// 2530 /// \return The insertion position *after* the CopyPrivate call. 2531 2532 InsertPointTy createCopyPrivate(const LocationDescription &Loc, 2533 llvm::Value *BufSize, llvm::Value *CpyBuf, 2534 llvm::Value *CpyFn, llvm::Value *DidIt); 2535 2536 /// Generator for '#omp single' 2537 /// 2538 /// \param Loc The source location description. 2539 /// \param BodyGenCB Callback that will generate the region code. 2540 /// \param FiniCB Callback to finalize variable copies. 2541 /// \param IsNowait If false, a barrier is emitted. 2542 /// \param CPVars copyprivate variables. 2543 /// \param CPFuncs copy functions to use for each copyprivate variable. 2544 /// 2545 /// \returns The insertion position *after* the single call. 2546 InsertPointOrErrorTy createSingle(const LocationDescription &Loc, 2547 BodyGenCallbackTy BodyGenCB, 2548 FinalizeCallbackTy FiniCB, bool IsNowait, 2549 ArrayRef<llvm::Value *> CPVars = {}, 2550 ArrayRef<llvm::Function *> CPFuncs = {}); 2551 2552 /// Generator for '#omp master' 2553 /// 2554 /// \param Loc The insert and source location description. 2555 /// \param BodyGenCB Callback that will generate the region code. 2556 /// \param FiniCB Callback to finalize variable copies. 2557 /// 2558 /// \returns The insertion position *after* the master. 2559 InsertPointOrErrorTy createMaster(const LocationDescription &Loc, 2560 BodyGenCallbackTy BodyGenCB, 2561 FinalizeCallbackTy FiniCB); 2562 2563 /// Generator for '#omp masked' 2564 /// 2565 /// \param Loc The insert and source location description. 2566 /// \param BodyGenCB Callback that will generate the region code. 2567 /// \param FiniCB Callback to finialize variable copies. 2568 /// 2569 /// \returns The insertion position *after* the masked. 2570 InsertPointOrErrorTy createMasked(const LocationDescription &Loc, 2571 BodyGenCallbackTy BodyGenCB, 2572 FinalizeCallbackTy FiniCB, Value *Filter); 2573 2574 /// Generator for '#omp critical' 2575 /// 2576 /// \param Loc The insert and source location description. 2577 /// \param BodyGenCB Callback that will generate the region body code. 2578 /// \param FiniCB Callback to finalize variable copies. 2579 /// \param CriticalName name of the lock used by the critical directive 2580 /// \param HintInst Hint Instruction for hint clause associated with critical 2581 /// 2582 /// \returns The insertion position *after* the critical. 2583 InsertPointOrErrorTy createCritical(const LocationDescription &Loc, 2584 BodyGenCallbackTy BodyGenCB, 2585 FinalizeCallbackTy FiniCB, 2586 StringRef CriticalName, Value *HintInst); 2587 2588 /// Generator for '#omp ordered depend (source | sink)' 2589 /// 2590 /// \param Loc The insert and source location description. 2591 /// \param AllocaIP The insertion point to be used for alloca instructions. 2592 /// \param NumLoops The number of loops in depend clause. 2593 /// \param StoreValues The value will be stored in vector address. 2594 /// \param Name The name of alloca instruction. 2595 /// \param IsDependSource If true, depend source; otherwise, depend sink. 2596 /// 2597 /// \return The insertion position *after* the ordered. 2598 InsertPointTy createOrderedDepend(const LocationDescription &Loc, 2599 InsertPointTy AllocaIP, unsigned NumLoops, 2600 ArrayRef<llvm::Value *> StoreValues, 2601 const Twine &Name, bool IsDependSource); 2602 2603 /// Generator for '#omp ordered [threads | simd]' 2604 /// 2605 /// \param Loc The insert and source location description. 2606 /// \param BodyGenCB Callback that will generate the region code. 2607 /// \param FiniCB Callback to finalize variable copies. 2608 /// \param IsThreads If true, with threads clause or without clause; 2609 /// otherwise, with simd clause; 2610 /// 2611 /// \returns The insertion position *after* the ordered. 2612 InsertPointOrErrorTy createOrderedThreadsSimd(const LocationDescription &Loc, 2613 BodyGenCallbackTy BodyGenCB, 2614 FinalizeCallbackTy FiniCB, 2615 bool IsThreads); 2616 2617 /// Generator for '#omp sections' 2618 /// 2619 /// \param Loc The insert and source location description. 2620 /// \param AllocaIP The insertion points to be used for alloca instructions. 2621 /// \param SectionCBs Callbacks that will generate body of each section. 2622 /// \param PrivCB Callback to copy a given variable (think copy constructor). 2623 /// \param FiniCB Callback to finalize variable copies. 2624 /// \param IsCancellable Flag to indicate a cancellable parallel region. 2625 /// \param IsNowait If true, barrier - to ensure all sections are executed 2626 /// before moving forward will not be generated. 2627 /// \returns The insertion position *after* the sections. 2628 InsertPointOrErrorTy 2629 createSections(const LocationDescription &Loc, InsertPointTy AllocaIP, 2630 ArrayRef<StorableBodyGenCallbackTy> SectionCBs, 2631 PrivatizeCallbackTy PrivCB, FinalizeCallbackTy FiniCB, 2632 bool IsCancellable, bool IsNowait); 2633 2634 /// Generator for '#omp section' 2635 /// 2636 /// \param Loc The insert and source location description. 2637 /// \param BodyGenCB Callback that will generate the region body code. 2638 /// \param FiniCB Callback to finalize variable copies. 2639 /// \returns The insertion position *after* the section. 2640 InsertPointOrErrorTy createSection(const LocationDescription &Loc, 2641 BodyGenCallbackTy BodyGenCB, 2642 FinalizeCallbackTy FiniCB); 2643 2644 /// Generator for `#omp teams` 2645 /// 2646 /// \param Loc The location where the teams construct was encountered. 2647 /// \param BodyGenCB Callback that will generate the region code. 2648 /// \param NumTeamsLower Lower bound on number of teams. If this is nullptr, 2649 /// it is as if lower bound is specified as equal to upperbound. If 2650 /// this is non-null, then upperbound must also be non-null. 2651 /// \param NumTeamsUpper Upper bound on the number of teams. 2652 /// \param ThreadLimit on the number of threads that may participate in a 2653 /// contention group created by each team. 2654 /// \param IfExpr is the integer argument value of the if condition on the 2655 /// teams clause. 2656 InsertPointOrErrorTy 2657 createTeams(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, 2658 Value *NumTeamsLower = nullptr, Value *NumTeamsUpper = nullptr, 2659 Value *ThreadLimit = nullptr, Value *IfExpr = nullptr); 2660 2661 /// Generate conditional branch and relevant BasicBlocks through which private 2662 /// threads copy the 'copyin' variables from Master copy to threadprivate 2663 /// copies. 2664 /// 2665 /// \param IP insertion block for copyin conditional 2666 /// \param MasterVarPtr a pointer to the master variable 2667 /// \param PrivateVarPtr a pointer to the threadprivate variable 2668 /// \param IntPtrTy Pointer size type 2669 /// \param BranchtoEnd Create a branch between the copyin.not.master blocks 2670 // and copy.in.end block 2671 /// 2672 /// \returns The insertion point where copying operation to be emitted. 2673 InsertPointTy createCopyinClauseBlocks(InsertPointTy IP, Value *MasterAddr, 2674 Value *PrivateAddr, 2675 llvm::IntegerType *IntPtrTy, 2676 bool BranchtoEnd = true); 2677 2678 /// Create a runtime call for kmpc_Alloc 2679 /// 2680 /// \param Loc The insert and source location description. 2681 /// \param Size Size of allocated memory space 2682 /// \param Allocator Allocator information instruction 2683 /// \param Name Name of call Instruction for OMP_alloc 2684 /// 2685 /// \returns CallInst to the OMP_Alloc call 2686 CallInst *createOMPAlloc(const LocationDescription &Loc, Value *Size, 2687 Value *Allocator, std::string Name = ""); 2688 2689 /// Create a runtime call for kmpc_free 2690 /// 2691 /// \param Loc The insert and source location description. 2692 /// \param Addr Address of memory space to be freed 2693 /// \param Allocator Allocator information instruction 2694 /// \param Name Name of call Instruction for OMP_Free 2695 /// 2696 /// \returns CallInst to the OMP_Free call 2697 CallInst *createOMPFree(const LocationDescription &Loc, Value *Addr, 2698 Value *Allocator, std::string Name = ""); 2699 2700 /// Create a runtime call for kmpc_threadprivate_cached 2701 /// 2702 /// \param Loc The insert and source location description. 2703 /// \param Pointer pointer to data to be cached 2704 /// \param Size size of data to be cached 2705 /// \param Name Name of call Instruction for callinst 2706 /// 2707 /// \returns CallInst to the thread private cache call. 2708 CallInst *createCachedThreadPrivate(const LocationDescription &Loc, 2709 llvm::Value *Pointer, 2710 llvm::ConstantInt *Size, 2711 const llvm::Twine &Name = Twine("")); 2712 2713 /// Create a runtime call for __tgt_interop_init 2714 /// 2715 /// \param Loc The insert and source location description. 2716 /// \param InteropVar variable to be allocated 2717 /// \param InteropType type of interop operation 2718 /// \param Device devide to which offloading will occur 2719 /// \param NumDependences number of dependence variables 2720 /// \param DependenceAddress pointer to dependence variables 2721 /// \param HaveNowaitClause does nowait clause exist 2722 /// 2723 /// \returns CallInst to the __tgt_interop_init call 2724 CallInst *createOMPInteropInit(const LocationDescription &Loc, 2725 Value *InteropVar, 2726 omp::OMPInteropType InteropType, Value *Device, 2727 Value *NumDependences, 2728 Value *DependenceAddress, 2729 bool HaveNowaitClause); 2730 2731 /// Create a runtime call for __tgt_interop_destroy 2732 /// 2733 /// \param Loc The insert and source location description. 2734 /// \param InteropVar variable to be allocated 2735 /// \param Device devide to which offloading will occur 2736 /// \param NumDependences number of dependence variables 2737 /// \param DependenceAddress pointer to dependence variables 2738 /// \param HaveNowaitClause does nowait clause exist 2739 /// 2740 /// \returns CallInst to the __tgt_interop_destroy call 2741 CallInst *createOMPInteropDestroy(const LocationDescription &Loc, 2742 Value *InteropVar, Value *Device, 2743 Value *NumDependences, 2744 Value *DependenceAddress, 2745 bool HaveNowaitClause); 2746 2747 /// Create a runtime call for __tgt_interop_use 2748 /// 2749 /// \param Loc The insert and source location description. 2750 /// \param InteropVar variable to be allocated 2751 /// \param Device devide to which offloading will occur 2752 /// \param NumDependences number of dependence variables 2753 /// \param DependenceAddress pointer to dependence variables 2754 /// \param HaveNowaitClause does nowait clause exist 2755 /// 2756 /// \returns CallInst to the __tgt_interop_use call 2757 CallInst *createOMPInteropUse(const LocationDescription &Loc, 2758 Value *InteropVar, Value *Device, 2759 Value *NumDependences, Value *DependenceAddress, 2760 bool HaveNowaitClause); 2761 2762 /// The `omp target` interface 2763 /// 2764 /// For more information about the usage of this interface, 2765 /// \see openmp/libomptarget/deviceRTLs/common/include/target.h 2766 /// 2767 ///{ 2768 2769 /// Create a runtime call for kmpc_target_init 2770 /// 2771 /// \param Loc The insert and source location description. 2772 /// \param Attrs Structure containing the default attributes, including 2773 /// numbers of threads and teams to launch the kernel with. 2774 InsertPointTy createTargetInit( 2775 const LocationDescription &Loc, 2776 const llvm::OpenMPIRBuilder::TargetKernelDefaultAttrs &Attrs); 2777 2778 /// Create a runtime call for kmpc_target_deinit 2779 /// 2780 /// \param Loc The insert and source location description. 2781 /// \param TeamsReductionDataSize The maximal size of all the reduction data 2782 /// for teams reduction. 2783 /// \param TeamsReductionBufferLength The number of elements (each of up to 2784 /// \p TeamsReductionDataSize size), in the teams reduction buffer. 2785 void createTargetDeinit(const LocationDescription &Loc, 2786 int32_t TeamsReductionDataSize = 0, 2787 int32_t TeamsReductionBufferLength = 1024); 2788 2789 ///} 2790 2791 /// Helpers to read/write kernel annotations from the IR. 2792 /// 2793 ///{ 2794 2795 /// Read/write a bounds on threads for \p Kernel. Read will return 0 if none 2796 /// is set. 2797 static std::pair<int32_t, int32_t> 2798 readThreadBoundsForKernel(const Triple &T, Function &Kernel); 2799 static void writeThreadBoundsForKernel(const Triple &T, Function &Kernel, 2800 int32_t LB, int32_t UB); 2801 2802 /// Read/write a bounds on teams for \p Kernel. Read will return 0 if none 2803 /// is set. 2804 static std::pair<int32_t, int32_t> readTeamBoundsForKernel(const Triple &T, 2805 Function &Kernel); 2806 static void writeTeamsForKernel(const Triple &T, Function &Kernel, int32_t LB, 2807 int32_t UB); 2808 ///} 2809 2810 private: 2811 // Sets the function attributes expected for the outlined function 2812 void setOutlinedTargetRegionFunctionAttributes(Function *OutlinedFn); 2813 2814 // Creates the function ID/Address for the given outlined function. 2815 // In the case of an embedded device function the address of the function is 2816 // used, in the case of a non-offload function a constant is created. 2817 Constant *createOutlinedFunctionID(Function *OutlinedFn, 2818 StringRef EntryFnIDName); 2819 2820 // Creates the region entry address for the outlined function 2821 Constant *createTargetRegionEntryAddr(Function *OutlinedFunction, 2822 StringRef EntryFnName); 2823 2824 public: 2825 /// Functions used to generate a function with the given name. 2826 using FunctionGenCallback = 2827 std::function<Expected<Function *>(StringRef FunctionName)>; 2828 2829 /// Create a unique name for the entry function using the source location 2830 /// information of the current target region. The name will be something like: 2831 /// 2832 /// __omp_offloading_DD_FFFF_PP_lBB[_CC] 2833 /// 2834 /// where DD_FFFF is an ID unique to the file (device and file IDs), PP is the 2835 /// mangled name of the function that encloses the target region and BB is the 2836 /// line number of the target region. CC is a count added when more than one 2837 /// region is located at the same location. 2838 /// 2839 /// If this target outline function is not an offload entry, we don't need to 2840 /// register it. This may happen if it is guarded by an if clause that is 2841 /// false at compile time, or no target archs have been specified. 2842 /// 2843 /// The created target region ID is used by the runtime library to identify 2844 /// the current target region, so it only has to be unique and not 2845 /// necessarily point to anything. It could be the pointer to the outlined 2846 /// function that implements the target region, but we aren't using that so 2847 /// that the compiler doesn't need to keep that, and could therefore inline 2848 /// the host function if proven worthwhile during optimization. In the other 2849 /// hand, if emitting code for the device, the ID has to be the function 2850 /// address so that it can retrieved from the offloading entry and launched 2851 /// by the runtime library. We also mark the outlined function to have 2852 /// external linkage in case we are emitting code for the device, because 2853 /// these functions will be entry points to the device. 2854 /// 2855 /// \param InfoManager The info manager keeping track of the offload entries 2856 /// \param EntryInfo The entry information about the function 2857 /// \param GenerateFunctionCallback The callback function to generate the code 2858 /// \param OutlinedFunction Pointer to the outlined function 2859 /// \param EntryFnIDName Name of the ID o be created 2860 Error emitTargetRegionFunction(TargetRegionEntryInfo &EntryInfo, 2861 FunctionGenCallback &GenerateFunctionCallback, 2862 bool IsOffloadEntry, Function *&OutlinedFn, 2863 Constant *&OutlinedFnID); 2864 2865 /// Registers the given function and sets up the attribtues of the function 2866 /// Returns the FunctionID. 2867 /// 2868 /// \param InfoManager The info manager keeping track of the offload entries 2869 /// \param EntryInfo The entry information about the function 2870 /// \param OutlinedFunction Pointer to the outlined function 2871 /// \param EntryFnName Name of the outlined function 2872 /// \param EntryFnIDName Name of the ID o be created 2873 Constant *registerTargetRegionFunction(TargetRegionEntryInfo &EntryInfo, 2874 Function *OutlinedFunction, 2875 StringRef EntryFnName, 2876 StringRef EntryFnIDName); 2877 2878 /// Type of BodyGen to use for region codegen 2879 /// 2880 /// Priv: If device pointer privatization is required, emit the body of the 2881 /// region here. It will have to be duplicated: with and without 2882 /// privatization. 2883 /// DupNoPriv: If we need device pointer privatization, we need 2884 /// to emit the body of the region with no privatization in the 'else' branch 2885 /// of the conditional. 2886 /// NoPriv: If we don't require privatization of device 2887 /// pointers, we emit the body in between the runtime calls. This avoids 2888 /// duplicating the body code. 2889 enum BodyGenTy { Priv, DupNoPriv, NoPriv }; 2890 2891 /// Callback type for creating the map infos for the kernel parameters. 2892 /// \param CodeGenIP is the insertion point where code should be generated, 2893 /// if any. 2894 using GenMapInfoCallbackTy = 2895 function_ref<MapInfosTy &(InsertPointTy CodeGenIP)>; 2896 2897 private: 2898 /// Emit the array initialization or deletion portion for user-defined mapper 2899 /// code generation. First, it evaluates whether an array section is mapped 2900 /// and whether the \a MapType instructs to delete this section. If \a IsInit 2901 /// is true, and \a MapType indicates to not delete this array, array 2902 /// initialization code is generated. If \a IsInit is false, and \a MapType 2903 /// indicates to delete this array, array deletion code is generated. 2904 void emitUDMapperArrayInitOrDel(Function *MapperFn, llvm::Value *MapperHandle, 2905 llvm::Value *Base, llvm::Value *Begin, 2906 llvm::Value *Size, llvm::Value *MapType, 2907 llvm::Value *MapName, TypeSize ElementSize, 2908 llvm::BasicBlock *ExitBB, bool IsInit); 2909 2910 public: 2911 /// Emit the user-defined mapper function. The code generation follows the 2912 /// pattern in the example below. 2913 /// \code 2914 /// void .omp_mapper.<type_name>.<mapper_id>.(void *rt_mapper_handle, 2915 /// void *base, void *begin, 2916 /// int64_t size, int64_t type, 2917 /// void *name = nullptr) { 2918 /// // Allocate space for an array section first or add a base/begin for 2919 /// // pointer dereference. 2920 /// if ((size > 1 || (base != begin && maptype.IsPtrAndObj)) && 2921 /// !maptype.IsDelete) 2922 /// __tgt_push_mapper_component(rt_mapper_handle, base, begin, 2923 /// size*sizeof(Ty), clearToFromMember(type)); 2924 /// // Map members. 2925 /// for (unsigned i = 0; i < size; i++) { 2926 /// // For each component specified by this mapper: 2927 /// for (auto c : begin[i]->all_components) { 2928 /// if (c.hasMapper()) 2929 /// (*c.Mapper())(rt_mapper_handle, c.arg_base, c.arg_begin, 2930 /// c.arg_size, 2931 /// c.arg_type, c.arg_name); 2932 /// else 2933 /// __tgt_push_mapper_component(rt_mapper_handle, c.arg_base, 2934 /// c.arg_begin, c.arg_size, c.arg_type, 2935 /// c.arg_name); 2936 /// } 2937 /// } 2938 /// // Delete the array section. 2939 /// if (size > 1 && maptype.IsDelete) 2940 /// __tgt_push_mapper_component(rt_mapper_handle, base, begin, 2941 /// size*sizeof(Ty), clearToFromMember(type)); 2942 /// } 2943 /// \endcode 2944 /// 2945 /// \param PrivAndGenMapInfoCB Callback that privatizes code and populates the 2946 /// MapInfos and returns. 2947 /// \param ElemTy DeclareMapper element type. 2948 /// \param FuncName Optional param to specify mapper function name. 2949 /// \param CustomMapperCB Optional callback to generate code related to 2950 /// custom mappers. 2951 Function *emitUserDefinedMapper( 2952 function_ref<MapInfosTy &(InsertPointTy CodeGenIP, llvm::Value *PtrPHI, 2953 llvm::Value *BeginArg)> 2954 PrivAndGenMapInfoCB, 2955 llvm::Type *ElemTy, StringRef FuncName, 2956 function_ref<bool(unsigned int, Function **)> CustomMapperCB = nullptr); 2957 2958 /// Generator for '#omp target data' 2959 /// 2960 /// \param Loc The location where the target data construct was encountered. 2961 /// \param AllocaIP The insertion points to be used for alloca instructions. 2962 /// \param CodeGenIP The insertion point at which the target directive code 2963 /// should be placed. 2964 /// \param IsBegin If true then emits begin mapper call otherwise emits 2965 /// end mapper call. 2966 /// \param DeviceID Stores the DeviceID from the device clause. 2967 /// \param IfCond Value which corresponds to the if clause condition. 2968 /// \param Info Stores all information realted to the Target Data directive. 2969 /// \param GenMapInfoCB Callback that populates the MapInfos and returns. 2970 /// \param BodyGenCB Optional Callback to generate the region code. 2971 /// \param DeviceAddrCB Optional callback to generate code related to 2972 /// use_device_ptr and use_device_addr. 2973 /// \param CustomMapperCB Optional callback to generate code related to 2974 /// custom mappers. 2975 InsertPointOrErrorTy createTargetData( 2976 const LocationDescription &Loc, InsertPointTy AllocaIP, 2977 InsertPointTy CodeGenIP, Value *DeviceID, Value *IfCond, 2978 TargetDataInfo &Info, GenMapInfoCallbackTy GenMapInfoCB, 2979 omp::RuntimeFunction *MapperFunc = nullptr, 2980 function_ref<InsertPointOrErrorTy(InsertPointTy CodeGenIP, 2981 BodyGenTy BodyGenType)> 2982 BodyGenCB = nullptr, 2983 function_ref<void(unsigned int, Value *)> DeviceAddrCB = nullptr, 2984 function_ref<Value *(unsigned int)> CustomMapperCB = nullptr, 2985 Value *SrcLocInfo = nullptr); 2986 2987 using TargetBodyGenCallbackTy = function_ref<InsertPointOrErrorTy( 2988 InsertPointTy AllocaIP, InsertPointTy CodeGenIP)>; 2989 2990 using TargetGenArgAccessorsCallbackTy = function_ref<InsertPointOrErrorTy( 2991 Argument &Arg, Value *Input, Value *&RetVal, InsertPointTy AllocaIP, 2992 InsertPointTy CodeGenIP)>; 2993 2994 /// Generator for '#omp target' 2995 /// 2996 /// \param Loc where the target data construct was encountered. 2997 /// \param IsOffloadEntry whether it is an offload entry. 2998 /// \param CodeGenIP The insertion point where the call to the outlined 2999 /// function should be emitted. 3000 /// \param EntryInfo The entry information about the function. 3001 /// \param DefaultAttrs Structure containing the default attributes, including 3002 /// numbers of threads and teams to launch the kernel with. 3003 /// \param RuntimeAttrs Structure containing the runtime numbers of threads 3004 /// and teams to launch the kernel with. 3005 /// \param IfCond value of the `if` clause. 3006 /// \param Inputs The input values to the region that will be passed. 3007 /// as arguments to the outlined function. 3008 /// \param BodyGenCB Callback that will generate the region code. 3009 /// \param ArgAccessorFuncCB Callback that will generate accessors 3010 /// instructions for passed in target arguments where neccessary 3011 /// \param Dependencies A vector of DependData objects that carry 3012 /// dependency information as passed in the depend clause 3013 /// \param HasNowait Whether the target construct has a `nowait` clause or 3014 /// not. 3015 InsertPointOrErrorTy createTarget( 3016 const LocationDescription &Loc, bool IsOffloadEntry, 3017 OpenMPIRBuilder::InsertPointTy AllocaIP, 3018 OpenMPIRBuilder::InsertPointTy CodeGenIP, 3019 TargetRegionEntryInfo &EntryInfo, 3020 const TargetKernelDefaultAttrs &DefaultAttrs, 3021 const TargetKernelRuntimeAttrs &RuntimeAttrs, Value *IfCond, 3022 SmallVectorImpl<Value *> &Inputs, GenMapInfoCallbackTy GenMapInfoCB, 3023 TargetBodyGenCallbackTy BodyGenCB, 3024 TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB, 3025 SmallVector<DependData> Dependencies = {}, bool HasNowait = false); 3026 3027 /// Returns __kmpc_for_static_init_* runtime function for the specified 3028 /// size \a IVSize and sign \a IVSigned. Will create a distribute call 3029 /// __kmpc_distribute_static_init* if \a IsGPUDistribute is set. 3030 FunctionCallee createForStaticInitFunction(unsigned IVSize, bool IVSigned, 3031 bool IsGPUDistribute); 3032 3033 /// Returns __kmpc_dispatch_init_* runtime function for the specified 3034 /// size \a IVSize and sign \a IVSigned. 3035 FunctionCallee createDispatchInitFunction(unsigned IVSize, bool IVSigned); 3036 3037 /// Returns __kmpc_dispatch_next_* runtime function for the specified 3038 /// size \a IVSize and sign \a IVSigned. 3039 FunctionCallee createDispatchNextFunction(unsigned IVSize, bool IVSigned); 3040 3041 /// Returns __kmpc_dispatch_fini_* runtime function for the specified 3042 /// size \a IVSize and sign \a IVSigned. 3043 FunctionCallee createDispatchFiniFunction(unsigned IVSize, bool IVSigned); 3044 3045 /// Returns __kmpc_dispatch_deinit runtime function. 3046 FunctionCallee createDispatchDeinitFunction(); 3047 3048 /// Declarations for LLVM-IR types (simple, array, function and structure) are 3049 /// generated below. Their names are defined and used in OpenMPKinds.def. Here 3050 /// we provide the declarations, the initializeTypes function will provide the 3051 /// values. 3052 /// 3053 ///{ 3054 #define OMP_TYPE(VarName, InitValue) Type *VarName = nullptr; 3055 #define OMP_ARRAY_TYPE(VarName, ElemTy, ArraySize) \ 3056 ArrayType *VarName##Ty = nullptr; \ 3057 PointerType *VarName##PtrTy = nullptr; 3058 #define OMP_FUNCTION_TYPE(VarName, IsVarArg, ReturnType, ...) \ 3059 FunctionType *VarName = nullptr; \ 3060 PointerType *VarName##Ptr = nullptr; 3061 #define OMP_STRUCT_TYPE(VarName, StrName, ...) \ 3062 StructType *VarName = nullptr; \ 3063 PointerType *VarName##Ptr = nullptr; 3064 #include "llvm/Frontend/OpenMP/OMPKinds.def" 3065 3066 ///} 3067 3068 private: 3069 /// Create all simple and struct types exposed by the runtime and remember 3070 /// the llvm::PointerTypes of them for easy access later. 3071 void initializeTypes(Module &M); 3072 3073 /// Common interface for generating entry calls for OMP Directives. 3074 /// if the directive has a region/body, It will set the insertion 3075 /// point to the body 3076 /// 3077 /// \param OMPD Directive to generate entry blocks for 3078 /// \param EntryCall Call to the entry OMP Runtime Function 3079 /// \param ExitBB block where the region ends. 3080 /// \param Conditional indicate if the entry call result will be used 3081 /// to evaluate a conditional of whether a thread will execute 3082 /// body code or not. 3083 /// 3084 /// \return The insertion position in exit block 3085 InsertPointTy emitCommonDirectiveEntry(omp::Directive OMPD, Value *EntryCall, 3086 BasicBlock *ExitBB, 3087 bool Conditional = false); 3088 3089 /// Common interface to finalize the region 3090 /// 3091 /// \param OMPD Directive to generate exiting code for 3092 /// \param FinIP Insertion point for emitting Finalization code and exit call 3093 /// \param ExitCall Call to the ending OMP Runtime Function 3094 /// \param HasFinalize indicate if the directive will require finalization 3095 /// and has a finalization callback in the stack that 3096 /// should be called. 3097 /// 3098 /// \return The insertion position in exit block 3099 InsertPointOrErrorTy emitCommonDirectiveExit(omp::Directive OMPD, 3100 InsertPointTy FinIP, 3101 Instruction *ExitCall, 3102 bool HasFinalize = true); 3103 3104 /// Common Interface to generate OMP inlined regions 3105 /// 3106 /// \param OMPD Directive to generate inlined region for 3107 /// \param EntryCall Call to the entry OMP Runtime Function 3108 /// \param ExitCall Call to the ending OMP Runtime Function 3109 /// \param BodyGenCB Body code generation callback. 3110 /// \param FiniCB Finalization Callback. Will be called when finalizing region 3111 /// \param Conditional indicate if the entry call result will be used 3112 /// to evaluate a conditional of whether a thread will execute 3113 /// body code or not. 3114 /// \param HasFinalize indicate if the directive will require finalization 3115 /// and has a finalization callback in the stack that 3116 /// should be called. 3117 /// \param IsCancellable if HasFinalize is set to true, indicate if the 3118 /// the directive should be cancellable. 3119 /// \return The insertion point after the region 3120 InsertPointOrErrorTy 3121 EmitOMPInlinedRegion(omp::Directive OMPD, Instruction *EntryCall, 3122 Instruction *ExitCall, BodyGenCallbackTy BodyGenCB, 3123 FinalizeCallbackTy FiniCB, bool Conditional = false, 3124 bool HasFinalize = true, bool IsCancellable = false); 3125 3126 /// Get the platform-specific name separator. 3127 /// \param Parts different parts of the final name that needs separation 3128 /// \param FirstSeparator First separator used between the initial two 3129 /// parts of the name. 3130 /// \param Separator separator used between all of the rest consecutive 3131 /// parts of the name 3132 static std::string getNameWithSeparators(ArrayRef<StringRef> Parts, 3133 StringRef FirstSeparator, 3134 StringRef Separator); 3135 3136 /// Returns corresponding lock object for the specified critical region 3137 /// name. If the lock object does not exist it is created, otherwise the 3138 /// reference to the existing copy is returned. 3139 /// \param CriticalName Name of the critical region. 3140 /// 3141 Value *getOMPCriticalRegionLock(StringRef CriticalName); 3142 3143 /// Callback type for Atomic Expression update 3144 /// ex: 3145 /// \code{.cpp} 3146 /// unsigned x = 0; 3147 /// #pragma omp atomic update 3148 /// x = Expr(x_old); //Expr() is any legal operation 3149 /// \endcode 3150 /// 3151 /// \param XOld the value of the atomic memory address to use for update 3152 /// \param IRB reference to the IRBuilder to use 3153 /// 3154 /// \returns Value to update X to. 3155 using AtomicUpdateCallbackTy = 3156 const function_ref<Expected<Value *>(Value *XOld, IRBuilder<> &IRB)>; 3157 3158 private: 3159 enum AtomicKind { Read, Write, Update, Capture, Compare }; 3160 3161 /// Determine whether to emit flush or not 3162 /// 3163 /// \param Loc The insert and source location description. 3164 /// \param AO The required atomic ordering 3165 /// \param AK The OpenMP atomic operation kind used. 3166 /// 3167 /// \returns wether a flush was emitted or not 3168 bool checkAndEmitFlushAfterAtomic(const LocationDescription &Loc, 3169 AtomicOrdering AO, AtomicKind AK); 3170 3171 /// Emit atomic update for constructs: X = X BinOp Expr ,or X = Expr BinOp X 3172 /// For complex Operations: X = UpdateOp(X) => CmpExch X, old_X, UpdateOp(X) 3173 /// Only Scalar data types. 3174 /// 3175 /// \param AllocaIP The insertion point to be used for alloca 3176 /// instructions. 3177 /// \param X The target atomic pointer to be updated 3178 /// \param XElemTy The element type of the atomic pointer. 3179 /// \param Expr The value to update X with. 3180 /// \param AO Atomic ordering of the generated atomic 3181 /// instructions. 3182 /// \param RMWOp The binary operation used for update. If 3183 /// operation is not supported by atomicRMW, 3184 /// or belong to {FADD, FSUB, BAD_BINOP}. 3185 /// Then a `cmpExch` based atomic will be generated. 3186 /// \param UpdateOp Code generator for complex expressions that cannot be 3187 /// expressed through atomicrmw instruction. 3188 /// \param VolatileX true if \a X volatile? 3189 /// \param IsXBinopExpr true if \a X is Left H.S. in Right H.S. part of the 3190 /// update expression, false otherwise. 3191 /// (e.g. true for X = X BinOp Expr) 3192 /// 3193 /// \returns A pair of the old value of X before the update, and the value 3194 /// used for the update. 3195 Expected<std::pair<Value *, Value *>> 3196 emitAtomicUpdate(InsertPointTy AllocaIP, Value *X, Type *XElemTy, Value *Expr, 3197 AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp, 3198 AtomicUpdateCallbackTy &UpdateOp, bool VolatileX, 3199 bool IsXBinopExpr); 3200 3201 /// Emit the binary op. described by \p RMWOp, using \p Src1 and \p Src2 . 3202 /// 3203 /// \Return The instruction 3204 Value *emitRMWOpAsInstruction(Value *Src1, Value *Src2, 3205 AtomicRMWInst::BinOp RMWOp); 3206 3207 public: 3208 /// a struct to pack relevant information while generating atomic Ops 3209 struct AtomicOpValue { 3210 Value *Var = nullptr; 3211 Type *ElemTy = nullptr; 3212 bool IsSigned = false; 3213 bool IsVolatile = false; 3214 }; 3215 3216 /// Emit atomic Read for : V = X --- Only Scalar data types. 3217 /// 3218 /// \param Loc The insert and source location description. 3219 /// \param X The target pointer to be atomically read 3220 /// \param V Memory address where to store atomically read 3221 /// value 3222 /// \param AO Atomic ordering of the generated atomic 3223 /// instructions. 3224 /// 3225 /// \return Insertion point after generated atomic read IR. 3226 InsertPointTy createAtomicRead(const LocationDescription &Loc, 3227 AtomicOpValue &X, AtomicOpValue &V, 3228 AtomicOrdering AO); 3229 3230 /// Emit atomic write for : X = Expr --- Only Scalar data types. 3231 /// 3232 /// \param Loc The insert and source location description. 3233 /// \param X The target pointer to be atomically written to 3234 /// \param Expr The value to store. 3235 /// \param AO Atomic ordering of the generated atomic 3236 /// instructions. 3237 /// 3238 /// \return Insertion point after generated atomic Write IR. 3239 InsertPointTy createAtomicWrite(const LocationDescription &Loc, 3240 AtomicOpValue &X, Value *Expr, 3241 AtomicOrdering AO); 3242 3243 /// Emit atomic update for constructs: X = X BinOp Expr ,or X = Expr BinOp X 3244 /// For complex Operations: X = UpdateOp(X) => CmpExch X, old_X, UpdateOp(X) 3245 /// Only Scalar data types. 3246 /// 3247 /// \param Loc The insert and source location description. 3248 /// \param AllocaIP The insertion point to be used for alloca instructions. 3249 /// \param X The target atomic pointer to be updated 3250 /// \param Expr The value to update X with. 3251 /// \param AO Atomic ordering of the generated atomic instructions. 3252 /// \param RMWOp The binary operation used for update. If operation 3253 /// is not supported by atomicRMW, or belong to 3254 /// {FADD, FSUB, BAD_BINOP}. Then a `cmpExch` based 3255 /// atomic will be generated. 3256 /// \param UpdateOp Code generator for complex expressions that cannot be 3257 /// expressed through atomicrmw instruction. 3258 /// \param IsXBinopExpr true if \a X is Left H.S. in Right H.S. part of the 3259 /// update expression, false otherwise. 3260 /// (e.g. true for X = X BinOp Expr) 3261 /// 3262 /// \return Insertion point after generated atomic update IR. 3263 InsertPointOrErrorTy 3264 createAtomicUpdate(const LocationDescription &Loc, InsertPointTy AllocaIP, 3265 AtomicOpValue &X, Value *Expr, AtomicOrdering AO, 3266 AtomicRMWInst::BinOp RMWOp, 3267 AtomicUpdateCallbackTy &UpdateOp, bool IsXBinopExpr); 3268 3269 /// Emit atomic update for constructs: --- Only Scalar data types 3270 /// V = X; X = X BinOp Expr , 3271 /// X = X BinOp Expr; V = X, 3272 /// V = X; X = Expr BinOp X, 3273 /// X = Expr BinOp X; V = X, 3274 /// V = X; X = UpdateOp(X), 3275 /// X = UpdateOp(X); V = X, 3276 /// 3277 /// \param Loc The insert and source location description. 3278 /// \param AllocaIP The insertion point to be used for alloca instructions. 3279 /// \param X The target atomic pointer to be updated 3280 /// \param V Memory address where to store captured value 3281 /// \param Expr The value to update X with. 3282 /// \param AO Atomic ordering of the generated atomic instructions 3283 /// \param RMWOp The binary operation used for update. If 3284 /// operation is not supported by atomicRMW, or belong to 3285 /// {FADD, FSUB, BAD_BINOP}. Then a cmpExch based 3286 /// atomic will be generated. 3287 /// \param UpdateOp Code generator for complex expressions that cannot be 3288 /// expressed through atomicrmw instruction. 3289 /// \param UpdateExpr true if X is an in place update of the form 3290 /// X = X BinOp Expr or X = Expr BinOp X 3291 /// \param IsXBinopExpr true if X is Left H.S. in Right H.S. part of the 3292 /// update expression, false otherwise. 3293 /// (e.g. true for X = X BinOp Expr) 3294 /// \param IsPostfixUpdate true if original value of 'x' must be stored in 3295 /// 'v', not an updated one. 3296 /// 3297 /// \return Insertion point after generated atomic capture IR. 3298 InsertPointOrErrorTy 3299 createAtomicCapture(const LocationDescription &Loc, InsertPointTy AllocaIP, 3300 AtomicOpValue &X, AtomicOpValue &V, Value *Expr, 3301 AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp, 3302 AtomicUpdateCallbackTy &UpdateOp, bool UpdateExpr, 3303 bool IsPostfixUpdate, bool IsXBinopExpr); 3304 3305 /// Emit atomic compare for constructs: --- Only scalar data types 3306 /// cond-expr-stmt: 3307 /// x = x ordop expr ? expr : x; 3308 /// x = expr ordop x ? expr : x; 3309 /// x = x == e ? d : x; 3310 /// x = e == x ? d : x; (this one is not in the spec) 3311 /// cond-update-stmt: 3312 /// if (x ordop expr) { x = expr; } 3313 /// if (expr ordop x) { x = expr; } 3314 /// if (x == e) { x = d; } 3315 /// if (e == x) { x = d; } (this one is not in the spec) 3316 /// conditional-update-capture-atomic: 3317 /// v = x; cond-update-stmt; (IsPostfixUpdate=true, IsFailOnly=false) 3318 /// cond-update-stmt; v = x; (IsPostfixUpdate=false, IsFailOnly=false) 3319 /// if (x == e) { x = d; } else { v = x; } (IsPostfixUpdate=false, 3320 /// IsFailOnly=true) 3321 /// r = x == e; if (r) { x = d; } (IsPostfixUpdate=false, IsFailOnly=false) 3322 /// r = x == e; if (r) { x = d; } else { v = x; } (IsPostfixUpdate=false, 3323 /// IsFailOnly=true) 3324 /// 3325 /// \param Loc The insert and source location description. 3326 /// \param X The target atomic pointer to be updated. 3327 /// \param V Memory address where to store captured value (for 3328 /// compare capture only). 3329 /// \param R Memory address where to store comparison result 3330 /// (for compare capture with '==' only). 3331 /// \param E The expected value ('e') for forms that use an 3332 /// equality comparison or an expression ('expr') for 3333 /// forms that use 'ordop' (logically an atomic maximum or 3334 /// minimum). 3335 /// \param D The desired value for forms that use an equality 3336 /// comparison. If forms that use 'ordop', it should be 3337 /// \p nullptr. 3338 /// \param AO Atomic ordering of the generated atomic instructions. 3339 /// \param Op Atomic compare operation. It can only be ==, <, or >. 3340 /// \param IsXBinopExpr True if the conditional statement is in the form where 3341 /// x is on LHS. It only matters for < or >. 3342 /// \param IsPostfixUpdate True if original value of 'x' must be stored in 3343 /// 'v', not an updated one (for compare capture 3344 /// only). 3345 /// \param IsFailOnly True if the original value of 'x' is stored to 'v' 3346 /// only when the comparison fails. This is only valid for 3347 /// the case the comparison is '=='. 3348 /// 3349 /// \return Insertion point after generated atomic capture IR. 3350 InsertPointTy 3351 createAtomicCompare(const LocationDescription &Loc, AtomicOpValue &X, 3352 AtomicOpValue &V, AtomicOpValue &R, Value *E, Value *D, 3353 AtomicOrdering AO, omp::OMPAtomicCompareOp Op, 3354 bool IsXBinopExpr, bool IsPostfixUpdate, bool IsFailOnly); 3355 InsertPointTy createAtomicCompare(const LocationDescription &Loc, 3356 AtomicOpValue &X, AtomicOpValue &V, 3357 AtomicOpValue &R, Value *E, Value *D, 3358 AtomicOrdering AO, 3359 omp::OMPAtomicCompareOp Op, 3360 bool IsXBinopExpr, bool IsPostfixUpdate, 3361 bool IsFailOnly, AtomicOrdering Failure); 3362 3363 /// Create the control flow structure of a canonical OpenMP loop. 3364 /// 3365 /// The emitted loop will be disconnected, i.e. no edge to the loop's 3366 /// preheader and no terminator in the AfterBB. The OpenMPIRBuilder's 3367 /// IRBuilder location is not preserved. 3368 /// 3369 /// \param DL DebugLoc used for the instructions in the skeleton. 3370 /// \param TripCount Value to be used for the trip count. 3371 /// \param F Function in which to insert the BasicBlocks. 3372 /// \param PreInsertBefore Where to insert BBs that execute before the body, 3373 /// typically the body itself. 3374 /// \param PostInsertBefore Where to insert BBs that execute after the body. 3375 /// \param Name Base name used to derive BB 3376 /// and instruction names. 3377 /// 3378 /// \returns The CanonicalLoopInfo that represents the emitted loop. 3379 CanonicalLoopInfo *createLoopSkeleton(DebugLoc DL, Value *TripCount, 3380 Function *F, 3381 BasicBlock *PreInsertBefore, 3382 BasicBlock *PostInsertBefore, 3383 const Twine &Name = {}); 3384 /// OMP Offload Info Metadata name string 3385 const std::string ompOffloadInfoName = "omp_offload.info"; 3386 3387 /// Loads all the offload entries information from the host IR 3388 /// metadata. This function is only meant to be used with device code 3389 /// generation. 3390 /// 3391 /// \param M Module to load Metadata info from. Module passed maybe 3392 /// loaded from bitcode file, i.e, different from OpenMPIRBuilder::M module. 3393 void loadOffloadInfoMetadata(Module &M); 3394 3395 /// Loads all the offload entries information from the host IR 3396 /// metadata read from the file passed in as the HostFilePath argument. This 3397 /// function is only meant to be used with device code generation. 3398 /// 3399 /// \param HostFilePath The path to the host IR file, 3400 /// used to load in offload metadata for the device, allowing host and device 3401 /// to maintain the same metadata mapping. 3402 void loadOffloadInfoMetadata(StringRef HostFilePath); 3403 3404 /// Gets (if variable with the given name already exist) or creates 3405 /// internal global variable with the specified Name. The created variable has 3406 /// linkage CommonLinkage by default and is initialized by null value. 3407 /// \param Ty Type of the global variable. If it is exist already the type 3408 /// must be the same. 3409 /// \param Name Name of the variable. 3410 GlobalVariable *getOrCreateInternalVariable(Type *Ty, const StringRef &Name, 3411 unsigned AddressSpace = 0); 3412 }; 3413 3414 /// Class to represented the control flow structure of an OpenMP canonical loop. 3415 /// 3416 /// The control-flow structure is standardized for easy consumption by 3417 /// directives associated with loops. For instance, the worksharing-loop 3418 /// construct may change this control flow such that each loop iteration is 3419 /// executed on only one thread. The constraints of a canonical loop in brief 3420 /// are: 3421 /// 3422 /// * The number of loop iterations must have been computed before entering the 3423 /// loop. 3424 /// 3425 /// * Has an (unsigned) logical induction variable that starts at zero and 3426 /// increments by one. 3427 /// 3428 /// * The loop's CFG itself has no side-effects. The OpenMP specification 3429 /// itself allows side-effects, but the order in which they happen, including 3430 /// how often or whether at all, is unspecified. We expect that the frontend 3431 /// will emit those side-effect instructions somewhere (e.g. before the loop) 3432 /// such that the CanonicalLoopInfo itself can be side-effect free. 3433 /// 3434 /// Keep in mind that CanonicalLoopInfo is meant to only describe a repeated 3435 /// execution of a loop body that satifies these constraints. It does NOT 3436 /// represent arbitrary SESE regions that happen to contain a loop. Do not use 3437 /// CanonicalLoopInfo for such purposes. 3438 /// 3439 /// The control flow can be described as follows: 3440 /// 3441 /// Preheader 3442 /// | 3443 /// /-> Header 3444 /// | | 3445 /// | Cond---\ 3446 /// | | | 3447 /// | Body | 3448 /// | | | | 3449 /// | <...> | 3450 /// | | | | 3451 /// \--Latch | 3452 /// | 3453 /// Exit 3454 /// | 3455 /// After 3456 /// 3457 /// The loop is thought to start at PreheaderIP (at the Preheader's terminator, 3458 /// including) and end at AfterIP (at the After's first instruction, excluding). 3459 /// That is, instructions in the Preheader and After blocks (except the 3460 /// Preheader's terminator) are out of CanonicalLoopInfo's control and may have 3461 /// side-effects. Typically, the Preheader is used to compute the loop's trip 3462 /// count. The instructions from BodyIP (at the Body block's first instruction, 3463 /// excluding) until the Latch are also considered outside CanonicalLoopInfo's 3464 /// control and thus can have side-effects. The body block is the single entry 3465 /// point into the loop body, which may contain arbitrary control flow as long 3466 /// as all control paths eventually branch to the Latch block. 3467 /// 3468 /// TODO: Consider adding another standardized BasicBlock between Body CFG and 3469 /// Latch to guarantee that there is only a single edge to the latch. It would 3470 /// make loop transformations easier to not needing to consider multiple 3471 /// predecessors of the latch (See redirectAllPredecessorsTo) and would give us 3472 /// an equivalant to PreheaderIP, AfterIP and BodyIP for inserting code that 3473 /// executes after each body iteration. 3474 /// 3475 /// There must be no loop-carried dependencies through llvm::Values. This is 3476 /// equivalant to that the Latch has no PHINode and the Header's only PHINode is 3477 /// for the induction variable. 3478 /// 3479 /// All code in Header, Cond, Latch and Exit (plus the terminator of the 3480 /// Preheader) are CanonicalLoopInfo's responsibility and their build-up checked 3481 /// by assertOK(). They are expected to not be modified unless explicitly 3482 /// modifying the CanonicalLoopInfo through a methods that applies a OpenMP 3483 /// loop-associated construct such as applyWorkshareLoop, tileLoops, unrollLoop, 3484 /// etc. These methods usually invalidate the CanonicalLoopInfo and re-use its 3485 /// basic blocks. After invalidation, the CanonicalLoopInfo must not be used 3486 /// anymore as its underlying control flow may not exist anymore. 3487 /// Loop-transformation methods such as tileLoops, collapseLoops and unrollLoop 3488 /// may also return a new CanonicalLoopInfo that can be passed to other 3489 /// loop-associated construct implementing methods. These loop-transforming 3490 /// methods may either create a new CanonicalLoopInfo usually using 3491 /// createLoopSkeleton and invalidate the input CanonicalLoopInfo, or reuse and 3492 /// modify one of the input CanonicalLoopInfo and return it as representing the 3493 /// modified loop. What is done is an implementation detail of 3494 /// transformation-implementing method and callers should always assume that the 3495 /// CanonicalLoopInfo passed to it is invalidated and a new object is returned. 3496 /// Returned CanonicalLoopInfo have the same structure and guarantees as the one 3497 /// created by createCanonicalLoop, such that transforming methods do not have 3498 /// to special case where the CanonicalLoopInfo originated from. 3499 /// 3500 /// Generally, methods consuming CanonicalLoopInfo do not need an 3501 /// OpenMPIRBuilder::InsertPointTy as argument, but use the locations of the 3502 /// CanonicalLoopInfo to insert new or modify existing instructions. Unless 3503 /// documented otherwise, methods consuming CanonicalLoopInfo do not invalidate 3504 /// any InsertPoint that is outside CanonicalLoopInfo's control. Specifically, 3505 /// any InsertPoint in the Preheader, After or Block can still be used after 3506 /// calling such a method. 3507 /// 3508 /// TODO: Provide mechanisms for exception handling and cancellation points. 3509 /// 3510 /// Defined outside OpenMPIRBuilder because nested classes cannot be 3511 /// forward-declared, e.g. to avoid having to include the entire OMPIRBuilder.h. 3512 class CanonicalLoopInfo { 3513 friend class OpenMPIRBuilder; 3514 3515 private: 3516 BasicBlock *Header = nullptr; 3517 BasicBlock *Cond = nullptr; 3518 BasicBlock *Latch = nullptr; 3519 BasicBlock *Exit = nullptr; 3520 3521 /// Add the control blocks of this loop to \p BBs. 3522 /// 3523 /// This does not include any block from the body, including the one returned 3524 /// by getBody(). 3525 /// 3526 /// FIXME: This currently includes the Preheader and After blocks even though 3527 /// their content is (mostly) not under CanonicalLoopInfo's control. 3528 /// Re-evaluated whether this makes sense. 3529 void collectControlBlocks(SmallVectorImpl<BasicBlock *> &BBs); 3530 3531 /// Sets the number of loop iterations to the given value. This value must be 3532 /// valid in the condition block (i.e., defined in the preheader) and is 3533 /// interpreted as an unsigned integer. 3534 void setTripCount(Value *TripCount); 3535 3536 /// Replace all uses of the canonical induction variable in the loop body with 3537 /// a new one. 3538 /// 3539 /// The intended use case is to update the induction variable for an updated 3540 /// iteration space such that it can stay normalized in the 0...tripcount-1 3541 /// range. 3542 /// 3543 /// The \p Updater is called with the (presumable updated) current normalized 3544 /// induction variable and is expected to return the value that uses of the 3545 /// pre-updated induction values should use instead, typically dependent on 3546 /// the new induction variable. This is a lambda (instead of e.g. just passing 3547 /// the new value) to be able to distinguish the uses of the pre-updated 3548 /// induction variable and uses of the induction varible to compute the 3549 /// updated induction variable value. 3550 void mapIndVar(llvm::function_ref<Value *(Instruction *)> Updater); 3551 3552 public: 3553 /// Returns whether this object currently represents the IR of a loop. If 3554 /// returning false, it may have been consumed by a loop transformation or not 3555 /// been intialized. Do not use in this case; 3556 bool isValid() const { return Header; } 3557 3558 /// The preheader ensures that there is only a single edge entering the loop. 3559 /// Code that must be execute before any loop iteration can be emitted here, 3560 /// such as computing the loop trip count and begin lifetime markers. Code in 3561 /// the preheader is not considered part of the canonical loop. 3562 BasicBlock *getPreheader() const; 3563 3564 /// The header is the entry for each iteration. In the canonical control flow, 3565 /// it only contains the PHINode for the induction variable. 3566 BasicBlock *getHeader() const { 3567 assert(isValid() && "Requires a valid canonical loop"); 3568 return Header; 3569 } 3570 3571 /// The condition block computes whether there is another loop iteration. If 3572 /// yes, branches to the body; otherwise to the exit block. 3573 BasicBlock *getCond() const { 3574 assert(isValid() && "Requires a valid canonical loop"); 3575 return Cond; 3576 } 3577 3578 /// The body block is the single entry for a loop iteration and not controlled 3579 /// by CanonicalLoopInfo. It can contain arbitrary control flow but must 3580 /// eventually branch to the \p Latch block. 3581 BasicBlock *getBody() const { 3582 assert(isValid() && "Requires a valid canonical loop"); 3583 return cast<BranchInst>(Cond->getTerminator())->getSuccessor(0); 3584 } 3585 3586 /// Reaching the latch indicates the end of the loop body code. In the 3587 /// canonical control flow, it only contains the increment of the induction 3588 /// variable. 3589 BasicBlock *getLatch() const { 3590 assert(isValid() && "Requires a valid canonical loop"); 3591 return Latch; 3592 } 3593 3594 /// Reaching the exit indicates no more iterations are being executed. 3595 BasicBlock *getExit() const { 3596 assert(isValid() && "Requires a valid canonical loop"); 3597 return Exit; 3598 } 3599 3600 /// The after block is intended for clean-up code such as lifetime end 3601 /// markers. It is separate from the exit block to ensure, analogous to the 3602 /// preheader, it having just a single entry edge and being free from PHI 3603 /// nodes should there be multiple loop exits (such as from break 3604 /// statements/cancellations). 3605 BasicBlock *getAfter() const { 3606 assert(isValid() && "Requires a valid canonical loop"); 3607 return Exit->getSingleSuccessor(); 3608 } 3609 3610 /// Returns the llvm::Value containing the number of loop iterations. It must 3611 /// be valid in the preheader and always interpreted as an unsigned integer of 3612 /// any bit-width. 3613 Value *getTripCount() const { 3614 assert(isValid() && "Requires a valid canonical loop"); 3615 Instruction *CmpI = &Cond->front(); 3616 assert(isa<CmpInst>(CmpI) && "First inst must compare IV with TripCount"); 3617 return CmpI->getOperand(1); 3618 } 3619 3620 /// Returns the instruction representing the current logical induction 3621 /// variable. Always unsigned, always starting at 0 with an increment of one. 3622 Instruction *getIndVar() const { 3623 assert(isValid() && "Requires a valid canonical loop"); 3624 Instruction *IndVarPHI = &Header->front(); 3625 assert(isa<PHINode>(IndVarPHI) && "First inst must be the IV PHI"); 3626 return IndVarPHI; 3627 } 3628 3629 /// Return the type of the induction variable (and the trip count). 3630 Type *getIndVarType() const { 3631 assert(isValid() && "Requires a valid canonical loop"); 3632 return getIndVar()->getType(); 3633 } 3634 3635 /// Return the insertion point for user code before the loop. 3636 OpenMPIRBuilder::InsertPointTy getPreheaderIP() const { 3637 assert(isValid() && "Requires a valid canonical loop"); 3638 BasicBlock *Preheader = getPreheader(); 3639 return {Preheader, std::prev(Preheader->end())}; 3640 }; 3641 3642 /// Return the insertion point for user code in the body. 3643 OpenMPIRBuilder::InsertPointTy getBodyIP() const { 3644 assert(isValid() && "Requires a valid canonical loop"); 3645 BasicBlock *Body = getBody(); 3646 return {Body, Body->begin()}; 3647 }; 3648 3649 /// Return the insertion point for user code after the loop. 3650 OpenMPIRBuilder::InsertPointTy getAfterIP() const { 3651 assert(isValid() && "Requires a valid canonical loop"); 3652 BasicBlock *After = getAfter(); 3653 return {After, After->begin()}; 3654 }; 3655 3656 Function *getFunction() const { 3657 assert(isValid() && "Requires a valid canonical loop"); 3658 return Header->getParent(); 3659 } 3660 3661 /// Consistency self-check. 3662 void assertOK() const; 3663 3664 /// Invalidate this loop. That is, the underlying IR does not fulfill the 3665 /// requirements of an OpenMP canonical loop anymore. 3666 void invalidate(); 3667 }; 3668 3669 } // end namespace llvm 3670 3671 #endif // LLVM_FRONTEND_OPENMP_OMPIRBUILDER_H 3672