xref: /llvm-project/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h (revision 1d890b06eedf0cc6746873a5c69b761a0a43cc35)
1 //===- IR/OpenMPIRBuilder.h - OpenMP encoding builder for LLVM IR - C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file defines the OpenMPIRBuilder class and helpers used as a convenient
10 // way to create LLVM instructions for OpenMP directives.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #ifndef LLVM_FRONTEND_OPENMP_OMPIRBUILDER_H
15 #define LLVM_FRONTEND_OPENMP_OMPIRBUILDER_H
16 
17 #include "llvm/Analysis/MemorySSAUpdater.h"
18 #include "llvm/Frontend/Atomic/Atomic.h"
19 #include "llvm/Frontend/OpenMP/OMPConstants.h"
20 #include "llvm/Frontend/OpenMP/OMPGridValues.h"
21 #include "llvm/IR/DebugLoc.h"
22 #include "llvm/IR/IRBuilder.h"
23 #include "llvm/IR/Module.h"
24 #include "llvm/Support/Allocator.h"
25 #include "llvm/TargetParser/Triple.h"
26 #include <forward_list>
27 #include <map>
28 #include <optional>
29 
30 namespace llvm {
31 class CanonicalLoopInfo;
32 struct TargetRegionEntryInfo;
33 class OffloadEntriesInfoManager;
34 class OpenMPIRBuilder;
35 
36 /// Move the instruction after an InsertPoint to the beginning of another
37 /// BasicBlock.
38 ///
39 /// The instructions after \p IP are moved to the beginning of \p New which must
40 /// not have any PHINodes. If \p CreateBranch is true, a branch instruction to
41 /// \p New will be added such that there is no semantic change. Otherwise, the
42 /// \p IP insert block remains degenerate and it is up to the caller to insert a
43 /// terminator.
44 void spliceBB(IRBuilderBase::InsertPoint IP, BasicBlock *New,
45               bool CreateBranch);
46 
47 /// Splice a BasicBlock at an IRBuilder's current insertion point. Its new
48 /// insert location will stick to after the instruction before the insertion
49 /// point (instead of moving with the instruction the InsertPoint stores
50 /// internally).
51 void spliceBB(IRBuilder<> &Builder, BasicBlock *New, bool CreateBranch);
52 
53 /// Split a BasicBlock at an InsertPoint, even if the block is degenerate
54 /// (missing the terminator).
55 ///
56 /// llvm::SplitBasicBlock and BasicBlock::splitBasicBlock require a well-formed
57 /// BasicBlock. \p Name is used for the new successor block. If \p CreateBranch
58 /// is true, a branch to the new successor will new created such that
59 /// semantically there is no change; otherwise the block of the insertion point
60 /// remains degenerate and it is the caller's responsibility to insert a
61 /// terminator. Returns the new successor block.
62 BasicBlock *splitBB(IRBuilderBase::InsertPoint IP, bool CreateBranch,
63                     llvm::Twine Name = {});
64 
65 /// Split a BasicBlock at \p Builder's insertion point, even if the block is
66 /// degenerate (missing the terminator).  Its new insert location will stick to
67 /// after the instruction before the insertion point (instead of moving with the
68 /// instruction the InsertPoint stores internally).
69 BasicBlock *splitBB(IRBuilderBase &Builder, bool CreateBranch,
70                     llvm::Twine Name = {});
71 
72 /// Split a BasicBlock at \p Builder's insertion point, even if the block is
73 /// degenerate (missing the terminator).  Its new insert location will stick to
74 /// after the instruction before the insertion point (instead of moving with the
75 /// instruction the InsertPoint stores internally).
76 BasicBlock *splitBB(IRBuilder<> &Builder, bool CreateBranch, llvm::Twine Name);
77 
78 /// Like splitBB, but reuses the current block's name for the new name.
79 BasicBlock *splitBBWithSuffix(IRBuilderBase &Builder, bool CreateBranch,
80                               llvm::Twine Suffix = ".split");
81 
82 /// Captures attributes that affect generating LLVM-IR using the
83 /// OpenMPIRBuilder and related classes. Note that not all attributes are
84 /// required for all classes or functions. In some use cases the configuration
85 /// is not necessary at all, because because the only functions that are called
86 /// are ones that are not dependent on the configuration.
87 class OpenMPIRBuilderConfig {
88 public:
89   /// Flag to define whether to generate code for the role of the OpenMP host
90   /// (if set to false) or device (if set to true) in an offloading context. It
91   /// is set when the -fopenmp-is-target-device compiler frontend option is
92   /// specified.
93   std::optional<bool> IsTargetDevice;
94 
95   /// Flag for specifying if the compilation is done for an accelerator. It is
96   /// set according to the architecture of the target triple and currently only
97   /// true when targeting AMDGPU or NVPTX. Today, these targets can only perform
98   /// the role of an OpenMP target device, so `IsTargetDevice` must also be true
99   /// if `IsGPU` is true. This restriction might be lifted if an accelerator-
100   /// like target with the ability to work as the OpenMP host is added, or if
101   /// the capabilities of the currently supported GPU architectures are
102   /// expanded.
103   std::optional<bool> IsGPU;
104 
105   /// Flag for specifying if LLVMUsed information should be emitted.
106   std::optional<bool> EmitLLVMUsedMetaInfo;
107 
108   /// Flag for specifying if offloading is mandatory.
109   std::optional<bool> OpenMPOffloadMandatory;
110 
111   /// First separator used between the initial two parts of a name.
112   std::optional<StringRef> FirstSeparator;
113   /// Separator used between all of the rest consecutive parts of s name
114   std::optional<StringRef> Separator;
115 
116   // Grid Value for the GPU target
117   std::optional<omp::GV> GridValue;
118 
119   /// When compilation is being done for the OpenMP host (i.e. `IsTargetDevice =
120   /// false`), this contains the list of offloading triples associated, if any.
121   SmallVector<Triple> TargetTriples;
122 
123   OpenMPIRBuilderConfig();
124   OpenMPIRBuilderConfig(bool IsTargetDevice, bool IsGPU,
125                         bool OpenMPOffloadMandatory,
126                         bool HasRequiresReverseOffload,
127                         bool HasRequiresUnifiedAddress,
128                         bool HasRequiresUnifiedSharedMemory,
129                         bool HasRequiresDynamicAllocators);
130 
131   // Getters functions that assert if the required values are not present.
132   bool isTargetDevice() const {
133     assert(IsTargetDevice.has_value() && "IsTargetDevice is not set");
134     return *IsTargetDevice;
135   }
136 
137   bool isGPU() const {
138     assert(IsGPU.has_value() && "IsGPU is not set");
139     return *IsGPU;
140   }
141 
142   bool openMPOffloadMandatory() const {
143     assert(OpenMPOffloadMandatory.has_value() &&
144            "OpenMPOffloadMandatory is not set");
145     return *OpenMPOffloadMandatory;
146   }
147 
148   omp::GV getGridValue() const {
149     assert(GridValue.has_value() && "GridValue is not set");
150     return *GridValue;
151   }
152 
153   bool hasRequiresFlags() const { return RequiresFlags; }
154   bool hasRequiresReverseOffload() const;
155   bool hasRequiresUnifiedAddress() const;
156   bool hasRequiresUnifiedSharedMemory() const;
157   bool hasRequiresDynamicAllocators() const;
158 
159   /// Returns requires directive clauses as flags compatible with those expected
160   /// by libomptarget.
161   int64_t getRequiresFlags() const;
162 
163   // Returns the FirstSeparator if set, otherwise use the default separator
164   // depending on isGPU
165   StringRef firstSeparator() const {
166     if (FirstSeparator.has_value())
167       return *FirstSeparator;
168     if (isGPU())
169       return "_";
170     return ".";
171   }
172 
173   // Returns the Separator if set, otherwise use the default separator depending
174   // on isGPU
175   StringRef separator() const {
176     if (Separator.has_value())
177       return *Separator;
178     if (isGPU())
179       return "$";
180     return ".";
181   }
182 
183   void setIsTargetDevice(bool Value) { IsTargetDevice = Value; }
184   void setIsGPU(bool Value) { IsGPU = Value; }
185   void setEmitLLVMUsed(bool Value = true) { EmitLLVMUsedMetaInfo = Value; }
186   void setOpenMPOffloadMandatory(bool Value) { OpenMPOffloadMandatory = Value; }
187   void setFirstSeparator(StringRef FS) { FirstSeparator = FS; }
188   void setSeparator(StringRef S) { Separator = S; }
189   void setGridValue(omp::GV G) { GridValue = G; }
190 
191   void setHasRequiresReverseOffload(bool Value);
192   void setHasRequiresUnifiedAddress(bool Value);
193   void setHasRequiresUnifiedSharedMemory(bool Value);
194   void setHasRequiresDynamicAllocators(bool Value);
195 
196 private:
197   /// Flags for specifying which requires directive clauses are present.
198   int64_t RequiresFlags;
199 };
200 
201 /// Data structure to contain the information needed to uniquely identify
202 /// a target entry.
203 struct TargetRegionEntryInfo {
204   /// The prefix used for kernel names.
205   static constexpr const char *KernelNamePrefix = "__omp_offloading_";
206 
207   std::string ParentName;
208   unsigned DeviceID;
209   unsigned FileID;
210   unsigned Line;
211   unsigned Count;
212 
213   TargetRegionEntryInfo() : DeviceID(0), FileID(0), Line(0), Count(0) {}
214   TargetRegionEntryInfo(StringRef ParentName, unsigned DeviceID,
215                         unsigned FileID, unsigned Line, unsigned Count = 0)
216       : ParentName(ParentName), DeviceID(DeviceID), FileID(FileID), Line(Line),
217         Count(Count) {}
218 
219   static void getTargetRegionEntryFnName(SmallVectorImpl<char> &Name,
220                                          StringRef ParentName,
221                                          unsigned DeviceID, unsigned FileID,
222                                          unsigned Line, unsigned Count);
223 
224   bool operator<(const TargetRegionEntryInfo &RHS) const {
225     return std::make_tuple(ParentName, DeviceID, FileID, Line, Count) <
226            std::make_tuple(RHS.ParentName, RHS.DeviceID, RHS.FileID, RHS.Line,
227                            RHS.Count);
228   }
229 };
230 
231 /// Class that manages information about offload code regions and data
232 class OffloadEntriesInfoManager {
233   /// Number of entries registered so far.
234   OpenMPIRBuilder *OMPBuilder;
235   unsigned OffloadingEntriesNum = 0;
236 
237 public:
238   /// Base class of the entries info.
239   class OffloadEntryInfo {
240   public:
241     /// Kind of a given entry.
242     enum OffloadingEntryInfoKinds : unsigned {
243       /// Entry is a target region.
244       OffloadingEntryInfoTargetRegion = 0,
245       /// Entry is a declare target variable.
246       OffloadingEntryInfoDeviceGlobalVar = 1,
247       /// Invalid entry info.
248       OffloadingEntryInfoInvalid = ~0u
249     };
250 
251   protected:
252     OffloadEntryInfo() = delete;
253     explicit OffloadEntryInfo(OffloadingEntryInfoKinds Kind) : Kind(Kind) {}
254     explicit OffloadEntryInfo(OffloadingEntryInfoKinds Kind, unsigned Order,
255                               uint32_t Flags)
256         : Flags(Flags), Order(Order), Kind(Kind) {}
257     ~OffloadEntryInfo() = default;
258 
259   public:
260     bool isValid() const { return Order != ~0u; }
261     unsigned getOrder() const { return Order; }
262     OffloadingEntryInfoKinds getKind() const { return Kind; }
263     uint32_t getFlags() const { return Flags; }
264     void setFlags(uint32_t NewFlags) { Flags = NewFlags; }
265     Constant *getAddress() const { return cast_or_null<Constant>(Addr); }
266     void setAddress(Constant *V) {
267       assert(!Addr.pointsToAliveValue() && "Address has been set before!");
268       Addr = V;
269     }
270     static bool classof(const OffloadEntryInfo *Info) { return true; }
271 
272   private:
273     /// Address of the entity that has to be mapped for offloading.
274     WeakTrackingVH Addr;
275 
276     /// Flags associated with the device global.
277     uint32_t Flags = 0u;
278 
279     /// Order this entry was emitted.
280     unsigned Order = ~0u;
281 
282     OffloadingEntryInfoKinds Kind = OffloadingEntryInfoInvalid;
283   };
284 
285   /// Return true if a there are no entries defined.
286   bool empty() const;
287   /// Return number of entries defined so far.
288   unsigned size() const { return OffloadingEntriesNum; }
289 
290   OffloadEntriesInfoManager(OpenMPIRBuilder *builder) : OMPBuilder(builder) {}
291 
292   //
293   // Target region entries related.
294   //
295 
296   /// Kind of the target registry entry.
297   enum OMPTargetRegionEntryKind : uint32_t {
298     /// Mark the entry as target region.
299     OMPTargetRegionEntryTargetRegion = 0x0,
300   };
301 
302   /// Target region entries info.
303   class OffloadEntryInfoTargetRegion final : public OffloadEntryInfo {
304     /// Address that can be used as the ID of the entry.
305     Constant *ID = nullptr;
306 
307   public:
308     OffloadEntryInfoTargetRegion()
309         : OffloadEntryInfo(OffloadingEntryInfoTargetRegion) {}
310     explicit OffloadEntryInfoTargetRegion(unsigned Order, Constant *Addr,
311                                           Constant *ID,
312                                           OMPTargetRegionEntryKind Flags)
313         : OffloadEntryInfo(OffloadingEntryInfoTargetRegion, Order, Flags),
314           ID(ID) {
315       setAddress(Addr);
316     }
317 
318     Constant *getID() const { return ID; }
319     void setID(Constant *V) {
320       assert(!ID && "ID has been set before!");
321       ID = V;
322     }
323     static bool classof(const OffloadEntryInfo *Info) {
324       return Info->getKind() == OffloadingEntryInfoTargetRegion;
325     }
326   };
327 
328   /// Initialize target region entry.
329   /// This is ONLY needed for DEVICE compilation.
330   void initializeTargetRegionEntryInfo(const TargetRegionEntryInfo &EntryInfo,
331                                        unsigned Order);
332   /// Register target region entry.
333   void registerTargetRegionEntryInfo(TargetRegionEntryInfo EntryInfo,
334                                      Constant *Addr, Constant *ID,
335                                      OMPTargetRegionEntryKind Flags);
336   /// Return true if a target region entry with the provided information
337   /// exists.
338   bool hasTargetRegionEntryInfo(TargetRegionEntryInfo EntryInfo,
339                                 bool IgnoreAddressId = false) const;
340 
341   // Return the Name based on \a EntryInfo using the next available Count.
342   void getTargetRegionEntryFnName(SmallVectorImpl<char> &Name,
343                                   const TargetRegionEntryInfo &EntryInfo);
344 
345   /// brief Applies action \a Action on all registered entries.
346   typedef function_ref<void(const TargetRegionEntryInfo &EntryInfo,
347                             const OffloadEntryInfoTargetRegion &)>
348       OffloadTargetRegionEntryInfoActTy;
349   void
350   actOnTargetRegionEntriesInfo(const OffloadTargetRegionEntryInfoActTy &Action);
351 
352   //
353   // Device global variable entries related.
354   //
355 
356   /// Kind of the global variable entry..
357   enum OMPTargetGlobalVarEntryKind : uint32_t {
358     /// Mark the entry as a to declare target.
359     OMPTargetGlobalVarEntryTo = 0x0,
360     /// Mark the entry as a to declare target link.
361     OMPTargetGlobalVarEntryLink = 0x1,
362     /// Mark the entry as a declare target enter.
363     OMPTargetGlobalVarEntryEnter = 0x2,
364     /// Mark the entry as having no declare target entry kind.
365     OMPTargetGlobalVarEntryNone = 0x3,
366     /// Mark the entry as a declare target indirect global.
367     OMPTargetGlobalVarEntryIndirect = 0x8,
368     /// Mark the entry as a register requires global.
369     OMPTargetGlobalRegisterRequires = 0x10,
370   };
371 
372   /// Kind of device clause for declare target variables
373   /// and functions
374   /// NOTE: Currently not used as a part of a variable entry
375   /// used for Flang and Clang to interface with the variable
376   /// related registration functions
377   enum OMPTargetDeviceClauseKind : uint32_t {
378     /// The target is marked for all devices
379     OMPTargetDeviceClauseAny = 0x0,
380     /// The target is marked for non-host devices
381     OMPTargetDeviceClauseNoHost = 0x1,
382     /// The target is marked for host devices
383     OMPTargetDeviceClauseHost = 0x2,
384     /// The target is marked as having no clause
385     OMPTargetDeviceClauseNone = 0x3
386   };
387 
388   /// Device global variable entries info.
389   class OffloadEntryInfoDeviceGlobalVar final : public OffloadEntryInfo {
390     /// Type of the global variable.
391     int64_t VarSize;
392     GlobalValue::LinkageTypes Linkage;
393     const std::string VarName;
394 
395   public:
396     OffloadEntryInfoDeviceGlobalVar()
397         : OffloadEntryInfo(OffloadingEntryInfoDeviceGlobalVar) {}
398     explicit OffloadEntryInfoDeviceGlobalVar(unsigned Order,
399                                              OMPTargetGlobalVarEntryKind Flags)
400         : OffloadEntryInfo(OffloadingEntryInfoDeviceGlobalVar, Order, Flags) {}
401     explicit OffloadEntryInfoDeviceGlobalVar(unsigned Order, Constant *Addr,
402                                              int64_t VarSize,
403                                              OMPTargetGlobalVarEntryKind Flags,
404                                              GlobalValue::LinkageTypes Linkage,
405                                              const std::string &VarName)
406         : OffloadEntryInfo(OffloadingEntryInfoDeviceGlobalVar, Order, Flags),
407           VarSize(VarSize), Linkage(Linkage), VarName(VarName) {
408       setAddress(Addr);
409     }
410 
411     int64_t getVarSize() const { return VarSize; }
412     StringRef getVarName() const { return VarName; }
413     void setVarSize(int64_t Size) { VarSize = Size; }
414     GlobalValue::LinkageTypes getLinkage() const { return Linkage; }
415     void setLinkage(GlobalValue::LinkageTypes LT) { Linkage = LT; }
416     static bool classof(const OffloadEntryInfo *Info) {
417       return Info->getKind() == OffloadingEntryInfoDeviceGlobalVar;
418     }
419   };
420 
421   /// Initialize device global variable entry.
422   /// This is ONLY used for DEVICE compilation.
423   void initializeDeviceGlobalVarEntryInfo(StringRef Name,
424                                           OMPTargetGlobalVarEntryKind Flags,
425                                           unsigned Order);
426 
427   /// Register device global variable entry.
428   void registerDeviceGlobalVarEntryInfo(StringRef VarName, Constant *Addr,
429                                         int64_t VarSize,
430                                         OMPTargetGlobalVarEntryKind Flags,
431                                         GlobalValue::LinkageTypes Linkage);
432   /// Checks if the variable with the given name has been registered already.
433   bool hasDeviceGlobalVarEntryInfo(StringRef VarName) const {
434     return OffloadEntriesDeviceGlobalVar.count(VarName) > 0;
435   }
436   /// Applies action \a Action on all registered entries.
437   typedef function_ref<void(StringRef, const OffloadEntryInfoDeviceGlobalVar &)>
438       OffloadDeviceGlobalVarEntryInfoActTy;
439   void actOnDeviceGlobalVarEntriesInfo(
440       const OffloadDeviceGlobalVarEntryInfoActTy &Action);
441 
442 private:
443   /// Return the count of entries at a particular source location.
444   unsigned
445   getTargetRegionEntryInfoCount(const TargetRegionEntryInfo &EntryInfo) const;
446 
447   /// Update the count of entries at a particular source location.
448   void
449   incrementTargetRegionEntryInfoCount(const TargetRegionEntryInfo &EntryInfo);
450 
451   static TargetRegionEntryInfo
452   getTargetRegionEntryCountKey(const TargetRegionEntryInfo &EntryInfo) {
453     return TargetRegionEntryInfo(EntryInfo.ParentName, EntryInfo.DeviceID,
454                                  EntryInfo.FileID, EntryInfo.Line, 0);
455   }
456 
457   // Count of entries at a location.
458   std::map<TargetRegionEntryInfo, unsigned> OffloadEntriesTargetRegionCount;
459 
460   // Storage for target region entries kind.
461   typedef std::map<TargetRegionEntryInfo, OffloadEntryInfoTargetRegion>
462       OffloadEntriesTargetRegionTy;
463   OffloadEntriesTargetRegionTy OffloadEntriesTargetRegion;
464   /// Storage for device global variable entries kind. The storage is to be
465   /// indexed by mangled name.
466   typedef StringMap<OffloadEntryInfoDeviceGlobalVar>
467       OffloadEntriesDeviceGlobalVarTy;
468   OffloadEntriesDeviceGlobalVarTy OffloadEntriesDeviceGlobalVar;
469 };
470 
471 /// An interface to create LLVM-IR for OpenMP directives.
472 ///
473 /// Each OpenMP directive has a corresponding public generator method.
474 class OpenMPIRBuilder {
475 public:
476   /// Create a new OpenMPIRBuilder operating on the given module \p M. This will
477   /// not have an effect on \p M (see initialize)
478   OpenMPIRBuilder(Module &M)
479       : M(M), Builder(M.getContext()), OffloadInfoManager(this),
480         T(Triple(M.getTargetTriple())) {}
481   ~OpenMPIRBuilder();
482 
483   class AtomicInfo : public llvm::AtomicInfo {
484     llvm::Value *AtomicVar;
485 
486   public:
487     AtomicInfo(IRBuilder<> *Builder, llvm::Type *Ty, uint64_t AtomicSizeInBits,
488                uint64_t ValueSizeInBits, llvm::Align AtomicAlign,
489                llvm::Align ValueAlign, bool UseLibcall, llvm::Value *AtomicVar)
490         : llvm::AtomicInfo(Builder, Ty, AtomicSizeInBits, ValueSizeInBits,
491                            AtomicAlign, ValueAlign, UseLibcall),
492           AtomicVar(AtomicVar) {}
493 
494     llvm::Value *getAtomicPointer() const override { return AtomicVar; }
495     void decorateWithTBAA(llvm::Instruction *I) override {}
496     llvm::AllocaInst *CreateAlloca(llvm::Type *Ty,
497                                    const llvm::Twine &Name) const override {
498       llvm::AllocaInst *allocaInst = Builder->CreateAlloca(Ty);
499       allocaInst->setName(Name);
500       return allocaInst;
501     }
502   };
503   /// Initialize the internal state, this will put structures types and
504   /// potentially other helpers into the underlying module. Must be called
505   /// before any other method and only once! This internal state includes types
506   /// used in the OpenMPIRBuilder generated from OMPKinds.def.
507   void initialize();
508 
509   void setConfig(OpenMPIRBuilderConfig C) { Config = C; }
510 
511   /// Finalize the underlying module, e.g., by outlining regions.
512   /// \param Fn                    The function to be finalized. If not used,
513   ///                              all functions are finalized.
514   void finalize(Function *Fn = nullptr);
515 
516   /// Add attributes known for \p FnID to \p Fn.
517   void addAttributes(omp::RuntimeFunction FnID, Function &Fn);
518 
519   /// Type used throughout for insertion points.
520   using InsertPointTy = IRBuilder<>::InsertPoint;
521 
522   /// Type used to represent an insertion point or an error value.
523   using InsertPointOrErrorTy = Expected<InsertPointTy>;
524 
525   /// Get the create a name using the platform specific separators.
526   /// \param Parts parts of the final name that needs separation
527   /// The created name has a first separator between the first and second part
528   /// and a second separator between all other parts.
529   /// E.g. with FirstSeparator "$" and Separator "." and
530   /// parts: "p1", "p2", "p3", "p4"
531   /// The resulting name is "p1$p2.p3.p4"
532   /// The separators are retrieved from the OpenMPIRBuilderConfig.
533   std::string createPlatformSpecificName(ArrayRef<StringRef> Parts) const;
534 
535   /// Callback type for variable finalization (think destructors).
536   ///
537   /// \param CodeGenIP is the insertion point at which the finalization code
538   ///                  should be placed.
539   ///
540   /// A finalize callback knows about all objects that need finalization, e.g.
541   /// destruction, when the scope of the currently generated construct is left
542   /// at the time, and location, the callback is invoked.
543   using FinalizeCallbackTy = std::function<Error(InsertPointTy CodeGenIP)>;
544 
545   struct FinalizationInfo {
546     /// The finalization callback provided by the last in-flight invocation of
547     /// createXXXX for the directive of kind DK.
548     FinalizeCallbackTy FiniCB;
549 
550     /// The directive kind of the innermost directive that has an associated
551     /// region which might require finalization when it is left.
552     omp::Directive DK;
553 
554     /// Flag to indicate if the directive is cancellable.
555     bool IsCancellable;
556   };
557 
558   /// Push a finalization callback on the finalization stack.
559   ///
560   /// NOTE: Temporary solution until Clang CG is gone.
561   void pushFinalizationCB(const FinalizationInfo &FI) {
562     FinalizationStack.push_back(FI);
563   }
564 
565   /// Pop the last finalization callback from the finalization stack.
566   ///
567   /// NOTE: Temporary solution until Clang CG is gone.
568   void popFinalizationCB() { FinalizationStack.pop_back(); }
569 
570   /// Callback type for body (=inner region) code generation
571   ///
572   /// The callback takes code locations as arguments, each describing a
573   /// location where additional instructions can be inserted.
574   ///
575   /// The CodeGenIP may be in the middle of a basic block or point to the end of
576   /// it. The basic block may have a terminator or be degenerate. The callback
577   /// function may just insert instructions at that position, but also split the
578   /// block (without the Before argument of BasicBlock::splitBasicBlock such
579   /// that the identify of the split predecessor block is preserved) and insert
580   /// additional control flow, including branches that do not lead back to what
581   /// follows the CodeGenIP. Note that since the callback is allowed to split
582   /// the block, callers must assume that InsertPoints to positions in the
583   /// BasicBlock after CodeGenIP including CodeGenIP itself are invalidated. If
584   /// such InsertPoints need to be preserved, it can split the block itself
585   /// before calling the callback.
586   ///
587   /// AllocaIP and CodeGenIP must not point to the same position.
588   ///
589   /// \param AllocaIP is the insertion point at which new alloca instructions
590   ///                 should be placed. The BasicBlock it is pointing to must
591   ///                 not be split.
592   /// \param CodeGenIP is the insertion point at which the body code should be
593   ///                  placed.
594   ///
595   /// \return an error, if any were triggered during execution.
596   using BodyGenCallbackTy =
597       function_ref<Error(InsertPointTy AllocaIP, InsertPointTy CodeGenIP)>;
598 
599   // This is created primarily for sections construct as llvm::function_ref
600   // (BodyGenCallbackTy) is not storable (as described in the comments of
601   // function_ref class - function_ref contains non-ownable reference
602   // to the callable.
603   ///
604   /// \return an error, if any were triggered during execution.
605   using StorableBodyGenCallbackTy =
606       std::function<Error(InsertPointTy AllocaIP, InsertPointTy CodeGenIP)>;
607 
608   /// Callback type for loop body code generation.
609   ///
610   /// \param CodeGenIP is the insertion point where the loop's body code must be
611   ///                  placed. This will be a dedicated BasicBlock with a
612   ///                  conditional branch from the loop condition check and
613   ///                  terminated with an unconditional branch to the loop
614   ///                  latch.
615   /// \param IndVar    is the induction variable usable at the insertion point.
616   ///
617   /// \return an error, if any were triggered during execution.
618   using LoopBodyGenCallbackTy =
619       function_ref<Error(InsertPointTy CodeGenIP, Value *IndVar)>;
620 
621   /// Callback type for variable privatization (think copy & default
622   /// constructor).
623   ///
624   /// \param AllocaIP is the insertion point at which new alloca instructions
625   ///                 should be placed.
626   /// \param CodeGenIP is the insertion point at which the privatization code
627   ///                  should be placed.
628   /// \param Original The value being copied/created, should not be used in the
629   ///                 generated IR.
630   /// \param Inner The equivalent of \p Original that should be used in the
631   ///              generated IR; this is equal to \p Original if the value is
632   ///              a pointer and can thus be passed directly, otherwise it is
633   ///              an equivalent but different value.
634   /// \param ReplVal The replacement value, thus a copy or new created version
635   ///                of \p Inner.
636   ///
637   /// \returns The new insertion point where code generation continues and
638   ///          \p ReplVal the replacement value.
639   using PrivatizeCallbackTy = function_ref<InsertPointOrErrorTy(
640       InsertPointTy AllocaIP, InsertPointTy CodeGenIP, Value &Original,
641       Value &Inner, Value *&ReplVal)>;
642 
643   /// Description of a LLVM-IR insertion point (IP) and a debug/source location
644   /// (filename, line, column, ...).
645   struct LocationDescription {
646     LocationDescription(const IRBuilderBase &IRB)
647         : IP(IRB.saveIP()), DL(IRB.getCurrentDebugLocation()) {}
648     LocationDescription(const InsertPointTy &IP) : IP(IP) {}
649     LocationDescription(const InsertPointTy &IP, const DebugLoc &DL)
650         : IP(IP), DL(DL) {}
651     InsertPointTy IP;
652     DebugLoc DL;
653   };
654 
655   /// Emitter methods for OpenMP directives.
656   ///
657   ///{
658 
659   /// Generator for '#omp barrier'
660   ///
661   /// \param Loc The location where the barrier directive was encountered.
662   /// \param Kind The kind of directive that caused the barrier.
663   /// \param ForceSimpleCall Flag to force a simple (=non-cancellation) barrier.
664   /// \param CheckCancelFlag Flag to indicate a cancel barrier return value
665   ///                        should be checked and acted upon.
666   /// \param ThreadID Optional parameter to pass in any existing ThreadID value.
667   ///
668   /// \returns The insertion point after the barrier.
669   InsertPointOrErrorTy createBarrier(const LocationDescription &Loc,
670                                      omp::Directive Kind,
671                                      bool ForceSimpleCall = false,
672                                      bool CheckCancelFlag = true);
673 
674   /// Generator for '#omp cancel'
675   ///
676   /// \param Loc The location where the directive was encountered.
677   /// \param IfCondition The evaluated 'if' clause expression, if any.
678   /// \param CanceledDirective The kind of directive that is cancled.
679   ///
680   /// \returns The insertion point after the barrier.
681   InsertPointOrErrorTy createCancel(const LocationDescription &Loc,
682                                     Value *IfCondition,
683                                     omp::Directive CanceledDirective);
684 
685   /// Generator for '#omp parallel'
686   ///
687   /// \param Loc The insert and source location description.
688   /// \param AllocaIP The insertion points to be used for alloca instructions.
689   /// \param BodyGenCB Callback that will generate the region code.
690   /// \param PrivCB Callback to copy a given variable (think copy constructor).
691   /// \param FiniCB Callback to finalize variable copies.
692   /// \param IfCondition The evaluated 'if' clause expression, if any.
693   /// \param NumThreads The evaluated 'num_threads' clause expression, if any.
694   /// \param ProcBind The value of the 'proc_bind' clause (see ProcBindKind).
695   /// \param IsCancellable Flag to indicate a cancellable parallel region.
696   ///
697   /// \returns The insertion position *after* the parallel.
698   InsertPointOrErrorTy
699   createParallel(const LocationDescription &Loc, InsertPointTy AllocaIP,
700                  BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB,
701                  FinalizeCallbackTy FiniCB, Value *IfCondition,
702                  Value *NumThreads, omp::ProcBindKind ProcBind,
703                  bool IsCancellable);
704 
705   /// Generator for the control flow structure of an OpenMP canonical loop.
706   ///
707   /// This generator operates on the logical iteration space of the loop, i.e.
708   /// the caller only has to provide a loop trip count of the loop as defined by
709   /// base language semantics. The trip count is interpreted as an unsigned
710   /// integer. The induction variable passed to \p BodyGenCB will be of the same
711   /// type and run from 0 to \p TripCount - 1. It is up to the callback to
712   /// convert the logical iteration variable to the loop counter variable in the
713   /// loop body.
714   ///
715   /// \param Loc       The insert and source location description. The insert
716   ///                  location can be between two instructions or the end of a
717   ///                  degenerate block (e.g. a BB under construction).
718   /// \param BodyGenCB Callback that will generate the loop body code.
719   /// \param TripCount Number of iterations the loop body is executed.
720   /// \param Name      Base name used to derive BB and instruction names.
721   ///
722   /// \returns An object representing the created control flow structure which
723   ///          can be used for loop-associated directives.
724   Expected<CanonicalLoopInfo *>
725   createCanonicalLoop(const LocationDescription &Loc,
726                       LoopBodyGenCallbackTy BodyGenCB, Value *TripCount,
727                       const Twine &Name = "loop");
728 
729   /// Generator for the control flow structure of an OpenMP canonical loop.
730   ///
731   /// Instead of a logical iteration space, this allows specifying user-defined
732   /// loop counter values using increment, upper- and lower bounds. To
733   /// disambiguate the terminology when counting downwards, instead of lower
734   /// bounds we use \p Start for the loop counter value in the first body
735   /// iteration.
736   ///
737   /// Consider the following limitations:
738   ///
739   ///  * A loop counter space over all integer values of its bit-width cannot be
740   ///    represented. E.g using uint8_t, its loop trip count of 256 cannot be
741   ///    stored into an 8 bit integer):
742   ///
743   ///      DO I = 0, 255, 1
744   ///
745   ///  * Unsigned wrapping is only supported when wrapping only "once"; E.g.
746   ///    effectively counting downwards:
747   ///
748   ///      for (uint8_t i = 100u; i > 0; i += 127u)
749   ///
750   ///
751   /// TODO: May need to add additional parameters to represent:
752   ///
753   ///  * Allow representing downcounting with unsigned integers.
754   ///
755   ///  * Sign of the step and the comparison operator might disagree:
756   ///
757   ///      for (int i = 0; i < 42; i -= 1u)
758   ///
759   //
760   /// \param Loc       The insert and source location description.
761   /// \param BodyGenCB Callback that will generate the loop body code.
762   /// \param Start     Value of the loop counter for the first iterations.
763   /// \param Stop      Loop counter values past this will stop the loop.
764   /// \param Step      Loop counter increment after each iteration; negative
765   ///                  means counting down.
766   /// \param IsSigned  Whether Start, Stop and Step are signed integers.
767   /// \param InclusiveStop Whether \p Stop itself is a valid value for the loop
768   ///                      counter.
769   /// \param ComputeIP Insertion point for instructions computing the trip
770   ///                  count. Can be used to ensure the trip count is available
771   ///                  at the outermost loop of a loop nest. If not set,
772   ///                  defaults to the preheader of the generated loop.
773   /// \param Name      Base name used to derive BB and instruction names.
774   ///
775   /// \returns An object representing the created control flow structure which
776   ///          can be used for loop-associated directives.
777   Expected<CanonicalLoopInfo *> createCanonicalLoop(
778       const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB,
779       Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop,
780       InsertPointTy ComputeIP = {}, const Twine &Name = "loop");
781 
782   /// Collapse a loop nest into a single loop.
783   ///
784   /// Merges loops of a loop nest into a single CanonicalLoopNest representation
785   /// that has the same number of innermost loop iterations as the origin loop
786   /// nest. The induction variables of the input loops are derived from the
787   /// collapsed loop's induction variable. This is intended to be used to
788   /// implement OpenMP's collapse clause. Before applying a directive,
789   /// collapseLoops normalizes a loop nest to contain only a single loop and the
790   /// directive's implementation does not need to handle multiple loops itself.
791   /// This does not remove the need to handle all loop nest handling by
792   /// directives, such as the ordered(<n>) clause or the simd schedule-clause
793   /// modifier of the worksharing-loop directive.
794   ///
795   /// Example:
796   /// \code
797   ///   for (int i = 0; i < 7; ++i) // Canonical loop "i"
798   ///     for (int j = 0; j < 9; ++j) // Canonical loop "j"
799   ///       body(i, j);
800   /// \endcode
801   ///
802   /// After collapsing with Loops={i,j}, the loop is changed to
803   /// \code
804   ///   for (int ij = 0; ij < 63; ++ij) {
805   ///     int i = ij / 9;
806   ///     int j = ij % 9;
807   ///     body(i, j);
808   ///   }
809   /// \endcode
810   ///
811   /// In the current implementation, the following limitations apply:
812   ///
813   ///  * All input loops have an induction variable of the same type.
814   ///
815   ///  * The collapsed loop will have the same trip count integer type as the
816   ///    input loops. Therefore it is possible that the collapsed loop cannot
817   ///    represent all iterations of the input loops. For instance, assuming a
818   ///    32 bit integer type, and two input loops both iterating 2^16 times, the
819   ///    theoretical trip count of the collapsed loop would be 2^32 iteration,
820   ///    which cannot be represented in an 32-bit integer. Behavior is undefined
821   ///    in this case.
822   ///
823   ///  * The trip counts of every input loop must be available at \p ComputeIP.
824   ///    Non-rectangular loops are not yet supported.
825   ///
826   ///  * At each nest level, code between a surrounding loop and its nested loop
827   ///    is hoisted into the loop body, and such code will be executed more
828   ///    often than before collapsing (or not at all if any inner loop iteration
829   ///    has a trip count of 0). This is permitted by the OpenMP specification.
830   ///
831   /// \param DL        Debug location for instructions added for collapsing,
832   ///                  such as instructions to compute/derive the input loop's
833   ///                  induction variables.
834   /// \param Loops     Loops in the loop nest to collapse. Loops are specified
835   ///                  from outermost-to-innermost and every control flow of a
836   ///                  loop's body must pass through its directly nested loop.
837   /// \param ComputeIP Where additional instruction that compute the collapsed
838   ///                  trip count. If not set, defaults to before the generated
839   ///                  loop.
840   ///
841   /// \returns The CanonicalLoopInfo object representing the collapsed loop.
842   CanonicalLoopInfo *collapseLoops(DebugLoc DL,
843                                    ArrayRef<CanonicalLoopInfo *> Loops,
844                                    InsertPointTy ComputeIP);
845 
846   /// Get the default alignment value for given target
847   ///
848   /// \param TargetTriple   Target triple
849   /// \param Features       StringMap which describes extra CPU features
850   static unsigned getOpenMPDefaultSimdAlign(const Triple &TargetTriple,
851                                             const StringMap<bool> &Features);
852 
853   /// Retrieve (or create if non-existent) the address of a declare
854   /// target variable, used in conjunction with registerTargetGlobalVariable
855   /// to create declare target global variables.
856   ///
857   /// \param CaptureClause - enumerator corresponding to the OpenMP capture
858   /// clause used in conjunction with the variable being registered (link,
859   /// to, enter).
860   /// \param DeviceClause - enumerator corresponding to the OpenMP capture
861   /// clause used in conjunction with the variable being registered (nohost,
862   /// host, any)
863   /// \param IsDeclaration - boolean stating if the variable being registered
864   /// is a declaration-only and not a definition
865   /// \param IsExternallyVisible - boolean stating if the variable is externally
866   /// visible
867   /// \param EntryInfo - Unique entry information for the value generated
868   /// using getTargetEntryUniqueInfo, used to name generated pointer references
869   /// to the declare target variable
870   /// \param MangledName - the mangled name of the variable being registered
871   /// \param GeneratedRefs - references generated by invocations of
872   /// registerTargetGlobalVariable invoked from getAddrOfDeclareTargetVar,
873   /// these are required by Clang for book keeping.
874   /// \param OpenMPSIMD - if OpenMP SIMD mode is currently enabled
875   /// \param TargetTriple - The OpenMP device target triple we are compiling
876   /// for
877   /// \param LlvmPtrTy - The type of the variable we are generating or
878   /// retrieving an address for
879   /// \param GlobalInitializer - a lambda function which creates a constant
880   /// used for initializing a pointer reference to the variable in certain
881   /// cases. If a nullptr is passed, it will default to utilising the original
882   /// variable to initialize the pointer reference.
883   /// \param VariableLinkage - a lambda function which returns the variables
884   /// linkage type, if unspecified and a nullptr is given, it will instead
885   /// utilise the linkage stored on the existing global variable in the
886   /// LLVMModule.
887   Constant *getAddrOfDeclareTargetVar(
888       OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause,
889       OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause,
890       bool IsDeclaration, bool IsExternallyVisible,
891       TargetRegionEntryInfo EntryInfo, StringRef MangledName,
892       std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
893       std::vector<Triple> TargetTriple, Type *LlvmPtrTy,
894       std::function<Constant *()> GlobalInitializer,
895       std::function<GlobalValue::LinkageTypes()> VariableLinkage);
896 
897   /// Registers a target variable for device or host.
898   ///
899   /// \param CaptureClause - enumerator corresponding to the OpenMP capture
900   /// clause used in conjunction with the variable being registered (link,
901   /// to, enter).
902   /// \param DeviceClause - enumerator corresponding to the OpenMP capture
903   /// clause used in conjunction with the variable being registered (nohost,
904   /// host, any)
905   /// \param IsDeclaration - boolean stating if the variable being registered
906   /// is a declaration-only and not a definition
907   /// \param IsExternallyVisible - boolean stating if the variable is externally
908   /// visible
909   /// \param EntryInfo - Unique entry information for the value generated
910   /// using getTargetEntryUniqueInfo, used to name generated pointer references
911   /// to the declare target variable
912   /// \param MangledName - the mangled name of the variable being registered
913   /// \param GeneratedRefs - references generated by invocations of
914   /// registerTargetGlobalVariable these are required by Clang for book
915   /// keeping.
916   /// \param OpenMPSIMD - if OpenMP SIMD mode is currently enabled
917   /// \param TargetTriple - The OpenMP device target triple we are compiling
918   /// for
919   /// \param GlobalInitializer - a lambda function which creates a constant
920   /// used for initializing a pointer reference to the variable in certain
921   /// cases. If a nullptr is passed, it will default to utilising the original
922   /// variable to initialize the pointer reference.
923   /// \param VariableLinkage - a lambda function which returns the variables
924   /// linkage type, if unspecified and a nullptr is given, it will instead
925   /// utilise the linkage stored on the existing global variable in the
926   /// LLVMModule.
927   /// \param LlvmPtrTy - The type of the variable we are generating or
928   /// retrieving an address for
929   /// \param Addr - the original llvm value (addr) of the variable to be
930   /// registered
931   void registerTargetGlobalVariable(
932       OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause,
933       OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause,
934       bool IsDeclaration, bool IsExternallyVisible,
935       TargetRegionEntryInfo EntryInfo, StringRef MangledName,
936       std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
937       std::vector<Triple> TargetTriple,
938       std::function<Constant *()> GlobalInitializer,
939       std::function<GlobalValue::LinkageTypes()> VariableLinkage,
940       Type *LlvmPtrTy, Constant *Addr);
941 
942   /// Get the offset of the OMP_MAP_MEMBER_OF field.
943   unsigned getFlagMemberOffset();
944 
945   /// Get OMP_MAP_MEMBER_OF flag with extra bits reserved based on
946   /// the position given.
947   /// \param Position - A value indicating the position of the parent
948   /// of the member in the kernel argument structure, often retrieved
949   /// by the parents position in the combined information vectors used
950   /// to generate the structure itself. Multiple children (member's of)
951   /// with the same parent will use the same returned member flag.
952   omp::OpenMPOffloadMappingFlags getMemberOfFlag(unsigned Position);
953 
954   /// Given an initial flag set, this function modifies it to contain
955   /// the passed in MemberOfFlag generated from the getMemberOfFlag
956   /// function. The results are dependent on the existing flag bits
957   /// set in the original flag set.
958   /// \param Flags - The original set of flags to be modified with the
959   /// passed in MemberOfFlag.
960   /// \param MemberOfFlag - A modified OMP_MAP_MEMBER_OF flag, adjusted
961   /// slightly based on the getMemberOfFlag which adjusts the flag bits
962   /// based on the members position in its parent.
963   void setCorrectMemberOfFlag(omp::OpenMPOffloadMappingFlags &Flags,
964                               omp::OpenMPOffloadMappingFlags MemberOfFlag);
965 
966 private:
967   /// Modifies the canonical loop to be a statically-scheduled workshare loop
968   /// which is executed on the device
969   ///
970   /// This takes a \p CLI representing a canonical loop, such as the one
971   /// created by \see createCanonicalLoop and emits additional instructions to
972   /// turn it into a workshare loop. In particular, it calls to an OpenMP
973   /// runtime function in the preheader to call OpenMP device rtl function
974   /// which handles worksharing of loop body interations.
975   ///
976   /// \param DL       Debug location for instructions added for the
977   ///                 workshare-loop construct itself.
978   /// \param CLI      A descriptor of the canonical loop to workshare.
979   /// \param AllocaIP An insertion point for Alloca instructions usable in the
980   ///                 preheader of the loop.
981   /// \param LoopType Information about type of loop worksharing.
982   ///                 It corresponds to type of loop workshare OpenMP pragma.
983   ///
984   /// \returns Point where to insert code after the workshare construct.
985   InsertPointTy applyWorkshareLoopTarget(DebugLoc DL, CanonicalLoopInfo *CLI,
986                                          InsertPointTy AllocaIP,
987                                          omp::WorksharingLoopType LoopType);
988 
989   /// Modifies the canonical loop to be a statically-scheduled workshare loop.
990   ///
991   /// This takes a \p LoopInfo representing a canonical loop, such as the one
992   /// created by \p createCanonicalLoop and emits additional instructions to
993   /// turn it into a workshare loop. In particular, it calls to an OpenMP
994   /// runtime function in the preheader to obtain the loop bounds to be used in
995   /// the current thread, updates the relevant instructions in the canonical
996   /// loop and calls to an OpenMP runtime finalization function after the loop.
997   ///
998   /// \param DL       Debug location for instructions added for the
999   ///                 workshare-loop construct itself.
1000   /// \param CLI      A descriptor of the canonical loop to workshare.
1001   /// \param AllocaIP An insertion point for Alloca instructions usable in the
1002   ///                 preheader of the loop.
1003   /// \param NeedsBarrier Indicates whether a barrier must be inserted after
1004   ///                     the loop.
1005   ///
1006   /// \returns Point where to insert code after the workshare construct.
1007   InsertPointOrErrorTy applyStaticWorkshareLoop(DebugLoc DL,
1008                                                 CanonicalLoopInfo *CLI,
1009                                                 InsertPointTy AllocaIP,
1010                                                 bool NeedsBarrier);
1011 
1012   /// Modifies the canonical loop a statically-scheduled workshare loop with a
1013   /// user-specified chunk size.
1014   ///
1015   /// \param DL           Debug location for instructions added for the
1016   ///                     workshare-loop construct itself.
1017   /// \param CLI          A descriptor of the canonical loop to workshare.
1018   /// \param AllocaIP     An insertion point for Alloca instructions usable in
1019   ///                     the preheader of the loop.
1020   /// \param NeedsBarrier Indicates whether a barrier must be inserted after the
1021   ///                     loop.
1022   /// \param ChunkSize    The user-specified chunk size.
1023   ///
1024   /// \returns Point where to insert code after the workshare construct.
1025   InsertPointOrErrorTy applyStaticChunkedWorkshareLoop(DebugLoc DL,
1026                                                        CanonicalLoopInfo *CLI,
1027                                                        InsertPointTy AllocaIP,
1028                                                        bool NeedsBarrier,
1029                                                        Value *ChunkSize);
1030 
1031   /// Modifies the canonical loop to be a dynamically-scheduled workshare loop.
1032   ///
1033   /// This takes a \p LoopInfo representing a canonical loop, such as the one
1034   /// created by \p createCanonicalLoop and emits additional instructions to
1035   /// turn it into a workshare loop. In particular, it calls to an OpenMP
1036   /// runtime function in the preheader to obtain, and then in each iteration
1037   /// to update the loop counter.
1038   ///
1039   /// \param DL       Debug location for instructions added for the
1040   ///                 workshare-loop construct itself.
1041   /// \param CLI      A descriptor of the canonical loop to workshare.
1042   /// \param AllocaIP An insertion point for Alloca instructions usable in the
1043   ///                 preheader of the loop.
1044   /// \param SchedType Type of scheduling to be passed to the init function.
1045   /// \param NeedsBarrier Indicates whether a barrier must be insterted after
1046   ///                     the loop.
1047   /// \param Chunk    The size of loop chunk considered as a unit when
1048   ///                 scheduling. If \p nullptr, defaults to 1.
1049   ///
1050   /// \returns Point where to insert code after the workshare construct.
1051   InsertPointOrErrorTy applyDynamicWorkshareLoop(DebugLoc DL,
1052                                                  CanonicalLoopInfo *CLI,
1053                                                  InsertPointTy AllocaIP,
1054                                                  omp::OMPScheduleType SchedType,
1055                                                  bool NeedsBarrier,
1056                                                  Value *Chunk = nullptr);
1057 
1058   /// Create alternative version of the loop to support if clause
1059   ///
1060   /// OpenMP if clause can require to generate second loop. This loop
1061   /// will be executed when if clause condition is not met. createIfVersion
1062   /// adds branch instruction to the copied loop if \p  ifCond is not met.
1063   ///
1064   /// \param Loop       Original loop which should be versioned.
1065   /// \param IfCond     Value which corresponds to if clause condition
1066   /// \param VMap       Value to value map to define relation between
1067   ///                   original and copied loop values and loop blocks.
1068   /// \param NamePrefix Optional name prefix for if.then if.else blocks.
1069   void createIfVersion(CanonicalLoopInfo *Loop, Value *IfCond,
1070                        ValueToValueMapTy &VMap, const Twine &NamePrefix = "");
1071 
1072 public:
1073   /// Modifies the canonical loop to be a workshare loop.
1074   ///
1075   /// This takes a \p LoopInfo representing a canonical loop, such as the one
1076   /// created by \p createCanonicalLoop and emits additional instructions to
1077   /// turn it into a workshare loop. In particular, it calls to an OpenMP
1078   /// runtime function in the preheader to obtain the loop bounds to be used in
1079   /// the current thread, updates the relevant instructions in the canonical
1080   /// loop and calls to an OpenMP runtime finalization function after the loop.
1081   ///
1082   /// The concrete transformation is done by applyStaticWorkshareLoop,
1083   /// applyStaticChunkedWorkshareLoop, or applyDynamicWorkshareLoop, depending
1084   /// on the value of \p SchedKind and \p ChunkSize.
1085   ///
1086   /// \param DL       Debug location for instructions added for the
1087   ///                 workshare-loop construct itself.
1088   /// \param CLI      A descriptor of the canonical loop to workshare.
1089   /// \param AllocaIP An insertion point for Alloca instructions usable in the
1090   ///                 preheader of the loop.
1091   /// \param NeedsBarrier Indicates whether a barrier must be insterted after
1092   ///                     the loop.
1093   /// \param SchedKind Scheduling algorithm to use.
1094   /// \param ChunkSize The chunk size for the inner loop.
1095   /// \param HasSimdModifier Whether the simd modifier is present in the
1096   ///                        schedule clause.
1097   /// \param HasMonotonicModifier Whether the monotonic modifier is present in
1098   ///                             the schedule clause.
1099   /// \param HasNonmonotonicModifier Whether the nonmonotonic modifier is
1100   ///                                present in the schedule clause.
1101   /// \param HasOrderedClause Whether the (parameterless) ordered clause is
1102   ///                         present.
1103   /// \param LoopType Information about type of loop worksharing.
1104   ///                 It corresponds to type of loop workshare OpenMP pragma.
1105   ///
1106   /// \returns Point where to insert code after the workshare construct.
1107   InsertPointOrErrorTy applyWorkshareLoop(
1108       DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
1109       bool NeedsBarrier,
1110       llvm::omp::ScheduleKind SchedKind = llvm::omp::OMP_SCHEDULE_Default,
1111       Value *ChunkSize = nullptr, bool HasSimdModifier = false,
1112       bool HasMonotonicModifier = false, bool HasNonmonotonicModifier = false,
1113       bool HasOrderedClause = false,
1114       omp::WorksharingLoopType LoopType =
1115           omp::WorksharingLoopType::ForStaticLoop);
1116 
1117   /// Tile a loop nest.
1118   ///
1119   /// Tiles the loops of \p Loops by the tile sizes in \p TileSizes. Loops in
1120   /// \p Loops must be perfectly nested, from outermost to innermost loop
1121   /// (i.e. Loops.front() is the outermost loop). The trip count llvm::Value
1122   /// of every loop and every tile size must be usable in the outermost
1123   /// loop's preheader. This implies that the loop nest is rectangular.
1124   ///
1125   /// Example:
1126   /// \code
1127   ///   for (int i = 0; i < 15; ++i) // Canonical loop "i"
1128   ///     for (int j = 0; j < 14; ++j) // Canonical loop "j"
1129   ///         body(i, j);
1130   /// \endcode
1131   ///
1132   /// After tiling with Loops={i,j} and TileSizes={5,7}, the loop is changed to
1133   /// \code
1134   ///   for (int i1 = 0; i1 < 3; ++i1)
1135   ///     for (int j1 = 0; j1 < 2; ++j1)
1136   ///       for (int i2 = 0; i2 < 5; ++i2)
1137   ///         for (int j2 = 0; j2 < 7; ++j2)
1138   ///           body(i1*5+i2, j1*7+j2);
1139   /// \endcode
1140   ///
1141   /// The returned vector contains the loops {i1,j1,i2,j2}. The loops i1 and j1
1142   /// are referred to as the floor loops, and the loops i2 and j2 are the tile
1143   /// loops. Tiling also handles non-constant trip counts, non-constant tile
1144   /// sizes, and trip counts that are not multiples of the tile size. In the
1145   /// latter case, the tile loop of the last floor-loop iteration has fewer
1146   /// iterations than specified by its tile size.
1147   ///
1148   ///
1149   /// \param DL        Debug location for instructions added by tiling, for
1150   ///                  instance the floor- and tile trip count computation.
1151   /// \param Loops     Loops to tile. The CanonicalLoopInfo objects are
1152   ///                  invalidated by this method, i.e. they should not be used
1153   ///                  after tiling.
1154   /// \param TileSizes For each loop in \p Loops, the tile size for that
1155   ///                  dimension.
1156   ///
1157   /// \returns A list of generated loops. Contains twice as many loops as the
1158   ///          input loop nest; the first half are the floor loops and the
1159   ///          second half are the tile loops.
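  ///
  /// A possible invocation sketch (names are illustrative; LoopI and LoopJ are
  /// assumed to be the CanonicalLoopInfo objects of the "i" and "j" loops
  /// above, created via createCanonicalLoop):
  /// \code
  ///   Value *TileI = Builder.getInt64(5);
  ///   Value *TileJ = Builder.getInt64(7);
  ///   std::vector<CanonicalLoopInfo *> Tiled =
  ///       OMPBuilder.tileLoops(DL, {LoopI, LoopJ}, {TileI, TileJ});
  ///   // Tiled == {i1, j1, i2, j2}; LoopI and LoopJ must not be reused.
  /// \endcode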
1160   std::vector<CanonicalLoopInfo *>
1161   tileLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops,
1162             ArrayRef<Value *> TileSizes);
1163 
1164   /// Fully unroll a loop.
1165   ///
1166   /// Instead of unrolling the loop immediately (and duplicating its body
1167   /// instructions), it is deferred to LLVM's LoopUnrollPass by adding loop
1168   /// metadata.
1169   ///
1170   /// \param DL   Debug location for instructions added by unrolling.
1171   /// \param Loop The loop to unroll. The loop will be invalidated.
1172   void unrollLoopFull(DebugLoc DL, CanonicalLoopInfo *Loop);
1173 
1174   /// Fully or partially unroll a loop. How the loop is unrolled is determined
1175   /// using LLVM's LoopUnrollPass.
1176   ///
1177   /// \param DL   Debug location for instructions added by unrolling.
1178   /// \param Loop The loop to unroll. The loop will be invalidated.
1179   void unrollLoopHeuristic(DebugLoc DL, CanonicalLoopInfo *Loop);
1180 
1181   /// Partially unroll a loop.
1182   ///
1183   /// The CanonicalLoopInfo of the unrolled loop for use with chained
1184   /// The CanonicalLoopInfo of the unrolled loop for use with a chained
1185   /// needing the CanonicalLoopInfo allows more efficient code generation by
1186   /// deferring the actual unrolling to the LoopUnrollPass using loop metadata.
1187   /// A loop-associated directive applied to the unrolled loop needs to know the
1188   /// new trip count which means that if using a heuristically determined unroll
1189   /// factor (\p Factor == 0), that factor must be computed immediately. We are
1190   /// using the same logic as the LoopUnrollPass to derive the unroll factor,
1191   /// which assumes that some canonicalization has taken place (e.g.
1192   /// Mem2Reg, LICM, GVN, Inlining, etc.). That is, the heuristic will perform
1193   /// better when the unrolled loop's CanonicalLoopInfo is not needed.
1194   ///
1195   /// \param DL          Debug location for instructions added by unrolling.
1196   /// \param Loop        The loop to unroll. The loop will be invalidated.
1197   /// \param Factor      The factor to unroll the loop by. A factor of 0
1198   ///                    indicates that a heuristic should be used to determine
1199   ///                    the unroll-factor.
1200   /// \param UnrolledCLI If non-null, receives the CanonicalLoopInfo of the
1201   ///                    partially unrolled loop. Otherwise, uses loop metadata
1202   ///                    to defer unrolling to the LoopUnrollPass.
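  ///
  /// Sketch of the two modes of use (illustrative; the calls are alternatives,
  /// not a sequence, since the input loop is invalidated):
  /// \code
  ///   // Defer the actual unrolling to the LoopUnrollPass via metadata.
  ///   OMPBuilder.unrollLoopPartial(DL, CLI, /*Factor=*/4,
  ///                                /*UnrolledCLI=*/nullptr);
  ///
  ///   // Unroll immediately with a heuristic factor and obtain a
  ///   // CanonicalLoopInfo for a chained loop-associated directive.
  ///   CanonicalLoopInfo *Unrolled = nullptr;
  ///   OMPBuilder.unrollLoopPartial(DL, CLI, /*Factor=*/0, &Unrolled);
  /// \endcode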
1203   void unrollLoopPartial(DebugLoc DL, CanonicalLoopInfo *Loop, int32_t Factor,
1204                          CanonicalLoopInfo **UnrolledCLI);
1205 
1206   /// Add metadata to simd-ize a loop. If \p IfCond is not nullptr, the loop
1207   /// is cloned. The metadata which prevents vectorization is added to the
1208   /// cloned loop. The cloned loop is executed when \p IfCond evaluates to
1209   /// false.
1210   ///
1211   /// \param Loop        The loop to simd-ize.
1212   /// \param AlignedVars The map which contains pairs of the pointer
1213   ///                    and its corresponding alignment.
1214   /// \param IfCond      The value which corresponds to the if clause
1215   ///                    condition.
1216   /// \param Order       The enum value mapped from the order clause.
1217   /// \param Simdlen     The Simdlen length to apply to the simd loop.
1218   /// \param Safelen     The Safelen length to apply to the simd loop.
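  ///
  /// Illustrative call (names assumed; a null \p IfCond means no alternative
  /// scalar loop is created, and OMP_ORDER_unknown is assumed to be the
  /// "no order clause" enumerator):
  /// \code
  ///   MapVector<Value *, Value *> Aligned;
  ///   Aligned[BasePtr] = Builder.getInt64(32); // Assumed 32-byte alignment.
  ///   OMPBuilder.applySimd(CLI, Aligned, /*IfCond=*/nullptr,
  ///                        omp::OrderKind::OMP_ORDER_unknown,
  ///                        /*Simdlen=*/nullptr, /*Safelen=*/nullptr);
  /// \endcode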
1219   void applySimd(CanonicalLoopInfo *Loop,
1220                  MapVector<Value *, Value *> AlignedVars, Value *IfCond,
1221                  omp::OrderKind Order, ConstantInt *Simdlen,
1222                  ConstantInt *Safelen);
1223 
1224   /// Generator for '#omp flush'
1225   ///
1226   /// \param Loc The location where the flush directive was encountered
1227   void createFlush(const LocationDescription &Loc);
1228 
1229   /// Generator for '#omp taskwait'
1230   ///
1231   /// \param Loc The location where the taskwait directive was encountered.
1232   void createTaskwait(const LocationDescription &Loc);
1233 
1234   /// Generator for '#omp taskyield'
1235   ///
1236   /// \param Loc The location where the taskyield directive was encountered.
1237   void createTaskyield(const LocationDescription &Loc);
1238 
1239   /// A struct to pack the relevant information for an OpenMP depend clause.
1240   struct DependData {
1241     omp::RTLDependenceKindTy DepKind = omp::RTLDependenceKindTy::DepUnknown;
1242     Type *DepValueType;
1243     Value *DepVal;
1244     explicit DependData() = default;
1245     DependData(omp::RTLDependenceKindTy DepKind, Type *DepValueType,
1246                Value *DepVal)
1247         : DepKind(DepKind), DepValueType(DepValueType), DepVal(DepVal) {}
1248   };
1249 
1250   /// Generator for `#omp task`
1251   ///
1252   /// \param Loc The location where the task construct was encountered.
1253   /// \param AllocaIP The insertion point to be used for alloca instructions.
1254   /// \param BodyGenCB Callback that will generate the region code.
1255   /// \param Tied True if the task is tied, false if the task is untied.
1256   /// \param Final i1 value which is `true` if the task is final, `false` if the
1257   ///              task is not final.
1258   /// \param IfCondition i1 value. If it evaluates to `false`, an undeferred
1259   ///                    task is generated, and the encountering thread must
1260   ///                    suspend the current task region, for which execution
1261   ///                    cannot be resumed until execution of the structured
1262   ///                    block that is associated with the generated task is
1263   ///                    completed.
1264   /// \param EventHandle If present, signifies the event handle as part of
1265   ///                    the detach clause.
1266   /// \param Mergeable   True if the given task is `mergeable`.
1267   /// \param Priority    `priority-value` specifies the execution order of the
1268   ///                    tasks generated by the construct.
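  ///
  /// Sketch of packing a depend(in: x) clause (illustrative; XPtr, BodyGenCB
  /// and the builder state are assumed to be set up by the caller, and DepIn
  /// is assumed to be the 'in' dependence kind):
  /// \code
  ///   OpenMPIRBuilder::DependData Dep(omp::RTLDependenceKindTy::DepIn,
  ///                                   Builder.getInt32Ty(), XPtr);
  ///   OpenMPIRBuilder::InsertPointOrErrorTy AfterIP = OMPBuilder.createTask(
  ///       Loc, AllocaIP, BodyGenCB, /*Tied=*/true, /*Final=*/nullptr,
  ///       /*IfCondition=*/nullptr, {Dep});
  /// \endcode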
1269   InsertPointOrErrorTy
1270   createTask(const LocationDescription &Loc, InsertPointTy AllocaIP,
1271              BodyGenCallbackTy BodyGenCB, bool Tied = true,
1272              Value *Final = nullptr, Value *IfCondition = nullptr,
1273              SmallVector<DependData> Dependencies = {}, bool Mergeable = false,
1274              Value *EventHandle = nullptr, Value *Priority = nullptr);
1275 
1276   /// Generator for the taskgroup construct
1277   ///
1278   /// \param Loc The location where the taskgroup construct was encountered.
1279   /// \param AllocaIP The insertion point to be used for alloca instructions.
1280   /// \param BodyGenCB Callback that will generate the region code.
1281   InsertPointOrErrorTy createTaskgroup(const LocationDescription &Loc,
1282                                        InsertPointTy AllocaIP,
1283                                        BodyGenCallbackTy BodyGenCB);
1284 
1285   using FileIdentifierInfoCallbackTy =
1286       std::function<std::tuple<std::string, uint64_t>()>;
1287 
1288   /// Creates a unique info for a target entry when provided a filename and
1289   /// line number.
1290   ///
1291   /// \param CallBack A callback function which should return the filename the
1292   /// entry resides in as well as the line number for the target entry.
1293   /// \param ParentName The name of the parent the target entry resides in, if
1294   /// any.
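  ///
  /// Example callback (sketch; the file name, line number, and the parent name
  /// "foo" are placeholders that would normally come from the frontend):
  /// \code
  ///   auto CB = []() -> std::tuple<std::string, uint64_t> {
  ///     return {"main.c", 12};
  ///   };
  ///   TargetRegionEntryInfo Info =
  ///       OpenMPIRBuilder::getTargetEntryUniqueInfo(CB, "foo");
  /// \endcode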
1295   static TargetRegionEntryInfo
1296   getTargetEntryUniqueInfo(FileIdentifierInfoCallbackTy CallBack,
1297                            StringRef ParentName = "");
1298 
1299   /// Enum class for the ReductionGen CallBack type to be used.
1300   enum class ReductionGenCBKind { Clang, MLIR };
1301 
1302   /// ReductionGen CallBack for Clang
1303   ///
1304   /// \param CodeGenIP InsertPoint for CodeGen.
1305   /// \param Index Index of the ReductionInfo to generate code for.
1306   /// \param LHS Optionally used by Clang to return the LHS pointer it used for
1307   /// codegen, used for fixup later.
1308   /// \param RHS Optionally used by Clang to return the RHS pointer it used for
1309   /// codegen, used for fixup later.
1310   /// \param CurFn Optionally used by Clang to pass in the Current Function as
1311   /// Clang context may be old.
1312   using ReductionGenClangCBTy =
1313       std::function<InsertPointTy(InsertPointTy CodeGenIP, unsigned Index,
1314                                   Value **LHS, Value **RHS, Function *CurFn)>;
1315 
1316   /// ReductionGen CallBack for MLIR
1317   ///
1318   /// \param CodeGenIP InsertPoint for CodeGen.
1319   /// \param LHS Pass in the LHS Value to be used for CodeGen.
1320   /// \param RHS Pass in the RHS Value to be used for CodeGen.
1321   using ReductionGenCBTy = std::function<InsertPointOrErrorTy(
1322       InsertPointTy CodeGenIP, Value *LHS, Value *RHS, Value *&Res)>;
1323 
1324   /// Functions used to generate atomic reductions. Such functions take two
1325   /// Values representing pointers to LHS and RHS of the reduction, as well as
1326   /// the element type of these pointers. They are expected to atomically
1327   /// update the LHS to the reduced value.
1328   using ReductionGenAtomicCBTy = std::function<InsertPointOrErrorTy(
1329       InsertPointTy, Type *, Value *, Value *)>;
1330 
1331   /// Enum class for reduction evaluation types scalar, complex and aggregate.
1332   enum class EvalKind { Scalar, Complex, Aggregate };
1333 
1334   /// Information about an OpenMP reduction.
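  ///
  /// A sketch of describing one scalar '+' reduction with an MLIR-style
  /// callback, written as if the OpenMPIRBuilder member typedefs are in scope
  /// (Var, PrivVar, and Builder are illustrative; the Clang and atomic
  /// callbacks are left empty so the non-atomic path is used):
  /// \code
  ///   ReductionGenCBTy AddGen = [](InsertPointTy IP, Value *LHS, Value *RHS,
  ///                                Value *&Res) -> InsertPointOrErrorTy {
  ///     IRBuilder<> B(IP.getBlock(), IP.getPoint());
  ///     Res = B.CreateAdd(LHS, RHS);
  ///     return B.saveIP();
  ///   };
  ///   ReductionInfo RI(Builder.getInt32Ty(), Var, PrivVar, EvalKind::Scalar,
  ///                    AddGen, /*ReductionGenClang=*/nullptr,
  ///                    /*AtomicReductionGen=*/nullptr);
  /// \endcode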
1335   struct ReductionInfo {
1336     ReductionInfo(Type *ElementType, Value *Variable, Value *PrivateVariable,
1337                   EvalKind EvaluationKind, ReductionGenCBTy ReductionGen,
1338                   ReductionGenClangCBTy ReductionGenClang,
1339                   ReductionGenAtomicCBTy AtomicReductionGen)
1340         : ElementType(ElementType), Variable(Variable),
1341           PrivateVariable(PrivateVariable), EvaluationKind(EvaluationKind),
1342           ReductionGen(ReductionGen), ReductionGenClang(ReductionGenClang),
1343           AtomicReductionGen(AtomicReductionGen) {}
1344     ReductionInfo(Value *PrivateVariable)
1345         : ElementType(nullptr), Variable(nullptr),
1346           PrivateVariable(PrivateVariable), EvaluationKind(EvalKind::Scalar),
1347           ReductionGen(), ReductionGenClang(), AtomicReductionGen() {}
1348 
1349     /// Reduction element type, must match pointee type of variable.
1350     Type *ElementType;
1351 
1352     /// Reduction variable of pointer type.
1353     Value *Variable;
1354 
1355     /// Thread-private partial reduction variable.
1356     Value *PrivateVariable;
1357 
1358     /// Reduction evaluation kind - scalar, complex or aggregate.
1359     EvalKind EvaluationKind;
1360 
1361     /// Callback for generating the reduction body. The IR produced by this will
1362     /// be used to combine two values in a thread-safe context, e.g., under
1363     /// lock or within the same thread, and therefore need not be atomic.
1364     ReductionGenCBTy ReductionGen;
1365 
1366     /// Clang callback for generating the reduction body. The IR produced by
1367     /// this will be used to combine two values in a thread-safe context, e.g.,
1368     /// under lock or within the same thread, and therefore need not be atomic.
1369     ReductionGenClangCBTy ReductionGenClang;
1370 
1371     /// Callback for generating the atomic reduction body, may be null. The IR
1372     /// produced by this will be used to atomically combine two values during
1373     /// reduction. If null, the implementation will use the non-atomic version
1374     /// along with the appropriate synchronization mechanisms.
1375     ReductionGenAtomicCBTy AtomicReductionGen;
1376   };
1377 
1378   enum class CopyAction : unsigned {
1379     // RemoteLaneToThread: Copy over a Reduce list from a remote lane in
1380     // the warp using shuffle instructions.
1381     RemoteLaneToThread,
1382     // ThreadCopy: Make a copy of a Reduce list on the thread's stack.
1383     ThreadCopy,
1384   };
1385 
1386   struct CopyOptionsTy {
1387     Value *RemoteLaneOffset = nullptr;
1388     Value *ScratchpadIndex = nullptr;
1389     Value *ScratchpadWidth = nullptr;
1390   };
1391 
1392   /// Supporting functions for Reductions CodeGen.
1393 private:
1394   /// Get the id of the current thread on the GPU.
1395   Value *getGPUThreadID();
1396 
1397   /// Get the GPU warp size.
1398   Value *getGPUWarpSize();
1399 
1400   /// Get the id of the warp in the block.
1401   /// We assume that the warp size is 32, which is always the case
1402   /// on the NVPTX device, to generate more efficient code.
1403   Value *getNVPTXWarpID();
1404 
1405   /// Get the id of the current lane in the Warp.
1406   /// We assume that the warp size is 32, which is always the case
1407   /// on the NVPTX device, to generate more efficient code.
1408   Value *getNVPTXLaneID();
1409 
1410   /// Cast value to the specified type.
1411   Value *castValueToType(InsertPointTy AllocaIP, Value *From, Type *ToType);
1412 
1413   /// This function creates calls to one of two shuffle functions to copy
1414   /// variables between lanes in a warp.
1415   Value *createRuntimeShuffleFunction(InsertPointTy AllocaIP, Value *Element,
1416                                       Type *ElementType, Value *Offset);
1417 
1418   /// Function to shuffle over the value from the remote lane.
1419   void shuffleAndStore(InsertPointTy AllocaIP, Value *SrcAddr, Value *DstAddr,
1420                        Type *ElementType, Value *Offset,
1421                        Type *ReductionArrayTy);
1422 
1423   /// Emit instructions to copy a Reduce list, which contains partially
1424   /// aggregated values, in the specified direction.
1425   void emitReductionListCopy(
1426       InsertPointTy AllocaIP, CopyAction Action, Type *ReductionArrayTy,
1427       ArrayRef<ReductionInfo> ReductionInfos, Value *SrcBase, Value *DestBase,
1428       CopyOptionsTy CopyOptions = {nullptr, nullptr, nullptr});
1429 
1430   /// Emit a helper that reduces data across two OpenMP threads (lanes)
1431   /// in the same warp.  It uses shuffle instructions to copy over data from
1432   /// a remote lane's stack.  The reduction algorithm performed is specified
1433   /// by the fourth parameter.
1434   ///
1435   /// Algorithm Versions.
1436   /// Full Warp Reduce (argument value 0):
1437   ///   This algorithm assumes that all 32 lanes are active and gathers
1438   ///   data from these 32 lanes, producing a single resultant value.
1439   /// Contiguous Partial Warp Reduce (argument value 1):
1440   ///   This algorithm assumes that only a *contiguous* subset of lanes
1441   ///   are active.  This happens for the last warp in a parallel region
1442   ///   when the user-specified num_threads is not an integer multiple of
1443   ///   32.  This contiguous subset always starts with the zeroth lane.
1444   /// Partial Warp Reduce (argument value 2):
1445   ///   This algorithm gathers data from any number of lanes at any position.
1446   /// All reduced values are stored in the lowest possible lane.  The set
1447   /// of problems every algorithm addresses is a superset of those
1448   /// addressable by algorithms with a lower version number.  Overhead
1449   /// increases as algorithm version increases.
1450   ///
1451   /// Terminology
1452   /// Reduce element:
1453   ///   Reduce element refers to the individual data field with primitive
1454   ///   data types to be combined and reduced across threads.
1455   /// Reduce list:
1456   ///   Reduce list refers to a collection of local, thread-private
1457   ///   reduce elements.
1458   /// Remote Reduce list:
1459   ///   Remote Reduce list refers to a collection of remote (relative to
1460   ///   the current thread) reduce elements.
1461   ///
1462   /// We distinguish between three states of threads that are important to
1463   /// the implementation of this function.
1464   /// Alive threads:
1465   ///   Threads in a warp executing the SIMT instruction, as distinguished from
1466   ///   threads that are inactive due to divergent control flow.
1467   /// Active threads:
1468   ///   The minimal set of threads that has to be alive upon entry to this
1469   ///   function.  The computation is correct iff active threads are alive.
1470   ///   Some threads are alive but they are not active because they do not
1471   ///   contribute to the computation in any useful manner.  Turning them off
1472   ///   may introduce control flow overheads without any tangible benefits.
1473   /// Effective threads:
1474   ///   In order to comply with the argument requirements of the shuffle
1475   ///   function, we must keep all lanes holding data alive.  But at most
1476   ///   half of them perform value aggregation; we refer to this half of
1477   ///   threads as effective. The other half is simply handing off their
1478   ///   data.
1479   ///
1480   /// Procedure
1481   /// Value shuffle:
1482   ///   In this step active threads transfer data from higher lane positions
1483   ///   in the warp to lower lane positions, creating Remote Reduce list.
1484   /// Value aggregation:
1485   ///   In this step, effective threads combine their thread local Reduce list
1486   ///   with Remote Reduce list and store the result in the thread local
1487   ///   Reduce list.
1488   /// Value copy:
1489   ///   In this step, we deal with the assumption made by algorithm 2
1490   ///   (i.e. contiguity assumption).  When we have an odd number of lanes
1491   ///   active, say 2k+1, only k threads will be effective and therefore k
1492   ///   new values will be produced.  However, the Reduce list owned by the
1493   ///   (2k+1)th thread is ignored in the value aggregation.  Therefore
1494   ///   we copy the Reduce list from the (2k+1)th lane to (k+1)th lane so
1495   ///   that the contiguity assumption still holds.
1496   ///
1497   /// \param ReductionInfos Array type containing the ReductionOps.
1498   /// \param ReduceFn The reduction function.
1499   /// \param FuncAttrs Optional param to specify any function attributes that
1500   ///                  need to be copied to the new function.
1501   ///
1502   /// \return The ShuffleAndReduce function.
1503   Function *emitShuffleAndReduceFunction(
1504       ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos,
1505       Function *ReduceFn, AttributeList FuncAttrs);
1506 
1507   /// This function emits a helper that gathers Reduce lists from the first
1508   /// lane of every active warp to lanes in the first warp.
1509   ///
1510   /// void inter_warp_copy_func(void* reduce_data, num_warps)
1511   ///   shared smem[warp_size];
1512   ///   For all data entries D in reduce_data:
1513   ///     sync
1514   ///     If (I am the first lane in each warp)
1515   ///       Copy my local D to smem[warp_id]
1516   ///     sync
1517   ///     if (I am the first warp)
1518   ///       Copy smem[thread_id] to my local D
1519   ///
1520   /// \param Loc The insert and source location description.
1521   /// \param ReductionInfos Array type containing the ReductionOps.
1522   /// \param FuncAttrs Optional param to specify any function attributes that
1523   ///                  need to be copied to the new function.
1524   ///
1525   /// \return The InterWarpCopy function.
1526   Expected<Function *>
1527   emitInterWarpCopyFunction(const LocationDescription &Loc,
1528                             ArrayRef<ReductionInfo> ReductionInfos,
1529                             AttributeList FuncAttrs);
1530 
1531   /// This function emits a helper that copies all the reduction variables from
1532   /// the team into the provided global buffer for the reduction variables.
1533   ///
1534   /// void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data)
1535   ///   For all data entries D in reduce_data:
1536   ///     Copy local D to buffer.D[Idx]
1537   ///
1538   /// \param ReductionInfos Array type containing the ReductionOps.
1539   /// \param ReductionsBufferTy The StructTy for the reductions buffer.
1540   /// \param FuncAttrs Optional param to specify any function attributes that
1541   ///                  need to be copied to the new function.
1542   ///
1543   /// \return The ListToGlobalCopy function.
1544   Function *emitListToGlobalCopyFunction(ArrayRef<ReductionInfo> ReductionInfos,
1545                                          Type *ReductionsBufferTy,
1546                                          AttributeList FuncAttrs);
1547 
1548   /// This function emits a helper that copies all the reduction variables from
1549   /// the provided global buffer back into the team's reduction variables.
1550   ///
1551   /// void global_to_list_copy_func(void *buffer, int Idx, void *reduce_data)
1552   ///   For all data entries D in reduce_data:
1553   ///     Copy buffer.D[Idx] to local D;
1554   ///
1555   /// \param ReductionInfos Array type containing the ReductionOps.
1556   /// \param ReductionsBufferTy The StructTy for the reductions buffer.
1557   /// \param FuncAttrs Optional param to specify any function attributes that
1558   ///                  need to be copied to the new function.
1559   ///
1560   /// \return The GlobalToList function.
1561   Function *emitGlobalToListCopyFunction(ArrayRef<ReductionInfo> ReductionInfos,
1562                                          Type *ReductionsBufferTy,
1563                                          AttributeList FuncAttrs);
1564 
1565   /// This function emits a helper that reduces all the reduction variables from
1566   /// the team into the provided global buffer for the reduction variables.
1567   ///
1568   /// void list_to_global_reduce_func(void *buffer, int Idx, void *reduce_data)
1569   ///  void *GlobPtrs[];
1570   ///  GlobPtrs[0] = (void*)&buffer.D0[Idx];
1571   ///  ...
1572   ///  GlobPtrs[N] = (void*)&buffer.DN[Idx];
1573   ///  reduce_function(GlobPtrs, reduce_data);
1574   ///
1575   /// \param ReductionInfos Array type containing the ReductionOps.
1576   /// \param ReduceFn The reduction function.
1577   /// \param ReductionsBufferTy The StructTy for the reductions buffer.
1578   /// \param FuncAttrs Optional param to specify any function attributes that
1579   ///                  need to be copied to the new function.
1580   ///
1581   /// \return The ListToGlobalReduce function.
1582   Function *
1583   emitListToGlobalReduceFunction(ArrayRef<ReductionInfo> ReductionInfos,
1584                                  Function *ReduceFn, Type *ReductionsBufferTy,
1585                                  AttributeList FuncAttrs);
1586 
1587   /// This function emits a helper that reduces all the reduction variables from
1588   /// the provided global buffer into the team's reduction variables.
1589   ///
1590   /// void global_to_list_reduce_func(void *buffer, int Idx, void *reduce_data)
1591   ///  void *GlobPtrs[];
1592   ///  GlobPtrs[0] = (void*)&buffer.D0[Idx];
1593   ///  ...
1594   ///  GlobPtrs[N] = (void*)&buffer.DN[Idx];
1595   ///  reduce_function(reduce_data, GlobPtrs);
1596   ///
1597   /// \param ReductionInfos Array type containing the ReductionOps.
1598   /// \param ReduceFn The reduction function.
1599   /// \param ReductionsBufferTy The StructTy for the reductions buffer.
1600   /// \param FuncAttrs Optional param to specify any function attributes that
1601   ///                  need to be copied to the new function.
1602   ///
1603   /// \return The GlobalToListReduce function.
1604   Function *
1605   emitGlobalToListReduceFunction(ArrayRef<ReductionInfo> ReductionInfos,
1606                                  Function *ReduceFn, Type *ReductionsBufferTy,
1607                                  AttributeList FuncAttrs);
1608 
1609   /// Get the function name of a reduction function.
1610   std::string getReductionFuncName(StringRef Name) const;
1611 
1612   /// Emits reduction function.
1613   /// \param ReducerName Name of the function calling the reduction.
1614   /// \param ReductionInfos Array type containing the ReductionOps.
1615   /// \param ReductionGenCBKind Optional param to specify Clang or MLIR
1616   ///                           CodeGenCB kind.
1617   /// \param FuncAttrs Optional param to specify any function attributes that
1618   ///                  need to be copied to the new function.
1619   ///
1620   /// \return The reduction function.
1621   Expected<Function *> createReductionFunction(
1622       StringRef ReducerName, ArrayRef<ReductionInfo> ReductionInfos,
1623       ReductionGenCBKind ReductionGenCBKind = ReductionGenCBKind::MLIR,
1624       AttributeList FuncAttrs = {});
1625 
1626 public:
1627   ///
1628   /// Design of OpenMP reductions on the GPU
1629   ///
1630   /// Consider a typical OpenMP program with one or more reduction
1631   /// clauses:
1632   ///
1633   /// float foo;
1634   /// double bar;
1635   /// #pragma omp target teams distribute parallel for \
1636   ///             reduction(+:foo) reduction(*:bar)
1637   /// for (int i = 0; i < N; i++) {
1638   ///   foo += A[i]; bar *= B[i];
1639   /// }
1640   ///
1641   /// where 'foo' and 'bar' are reduced across all OpenMP threads in
1642   /// all teams.  In our OpenMP implementation on the NVPTX device an
1643   /// OpenMP team is mapped to a CUDA threadblock and OpenMP threads
1644   /// within a team are mapped to CUDA threads within a threadblock.
1645   /// Our goal is to efficiently aggregate values across all OpenMP
1646   /// threads such that:
1647   ///
1648   ///   - the compiler and runtime are logically concise, and
1649   ///   - the reduction is performed efficiently in a hierarchical
1650   ///     manner as follows: within OpenMP threads in the same warp,
1651   ///     across warps in a threadblock, and finally across teams on
1652   ///     the NVPTX device.
1653   ///
1654   /// Introduction to Decoupling
1655   ///
1656   /// We would like to decouple the compiler and the runtime so that the
1657   /// latter is ignorant of the reduction variables (number, data types)
1658   /// and the reduction operators.  This allows a simpler interface
1659   /// and implementation while still attaining good performance.
1660   ///
1661   /// Pseudocode for the aforementioned OpenMP program generated by the
1662   /// compiler is as follows:
1663   ///
1664   /// 1. Create private copies of reduction variables on each OpenMP
1665   ///    thread: 'foo_private', 'bar_private'
1666   /// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned
1667   ///    to it and writes the result in 'foo_private' and 'bar_private'
1668   ///    respectively.
1669   /// 3. Call the OpenMP runtime on the GPU to reduce within a team
1670   ///    and store the result on the team master:
1671   ///
1672   ///     __kmpc_nvptx_parallel_reduce_nowait_v2(...,
1673   ///        reduceData, shuffleReduceFn, interWarpCpyFn)
1674   ///
1675   ///     where:
1676   ///       struct ReduceData {
1677   ///         double *foo;
1678   ///         double *bar;
1679   ///       } reduceData
1680   ///       reduceData.foo = &foo_private
1681   ///       reduceData.bar = &bar_private
1682   ///
1683   ///     'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two
1684   ///     auxiliary functions generated by the compiler that operate on
1685   ///     variables of type 'ReduceData'.  They help the runtime perform
1686   ///     algorithmic steps in a data agnostic manner.
1687   ///
1688   ///     'shuffleReduceFn' is a pointer to a function that reduces data
1689   ///     of type 'ReduceData' across two OpenMP threads (lanes) in the
1690   ///     same warp.  It takes the following arguments as input:
1691   ///
1692   ///     a. variable of type 'ReduceData' on the calling lane,
1693   ///     b. its lane_id,
1694   ///     c. an offset relative to the current lane_id to generate a
1695   ///        remote_lane_id.  The remote lane contains the second
1696   ///        variable of type 'ReduceData' that is to be reduced.
1697   ///     d. an algorithm version parameter determining which reduction
1698   ///        algorithm to use.
1699   ///
1700   ///     'shuffleReduceFn' retrieves data from the remote lane using
1701   ///     efficient GPU shuffle intrinsics and reduces, using the
1702   ///     algorithm specified by the 4th parameter, the two operands
1703   ///     element-wise.  The result is written to the first operand.
1704   ///
1705   ///     Different reduction algorithms are implemented in different
1706   ///     runtime functions, all calling 'shuffleReduceFn' to perform
1707   ///     the essential reduction step.  Therefore, based on the 4th
1708   ///     parameter, this function behaves slightly differently to
1709   ///     cooperate with the runtime to ensure correctness under
1710   ///     different circumstances.
1711   ///
1712   ///     'InterWarpCpyFn' is a pointer to a function that transfers
1713   ///     reduced variables across warps.  It tunnels, through CUDA
1714   ///     shared memory, the thread-private data of type 'ReduceData'
1715   ///     from lane 0 of each warp to a lane in the first warp.
1716   /// 4. Call the OpenMP runtime on the GPU to reduce across teams.
1717   ///    The last team writes the global reduced value to memory.
1718   ///
1719   ///     ret = __kmpc_nvptx_teams_reduce_nowait(...,
1720   ///             reduceData, shuffleReduceFn, interWarpCpyFn,
1721   ///             scratchpadCopyFn, loadAndReduceFn)
1722   ///
1723   ///     'scratchpadCopyFn' is a helper that stores reduced
1724   ///     data from the team master to a scratchpad array in
1725   ///     global memory.
1726   ///
1727   ///     'loadAndReduceFn' is a helper that loads data from
1728   ///     the scratchpad array and reduces it with the input
1729   ///     operand.
1730   ///
1731   ///     These compiler generated functions hide address
1732   ///     calculation and alignment information from the runtime.
1733   /// 5. if ret == 1:
1734   ///     The team master of the last team stores the reduced
1735   ///     result to the globals in memory.
1736   ///     foo += reduceData.foo; bar *= reduceData.bar
1737   ///
1738   ///
1739   /// Warp Reduction Algorithms
1740   ///
1741   /// On the warp level, we have three algorithms implemented in the
1742   /// OpenMP runtime depending on the number of active lanes:
1743   ///
1744   /// Full Warp Reduction
1745   ///
1746   /// The reduce algorithm within a warp where all lanes are active
1747   /// is implemented in the runtime as follows:
1748   ///
1749   /// full_warp_reduce(void *reduce_data,
1750   ///                  kmp_ShuffleReductFctPtr ShuffleReduceFn) {
1751   ///   for (int offset = WARPSIZE/2; offset > 0; offset /= 2)
1752   ///     ShuffleReduceFn(reduce_data, 0, offset, 0);
1753   /// }
1754   ///
1755   /// The algorithm completes in log(2, WARPSIZE) steps.
1756   ///
1757   /// 'ShuffleReduceFn' is used here with lane_id set to 0 because it is
1758   /// not used; we therefore save instructions by not retrieving lane_id
1759   /// from the corresponding special registers.  The 4th parameter, which
1760   /// represents the version of the algorithm being used, is set to 0 to
1761   /// signify full warp reduction.
1762   ///
1763   /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
1764   ///
1765   /// #reduce_elem refers to an element in the local lane's data structure
1766   /// #remote_elem is retrieved from a remote lane
1767   /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
1768   /// reduce_elem = reduce_elem REDUCE_OP remote_elem;
1769   ///
1770   /// Contiguous Partial Warp Reduction
1771   ///
1772   /// This reduce algorithm is used within a warp where only the first
1773   /// 'n' (n <= WARPSIZE) lanes are active.  It is typically used when the
1774   /// number of OpenMP threads in a parallel region is not a multiple of
1775   /// WARPSIZE.  The algorithm is implemented in the runtime as follows:
1776   ///
1777   /// void
1778   /// contiguous_partial_reduce(void *reduce_data,
1779   ///                           kmp_ShuffleReductFctPtr ShuffleReduceFn,
1780   ///                           int size, int lane_id) {
1781   ///   int curr_size;
1782   ///   int offset;
1783   ///   curr_size = size;
1784   ///   offset = curr_size/2;
1785   ///   while (offset>0) {
1786   ///     ShuffleReduceFn(reduce_data, lane_id, offset, 1);
1787   ///     curr_size = (curr_size+1)/2;
1788   ///     offset = curr_size/2;
1789   ///   }
1790   /// }
1791   ///
1792   /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
1793   ///
1794   /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
1795   /// if (lane_id < offset)
1796   ///     reduce_elem = reduce_elem REDUCE_OP remote_elem
1797   /// else
1798   ///     reduce_elem = remote_elem
1799   ///
1800   /// This algorithm assumes that the data to be reduced are located in a
1801   /// contiguous subset of lanes starting from the first.  When there is
1802   /// an odd number of active lanes, the data in the last lane is not
1803   /// aggregated with any other lane's data but is instead copied over.
1804   ///
1805   /// Dispersed Partial Warp Reduction
1806   ///
1807   /// This algorithm is used within a warp when any discontiguous subset of
1808   /// lanes are active.  It is used to implement the reduction operation
1809   /// across lanes in an OpenMP simd region or in a nested parallel region.
1810   ///
1811   /// void
1812   /// dispersed_partial_reduce(void *reduce_data,
1813   ///                          kmp_ShuffleReductFctPtr ShuffleReduceFn) {
1814   ///   int size, remote_id;
1815   ///   int logical_lane_id = number_of_active_lanes_before_me() * 2;
1816   ///   do {
1817   ///       remote_id = next_active_lane_id_right_after_me();
1818   ///       # the above function returns 0 if no active lane
1819   ///       # is present right after the current lane.
1820   ///       size = number_of_active_lanes_in_this_warp();
1821   ///       logical_lane_id /= 2;
1822   ///       ShuffleReduceFn(reduce_data, logical_lane_id,
1823   ///                       remote_id-1-threadIdx.x, 2);
1824   ///   } while (logical_lane_id % 2 == 0 && size > 1);
1825   /// }
1826   ///
1827   /// There is no assumption made about the initial state of the reduction.
1828   /// Any number of lanes (>=1) could be active at any position.  The reduction
1829   /// result is returned in the first active lane.
1830   ///
1831   /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
1832   ///
1833   /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
1834   /// if (lane_id % 2 == 0 && offset > 0)
1835   ///     reduce_elem = reduce_elem REDUCE_OP remote_elem
1836   /// else
1837   ///     reduce_elem = remote_elem
1838   ///
1839   ///
1840   /// Intra-Team Reduction
1841   ///
1842   /// This function, as implemented in the runtime call
1843   /// '__kmpc_nvptx_parallel_reduce_nowait_v2', aggregates data across OpenMP
1844   /// threads in a team.  It first reduces within a warp using the
1845   /// aforementioned algorithms.  We then proceed to gather all such
1846   /// reduced values at the first warp.
1847   ///
1848   /// The runtime makes use of the function 'InterWarpCpyFn', which copies
1849   /// data from each "warp master" (the zeroth lane of each warp, where
1850   /// warp-reduced data is held) to the zeroth warp.  This step reduces (in
1851   /// a mathematical sense) the problem of reduction across warp masters in
1852   /// a block to the problem of warp reduction.
1853   ///
1854   ///
1855   /// Inter-Team Reduction
1856   ///
1857   /// Once a team has reduced its data to a single value, it is stored in
1858   /// a global scratchpad array.  Since each team has a distinct slot, this
1859   /// can be done without locking.
1860   ///
1861   /// The last team to write to the scratchpad array proceeds to reduce the
1862   /// scratchpad array.  One or more workers in the last team use the helper
1863   /// 'loadAndReduceDataFn' to load and reduce values from the array, i.e.,
1864   /// the k'th worker reduces every k'th element.
1865   ///
1866   /// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait_v2' to
1867   /// reduce across workers and compute a globally reduced value.
1868   ///
1869   /// \param Loc                The location where the reduction was
1870   ///                           encountered. Must be within the associated
1871   ///                           directive and after the last local access to the
1872   ///                           reduction variables.
1873   /// \param AllocaIP           An insertion point suitable for allocas usable
1874   ///                           in reductions.
1875   /// \param CodeGenIP          An insertion point suitable for code
1876   ///                           generation.
1877   /// \param ReductionInfos     A list of info on each reduction variable.
1878   /// \param IsNoWait           Optional flag set if the reduction is marked
1879   ///                           as nowait.
1880   /// \param IsTeamsReduction   Optional flag set if it is a teams
1881   ///                           reduction.
1882   /// \param HasDistribute      Optional flag set if it is a
1883   ///                           distribute reduction.
1884   /// \param GridValue          Optional GPU grid value.
1885   /// \param ReductionBufNum    Optional OpenMPCUDAReductionBufNumValue to be
1886   /// used for teams reduction.
1887   /// \param SrcLocInfo         Source location information global.
1888   InsertPointOrErrorTy createReductionsGPU(
1889       const LocationDescription &Loc, InsertPointTy AllocaIP,
1890       InsertPointTy CodeGenIP, ArrayRef<ReductionInfo> ReductionInfos,
1891       bool IsNoWait = false, bool IsTeamsReduction = false,
1892       bool HasDistribute = false,
1893       ReductionGenCBKind ReductionGenCBKind = ReductionGenCBKind::MLIR,
1894       std::optional<omp::GV> GridValue = {}, unsigned ReductionBufNum = 1024,
1895       Value *SrcLocInfo = nullptr);
1896 
1897   // TODO: provide atomic and non-atomic reduction generators for reduction
1898   // operators defined by the OpenMP specification.
1899 
1900   /// Generator for '#omp reduction'.
1901   ///
1902   /// Emits the IR instructing the runtime to perform the specific kind of
1903   /// reductions. Expects reduction variables to have been privatized and
1904   /// initialized to reduction-neutral values separately. Emits the calls to
1905   /// runtime functions as well as the reduction function and the basic blocks
1906   /// performing the reduction atomically and non-atomically.
1907   ///
1908   /// The code emitted for the following:
1909   ///
1910   /// \code
1911   ///   type var_1;
1912   ///   type var_2;
1913   ///   #pragma omp <directive> reduction(reduction-op:var_1,var_2)
1914   ///   /* body */;
1915   /// \endcode
1916   ///
1917   /// corresponds to the following sketch.
1918   ///
1919   /// \code
1920   /// void _outlined_par() {
1921   ///   // N is the number of different reductions.
1922   ///   void *red_array[] = {privatized_var_1, privatized_var_2, ...};
1923   ///   switch(__kmpc_reduce(..., N, /*size of data in red array*/, red_array,
1924   ///                        _omp_reduction_func,
1925   ///                        _gomp_critical_user.reduction.var)) {
1926   ///   case 1: {
1927   ///     var_1 = var_1 <reduction-op> privatized_var_1;
1928   ///     var_2 = var_2 <reduction-op> privatized_var_2;
1929   ///     // ...
1930   ///    __kmpc_end_reduce(...);
1931   ///     break;
1932   ///   }
1933   ///   case 2: {
1934   ///     _Atomic<ReductionOp>(var_1, privatized_var_1);
1935   ///     _Atomic<ReductionOp>(var_2, privatized_var_2);
1936   ///     // ...
1937   ///     break;
1938   ///   }
1939   ///   default: break;
1940   ///   }
1941   /// }
1942   ///
1943   /// void _omp_reduction_func(void **lhs, void **rhs) {
1944   ///   *(type *)lhs[0] = *(type *)lhs[0] <reduction-op> *(type *)rhs[0];
1945   ///   *(type *)lhs[1] = *(type *)lhs[1] <reduction-op> *(type *)rhs[1];
1946   ///   // ...
1947   /// }
1948   /// \endcode
1949   ///
1950   /// \param Loc                The location where the reduction was
1951   ///                           encountered. Must be within the associated
1952   ///                           directive and after the last local access to the
1953   ///                           reduction variables.
1954   /// \param AllocaIP           An insertion point suitable for allocas usable
1955   ///                           in reductions.
1956   /// \param ReductionInfos     A list of info on each reduction variable.
1957   /// \param IsNoWait           A flag set if the reduction is marked as nowait.
1958   /// \param IsByRef            A flag per reduction variable indicating whether
1959   /// the reduction is performed by reference or by value.
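  ///
  /// Typical call sequence (sketch; ReductionInfos is assumed to be populated
  /// as described for ReductionInfo above):
  /// \code
  ///   SmallVector<bool> IsByRef(ReductionInfos.size(), false);
  ///   OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
  ///       OMPBuilder.createReductions(Loc, AllocaIP, ReductionInfos, IsByRef,
  ///                                   /*IsNoWait=*/false);
  ///   if (!AfterIP)
  ///     return AfterIP.takeError();
  ///   Builder.restoreIP(*AfterIP);
  /// \endcode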
1960   InsertPointOrErrorTy createReductions(const LocationDescription &Loc,
1961                                         InsertPointTy AllocaIP,
1962                                         ArrayRef<ReductionInfo> ReductionInfos,
1963                                         ArrayRef<bool> IsByRef,
1964                                         bool IsNoWait = false);
1965 
1966   ///}
1967 
1968   /// Return the insertion point used by the underlying IRBuilder.
1969   InsertPointTy getInsertionPoint() { return Builder.saveIP(); }
1970 
1971   /// Update the internal location to \p Loc.
1972   bool updateToLocation(const LocationDescription &Loc) {
1973     Builder.restoreIP(Loc.IP);
1974     Builder.SetCurrentDebugLocation(Loc.DL);
1975     return Loc.IP.getBlock() != nullptr;
1976   }
1977 
1978   /// Return the function declaration for the runtime function with \p FnID.
1979   FunctionCallee getOrCreateRuntimeFunction(Module &M,
1980                                             omp::RuntimeFunction FnID);
1981 
1982   Function *getOrCreateRuntimeFunctionPtr(omp::RuntimeFunction FnID);
1983 
1984   /// Return the (LLVM-IR) string describing the source location \p LocStr.
1985   Constant *getOrCreateSrcLocStr(StringRef LocStr, uint32_t &SrcLocStrSize);
1986 
1987   /// Return the (LLVM-IR) string describing the default source location.
1988   Constant *getOrCreateDefaultSrcLocStr(uint32_t &SrcLocStrSize);
1989 
1990   /// Return the (LLVM-IR) string describing the source location identified by
1991   /// the arguments.
1992   Constant *getOrCreateSrcLocStr(StringRef FunctionName, StringRef FileName,
1993                                  unsigned Line, unsigned Column,
1994                                  uint32_t &SrcLocStrSize);
1995 
1996   /// Return the (LLVM-IR) string describing the DebugLoc \p DL. Use \p F as
1997   /// fallback if \p DL does not specify the function name.
1998   Constant *getOrCreateSrcLocStr(DebugLoc DL, uint32_t &SrcLocStrSize,
1999                                  Function *F = nullptr);
2000 
2001   /// Return the (LLVM-IR) string describing the source location \p Loc.
2002   Constant *getOrCreateSrcLocStr(const LocationDescription &Loc,
2003                                  uint32_t &SrcLocStrSize);
2004 
2005   /// Return an ident_t* encoding the source location \p SrcLocStr and \p Flags.
2006   /// TODO: Create an enum class for the Reserve2Flags
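  ///
  /// Typical usage together with getOrCreateSrcLocStr and getOrCreateThreadID,
  /// as a sketch of the common pattern inside the builder (Loc is a
  /// LocationDescription):
  /// \code
  ///   uint32_t SrcLocStrSize;
  ///   Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  ///   Constant *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  ///   Value *ThreadID = getOrCreateThreadID(Ident);
  /// \endcode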
2007   Constant *getOrCreateIdent(Constant *SrcLocStr, uint32_t SrcLocStrSize,
2008                              omp::IdentFlag Flags = omp::IdentFlag(0),
2009                              unsigned Reserve2Flags = 0);
2010 
2011   /// Create a hidden global flag \p Name in the module with initial value \p
2012   /// Value.
2013   GlobalValue *createGlobalFlag(unsigned Value, StringRef Name);
2014 
2015   /// Emit the llvm.used metadata.
2016   void emitUsed(StringRef Name, ArrayRef<llvm::WeakTrackingVH> List);
2017 
2018   /// Emit the kernel execution mode.
2019   GlobalVariable *emitKernelExecutionMode(StringRef KernelName,
2020                                           omp::OMPTgtExecModeFlags Mode);
2021 
2022   /// Generate control flow and cleanup for cancellation.
2023   ///
2024   /// \param CancelFlag Flag indicating if the cancellation is performed.
2025   /// \param CanceledDirective The kind of directive that is canceled.
2026   /// \param ExitCB Extra code to be generated in the exit block.
2027   ///
2028   /// \return an error, if any were triggered during execution.
2029   Error emitCancelationCheckImpl(Value *CancelFlag,
2030                                  omp::Directive CanceledDirective,
2031                                  FinalizeCallbackTy ExitCB = {});
2032 
2033   /// Generate a target region entry call.
2034   ///
2035   /// \param Loc The location at which the request originated and is fulfilled.
2036   /// \param AllocaIP The insertion point to be used for alloca instructions.
2037   /// \param Return Return value of the created function returned by reference.
2038   /// \param DeviceID Identifier for the device via the 'device' clause.
2039   /// \param NumTeams Number of teams for the region via the 'num_teams' clause
2040   ///                 or 0 if unspecified and -1 if there is no 'teams' clause.
2041   /// \param NumThreads Number of threads via the 'thread_limit' clause.
2042   /// \param HostPtr Pointer to the host-side pointer of the target kernel.
2043   /// \param KernelArgs Array of arguments to the kernel.
2044   InsertPointTy emitTargetKernel(const LocationDescription &Loc,
2045                                  InsertPointTy AllocaIP, Value *&Return,
2046                                  Value *Ident, Value *DeviceID, Value *NumTeams,
2047                                  Value *NumThreads, Value *HostPtr,
2048                                  ArrayRef<Value *> KernelArgs);
2049 
2050   /// Generate a flush runtime call.
2051   ///
2052   /// \param Loc The location at which the request originated and is fulfilled.
2053   void emitFlush(const LocationDescription &Loc);
2054 
2055   /// The finalization stack made up of finalize callbacks currently in-flight,
2056   /// wrapped into FinalizationInfo objects that also reference the finalization
2057   /// target block and the kind of cancellable directive.
2058   SmallVector<FinalizationInfo, 8> FinalizationStack;
2059 
2060   /// Return true if the last entry in the finalization stack is of kind \p DK
2061   /// and cancellable.
2062   bool isLastFinalizationInfoCancellable(omp::Directive DK) {
2063     return !FinalizationStack.empty() &&
2064            FinalizationStack.back().IsCancellable &&
2065            FinalizationStack.back().DK == DK;
2066   }
2067 
2068   /// Generate a taskwait runtime call.
2069   ///
2070   /// \param Loc The location at which the request originated and is fulfilled.
2071   void emitTaskwaitImpl(const LocationDescription &Loc);
2072 
2073   /// Generate a taskyield runtime call.
2074   ///
2075   /// \param Loc The location at which the request originated and is fulfilled.
2076   void emitTaskyieldImpl(const LocationDescription &Loc);
2077 
2078   /// Return the current thread ID.
2079   ///
2080   /// \param Ident The ident (ident_t*) describing the query origin.
2081   Value *getOrCreateThreadID(Value *Ident);
2082 
2083   /// The OpenMPIRBuilder Configuration
2084   OpenMPIRBuilderConfig Config;
2085 
2086   /// The underlying LLVM-IR module
2087   Module &M;
2088 
2089   /// The LLVM-IR Builder used to create IR.
2090   IRBuilder<> Builder;
2091 
2092   /// Map to remember source location strings
2093   StringMap<Constant *> SrcLocStrMap;
2094 
2095   /// Map to remember existing ident_t*.
2096   DenseMap<std::pair<Constant *, uint64_t>, Constant *> IdentMap;
2097 
2098   /// Info manager to keep track of target regions.
2099   OffloadEntriesInfoManager OffloadInfoManager;
2100 
2101   /// The target triple of the underlying module.
2102   const Triple T;
2103 
2104   /// Helper that contains information about regions we need to outline
2105   /// during finalization.
2106   struct OutlineInfo {
2107     using PostOutlineCBTy = std::function<void(Function &)>;
2108     PostOutlineCBTy PostOutlineCB;
2109     BasicBlock *EntryBB, *ExitBB, *OuterAllocaBB;
2110     SmallVector<Value *, 2> ExcludeArgsFromAggregate;
2111 
2112     /// Collect all blocks in between EntryBB and ExitBB in both the given
2113     /// vector and set.
2114     void collectBlocks(SmallPtrSetImpl<BasicBlock *> &BlockSet,
2115                        SmallVectorImpl<BasicBlock *> &BlockVector);
2116 
2117     /// Return the function that contains the region to be outlined.
2118     Function *getFunction() const { return EntryBB->getParent(); }
2119   };
2120 
2121   /// Collection of regions that need to be outlined during finalization.
2122   SmallVector<OutlineInfo, 16> OutlineInfos;
2123 
2124   /// A collection of candidate target functions whose constant allocas will be
2125   /// raised on a call to finalize, after all currently enqueued outline infos
2126   /// have been processed.
2127   SmallVector<llvm::Function *, 16> ConstantAllocaRaiseCandidates;
2128 
2129   /// Collection of owned canonical loop objects that eventually need to be
2130   /// free'd.
2131   std::forward_list<CanonicalLoopInfo> LoopInfos;
2132 
2133   /// Add a new region that will be outlined later.
2134   void addOutlineInfo(OutlineInfo &&OI) { OutlineInfos.emplace_back(OI); }
2135 
2136   /// An ordered map of auto-generated variables to their unique names.
2137   /// It stores variables with the following names: 1) ".gomp_critical_user_" +
2138   /// <critical_section_name> + ".var" for "omp critical" directives; 2)
2139   /// <mangled_name_for_global_var> + ".cache." for cache for threadprivate
2140   /// variables.
2141   StringMap<GlobalVariable *, BumpPtrAllocator> InternalVars;
2142 
2143   /// Computes the size of a type in bytes.
2144   Value *getSizeInBytes(Value *BasePtr);
2145 
2146   // Emit a branch from the current block to the Target block only if
2147   // the current block does not already have a terminator.
2148   void emitBranch(BasicBlock *Target);
2149 
2150   // If BB has no use then delete it and return. Else place BB after the current
2151   // block, if possible, or else at the end of the function. Also add a branch
2152   // from current block to BB if current block does not have a terminator.
2153   void emitBlock(BasicBlock *BB, Function *CurFn, bool IsFinished = false);
2154 
2155   /// Emits code for an OpenMP 'if' clause using the specified \a BodyGenCallbackTy.
2156   /// Here is the logic:
2157   /// if (Cond) {
2158   ///   ThenGen();
2159   /// } else {
2160   ///   ElseGen();
2161   /// }
2162   ///
2163   /// \return an error, if any were triggered during execution.
2164   Error emitIfClause(Value *Cond, BodyGenCallbackTy ThenGen,
2165                      BodyGenCallbackTy ElseGen, InsertPointTy AllocaIP = {});
2166 
2167   /// Create the global variable holding the offload mappings information.
2168   GlobalVariable *createOffloadMaptypes(SmallVectorImpl<uint64_t> &Mappings,
2169                                         std::string VarName);
2170 
2171   /// Create the global variable holding the offload names information.
2172   GlobalVariable *
2173   createOffloadMapnames(SmallVectorImpl<llvm::Constant *> &Names,
2174                         std::string VarName);
2175 
2176   struct MapperAllocas {
2177     AllocaInst *ArgsBase = nullptr;
2178     AllocaInst *Args = nullptr;
2179     AllocaInst *ArgSizes = nullptr;
2180   };
2181 
2182   /// Create the alloca instructions used in the call to mapper functions.
2183   void createMapperAllocas(const LocationDescription &Loc,
2184                            InsertPointTy AllocaIP, unsigned NumOperands,
2185                            struct MapperAllocas &MapperAllocas);
2186 
2187   /// Create the call for the target mapper function.
2188   /// \param Loc The source location description.
2189   /// \param MapperFunc Function to be called.
2190   /// \param SrcLocInfo Source location information global.
2191   /// \param MaptypesArg The map types array argument.
2192   /// \param MapnamesArg The map names array argument.
2193   /// \param MapperAllocas The AllocaInst used for the call.
2194   /// \param DeviceID Device ID for the call.
2195   /// \param NumOperands Number of operands in the call.
2196   void emitMapperCall(const LocationDescription &Loc, Function *MapperFunc,
2197                       Value *SrcLocInfo, Value *MaptypesArg, Value *MapnamesArg,
2198                       struct MapperAllocas &MapperAllocas, int64_t DeviceID,
2199                       unsigned NumOperands);
2200 
2201   /// Container for the arguments used to pass data to the runtime library.
2202   struct TargetDataRTArgs {
2203     /// The array of base pointer passed to the runtime library.
2204     Value *BasePointersArray = nullptr;
2205     /// The array of section pointers passed to the runtime library.
2206     Value *PointersArray = nullptr;
2207     /// The array of sizes passed to the runtime library.
2208     Value *SizesArray = nullptr;
2209     /// The array of map types passed to the runtime library for the beginning
2210     /// of the region or for the entire region if there are no separate map
2211     /// types for the region end.
2212     Value *MapTypesArray = nullptr;
2213     /// The array of map types passed to the runtime library for the end of the
2214     /// region, or nullptr if there are no separate map types for the region
2215     /// end.
2216     Value *MapTypesArrayEnd = nullptr;
2217     /// The array of user-defined mappers passed to the runtime library.
2218     Value *MappersArray = nullptr;
2219     /// The array of original declaration names of mapped pointers sent to the
2220     /// runtime library for debugging
2221     Value *MapNamesArray = nullptr;
2222 
2223     explicit TargetDataRTArgs() {}
2224     explicit TargetDataRTArgs(Value *BasePointersArray, Value *PointersArray,
2225                               Value *SizesArray, Value *MapTypesArray,
2226                               Value *MapTypesArrayEnd, Value *MappersArray,
2227                               Value *MapNamesArray)
2228         : BasePointersArray(BasePointersArray), PointersArray(PointersArray),
2229           SizesArray(SizesArray), MapTypesArray(MapTypesArray),
2230           MapTypesArrayEnd(MapTypesArrayEnd), MappersArray(MappersArray),
2231           MapNamesArray(MapNamesArray) {}
2232   };
2233 
2234   /// Container to pass the default attributes with which a kernel must be
2235   /// launched, used to set kernel attributes and populate associated static
2236   /// structures.
2237   ///
2238   /// For max values, < 0 means unset, == 0 means set but unknown at compile
2239   /// time. The number of max values will be 1 except for the case where
2240   /// ompx_bare is set.
2241   struct TargetKernelDefaultAttrs {
2242     omp::OMPTgtExecModeFlags ExecFlags =
2243         omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC;
2244     SmallVector<int32_t, 3> MaxTeams = {-1};
2245     int32_t MinTeams = 1;
2246     SmallVector<int32_t, 3> MaxThreads = {-1};
2247     int32_t MinThreads = 1;
2248   };
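  /// A minimal sketch of populating these defaults for an SPMD kernel with an
  /// upper bound of 256 threads and no compile-time team bound (illustrative
  /// values only).
  /// \code
  ///   OpenMPIRBuilder::TargetKernelDefaultAttrs Attrs;
  ///   Attrs.ExecFlags = omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_SPMD;
  ///   Attrs.MaxTeams = {-1};  // unset
  ///   Attrs.MaxThreads = {256};
  /// \endcode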
2249 
2250   /// Container to pass LLVM IR runtime values or constants related to the
2251   /// number of teams and threads with which the kernel must be launched, as
2252   /// well as the trip count of the loop, if it is an SPMD or Generic-SPMD
2253   /// kernel. These must be defined in the host prior to the call to the kernel
2254   /// launch OpenMP RTL function.
2255   struct TargetKernelRuntimeAttrs {
2256     SmallVector<Value *, 3> MaxTeams = {nullptr};
2257     Value *MinTeams = nullptr;
2258     SmallVector<Value *, 3> TargetThreadLimit = {nullptr};
2259     SmallVector<Value *, 3> TeamsThreadLimit = {nullptr};
2260 
2261     /// 'parallel' construct 'num_threads' clause value, if present and the
2262     /// kernel is SPMD.
2263     Value *MaxThreads = nullptr;
2264 
2265     /// Total number of iterations of the SPMD or Generic-SPMD kernel or null if
2266     /// it is a generic kernel.
2267     Value *LoopTripCount = nullptr;
2268   };
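  /// A minimal sketch, assuming the host-side Values for the clause operands
  /// (ThreadLimitVal, NumThreadsVal, TripCountVal) were emitted earlier.
  /// \code
  ///   OpenMPIRBuilder::TargetKernelRuntimeAttrs RTAttrs;
  ///   RTAttrs.TargetThreadLimit = {ThreadLimitVal}; // 'thread_limit' on 'target'
  ///   RTAttrs.MaxThreads = NumThreadsVal;           // 'num_threads' on 'parallel'
  ///   RTAttrs.LoopTripCount = TripCountVal;         // SPMD/Generic-SPMD only
  /// \endcode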
2269 
2270   /// Data structure that contains the needed information to construct the
2271   /// kernel args vector.
2272   struct TargetKernelArgs {
2273     /// Number of arguments passed to the runtime library.
2274     unsigned NumTargetItems = 0;
2275     /// Arguments passed to the runtime library
2276     TargetDataRTArgs RTArgs;
2277     /// The number of iterations
2278     Value *NumIterations = nullptr;
2279     /// The number of teams.
2280     ArrayRef<Value *> NumTeams;
2281     /// The number of threads.
2282     ArrayRef<Value *> NumThreads;
2283     /// The size of the dynamic shared memory.
2284     Value *DynCGGroupMem = nullptr;
2285     /// True if the kernel has 'no wait' clause.
2286     bool HasNoWait = false;
2287 
2288     // Constructors for TargetKernelArgs.
2289     TargetKernelArgs() {}
2290     TargetKernelArgs(unsigned NumTargetItems, TargetDataRTArgs RTArgs,
2291                      Value *NumIterations, ArrayRef<Value *> NumTeams,
2292                      ArrayRef<Value *> NumThreads, Value *DynCGGroupMem,
2293                      bool HasNoWait)
2294         : NumTargetItems(NumTargetItems), RTArgs(RTArgs),
2295           NumIterations(NumIterations), NumTeams(NumTeams),
2296           NumThreads(NumThreads), DynCGGroupMem(DynCGGroupMem),
2297           HasNoWait(HasNoWait) {}
2298   };
2299 
2300   /// Create the kernel args vector used by emitTargetKernel. This function
2301   /// creates various constant values that are used in the resulting args
2302   /// vector.
2303   static void getKernelArgsVector(TargetKernelArgs &KernelArgs,
2304                                   IRBuilderBase &Builder,
2305                                   SmallVector<Value *> &ArgsVector);
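  /// A minimal sketch, assuming RTArgs and the launch operands (NumItems,
  /// TripCount, NumTeamsVals, NumThreadsVals, DynMemSize) were produced
  /// earlier, e.g. by emitOffloadingArraysAndArgs.
  /// \code
  ///   OpenMPIRBuilder::TargetKernelArgs KArgs(
  ///       NumItems, RTArgs, TripCount, NumTeamsVals, NumThreadsVals,
  ///       DynMemSize, /*HasNoWait=*/false);
  ///   SmallVector<Value *> ArgsVector;
  ///   OpenMPIRBuilder::getKernelArgsVector(KArgs, Builder, ArgsVector);
  /// \endcode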
2306 
2307   /// Struct that keeps the information that should be kept throughout
2308   /// a 'target data' region.
2309   class TargetDataInfo {
2310     /// Set to true if device pointer information has to be obtained.
2311     bool RequiresDevicePointerInfo = false;
2312     /// Set to true if Clang emits separate runtime calls for the beginning and
2313     /// end of the region.  These calls might have separate map type arrays.
2314     bool SeparateBeginEndCalls = false;
2315 
2316   public:
2317     TargetDataRTArgs RTArgs;
2318 
2319     SmallMapVector<const Value *, std::pair<Value *, Value *>, 4>
2320         DevicePtrInfoMap;
2321 
2322     /// Indicate whether any user-defined mapper exists.
2323     bool HasMapper = false;
2324     /// The total number of pointers passed to the runtime library.
2325     unsigned NumberOfPtrs = 0u;
2326 
2327     bool EmitDebug = false;
2328 
2329     /// Whether the `target ... data` directive has a `nowait` clause.
2330     bool HasNoWait = false;
2331 
2332     explicit TargetDataInfo() {}
2333     explicit TargetDataInfo(bool RequiresDevicePointerInfo,
2334                             bool SeparateBeginEndCalls)
2335         : RequiresDevicePointerInfo(RequiresDevicePointerInfo),
2336           SeparateBeginEndCalls(SeparateBeginEndCalls) {}
2337     /// Clear information about the data arrays.
2338     void clearArrayInfo() {
2339       RTArgs = TargetDataRTArgs();
2340       HasMapper = false;
2341       NumberOfPtrs = 0u;
2342     }
2343     /// Return true if the current target data information has valid arrays.
2344     bool isValid() {
2345       return RTArgs.BasePointersArray && RTArgs.PointersArray &&
2346              RTArgs.SizesArray && RTArgs.MapTypesArray &&
2347              (!HasMapper || RTArgs.MappersArray) && NumberOfPtrs;
2348     }
2349     bool requiresDevicePointerInfo() { return RequiresDevicePointerInfo; }
2350     bool separateBeginEndCalls() { return SeparateBeginEndCalls; }
2351   };
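  /// A minimal sketch: request device-pointer tracking and separate begin/end
  /// runtime calls, then check the arrays once they have been emitted.
  /// \code
  ///   OpenMPIRBuilder::TargetDataInfo Info(
  ///       /*RequiresDevicePointerInfo=*/true, /*SeparateBeginEndCalls=*/true);
  ///   // ... emit the offload arrays into Info ...
  ///   assert(Info.isValid() && "offload arrays were not emitted");
  /// \endcode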
2352 
2353   enum class DeviceInfoTy { None, Pointer, Address };
2354   using MapValuesArrayTy = SmallVector<Value *, 4>;
2355   using MapDeviceInfoArrayTy = SmallVector<DeviceInfoTy, 4>;
2356   using MapFlagsArrayTy = SmallVector<omp::OpenMPOffloadMappingFlags, 4>;
2357   using MapNamesArrayTy = SmallVector<Constant *, 4>;
2358   using MapDimArrayTy = SmallVector<uint64_t, 4>;
2359   using MapNonContiguousArrayTy = SmallVector<MapValuesArrayTy, 4>;
2360 
2361   /// This structure contains combined information generated for mappable
2362   /// clauses, including base pointers, pointers, sizes, map types, user-defined
2363   /// mappers, and non-contiguous information.
2364   struct MapInfosTy {
2365     struct StructNonContiguousInfo {
2366       bool IsNonContiguous = false;
2367       MapDimArrayTy Dims;
2368       MapNonContiguousArrayTy Offsets;
2369       MapNonContiguousArrayTy Counts;
2370       MapNonContiguousArrayTy Strides;
2371     };
2372     MapValuesArrayTy BasePointers;
2373     MapValuesArrayTy Pointers;
2374     MapDeviceInfoArrayTy DevicePointers;
2375     MapValuesArrayTy Sizes;
2376     MapFlagsArrayTy Types;
2377     MapNamesArrayTy Names;
2378     StructNonContiguousInfo NonContigInfo;
2379 
2380     /// Append arrays in \a CurInfo.
2381     void append(MapInfosTy &CurInfo) {
2382       BasePointers.append(CurInfo.BasePointers.begin(),
2383                           CurInfo.BasePointers.end());
2384       Pointers.append(CurInfo.Pointers.begin(), CurInfo.Pointers.end());
2385       DevicePointers.append(CurInfo.DevicePointers.begin(),
2386                             CurInfo.DevicePointers.end());
2387       Sizes.append(CurInfo.Sizes.begin(), CurInfo.Sizes.end());
2388       Types.append(CurInfo.Types.begin(), CurInfo.Types.end());
2389       Names.append(CurInfo.Names.begin(), CurInfo.Names.end());
2390       NonContigInfo.Dims.append(CurInfo.NonContigInfo.Dims.begin(),
2391                                 CurInfo.NonContigInfo.Dims.end());
2392       NonContigInfo.Offsets.append(CurInfo.NonContigInfo.Offsets.begin(),
2393                                    CurInfo.NonContigInfo.Offsets.end());
2394       NonContigInfo.Counts.append(CurInfo.NonContigInfo.Counts.begin(),
2395                                   CurInfo.NonContigInfo.Counts.end());
2396       NonContigInfo.Strides.append(CurInfo.NonContigInfo.Strides.begin(),
2397                                    CurInfo.NonContigInfo.Strides.end());
2398     }
2399   };
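  /// A minimal sketch: per-clause map information is merged into one combined
  /// set before the offload arrays are emitted.
  /// \code
  ///   OpenMPIRBuilder::MapInfosTy Combined, PerClause;
  ///   // ... fill PerClause.BasePointers/Pointers/Sizes/Types/Names ...
  ///   Combined.append(PerClause);
  /// \endcode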
2400 
2401   /// Callback function type for functions emitting the host fallback code that
2402   /// is executed when the kernel launch fails. It takes an insertion point as
2403   /// parameter where the code should be emitted. It returns an insertion point
2404   /// that points right after the emitted code.
2405   using EmitFallbackCallbackTy =
2406       function_ref<InsertPointOrErrorTy(InsertPointTy)>;
2407 
2408   /// Generate a target region entry call and host fallback call.
2409   ///
2410   /// \param Loc The location at which the request originated and is fulfilled.
2411   /// \param OutlinedFnID The outlined function ID.
2412   /// \param EmitTargetCallFallbackCB Call back function to generate host
2413   ///        fallback code.
2414   /// \param Args Data structure holding information about the kernel arguments.
2415   /// \param DeviceID Identifier for the device via the 'device' clause.
2416   /// \param RTLoc Source location identifier
2417   /// \param AllocaIP The insertion point to be used for alloca instructions.
2418   InsertPointOrErrorTy
2419   emitKernelLaunch(const LocationDescription &Loc, Value *OutlinedFnID,
2420                    EmitFallbackCallbackTy EmitTargetCallFallbackCB,
2421                    TargetKernelArgs &Args, Value *DeviceID, Value *RTLoc,
2422                    InsertPointTy AllocaIP);
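  /// A minimal sketch, assuming an IRBuilder<> Builder plus OutlinedFnID,
  /// KArgs, DeviceID, RTLoc and AllocaIP from the steps above; HostFallbackFn
  /// and FallbackArgs are placeholders for the host version of the region.
  /// \code
  ///   auto FallbackCB = [&](OpenMPIRBuilder::InsertPointTy IP)
  ///       -> OpenMPIRBuilder::InsertPointOrErrorTy {
  ///     Builder.restoreIP(IP);
  ///     Builder.CreateCall(HostFallbackFn, FallbackArgs);
  ///     return Builder.saveIP();
  ///   };
  ///   OpenMPIRBuilder::InsertPointOrErrorTy AfterIP = OMPBuilder.emitKernelLaunch(
  ///       Loc, OutlinedFnID, FallbackCB, KArgs, DeviceID, RTLoc, AllocaIP);
  /// \endcode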
2423 
2424   /// Callback type for generating the bodies of device directives that require
2425   /// outer target tasks (e.g. in case of having `nowait` or `depend` clauses).
2426   ///
2427   /// \param DeviceID The ID of the device on which the target region will
2428   ///        execute.
2429   /// \param RTLoc Source location identifier
2430   /// \param TargetTaskAllocaIP Insertion point for the alloca block of the
2431   ///        generated task.
2432   ///
2433   /// \return an error, if any were triggered during execution.
2434   using TargetTaskBodyCallbackTy =
2435       function_ref<Error(Value *DeviceID, Value *RTLoc,
2436                          IRBuilderBase::InsertPoint TargetTaskAllocaIP)>;
2437 
2438   /// Generate a target-task for the target construct
2439   ///
2440   /// \param TaskBodyCB Callback to generate the actual body of the target task.
2441   /// \param DeviceID Identifier for the device via the 'device' clause.
2442   /// \param RTLoc Source location identifier
2443   /// \param AllocaIP The insertion point to be used for alloca instructions.
2444   /// \param Dependencies Vector of DependData objects holding information of
2445   ///        dependencies as specified by the 'depend' clause.
2446   /// \param HasNoWait True if the target construct had 'nowait' on it, false
2447   ///        otherwise
2448   InsertPointOrErrorTy emitTargetTask(
2449       TargetTaskBodyCallbackTy TaskBodyCB, Value *DeviceID, Value *RTLoc,
2450       OpenMPIRBuilder::InsertPointTy AllocaIP,
2451       const SmallVector<llvm::OpenMPIRBuilder::DependData> &Dependencies,
2452       bool HasNoWait);
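  /// A minimal sketch: wrap the kernel launch in a target task so that a
  /// 'nowait' target region becomes an asynchronous task (DeviceID, RTLoc and
  /// AllocaIP as above; the body emission is elided).
  /// \code
  ///   auto TaskBodyCB = [&](Value *DevID, Value *SrcLoc,
  ///                         IRBuilderBase::InsertPoint TaskAllocaIP) -> Error {
  ///     // Emit the actual kernel launch for the task body here.
  ///     return Error::success();
  ///   };
  ///   OpenMPIRBuilder::InsertPointOrErrorTy AfterIP = OMPBuilder.emitTargetTask(
  ///       TaskBodyCB, DeviceID, RTLoc, AllocaIP, /*Dependencies=*/{},
  ///       /*HasNoWait=*/true);
  /// \endcode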
2453 
2454   /// Emit the arguments to be passed to the runtime library based on the
2455   /// arrays of base pointers, pointers, sizes, map types, and mappers.  If
2456   /// ForEndCall, emit map types to be passed for the end of the region instead
2457   /// of the beginning.
2458   void emitOffloadingArraysArgument(IRBuilderBase &Builder,
2459                                     OpenMPIRBuilder::TargetDataRTArgs &RTArgs,
2460                                     OpenMPIRBuilder::TargetDataInfo &Info,
2461                                     bool ForEndCall = false);
2462 
2463   /// Emit an array of struct descriptors to be assigned to the offload args.
2464   void emitNonContiguousDescriptor(InsertPointTy AllocaIP,
2465                                    InsertPointTy CodeGenIP,
2466                                    MapInfosTy &CombinedInfo,
2467                                    TargetDataInfo &Info);
2468 
2469   /// Emit the arrays used to pass the captures and map information to the
2470   /// offloading runtime library. If there is no map or capture information,
2471   /// return nullptr by reference. Accepts a reference to a MapInfosTy object
2472   /// that contains information generated for mappable clauses,
2473   /// including base pointers, pointers, sizes, map types, user-defined mappers.
2474   void emitOffloadingArrays(
2475       InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo,
2476       TargetDataInfo &Info, bool IsNonContiguous = false,
2477       function_ref<void(unsigned int, Value *)> DeviceAddrCB = nullptr,
2478       function_ref<Value *(unsigned int)> CustomMapperCB = nullptr);
2479 
2480   /// Allocates memory for and populates the arrays required for offloading
2481   /// (offload_{baseptrs|ptrs|mappers|sizes|maptypes|mapnames}). Then, it
2482   /// emits their base addresses as arguments to be passed to the runtime
2483   /// library. In essence, this function is a combination of
2484   /// emitOffloadingArrays and emitOffloadingArraysArgument and should arguably
2485   /// be preferred by clients of OpenMPIRBuilder.
2486   void emitOffloadingArraysAndArgs(
2487       InsertPointTy AllocaIP, InsertPointTy CodeGenIP, TargetDataInfo &Info,
2488       TargetDataRTArgs &RTArgs, MapInfosTy &CombinedInfo,
2489       bool IsNonContiguous = false, bool ForEndCall = false,
2490       function_ref<void(unsigned int, Value *)> DeviceAddrCB = nullptr,
2491       function_ref<Value *(unsigned int)> CustomMapperCB = nullptr);
2492 
2493   /// Creates offloading entry for the provided entry ID \a ID, address \a
2494   /// Addr, size \a Size, and flags \a Flags.
2495   void createOffloadEntry(Constant *ID, Constant *Addr, uint64_t Size,
2496                           int32_t Flags, GlobalValue::LinkageTypes,
2497                           StringRef Name = "");
2498 
2499   /// The kind of errors that can occur when emitting the offload entries and
2500   /// metadata.
2501   enum EmitMetadataErrorKind {
2502     EMIT_MD_TARGET_REGION_ERROR,
2503     EMIT_MD_DECLARE_TARGET_ERROR,
2504     EMIT_MD_GLOBAL_VAR_LINK_ERROR
2505   };
2506 
2507   /// Callback function type
2508   using EmitMetadataErrorReportFunctionTy =
2509       std::function<void(EmitMetadataErrorKind, TargetRegionEntryInfo)>;
2510 
2511   // Emit the offloading entries and metadata so that the device codegen side
2512   // can easily figure out what to emit. The produced metadata looks like
2513   // this:
2514   //
2515   // !omp_offload.info = !{!1, ...}
2516   //
2517   // We only generate metadata for functions that contain target regions.
2518   void createOffloadEntriesAndInfoMetadata(
2519       EmitMetadataErrorReportFunctionTy &ErrorReportFunction);
2520 
2521 public:
2522   /// Generator for __kmpc_copyprivate
2523   ///
2524   /// \param Loc The source location description.
2525   /// \param BufSize Number of elements in the buffer.
2526   /// \param CpyBuf List of pointers to data to be copied.
2527   /// \param CpyFn function to call for copying data.
2528   /// \param DidIt flag variable; 1 for 'single' thread, 0 otherwise.
2529   ///
2530   /// \return The insertion position *after* the CopyPrivate call.
2532   InsertPointTy createCopyPrivate(const LocationDescription &Loc,
2533                                   llvm::Value *BufSize, llvm::Value *CpyBuf,
2534                                   llvm::Value *CpyFn, llvm::Value *DidIt);
2535 
2536   /// Generator for '#omp single'
2537   ///
2538   /// \param Loc The source location description.
2539   /// \param BodyGenCB Callback that will generate the region code.
2540   /// \param FiniCB Callback to finalize variable copies.
2541   /// \param IsNowait If false, a barrier is emitted.
2542   /// \param CPVars copyprivate variables.
2543   /// \param CPFuncs copy functions to use for each copyprivate variable.
2544   ///
2545   /// \returns The insertion position *after* the single call.
2546   InsertPointOrErrorTy createSingle(const LocationDescription &Loc,
2547                                     BodyGenCallbackTy BodyGenCB,
2548                                     FinalizeCallbackTy FiniCB, bool IsNowait,
2549                                     ArrayRef<llvm::Value *> CPVars = {},
2550                                     ArrayRef<llvm::Function *> CPFuncs = {});
2551 
2552   /// Generator for '#omp master'
2553   ///
2554   /// \param Loc The insert and source location description.
2555   /// \param BodyGenCB Callback that will generate the region code.
2556   /// \param FiniCB Callback to finalize variable copies.
2557   ///
2558   /// \returns The insertion position *after* the master.
2559   InsertPointOrErrorTy createMaster(const LocationDescription &Loc,
2560                                     BodyGenCallbackTy BodyGenCB,
2561                                     FinalizeCallbackTy FiniCB);
2562 
2563   /// Generator for '#omp masked'
2564   ///
2565   /// \param Loc The insert and source location description.
2566   /// \param BodyGenCB Callback that will generate the region code.
2567   /// \param FiniCB Callback to finalize variable copies.
2568   ///
2569   /// \returns The insertion position *after* the masked.
2570   InsertPointOrErrorTy createMasked(const LocationDescription &Loc,
2571                                     BodyGenCallbackTy BodyGenCB,
2572                                     FinalizeCallbackTy FiniCB, Value *Filter);
2573 
2574   /// Generator for '#omp critical'
2575   ///
2576   /// \param Loc The insert and source location description.
2577   /// \param BodyGenCB Callback that will generate the region body code.
2578   /// \param FiniCB Callback to finalize variable copies.
2579   /// \param CriticalName name of the lock used by the critical directive
2580   /// \param HintInst Hint Instruction for hint clause associated with critical
2581   ///
2582   /// \returns The insertion position *after* the critical.
2583   InsertPointOrErrorTy createCritical(const LocationDescription &Loc,
2584                                       BodyGenCallbackTy BodyGenCB,
2585                                       FinalizeCallbackTy FiniCB,
2586                                       StringRef CriticalName, Value *HintInst);
2587 
2588   /// Generator for '#omp ordered depend (source | sink)'
2589   ///
2590   /// \param Loc The insert and source location description.
2591   /// \param AllocaIP The insertion point to be used for alloca instructions.
2592   /// \param NumLoops The number of loops in depend clause.
2593   /// \param StoreValues The value will be stored in vector address.
2594   /// \param Name The name of alloca instruction.
2595   /// \param IsDependSource If true, depend source; otherwise, depend sink.
2596   ///
2597   /// \return The insertion position *after* the ordered.
2598   InsertPointTy createOrderedDepend(const LocationDescription &Loc,
2599                                     InsertPointTy AllocaIP, unsigned NumLoops,
2600                                     ArrayRef<llvm::Value *> StoreValues,
2601                                     const Twine &Name, bool IsDependSource);
2602 
2603   /// Generator for '#omp ordered [threads | simd]'
2604   ///
2605   /// \param Loc The insert and source location description.
2606   /// \param BodyGenCB Callback that will generate the region code.
2607   /// \param FiniCB Callback to finalize variable copies.
2608   /// \param IsThreads If true, the construct has a 'threads' clause or no
2609   /// clause; otherwise, it has a 'simd' clause.
2610   ///
2611   /// \returns The insertion position *after* the ordered.
2612   InsertPointOrErrorTy createOrderedThreadsSimd(const LocationDescription &Loc,
2613                                                 BodyGenCallbackTy BodyGenCB,
2614                                                 FinalizeCallbackTy FiniCB,
2615                                                 bool IsThreads);
2616 
2617   /// Generator for '#omp sections'
2618   ///
2619   /// \param Loc The insert and source location description.
2620   /// \param AllocaIP The insertion points to be used for alloca instructions.
2621   /// \param SectionCBs Callbacks that will generate body of each section.
2622   /// \param PrivCB Callback to copy a given variable (think copy constructor).
2623   /// \param FiniCB Callback to finalize variable copies.
2624   /// \param IsCancellable Flag to indicate a cancellable parallel region.
2625   /// \param IsNowait If true, the barrier that ensures all sections have
2626   /// executed before moving forward is not generated.
2627   /// \returns The insertion position *after* the sections.
2628   InsertPointOrErrorTy
2629   createSections(const LocationDescription &Loc, InsertPointTy AllocaIP,
2630                  ArrayRef<StorableBodyGenCallbackTy> SectionCBs,
2631                  PrivatizeCallbackTy PrivCB, FinalizeCallbackTy FiniCB,
2632                  bool IsCancellable, bool IsNowait);
2633 
2634   /// Generator for '#omp section'
2635   ///
2636   /// \param Loc The insert and source location description.
2637   /// \param BodyGenCB Callback that will generate the region body code.
2638   /// \param FiniCB Callback to finalize variable copies.
2639   /// \returns The insertion position *after* the section.
2640   InsertPointOrErrorTy createSection(const LocationDescription &Loc,
2641                                      BodyGenCallbackTy BodyGenCB,
2642                                      FinalizeCallbackTy FiniCB);
2643 
2644   /// Generator for `#omp teams`
2645   ///
2646   /// \param Loc The location where the teams construct was encountered.
2647   /// \param BodyGenCB Callback that will generate the region code.
2648   /// \param NumTeamsLower Lower bound on number of teams. If this is nullptr,
2649   ///        it is as if the lower bound is specified as equal to the upper
2650   ///        bound. If this is non-null, then the upper bound must also be non-null.
2651   /// \param NumTeamsUpper Upper bound on the number of teams.
2652   /// \param ThreadLimit Limit on the number of threads that may participate in a
2653   ///        contention group created by each team.
2654   /// \param IfExpr is the integer argument value of the if condition on the
2655   ///        teams clause.
2656   InsertPointOrErrorTy
2657   createTeams(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
2658               Value *NumTeamsLower = nullptr, Value *NumTeamsUpper = nullptr,
2659               Value *ThreadLimit = nullptr, Value *IfExpr = nullptr);
2660 
2661   /// Generate conditional branch and relevant BasicBlocks through which private
2662   /// threads copy the 'copyin' variables from the master copy to the
2663   /// threadprivate copies.
2664   ///
2665   /// \param IP insertion block for copyin conditional
2666   /// \param MasterAddr A pointer to the master copy of the variable
2667   /// \param PrivateAddr A pointer to the threadprivate copy of the variable
2668   /// \param IntPtrTy Pointer size type
2669   /// \param BranchtoEnd Create a branch between the copyin.not.master blocks
2670   ///        and the copy.in.end block
2671   ///
2672   /// \returns The insertion point where the copying operation is to be emitted.
2673   InsertPointTy createCopyinClauseBlocks(InsertPointTy IP, Value *MasterAddr,
2674                                          Value *PrivateAddr,
2675                                          llvm::IntegerType *IntPtrTy,
2676                                          bool BranchtoEnd = true);
2677 
2678   /// Create a runtime call for kmpc_Alloc
2679   ///
2680   /// \param Loc The insert and source location description.
2681   /// \param Size Size of allocated memory space
2682   /// \param Allocator Allocator information instruction
2683   /// \param Name Name of call Instruction for OMP_alloc
2684   ///
2685   /// \returns CallInst to the OMP_Alloc call
2686   CallInst *createOMPAlloc(const LocationDescription &Loc, Value *Size,
2687                            Value *Allocator, std::string Name = "");
2688 
2689   /// Create a runtime call for kmpc_free
2690   ///
2691   /// \param Loc The insert and source location description.
2692   /// \param Addr Address of memory space to be freed
2693   /// \param Allocator Allocator information instruction
2694   /// \param Name Name of call Instruction for OMP_Free
2695   ///
2696   /// \returns CallInst to the OMP_Free call
2697   CallInst *createOMPFree(const LocationDescription &Loc, Value *Addr,
2698                           Value *Allocator, std::string Name = "");
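  /// A minimal sketch pairing the two calls, assuming Size and Allocator
  /// Values were emitted at the current insertion point.
  /// \code
  ///   CallInst *Ptr =
  ///       OMPBuilder.createOMPAlloc(Loc, Size, Allocator, ".omp.alloc");
  ///   // ... use the allocation ...
  ///   OMPBuilder.createOMPFree(Loc, Ptr, Allocator, ".omp.free");
  /// \endcode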
2699 
2700   /// Create a runtime call for kmpc_threadprivate_cached
2701   ///
2702   /// \param Loc The insert and source location description.
2703   /// \param Pointer pointer to data to be cached
2704   /// \param Size size of data to be cached
2705   /// \param Name Name of call Instruction for callinst
2706   ///
2707   /// \returns CallInst to the thread private cache call.
2708   CallInst *createCachedThreadPrivate(const LocationDescription &Loc,
2709                                       llvm::Value *Pointer,
2710                                       llvm::ConstantInt *Size,
2711                                       const llvm::Twine &Name = Twine(""));
2712 
2713   /// Create a runtime call for __tgt_interop_init
2714   ///
2715   /// \param Loc The insert and source location description.
2716   /// \param InteropVar variable to be allocated
2717   /// \param InteropType type of interop operation
2718   /// \param Device device to which offloading will occur
2719   /// \param NumDependences  number of dependence variables
2720   /// \param DependenceAddress pointer to dependence variables
2721   /// \param HaveNowaitClause does nowait clause exist
2722   ///
2723   /// \returns CallInst to the __tgt_interop_init call
2724   CallInst *createOMPInteropInit(const LocationDescription &Loc,
2725                                  Value *InteropVar,
2726                                  omp::OMPInteropType InteropType, Value *Device,
2727                                  Value *NumDependences,
2728                                  Value *DependenceAddress,
2729                                  bool HaveNowaitClause);
2730 
2731   /// Create a runtime call for __tgt_interop_destroy
2732   ///
2733   /// \param Loc The insert and source location description.
2734   /// \param InteropVar variable to be allocated
2735   /// \param Device device to which offloading will occur
2736   /// \param NumDependences  number of dependence variables
2737   /// \param DependenceAddress pointer to dependence variables
2738   /// \param HaveNowaitClause does nowait clause exist
2739   ///
2740   /// \returns CallInst to the __tgt_interop_destroy call
2741   CallInst *createOMPInteropDestroy(const LocationDescription &Loc,
2742                                     Value *InteropVar, Value *Device,
2743                                     Value *NumDependences,
2744                                     Value *DependenceAddress,
2745                                     bool HaveNowaitClause);
2746 
2747   /// Create a runtime call for __tgt_interop_use
2748   ///
2749   /// \param Loc The insert and source location description.
2750   /// \param InteropVar variable to be allocated
2751   /// \param Device device to which offloading will occur
2752   /// \param NumDependences  number of dependence variables
2753   /// \param DependenceAddress pointer to dependence variables
2754   /// \param HaveNowaitClause does nowait clause exist
2755   ///
2756   /// \returns CallInst to the __tgt_interop_use call
2757   CallInst *createOMPInteropUse(const LocationDescription &Loc,
2758                                 Value *InteropVar, Value *Device,
2759                                 Value *NumDependences, Value *DependenceAddress,
2760                                 bool HaveNowaitClause);
2761 
2762   /// The `omp target` interface
2763   ///
2764   /// For more information about the usage of this interface,
2765   /// \see openmp/libomptarget/deviceRTLs/common/include/target.h
2766   ///
2767   ///{
2768 
2769   /// Create a runtime call for kmpc_target_init
2770   ///
2771   /// \param Loc The insert and source location description.
2772   /// \param Attrs Structure containing the default attributes, including
2773   ///        numbers of threads and teams to launch the kernel with.
2774   InsertPointTy createTargetInit(
2775       const LocationDescription &Loc,
2776       const llvm::OpenMPIRBuilder::TargetKernelDefaultAttrs &Attrs);
2777 
2778   /// Create a runtime call for kmpc_target_deinit
2779   ///
2780   /// \param Loc The insert and source location description.
2781   /// \param TeamsReductionDataSize The maximal size of all the reduction data
2782   ///        for teams reduction.
2783   /// \param TeamsReductionBufferLength The number of elements (each of up to
2784   ///        \p TeamsReductionDataSize size), in the teams reduction buffer.
2785   void createTargetDeinit(const LocationDescription &Loc,
2786                           int32_t TeamsReductionDataSize = 0,
2787                           int32_t TeamsReductionBufferLength = 1024);
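  /// A minimal sketch of device-side codegen, reusing the default kernel
  /// attributes built earlier and an IRBuilder<> named Builder.
  /// \code
  ///   OpenMPIRBuilder::InsertPointTy BodyIP =
  ///       OMPBuilder.createTargetInit(Loc, Attrs);
  ///   Builder.restoreIP(BodyIP);
  ///   // ... emit the kernel body ...
  ///   OMPBuilder.createTargetDeinit(Loc);
  /// \endcode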
2788 
2789   ///}
2790 
2791   /// Helpers to read/write kernel annotations from the IR.
2792   ///
2793   ///{
2794 
2795   /// Read/write the bounds on threads for \p Kernel. Read will return 0 if none
2796   /// is set.
2797   static std::pair<int32_t, int32_t>
2798   readThreadBoundsForKernel(const Triple &T, Function &Kernel);
2799   static void writeThreadBoundsForKernel(const Triple &T, Function &Kernel,
2800                                          int32_t LB, int32_t UB);
2801 
2802   /// Read/write the bounds on teams for \p Kernel. Read will return 0 if none
2803   /// is set.
2804   static std::pair<int32_t, int32_t> readTeamBoundsForKernel(const Triple &T,
2805                                                              Function &Kernel);
2806   static void writeTeamsForKernel(const Triple &T, Function &Kernel, int32_t LB,
2807                                   int32_t UB);
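  /// A minimal sketch, assuming a target Triple T and a kernel Function
  /// Kernel: clamp the kernel's upper thread bound to at most 128.
  /// \code
  ///   auto [LB, UB] = OpenMPIRBuilder::readThreadBoundsForKernel(T, Kernel);
  ///   OpenMPIRBuilder::writeThreadBoundsForKernel(
  ///       T, Kernel, LB, UB ? std::min<int32_t>(UB, 128) : 128);
  /// \endcode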
2808   ///}
2809 
2810 private:
2811   // Sets the function attributes expected for the outlined function
2812   void setOutlinedTargetRegionFunctionAttributes(Function *OutlinedFn);
2813 
2814   // Creates the function ID/Address for the given outlined function.
2815   // In the case of an embedded device function the address of the function is
2816   // used, in the case of a non-offload function a constant is created.
2817   Constant *createOutlinedFunctionID(Function *OutlinedFn,
2818                                      StringRef EntryFnIDName);
2819 
2820   // Creates the region entry address for the outlined function
2821   Constant *createTargetRegionEntryAddr(Function *OutlinedFunction,
2822                                         StringRef EntryFnName);
2823 
2824 public:
2825   /// Functions used to generate a function with the given name.
2826   using FunctionGenCallback =
2827       std::function<Expected<Function *>(StringRef FunctionName)>;
2828 
2829   /// Create a unique name for the entry function using the source location
2830   /// information of the current target region. The name will be something like:
2831   ///
2832   /// __omp_offloading_DD_FFFF_PP_lBB[_CC]
2833   ///
2834   /// where DD_FFFF is an ID unique to the file (device and file IDs), PP is the
2835   /// mangled name of the function that encloses the target region and BB is the
2836   /// line number of the target region. CC is a count added when more than one
2837   /// region is located at the same location.
2838   ///
2839   /// If this target outline function is not an offload entry, we don't need to
2840   /// register it. This may happen if it is guarded by an if clause that is
2841   /// false at compile time, or no target archs have been specified.
2842   ///
2843   /// The created target region ID is used by the runtime library to identify
2844   /// the current target region, so it only has to be unique and not
2845   /// necessarily point to anything. It could be the pointer to the outlined
2846   /// function that implements the target region, but we aren't using that so
2847   /// that the compiler doesn't need to keep it alive, and could therefore inline
2848   /// the host function if proven worthwhile during optimization. On the other
2849   /// hand, if emitting code for the device, the ID has to be the function
2850   /// address so that it can be retrieved from the offloading entry and launched
2851   /// by the runtime library. We also mark the outlined function to have
2852   /// external linkage in case we are emitting code for the device, because
2853   /// these functions will be entry points to the device.
2854   ///
2855   /// \param InfoManager The info manager keeping track of the offload entries
2856   /// \param EntryInfo The entry information about the function
2857   /// \param GenerateFunctionCallback The callback function to generate the code
2858   /// \param OutlinedFn Pointer to the outlined function
2859   /// \param EntryFnIDName Name of the ID to be created
2860   Error emitTargetRegionFunction(TargetRegionEntryInfo &EntryInfo,
2861                                  FunctionGenCallback &GenerateFunctionCallback,
2862                                  bool IsOffloadEntry, Function *&OutlinedFn,
2863                                  Constant *&OutlinedFnID);
2864 
2865   /// Registers the given function and sets up the attributes of the function.
2866   /// Returns the FunctionID.
2867   ///
2868   /// \param InfoManager The info manager keeping track of the offload entries
2869   /// \param EntryInfo The entry information about the function
2870   /// \param OutlinedFunction Pointer to the outlined function
2871   /// \param EntryFnName Name of the outlined function
2872   /// \param EntryFnIDName Name of the ID to be created
2873   Constant *registerTargetRegionFunction(TargetRegionEntryInfo &EntryInfo,
2874                                          Function *OutlinedFunction,
2875                                          StringRef EntryFnName,
2876                                          StringRef EntryFnIDName);
2877 
2878   /// Type of BodyGen to use for region codegen
2879   ///
2880   /// Priv: If device pointer privatization is required, emit the body of the
2881   /// region here. It will have to be duplicated: with and without
2882   /// privatization.
2883   /// DupNoPriv: If we need device pointer privatization, we need
2884   /// to emit the body of the region with no privatization in the 'else' branch
2885   /// of the conditional.
2886   /// NoPriv: If we don't require privatization of device
2887   /// pointers, we emit the body in between the runtime calls. This avoids
2888   /// duplicating the body code.
2889   enum BodyGenTy { Priv, DupNoPriv, NoPriv };
2890 
2891   /// Callback type for creating the map infos for the kernel parameters.
2892   /// \param CodeGenIP is the insertion point where code should be generated,
2893   ///        if any.
2894   using GenMapInfoCallbackTy =
2895       function_ref<MapInfosTy &(InsertPointTy CodeGenIP)>;
2896 
2897 private:
2898   /// Emit the array initialization or deletion portion for user-defined mapper
2899   /// code generation. First, it evaluates whether an array section is mapped
2900   /// and whether the \a MapType instructs to delete this section. If \a IsInit
2901   /// is true, and \a MapType indicates to not delete this array, array
2902   /// initialization code is generated. If \a IsInit is false, and \a MapType
2903   /// indicates to delete this array, array deletion code is generated.
2904   void emitUDMapperArrayInitOrDel(Function *MapperFn, llvm::Value *MapperHandle,
2905                                   llvm::Value *Base, llvm::Value *Begin,
2906                                   llvm::Value *Size, llvm::Value *MapType,
2907                                   llvm::Value *MapName, TypeSize ElementSize,
2908                                   llvm::BasicBlock *ExitBB, bool IsInit);
2909 
2910 public:
2911   /// Emit the user-defined mapper function. The code generation follows the
2912   /// pattern in the example below.
2913   /// \code
2914   /// void .omp_mapper.<type_name>.<mapper_id>.(void *rt_mapper_handle,
2915   ///                                           void *base, void *begin,
2916   ///                                           int64_t size, int64_t type,
2917   ///                                           void *name = nullptr) {
2918   ///   // Allocate space for an array section first or add a base/begin for
2919   ///   // pointer dereference.
2920   ///   if ((size > 1 || (base != begin && maptype.IsPtrAndObj)) &&
2921   ///       !maptype.IsDelete)
2922   ///     __tgt_push_mapper_component(rt_mapper_handle, base, begin,
2923   ///                                 size*sizeof(Ty), clearToFromMember(type));
2924   ///   // Map members.
2925   ///   for (unsigned i = 0; i < size; i++) {
2926   ///     // For each component specified by this mapper:
2927   ///     for (auto c : begin[i]->all_components) {
2928   ///       if (c.hasMapper())
2929   ///         (*c.Mapper())(rt_mapper_handle, c.arg_base, c.arg_begin,
2930   ///         c.arg_size,
2931   ///                       c.arg_type, c.arg_name);
2932   ///       else
2933   ///         __tgt_push_mapper_component(rt_mapper_handle, c.arg_base,
2934   ///                                     c.arg_begin, c.arg_size, c.arg_type,
2935   ///                                     c.arg_name);
2936   ///     }
2937   ///   }
2938   ///   // Delete the array section.
2939   ///   if (size > 1 && maptype.IsDelete)
2940   ///     __tgt_push_mapper_component(rt_mapper_handle, base, begin,
2941   ///                                 size*sizeof(Ty), clearToFromMember(type));
2942   /// }
2943   /// \endcode
2944   ///
2945   /// \param PrivAndGenMapInfoCB Callback that privatizes code and populates the
2946   /// MapInfos and returns them.
2947   /// \param ElemTy DeclareMapper element type.
2948   /// \param FuncName Optional param to specify mapper function name.
2949   /// \param CustomMapperCB Optional callback to generate code related to
2950   /// custom mappers.
2951   Function *emitUserDefinedMapper(
2952       function_ref<MapInfosTy &(InsertPointTy CodeGenIP, llvm::Value *PtrPHI,
2953                                 llvm::Value *BeginArg)>
2954           PrivAndGenMapInfoCB,
2955       llvm::Type *ElemTy, StringRef FuncName,
2956       function_ref<bool(unsigned int, Function **)> CustomMapperCB = nullptr);
2957 
2958   /// Generator for '#omp target data'
2959   ///
2960   /// \param Loc The location where the target data construct was encountered.
2961   /// \param AllocaIP The insertion points to be used for alloca instructions.
2962   /// \param CodeGenIP The insertion point at which the target directive code
2963   /// should be placed.
2964   /// \param IsBegin If true, emits the begin mapper call; otherwise emits the
2965   /// end mapper call.
2966   /// \param DeviceID Stores the DeviceID from the device clause.
2967   /// \param IfCond Value which corresponds to the if clause condition.
2968   /// \param Info Stores all information related to the Target Data directive.
2969   /// \param GenMapInfoCB Callback that populates the MapInfos and returns.
2970   /// \param BodyGenCB Optional Callback to generate the region code.
2971   /// \param DeviceAddrCB Optional callback to generate code related to
2972   /// use_device_ptr and use_device_addr.
2973   /// \param CustomMapperCB Optional callback to generate code related to
2974   /// custom mappers.
2975   InsertPointOrErrorTy createTargetData(
2976       const LocationDescription &Loc, InsertPointTy AllocaIP,
2977       InsertPointTy CodeGenIP, Value *DeviceID, Value *IfCond,
2978       TargetDataInfo &Info, GenMapInfoCallbackTy GenMapInfoCB,
2979       omp::RuntimeFunction *MapperFunc = nullptr,
2980       function_ref<InsertPointOrErrorTy(InsertPointTy CodeGenIP,
2981                                         BodyGenTy BodyGenType)>
2982           BodyGenCB = nullptr,
2983       function_ref<void(unsigned int, Value *)> DeviceAddrCB = nullptr,
2984       function_ref<Value *(unsigned int)> CustomMapperCB = nullptr,
2985       Value *SrcLocInfo = nullptr);
2986 
2987   using TargetBodyGenCallbackTy = function_ref<InsertPointOrErrorTy(
2988       InsertPointTy AllocaIP, InsertPointTy CodeGenIP)>;
2989 
2990   using TargetGenArgAccessorsCallbackTy = function_ref<InsertPointOrErrorTy(
2991       Argument &Arg, Value *Input, Value *&RetVal, InsertPointTy AllocaIP,
2992       InsertPointTy CodeGenIP)>;
2993 
2994   /// Generator for '#omp target'
2995   ///
2996   /// \param Loc The location where the target construct was encountered.
2997   /// \param IsOffloadEntry whether it is an offload entry.
2998   /// \param CodeGenIP The insertion point where the call to the outlined
2999   ///        function should be emitted.
3000   /// \param EntryInfo The entry information about the function.
3001   /// \param DefaultAttrs Structure containing the default attributes, including
3002   ///        numbers of threads and teams to launch the kernel with.
3003   /// \param RuntimeAttrs Structure containing the runtime numbers of threads
3004   ///        and teams to launch the kernel with.
3005   /// \param IfCond value of the `if` clause.
3006   /// \param Inputs The input values to the region that will be passed
3007   ///        as arguments to the outlined function.
3008   /// \param BodyGenCB Callback that will generate the region code.
3009   /// \param ArgAccessorFuncCB Callback that will generate accessors
3010   ///        instructions for passed in target arguments where necessary
3011   /// \param Dependencies A vector of DependData objects that carry
3012   ///        dependency information as passed in the depend clause
3013   /// \param HasNowait Whether the target construct has a `nowait` clause or
3014   ///        not.
3015   InsertPointOrErrorTy createTarget(
3016       const LocationDescription &Loc, bool IsOffloadEntry,
3017       OpenMPIRBuilder::InsertPointTy AllocaIP,
3018       OpenMPIRBuilder::InsertPointTy CodeGenIP,
3019       TargetRegionEntryInfo &EntryInfo,
3020       const TargetKernelDefaultAttrs &DefaultAttrs,
3021       const TargetKernelRuntimeAttrs &RuntimeAttrs, Value *IfCond,
3022       SmallVectorImpl<Value *> &Inputs, GenMapInfoCallbackTy GenMapInfoCB,
3023       TargetBodyGenCallbackTy BodyGenCB,
3024       TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB,
3025       SmallVector<DependData> Dependencies = {}, bool HasNowait = false);
3026 
3027   /// Returns __kmpc_for_static_init_* runtime function for the specified
3028   /// size \a IVSize and sign \a IVSigned. Will create a distribute call
3029   /// __kmpc_distribute_static_init* if \a IsGPUDistribute is set.
3030   FunctionCallee createForStaticInitFunction(unsigned IVSize, bool IVSigned,
3031                                              bool IsGPUDistribute);
3032 
3033   /// Returns __kmpc_dispatch_init_* runtime function for the specified
3034   /// size \a IVSize and sign \a IVSigned.
3035   FunctionCallee createDispatchInitFunction(unsigned IVSize, bool IVSigned);
3036 
3037   /// Returns __kmpc_dispatch_next_* runtime function for the specified
3038   /// size \a IVSize and sign \a IVSigned.
3039   FunctionCallee createDispatchNextFunction(unsigned IVSize, bool IVSigned);
3040 
3041   /// Returns __kmpc_dispatch_fini_* runtime function for the specified
3042   /// size \a IVSize and sign \a IVSigned.
3043   FunctionCallee createDispatchFiniFunction(unsigned IVSize, bool IVSigned);
3044 
3045   /// Returns __kmpc_dispatch_deinit runtime function.
3046   FunctionCallee createDispatchDeinitFunction();
3047 
3048   /// Declarations for LLVM-IR types (simple, array, function and structure) are
3049   /// generated below. Their names are defined and used in OpenMPKinds.def. Here
3050   /// we provide the declarations, the initializeTypes function will provide the
3051   /// values.
3052   ///
3053   ///{
3054 #define OMP_TYPE(VarName, InitValue) Type *VarName = nullptr;
3055 #define OMP_ARRAY_TYPE(VarName, ElemTy, ArraySize)                             \
3056   ArrayType *VarName##Ty = nullptr;                                            \
3057   PointerType *VarName##PtrTy = nullptr;
3058 #define OMP_FUNCTION_TYPE(VarName, IsVarArg, ReturnType, ...)                  \
3059   FunctionType *VarName = nullptr;                                             \
3060   PointerType *VarName##Ptr = nullptr;
3061 #define OMP_STRUCT_TYPE(VarName, StrName, ...)                                 \
3062   StructType *VarName = nullptr;                                               \
3063   PointerType *VarName##Ptr = nullptr;
3064 #include "llvm/Frontend/OpenMP/OMPKinds.def"
3065 
3066   ///}
3067 
3068 private:
3069   /// Create all simple and struct types exposed by the runtime and remember
3070   /// the llvm::PointerTypes of them for easy access later.
3071   void initializeTypes(Module &M);
3072 
3073   /// Common interface for generating entry calls for OMP Directives.
3074   /// If the directive has a region/body, it will set the insertion
3075   /// point to the body.
3076   ///
3077   /// \param OMPD Directive to generate entry blocks for
3078   /// \param EntryCall Call to the entry OMP Runtime Function
3079   /// \param ExitBB block where the region ends.
3080   /// \param Conditional indicate if the entry call result will be used
3081   ///        to evaluate a conditional of whether a thread will execute
3082   ///        body code or not.
3083   ///
3084   /// \return The insertion position in exit block
3085   InsertPointTy emitCommonDirectiveEntry(omp::Directive OMPD, Value *EntryCall,
3086                                          BasicBlock *ExitBB,
3087                                          bool Conditional = false);
3088 
3089   /// Common interface to finalize the region
3090   ///
3091   /// \param OMPD Directive to generate exiting code for
3092   /// \param FinIP Insertion point for emitting Finalization code and exit call
3093   /// \param ExitCall Call to the ending OMP Runtime Function
3094   /// \param HasFinalize indicate if the directive will require finalization
3095   ///         and has a finalization callback in the stack that
3096   ///        should be called.
3097   ///
3098   /// \return The insertion position in exit block
3099   InsertPointOrErrorTy emitCommonDirectiveExit(omp::Directive OMPD,
3100                                                InsertPointTy FinIP,
3101                                                Instruction *ExitCall,
3102                                                bool HasFinalize = true);
3103 
3104   /// Common Interface to generate OMP inlined regions
3105   ///
3106   /// \param OMPD Directive to generate inlined region for
3107   /// \param EntryCall Call to the entry OMP Runtime Function
3108   /// \param ExitCall Call to the ending OMP Runtime Function
3109   /// \param BodyGenCB Body code generation callback.
3110   /// \param FiniCB Finalization Callback. Will be called when finalizing region
3111   /// \param Conditional indicate if the entry call result will be used
3112   ///        to evaluate a conditional of whether a thread will execute
3113   ///        body code or not.
3114   /// \param HasFinalize indicate if the directive will require finalization
3115   ///        and has a finalization callback in the stack that
3116   ///        should be called.
3117   /// \param IsCancellable if HasFinalize is set to true, indicate if the
3118   ///        directive should be cancellable.
3119   /// \return The insertion point after the region
3120   InsertPointOrErrorTy
3121   EmitOMPInlinedRegion(omp::Directive OMPD, Instruction *EntryCall,
3122                        Instruction *ExitCall, BodyGenCallbackTy BodyGenCB,
3123                        FinalizeCallbackTy FiniCB, bool Conditional = false,
3124                        bool HasFinalize = true, bool IsCancellable = false);
3125 
3126   /// Get the platform-specific name separator.
3127   /// \param Parts different parts of the final name that need separation
3128   /// \param FirstSeparator First separator used between the initial two
3129   ///        parts of the name.
3130   /// \param Separator separator used between all of the rest consecutive
3131   ///        parts of the name
3132   static std::string getNameWithSeparators(ArrayRef<StringRef> Parts,
3133                                            StringRef FirstSeparator,
3134                                            StringRef Separator);
3135 
3136   /// Returns corresponding lock object for the specified critical region
3137   /// name. If the lock object does not exist it is created, otherwise the
3138   /// reference to the existing copy is returned.
3139   /// \param CriticalName Name of the critical region.
3140   ///
3141   Value *getOMPCriticalRegionLock(StringRef CriticalName);
3142 
3143   /// Callback type for Atomic Expression update
3144   /// ex:
3145   /// \code{.cpp}
3146   /// unsigned x = 0;
3147   /// #pragma omp atomic update
3148   /// x = Expr(x_old);  //Expr() is any legal operation
3149   /// \endcode
3150   ///
3151   /// \param XOld the value of the atomic memory address to use for update
3152   /// \param IRB reference to the IRBuilder to use
3153   ///
3154   /// \returns Value to update X to.
3155   using AtomicUpdateCallbackTy =
3156       const function_ref<Expected<Value *>(Value *XOld, IRBuilder<> &IRB)>;
3157 
3158 private:
3159   enum AtomicKind { Read, Write, Update, Capture, Compare };
3160 
3161   /// Determine whether to emit flush or not
3162   ///
3163   /// \param Loc    The insert and source location description.
3164   /// \param AO     The required atomic ordering
3165   /// \param AK     The OpenMP atomic operation kind used.
3166   ///
3167   /// \returns Whether a flush was emitted or not.
3168   bool checkAndEmitFlushAfterAtomic(const LocationDescription &Loc,
3169                                     AtomicOrdering AO, AtomicKind AK);
3170 
3171   /// Emit atomic update for constructs: X = X BinOp Expr, or X = Expr BinOp X
3172   /// For complex Operations: X = UpdateOp(X) => CmpExch X, old_X, UpdateOp(X)
3173   /// Only Scalar data types.
3174   ///
3175   /// \param AllocaIP     The insertion point to be used for alloca
3176   ///                     instructions.
3177   /// \param X            The target atomic pointer to be updated.
3178   /// \param XElemTy      The element type of the atomic pointer.
3179   /// \param Expr         The value to update X with.
3180   /// \param AO           Atomic ordering of the generated atomic
3181   ///                     instructions.
3182   /// \param RMWOp        The binary operation used for the update. If the
3183   ///                     operation is not supported by atomicrmw, or belongs
3184   ///                     to {FADD, FSUB, BAD_BINOP}, then a cmpxchg-based
3185   ///                     atomic will be generated.
3186   /// \param UpdateOp     Code generator for complex expressions that cannot
3187   ///                     be expressed through the atomicrmw instruction.
3188   /// \param VolatileX    True if \a X is volatile.
3189   /// \param IsXBinopExpr True if \a X is the LHS in the RHS part of the
3190   ///                     update expression, false otherwise
3191   ///                     (e.g. true for X = X BinOp Expr).
3192   ///
3193   /// \returns A pair of the old value of X before the update, and the value
3194   ///          used for the update.
3195   Expected<std::pair<Value *, Value *>>
3196   emitAtomicUpdate(InsertPointTy AllocaIP, Value *X, Type *XElemTy, Value *Expr,
3197                    AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp,
3198                    AtomicUpdateCallbackTy &UpdateOp, bool VolatileX,
3199                    bool IsXBinopExpr);
3200 
3201   /// Emit the binary op. described by \p RMWOp, using \p Src1 and \p Src2 .
3202   ///
3203   /// \Return The instruction
3204   Value *emitRMWOpAsInstruction(Value *Src1, Value *Src2,
3205                                 AtomicRMWInst::BinOp RMWOp);
3206 
3207 public:
3208   /// a struct to pack relevant information while generating atomic Ops
3209   struct AtomicOpValue {
3210     Value *Var = nullptr;
3211     Type *ElemTy = nullptr;
3212     bool IsSigned = false;
3213     bool IsVolatile = false;
3214   };
3215 
3216   /// Emit atomic Read for : V = X --- Only Scalar data types.
3217   ///
3218   /// \param Loc    The insert and source location description.
3219   /// \param X      The target pointer to be atomically read.
3220   /// \param V      Memory address where to store the atomically read
3221   ///               value.
3222   /// \param AO     Atomic ordering of the generated atomic
3223   ///               instructions.
3224   ///
3225   /// \return Insertion point after generated atomic read IR.
3226   InsertPointTy createAtomicRead(const LocationDescription &Loc,
3227                                  AtomicOpValue &X, AtomicOpValue &V,
3228                                  AtomicOrdering AO);
3229 
3230   /// Emit atomic write for : X = Expr --- Only Scalar data types.
3231   ///
3232   /// \param Loc    The insert and source location description.
3233   /// \param X      The target pointer to be atomically written to.
3234   /// \param Expr   The value to store.
3235   /// \param AO     Atomic ordering of the generated atomic
3236   ///               instructions.
3237   ///
3238   /// \return Insertion point after generated atomic Write IR.
3239   InsertPointTy createAtomicWrite(const LocationDescription &Loc,
3240                                   AtomicOpValue &X, Value *Expr,
3241                                   AtomicOrdering AO);
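  /// A minimal sketch of atomic read and write, assuming XAddr and VAddr point
  /// to i32 storage, Int32Ty comes from the IRBuilder, and Expr is an i32
  /// Value.
  /// \code
  ///   OpenMPIRBuilder::AtomicOpValue X{XAddr, Int32Ty, /*IsSigned=*/true,
  ///                                    /*IsVolatile=*/false};
  ///   OpenMPIRBuilder::AtomicOpValue V{VAddr, Int32Ty, true, false};
  ///   OMPBuilder.createAtomicRead(Loc, X, V, AtomicOrdering::Monotonic);
  ///   OMPBuilder.createAtomicWrite(Loc, X, Expr, AtomicOrdering::Monotonic);
  /// \endcode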
3242 
3243   /// Emit atomic update for constructs: X = X BinOp Expr, or X = Expr BinOp X
3244   /// For complex Operations: X = UpdateOp(X) => CmpExch X, old_X, UpdateOp(X)
3245   /// Only Scalar data types.
3246   ///
3247   /// \param Loc      The insert and source location description.
3248   /// \param AllocaIP The insertion point to be used for alloca instructions.
3249   /// \param X        The target atomic pointer to be updated
3250   /// \param Expr     The value to update X with.
3251   /// \param AO       Atomic ordering of the generated atomic instructions.
3252   /// \param RMWOp    The binary operation used for the update. If the
3253   ///                 operation is not supported by atomicrmw, or belongs to
3254   ///                 {FADD, FSUB, BAD_BINOP}, then a cmpxchg-based
3255   ///                 atomic will be generated.
3256   /// \param UpdateOp Code generator for complex expressions that cannot be
3257   ///                 expressed through the atomicrmw instruction.
3258   /// \param IsXBinopExpr True if \a X is the LHS in the RHS part of the
3259   ///                     update expression, false otherwise
3260   ///                     (e.g. true for X = X BinOp Expr).
3261   ///
3262   /// \return Insertion point after generated atomic update IR.
3263   InsertPointOrErrorTy
3264   createAtomicUpdate(const LocationDescription &Loc, InsertPointTy AllocaIP,
3265                      AtomicOpValue &X, Value *Expr, AtomicOrdering AO,
3266                      AtomicRMWInst::BinOp RMWOp,
3267                      AtomicUpdateCallbackTy &UpdateOp, bool IsXBinopExpr);
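  /// A minimal sketch for '#pragma omp atomic update' lowering 'x = x + expr',
  /// reusing X, Expr, Loc and AllocaIP from above.
  /// \code
  ///   auto UpdateOp = [&](Value *XOld, IRBuilder<> &IRB) -> Expected<Value *> {
  ///     return IRB.CreateAdd(XOld, Expr);
  ///   };
  ///   OpenMPIRBuilder::InsertPointOrErrorTy AfterIP = OMPBuilder.createAtomicUpdate(
  ///       Loc, AllocaIP, X, Expr, AtomicOrdering::Monotonic, AtomicRMWInst::Add,
  ///       UpdateOp, /*IsXBinopExpr=*/true);
  /// \endcode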
3268 
3269   /// Emit atomic update for constructs: --- Only Scalar data types
3270   /// V = X; X = X BinOp Expr ,
3271   /// X = X BinOp Expr; V = X,
3272   /// V = X; X = Expr BinOp X,
3273   /// X = Expr BinOp X; V = X,
3274   /// V = X; X = UpdateOp(X),
3275   /// X = UpdateOp(X); V = X,
3276   ///
3277   /// \param Loc        The insert and source location description.
3278   /// \param AllocaIP   The insertion point to be used for alloca instructions.
3279   /// \param X          The target atomic pointer to be updated
3280   /// \param V          Memory address where to store captured value
3281   /// \param Expr       The value to update X with.
3282   /// \param AO         Atomic ordering of the generated atomic instructions
3283   /// \param RMWOp      The binary operation used for the update. If the
3284   ///                   operation is not supported by atomicrmw, or belongs to
3285   ///                   {FADD, FSUB, BAD_BINOP}, then a cmpxchg-based
3286   ///                   atomic will be generated.
3287   /// \param UpdateOp   Code generator for complex expressions that cannot be
3288   ///                   expressed through atomicrmw instruction.
3289   /// \param UpdateExpr true if X is an in place update of the form
3290   ///                   X = X BinOp Expr or X = Expr BinOp X
3291   /// \param IsXBinopExpr true if X is Left H.S. in Right H.S. part of the
3292   ///                     update expression, false otherwise.
3293   ///                     (e.g. true for X = X BinOp Expr)
3294   /// \param IsPostfixUpdate true if original value of 'x' must be stored in
3295   ///                        'v', not an updated one.
3296   ///
3297   /// \return Insertion point after generated atomic capture IR.
3298   InsertPointOrErrorTy
3299   createAtomicCapture(const LocationDescription &Loc, InsertPointTy AllocaIP,
3300                       AtomicOpValue &X, AtomicOpValue &V, Value *Expr,
3301                       AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp,
3302                       AtomicUpdateCallbackTy &UpdateOp, bool UpdateExpr,
3303                       bool IsPostfixUpdate, bool IsXBinopExpr);
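  // A minimal sketch for the postfix capture `v = x; x += inc;`, reusing
  // `Loc`, `AllocaIP`, `X`, `Inc` and `UpdateOp` from the createAtomicUpdate
  // sketch and assuming a caller-provided AtomicOpValue `V` describing the
  // capture location:
  //
  //   OpenMPIRBuilder::InsertPointOrErrorTy AfterIP = OMPBuilder.createAtomicCapture(
  //       Loc, AllocaIP, X, V, Inc, AtomicOrdering::Monotonic,
  //       AtomicRMWInst::Add, UpdateOp, /*UpdateExpr=*/true,
  //       /*IsPostfixUpdate=*/true, /*IsXBinopExpr=*/true);
  //   if (!AfterIP)
  //     /* propagate AfterIP.takeError() */;
  //   Builder.restoreIP(*AfterIP);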
3304 
3305   /// Emit atomic compare for the following constructs (only scalar data types):
3306   /// cond-expr-stmt:
3307   /// x = x ordop expr ? expr : x;
3308   /// x = expr ordop x ? expr : x;
3309   /// x = x == e ? d : x;
3310   /// x = e == x ? d : x; (this one is not in the spec)
3311   /// cond-update-stmt:
3312   /// if (x ordop expr) { x = expr; }
3313   /// if (expr ordop x) { x = expr; }
3314   /// if (x == e) { x = d; }
3315   /// if (e == x) { x = d; } (this one is not in the spec)
3316   /// conditional-update-capture-atomic:
3317   /// v = x; cond-update-stmt; (IsPostfixUpdate=true, IsFailOnly=false)
3318   /// cond-update-stmt; v = x; (IsPostfixUpdate=false, IsFailOnly=false)
3319   /// if (x == e) { x = d; } else { v = x; } (IsPostfixUpdate=false,
3320   ///                                         IsFailOnly=true)
3321   /// r = x == e; if (r) { x = d; } (IsPostfixUpdate=false, IsFailOnly=false)
3322   /// r = x == e; if (r) { x = d; } else { v = x; } (IsPostfixUpdate=false,
3323   ///                                                IsFailOnly=true)
3324   ///
3325   /// \param Loc          The insert and source location description.
3326   /// \param X            The target atomic pointer to be updated.
3327   /// \param V            Memory address where to store captured value (for
3328   ///                     compare capture only).
3329   /// \param R            Memory address where to store comparison result
3330   ///                     (for compare capture with '==' only).
3331   /// \param E            The expected value ('e') for forms that use an
3332   ///                     equality comparison or an expression ('expr') for
3333   ///                     forms that use 'ordop' (logically an atomic maximum or
3334   ///                     minimum).
3335   /// \param D            The desired value for forms that use an equality
3336   ///                     comparison. For forms that use 'ordop', it should be
3337   ///                     \p nullptr.
3338   /// \param AO           Atomic ordering of the generated atomic instructions.
3339   /// \param Op           Atomic compare operation. It can only be ==, <, or >.
3340   /// \param IsXBinopExpr True if the conditional statement is in the form where
3341   ///                     x is on LHS. It only matters for < or >.
3342   /// \param IsPostfixUpdate  True if original value of 'x' must be stored in
3343   ///                         'v', not an updated one (for compare capture
3344   ///                         only).
3345   /// \param IsFailOnly   True if the original value of 'x' is stored to 'v'
3346   ///                     only when the comparison fails. This is only valid
3347   ///                     when the comparison is '=='.
3348   ///
3349   /// \return Insertion point after generated atomic compare IR.
3350   InsertPointTy
3351   createAtomicCompare(const LocationDescription &Loc, AtomicOpValue &X,
3352                       AtomicOpValue &V, AtomicOpValue &R, Value *E, Value *D,
3353                       AtomicOrdering AO, omp::OMPAtomicCompareOp Op,
3354                       bool IsXBinopExpr, bool IsPostfixUpdate, bool IsFailOnly);
3355   InsertPointTy createAtomicCompare(const LocationDescription &Loc,
3356                                     AtomicOpValue &X, AtomicOpValue &V,
3357                                     AtomicOpValue &R, Value *E, Value *D,
3358                                     AtomicOrdering AO,
3359                                     omp::OMPAtomicCompareOp Op,
3360                                     bool IsXBinopExpr, bool IsPostfixUpdate,
3361                                     bool IsFailOnly, AtomicOrdering Failure);
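  // A minimal sketch for the form `if (x == e) { x = d; }` with no capture,
  // assuming `Loc`, `X`, `E` and `D` are provided by the caller. Passing
  // AtomicOpValue objects with a null `Var` for the unused `V` and `R`
  // operands and using omp::OMPAtomicCompareOp::EQ for the equality form are
  // assumptions of this sketch:
  //
  //   OpenMPIRBuilder::AtomicOpValue NoV, NoR; // Var stays nullptr
  //   Builder.restoreIP(OMPBuilder.createAtomicCompare(
  //       Loc, X, NoV, NoR, E, D, AtomicOrdering::Monotonic,
  //       omp::OMPAtomicCompareOp::EQ, /*IsXBinopExpr=*/true,
  //       /*IsPostfixUpdate=*/false, /*IsFailOnly=*/false));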
3362 
3363   /// Create the control flow structure of a canonical OpenMP loop.
3364   ///
3365   /// The emitted loop will be disconnected, i.e. there is no edge to the loop's
3366   /// preheader and no terminator in the AfterBB. The OpenMPIRBuilder's
3367   /// IRBuilder location is not preserved.
3368   ///
3369   /// \param DL        DebugLoc used for the instructions in the skeleton.
3370   /// \param TripCount Value to be used for the trip count.
3371   /// \param F         Function in which to insert the BasicBlocks.
3372   /// \param PreInsertBefore  Where to insert BBs that execute before the body,
3373   ///                         typically the body itself.
3374   /// \param PostInsertBefore Where to insert BBs that execute after the body.
3375   /// \param Name      Base name used to derive BB
3376   ///                  and instruction names.
3377   ///
3378   /// \returns The CanonicalLoopInfo that represents the emitted loop.
3379   CanonicalLoopInfo *createLoopSkeleton(DebugLoc DL, Value *TripCount,
3380                                         Function *F,
3381                                         BasicBlock *PreInsertBefore,
3382                                         BasicBlock *PostInsertBefore,
3383                                         const Twine &Name = {});
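  // A minimal sketch, assuming `DL`, `TripCount`, a function `F` and two
  // existing blocks of `F` chosen by the caller as insertion anchors
  // (`PreInsertBefore`, `PostInsertBefore`). The skeleton is emitted
  // disconnected, so the caller still has to branch into the preheader and
  // terminate the after block:
  //
  //   CanonicalLoopInfo *CL = OMPBuilder.createLoopSkeleton(
  //       DL, TripCount, F, PreInsertBefore, PostInsertBefore, "loop");
  //   Builder.restoreIP(CL->getBodyIP());
  //   // ... emit the loop body here, then connect the preheader ...
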
3384   /// OMP Offload Info Metadata name string
3385   const std::string ompOffloadInfoName = "omp_offload.info";
3386 
3387   /// Loads all the offload entries information from the host IR
3388   /// metadata. This function is only meant to be used with device code
3389   /// generation.
3390   ///
3391   /// \param M         Module to load metadata info from. The module passed may be
3392   /// loaded from a bitcode file, i.e., different from the OpenMPIRBuilder::M module.
3393   void loadOffloadInfoMetadata(Module &M);
3394 
3395   /// Loads all the offload entries information from the host IR
3396   /// metadata read from the file passed in as the HostFilePath argument. This
3397   /// function is only meant to be used with device code generation.
3398   ///
3399   /// \param HostFilePath The path to the host IR file,
3400   /// used to load in offload metadata for the device, allowing host and device
3401   /// to maintain the same metadata mapping.
3402   void loadOffloadInfoMetadata(StringRef HostFilePath);
3403 
3404   /// Gets (if a variable with the given name already exists) or creates an
3405   /// internal global variable with the specified Name. The created variable has
3406   /// CommonLinkage by default and is initialized with a null value.
3407   /// \param Ty Type of the global variable. If it already exists, the type
3408   /// must be the same.
3409   /// \param Name Name of the variable.
3410   GlobalVariable *getOrCreateInternalVariable(Type *Ty, const StringRef &Name,
3411                                               unsigned AddressSpace = 0);
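  // A minimal sketch, reusing `OMPBuilder`/`Builder` from the earlier sketches;
  // the variable name below is purely hypothetical:
  //
  //   GlobalVariable *Flag = OMPBuilder.getOrCreateInternalVariable(
  //       Builder.getInt32Ty(), ".omp.sketch.flag");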
3412 };
3413 
3414 /// Class to represent the control flow structure of an OpenMP canonical loop.
3415 ///
3416 /// The control-flow structure is standardized for easy consumption by
3417 /// directives associated with loops. For instance, the worksharing-loop
3418 /// construct may change this control flow such that each loop iteration is
3419 /// executed on only one thread. The constraints of a canonical loop in brief
3420 /// are:
3421 ///
3422 ///  * The number of loop iterations must have been computed before entering the
3423 ///    loop.
3424 ///
3425 ///  * Has an (unsigned) logical induction variable that starts at zero and
3426 ///    increments by one.
3427 ///
3428 ///  * The loop's CFG itself has no side-effects. The OpenMP specification
3429 ///    itself allows side-effects, but the order in which they happen, including
3430 ///    how often or whether at all, is unspecified. We expect that the frontend
3431 ///    will emit those side-effect instructions somewhere (e.g. before the loop)
3432 ///    such that the CanonicalLoopInfo itself can be side-effect free.
3433 ///
3434 /// Keep in mind that CanonicalLoopInfo is meant to only describe a repeated
3435 /// execution of a loop body that satisfies these constraints. It does NOT
3436 /// represent arbitrary SESE regions that happen to contain a loop. Do not use
3437 /// CanonicalLoopInfo for such purposes.
3438 ///
3439 /// The control flow can be described as follows:
3440 ///
3441 ///     Preheader
3442 ///        |
3443 ///  /-> Header
3444 ///  |     |
3445 ///  |    Cond---\
3446 ///  |     |     |
3447 ///  |    Body   |
3448 ///  |    | |    |
3449 ///  |   <...>   |
3450 ///  |    | |    |
3451 ///   \--Latch   |
3452 ///              |
3453 ///             Exit
3454 ///              |
3455 ///            After
3456 ///
3457 /// The loop is thought to start at PreheaderIP (at the Preheader's terminator,
3458 /// including) and end at AfterIP (at the After's first instruction, excluding).
3459 /// That is, instructions in the Preheader and After blocks (except the
3460 /// Preheader's terminator) are out of CanonicalLoopInfo's control and may have
3461 /// side-effects. Typically, the Preheader is used to compute the loop's trip
3462 /// count. The instructions from BodyIP (at the Body block's first instruction,
3463 /// excluding) until the Latch are also considered outside CanonicalLoopInfo's
3464 /// control and thus can have side-effects. The body block is the single entry
3465 /// point into the loop body, which may contain arbitrary control flow as long
3466 /// as all control paths eventually branch to the Latch block.
3467 ///
3468 /// TODO: Consider adding another standardized BasicBlock between Body CFG and
3469 /// Latch to guarantee that there is only a single edge to the latch. It would
3470 /// make loop transformations easier to not needing to consider multiple
3471 /// predecessors of the latch (See redirectAllPredecessorsTo) and would give us
3472 /// an equivalant to PreheaderIP, AfterIP and BodyIP for inserting code that
3473 /// executes after each body iteration.
3474 ///
3475 /// There must be no loop-carried dependencies through llvm::Values. This is
3476 /// equivalent to requiring that the Latch has no PHINode and that the Header's
3477 /// only PHINode is for the induction variable.
3478 ///
3479 /// All code in Header, Cond, Latch and Exit (plus the terminator of the
3480 /// Preheader) is CanonicalLoopInfo's responsibility and its build-up is checked
3481 /// by assertOK(). It is expected not to be modified except by explicitly
3482 /// modifying the CanonicalLoopInfo through a method that applies an OpenMP
3483 /// loop-associated construct such as applyWorkshareLoop, tileLoops, unrollLoop,
3484 /// etc. These methods usually invalidate the CanonicalLoopInfo and re-use its
3485 /// basic blocks. After invalidation, the CanonicalLoopInfo must not be used
3486 /// anymore as its underlying control flow may no longer exist.
3487 /// Loop-transformation methods such as tileLoops, collapseLoops and unrollLoop
3488 /// may also return a new CanonicalLoopInfo that can be passed to other
3489 /// methods implementing loop-associated constructs. These loop-transforming
3490 /// methods may either create a new CanonicalLoopInfo, usually using
3491 /// createLoopSkeleton, and invalidate the input CanonicalLoopInfo, or reuse and
3492 /// modify one of the input CanonicalLoopInfos and return it as representing the
3493 /// modified loop. Which of the two happens is an implementation detail of the
3494 /// transformation-implementing method, and callers should always assume that
3495 /// the CanonicalLoopInfo they passed in is invalidated and a new object is
3496 /// returned. Returned CanonicalLoopInfos have the same structure and guarantees
3497 /// as ones created by createCanonicalLoop, such that transforming methods do
3498 /// not have to special-case where the CanonicalLoopInfo originated from.
3499 ///
3500 /// Generally, methods consuming CanonicalLoopInfo do not need an
3501 /// OpenMPIRBuilder::InsertPointTy as argument, but use the locations of the
3502 /// CanonicalLoopInfo to insert new or modify existing instructions. Unless
3503 /// documented otherwise, methods consuming CanonicalLoopInfo do not invalidate
3504 /// any InsertPoint that is outside CanonicalLoopInfo's control. Specifically,
3505 /// any InsertPoint in the Preheader, After or Body block can still be used after
3506 /// calling such a method.
3507 ///
3508 /// TODO: Provide mechanisms for exception handling and cancellation points.
3509 ///
3510 /// Defined outside OpenMPIRBuilder because nested classes cannot be
3511 /// forward-declared, e.g. to avoid having to include the entire OMPIRBuilder.h.
3512 class CanonicalLoopInfo {
3513   friend class OpenMPIRBuilder;
3514 
3515 private:
3516   BasicBlock *Header = nullptr;
3517   BasicBlock *Cond = nullptr;
3518   BasicBlock *Latch = nullptr;
3519   BasicBlock *Exit = nullptr;
3520 
3521   /// Add the control blocks of this loop to \p BBs.
3522   ///
3523   /// This does not include any block from the body, including the one returned
3524   /// by getBody().
3525   ///
3526   /// FIXME: This currently includes the Preheader and After blocks even though
3527   /// their content is (mostly) not under CanonicalLoopInfo's control.
3528   /// Re-evaluate whether this makes sense.
3529   void collectControlBlocks(SmallVectorImpl<BasicBlock *> &BBs);
3530 
3531   /// Sets the number of loop iterations to the given value. This value must be
3532   /// valid in the condition block (i.e., defined in the preheader) and is
3533   /// interpreted as an unsigned integer.
3534   void setTripCount(Value *TripCount);
3535 
3536   /// Replace all uses of the canonical induction variable in the loop body with
3537   /// a new one.
3538   ///
3539   /// The intended use case is to update the induction variable for an updated
3540   /// iteration space such that it can stay normalized in the 0...tripcount-1
3541   /// range.
3542   ///
3543   /// The \p Updater is called with the (presumably updated) current normalized
3544   /// induction variable and is expected to return the value that uses of the
3545   /// pre-updated induction values should use instead, typically dependent on
3546   /// the new induction variable. This is a lambda (instead of e.g. just passing
3547   /// the new value) to be able to distinguish the uses of the pre-updated
3548   /// induction variable and uses of the induction variable to compute the
3549   /// updated induction variable value.
3550   void mapIndVar(llvm::function_ref<Value *(Instruction *)> Updater);
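  // A minimal sketch of an Updater, assuming a hypothetical `ChunkSize` value
  // and an IRBuilder `Builder` positioned where the remapped value may be
  // computed: uses of the old induction variable are redirected to
  // `NewIV * ChunkSize` while the loop itself keeps iterating over
  // 0..tripcount-1:
  //
  //   mapIndVar([&](Instruction *NewIV) -> Value * {
  //     return Builder.CreateMul(NewIV, ChunkSize, "scaled.iv");
  //   });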
3551 
3552 public:
3553   /// Returns whether this object currently represents the IR of a loop. If
3554   /// returning false, it may have been consumed by a loop transformation or not
3555   /// been initialized. Do not use it in this case.
3556   bool isValid() const { return Header; }
3557 
3558   /// The preheader ensures that there is only a single edge entering the loop.
3559   /// Code that must be executed before any loop iteration can be emitted here,
3560   /// such as computing the loop trip count and begin lifetime markers. Code in
3561   /// the preheader is not considered part of the canonical loop.
3562   BasicBlock *getPreheader() const;
3563 
3564   /// The header is the entry for each iteration. In the canonical control flow,
3565   /// it only contains the PHINode for the induction variable.
3566   BasicBlock *getHeader() const {
3567     assert(isValid() && "Requires a valid canonical loop");
3568     return Header;
3569   }
3570 
3571   /// The condition block computes whether there is another loop iteration. If
3572   /// yes, branches to the body; otherwise to the exit block.
3573   BasicBlock *getCond() const {
3574     assert(isValid() && "Requires a valid canonical loop");
3575     return Cond;
3576   }
3577 
3578   /// The body block is the single entry for a loop iteration and not controlled
3579   /// by CanonicalLoopInfo. It can contain arbitrary control flow but must
3580   /// eventually branch to the \p Latch block.
3581   BasicBlock *getBody() const {
3582     assert(isValid() && "Requires a valid canonical loop");
3583     return cast<BranchInst>(Cond->getTerminator())->getSuccessor(0);
3584   }
3585 
3586   /// Reaching the latch indicates the end of the loop body code. In the
3587   /// canonical control flow, it only contains the increment of the induction
3588   /// variable.
3589   BasicBlock *getLatch() const {
3590     assert(isValid() && "Requires a valid canonical loop");
3591     return Latch;
3592   }
3593 
3594   /// Reaching the exit indicates no more iterations are being executed.
3595   BasicBlock *getExit() const {
3596     assert(isValid() && "Requires a valid canonical loop");
3597     return Exit;
3598   }
3599 
3600   /// The after block is intended for clean-up code such as lifetime end
3601   /// markers. It is separate from the exit block to ensure that, analogous to
3602   /// the preheader, it has just a single entry edge and is free from PHI
3603   /// nodes should there be multiple loop exits (such as from break
3604   /// statements/cancellations).
3605   BasicBlock *getAfter() const {
3606     assert(isValid() && "Requires a valid canonical loop");
3607     return Exit->getSingleSuccessor();
3608   }
3609 
3610   /// Returns the llvm::Value containing the number of loop iterations. It must
3611   /// be valid in the preheader and always interpreted as an unsigned integer of
3612   /// any bit-width.
3613   Value *getTripCount() const {
3614     assert(isValid() && "Requires a valid canonical loop");
3615     Instruction *CmpI = &Cond->front();
3616     assert(isa<CmpInst>(CmpI) && "First inst must compare IV with TripCount");
3617     return CmpI->getOperand(1);
3618   }
3619 
3620   /// Returns the instruction representing the current logical induction
3621   /// variable. Always unsigned, always starting at 0 with an increment of one.
3622   Instruction *getIndVar() const {
3623     assert(isValid() && "Requires a valid canonical loop");
3624     Instruction *IndVarPHI = &Header->front();
3625     assert(isa<PHINode>(IndVarPHI) && "First inst must be the IV PHI");
3626     return IndVarPHI;
3627   }
3628 
3629   /// Return the type of the induction variable (and the trip count).
3630   Type *getIndVarType() const {
3631     assert(isValid() && "Requires a valid canonical loop");
3632     return getIndVar()->getType();
3633   }
3634 
3635   /// Return the insertion point for user code before the loop.
3636   OpenMPIRBuilder::InsertPointTy getPreheaderIP() const {
3637     assert(isValid() && "Requires a valid canonical loop");
3638     BasicBlock *Preheader = getPreheader();
3639     return {Preheader, std::prev(Preheader->end())};
3640   };
3641 
3642   /// Return the insertion point for user code in the body.
3643   OpenMPIRBuilder::InsertPointTy getBodyIP() const {
3644     assert(isValid() && "Requires a valid canonical loop");
3645     BasicBlock *Body = getBody();
3646     return {Body, Body->begin()};
3647   };
3648 
3649   /// Return the insertion point for user code after the loop.
3650   OpenMPIRBuilder::InsertPointTy getAfterIP() const {
3651     assert(isValid() && "Requires a valid canonical loop");
3652     BasicBlock *After = getAfter();
3653     return {After, After->begin()};
3654   };
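  // A minimal sketch of using the insertion points above to emit user code,
  // assuming a valid CanonicalLoopInfo `CLI`, an OpenMPIRBuilder `OMPBuilder`
  // with IRBuilder `Builder`, and a caller-provided callee `UseFn`:
  //
  //   Builder.restoreIP(CLI->getBodyIP());
  //   Builder.CreateCall(UseFn, {CLI->getIndVar()}); // per-iteration body code
  //   Builder.restoreIP(CLI->getAfterIP());
  //   // ... code that should run once the loop has finished ...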
3655 
3656   Function *getFunction() const {
3657     assert(isValid() && "Requires a valid canonical loop");
3658     return Header->getParent();
3659   }
3660 
3661   /// Consistency self-check.
3662   void assertOK() const;
3663 
3664   /// Invalidate this loop. That is, the underlying IR does not fulfill the
3665   /// requirements of an OpenMP canonical loop anymore.
3666   void invalidate();
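  // A minimal sketch of the invalidation contract described in the class
  // comment, assuming `OMPBuilder`, `DL`, a valid `CLI` and a `TileSize` value
  // exist, and that tileLoops takes the debug location, the loop nest and the
  // tile sizes (see its declaration in OpenMPIRBuilder):
  //
  //   std::vector<CanonicalLoopInfo *> Tiled =
  //       OMPBuilder.tileLoops(DL, {CLI}, {TileSize});
  //   // Treat CLI as invalidated from here on; only use the returned loops.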
3667 };
3668 
3669 } // end namespace llvm
3670 
3671 #endif // LLVM_FRONTEND_OPENMP_OMPIRBUILDER_H
3672