1 //===- PluginInterface.h - Target independent plugin device interface -----===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 //===----------------------------------------------------------------------===//
10 
11 #ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_PLUGININTERFACE_H
12 #define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_PLUGININTERFACE_H
13 
14 #include <cstddef>
15 #include <cstdint>
16 #include <deque>
17 #include <list>
18 #include <map>
19 #include <shared_mutex>
20 #include <vector>
21 
22 #include "ExclusiveAccess.h"
23 #include "Shared/APITypes.h"
24 #include "Shared/Debug.h"
25 #include "Shared/Environment.h"
26 #include "Shared/EnvironmentVar.h"
27 #include "Shared/Requirements.h"
28 #include "Shared/Utils.h"
29 
30 #include "GlobalHandler.h"
31 #include "JIT.h"
32 #include "MemoryManager.h"
33 #include "RPC.h"
34 #include "omptarget.h"
35 
36 #ifdef OMPT_SUPPORT
37 #include "omp-tools.h"
38 #endif
39 
40 #include "llvm/ADT/SmallVector.h"
41 #include "llvm/Frontend/OpenMP/OMPConstants.h"
42 #include "llvm/Frontend/OpenMP/OMPGridValues.h"
43 #include "llvm/Support/Allocator.h"
44 #include "llvm/Support/Error.h"
45 #include "llvm/Support/ErrorHandling.h"
46 #include "llvm/Support/MemoryBufferRef.h"
47 #include "llvm/Support/raw_ostream.h"
48 #include "llvm/TargetParser/Triple.h"
49 
50 namespace llvm {
51 namespace omp {
52 namespace target {
53 
54 namespace plugin {
55 
56 struct GenericPluginTy;
57 struct GenericKernelTy;
58 struct GenericDeviceTy;
59 struct RecordReplayTy;
60 
61 /// Class that wraps the __tgt_async_info to simplify its usage. In case the
62 /// object is constructed without a valid __tgt_async_info, the object will use
63 /// an internal one and will synchronize the current thread with the pending
64 /// operations when calling AsyncInfoWrapperTy::finalize(). This latter function
65 /// must be called before destroying the wrapper object.
66 struct AsyncInfoWrapperTy {
67   AsyncInfoWrapperTy(GenericDeviceTy &Device, __tgt_async_info *AsyncInfoPtr);
68 
69   ~AsyncInfoWrapperTy() {
70     assert(!AsyncInfoPtr && "AsyncInfoWrapperTy not finalized");
71   }
72 
73   /// Get the raw __tgt_async_info pointer.
74   operator __tgt_async_info *() const { return AsyncInfoPtr; }
75 
76   /// Indicate whether there is a queue.
77   bool hasQueue() const { return (AsyncInfoPtr->Queue != nullptr); }
78 
79   /// Get the queue.
80   template <typename Ty> Ty getQueueAs() {
81     static_assert(sizeof(Ty) == sizeof(AsyncInfoPtr->Queue),
82                   "Queue is not of the same size as target type");
83     return static_cast<Ty>(AsyncInfoPtr->Queue);
84   }
85 
86   /// Set the queue.
87   template <typename Ty> void setQueueAs(Ty Queue) {
88     static_assert(sizeof(Ty) == sizeof(AsyncInfoPtr->Queue),
89                   "Queue is not of the same size as target type");
90     assert(!AsyncInfoPtr->Queue && "Overwriting queue");
91     AsyncInfoPtr->Queue = Queue;
92   }
93 
94   /// Synchronize with the __tgt_async_info's pending operations if it's the
95   /// internal async info. The error associated with the asynchronous operations
96   /// issued in this queue must be provided in \p Err. This function will update
97   /// the error parameter with the result of the synchronization if it was
98   /// actually executed. This function must be called before destroying the
99   /// object and only once.
100   void finalize(Error &Err);
101 
102   /// Register \p Ptr as an associated allocation that is freed after
103   /// finalization.
104   void freeAllocationAfterSynchronization(void *Ptr) {
105     AsyncInfoPtr->AssociatedAllocations.push_back(Ptr);
106   }
107 
108 private:
109   GenericDeviceTy &Device;
110   __tgt_async_info LocalAsyncInfo;
111   __tgt_async_info *AsyncInfoPtr;
112 };
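// A minimal usage sketch (illustrative only): an operation on a device wraps a
// possibly-null __tgt_async_info, forwards the wrapper to a plugin-specific
// *Impl entry point, and finalizes it exactly once before destruction. The
// surrounding function is a placeholder; dataSubmitImpl stands in for any
// implementation method taking an AsyncInfoWrapperTy.
//
//   Error submitExample(GenericDeviceTy &Device, void *TgtPtr,
//                       const void *HstPtr, int64_t Size,
//                       __tgt_async_info *AsyncInfo) {
//     AsyncInfoWrapperTy Wrapper(Device, AsyncInfo);
//     Error Err = Device.dataSubmitImpl(TgtPtr, HstPtr, Size, Wrapper);
//     // Synchronizes here only if the wrapper owns the internal async info.
//     Wrapper.finalize(Err);
//     return Err;
//   }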
113 
114 /// The information level represents the level of a key-value property in the
115 /// info tree print (i.e. indentation). The first level should be the default.
116 enum InfoLevelKind { InfoLevel1 = 1, InfoLevel2, InfoLevel3 };
117 
118 /// Class for storing device information to be printed later. An object of this
119 /// type acts as a queue of key-value properties. Each property has a key, a
120 /// value, and an optional unit for the value. For printing purposes, the
121 /// information can be classified into several levels. These levels are useful
122 /// for defining sections and subsections. Thus, each key-value property also
123 /// has an additional field indicating the level to which it belongs. Notice that
124 /// we use the level to determine the indentation of the key-value property at
125 /// printing time. See the enum InfoLevelKind for the list of accepted levels.
126 class InfoQueueTy {
127 public:
128   struct InfoQueueEntryTy {
129     std::string Key;
130     std::string Value;
131     std::string Units;
132     uint64_t Level;
133   };
134 
135 private:
136   std::deque<InfoQueueEntryTy> Queue;
137 
138 public:
139   /// Add a new info entry to the queue. The entry requires at least a key
140   /// string in \p Key. The value in \p Value is optional and can be any type
141   /// that is representable as a string. The units in \p Units is optional and
142   /// must be a string. The info level is a template parameter that defaults to
143   /// the first level (top level).
144   template <InfoLevelKind L = InfoLevel1, typename T = std::string>
145   void add(const std::string &Key, T Value = T(),
146            const std::string &Units = std::string()) {
147     assert(!Key.empty() && "Invalid info key");
148 
149     // Convert the value to a string depending on its type.
150     if constexpr (std::is_same_v<T, bool>)
151       Queue.push_back({Key, Value ? "Yes" : "No", Units, L});
152     else if constexpr (std::is_arithmetic_v<T>)
153       Queue.push_back({Key, std::to_string(Value), Units, L});
154     else
155       Queue.push_back({Key, Value, Units, L});
156   }
157 
158   const std::deque<InfoQueueEntryTy> &getQueue() const { return Queue; }
159 
160   /// Print all info entries added to the queue.
161   void print() const {
162     // We print four spaces for each level.
163     constexpr uint64_t IndentSize = 4;
164 
165     // Find the maximum key length (level + key) to compute the individual
166     // indentation of each entry.
167     uint64_t MaxKeySize = 0;
168     for (const auto &Entry : Queue) {
169       uint64_t KeySize = Entry.Key.size() + Entry.Level * IndentSize;
170       if (KeySize > MaxKeySize)
171         MaxKeySize = KeySize;
172     }
173 
174     // Print all info entries.
175     for (const auto &Entry : Queue) {
176       // Compute the indentations for the current entry.
177       uint64_t KeyIndentSize = Entry.Level * IndentSize;
178       uint64_t ValIndentSize =
179           MaxKeySize - (Entry.Key.size() + KeyIndentSize) + IndentSize;
180 
181       llvm::outs() << std::string(KeyIndentSize, ' ') << Entry.Key
182                    << std::string(ValIndentSize, ' ') << Entry.Value
183                    << (Entry.Units.empty() ? "" : " ") << Entry.Units << "\n";
184     }
185   }
186 };
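// A short sketch of how a plugin's obtainInfoImpl() might populate the queue
// (the keys and values below are made up for illustration):
//
//   InfoQueueTy Info;
//   Info.add("Device Name", "ExampleAccelerator");
//   Info.add("Global Memory", 16, "GB");
//   Info.add<InfoLevel2>("ECC Enabled", true);
//   Info.print();
//
// Arithmetic and boolean values are converted to strings automatically; the
// template level parameter only affects the indentation used when printing.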
187 
188 /// Class wrapping a __tgt_device_image and its offload entry table on a
189 /// specific device. This class is responsible for storing and managing
190 /// the offload entries for an image on a device.
191 class DeviceImageTy {
192   /// Image identifier within the corresponding device. Notice that this id is
193   /// not unique between different devices; they may overlap.
194   int32_t ImageId;
195 
196   /// The pointer to the raw __tgt_device_image.
197   const __tgt_device_image *TgtImage;
198   const __tgt_device_image *TgtImageBitcode;
199 
200   /// Reference to the device this image is loaded on.
201   GenericDeviceTy &Device;
202 
203   /// If this image has any global destructors that must be called.
204   /// FIXME: This is only required because we currently have no invariants
205   ///        towards the lifetime of the underlying image. We should either copy
206   ///        the image into memory locally or erase the pointers after init.
207   bool PendingGlobalDtors;
208 
209 public:
210   DeviceImageTy(int32_t Id, GenericDeviceTy &Device,
211                 const __tgt_device_image *Image)
212       : ImageId(Id), TgtImage(Image), TgtImageBitcode(nullptr), Device(Device),
213         PendingGlobalDtors(false) {
214     assert(TgtImage && "Invalid target image");
215   }
216 
217   /// Get the image identifier within the device.
218   int32_t getId() const { return ImageId; }
219 
220   /// Get the device that this image is loaded onto.
221   GenericDeviceTy &getDevice() const { return Device; }
222 
223   /// Get the pointer to the raw __tgt_device_image.
224   const __tgt_device_image *getTgtImage() const { return TgtImage; }
225 
226   void setTgtImageBitcode(const __tgt_device_image *TgtImageBitcode) {
227     this->TgtImageBitcode = TgtImageBitcode;
228   }
229 
230   const __tgt_device_image *getTgtImageBitcode() const {
231     return TgtImageBitcode;
232   }
233 
234   /// Get the image starting address.
235   void *getStart() const { return TgtImage->ImageStart; }
236 
237   /// Get the image size.
238   size_t getSize() const {
239     return utils::getPtrDiff(TgtImage->ImageEnd, TgtImage->ImageStart);
240   }
241 
242   /// Get a memory buffer reference to the whole image.
243   MemoryBufferRef getMemoryBuffer() const {
244     return MemoryBufferRef(StringRef((const char *)getStart(), getSize()),
245                            "Image");
246   }
247   /// Accessors for the pending global destructors flag.
248   bool setPendingGlobalDtors() { return PendingGlobalDtors = true; }
249   bool hasPendingGlobalDtors() const { return PendingGlobalDtors; }
250 };
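// A DeviceImageTy is mostly consumed through its raw buffer, for instance when
// a plugin inspects the ELF in loadBinaryImpl (sketch; the surrounding code is
// assumed):
//
//   MemoryBufferRef MB = Image.getMemoryBuffer();
//   // MB spans [Image.getStart(), Image.getStart() + Image.getSize()), i.e.,
//   // the bytes of the original __tgt_device_image.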
251 
252 /// Class implementing common functionalities of offload kernels. Each plugin
253 /// should define the specific kernel class, derive from this generic one, and
254 /// implement the necessary virtual function members.
255 struct GenericKernelTy {
256   /// Construct a kernel with a name and an execution mode.
257   GenericKernelTy(const char *Name)
258       : Name(Name), PreferredNumThreads(0), MaxNumThreads(0) {}
259 
260   virtual ~GenericKernelTy() {}
261 
262   /// Initialize the kernel object from a specific device.
263   Error init(GenericDeviceTy &GenericDevice, DeviceImageTy &Image);
264   virtual Error initImpl(GenericDeviceTy &GenericDevice,
265                          DeviceImageTy &Image) = 0;
266 
267   /// Launch the kernel on the specific device. The device must be the same
268   /// one used to initialize the kernel.
269   Error launch(GenericDeviceTy &GenericDevice, void **ArgPtrs,
270                ptrdiff_t *ArgOffsets, KernelArgsTy &KernelArgs,
271                AsyncInfoWrapperTy &AsyncInfoWrapper) const;
272   virtual Error launchImpl(GenericDeviceTy &GenericDevice,
273                            uint32_t NumThreads[3], uint32_t NumBlocks[3],
274                            KernelArgsTy &KernelArgs,
275                            KernelLaunchParamsTy LaunchParams,
276                            AsyncInfoWrapperTy &AsyncInfoWrapper) const = 0;
277 
278   /// Get the kernel name.
279   const char *getName() const { return Name; }
280 
281   /// Get the kernel image.
282   DeviceImageTy &getImage() const {
283     assert(ImagePtr && "Kernel is not initialized!");
284     return *ImagePtr;
285   }
286 
287   /// Return the kernel environment object for kernel \p Name.
288   const KernelEnvironmentTy &getKernelEnvironmentForKernel() {
289     return KernelEnvironment;
290   }
291 
292   /// Return a device pointer to a new kernel launch environment.
293   Expected<KernelLaunchEnvironmentTy *>
294   getKernelLaunchEnvironment(GenericDeviceTy &GenericDevice, uint32_t Version,
295                              AsyncInfoWrapperTy &AsyncInfo) const;
296 
297   /// Indicate whether an execution mode is valid.
298   static bool isValidExecutionMode(OMPTgtExecModeFlags ExecutionMode) {
299     switch (ExecutionMode) {
300     case OMP_TGT_EXEC_MODE_SPMD:
301     case OMP_TGT_EXEC_MODE_GENERIC:
302     case OMP_TGT_EXEC_MODE_GENERIC_SPMD:
303       return true;
304     }
305     return false;
306   }
307 
308 protected:
309   /// Get the execution mode name of the kernel.
310   const char *getExecutionModeName() const {
311     switch (KernelEnvironment.Configuration.ExecMode) {
312     case OMP_TGT_EXEC_MODE_SPMD:
313       return "SPMD";
314     case OMP_TGT_EXEC_MODE_GENERIC:
315       return "Generic";
316     case OMP_TGT_EXEC_MODE_GENERIC_SPMD:
317       return "Generic-SPMD";
318     }
319     llvm_unreachable("Unknown execution mode!");
320   }
321 
322   /// Prints generic kernel launch information.
323   Error printLaunchInfo(GenericDeviceTy &GenericDevice,
324                         KernelArgsTy &KernelArgs, uint32_t NumThreads[3],
325                         uint32_t NumBlocks[3]) const;
326 
327   /// Prints plugin-specific kernel launch information after generic kernel
328   /// launch information
329   virtual Error printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
330                                        KernelArgsTy &KernelArgs,
331                                        uint32_t NumThreads[3],
332                                        uint32_t NumBlocks[3]) const;
333 
334 private:
335   /// Prepare the arguments before launching the kernel.
336   KernelLaunchParamsTy
337   prepareArgs(GenericDeviceTy &GenericDevice, void **ArgPtrs,
338               ptrdiff_t *ArgOffsets, uint32_t &NumArgs,
339               llvm::SmallVectorImpl<void *> &Args,
340               llvm::SmallVectorImpl<void *> &Ptrs,
341               KernelLaunchEnvironmentTy *KernelLaunchEnvironment) const;
342 
343   /// Get the number of threads and blocks for the kernel based on the
344   /// user-defined threads and block clauses.
345   uint32_t getNumThreads(GenericDeviceTy &GenericDevice,
346                          uint32_t ThreadLimitClause[3]) const;
347 
348   /// The number of threads \p NumThreads can be adjusted by this method.
349   /// \p IsNumThreadsFromUser is true if \p NumThreads is defined by the user via
350   /// the thread_limit clause.
351   uint32_t getNumBlocks(GenericDeviceTy &GenericDevice,
352                         uint32_t BlockLimitClause[3], uint64_t LoopTripCount,
353                         uint32_t &NumThreads, bool IsNumThreadsFromUser) const;
354 
355   /// Indicate if the kernel works in Generic SPMD, Generic or SPMD mode.
356   bool isGenericSPMDMode() const {
357     return KernelEnvironment.Configuration.ExecMode ==
358            OMP_TGT_EXEC_MODE_GENERIC_SPMD;
359   }
360   bool isGenericMode() const {
361     return KernelEnvironment.Configuration.ExecMode ==
362            OMP_TGT_EXEC_MODE_GENERIC;
363   }
364   bool isSPMDMode() const {
365     return KernelEnvironment.Configuration.ExecMode == OMP_TGT_EXEC_MODE_SPMD;
366   }
367 
368   /// The kernel name.
369   const char *Name;
370 
371   /// The image that contains this kernel.
372   DeviceImageTy *ImagePtr = nullptr;
373 
374 protected:
375   /// The preferred number of threads to run the kernel.
376   uint32_t PreferredNumThreads;
377 
378   /// The maximum number of threads which the kernel could leverage.
379   uint32_t MaxNumThreads;
380 
381   /// The kernel environment, including execution flags.
382   KernelEnvironmentTy KernelEnvironment;
383 
384   /// The prototype kernel launch environment.
385   KernelLaunchEnvironmentTy KernelLaunchEnvironment;
386 
387   /// If the kernel is a bare kernel.
388   bool IsBareKernel = false;
389 };
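// Sketch of a plugin-specific kernel type (all "Example" names are
// hypothetical). The plugin derives from GenericKernelTy and implements the
// two pure virtual members; the generic launch() entry point prepares the
// arguments, grid sizes, and kernel launch environment before calling
// launchImpl().
//
//   struct ExampleKernelTy : public GenericKernelTy {
//     ExampleKernelTy(const char *Name) : GenericKernelTy(Name) {}
//
//     Error initImpl(GenericDeviceTy &Device, DeviceImageTy &Image) override {
//       // Resolve the kernel symbol in the loaded image, query limits, etc.
//       return Error::success();
//     }
//
//     Error launchImpl(GenericDeviceTy &Device, uint32_t NumThreads[3],
//                      uint32_t NumBlocks[3], KernelArgsTy &KernelArgs,
//                      KernelLaunchParamsTy LaunchParams,
//                      AsyncInfoWrapperTy &AsyncInfoWrapper) const override {
//       // Enqueue the kernel on the wrapper's queue.
//       return Error::success();
//     }
//   };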
390 
391 /// Information about an allocation, when it has been allocated, and when/if it
392 /// has been deallocated, for error reporting purposes.
393 struct AllocationTraceInfoTy {
394 
395   /// The stack trace of the allocation itself.
396   std::string AllocationTrace;
397 
398   /// The stack trace of the deallocation, or empty.
399   std::string DeallocationTrace;
400 
401   /// The allocated device pointer.
402   void *DevicePtr = nullptr;
403 
404   /// The corresponding host pointer (can be null).
405   void *HostPtr = nullptr;
406 
407   /// The size of the allocation.
408   uint64_t Size = 0;
409 
410   /// The kind of the allocation.
411   TargetAllocTy Kind = TargetAllocTy::TARGET_ALLOC_DEFAULT;
412 
413   /// Information about the last allocation at this address, if any.
414   AllocationTraceInfoTy *LastAllocationInfo = nullptr;
415 
416   /// Lock to keep accesses race free.
417   std::mutex Lock;
418 };
419 
420 /// Information about a kernel launch: the launched kernel, the stack trace of
421 /// the launch, and the async info it was issued in, for error reporting purposes.
422 struct KernelTraceInfoTy {
423 
424   /// The launched kernel.
425   GenericKernelTy *Kernel;
426 
427   /// The stack trace of the launch itself.
428   std::string LaunchTrace;
429 
430   /// The async info the kernel was launched in.
431   __tgt_async_info *AsyncInfo;
432 };
433 
434 struct KernelTraceInfoRecordTy {
435   KernelTraceInfoRecordTy() { KTIs.fill({}); }
436 
437   /// Return the (maximal) record size.
438   auto size() const { return KTIs.size(); }
439 
440   /// Create a new kernel trace info and add it into the record.
441   void emplace(GenericKernelTy *Kernel, const std::string &&StackTrace,
442                __tgt_async_info *AsyncInfo) {
443     KTIs[Idx] = {Kernel, std::move(StackTrace), AsyncInfo};
444     Idx = (Idx + 1) % size();
445   }
446 
447   /// Return the \p I'th last kernel trace info.
448   auto getKernelTraceInfo(int32_t I) const {
449     // Note that kernel trace infos "grow forward", so lookup is backwards.
450     return KTIs[(Idx - I - 1 + size()) % size()];
451   }
452 
453 private:
454   std::array<KernelTraceInfoTy, 8> KTIs;
455   unsigned Idx = 0;
456 };
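// The record behaves as a small ring buffer: emplace() overwrites the oldest
// slot once all entries are in use, and getKernelTraceInfo(0) always returns
// the most recently recorded launch. For example (Kernels[] is hypothetical):
//
//   KernelTraceInfoRecordTy Record;
//   for (int I = 0; I < 10; ++I)
//     Record.emplace(Kernels[I], /*StackTrace=*/"", /*AsyncInfo=*/nullptr);
//   // Record.getKernelTraceInfo(0).Kernel == Kernels[9]
//   // Record.getKernelTraceInfo(1).Kernel == Kernels[8]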
457 
458 /// Class representing a map of host pinned allocations. We track these pinned
459 /// allocations so that memory transfers involving these buffers can be optimized.
460 class PinnedAllocationMapTy {
461 
462   /// Struct representing a map entry.
463   struct EntryTy {
464     /// The host pointer of the pinned allocation.
465     void *HstPtr;
466 
467     /// The pointer that the device driver should use to transfer data from/to the
468     /// pinned allocation. In most plugins, this pointer will be the same as the
469     /// host pointer above.
470     void *DevAccessiblePtr;
471 
472     /// The size of the pinned allocation.
473     size_t Size;
474 
475     /// Indicate whether the allocation was locked from outside the plugin, for
476     /// instance, from the application. The externally locked allocations are
477     /// not unlocked by the plugin when unregistering the last user.
478     bool ExternallyLocked;
479 
480     /// The number of references to the pinned allocation. The allocation should
481     /// remain pinned and registered to the map until the number of references
482     /// becomes zero.
483     mutable size_t References;
484 
485     /// Create an entry with the host and device accessible pointers, the buffer
486     /// size, and a boolean indicating whether the buffer was locked externally.
487     EntryTy(void *HstPtr, void *DevAccessiblePtr, size_t Size,
488             bool ExternallyLocked)
489         : HstPtr(HstPtr), DevAccessiblePtr(DevAccessiblePtr), Size(Size),
490           ExternallyLocked(ExternallyLocked), References(1) {}
491 
492     /// Utility constructor used for std::set searches.
493     EntryTy(void *HstPtr)
494         : HstPtr(HstPtr), DevAccessiblePtr(nullptr), Size(0),
495           ExternallyLocked(false), References(0) {}
496   };
497 
498   /// Comparator of map entries. Use the host pointer to enforce an order
499   /// between entries.
500   struct EntryCmpTy {
501     bool operator()(const EntryTy &Left, const EntryTy &Right) const {
502       return Left.HstPtr < Right.HstPtr;
503     }
504   };
505 
506   typedef std::set<EntryTy, EntryCmpTy> PinnedAllocSetTy;
507 
508   /// The map of host pinned allocations.
509   PinnedAllocSetTy Allocs;
510 
511   /// The mutex to protect accesses to the map.
512   mutable std::shared_mutex Mutex;
513 
514   /// Reference to the corresponding device.
515   GenericDeviceTy &Device;
516 
517   /// Indicate whether mapped host buffers should be locked automatically.
518   bool LockMappedBuffers;
519 
520   /// Indicate whether failures when locking mapped buffers should be ignored.
521   bool IgnoreLockMappedFailures;
522 
523   /// Find an allocation that intersects with the \p HstPtr pointer. Assume the
524   /// map's mutex is acquired.
525   const EntryTy *findIntersecting(const void *HstPtr) const {
526     if (Allocs.empty())
527       return nullptr;
528 
529     // Search for the first allocation whose starting address is not less than
530     // the buffer address.
531     auto It = Allocs.lower_bound({const_cast<void *>(HstPtr)});
532 
533     // Direct match of starting addresses.
534     if (It != Allocs.end() && It->HstPtr == HstPtr)
535       return &(*It);
536 
537     // No direct match, but a previous pinned allocation in the map may
538     // contain the buffer. Return null if there is no such previous
539     // allocation.
540     if (It == Allocs.begin())
541       return nullptr;
542 
543     // Move to the previous pinned allocation.
544     --It;
545 
546     // The start of the buffer is contained in the previous pinned allocation.
547     if (utils::advancePtr(It->HstPtr, It->Size) > HstPtr)
548       return &(*It);
549 
550     // None found.
551     return nullptr;
552   }
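  // Worked example for findIntersecting() above: with entries pinned at 0x1000
  // (size 0x100) and 0x2000 (size 0x100), a query for 0x1010 lower-bounds to
  // the 0x2000 entry, steps back to the 0x1000 entry, and returns it because
  // 0x1000 + 0x100 > 0x1010. A query for 0x1800 steps back the same way but
  // returns null since 0x1000 + 0x100 <= 0x1800.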
553 
554   /// Insert an entry into the map representing a locked buffer. The number of
555   /// references is set to one.
556   Error insertEntry(void *HstPtr, void *DevAccessiblePtr, size_t Size,
557                     bool ExternallyLocked = false);
558 
559   /// Erase an existing entry from the map.
560   Error eraseEntry(const EntryTy &Entry);
561 
562   /// Register a new user into an entry that represents a locked buffer. Check
563   /// also that the registered buffer with \p HstPtr address and \p Size is
564   /// actually contained in the entry.
565   Error registerEntryUse(const EntryTy &Entry, void *HstPtr, size_t Size);
566 
567   /// Unregister a user from the entry and return whether it is the last user.
568   /// If it is the last user, the entry will have to be removed from the map
569   /// and the entry's host buffer unlocked (if necessary).
570   Expected<bool> unregisterEntryUse(const EntryTy &Entry);
571 
572   /// Indicate whether the first range A fully contains the second range B.
573   static bool contains(void *PtrA, size_t SizeA, void *PtrB, size_t SizeB) {
574     void *EndA = utils::advancePtr(PtrA, SizeA);
575     void *EndB = utils::advancePtr(PtrB, SizeB);
576     return (PtrB >= PtrA && EndB <= EndA);
577   }
578 
579   /// Indicate whether the first range A intersects with the second range B.
580   static bool intersects(void *PtrA, size_t SizeA, void *PtrB, size_t SizeB) {
581     void *EndA = utils::advancePtr(PtrA, SizeA);
582     void *EndB = utils::advancePtr(PtrB, SizeB);
583     return (PtrA < EndB && PtrB < EndA);
584   }
585 
586 public:
587   /// Create the map of pinned allocations corresponding to a specific device.
588   PinnedAllocationMapTy(GenericDeviceTy &Device) : Device(Device) {
589 
590     // Envar that indicates whether mapped host buffers should be locked
591     // automatically. The possible values are boolean (on/off) and a special one:
592     //   off:       Mapped host buffers are not locked.
593     //   on:        Mapped host buffers are locked in a best-effort approach.
594     //              Failures to lock buffers are silent.
595     //   mandatory: Mapped host buffers are always locked and failures to lock
596     //              a buffer results in a fatal error.
597     StringEnvar OMPX_LockMappedBuffers("LIBOMPTARGET_LOCK_MAPPED_HOST_BUFFERS",
598                                        "off");
599 
600     bool Enabled;
601     if (StringParser::parse(OMPX_LockMappedBuffers.get().data(), Enabled)) {
602       // Parsed as a boolean value. Enable the feature if necessary.
603       LockMappedBuffers = Enabled;
604       IgnoreLockMappedFailures = true;
605     } else if (OMPX_LockMappedBuffers.get() == "mandatory") {
606       // Enable the feature and failures are fatal.
607       LockMappedBuffers = true;
608       IgnoreLockMappedFailures = false;
609     } else {
610       // The value is invalid; disable the feature.
611       DP("Invalid value LIBOMPTARGET_LOCK_MAPPED_HOST_BUFFERS=%s\n",
612          OMPX_LockMappedBuffers.get().data());
613       LockMappedBuffers = false;
614     }
615   }
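  // For reference, the envar above is set from the environment, e.g.:
  //   LIBOMPTARGET_LOCK_MAPPED_HOST_BUFFERS=mandatory ./offload-app
  // Any value other than a boolean or "mandatory" disables the feature.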
616 
617   /// Register a buffer that was recently allocated as a locked host buffer.
618   /// None of the already registered pinned allocations should intersect with
619   /// this new one. The registration requires the host pointer in \p HstPtr,
620   /// the device accessible pointer in \p DevAccessiblePtr, and the size of the
621   /// allocation in \p Size. The allocation must be unregistered using the
622   /// unregisterHostBuffer function.
623   Error registerHostBuffer(void *HstPtr, void *DevAccessiblePtr, size_t Size);
624 
625   /// Unregister a host pinned allocation passing the host pointer which was
626   /// previously registered using the registerHostBuffer function. When calling
627   /// this function, the pinned allocation cannot have any other user and will
628   /// not be unlocked by this function.
629   Error unregisterHostBuffer(void *HstPtr);
630 
631   /// Lock the host buffer at \p HstPtr or register a new user if it intersects
632   /// with an already existing one. A partial overlapping with extension is not
633   /// allowed. The function returns the device accessible pointer of the pinned
634   /// buffer. The buffer must be unlocked using the unlockHostBuffer function.
635   Expected<void *> lockHostBuffer(void *HstPtr, size_t Size);
636 
637   /// Unlock the host buffer at \p HstPtr or unregister a user if other users
638   /// are still using the pinned allocation. If this was the last user, the
639   /// pinned allocation is removed from the map and the memory is unlocked.
640   Error unlockHostBuffer(void *HstPtr);
641 
642   /// Lock or register a host buffer that was recently mapped by libomptarget.
643   /// This behavior is applied if LIBOMPTARGET_LOCK_MAPPED_HOST_BUFFERS is
644   /// enabled. Even if not enabled, externally locked buffers are registered
645   /// in order to optimize their transfers.
646   Error lockMappedHostBuffer(void *HstPtr, size_t Size);
647 
648   /// Unlock or unregister a host buffer that was unmapped by libomptarget.
649   Error unlockUnmappedHostBuffer(void *HstPtr);
650 
651   /// Return the device accessible pointer associated with the host pinned
652   /// allocation to which \p HstPtr belongs, if any. Return null in case
653   /// \p HstPtr does not belong to any host pinned allocation. The device
654   /// accessible pointer is the one that devices should use for data transfers
655   /// that involve a host pinned buffer.
656   void *getDeviceAccessiblePtrFromPinnedBuffer(const void *HstPtr) const {
657     std::shared_lock<std::shared_mutex> Lock(Mutex);
658 
659     // Find the intersecting allocation if any.
660     const EntryTy *Entry = findIntersecting(HstPtr);
661     if (!Entry)
662       return nullptr;
663 
664     return utils::advancePtr(Entry->DevAccessiblePtr,
665                              utils::getPtrDiff(HstPtr, Entry->HstPtr));
666   }
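  // For example, if a pinned entry maps the host range [H, H + S) to the
  // device accessible range [D, D + S), querying H + 0x40 returns D + 0x40.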
667 
668   /// Check whether a buffer belongs to a registered host pinned allocation.
669   bool isHostPinnedBuffer(const void *HstPtr) const {
670     std::shared_lock<std::shared_mutex> Lock(Mutex);
671 
672     // Return whether there is an intersecting allocation.
673     return (findIntersecting(const_cast<void *>(HstPtr)) != nullptr);
674   }
675 };
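// Sketch of the expected lock/unlock pairing, using the public entry points
// declared above (error handling is minimal; PinnedAllocs is the device's
// PinnedAllocationMapTy member):
//
//   Expected<void *> DevPtrOrErr = PinnedAllocs.lockHostBuffer(HstPtr, Size);
//   if (!DevPtrOrErr)
//     return DevPtrOrErr.takeError();
//   // ... use *DevPtrOrErr as the device accessible pointer for transfers ...
//   if (Error Err = PinnedAllocs.unlockHostBuffer(HstPtr))
//     return Err;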
676 
677 /// Class implementing common functionalities of offload devices. Each plugin
678 /// should define the specific device class, derive from this generic one, and
679 /// implement the necessary virtual function members.
680 struct GenericDeviceTy : public DeviceAllocatorTy {
681   /// Construct a device with its device id within the plugin, the number of
682   /// devices in the plugin and the grid values for that kind of device.
683   GenericDeviceTy(GenericPluginTy &Plugin, int32_t DeviceId, int32_t NumDevices,
684                   const llvm::omp::GV &GridValues);
685 
686   /// Get the device identifier within the corresponding plugin. Notice that
687   /// this id is not unique between different plugins; they may overlap.
688   int32_t getDeviceId() const { return DeviceId; }
689 
690   /// Set the context of the device if needed, before calling device-specific
691   /// functions. Plugins may implement this function as a no-op if not needed.
692   virtual Error setContext() = 0;
693 
694   /// Initialize the device. After this call, the device should be already
695   /// working and ready to accept queries or modifications.
696   Error init(GenericPluginTy &Plugin);
697   virtual Error initImpl(GenericPluginTy &Plugin) = 0;
698 
699   /// Deinitialize the device and free all its resources. After this call, the
700   /// device is no longer considered ready, so no queries or modifications are
701   /// allowed.
702   Error deinit(GenericPluginTy &Plugin);
703   virtual Error deinitImpl() = 0;
704 
705   /// Load the binary image into the device and return the loaded device image.
706   Expected<DeviceImageTy *> loadBinary(GenericPluginTy &Plugin,
707                                        const __tgt_device_image *TgtImage);
708   virtual Expected<DeviceImageTy *>
709   loadBinaryImpl(const __tgt_device_image *TgtImage, int32_t ImageId) = 0;
710 
711   /// Setup the device environment if needed. Notice this setup may not be run
712   /// on some plugins. By default, it will be executed, but plugins can change
713   /// this behavior by overriding the shouldSetupDeviceEnvironment function.
714   Error setupDeviceEnvironment(GenericPluginTy &Plugin, DeviceImageTy &Image);
715 
716   /// Setup the global device memory pool, if the plugin requires one.
717   Error setupDeviceMemoryPool(GenericPluginTy &Plugin, DeviceImageTy &Image,
718                               uint64_t PoolSize);
719 
720   // Setup the RPC server for this device if needed. This may not run on some
721   // plugins like the CPU targets. By default, it will not be executed so it is
722   // up to the target to override this using the shouldSetupRPCServer function.
723   Error setupRPCServer(GenericPluginTy &Plugin, DeviceImageTy &Image);
724 
725   /// Synchronize the current thread with the pending operations on the
726   /// __tgt_async_info structure.
727   Error synchronize(__tgt_async_info *AsyncInfo);
728   virtual Error synchronizeImpl(__tgt_async_info &AsyncInfo) = 0;
729 
730   /// Invokes any global constructors on the device if present and required
731   /// by the target.
732   virtual Error callGlobalConstructors(GenericPluginTy &Plugin,
733                                        DeviceImageTy &Image) {
734     return Error::success();
735   }
736 
737   /// Invokes any global destructors on the device if present and required
738   /// by the target.
739   virtual Error callGlobalDestructors(GenericPluginTy &Plugin,
740                                       DeviceImageTy &Image) {
741     return Error::success();
742   }
743 
744   /// Query for the completion of the pending operations on the __tgt_async_info
745   /// structure in a non-blocking manner.
746   Error queryAsync(__tgt_async_info *AsyncInfo);
747   virtual Error queryAsyncImpl(__tgt_async_info &AsyncInfo) = 0;
748 
749   /// Check whether the architecture supports VA management
750   virtual bool supportVAManagement() const { return false; }
751 
752   /// Get the total device memory size
753   virtual Error getDeviceMemorySize(uint64_t &DSize);
754 
755   /// Allocates \p RSize bytes (rounded up to page size) and hints the driver to
756   /// map it to \p VAddr. The obtained address is stored in \p Addr. On return,
757   /// \p RSize contains the actual size, which can be equal to or larger than the
758   /// requested size.
759   virtual Error memoryVAMap(void **Addr, void *VAddr, size_t *RSize);
760 
761   /// De-allocates device memory and unmaps the virtual address \p VAddr
762   virtual Error memoryVAUnMap(void *VAddr, size_t Size);
763 
764   /// Allocate data on the device or involving the device.
765   Expected<void *> dataAlloc(int64_t Size, void *HostPtr, TargetAllocTy Kind);
766 
767   /// Deallocate data from the device or involving the device.
768   Error dataDelete(void *TgtPtr, TargetAllocTy Kind);
769 
770   /// Pin host memory to optimize transfers and return the device accessible
771   /// pointer that devices should use for memory transfers involving the host
772   /// pinned allocation.
773   Expected<void *> dataLock(void *HstPtr, int64_t Size) {
774     return PinnedAllocs.lockHostBuffer(HstPtr, Size);
775   }
776 
777   /// Unpin a host memory buffer that was previously pinned.
778   Error dataUnlock(void *HstPtr) {
779     return PinnedAllocs.unlockHostBuffer(HstPtr);
780   }
781 
782   /// Lock the host buffer \p HstPtr with \p Size bytes with the vendor-specific
783   /// API and return the device accessible pointer.
784   virtual Expected<void *> dataLockImpl(void *HstPtr, int64_t Size) = 0;
785 
786   /// Unlock a previously locked host buffer starting at \p HstPtr.
787   virtual Error dataUnlockImpl(void *HstPtr) = 0;
788 
789   /// Mark the host buffer with address \p HstPtr and \p Size bytes as a mapped
790   /// buffer. This means that libomptarget created a new mapping of that host
791   /// buffer (e.g., because of a user OpenMP target map) and the buffer may be used
792   /// as source/destination of memory transfers. We can use this information to
793   /// lock the host buffer and optimize its memory transfers.
794   Error notifyDataMapped(void *HstPtr, int64_t Size) {
795     return PinnedAllocs.lockMappedHostBuffer(HstPtr, Size);
796   }
797 
798   /// Mark the host buffer with address \p HstPtr as unmapped. This means that
799   /// libomptarget removed an existing mapping. If the plugin locked the buffer
800   /// in notifyDataMapped, this function should unlock it.
801   Error notifyDataUnmapped(void *HstPtr) {
802     return PinnedAllocs.unlockUnmappedHostBuffer(HstPtr);
803   }
804 
805   /// Check whether the host buffer with address \p HstPtr is pinned by the
806   /// underlying vendor-specific runtime (if any). Retrieve the host pointer,
807   /// the device accessible pointer and the size of the original pinned buffer.
808   virtual Expected<bool> isPinnedPtrImpl(void *HstPtr, void *&BaseHstPtr,
809                                          void *&BaseDevAccessiblePtr,
810                                          size_t &BaseSize) const = 0;
811 
812   /// Submit data to the device (host to device transfer).
813   Error dataSubmit(void *TgtPtr, const void *HstPtr, int64_t Size,
814                    __tgt_async_info *AsyncInfo);
815   virtual Error dataSubmitImpl(void *TgtPtr, const void *HstPtr, int64_t Size,
816                                AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
817 
818   /// Retrieve data from the device (device to host transfer).
819   Error dataRetrieve(void *HstPtr, const void *TgtPtr, int64_t Size,
820                      __tgt_async_info *AsyncInfo);
821   virtual Error dataRetrieveImpl(void *HstPtr, const void *TgtPtr, int64_t Size,
822                                  AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
823 
824   /// Exchange data between devices (device to device transfer). Calling this
825   /// function is only valid if GenericPluginTy::isDataExchangable() passing the
826   /// two devices returns true.
827   Error dataExchange(const void *SrcPtr, GenericDeviceTy &DstDev, void *DstPtr,
828                      int64_t Size, __tgt_async_info *AsyncInfo);
829   virtual Error dataExchangeImpl(const void *SrcPtr, GenericDeviceTy &DstDev,
830                                  void *DstPtr, int64_t Size,
831                                  AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
832 
833   /// Run the kernel associated with \p EntryPtr
834   Error launchKernel(void *EntryPtr, void **ArgPtrs, ptrdiff_t *ArgOffsets,
835                      KernelArgsTy &KernelArgs, __tgt_async_info *AsyncInfo);
836 
837   /// Initialize a __tgt_async_info structure. Related to interop features.
838   Error initAsyncInfo(__tgt_async_info **AsyncInfoPtr);
839   virtual Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
840 
841   /// Initialize a __tgt_device_info structure. Related to interop features.
842   Error initDeviceInfo(__tgt_device_info *DeviceInfo);
843   virtual Error initDeviceInfoImpl(__tgt_device_info *DeviceInfo) = 0;
844 
845   /// Create an event.
846   Error createEvent(void **EventPtrStorage);
847   virtual Error createEventImpl(void **EventPtrStorage) = 0;
848 
849   /// Destroy an event.
850   Error destroyEvent(void *Event);
851   virtual Error destroyEventImpl(void *EventPtr) = 0;
852 
853   /// Start the recording of the event.
854   Error recordEvent(void *Event, __tgt_async_info *AsyncInfo);
855   virtual Error recordEventImpl(void *EventPtr,
856                                 AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
857 
858   /// Wait for an event to finish. Notice this wait is asynchronous if the
859   /// __tgt_async_info is not nullptr.
860   Error waitEvent(void *Event, __tgt_async_info *AsyncInfo);
861   virtual Error waitEventImpl(void *EventPtr,
862                               AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
863 
864   /// Synchronize the current thread with the event.
865   Error syncEvent(void *EventPtr);
866   virtual Error syncEventImpl(void *EventPtr) = 0;
867 
868   /// Print information about the device.
869   Error printInfo();
870   virtual Error obtainInfoImpl(InfoQueueTy &Info) = 0;
871 
872   /// Getters of the grid values.
873   uint32_t getWarpSize() const { return GridValues.GV_Warp_Size; }
874   uint32_t getThreadLimit() const { return GridValues.GV_Max_WG_Size; }
875   uint32_t getBlockLimit() const { return GridValues.GV_Max_Teams; }
876   uint32_t getDefaultNumThreads() const {
877     return GridValues.GV_Default_WG_Size;
878   }
879   uint32_t getDefaultNumBlocks() const {
880     return GridValues.GV_Default_Num_Teams;
881   }
882   uint32_t getDynamicMemorySize() const { return OMPX_SharedMemorySize; }
883   virtual uint64_t getClockFrequency() const { return CLOCKS_PER_SEC; }
884 
885   /// Get target compute unit kind (e.g., sm_80, or gfx908).
886   virtual std::string getComputeUnitKind() const { return "unknown"; }
887 
888   /// Post-processing after the JIT backend. The ownership of \p MB will be taken.
889   virtual Expected<std::unique_ptr<MemoryBuffer>>
890   doJITPostProcessing(std::unique_ptr<MemoryBuffer> MB) const {
891     return std::move(MB);
892   }
893 
894   /// The minimum number of threads we use for a low-trip count combined loop.
895   /// Instead of using more threads we increase the outer (block/team)
896   /// parallelism.
897   /// @see OMPX_MinThreadsForLowTripCount
898   virtual uint32_t getMinThreadsForLowTripCountLoop() {
899     return OMPX_MinThreadsForLowTripCount;
900   }
901 
902   /// Whether or not to reuse blocks for high trip count loops.
903   /// @see OMPX_ReuseBlocksForHighTripCount
904   bool getReuseBlocksForHighTripCount() {
905     return OMPX_ReuseBlocksForHighTripCount;
906   }
907 
908   /// Get the total amount of hardware parallelism supported by the target
909   /// device. This is the total amount of warps or wavefronts that can be
910   /// resident on the device simultaneously.
911   virtual uint64_t getHardwareParallelism() const { return 0; }
912 
913   /// Get the RPC server running on this device.
914   RPCServerTy *getRPCServer() const { return RPCServer; }
915 
916   /// The number of parallel RPC ports to use on the device. In general, this
917   /// should be roughly equivalent to the amount of hardware parallelism the
918   /// device can support. This is because GPUs in general do not have forward
919   /// progress guarantees, so we minimize thread level dependencies by
920   /// allocating enough space such that each device thread can have a port. This
921   /// is likely overly pessimistic in the average case, but guarantees no
922   /// deadlocks at the cost of memory. This must be overridden by targets
923   /// expecting to use the RPC server.
924   virtual uint64_t requestedRPCPortCount() const {
925     assert(!shouldSetupRPCServer() && "Default implementation cannot be used");
926     return 0;
927   }
928 
929   virtual Error getDeviceStackSize(uint64_t &V) = 0;
930 
931   /// Returns true if the current plugin architecture is an APU
932   /// and unified_shared_memory was not requested by the program.
933   bool useAutoZeroCopy();
934   virtual bool useAutoZeroCopyImpl() { return false; }
935 
936   /// Allocate and construct a kernel object.
937   virtual Expected<GenericKernelTy &> constructKernel(const char *Name) = 0;
938 
939   /// Reference to the underlying plugin that created this device.
940   GenericPluginTy &Plugin;
941 
942   /// Map to record when allocations have been performed, and when they have
943   /// been deallocated, both for error reporting purposes.
944   ProtectedObj<DenseMap<void *, AllocationTraceInfoTy *>> AllocationTraces;
945 
946   /// Return the allocation trace info for a device pointer, that is the
947   /// allocation into which this device pointer points (or pointed).
948   AllocationTraceInfoTy *getAllocationTraceInfoForAddr(void *DevicePtr) {
949     auto AllocationTraceMap = AllocationTraces.getExclusiveAccessor();
950     for (auto &It : *AllocationTraceMap) {
951       if (It.first <= DevicePtr &&
952           utils::advancePtr(It.first, It.second->Size) > DevicePtr)
953         return It.second;
954     }
955     return nullptr;
956   }
957 
958   /// Return the allocation trace info for a device pointer, that is the
959   /// allocation into which this device pointer points (or pointed).
960   AllocationTraceInfoTy *
961   getClosestAllocationTraceInfoForAddr(void *DevicePtr, uintptr_t &Distance) {
962     Distance = 0;
963     if (auto *ATI = getAllocationTraceInfoForAddr(DevicePtr)) {
964       return ATI;
965     }
966 
967     AllocationTraceInfoTy *ATI = nullptr;
968     uintptr_t DevicePtrI = uintptr_t(DevicePtr);
969     auto AllocationTraceMap = AllocationTraces.getExclusiveAccessor();
970     for (auto &It : *AllocationTraceMap) {
971       uintptr_t Begin = uintptr_t(It.second->DevicePtr);
972       uintptr_t End = Begin + It.second->Size - 1;
973       uintptr_t ItDistance = std::min(Begin - DevicePtrI, DevicePtrI - End);
974       if (ATI && ItDistance > Distance)
975         continue;
976       ATI = It.second;
977       Distance = ItDistance;
978     }
979     return ATI;
980   }
981 
982   /// Map to record which kernels have been launched, for error reporting purposes.
983   ProtectedObj<KernelTraceInfoRecordTy> KernelLaunchTraces;
984 
985   /// Environment variable to determine if stack traces for kernel launches are
986   /// tracked.
987   UInt32Envar OMPX_TrackNumKernelLaunches =
988       UInt32Envar("OFFLOAD_TRACK_NUM_KERNEL_LAUNCH_TRACES", 0);
989 
990   /// Environment variable to determine if stack traces for allocations and
991   /// deallocations are tracked.
992   BoolEnvar OMPX_TrackAllocationTraces =
993       BoolEnvar("OFFLOAD_TRACK_ALLOCATION_TRACES", false);
994 
995 private:
996   /// Get and set the stack size and heap size for the device. If not used, the
997   /// plugin can implement the setters as no-ops and the getters as simply
998   /// setting the output value to zero.
999   virtual Error setDeviceStackSize(uint64_t V) = 0;
1000   virtual Error getDeviceHeapSize(uint64_t &V) = 0;
1001   virtual Error setDeviceHeapSize(uint64_t V) = 0;
1002 
1003   /// Indicate whether the device should setup the device environment. Notice
1004   /// that returning false in this function will change the behavior of the
1005   /// setupDeviceEnvironment() function.
1006   virtual bool shouldSetupDeviceEnvironment() const { return true; }
1007 
1008   /// Indicate whether the device should setup the global device memory pool. If
1009   /// false is returned, the value on the device will be uninitialized.
1010   virtual bool shouldSetupDeviceMemoryPool() const { return true; }
1011 
1012   /// Indicate whether or not the device should setup the RPC server. This is
1013   /// only necessary for unhosted targets like the GPU.
1014   virtual bool shouldSetupRPCServer() const { return false; }
1015 
1016   /// Pointer to the memory manager or nullptr if not available.
1017   MemoryManagerTy *MemoryManager;
1018 
1019   /// Environment variables defined by the OpenMP standard.
1020   Int32Envar OMP_TeamLimit;
1021   Int32Envar OMP_NumTeams;
1022   Int32Envar OMP_TeamsThreadLimit;
1023 
1024   /// Environment variables defined by the LLVM OpenMP implementation.
1025   Int32Envar OMPX_DebugKind;
1026   UInt32Envar OMPX_SharedMemorySize;
1027   UInt64Envar OMPX_TargetStackSize;
1028   UInt64Envar OMPX_TargetHeapSize;
1029 
1030   /// Environment flag to set the minimum number of threads we use for a
1031   /// low-trip count combined loop. Instead of using more threads we increase
1032   /// the outer (block/team) parallelism.
1033   UInt32Envar OMPX_MinThreadsForLowTripCount =
1034       UInt32Envar("LIBOMPTARGET_MIN_THREADS_FOR_LOW_TRIP_COUNT", 32);
1035 
1036   BoolEnvar OMPX_ReuseBlocksForHighTripCount =
1037       BoolEnvar("LIBOMPTARGET_REUSE_BLOCKS_FOR_HIGH_TRIP_COUNT", true);
1038 
1039 protected:
1040   /// Environment variables defined by the LLVM OpenMP implementation
1041   /// regarding the initial number of streams and events.
1042   UInt32Envar OMPX_InitialNumStreams;
1043   UInt32Envar OMPX_InitialNumEvents;
1044 
1045   /// Array of images loaded into the device. Images are automatically
1046   /// deallocated by the allocator.
1047   llvm::SmallVector<DeviceImageTy *> LoadedImages;
1048 
1049   /// The identifier of the device within the plugin. Notice this is not a
1050   /// global device id and is not the device id visible to the OpenMP user.
1051   const int32_t DeviceId;
1052 
1053   /// The default grid values used for this device.
1054   llvm::omp::GV GridValues;
1055 
1056   /// Enumeration used for representing the current state of the peer access
1057   /// between two devices (both under the same plugin).
1058   /// The states can be a) PENDING when the state has not been queried and needs
1059   /// to be queried, b) AVAILABLE when the peer access is available to be used,
1060   /// and c) UNAVAILABLE if the system does not allow it.
1061   enum class PeerAccessState : uint8_t { AVAILABLE, UNAVAILABLE, PENDING };
1062 
1063   /// Array of peer access states with the rest of the devices. If device I has
1064   /// PeerAccesses[J] == AVAILABLE, device I can access device J's memory
1065   /// directly. However, notice this
1066   /// does not mean that device J can access device I's memory directly.
1067   llvm::SmallVector<PeerAccessState> PeerAccesses;
1068   std::mutex PeerAccessesLock;
1069 
1070   /// Map of host pinned allocations used to optimize device transfers.
1071   PinnedAllocationMapTy PinnedAllocs;
1072 
1073   /// A pointer to an RPC server instance attached to this device if present.
1074   /// This is used to run the RPC server during task synchronization.
1075   RPCServerTy *RPCServer;
1076 
1077 #ifdef OMPT_SUPPORT
1078   /// OMPT callback functions
1079 #define defineOmptCallback(Name, Type, Code) Name##_t Name##_fn = nullptr;
1080   FOREACH_OMPT_DEVICE_EVENT(defineOmptCallback)
1081 #undef defineOmptCallback
1082 
1083   /// Internal representation for OMPT device (initialize & finalize)
1084   std::atomic<bool> OmptInitialized;
1085 #endif
1086 
1087 private:
1088   DeviceMemoryPoolTy DeviceMemoryPool = {nullptr, 0};
1089   DeviceMemoryPoolTrackingTy DeviceMemoryPoolTracking = {0, 0, ~0U, 0};
1090 };
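// Sketch of a plugin-specific device (all "Example" names are hypothetical).
// A real plugin also implements the remaining pure virtual members (data
// transfers, events, kernel construction, stack/heap sizes, ...); only a few
// are shown to illustrate the pattern.
//
//   struct ExampleDeviceTy : public GenericDeviceTy {
//     ExampleDeviceTy(GenericPluginTy &Plugin, int32_t DeviceId,
//                     int32_t NumDevices)
//         : GenericDeviceTy(Plugin, DeviceId, NumDevices, ExampleGridValues) {}
//
//     Error setContext() override { return Error::success(); }
//     Error initImpl(GenericPluginTy &Plugin) override {
//       // Create the native context and queues for this device.
//       return Error::success();
//     }
//     Error deinitImpl() override { return Error::success(); }
//   };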
1091 
1092 /// Class implementing common functionalities of offload plugins. Each plugin
1093 /// should define the specific plugin class, derive from this generic one, and
1094 /// implement the necessary virtual function members.
1095 struct GenericPluginTy {
1096 
1097   /// Construct a plugin instance.
1098   GenericPluginTy(Triple::ArchType TA)
1099       : GlobalHandler(nullptr), JIT(TA), RPCServer(nullptr),
1100         RecordReplay(nullptr) {}
1101 
1102   virtual ~GenericPluginTy() {}
1103 
1104   /// Initialize the plugin.
1105   Error init();
1106 
1107   /// Initialize the plugin and return the number of available devices.
1108   virtual Expected<int32_t> initImpl() = 0;
1109 
1110   /// Deinitialize the plugin and release the resources.
1111   Error deinit();
1112   virtual Error deinitImpl() = 0;
1113 
1114   /// Create a new device for the underlying plugin.
1115   virtual GenericDeviceTy *createDevice(GenericPluginTy &Plugin,
1116                                         int32_t DeviceID,
1117                                         int32_t NumDevices) = 0;
1118 
1119   /// Create a new global handler for the underlying plugin.
1120   virtual GenericGlobalHandlerTy *createGlobalHandler() = 0;
1121 
1122   /// Get the reference to the device with a certain device id.
1123   GenericDeviceTy &getDevice(int32_t DeviceId) {
1124     assert(isValidDeviceId(DeviceId) && "Invalid device id");
1125     assert(Devices[DeviceId] && "Device is uninitialized");
1126 
1127     return *Devices[DeviceId];
1128   }
1129 
1130   /// Get the number of active devices.
1131   int32_t getNumDevices() const { return NumDevices; }
1132 
1133   /// Get the plugin-specific device identifier.
1134   int32_t getUserId(int32_t DeviceId) const {
1135     assert(UserDeviceIds.contains(DeviceId) && "No user-id registered");
1136     return UserDeviceIds.at(DeviceId);
1137   }
1138 
1139   /// Get the ELF code to recognize the binary image of this plugin.
1140   virtual uint16_t getMagicElfBits() const = 0;
1141 
1142   /// Get the target triple of this plugin.
1143   virtual Triple::ArchType getTripleArch() const = 0;
1144 
1145   /// Get the constant name identifier for this plugin.
1146   virtual const char *getName() const = 0;
1147 
1148   /// Allocate a structure using the internal allocator.
1149   template <typename Ty> Ty *allocate() {
1150     return reinterpret_cast<Ty *>(Allocator.Allocate(sizeof(Ty), alignof(Ty)));
1151   }
1152 
1153   /// Get the reference to the global handler of this plugin.
1154   GenericGlobalHandlerTy &getGlobalHandler() {
1155     assert(GlobalHandler && "Global handler not initialized");
1156     return *GlobalHandler;
1157   }
1158 
1159   /// Get the reference to the JIT used for all devices connected to this
1160   /// plugin.
1161   JITEngine &getJIT() { return JIT; }
1162 
1163   /// Get a reference to the RPC server used to provide host services.
1164   RPCServerTy &getRPCServer() {
1165     assert(RPCServer && "RPC server not initialized");
1166     return *RPCServer;
1167   }
1168 
1169   /// Get a reference to the record and replay interface for the plugin.
1170   RecordReplayTy &getRecordReplay() {
1171     assert(RecordReplay && "RR interface not initialized");
1172     return *RecordReplay;
1173   }
1174 
1175   /// Initialize a device within the plugin.
1176   Error initDevice(int32_t DeviceId);
1177 
1178   /// Deinitialize a device within the plugin and release its resources.
1179   Error deinitDevice(int32_t DeviceId);
1180 
1181   /// Indicate whether data can be exchanged directly between two devices under
1182   /// this same plugin. If this function returns true, it's safe to call the
1183   /// GenericDeviceTy::dataExchange() function on the source device.
1184   virtual bool isDataExchangable(int32_t SrcDeviceId, int32_t DstDeviceId) {
1185     return isValidDeviceId(SrcDeviceId) && isValidDeviceId(DstDeviceId);
1186   }
1187 
1188   /// Top level interface to verify if a given ELF image can be executed on a
1189   /// given target. Returns true if the \p Image is compatible with the plugin.
1190   Expected<bool> checkELFImage(StringRef Image) const;
1191 
1192   /// Return true if the \p Image can be compiled to run on the platform's
1193   /// target architecture.
1194   Expected<bool> checkBitcodeImage(StringRef Image) const;
1195 
1196   /// Indicate if an image is compatible with the plugin devices. Notice that
1197   /// this function may be called before actually initializing the devices, so
1198   /// we cannot move this function into GenericDeviceTy.
1199   virtual Expected<bool> isELFCompatible(uint32_t DeviceID,
1200                                          StringRef Image) const = 0;
1201 
1202 protected:
1203   /// Indicate whether a device id is valid.
1204   bool isValidDeviceId(int32_t DeviceId) const {
1205     return (DeviceId >= 0 && DeviceId < getNumDevices());
1206   }
1207 
1208 public:
1209   // TODO: This plugin interface needs to be cleaned up.
1210 
1211   /// Returns non-zero if the plugin runtime has been initialized.
1212   int32_t is_initialized() const;
1213 
1214   /// Returns non-zero if the \p Image is compatible with the plugin. This
1215   /// function does not require the plugin to be initialized before use.
1216   int32_t is_plugin_compatible(__tgt_device_image *Image);
1217 
1218   /// Returns non-zero if the \p Image is compatible with the device.
1219   int32_t is_device_compatible(int32_t DeviceId, __tgt_device_image *Image);
1220 
1221   /// Returns non-zero if the plugin device has been initialized.
1222   int32_t is_device_initialized(int32_t DeviceId) const;
1223 
1224   /// Initialize the device inside of the plugin.
1225   int32_t init_device(int32_t DeviceId);
1226 
1227   /// Return the number of devices this plugin can support.
1228   int32_t number_of_devices();
1229 
1230   /// Returns non-zero if the data can be exchanged between the two devices.
1231   int32_t is_data_exchangable(int32_t SrcDeviceId, int32_t DstDeviceId);
1232 
1233   /// Initializes the record and replay mechanism inside the plugin.
1234   int32_t initialize_record_replay(int32_t DeviceId, int64_t MemorySize,
1235                                    void *VAddr, bool isRecord, bool SaveOutput,
1236                                    uint64_t &ReqPtrArgOffset);
1237 
1238   /// Loads the associated binary into the plugin and returns a handle to it.
1239   int32_t load_binary(int32_t DeviceId, __tgt_device_image *TgtImage,
1240                       __tgt_device_binary *Binary);
1241 
1242   /// Allocates memory that is accessible to the given device.
1243   void *data_alloc(int32_t DeviceId, int64_t Size, void *HostPtr, int32_t Kind);
1244 
1245   /// Deallocates memory on the given device.
1246   int32_t data_delete(int32_t DeviceId, void *TgtPtr, int32_t Kind);
1247 
1248   /// Locks / pins host memory using the plugin runtime.
1249   int32_t data_lock(int32_t DeviceId, void *Ptr, int64_t Size,
1250                     void **LockedPtr);
1251 
1252   /// Unlocks / unpins host memory using the plugin runtime.
1253   int32_t data_unlock(int32_t DeviceId, void *Ptr);
1254 
1255   /// Notify the runtime about a new mapping that has been created outside.
1256   int32_t data_notify_mapped(int32_t DeviceId, void *HstPtr, int64_t Size);
1257 
1258   /// Notify the runtime about a mapping that has been deleted.
1259   int32_t data_notify_unmapped(int32_t DeviceId, void *HstPtr);
1260 
1261   /// Copy data to the given device.
1262   int32_t data_submit(int32_t DeviceId, void *TgtPtr, void *HstPtr,
1263                       int64_t Size);
1264 
1265   /// Copy data to the given device asynchronously.
1266   int32_t data_submit_async(int32_t DeviceId, void *TgtPtr, void *HstPtr,
1267                             int64_t Size, __tgt_async_info *AsyncInfoPtr);
1268 
1269   /// Copy data from the given device.
1270   int32_t data_retrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr,
1271                         int64_t Size);
1272 
1273   /// Copy data from the given device asynchronously.
1274   int32_t data_retrieve_async(int32_t DeviceId, void *HstPtr, void *TgtPtr,
1275                               int64_t Size, __tgt_async_info *AsyncInfoPtr);
1276 
1277   /// Copy data directly between two devices.
1278   int32_t data_exchange(int32_t SrcDeviceId, void *SrcPtr, int32_t DstDeviceId,
1279                         void *DstPtr, int64_t Size);
1280 
1281   /// Copy data directly between two devices asynchronously.
1282   int32_t data_exchange_async(int32_t SrcDeviceId, void *SrcPtr,
1283                               int DstDeviceId, void *DstPtr, int64_t Size,
1284                               __tgt_async_info *AsyncInfo);
1285 
1286   /// Begin executing a kernel on the given device.
1287   int32_t launch_kernel(int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs,
1288                         ptrdiff_t *TgtOffsets, KernelArgsTy *KernelArgs,
1289                         __tgt_async_info *AsyncInfoPtr);
1290 
1291   /// Synchronize an asynchronous queue with the plugin runtime.
1292   int32_t synchronize(int32_t DeviceId, __tgt_async_info *AsyncInfoPtr);
1293 
1294   /// Query the current state of an asynchronous queue.
1295   int32_t query_async(int32_t DeviceId, __tgt_async_info *AsyncInfoPtr);
1296 
1297   /// Prints information about the given device supported by the plugin.
1298   void print_device_info(int32_t DeviceId);
1299 
1300   /// Creates an event in the given plugin if supported.
1301   int32_t create_event(int32_t DeviceId, void **EventPtr);
1302 
1303   /// Records an event that has occurred.
1304   int32_t record_event(int32_t DeviceId, void *EventPtr,
1305                        __tgt_async_info *AsyncInfoPtr);
1306 
1307   /// Wait until an event has occurred.
1308   int32_t wait_event(int32_t DeviceId, void *EventPtr,
1309                      __tgt_async_info *AsyncInfoPtr);
1310 
1311   /// Synchronize execution until an event is done.
1312   int32_t sync_event(int32_t DeviceId, void *EventPtr);
1313 
1314   /// Remove the event from the plugin.
1315   int32_t destroy_event(int32_t DeviceId, void *EventPtr);
1316 
1317   /// Set the level of information printed by the plugin runtime.
1318   void set_info_flag(uint32_t NewInfoLevel);
1319 
1320   /// Creates an asynchronous queue for the given device.
1321   int32_t init_async_info(int32_t DeviceId, __tgt_async_info **AsyncInfoPtr);
1322 
1323   /// Creates device information to be used for diagnostics.
1324   int32_t init_device_info(int32_t DeviceId, __tgt_device_info *DeviceInfo,
1325                            const char **ErrStr);
1326 
1327   /// Sets the offset into the devices for use by OMPT.
1328   int32_t set_device_identifier(int32_t UserId, int32_t DeviceId);
1329 
1330   /// Returns non-zero if the plugin can support automatic zero-copy.
1331   int32_t use_auto_zero_copy(int32_t DeviceId);
1332 
1333   /// Look up a global symbol in the given binary.
1334   int32_t get_global(__tgt_device_binary Binary, uint64_t Size,
1335                      const char *Name, void **DevicePtr);
1336 
1337   /// Look up a kernel function in the given binary.
1338   int32_t get_function(__tgt_device_binary Binary, const char *Name,
1339                        void **KernelPtr);
1340 
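  // A minimal usage sketch (illustrative only, not part of the interface):
  // roughly how a host runtime could drive these entry points for a simple
  // synchronous offload. Error checking is omitted; `Image`, `HstPtr`, and
  // `Size` are assumed to be provided by the caller, and TARGET_ALLOC_DEFAULT
  // comes from omptarget.h. The *_async variants follow the same pattern but
  // require a final synchronize() on the queue.
  //
  //   GenericPluginTy &Plugin = ...;
  //   int32_t DeviceId = 0;
  //
  //   if (!Plugin.is_initialized() ||
  //       !Plugin.is_device_compatible(DeviceId, Image))
  //     return;
  //
  //   Plugin.init_device(DeviceId);
  //
  //   __tgt_device_binary Binary;
  //   Plugin.load_binary(DeviceId, Image, &Binary);
  //
  //   void *TgtPtr =
  //       Plugin.data_alloc(DeviceId, Size, HstPtr, TARGET_ALLOC_DEFAULT);
  //   Plugin.data_submit(DeviceId, TgtPtr, HstPtr, Size);
  //
  //   // ... look up the entry via get_function() and call launch_kernel() ...
  //
  //   Plugin.data_retrieve(DeviceId, HstPtr, TgtPtr, Size);
  //   Plugin.data_delete(DeviceId, TgtPtr, TARGET_ALLOC_DEFAULT);
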
1341 private:
1342   /// Indicates if the platform runtime has been fully initialized.
1343   bool Initialized = false;
1344 
1345   /// Number of devices available for the plugin.
1346   int32_t NumDevices = 0;
1347 
1348   /// Map of plugin device identifiers to the user device identifier.
1349   llvm::DenseMap<int32_t, int32_t> UserDeviceIds;
1350 
1351   /// Array of pointers to the devices. Initially, they are all set to nullptr.
1352   /// Once a device is initialized, the pointer is stored in the position given
1353   /// by its device id. A position with nullptr means that the corresponding
1354   /// device was not initialized yet.
1355   llvm::SmallVector<GenericDeviceTy *> Devices;
1356 
1357   /// Pointer to the global handler for this plugin.
1358   GenericGlobalHandlerTy *GlobalHandler;
1359 
1360   /// Internal allocator for different structures.
1361   BumpPtrAllocator Allocator;
1362 
1363   /// The JIT engine shared by all devices connected to this plugin.
1364   JITEngine JIT;
1365 
1366   /// The interface between the plugin and the GPU for host services.
1367   RPCServerTy *RPCServer;
1368 
1369   /// The record and replay mechanism for this plugin.
1370   RecordReplayTy *RecordReplay;
1371 };
1372 
1373 namespace Plugin {
1374 /// Create a success error. This is the same as calling Error::success(), but
1375 /// it is recommended to use this one for consistency with Plugin::error() and
1376 /// Plugin::check().
1377 static inline Error success() { return Error::success(); }
1378 
1379 /// Create a string error.
1380 template <typename... ArgsTy>
1381 static Error error(const char *ErrFmt, ArgsTy... Args) {
1382   return createStringError(inconvertibleErrorCode(), ErrFmt, Args...);
1383 }
1384 
1385 /// Check the plugin-specific error code and return an error or success
1386 /// accordingly. In case of an error, create a string error with the error
1387 /// description. The ErrFmt should follow the format:
1388 ///     "Error in <function name>[<optional info>]: %s"
1389 /// The last format specifier "%s" is mandatory and will be used to place the
1390 /// error code's description. Note that this function should only be called from
1391 /// plugin-specific code.
1392 /// TODO: Refactor this, must be defined individually by each plugin.
1393 template <typename... ArgsTy>
1394 static Error check(int32_t ErrorCode, const char *ErrFmt, ArgsTy... Args);
1395 } // namespace Plugin
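
// A hedged usage sketch: how plugin code might combine Plugin::check() and
// Plugin::error(). The vendor entry point `acmeMemAlloc` and its integer
// result code are hypothetical placeholders, not part of this header.
//
//   Error allocateOnDevice(void **Ptr, size_t Size) {
//     int32_t Res = acmeMemAlloc(Ptr, Size);
//     // The mandatory trailing "%s" receives the error code's description.
//     return Plugin::check(Res, "Error in acmeMemAlloc[%zu bytes]: %s", Size);
//   }
//
//   Error reportBadValue(int32_t Value) {
//     return Plugin::error("Unexpected value %d in the plugin", Value);
//   }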
1396 
1397 /// Auxiliary interface class for GenericDeviceResourceManagerTy. This class
1398 /// acts as a reference to a device resource, such as a stream, and requires
1399 /// some basic functions to be implemented. The derived class should define an
1400 /// empty constructor that creates an empty and invalid resource reference. Do
1401 /// not create the resource in the constructor; create it in create() instead.
1402 ///
1403 /// The derived class should also define the type HandleTy as the underlying
1404 /// resource handle type. For instance, in a CUDA stream it would be:
1405 ///   using HandleTy = CUstream;
1406 struct GenericDeviceResourceRef {
1407   /// Create a new resource and store a reference to it.
1408   virtual Error create(GenericDeviceTy &Device) = 0;
1409 
1410   /// Destroy and release the resource pointed to by the reference.
1411   virtual Error destroy(GenericDeviceTy &Device) = 0;
1412 
1413 protected:
1414   ~GenericDeviceResourceRef() = default;
1415 };
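
// A minimal sketch (hypothetical names) of a derived resource reference for a
// vendor stream type `acmeStream_t`, following the requirements above: a
// default constructor that leaves the reference invalid, a HandleTy typedef,
// and create()/destroy() that allocate and release the underlying resource.
// The converting constructor and conversion operator let the resource manager
// below move between references and raw handles.
//
//   struct AcmeStreamRef final : public GenericDeviceResourceRef {
//     using HandleTy = acmeStream_t;
//
//     AcmeStreamRef() = default;
//     AcmeStreamRef(HandleTy Stream) : Stream(Stream) {}
//
//     Error create(GenericDeviceTy &Device) override {
//       return Plugin::check(acmeStreamCreate(&Stream),
//                            "Error in acmeStreamCreate: %s");
//     }
//
//     Error destroy(GenericDeviceTy &Device) override {
//       return Plugin::check(acmeStreamDestroy(Stream),
//                            "Error in acmeStreamDestroy: %s");
//     }
//
//     operator HandleTy() const { return Stream; }
//
//   private:
//     HandleTy Stream = nullptr;
//   };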
1416 
1417 /// Class that implements a resource pool belonging to a device. This class
1418 /// operates with references to the actual resources. These references must
1419 /// derive from the GenericDeviceResourceRef class and implement the create
1420 /// and destroy virtual functions.
1421 template <typename ResourceRef> class GenericDeviceResourceManagerTy {
1422   using ResourcePoolTy = GenericDeviceResourceManagerTy<ResourceRef>;
1423   using ResourceHandleTy = typename ResourceRef::HandleTy;
1424 
1425 public:
1426   /// Create an empty resource pool for a specific device.
1427   GenericDeviceResourceManagerTy(GenericDeviceTy &Device)
1428       : Device(Device), NextAvailable(0) {}
1429 
1430   /// Destroy the resource pool. At this point, the deinit() function should
1431   /// already have been executed so the resource pool should be empty.
1432   virtual ~GenericDeviceResourceManagerTy() {
1433     assert(ResourcePool.empty() && "Resource pool not empty");
1434   }
1435 
1436   /// Initialize the resource pool.
1437   Error init(uint32_t InitialSize) {
1438     assert(ResourcePool.empty() && "Resource pool already initialized");
1439     return ResourcePoolTy::resizeResourcePool(InitialSize);
1440   }
1441 
1442   /// Deinitialize the resource pool and delete all resources. This function
1443   /// must be called before the destructor.
1444   virtual Error deinit() {
1445     if (NextAvailable)
1446       DP("Missing %u resources to be returned\n", NextAvailable);
1447 
1448     // TODO: This works around a libomptarget bug that would otherwise make the
1449     // plugins fail. Some resources may not have been returned; do not destroy them.
1450     if (auto Err = ResourcePoolTy::resizeResourcePool(NextAvailable))
1451       return Err;
1452 
1453     ResourcePool.clear();
1454 
1455     return Plugin::success();
1456   }
1457 
1458   /// Get a resource from the pool or create a new one. If the function
1459   /// succeeds, the handle to the resource is saved in \p Handle.
1460   virtual Error getResource(ResourceHandleTy &Handle) {
1461     // Get a resource with an empty resource processor.
1462     return getResourcesImpl(1, &Handle,
1463                             [](ResourceHandleTy) { return Plugin::success(); });
1464   }
1465 
1466   /// Get multiple resources from the pool or create new ones. If the function
1467   /// succeeds, the handles to the resources are saved in \p Handles.
1468   virtual Error getResources(uint32_t Num, ResourceHandleTy *Handles) {
1469     // Get resources with an empty resource processor.
1470     return getResourcesImpl(Num, Handles,
1471                             [](ResourceHandleTy) { return Plugin::success(); });
1472   }
1473 
1474   /// Return resource to the pool.
1475   virtual Error returnResource(ResourceHandleTy Handle) {
1476     // Return a resource with an empty resource processor.
1477     return returnResourceImpl(
1478         Handle, [](ResourceHandleTy) { return Plugin::success(); });
1479   }
1480 
1481 protected:
1482   /// Get multiple resources from the pool or create new ones. If the function
1483   /// succeeds, the handles to the resources are saved in \p Handles. Also
1484   /// process each of the obtained resources with \p Processor.
1485   template <typename FuncTy>
1486   Error getResourcesImpl(uint32_t Num, ResourceHandleTy *Handles,
1487                          FuncTy Processor) {
1488     const std::lock_guard<std::mutex> Lock(Mutex);
1489 
1490     assert(NextAvailable <= ResourcePool.size() &&
1491            "Resource pool is corrupted");
1492 
1493     if (NextAvailable + Num > ResourcePool.size())
1494       // Double the resource pool or resize it to provide the requested ones.
1495       if (auto Err = ResourcePoolTy::resizeResourcePool(
1496               std::max(NextAvailable * 2, NextAvailable + Num)))
1497         return Err;
1498 
1499     // Save the handles in the output array parameter.
1500     for (uint32_t r = 0; r < Num; ++r)
1501       Handles[r] = ResourcePool[NextAvailable + r];
1502 
1503     // Process all obtained resources.
1504     for (uint32_t r = 0; r < Num; ++r)
1505       if (auto Err = Processor(Handles[r]))
1506         return Err;
1507 
1508     NextAvailable += Num;
1509 
1510     return Plugin::success();
1511   }
1512 
1513   /// Return resource to the pool and process the resource with \p Processor.
1514   template <typename FuncTy>
1515   Error returnResourceImpl(ResourceHandleTy Handle, FuncTy Processor) {
1516     const std::lock_guard<std::mutex> Lock(Mutex);
1517 
1518     // Process the returned resource.
1519     if (auto Err = Processor(Handle))
1520       return Err;
1521 
1522     assert(NextAvailable > 0 && "Resource pool is corrupted");
1523     ResourcePool[--NextAvailable] = Handle;
1524 
1525     return Plugin::success();
1526   }
1527 
1528 protected:
1529   /// The resources between \p OldSize and \p NewSize need to be created or
1530   /// destroyed. The mutex is locked when this function is called.
1531   Error resizeResourcePoolImpl(uint32_t OldSize, uint32_t NewSize) {
1532     assert(OldSize != NewSize && "Resizing to the same size");
1533 
1534     if (auto Err = Device.setContext())
1535       return Err;
1536 
1537     if (OldSize < NewSize) {
1538       // Create new resources.
1539       for (uint32_t I = OldSize; I < NewSize; ++I) {
1540         if (auto Err = ResourcePool[I].create(Device))
1541           return Err;
1542       }
1543     } else {
1544       // Destroy the obsolete resources.
1545       for (uint32_t I = NewSize; I < OldSize; ++I) {
1546         if (auto Err = ResourcePool[I].destroy(Device))
1547           return Err;
1548       }
1549     }
1550     return Plugin::success();
1551   }
1552 
1553   /// Increase or decrease the number of resources. This function should
1554   /// be called with the mutex acquired.
1555   Error resizeResourcePool(uint32_t NewSize) {
1556     uint32_t OldSize = ResourcePool.size();
1557 
1558     // Nothing to do.
1559     if (OldSize == NewSize)
1560       return Plugin::success();
1561 
1562     if (OldSize < NewSize) {
1563       // Increase the number of resources.
1564       ResourcePool.resize(NewSize);
1565       return ResourcePoolTy::resizeResourcePoolImpl(OldSize, NewSize);
1566     }
1567 
1568     // Decrease the number of resources otherwise.
1569     auto Err = ResourcePoolTy::resizeResourcePoolImpl(OldSize, NewSize);
1570     ResourcePool.resize(NewSize);
1571 
1572     return Err;
1573   }
1574 
1575   /// The device to which the resources belong.
1576   GenericDeviceTy &Device;
1577 
1578   /// Mutex for the resource pool.
1579   std::mutex Mutex;
1580 
1581   /// The next available resource in the pool.
1582   uint32_t NextAvailable;
1583 
1584   /// The actual resource pool.
1585   std::deque<ResourceRef> ResourcePool;
1586 };
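
// A hedged sketch of how a plugin might instantiate the pool, reusing the
// hypothetical AcmeStreamRef sketched after GenericDeviceResourceRef above.
// init() and deinit() bracket the pool's lifetime, and getResource() /
// returnResource() hand out and recycle raw stream handles.
//
//   using AcmeStreamManagerTy = GenericDeviceResourceManagerTy<AcmeStreamRef>;
//
//   Error useOneStream(GenericDeviceTy &Device) {
//     AcmeStreamManagerTy StreamManager(Device);
//     if (auto Err = StreamManager.init(/*InitialSize=*/32))
//       return Err;
//
//     acmeStream_t Stream;
//     if (auto Err = StreamManager.getResource(Stream))
//       return Err;
//
//     // ... enqueue work on Stream ...
//
//     if (auto Err = StreamManager.returnResource(Stream))
//       return Err;
//
//     // deinit() must run before the manager is destroyed.
//     return StreamManager.deinit();
//   }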
1587 
1588 } // namespace plugin
1589 } // namespace target
1590 } // namespace omp
1591 } // namespace llvm
1592 
1593 #endif // OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_PLUGININTERFACE_H
1594