xref: /llvm-project/offload/plugins-nextgen/amdgpu/src/rtl.cpp (revision 134401deea5e86d646bb99fab39c182cfa8e5292)
1 //===----RTLs/amdgpu/src/rtl.cpp - Target RTLs Implementation ----- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // RTL NextGen for AMDGPU machine
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include <atomic>
14 #include <cassert>
15 #include <cstddef>
16 #include <cstdint>
17 #include <deque>
18 #include <functional>
19 #include <mutex>
20 #include <string>
21 #include <system_error>
22 #include <unistd.h>
23 #include <unordered_map>
24 
25 #include "ErrorReporting.h"
26 #include "Shared/APITypes.h"
27 #include "Shared/Debug.h"
28 #include "Shared/Environment.h"
29 #include "Shared/RefCnt.h"
30 #include "Shared/Utils.h"
31 #include "Utils/ELF.h"
32 
33 #include "GlobalHandler.h"
34 #include "OpenMP/OMPT/Callback.h"
35 #include "PluginInterface.h"
36 #include "UtilitiesRTL.h"
37 #include "omptarget.h"
38 
39 #include "llvm/ADT/SmallString.h"
40 #include "llvm/ADT/SmallVector.h"
41 #include "llvm/ADT/StringRef.h"
42 #include "llvm/BinaryFormat/ELF.h"
43 #include "llvm/Frontend/OpenMP/OMPConstants.h"
44 #include "llvm/Frontend/OpenMP/OMPGridValues.h"
45 #include "llvm/Support/Error.h"
46 #include "llvm/Support/FileOutputBuffer.h"
47 #include "llvm/Support/FileSystem.h"
48 #include "llvm/Support/MemoryBuffer.h"
49 #include "llvm/Support/Program.h"
50 #include "llvm/Support/Signals.h"
51 #include "llvm/Support/raw_ostream.h"
52 
53 #if !defined(__BYTE_ORDER__) || !defined(__ORDER_LITTLE_ENDIAN__) ||           \
54     !defined(__ORDER_BIG_ENDIAN__)
55 #error "Missing preprocessor definitions for endianness detection."
56 #endif
57 
58 // The HSA headers require these definitions.
59 #if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
60 #define LITTLEENDIAN_CPU
61 #elif defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
62 #define BIGENDIAN_CPU
63 #endif
64 
65 #if defined(__has_include)
66 #if __has_include("hsa.h")
67 #include "hsa.h"
68 #include "hsa_ext_amd.h"
69 #elif __has_include("hsa/hsa.h")
70 #include "hsa/hsa.h"
71 #include "hsa/hsa_ext_amd.h"
72 #endif
73 #else
74 #include "hsa/hsa.h"
75 #include "hsa/hsa_ext_amd.h"
76 #endif
77 
78 namespace llvm {
79 namespace omp {
80 namespace target {
81 namespace plugin {
82 
83 /// Forward declarations for all specialized data structures.
84 struct AMDGPUKernelTy;
85 struct AMDGPUDeviceTy;
86 struct AMDGPUPluginTy;
87 struct AMDGPUStreamTy;
88 struct AMDGPUEventTy;
89 struct AMDGPUStreamManagerTy;
90 struct AMDGPUEventManagerTy;
91 struct AMDGPUDeviceImageTy;
92 struct AMDGPUMemoryManagerTy;
93 struct AMDGPUMemoryPoolTy;
94 
95 namespace hsa_utils {
96 
97 /// Iterate elements using an HSA iterate function. Do not use this function
98 /// directly; use the specialized ones below instead.
99 template <typename ElemTy, typename IterFuncTy, typename CallbackTy>
100 hsa_status_t iterate(IterFuncTy Func, CallbackTy Cb) {
101   auto L = [](ElemTy Elem, void *Data) -> hsa_status_t {
102     CallbackTy *Unwrapped = static_cast<CallbackTy *>(Data);
103     return (*Unwrapped)(Elem);
104   };
105   return Func(L, static_cast<void *>(&Cb));
106 }
107 
108 /// Iterate elements using an HSA iterate function passing a parameter. Do not
109 /// use this function directly; use the specialized ones below instead.
110 template <typename ElemTy, typename IterFuncTy, typename IterFuncArgTy,
111           typename CallbackTy>
112 hsa_status_t iterate(IterFuncTy Func, IterFuncArgTy FuncArg, CallbackTy Cb) {
113   auto L = [](ElemTy Elem, void *Data) -> hsa_status_t {
114     CallbackTy *Unwrapped = static_cast<CallbackTy *>(Data);
115     return (*Unwrapped)(Elem);
116   };
117   return Func(FuncArg, L, static_cast<void *>(&Cb));
118 }
119 
120 /// Iterate pairs of elements using an HSA iterate function passing a
121 /// parameter. Do not use this function directly; use the wrappers below.
122 template <typename Elem1Ty, typename Elem2Ty, typename IterFuncTy,
123           typename IterFuncArgTy, typename CallbackTy>
124 hsa_status_t iterate(IterFuncTy Func, IterFuncArgTy FuncArg, CallbackTy Cb) {
125   auto L = [](Elem1Ty Elem1, Elem2Ty Elem2, void *Data) -> hsa_status_t {
126     CallbackTy *Unwrapped = static_cast<CallbackTy *>(Data);
127     return (*Unwrapped)(Elem1, Elem2);
128   };
129   return Func(FuncArg, L, static_cast<void *>(&Cb));
130 }
131 
132 /// Iterate agents.
133 template <typename CallbackTy> Error iterateAgents(CallbackTy Callback) {
134   hsa_status_t Status = iterate<hsa_agent_t>(hsa_iterate_agents, Callback);
135   return Plugin::check(Status, "Error in hsa_iterate_agents: %s");
136 }
137 
138 /// Iterate ISAs of an agent.
139 template <typename CallbackTy>
140 Error iterateAgentISAs(hsa_agent_t Agent, CallbackTy Cb) {
141   hsa_status_t Status = iterate<hsa_isa_t>(hsa_agent_iterate_isas, Agent, Cb);
142   return Plugin::check(Status, "Error in hsa_agent_iterate_isas: %s");
143 }
144 
145 /// Iterate memory pools of an agent.
146 template <typename CallbackTy>
147 Error iterateAgentMemoryPools(hsa_agent_t Agent, CallbackTy Cb) {
148   hsa_status_t Status = iterate<hsa_amd_memory_pool_t>(
149       hsa_amd_agent_iterate_memory_pools, Agent, Cb);
150   return Plugin::check(Status,
151                        "Error in hsa_amd_agent_iterate_memory_pools: %s");
152 }
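
/// Illustrative sketch (hypothetical helper, not used by the plugin): a
/// typical use of the iterateAgents wrapper above. The callback returns
/// HSA_STATUS_SUCCESS to keep iterating over the remaining agents.
static Expected<unsigned> countAgentsExample() {
  unsigned NumAgents = 0;
  if (auto Err = iterateAgents([&](hsa_agent_t) {
        ++NumAgents;
        return HSA_STATUS_SUCCESS;
      }))
    return std::move(Err);
  return NumAgents;
}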
153 
154 /// Dispatches an asynchronous memory copy, optionally alternating between the
155 /// available SDMA engines in a round-robin fashion.
156 Error asyncMemCopy(bool UseMultipleSdmaEngines, void *Dst, hsa_agent_t DstAgent,
157                    const void *Src, hsa_agent_t SrcAgent, size_t Size,
158                    uint32_t NumDepSignals, const hsa_signal_t *DepSignals,
159                    hsa_signal_t CompletionSignal) {
160   if (!UseMultipleSdmaEngines) {
161     hsa_status_t S =
162         hsa_amd_memory_async_copy(Dst, DstAgent, Src, SrcAgent, Size,
163                                   NumDepSignals, DepSignals, CompletionSignal);
164     return Plugin::check(S, "Error in hsa_amd_memory_async_copy: %s");
165   }
166 
167 // The on-engine copy API used below requires the ROCm 5.7 HSA interface.
168 #if !(HSA_AMD_INTERFACE_VERSION_MAJOR >= 1 &&                                  \
169       HSA_AMD_INTERFACE_VERSION_MINOR >= 2)
170   return Plugin::error("Async copy on selected SDMA requires ROCm 5.7");
171 #else
172   static std::atomic<int> SdmaEngine{1};
173 
174   // This atomics solution is probably not the best, but should be sufficient
175   // for now.
176   // In the worst case, threads that read the same value will dispatch to the
177   // same SDMA engine. This may result in sub-optimal performance, but the
178   // likelihood of that happening should be fairly low.
179   int LocalSdmaEngine = SdmaEngine.load(std::memory_order_acquire);
180   // This call is only available in ROCm >= 5.7.
181   hsa_status_t S = hsa_amd_memory_async_copy_on_engine(
182       Dst, DstAgent, Src, SrcAgent, Size, NumDepSignals, DepSignals,
183       CompletionSignal, (hsa_amd_sdma_engine_id_t)LocalSdmaEngine,
184       /*force_copy_on_sdma=*/true);
185   // Advance to the other of the two SDMA engines: 0x1, 0x2.
186   LocalSdmaEngine = (LocalSdmaEngine << 1) % 3;
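  // For example, 1 becomes (1 << 1) % 3 == 2 and 2 becomes (2 << 1) % 3 == 1,
  // so the engine id ping-pongs between the two valid values.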
187   SdmaEngine.store(LocalSdmaEngine, std::memory_order_relaxed);
188 
189   return Plugin::check(S, "Error in hsa_amd_memory_async_copy_on_engine: %s");
190 #endif
191 }
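
/// Illustrative sketch (hypothetical helper, not used by the plugin): a
/// blocking wrapper around asyncMemCopy above. It creates a one-shot
/// completion signal, dispatches the copy with no dependencies, and blocks
/// until the copy decrements the signal to zero.
static Error syncMemCopyExample(void *Dst, hsa_agent_t DstAgent,
                                const void *Src, hsa_agent_t SrcAgent,
                                size_t Size) {
  hsa_signal_t Signal;
  hsa_status_t Status = hsa_signal_create(1, 0, nullptr, &Signal);
  if (auto Err = Plugin::check(Status, "Error in hsa_signal_create: %s"))
    return Err;

  if (auto Err = asyncMemCopy(/*UseMultipleSdmaEngines=*/false, Dst, DstAgent,
                              Src, SrcAgent, Size, /*NumDepSignals=*/0,
                              /*DepSignals=*/nullptr, Signal))
    return Err;

  // Block until the copy's completion handler drops the signal to zero.
  while (hsa_signal_wait_scacquire(Signal, HSA_SIGNAL_CONDITION_EQ, 0,
                                   UINT64_MAX, HSA_WAIT_STATE_BLOCKED) != 0)
    ;

  Status = hsa_signal_destroy(Signal);
  return Plugin::check(Status, "Error in hsa_signal_destroy: %s");
}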
192 
193 Error getTargetTripleAndFeatures(hsa_agent_t Agent,
194                                  SmallVector<SmallString<32>> &Targets) {
195   auto Err = hsa_utils::iterateAgentISAs(Agent, [&](hsa_isa_t ISA) {
196     uint32_t Length;
197     hsa_status_t Status;
198     Status = hsa_isa_get_info_alt(ISA, HSA_ISA_INFO_NAME_LENGTH, &Length);
199     if (Status != HSA_STATUS_SUCCESS)
200       return Status;
201 
202     llvm::SmallVector<char> ISAName(Length);
203     Status = hsa_isa_get_info_alt(ISA, HSA_ISA_INFO_NAME, ISAName.begin());
204     if (Status != HSA_STATUS_SUCCESS)
205       return Status;
206 
207     llvm::StringRef TripleTarget(ISAName.begin(), Length);
208     if (TripleTarget.consume_front("amdgcn-amd-amdhsa")) {
209       auto Target = TripleTarget.ltrim('-').rtrim('\0');
210       Targets.push_back(Target);
211     }
212     return HSA_STATUS_SUCCESS;
213   });
214   return Err;
215 }
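
/// Illustrative sketch (hypothetical helper, not used by the plugin): print
/// the offload target strings an agent reports through the helper above,
/// e.g. "gfx90a:sramecc+:xnack-", one per line.
static Error printAgentTargetsExample(hsa_agent_t Agent) {
  SmallVector<SmallString<32>> Targets;
  if (auto Err = getTargetTripleAndFeatures(Agent, Targets))
    return Err;
  for (const SmallString<32> &Target : Targets)
    llvm::errs() << Target.str() << "\n";
  return Plugin::success();
}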
216 } // namespace hsa_utils
217 
218 /// Utility class representing generic resource references to AMDGPU resources.
219 template <typename ResourceTy>
220 struct AMDGPUResourceRef : public GenericDeviceResourceRef {
221   /// The underlying handle type for resources.
222   using HandleTy = ResourceTy *;
223 
224   /// Create an empty reference to an invalid resource.
225   AMDGPUResourceRef() : Resource(nullptr) {}
226 
227   /// Create a reference to an existing resource.
228   AMDGPUResourceRef(HandleTy Resource) : Resource(Resource) {}
229 
230   virtual ~AMDGPUResourceRef() {}
231 
232   /// Create a new resource and save the reference. The reference must be empty
233   /// before calling this function.
234   Error create(GenericDeviceTy &Device) override;
235 
236   /// Destroy the referenced resource and invalidate the reference. The
237   /// reference must be to a valid resource before calling this function.
238   Error destroy(GenericDeviceTy &Device) override {
239     if (!Resource)
240       return Plugin::error("Destroying an invalid resource");
241 
242     if (auto Err = Resource->deinit())
243       return Err;
244 
245     delete Resource;
246 
247     Resource = nullptr;
248     return Plugin::success();
249   }
250 
251   /// Get the underlying resource handle.
252   operator HandleTy() const { return Resource; }
253 
254 private:
255   /// The handle to the actual resource.
256   HandleTy Resource;
257 };
258 
259 /// Class holding an HSA memory pool.
260 struct AMDGPUMemoryPoolTy {
261   /// Create a memory pool from an HSA memory pool.
262   AMDGPUMemoryPoolTy(hsa_amd_memory_pool_t MemoryPool)
263       : MemoryPool(MemoryPool), GlobalFlags(0) {}
264 
265   /// Initialize the memory pool retrieving its properties.
266   Error init() {
267     if (auto Err = getAttr(HSA_AMD_MEMORY_POOL_INFO_SEGMENT, Segment))
268       return Err;
269 
270     if (auto Err = getAttr(HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, GlobalFlags))
271       return Err;
272 
273     return Plugin::success();
274   }
275 
276   /// Getter of the HSA memory pool.
277   hsa_amd_memory_pool_t get() const { return MemoryPool; }
278 
279   /// Indicate which segment the memory pool belongs to.
280   bool isGlobal() const { return (Segment == HSA_AMD_SEGMENT_GLOBAL); }
281   bool isReadOnly() const { return (Segment == HSA_AMD_SEGMENT_READONLY); }
282   bool isPrivate() const { return (Segment == HSA_AMD_SEGMENT_PRIVATE); }
283   bool isGroup() const { return (Segment == HSA_AMD_SEGMENT_GROUP); }
284 
285   /// Indicate if it is fine-grained memory. Valid only for global.
286   bool isFineGrained() const {
287     assert(isGlobal() && "Not global memory");
288     return (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED);
289   }
290 
291   /// Indicate if it is coarse-grained memory. Valid only for global.
292   bool isCoarseGrained() const {
293     assert(isGlobal() && "Not global memory");
294     return (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED);
295   }
296 
297   /// Indicate if it supports storing kernel arguments. Valid only for global.
298   bool supportsKernelArgs() const {
299     assert(isGlobal() && "Not global memory");
300     return (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT);
301   }
302 
303   /// Allocate memory on the memory pool.
304   Error allocate(size_t Size, void **PtrStorage) {
305     hsa_status_t Status =
306         hsa_amd_memory_pool_allocate(MemoryPool, Size, 0, PtrStorage);
307     return Plugin::check(Status, "Error in hsa_amd_memory_pool_allocate: %s");
308   }
309 
310   /// Return memory to the memory pool.
311   Error deallocate(void *Ptr) {
312     hsa_status_t Status = hsa_amd_memory_pool_free(Ptr);
313     return Plugin::check(Status, "Error in hsa_amd_memory_pool_free: %s");
314   }
315 
316   /// Returns whether the \p Agent can access the memory pool.
317   bool canAccess(hsa_agent_t Agent) {
318     hsa_amd_memory_pool_access_t Access;
319     if (hsa_amd_agent_memory_pool_get_info(
320             Agent, MemoryPool, HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, &Access))
321       return false;
322     return Access != HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED;
323   }
324 
325   /// Allow the device to access a specific allocation.
326   Error enableAccess(void *Ptr, int64_t Size,
327                      const llvm::SmallVector<hsa_agent_t> &Agents) const {
328 #ifdef OMPTARGET_DEBUG
329     for (hsa_agent_t Agent : Agents) {
330       hsa_amd_memory_pool_access_t Access;
331       if (auto Err =
332               getAttr(Agent, HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, Access))
333         return Err;
334 
335       // The agent is not allowed to access the memory pool in any case. Do not
336       // continue because otherwise it would result in undefined behavior.
337       if (Access == HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED)
338         return Plugin::error("An agent is not allowed to access a memory pool");
339     }
340 #endif
341 
342     // The agents can access the pool, but it is disabled by default; enable it.
343     hsa_status_t Status =
344         hsa_amd_agents_allow_access(Agents.size(), Agents.data(), nullptr, Ptr);
345     return Plugin::check(Status, "Error in hsa_amd_agents_allow_access: %s");
346   }
347 
348   /// Get attribute from the memory pool.
349   template <typename Ty>
350   Error getAttr(hsa_amd_memory_pool_info_t Kind, Ty &Value) const {
351     hsa_status_t Status;
352     Status = hsa_amd_memory_pool_get_info(MemoryPool, Kind, &Value);
353     return Plugin::check(Status, "Error in hsa_amd_memory_pool_get_info: %s");
354   }
355 
356   template <typename Ty>
357   hsa_status_t getAttrRaw(hsa_amd_memory_pool_info_t Kind, Ty &Value) const {
358     return hsa_amd_memory_pool_get_info(MemoryPool, Kind, &Value);
359   }
360 
361   /// Get attribute from the memory pool relating to an agent.
362   template <typename Ty>
363   Error getAttr(hsa_agent_t Agent, hsa_amd_agent_memory_pool_info_t Kind,
364                 Ty &Value) const {
365     hsa_status_t Status;
366     Status =
367         hsa_amd_agent_memory_pool_get_info(Agent, MemoryPool, Kind, &Value);
368     return Plugin::check(Status,
369                          "Error in hsa_amd_agent_memory_pool_get_info: %s");
370   }
371 
372 private:
373   /// The HSA memory pool.
374   hsa_amd_memory_pool_t MemoryPool;
375 
376   /// The segment to which the memory pool belongs.
377   hsa_amd_segment_t Segment;
378 
379   /// The global flags of the memory pool. Only valid if the memory pool
380   /// belongs to the global segment.
381   uint32_t GlobalFlags;
382 };
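
/// Illustrative sketch (hypothetical helper, not used by the plugin): collect
/// the fine-grained global memory pools of an agent by combining the
/// hsa_utils::iterateAgentMemoryPools wrapper with the predicates above.
static Error
collectFineGrainedPoolsExample(hsa_agent_t Agent,
                               SmallVectorImpl<AMDGPUMemoryPoolTy> &Out) {
  return hsa_utils::iterateAgentMemoryPools(
      Agent, [&](hsa_amd_memory_pool_t HSAPool) {
        AMDGPUMemoryPoolTy Pool(HSAPool);
        // Abort the iteration if the pool's properties cannot be read.
        if (auto Err = Pool.init()) {
          consumeError(std::move(Err));
          return HSA_STATUS_ERROR;
        }
        if (Pool.isGlobal() && Pool.isFineGrained())
          Out.push_back(Pool);
        return HSA_STATUS_SUCCESS;
      });
}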
383 
384 /// Class that implements a memory manager that gets memory from a specific
385 /// memory pool.
386 struct AMDGPUMemoryManagerTy : public DeviceAllocatorTy {
387 
388   /// Create an empty memory manager.
389   AMDGPUMemoryManagerTy(AMDGPUPluginTy &Plugin)
390       : Plugin(Plugin), MemoryPool(nullptr), MemoryManager(nullptr) {}
391 
392   /// Initialize the memory manager from a memory pool.
393   Error init(AMDGPUMemoryPoolTy &MemoryPool) {
394     const uint32_t Threshold = 1 << 30;
395     this->MemoryManager = new MemoryManagerTy(*this, Threshold);
396     this->MemoryPool = &MemoryPool;
397     return Plugin::success();
398   }
399 
400   /// Deinitialize the memory manager and free its allocations.
401   Error deinit() {
402     assert(MemoryManager && "Invalid memory manager");
403 
404     // Delete and invalidate the memory manager. At this point, the memory
405     // manager will deallocate all its allocations.
406     delete MemoryManager;
407     MemoryManager = nullptr;
408 
409     return Plugin::success();
410   }
411 
412   /// Reuse or allocate memory through the memory manager.
413   Error allocate(size_t Size, void **PtrStorage) {
414     assert(MemoryManager && "Invalid memory manager");
415     assert(PtrStorage && "Invalid pointer storage");
416 
417     *PtrStorage = MemoryManager->allocate(Size, nullptr);
418     if (*PtrStorage == nullptr)
419       return Plugin::error("Failure to allocate from AMDGPU memory manager");
420 
421     return Plugin::success();
422   }
423 
424   /// Release an allocation to be reused.
425   Error deallocate(void *Ptr) {
426     assert(Ptr && "Invalid pointer");
427 
428     if (MemoryManager->free(Ptr))
429       return Plugin::error("Failure to deallocate from AMDGPU memory manager");
430 
431     return Plugin::success();
432   }
433 
434 private:
435   /// Allocation callback that will be called once the memory manager does not
436   /// have more previously allocated buffers.
437   void *allocate(size_t Size, void *HstPtr, TargetAllocTy Kind) override;
438 
439   /// Deallocation callback that will be called by the memory manager.
440   int free(void *TgtPtr, TargetAllocTy Kind) override {
441     if (auto Err = MemoryPool->deallocate(TgtPtr)) {
442       consumeError(std::move(Err));
443       return OFFLOAD_FAIL;
444     }
445     return OFFLOAD_SUCCESS;
446   }
447 
448   /// The underlying plugin that owns this memory manager.
449   AMDGPUPluginTy &Plugin;
450 
451   /// The memory pool used to allocate memory.
452   AMDGPUMemoryPoolTy *MemoryPool;
453 
454   /// Reference to the actual memory manager.
455   MemoryManagerTy *MemoryManager;
456 };
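
/// Illustrative sketch (hypothetical helper, not used by the plugin): the
/// intended allocate/deallocate round trip through an AMDGPUMemoryManagerTy
/// that was previously set up with init() on a suitable memory pool.
static Error memoryManagerRoundTripExample(AMDGPUMemoryManagerTy &Manager) {
  void *Ptr = nullptr;
  if (auto Err = Manager.allocate(/*Size=*/1024, &Ptr))
    return Err;
  // ... use the buffer, e.g., as pinned staging space for a transfer ...
  return Manager.deallocate(Ptr);
}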
457 
458 /// Class implementing the AMDGPU device images' properties.
459 struct AMDGPUDeviceImageTy : public DeviceImageTy {
460   /// Create the AMDGPU image with the id and the target image pointer.
461   AMDGPUDeviceImageTy(int32_t ImageId, GenericDeviceTy &Device,
462                       const __tgt_device_image *TgtImage)
463       : DeviceImageTy(ImageId, Device, TgtImage) {}
464 
465   /// Prepare and load the executable corresponding to the image.
466   Error loadExecutable(const AMDGPUDeviceTy &Device);
467 
468   /// Unload the executable.
469   Error unloadExecutable() {
470     hsa_status_t Status = hsa_executable_destroy(Executable);
471     return Plugin::check(Status, "Error in hsa_executable_destroy: %s");
472   }
473 
474   /// Get the executable.
475   hsa_executable_t getExecutable() const { return Executable; }
476 
477   /// Get the code object version of the ELF.
478   uint16_t getELFABIVersion() const { return ELFABIVersion; }
479 
480   /// Find an HSA device symbol by its name on the executable.
481   Expected<hsa_executable_symbol_t>
482   findDeviceSymbol(GenericDeviceTy &Device, StringRef SymbolName) const;
483 
484   /// Get additional info for the kernel, e.g., register spill counts.
485   std::optional<offloading::amdgpu::AMDGPUKernelMetaData>
486   getKernelInfo(StringRef Identifier) const {
487     auto It = KernelInfoMap.find(Identifier);
488 
489     if (It == KernelInfoMap.end())
490       return {};
491 
492     return It->second;
493   }
494 
495 private:
496   /// The executable loaded on the agent.
497   hsa_executable_t Executable;
498   StringMap<offloading::amdgpu::AMDGPUKernelMetaData> KernelInfoMap;
499   uint16_t ELFABIVersion;
500 };
501 
502 /// Class implementing the AMDGPU kernel functionalities which derives from the
503 /// generic kernel class.
504 struct AMDGPUKernelTy : public GenericKernelTy {
505   /// Create an AMDGPU kernel with a name and an execution mode.
506   AMDGPUKernelTy(const char *Name) : GenericKernelTy(Name) {}
507 
508   /// Initialize the AMDGPU kernel.
509   Error initImpl(GenericDeviceTy &Device, DeviceImageTy &Image) override {
510     AMDGPUDeviceImageTy &AMDImage = static_cast<AMDGPUDeviceImageTy &>(Image);
511 
512     // Kernel symbols have a ".kd" suffix.
513     std::string KernelName(getName());
514     KernelName += ".kd";
515 
516     // Find the symbol on the device executable.
517     auto SymbolOrErr = AMDImage.findDeviceSymbol(Device, KernelName);
518     if (!SymbolOrErr)
519       return SymbolOrErr.takeError();
520 
521     hsa_executable_symbol_t Symbol = *SymbolOrErr;
522     hsa_symbol_kind_t SymbolType;
523     hsa_status_t Status;
524 
525     // Retrieve different properties of the kernel symbol.
526     std::pair<hsa_executable_symbol_info_t, void *> RequiredInfos[] = {
527         {HSA_EXECUTABLE_SYMBOL_INFO_TYPE, &SymbolType},
528         {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &KernelObject},
529         {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, &ArgsSize},
530         {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &GroupSize},
531         {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK, &DynamicStack},
532         {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &PrivateSize}};
533 
534     for (auto &Info : RequiredInfos) {
535       Status = hsa_executable_symbol_get_info(Symbol, Info.first, Info.second);
536       if (auto Err = Plugin::check(
537               Status, "Error in hsa_executable_symbol_get_info: %s"))
538         return Err;
539     }
540 
541     // Make sure it is a kernel symbol.
542     if (SymbolType != HSA_SYMBOL_KIND_KERNEL)
543       return Plugin::error("Symbol %s is not a kernel function");
544 
545     // TODO: Read the kernel descriptor for the max threads per block. May be
546     // read from the image.
547 
548     ImplicitArgsSize =
549         hsa_utils::getImplicitArgsSize(AMDImage.getELFABIVersion());
550     DP("ELFABIVersion: %d\n", AMDImage.getELFABIVersion());
551 
552     // Get additional kernel info read from image
553     KernelInfo = AMDImage.getKernelInfo(getName());
554     if (!KernelInfo.has_value())
555       INFO(OMP_INFOTYPE_PLUGIN_KERNEL, Device.getDeviceId(),
556            "Could not read extra information for kernel %s.", getName());
557 
558     return Plugin::success();
559   }
560 
561   /// Launch the AMDGPU kernel function.
562   Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads[3],
563                    uint32_t NumBlocks[3], KernelArgsTy &KernelArgs,
564                    KernelLaunchParamsTy LaunchParams,
565                    AsyncInfoWrapperTy &AsyncInfoWrapper) const override;
566 
567   /// Print more elaborate kernel launch info for AMDGPU
568   Error printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
569                                KernelArgsTy &KernelArgs, uint32_t NumThreads[3],
570                                uint32_t NumBlocks[3]) const override;
571 
572   /// Get group and private segment kernel size.
573   uint32_t getGroupSize() const { return GroupSize; }
574   uint32_t getPrivateSize() const { return PrivateSize; }
575 
576   /// Get the HSA kernel object representing the kernel function.
577   uint64_t getKernelObject() const { return KernelObject; }
578 
579   /// Get the size of the implicit arguments based on the code object version.
580   /// @return 56 for cov4 and 256 for cov5
581   uint32_t getImplicitArgsSize() const { return ImplicitArgsSize; }
582 
583   /// Indicates whether or not we need to set up our own private segment size.
584   bool usesDynamicStack() const { return DynamicStack; }
585 
586 private:
587   /// The kernel object to execute.
588   uint64_t KernelObject;
589 
590   /// The args, group and private segments sizes required by a kernel instance.
591   uint32_t ArgsSize;
592   uint32_t GroupSize;
593   uint32_t PrivateSize;
594   bool DynamicStack;
595 
596   /// The size of implicit kernel arguments.
597   uint32_t ImplicitArgsSize;
598 
599   /// Additional Info for the AMD GPU Kernel
600   std::optional<offloading::amdgpu::AMDGPUKernelMetaData> KernelInfo;
601 };
602 
603 /// Class representing an HSA signal. Signals are used to define dependencies
604 /// between asynchronous operations: kernel launches and memory transfers.
605 struct AMDGPUSignalTy {
606   /// Create an empty signal.
607   AMDGPUSignalTy() : HSASignal({0}), UseCount() {}
608   AMDGPUSignalTy(AMDGPUDeviceTy &Device) : HSASignal({0}), UseCount() {}
609 
610   /// Initialize the signal with an initial value.
611   Error init(uint32_t InitialValue = 1) {
612     hsa_status_t Status =
613         hsa_amd_signal_create(InitialValue, 0, nullptr, 0, &HSASignal);
614     return Plugin::check(Status, "Error in hsa_signal_create: %s");
615   }
616 
617   /// Deinitialize the signal.
618   Error deinit() {
619     hsa_status_t Status = hsa_signal_destroy(HSASignal);
620     return Plugin::check(Status, "Error in hsa_signal_destroy: %s");
621   }
622 
623   /// Wait until the signal gets a zero value.
624   Error wait(const uint64_t ActiveTimeout = 0,
625              GenericDeviceTy *Device = nullptr) const {
626     if (ActiveTimeout) {
627       hsa_signal_value_t Got = 1;
628       Got = hsa_signal_wait_scacquire(HSASignal, HSA_SIGNAL_CONDITION_EQ, 0,
629                                       ActiveTimeout, HSA_WAIT_STATE_ACTIVE);
630       if (Got == 0)
631         return Plugin::success();
632     }
633 
634     // Block indefinitely until the signal reaches zero.
635     uint64_t Timeout = UINT64_MAX;
636     auto WaitState = HSA_WAIT_STATE_BLOCKED;
637     while (hsa_signal_wait_scacquire(HSASignal, HSA_SIGNAL_CONDITION_EQ, 0,
638                                      Timeout, WaitState) != 0)
639       ;
640     return Plugin::success();
641   }
642 
643   /// Load the value on the signal.
644   hsa_signal_value_t load() const {
645     return hsa_signal_load_scacquire(HSASignal);
646   }
647 
648   /// Decrement the signal by one.
649   void signal() {
650     assert(load() > 0 && "Invalid signal value");
651     hsa_signal_subtract_screlease(HSASignal, 1);
652   }
653 
654   /// Reset the signal value before reusing the signal. Do not call this
655   /// function if the signal is being currently used by any watcher, such as a
656   /// plugin thread or the HSA runtime.
657   void reset() { hsa_signal_store_screlease(HSASignal, 1); }
658 
659   /// Increase the number of concurrent uses.
660   void increaseUseCount() { UseCount.increase(); }
661 
662   /// Decrease the number of concurrent uses and return whether it was the last.
663   bool decreaseUseCount() { return UseCount.decrease(); }
664 
665   hsa_signal_t get() const { return HSASignal; }
666 
667 private:
668   /// The underlying HSA signal.
669   hsa_signal_t HSASignal;
670 
671   /// Reference counter for tracking the concurrent use count. This is mainly
672   /// used for knowing how many streams are using the signal.
673   RefCountTy<> UseCount;
674 };
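
/// Illustrative sketch (hypothetical helper, not used by the plugin): the
/// signal life cycle the stream code below relies on. A signal starts at one,
/// an asynchronous producer (kernel or copy) decrements it to zero on
/// completion, and consumers block in wait() until that happens. Here the
/// decrement is done inline so the wait returns immediately.
static Error signalLifeCycleExample() {
  AMDGPUSignalTy Signal;
  if (auto Err = Signal.init(/*InitialValue=*/1))
    return Err;

  // Stand-in for the asynchronous producer's completion.
  Signal.signal();

  if (auto Err = Signal.wait())
    return Err;

  // Signals are reusable: reset() restores the initial value of one.
  Signal.reset();
  return Signal.deinit();
}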
675 
676 /// Classes for holding AMDGPU signals and managing signals.
677 using AMDGPUSignalRef = AMDGPUResourceRef<AMDGPUSignalTy>;
678 using AMDGPUSignalManagerTy = GenericDeviceResourceManagerTy<AMDGPUSignalRef>;
679 
680 /// Class holding an HSA queue to submit kernel and barrier packets.
681 struct AMDGPUQueueTy {
682   /// Create an empty queue.
683   AMDGPUQueueTy() : Queue(nullptr), Mutex(), NumUsers(0) {}
684 
685   /// Lazily initialize a new queue belonging to a specific agent.
686   Error init(GenericDeviceTy &Device, hsa_agent_t Agent, int32_t QueueSize) {
687     if (Queue)
688       return Plugin::success();
689     hsa_status_t Status =
690         hsa_queue_create(Agent, QueueSize, HSA_QUEUE_TYPE_MULTI, callbackError,
691                          &Device, UINT32_MAX, UINT32_MAX, &Queue);
692     return Plugin::check(Status, "Error in hsa_queue_create: %s");
693   }
694 
695   /// Deinitialize the queue and destroy its resources.
696   Error deinit() {
697     std::lock_guard<std::mutex> Lock(Mutex);
698     if (!Queue)
699       return Plugin::success();
700     hsa_status_t Status = hsa_queue_destroy(Queue);
701     return Plugin::check(Status, "Error in hsa_queue_destroy: %s");
702   }
703 
704   /// Returns the number of streams this queue is currently assigned to.
705   uint32_t getUserCount() const { return NumUsers; }
706 
707   /// Returns whether the underlying HSA queue is initialized.
708   bool isInitialized() { return Queue != nullptr; }
709 
710   /// Decrement user count of the queue object.
711   void removeUser() { --NumUsers; }
712 
713   /// Increase user count of the queue object.
714   void addUser() { ++NumUsers; }
715 
716   /// Push a kernel launch to the queue. The kernel launch requires an output
717   /// signal and can define an optional input signal (nullptr if none).
718   Error pushKernelLaunch(const AMDGPUKernelTy &Kernel, void *KernelArgs,
719                          uint32_t NumThreads[3], uint32_t NumBlocks[3],
720                          uint32_t GroupSize, uint64_t StackSize,
721                          AMDGPUSignalTy *OutputSignal,
722                          AMDGPUSignalTy *InputSignal) {
723     assert(OutputSignal && "Invalid kernel output signal");
724 
725     // Lock the queue during the packet publishing process. Notice this blocks
726     // the addition of other packets to the queue. The following piece of code
727     // should be lightweight; do not block the thread, allocate memory, etc.
728     std::lock_guard<std::mutex> Lock(Mutex);
729     assert(Queue && "Interacted with a non-initialized queue!");
730 
731     // Add a barrier packet before the kernel packet in case there is a pending
732     // preceding operation. The barrier packet will delay the processing of
733     // subsequent queue packets until the barrier's input signals are satisfied.
734     // No output signal is needed because the dependency is already guaranteed
735     // by the queue barrier itself.
736     if (InputSignal && InputSignal->load())
737       if (auto Err = pushBarrierImpl(nullptr, InputSignal))
738         return Err;
739 
740     // Now prepare the kernel packet.
741     uint64_t PacketId;
742     hsa_kernel_dispatch_packet_t *Packet = acquirePacket(PacketId);
743     assert(Packet && "Invalid packet");
744 
745     // The first 32 bits of the packet are written last, after the other fields.
746     uint16_t Dims = NumBlocks[2] * NumThreads[2] > 1
747                         ? 3
748                         : 1 + (NumBlocks[1] * NumThreads[1] != 1);
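    // For example, NumBlocks={128,1,1} with NumThreads={256,1,1} yields Dims=1,
    // NumBlocks={8,4,1} with NumThreads={64,2,1} yields Dims=2, and any
    // non-trivial Z extent (NumBlocks[2] * NumThreads[2] > 1) forces Dims=3.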
749     uint16_t Setup = static_cast<uint16_t>(
750         Dims << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS);
751     Packet->workgroup_size_x = NumThreads[0];
752     Packet->workgroup_size_y = NumThreads[1];
753     Packet->workgroup_size_z = NumThreads[2];
754     Packet->reserved0 = 0;
755     Packet->grid_size_x = NumBlocks[0] * NumThreads[0];
756     Packet->grid_size_y = NumBlocks[1] * NumThreads[1];
757     Packet->grid_size_z = NumBlocks[2] * NumThreads[2];
758     Packet->private_segment_size =
759         Kernel.usesDynamicStack() ? StackSize : Kernel.getPrivateSize();
760     Packet->group_segment_size = GroupSize;
761     Packet->kernel_object = Kernel.getKernelObject();
762     Packet->kernarg_address = KernelArgs;
763     Packet->reserved2 = 0;
764     Packet->completion_signal = OutputSignal->get();
765 
766     // Publish the packet. Do not modify the packet after this point.
767     publishKernelPacket(PacketId, Setup, Packet);
768 
769     return Plugin::success();
770   }
771 
772   /// Push a barrier packet that will wait on up to two input signals. All
773   /// signals are optional (nullptr if none).
774   Error pushBarrier(AMDGPUSignalTy *OutputSignal,
775                     const AMDGPUSignalTy *InputSignal1,
776                     const AMDGPUSignalTy *InputSignal2) {
777     // Lock the queue during the packet publishing process.
778     std::lock_guard<std::mutex> Lock(Mutex);
779     assert(Queue && "Interacted with a non-initialized queue!");
780 
781     // Push the barrier with the lock acquired.
782     return pushBarrierImpl(OutputSignal, InputSignal1, InputSignal2);
783   }
784 
785 private:
786   /// Push a barrier packet that will wait on up to two input signals. Assumes
787   /// the queue lock is acquired.
788   Error pushBarrierImpl(AMDGPUSignalTy *OutputSignal,
789                         const AMDGPUSignalTy *InputSignal1,
790                         const AMDGPUSignalTy *InputSignal2 = nullptr) {
791     // Add a queue barrier waiting on both the other stream's operation and the
792     // last operation on the current stream (if any).
793     uint64_t PacketId;
794     hsa_barrier_and_packet_t *Packet =
795         (hsa_barrier_and_packet_t *)acquirePacket(PacketId);
796     assert(Packet && "Invalid packet");
797 
798     Packet->reserved0 = 0;
799     Packet->reserved1 = 0;
800     Packet->dep_signal[0] = {0};
801     Packet->dep_signal[1] = {0};
802     Packet->dep_signal[2] = {0};
803     Packet->dep_signal[3] = {0};
804     Packet->dep_signal[4] = {0};
805     Packet->reserved2 = 0;
806     Packet->completion_signal = {0};
807 
808     // Set input and output dependencies if needed.
809     if (OutputSignal)
810       Packet->completion_signal = OutputSignal->get();
811     if (InputSignal1)
812       Packet->dep_signal[0] = InputSignal1->get();
813     if (InputSignal2)
814       Packet->dep_signal[1] = InputSignal2->get();
815 
816     // Publish the packet. Do not modify the packet after this point.
817     publishBarrierPacket(PacketId, Packet);
818 
819     return Plugin::success();
820   }
821 
822   /// Acquire a packet from the queue. This call may block the thread if there
823   /// is no space in the underlying HSA queue. It may need to wait until the HSA
824   /// runtime processes some packets. Assumes the queue lock is acquired.
825   hsa_kernel_dispatch_packet_t *acquirePacket(uint64_t &PacketId) {
826     // Increase the queue index with relaxed memory order. Notice this will need
827     // another subsequent atomic operation with acquire order.
828     PacketId = hsa_queue_add_write_index_relaxed(Queue, 1);
829 
830     // Wait for the package to be available. Notice the atomic operation uses
831     // the acquire memory order.
832     while (PacketId - hsa_queue_load_read_index_scacquire(Queue) >= Queue->size)
833       ;
834 
835     // Return the packet reference.
836     const uint32_t Mask = Queue->size - 1; // The size is a power of 2.
837     return (hsa_kernel_dispatch_packet_t *)Queue->base_address +
838            (PacketId & Mask);
839   }
840 
841   /// Publish the kernel packet so that the HSA runtime can start processing
842   /// the kernel launch. Do not modify the packet once this function is called.
843   /// Assumes the queue lock is acquired.
844   void publishKernelPacket(uint64_t PacketId, uint16_t Setup,
845                            hsa_kernel_dispatch_packet_t *Packet) {
846     uint32_t *PacketPtr = reinterpret_cast<uint32_t *>(Packet);
847 
848     uint16_t Header = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
849     Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
850     Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
851 
852     // Publish the packet. Do not modify the packet after this point.
853     uint32_t HeaderWord = Header | (Setup << 16u);
854     __atomic_store_n(PacketPtr, HeaderWord, __ATOMIC_RELEASE);
855 
856     // Signal the doorbell about the published packet.
857     hsa_signal_store_relaxed(Queue->doorbell_signal, PacketId);
858   }
859 
860   /// Publish the barrier packet so that the HSA runtime can start processing
861   /// the barrier. Next packets in the queue will not be processed until all
862   /// barrier dependencies (signals) are satisfied. Assumes the queue is locked
863   void publishBarrierPacket(uint64_t PacketId,
864                             hsa_barrier_and_packet_t *Packet) {
865     uint32_t *PacketPtr = reinterpret_cast<uint32_t *>(Packet);
866     uint16_t Setup = 0;
867     uint16_t Header = HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE;
868     Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
869     Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
870 
871     // Publish the packet. Do not modify the packet after this point.
872     uint32_t HeaderWord = Header | (Setup << 16u);
873     __atomic_store_n(PacketPtr, HeaderWord, __ATOMIC_RELEASE);
874 
875     // Signal the doorbell about the published packet.
876     hsa_signal_store_relaxed(Queue->doorbell_signal, PacketId);
877   }
878 
879   /// Callback that will be called when an error is detected on the HSA queue.
880   static void callbackError(hsa_status_t Status, hsa_queue_t *Source,
881                             void *Data);
882 
883   /// The HSA queue.
884   hsa_queue_t *Queue;
885 
886   /// Mutex to protect the acquiring and publishing of packets. For the moment,
887   /// we need this mutex to prevent publishing packets that are not ready to be
888   /// published in a multi-thread scenario. Without a queue lock, a thread T1
889   /// could acquire packet P and thread T2 acquire packet P+1. Thread T2 could
890   /// publish its packet P+1 (signaling the queue's doorbell) before packet P
891   /// from T1 is ready to be processed. That scenario should be invalid. Thus,
892   /// we use the following mutex to make packet acquiring and publishing atomic.
893   /// TODO: There are other more advanced approaches to avoid this mutex using
894   /// atomic operations. We can further investigate it if this is a bottleneck.
895   std::mutex Mutex;
896 
897   /// The number of streams this queue is currently assigned to. A queue is
898   /// considered idle when this is zero, otherwise: busy.
899   uint32_t NumUsers;
900 };
901 
902 /// Struct that implements a stream of asynchronous operations for AMDGPU
903 /// devices. This class relies on signals to implement streams and define the
904 /// dependencies between asynchronous operations.
905 struct AMDGPUStreamTy {
906 private:
907   /// Utility struct holding arguments for async H2H memory copies.
908   struct MemcpyArgsTy {
909     void *Dst;
910     const void *Src;
911     size_t Size;
912   };
913 
914   /// Utility struct holding arguments for freeing buffers to memory managers.
915   struct ReleaseBufferArgsTy {
916     void *Buffer;
917     AMDGPUMemoryManagerTy *MemoryManager;
918   };
919 
920   /// Utility struct holding arguments for releasing signals to signal managers.
921   struct ReleaseSignalArgsTy {
922     AMDGPUSignalTy *Signal;
923     AMDGPUSignalManagerTy *SignalManager;
924   };
925 
926   using AMDGPUStreamCallbackTy = Error(void *Data);
927 
928   /// The stream is composed of N stream's slots. The struct below represents
929   /// the fields of each slot. Each slot has a signal and an optional action
930   /// function. When appending an HSA asynchronous operation to the stream, one
931   /// slot is consumed and used to store the operation's information. The
932   /// operation's output signal is set to the consumed slot's signal. If there
933   /// is a previous asynchronous operation on the previous slot, the HSA async
934   /// operation's input signal is set to the signal of the previous slot. This
935   /// way, we obtain a chain of dependent async operations. The action is a
936   /// function that will be executed eventually after the operation is
937   /// completed, e.g., for releasing a buffer.
938   struct StreamSlotTy {
939     /// The output signal of the stream operation. May be used by the subsequent
940     /// operation as input signal.
941     AMDGPUSignalTy *Signal;
942 
943     /// The actions that must be performed after the operation's completion.
944     /// Empty when there is no action to perform.
945     llvm::SmallVector<AMDGPUStreamCallbackTy *> Callbacks;
946 
947     /// Space for the action's arguments. A pointer to these arguments is passed
948     /// to the action function. Notice that the space for arguments is limited.
949     union ActionArgsTy {
950       MemcpyArgsTy MemcpyArgs;
951       ReleaseBufferArgsTy ReleaseBufferArgs;
952       ReleaseSignalArgsTy ReleaseSignalArgs;
953       void *CallbackArgs;
954     };
955 
956     llvm::SmallVector<ActionArgsTy> ActionArgs;
957 
958     /// Create an empty slot.
959     StreamSlotTy() : Signal(nullptr), Callbacks({}), ActionArgs({}) {}
960 
961     /// Schedule a host memory copy action on the slot.
962     Error schedHostMemoryCopy(void *Dst, const void *Src, size_t Size) {
963       Callbacks.emplace_back(memcpyAction);
964       ActionArgs.emplace_back().MemcpyArgs = MemcpyArgsTy{Dst, Src, Size};
965       return Plugin::success();
966     }
967 
968     /// Schedule a release buffer action on the slot.
969     Error schedReleaseBuffer(void *Buffer, AMDGPUMemoryManagerTy &Manager) {
970       Callbacks.emplace_back(releaseBufferAction);
971       ActionArgs.emplace_back().ReleaseBufferArgs =
972           ReleaseBufferArgsTy{Buffer, &Manager};
973       return Plugin::success();
974     }
975 
976     /// Schedule a signal release action on the slot.
977     Error schedReleaseSignal(AMDGPUSignalTy *SignalToRelease,
978                              AMDGPUSignalManagerTy *SignalManager) {
979       Callbacks.emplace_back(releaseSignalAction);
980       ActionArgs.emplace_back().ReleaseSignalArgs =
981           ReleaseSignalArgsTy{SignalToRelease, SignalManager};
982       return Plugin::success();
983     }
984 
985     /// Register a callback to be called on completion.
986     Error schedCallback(AMDGPUStreamCallbackTy *Func, void *Data) {
987       Callbacks.emplace_back(Func);
988       ActionArgs.emplace_back().CallbackArgs = Data;
989 
990       return Plugin::success();
991     }
992 
993     // Perform the actions if needed.
994     Error performAction() {
995       if (Callbacks.empty())
996         return Plugin::success();
997 
998       assert(Callbacks.size() == ActionArgs.size() && "Size mismatch");
999       for (auto [Callback, ActionArg] : llvm::zip(Callbacks, ActionArgs)) {
1000         // Perform the action.
1001         if (Callback == memcpyAction) {
1002           if (auto Err = memcpyAction(&ActionArg))
1003             return Err;
1004         } else if (Callback == releaseBufferAction) {
1005           if (auto Err = releaseBufferAction(&ActionArg))
1006             return Err;
1007         } else if (Callback == releaseSignalAction) {
1008           if (auto Err = releaseSignalAction(&ActionArg))
1009             return Err;
1010         } else if (Callback) {
1011           if (auto Err = Callback(ActionArg.CallbackArgs))
1012             return Err;
1013         }
1014       }
1015 
1016       // Invalidate the action.
1017       Callbacks.clear();
1018       ActionArgs.clear();
1019 
1020       return Plugin::success();
1021     }
1022   };
1023 
1024   /// The device agent where the stream was created.
1025   hsa_agent_t Agent;
1026 
1027   /// The queue that the stream uses to launch kernels.
1028   AMDGPUQueueTy *Queue;
1029 
1030   /// The manager used to obtain and reuse signals.
1031   AMDGPUSignalManagerTy &SignalManager;
1032 
1033   /// A reference to the associated device.
1034   GenericDeviceTy &Device;
1035 
1036   /// Array of stream slots. Use std::deque because it can dynamically grow
1037   /// without invalidating the already inserted elements. For instance, the
1038   /// std::vector may invalidate the elements by reallocating the internal
1039   /// array if there is not enough space on new insertions.
1040   std::deque<StreamSlotTy> Slots;
1041 
1042   /// The next available slot on the stream. This is reset to zero each time the
1043   /// stream is synchronized. It also indicates the current number of consumed
1044   /// slots at a given time.
1045   uint32_t NextSlot;
1046 
1047   /// The synchronization id. This number is increased each time the stream is
1048   /// synchronized. It is useful to detect if an AMDGPUEventTy points to an
1049   /// operation that was already finalized in a previous stream synchronization.
1050   uint32_t SyncCycle;
1051 
1052   /// Mutex to protect stream's management.
1053   mutable std::mutex Mutex;
1054 
1055   /// Timeout hint for HSA when actively waiting for a signal value to change.
1056   const uint64_t StreamBusyWaitMicroseconds;
1057 
1058   /// Indicates whether to spread data transfers across all available SDMAs.
1059   bool UseMultipleSdmaEngines;
1060 
1061   /// Return the current number of asynchronous operations on the stream.
1062   uint32_t size() const { return NextSlot; }
1063 
1064   /// Return the last valid slot on the stream.
1065   uint32_t last() const { return size() - 1; }
1066 
1067   /// Consume one slot from the stream. Since the stream uses signals on demand
1068   /// and releases them once the slot is no longer used, the function requires
1069   /// an idle signal for the new consumed slot.
1070   std::pair<uint32_t, AMDGPUSignalTy *> consume(AMDGPUSignalTy *OutputSignal) {
1071     // Double the stream size if needed. Since we use std::deque, this operation
1072     // does not invalidate the already added slots.
1073     if (Slots.size() == NextSlot)
1074       Slots.resize(Slots.size() * 2);
1075 
1076     // Update the next available slot and the stream size.
1077     uint32_t Curr = NextSlot++;
1078 
1079     // Retrieve the input signal, if any, of the current operation.
1080     AMDGPUSignalTy *InputSignal = (Curr > 0) ? Slots[Curr - 1].Signal : nullptr;
1081 
1082     // Set the output signal of the current slot.
1083     Slots[Curr].Signal = OutputSignal;
1084 
1085     return std::make_pair(Curr, InputSignal);
1086   }
1087 
1088   /// Complete all pending post actions and reset the stream after synchronizing
1089   /// or positively querying the stream.
1090   Error complete() {
1091     for (uint32_t Slot = 0; Slot < NextSlot; ++Slot) {
1092       // Take the post action of the operation if any.
1093       if (auto Err = Slots[Slot].performAction())
1094         return Err;
1095 
1096       // Release the slot's signal if possible. Otherwise, another user will.
1097       if (Slots[Slot].Signal->decreaseUseCount())
1098         if (auto Err = SignalManager.returnResource(Slots[Slot].Signal))
1099           return Err;
1100 
1101       Slots[Slot].Signal = nullptr;
1102     }
1103 
1104     // Reset the stream slots to zero.
1105     NextSlot = 0;
1106 
1107     // Increase the synchronization id since the stream completed a sync cycle.
1108     SyncCycle += 1;
1109 
1110     return Plugin::success();
1111   }
1112 
1113   /// Make the current stream wait on a specific operation of another stream.
1114   /// The idea is to make the current stream wait on two signals: 1) the last
1115   /// signal of the current stream, and 2) the last signal of the other stream.
1116   /// Use a barrier packet with two input signals.
1117   Error waitOnStreamOperation(AMDGPUStreamTy &OtherStream, uint32_t Slot) {
1118     if (Queue == nullptr)
1119       return Plugin::error("Target queue was nullptr");
1120 
1121     // The signal of the other stream that we must wait on.
1122     AMDGPUSignalTy *OtherSignal = OtherStream.Slots[Slot].Signal;
1123 
1124     // Prevent the release of the other stream's signal.
1125     OtherSignal->increaseUseCount();
1126 
1127     // Retrieve an available signal for the operation's output.
1128     AMDGPUSignalTy *OutputSignal = nullptr;
1129     if (auto Err = SignalManager.getResource(OutputSignal))
1130       return Err;
1131     OutputSignal->reset();
1132     OutputSignal->increaseUseCount();
1133 
1134     // Consume stream slot and compute dependencies.
1135     auto [Curr, InputSignal] = consume(OutputSignal);
1136 
1137     // Set up the post action to release the signal.
1138     if (auto Err = Slots[Curr].schedReleaseSignal(OtherSignal, &SignalManager))
1139       return Err;
1140 
1141     // Push a barrier into the queue with both input signals.
1142     return Queue->pushBarrier(OutputSignal, InputSignal, OtherSignal);
1143   }
1144 
1145   /// Callback for running a specific asynchronous operation. This callback is
1146   /// used for hsa_amd_signal_async_handler. The argument is the operation that
1147   /// should be executed. Notice we use the post action mechanism to codify the
1148   /// asynchronous operation.
1149   static bool asyncActionCallback(hsa_signal_value_t Value, void *Args) {
1150     StreamSlotTy *Slot = reinterpret_cast<StreamSlotTy *>(Args);
1151     assert(Slot && "Invalid slot");
1152     assert(Slot->Signal && "Invalid signal");
1153 
1154     // This thread is outside the stream mutex. Make sure the thread sees the
1155     // changes on the slot.
1156     std::atomic_thread_fence(std::memory_order_acquire);
1157 
1158     // Perform the operation.
1159     if (auto Err = Slot->performAction())
1160       FATAL_MESSAGE(1, "Error performing post action: %s",
1161                     toString(std::move(Err)).data());
1162 
1163     // Signal the output signal to notify that the async operation finished.
1164     Slot->Signal->signal();
1165 
1166     // Unregister callback.
1167     return false;
1168   }
1169 
1170   // Callback for host-to-host memory copies. This is an asynchronous action.
1171   static Error memcpyAction(void *Data) {
1172     MemcpyArgsTy *Args = reinterpret_cast<MemcpyArgsTy *>(Data);
1173     assert(Args && "Invalid arguments");
1174     assert(Args->Dst && "Invalid destination buffer");
1175     assert(Args->Src && "Invalid source buffer");
1176 
1177     std::memcpy(Args->Dst, Args->Src, Args->Size);
1178 
1179     return Plugin::success();
1180   }
1181 
1182   /// Releasing a memory buffer to a memory manager. This is a post completion
1183   /// action. There are three kinds of memory buffers:
1184   ///   1. For kernel arguments. This buffer can be freed after receiving the
1185   ///   kernel completion signal.
1186   ///   2. For H2D transfers that need pinned memory space for staging. This
1187   ///   buffer can be freed after receiving the transfer completion signal.
1188   ///   3. For D2H transfers that need pinned memory space for staging. This
1189   ///   buffer cannot be freed after receiving the transfer completion signal
1190   ///   because of the subsequent asynchronous H2H callback.
1191   ///      For this reason, this action can only be taken at
1192   ///      AMDGPUStreamTy::complete().
1193   /// Because of case 3, all releaseBufferActions are taken at
1194   /// AMDGPUStreamTy::complete() in the current implementation.
1195   static Error releaseBufferAction(void *Data) {
1196     ReleaseBufferArgsTy *Args = reinterpret_cast<ReleaseBufferArgsTy *>(Data);
1197     assert(Args && "Invalid arguments");
1198     assert(Args->MemoryManager && "Invalid memory manager");
1199     assert(Args->Buffer && "Invalid buffer");
1200 
1201     // Release the allocation to the memory manager.
1202     return Args->MemoryManager->deallocate(Args->Buffer);
1203   }
1204 
1205   /// Releasing a signal object back to SignalManager. This is a post completion
1206   /// action. This action can only be taken at AMDGPUStreamTy::complete()
1207   static Error releaseSignalAction(void *Data) {
1208     ReleaseSignalArgsTy *Args = reinterpret_cast<ReleaseSignalArgsTy *>(Data);
1209     assert(Args && "Invalid arguments");
1210     assert(Args->Signal && "Invalid signal");
1211     assert(Args->SignalManager && "Invalid signal manager");
1212 
1213     // Release the signal if needed.
1214     if (Args->Signal->decreaseUseCount())
1215       if (auto Err = Args->SignalManager->returnResource(Args->Signal))
1216         return Err;
1217 
1218     return Plugin::success();
1219   }
1220 
1221 public:
1222   /// Create an empty stream associated with a specific device.
1223   AMDGPUStreamTy(AMDGPUDeviceTy &Device);
1224 
1225   /// Initialize the stream's signals.
1226   Error init() { return Plugin::success(); }
1227 
1228   /// Deinitialize the stream's signals.
1229   Error deinit() { return Plugin::success(); }
1230 
1231   /// Push an asynchronous kernel to the stream. The kernel arguments must be
1232   /// placed in a special allocation for kernel args and must be kept alive until
1233   /// the kernel finishes. Once the kernel is finished, the stream will release
1234   /// the kernel args buffer to the specified memory manager.
1235   Error pushKernelLaunch(const AMDGPUKernelTy &Kernel, void *KernelArgs,
1236                          uint32_t NumThreads[3], uint32_t NumBlocks[3],
1237                          uint32_t GroupSize, uint64_t StackSize,
1238                          AMDGPUMemoryManagerTy &MemoryManager) {
1239     if (Queue == nullptr)
1240       return Plugin::error("Target queue was nullptr");
1241 
1242     // Retrieve an available signal for the operation's output.
1243     AMDGPUSignalTy *OutputSignal = nullptr;
1244     if (auto Err = SignalManager.getResource(OutputSignal))
1245       return Err;
1246     OutputSignal->reset();
1247     OutputSignal->increaseUseCount();
1248 
1249     std::lock_guard<std::mutex> StreamLock(Mutex);
1250 
1251     // Consume stream slot and compute dependencies.
1252     auto [Curr, InputSignal] = consume(OutputSignal);
1253 
1254     // Set up the post action to release the kernel args buffer.
1255     if (auto Err = Slots[Curr].schedReleaseBuffer(KernelArgs, MemoryManager))
1256       return Err;
1257 
1258     // If we are running an RPC server we want to wake up the server thread
1259     // whenever there is a kernel running and let it sleep otherwise.
1260     if (Device.getRPCServer())
1261       Device.Plugin.getRPCServer().Thread->notify();
1262 
1263     // Push the kernel with the output signal and an input signal (optional)
1264     if (auto Err = Queue->pushKernelLaunch(Kernel, KernelArgs, NumThreads,
1265                                            NumBlocks, GroupSize, StackSize,
1266                                            OutputSignal, InputSignal))
1267       return Err;
1268 
1269     // Register a callback to indicate when the kernel is complete.
1270     if (Device.getRPCServer()) {
1271       if (auto Err = Slots[Curr].schedCallback(
1272               [](void *Data) -> llvm::Error {
1273                 GenericPluginTy &Plugin =
1274                     *reinterpret_cast<GenericPluginTy *>(Data);
1275                 Plugin.getRPCServer().Thread->finish();
1276                 return Error::success();
1277               },
1278               &Device.Plugin))
1279         return Err;
1280     }
1281     return Plugin::success();
1282   }
1283 
1284   /// Push an asynchronous memory copy between pinned memory buffers.
1285   Error pushPinnedMemoryCopyAsync(void *Dst, const void *Src,
1286                                   uint64_t CopySize) {
1287     // Retrieve an available signal for the operation's output.
1288     AMDGPUSignalTy *OutputSignal = nullptr;
1289     if (auto Err = SignalManager.getResource(OutputSignal))
1290       return Err;
1291     OutputSignal->reset();
1292     OutputSignal->increaseUseCount();
1293 
1294     std::lock_guard<std::mutex> Lock(Mutex);
1295 
1296     // Consume stream slot and compute dependencies.
1297     auto [Curr, InputSignal] = consume(OutputSignal);
1298 
1299     // Issue the async memory copy.
1300     if (InputSignal && InputSignal->load()) {
1301       hsa_signal_t InputSignalRaw = InputSignal->get();
1302       return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Src,
1303                                      Agent, CopySize, 1, &InputSignalRaw,
1304                                      OutputSignal->get());
1305     }
1306 
1307     return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Src,
1308                                    Agent, CopySize, 0, nullptr,
1309                                    OutputSignal->get());
1310   }
1311 
1312   /// Push an asynchronous memory copy device-to-host involving an unpinned
1313   /// memory buffer. The operation consists of a two-step copy from the
1314   /// device buffer to an intermediate pinned host buffer, and then to the
1315   /// unpinned host buffer. Both operations are asynchronous and dependent.
1316   /// The intermediate pinned buffer will be released to the specified memory
1317   /// manager once the operation completes.
1318   Error pushMemoryCopyD2HAsync(void *Dst, const void *Src, void *Inter,
1319                                uint64_t CopySize,
1320                                AMDGPUMemoryManagerTy &MemoryManager) {
1321     // Retrieve available signals for the operation's outputs.
1322     AMDGPUSignalTy *OutputSignals[2] = {};
1323     if (auto Err = SignalManager.getResources(/*Num=*/2, OutputSignals))
1324       return Err;
1325     for (auto *Signal : OutputSignals) {
1326       Signal->reset();
1327       Signal->increaseUseCount();
1328     }
1329 
1330     std::lock_guard<std::mutex> Lock(Mutex);
1331 
1332     // Consume stream slot and compute dependencies.
1333     auto [Curr, InputSignal] = consume(OutputSignals[0]);
1334 
1335     // Set up the post action for releasing the intermediate buffer.
1336     if (auto Err = Slots[Curr].schedReleaseBuffer(Inter, MemoryManager))
1337       return Err;
1338 
1339     // Issue the first step: device to host transfer. Avoid defining the input
1340     // dependency if already satisfied.
1341     if (InputSignal && InputSignal->load()) {
1342       hsa_signal_t InputSignalRaw = InputSignal->get();
1343       if (auto Err = hsa_utils::asyncMemCopy(
1344               UseMultipleSdmaEngines, Inter, Agent, Src, Agent, CopySize, 1,
1345               &InputSignalRaw, OutputSignals[0]->get()))
1346         return Err;
1347     } else {
1348       if (auto Err = hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Inter,
1349                                              Agent, Src, Agent, CopySize, 0,
1350                                              nullptr, OutputSignals[0]->get()))
1351         return Err;
1352     }
1353 
1354     // Consume another stream slot and compute dependencies.
1355     std::tie(Curr, InputSignal) = consume(OutputSignals[1]);
1356     assert(InputSignal && "Invalid input signal");
1357 
1358     // The std::memcpy is done asynchronously using an async handler. We store
1359     // the copy's details in the action, although it is not a true post action.
1360     if (auto Err = Slots[Curr].schedHostMemoryCopy(Dst, Inter, CopySize))
1361       return Err;
1362 
1363     // Make changes on this slot visible to the async handler's thread.
1364     std::atomic_thread_fence(std::memory_order_release);
1365 
1366     // Issue the second step: host to host transfer.
1367     hsa_status_t Status = hsa_amd_signal_async_handler(
1368         InputSignal->get(), HSA_SIGNAL_CONDITION_EQ, 0, asyncActionCallback,
1369         (void *)&Slots[Curr]);
1370 
1371     return Plugin::check(Status, "Error in hsa_amd_signal_async_handler: %s");
1372   }
1373 
1374   /// Push an asynchronous memory copy host-to-device involving an unpinned
1375   /// memory buffer. The operation consists of a two-step copy from the
1376   /// unpinned host buffer to an intermediate pinned host buffer, and then, to
1377   /// the device buffer. Both operations are asynchronous and dependent.
1378   /// The intermediate pinned buffer will be released to the specified memory
1379   /// manager once the operation completes.
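  ///
  /// Schematically, the dependency chain is:
  ///   [previous op] -> host memcpy (Src -> Inter) -> SDMA copy (Inter -> Dst)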
1380   Error pushMemoryCopyH2DAsync(void *Dst, const void *Src, void *Inter,
1381                                uint64_t CopySize,
1382                                AMDGPUMemoryManagerTy &MemoryManager) {
1383     // Retrieve available signals for the operation's outputs.
1384     AMDGPUSignalTy *OutputSignals[2] = {};
1385     if (auto Err = SignalManager.getResources(/*Num=*/2, OutputSignals))
1386       return Err;
1387     for (auto *Signal : OutputSignals) {
1388       Signal->reset();
1389       Signal->increaseUseCount();
1390     }
1391 
1392     AMDGPUSignalTy *OutputSignal = OutputSignals[0];
1393 
1394     std::lock_guard<std::mutex> Lock(Mutex);
1395 
1396     // Consume stream slot and compute dependencies.
1397     auto [Curr, InputSignal] = consume(OutputSignal);
1398 
1399     // Issue the first step: host to host transfer.
1400     if (InputSignal && InputSignal->load()) {
1401       // The std::memcpy is done asynchronously using an async handler. We store
1402       // the function's information in the action but it is not actually a
1403       // post action.
1404       if (auto Err = Slots[Curr].schedHostMemoryCopy(Inter, Src, CopySize))
1405         return Err;
1406 
1407       // Make changes on this slot visible to the async handler's thread.
1408       std::atomic_thread_fence(std::memory_order_release);
1409 
1410       hsa_status_t Status = hsa_amd_signal_async_handler(
1411           InputSignal->get(), HSA_SIGNAL_CONDITION_EQ, 0, asyncActionCallback,
1412           (void *)&Slots[Curr]);
1413 
1414       if (auto Err = Plugin::check(Status,
1415                                    "Error in hsa_amd_signal_async_handler: %s"))
1416         return Err;
1417 
1418       // Now use the second output signal.
1419       OutputSignal = OutputSignals[1];
1420 
1421       // Consume another stream slot and compute dependencies.
1422       std::tie(Curr, InputSignal) = consume(OutputSignal);
1423     } else {
1424       // All preceding operations completed; copy the memory synchronously.
1425       std::memcpy(Inter, Src, CopySize);
1426 
1427       // Return the second signal because it will not be used.
1428       OutputSignals[1]->decreaseUseCount();
1429       if (auto Err = SignalManager.returnResource(OutputSignals[1]))
1430         return Err;
1431     }
1432 
1433     // Set up the post action to release the intermediate pinned buffer.
1434     if (auto Err = Slots[Curr].schedReleaseBuffer(Inter, MemoryManager))
1435       return Err;
1436 
1437     // Issue the second step: host to device transfer. Avoid defining the input
1438     // dependency if already satisfied.
1439     if (InputSignal && InputSignal->load()) {
1440       hsa_signal_t InputSignalRaw = InputSignal->get();
1441       return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Inter,
1442                                      Agent, CopySize, 1, &InputSignalRaw,
1443                                      OutputSignal->get());
1444     }
1445     return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Inter,
1446                                    Agent, CopySize, 0, nullptr,
1447                                    OutputSignal->get());
1448   }
1449 
1450   // AMDGPUDeviceTy is incomplete here, so we pass the underlying agents instead.
1451   Error pushMemoryCopyD2DAsync(void *Dst, hsa_agent_t DstAgent, const void *Src,
1452                                hsa_agent_t SrcAgent, uint64_t CopySize) {
1453     AMDGPUSignalTy *OutputSignal;
1454     if (auto Err = SignalManager.getResources(/*Num=*/1, &OutputSignal))
1455       return Err;
1456     OutputSignal->reset();
1457     OutputSignal->increaseUseCount();
1458 
1459     std::lock_guard<std::mutex> Lock(Mutex);
1460 
1461     // Consume stream slot and compute dependencies.
1462     auto [Curr, InputSignal] = consume(OutputSignal);
1463 
1464     // The agents need to have access to the corresponding memory.
1465     // This is presently only true if the pointers were originally
1466     // allocated by this runtime or the caller made the appropriate
1467     // access calls.
1468 
1469     if (InputSignal && InputSignal->load()) {
1470       hsa_signal_t InputSignalRaw = InputSignal->get();
1471       return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, DstAgent, Src,
1472                                      SrcAgent, CopySize, 1, &InputSignalRaw,
1473                                      OutputSignal->get());
1474     }
1475     return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, DstAgent, Src,
1476                                    SrcAgent, CopySize, 0, nullptr,
1477                                    OutputSignal->get());
1478   }
1479 
1480   /// Synchronize with the stream. The current thread waits until all operations
1481   /// are finalized and then performs the pending post actions (i.e., releasing
1482   /// intermediate buffers).
1483   Error synchronize() {
1484     std::lock_guard<std::mutex> Lock(Mutex);
1485 
1486     // No need to synchronize anything.
1487     if (size() == 0)
1488       return Plugin::success();
1489 
1490     // Wait until all previous operations on the stream have completed.
1491     if (auto Err =
1492             Slots[last()].Signal->wait(StreamBusyWaitMicroseconds, &Device))
1493       return Err;
1494 
1495     // Reset the stream and perform all pending post actions.
1496     return complete();
1497   }
1498 
1499   /// Query the stream and complete pending post actions if operations finished.
1500   /// Return whether all the operations completed. This operation does not block
1501   /// the calling thread.
1502   Expected<bool> query() {
1503     std::lock_guard<std::mutex> Lock(Mutex);
1504 
1505     // No need to query anything.
1506     if (size() == 0)
1507       return true;
1508 
1509     // The last operation did not complete yet. Return directly.
1510     if (Slots[last()].Signal->load())
1511       return false;
1512 
1513     // Reset the stream and perform all pending post actions.
1514     if (auto Err = complete())
1515       return std::move(Err);
1516 
1517     return true;
1518   }
1519 
1520   const AMDGPUQueueTy *getQueue() const { return Queue; }
1521 
1522   /// Record the state of the stream on an event.
1523   Error recordEvent(AMDGPUEventTy &Event) const;
1524 
1525   /// Make the stream wait on an event.
1526   Error waitEvent(const AMDGPUEventTy &Event);
1527 
1528   friend struct AMDGPUStreamManagerTy;
1529 };
1530 
1531 /// Class representing an event on AMDGPU. The event stores the state of the
1532 /// recorded stream at the moment it was recorded.
1533 struct AMDGPUEventTy {
1534   /// Create an empty event.
1535   AMDGPUEventTy(AMDGPUDeviceTy &Device)
1536       : RecordedStream(nullptr), RecordedSlot(-1), RecordedSyncCycle(-1) {}
1537 
1538   /// Initialize and deinitialize.
1539   Error init() { return Plugin::success(); }
1540   Error deinit() { return Plugin::success(); }
1541 
1542   /// Record the state of a stream on the event.
1543   Error record(AMDGPUStreamTy &Stream) {
1544     std::lock_guard<std::mutex> Lock(Mutex);
1545 
1546     // Overwrite any previously recorded stream.
1547     RecordedStream = &Stream;
1548 
1549     return Stream.recordEvent(*this);
1550   }
1551 
1552   /// Make a stream wait on the current event.
1553   Error wait(AMDGPUStreamTy &Stream) {
1554     std::lock_guard<std::mutex> Lock(Mutex);
1555 
1556     if (!RecordedStream)
1557       return Plugin::error("Event does not have any recorded stream");
1558 
1559     // Synchronizing the same stream. Do nothing.
1560     if (RecordedStream == &Stream)
1561       return Plugin::success();
1562 
1563     // No need to wait for anything; the recorded stream already finished the
1564     // corresponding operation.
1565     if (RecordedSlot < 0)
1566       return Plugin::success();
1567 
1568     return Stream.waitEvent(*this);
1569   }
1570 
1571 protected:
1572   /// The stream registered in this event.
1573   AMDGPUStreamTy *RecordedStream;
1574 
1575   /// The recorded operation on the recorded stream.
1576   int64_t RecordedSlot;
1577 
1578   /// The sync cycle when the stream was recorded. Used to detect stale events.
1579   int64_t RecordedSyncCycle;
1580 
1581   /// Mutex to safely access event fields.
1582   mutable std::mutex Mutex;
1583 
1584   friend struct AMDGPUStreamTy;
1585 };
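
// Illustrative sketch (hypothetical usage, not code from this plugin): a
// consumer stream can be ordered after a producer stream through an event.
//
//   AMDGPUEventTy Event(Device);
//   if (auto Err = Event.record(ProducerStream)) // Snapshot producer state.
//     return Err;
//   if (auto Err = Event.wait(ConsumerStream))   // Order consumer after it.
//     return Err;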
1586 
1587 Error AMDGPUStreamTy::recordEvent(AMDGPUEventTy &Event) const {
1588   std::lock_guard<std::mutex> Lock(Mutex);
1589 
1590   if (size() > 0) {
1591     // Record the synchronize identifier (to detect stale recordings) and
1592     // the last valid stream's operation.
1593     Event.RecordedSyncCycle = SyncCycle;
1594     Event.RecordedSlot = last();
1595 
1596     assert(Event.RecordedSyncCycle >= 0 && "Invalid recorded sync cycle");
1597     assert(Event.RecordedSlot >= 0 && "Invalid recorded slot");
1598   } else {
1599     // The stream is empty; everything already completed, so record nothing.
1600     Event.RecordedSyncCycle = -1;
1601     Event.RecordedSlot = -1;
1602   }
1603   return Plugin::success();
1604 }
1605 
1606 Error AMDGPUStreamTy::waitEvent(const AMDGPUEventTy &Event) {
1607   // Retrieve the recorded stream on the event.
1608   AMDGPUStreamTy &RecordedStream = *Event.RecordedStream;
1609 
1610   std::scoped_lock<std::mutex, std::mutex> Lock(Mutex, RecordedStream.Mutex);
1611 
1612   // The recorded stream already completed the operation because its
1613   // synchronization cycle identifier is already outdated.
1614   if (RecordedStream.SyncCycle != (uint32_t)Event.RecordedSyncCycle)
1615     return Plugin::success();
1616 
1617   // Again, the recorded stream already completed the operation, the last
1618   // operation's output signal is satisfied.
1619   if (!RecordedStream.Slots[Event.RecordedSlot].Signal->load())
1620     return Plugin::success();
1621 
1622   // Otherwise, make the current stream wait on the other stream's operation.
1623   return waitOnStreamOperation(RecordedStream, Event.RecordedSlot);
1624 }
1625 
1626 struct AMDGPUStreamManagerTy final
1627     : GenericDeviceResourceManagerTy<AMDGPUResourceRef<AMDGPUStreamTy>> {
1628   using ResourceRef = AMDGPUResourceRef<AMDGPUStreamTy>;
1629   using ResourcePoolTy = GenericDeviceResourceManagerTy<ResourceRef>;
1630 
1631   AMDGPUStreamManagerTy(GenericDeviceTy &Device, hsa_agent_t HSAAgent)
1632       : GenericDeviceResourceManagerTy(Device), Device(Device),
1633         OMPX_QueueTracking("LIBOMPTARGET_AMDGPU_HSA_QUEUE_BUSY_TRACKING", true),
1634         NextQueue(0), Agent(HSAAgent) {}
1635 
1636   Error init(uint32_t InitialSize, int NumHSAQueues, int HSAQueueSize) {
1637     Queues = std::vector<AMDGPUQueueTy>(NumHSAQueues);
1638     QueueSize = HSAQueueSize;
1639     MaxNumQueues = NumHSAQueues;
1640     // Initialize one queue eagerly
1641     if (auto Err = Queues.front().init(Device, Agent, QueueSize))
1642       return Err;
1643 
1644     return GenericDeviceResourceManagerTy::init(InitialSize);
1645   }
1646 
1647   /// Deinitialize the resource pool and delete all resources. This function
1648   /// must be called before the destructor.
1649   Error deinit() override {
1650     // De-init all queues
1651     for (AMDGPUQueueTy &Queue : Queues) {
1652       if (auto Err = Queue.deinit())
1653         return Err;
1654     }
1655 
1656     return GenericDeviceResourceManagerTy::deinit();
1657   }
1658 
1659   /// Get a single stream from the pool or create new resources.
1660   virtual Error getResource(AMDGPUStreamTy *&StreamHandle) override {
1661     return getResourcesImpl(1, &StreamHandle, [this](AMDGPUStreamTy *&Handle) {
1662       return assignNextQueue(Handle);
1663     });
1664   }
1665 
1666   /// Return stream to the pool.
1667   virtual Error returnResource(AMDGPUStreamTy *StreamHandle) override {
1668     return returnResourceImpl(StreamHandle, [](AMDGPUStreamTy *Handle) {
1669       Handle->Queue->removeUser();
1670       return Plugin::success();
1671     });
1672   }
1673 
1674 private:
1675   /// Search for and assign a preferably idle queue to the given Stream. If
1676   /// there is no queue without current users, choose the queue with the lowest
1677   /// user count. If utilization tracking is disabled, use round-robin selection.
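  ///
  /// For example (illustrative): with tracking enabled and user counts
  /// {2, 0, 1}, queue 1 wins because it is initialized and idle; with tracking
  /// disabled, queues are handed out round robin: 0, 1, 2, 0, ...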
1678   inline Error assignNextQueue(AMDGPUStreamTy *Stream) {
1679     // Start from zero when tracking utilization; otherwise use round robin.
1680     uint32_t Index = OMPX_QueueTracking ? 0 : NextQueue++ % MaxNumQueues;
1681 
1682     if (OMPX_QueueTracking) {
1683       // Find the least used queue.
1684       for (uint32_t I = 0; I < MaxNumQueues; ++I) {
1685         // Early exit when an initialized queue is idle.
1686         if (Queues[I].isInitialized() && Queues[I].getUserCount() == 0) {
1687           Index = I;
1688           break;
1689         }
1690 
1691         // Update the least used queue.
1692         if (Queues[Index].getUserCount() > Queues[I].getUserCount())
1693           Index = I;
1694       }
1695     }
1696 
1697     // Make sure the queue is initialized, then add user & assign.
1698     if (auto Err = Queues[Index].init(Device, Agent, QueueSize))
1699       return Err;
1700     Queues[Index].addUser();
1701     Stream->Queue = &Queues[Index];
1702 
1703     return Plugin::success();
1704   }
1705 
1706   /// The device associated with this stream manager.
1707   GenericDeviceTy &Device;
1708 
1709   /// Envar for controlling the tracking of busy HSA queues.
1710   BoolEnvar OMPX_QueueTracking;
1711 
1712   /// The next queue index to use for round robin selection.
1713   uint32_t NextQueue;
1714 
1715   /// The queues which are assigned to requested streams.
1716   std::vector<AMDGPUQueueTy> Queues;
1717 
1718   /// The corresponding device as HSA agent.
1719   hsa_agent_t Agent;
1720 
1721   /// The maximum number of queues.
1722   uint32_t MaxNumQueues;
1723 
1724   /// The size of created queues.
1725   uint32_t QueueSize;
1726 };
1727 
1728 /// Abstract class that holds the common members of the actual kernel devices
1729 /// and the host device. Both types should inherit from this class.
1730 struct AMDGenericDeviceTy {
1731   AMDGenericDeviceTy() {}
1732 
1733   virtual ~AMDGenericDeviceTy() {}
1734 
1735   /// Create all memory pools which the device has access to and classify them.
1736   Error initMemoryPools() {
1737     // Retrieve all memory pools from the device agent(s).
1738     Error Err = retrieveAllMemoryPools();
1739     if (Err)
1740       return Err;
1741 
1742     for (AMDGPUMemoryPoolTy *MemoryPool : AllMemoryPools) {
1743       // Initialize the memory pool and retrieve some basic info.
1744       Error Err = MemoryPool->init();
1745       if (Err)
1746         return Err;
1747 
1748       if (!MemoryPool->isGlobal())
1749         continue;
1750 
1751       // Classify the memory pools depending on their properties.
1752       if (MemoryPool->isFineGrained()) {
1753         FineGrainedMemoryPools.push_back(MemoryPool);
1754         if (MemoryPool->supportsKernelArgs())
1755           ArgsMemoryPools.push_back(MemoryPool);
1756       } else if (MemoryPool->isCoarseGrained()) {
1757         CoarseGrainedMemoryPools.push_back(MemoryPool);
1758       }
1759     }
1760     return Plugin::success();
1761   }
1762 
1763   /// Destroy all memory pools.
1764   Error deinitMemoryPools() {
1765     for (AMDGPUMemoryPoolTy *Pool : AllMemoryPools)
1766       delete Pool;
1767 
1768     AllMemoryPools.clear();
1769     FineGrainedMemoryPools.clear();
1770     CoarseGrainedMemoryPools.clear();
1771     ArgsMemoryPools.clear();
1772 
1773     return Plugin::success();
1774   }
1775 
1776   /// Retrieve and construct all memory pools from the device agent(s).
1777   virtual Error retrieveAllMemoryPools() = 0;
1778 
1779   /// Get the device agent.
1780   virtual hsa_agent_t getAgent() const = 0;
1781 
1782 protected:
1783   /// Array of all memory pools available to the agent(s).
1784   llvm::SmallVector<AMDGPUMemoryPoolTy *> AllMemoryPools;
1785 
1786   /// Array of fine-grained memory pools available to the agent(s).
1787   llvm::SmallVector<AMDGPUMemoryPoolTy *> FineGrainedMemoryPools;
1788 
1789   /// Array of coarse-grained memory pools available to the agent(s).
1790   llvm::SmallVector<AMDGPUMemoryPoolTy *> CoarseGrainedMemoryPools;
1791 
1792   /// Array of kernel args memory pools available to the agent(s).
1793   llvm::SmallVector<AMDGPUMemoryPoolTy *> ArgsMemoryPools;
1794 };
1795 
1796 /// Class representing the host device. This host device may have more than one
1797 /// HSA host agent. We aggregate all its resources into the same instance.
1798 struct AMDHostDeviceTy : public AMDGenericDeviceTy {
1799   /// Create a host device from an array of host agents.
1800   AMDHostDeviceTy(AMDGPUPluginTy &Plugin,
1801                   const llvm::SmallVector<hsa_agent_t> &HostAgents)
1802       : AMDGenericDeviceTy(), Agents(HostAgents), ArgsMemoryManager(Plugin),
1803         PinnedMemoryManager(Plugin) {
1804     assert(HostAgents.size() && "No host agent found");
1805   }
1806 
1807   /// Initialize the host device memory pools and the memory managers for
1808   /// kernel args and host pinned memory allocations.
1809   Error init() {
1810     if (auto Err = initMemoryPools())
1811       return Err;
1812 
1813     if (auto Err = ArgsMemoryManager.init(getArgsMemoryPool()))
1814       return Err;
1815 
1816     if (auto Err = PinnedMemoryManager.init(getFineGrainedMemoryPool()))
1817       return Err;
1818 
1819     return Plugin::success();
1820   }
1821 
1822   /// Deinitialize memory pools and managers.
1823   Error deinit() {
1824     if (auto Err = deinitMemoryPools())
1825       return Err;
1826 
1827     if (auto Err = ArgsMemoryManager.deinit())
1828       return Err;
1829 
1830     if (auto Err = PinnedMemoryManager.deinit())
1831       return Err;
1832 
1833     return Plugin::success();
1834   }
1835 
1836   /// Retrieve and construct all memory pools from the host agents.
1837   Error retrieveAllMemoryPools() override {
1838     // Iterate through the available pools across the host agents.
1839     for (hsa_agent_t Agent : Agents) {
1840       Error Err = hsa_utils::iterateAgentMemoryPools(
1841           Agent, [&](hsa_amd_memory_pool_t HSAMemoryPool) {
1842             AMDGPUMemoryPoolTy *MemoryPool =
1843                 new AMDGPUMemoryPoolTy(HSAMemoryPool);
1844             AllMemoryPools.push_back(MemoryPool);
1845             return HSA_STATUS_SUCCESS;
1846           });
1847       if (Err)
1848         return Err;
1849     }
1850     return Plugin::success();
1851   }
1852 
1853   /// Get one of the host agents. Always returns the first agent.
1854   hsa_agent_t getAgent() const override { return Agents[0]; }
1855 
1856   /// Get a memory pool for fine-grained allocations.
1857   AMDGPUMemoryPoolTy &getFineGrainedMemoryPool() {
1858     assert(!FineGrainedMemoryPools.empty() && "No fine-grained mempool");
1859     // Retrieve any memory pool.
1860     return *FineGrainedMemoryPools[0];
1861   }
1862 
1863   AMDGPUMemoryPoolTy &getCoarseGrainedMemoryPool() {
1864     assert(!CoarseGrainedMemoryPools.empty() && "No coarse-grained mempool");
1865     // Retrieve any memory pool.
1866     return *CoarseGrainedMemoryPools[0];
1867   }
1868 
1869   /// Get a memory pool for kernel args allocations.
1870   AMDGPUMemoryPoolTy &getArgsMemoryPool() {
1871     assert(!ArgsMemoryPools.empty() && "No kernelargs mempool");
1872     // Retrieve any memory pool.
1873     return *ArgsMemoryPools[0];
1874   }
1875 
1876   /// Getters for kernel args and host pinned memory managers.
1877   AMDGPUMemoryManagerTy &getArgsMemoryManager() { return ArgsMemoryManager; }
1878   AMDGPUMemoryManagerTy &getPinnedMemoryManager() {
1879     return PinnedMemoryManager;
1880   }
1881 
1882 private:
1883   /// Array of agents on the host side.
1884   const llvm::SmallVector<hsa_agent_t> Agents;
1885 
1886   /// Memory manager for kernel arguments.
1887   AMDGPUMemoryManagerTy ArgsMemoryManager;
1888 
1889   /// Memory manager for pinned memory.
1890   AMDGPUMemoryManagerTy PinnedMemoryManager;
1891 };
1892 
1893 /// Class implementing the AMDGPU device functionalities which derives from the
1894 /// generic device class.
1895 struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
1896   // Create an AMDGPU device with a device id and default AMDGPU grid values.
1897   AMDGPUDeviceTy(GenericPluginTy &Plugin, int32_t DeviceId, int32_t NumDevices,
1898                  AMDHostDeviceTy &HostDevice, hsa_agent_t Agent)
1899       : GenericDeviceTy(Plugin, DeviceId, NumDevices, {}), AMDGenericDeviceTy(),
1900         OMPX_NumQueues("LIBOMPTARGET_AMDGPU_NUM_HSA_QUEUES", 4),
1901         OMPX_QueueSize("LIBOMPTARGET_AMDGPU_HSA_QUEUE_SIZE", 512),
1902         OMPX_DefaultTeamsPerCU("LIBOMPTARGET_AMDGPU_TEAMS_PER_CU", 4),
1903         OMPX_MaxAsyncCopyBytes("LIBOMPTARGET_AMDGPU_MAX_ASYNC_COPY_BYTES",
1904                                1 * 1024 * 1024), // 1MB
1905         OMPX_InitialNumSignals("LIBOMPTARGET_AMDGPU_NUM_INITIAL_HSA_SIGNALS",
1906                                64),
1907         OMPX_StreamBusyWait("LIBOMPTARGET_AMDGPU_STREAM_BUSYWAIT", 2000000),
1908         OMPX_UseMultipleSdmaEngines(
1909             "LIBOMPTARGET_AMDGPU_USE_MULTIPLE_SDMA_ENGINES", false),
1910         OMPX_ApuMaps("OMPX_APU_MAPS", false), AMDGPUStreamManager(*this, Agent),
1911         AMDGPUEventManager(*this), AMDGPUSignalManager(*this), Agent(Agent),
1912         HostDevice(HostDevice) {}
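
  // Example (illustrative, assuming a generic ./offload-app binary): the
  // environment variables above tune queue parallelism, e.g.:
  //   $ LIBOMPTARGET_AMDGPU_NUM_HSA_QUEUES=8 \
  //     LIBOMPTARGET_AMDGPU_HSA_QUEUE_SIZE=1024 ./offload-app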
1913 
1914   ~AMDGPUDeviceTy() {}
1915 
1916   /// Initialize the device, its resources and get its properties.
1917   Error initImpl(GenericPluginTy &Plugin) override {
1918     // First setup all the memory pools.
1919     if (auto Err = initMemoryPools())
1920       return Err;
1921 
1922     char GPUName[64];
1923     if (auto Err = getDeviceAttr(HSA_AGENT_INFO_NAME, GPUName))
1924       return Err;
1925     ComputeUnitKind = GPUName;
1926 
1927     // Get the wavefront size.
1928     uint32_t WavefrontSize = 0;
1929     if (auto Err = getDeviceAttr(HSA_AGENT_INFO_WAVEFRONT_SIZE, WavefrontSize))
1930       return Err;
1931     GridValues.GV_Warp_Size = WavefrontSize;
1932 
1933     // Get the frequency of the steady clock. If the attribute is missing,
1934     // assume we are running on an older libhsa and default to 0; omp_get_wtime
1935     // will be inaccurate but programs can otherwise still run.
1936     if (getDeviceAttrRaw(HSA_AMD_AGENT_INFO_TIMESTAMP_FREQUENCY,
1937                          ClockFrequency) != HSA_STATUS_SUCCESS)
1938       ClockFrequency = 0;
1939 
1940     // Load the grid values depending on the wavefront size.
1941     if (WavefrontSize == 32)
1942       GridValues = getAMDGPUGridValues<32>();
1943     else if (WavefrontSize == 64)
1944       GridValues = getAMDGPUGridValues<64>();
1945     else
1946       return Plugin::error("Unexpected AMDGPU wavefront %d", WavefrontSize);
1947 
1948     // Get the maximum number of work-items per workgroup.
1949     uint16_t WorkgroupMaxDim[3];
1950     if (auto Err =
1951             getDeviceAttr(HSA_AGENT_INFO_WORKGROUP_MAX_DIM, WorkgroupMaxDim))
1952       return Err;
1953     GridValues.GV_Max_WG_Size = WorkgroupMaxDim[0];
1954 
1955     // Get maximum number of workgroups.
1956     hsa_dim3_t GridMaxDim;
1957     if (auto Err = getDeviceAttr(HSA_AGENT_INFO_GRID_MAX_DIM, GridMaxDim))
1958       return Err;
1959 
1960     GridValues.GV_Max_Teams = GridMaxDim.x / GridValues.GV_Max_WG_Size;
1961     if (GridValues.GV_Max_Teams == 0)
1962       return Plugin::error("Maximum number of teams cannot be zero");
1963 
1964     // Compute the default number of teams.
1965     uint32_t ComputeUnits = 0;
1966     if (auto Err =
1967             getDeviceAttr(HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, ComputeUnits))
1968       return Err;
1969     GridValues.GV_Default_Num_Teams = ComputeUnits * OMPX_DefaultTeamsPerCU;
1970 
1971     uint32_t WavesPerCU = 0;
1972     if (auto Err =
1973             getDeviceAttr(HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU, WavesPerCU))
1974       return Err;
1975     HardwareParallelism = ComputeUnits * WavesPerCU;
1976 
1977     // Get maximum size of any device queues and maximum number of queues.
1978     uint32_t MaxQueueSize;
1979     if (auto Err = getDeviceAttr(HSA_AGENT_INFO_QUEUE_MAX_SIZE, MaxQueueSize))
1980       return Err;
1981 
1982     uint32_t MaxQueues;
1983     if (auto Err = getDeviceAttr(HSA_AGENT_INFO_QUEUES_MAX, MaxQueues))
1984       return Err;
1985 
1986     // Compute the number of queues and their size.
1987     OMPX_NumQueues = std::max(1U, std::min(OMPX_NumQueues.get(), MaxQueues));
1988     OMPX_QueueSize = std::min(OMPX_QueueSize.get(), MaxQueueSize);
1989 
1990     // Initialize stream pool.
1991     if (auto Err = AMDGPUStreamManager.init(OMPX_InitialNumStreams,
1992                                             OMPX_NumQueues, OMPX_QueueSize))
1993       return Err;
1994 
1995     // Initialize event pool.
1996     if (auto Err = AMDGPUEventManager.init(OMPX_InitialNumEvents))
1997       return Err;
1998 
1999     // Initialize signal pool.
2000     if (auto Err = AMDGPUSignalManager.init(OMPX_InitialNumSignals))
2001       return Err;
2002 
2003     // Detect if XNACK is enabled.
2004     SmallVector<SmallString<32>> Targets;
2005     if (auto Err = hsa_utils::getTargetTripleAndFeatures(Agent, Targets))
2006       return Err;
2007     if (!Targets.empty() && Targets[0].str().contains("xnack+"))
2008       IsXnackEnabled = true;
2009 
2010     // Detect if the device is an APU.
2011     if (auto Err = checkIfAPU())
2012       return Err;
2013 
2014     return Plugin::success();
2015   }
2016 
2017   /// Deinitialize the device and release its resources.
2018   Error deinitImpl() override {
2019     // Deinitialize the stream and event pools.
2020     if (auto Err = AMDGPUStreamManager.deinit())
2021       return Err;
2022 
2023     if (auto Err = AMDGPUEventManager.deinit())
2024       return Err;
2025 
2026     if (auto Err = AMDGPUSignalManager.deinit())
2027       return Err;
2028 
2029     // Close modules if necessary.
2030     if (!LoadedImages.empty()) {
2031       // Each image has its own module.
2032       for (DeviceImageTy *Image : LoadedImages) {
2033         AMDGPUDeviceImageTy &AMDImage =
2034             static_cast<AMDGPUDeviceImageTy &>(*Image);
2035 
2036         // Unload the executable of the image.
2037         if (auto Err = AMDImage.unloadExecutable())
2038           return Err;
2039       }
2040     }
2041 
2042     // Invalidate agent reference.
2043     Agent = {0};
2044 
2045     return Plugin::success();
2046   }
2047 
2048   virtual Error callGlobalConstructors(GenericPluginTy &Plugin,
2049                                        DeviceImageTy &Image) override {
2050     GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler();
2051     if (Handler.isSymbolInImage(*this, Image, "amdgcn.device.fini"))
2052       Image.setPendingGlobalDtors();
2053 
2054     return callGlobalCtorDtorCommon(Plugin, Image, /*IsCtor=*/true);
2055   }
2056 
2057   virtual Error callGlobalDestructors(GenericPluginTy &Plugin,
2058                                       DeviceImageTy &Image) override {
2059     if (Image.hasPendingGlobalDtors())
2060       return callGlobalCtorDtorCommon(Plugin, Image, /*IsCtor=*/false);
2061     return Plugin::success();
2062   }
2063 
2064   uint64_t getStreamBusyWaitMicroseconds() const { return OMPX_StreamBusyWait; }
2065 
2066   Expected<std::unique_ptr<MemoryBuffer>>
2067   doJITPostProcessing(std::unique_ptr<MemoryBuffer> MB) const override {
2068 
2069     // TODO: We should try to avoid materialization but there seems to be no
2070     // good linker interface w/o file i/o.
2071     SmallString<128> LinkerInputFilePath;
2072     std::error_code EC = sys::fs::createTemporaryFile("amdgpu-pre-link-jit",
2073                                                       "o", LinkerInputFilePath);
2074     if (EC)
2075       return Plugin::error("Failed to create temporary file for linker");
2076 
2077     // Write the file's contents to the output file.
2078     Expected<std::unique_ptr<FileOutputBuffer>> OutputOrErr =
2079         FileOutputBuffer::create(LinkerInputFilePath, MB->getBuffer().size());
2080     if (!OutputOrErr)
2081       return OutputOrErr.takeError();
2082     std::unique_ptr<FileOutputBuffer> Output = std::move(*OutputOrErr);
2083     llvm::copy(MB->getBuffer(), Output->getBufferStart());
2084     if (Error E = Output->commit())
2085       return std::move(E);
2086 
2087     SmallString<128> LinkerOutputFilePath;
2088     EC = sys::fs::createTemporaryFile("amdgpu-pre-link-jit", "so",
2089                                       LinkerOutputFilePath);
2090     if (EC)
2091       return Plugin::error("Failed to create temporary file for linker");
2092 
2093     const auto &ErrorOrPath = sys::findProgramByName("lld");
2094     if (!ErrorOrPath)
2095       return createStringError(inconvertibleErrorCode(),
2096                                "Failed to find `lld` on the PATH.");
2097 
2098     std::string LLDPath = ErrorOrPath.get();
2099     INFO(OMP_INFOTYPE_PLUGIN_KERNEL, getDeviceId(),
2100          "Using `%s` to link JITed amdgcn output.", LLDPath.c_str());
2101 
2102     std::string MCPU = "-plugin-opt=mcpu=" + getComputeUnitKind();
2103     StringRef Args[] = {LLDPath,
2104                         "-flavor",
2105                         "gnu",
2106                         "--no-undefined",
2107                         "-shared",
2108                         MCPU,
2109                         "-o",
2110                         LinkerOutputFilePath.data(),
2111                         LinkerInputFilePath.data()};
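    // The resulting invocation is roughly equivalent to (illustrative):
    //   lld -flavor gnu --no-undefined -shared -plugin-opt=mcpu=<arch> \
    //       -o <output>.so <input>.o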
2112 
2113     std::string Error;
2114     int RC = sys::ExecuteAndWait(LLDPath, Args, std::nullopt, {}, 0, 0, &Error);
2115     if (RC)
2116       return Plugin::error("Linking optimized bitcode failed: %s",
2117                            Error.c_str());
2118 
2119     auto BufferOrErr = MemoryBuffer::getFileOrSTDIN(LinkerOutputFilePath);
2120     if (!BufferOrErr)
2121       return Plugin::error("Failed to open temporary file for lld");
2122 
2123     // Clean up the temporary files afterwards.
2124     if (sys::fs::remove(LinkerOutputFilePath))
2125       return Plugin::error("Failed to remove temporary output file for lld");
2126     if (sys::fs::remove(LinkerInputFilePath))
2127       return Plugin::error("Failed to remove temporary input file for lld");
2128 
2129     return std::move(*BufferOrErr);
2130   }
2131 
2132   /// See GenericDeviceTy::getComputeUnitKind().
2133   std::string getComputeUnitKind() const override { return ComputeUnitKind; }
2134 
2135   /// Returns the clock frequency for the given AMDGPU device.
2136   uint64_t getClockFrequency() const override { return ClockFrequency; }
2137 
2138   /// Allocate and construct an AMDGPU kernel.
2139   Expected<GenericKernelTy &> constructKernel(const char *Name) override {
2140     // Allocate and construct the AMDGPU kernel.
2141     AMDGPUKernelTy *AMDGPUKernel = Plugin.allocate<AMDGPUKernelTy>();
2142     if (!AMDGPUKernel)
2143       return Plugin::error("Failed to allocate memory for AMDGPU kernel");
2144 
2145     new (AMDGPUKernel) AMDGPUKernelTy(Name);
2146 
2147     return *AMDGPUKernel;
2148   }
2149 
2150   /// Set the current context to this device's context. Do nothing since the
2151   /// AMDGPU devices do not have the concept of contexts.
2152   Error setContext() override { return Plugin::success(); }
2153 
2154   /// AMDGPU returns the product of the number of compute units and the waves
2155   /// per compute unit.
2156   uint64_t getHardwareParallelism() const override {
2157     return HardwareParallelism;
2158   }
2159 
2160   /// We want to set up the RPC server for host services to the GPU if it is
2161   /// available.
2162   bool shouldSetupRPCServer() const override { return true; }
2163 
2164   /// The RPC interface should have enough space for all available parallelism.
2165   uint64_t requestedRPCPortCount() const override {
2166     return getHardwareParallelism();
2167   }
2168 
2169   /// Get the stream of the asynchronous info structure or get a new one.
2170   Error getStream(AsyncInfoWrapperTy &AsyncInfoWrapper,
2171                   AMDGPUStreamTy *&Stream) {
2172     // Get the stream (if any) from the async info.
2173     Stream = AsyncInfoWrapper.getQueueAs<AMDGPUStreamTy *>();
2174     if (!Stream) {
2175       // There was no stream; get an idle one.
2176       if (auto Err = AMDGPUStreamManager.getResource(Stream))
2177         return Err;
2178 
2179       // Modify the async info's stream.
2180       AsyncInfoWrapper.setQueueAs<AMDGPUStreamTy *>(Stream);
2181     }
2182     return Plugin::success();
2183   }
2184 
2185   /// Load the binary image into the device and allocate an image object.
2186   Expected<DeviceImageTy *> loadBinaryImpl(const __tgt_device_image *TgtImage,
2187                                            int32_t ImageId) override {
2188     // Allocate and initialize the image object.
2189     AMDGPUDeviceImageTy *AMDImage = Plugin.allocate<AMDGPUDeviceImageTy>();
2190     new (AMDImage) AMDGPUDeviceImageTy(ImageId, *this, TgtImage);
2191 
2192     // Load the HSA executable.
2193     if (Error Err = AMDImage->loadExecutable(*this))
2194       return std::move(Err);
2195 
2196     return AMDImage;
2197   }
2198 
2199   /// Allocate memory on the device or related to the device.
2200   void *allocate(size_t Size, void *, TargetAllocTy Kind) override;
2201 
2202   /// Deallocate memory on the device or related to the device.
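  /// Allocation kinds map to pools as follows: DEFAULT, DEVICE, and
  /// DEVICE_NON_BLOCKING use the device's coarse-grained pool, while HOST and
  /// SHARED use the host device's fine-grained pool.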
2203   int free(void *TgtPtr, TargetAllocTy Kind) override {
2204     if (TgtPtr == nullptr)
2205       return OFFLOAD_SUCCESS;
2206 
2207     AMDGPUMemoryPoolTy *MemoryPool = nullptr;
2208     switch (Kind) {
2209     case TARGET_ALLOC_DEFAULT:
2210     case TARGET_ALLOC_DEVICE:
2211     case TARGET_ALLOC_DEVICE_NON_BLOCKING:
2212       MemoryPool = CoarseGrainedMemoryPools[0];
2213       break;
2214     case TARGET_ALLOC_HOST:
2215       MemoryPool = &HostDevice.getFineGrainedMemoryPool();
2216       break;
2217     case TARGET_ALLOC_SHARED:
2218       MemoryPool = &HostDevice.getFineGrainedMemoryPool();
2219       break;
2220     }
2221 
2222     if (!MemoryPool) {
2223       REPORT("No memory pool for the specified allocation kind\n");
2224       return OFFLOAD_FAIL;
2225     }
2226 
2227     if (Error Err = MemoryPool->deallocate(TgtPtr)) {
2228       REPORT("%s\n", toString(std::move(Err)).data());
2229       return OFFLOAD_FAIL;
2230     }
2231 
2232     return OFFLOAD_SUCCESS;
2233   }
2234 
2235   /// Synchronize current thread with the pending operations on the async info.
2236   Error synchronizeImpl(__tgt_async_info &AsyncInfo) override {
2237     AMDGPUStreamTy *Stream =
2238         reinterpret_cast<AMDGPUStreamTy *>(AsyncInfo.Queue);
2239     assert(Stream && "Invalid stream");
2240 
2241     if (auto Err = Stream->synchronize())
2242       return Err;
2243 
2244     // Once the stream is synchronized, return it to stream pool and reset
2245     // AsyncInfo. This is to make sure the synchronization only works for its
2246     // own tasks.
2247     AsyncInfo.Queue = nullptr;
2248     return AMDGPUStreamManager.returnResource(Stream);
2249   }
2250 
2251   /// Query for the completion of the pending operations on the async info.
2252   Error queryAsyncImpl(__tgt_async_info &AsyncInfo) override {
2253     AMDGPUStreamTy *Stream =
2254         reinterpret_cast<AMDGPUStreamTy *>(AsyncInfo.Queue);
2255     assert(Stream && "Invalid stream");
2256 
2257     auto CompletedOrErr = Stream->query();
2258     if (!CompletedOrErr)
2259       return CompletedOrErr.takeError();
2260 
2261     // Return if the stream did not complete yet.
2262     if (!(*CompletedOrErr))
2263       return Plugin::success();
2264 
2265     // Once the stream is completed, return it to stream pool and reset
2266     // AsyncInfo. This is to make sure the synchronization only works for its
2267     // own tasks.
2268     AsyncInfo.Queue = nullptr;
2269     return AMDGPUStreamManager.returnResource(Stream);
2270   }
2271 
2272   /// Pin the host buffer and return the device pointer that should be used for
2273   /// device transfers.
2274   Expected<void *> dataLockImpl(void *HstPtr, int64_t Size) override {
2275     void *PinnedPtr = nullptr;
2276 
2277     hsa_status_t Status =
2278         hsa_amd_memory_lock(HstPtr, Size, nullptr, 0, &PinnedPtr);
2279     if (auto Err = Plugin::check(Status, "Error in hsa_amd_memory_lock: %s\n"))
2280       return std::move(Err);
2281 
2282     return PinnedPtr;
2283   }
2284 
2285   /// Unpin the host buffer.
2286   Error dataUnlockImpl(void *HstPtr) override {
2287     hsa_status_t Status = hsa_amd_memory_unlock(HstPtr);
2288     return Plugin::check(Status, "Error in hsa_amd_memory_unlock: %s\n");
2289   }
2290 
2291   /// Check through the HSA runtime whether the \p HstPtr buffer is pinned.
2292   Expected<bool> isPinnedPtrImpl(void *HstPtr, void *&BaseHstPtr,
2293                                  void *&BaseDevAccessiblePtr,
2294                                  size_t &BaseSize) const override {
2295     hsa_amd_pointer_info_t Info;
2296     Info.size = sizeof(hsa_amd_pointer_info_t);
2297 
2298     hsa_status_t Status = hsa_amd_pointer_info(
2299         HstPtr, &Info, /*Allocator=*/nullptr, /*num_agents_accessible=*/nullptr,
2300         /*accessible=*/nullptr);
2301     if (auto Err = Plugin::check(Status, "Error in hsa_amd_pointer_info: %s"))
2302       return std::move(Err);
2303 
2304     // The buffer may be locked or allocated through HSA allocators. Assume that
2305     // the buffer is host pinned if the runtime reports an HSA type.
2306     if (Info.type != HSA_EXT_POINTER_TYPE_LOCKED &&
2307         Info.type != HSA_EXT_POINTER_TYPE_HSA)
2308       return false;
2309 
2310     assert(Info.hostBaseAddress && "Invalid host pinned address");
2311     assert(Info.agentBaseAddress && "Invalid agent pinned address");
2312     assert(Info.sizeInBytes > 0 && "Invalid pinned allocation size");
2313 
2314     // Save the allocation info in the output parameters.
2315     BaseHstPtr = Info.hostBaseAddress;
2316     BaseDevAccessiblePtr = Info.agentBaseAddress;
2317     BaseSize = Info.sizeInBytes;
2318 
2319     return true;
2320   }
2321 
2322   /// Submit data to the device (host to device transfer).
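  /// Three strategies are used: a one-step async copy when the host buffer is
  /// already pinned, a synchronous copy for transfers of at least
  /// OMPX_MaxAsyncCopyBytes, and a two-step copy through an intermediate
  /// pinned host buffer otherwise.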
2323   Error dataSubmitImpl(void *TgtPtr, const void *HstPtr, int64_t Size,
2324                        AsyncInfoWrapperTy &AsyncInfoWrapper) override {
2325     AMDGPUStreamTy *Stream = nullptr;
2326     void *PinnedPtr = nullptr;
2327 
2328     // Use one-step asynchronous operation when host memory is already pinned.
2329     if (void *PinnedPtr =
2330             PinnedAllocs.getDeviceAccessiblePtrFromPinnedBuffer(HstPtr)) {
2331       if (auto Err = getStream(AsyncInfoWrapper, Stream))
2332         return Err;
2333       return Stream->pushPinnedMemoryCopyAsync(TgtPtr, PinnedPtr, Size);
2334     }
2335 
2336     // For large transfers use synchronous behavior.
2337     if (Size >= OMPX_MaxAsyncCopyBytes) {
2338       if (AsyncInfoWrapper.hasQueue())
2339         if (auto Err = synchronize(AsyncInfoWrapper))
2340           return Err;
2341 
2342       hsa_status_t Status;
2343       Status = hsa_amd_memory_lock(const_cast<void *>(HstPtr), Size, nullptr, 0,
2344                                    &PinnedPtr);
2345       if (auto Err =
2346               Plugin::check(Status, "Error in hsa_amd_memory_lock: %s\n"))
2347         return Err;
2348 
2349       AMDGPUSignalTy Signal;
2350       if (auto Err = Signal.init())
2351         return Err;
2352 
2353       if (auto Err = hsa_utils::asyncMemCopy(useMultipleSdmaEngines(), TgtPtr,
2354                                              Agent, PinnedPtr, Agent, Size, 0,
2355                                              nullptr, Signal.get()))
2356         return Err;
2357 
2358       if (auto Err = Signal.wait(getStreamBusyWaitMicroseconds()))
2359         return Err;
2360 
2361       if (auto Err = Signal.deinit())
2362         return Err;
2363 
2364       Status = hsa_amd_memory_unlock(const_cast<void *>(HstPtr));
2365       return Plugin::check(Status, "Error in hsa_amd_memory_unlock: %s\n");
2366     }
2367 
2368     // Otherwise, use two-step copy with an intermediate pinned host buffer.
2369     AMDGPUMemoryManagerTy &PinnedMemoryManager =
2370         HostDevice.getPinnedMemoryManager();
2371     if (auto Err = PinnedMemoryManager.allocate(Size, &PinnedPtr))
2372       return Err;
2373 
2374     if (auto Err = getStream(AsyncInfoWrapper, Stream))
2375       return Err;
2376 
2377     return Stream->pushMemoryCopyH2DAsync(TgtPtr, HstPtr, PinnedPtr, Size,
2378                                           PinnedMemoryManager);
2379   }
2380 
2381   /// Retrieve data from the device (device to host transfer).
2382   Error dataRetrieveImpl(void *HstPtr, const void *TgtPtr, int64_t Size,
2383                          AsyncInfoWrapperTy &AsyncInfoWrapper) override {
2384     AMDGPUStreamTy *Stream = nullptr;
2385     void *PinnedPtr = nullptr;
2386 
2387     // Use one-step asynchronous operation when host memory is already pinned.
2388     if (void *PinnedPtr =
2389             PinnedAllocs.getDeviceAccessiblePtrFromPinnedBuffer(HstPtr)) {
2390       if (auto Err = getStream(AsyncInfoWrapper, Stream))
2391         return Err;
2392 
2393       return Stream->pushPinnedMemoryCopyAsync(PinnedPtr, TgtPtr, Size);
2394     }
2395 
2396     // For large transfers use synchronous behavior.
2397     if (Size >= OMPX_MaxAsyncCopyBytes) {
2398       if (AsyncInfoWrapper.hasQueue())
2399         if (auto Err = synchronize(AsyncInfoWrapper))
2400           return Err;
2401 
2402       hsa_status_t Status;
2403       Status = hsa_amd_memory_lock(const_cast<void *>(HstPtr), Size, nullptr, 0,
2404                                    &PinnedPtr);
2405       if (auto Err =
2406               Plugin::check(Status, "Error in hsa_amd_memory_lock: %s\n"))
2407         return Err;
2408 
2409       AMDGPUSignalTy Signal;
2410       if (auto Err = Signal.init())
2411         return Err;
2412 
2413       if (auto Err = hsa_utils::asyncMemCopy(useMultipleSdmaEngines(),
2414                                              PinnedPtr, Agent, TgtPtr, Agent,
2415                                              Size, 0, nullptr, Signal.get()))
2416         return Err;
2417 
2418       if (auto Err = Signal.wait(getStreamBusyWaitMicroseconds()))
2419         return Err;
2420 
2421       if (auto Err = Signal.deinit())
2422         return Err;
2423 
2424       Status = hsa_amd_memory_unlock(const_cast<void *>(HstPtr));
2425       return Plugin::check(Status, "Error in hsa_amd_memory_unlock: %s\n");
2426     }
2427 
2428     // Otherwise, use two-step copy with an intermediate pinned host buffer.
2429     AMDGPUMemoryManagerTy &PinnedMemoryManager =
2430         HostDevice.getPinnedMemoryManager();
2431     if (auto Err = PinnedMemoryManager.allocate(Size, &PinnedPtr))
2432       return Err;
2433 
2434     if (auto Err = getStream(AsyncInfoWrapper, Stream))
2435       return Err;
2436 
2437     return Stream->pushMemoryCopyD2HAsync(HstPtr, TgtPtr, PinnedPtr, Size,
2438                                           PinnedMemoryManager);
2439   }
2440 
2441   /// Exchange data between two devices within the plugin.
2442   Error dataExchangeImpl(const void *SrcPtr, GenericDeviceTy &DstGenericDevice,
2443                          void *DstPtr, int64_t Size,
2444                          AsyncInfoWrapperTy &AsyncInfoWrapper) override {
2445     AMDGPUDeviceTy &DstDevice = static_cast<AMDGPUDeviceTy &>(DstGenericDevice);
2446 
2447     // For large transfers use synchronous behavior.
2448     if (Size >= OMPX_MaxAsyncCopyBytes) {
2449       if (AsyncInfoWrapper.hasQueue())
2450         if (auto Err = synchronize(AsyncInfoWrapper))
2451           return Err;
2452 
2453       AMDGPUSignalTy Signal;
2454       if (auto Err = Signal.init())
2455         return Err;
2456 
2457       if (auto Err = hsa_utils::asyncMemCopy(
2458               useMultipleSdmaEngines(), DstPtr, DstDevice.getAgent(), SrcPtr,
2459               getAgent(), (uint64_t)Size, 0, nullptr, Signal.get()))
2460         return Err;
2461 
2462       if (auto Err = Signal.wait(getStreamBusyWaitMicroseconds()))
2463         return Err;
2464 
2465       return Signal.deinit();
2466     }
2467 
2468     AMDGPUStreamTy *Stream = nullptr;
2469     if (auto Err = getStream(AsyncInfoWrapper, Stream))
2470       return Err;
2471     if (Size <= 0)
2472       return Plugin::success();
2473 
2474     return Stream->pushMemoryCopyD2DAsync(DstPtr, DstDevice.getAgent(), SrcPtr,
2475                                           getAgent(), (uint64_t)Size);
2476   }
2477 
2478   /// Initialize the async info for interoperability purposes.
2479   Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override {
2480     // TODO: Implement this function.
2481     return Plugin::success();
2482   }
2483 
2484   /// Initialize the device info for interoperability purposes.
2485   Error initDeviceInfoImpl(__tgt_device_info *DeviceInfo) override {
2486     DeviceInfo->Context = nullptr;
2487 
2488     if (!DeviceInfo->Device)
2489       DeviceInfo->Device = reinterpret_cast<void *>(Agent.handle);
2490 
2491     return Plugin::success();
2492   }
2493 
2494   /// Create an event.
2495   Error createEventImpl(void **EventPtrStorage) override {
2496     AMDGPUEventTy **Event = reinterpret_cast<AMDGPUEventTy **>(EventPtrStorage);
2497     return AMDGPUEventManager.getResource(*Event);
2498   }
2499 
2500   /// Destroy a previously created event.
2501   Error destroyEventImpl(void *EventPtr) override {
2502     AMDGPUEventTy *Event = reinterpret_cast<AMDGPUEventTy *>(EventPtr);
2503     return AMDGPUEventManager.returnResource(Event);
2504   }
2505 
2506   /// Record the event.
2507   Error recordEventImpl(void *EventPtr,
2508                         AsyncInfoWrapperTy &AsyncInfoWrapper) override {
2509     AMDGPUEventTy *Event = reinterpret_cast<AMDGPUEventTy *>(EventPtr);
2510     assert(Event && "Invalid event");
2511 
2512     AMDGPUStreamTy *Stream = nullptr;
2513     if (auto Err = getStream(AsyncInfoWrapper, Stream))
2514       return Err;
2515 
2516     return Event->record(*Stream);
2517   }
2518 
2519   /// Make the stream wait on the event.
2520   Error waitEventImpl(void *EventPtr,
2521                       AsyncInfoWrapperTy &AsyncInfoWrapper) override {
2522     AMDGPUEventTy *Event = reinterpret_cast<AMDGPUEventTy *>(EventPtr);
2523 
2524     AMDGPUStreamTy *Stream = nullptr;
2525     if (auto Err = getStream(AsyncInfoWrapper, Stream))
2526       return Err;
2527 
2528     return Event->wait(*Stream);
2529   }
2530 
2531   /// Synchronize the current thread with the event.
2532   Error syncEventImpl(void *EventPtr) override {
2533     return Plugin::error("Synchronize event not implemented");
2534   }
2535 
2536   /// Print information about the device.
2537   Error obtainInfoImpl(InfoQueueTy &Info) override {
2538     char TmpChar[1000];
2539     const char *TmpCharPtr = "Unknown";
2540     uint16_t Major, Minor;
2541     uint32_t TmpUInt, TmpUInt2;
2542     uint32_t CacheSize[4];
2543     size_t TmpSt;
2544     bool TmpBool;
2545     uint16_t WorkgrpMaxDim[3];
2546     hsa_dim3_t GridMaxDim;
2547     hsa_status_t Status, Status2;
2548 
2549     Status = hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MAJOR, &Major);
2550     Status2 = hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MINOR, &Minor);
2551     if (Status == HSA_STATUS_SUCCESS && Status2 == HSA_STATUS_SUCCESS)
2552       Info.add("HSA Runtime Version",
2553                std::to_string(Major) + "." + std::to_string(Minor));
2554 
2555     Info.add("HSA OpenMP Device Number", DeviceId);
2556 
2557     Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_PRODUCT_NAME, TmpChar);
2558     if (Status == HSA_STATUS_SUCCESS)
2559       Info.add("Product Name", TmpChar);
2560 
2561     Status = getDeviceAttrRaw(HSA_AGENT_INFO_NAME, TmpChar);
2562     if (Status == HSA_STATUS_SUCCESS)
2563       Info.add("Device Name", TmpChar);
2564 
2565     Status = getDeviceAttrRaw(HSA_AGENT_INFO_VENDOR_NAME, TmpChar);
2566     if (Status == HSA_STATUS_SUCCESS)
2567       Info.add("Vendor Name", TmpChar);
2568 
2569     hsa_device_type_t DevType;
2570     Status = getDeviceAttrRaw(HSA_AGENT_INFO_DEVICE, DevType);
2571     if (Status == HSA_STATUS_SUCCESS) {
2572       switch (DevType) {
2573       case HSA_DEVICE_TYPE_CPU:
2574         TmpCharPtr = "CPU";
2575         break;
2576       case HSA_DEVICE_TYPE_GPU:
2577         TmpCharPtr = "GPU";
2578         break;
2579       case HSA_DEVICE_TYPE_DSP:
2580         TmpCharPtr = "DSP";
2581         break;
2582       }
2583       Info.add("Device Type", TmpCharPtr);
2584     }
2585 
2586     Status = getDeviceAttrRaw(HSA_AGENT_INFO_QUEUES_MAX, TmpUInt);
2587     if (Status == HSA_STATUS_SUCCESS)
2588       Info.add("Max Queues", TmpUInt);
2589 
2590     Status = getDeviceAttrRaw(HSA_AGENT_INFO_QUEUE_MIN_SIZE, TmpUInt);
2591     if (Status == HSA_STATUS_SUCCESS)
2592       Info.add("Queue Min Size", TmpUInt);
2593 
2594     Status = getDeviceAttrRaw(HSA_AGENT_INFO_QUEUE_MAX_SIZE, TmpUInt);
2595     if (Status == HSA_STATUS_SUCCESS)
2596       Info.add("Queue Max Size", TmpUInt);
2597 
2598     // FIXME: This is deprecated according to HSA documentation. But using
2599     // hsa_agent_iterate_caches and hsa_cache_get_info breaks execution during
2600     // runtime.
2601     Status = getDeviceAttrRaw(HSA_AGENT_INFO_CACHE_SIZE, CacheSize);
2602     if (Status == HSA_STATUS_SUCCESS) {
2603       Info.add("Cache");
2604 
2605       for (int I = 0; I < 4; I++)
2606         if (CacheSize[I])
2607           Info.add<InfoLevel2>("L" + std::to_string(I), CacheSize[I]);
2608     }
2609 
2610     Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_CACHELINE_SIZE, TmpUInt);
2611     if (Status == HSA_STATUS_SUCCESS)
2612       Info.add("Cacheline Size", TmpUInt);
2613 
2614     Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY, TmpUInt);
2615     if (Status == HSA_STATUS_SUCCESS)
2616       Info.add("Max Clock Freq", TmpUInt, "MHz");
2617 
2618     Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, TmpUInt);
2619     if (Status == HSA_STATUS_SUCCESS)
2620       Info.add("Compute Units", TmpUInt);
2621 
2622     Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU, TmpUInt);
2623     if (Status == HSA_STATUS_SUCCESS)
2624       Info.add("SIMD per CU", TmpUInt);
2625 
2626     Status = getDeviceAttrRaw(HSA_AGENT_INFO_FAST_F16_OPERATION, TmpBool);
2627     if (Status == HSA_STATUS_SUCCESS)
2628       Info.add("Fast F16 Operation", TmpBool);
2629 
2630     Status = getDeviceAttrRaw(HSA_AGENT_INFO_WAVEFRONT_SIZE, TmpUInt2);
2631     if (Status == HSA_STATUS_SUCCESS)
2632       Info.add("Wavefront Size", TmpUInt2);
2633 
2634     Status = getDeviceAttrRaw(HSA_AGENT_INFO_WORKGROUP_MAX_SIZE, TmpUInt);
2635     if (Status == HSA_STATUS_SUCCESS)
2636       Info.add("Workgroup Max Size", TmpUInt);
2637 
2638     Status = getDeviceAttrRaw(HSA_AGENT_INFO_WORKGROUP_MAX_DIM, WorkgrpMaxDim);
2639     if (Status == HSA_STATUS_SUCCESS) {
2640       Info.add("Workgroup Max Size per Dimension");
2641       Info.add<InfoLevel2>("x", WorkgrpMaxDim[0]);
2642       Info.add<InfoLevel2>("y", WorkgrpMaxDim[1]);
2643       Info.add<InfoLevel2>("z", WorkgrpMaxDim[2]);
2644     }
2645 
2646     Status = getDeviceAttrRaw(
2647         (hsa_agent_info_t)HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU, TmpUInt);
2648     if (Status == HSA_STATUS_SUCCESS) {
2649       Info.add("Max Waves Per CU", TmpUInt);
2650       Info.add("Max Work-item Per CU", TmpUInt * TmpUInt2);
2651     }
2652 
2653     Status = getDeviceAttrRaw(HSA_AGENT_INFO_GRID_MAX_SIZE, TmpUInt);
2654     if (Status == HSA_STATUS_SUCCESS)
2655       Info.add("Grid Max Size", TmpUInt);
2656 
2657     Status = getDeviceAttrRaw(HSA_AGENT_INFO_GRID_MAX_DIM, GridMaxDim);
2658     if (Status == HSA_STATUS_SUCCESS) {
2659       Info.add("Grid Max Size per Dimension");
2660       Info.add<InfoLevel2>("x", GridMaxDim.x);
2661       Info.add<InfoLevel2>("y", GridMaxDim.y);
2662       Info.add<InfoLevel2>("z", GridMaxDim.z);
2663     }
2664 
2665     Status = getDeviceAttrRaw(HSA_AGENT_INFO_FBARRIER_MAX_SIZE, TmpUInt);
2666     if (Status == HSA_STATUS_SUCCESS)
2667       Info.add("Max fbarriers/Workgrp", TmpUInt);
2668 
2669     Info.add("Memory Pools");
2670     for (AMDGPUMemoryPoolTy *Pool : AllMemoryPools) {
2671       std::string TmpStr, TmpStr2;
2672 
2673       if (Pool->isGlobal())
2674         TmpStr = "Global";
2675       else if (Pool->isReadOnly())
2676         TmpStr = "ReadOnly";
2677       else if (Pool->isPrivate())
2678         TmpStr = "Private";
2679       else if (Pool->isGroup())
2680         TmpStr = "Group";
2681       else
2682         TmpStr = "Unknown";
2683 
2684       Info.add<InfoLevel2>(std::string("Pool ") + TmpStr);
2685 
2686       if (Pool->isGlobal()) {
2687         if (Pool->isFineGrained())
2688           TmpStr2 += "Fine Grained ";
2689         if (Pool->isCoarseGrained())
2690           TmpStr2 += "Coarse Grained ";
2691         if (Pool->supportsKernelArgs())
2692           TmpStr2 += "Kernarg ";
2693 
2694         Info.add<InfoLevel3>("Flags", TmpStr2);
2695       }
2696 
2697       Status = Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_SIZE, TmpSt);
2698       if (Status == HSA_STATUS_SUCCESS)
2699         Info.add<InfoLevel3>("Size", TmpSt, "bytes");
2700 
2701       Status = Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED,
2702                                 TmpBool);
2703       if (Status == HSA_STATUS_SUCCESS)
2704         Info.add<InfoLevel3>("Allocatable", TmpBool);
2705 
2706       Status = Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE,
2707                                 TmpSt);
2708       if (Status == HSA_STATUS_SUCCESS)
2709         Info.add<InfoLevel3>("Runtime Alloc Granule", TmpSt, "bytes");
2710 
2711       Status = Pool->getAttrRaw(
2712           HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT, TmpSt);
2713       if (Status == HSA_STATUS_SUCCESS)
2714         Info.add<InfoLevel3>("Runtime Alloc Alignment", TmpSt, "bytes");
2715 
2716       Status =
2717           Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL, TmpBool);
2718       if (Status == HSA_STATUS_SUCCESS)
2719         Info.add<InfoLevel3>("Accessible by all", TmpBool);
2720     }
2721 
2722     Info.add("ISAs");
2723     auto Err = hsa_utils::iterateAgentISAs(getAgent(), [&](hsa_isa_t ISA) {
2724       Status = hsa_isa_get_info_alt(ISA, HSA_ISA_INFO_NAME, TmpChar);
2725       if (Status == HSA_STATUS_SUCCESS)
2726         Info.add<InfoLevel2>("Name", TmpChar);
2727 
2728       return Status;
2729     });
2730 
2731     // Silently consume the error.
2732     if (Err)
2733       consumeError(std::move(Err));
2734 
2735     return Plugin::success();
2736   }
2737 
2738   /// Returns true if auto zero-copy is the best configuration for the
2739   /// current arch. On AMDGPUs, automatic zero-copy is turned on when
2740   /// running on an APU with XNACK (unified memory) support enabled. On
2741   /// discrete GPUs, automatic zero-copy is triggered if the user sets the
2742   /// environment variable OMPX_APU_MAPS=1 and if XNACK is enabled. The
2743   /// rationale is that zero-copy is the best configuration (performance,
2744   /// memory footprint) on APUs, while it is often not the best on discrete
2745   /// GPUs. XNACK can be enabled with a kernel boot parameter or with the
2746   /// HSA_XNACK environment variable.
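  ///
  /// A hedged usage sketch (application name illustrative): forcing automatic
  /// zero-copy on a discrete, XNACK-capable GPU from the shell:
  ///
  ///   HSA_XNACK=1 OMPX_APU_MAPS=1 ./offload-app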
2749   bool useAutoZeroCopyImpl() override {
2750     return ((IsAPU || OMPX_ApuMaps) && IsXnackEnabled);
2751   }
2752 
2753   /// Getters and setters for stack and heap sizes.
2754   Error getDeviceStackSize(uint64_t &Value) override {
2755     Value = StackSize;
2756     return Plugin::success();
2757   }
2758   Error setDeviceStackSize(uint64_t Value) override {
2759     StackSize = Value;
2760     return Plugin::success();
2761   }
2762   Error getDeviceHeapSize(uint64_t &Value) override {
2763     Value = DeviceMemoryPoolSize;
2764     return Plugin::success();
2765   }
2766   Error setDeviceHeapSize(uint64_t Value) override {
2767     for (DeviceImageTy *Image : LoadedImages)
2768       if (auto Err = setupDeviceMemoryPool(Plugin, *Image, Value))
2769         return Err;
2770     DeviceMemoryPoolSize = Value;
2771     return Plugin::success();
2772   }
2773   Error getDeviceMemorySize(uint64_t &Value) override {
2774     for (AMDGPUMemoryPoolTy *Pool : AllMemoryPools) {
2775       if (Pool->isGlobal()) {
2776         hsa_status_t Status =
2777             Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_SIZE, Value);
2778         return Plugin::check(Status, "Error in getting device memory size: %s");
2779       }
2780     }
2781     return Plugin::error("getDeviceMemorySize: no global pool");
2782   }
2783 
2784   /// AMDGPU-specific function to get device attributes.
2785   template <typename Ty> Error getDeviceAttr(uint32_t Kind, Ty &Value) {
2786     hsa_status_t Status =
2787         hsa_agent_get_info(Agent, (hsa_agent_info_t)Kind, &Value);
2788     return Plugin::check(Status, "Error in hsa_agent_get_info: %s");
2789   }
2790 
2791   template <typename Ty>
2792   hsa_status_t getDeviceAttrRaw(uint32_t Kind, Ty &Value) {
2793     return hsa_agent_get_info(Agent, (hsa_agent_info_t)Kind, &Value);
2794   }
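
  // A minimal usage sketch for the helpers above (illustrative only; the
  // enumerator is part of the HSA AMD extension API, error handling elided):
  //
  //   uint32_t NumComputeUnits = 0;
  //   if (getDeviceAttrRaw(HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT,
  //                        NumComputeUnits) == HSA_STATUS_SUCCESS)
  //     DP("Device has %u compute units\n", NumComputeUnits);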
2795 
2796   /// Get the device agent.
2797   hsa_agent_t getAgent() const override { return Agent; }
2798 
2799   /// Get the signal manager.
2800   AMDGPUSignalManagerTy &getSignalManager() { return AMDGPUSignalManager; }
2801 
2802   /// Retrieve and construct all memory pools of the device agent.
2803   Error retrieveAllMemoryPools() override {
2804     // Iterate through the available pools of the device agent.
2805     return hsa_utils::iterateAgentMemoryPools(
2806         Agent, [&](hsa_amd_memory_pool_t HSAMemoryPool) {
2807           AMDGPUMemoryPoolTy *MemoryPool =
2808               Plugin.allocate<AMDGPUMemoryPoolTy>();
2809           new (MemoryPool) AMDGPUMemoryPoolTy(HSAMemoryPool);
2810           AllMemoryPools.push_back(MemoryPool);
2811           return HSA_STATUS_SUCCESS;
2812         });
2813   }
2814 
2815   bool useMultipleSdmaEngines() const { return OMPX_UseMultipleSdmaEngines; }
2816 
2817 private:
2818   using AMDGPUEventRef = AMDGPUResourceRef<AMDGPUEventTy>;
2819   using AMDGPUEventManagerTy = GenericDeviceResourceManagerTy<AMDGPUEventRef>;
2820 
2821   /// Common method to invoke a single threaded constructor or destructor
2822   /// kernel by name.
2823   Error callGlobalCtorDtorCommon(GenericPluginTy &Plugin, DeviceImageTy &Image,
2824                                  bool IsCtor) {
2825     const char *KernelName =
2826         IsCtor ? "amdgcn.device.init" : "amdgcn.device.fini";
2827     // Perform a quick check for the named kernel in the image. The kernel
2828     // should be created by the 'amdgpu-lower-ctor-dtor' pass.
2829     GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler();
2830     if (IsCtor && !Handler.isSymbolInImage(*this, Image, KernelName))
2831       return Plugin::success();
2832 
2833     // Allocate and construct the AMDGPU kernel.
2834     AMDGPUKernelTy AMDGPUKernel(KernelName);
2835     if (auto Err = AMDGPUKernel.init(*this, Image))
2836       return Err;
2837 
2838     AsyncInfoWrapperTy AsyncInfoWrapper(*this, nullptr);
2839 
2840     KernelArgsTy KernelArgs = {};
2841     uint32_t NumBlocksAndThreads[3] = {1u, 1u, 1u};
2842     if (auto Err = AMDGPUKernel.launchImpl(
2843             *this, NumBlocksAndThreads, NumBlocksAndThreads, KernelArgs,
2844             KernelLaunchParamsTy{}, AsyncInfoWrapper))
2845       return Err;
2846 
2847     Error Err = Plugin::success();
2848     AsyncInfoWrapper.finalize(Err);
2849 
2850     return Err;
2851   }
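
  // Usage sketch (hedged; mirrors how the constructor/destructor entry points
  // of this plugin are expected to call this helper):
  //
  //   if (auto Err = callGlobalCtorDtorCommon(Plugin, Image, /*IsCtor=*/true))
  //     return Err;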
2852 
2853   /// Detect if current architecture is an APU.
2854   Error checkIfAPU() {
2855     // TODO: replace with ROCr API once it becomes available.
2856     llvm::StringRef StrGfxName(ComputeUnitKind);
2857     IsAPU = llvm::StringSwitch<bool>(StrGfxName)
2858                 .Case("gfx940", true)
2859                 .Default(false);
2860     if (IsAPU)
2861       return Plugin::success();
2862 
2863     bool MayBeAPU = llvm::StringSwitch<bool>(StrGfxName)
2864                         .Case("gfx942", true)
2865                         .Default(false);
2866     if (!MayBeAPU)
2867       return Plugin::success();
2868 
2869     // The chip can be an MI300A or an MI300X.
2870     uint32_t ChipID = 0;
2871     if (auto Err = getDeviceAttr(HSA_AMD_AGENT_INFO_CHIP_ID, ChipID))
2872       return Err;
2873 
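    // The parity of the chip ID distinguishes the two variants: an even ID is
    // treated as the APU (MI300A), an odd ID as the discrete GPU (MI300X).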
2874     if (!(ChipID & 0x1)) {
2875       IsAPU = true;
2876       return Plugin::success();
2877     }
2878     return Plugin::success();
2879   }
2880 
2881   /// Envar for controlling the number of HSA queues per device. A high
2882   /// number of queues may degrade performance.
2883   UInt32Envar OMPX_NumQueues;
2884 
2885   /// Envar for controlling the size of each HSA queue. The size is the number
2886   /// of HSA packets a queue is expected to hold. It is also the number of HSA
2887   /// packets that can be pushed into each queue without waiting for the
2888   /// driver to process them.
2889   UInt32Envar OMPX_QueueSize;
2890 
2891   /// Envar for controlling the default number of teams relative to the number
2892   /// of compute units (CUs) the device has:
2893   ///   #default_teams = OMPX_DefaultTeamsPerCU * #CUs.
2894   UInt32Envar OMPX_DefaultTeamsPerCU;
2895 
2896   /// Envar specifying the maximum size in bytes up to which memory copies
2897   /// are asynchronous operations pushed to the corresponding stream. Larger
2898   /// transfers are performed synchronously.
2900   UInt32Envar OMPX_MaxAsyncCopyBytes;
2901 
2902   /// Envar controlling the initial number of HSA signals per device. There is
2903   /// one manager of signals per device managing several pre-allocated signals.
2904   /// These signals are mainly used by AMDGPU streams. If needed, more signals
2905   /// will be created.
2906   UInt32Envar OMPX_InitialNumSignals;
2907 
2908   /// Environment variable to set the time to wait in active state before
2909   /// switching to blocked state. The default 2000000 busy-waits for 2 seconds
2910   /// before going into a blocking HSA wait state. The unit for this variable
2911   /// is microseconds.
2912   UInt32Envar OMPX_StreamBusyWait;
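
  // A hedged sketch of how such an envar is typically bound (the variable name
  // and default below are illustrative; the actual bindings happen in this
  // device's constructor, outside this excerpt):
  //
  //   UInt32Envar OMPX_StreamBusyWait("LIBOMPTARGET_AMDGPU_STREAM_BUSYWAIT",
  //                                   2000000);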
2913 
2914   /// Use ROCm 5.7 interface for multiple SDMA engines
2915   BoolEnvar OMPX_UseMultipleSdmaEngines;
2916 
2917   /// Value of OMPX_APU_MAPS env var used to force
2918   /// automatic zero-copy behavior on non-APU GPUs.
2919   BoolEnvar OMPX_ApuMaps;
2920 
2921   /// Stream manager for AMDGPU streams.
2922   AMDGPUStreamManagerTy AMDGPUStreamManager;
2923 
2924   /// Event manager for AMDGPU events.
2925   AMDGPUEventManagerTy AMDGPUEventManager;
2926 
2927   /// Signal manager for AMDGPU signals.
2928   AMDGPUSignalManagerTy AMDGPUSignalManager;
2929 
2930   /// The agent handle corresponding to the device.
2931   hsa_agent_t Agent;
2932 
2933   /// The GPU architecture.
2934   std::string ComputeUnitKind;
2935 
2936   /// The frequency of the steady clock inside the device.
2937   uint64_t ClockFrequency;
2938 
2939   /// The total number of concurrent work items that can be running on the GPU.
2940   uint64_t HardwareParallelism;
2941 
2942   /// Reference to the host device.
2943   AMDHostDeviceTy &HostDevice;
2944 
2945   /// The current size of the global device memory pool (managed by us).
2946   uint64_t DeviceMemoryPoolSize = 1L << 29L /* 512 MB */;
2947 
2948   /// The current size of the stack that will be used in cases where it could
2949   /// not be statically determined.
2950   uint64_t StackSize = 16 * 1024 /* 16 KB */;
2951 
2952   /// Is the plugin associated with an APU?
2953   bool IsAPU = false;
2954 
2955   /// True if the system is configured with XNACK enabled.
2956   /// False otherwise.
2957   bool IsXnackEnabled = false;
2958 };
2959 
2960 Error AMDGPUDeviceImageTy::loadExecutable(const AMDGPUDeviceTy &Device) {
2961   hsa_code_object_reader_t Reader;
2962   hsa_status_t Status =
2963       hsa_code_object_reader_create_from_memory(getStart(), getSize(), &Reader);
2964   if (auto Err = Plugin::check(
2965           Status, "Error in hsa_code_object_reader_create_from_memory: %s"))
2966     return Err;
2967 
2968   Status = hsa_executable_create_alt(
2969       HSA_PROFILE_FULL, HSA_DEFAULT_FLOAT_ROUNDING_MODE_ZERO, "", &Executable);
2970   if (auto Err =
2971           Plugin::check(Status, "Error in hsa_executable_create_alt: %s"))
2972     return Err;
2973 
2974   hsa_loaded_code_object_t Object;
2975   Status = hsa_executable_load_agent_code_object(Executable, Device.getAgent(),
2976                                                  Reader, "", &Object);
2977   if (auto Err = Plugin::check(
2978           Status, "Error in hsa_executable_load_agent_code_object: %s"))
2979     return Err;
2980 
2981   Status = hsa_executable_freeze(Executable, "");
2982   if (auto Err = Plugin::check(Status, "Error in hsa_executable_freeze: %s"))
2983     return Err;
2984 
2985   uint32_t Result;
2986   Status = hsa_executable_validate(Executable, &Result);
2987   if (auto Err = Plugin::check(Status, "Error in hsa_executable_validate: %s"))
2988     return Err;
2989 
2990   if (Result)
2991     return Plugin::error("Loaded HSA executable does not validate");
2992 
2993   Status = hsa_code_object_reader_destroy(Reader);
2994   if (auto Err =
2995           Plugin::check(Status, "Error in hsa_code_object_reader_destroy: %s"))
2996     return Err;
2997 
2998   if (auto Err = hsa_utils::readAMDGPUMetaDataFromImage(
2999           getMemoryBuffer(), KernelInfoMap, ELFABIVersion))
3000     return Err;
3001 
3002   return Plugin::success();
3003 }
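
// A minimal usage sketch for the loader above (illustrative; error handling
// shortened). On success the HSA executable is frozen and validated:
//
//   if (auto Err = Image.loadExecutable(Device))
//     return Err;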
3004 
3005 Expected<hsa_executable_symbol_t>
3006 AMDGPUDeviceImageTy::findDeviceSymbol(GenericDeviceTy &Device,
3007                                       StringRef SymbolName) const {
3008 
3009   AMDGPUDeviceTy &AMDGPUDevice = static_cast<AMDGPUDeviceTy &>(Device);
3010   hsa_agent_t Agent = AMDGPUDevice.getAgent();
3011 
3012   hsa_executable_symbol_t Symbol;
3013   hsa_status_t Status = hsa_executable_get_symbol_by_name(
3014       Executable, SymbolName.data(), &Agent, &Symbol);
3015   if (auto Err = Plugin::check(
3016           Status, "Error in hsa_executable_get_symbol_by_name(%s): %s",
3017           SymbolName.data()))
3018     return std::move(Err);
3019 
3020   return Symbol;
3021 }
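
// Usage sketch (the symbol name is hypothetical): resolving a kernel
// descriptor in the loaded executable.
//
//   auto SymbolOrErr = Image.findDeviceSymbol(Device, "my_kernel.kd");
//   if (!SymbolOrErr)
//     return SymbolOrErr.takeError();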
3022 
3023 template <typename ResourceTy>
3024 Error AMDGPUResourceRef<ResourceTy>::create(GenericDeviceTy &Device) {
3025   if (Resource)
3026     return Plugin::error("Creating an existing resource");
3027 
3028   AMDGPUDeviceTy &AMDGPUDevice = static_cast<AMDGPUDeviceTy &>(Device);
3029 
3030   Resource = new ResourceTy(AMDGPUDevice);
3031 
3032   return Resource->init();
3033 }
3034 
3035 AMDGPUStreamTy::AMDGPUStreamTy(AMDGPUDeviceTy &Device)
3036     : Agent(Device.getAgent()), Queue(nullptr),
3037       SignalManager(Device.getSignalManager()), Device(Device),
3038       // Initialize the std::deque with some empty positions.
3039       Slots(32), NextSlot(0), SyncCycle(0),
3040       StreamBusyWaitMicroseconds(Device.getStreamBusyWaitMicroseconds()),
3041       UseMultipleSdmaEngines(Device.useMultipleSdmaEngines()) {}
3042 
3043 /// Class implementing the AMDGPU-specific functionalities of the global
3044 /// handler.
3045 struct AMDGPUGlobalHandlerTy final : public GenericGlobalHandlerTy {
3046   /// Get the metadata of a global from the device. The name and size of the
3047   /// global are read from DeviceGlobal, and the address of the global is
3048   /// written to DeviceGlobal.
3049   Error getGlobalMetadataFromDevice(GenericDeviceTy &Device,
3050                                     DeviceImageTy &Image,
3051                                     GlobalTy &DeviceGlobal) override {
3052     AMDGPUDeviceImageTy &AMDImage = static_cast<AMDGPUDeviceImageTy &>(Image);
3053 
3054     // Find the symbol on the device executable.
3055     auto SymbolOrErr =
3056         AMDImage.findDeviceSymbol(Device, DeviceGlobal.getName());
3057     if (!SymbolOrErr)
3058       return SymbolOrErr.takeError();
3059 
3060     hsa_executable_symbol_t Symbol = *SymbolOrErr;
3061     hsa_symbol_kind_t SymbolType;
3062     hsa_status_t Status;
3063     uint64_t SymbolAddr;
3064     uint32_t SymbolSize;
3065 
3066     // Retrieve the type, address and size of the symbol.
3067     std::pair<hsa_executable_symbol_info_t, void *> RequiredInfos[] = {
3068         {HSA_EXECUTABLE_SYMBOL_INFO_TYPE, &SymbolType},
3069         {HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &SymbolAddr},
3070         {HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE, &SymbolSize}};
3071 
3072     for (auto &Info : RequiredInfos) {
3073       Status = hsa_executable_symbol_get_info(Symbol, Info.first, Info.second);
3074       if (auto Err = Plugin::check(
3075               Status, "Error in hsa_executable_symbol_get_info: %s"))
3076         return Err;
3077     }
3078 
3079     // Check the size of the symbol.
3080     if (SymbolSize != DeviceGlobal.getSize())
3081       return Plugin::error(
3082           "Failed to load global '%s' due to size mismatch (%zu != %zu)",
3083           DeviceGlobal.getName().data(), SymbolSize,
3084           (size_t)DeviceGlobal.getSize());
3085 
3086     // Store the symbol address on the device global metadata.
3087     DeviceGlobal.setPtr(reinterpret_cast<void *>(SymbolAddr));
3088 
3089     return Plugin::success();
3090   }
3091 };
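
// A hedged usage sketch for the handler above (the global name and size are
// hypothetical): query the device address of a global.
//
//   GlobalTy DeviceGlobal("my_device_counter", /*Size=*/sizeof(uint64_t));
//   if (auto Err = Handler.getGlobalMetadataFromDevice(Device, Image,
//                                                      DeviceGlobal))
//     return Err;
//   void *DevicePtr = DeviceGlobal.getPtr(); // Address in device memory.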
3092 
3093 /// Class implementing the AMDGPU-specific functionalities of the plugin.
3094 struct AMDGPUPluginTy final : public GenericPluginTy {
3095   /// Create an AMDGPU plugin and initialize the AMDGPU driver.
3096   AMDGPUPluginTy()
3097       : GenericPluginTy(getTripleArch()), Initialized(false),
3098         HostDevice(nullptr) {}
3099 
3100   /// This class should not be copied or moved.
3101   AMDGPUPluginTy(const AMDGPUPluginTy &) = delete;
3102   AMDGPUPluginTy(AMDGPUPluginTy &&) = delete;
3103 
3104   /// Initialize the plugin and return the number of devices.
3105   Expected<int32_t> initImpl() override {
3106     hsa_status_t Status = hsa_init();
3107     if (Status != HSA_STATUS_SUCCESS) {
3108       // Cannot call hsa_status_string.
3109       DP("Failed to initialize AMDGPU's HSA library\n");
3110       return 0;
3111     }
3112 
3113     // The initialization of HSA was successful. It should be safe to call
3114     // HSA functions from now on, e.g., hsa_shut_down.
3115     Initialized = true;
3116 
3117     // Register event handler to detect memory errors on the devices.
3118     Status = hsa_amd_register_system_event_handler(eventHandler, this);
3119     if (auto Err = Plugin::check(
3120             Status, "Error in hsa_amd_register_system_event_handler: %s"))
3121       return std::move(Err);
3122 
3123     // List of host (CPU) agents.
3124     llvm::SmallVector<hsa_agent_t> HostAgents;
3125 
3126     // Count the number of available agents.
3127     auto Err = hsa_utils::iterateAgents([&](hsa_agent_t Agent) {
3128       // Get the device type of the agent.
3129       hsa_device_type_t DeviceType;
3130       hsa_status_t Status =
3131           hsa_agent_get_info(Agent, HSA_AGENT_INFO_DEVICE, &DeviceType);
3132       if (Status != HSA_STATUS_SUCCESS)
3133         return Status;
3134 
3135       // Classify the agents into kernel (GPU) and host (CPU) agents.
3136       if (DeviceType == HSA_DEVICE_TYPE_GPU) {
3137         // Ensure that the GPU agent supports kernel dispatch packets.
3138         hsa_agent_feature_t Features;
3139         Status = hsa_agent_get_info(Agent, HSA_AGENT_INFO_FEATURE, &Features);
3140         if (Features & HSA_AGENT_FEATURE_KERNEL_DISPATCH)
3141           KernelAgents.push_back(Agent);
3142       } else if (DeviceType == HSA_DEVICE_TYPE_CPU) {
3143         HostAgents.push_back(Agent);
3144       }
3145       return HSA_STATUS_SUCCESS;
3146     });
3147 
3148     if (Err)
3149       return std::move(Err);
3150 
3151     int32_t NumDevices = KernelAgents.size();
3152     if (NumDevices == 0) {
3153       // Do not initialize if there are no devices.
3154       DP("There are no devices supporting AMDGPU.\n");
3155       return 0;
3156     }
3157 
3158     // There are kernel agents but there is no host agent. That should be
3159     // treated as an error.
3160     if (HostAgents.empty())
3161       return Plugin::error("No AMDGPU host agents");
3162 
3163     // Initialize the host device using host agents.
3164     HostDevice = allocate<AMDHostDeviceTy>();
3165     new (HostDevice) AMDHostDeviceTy(*this, HostAgents);
3166 
3167     // Set up the memory pools available to the host.
3168     if (auto Err = HostDevice->init())
3169       return std::move(Err);
3170 
3171     return NumDevices;
3172   }
3173 
3174   /// Deinitialize the plugin.
3175   Error deinitImpl() override {
3176     // The HSA runtime was not initialized, so nothing from the plugin was
3177     // actually initialized.
3178     if (!Initialized)
3179       return Plugin::success();
3180 
3181     if (HostDevice)
3182       if (auto Err = HostDevice->deinit())
3183         return Err;
3184 
3185     // Finalize the HSA runtime.
3186     hsa_status_t Status = hsa_shut_down();
3187     return Plugin::check(Status, "Error in hsa_shut_down: %s");
3188   }
3189 
3190   /// Creates an AMDGPU device.
3191   GenericDeviceTy *createDevice(GenericPluginTy &Plugin, int32_t DeviceId,
3192                                 int32_t NumDevices) override {
3193     return new AMDGPUDeviceTy(Plugin, DeviceId, NumDevices, getHostDevice(),
3194                               getKernelAgent(DeviceId));
3195   }
3196 
3197   /// Creates an AMDGPU global handler.
3198   GenericGlobalHandlerTy *createGlobalHandler() override {
3199     return new AMDGPUGlobalHandlerTy();
3200   }
3201 
3202   Triple::ArchType getTripleArch() const override { return Triple::amdgcn; }
3203 
3204   const char *getName() const override { return GETNAME(TARGET_NAME); }
3205 
3206   /// Get the ELF code for recognizing the compatible image binary.
3207   uint16_t getMagicElfBits() const override { return ELF::EM_AMDGPU; }
3208 
3209   /// Check whether the image is compatible with an AMDGPU device.
3210   Expected<bool> isELFCompatible(uint32_t DeviceId,
3211                                  StringRef Image) const override {
3212     // Get the associated architecture and flags from the ELF.
3213     auto ElfOrErr = ELF64LEObjectFile::create(
3214         MemoryBufferRef(Image, /*Identifier=*/""), /*InitContent=*/false);
3215     if (!ElfOrErr)
3216       return ElfOrErr.takeError();
3217     std::optional<StringRef> Processor = ElfOrErr->tryGetCPUName();
3218     if (!Processor)
3219       return false;
3220 
3221     SmallVector<SmallString<32>> Targets;
3222     if (auto Err = hsa_utils::getTargetTripleAndFeatures(
3223             getKernelAgent(DeviceId), Targets))
3224       return Err;
3225     for (auto &Target : Targets)
3226       if (offloading::amdgpu::isImageCompatibleWithEnv(
3227               Processor ? *Processor : "", ElfOrErr->getPlatformFlags(),
3228               Target.str()))
3229         return true;
3230     return false;
3231   }
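
  // For reference (format hedged): the target strings compared above are full
  // target IDs as reported by ROCr, e.g.
  //
  //   amdgcn-amd-amdhsa--gfx90a:xnack+
  //
  // where the processor name and feature suffixes must be compatible with the
  // image's ELF flags.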
3232 
3233   bool isDataExchangable(int32_t SrcDeviceId, int32_t DstDeviceId) override {
3234     return true;
3235   }
3236 
3237   /// Get the host device instance.
3238   AMDHostDeviceTy &getHostDevice() {
3239     assert(HostDevice && "Host device not initialized");
3240     return *HostDevice;
3241   }
3242 
3243   /// Get the kernel agent with the corresponding agent id.
3244   hsa_agent_t getKernelAgent(int32_t AgentId) const {
3245     assert((uint32_t)AgentId < KernelAgents.size() && "Invalid agent id");
3246     return KernelAgents[AgentId];
3247   }
3248 
3249   /// Get the list of the available kernel agents.
3250   const llvm::SmallVector<hsa_agent_t> &getKernelAgents() const {
3251     return KernelAgents;
3252   }
3253 
3254 private:
3255   /// Event handler that will be called by ROCr if an event is detected.
3256   static hsa_status_t eventHandler(const hsa_amd_event_t *Event,
3257                                    void *PluginPtr) {
3258     if (Event->event_type != HSA_AMD_GPU_MEMORY_FAULT_EVENT)
3259       return HSA_STATUS_SUCCESS;
3260 
3261     SmallVector<std::string> Reasons;
3262     uint32_t ReasonsMask = Event->memory_fault.fault_reason_mask;
3263     if (ReasonsMask & HSA_AMD_MEMORY_FAULT_PAGE_NOT_PRESENT)
3264       Reasons.emplace_back("Page not present or supervisor privilege");
3265     if (ReasonsMask & HSA_AMD_MEMORY_FAULT_READ_ONLY)
3266       Reasons.emplace_back("Write access to a read-only page");
3267     if (ReasonsMask & HSA_AMD_MEMORY_FAULT_NX)
3268       Reasons.emplace_back("Execute access to a page marked NX");
3269     if (ReasonsMask & HSA_AMD_MEMORY_FAULT_HOST_ONLY)
3270       Reasons.emplace_back("GPU attempted access to a host only page");
3271     if (ReasonsMask & HSA_AMD_MEMORY_FAULT_DRAMECC)
3272       Reasons.emplace_back("DRAM ECC failure");
3273     if (ReasonsMask & HSA_AMD_MEMORY_FAULT_IMPRECISE)
3274       Reasons.emplace_back("Can't determine the exact fault address");
3275     if (ReasonsMask & HSA_AMD_MEMORY_FAULT_SRAMECC)
3276       Reasons.emplace_back("SRAM ECC failure (i.e. registers, no fault address)");
3277     if (ReasonsMask & HSA_AMD_MEMORY_FAULT_HANG)
3278       Reasons.emplace_back("GPU reset following unspecified hang");
3279 
3280     // If we do not know the reason, say so; known reasons are joined with
3281     // ", " when the message is emitted below.
3282     if (Reasons.empty())
3283       Reasons.emplace_back("Unknown (" + std::to_string(ReasonsMask) + ")");
3284 
3285     uint32_t Node = -1;
3286     hsa_agent_get_info(Event->memory_fault.agent, HSA_AGENT_INFO_NODE, &Node);
3287 
3288     AMDGPUPluginTy &Plugin = *reinterpret_cast<AMDGPUPluginTy *>(PluginPtr);
3289     for (uint32_t I = 0, E = Plugin.getNumDevices();
3290          Node != uint32_t(-1) && I < E; ++I) {
3291       AMDGPUDeviceTy &AMDGPUDevice =
3292           reinterpret_cast<AMDGPUDeviceTy &>(Plugin.getDevice(I));
3293       auto KernelTraceInfoRecord =
3294           AMDGPUDevice.KernelLaunchTraces.getExclusiveAccessor();
3295 
3296       uint32_t DeviceNode = -1;
3297       if (auto Err =
3298               AMDGPUDevice.getDeviceAttr(HSA_AGENT_INFO_NODE, DeviceNode)) {
3299         consumeError(std::move(Err));
3300         continue;
3301       }
3302       if (DeviceNode != Node)
3303         continue;
3304       void *DevicePtr = (void *)Event->memory_fault.virtual_address;
3305       std::string S;
3306       llvm::raw_string_ostream OS(S);
3307       OS << llvm::format("Memory access fault by GPU %" PRIu32
3308                          " (agent 0x%" PRIx64
3309                          ") at virtual address %p. Reasons: %s",
3310                          Node, Event->memory_fault.agent.handle,
3311                          (void *)Event->memory_fault.virtual_address,
3312                          llvm::join(Reasons, ", ").c_str());
3313       ErrorReporter::reportKernelTraces(AMDGPUDevice, *KernelTraceInfoRecord);
3314       ErrorReporter::reportMemoryAccessError(AMDGPUDevice, DevicePtr, S,
3315                                              /*Abort*/ true);
3316     }
3317 
3318     // Abort the execution since we do not recover from this error.
3319     FATAL_MESSAGE(1,
3320                   "Memory access fault by GPU %" PRIu32 " (agent 0x%" PRIx64
3321                   ") at virtual address %p. Reasons: %s",
3322                   Node, Event->memory_fault.agent.handle,
3323                   (void *)Event->memory_fault.virtual_address,
3324                   llvm::join(Reasons, ", ").c_str());
3325 
3326     return HSA_STATUS_ERROR;
3327   }
3328 
3329   /// Indicate whether the HSA runtime was correctly initialized. Even if
3330   /// there are no available devices, this boolean will be true. It indicates
3331   /// whether we can safely call HSA functions (e.g., hsa_shut_down).
3332   bool Initialized;
3333 
3334   /// Array of the available kernel (GPU) agents. These handles should not be
3335   /// here but in the AMDGPUDeviceTy structures directly. However, the HSA
3336   /// standard does not provide API functions to retrieve agents directly, only
3337   /// iterating functions. We cache the agents here for convenience.
3338   llvm::SmallVector<hsa_agent_t> KernelAgents;
3339 
3340   /// The device representing all HSA host agents.
3341   AMDHostDeviceTy *HostDevice;
3342 };
3343 
3344 Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
3345                                  uint32_t NumThreads[3], uint32_t NumBlocks[3],
3346                                  KernelArgsTy &KernelArgs,
3347                                  KernelLaunchParamsTy LaunchParams,
3348                                  AsyncInfoWrapperTy &AsyncInfoWrapper) const {
3349   if (ArgsSize != LaunchParams.Size &&
3350       ArgsSize != LaunchParams.Size + getImplicitArgsSize())
3351     return Plugin::error("Mismatch of kernel arguments size");
3352 
3353   AMDGPUPluginTy &AMDGPUPlugin =
3354       static_cast<AMDGPUPluginTy &>(GenericDevice.Plugin);
3355   AMDHostDeviceTy &HostDevice = AMDGPUPlugin.getHostDevice();
3356   AMDGPUMemoryManagerTy &ArgsMemoryManager = HostDevice.getArgsMemoryManager();
3357 
3358   void *AllArgs = nullptr;
3359   if (auto Err = ArgsMemoryManager.allocate(ArgsSize, &AllArgs))
3360     return Err;
3361 
3362   // Account for user requested dynamic shared memory.
3363   uint32_t GroupSize = getGroupSize();
3364   if (uint32_t MaxDynCGroupMem = std::max(
3365           KernelArgs.DynCGroupMem, GenericDevice.getDynamicMemorySize())) {
3366     GroupSize += MaxDynCGroupMem;
3367   }
3368 
3369   uint64_t StackSize;
3370   if (auto Err = GenericDevice.getDeviceStackSize(StackSize))
3371     return Err;
3372 
3373   hsa_utils::AMDGPUImplicitArgsTy *ImplArgs = nullptr;
3374   if (ArgsSize == LaunchParams.Size + getImplicitArgsSize()) {
3375     // Initialize implicit arguments.
3376     ImplArgs = reinterpret_cast<hsa_utils::AMDGPUImplicitArgsTy *>(
3377         utils::advancePtr(AllArgs, LaunchParams.Size));
3378 
3379     // Initialize the implicit arguments to zero.
3380     std::memset(ImplArgs, 0, getImplicitArgsSize());
3381   }
3382 
3383   // Copy the explicit arguments.
3384   // TODO: We should expose the args memory manager alloc to the common part
3385   //       as an alternative to copying them twice.
3386   if (LaunchParams.Size)
3387     std::memcpy(AllArgs, LaunchParams.Data, LaunchParams.Size);
3388 
3389   AMDGPUDeviceTy &AMDGPUDevice = static_cast<AMDGPUDeviceTy &>(GenericDevice);
3390 
3391   AMDGPUStreamTy *Stream = nullptr;
3392   if (auto Err = AMDGPUDevice.getStream(AsyncInfoWrapper, Stream))
3393     return Err;
3394 
3395   // Only COV5 implicit args need to be set; COV4 implicit args are not used.
3396   if (ImplArgs &&
3397       getImplicitArgsSize() == sizeof(hsa_utils::AMDGPUImplicitArgsTy)) {
3398     ImplArgs->BlockCountX = NumBlocks[0];
3399     ImplArgs->BlockCountY = NumBlocks[1];
3400     ImplArgs->BlockCountZ = NumBlocks[2];
3401     ImplArgs->GroupSizeX = NumThreads[0];
3402     ImplArgs->GroupSizeY = NumThreads[1];
3403     ImplArgs->GroupSizeZ = NumThreads[2];
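    // Grid dimensionality: 3 if the Z dimension is non-trivial, otherwise 2 if
    // the Y dimension is non-trivial, otherwise 1.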
3404     ImplArgs->GridDims = NumBlocks[2] * NumThreads[2] > 1
3405                              ? 3
3406                              : 1 + (NumBlocks[1] * NumThreads[1] != 1);
3407     ImplArgs->DynamicLdsSize = KernelArgs.DynCGroupMem;
3408   }
3409 
3410   // Push the kernel launch into the stream.
3411   return Stream->pushKernelLaunch(*this, AllArgs, NumThreads, NumBlocks,
3412                                   GroupSize, StackSize, ArgsMemoryManager);
3413 }
3414 
3415 Error AMDGPUKernelTy::printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
3416                                              KernelArgsTy &KernelArgs,
3417                                              uint32_t NumThreads[3],
3418                                              uint32_t NumBlocks[3]) const {
3419   // Only do all this when the output is requested.
3420   if (!(getInfoLevel() & OMP_INFOTYPE_PLUGIN_KERNEL))
3421     return Plugin::success();
3422 
3423   // We don't have the data to print additional info; this is not a hard error.
3424   if (!KernelInfo.has_value())
3425     return Plugin::success();
3426 
3427   // General Info
3428   auto NumGroups = NumBlocks;
3429   auto ThreadsPerGroup = NumThreads;
3430 
3431   // Kernel Arguments Info
3432   auto ArgNum = KernelArgs.NumArgs;
3433   auto LoopTripCount = KernelArgs.Tripcount;
3434 
3435   // Details for AMDGPU kernels (read from image)
3436   // https://www.llvm.org/docs/AMDGPUUsage.html#code-object-v4-metadata
3437   auto GroupSegmentSize = (*KernelInfo).GroupSegmentList;
3438   auto SGPRCount = (*KernelInfo).SGPRCount;
3439   auto VGPRCount = (*KernelInfo).VGPRCount;
3440   auto SGPRSpillCount = (*KernelInfo).SGPRSpillCount;
3441   auto VGPRSpillCount = (*KernelInfo).VGPRSpillCount;
3442   auto MaxFlatWorkgroupSize = (*KernelInfo).MaxFlatWorkgroupSize;
3443 
3444   // Prints additional launch info that contains the following.
3445   // Num Args: The number of kernel arguments
3446   // Teams x Thrds: The number of teams and the number of threads actually
3447   // running.
3448   // MaxFlatWorkgroupSize: Maximum flat work-group size supported by the
3449   // kernel in work-items
3450   // LDS Usage: Amount of bytes used in LDS storage
3451   // S/VGPR Count: the number of S/V GPRs occupied by the kernel
3452   // S/VGPR Spill Count: how many S/VGPRs are spilled by the kernel
3453   // Tripcount: loop tripcount for the kernel
3454   INFO(OMP_INFOTYPE_PLUGIN_KERNEL, GenericDevice.getDeviceId(),
3455        "#Args: %d Teams x Thrds: %4ux%4u (MaxFlatWorkGroupSize: %u) LDS "
3456        "Usage: %uB #SGPRs/VGPRs: %u/%u #SGPR/VGPR Spills: %u/%u Tripcount: "
3457        "%lu\n",
3458        ArgNum, NumGroups[0] * NumGroups[1] * NumGroups[2],
3459        ThreadsPerGroup[0] * ThreadsPerGroup[1] * ThreadsPerGroup[2],
3460        MaxFlatWorkgroupSize, GroupSegmentSize, SGPRCount, VGPRCount,
3461        SGPRSpillCount, VGPRSpillCount, LoopTripCount);
3462 
3463   return Plugin::success();
3464 }
3465 
3466 template <typename... ArgsTy>
3467 static Error Plugin::check(int32_t Code, const char *ErrFmt, ArgsTy... Args) {
3468   hsa_status_t ResultCode = static_cast<hsa_status_t>(Code);
3469   if (ResultCode == HSA_STATUS_SUCCESS || ResultCode == HSA_STATUS_INFO_BREAK)
3470     return Error::success();
3471 
3472   const char *Desc = "Unknown error";
3473   hsa_status_t Ret = hsa_status_string(ResultCode, &Desc);
3474   if (Ret != HSA_STATUS_SUCCESS)
3475     REPORT("Unrecognized " GETNAME(TARGET_NAME) " error code %d\n", Code);
3476 
3477   return createStringError<ArgsTy..., const char *>(inconvertibleErrorCode(),
3478                                                     ErrFmt, Args..., Desc);
3479 }
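
// Usage sketch for Plugin::check, mirroring the calls throughout this file:
// wrap a raw HSA status into an llvm::Error, with the HSA status description
// substituted for the trailing %s.
//
//   hsa_status_t Status = hsa_init();
//   if (auto Err = Plugin::check(Status, "Error in hsa_init: %s"))
//     return Err;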
3480 
3481 void *AMDGPUMemoryManagerTy::allocate(size_t Size, void *HstPtr,
3482                                       TargetAllocTy Kind) {
3483   // Allocate memory from the pool.
3484   void *Ptr = nullptr;
3485   if (auto Err = MemoryPool->allocate(Size, &Ptr)) {
3486     consumeError(std::move(Err));
3487     return nullptr;
3488   }
3489   assert(Ptr && "Invalid pointer");
3490 
3491   // Get a list of agents that can access this memory pool.
3492   llvm::SmallVector<hsa_agent_t> Agents;
3493   llvm::copy_if(
3494       Plugin.getKernelAgents(), std::back_inserter(Agents),
3495       [&](hsa_agent_t Agent) { return MemoryPool->canAccess(Agent); });
3496 
3497   // Allow all valid kernel agents to access the allocation.
3498   if (auto Err = MemoryPool->enableAccess(Ptr, Size, Agents)) {
3499     REPORT("%s\n", toString(std::move(Err)).data());
3500     return nullptr;
3501   }
3502   return Ptr;
3503 }
3504 
3505 void *AMDGPUDeviceTy::allocate(size_t Size, void *, TargetAllocTy Kind) {
3506   if (Size == 0)
3507     return nullptr;
3508 
3509   // Find the correct memory pool.
3510   AMDGPUMemoryPoolTy *MemoryPool = nullptr;
3511   switch (Kind) {
3512   case TARGET_ALLOC_DEFAULT:
3513   case TARGET_ALLOC_DEVICE:
3514   case TARGET_ALLOC_DEVICE_NON_BLOCKING:
3515     MemoryPool = CoarseGrainedMemoryPools[0];
3516     break;
3517   case TARGET_ALLOC_HOST:
3518     MemoryPool = &HostDevice.getFineGrainedMemoryPool();
3519     break;
3520   case TARGET_ALLOC_SHARED:
3521     MemoryPool = &HostDevice.getFineGrainedMemoryPool();
3522     break;
3523   }
3524 
3525   if (!MemoryPool) {
3526     REPORT("No memory pool for the specified allocation kind\n");
3527     return nullptr;
3528   }
3529 
3530   // Allocate from the corresponding memory pool.
3531   void *Alloc = nullptr;
3532   if (Error Err = MemoryPool->allocate(Size, &Alloc)) {
3533     REPORT("%s\n", toString(std::move(Err)).data());
3534     return nullptr;
3535   }
3536 
3537   if (Alloc) {
3538     // Get a list of agents that can access this memory pool. Inherently
3539     // necessary for host or shared allocations; also enabled for device
3540     // memory to allow device-to-device memcpy.
3541     llvm::SmallVector<hsa_agent_t> Agents;
3542     llvm::copy_if(static_cast<AMDGPUPluginTy &>(Plugin).getKernelAgents(),
3543                   std::back_inserter(Agents), [&](hsa_agent_t Agent) {
3544                     return MemoryPool->canAccess(Agent);
3545                   });
3546 
3547     // Enable all valid kernel agents to access the buffer.
3548     if (auto Err = MemoryPool->enableAccess(Alloc, Size, Agents)) {
3549       REPORT("%s\n", toString(std::move(Err)).data());
3550       return nullptr;
3551     }
3552   }
3553 
3554   return Alloc;
3555 }
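
// A hedged note on how the allocation kinds above are reached: the OpenMP
// entry points funnel into this allocator. For example, omp_target_alloc()
// arrives as TARGET_ALLOC_DEFAULT (coarse-grained device pool), while the
// extension llvm_omp_target_alloc_host() requests TARGET_ALLOC_HOST and thus
// the host device's fine-grained pool:
//
//   void *DevPtr = Device.allocate(Bytes, /*HstPtr=*/nullptr,
//                                  TARGET_ALLOC_DEVICE);
//   void *PinnedPtr = Device.allocate(Bytes, /*HstPtr=*/nullptr,
//                                     TARGET_ALLOC_HOST);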
3556 
3557 void AMDGPUQueueTy::callbackError(hsa_status_t Status, hsa_queue_t *Source,
3558                                   void *Data) {
3559   auto &AMDGPUDevice = *reinterpret_cast<AMDGPUDeviceTy *>(Data);
3560 
3561   if (Status == HSA_STATUS_ERROR_EXCEPTION) {
3562     auto KernelTraceInfoRecord =
3563         AMDGPUDevice.KernelLaunchTraces.getExclusiveAccessor();
3564     std::function<bool(__tgt_async_info &)> AsyncInfoWrapperMatcher =
3565         [=](__tgt_async_info &AsyncInfo) {
3566           auto *Stream = reinterpret_cast<AMDGPUStreamTy *>(AsyncInfo.Queue);
3567           if (!Stream || !Stream->getQueue())
3568             return false;
3569           return Stream->getQueue()->Queue == Source;
3570         };
3571     ErrorReporter::reportTrapInKernel(AMDGPUDevice, *KernelTraceInfoRecord,
3572                                       AsyncInfoWrapperMatcher);
3573   }
3574 
3575   auto Err = Plugin::check(Status, "Received error in queue %p: %s", Source);
3576   FATAL_MESSAGE(1, "%s", toString(std::move(Err)).data());
3577 }
3578 
3579 } // namespace plugin
3580 } // namespace target
3581 } // namespace omp
3582 } // namespace llvm
3583 
3584 extern "C" {
3585 llvm::omp::target::plugin::GenericPluginTy *createPlugin_amdgpu() {
3586   return new llvm::omp::target::plugin::AMDGPUPluginTy();
3587 }
3588 }
3589