xref: /llvm-project/offload/plugins-nextgen/common/src/PluginInterface.cpp (revision 38b3f45a811282511c014cffd09a8ae2a96435ba)
1 //===- PluginInterface.cpp - Target independent plugin device interface ---===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 //===----------------------------------------------------------------------===//
10 
11 #include "PluginInterface.h"
12 
13 #include "Shared/APITypes.h"
14 #include "Shared/Debug.h"
15 #include "Shared/Environment.h"
16 
17 #include "ErrorReporting.h"
18 #include "GlobalHandler.h"
19 #include "JIT.h"
20 #include "Shared/Utils.h"
21 #include "Utils/ELF.h"
22 #include "omptarget.h"
23 
24 #ifdef OMPT_SUPPORT
25 #include "OpenMP/OMPT/Callback.h"
26 #include "omp-tools.h"
27 #endif
28 
29 #include "llvm/Bitcode/BitcodeReader.h"
30 #include "llvm/Frontend/OpenMP/OMPConstants.h"
31 #include "llvm/Support/Error.h"
32 #include "llvm/Support/JSON.h"
33 #include "llvm/Support/MathExtras.h"
34 #include "llvm/Support/MemoryBuffer.h"
35 #include "llvm/Support/Signals.h"
36 #include "llvm/Support/raw_ostream.h"
37 
38 #include <cstdint>
39 #include <limits>
40 
41 using namespace llvm;
42 using namespace omp;
43 using namespace target;
44 using namespace plugin;
45 
46 // TODO: Fix any thread safety issues for multi-threaded kernel recording.
47 namespace llvm::omp::target::plugin {
48 struct RecordReplayTy {
49 
50   // Describes the state of the record replay mechanism.
51   enum RRStatusTy { RRDeactivated = 0, RRRecording, RRReplaying };
52 
53 private:
54   // Memory pointers used for recording and replaying memory.
55   void *MemoryStart = nullptr;
56   void *MemoryPtr = nullptr;
57   size_t MemorySize = 0;
58   size_t TotalSize = 0;
59   GenericDeviceTy *Device = nullptr;
60   std::mutex AllocationLock;
61 
62   RRStatusTy Status = RRDeactivated;
63   bool ReplaySaveOutput = false;
64   bool UsedVAMap = false;
65   uintptr_t MemoryOffset = 0;
66 
67   // A list of all globals mapped to the device.
68   struct GlobalEntry {
69     const char *Name;
70     uint64_t Size;
71     void *Addr;
72   };
73   llvm::SmallVector<GlobalEntry> GlobalEntries{};
74 
75   void *suggestAddress(uint64_t MaxMemoryAllocation) {
76     // Get a valid pointer address for this system
77     void *Addr =
78         Device->allocate(1024, /*HstPtr=*/nullptr, TARGET_ALLOC_DEFAULT);
79     Device->free(Addr);
80     // Align Address to MaxMemoryAllocation
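    // For illustration (hypothetical address): with MaxMemoryAllocation =
    // 1 GiB (0x40000000), an Addr of 0x7f3a12345678 is rounded up to the next
    // 1 GiB boundary, i.e. 0x7f3a40000000.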
81     Addr = (void *)utils::alignPtr((Addr), MaxMemoryAllocation);
82     return Addr;
83   }
84 
85   Error preAllocateVAMemory(uint64_t MaxMemoryAllocation, void *VAddr) {
86     size_t ASize = MaxMemoryAllocation;
87 
88     if (!VAddr && isRecording())
89       VAddr = suggestAddress(MaxMemoryAllocation);
90 
91     DP("Request %ld bytes allocated at %p\n", MaxMemoryAllocation, VAddr);
92 
93     if (auto Err = Device->memoryVAMap(&MemoryStart, VAddr, &ASize))
94       return Err;
95 
96     if (isReplaying() && VAddr != MemoryStart) {
97       return Plugin::error("Record-Replay cannot assign the"
98                            "requested recorded address (%p, %p)",
99                            VAddr, MemoryStart);
100     }
101 
102     INFO(OMP_INFOTYPE_PLUGIN_KERNEL, Device->getDeviceId(),
103          "Allocated %" PRIu64 " bytes at %p for replay.\n", ASize, MemoryStart);
104 
105     MemoryPtr = MemoryStart;
106     MemorySize = 0;
107     TotalSize = ASize;
108     UsedVAMap = true;
109     return Plugin::success();
110   }
111 
112   Error preAllocateHeuristic(uint64_t MaxMemoryAllocation,
113                              uint64_t RequiredMemoryAllocation, void *VAddr) {
114     const size_t MAX_MEMORY_ALLOCATION = MaxMemoryAllocation;
115     constexpr size_t STEP = 1024 * 1024 * 1024ULL;
116     MemoryStart = nullptr;
117     for (TotalSize = MAX_MEMORY_ALLOCATION; TotalSize > 0; TotalSize -= STEP) {
118       MemoryStart =
119           Device->allocate(TotalSize, /*HstPtr=*/nullptr, TARGET_ALLOC_DEFAULT);
120       if (MemoryStart)
121         break;
122     }
123     if (!MemoryStart)
124       return Plugin::error("Allocating record/replay memory");
125 
126     if (VAddr && VAddr != MemoryStart)
127       MemoryOffset = uintptr_t(VAddr) - uintptr_t(MemoryStart);
128 
129     MemoryPtr = MemoryStart;
130     MemorySize = 0;
131 
132     // Check if we need adjustment.
133     if (MemoryOffset > 0 &&
134         TotalSize >= RequiredMemoryAllocation + MemoryOffset) {
135       // If we are off but "before" the required address and with enough space,
136       // we just "allocate" the offset to match the required address.
137       MemoryPtr = (char *)MemoryPtr + MemoryOffset;
138       MemorySize += MemoryOffset;
139       MemoryOffset = 0;
140       assert(MemoryPtr == VAddr && "Expected offset adjustment to work");
141     } else if (MemoryOffset) {
142       // If we are off and in a situation we cannot just "waste" memory to force
143       // a match, we hope adjusting the arguments is sufficient.
144       REPORT(
145           "WARNING Failed to allocate replay memory at required location %p, "
146           "got %p, trying to offset argument pointers by %" PRIi64 "\n",
147           VAddr, MemoryStart, MemoryOffset);
148     }
149 
150     INFO(OMP_INFOTYPE_PLUGIN_KERNEL, Device->getDeviceId(),
151          "Allocated %" PRIu64 " bytes at %p for replay.\n", TotalSize,
152          MemoryStart);
153 
154     return Plugin::success();
155   }
156 
157   Error preallocateDeviceMemory(uint64_t DeviceMemorySize, void *ReqVAddr) {
158     if (Device->supportVAManagement()) {
159       auto Err = preAllocateVAMemory(DeviceMemorySize, ReqVAddr);
160       if (!Err)
161         return Plugin::success();
162       REPORT("WARNING VA mapping failed, fallback to heuristic: "
163              "(Error: %s)\n",
164              toString(std::move(Err)).data());
165     }
166 
167     uint64_t DevMemSize;
168     if (Device->getDeviceMemorySize(DevMemSize))
169       return Plugin::error("Cannot determine Device Memory Size");
170 
171     return preAllocateHeuristic(DevMemSize, DeviceMemorySize, ReqVAddr);
172   }
173 
174   void dumpDeviceMemory(StringRef Filename) {
175     ErrorOr<std::unique_ptr<WritableMemoryBuffer>> DeviceMemoryMB =
176         WritableMemoryBuffer::getNewUninitMemBuffer(MemorySize);
177     if (!DeviceMemoryMB)
178       report_fatal_error("Error creating MemoryBuffer for device memory");
179 
180     auto Err = Device->dataRetrieve(DeviceMemoryMB.get()->getBufferStart(),
181                                     MemoryStart, MemorySize, nullptr);
182     if (Err)
183       report_fatal_error("Error retrieving data for target pointer");
184 
185     StringRef DeviceMemory(DeviceMemoryMB.get()->getBufferStart(), MemorySize);
186     std::error_code EC;
187     raw_fd_ostream OS(Filename, EC);
188     if (EC)
189       report_fatal_error("Error dumping memory to file " + Filename + " :" +
190                          EC.message());
191     OS << DeviceMemory;
192     OS.close();
193   }
194 
195 public:
196   bool isRecording() const { return Status == RRStatusTy::RRRecording; }
197   bool isReplaying() const { return Status == RRStatusTy::RRReplaying; }
198   bool isRecordingOrReplaying() const {
199     return (Status != RRStatusTy::RRDeactivated);
200   }
201   void setStatus(RRStatusTy Status) { this->Status = Status; }
202   bool isSaveOutputEnabled() const { return ReplaySaveOutput; }
203   void addEntry(const char *Name, uint64_t Size, void *Addr) {
204     GlobalEntries.emplace_back(GlobalEntry{Name, Size, Addr});
205   }
206 
207   void saveImage(const char *Name, const DeviceImageTy &Image) {
208     SmallString<128> ImageName = {Name, ".image"};
209     std::error_code EC;
210     raw_fd_ostream OS(ImageName, EC);
211     if (EC)
212       report_fatal_error("Error saving image : " + StringRef(EC.message()));
213     if (const auto *TgtImageBitcode = Image.getTgtImageBitcode()) {
214       size_t Size = utils::getPtrDiff(TgtImageBitcode->ImageEnd,
215                                       TgtImageBitcode->ImageStart);
216       MemoryBufferRef MBR = MemoryBufferRef(
217           StringRef((const char *)TgtImageBitcode->ImageStart, Size), "");
218       OS << MBR.getBuffer();
219     } else {
220       OS << Image.getMemoryBuffer().getBuffer();
221     }
222     OS.close();
223   }
224 
225   void dumpGlobals(StringRef Filename, DeviceImageTy &Image) {
226     int32_t Size = 0;
227 
228     for (auto &OffloadEntry : GlobalEntries) {
229       if (!OffloadEntry.Size)
230         continue;
231       // Get the total size of the string and entry including the null byte.
232       Size += std::strlen(OffloadEntry.Name) + 1 + sizeof(uint32_t) +
233               OffloadEntry.Size;
234     }
235 
236     ErrorOr<std::unique_ptr<WritableMemoryBuffer>> GlobalsMB =
237         WritableMemoryBuffer::getNewUninitMemBuffer(Size);
238     if (!GlobalsMB)
239       report_fatal_error("Error creating MemoryBuffer for globals memory");
240 
241     void *BufferPtr = GlobalsMB.get()->getBufferStart();
242     for (auto &OffloadEntry : GlobalEntries) {
243       if (!OffloadEntry.Size)
244         continue;
245 
246       int32_t NameLength = std::strlen(OffloadEntry.Name) + 1;
247       memcpy(BufferPtr, OffloadEntry.Name, NameLength);
248       BufferPtr = utils::advancePtr(BufferPtr, NameLength);
249 
250       *((uint32_t *)(BufferPtr)) = OffloadEntry.Size;
251       BufferPtr = utils::advancePtr(BufferPtr, sizeof(uint32_t));
252 
253       if (auto Err = Device->dataRetrieve(BufferPtr, OffloadEntry.Addr,
254                                           OffloadEntry.Size, nullptr))
255         report_fatal_error("Error retrieving data for global");
261       BufferPtr = utils::advancePtr(BufferPtr, OffloadEntry.Size);
262     }
263     assert(BufferPtr == GlobalsMB->get()->getBufferEnd() &&
264            "Buffer over/under-filled.");
265     assert(Size == utils::getPtrDiff(BufferPtr,
266                                      GlobalsMB->get()->getBufferStart()) &&
267            "Buffer size mismatch");
268 
269     StringRef GlobalsMemory(GlobalsMB.get()->getBufferStart(), Size);
270     std::error_code EC;
271     raw_fd_ostream OS(Filename, EC);
272     if (EC)
273       report_fatal_error("Error dumping globals to file " + Filename + ": " +
274                          EC.message());
275     OS << GlobalsMemory;
273     OS.close();
274   }
275 
276   void saveKernelDescr(const char *Name, KernelLaunchParamsTy LaunchParams,
277                        int32_t NumArgs, uint64_t NumTeamsClause,
278                        uint32_t ThreadLimitClause, uint64_t LoopTripCount) {
279     json::Object JsonKernelInfo;
280     JsonKernelInfo["Name"] = Name;
281     JsonKernelInfo["NumArgs"] = NumArgs;
282     JsonKernelInfo["NumTeamsClause"] = NumTeamsClause;
283     JsonKernelInfo["ThreadLimitClause"] = ThreadLimitClause;
284     JsonKernelInfo["LoopTripCount"] = LoopTripCount;
285     JsonKernelInfo["DeviceMemorySize"] = MemorySize;
286     JsonKernelInfo["DeviceId"] = Device->getDeviceId();
287     JsonKernelInfo["BumpAllocVAStart"] = (intptr_t)MemoryStart;
288 
289     json::Array JsonArgPtrs;
290     for (int I = 0; I < NumArgs; ++I)
291       JsonArgPtrs.push_back((intptr_t)LaunchParams.Ptrs[I]);
292     JsonKernelInfo["ArgPtrs"] = json::Value(std::move(JsonArgPtrs));
293 
294     json::Array JsonArgOffsets;
295     for (int I = 0; I < NumArgs; ++I)
296       JsonArgOffsets.push_back(0);
297     JsonKernelInfo["ArgOffsets"] = json::Value(std::move(JsonArgOffsets));
298 
299     SmallString<128> JsonFilename = {Name, ".json"};
300     std::error_code EC;
301     raw_fd_ostream JsonOS(JsonFilename.str(), EC);
302     if (EC)
303       report_fatal_error("Error saving kernel json file : " +
304                          StringRef(EC.message()));
305     JsonOS << json::Value(std::move(JsonKernelInfo));
306     JsonOS.close();
307   }
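
  // For illustration, recording a kernel "foo" with two arguments might emit
  // a foo.json similar to the following (all values hypothetical):
  //   {"Name":"foo","NumArgs":2,"NumTeamsClause":4,"ThreadLimitClause":128,
  //    "LoopTripCount":1024,"DeviceMemorySize":4096,"DeviceId":0,
  //    "BumpAllocVAStart":140243479298048,"ArgPtrs":[...],"ArgOffsets":[0,0]}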
308 
309   void saveKernelInput(const char *Name, DeviceImageTy &Image) {
310     SmallString<128> GlobalsFilename = {Name, ".globals"};
311     dumpGlobals(GlobalsFilename, Image);
312 
313     SmallString<128> MemoryFilename = {Name, ".memory"};
314     dumpDeviceMemory(MemoryFilename);
315   }
316 
317   void saveKernelOutputInfo(const char *Name) {
318     SmallString<128> OutputFilename = {
319         Name, (isRecording() ? ".original.output" : ".replay.output")};
320     dumpDeviceMemory(OutputFilename);
321   }
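
  // Taken together, recording a kernel "foo" produces (per the helpers above)
  // foo.image (device image), foo.globals (global values), foo.memory
  // (pre-launch device memory), foo.json (launch descriptor), and
  // foo.original.output; a replay run adds foo.replay.output for comparison.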
322 
323   void *alloc(uint64_t Size) {
324     assert(MemoryStart && "Expected memory has been pre-allocated");
325     void *Alloc = nullptr;
326     constexpr int Alignment = 16;
327     // Assumes alignment is a power of 2.
328     int64_t AlignedSize = (Size + (Alignment - 1)) & (~(Alignment - 1));
329     std::lock_guard<std::mutex> LG(AllocationLock);
330     Alloc = MemoryPtr;
331     MemoryPtr = (char *)MemoryPtr + AlignedSize;
332     MemorySize += AlignedSize;
333     DP("Memory Allocator return " DPxMOD "\n", DPxPTR(Alloc));
334     return Alloc;
335   }
336 
337   Error init(GenericDeviceTy *Device, uint64_t MemSize, void *VAddr,
338              RRStatusTy Status, bool SaveOutput, uint64_t &ReqPtrArgOffset) {
339     this->Device = Device;
340     this->Status = Status;
341     this->ReplaySaveOutput = SaveOutput;
342 
343     if (auto Err = preallocateDeviceMemory(MemSize, VAddr))
344       return Err;
345 
346     INFO(OMP_INFOTYPE_PLUGIN_KERNEL, Device->getDeviceId(),
347          "Record Replay Initialized (%p)"
348          " as starting address, %lu Memory Size"
349          " and set on status %s\n",
350          MemoryStart, TotalSize,
351          Status == RRStatusTy::RRRecording ? "Recording" : "Replaying");
352 
353     // Tell the user to offset pointer arguments as the memory allocation does
354     // not match.
355     ReqPtrArgOffset = MemoryOffset;
356     return Plugin::success();
357   }
358 
359   void deinit() {
360     if (UsedVAMap) {
361       if (auto Err = Device->memoryVAUnMap(MemoryStart, TotalSize))
362         report_fatal_error("Error on releasing virtual memory space");
363     } else {
364       Device->free(MemoryStart);
365     }
366   }
367 };
368 } // namespace llvm::omp::target::plugin
369 
370 // Extract the mapping of host function pointers to device function pointers
371 // from the entry table. Functions marked as 'indirect' in OpenMP will have
372 // offloading entries generated for them which map the host's function pointer
373 // to a global containing the corresponding function pointer on the device.
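// For example (a sketch using the OpenMP 5.1 'indirect' clause):
//   #pragma omp begin declare target indirect
//   void foo();
//   #pragma omp end declare target
// produces an entry whose Address is the host &foo and whose SymbolName
// identifies a device global holding the device-side pointer to foo.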
374 static Expected<std::pair<void *, uint64_t>>
375 setupIndirectCallTable(GenericPluginTy &Plugin, GenericDeviceTy &Device,
376                        DeviceImageTy &Image) {
377   GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler();
378 
379   llvm::ArrayRef<llvm::offloading::EntryTy> Entries(
380       Image.getTgtImage()->EntriesBegin, Image.getTgtImage()->EntriesEnd);
381   llvm::SmallVector<std::pair<void *, void *>> IndirectCallTable;
382   for (const auto &Entry : Entries) {
383     if (Entry.Size == 0 || !(Entry.Flags & OMP_DECLARE_TARGET_INDIRECT))
384       continue;
385 
386     assert(Entry.Size == sizeof(void *) && "Global not a function pointer?");
387     auto &[HstPtr, DevPtr] = IndirectCallTable.emplace_back();
388 
389     GlobalTy DeviceGlobal(Entry.SymbolName, Entry.Size);
390     if (auto Err =
391             Handler.getGlobalMetadataFromDevice(Device, Image, DeviceGlobal))
392       return std::move(Err);
393 
394     HstPtr = Entry.Address;
395     if (auto Err = Device.dataRetrieve(&DevPtr, DeviceGlobal.getPtr(),
396                                        Entry.Size, nullptr))
397       return std::move(Err);
398   }
399 
400   // If we do not have any indirect globals we exit early.
401   if (IndirectCallTable.empty())
402     return std::pair{nullptr, 0};
403 
404   // Sort the array to allow for more efficient lookup of device pointers.
405   llvm::sort(IndirectCallTable,
406              [](const auto &x, const auto &y) { return x.first < y.first; });
407 
408   uint64_t TableSize =
409       IndirectCallTable.size() * sizeof(std::pair<void *, void *>);
410   void *DevicePtr = Device.allocate(TableSize, nullptr, TARGET_ALLOC_DEVICE);
411   if (auto Err = Device.dataSubmit(DevicePtr, IndirectCallTable.data(),
412                                    TableSize, nullptr))
413     return std::move(Err);
414   return std::pair<void *, uint64_t>(DevicePtr, IndirectCallTable.size());
415 }
416 
417 AsyncInfoWrapperTy::AsyncInfoWrapperTy(GenericDeviceTy &Device,
418                                        __tgt_async_info *AsyncInfoPtr)
419     : Device(Device),
420       AsyncInfoPtr(AsyncInfoPtr ? AsyncInfoPtr : &LocalAsyncInfo) {}
421 
422 void AsyncInfoWrapperTy::finalize(Error &Err) {
423   assert(AsyncInfoPtr && "AsyncInfoWrapperTy already finalized");
424 
425   // If we used a local async info object we want synchronous behavior. In that
426   // case, and assuming the current status code is correct, we will synchronize
427   // explicitly when the object is deleted. Update the error with the result of
428   // the synchronize operation.
429   if (AsyncInfoPtr == &LocalAsyncInfo && LocalAsyncInfo.Queue && !Err)
430     Err = Device.synchronize(&LocalAsyncInfo);
431 
432   // Invalidate the wrapper object.
433   AsyncInfoPtr = nullptr;
434 }
435 
436 Error GenericKernelTy::init(GenericDeviceTy &GenericDevice,
437                             DeviceImageTy &Image) {
438 
439   ImagePtr = &Image;
440 
441   // Retrieve kernel environment object for the kernel.
442   GlobalTy KernelEnv(std::string(Name) + "_kernel_environment",
443                      sizeof(KernelEnvironment), &KernelEnvironment);
444   GenericGlobalHandlerTy &GHandler = GenericDevice.Plugin.getGlobalHandler();
445   if (auto Err =
446           GHandler.readGlobalFromImage(GenericDevice, *ImagePtr, KernelEnv)) {
447     [[maybe_unused]] std::string ErrStr = toString(std::move(Err));
448     DP("Failed to read kernel environment for '%s': %s\n"
449        "Using default SPMD (2) execution mode\n",
450        Name, ErrStr.data());
451     assert(KernelEnvironment.Configuration.ReductionDataSize == 0 &&
452            "Default initialization failed.");
453     IsBareKernel = true;
454   }
455 
456   // Max = Config.Max > 0 ? min(Config.Max, Device.Max) : Device.Max;
457   MaxNumThreads = KernelEnvironment.Configuration.MaxThreads > 0
458                       ? std::min(KernelEnvironment.Configuration.MaxThreads,
459                                  int32_t(GenericDevice.getThreadLimit()))
460                       : GenericDevice.getThreadLimit();
461 
462   // Pref = Config.Pref > 0 ? max(Config.Pref, Device.Pref) : Device.Pref;
463   PreferredNumThreads =
464       KernelEnvironment.Configuration.MinThreads > 0
465           ? std::max(KernelEnvironment.Configuration.MinThreads,
466                      int32_t(GenericDevice.getDefaultNumThreads()))
467           : GenericDevice.getDefaultNumThreads();
468 
469   return initImpl(GenericDevice, Image);
470 }
471 
472 Expected<KernelLaunchEnvironmentTy *>
473 GenericKernelTy::getKernelLaunchEnvironment(
474     GenericDeviceTy &GenericDevice, uint32_t Version,
475     AsyncInfoWrapperTy &AsyncInfoWrapper) const {
476   // Ctor/Dtor have no arguments, replaying uses the original kernel launch
477   // environment. Older versions of the compiler do not generate a kernel
478   // launch environment.
479   if (GenericDevice.Plugin.getRecordReplay().isReplaying() ||
480       Version < OMP_KERNEL_ARG_MIN_VERSION_WITH_DYN_PTR)
481     return nullptr;
482 
483   if (!KernelEnvironment.Configuration.ReductionDataSize ||
484       !KernelEnvironment.Configuration.ReductionBufferLength)
485     return reinterpret_cast<KernelLaunchEnvironmentTy *>(~0);
486 
487   // TODO: Check if the kernel needs a launch environment.
488   auto AllocOrErr = GenericDevice.dataAlloc(sizeof(KernelLaunchEnvironmentTy),
489                                             /*HostPtr=*/nullptr,
490                                             TargetAllocTy::TARGET_ALLOC_DEVICE);
491   if (!AllocOrErr)
492     return AllocOrErr.takeError();
493 
494   // Remember to free the memory later.
495   AsyncInfoWrapper.freeAllocationAfterSynchronization(*AllocOrErr);
496 
497   // Use the KLE in the __tgt_async_info to ensure a stable address for the
498   // async data transfer.
499   auto &LocalKLE = (*AsyncInfoWrapper).KernelLaunchEnvironment;
500   LocalKLE = KernelLaunchEnvironment;
501   {
502     auto AllocOrErr = GenericDevice.dataAlloc(
503         KernelEnvironment.Configuration.ReductionDataSize *
504             KernelEnvironment.Configuration.ReductionBufferLength,
505         /*HostPtr=*/nullptr, TargetAllocTy::TARGET_ALLOC_DEVICE);
506     if (!AllocOrErr)
507       return AllocOrErr.takeError();
508     LocalKLE.ReductionBuffer = *AllocOrErr;
509     // Remember to free the memory later.
510     AsyncInfoWrapper.freeAllocationAfterSynchronization(*AllocOrErr);
511   }
512 
513   INFO(OMP_INFOTYPE_DATA_TRANSFER, GenericDevice.getDeviceId(),
514        "Copying data from host to device, HstPtr=" DPxMOD ", TgtPtr=" DPxMOD
515        ", Size=%" PRId64 ", Name=KernelLaunchEnv\n",
516        DPxPTR(&LocalKLE), DPxPTR(*AllocOrErr),
517        sizeof(KernelLaunchEnvironmentTy));
518 
519   auto Err = GenericDevice.dataSubmit(*AllocOrErr, &LocalKLE,
520                                       sizeof(KernelLaunchEnvironmentTy),
521                                       AsyncInfoWrapper);
522   if (Err)
523     return Err;
524   return static_cast<KernelLaunchEnvironmentTy *>(*AllocOrErr);
525 }
526 
527 Error GenericKernelTy::printLaunchInfo(GenericDeviceTy &GenericDevice,
528                                        KernelArgsTy &KernelArgs,
529                                        uint32_t NumThreads[3],
530                                        uint32_t NumBlocks[3]) const {
531   INFO(OMP_INFOTYPE_PLUGIN_KERNEL, GenericDevice.getDeviceId(),
532        "Launching kernel %s with [%u,%u,%u] blocks and [%u,%u,%u] threads in "
533        "%s mode\n",
534        getName(), NumBlocks[0], NumBlocks[1], NumBlocks[2], NumThreads[0],
535        NumThreads[1], NumThreads[2], getExecutionModeName());
536   return printLaunchInfoDetails(GenericDevice, KernelArgs, NumThreads,
537                                 NumBlocks);
538 }
539 
540 Error GenericKernelTy::printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
541                                               KernelArgsTy &KernelArgs,
542                                               uint32_t NumThreads[3],
543                                               uint32_t NumBlocks[3]) const {
544   return Plugin::success();
545 }
546 
547 Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs,
548                               ptrdiff_t *ArgOffsets, KernelArgsTy &KernelArgs,
549                               AsyncInfoWrapperTy &AsyncInfoWrapper) const {
550   llvm::SmallVector<void *, 16> Args;
551   llvm::SmallVector<void *, 16> Ptrs;
552 
553   auto KernelLaunchEnvOrErr = getKernelLaunchEnvironment(
554       GenericDevice, KernelArgs.Version, AsyncInfoWrapper);
555   if (!KernelLaunchEnvOrErr)
556     return KernelLaunchEnvOrErr.takeError();
557 
558   KernelLaunchParamsTy LaunchParams;
559 
560   // Kernel languages don't use indirection.
561   if (KernelArgs.Flags.IsCUDA) {
562     LaunchParams =
563         *reinterpret_cast<KernelLaunchParamsTy *>(KernelArgs.ArgPtrs);
564   } else {
565     LaunchParams =
566         prepareArgs(GenericDevice, ArgPtrs, ArgOffsets, KernelArgs.NumArgs,
567                     Args, Ptrs, *KernelLaunchEnvOrErr);
568   }
569 
570   uint32_t NumThreads[3] = {KernelArgs.ThreadLimit[0],
571                             KernelArgs.ThreadLimit[1],
572                             KernelArgs.ThreadLimit[2]};
573   uint32_t NumBlocks[3] = {KernelArgs.NumTeams[0], KernelArgs.NumTeams[1],
574                            KernelArgs.NumTeams[2]};
575   if (!IsBareKernel) {
576     NumThreads[0] = getNumThreads(GenericDevice, NumThreads);
577     NumBlocks[0] = getNumBlocks(GenericDevice, NumBlocks, KernelArgs.Tripcount,
578                                 NumThreads[0], KernelArgs.ThreadLimit[0] > 0);
579   }
580 
581   // Record the kernel description after we modified the argument count and num
582   // blocks/threads.
583   RecordReplayTy &RecordReplay = GenericDevice.Plugin.getRecordReplay();
584   if (RecordReplay.isRecording()) {
585     RecordReplay.saveImage(getName(), getImage());
586     RecordReplay.saveKernelInput(getName(), getImage());
587     RecordReplay.saveKernelDescr(getName(), LaunchParams, KernelArgs.NumArgs,
588                                  NumBlocks[0], NumThreads[0],
589                                  KernelArgs.Tripcount);
590   }
591 
592   if (auto Err =
593           printLaunchInfo(GenericDevice, KernelArgs, NumThreads, NumBlocks))
594     return Err;
595 
596   return launchImpl(GenericDevice, NumThreads, NumBlocks, KernelArgs,
597                     LaunchParams, AsyncInfoWrapper);
598 }
599 
600 KernelLaunchParamsTy GenericKernelTy::prepareArgs(
601     GenericDeviceTy &GenericDevice, void **ArgPtrs, ptrdiff_t *ArgOffsets,
602     uint32_t &NumArgs, llvm::SmallVectorImpl<void *> &Args,
603     llvm::SmallVectorImpl<void *> &Ptrs,
604     KernelLaunchEnvironmentTy *KernelLaunchEnvironment) const {
605   uint32_t KLEOffset = !!KernelLaunchEnvironment;
606   NumArgs += KLEOffset;
607 
608   if (NumArgs == 0)
609     return KernelLaunchParamsTy{};
610 
611   Args.resize(NumArgs);
612   Ptrs.resize(NumArgs);
613 
614   if (KernelLaunchEnvironment) {
615     Args[0] = KernelLaunchEnvironment;
616     Ptrs[0] = &Args[0];
617   }
618 
619   for (uint32_t I = KLEOffset; I < NumArgs; ++I) {
620     Args[I] =
621         (void *)((intptr_t)ArgPtrs[I - KLEOffset] + ArgOffsets[I - KLEOffset]);
622     Ptrs[I] = &Args[I];
623   }
624   return KernelLaunchParamsTy{sizeof(void *) * NumArgs, &Args[0], &Ptrs[0]};
625 }
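
// For illustration (hypothetical call): with two kernel arguments and a
// launch environment pointer KLE, NumArgs becomes 3 and the layout is
//   Args = {KLE, ArgPtrs[0] + ArgOffsets[0], ArgPtrs[1] + ArgOffsets[1]}
//   Ptrs = {&Args[0], &Args[1], &Args[2]}
// with LaunchParams.Size = 3 * sizeof(void *).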
626 
627 uint32_t GenericKernelTy::getNumThreads(GenericDeviceTy &GenericDevice,
628                                         uint32_t ThreadLimitClause[3]) const {
629   assert(!IsBareKernel && "bare kernel should not call this function");
630 
631   assert(ThreadLimitClause[1] == 1 && ThreadLimitClause[2] == 1 &&
632          "Multi dimensional launch not supported yet.");
633 
634   if (ThreadLimitClause[0] > 0 && isGenericMode())
635     ThreadLimitClause[0] += GenericDevice.getWarpSize();
636 
637   return std::min(MaxNumThreads, (ThreadLimitClause[0] > 0)
638                                      ? ThreadLimitClause[0]
639                                      : PreferredNumThreads);
640 }
641 
642 uint32_t GenericKernelTy::getNumBlocks(GenericDeviceTy &GenericDevice,
643                                        uint32_t NumTeamsClause[3],
644                                        uint64_t LoopTripCount,
645                                        uint32_t &NumThreads,
646                                        bool IsNumThreadsFromUser) const {
647   assert(!IsBareKernel && "bare kernel should not call this function");
648 
649   assert(NumTeamsClause[1] == 1 && NumTeamsClause[2] == 1 &&
650          "Multi dimensional launch not supported yet.");
651 
652   if (NumTeamsClause[0] > 0) {
653     // TODO: We need to honor any value and consequently allow more than the
654     // block limit. For this we might need to start multiple kernels or let the
655     // blocks start again until the requested number has been started.
656     return std::min(NumTeamsClause[0], GenericDevice.getBlockLimit());
657   }
658 
659   uint64_t DefaultNumBlocks = GenericDevice.getDefaultNumBlocks();
660   uint64_t TripCountNumBlocks = std::numeric_limits<uint64_t>::max();
661   if (LoopTripCount > 0) {
662     if (isSPMDMode()) {
663       // We have a combined construct, i.e. `target teams distribute
664       // parallel for [simd]`. We launch so many teams so that each thread
665       // will execute one iteration of the loop; rounded up to the nearest
666       // integer. However, if that results in too few teams, we artificially
667       // reduce the thread count per team to increase the outer parallelism.
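      // For illustration (hypothetical numbers): with LoopTripCount =
      // 1,000,000, NumThreads = 256, and DefaultNumBlocks = 128, we have
      // 1,000,000 >= 128 * 256, so the first branch below yields
      // TripCountNumBlocks = ((1,000,000 - 1) / 256) + 1 = 3907 teams.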
668       auto MinThreads = GenericDevice.getMinThreadsForLowTripCountLoop();
669       MinThreads = std::min(MinThreads, NumThreads);
670 
671       // Honor the thread_limit clause; only lower the number of threads.
672       [[maybe_unused]] auto OldNumThreads = NumThreads;
673       if (LoopTripCount >= DefaultNumBlocks * NumThreads ||
674           IsNumThreadsFromUser) {
675         // Enough parallelism for teams and threads.
676         TripCountNumBlocks = ((LoopTripCount - 1) / NumThreads) + 1;
677         assert((IsNumThreadsFromUser ||
678                 TripCountNumBlocks >= DefaultNumBlocks) &&
679                "Expected sufficient outer parallelism.");
680       } else if (LoopTripCount >= DefaultNumBlocks * MinThreads) {
681         // Enough parallelism for teams, limit threads.
682 
683         // This case is hard; for now, we force "full warps":
684         // First, compute a thread count assuming DefaultNumBlocks.
685         auto NumThreadsDefaultBlocks =
686             (LoopTripCount + DefaultNumBlocks - 1) / DefaultNumBlocks;
687         // Now get a power of two that is larger or equal.
688         auto NumThreadsDefaultBlocksP2 =
689             llvm::PowerOf2Ceil(NumThreadsDefaultBlocks);
690         // Do not increase a thread limit given by the user.
691         NumThreads = std::min(NumThreads, uint32_t(NumThreadsDefaultBlocksP2));
692         assert(NumThreads >= MinThreads &&
693                "Expected sufficient inner parallelism.");
694         TripCountNumBlocks = ((LoopTripCount - 1) / NumThreads) + 1;
695       } else {
696         // Not enough parallelism for teams and threads, limit both.
697         NumThreads = std::min(NumThreads, MinThreads);
698         TripCountNumBlocks = ((LoopTripCount - 1) / NumThreads) + 1;
699       }
700 
701       assert(NumThreads * TripCountNumBlocks >= LoopTripCount &&
702              "Expected sufficient parallelism");
703       assert(OldNumThreads >= NumThreads &&
704              "Number of threads cannot be increased!");
705     } else {
706       assert((isGenericMode() || isGenericSPMDMode()) &&
707              "Unexpected execution mode!");
708       // If we reach this point, then we have a non-combined construct, i.e.
709       // `teams distribute` with a nested `parallel for` and each team is
710       // assigned one iteration of the `distribute` loop. E.g.:
711       //
712       // #pragma omp target teams distribute
713       // for(...loop_tripcount...) {
714       //   #pragma omp parallel for
715       //   for(...) {}
716       // }
717       //
718       // Threads within a team will execute the iterations of the `parallel`
719       // loop.
720       TripCountNumBlocks = LoopTripCount;
721     }
722   }
723 
724   uint32_t PreferredNumBlocks = TripCountNumBlocks;
725   // If the loops are long running we would rather reuse blocks than spawn too
      // many.
726   if (GenericDevice.getReuseBlocksForHighTripCount())
727     PreferredNumBlocks = std::min(TripCountNumBlocks, DefaultNumBlocks);
728   return std::min(PreferredNumBlocks, GenericDevice.getBlockLimit());
729 }
730 
731 GenericDeviceTy::GenericDeviceTy(GenericPluginTy &Plugin, int32_t DeviceId,
732                                  int32_t NumDevices,
733                                  const llvm::omp::GV &OMPGridValues)
734     : Plugin(Plugin), MemoryManager(nullptr), OMP_TeamLimit("OMP_TEAM_LIMIT"),
735       OMP_NumTeams("OMP_NUM_TEAMS"),
736       OMP_TeamsThreadLimit("OMP_TEAMS_THREAD_LIMIT"),
737       OMPX_DebugKind("LIBOMPTARGET_DEVICE_RTL_DEBUG"),
738       OMPX_SharedMemorySize("LIBOMPTARGET_SHARED_MEMORY_SIZE"),
739       // Do not initialize the following two envars since they depend on the
740       // device initialization. These cannot be consulted until the device is
741       // initialized correctly. We initialize them in GenericDeviceTy::init().
742       OMPX_TargetStackSize(), OMPX_TargetHeapSize(),
743       // By default, the initial number of streams and events is 1.
744       OMPX_InitialNumStreams("LIBOMPTARGET_NUM_INITIAL_STREAMS", 1),
745       OMPX_InitialNumEvents("LIBOMPTARGET_NUM_INITIAL_EVENTS", 1),
746       DeviceId(DeviceId), GridValues(OMPGridValues),
747       PeerAccesses(NumDevices, PeerAccessState::PENDING), PeerAccessesLock(),
748       PinnedAllocs(*this), RPCServer(nullptr) {
749 #ifdef OMPT_SUPPORT
750   OmptInitialized.store(false);
751   // Bind the callbacks to this device's member functions
752 #define bindOmptCallback(Name, Type, Code)                                     \
753   if (ompt::Initialized && ompt::lookupCallbackByCode) {                       \
754     ompt::lookupCallbackByCode((ompt_callbacks_t)(Code),                       \
755                                ((ompt_callback_t *)&(Name##_fn)));             \
756     DP("OMPT: class bound %s=%p\n", #Name, ((void *)(uint64_t)Name##_fn));     \
757   }
758 
759   FOREACH_OMPT_DEVICE_EVENT(bindOmptCallback);
760 #undef bindOmptCallback
761 
762 #endif
763 }
764 
765 Error GenericDeviceTy::init(GenericPluginTy &Plugin) {
766   if (auto Err = initImpl(Plugin))
767     return Err;
768 
769 #ifdef OMPT_SUPPORT
770   if (ompt::Initialized) {
771     bool ExpectedStatus = false;
772     if (OmptInitialized.compare_exchange_strong(ExpectedStatus, true))
773       performOmptCallback(device_initialize, Plugin.getUserId(DeviceId),
774                           /*type=*/getComputeUnitKind().c_str(),
775                           /*device=*/reinterpret_cast<ompt_device_t *>(this),
776                           /*lookup=*/ompt::lookupCallbackByName,
777                           /*documentation=*/nullptr);
778   }
779 #endif
780 
781   // Read and reinitialize the envars that depend on the device initialization.
782   // Notice these two envars may change the stack size and heap size of the
783   // device, so they need the device properly initialized.
784   auto StackSizeEnvarOrErr = UInt64Envar::create(
785       "LIBOMPTARGET_STACK_SIZE",
786       [this](uint64_t &V) -> Error { return getDeviceStackSize(V); },
787       [this](uint64_t V) -> Error { return setDeviceStackSize(V); });
788   if (!StackSizeEnvarOrErr)
789     return StackSizeEnvarOrErr.takeError();
790   OMPX_TargetStackSize = std::move(*StackSizeEnvarOrErr);
791 
792   auto HeapSizeEnvarOrErr = UInt64Envar::create(
793       "LIBOMPTARGET_HEAP_SIZE",
794       [this](uint64_t &V) -> Error { return getDeviceHeapSize(V); },
795       [this](uint64_t V) -> Error { return setDeviceHeapSize(V); });
796   if (!HeapSizeEnvarOrErr)
797     return HeapSizeEnvarOrErr.takeError();
798   OMPX_TargetHeapSize = std::move(*HeapSizeEnvarOrErr);
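
  // For example (values hypothetical), both knobs can be set per run:
  //   LIBOMPTARGET_STACK_SIZE=16384 LIBOMPTARGET_HEAP_SIZE=$((1 << 30)) ./app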
799 
800   // Update the maximum number of teams and threads after the device
801   // initialization sets the corresponding hardware limit.
802   if (OMP_NumTeams > 0)
803     GridValues.GV_Max_Teams =
804         std::min(GridValues.GV_Max_Teams, uint32_t(OMP_NumTeams));
805 
806   if (OMP_TeamsThreadLimit > 0)
807     GridValues.GV_Max_WG_Size =
808         std::min(GridValues.GV_Max_WG_Size, uint32_t(OMP_TeamsThreadLimit));
809 
810   // Enable the memory manager if required.
811   auto [ThresholdMM, EnableMM] = MemoryManagerTy::getSizeThresholdFromEnv();
812   if (EnableMM)
813     MemoryManager = new MemoryManagerTy(*this, ThresholdMM);
814 
815   return Plugin::success();
816 }
817 
818 Error GenericDeviceTy::deinit(GenericPluginTy &Plugin) {
819   for (DeviceImageTy *Image : LoadedImages)
820     if (auto Err = callGlobalDestructors(Plugin, *Image))
821       return Err;
822 
823   if (OMPX_DebugKind.get() & uint32_t(DeviceDebugKind::AllocationTracker)) {
824     GenericGlobalHandlerTy &GHandler = Plugin.getGlobalHandler();
825     for (auto *Image : LoadedImages) {
826       DeviceMemoryPoolTrackingTy ImageDeviceMemoryPoolTracking = {0, 0, ~0U, 0};
827       GlobalTy TrackerGlobal("__omp_rtl_device_memory_pool_tracker",
828                              sizeof(DeviceMemoryPoolTrackingTy),
829                              &ImageDeviceMemoryPoolTracking);
830       if (auto Err =
831               GHandler.readGlobalFromDevice(*this, *Image, TrackerGlobal)) {
832         consumeError(std::move(Err));
833         continue;
834       }
835       DeviceMemoryPoolTracking.combine(ImageDeviceMemoryPoolTracking);
836     }
837 
838     // TODO: Write this by default into a file.
839     printf("\n\n|-----------------------\n"
840            "| Device memory tracker:\n"
841            "|-----------------------\n"
842            "| #Allocations: %lu\n"
843            "| Byes allocated: %lu\n"
844            "| Minimal allocation: %lu\n"
845            "| Maximal allocation: %lu\n"
846            "|-----------------------\n\n\n",
847            DeviceMemoryPoolTracking.NumAllocations,
848            DeviceMemoryPoolTracking.AllocationTotal,
849            DeviceMemoryPoolTracking.AllocationMin,
850            DeviceMemoryPoolTracking.AllocationMax);
851   }
852 
853   for (auto *Image : LoadedImages) {
854     GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler();
855     if (!Handler.hasProfilingGlobals(*this, *Image))
856       continue;
857 
858     GPUProfGlobals profdata;
859     auto ProfOrErr = Handler.readProfilingGlobals(*this, *Image);
860     if (!ProfOrErr)
861       return ProfOrErr.takeError();
862 
863     // TODO: write data to profiling file
864     ProfOrErr->dump();
865   }
866 
867   // Delete the memory manager before deinitializing the device. Otherwise,
868   // we may delete device allocations after the device is deinitialized.
869   if (MemoryManager)
870     delete MemoryManager;
871   MemoryManager = nullptr;
872 
873   RecordReplayTy &RecordReplay = Plugin.getRecordReplay();
874   if (RecordReplay.isRecordingOrReplaying())
875     RecordReplay.deinit();
876 
877   if (RPCServer)
878     if (auto Err = RPCServer->deinitDevice(*this))
879       return Err;
880 
881 #ifdef OMPT_SUPPORT
882   if (ompt::Initialized) {
883     bool ExpectedStatus = true;
884     if (OmptInitialized.compare_exchange_strong(ExpectedStatus, false))
885       performOmptCallback(device_finalize, Plugin.getUserId(DeviceId));
886   }
887 #endif
888 
889   return deinitImpl();
890 }
891 Expected<DeviceImageTy *>
892 GenericDeviceTy::loadBinary(GenericPluginTy &Plugin,
893                             const __tgt_device_image *InputTgtImage) {
894   assert(InputTgtImage && "Expected non-null target image");
895   DP("Load data from image " DPxMOD "\n", DPxPTR(InputTgtImage->ImageStart));
896 
897   auto PostJITImageOrErr = Plugin.getJIT().process(*InputTgtImage, *this);
898   if (!PostJITImageOrErr) {
899     auto Err = PostJITImageOrErr.takeError();
900     REPORT("Failure to jit IR image %p on device %d: %s\n", InputTgtImage,
901            DeviceId, toString(std::move(Err)).data());
902     return nullptr;
903   }
904 
905   // Load the binary and allocate the image object. Use the next available id
906   // for the image id, which is the number of previously loaded images.
907   auto ImageOrErr =
908       loadBinaryImpl(PostJITImageOrErr.get(), LoadedImages.size());
909   if (!ImageOrErr)
910     return ImageOrErr.takeError();
911 
912   DeviceImageTy *Image = *ImageOrErr;
913   assert(Image != nullptr && "Invalid image");
914   if (InputTgtImage != PostJITImageOrErr.get())
915     Image->setTgtImageBitcode(InputTgtImage);
916 
917   // Add the image to the list.
918   LoadedImages.push_back(Image);
919 
920   // Setup the device environment if needed.
921   if (auto Err = setupDeviceEnvironment(Plugin, *Image))
922     return std::move(Err);
923 
924   // Setup the global device memory pool if needed.
925   if (!Plugin.getRecordReplay().isReplaying() &&
926       shouldSetupDeviceMemoryPool()) {
927     uint64_t HeapSize;
928     auto SizeOrErr = getDeviceHeapSize(HeapSize);
929     if (SizeOrErr) {
930       REPORT("No global device memory pool due to error: %s\n",
931              toString(std::move(SizeOrErr)).data());
932     } else if (auto Err = setupDeviceMemoryPool(Plugin, *Image, HeapSize))
933       return std::move(Err);
934   }
935 
936   if (auto Err = setupRPCServer(Plugin, *Image))
937     return std::move(Err);
938 
939 #ifdef OMPT_SUPPORT
940   if (ompt::Initialized) {
941     size_t Bytes =
942         utils::getPtrDiff(InputTgtImage->ImageEnd, InputTgtImage->ImageStart);
943     performOmptCallback(
944         device_load, Plugin.getUserId(DeviceId),
945         /*FileName=*/nullptr, /*FileOffset=*/0, /*VmaInFile=*/nullptr,
946         /*ImgSize=*/Bytes, /*HostAddr=*/InputTgtImage->ImageStart,
947         /*DeviceAddr=*/nullptr, /* FIXME: ModuleId */ 0);
948   }
949 #endif
950 
951   // Call any global constructors present on the device.
952   if (auto Err = callGlobalConstructors(Plugin, *Image))
953     return std::move(Err);
954 
955   // Return the pointer to the table of entries.
956   return Image;
957 }
958 
959 Error GenericDeviceTy::setupDeviceEnvironment(GenericPluginTy &Plugin,
960                                               DeviceImageTy &Image) {
961   // There are some plugins that do not need this step.
962   if (!shouldSetupDeviceEnvironment())
963     return Plugin::success();
964 
965   // Obtain a table mapping host function pointers to device function pointers.
966   auto CallTablePairOrErr = setupIndirectCallTable(Plugin, *this, Image);
967   if (!CallTablePairOrErr)
968     return CallTablePairOrErr.takeError();
969 
970   DeviceEnvironmentTy DeviceEnvironment;
971   DeviceEnvironment.DeviceDebugKind = OMPX_DebugKind;
972   DeviceEnvironment.NumDevices = Plugin.getNumDevices();
973   // TODO: The device ID used here is not the real device ID used by OpenMP.
974   DeviceEnvironment.DeviceNum = DeviceId;
975   DeviceEnvironment.DynamicMemSize = OMPX_SharedMemorySize;
976   DeviceEnvironment.ClockFrequency = getClockFrequency();
977   DeviceEnvironment.IndirectCallTable =
978       reinterpret_cast<uintptr_t>(CallTablePairOrErr->first);
979   DeviceEnvironment.IndirectCallTableSize = CallTablePairOrErr->second;
980   DeviceEnvironment.HardwareParallelism = getHardwareParallelism();
981 
982   // Create the metainfo of the device environment global.
983   GlobalTy DevEnvGlobal("__omp_rtl_device_environment",
984                         sizeof(DeviceEnvironmentTy), &DeviceEnvironment);
985 
986   // Write device environment values to the device.
987   GenericGlobalHandlerTy &GHandler = Plugin.getGlobalHandler();
988   if (auto Err = GHandler.writeGlobalToDevice(*this, Image, DevEnvGlobal)) {
989     DP("Missing symbol %s, continue execution anyway.\n",
990        DevEnvGlobal.getName().data());
991     consumeError(std::move(Err));
992   }
993   return Plugin::success();
994 }
995 
996 Error GenericDeviceTy::setupDeviceMemoryPool(GenericPluginTy &Plugin,
997                                              DeviceImageTy &Image,
998                                              uint64_t PoolSize) {
999   // Free the old pool, if any.
1000   if (DeviceMemoryPool.Ptr) {
1001     if (auto Err = dataDelete(DeviceMemoryPool.Ptr,
1002                               TargetAllocTy::TARGET_ALLOC_DEVICE))
1003       return Err;
1004   }
1005 
1006   DeviceMemoryPool.Size = PoolSize;
1007   auto AllocOrErr = dataAlloc(PoolSize, /*HostPtr=*/nullptr,
1008                               TargetAllocTy::TARGET_ALLOC_DEVICE);
1009   if (AllocOrErr) {
1010     DeviceMemoryPool.Ptr = *AllocOrErr;
1011   } else {
1012     auto Err = AllocOrErr.takeError();
1013     REPORT("Failure to allocate device memory for global memory pool: %s\n",
1014            toString(std::move(Err)).data());
1015     DeviceMemoryPool.Ptr = nullptr;
1016     DeviceMemoryPool.Size = 0;
1017   }
1018 
1019   // Create the metainfo of the device environment global.
1020   GenericGlobalHandlerTy &GHandler = Plugin.getGlobalHandler();
1021   if (!GHandler.isSymbolInImage(*this, Image,
1022                                 "__omp_rtl_device_memory_pool_tracker")) {
1023     DP("Skip the memory pool as there is no tracker symbol in the image.");
1024     return Error::success();
1025   }
1026 
1027   GlobalTy TrackerGlobal("__omp_rtl_device_memory_pool_tracker",
1028                          sizeof(DeviceMemoryPoolTrackingTy),
1029                          &DeviceMemoryPoolTracking);
1030   if (auto Err = GHandler.writeGlobalToDevice(*this, Image, TrackerGlobal))
1031     return Err;
1032 
1033   // Create the metainfo of the device environment global.
1034   GlobalTy DevEnvGlobal("__omp_rtl_device_memory_pool",
1035                         sizeof(DeviceMemoryPoolTy), &DeviceMemoryPool);
1036 
1037   // Write device environment values to the device.
1038   return GHandler.writeGlobalToDevice(*this, Image, DevEnvGlobal);
1039 }
1040 
1041 Error GenericDeviceTy::setupRPCServer(GenericPluginTy &Plugin,
1042                                       DeviceImageTy &Image) {
1043   // The plugin either does not need an RPC server or it is unavailable.
1044   if (!shouldSetupRPCServer())
1045     return Plugin::success();
1046 
1047   // Check if this device needs to run an RPC server.
1048   RPCServerTy &Server = Plugin.getRPCServer();
1049   auto UsingOrErr =
1050       Server.isDeviceUsingRPC(*this, Plugin.getGlobalHandler(), Image);
1051   if (!UsingOrErr)
1052     return UsingOrErr.takeError();
1053 
1054   if (!UsingOrErr.get())
1055     return Plugin::success();
1056 
1057   if (auto Err = Server.initDevice(*this, Plugin.getGlobalHandler(), Image))
1058     return Err;
1059 
1060   if (auto Err = Server.startThread())
1061     return Err;
1062 
1063   RPCServer = &Server;
1064   DP("Running an RPC server on device %d\n", getDeviceId());
1065   return Plugin::success();
1066 }
1067 
1068 Error PinnedAllocationMapTy::insertEntry(void *HstPtr, void *DevAccessiblePtr,
1069                                          size_t Size, bool ExternallyLocked) {
1070   // Insert the new entry into the map.
1071   auto Res = Allocs.insert({HstPtr, DevAccessiblePtr, Size, ExternallyLocked});
1072   if (!Res.second)
1073     return Plugin::error("Cannot insert locked buffer entry");
1074 
1075   // Check whether the next entry overlaps with the inserted entry.
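  // E.g. (hypothetical): a newly inserted entry covering [0x1000, 0x2000) and
  // a next existing entry starting at 0x1800 partially overlap, which the
  // check below rejects.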
1076   auto It = std::next(Res.first);
1077   if (It == Allocs.end())
1078     return Plugin::success();
1079 
1080   const EntryTy *NextEntry = &(*It);
1081   if (intersects(NextEntry->HstPtr, NextEntry->Size, HstPtr, Size))
1082     return Plugin::error("Partial overlapping not allowed in locked buffers");
1083 
1084   return Plugin::success();
1085 }
1086 
1087 Error PinnedAllocationMapTy::eraseEntry(const EntryTy &Entry) {
1088   // Erase the existing entry. Notice this requires an additional map lookup,
1089   // but this should not be a performance issue. Using iterators would make
1090   // the code more difficult to read.
1091   size_t Erased = Allocs.erase({Entry.HstPtr});
1092   if (!Erased)
1093     return Plugin::error("Cannot erase locked buffer entry");
1094   return Plugin::success();
1095 }
1096 
1097 Error PinnedAllocationMapTy::registerEntryUse(const EntryTy &Entry,
1098                                               void *HstPtr, size_t Size) {
1099   if (!contains(Entry.HstPtr, Entry.Size, HstPtr, Size))
1100     return Plugin::error("Partial overlapping not allowed in locked buffers");
1101 
1102   ++Entry.References;
1103   return Plugin::success();
1104 }
1105 
1106 Expected<bool> PinnedAllocationMapTy::unregisterEntryUse(const EntryTy &Entry) {
1107   if (Entry.References == 0)
1108     return Plugin::error("Invalid number of references");
1109 
1110   // Return whether this was the last user.
1111   return (--Entry.References == 0);
1112 }
1113 
1114 Error PinnedAllocationMapTy::registerHostBuffer(void *HstPtr,
1115                                                 void *DevAccessiblePtr,
1116                                                 size_t Size) {
1117   assert(HstPtr && "Invalid pointer");
1118   assert(DevAccessiblePtr && "Invalid pointer");
1119   assert(Size && "Invalid size");
1120 
1121   std::lock_guard<std::shared_mutex> Lock(Mutex);
1122 
1123   // No pinned allocation should intersect.
1124   const EntryTy *Entry = findIntersecting(HstPtr);
1125   if (Entry)
1126     return Plugin::error("Cannot insert entry due to an existing one");
1127 
1128   // Now insert the new entry.
1129   return insertEntry(HstPtr, DevAccessiblePtr, Size);
1130 }
1131 
1132 Error PinnedAllocationMapTy::unregisterHostBuffer(void *HstPtr) {
1133   assert(HstPtr && "Invalid pointer");
1134 
1135   std::lock_guard<std::shared_mutex> Lock(Mutex);
1136 
1137   const EntryTy *Entry = findIntersecting(HstPtr);
1138   if (!Entry)
1139     return Plugin::error("Cannot find locked buffer");
1140 
1141   // The address in the entry should be the same we are unregistering.
1142   if (Entry->HstPtr != HstPtr)
1143     return Plugin::error("Unexpected host pointer in locked buffer entry");
1144 
1145   // Unregister from the entry.
1146   auto LastUseOrErr = unregisterEntryUse(*Entry);
1147   if (!LastUseOrErr)
1148     return LastUseOrErr.takeError();
1149 
1150   // There should be no other references to the pinned allocation.
1151   if (!(*LastUseOrErr))
1152     return Plugin::error("The locked buffer is still being used");
1153 
1154   // Erase the entry from the map.
1155   return eraseEntry(*Entry);
1156 }
1157 
1158 Expected<void *> PinnedAllocationMapTy::lockHostBuffer(void *HstPtr,
1159                                                        size_t Size) {
1160   assert(HstPtr && "Invalid pointer");
1161   assert(Size && "Invalid size");
1162 
1163   std::lock_guard<std::shared_mutex> Lock(Mutex);
1164 
1165   const EntryTy *Entry = findIntersecting(HstPtr);
1166 
1167   if (Entry) {
1168     // An already registered intersecting buffer was found. Register a new use.
1169     if (auto Err = registerEntryUse(*Entry, HstPtr, Size))
1170       return std::move(Err);
1171 
1172     // Return the device accessible pointer with the correct offset.
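    // E.g. (hypothetical): for an entry with HstPtr = 0x1000 and a query for
    // HstPtr = 0x1100, we return Entry->DevAccessiblePtr + 0x100.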
1173     return utils::advancePtr(Entry->DevAccessiblePtr,
1174                              utils::getPtrDiff(HstPtr, Entry->HstPtr));
1175   }
1176 
1177   // No intersecting registered allocation found in the map. First, lock the
1178   // host buffer and retrieve the device accessible pointer.
1179   auto DevAccessiblePtrOrErr = Device.dataLockImpl(HstPtr, Size);
1180   if (!DevAccessiblePtrOrErr)
1181     return DevAccessiblePtrOrErr.takeError();
1182 
1183   // Now insert the new entry into the map.
1184   if (auto Err = insertEntry(HstPtr, *DevAccessiblePtrOrErr, Size))
1185     return std::move(Err);
1186 
1187   // Return the device accessible pointer.
1188   return *DevAccessiblePtrOrErr;
1189 }
1190 
1191 Error PinnedAllocationMapTy::unlockHostBuffer(void *HstPtr) {
1192   assert(HstPtr && "Invalid pointer");
1193 
1194   std::lock_guard<std::shared_mutex> Lock(Mutex);
1195 
1196   const EntryTy *Entry = findIntersecting(HstPtr);
1197   if (!Entry)
1198     return Plugin::error("Cannot find locked buffer");
1199 
1200   // Unregister a use of the locked buffer.
1202   auto LastUseOrErr = unregisterEntryUse(*Entry);
1203   if (!LastUseOrErr)
1204     return LastUseOrErr.takeError();
1205 
1206   // No need to do anything if there are others using the allocation.
1207   if (!(*LastUseOrErr))
1208     return Plugin::success();
1209 
1210   // This was the last user of the allocation. Unlock the original locked buffer
1211   // if it was locked by the plugin. Do not unlock it if it was locked by an
1212   // external entity. Unlock the buffer using the host pointer of the entry.
1213   if (!Entry->ExternallyLocked)
1214     if (auto Err = Device.dataUnlockImpl(Entry->HstPtr))
1215       return Err;
1216 
1217   // Erase the entry from the map.
1218   return eraseEntry(*Entry);
1219 }
1220 
1221 Error PinnedAllocationMapTy::lockMappedHostBuffer(void *HstPtr, size_t Size) {
1222   assert(HstPtr && "Invalid pointer");
1223   assert(Size && "Invalid size");
1224 
1225   std::lock_guard<std::shared_mutex> Lock(Mutex);
1226 
1227   // If previously registered, just register a new user on the entry.
1228   const EntryTy *Entry = findIntersecting(HstPtr);
1229   if (Entry)
1230     return registerEntryUse(*Entry, HstPtr, Size);
1231 
1232   size_t BaseSize;
1233   void *BaseHstPtr, *BaseDevAccessiblePtr;
1234 
1235   // Check if it was externally pinned by a vendor-specific API.
1236   auto IsPinnedOrErr = Device.isPinnedPtrImpl(HstPtr, BaseHstPtr,
1237                                               BaseDevAccessiblePtr, BaseSize);
1238   if (!IsPinnedOrErr)
1239     return IsPinnedOrErr.takeError();
1240 
1241   // If pinned, just insert the entry representing the whole pinned buffer.
1242   if (*IsPinnedOrErr)
1243     return insertEntry(BaseHstPtr, BaseDevAccessiblePtr, BaseSize,
1244                        /*ExternallyLocked=*/true);
1245 
1246   // Not externally pinned. Do nothing if locking of mapped buffers is disabled.
1247   if (!LockMappedBuffers)
1248     return Plugin::success();
1249 
1250   // Otherwise, lock the buffer and insert the new entry.
1251   auto DevAccessiblePtrOrErr = Device.dataLockImpl(HstPtr, Size);
1252   if (!DevAccessiblePtrOrErr) {
1253     // Errors may be tolerated.
1254     if (!IgnoreLockMappedFailures)
1255       return DevAccessiblePtrOrErr.takeError();
1256 
1257     consumeError(DevAccessiblePtrOrErr.takeError());
1258     return Plugin::success();
1259   }
1260 
1261   return insertEntry(HstPtr, *DevAccessiblePtrOrErr, Size);
1262 }
1263 
1264 Error PinnedAllocationMapTy::unlockUnmappedHostBuffer(void *HstPtr) {
1265   assert(HstPtr && "Invalid pointer");
1266 
1267   std::lock_guard<std::shared_mutex> Lock(Mutex);
1268 
1269   // Check whether there is any intersecting entry.
1270   const EntryTy *Entry = findIntersecting(HstPtr);
1271 
1272   // No entry and automatic locking of mapped buffers is disabled, so there
1273   // is nothing to do.
1274   if (!Entry && !LockMappedBuffers)
1275     return Plugin::success();
1276 
1277   // No entry, automatic locking is enabled, but the locking may have failed, so
1278   // do nothing.
1279   if (!Entry && IgnoreLockMappedFailures)
1280     return Plugin::success();
1281 
1282   // No entry, but the automatic locking is enabled, so this is an error.
1283   if (!Entry)
1284     return Plugin::error("Locked buffer not found");
1285 
1286   // There is an entry, so unregister a use and check whether it was the last
       // one.
1287   auto LastUseOrErr = unregisterEntryUse(*Entry);
1288   if (!LastUseOrErr)
1289     return LastUseOrErr.takeError();
1290 
1291   // If it is not the last one, there is nothing to do.
1292   if (!(*LastUseOrErr))
1293     return Plugin::success();
1294 
1295   // Otherwise, if it was the last and the buffer was locked by the plugin,
1296   // unlock it.
1297   if (!Entry->ExternallyLocked)
1298     if (auto Err = Device.dataUnlockImpl(Entry->HstPtr))
1299       return Err;
1300 
1301   // Finally erase the entry from the map.
1302   return eraseEntry(*Entry);
1303 }
1304 
1305 Error GenericDeviceTy::synchronize(__tgt_async_info *AsyncInfo) {
1306   if (!AsyncInfo || !AsyncInfo->Queue)
1307     return Plugin::error("Invalid async info queue");
1308 
1309   if (auto Err = synchronizeImpl(*AsyncInfo))
1310     return Err;
1311 
1312   for (auto *Ptr : AsyncInfo->AssociatedAllocations)
1313     if (auto Err = dataDelete(Ptr, TargetAllocTy::TARGET_ALLOC_DEVICE))
1314       return Err;
1315   AsyncInfo->AssociatedAllocations.clear();
1316 
1317   return Plugin::success();
1318 }
1319 
1320 Error GenericDeviceTy::queryAsync(__tgt_async_info *AsyncInfo) {
1321   if (!AsyncInfo || !AsyncInfo->Queue)
1322     return Plugin::error("Invalid async info queue");
1323 
1324   return queryAsyncImpl(*AsyncInfo);
1325 }
1326 
1327 Error GenericDeviceTy::memoryVAMap(void **Addr, void *VAddr, size_t *RSize) {
1328   return Plugin::error("Device does not suppport VA Management");
1329 }
1330 
1331 Error GenericDeviceTy::memoryVAUnMap(void *VAddr, size_t Size) {
1332   return Plugin::error("Device does not suppport VA Management");
1333 }
1334 
1335 Error GenericDeviceTy::getDeviceMemorySize(uint64_t &DSize) {
1336   return Plugin::error(
1337       "Mising getDeviceMemorySize impelmentation (required by RR-heuristic");
1338 }
1339 
1340 Expected<void *> GenericDeviceTy::dataAlloc(int64_t Size, void *HostPtr,
1341                                             TargetAllocTy Kind) {
1342   void *Alloc = nullptr;
1343 
1344   if (Plugin.getRecordReplay().isRecordingOrReplaying())
1345     return Plugin.getRecordReplay().alloc(Size);
1346 
1347   switch (Kind) {
1348   case TARGET_ALLOC_DEFAULT:
1349   case TARGET_ALLOC_DEVICE_NON_BLOCKING:
1350   case TARGET_ALLOC_DEVICE:
1351     if (MemoryManager) {
1352       Alloc = MemoryManager->allocate(Size, HostPtr);
1353       if (!Alloc)
1354         return Plugin::error("Failed to allocate from memory manager");
1355       break;
1356     }
1357     [[fallthrough]];
1358   case TARGET_ALLOC_HOST:
1359   case TARGET_ALLOC_SHARED:
1360     Alloc = allocate(Size, HostPtr, Kind);
1361     if (!Alloc)
1362       return Plugin::error("Failed to allocate from device allocator");
1363   }
1364 
1365   // Report error if the memory manager or the device allocator did not return
1366   // any memory buffer.
1367   if (!Alloc)
1368     return Plugin::error("Invalid target data allocation kind or requested "
1369                          "allocator not implemented yet");
1370 
1371   // Register allocated buffer as pinned memory if the type is host memory.
1372   if (Kind == TARGET_ALLOC_HOST)
1373     if (auto Err = PinnedAllocs.registerHostBuffer(Alloc, Alloc, Size))
1374       return std::move(Err);
1375 
1376   // Keep track of the allocation stack if we track allocation traces.
1377   if (OMPX_TrackAllocationTraces) {
1378     std::string StackTrace;
1379     llvm::raw_string_ostream OS(StackTrace);
1380     llvm::sys::PrintStackTrace(OS);
1381 
1382     AllocationTraceInfoTy *ATI = new AllocationTraceInfoTy();
1383     ATI->AllocationTrace = std::move(StackTrace);
1384     ATI->DevicePtr = Alloc;
1385     ATI->HostPtr = HostPtr;
1386     ATI->Size = Size;
1387     ATI->Kind = Kind;
1388 
1389     auto AllocationTraceMap = AllocationTraces.getExclusiveAccessor();
1390     auto *&MapATI = (*AllocationTraceMap)[Alloc];
1391     ATI->LastAllocationInfo = MapATI;
1392     MapATI = ATI;
1393   }
1394 
1395   return Alloc;
1396 }
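
// A minimal sketch of the allocation-kind dispatch above (illustration only;
// Dev and Size are placeholders, and this mirrors the switch rather than
// adding any API):
//
//   // Device-like kinds prefer the MemoryManager when one exists and fall
//   // through to the plain device allocator otherwise:
//   void *D = *Dev.dataAlloc(Size, /*HostPtr=*/nullptr, TARGET_ALLOC_DEVICE);
//   // Host allocations always use the device allocator and are then
//   // registered as pinned memory with PinnedAllocs:
//   void *H = *Dev.dataAlloc(Size, /*HostPtr=*/nullptr, TARGET_ALLOC_HOST);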
1397 
1398 Error GenericDeviceTy::dataDelete(void *TgtPtr, TargetAllocTy Kind) {
1399   // Free is a noop when recording or replaying.
1400   if (Plugin.getRecordReplay().isRecordingOrReplaying())
1401     return Plugin::success();
1402 
1403   // Keep track of the deallocation stack if we track allocation traces.
1404   if (OMPX_TrackAllocationTraces) {
1405     AllocationTraceInfoTy *ATI = nullptr;
1406     {
1407       auto AllocationTraceMap = AllocationTraces.getExclusiveAccessor();
1408       ATI = (*AllocationTraceMap)[TgtPtr];
1409     }
1410 
1411     std::string StackTrace;
1412     llvm::raw_string_ostream OS(StackTrace);
1413     llvm::sys::PrintStackTrace(OS);
1414 
1415     if (!ATI)
1416       ErrorReporter::reportDeallocationOfNonAllocatedPtr(TgtPtr, Kind, ATI,
1417                                                          StackTrace);
1418 
1419     // ATI is not null, thus we can lock it to inspect and modify it further.
1420     std::lock_guard<std::mutex> LG(ATI->Lock);
1421     if (!ATI->DeallocationTrace.empty())
1422       ErrorReporter::reportDeallocationOfDeallocatedPtr(TgtPtr, Kind, ATI,
1423                                                         StackTrace);
1424 
1425     if (ATI->Kind != Kind)
1426       ErrorReporter::reportDeallocationOfWrongPtrKind(TgtPtr, Kind, ATI,
1427                                                       StackTrace);
1428 
1429     ATI->DeallocationTrace = StackTrace;
1432   }
1433 
1434   int Res;
1435   switch (Kind) {
1436   case TARGET_ALLOC_DEFAULT:
1437   case TARGET_ALLOC_DEVICE_NON_BLOCKING:
1438   case TARGET_ALLOC_DEVICE:
1439     if (MemoryManager) {
1440       Res = MemoryManager->free(TgtPtr);
1441       if (Res)
1442         return Plugin::error(
1443             "Failure to deallocate device pointer %p via memory manager",
1444             TgtPtr);
1445       break;
1446     }
1447     [[fallthrough]];
1448   case TARGET_ALLOC_HOST:
1449   case TARGET_ALLOC_SHARED:
1450     Res = free(TgtPtr, Kind);
1451     if (Res)
1452       return Plugin::error(
1453           "Failure to deallocate device pointer %p via device deallocator",
1454           TgtPtr);
1455   }
1456 
1457   // Unregister deallocated pinned memory buffer if the type is host memory.
1458   if (Kind == TARGET_ALLOC_HOST)
1459     if (auto Err = PinnedAllocs.unregisterHostBuffer(TgtPtr))
1460       return Err;
1461 
1462   return Plugin::success();
1463 }
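
// Note: the Kind passed to dataDelete() must match the kind used at
// allocation time; with OMPX_TrackAllocationTraces enabled, a mismatch is
// diagnosed via reportDeallocationOfWrongPtrKind() above. Hypothetical
// example (error handling elided):
//
//   void *P = *Dev.dataAlloc(Size, /*HostPtr=*/nullptr, TARGET_ALLOC_HOST);
//   (void)Dev.dataDelete(P, TARGET_ALLOC_HOST); // kinds agree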
1464 
1465 Error GenericDeviceTy::dataSubmit(void *TgtPtr, const void *HstPtr,
1466                                   int64_t Size, __tgt_async_info *AsyncInfo) {
1467   AsyncInfoWrapperTy AsyncInfoWrapper(*this, AsyncInfo);
1468 
1469   auto Err = dataSubmitImpl(TgtPtr, HstPtr, Size, AsyncInfoWrapper);
1470   AsyncInfoWrapper.finalize(Err);
1471   return Err;
1472 }
1473 
1474 Error GenericDeviceTy::dataRetrieve(void *HstPtr, const void *TgtPtr,
1475                                     int64_t Size, __tgt_async_info *AsyncInfo) {
1476   AsyncInfoWrapperTy AsyncInfoWrapper(*this, AsyncInfo);
1477 
1478   auto Err = dataRetrieveImpl(HstPtr, TgtPtr, Size, AsyncInfoWrapper);
1479   AsyncInfoWrapper.finalize(Err);
1480   return Err;
1481 }
1482 
1483 Error GenericDeviceTy::dataExchange(const void *SrcPtr, GenericDeviceTy &DstDev,
1484                                     void *DstPtr, int64_t Size,
1485                                     __tgt_async_info *AsyncInfo) {
1486   AsyncInfoWrapperTy AsyncInfoWrapper(*this, AsyncInfo);
1487 
1488   auto Err = dataExchangeImpl(SrcPtr, DstDev, DstPtr, Size, AsyncInfoWrapper);
1489   AsyncInfoWrapper.finalize(Err);
1490   return Err;
1491 }
1492 
1493 Error GenericDeviceTy::launchKernel(void *EntryPtr, void **ArgPtrs,
1494                                     ptrdiff_t *ArgOffsets,
1495                                     KernelArgsTy &KernelArgs,
1496                                     __tgt_async_info *AsyncInfo) {
1497   AsyncInfoWrapperTy AsyncInfoWrapper(
1498       *this,
1499       Plugin.getRecordReplay().isRecordingOrReplaying() ? nullptr : AsyncInfo);
1500 
1501   GenericKernelTy &GenericKernel =
1502       *reinterpret_cast<GenericKernelTy *>(EntryPtr);
1503 
1504   {
1505     std::string StackTrace;
1506     if (OMPX_TrackNumKernelLaunches) {
1507       llvm::raw_string_ostream OS(StackTrace);
1508       llvm::sys::PrintStackTrace(OS);
1509     }
1510 
1511     auto KernelTraceInfoRecord = KernelLaunchTraces.getExclusiveAccessor();
1512     (*KernelTraceInfoRecord)
1513         .emplace(&GenericKernel, std::move(StackTrace), AsyncInfo);
1514   }
1515 
1516   auto Err = GenericKernel.launch(*this, ArgPtrs, ArgOffsets, KernelArgs,
1517                                   AsyncInfoWrapper);
1518 
1519   // 'finalize' here to guarantee that subsequent record-replay actions stay in sync.
1520   AsyncInfoWrapper.finalize(Err);
1521 
1522   RecordReplayTy &RecordReplay = Plugin.getRecordReplay();
1523   if (RecordReplay.isRecordingOrReplaying() &&
1524       RecordReplay.isSaveOutputEnabled())
1525     RecordReplay.saveKernelOutputInfo(GenericKernel.getName());
1526 
1527   return Err;
1528 }
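
// Note: the EntryPtr passed to launchKernel() is the GenericKernelTy handle
// produced by get_function() below, not a device code address (see the note
// at the end of get_function()).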
1529 
1530 Error GenericDeviceTy::initAsyncInfo(__tgt_async_info **AsyncInfoPtr) {
1531   assert(AsyncInfoPtr && "Invalid async info");
1532 
1533   *AsyncInfoPtr = new __tgt_async_info();
1534 
1535   AsyncInfoWrapperTy AsyncInfoWrapper(*this, *AsyncInfoPtr);
1536 
1537   auto Err = initAsyncInfoImpl(AsyncInfoWrapper);
1538   AsyncInfoWrapper.finalize(Err);
1539   return Err;
1540 }
1541 
1542 Error GenericDeviceTy::initDeviceInfo(__tgt_device_info *DeviceInfo) {
1543   assert(DeviceInfo && "Invalid device info");
1544 
1545   return initDeviceInfoImpl(DeviceInfo);
1546 }
1547 
1548 Error GenericDeviceTy::printInfo() {
1549   InfoQueueTy InfoQueue;
1550 
1551   // Get the vendor-specific info entries describing the device properties.
1552   if (auto Err = obtainInfoImpl(InfoQueue))
1553     return Err;
1554 
1555   // Print all info entries.
1556   InfoQueue.print();
1557 
1558   return Plugin::success();
1559 }
1560 
1561 Error GenericDeviceTy::createEvent(void **EventPtrStorage) {
1562   return createEventImpl(EventPtrStorage);
1563 }
1564 
1565 Error GenericDeviceTy::destroyEvent(void *EventPtr) {
1566   return destroyEventImpl(EventPtr);
1567 }
1568 
1569 Error GenericDeviceTy::recordEvent(void *EventPtr,
1570                                    __tgt_async_info *AsyncInfo) {
1571   AsyncInfoWrapperTy AsyncInfoWrapper(*this, AsyncInfo);
1572 
1573   auto Err = recordEventImpl(EventPtr, AsyncInfoWrapper);
1574   AsyncInfoWrapper.finalize(Err);
1575   return Err;
1576 }
1577 
1578 Error GenericDeviceTy::waitEvent(void *EventPtr, __tgt_async_info *AsyncInfo) {
1579   AsyncInfoWrapperTy AsyncInfoWrapper(*this, AsyncInfo);
1580 
1581   auto Err = waitEventImpl(EventPtr, AsyncInfoWrapper);
1582   AsyncInfoWrapper.finalize(Err);
1583   return Err;
1584 }
1585 
1586 Error GenericDeviceTy::syncEvent(void *EventPtr) {
1587   return syncEventImpl(EventPtr);
1588 }
1589 
1590 bool GenericDeviceTy::useAutoZeroCopy() { return useAutoZeroCopyImpl(); }
1591 
1592 Error GenericPluginTy::init() {
1593   if (Initialized)
1594     return Plugin::success();
1595 
1596   auto NumDevicesOrErr = initImpl();
1597   if (!NumDevicesOrErr)
1598     return NumDevicesOrErr.takeError();
1599   Initialized = true;
1600 
1601   NumDevices = *NumDevicesOrErr;
1602   if (NumDevices == 0)
1603     return Plugin::success();
1604 
1605   assert(Devices.size() == 0 && "Plugin already initialized");
1606   Devices.resize(NumDevices, nullptr);
1607 
1608   GlobalHandler = createGlobalHandler();
1609   assert(GlobalHandler && "Invalid global handler");
1610 
1611   RPCServer = new RPCServerTy(*this);
1612   assert(RPCServer && "Invalid RPC server");
1613 
1614   RecordReplay = new RecordReplayTy();
1615   assert(RecordReplay && "Invalid RR interface");
1616 
1617   return Plugin::success();
1618 }
1619 
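// Deinitialization mirrors init(): devices are torn down first, then the
// global handler, the RPC server (after stopping its thread if running), and
// the record-replay interface, before the plugin-specific deinitImpl() runs.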
1620 Error GenericPluginTy::deinit() {
1621   assert(Initialized && "Plugin was not initialized!");
1622 
1623   // Deinitialize all active devices.
1624   for (int32_t DeviceId = 0; DeviceId < NumDevices; ++DeviceId) {
1625     if (Devices[DeviceId]) {
1626       if (auto Err = deinitDevice(DeviceId))
1627         return Err;
1628     }
1629     assert(!Devices[DeviceId] && "Device was not deinitialized");
1630   }
1631 
1632   // There is no global handler if no device is available.
1633   if (GlobalHandler)
1634     delete GlobalHandler;
1635 
1636   if (RPCServer && RPCServer->Thread->Running.load(std::memory_order_relaxed))
1637     if (Error Err = RPCServer->shutDown())
1638       return Err;
1639 
1640   if (RPCServer)
1641     delete RPCServer;
1642 
1643   if (RecordReplay)
1644     delete RecordReplay;
1645 
1646   // Perform last deinitializations on the plugin.
1647   if (Error Err = deinitImpl())
1648     return Err;
1649   Initialized = false;
1650 
1651   return Plugin::success();
1652 }
1653 
1654 Error GenericPluginTy::initDevice(int32_t DeviceId) {
1655   assert(!Devices[DeviceId] && "Device already initialized");
1656 
1657   // Create the device and save the reference.
1658   GenericDeviceTy *Device = createDevice(*this, DeviceId, NumDevices);
1659   assert(Device && "Invalid device");
1660 
1661   // Save the device reference into the list.
1662   Devices[DeviceId] = Device;
1663 
1664   // Initialize the device and its resources.
1665   return Device->init(*this);
1666 }
1667 
1668 Error GenericPluginTy::deinitDevice(int32_t DeviceId) {
1669   // The device may already be deinitialized.
1670   if (Devices[DeviceId] == nullptr)
1671     return Plugin::success();
1672 
1673   // Deinitialize the device and release its resources.
1674   if (auto Err = Devices[DeviceId]->deinit(*this))
1675     return Err;
1676 
1677   // Delete the device and invalidate its reference.
1678   delete Devices[DeviceId];
1679   Devices[DeviceId] = nullptr;
1680 
1681   return Plugin::success();
1682 }
1683 
1684 Expected<bool> GenericPluginTy::checkELFImage(StringRef Image) const {
1685   // First check if this image is a regular ELF file.
1686   if (!utils::elf::isELF(Image))
1687     return false;
1688 
1689   // Check if this image is an ELF with a matching machine value.
1690   auto MachineOrErr = utils::elf::checkMachine(Image, getMagicElfBits());
1691   if (!MachineOrErr)
1692     return MachineOrErr.takeError();
1693 
1694   return MachineOrErr;
1695 }
1696 
1697 Expected<bool> GenericPluginTy::checkBitcodeImage(StringRef Image) const {
1698   if (identify_magic(Image) != file_magic::bitcode)
1699     return false;
1700 
1701   LLVMContext Context;
1702   auto ModuleOrErr = getLazyBitcodeModule(MemoryBufferRef(Image, ""), Context,
1703                                           /*ShouldLazyLoadMetadata=*/true);
1704   if (!ModuleOrErr)
1705     return ModuleOrErr.takeError();
1706   Module &M = **ModuleOrErr;
1707 
1708   return Triple(M.getTargetTriple()).getArch() == getTripleArch();
1709 }
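
// A minimal sketch of how the two image checks are meant to be used
// (hypothetical buffer, for exposition only): both answer "could this plugin
// consume the image?" without fully loading it.
//
//   StringRef Buffer = /* raw __tgt_device_image contents */;
//   Expected<bool> IsCompatELF = checkELFImage(Buffer);    // ELF machine
//   Expected<bool> IsCompatBC = checkBitcodeImage(Buffer); // bitcode triple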
1710 
1711 int32_t GenericPluginTy::is_initialized() const { return Initialized; }
1712 
1713 int32_t GenericPluginTy::is_plugin_compatible(__tgt_device_image *Image) {
1714   StringRef Buffer(reinterpret_cast<const char *>(Image->ImageStart),
1715                    utils::getPtrDiff(Image->ImageEnd, Image->ImageStart));
1716 
1717   auto HandleError = [&](Error Err) -> bool {
1718     [[maybe_unused]] std::string ErrStr = toString(std::move(Err));
1719     DP("Failure to check validity of image %p: %s", Image, ErrStr.c_str());
1720     return false;
1721   };
1722   switch (identify_magic(Buffer)) {
1723   case file_magic::elf:
1724   case file_magic::elf_relocatable:
1725   case file_magic::elf_executable:
1726   case file_magic::elf_shared_object:
1727   case file_magic::elf_core: {
1728     auto MatchOrErr = checkELFImage(Buffer);
1729     if (Error Err = MatchOrErr.takeError())
1730       return HandleError(std::move(Err));
1731     return *MatchOrErr;
1732   }
1733   case file_magic::bitcode: {
1734     auto MatchOrErr = checkBitcodeImage(Buffer);
1735     if (Error Err = MatchOrErr.takeError())
1736       return HandleError(std::move(Err));
1737     return *MatchOrErr;
1738   }
1739   default:
1740     return false;
1741   }
1742 }
1743 
1744 int32_t GenericPluginTy::is_device_compatible(int32_t DeviceId,
1745                                               __tgt_device_image *Image) {
1746   StringRef Buffer(reinterpret_cast<const char *>(Image->ImageStart),
1747                    utils::getPtrDiff(Image->ImageEnd, Image->ImageStart));
1748 
1749   auto HandleError = [&](Error Err) -> bool {
1750     [[maybe_unused]] std::string ErrStr = toString(std::move(Err));
1751     DP("Failure to check validity of image %p: %s", Image, ErrStr.c_str());
1752     return false;
1753   };
1754   switch (identify_magic(Buffer)) {
1755   case file_magic::elf:
1756   case file_magic::elf_relocatable:
1757   case file_magic::elf_executable:
1758   case file_magic::elf_shared_object:
1759   case file_magic::elf_core: {
1760     auto MatchOrErr = checkELFImage(Buffer);
1761     if (Error Err = MatchOrErr.takeError())
1762       return HandleError(std::move(Err));
1763     if (!*MatchOrErr)
1764       return false;
1765 
1766     // Perform plugin-dependent checks for the specific architecture if needed.
1767     auto CompatibleOrErr = isELFCompatible(DeviceId, Buffer);
1768     if (Error Err = CompatibleOrErr.takeError())
1769       return HandleError(std::move(Err));
1770     return *CompatibleOrErr;
1771   }
1772   case file_magic::bitcode: {
1773     auto MatchOrErr = checkBitcodeImage(Buffer);
1774     if (Error Err = MatchOrErr.takeError())
1775       return HandleError(std::move(Err));
1776     return *MatchOrErr;
1777   }
1778   default:
1779     return false;
1780   }
1781 }
1782 
1783 int32_t GenericPluginTy::is_device_initialized(int32_t DeviceId) const {
1784   return isValidDeviceId(DeviceId) && Devices[DeviceId] != nullptr;
1785 }
1786 
1787 int32_t GenericPluginTy::init_device(int32_t DeviceId) {
1788   auto Err = initDevice(DeviceId);
1789   if (Err) {
1790     REPORT("Failure to initialize device %d: %s\n", DeviceId,
1791            toString(std::move(Err)).data());
1792     return OFFLOAD_FAIL;
1793   }
1794 
1795   return OFFLOAD_SUCCESS;
1796 }
1797 
1798 int32_t GenericPluginTy::number_of_devices() { return getNumDevices(); }
1799 
1800 int32_t GenericPluginTy::is_data_exchangable(int32_t SrcDeviceId,
1801                                              int32_t DstDeviceId) {
1802   return isDataExchangable(SrcDeviceId, DstDeviceId);
1803 }
1804 
1805 int32_t GenericPluginTy::initialize_record_replay(int32_t DeviceId,
1806                                                   int64_t MemorySize,
1807                                                   void *VAddr, bool isRecord,
1808                                                   bool SaveOutput,
1809                                                   uint64_t &ReqPtrArgOffset) {
1810   GenericDeviceTy &Device = getDevice(DeviceId);
1811   RecordReplayTy::RRStatusTy Status =
1812       isRecord ? RecordReplayTy::RRStatusTy::RRRecording
1813                : RecordReplayTy::RRStatusTy::RRReplaying;
1814 
1815   if (auto Err = RecordReplay->init(&Device, MemorySize, VAddr, Status,
1816                                     SaveOutput, ReqPtrArgOffset)) {
1817     REPORT("WARNING RR did not intialize RR-properly with %lu bytes"
1818            "(Error: %s)\n",
1819            MemorySize, toString(std::move(Err)).data());
1820     RecordReplay->setStatus(RecordReplayTy::RRStatusTy::RRDeactivated);
1821 
1822     if (!isRecord) {
1823       return OFFLOAD_FAIL;
1824     }
1825   }
1826   return OFFLOAD_SUCCESS;
1827 }
1828 
1829 int32_t GenericPluginTy::load_binary(int32_t DeviceId,
1830                                      __tgt_device_image *TgtImage,
1831                                      __tgt_device_binary *Binary) {
1832   GenericDeviceTy &Device = getDevice(DeviceId);
1833 
1834   auto ImageOrErr = Device.loadBinary(*this, TgtImage);
1835   if (!ImageOrErr) {
1836     auto Err = ImageOrErr.takeError();
1837     REPORT("Failure to load binary image %p on device %d: %s\n", TgtImage,
1838            DeviceId, toString(std::move(Err)).data());
1839     return OFFLOAD_FAIL;
1840   }
1841 
1842   DeviceImageTy *Image = *ImageOrErr;
1843   assert(Image != nullptr && "Invalid Image");
1844 
1845   *Binary = __tgt_device_binary{reinterpret_cast<uint64_t>(Image)};
1846 
1847   return OFFLOAD_SUCCESS;
1848 }
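
// Illustrative call sequence from the libomptarget side (hypothetical, error
// handling elided): load an image, resolve a kernel by name, then launch it.
//
//   __tgt_device_binary Binary;
//   Plugin.load_binary(DeviceId, TgtImage, &Binary);
//   void *Kernel;
//   Plugin.get_function(Binary, "kernel_name", &Kernel);
//   Plugin.launch_kernel(DeviceId, Kernel, Args, Offsets, &KernelArgs,
//                        AsyncInfo);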
1849 
1850 void *GenericPluginTy::data_alloc(int32_t DeviceId, int64_t Size, void *HostPtr,
1851                                   int32_t Kind) {
1852   auto AllocOrErr =
1853       getDevice(DeviceId).dataAlloc(Size, HostPtr, (TargetAllocTy)Kind);
1854   if (!AllocOrErr) {
1855     auto Err = AllocOrErr.takeError();
1856     REPORT("Failure to allocate device memory: %s\n",
1857            toString(std::move(Err)).data());
1858     return nullptr;
1859   }
1860   assert(*AllocOrErr && "Null pointer upon successful allocation");
1861 
1862   return *AllocOrErr;
1863 }
1864 
1865 int32_t GenericPluginTy::data_delete(int32_t DeviceId, void *TgtPtr,
1866                                      int32_t Kind) {
1867   auto Err =
1868       getDevice(DeviceId).dataDelete(TgtPtr, static_cast<TargetAllocTy>(Kind));
1869   if (Err) {
1870     REPORT("Failure to deallocate device pointer %p: %s\n", TgtPtr,
1871            toString(std::move(Err)).data());
1872     return OFFLOAD_FAIL;
1873   }
1874 
1875   return OFFLOAD_SUCCESS;
1876 }
1877 
1878 int32_t GenericPluginTy::data_lock(int32_t DeviceId, void *Ptr, int64_t Size,
1879                                    void **LockedPtr) {
1880   auto LockedPtrOrErr = getDevice(DeviceId).dataLock(Ptr, Size);
1881   if (!LockedPtrOrErr) {
1882     auto Err = LockedPtrOrErr.takeError();
1883     REPORT("Failure to lock memory %p: %s\n", Ptr,
1884            toString(std::move(Err)).data());
1885     return OFFLOAD_FAIL;
1886   }
1887 
1888   if (!(*LockedPtrOrErr)) {
1889     REPORT("Failure to lock memory %p: obtained a null locked pointer\n", Ptr);
1890     return OFFLOAD_FAIL;
1891   }
1892   *LockedPtr = *LockedPtrOrErr;
1893 
1894   return OFFLOAD_SUCCESS;
1895 }
1896 
1897 int32_t GenericPluginTy::data_unlock(int32_t DeviceId, void *Ptr) {
1898   auto Err = getDevice(DeviceId).dataUnlock(Ptr);
1899   if (Err) {
1900     REPORT("Failure to unlock memory %p: %s\n", Ptr,
1901            toString(std::move(Err)).data());
1902     return OFFLOAD_FAIL;
1903   }
1904 
1905   return OFFLOAD_SUCCESS;
1906 }
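
// Illustrative pairing (hypothetical, error handling elided): data_lock()
// pins a host range for faster transfers and data_unlock() releases it.
//
//   void *LockedPtr;
//   Plugin.data_lock(DeviceId, HstPtr, Size, &LockedPtr);
//   // ... transfers using HstPtr ...
//   Plugin.data_unlock(DeviceId, HstPtr);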
1907 
1908 int32_t GenericPluginTy::data_notify_mapped(int32_t DeviceId, void *HstPtr,
1909                                             int64_t Size) {
1910   auto Err = getDevice(DeviceId).notifyDataMapped(HstPtr, Size);
1911   if (Err) {
1912     REPORT("Failure to notify data mapped %p: %s\n", HstPtr,
1913            toString(std::move(Err)).data());
1914     return OFFLOAD_FAIL;
1915   }
1916 
1917   return OFFLOAD_SUCCESS;
1918 }
1919 
1920 int32_t GenericPluginTy::data_notify_unmapped(int32_t DeviceId, void *HstPtr) {
1921   auto Err = getDevice(DeviceId).notifyDataUnmapped(HstPtr);
1922   if (Err) {
1923     REPORT("Failure to notify data unmapped %p: %s\n", HstPtr,
1924            toString(std::move(Err)).data());
1925     return OFFLOAD_FAIL;
1926   }
1927 
1928   return OFFLOAD_SUCCESS;
1929 }
1930 
1931 int32_t GenericPluginTy::data_submit(int32_t DeviceId, void *TgtPtr,
1932                                      void *HstPtr, int64_t Size) {
1933   return data_submit_async(DeviceId, TgtPtr, HstPtr, Size,
1934                            /*AsyncInfoPtr=*/nullptr);
1935 }
1936 
1937 int32_t GenericPluginTy::data_submit_async(int32_t DeviceId, void *TgtPtr,
1938                                            void *HstPtr, int64_t Size,
1939                                            __tgt_async_info *AsyncInfoPtr) {
1940   auto Err = getDevice(DeviceId).dataSubmit(TgtPtr, HstPtr, Size, AsyncInfoPtr);
1941   if (Err) {
1942     REPORT("Failure to copy data from host to device. Pointers: host "
1943            "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 ": %s\n",
1944            DPxPTR(HstPtr), DPxPTR(TgtPtr), Size,
1945            toString(std::move(Err)).data());
1946     return OFFLOAD_FAIL;
1947   }
1948 
1949   return OFFLOAD_SUCCESS;
1950 }
1951 
1952 int32_t GenericPluginTy::data_retrieve(int32_t DeviceId, void *HstPtr,
1953                                        void *TgtPtr, int64_t Size) {
1954   return data_retrieve_async(DeviceId, HstPtr, TgtPtr, Size,
1955                              /*AsyncInfoPtr=*/nullptr);
1956 }
1957 
1958 int32_t GenericPluginTy::data_retrieve_async(int32_t DeviceId, void *HstPtr,
1959                                              void *TgtPtr, int64_t Size,
1960                                              __tgt_async_info *AsyncInfoPtr) {
1961   auto Err =
1962       getDevice(DeviceId).dataRetrieve(HstPtr, TgtPtr, Size, AsyncInfoPtr);
1963   if (Err) {
1964     REPORT("Faliure to copy data from device to host. Pointers: host "
1965            "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 ": %s\n",
1966            DPxPTR(HstPtr), DPxPTR(TgtPtr), Size,
1967            toString(std::move(Err)).data());
1968     return OFFLOAD_FAIL;
1969   }
1970 
1971   return OFFLOAD_SUCCESS;
1972 }
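
// Illustrative asynchronous round trip (hypothetical, error handling
// elided): the *_async variants enqueue on the given async info, and
// synchronize() later completes the queue.
//
//   Plugin.data_submit_async(DeviceId, TgtPtr, HstPtr, Size, AsyncInfo);
//   // ... launch kernels operating on TgtPtr ...
//   Plugin.data_retrieve_async(DeviceId, HstPtr, TgtPtr, Size, AsyncInfo);
//   Plugin.synchronize(DeviceId, AsyncInfo);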
1973 
1974 int32_t GenericPluginTy::data_exchange(int32_t SrcDeviceId, void *SrcPtr,
1975                                        int32_t DstDeviceId, void *DstPtr,
1976                                        int64_t Size) {
1977   return data_exchange_async(SrcDeviceId, SrcPtr, DstDeviceId, DstPtr, Size,
1978                              /*AsyncInfoPtr=*/nullptr);
1979 }
1980 
1981 int32_t GenericPluginTy::data_exchange_async(int32_t SrcDeviceId, void *SrcPtr,
1982                                              int DstDeviceId, void *DstPtr,
1983                                              int64_t Size,
1984                                              __tgt_async_info *AsyncInfo) {
1985   GenericDeviceTy &SrcDevice = getDevice(SrcDeviceId);
1986   GenericDeviceTy &DstDevice = getDevice(DstDeviceId);
1987   auto Err = SrcDevice.dataExchange(SrcPtr, DstDevice, DstPtr, Size, AsyncInfo);
1988   if (Err) {
1989     REPORT("Failure to copy data from device (%d) to device (%d). Pointers: "
1990            "host = " DPxMOD ", device = " DPxMOD ", size = %" PRId64 ": %s\n",
1991            SrcDeviceId, DstDeviceId, DPxPTR(SrcPtr), DPxPTR(DstPtr), Size,
1992            toString(std::move(Err)).data());
1993     return OFFLOAD_FAIL;
1994   }
1995 
1996   return OFFLOAD_SUCCESS;
1997 }
1998 
1999 int32_t GenericPluginTy::launch_kernel(int32_t DeviceId, void *TgtEntryPtr,
2000                                        void **TgtArgs, ptrdiff_t *TgtOffsets,
2001                                        KernelArgsTy *KernelArgs,
2002                                        __tgt_async_info *AsyncInfoPtr) {
2003   auto Err = getDevice(DeviceId).launchKernel(TgtEntryPtr, TgtArgs, TgtOffsets,
2004                                               *KernelArgs, AsyncInfoPtr);
2005   if (Err) {
2006     REPORT("Failure to run target region " DPxMOD " in device %d: %s\n",
2007            DPxPTR(TgtEntryPtr), DeviceId, toString(std::move(Err)).data());
2008     return OFFLOAD_FAIL;
2009   }
2010 
2011   return OFFLOAD_SUCCESS;
2012 }
2013 
2014 int32_t GenericPluginTy::synchronize(int32_t DeviceId,
2015                                      __tgt_async_info *AsyncInfoPtr) {
2016   auto Err = getDevice(DeviceId).synchronize(AsyncInfoPtr);
2017   if (Err) {
2018     REPORT("Failure to synchronize stream %p: %s\n", AsyncInfoPtr->Queue,
2019            toString(std::move(Err)).data());
2020     return OFFLOAD_FAIL;
2021   }
2022 
2023   return OFFLOAD_SUCCESS;
2024 }
2025 
2026 int32_t GenericPluginTy::query_async(int32_t DeviceId,
2027                                      __tgt_async_info *AsyncInfoPtr) {
2028   auto Err = getDevice(DeviceId).queryAsync(AsyncInfoPtr);
2029   if (Err) {
2030     REPORT("Failure to query stream %p: %s\n", AsyncInfoPtr->Queue,
2031            toString(std::move(Err)).data());
2032     return OFFLOAD_FAIL;
2033   }
2034 
2035   return OFFLOAD_SUCCESS;
2036 }
2037 
2038 void GenericPluginTy::print_device_info(int32_t DeviceId) {
2039   if (auto Err = getDevice(DeviceId).printInfo())
2040     REPORT("Failure to print device %d info: %s\n", DeviceId,
2041            toString(std::move(Err)).data());
2042 }
2043 
2044 int32_t GenericPluginTy::create_event(int32_t DeviceId, void **EventPtr) {
2045   auto Err = getDevice(DeviceId).createEvent(EventPtr);
2046   if (Err) {
2047     REPORT("Failure to create event: %s\n", toString(std::move(Err)).data());
2048     return OFFLOAD_FAIL;
2049   }
2050 
2051   return OFFLOAD_SUCCESS;
2052 }
2053 
2054 int32_t GenericPluginTy::record_event(int32_t DeviceId, void *EventPtr,
2055                                       __tgt_async_info *AsyncInfoPtr) {
2056   auto Err = getDevice(DeviceId).recordEvent(EventPtr, AsyncInfoPtr);
2057   if (Err) {
2058     REPORT("Failure to record event %p: %s\n", EventPtr,
2059            toString(std::move(Err)).data());
2060     return OFFLOAD_FAIL;
2061   }
2062 
2063   return OFFLOAD_SUCCESS;
2064 }
2065 
2066 int32_t GenericPluginTy::wait_event(int32_t DeviceId, void *EventPtr,
2067                                     __tgt_async_info *AsyncInfoPtr) {
2068   auto Err = getDevice(DeviceId).waitEvent(EventPtr, AsyncInfoPtr);
2069   if (Err) {
2070     REPORT("Failure to wait event %p: %s\n", EventPtr,
2071            toString(std::move(Err)).data());
2072     return OFFLOAD_FAIL;
2073   }
2074 
2075   return OFFLOAD_SUCCESS;
2076 }
2077 
2078 int32_t GenericPluginTy::sync_event(int32_t DeviceId, void *EventPtr) {
2079   auto Err = getDevice(DeviceId).syncEvent(EventPtr);
2080   if (Err) {
2081     REPORT("Failure to synchronize event %p: %s\n", EventPtr,
2082            toString(std::move(Err)).data());
2083     return OFFLOAD_FAIL;
2084   }
2085 
2086   return OFFLOAD_SUCCESS;
2087 }
2088 
2089 int32_t GenericPluginTy::destroy_event(int32_t DeviceId, void *EventPtr) {
2090   auto Err = getDevice(DeviceId).destroyEvent(EventPtr);
2091   if (Err) {
2092     REPORT("Failure to destroy event %p: %s\n", EventPtr,
2093            toString(std::move(Err)).data());
2094     return OFFLOAD_FAIL;
2095   }
2096 
2097   return OFFLOAD_SUCCESS;
2098 }
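
// A minimal sketch of the event lifecycle exposed above (hypothetical
// caller, error handling elided):
//
//   void *Event;
//   Plugin.create_event(DeviceId, &Event);
//   Plugin.record_event(DeviceId, Event, AsyncInfo);    // capture progress
//   Plugin.wait_event(DeviceId, Event, OtherAsyncInfo); // order other queue
//   Plugin.sync_event(DeviceId, Event);                 // host-side wait
//   Plugin.destroy_event(DeviceId, Event);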
2099 
2100 void GenericPluginTy::set_info_flag(uint32_t NewInfoLevel) {
2101   std::atomic<uint32_t> &InfoLevel = getInfoLevelInternal();
2102   InfoLevel.store(NewInfoLevel);
2103 }
2104 
2105 int32_t GenericPluginTy::init_async_info(int32_t DeviceId,
2106                                          __tgt_async_info **AsyncInfoPtr) {
2107   assert(AsyncInfoPtr && "Invalid async info");
2108 
2109   auto Err = getDevice(DeviceId).initAsyncInfo(AsyncInfoPtr);
2110   if (Err) {
2111     REPORT("Failure to initialize async info at " DPxMOD " on device %d: %s\n",
2112            DPxPTR(*AsyncInfoPtr), DeviceId, toString(std::move(Err)).data());
2113     return OFFLOAD_FAIL;
2114   }
2115 
2116   return OFFLOAD_SUCCESS;
2117 }
2118 
2119 int32_t GenericPluginTy::init_device_info(int32_t DeviceId,
2120                                           __tgt_device_info *DeviceInfo,
2121                                           const char **ErrStr) {
2122   *ErrStr = "";
2123 
2124   auto Err = getDevice(DeviceId).initDeviceInfo(DeviceInfo);
2125   if (Err) {
2126     REPORT("Failure to initialize device info at " DPxMOD " on device %d: %s\n",
2127            DPxPTR(DeviceInfo), DeviceId, toString(std::move(Err)).data());
2128     return OFFLOAD_FAIL;
2129   }
2130 
2131   return OFFLOAD_SUCCESS;
2132 }
2133 
2134 int32_t GenericPluginTy::set_device_identifier(int32_t UserId,
2135                                                int32_t DeviceId) {
2136   UserDeviceIds[DeviceId] = UserId;
2137 
2138   return OFFLOAD_SUCCESS;
2139 }
2140 
2141 int32_t GenericPluginTy::use_auto_zero_copy(int32_t DeviceId) {
2142   return getDevice(DeviceId).useAutoZeroCopy();
2143 }
2144 
2145 int32_t GenericPluginTy::get_global(__tgt_device_binary Binary, uint64_t Size,
2146                                     const char *Name, void **DevicePtr) {
2147   assert(Binary.handle && "Invalid device binary handle");
2148   DeviceImageTy &Image = *reinterpret_cast<DeviceImageTy *>(Binary.handle);
2149 
2150   GenericDeviceTy &Device = Image.getDevice();
2151 
2152   GlobalTy DeviceGlobal(Name, Size);
2153   GenericGlobalHandlerTy &GHandler = getGlobalHandler();
2154   if (auto Err =
2155           GHandler.getGlobalMetadataFromDevice(Device, Image, DeviceGlobal)) {
2156     REPORT("Failure to look up global address: %s\n",
2157            toString(std::move(Err)).data());
2158     return OFFLOAD_FAIL;
2159   }
2160 
2161   *DevicePtr = DeviceGlobal.getPtr();
2162   assert(*DevicePtr && "Invalid device global address");
2163 
2164   // Save the loaded globals if we are recording.
2165   RecordReplayTy &RecordReplay = Device.Plugin.getRecordReplay();
2166   if (RecordReplay.isRecording())
2167     RecordReplay.addEntry(Name, Size, *DevicePtr);
2168 
2169   return OFFLOAD_SUCCESS;
2170 }
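
// Illustrative lookup (hypothetical, error handling elided): resolve the
// device address of a global declared in a loaded image.
//
//   void *DevPtr;
//   Plugin.get_global(Binary, /*Size=*/sizeof(int), "my_global", &DevPtr);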
2171 
2172 int32_t GenericPluginTy::get_function(__tgt_device_binary Binary,
2173                                       const char *Name, void **KernelPtr) {
2174   assert(Binary.handle && "Invalid device binary handle");
2175   DeviceImageTy &Image = *reinterpret_cast<DeviceImageTy *>(Binary.handle);
2176 
2177   GenericDeviceTy &Device = Image.getDevice();
2178 
2179   auto KernelOrErr = Device.constructKernel(Name);
2180   if (Error Err = KernelOrErr.takeError()) {
2181     REPORT("Failure to look up kernel: %s\n", toString(std::move(Err)).data());
2182     return OFFLOAD_FAIL;
2183   }
2184 
2185   GenericKernelTy &Kernel = *KernelOrErr;
2186   if (auto Err = Kernel.init(Device, Image)) {
2187     REPORT("Failure to init kernel: %s\n", toString(std::move(Err)).data());
2188     return OFFLOAD_FAIL;
2189   }
2190 
2191   // Note that this is not the kernel's device address.
2192   *KernelPtr = &Kernel;
2193   return OFFLOAD_SUCCESS;
2194 }
2195