xref: /llvm-project/offload/tools/kernelreplay/llvm-omp-kernel-replay.cpp (revision 13dcc95dcd4999ff99f2de89d881f1aed5b21709)
1 //===- llvm-omp-kernel-replay.cpp - Replay OpenMP offload kernel ----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is a command line utility to replay the execution of recorded OpenMP
10 // offload kernels.
11 //
12 //===----------------------------------------------------------------------===//
13 
#include "omptarget.h"

#include "llvm/ADT/Twine.h"
#include "llvm/Frontend/Offloading/Utility.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/JSON.h"
#include "llvm/Support/MemoryBuffer.h"

#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <memory>
#include <string>
23 
24 using namespace llvm;
25 
// Category grouping the options of this tool; main() hides every option that
// does not belong to it.
cl::OptionCategory ReplayOptions("llvm-omp-kernel-replay Options");

// InputFilename - The filename to read the json description of the kernel.
// Required positional argument.
static cl::opt<std::string> InputFilename(cl::Positional,
                                          cl::desc("<input kernel json file>"),
                                          cl::Required);

// VerifyOpt - When set, main() compares "<kernel name>.original.output" with
// "<kernel name>.replay.output" after the replay and prints the result.
static cl::opt<bool> VerifyOpt(
    "verify",
    cl::desc(
        "Verify device memory post execution against the original output."),
    cl::init(false), cl::cat(ReplayOptions));

// SaveOutputOpt - Request that the device memory produced by the replayed
// kernel is saved. Off by default.
static cl::opt<bool> SaveOutputOpt(
    "save-output",
    cl::desc("Save the device memory output of the replayed kernel execution."),
    cl::init(false), cl::cat(ReplayOptions));

// NumTeamsOpt - Number of teams override. The default of 0 means "not set":
// main() then uses the "NumTeamsClause" value from the kernel json file.
static cl::opt<unsigned> NumTeamsOpt("num-teams",
                                     cl::desc("Set the number of teams."),
                                     cl::init(0), cl::cat(ReplayOptions));

// NumThreadsOpt - Number of threads override. The default of 0 means "not
// set": main() then uses the "ThreadLimitClause" value from the kernel json
// file.
static cl::opt<unsigned> NumThreadsOpt("num-threads",
                                       cl::desc("Set the number of threads."),
                                       cl::init(0), cl::cat(ReplayOptions));

// DeviceIdOpt - Device id override. The default of -1 means "not set": main()
// then uses the "DeviceId" value from the kernel json file.
static cl::opt<int32_t> DeviceIdOpt("device-id", cl::desc("Set the device id."),
                                    cl::init(-1), cl::cat(ReplayOptions));
54 
55 int main(int argc, char **argv) {
56   cl::HideUnrelatedOptions(ReplayOptions);
57   cl::ParseCommandLineOptions(argc, argv, "llvm-omp-kernel-replay\n");
58 
59   ErrorOr<std::unique_ptr<MemoryBuffer>> KernelInfoMB =
60       MemoryBuffer::getFile(InputFilename, /*isText=*/true,
61                             /*RequiresNullTerminator=*/true);
62   if (!KernelInfoMB)
63     report_fatal_error("Error reading the kernel info json file");
64   Expected<json::Value> JsonKernelInfo =
65       json::parse(KernelInfoMB.get()->getBuffer());
66   if (auto Err = JsonKernelInfo.takeError())
67     report_fatal_error("Cannot parse the kernel info json file");
68 
69   auto NumTeamsJson =
70       JsonKernelInfo->getAsObject()->getInteger("NumTeamsClause");
71   unsigned NumTeams = (NumTeamsOpt > 0 ? NumTeamsOpt : NumTeamsJson.value());
72   auto NumThreadsJson =
73       JsonKernelInfo->getAsObject()->getInteger("ThreadLimitClause");
74   unsigned NumThreads =
75       (NumThreadsOpt > 0 ? NumThreadsOpt : NumThreadsJson.value());
76   // TODO: Print a warning if number of teams/threads is explicitly set in the
77   // kernel info but overriden through command line options.
78   auto LoopTripCount =
79       JsonKernelInfo->getAsObject()->getInteger("LoopTripCount");
80   auto KernelFunc = JsonKernelInfo->getAsObject()->getString("Name");
81 
82   SmallVector<void *> TgtArgs;
83   SmallVector<ptrdiff_t> TgtArgOffsets;
84   auto NumArgs = JsonKernelInfo->getAsObject()->getInteger("NumArgs");
85   auto *TgtArgsArray = JsonKernelInfo->getAsObject()->getArray("ArgPtrs");
86   for (auto It : *TgtArgsArray)
87     TgtArgs.push_back(reinterpret_cast<void *>(It.getAsInteger().value()));
88   auto *TgtArgOffsetsArray =
89       JsonKernelInfo->getAsObject()->getArray("ArgOffsets");
90   for (auto It : *TgtArgOffsetsArray)
91     TgtArgOffsets.push_back(static_cast<ptrdiff_t>(It.getAsInteger().value()));
92 
93   void *BAllocStart = reinterpret_cast<void *>(
94       JsonKernelInfo->getAsObject()->getInteger("BumpAllocVAStart").value());
95 
96   llvm::offloading::EntryTy KernelEntry = {~0U,     0, 0, 0,      nullptr,
97                                            nullptr, 0, 0, nullptr};
98   std::string KernelEntryName = KernelFunc.value().str();
99   KernelEntry.SymbolName = const_cast<char *>(KernelEntryName.c_str());
100   // Anything non-zero works to uniquely identify the kernel.
101   KernelEntry.Address = (void *)0x1;
102 
103   ErrorOr<std::unique_ptr<MemoryBuffer>> ImageMB =
104       MemoryBuffer::getFile(KernelEntryName + ".image", /*isText=*/false,
105                             /*RequiresNullTerminator=*/false);
106   if (!ImageMB)
107     report_fatal_error("Error reading the kernel image.");
108 
109   __tgt_device_image DeviceImage;
110   DeviceImage.ImageStart = const_cast<char *>(ImageMB.get()->getBufferStart());
111   DeviceImage.ImageEnd = const_cast<char *>(ImageMB.get()->getBufferEnd());
112   DeviceImage.EntriesBegin = &KernelEntry;
113   DeviceImage.EntriesEnd = &KernelEntry + 1;
114 
115   __tgt_bin_desc Desc;
116   Desc.NumDeviceImages = 1;
117   Desc.HostEntriesBegin = &KernelEntry;
118   Desc.HostEntriesEnd = &KernelEntry + 1;
119   Desc.DeviceImages = &DeviceImage;
120 
121   auto DeviceMemorySizeJson =
122       JsonKernelInfo->getAsObject()->getInteger("DeviceMemorySize");
123   // Set device memory size to the ceiling of GB granularity.
124   uint64_t DeviceMemorySize = std::ceil(DeviceMemorySizeJson.value());
125 
126   auto DeviceIdJson = JsonKernelInfo->getAsObject()->getInteger("DeviceId");
127   // TODO: Print warning if the user overrides the device id in the json file.
128   int32_t DeviceId = (DeviceIdOpt > -1 ? DeviceIdOpt : DeviceIdJson.value());
129 
130   // TODO: do we need requires?
131   //__tgt_register_requires(/*Flags=*/1);
132 
133   __tgt_register_lib(&Desc);
134 
135   uint64_t ReqPtrArgOffset = 0;
136   int Rc = __tgt_activate_record_replay(DeviceId, DeviceMemorySize, BAllocStart,
137                                         false, VerifyOpt, ReqPtrArgOffset);
138 
139   if (Rc != OMP_TGT_SUCCESS) {
140     report_fatal_error("Cannot activate record replay\n");
141   }
142 
143   ErrorOr<std::unique_ptr<MemoryBuffer>> DeviceMemoryMB =
144       MemoryBuffer::getFile(KernelEntryName + ".memory", /*isText=*/false,
145                             /*RequiresNullTerminator=*/false);
146 
147   if (!DeviceMemoryMB)
148     report_fatal_error("Error reading the kernel input device memory.");
149 
150   // On AMD for currently unknown reasons we cannot copy memory mapped data to
151   // device. This is a work-around.
152   uint8_t *recored_data = new uint8_t[DeviceMemoryMB.get()->getBufferSize()];
153   std::memcpy(recored_data,
154               const_cast<char *>(DeviceMemoryMB.get()->getBuffer().data()),
155               DeviceMemoryMB.get()->getBufferSize());
156 
157   // If necessary, adjust pointer arguments.
158   if (ReqPtrArgOffset) {
159     for (auto *&Arg : TgtArgs) {
160       auto ArgInt = uintptr_t(Arg);
161       // Try to find pointer arguments.
162       if (ArgInt < uintptr_t(BAllocStart) ||
163           ArgInt >= uintptr_t(BAllocStart) + DeviceMemorySize)
164         continue;
165       Arg = reinterpret_cast<void *>(ArgInt - ReqPtrArgOffset);
166     }
167   }
168 
169   __tgt_target_kernel_replay(
170       /*Loc=*/nullptr, DeviceId, KernelEntry.Address, (char *)recored_data,
171       DeviceMemoryMB.get()->getBufferSize(), TgtArgs.data(),
172       TgtArgOffsets.data(), NumArgs.value(), NumTeams, NumThreads,
173       LoopTripCount.value());
174 
175   if (VerifyOpt) {
176     ErrorOr<std::unique_ptr<MemoryBuffer>> OriginalOutputMB =
177         MemoryBuffer::getFile(KernelEntryName + ".original.output",
178                               /*isText=*/false,
179                               /*RequiresNullTerminator=*/false);
180     if (!OriginalOutputMB)
181       report_fatal_error("Error reading the kernel original output file, make "
182                          "sure LIBOMPTARGET_SAVE_OUTPUT is set when recording");
183     ErrorOr<std::unique_ptr<MemoryBuffer>> ReplayOutputMB =
184         MemoryBuffer::getFile(KernelEntryName + ".replay.output",
185                               /*isText=*/false,
186                               /*RequiresNullTerminator=*/false);
187     if (!ReplayOutputMB)
188       report_fatal_error("Error reading the kernel replay output file");
189 
190     StringRef OriginalOutput = OriginalOutputMB.get()->getBuffer();
191     StringRef ReplayOutput = ReplayOutputMB.get()->getBuffer();
192     if (OriginalOutput == ReplayOutput)
193       outs() << "[llvm-omp-kernel-replay] Replay device memory verified!\n";
194     else
195       outs() << "[llvm-omp-kernel-replay] Replay device memory failed to "
196                 "verify!\n";
197   }
198 
199   delete[] recored_data;
200 
201   return 0;
202 }
203