//===- PluginInterface.cpp - Target independent plugin device interface ---===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
//===----------------------------------------------------------------------===//

#include "PluginInterface.h"

#include "Shared/APITypes.h"
#include "Shared/Debug.h"
#include "Shared/Environment.h"

#include "ErrorReporting.h"
#include "GlobalHandler.h"
#include "JIT.h"
#include "Shared/Utils.h"
#include "Utils/ELF.h"
#include "omptarget.h"

#ifdef OMPT_SUPPORT
#include "OpenMP/OMPT/Callback.h"
#include "omp-tools.h"
#endif

#include "llvm/Bitcode/BitcodeReader.h"
#include "llvm/Frontend/OpenMP/OMPConstants.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/JSON.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/Signals.h"
#include "llvm/Support/raw_ostream.h"

#include <cstdint>
#include <limits>

using namespace llvm;
using namespace omp;
using namespace target;
using namespace plugin;

// TODO: Fix any thread safety issues for multi-threaded kernel recording.
namespace llvm::omp::target::plugin {
struct RecordReplayTy {

  // Describes the state of the record replay mechanism.
  enum RRStatusTy { RRDeactivated = 0, RRRecording, RRReplaying };

private:
  // Memory pointers for recording, replaying memory.
  void *MemoryStart = nullptr;
  void *MemoryPtr = nullptr;
  size_t MemorySize = 0;
  size_t TotalSize = 0;
  GenericDeviceTy *Device = nullptr;
  std::mutex AllocationLock;

  RRStatusTy Status = RRDeactivated;
  bool ReplaySaveOutput = false;
  bool UsedVAMap = false;
  uintptr_t MemoryOffset = 0;

  // A list of all globals mapped to the device.
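  // Each entry captures the symbol name, its size in bytes, and its device
  // address so the global's payload can be read back when dumping state.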
  struct GlobalEntry {
    const char *Name;
    uint64_t Size;
    void *Addr;
  };
  llvm::SmallVector<GlobalEntry> GlobalEntries{};

  void *suggestAddress(uint64_t MaxMemoryAllocation) {
    // Get a valid pointer address for this system
    void *Addr =
        Device->allocate(1024, /*HstPtr=*/nullptr, TARGET_ALLOC_DEFAULT);
    Device->free(Addr);
    // Align Address to MaxMemoryAllocation
    Addr = (void *)utils::alignPtr((Addr), MaxMemoryAllocation);
    return Addr;
  }

  Error preAllocateVAMemory(uint64_t MaxMemoryAllocation, void *VAddr) {
    size_t ASize = MaxMemoryAllocation;

    if (!VAddr && isRecording())
      VAddr = suggestAddress(MaxMemoryAllocation);

    DP("Request %" PRIu64 " bytes allocated at %p\n", MaxMemoryAllocation,
       VAddr);

    if (auto Err = Device->memoryVAMap(&MemoryStart, VAddr, &ASize))
      return Err;

    if (isReplaying() && VAddr != MemoryStart) {
      return Plugin::error("Record-Replay cannot assign the "
                           "requested recorded address (%p, %p)",
                           VAddr, MemoryStart);
    }

    INFO(OMP_INFOTYPE_PLUGIN_KERNEL, Device->getDeviceId(),
         "Allocated %" PRIu64 " bytes at %p for replay.\n", ASize, MemoryStart);

    MemoryPtr = MemoryStart;
    MemorySize = 0;
    TotalSize = ASize;
    UsedVAMap = true;
    return Plugin::success();
  }

  Error preAllocateHeuristic(uint64_t MaxMemoryAllocation,
                             uint64_t RequiredMemoryAllocation, void *VAddr) {
    const size_t MAX_MEMORY_ALLOCATION = MaxMemoryAllocation;
    constexpr size_t STEP = 1024 * 1024 * 1024ULL;
    MemoryStart = nullptr;
    for (TotalSize = MAX_MEMORY_ALLOCATION; TotalSize > 0; TotalSize -= STEP) {
      MemoryStart = Device->allocate(TotalSize, /*HstPtr=*/nullptr,
                                     TARGET_ALLOC_DEFAULT);
      if (MemoryStart)
        break;
    }
    if (!MemoryStart)
      return Plugin::error("Allocating record/replay memory");

    if (VAddr && VAddr != MemoryStart)
      MemoryOffset = uintptr_t(VAddr) - uintptr_t(MemoryStart);

    MemoryPtr = MemoryStart;
    MemorySize = 0;

    // Check if we need adjustment.
    if (MemoryOffset > 0 &&
        TotalSize >= RequiredMemoryAllocation + MemoryOffset) {
      // If we are off but "before" the required address and with enough space,
      // we just "allocate" the offset to match the required address.
      MemoryPtr = (char *)MemoryPtr + MemoryOffset;
      MemorySize += MemoryOffset;
      MemoryOffset = 0;
      assert(MemoryPtr == VAddr && "Expected offset adjustment to work");
    } else if (MemoryOffset) {
      // If we are off and in a situation where we cannot just "waste" memory
      // to force a match, we hope adjusting the arguments is sufficient.
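      // Illustrative example (made-up addresses): if recording placed the
      // memory at VAddr 0x200000000 but replay only obtained MemoryStart
      // 0x100000000, MemoryOffset is 0x100000000 and the replay tooling has
      // to shift every pointer argument by that amount; the offset is
      // reported back through ReqPtrArgOffset in init().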
      REPORT(
          "WARNING Failed to allocate replay memory at required location %p, "
          "got %p, trying to offset argument pointers by %" PRIi64 "\n",
          VAddr, MemoryStart, MemoryOffset);
    }

    INFO(OMP_INFOTYPE_PLUGIN_KERNEL, Device->getDeviceId(),
         "Allocated %" PRIu64 " bytes at %p for replay.\n", TotalSize,
         MemoryStart);

    return Plugin::success();
  }

  Error preallocateDeviceMemory(uint64_t DeviceMemorySize, void *ReqVAddr) {
    if (Device->supportVAManagement()) {
      auto Err = preAllocateVAMemory(DeviceMemorySize, ReqVAddr);
      if (!Err)
        return Plugin::success();
      REPORT("WARNING VA mapping failed, falling back to the heuristic "
             "allocator (Error: %s)\n",
             toString(std::move(Err)).data());
    }

    uint64_t DevMemSize;
    if (Device->getDeviceMemorySize(DevMemSize))
      return Plugin::error("Cannot determine device memory size");

    return preAllocateHeuristic(DevMemSize, DeviceMemorySize, ReqVAddr);
  }

  void dumpDeviceMemory(StringRef Filename) {
    ErrorOr<std::unique_ptr<WritableMemoryBuffer>> DeviceMemoryMB =
        WritableMemoryBuffer::getNewUninitMemBuffer(MemorySize);
    if (!DeviceMemoryMB)
      report_fatal_error("Error creating MemoryBuffer for device memory");

    if (auto Err = Device->dataRetrieve(DeviceMemoryMB.get()->getBufferStart(),
                                        MemoryStart, MemorySize, nullptr))
      report_fatal_error("Error retrieving data for target pointer");

    StringRef DeviceMemory(DeviceMemoryMB.get()->getBufferStart(), MemorySize);
    std::error_code EC;
    raw_fd_ostream OS(Filename, EC);
    if (EC)
      report_fatal_error("Error dumping memory to file " + Filename + " :" +
                         EC.message());
    OS << DeviceMemory;
    OS.close();
  }

public:
  bool isRecording() const { return Status == RRStatusTy::RRRecording; }
  bool isReplaying() const { return Status == RRStatusTy::RRReplaying; }
  bool isRecordingOrReplaying() const {
    return (Status != RRStatusTy::RRDeactivated);
  }
  void setStatus(RRStatusTy Status) { this->Status = Status; }
  bool isSaveOutputEnabled() const { return ReplaySaveOutput; }
  void addEntry(const char *Name, uint64_t Size, void *Addr) {
    GlobalEntries.emplace_back(GlobalEntry{Name, Size, Addr});
  }

  void saveImage(const char *Name, const DeviceImageTy &Image) {
    SmallString<128> ImageName = {Name, ".image"};
    std::error_code EC;
    raw_fd_ostream OS(ImageName, EC);
    if (EC)
      report_fatal_error("Error saving image : " + StringRef(EC.message()));
    if (const auto *TgtImageBitcode = Image.getTgtImageBitcode()) {
      size_t Size = utils::getPtrDiff(TgtImageBitcode->ImageEnd,
                                      TgtImageBitcode->ImageStart);
      MemoryBufferRef MBR = MemoryBufferRef(
          StringRef((const char *)TgtImageBitcode->ImageStart, Size), "");
      OS << MBR.getBuffer();
    } else {
      OS << Image.getMemoryBuffer().getBuffer();
    }
    OS.close();
  }

  void dumpGlobals(StringRef Filename, DeviceImageTy &Image) {
    int32_t Size = 0;

    for (auto &OffloadEntry : GlobalEntries) {
      if (!OffloadEntry.Size)
        continue;
      // Get the total size of the string and entry including the null byte.
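      // Serialized record layout (matches the writes below): the
      // NUL-terminated name, a uint32_t payload size, then the raw payload
      // bytes retrieved from the device.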
      Size += std::strlen(OffloadEntry.Name) + 1 + sizeof(uint32_t) +
              OffloadEntry.Size;
    }

    ErrorOr<std::unique_ptr<WritableMemoryBuffer>> GlobalsMB =
        WritableMemoryBuffer::getNewUninitMemBuffer(Size);
    if (!GlobalsMB)
      report_fatal_error("Error creating MemoryBuffer for globals memory");

    void *BufferPtr = GlobalsMB.get()->getBufferStart();
    for (auto &OffloadEntry : GlobalEntries) {
      if (!OffloadEntry.Size)
        continue;

      int32_t NameLength = std::strlen(OffloadEntry.Name) + 1;
      memcpy(BufferPtr, OffloadEntry.Name, NameLength);
      BufferPtr = utils::advancePtr(BufferPtr, NameLength);

      *((uint32_t *)(BufferPtr)) = OffloadEntry.Size;
      BufferPtr = utils::advancePtr(BufferPtr, sizeof(uint32_t));

      if (auto Err = Device->dataRetrieve(BufferPtr, OffloadEntry.Addr,
                                          OffloadEntry.Size, nullptr))
        report_fatal_error("Error retrieving data for global");
      BufferPtr = utils::advancePtr(BufferPtr, OffloadEntry.Size);
    }
    assert(BufferPtr == GlobalsMB->get()->getBufferEnd() &&
           "Buffer over/under-filled.");
    assert(Size == utils::getPtrDiff(BufferPtr,
                                     GlobalsMB->get()->getBufferStart()) &&
           "Buffer size mismatch");

    StringRef GlobalsMemory(GlobalsMB.get()->getBufferStart(), Size);
    std::error_code EC;
    raw_fd_ostream OS(Filename, EC);
    if (EC)
      report_fatal_error("Error dumping globals to file " + Filename + " :" +
                         EC.message());
    OS << GlobalsMemory;
    OS.close();
  }

  void saveKernelDescr(const char *Name, KernelLaunchParamsTy LaunchParams,
                       int32_t NumArgs, uint64_t NumTeamsClause,
                       uint32_t ThreadLimitClause, uint64_t LoopTripCount) {
    json::Object JsonKernelInfo;
    JsonKernelInfo["Name"] = Name;
    JsonKernelInfo["NumArgs"] = NumArgs;
    JsonKernelInfo["NumTeamsClause"] = NumTeamsClause;
    JsonKernelInfo["ThreadLimitClause"] = ThreadLimitClause;
    JsonKernelInfo["LoopTripCount"] = LoopTripCount;
    JsonKernelInfo["DeviceMemorySize"] = MemorySize;
    JsonKernelInfo["DeviceId"] = Device->getDeviceId();
    JsonKernelInfo["BumpAllocVAStart"] = (intptr_t)MemoryStart;

    json::Array JsonArgPtrs;
    for (int I = 0; I < NumArgs; ++I)
      JsonArgPtrs.push_back((intptr_t)LaunchParams.Ptrs[I]);
    JsonKernelInfo["ArgPtrs"] = json::Value(std::move(JsonArgPtrs));

    json::Array JsonArgOffsets;
    for (int I = 0; I < NumArgs; ++I)
      JsonArgOffsets.push_back(0);
    JsonKernelInfo["ArgOffsets"] = json::Value(std::move(JsonArgOffsets));

    SmallString<128> JsonFilename = {Name, ".json"};
    std::error_code EC;
    raw_fd_ostream JsonOS(JsonFilename.str(), EC);
    if (EC)
      report_fatal_error("Error saving kernel json file : " +
                         StringRef(EC.message()));
    JsonOS << json::Value(std::move(JsonKernelInfo));
    JsonOS.close();
  }

  void saveKernelInput(const char *Name, DeviceImageTy &Image) {
    SmallString<128> GlobalsFilename = {Name, ".globals"};
    dumpGlobals(GlobalsFilename, Image);

    SmallString<128> MemoryFilename = {Name, ".memory"};
    dumpDeviceMemory(MemoryFilename);
  }

  void saveKernelOutputInfo(const char *Name) {
    SmallString<128> OutputFilename = {
        Name, (isRecording() ? ".original.output" : ".replay.output")};
    dumpDeviceMemory(OutputFilename);
  }

  void *alloc(uint64_t Size) {
    assert(MemoryStart && "Expected memory has been pre-allocated");
    void *Alloc = nullptr;
    constexpr int Alignment = 16;
    // Assumes alignment is a power of 2.
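    // For example, with Alignment = 16: Size = 24 rounds to
    // (24 + 15) & ~15 = 32, while Size = 32 stays at 32.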
    int64_t AlignedSize = (Size + (Alignment - 1)) & (~(Alignment - 1));
    std::lock_guard<std::mutex> LG(AllocationLock);
    Alloc = MemoryPtr;
    MemoryPtr = (char *)MemoryPtr + AlignedSize;
    MemorySize += AlignedSize;
    DP("Memory Allocator returns " DPxMOD "\n", DPxPTR(Alloc));
    return Alloc;
  }

  Error init(GenericDeviceTy *Device, uint64_t MemSize, void *VAddr,
             RRStatusTy Status, bool SaveOutput, uint64_t &ReqPtrArgOffset) {
    this->Device = Device;
    this->Status = Status;
    this->ReplaySaveOutput = SaveOutput;

    if (auto Err = preallocateDeviceMemory(MemSize, VAddr))
      return Err;

    INFO(OMP_INFOTYPE_PLUGIN_KERNEL, Device->getDeviceId(),
         "Record-Replay initialized: starting address %p, memory size "
         "%" PRIu64 ", status %s\n",
         MemoryStart, TotalSize,
         Status == RRStatusTy::RRRecording ? "Recording" : "Replaying");

    // Tell the user to offset pointer arguments as the memory allocation does
    // not match.
    ReqPtrArgOffset = MemoryOffset;
    return Plugin::success();
  }

  void deinit() {
    if (UsedVAMap) {
      if (auto Err = Device->memoryVAUnMap(MemoryStart, TotalSize))
        report_fatal_error("Error on releasing virtual memory space");
    } else {
      Device->free(MemoryStart);
    }
  }
};
} // namespace llvm::omp::target::plugin

// Extract the mapping of host function pointers to device function pointers
// from the entry table. Functions marked as 'indirect' in OpenMP will have
// offloading entries generated for them which map the host's function pointer
// to a global containing the corresponding function pointer on the device.
static Expected<std::pair<void *, uint64_t>>
setupIndirectCallTable(GenericPluginTy &Plugin, GenericDeviceTy &Device,
                       DeviceImageTy &Image) {
  GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler();

  llvm::ArrayRef<llvm::offloading::EntryTy> Entries(
      Image.getTgtImage()->EntriesBegin, Image.getTgtImage()->EntriesEnd);
  llvm::SmallVector<std::pair<void *, void *>> IndirectCallTable;
  for (const auto &Entry : Entries) {
    if (Entry.Size == 0 || !(Entry.Flags & OMP_DECLARE_TARGET_INDIRECT))
      continue;

    assert(Entry.Size == sizeof(void *) && "Global not a function pointer?");
    auto &[HstPtr, DevPtr] = IndirectCallTable.emplace_back();

    GlobalTy DeviceGlobal(Entry.SymbolName, Entry.Size);
    if (auto Err =
            Handler.getGlobalMetadataFromDevice(Device, Image, DeviceGlobal))
      return std::move(Err);

    HstPtr = Entry.Address;
    if (auto Err = Device.dataRetrieve(&DevPtr, DeviceGlobal.getPtr(),
                                       Entry.Size, nullptr))
      return std::move(Err);
  }

  // If we do not have any indirect globals we exit early.
  if (IndirectCallTable.empty())
    return std::pair{nullptr, 0};

  // Sort the array to allow for more efficient lookup of device pointers.
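  // Sorting by the host pointer lets a host function pointer be resolved to
  // its device counterpart with a binary search over the table (the lookup
  // itself lives in the device runtime, not in this file).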
  llvm::sort(IndirectCallTable, [](const auto &LHS, const auto &RHS) {
    return LHS.first < RHS.first;
  });

  uint64_t TableSize =
      IndirectCallTable.size() * sizeof(std::pair<void *, void *>);
  void *DevicePtr = Device.allocate(TableSize, nullptr, TARGET_ALLOC_DEVICE);
  if (auto Err = Device.dataSubmit(DevicePtr, IndirectCallTable.data(),
                                   TableSize, nullptr))
    return std::move(Err);
  return std::pair<void *, uint64_t>(DevicePtr, IndirectCallTable.size());
}

AsyncInfoWrapperTy::AsyncInfoWrapperTy(GenericDeviceTy &Device,
                                       __tgt_async_info *AsyncInfoPtr)
    : Device(Device),
      AsyncInfoPtr(AsyncInfoPtr ? AsyncInfoPtr : &LocalAsyncInfo) {}

void AsyncInfoWrapperTy::finalize(Error &Err) {
  assert(AsyncInfoPtr && "AsyncInfoWrapperTy already finalized");

  // If we used a local async info object we want synchronous behavior. In that
  // case, and assuming the current status code is correct, we will synchronize
  // explicitly when the object is deleted. Update the error with the result of
  // the synchronize operation.
  if (AsyncInfoPtr == &LocalAsyncInfo && LocalAsyncInfo.Queue && !Err)
    Err = Device.synchronize(&LocalAsyncInfo);

  // Invalidate the wrapper object.
  AsyncInfoPtr = nullptr;
}

Error GenericKernelTy::init(GenericDeviceTy &GenericDevice,
                            DeviceImageTy &Image) {
  ImagePtr = &Image;

  // Retrieve the kernel environment object for the kernel.
  GlobalTy KernelEnv(std::string(Name) + "_kernel_environment",
                     sizeof(KernelEnvironment), &KernelEnvironment);
  GenericGlobalHandlerTy &GHandler = GenericDevice.Plugin.getGlobalHandler();
  if (auto Err =
          GHandler.readGlobalFromImage(GenericDevice, *ImagePtr, KernelEnv)) {
    [[maybe_unused]] std::string ErrStr = toString(std::move(Err));
    DP("Failed to read kernel environment for '%s': %s\n"
       "Using default SPMD (2) execution mode\n",
       Name, ErrStr.data());
    assert(KernelEnvironment.Configuration.ReductionDataSize == 0 &&
           "Default initialization failed.");
    IsBareKernel = true;
  }

  // Max = Config.Max > 0 ? min(Config.Max, Device.Max) : Device.Max;
  MaxNumThreads = KernelEnvironment.Configuration.MaxThreads > 0
                      ? std::min(KernelEnvironment.Configuration.MaxThreads,
                                 int32_t(GenericDevice.getThreadLimit()))
                      : GenericDevice.getThreadLimit();

  // Pref = Config.Pref > 0 ? max(Config.Pref, Device.Pref) : Device.Pref;
  PreferredNumThreads =
      KernelEnvironment.Configuration.MinThreads > 0
          ? std::max(KernelEnvironment.Configuration.MinThreads,
                     int32_t(GenericDevice.getDefaultNumThreads()))
          : GenericDevice.getDefaultNumThreads();

  return initImpl(GenericDevice, Image);
}

Expected<KernelLaunchEnvironmentTy *>
GenericKernelTy::getKernelLaunchEnvironment(
    GenericDeviceTy &GenericDevice, uint32_t Version,
    AsyncInfoWrapperTy &AsyncInfoWrapper) const {
  // Ctor/Dtor have no arguments, replaying uses the original kernel launch
  // environment. Older versions of the compiler do not generate a kernel
  // launch environment.
  if (GenericDevice.Plugin.getRecordReplay().isReplaying() ||
      Version < OMP_KERNEL_ARG_MIN_VERSION_WITH_DYN_PTR)
    return nullptr;

  if (!KernelEnvironment.Configuration.ReductionDataSize ||
      !KernelEnvironment.Configuration.ReductionBufferLength)
    return reinterpret_cast<KernelLaunchEnvironmentTy *>(~0);

  // TODO: Check if the kernel needs a launch environment.
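  // Sketch of the flow implemented below: stage the environment in the
  // __tgt_async_info so its address stays stable, allocate device memory for
  // the environment and the reduction buffer, then submit the copy
  // asynchronously; both allocations are freed after synchronization.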
  auto AllocOrErr = GenericDevice.dataAlloc(sizeof(KernelLaunchEnvironmentTy),
                                            /*HostPtr=*/nullptr,
                                            TargetAllocTy::TARGET_ALLOC_DEVICE);
  if (!AllocOrErr)
    return AllocOrErr.takeError();

  // Remember to free the memory later.
  AsyncInfoWrapper.freeAllocationAfterSynchronization(*AllocOrErr);

  // Use the KLE in the __tgt_async_info to ensure a stable address for the
  // async data transfer.
  auto &LocalKLE = (*AsyncInfoWrapper).KernelLaunchEnvironment;
  LocalKLE = KernelLaunchEnvironment;
  {
    auto AllocOrErr = GenericDevice.dataAlloc(
        KernelEnvironment.Configuration.ReductionDataSize *
            KernelEnvironment.Configuration.ReductionBufferLength,
        /*HostPtr=*/nullptr, TargetAllocTy::TARGET_ALLOC_DEVICE);
    if (!AllocOrErr)
      return AllocOrErr.takeError();
    LocalKLE.ReductionBuffer = *AllocOrErr;
    // Remember to free the memory later.
    AsyncInfoWrapper.freeAllocationAfterSynchronization(*AllocOrErr);
  }

  INFO(OMP_INFOTYPE_DATA_TRANSFER, GenericDevice.getDeviceId(),
       "Copying data from host to device, HstPtr=" DPxMOD ", TgtPtr=" DPxMOD
       ", Size=%" PRId64 ", Name=KernelLaunchEnv\n",
       DPxPTR(&LocalKLE), DPxPTR(*AllocOrErr),
       sizeof(KernelLaunchEnvironmentTy));

  if (auto Err = GenericDevice.dataSubmit(*AllocOrErr, &LocalKLE,
                                          sizeof(KernelLaunchEnvironmentTy),
                                          AsyncInfoWrapper))
    return std::move(Err);
  return static_cast<KernelLaunchEnvironmentTy *>(*AllocOrErr);
}

Error GenericKernelTy::printLaunchInfo(GenericDeviceTy &GenericDevice,
                                       KernelArgsTy &KernelArgs,
                                       uint32_t NumThreads[3],
                                       uint32_t NumBlocks[3]) const {
  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, GenericDevice.getDeviceId(),
       "Launching kernel %s with [%u,%u,%u] blocks and [%u,%u,%u] threads in "
       "%s mode\n",
       getName(), NumBlocks[0], NumBlocks[1], NumBlocks[2], NumThreads[0],
       NumThreads[1], NumThreads[2], getExecutionModeName());
  return printLaunchInfoDetails(GenericDevice, KernelArgs, NumThreads,
                                NumBlocks);
}

Error GenericKernelTy::printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
                                              KernelArgsTy &KernelArgs,
                                              uint32_t NumThreads[3],
                                              uint32_t NumBlocks[3]) const {
  return Plugin::success();
}

Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs,
                              ptrdiff_t *ArgOffsets, KernelArgsTy &KernelArgs,
                              AsyncInfoWrapperTy &AsyncInfoWrapper) const {
  llvm::SmallVector<void *, 16> Args;
  llvm::SmallVector<void *, 16> Ptrs;

  auto KernelLaunchEnvOrErr = getKernelLaunchEnvironment(
      GenericDevice, KernelArgs.Version, AsyncInfoWrapper);
  if (!KernelLaunchEnvOrErr)
    return KernelLaunchEnvOrErr.takeError();

  KernelLaunchParamsTy LaunchParams;

  // Kernel languages don't use indirection.
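  // For CUDA-style launches, ArgPtrs already carries a ready-made
  // KernelLaunchParamsTy describing the packed parameter memory; the OpenMP
  // path below instead builds the per-argument pointer indirection in
  // prepareArgs().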
  if (KernelArgs.Flags.IsCUDA) {
    LaunchParams =
        *reinterpret_cast<KernelLaunchParamsTy *>(KernelArgs.ArgPtrs);
  } else {
    LaunchParams =
        prepareArgs(GenericDevice, ArgPtrs, ArgOffsets, KernelArgs.NumArgs,
                    Args, Ptrs, *KernelLaunchEnvOrErr);
  }

  uint32_t NumThreads[3] = {KernelArgs.ThreadLimit[0],
                            KernelArgs.ThreadLimit[1],
                            KernelArgs.ThreadLimit[2]};
  uint32_t NumBlocks[3] = {KernelArgs.NumTeams[0], KernelArgs.NumTeams[1],
                           KernelArgs.NumTeams[2]};
  if (!IsBareKernel) {
    NumThreads[0] = getNumThreads(GenericDevice, NumThreads);
    NumBlocks[0] = getNumBlocks(GenericDevice, NumBlocks, KernelArgs.Tripcount,
                                NumThreads[0], KernelArgs.ThreadLimit[0] > 0);
  }

  // Record the kernel description after we modified the argument count and
  // num blocks/threads.
  RecordReplayTy &RecordReplay = GenericDevice.Plugin.getRecordReplay();
  if (RecordReplay.isRecording()) {
    RecordReplay.saveImage(getName(), getImage());
    RecordReplay.saveKernelInput(getName(), getImage());
    RecordReplay.saveKernelDescr(getName(), LaunchParams, KernelArgs.NumArgs,
                                 NumBlocks[0], NumThreads[0],
                                 KernelArgs.Tripcount);
  }

  if (auto Err =
          printLaunchInfo(GenericDevice, KernelArgs, NumThreads, NumBlocks))
    return Err;

  return launchImpl(GenericDevice, NumThreads, NumBlocks, KernelArgs,
                    LaunchParams, AsyncInfoWrapper);
}

KernelLaunchParamsTy GenericKernelTy::prepareArgs(
    GenericDeviceTy &GenericDevice, void **ArgPtrs, ptrdiff_t *ArgOffsets,
    uint32_t &NumArgs, llvm::SmallVectorImpl<void *> &Args,
    llvm::SmallVectorImpl<void *> &Ptrs,
    KernelLaunchEnvironmentTy *KernelLaunchEnvironment) const {
  uint32_t KLEOffset = !!KernelLaunchEnvironment;
  NumArgs += KLEOffset;

  if (NumArgs == 0)
    return KernelLaunchParamsTy{};

  Args.resize(NumArgs);
  Ptrs.resize(NumArgs);

  if (KernelLaunchEnvironment) {
    Args[0] = KernelLaunchEnvironment;
    Ptrs[0] = &Args[0];
  }

  for (uint32_t I = KLEOffset; I < NumArgs; ++I) {
    Args[I] =
        (void *)((intptr_t)ArgPtrs[I - KLEOffset] + ArgOffsets[I - KLEOffset]);
    Ptrs[I] = &Args[I];
  }
  return KernelLaunchParamsTy{sizeof(void *) * NumArgs, &Args[0], &Ptrs[0]};
}

uint32_t GenericKernelTy::getNumThreads(GenericDeviceTy &GenericDevice,
                                        uint32_t ThreadLimitClause[3]) const {
  assert(!IsBareKernel && "bare kernel should not call this function");

  assert(ThreadLimitClause[1] == 1 && ThreadLimitClause[2] == 1 &&
         "Multi dimensional launch not supported yet.");

  if (ThreadLimitClause[0] > 0 && isGenericMode())
    ThreadLimitClause[0] += GenericDevice.getWarpSize();

  return std::min(MaxNumThreads, (ThreadLimitClause[0] > 0)
                                     ? ThreadLimitClause[0]
                                     : PreferredNumThreads);
}

uint32_t GenericKernelTy::getNumBlocks(GenericDeviceTy &GenericDevice,
                                       uint32_t NumTeamsClause[3],
                                       uint64_t LoopTripCount,
                                       uint32_t &NumThreads,
                                       bool IsNumThreadsFromUser) const {
  assert(!IsBareKernel && "bare kernel should not call this function");

  assert(NumTeamsClause[1] == 1 && NumTeamsClause[2] == 1 &&
         "Multi dimensional launch not supported yet.");

  if (NumTeamsClause[0] > 0) {
    // TODO: We need to honor any value and consequently allow more than the
    // block limit. For this we might need to start multiple kernels or let
    // the blocks start again until the requested number has been started.
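    // For now the clause value is simply clamped. Illustrative example: with
    // num_teams(1000000) on a device whose block limit is 65535, we launch
    // 65535 blocks.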
    return std::min(NumTeamsClause[0], GenericDevice.getBlockLimit());
  }

  uint64_t DefaultNumBlocks = GenericDevice.getDefaultNumBlocks();
  uint64_t TripCountNumBlocks = std::numeric_limits<uint64_t>::max();
  if (LoopTripCount > 0) {
    if (isSPMDMode()) {
      // We have a combined construct, i.e. `target teams distribute
      // parallel for [simd]`. We launch enough teams so that each thread
      // executes one iteration of the loop, rounded up to the nearest
      // integer. However, if that results in too few teams, we artificially
      // reduce the thread count per team to increase the outer parallelism.
      auto MinThreads = GenericDevice.getMinThreadsForLowTripCountLoop();
      MinThreads = std::min(MinThreads, NumThreads);

      // Honor the thread_limit clause; only lower the number of threads.
      [[maybe_unused]] auto OldNumThreads = NumThreads;
      if (LoopTripCount >= DefaultNumBlocks * NumThreads ||
          IsNumThreadsFromUser) {
        // Enough parallelism for teams and threads.
        TripCountNumBlocks = ((LoopTripCount - 1) / NumThreads) + 1;
        assert((IsNumThreadsFromUser ||
                TripCountNumBlocks >= DefaultNumBlocks) &&
               "Expected sufficient outer parallelism.");
      } else if (LoopTripCount >= DefaultNumBlocks * MinThreads) {
        // Enough parallelism for teams, limit threads.

        // This case is hard; for now, we force "full warps":
        // First, compute a thread count assuming DefaultNumBlocks.
        auto NumThreadsDefaultBlocks =
            (LoopTripCount + DefaultNumBlocks - 1) / DefaultNumBlocks;
        // Now get a power of two that is larger or equal.
        auto NumThreadsDefaultBlocksP2 =
            llvm::PowerOf2Ceil(NumThreadsDefaultBlocks);
        // Do not increase a thread limit given by the user.
        NumThreads = std::min(NumThreads, uint32_t(NumThreadsDefaultBlocksP2));
        assert(NumThreads >= MinThreads &&
               "Expected sufficient inner parallelism.");
        TripCountNumBlocks = ((LoopTripCount - 1) / NumThreads) + 1;
      } else {
        // Not enough parallelism for teams and threads, limit both.
        NumThreads = std::min(NumThreads, MinThreads);
        TripCountNumBlocks = ((LoopTripCount - 1) / NumThreads) + 1;
      }

      assert(NumThreads * TripCountNumBlocks >= LoopTripCount &&
             "Expected sufficient parallelism");
      assert(OldNumThreads >= NumThreads &&
             "Number of threads cannot be increased!");
    } else {
      assert((isGenericMode() || isGenericSPMDMode()) &&
             "Unexpected execution mode!");
      // If we reach this point, then we have a non-combined construct, i.e.
      // `teams distribute` with a nested `parallel for` and each team is
      // assigned one iteration of the `distribute` loop. E.g.:
      //
      // #pragma omp target teams distribute
      // for(...loop_tripcount...) {
      //   #pragma omp parallel for
      //   for(...) {}
      // }
      //
      // Threads within a team will execute the iterations of the `parallel`
      // loop.
      TripCountNumBlocks = LoopTripCount;
    }
  }

  uint32_t PreferredNumBlocks = TripCountNumBlocks;
  // If the loops are long running, we would rather reuse blocks than spawn
  // too many.
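  // Illustrative numbers: with TripCountNumBlocks = 1000000 and
  // DefaultNumBlocks = 512, we launch 512 blocks and let each one process
  // many loop chunks rather than spawning one block per chunk.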
  if (GenericDevice.getReuseBlocksForHighTripCount())
    PreferredNumBlocks = std::min(TripCountNumBlocks, DefaultNumBlocks);
  return std::min(PreferredNumBlocks, GenericDevice.getBlockLimit());
}

GenericDeviceTy::GenericDeviceTy(GenericPluginTy &Plugin, int32_t DeviceId,
                                 int32_t NumDevices,
                                 const llvm::omp::GV &OMPGridValues)
    : Plugin(Plugin), MemoryManager(nullptr), OMP_TeamLimit("OMP_TEAM_LIMIT"),
      OMP_NumTeams("OMP_NUM_TEAMS"),
      OMP_TeamsThreadLimit("OMP_TEAMS_THREAD_LIMIT"),
      OMPX_DebugKind("LIBOMPTARGET_DEVICE_RTL_DEBUG"),
      OMPX_SharedMemorySize("LIBOMPTARGET_SHARED_MEMORY_SIZE"),
      // Do not initialize the following two envars since they depend on the
      // device initialization. These cannot be consulted until the device is
      // initialized correctly. We initialize them in GenericDeviceTy::init().
      OMPX_TargetStackSize(), OMPX_TargetHeapSize(),
      // By default, the initial number of streams and events is 1.
      OMPX_InitialNumStreams("LIBOMPTARGET_NUM_INITIAL_STREAMS", 1),
      OMPX_InitialNumEvents("LIBOMPTARGET_NUM_INITIAL_EVENTS", 1),
      DeviceId(DeviceId), GridValues(OMPGridValues),
      PeerAccesses(NumDevices, PeerAccessState::PENDING), PeerAccessesLock(),
      PinnedAllocs(*this), RPCServer(nullptr) {
#ifdef OMPT_SUPPORT
  OmptInitialized.store(false);
  // Bind the callbacks to this device's member functions
#define bindOmptCallback(Name, Type, Code)                                     \
  if (ompt::Initialized && ompt::lookupCallbackByCode) {                       \
    ompt::lookupCallbackByCode((ompt_callbacks_t)(Code),                       \
                               ((ompt_callback_t *)&(Name##_fn)));             \
    DP("OMPT: class bound %s=%p\n", #Name, ((void *)(uint64_t)Name##_fn));     \
  }

  FOREACH_OMPT_DEVICE_EVENT(bindOmptCallback);
#undef bindOmptCallback

#endif
}

Error GenericDeviceTy::init(GenericPluginTy &Plugin) {
  if (auto Err = initImpl(Plugin))
    return Err;

#ifdef OMPT_SUPPORT
  if (ompt::Initialized) {
    bool ExpectedStatus = false;
    if (OmptInitialized.compare_exchange_strong(ExpectedStatus, true))
      performOmptCallback(device_initialize, Plugin.getUserId(DeviceId),
                          /*type=*/getComputeUnitKind().c_str(),
                          /*device=*/reinterpret_cast<ompt_device_t *>(this),
                          /*lookup=*/ompt::lookupCallbackByName,
                          /*documentation=*/nullptr);
  }
#endif

  // Read and reinitialize the envars that depend on the device initialization.
  // Notice these two envars may change the stack size and heap size of the
  // device, so they need the device properly initialized.
  auto StackSizeEnvarOrErr = UInt64Envar::create(
      "LIBOMPTARGET_STACK_SIZE",
      [this](uint64_t &V) -> Error { return getDeviceStackSize(V); },
      [this](uint64_t V) -> Error { return setDeviceStackSize(V); });
  if (!StackSizeEnvarOrErr)
    return StackSizeEnvarOrErr.takeError();
  OMPX_TargetStackSize = std::move(*StackSizeEnvarOrErr);

  auto HeapSizeEnvarOrErr = UInt64Envar::create(
      "LIBOMPTARGET_HEAP_SIZE",
      [this](uint64_t &V) -> Error { return getDeviceHeapSize(V); },
      [this](uint64_t V) -> Error { return setDeviceHeapSize(V); });
  if (!HeapSizeEnvarOrErr)
    return HeapSizeEnvarOrErr.takeError();
  OMPX_TargetHeapSize = std::move(*HeapSizeEnvarOrErr);

  // Update the maximum number of teams and threads after the device
  // initialization sets the corresponding hardware limit.
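  // For example (hypothetical values): with OMP_NUM_TEAMS=1024 and a hardware
  // maximum of 65536 teams, GV_Max_Teams becomes 1024; an envar value above
  // the hardware limit is ignored by the std::min below.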
  if (OMP_NumTeams > 0)
    GridValues.GV_Max_Teams =
        std::min(GridValues.GV_Max_Teams, uint32_t(OMP_NumTeams));

  if (OMP_TeamsThreadLimit > 0)
    GridValues.GV_Max_WG_Size =
        std::min(GridValues.GV_Max_WG_Size, uint32_t(OMP_TeamsThreadLimit));

  // Enable the memory manager if required.
  auto [ThresholdMM, EnableMM] = MemoryManagerTy::getSizeThresholdFromEnv();
  if (EnableMM)
    MemoryManager = new MemoryManagerTy(*this, ThresholdMM);

  return Plugin::success();
}

Error GenericDeviceTy::deinit(GenericPluginTy &Plugin) {
  for (DeviceImageTy *Image : LoadedImages)
    if (auto Err = callGlobalDestructors(Plugin, *Image))
      return Err;

  if (OMPX_DebugKind.get() & uint32_t(DeviceDebugKind::AllocationTracker)) {
    GenericGlobalHandlerTy &GHandler = Plugin.getGlobalHandler();
    for (auto *Image : LoadedImages) {
      DeviceMemoryPoolTrackingTy ImageDeviceMemoryPoolTracking = {0, 0, ~0U, 0};
      GlobalTy TrackerGlobal("__omp_rtl_device_memory_pool_tracker",
                             sizeof(DeviceMemoryPoolTrackingTy),
                             &ImageDeviceMemoryPoolTracking);
      if (auto Err =
              GHandler.readGlobalFromDevice(*this, *Image, TrackerGlobal)) {
        consumeError(std::move(Err));
        continue;
      }
      DeviceMemoryPoolTracking.combine(ImageDeviceMemoryPoolTracking);
    }

    // TODO: Write this by default into a file.
    printf("\n\n|-----------------------\n"
           "| Device memory tracker:\n"
           "|-----------------------\n"
           "| #Allocations: %lu\n"
           "| Bytes allocated: %lu\n"
           "| Minimal allocation: %lu\n"
           "| Maximal allocation: %lu\n"
           "|-----------------------\n\n\n",
           DeviceMemoryPoolTracking.NumAllocations,
           DeviceMemoryPoolTracking.AllocationTotal,
           DeviceMemoryPoolTracking.AllocationMin,
           DeviceMemoryPoolTracking.AllocationMax);
  }

  for (auto *Image : LoadedImages) {
    GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler();
    if (!Handler.hasProfilingGlobals(*this, *Image))
      continue;

    auto ProfOrErr = Handler.readProfilingGlobals(*this, *Image);
    if (!ProfOrErr)
      return ProfOrErr.takeError();

    // TODO: Write the data to a profiling file.
    ProfOrErr->dump();
  }

  // Delete the memory manager before deinitializing the device. Otherwise,
  // we may delete device allocations after the device is deinitialized.
  if (MemoryManager)
    delete MemoryManager;
  MemoryManager = nullptr;

  RecordReplayTy &RecordReplay = Plugin.getRecordReplay();
  if (RecordReplay.isRecordingOrReplaying())
    RecordReplay.deinit();

  if (RPCServer)
    if (auto Err = RPCServer->deinitDevice(*this))
      return Err;

#ifdef OMPT_SUPPORT
  if (ompt::Initialized) {
    bool ExpectedStatus = true;
    if (OmptInitialized.compare_exchange_strong(ExpectedStatus, false))
      performOmptCallback(device_finalize, Plugin.getUserId(DeviceId));
  }
#endif

  return deinitImpl();
}

Expected<DeviceImageTy *>
GenericDeviceTy::loadBinary(GenericPluginTy &Plugin,
                            const __tgt_device_image *InputTgtImage) {
  assert(InputTgtImage && "Expected non-null target image");
  DP("Load data from image " DPxMOD "\n", DPxPTR(InputTgtImage->ImageStart));

  auto PostJITImageOrErr = Plugin.getJIT().process(*InputTgtImage, *this);
  if (!PostJITImageOrErr) {
    auto Err = PostJITImageOrErr.takeError();
    REPORT("Failed to JIT IR image %p on device %d: %s\n", InputTgtImage,
           DeviceId, toString(std::move(Err)).data());
    return nullptr;
  }

  // Load the binary and allocate the image object. Use the next available id
  // for the image id, which is the number of previously loaded images.
  auto ImageOrErr =
      loadBinaryImpl(PostJITImageOrErr.get(), LoadedImages.size());
  if (!ImageOrErr)
    return ImageOrErr.takeError();

  DeviceImageTy *Image = *ImageOrErr;
  assert(Image != nullptr && "Invalid image");
  if (InputTgtImage != PostJITImageOrErr.get())
    Image->setTgtImageBitcode(InputTgtImage);

  // Add the image to the list.
  LoadedImages.push_back(Image);

  // Setup the device environment if needed.
  if (auto Err = setupDeviceEnvironment(Plugin, *Image))
    return std::move(Err);

  // Setup the global device memory pool if needed.
  if (!Plugin.getRecordReplay().isReplaying() &&
      shouldSetupDeviceMemoryPool()) {
    uint64_t HeapSize;
    auto SizeOrErr = getDeviceHeapSize(HeapSize);
    if (SizeOrErr) {
      REPORT("No global device memory pool due to error: %s\n",
             toString(std::move(SizeOrErr)).data());
    } else if (auto Err = setupDeviceMemoryPool(Plugin, *Image, HeapSize))
      return std::move(Err);
  }

  if (auto Err = setupRPCServer(Plugin, *Image))
    return std::move(Err);

#ifdef OMPT_SUPPORT
  if (ompt::Initialized) {
    size_t Bytes =
        utils::getPtrDiff(InputTgtImage->ImageEnd, InputTgtImage->ImageStart);
    performOmptCallback(
        device_load, Plugin.getUserId(DeviceId),
        /*FileName=*/nullptr, /*FileOffset=*/0, /*VmaInFile=*/nullptr,
        /*ImgSize=*/Bytes, /*HostAddr=*/InputTgtImage->ImageStart,
        /*DeviceAddr=*/nullptr, /* FIXME: ModuleId */ 0);
  }
#endif

  // Call any global constructors present on the device.
  if (auto Err = callGlobalConstructors(Plugin, *Image))
    return std::move(Err);

  // Return the pointer to the table of entries.
  return Image;
}

Error GenericDeviceTy::setupDeviceEnvironment(GenericPluginTy &Plugin,
                                              DeviceImageTy &Image) {
  // There are some plugins that do not need this step.
  if (!shouldSetupDeviceEnvironment())
    return Plugin::success();

  // Obtain a table mapping host function pointers to device function pointers.
  auto CallTablePairOrErr = setupIndirectCallTable(Plugin, *this, Image);
  if (!CallTablePairOrErr)
    return CallTablePairOrErr.takeError();

  DeviceEnvironmentTy DeviceEnvironment;
  DeviceEnvironment.DeviceDebugKind = OMPX_DebugKind;
  DeviceEnvironment.NumDevices = Plugin.getNumDevices();
  // TODO: The device ID used here is not the real device ID used by OpenMP.
  DeviceEnvironment.DeviceNum = DeviceId;
  DeviceEnvironment.DynamicMemSize = OMPX_SharedMemorySize;
  DeviceEnvironment.ClockFrequency = getClockFrequency();
  DeviceEnvironment.IndirectCallTable =
      reinterpret_cast<uintptr_t>(CallTablePairOrErr->first);
  DeviceEnvironment.IndirectCallTableSize = CallTablePairOrErr->second;
  DeviceEnvironment.HardwareParallelism = getHardwareParallelism();

  // Create the metainfo of the device environment global.
  GlobalTy DevEnvGlobal("__omp_rtl_device_environment",
                        sizeof(DeviceEnvironmentTy), &DeviceEnvironment);

  // Write device environment values to the device.
  GenericGlobalHandlerTy &GHandler = Plugin.getGlobalHandler();
  if (auto Err = GHandler.writeGlobalToDevice(*this, Image, DevEnvGlobal)) {
    DP("Missing symbol %s, continue execution anyway.\n",
       DevEnvGlobal.getName().data());
    consumeError(std::move(Err));
  }
  return Plugin::success();
}

Error GenericDeviceTy::setupDeviceMemoryPool(GenericPluginTy &Plugin,
                                             DeviceImageTy &Image,
                                             uint64_t PoolSize) {
  // Free the old pool, if any.
  if (DeviceMemoryPool.Ptr) {
    if (auto Err = dataDelete(DeviceMemoryPool.Ptr,
                              TargetAllocTy::TARGET_ALLOC_DEVICE))
      return Err;
  }

  DeviceMemoryPool.Size = PoolSize;
  auto AllocOrErr = dataAlloc(PoolSize, /*HostPtr=*/nullptr,
                              TargetAllocTy::TARGET_ALLOC_DEVICE);
  if (AllocOrErr) {
    DeviceMemoryPool.Ptr = *AllocOrErr;
  } else {
    auto Err = AllocOrErr.takeError();
    REPORT("Failure to allocate device memory for global memory pool: %s\n",
           toString(std::move(Err)).data());
    DeviceMemoryPool.Ptr = nullptr;
    DeviceMemoryPool.Size = 0;
  }

  // Skip the pool tracker if there is no tracker symbol in the image.
  GenericGlobalHandlerTy &GHandler = Plugin.getGlobalHandler();
  if (!GHandler.isSymbolInImage(*this, Image,
                                "__omp_rtl_device_memory_pool_tracker")) {
    DP("Skip the memory pool as there is no tracker symbol in the image.\n");
    return Plugin::success();
  }

  GlobalTy TrackerGlobal("__omp_rtl_device_memory_pool_tracker",
                         sizeof(DeviceMemoryPoolTrackingTy),
                         &DeviceMemoryPoolTracking);
  if (auto Err = GHandler.writeGlobalToDevice(*this, Image, TrackerGlobal))
    return Err;

  // Create the metainfo of the device memory pool global.
  GlobalTy DevEnvGlobal("__omp_rtl_device_memory_pool",
                        sizeof(DeviceMemoryPoolTy), &DeviceMemoryPool);

  // Write the device memory pool values to the device.
  return GHandler.writeGlobalToDevice(*this, Image, DevEnvGlobal);
}

Error GenericDeviceTy::setupRPCServer(GenericPluginTy &Plugin,
                                      DeviceImageTy &Image) {
  // The plugin either does not need an RPC server or it is unavailable.
  if (!shouldSetupRPCServer())
    return Plugin::success();

  // Check if this device needs to run an RPC server.
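  // A device "uses RPC" when its image pulls in the RPC client, typically
  // because it calls host services such as printf; the detection itself is
  // delegated to RPCServerTy::isDeviceUsingRPC() below.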
  RPCServerTy &Server = Plugin.getRPCServer();
  auto UsingOrErr =
      Server.isDeviceUsingRPC(*this, Plugin.getGlobalHandler(), Image);
  if (!UsingOrErr)
    return UsingOrErr.takeError();

  if (!UsingOrErr.get())
    return Plugin::success();

  if (auto Err = Server.initDevice(*this, Plugin.getGlobalHandler(), Image))
    return Err;

  if (auto Err = Server.startThread())
    return Err;

  RPCServer = &Server;
  DP("Running an RPC server on device %d\n", getDeviceId());
  return Plugin::success();
}

Error PinnedAllocationMapTy::insertEntry(void *HstPtr, void *DevAccessiblePtr,
                                         size_t Size, bool ExternallyLocked) {
  // Insert the new entry into the map.
  auto Res = Allocs.insert({HstPtr, DevAccessiblePtr, Size, ExternallyLocked});
  if (!Res.second)
    return Plugin::error("Cannot insert locked buffer entry");

  // Check whether the next entry overlaps with the inserted entry.
  auto It = std::next(Res.first);
  if (It == Allocs.end())
    return Plugin::success();

  const EntryTy *NextEntry = &(*It);
  if (intersects(NextEntry->HstPtr, NextEntry->Size, HstPtr, Size))
    return Plugin::error("Partial overlapping not allowed in locked buffers");

  return Plugin::success();
}

Error PinnedAllocationMapTy::eraseEntry(const EntryTy &Entry) {
  // Erase the existing entry. Notice this requires an additional map lookup,
  // but this should not be a performance issue. Using iterators would make
  // the code more difficult to read.
  size_t Erased = Allocs.erase({Entry.HstPtr});
  if (!Erased)
    return Plugin::error("Cannot erase locked buffer entry");
  return Plugin::success();
}

Error PinnedAllocationMapTy::registerEntryUse(const EntryTy &Entry,
                                              void *HstPtr, size_t Size) {
  if (!contains(Entry.HstPtr, Entry.Size, HstPtr, Size))
    return Plugin::error("Partial overlapping not allowed in locked buffers");

  ++Entry.References;
  return Plugin::success();
}

Expected<bool> PinnedAllocationMapTy::unregisterEntryUse(const EntryTy &Entry) {
  if (Entry.References == 0)
    return Plugin::error("Invalid number of references");

  // Return whether this was the last user.
  return (--Entry.References == 0);
}

Error PinnedAllocationMapTy::registerHostBuffer(void *HstPtr,
                                                void *DevAccessiblePtr,
                                                size_t Size) {
  assert(HstPtr && "Invalid pointer");
  assert(DevAccessiblePtr && "Invalid pointer");
  assert(Size && "Invalid size");

  std::lock_guard<std::shared_mutex> Lock(Mutex);

  // No pinned allocation should intersect.
  const EntryTy *Entry = findIntersecting(HstPtr);
  if (Entry)
    return Plugin::error("Cannot insert entry due to an existing one");

  // Now insert the new entry.
  return insertEntry(HstPtr, DevAccessiblePtr, Size);
}

Error PinnedAllocationMapTy::unregisterHostBuffer(void *HstPtr) {
  assert(HstPtr && "Invalid pointer");

  std::lock_guard<std::shared_mutex> Lock(Mutex);

  const EntryTy *Entry = findIntersecting(HstPtr);
  if (!Entry)
    return Plugin::error("Cannot find locked buffer");

  // The address in the entry should be the same we are unregistering.
  if (Entry->HstPtr != HstPtr)
    return Plugin::error("Unexpected host pointer in locked buffer entry");

  // Unregister from the entry.
  auto LastUseOrErr = unregisterEntryUse(*Entry);
  if (!LastUseOrErr)
    return LastUseOrErr.takeError();

  // There should be no other references to the pinned allocation.
  if (!(*LastUseOrErr))
    return Plugin::error("The locked buffer is still being used");

  // Erase the entry from the map.
  return eraseEntry(*Entry);
}

Expected<void *> PinnedAllocationMapTy::lockHostBuffer(void *HstPtr,
                                                       size_t Size) {
  assert(HstPtr && "Invalid pointer");
  assert(Size && "Invalid size");

  std::lock_guard<std::shared_mutex> Lock(Mutex);

  const EntryTy *Entry = findIntersecting(HstPtr);

  if (Entry) {
    // An already registered intersecting buffer was found. Register a new use.
    if (auto Err = registerEntryUse(*Entry, HstPtr, Size))
      return std::move(Err);

    // Return the device accessible pointer with the correct offset.
    return utils::advancePtr(Entry->DevAccessiblePtr,
                             utils::getPtrDiff(HstPtr, Entry->HstPtr));
  }

  // No intersecting registered allocation found in the map. First, lock the
  // host buffer and retrieve the device accessible pointer.
  auto DevAccessiblePtrOrErr = Device.dataLockImpl(HstPtr, Size);
  if (!DevAccessiblePtrOrErr)
    return DevAccessiblePtrOrErr.takeError();

  // Now insert the new entry into the map.
  if (auto Err = insertEntry(HstPtr, *DevAccessiblePtrOrErr, Size))
    return std::move(Err);

  // Return the device accessible pointer.
  return *DevAccessiblePtrOrErr;
}

Error PinnedAllocationMapTy::unlockHostBuffer(void *HstPtr) {
  assert(HstPtr && "Invalid pointer");

  std::lock_guard<std::shared_mutex> Lock(Mutex);

  const EntryTy *Entry = findIntersecting(HstPtr);
  if (!Entry)
    return Plugin::error("Cannot find locked buffer");

  // Unregister a use from the locked buffer.
  auto LastUseOrErr = unregisterEntryUse(*Entry);
  if (!LastUseOrErr)
    return LastUseOrErr.takeError();

  // No need to do anything if there are others using the allocation.
  if (!(*LastUseOrErr))
    return Plugin::success();

  // This was the last user of the allocation. Unlock the original locked
  // buffer if it was locked by the plugin. Do not unlock it if it was locked
  // by an external entity. Unlock the buffer using the host pointer of the
  // entry.
  if (!Entry->ExternallyLocked)
    if (auto Err = Device.dataUnlockImpl(Entry->HstPtr))
      return Err;

  // Erase the entry from the map.
  return eraseEntry(*Entry);
}

Error PinnedAllocationMapTy::lockMappedHostBuffer(void *HstPtr, size_t Size) {
  assert(HstPtr && "Invalid pointer");
  assert(Size && "Invalid size");

  std::lock_guard<std::shared_mutex> Lock(Mutex);

  // If previously registered, just register a new user on the entry.
  const EntryTy *Entry = findIntersecting(HstPtr);
  if (Entry)
    return registerEntryUse(*Entry, HstPtr, Size);

  size_t BaseSize;
  void *BaseHstPtr, *BaseDevAccessiblePtr;

  // Check if it was externally pinned by a vendor-specific API.
  auto IsPinnedOrErr = Device.isPinnedPtrImpl(HstPtr, BaseHstPtr,
                                              BaseDevAccessiblePtr, BaseSize);
  if (!IsPinnedOrErr)
    return IsPinnedOrErr.takeError();

  // If pinned, just insert the entry representing the whole pinned buffer.
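  // For example (illustrative): if HstPtr falls inside a 1 MiB region that a
  // vendor API pinned earlier, we record that whole base region, so later
  // sub-buffer lookups resolve against a single entry.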
  if (*IsPinnedOrErr)
    return insertEntry(BaseHstPtr, BaseDevAccessiblePtr, BaseSize,
                       /*ExternallyLocked=*/true);

  // Not externally pinned. Do nothing if locking of mapped buffers is
  // disabled.
  if (!LockMappedBuffers)
    return Plugin::success();

  // Otherwise, lock the buffer and insert the new entry.
  auto DevAccessiblePtrOrErr = Device.dataLockImpl(HstPtr, Size);
  if (!DevAccessiblePtrOrErr) {
    // Errors may be tolerated.
    if (!IgnoreLockMappedFailures)
      return DevAccessiblePtrOrErr.takeError();

    consumeError(DevAccessiblePtrOrErr.takeError());
    return Plugin::success();
  }

  return insertEntry(HstPtr, *DevAccessiblePtrOrErr, Size);
}

Error PinnedAllocationMapTy::unlockUnmappedHostBuffer(void *HstPtr) {
  assert(HstPtr && "Invalid pointer");

  std::lock_guard<std::shared_mutex> Lock(Mutex);

  // Check whether there is any intersecting entry.
  const EntryTy *Entry = findIntersecting(HstPtr);

  // No entry, and automatic locking of mapped buffers is disabled, so there
  // is nothing to do.
  if (!Entry && !LockMappedBuffers)
    return Plugin::success();

  // No entry, automatic locking is enabled, but the locking may have failed,
  // so do nothing.
  if (!Entry && IgnoreLockMappedFailures)
    return Plugin::success();

  // No entry, but automatic locking is enabled, so this is an error.
  if (!Entry)
    return Plugin::error("Locked buffer not found");

  // There is an entry, so unregister a user and check whether it was the last
  // one.
  auto LastUseOrErr = unregisterEntryUse(*Entry);
  if (!LastUseOrErr)
    return LastUseOrErr.takeError();

  // If it is not the last one, there is nothing to do.
  if (!(*LastUseOrErr))
    return Plugin::success();

  // Otherwise, if it was the last and the buffer was locked by the plugin,
  // unlock it.
  if (!Entry->ExternallyLocked)
    if (auto Err = Device.dataUnlockImpl(Entry->HstPtr))
      return Err;

  // Finally erase the entry from the map.
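  // Lifecycle sketch (illustrative): lockMappedHostBuffer() inserts the
  // entry, later overlapping maps only register additional uses, and the
  // final unlock both unlocks the buffer (when the plugin locked it) and
  // erases the entry here.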
  return eraseEntry(*Entry);
}

Error GenericDeviceTy::synchronize(__tgt_async_info *AsyncInfo) {
  if (!AsyncInfo || !AsyncInfo->Queue)
    return Plugin::error("Invalid async info queue");

  if (auto Err = synchronizeImpl(*AsyncInfo))
    return Err;

  for (auto *Ptr : AsyncInfo->AssociatedAllocations)
    if (auto Err = dataDelete(Ptr, TargetAllocTy::TARGET_ALLOC_DEVICE))
      return Err;
  AsyncInfo->AssociatedAllocations.clear();

  return Plugin::success();
}

Error GenericDeviceTy::queryAsync(__tgt_async_info *AsyncInfo) {
  if (!AsyncInfo || !AsyncInfo->Queue)
    return Plugin::error("Invalid async info queue");

  return queryAsyncImpl(*AsyncInfo);
}

Error GenericDeviceTy::memoryVAMap(void **Addr, void *VAddr, size_t *RSize) {
  return Plugin::error("Device does not support VA management");
}

Error GenericDeviceTy::memoryVAUnMap(void *VAddr, size_t Size) {
  return Plugin::error("Device does not support VA management");
}

Error GenericDeviceTy::getDeviceMemorySize(uint64_t &DSize) {
  return Plugin::error(
      "Missing getDeviceMemorySize implementation (required by the "
      "record-replay heuristic)");
}

Expected<void *> GenericDeviceTy::dataAlloc(int64_t Size, void *HostPtr,
                                            TargetAllocTy Kind) {
  void *Alloc = nullptr;

  if (Plugin.getRecordReplay().isRecordingOrReplaying())
    return Plugin.getRecordReplay().alloc(Size);

  switch (Kind) {
  case TARGET_ALLOC_DEFAULT:
  case TARGET_ALLOC_DEVICE_NON_BLOCKING:
  case TARGET_ALLOC_DEVICE:
    if (MemoryManager) {
      Alloc = MemoryManager->allocate(Size, HostPtr);
      if (!Alloc)
        return Plugin::error("Failed to allocate from memory manager");
      break;
    }
    [[fallthrough]];
  case TARGET_ALLOC_HOST:
  case TARGET_ALLOC_SHARED:
    Alloc = allocate(Size, HostPtr, Kind);
    if (!Alloc)
      return Plugin::error("Failed to allocate from device allocator");
  }

  // Report an error if the memory manager or the device allocator did not
  // return any memory buffer.
  if (!Alloc)
    return Plugin::error("Invalid target data allocation kind or requested "
                         "allocator not implemented yet");

  // Register the allocated buffer as pinned memory if the type is host memory.
  if (Kind == TARGET_ALLOC_HOST)
    if (auto Err = PinnedAllocs.registerHostBuffer(Alloc, Alloc, Size))
      return std::move(Err);

  // Keep track of the allocation stack if we track allocation traces.
  if (OMPX_TrackAllocationTraces) {
    std::string StackTrace;
    llvm::raw_string_ostream OS(StackTrace);
    llvm::sys::PrintStackTrace(OS);

    AllocationTraceInfoTy *ATI = new AllocationTraceInfoTy();
    ATI->AllocationTrace = std::move(StackTrace);
    ATI->DevicePtr = Alloc;
    ATI->HostPtr = HostPtr;
    ATI->Size = Size;
    ATI->Kind = Kind;

    auto AllocationTraceMap = AllocationTraces.getExclusiveAccessor();
    auto *&MapATI = (*AllocationTraceMap)[Alloc];
    ATI->LastAllocationInfo = MapATI;
    MapATI = ATI;
  }

  return Alloc;
}

Error GenericDeviceTy::dataDelete(void *TgtPtr, TargetAllocTy Kind) {
  // Free is a no-op when recording or replaying.
  if (Plugin.getRecordReplay().isRecordingOrReplaying())
    return Plugin::success();

  // Keep track of the deallocation stack if we track allocation traces.
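  // These traces feed the ErrorReporter diagnostics below: freeing a pointer
  // that was never allocated, freeing it twice, or freeing it with the wrong
  // allocation kind.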
  if (OMPX_TrackAllocationTraces) {
    AllocationTraceInfoTy *ATI = nullptr;
    {
      auto AllocationTraceMap = AllocationTraces.getExclusiveAccessor();
      ATI = (*AllocationTraceMap)[TgtPtr];
    }

    std::string StackTrace;
    llvm::raw_string_ostream OS(StackTrace);
    llvm::sys::PrintStackTrace(OS);

    if (!ATI)
      ErrorReporter::reportDeallocationOfNonAllocatedPtr(TgtPtr, Kind, ATI,
                                                         StackTrace);

    // ATI is not null, thus we can lock it to inspect and modify it further.
    std::lock_guard<std::mutex> LG(ATI->Lock);
    if (!ATI->DeallocationTrace.empty())
      ErrorReporter::reportDeallocationOfDeallocatedPtr(TgtPtr, Kind, ATI,
                                                        StackTrace);

    if (ATI->Kind != Kind)
      ErrorReporter::reportDeallocationOfWrongPtrKind(TgtPtr, Kind, ATI,
                                                      StackTrace);

    ATI->DeallocationTrace = StackTrace;
  }

  int Res;
  switch (Kind) {
  case TARGET_ALLOC_DEFAULT:
  case TARGET_ALLOC_DEVICE_NON_BLOCKING:
  case TARGET_ALLOC_DEVICE:
    if (MemoryManager) {
      Res = MemoryManager->free(TgtPtr);
      if (Res)
        return Plugin::error(
            "Failure to deallocate device pointer %p via memory manager",
            TgtPtr);
      break;
    }
    [[fallthrough]];
  case TARGET_ALLOC_HOST:
  case TARGET_ALLOC_SHARED:
    Res = free(TgtPtr, Kind);
    if (Res)
      return Plugin::error(
          "Failure to deallocate device pointer %p via device deallocator",
          TgtPtr);
  }

  // Unregister the deallocated pinned memory buffer if the type is host
  // memory.
  if (Kind == TARGET_ALLOC_HOST)
    if (auto Err = PinnedAllocs.unregisterHostBuffer(TgtPtr))
      return Err;

  return Plugin::success();
}

Error GenericDeviceTy::dataSubmit(void *TgtPtr, const void *HstPtr,
                                  int64_t Size, __tgt_async_info *AsyncInfo) {
  AsyncInfoWrapperTy AsyncInfoWrapper(*this, AsyncInfo);

  auto Err = dataSubmitImpl(TgtPtr, HstPtr, Size, AsyncInfoWrapper);
  AsyncInfoWrapper.finalize(Err);
  return Err;
}

Error GenericDeviceTy::dataRetrieve(void *HstPtr, const void *TgtPtr,
                                    int64_t Size, __tgt_async_info *AsyncInfo) {
  AsyncInfoWrapperTy AsyncInfoWrapper(*this, AsyncInfo);

  auto Err = dataRetrieveImpl(HstPtr, TgtPtr, Size, AsyncInfoWrapper);
  AsyncInfoWrapper.finalize(Err);
  return Err;
}

Error GenericDeviceTy::dataExchange(const void *SrcPtr, GenericDeviceTy &DstDev,
                                    void *DstPtr, int64_t Size,
                                    __tgt_async_info *AsyncInfo) {
  AsyncInfoWrapperTy AsyncInfoWrapper(*this, AsyncInfo);

  auto Err = dataExchangeImpl(SrcPtr, DstDev, DstPtr, Size, AsyncInfoWrapper);
  AsyncInfoWrapper.finalize(Err);
  return Err;
}

Error GenericDeviceTy::launchKernel(void *EntryPtr, void **ArgPtrs,
                                    ptrdiff_t *ArgOffsets,
                                    KernelArgsTy &KernelArgs,
                                    __tgt_async_info *AsyncInfo) {
  AsyncInfoWrapperTy AsyncInfoWrapper(
      *this, Plugin.getRecordReplay().isRecordingOrReplaying()
                 ? nullptr
                 : AsyncInfo);

  GenericKernelTy &GenericKernel =
      *reinterpret_cast<GenericKernelTy *>(EntryPtr);

  {
    std::string StackTrace;
    if (OMPX_TrackNumKernelLaunches) {
      llvm::raw_string_ostream OS(StackTrace);
      llvm::sys::PrintStackTrace(OS);
    }

    auto KernelTraceInfoRecord = KernelLaunchTraces.getExclusiveAccessor();
    (*KernelTraceInfoRecord)
        .emplace(&GenericKernel, std::move(StackTrace), AsyncInfo);
  }

  auto Err = GenericKernel.launch(*this, ArgPtrs, ArgOffsets, KernelArgs,
                                  AsyncInfoWrapper);

  // Finalize here to guarantee that subsequent record-replay actions stay
  // in sync.
  AsyncInfoWrapper.finalize(Err);

  RecordReplayTy &RecordReplay = Plugin.getRecordReplay();
  if (RecordReplay.isRecordingOrReplaying() &&
      RecordReplay.isSaveOutputEnabled())
    RecordReplay.saveKernelOutputInfo(GenericKernel.getName());

  return Err;
}

Error GenericDeviceTy::initAsyncInfo(__tgt_async_info **AsyncInfoPtr) {
  assert(AsyncInfoPtr && "Invalid async info");

  *AsyncInfoPtr = new __tgt_async_info();

  AsyncInfoWrapperTy AsyncInfoWrapper(*this, *AsyncInfoPtr);

  auto Err = initAsyncInfoImpl(AsyncInfoWrapper);
  AsyncInfoWrapper.finalize(Err);
  return Err;
}

Error GenericDeviceTy::initDeviceInfo(__tgt_device_info *DeviceInfo) {
  assert(DeviceInfo && "Invalid device info");

  return initDeviceInfoImpl(DeviceInfo);
}

Error GenericDeviceTy::printInfo() {
  InfoQueueTy InfoQueue;

  // Get the vendor-specific info entries describing the device properties.
  if (auto Err = obtainInfoImpl(InfoQueue))
    return Err;

  // Print all info entries.
  InfoQueue.print();

  return Plugin::success();
}

Error GenericDeviceTy::createEvent(void **EventPtrStorage) {
  return createEventImpl(EventPtrStorage);
}

Error GenericDeviceTy::destroyEvent(void *EventPtr) {
  return destroyEventImpl(EventPtr);
}

Error GenericDeviceTy::recordEvent(void *EventPtr,
                                   __tgt_async_info *AsyncInfo) {
  AsyncInfoWrapperTy AsyncInfoWrapper(*this, AsyncInfo);

  auto Err = recordEventImpl(EventPtr, AsyncInfoWrapper);
  AsyncInfoWrapper.finalize(Err);
  return Err;
}

Error GenericDeviceTy::waitEvent(void *EventPtr, __tgt_async_info *AsyncInfo) {
  AsyncInfoWrapperTy AsyncInfoWrapper(*this, AsyncInfo);

  auto Err = waitEventImpl(EventPtr, AsyncInfoWrapper);
  AsyncInfoWrapper.finalize(Err);
  return Err;
}

Error GenericDeviceTy::syncEvent(void *EventPtr) {
  return syncEventImpl(EventPtr);
}

bool GenericDeviceTy::useAutoZeroCopy() { return useAutoZeroCopyImpl(); }

Error GenericPluginTy::init() {
  if (Initialized)
    return Plugin::success();

  auto NumDevicesOrErr = initImpl();
  if (!NumDevicesOrErr)
    return NumDevicesOrErr.takeError();
  Initialized = true;

  NumDevices = *NumDevicesOrErr;
  if (NumDevices == 0)
    return Plugin::success();

  assert(Devices.size() == 0 && "Plugin already initialized");
  Devices.resize(NumDevices, nullptr);

  GlobalHandler = createGlobalHandler();
  assert(GlobalHandler && "Invalid global handler");

  RPCServer = new RPCServerTy(*this);
  assert(RPCServer && "Invalid RPC server");

  RecordReplay = new RecordReplayTy();
  assert(RecordReplay &&

Error GenericDeviceTy::initAsyncInfo(__tgt_async_info **AsyncInfoPtr) {
  assert(AsyncInfoPtr && "Invalid async info");

  *AsyncInfoPtr = new __tgt_async_info();

  AsyncInfoWrapperTy AsyncInfoWrapper(*this, *AsyncInfoPtr);

  auto Err = initAsyncInfoImpl(AsyncInfoWrapper);
  AsyncInfoWrapper.finalize(Err);
  return Err;
}

Error GenericDeviceTy::initDeviceInfo(__tgt_device_info *DeviceInfo) {
  assert(DeviceInfo && "Invalid device info");

  return initDeviceInfoImpl(DeviceInfo);
}

Error GenericDeviceTy::printInfo() {
  InfoQueueTy InfoQueue;

  // Get the vendor-specific info entries describing the device properties.
  if (auto Err = obtainInfoImpl(InfoQueue))
    return Err;

  // Print all info entries.
  InfoQueue.print();

  return Plugin::success();
}

Error GenericDeviceTy::createEvent(void **EventPtrStorage) {
  return createEventImpl(EventPtrStorage);
}

Error GenericDeviceTy::destroyEvent(void *EventPtr) {
  return destroyEventImpl(EventPtr);
}

Error GenericDeviceTy::recordEvent(void *EventPtr,
                                   __tgt_async_info *AsyncInfo) {
  AsyncInfoWrapperTy AsyncInfoWrapper(*this, AsyncInfo);

  auto Err = recordEventImpl(EventPtr, AsyncInfoWrapper);
  AsyncInfoWrapper.finalize(Err);
  return Err;
}

Error GenericDeviceTy::waitEvent(void *EventPtr, __tgt_async_info *AsyncInfo) {
  AsyncInfoWrapperTy AsyncInfoWrapper(*this, AsyncInfo);

  auto Err = waitEventImpl(EventPtr, AsyncInfoWrapper);
  AsyncInfoWrapper.finalize(Err);
  return Err;
}

Error GenericDeviceTy::syncEvent(void *EventPtr) {
  return syncEventImpl(EventPtr);
}

bool GenericDeviceTy::useAutoZeroCopy() { return useAutoZeroCopyImpl(); }

Error GenericPluginTy::init() {
  if (Initialized)
    return Plugin::success();

  auto NumDevicesOrErr = initImpl();
  if (!NumDevicesOrErr)
    return NumDevicesOrErr.takeError();
  Initialized = true;

  NumDevices = *NumDevicesOrErr;
  if (NumDevices == 0)
    return Plugin::success();

  assert(Devices.size() == 0 && "Plugin already initialized");
  Devices.resize(NumDevices, nullptr);

  GlobalHandler = createGlobalHandler();
  assert(GlobalHandler && "Invalid global handler");

  RPCServer = new RPCServerTy(*this);
  assert(RPCServer && "Invalid RPC server");

  RecordReplay = new RecordReplayTy();
  assert(RecordReplay && "Invalid RR interface");

  return Plugin::success();
}
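
// Because of the 'Initialized' guard above, init() is idempotent. A minimal
// sketch of the expected calling pattern (hypothetical caller):
//
//   GenericPluginTy &Plugin = ...;
//   if (Error Err = Plugin.init())   // first call runs initImpl()
//     return Err;
//   if (Error Err = Plugin.init())   // later calls return success directly
//     return Err;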

Error GenericPluginTy::deinit() {
  assert(Initialized && "Plugin was not initialized!");

  // Deinitialize all active devices.
  for (int32_t DeviceId = 0; DeviceId < NumDevices; ++DeviceId) {
    if (Devices[DeviceId]) {
      if (auto Err = deinitDevice(DeviceId))
        return Err;
    }
    assert(!Devices[DeviceId] && "Device was not deinitialized");
  }

  // There is no global handler if no device is available.
  if (GlobalHandler)
    delete GlobalHandler;

  if (RPCServer && RPCServer->Thread->Running.load(std::memory_order_relaxed))
    if (Error Err = RPCServer->shutDown())
      return Err;

  if (RPCServer)
    delete RPCServer;

  if (RecordReplay)
    delete RecordReplay;

  // Perform last deinitializations on the plugin.
  if (Error Err = deinitImpl())
    return Err;
  Initialized = false;

  return Plugin::success();
}

Error GenericPluginTy::initDevice(int32_t DeviceId) {
  assert(!Devices[DeviceId] && "Device already initialized");

  // Create the device and save the reference.
  GenericDeviceTy *Device = createDevice(*this, DeviceId, NumDevices);
  assert(Device && "Invalid device");

  // Save the device reference into the list.
  Devices[DeviceId] = Device;

  // Initialize the device and its resources.
  return Device->init(*this);
}

Error GenericPluginTy::deinitDevice(int32_t DeviceId) {
  // The device may already be deinitialized.
  if (Devices[DeviceId] == nullptr)
    return Plugin::success();

  // Deinitialize the device and release its resources.
  if (auto Err = Devices[DeviceId]->deinit(*this))
    return Err;

  // Delete the device and invalidate its reference.
  delete Devices[DeviceId];
  Devices[DeviceId] = nullptr;

  return Plugin::success();
}

Expected<bool> GenericPluginTy::checkELFImage(StringRef Image) const {
  // First check if this image is a regular ELF file.
  if (!utils::elf::isELF(Image))
    return false;

  // Check if this image is an ELF with a matching machine value.
  auto MachineOrErr = utils::elf::checkMachine(Image, getMagicElfBits());
  if (!MachineOrErr)
    return MachineOrErr.takeError();

  return MachineOrErr;
}

Expected<bool> GenericPluginTy::checkBitcodeImage(StringRef Image) const {
  if (identify_magic(Image) != file_magic::bitcode)
    return false;

  LLVMContext Context;
  auto ModuleOrErr = getLazyBitcodeModule(MemoryBufferRef(Image, ""), Context,
                                          /*ShouldLazyLoadMetadata=*/true);
  if (!ModuleOrErr)
    return ModuleOrErr.takeError();
  Module &M = **ModuleOrErr;

  return Triple(M.getTargetTriple()).getArch() == getTripleArch();
}
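
// For example, a bitcode image produced for an AMD GPU (target triple
// 'amdgcn-amd-amdhsa') is only accepted by a plugin whose getTripleArch()
// returns Triple::amdgcn. The module is loaded lazily, so function bodies
// need not be materialized just to inspect the target triple.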

int32_t GenericPluginTy::is_initialized() const { return Initialized; }

int32_t GenericPluginTy::is_plugin_compatible(__tgt_device_image *Image) {
  StringRef Buffer(reinterpret_cast<const char *>(Image->ImageStart),
                   utils::getPtrDiff(Image->ImageEnd, Image->ImageStart));

  auto HandleError = [&](Error Err) -> bool {
    [[maybe_unused]] std::string ErrStr = toString(std::move(Err));
    DP("Failure to check validity of image %p: %s\n", Image, ErrStr.c_str());
    return false;
  };
  switch (identify_magic(Buffer)) {
  case file_magic::elf:
  case file_magic::elf_relocatable:
  case file_magic::elf_executable:
  case file_magic::elf_shared_object:
  case file_magic::elf_core: {
    auto MatchOrErr = checkELFImage(Buffer);
    if (Error Err = MatchOrErr.takeError())
      return HandleError(std::move(Err));
    return *MatchOrErr;
  }
  case file_magic::bitcode: {
    auto MatchOrErr = checkBitcodeImage(Buffer);
    if (Error Err = MatchOrErr.takeError())
      return HandleError(std::move(Err));
    return *MatchOrErr;
  }
  default:
    return false;
  }
}

int32_t GenericPluginTy::is_device_compatible(int32_t DeviceId,
                                              __tgt_device_image *Image) {
  StringRef Buffer(reinterpret_cast<const char *>(Image->ImageStart),
                   utils::getPtrDiff(Image->ImageEnd, Image->ImageStart));

  auto HandleError = [&](Error Err) -> bool {
    [[maybe_unused]] std::string ErrStr = toString(std::move(Err));
    DP("Failure to check validity of image %p: %s\n", Image, ErrStr.c_str());
    return false;
  };
  switch (identify_magic(Buffer)) {
  case file_magic::elf:
  case file_magic::elf_relocatable:
  case file_magic::elf_executable:
  case file_magic::elf_shared_object:
  case file_magic::elf_core: {
    auto MatchOrErr = checkELFImage(Buffer);
    if (Error Err = MatchOrErr.takeError())
      return HandleError(std::move(Err));
    if (!*MatchOrErr)
      return false;

    // Perform plugin-dependent checks for the specific architecture if needed.
    auto CompatibleOrErr = isELFCompatible(DeviceId, Buffer);
    if (Error Err = CompatibleOrErr.takeError())
      return HandleError(std::move(Err));
    return *CompatibleOrErr;
  }
  case file_magic::bitcode: {
    auto MatchOrErr = checkBitcodeImage(Buffer);
    if (Error Err = MatchOrErr.takeError())
      return HandleError(std::move(Err));
    return *MatchOrErr;
  }
  default:
    return false;
  }
}
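
// Compatibility is therefore a two-step test: is_plugin_compatible() answers
// whether any device of this plugin could run the image (file magic plus ELF
// machine or bitcode triple), while is_device_compatible() additionally asks
// the plugin-specific isELFCompatible() hook, e.g., to match the exact GPU
// architecture of the given device.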

int32_t GenericPluginTy::is_device_initialized(int32_t DeviceId) const {
  return isValidDeviceId(DeviceId) && Devices[DeviceId] != nullptr;
}

int32_t GenericPluginTy::init_device(int32_t DeviceId) {
  auto Err = initDevice(DeviceId);
  if (Err) {
    REPORT("Failure to initialize device %d: %s\n", DeviceId,
           toString(std::move(Err)).data());
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

int32_t GenericPluginTy::number_of_devices() { return getNumDevices(); }

int32_t GenericPluginTy::is_data_exchangable(int32_t SrcDeviceId,
                                             int32_t DstDeviceId) {
  return isDataExchangable(SrcDeviceId, DstDeviceId);
}

int32_t GenericPluginTy::initialize_record_replay(int32_t DeviceId,
                                                  int64_t MemorySize,
                                                  void *VAddr, bool isRecord,
                                                  bool SaveOutput,
                                                  uint64_t &ReqPtrArgOffset) {
  GenericDeviceTy &Device = getDevice(DeviceId);
  RecordReplayTy::RRStatusTy Status =
      isRecord ? RecordReplayTy::RRStatusTy::RRRecording
               : RecordReplayTy::RRStatusTy::RRReplaying;

  if (auto Err = RecordReplay->init(&Device, MemorySize, VAddr, Status,
                                    SaveOutput, ReqPtrArgOffset)) {
    REPORT("WARNING: Record-replay failed to initialize with %" PRId64
           " bytes (Error: %s)\n",
           MemorySize, toString(std::move(Err)).data());
    RecordReplay->setStatus(RecordReplayTy::RRStatusTy::RRDeactivated);

    // Recording can continue without record-replay support; replaying cannot.
    if (!isRecord) {
      return OFFLOAD_FAIL;
    }
  }
  return OFFLOAD_SUCCESS;
}

int32_t GenericPluginTy::load_binary(int32_t DeviceId,
                                     __tgt_device_image *TgtImage,
                                     __tgt_device_binary *Binary) {
  GenericDeviceTy &Device = getDevice(DeviceId);

  auto ImageOrErr = Device.loadBinary(*this, TgtImage);
  if (!ImageOrErr) {
    auto Err = ImageOrErr.takeError();
    REPORT("Failure to load binary image %p on device %d: %s\n", TgtImage,
           DeviceId, toString(std::move(Err)).data());
    return OFFLOAD_FAIL;
  }

  DeviceImageTy *Image = *ImageOrErr;
  assert(Image != nullptr && "Invalid Image");

  *Binary = __tgt_device_binary{reinterpret_cast<uint64_t>(Image)};

  return OFFLOAD_SUCCESS;
}

void *GenericPluginTy::data_alloc(int32_t DeviceId, int64_t Size, void *HostPtr,
                                  int32_t Kind) {
  auto AllocOrErr =
      getDevice(DeviceId).dataAlloc(Size, HostPtr, (TargetAllocTy)Kind);
  if (!AllocOrErr) {
    auto Err = AllocOrErr.takeError();
    REPORT("Failure to allocate device memory: %s\n",
           toString(std::move(Err)).data());
    return nullptr;
  }
  assert(*AllocOrErr && "Null pointer upon successful allocation");

  return *AllocOrErr;
}

int32_t GenericPluginTy::data_delete(int32_t DeviceId, void *TgtPtr,
                                     int32_t Kind) {
  auto Err =
      getDevice(DeviceId).dataDelete(TgtPtr, static_cast<TargetAllocTy>(Kind));
  if (Err) {
    REPORT("Failure to deallocate device pointer %p: %s\n", TgtPtr,
           toString(std::move(Err)).data());
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}
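
// A minimal sketch of the expected allocation/deallocation pairing
// (hypothetical caller; the Kind must match between both calls, otherwise
// the allocation-trace checks in dataDelete() flag the mismatch):
//
//   if (void *P = Plugin.data_alloc(DeviceId, Size, /*HostPtr=*/nullptr,
//                                   TARGET_ALLOC_DEVICE))
//     (void)Plugin.data_delete(DeviceId, P, TARGET_ALLOC_DEVICE);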

int32_t GenericPluginTy::data_lock(int32_t DeviceId, void *Ptr, int64_t Size,
                                   void **LockedPtr) {
  auto LockedPtrOrErr = getDevice(DeviceId).dataLock(Ptr, Size);
  if (!LockedPtrOrErr) {
    auto Err = LockedPtrOrErr.takeError();
    REPORT("Failure to lock memory %p: %s\n", Ptr,
           toString(std::move(Err)).data());
    return OFFLOAD_FAIL;
  }

  if (!(*LockedPtrOrErr)) {
    REPORT("Failure to lock memory %p: obtained a null locked pointer\n", Ptr);
    return OFFLOAD_FAIL;
  }
  *LockedPtr = *LockedPtrOrErr;

  return OFFLOAD_SUCCESS;
}

int32_t GenericPluginTy::data_unlock(int32_t DeviceId, void *Ptr) {
  auto Err = getDevice(DeviceId).dataUnlock(Ptr);
  if (Err) {
    REPORT("Failure to unlock memory %p: %s\n", Ptr,
           toString(std::move(Err)).data());
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

int32_t GenericPluginTy::data_notify_mapped(int32_t DeviceId, void *HstPtr,
                                            int64_t Size) {
  auto Err = getDevice(DeviceId).notifyDataMapped(HstPtr, Size);
  if (Err) {
    REPORT("Failure to notify data mapped %p: %s\n", HstPtr,
           toString(std::move(Err)).data());
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

int32_t GenericPluginTy::data_notify_unmapped(int32_t DeviceId, void *HstPtr) {
  auto Err = getDevice(DeviceId).notifyDataUnmapped(HstPtr);
  if (Err) {
    REPORT("Failure to notify data unmapped %p: %s\n", HstPtr,
           toString(std::move(Err)).data());
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

int32_t GenericPluginTy::data_submit(int32_t DeviceId, void *TgtPtr,
                                     void *HstPtr, int64_t Size) {
  return data_submit_async(DeviceId, TgtPtr, HstPtr, Size,
                           /*AsyncInfoPtr=*/nullptr);
}

int32_t GenericPluginTy::data_submit_async(int32_t DeviceId, void *TgtPtr,
                                           void *HstPtr, int64_t Size,
                                           __tgt_async_info *AsyncInfoPtr) {
  auto Err = getDevice(DeviceId).dataSubmit(TgtPtr, HstPtr, Size, AsyncInfoPtr);
  if (Err) {
    REPORT("Failure to copy data from host to device. Pointers: host "
           "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 ": %s\n",
           DPxPTR(HstPtr), DPxPTR(TgtPtr), Size,
           toString(std::move(Err)).data());
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

int32_t GenericPluginTy::data_retrieve(int32_t DeviceId, void *HstPtr,
                                       void *TgtPtr, int64_t Size) {
  return data_retrieve_async(DeviceId, HstPtr, TgtPtr, Size,
                             /*AsyncInfoPtr=*/nullptr);
}

int32_t GenericPluginTy::data_retrieve_async(int32_t DeviceId, void *HstPtr,
                                             void *TgtPtr, int64_t Size,
                                             __tgt_async_info *AsyncInfoPtr) {
  auto Err =
      getDevice(DeviceId).dataRetrieve(HstPtr, TgtPtr, Size, AsyncInfoPtr);
  if (Err) {
    REPORT("Failure to copy data from device to host. Pointers: host "
           "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 ": %s\n",
           DPxPTR(HstPtr), DPxPTR(TgtPtr), Size,
           toString(std::move(Err)).data());
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

int32_t GenericPluginTy::data_exchange(int32_t SrcDeviceId, void *SrcPtr,
                                       int32_t DstDeviceId, void *DstPtr,
                                       int64_t Size) {
  return data_exchange_async(SrcDeviceId, SrcPtr, DstDeviceId, DstPtr, Size,
                             /*AsyncInfoPtr=*/nullptr);
}

int32_t GenericPluginTy::data_exchange_async(int32_t SrcDeviceId, void *SrcPtr,
                                             int32_t DstDeviceId, void *DstPtr,
                                             int64_t Size,
                                             __tgt_async_info *AsyncInfo) {
  GenericDeviceTy &SrcDevice = getDevice(SrcDeviceId);
  GenericDeviceTy &DstDevice = getDevice(DstDeviceId);
  auto Err = SrcDevice.dataExchange(SrcPtr, DstDevice, DstPtr, Size, AsyncInfo);
  if (Err) {
    REPORT("Failure to copy data from device (%d) to device (%d). Pointers: "
           "src = " DPxMOD ", dst = " DPxMOD ", size = %" PRId64 ": %s\n",
           SrcDeviceId, DstDeviceId, DPxPTR(SrcPtr), DPxPTR(DstPtr), Size,
           toString(std::move(Err)).data());
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}
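
// Typical asynchronous usage of the entry points above (hypothetical caller;
// assumes the backend creates the underlying queue lazily on first use):
//
//   __tgt_async_info AsyncInfo{};
//   Plugin.data_submit_async(DeviceId, TgtPtr, HstPtr, Size, &AsyncInfo);
//   Plugin.data_retrieve_async(DeviceId, HstOut, TgtPtr, Size, &AsyncInfo);
//   Plugin.synchronize(DeviceId, &AsyncInfo); // blocks until both finish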

int32_t GenericPluginTy::launch_kernel(int32_t DeviceId, void *TgtEntryPtr,
                                       void **TgtArgs, ptrdiff_t *TgtOffsets,
                                       KernelArgsTy *KernelArgs,
                                       __tgt_async_info *AsyncInfoPtr) {
  auto Err = getDevice(DeviceId).launchKernel(TgtEntryPtr, TgtArgs, TgtOffsets,
                                              *KernelArgs, AsyncInfoPtr);
  if (Err) {
    REPORT("Failure to run target region " DPxMOD " in device %d: %s\n",
           DPxPTR(TgtEntryPtr), DeviceId, toString(std::move(Err)).data());
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

int32_t GenericPluginTy::synchronize(int32_t DeviceId,
                                     __tgt_async_info *AsyncInfoPtr) {
  auto Err = getDevice(DeviceId).synchronize(AsyncInfoPtr);
  if (Err) {
    REPORT("Failure to synchronize stream %p: %s\n", AsyncInfoPtr->Queue,
           toString(std::move(Err)).data());
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

int32_t GenericPluginTy::query_async(int32_t DeviceId,
                                     __tgt_async_info *AsyncInfoPtr) {
  auto Err = getDevice(DeviceId).queryAsync(AsyncInfoPtr);
  if (Err) {
    REPORT("Failure to query stream %p: %s\n", AsyncInfoPtr->Queue,
           toString(std::move(Err)).data());
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

void GenericPluginTy::print_device_info(int32_t DeviceId) {
  if (auto Err = getDevice(DeviceId).printInfo())
    REPORT("Failure to print device %d info: %s\n", DeviceId,
           toString(std::move(Err)).data());
}

int32_t GenericPluginTy::create_event(int32_t DeviceId, void **EventPtr) {
  auto Err = getDevice(DeviceId).createEvent(EventPtr);
  if (Err) {
    REPORT("Failure to create event: %s\n", toString(std::move(Err)).data());
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

int32_t GenericPluginTy::record_event(int32_t DeviceId, void *EventPtr,
                                      __tgt_async_info *AsyncInfoPtr) {
  auto Err = getDevice(DeviceId).recordEvent(EventPtr, AsyncInfoPtr);
  if (Err) {
    REPORT("Failure to record event %p: %s\n", EventPtr,
           toString(std::move(Err)).data());
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

int32_t GenericPluginTy::wait_event(int32_t DeviceId, void *EventPtr,
                                    __tgt_async_info *AsyncInfoPtr) {
  auto Err = getDevice(DeviceId).waitEvent(EventPtr, AsyncInfoPtr);
  if (Err) {
    REPORT("Failure to wait on event %p: %s\n", EventPtr,
           toString(std::move(Err)).data());
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

int32_t GenericPluginTy::sync_event(int32_t DeviceId, void *EventPtr) {
  auto Err = getDevice(DeviceId).syncEvent(EventPtr);
  if (Err) {
    REPORT("Failure to synchronize event %p: %s\n", EventPtr,
           toString(std::move(Err)).data());
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

int32_t GenericPluginTy::destroy_event(int32_t DeviceId, void *EventPtr) {
  auto Err = getDevice(DeviceId).destroyEvent(EventPtr);
  if (Err) {
    REPORT("Failure to destroy event %p: %s\n", EventPtr,
           toString(std::move(Err)).data());
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

void GenericPluginTy::set_info_flag(uint32_t NewInfoLevel) {
  std::atomic<uint32_t> &InfoLevel = getInfoLevelInternal();
  InfoLevel.store(NewInfoLevel);
}

int32_t GenericPluginTy::init_async_info(int32_t DeviceId,
                                         __tgt_async_info **AsyncInfoPtr) {
  assert(AsyncInfoPtr && "Invalid async info");

  auto Err = getDevice(DeviceId).initAsyncInfo(AsyncInfoPtr);
  if (Err) {
    REPORT("Failure to initialize async info at " DPxMOD " on device %d: %s\n",
           DPxPTR(*AsyncInfoPtr), DeviceId, toString(std::move(Err)).data());
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

int32_t GenericPluginTy::init_device_info(int32_t DeviceId,
                                          __tgt_device_info *DeviceInfo,
                                          const char **ErrStr) {
  *ErrStr = "";

  auto Err = getDevice(DeviceId).initDeviceInfo(DeviceInfo);
  if (Err) {
    REPORT("Failure to initialize device info at " DPxMOD " on device %d: %s\n",
           DPxPTR(DeviceInfo), DeviceId, toString(std::move(Err)).data());
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

int32_t GenericPluginTy::set_device_identifier(int32_t UserId,
                                               int32_t DeviceId) {
  UserDeviceIds[DeviceId] = UserId;

  return OFFLOAD_SUCCESS;
}

int32_t GenericPluginTy::use_auto_zero_copy(int32_t DeviceId) {
  return getDevice(DeviceId).useAutoZeroCopy();
}
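
// Event lifecycle sketch (hypothetical caller; events order work between two
// queues without blocking the host, except for the final sync):
//
//   void *Event = nullptr;
//   Plugin.create_event(DeviceId, &Event);
//   Plugin.record_event(DeviceId, Event, &ProducerAsyncInfo);
//   Plugin.wait_event(DeviceId, Event, &ConsumerAsyncInfo);
//   Plugin.sync_event(DeviceId, Event); // host-side wait, if needed
//   Plugin.destroy_event(DeviceId, Event);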

int32_t GenericPluginTy::get_global(__tgt_device_binary Binary, uint64_t Size,
                                    const char *Name, void **DevicePtr) {
  assert(Binary.handle && "Invalid device binary handle");
  DeviceImageTy &Image = *reinterpret_cast<DeviceImageTy *>(Binary.handle);

  GenericDeviceTy &Device = Image.getDevice();

  GlobalTy DeviceGlobal(Name, Size);
  GenericGlobalHandlerTy &GHandler = getGlobalHandler();
  if (auto Err =
          GHandler.getGlobalMetadataFromDevice(Device, Image, DeviceGlobal)) {
    REPORT("Failure to look up global address: %s\n",
           toString(std::move(Err)).data());
    return OFFLOAD_FAIL;
  }

  *DevicePtr = DeviceGlobal.getPtr();
  assert(*DevicePtr && "Invalid device global address");

  // Save the loaded global if we are recording.
  RecordReplayTy &RecordReplay = Device.Plugin.getRecordReplay();
  if (RecordReplay.isRecording())
    RecordReplay.addEntry(Name, Size, *DevicePtr);

  return OFFLOAD_SUCCESS;
}

int32_t GenericPluginTy::get_function(__tgt_device_binary Binary,
                                      const char *Name, void **KernelPtr) {
  assert(Binary.handle && "Invalid device binary handle");
  DeviceImageTy &Image = *reinterpret_cast<DeviceImageTy *>(Binary.handle);

  GenericDeviceTy &Device = Image.getDevice();

  auto KernelOrErr = Device.constructKernel(Name);
  if (Error Err = KernelOrErr.takeError()) {
    REPORT("Failure to look up kernel: %s\n", toString(std::move(Err)).data());
    return OFFLOAD_FAIL;
  }

  GenericKernelTy &Kernel = *KernelOrErr;
  if (auto Err = Kernel.init(Device, Image)) {
    REPORT("Failure to init kernel: %s\n", toString(std::move(Err)).data());
    return OFFLOAD_FAIL;
  }

  // Note that this is not the kernel's device address.
  *KernelPtr = &Kernel;
  return OFFLOAD_SUCCESS;
}
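
// End-to-end sketch tying the binary-level entry points together
// (hypothetical caller; error handling elided, kernel name assumed):
//
//   __tgt_device_binary Binary;
//   Plugin.load_binary(DeviceId, TgtImage, &Binary);
//
//   void *Kernel = nullptr;
//   Plugin.get_function(Binary, "__omp_offloading_main_l10", &Kernel);
//   Plugin.launch_kernel(DeviceId, Kernel, ArgPtrs, ArgOffsets, &KernelArgs,
//                        &AsyncInfo);
//   Plugin.synchronize(DeviceId, &AsyncInfo);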