1 //===----RTLs/amdgpu/src/rtl.cpp - Target RTLs Implementation ----- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // RTL NextGen for AMDGPU machine 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include <atomic> 14 #include <cassert> 15 #include <cstddef> 16 #include <cstdint> 17 #include <deque> 18 #include <functional> 19 #include <mutex> 20 #include <string> 21 #include <system_error> 22 #include <unistd.h> 23 #include <unordered_map> 24 25 #include "ErrorReporting.h" 26 #include "Shared/APITypes.h" 27 #include "Shared/Debug.h" 28 #include "Shared/Environment.h" 29 #include "Shared/RefCnt.h" 30 #include "Shared/Utils.h" 31 #include "Utils/ELF.h" 32 33 #include "GlobalHandler.h" 34 #include "OpenMP/OMPT/Callback.h" 35 #include "PluginInterface.h" 36 #include "UtilitiesRTL.h" 37 #include "omptarget.h" 38 39 #include "llvm/ADT/SmallString.h" 40 #include "llvm/ADT/SmallVector.h" 41 #include "llvm/ADT/StringRef.h" 42 #include "llvm/BinaryFormat/ELF.h" 43 #include "llvm/Frontend/OpenMP/OMPConstants.h" 44 #include "llvm/Frontend/OpenMP/OMPGridValues.h" 45 #include "llvm/Support/Error.h" 46 #include "llvm/Support/FileOutputBuffer.h" 47 #include "llvm/Support/FileSystem.h" 48 #include "llvm/Support/MemoryBuffer.h" 49 #include "llvm/Support/Program.h" 50 #include "llvm/Support/Signals.h" 51 #include "llvm/Support/raw_ostream.h" 52 53 #if !defined(__BYTE_ORDER__) || !defined(__ORDER_LITTLE_ENDIAN__) || \ 54 !defined(__ORDER_BIG_ENDIAN__) 55 #error "Missing preprocessor definitions for endianness detection." 56 #endif 57 58 // The HSA headers require these definitions. 59 #if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) 60 #define LITTLEENDIAN_CPU 61 #elif defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) 62 #define BIGENDIAN_CPU 63 #endif 64 65 #if defined(__has_include) 66 #if __has_include("hsa.h") 67 #include "hsa.h" 68 #include "hsa_ext_amd.h" 69 #elif __has_include("hsa/hsa.h") 70 #include "hsa/hsa.h" 71 #include "hsa/hsa_ext_amd.h" 72 #endif 73 #else 74 #include "hsa/hsa.h" 75 #include "hsa/hsa_ext_amd.h" 76 #endif 77 78 namespace llvm { 79 namespace omp { 80 namespace target { 81 namespace plugin { 82 83 /// Forward declarations for all specialized data structures. 84 struct AMDGPUKernelTy; 85 struct AMDGPUDeviceTy; 86 struct AMDGPUPluginTy; 87 struct AMDGPUStreamTy; 88 struct AMDGPUEventTy; 89 struct AMDGPUStreamManagerTy; 90 struct AMDGPUEventManagerTy; 91 struct AMDGPUDeviceImageTy; 92 struct AMDGPUMemoryManagerTy; 93 struct AMDGPUMemoryPoolTy; 94 95 namespace hsa_utils { 96 97 /// Iterate elements using an HSA iterate function. Do not use this function 98 /// directly but the specialized ones below instead. 99 template <typename ElemTy, typename IterFuncTy, typename CallbackTy> 100 hsa_status_t iterate(IterFuncTy Func, CallbackTy Cb) { 101 auto L = [](ElemTy Elem, void *Data) -> hsa_status_t { 102 CallbackTy *Unwrapped = static_cast<CallbackTy *>(Data); 103 return (*Unwrapped)(Elem); 104 }; 105 return Func(L, static_cast<void *>(&Cb)); 106 } 107 108 /// Iterate elements using an HSA iterate function passing a parameter. Do not 109 /// use this function directly but the specialized ones below instead. 
110 template <typename ElemTy, typename IterFuncTy, typename IterFuncArgTy,
111 typename CallbackTy>
112 hsa_status_t iterate(IterFuncTy Func, IterFuncArgTy FuncArg, CallbackTy Cb) {
113 auto L = [](ElemTy Elem, void *Data) -> hsa_status_t {
114 CallbackTy *Unwrapped = static_cast<CallbackTy *>(Data);
115 return (*Unwrapped)(Elem);
116 };
117 return Func(FuncArg, L, static_cast<void *>(&Cb));
118 }
119
120 /// Iterate elements using an HSA iterate function passing a parameter. Do not
121 /// use this function directly but the specialized ones below instead.
122 template <typename Elem1Ty, typename Elem2Ty, typename IterFuncTy,
123 typename IterFuncArgTy, typename CallbackTy>
124 hsa_status_t iterate(IterFuncTy Func, IterFuncArgTy FuncArg, CallbackTy Cb) {
125 auto L = [](Elem1Ty Elem1, Elem2Ty Elem2, void *Data) -> hsa_status_t {
126 CallbackTy *Unwrapped = static_cast<CallbackTy *>(Data);
127 return (*Unwrapped)(Elem1, Elem2);
128 };
129 return Func(FuncArg, L, static_cast<void *>(&Cb));
130 }
131
132 /// Iterate agents.
133 template <typename CallbackTy> Error iterateAgents(CallbackTy Callback) {
134 hsa_status_t Status = iterate<hsa_agent_t>(hsa_iterate_agents, Callback);
135 return Plugin::check(Status, "Error in hsa_iterate_agents: %s");
136 }
137
138 /// Iterate ISAs of an agent.
139 template <typename CallbackTy>
140 Error iterateAgentISAs(hsa_agent_t Agent, CallbackTy Cb) {
141 hsa_status_t Status = iterate<hsa_isa_t>(hsa_agent_iterate_isas, Agent, Cb);
142 return Plugin::check(Status, "Error in hsa_agent_iterate_isas: %s");
143 }
144
145 /// Iterate memory pools of an agent.
146 template <typename CallbackTy>
147 Error iterateAgentMemoryPools(hsa_agent_t Agent, CallbackTy Cb) {
148 hsa_status_t Status = iterate<hsa_amd_memory_pool_t>(
149 hsa_amd_agent_iterate_memory_pools, Agent, Cb);
150 return Plugin::check(Status,
151 "Error in hsa_amd_agent_iterate_memory_pools: %s");
152 }
153
154 /// Dispatches an asynchronous memory copy.
155 /// Enables different SDMA engines for the dispatch in a round-robin fashion.
156 Error asyncMemCopy(bool UseMultipleSdmaEngines, void *Dst, hsa_agent_t DstAgent,
157 const void *Src, hsa_agent_t SrcAgent, size_t Size,
158 uint32_t NumDepSignals, const hsa_signal_t *DepSignals,
159 hsa_signal_t CompletionSignal) {
160 if (!UseMultipleSdmaEngines) {
161 hsa_status_t S =
162 hsa_amd_memory_async_copy(Dst, DstAgent, Src, SrcAgent, Size,
163 NumDepSignals, DepSignals, CompletionSignal);
164 return Plugin::check(S, "Error in hsa_amd_memory_async_copy: %s");
165 }
166
167 // This solution is probably not the best
168 #if !(HSA_AMD_INTERFACE_VERSION_MAJOR >= 1 && \
169 HSA_AMD_INTERFACE_VERSION_MINOR >= 2)
170 return Plugin::error("Async copy on selected SDMA requires ROCm 5.7");
171 #else
172 static std::atomic<int> SdmaEngine{1};
173
174 // This atomics solution is probably not the best, but should be sufficient
175 // for now.
176 // In the worst-case scenario, in which threads read the same value, they will
177 // dispatch to the same SDMA engine. This may result in sub-optimal
178 // performance. However, we expect the probability of that to be fairly low.
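// For example, the engine id below cycles 1 -> 2 -> 1 -> ... since
// (1 << 1) % 3 == 2 and (2 << 1) % 3 == 1, so at most two SDMA engines are
// used in the round-robin dispatch.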
179 int LocalSdmaEngine = SdmaEngine.load(std::memory_order_acquire); 180 // This call is only avail in ROCm >= 5.7 181 hsa_status_t S = hsa_amd_memory_async_copy_on_engine( 182 Dst, DstAgent, Src, SrcAgent, Size, NumDepSignals, DepSignals, 183 CompletionSignal, (hsa_amd_sdma_engine_id_t)LocalSdmaEngine, 184 /*force_copy_on_sdma=*/true); 185 // Increment to use one of two SDMA engines: 0x1, 0x2 186 LocalSdmaEngine = (LocalSdmaEngine << 1) % 3; 187 SdmaEngine.store(LocalSdmaEngine, std::memory_order_relaxed); 188 189 return Plugin::check(S, "Error in hsa_amd_memory_async_copy_on_engine: %s"); 190 #endif 191 } 192 193 Error getTargetTripleAndFeatures(hsa_agent_t Agent, 194 SmallVector<SmallString<32>> &Targets) { 195 auto Err = hsa_utils::iterateAgentISAs(Agent, [&](hsa_isa_t ISA) { 196 uint32_t Length; 197 hsa_status_t Status; 198 Status = hsa_isa_get_info_alt(ISA, HSA_ISA_INFO_NAME_LENGTH, &Length); 199 if (Status != HSA_STATUS_SUCCESS) 200 return Status; 201 202 llvm::SmallVector<char> ISAName(Length); 203 Status = hsa_isa_get_info_alt(ISA, HSA_ISA_INFO_NAME, ISAName.begin()); 204 if (Status != HSA_STATUS_SUCCESS) 205 return Status; 206 207 llvm::StringRef TripleTarget(ISAName.begin(), Length); 208 if (TripleTarget.consume_front("amdgcn-amd-amdhsa")) { 209 auto Target = TripleTarget.ltrim('-').rtrim('\0'); 210 Targets.push_back(Target); 211 } 212 return HSA_STATUS_SUCCESS; 213 }); 214 return Err; 215 } 216 } // namespace hsa_utils 217 218 /// Utility class representing generic resource references to AMDGPU resources. 219 template <typename ResourceTy> 220 struct AMDGPUResourceRef : public GenericDeviceResourceRef { 221 /// The underlying handle type for resources. 222 using HandleTy = ResourceTy *; 223 224 /// Create an empty reference to an invalid resource. 225 AMDGPUResourceRef() : Resource(nullptr) {} 226 227 /// Create a reference to an existing resource. 228 AMDGPUResourceRef(HandleTy Resource) : Resource(Resource) {} 229 230 virtual ~AMDGPUResourceRef() {} 231 232 /// Create a new resource and save the reference. The reference must be empty 233 /// before calling to this function. 234 Error create(GenericDeviceTy &Device) override; 235 236 /// Destroy the referenced resource and invalidate the reference. The 237 /// reference must be to a valid resource before calling to this function. 238 Error destroy(GenericDeviceTy &Device) override { 239 if (!Resource) 240 return Plugin::error("Destroying an invalid resource"); 241 242 if (auto Err = Resource->deinit()) 243 return Err; 244 245 delete Resource; 246 247 Resource = nullptr; 248 return Plugin::success(); 249 } 250 251 /// Get the underlying resource handle. 252 operator HandleTy() const { return Resource; } 253 254 private: 255 /// The handle to the actual resource. 256 HandleTy Resource; 257 }; 258 259 /// Class holding an HSA memory pool. 260 struct AMDGPUMemoryPoolTy { 261 /// Create a memory pool from an HSA memory pool. 262 AMDGPUMemoryPoolTy(hsa_amd_memory_pool_t MemoryPool) 263 : MemoryPool(MemoryPool), GlobalFlags(0) {} 264 265 /// Initialize the memory pool retrieving its properties. 266 Error init() { 267 if (auto Err = getAttr(HSA_AMD_MEMORY_POOL_INFO_SEGMENT, Segment)) 268 return Err; 269 270 if (auto Err = getAttr(HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, GlobalFlags)) 271 return Err; 272 273 return Plugin::success(); 274 } 275 276 /// Getter of the HSA memory pool. 277 hsa_amd_memory_pool_t get() const { return MemoryPool; } 278 279 /// Indicate the segment which belongs to. 
280 bool isGlobal() const { return (Segment == HSA_AMD_SEGMENT_GLOBAL); } 281 bool isReadOnly() const { return (Segment == HSA_AMD_SEGMENT_READONLY); } 282 bool isPrivate() const { return (Segment == HSA_AMD_SEGMENT_PRIVATE); } 283 bool isGroup() const { return (Segment == HSA_AMD_SEGMENT_GROUP); } 284 285 /// Indicate if it is fine-grained memory. Valid only for global. 286 bool isFineGrained() const { 287 assert(isGlobal() && "Not global memory"); 288 return (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED); 289 } 290 291 /// Indicate if it is coarse-grained memory. Valid only for global. 292 bool isCoarseGrained() const { 293 assert(isGlobal() && "Not global memory"); 294 return (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED); 295 } 296 297 /// Indicate if it supports storing kernel arguments. Valid only for global. 298 bool supportsKernelArgs() const { 299 assert(isGlobal() && "Not global memory"); 300 return (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT); 301 } 302 303 /// Allocate memory on the memory pool. 304 Error allocate(size_t Size, void **PtrStorage) { 305 hsa_status_t Status = 306 hsa_amd_memory_pool_allocate(MemoryPool, Size, 0, PtrStorage); 307 return Plugin::check(Status, "Error in hsa_amd_memory_pool_allocate: %s"); 308 } 309 310 /// Return memory to the memory pool. 311 Error deallocate(void *Ptr) { 312 hsa_status_t Status = hsa_amd_memory_pool_free(Ptr); 313 return Plugin::check(Status, "Error in hsa_amd_memory_pool_free: %s"); 314 } 315 316 /// Returns if the \p Agent can access the memory pool. 317 bool canAccess(hsa_agent_t Agent) { 318 hsa_amd_memory_pool_access_t Access; 319 if (hsa_amd_agent_memory_pool_get_info( 320 Agent, MemoryPool, HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, &Access)) 321 return false; 322 return Access != HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED; 323 } 324 325 /// Allow the device to access a specific allocation. 326 Error enableAccess(void *Ptr, int64_t Size, 327 const llvm::SmallVector<hsa_agent_t> &Agents) const { 328 #ifdef OMPTARGET_DEBUG 329 for (hsa_agent_t Agent : Agents) { 330 hsa_amd_memory_pool_access_t Access; 331 if (auto Err = 332 getAttr(Agent, HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, Access)) 333 return Err; 334 335 // The agent is not allowed to access the memory pool in any case. Do not 336 // continue because otherwise it result in undefined behavior. 337 if (Access == HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED) 338 return Plugin::error("An agent is not allowed to access a memory pool"); 339 } 340 #endif 341 342 // We can access but it is disabled by default. Enable the access then. 343 hsa_status_t Status = 344 hsa_amd_agents_allow_access(Agents.size(), Agents.data(), nullptr, Ptr); 345 return Plugin::check(Status, "Error in hsa_amd_agents_allow_access: %s"); 346 } 347 348 /// Get attribute from the memory pool. 349 template <typename Ty> 350 Error getAttr(hsa_amd_memory_pool_info_t Kind, Ty &Value) const { 351 hsa_status_t Status; 352 Status = hsa_amd_memory_pool_get_info(MemoryPool, Kind, &Value); 353 return Plugin::check(Status, "Error in hsa_amd_memory_pool_get_info: %s"); 354 } 355 356 template <typename Ty> 357 hsa_status_t getAttrRaw(hsa_amd_memory_pool_info_t Kind, Ty &Value) const { 358 return hsa_amd_memory_pool_get_info(MemoryPool, Kind, &Value); 359 } 360 361 /// Get attribute from the memory pool relating to an agent. 
362 template <typename Ty> 363 Error getAttr(hsa_agent_t Agent, hsa_amd_agent_memory_pool_info_t Kind, 364 Ty &Value) const { 365 hsa_status_t Status; 366 Status = 367 hsa_amd_agent_memory_pool_get_info(Agent, MemoryPool, Kind, &Value); 368 return Plugin::check(Status, 369 "Error in hsa_amd_agent_memory_pool_get_info: %s"); 370 } 371 372 private: 373 /// The HSA memory pool. 374 hsa_amd_memory_pool_t MemoryPool; 375 376 /// The segment where the memory pool belongs to. 377 hsa_amd_segment_t Segment; 378 379 /// The global flags of memory pool. Only valid if the memory pool belongs to 380 /// the global segment. 381 uint32_t GlobalFlags; 382 }; 383 384 /// Class that implements a memory manager that gets memory from a specific 385 /// memory pool. 386 struct AMDGPUMemoryManagerTy : public DeviceAllocatorTy { 387 388 /// Create an empty memory manager. 389 AMDGPUMemoryManagerTy(AMDGPUPluginTy &Plugin) 390 : Plugin(Plugin), MemoryPool(nullptr), MemoryManager(nullptr) {} 391 392 /// Initialize the memory manager from a memory pool. 393 Error init(AMDGPUMemoryPoolTy &MemoryPool) { 394 const uint32_t Threshold = 1 << 30; 395 this->MemoryManager = new MemoryManagerTy(*this, Threshold); 396 this->MemoryPool = &MemoryPool; 397 return Plugin::success(); 398 } 399 400 /// Deinitialize the memory manager and free its allocations. 401 Error deinit() { 402 assert(MemoryManager && "Invalid memory manager"); 403 404 // Delete and invalidate the memory manager. At this point, the memory 405 // manager will deallocate all its allocations. 406 delete MemoryManager; 407 MemoryManager = nullptr; 408 409 return Plugin::success(); 410 } 411 412 /// Reuse or allocate memory through the memory manager. 413 Error allocate(size_t Size, void **PtrStorage) { 414 assert(MemoryManager && "Invalid memory manager"); 415 assert(PtrStorage && "Invalid pointer storage"); 416 417 *PtrStorage = MemoryManager->allocate(Size, nullptr); 418 if (*PtrStorage == nullptr) 419 return Plugin::error("Failure to allocate from AMDGPU memory manager"); 420 421 return Plugin::success(); 422 } 423 424 /// Release an allocation to be reused. 425 Error deallocate(void *Ptr) { 426 assert(Ptr && "Invalid pointer"); 427 428 if (MemoryManager->free(Ptr)) 429 return Plugin::error("Failure to deallocate from AMDGPU memory manager"); 430 431 return Plugin::success(); 432 } 433 434 private: 435 /// Allocation callback that will be called once the memory manager does not 436 /// have more previously allocated buffers. 437 void *allocate(size_t Size, void *HstPtr, TargetAllocTy Kind) override; 438 439 /// Deallocation callack that will be called by the memory manager. 440 int free(void *TgtPtr, TargetAllocTy Kind) override { 441 if (auto Err = MemoryPool->deallocate(TgtPtr)) { 442 consumeError(std::move(Err)); 443 return OFFLOAD_FAIL; 444 } 445 return OFFLOAD_SUCCESS; 446 } 447 448 /// The underlying plugin that owns this memory manager. 449 AMDGPUPluginTy &Plugin; 450 451 /// The memory pool used to allocate memory. 452 AMDGPUMemoryPoolTy *MemoryPool; 453 454 /// Reference to the actual memory manager. 455 MemoryManagerTy *MemoryManager; 456 }; 457 458 /// Class implementing the AMDGPU device images' properties. 459 struct AMDGPUDeviceImageTy : public DeviceImageTy { 460 /// Create the AMDGPU image with the id and the target image pointer. 
461 AMDGPUDeviceImageTy(int32_t ImageId, GenericDeviceTy &Device,
462 const __tgt_device_image *TgtImage)
463 : DeviceImageTy(ImageId, Device, TgtImage) {}
464
465 /// Prepare and load the executable corresponding to the image.
466 Error loadExecutable(const AMDGPUDeviceTy &Device);
467
468 /// Unload the executable.
469 Error unloadExecutable() {
470 hsa_status_t Status = hsa_executable_destroy(Executable);
471 return Plugin::check(Status, "Error in hsa_executable_destroy: %s");
472 }
473
474 /// Get the executable.
475 hsa_executable_t getExecutable() const { return Executable; }
476
477 /// Get the Code Object Version of the ELF.
478 uint16_t getELFABIVersion() const { return ELFABIVersion; }
479
480 /// Find an HSA device symbol by its name on the executable.
481 Expected<hsa_executable_symbol_t>
482 findDeviceSymbol(GenericDeviceTy &Device, StringRef SymbolName) const;
483
484 /// Get additional info for a kernel, e.g., register spill counts.
485 std::optional<offloading::amdgpu::AMDGPUKernelMetaData>
486 getKernelInfo(StringRef Identifier) const {
487 auto It = KernelInfoMap.find(Identifier);
488
489 if (It == KernelInfoMap.end())
490 return {};
491
492 return It->second;
493 }
494
495 private:
496 /// The executable loaded on the agent.
497 hsa_executable_t Executable;
498 StringMap<offloading::amdgpu::AMDGPUKernelMetaData> KernelInfoMap;
499 uint16_t ELFABIVersion;
500 };
501
502 /// Class implementing the AMDGPU kernel functionalities which derives from the
503 /// generic kernel class.
504 struct AMDGPUKernelTy : public GenericKernelTy {
505 /// Create an AMDGPU kernel with a name and an execution mode.
506 AMDGPUKernelTy(const char *Name) : GenericKernelTy(Name) {}
507
508 /// Initialize the AMDGPU kernel.
509 Error initImpl(GenericDeviceTy &Device, DeviceImageTy &Image) override {
510 AMDGPUDeviceImageTy &AMDImage = static_cast<AMDGPUDeviceImageTy &>(Image);
511
512 // Kernel symbols have a ".kd" suffix.
513 std::string KernelName(getName());
514 KernelName += ".kd";
515
516 // Find the symbol on the device executable.
517 auto SymbolOrErr = AMDImage.findDeviceSymbol(Device, KernelName);
518 if (!SymbolOrErr)
519 return SymbolOrErr.takeError();
520
521 hsa_executable_symbol_t Symbol = *SymbolOrErr;
522 hsa_symbol_kind_t SymbolType;
523 hsa_status_t Status;
524
525 // Retrieve different properties of the kernel symbol.
526 std::pair<hsa_executable_symbol_info_t, void *> RequiredInfos[] = {
527 {HSA_EXECUTABLE_SYMBOL_INFO_TYPE, &SymbolType},
528 {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &KernelObject},
529 {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, &ArgsSize},
530 {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &GroupSize},
531 {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK, &DynamicStack},
532 {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &PrivateSize}};
533
534 for (auto &Info : RequiredInfos) {
535 Status = hsa_executable_symbol_get_info(Symbol, Info.first, Info.second);
536 if (auto Err = Plugin::check(
537 Status, "Error in hsa_executable_symbol_get_info: %s"))
538 return Err;
539 }
540
541 // Make sure it is a kernel symbol.
542 if (SymbolType != HSA_SYMBOL_KIND_KERNEL)
543 return Plugin::error("Symbol %s is not a kernel function", getName());
544
545 // TODO: Read the kernel descriptor for the max threads per block. May be
546 // read from the image.
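// Note: the implicit kernel-argument block size depends on the image's code
// object version (56 bytes for COV4, 256 bytes for COV5); see
// getImplicitArgsSize() below.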
547 548 ImplicitArgsSize = 549 hsa_utils::getImplicitArgsSize(AMDImage.getELFABIVersion()); 550 DP("ELFABIVersion: %d\n", AMDImage.getELFABIVersion()); 551 552 // Get additional kernel info read from image 553 KernelInfo = AMDImage.getKernelInfo(getName()); 554 if (!KernelInfo.has_value()) 555 INFO(OMP_INFOTYPE_PLUGIN_KERNEL, Device.getDeviceId(), 556 "Could not read extra information for kernel %s.", getName()); 557 558 return Plugin::success(); 559 } 560 561 /// Launch the AMDGPU kernel function. 562 Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads[3], 563 uint32_t NumBlocks[3], KernelArgsTy &KernelArgs, 564 KernelLaunchParamsTy LaunchParams, 565 AsyncInfoWrapperTy &AsyncInfoWrapper) const override; 566 567 /// Print more elaborate kernel launch info for AMDGPU 568 Error printLaunchInfoDetails(GenericDeviceTy &GenericDevice, 569 KernelArgsTy &KernelArgs, uint32_t NumThreads[3], 570 uint32_t NumBlocks[3]) const override; 571 572 /// Get group and private segment kernel size. 573 uint32_t getGroupSize() const { return GroupSize; } 574 uint32_t getPrivateSize() const { return PrivateSize; } 575 576 /// Get the HSA kernel object representing the kernel function. 577 uint64_t getKernelObject() const { return KernelObject; } 578 579 /// Get the size of implicitargs based on the code object version 580 /// @return 56 for cov4 and 256 for cov5 581 uint32_t getImplicitArgsSize() const { return ImplicitArgsSize; } 582 583 /// Indicates whether or not we need to set up our own private segment size. 584 bool usesDynamicStack() const { return DynamicStack; } 585 586 private: 587 /// The kernel object to execute. 588 uint64_t KernelObject; 589 590 /// The args, group and private segments sizes required by a kernel instance. 591 uint32_t ArgsSize; 592 uint32_t GroupSize; 593 uint32_t PrivateSize; 594 bool DynamicStack; 595 596 /// The size of implicit kernel arguments. 597 uint32_t ImplicitArgsSize; 598 599 /// Additional Info for the AMD GPU Kernel 600 std::optional<offloading::amdgpu::AMDGPUKernelMetaData> KernelInfo; 601 }; 602 603 /// Class representing an HSA signal. Signals are used to define dependencies 604 /// between asynchronous operations: kernel launches and memory transfers. 605 struct AMDGPUSignalTy { 606 /// Create an empty signal. 607 AMDGPUSignalTy() : HSASignal({0}), UseCount() {} 608 AMDGPUSignalTy(AMDGPUDeviceTy &Device) : HSASignal({0}), UseCount() {} 609 610 /// Initialize the signal with an initial value. 611 Error init(uint32_t InitialValue = 1) { 612 hsa_status_t Status = 613 hsa_amd_signal_create(InitialValue, 0, nullptr, 0, &HSASignal); 614 return Plugin::check(Status, "Error in hsa_signal_create: %s"); 615 } 616 617 /// Deinitialize the signal. 618 Error deinit() { 619 hsa_status_t Status = hsa_signal_destroy(HSASignal); 620 return Plugin::check(Status, "Error in hsa_signal_destroy: %s"); 621 } 622 623 /// Wait until the signal gets a zero value. 624 Error wait(const uint64_t ActiveTimeout = 0, 625 GenericDeviceTy *Device = nullptr) const { 626 if (ActiveTimeout) { 627 hsa_signal_value_t Got = 1; 628 Got = hsa_signal_wait_scacquire(HSASignal, HSA_SIGNAL_CONDITION_EQ, 0, 629 ActiveTimeout, HSA_WAIT_STATE_ACTIVE); 630 if (Got == 0) 631 return Plugin::success(); 632 } 633 634 // If there is an RPC device attached to this stream we run it as a server. 
635 uint64_t Timeout = UINT64_MAX; 636 auto WaitState = HSA_WAIT_STATE_BLOCKED; 637 while (hsa_signal_wait_scacquire(HSASignal, HSA_SIGNAL_CONDITION_EQ, 0, 638 Timeout, WaitState) != 0) 639 ; 640 return Plugin::success(); 641 } 642 643 /// Load the value on the signal. 644 hsa_signal_value_t load() const { 645 return hsa_signal_load_scacquire(HSASignal); 646 } 647 648 /// Signal decrementing by one. 649 void signal() { 650 assert(load() > 0 && "Invalid signal value"); 651 hsa_signal_subtract_screlease(HSASignal, 1); 652 } 653 654 /// Reset the signal value before reusing the signal. Do not call this 655 /// function if the signal is being currently used by any watcher, such as a 656 /// plugin thread or the HSA runtime. 657 void reset() { hsa_signal_store_screlease(HSASignal, 1); } 658 659 /// Increase the number of concurrent uses. 660 void increaseUseCount() { UseCount.increase(); } 661 662 /// Decrease the number of concurrent uses and return whether was the last. 663 bool decreaseUseCount() { return UseCount.decrease(); } 664 665 hsa_signal_t get() const { return HSASignal; } 666 667 private: 668 /// The underlying HSA signal. 669 hsa_signal_t HSASignal; 670 671 /// Reference counter for tracking the concurrent use count. This is mainly 672 /// used for knowing how many streams are using the signal. 673 RefCountTy<> UseCount; 674 }; 675 676 /// Classes for holding AMDGPU signals and managing signals. 677 using AMDGPUSignalRef = AMDGPUResourceRef<AMDGPUSignalTy>; 678 using AMDGPUSignalManagerTy = GenericDeviceResourceManagerTy<AMDGPUSignalRef>; 679 680 /// Class holding an HSA queue to submit kernel and barrier packets. 681 struct AMDGPUQueueTy { 682 /// Create an empty queue. 683 AMDGPUQueueTy() : Queue(nullptr), Mutex(), NumUsers(0) {} 684 685 /// Lazily initialize a new queue belonging to a specific agent. 686 Error init(GenericDeviceTy &Device, hsa_agent_t Agent, int32_t QueueSize) { 687 if (Queue) 688 return Plugin::success(); 689 hsa_status_t Status = 690 hsa_queue_create(Agent, QueueSize, HSA_QUEUE_TYPE_MULTI, callbackError, 691 &Device, UINT32_MAX, UINT32_MAX, &Queue); 692 return Plugin::check(Status, "Error in hsa_queue_create: %s"); 693 } 694 695 /// Deinitialize the queue and destroy its resources. 696 Error deinit() { 697 std::lock_guard<std::mutex> Lock(Mutex); 698 if (!Queue) 699 return Plugin::success(); 700 hsa_status_t Status = hsa_queue_destroy(Queue); 701 return Plugin::check(Status, "Error in hsa_queue_destroy: %s"); 702 } 703 704 /// Returns the number of streams, this queue is currently assigned to. 705 bool getUserCount() const { return NumUsers; } 706 707 /// Returns if the underlying HSA queue is initialized. 708 bool isInitialized() { return Queue != nullptr; } 709 710 /// Decrement user count of the queue object. 711 void removeUser() { --NumUsers; } 712 713 /// Increase user count of the queue object. 714 void addUser() { ++NumUsers; } 715 716 /// Push a kernel launch to the queue. The kernel launch requires an output 717 /// signal and can define an optional input signal (nullptr if none). 718 Error pushKernelLaunch(const AMDGPUKernelTy &Kernel, void *KernelArgs, 719 uint32_t NumThreads[3], uint32_t NumBlocks[3], 720 uint32_t GroupSize, uint64_t StackSize, 721 AMDGPUSignalTy *OutputSignal, 722 AMDGPUSignalTy *InputSignal) { 723 assert(OutputSignal && "Invalid kernel output signal"); 724 725 // Lock the queue during the packet publishing process. Notice this blocks 726 // the addition of other packets to the queue. 
The following piece of code
727 // should be lightweight; do not block the thread, allocate memory, etc.
728 std::lock_guard<std::mutex> Lock(Mutex);
729 assert(Queue && "Interacted with a non-initialized queue!");
730
731 // Add a barrier packet before the kernel packet in case there is a pending
732 // preceding operation. The barrier packet will delay the processing of
733 // subsequent queue's packets until the barrier's input signals are satisfied.
734 // No output signal is needed because the dependency is already guaranteed
735 // by the queue barrier itself.
736 if (InputSignal && InputSignal->load())
737 if (auto Err = pushBarrierImpl(nullptr, InputSignal))
738 return Err;
739
740 // Now prepare the kernel packet.
741 uint64_t PacketId;
742 hsa_kernel_dispatch_packet_t *Packet = acquirePacket(PacketId);
743 assert(Packet && "Invalid packet");
744
745 // The first 32 bits of the packet are written after the other fields.
746 uint16_t Dims = NumBlocks[2] * NumThreads[2] > 1
747 ? 3
748 : 1 + (NumBlocks[1] * NumThreads[1] != 1);
749 uint16_t Setup = UINT16_C(Dims)
750 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
751 Packet->workgroup_size_x = NumThreads[0];
752 Packet->workgroup_size_y = NumThreads[1];
753 Packet->workgroup_size_z = NumThreads[2];
754 Packet->reserved0 = 0;
755 Packet->grid_size_x = NumBlocks[0] * NumThreads[0];
756 Packet->grid_size_y = NumBlocks[1] * NumThreads[1];
757 Packet->grid_size_z = NumBlocks[2] * NumThreads[2];
758 Packet->private_segment_size =
759 Kernel.usesDynamicStack() ? StackSize : Kernel.getPrivateSize();
760 Packet->group_segment_size = GroupSize;
761 Packet->kernel_object = Kernel.getKernelObject();
762 Packet->kernarg_address = KernelArgs;
763 Packet->reserved2 = 0;
764 Packet->completion_signal = OutputSignal->get();
765
766 // Publish the packet. Do not modify the packet after this point.
767 publishKernelPacket(PacketId, Setup, Packet);
768
769 return Plugin::success();
770 }
771
772 /// Push a barrier packet that will wait on up to two input signals. All
773 /// signals are optional (nullptr if none).
774 Error pushBarrier(AMDGPUSignalTy *OutputSignal,
775 const AMDGPUSignalTy *InputSignal1,
776 const AMDGPUSignalTy *InputSignal2) {
777 // Lock the queue during the packet publishing process.
778 std::lock_guard<std::mutex> Lock(Mutex);
779 assert(Queue && "Interacted with a non-initialized queue!");
780
781 // Push the barrier with the lock acquired.
782 return pushBarrierImpl(OutputSignal, InputSignal1, InputSignal2);
783 }
784
785 private:
786 /// Push a barrier packet that will wait on up to two input signals. Assumes
787 /// the queue lock is acquired.
788 Error pushBarrierImpl(AMDGPUSignalTy *OutputSignal,
789 const AMDGPUSignalTy *InputSignal1,
790 const AMDGPUSignalTy *InputSignal2 = nullptr) {
791 // Add a queue barrier waiting on both the other stream's operation and the
792 // last operation on the current stream (if any).
793 uint64_t PacketId;
794 hsa_barrier_and_packet_t *Packet =
795 (hsa_barrier_and_packet_t *)acquirePacket(PacketId);
796 assert(Packet && "Invalid packet");
797
798 Packet->reserved0 = 0;
799 Packet->reserved1 = 0;
800 Packet->dep_signal[0] = {0};
801 Packet->dep_signal[1] = {0};
802 Packet->dep_signal[2] = {0};
803 Packet->dep_signal[3] = {0};
804 Packet->dep_signal[4] = {0};
805 Packet->reserved2 = 0;
806 Packet->completion_signal = {0};
807
808 // Set input and output dependencies if needed.
809 if (OutputSignal)
810 Packet->completion_signal = OutputSignal->get();
811 if (InputSignal1)
812 Packet->dep_signal[0] = InputSignal1->get();
813 if (InputSignal2)
814 Packet->dep_signal[1] = InputSignal2->get();
815
816 // Publish the packet. Do not modify the packet after this point.
817 publishBarrierPacket(PacketId, Packet);
818
819 return Plugin::success();
820 }
821
822 /// Acquire a packet from the queue. This call may block the thread if there
823 /// is no space in the underlying HSA queue. It may need to wait until the HSA
824 /// runtime processes some packets. Assumes the queue lock is acquired.
825 hsa_kernel_dispatch_packet_t *acquirePacket(uint64_t &PacketId) {
826 // Increase the queue index with relaxed memory order. Notice this will need
827 // another subsequent atomic operation with acquire order.
828 PacketId = hsa_queue_add_write_index_relaxed(Queue, 1);
829
830 // Wait for the packet to be available. Notice the atomic operation uses
831 // the acquire memory order.
832 while (PacketId - hsa_queue_load_read_index_scacquire(Queue) >= Queue->size)
833 ;
834
835 // Return the packet reference.
836 const uint32_t Mask = Queue->size - 1; // The size is a power of 2.
837 return (hsa_kernel_dispatch_packet_t *)Queue->base_address +
838 (PacketId & Mask);
839 }
840
841 /// Publish the kernel packet so that the HSA runtime can start processing
842 /// the kernel launch. Do not modify the packet once this function is called.
843 /// Assumes the queue lock is acquired.
844 void publishKernelPacket(uint64_t PacketId, uint16_t Setup,
845 hsa_kernel_dispatch_packet_t *Packet) {
846 uint32_t *PacketPtr = reinterpret_cast<uint32_t *>(Packet);
847
848 uint16_t Header = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
849 Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
850 Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
851
852 // Publish the packet. Do not modify the packet after this point.
853 uint32_t HeaderWord = Header | (Setup << 16u);
854 __atomic_store_n(PacketPtr, HeaderWord, __ATOMIC_RELEASE);
855
856 // Signal the doorbell about the published packet.
857 hsa_signal_store_relaxed(Queue->doorbell_signal, PacketId);
858 }
859
860 /// Publish the barrier packet so that the HSA runtime can start processing
861 /// the barrier. Next packets in the queue will not be processed until all
862 /// barrier dependencies (signals) are satisfied. Assumes the queue is locked.
863 void publishBarrierPacket(uint64_t PacketId,
864 hsa_barrier_and_packet_t *Packet) {
865 uint32_t *PacketPtr = reinterpret_cast<uint32_t *>(Packet);
866 uint16_t Setup = 0;
867 uint16_t Header = HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE;
868 Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
869 Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
870
871 // Publish the packet. Do not modify the packet after this point.
872 uint32_t HeaderWord = Header | (Setup << 16u);
873 __atomic_store_n(PacketPtr, HeaderWord, __ATOMIC_RELEASE);
874
875 // Signal the doorbell about the published packet.
876 hsa_signal_store_relaxed(Queue->doorbell_signal, PacketId);
877 }
878
879 /// Callback that will be called when an error is detected on the HSA queue.
880 static void callbackError(hsa_status_t Status, hsa_queue_t *Source,
881 void *Data);
882
883 /// The HSA queue.
884 hsa_queue_t *Queue;
885
886 /// Mutex to protect the acquiring and publishing of packets.
For the moment, 887 /// we need this mutex to prevent publishing packets that are not ready to be 888 /// published in a multi-thread scenario. Without a queue lock, a thread T1 889 /// could acquire packet P and thread T2 acquire packet P+1. Thread T2 could 890 /// publish its packet P+1 (signaling the queue's doorbell) before packet P 891 /// from T1 is ready to be processed. That scenario should be invalid. Thus, 892 /// we use the following mutex to make packet acquiring and publishing atomic. 893 /// TODO: There are other more advanced approaches to avoid this mutex using 894 /// atomic operations. We can further investigate it if this is a bottleneck. 895 std::mutex Mutex; 896 897 /// The number of streams, this queue is currently assigned to. A queue is 898 /// considered idle when this is zero, otherwise: busy. 899 uint32_t NumUsers; 900 }; 901 902 /// Struct that implements a stream of asynchronous operations for AMDGPU 903 /// devices. This class relies on signals to implement streams and define the 904 /// dependencies between asynchronous operations. 905 struct AMDGPUStreamTy { 906 private: 907 /// Utility struct holding arguments for async H2H memory copies. 908 struct MemcpyArgsTy { 909 void *Dst; 910 const void *Src; 911 size_t Size; 912 }; 913 914 /// Utility struct holding arguments for freeing buffers to memory managers. 915 struct ReleaseBufferArgsTy { 916 void *Buffer; 917 AMDGPUMemoryManagerTy *MemoryManager; 918 }; 919 920 /// Utility struct holding arguments for releasing signals to signal managers. 921 struct ReleaseSignalArgsTy { 922 AMDGPUSignalTy *Signal; 923 AMDGPUSignalManagerTy *SignalManager; 924 }; 925 926 using AMDGPUStreamCallbackTy = Error(void *Data); 927 928 /// The stream is composed of N stream's slots. The struct below represents 929 /// the fields of each slot. Each slot has a signal and an optional action 930 /// function. When appending an HSA asynchronous operation to the stream, one 931 /// slot is consumed and used to store the operation's information. The 932 /// operation's output signal is set to the consumed slot's signal. If there 933 /// is a previous asynchronous operation on the previous slot, the HSA async 934 /// operation's input signal is set to the signal of the previous slot. This 935 /// way, we obtain a chain of dependant async operations. The action is a 936 /// function that will be executed eventually after the operation is 937 /// completed, e.g., for releasing a buffer. 938 struct StreamSlotTy { 939 /// The output signal of the stream operation. May be used by the subsequent 940 /// operation as input signal. 941 AMDGPUSignalTy *Signal; 942 943 /// The actions that must be performed after the operation's completion. Set 944 /// to nullptr when there is no action to perform. 945 llvm::SmallVector<AMDGPUStreamCallbackTy *> Callbacks; 946 947 /// Space for the action's arguments. A pointer to these arguments is passed 948 /// to the action function. Notice the space of arguments is limited. 949 union ActionArgsTy { 950 MemcpyArgsTy MemcpyArgs; 951 ReleaseBufferArgsTy ReleaseBufferArgs; 952 ReleaseSignalArgsTy ReleaseSignalArgs; 953 void *CallbackArgs; 954 }; 955 956 llvm::SmallVector<ActionArgsTy> ActionArgs; 957 958 /// Create an empty slot. 959 StreamSlotTy() : Signal(nullptr), Callbacks({}), ActionArgs({}) {} 960 961 /// Schedule a host memory copy action on the slot. 
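/// The copy itself typically runs later on the async handler's thread via
/// memcpyAction (see asyncActionCallback below).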
962 Error schedHostMemoryCopy(void *Dst, const void *Src, size_t Size) { 963 Callbacks.emplace_back(memcpyAction); 964 ActionArgs.emplace_back().MemcpyArgs = MemcpyArgsTy{Dst, Src, Size}; 965 return Plugin::success(); 966 } 967 968 /// Schedule a release buffer action on the slot. 969 Error schedReleaseBuffer(void *Buffer, AMDGPUMemoryManagerTy &Manager) { 970 Callbacks.emplace_back(releaseBufferAction); 971 ActionArgs.emplace_back().ReleaseBufferArgs = 972 ReleaseBufferArgsTy{Buffer, &Manager}; 973 return Plugin::success(); 974 } 975 976 /// Schedule a signal release action on the slot. 977 Error schedReleaseSignal(AMDGPUSignalTy *SignalToRelease, 978 AMDGPUSignalManagerTy *SignalManager) { 979 Callbacks.emplace_back(releaseSignalAction); 980 ActionArgs.emplace_back().ReleaseSignalArgs = 981 ReleaseSignalArgsTy{SignalToRelease, SignalManager}; 982 return Plugin::success(); 983 } 984 985 /// Register a callback to be called on compleition 986 Error schedCallback(AMDGPUStreamCallbackTy *Func, void *Data) { 987 Callbacks.emplace_back(Func); 988 ActionArgs.emplace_back().CallbackArgs = Data; 989 990 return Plugin::success(); 991 } 992 993 // Perform the action if needed. 994 Error performAction() { 995 if (Callbacks.empty()) 996 return Plugin::success(); 997 998 assert(Callbacks.size() == ActionArgs.size() && "Size mismatch"); 999 for (auto [Callback, ActionArg] : llvm::zip(Callbacks, ActionArgs)) { 1000 // Perform the action. 1001 if (Callback == memcpyAction) { 1002 if (auto Err = memcpyAction(&ActionArg)) 1003 return Err; 1004 } else if (Callback == releaseBufferAction) { 1005 if (auto Err = releaseBufferAction(&ActionArg)) 1006 return Err; 1007 } else if (Callback == releaseSignalAction) { 1008 if (auto Err = releaseSignalAction(&ActionArg)) 1009 return Err; 1010 } else if (Callback) { 1011 if (auto Err = Callback(ActionArg.CallbackArgs)) 1012 return Err; 1013 } 1014 } 1015 1016 // Invalidate the action. 1017 Callbacks.clear(); 1018 ActionArgs.clear(); 1019 1020 return Plugin::success(); 1021 } 1022 }; 1023 1024 /// The device agent where the stream was created. 1025 hsa_agent_t Agent; 1026 1027 /// The queue that the stream uses to launch kernels. 1028 AMDGPUQueueTy *Queue; 1029 1030 /// The manager of signals to reuse signals. 1031 AMDGPUSignalManagerTy &SignalManager; 1032 1033 /// A reference to the associated device. 1034 GenericDeviceTy &Device; 1035 1036 /// Array of stream slots. Use std::deque because it can dynamically grow 1037 /// without invalidating the already inserted elements. For instance, the 1038 /// std::vector may invalidate the elements by reallocating the internal 1039 /// array if there is not enough space on new insertions. 1040 std::deque<StreamSlotTy> Slots; 1041 1042 /// The next available slot on the queue. This is reset to zero each time the 1043 /// stream is synchronized. It also indicates the current number of consumed 1044 /// slots at a given time. 1045 uint32_t NextSlot; 1046 1047 /// The synchronization id. This number is increased each time the stream is 1048 /// synchronized. It is useful to detect if an AMDGPUEventTy points to an 1049 /// operation that was already finalized in a previous stream sycnhronize. 1050 uint32_t SyncCycle; 1051 1052 /// Mutex to protect stream's management. 
1053 mutable std::mutex Mutex; 1054 1055 /// Timeout hint for HSA actively waiting for signal value to change 1056 const uint64_t StreamBusyWaitMicroseconds; 1057 1058 /// Indicate to spread data transfers across all avilable SDMAs 1059 bool UseMultipleSdmaEngines; 1060 1061 /// Return the current number of asychronous operations on the stream. 1062 uint32_t size() const { return NextSlot; } 1063 1064 /// Return the last valid slot on the stream. 1065 uint32_t last() const { return size() - 1; } 1066 1067 /// Consume one slot from the stream. Since the stream uses signals on demand 1068 /// and releases them once the slot is no longer used, the function requires 1069 /// an idle signal for the new consumed slot. 1070 std::pair<uint32_t, AMDGPUSignalTy *> consume(AMDGPUSignalTy *OutputSignal) { 1071 // Double the stream size if needed. Since we use std::deque, this operation 1072 // does not invalidate the already added slots. 1073 if (Slots.size() == NextSlot) 1074 Slots.resize(Slots.size() * 2); 1075 1076 // Update the next available slot and the stream size. 1077 uint32_t Curr = NextSlot++; 1078 1079 // Retrieve the input signal, if any, of the current operation. 1080 AMDGPUSignalTy *InputSignal = (Curr > 0) ? Slots[Curr - 1].Signal : nullptr; 1081 1082 // Set the output signal of the current slot. 1083 Slots[Curr].Signal = OutputSignal; 1084 1085 return std::make_pair(Curr, InputSignal); 1086 } 1087 1088 /// Complete all pending post actions and reset the stream after synchronizing 1089 /// or positively querying the stream. 1090 Error complete() { 1091 for (uint32_t Slot = 0; Slot < NextSlot; ++Slot) { 1092 // Take the post action of the operation if any. 1093 if (auto Err = Slots[Slot].performAction()) 1094 return Err; 1095 1096 // Release the slot's signal if possible. Otherwise, another user will. 1097 if (Slots[Slot].Signal->decreaseUseCount()) 1098 if (auto Err = SignalManager.returnResource(Slots[Slot].Signal)) 1099 return Err; 1100 1101 Slots[Slot].Signal = nullptr; 1102 } 1103 1104 // Reset the stream slots to zero. 1105 NextSlot = 0; 1106 1107 // Increase the synchronization id since the stream completed a sync cycle. 1108 SyncCycle += 1; 1109 1110 return Plugin::success(); 1111 } 1112 1113 /// Make the current stream wait on a specific operation of another stream. 1114 /// The idea is to make the current stream waiting on two signals: 1) the last 1115 /// signal of the current stream, and 2) the last signal of the other stream. 1116 /// Use a barrier packet with two input signals. 1117 Error waitOnStreamOperation(AMDGPUStreamTy &OtherStream, uint32_t Slot) { 1118 if (Queue == nullptr) 1119 return Plugin::error("Target queue was nullptr"); 1120 1121 /// The signal that we must wait from the other stream. 1122 AMDGPUSignalTy *OtherSignal = OtherStream.Slots[Slot].Signal; 1123 1124 // Prevent the release of the other stream's signal. 1125 OtherSignal->increaseUseCount(); 1126 1127 // Retrieve an available signal for the operation's output. 1128 AMDGPUSignalTy *OutputSignal = nullptr; 1129 if (auto Err = SignalManager.getResource(OutputSignal)) 1130 return Err; 1131 OutputSignal->reset(); 1132 OutputSignal->increaseUseCount(); 1133 1134 // Consume stream slot and compute dependencies. 1135 auto [Curr, InputSignal] = consume(OutputSignal); 1136 1137 // Setup the post action to release the signal. 1138 if (auto Err = Slots[Curr].schedReleaseSignal(OtherSignal, &SignalManager)) 1139 return Err; 1140 1141 // Push a barrier into the queue with both input signals. 
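// The barrier waits on both this stream's previous operation (InputSignal,
// if any) and the other stream's recorded operation (OtherSignal).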
1142 return Queue->pushBarrier(OutputSignal, InputSignal, OtherSignal);
1143 }
1144
1145 /// Callback for running a specific asynchronous operation. This callback is
1146 /// used for hsa_amd_signal_async_handler. The argument is the operation that
1147 /// should be executed. Notice we use the post action mechanism to codify the
1148 /// asynchronous operation.
1149 static bool asyncActionCallback(hsa_signal_value_t Value, void *Args) {
1150 StreamSlotTy *Slot = reinterpret_cast<StreamSlotTy *>(Args);
1151 assert(Slot && "Invalid slot");
1152 assert(Slot->Signal && "Invalid signal");
1153
1154 // This thread is outside the stream mutex. Make sure the thread sees the
1155 // changes on the slot.
1156 std::atomic_thread_fence(std::memory_order_acquire);
1157
1158 // Perform the operation.
1159 if (auto Err = Slot->performAction())
1160 FATAL_MESSAGE(1, "Error performing post action: %s",
1161 toString(std::move(Err)).data());
1162
1163 // Signal the output signal to notify that the asynchronous operation finalized.
1164 Slot->Signal->signal();
1165
1166 // Unregister callback.
1167 return false;
1168 }
1169
1170 // Callback for host-to-host memory copies. This is an asynchronous action.
1171 static Error memcpyAction(void *Data) {
1172 MemcpyArgsTy *Args = reinterpret_cast<MemcpyArgsTy *>(Data);
1173 assert(Args && "Invalid arguments");
1174 assert(Args->Dst && "Invalid destination buffer");
1175 assert(Args->Src && "Invalid source buffer");
1176
1177 std::memcpy(Args->Dst, Args->Src, Args->Size);
1178
1179 return Plugin::success();
1180 }
1181
1182 /// Releasing a memory buffer to a memory manager. This is a post completion
1183 /// action. There are three kinds of memory buffers:
1184 /// 1. For kernel arguments. This buffer can be freed after receiving the
1185 /// kernel completion signal.
1186 /// 2. For H2D transfers that need pinned memory space for staging. This
1187 /// buffer can be freed after receiving the transfer completion signal.
1188 /// 3. For D2H transfers that need pinned memory space for staging. This
1189 /// buffer cannot be freed after receiving the transfer completion signal
1190 /// because of the following asynchronous H2H callback.
1191 /// For this reason, this action can only be taken at
1192 /// AMDGPUStreamTy::complete().
1193 /// Because of case 3, all releaseBufferActions are taken at
1194 /// AMDGPUStreamTy::complete() in the current implementation.
1195 static Error releaseBufferAction(void *Data) {
1196 ReleaseBufferArgsTy *Args = reinterpret_cast<ReleaseBufferArgsTy *>(Data);
1197 assert(Args && "Invalid arguments");
1198 assert(Args->MemoryManager && "Invalid memory manager");
1199 assert(Args->Buffer && "Invalid buffer");
1200
1201 // Release the allocation to the memory manager.
1202 return Args->MemoryManager->deallocate(Args->Buffer);
1203 }
1204
1205 /// Releasing a signal object back to SignalManager. This is a post completion
1206 /// action. This action can only be taken at AMDGPUStreamTy::complete().
1207 static Error releaseSignalAction(void *Data) {
1208 ReleaseSignalArgsTy *Args = reinterpret_cast<ReleaseSignalArgsTy *>(Data);
1209 assert(Args && "Invalid arguments");
1210 assert(Args->Signal && "Invalid signal");
1211 assert(Args->SignalManager && "Invalid signal manager");
1212
1213 // Release the signal if needed.
1214 if (Args->Signal->decreaseUseCount())
1215 if (auto Err = Args->SignalManager->returnResource(Args->Signal))
1216 return Err;
1217
1218 return Plugin::success();
1219 }
1220
1221 public:
1222 /// Create an empty stream associated with a specific device.
1223 AMDGPUStreamTy(AMDGPUDeviceTy &Device);
1224
1225 /// Initialize the stream's signals.
1226 Error init() { return Plugin::success(); }
1227
1228 /// Deinitialize the stream's signals.
1229 Error deinit() { return Plugin::success(); }
1230
1231 /// Push an asynchronous kernel to the stream. The kernel arguments must be
1232 /// placed in a special allocation for kernel args and must be kept alive until
1233 /// the kernel finalizes. Once the kernel is finished, the stream will release
1234 /// the kernel args buffer to the specified memory manager.
1235 Error pushKernelLaunch(const AMDGPUKernelTy &Kernel, void *KernelArgs,
1236 uint32_t NumThreads[3], uint32_t NumBlocks[3],
1237 uint32_t GroupSize, uint64_t StackSize,
1238 AMDGPUMemoryManagerTy &MemoryManager) {
1239 if (Queue == nullptr)
1240 return Plugin::error("Target queue was nullptr");
1241
1242 // Retrieve an available signal for the operation's output.
1243 AMDGPUSignalTy *OutputSignal = nullptr;
1244 if (auto Err = SignalManager.getResource(OutputSignal))
1245 return Err;
1246 OutputSignal->reset();
1247 OutputSignal->increaseUseCount();
1248
1249 std::lock_guard<std::mutex> StreamLock(Mutex);
1250
1251 // Consume stream slot and compute dependencies.
1252 auto [Curr, InputSignal] = consume(OutputSignal);
1253
1254 // Setup the post action to release the kernel args buffer.
1255 if (auto Err = Slots[Curr].schedReleaseBuffer(KernelArgs, MemoryManager))
1256 return Err;
1257
1258 // If we are running an RPC server we want to wake up the server thread
1259 // whenever there is a kernel running and let it sleep otherwise.
1260 if (Device.getRPCServer())
1261 Device.Plugin.getRPCServer().Thread->notify();
1262
1263 // Push the kernel with the output signal and an optional input signal.
1264 if (auto Err = Queue->pushKernelLaunch(Kernel, KernelArgs, NumThreads,
1265 NumBlocks, GroupSize, StackSize,
1266 OutputSignal, InputSignal))
1267 return Err;
1268
1269 // Register a callback to indicate when the kernel is complete.
1270 if (Device.getRPCServer()) {
1271 if (auto Err = Slots[Curr].schedCallback(
1272 [](void *Data) -> llvm::Error {
1273 GenericPluginTy &Plugin =
1274 *reinterpret_cast<GenericPluginTy *>(Data);
1275 Plugin.getRPCServer().Thread->finish();
1276 return Error::success();
1277 },
1278 &Device.Plugin))
1279 return Err;
1280 }
1281 return Plugin::success();
1282 }
1283
1284 /// Push an asynchronous memory copy between pinned memory buffers.
1285 Error pushPinnedMemoryCopyAsync(void *Dst, const void *Src,
1286 uint64_t CopySize) {
1287 // Retrieve an available signal for the operation's output.
1288 AMDGPUSignalTy *OutputSignal = nullptr;
1289 if (auto Err = SignalManager.getResource(OutputSignal))
1290 return Err;
1291 OutputSignal->reset();
1292 OutputSignal->increaseUseCount();
1293
1294 std::lock_guard<std::mutex> Lock(Mutex);
1295
1296 // Consume stream slot and compute dependencies.
1297 auto [Curr, InputSignal] = consume(OutputSignal);
1298
1299 // Issue the async memory copy.
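// Avoid adding the input dependency if the preceding operation has already
// completed, i.e., its signal has already reached zero.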
1300 if (InputSignal && InputSignal->load()) {
1301 hsa_signal_t InputSignalRaw = InputSignal->get();
1302 return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Src,
1303 Agent, CopySize, 1, &InputSignalRaw,
1304 OutputSignal->get());
1305 }
1306
1307 return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Src,
1308 Agent, CopySize, 0, nullptr,
1309 OutputSignal->get());
1310 }
1311
1312 /// Push an asynchronous memory copy device-to-host involving an unpinned
1313 /// memory buffer. The operation consists of a two-step copy from the
1314 /// device buffer to an intermediate pinned host buffer, and then to an
1315 /// unpinned host buffer. Both operations are asynchronous and dependent.
1316 /// The intermediate pinned buffer will be released to the specified memory
1317 /// manager once the operation completes.
1318 Error pushMemoryCopyD2HAsync(void *Dst, const void *Src, void *Inter,
1319 uint64_t CopySize,
1320 AMDGPUMemoryManagerTy &MemoryManager) {
1321 // Retrieve available signals for the operation's outputs.
1322 AMDGPUSignalTy *OutputSignals[2] = {};
1323 if (auto Err = SignalManager.getResources(/*Num=*/2, OutputSignals))
1324 return Err;
1325 for (auto *Signal : OutputSignals) {
1326 Signal->reset();
1327 Signal->increaseUseCount();
1328 }
1329
1330 std::lock_guard<std::mutex> Lock(Mutex);
1331
1332 // Consume stream slot and compute dependencies.
1333 auto [Curr, InputSignal] = consume(OutputSignals[0]);
1334
1335 // Setup the post action for releasing the intermediate buffer.
1336 if (auto Err = Slots[Curr].schedReleaseBuffer(Inter, MemoryManager))
1337 return Err;
1338
1339 // Issue the first step: device to host transfer. Avoid defining the input
1340 // dependency if already satisfied.
1341 if (InputSignal && InputSignal->load()) {
1342 hsa_signal_t InputSignalRaw = InputSignal->get();
1343 if (auto Err = hsa_utils::asyncMemCopy(
1344 UseMultipleSdmaEngines, Inter, Agent, Src, Agent, CopySize, 1,
1345 &InputSignalRaw, OutputSignals[0]->get()))
1346 return Err;
1347 } else {
1348 if (auto Err = hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Inter,
1349 Agent, Src, Agent, CopySize, 0,
1350 nullptr, OutputSignals[0]->get()))
1351 return Err;
1352 }
1353
1354 // Consume another stream slot and compute dependencies.
1355 std::tie(Curr, InputSignal) = consume(OutputSignals[1]);
1356 assert(InputSignal && "Invalid input signal");
1357
1358 // The std::memcpy is done asynchronously using an async handler. We store
1359 // the function's information in the action, but it is not actually a post action.
1360 if (auto Err = Slots[Curr].schedHostMemoryCopy(Dst, Inter, CopySize))
1361 return Err;
1362
1363 // Make changes on this slot visible to the async handler's thread.
1364 std::atomic_thread_fence(std::memory_order_release);
1365
1366 // Issue the second step: host to host transfer.
1367 hsa_status_t Status = hsa_amd_signal_async_handler(
1368 InputSignal->get(), HSA_SIGNAL_CONDITION_EQ, 0, asyncActionCallback,
1369 (void *)&Slots[Curr]);
1370
1371 return Plugin::check(Status, "Error in hsa_amd_signal_async_handler: %s");
1372 }
1373
1374 /// Push an asynchronous memory copy host-to-device involving an unpinned
1375 /// memory buffer. The operation consists of a two-step copy from the
1376 /// unpinned host buffer to an intermediate pinned host buffer, and then to
1377 /// the device buffer. Both operations are asynchronous and dependent.
1378 /// The intermediate pinned buffer will be released to the specified memory 1379 /// manager once the operation completes. 1380 Error pushMemoryCopyH2DAsync(void *Dst, const void *Src, void *Inter, 1381 uint64_t CopySize, 1382 AMDGPUMemoryManagerTy &MemoryManager) { 1383 // Retrieve available signals for the operation's outputs. 1384 AMDGPUSignalTy *OutputSignals[2] = {}; 1385 if (auto Err = SignalManager.getResources(/*Num=*/2, OutputSignals)) 1386 return Err; 1387 for (auto *Signal : OutputSignals) { 1388 Signal->reset(); 1389 Signal->increaseUseCount(); 1390 } 1391 1392 AMDGPUSignalTy *OutputSignal = OutputSignals[0]; 1393 1394 std::lock_guard<std::mutex> Lock(Mutex); 1395 1396 // Consume stream slot and compute dependencies. 1397 auto [Curr, InputSignal] = consume(OutputSignal); 1398 1399 // Issue the first step: host to host transfer. 1400 if (InputSignal && InputSignal->load()) { 1401 // The std::memcpy is done asynchronously using an async handler. We store 1402 // the function's information in the action but it is not actually a 1403 // post action. 1404 if (auto Err = Slots[Curr].schedHostMemoryCopy(Inter, Src, CopySize)) 1405 return Err; 1406 1407 // Make changes on this slot visible to the async handler's thread. 1408 std::atomic_thread_fence(std::memory_order_release); 1409 1410 hsa_status_t Status = hsa_amd_signal_async_handler( 1411 InputSignal->get(), HSA_SIGNAL_CONDITION_EQ, 0, asyncActionCallback, 1412 (void *)&Slots[Curr]); 1413 1414 if (auto Err = Plugin::check(Status, 1415 "Error in hsa_amd_signal_async_handler: %s")) 1416 return Err; 1417 1418 // Let's use now the second output signal. 1419 OutputSignal = OutputSignals[1]; 1420 1421 // Consume another stream slot and compute dependencies. 1422 std::tie(Curr, InputSignal) = consume(OutputSignal); 1423 } else { 1424 // All preceding operations completed, copy the memory synchronously. 1425 std::memcpy(Inter, Src, CopySize); 1426 1427 // Return the second signal because it will not be used. 1428 OutputSignals[1]->decreaseUseCount(); 1429 if (auto Err = SignalManager.returnResource(OutputSignals[1])) 1430 return Err; 1431 } 1432 1433 // Setup the post action to release the intermediate pinned buffer. 1434 if (auto Err = Slots[Curr].schedReleaseBuffer(Inter, MemoryManager)) 1435 return Err; 1436 1437 // Issue the second step: host to device transfer. Avoid defining the input 1438 // dependency if already satisfied. 1439 if (InputSignal && InputSignal->load()) { 1440 hsa_signal_t InputSignalRaw = InputSignal->get(); 1441 return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Inter, 1442 Agent, CopySize, 1, &InputSignalRaw, 1443 OutputSignal->get()); 1444 } 1445 return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Inter, 1446 Agent, CopySize, 0, nullptr, 1447 OutputSignal->get()); 1448 } 1449 1450 // AMDGPUDeviceTy is incomplete here, passing the underlying agent instead 1451 Error pushMemoryCopyD2DAsync(void *Dst, hsa_agent_t DstAgent, const void *Src, 1452 hsa_agent_t SrcAgent, uint64_t CopySize) { 1453 AMDGPUSignalTy *OutputSignal; 1454 if (auto Err = SignalManager.getResources(/*Num=*/1, &OutputSignal)) 1455 return Err; 1456 OutputSignal->reset(); 1457 OutputSignal->increaseUseCount(); 1458 1459 std::lock_guard<std::mutex> Lock(Mutex); 1460 1461 // Consume stream slot and compute dependencies. 
1462 auto [Curr, InputSignal] = consume(OutputSignal); 1463 1464 // The agents need to have access to the corresponding memory 1465 // This is presently only true if the pointers were originally 1466 // allocated by this runtime or the caller made the appropriate 1467 // access calls. 1468 1469 if (InputSignal && InputSignal->load()) { 1470 hsa_signal_t InputSignalRaw = InputSignal->get(); 1471 return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, DstAgent, Src, 1472 SrcAgent, CopySize, 1, &InputSignalRaw, 1473 OutputSignal->get()); 1474 } 1475 return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, DstAgent, Src, 1476 SrcAgent, CopySize, 0, nullptr, 1477 OutputSignal->get()); 1478 } 1479 1480 /// Synchronize with the stream. The current thread waits until all operations 1481 /// are finalized and it performs the pending post actions (i.e., releasing 1482 /// intermediate buffers). 1483 Error synchronize() { 1484 std::lock_guard<std::mutex> Lock(Mutex); 1485 1486 // No need to synchronize anything. 1487 if (size() == 0) 1488 return Plugin::success(); 1489 1490 // Wait until all previous operations on the stream have completed. 1491 if (auto Err = 1492 Slots[last()].Signal->wait(StreamBusyWaitMicroseconds, &Device)) 1493 return Err; 1494 1495 // Reset the stream and perform all pending post actions. 1496 return complete(); 1497 } 1498 1499 /// Query the stream and complete pending post actions if operations finished. 1500 /// Return whether all the operations completed. This operation does not block 1501 /// the calling thread. 1502 Expected<bool> query() { 1503 std::lock_guard<std::mutex> Lock(Mutex); 1504 1505 // No need to query anything. 1506 if (size() == 0) 1507 return true; 1508 1509 // The last operation did not complete yet. Return directly. 1510 if (Slots[last()].Signal->load()) 1511 return false; 1512 1513 // Reset the stream and perform all pending post actions. 1514 if (auto Err = complete()) 1515 return std::move(Err); 1516 1517 return true; 1518 } 1519 1520 const AMDGPUQueueTy *getQueue() const { return Queue; } 1521 1522 /// Record the state of the stream on an event. 1523 Error recordEvent(AMDGPUEventTy &Event) const; 1524 1525 /// Make the stream wait on an event. 1526 Error waitEvent(const AMDGPUEventTy &Event); 1527 1528 friend struct AMDGPUStreamManagerTy; 1529 }; 1530 1531 /// Class representing an event on AMDGPU. The event basically stores some 1532 /// information regarding the state of the recorded stream. 1533 struct AMDGPUEventTy { 1534 /// Create an empty event. 1535 AMDGPUEventTy(AMDGPUDeviceTy &Device) 1536 : RecordedStream(nullptr), RecordedSlot(-1), RecordedSyncCycle(-1) {} 1537 1538 /// Initialize and deinitialize. 1539 Error init() { return Plugin::success(); } 1540 Error deinit() { return Plugin::success(); } 1541 1542 /// Record the state of a stream on the event. 1543 Error record(AMDGPUStreamTy &Stream) { 1544 std::lock_guard<std::mutex> Lock(Mutex); 1545 1546 // Ignore the last recorded stream. 1547 RecordedStream = &Stream; 1548 1549 return Stream.recordEvent(*this); 1550 } 1551 1552 /// Make a stream wait on the current event. 1553 Error wait(AMDGPUStreamTy &Stream) { 1554 std::lock_guard<std::mutex> Lock(Mutex); 1555 1556 if (!RecordedStream) 1557 return Plugin::error("Event does not have any recorded stream"); 1558 1559 // Synchronizing the same stream. Do nothing. 
1560 if (RecordedStream == &Stream) 1561 return Plugin::success(); 1562 1563 // No need to wait anything, the recorded stream already finished the 1564 // corresponding operation. 1565 if (RecordedSlot < 0) 1566 return Plugin::success(); 1567 1568 return Stream.waitEvent(*this); 1569 } 1570 1571 protected: 1572 /// The stream registered in this event. 1573 AMDGPUStreamTy *RecordedStream; 1574 1575 /// The recordered operation on the recorded stream. 1576 int64_t RecordedSlot; 1577 1578 /// The sync cycle when the stream was recorded. Used to detect stale events. 1579 int64_t RecordedSyncCycle; 1580 1581 /// Mutex to safely access event fields. 1582 mutable std::mutex Mutex; 1583 1584 friend struct AMDGPUStreamTy; 1585 }; 1586 1587 Error AMDGPUStreamTy::recordEvent(AMDGPUEventTy &Event) const { 1588 std::lock_guard<std::mutex> Lock(Mutex); 1589 1590 if (size() > 0) { 1591 // Record the synchronize identifier (to detect stale recordings) and 1592 // the last valid stream's operation. 1593 Event.RecordedSyncCycle = SyncCycle; 1594 Event.RecordedSlot = last(); 1595 1596 assert(Event.RecordedSyncCycle >= 0 && "Invalid recorded sync cycle"); 1597 assert(Event.RecordedSlot >= 0 && "Invalid recorded slot"); 1598 } else { 1599 // The stream is empty, everything already completed, record nothing. 1600 Event.RecordedSyncCycle = -1; 1601 Event.RecordedSlot = -1; 1602 } 1603 return Plugin::success(); 1604 } 1605 1606 Error AMDGPUStreamTy::waitEvent(const AMDGPUEventTy &Event) { 1607 // Retrieve the recorded stream on the event. 1608 AMDGPUStreamTy &RecordedStream = *Event.RecordedStream; 1609 1610 std::scoped_lock<std::mutex, std::mutex> Lock(Mutex, RecordedStream.Mutex); 1611 1612 // The recorded stream already completed the operation because the synchronize 1613 // identifier is already outdated. 1614 if (RecordedStream.SyncCycle != (uint32_t)Event.RecordedSyncCycle) 1615 return Plugin::success(); 1616 1617 // Again, the recorded stream already completed the operation, the last 1618 // operation's output signal is satisfied. 1619 if (!RecordedStream.Slots[Event.RecordedSlot].Signal->load()) 1620 return Plugin::success(); 1621 1622 // Otherwise, make the current stream wait on the other stream's operation. 1623 return waitOnStreamOperation(RecordedStream, Event.RecordedSlot); 1624 } 1625 1626 struct AMDGPUStreamManagerTy final 1627 : GenericDeviceResourceManagerTy<AMDGPUResourceRef<AMDGPUStreamTy>> { 1628 using ResourceRef = AMDGPUResourceRef<AMDGPUStreamTy>; 1629 using ResourcePoolTy = GenericDeviceResourceManagerTy<ResourceRef>; 1630 1631 AMDGPUStreamManagerTy(GenericDeviceTy &Device, hsa_agent_t HSAAgent) 1632 : GenericDeviceResourceManagerTy(Device), Device(Device), 1633 OMPX_QueueTracking("LIBOMPTARGET_AMDGPU_HSA_QUEUE_BUSY_TRACKING", true), 1634 NextQueue(0), Agent(HSAAgent) {} 1635 1636 Error init(uint32_t InitialSize, int NumHSAQueues, int HSAQueueSize) { 1637 Queues = std::vector<AMDGPUQueueTy>(NumHSAQueues); 1638 QueueSize = HSAQueueSize; 1639 MaxNumQueues = NumHSAQueues; 1640 // Initialize one queue eagerly 1641 if (auto Err = Queues.front().init(Device, Agent, QueueSize)) 1642 return Err; 1643 1644 return GenericDeviceResourceManagerTy::init(InitialSize); 1645 } 1646 1647 /// Deinitialize the resource pool and delete all resources. This function 1648 /// must be called before the destructor. 
1649 Error deinit() override { 1650 // De-init all queues 1651 for (AMDGPUQueueTy &Queue : Queues) { 1652 if (auto Err = Queue.deinit()) 1653 return Err; 1654 } 1655 1656 return GenericDeviceResourceManagerTy::deinit(); 1657 } 1658 1659 /// Get a single stream from the pool or create new resources. 1660 virtual Error getResource(AMDGPUStreamTy *&StreamHandle) override { 1661 return getResourcesImpl(1, &StreamHandle, [this](AMDGPUStreamTy *&Handle) { 1662 return assignNextQueue(Handle); 1663 }); 1664 } 1665 1666 /// Return stream to the pool. 1667 virtual Error returnResource(AMDGPUStreamTy *StreamHandle) override { 1668 return returnResourceImpl(StreamHandle, [](AMDGPUStreamTy *Handle) { 1669 Handle->Queue->removeUser(); 1670 return Plugin::success(); 1671 }); 1672 } 1673 1674 private: 1675 /// Search for and assign an prefereably idle queue to the given Stream. If 1676 /// there is no queue without current users, choose the queue with the lowest 1677 /// user count. If utilization is ignored: use round robin selection. 1678 inline Error assignNextQueue(AMDGPUStreamTy *Stream) { 1679 // Start from zero when tracking utilization, otherwise: round robin policy. 1680 uint32_t Index = OMPX_QueueTracking ? 0 : NextQueue++ % MaxNumQueues; 1681 1682 if (OMPX_QueueTracking) { 1683 // Find the least used queue. 1684 for (uint32_t I = 0; I < MaxNumQueues; ++I) { 1685 // Early exit when an initialized queue is idle. 1686 if (Queues[I].isInitialized() && Queues[I].getUserCount() == 0) { 1687 Index = I; 1688 break; 1689 } 1690 1691 // Update the least used queue. 1692 if (Queues[Index].getUserCount() > Queues[I].getUserCount()) 1693 Index = I; 1694 } 1695 } 1696 1697 // Make sure the queue is initialized, then add user & assign. 1698 if (auto Err = Queues[Index].init(Device, Agent, QueueSize)) 1699 return Err; 1700 Queues[Index].addUser(); 1701 Stream->Queue = &Queues[Index]; 1702 1703 return Plugin::success(); 1704 } 1705 1706 /// The device associated with this stream. 1707 GenericDeviceTy &Device; 1708 1709 /// Envar for controlling the tracking of busy HSA queues. 1710 BoolEnvar OMPX_QueueTracking; 1711 1712 /// The next queue index to use for round robin selection. 1713 uint32_t NextQueue; 1714 1715 /// The queues which are assigned to requested streams. 1716 std::vector<AMDGPUQueueTy> Queues; 1717 1718 /// The corresponding device as HSA agent. 1719 hsa_agent_t Agent; 1720 1721 /// The maximum number of queues. 1722 uint32_t MaxNumQueues; 1723 1724 /// The size of created queues. 1725 uint32_t QueueSize; 1726 }; 1727 1728 /// Abstract class that holds the common members of the actual kernel devices 1729 /// and the host device. Both types should inherit from this class. 1730 struct AMDGenericDeviceTy { 1731 AMDGenericDeviceTy() {} 1732 1733 virtual ~AMDGenericDeviceTy() {} 1734 1735 /// Create all memory pools which the device has access to and classify them. 1736 Error initMemoryPools() { 1737 // Retrieve all memory pools from the device agent(s). 1738 Error Err = retrieveAllMemoryPools(); 1739 if (Err) 1740 return Err; 1741 1742 for (AMDGPUMemoryPoolTy *MemoryPool : AllMemoryPools) { 1743 // Initialize the memory pool and retrieve some basic info. 1744 Error Err = MemoryPool->init(); 1745 if (Err) 1746 return Err; 1747 1748 if (!MemoryPool->isGlobal()) 1749 continue; 1750 1751 // Classify the memory pools depending on their properties. 
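      // The isFineGrained/isCoarseGrained/supportsKernelArgs predicates are
      // presumably backed by hsa_amd_memory_pool_get_info on the pool's
      // global flags; a standalone query would look roughly like this
      // (illustrative sketch with a hypothetical Pool handle):
      //
      //   uint32_t Flags = 0;
      //   hsa_amd_memory_pool_get_info(
      //       Pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &Flags);
      //   bool IsFine = Flags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED;
      //   bool IsCoarse =
      //       Flags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED;
      //   bool IsKernArg = Flags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT;
      //
      // Fine-grained pools back the host and shared allocations, while
      // coarse-grained pools back device-local allocations (see the allocation
      // kinds handled in AMDGPUDeviceTy::free below).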
1752 if (MemoryPool->isFineGrained()) { 1753 FineGrainedMemoryPools.push_back(MemoryPool); 1754 if (MemoryPool->supportsKernelArgs()) 1755 ArgsMemoryPools.push_back(MemoryPool); 1756 } else if (MemoryPool->isCoarseGrained()) { 1757 CoarseGrainedMemoryPools.push_back(MemoryPool); 1758 } 1759 } 1760 return Plugin::success(); 1761 } 1762 1763 /// Destroy all memory pools. 1764 Error deinitMemoryPools() { 1765 for (AMDGPUMemoryPoolTy *Pool : AllMemoryPools) 1766 delete Pool; 1767 1768 AllMemoryPools.clear(); 1769 FineGrainedMemoryPools.clear(); 1770 CoarseGrainedMemoryPools.clear(); 1771 ArgsMemoryPools.clear(); 1772 1773 return Plugin::success(); 1774 } 1775 1776 /// Retrieve and construct all memory pools from the device agent(s). 1777 virtual Error retrieveAllMemoryPools() = 0; 1778 1779 /// Get the device agent. 1780 virtual hsa_agent_t getAgent() const = 0; 1781 1782 protected: 1783 /// Array of all memory pools available to the host agents. 1784 llvm::SmallVector<AMDGPUMemoryPoolTy *> AllMemoryPools; 1785 1786 /// Array of fine-grained memory pools available to the host agents. 1787 llvm::SmallVector<AMDGPUMemoryPoolTy *> FineGrainedMemoryPools; 1788 1789 /// Array of coarse-grained memory pools available to the host agents. 1790 llvm::SmallVector<AMDGPUMemoryPoolTy *> CoarseGrainedMemoryPools; 1791 1792 /// Array of kernel args memory pools available to the host agents. 1793 llvm::SmallVector<AMDGPUMemoryPoolTy *> ArgsMemoryPools; 1794 }; 1795 1796 /// Class representing the host device. This host device may have more than one 1797 /// HSA host agent. We aggregate all its resources into the same instance. 1798 struct AMDHostDeviceTy : public AMDGenericDeviceTy { 1799 /// Create a host device from an array of host agents. 1800 AMDHostDeviceTy(AMDGPUPluginTy &Plugin, 1801 const llvm::SmallVector<hsa_agent_t> &HostAgents) 1802 : AMDGenericDeviceTy(), Agents(HostAgents), ArgsMemoryManager(Plugin), 1803 PinnedMemoryManager(Plugin) { 1804 assert(HostAgents.size() && "No host agent found"); 1805 } 1806 1807 /// Initialize the host device memory pools and the memory managers for 1808 /// kernel args and host pinned memory allocations. 1809 Error init() { 1810 if (auto Err = initMemoryPools()) 1811 return Err; 1812 1813 if (auto Err = ArgsMemoryManager.init(getArgsMemoryPool())) 1814 return Err; 1815 1816 if (auto Err = PinnedMemoryManager.init(getFineGrainedMemoryPool())) 1817 return Err; 1818 1819 return Plugin::success(); 1820 } 1821 1822 /// Deinitialize memory pools and managers. 1823 Error deinit() { 1824 if (auto Err = deinitMemoryPools()) 1825 return Err; 1826 1827 if (auto Err = ArgsMemoryManager.deinit()) 1828 return Err; 1829 1830 if (auto Err = PinnedMemoryManager.deinit()) 1831 return Err; 1832 1833 return Plugin::success(); 1834 } 1835 1836 /// Retrieve and construct all memory pools from the host agents. 1837 Error retrieveAllMemoryPools() override { 1838 // Iterate through the available pools across the host agents. 1839 for (hsa_agent_t Agent : Agents) { 1840 Error Err = hsa_utils::iterateAgentMemoryPools( 1841 Agent, [&](hsa_amd_memory_pool_t HSAMemoryPool) { 1842 AMDGPUMemoryPoolTy *MemoryPool = 1843 new AMDGPUMemoryPoolTy(HSAMemoryPool); 1844 AllMemoryPools.push_back(MemoryPool); 1845 return HSA_STATUS_SUCCESS; 1846 }); 1847 if (Err) 1848 return Err; 1849 } 1850 return Plugin::success(); 1851 } 1852 1853 /// Get one of the host agents. Return always the first agent. 
1854 hsa_agent_t getAgent() const override { return Agents[0]; } 1855 1856 /// Get a memory pool for fine-grained allocations. 1857 AMDGPUMemoryPoolTy &getFineGrainedMemoryPool() { 1858 assert(!FineGrainedMemoryPools.empty() && "No fine-grained mempool"); 1859 // Retrive any memory pool. 1860 return *FineGrainedMemoryPools[0]; 1861 } 1862 1863 AMDGPUMemoryPoolTy &getCoarseGrainedMemoryPool() { 1864 assert(!CoarseGrainedMemoryPools.empty() && "No coarse-grained mempool"); 1865 // Retrive any memory pool. 1866 return *CoarseGrainedMemoryPools[0]; 1867 } 1868 1869 /// Get a memory pool for kernel args allocations. 1870 AMDGPUMemoryPoolTy &getArgsMemoryPool() { 1871 assert(!ArgsMemoryPools.empty() && "No kernelargs mempool"); 1872 // Retrieve any memory pool. 1873 return *ArgsMemoryPools[0]; 1874 } 1875 1876 /// Getters for kernel args and host pinned memory managers. 1877 AMDGPUMemoryManagerTy &getArgsMemoryManager() { return ArgsMemoryManager; } 1878 AMDGPUMemoryManagerTy &getPinnedMemoryManager() { 1879 return PinnedMemoryManager; 1880 } 1881 1882 private: 1883 /// Array of agents on the host side. 1884 const llvm::SmallVector<hsa_agent_t> Agents; 1885 1886 // Memory manager for kernel arguments. 1887 AMDGPUMemoryManagerTy ArgsMemoryManager; 1888 1889 // Memory manager for pinned memory. 1890 AMDGPUMemoryManagerTy PinnedMemoryManager; 1891 }; 1892 1893 /// Class implementing the AMDGPU device functionalities which derives from the 1894 /// generic device class. 1895 struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { 1896 // Create an AMDGPU device with a device id and default AMDGPU grid values. 1897 AMDGPUDeviceTy(GenericPluginTy &Plugin, int32_t DeviceId, int32_t NumDevices, 1898 AMDHostDeviceTy &HostDevice, hsa_agent_t Agent) 1899 : GenericDeviceTy(Plugin, DeviceId, NumDevices, {}), AMDGenericDeviceTy(), 1900 OMPX_NumQueues("LIBOMPTARGET_AMDGPU_NUM_HSA_QUEUES", 4), 1901 OMPX_QueueSize("LIBOMPTARGET_AMDGPU_HSA_QUEUE_SIZE", 512), 1902 OMPX_DefaultTeamsPerCU("LIBOMPTARGET_AMDGPU_TEAMS_PER_CU", 4), 1903 OMPX_MaxAsyncCopyBytes("LIBOMPTARGET_AMDGPU_MAX_ASYNC_COPY_BYTES", 1904 1 * 1024 * 1024), // 1MB 1905 OMPX_InitialNumSignals("LIBOMPTARGET_AMDGPU_NUM_INITIAL_HSA_SIGNALS", 1906 64), 1907 OMPX_StreamBusyWait("LIBOMPTARGET_AMDGPU_STREAM_BUSYWAIT", 2000000), 1908 OMPX_UseMultipleSdmaEngines( 1909 "LIBOMPTARGET_AMDGPU_USE_MULTIPLE_SDMA_ENGINES", false), 1910 OMPX_ApuMaps("OMPX_APU_MAPS", false), AMDGPUStreamManager(*this, Agent), 1911 AMDGPUEventManager(*this), AMDGPUSignalManager(*this), Agent(Agent), 1912 HostDevice(HostDevice) {} 1913 1914 ~AMDGPUDeviceTy() {} 1915 1916 /// Initialize the device, its resources and get its properties. 1917 Error initImpl(GenericPluginTy &Plugin) override { 1918 // First setup all the memory pools. 1919 if (auto Err = initMemoryPools()) 1920 return Err; 1921 1922 char GPUName[64]; 1923 if (auto Err = getDeviceAttr(HSA_AGENT_INFO_NAME, GPUName)) 1924 return Err; 1925 ComputeUnitKind = GPUName; 1926 1927 // Get the wavefront size. 1928 uint32_t WavefrontSize = 0; 1929 if (auto Err = getDeviceAttr(HSA_AGENT_INFO_WAVEFRONT_SIZE, WavefrontSize)) 1930 return Err; 1931 GridValues.GV_Warp_Size = WavefrontSize; 1932 1933 // Get the frequency of the steady clock. If the attribute is missing 1934 // assume running on an older libhsa and default to 0, omp_get_wtime 1935 // will be inaccurate but otherwise programs can still run. 
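    // When the frequency is available, a device timestamp delta can be turned
    // into seconds by dividing by it. Illustrative arithmetic only: with a
    // 100 MHz steady clock, a delta of 25,000,000 ticks corresponds to
    // 25e6 / 100e6 = 0.25 seconds.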
1936 if (getDeviceAttrRaw(HSA_AMD_AGENT_INFO_TIMESTAMP_FREQUENCY, 1937 ClockFrequency) != HSA_STATUS_SUCCESS) 1938 ClockFrequency = 0; 1939 1940 // Load the grid values dependending on the wavefront. 1941 if (WavefrontSize == 32) 1942 GridValues = getAMDGPUGridValues<32>(); 1943 else if (WavefrontSize == 64) 1944 GridValues = getAMDGPUGridValues<64>(); 1945 else 1946 return Plugin::error("Unexpected AMDGPU wavefront %d", WavefrontSize); 1947 1948 // Get maximum number of workitems per workgroup. 1949 uint16_t WorkgroupMaxDim[3]; 1950 if (auto Err = 1951 getDeviceAttr(HSA_AGENT_INFO_WORKGROUP_MAX_DIM, WorkgroupMaxDim)) 1952 return Err; 1953 GridValues.GV_Max_WG_Size = WorkgroupMaxDim[0]; 1954 1955 // Get maximum number of workgroups. 1956 hsa_dim3_t GridMaxDim; 1957 if (auto Err = getDeviceAttr(HSA_AGENT_INFO_GRID_MAX_DIM, GridMaxDim)) 1958 return Err; 1959 1960 GridValues.GV_Max_Teams = GridMaxDim.x / GridValues.GV_Max_WG_Size; 1961 if (GridValues.GV_Max_Teams == 0) 1962 return Plugin::error("Maximum number of teams cannot be zero"); 1963 1964 // Compute the default number of teams. 1965 uint32_t ComputeUnits = 0; 1966 if (auto Err = 1967 getDeviceAttr(HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, ComputeUnits)) 1968 return Err; 1969 GridValues.GV_Default_Num_Teams = ComputeUnits * OMPX_DefaultTeamsPerCU; 1970 1971 uint32_t WavesPerCU = 0; 1972 if (auto Err = 1973 getDeviceAttr(HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU, WavesPerCU)) 1974 return Err; 1975 HardwareParallelism = ComputeUnits * WavesPerCU; 1976 1977 // Get maximum size of any device queues and maximum number of queues. 1978 uint32_t MaxQueueSize; 1979 if (auto Err = getDeviceAttr(HSA_AGENT_INFO_QUEUE_MAX_SIZE, MaxQueueSize)) 1980 return Err; 1981 1982 uint32_t MaxQueues; 1983 if (auto Err = getDeviceAttr(HSA_AGENT_INFO_QUEUES_MAX, MaxQueues)) 1984 return Err; 1985 1986 // Compute the number of queues and their size. 1987 OMPX_NumQueues = std::max(1U, std::min(OMPX_NumQueues.get(), MaxQueues)); 1988 OMPX_QueueSize = std::min(OMPX_QueueSize.get(), MaxQueueSize); 1989 1990 // Initialize stream pool. 1991 if (auto Err = AMDGPUStreamManager.init(OMPX_InitialNumStreams, 1992 OMPX_NumQueues, OMPX_QueueSize)) 1993 return Err; 1994 1995 // Initialize event pool. 1996 if (auto Err = AMDGPUEventManager.init(OMPX_InitialNumEvents)) 1997 return Err; 1998 1999 // Initialize signal pool. 2000 if (auto Err = AMDGPUSignalManager.init(OMPX_InitialNumSignals)) 2001 return Err; 2002 2003 // Detect if XNACK is enabled 2004 SmallVector<SmallString<32>> Targets; 2005 if (auto Err = hsa_utils::getTargetTripleAndFeatures(Agent, Targets)) 2006 return Err; 2007 if (!Targets.empty() && Targets[0].str().contains("xnack+")) 2008 IsXnackEnabled = true; 2009 2010 // detect if device is an APU. 2011 if (auto Err = checkIfAPU()) 2012 return Err; 2013 2014 return Plugin::success(); 2015 } 2016 2017 /// Deinitialize the device and release its resources. 2018 Error deinitImpl() override { 2019 // Deinitialize the stream and event pools. 2020 if (auto Err = AMDGPUStreamManager.deinit()) 2021 return Err; 2022 2023 if (auto Err = AMDGPUEventManager.deinit()) 2024 return Err; 2025 2026 if (auto Err = AMDGPUSignalManager.deinit()) 2027 return Err; 2028 2029 // Close modules if necessary. 2030 if (!LoadedImages.empty()) { 2031 // Each image has its own module. 2032 for (DeviceImageTy *Image : LoadedImages) { 2033 AMDGPUDeviceImageTy &AMDImage = 2034 static_cast<AMDGPUDeviceImageTy &>(*Image); 2035 2036 // Unload the executable of the image. 
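        // Unloading presumably boils down to destroying the HSA executable
        // that loadExecutable() created for this image (i.e., something along
        // the lines of hsa_executable_destroy(Executable)); each image owns
        // exactly one executable, which is why the loop visits every loaded
        // image.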
2037 if (auto Err = AMDImage.unloadExecutable()) 2038 return Err; 2039 } 2040 } 2041 2042 // Invalidate agent reference. 2043 Agent = {0}; 2044 2045 return Plugin::success(); 2046 } 2047 2048 virtual Error callGlobalConstructors(GenericPluginTy &Plugin, 2049 DeviceImageTy &Image) override { 2050 GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler(); 2051 if (Handler.isSymbolInImage(*this, Image, "amdgcn.device.fini")) 2052 Image.setPendingGlobalDtors(); 2053 2054 return callGlobalCtorDtorCommon(Plugin, Image, /*IsCtor=*/true); 2055 } 2056 2057 virtual Error callGlobalDestructors(GenericPluginTy &Plugin, 2058 DeviceImageTy &Image) override { 2059 if (Image.hasPendingGlobalDtors()) 2060 return callGlobalCtorDtorCommon(Plugin, Image, /*IsCtor=*/false); 2061 return Plugin::success(); 2062 } 2063 2064 uint64_t getStreamBusyWaitMicroseconds() const { return OMPX_StreamBusyWait; } 2065 2066 Expected<std::unique_ptr<MemoryBuffer>> 2067 doJITPostProcessing(std::unique_ptr<MemoryBuffer> MB) const override { 2068 2069 // TODO: We should try to avoid materialization but there seems to be no 2070 // good linker interface w/o file i/o. 2071 SmallString<128> LinkerInputFilePath; 2072 std::error_code EC = sys::fs::createTemporaryFile("amdgpu-pre-link-jit", 2073 "o", LinkerInputFilePath); 2074 if (EC) 2075 return Plugin::error("Failed to create temporary file for linker"); 2076 2077 // Write the file's contents to the output file. 2078 Expected<std::unique_ptr<FileOutputBuffer>> OutputOrErr = 2079 FileOutputBuffer::create(LinkerInputFilePath, MB->getBuffer().size()); 2080 if (!OutputOrErr) 2081 return OutputOrErr.takeError(); 2082 std::unique_ptr<FileOutputBuffer> Output = std::move(*OutputOrErr); 2083 llvm::copy(MB->getBuffer(), Output->getBufferStart()); 2084 if (Error E = Output->commit()) 2085 return std::move(E); 2086 2087 SmallString<128> LinkerOutputFilePath; 2088 EC = sys::fs::createTemporaryFile("amdgpu-pre-link-jit", "so", 2089 LinkerOutputFilePath); 2090 if (EC) 2091 return Plugin::error("Failed to create temporary file for linker"); 2092 2093 const auto &ErrorOrPath = sys::findProgramByName("lld"); 2094 if (!ErrorOrPath) 2095 return createStringError(inconvertibleErrorCode(), 2096 "Failed to find `lld` on the PATH."); 2097 2098 std::string LLDPath = ErrorOrPath.get(); 2099 INFO(OMP_INFOTYPE_PLUGIN_KERNEL, getDeviceId(), 2100 "Using `%s` to link JITed amdgcn ouput.", LLDPath.c_str()); 2101 2102 std::string MCPU = "-plugin-opt=mcpu=" + getComputeUnitKind(); 2103 StringRef Args[] = {LLDPath, 2104 "-flavor", 2105 "gnu", 2106 "--no-undefined", 2107 "-shared", 2108 MCPU, 2109 "-o", 2110 LinkerOutputFilePath.data(), 2111 LinkerInputFilePath.data()}; 2112 2113 std::string Error; 2114 int RC = sys::ExecuteAndWait(LLDPath, Args, std::nullopt, {}, 0, 0, &Error); 2115 if (RC) 2116 return Plugin::error("Linking optimized bitcode failed: %s", 2117 Error.c_str()); 2118 2119 auto BufferOrErr = MemoryBuffer::getFileOrSTDIN(LinkerOutputFilePath); 2120 if (!BufferOrErr) 2121 return Plugin::error("Failed to open temporary file for lld"); 2122 2123 // Clean up the temporary files afterwards. 2124 if (sys::fs::remove(LinkerOutputFilePath)) 2125 return Plugin::error("Failed to remove temporary output file for lld"); 2126 if (sys::fs::remove(LinkerInputFilePath)) 2127 return Plugin::error("Failed to remove temporary input file for lld"); 2128 2129 return std::move(*BufferOrErr); 2130 } 2131 2132 /// See GenericDeviceTy::getComputeUnitKind(). 
  std::string getComputeUnitKind() const override { return ComputeUnitKind; }

  /// Returns the clock frequency for the given AMDGPU device.
  uint64_t getClockFrequency() const override { return ClockFrequency; }

  /// Allocate and construct an AMDGPU kernel.
  Expected<GenericKernelTy &> constructKernel(const char *Name) override {
    // Allocate and construct the AMDGPU kernel.
    AMDGPUKernelTy *AMDGPUKernel = Plugin.allocate<AMDGPUKernelTy>();
    if (!AMDGPUKernel)
      return Plugin::error("Failed to allocate memory for AMDGPU kernel");

    new (AMDGPUKernel) AMDGPUKernelTy(Name);

    return *AMDGPUKernel;
  }

  /// Set the current context to this device's context. Do nothing since the
  /// AMDGPU devices do not have the concept of contexts.
  Error setContext() override { return Plugin::success(); }

  /// AMDGPU returns the product of the number of compute units and the waves
  /// per compute unit.
  uint64_t getHardwareParallelism() const override {
    return HardwareParallelism;
  }

  /// We want to set up the RPC server for host services to the GPU if it is
  /// available.
  bool shouldSetupRPCServer() const override { return true; }

  /// The RPC interface should have enough space for all available parallelism.
  uint64_t requestedRPCPortCount() const override {
    return getHardwareParallelism();
  }

  /// Get the stream of the asynchronous info structure or get a new one.
  Error getStream(AsyncInfoWrapperTy &AsyncInfoWrapper,
                  AMDGPUStreamTy *&Stream) {
    // Get the stream (if any) from the async info.
    Stream = AsyncInfoWrapper.getQueueAs<AMDGPUStreamTy *>();
    if (!Stream) {
      // There was no stream; get an idle one.
      if (auto Err = AMDGPUStreamManager.getResource(Stream))
        return Err;

      // Modify the async info's stream.
      AsyncInfoWrapper.setQueueAs<AMDGPUStreamTy *>(Stream);
    }
    return Plugin::success();
  }

  /// Load the binary image into the device and allocate an image object.
  Expected<DeviceImageTy *> loadBinaryImpl(const __tgt_device_image *TgtImage,
                                           int32_t ImageId) override {
    // Allocate and initialize the image object.
    AMDGPUDeviceImageTy *AMDImage = Plugin.allocate<AMDGPUDeviceImageTy>();
    new (AMDImage) AMDGPUDeviceImageTy(ImageId, *this, TgtImage);

    // Load the HSA executable.
    if (Error Err = AMDImage->loadExecutable(*this))
      return std::move(Err);

    return AMDImage;
  }

  /// Allocate memory on the device or related to the device.
  void *allocate(size_t Size, void *, TargetAllocTy Kind) override;

  /// Deallocate memory on the device or related to the device.
2203 int free(void *TgtPtr, TargetAllocTy Kind) override { 2204 if (TgtPtr == nullptr) 2205 return OFFLOAD_SUCCESS; 2206 2207 AMDGPUMemoryPoolTy *MemoryPool = nullptr; 2208 switch (Kind) { 2209 case TARGET_ALLOC_DEFAULT: 2210 case TARGET_ALLOC_DEVICE: 2211 case TARGET_ALLOC_DEVICE_NON_BLOCKING: 2212 MemoryPool = CoarseGrainedMemoryPools[0]; 2213 break; 2214 case TARGET_ALLOC_HOST: 2215 MemoryPool = &HostDevice.getFineGrainedMemoryPool(); 2216 break; 2217 case TARGET_ALLOC_SHARED: 2218 MemoryPool = &HostDevice.getFineGrainedMemoryPool(); 2219 break; 2220 } 2221 2222 if (!MemoryPool) { 2223 REPORT("No memory pool for the specified allocation kind\n"); 2224 return OFFLOAD_FAIL; 2225 } 2226 2227 if (Error Err = MemoryPool->deallocate(TgtPtr)) { 2228 REPORT("%s\n", toString(std::move(Err)).data()); 2229 return OFFLOAD_FAIL; 2230 } 2231 2232 return OFFLOAD_SUCCESS; 2233 } 2234 2235 /// Synchronize current thread with the pending operations on the async info. 2236 Error synchronizeImpl(__tgt_async_info &AsyncInfo) override { 2237 AMDGPUStreamTy *Stream = 2238 reinterpret_cast<AMDGPUStreamTy *>(AsyncInfo.Queue); 2239 assert(Stream && "Invalid stream"); 2240 2241 if (auto Err = Stream->synchronize()) 2242 return Err; 2243 2244 // Once the stream is synchronized, return it to stream pool and reset 2245 // AsyncInfo. This is to make sure the synchronization only works for its 2246 // own tasks. 2247 AsyncInfo.Queue = nullptr; 2248 return AMDGPUStreamManager.returnResource(Stream); 2249 } 2250 2251 /// Query for the completion of the pending operations on the async info. 2252 Error queryAsyncImpl(__tgt_async_info &AsyncInfo) override { 2253 AMDGPUStreamTy *Stream = 2254 reinterpret_cast<AMDGPUStreamTy *>(AsyncInfo.Queue); 2255 assert(Stream && "Invalid stream"); 2256 2257 auto CompletedOrErr = Stream->query(); 2258 if (!CompletedOrErr) 2259 return CompletedOrErr.takeError(); 2260 2261 // Return if it the stream did not complete yet. 2262 if (!(*CompletedOrErr)) 2263 return Plugin::success(); 2264 2265 // Once the stream is completed, return it to stream pool and reset 2266 // AsyncInfo. This is to make sure the synchronization only works for its 2267 // own tasks. 2268 AsyncInfo.Queue = nullptr; 2269 return AMDGPUStreamManager.returnResource(Stream); 2270 } 2271 2272 /// Pin the host buffer and return the device pointer that should be used for 2273 /// device transfers. 2274 Expected<void *> dataLockImpl(void *HstPtr, int64_t Size) override { 2275 void *PinnedPtr = nullptr; 2276 2277 hsa_status_t Status = 2278 hsa_amd_memory_lock(HstPtr, Size, nullptr, 0, &PinnedPtr); 2279 if (auto Err = Plugin::check(Status, "Error in hsa_amd_memory_lock: %s\n")) 2280 return std::move(Err); 2281 2282 return PinnedPtr; 2283 } 2284 2285 /// Unpin the host buffer. 2286 Error dataUnlockImpl(void *HstPtr) override { 2287 hsa_status_t Status = hsa_amd_memory_unlock(HstPtr); 2288 return Plugin::check(Status, "Error in hsa_amd_memory_unlock: %s\n"); 2289 } 2290 2291 /// Check through the HSA runtime whether the \p HstPtr buffer is pinned. 
2292 Expected<bool> isPinnedPtrImpl(void *HstPtr, void *&BaseHstPtr, 2293 void *&BaseDevAccessiblePtr, 2294 size_t &BaseSize) const override { 2295 hsa_amd_pointer_info_t Info; 2296 Info.size = sizeof(hsa_amd_pointer_info_t); 2297 2298 hsa_status_t Status = hsa_amd_pointer_info( 2299 HstPtr, &Info, /*Allocator=*/nullptr, /*num_agents_accessible=*/nullptr, 2300 /*accessible=*/nullptr); 2301 if (auto Err = Plugin::check(Status, "Error in hsa_amd_pointer_info: %s")) 2302 return std::move(Err); 2303 2304 // The buffer may be locked or allocated through HSA allocators. Assume that 2305 // the buffer is host pinned if the runtime reports a HSA type. 2306 if (Info.type != HSA_EXT_POINTER_TYPE_LOCKED && 2307 Info.type != HSA_EXT_POINTER_TYPE_HSA) 2308 return false; 2309 2310 assert(Info.hostBaseAddress && "Invalid host pinned address"); 2311 assert(Info.agentBaseAddress && "Invalid agent pinned address"); 2312 assert(Info.sizeInBytes > 0 && "Invalid pinned allocation size"); 2313 2314 // Save the allocation info in the output parameters. 2315 BaseHstPtr = Info.hostBaseAddress; 2316 BaseDevAccessiblePtr = Info.agentBaseAddress; 2317 BaseSize = Info.sizeInBytes; 2318 2319 return true; 2320 } 2321 2322 /// Submit data to the device (host to device transfer). 2323 Error dataSubmitImpl(void *TgtPtr, const void *HstPtr, int64_t Size, 2324 AsyncInfoWrapperTy &AsyncInfoWrapper) override { 2325 AMDGPUStreamTy *Stream = nullptr; 2326 void *PinnedPtr = nullptr; 2327 2328 // Use one-step asynchronous operation when host memory is already pinned. 2329 if (void *PinnedPtr = 2330 PinnedAllocs.getDeviceAccessiblePtrFromPinnedBuffer(HstPtr)) { 2331 if (auto Err = getStream(AsyncInfoWrapper, Stream)) 2332 return Err; 2333 return Stream->pushPinnedMemoryCopyAsync(TgtPtr, PinnedPtr, Size); 2334 } 2335 2336 // For large transfers use synchronous behavior. 2337 if (Size >= OMPX_MaxAsyncCopyBytes) { 2338 if (AsyncInfoWrapper.hasQueue()) 2339 if (auto Err = synchronize(AsyncInfoWrapper)) 2340 return Err; 2341 2342 hsa_status_t Status; 2343 Status = hsa_amd_memory_lock(const_cast<void *>(HstPtr), Size, nullptr, 0, 2344 &PinnedPtr); 2345 if (auto Err = 2346 Plugin::check(Status, "Error in hsa_amd_memory_lock: %s\n")) 2347 return Err; 2348 2349 AMDGPUSignalTy Signal; 2350 if (auto Err = Signal.init()) 2351 return Err; 2352 2353 if (auto Err = hsa_utils::asyncMemCopy(useMultipleSdmaEngines(), TgtPtr, 2354 Agent, PinnedPtr, Agent, Size, 0, 2355 nullptr, Signal.get())) 2356 return Err; 2357 2358 if (auto Err = Signal.wait(getStreamBusyWaitMicroseconds())) 2359 return Err; 2360 2361 if (auto Err = Signal.deinit()) 2362 return Err; 2363 2364 Status = hsa_amd_memory_unlock(const_cast<void *>(HstPtr)); 2365 return Plugin::check(Status, "Error in hsa_amd_memory_unlock: %s\n"); 2366 } 2367 2368 // Otherwise, use two-step copy with an intermediate pinned host buffer. 2369 AMDGPUMemoryManagerTy &PinnedMemoryManager = 2370 HostDevice.getPinnedMemoryManager(); 2371 if (auto Err = PinnedMemoryManager.allocate(Size, &PinnedPtr)) 2372 return Err; 2373 2374 if (auto Err = getStream(AsyncInfoWrapper, Stream)) 2375 return Err; 2376 2377 return Stream->pushMemoryCopyH2DAsync(TgtPtr, HstPtr, PinnedPtr, Size, 2378 PinnedMemoryManager); 2379 } 2380 2381 /// Retrieve data from the device (device to host transfer). 
2382 Error dataRetrieveImpl(void *HstPtr, const void *TgtPtr, int64_t Size, 2383 AsyncInfoWrapperTy &AsyncInfoWrapper) override { 2384 AMDGPUStreamTy *Stream = nullptr; 2385 void *PinnedPtr = nullptr; 2386 2387 // Use one-step asynchronous operation when host memory is already pinned. 2388 if (void *PinnedPtr = 2389 PinnedAllocs.getDeviceAccessiblePtrFromPinnedBuffer(HstPtr)) { 2390 if (auto Err = getStream(AsyncInfoWrapper, Stream)) 2391 return Err; 2392 2393 return Stream->pushPinnedMemoryCopyAsync(PinnedPtr, TgtPtr, Size); 2394 } 2395 2396 // For large transfers use synchronous behavior. 2397 if (Size >= OMPX_MaxAsyncCopyBytes) { 2398 if (AsyncInfoWrapper.hasQueue()) 2399 if (auto Err = synchronize(AsyncInfoWrapper)) 2400 return Err; 2401 2402 hsa_status_t Status; 2403 Status = hsa_amd_memory_lock(const_cast<void *>(HstPtr), Size, nullptr, 0, 2404 &PinnedPtr); 2405 if (auto Err = 2406 Plugin::check(Status, "Error in hsa_amd_memory_lock: %s\n")) 2407 return Err; 2408 2409 AMDGPUSignalTy Signal; 2410 if (auto Err = Signal.init()) 2411 return Err; 2412 2413 if (auto Err = hsa_utils::asyncMemCopy(useMultipleSdmaEngines(), 2414 PinnedPtr, Agent, TgtPtr, Agent, 2415 Size, 0, nullptr, Signal.get())) 2416 return Err; 2417 2418 if (auto Err = Signal.wait(getStreamBusyWaitMicroseconds())) 2419 return Err; 2420 2421 if (auto Err = Signal.deinit()) 2422 return Err; 2423 2424 Status = hsa_amd_memory_unlock(const_cast<void *>(HstPtr)); 2425 return Plugin::check(Status, "Error in hsa_amd_memory_unlock: %s\n"); 2426 } 2427 2428 // Otherwise, use two-step copy with an intermediate pinned host buffer. 2429 AMDGPUMemoryManagerTy &PinnedMemoryManager = 2430 HostDevice.getPinnedMemoryManager(); 2431 if (auto Err = PinnedMemoryManager.allocate(Size, &PinnedPtr)) 2432 return Err; 2433 2434 if (auto Err = getStream(AsyncInfoWrapper, Stream)) 2435 return Err; 2436 2437 return Stream->pushMemoryCopyD2HAsync(HstPtr, TgtPtr, PinnedPtr, Size, 2438 PinnedMemoryManager); 2439 } 2440 2441 /// Exchange data between two devices within the plugin. 2442 Error dataExchangeImpl(const void *SrcPtr, GenericDeviceTy &DstGenericDevice, 2443 void *DstPtr, int64_t Size, 2444 AsyncInfoWrapperTy &AsyncInfoWrapper) override { 2445 AMDGPUDeviceTy &DstDevice = static_cast<AMDGPUDeviceTy &>(DstGenericDevice); 2446 2447 // For large transfers use synchronous behavior. 2448 if (Size >= OMPX_MaxAsyncCopyBytes) { 2449 if (AsyncInfoWrapper.hasQueue()) 2450 if (auto Err = synchronize(AsyncInfoWrapper)) 2451 return Err; 2452 2453 AMDGPUSignalTy Signal; 2454 if (auto Err = Signal.init()) 2455 return Err; 2456 2457 if (auto Err = hsa_utils::asyncMemCopy( 2458 useMultipleSdmaEngines(), DstPtr, DstDevice.getAgent(), SrcPtr, 2459 getAgent(), (uint64_t)Size, 0, nullptr, Signal.get())) 2460 return Err; 2461 2462 if (auto Err = Signal.wait(getStreamBusyWaitMicroseconds())) 2463 return Err; 2464 2465 return Signal.deinit(); 2466 } 2467 2468 AMDGPUStreamTy *Stream = nullptr; 2469 if (auto Err = getStream(AsyncInfoWrapper, Stream)) 2470 return Err; 2471 if (Size <= 0) 2472 return Plugin::success(); 2473 2474 return Stream->pushMemoryCopyD2DAsync(DstPtr, DstDevice.getAgent(), SrcPtr, 2475 getAgent(), (uint64_t)Size); 2476 } 2477 2478 /// Initialize the async info for interoperability purposes. 2479 Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override { 2480 // TODO: Implement this function. 2481 return Plugin::success(); 2482 } 2483 2484 /// Initialize the device info for interoperability purposes. 
2485 Error initDeviceInfoImpl(__tgt_device_info *DeviceInfo) override { 2486 DeviceInfo->Context = nullptr; 2487 2488 if (!DeviceInfo->Device) 2489 DeviceInfo->Device = reinterpret_cast<void *>(Agent.handle); 2490 2491 return Plugin::success(); 2492 } 2493 2494 /// Create an event. 2495 Error createEventImpl(void **EventPtrStorage) override { 2496 AMDGPUEventTy **Event = reinterpret_cast<AMDGPUEventTy **>(EventPtrStorage); 2497 return AMDGPUEventManager.getResource(*Event); 2498 } 2499 2500 /// Destroy a previously created event. 2501 Error destroyEventImpl(void *EventPtr) override { 2502 AMDGPUEventTy *Event = reinterpret_cast<AMDGPUEventTy *>(EventPtr); 2503 return AMDGPUEventManager.returnResource(Event); 2504 } 2505 2506 /// Record the event. 2507 Error recordEventImpl(void *EventPtr, 2508 AsyncInfoWrapperTy &AsyncInfoWrapper) override { 2509 AMDGPUEventTy *Event = reinterpret_cast<AMDGPUEventTy *>(EventPtr); 2510 assert(Event && "Invalid event"); 2511 2512 AMDGPUStreamTy *Stream = nullptr; 2513 if (auto Err = getStream(AsyncInfoWrapper, Stream)) 2514 return Err; 2515 2516 return Event->record(*Stream); 2517 } 2518 2519 /// Make the stream wait on the event. 2520 Error waitEventImpl(void *EventPtr, 2521 AsyncInfoWrapperTy &AsyncInfoWrapper) override { 2522 AMDGPUEventTy *Event = reinterpret_cast<AMDGPUEventTy *>(EventPtr); 2523 2524 AMDGPUStreamTy *Stream = nullptr; 2525 if (auto Err = getStream(AsyncInfoWrapper, Stream)) 2526 return Err; 2527 2528 return Event->wait(*Stream); 2529 } 2530 2531 /// Synchronize the current thread with the event. 2532 Error syncEventImpl(void *EventPtr) override { 2533 return Plugin::error("Synchronize event not implemented"); 2534 } 2535 2536 /// Print information about the device. 2537 Error obtainInfoImpl(InfoQueueTy &Info) override { 2538 char TmpChar[1000]; 2539 const char *TmpCharPtr = "Unknown"; 2540 uint16_t Major, Minor; 2541 uint32_t TmpUInt, TmpUInt2; 2542 uint32_t CacheSize[4]; 2543 size_t TmpSt; 2544 bool TmpBool; 2545 uint16_t WorkgrpMaxDim[3]; 2546 hsa_dim3_t GridMaxDim; 2547 hsa_status_t Status, Status2; 2548 2549 Status = hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MAJOR, &Major); 2550 Status2 = hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MINOR, &Minor); 2551 if (Status == HSA_STATUS_SUCCESS && Status2 == HSA_STATUS_SUCCESS) 2552 Info.add("HSA Runtime Version", 2553 std::to_string(Major) + "." 
+ std::to_string(Minor)); 2554 2555 Info.add("HSA OpenMP Device Number", DeviceId); 2556 2557 Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_PRODUCT_NAME, TmpChar); 2558 if (Status == HSA_STATUS_SUCCESS) 2559 Info.add("Product Name", TmpChar); 2560 2561 Status = getDeviceAttrRaw(HSA_AGENT_INFO_NAME, TmpChar); 2562 if (Status == HSA_STATUS_SUCCESS) 2563 Info.add("Device Name", TmpChar); 2564 2565 Status = getDeviceAttrRaw(HSA_AGENT_INFO_VENDOR_NAME, TmpChar); 2566 if (Status == HSA_STATUS_SUCCESS) 2567 Info.add("Vendor Name", TmpChar); 2568 2569 hsa_device_type_t DevType; 2570 Status = getDeviceAttrRaw(HSA_AGENT_INFO_DEVICE, DevType); 2571 if (Status == HSA_STATUS_SUCCESS) { 2572 switch (DevType) { 2573 case HSA_DEVICE_TYPE_CPU: 2574 TmpCharPtr = "CPU"; 2575 break; 2576 case HSA_DEVICE_TYPE_GPU: 2577 TmpCharPtr = "GPU"; 2578 break; 2579 case HSA_DEVICE_TYPE_DSP: 2580 TmpCharPtr = "DSP"; 2581 break; 2582 } 2583 Info.add("Device Type", TmpCharPtr); 2584 } 2585 2586 Status = getDeviceAttrRaw(HSA_AGENT_INFO_QUEUES_MAX, TmpUInt); 2587 if (Status == HSA_STATUS_SUCCESS) 2588 Info.add("Max Queues", TmpUInt); 2589 2590 Status = getDeviceAttrRaw(HSA_AGENT_INFO_QUEUE_MIN_SIZE, TmpUInt); 2591 if (Status == HSA_STATUS_SUCCESS) 2592 Info.add("Queue Min Size", TmpUInt); 2593 2594 Status = getDeviceAttrRaw(HSA_AGENT_INFO_QUEUE_MAX_SIZE, TmpUInt); 2595 if (Status == HSA_STATUS_SUCCESS) 2596 Info.add("Queue Max Size", TmpUInt); 2597 2598 // FIXME: This is deprecated according to HSA documentation. But using 2599 // hsa_agent_iterate_caches and hsa_cache_get_info breaks execution during 2600 // runtime. 2601 Status = getDeviceAttrRaw(HSA_AGENT_INFO_CACHE_SIZE, CacheSize); 2602 if (Status == HSA_STATUS_SUCCESS) { 2603 Info.add("Cache"); 2604 2605 for (int I = 0; I < 4; I++) 2606 if (CacheSize[I]) 2607 Info.add<InfoLevel2>("L" + std::to_string(I), CacheSize[I]); 2608 } 2609 2610 Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_CACHELINE_SIZE, TmpUInt); 2611 if (Status == HSA_STATUS_SUCCESS) 2612 Info.add("Cacheline Size", TmpUInt); 2613 2614 Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY, TmpUInt); 2615 if (Status == HSA_STATUS_SUCCESS) 2616 Info.add("Max Clock Freq", TmpUInt, "MHz"); 2617 2618 Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, TmpUInt); 2619 if (Status == HSA_STATUS_SUCCESS) 2620 Info.add("Compute Units", TmpUInt); 2621 2622 Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU, TmpUInt); 2623 if (Status == HSA_STATUS_SUCCESS) 2624 Info.add("SIMD per CU", TmpUInt); 2625 2626 Status = getDeviceAttrRaw(HSA_AGENT_INFO_FAST_F16_OPERATION, TmpBool); 2627 if (Status == HSA_STATUS_SUCCESS) 2628 Info.add("Fast F16 Operation", TmpBool); 2629 2630 Status = getDeviceAttrRaw(HSA_AGENT_INFO_WAVEFRONT_SIZE, TmpUInt2); 2631 if (Status == HSA_STATUS_SUCCESS) 2632 Info.add("Wavefront Size", TmpUInt2); 2633 2634 Status = getDeviceAttrRaw(HSA_AGENT_INFO_WORKGROUP_MAX_SIZE, TmpUInt); 2635 if (Status == HSA_STATUS_SUCCESS) 2636 Info.add("Workgroup Max Size", TmpUInt); 2637 2638 Status = getDeviceAttrRaw(HSA_AGENT_INFO_WORKGROUP_MAX_DIM, WorkgrpMaxDim); 2639 if (Status == HSA_STATUS_SUCCESS) { 2640 Info.add("Workgroup Max Size per Dimension"); 2641 Info.add<InfoLevel2>("x", WorkgrpMaxDim[0]); 2642 Info.add<InfoLevel2>("y", WorkgrpMaxDim[1]); 2643 Info.add<InfoLevel2>("z", WorkgrpMaxDim[2]); 2644 } 2645 2646 Status = getDeviceAttrRaw( 2647 (hsa_agent_info_t)HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU, TmpUInt); 2648 if (Status == HSA_STATUS_SUCCESS) { 2649 Info.add("Max Waves Per CU", 
TmpUInt); 2650 Info.add("Max Work-item Per CU", TmpUInt * TmpUInt2); 2651 } 2652 2653 Status = getDeviceAttrRaw(HSA_AGENT_INFO_GRID_MAX_SIZE, TmpUInt); 2654 if (Status == HSA_STATUS_SUCCESS) 2655 Info.add("Grid Max Size", TmpUInt); 2656 2657 Status = getDeviceAttrRaw(HSA_AGENT_INFO_GRID_MAX_DIM, GridMaxDim); 2658 if (Status == HSA_STATUS_SUCCESS) { 2659 Info.add("Grid Max Size per Dimension"); 2660 Info.add<InfoLevel2>("x", GridMaxDim.x); 2661 Info.add<InfoLevel2>("y", GridMaxDim.y); 2662 Info.add<InfoLevel2>("z", GridMaxDim.z); 2663 } 2664 2665 Status = getDeviceAttrRaw(HSA_AGENT_INFO_FBARRIER_MAX_SIZE, TmpUInt); 2666 if (Status == HSA_STATUS_SUCCESS) 2667 Info.add("Max fbarriers/Workgrp", TmpUInt); 2668 2669 Info.add("Memory Pools"); 2670 for (AMDGPUMemoryPoolTy *Pool : AllMemoryPools) { 2671 std::string TmpStr, TmpStr2; 2672 2673 if (Pool->isGlobal()) 2674 TmpStr = "Global"; 2675 else if (Pool->isReadOnly()) 2676 TmpStr = "ReadOnly"; 2677 else if (Pool->isPrivate()) 2678 TmpStr = "Private"; 2679 else if (Pool->isGroup()) 2680 TmpStr = "Group"; 2681 else 2682 TmpStr = "Unknown"; 2683 2684 Info.add<InfoLevel2>(std::string("Pool ") + TmpStr); 2685 2686 if (Pool->isGlobal()) { 2687 if (Pool->isFineGrained()) 2688 TmpStr2 += "Fine Grained "; 2689 if (Pool->isCoarseGrained()) 2690 TmpStr2 += "Coarse Grained "; 2691 if (Pool->supportsKernelArgs()) 2692 TmpStr2 += "Kernarg "; 2693 2694 Info.add<InfoLevel3>("Flags", TmpStr2); 2695 } 2696 2697 Status = Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_SIZE, TmpSt); 2698 if (Status == HSA_STATUS_SUCCESS) 2699 Info.add<InfoLevel3>("Size", TmpSt, "bytes"); 2700 2701 Status = Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED, 2702 TmpBool); 2703 if (Status == HSA_STATUS_SUCCESS) 2704 Info.add<InfoLevel3>("Allocatable", TmpBool); 2705 2706 Status = Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE, 2707 TmpSt); 2708 if (Status == HSA_STATUS_SUCCESS) 2709 Info.add<InfoLevel3>("Runtime Alloc Granule", TmpSt, "bytes"); 2710 2711 Status = Pool->getAttrRaw( 2712 HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT, TmpSt); 2713 if (Status == HSA_STATUS_SUCCESS) 2714 Info.add<InfoLevel3>("Runtime Alloc Alignment", TmpSt, "bytes"); 2715 2716 Status = 2717 Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL, TmpBool); 2718 if (Status == HSA_STATUS_SUCCESS) 2719 Info.add<InfoLevel3>("Accessable by all", TmpBool); 2720 } 2721 2722 Info.add("ISAs"); 2723 auto Err = hsa_utils::iterateAgentISAs(getAgent(), [&](hsa_isa_t ISA) { 2724 Status = hsa_isa_get_info_alt(ISA, HSA_ISA_INFO_NAME, TmpChar); 2725 if (Status == HSA_STATUS_SUCCESS) 2726 Info.add<InfoLevel2>("Name", TmpChar); 2727 2728 return Status; 2729 }); 2730 2731 // Silently consume the error. 2732 if (Err) 2733 consumeError(std::move(Err)); 2734 2735 return Plugin::success(); 2736 } 2737 2738 /// Returns true if auto zero-copy the best configuration for the current 2739 /// arch. 2740 /// On AMDGPUs, automatic zero-copy is turned on 2741 /// when running on an APU with XNACK (unified memory) support 2742 /// enabled. On discrete GPUs, automatic zero-copy is triggered 2743 /// if the user sets the environment variable OMPX_APU_MAPS=1 2744 /// and if XNACK is enabled. The rationale is that zero-copy 2745 /// is the best configuration (performance, memory footprint) on APUs, 2746 /// while it is often not the best on discrete GPUs. 2747 /// XNACK can be enabled with a kernel boot parameter or with 2748 /// the HSA_XNACK environment variable. 
2749 bool useAutoZeroCopyImpl() override { 2750 return ((IsAPU || OMPX_ApuMaps) && IsXnackEnabled); 2751 } 2752 2753 /// Getters and setters for stack and heap sizes. 2754 Error getDeviceStackSize(uint64_t &Value) override { 2755 Value = StackSize; 2756 return Plugin::success(); 2757 } 2758 Error setDeviceStackSize(uint64_t Value) override { 2759 StackSize = Value; 2760 return Plugin::success(); 2761 } 2762 Error getDeviceHeapSize(uint64_t &Value) override { 2763 Value = DeviceMemoryPoolSize; 2764 return Plugin::success(); 2765 } 2766 Error setDeviceHeapSize(uint64_t Value) override { 2767 for (DeviceImageTy *Image : LoadedImages) 2768 if (auto Err = setupDeviceMemoryPool(Plugin, *Image, Value)) 2769 return Err; 2770 DeviceMemoryPoolSize = Value; 2771 return Plugin::success(); 2772 } 2773 Error getDeviceMemorySize(uint64_t &Value) override { 2774 for (AMDGPUMemoryPoolTy *Pool : AllMemoryPools) { 2775 if (Pool->isGlobal()) { 2776 hsa_status_t Status = 2777 Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_SIZE, Value); 2778 return Plugin::check(Status, "Error in getting device memory size: %s"); 2779 } 2780 } 2781 return Plugin::error("getDeviceMemorySize:: no global pool"); 2782 } 2783 2784 /// AMDGPU-specific function to get device attributes. 2785 template <typename Ty> Error getDeviceAttr(uint32_t Kind, Ty &Value) { 2786 hsa_status_t Status = 2787 hsa_agent_get_info(Agent, (hsa_agent_info_t)Kind, &Value); 2788 return Plugin::check(Status, "Error in hsa_agent_get_info: %s"); 2789 } 2790 2791 template <typename Ty> 2792 hsa_status_t getDeviceAttrRaw(uint32_t Kind, Ty &Value) { 2793 return hsa_agent_get_info(Agent, (hsa_agent_info_t)Kind, &Value); 2794 } 2795 2796 /// Get the device agent. 2797 hsa_agent_t getAgent() const override { return Agent; } 2798 2799 /// Get the signal manager. 2800 AMDGPUSignalManagerTy &getSignalManager() { return AMDGPUSignalManager; } 2801 2802 /// Retrieve and construct all memory pools of the device agent. 2803 Error retrieveAllMemoryPools() override { 2804 // Iterate through the available pools of the device agent. 2805 return hsa_utils::iterateAgentMemoryPools( 2806 Agent, [&](hsa_amd_memory_pool_t HSAMemoryPool) { 2807 AMDGPUMemoryPoolTy *MemoryPool = 2808 Plugin.allocate<AMDGPUMemoryPoolTy>(); 2809 new (MemoryPool) AMDGPUMemoryPoolTy(HSAMemoryPool); 2810 AllMemoryPools.push_back(MemoryPool); 2811 return HSA_STATUS_SUCCESS; 2812 }); 2813 } 2814 2815 bool useMultipleSdmaEngines() const { return OMPX_UseMultipleSdmaEngines; } 2816 2817 private: 2818 using AMDGPUEventRef = AMDGPUResourceRef<AMDGPUEventTy>; 2819 using AMDGPUEventManagerTy = GenericDeviceResourceManagerTy<AMDGPUEventRef>; 2820 2821 /// Common method to invoke a single threaded constructor or destructor 2822 /// kernel by name. 2823 Error callGlobalCtorDtorCommon(GenericPluginTy &Plugin, DeviceImageTy &Image, 2824 bool IsCtor) { 2825 const char *KernelName = 2826 IsCtor ? "amdgcn.device.init" : "amdgcn.device.fini"; 2827 // Perform a quick check for the named kernel in the image. The kernel 2828 // should be created by the 'amdgpu-lower-ctor-dtor' pass. 2829 GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler(); 2830 if (IsCtor && !Handler.isSymbolInImage(*this, Image, KernelName)) 2831 return Plugin::success(); 2832 2833 // Allocate and construct the AMDGPU kernel. 
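    // The constructor/destructor kernel is launched with a single team and a
    // single thread (the {1u, 1u, 1u} launch bounds below), using an
    // AsyncInfoWrapper that is not backed by a user-provided __tgt_async_info.
    // Finalizing the wrapper right after the launch should therefore make the
    // call effectively synchronous before the image is used any further.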
    AMDGPUKernelTy AMDGPUKernel(KernelName);
    if (auto Err = AMDGPUKernel.init(*this, Image))
      return Err;

    AsyncInfoWrapperTy AsyncInfoWrapper(*this, nullptr);

    KernelArgsTy KernelArgs = {};
    uint32_t NumBlocksAndThreads[3] = {1u, 1u, 1u};
    if (auto Err = AMDGPUKernel.launchImpl(
            *this, NumBlocksAndThreads, NumBlocksAndThreads, KernelArgs,
            KernelLaunchParamsTy{}, AsyncInfoWrapper))
      return Err;

    Error Err = Plugin::success();
    AsyncInfoWrapper.finalize(Err);

    return Err;
  }

  /// Detect if the current architecture is an APU.
  Error checkIfAPU() {
    // TODO: replace with ROCr API once it becomes available.
    llvm::StringRef StrGfxName(ComputeUnitKind);
    IsAPU = llvm::StringSwitch<bool>(StrGfxName)
                .Case("gfx940", true)
                .Default(false);
    if (IsAPU)
      return Plugin::success();

    bool MayBeAPU = llvm::StringSwitch<bool>(StrGfxName)
                        .Case("gfx942", true)
                        .Default(false);
    if (!MayBeAPU)
      return Plugin::success();

    // Can be MI300A or MI300X.
    uint32_t ChipID = 0;
    if (auto Err = getDeviceAttr(HSA_AMD_AGENT_INFO_CHIP_ID, ChipID))
      return Err;

    if (!(ChipID & 0x1)) {
      IsAPU = true;
      return Plugin::success();
    }
    return Plugin::success();
  }

  /// Envar for controlling the number of HSA queues per device. A high number
  /// of queues may degrade performance.
  UInt32Envar OMPX_NumQueues;

  /// Envar for controlling the size of each HSA queue. The size is the number
  /// of HSA packets a queue is expected to hold. It is also the number of HSA
  /// packets that can be pushed into each queue without waiting for the driver
  /// to process them.
  UInt32Envar OMPX_QueueSize;

  /// Envar for controlling the default number of teams relative to the number
  /// of compute units (CUs) the device has:
  ///   #default_teams = OMPX_DefaultTeamsPerCU * #CUs.
  UInt32Envar OMPX_DefaultTeamsPerCU;

  /// Envar specifying the maximum size in bytes up to which memory copies are
  /// asynchronous operations pushed to the corresponding stream. For larger
  /// transfers, they are synchronous transfers.
  UInt32Envar OMPX_MaxAsyncCopyBytes;

  /// Envar controlling the initial number of HSA signals per device. There is
  /// one manager of signals per device managing several pre-allocated signals.
  /// These signals are mainly used by AMDGPU streams. If needed, more signals
  /// will be created.
  UInt32Envar OMPX_InitialNumSignals;

  /// Envar to set the time to wait in the active state before switching to a
  /// blocked state. The default of 2000000 busy-waits for 2 seconds before
  /// going into a blocking HSA wait state. The unit for this variable is
  /// microseconds.
  UInt32Envar OMPX_StreamBusyWait;

  /// Use the ROCm 5.7 interface for multiple SDMA engines.
  BoolEnvar OMPX_UseMultipleSdmaEngines;

  /// Value of the OMPX_APU_MAPS env var used to force automatic zero-copy
  /// behavior on non-APU GPUs.
  BoolEnvar OMPX_ApuMaps;

  /// Stream manager for AMDGPU streams.
  AMDGPUStreamManagerTy AMDGPUStreamManager;

  /// Event manager for AMDGPU events.
  AMDGPUEventManagerTy AMDGPUEventManager;

  /// Signal manager for AMDGPU signals.
2928 AMDGPUSignalManagerTy AMDGPUSignalManager; 2929 2930 /// The agent handler corresponding to the device. 2931 hsa_agent_t Agent; 2932 2933 /// The GPU architecture. 2934 std::string ComputeUnitKind; 2935 2936 /// The frequency of the steady clock inside the device. 2937 uint64_t ClockFrequency; 2938 2939 /// The total number of concurrent work items that can be running on the GPU. 2940 uint64_t HardwareParallelism; 2941 2942 /// Reference to the host device. 2943 AMDHostDeviceTy &HostDevice; 2944 2945 /// The current size of the global device memory pool (managed by us). 2946 uint64_t DeviceMemoryPoolSize = 1L << 29L /*512MB=*/; 2947 2948 /// The current size of the stack that will be used in cases where it could 2949 /// not be statically determined. 2950 uint64_t StackSize = 16 * 1024 /* 16 KB */; 2951 2952 /// Is the plugin associated with an APU? 2953 bool IsAPU = false; 2954 2955 /// True is the system is configured with XNACK-Enabled. 2956 /// False otherwise. 2957 bool IsXnackEnabled = false; 2958 }; 2959 2960 Error AMDGPUDeviceImageTy::loadExecutable(const AMDGPUDeviceTy &Device) { 2961 hsa_code_object_reader_t Reader; 2962 hsa_status_t Status = 2963 hsa_code_object_reader_create_from_memory(getStart(), getSize(), &Reader); 2964 if (auto Err = Plugin::check( 2965 Status, "Error in hsa_code_object_reader_create_from_memory: %s")) 2966 return Err; 2967 2968 Status = hsa_executable_create_alt( 2969 HSA_PROFILE_FULL, HSA_DEFAULT_FLOAT_ROUNDING_MODE_ZERO, "", &Executable); 2970 if (auto Err = 2971 Plugin::check(Status, "Error in hsa_executable_create_alt: %s")) 2972 return Err; 2973 2974 hsa_loaded_code_object_t Object; 2975 Status = hsa_executable_load_agent_code_object(Executable, Device.getAgent(), 2976 Reader, "", &Object); 2977 if (auto Err = Plugin::check( 2978 Status, "Error in hsa_executable_load_agent_code_object: %s")) 2979 return Err; 2980 2981 Status = hsa_executable_freeze(Executable, ""); 2982 if (auto Err = Plugin::check(Status, "Error in hsa_executable_freeze: %s")) 2983 return Err; 2984 2985 uint32_t Result; 2986 Status = hsa_executable_validate(Executable, &Result); 2987 if (auto Err = Plugin::check(Status, "Error in hsa_executable_validate: %s")) 2988 return Err; 2989 2990 if (Result) 2991 return Plugin::error("Loaded HSA executable does not validate"); 2992 2993 Status = hsa_code_object_reader_destroy(Reader); 2994 if (auto Err = 2995 Plugin::check(Status, "Error in hsa_code_object_reader_destroy: %s")) 2996 return Err; 2997 2998 if (auto Err = hsa_utils::readAMDGPUMetaDataFromImage( 2999 getMemoryBuffer(), KernelInfoMap, ELFABIVersion)) 3000 return Err; 3001 3002 return Plugin::success(); 3003 } 3004 3005 Expected<hsa_executable_symbol_t> 3006 AMDGPUDeviceImageTy::findDeviceSymbol(GenericDeviceTy &Device, 3007 StringRef SymbolName) const { 3008 3009 AMDGPUDeviceTy &AMDGPUDevice = static_cast<AMDGPUDeviceTy &>(Device); 3010 hsa_agent_t Agent = AMDGPUDevice.getAgent(); 3011 3012 hsa_executable_symbol_t Symbol; 3013 hsa_status_t Status = hsa_executable_get_symbol_by_name( 3014 Executable, SymbolName.data(), &Agent, &Symbol); 3015 if (auto Err = Plugin::check( 3016 Status, "Error in hsa_executable_get_symbol_by_name(%s): %s", 3017 SymbolName.data())) 3018 return std::move(Err); 3019 3020 return Symbol; 3021 } 3022 3023 template <typename ResourceTy> 3024 Error AMDGPUResourceRef<ResourceTy>::create(GenericDeviceTy &Device) { 3025 if (Resource) 3026 return Plugin::error("Creating an existing resource"); 3027 3028 AMDGPUDeviceTy &AMDGPUDevice = 
static_cast<AMDGPUDeviceTy &>(Device); 3029 3030 Resource = new ResourceTy(AMDGPUDevice); 3031 3032 return Resource->init(); 3033 } 3034 3035 AMDGPUStreamTy::AMDGPUStreamTy(AMDGPUDeviceTy &Device) 3036 : Agent(Device.getAgent()), Queue(nullptr), 3037 SignalManager(Device.getSignalManager()), Device(Device), 3038 // Initialize the std::deque with some empty positions. 3039 Slots(32), NextSlot(0), SyncCycle(0), 3040 StreamBusyWaitMicroseconds(Device.getStreamBusyWaitMicroseconds()), 3041 UseMultipleSdmaEngines(Device.useMultipleSdmaEngines()) {} 3042 3043 /// Class implementing the AMDGPU-specific functionalities of the global 3044 /// handler. 3045 struct AMDGPUGlobalHandlerTy final : public GenericGlobalHandlerTy { 3046 /// Get the metadata of a global from the device. The name and size of the 3047 /// global is read from DeviceGlobal and the address of the global is written 3048 /// to DeviceGlobal. 3049 Error getGlobalMetadataFromDevice(GenericDeviceTy &Device, 3050 DeviceImageTy &Image, 3051 GlobalTy &DeviceGlobal) override { 3052 AMDGPUDeviceImageTy &AMDImage = static_cast<AMDGPUDeviceImageTy &>(Image); 3053 3054 // Find the symbol on the device executable. 3055 auto SymbolOrErr = 3056 AMDImage.findDeviceSymbol(Device, DeviceGlobal.getName()); 3057 if (!SymbolOrErr) 3058 return SymbolOrErr.takeError(); 3059 3060 hsa_executable_symbol_t Symbol = *SymbolOrErr; 3061 hsa_symbol_kind_t SymbolType; 3062 hsa_status_t Status; 3063 uint64_t SymbolAddr; 3064 uint32_t SymbolSize; 3065 3066 // Retrieve the type, address and size of the symbol. 3067 std::pair<hsa_executable_symbol_info_t, void *> RequiredInfos[] = { 3068 {HSA_EXECUTABLE_SYMBOL_INFO_TYPE, &SymbolType}, 3069 {HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &SymbolAddr}, 3070 {HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE, &SymbolSize}}; 3071 3072 for (auto &Info : RequiredInfos) { 3073 Status = hsa_executable_symbol_get_info(Symbol, Info.first, Info.second); 3074 if (auto Err = Plugin::check( 3075 Status, "Error in hsa_executable_symbol_get_info: %s")) 3076 return Err; 3077 } 3078 3079 // Check the size of the symbol. 3080 if (SymbolSize != DeviceGlobal.getSize()) 3081 return Plugin::error( 3082 "Failed to load global '%s' due to size mismatch (%zu != %zu)", 3083 DeviceGlobal.getName().data(), SymbolSize, 3084 (size_t)DeviceGlobal.getSize()); 3085 3086 // Store the symbol address on the device global metadata. 3087 DeviceGlobal.setPtr(reinterpret_cast<void *>(SymbolAddr)); 3088 3089 return Plugin::success(); 3090 } 3091 }; 3092 3093 /// Class implementing the AMDGPU-specific functionalities of the plugin. 3094 struct AMDGPUPluginTy final : public GenericPluginTy { 3095 /// Create an AMDGPU plugin and initialize the AMDGPU driver. 3096 AMDGPUPluginTy() 3097 : GenericPluginTy(getTripleArch()), Initialized(false), 3098 HostDevice(nullptr) {} 3099 3100 /// This class should not be copied. 3101 AMDGPUPluginTy(const AMDGPUPluginTy &) = delete; 3102 AMDGPUPluginTy(AMDGPUPluginTy &&) = delete; 3103 3104 /// Initialize the plugin and return the number of devices. 3105 Expected<int32_t> initImpl() override { 3106 hsa_status_t Status = hsa_init(); 3107 if (Status != HSA_STATUS_SUCCESS) { 3108 // Cannot call hsa_success_string. 3109 DP("Failed to initialize AMDGPU's HSA library\n"); 3110 return 0; 3111 } 3112 3113 // The initialization of HSA was successful. It should be safe to call 3114 // HSA functions from now on, e.g., hsa_shut_down. 3115 Initialized = true; 3116 3117 // Register event handler to detect memory errors on the devices. 
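    // ROCr invokes the registered callback asynchronously (typically from a
    // runtime-internal thread) for system-level events such as GPU memory
    // faults; eventHandler below filters for HSA_AMD_GPU_MEMORY_FAULT_EVENT.
    // A minimal standalone registration (illustrative sketch only, not code
    // used by the plugin) would look like:
    //
    //   auto Handler = [](const hsa_amd_event_t *Event, void *Data) {
    //     return HSA_STATUS_SUCCESS;
    //   };
    //   hsa_amd_register_system_event_handler(Handler, /*data=*/nullptr);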
3093 /// Class implementing the AMDGPU-specific functionalities of the plugin.
3094 struct AMDGPUPluginTy final : public GenericPluginTy {
3095   /// Create an AMDGPU plugin and initialize the AMDGPU driver.
3096   AMDGPUPluginTy()
3097       : GenericPluginTy(getTripleArch()), Initialized(false),
3098         HostDevice(nullptr) {}
3099
3100   /// This class should not be copied.
3101   AMDGPUPluginTy(const AMDGPUPluginTy &) = delete;
3102   AMDGPUPluginTy(AMDGPUPluginTy &&) = delete;
3103
3104   /// Initialize the plugin and return the number of devices.
3105   Expected<int32_t> initImpl() override {
3106     hsa_status_t Status = hsa_init();
3107     if (Status != HSA_STATUS_SUCCESS) {
3108       // Cannot call hsa_status_string yet; HSA failed to initialize.
3109       DP("Failed to initialize AMDGPU's HSA library\n");
3110       return 0;
3111     }
3112
3113     // The initialization of HSA was successful. It should be safe to call
3114     // HSA functions from now on, e.g., hsa_shut_down.
3115     Initialized = true;
3116
3117     // Register event handler to detect memory errors on the devices.
3118     Status = hsa_amd_register_system_event_handler(eventHandler, this);
3119     if (auto Err = Plugin::check(
3120             Status, "Error in hsa_amd_register_system_event_handler: %s"))
3121       return std::move(Err);
3122
3123     // List of host (CPU) agents.
3124     llvm::SmallVector<hsa_agent_t> HostAgents;
3125
3126     // Discover the available agents and classify them.
3127     auto Err = hsa_utils::iterateAgents([&](hsa_agent_t Agent) {
3128       // Get the device type of the agent.
3129       hsa_device_type_t DeviceType;
3130       hsa_status_t Status =
3131           hsa_agent_get_info(Agent, HSA_AGENT_INFO_DEVICE, &DeviceType);
3132       if (Status != HSA_STATUS_SUCCESS)
3133         return Status;
3134
3135       // Classify the agents into kernel (GPU) and host (CPU) agents.
3136       if (DeviceType == HSA_DEVICE_TYPE_GPU) {
3137         // Ensure that the GPU agent supports kernel dispatch packets.
3138         hsa_agent_feature_t Features;
3139         Status = hsa_agent_get_info(Agent, HSA_AGENT_INFO_FEATURE, &Features);
3140         if (Status == HSA_STATUS_SUCCESS && (Features & HSA_AGENT_FEATURE_KERNEL_DISPATCH))
3141           KernelAgents.push_back(Agent);
3142       } else if (DeviceType == HSA_DEVICE_TYPE_CPU) {
3143         HostAgents.push_back(Agent);
3144       }
3145       return HSA_STATUS_SUCCESS;
3146     });
3147
3148     if (Err)
3149       return std::move(Err);
3150
3151     int32_t NumDevices = KernelAgents.size();
3152     if (NumDevices == 0) {
3153       // Do not initialize if there are no devices.
3154       DP("There are no devices supporting AMDGPU.\n");
3155       return 0;
3156     }
3157
3158     // There are kernel agents but there is no host agent. That should be
3159     // treated as an error.
3160     if (HostAgents.empty())
3161       return Plugin::error("No AMDGPU host agents");
3162
3163     // Initialize the host device using host agents.
3164     HostDevice = allocate<AMDHostDeviceTy>();
3165     new (HostDevice) AMDHostDeviceTy(*this, HostAgents);
3166
3167     // Set up the memory pools available to the host.
3168     if (auto Err = HostDevice->init())
3169       return std::move(Err);
3170
3171     return NumDevices;
3172   }
3173
3174   /// Deinitialize the plugin.
3175   Error deinitImpl() override {
3176     // The HSA runtime was not initialized, so nothing from the plugin was
3177     // actually initialized.
3178     if (!Initialized)
3179       return Plugin::success();
3180
3181     if (HostDevice)
3182       if (auto Err = HostDevice->deinit())
3183         return Err;
3184
3185     // Finalize the HSA runtime.
3186     hsa_status_t Status = hsa_shut_down();
3187     return Plugin::check(Status, "Error in hsa_shut_down: %s");
3188   }
3189
3190   /// Creates an AMDGPU device.
3191   GenericDeviceTy *createDevice(GenericPluginTy &Plugin, int32_t DeviceId,
3192                                 int32_t NumDevices) override {
3193     return new AMDGPUDeviceTy(Plugin, DeviceId, NumDevices, getHostDevice(),
3194                               getKernelAgent(DeviceId));
3195   }
3196
3197   /// Creates an AMDGPU global handler.
3198   GenericGlobalHandlerTy *createGlobalHandler() override {
3199     return new AMDGPUGlobalHandlerTy();
3200   }
3201
3202   Triple::ArchType getTripleArch() const override { return Triple::amdgcn; }
3203
3204   const char *getName() const override { return GETNAME(TARGET_NAME); }
3205
3206   /// Get the ELF code for recognizing the compatible image binary.
3207   uint16_t getMagicElfBits() const override { return ELF::EM_AMDGPU; }
3208
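  // The compatibility check below extracts the target processor (for example
  // "gfx90a") and the platform e_flags from the image and matches them
  // against the ISAs reported for the HSA agent. A minimal, illustration-only
  // sketch of the ELF side, where Image is a placeholder StringRef over the
  // binary:
  //
  //   auto ElfOrErr = ELF64LEObjectFile::create(
  //       MemoryBufferRef(Image, /*Identifier=*/""), /*InitContent=*/false);
  //   if (!ElfOrErr)
  //     return ElfOrErr.takeError();
  //   std::optional<StringRef> Processor = ElfOrErr->tryGetCPUName();
  //   uint32_t Flags = ElfOrErr->getPlatformFlags();
  //   // Processor and Flags are then compared against each target reported
  //   // by hsa_utils::getTargetTripleAndFeatures() for the agent, as done
  //   // below.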
3209   /// Check whether the image is compatible with an AMDGPU device.
3210   Expected<bool> isELFCompatible(uint32_t DeviceId,
3211                                  StringRef Image) const override {
3212     // Get the associated architecture and flags from the ELF.
3213     auto ElfOrErr = ELF64LEObjectFile::create(
3214         MemoryBufferRef(Image, /*Identifier=*/""), /*InitContent=*/false);
3215     if (!ElfOrErr)
3216       return ElfOrErr.takeError();
3217     std::optional<StringRef> Processor = ElfOrErr->tryGetCPUName();
3218     if (!Processor)
3219       return false;
3220
3221     SmallVector<SmallString<32>> Targets;
3222     if (auto Err = hsa_utils::getTargetTripleAndFeatures(
3223             getKernelAgent(DeviceId), Targets))
3224       return Err;
3225     for (auto &Target : Targets)
3226       if (offloading::amdgpu::isImageCompatibleWithEnv(
3227               Processor ? *Processor : "", ElfOrErr->getPlatformFlags(),
3228               Target.str()))
3229         return true;
3230     return false;
3231   }
3232
3233   bool isDataExchangable(int32_t SrcDeviceId, int32_t DstDeviceId) override {
3234     return true;
3235   }
3236
3237   /// Get the host device instance.
3238   AMDHostDeviceTy &getHostDevice() {
3239     assert(HostDevice && "Host device not initialized");
3240     return *HostDevice;
3241   }
3242
3243   /// Get the kernel agent with the corresponding agent id.
3244   hsa_agent_t getKernelAgent(int32_t AgentId) const {
3245     assert((uint32_t)AgentId < KernelAgents.size() && "Invalid agent id");
3246     return KernelAgents[AgentId];
3247   }
3248
3249   /// Get the list of the available kernel agents.
3250   const llvm::SmallVector<hsa_agent_t> &getKernelAgents() const {
3251     return KernelAgents;
3252   }
3253
3254 private:
3255   /// Event handler that will be called by ROCr if an event is detected.
3256   static hsa_status_t eventHandler(const hsa_amd_event_t *Event,
3257                                    void *PluginPtr) {
3258     if (Event->event_type != HSA_AMD_GPU_MEMORY_FAULT_EVENT)
3259       return HSA_STATUS_SUCCESS;
3260
3261     SmallVector<std::string> Reasons;
3262     uint32_t ReasonsMask = Event->memory_fault.fault_reason_mask;
3263     if (ReasonsMask & HSA_AMD_MEMORY_FAULT_PAGE_NOT_PRESENT)
3264       Reasons.emplace_back("Page not present or supervisor privilege");
3265     if (ReasonsMask & HSA_AMD_MEMORY_FAULT_READ_ONLY)
3266       Reasons.emplace_back("Write access to a read-only page");
3267     if (ReasonsMask & HSA_AMD_MEMORY_FAULT_NX)
3268       Reasons.emplace_back("Execute access to a page marked NX");
3269     if (ReasonsMask & HSA_AMD_MEMORY_FAULT_HOST_ONLY)
3270       Reasons.emplace_back("GPU attempted access to a host-only page");
3271     if (ReasonsMask & HSA_AMD_MEMORY_FAULT_DRAMECC)
3272       Reasons.emplace_back("DRAM ECC failure");
3273     if (ReasonsMask & HSA_AMD_MEMORY_FAULT_IMPRECISE)
3274       Reasons.emplace_back("Can't determine the exact fault address");
3275     if (ReasonsMask & HSA_AMD_MEMORY_FAULT_SRAMECC)
3276       Reasons.emplace_back("SRAM ECC failure (i.e., registers, no fault address)");
3277     if (ReasonsMask & HSA_AMD_MEMORY_FAULT_HANG)
3278       Reasons.emplace_back("GPU reset following unspecified hang");
3279
3280     // If we do not know the reason, report the raw reason mask so the user
3281     // still gets some information.
3282     if (Reasons.empty())
3283       Reasons.emplace_back("Unknown (" + std::to_string(ReasonsMask) + ")");
3284
3285     uint32_t Node = -1;
3286     hsa_agent_get_info(Event->memory_fault.agent, HSA_AGENT_INFO_NODE, &Node);
3287
3288     AMDGPUPluginTy &Plugin = *reinterpret_cast<AMDGPUPluginTy *>(PluginPtr);
3289     for (uint32_t I = 0, E = Plugin.getNumDevices();
3290          Node != uint32_t(-1) && I < E; ++I) {
3291       AMDGPUDeviceTy &AMDGPUDevice =
3292           reinterpret_cast<AMDGPUDeviceTy &>(Plugin.getDevice(I));
3293       auto KernelTraceInfoRecord =
3294           AMDGPUDevice.KernelLaunchTraces.getExclusiveAccessor();
3295
3296       uint32_t DeviceNode = -1;
3297       if (auto Err =
3298               AMDGPUDevice.getDeviceAttr(HSA_AGENT_INFO_NODE, DeviceNode)) {
3299         consumeError(std::move(Err));
3300         continue;
3301       }
3302       if (DeviceNode != Node)
3303         continue;
3304       void *DevicePtr = (void *)Event->memory_fault.virtual_address;
3305       std::string S;
3306       llvm::raw_string_ostream OS(S);
3307       OS << llvm::format("Memory access fault by GPU %" PRIu32
3308                          " (agent 0x%" PRIx64
3309                          ") at virtual address %p. Reasons: %s",
3310                          Node, Event->memory_fault.agent.handle,
3311                          (void *)Event->memory_fault.virtual_address,
3312                          llvm::join(Reasons, ", ").c_str());
3313       ErrorReporter::reportKernelTraces(AMDGPUDevice, *KernelTraceInfoRecord);
3314       ErrorReporter::reportMemoryAccessError(AMDGPUDevice, DevicePtr, S,
3315                                              /*Abort*/ true);
3316     }
3317
3318     // Abort the execution since we do not recover from this error.
3319     FATAL_MESSAGE(1,
3320                   "Memory access fault by GPU %" PRIu32 " (agent 0x%" PRIx64
3321                   ") at virtual address %p. Reasons: %s",
3322                   Node, Event->memory_fault.agent.handle,
3323                   (void *)Event->memory_fault.virtual_address,
3324                   llvm::join(Reasons, ", ").c_str());
3325
3326     return HSA_STATUS_ERROR;
3327   }
3328
3329   /// Indicate whether the HSA runtime was correctly initialized. This is true
3330   /// even if there are no available devices, and it indicates whether it is
3331   /// safe to call HSA functions (e.g., hsa_shut_down).
3332   bool Initialized;
3333
3334   /// Array of the available kernel (GPU) agents. These handles should not be
3335   /// here but in the AMDGPUDeviceTy structures directly. However, the HSA
3336   /// standard does not provide API functions to retrieve agents directly,
3337   /// only iterating functions, so we cache the agents here for convenience.
3338   llvm::SmallVector<hsa_agent_t> KernelAgents;
3339
3340   /// The device representing all HSA host agents.
3341   AMDHostDeviceTy *HostDevice;
3342 };
3343
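// Notes on the kernel-argument buffer built by launchImpl below, summarizing
// what the code does (descriptive only, no new behavior):
//
//   [0, LaunchParams.Size)        explicit kernel arguments, copied verbatim
//                                 from LaunchParams.Data.
//   [LaunchParams.Size, ArgsSize) hsa_utils::AMDGPUImplicitArgsTy, zeroed and
//                                 then populated for COV5 images with the
//                                 block/group counts, group sizes, grid
//                                 dimensionality and dynamic LDS size.
//
// The grid dimensionality is derived from the launch extents: 3 if the Z
// extent (NumBlocks[2] * NumThreads[2]) is greater than 1, otherwise 2 if the
// Y extent is not 1, otherwise 1.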
3344 Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
3345                                  uint32_t NumThreads[3], uint32_t NumBlocks[3],
3346                                  KernelArgsTy &KernelArgs,
3347                                  KernelLaunchParamsTy LaunchParams,
3348                                  AsyncInfoWrapperTy &AsyncInfoWrapper) const {
3349   if (ArgsSize != LaunchParams.Size &&
3350       ArgsSize != LaunchParams.Size + getImplicitArgsSize())
3351     return Plugin::error("Mismatch of kernel arguments size");
3352
3353   AMDGPUPluginTy &AMDGPUPlugin =
3354       static_cast<AMDGPUPluginTy &>(GenericDevice.Plugin);
3355   AMDHostDeviceTy &HostDevice = AMDGPUPlugin.getHostDevice();
3356   AMDGPUMemoryManagerTy &ArgsMemoryManager = HostDevice.getArgsMemoryManager();
3357
3358   void *AllArgs = nullptr;
3359   if (auto Err = ArgsMemoryManager.allocate(ArgsSize, &AllArgs))
3360     return Err;
3361
3362   // Account for user requested dynamic shared memory.
3363   uint32_t GroupSize = getGroupSize();
3364   if (uint32_t MaxDynCGroupMem = std::max(
3365           KernelArgs.DynCGroupMem, GenericDevice.getDynamicMemorySize())) {
3366     GroupSize += MaxDynCGroupMem;
3367   }
3368
3369   uint64_t StackSize;
3370   if (auto Err = GenericDevice.getDeviceStackSize(StackSize))
3371     return Err;
3372
3373   hsa_utils::AMDGPUImplicitArgsTy *ImplArgs = nullptr;
3374   if (ArgsSize == LaunchParams.Size + getImplicitArgsSize()) {
3375     // Initialize implicit arguments.
3376     ImplArgs = reinterpret_cast<hsa_utils::AMDGPUImplicitArgsTy *>(
3377         utils::advancePtr(AllArgs, LaunchParams.Size));
3378
3379     // Initialize the implicit arguments to zero.
3380     std::memset(ImplArgs, 0, getImplicitArgsSize());
3381   }
3382
3383   // Copy the explicit arguments.
3384   // TODO: We should expose the args memory manager alloc to the common part as
3385   // alternative to copying them twice.
3386   if (LaunchParams.Size)
3387     std::memcpy(AllArgs, LaunchParams.Data, LaunchParams.Size);
3388
3389   AMDGPUDeviceTy &AMDGPUDevice = static_cast<AMDGPUDeviceTy &>(GenericDevice);
3390
3391   AMDGPUStreamTy *Stream = nullptr;
3392   if (auto Err = AMDGPUDevice.getStream(AsyncInfoWrapper, Stream))
3393     return Err;
3394
3395   // Only COV5 implicit arguments need to be set; COV4 implicit args are unused.
3396   if (ImplArgs &&
3397       getImplicitArgsSize() == sizeof(hsa_utils::AMDGPUImplicitArgsTy)) {
3398     ImplArgs->BlockCountX = NumBlocks[0];
3399     ImplArgs->BlockCountY = NumBlocks[1];
3400     ImplArgs->BlockCountZ = NumBlocks[2];
3401     ImplArgs->GroupSizeX = NumThreads[0];
3402     ImplArgs->GroupSizeY = NumThreads[1];
3403     ImplArgs->GroupSizeZ = NumThreads[2];
3404     ImplArgs->GridDims = NumBlocks[2] * NumThreads[2] > 1
3405                              ? 3
3406                              : 1 + (NumBlocks[1] * NumThreads[1] != 1);
3407     ImplArgs->DynamicLdsSize = KernelArgs.DynCGroupMem;
3408   }
3409
3410   // Push the kernel launch into the stream.
3411   return Stream->pushKernelLaunch(*this, AllArgs, NumThreads, NumBlocks,
3412                                   GroupSize, StackSize, ArgsMemoryManager);
3413 }
3414
3415 Error AMDGPUKernelTy::printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
3416                                              KernelArgsTy &KernelArgs,
3417                                              uint32_t NumThreads[3],
3418                                              uint32_t NumBlocks[3]) const {
3419   // Only do all this when the output is requested.
3420   if (!(getInfoLevel() & OMP_INFOTYPE_PLUGIN_KERNEL))
3421     return Plugin::success();
3422
3423   // We don't have the data to print additional info; this is not a hard error.
3424   if (!KernelInfo.has_value())
3425     return Plugin::success();
3426
3427   // General Info
3428   auto NumGroups = NumBlocks;
3429   auto ThreadsPerGroup = NumThreads;
3430
3431   // Kernel Arguments Info
3432   auto ArgNum = KernelArgs.NumArgs;
3433   auto LoopTripCount = KernelArgs.Tripcount;
3434
3435   // Details for AMDGPU kernels (read from image)
3436   // https://www.llvm.org/docs/AMDGPUUsage.html#code-object-v4-metadata
3437   auto GroupSegmentSize = (*KernelInfo).GroupSegmentList;
3438   auto SGPRCount = (*KernelInfo).SGPRCount;
3439   auto VGPRCount = (*KernelInfo).VGPRCount;
3440   auto SGPRSpillCount = (*KernelInfo).SGPRSpillCount;
3441   auto VGPRSpillCount = (*KernelInfo).VGPRSpillCount;
3442   auto MaxFlatWorkgroupSize = (*KernelInfo).MaxFlatWorkgroupSize;
3443
3444   // Prints additional launch info that contains the following.
3445   // Num Args: The number of kernel arguments
3446   // Teams x Thrds: The number of teams and the number of threads actually
3447   //                running.
3448 // MaxFlatWorkgroupSize: Maximum flat work-group size supported by the 3449 // kernel in work-items 3450 // LDS Usage: Amount of bytes used in LDS storage 3451 // S/VGPR Count: the number of S/V GPRs occupied by the kernel 3452 // S/VGPR Spill Count: how many S/VGPRs are spilled by the kernel 3453 // Tripcount: loop tripcount for the kernel 3454 INFO(OMP_INFOTYPE_PLUGIN_KERNEL, GenericDevice.getDeviceId(), 3455 "#Args: %d Teams x Thrds: %4ux%4u (MaxFlatWorkGroupSize: %u) LDS " 3456 "Usage: %uB #SGPRs/VGPRs: %u/%u #SGPR/VGPR Spills: %u/%u Tripcount: " 3457 "%lu\n", 3458 ArgNum, NumGroups[0] * NumGroups[1] * NumGroups[2], 3459 ThreadsPerGroup[0] * ThreadsPerGroup[1] * ThreadsPerGroup[2], 3460 MaxFlatWorkgroupSize, GroupSegmentSize, SGPRCount, VGPRCount, 3461 SGPRSpillCount, VGPRSpillCount, LoopTripCount); 3462 3463 return Plugin::success(); 3464 } 3465 3466 template <typename... ArgsTy> 3467 static Error Plugin::check(int32_t Code, const char *ErrFmt, ArgsTy... Args) { 3468 hsa_status_t ResultCode = static_cast<hsa_status_t>(Code); 3469 if (ResultCode == HSA_STATUS_SUCCESS || ResultCode == HSA_STATUS_INFO_BREAK) 3470 return Error::success(); 3471 3472 const char *Desc = "Unknown error"; 3473 hsa_status_t Ret = hsa_status_string(ResultCode, &Desc); 3474 if (Ret != HSA_STATUS_SUCCESS) 3475 REPORT("Unrecognized " GETNAME(TARGET_NAME) " error code %d\n", Code); 3476 3477 return createStringError<ArgsTy..., const char *>(inconvertibleErrorCode(), 3478 ErrFmt, Args..., Desc); 3479 } 3480 3481 void *AMDGPUMemoryManagerTy::allocate(size_t Size, void *HstPtr, 3482 TargetAllocTy Kind) { 3483 // Allocate memory from the pool. 3484 void *Ptr = nullptr; 3485 if (auto Err = MemoryPool->allocate(Size, &Ptr)) { 3486 consumeError(std::move(Err)); 3487 return nullptr; 3488 } 3489 assert(Ptr && "Invalid pointer"); 3490 3491 // Get a list of agents that can access this memory pool. 3492 llvm::SmallVector<hsa_agent_t> Agents; 3493 llvm::copy_if( 3494 Plugin.getKernelAgents(), std::back_inserter(Agents), 3495 [&](hsa_agent_t Agent) { return MemoryPool->canAccess(Agent); }); 3496 3497 // Allow all valid kernel agents to access the allocation. 3498 if (auto Err = MemoryPool->enableAccess(Ptr, Size, Agents)) { 3499 REPORT("%s\n", toString(std::move(Err)).data()); 3500 return nullptr; 3501 } 3502 return Ptr; 3503 } 3504 3505 void *AMDGPUDeviceTy::allocate(size_t Size, void *, TargetAllocTy Kind) { 3506 if (Size == 0) 3507 return nullptr; 3508 3509 // Find the correct memory pool. 3510 AMDGPUMemoryPoolTy *MemoryPool = nullptr; 3511 switch (Kind) { 3512 case TARGET_ALLOC_DEFAULT: 3513 case TARGET_ALLOC_DEVICE: 3514 case TARGET_ALLOC_DEVICE_NON_BLOCKING: 3515 MemoryPool = CoarseGrainedMemoryPools[0]; 3516 break; 3517 case TARGET_ALLOC_HOST: 3518 MemoryPool = &HostDevice.getFineGrainedMemoryPool(); 3519 break; 3520 case TARGET_ALLOC_SHARED: 3521 MemoryPool = &HostDevice.getFineGrainedMemoryPool(); 3522 break; 3523 } 3524 3525 if (!MemoryPool) { 3526 REPORT("No memory pool for the specified allocation kind\n"); 3527 return nullptr; 3528 } 3529 3530 // Allocate from the corresponding memory pool. 3531 void *Alloc = nullptr; 3532 if (Error Err = MemoryPool->allocate(Size, &Alloc)) { 3533 REPORT("%s\n", toString(std::move(Err)).data()); 3534 return nullptr; 3535 } 3536 3537 if (Alloc) { 3538 // Get a list of agents that can access this memory pool. 
Inherently
3539     // necessary for host or shared allocations; also enabled for device
3540     // memory to allow device-to-device memcpy.
3541     llvm::SmallVector<hsa_agent_t> Agents;
3542     llvm::copy_if(static_cast<AMDGPUPluginTy &>(Plugin).getKernelAgents(),
3543                   std::back_inserter(Agents), [&](hsa_agent_t Agent) {
3544                     return MemoryPool->canAccess(Agent);
3545                   });
3546
3547     // Enable all valid kernel agents to access the buffer.
3548     if (auto Err = MemoryPool->enableAccess(Alloc, Size, Agents)) {
3549       REPORT("%s\n", toString(std::move(Err)).data());
3550       return nullptr;
3551     }
3552   }
3553
3554   return Alloc;
3555 }
3556
3557 void AMDGPUQueueTy::callbackError(hsa_status_t Status, hsa_queue_t *Source,
3558                                   void *Data) {
3559   auto &AMDGPUDevice = *reinterpret_cast<AMDGPUDeviceTy *>(Data);
3560
3561   if (Status == HSA_STATUS_ERROR_EXCEPTION) {
3562     auto KernelTraceInfoRecord =
3563         AMDGPUDevice.KernelLaunchTraces.getExclusiveAccessor();
3564     std::function<bool(__tgt_async_info &)> AsyncInfoWrapperMatcher =
3565         [=](__tgt_async_info &AsyncInfo) {
3566           auto *Stream = reinterpret_cast<AMDGPUStreamTy *>(AsyncInfo.Queue);
3567           if (!Stream || !Stream->getQueue())
3568             return false;
3569           return Stream->getQueue()->Queue == Source;
3570         };
3571     ErrorReporter::reportTrapInKernel(AMDGPUDevice, *KernelTraceInfoRecord,
3572                                       AsyncInfoWrapperMatcher);
3573   }
3574
3575   auto Err = Plugin::check(Status, "Received error in queue %p: %s", Source);
3576   FATAL_MESSAGE(1, "%s", toString(std::move(Err)).data());
3577 }
3578
3579 } // namespace plugin
3580 } // namespace target
3581 } // namespace omp
3582 } // namespace llvm
3583
3584 extern "C" {
3585 llvm::omp::target::plugin::GenericPluginTy *createPlugin_amdgpu() {
3586   return new llvm::omp::target::plugin::AMDGPUPluginTy();
3587 }
3588 }
3589
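// Usage note (illustrative): createPlugin_amdgpu above is the factory symbol
// the offload runtime uses to instantiate this plugin, and Plugin::check,
// defined earlier in this file, is the idiom used throughout to convert HSA
// status codes into llvm::Error values, e.g.:
//
//   hsa_status_t Status = hsa_init();
//   if (auto Err = Plugin::check(Status, "Error in hsa_init: %s"))
//     return Err; // "%s" is expanded to the textual HSA status description.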