1 //===- PluginInterface.h - Target independent plugin device interface -----===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 //===----------------------------------------------------------------------===// 10 11 #ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_PLUGININTERFACE_H 12 #define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_PLUGININTERFACE_H 13 14 #include <cstddef> 15 #include <cstdint> 16 #include <deque> 17 #include <list> 18 #include <map> 19 #include <shared_mutex> 20 #include <vector> 21 22 #include "ExclusiveAccess.h" 23 #include "Shared/APITypes.h" 24 #include "Shared/Debug.h" 25 #include "Shared/Environment.h" 26 #include "Shared/EnvironmentVar.h" 27 #include "Shared/Requirements.h" 28 #include "Shared/Utils.h" 29 30 #include "GlobalHandler.h" 31 #include "JIT.h" 32 #include "MemoryManager.h" 33 #include "RPC.h" 34 #include "omptarget.h" 35 36 #ifdef OMPT_SUPPORT 37 #include "omp-tools.h" 38 #endif 39 40 #include "llvm/ADT/SmallVector.h" 41 #include "llvm/Frontend/OpenMP/OMPConstants.h" 42 #include "llvm/Frontend/OpenMP/OMPGridValues.h" 43 #include "llvm/Support/Allocator.h" 44 #include "llvm/Support/Error.h" 45 #include "llvm/Support/ErrorHandling.h" 46 #include "llvm/Support/MemoryBufferRef.h" 47 #include "llvm/Support/raw_ostream.h" 48 #include "llvm/TargetParser/Triple.h" 49 50 namespace llvm { 51 namespace omp { 52 namespace target { 53 54 namespace plugin { 55 56 struct GenericPluginTy; 57 struct GenericKernelTy; 58 struct GenericDeviceTy; 59 struct RecordReplayTy; 60 61 /// Class that wraps the __tgt_async_info to simply its usage. 
/// In case the object is constructed without a valid __tgt_async_info, the
/// object will use an internal one and will synchronize the current thread
/// with the pending operations when calling AsyncInfoWrapperTy::finalize().
/// This latter function must be called before destroying the wrapper object.
struct AsyncInfoWrapperTy {
  AsyncInfoWrapperTy(GenericDeviceTy &Device, __tgt_async_info *AsyncInfoPtr);

  /// finalize() clears AsyncInfoPtr, so a non-null pointer here means the
  /// wrapper is being destroyed without having been finalized.
  ~AsyncInfoWrapperTy() {
    assert(!AsyncInfoPtr && "AsyncInfoWrapperTy not finalized");
  }

  /// Get the raw __tgt_async_info pointer.
  operator __tgt_async_info *() const { return AsyncInfoPtr; }

  /// Indicate whether there is queue.
  bool hasQueue() const { return (AsyncInfoPtr->Queue != nullptr); }

  /// Get the queue reinterpreted as the (pointer-sized) type \p Ty.
  template <typename Ty> Ty getQueueAs() {
    static_assert(sizeof(Ty) == sizeof(AsyncInfoPtr->Queue),
                  "Queue is not of the same size as target type");
    return static_cast<Ty>(AsyncInfoPtr->Queue);
  }

  /// Set the queue. The queue must not have been set before.
  template <typename Ty> void setQueueAs(Ty Queue) {
    static_assert(sizeof(Ty) == sizeof(AsyncInfoPtr->Queue),
                  "Queue is not of the same size as target type");
    assert(!AsyncInfoPtr->Queue && "Overwriting queue");
    AsyncInfoPtr->Queue = Queue;
  }

  /// Synchronize with the __tgt_async_info's pending operations if it's the
  /// internal async info. The error associated to the asynchronous operations
  /// issued in this queue must be provided in \p Err. This function will update
  /// the error parameter with the result of the synchronization if it was
  /// actually executed. This function must be called before destroying the
  /// object and only once.
  void finalize(Error &Err);

  /// Register \p Ptr as an associated allocation that is freed after
  /// finalization.
  void freeAllocationAfterSynchronization(void *Ptr) {
    AsyncInfoPtr->AssociatedAllocations.push_back(Ptr);
  }

private:
  /// The device the pending operations belong to.
  GenericDeviceTy &Device;

  /// Fallback async info used when the caller did not provide one.
  __tgt_async_info LocalAsyncInfo;

  /// Points either to the caller-provided async info or to LocalAsyncInfo.
  __tgt_async_info *AsyncInfoPtr;
};

/// The information level represents the level of a key-value property in the
/// info tree print (i.e. indentation). The first level should be the default.
enum InfoLevelKind { InfoLevel1 = 1, InfoLevel2, InfoLevel3 };

/// Class for storing device information and later be printed. An object of this
/// type acts as a queue of key-value properties. Each property has a key, a
/// value, and an optional unit for the value. For printing purposes, the
/// information can be classified into several levels. These levels are useful
/// for defining sections and subsections. Thus, each key-value property also
/// has an additional field indicating to which level belongs to. Notice that
/// we use the level to determine the indentation of the key-value property at
/// printing time. See the enum InfoLevelKind for the list of accepted levels.
class InfoQueueTy {
public:
  /// A single key-value property plus its optional unit and print level.
  struct InfoQueueEntryTy {
    std::string Key;
    std::string Value;
    std::string Units;
    uint64_t Level;
  };

private:
  /// Entries kept in insertion order; printed front to back.
  std::deque<InfoQueueEntryTy> Queue;

public:
  /// Add a new info entry to the queue. The entry requires at least a key
  /// string in \p Key. The value in \p Value is optional and can be any type
  /// that is representable as a string. The units in \p Units is optional and
  /// must be a string. The info level is a template parameter that defaults to
  /// the first level (top level).
  template <InfoLevelKind L = InfoLevel1, typename T = std::string>
  void add(const std::string &Key, T Value = T(),
           const std::string &Units = std::string()) {
    assert(!Key.empty() && "Invalid info key");

    // Convert the value to a string depending on its type.
    if constexpr (std::is_same_v<T, bool>)
      Queue.push_back({Key, Value ? "Yes" : "No", Units, L});
    else if constexpr (std::is_arithmetic_v<T>)
      Queue.push_back({Key, std::to_string(Value), Units, L});
    else
      Queue.push_back({Key, Value, Units, L});
  }

  /// Access the underlying queue of entries.
  const std::deque<InfoQueueEntryTy> &getQueue() const { return Queue; }

  /// Print all info entries added to the queue.
  void print() const {
    // We print four spaces for each level.
    constexpr uint64_t IndentSize = 4;

    // Find the maximum key length (level + key) to compute the individual
    // indentation of each entry.
    uint64_t MaxKeySize = 0;
    for (const auto &Entry : Queue) {
      uint64_t KeySize = Entry.Key.size() + Entry.Level * IndentSize;
      if (KeySize > MaxKeySize)
        MaxKeySize = KeySize;
    }

    // Print all info entries.
    for (const auto &Entry : Queue) {
      // Compute the indentations for the current entry so that all values
      // line up on the same column.
      uint64_t KeyIndentSize = Entry.Level * IndentSize;
      uint64_t ValIndentSize =
          MaxKeySize - (Entry.Key.size() + KeyIndentSize) + IndentSize;

      llvm::outs() << std::string(KeyIndentSize, ' ') << Entry.Key
                   << std::string(ValIndentSize, ' ') << Entry.Value
                   << (Entry.Units.empty() ? "" : " ") << Entry.Units << "\n";
    }
  }
};

/// Class wrapping a __tgt_device_image and its offload entry table on a
/// specific device. This class is responsible for storing and managing
/// the offload entries for an image on a device.
class DeviceImageTy {
  /// Image identifier within the corresponding device. Notice that this id is
  /// not unique between different devices; they may overlap.
  int32_t ImageId;

  /// The pointer to the raw __tgt_device_image.
  const __tgt_device_image *TgtImage;
  /// Optional bitcode companion of the image (used by the JIT path).
  const __tgt_device_image *TgtImageBitcode;

  /// Reference to the device this image is loaded on.
  GenericDeviceTy &Device;

  /// If this image has any global destructors that must be called.
  /// FIXME: This is only required because we currently have no invariants
  /// towards the lifetime of the underlying image. We should either copy
  /// the image into memory locally or erase the pointers after init.
  bool PendingGlobalDtors;

public:
  DeviceImageTy(int32_t Id, GenericDeviceTy &Device,
                const __tgt_device_image *Image)
      : ImageId(Id), TgtImage(Image), TgtImageBitcode(nullptr), Device(Device),
        PendingGlobalDtors(false) {
    assert(TgtImage && "Invalid target image");
  }

  /// Get the image identifier within the device.
  int32_t getId() const { return ImageId; }

  /// Get the device that this image is loaded onto.
  GenericDeviceTy &getDevice() const { return Device; }

  /// Get the pointer to the raw __tgt_device_image.
  const __tgt_device_image *getTgtImage() const { return TgtImage; }

  /// Set the bitcode companion of this image.
  void setTgtImageBitcode(const __tgt_device_image *TgtImageBitcode) {
    this->TgtImageBitcode = TgtImageBitcode;
  }

  /// Get the bitcode companion of this image, or null if none was set.
  const __tgt_device_image *getTgtImageBitcode() const {
    return TgtImageBitcode;
  }

  /// Get the image starting address.
  void *getStart() const { return TgtImage->ImageStart; }

  /// Get the image size.
  size_t getSize() const {
    return utils::getPtrDiff(TgtImage->ImageEnd, TgtImage->ImageStart);
  }

  /// Get a memory buffer reference to the whole image.
  MemoryBufferRef getMemoryBuffer() const {
    return MemoryBufferRef(StringRef((const char *)getStart(), getSize()),
                           "Image");
  }

  /// Accessors to the pending-global-destructors flag.
  bool setPendingGlobalDtors() { return PendingGlobalDtors = true; }
  bool hasPendingGlobalDtors() const { return PendingGlobalDtors; }
};

/// Class implementing common functionalities of offload kernels. Each plugin
/// should define the specific kernel class, derive from this generic one, and
/// implement the necessary virtual function members.
struct GenericKernelTy {
  /// Construct a kernel with a name and an execution mode.
  GenericKernelTy(const char *Name)
      : Name(Name), PreferredNumThreads(0), MaxNumThreads(0) {}

  virtual ~GenericKernelTy() {}

  /// Initialize the kernel object from a specific device.
  Error init(GenericDeviceTy &GenericDevice, DeviceImageTy &Image);
  virtual Error initImpl(GenericDeviceTy &GenericDevice,
                         DeviceImageTy &Image) = 0;

  /// Launch the kernel on the specific device. The device must be the same
  /// one used to initialize the kernel.
  Error launch(GenericDeviceTy &GenericDevice, void **ArgPtrs,
               ptrdiff_t *ArgOffsets, KernelArgsTy &KernelArgs,
               AsyncInfoWrapperTy &AsyncInfoWrapper) const;
  virtual Error launchImpl(GenericDeviceTy &GenericDevice,
                           uint32_t NumThreads[3], uint32_t NumBlocks[3],
                           KernelArgsTy &KernelArgs,
                           KernelLaunchParamsTy LaunchParams,
                           AsyncInfoWrapperTy &AsyncInfoWrapper) const = 0;

  /// Get the kernel name.
  const char *getName() const { return Name; }

  /// Get the kernel image.
  DeviceImageTy &getImage() const {
    assert(ImagePtr && "Kernel is not initialized!");
    return *ImagePtr;
  }

  /// Return the kernel environment object for this kernel.
  const KernelEnvironmentTy &getKernelEnvironmentForKernel() {
    return KernelEnvironment;
  }

  /// Return a device pointer to a new kernel launch environment.
  Expected<KernelLaunchEnvironmentTy *>
  getKernelLaunchEnvironment(GenericDeviceTy &GenericDevice, uint32_t Version,
                             AsyncInfoWrapperTy &AsyncInfo) const;

  /// Indicate whether an execution mode is valid.
  static bool isValidExecutionMode(OMPTgtExecModeFlags ExecutionMode) {
    switch (ExecutionMode) {
    case OMP_TGT_EXEC_MODE_SPMD:
    case OMP_TGT_EXEC_MODE_GENERIC:
    case OMP_TGT_EXEC_MODE_GENERIC_SPMD:
      return true;
    }
    return false;
  }

protected:
  /// Get the execution mode name of the kernel.
  const char *getExecutionModeName() const {
    switch (KernelEnvironment.Configuration.ExecMode) {
    case OMP_TGT_EXEC_MODE_SPMD:
      return "SPMD";
    case OMP_TGT_EXEC_MODE_GENERIC:
      return "Generic";
    case OMP_TGT_EXEC_MODE_GENERIC_SPMD:
      return "Generic-SPMD";
    }
    llvm_unreachable("Unknown execution mode!");
  }

  /// Prints generic kernel launch information.
  Error printLaunchInfo(GenericDeviceTy &GenericDevice,
                        KernelArgsTy &KernelArgs, uint32_t NumThreads[3],
                        uint32_t NumBlocks[3]) const;

  /// Prints plugin-specific kernel launch information after generic kernel
  /// launch information.
  virtual Error printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
                                       KernelArgsTy &KernelArgs,
                                       uint32_t NumThreads[3],
                                       uint32_t NumBlocks[3]) const;

private:
  /// Prepare the arguments before launching the kernel.
  KernelLaunchParamsTy
  prepareArgs(GenericDeviceTy &GenericDevice, void **ArgPtrs,
              ptrdiff_t *ArgOffsets, uint32_t &NumArgs,
              llvm::SmallVectorImpl<void *> &Args,
              llvm::SmallVectorImpl<void *> &Ptrs,
              KernelLaunchEnvironmentTy *KernelLaunchEnvironment) const;

  /// Get the number of threads and blocks for the kernel based on the
  /// user-defined threads and block clauses.
  uint32_t getNumThreads(GenericDeviceTy &GenericDevice,
                         uint32_t ThreadLimitClause[3]) const;

  /// The number of threads \p NumThreads can be adjusted by this method.
  /// \p IsNumThreadsFromUser is true if \p NumThreads is defined by user via
  /// thread_limit clause.
  uint32_t getNumBlocks(GenericDeviceTy &GenericDevice,
                        uint32_t BlockLimitClause[3], uint64_t LoopTripCount,
                        uint32_t &NumThreads, bool IsNumThreadsFromUser) const;

  /// Indicate if the kernel works in Generic SPMD, Generic or SPMD mode.
  bool isGenericSPMDMode() const {
    return KernelEnvironment.Configuration.ExecMode ==
           OMP_TGT_EXEC_MODE_GENERIC_SPMD;
  }
  bool isGenericMode() const {
    return KernelEnvironment.Configuration.ExecMode ==
           OMP_TGT_EXEC_MODE_GENERIC;
  }
  bool isSPMDMode() const {
    return KernelEnvironment.Configuration.ExecMode == OMP_TGT_EXEC_MODE_SPMD;
  }

  /// The kernel name.
  const char *Name;

  /// The image that contains this kernel.
  DeviceImageTy *ImagePtr = nullptr;

protected:
  /// The preferred number of threads to run the kernel.
  uint32_t PreferredNumThreads;

  /// The maximum number of threads which the kernel could leverage.
  uint32_t MaxNumThreads;

  /// The kernel environment, including execution flags.
  KernelEnvironmentTy KernelEnvironment;

  /// The prototype kernel launch environment.
  KernelLaunchEnvironmentTy KernelLaunchEnvironment;

  /// If the kernel is a bare kernel.
  bool IsBareKernel = false;
};

/// Information about an allocation, when it has been allocated, and when/if it
/// has been deallocated, for error reporting purposes.
struct AllocationTraceInfoTy {

  /// The stack trace of the allocation itself.
  std::string AllocationTrace;

  /// The stack trace of the deallocation, or empty.
  std::string DeallocationTrace;

  /// The allocated device pointer.
  void *DevicePtr = nullptr;

  /// The corresponding host pointer (can be null).
  void *HostPtr = nullptr;

  /// The size of the allocation.
  uint64_t Size = 0;

  /// The kind of the allocation.
  TargetAllocTy Kind = TargetAllocTy::TARGET_ALLOC_DEFAULT;

  /// Information about the last allocation at this address, if any.
  AllocationTraceInfoTy *LastAllocationInfo = nullptr;

  /// Lock to keep accesses race free.
  std::mutex Lock;
};

/// Information about a kernel launch, for error reporting purposes.
struct KernelTraceInfoTy {

  /// The launched kernel.
  GenericKernelTy *Kernel;

  /// The stack trace of the launch itself.
  std::string LaunchTrace;

  /// The async info the kernel was launched in.
  __tgt_async_info *AsyncInfo;
};

/// Fixed-size circular record of the most recent kernel launches.
struct KernelTraceInfoRecordTy {
  KernelTraceInfoRecordTy() { KTIs.fill({}); }

  /// Return the (maximal) record size.
  auto size() const { return KTIs.size(); }

  /// Create a new kernel trace info and add it into the record, overwriting
  /// the oldest entry once the record is full.
  /// NOTE(review): \p StackTrace is a const rvalue reference, so the
  /// std::move below degrades to a copy; consider dropping the const.
  void emplace(GenericKernelTy *Kernel, const std::string &&StackTrace,
               __tgt_async_info *AsyncInfo) {
    KTIs[Idx] = {Kernel, std::move(StackTrace), AsyncInfo};
    Idx = (Idx + 1) % size();
  }

  /// Return the \p I'th last kernel trace info.
  auto getKernelTraceInfo(int32_t I) const {
    // Note that kernel trace infos "grow forward", so lookup is backwards.
    return KTIs[(Idx - I - 1 + size()) % size()];
  }

private:
  /// The recorded launches, used as a circular buffer.
  std::array<KernelTraceInfoTy, 8> KTIs;

  /// Index of the next slot to overwrite.
  unsigned Idx = 0;
};

/// Class representing a map of host pinned allocations. We track these pinned
/// allocations, so memory transfers involving these buffers can be optimized.
class PinnedAllocationMapTy {

  /// Struct representing a map entry.
  struct EntryTy {
    /// The host pointer of the pinned allocation.
    void *HstPtr;

    /// The pointer that devices' driver should use to transfer data from/to the
    /// pinned allocation. In most plugins, this pointer will be the same as the
    /// host pointer above.
    void *DevAccessiblePtr;

    /// The size of the pinned allocation.
    size_t Size;

    /// Indicate whether the allocation was locked from outside the plugin, for
    /// instance, from the application. The externally locked allocations are
    /// not unlocked by the plugin when unregistering the last user.
    bool ExternallyLocked;

    /// The number of references to the pinned allocation. The allocation should
    /// remain pinned and registered to the map until the number of references
    /// becomes zero.
    mutable size_t References;

    /// Create an entry with the host and device accessible pointers, the buffer
    /// size, and a boolean indicating whether the buffer was locked externally.
    EntryTy(void *HstPtr, void *DevAccessiblePtr, size_t Size,
            bool ExternallyLocked)
        : HstPtr(HstPtr), DevAccessiblePtr(DevAccessiblePtr), Size(Size),
          ExternallyLocked(ExternallyLocked), References(1) {}

    /// Utility constructor used for std::set searches.
    EntryTy(void *HstPtr)
        : HstPtr(HstPtr), DevAccessiblePtr(nullptr), Size(0),
          ExternallyLocked(false), References(0) {}
  };

  /// Comparator of map entries. Use the host pointer to enforce an order
  /// between entries.
  struct EntryCmpTy {
    bool operator()(const EntryTy &Left, const EntryTy &Right) const {
      return Left.HstPtr < Right.HstPtr;
    }
  };

  typedef std::set<EntryTy, EntryCmpTy> PinnedAllocSetTy;

  /// The map of host pinned allocations.
  PinnedAllocSetTy Allocs;

  /// The mutex to protect accesses to the map.
  mutable std::shared_mutex Mutex;

  /// Reference to the corresponding device.
  GenericDeviceTy &Device;

  /// Indicate whether mapped host buffers should be locked automatically.
  bool LockMappedBuffers;

  /// Indicate whether failures when locking mapped buffers should be ignored.
  bool IgnoreLockMappedFailures;

  /// Find an allocation that intersects with \p HstPtr pointer. Assume the
  /// map's mutex is acquired.
  const EntryTy *findIntersecting(const void *HstPtr) const {
    if (Allocs.empty())
      return nullptr;

    // Search the first allocation with starting address that is not less than
    // the buffer address.
    auto It = Allocs.lower_bound({const_cast<void *>(HstPtr)});

    // Direct match of starting addresses.
    if (It != Allocs.end() && It->HstPtr == HstPtr)
      return &(*It);

    // Not direct match but may be a previous pinned allocation in the map which
    // contains the buffer. Return nullptr if there is no such a previous
    // allocation.
    if (It == Allocs.begin())
      return nullptr;

    // Move to the previous pinned allocation.
    --It;

    // The buffer is contained in the previous pinned allocation when that
    // allocation's end address lies beyond the buffer start.
    if (utils::advancePtr(It->HstPtr, It->Size) > HstPtr)
      return &(*It);

    // None found.
    return nullptr;
  }

  /// Insert an entry to the map representing a locked buffer. The number of
  /// references is set to one.
  Error insertEntry(void *HstPtr, void *DevAccessiblePtr, size_t Size,
                    bool ExternallyLocked = false);

  /// Erase an existing entry from the map.
  Error eraseEntry(const EntryTy &Entry);

  /// Register a new user into an entry that represents a locked buffer. Check
  /// also that the registered buffer with \p HstPtr address and \p Size is
  /// actually contained into the entry.
  Error registerEntryUse(const EntryTy &Entry, void *HstPtr, size_t Size);

  /// Unregister a user from the entry and return whether it is the last user.
  /// If it is the last user, the entry will have to be removed from the map
  /// and unlock the entry's host buffer (if necessary).
  Expected<bool> unregisterEntryUse(const EntryTy &Entry);

  /// Indicate whether the first range A fully contains the second range B.
  static bool contains(void *PtrA, size_t SizeA, void *PtrB, size_t SizeB) {
    void *EndA = utils::advancePtr(PtrA, SizeA);
    void *EndB = utils::advancePtr(PtrB, SizeB);
    return (PtrB >= PtrA && EndB <= EndA);
  }

  /// Indicate whether the first range A intersects with the second range B.
  static bool intersects(void *PtrA, size_t SizeA, void *PtrB, size_t SizeB) {
    void *EndA = utils::advancePtr(PtrA, SizeA);
    void *EndB = utils::advancePtr(PtrB, SizeB);
    return (PtrA < EndB && PtrB < EndA);
  }

public:
  /// Create the map of pinned allocations corresponding to a specific device.
  PinnedAllocationMapTy(GenericDeviceTy &Device) : Device(Device) {

    // Envar that indicates whether mapped host buffers should be locked
    // automatically. The possible values are boolean (on/off) and a special:
    //   off:       Mapped host buffers are not locked.
    //   on:        Mapped host buffers are locked in a best-effort approach.
    //              Failure to lock the buffers are silent.
    //   mandatory: Mapped host buffers are always locked and failures to lock
    //              a buffer results in a fatal error.
    StringEnvar OMPX_LockMappedBuffers("LIBOMPTARGET_LOCK_MAPPED_HOST_BUFFERS",
                                       "off");

    bool Enabled;
    if (StringParser::parse(OMPX_LockMappedBuffers.get().data(), Enabled)) {
      // Parsed as a boolean value. Enable the feature if necessary.
      LockMappedBuffers = Enabled;
      IgnoreLockMappedFailures = true;
    } else if (OMPX_LockMappedBuffers.get() == "mandatory") {
      // Enable the feature and failures are fatal.
      LockMappedBuffers = true;
      IgnoreLockMappedFailures = false;
    } else {
      // Disable by default.
      DP("Invalid value LIBOMPTARGET_LOCK_MAPPED_HOST_BUFFERS=%s\n",
         OMPX_LockMappedBuffers.get().data());
      LockMappedBuffers = false;
    }
  }

  /// Register a buffer that was recently allocated as a locked host buffer.
  /// None of the already registered pinned allocations should intersect with
  /// this new one. The registration requires the host pointer in \p HstPtr,
  /// the device accessible pointer in \p DevAccessiblePtr, and the size of the
  /// allocation in \p Size. The allocation must be unregistered using the
  /// unregisterHostBuffer function.
  Error registerHostBuffer(void *HstPtr, void *DevAccessiblePtr, size_t Size);

  /// Unregister a host pinned allocation passing the host pointer which was
  /// previously registered using the registerHostBuffer function. When calling
  /// this function, the pinned allocation cannot have any other user and will
  /// not be unlocked by this function.
  Error unregisterHostBuffer(void *HstPtr);

  /// Lock the host buffer at \p HstPtr or register a new user if it intersects
  /// with an already existing one. A partial overlapping with extension is not
  /// allowed. The function returns the device accessible pointer of the pinned
  /// buffer. The buffer must be unlocked using the unlockHostBuffer function.
  Expected<void *> lockHostBuffer(void *HstPtr, size_t Size);

  /// Unlock the host buffer at \p HstPtr or unregister a user if other users
  /// are still using the pinned allocation. If this was the last user, the
  /// pinned allocation is removed from the map and the memory is unlocked.
  Error unlockHostBuffer(void *HstPtr);

  /// Lock or register a host buffer that was recently mapped by libomptarget.
  /// This behavior is applied if LIBOMPTARGET_LOCK_MAPPED_HOST_BUFFERS is
  /// enabled. Even if not enabled, externally locked buffers are registered
  /// in order to optimize their transfers.
  Error lockMappedHostBuffer(void *HstPtr, size_t Size);

  /// Unlock or unregister a host buffer that was unmapped by libomptarget.
  Error unlockUnmappedHostBuffer(void *HstPtr);

  /// Return the device accessible pointer associated to the host pinned
  /// allocation which the \p HstPtr belongs, if any. Return null in case the
  /// \p HstPtr does not belong to any host pinned allocation. The device
  /// accessible pointer is the one that devices should use for data transfers
  /// that involve a host pinned buffer.
  void *getDeviceAccessiblePtrFromPinnedBuffer(const void *HstPtr) const {
    std::shared_lock<std::shared_mutex> Lock(Mutex);

    // Find the intersecting allocation if any.
    const EntryTy *Entry = findIntersecting(HstPtr);
    if (!Entry)
      return nullptr;

    // Translate the offset within the pinned host buffer to the device
    // accessible counterpart.
    return utils::advancePtr(Entry->DevAccessiblePtr,
                             utils::getPtrDiff(HstPtr, Entry->HstPtr));
  }

  /// Check whether a buffer belongs to a registered host pinned allocation.
  bool isHostPinnedBuffer(const void *HstPtr) const {
    std::shared_lock<std::shared_mutex> Lock(Mutex);

    // Return whether there is an intersecting allocation.
    return (findIntersecting(const_cast<void *>(HstPtr)) != nullptr);
  }
};

/// Class implementing common functionalities of offload devices.
Each plugin 678 /// should define the specific device class, derive from this generic one, and 679 /// implement the necessary virtual function members. 680 struct GenericDeviceTy : public DeviceAllocatorTy { 681 /// Construct a device with its device id within the plugin, the number of 682 /// devices in the plugin and the grid values for that kind of device. 683 GenericDeviceTy(GenericPluginTy &Plugin, int32_t DeviceId, int32_t NumDevices, 684 const llvm::omp::GV &GridValues); 685 686 /// Get the device identifier within the corresponding plugin. Notice that 687 /// this id is not unique between different plugins; they may overlap. 688 int32_t getDeviceId() const { return DeviceId; } 689 690 /// Set the context of the device if needed, before calling device-specific 691 /// functions. Plugins may implement this function as a no-op if not needed. 692 virtual Error setContext() = 0; 693 694 /// Initialize the device. After this call, the device should be already 695 /// working and ready to accept queries or modifications. 696 Error init(GenericPluginTy &Plugin); 697 virtual Error initImpl(GenericPluginTy &Plugin) = 0; 698 699 /// Deinitialize the device and free all its resources. After this call, the 700 /// device is no longer considered ready, so no queries or modifications are 701 /// allowed. 702 Error deinit(GenericPluginTy &Plugin); 703 virtual Error deinitImpl() = 0; 704 705 /// Load the binary image into the device and return the target table. 706 Expected<DeviceImageTy *> loadBinary(GenericPluginTy &Plugin, 707 const __tgt_device_image *TgtImage); 708 virtual Expected<DeviceImageTy *> 709 loadBinaryImpl(const __tgt_device_image *TgtImage, int32_t ImageId) = 0; 710 711 /// Setup the device environment if needed. Notice this setup may not be run 712 /// on some plugins. By default, it will be executed, but plugins can change 713 /// this behavior by overriding the shouldSetupDeviceEnvironment function. 
714 Error setupDeviceEnvironment(GenericPluginTy &Plugin, DeviceImageTy &Image); 715 716 /// Setup the global device memory pool, if the plugin requires one. 717 Error setupDeviceMemoryPool(GenericPluginTy &Plugin, DeviceImageTy &Image, 718 uint64_t PoolSize); 719 720 // Setup the RPC server for this device if needed. This may not run on some 721 // plugins like the CPU targets. By default, it will not be executed so it is 722 // up to the target to override this using the shouldSetupRPCServer function. 723 Error setupRPCServer(GenericPluginTy &Plugin, DeviceImageTy &Image); 724 725 /// Synchronize the current thread with the pending operations on the 726 /// __tgt_async_info structure. 727 Error synchronize(__tgt_async_info *AsyncInfo); 728 virtual Error synchronizeImpl(__tgt_async_info &AsyncInfo) = 0; 729 730 /// Invokes any global constructors on the device if present and is required 731 /// by the target. 732 virtual Error callGlobalConstructors(GenericPluginTy &Plugin, 733 DeviceImageTy &Image) { 734 return Error::success(); 735 } 736 737 /// Invokes any global destructors on the device if present and is required 738 /// by the target. 739 virtual Error callGlobalDestructors(GenericPluginTy &Plugin, 740 DeviceImageTy &Image) { 741 return Error::success(); 742 } 743 744 /// Query for the completion of the pending operations on the __tgt_async_info 745 /// structure in a non-blocking manner. 746 Error queryAsync(__tgt_async_info *AsyncInfo); 747 virtual Error queryAsyncImpl(__tgt_async_info &AsyncInfo) = 0; 748 749 /// Check whether the architecture supports VA management 750 virtual bool supportVAManagement() const { return false; } 751 752 /// Get the total device memory size 753 virtual Error getDeviceMemorySize(uint64_t &DSize); 754 755 /// Allocates \p RSize bytes (rounded up to page size) and hints the driver to 756 /// map it to \p VAddr. The obtained address is stored in \p Addr. 
At return
  /// \p RSize contains the actual size which can be equal or larger than the
  /// requested size.
  virtual Error memoryVAMap(void **Addr, void *VAddr, size_t *RSize);

  /// De-allocates device memory and unmaps the virtual address \p VAddr
  virtual Error memoryVAUnMap(void *VAddr, size_t Size);

  /// Allocate data on the device or involving the device.
  Expected<void *> dataAlloc(int64_t Size, void *HostPtr, TargetAllocTy Kind);

  /// Deallocate data from the device or involving the device.
  Error dataDelete(void *TgtPtr, TargetAllocTy Kind);

  /// Pin host memory to optimize transfers and return the device accessible
  /// pointer that devices should use for memory transfers involving the host
  /// pinned allocation.
  Expected<void *> dataLock(void *HstPtr, int64_t Size) {
    return PinnedAllocs.lockHostBuffer(HstPtr, Size);
  }

  /// Unpin a host memory buffer that was previously pinned.
  Error dataUnlock(void *HstPtr) {
    return PinnedAllocs.unlockHostBuffer(HstPtr);
  }

  /// Lock the host buffer \p HstPtr with \p Size bytes with the vendor-specific
  /// API and return the device accessible pointer.
  virtual Expected<void *> dataLockImpl(void *HstPtr, int64_t Size) = 0;

  /// Unlock a previously locked host buffer starting at \p HstPtr.
  virtual Error dataUnlockImpl(void *HstPtr) = 0;

  /// Mark the host buffer with address \p HstPtr and \p Size bytes as a mapped
  /// buffer. This means that libomptarget created a new mapping of that host
  /// buffer (e.g., because of a user OpenMP target map) and the buffer may be
  /// used as source/destination of memory transfers. We can use this
  /// information to lock the host buffer and optimize its memory transfers.
  Error notifyDataMapped(void *HstPtr, int64_t Size) {
    return PinnedAllocs.lockMappedHostBuffer(HstPtr, Size);
  }

  /// Mark the host buffer with address \p HstPtr as unmapped. This means that
  /// libomptarget removed an existing mapping. If the plugin locked the buffer
  /// in notifyDataMapped, this function should unlock it.
  Error notifyDataUnmapped(void *HstPtr) {
    return PinnedAllocs.unlockUnmappedHostBuffer(HstPtr);
  }

  /// Check whether the host buffer with address \p HstPtr is pinned by the
  /// underlying vendor-specific runtime (if any). Retrieve the host pointer,
  /// the device accessible pointer and the size of the original pinned buffer.
  virtual Expected<bool> isPinnedPtrImpl(void *HstPtr, void *&BaseHstPtr,
                                         void *&BaseDevAccessiblePtr,
                                         size_t &BaseSize) const = 0;

  /// Submit data to the device (host to device transfer).
  Error dataSubmit(void *TgtPtr, const void *HstPtr, int64_t Size,
                   __tgt_async_info *AsyncInfo);
  virtual Error dataSubmitImpl(void *TgtPtr, const void *HstPtr, int64_t Size,
                               AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;

  /// Retrieve data from the device (device to host transfer).
  Error dataRetrieve(void *HstPtr, const void *TgtPtr, int64_t Size,
                     __tgt_async_info *AsyncInfo);
  virtual Error dataRetrieveImpl(void *HstPtr, const void *TgtPtr, int64_t Size,
                                 AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;

  /// Exchange data between devices (device to device transfer). Calling this
  /// function is only valid if GenericPlugin::isDataExchangable() passing the
  /// two devices returns true.
  Error dataExchange(const void *SrcPtr, GenericDeviceTy &DstDev, void *DstPtr,
                     int64_t Size, __tgt_async_info *AsyncInfo);
  virtual Error dataExchangeImpl(const void *SrcPtr, GenericDeviceTy &DstDev,
                                 void *DstPtr, int64_t Size,
                                 AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;

  /// Run the kernel associated with \p EntryPtr
  Error launchKernel(void *EntryPtr, void **ArgPtrs, ptrdiff_t *ArgOffsets,
                     KernelArgsTy &KernelArgs, __tgt_async_info *AsyncInfo);

  /// Initialize a __tgt_async_info structure. Related to interop features.
  Error initAsyncInfo(__tgt_async_info **AsyncInfoPtr);
  virtual Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;

  /// Initialize a __tgt_device_info structure. Related to interop features.
  Error initDeviceInfo(__tgt_device_info *DeviceInfo);
  virtual Error initDeviceInfoImpl(__tgt_device_info *DeviceInfo) = 0;

  /// Create an event.
  Error createEvent(void **EventPtrStorage);
  virtual Error createEventImpl(void **EventPtrStorage) = 0;

  /// Destroy an event.
  Error destroyEvent(void *Event);
  virtual Error destroyEventImpl(void *EventPtr) = 0;

  /// Start the recording of the event.
  Error recordEvent(void *Event, __tgt_async_info *AsyncInfo);
  virtual Error recordEventImpl(void *EventPtr,
                                AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;

  /// Wait for an event to finish. Notice this wait is asynchronous if the
  /// __tgt_async_info is not nullptr.
  Error waitEvent(void *Event, __tgt_async_info *AsyncInfo);
  virtual Error waitEventImpl(void *EventPtr,
                              AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;

  /// Synchronize the current thread with the event.
  Error syncEvent(void *EventPtr);
  virtual Error syncEventImpl(void *EventPtr) = 0;

  /// Print information about the device.
  Error printInfo();
  virtual Error obtainInfoImpl(InfoQueueTy &Info) = 0;

  /// Getters of the grid values.
  uint32_t getWarpSize() const { return GridValues.GV_Warp_Size; }
  uint32_t getThreadLimit() const { return GridValues.GV_Max_WG_Size; }
  uint32_t getBlockLimit() const { return GridValues.GV_Max_Teams; }
  uint32_t getDefaultNumThreads() const {
    return GridValues.GV_Default_WG_Size;
  }
  uint32_t getDefaultNumBlocks() const {
    return GridValues.GV_Default_Num_Teams;
  }
  uint32_t getDynamicMemorySize() const { return OMPX_SharedMemorySize; }
  virtual uint64_t getClockFrequency() const { return CLOCKS_PER_SEC; }

  /// Get target compute unit kind (e.g., sm_80, or gfx908).
  virtual std::string getComputeUnitKind() const { return "unknown"; }

  /// Post processing after jit backend. The ownership of \p MB will be taken.
  virtual Expected<std::unique_ptr<MemoryBuffer>>
  doJITPostProcessing(std::unique_ptr<MemoryBuffer> MB) const {
    return std::move(MB);
  }

  /// The minimum number of threads we use for a low-trip count combined loop.
  /// Instead of using more threads we increase the outer (block/team)
  /// parallelism.
  /// @see OMPX_MinThreadsForLowTripCount
  virtual uint32_t getMinThreadsForLowTripCountLoop() {
    return OMPX_MinThreadsForLowTripCount;
  }

  /// Whether or not to reuse blocks for high trip count loops.
  /// @see OMPX_ReuseBlocksForHighTripCount
  bool getReuseBlocksForHighTripCount() {
    return OMPX_ReuseBlocksForHighTripCount;
  }

  /// Get the total amount of hardware parallelism supported by the target
  /// device. This is the total amount of warps or wavefronts that can be
  /// resident on the device simultaneously.
  virtual uint64_t getHardwareParallelism() const { return 0; }

  /// Get the RPC server running on this device.
  RPCServerTy *getRPCServer() const { return RPCServer; }

  /// The number of parallel RPC ports to use on the device. In general, this
  /// should be roughly equivalent to the amount of hardware parallelism the
  /// device can support. This is because GPUs in general do not have forward
  /// progress guarantees, so we minimize thread level dependencies by
  /// allocating enough space such that each device thread can have a port. This
  /// is likely overly pessimistic in the average case, but guarantees no
  /// deadlocks at the cost of memory. This must be overloaded by targets
  /// expecting to use the RPC server.
  virtual uint64_t requestedRPCPortCount() const {
    assert(!shouldSetupRPCServer() && "Default implementation cannot be used");
    return 0;
  }

  /// Get the stack size of the device in \p V.
  virtual Error getDeviceStackSize(uint64_t &V) = 0;

  /// Returns true if current plugin architecture is an APU
  /// and unified_shared_memory was not requested by the program.
  bool useAutoZeroCopy();
  virtual bool useAutoZeroCopyImpl() { return false; }

  /// Allocate and construct a kernel object.
  virtual Expected<GenericKernelTy &> constructKernel(const char *Name) = 0;

  /// Reference to the underlying plugin that created this device.
  GenericPluginTy &Plugin;

  /// Map to record when allocations have been performed, and when they have
  /// been deallocated, both for error reporting purposes.
  ProtectedObj<DenseMap<void *, AllocationTraceInfoTy *>> AllocationTraces;

  /// Return the allocation trace info for a device pointer, that is the
  /// allocation into which this device pointer points to (or pointed into).
  AllocationTraceInfoTy *getAllocationTraceInfoForAddr(void *DevicePtr) {
    // Takes the exclusive accessor (lock) for the trace map for the duration
    // of the linear scan.
    auto AllocationTraceMap = AllocationTraces.getExclusiveAccessor();
    for (auto &It : *AllocationTraceMap) {
      // Match if DevicePtr lies in [It.first, It.first + Size).
      if (It.first <= DevicePtr &&
          utils::advancePtr(It.first, It.second->Size) > DevicePtr)
        return It.second;
    }
    return nullptr;
  }

  /// Return the allocation trace info closest to a device pointer. First tries
  /// an exact containment match; otherwise returns the allocation with the
  /// smallest distance to \p DevicePtr, reported in \p Distance (0 on an exact
  /// match).
  AllocationTraceInfoTy *
  getClosestAllocationTraceInfoForAddr(void *DevicePtr, uintptr_t &Distance) {
    Distance = 0;
    if (auto *ATI = getAllocationTraceInfoForAddr(DevicePtr)) {
      return ATI;
    }

    AllocationTraceInfoTy *ATI = nullptr;
    uintptr_t DevicePtrI = uintptr_t(DevicePtr);
    auto AllocationTraceMap = AllocationTraces.getExclusiveAccessor();
    for (auto &It : *AllocationTraceMap) {
      uintptr_t Begin = uintptr_t(It.second->DevicePtr);
      uintptr_t End = Begin + It.second->Size - 1;
      // Unsigned arithmetic: the side on which DevicePtr lies underflows to a
      // huge value and loses the std::min, leaving the true distance.
      uintptr_t ItDistance = std::min(Begin - DevicePtrI, DevicePtrI - End);
      if (ATI && ItDistance > Distance)
        continue;
      ATI = It.second;
      Distance = ItDistance;
    }
    return ATI;
  }

  /// Map to record kernels that have been launched, for error reporting
  /// purposes.
  ProtectedObj<KernelTraceInfoRecordTy> KernelLaunchTraces;

  /// Environment variable to determine if stack traces for kernel launches are
  /// tracked.
  UInt32Envar OMPX_TrackNumKernelLaunches =
      UInt32Envar("OFFLOAD_TRACK_NUM_KERNEL_LAUNCH_TRACES", 0);

  /// Environment variable to determine if stack traces for allocations and
  /// deallocations are tracked.
  BoolEnvar OMPX_TrackAllocationTraces =
      BoolEnvar("OFFLOAD_TRACK_ALLOCATION_TRACES", false);

private:
  /// Get and set the stack size and heap size for the device. If not used, the
  /// plugin can implement the setters as no-op and setting the output
  /// value to zero for the getters.
  virtual Error setDeviceStackSize(uint64_t V) = 0;
  virtual Error getDeviceHeapSize(uint64_t &V) = 0;
  virtual Error setDeviceHeapSize(uint64_t V) = 0;

  /// Indicate whether the device should setup the device environment. Notice
  /// that returning false in this function will change the behavior of the
  /// setupDeviceEnvironment() function.
  virtual bool shouldSetupDeviceEnvironment() const { return true; }

  /// Indicate whether the device should setup the global device memory pool.
  /// If false is returned, the value on the device will be uninitialized.
  virtual bool shouldSetupDeviceMemoryPool() const { return true; }

  /// Indicate whether or not the device should setup the RPC server. This is
  /// only necessary for unhosted targets like the GPU.
  virtual bool shouldSetupRPCServer() const { return false; }

  /// Pointer to the memory manager or nullptr if not available.
  MemoryManagerTy *MemoryManager;

  /// Environment variables defined by the OpenMP standard.
  Int32Envar OMP_TeamLimit;
  Int32Envar OMP_NumTeams;
  Int32Envar OMP_TeamsThreadLimit;

  /// Environment variables defined by the LLVM OpenMP implementation.
  Int32Envar OMPX_DebugKind;
  UInt32Envar OMPX_SharedMemorySize;
  UInt64Envar OMPX_TargetStackSize;
  UInt64Envar OMPX_TargetHeapSize;

  /// Environment flag to set the minimum number of threads we use for a
  /// low-trip count combined loop. Instead of using more threads we increase
  /// the outer (block/team) parallelism.
  UInt32Envar OMPX_MinThreadsForLowTripCount =
      UInt32Envar("LIBOMPTARGET_MIN_THREADS_FOR_LOW_TRIP_COUNT", 32);

  /// Environment flag to reuse blocks for high trip count loops.
  BoolEnvar OMPX_ReuseBlocksForHighTripCount =
      BoolEnvar("LIBOMPTARGET_REUSE_BLOCKS_FOR_HIGH_TRIP_COUNT", true);

protected:
  /// Environment variables defined by the LLVM OpenMP implementation
  /// regarding the initial number of streams and events.
  UInt32Envar OMPX_InitialNumStreams;
  UInt32Envar OMPX_InitialNumEvents;

  /// Array of images loaded into the device. Images are automatically
  /// deallocated by the allocator.
  llvm::SmallVector<DeviceImageTy *> LoadedImages;

  /// The identifier of the device within the plugin. Notice this is not a
  /// global device id and is not the device id visible to the OpenMP user.
  const int32_t DeviceId;

  /// The default grid values used for this device.
  llvm::omp::GV GridValues;

  /// Enumeration used for representing the current state between two devices
  /// (both under the same plugin) for the peer access between them. The states
  /// can be a) PENDING when the state has not been queried and needs to be
  /// queried, b) AVAILABLE when the peer access is available to be used, and
  /// c) UNAVAILABLE if the system does not allow it.
  enum class PeerAccessState : uint8_t { AVAILABLE, UNAVAILABLE, PENDING };

  /// Array of peer access states with the rest of devices. This means that if
  /// the device I has a matrix PeerAccesses with PeerAccesses[J] == AVAILABLE,
  /// the device I can access device J's memory directly. However, notice this
  /// does not mean that device J can access device I's memory directly.
  llvm::SmallVector<PeerAccessState> PeerAccesses;
  std::mutex PeerAccessesLock;

  /// Map of host pinned allocations used to optimize device transfers.
1071 PinnedAllocationMapTy PinnedAllocs; 1072 1073 /// A pointer to an RPC server instance attached to this device if present. 1074 /// This is used to run the RPC server during task synchronization. 1075 RPCServerTy *RPCServer; 1076 1077 #ifdef OMPT_SUPPORT 1078 /// OMPT callback functions 1079 #define defineOmptCallback(Name, Type, Code) Name##_t Name##_fn = nullptr; 1080 FOREACH_OMPT_DEVICE_EVENT(defineOmptCallback) 1081 #undef defineOmptCallback 1082 1083 /// Internal representation for OMPT device (initialize & finalize) 1084 std::atomic<bool> OmptInitialized; 1085 #endif 1086 1087 private: 1088 DeviceMemoryPoolTy DeviceMemoryPool = {nullptr, 0}; 1089 DeviceMemoryPoolTrackingTy DeviceMemoryPoolTracking = {0, 0, ~0U, 0}; 1090 }; 1091 1092 /// Class implementing common functionalities of offload plugins. Each plugin 1093 /// should define the specific plugin class, derive from this generic one, and 1094 /// implement the necessary virtual function members. 1095 struct GenericPluginTy { 1096 1097 /// Construct a plugin instance. 1098 GenericPluginTy(Triple::ArchType TA) 1099 : GlobalHandler(nullptr), JIT(TA), RPCServer(nullptr), 1100 RecordReplay(nullptr) {} 1101 1102 virtual ~GenericPluginTy() {} 1103 1104 /// Initialize the plugin. 1105 Error init(); 1106 1107 /// Initialize the plugin and return the number of available devices. 1108 virtual Expected<int32_t> initImpl() = 0; 1109 1110 /// Deinitialize the plugin and release the resources. 1111 Error deinit(); 1112 virtual Error deinitImpl() = 0; 1113 1114 /// Create a new device for the underlying plugin. 1115 virtual GenericDeviceTy *createDevice(GenericPluginTy &Plugin, 1116 int32_t DeviceID, 1117 int32_t NumDevices) = 0; 1118 1119 /// Create a new global handler for the underlying plugin. 1120 virtual GenericGlobalHandlerTy *createGlobalHandler() = 0; 1121 1122 /// Get the reference to the device with a certain device id. 
1123 GenericDeviceTy &getDevice(int32_t DeviceId) { 1124 assert(isValidDeviceId(DeviceId) && "Invalid device id"); 1125 assert(Devices[DeviceId] && "Device is unitialized"); 1126 1127 return *Devices[DeviceId]; 1128 } 1129 1130 /// Get the number of active devices. 1131 int32_t getNumDevices() const { return NumDevices; } 1132 1133 /// Get the plugin-specific device identifier. 1134 int32_t getUserId(int32_t DeviceId) const { 1135 assert(UserDeviceIds.contains(DeviceId) && "No user-id registered"); 1136 return UserDeviceIds.at(DeviceId); 1137 } 1138 1139 /// Get the ELF code to recognize the binary image of this plugin. 1140 virtual uint16_t getMagicElfBits() const = 0; 1141 1142 /// Get the target triple of this plugin. 1143 virtual Triple::ArchType getTripleArch() const = 0; 1144 1145 /// Get the constant name identifier for this plugin. 1146 virtual const char *getName() const = 0; 1147 1148 /// Allocate a structure using the internal allocator. 1149 template <typename Ty> Ty *allocate() { 1150 return reinterpret_cast<Ty *>(Allocator.Allocate(sizeof(Ty), alignof(Ty))); 1151 } 1152 1153 /// Get the reference to the global handler of this plugin. 1154 GenericGlobalHandlerTy &getGlobalHandler() { 1155 assert(GlobalHandler && "Global handler not initialized"); 1156 return *GlobalHandler; 1157 } 1158 1159 /// Get the reference to the JIT used for all devices connected to this 1160 /// plugin. 1161 JITEngine &getJIT() { return JIT; } 1162 1163 /// Get a reference to the RPC server used to provide host services. 1164 RPCServerTy &getRPCServer() { 1165 assert(RPCServer && "RPC server not initialized"); 1166 return *RPCServer; 1167 } 1168 1169 /// Get a reference to the record and replay interface for the plugin. 1170 RecordReplayTy &getRecordReplay() { 1171 assert(RecordReplay && "RR interface not initialized"); 1172 return *RecordReplay; 1173 } 1174 1175 /// Initialize a device within the plugin. 
1176 Error initDevice(int32_t DeviceId); 1177 1178 /// Deinitialize a device within the plugin and release its resources. 1179 Error deinitDevice(int32_t DeviceId); 1180 1181 /// Indicate whether data can be exchanged directly between two devices under 1182 /// this same plugin. If this function returns true, it's safe to call the 1183 /// GenericDeviceTy::exchangeData() function on the source device. 1184 virtual bool isDataExchangable(int32_t SrcDeviceId, int32_t DstDeviceId) { 1185 return isValidDeviceId(SrcDeviceId) && isValidDeviceId(DstDeviceId); 1186 } 1187 1188 /// Top level interface to verify if a given ELF image can be executed on a 1189 /// given target. Returns true if the \p Image is compatible with the plugin. 1190 Expected<bool> checkELFImage(StringRef Image) const; 1191 1192 /// Return true if the \p Image can be compiled to run on the platform's 1193 /// target architecture. 1194 Expected<bool> checkBitcodeImage(StringRef Image) const; 1195 1196 /// Indicate if an image is compatible with the plugin devices. Notice that 1197 /// this function may be called before actually initializing the devices. So 1198 /// we could not move this function into GenericDeviceTy. 1199 virtual Expected<bool> isELFCompatible(uint32_t DeviceID, 1200 StringRef Image) const = 0; 1201 1202 protected: 1203 /// Indicate whether a device id is valid. 1204 bool isValidDeviceId(int32_t DeviceId) const { 1205 return (DeviceId >= 0 && DeviceId < getNumDevices()); 1206 } 1207 1208 public: 1209 // TODO: This plugin interface needs to be cleaned up. 1210 1211 /// Returns non-zero if the plugin runtime has been initialized. 1212 int32_t is_initialized() const; 1213 1214 /// Returns non-zero if the \p Image is compatible with the plugin. This 1215 /// function does not require the plugin to be initialized before use. 1216 int32_t is_plugin_compatible(__tgt_device_image *Image); 1217 1218 /// Returns non-zero if the \p Image is compatible with the device. 
1219 int32_t is_device_compatible(int32_t DeviceId, __tgt_device_image *Image); 1220 1221 /// Returns non-zero if the plugin device has been initialized. 1222 int32_t is_device_initialized(int32_t DeviceId) const; 1223 1224 /// Initialize the device inside of the plugin. 1225 int32_t init_device(int32_t DeviceId); 1226 1227 /// Return the number of devices this plugin can support. 1228 int32_t number_of_devices(); 1229 1230 /// Returns non-zero if the data can be exchanged between the two devices. 1231 int32_t is_data_exchangable(int32_t SrcDeviceId, int32_t DstDeviceId); 1232 1233 /// Initializes the record and replay mechanism inside the plugin. 1234 int32_t initialize_record_replay(int32_t DeviceId, int64_t MemorySize, 1235 void *VAddr, bool isRecord, bool SaveOutput, 1236 uint64_t &ReqPtrArgOffset); 1237 1238 /// Loads the associated binary into the plugin and returns a handle to it. 1239 int32_t load_binary(int32_t DeviceId, __tgt_device_image *TgtImage, 1240 __tgt_device_binary *Binary); 1241 1242 /// Allocates memory that is accessively to the given device. 1243 void *data_alloc(int32_t DeviceId, int64_t Size, void *HostPtr, int32_t Kind); 1244 1245 /// Deallocates memory on the given device. 1246 int32_t data_delete(int32_t DeviceId, void *TgtPtr, int32_t Kind); 1247 1248 /// Locks / pins host memory using the plugin runtime. 1249 int32_t data_lock(int32_t DeviceId, void *Ptr, int64_t Size, 1250 void **LockedPtr); 1251 1252 /// Unlocks / unpins host memory using the plugin runtime. 1253 int32_t data_unlock(int32_t DeviceId, void *Ptr); 1254 1255 /// Notify the runtime about a new mapping that has been created outside. 1256 int32_t data_notify_mapped(int32_t DeviceId, void *HstPtr, int64_t Size); 1257 1258 /// Notify t he runtime about a mapping that has been deleted. 1259 int32_t data_notify_unmapped(int32_t DeviceId, void *HstPtr); 1260 1261 /// Copy data to the given device. 
1262 int32_t data_submit(int32_t DeviceId, void *TgtPtr, void *HstPtr, 1263 int64_t Size); 1264 1265 /// Copy data to the given device asynchronously. 1266 int32_t data_submit_async(int32_t DeviceId, void *TgtPtr, void *HstPtr, 1267 int64_t Size, __tgt_async_info *AsyncInfoPtr); 1268 1269 /// Copy data from the given device. 1270 int32_t data_retrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr, 1271 int64_t Size); 1272 1273 /// Copy data from the given device asynchornously. 1274 int32_t data_retrieve_async(int32_t DeviceId, void *HstPtr, void *TgtPtr, 1275 int64_t Size, __tgt_async_info *AsyncInfoPtr); 1276 1277 /// Exchange memory addresses between two devices. 1278 int32_t data_exchange(int32_t SrcDeviceId, void *SrcPtr, int32_t DstDeviceId, 1279 void *DstPtr, int64_t Size); 1280 1281 /// Exchange memory addresses between two devices asynchronously. 1282 int32_t data_exchange_async(int32_t SrcDeviceId, void *SrcPtr, 1283 int DstDeviceId, void *DstPtr, int64_t Size, 1284 __tgt_async_info *AsyncInfo); 1285 1286 /// Begin executing a kernel on the given device. 1287 int32_t launch_kernel(int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs, 1288 ptrdiff_t *TgtOffsets, KernelArgsTy *KernelArgs, 1289 __tgt_async_info *AsyncInfoPtr); 1290 1291 /// Synchronize an asyncrhonous queue with the plugin runtime. 1292 int32_t synchronize(int32_t DeviceId, __tgt_async_info *AsyncInfoPtr); 1293 1294 /// Query the current state of an asynchronous queue. 1295 int32_t query_async(int32_t DeviceId, __tgt_async_info *AsyncInfoPtr); 1296 1297 /// Prints information about the given devices supported by the plugin. 1298 void print_device_info(int32_t DeviceId); 1299 1300 /// Creates an event in the given plugin if supported. 1301 int32_t create_event(int32_t DeviceId, void **EventPtr); 1302 1303 /// Records an event that has occurred. 1304 int32_t record_event(int32_t DeviceId, void *EventPtr, 1305 __tgt_async_info *AsyncInfoPtr); 1306 1307 /// Wait until an event has occurred. 
1308 int32_t wait_event(int32_t DeviceId, void *EventPtr, 1309 __tgt_async_info *AsyncInfoPtr); 1310 1311 /// Syncrhonize execution until an event is done. 1312 int32_t sync_event(int32_t DeviceId, void *EventPtr); 1313 1314 /// Remove the event from the plugin. 1315 int32_t destroy_event(int32_t DeviceId, void *EventPtr); 1316 1317 /// Remove the event from the plugin. 1318 void set_info_flag(uint32_t NewInfoLevel); 1319 1320 /// Creates an asynchronous queue for the given plugin. 1321 int32_t init_async_info(int32_t DeviceId, __tgt_async_info **AsyncInfoPtr); 1322 1323 /// Creates device information to be used for diagnostics. 1324 int32_t init_device_info(int32_t DeviceId, __tgt_device_info *DeviceInfo, 1325 const char **ErrStr); 1326 1327 /// Sets the offset into the devices for use by OMPT. 1328 int32_t set_device_identifier(int32_t UserId, int32_t DeviceId); 1329 1330 /// Returns if the plugin can support auotmatic copy. 1331 int32_t use_auto_zero_copy(int32_t DeviceId); 1332 1333 /// Look up a global symbol in the given binary. 1334 int32_t get_global(__tgt_device_binary Binary, uint64_t Size, 1335 const char *Name, void **DevicePtr); 1336 1337 /// Look up a kernel function in the given binary. 1338 int32_t get_function(__tgt_device_binary Binary, const char *Name, 1339 void **KernelPtr); 1340 1341 private: 1342 /// Indicates if the platform runtime has been fully initialized. 1343 bool Initialized = false; 1344 1345 /// Number of devices available for the plugin. 1346 int32_t NumDevices = 0; 1347 1348 /// Map of plugin device identifiers to the user device identifier. 1349 llvm::DenseMap<int32_t, int32_t> UserDeviceIds; 1350 1351 /// Array of pointers to the devices. Initially, they are all set to nullptr. 1352 /// Once a device is initialized, the pointer is stored in the position given 1353 /// by its device id. A position with nullptr means that the corresponding 1354 /// device was not initialized yet. 
1355 llvm::SmallVector<GenericDeviceTy *> Devices; 1356 1357 /// Pointer to the global handler for this plugin. 1358 GenericGlobalHandlerTy *GlobalHandler; 1359 1360 /// Internal allocator for different structures. 1361 BumpPtrAllocator Allocator; 1362 1363 /// The JIT engine shared by all devices connected to this plugin. 1364 JITEngine JIT; 1365 1366 /// The interface between the plugin and the GPU for host services. 1367 RPCServerTy *RPCServer; 1368 1369 /// The interface between the plugin and the GPU for host services. 1370 RecordReplayTy *RecordReplay; 1371 }; 1372 1373 namespace Plugin { 1374 /// Create a success error. This is the same as calling Error::success(), but 1375 /// it is recommended to use this one for consistency with Plugin::error() and 1376 /// Plugin::check(). 1377 static inline Error success() { return Error::success(); } 1378 1379 /// Create a string error. 1380 template <typename... ArgsTy> 1381 static Error error(const char *ErrFmt, ArgsTy... Args) { 1382 return createStringError(inconvertibleErrorCode(), ErrFmt, Args...); 1383 } 1384 1385 /// Check the plugin-specific error code and return an error or success 1386 /// accordingly. In case of an error, create a string error with the error 1387 /// description. The ErrFmt should follow the format: 1388 /// "Error in <function name>[<optional info>]: %s" 1389 /// The last format specifier "%s" is mandatory and will be used to place the 1390 /// error code's description. Notice this function should be only called from 1391 /// the plugin-specific code. 1392 /// TODO: Refactor this, must be defined individually by each plugin. 1393 template <typename... ArgsTy> 1394 static Error check(int32_t ErrorCode, const char *ErrFmt, ArgsTy... Args); 1395 } // namespace Plugin 1396 1397 /// Auxiliary interface class for GenericDeviceResourceManagerTy. This class 1398 /// acts as a reference to a device resource, such as a stream, and requires 1399 /// some basic functions to be implemented. 
The derived class should define an 1400 /// empty constructor that creates an empty and invalid resource reference. Do 1401 /// not create a new resource on the ctor, but on the create() function instead. 1402 /// 1403 /// The derived class should also define the type HandleTy as the underlying 1404 /// resource handle type. For instance, in a CUDA stream it would be: 1405 /// using HandleTy = CUstream; 1406 struct GenericDeviceResourceRef { 1407 /// Create a new resource and stores a reference. 1408 virtual Error create(GenericDeviceTy &Device) = 0; 1409 1410 /// Destroy and release the resources pointed by the reference. 1411 virtual Error destroy(GenericDeviceTy &Device) = 0; 1412 1413 protected: 1414 ~GenericDeviceResourceRef() = default; 1415 }; 1416 1417 /// Class that implements a resource pool belonging to a device. This class 1418 /// operates with references to the actual resources. These reference must 1419 /// derive from the GenericDeviceResourceRef class and implement the create 1420 /// and destroy virtual functions. 1421 template <typename ResourceRef> class GenericDeviceResourceManagerTy { 1422 using ResourcePoolTy = GenericDeviceResourceManagerTy<ResourceRef>; 1423 using ResourceHandleTy = typename ResourceRef::HandleTy; 1424 1425 public: 1426 /// Create an empty resource pool for a specific device. 1427 GenericDeviceResourceManagerTy(GenericDeviceTy &Device) 1428 : Device(Device), NextAvailable(0) {} 1429 1430 /// Destroy the resource pool. At this point, the deinit() function should 1431 /// already have been executed so the resource pool should be empty. 1432 virtual ~GenericDeviceResourceManagerTy() { 1433 assert(ResourcePool.empty() && "Resource pool not empty"); 1434 } 1435 1436 /// Initialize the resource pool. 
1437 Error init(uint32_t InitialSize) { 1438 assert(ResourcePool.empty() && "Resource pool already initialized"); 1439 return ResourcePoolTy::resizeResourcePool(InitialSize); 1440 } 1441 1442 /// Deinitialize the resource pool and delete all resources. This function 1443 /// must be called before the destructor. 1444 virtual Error deinit() { 1445 if (NextAvailable) 1446 DP("Missing %d resources to be returned\n", NextAvailable); 1447 1448 // TODO: This prevents a bug on libomptarget to make the plugins fail. There 1449 // may be some resources not returned. Do not destroy these ones. 1450 if (auto Err = ResourcePoolTy::resizeResourcePool(NextAvailable)) 1451 return Err; 1452 1453 ResourcePool.clear(); 1454 1455 return Plugin::success(); 1456 } 1457 1458 /// Get a resource from the pool or create new ones. If the function 1459 /// succeeds, the handle to the resource is saved in \p Handle. 1460 virtual Error getResource(ResourceHandleTy &Handle) { 1461 // Get a resource with an empty resource processor. 1462 return getResourcesImpl(1, &Handle, 1463 [](ResourceHandleTy) { return Plugin::success(); }); 1464 } 1465 1466 /// Get multiple resources from the pool or create new ones. If the function 1467 /// succeeds, the handles to the resources are saved in \p Handles. 1468 virtual Error getResources(uint32_t Num, ResourceHandleTy *Handles) { 1469 // Get resources with an empty resource processor. 1470 return getResourcesImpl(Num, Handles, 1471 [](ResourceHandleTy) { return Plugin::success(); }); 1472 } 1473 1474 /// Return resource to the pool. 1475 virtual Error returnResource(ResourceHandleTy Handle) { 1476 // Return a resource with an empty resource processor. 1477 return returnResourceImpl( 1478 Handle, [](ResourceHandleTy) { return Plugin::success(); }); 1479 } 1480 1481 protected: 1482 /// Get multiple resources from the pool or create new ones. If the function 1483 /// succeeds, the handles to the resources are saved in \p Handles. 
Also 1484 /// process each of the obtained resources with \p Processor. 1485 template <typename FuncTy> 1486 Error getResourcesImpl(uint32_t Num, ResourceHandleTy *Handles, 1487 FuncTy Processor) { 1488 const std::lock_guard<std::mutex> Lock(Mutex); 1489 1490 assert(NextAvailable <= ResourcePool.size() && 1491 "Resource pool is corrupted"); 1492 1493 if (NextAvailable + Num > ResourcePool.size()) 1494 // Double the resource pool or resize it to provide the requested ones. 1495 if (auto Err = ResourcePoolTy::resizeResourcePool( 1496 std::max(NextAvailable * 2, NextAvailable + Num))) 1497 return Err; 1498 1499 // Save the handles in the output array parameter. 1500 for (uint32_t r = 0; r < Num; ++r) 1501 Handles[r] = ResourcePool[NextAvailable + r]; 1502 1503 // Process all obtained resources. 1504 for (uint32_t r = 0; r < Num; ++r) 1505 if (auto Err = Processor(Handles[r])) 1506 return Err; 1507 1508 NextAvailable += Num; 1509 1510 return Plugin::success(); 1511 } 1512 1513 /// Return resource to the pool and process the resource with \p Processor. 1514 template <typename FuncTy> 1515 Error returnResourceImpl(ResourceHandleTy Handle, FuncTy Processor) { 1516 const std::lock_guard<std::mutex> Lock(Mutex); 1517 1518 // Process the returned resource. 1519 if (auto Err = Processor(Handle)) 1520 return Err; 1521 1522 assert(NextAvailable > 0 && "Resource pool is corrupted"); 1523 ResourcePool[--NextAvailable] = Handle; 1524 1525 return Plugin::success(); 1526 } 1527 1528 protected: 1529 /// The resources between \p OldSize and \p NewSize need to be created or 1530 /// destroyed. The mutex is locked when this function is called. 1531 Error resizeResourcePoolImpl(uint32_t OldSize, uint32_t NewSize) { 1532 assert(OldSize != NewSize && "Resizing to the same size"); 1533 1534 if (auto Err = Device.setContext()) 1535 return Err; 1536 1537 if (OldSize < NewSize) { 1538 // Create new resources. 
1539 for (uint32_t I = OldSize; I < NewSize; ++I) { 1540 if (auto Err = ResourcePool[I].create(Device)) 1541 return Err; 1542 } 1543 } else { 1544 // Destroy the obsolete resources. 1545 for (uint32_t I = NewSize; I < OldSize; ++I) { 1546 if (auto Err = ResourcePool[I].destroy(Device)) 1547 return Err; 1548 } 1549 } 1550 return Plugin::success(); 1551 } 1552 1553 /// Increase or decrease the number of resources. This function should 1554 /// be called with the mutex acquired. 1555 Error resizeResourcePool(uint32_t NewSize) { 1556 uint32_t OldSize = ResourcePool.size(); 1557 1558 // Nothing to do. 1559 if (OldSize == NewSize) 1560 return Plugin::success(); 1561 1562 if (OldSize < NewSize) { 1563 // Increase the number of resources. 1564 ResourcePool.resize(NewSize); 1565 return ResourcePoolTy::resizeResourcePoolImpl(OldSize, NewSize); 1566 } 1567 1568 // Decrease the number of resources otherwise. 1569 auto Err = ResourcePoolTy::resizeResourcePoolImpl(OldSize, NewSize); 1570 ResourcePool.resize(NewSize); 1571 1572 return Err; 1573 } 1574 1575 /// The device to which the resources belong 1576 GenericDeviceTy &Device; 1577 1578 /// Mutex for the resource pool. 1579 std::mutex Mutex; 1580 1581 /// The next available resource in the pool. 1582 uint32_t NextAvailable; 1583 1584 /// The actual resource pool. 1585 std::deque<ResourceRef> ResourcePool; 1586 }; 1587 1588 } // namespace plugin 1589 } // namespace target 1590 } // namespace omp 1591 } // namespace llvm 1592 1593 #endif // OPENMP_LIBOMPTARGET_PLUGINS_COMMON_PLUGININTERFACE_H 1594