//===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// RTL NextGen for CUDA machine
//
//===----------------------------------------------------------------------===//

#include <cassert>
#include <cstddef>
#include <cuda.h>
#include <string>
#include <unordered_map>

#include "Shared/APITypes.h"
#include "Shared/Debug.h"
#include "Shared/Environment.h"

#include "GlobalHandler.h"
#include "OpenMP/OMPT/Callback.h"
#include "PluginInterface.h"
#include "Utils/ELF.h"

#include "llvm/BinaryFormat/ELF.h"
#include "llvm/Frontend/OpenMP/OMPConstants.h"
#include "llvm/Frontend/OpenMP/OMPGridValues.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/FileOutputBuffer.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/Program.h"

namespace llvm {
namespace omp {
namespace target {
namespace plugin {

/// Forward declarations for all specialized data structures.
struct CUDAKernelTy;
struct CUDADeviceTy;
struct CUDAPluginTy;

#if (defined(CUDA_VERSION) && (CUDA_VERSION < 11000))
/// Forward declarations for all Virtual Memory Management
/// related data structures and functions. This is necessary
/// for older CUDA versions.
typedef void *CUmemGenericAllocationHandle;
typedef void *CUmemAllocationProp;
typedef void *CUmemAccessDesc;
typedef void *CUmemAllocationGranularity_flags;

// Stub implementations so the plugin still builds against pre-11.0 CUDA.
// These must never be reached because supportVAManagement() reports false.
CUresult cuMemAddressReserve(CUdeviceptr *ptr, size_t size, size_t alignment,
                             CUdeviceptr addr, unsigned long long flags) {
  return CUDA_ERROR_NOT_SUPPORTED;
}

CUresult cuMemMap(CUdeviceptr ptr, size_t size, size_t offset,
                  CUmemGenericAllocationHandle handle,
                  unsigned long long flags) {
  return CUDA_ERROR_NOT_SUPPORTED;
}

CUresult cuMemCreate(CUmemGenericAllocationHandle *handle, size_t size,
                     const CUmemAllocationProp *prop,
                     unsigned long long flags) {
  return CUDA_ERROR_NOT_SUPPORTED;
}

CUresult cuMemSetAccess(CUdeviceptr ptr, size_t size,
                        const CUmemAccessDesc *desc, size_t count) {
  return CUDA_ERROR_NOT_SUPPORTED;
}

CUresult
cuMemGetAllocationGranularity(size_t *granularity,
                              const CUmemAllocationProp *prop,
                              CUmemAllocationGranularity_flags option) {
  return CUDA_ERROR_NOT_SUPPORTED;
}
#endif

#if (defined(CUDA_VERSION) && (CUDA_VERSION < 11020))
// Forward declarations of asynchronous memory management functions. This is
// necessary for older versions of CUDA.
CUresult cuMemAllocAsync(CUdeviceptr *ptr, size_t, CUstream) {
  *ptr = 0;
  return CUDA_ERROR_NOT_SUPPORTED;
}

CUresult cuMemFreeAsync(CUdeviceptr dptr, CUstream hStream) {
  return CUDA_ERROR_NOT_SUPPORTED;
}
#endif

/// Class implementing the CUDA device images properties.
struct CUDADeviceImageTy : public DeviceImageTy {
  /// Create the CUDA image with the id and the target image pointer.
  CUDADeviceImageTy(int32_t ImageId, GenericDeviceTy &Device,
                    const __tgt_device_image *TgtImage)
      : DeviceImageTy(ImageId, Device, TgtImage), Module(nullptr) {}

  /// Load the image as a CUDA module.
  Error loadModule() {
    assert(!Module && "Module already loaded");
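    // cuModuleLoadDataEx is invoked with no JIT options (0, nullptr, nullptr).
    // A variant that captures the driver's JIT error log could look like the
    // following sketch (hypothetical, not part of this plugin):
    //
    //   char ErrLog[4096];
    //   CUjit_option Opts[] = {CU_JIT_ERROR_LOG_BUFFER,
    //                          CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES};
    //   void *OptVals[] = {ErrLog, (void *)sizeof(ErrLog)};
    //   cuModuleLoadDataEx(&Module, getStart(), 2, Opts, OptVals);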
    CUresult Res =
        cuModuleLoadDataEx(&Module, getStart(), 0, nullptr, nullptr);
    if (auto Err = Plugin::check(Res, "Error in cuModuleLoadDataEx: %s"))
      return Err;

    return Plugin::success();
  }

  /// Unload the CUDA module corresponding to the image.
  Error unloadModule() {
    assert(Module && "Module not loaded");

    CUresult Res = cuModuleUnload(Module);
    if (auto Err = Plugin::check(Res, "Error in cuModuleUnload: %s"))
      return Err;

    Module = nullptr;
    return Plugin::success();
  }

  /// Getter of the CUDA module.
  CUmodule getModule() const { return Module; }

private:
  /// The CUDA module generated from the image.
  CUmodule Module;
};

/// Class implementing the CUDA kernel functionalities which derives from the
/// generic kernel class.
struct CUDAKernelTy : public GenericKernelTy {
  /// Create a CUDA kernel with a name and an execution mode.
  CUDAKernelTy(const char *Name) : GenericKernelTy(Name), Func(nullptr) {}

  /// Initialize the CUDA kernel.
  Error initImpl(GenericDeviceTy &GenericDevice,
                 DeviceImageTy &Image) override {
    CUresult Res;
    CUDADeviceImageTy &CUDAImage = static_cast<CUDADeviceImageTy &>(Image);

    // Retrieve the function pointer of the kernel.
    Res = cuModuleGetFunction(&Func, CUDAImage.getModule(), getName());
    if (auto Err = Plugin::check(Res, "Error in cuModuleGetFunction('%s'): %s",
                                 getName()))
      return Err;

    // Check that the function pointer is valid.
    if (!Func)
      return Plugin::error("Invalid function for kernel %s", getName());

    int MaxThreads;
    Res = cuFuncGetAttribute(&MaxThreads,
                             CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, Func);
    if (auto Err = Plugin::check(Res, "Error in cuFuncGetAttribute: %s"))
      return Err;
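    // CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK reflects per-kernel limits such
    // as register pressure or explicit launch bounds. For example (numbers
    // illustrative): a kernel compiled with __launch_bounds__(256) reports
    // MaxThreads == 256 even if the device's hardware block limit is 1024, so
    // the clamp below keeps launches within the kernel's real limit.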
    // The maximum number of threads cannot exceed the maximum of the kernel.
    MaxNumThreads = std::min(MaxNumThreads, (uint32_t)MaxThreads);

    return Plugin::success();
  }

  /// Launch the CUDA kernel function.
  Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads[3],
                   uint32_t NumBlocks[3], KernelArgsTy &KernelArgs,
                   KernelLaunchParamsTy LaunchParams,
                   AsyncInfoWrapperTy &AsyncInfoWrapper) const override;

private:
  /// The CUDA kernel function to execute.
  CUfunction Func;
};

/// Class wrapping a CUDA stream reference. These are the objects handled by
/// the Stream Manager for the CUDA plugin.
struct CUDAStreamRef final : public GenericDeviceResourceRef {
  /// The underlying handle type for streams.
  using HandleTy = CUstream;

  /// Create an empty reference to an invalid stream.
  CUDAStreamRef() : Stream(nullptr) {}

  /// Create a reference to an existing stream.
  CUDAStreamRef(HandleTy Stream) : Stream(Stream) {}

  /// Create a new stream and save the reference. The reference must be empty
  /// before calling this function.
  Error create(GenericDeviceTy &Device) override {
    if (Stream)
      return Plugin::error("Creating an existing stream");

    CUresult Res = cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING);
    if (auto Err = Plugin::check(Res, "Error in cuStreamCreate: %s"))
      return Err;

    return Plugin::success();
  }

  /// Destroy the referenced stream and invalidate the reference. The reference
  /// must be to a valid stream before calling this function.
  Error destroy(GenericDeviceTy &Device) override {
    if (!Stream)
      return Plugin::error("Destroying an invalid stream");

    CUresult Res = cuStreamDestroy(Stream);
    if (auto Err = Plugin::check(Res, "Error in cuStreamDestroy: %s"))
      return Err;

    Stream = nullptr;
    return Plugin::success();
  }

  /// Get the underlying CUDA stream.
  operator HandleTy() const { return Stream; }

private:
  /// The reference to the CUDA stream.
  HandleTy Stream;
};

/// Class wrapping a CUDA event reference. These are the objects handled by the
/// Event Manager for the CUDA plugin.
struct CUDAEventRef final : public GenericDeviceResourceRef {
  /// The underlying handle type for events.
  using HandleTy = CUevent;

  /// Create an empty reference to an invalid event.
  CUDAEventRef() : Event(nullptr) {}

  /// Create a reference to an existing event.
  CUDAEventRef(HandleTy Event) : Event(Event) {}

  /// Create a new event and save the reference. The reference must be empty
  /// before calling this function.
  Error create(GenericDeviceTy &Device) override {
    if (Event)
      return Plugin::error("Creating an existing event");

    CUresult Res = cuEventCreate(&Event, CU_EVENT_DEFAULT);
    if (auto Err = Plugin::check(Res, "Error in cuEventCreate: %s"))
      return Err;

    return Plugin::success();
  }

  /// Destroy the referenced event and invalidate the reference. The reference
  /// must be to a valid event before calling this function.
  Error destroy(GenericDeviceTy &Device) override {
    if (!Event)
      return Plugin::error("Destroying an invalid event");

    CUresult Res = cuEventDestroy(Event);
    if (auto Err = Plugin::check(Res, "Error in cuEventDestroy: %s"))
      return Err;

    Event = nullptr;
    return Plugin::success();
  }

  /// Get the underlying CUevent.
  operator HandleTy() const { return Event; }

private:
  /// The reference to the CUDA event.
  HandleTy Event;
};
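// These pooled events are used to order work across streams. A minimal usage
// sketch (hypothetical, using raw driver API calls rather than the managers
// below): record an event on a producer stream and make a consumer stream
// wait on it.
//
//   CUevent Event;
//   cuEventCreate(&Event, CU_EVENT_DEFAULT);
//   cuEventRecord(Event, ProducerStream);
//   cuStreamWaitEvent(ConsumerStream, Event, 0);
//
// recordEventImpl() and waitEventImpl() in CUDADeviceTy implement exactly this
// pairing on top of the pooled events.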
/// Class implementing the CUDA device functionalities which derives from the
/// generic device class.
struct CUDADeviceTy : public GenericDeviceTy {
  // Create a CUDA device with a device id and the default CUDA grid values.
  CUDADeviceTy(GenericPluginTy &Plugin, int32_t DeviceId, int32_t NumDevices)
      : GenericDeviceTy(Plugin, DeviceId, NumDevices, NVPTXGridValues),
        CUDAStreamManager(*this), CUDAEventManager(*this) {}

  ~CUDADeviceTy() {}

  /// Initialize the device, its resources and get its properties.
  Error initImpl(GenericPluginTy &Plugin) override {
    CUresult Res = cuDeviceGet(&Device, DeviceId);
    if (auto Err = Plugin::check(Res, "Error in cuDeviceGet: %s"))
      return Err;

    // Query the current flags of the primary context and set its flags if
    // it is inactive.
    unsigned int FormerPrimaryCtxFlags = 0;
    int FormerPrimaryCtxIsActive = 0;
    Res = cuDevicePrimaryCtxGetState(Device, &FormerPrimaryCtxFlags,
                                     &FormerPrimaryCtxIsActive);
    if (auto Err =
            Plugin::check(Res, "Error in cuDevicePrimaryCtxGetState: %s"))
      return Err;

    if (FormerPrimaryCtxIsActive) {
      INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
           "The primary context is active, no change to its flags\n");
      if ((FormerPrimaryCtxFlags & CU_CTX_SCHED_MASK) !=
          CU_CTX_SCHED_BLOCKING_SYNC)
        INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
             "Warning: The current flags are not CU_CTX_SCHED_BLOCKING_SYNC\n");
    } else {
      INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
           "The primary context is inactive, set its flags to "
           "CU_CTX_SCHED_BLOCKING_SYNC\n");
      Res = cuDevicePrimaryCtxSetFlags(Device, CU_CTX_SCHED_BLOCKING_SYNC);
      if (auto Err =
              Plugin::check(Res, "Error in cuDevicePrimaryCtxSetFlags: %s"))
        return Err;
    }

    // Retain the per device primary context and save it to use whenever this
    // device is selected.
    Res = cuDevicePrimaryCtxRetain(&Context, Device);
    if (auto Err = Plugin::check(Res, "Error in cuDevicePrimaryCtxRetain: %s"))
      return Err;

    if (auto Err = setContext())
      return Err;

    // Initialize the stream pool.
    if (auto Err = CUDAStreamManager.init(OMPX_InitialNumStreams))
      return Err;

    // Initialize the event pool.
    if (auto Err = CUDAEventManager.init(OMPX_InitialNumEvents))
      return Err;

    // Query attributes to determine the number of threads/block and
    // blocks/grid.
    if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
                                 GridValues.GV_Max_Teams))
      return Err;

    if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X,
                                 GridValues.GV_Max_WG_Size))
      return Err;

    if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_WARP_SIZE,
                                 GridValues.GV_Warp_Size))
      return Err;

    if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
                                 ComputeCapability.Major))
      return Err;

    if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
                                 ComputeCapability.Minor))
      return Err;
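    // The hardware parallelism computed below is the maximum number of warps
    // resident across the whole device: the SM count times the warps that fit
    // on one SM. As an illustration (numbers hypothetical, they vary per
    // architecture): a device with 108 SMs, 2048 maximum threads per SM, and
    // a warp size of 32 yields 108 * (2048 / 32) = 6912 warps.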
    uint32_t NumMultiprocessors = 0;
    uint32_t MaxThreadsPerSM = 0;
    uint32_t WarpSize = 0;
    if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
                                 NumMultiprocessors))
      return Err;
    if (auto Err =
            getDeviceAttr(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR,
                          MaxThreadsPerSM))
      return Err;
    if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_WARP_SIZE, WarpSize))
      return Err;
    HardwareParallelism = NumMultiprocessors * (MaxThreadsPerSM / WarpSize);

    return Plugin::success();
  }

  /// Deinitialize the device and release its resources.
  Error deinitImpl() override {
    if (Context) {
      if (auto Err = setContext())
        return Err;
    }

    // Deinitialize the stream and event managers.
    if (auto Err = CUDAStreamManager.deinit())
      return Err;

    if (auto Err = CUDAEventManager.deinit())
      return Err;

    // Close modules if necessary.
    if (!LoadedImages.empty()) {
      assert(Context && "Invalid CUDA context");

      // Each image has its own module.
      for (DeviceImageTy *Image : LoadedImages) {
        CUDADeviceImageTy &CUDAImage = static_cast<CUDADeviceImageTy &>(*Image);

        // Unload the module of the image.
        if (auto Err = CUDAImage.unloadModule())
          return Err;
      }
    }

    if (Context) {
      CUresult Res = cuDevicePrimaryCtxRelease(Device);
      if (auto Err =
              Plugin::check(Res, "Error in cuDevicePrimaryCtxRelease: %s"))
        return Err;
    }

    // Invalidate context and device references.
    Context = nullptr;
    Device = CU_DEVICE_INVALID;

    return Plugin::success();
  }

  virtual Error callGlobalConstructors(GenericPluginTy &Plugin,
                                       DeviceImageTy &Image) override {
    // Check for the presence of global destructors at initialization time.
    // This is required when the image may be deallocated before the
    // destructors are run.
    GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler();
    if (Handler.isSymbolInImage(*this, Image, "nvptx$device$fini"))
      Image.setPendingGlobalDtors();

    return callGlobalCtorDtorCommon(Plugin, Image, /*IsCtor=*/true);
  }

  virtual Error callGlobalDestructors(GenericPluginTy &Plugin,
                                      DeviceImageTy &Image) override {
    if (Image.hasPendingGlobalDtors())
      return callGlobalCtorDtorCommon(Plugin, Image, /*IsCtor=*/false);
    return Plugin::success();
  }

  Expected<std::unique_ptr<MemoryBuffer>>
  doJITPostProcessing(std::unique_ptr<MemoryBuffer> MB) const override {
    // TODO: We should be able to use the 'nvidia-ptxjitcompiler' interface to
    // avoid the call to 'ptxas'.
    SmallString<128> PTXInputFilePath;
    std::error_code EC = sys::fs::createTemporaryFile("nvptx-pre-link-jit",
                                                      "s", PTXInputFilePath);
    if (EC)
      return Plugin::error("Failed to create temporary file for ptxas");

    // Write the file's contents to the output file.
    Expected<std::unique_ptr<FileOutputBuffer>> OutputOrErr =
        FileOutputBuffer::create(PTXInputFilePath, MB->getBuffer().size());
    if (!OutputOrErr)
      return OutputOrErr.takeError();
    std::unique_ptr<FileOutputBuffer> Output = std::move(*OutputOrErr);
    llvm::copy(MB->getBuffer(), Output->getBufferStart());
    if (Error E = Output->commit())
      return std::move(E);

    SmallString<128> PTXOutputFilePath;
    EC = sys::fs::createTemporaryFile("nvptx-post-link-jit", "cubin",
                                      PTXOutputFilePath);
    if (EC)
      return Plugin::error("Failed to create temporary file for ptxas");

    // Try to find `ptxas` in the path to compile the PTX to a binary.
    const auto ErrorOrPath = sys::findProgramByName("ptxas");
    if (!ErrorOrPath)
      return Plugin::error("Failed to find 'ptxas' on the PATH.");

    std::string Arch = getComputeUnitKind();
    StringRef Args[] = {*ErrorOrPath,
                        "-m64",
                        "-O2",
                        "--gpu-name",
                        Arch,
                        "--output-file",
                        PTXOutputFilePath,
                        PTXInputFilePath};
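    // The invocation below is equivalent to running, e.g. for an sm_80 device
    // (file paths illustrative):
    //
    //   ptxas -m64 -O2 --gpu-name sm_80 \
    //       --output-file /tmp/nvptx-post-link-jit-XXXX.cubin \
    //       /tmp/nvptx-pre-link-jit-XXXX.s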
    std::string ErrMsg;
    if (sys::ExecuteAndWait(*ErrorOrPath, Args, std::nullopt, {}, 0, 0,
                            &ErrMsg))
      return Plugin::error("Running 'ptxas' failed: %s\n", ErrMsg.c_str());

    auto BufferOrErr = MemoryBuffer::getFileOrSTDIN(PTXOutputFilePath.data());
    if (!BufferOrErr)
      return Plugin::error("Failed to open temporary file for ptxas");

    // Clean up the temporary files afterwards.
    if (sys::fs::remove(PTXOutputFilePath))
      return Plugin::error("Failed to remove temporary file for ptxas");
    if (sys::fs::remove(PTXInputFilePath))
      return Plugin::error("Failed to remove temporary file for ptxas");

    return std::move(*BufferOrErr);
  }

  /// Allocate and construct a CUDA kernel.
  Expected<GenericKernelTy &> constructKernel(const char *Name) override {
    // Allocate and construct the CUDA kernel.
    CUDAKernelTy *CUDAKernel = Plugin.allocate<CUDAKernelTy>();
    if (!CUDAKernel)
      return Plugin::error("Failed to allocate memory for CUDA kernel");

    new (CUDAKernel) CUDAKernelTy(Name);

    return *CUDAKernel;
  }

  /// Set the current context to this device's context.
  Error setContext() override {
    CUresult Res = cuCtxSetCurrent(Context);
    return Plugin::check(Res, "Error in cuCtxSetCurrent: %s");
  }

  /// NVIDIA returns the product of the SM count and the number of warps that
  /// fit if the maximum number of threads were scheduled on each SM.
  uint64_t getHardwareParallelism() const override {
    return HardwareParallelism;
  }

  /// We want to set up the RPC server for host services to the GPU if it is
  /// available.
  bool shouldSetupRPCServer() const override { return true; }

  /// The RPC interface should have enough space for all available parallelism.
  uint64_t requestedRPCPortCount() const override {
    return getHardwareParallelism();
  }

  /// Get the stream of the asynchronous info structure or get a new one.
  Error getStream(AsyncInfoWrapperTy &AsyncInfoWrapper, CUstream &Stream) {
    // Get the stream (if any) from the async info.
    Stream = AsyncInfoWrapper.getQueueAs<CUstream>();
    if (!Stream) {
      // There was no stream; get an idle one.
      if (auto Err = CUDAStreamManager.getResource(Stream))
        return Err;

      // Modify the async info's stream.
      AsyncInfoWrapper.setQueueAs<CUstream>(Stream);
    }
    return Plugin::success();
  }

  /// Getters of CUDA references.
  CUcontext getCUDAContext() const { return Context; }
  CUdevice getCUDADevice() const { return Device; }

  /// Load the binary image into the device and allocate an image object.
  Expected<DeviceImageTy *> loadBinaryImpl(const __tgt_device_image *TgtImage,
                                           int32_t ImageId) override {
    if (auto Err = setContext())
      return std::move(Err);

    // Allocate and initialize the image object.
    CUDADeviceImageTy *CUDAImage = Plugin.allocate<CUDADeviceImageTy>();
    new (CUDAImage) CUDADeviceImageTy(ImageId, *this, TgtImage);

    // Load the CUDA module.
    if (auto Err = CUDAImage->loadModule())
      return std::move(Err);

    return CUDAImage;
  }

  /// Allocate memory on the device or related to the device.
  void *allocate(size_t Size, void *, TargetAllocTy Kind) override {
    if (Size == 0)
      return nullptr;

    if (auto Err = setContext()) {
      REPORT("Failure to alloc memory: %s\n", toString(std::move(Err)).data());
      return nullptr;
    }

    void *MemAlloc = nullptr;
    CUdeviceptr DevicePtr;
    CUresult Res;

    switch (Kind) {
    case TARGET_ALLOC_DEFAULT:
    case TARGET_ALLOC_DEVICE:
      Res = cuMemAlloc(&DevicePtr, Size);
      MemAlloc = (void *)DevicePtr;
      break;
    case TARGET_ALLOC_HOST:
      Res = cuMemAllocHost(&MemAlloc, Size);
      break;
    case TARGET_ALLOC_SHARED:
      Res = cuMemAllocManaged(&DevicePtr, Size, CU_MEM_ATTACH_GLOBAL);
      MemAlloc = (void *)DevicePtr;
      break;
    case TARGET_ALLOC_DEVICE_NON_BLOCKING: {
      CUstream Stream;
      if ((Res = cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING)))
        break;
      if ((Res = cuMemAllocAsync(&DevicePtr, Size, Stream)))
        break;
      cuStreamSynchronize(Stream);
      Res = cuStreamDestroy(Stream);
      MemAlloc = (void *)DevicePtr;
    }
    }

    if (auto Err =
            Plugin::check(Res, "Error in cuMemAlloc[Host|Managed]: %s")) {
      REPORT("Failure to alloc memory: %s\n", toString(std::move(Err)).data());
      return nullptr;
    }
    return MemAlloc;
  }

  /// Deallocate memory on the device or related to the device.
  int free(void *TgtPtr, TargetAllocTy Kind) override {
    if (TgtPtr == nullptr)
      return OFFLOAD_SUCCESS;

    if (auto Err = setContext()) {
      REPORT("Failure to free memory: %s\n", toString(std::move(Err)).data());
      return OFFLOAD_FAIL;
    }

    CUresult Res;
    switch (Kind) {
    case TARGET_ALLOC_DEFAULT:
    case TARGET_ALLOC_DEVICE:
    case TARGET_ALLOC_SHARED:
      Res = cuMemFree((CUdeviceptr)TgtPtr);
      break;
    case TARGET_ALLOC_HOST:
      Res = cuMemFreeHost(TgtPtr);
      break;
    case TARGET_ALLOC_DEVICE_NON_BLOCKING: {
      CUstream Stream;
      if ((Res = cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING)))
        break;
      cuMemFreeAsync(reinterpret_cast<CUdeviceptr>(TgtPtr), Stream);
      cuStreamSynchronize(Stream);
      if ((Res = cuStreamDestroy(Stream)))
        break;
    }
    }

    if (auto Err = Plugin::check(Res, "Error in cuMemFree[Host]: %s")) {
      REPORT("Failure to free memory: %s\n", toString(std::move(Err)).data());
      return OFFLOAD_FAIL;
    }
    return OFFLOAD_SUCCESS;
  }

  /// Synchronize current thread with the pending operations on the async info.
  Error synchronizeImpl(__tgt_async_info &AsyncInfo) override {
    CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo.Queue);
    CUresult Res;
    Res = cuStreamSynchronize(Stream);

    // Once the stream is synchronized, return it to the stream pool and reset
    // AsyncInfo. This is to make sure the synchronization only works for its
    // own tasks.
    AsyncInfo.Queue = nullptr;
    if (auto Err = CUDAStreamManager.returnResource(Stream))
      return Err;

    return Plugin::check(Res, "Error in cuStreamSynchronize: %s");
  }

  /// CUDA supports VA management since 11.0.
  bool supportVAManagement() const override {
#if (defined(CUDA_VERSION) && (CUDA_VERSION >= 11000))
    return true;
#else
    return false;
#endif
  }
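  // The two functions below drive the CUDA Virtual Memory Management API. The
  // map path chains four driver calls; a minimal standalone sketch of the same
  // sequence (error handling omitted) looks like:
  //
  //   CUmemGenericAllocationHandle H;
  //   cuMemCreate(&H, Size, &Prop, 0);                // physical allocation
  //   cuMemAddressReserve(&DevPtr, Size, 0, Hint, 0); // VA range near Hint
  //   cuMemMap(DevPtr, Size, 0, H, 0);                // bind VA to memory
  //   cuMemSetAccess(DevPtr, Size, &Desc, 1);         // enable RW access
  //
  // memoryVAUnMap() performs the inverse: cuMemUnmap, cuMemRelease, and
  // cuMemAddressFree.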
  /// Allocates \p RSize bytes (rounded up to page size) and hints the CUDA
  /// driver to map it to \p VAddr. The obtained address is stored in \p Addr.
  /// At return \p RSize contains the actual size.
  Error memoryVAMap(void **Addr, void *VAddr, size_t *RSize) override {
    CUdeviceptr DVAddr = reinterpret_cast<CUdeviceptr>(VAddr);
    auto IHandle = DeviceMMaps.find(DVAddr);
    size_t Size = *RSize;

    if (Size == 0)
      return Plugin::error("Memory Map Size must be larger than 0");

    // Check if we have already mapped this address.
    if (IHandle != DeviceMMaps.end())
      return Plugin::error("Address already memory mapped");

    CUmemAllocationProp Prop = {};
    size_t Granularity = 0;

    size_t Free, Total;
    CUresult Res = cuMemGetInfo(&Free, &Total);
    if (auto Err = Plugin::check(Res, "Error in cuMemGetInfo: %s"))
      return Err;

    if (Size >= Free) {
      *Addr = nullptr;
      return Plugin::error(
          "Cannot map memory size larger than the available device memory");
    }

    // Currently NVIDIA only supports pinned device types.
    Prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
    Prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;

    Prop.location.id = DeviceId;
    Res = cuMemGetAllocationGranularity(&Granularity, &Prop,
                                        CU_MEM_ALLOC_GRANULARITY_MINIMUM);
    if (auto Err =
            Plugin::check(Res, "Error in cuMemGetAllocationGranularity: %s"))
      return Err;

    if (Granularity == 0)
      return Plugin::error("Wrong device page size");

    // Ceil to page size.
    Size = utils::roundUp(Size, Granularity);

    // Create a handle for our allocation.
    CUmemGenericAllocationHandle AHandle;
    Res = cuMemCreate(&AHandle, Size, &Prop, 0);
    if (auto Err = Plugin::check(Res, "Error in cuMemCreate: %s"))
      return Err;

    CUdeviceptr DevPtr = 0;
    Res = cuMemAddressReserve(&DevPtr, Size, 0, DVAddr, 0);
    if (auto Err = Plugin::check(Res, "Error in cuMemAddressReserve: %s"))
      return Err;

    Res = cuMemMap(DevPtr, Size, 0, AHandle, 0);
    if (auto Err = Plugin::check(Res, "Error in cuMemMap: %s"))
      return Err;

    CUmemAccessDesc ADesc = {};
    ADesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    ADesc.location.id = DeviceId;
    ADesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;

    // Set the access permissions for the mapped address range.
    Res = cuMemSetAccess(DevPtr, Size, &ADesc, 1);
    if (auto Err = Plugin::check(Res, "Error in cuMemSetAccess: %s"))
      return Err;

    *Addr = reinterpret_cast<void *>(DevPtr);
    *RSize = Size;
    DeviceMMaps.insert({DevPtr, AHandle});
    return Plugin::success();
  }

  /// Deallocates device memory and unmaps the virtual address.
  Error memoryVAUnMap(void *VAddr, size_t Size) override {
    CUdeviceptr DVAddr = reinterpret_cast<CUdeviceptr>(VAddr);
    auto IHandle = DeviceMMaps.find(DVAddr);

    // The mapping must exist.
    if (IHandle == DeviceMMaps.end())
      return Plugin::error("Addr is not MemoryMapped");

    CUmemGenericAllocationHandle &AllocHandle = IHandle->second;

    CUresult Res = cuMemUnmap(DVAddr, Size);
    if (auto Err = Plugin::check(Res, "Error in cuMemUnmap: %s"))
      return Err;

    Res = cuMemRelease(AllocHandle);
    if (auto Err = Plugin::check(Res, "Error in cuMemRelease: %s"))
      return Err;

    Res = cuMemAddressFree(DVAddr, Size);
    if (auto Err = Plugin::check(Res, "Error in cuMemAddressFree: %s"))
      return Err;

    DeviceMMaps.erase(IHandle);
    return Plugin::success();
  }
  /// Query for the completion of the pending operations on the async info.
  Error queryAsyncImpl(__tgt_async_info &AsyncInfo) override {
    CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo.Queue);
    CUresult Res = cuStreamQuery(Stream);

    // Not-ready streams must be considered as successful operations.
    if (Res == CUDA_ERROR_NOT_READY)
      return Plugin::success();

    // Once the stream is synchronized and the operations completed (or an
    // error occurred), return it to the stream pool and reset AsyncInfo. This
    // is to make sure the synchronization only works for its own tasks.
    AsyncInfo.Queue = nullptr;
    if (auto Err = CUDAStreamManager.returnResource(Stream))
      return Err;

    return Plugin::check(Res, "Error in cuStreamQuery: %s");
  }

  Expected<void *> dataLockImpl(void *HstPtr, int64_t Size) override {
    // TODO: Register the buffer as CUDA host memory.
    return HstPtr;
  }

  Error dataUnlockImpl(void *HstPtr) override { return Plugin::success(); }

  Expected<bool> isPinnedPtrImpl(void *HstPtr, void *&BaseHstPtr,
                                 void *&BaseDevAccessiblePtr,
                                 size_t &BaseSize) const override {
    // TODO: Implement pinning feature for CUDA.
    return false;
  }

  /// Submit data to the device (host to device transfer).
  Error dataSubmitImpl(void *TgtPtr, const void *HstPtr, int64_t Size,
                       AsyncInfoWrapperTy &AsyncInfoWrapper) override {
    if (auto Err = setContext())
      return Err;

    CUstream Stream;
    if (auto Err = getStream(AsyncInfoWrapper, Stream))
      return Err;

    CUresult Res =
        cuMemcpyHtoDAsync((CUdeviceptr)TgtPtr, HstPtr, Size, Stream);
    return Plugin::check(Res, "Error in cuMemcpyHtoDAsync: %s");
  }

  /// Retrieve data from the device (device to host transfer).
  Error dataRetrieveImpl(void *HstPtr, const void *TgtPtr, int64_t Size,
                         AsyncInfoWrapperTy &AsyncInfoWrapper) override {
    if (auto Err = setContext())
      return Err;

    CUstream Stream;
    if (auto Err = getStream(AsyncInfoWrapper, Stream))
      return Err;

    CUresult Res =
        cuMemcpyDtoHAsync(HstPtr, (CUdeviceptr)TgtPtr, Size, Stream);
    return Plugin::check(Res, "Error in cuMemcpyDtoHAsync: %s");
  }

  /// Exchange data between two devices directly. We may use peer access if
  /// the CUDA devices and driver allow it.
  Error dataExchangeImpl(const void *SrcPtr, GenericDeviceTy &DstGenericDevice,
                         void *DstPtr, int64_t Size,
                         AsyncInfoWrapperTy &AsyncInfoWrapper) override;

  /// Initialize the async info for interoperability purposes.
  Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override {
    if (auto Err = setContext())
      return Err;

    CUstream Stream;
    if (auto Err = getStream(AsyncInfoWrapper, Stream))
      return Err;

    return Plugin::success();
  }

  /// Initialize the device info for interoperability purposes.
  Error initDeviceInfoImpl(__tgt_device_info *DeviceInfo) override {
    assert(Context && "Context is null");
    assert(Device != CU_DEVICE_INVALID && "Invalid CUDA device");

    if (auto Err = setContext())
      return Err;

    if (!DeviceInfo->Context)
      DeviceInfo->Context = Context;

    if (!DeviceInfo->Device)
      DeviceInfo->Device = reinterpret_cast<void *>(Device);

    return Plugin::success();
  }

  /// Create an event.
  Error createEventImpl(void **EventPtrStorage) override {
    CUevent *Event = reinterpret_cast<CUevent *>(EventPtrStorage);
    return CUDAEventManager.getResource(*Event);
  }

  /// Destroy a previously created event.
  Error destroyEventImpl(void *EventPtr) override {
    CUevent Event = reinterpret_cast<CUevent>(EventPtr);
    return CUDAEventManager.returnResource(Event);
  }

  /// Record the event.
  Error recordEventImpl(void *EventPtr,
                        AsyncInfoWrapperTy &AsyncInfoWrapper) override {
    CUevent Event = reinterpret_cast<CUevent>(EventPtr);

    CUstream Stream;
    if (auto Err = getStream(AsyncInfoWrapper, Stream))
      return Err;

    CUresult Res = cuEventRecord(Event, Stream);
    return Plugin::check(Res, "Error in cuEventRecord: %s");
  }

  /// Make the stream wait on the event.
  Error waitEventImpl(void *EventPtr,
                      AsyncInfoWrapperTy &AsyncInfoWrapper) override {
    CUevent Event = reinterpret_cast<CUevent>(EventPtr);

    CUstream Stream;
    if (auto Err = getStream(AsyncInfoWrapper, Stream))
      return Err;

    // Do not use CU_EVENT_WAIT_DEFAULT here: it is only available from a
    // specific CUDA version onward and is defined as 0x0. In previous
    // versions, per the CUDA API documentation, that argument has to be 0x0.
    CUresult Res = cuStreamWaitEvent(Stream, Event, 0);
    return Plugin::check(Res, "Error in cuStreamWaitEvent: %s");
  }
  /// Synchronize the current thread with the event.
  Error syncEventImpl(void *EventPtr) override {
    CUevent Event = reinterpret_cast<CUevent>(EventPtr);
    CUresult Res = cuEventSynchronize(Event);
    return Plugin::check(Res, "Error in cuEventSynchronize: %s");
  }

  /// Print information about the device.
  Error obtainInfoImpl(InfoQueueTy &Info) override {
    char TmpChar[1000];
    const char *TmpCharPtr;
    size_t TmpSt;
    int TmpInt;

    CUresult Res = cuDriverGetVersion(&TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("CUDA Driver Version", TmpInt);

    Info.add("CUDA OpenMP Device Number", DeviceId);

    Res = cuDeviceGetName(TmpChar, 1000, Device);
    if (Res == CUDA_SUCCESS)
      Info.add("Device Name", TmpChar);

    Res = cuDeviceTotalMem(&TmpSt, Device);
    if (Res == CUDA_SUCCESS)
      Info.add("Global Memory Size", TmpSt, "bytes");

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("Number of Multiprocessors", TmpInt);

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("Concurrent Copy and Execution", (bool)TmpInt);

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("Total Constant Memory", TmpInt, "bytes");

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK,
                           TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("Max Shared Memory per Block", TmpInt, "bytes");

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("Registers per Block", TmpInt);

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_WARP_SIZE, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("Warp Size", TmpInt);

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("Maximum Threads per Block", TmpInt);

    Info.add("Maximum Block Dimensions", "");
    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add<InfoLevel2>("x", TmpInt);
    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add<InfoLevel2>("y", TmpInt);
    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add<InfoLevel2>("z", TmpInt);

    Info.add("Maximum Grid Dimensions", "");
    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add<InfoLevel2>("x", TmpInt);
    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add<InfoLevel2>("y", TmpInt);
    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add<InfoLevel2>("z", TmpInt);

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_PITCH, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("Maximum Memory Pitch", TmpInt, "bytes");

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("Texture Alignment", TmpInt, "bytes");

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_CLOCK_RATE, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("Clock Rate", TmpInt, "kHz");

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("Execution Timeout", (bool)TmpInt);

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_INTEGRATED, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("Integrated Device", (bool)TmpInt);

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("Can Map Host Memory", (bool)TmpInt);

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, TmpInt);
    if (Res == CUDA_SUCCESS) {
      if (TmpInt == CU_COMPUTEMODE_DEFAULT)
        TmpCharPtr = "Default";
      else if (TmpInt == CU_COMPUTEMODE_PROHIBITED)
        TmpCharPtr = "Prohibited";
      else if (TmpInt == CU_COMPUTEMODE_EXCLUSIVE_PROCESS)
        TmpCharPtr = "Exclusive process";
      else
        TmpCharPtr = "Unknown";
      Info.add("Compute Mode", TmpCharPtr);
    }

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("Concurrent Kernels", (bool)TmpInt);

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_ECC_ENABLED, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("ECC Enabled", (bool)TmpInt);

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("Memory Clock Rate", TmpInt, "kHz");

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("Memory Bus Width", TmpInt, "bits");

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("L2 Cache Size", TmpInt, "bytes");

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR,
                           TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("Max Threads Per SMP", TmpInt);

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("Async Engines", TmpInt);

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("Unified Addressing", (bool)TmpInt);

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("Managed Memory", (bool)TmpInt);

    Res =
        getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("Concurrent Managed Memory", (bool)TmpInt);

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED,
                           TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("Preemption Supported", (bool)TmpInt);

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("Cooperative Launch", (bool)TmpInt);

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("Multi-Device Boards", (bool)TmpInt);

    Info.add("Compute Capabilities", ComputeCapability.str());

    return Plugin::success();
  }
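  // For reference, libomptarget renders the queue built above roughly like
  // the following when device info is requested (values illustrative):
  //
  //   CUDA Driver Version: 12020
  //   Device Name: <device name>
  //   Maximum Block Dimensions:
  //     x: 1024
  //     y: 1024
  //     z: 64
  //   Compute Capabilities: sm_80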
CUDA_SUCCESS) Info.add("Can Map Host Memory", (bool)TmpInt); Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, TmpInt); if (Res == CUDA_SUCCESS) { if (TmpInt == CU_COMPUTEMODE_DEFAULT) TmpCharPtr = "Default"; else if (TmpInt == CU_COMPUTEMODE_PROHIBITED) TmpCharPtr = "Prohibited"; else if (TmpInt == CU_COMPUTEMODE_EXCLUSIVE_PROCESS) TmpCharPtr = "Exclusive process"; else TmpCharPtr = "Unknown"; Info.add("Compute Mode", TmpCharPtr); } Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, TmpInt); if (Res == CUDA_SUCCESS) Info.add("Concurrent Kernels", (bool)TmpInt); Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_ECC_ENABLED, TmpInt); if (Res == CUDA_SUCCESS) Info.add("ECC Enabled", (bool)TmpInt); Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, TmpInt); if (Res == CUDA_SUCCESS) Info.add("Memory Clock Rate", TmpInt, "kHz"); Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, TmpInt); if (Res == CUDA_SUCCESS) Info.add("Memory Bus Width", TmpInt, "bits"); Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, TmpInt); if (Res == CUDA_SUCCESS) Info.add("L2 Cache Size", TmpInt, "bytes"); Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, TmpInt); if (Res == CUDA_SUCCESS) Info.add("Max Threads Per SMP", TmpInt); Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, TmpInt); if (Res == CUDA_SUCCESS) Info.add("Async Engines", TmpInt); Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, TmpInt); if (Res == CUDA_SUCCESS) Info.add("Unified Addressing", (bool)TmpInt); Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY, TmpInt); if (Res == CUDA_SUCCESS) Info.add("Managed Memory", (bool)TmpInt); Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, TmpInt); if (Res == CUDA_SUCCESS) Info.add("Concurrent Managed Memory", (bool)TmpInt); Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED, TmpInt); if (Res == CUDA_SUCCESS) Info.add("Preemption Supported", (bool)TmpInt); Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH, TmpInt); if (Res == CUDA_SUCCESS) Info.add("Cooperative Launch", (bool)TmpInt); Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD, TmpInt); if (Res == CUDA_SUCCESS) Info.add("Multi-Device Boars", (bool)TmpInt); Info.add("Compute Capabilities", ComputeCapability.str()); return Plugin::success(); } virtual bool shouldSetupDeviceMemoryPool() const override { /// We use the CUDA malloc for now. return false; } /// Getters and setters for stack and heap sizes. Error getDeviceStackSize(uint64_t &Value) override { return getCtxLimit(CU_LIMIT_STACK_SIZE, Value); } Error setDeviceStackSize(uint64_t Value) override { return setCtxLimit(CU_LIMIT_STACK_SIZE, Value); } Error getDeviceHeapSize(uint64_t &Value) override { return getCtxLimit(CU_LIMIT_MALLOC_HEAP_SIZE, Value); } Error setDeviceHeapSize(uint64_t Value) override { return setCtxLimit(CU_LIMIT_MALLOC_HEAP_SIZE, Value); } Error getDeviceMemorySize(uint64_t &Value) override { CUresult Res = cuDeviceTotalMem(&Value, Device); return Plugin::check(Res, "Error in getDeviceMemorySize %s"); } /// CUDA-specific functions for getting and setting context limits. 
  /// CUDA-specific functions for getting and setting context limits.
  Error setCtxLimit(CUlimit Kind, uint64_t Value) {
    CUresult Res = cuCtxSetLimit(Kind, Value);
    return Plugin::check(Res, "Error in cuCtxSetLimit: %s");
  }
  Error getCtxLimit(CUlimit Kind, uint64_t &Value) {
    CUresult Res = cuCtxGetLimit(&Value, Kind);
    return Plugin::check(Res, "Error in cuCtxGetLimit: %s");
  }

  /// CUDA-specific function to get device attributes.
  Error getDeviceAttr(uint32_t Kind, uint32_t &Value) {
    // TODO: Warn if the new value is larger than the old.
    CUresult Res =
        cuDeviceGetAttribute((int *)&Value, (CUdevice_attribute)Kind, Device);
    return Plugin::check(Res, "Error in cuDeviceGetAttribute: %s");
  }

  CUresult getDeviceAttrRaw(uint32_t Kind, int &Value) {
    return cuDeviceGetAttribute(&Value, (CUdevice_attribute)Kind, Device);
  }

  /// See GenericDeviceTy::getComputeUnitKind().
  std::string getComputeUnitKind() const override {
    return ComputeCapability.str();
  }

  /// Returns the clock frequency for the given NVPTX device.
  uint64_t getClockFrequency() const override { return 1000000000; }

private:
  using CUDAStreamManagerTy = GenericDeviceResourceManagerTy<CUDAStreamRef>;
  using CUDAEventManagerTy = GenericDeviceResourceManagerTy<CUDAEventRef>;

  Error callGlobalCtorDtorCommon(GenericPluginTy &Plugin, DeviceImageTy &Image,
                                 bool IsCtor) {
    const char *KernelName = IsCtor ? "nvptx$device$init" : "nvptx$device$fini";
    // Perform a quick check for the named kernel in the image. The kernel
    // should be created by the 'nvptx-lower-ctor-dtor' pass.
    GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler();
    if (IsCtor && !Handler.isSymbolInImage(*this, Image, KernelName))
      return Plugin::success();

    // The NVIDIA backend cannot handle creating the ctor / dtor array
    // automatically so we must create it ourselves. The backend will emit
    // several globals that contain function pointers we can call. These are
    // prefixed with a known name due to NVIDIA's lack of section support.
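    // For illustration (the exact mangling comes from the
    // 'nvptx-lower-ctor-dtor' pass and may differ): a constructor 'foo'
    // registered at priority 101 might surface as a global named something
    // like '__init_array_object_foo_<module-id>_101'. Only the known prefix
    // and the trailing '_<priority>' component are relied upon below.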
    auto ELFObjOrErr = Handler.getELFObjectFile(Image);
    if (!ELFObjOrErr)
      return ELFObjOrErr.takeError();

    // Search for all symbols that contain a constructor or destructor.
    SmallVector<std::pair<StringRef, uint16_t>> Funcs;
    for (ELFSymbolRef Sym : (*ELFObjOrErr)->symbols()) {
      auto NameOrErr = Sym.getName();
      if (!NameOrErr)
        return NameOrErr.takeError();

      if (!NameOrErr->starts_with(IsCtor ? "__init_array_object_"
                                         : "__fini_array_object_"))
        continue;

      uint16_t Priority;
      if (NameOrErr->rsplit('_').second.getAsInteger(10, Priority))
        return Plugin::error("Invalid priority for constructor or destructor");

      Funcs.emplace_back(*NameOrErr, Priority);
    }

    // Sort the created array to be in priority order.
    llvm::sort(Funcs, [=](auto X, auto Y) { return X.second < Y.second; });

    // Allocate a buffer to store all of the known constructor / destructor
    // functions in so we can iterate them on the device.
    void *Buffer =
        allocate(Funcs.size() * sizeof(void *), nullptr, TARGET_ALLOC_DEVICE);
    if (!Buffer)
      return Plugin::error("Failed to allocate memory for global buffer");

    auto *GlobalPtrStart = reinterpret_cast<uintptr_t *>(Buffer);
    auto *GlobalPtrStop = reinterpret_cast<uintptr_t *>(Buffer) + Funcs.size();

    SmallVector<void *> FunctionPtrs(Funcs.size());
    std::size_t Idx = 0;
    for (auto [Name, Priority] : Funcs) {
      GlobalTy FunctionAddr(Name.str(), sizeof(void *), &FunctionPtrs[Idx++]);
      if (auto Err = Handler.readGlobalFromDevice(*this, Image, FunctionAddr))
        return Err;
    }

    // Copy the local buffer to the device.
    if (auto Err = dataSubmit(GlobalPtrStart, FunctionPtrs.data(),
                              FunctionPtrs.size() * sizeof(void *), nullptr))
      return Err;

    // Copy the created buffer to the appropriate symbols so the kernel can
    // iterate through them.
    GlobalTy StartGlobal(IsCtor ? "__init_array_start" : "__fini_array_start",
                         sizeof(void *), &GlobalPtrStart);
    if (auto Err = Handler.writeGlobalToDevice(*this, Image, StartGlobal))
      return Err;

    GlobalTy StopGlobal(IsCtor ? "__init_array_end" : "__fini_array_end",
                        sizeof(void *), &GlobalPtrStop);
    if (auto Err = Handler.writeGlobalToDevice(*this, Image, StopGlobal))
      return Err;

    CUDAKernelTy CUDAKernel(KernelName);

    if (auto Err = CUDAKernel.init(*this, Image))
      return Err;

    AsyncInfoWrapperTy AsyncInfoWrapper(*this, nullptr);

    KernelArgsTy KernelArgs = {};
    uint32_t NumBlocksAndThreads[3] = {1u, 1u, 1u};
    if (auto Err = CUDAKernel.launchImpl(
            *this, NumBlocksAndThreads, NumBlocksAndThreads, KernelArgs,
            KernelLaunchParamsTy{}, AsyncInfoWrapper))
      return Err;

    Error Err = Plugin::success();
    AsyncInfoWrapper.finalize(Err);

    if (free(Buffer, TARGET_ALLOC_DEVICE) != OFFLOAD_SUCCESS)
      return Plugin::error("Failed to free memory for global buffer");

    return Err;
  }

  /// Stream manager for CUDA streams.
  CUDAStreamManagerTy CUDAStreamManager;

  /// Event manager for CUDA events.
  CUDAEventManagerTy CUDAEventManager;

  /// The device's context. This context should be set before performing
  /// operations on the device.
  CUcontext Context = nullptr;

  /// The CUDA device handler.
  CUdevice Device = CU_DEVICE_INVALID;

  /// The memory mapped addresses and their handles.
  std::unordered_map<CUdeviceptr, CUmemGenericAllocationHandle> DeviceMMaps;

  /// The compute capability of the corresponding CUDA device.
  struct ComputeCapabilityTy {
    uint32_t Major;
    uint32_t Minor;
    std::string str() const {
      return "sm_" + std::to_string(Major * 10 + Minor);
    }
  } ComputeCapability;

  /// The maximum number of warps that can be resident on all the SMs
  /// simultaneously.
  uint32_t HardwareParallelism = 0;
};

Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
                               uint32_t NumThreads[3], uint32_t NumBlocks[3],
                               KernelArgsTy &KernelArgs,
                               KernelLaunchParamsTy LaunchParams,
                               AsyncInfoWrapperTy &AsyncInfoWrapper) const {
  CUDADeviceTy &CUDADevice = static_cast<CUDADeviceTy &>(GenericDevice);

  CUstream Stream;
  if (auto Err = CUDADevice.getStream(AsyncInfoWrapper, Stream))
    return Err;

  uint32_t MaxDynCGroupMem =
      std::max(KernelArgs.DynCGroupMem, GenericDevice.getDynamicMemorySize());

  void *Config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, LaunchParams.Data,
                    CU_LAUNCH_PARAM_BUFFER_SIZE,
                    reinterpret_cast<void *>(&LaunchParams.Size),
                    CU_LAUNCH_PARAM_END};

  // If we are running an RPC server we want to wake up the server thread
  // whenever there is a kernel running and let it sleep otherwise.
  if (GenericDevice.getRPCServer())
    GenericDevice.Plugin.getRPCServer().Thread->notify();
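  // The 'extra' Config array above packs all kernel arguments into a single
  // buffer instead of passing a kernelParams pointer array. An equivalent
  // launch using the kernelParams form (hypothetical, for comparison only)
  // would be:
  //
  //   void *Params[] = {&Arg0, &Arg1};
  //   cuLaunchKernel(Func, GridX, GridY, GridZ, BlockX, BlockY, BlockZ,
  //                  SharedMemBytes, Stream, Params, /*extra=*/nullptr);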
  CUresult Res = cuLaunchKernel(Func, NumBlocks[0], NumBlocks[1], NumBlocks[2],
                                NumThreads[0], NumThreads[1], NumThreads[2],
                                MaxDynCGroupMem, Stream, nullptr, Config);

  // Register a callback to indicate when the kernel is complete.
  if (GenericDevice.getRPCServer())
    cuLaunchHostFunc(
        Stream,
        [](void *Data) {
          GenericPluginTy &Plugin = *reinterpret_cast<GenericPluginTy *>(Data);
          Plugin.getRPCServer().Thread->finish();
        },
        &GenericDevice.Plugin);

  return Plugin::check(Res, "Error in cuLaunchKernel for '%s': %s", getName());
}

/// Class implementing the CUDA-specific functionalities of the global handler.
class CUDAGlobalHandlerTy final : public GenericGlobalHandlerTy {
public:
  /// Get the metadata of a global from the device. The name and size of the
  /// global is read from DeviceGlobal and the address of the global is written
  /// to DeviceGlobal.
  Error getGlobalMetadataFromDevice(GenericDeviceTy &Device,
                                    DeviceImageTy &Image,
                                    GlobalTy &DeviceGlobal) override {
    CUDADeviceImageTy &CUDAImage = static_cast<CUDADeviceImageTy &>(Image);

    const char *GlobalName = DeviceGlobal.getName().data();

    size_t CUSize;
    CUdeviceptr CUPtr;
    CUresult Res =
        cuModuleGetGlobal(&CUPtr, &CUSize, CUDAImage.getModule(), GlobalName);
    if (auto Err = Plugin::check(Res, "Error in cuModuleGetGlobal for '%s': %s",
                                 GlobalName))
      return Err;

    if (CUSize != DeviceGlobal.getSize())
      return Plugin::error(
          "Failed to load global '%s' due to size mismatch (%zu != %zu)",
          GlobalName, CUSize, (size_t)DeviceGlobal.getSize());

    DeviceGlobal.setPtr(reinterpret_cast<void *>(CUPtr));
    return Plugin::success();
  }
};
/// Class implementing the CUDA-specific functionalities of the plugin.
struct CUDAPluginTy final : public GenericPluginTy {
  /// Create a CUDA plugin.
  CUDAPluginTy() : GenericPluginTy(getTripleArch()) {}

  /// This class should not be copied.
  CUDAPluginTy(const CUDAPluginTy &) = delete;
  CUDAPluginTy(CUDAPluginTy &&) = delete;

  /// Initialize the plugin and return the number of devices.
  Expected<int32_t> initImpl() override {
    CUresult Res = cuInit(0);
    if (Res == CUDA_ERROR_INVALID_HANDLE) {
      // Cannot call cuGetErrorString if dlsym failed.
      DP("Failed to load CUDA shared library\n");
      return 0;
    }

    if (Res == CUDA_ERROR_NO_DEVICE) {
      // Do not initialize if there are no devices.
      DP("There are no devices supporting CUDA.\n");
      return 0;
    }

    if (auto Err = Plugin::check(Res, "Error in cuInit: %s"))
      return std::move(Err);

    // Get the number of devices.
    int NumDevices;
    Res = cuDeviceGetCount(&NumDevices);
    if (auto Err = Plugin::check(Res, "Error in cuDeviceGetCount: %s"))
      return std::move(Err);

    // Do not initialize if there are no devices.
    if (NumDevices == 0)
      DP("There are no devices supporting CUDA.\n");

    return NumDevices;
  }

  /// Deinitialize the plugin.
  Error deinitImpl() override { return Plugin::success(); }

  /// Creates a CUDA device to use for offloading.
  GenericDeviceTy *createDevice(GenericPluginTy &Plugin, int32_t DeviceId,
                                int32_t NumDevices) override {
    return new CUDADeviceTy(Plugin, DeviceId, NumDevices);
  }

  /// Creates a CUDA global handler.
  GenericGlobalHandlerTy *createGlobalHandler() override {
    return new CUDAGlobalHandlerTy();
  }

  /// Get the ELF code for recognizing the compatible image binary.
  uint16_t getMagicElfBits() const override { return ELF::EM_CUDA; }

  Triple::ArchType getTripleArch() const override {
    // TODO: I think we can drop the support for 32-bit NVPTX devices.
    return Triple::nvptx64;
  }

  const char *getName() const override { return GETNAME(TARGET_NAME); }

  /// Check whether the image is compatible with a CUDA device.
  Expected<bool> isELFCompatible(uint32_t DeviceId,
                                 StringRef Image) const override {
    auto ElfOrErr =
        ELF64LEObjectFile::create(MemoryBufferRef(Image, /*Identifier=*/""),
                                  /*InitContent=*/false);
    if (!ElfOrErr)
      return ElfOrErr.takeError();

    // Get the numeric value for the image's `sm_` value.
    auto SM = ElfOrErr->getPlatformFlags() & ELF::EF_CUDA_SM;

    CUdevice Device;
    CUresult Res = cuDeviceGet(&Device, DeviceId);
    if (auto Err = Plugin::check(Res, "Error in cuDeviceGet: %s"))
      return std::move(Err);

    int32_t Major, Minor;
    Res = cuDeviceGetAttribute(
        &Major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, Device);
    if (auto Err = Plugin::check(Res, "Error in cuDeviceGetAttribute: %s"))
      return std::move(Err);

    Res = cuDeviceGetAttribute(
        &Minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, Device);
    if (auto Err = Plugin::check(Res, "Error in cuDeviceGetAttribute: %s"))
      return std::move(Err);

    // The `sm_` value encodes the compute capability as Major * 10 + Minor,
    // e.g. sm_80 corresponds to compute capability 8.0.
    int32_t ImageMajor = SM / 10;
    int32_t ImageMinor = SM % 10;

    // A cubin generated for a certain compute capability is supported to
    // run on any GPU with the same major revision and same or higher minor
    // revision.
    return Major == ImageMajor && Minor >= ImageMinor;
  }
};

Error CUDADeviceTy::dataExchangeImpl(const void *SrcPtr,
                                     GenericDeviceTy &DstGenericDevice,
                                     void *DstPtr, int64_t Size,
                                     AsyncInfoWrapperTy &AsyncInfoWrapper) {
  if (auto Err = setContext())
    return Err;

  CUDADeviceTy &DstDevice = static_cast<CUDADeviceTy &>(DstGenericDevice);

  CUresult Res;
  int32_t DstDeviceId = DstDevice.DeviceId;
  CUdeviceptr CUSrcPtr = (CUdeviceptr)SrcPtr;
  CUdeviceptr CUDstPtr = (CUdeviceptr)DstPtr;

  int CanAccessPeer = 0;
  if (DeviceId != DstDeviceId) {
    // Make sure the lock is released before performing the copies.
    std::lock_guard<std::mutex> Lock(PeerAccessesLock);

    switch (PeerAccesses[DstDeviceId]) {
    case PeerAccessState::AVAILABLE:
      CanAccessPeer = 1;
      break;
    case PeerAccessState::UNAVAILABLE:
      CanAccessPeer = 0;
      break;
    case PeerAccessState::PENDING:
      // Check whether the source device can access the destination device.
      Res = cuDeviceCanAccessPeer(&CanAccessPeer, Device, DstDevice.Device);
      if (auto Err = Plugin::check(Res, "Error in cuDeviceCanAccessPeer: %s"))
        return Err;

      if (CanAccessPeer) {
        Res = cuCtxEnablePeerAccess(DstDevice.Context, 0);
        if (Res == CUDA_ERROR_TOO_MANY_PEERS) {
          // Resources may be exhausted due to many P2P links.
          CanAccessPeer = 0;
          DP("Too many P2P links, falling back to D2D memcpy\n");
        } else if (auto Err = Plugin::check(
                       Res, "Error in cuCtxEnablePeerAccess: %s"))
          return Err;
      }
      PeerAccesses[DstDeviceId] = (CanAccessPeer)
                                      ? PeerAccessState::AVAILABLE
                                      : PeerAccessState::UNAVAILABLE;
    }
  }

  CUstream Stream;
  if (auto Err = getStream(AsyncInfoWrapper, Stream))
    return Err;

  if (CanAccessPeer) {
    // TODO: Should we fall back to D2D if peer access fails?
    // Note: cuMemcpyPeerAsync takes the destination context before the source
    // context.
    Res = cuMemcpyPeerAsync(CUDstPtr, DstDevice.Context, CUSrcPtr, Context,
                            Size, Stream);
    return Plugin::check(Res, "Error in cuMemcpyPeerAsync: %s");
  }

  // Fall back to D2D copy.
  Res = cuMemcpyDtoDAsync(CUDstPtr, CUSrcPtr, Size, Stream);
  return Plugin::check(Res, "Error in cuMemcpyDtoDAsync: %s");
}

template <typename... ArgsTy>
static Error Plugin::check(int32_t Code, const char *ErrFmt, ArgsTy... Args) {
  CUresult ResultCode = static_cast<CUresult>(Code);
  if (ResultCode == CUDA_SUCCESS)
    return Error::success();

  const char *Desc = "Unknown error";
  CUresult Ret = cuGetErrorString(ResultCode, &Desc);
  if (Ret != CUDA_SUCCESS)
    REPORT("Unrecognized " GETNAME(TARGET_NAME) " error code %d\n", Code);

  return createStringError(inconvertibleErrorCode(), ErrFmt, Args..., Desc);
}

} // namespace plugin
} // namespace target
} // namespace omp
} // namespace llvm

extern "C" {
llvm::omp::target::plugin::GenericPluginTy *createPlugin_cuda() {
  return new llvm::omp::target::plugin::CUDAPluginTy();
}
}