1330d8983SJohannes Doerfert //===-------- interface.cpp - Target independent OpenMP target RTL --------===// 2330d8983SJohannes Doerfert // 3330d8983SJohannes Doerfert // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4330d8983SJohannes Doerfert // See https://llvm.org/LICENSE.txt for license information. 5330d8983SJohannes Doerfert // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6330d8983SJohannes Doerfert // 7330d8983SJohannes Doerfert //===----------------------------------------------------------------------===// 8330d8983SJohannes Doerfert // 9330d8983SJohannes Doerfert // Implementation of the interface to be used by Clang during the codegen of a 10330d8983SJohannes Doerfert // target region. 11330d8983SJohannes Doerfert // 12330d8983SJohannes Doerfert //===----------------------------------------------------------------------===// 13330d8983SJohannes Doerfert 14330d8983SJohannes Doerfert #include "OpenMP/OMPT/Interface.h" 15ff12c006SJohannes Doerfert #include "OffloadPolicy.h" 16330d8983SJohannes Doerfert #include "OpenMP/OMPT/Callback.h" 17ff12c006SJohannes Doerfert #include "OpenMP/omp.h" 18330d8983SJohannes Doerfert #include "PluginManager.h" 19ff12c006SJohannes Doerfert #include "omptarget.h" 20330d8983SJohannes Doerfert #include "private.h" 21330d8983SJohannes Doerfert 22330d8983SJohannes Doerfert #include "Shared/EnvironmentVar.h" 23330d8983SJohannes Doerfert #include "Shared/Profile.h" 24330d8983SJohannes Doerfert 25330d8983SJohannes Doerfert #include "Utils/ExponentialBackoff.h" 26330d8983SJohannes Doerfert 27330d8983SJohannes Doerfert #include "llvm/Frontend/OpenMP/OMPConstants.h" 28330d8983SJohannes Doerfert 29330d8983SJohannes Doerfert #include <cassert> 30330d8983SJohannes Doerfert #include <cstdint> 31330d8983SJohannes Doerfert #include <cstdio> 32330d8983SJohannes Doerfert #include <cstdlib> 33330d8983SJohannes Doerfert 34330d8983SJohannes Doerfert #ifdef OMPT_SUPPORT 35330d8983SJohannes Doerfert using 
namespace llvm::omp::target::ompt; 36330d8983SJohannes Doerfert #endif 37330d8983SJohannes Doerfert 38ff12c006SJohannes Doerfert // If offload is enabled, ensure that device DeviceID has been initialized. 39ff12c006SJohannes Doerfert // 40ff12c006SJohannes Doerfert // The return bool indicates if the offload is to the host device 41ff12c006SJohannes Doerfert // There are three possible results: 42ff12c006SJohannes Doerfert // - Return false if the taregt device is ready for offload 43ff12c006SJohannes Doerfert // - Return true without reporting a runtime error if offload is 44ff12c006SJohannes Doerfert // disabled, perhaps because the initial device was specified. 45ff12c006SJohannes Doerfert // - Report a runtime error and return true. 46ff12c006SJohannes Doerfert // 47ff12c006SJohannes Doerfert // If DeviceID == OFFLOAD_DEVICE_DEFAULT, set DeviceID to the default device. 48ff12c006SJohannes Doerfert // This step might be skipped if offload is disabled. 49ff12c006SJohannes Doerfert bool checkDevice(int64_t &DeviceID, ident_t *Loc) { 50ff12c006SJohannes Doerfert if (OffloadPolicy::get(*PM).Kind == OffloadPolicy::DISABLED) { 51ff12c006SJohannes Doerfert DP("Offload is disabled\n"); 52ff12c006SJohannes Doerfert return true; 53ff12c006SJohannes Doerfert } 54ff12c006SJohannes Doerfert 55ff12c006SJohannes Doerfert if (DeviceID == OFFLOAD_DEVICE_DEFAULT) { 56ff12c006SJohannes Doerfert DeviceID = omp_get_default_device(); 57ff12c006SJohannes Doerfert DP("Use default device id %" PRId64 "\n", DeviceID); 58ff12c006SJohannes Doerfert } 59ff12c006SJohannes Doerfert 60ff12c006SJohannes Doerfert // Proposed behavior for OpenMP 5.2 in OpenMP spec github issue 2669. 
61ff12c006SJohannes Doerfert if (omp_get_num_devices() == 0) { 62ff12c006SJohannes Doerfert DP("omp_get_num_devices() == 0 but offload is manadatory\n"); 63ff12c006SJohannes Doerfert handleTargetOutcome(false, Loc); 64ff12c006SJohannes Doerfert return true; 65ff12c006SJohannes Doerfert } 66ff12c006SJohannes Doerfert 67ff12c006SJohannes Doerfert if (DeviceID == omp_get_initial_device()) { 68ff12c006SJohannes Doerfert DP("Device is host (%" PRId64 "), returning as if offload is disabled\n", 69ff12c006SJohannes Doerfert DeviceID); 70ff12c006SJohannes Doerfert return true; 71ff12c006SJohannes Doerfert } 72ff12c006SJohannes Doerfert return false; 73ff12c006SJohannes Doerfert } 74ff12c006SJohannes Doerfert 75330d8983SJohannes Doerfert //////////////////////////////////////////////////////////////////////////////// 76330d8983SJohannes Doerfert /// adds requires flags 77330d8983SJohannes Doerfert EXTERN void __tgt_register_requires(int64_t Flags) { 78330d8983SJohannes Doerfert MESSAGE("The %s function has been removed. 
Old OpenMP requirements will not " 79330d8983SJohannes Doerfert "be handled", 80330d8983SJohannes Doerfert __PRETTY_FUNCTION__); 81330d8983SJohannes Doerfert } 82330d8983SJohannes Doerfert 83330d8983SJohannes Doerfert EXTERN void __tgt_rtl_init() { initRuntime(); } 84330d8983SJohannes Doerfert EXTERN void __tgt_rtl_deinit() { deinitRuntime(); } 85330d8983SJohannes Doerfert 86330d8983SJohannes Doerfert //////////////////////////////////////////////////////////////////////////////// 87330d8983SJohannes Doerfert /// adds a target shared library to the target execution image 88330d8983SJohannes Doerfert EXTERN void __tgt_register_lib(__tgt_bin_desc *Desc) { 89330d8983SJohannes Doerfert initRuntime(); 90330d8983SJohannes Doerfert if (PM->delayRegisterLib(Desc)) 91330d8983SJohannes Doerfert return; 92330d8983SJohannes Doerfert 93330d8983SJohannes Doerfert PM->registerLib(Desc); 94330d8983SJohannes Doerfert } 95330d8983SJohannes Doerfert 96330d8983SJohannes Doerfert //////////////////////////////////////////////////////////////////////////////// 97330d8983SJohannes Doerfert /// Initialize all available devices without registering any image 98330d8983SJohannes Doerfert EXTERN void __tgt_init_all_rtls() { 99330d8983SJohannes Doerfert assert(PM && "Runtime not initialized"); 1007102592aSJohannes Doerfert PM->initializeAllDevices(); 101330d8983SJohannes Doerfert } 102330d8983SJohannes Doerfert 103330d8983SJohannes Doerfert //////////////////////////////////////////////////////////////////////////////// 104330d8983SJohannes Doerfert /// unloads a target shared library 105330d8983SJohannes Doerfert EXTERN void __tgt_unregister_lib(__tgt_bin_desc *Desc) { 106330d8983SJohannes Doerfert PM->unregisterLib(Desc); 107330d8983SJohannes Doerfert 108330d8983SJohannes Doerfert deinitRuntime(); 109330d8983SJohannes Doerfert } 110330d8983SJohannes Doerfert 111330d8983SJohannes Doerfert template <typename TargetAsyncInfoTy> 112330d8983SJohannes Doerfert static inline void 
113330d8983SJohannes Doerfert targetData(ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase, 114330d8983SJohannes Doerfert void **Args, int64_t *ArgSizes, int64_t *ArgTypes, 115330d8983SJohannes Doerfert map_var_info_t *ArgNames, void **ArgMappers, 116330d8983SJohannes Doerfert TargetDataFuncPtrTy TargetDataFunction, const char *RegionTypeMsg, 117330d8983SJohannes Doerfert const char *RegionName) { 118330d8983SJohannes Doerfert assert(PM && "Runtime not initialized"); 119330d8983SJohannes Doerfert static_assert(std::is_convertible_v<TargetAsyncInfoTy, AsyncInfoTy>, 120330d8983SJohannes Doerfert "TargetAsyncInfoTy must be convertible to AsyncInfoTy."); 121330d8983SJohannes Doerfert 122330d8983SJohannes Doerfert TIMESCOPE_WITH_DETAILS_AND_IDENT("Runtime: Data Copy", 123330d8983SJohannes Doerfert "NumArgs=" + std::to_string(ArgNum), Loc); 124330d8983SJohannes Doerfert 125330d8983SJohannes Doerfert DP("Entering data %s region for device %" PRId64 " with %d mappings\n", 126330d8983SJohannes Doerfert RegionName, DeviceId, ArgNum); 127330d8983SJohannes Doerfert 128ff12c006SJohannes Doerfert if (checkDevice(DeviceId, Loc)) { 129330d8983SJohannes Doerfert DP("Not offloading to device %" PRId64 "\n", DeviceId); 130330d8983SJohannes Doerfert return; 131330d8983SJohannes Doerfert } 132330d8983SJohannes Doerfert 133330d8983SJohannes Doerfert if (getInfoLevel() & OMP_INFOTYPE_KERNEL_ARGS) 134330d8983SJohannes Doerfert printKernelArguments(Loc, DeviceId, ArgNum, ArgSizes, ArgTypes, ArgNames, 135330d8983SJohannes Doerfert RegionTypeMsg); 136330d8983SJohannes Doerfert #ifdef OMPTARGET_DEBUG 137330d8983SJohannes Doerfert for (int I = 0; I < ArgNum; ++I) { 138330d8983SJohannes Doerfert DP("Entry %2d: Base=" DPxMOD ", Begin=" DPxMOD ", Size=%" PRId64 139330d8983SJohannes Doerfert ", Type=0x%" PRIx64 ", Name=%s\n", 140330d8983SJohannes Doerfert I, DPxPTR(ArgsBase[I]), DPxPTR(Args[I]), ArgSizes[I], ArgTypes[I], 141330d8983SJohannes Doerfert (ArgNames) ? 
getNameFromMapping(ArgNames[I]).c_str() : "unknown"); 142330d8983SJohannes Doerfert } 143330d8983SJohannes Doerfert #endif 144330d8983SJohannes Doerfert 145330d8983SJohannes Doerfert auto DeviceOrErr = PM->getDevice(DeviceId); 146330d8983SJohannes Doerfert if (!DeviceOrErr) 147330d8983SJohannes Doerfert FATAL_MESSAGE(DeviceId, "%s", toString(DeviceOrErr.takeError()).c_str()); 148330d8983SJohannes Doerfert 149330d8983SJohannes Doerfert TargetAsyncInfoTy TargetAsyncInfo(*DeviceOrErr); 150330d8983SJohannes Doerfert AsyncInfoTy &AsyncInfo = TargetAsyncInfo; 151330d8983SJohannes Doerfert 152330d8983SJohannes Doerfert /// RAII to establish tool anchors before and after data begin / end / update 153330d8983SJohannes Doerfert OMPT_IF_BUILT(assert((TargetDataFunction == targetDataBegin || 154330d8983SJohannes Doerfert TargetDataFunction == targetDataEnd || 155330d8983SJohannes Doerfert TargetDataFunction == targetDataUpdate) && 156330d8983SJohannes Doerfert "Encountered unexpected TargetDataFunction during " 157330d8983SJohannes Doerfert "execution of targetData"); 158330d8983SJohannes Doerfert auto CallbackFunctions = 159330d8983SJohannes Doerfert (TargetDataFunction == targetDataBegin) 160330d8983SJohannes Doerfert ? RegionInterface.getCallbacks<ompt_target_enter_data>() 161330d8983SJohannes Doerfert : (TargetDataFunction == targetDataEnd) 162330d8983SJohannes Doerfert ? 
RegionInterface.getCallbacks<ompt_target_exit_data>() 163330d8983SJohannes Doerfert : RegionInterface.getCallbacks<ompt_target_update>(); 164330d8983SJohannes Doerfert InterfaceRAII TargetDataRAII(CallbackFunctions, DeviceId, 165330d8983SJohannes Doerfert OMPT_GET_RETURN_ADDRESS);) 166330d8983SJohannes Doerfert 167330d8983SJohannes Doerfert int Rc = OFFLOAD_SUCCESS; 168330d8983SJohannes Doerfert Rc = TargetDataFunction(Loc, *DeviceOrErr, ArgNum, ArgsBase, Args, ArgSizes, 169330d8983SJohannes Doerfert ArgTypes, ArgNames, ArgMappers, AsyncInfo, 170330d8983SJohannes Doerfert false /*FromMapper=*/); 171330d8983SJohannes Doerfert 172330d8983SJohannes Doerfert if (Rc == OFFLOAD_SUCCESS) 173330d8983SJohannes Doerfert Rc = AsyncInfo.synchronize(); 174330d8983SJohannes Doerfert 175330d8983SJohannes Doerfert handleTargetOutcome(Rc == OFFLOAD_SUCCESS, Loc); 176330d8983SJohannes Doerfert } 177330d8983SJohannes Doerfert 178330d8983SJohannes Doerfert /// creates host-to-target data mapping, stores it in the 179330d8983SJohannes Doerfert /// libomptarget.so internal structure (an entry in a stack of data maps) 180330d8983SJohannes Doerfert /// and passes the data to the device. 
181330d8983SJohannes Doerfert EXTERN void __tgt_target_data_begin_mapper(ident_t *Loc, int64_t DeviceId, 182330d8983SJohannes Doerfert int32_t ArgNum, void **ArgsBase, 183330d8983SJohannes Doerfert void **Args, int64_t *ArgSizes, 184330d8983SJohannes Doerfert int64_t *ArgTypes, 185330d8983SJohannes Doerfert map_var_info_t *ArgNames, 186330d8983SJohannes Doerfert void **ArgMappers) { 187330d8983SJohannes Doerfert OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0))); 188330d8983SJohannes Doerfert targetData<AsyncInfoTy>(Loc, DeviceId, ArgNum, ArgsBase, Args, ArgSizes, 189330d8983SJohannes Doerfert ArgTypes, ArgNames, ArgMappers, targetDataBegin, 190330d8983SJohannes Doerfert "Entering OpenMP data region with being_mapper", 191330d8983SJohannes Doerfert "begin"); 192330d8983SJohannes Doerfert } 193330d8983SJohannes Doerfert 194330d8983SJohannes Doerfert EXTERN void __tgt_target_data_begin_nowait_mapper( 195330d8983SJohannes Doerfert ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase, 196330d8983SJohannes Doerfert void **Args, int64_t *ArgSizes, int64_t *ArgTypes, map_var_info_t *ArgNames, 197330d8983SJohannes Doerfert void **ArgMappers, int32_t DepNum, void *DepList, int32_t NoAliasDepNum, 198330d8983SJohannes Doerfert void *NoAliasDepList) { 199330d8983SJohannes Doerfert OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0))); 200330d8983SJohannes Doerfert targetData<TaskAsyncInfoWrapperTy>( 201330d8983SJohannes Doerfert Loc, DeviceId, ArgNum, ArgsBase, Args, ArgSizes, ArgTypes, ArgNames, 202330d8983SJohannes Doerfert ArgMappers, targetDataBegin, 203330d8983SJohannes Doerfert "Entering OpenMP data region with being_nowait_mapper", "begin"); 204330d8983SJohannes Doerfert } 205330d8983SJohannes Doerfert 206330d8983SJohannes Doerfert /// passes data from the target, releases target memory and destroys 207330d8983SJohannes Doerfert /// the host-target mapping (top entry from the stack of data maps) 208330d8983SJohannes 
Doerfert /// created by the last __tgt_target_data_begin. 209330d8983SJohannes Doerfert EXTERN void __tgt_target_data_end_mapper(ident_t *Loc, int64_t DeviceId, 210330d8983SJohannes Doerfert int32_t ArgNum, void **ArgsBase, 211330d8983SJohannes Doerfert void **Args, int64_t *ArgSizes, 212330d8983SJohannes Doerfert int64_t *ArgTypes, 213330d8983SJohannes Doerfert map_var_info_t *ArgNames, 214330d8983SJohannes Doerfert void **ArgMappers) { 215330d8983SJohannes Doerfert OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0))); 216330d8983SJohannes Doerfert targetData<AsyncInfoTy>(Loc, DeviceId, ArgNum, ArgsBase, Args, ArgSizes, 217330d8983SJohannes Doerfert ArgTypes, ArgNames, ArgMappers, targetDataEnd, 218330d8983SJohannes Doerfert "Exiting OpenMP data region with end_mapper", "end"); 219330d8983SJohannes Doerfert } 220330d8983SJohannes Doerfert 221330d8983SJohannes Doerfert EXTERN void __tgt_target_data_end_nowait_mapper( 222330d8983SJohannes Doerfert ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase, 223330d8983SJohannes Doerfert void **Args, int64_t *ArgSizes, int64_t *ArgTypes, map_var_info_t *ArgNames, 224330d8983SJohannes Doerfert void **ArgMappers, int32_t DepNum, void *DepList, int32_t NoAliasDepNum, 225330d8983SJohannes Doerfert void *NoAliasDepList) { 226330d8983SJohannes Doerfert OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0))); 227330d8983SJohannes Doerfert targetData<TaskAsyncInfoWrapperTy>( 228330d8983SJohannes Doerfert Loc, DeviceId, ArgNum, ArgsBase, Args, ArgSizes, ArgTypes, ArgNames, 229330d8983SJohannes Doerfert ArgMappers, targetDataEnd, 230330d8983SJohannes Doerfert "Exiting OpenMP data region with end_nowait_mapper", "end"); 231330d8983SJohannes Doerfert } 232330d8983SJohannes Doerfert 233330d8983SJohannes Doerfert EXTERN void __tgt_target_data_update_mapper(ident_t *Loc, int64_t DeviceId, 234330d8983SJohannes Doerfert int32_t ArgNum, void **ArgsBase, 235330d8983SJohannes Doerfert void **Args, 
int64_t *ArgSizes, 236330d8983SJohannes Doerfert int64_t *ArgTypes, 237330d8983SJohannes Doerfert map_var_info_t *ArgNames, 238330d8983SJohannes Doerfert void **ArgMappers) { 239330d8983SJohannes Doerfert OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0))); 240330d8983SJohannes Doerfert targetData<AsyncInfoTy>( 241330d8983SJohannes Doerfert Loc, DeviceId, ArgNum, ArgsBase, Args, ArgSizes, ArgTypes, ArgNames, 242330d8983SJohannes Doerfert ArgMappers, targetDataUpdate, 243330d8983SJohannes Doerfert "Updating data within the OpenMP data region with update_mapper", 244330d8983SJohannes Doerfert "update"); 245330d8983SJohannes Doerfert } 246330d8983SJohannes Doerfert 247330d8983SJohannes Doerfert EXTERN void __tgt_target_data_update_nowait_mapper( 248330d8983SJohannes Doerfert ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase, 249330d8983SJohannes Doerfert void **Args, int64_t *ArgSizes, int64_t *ArgTypes, map_var_info_t *ArgNames, 250330d8983SJohannes Doerfert void **ArgMappers, int32_t DepNum, void *DepList, int32_t NoAliasDepNum, 251330d8983SJohannes Doerfert void *NoAliasDepList) { 252330d8983SJohannes Doerfert OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0))); 253330d8983SJohannes Doerfert targetData<TaskAsyncInfoWrapperTy>( 254330d8983SJohannes Doerfert Loc, DeviceId, ArgNum, ArgsBase, Args, ArgSizes, ArgTypes, ArgNames, 255330d8983SJohannes Doerfert ArgMappers, targetDataUpdate, 256330d8983SJohannes Doerfert "Updating data within the OpenMP data region with update_nowait_mapper", 257330d8983SJohannes Doerfert "update"); 258330d8983SJohannes Doerfert } 259330d8983SJohannes Doerfert 260330d8983SJohannes Doerfert static KernelArgsTy *upgradeKernelArgs(KernelArgsTy *KernelArgs, 261330d8983SJohannes Doerfert KernelArgsTy &LocalKernelArgs, 262330d8983SJohannes Doerfert int32_t NumTeams, int32_t ThreadLimit) { 263330d8983SJohannes Doerfert if (KernelArgs->Version > OMP_KERNEL_ARG_VERSION) 264330d8983SJohannes 
Doerfert DP("Unexpected ABI version: %u\n", KernelArgs->Version); 265330d8983SJohannes Doerfert 266330d8983SJohannes Doerfert uint32_t UpgradedVersion = KernelArgs->Version; 267330d8983SJohannes Doerfert if (KernelArgs->Version < OMP_KERNEL_ARG_VERSION) { 268330d8983SJohannes Doerfert // The upgraded version will be based on the kernel launch environment. 269330d8983SJohannes Doerfert if (KernelArgs->Version < OMP_KERNEL_ARG_MIN_VERSION_WITH_DYN_PTR) 270330d8983SJohannes Doerfert UpgradedVersion = OMP_KERNEL_ARG_MIN_VERSION_WITH_DYN_PTR - 1; 271330d8983SJohannes Doerfert else 272330d8983SJohannes Doerfert UpgradedVersion = OMP_KERNEL_ARG_VERSION; 273330d8983SJohannes Doerfert } 274330d8983SJohannes Doerfert if (UpgradedVersion != KernelArgs->Version) { 275330d8983SJohannes Doerfert LocalKernelArgs.Version = UpgradedVersion; 276330d8983SJohannes Doerfert LocalKernelArgs.NumArgs = KernelArgs->NumArgs; 277330d8983SJohannes Doerfert LocalKernelArgs.ArgBasePtrs = KernelArgs->ArgBasePtrs; 278330d8983SJohannes Doerfert LocalKernelArgs.ArgPtrs = KernelArgs->ArgPtrs; 279330d8983SJohannes Doerfert LocalKernelArgs.ArgSizes = KernelArgs->ArgSizes; 280330d8983SJohannes Doerfert LocalKernelArgs.ArgTypes = KernelArgs->ArgTypes; 281330d8983SJohannes Doerfert LocalKernelArgs.ArgNames = KernelArgs->ArgNames; 282330d8983SJohannes Doerfert LocalKernelArgs.ArgMappers = KernelArgs->ArgMappers; 283330d8983SJohannes Doerfert LocalKernelArgs.Tripcount = KernelArgs->Tripcount; 284330d8983SJohannes Doerfert LocalKernelArgs.Flags = KernelArgs->Flags; 285330d8983SJohannes Doerfert LocalKernelArgs.DynCGroupMem = 0; 286330d8983SJohannes Doerfert LocalKernelArgs.NumTeams[0] = NumTeams; 287*92376c3fSShilei Tian LocalKernelArgs.NumTeams[1] = 1; 288*92376c3fSShilei Tian LocalKernelArgs.NumTeams[2] = 1; 289330d8983SJohannes Doerfert LocalKernelArgs.ThreadLimit[0] = ThreadLimit; 290*92376c3fSShilei Tian LocalKernelArgs.ThreadLimit[1] = 1; 291*92376c3fSShilei Tian LocalKernelArgs.ThreadLimit[2] = 1; 
292330d8983SJohannes Doerfert return &LocalKernelArgs; 293330d8983SJohannes Doerfert } 294330d8983SJohannes Doerfert 295*92376c3fSShilei Tian // FIXME: This is a WA to "calibrate" the bad work done in the front end. 296*92376c3fSShilei Tian // Delete this ugly code after the front end emits proper values. 297*92376c3fSShilei Tian auto CorrectMultiDim = [](uint32_t(&Val)[3]) { 298*92376c3fSShilei Tian if (Val[1] == 0) 299*92376c3fSShilei Tian Val[1] = 1; 300*92376c3fSShilei Tian if (Val[2] == 0) 301*92376c3fSShilei Tian Val[2] = 1; 302*92376c3fSShilei Tian }; 303*92376c3fSShilei Tian CorrectMultiDim(KernelArgs->ThreadLimit); 304*92376c3fSShilei Tian CorrectMultiDim(KernelArgs->NumTeams); 305*92376c3fSShilei Tian 306330d8983SJohannes Doerfert return KernelArgs; 307330d8983SJohannes Doerfert } 308330d8983SJohannes Doerfert 309330d8983SJohannes Doerfert template <typename TargetAsyncInfoTy> 310330d8983SJohannes Doerfert static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams, 311330d8983SJohannes Doerfert int32_t ThreadLimit, void *HostPtr, 312330d8983SJohannes Doerfert KernelArgsTy *KernelArgs) { 313330d8983SJohannes Doerfert assert(PM && "Runtime not initialized"); 314330d8983SJohannes Doerfert static_assert(std::is_convertible_v<TargetAsyncInfoTy, AsyncInfoTy>, 315330d8983SJohannes Doerfert "Target AsyncInfoTy must be convertible to AsyncInfoTy."); 316330d8983SJohannes Doerfert DP("Entering target region for device %" PRId64 " with entry point " DPxMOD 317330d8983SJohannes Doerfert "\n", 318330d8983SJohannes Doerfert DeviceId, DPxPTR(HostPtr)); 319330d8983SJohannes Doerfert 320ff12c006SJohannes Doerfert if (checkDevice(DeviceId, Loc)) { 321330d8983SJohannes Doerfert DP("Not offloading to device %" PRId64 "\n", DeviceId); 322330d8983SJohannes Doerfert return OMP_TGT_FAIL; 323330d8983SJohannes Doerfert } 324330d8983SJohannes Doerfert 325330d8983SJohannes Doerfert bool IsTeams = NumTeams != -1; 326330d8983SJohannes Doerfert if (!IsTeams) 
327330d8983SJohannes Doerfert KernelArgs->NumTeams[0] = NumTeams = 1; 328330d8983SJohannes Doerfert 329330d8983SJohannes Doerfert // Auto-upgrade kernel args version 1 to 2. 330330d8983SJohannes Doerfert KernelArgsTy LocalKernelArgs; 331330d8983SJohannes Doerfert KernelArgs = 332330d8983SJohannes Doerfert upgradeKernelArgs(KernelArgs, LocalKernelArgs, NumTeams, ThreadLimit); 333330d8983SJohannes Doerfert 334330d8983SJohannes Doerfert TIMESCOPE_WITH_DETAILS_AND_IDENT( 335330d8983SJohannes Doerfert "Runtime: target exe", 336330d8983SJohannes Doerfert "NumTeams=" + std::to_string(NumTeams) + 337330d8983SJohannes Doerfert ";NumArgs=" + std::to_string(KernelArgs->NumArgs), 338330d8983SJohannes Doerfert Loc); 339330d8983SJohannes Doerfert 340330d8983SJohannes Doerfert if (getInfoLevel() & OMP_INFOTYPE_KERNEL_ARGS) 341330d8983SJohannes Doerfert printKernelArguments(Loc, DeviceId, KernelArgs->NumArgs, 342330d8983SJohannes Doerfert KernelArgs->ArgSizes, KernelArgs->ArgTypes, 343330d8983SJohannes Doerfert KernelArgs->ArgNames, "Entering OpenMP kernel"); 344330d8983SJohannes Doerfert #ifdef OMPTARGET_DEBUG 345330d8983SJohannes Doerfert for (uint32_t I = 0; I < KernelArgs->NumArgs; ++I) { 346330d8983SJohannes Doerfert DP("Entry %2d: Base=" DPxMOD ", Begin=" DPxMOD ", Size=%" PRId64 347330d8983SJohannes Doerfert ", Type=0x%" PRIx64 ", Name=%s\n", 348330d8983SJohannes Doerfert I, DPxPTR(KernelArgs->ArgBasePtrs[I]), DPxPTR(KernelArgs->ArgPtrs[I]), 349330d8983SJohannes Doerfert KernelArgs->ArgSizes[I], KernelArgs->ArgTypes[I], 350330d8983SJohannes Doerfert (KernelArgs->ArgNames) 351330d8983SJohannes Doerfert ? 
getNameFromMapping(KernelArgs->ArgNames[I]).c_str() 352330d8983SJohannes Doerfert : "unknown"); 353330d8983SJohannes Doerfert } 354330d8983SJohannes Doerfert #endif 355330d8983SJohannes Doerfert 356330d8983SJohannes Doerfert auto DeviceOrErr = PM->getDevice(DeviceId); 357330d8983SJohannes Doerfert if (!DeviceOrErr) 358330d8983SJohannes Doerfert FATAL_MESSAGE(DeviceId, "%s", toString(DeviceOrErr.takeError()).c_str()); 359330d8983SJohannes Doerfert 360330d8983SJohannes Doerfert TargetAsyncInfoTy TargetAsyncInfo(*DeviceOrErr); 361330d8983SJohannes Doerfert AsyncInfoTy &AsyncInfo = TargetAsyncInfo; 362330d8983SJohannes Doerfert /// RAII to establish tool anchors before and after target region 363330d8983SJohannes Doerfert OMPT_IF_BUILT(InterfaceRAII TargetRAII( 364330d8983SJohannes Doerfert RegionInterface.getCallbacks<ompt_target>(), DeviceId, 365330d8983SJohannes Doerfert /*CodePtr=*/OMPT_GET_RETURN_ADDRESS);) 366330d8983SJohannes Doerfert 367330d8983SJohannes Doerfert int Rc = OFFLOAD_SUCCESS; 368330d8983SJohannes Doerfert Rc = target(Loc, *DeviceOrErr, HostPtr, *KernelArgs, AsyncInfo); 369330d8983SJohannes Doerfert { // required to show syncronization 370330d8983SJohannes Doerfert TIMESCOPE_WITH_DETAILS_AND_IDENT("Runtime: syncronize", "", Loc); 371330d8983SJohannes Doerfert if (Rc == OFFLOAD_SUCCESS) 372330d8983SJohannes Doerfert Rc = AsyncInfo.synchronize(); 373330d8983SJohannes Doerfert 374330d8983SJohannes Doerfert handleTargetOutcome(Rc == OFFLOAD_SUCCESS, Loc); 375330d8983SJohannes Doerfert assert(Rc == OFFLOAD_SUCCESS && "__tgt_target_kernel unexpected failure!"); 376330d8983SJohannes Doerfert } 377330d8983SJohannes Doerfert return OMP_TGT_SUCCESS; 378330d8983SJohannes Doerfert } 379330d8983SJohannes Doerfert 380330d8983SJohannes Doerfert /// Implements a kernel entry that executes the target region on the specified 381330d8983SJohannes Doerfert /// device. 
382330d8983SJohannes Doerfert /// 383330d8983SJohannes Doerfert /// \param Loc Source location associated with this target region. 384330d8983SJohannes Doerfert /// \param DeviceId The device to execute this region, -1 indicated the default. 385330d8983SJohannes Doerfert /// \param NumTeams Number of teams to launch the region with, -1 indicates a 386330d8983SJohannes Doerfert /// non-teams region and 0 indicates it was unspecified. 387330d8983SJohannes Doerfert /// \param ThreadLimit Limit to the number of threads to use in the kernel 388330d8983SJohannes Doerfert /// launch, 0 indicates it was unspecified. 389330d8983SJohannes Doerfert /// \param HostPtr The pointer to the host function registered with the kernel. 390330d8983SJohannes Doerfert /// \param Args All arguments to this kernel launch (see struct definition). 391330d8983SJohannes Doerfert EXTERN int __tgt_target_kernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams, 392330d8983SJohannes Doerfert int32_t ThreadLimit, void *HostPtr, 393330d8983SJohannes Doerfert KernelArgsTy *KernelArgs) { 394330d8983SJohannes Doerfert OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0))); 395330d8983SJohannes Doerfert if (KernelArgs->Flags.NoWait) 396330d8983SJohannes Doerfert return targetKernel<TaskAsyncInfoWrapperTy>( 397330d8983SJohannes Doerfert Loc, DeviceId, NumTeams, ThreadLimit, HostPtr, KernelArgs); 398330d8983SJohannes Doerfert return targetKernel<AsyncInfoTy>(Loc, DeviceId, NumTeams, ThreadLimit, 399330d8983SJohannes Doerfert HostPtr, KernelArgs); 400330d8983SJohannes Doerfert } 401330d8983SJohannes Doerfert 402330d8983SJohannes Doerfert /// Activates the record replay mechanism. 403330d8983SJohannes Doerfert /// \param DeviceId The device identifier to execute the target region. 
404330d8983SJohannes Doerfert /// \param MemorySize The number of bytes to be (pre-)allocated 405330d8983SJohannes Doerfert /// by the bump allocator 406330d8983SJohannes Doerfert /// /param IsRecord Activates the record replay mechanism in 407330d8983SJohannes Doerfert /// 'record' mode or 'replay' mode. 408330d8983SJohannes Doerfert /// /param SaveOutput Store the device memory after kernel 409330d8983SJohannes Doerfert /// execution on persistent storage 410330d8983SJohannes Doerfert EXTERN int __tgt_activate_record_replay(int64_t DeviceId, uint64_t MemorySize, 411330d8983SJohannes Doerfert void *VAddr, bool IsRecord, 412330d8983SJohannes Doerfert bool SaveOutput, 413330d8983SJohannes Doerfert uint64_t &ReqPtrArgOffset) { 414330d8983SJohannes Doerfert assert(PM && "Runtime not initialized"); 415330d8983SJohannes Doerfert OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0))); 416330d8983SJohannes Doerfert auto DeviceOrErr = PM->getDevice(DeviceId); 417330d8983SJohannes Doerfert if (!DeviceOrErr) 418330d8983SJohannes Doerfert FATAL_MESSAGE(DeviceId, "%s", toString(DeviceOrErr.takeError()).c_str()); 419330d8983SJohannes Doerfert 420330d8983SJohannes Doerfert [[maybe_unused]] int Rc = target_activate_rr( 421330d8983SJohannes Doerfert *DeviceOrErr, MemorySize, VAddr, IsRecord, SaveOutput, ReqPtrArgOffset); 422330d8983SJohannes Doerfert assert(Rc == OFFLOAD_SUCCESS && 423330d8983SJohannes Doerfert "__tgt_activate_record_replay unexpected failure!"); 424330d8983SJohannes Doerfert return OMP_TGT_SUCCESS; 425330d8983SJohannes Doerfert } 426330d8983SJohannes Doerfert 427330d8983SJohannes Doerfert /// Implements a target kernel entry that replays a pre-recorded kernel. 428330d8983SJohannes Doerfert /// \param Loc Source location associated with this target region (unused). 429330d8983SJohannes Doerfert /// \param DeviceId The device identifier to execute the target region. 
/// \param HostPtr A pointer to an address that uniquely identifies the kernel.
/// \param DeviceMemory A pointer to an array storing device memory data to move
///                     prior to kernel execution.
/// \param DeviceMemorySize The size of the above device memory data in bytes.
/// \param TgtArgs An array of pointers of the pre-recorded target kernel
///                arguments.
/// \param TgtOffsets An array of pointers of the pre-recorded target kernel
///                   argument offsets.
/// \param NumArgs The number of kernel arguments.
/// \param NumTeams Number of teams to launch the target region with.
/// \param ThreadLimit Limit to the number of threads to use in kernel
///                    execution.
/// \param LoopTripCount The pre-recorded value of the loop tripcount, if any.
/// \return OMP_TGT_SUCCESS on success, OMP_TGT_FAIL on failure.
444330d8983SJohannes Doerfert EXTERN int __tgt_target_kernel_replay(ident_t *Loc, int64_t DeviceId, 445330d8983SJohannes Doerfert void *HostPtr, void *DeviceMemory, 446330d8983SJohannes Doerfert int64_t DeviceMemorySize, void **TgtArgs, 447330d8983SJohannes Doerfert ptrdiff_t *TgtOffsets, int32_t NumArgs, 448330d8983SJohannes Doerfert int32_t NumTeams, int32_t ThreadLimit, 449330d8983SJohannes Doerfert uint64_t LoopTripCount) { 450330d8983SJohannes Doerfert assert(PM && "Runtime not initialized"); 451330d8983SJohannes Doerfert OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0))); 452ff12c006SJohannes Doerfert if (checkDevice(DeviceId, Loc)) { 453330d8983SJohannes Doerfert DP("Not offloading to device %" PRId64 "\n", DeviceId); 454330d8983SJohannes Doerfert return OMP_TGT_FAIL; 455330d8983SJohannes Doerfert } 456330d8983SJohannes Doerfert auto DeviceOrErr = PM->getDevice(DeviceId); 457330d8983SJohannes Doerfert if (!DeviceOrErr) 458330d8983SJohannes Doerfert FATAL_MESSAGE(DeviceId, "%s", toString(DeviceOrErr.takeError()).c_str()); 459330d8983SJohannes Doerfert 460330d8983SJohannes Doerfert /// RAII to establish tool anchors before and after target region 461330d8983SJohannes Doerfert OMPT_IF_BUILT(InterfaceRAII TargetRAII( 462330d8983SJohannes Doerfert RegionInterface.getCallbacks<ompt_target>(), DeviceId, 463330d8983SJohannes Doerfert /*CodePtr=*/OMPT_GET_RETURN_ADDRESS);) 464330d8983SJohannes Doerfert 465330d8983SJohannes Doerfert AsyncInfoTy AsyncInfo(*DeviceOrErr); 466330d8983SJohannes Doerfert int Rc = target_replay(Loc, *DeviceOrErr, HostPtr, DeviceMemory, 467330d8983SJohannes Doerfert DeviceMemorySize, TgtArgs, TgtOffsets, NumArgs, 468330d8983SJohannes Doerfert NumTeams, ThreadLimit, LoopTripCount, AsyncInfo); 469330d8983SJohannes Doerfert if (Rc == OFFLOAD_SUCCESS) 470330d8983SJohannes Doerfert Rc = AsyncInfo.synchronize(); 471330d8983SJohannes Doerfert handleTargetOutcome(Rc == OFFLOAD_SUCCESS, Loc); 472330d8983SJohannes Doerfert assert(Rc 
== OFFLOAD_SUCCESS && 473330d8983SJohannes Doerfert "__tgt_target_kernel_replay unexpected failure!"); 474330d8983SJohannes Doerfert return OMP_TGT_SUCCESS; 475330d8983SJohannes Doerfert } 476330d8983SJohannes Doerfert 477330d8983SJohannes Doerfert // Get the current number of components for a user-defined mapper. 478330d8983SJohannes Doerfert EXTERN int64_t __tgt_mapper_num_components(void *RtMapperHandle) { 479330d8983SJohannes Doerfert auto *MapperComponentsPtr = (struct MapperComponentsTy *)RtMapperHandle; 480330d8983SJohannes Doerfert int64_t Size = MapperComponentsPtr->Components.size(); 481330d8983SJohannes Doerfert DP("__tgt_mapper_num_components(Handle=" DPxMOD ") returns %" PRId64 "\n", 482330d8983SJohannes Doerfert DPxPTR(RtMapperHandle), Size); 483330d8983SJohannes Doerfert return Size; 484330d8983SJohannes Doerfert } 485330d8983SJohannes Doerfert 486330d8983SJohannes Doerfert // Push back one component for a user-defined mapper. 487330d8983SJohannes Doerfert EXTERN void __tgt_push_mapper_component(void *RtMapperHandle, void *Base, 488330d8983SJohannes Doerfert void *Begin, int64_t Size, int64_t Type, 489330d8983SJohannes Doerfert void *Name) { 490330d8983SJohannes Doerfert DP("__tgt_push_mapper_component(Handle=" DPxMOD 491330d8983SJohannes Doerfert ") adds an entry (Base=" DPxMOD ", Begin=" DPxMOD ", Size=%" PRId64 492330d8983SJohannes Doerfert ", Type=0x%" PRIx64 ", Name=%s).\n", 493330d8983SJohannes Doerfert DPxPTR(RtMapperHandle), DPxPTR(Base), DPxPTR(Begin), Size, Type, 494330d8983SJohannes Doerfert (Name) ? 
getNameFromMapping(Name).c_str() : "unknown"); 495330d8983SJohannes Doerfert auto *MapperComponentsPtr = (struct MapperComponentsTy *)RtMapperHandle; 496330d8983SJohannes Doerfert MapperComponentsPtr->Components.push_back( 497330d8983SJohannes Doerfert MapComponentInfoTy(Base, Begin, Size, Type, Name)); 498330d8983SJohannes Doerfert } 499330d8983SJohannes Doerfert 500330d8983SJohannes Doerfert EXTERN void __tgt_set_info_flag(uint32_t NewInfoLevel) { 501330d8983SJohannes Doerfert assert(PM && "Runtime not initialized"); 502330d8983SJohannes Doerfert std::atomic<uint32_t> &InfoLevel = getInfoLevelInternal(); 503330d8983SJohannes Doerfert InfoLevel.store(NewInfoLevel); 504330d8983SJohannes Doerfert } 505330d8983SJohannes Doerfert 506330d8983SJohannes Doerfert EXTERN int __tgt_print_device_info(int64_t DeviceId) { 507330d8983SJohannes Doerfert assert(PM && "Runtime not initialized"); 508330d8983SJohannes Doerfert auto DeviceOrErr = PM->getDevice(DeviceId); 509330d8983SJohannes Doerfert if (!DeviceOrErr) 510330d8983SJohannes Doerfert FATAL_MESSAGE(DeviceId, "%s", toString(DeviceOrErr.takeError()).c_str()); 511330d8983SJohannes Doerfert 512330d8983SJohannes Doerfert return DeviceOrErr->printDeviceInfo(); 513330d8983SJohannes Doerfert } 514330d8983SJohannes Doerfert 515330d8983SJohannes Doerfert EXTERN void __tgt_target_nowait_query(void **AsyncHandle) { 516330d8983SJohannes Doerfert assert(PM && "Runtime not initialized"); 517330d8983SJohannes Doerfert OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0))); 518330d8983SJohannes Doerfert 519330d8983SJohannes Doerfert if (!AsyncHandle || !*AsyncHandle) { 520330d8983SJohannes Doerfert FATAL_MESSAGE0( 521330d8983SJohannes Doerfert 1, "Receive an invalid async handle from the current OpenMP task. 
Is " 522330d8983SJohannes Doerfert "this a target nowait region?\n"); 523330d8983SJohannes Doerfert } 524330d8983SJohannes Doerfert 525330d8983SJohannes Doerfert // Exponential backoff tries to optimally decide if a thread should just query 526330d8983SJohannes Doerfert // for the device operations (work/spin wait on them) or block until they are 527330d8983SJohannes Doerfert // completed (use device side blocking mechanism). This allows the runtime to 528330d8983SJohannes Doerfert // adapt itself when there are a lot of long-running target regions in-flight. 529330d8983SJohannes Doerfert static thread_local utils::ExponentialBackoff QueryCounter( 530330d8983SJohannes Doerfert Int64Envar("OMPTARGET_QUERY_COUNT_MAX", 10), 531330d8983SJohannes Doerfert Int64Envar("OMPTARGET_QUERY_COUNT_THRESHOLD", 5), 532330d8983SJohannes Doerfert Envar<float>("OMPTARGET_QUERY_COUNT_BACKOFF_FACTOR", 0.5f)); 533330d8983SJohannes Doerfert 534330d8983SJohannes Doerfert auto *AsyncInfo = (AsyncInfoTy *)*AsyncHandle; 535330d8983SJohannes Doerfert 536330d8983SJohannes Doerfert // If the thread is actively waiting on too many target nowait regions, we 537330d8983SJohannes Doerfert // should use the blocking sync type. 538330d8983SJohannes Doerfert if (QueryCounter.isAboveThreshold()) 539330d8983SJohannes Doerfert AsyncInfo->SyncType = AsyncInfoTy::SyncTy::BLOCKING; 540330d8983SJohannes Doerfert 541330d8983SJohannes Doerfert if (AsyncInfo->synchronize()) 542330d8983SJohannes Doerfert FATAL_MESSAGE0(1, "Error while querying the async queue for completion.\n"); 543330d8983SJohannes Doerfert // If there are device operations still pending, return immediately without 544330d8983SJohannes Doerfert // deallocating the handle and increase the current thread query count. 
545330d8983SJohannes Doerfert if (!AsyncInfo->isDone()) { 546330d8983SJohannes Doerfert QueryCounter.increment(); 547330d8983SJohannes Doerfert return; 548330d8983SJohannes Doerfert } 549330d8983SJohannes Doerfert 550330d8983SJohannes Doerfert // When a thread successfully completes a target nowait region, we 551330d8983SJohannes Doerfert // exponentially backoff its query counter by the query factor. 552330d8983SJohannes Doerfert QueryCounter.decrement(); 553330d8983SJohannes Doerfert 554330d8983SJohannes Doerfert // Delete the handle and unset it from the OpenMP task data. 555330d8983SJohannes Doerfert delete AsyncInfo; 556330d8983SJohannes Doerfert *AsyncHandle = nullptr; 557330d8983SJohannes Doerfert } 558