//===-------- interface.cpp - Target independent OpenMP target RTL --------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Implementation of the interface to be used by Clang during the codegen of a
// target region.
//
//===----------------------------------------------------------------------===//

#include "OpenMP/OMPT/Interface.h"
#include "OffloadPolicy.h"
#include "OpenMP/OMPT/Callback.h"
#include "OpenMP/omp.h"
#include "PluginManager.h"
#include "omptarget.h"
#include "private.h"

#include "Shared/EnvironmentVar.h"
#include "Shared/Profile.h"

#include "Utils/ExponentialBackoff.h"

#include "llvm/Frontend/OpenMP/OMPConstants.h"

#include <cassert>
#include <cstdint>
#include <cstdio>
#include <cstdlib>

#ifdef OMPT_SUPPORT
using namespace llvm::omp::target::ompt;
#endif

// If offload is enabled, ensure that device DeviceID has been initialized.
//
// The returned bool indicates whether the offload should be treated as an
// offload to the host device. There are three possible results:
// - Return false if the target device is ready for offload.
// - Return true without reporting a runtime error if offload is
//   disabled, perhaps because the initial device was specified.
// - Report a runtime error and return true.
//
// If DeviceID == OFFLOAD_DEVICE_DEFAULT, set DeviceID to the default device.
// This step might be skipped if offload is disabled.
bool checkDevice(int64_t &DeviceID, ident_t *Loc) {
  if (OffloadPolicy::get(*PM).Kind == OffloadPolicy::DISABLED) {
    DP("Offload is disabled\n");
    return true;
  }

  if (DeviceID == OFFLOAD_DEVICE_DEFAULT) {
    DeviceID = omp_get_default_device();
    DP("Use default device id %" PRId64 "\n", DeviceID);
  }

  // Proposed behavior for OpenMP 5.2 in OpenMP spec github issue 2669.
  if (omp_get_num_devices() == 0) {
    DP("omp_get_num_devices() == 0 but offload is mandatory\n");
    handleTargetOutcome(false, Loc);
    return true;
  }

  if (DeviceID == omp_get_initial_device()) {
    DP("Device is host (%" PRId64 "), returning as if offload is disabled\n",
       DeviceID);
    return true;
  }
  return false;
}

////////////////////////////////////////////////////////////////////////////////
/// Reports that the old 'requires' registration interface has been removed.
EXTERN void __tgt_register_requires(int64_t Flags) {
  MESSAGE("The %s function has been removed. Old OpenMP requirements will not "
          "be handled",
          __PRETTY_FUNCTION__);
}

EXTERN void __tgt_rtl_init() { initRuntime(); }
EXTERN void __tgt_rtl_deinit() { deinitRuntime(); }

////////////////////////////////////////////////////////////////////////////////
/// adds a target shared library to the target execution image
EXTERN void __tgt_register_lib(__tgt_bin_desc *Desc) {
  initRuntime();
  if (PM->delayRegisterLib(Desc))
    return;

  PM->registerLib(Desc);
}

////////////////////////////////////////////////////////////////////////////////
/// Initialize all available devices without registering any image
EXTERN void __tgt_init_all_rtls() {
  assert(PM && "Runtime not initialized");
  PM->initializeAllDevices();
}

////////////////////////////////////////////////////////////////////////////////
/// unloads a target shared library
EXTERN void __tgt_unregister_lib(__tgt_bin_desc *Desc) {
  PM->unregisterLib(Desc);

  deinitRuntime();
}

template <typename TargetAsyncInfoTy>
static inline void
targetData(ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase,
           void **Args, int64_t *ArgSizes, int64_t *ArgTypes,
           map_var_info_t *ArgNames, void **ArgMappers,
           TargetDataFuncPtrTy TargetDataFunction, const char *RegionTypeMsg,
           const char *RegionName) {
  assert(PM && "Runtime not initialized");
  static_assert(std::is_convertible_v<TargetAsyncInfoTy, AsyncInfoTy>,
                "TargetAsyncInfoTy must be convertible to AsyncInfoTy.");

  TIMESCOPE_WITH_DETAILS_AND_IDENT("Runtime: Data Copy",
                                   "NumArgs=" + std::to_string(ArgNum), Loc);

  DP("Entering data %s region for device %" PRId64 " with %d mappings\n",
     RegionName, DeviceId, ArgNum);

  if (checkDevice(DeviceId, Loc)) {
    DP("Not offloading to device %" PRId64 "\n", DeviceId);
    return;
  }

  if (getInfoLevel() & OMP_INFOTYPE_KERNEL_ARGS)
    printKernelArguments(Loc, DeviceId, ArgNum, ArgSizes, ArgTypes, ArgNames,
                         RegionTypeMsg);
#ifdef OMPTARGET_DEBUG
  for (int I = 0; I < ArgNum; ++I) {
    DP("Entry %2d: Base=" DPxMOD ", Begin=" DPxMOD ", Size=%" PRId64
       ", Type=0x%" PRIx64 ", Name=%s\n",
       I, DPxPTR(ArgsBase[I]), DPxPTR(Args[I]), ArgSizes[I], ArgTypes[I],
       (ArgNames) ? getNameFromMapping(ArgNames[I]).c_str() : "unknown");
  }
#endif

  auto DeviceOrErr = PM->getDevice(DeviceId);
  if (!DeviceOrErr)
    FATAL_MESSAGE(DeviceId, "%s", toString(DeviceOrErr.takeError()).c_str());

  TargetAsyncInfoTy TargetAsyncInfo(*DeviceOrErr);
  AsyncInfoTy &AsyncInfo = TargetAsyncInfo;

  /// RAII to establish tool anchors before and after data begin / end / update
  OMPT_IF_BUILT(assert((TargetDataFunction == targetDataBegin ||
                        TargetDataFunction == targetDataEnd ||
                        TargetDataFunction == targetDataUpdate) &&
                       "Encountered unexpected TargetDataFunction during "
                       "execution of targetData");
                auto CallbackFunctions =
                    (TargetDataFunction == targetDataBegin)
                        ? RegionInterface.getCallbacks<ompt_target_enter_data>()
                    : (TargetDataFunction == targetDataEnd)
                        ? RegionInterface.getCallbacks<ompt_target_exit_data>()
                        : RegionInterface.getCallbacks<ompt_target_update>();
                InterfaceRAII TargetDataRAII(CallbackFunctions, DeviceId,
                                             OMPT_GET_RETURN_ADDRESS);)

  int Rc = OFFLOAD_SUCCESS;
  Rc = TargetDataFunction(Loc, *DeviceOrErr, ArgNum, ArgsBase, Args, ArgSizes,
                          ArgTypes, ArgNames, ArgMappers, AsyncInfo,
                          /*FromMapper=*/false);

  if (Rc == OFFLOAD_SUCCESS)
    Rc = AsyncInfo.synchronize();

  handleTargetOutcome(Rc == OFFLOAD_SUCCESS, Loc);
}
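
// The entry points below instantiate targetData with two flavors of async
// info: AsyncInfoTy for the synchronous *_mapper variants and
// TaskAsyncInfoWrapperTy for the *_nowait_mapper variants, which attach the
// asynchronous device operations to the encountering OpenMP task.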

/// creates host-to-target data mapping, stores it in the
/// libomptarget.so internal structure (an entry in a stack of data maps)
/// and passes the data to the device.
EXTERN void __tgt_target_data_begin_mapper(ident_t *Loc, int64_t DeviceId,
                                           int32_t ArgNum, void **ArgsBase,
                                           void **Args, int64_t *ArgSizes,
                                           int64_t *ArgTypes,
                                           map_var_info_t *ArgNames,
                                           void **ArgMappers) {
  OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
  targetData<AsyncInfoTy>(Loc, DeviceId, ArgNum, ArgsBase, Args, ArgSizes,
                          ArgTypes, ArgNames, ArgMappers, targetDataBegin,
                          "Entering OpenMP data region with begin_mapper",
                          "begin");
}

EXTERN void __tgt_target_data_begin_nowait_mapper(
    ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase,
    void **Args, int64_t *ArgSizes, int64_t *ArgTypes, map_var_info_t *ArgNames,
    void **ArgMappers, int32_t DepNum, void *DepList, int32_t NoAliasDepNum,
    void *NoAliasDepList) {
  OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
  targetData<TaskAsyncInfoWrapperTy>(
      Loc, DeviceId, ArgNum, ArgsBase, Args, ArgSizes, ArgTypes, ArgNames,
      ArgMappers, targetDataBegin,
      "Entering OpenMP data region with begin_nowait_mapper", "begin");
}

/// passes data from the target, releases target memory and destroys
/// the host-target mapping (top entry from the stack of data maps)
/// created by the last __tgt_target_data_begin.
EXTERN void __tgt_target_data_end_mapper(ident_t *Loc, int64_t DeviceId,
                                         int32_t ArgNum, void **ArgsBase,
                                         void **Args, int64_t *ArgSizes,
                                         int64_t *ArgTypes,
                                         map_var_info_t *ArgNames,
                                         void **ArgMappers) {
  OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
  targetData<AsyncInfoTy>(Loc, DeviceId, ArgNum, ArgsBase, Args, ArgSizes,
                          ArgTypes, ArgNames, ArgMappers, targetDataEnd,
                          "Exiting OpenMP data region with end_mapper", "end");
}
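
// Illustrative sketch (not emitted by this file): for a directive such as
//   #pragma omp target data map(tofrom : A[0:N])
// compiler-generated code conceptually brackets the region with the begin/end
// entry points above. The argument values are placeholders, not exactly what
// Clang emits:
//
//   void *ArgsBase[] = {A};
//   void *Args[] = {A};
//   int64_t ArgSizes[] = {N * (int64_t)sizeof(*A)};
//   int64_t ArgTypes[] = {OMP_TGT_MAPTYPE_TO | OMP_TGT_MAPTYPE_FROM};
//   __tgt_target_data_begin_mapper(Loc, /*DeviceId=*/OFFLOAD_DEVICE_DEFAULT,
//                                  /*ArgNum=*/1, ArgsBase, Args, ArgSizes,
//                                  ArgTypes, /*ArgNames=*/nullptr,
//                                  /*ArgMappers=*/nullptr);
//   // ... host code executed inside the data region ...
//   __tgt_target_data_end_mapper(Loc, /*DeviceId=*/OFFLOAD_DEVICE_DEFAULT,
//                                /*ArgNum=*/1, ArgsBase, Args, ArgSizes,
//                                ArgTypes, /*ArgNames=*/nullptr,
//                                /*ArgMappers=*/nullptr);
//
// The exact argument encoding is owned by Clang; the sketch only shows the
// calling convention assumed by the functions in this file.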

EXTERN void __tgt_target_data_end_nowait_mapper(
    ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase,
    void **Args, int64_t *ArgSizes, int64_t *ArgTypes, map_var_info_t *ArgNames,
    void **ArgMappers, int32_t DepNum, void *DepList, int32_t NoAliasDepNum,
    void *NoAliasDepList) {
  OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
  targetData<TaskAsyncInfoWrapperTy>(
      Loc, DeviceId, ArgNum, ArgsBase, Args, ArgSizes, ArgTypes, ArgNames,
      ArgMappers, targetDataEnd,
      "Exiting OpenMP data region with end_nowait_mapper", "end");
}

EXTERN void __tgt_target_data_update_mapper(ident_t *Loc, int64_t DeviceId,
                                            int32_t ArgNum, void **ArgsBase,
                                            void **Args, int64_t *ArgSizes,
                                            int64_t *ArgTypes,
                                            map_var_info_t *ArgNames,
                                            void **ArgMappers) {
  OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
  targetData<AsyncInfoTy>(
      Loc, DeviceId, ArgNum, ArgsBase, Args, ArgSizes, ArgTypes, ArgNames,
      ArgMappers, targetDataUpdate,
      "Updating data within the OpenMP data region with update_mapper",
      "update");
}

EXTERN void __tgt_target_data_update_nowait_mapper(
    ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase,
    void **Args, int64_t *ArgSizes, int64_t *ArgTypes, map_var_info_t *ArgNames,
    void **ArgMappers, int32_t DepNum, void *DepList, int32_t NoAliasDepNum,
    void *NoAliasDepList) {
  OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
  targetData<TaskAsyncInfoWrapperTy>(
      Loc, DeviceId, ArgNum, ArgsBase, Args, ArgSizes, ArgTypes, ArgNames,
      ArgMappers, targetDataUpdate,
      "Updating data within the OpenMP data region with update_nowait_mapper",
      "update");
}

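/// Upgrade \p KernelArgs emitted against an older kernel-argument ABI version
/// to the layout expected by the current runtime. When an upgrade is needed,
/// the struct is copied into \p LocalKernelArgs, the scalar \p NumTeams and
/// \p ThreadLimit are widened into 3-D launch bounds, and a pointer to the
/// copy is returned; otherwise the original \p KernelArgs is returned after
/// zero Y/Z launch dimensions are corrected to 1.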
static KernelArgsTy *upgradeKernelArgs(KernelArgsTy *KernelArgs,
                                       KernelArgsTy &LocalKernelArgs,
                                       int32_t NumTeams, int32_t ThreadLimit) {
  if (KernelArgs->Version > OMP_KERNEL_ARG_VERSION)
    DP("Unexpected ABI version: %u\n", KernelArgs->Version);

  uint32_t UpgradedVersion = KernelArgs->Version;
  if (KernelArgs->Version < OMP_KERNEL_ARG_VERSION) {
    // The upgraded version will be based on the kernel launch environment.
    if (KernelArgs->Version < OMP_KERNEL_ARG_MIN_VERSION_WITH_DYN_PTR)
      UpgradedVersion = OMP_KERNEL_ARG_MIN_VERSION_WITH_DYN_PTR - 1;
    else
      UpgradedVersion = OMP_KERNEL_ARG_VERSION;
  }
  if (UpgradedVersion != KernelArgs->Version) {
    LocalKernelArgs.Version = UpgradedVersion;
    LocalKernelArgs.NumArgs = KernelArgs->NumArgs;
    LocalKernelArgs.ArgBasePtrs = KernelArgs->ArgBasePtrs;
    LocalKernelArgs.ArgPtrs = KernelArgs->ArgPtrs;
    LocalKernelArgs.ArgSizes = KernelArgs->ArgSizes;
    LocalKernelArgs.ArgTypes = KernelArgs->ArgTypes;
    LocalKernelArgs.ArgNames = KernelArgs->ArgNames;
    LocalKernelArgs.ArgMappers = KernelArgs->ArgMappers;
    LocalKernelArgs.Tripcount = KernelArgs->Tripcount;
    LocalKernelArgs.Flags = KernelArgs->Flags;
    LocalKernelArgs.DynCGroupMem = 0;
    LocalKernelArgs.NumTeams[0] = NumTeams;
    LocalKernelArgs.NumTeams[1] = 1;
    LocalKernelArgs.NumTeams[2] = 1;
    LocalKernelArgs.ThreadLimit[0] = ThreadLimit;
    LocalKernelArgs.ThreadLimit[1] = 1;
    LocalKernelArgs.ThreadLimit[2] = 1;
    return &LocalKernelArgs;
  }

  // FIXME: This is a workaround to "calibrate" incorrect values emitted by the
  // front end. Delete this code once the front end emits proper values.
  auto CorrectMultiDim = [](uint32_t(&Val)[3]) {
    if (Val[1] == 0)
      Val[1] = 1;
    if (Val[2] == 0)
      Val[2] = 1;
  };
  CorrectMultiDim(KernelArgs->ThreadLimit);
  CorrectMultiDim(KernelArgs->NumTeams);

  return KernelArgs;
}

template <typename TargetAsyncInfoTy>
static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
                               int32_t ThreadLimit, void *HostPtr,
                               KernelArgsTy *KernelArgs) {
  assert(PM && "Runtime not initialized");
  static_assert(std::is_convertible_v<TargetAsyncInfoTy, AsyncInfoTy>,
                "TargetAsyncInfoTy must be convertible to AsyncInfoTy.");
  DP("Entering target region for device %" PRId64 " with entry point " DPxMOD
     "\n",
     DeviceId, DPxPTR(HostPtr));

  if (checkDevice(DeviceId, Loc)) {
    DP("Not offloading to device %" PRId64 "\n", DeviceId);
    return OMP_TGT_FAIL;
  }

  bool IsTeams = NumTeams != -1;
  if (!IsTeams)
    KernelArgs->NumTeams[0] = NumTeams = 1;

  // Auto-upgrade kernel args emitted against an older ABI version.
  KernelArgsTy LocalKernelArgs;
  KernelArgs =
      upgradeKernelArgs(KernelArgs, LocalKernelArgs, NumTeams, ThreadLimit);

  TIMESCOPE_WITH_DETAILS_AND_IDENT(
      "Runtime: target exe",
      "NumTeams=" + std::to_string(NumTeams) +
          ";NumArgs=" + std::to_string(KernelArgs->NumArgs),
      Loc);

  if (getInfoLevel() & OMP_INFOTYPE_KERNEL_ARGS)
    printKernelArguments(Loc, DeviceId, KernelArgs->NumArgs,
                         KernelArgs->ArgSizes, KernelArgs->ArgTypes,
                         KernelArgs->ArgNames, "Entering OpenMP kernel");
#ifdef OMPTARGET_DEBUG
  for (uint32_t I = 0; I < KernelArgs->NumArgs; ++I) {
    DP("Entry %2d: Base=" DPxMOD ", Begin=" DPxMOD ", Size=%" PRId64
       ", Type=0x%" PRIx64 ", Name=%s\n",
       I, DPxPTR(KernelArgs->ArgBasePtrs[I]), DPxPTR(KernelArgs->ArgPtrs[I]),
       KernelArgs->ArgSizes[I], KernelArgs->ArgTypes[I],
       (KernelArgs->ArgNames)
           ? getNameFromMapping(KernelArgs->ArgNames[I]).c_str()
           : "unknown");
  }
#endif

  auto DeviceOrErr = PM->getDevice(DeviceId);
  if (!DeviceOrErr)
    FATAL_MESSAGE(DeviceId, "%s", toString(DeviceOrErr.takeError()).c_str());

  TargetAsyncInfoTy TargetAsyncInfo(*DeviceOrErr);
  AsyncInfoTy &AsyncInfo = TargetAsyncInfo;
  /// RAII to establish tool anchors before and after target region
  OMPT_IF_BUILT(InterfaceRAII TargetRAII(
                    RegionInterface.getCallbacks<ompt_target>(), DeviceId,
                    /*CodePtr=*/OMPT_GET_RETURN_ADDRESS);)

  int Rc = OFFLOAD_SUCCESS;
  Rc = target(Loc, *DeviceOrErr, HostPtr, *KernelArgs, AsyncInfo);
  { // required to show synchronization
    TIMESCOPE_WITH_DETAILS_AND_IDENT("Runtime: synchronize", "", Loc);
    if (Rc == OFFLOAD_SUCCESS)
      Rc = AsyncInfo.synchronize();

    handleTargetOutcome(Rc == OFFLOAD_SUCCESS, Loc);
    assert(Rc == OFFLOAD_SUCCESS && "__tgt_target_kernel unexpected failure!");
  }
  return OMP_TGT_SUCCESS;
}

/// Implements a kernel entry that executes the target region on the specified
/// device.
///
/// \param Loc Source location associated with this target region.
/// \param DeviceId The device on which to execute this region, -1 indicates
///                 the default.
/// \param NumTeams Number of teams to launch the region with, -1 indicates a
///                 non-teams region and 0 indicates it was unspecified.
/// \param ThreadLimit Limit to the number of threads to use in the kernel
///                    launch, 0 indicates it was unspecified.
/// \param HostPtr  The pointer to the host function registered with the kernel.
/// \param KernelArgs All arguments to this kernel launch (see struct
///                   definition).
EXTERN int __tgt_target_kernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
                               int32_t ThreadLimit, void *HostPtr,
                               KernelArgsTy *KernelArgs) {
  OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
  if (KernelArgs->Flags.NoWait)
    return targetKernel<TaskAsyncInfoWrapperTy>(
        Loc, DeviceId, NumTeams, ThreadLimit, HostPtr, KernelArgs);
  return targetKernel<AsyncInfoTy>(Loc, DeviceId, NumTeams, ThreadLimit,
                                   HostPtr, KernelArgs);
}
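
// Illustrative sketch (not part of this file): compiler-generated code for a
// construct like `#pragma omp target teams` conceptually fills in a
// KernelArgsTy and calls the entry point above, falling back to the host
// version on failure. The values and names below (OutlinedFnID, OutlinedFn)
// are placeholders, not what Clang actually emits:
//
//   KernelArgsTy KernelArgs{};
//   KernelArgs.NumArgs = 0;
//   KernelArgs.NumTeams[0] = 0;    // 0: number of teams unspecified
//   KernelArgs.ThreadLimit[0] = 0; // 0: thread limit unspecified
//   int Rc = __tgt_target_kernel(Loc, /*DeviceId=*/OFFLOAD_DEVICE_DEFAULT,
//                                /*NumTeams=*/0, /*ThreadLimit=*/0,
//                                /*HostPtr=*/(void *)&OutlinedFnID,
//                                &KernelArgs);
//   if (Rc != OMP_TGT_SUCCESS)
//     OutlinedFn(/*...captured arguments...*/); // host fallback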

/// Activates the record replay mechanism.
/// \param DeviceId The device identifier to execute the target region.
/// \param MemorySize The number of bytes to be (pre-)allocated
///                   by the bump allocator.
/// \param IsRecord Activates the record replay mechanism in
///                 'record' mode or 'replay' mode.
/// \param SaveOutput Store the device memory after kernel
///                   execution on persistent storage.
EXTERN int __tgt_activate_record_replay(int64_t DeviceId, uint64_t MemorySize,
                                        void *VAddr, bool IsRecord,
                                        bool SaveOutput,
                                        uint64_t &ReqPtrArgOffset) {
  assert(PM && "Runtime not initialized");
  OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
  auto DeviceOrErr = PM->getDevice(DeviceId);
  if (!DeviceOrErr)
    FATAL_MESSAGE(DeviceId, "%s", toString(DeviceOrErr.takeError()).c_str());

  [[maybe_unused]] int Rc = target_activate_rr(
      *DeviceOrErr, MemorySize, VAddr, IsRecord, SaveOutput, ReqPtrArgOffset);
  assert(Rc == OFFLOAD_SUCCESS &&
         "__tgt_activate_record_replay unexpected failure!");
  return OMP_TGT_SUCCESS;
}

/// Implements a target kernel entry that replays a pre-recorded kernel.
/// \param Loc Source location associated with this target region (unused).
/// \param DeviceId The device identifier to execute the target region.
/// \param HostPtr A pointer to an address that uniquely identifies the kernel.
/// \param DeviceMemory A pointer to an array storing device memory data to move
///                     prior to kernel execution.
/// \param DeviceMemorySize The size of the above device memory data in bytes.
/// \param TgtArgs An array of pointers to the pre-recorded target kernel
///                arguments.
/// \param TgtOffsets An array of the pre-recorded target kernel argument
///                   offsets.
/// \param NumArgs The number of kernel arguments.
/// \param NumTeams Number of teams to launch the target region with.
/// \param ThreadLimit Limit to the number of threads to use in kernel
///                    execution.
/// \param LoopTripCount The pre-recorded value of the loop tripcount, if any.
/// \return OMP_TGT_SUCCESS on success, OMP_TGT_FAIL on failure.
EXTERN int __tgt_target_kernel_replay(ident_t *Loc, int64_t DeviceId,
                                      void *HostPtr, void *DeviceMemory,
                                      int64_t DeviceMemorySize, void **TgtArgs,
                                      ptrdiff_t *TgtOffsets, int32_t NumArgs,
                                      int32_t NumTeams, int32_t ThreadLimit,
                                      uint64_t LoopTripCount) {
  assert(PM && "Runtime not initialized");
  OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
  if (checkDevice(DeviceId, Loc)) {
    DP("Not offloading to device %" PRId64 "\n", DeviceId);
    return OMP_TGT_FAIL;
  }
  auto DeviceOrErr = PM->getDevice(DeviceId);
  if (!DeviceOrErr)
    FATAL_MESSAGE(DeviceId, "%s", toString(DeviceOrErr.takeError()).c_str());

  /// RAII to establish tool anchors before and after target region
  OMPT_IF_BUILT(InterfaceRAII TargetRAII(
                    RegionInterface.getCallbacks<ompt_target>(), DeviceId,
                    /*CodePtr=*/OMPT_GET_RETURN_ADDRESS);)

  AsyncInfoTy AsyncInfo(*DeviceOrErr);
  int Rc = target_replay(Loc, *DeviceOrErr, HostPtr, DeviceMemory,
                         DeviceMemorySize, TgtArgs, TgtOffsets, NumArgs,
                         NumTeams, ThreadLimit, LoopTripCount, AsyncInfo);
  if (Rc == OFFLOAD_SUCCESS)
    Rc = AsyncInfo.synchronize();
  handleTargetOutcome(Rc == OFFLOAD_SUCCESS, Loc);
  assert(Rc == OFFLOAD_SUCCESS &&
         "__tgt_target_kernel_replay unexpected failure!");
  return OMP_TGT_SUCCESS;
}

// Get the current number of components for a user-defined mapper.
EXTERN int64_t __tgt_mapper_num_components(void *RtMapperHandle) {
  auto *MapperComponentsPtr = (struct MapperComponentsTy *)RtMapperHandle;
  int64_t Size = MapperComponentsPtr->Components.size();
  DP("__tgt_mapper_num_components(Handle=" DPxMOD ") returns %" PRId64 "\n",
     DPxPTR(RtMapperHandle), Size);
  return Size;
}

// Push back one component for a user-defined mapper.
EXTERN void __tgt_push_mapper_component(void *RtMapperHandle, void *Base,
                                        void *Begin, int64_t Size, int64_t Type,
                                        void *Name) {
  DP("__tgt_push_mapper_component(Handle=" DPxMOD
     ") adds an entry (Base=" DPxMOD ", Begin=" DPxMOD ", Size=%" PRId64
     ", Type=0x%" PRIx64 ", Name=%s).\n",
     DPxPTR(RtMapperHandle), DPxPTR(Base), DPxPTR(Begin), Size, Type,
     (Name) ? getNameFromMapping(Name).c_str() : "unknown");
  auto *MapperComponentsPtr = (struct MapperComponentsTy *)RtMapperHandle;
  MapperComponentsPtr->Components.push_back(
      MapComponentInfoTy(Base, Begin, Size, Type, Name));
}
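
// Illustrative sketch (not part of this file): a user-defined mapper function
// generated by Clang for `declare mapper` receives an opaque RtMapperHandle
// and reports its pieces through the two entry points above. The function
// name and component layout below are hypothetical; only the calling
// convention matches the definitions in this file:
//
//   void omp_mapper_my_type(void *RtMapperHandle, void *Base, void *Begin,
//                           int64_t Size, int64_t Type, void *Name) {
//     // Query how many components are already registered (e.g., as input to
//     // MEMBER_OF-style encodings) ...
//     int64_t Existing = __tgt_mapper_num_components(RtMapperHandle);
//     (void)Existing;
//     // ... then push one component per mapped piece of the type.
//     __tgt_push_mapper_component(RtMapperHandle, Base, Begin, Size, Type,
//                                 Name);
//   }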

EXTERN void __tgt_set_info_flag(uint32_t NewInfoLevel) {
  assert(PM && "Runtime not initialized");
  std::atomic<uint32_t> &InfoLevel = getInfoLevelInternal();
  InfoLevel.store(NewInfoLevel);
}

EXTERN int __tgt_print_device_info(int64_t DeviceId) {
  assert(PM && "Runtime not initialized");
  auto DeviceOrErr = PM->getDevice(DeviceId);
  if (!DeviceOrErr)
    FATAL_MESSAGE(DeviceId, "%s", toString(DeviceOrErr.takeError()).c_str());

  return DeviceOrErr->printDeviceInfo();
}

EXTERN void __tgt_target_nowait_query(void **AsyncHandle) {
  assert(PM && "Runtime not initialized");
  OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));

  if (!AsyncHandle || !*AsyncHandle) {
    FATAL_MESSAGE0(
        1, "Received an invalid async handle from the current OpenMP task. Is "
           "this a target nowait region?\n");
  }

  // Exponential backoff tries to optimally decide whether a thread should just
  // query for the device operations (work/spin wait on them) or block until
  // they are completed (use the device-side blocking mechanism). This allows
  // the runtime to adapt itself when there are a lot of long-running target
  // regions in-flight.
  static thread_local utils::ExponentialBackoff QueryCounter(
      Int64Envar("OMPTARGET_QUERY_COUNT_MAX", 10),
      Int64Envar("OMPTARGET_QUERY_COUNT_THRESHOLD", 5),
      Envar<float>("OMPTARGET_QUERY_COUNT_BACKOFF_FACTOR", 0.5f));
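  // With the default knobs above this means, roughly: a thread that keeps
  // finding its nowait regions unfinished switches from spinning queries to a
  // blocking synchronization once its per-thread counter climbs past
  // OMPTARGET_QUERY_COUNT_THRESHOLD, and each completed region scales the
  // counter back down by OMPTARGET_QUERY_COUNT_BACKOFF_FACTOR. The exact
  // counter arithmetic lives in utils::ExponentialBackoff.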

  auto *AsyncInfo = (AsyncInfoTy *)*AsyncHandle;

  // If the thread is actively waiting on too many target nowait regions, we
  // should use the blocking sync type.
  if (QueryCounter.isAboveThreshold())
    AsyncInfo->SyncType = AsyncInfoTy::SyncTy::BLOCKING;

  if (AsyncInfo->synchronize())
    FATAL_MESSAGE0(1, "Error while querying the async queue for completion.\n");
  // If there are device operations still pending, return immediately without
  // deallocating the handle and increase the current thread query count.
  if (!AsyncInfo->isDone()) {
    QueryCounter.increment();
    return;
  }

  // When a thread successfully completes a target nowait region, we
  // exponentially back off its query counter by the backoff factor.
  QueryCounter.decrement();

  // Delete the handle and unset it from the OpenMP task data.
  delete AsyncInfo;
  *AsyncHandle = nullptr;
}
558