xref: /llvm-project/offload/src/omptarget.cpp (revision 6518b121f037717fd211c36659f7b25266424719)
1 //===------ omptarget.cpp - Target independent OpenMP target RTL -- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // Implementation of the interface to be used by Clang during the codegen of a
10 // target region.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "omptarget.h"
15 #include "OffloadPolicy.h"
16 #include "OpenMP/OMPT/Callback.h"
17 #include "OpenMP/OMPT/Interface.h"
18 #include "PluginManager.h"
19 #include "Shared/Debug.h"
20 #include "Shared/EnvironmentVar.h"
21 #include "Shared/Utils.h"
22 #include "device.h"
23 #include "private.h"
24 #include "rtl.h"
25 
26 #include "Shared/Profile.h"
27 
28 #include "OpenMP/Mapping.h"
29 #include "OpenMP/omp.h"
30 
31 #include "llvm/ADT/StringExtras.h"
32 #include "llvm/ADT/bit.h"
33 #include "llvm/Frontend/OpenMP/OMPConstants.h"
34 #include "llvm/Object/ObjectFile.h"
35 
36 #include <cassert>
37 #include <cstdint>
38 #include <vector>
39 
40 using llvm::SmallVector;
41 #ifdef OMPT_SUPPORT
42 using namespace llvm::omp::target::ompt;
43 #endif
44 
45 int AsyncInfoTy::synchronize() {
46   int Result = OFFLOAD_SUCCESS;
47   if (!isQueueEmpty()) {
48     switch (SyncType) {
49     case SyncTy::BLOCKING:
50       // If we have a queue we need to synchronize it now.
51       Result = Device.synchronize(*this);
52       assert(AsyncInfo.Queue == nullptr &&
53              "The device plugin should have nulled the queue to indicate there "
54              "are no outstanding actions!");
55       break;
56     case SyncTy::NON_BLOCKING:
57       Result = Device.queryAsync(*this);
58       break;
59     }
60   }
61 
62   // Run any pending post-processing function registered on this async object.
63   if (Result == OFFLOAD_SUCCESS && isQueueEmpty())
64     Result = runPostProcessing();
65 
66   return Result;
67 }
68 
69 void *&AsyncInfoTy::getVoidPtrLocation() {
70   BufferLocations.push_back(nullptr);
71   return BufferLocations.back();
72 }
73 
74 bool AsyncInfoTy::isDone() const { return isQueueEmpty(); }
75 
76 int32_t AsyncInfoTy::runPostProcessing() {
77   size_t Size = PostProcessingFunctions.size();
78   for (size_t I = 0; I < Size; ++I) {
79     const int Result = PostProcessingFunctions[I]();
80     if (Result != OFFLOAD_SUCCESS)
81       return Result;
82   }
83 
84   // Clear the vector up until the last known function, since post-processing
85   // procedures might add new procedures themselves.
86   const auto *PrevBegin = PostProcessingFunctions.begin();
87   PostProcessingFunctions.erase(PrevBegin, PrevBegin + Size);
88 
89   return OFFLOAD_SUCCESS;
90 }
91 
92 bool AsyncInfoTy::isQueueEmpty() const { return AsyncInfo.Queue == nullptr; }
93 
94 /* All begin addresses for partially mapped structs must be aligned, up to 16,
95  * in order to ensure proper alignment of members. E.g.
96  *
97  * struct S {
98  *   int a;   // 4-aligned
99  *   int b;   // 4-aligned
100  *   int *p;  // 8-aligned
101  * } s1;
102  * ...
103  * #pragma omp target map(tofrom: s1.b, s1.p[0:N])
104  * {
105  *   s1.b = 5;
106  *   for (int i...) s1.p[i] = ...;
107  * }
108  *
109  * Here we are mapping s1 starting from member b, so BaseAddress=&s1=&s1.a and
110  * BeginAddress=&s1.b. Let's assume that the struct begins at address 0x100,
111  * then &s1.a=0x100, &s1.b=0x104, &s1.p=0x108. Each member obeys the alignment
112  * requirements for its type. Now, when we allocate memory on the device, in
113  * CUDA's case cuMemAlloc() returns an address which is at least 256-aligned.
114  * This means that the chunk of the struct on the device will start at a
115  * 256-aligned address, let's say 0x200. Then the address of b will be 0x200 and
116  * address of p will be a misaligned 0x204 (on the host there was no need to add
117  * padding between b and p, so p comes exactly 4 bytes after b). If the device
118  * kernel tries to access s1.p, a misaligned address error occurs (as reported
119  * by the CUDA plugin). By padding the begin address down to a multiple of 8 and
120  * extending the size of the allocated chunk accordingly, the chunk on the
121  * device will start at 0x200 with the padding (4 bytes), then &s1.b=0x204 and
122  * &s1.p=0x208, as they should be to satisfy the alignment requirements.
123  */
static const int64_t MaxAlignment = 16;

/// Return the alignment requirement of partially mapped structs, see
/// MaxAlignment above. The alignment we can rely on for members is bounded by
/// the natural alignment of the base address (its lowest set bit), capped at
/// MaxAlignment.
static uint64_t getPartialStructRequiredAlignment(void *HstPtrBase) {
  uintptr_t BaseAddr = reinterpret_cast<uintptr_t>(HstPtrBase);
  // A null base has no set bit: __builtin_ffsl would return 0 and the shift
  // below would be undefined. Null is trivially aligned to any boundary, so
  // report the maximum alignment we ever require.
  if (BaseAddr == 0)
    return MaxAlignment;
  int LowestOneBit = __builtin_ffsl(BaseAddr);
  // Shift a 64-bit value: with a plain 'int' literal the shift overflows
  // (undefined behavior) whenever the lowest set bit is at position 31 or
  // above, e.g. for an address that is a multiple of 2^31.
  uint64_t BaseAlignment = uint64_t(1) << (LowestOneBit - 1);
  return MaxAlignment < BaseAlignment ? MaxAlignment : BaseAlignment;
}
133 
/// React to the outcome (\p Success) of an offload attempt according to the
/// active offload policy. Under DISABLED, a *successful* offload indicates an
/// internal error and aborts. Under MANDATORY, a failure prints diagnostics
/// -- mapping tables (if requested via the info level), the architectures of
/// the embedded device images, and the source location \p Loc -- then aborts.
void handleTargetOutcome(bool Success, ident_t *Loc) {
  switch (OffloadPolicy::get(*PM).Kind) {
  case OffloadPolicy::DISABLED:
    if (Success) {
      FATAL_MESSAGE0(1, "expected no offloading while offloading is disabled");
    }
    break;
  case OffloadPolicy::MANDATORY:
    if (!Success) {
      // Either dump the per-device mapping tables (when the user asked for
      // them) or point at the debugging documentation.
      if (getInfoLevel() & OMP_INFOTYPE_DUMP_TABLE) {
        auto ExclusiveDevicesAccessor = PM->getExclusiveDevicesAccessor();
        for (auto &Device : PM->devices(ExclusiveDevicesAccessor))
          dumpTargetPointerMappings(Loc, Device);
      } else
        FAILURE_MESSAGE("Consult https://openmp.llvm.org/design/Runtimes.html "
                        "for debugging options.\n");

      // No plugin accepted any image: list the CPU/arch names parsed out of
      // the images embedded in the binary to help diagnose the mismatch.
      if (!PM->getNumActivePlugins()) {
        FAILURE_MESSAGE(
            "No images found compatible with the installed hardware. ");

        llvm::SmallVector<llvm::StringRef> Archs;
        for (auto &Image : PM->deviceImages()) {
          const char *Start = reinterpret_cast<const char *>(
              Image.getExecutableImage().ImageStart);
          uint64_t Length =
              utils::getPtrDiff(Start, Image.getExecutableImage().ImageEnd);
          llvm::MemoryBufferRef Buffer(llvm::StringRef(Start, Length),
                                       /*Identifier=*/"");

          // Images that do not parse as object files are skipped silently.
          auto ObjectOrErr = llvm::object::ObjectFile::createObjectFile(Buffer);
          if (auto Err = ObjectOrErr.takeError()) {
            llvm::consumeError(std::move(Err));
            continue;
          }

          if (auto CPU = (*ObjectOrErr)->tryGetCPUName())
            Archs.push_back(*CPU);
        }
        fprintf(stderr, "Found %zu image(s): (%s)\n", Archs.size(),
                llvm::join(Archs, ",").c_str());
      }

      // Prefer a precise "file:line:col:" prefix for the fatal message; fall
      // back to a hint about compiling with debug line info.
      SourceInfo Info(Loc);
      if (Info.isAvailible())
        fprintf(stderr, "%s:%d:%d: ", Info.getFilename(), Info.getLine(),
                Info.getColumn());
      else
        FAILURE_MESSAGE("Source location information not present. Compile with "
                        "-g or -gline-tables-only.\n");
      FATAL_MESSAGE0(
          1, "failure of target construct while offloading is mandatory");
    } else {
      // Success under MANDATORY: still honor a requested table dump.
      if (getInfoLevel() & OMP_INFOTYPE_DUMP_TABLE) {
        auto ExclusiveDevicesAccessor = PM->getExclusiveDevicesAccessor();
        for (auto &Device : PM->devices(ExclusiveDevicesAccessor))
          dumpTargetPointerMappings(Loc, Device);
      }
    }
    break;
  }
}
196 
197 static int32_t getParentIndex(int64_t Type) {
198   return ((Type & OMP_TGT_MAPTYPE_MEMBER_OF) >> 48) - 1;
199 }
200 
201 void *targetAllocExplicit(size_t Size, int DeviceNum, int Kind,
202                           const char *Name) {
203   DP("Call to %s for device %d requesting %zu bytes\n", Name, DeviceNum, Size);
204 
205   if (Size <= 0) {
206     DP("Call to %s with non-positive length\n", Name);
207     return NULL;
208   }
209 
210   void *Rc = NULL;
211 
212   if (DeviceNum == omp_get_initial_device()) {
213     Rc = malloc(Size);
214     DP("%s returns host ptr " DPxMOD "\n", Name, DPxPTR(Rc));
215     return Rc;
216   }
217 
218   auto DeviceOrErr = PM->getDevice(DeviceNum);
219   if (!DeviceOrErr)
220     FATAL_MESSAGE(DeviceNum, "%s", toString(DeviceOrErr.takeError()).c_str());
221 
222   Rc = DeviceOrErr->allocData(Size, nullptr, Kind);
223   DP("%s returns device ptr " DPxMOD "\n", Name, DPxPTR(Rc));
224   return Rc;
225 }
226 
227 void targetFreeExplicit(void *DevicePtr, int DeviceNum, int Kind,
228                         const char *Name) {
229   DP("Call to %s for device %d and address " DPxMOD "\n", Name, DeviceNum,
230      DPxPTR(DevicePtr));
231 
232   if (!DevicePtr) {
233     DP("Call to %s with NULL ptr\n", Name);
234     return;
235   }
236 
237   if (DeviceNum == omp_get_initial_device()) {
238     free(DevicePtr);
239     DP("%s deallocated host ptr\n", Name);
240     return;
241   }
242 
243   auto DeviceOrErr = PM->getDevice(DeviceNum);
244   if (!DeviceOrErr)
245     FATAL_MESSAGE(DeviceNum, "%s", toString(DeviceOrErr.takeError()).c_str());
246 
247   if (DeviceOrErr->deleteData(DevicePtr, Kind) == OFFLOAD_FAIL)
248     FATAL_MESSAGE(DeviceNum, "%s",
249                   "Failed to deallocate device ptr. Set "
250                   "OFFLOAD_TRACK_ALLOCATION_TRACES=1 to track allocations.");
251 
252   DP("omp_target_free deallocated device ptr\n");
253 }
254 
255 void *targetLockExplicit(void *HostPtr, size_t Size, int DeviceNum,
256                          const char *Name) {
257   DP("Call to %s for device %d locking %zu bytes\n", Name, DeviceNum, Size);
258 
259   if (Size <= 0) {
260     DP("Call to %s with non-positive length\n", Name);
261     return NULL;
262   }
263 
264   void *RC = NULL;
265 
266   auto DeviceOrErr = PM->getDevice(DeviceNum);
267   if (!DeviceOrErr)
268     FATAL_MESSAGE(DeviceNum, "%s", toString(DeviceOrErr.takeError()).c_str());
269 
270   int32_t Err = 0;
271   Err = DeviceOrErr->RTL->data_lock(DeviceNum, HostPtr, Size, &RC);
272   if (Err) {
273     DP("Could not lock ptr %p\n", HostPtr);
274     return nullptr;
275   }
276   DP("%s returns device ptr " DPxMOD "\n", Name, DPxPTR(RC));
277   return RC;
278 }
279 
/// Unpin/unlock host memory previously locked via targetLockExplicit, using
/// the plugin's data_unlock hook. \p Name is the user-facing entry-point name
/// used in debug messages.
void targetUnlockExplicit(void *HostPtr, int DeviceNum, const char *Name) {
  DP("Call to %s for device %d unlocking\n", Name, DeviceNum);

  // An invalid device number is fatal; there is nothing to clean up.
  auto DeviceOrErr = PM->getDevice(DeviceNum);
  if (!DeviceOrErr)
    FATAL_MESSAGE(DeviceNum, "%s", toString(DeviceOrErr.takeError()).c_str());

  DeviceOrErr->RTL->data_unlock(DeviceNum, HostPtr);
  DP("%s returns\n", Name);
}
290 
291 /// Call the user-defined mapper function followed by the appropriate
292 /// targetData* function (targetData{Begin,End,Update}).
293 int targetDataMapper(ident_t *Loc, DeviceTy &Device, void *ArgBase, void *Arg,
294                      int64_t ArgSize, int64_t ArgType, map_var_info_t ArgNames,
295                      void *ArgMapper, AsyncInfoTy &AsyncInfo,
296                      TargetDataFuncPtrTy TargetDataFunction) {
297   DP("Calling the mapper function " DPxMOD "\n", DPxPTR(ArgMapper));
298 
299   // The mapper function fills up Components.
300   MapperComponentsTy MapperComponents;
301   MapperFuncPtrTy MapperFuncPtr = (MapperFuncPtrTy)(ArgMapper);
302   (*MapperFuncPtr)((void *)&MapperComponents, ArgBase, Arg, ArgSize, ArgType,
303                    ArgNames);
304 
305   // Construct new arrays for args_base, args, arg_sizes and arg_types
306   // using the information in MapperComponents and call the corresponding
307   // targetData* function using these new arrays.
308   SmallVector<void *> MapperArgsBase(MapperComponents.Components.size());
309   SmallVector<void *> MapperArgs(MapperComponents.Components.size());
310   SmallVector<int64_t> MapperArgSizes(MapperComponents.Components.size());
311   SmallVector<int64_t> MapperArgTypes(MapperComponents.Components.size());
312   SmallVector<void *> MapperArgNames(MapperComponents.Components.size());
313 
314   for (unsigned I = 0, E = MapperComponents.Components.size(); I < E; ++I) {
315     auto &C = MapperComponents.Components[I];
316     MapperArgsBase[I] = C.Base;
317     MapperArgs[I] = C.Begin;
318     MapperArgSizes[I] = C.Size;
319     MapperArgTypes[I] = C.Type;
320     MapperArgNames[I] = C.Name;
321   }
322 
323   int Rc = TargetDataFunction(Loc, Device, MapperComponents.Components.size(),
324                               MapperArgsBase.data(), MapperArgs.data(),
325                               MapperArgSizes.data(), MapperArgTypes.data(),
326                               MapperArgNames.data(), /*arg_mappers*/ nullptr,
327                               AsyncInfo, /*FromMapper=*/true);
328 
329   return Rc;
330 }
331 
/// Internal function to do the mapping and transfer the data to the device.
/// Processes \p ArgNum map entries described by the parallel arrays
/// \p ArgsBase / \p Args / \p ArgSizes / \p ArgTypes (plus optional
/// \p ArgNames and per-argument custom mappers in \p ArgMappers), creating or
/// reusing device allocations and enqueueing host-to-device copies on
/// \p AsyncInfo. \p FromMapper must be true when re-entered through
/// targetDataMapper. Returns OFFLOAD_SUCCESS or OFFLOAD_FAIL.
int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
                    void **ArgsBase, void **Args, int64_t *ArgSizes,
                    int64_t *ArgTypes, map_var_info_t *ArgNames,
                    void **ArgMappers, AsyncInfoTy &AsyncInfo,
                    bool FromMapper) {
  // process each input.
  for (int32_t I = 0; I < ArgNum; ++I) {
    // Ignore private variables and arrays - there is no mapping for them.
    if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) ||
        (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE))
      continue;
    TIMESCOPE_WITH_DETAILS_AND_IDENT(
        "HostToDev", "Size=" + std::to_string(ArgSizes[I]) + "B", Loc);
    if (ArgMappers && ArgMappers[I]) {
      // Instead of executing the regular path of targetDataBegin, call the
      // targetDataMapper variant which will call targetDataBegin again
      // with new arguments.
      DP("Calling targetDataMapper for the %dth argument\n", I);

      map_var_info_t ArgName = (!ArgNames) ? nullptr : ArgNames[I];
      int Rc = targetDataMapper(Loc, Device, ArgsBase[I], Args[I], ArgSizes[I],
                                ArgTypes[I], ArgName, ArgMappers[I], AsyncInfo,
                                targetDataBegin);

      if (Rc != OFFLOAD_SUCCESS) {
        REPORT("Call to targetDataBegin via targetDataMapper for custom mapper"
               " failed.\n");
        return OFFLOAD_FAIL;
      }

      // Skip the rest of this function, continue to the next argument.
      continue;
    }

    void *HstPtrBegin = Args[I];
    void *HstPtrBase = ArgsBase[I];
    int64_t DataSize = ArgSizes[I];
    map_var_info_t HstPtrName = (!ArgNames) ? nullptr : ArgNames[I];

    // Adjust for proper alignment if this is a combined entry (for structs).
    // Look at the next argument - if that is MEMBER_OF this one, then this one
    // is a combined entry.
    int64_t TgtPadding = 0;
    const int NextI = I + 1;
    if (getParentIndex(ArgTypes[I]) < 0 && NextI < ArgNum &&
        getParentIndex(ArgTypes[NextI]) == I) {
      // See the MaxAlignment comment above for why padding is needed.
      int64_t Alignment = getPartialStructRequiredAlignment(HstPtrBase);
      TgtPadding = (int64_t)HstPtrBegin % Alignment;
      if (TgtPadding) {
        DP("Using a padding of %" PRId64 " bytes for begin address " DPxMOD
           "\n",
           TgtPadding, DPxPTR(HstPtrBegin));
      }
    }

    // Address of pointer on the host and device, respectively.
    void *PointerHstPtrBegin, *PointerTgtPtrBegin;
    TargetPointerResultTy PointerTpr;
    bool IsHostPtr = false;
    bool IsImplicit = ArgTypes[I] & OMP_TGT_MAPTYPE_IMPLICIT;
    // Force the creation of a device side copy of the data when:
    // a close map modifier was associated with a map that contained a to.
    bool HasCloseModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_CLOSE;
    bool HasPresentModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_PRESENT;
    bool HasHoldModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_OMPX_HOLD;
    // UpdateRef is based on MEMBER_OF instead of TARGET_PARAM because if we
    // have reached this point via __tgt_target_data_begin and not __tgt_target
    // then no argument is marked as TARGET_PARAM ("omp target data map" is not
    // associated with a target region, so there are no target parameters). This
    // may be considered a hack, we could revise the scheme in the future.
    bool UpdateRef =
        !(ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) && !(FromMapper && I == 0);

    // Take exclusive access to the host-to-target map for this iteration;
    // note it is held across both getTargetPointer calls below (the first one
    // is told not to release it via /*ReleaseHDTTMap=*/false).
    MappingInfoTy::HDTTMapAccessorTy HDTTMap =
        Device.getMappingInfo().HostDataToTargetMap.getExclusiveAccessor();
    if (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) {
      DP("Has a pointer entry: \n");
      // Base is address of pointer.
      //
      // Usually, the pointer is already allocated by this time.  For example:
      //
      //   #pragma omp target map(s.p[0:N])
      //
      // The map entry for s comes first, and the PTR_AND_OBJ entry comes
      // afterward, so the pointer is already allocated by the time the
      // PTR_AND_OBJ entry is handled below, and PointerTgtPtrBegin is thus
      // non-null.  However, "declare target link" can produce a PTR_AND_OBJ
      // entry for a global that might not already be allocated by the time the
      // PTR_AND_OBJ entry is handled below, and so the allocation might fail
      // when HasPresentModifier.
      PointerTpr = Device.getMappingInfo().getTargetPointer(
          HDTTMap, HstPtrBase, HstPtrBase, /*TgtPadding=*/0, sizeof(void *),
          /*HstPtrName=*/nullptr,
          /*HasFlagTo=*/false, /*HasFlagAlways=*/false, IsImplicit, UpdateRef,
          HasCloseModifier, HasPresentModifier, HasHoldModifier, AsyncInfo,
          /*OwnedTPR=*/nullptr, /*ReleaseHDTTMap=*/false);
      PointerTgtPtrBegin = PointerTpr.TargetPointer;
      IsHostPtr = PointerTpr.Flags.IsHostPointer;
      if (!PointerTgtPtrBegin) {
        REPORT("Call to getTargetPointer returned null pointer (%s).\n",
               HasPresentModifier ? "'present' map type modifier"
                                  : "device failure or illegal mapping");
        return OFFLOAD_FAIL;
      }
      DP("There are %zu bytes allocated at target address " DPxMOD " - is%s new"
         "\n",
         sizeof(void *), DPxPTR(PointerTgtPtrBegin),
         (PointerTpr.Flags.IsNewEntry ? "" : " not"));
      PointerHstPtrBegin = HstPtrBase;
      // modify current entry.
      HstPtrBase = *(void **)HstPtrBase;
      // No need to update pointee ref count for the first element of the
      // subelement that comes from mapper.
      UpdateRef =
          (!FromMapper || I != 0); // subsequently update ref count of pointee
    }

    const bool HasFlagTo = ArgTypes[I] & OMP_TGT_MAPTYPE_TO;
    const bool HasFlagAlways = ArgTypes[I] & OMP_TGT_MAPTYPE_ALWAYS;
    // Note that HDTTMap will be released in getTargetPointer.
    auto TPR = Device.getMappingInfo().getTargetPointer(
        HDTTMap, HstPtrBegin, HstPtrBase, TgtPadding, DataSize, HstPtrName,
        HasFlagTo, HasFlagAlways, IsImplicit, UpdateRef, HasCloseModifier,
        HasPresentModifier, HasHoldModifier, AsyncInfo, PointerTpr.getEntry());
    void *TgtPtrBegin = TPR.TargetPointer;
    IsHostPtr = TPR.Flags.IsHostPointer;
    // If data_size==0, then the argument could be a zero-length pointer to
    // NULL, so getOrAlloc() returning NULL is not an error.
    if (!TgtPtrBegin && (DataSize || HasPresentModifier)) {
      REPORT("Call to getTargetPointer returned null pointer (%s).\n",
             HasPresentModifier ? "'present' map type modifier"
                                : "device failure or illegal mapping");
      return OFFLOAD_FAIL;
    }
    DP("There are %" PRId64 " bytes allocated at target address " DPxMOD
       " - is%s new\n",
       DataSize, DPxPTR(TgtPtrBegin), (TPR.Flags.IsNewEntry ? "" : " not"));

    if (ArgTypes[I] & OMP_TGT_MAPTYPE_RETURN_PARAM) {
      // Report the device base back to the caller, preserving the original
      // begin/base offset.
      uintptr_t Delta = (uintptr_t)HstPtrBegin - (uintptr_t)HstPtrBase;
      void *TgtPtrBase = (void *)((uintptr_t)TgtPtrBegin - Delta);
      DP("Returning device pointer " DPxMOD "\n", DPxPTR(TgtPtrBase));
      ArgsBase[I] = TgtPtrBase;
    }

    if (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ && !IsHostPtr) {
      // Compute the device address the pointer on the device should hold and
      // record it in the shadow map; the device-side pointer is only written
      // when addShadowPointer returns true (presumably a newly added or
      // changed shadow entry -- confirm against its definition).
      uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase;
      void *ExpectedTgtPtrBase = (void *)((uint64_t)TgtPtrBegin - Delta);

      if (PointerTpr.getEntry()->addShadowPointer(ShadowPtrInfoTy{
              (void **)PointerHstPtrBegin, HstPtrBase,
              (void **)PointerTgtPtrBegin, ExpectedTgtPtrBase})) {
        DP("Update pointer (" DPxMOD ") -> [" DPxMOD "]\n",
           DPxPTR(PointerTgtPtrBegin), DPxPTR(TgtPtrBegin));

        // The submitted buffer must outlive the asynchronous copy, so the
        // storage is owned by AsyncInfo rather than this stack frame.
        void *&TgtPtrBase = AsyncInfo.getVoidPtrLocation();
        TgtPtrBase = ExpectedTgtPtrBase;

        int Ret =
            Device.submitData(PointerTgtPtrBegin, &TgtPtrBase, sizeof(void *),
                              AsyncInfo, PointerTpr.getEntry());
        if (Ret != OFFLOAD_SUCCESS) {
          REPORT("Copying data to device failed.\n");
          return OFFLOAD_FAIL;
        }
        if (PointerTpr.getEntry()->addEventIfNecessary(Device, AsyncInfo) !=
            OFFLOAD_SUCCESS)
          return OFFLOAD_FAIL;
      }
    }

    // Check if variable can be used on the device:
    bool IsStructMember = ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF;
    if (getInfoLevel() & OMP_INFOTYPE_EMPTY_MAPPING && ArgTypes[I] != 0 &&
        !IsStructMember && !IsImplicit && !TPR.isPresent() &&
        !TPR.isContained() && !TPR.isHostPointer())
      INFO(OMP_INFOTYPE_EMPTY_MAPPING, Device.DeviceID,
           "variable %s does not have a valid device counterpart\n",
           (HstPtrName) ? getNameFromMapping(HstPtrName).c_str() : "unknown");
  }

  return OFFLOAD_SUCCESS;
}
517 
namespace {
/// This structure contains information to deallocate a target pointer, aka.
/// used to fix up the shadow map and potentially delete the entry from the
/// mapping table via \p DeviceTy::deallocTgtPtr.
struct PostProcessingInfo {
  /// Host pointer used to look up into the map table
  void *HstPtrBegin;

  /// Size of the data
  int64_t DataSize;

  /// The mapping type (bitfield).
  int64_t ArgType;

  /// The target pointer information.
  TargetPointerResultTy TPR;

  /// Takes ownership of \p TPR (moved in, not copied) together with the
  /// host-side lookup key \p HstPtr, the mapped size and the map-type bits.
  PostProcessingInfo(void *HstPtr, int64_t Size, int64_t ArgType,
                     TargetPointerResultTy &&TPR)
      : HstPtrBegin(HstPtr), DataSize(Size), ArgType(ArgType),
        TPR(std::move(TPR)) {}
};

} // namespace
542 
/// Applies the necessary post-processing procedures to entries listed in \p
/// EntriesInfo after the execution of all device side operations from a target
/// data end. This includes the update of pointers at the host and removal of
/// device buffer when needed. It returns OFFLOAD_FAIL or OFFLOAD_SUCCESS
/// according to the successfulness of the operations.
/// \note Takes ownership of \p EntriesInfo: the heap-allocated vector (it is
/// created with `new` in targetDataEnd) is deleted before returning.
[[nodiscard]] static int
postProcessingTargetDataEnd(DeviceTy *Device,
                            SmallVector<PostProcessingInfo> &EntriesInfo) {
  int Ret = OFFLOAD_SUCCESS;

  for (auto &[HstPtrBegin, DataSize, ArgType, TPR] : EntriesInfo) {
    // Host pointers were never device-allocated; nothing to delete for them.
    bool DelEntry = !TPR.isHostPointer();

    // If the last element from the mapper (for end transfer args comes in
    // reverse order), do not remove the partial entry, the parent struct still
    // exists.
    if ((ArgType & OMP_TGT_MAPTYPE_MEMBER_OF) &&
        !(ArgType & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) {
      DelEntry = false; // protect parent struct from being deallocated
    }

    // If we marked the entry to be deleted we need to verify no other
    // thread reused it by now. If deletion is still supposed to happen by
    // this thread LR will be set and exclusive access to the HDTT map
    // will avoid another thread reusing the entry now. Note that we do
    // not request (exclusive) access to the HDTT map if DelEntry is
    // not set.
    MappingInfoTy::HDTTMapAccessorTy HDTTMap =
        Device->getMappingInfo().HostDataToTargetMap.getExclusiveAccessor();

    // We cannot use a lock guard because we may end up delete the mutex.
    // We also explicitly unlocked the entry after it was put in the EntriesInfo
    // so it can be reused.
    TPR.getEntry()->lock();
    auto *Entry = TPR.getEntry();

    // Another thread still inside targetDataEnd for this entry, or a live
    // reference, cancels this thread's responsibility for deletion.
    const bool IsNotLastUser = Entry->decDataEndThreadCount() != 0;
    if (DelEntry && (Entry->getTotalRefCount() != 0 || IsNotLastUser)) {
      // The thread is not in charge of deletion anymore. Give up access
      // to the HDTT map and unset the deletion flag.
      HDTTMap.destroy();
      DelEntry = false;
    }

    // If we copied back to the host a struct/array containing pointers,
    // we need to restore the original host pointer values from their
    // shadow copies. If the struct is going to be deallocated, remove any
    // remaining shadow pointer entries for this struct.
    const bool HasFrom = ArgType & OMP_TGT_MAPTYPE_FROM;
    if (HasFrom) {
      Entry->foreachShadowPointerInfo([&](const ShadowPtrInfoTy &ShadowPtr) {
        *ShadowPtr.HstPtrAddr = ShadowPtr.HstPtrVal;
        DP("Restoring original host pointer value " DPxMOD " for host "
           "pointer " DPxMOD "\n",
           DPxPTR(ShadowPtr.HstPtrVal), DPxPTR(ShadowPtr.HstPtrAddr));
        return OFFLOAD_SUCCESS;
      });
    }

    // Give up the lock as we either don't need it anymore (e.g., done with
    // TPR), or erase TPR.
    TPR.setEntry(nullptr);

    if (!DelEntry)
      continue;

    Ret = Device->getMappingInfo().eraseMapEntry(HDTTMap, Entry, DataSize);
    // Entry is already remove from the map, we can unlock it now.
    HDTTMap.destroy();
    Ret |= Device->getMappingInfo().deallocTgtPtrAndEntry(Entry, DataSize);
    if (Ret != OFFLOAD_SUCCESS) {
      REPORT("Deallocating data from device failed.\n");
      break;
    }
  }

  delete &EntriesInfo;
  return Ret;
}
622 
623 /// Internal function to undo the mapping and retrieve the data from the device.
624 int targetDataEnd(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
625                   void **ArgBases, void **Args, int64_t *ArgSizes,
626                   int64_t *ArgTypes, map_var_info_t *ArgNames,
627                   void **ArgMappers, AsyncInfoTy &AsyncInfo, bool FromMapper) {
628   int Ret = OFFLOAD_SUCCESS;
629   auto *PostProcessingPtrs = new SmallVector<PostProcessingInfo>();
630   // process each input.
631   for (int32_t I = ArgNum - 1; I >= 0; --I) {
632     // Ignore private variables and arrays - there is no mapping for them.
633     // Also, ignore the use_device_ptr directive, it has no effect here.
634     if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) ||
635         (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE))
636       continue;
637 
638     if (ArgMappers && ArgMappers[I]) {
639       // Instead of executing the regular path of targetDataEnd, call the
640       // targetDataMapper variant which will call targetDataEnd again
641       // with new arguments.
642       DP("Calling targetDataMapper for the %dth argument\n", I);
643 
644       map_var_info_t ArgName = (!ArgNames) ? nullptr : ArgNames[I];
645       Ret = targetDataMapper(Loc, Device, ArgBases[I], Args[I], ArgSizes[I],
646                              ArgTypes[I], ArgName, ArgMappers[I], AsyncInfo,
647                              targetDataEnd);
648 
649       if (Ret != OFFLOAD_SUCCESS) {
650         REPORT("Call to targetDataEnd via targetDataMapper for custom mapper"
651                " failed.\n");
652         return OFFLOAD_FAIL;
653       }
654 
655       // Skip the rest of this function, continue to the next argument.
656       continue;
657     }
658 
659     void *HstPtrBegin = Args[I];
660     int64_t DataSize = ArgSizes[I];
661     bool IsImplicit = ArgTypes[I] & OMP_TGT_MAPTYPE_IMPLICIT;
662     bool UpdateRef = (!(ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) ||
663                       (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) &&
664                      !(FromMapper && I == 0);
665     bool ForceDelete = ArgTypes[I] & OMP_TGT_MAPTYPE_DELETE;
666     bool HasPresentModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_PRESENT;
667     bool HasHoldModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_OMPX_HOLD;
668 
669     // If PTR_AND_OBJ, HstPtrBegin is address of pointee
670     TargetPointerResultTy TPR = Device.getMappingInfo().getTgtPtrBegin(
671         HstPtrBegin, DataSize, UpdateRef, HasHoldModifier, !IsImplicit,
672         ForceDelete, /*FromDataEnd=*/true);
673     void *TgtPtrBegin = TPR.TargetPointer;
674     if (!TPR.isPresent() && !TPR.isHostPointer() &&
675         (DataSize || HasPresentModifier)) {
676       DP("Mapping does not exist (%s)\n",
677          (HasPresentModifier ? "'present' map type modifier" : "ignored"));
678       if (HasPresentModifier) {
679         // OpenMP 5.1, sec. 2.21.7.1 "map Clause", p. 350 L10-13:
680         // "If a map clause appears on a target, target data, target enter data
681         // or target exit data construct with a present map-type-modifier then
682         // on entry to the region if the corresponding list item does not appear
683         // in the device data environment then an error occurs and the program
684         // terminates."
685         //
686         // This should be an error upon entering an "omp target exit data".  It
687         // should not be an error upon exiting an "omp target data" or "omp
688         // target".  For "omp target data", Clang thus doesn't include present
689         // modifiers for end calls.  For "omp target", we have not found a valid
690         // OpenMP program for which the error matters: it appears that, if a
691         // program can guarantee that data is present at the beginning of an
692         // "omp target" region so that there's no error there, that data is also
693         // guaranteed to be present at the end.
694         MESSAGE("device mapping required by 'present' map type modifier does "
695                 "not exist for host address " DPxMOD " (%" PRId64 " bytes)",
696                 DPxPTR(HstPtrBegin), DataSize);
697         return OFFLOAD_FAIL;
698       }
699     } else {
700       DP("There are %" PRId64 " bytes allocated at target address " DPxMOD
701          " - is%s last\n",
702          DataSize, DPxPTR(TgtPtrBegin), (TPR.Flags.IsLast ? "" : " not"));
703     }
704 
705     // OpenMP 5.1, sec. 2.21.7.1 "map Clause", p. 351 L14-16:
706     // "If the map clause appears on a target, target data, or target exit data
707     // construct and a corresponding list item of the original list item is not
708     // present in the device data environment on exit from the region then the
709     // list item is ignored."
710     if (!TPR.isPresent())
711       continue;
712 
713     // Move data back to the host
714     const bool HasAlways = ArgTypes[I] & OMP_TGT_MAPTYPE_ALWAYS;
715     const bool HasFrom = ArgTypes[I] & OMP_TGT_MAPTYPE_FROM;
716     if (HasFrom && (HasAlways || TPR.Flags.IsLast) &&
717         !TPR.Flags.IsHostPointer && DataSize != 0) {
718       DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
719          DataSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
720       TIMESCOPE_WITH_DETAILS_AND_IDENT(
721           "DevToHost", "Size=" + std::to_string(DataSize) + "B", Loc);
722       // Wait for any previous transfer if an event is present.
723       if (void *Event = TPR.getEntry()->getEvent()) {
724         if (Device.waitEvent(Event, AsyncInfo) != OFFLOAD_SUCCESS) {
725           REPORT("Failed to wait for event " DPxMOD ".\n", DPxPTR(Event));
726           return OFFLOAD_FAIL;
727         }
728       }
729 
730       Ret = Device.retrieveData(HstPtrBegin, TgtPtrBegin, DataSize, AsyncInfo,
731                                 TPR.getEntry());
732       if (Ret != OFFLOAD_SUCCESS) {
733         REPORT("Copying data from device failed.\n");
734         return OFFLOAD_FAIL;
735       }
736 
737       // As we are expecting to delete the entry the d2h copy might race
738       // with another one that also tries to delete the entry. This happens
739       // as the entry can be reused and the reuse might happen after the
740       // copy-back was issued but before it completed. Since the reuse might
741       // also copy-back a value we would race.
742       if (TPR.Flags.IsLast) {
743         if (TPR.getEntry()->addEventIfNecessary(Device, AsyncInfo) !=
744             OFFLOAD_SUCCESS)
745           return OFFLOAD_FAIL;
746       }
747     }
748 
749     // Add pointer to the buffer for post-synchronize processing.
750     PostProcessingPtrs->emplace_back(HstPtrBegin, DataSize, ArgTypes[I],
751                                      std::move(TPR));
752     PostProcessingPtrs->back().TPR.getEntry()->unlock();
753   }
754 
755   // Add post-processing functions
756   // TODO: We might want to remove `mutable` in the future by not changing the
757   // captured variables somehow.
758   AsyncInfo.addPostProcessingFunction([=, Device = &Device]() mutable -> int {
759     return postProcessingTargetDataEnd(Device, *PostProcessingPtrs);
760   });
761 
762   return Ret;
763 }
764 
/// Perform a single contiguous "target update" motion for the host range
/// [\p HstPtrBegin, \p HstPtrBegin + \p ArgSize): copy host-to-device when
/// OMP_TGT_MAPTYPE_TO is set and/or device-to-host when OMP_TGT_MAPTYPE_FROM
/// is set, re-establishing shadow-pointer values on the destination side.
/// Returns OFFLOAD_SUCCESS or OFFLOAD_FAIL.
static int targetDataContiguous(ident_t *Loc, DeviceTy &Device, void *ArgsBase,
                                void *HstPtrBegin, int64_t ArgSize,
                                int64_t ArgType, AsyncInfoTy &AsyncInfo) {
  // Look up the device copy without touching reference counts; the range must
  // be fully contained in an existing mapping (MustContain).
  TargetPointerResultTy TPR = Device.getMappingInfo().getTgtPtrBegin(
      HstPtrBegin, ArgSize, /*UpdateRefCount=*/false,
      /*UseHoldRefCount=*/false, /*MustContain=*/true);
  void *TgtPtrBegin = TPR.TargetPointer;
  if (!TPR.isPresent()) {
    DP("hst data:" DPxMOD " not found, becomes a noop\n", DPxPTR(HstPtrBegin));
    // A 'present' motion modifier makes a missing mapping a hard error;
    // otherwise an unmapped list item is silently ignored.
    if (ArgType & OMP_TGT_MAPTYPE_PRESENT) {
      MESSAGE("device mapping required by 'present' motion modifier does not "
              "exist for host address " DPxMOD " (%" PRId64 " bytes)",
              DPxPTR(HstPtrBegin), ArgSize);
      return OFFLOAD_FAIL;
    }
    return OFFLOAD_SUCCESS;
  }

  // Host and device share the storage (unified/shared memory); nothing to
  // transfer.
  if (TPR.Flags.IsHostPointer) {
    DP("hst data:" DPxMOD " unified and shared, becomes a noop\n",
       DPxPTR(HstPtrBegin));
    return OFFLOAD_SUCCESS;
  }

  if (ArgType & OMP_TGT_MAPTYPE_TO) {
    // Host-to-device motion.
    DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n",
       ArgSize, DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin));
    int Ret = Device.submitData(TgtPtrBegin, HstPtrBegin, ArgSize, AsyncInfo,
                                TPR.getEntry());
    if (Ret != OFFLOAD_SUCCESS) {
      REPORT("Copying data to device failed.\n");
      return OFFLOAD_FAIL;
    }
    if (TPR.getEntry()) {
      // The copy above may have overwritten device-side pointer fields; push
      // each recorded shadow pointer's device value back out.
      // NOTE(review): this inner `Ret` shadows the outer one, and the
      // assignment inside the lambda targets it; the value is then replaced
      // by foreachShadowPointerInfo's own return, which propagates failure.
      int Ret = TPR.getEntry()->foreachShadowPointerInfo(
          [&](ShadowPtrInfoTy &ShadowPtr) {
            DP("Restoring original target pointer value " DPxMOD " for target "
               "pointer " DPxMOD "\n",
               DPxPTR(ShadowPtr.TgtPtrVal), DPxPTR(ShadowPtr.TgtPtrAddr));
            Ret = Device.submitData(ShadowPtr.TgtPtrAddr,
                                    (void *)&ShadowPtr.TgtPtrVal,
                                    sizeof(void *), AsyncInfo);
            if (Ret != OFFLOAD_SUCCESS) {
              REPORT("Copying data to device failed.\n");
              return OFFLOAD_FAIL;
            }
            return OFFLOAD_SUCCESS;
          });
      if (Ret != OFFLOAD_SUCCESS) {
        DP("Updating shadow map failed\n");
        return Ret;
      }
    }
  }

  if (ArgType & OMP_TGT_MAPTYPE_FROM) {
    // Device-to-host motion.
    DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
       ArgSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
    int Ret = Device.retrieveData(HstPtrBegin, TgtPtrBegin, ArgSize, AsyncInfo,
                                  TPR.getEntry());
    if (Ret != OFFLOAD_SUCCESS) {
      REPORT("Copying data from device failed.\n");
      return OFFLOAD_FAIL;
    }

    // Wait for device-to-host memcopies for whole struct to complete,
    // before restoring the correct host pointer.
    if (auto *Entry = TPR.getEntry()) {
      // NOTE(review): the unlock() below pairs with a lock presumably taken
      // during the mapping lookup — the entry is released only after the
      // shadow pointers have been restored on the host; confirm against
      // getTgtPtrBegin's locking contract.
      AsyncInfo.addPostProcessingFunction([=]() -> int {
        int Ret = Entry->foreachShadowPointerInfo(
            [&](const ShadowPtrInfoTy &ShadowPtr) {
              *ShadowPtr.HstPtrAddr = ShadowPtr.HstPtrVal;
              DP("Restoring original host pointer value " DPxMOD
                 " for host pointer " DPxMOD "\n",
                 DPxPTR(ShadowPtr.HstPtrVal), DPxPTR(ShadowPtr.HstPtrAddr));
              return OFFLOAD_SUCCESS;
            });
        Entry->unlock();
        if (Ret != OFFLOAD_SUCCESS) {
          DP("Updating shadow map failed\n");
          return Ret;
        }
        return OFFLOAD_SUCCESS;
      });
    }
  }

  return OFFLOAD_SUCCESS;
}
854 
855 static int targetDataNonContiguous(ident_t *Loc, DeviceTy &Device,
856                                    void *ArgsBase,
857                                    __tgt_target_non_contig *NonContig,
858                                    uint64_t Size, int64_t ArgType,
859                                    int CurrentDim, int DimSize, uint64_t Offset,
860                                    AsyncInfoTy &AsyncInfo) {
861   int Ret = OFFLOAD_SUCCESS;
862   if (CurrentDim < DimSize) {
863     for (unsigned int I = 0; I < NonContig[CurrentDim].Count; ++I) {
864       uint64_t CurOffset =
865           (NonContig[CurrentDim].Offset + I) * NonContig[CurrentDim].Stride;
866       // we only need to transfer the first element for the last dimension
867       // since we've already got a contiguous piece.
868       if (CurrentDim != DimSize - 1 || I == 0) {
869         Ret = targetDataNonContiguous(Loc, Device, ArgsBase, NonContig, Size,
870                                       ArgType, CurrentDim + 1, DimSize,
871                                       Offset + CurOffset, AsyncInfo);
872         // Stop the whole process if any contiguous piece returns anything
873         // other than OFFLOAD_SUCCESS.
874         if (Ret != OFFLOAD_SUCCESS)
875           return Ret;
876       }
877     }
878   } else {
879     char *Ptr = (char *)ArgsBase + Offset;
880     DP("Transfer of non-contiguous : host ptr " DPxMOD " offset %" PRIu64
881        " len %" PRIu64 "\n",
882        DPxPTR(Ptr), Offset, Size);
883     Ret = targetDataContiguous(Loc, Device, ArgsBase, Ptr, Size, ArgType,
884                                AsyncInfo);
885   }
886   return Ret;
887 }
888 
889 static int getNonContigMergedDimension(__tgt_target_non_contig *NonContig,
890                                        int32_t DimSize) {
891   int RemovedDim = 0;
892   for (int I = DimSize - 1; I > 0; --I) {
893     if (NonContig[I].Count * NonContig[I].Stride == NonContig[I - 1].Stride)
894       RemovedDim++;
895   }
896   return RemovedDim;
897 }
898 
899 /// Internal function to pass data to/from the target.
900 int targetDataUpdate(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
901                      void **ArgsBase, void **Args, int64_t *ArgSizes,
902                      int64_t *ArgTypes, map_var_info_t *ArgNames,
903                      void **ArgMappers, AsyncInfoTy &AsyncInfo, bool) {
904   // process each input.
905   for (int32_t I = 0; I < ArgNum; ++I) {
906     if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) ||
907         (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE))
908       continue;
909 
910     if (ArgMappers && ArgMappers[I]) {
911       // Instead of executing the regular path of targetDataUpdate, call the
912       // targetDataMapper variant which will call targetDataUpdate again
913       // with new arguments.
914       DP("Calling targetDataMapper for the %dth argument\n", I);
915 
916       map_var_info_t ArgName = (!ArgNames) ? nullptr : ArgNames[I];
917       int Ret = targetDataMapper(Loc, Device, ArgsBase[I], Args[I], ArgSizes[I],
918                                  ArgTypes[I], ArgName, ArgMappers[I], AsyncInfo,
919                                  targetDataUpdate);
920 
921       if (Ret != OFFLOAD_SUCCESS) {
922         REPORT("Call to targetDataUpdate via targetDataMapper for custom mapper"
923                " failed.\n");
924         return OFFLOAD_FAIL;
925       }
926 
927       // Skip the rest of this function, continue to the next argument.
928       continue;
929     }
930 
931     int Ret = OFFLOAD_SUCCESS;
932 
933     if (ArgTypes[I] & OMP_TGT_MAPTYPE_NON_CONTIG) {
934       __tgt_target_non_contig *NonContig = (__tgt_target_non_contig *)Args[I];
935       int32_t DimSize = ArgSizes[I];
936       uint64_t Size =
937           NonContig[DimSize - 1].Count * NonContig[DimSize - 1].Stride;
938       int32_t MergedDim = getNonContigMergedDimension(NonContig, DimSize);
939       Ret = targetDataNonContiguous(
940           Loc, Device, ArgsBase[I], NonContig, Size, ArgTypes[I],
941           /*current_dim=*/0, DimSize - MergedDim, /*offset=*/0, AsyncInfo);
942     } else {
943       Ret = targetDataContiguous(Loc, Device, ArgsBase[I], Args[I], ArgSizes[I],
944                                  ArgTypes[I], AsyncInfo);
945     }
946     if (Ret == OFFLOAD_FAIL)
947       return OFFLOAD_FAIL;
948   }
949   return OFFLOAD_SUCCESS;
950 }
951 
952 static const unsigned LambdaMapping = OMP_TGT_MAPTYPE_PTR_AND_OBJ |
953                                       OMP_TGT_MAPTYPE_LITERAL |
954                                       OMP_TGT_MAPTYPE_IMPLICIT;
955 static bool isLambdaMapping(int64_t Mapping) {
956   return (Mapping & LambdaMapping) == LambdaMapping;
957 }
958 
959 namespace {
960 /// Find the table information in the map or look it up in the translation
961 /// tables.
962 TableMap *getTableMap(void *HostPtr) {
963   std::lock_guard<std::mutex> TblMapLock(PM->TblMapMtx);
964   HostPtrToTableMapTy::iterator TableMapIt =
965       PM->HostPtrToTableMap.find(HostPtr);
966 
967   if (TableMapIt != PM->HostPtrToTableMap.end())
968     return &TableMapIt->second;
969 
970   // We don't have a map. So search all the registered libraries.
971   TableMap *TM = nullptr;
972   std::lock_guard<std::mutex> TrlTblLock(PM->TrlTblMtx);
973   for (HostEntriesBeginToTransTableTy::iterator Itr =
974            PM->HostEntriesBeginToTransTable.begin();
975        Itr != PM->HostEntriesBeginToTransTable.end(); ++Itr) {
976     // get the translation table (which contains all the good info).
977     TranslationTable *TransTable = &Itr->second;
978     // iterate over all the host table entries to see if we can locate the
979     // host_ptr.
980     llvm::offloading::EntryTy *Cur = TransTable->HostTable.EntriesBegin;
981     for (uint32_t I = 0; Cur < TransTable->HostTable.EntriesEnd; ++Cur, ++I) {
982       if (Cur->Address != HostPtr)
983         continue;
984       // we got a match, now fill the HostPtrToTableMap so that we
985       // may avoid this search next time.
986       TM = &(PM->HostPtrToTableMap)[HostPtr];
987       TM->Table = TransTable;
988       TM->Index = I;
989       return TM;
990     }
991   }
992 
993   return nullptr;
994 }
995 
996 /// A class manages private arguments in a target region.
997 class PrivateArgumentManagerTy {
998   /// A data structure for the information of first-private arguments. We can
999   /// use this information to optimize data transfer by packing all
1000   /// first-private arguments and transfer them all at once.
1001   struct FirstPrivateArgInfoTy {
1002     /// Host pointer begin
1003     char *HstPtrBegin;
1004     /// Host pointer end
1005     char *HstPtrEnd;
1006     /// The index of the element in \p TgtArgs corresponding to the argument
1007     int Index;
1008     /// Alignment of the entry (base of the entry, not after the entry).
1009     uint32_t Alignment;
1010     /// Size (without alignment, see padding)
1011     uint32_t Size;
1012     /// Padding used to align this argument entry, if necessary.
1013     uint32_t Padding;
1014     /// Host pointer name
1015     map_var_info_t HstPtrName = nullptr;
1016 
1017     FirstPrivateArgInfoTy(int Index, void *HstPtr, uint32_t Size,
1018                           uint32_t Alignment, uint32_t Padding,
1019                           map_var_info_t HstPtrName = nullptr)
1020         : HstPtrBegin(reinterpret_cast<char *>(HstPtr)),
1021           HstPtrEnd(HstPtrBegin + Size), Index(Index), Alignment(Alignment),
1022           Size(Size), Padding(Padding), HstPtrName(HstPtrName) {}
1023   };
1024 
1025   /// A vector of target pointers for all private arguments
1026   SmallVector<void *> TgtPtrs;
1027 
1028   /// A vector of information of all first-private arguments to be packed
1029   SmallVector<FirstPrivateArgInfoTy> FirstPrivateArgInfo;
1030   /// Host buffer for all arguments to be packed
1031   SmallVector<char> FirstPrivateArgBuffer;
1032   /// The total size of all arguments to be packed
1033   int64_t FirstPrivateArgSize = 0;
1034 
1035   /// A reference to the \p DeviceTy object
1036   DeviceTy &Device;
1037   /// A pointer to a \p AsyncInfoTy object
1038   AsyncInfoTy &AsyncInfo;
1039 
1040   // TODO: What would be the best value here? Should we make it configurable?
1041   // If the size is larger than this threshold, we will allocate and transfer it
1042   // immediately instead of packing it.
1043   static constexpr const int64_t FirstPrivateArgSizeThreshold = 1024;
1044 
1045 public:
1046   /// Constructor
1047   PrivateArgumentManagerTy(DeviceTy &Dev, AsyncInfoTy &AsyncInfo)
1048       : Device(Dev), AsyncInfo(AsyncInfo) {}
1049 
1050   /// Add a private argument
1051   int addArg(void *HstPtr, int64_t ArgSize, int64_t ArgOffset,
1052              bool IsFirstPrivate, void *&TgtPtr, int TgtArgsIndex,
1053              map_var_info_t HstPtrName = nullptr,
1054              const bool AllocImmediately = false) {
1055     // If the argument is not first-private, or its size is greater than a
1056     // predefined threshold, we will allocate memory and issue the transfer
1057     // immediately.
1058     if (ArgSize > FirstPrivateArgSizeThreshold || !IsFirstPrivate ||
1059         AllocImmediately) {
1060       TgtPtr = Device.allocData(ArgSize, HstPtr);
1061       if (!TgtPtr) {
1062         DP("Data allocation for %sprivate array " DPxMOD " failed.\n",
1063            (IsFirstPrivate ? "first-" : ""), DPxPTR(HstPtr));
1064         return OFFLOAD_FAIL;
1065       }
1066 #ifdef OMPTARGET_DEBUG
1067       void *TgtPtrBase = (void *)((intptr_t)TgtPtr + ArgOffset);
1068       DP("Allocated %" PRId64 " bytes of target memory at " DPxMOD
1069          " for %sprivate array " DPxMOD " - pushing target argument " DPxMOD
1070          "\n",
1071          ArgSize, DPxPTR(TgtPtr), (IsFirstPrivate ? "first-" : ""),
1072          DPxPTR(HstPtr), DPxPTR(TgtPtrBase));
1073 #endif
1074       // If first-private, copy data from host
1075       if (IsFirstPrivate) {
1076         DP("Submitting firstprivate data to the device.\n");
1077         int Ret = Device.submitData(TgtPtr, HstPtr, ArgSize, AsyncInfo);
1078         if (Ret != OFFLOAD_SUCCESS) {
1079           DP("Copying data to device failed, failed.\n");
1080           return OFFLOAD_FAIL;
1081         }
1082       }
1083       TgtPtrs.push_back(TgtPtr);
1084     } else {
1085       DP("Firstprivate array " DPxMOD " of size %" PRId64 " will be packed\n",
1086          DPxPTR(HstPtr), ArgSize);
1087       // When reach this point, the argument must meet all following
1088       // requirements:
1089       // 1. Its size does not exceed the threshold (see the comment for
1090       // FirstPrivateArgSizeThreshold);
1091       // 2. It must be first-private (needs to be mapped to target device).
1092       // We will pack all this kind of arguments to transfer them all at once
1093       // to reduce the number of data transfer. We will not take
1094       // non-first-private arguments, aka. private arguments that doesn't need
1095       // to be mapped to target device, into account because data allocation
1096       // can be very efficient with memory manager.
1097 
1098       // Placeholder value
1099       TgtPtr = nullptr;
1100       auto *LastFPArgInfo =
1101           FirstPrivateArgInfo.empty() ? nullptr : &FirstPrivateArgInfo.back();
1102 
1103       // Compute the start alignment of this entry, add padding if necessary.
1104       // TODO: Consider sorting instead.
1105       uint32_t Padding = 0;
1106       uint32_t StartAlignment =
1107           LastFPArgInfo ? LastFPArgInfo->Alignment : MaxAlignment;
1108       if (LastFPArgInfo) {
1109         // Check if we keep the start alignment or if it is shrunk due to the
1110         // size of the last element.
1111         uint32_t Offset = LastFPArgInfo->Size % StartAlignment;
1112         if (Offset)
1113           StartAlignment = Offset;
1114         // We only need as much alignment as the host pointer had (since we
1115         // don't know the alignment information from the source we might end up
1116         // overaligning accesses but not too much).
1117         uint32_t RequiredAlignment =
1118             llvm::bit_floor(getPartialStructRequiredAlignment(HstPtr));
1119         if (RequiredAlignment > StartAlignment) {
1120           Padding = RequiredAlignment - StartAlignment;
1121           StartAlignment = RequiredAlignment;
1122         }
1123       }
1124 
1125       FirstPrivateArgInfo.emplace_back(TgtArgsIndex, HstPtr, ArgSize,
1126                                        StartAlignment, Padding, HstPtrName);
1127       FirstPrivateArgSize += Padding + ArgSize;
1128     }
1129 
1130     return OFFLOAD_SUCCESS;
1131   }
1132 
1133   /// Pack first-private arguments, replace place holder pointers in \p TgtArgs,
1134   /// and start the transfer.
1135   int packAndTransfer(SmallVector<void *> &TgtArgs) {
1136     if (!FirstPrivateArgInfo.empty()) {
1137       assert(FirstPrivateArgSize != 0 &&
1138              "FirstPrivateArgSize is 0 but FirstPrivateArgInfo is empty");
1139       FirstPrivateArgBuffer.resize(FirstPrivateArgSize, 0);
1140       auto *Itr = FirstPrivateArgBuffer.begin();
1141       // Copy all host data to this buffer
1142       for (FirstPrivateArgInfoTy &Info : FirstPrivateArgInfo) {
1143         // First pad the pointer as we (have to) pad it on the device too.
1144         Itr = std::next(Itr, Info.Padding);
1145         std::copy(Info.HstPtrBegin, Info.HstPtrEnd, Itr);
1146         Itr = std::next(Itr, Info.Size);
1147       }
1148       // Allocate target memory
1149       void *TgtPtr =
1150           Device.allocData(FirstPrivateArgSize, FirstPrivateArgBuffer.data());
1151       if (TgtPtr == nullptr) {
1152         DP("Failed to allocate target memory for private arguments.\n");
1153         return OFFLOAD_FAIL;
1154       }
1155       TgtPtrs.push_back(TgtPtr);
1156       DP("Allocated %" PRId64 " bytes of target memory at " DPxMOD "\n",
1157          FirstPrivateArgSize, DPxPTR(TgtPtr));
1158       // Transfer data to target device
1159       int Ret = Device.submitData(TgtPtr, FirstPrivateArgBuffer.data(),
1160                                   FirstPrivateArgSize, AsyncInfo);
1161       if (Ret != OFFLOAD_SUCCESS) {
1162         DP("Failed to submit data of private arguments.\n");
1163         return OFFLOAD_FAIL;
1164       }
1165       // Fill in all placeholder pointers
1166       auto TP = reinterpret_cast<uintptr_t>(TgtPtr);
1167       for (FirstPrivateArgInfoTy &Info : FirstPrivateArgInfo) {
1168         void *&Ptr = TgtArgs[Info.Index];
1169         assert(Ptr == nullptr && "Target pointer is already set by mistaken");
1170         // Pad the device pointer to get the right alignment.
1171         TP += Info.Padding;
1172         Ptr = reinterpret_cast<void *>(TP);
1173         TP += Info.Size;
1174         DP("Firstprivate array " DPxMOD " of size %" PRId64 " mapped to " DPxMOD
1175            "\n",
1176            DPxPTR(Info.HstPtrBegin), Info.HstPtrEnd - Info.HstPtrBegin,
1177            DPxPTR(Ptr));
1178       }
1179     }
1180 
1181     return OFFLOAD_SUCCESS;
1182   }
1183 
1184   /// Free all target memory allocated for private arguments
1185   int free() {
1186     for (void *P : TgtPtrs) {
1187       int Ret = Device.deleteData(P);
1188       if (Ret != OFFLOAD_SUCCESS) {
1189         DP("Deallocation of (first-)private arrays failed.\n");
1190         return OFFLOAD_FAIL;
1191       }
1192     }
1193 
1194     TgtPtrs.clear();
1195 
1196     return OFFLOAD_SUCCESS;
1197   }
1198 };
1199 
1200 /// Process data before launching the kernel, including calling targetDataBegin
1201 /// to map and transfer data to target device, transferring (first-)private
1202 /// variables.
1203 static int processDataBefore(ident_t *Loc, int64_t DeviceId, void *HostPtr,
1204                              int32_t ArgNum, void **ArgBases, void **Args,
1205                              int64_t *ArgSizes, int64_t *ArgTypes,
1206                              map_var_info_t *ArgNames, void **ArgMappers,
1207                              SmallVector<void *> &TgtArgs,
1208                              SmallVector<ptrdiff_t> &TgtOffsets,
1209                              PrivateArgumentManagerTy &PrivateArgumentManager,
1210                              AsyncInfoTy &AsyncInfo) {
1211 
1212   auto DeviceOrErr = PM->getDevice(DeviceId);
1213   if (!DeviceOrErr)
1214     FATAL_MESSAGE(DeviceId, "%s", toString(DeviceOrErr.takeError()).c_str());
1215 
1216   int Ret = targetDataBegin(Loc, *DeviceOrErr, ArgNum, ArgBases, Args, ArgSizes,
1217                             ArgTypes, ArgNames, ArgMappers, AsyncInfo);
1218   if (Ret != OFFLOAD_SUCCESS) {
1219     REPORT("Call to targetDataBegin failed, abort target.\n");
1220     return OFFLOAD_FAIL;
1221   }
1222 
1223   // List of (first-)private arrays allocated for this target region
1224   SmallVector<int> TgtArgsPositions(ArgNum, -1);
1225 
1226   for (int32_t I = 0; I < ArgNum; ++I) {
1227     if (!(ArgTypes[I] & OMP_TGT_MAPTYPE_TARGET_PARAM)) {
1228       // This is not a target parameter, do not push it into TgtArgs.
1229       // Check for lambda mapping.
1230       if (isLambdaMapping(ArgTypes[I])) {
1231         assert((ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) &&
1232                "PTR_AND_OBJ must be also MEMBER_OF.");
1233         unsigned Idx = getParentIndex(ArgTypes[I]);
1234         int TgtIdx = TgtArgsPositions[Idx];
1235         assert(TgtIdx != -1 && "Base address must be translated already.");
1236         // The parent lambda must be processed already and it must be the last
1237         // in TgtArgs and TgtOffsets arrays.
1238         void *HstPtrVal = Args[I];
1239         void *HstPtrBegin = ArgBases[I];
1240         void *HstPtrBase = Args[Idx];
1241         void *TgtPtrBase =
1242             (void *)((intptr_t)TgtArgs[TgtIdx] + TgtOffsets[TgtIdx]);
1243         DP("Parent lambda base " DPxMOD "\n", DPxPTR(TgtPtrBase));
1244         uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase;
1245         void *TgtPtrBegin = (void *)((uintptr_t)TgtPtrBase + Delta);
1246         void *&PointerTgtPtrBegin = AsyncInfo.getVoidPtrLocation();
1247         TargetPointerResultTy TPR =
1248             DeviceOrErr->getMappingInfo().getTgtPtrBegin(
1249                 HstPtrVal, ArgSizes[I], /*UpdateRefCount=*/false,
1250                 /*UseHoldRefCount=*/false);
1251         PointerTgtPtrBegin = TPR.TargetPointer;
1252         if (!TPR.isPresent()) {
1253           DP("No lambda captured variable mapped (" DPxMOD ") - ignored\n",
1254              DPxPTR(HstPtrVal));
1255           continue;
1256         }
1257         if (TPR.Flags.IsHostPointer) {
1258           DP("Unified memory is active, no need to map lambda captured"
1259              "variable (" DPxMOD ")\n",
1260              DPxPTR(HstPtrVal));
1261           continue;
1262         }
1263         DP("Update lambda reference (" DPxMOD ") -> [" DPxMOD "]\n",
1264            DPxPTR(PointerTgtPtrBegin), DPxPTR(TgtPtrBegin));
1265         Ret =
1266             DeviceOrErr->submitData(TgtPtrBegin, &PointerTgtPtrBegin,
1267                                     sizeof(void *), AsyncInfo, TPR.getEntry());
1268         if (Ret != OFFLOAD_SUCCESS) {
1269           REPORT("Copying data to device failed.\n");
1270           return OFFLOAD_FAIL;
1271         }
1272       }
1273       continue;
1274     }
1275     void *HstPtrBegin = Args[I];
1276     void *HstPtrBase = ArgBases[I];
1277     void *TgtPtrBegin;
1278     map_var_info_t HstPtrName = (!ArgNames) ? nullptr : ArgNames[I];
1279     ptrdiff_t TgtBaseOffset;
1280     TargetPointerResultTy TPR;
1281     if (ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) {
1282       DP("Forwarding first-private value " DPxMOD " to the target construct\n",
1283          DPxPTR(HstPtrBase));
1284       TgtPtrBegin = HstPtrBase;
1285       TgtBaseOffset = 0;
1286     } else if (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE) {
1287       TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin;
1288       const bool IsFirstPrivate = (ArgTypes[I] & OMP_TGT_MAPTYPE_TO);
1289       // If there is a next argument and it depends on the current one, we need
1290       // to allocate the private memory immediately. If this is not the case,
1291       // then the argument can be marked for optimization and packed with the
1292       // other privates.
1293       const bool AllocImmediately =
1294           (I < ArgNum - 1 && (ArgTypes[I + 1] & OMP_TGT_MAPTYPE_MEMBER_OF));
1295       Ret = PrivateArgumentManager.addArg(
1296           HstPtrBegin, ArgSizes[I], TgtBaseOffset, IsFirstPrivate, TgtPtrBegin,
1297           TgtArgs.size(), HstPtrName, AllocImmediately);
1298       if (Ret != OFFLOAD_SUCCESS) {
1299         REPORT("Failed to process %sprivate argument " DPxMOD "\n",
1300                (IsFirstPrivate ? "first-" : ""), DPxPTR(HstPtrBegin));
1301         return OFFLOAD_FAIL;
1302       }
1303     } else {
1304       if (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)
1305         HstPtrBase = *reinterpret_cast<void **>(HstPtrBase);
1306       TPR = DeviceOrErr->getMappingInfo().getTgtPtrBegin(
1307           HstPtrBegin, ArgSizes[I],
1308           /*UpdateRefCount=*/false,
1309           /*UseHoldRefCount=*/false);
1310       TgtPtrBegin = TPR.TargetPointer;
1311       TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin;
1312 #ifdef OMPTARGET_DEBUG
1313       void *TgtPtrBase = (void *)((intptr_t)TgtPtrBegin + TgtBaseOffset);
1314       DP("Obtained target argument " DPxMOD " from host pointer " DPxMOD "\n",
1315          DPxPTR(TgtPtrBase), DPxPTR(HstPtrBegin));
1316 #endif
1317     }
1318     TgtArgsPositions[I] = TgtArgs.size();
1319     TgtArgs.push_back(TgtPtrBegin);
1320     TgtOffsets.push_back(TgtBaseOffset);
1321   }
1322 
1323   assert(TgtArgs.size() == TgtOffsets.size() &&
1324          "Size mismatch in arguments and offsets");
1325 
1326   // Pack and transfer first-private arguments
1327   Ret = PrivateArgumentManager.packAndTransfer(TgtArgs);
1328   if (Ret != OFFLOAD_SUCCESS) {
1329     DP("Failed to pack and transfer first private arguments\n");
1330     return OFFLOAD_FAIL;
1331   }
1332 
1333   return OFFLOAD_SUCCESS;
1334 }
1335 
1336 /// Process data after launching the kernel, including transferring data back to
1337 /// host if needed and deallocating target memory of (first-)private variables.
1338 static int processDataAfter(ident_t *Loc, int64_t DeviceId, void *HostPtr,
1339                             int32_t ArgNum, void **ArgBases, void **Args,
1340                             int64_t *ArgSizes, int64_t *ArgTypes,
1341                             map_var_info_t *ArgNames, void **ArgMappers,
1342                             PrivateArgumentManagerTy &PrivateArgumentManager,
1343                             AsyncInfoTy &AsyncInfo) {
1344 
1345   auto DeviceOrErr = PM->getDevice(DeviceId);
1346   if (!DeviceOrErr)
1347     FATAL_MESSAGE(DeviceId, "%s", toString(DeviceOrErr.takeError()).c_str());
1348 
1349   // Move data from device.
1350   int Ret = targetDataEnd(Loc, *DeviceOrErr, ArgNum, ArgBases, Args, ArgSizes,
1351                           ArgTypes, ArgNames, ArgMappers, AsyncInfo);
1352   if (Ret != OFFLOAD_SUCCESS) {
1353     REPORT("Call to targetDataEnd failed, abort target.\n");
1354     return OFFLOAD_FAIL;
1355   }
1356 
1357   // Free target memory for private arguments after synchronization.
1358   // TODO: We might want to remove `mutable` in the future by not changing the
1359   // captured variables somehow.
1360   AsyncInfo.addPostProcessingFunction(
1361       [PrivateArgumentManager =
1362            std::move(PrivateArgumentManager)]() mutable -> int {
1363         int Ret = PrivateArgumentManager.free();
1364         if (Ret != OFFLOAD_SUCCESS) {
1365           REPORT("Failed to deallocate target memory for private args\n");
1366           return OFFLOAD_FAIL;
1367         }
1368         return Ret;
1369       });
1370 
1371   return OFFLOAD_SUCCESS;
1372 }
1373 } // namespace
1374 
1375 /// performs the same actions as data_begin in case arg_num is
1376 /// non-zero and initiates run of the offloaded region on the target platform;
1377 /// if arg_num is non-zero after the region execution is done it also
1378 /// performs the same action as data_update and data_end above. This function
1379 /// returns 0 if it was able to transfer the execution to a target and an
1380 /// integer different from zero otherwise.
int target(ident_t *Loc, DeviceTy &Device, void *HostPtr,
           KernelArgsTy &KernelArgs, AsyncInfoTy &AsyncInfo) {
  int32_t DeviceId = Device.DeviceID;
  // Find the translation-table entry that maps this host entry point to its
  // per-device counterparts.
  TableMap *TM = getTableMap(HostPtr);
  // No map for this host pointer found!
  if (!TM) {
    REPORT("Host ptr " DPxMOD " does not have a matching target pointer.\n",
           DPxPTR(HostPtr));
    return OFFLOAD_FAIL;
  }

  // get target table.
  __tgt_target_table *TargetTable = nullptr;
  {
    // The translation tables may be updated concurrently (e.g. while images
    // are registered), so read the per-device entry under the lock.
    std::lock_guard<std::mutex> TrlTblLock(PM->TrlTblMtx);
    assert(TM->Table->TargetsTable.size() > (size_t)DeviceId &&
           "Not expecting a device ID outside the table's bounds!");
    TargetTable = TM->Table->TargetsTable[DeviceId];
  }
  assert(TargetTable && "Global data has not been mapped\n");

  DP("loop trip count is %" PRIu64 ".\n", KernelArgs.Tripcount);

  // We need to keep bases and offsets separate. Sometimes (e.g. in OpenCL) we
  // need to manifest base pointers prior to launching a kernel. Even if we have
  // mapped an object only partially, e.g. A[N:M], although the kernel is
  // expected to access elements starting at address &A[N] and beyond, we still
  // need to manifest the base of the array &A[0]. In other cases, e.g. the COI
  // API, we need the begin address itself, i.e. &A[N], as the API operates on
  // begin addresses, not bases. That's why we pass args and offsets as two
  // separate entities so that each plugin can do what it needs. This behavior
  // was introduced via https://reviews.llvm.org/D33028 and commit 1546d319244c.
  SmallVector<void *> TgtArgs;
  SmallVector<ptrdiff_t> TgtOffsets;

  // Owns the device copies of (first-)private arguments; processDataAfter
  // deallocates them once the kernel has finished.
  PrivateArgumentManagerTy PrivateArgumentManager(Device, AsyncInfo);

  int NumClangLaunchArgs = KernelArgs.NumArgs;
  int Ret = OFFLOAD_SUCCESS;
  if (NumClangLaunchArgs) {
    // Process data, such as data mapping, before launching the kernel
    Ret = processDataBefore(Loc, DeviceId, HostPtr, NumClangLaunchArgs,
                            KernelArgs.ArgBasePtrs, KernelArgs.ArgPtrs,
                            KernelArgs.ArgSizes, KernelArgs.ArgTypes,
                            KernelArgs.ArgNames, KernelArgs.ArgMappers, TgtArgs,
                            TgtOffsets, PrivateArgumentManager, AsyncInfo);
    if (Ret != OFFLOAD_SUCCESS) {
      REPORT("Failed to process data before launching the kernel.\n");
      return OFFLOAD_FAIL;
    }

    // Clang might pass more values via the ArgPtrs to the runtime that we pass
    // on to the kernel.
    // TODO: Next time we adjust the KernelArgsTy we should introduce a new
    // NumKernelArgs field.
    KernelArgs.NumArgs = TgtArgs.size();
  }

  // Launch device execution.
  void *TgtEntryPtr = TargetTable->EntriesBegin[TM->Index].Address;
  DP("Launching target execution %s with pointer " DPxMOD " (index=%d).\n",
     TargetTable->EntriesBegin[TM->Index].SymbolName, DPxPTR(TgtEntryPtr),
     TM->Index);

  {
    assert(KernelArgs.NumArgs == TgtArgs.size() && "Argument count mismatch!");
    TIMESCOPE_WITH_DETAILS_AND_IDENT(
        "Kernel Target",
        "NumArguments=" + std::to_string(KernelArgs.NumArgs) +
            ";NumTeams=" + std::to_string(KernelArgs.NumTeams[0]) +
            ";TripCount=" + std::to_string(KernelArgs.Tripcount),
        Loc);

#ifdef OMPT_SUPPORT
    /// RAII to establish tool anchors before and after kernel launch
    int32_t NumTeams = KernelArgs.NumTeams[0];
    // No need to guard this with OMPT_IF_BUILT
    InterfaceRAII TargetSubmitRAII(
        RegionInterface.getCallbacks<ompt_callback_target_submit>(), NumTeams);
#endif

    Ret = Device.launchKernel(TgtEntryPtr, TgtArgs.data(), TgtOffsets.data(),
                              KernelArgs, AsyncInfo);
  }

  if (Ret != OFFLOAD_SUCCESS) {
    REPORT("Executing target region abort target.\n");
    return OFFLOAD_FAIL;
  }

  if (NumClangLaunchArgs) {
    // Transfer data back and deallocate target memory for (first-)private
    // variables
    Ret = processDataAfter(Loc, DeviceId, HostPtr, NumClangLaunchArgs,
                           KernelArgs.ArgBasePtrs, KernelArgs.ArgPtrs,
                           KernelArgs.ArgSizes, KernelArgs.ArgTypes,
                           KernelArgs.ArgNames, KernelArgs.ArgMappers,
                           PrivateArgumentManager, AsyncInfo);
    if (Ret != OFFLOAD_SUCCESS) {
      REPORT("Failed to process data after launching the kernel.\n");
      return OFFLOAD_FAIL;
    }
  }

  return OFFLOAD_SUCCESS;
}
1487 
1488 /// Enables the record replay mechanism by pre-allocating MemorySize
1489 /// and informing the record-replayer of whether to store the output
1490 /// in some file.
1491 int target_activate_rr(DeviceTy &Device, uint64_t MemorySize, void *VAddr,
1492                        bool IsRecord, bool SaveOutput,
1493                        uint64_t &ReqPtrArgOffset) {
1494   return Device.RTL->initialize_record_replay(Device.DeviceID, MemorySize,
1495                                               VAddr, IsRecord, SaveOutput,
1496                                               ReqPtrArgOffset);
1497 }
1498 
1499 /// Executes a kernel using pre-recorded information for loading to
1500 /// device memory to launch the target kernel with the pre-recorded
1501 /// configuration.
1502 int target_replay(ident_t *Loc, DeviceTy &Device, void *HostPtr,
1503                   void *DeviceMemory, int64_t DeviceMemorySize, void **TgtArgs,
1504                   ptrdiff_t *TgtOffsets, int32_t NumArgs, int32_t NumTeams,
1505                   int32_t ThreadLimit, uint64_t LoopTripCount,
1506                   AsyncInfoTy &AsyncInfo) {
1507   int32_t DeviceId = Device.DeviceID;
1508   TableMap *TM = getTableMap(HostPtr);
1509   // Fail if the table map fails to find the target kernel pointer for the
1510   // provided host pointer.
1511   if (!TM) {
1512     REPORT("Host ptr " DPxMOD " does not have a matching target pointer.\n",
1513            DPxPTR(HostPtr));
1514     return OFFLOAD_FAIL;
1515   }
1516 
1517   // Retrieve the target table of offloading entries.
1518   __tgt_target_table *TargetTable = nullptr;
1519   {
1520     std::lock_guard<std::mutex> TrlTblLock(PM->TrlTblMtx);
1521     assert(TM->Table->TargetsTable.size() > (size_t)DeviceId &&
1522            "Not expecting a device ID outside the table's bounds!");
1523     TargetTable = TM->Table->TargetsTable[DeviceId];
1524   }
1525   assert(TargetTable && "Global data has not been mapped\n");
1526 
1527   // Retrieve the target kernel pointer, allocate and store the recorded device
1528   // memory data, and launch device execution.
1529   void *TgtEntryPtr = TargetTable->EntriesBegin[TM->Index].Address;
1530   DP("Launching target execution %s with pointer " DPxMOD " (index=%d).\n",
1531      TargetTable->EntriesBegin[TM->Index].SymbolName, DPxPTR(TgtEntryPtr),
1532      TM->Index);
1533 
1534   void *TgtPtr = Device.allocData(DeviceMemorySize, /*HstPtr=*/nullptr,
1535                                   TARGET_ALLOC_DEFAULT);
1536   Device.submitData(TgtPtr, DeviceMemory, DeviceMemorySize, AsyncInfo);
1537 
1538   KernelArgsTy KernelArgs{};
1539   KernelArgs.Version = OMP_KERNEL_ARG_VERSION;
1540   KernelArgs.NumArgs = NumArgs;
1541   KernelArgs.Tripcount = LoopTripCount;
1542   KernelArgs.NumTeams[0] = NumTeams;
1543   KernelArgs.ThreadLimit[0] = ThreadLimit;
1544 
1545   int Ret = Device.launchKernel(TgtEntryPtr, TgtArgs, TgtOffsets, KernelArgs,
1546                                 AsyncInfo);
1547 
1548   if (Ret != OFFLOAD_SUCCESS) {
1549     REPORT("Executing target region abort target.\n");
1550     return OFFLOAD_FAIL;
1551   }
1552 
1553   return OFFLOAD_SUCCESS;
1554 }
1555