xref: /llvm-project/offload/src/OpenMP/API.cpp (revision caaf8099efa87a7ebca8920971b7d7f719808591)
1 //===----------- api.cpp - Target independent OpenMP target RTL -----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // Implementation of OpenMP API interface functions.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "PluginManager.h"
14 #include "device.h"
15 #include "omptarget.h"
16 #include "rtl.h"
17 
18 #include "OpenMP/InternalTypes.h"
19 #include "OpenMP/Mapping.h"
20 #include "OpenMP/OMPT/Interface.h"
21 #include "OpenMP/omp.h"
22 #include "Shared/Profile.h"
23 
24 #include "llvm/ADT/SmallVector.h"
25 
26 #include <climits>
27 #include <cstdlib>
28 #include <cstring>
29 #include <mutex>
30 
31 EXTERN void ompx_dump_mapping_tables() {
32   ident_t Loc = {0, 0, 0, 0, ";libomptarget;libomptarget;0;0;;"};
33   auto ExclusiveDevicesAccessor = PM->getExclusiveDevicesAccessor();
34   for (auto &Device : PM->devices(ExclusiveDevicesAccessor))
35     dumpTargetPointerMappings(&Loc, Device, true);
36 }
37 
38 #ifdef OMPT_SUPPORT
39 using namespace llvm::omp::target::ompt;
40 #endif
41 
// Forward declarations of the explicit allocation, deallocation, and host
// memory lock/unlock helpers that the API entry points in this file forward
// to. `Name` receives the calling API function's name (`__func__`).

// Allocation of `Size` bytes of kind `Kind` for device `DeviceNum`.
void *targetAllocExplicit(size_t Size, int DeviceNum, int Kind,
                          const char *Name);

// Release of `DevicePtr`, using the same `Kind` selector as the allocation.
void targetFreeExplicit(void *DevicePtr, int DeviceNum, int Kind,
                        const char *Name);

// Lock of `Size` bytes of host memory at `HostPtr` for device `DeviceNum`.
void *targetLockExplicit(void *HostPtr, size_t Size, int DeviceNum,
                         const char *Name);

// Unlock of host memory at `HostPtr` for device `DeviceNum`.
void targetUnlockExplicit(void *HostPtr, int DeviceNum, const char *Name);
49 
50 // Implemented in libomp, they are called from within __tgt_* functions.
51 extern "C" {
52 int __kmpc_get_target_offload(void) __attribute__((weak));
53 kmp_task_t *__kmpc_omp_task_alloc(ident_t *loc_ref, int32_t gtid, int32_t flags,
54                                   size_t sizeof_kmp_task_t,
55                                   size_t sizeof_shareds,
56                                   kmp_routine_entry_t task_entry)
57     __attribute__((weak));
58 
59 kmp_task_t *
60 __kmpc_omp_target_task_alloc(ident_t *loc_ref, int32_t gtid, int32_t flags,
61                              size_t sizeof_kmp_task_t, size_t sizeof_shareds,
62                              kmp_routine_entry_t task_entry, int64_t device_id)
63     __attribute__((weak));
64 
65 int32_t __kmpc_omp_task_with_deps(ident_t *loc_ref, int32_t gtid,
66                                   kmp_task_t *new_task, int32_t ndeps,
67                                   kmp_depend_info_t *dep_list,
68                                   int32_t ndeps_noalias,
69                                   kmp_depend_info_t *noalias_dep_list)
70     __attribute__((weak));
71 }
72 
73 EXTERN int omp_get_num_devices(void) {
74   TIMESCOPE();
75   OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
76   size_t NumDevices = PM->getNumDevices();
77 
78   DP("Call to omp_get_num_devices returning %zd\n", NumDevices);
79 
80   return NumDevices;
81 }
82 
83 EXTERN int omp_get_device_num(void) {
84   TIMESCOPE();
85   OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
86   int HostDevice = omp_get_initial_device();
87 
88   DP("Call to omp_get_device_num returning %d\n", HostDevice);
89 
90   return HostDevice;
91 }
92 
93 EXTERN int omp_get_initial_device(void) {
94   TIMESCOPE();
95   OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
96   int HostDevice = omp_get_num_devices();
97   DP("Call to omp_get_initial_device returning %d\n", HostDevice);
98   return HostDevice;
99 }
100 
101 EXTERN void *omp_target_alloc(size_t Size, int DeviceNum) {
102   TIMESCOPE_WITH_DETAILS("dst_dev=" + std::to_string(DeviceNum) +
103                          ";size=" + std::to_string(Size));
104   OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
105   return targetAllocExplicit(Size, DeviceNum, TARGET_ALLOC_DEFAULT, __func__);
106 }
107 
108 EXTERN void *llvm_omp_target_alloc_device(size_t Size, int DeviceNum) {
109   OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
110   return targetAllocExplicit(Size, DeviceNum, TARGET_ALLOC_DEVICE, __func__);
111 }
112 
113 EXTERN void *llvm_omp_target_alloc_host(size_t Size, int DeviceNum) {
114   OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
115   return targetAllocExplicit(Size, DeviceNum, TARGET_ALLOC_HOST, __func__);
116 }
117 
118 EXTERN void *llvm_omp_target_alloc_shared(size_t Size, int DeviceNum) {
119   OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
120   return targetAllocExplicit(Size, DeviceNum, TARGET_ALLOC_SHARED, __func__);
121 }
122 
123 EXTERN void omp_target_free(void *Ptr, int DeviceNum) {
124   TIMESCOPE();
125   OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
126   return targetFreeExplicit(Ptr, DeviceNum, TARGET_ALLOC_DEFAULT, __func__);
127 }
128 
129 EXTERN void llvm_omp_target_free_device(void *Ptr, int DeviceNum) {
130   OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
131   return targetFreeExplicit(Ptr, DeviceNum, TARGET_ALLOC_DEVICE, __func__);
132 }
133 
134 EXTERN void llvm_omp_target_free_host(void *Ptr, int DeviceNum) {
135   OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
136   return targetFreeExplicit(Ptr, DeviceNum, TARGET_ALLOC_HOST, __func__);
137 }
138 
139 EXTERN void llvm_omp_target_free_shared(void *Ptre, int DeviceNum) {
140   OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
141   return targetFreeExplicit(Ptre, DeviceNum, TARGET_ALLOC_SHARED, __func__);
142 }
143 
144 EXTERN void *llvm_omp_target_dynamic_shared_alloc() {
145   OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
146   return nullptr;
147 }
148 
149 EXTERN void *llvm_omp_get_dynamic_shared() {
150   OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
151   return nullptr;
152 }
153 
154 EXTERN [[nodiscard]] void *llvm_omp_target_lock_mem(void *Ptr, size_t Size,
155                                                     int DeviceNum) {
156   OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
157   return targetLockExplicit(Ptr, Size, DeviceNum, __func__);
158 }
159 
160 EXTERN void llvm_omp_target_unlock_mem(void *Ptr, int DeviceNum) {
161   OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
162   targetUnlockExplicit(Ptr, DeviceNum, __func__);
163 }
164 
165 EXTERN int omp_target_is_present(const void *Ptr, int DeviceNum) {
166   TIMESCOPE();
167   OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
168   DP("Call to omp_target_is_present for device %d and address " DPxMOD "\n",
169      DeviceNum, DPxPTR(Ptr));
170 
171   if (!Ptr) {
172     DP("Call to omp_target_is_present with NULL ptr, returning false\n");
173     return false;
174   }
175 
176   if (DeviceNum == omp_get_initial_device()) {
177     DP("Call to omp_target_is_present on host, returning true\n");
178     return true;
179   }
180 
181   auto DeviceOrErr = PM->getDevice(DeviceNum);
182   if (!DeviceOrErr)
183     FATAL_MESSAGE(DeviceNum, "%s", toString(DeviceOrErr.takeError()).c_str());
184 
185   // omp_target_is_present tests whether a host pointer refers to storage that
186   // is mapped to a given device. However, due to the lack of the storage size,
187   // only check 1 byte. Cannot set size 0 which checks whether the pointer (zero
188   // lengh array) is mapped instead of the referred storage.
189   TargetPointerResultTy TPR =
190       DeviceOrErr->getMappingInfo().getTgtPtrBegin(const_cast<void *>(Ptr), 1,
191                                                    /*UpdateRefCount=*/false,
192                                                    /*UseHoldRefCount=*/false);
193   int Rc = TPR.isPresent();
194   DP("Call to omp_target_is_present returns %d\n", Rc);
195   return Rc;
196 }
197 
198 EXTERN int omp_target_memcpy(void *Dst, const void *Src, size_t Length,
199                              size_t DstOffset, size_t SrcOffset, int DstDevice,
200                              int SrcDevice) {
201   TIMESCOPE_WITH_DETAILS("dst_dev=" + std::to_string(DstDevice) +
202                          ";src_dev=" + std::to_string(SrcDevice) +
203                          ";size=" + std::to_string(Length));
204   OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
205   DP("Call to omp_target_memcpy, dst device %d, src device %d, "
206      "dst addr " DPxMOD ", src addr " DPxMOD ", dst offset %zu, "
207      "src offset %zu, length %zu\n",
208      DstDevice, SrcDevice, DPxPTR(Dst), DPxPTR(Src), DstOffset, SrcOffset,
209      Length);
210 
211   if (!Dst || !Src || Length <= 0) {
212     if (Length == 0) {
213       DP("Call to omp_target_memcpy with zero length, nothing to do\n");
214       return OFFLOAD_SUCCESS;
215     }
216 
217     REPORT("Call to omp_target_memcpy with invalid arguments\n");
218     return OFFLOAD_FAIL;
219   }
220 
221   int Rc = OFFLOAD_SUCCESS;
222   void *SrcAddr = (char *)const_cast<void *>(Src) + SrcOffset;
223   void *DstAddr = (char *)Dst + DstOffset;
224 
225   if (SrcDevice == omp_get_initial_device() &&
226       DstDevice == omp_get_initial_device()) {
227     DP("copy from host to host\n");
228     const void *P = memcpy(DstAddr, SrcAddr, Length);
229     if (P == NULL)
230       Rc = OFFLOAD_FAIL;
231   } else if (SrcDevice == omp_get_initial_device()) {
232     DP("copy from host to device\n");
233     auto DstDeviceOrErr = PM->getDevice(DstDevice);
234     if (!DstDeviceOrErr)
235       FATAL_MESSAGE(DstDevice, "%s",
236                     toString(DstDeviceOrErr.takeError()).c_str());
237     AsyncInfoTy AsyncInfo(*DstDeviceOrErr);
238     Rc = DstDeviceOrErr->submitData(DstAddr, SrcAddr, Length, AsyncInfo);
239   } else if (DstDevice == omp_get_initial_device()) {
240     DP("copy from device to host\n");
241     auto SrcDeviceOrErr = PM->getDevice(SrcDevice);
242     if (!SrcDeviceOrErr)
243       FATAL_MESSAGE(SrcDevice, "%s",
244                     toString(SrcDeviceOrErr.takeError()).c_str());
245     AsyncInfoTy AsyncInfo(*SrcDeviceOrErr);
246     Rc = SrcDeviceOrErr->retrieveData(DstAddr, SrcAddr, Length, AsyncInfo);
247   } else {
248     DP("copy from device to device\n");
249     auto SrcDeviceOrErr = PM->getDevice(SrcDevice);
250     if (!SrcDeviceOrErr)
251       FATAL_MESSAGE(SrcDevice, "%s",
252                     toString(SrcDeviceOrErr.takeError()).c_str());
253     AsyncInfoTy AsyncInfo(*SrcDeviceOrErr);
254     auto DstDeviceOrErr = PM->getDevice(DstDevice);
255     if (!DstDeviceOrErr)
256       FATAL_MESSAGE(DstDevice, "%s",
257                     toString(DstDeviceOrErr.takeError()).c_str());
258     // First try to use D2D memcpy which is more efficient. If fails, fall back
259     // to unefficient way.
260     if (SrcDeviceOrErr->isDataExchangable(*DstDeviceOrErr)) {
261       AsyncInfoTy AsyncInfo(*SrcDeviceOrErr);
262       Rc = SrcDeviceOrErr->dataExchange(SrcAddr, *DstDeviceOrErr, DstAddr,
263                                         Length, AsyncInfo);
264       if (Rc == OFFLOAD_SUCCESS)
265         return OFFLOAD_SUCCESS;
266     }
267 
268     void *Buffer = malloc(Length);
269     {
270       AsyncInfoTy AsyncInfo(*SrcDeviceOrErr);
271       Rc = SrcDeviceOrErr->retrieveData(Buffer, SrcAddr, Length, AsyncInfo);
272     }
273     if (Rc == OFFLOAD_SUCCESS) {
274       AsyncInfoTy AsyncInfo(*DstDeviceOrErr);
275       Rc = DstDeviceOrErr->submitData(DstAddr, Buffer, Length, AsyncInfo);
276     }
277     free(Buffer);
278   }
279 
280   DP("omp_target_memcpy returns %d\n", Rc);
281   return Rc;
282 }
283 
284 // The helper function that calls omp_target_memcpy or omp_target_memcpy_rect
285 static int libomp_target_memcpy_async_task(int32_t Gtid, kmp_task_t *Task) {
286   OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
287   if (Task == nullptr)
288     return OFFLOAD_FAIL;
289 
290   TargetMemcpyArgsTy *Args = (TargetMemcpyArgsTy *)Task->shareds;
291 
292   if (Args == nullptr)
293     return OFFLOAD_FAIL;
294 
295   // Call blocked version
296   int Rc = OFFLOAD_SUCCESS;
297   if (Args->IsRectMemcpy) {
298     Rc = omp_target_memcpy_rect(
299         Args->Dst, Args->Src, Args->ElementSize, Args->NumDims, Args->Volume,
300         Args->DstOffsets, Args->SrcOffsets, Args->DstDimensions,
301         Args->SrcDimensions, Args->DstDevice, Args->SrcDevice);
302 
303     DP("omp_target_memcpy_rect returns %d\n", Rc);
304   } else {
305     Rc = omp_target_memcpy(Args->Dst, Args->Src, Args->Length, Args->DstOffset,
306                            Args->SrcOffset, Args->DstDevice, Args->SrcDevice);
307 
308     DP("omp_target_memcpy returns %d\n", Rc);
309   }
310 
311   // Release the arguments object
312   delete Args;
313 
314   return Rc;
315 }
316 
317 static int libomp_target_memset_async_task(int32_t Gtid, kmp_task_t *Task) {
318   OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
319   if (!Task)
320     return OFFLOAD_FAIL;
321 
322   auto *Args = reinterpret_cast<TargetMemsetArgsTy *>(Task->shareds);
323   if (!Args)
324     return OFFLOAD_FAIL;
325 
326   // call omp_target_memset()
327   omp_target_memset(Args->Ptr, Args->C, Args->N, Args->DeviceNum);
328 
329   delete Args;
330 
331   return OFFLOAD_SUCCESS;
332 }
333 
334 static inline void
335 convertDepObjVector(llvm::SmallVector<kmp_depend_info_t> &Vec, int DepObjCount,
336                     omp_depend_t *DepObjList) {
337   for (int i = 0; i < DepObjCount; ++i) {
338     omp_depend_t DepObj = DepObjList[i];
339     Vec.push_back(*((kmp_depend_info_t *)DepObj));
340   }
341 }
342 
343 template <class T>
344 static inline int
345 libomp_helper_task_creation(T *Args, int (*Fn)(int32_t, kmp_task_t *),
346                             int DepObjCount, omp_depend_t *DepObjList) {
347   OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
348   // Create global thread ID
349   int Gtid = __kmpc_global_thread_num(nullptr);
350 
351   // Setup the hidden helper flags
352   int32_t Flags = 0;
353   kmp_tasking_flags_t *InputFlags = (kmp_tasking_flags_t *)&Flags;
354   InputFlags->hidden_helper = 1;
355 
356   // Alloc the helper task
357   kmp_task_t *Task = __kmpc_omp_target_task_alloc(
358       nullptr, Gtid, Flags, sizeof(kmp_task_t), 0, Fn, -1);
359   if (!Task) {
360     delete Args;
361     return OFFLOAD_FAIL;
362   }
363 
364   // Setup the arguments for the helper task
365   Task->shareds = Args;
366 
367   // Convert types of depend objects
368   llvm::SmallVector<kmp_depend_info_t> DepObjs;
369   convertDepObjVector(DepObjs, DepObjCount, DepObjList);
370 
371   // Launch the helper task
372   int Rc = __kmpc_omp_task_with_deps(nullptr, Gtid, Task, DepObjCount,
373                                      DepObjs.data(), 0, nullptr);
374 
375   return Rc;
376 }
377 
378 EXTERN void *omp_target_memset(void *Ptr, int ByteVal, size_t NumBytes,
379                                int DeviceNum) {
380   TIMESCOPE();
381   OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
382   DP("Call to omp_target_memset, device %d, device pointer %p, size %zu\n",
383      DeviceNum, Ptr, NumBytes);
384 
385   // Behave as a no-op if N==0 or if Ptr is nullptr (as a useful implementation
386   // of unspecified behavior, see OpenMP spec).
387   if (!Ptr || NumBytes == 0) {
388     return Ptr;
389   }
390 
391   if (DeviceNum == omp_get_initial_device()) {
392     DP("filling memory on host via memset");
393     memset(Ptr, ByteVal, NumBytes); // ignore return value, memset() cannot fail
394   } else {
395     // TODO: replace the omp_target_memset() slow path with the fast path.
396     // That will require the ability to execute a kernel from within
397     // libomptarget.so (which we do not have at the moment).
398 
399     // This is a very slow path: create a filled array on the host and upload
400     // it to the GPU device.
401     int InitialDevice = omp_get_initial_device();
402     void *Shadow = omp_target_alloc(NumBytes, InitialDevice);
403     if (Shadow) {
404       (void)memset(Shadow, ByteVal, NumBytes);
405       (void)omp_target_memcpy(Ptr, Shadow, NumBytes, 0, 0, DeviceNum,
406                               InitialDevice);
407       (void)omp_target_free(Shadow, InitialDevice);
408     } else {
409       // If the omp_target_alloc has failed, let's just not do anything.
410       // omp_target_memset does not have any good way to fail, so we
411       // simply avoid a catastrophic failure of the process for now.
412       DP("omp_target_memset failed to fill memory due to error with "
413          "omp_target_alloc");
414     }
415   }
416 
417   DP("omp_target_memset returns %p\n", Ptr);
418   return Ptr;
419 }
420 
421 EXTERN void *omp_target_memset_async(void *Ptr, int ByteVal, size_t NumBytes,
422                                      int DeviceNum, int DepObjCount,
423                                      omp_depend_t *DepObjList) {
424   OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
425   DP("Call to omp_target_memset_async, device %d, device pointer %p, size %zu",
426      DeviceNum, Ptr, NumBytes);
427 
428   // Behave as a no-op if N==0 or if Ptr is nullptr (as a useful implementation
429   // of unspecified behavior, see OpenMP spec).
430   if (!Ptr || NumBytes == 0)
431     return Ptr;
432 
433   // Create the task object to deal with the async invocation
434   auto *Args = new TargetMemsetArgsTy{Ptr, ByteVal, NumBytes, DeviceNum};
435 
436   // omp_target_memset_async() cannot fail via a return code, so ignore the
437   // return code of the helper function
438   (void)libomp_helper_task_creation(Args, &libomp_target_memset_async_task,
439                                     DepObjCount, DepObjList);
440 
441   return Ptr;
442 }
443 
444 EXTERN int omp_target_memcpy_async(void *Dst, const void *Src, size_t Length,
445                                    size_t DstOffset, size_t SrcOffset,
446                                    int DstDevice, int SrcDevice,
447                                    int DepObjCount, omp_depend_t *DepObjList) {
448   TIMESCOPE_WITH_DETAILS("dst_dev=" + std::to_string(DstDevice) +
449                          ";src_dev=" + std::to_string(SrcDevice) +
450                          ";size=" + std::to_string(Length));
451   OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
452   DP("Call to omp_target_memcpy_async, dst device %d, src device %d, "
453      "dst addr " DPxMOD ", src addr " DPxMOD ", dst offset %zu, "
454      "src offset %zu, length %zu\n",
455      DstDevice, SrcDevice, DPxPTR(Dst), DPxPTR(Src), DstOffset, SrcOffset,
456      Length);
457 
458   // Check the source and dest address
459   if (Dst == nullptr || Src == nullptr)
460     return OFFLOAD_FAIL;
461 
462   // Create task object
463   TargetMemcpyArgsTy *Args = new TargetMemcpyArgsTy(
464       Dst, Src, Length, DstOffset, SrcOffset, DstDevice, SrcDevice);
465 
466   // Create and launch helper task
467   int Rc = libomp_helper_task_creation(Args, &libomp_target_memcpy_async_task,
468                                        DepObjCount, DepObjList);
469 
470   DP("omp_target_memcpy_async returns %d\n", Rc);
471   return Rc;
472 }
473 
474 EXTERN int
475 omp_target_memcpy_rect(void *Dst, const void *Src, size_t ElementSize,
476                        int NumDims, const size_t *Volume,
477                        const size_t *DstOffsets, const size_t *SrcOffsets,
478                        const size_t *DstDimensions, const size_t *SrcDimensions,
479                        int DstDevice, int SrcDevice) {
480   OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
481   DP("Call to omp_target_memcpy_rect, dst device %d, src device %d, "
482      "dst addr " DPxMOD ", src addr " DPxMOD ", dst offsets " DPxMOD ", "
483      "src offsets " DPxMOD ", dst dims " DPxMOD ", src dims " DPxMOD ", "
484      "volume " DPxMOD ", element size %zu, num_dims %d\n",
485      DstDevice, SrcDevice, DPxPTR(Dst), DPxPTR(Src), DPxPTR(DstOffsets),
486      DPxPTR(SrcOffsets), DPxPTR(DstDimensions), DPxPTR(SrcDimensions),
487      DPxPTR(Volume), ElementSize, NumDims);
488 
489   if (!(Dst || Src)) {
490     DP("Call to omp_target_memcpy_rect returns max supported dimensions %d\n",
491        INT_MAX);
492     return INT_MAX;
493   }
494 
495   if (!Dst || !Src || ElementSize < 1 || NumDims < 1 || !Volume ||
496       !DstOffsets || !SrcOffsets || !DstDimensions || !SrcDimensions) {
497     REPORT("Call to omp_target_memcpy_rect with invalid arguments\n");
498     return OFFLOAD_FAIL;
499   }
500 
501   int Rc;
502   if (NumDims == 1) {
503     Rc = omp_target_memcpy(Dst, Src, ElementSize * Volume[0],
504                            ElementSize * DstOffsets[0],
505                            ElementSize * SrcOffsets[0], DstDevice, SrcDevice);
506   } else {
507     size_t DstSliceSize = ElementSize;
508     size_t SrcSliceSize = ElementSize;
509     for (int I = 1; I < NumDims; ++I) {
510       DstSliceSize *= DstDimensions[I];
511       SrcSliceSize *= SrcDimensions[I];
512     }
513 
514     size_t DstOff = DstOffsets[0] * DstSliceSize;
515     size_t SrcOff = SrcOffsets[0] * SrcSliceSize;
516     for (size_t I = 0; I < Volume[0]; ++I) {
517       Rc = omp_target_memcpy_rect(
518           (char *)Dst + DstOff + DstSliceSize * I,
519           (char *)const_cast<void *>(Src) + SrcOff + SrcSliceSize * I,
520           ElementSize, NumDims - 1, Volume + 1, DstOffsets + 1, SrcOffsets + 1,
521           DstDimensions + 1, SrcDimensions + 1, DstDevice, SrcDevice);
522 
523       if (Rc) {
524         DP("Recursive call to omp_target_memcpy_rect returns unsuccessfully\n");
525         return Rc;
526       }
527     }
528   }
529 
530   DP("omp_target_memcpy_rect returns %d\n", Rc);
531   return Rc;
532 }
533 
534 EXTERN int omp_target_memcpy_rect_async(
535     void *Dst, const void *Src, size_t ElementSize, int NumDims,
536     const size_t *Volume, const size_t *DstOffsets, const size_t *SrcOffsets,
537     const size_t *DstDimensions, const size_t *SrcDimensions, int DstDevice,
538     int SrcDevice, int DepObjCount, omp_depend_t *DepObjList) {
539   TIMESCOPE_WITH_DETAILS("dst_dev=" + std::to_string(DstDevice) +
540                          ";src_dev=" + std::to_string(SrcDevice) +
541                          ";size=" + std::to_string(ElementSize) +
542                          ";num_dims=" + std::to_string(NumDims));
543   OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
544   DP("Call to omp_target_memcpy_rect_async, dst device %d, src device %d, "
545      "dst addr " DPxMOD ", src addr " DPxMOD ", dst offsets " DPxMOD ", "
546      "src offsets " DPxMOD ", dst dims " DPxMOD ", src dims " DPxMOD ", "
547      "volume " DPxMOD ", element size %zu, num_dims %d\n",
548      DstDevice, SrcDevice, DPxPTR(Dst), DPxPTR(Src), DPxPTR(DstOffsets),
549      DPxPTR(SrcOffsets), DPxPTR(DstDimensions), DPxPTR(SrcDimensions),
550      DPxPTR(Volume), ElementSize, NumDims);
551 
552   // Need to check this first to not return OFFLOAD_FAIL instead
553   if (!Dst && !Src) {
554     DP("Call to omp_target_memcpy_rect returns max supported dimensions %d\n",
555        INT_MAX);
556     return INT_MAX;
557   }
558 
559   // Check the source and dest address
560   if (Dst == nullptr || Src == nullptr)
561     return OFFLOAD_FAIL;
562 
563   // Create task object
564   TargetMemcpyArgsTy *Args = new TargetMemcpyArgsTy(
565       Dst, Src, ElementSize, NumDims, Volume, DstOffsets, SrcOffsets,
566       DstDimensions, SrcDimensions, DstDevice, SrcDevice);
567 
568   // Create and launch helper task
569   int Rc = libomp_helper_task_creation(Args, &libomp_target_memcpy_async_task,
570                                        DepObjCount, DepObjList);
571 
572   DP("omp_target_memcpy_rect_async returns %d\n", Rc);
573   return Rc;
574 }
575 
576 EXTERN int omp_target_associate_ptr(const void *HostPtr, const void *DevicePtr,
577                                     size_t Size, size_t DeviceOffset,
578                                     int DeviceNum) {
579   TIMESCOPE();
580   OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
581   DP("Call to omp_target_associate_ptr with host_ptr " DPxMOD ", "
582      "device_ptr " DPxMOD ", size %zu, device_offset %zu, device_num %d\n",
583      DPxPTR(HostPtr), DPxPTR(DevicePtr), Size, DeviceOffset, DeviceNum);
584 
585   if (!HostPtr || !DevicePtr || Size <= 0) {
586     REPORT("Call to omp_target_associate_ptr with invalid arguments\n");
587     return OFFLOAD_FAIL;
588   }
589 
590   if (DeviceNum == omp_get_initial_device()) {
591     REPORT("omp_target_associate_ptr: no association possible on the host\n");
592     return OFFLOAD_FAIL;
593   }
594 
595   auto DeviceOrErr = PM->getDevice(DeviceNum);
596   if (!DeviceOrErr)
597     FATAL_MESSAGE(DeviceNum, "%s", toString(DeviceOrErr.takeError()).c_str());
598 
599   void *DeviceAddr = (void *)((uint64_t)DevicePtr + (uint64_t)DeviceOffset);
600 
601   OMPT_IF_BUILT(InterfaceRAII(
602       RegionInterface.getCallbacks<ompt_target_data_associate>(), DeviceNum,
603       const_cast<void *>(HostPtr), const_cast<void *>(DevicePtr), Size,
604       __builtin_return_address(0)));
605 
606   int Rc = DeviceOrErr->getMappingInfo().associatePtr(
607       const_cast<void *>(HostPtr), const_cast<void *>(DeviceAddr), Size);
608   DP("omp_target_associate_ptr returns %d\n", Rc);
609   return Rc;
610 }
611 
612 EXTERN int omp_target_disassociate_ptr(const void *HostPtr, int DeviceNum) {
613   TIMESCOPE();
614   OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
615   DP("Call to omp_target_disassociate_ptr with host_ptr " DPxMOD ", "
616      "device_num %d\n",
617      DPxPTR(HostPtr), DeviceNum);
618 
619   if (!HostPtr) {
620     REPORT("Call to omp_target_associate_ptr with invalid host_ptr\n");
621     return OFFLOAD_FAIL;
622   }
623 
624   if (DeviceNum == omp_get_initial_device()) {
625     REPORT(
626         "omp_target_disassociate_ptr: no association possible on the host\n");
627     return OFFLOAD_FAIL;
628   }
629 
630   auto DeviceOrErr = PM->getDevice(DeviceNum);
631   if (!DeviceOrErr)
632     FATAL_MESSAGE(DeviceNum, "%s", toString(DeviceOrErr.takeError()).c_str());
633 
634   OMPT_IF_BUILT(InterfaceRAII(
635       RegionInterface.getCallbacks<ompt_target_data_disassociate>(), DeviceNum,
636       const_cast<void *>(HostPtr),
637       /*DevicePtr=*/nullptr, /*Size=*/0, __builtin_return_address(0)));
638 
639   int Rc = DeviceOrErr->getMappingInfo().disassociatePtr(
640       const_cast<void *>(HostPtr));
641   DP("omp_target_disassociate_ptr returns %d\n", Rc);
642   return Rc;
643 }
644 
645 EXTERN void *omp_get_mapped_ptr(const void *Ptr, int DeviceNum) {
646   TIMESCOPE();
647   OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
648   DP("Call to omp_get_mapped_ptr with ptr " DPxMOD ", device_num %d.\n",
649      DPxPTR(Ptr), DeviceNum);
650 
651   if (!Ptr) {
652     REPORT("Call to omp_get_mapped_ptr with nullptr.\n");
653     return nullptr;
654   }
655 
656   int NumDevices = omp_get_initial_device();
657   if (DeviceNum == NumDevices) {
658     DP("Device %d is initial device, returning Ptr " DPxMOD ".\n",
659            DeviceNum, DPxPTR(Ptr));
660     return const_cast<void *>(Ptr);
661   }
662 
663   if (NumDevices <= DeviceNum) {
664     DP("DeviceNum %d is invalid, returning nullptr.\n", DeviceNum);
665     return nullptr;
666   }
667 
668   auto DeviceOrErr = PM->getDevice(DeviceNum);
669   if (!DeviceOrErr)
670     FATAL_MESSAGE(DeviceNum, "%s", toString(DeviceOrErr.takeError()).c_str());
671 
672   TargetPointerResultTy TPR =
673       DeviceOrErr->getMappingInfo().getTgtPtrBegin(const_cast<void *>(Ptr), 1,
674                                                    /*UpdateRefCount=*/false,
675                                                    /*UseHoldRefCount=*/false);
676   if (!TPR.isPresent()) {
677     DP("Ptr " DPxMOD "is not present on device %d, returning nullptr.\n",
678        DPxPTR(Ptr), DeviceNum);
679     return nullptr;
680   }
681 
682   DP("omp_get_mapped_ptr returns " DPxMOD ".\n", DPxPTR(TPR.TargetPointer));
683 
684   return TPR.TargetPointer;
685 }
686