xref: /llvm-project/offload/include/omptarget.h (revision 80525dfcde5bf8aae6ab6b0810124ba502de6096)
1 //===-------- omptarget.h - Target independent OpenMP target RTL -- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // Interface to be used by Clang during the codegen of a
10 // target region.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #ifndef _OMPTARGET_H_
15 #define _OMPTARGET_H_
16 
17 #include "Shared/APITypes.h"
18 #include "Shared/Environment.h"
19 #include "Shared/SourceInfo.h"
20 
21 #include "OpenMP/InternalTypes.h"
22 
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <deque>
#include <functional>
#include <type_traits>
#include <utility>
28 
29 #include "llvm/ADT/SmallVector.h"
30 
/// Generic success / failure status values (for example, the values that
/// AsyncInfoTy::synchronize() is documented to return).
#define OFFLOAD_SUCCESS (0)
#define OFFLOAD_FAIL (~0)

/// Sentinel device number. Parenthesized, like the macros above, so that the
/// expansion stays a single operand in any surrounding expression.
/// NOTE(review): presumably selects the default device - confirm at call sites.
#define OFFLOAD_DEVICE_DEFAULT (-1)
35 
36 // Don't format out enums and structs.
37 // clang-format off
38 
/// Return flags of the __tgt_target_XXX public APIs.
enum __tgt_target_return_t : int {
  /// Successful offload executed on a target device.
  OMP_TGT_SUCCESS = 0,
  /// Offload may not execute on the requested target device.
  /// This scenario can be caused by the device not being available or being
  /// unsupported, as described in the Execution Model in the specification.
  /// This status may not be used for target device execution failure, which
  /// should be handled internally in libomptarget.
  OMP_TGT_FAIL = ~0
};
50 
/// Data attributes for each data reference used in an OpenMP target region.
/// The enumerators are bit flags; the two largest values do not fit in 32
/// bits, so the enumeration needs a 64-bit representation.
enum tgt_map_type {
  /// No flags.
  OMP_TGT_MAPTYPE_NONE            = 0x000,
  /// Copy data from host to device.
  OMP_TGT_MAPTYPE_TO              = 0x001,
  /// Copy data from device to host.
  OMP_TGT_MAPTYPE_FROM            = 0x002,
  /// Copy regardless of the reference count.
  OMP_TGT_MAPTYPE_ALWAYS          = 0x004,
  /// Force unmapping of data.
  OMP_TGT_MAPTYPE_DELETE          = 0x008,
  /// Map the pointer as well as the pointee.
  OMP_TGT_MAPTYPE_PTR_AND_OBJ     = 0x010,
  /// Pass device base address to kernel.
  OMP_TGT_MAPTYPE_TARGET_PARAM    = 0x020,
  /// Return base device address of mapped data.
  OMP_TGT_MAPTYPE_RETURN_PARAM    = 0x040,
  /// Private variable - not mapped.
  OMP_TGT_MAPTYPE_PRIVATE         = 0x080,
  /// Copy by value - not mapped.
  OMP_TGT_MAPTYPE_LITERAL         = 0x100,
  /// Mapping is implicit.
  OMP_TGT_MAPTYPE_IMPLICIT        = 0x200,
  /// Hint to allocate memory close to the target device (the OpenMP 'close'
  /// map-type modifier).
  OMP_TGT_MAPTYPE_CLOSE           = 0x400,
  /// Runtime error if not already allocated.
  OMP_TGT_MAPTYPE_PRESENT         = 0x1000,
  /// Use a separate reference counter so that the data cannot be unmapped
  /// within the structured region.
  /// This is an OpenMP extension for the sake of OpenACC support.
  OMP_TGT_MAPTYPE_OMPX_HOLD       = 0x2000,
  /// Descriptor for non-contiguous target-update.
  OMP_TGT_MAPTYPE_NON_CONTIG      = 0x100000000000,
  /// Member of struct, member given by [16 MSBs] - 1.
  OMP_TGT_MAPTYPE_MEMBER_OF       = 0xffff000000000000
};
88 
/// Flags for offload entries.
enum OpenMPOffloadingDeclareTargetFlags {
  /// Mark the entry global as having a 'link' attribute.
  OMP_DECLARE_TARGET_LINK = 0x01,
  /// Mark the entry global as being an indirectly callable function.
  OMP_DECLARE_TARGET_INDIRECT = 0x08,
  /// This is an entry corresponding to a requirement to be registered.
  OMP_REGISTER_REQUIRES = 0x10,
};
98 
/// Kinds of target memory allocation. Enumerators are numbered consecutively
/// starting at zero; the underlying type is fixed to int32_t.
enum TargetAllocTy : int32_t {
  TARGET_ALLOC_DEVICE = 0,
  TARGET_ALLOC_HOST,    // == 1
  TARGET_ALLOC_SHARED,  // == 2
  TARGET_ALLOC_DEFAULT, // == 3
  /// The allocation will not block on other streams.
  TARGET_ALLOC_DEVICE_NON_BLOCKING, // == 4
};
107 
108 inline KernelArgsTy CTorDTorKernelArgs = {1,       0,       nullptr,   nullptr,
109 	     nullptr, nullptr, nullptr,   nullptr,
110 	     0,      {0,0,0},       {1, 0, 0}, {1, 0, 0}, 0};
111 
112 struct DeviceTy;
113 
114 /// The libomptarget wrapper around a __tgt_async_info object directly
115 /// associated with a libomptarget layer device. RAII semantics to avoid
116 /// mistakes.
117 class AsyncInfoTy {
118 public:
119   enum class SyncTy { BLOCKING, NON_BLOCKING };
120 
121 private:
122   /// Locations we used in (potentially) asynchronous calls which should live
123   /// as long as this AsyncInfoTy object.
124   std::deque<void *> BufferLocations;
125 
126   /// Post-processing operations executed after a successful synchronization.
127   /// \note the post-processing function should return OFFLOAD_SUCCESS or
128   /// OFFLOAD_FAIL appropriately.
129   using PostProcFuncTy = std::function<int()>;
130   llvm::SmallVector<PostProcFuncTy> PostProcessingFunctions;
131 
132   __tgt_async_info AsyncInfo;
133   DeviceTy &Device;
134 
135 public:
136   /// Synchronization method to be used.
137   SyncTy SyncType;
138 
139   AsyncInfoTy(DeviceTy &Device, SyncTy SyncType = SyncTy::BLOCKING)
140       : Device(Device), SyncType(SyncType) {}
141   ~AsyncInfoTy() { synchronize(); }
142 
143   /// Implicit conversion to the __tgt_async_info which is used in the
144   /// plugin interface.
145   operator __tgt_async_info *() { return &AsyncInfo; }
146 
147   /// Synchronize all pending actions.
148   ///
149   /// \note synchronization will be performance in a blocking or non-blocking
150   /// manner, depending on the SyncType.
151   ///
152   /// \note if the operations are completed, the registered post-processing
153   /// functions will be executed once and unregistered afterwards.
154   ///
155   /// \returns OFFLOAD_FAIL or OFFLOAD_SUCCESS appropriately.
156   int synchronize();
157 
158   /// Return a void* reference with a lifetime that is at least as long as this
159   /// AsyncInfoTy object. The location can be used as intermediate buffer.
160   void *&getVoidPtrLocation();
161 
162   /// Check if all asynchronous operations are completed.
163   ///
164   /// \note only a lightweight check. If needed, use synchronize() to query the
165   /// status of AsyncInfo before checking.
166   ///
167   /// \returns true if there is no pending asynchronous operations, false
168   /// otherwise.
169   bool isDone() const;
170 
171   /// Add a new post-processing function to be executed after synchronization.
172   ///
173   /// \param[in] Function is a templated function (e.g., function pointers,
174   /// lambdas, std::function) that can be convertible to a PostProcFuncTy (i.e.,
175   /// it must have int() as its function signature).
176   template <typename FuncTy> void addPostProcessingFunction(FuncTy &&Function) {
177     static_assert(std::is_convertible_v<FuncTy, PostProcFuncTy>,
178                   "Invalid post-processing function type. Please check "
179                   "function signature!");
180     PostProcessingFunctions.emplace_back(Function);
181   }
182 
183 private:
184   /// Run all the post-processing functions sequentially.
185   ///
186   /// \note after a successful execution, all previously registered functions
187   /// are unregistered.
188   ///
189   /// \returns OFFLOAD_FAIL if any post-processing function failed,
190   /// OFFLOAD_SUCCESS otherwise.
191   int32_t runPostProcessing();
192 
193   /// Check if the internal asynchronous info queue is empty or not.
194   ///
195   /// \returns true if empty, false otherwise.
196   bool isQueueEmpty() const;
197 };
198 
199 // Wrapper for task stored async info objects.
200 class TaskAsyncInfoWrapperTy {
201   // Invalid GTID as defined by libomp; keep in sync
202   static constexpr int KMP_GTID_DNE = -2;
203 
204   const int ExecThreadID = KMP_GTID_DNE;
205   AsyncInfoTy LocalAsyncInfo;
206   AsyncInfoTy *AsyncInfo = &LocalAsyncInfo;
207   void **TaskAsyncInfoPtr = nullptr;
208 
209 public:
210   TaskAsyncInfoWrapperTy(DeviceTy &Device)
211       : ExecThreadID(__kmpc_global_thread_num(NULL)), LocalAsyncInfo(Device) {
212     // If we failed to acquired the current global thread id, we cannot
213     // re-enqueue the current task. Thus we should use the local blocking async
214     // info.
215     if (ExecThreadID == KMP_GTID_DNE)
216       return;
217 
218     // Only tasks with an assigned task team can be re-enqueue and thus can
219     // use the non-blocking synchronization scheme. Thus we should use the local
220     // blocking async info, if we don´t have one.
221     if (!__kmpc_omp_has_task_team(ExecThreadID))
222       return;
223 
224     // Acquire a pointer to the AsyncInfo stored inside the current task being
225     // executed.
226     TaskAsyncInfoPtr = __kmpc_omp_get_target_async_handle_ptr(ExecThreadID);
227 
228     // If we cannot acquire such pointer, fallback to using the local blocking
229     // async info.
230     if (!TaskAsyncInfoPtr)
231       return;
232 
233     // When creating a new task async info, the task handle must always be
234     // invalid. We must never overwrite any task async handle and there should
235     // never be any valid handle store inside the task at this point.
236     assert((*TaskAsyncInfoPtr) == nullptr &&
237            "Task async handle is not empty when dispatching new device "
238            "operations. The handle was not cleared properly or "
239            "__tgt_target_nowait_query should have been called!");
240 
241     // If no valid async handle is present, a new AsyncInfo will be allocated
242     // and stored in the current task.
243     AsyncInfo = new AsyncInfoTy(Device, AsyncInfoTy::SyncTy::NON_BLOCKING);
244     *TaskAsyncInfoPtr = (void *)AsyncInfo;
245   }
246 
247   ~TaskAsyncInfoWrapperTy() {
248     // Local async info destruction is automatically handled by ~AsyncInfoTy.
249     if (AsyncInfo == &LocalAsyncInfo)
250       return;
251 
252     // If the are device operations still pending, return immediately without
253     // deallocating the handle.
254     if (!AsyncInfo->isDone())
255       return;
256 
257     // Delete the handle and unset it from the OpenMP task data.
258     delete AsyncInfo;
259     *TaskAsyncInfoPtr = nullptr;
260   }
261 
262   operator AsyncInfoTy &() { return *AsyncInfo; }
263 };
264 
/// This struct is a record of non-contiguous information: three 64-bit
/// unsigned fields, in the order Offset, Count, Stride.
/// NOTE(review): units (bytes vs. elements) are not visible here - confirm
/// against the code that fills these records.
struct __tgt_target_non_contig {
  uint64_t Offset;
  uint64_t Count;
  uint64_t Stride;
};
271 
272 #ifdef __cplusplus
273 extern "C" {
274 #endif
275 
/// Extension entry point (ompx_ prefix); by its name it dumps the mapping
/// tables.
void ompx_dump_mapping_tables(void);

/// Device query routines.
int omp_get_num_devices(void);
int omp_get_device_num(void);
int omp_get_initial_device(void);

/// Device memory routines. See the OpenMP API specification for the exact
/// contract of each routine.
void *omp_target_alloc(size_t Size, int DeviceNum);
void omp_target_free(void *DevicePtr, int DeviceNum);
int omp_target_is_present(const void *Ptr, int DeviceNum);
int omp_target_memcpy(void *Dst, const void *Src, size_t Length,
                      size_t DstOffset, size_t SrcOffset, int DstDevice,
                      int SrcDevice);
int omp_target_memcpy_rect(void *Dst, const void *Src, size_t ElementSize,
                           int NumDims, const size_t *Volume,
                           const size_t *DstOffsets, const size_t *SrcOffsets,
                           const size_t *DstDimensions,
                           const size_t *SrcDimensions, int DstDevice,
                           int SrcDevice);
void *omp_target_memset(void *Ptr, int C, size_t N, int DeviceNum);
int omp_target_associate_ptr(const void *HostPtr, const void *DevicePtr,
                             size_t Size, size_t DeviceOffset, int DeviceNum);
int omp_target_disassociate_ptr(const void *HostPtr, int DeviceNum);
296 
/// Explicit target memory allocators.
/// Using the llvm_ prefix until they become part of the OpenMP standard.
void *llvm_omp_target_alloc_device(size_t Size, int DeviceNum);
void *llvm_omp_target_alloc_host(size_t Size, int DeviceNum);
void *llvm_omp_target_alloc_shared(size_t Size, int DeviceNum);

/// Explicit target memory deallocators.
/// Using the llvm_ prefix until they become part of the OpenMP standard.
void llvm_omp_target_free_device(void *DevicePtr, int DeviceNum);
void llvm_omp_target_free_host(void *DevicePtr, int DeviceNum);
void llvm_omp_target_free_shared(void *DevicePtr, int DeviceNum);

/// Dummy target so we have a symbol for generating host fallback.
void *llvm_omp_target_dynamic_shared_alloc();
311 
312 /// add the clauses of the requires directives in a given file
313 void __tgt_register_requires(int64_t Flags);
314 
315 /// Initializes the runtime library.
316 void __tgt_rtl_init();
317 
318 /// Deinitializes the runtime library.
319 void __tgt_rtl_deinit();
320 
321 /// adds a target shared library to the target execution image
322 void __tgt_register_lib(__tgt_bin_desc *Desc);
323 
324 /// Initialize all RTLs at once
325 void __tgt_init_all_rtls();
326 
327 /// removes a target shared library from the target execution image
328 void __tgt_unregister_lib(__tgt_bin_desc *Desc);
329 
330 // creates the host to target data mapping, stores it in the
331 // libomptarget.so internal structure (an entry in a stack of data maps) and
332 // passes the data to the device;
333 void __tgt_target_data_begin(int64_t DeviceId, int32_t ArgNum, void **ArgsBase,
334                              void **Args, int64_t *ArgSizes, int64_t *ArgTypes);
335 void __tgt_target_data_begin_nowait(int64_t DeviceId, int32_t ArgNum,
336                                     void **ArgsBase, void **Args,
337                                     int64_t *ArgSizes, int64_t *ArgTypes,
338                                     int32_t DepNum, void *DepList,
339                                     int32_t NoAliasDepNum,
340                                     void *NoAliasDepList);
341 void __tgt_target_data_begin_mapper(ident_t *Loc, int64_t DeviceId,
342                                     int32_t ArgNum, void **ArgsBase,
343                                     void **Args, int64_t *ArgSizes,
344                                     int64_t *ArgTypes, map_var_info_t *ArgNames,
345                                     void **ArgMappers);
346 void __tgt_target_data_begin_nowait_mapper(
347     ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase,
348     void **Args, int64_t *ArgSizes, int64_t *ArgTypes, map_var_info_t *ArgNames,
349     void **ArgMappers, int32_t DepNum, void *DepList, int32_t NoAliasDepNum,
350     void *NoAliasDepList);
351 
352 // passes data from the target, release target memory and destroys the
353 // host-target mapping (top entry from the stack of data maps) created by
354 // the last __tgt_target_data_begin
355 void __tgt_target_data_end(int64_t DeviceId, int32_t ArgNum, void **ArgsBase,
356                            void **Args, int64_t *ArgSizes, int64_t *ArgTypes);
357 void __tgt_target_data_end_nowait(int64_t DeviceId, int32_t ArgNum,
358                                   void **ArgsBase, void **Args,
359                                   int64_t *ArgSizes, int64_t *ArgTypes,
360                                   int32_t DepNum, void *DepList,
361                                   int32_t NoAliasDepNum, void *NoAliasDepList);
362 void __tgt_target_data_end_mapper(ident_t *Loc, int64_t DeviceId,
363                                   int32_t ArgNum, void **ArgsBase, void **Args,
364                                   int64_t *ArgSizes, int64_t *ArgTypes,
365                                   map_var_info_t *ArgNames, void **ArgMappers);
366 void __tgt_target_data_end_nowait_mapper(
367     ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase,
368     void **Args, int64_t *ArgSizes, int64_t *ArgTypes, map_var_info_t *ArgNames,
369     void **ArgMappers, int32_t depNum, void *depList, int32_t NoAliasDepNum,
370     void *NoAliasDepList);
371 
372 /// passes data to/from the target
373 void __tgt_target_data_update(int64_t DeviceId, int32_t ArgNum, void **ArgsBase,
374                               void **Args, int64_t *ArgSizes,
375                               int64_t *ArgTypes);
376 void __tgt_target_data_update_nowait(int64_t DeviceId, int32_t ArgNum,
377                                      void **ArgsBase, void **Args,
378                                      int64_t *ArgSizes, int64_t *ArgTypes,
379                                      int32_t DepNum, void *DepList,
380                                      int32_t NoAliasDepNum,
381                                      void *NoAliasDepList);
382 void __tgt_target_data_update_mapper(ident_t *Loc, int64_t DeviceId,
383                                      int32_t ArgNum, void **ArgsBase,
384                                      void **Args, int64_t *ArgSizes,
385                                      int64_t *ArgTypes,
386                                      map_var_info_t *ArgNames,
387                                      void **ArgMappers);
388 void __tgt_target_data_update_nowait_mapper(
389     ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase,
390     void **Args, int64_t *ArgSizes, int64_t *ArgTypes, map_var_info_t *ArgNames,
391     void **ArgMappers, int32_t DepNum, void *DepList, int32_t NoAliasDepNum,
392     void *NoAliasDepList);
393 
394 // Performs the same actions as data_begin in case ArgNum is non-zero
395 // and initiates run of offloaded region on target platform; if ArgNum
396 // is non-zero after the region execution is done it also performs the
397 // same action as data_end above. The following types are used; this
398 // function returns 0 if it was able to transfer the execution to a
399 // target and an int different from zero otherwise.
400 int __tgt_target_kernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
401                         int32_t ThreadLimit, void *HostPtr, KernelArgsTy *Args);
402 
403 // Non-blocking synchronization for target nowait regions. This function
404 // acquires the asynchronous context from task data of the current task being
405 // executed and tries to query for the completion of its operations. If the
406 // operations are still pending, the function returns immediately. If the
407 // operations are completed, all the post-processing procedures stored in the
408 // asynchronous context are executed and the context is removed from the task
409 // data.
410 void __tgt_target_nowait_query(void **AsyncHandle);
411 
412 /// Executes a target kernel by replaying recorded kernel arguments and
413 /// device memory.
414 int __tgt_target_kernel_replay(ident_t *Loc, int64_t DeviceId, void *HostPtr,
415                                void *DeviceMemory, int64_t DeviceMemorySize,
416                                void **TgtArgs, ptrdiff_t *TgtOffsets,
417                                int32_t NumArgs, int32_t NumTeams,
418                                int32_t ThreadLimit, uint64_t LoopTripCount);
419 
420 void __tgt_set_info_flag(uint32_t);
421 
422 int __tgt_print_device_info(int64_t DeviceId);
423 
424 int __tgt_activate_record_replay(int64_t DeviceId, uint64_t MemorySize,
425                                  void *VAddr, bool IsRecord, bool SaveOutput,
426                                  uint64_t &ReqPtrArgOffset);
427 
428 #ifdef __cplusplus
429 }
430 #endif
431 
/// Linkage specifier for declarations: `extern "C"` when compiled as C++,
/// plain `extern` otherwise.
#if defined(__cplusplus)
#define EXTERN extern "C"
#else
#define EXTERN extern
#endif
437 
438 #endif // _OMPTARGET_H_
439