//===-------- omptarget.h - Target independent OpenMP target RTL -- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Interface to be used by Clang during the codegen of a
// target region.
//
//===----------------------------------------------------------------------===//

#ifndef _OMPTARGET_H_
#define _OMPTARGET_H_

#include "Shared/APITypes.h"
#include "Shared/Environment.h"
#include "Shared/SourceInfo.h"

#include "OpenMP/InternalTypes.h"

#include <cstddef>
#include <cstdint>
#include <deque>
#include <functional>
#include <type_traits>

#include "llvm/ADT/SmallVector.h"

#define OFFLOAD_SUCCESS (0)
#define OFFLOAD_FAIL (~0)

#define OFFLOAD_DEVICE_DEFAULT -1

// Don't format out enums and structs.
// clang-format off

/// Return flags of the __tgt_target_XXX public APIs.
enum __tgt_target_return_t : int {
  /// Successful offload executed on a target device.
  OMP_TGT_SUCCESS = 0,
  /// Offload may not execute on the requested target device.
  /// This scenario can be caused by the device not being available or not
  /// supported, as described in the Execution Model of the specification.
  /// This status must not be used for a target device execution failure,
  /// which should be handled internally in libomptarget.
  OMP_TGT_FAIL = ~0
};

/// Data attributes for each data reference used in an OpenMP target region.
enum tgt_map_type {
  // No flags.
  OMP_TGT_MAPTYPE_NONE = 0x000,
  // Copy data from host to device.
  OMP_TGT_MAPTYPE_TO = 0x001,
  // Copy data from device to host.
  OMP_TGT_MAPTYPE_FROM = 0x002,
  // Copy regardless of the reference count.
  OMP_TGT_MAPTYPE_ALWAYS = 0x004,
  // Force unmapping of data.
  OMP_TGT_MAPTYPE_DELETE = 0x008,
  // Map the pointer as well as the pointee.
  OMP_TGT_MAPTYPE_PTR_AND_OBJ = 0x010,
  // Pass device base address to the kernel.
  OMP_TGT_MAPTYPE_TARGET_PARAM = 0x020,
  // Return base device address of mapped data.
  OMP_TGT_MAPTYPE_RETURN_PARAM = 0x040,
  // Private variable - not mapped.
  OMP_TGT_MAPTYPE_PRIVATE = 0x080,
  // Copy by value - not mapped.
  OMP_TGT_MAPTYPE_LITERAL = 0x100,
  // Mapping is implicit.
  OMP_TGT_MAPTYPE_IMPLICIT = 0x200,
  // 'close' map-type modifier: hint to allocate memory close to the target
  // device.
  OMP_TGT_MAPTYPE_CLOSE = 0x400,
  // Runtime error if not already allocated.
  OMP_TGT_MAPTYPE_PRESENT = 0x1000,
  // Use a separate reference counter so that the data cannot be unmapped
  // within the structured region.
  // This is an OpenMP extension for the sake of OpenACC support.
  OMP_TGT_MAPTYPE_OMPX_HOLD = 0x2000,
  // Descriptor for non-contiguous target-update.
  OMP_TGT_MAPTYPE_NON_CONTIG = 0x100000000000,
  // Member of struct, member given by [16 MSBs] - 1.
  OMP_TGT_MAPTYPE_MEMBER_OF = 0xffff000000000000
};
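
// Illustrative sketch (not part of the interface): the map-type bits above are
// OR-ed together per argument by the compiler. For example, a variable mapped
// with an explicit 'map(tofrom: X)' and passed as a kernel argument would
// typically carry something like:
//
//   int64_t MapType = OMP_TGT_MAPTYPE_TO | OMP_TGT_MAPTYPE_FROM |
//                     OMP_TGT_MAPTYPE_TARGET_PARAM; // 0x23
//
// with OMP_TGT_MAPTYPE_IMPLICIT added when the map is compiler-generated. The
// exact combination emitted depends on the construct and the compiler version.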

/// Flags for offload entries.
enum OpenMPOffloadingDeclareTargetFlags {
  /// Mark the entry global as having a 'link' attribute.
  OMP_DECLARE_TARGET_LINK = 0x01,
  /// Mark the entry global as being an indirectly callable function.
  OMP_DECLARE_TARGET_INDIRECT = 0x08,
  /// This is an entry corresponding to a requirement to be registered.
  OMP_REGISTER_REQUIRES = 0x10,
};

enum TargetAllocTy : int32_t {
  TARGET_ALLOC_DEVICE = 0,
  TARGET_ALLOC_HOST,
  TARGET_ALLOC_SHARED,
  TARGET_ALLOC_DEFAULT,
  /// The allocation will not block on other streams.
  TARGET_ALLOC_DEVICE_NON_BLOCKING,
};
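
// Illustrative sketch (not part of the interface): each TargetAllocTy kind is
// what the explicit allocation entry points declared further below request
// from the device plugin. For instance, a pinned host staging buffer could be
// obtained and released as:
//
//   void *Pinned = llvm_omp_target_alloc_host(Bytes, DeviceNum); // TARGET_ALLOC_HOST
//   // ... use Pinned as a staging area for transfers ...
//   llvm_omp_target_free_host(Pinned, DeviceNum);
//
// while plain omp_target_alloc() corresponds to TARGET_ALLOC_DEFAULT.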

inline KernelArgsTy CTorDTorKernelArgs = {1,       0,       nullptr, nullptr,
                                          nullptr, nullptr, nullptr, nullptr,
                                          0, {0, 0, 0}, {1, 0, 0}, {1, 0, 0}, 0};

struct DeviceTy;

/// The libomptarget wrapper around a __tgt_async_info object directly
/// associated with a libomptarget layer device. RAII semantics to avoid
/// mistakes.
class AsyncInfoTy {
public:
  enum class SyncTy { BLOCKING, NON_BLOCKING };

private:
  /// Locations we used in (potentially) asynchronous calls which should live
  /// as long as this AsyncInfoTy object.
  std::deque<void *> BufferLocations;

  /// Post-processing operations executed after a successful synchronization.
  /// \note The post-processing function should return OFFLOAD_SUCCESS or
  /// OFFLOAD_FAIL appropriately.
  using PostProcFuncTy = std::function<int()>;
  llvm::SmallVector<PostProcFuncTy> PostProcessingFunctions;

  __tgt_async_info AsyncInfo;
  DeviceTy &Device;

public:
  /// Synchronization method to be used.
  SyncTy SyncType;

  AsyncInfoTy(DeviceTy &Device, SyncTy SyncType = SyncTy::BLOCKING)
      : Device(Device), SyncType(SyncType) {}
  ~AsyncInfoTy() { synchronize(); }

  /// Implicit conversion to the __tgt_async_info which is used in the
  /// plugin interface.
  operator __tgt_async_info *() { return &AsyncInfo; }

  /// Synchronize all pending actions.
  ///
  /// \note Synchronization is performed in a blocking or non-blocking
  /// manner, depending on the SyncType.
  ///
  /// \note If the operations are completed, the registered post-processing
  /// functions will be executed once and unregistered afterwards.
  ///
  /// \returns OFFLOAD_FAIL or OFFLOAD_SUCCESS appropriately.
  int synchronize();

  /// Return a void* reference with a lifetime that is at least as long as this
  /// AsyncInfoTy object. The location can be used as an intermediate buffer.
  void *&getVoidPtrLocation();

  /// Check if all asynchronous operations are completed.
  ///
  /// \note This is only a lightweight check. If needed, use synchronize() to
  /// query the status of AsyncInfo before checking.
  ///
  /// \returns true if there are no pending asynchronous operations, false
  /// otherwise.
  bool isDone() const;

  /// Add a new post-processing function to be executed after synchronization.
  ///
  /// \param[in] Function is a templated function (e.g., function pointer,
  /// lambda, std::function) that must be convertible to a PostProcFuncTy
  /// (i.e., it must have int() as its function signature).
  template <typename FuncTy> void addPostProcessingFunction(FuncTy &&Function) {
    static_assert(std::is_convertible_v<FuncTy, PostProcFuncTy>,
                  "Invalid post-processing function type. Please check "
                  "function signature!");
    PostProcessingFunctions.emplace_back(Function);
  }

private:
  /// Run all the post-processing functions sequentially.
  ///
  /// \note After a successful execution, all previously registered functions
  /// are unregistered.
  ///
  /// \returns OFFLOAD_FAIL if any post-processing function failed,
  /// OFFLOAD_SUCCESS otherwise.
  int32_t runPostProcessing();

  /// Check if the internal asynchronous info queue is empty or not.
  ///
  /// \returns true if empty, false otherwise.
  bool isQueueEmpty() const;
};
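
// Illustrative sketch (not part of the interface): a typical internal usage
// pattern, with 'Device' standing in for a DeviceTy the caller already holds.
// Plugin calls receive the wrapper through its implicit conversion to
// __tgt_async_info *, and post-processing runs once synchronize() succeeds.
//
//   AsyncInfoTy AsyncInfo(Device);
//   // ... issue asynchronous data transfers / kernel launches with AsyncInfo ...
//   AsyncInfo.addPostProcessingFunction([]() -> int {
//     // Executed once after a successful synchronization.
//     return OFFLOAD_SUCCESS;
//   });
//   if (AsyncInfo.synchronize() != OFFLOAD_SUCCESS)
//     return OFFLOAD_FAIL;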

// Wrapper for task-stored async info objects.
class TaskAsyncInfoWrapperTy {
  // Invalid GTID as defined by libomp; keep in sync.
  static constexpr int KMP_GTID_DNE = -2;

  const int ExecThreadID = KMP_GTID_DNE;
  AsyncInfoTy LocalAsyncInfo;
  AsyncInfoTy *AsyncInfo = &LocalAsyncInfo;
  void **TaskAsyncInfoPtr = nullptr;

public:
  TaskAsyncInfoWrapperTy(DeviceTy &Device)
      : ExecThreadID(__kmpc_global_thread_num(NULL)), LocalAsyncInfo(Device) {
    // If we failed to acquire the current global thread id, we cannot
    // re-enqueue the current task. Thus we should use the local blocking async
    // info.
    if (ExecThreadID == KMP_GTID_DNE)
      return;

    // Only tasks with an assigned task team can be re-enqueued and thus can
    // use the non-blocking synchronization scheme. If we don't have one, we
    // should use the local blocking async info.
    if (!__kmpc_omp_has_task_team(ExecThreadID))
      return;

    // Acquire a pointer to the AsyncInfo stored inside the current task being
    // executed.
    TaskAsyncInfoPtr = __kmpc_omp_get_target_async_handle_ptr(ExecThreadID);

    // If we cannot acquire such a pointer, fall back to using the local
    // blocking async info.
    if (!TaskAsyncInfoPtr)
      return;

    // When creating a new task async info, the task handle must always be
    // invalid. We must never overwrite any task async handle and there should
    // never be any valid handle stored inside the task at this point.
    assert((*TaskAsyncInfoPtr) == nullptr &&
           "Task async handle is not empty when dispatching new device "
           "operations. The handle was not cleared properly or "
           "__tgt_target_nowait_query should have been called!");

    // If no valid async handle is present, a new AsyncInfo will be allocated
    // and stored in the current task.
    AsyncInfo = new AsyncInfoTy(Device, AsyncInfoTy::SyncTy::NON_BLOCKING);
    *TaskAsyncInfoPtr = (void *)AsyncInfo;
  }

  ~TaskAsyncInfoWrapperTy() {
    // Local async info destruction is automatically handled by ~AsyncInfoTy.
    if (AsyncInfo == &LocalAsyncInfo)
      return;

    // If there are device operations still pending, return immediately without
    // deallocating the handle.
    if (!AsyncInfo->isDone())
      return;

    // Delete the handle and unset it from the OpenMP task data.
    delete AsyncInfo;
    *TaskAsyncInfoPtr = nullptr;
  }

  operator AsyncInfoTy &() { return *AsyncInfo; }
};
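
// Illustrative sketch (not part of the interface): the wrapper is meant to
// live on the stack of a 'nowait' entry point and be passed on wherever an
// AsyncInfoTy & is expected; it either borrows the task's non-blocking async
// handle or falls back to a local blocking one.
//
//   TaskAsyncInfoWrapperTy TaskAsyncInfo(Device); // 'Device' is assumed
//   AsyncInfoTy &AsyncInfo = TaskAsyncInfo;
//   // ... issue device operations with AsyncInfo ...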

/// This struct is a record of non-contiguous information.
struct __tgt_target_non_contig {
  uint64_t Offset;
  uint64_t Count;
  uint64_t Stride;
};

#ifdef __cplusplus
extern "C" {
#endif

void ompx_dump_mapping_tables(void);
int omp_get_num_devices(void);
int omp_get_device_num(void);
int omp_get_initial_device(void);
void *omp_target_alloc(size_t Size, int DeviceNum);
void omp_target_free(void *DevicePtr, int DeviceNum);
int omp_target_is_present(const void *Ptr, int DeviceNum);
int omp_target_memcpy(void *Dst, const void *Src, size_t Length,
                      size_t DstOffset, size_t SrcOffset, int DstDevice,
                      int SrcDevice);
int omp_target_memcpy_rect(void *Dst, const void *Src, size_t ElementSize,
                           int NumDims, const size_t *Volume,
                           const size_t *DstOffsets, const size_t *SrcOffsets,
                           const size_t *DstDimensions,
                           const size_t *SrcDimensions, int DstDevice,
                           int SrcDevice);
void *omp_target_memset(void *Ptr, int C, size_t N, int DeviceNum);
int omp_target_associate_ptr(const void *HostPtr, const void *DevicePtr,
                             size_t Size, size_t DeviceOffset, int DeviceNum);
int omp_target_disassociate_ptr(const void *HostPtr, int DeviceNum);
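
// Illustrative sketch (not part of the interface): host-side user code that
// exercises the entry points above; the device number 0 and the buffer size
// are arbitrary.
//
//   const size_t Bytes = 1024;
//   void *DevPtr = omp_target_alloc(Bytes, /*DeviceNum=*/0);
//   if (DevPtr) {
//     char HostBuf[1024] = {0};
//     // omp_target_memcpy returns zero on success.
//     if (omp_target_memcpy(DevPtr, HostBuf, Bytes, /*DstOffset=*/0,
//                           /*SrcOffset=*/0, /*DstDevice=*/0,
//                           omp_get_initial_device()) == 0) {
//       // ... use DevPtr in a target region via is_device_ptr(DevPtr) ...
//     }
//     omp_target_free(DevPtr, /*DeviceNum=*/0);
//   }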

/// Explicit target memory allocators
/// Using the llvm_ prefix until they become part of the OpenMP standard.
void *llvm_omp_target_alloc_device(size_t Size, int DeviceNum);
void *llvm_omp_target_alloc_host(size_t Size, int DeviceNum);
void *llvm_omp_target_alloc_shared(size_t Size, int DeviceNum);

/// Explicit target memory deallocators
/// Using the llvm_ prefix until they become part of the OpenMP standard.
void llvm_omp_target_free_device(void *DevicePtr, int DeviceNum);
void llvm_omp_target_free_host(void *DevicePtr, int DeviceNum);
void llvm_omp_target_free_shared(void *DevicePtr, int DeviceNum);

/// Dummy target so we have a symbol for generating host fallback.
void *llvm_omp_target_dynamic_shared_alloc();

/// Adds the clauses of the requires directives in a given file.
void __tgt_register_requires(int64_t Flags);

/// Initializes the runtime library.
void __tgt_rtl_init();

/// Deinitializes the runtime library.
void __tgt_rtl_deinit();

/// Adds a target shared library to the target execution image.
void __tgt_register_lib(__tgt_bin_desc *Desc);

/// Initialize all RTLs at once.
void __tgt_init_all_rtls();

/// Removes a target shared library from the target execution image.
void __tgt_unregister_lib(__tgt_bin_desc *Desc);

// Creates the host-to-target data mapping, stores it in the
// libomptarget.so internal structure (an entry in a stack of data maps) and
// passes the data to the device.
void __tgt_target_data_begin(int64_t DeviceId, int32_t ArgNum, void **ArgsBase,
                             void **Args, int64_t *ArgSizes, int64_t *ArgTypes);
void __tgt_target_data_begin_nowait(int64_t DeviceId, int32_t ArgNum,
                                    void **ArgsBase, void **Args,
                                    int64_t *ArgSizes, int64_t *ArgTypes,
                                    int32_t DepNum, void *DepList,
                                    int32_t NoAliasDepNum,
                                    void *NoAliasDepList);
void __tgt_target_data_begin_mapper(ident_t *Loc, int64_t DeviceId,
                                    int32_t ArgNum, void **ArgsBase,
                                    void **Args, int64_t *ArgSizes,
                                    int64_t *ArgTypes, map_var_info_t *ArgNames,
                                    void **ArgMappers);
void __tgt_target_data_begin_nowait_mapper(
    ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase,
    void **Args, int64_t *ArgSizes, int64_t *ArgTypes, map_var_info_t *ArgNames,
    void **ArgMappers, int32_t DepNum, void *DepList, int32_t NoAliasDepNum,
    void *NoAliasDepList);

// Passes data from the target, releases target memory, and destroys the
// host-target mapping (top entry from the stack of data maps) created by
// the last __tgt_target_data_begin.
void __tgt_target_data_end(int64_t DeviceId, int32_t ArgNum, void **ArgsBase,
                           void **Args, int64_t *ArgSizes, int64_t *ArgTypes);
void __tgt_target_data_end_nowait(int64_t DeviceId, int32_t ArgNum,
                                  void **ArgsBase, void **Args,
                                  int64_t *ArgSizes, int64_t *ArgTypes,
                                  int32_t DepNum, void *DepList,
                                  int32_t NoAliasDepNum, void *NoAliasDepList);
void __tgt_target_data_end_mapper(ident_t *Loc, int64_t DeviceId,
                                  int32_t ArgNum, void **ArgsBase, void **Args,
                                  int64_t *ArgSizes, int64_t *ArgTypes,
                                  map_var_info_t *ArgNames, void **ArgMappers);
void __tgt_target_data_end_nowait_mapper(
    ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase,
    void **Args, int64_t *ArgSizes, int64_t *ArgTypes, map_var_info_t *ArgNames,
    void **ArgMappers, int32_t DepNum, void *DepList, int32_t NoAliasDepNum,
    void *NoAliasDepList);

/// Passes data to/from the target.
void __tgt_target_data_update(int64_t DeviceId, int32_t ArgNum, void **ArgsBase,
                              void **Args, int64_t *ArgSizes,
                              int64_t *ArgTypes);
void __tgt_target_data_update_nowait(int64_t DeviceId, int32_t ArgNum,
                                     void **ArgsBase, void **Args,
                                     int64_t *ArgSizes, int64_t *ArgTypes,
                                     int32_t DepNum, void *DepList,
                                     int32_t NoAliasDepNum,
                                     void *NoAliasDepList);
void __tgt_target_data_update_mapper(ident_t *Loc, int64_t DeviceId,
                                     int32_t ArgNum, void **ArgsBase,
                                     void **Args, int64_t *ArgSizes,
                                     int64_t *ArgTypes,
                                     map_var_info_t *ArgNames,
                                     void **ArgMappers);
void __tgt_target_data_update_nowait_mapper(
    ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase,
    void **Args, int64_t *ArgSizes, int64_t *ArgTypes, map_var_info_t *ArgNames,
    void **ArgMappers, int32_t DepNum, void *DepList, int32_t NoAliasDepNum,
    void *NoAliasDepList);

// Performs the same actions as data_begin in case ArgNum is non-zero and
// initiates the run of the offloaded region on the target platform; if ArgNum
// is non-zero, after the region execution is done it also performs the same
// action as data_end above. This function returns 0 if it was able to transfer
// the execution to a target and an integer different from zero otherwise.
int __tgt_target_kernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
                        int32_t ThreadLimit, void *HostPtr, KernelArgsTy *Args);

// Non-blocking synchronization for target nowait regions. This function
// acquires the asynchronous context from the task data of the current task
// being executed and tries to query for the completion of its operations. If
// the operations are still pending, the function returns immediately. If the
// operations are completed, all the post-processing procedures stored in the
// asynchronous context are executed and the context is removed from the task
// data.
void __tgt_target_nowait_query(void **AsyncHandle);

/// Executes a target kernel by replaying recorded kernel arguments and
/// device memory.
int __tgt_target_kernel_replay(ident_t *Loc, int64_t DeviceId, void *HostPtr,
                               void *DeviceMemory, int64_t DeviceMemorySize,
                               void **TgtArgs, ptrdiff_t *TgtOffsets,
                               int32_t NumArgs, int32_t NumTeams,
                               int32_t ThreadLimit, uint64_t LoopTripCount);

void __tgt_set_info_flag(uint32_t);

int __tgt_print_device_info(int64_t DeviceId);

int __tgt_activate_record_replay(int64_t DeviceId, uint64_t MemorySize,
                                 void *VAddr, bool IsRecord, bool SaveOutput,
                                 uint64_t &ReqPtrArgOffset);

#ifdef __cplusplus
}
#endif

#ifdef __cplusplus
#define EXTERN extern "C"
#else
#define EXTERN extern
#endif

#endif // _OMPTARGET_H_