//===------ omptarget.cpp - Target independent OpenMP target RTL -- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Implementation of the interface to be used by Clang during the codegen of a
// target region.
//
//===----------------------------------------------------------------------===//

#include "omptarget.h"
#include "OffloadPolicy.h"
#include "OpenMP/OMPT/Callback.h"
#include "OpenMP/OMPT/Interface.h"
#include "PluginManager.h"
#include "Shared/Debug.h"
#include "Shared/EnvironmentVar.h"
#include "Shared/Utils.h"
#include "device.h"
#include "private.h"
#include "rtl.h"

#include "Shared/Profile.h"

#include "OpenMP/Mapping.h"
#include "OpenMP/omp.h"

#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/bit.h"
#include "llvm/Frontend/OpenMP/OMPConstants.h"
#include "llvm/Object/ObjectFile.h"

#include <cassert>
#include <cstdint>
#include <vector>

using llvm::SmallVector;
#ifdef OMPT_SUPPORT
using namespace llvm::omp::target::ompt;
#endif

int AsyncInfoTy::synchronize() {
  int Result = OFFLOAD_SUCCESS;
  if (!isQueueEmpty()) {
    switch (SyncType) {
    case SyncTy::BLOCKING:
      // If we have a queue we need to synchronize it now.
      Result = Device.synchronize(*this);
      assert(AsyncInfo.Queue == nullptr &&
             "The device plugin should have nulled the queue to indicate there "
             "are no outstanding actions!");
      break;
    case SyncTy::NON_BLOCKING:
      Result = Device.queryAsync(*this);
      break;
    }
  }

  // Run any pending post-processing function registered on this async object.
  if (Result == OFFLOAD_SUCCESS && isQueueEmpty())
    Result = runPostProcessing();

  return Result;
}

void *&AsyncInfoTy::getVoidPtrLocation() {
  BufferLocations.push_back(nullptr);
  return BufferLocations.back();
}

bool AsyncInfoTy::isDone() const { return isQueueEmpty(); }

int32_t AsyncInfoTy::runPostProcessing() {
  size_t Size = PostProcessingFunctions.size();
  for (size_t I = 0; I < Size; ++I) {
    const int Result = PostProcessingFunctions[I]();
    if (Result != OFFLOAD_SUCCESS)
      return Result;
  }

  // Clear the vector up until the last known function, since post-processing
  // procedures might add new procedures themselves.
  const auto *PrevBegin = PostProcessingFunctions.begin();
  PostProcessingFunctions.erase(PrevBegin, PrevBegin + Size);

  return OFFLOAD_SUCCESS;
}

bool AsyncInfoTy::isQueueEmpty() const { return AsyncInfo.Queue == nullptr; }
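
// For illustration, the deferred-action pattern AsyncInfoTy enables looks
// roughly like this (hypothetical caller, not actual runtime code):
//
//   AsyncInfoTy AsyncInfo(Device);
//   Device.retrieveData(HstPtr, TgtPtr, Size, AsyncInfo, /*Entry=*/nullptr);
//   AsyncInfo.addPostProcessingFunction([=]() -> int {
//     // Only runs once the device queue has drained.
//     return OFFLOAD_SUCCESS;
//   });
//   if (AsyncInfo.synchronize() != OFFLOAD_SUCCESS)
//     /* handle the error */;
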
/* All begin addresses for partially mapped structs must be aligned, up to 16,
 * in order to ensure proper alignment of members. E.g.
 *
 *   struct S {
 *     int a;  // 4-aligned
 *     int b;  // 4-aligned
 *     int *p; // 8-aligned
 *   } s1;
 *   ...
 *   #pragma omp target map(tofrom: s1.b, s1.p[0:N])
 *   {
 *     s1.b = 5;
 *     for (int i...) s1.p[i] = ...;
 *   }
 *
 * Here we are mapping s1 starting from member b, so BaseAddress=&s1=&s1.a and
 * BeginAddress=&s1.b. Let's assume that the struct begins at address 0x100,
 * then &s1.a=0x100, &s1.b=0x104, &s1.p=0x108. Each member obeys the alignment
 * requirements for its type. Now, when we allocate memory on the device, in
 * CUDA's case cuMemAlloc() returns an address which is at least 256-aligned.
 * This means that the chunk of the struct on the device will start at a
 * 256-aligned address, let's say 0x200. Then the address of b will be 0x200
 * and the address of p will be a misaligned 0x204 (on the host there was no
 * need to add padding between b and p, so p comes exactly 4 bytes after b).
 * If the device kernel tries to access s1.p, a misaligned address error
 * occurs (as reported by the CUDA plugin). By padding the begin address down
 * to a multiple of 8 and extending the size of the allocated chunk
 * accordingly, the chunk on the device will start at 0x200 with the padding
 * (4 bytes), then &s1.b=0x204 and &s1.p=0x208, as they should be to satisfy
 * the alignment requirements.
 */
static const int64_t MaxAlignment = 16;

/// Return the alignment requirement of partially mapped structs, see
/// MaxAlignment above.
static uint64_t getPartialStructRequiredAlignment(void *HstPtrBase) {
  int LowestOneBit = __builtin_ffsl(reinterpret_cast<uintptr_t>(HstPtrBase));
  uint64_t BaseAlignment = uint64_t(1) << (LowestOneBit - 1);
  return MaxAlignment < BaseAlignment ? MaxAlignment : BaseAlignment;
}
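
// For example (illustrative addresses): for a host base of 0x104 the lowest
// set bit is bit 3, so BaseAlignment = 1 << 2 = 4 and the function returns
// min(16, 4) = 4; for a 512-aligned base such as 0x200 it returns the
// MaxAlignment cap of 16.
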
void handleTargetOutcome(bool Success, ident_t *Loc) {
  switch (OffloadPolicy::get(*PM).Kind) {
  case OffloadPolicy::DISABLED:
    if (Success) {
      FATAL_MESSAGE0(1, "expected no offloading while offloading is disabled");
    }
    break;
  case OffloadPolicy::MANDATORY:
    if (!Success) {
      if (getInfoLevel() & OMP_INFOTYPE_DUMP_TABLE) {
        auto ExclusiveDevicesAccessor = PM->getExclusiveDevicesAccessor();
        for (auto &Device : PM->devices(ExclusiveDevicesAccessor))
          dumpTargetPointerMappings(Loc, Device);
      } else
        FAILURE_MESSAGE("Consult https://openmp.llvm.org/design/Runtimes.html "
                        "for debugging options.\n");

      if (!PM->getNumActivePlugins()) {
        FAILURE_MESSAGE(
            "No images found compatible with the installed hardware. ");

        llvm::SmallVector<llvm::StringRef> Archs;
        for (auto &Image : PM->deviceImages()) {
          const char *Start = reinterpret_cast<const char *>(
              Image.getExecutableImage().ImageStart);
          uint64_t Length =
              utils::getPtrDiff(Start, Image.getExecutableImage().ImageEnd);
          llvm::MemoryBufferRef Buffer(llvm::StringRef(Start, Length),
                                       /*Identifier=*/"");

          auto ObjectOrErr = llvm::object::ObjectFile::createObjectFile(Buffer);
          if (auto Err = ObjectOrErr.takeError()) {
            llvm::consumeError(std::move(Err));
            continue;
          }

          if (auto CPU = (*ObjectOrErr)->tryGetCPUName())
            Archs.push_back(*CPU);
        }
        fprintf(stderr, "Found %zu image(s): (%s)\n", Archs.size(),
                llvm::join(Archs, ",").c_str());
      }

      SourceInfo Info(Loc);
      if (Info.isAvailible())
        fprintf(stderr, "%s:%d:%d: ", Info.getFilename(), Info.getLine(),
                Info.getColumn());
      else
        FAILURE_MESSAGE("Source location information not present. Compile "
                        "with -g or -gline-tables-only.\n");
      FATAL_MESSAGE0(
          1, "failure of target construct while offloading is mandatory");
    } else {
      if (getInfoLevel() & OMP_INFOTYPE_DUMP_TABLE) {
        auto ExclusiveDevicesAccessor = PM->getExclusiveDevicesAccessor();
        for (auto &Device : PM->devices(ExclusiveDevicesAccessor))
          dumpTargetPointerMappings(Loc, Device);
      }
    }
    break;
  }
}

static int32_t getParentIndex(int64_t Type) {
  return ((Type & OMP_TGT_MAPTYPE_MEMBER_OF) >> 48) - 1;
}
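
// For illustration (based on the encoding assumed by getParentIndex above):
// OMP_TGT_MAPTYPE_MEMBER_OF occupies bits 48-63 of the map type and stores
// the 1-based position of the parent argument, so mapping two members of one
// struct typically arrives as
//
//   args[0] = &s.b  (combined entry for the mapped part of s)
//   args[1] = &s.b  type: TO | FROM | MEMBER_OF(1)
//   args[2] = &s.p  type: PTR_AND_OBJ | MEMBER_OF(1)
//
// and getParentIndex() maps both members back to index 0 (-1 means the
// argument has no parent).
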
void *targetAllocExplicit(size_t Size, int DeviceNum, int Kind,
                          const char *Name) {
  DP("Call to %s for device %d requesting %zu bytes\n", Name, DeviceNum, Size);

  if (Size <= 0) {
    DP("Call to %s with non-positive length\n", Name);
    return NULL;
  }

  void *Rc = NULL;

  if (DeviceNum == omp_get_initial_device()) {
    Rc = malloc(Size);
    DP("%s returns host ptr " DPxMOD "\n", Name, DPxPTR(Rc));
    return Rc;
  }

  auto DeviceOrErr = PM->getDevice(DeviceNum);
  if (!DeviceOrErr)
    FATAL_MESSAGE(DeviceNum, "%s", toString(DeviceOrErr.takeError()).c_str());

  Rc = DeviceOrErr->allocData(Size, nullptr, Kind);
  DP("%s returns device ptr " DPxMOD "\n", Name, DPxPTR(Rc));
  return Rc;
}

void targetFreeExplicit(void *DevicePtr, int DeviceNum, int Kind,
                        const char *Name) {
  DP("Call to %s for device %d and address " DPxMOD "\n", Name, DeviceNum,
     DPxPTR(DevicePtr));

  if (!DevicePtr) {
    DP("Call to %s with NULL ptr\n", Name);
    return;
  }

  if (DeviceNum == omp_get_initial_device()) {
    free(DevicePtr);
    DP("%s deallocated host ptr\n", Name);
    return;
  }

  auto DeviceOrErr = PM->getDevice(DeviceNum);
  if (!DeviceOrErr)
    FATAL_MESSAGE(DeviceNum, "%s", toString(DeviceOrErr.takeError()).c_str());

  if (DeviceOrErr->deleteData(DevicePtr, Kind) == OFFLOAD_FAIL)
    FATAL_MESSAGE(DeviceNum, "%s",
                  "Failed to deallocate device ptr. Set "
                  "OFFLOAD_TRACK_ALLOCATION_TRACES=1 to track allocations.");

  DP("omp_target_free deallocated device ptr\n");
}

void *targetLockExplicit(void *HostPtr, size_t Size, int DeviceNum,
                         const char *Name) {
  DP("Call to %s for device %d locking %zu bytes\n", Name, DeviceNum, Size);

  if (Size <= 0) {
    DP("Call to %s with non-positive length\n", Name);
    return NULL;
  }

  void *RC = NULL;

  auto DeviceOrErr = PM->getDevice(DeviceNum);
  if (!DeviceOrErr)
    FATAL_MESSAGE(DeviceNum, "%s", toString(DeviceOrErr.takeError()).c_str());

  int32_t Err = 0;
  Err = DeviceOrErr->RTL->data_lock(DeviceNum, HostPtr, Size, &RC);
  if (Err) {
    DP("Could not lock ptr %p\n", HostPtr);
    return nullptr;
  }
  DP("%s returns device ptr " DPxMOD "\n", Name, DPxPTR(RC));
  return RC;
}

void targetUnlockExplicit(void *HostPtr, int DeviceNum, const char *Name) {
  DP("Call to %s for device %d unlocking\n", Name, DeviceNum);

  auto DeviceOrErr = PM->getDevice(DeviceNum);
  if (!DeviceOrErr)
    FATAL_MESSAGE(DeviceNum, "%s", toString(DeviceOrErr.takeError()).c_str());

  DeviceOrErr->RTL->data_unlock(DeviceNum, HostPtr);
  DP("%s returns\n", Name);
}

/// Call the user-defined mapper function followed by the appropriate
/// targetData* function (targetData{Begin,End,Update}).
int targetDataMapper(ident_t *Loc, DeviceTy &Device, void *ArgBase, void *Arg,
                     int64_t ArgSize, int64_t ArgType, map_var_info_t ArgNames,
                     void *ArgMapper, AsyncInfoTy &AsyncInfo,
                     TargetDataFuncPtrTy TargetDataFunction) {
  DP("Calling the mapper function " DPxMOD "\n", DPxPTR(ArgMapper));

  // The mapper function fills up Components.
  MapperComponentsTy MapperComponents;
  MapperFuncPtrTy MapperFuncPtr = (MapperFuncPtrTy)(ArgMapper);
  (*MapperFuncPtr)((void *)&MapperComponents, ArgBase, Arg, ArgSize, ArgType,
                   ArgNames);

  // Construct new arrays for args_base, args, arg_sizes and arg_types
  // using the information in MapperComponents and call the corresponding
  // targetData* function using these new arrays.
  SmallVector<void *> MapperArgsBase(MapperComponents.Components.size());
  SmallVector<void *> MapperArgs(MapperComponents.Components.size());
  SmallVector<int64_t> MapperArgSizes(MapperComponents.Components.size());
  SmallVector<int64_t> MapperArgTypes(MapperComponents.Components.size());
  SmallVector<void *> MapperArgNames(MapperComponents.Components.size());

  for (unsigned I = 0, E = MapperComponents.Components.size(); I < E; ++I) {
    auto &C = MapperComponents.Components[I];
    MapperArgsBase[I] = C.Base;
    MapperArgs[I] = C.Begin;
    MapperArgSizes[I] = C.Size;
    MapperArgTypes[I] = C.Type;
    MapperArgNames[I] = C.Name;
  }

  int Rc = TargetDataFunction(Loc, Device, MapperComponents.Components.size(),
                              MapperArgsBase.data(), MapperArgs.data(),
                              MapperArgSizes.data(), MapperArgTypes.data(),
                              MapperArgNames.data(), /*arg_mappers*/ nullptr,
                              AsyncInfo, /*FromMapper=*/true);

  return Rc;
}
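
// For illustration: a user-defined mapper such as
//
//   struct Vec { int Len; double *Data; };
//   #pragma omp declare mapper(Vec V) map(V, V.Data[0 : V.Len])
//   Vec V;
//   #pragma omp target map(tofrom : V)
//   { ... }
//
// causes Clang to emit a mapper function and pass it via ArgMappers. The
// targetData* entry points below route such arguments through
// targetDataMapper() above, which expands the single argument into its
// components and re-invokes the entry point on the expanded arrays.
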
/// Internal function to do the mapping and transfer the data to the device
int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
                    void **ArgsBase, void **Args, int64_t *ArgSizes,
                    int64_t *ArgTypes, map_var_info_t *ArgNames,
                    void **ArgMappers, AsyncInfoTy &AsyncInfo,
                    bool FromMapper) {
  // Process each input.
  for (int32_t I = 0; I < ArgNum; ++I) {
    // Ignore private variables and arrays - there is no mapping for them.
    if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) ||
        (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE))
      continue;
    TIMESCOPE_WITH_DETAILS_AND_IDENT(
        "HostToDev", "Size=" + std::to_string(ArgSizes[I]) + "B", Loc);
    if (ArgMappers && ArgMappers[I]) {
      // Instead of executing the regular path of targetDataBegin, call the
      // targetDataMapper variant which will call targetDataBegin again
      // with new arguments.
      DP("Calling targetDataMapper for the %dth argument\n", I);

      map_var_info_t ArgName = (!ArgNames) ? nullptr : ArgNames[I];
      int Rc = targetDataMapper(Loc, Device, ArgsBase[I], Args[I], ArgSizes[I],
                                ArgTypes[I], ArgName, ArgMappers[I], AsyncInfo,
                                targetDataBegin);

      if (Rc != OFFLOAD_SUCCESS) {
        REPORT("Call to targetDataBegin via targetDataMapper for custom mapper"
               " failed.\n");
        return OFFLOAD_FAIL;
      }

      // Skip the rest of this function, continue to the next argument.
      continue;
    }

    void *HstPtrBegin = Args[I];
    void *HstPtrBase = ArgsBase[I];
    int64_t DataSize = ArgSizes[I];
    map_var_info_t HstPtrName = (!ArgNames) ? nullptr : ArgNames[I];

    // Adjust for proper alignment if this is a combined entry (for structs).
    // Look at the next argument - if that is MEMBER_OF this one, then this
    // one is a combined entry.
    int64_t TgtPadding = 0;
    const int NextI = I + 1;
    if (getParentIndex(ArgTypes[I]) < 0 && NextI < ArgNum &&
        getParentIndex(ArgTypes[NextI]) == I) {
      int64_t Alignment = getPartialStructRequiredAlignment(HstPtrBase);
      TgtPadding = (int64_t)HstPtrBegin % Alignment;
      if (TgtPadding) {
        DP("Using a padding of %" PRId64 " bytes for begin address " DPxMOD
           "\n",
           TgtPadding, DPxPTR(HstPtrBegin));
      }
    }

    // Address of pointer on the host and device, respectively.
    void *PointerHstPtrBegin, *PointerTgtPtrBegin;
    TargetPointerResultTy PointerTpr;
    bool IsHostPtr = false;
    bool IsImplicit = ArgTypes[I] & OMP_TGT_MAPTYPE_IMPLICIT;
    // Force the creation of a device-side copy of the data when a 'close' map
    // modifier was associated with a map that contained a 'to'.
    bool HasCloseModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_CLOSE;
    bool HasPresentModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_PRESENT;
    bool HasHoldModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_OMPX_HOLD;
    // UpdateRef is based on MEMBER_OF instead of TARGET_PARAM because if we
    // have reached this point via __tgt_target_data_begin and not __tgt_target
    // then no argument is marked as TARGET_PARAM ("omp target data map" is not
    // associated with a target region, so there are no target parameters).
    // This may be considered a hack; we could revise the scheme in the future.
    bool UpdateRef =
        !(ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) && !(FromMapper && I == 0);

    MappingInfoTy::HDTTMapAccessorTy HDTTMap =
        Device.getMappingInfo().HostDataToTargetMap.getExclusiveAccessor();
    if (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) {
      DP("Has a pointer entry: \n");
      // Base is address of pointer.
      //
      // Usually, the pointer is already allocated by this time. For example:
      //
      //   #pragma omp target map(s.p[0:N])
      //
      // The map entry for s comes first, and the PTR_AND_OBJ entry comes
      // afterward, so the pointer is already allocated by the time the
      // PTR_AND_OBJ entry is handled below, and PointerTgtPtrBegin is thus
      // non-null. However, "declare target link" can produce a PTR_AND_OBJ
      // entry for a global that might not already be allocated by the time
      // the PTR_AND_OBJ entry is handled below, and so the allocation might
      // fail when HasPresentModifier.
      PointerTpr = Device.getMappingInfo().getTargetPointer(
          HDTTMap, HstPtrBase, HstPtrBase, /*TgtPadding=*/0, sizeof(void *),
          /*HstPtrName=*/nullptr,
          /*HasFlagTo=*/false, /*HasFlagAlways=*/false, IsImplicit, UpdateRef,
          HasCloseModifier, HasPresentModifier, HasHoldModifier, AsyncInfo,
          /*OwnedTPR=*/nullptr, /*ReleaseHDTTMap=*/false);
      PointerTgtPtrBegin = PointerTpr.TargetPointer;
      IsHostPtr = PointerTpr.Flags.IsHostPointer;
      if (!PointerTgtPtrBegin) {
        REPORT("Call to getTargetPointer returned null pointer (%s).\n",
               HasPresentModifier ? "'present' map type modifier"
                                  : "device failure or illegal mapping");
        return OFFLOAD_FAIL;
      }
      DP("There are %zu bytes allocated at target address " DPxMOD " - is%s new"
         "\n",
         sizeof(void *), DPxPTR(PointerTgtPtrBegin),
         (PointerTpr.Flags.IsNewEntry ? "" : " not"));
      PointerHstPtrBegin = HstPtrBase;
      // Modify the current entry.
      HstPtrBase = *(void **)HstPtrBase;
      // No need to update the pointee ref count for the first element of the
      // subelement that comes from the mapper.
      UpdateRef =
          (!FromMapper || I != 0); // Subsequently update ref count of pointee.
    }

    const bool HasFlagTo = ArgTypes[I] & OMP_TGT_MAPTYPE_TO;
    const bool HasFlagAlways = ArgTypes[I] & OMP_TGT_MAPTYPE_ALWAYS;
    // Note that HDTTMap will be released in getTargetPointer.
    auto TPR = Device.getMappingInfo().getTargetPointer(
        HDTTMap, HstPtrBegin, HstPtrBase, TgtPadding, DataSize, HstPtrName,
        HasFlagTo, HasFlagAlways, IsImplicit, UpdateRef, HasCloseModifier,
        HasPresentModifier, HasHoldModifier, AsyncInfo, PointerTpr.getEntry());
    void *TgtPtrBegin = TPR.TargetPointer;
    IsHostPtr = TPR.Flags.IsHostPointer;
    // If data_size==0, then the argument could be a zero-length pointer to
    // NULL, so getOrAlloc() returning NULL is not an error.
    if (!TgtPtrBegin && (DataSize || HasPresentModifier)) {
      REPORT("Call to getTargetPointer returned null pointer (%s).\n",
             HasPresentModifier ? "'present' map type modifier"
                                : "device failure or illegal mapping");
      return OFFLOAD_FAIL;
    }
    DP("There are %" PRId64 " bytes allocated at target address " DPxMOD
       " - is%s new\n",
       DataSize, DPxPTR(TgtPtrBegin), (TPR.Flags.IsNewEntry ? "" : " not"));

    if (ArgTypes[I] & OMP_TGT_MAPTYPE_RETURN_PARAM) {
      uintptr_t Delta = (uintptr_t)HstPtrBegin - (uintptr_t)HstPtrBase;
      void *TgtPtrBase = (void *)((uintptr_t)TgtPtrBegin - Delta);
      DP("Returning device pointer " DPxMOD "\n", DPxPTR(TgtPtrBase));
      ArgsBase[I] = TgtPtrBase;
    }

    if (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ && !IsHostPtr) {

      uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase;
      void *ExpectedTgtPtrBase = (void *)((uint64_t)TgtPtrBegin - Delta);

      if (PointerTpr.getEntry()->addShadowPointer(ShadowPtrInfoTy{
              (void **)PointerHstPtrBegin, HstPtrBase,
              (void **)PointerTgtPtrBegin, ExpectedTgtPtrBase})) {
        DP("Update pointer (" DPxMOD ") -> [" DPxMOD "]\n",
           DPxPTR(PointerTgtPtrBegin), DPxPTR(TgtPtrBegin));

        void *&TgtPtrBase = AsyncInfo.getVoidPtrLocation();
        TgtPtrBase = ExpectedTgtPtrBase;

        int Ret =
            Device.submitData(PointerTgtPtrBegin, &TgtPtrBase, sizeof(void *),
                              AsyncInfo, PointerTpr.getEntry());
        if (Ret != OFFLOAD_SUCCESS) {
          REPORT("Copying data to device failed.\n");
          return OFFLOAD_FAIL;
        }
        if (PointerTpr.getEntry()->addEventIfNecessary(Device, AsyncInfo) !=
            OFFLOAD_SUCCESS)
          return OFFLOAD_FAIL;
      }
    }

    // Check if the variable can be used on the device:
    bool IsStructMember = ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF;
    if (getInfoLevel() & OMP_INFOTYPE_EMPTY_MAPPING && ArgTypes[I] != 0 &&
        !IsStructMember && !IsImplicit && !TPR.isPresent() &&
        !TPR.isContained() && !TPR.isHostPointer())
      INFO(OMP_INFOTYPE_EMPTY_MAPPING, Device.DeviceID,
           "variable %s does not have a valid device counterpart\n",
           (HstPtrName) ? getNameFromMapping(HstPtrName).c_str() : "unknown");
  }

  return OFFLOAD_SUCCESS;
}
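
// For illustration: after targetDataBegin has processed
//
//   #pragma omp target map(tofrom : s, s.p[0 : N])
//
// the device copy of s.p no longer holds the host pointee address; it has
// been patched (via the submitData call above) to point at the device copy
// of the pointee, and the original host value is remembered in a
// shadow-pointer entry so targetDataEnd can restore it after a copy-back:
//
//   host:   s.p == &hostbuf[0]
//   device: s.p == <device copy of hostbuf>
//   shadow: { HstPtrAddr = &s.p, HstPtrVal = &hostbuf[0], ... }
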
namespace {
/// This structure contains information to deallocate a target pointer, i.e.,
/// it is used to fix up the shadow map and potentially delete the entry from
/// the mapping table via \p DeviceTy::deallocTgtPtr.
struct PostProcessingInfo {
  /// Host pointer used to look up into the map table
  void *HstPtrBegin;

  /// Size of the data
  int64_t DataSize;

  /// The mapping type (bitfield).
  int64_t ArgType;

  /// The target pointer information.
  TargetPointerResultTy TPR;

  PostProcessingInfo(void *HstPtr, int64_t Size, int64_t ArgType,
                     TargetPointerResultTy &&TPR)
      : HstPtrBegin(HstPtr), DataSize(Size), ArgType(ArgType),
        TPR(std::move(TPR)) {}
};

} // namespace

/// Applies the necessary post-processing procedures to entries listed in \p
/// EntriesInfo after the execution of all device side operations from a
/// target data end. This includes the update of pointers at the host and
/// removal of device buffers when needed. It returns OFFLOAD_FAIL or
/// OFFLOAD_SUCCESS according to the success of the operations.
[[nodiscard]] static int
postProcessingTargetDataEnd(DeviceTy *Device,
                            SmallVector<PostProcessingInfo> &EntriesInfo) {
  int Ret = OFFLOAD_SUCCESS;

  for (auto &[HstPtrBegin, DataSize, ArgType, TPR] : EntriesInfo) {
    bool DelEntry = !TPR.isHostPointer();

    // If this is the last element from the mapper (end-transfer arguments
    // come in reverse order), do not remove the partial entry; the parent
    // struct still exists.
    if ((ArgType & OMP_TGT_MAPTYPE_MEMBER_OF) &&
        !(ArgType & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) {
      DelEntry = false; // Protect the parent struct from being deallocated.
    }

    // If we marked the entry to be deleted we need to verify no other
    // thread reused it by now. If deletion is still supposed to happen by
    // this thread LR will be set and exclusive access to the HDTT map
    // will avoid another thread reusing the entry now. Note that we do
    // not request (exclusive) access to the HDTT map if DelEntry is
    // not set.
    MappingInfoTy::HDTTMapAccessorTy HDTTMap =
        Device->getMappingInfo().HostDataToTargetMap.getExclusiveAccessor();

    // We cannot use a lock guard because we may end up deleting the mutex.
    // We also explicitly unlocked the entry after it was put in EntriesInfo
    // so it could be reused.
    TPR.getEntry()->lock();
    auto *Entry = TPR.getEntry();

    const bool IsNotLastUser = Entry->decDataEndThreadCount() != 0;
    if (DelEntry && (Entry->getTotalRefCount() != 0 || IsNotLastUser)) {
      // The thread is not in charge of deletion anymore. Give up access
      // to the HDTT map and unset the deletion flag.
      HDTTMap.destroy();
      DelEntry = false;
    }

    // If we copied back to the host a struct/array containing pointers,
    // we need to restore the original host pointer values from their
    // shadow copies. If the struct is going to be deallocated, remove any
    // remaining shadow pointer entries for this struct.
    const bool HasFrom = ArgType & OMP_TGT_MAPTYPE_FROM;
    if (HasFrom) {
      Entry->foreachShadowPointerInfo([&](const ShadowPtrInfoTy &ShadowPtr) {
        *ShadowPtr.HstPtrAddr = ShadowPtr.HstPtrVal;
        DP("Restoring original host pointer value " DPxMOD " for host "
           "pointer " DPxMOD "\n",
           DPxPTR(ShadowPtr.HstPtrVal), DPxPTR(ShadowPtr.HstPtrAddr));
        return OFFLOAD_SUCCESS;
      });
    }

    // Give up the lock as we either don't need it anymore (e.g., done with
    // TPR), or erase TPR.
    TPR.setEntry(nullptr);

    if (!DelEntry)
      continue;

    Ret = Device->getMappingInfo().eraseMapEntry(HDTTMap, Entry, DataSize);
    // The entry is already removed from the map; we can unlock it now.
    HDTTMap.destroy();
    Ret |= Device->getMappingInfo().deallocTgtPtrAndEntry(Entry, DataSize);
    if (Ret != OFFLOAD_SUCCESS) {
      REPORT("Deallocating data from device failed.\n");
      break;
    }
  }

  delete &EntriesInfo;
  return Ret;
}
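
// For illustration: for a construct like
//
//   #pragma omp target exit data map(from : s)
//
// targetDataEnd below merely enqueues the device-to-host copy of s on the
// async queue. Restoring shadow pointers or freeing the device buffer before
// that queue drains would race with the copy, so the work above is packaged
// as a post-processing function that AsyncInfoTy::synchronize() only runs
// once the queue is empty.
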
/// Internal function to undo the mapping and retrieve the data from the
/// device.
int targetDataEnd(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
                  void **ArgBases, void **Args, int64_t *ArgSizes,
                  int64_t *ArgTypes, map_var_info_t *ArgNames,
                  void **ArgMappers, AsyncInfoTy &AsyncInfo, bool FromMapper) {
  int Ret = OFFLOAD_SUCCESS;
  auto *PostProcessingPtrs = new SmallVector<PostProcessingInfo>();
  // Process each input.
  for (int32_t I = ArgNum - 1; I >= 0; --I) {
    // Ignore private variables and arrays - there is no mapping for them.
    // Also, ignore the use_device_ptr directive; it has no effect here.
    if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) ||
        (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE))
      continue;

    if (ArgMappers && ArgMappers[I]) {
      // Instead of executing the regular path of targetDataEnd, call the
      // targetDataMapper variant which will call targetDataEnd again
      // with new arguments.
      DP("Calling targetDataMapper for the %dth argument\n", I);

      map_var_info_t ArgName = (!ArgNames) ? nullptr : ArgNames[I];
      Ret = targetDataMapper(Loc, Device, ArgBases[I], Args[I], ArgSizes[I],
                             ArgTypes[I], ArgName, ArgMappers[I], AsyncInfo,
                             targetDataEnd);

      if (Ret != OFFLOAD_SUCCESS) {
        REPORT("Call to targetDataEnd via targetDataMapper for custom mapper"
               " failed.\n");
        return OFFLOAD_FAIL;
      }

      // Skip the rest of this function, continue to the next argument.
      continue;
    }

    void *HstPtrBegin = Args[I];
    int64_t DataSize = ArgSizes[I];
    bool IsImplicit = ArgTypes[I] & OMP_TGT_MAPTYPE_IMPLICIT;
    bool UpdateRef = (!(ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) ||
                      (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) &&
                     !(FromMapper && I == 0);
    bool ForceDelete = ArgTypes[I] & OMP_TGT_MAPTYPE_DELETE;
    bool HasPresentModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_PRESENT;
    bool HasHoldModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_OMPX_HOLD;

    // If PTR_AND_OBJ, HstPtrBegin is the address of the pointee.
    TargetPointerResultTy TPR = Device.getMappingInfo().getTgtPtrBegin(
        HstPtrBegin, DataSize, UpdateRef, HasHoldModifier, !IsImplicit,
        ForceDelete, /*FromDataEnd=*/true);
    void *TgtPtrBegin = TPR.TargetPointer;
    if (!TPR.isPresent() && !TPR.isHostPointer() &&
        (DataSize || HasPresentModifier)) {
      DP("Mapping does not exist (%s)\n",
         (HasPresentModifier ? "'present' map type modifier" : "ignored"));
      if (HasPresentModifier) {
        // OpenMP 5.1, sec. 2.21.7.1 "map Clause", p. 350 L10-13:
        // "If a map clause appears on a target, target data, target enter
        // data or target exit data construct with a present map-type-modifier
        // then on entry to the region if the corresponding list item does not
        // appear in the device data environment then an error occurs and the
        // program terminates."
        //
        // This should be an error upon entering an "omp target exit data". It
        // should not be an error upon exiting an "omp target data" or "omp
        // target". For "omp target data", Clang thus doesn't include present
        // modifiers for end calls.
        // For "omp target", we have not found a valid OpenMP program for
        // which the error matters: it appears that, if a program can
        // guarantee that data is present at the beginning of an "omp target"
        // region so that there's no error there, that data is also guaranteed
        // to be present at the end.
        MESSAGE("device mapping required by 'present' map type modifier does "
                "not exist for host address " DPxMOD " (%" PRId64 " bytes)",
                DPxPTR(HstPtrBegin), DataSize);
        return OFFLOAD_FAIL;
      }
    } else {
      DP("There are %" PRId64 " bytes allocated at target address " DPxMOD
         " - is%s last\n",
         DataSize, DPxPTR(TgtPtrBegin), (TPR.Flags.IsLast ? "" : " not"));
    }

    // OpenMP 5.1, sec. 2.21.7.1 "map Clause", p. 351 L14-16:
    // "If the map clause appears on a target, target data, or target exit
    // data construct and a corresponding list item of the original list item
    // is not present in the device data environment on exit from the region
    // then the list item is ignored."
    if (!TPR.isPresent())
      continue;

    // Move data back to the host.
    const bool HasAlways = ArgTypes[I] & OMP_TGT_MAPTYPE_ALWAYS;
    const bool HasFrom = ArgTypes[I] & OMP_TGT_MAPTYPE_FROM;
    if (HasFrom && (HasAlways || TPR.Flags.IsLast) &&
        !TPR.Flags.IsHostPointer && DataSize != 0) {
      DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
         DataSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
      TIMESCOPE_WITH_DETAILS_AND_IDENT(
          "DevToHost", "Size=" + std::to_string(DataSize) + "B", Loc);
      // Wait for any previous transfer if an event is present.
      if (void *Event = TPR.getEntry()->getEvent()) {
        if (Device.waitEvent(Event, AsyncInfo) != OFFLOAD_SUCCESS) {
          REPORT("Failed to wait for event " DPxMOD ".\n", DPxPTR(Event));
          return OFFLOAD_FAIL;
        }
      }

      Ret = Device.retrieveData(HstPtrBegin, TgtPtrBegin, DataSize, AsyncInfo,
                                TPR.getEntry());
      if (Ret != OFFLOAD_SUCCESS) {
        REPORT("Copying data from device failed.\n");
        return OFFLOAD_FAIL;
      }

      // As we are expecting to delete the entry, the d2h copy might race
      // with another thread that also tries to delete the entry. This happens
      // as the entry can be reused and the reuse might happen after the
      // copy-back was issued but before it completed. Since the reuse might
      // also copy back a value, we would race.
      if (TPR.Flags.IsLast) {
        if (TPR.getEntry()->addEventIfNecessary(Device, AsyncInfo) !=
            OFFLOAD_SUCCESS)
          return OFFLOAD_FAIL;
      }
    }

    // Add the pointer to the buffer for post-synchronize processing.
    PostProcessingPtrs->emplace_back(HstPtrBegin, DataSize, ArgTypes[I],
                                     std::move(TPR));
    PostProcessingPtrs->back().TPR.getEntry()->unlock();
  }

  // Add post-processing functions.
  // TODO: We might want to remove `mutable` in the future by not changing the
  // captured variables somehow.
  AsyncInfo.addPostProcessingFunction([=, Device = &Device]() mutable -> int {
    return postProcessingTargetDataEnd(Device, *PostProcessingPtrs);
  });

  return Ret;
}
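
// For illustration: a motion directive such as
//
//   #pragma omp target update from(a[0 : N])
//
// reaches targetDataUpdate below with one contiguous argument and is served
// by targetDataContiguous(). A strided section such as from(b[0 : N : 2]) is
// instead flagged OMP_TGT_MAPTYPE_NON_CONTIG and described by an array of
// per-dimension __tgt_target_non_contig descriptors, which
// targetDataNonContiguous() walks recursively.
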
static int targetDataContiguous(ident_t *Loc, DeviceTy &Device, void *ArgsBase,
                                void *HstPtrBegin, int64_t ArgSize,
                                int64_t ArgType, AsyncInfoTy &AsyncInfo) {
  TargetPointerResultTy TPR = Device.getMappingInfo().getTgtPtrBegin(
      HstPtrBegin, ArgSize, /*UpdateRefCount=*/false,
      /*UseHoldRefCount=*/false, /*MustContain=*/true);
  void *TgtPtrBegin = TPR.TargetPointer;
  if (!TPR.isPresent()) {
    DP("hst data:" DPxMOD " not found, becomes a noop\n", DPxPTR(HstPtrBegin));
    if (ArgType & OMP_TGT_MAPTYPE_PRESENT) {
      MESSAGE("device mapping required by 'present' motion modifier does not "
              "exist for host address " DPxMOD " (%" PRId64 " bytes)",
              DPxPTR(HstPtrBegin), ArgSize);
      return OFFLOAD_FAIL;
    }
    return OFFLOAD_SUCCESS;
  }

  if (TPR.Flags.IsHostPointer) {
    DP("hst data:" DPxMOD " unified and shared, becomes a noop\n",
       DPxPTR(HstPtrBegin));
    return OFFLOAD_SUCCESS;
  }

  if (ArgType & OMP_TGT_MAPTYPE_TO) {
    DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n",
       ArgSize, DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin));
    int Ret = Device.submitData(TgtPtrBegin, HstPtrBegin, ArgSize, AsyncInfo,
                                TPR.getEntry());
    if (Ret != OFFLOAD_SUCCESS) {
      REPORT("Copying data to device failed.\n");
      return OFFLOAD_FAIL;
    }
    if (TPR.getEntry()) {
      int Ret = TPR.getEntry()->foreachShadowPointerInfo(
          [&](ShadowPtrInfoTy &ShadowPtr) {
            DP("Restoring original target pointer value " DPxMOD " for target "
               "pointer " DPxMOD "\n",
               DPxPTR(ShadowPtr.TgtPtrVal), DPxPTR(ShadowPtr.TgtPtrAddr));
            int SubmitRet = Device.submitData(ShadowPtr.TgtPtrAddr,
                                              (void *)&ShadowPtr.TgtPtrVal,
                                              sizeof(void *), AsyncInfo);
            if (SubmitRet != OFFLOAD_SUCCESS) {
              REPORT("Copying data to device failed.\n");
              return OFFLOAD_FAIL;
            }
            return OFFLOAD_SUCCESS;
          });
      if (Ret != OFFLOAD_SUCCESS) {
        DP("Updating shadow map failed\n");
        return Ret;
      }
    }
  }

  if (ArgType & OMP_TGT_MAPTYPE_FROM) {
    DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
       ArgSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
    int Ret = Device.retrieveData(HstPtrBegin, TgtPtrBegin, ArgSize, AsyncInfo,
                                  TPR.getEntry());
    if (Ret != OFFLOAD_SUCCESS) {
      REPORT("Copying data from device failed.\n");
      return OFFLOAD_FAIL;
    }

    // Wait for the device-to-host memcopies for the whole struct to complete
    // before restoring the correct host pointers.
    if (auto *Entry = TPR.getEntry()) {
      AsyncInfo.addPostProcessingFunction([=]() -> int {
        int Ret = Entry->foreachShadowPointerInfo(
            [&](const ShadowPtrInfoTy &ShadowPtr) {
              *ShadowPtr.HstPtrAddr = ShadowPtr.HstPtrVal;
              DP("Restoring original host pointer value " DPxMOD
                 " for host pointer " DPxMOD "\n",
                 DPxPTR(ShadowPtr.HstPtrVal), DPxPTR(ShadowPtr.HstPtrAddr));
              return OFFLOAD_SUCCESS;
            });
        Entry->unlock();
        if (Ret != OFFLOAD_SUCCESS) {
          DP("Updating shadow map failed\n");
          return Ret;
        }
        return OFFLOAD_SUCCESS;
      });
    }
  }

  return OFFLOAD_SUCCESS;
}

static int targetDataNonContiguous(ident_t *Loc, DeviceTy &Device,
                                   void *ArgsBase,
                                   __tgt_target_non_contig *NonContig,
                                   uint64_t Size, int64_t ArgType,
                                   int CurrentDim, int DimSize, uint64_t Offset,
                                   AsyncInfoTy &AsyncInfo) {
  int Ret = OFFLOAD_SUCCESS;
  if (CurrentDim < DimSize) {
    for (unsigned int I = 0; I < NonContig[CurrentDim].Count; ++I) {
      uint64_t CurOffset =
          (NonContig[CurrentDim].Offset + I) * NonContig[CurrentDim].Stride;
      // We only need to transfer the first element for the last dimension
      // since we've already got a contiguous piece.
      if (CurrentDim != DimSize - 1 || I == 0) {
        Ret = targetDataNonContiguous(Loc, Device, ArgsBase, NonContig, Size,
                                      ArgType, CurrentDim + 1, DimSize,
                                      Offset + CurOffset, AsyncInfo);
        // Stop the whole process if any contiguous piece returns anything
        // other than OFFLOAD_SUCCESS.
        if (Ret != OFFLOAD_SUCCESS)
          return Ret;
      }
    }
  } else {
    char *Ptr = (char *)ArgsBase + Offset;
    DP("Transfer of non-contiguous: host ptr " DPxMOD " offset %" PRIu64
       " len %" PRIu64 "\n",
       DPxPTR(Ptr), Offset, Size);
    Ret = targetDataContiguous(Loc, Device, ArgsBase, Ptr, Size, ArgType,
                               AsyncInfo);
  }
  return Ret;
}

static int getNonContigMergedDimension(__tgt_target_non_contig *NonContig,
                                       int32_t DimSize) {
  int RemovedDim = 0;
  for (int I = DimSize - 1; I > 0; --I) {
    if (NonContig[I].Count * NonContig[I].Stride == NonContig[I - 1].Stride)
      RemovedDim++;
  }
  return RemovedDim;
}
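
// For illustration (an assumption about the descriptor layout consumed
// above): each dimension of a non-contiguous section is described by
//
//   struct __tgt_target_non_contig { uint64_t Offset, Count, Stride; };
//
// with the outermost dimension first. A dimension whose Count * Stride
// equals the enclosing dimension's Stride leaves no gaps, so it can be
// folded into its parent; getNonContigMergedDimension counts how many such
// trailing dimensions the traversal in targetDataNonContiguous can skip.
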
/// Internal function to pass data to/from the target.
int targetDataUpdate(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
                     void **ArgsBase, void **Args, int64_t *ArgSizes,
                     int64_t *ArgTypes, map_var_info_t *ArgNames,
                     void **ArgMappers, AsyncInfoTy &AsyncInfo, bool) {
  // Process each input.
  for (int32_t I = 0; I < ArgNum; ++I) {
    if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) ||
        (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE))
      continue;

    if (ArgMappers && ArgMappers[I]) {
      // Instead of executing the regular path of targetDataUpdate, call the
      // targetDataMapper variant which will call targetDataUpdate again
      // with new arguments.
      DP("Calling targetDataMapper for the %dth argument\n", I);

      map_var_info_t ArgName = (!ArgNames) ? nullptr : ArgNames[I];
      int Ret = targetDataMapper(Loc, Device, ArgsBase[I], Args[I],
                                 ArgSizes[I], ArgTypes[I], ArgName,
                                 ArgMappers[I], AsyncInfo, targetDataUpdate);

      if (Ret != OFFLOAD_SUCCESS) {
        REPORT("Call to targetDataUpdate via targetDataMapper for custom "
               "mapper failed.\n");
        return OFFLOAD_FAIL;
      }

      // Skip the rest of this function, continue to the next argument.
      continue;
    }

    int Ret = OFFLOAD_SUCCESS;

    if (ArgTypes[I] & OMP_TGT_MAPTYPE_NON_CONTIG) {
      __tgt_target_non_contig *NonContig = (__tgt_target_non_contig *)Args[I];
      int32_t DimSize = ArgSizes[I];
      uint64_t Size =
          NonContig[DimSize - 1].Count * NonContig[DimSize - 1].Stride;
      int32_t MergedDim = getNonContigMergedDimension(NonContig, DimSize);
      Ret = targetDataNonContiguous(
          Loc, Device, ArgsBase[I], NonContig, Size, ArgTypes[I],
          /*current_dim=*/0, DimSize - MergedDim, /*offset=*/0, AsyncInfo);
    } else {
      Ret = targetDataContiguous(Loc, Device, ArgsBase[I], Args[I],
                                 ArgSizes[I], ArgTypes[I], AsyncInfo);
    }
    if (Ret == OFFLOAD_FAIL)
      return OFFLOAD_FAIL;
  }
  return OFFLOAD_SUCCESS;
}

static const unsigned LambdaMapping = OMP_TGT_MAPTYPE_PTR_AND_OBJ |
                                      OMP_TGT_MAPTYPE_LITERAL |
                                      OMP_TGT_MAPTYPE_IMPLICIT;
static bool isLambdaMapping(int64_t Mapping) {
  return (Mapping & LambdaMapping) == LambdaMapping;
}
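
// For illustration: when a lambda is called inside a target region, e.g.
//
//   auto Fn = [&X]() { X++; };
//   #pragma omp target
//   Fn();
//
// Clang maps the lambda object itself and, for each by-reference capture, an
// extra argument whose map type combines PTR_AND_OBJ, LITERAL and IMPLICIT
// so the runtime can patch the device copy of the capture to the device
// address of X. That combination is what isLambdaMapping() recognizes; the
// patching happens in processDataBefore() below.
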
namespace {
/// Find the table information in the map or look it up in the translation
/// tables.
TableMap *getTableMap(void *HostPtr) {
  std::lock_guard<std::mutex> TblMapLock(PM->TblMapMtx);
  HostPtrToTableMapTy::iterator TableMapIt =
      PM->HostPtrToTableMap.find(HostPtr);

  if (TableMapIt != PM->HostPtrToTableMap.end())
    return &TableMapIt->second;

  // We don't have a map. So search all the registered libraries.
  TableMap *TM = nullptr;
  std::lock_guard<std::mutex> TrlTblLock(PM->TrlTblMtx);
  for (HostEntriesBeginToTransTableTy::iterator Itr =
           PM->HostEntriesBeginToTransTable.begin();
       Itr != PM->HostEntriesBeginToTransTable.end(); ++Itr) {
    // Get the translation table (which contains all the good info).
    TranslationTable *TransTable = &Itr->second;
    // Iterate over all the host table entries to see if we can locate the
    // host_ptr.
    llvm::offloading::EntryTy *Cur = TransTable->HostTable.EntriesBegin;
    for (uint32_t I = 0; Cur < TransTable->HostTable.EntriesEnd; ++Cur, ++I) {
      if (Cur->Address != HostPtr)
        continue;
      // We got a match; now fill the HostPtrToTableMap so that we
      // may avoid this search next time.
      TM = &(PM->HostPtrToTableMap)[HostPtr];
      TM->Table = TransTable;
      TM->Index = I;
      return TM;
    }
  }

  return nullptr;
}

/// A class that manages private arguments in a target region.
class PrivateArgumentManagerTy {
  /// A data structure for the information of first-private arguments. We can
  /// use this information to optimize data transfer by packing all
  /// first-private arguments and transferring them all at once.
  struct FirstPrivateArgInfoTy {
    /// Host pointer begin
    char *HstPtrBegin;
    /// Host pointer end
    char *HstPtrEnd;
    /// The index of the element in \p TgtArgs corresponding to the argument
    int Index;
    /// Alignment of the entry (base of the entry, not after the entry).
    uint32_t Alignment;
    /// Size (without alignment, see padding)
    uint32_t Size;
    /// Padding used to align this argument entry, if necessary.
    uint32_t Padding;
    /// Host pointer name
    map_var_info_t HstPtrName = nullptr;

    FirstPrivateArgInfoTy(int Index, void *HstPtr, uint32_t Size,
                          uint32_t Alignment, uint32_t Padding,
                          map_var_info_t HstPtrName = nullptr)
        : HstPtrBegin(reinterpret_cast<char *>(HstPtr)),
          HstPtrEnd(HstPtrBegin + Size), Index(Index), Alignment(Alignment),
          Size(Size), Padding(Padding), HstPtrName(HstPtrName) {}
  };

  /// A vector of target pointers for all private arguments
  SmallVector<void *> TgtPtrs;

  /// A vector of information of all first-private arguments to be packed
  SmallVector<FirstPrivateArgInfoTy> FirstPrivateArgInfo;
  /// Host buffer for all arguments to be packed
  SmallVector<char> FirstPrivateArgBuffer;
  /// The total size of all arguments to be packed
  int64_t FirstPrivateArgSize = 0;

  /// A reference to the \p DeviceTy object
  DeviceTy &Device;
  /// A reference to the \p AsyncInfoTy object
  AsyncInfoTy &AsyncInfo;

  // TODO: What would be the best value here? Should we make it configurable?
  // If the size is larger than this threshold, we will allocate and transfer
  // it immediately instead of packing it.
  static constexpr const int64_t FirstPrivateArgSizeThreshold = 1024;

public:
  /// Constructor
  PrivateArgumentManagerTy(DeviceTy &Dev, AsyncInfoTy &AsyncInfo)
      : Device(Dev), AsyncInfo(AsyncInfo) {}

  /// Add a private argument
  int addArg(void *HstPtr, int64_t ArgSize, int64_t ArgOffset,
             bool IsFirstPrivate, void *&TgtPtr, int TgtArgsIndex,
             map_var_info_t HstPtrName = nullptr,
             const bool AllocImmediately = false) {
    // If the argument is not first-private, or its size is greater than a
    // predefined threshold, we will allocate memory and issue the transfer
    // immediately.
    if (ArgSize > FirstPrivateArgSizeThreshold || !IsFirstPrivate ||
        AllocImmediately) {
      TgtPtr = Device.allocData(ArgSize, HstPtr);
      if (!TgtPtr) {
        DP("Data allocation for %sprivate array " DPxMOD " failed.\n",
           (IsFirstPrivate ? "first-" : ""), DPxPTR(HstPtr));
        return OFFLOAD_FAIL;
      }
#ifdef OMPTARGET_DEBUG
      void *TgtPtrBase = (void *)((intptr_t)TgtPtr + ArgOffset);
      DP("Allocated %" PRId64 " bytes of target memory at " DPxMOD
         " for %sprivate array " DPxMOD " - pushing target argument " DPxMOD
         "\n",
         ArgSize, DPxPTR(TgtPtr), (IsFirstPrivate ? "first-" : ""),
         DPxPTR(HstPtr), DPxPTR(TgtPtrBase));
#endif
      // If first-private, copy data from the host.
      if (IsFirstPrivate) {
        DP("Submitting firstprivate data to the device.\n");
        int Ret = Device.submitData(TgtPtr, HstPtr, ArgSize, AsyncInfo);
        if (Ret != OFFLOAD_SUCCESS) {
          DP("Copying data to device failed.\n");
          return OFFLOAD_FAIL;
        }
      }
      TgtPtrs.push_back(TgtPtr);
    } else {
      DP("Firstprivate array " DPxMOD " of size %" PRId64 " will be packed\n",
         DPxPTR(HstPtr), ArgSize);
      // When we reach this point, the argument must meet all of the following
      // requirements:
      // 1. Its size does not exceed the threshold (see the comment for
      //    FirstPrivateArgSizeThreshold);
      // 2. It must be first-private (it needs to be mapped to the target
      //    device).
      // We pack all such arguments to transfer them all at once and reduce
      // the number of data transfers. We do not consider non-first-private
      // arguments, i.e., private arguments that don't need to be mapped to
      // the target device, because data allocation can be very efficient
      // with the memory manager.
      // Placeholder value.
      TgtPtr = nullptr;
      auto *LastFPArgInfo =
          FirstPrivateArgInfo.empty() ? nullptr : &FirstPrivateArgInfo.back();

      // Compute the start alignment of this entry; add padding if necessary.
      // TODO: Consider sorting instead.
      uint32_t Padding = 0;
      uint32_t StartAlignment =
          LastFPArgInfo ? LastFPArgInfo->Alignment : MaxAlignment;
      if (LastFPArgInfo) {
        // Check if we keep the start alignment or if it is shrunk due to the
        // size of the last element.
        uint32_t Offset = LastFPArgInfo->Size % StartAlignment;
        if (Offset)
          StartAlignment = Offset;
        // We only need as much alignment as the host pointer had (since we
        // don't know the alignment information from the source, we might end
        // up over-aligning accesses, but not by too much).
        uint32_t RequiredAlignment =
            llvm::bit_floor(getPartialStructRequiredAlignment(HstPtr));
        if (RequiredAlignment > StartAlignment) {
          Padding = RequiredAlignment - StartAlignment;
          StartAlignment = RequiredAlignment;
        }
      }

      FirstPrivateArgInfo.emplace_back(TgtArgsIndex, HstPtr, ArgSize,
                                       StartAlignment, Padding, HstPtrName);
      FirstPrivateArgSize += Padding + ArgSize;
    }

    return OFFLOAD_SUCCESS;
  }
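
  // For illustration (made-up arguments): packing an int followed by a
  // double lays out the staging buffer as
  //
  //   [int : 4 bytes][pad : 4 bytes][double : 8 bytes]
  //
  // because after the 4-byte int the running alignment drops to 4, and the
  // 8-byte-aligned double therefore needs 4 bytes of padding. The same
  // padding is applied on the device side in packAndTransfer() below, so
  // host and device offsets stay in sync.
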
  /// Pack first-private arguments, replace placeholder pointers in \p
  /// TgtArgs, and start the transfer.
  int packAndTransfer(SmallVector<void *> &TgtArgs) {
    if (!FirstPrivateArgInfo.empty()) {
      assert(FirstPrivateArgSize != 0 &&
             "FirstPrivateArgSize is 0 but FirstPrivateArgInfo is empty");
      FirstPrivateArgBuffer.resize(FirstPrivateArgSize, 0);
      auto *Itr = FirstPrivateArgBuffer.begin();
      // Copy all host data to this buffer.
      for (FirstPrivateArgInfoTy &Info : FirstPrivateArgInfo) {
        // First pad the pointer as we (have to) pad it on the device too.
        Itr = std::next(Itr, Info.Padding);
        std::copy(Info.HstPtrBegin, Info.HstPtrEnd, Itr);
        Itr = std::next(Itr, Info.Size);
      }
      // Allocate target memory.
      void *TgtPtr =
          Device.allocData(FirstPrivateArgSize, FirstPrivateArgBuffer.data());
      if (TgtPtr == nullptr) {
        DP("Failed to allocate target memory for private arguments.\n");
        return OFFLOAD_FAIL;
      }
      TgtPtrs.push_back(TgtPtr);
      DP("Allocated %" PRId64 " bytes of target memory at " DPxMOD "\n",
         FirstPrivateArgSize, DPxPTR(TgtPtr));
      // Transfer data to the target device.
      int Ret = Device.submitData(TgtPtr, FirstPrivateArgBuffer.data(),
                                  FirstPrivateArgSize, AsyncInfo);
      if (Ret != OFFLOAD_SUCCESS) {
        DP("Failed to submit data of private arguments.\n");
        return OFFLOAD_FAIL;
      }
      // Fill in all placeholder pointers.
      auto TP = reinterpret_cast<uintptr_t>(TgtPtr);
      for (FirstPrivateArgInfoTy &Info : FirstPrivateArgInfo) {
        void *&Ptr = TgtArgs[Info.Index];
        assert(Ptr == nullptr && "Target pointer is already set by mistake");
        // Pad the device pointer to get the right alignment.
        TP += Info.Padding;
        Ptr = reinterpret_cast<void *>(TP);
        TP += Info.Size;
        DP("Firstprivate array " DPxMOD " of size %" PRId64 " mapped to "
           DPxMOD "\n",
           DPxPTR(Info.HstPtrBegin), Info.HstPtrEnd - Info.HstPtrBegin,
           DPxPTR(Ptr));
      }
    }

    return OFFLOAD_SUCCESS;
  }

  /// Free all target memory allocated for private arguments.
  int free() {
    for (void *P : TgtPtrs) {
      int Ret = Device.deleteData(P);
      if (Ret != OFFLOAD_SUCCESS) {
        DP("Deallocation of (first-)private arrays failed.\n");
        return OFFLOAD_FAIL;
      }
    }

    TgtPtrs.clear();

    return OFFLOAD_SUCCESS;
  }
};

/// Process data before launching the kernel, including calling
/// targetDataBegin to map and transfer data to the target device, and
/// transferring (first-)private variables.
static int processDataBefore(ident_t *Loc, int64_t DeviceId, void *HostPtr,
                             int32_t ArgNum, void **ArgBases, void **Args,
                             int64_t *ArgSizes, int64_t *ArgTypes,
                             map_var_info_t *ArgNames, void **ArgMappers,
                             SmallVector<void *> &TgtArgs,
                             SmallVector<ptrdiff_t> &TgtOffsets,
                             PrivateArgumentManagerTy &PrivateArgumentManager,
                             AsyncInfoTy &AsyncInfo) {

  auto DeviceOrErr = PM->getDevice(DeviceId);
  if (!DeviceOrErr)
    FATAL_MESSAGE(DeviceId, "%s", toString(DeviceOrErr.takeError()).c_str());

  int Ret = targetDataBegin(Loc, *DeviceOrErr, ArgNum, ArgBases, Args,
                            ArgSizes, ArgTypes, ArgNames, ArgMappers,
                            AsyncInfo);
  if (Ret != OFFLOAD_SUCCESS) {
    REPORT("Call to targetDataBegin failed, abort target.\n");
    return OFFLOAD_FAIL;
  }

  // List of (first-)private arrays allocated for this target region.
  SmallVector<int> TgtArgsPositions(ArgNum, -1);

  for (int32_t I = 0; I < ArgNum; ++I) {
    if (!(ArgTypes[I] & OMP_TGT_MAPTYPE_TARGET_PARAM)) {
      // This is not a target parameter; do not push it into TgtArgs.
      // Check for lambda mapping.
      if (isLambdaMapping(ArgTypes[I])) {
        assert((ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) &&
               "PTR_AND_OBJ must also be MEMBER_OF.");
        unsigned Idx = getParentIndex(ArgTypes[I]);
        int TgtIdx = TgtArgsPositions[Idx];
        assert(TgtIdx != -1 && "Base address must be translated already.");
        // The parent lambda must be processed already and it must be the last
        // in the TgtArgs and TgtOffsets arrays.
        void *HstPtrVal = Args[I];
        void *HstPtrBegin = ArgBases[I];
        void *HstPtrBase = Args[Idx];
        void *TgtPtrBase =
            (void *)((intptr_t)TgtArgs[TgtIdx] + TgtOffsets[TgtIdx]);
        DP("Parent lambda base " DPxMOD "\n", DPxPTR(TgtPtrBase));
        uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase;
        void *TgtPtrBegin = (void *)((uintptr_t)TgtPtrBase + Delta);
        void *&PointerTgtPtrBegin = AsyncInfo.getVoidPtrLocation();
        TargetPointerResultTy TPR =
            DeviceOrErr->getMappingInfo().getTgtPtrBegin(
                HstPtrVal, ArgSizes[I], /*UpdateRefCount=*/false,
                /*UseHoldRefCount=*/false);
        PointerTgtPtrBegin = TPR.TargetPointer;
        if (!TPR.isPresent()) {
          DP("No lambda captured variable mapped (" DPxMOD ") - ignored\n",
             DPxPTR(HstPtrVal));
          continue;
        }
        if (TPR.Flags.IsHostPointer) {
          DP("Unified memory is active, no need to map lambda captured "
             "variable (" DPxMOD ")\n",
             DPxPTR(HstPtrVal));
          continue;
        }
        DP("Update lambda reference (" DPxMOD ") -> [" DPxMOD "]\n",
           DPxPTR(PointerTgtPtrBegin), DPxPTR(TgtPtrBegin));
        Ret =
            DeviceOrErr->submitData(TgtPtrBegin, &PointerTgtPtrBegin,
                                    sizeof(void *), AsyncInfo, TPR.getEntry());
        if (Ret != OFFLOAD_SUCCESS) {
          REPORT("Copying data to device failed.\n");
          return OFFLOAD_FAIL;
        }
      }
      continue;
    }
    void *HstPtrBegin = Args[I];
    void *HstPtrBase = ArgBases[I];
    void *TgtPtrBegin;
    map_var_info_t HstPtrName = (!ArgNames) ? nullptr : ArgNames[I];
    ptrdiff_t TgtBaseOffset;
    TargetPointerResultTy TPR;
    if (ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) {
      DP("Forwarding first-private value " DPxMOD " to the target construct\n",
         DPxPTR(HstPtrBase));
      TgtPtrBegin = HstPtrBase;
      TgtBaseOffset = 0;
    } else if (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE) {
      TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin;
      const bool IsFirstPrivate = (ArgTypes[I] & OMP_TGT_MAPTYPE_TO);
      // If there is a next argument and it depends on the current one, we
      // need to allocate the private memory immediately. If this is not the
      // case, then the argument can be marked for optimization and packed
      // with the other privates.
      const bool AllocImmediately =
          (I < ArgNum - 1 && (ArgTypes[I + 1] & OMP_TGT_MAPTYPE_MEMBER_OF));
      Ret = PrivateArgumentManager.addArg(
          HstPtrBegin, ArgSizes[I], TgtBaseOffset, IsFirstPrivate, TgtPtrBegin,
          TgtArgs.size(), HstPtrName, AllocImmediately);
      if (Ret != OFFLOAD_SUCCESS) {
        REPORT("Failed to process %sprivate argument " DPxMOD "\n",
               (IsFirstPrivate ? "first-" : ""), DPxPTR(HstPtrBegin));
        return OFFLOAD_FAIL;
      }
    } else {
      if (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)
        HstPtrBase = *reinterpret_cast<void **>(HstPtrBase);
      TPR = DeviceOrErr->getMappingInfo().getTgtPtrBegin(
          HstPtrBegin, ArgSizes[I],
          /*UpdateRefCount=*/false,
          /*UseHoldRefCount=*/false);
      TgtPtrBegin = TPR.TargetPointer;
      TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin;
#ifdef OMPTARGET_DEBUG
      void *TgtPtrBase = (void *)((intptr_t)TgtPtrBegin + TgtBaseOffset);
      DP("Obtained target argument " DPxMOD " from host pointer " DPxMOD "\n",
         DPxPTR(TgtPtrBase), DPxPTR(HstPtrBegin));
#endif
    }
    TgtArgsPositions[I] = TgtArgs.size();
    TgtArgs.push_back(TgtPtrBegin);
    TgtOffsets.push_back(TgtBaseOffset);
  }

  assert(TgtArgs.size() == TgtOffsets.size() &&
         "Size mismatch in arguments and offsets");

  // Pack and transfer first-private arguments.
  Ret = PrivateArgumentManager.packAndTransfer(TgtArgs);
  if (Ret != OFFLOAD_SUCCESS) {
    DP("Failed to pack and transfer first-private arguments\n");
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

/// Process data after launching the kernel, including transferring data back
/// to the host if needed and deallocating target memory of (first-)private
/// variables.
static int processDataAfter(ident_t *Loc, int64_t DeviceId, void *HostPtr,
                            int32_t ArgNum, void **ArgBases, void **Args,
                            int64_t *ArgSizes, int64_t *ArgTypes,
                            map_var_info_t *ArgNames, void **ArgMappers,
                            PrivateArgumentManagerTy &PrivateArgumentManager,
                            AsyncInfoTy &AsyncInfo) {

  auto DeviceOrErr = PM->getDevice(DeviceId);
  if (!DeviceOrErr)
    FATAL_MESSAGE(DeviceId, "%s", toString(DeviceOrErr.takeError()).c_str());

  // Move data from the device.
  int Ret = targetDataEnd(Loc, *DeviceOrErr, ArgNum, ArgBases, Args, ArgSizes,
                          ArgTypes, ArgNames, ArgMappers, AsyncInfo);
  if (Ret != OFFLOAD_SUCCESS) {
    REPORT("Call to targetDataEnd failed, abort target.\n");
    return OFFLOAD_FAIL;
  }

  // Free target memory for private arguments after synchronization.
  // TODO: We might want to remove `mutable` in the future by not changing the
  // captured variables somehow.
  AsyncInfo.addPostProcessingFunction(
      [PrivateArgumentManager =
           std::move(PrivateArgumentManager)]() mutable -> int {
        int Ret = PrivateArgumentManager.free();
        if (Ret != OFFLOAD_SUCCESS) {
          REPORT("Failed to deallocate target memory for private args\n");
          return OFFLOAD_FAIL;
        }
        return Ret;
      });

  return OFFLOAD_SUCCESS;
}
} // namespace
/// Performs the same actions as data_begin in case arg_num is
/// non-zero and initiates the run of the offloaded region on the target
/// platform; if arg_num is non-zero, after the region execution is done it
/// also performs the same actions as data_update and data_end above. This
/// function returns 0 if it was able to transfer the execution to a target
/// and an integer different from zero otherwise.
int target(ident_t *Loc, DeviceTy &Device, void *HostPtr,
           KernelArgsTy &KernelArgs, AsyncInfoTy &AsyncInfo) {
  int32_t DeviceId = Device.DeviceID;
  TableMap *TM = getTableMap(HostPtr);
  // No map for this host pointer found!
  if (!TM) {
    REPORT("Host ptr " DPxMOD " does not have a matching target pointer.\n",
           DPxPTR(HostPtr));
    return OFFLOAD_FAIL;
  }

  // Get the target table.
  __tgt_target_table *TargetTable = nullptr;
  {
    std::lock_guard<std::mutex> TrlTblLock(PM->TrlTblMtx);
    assert(TM->Table->TargetsTable.size() > (size_t)DeviceId &&
           "Not expecting a device ID outside the table's bounds!");
    TargetTable = TM->Table->TargetsTable[DeviceId];
  }
  assert(TargetTable && "Global data has not been mapped\n");

  DP("loop trip count is %" PRIu64 ".\n", KernelArgs.Tripcount);

  // We need to keep bases and offsets separate. Sometimes (e.g. in OpenCL) we
  // need to manifest base pointers prior to launching a kernel. Even if we
  // have mapped an object only partially, e.g. A[N:M], although the kernel is
  // expected to access elements starting at address &A[N] and beyond, we
  // still need to manifest the base of the array &A[0]. In other cases, e.g.
  // the COI API, we need the begin address itself, i.e. &A[N], as the API
  // operates on begin addresses, not bases. That's why we pass args and
  // offsets as two separate entities so that each plugin can do what it
  // needs. This behavior was introduced via https://reviews.llvm.org/D33028
  // and commit 1546d319244c.
  SmallVector<void *> TgtArgs;
  SmallVector<ptrdiff_t> TgtOffsets;

  PrivateArgumentManagerTy PrivateArgumentManager(Device, AsyncInfo);

  int NumClangLaunchArgs = KernelArgs.NumArgs;
  int Ret = OFFLOAD_SUCCESS;
  if (NumClangLaunchArgs) {
    // Process data, such as data mapping, before launching the kernel.
    Ret = processDataBefore(Loc, DeviceId, HostPtr, NumClangLaunchArgs,
                            KernelArgs.ArgBasePtrs, KernelArgs.ArgPtrs,
                            KernelArgs.ArgSizes, KernelArgs.ArgTypes,
                            KernelArgs.ArgNames, KernelArgs.ArgMappers,
                            TgtArgs, TgtOffsets, PrivateArgumentManager,
                            AsyncInfo);
    if (Ret != OFFLOAD_SUCCESS) {
      REPORT("Failed to process data before launching the kernel.\n");
      return OFFLOAD_FAIL;
    }

    // Clang might pass more values via the ArgPtrs to the runtime than we
    // pass on to the kernel.
    // TODO: Next time we adjust the KernelArgsTy we should introduce a new
    // NumKernelArgs field.
    KernelArgs.NumArgs = TgtArgs.size();
  }

  // Launch device execution.
  void *TgtEntryPtr = TargetTable->EntriesBegin[TM->Index].Address;
  DP("Launching target execution %s with pointer " DPxMOD " (index=%d).\n",
     TargetTable->EntriesBegin[TM->Index].SymbolName, DPxPTR(TgtEntryPtr),
     TM->Index);

  {
    assert(KernelArgs.NumArgs == TgtArgs.size() && "Argument count mismatch!");
    TIMESCOPE_WITH_DETAILS_AND_IDENT(
        "Kernel Target",
        "NumArguments=" + std::to_string(KernelArgs.NumArgs) +
            ";NumTeams=" + std::to_string(KernelArgs.NumTeams[0]) +
            ";TripCount=" + std::to_string(KernelArgs.Tripcount),
        Loc);

#ifdef OMPT_SUPPORT
    /// RAII to establish tool anchors before and after kernel launch.
    int32_t NumTeams = KernelArgs.NumTeams[0];
    // No need to guard this with OMPT_IF_BUILT.
    InterfaceRAII TargetSubmitRAII(
        RegionInterface.getCallbacks<ompt_callback_target_submit>(), NumTeams);
#endif

    Ret = Device.launchKernel(TgtEntryPtr, TgtArgs.data(), TgtOffsets.data(),
                              KernelArgs, AsyncInfo);
  }

  if (Ret != OFFLOAD_SUCCESS) {
    REPORT("Executing target region failed, abort target.\n");
    return OFFLOAD_FAIL;
  }

  if (NumClangLaunchArgs) {
    // Transfer data back and deallocate target memory for (first-)private
    // variables.
    Ret = processDataAfter(Loc, DeviceId, HostPtr, NumClangLaunchArgs,
                           KernelArgs.ArgBasePtrs, KernelArgs.ArgPtrs,
                           KernelArgs.ArgSizes, KernelArgs.ArgTypes,
                           KernelArgs.ArgNames, KernelArgs.ArgMappers,
                           PrivateArgumentManager, AsyncInfo);
    if (Ret != OFFLOAD_SUCCESS) {
      REPORT("Failed to process data after launching the kernel.\n");
      return OFFLOAD_FAIL;
    }
  }

  return OFFLOAD_SUCCESS;
}

/// Enables the record-replay mechanism by pre-allocating MemorySize bytes
/// and informing the record-replayer of whether to store the output
/// in some file.
int target_activate_rr(DeviceTy &Device, uint64_t MemorySize, void *VAddr,
                       bool IsRecord, bool SaveOutput,
                       uint64_t &ReqPtrArgOffset) {
  return Device.RTL->initialize_record_replay(Device.DeviceID, MemorySize,
                                              VAddr, IsRecord, SaveOutput,
                                              ReqPtrArgOffset);
}

/// Executes a kernel using pre-recorded information for loading to
/// device memory to launch the target kernel with the pre-recorded
/// configuration.
int target_replay(ident_t *Loc, DeviceTy &Device, void *HostPtr,
                  void *DeviceMemory, int64_t DeviceMemorySize, void **TgtArgs,
                  ptrdiff_t *TgtOffsets, int32_t NumArgs, int32_t NumTeams,
                  int32_t ThreadLimit, uint64_t LoopTripCount,
                  AsyncInfoTy &AsyncInfo) {
  int32_t DeviceId = Device.DeviceID;
  TableMap *TM = getTableMap(HostPtr);
  // Fail if the table map fails to find the target kernel pointer for the
  // provided host pointer.
  if (!TM) {
    REPORT("Host ptr " DPxMOD " does not have a matching target pointer.\n",
           DPxPTR(HostPtr));
    return OFFLOAD_FAIL;
  }

  // Retrieve the target table of offloading entries.
  __tgt_target_table *TargetTable = nullptr;
  {
    std::lock_guard<std::mutex> TrlTblLock(PM->TrlTblMtx);
    assert(TM->Table->TargetsTable.size() > (size_t)DeviceId &&
           "Not expecting a device ID outside the table's bounds!");
    TargetTable = TM->Table->TargetsTable[DeviceId];
  }
  assert(TargetTable && "Global data has not been mapped\n");

  // Retrieve the target kernel pointer, allocate and store the recorded
  // device memory data, and launch device execution.
  void *TgtEntryPtr = TargetTable->EntriesBegin[TM->Index].Address;
  DP("Launching target execution %s with pointer " DPxMOD " (index=%d).\n",
     TargetTable->EntriesBegin[TM->Index].SymbolName, DPxPTR(TgtEntryPtr),
     TM->Index);

  void *TgtPtr = Device.allocData(DeviceMemorySize, /*HstPtr=*/nullptr,
                                  TARGET_ALLOC_DEFAULT);
  Device.submitData(TgtPtr, DeviceMemory, DeviceMemorySize, AsyncInfo);

  KernelArgsTy KernelArgs{};
  KernelArgs.Version = OMP_KERNEL_ARG_VERSION;
  KernelArgs.NumArgs = NumArgs;
  KernelArgs.Tripcount = LoopTripCount;
  KernelArgs.NumTeams[0] = NumTeams;
  KernelArgs.ThreadLimit[0] = ThreadLimit;

  int Ret = Device.launchKernel(TgtEntryPtr, TgtArgs, TgtOffsets, KernelArgs,
                                AsyncInfo);

  if (Ret != OFFLOAD_SUCCESS) {
    REPORT("Executing target region failed, abort target.\n");
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}