//===- AMDGPUAttributor.cpp -----------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass uses the Attributor framework to deduce AMDGPU attributes.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/Analysis/CycleAnalysis.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/IPO/Attributor.h"

#define DEBUG_TYPE "amdgpu-attributor"

namespace llvm {
void initializeCycleInfoWrapperPassPass(PassRegistry &);
} // namespace llvm

using namespace llvm;

static cl::opt<unsigned> KernargPreloadCount(
    "amdgpu-kernarg-preload-count",
    cl::desc("How many kernel arguments to preload onto SGPRs"), cl::init(0));

static cl::opt<unsigned> IndirectCallSpecializationThreshold(
    "amdgpu-indirect-call-specialization-threshold",
    cl::desc("A threshold that controls whether an indirect call will be "
             "specialized"),
    cl::init(3));

#define AMDGPU_ATTRIBUTE(Name, Str) Name##_POS,

enum ImplicitArgumentPositions {
  #include "AMDGPUAttributes.def"
  LAST_ARG_POS
};

#define AMDGPU_ATTRIBUTE(Name, Str) Name = 1 << Name##_POS,

enum ImplicitArgumentMask {
  NOT_IMPLICIT_INPUT = 0,
  #include "AMDGPUAttributes.def"
  ALL_ARGUMENT_MASK = (1 << LAST_ARG_POS) - 1
};

#define AMDGPU_ATTRIBUTE(Name, Str) {Name, Str},
static constexpr std::pair<ImplicitArgumentMask, StringLiteral>
    ImplicitAttrs[] = {
#include "AMDGPUAttributes.def"
};

// We do not need to note the x workitem or workgroup id because they are always
// initialized.
//
// TODO: We should not add the attributes if the known compile time workgroup
// size is 1 for y/z.
static ImplicitArgumentMask
intrinsicToAttrMask(Intrinsic::ID ID, bool &NonKernelOnly, bool &NeedsImplicit,
                    bool HasApertureRegs, bool SupportsGetDoorBellID,
                    unsigned CodeObjectVersion) {
  switch (ID) {
  case Intrinsic::amdgcn_workitem_id_x:
    NonKernelOnly = true;
    return WORKITEM_ID_X;
  case Intrinsic::amdgcn_workgroup_id_x:
    NonKernelOnly = true;
    return WORKGROUP_ID_X;
  case Intrinsic::amdgcn_workitem_id_y:
  case Intrinsic::r600_read_tidig_y:
    return WORKITEM_ID_Y;
  case Intrinsic::amdgcn_workitem_id_z:
  case Intrinsic::r600_read_tidig_z:
    return WORKITEM_ID_Z;
  case Intrinsic::amdgcn_workgroup_id_y:
  case Intrinsic::r600_read_tgid_y:
    return WORKGROUP_ID_Y;
  case Intrinsic::amdgcn_workgroup_id_z:
  case Intrinsic::r600_read_tgid_z:
    return WORKGROUP_ID_Z;
  case Intrinsic::amdgcn_lds_kernel_id:
    return LDS_KERNEL_ID;
  case Intrinsic::amdgcn_dispatch_ptr:
    return DISPATCH_PTR;
  case Intrinsic::amdgcn_dispatch_id:
    return DISPATCH_ID;
  case Intrinsic::amdgcn_implicitarg_ptr:
    return IMPLICIT_ARG_PTR;
  // Need queue_ptr anyway. But under V5, we also need implicitarg_ptr to access
  // queue_ptr.
  case Intrinsic::amdgcn_queue_ptr:
    NeedsImplicit = (CodeObjectVersion >= AMDGPU::AMDHSA_COV5);
    return QUEUE_PTR;
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private:
    if (HasApertureRegs)
      return NOT_IMPLICIT_INPUT;
    // Under V5, we need implicitarg_ptr + offsets to access private_base or
    // shared_base. For pre-V5, however, we need to access them through
    // queue_ptr + offsets.
    return CodeObjectVersion >= AMDGPU::AMDHSA_COV5 ? IMPLICIT_ARG_PTR
                                                    : QUEUE_PTR;
  case Intrinsic::trap:
    if (SupportsGetDoorBellID) // GetDoorbellID support implemented since V4.
      return CodeObjectVersion >= AMDGPU::AMDHSA_COV4 ? NOT_IMPLICIT_INPUT
                                                      : QUEUE_PTR;
    NeedsImplicit = (CodeObjectVersion >= AMDGPU::AMDHSA_COV5);
    return QUEUE_PTR;
  default:
    return NOT_IMPLICIT_INPUT;
  }
}

static bool castRequiresQueuePtr(unsigned SrcAS) {
  return SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::PRIVATE_ADDRESS;
}

static bool isDSAddress(const Constant *C) {
  const GlobalValue *GV = dyn_cast<GlobalValue>(C);
  if (!GV)
    return false;
  unsigned AS = GV->getAddressSpace();
  return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS;
}

/// Returns true if the function requires the implicit argument be passed
/// regardless of the function contents.
static bool funcRequiresHostcallPtr(const Function &F) {
  // Sanitizers require the hostcall buffer passed in the implicit arguments.
  return F.hasFnAttribute(Attribute::SanitizeAddress) ||
         F.hasFnAttribute(Attribute::SanitizeThread) ||
         F.hasFnAttribute(Attribute::SanitizeMemory) ||
         F.hasFnAttribute(Attribute::SanitizeHWAddress) ||
         F.hasFnAttribute(Attribute::SanitizeMemTag);
}

namespace {
class AMDGPUInformationCache : public InformationCache {
public:
  AMDGPUInformationCache(const Module &M, AnalysisGetter &AG,
                         BumpPtrAllocator &Allocator,
                         SetVector<Function *> *CGSCC, TargetMachine &TM)
      : InformationCache(M, AG, Allocator, CGSCC), TM(TM),
        CodeObjectVersion(AMDGPU::getAMDHSACodeObjectVersion(M)) {}

  TargetMachine &TM;

  enum ConstantStatus : uint8_t {
    NONE = 0,
    DS_GLOBAL = 1 << 0,
    ADDR_SPACE_CAST_PRIVATE_TO_FLAT = 1 << 1,
    ADDR_SPACE_CAST_LOCAL_TO_FLAT = 1 << 2,
    ADDR_SPACE_CAST_BOTH_TO_FLAT =
        ADDR_SPACE_CAST_PRIVATE_TO_FLAT | ADDR_SPACE_CAST_LOCAL_TO_FLAT
  };

  /// Check if the subtarget has aperture regs.
  bool hasApertureRegs(Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.hasApertureRegs();
  }

  /// Check if the subtarget supports GetDoorbellID.
  bool supportsGetDoorbellID(Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.supportsGetDoorbellID();
  }

  std::optional<std::pair<unsigned, unsigned>>
  getFlatWorkGroupSizeAttr(const Function &F) const {
    auto R = AMDGPU::getIntegerPairAttribute(F, "amdgpu-flat-work-group-size");
    if (!R)
      return std::nullopt;
    return std::make_pair(R->first, *(R->second));
  }

  std::pair<unsigned, unsigned>
  getDefaultFlatWorkGroupSize(const Function &F) const {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.getDefaultFlatWorkGroupSize(F.getCallingConv());
  }

  std::pair<unsigned, unsigned>
  getMaximumFlatWorkGroupRange(const Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return {ST.getMinFlatWorkGroupSize(), ST.getMaxFlatWorkGroupSize()};
  }

  SmallVector<unsigned> getMaxNumWorkGroups(const Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.getMaxNumWorkGroups(F);
  }

  /// Get code object version.
  unsigned getCodeObjectVersion() const { return CodeObjectVersion; }

  /// Get the effective value of "amdgpu-waves-per-eu" for the function,
  /// accounting for the interaction with the passed value to use for
  /// "amdgpu-flat-work-group-size".
  std::pair<unsigned, unsigned>
  getWavesPerEU(const Function &F,
                std::pair<unsigned, unsigned> FlatWorkGroupSize) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.getWavesPerEU(F, FlatWorkGroupSize);
  }

  std::optional<std::pair<unsigned, unsigned>>
  getWavesPerEUAttr(const Function &F) {
    auto Val = AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu",
                                               /*OnlyFirstRequired=*/true);
    if (!Val)
      return std::nullopt;
    if (!Val->second) {
      const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
      Val->second = ST.getMaxWavesPerEU();
    }
    return std::make_pair(Val->first, *(Val->second));
  }

  std::pair<unsigned, unsigned>
  getEffectiveWavesPerEU(const Function &F,
                         std::pair<unsigned, unsigned> WavesPerEU,
                         std::pair<unsigned, unsigned> FlatWorkGroupSize) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.getEffectiveWavesPerEU(WavesPerEU, FlatWorkGroupSize);
  }

  unsigned getMaxWavesPerEU(const Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.getMaxWavesPerEU();
  }

private:
  /// Check if the ConstantExpr \p CE uses an addrspacecast from private or
  /// local to flat. These casts may require the queue pointer.
  static uint8_t visitConstExpr(const ConstantExpr *CE) {
    uint8_t Status = NONE;

    if (CE->getOpcode() == Instruction::AddrSpaceCast) {
      unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace();
      if (SrcAS == AMDGPUAS::PRIVATE_ADDRESS)
        Status |= ADDR_SPACE_CAST_PRIVATE_TO_FLAT;
      else if (SrcAS == AMDGPUAS::LOCAL_ADDRESS)
        Status |= ADDR_SPACE_CAST_LOCAL_TO_FLAT;
    }

    return Status;
  }

  /// Get the constant access bitmap for \p C.
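  /// The bitmap is computed by recursively visiting every constant reachable
  /// through \p C's operands, so casts buried inside constant expressions are
  /// accounted for as well.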
  uint8_t getConstantAccess(const Constant *C,
                            SmallPtrSetImpl<const Constant *> &Visited) {
    auto It = ConstantStatus.find(C);
    if (It != ConstantStatus.end())
      return It->second;

    uint8_t Result = 0;
    if (isDSAddress(C))
      Result = DS_GLOBAL;

    if (const auto *CE = dyn_cast<ConstantExpr>(C))
      Result |= visitConstExpr(CE);

    for (const Use &U : C->operands()) {
      const auto *OpC = dyn_cast<Constant>(U);
      if (!OpC || !Visited.insert(OpC).second)
        continue;

      Result |= getConstantAccess(OpC, Visited);
    }
    return Result;
  }

public:
  /// Returns true if \p Fn needs the queue pointer because of \p C.
  bool needsQueuePtr(const Constant *C, Function &Fn) {
    bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(Fn.getCallingConv());
    bool HasAperture = hasApertureRegs(Fn);

    // No need to explore the constants.
    if (!IsNonEntryFunc && HasAperture)
      return false;

    SmallPtrSet<const Constant *, 8> Visited;
    uint8_t Access = getConstantAccess(C, Visited);

    // We need to trap on DS globals in non-entry functions.
    if (IsNonEntryFunc && (Access & DS_GLOBAL))
      return true;

    return !HasAperture && (Access & ADDR_SPACE_CAST_BOTH_TO_FLAT);
  }

  bool checkConstForAddrSpaceCastFromPrivate(const Constant *C) {
    SmallPtrSet<const Constant *, 8> Visited;
    uint8_t Access = getConstantAccess(C, Visited);
    return Access & ADDR_SPACE_CAST_PRIVATE_TO_FLAT;
  }

private:
  /// Used to determine if the Constant needs the queue pointer.
  DenseMap<const Constant *, uint8_t> ConstantStatus;
  const unsigned CodeObjectVersion;
};

struct AAAMDAttributes
    : public StateWrapper<BitIntegerState<uint32_t, ALL_ARGUMENT_MASK, 0>,
                          AbstractAttribute> {
  using Base = StateWrapper<BitIntegerState<uint32_t, ALL_ARGUMENT_MASK, 0>,
                            AbstractAttribute>;

  AAAMDAttributes(const IRPosition &IRP, Attributor &A) : Base(IRP) {}

  /// Create an abstract attribute view for the position \p IRP.
  static AAAMDAttributes &createForPosition(const IRPosition &IRP,
                                            Attributor &A);

  /// See AbstractAttribute::getName().
  const std::string getName() const override { return "AAAMDAttributes"; }

  /// See AbstractAttribute::getIdAddr().
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAAMDAttributes.
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  /// Unique ID (due to the unique address)
  static const char ID;
};
const char AAAMDAttributes::ID = 0;

struct AAUniformWorkGroupSize
    : public StateWrapper<BooleanState, AbstractAttribute> {
  using Base = StateWrapper<BooleanState, AbstractAttribute>;
  AAUniformWorkGroupSize(const IRPosition &IRP, Attributor &A) : Base(IRP) {}

  /// Create an abstract attribute view for the position \p IRP.
  static AAUniformWorkGroupSize &createForPosition(const IRPosition &IRP,
                                                   Attributor &A);

  /// See AbstractAttribute::getName().
  const std::string getName() const override {
    return "AAUniformWorkGroupSize";
  }

  /// See AbstractAttribute::getIdAddr().
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAUniformWorkGroupSize.
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  /// Unique ID (due to the unique address)
  static const char ID;
};
const char AAUniformWorkGroupSize::ID = 0;

struct AAUniformWorkGroupSizeFunction : public AAUniformWorkGroupSize {
  AAUniformWorkGroupSizeFunction(const IRPosition &IRP, Attributor &A)
      : AAUniformWorkGroupSize(IRP, A) {}

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();
    CallingConv::ID CC = F->getCallingConv();

    if (CC != CallingConv::AMDGPU_KERNEL)
      return;

    bool InitialValue = false;
    if (F->hasFnAttribute("uniform-work-group-size"))
      InitialValue =
          F->getFnAttribute("uniform-work-group-size").getValueAsString() ==
          "true";

    if (InitialValue)
      indicateOptimisticFixpoint();
    else
      indicatePessimisticFixpoint();
  }

  ChangeStatus updateImpl(Attributor &A) override {
    ChangeStatus Change = ChangeStatus::UNCHANGED;

    auto CheckCallSite = [&](AbstractCallSite CS) {
      Function *Caller = CS.getInstruction()->getFunction();
      LLVM_DEBUG(dbgs() << "[AAUniformWorkGroupSize] Call " << Caller->getName()
                        << "->" << getAssociatedFunction()->getName() << "\n");

      const auto *CallerInfo = A.getAAFor<AAUniformWorkGroupSize>(
          *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);
      if (!CallerInfo || !CallerInfo->isValidState())
        return false;

      Change = Change | clampStateAndIndicateChange(this->getState(),
                                                    CallerInfo->getState());

      return true;
    };

    bool AllCallSitesKnown = true;
    if (!A.checkForAllCallSites(CheckCallSite, *this, true, AllCallSitesKnown))
      return indicatePessimisticFixpoint();

    return Change;
  }

  ChangeStatus manifest(Attributor &A) override {
    SmallVector<Attribute, 8> AttrList;
    LLVMContext &Ctx = getAssociatedFunction()->getContext();

    AttrList.push_back(Attribute::get(Ctx, "uniform-work-group-size",
                                      getAssumed() ? "true" : "false"));
    return A.manifestAttrs(getIRPosition(), AttrList,
                           /* ForceReplace */ true);
  }

  bool isValidState() const override {
    // This state is always valid, even when the state is false.
    return true;
  }

  const std::string getAsStr(Attributor *) const override {
    return "AMDWorkGroupSize[" + std::to_string(getAssumed()) + "]";
  }

  /// See AbstractAttribute::trackStatistics()
  void trackStatistics() const override {}
};

AAUniformWorkGroupSize &
AAUniformWorkGroupSize::createForPosition(const IRPosition &IRP,
                                          Attributor &A) {
  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
    return *new (A.Allocator) AAUniformWorkGroupSizeFunction(IRP, A);
  llvm_unreachable(
      "AAUniformWorkGroupSize is only valid for function position");
}

struct AAAMDAttributesFunction : public AAAMDAttributes {
  AAAMDAttributesFunction(const IRPosition &IRP, Attributor &A)
      : AAAMDAttributes(IRP, A) {}

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();

    // If the function requires the implicit arg pointer due to sanitizers,
    // assume it's needed even if explicitly marked as not requiring it.
    const bool NeedsHostcall = funcRequiresHostcallPtr(*F);
    if (NeedsHostcall) {
      removeAssumedBits(IMPLICIT_ARG_PTR);
      removeAssumedBits(HOSTCALL_PTR);
    }

    for (auto Attr : ImplicitAttrs) {
      if (NeedsHostcall &&
          (Attr.first == IMPLICIT_ARG_PTR || Attr.first == HOSTCALL_PTR))
        continue;

      if (F->hasFnAttribute(Attr.second))
        addKnownBits(Attr.first);
    }

    if (F->isDeclaration())
      return;

    // Ignore functions with graphics calling conventions; these are currently
    // not allowed to have kernel arguments.
    if (AMDGPU::isGraphics(F->getCallingConv())) {
      indicatePessimisticFixpoint();
      return;
    }
  }

  ChangeStatus updateImpl(Attributor &A) override {
    Function *F = getAssociatedFunction();
    // The current assumed state used to determine a change.
    auto OrigAssumed = getAssumed();

    // Check for intrinsics and propagate attributes.
    const AACallEdges *AAEdges = A.getAAFor<AACallEdges>(
        *this, this->getIRPosition(), DepClassTy::REQUIRED);
    if (!AAEdges || !AAEdges->isValidState() ||
        AAEdges->hasNonAsmUnknownCallee())
      return indicatePessimisticFixpoint();

    bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(F->getCallingConv());

    bool NeedsImplicit = false;
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
    bool HasApertureRegs = InfoCache.hasApertureRegs(*F);
    bool SupportsGetDoorbellID = InfoCache.supportsGetDoorbellID(*F);
    unsigned COV = InfoCache.getCodeObjectVersion();

    for (Function *Callee : AAEdges->getOptimisticEdges()) {
      Intrinsic::ID IID = Callee->getIntrinsicID();
      if (IID == Intrinsic::not_intrinsic) {
        const AAAMDAttributes *AAAMD = A.getAAFor<AAAMDAttributes>(
            *this, IRPosition::function(*Callee), DepClassTy::REQUIRED);
        if (!AAAMD || !AAAMD->isValidState())
          return indicatePessimisticFixpoint();
        *this &= *AAAMD;
        continue;
      }

      bool NonKernelOnly = false;
      ImplicitArgumentMask AttrMask =
          intrinsicToAttrMask(IID, NonKernelOnly, NeedsImplicit,
                              HasApertureRegs, SupportsGetDoorbellID, COV);
      if (AttrMask != NOT_IMPLICIT_INPUT) {
        if ((IsNonEntryFunc || !NonKernelOnly))
          removeAssumedBits(AttrMask);
      }
    }

    // Need implicitarg_ptr to access queue_ptr, private_base, and shared_base.
    if (NeedsImplicit)
      removeAssumedBits(IMPLICIT_ARG_PTR);

    if (isAssumed(QUEUE_PTR) && checkForQueuePtr(A)) {
      // Under V5, we need implicitarg_ptr + offsets to access private_base or
      // shared_base. We do not actually need queue_ptr.
      if (COV >= 5)
        removeAssumedBits(IMPLICIT_ARG_PTR);
      else
        removeAssumedBits(QUEUE_PTR);
    }

    if (funcRetrievesMultigridSyncArg(A, COV)) {
      assert(!isAssumed(IMPLICIT_ARG_PTR) &&
             "multigrid_sync_arg needs implicitarg_ptr");
      removeAssumedBits(MULTIGRID_SYNC_ARG);
    }

    if (funcRetrievesHostcallPtr(A, COV)) {
      assert(!isAssumed(IMPLICIT_ARG_PTR) && "hostcall needs implicitarg_ptr");
      removeAssumedBits(HOSTCALL_PTR);
    }

    if (funcRetrievesHeapPtr(A, COV)) {
      assert(!isAssumed(IMPLICIT_ARG_PTR) && "heap_ptr needs implicitarg_ptr");
      removeAssumedBits(HEAP_PTR);
    }

    if (isAssumed(QUEUE_PTR) && funcRetrievesQueuePtr(A, COV)) {
      assert(!isAssumed(IMPLICIT_ARG_PTR) && "queue_ptr needs implicitarg_ptr");
      removeAssumedBits(QUEUE_PTR);
    }

    if (isAssumed(LDS_KERNEL_ID) && funcRetrievesLDSKernelId(A)) {
      removeAssumedBits(LDS_KERNEL_ID);
    }

    if (isAssumed(DEFAULT_QUEUE) && funcRetrievesDefaultQueue(A, COV))
      removeAssumedBits(DEFAULT_QUEUE);

    if (isAssumed(COMPLETION_ACTION) && funcRetrievesCompletionAction(A, COV))
      removeAssumedBits(COMPLETION_ACTION);

    if (isAssumed(FLAT_SCRATCH_INIT) && needFlatScratchInit(A))
      removeAssumedBits(FLAT_SCRATCH_INIT);

    return getAssumed() != OrigAssumed ? ChangeStatus::CHANGED
                                       : ChangeStatus::UNCHANGED;
  }

  ChangeStatus manifest(Attributor &A) override {
    SmallVector<Attribute, 8> AttrList;
    LLVMContext &Ctx = getAssociatedFunction()->getContext();

    for (auto Attr : ImplicitAttrs) {
      if (isKnown(Attr.first))
        AttrList.push_back(Attribute::get(Ctx, Attr.second));
    }

    return A.manifestAttrs(getIRPosition(), AttrList,
                           /* ForceReplace */ true);
  }

  const std::string getAsStr(Attributor *) const override {
    std::string Str;
    raw_string_ostream OS(Str);
    OS << "AMDInfo[";
    for (auto Attr : ImplicitAttrs)
      if (isAssumed(Attr.first))
        OS << ' ' << Attr.second;
    OS << " ]";
    return OS.str();
  }

  /// See AbstractAttribute::trackStatistics()
  void trackStatistics() const override {}

private:
  bool checkForQueuePtr(Attributor &A) {
    Function *F = getAssociatedFunction();
    bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(F->getCallingConv());

    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());

    bool NeedsQueuePtr = false;

    auto CheckAddrSpaceCasts = [&](Instruction &I) {
      unsigned SrcAS = static_cast<AddrSpaceCastInst &>(I).getSrcAddressSpace();
      if (castRequiresQueuePtr(SrcAS)) {
        NeedsQueuePtr = true;
        return false;
      }
      return true;
    };

    bool HasApertureRegs = InfoCache.hasApertureRegs(*F);

    // `checkForAllInstructions` is much cheaper than going through all
    // instructions, so try it first.

    // The queue pointer is not needed if aperture regs are present.
    if (!HasApertureRegs) {
      bool UsedAssumedInformation = false;
      A.checkForAllInstructions(CheckAddrSpaceCasts, *this,
                                {Instruction::AddrSpaceCast},
                                UsedAssumedInformation);
    }

    // If we found that we need the queue pointer, nothing else to do.
    if (NeedsQueuePtr)
      return true;

    if (!IsNonEntryFunc && HasApertureRegs)
      return false;

    for (BasicBlock &BB : *F) {
      for (Instruction &I : BB) {
        for (const Use &U : I.operands()) {
          if (const auto *C = dyn_cast<Constant>(U)) {
            if (InfoCache.needsQueuePtr(C, *F))
              return true;
          }
        }
      }
    }

    return false;
  }

  bool funcRetrievesMultigridSyncArg(Attributor &A, unsigned COV) {
    auto Pos = llvm::AMDGPU::getMultigridSyncArgImplicitArgPosition(COV);
    AA::RangeTy Range(Pos, 8);
    return funcRetrievesImplicitKernelArg(A, Range);
  }

  bool funcRetrievesHostcallPtr(Attributor &A, unsigned COV) {
    auto Pos = llvm::AMDGPU::getHostcallImplicitArgPosition(COV);
    AA::RangeTy Range(Pos, 8);
    return funcRetrievesImplicitKernelArg(A, Range);
  }

  bool funcRetrievesDefaultQueue(Attributor &A, unsigned COV) {
    auto Pos = llvm::AMDGPU::getDefaultQueueImplicitArgPosition(COV);
    AA::RangeTy Range(Pos, 8);
    return funcRetrievesImplicitKernelArg(A, Range);
  }

  bool funcRetrievesCompletionAction(Attributor &A, unsigned COV) {
    auto Pos = llvm::AMDGPU::getCompletionActionImplicitArgPosition(COV);
    AA::RangeTy Range(Pos, 8);
    return funcRetrievesImplicitKernelArg(A, Range);
  }

  bool funcRetrievesHeapPtr(Attributor &A, unsigned COV) {
    if (COV < 5)
      return false;
    AA::RangeTy Range(AMDGPU::ImplicitArg::HEAP_PTR_OFFSET, 8);
    return funcRetrievesImplicitKernelArg(A, Range);
  }

  bool funcRetrievesQueuePtr(Attributor &A, unsigned COV) {
    if (COV < 5)
      return false;
    AA::RangeTy Range(AMDGPU::ImplicitArg::QUEUE_PTR_OFFSET, 8);
    return funcRetrievesImplicitKernelArg(A, Range);
  }

  bool funcRetrievesImplicitKernelArg(Attributor &A, AA::RangeTy Range) {
    // Check if this is a call to the implicitarg_ptr builtin and it
    // is used to retrieve the hostcall pointer. The implicit arg for
    // hostcall is not used only if every use of the implicitarg_ptr
    // is a load that clearly does not retrieve any byte of the
    // hostcall pointer. We check this by tracing all the uses of the
    // initial call to the implicitarg_ptr intrinsic.
    auto DoesNotLeadToKernelArgLoc = [&](Instruction &I) {
      auto &Call = cast<CallBase>(I);
      if (Call.getIntrinsicID() != Intrinsic::amdgcn_implicitarg_ptr)
        return true;

      const auto *PointerInfoAA = A.getAAFor<AAPointerInfo>(
          *this, IRPosition::callsite_returned(Call), DepClassTy::REQUIRED);
      if (!PointerInfoAA || !PointerInfoAA->getState().isValidState())
        return false;

      return PointerInfoAA->forallInterferingAccesses(
          Range, [](const AAPointerInfo::Access &Acc, bool IsExact) {
            return Acc.getRemoteInst()->isDroppable();
          });
    };

    bool UsedAssumedInformation = false;
    return !A.checkForAllCallLikeInstructions(DoesNotLeadToKernelArgLoc, *this,
                                              UsedAssumedInformation);
  }

  bool funcRetrievesLDSKernelId(Attributor &A) {
    auto DoesNotRetrieve = [&](Instruction &I) {
      auto &Call = cast<CallBase>(I);
      return Call.getIntrinsicID() != Intrinsic::amdgcn_lds_kernel_id;
    };
    bool UsedAssumedInformation = false;
    return !A.checkForAllCallLikeInstructions(DoesNotRetrieve, *this,
                                              UsedAssumedInformation);
  }

  // Returns true if FlatScratchInit is needed, i.e., no-flat-scratch-init
  // should not be set.
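  // The check covers addrspacecast instructions from the private address
  // space, addrspacecast constant expressions, and calls to the
  // llvm.amdgcn.addrspacecast.nonnull intrinsic.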
  bool needFlatScratchInit(Attributor &A) {
    assert(isAssumed(FLAT_SCRATCH_INIT)); // only called if the bit is still set

    // Check all AddrSpaceCast instructions. FlatScratchInit is needed if
    // there is a cast from PRIVATE_ADDRESS.
    auto AddrSpaceCastNotFromPrivate = [](Instruction &I) {
      return cast<AddrSpaceCastInst>(I).getSrcAddressSpace() !=
             AMDGPUAS::PRIVATE_ADDRESS;
    };

    bool UsedAssumedInformation = false;
    if (!A.checkForAllInstructions(AddrSpaceCastNotFromPrivate, *this,
                                   {Instruction::AddrSpaceCast},
                                   UsedAssumedInformation))
      return true;

    // Check for addrSpaceCast from PRIVATE_ADDRESS in constant expressions
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());

    Function *F = getAssociatedFunction();
    for (Instruction &I : instructions(F)) {
      for (const Use &U : I.operands()) {
        if (const auto *C = dyn_cast<Constant>(U)) {
          if (InfoCache.checkConstForAddrSpaceCastFromPrivate(C))
            return true;
        }
      }
    }

    // Finally check callees.

    // This is called for each call-like instruction; returning false means the
    // call forces FlatScratchInit, so this function must not get
    // no-flat-scratch-init.
    auto CheckForNoFlatScratchInit = [&](Instruction &I) {
      const auto &CB = cast<CallBase>(I);
      const Function *Callee = CB.getCalledFunction();

      // Callee == 0 for inline asm or indirect call with known callees.
      // In the latter case, updateImpl() already checked the callees and we
      // know their FLAT_SCRATCH_INIT bit is set.
      // If the function has an indirect call with unknown callees, the bit is
      // already removed in updateImpl() and execution won't reach here.
      if (!Callee)
        return true;

      return Callee->getIntrinsicID() !=
             Intrinsic::amdgcn_addrspacecast_nonnull;
    };

    UsedAssumedInformation = false;
    // If any check returns false (i.e., FlatScratchInit is needed),
    // checkForAllCallLikeInstructions returns false, in which case this
    // function returns true.
    return !A.checkForAllCallLikeInstructions(CheckForNoFlatScratchInit, *this,
                                              UsedAssumedInformation);
  }
};

AAAMDAttributes &AAAMDAttributes::createForPosition(const IRPosition &IRP,
                                                    Attributor &A) {
  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
    return *new (A.Allocator) AAAMDAttributesFunction(IRP, A);
  llvm_unreachable("AAAMDAttributes is only valid for function position");
}

/// Base class to derive different size ranges.
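/// Subclasses propagate the assumed [Min, Max] range from all call sites into
/// the function and emit the corresponding string attribute when it differs
/// from the subtarget default.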
struct AAAMDSizeRangeAttribute
    : public StateWrapper<IntegerRangeState, AbstractAttribute, uint32_t> {
  using Base = StateWrapper<IntegerRangeState, AbstractAttribute, uint32_t>;

  StringRef AttrName;

  AAAMDSizeRangeAttribute(const IRPosition &IRP, Attributor &A,
                          StringRef AttrName)
      : Base(IRP, 32), AttrName(AttrName) {}

  /// See AbstractAttribute::trackStatistics()
  void trackStatistics() const override {}

  template <class AttributeImpl> ChangeStatus updateImplImpl(Attributor &A) {
    ChangeStatus Change = ChangeStatus::UNCHANGED;

    auto CheckCallSite = [&](AbstractCallSite CS) {
      Function *Caller = CS.getInstruction()->getFunction();
      LLVM_DEBUG(dbgs() << '[' << getName() << "] Call " << Caller->getName()
                        << "->" << getAssociatedFunction()->getName() << '\n');

      const auto *CallerInfo = A.getAAFor<AttributeImpl>(
          *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);
      if (!CallerInfo || !CallerInfo->isValidState())
        return false;

      Change |=
          clampStateAndIndicateChange(this->getState(), CallerInfo->getState());

      return true;
    };

    bool AllCallSitesKnown = true;
    if (!A.checkForAllCallSites(CheckCallSite, *this,
                                /*RequireAllCallSites=*/true,
                                AllCallSitesKnown))
      return indicatePessimisticFixpoint();

    return Change;
  }

  /// Clamp the assumed range to the default value ([Min, Max]) and emit the
  /// attribute if it is not the same as the default.
  ChangeStatus
  emitAttributeIfNotDefaultAfterClamp(Attributor &A,
                                      std::pair<unsigned, unsigned> Default) {
    auto [Min, Max] = Default;
    unsigned Lower = getAssumed().getLower().getZExtValue();
    unsigned Upper = getAssumed().getUpper().getZExtValue();

    // Clamp the range to the default value.
    if (Lower < Min)
      Lower = Min;
    if (Upper > Max + 1)
      Upper = Max + 1;

    // No manifest if the value is invalid or same as default after clamp.
    if ((Lower == Min && Upper == Max + 1) || (Upper < Lower))
      return ChangeStatus::UNCHANGED;

    Function *F = getAssociatedFunction();
    LLVMContext &Ctx = F->getContext();
    SmallString<10> Buffer;
    raw_svector_ostream OS(Buffer);
    OS << Lower << ',' << Upper - 1;
    return A.manifestAttrs(getIRPosition(),
                           {Attribute::get(Ctx, AttrName, OS.str())},
                           /*ForceReplace=*/true);
  }

  const std::string getAsStr(Attributor *) const override {
    std::string Str;
    raw_string_ostream OS(Str);
    OS << getName() << '[';
    OS << getAssumed().getLower() << ',' << getAssumed().getUpper() - 1;
    OS << ']';
    return OS.str();
  }
};

/// Propagate amdgpu-flat-work-group-size attribute.
struct AAAMDFlatWorkGroupSize : public AAAMDSizeRangeAttribute {
  AAAMDFlatWorkGroupSize(const IRPosition &IRP, Attributor &A)
      : AAAMDSizeRangeAttribute(IRP, A, "amdgpu-flat-work-group-size") {}

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());

    bool HasAttr = false;
    auto Range = InfoCache.getDefaultFlatWorkGroupSize(*F);
    auto MaxRange = InfoCache.getMaximumFlatWorkGroupRange(*F);

    if (auto Attr = InfoCache.getFlatWorkGroupSizeAttr(*F)) {
      // We only consider an attribute that is not the max range, because the
      // front end always emits the attribute and, unfortunately, sometimes it
      // emits the max range.
      if (*Attr != MaxRange) {
        Range = *Attr;
        HasAttr = true;
      }
    }

    // We don't want to directly clamp the state if it's the max range because
    // that is basically the worst state.
    if (Range == MaxRange)
      return;

    auto [Min, Max] = Range;
    ConstantRange CR(APInt(32, Min), APInt(32, Max + 1));
    IntegerRangeState IRS(CR);
    clampStateAndIndicateChange(this->getState(), IRS);

    if (HasAttr || AMDGPU::isEntryFunctionCC(F->getCallingConv()))
      indicateOptimisticFixpoint();
  }

  ChangeStatus updateImpl(Attributor &A) override {
    return updateImplImpl<AAAMDFlatWorkGroupSize>(A);
  }

  /// Create an abstract attribute view for the position \p IRP.
  static AAAMDFlatWorkGroupSize &createForPosition(const IRPosition &IRP,
                                                   Attributor &A);

  ChangeStatus manifest(Attributor &A) override {
    Function *F = getAssociatedFunction();
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
    return emitAttributeIfNotDefaultAfterClamp(
        A, InfoCache.getMaximumFlatWorkGroupRange(*F));
  }

  /// See AbstractAttribute::getName()
  const std::string getName() const override {
    return "AAAMDFlatWorkGroupSize";
  }

  /// See AbstractAttribute::getIdAddr()
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAAMDFlatWorkGroupSize
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  /// Unique ID (due to the unique address)
  static const char ID;
};

const char AAAMDFlatWorkGroupSize::ID = 0;

AAAMDFlatWorkGroupSize &
AAAMDFlatWorkGroupSize::createForPosition(const IRPosition &IRP,
                                          Attributor &A) {
  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
    return *new (A.Allocator) AAAMDFlatWorkGroupSize(IRP, A);
  llvm_unreachable(
      "AAAMDFlatWorkGroupSize is only valid for function position");
}

struct TupleDecIntegerRangeState : public AbstractState {
  DecIntegerState<uint32_t> X, Y, Z;

  bool isValidState() const override {
    return X.isValidState() && Y.isValidState() && Z.isValidState();
  }

  bool isAtFixpoint() const override {
    return X.isAtFixpoint() && Y.isAtFixpoint() && Z.isAtFixpoint();
  }

  ChangeStatus indicateOptimisticFixpoint() override {
    return X.indicateOptimisticFixpoint() | Y.indicateOptimisticFixpoint() |
           Z.indicateOptimisticFixpoint();
  }

  ChangeStatus indicatePessimisticFixpoint() override {
    return X.indicatePessimisticFixpoint() | Y.indicatePessimisticFixpoint() |
           Z.indicatePessimisticFixpoint();
  }

  TupleDecIntegerRangeState operator^=(const TupleDecIntegerRangeState &Other) {
    X ^= Other.X;
    Y ^= Other.Y;
    Z ^= Other.Z;
    return *this;
  }

  bool operator==(const TupleDecIntegerRangeState &Other) const {
    return X == Other.X && Y == Other.Y && Z == Other.Z;
  }

  TupleDecIntegerRangeState &getAssumed() { return *this; }
  const TupleDecIntegerRangeState &getAssumed() const { return *this; }
};

using AAAMDMaxNumWorkgroupsState =
    StateWrapper<TupleDecIntegerRangeState, AbstractAttribute, uint32_t>;

/// Propagate amdgpu-max-num-workgroups attribute.
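/// The X, Y and Z dimensions are tracked separately; each starts at the
/// subtarget limit and is clamped by the assumed values of all callers.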
struct AAAMDMaxNumWorkgroups
    : public StateWrapper<TupleDecIntegerRangeState, AbstractAttribute> {
  using Base = StateWrapper<TupleDecIntegerRangeState, AbstractAttribute>;

  AAAMDMaxNumWorkgroups(const IRPosition &IRP, Attributor &A) : Base(IRP) {}

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());

    SmallVector<unsigned> MaxNumWorkgroups = InfoCache.getMaxNumWorkGroups(*F);

    X.takeKnownMinimum(MaxNumWorkgroups[0]);
    Y.takeKnownMinimum(MaxNumWorkgroups[1]);
    Z.takeKnownMinimum(MaxNumWorkgroups[2]);

    if (AMDGPU::isEntryFunctionCC(F->getCallingConv()))
      indicatePessimisticFixpoint();
  }

  ChangeStatus updateImpl(Attributor &A) override {
    ChangeStatus Change = ChangeStatus::UNCHANGED;

    auto CheckCallSite = [&](AbstractCallSite CS) {
      Function *Caller = CS.getInstruction()->getFunction();
      LLVM_DEBUG(dbgs() << "[AAAMDMaxNumWorkgroups] Call " << Caller->getName()
                        << "->" << getAssociatedFunction()->getName() << '\n');

      const auto *CallerInfo = A.getAAFor<AAAMDMaxNumWorkgroups>(
          *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);
      if (!CallerInfo || !CallerInfo->isValidState())
        return false;

      Change |=
          clampStateAndIndicateChange(this->getState(), CallerInfo->getState());
      return true;
    };

    bool AllCallSitesKnown = true;
    if (!A.checkForAllCallSites(CheckCallSite, *this,
                                /*RequireAllCallSites=*/true,
                                AllCallSitesKnown))
      return indicatePessimisticFixpoint();

    return Change;
  }

  /// Create an abstract attribute view for the position \p IRP.
  static AAAMDMaxNumWorkgroups &createForPosition(const IRPosition &IRP,
                                                  Attributor &A);

  ChangeStatus manifest(Attributor &A) override {
    Function *F = getAssociatedFunction();
    LLVMContext &Ctx = F->getContext();
    SmallString<32> Buffer;
    raw_svector_ostream OS(Buffer);
    OS << X.getAssumed() << ',' << Y.getAssumed() << ',' << Z.getAssumed();

    // TODO: Should annotate loads of the group size for this to do anything
    // useful.
    return A.manifestAttrs(
        getIRPosition(),
        {Attribute::get(Ctx, "amdgpu-max-num-workgroups", OS.str())},
        /* ForceReplace= */ true);
  }

  const std::string getName() const override { return "AAAMDMaxNumWorkgroups"; }

  const std::string getAsStr(Attributor *) const override {
    std::string Buffer = "AAAMDMaxNumWorkgroupsState[";
    raw_string_ostream OS(Buffer);
    OS << X.getAssumed() << ',' << Y.getAssumed() << ',' << Z.getAssumed()
       << ']';
    return OS.str();
  }

  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAAMDMaxNumWorkgroups
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  void trackStatistics() const override {}

  /// Unique ID (due to the unique address)
  static const char ID;
};

const char AAAMDMaxNumWorkgroups::ID = 0;

AAAMDMaxNumWorkgroups &
AAAMDMaxNumWorkgroups::createForPosition(const IRPosition &IRP, Attributor &A) {
  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
    return *new (A.Allocator) AAAMDMaxNumWorkgroups(IRP, A);
  llvm_unreachable("AAAMDMaxNumWorkgroups is only valid for function position");
}

/// Propagate amdgpu-waves-per-eu attribute.
struct AAAMDWavesPerEU : public AAAMDSizeRangeAttribute {
  AAAMDWavesPerEU(const IRPosition &IRP, Attributor &A)
      : AAAMDSizeRangeAttribute(IRP, A, "amdgpu-waves-per-eu") {}

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());

    auto TakeRange = [&](std::pair<unsigned, unsigned> R) {
      auto [Min, Max] = R;
      ConstantRange Range(APInt(32, Min), APInt(32, Max + 1));
      IntegerRangeState RangeState(Range);
      clampStateAndIndicateChange(this->getState(), RangeState);
      indicateOptimisticFixpoint();
    };

    std::pair<unsigned, unsigned> MaxWavesPerEURange{
        1U, InfoCache.getMaxWavesPerEU(*F)};

    // If the attribute exists, we will honor it if it is not the default.
    if (auto Attr = InfoCache.getWavesPerEUAttr(*F)) {
      if (*Attr != MaxWavesPerEURange) {
        TakeRange(*Attr);
        return;
      }
    }

    // Unlike AAAMDFlatWorkGroupSize, this is trickier. Since the calculation
    // of waves per EU involves the flat work group size, we can't simply use
    // an assumed flat work group size as a starting point, because the flat
    // work group size is updated in the inverse direction of waves per EU.
    // However, we can still do something for an entry function. Since an
    // entry function is a terminal node, and the flat work group size, either
    // from the attribute or the default, will be used anyway, we can take
    // that value and calculate the waves per EU based on it. This result
    // cannot be updated afterwards, but it still lets us propagate a useful
    // value.
    if (AMDGPU::isEntryFunctionCC(F->getCallingConv())) {
      std::pair<unsigned, unsigned> FlatWorkGroupSize;
      if (auto Attr = InfoCache.getFlatWorkGroupSizeAttr(*F))
        FlatWorkGroupSize = *Attr;
      else
        FlatWorkGroupSize = InfoCache.getDefaultFlatWorkGroupSize(*F);
      TakeRange(InfoCache.getEffectiveWavesPerEU(*F, MaxWavesPerEURange,
                                                 FlatWorkGroupSize));
    }
  }

  ChangeStatus updateImpl(Attributor &A) override {
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
    ChangeStatus Change = ChangeStatus::UNCHANGED;

    auto CheckCallSite = [&](AbstractCallSite CS) {
      Function *Caller = CS.getInstruction()->getFunction();
      Function *Func = getAssociatedFunction();
      LLVM_DEBUG(dbgs() << '[' << getName() << "] Call " << Caller->getName()
                        << "->" << Func->getName() << '\n');

      const auto *CallerInfo = A.getAAFor<AAAMDWavesPerEU>(
          *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);
      const auto *AssumedGroupSize = A.getAAFor<AAAMDFlatWorkGroupSize>(
          *this, IRPosition::function(*Func), DepClassTy::REQUIRED);
      if (!CallerInfo || !AssumedGroupSize || !CallerInfo->isValidState() ||
          !AssumedGroupSize->isValidState())
        return false;

      unsigned Min, Max;
      std::tie(Min, Max) = InfoCache.getEffectiveWavesPerEU(
          *Caller,
          {CallerInfo->getAssumed().getLower().getZExtValue(),
           CallerInfo->getAssumed().getUpper().getZExtValue() - 1},
          {AssumedGroupSize->getAssumed().getLower().getZExtValue(),
           AssumedGroupSize->getAssumed().getUpper().getZExtValue() - 1});
      ConstantRange CallerRange(APInt(32, Min), APInt(32, Max + 1));
      IntegerRangeState CallerRangeState(CallerRange);
      Change |= clampStateAndIndicateChange(this->getState(), CallerRangeState);

      return true;
    };

    bool AllCallSitesKnown = true;
    if (!A.checkForAllCallSites(CheckCallSite, *this, true, AllCallSitesKnown))
      return indicatePessimisticFixpoint();

    return Change;
  }

  /// Create an abstract attribute view for the position \p IRP.
  static AAAMDWavesPerEU &createForPosition(const IRPosition &IRP,
                                            Attributor &A);

  ChangeStatus manifest(Attributor &A) override {
    Function *F = getAssociatedFunction();
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
    return emitAttributeIfNotDefaultAfterClamp(
        A, {1U, InfoCache.getMaxWavesPerEU(*F)});
  }

  /// See AbstractAttribute::getName()
  const std::string getName() const override { return "AAAMDWavesPerEU"; }

  /// See AbstractAttribute::getIdAddr()
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAAMDWavesPerEU
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  /// Unique ID (due to the unique address)
  static const char ID;
};

const char AAAMDWavesPerEU::ID = 0;

AAAMDWavesPerEU &AAAMDWavesPerEU::createForPosition(const IRPosition &IRP,
                                                    Attributor &A) {
  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
    return *new (A.Allocator) AAAMDWavesPerEU(IRP, A);
  llvm_unreachable("AAAMDWavesPerEU is only valid for function position");
}

static bool inlineAsmUsesAGPRs(const InlineAsm *IA) {
  for (const auto &CI : IA->ParseConstraints()) {
    for (StringRef Code : CI.Codes) {
      Code.consume_front("{");
      if (Code.starts_with("a"))
        return true;
    }
  }

  return false;
}

struct AAAMDGPUNoAGPR
    : public IRAttribute<Attribute::NoUnwind,
                         StateWrapper<BooleanState, AbstractAttribute>,
                         AAAMDGPUNoAGPR> {
  AAAMDGPUNoAGPR(const IRPosition &IRP, Attributor &A) : IRAttribute(IRP) {}

  static AAAMDGPUNoAGPR &createForPosition(const IRPosition &IRP,
                                           Attributor &A) {
    if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
      return *new (A.Allocator) AAAMDGPUNoAGPR(IRP, A);
    llvm_unreachable("AAAMDGPUNoAGPR is only valid for function position");
  }

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();
    if (F->hasFnAttribute("amdgpu-no-agpr"))
      indicateOptimisticFixpoint();
  }

  const std::string getAsStr(Attributor *A) const override {
    return getAssumed() ? "amdgpu-no-agpr" : "amdgpu-maybe-agpr";
  }

  void trackStatistics() const override {}

  ChangeStatus updateImpl(Attributor &A) override {
    // TODO: Use AACallEdges, but then we need a way to inspect asm edges.

    auto CheckForNoAGPRs = [&](Instruction &I) {
      const auto &CB = cast<CallBase>(I);
      const Value *CalleeOp = CB.getCalledOperand();
      const Function *Callee = dyn_cast<Function>(CalleeOp);
      if (!Callee) {
        if (const InlineAsm *IA = dyn_cast<InlineAsm>(CalleeOp))
          return !inlineAsmUsesAGPRs(IA);
        return false;
      }

      // Some intrinsics may use AGPRs, but if we have a choice, we are not
      // required to use AGPRs.
      if (Callee->isIntrinsic())
        return true;

      // TODO: Handle callsite attributes
      const auto *CalleeInfo = A.getAAFor<AAAMDGPUNoAGPR>(
          *this, IRPosition::function(*Callee), DepClassTy::REQUIRED);
      return CalleeInfo && CalleeInfo->isValidState() &&
             CalleeInfo->getAssumed();
    };

    bool UsedAssumedInformation = false;
    if (!A.checkForAllCallLikeInstructions(CheckForNoAGPRs, *this,
                                           UsedAssumedInformation))
      return indicatePessimisticFixpoint();
    return ChangeStatus::UNCHANGED;
  }

  ChangeStatus manifest(Attributor &A) override {
    if (!getAssumed())
      return ChangeStatus::UNCHANGED;
    LLVMContext &Ctx = getAssociatedFunction()->getContext();
    return A.manifestAttrs(getIRPosition(),
                           {Attribute::get(Ctx, "amdgpu-no-agpr")});
  }

  const std::string getName() const override { return "AAAMDGPUNoAGPR"; }
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAAMDGPUNoAGPR
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  static const char ID;
};

const char AAAMDGPUNoAGPR::ID = 0;

static void addPreloadKernArgHint(Function &F, TargetMachine &TM) {
  const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
  for (unsigned I = 0;
       I < F.arg_size() &&
       I < std::min(KernargPreloadCount.getValue(), ST.getMaxNumUserSGPRs());
       ++I) {
    Argument &Arg = *F.getArg(I);
    // Check for incompatible attributes.
    if (Arg.hasByRefAttr() || Arg.hasNestAttr())
      break;

    Arg.addAttr(Attribute::InReg);
  }
}

static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
                    AMDGPUAttributorOptions Options) {
  SetVector<Function *> Functions;
  for (Function &F : M) {
    if (!F.isIntrinsic())
      Functions.insert(&F);
  }

  CallGraphUpdater CGUpdater;
  BumpPtrAllocator Allocator;
  AMDGPUInformationCache InfoCache(M, AG, Allocator, nullptr, TM);
  DenseSet<const char *> Allowed(
      {&AAAMDAttributes::ID, &AAUniformWorkGroupSize::ID,
       &AAPotentialValues::ID, &AAAMDFlatWorkGroupSize::ID,
       &AAAMDMaxNumWorkgroups::ID, &AAAMDWavesPerEU::ID, &AAAMDGPUNoAGPR::ID,
       &AACallEdges::ID, &AAPointerInfo::ID, &AAPotentialConstantValues::ID,
       &AAUnderlyingObjects::ID, &AAAddressSpace::ID, &AAIndirectCallInfo::ID,
       &AAInstanceInfo::ID});

  AttributorConfig AC(CGUpdater);
  AC.IsClosedWorldModule = Options.IsClosedWorld;
  AC.Allowed = &Allowed;
  AC.IsModulePass = true;
  AC.DefaultInitializeLiveInternals = false;
  AC.IndirectCalleeSpecializationCallback =
      [](Attributor &A, const AbstractAttribute &AA, CallBase &CB,
         Function &Callee, unsigned NumAssumedCallees) {
        return !AMDGPU::isEntryFunctionCC(Callee.getCallingConv()) &&
               (NumAssumedCallees <= IndirectCallSpecializationThreshold);
      };
  AC.IPOAmendableCB = [](const Function &F) {
    return F.getCallingConv() == CallingConv::AMDGPU_KERNEL;
  };

  Attributor A(Functions, InfoCache, AC);

"" : "not ") 1370 << "assumed to be a closed world.\n"); 1371 1372 for (auto *F : Functions) { 1373 A.getOrCreateAAFor<AAAMDAttributes>(IRPosition::function(*F)); 1374 A.getOrCreateAAFor<AAUniformWorkGroupSize>(IRPosition::function(*F)); 1375 A.getOrCreateAAFor<AAAMDMaxNumWorkgroups>(IRPosition::function(*F)); 1376 A.getOrCreateAAFor<AAAMDGPUNoAGPR>(IRPosition::function(*F)); 1377 CallingConv::ID CC = F->getCallingConv(); 1378 if (!AMDGPU::isEntryFunctionCC(CC)) { 1379 A.getOrCreateAAFor<AAAMDFlatWorkGroupSize>(IRPosition::function(*F)); 1380 A.getOrCreateAAFor<AAAMDWavesPerEU>(IRPosition::function(*F)); 1381 } else if (CC == CallingConv::AMDGPU_KERNEL) { 1382 addPreloadKernArgHint(*F, TM); 1383 } 1384 1385 for (auto &I : instructions(F)) { 1386 if (auto *LI = dyn_cast<LoadInst>(&I)) { 1387 A.getOrCreateAAFor<AAAddressSpace>( 1388 IRPosition::value(*LI->getPointerOperand())); 1389 } else if (auto *SI = dyn_cast<StoreInst>(&I)) { 1390 A.getOrCreateAAFor<AAAddressSpace>( 1391 IRPosition::value(*SI->getPointerOperand())); 1392 } else if (auto *RMW = dyn_cast<AtomicRMWInst>(&I)) { 1393 A.getOrCreateAAFor<AAAddressSpace>( 1394 IRPosition::value(*RMW->getPointerOperand())); 1395 } else if (auto *CmpX = dyn_cast<AtomicCmpXchgInst>(&I)) { 1396 A.getOrCreateAAFor<AAAddressSpace>( 1397 IRPosition::value(*CmpX->getPointerOperand())); 1398 } 1399 } 1400 } 1401 1402 ChangeStatus Change = A.run(); 1403 return Change == ChangeStatus::CHANGED; 1404 } 1405 1406 class AMDGPUAttributorLegacy : public ModulePass { 1407 public: 1408 AMDGPUAttributorLegacy() : ModulePass(ID) {} 1409 1410 /// doInitialization - Virtual method overridden by subclasses to do 1411 /// any necessary initialization before any pass is run. 1412 bool doInitialization(Module &) override { 1413 auto *TPC = getAnalysisIfAvailable<TargetPassConfig>(); 1414 if (!TPC) 1415 report_fatal_error("TargetMachine is required"); 1416 1417 TM = &TPC->getTM<TargetMachine>(); 1418 return false; 1419 } 1420 1421 bool runOnModule(Module &M) override { 1422 AnalysisGetter AG(this); 1423 return runImpl(M, AG, *TM, /*Options=*/{}); 1424 } 1425 1426 void getAnalysisUsage(AnalysisUsage &AU) const override { 1427 AU.addRequired<CycleInfoWrapperPass>(); 1428 } 1429 1430 StringRef getPassName() const override { return "AMDGPU Attributor"; } 1431 TargetMachine *TM; 1432 static char ID; 1433 }; 1434 } // namespace 1435 1436 PreservedAnalyses llvm::AMDGPUAttributorPass::run(Module &M, 1437 ModuleAnalysisManager &AM) { 1438 1439 FunctionAnalysisManager &FAM = 1440 AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager(); 1441 AnalysisGetter AG(FAM); 1442 1443 // TODO: Probably preserves CFG 1444 return runImpl(M, AG, TM, Options) ? PreservedAnalyses::none() 1445 : PreservedAnalyses::all(); 1446 } 1447 1448 char AMDGPUAttributorLegacy::ID = 0; 1449 1450 Pass *llvm::createAMDGPUAttributorLegacyPass() { 1451 return new AMDGPUAttributorLegacy(); 1452 } 1453 INITIALIZE_PASS_BEGIN(AMDGPUAttributorLegacy, DEBUG_TYPE, "AMDGPU Attributor", 1454 false, false) 1455 INITIALIZE_PASS_DEPENDENCY(CycleInfoWrapperPass); 1456 INITIALIZE_PASS_END(AMDGPUAttributorLegacy, DEBUG_TYPE, "AMDGPU Attributor", 1457 false, false) 1458