//===- AMDGPUAttributor.cpp -----------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass uses the Attributor framework to deduce AMDGPU attributes.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/Analysis/CycleAnalysis.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/IPO/Attributor.h"

#define DEBUG_TYPE "amdgpu-attributor"

namespace llvm {
void initializeCycleInfoWrapperPassPass(PassRegistry &);
} // namespace llvm

using namespace llvm;

static cl::opt<unsigned> KernargPreloadCount(
    "amdgpu-kernarg-preload-count",
    cl::desc("How many kernel arguments to preload onto SGPRs"), cl::init(0));

static cl::opt<unsigned> IndirectCallSpecializationThreshold(
    "amdgpu-indirect-call-specialization-threshold",
    cl::desc(
        "Threshold controlling whether an indirect call will be specialized"),
    cl::init(3));

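// The AMDGPU_ATTRIBUTE X-macro below is expanded three times over
// AMDGPUAttributes.def: first to enumerate a position for each implicit
// argument, then to turn each position into a bit of ImplicitArgumentMask,
// and finally to build the table mapping every mask bit to its attribute
// string.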
#define AMDGPU_ATTRIBUTE(Name, Str) Name##_POS,

enum ImplicitArgumentPositions {
#include "AMDGPUAttributes.def"
  LAST_ARG_POS
};

#define AMDGPU_ATTRIBUTE(Name, Str) Name = 1 << Name##_POS,

enum ImplicitArgumentMask {
  NOT_IMPLICIT_INPUT = 0,
#include "AMDGPUAttributes.def"
  ALL_ARGUMENT_MASK = (1 << LAST_ARG_POS) - 1
};

#define AMDGPU_ATTRIBUTE(Name, Str) {Name, Str},
static constexpr std::pair<ImplicitArgumentMask, StringLiteral>
    ImplicitAttrs[] = {
#include "AMDGPUAttributes.def"
};

// We do not need to note the x workitem or workgroup id because they are always
// initialized.
//
// TODO: We should not add the attributes if the known compile time workgroup
// size is 1 for y/z.
static ImplicitArgumentMask
intrinsicToAttrMask(Intrinsic::ID ID, bool &NonKernelOnly, bool &NeedsImplicit,
                    bool HasApertureRegs, bool SupportsGetDoorBellID,
                    unsigned CodeObjectVersion) {
  switch (ID) {
  case Intrinsic::amdgcn_workitem_id_x:
    NonKernelOnly = true;
    return WORKITEM_ID_X;
  case Intrinsic::amdgcn_workgroup_id_x:
    NonKernelOnly = true;
    return WORKGROUP_ID_X;
  case Intrinsic::amdgcn_workitem_id_y:
  case Intrinsic::r600_read_tidig_y:
    return WORKITEM_ID_Y;
  case Intrinsic::amdgcn_workitem_id_z:
  case Intrinsic::r600_read_tidig_z:
    return WORKITEM_ID_Z;
  case Intrinsic::amdgcn_workgroup_id_y:
  case Intrinsic::r600_read_tgid_y:
    return WORKGROUP_ID_Y;
  case Intrinsic::amdgcn_workgroup_id_z:
  case Intrinsic::r600_read_tgid_z:
    return WORKGROUP_ID_Z;
  case Intrinsic::amdgcn_lds_kernel_id:
    return LDS_KERNEL_ID;
  case Intrinsic::amdgcn_dispatch_ptr:
    return DISPATCH_PTR;
  case Intrinsic::amdgcn_dispatch_id:
    return DISPATCH_ID;
  case Intrinsic::amdgcn_implicitarg_ptr:
    return IMPLICIT_ARG_PTR;
  // Need queue_ptr anyway. But under V5, we also need implicitarg_ptr to access
  // queue_ptr.
  case Intrinsic::amdgcn_queue_ptr:
    NeedsImplicit = (CodeObjectVersion >= AMDGPU::AMDHSA_COV5);
    return QUEUE_PTR;
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private:
    if (HasApertureRegs)
      return NOT_IMPLICIT_INPUT;
    // Under V5, we need implicitarg_ptr + offsets to access private_base or
    // shared_base. For pre-V5, however, we need to access them through
    // queue_ptr + offsets.
    return CodeObjectVersion >= AMDGPU::AMDHSA_COV5 ? IMPLICIT_ARG_PTR
                                                    : QUEUE_PTR;
  case Intrinsic::trap:
    if (SupportsGetDoorBellID) // GetDoorbellID support implemented since V4.
      return CodeObjectVersion >= AMDGPU::AMDHSA_COV4 ? NOT_IMPLICIT_INPUT
                                                      : QUEUE_PTR;
    NeedsImplicit = (CodeObjectVersion >= AMDGPU::AMDHSA_COV5);
    return QUEUE_PTR;
  default:
    return NOT_IMPLICIT_INPUT;
  }
}

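/// \return true if an addrspacecast from \p SrcAS to the flat address space
/// may require the queue pointer: when aperture registers are unavailable,
/// the private and shared apertures are read from the queue (or implicit
/// argument) pointer.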
static bool castRequiresQueuePtr(unsigned SrcAS) {
  return SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::PRIVATE_ADDRESS;
}

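/// \return true if \p C is a global variable in the LDS (local) or region
/// address space.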
static bool isDSAddress(const Constant *C) {
  const GlobalValue *GV = dyn_cast<GlobalValue>(C);
  if (!GV)
    return false;
  unsigned AS = GV->getAddressSpace();
  return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS;
}

/// Returns true if the function requires the implicit argument to be passed
/// regardless of the function contents.
static bool funcRequiresHostcallPtr(const Function &F) {
  // Sanitizers require the hostcall buffer passed in the implicit arguments.
  return F.hasFnAttribute(Attribute::SanitizeAddress) ||
         F.hasFnAttribute(Attribute::SanitizeThread) ||
         F.hasFnAttribute(Attribute::SanitizeMemory) ||
         F.hasFnAttribute(Attribute::SanitizeHWAddress) ||
         F.hasFnAttribute(Attribute::SanitizeMemTag);
}

namespace {
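/// Target-specific information cache shared by all AMDGPU abstract
/// attributes. It wraps the GCNSubtarget queries the attributes need and
/// caches, per constant, whether the constant refers to LDS globals or
/// contains addrspacecasts from the private/local address spaces to flat.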
class AMDGPUInformationCache : public InformationCache {
public:
  AMDGPUInformationCache(const Module &M, AnalysisGetter &AG,
                         BumpPtrAllocator &Allocator,
                         SetVector<Function *> *CGSCC, TargetMachine &TM)
      : InformationCache(M, AG, Allocator, CGSCC), TM(TM),
        CodeObjectVersion(AMDGPU::getAMDHSACodeObjectVersion(M)) {}

  TargetMachine &TM;

  enum ConstantStatus : uint8_t {
    NONE = 0,
    DS_GLOBAL = 1 << 0,
    ADDR_SPACE_CAST_PRIVATE_TO_FLAT = 1 << 1,
    ADDR_SPACE_CAST_LOCAL_TO_FLAT = 1 << 2,
    ADDR_SPACE_CAST_BOTH_TO_FLAT =
        ADDR_SPACE_CAST_PRIVATE_TO_FLAT | ADDR_SPACE_CAST_LOCAL_TO_FLAT
  };

  /// Check if the subtarget has aperture regs.
  bool hasApertureRegs(Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.hasApertureRegs();
  }

  /// Check if the subtarget supports GetDoorbellID.
  bool supportsGetDoorbellID(Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.supportsGetDoorbellID();
  }

  std::optional<std::pair<unsigned, unsigned>>
  getFlatWorkGroupSizeAttr(const Function &F) const {
    auto R = AMDGPU::getIntegerPairAttribute(F, "amdgpu-flat-work-group-size");
    if (!R)
      return std::nullopt;
    return std::make_pair(R->first, *(R->second));
  }

  std::pair<unsigned, unsigned>
  getDefaultFlatWorkGroupSize(const Function &F) const {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.getDefaultFlatWorkGroupSize(F.getCallingConv());
  }

  std::pair<unsigned, unsigned>
  getMaximumFlatWorkGroupRange(const Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return {ST.getMinFlatWorkGroupSize(), ST.getMaxFlatWorkGroupSize()};
  }

  SmallVector<unsigned> getMaxNumWorkGroups(const Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.getMaxNumWorkGroups(F);
  }

  /// Get code object version.
  unsigned getCodeObjectVersion() const { return CodeObjectVersion; }

  /// Get the effective value of "amdgpu-waves-per-eu" for the function,
  /// accounting for the interaction with the passed value to use for
  /// "amdgpu-flat-work-group-size".
  std::pair<unsigned, unsigned>
  getWavesPerEU(const Function &F,
                std::pair<unsigned, unsigned> FlatWorkGroupSize) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.getWavesPerEU(F, FlatWorkGroupSize);
  }

  std::optional<std::pair<unsigned, unsigned>>
  getWavesPerEUAttr(const Function &F) {
    auto Val = AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu",
                                               /*OnlyFirstRequired=*/true);
    if (!Val)
      return std::nullopt;
    if (!Val->second) {
      const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
      Val->second = ST.getMaxWavesPerEU();
    }
    return std::make_pair(Val->first, *(Val->second));
  }

  std::pair<unsigned, unsigned>
  getEffectiveWavesPerEU(const Function &F,
                         std::pair<unsigned, unsigned> WavesPerEU,
                         std::pair<unsigned, unsigned> FlatWorkGroupSize) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.getEffectiveWavesPerEU(WavesPerEU, FlatWorkGroupSize);
  }

  unsigned getMaxWavesPerEU(const Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.getMaxWavesPerEU();
  }

private:
  /// Check if the ConstantExpr \p CE uses an addrspacecast from private or
  /// local to flat. These casts may require the queue pointer.
  static uint8_t visitConstExpr(const ConstantExpr *CE) {
    uint8_t Status = NONE;

    if (CE->getOpcode() == Instruction::AddrSpaceCast) {
      unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace();
      if (SrcAS == AMDGPUAS::PRIVATE_ADDRESS)
        Status |= ADDR_SPACE_CAST_PRIVATE_TO_FLAT;
      else if (SrcAS == AMDGPUAS::LOCAL_ADDRESS)
        Status |= ADDR_SPACE_CAST_LOCAL_TO_FLAT;
    }

    return Status;
  }

  /// Get the constant access bitmap for \p C.
  uint8_t getConstantAccess(const Constant *C,
                            SmallPtrSetImpl<const Constant *> &Visited) {
    auto It = ConstantStatus.find(C);
    if (It != ConstantStatus.end())
      return It->second;

    uint8_t Result = 0;
    if (isDSAddress(C))
      Result = DS_GLOBAL;

    if (const auto *CE = dyn_cast<ConstantExpr>(C))
      Result |= visitConstExpr(CE);

    for (const Use &U : C->operands()) {
      const auto *OpC = dyn_cast<Constant>(U);
      if (!OpC || !Visited.insert(OpC).second)
        continue;

      Result |= getConstantAccess(OpC, Visited);
    }
    return Result;
  }

public:
  /// Returns true if \p Fn needs the queue pointer because of \p C.
  bool needsQueuePtr(const Constant *C, Function &Fn) {
    bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(Fn.getCallingConv());
    bool HasAperture = hasApertureRegs(Fn);

    // No need to explore the constants.
    if (!IsNonEntryFunc && HasAperture)
      return false;

    SmallPtrSet<const Constant *, 8> Visited;
    uint8_t Access = getConstantAccess(C, Visited);

    // We need to trap on DS globals in non-entry functions.
    if (IsNonEntryFunc && (Access & DS_GLOBAL))
      return true;

    return !HasAperture && (Access & ADDR_SPACE_CAST_BOTH_TO_FLAT);
  }

  bool checkConstForAddrSpaceCastFromPrivate(const Constant *C) {
    SmallPtrSet<const Constant *, 8> Visited;
    uint8_t Access = getConstantAccess(C, Visited);
    return Access & ADDR_SPACE_CAST_PRIVATE_TO_FLAT;
  }

private:
  /// Used to determine if the Constant needs the queue pointer.
  DenseMap<const Constant *, uint8_t> ConstantStatus;
  const unsigned CodeObjectVersion;
};

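/// Abstract attribute tracking which implicit kernel arguments a function
/// does not need. A set bit means the corresponding implicit argument is
/// assumed (or known) to be unneeded; updateImpl() clears assumed bits as
/// uses are discovered, and manifest() emits the attribute strings from
/// AMDGPUAttributes.def for the bits that remain known.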
struct AAAMDAttributes
    : public StateWrapper<BitIntegerState<uint32_t, ALL_ARGUMENT_MASK, 0>,
                          AbstractAttribute> {
  using Base = StateWrapper<BitIntegerState<uint32_t, ALL_ARGUMENT_MASK, 0>,
                            AbstractAttribute>;

  AAAMDAttributes(const IRPosition &IRP, Attributor &A) : Base(IRP) {}

  /// Create an abstract attribute view for the position \p IRP.
  static AAAMDAttributes &createForPosition(const IRPosition &IRP,
                                            Attributor &A);

  /// See AbstractAttribute::getName().
  const std::string getName() const override { return "AAAMDAttributes"; }

  /// See AbstractAttribute::getIdAddr().
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAAMDAttributes.
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  /// Unique ID (due to the unique address)
  static const char ID;
};
const char AAAMDAttributes::ID = 0;

struct AAUniformWorkGroupSize
    : public StateWrapper<BooleanState, AbstractAttribute> {
  using Base = StateWrapper<BooleanState, AbstractAttribute>;
  AAUniformWorkGroupSize(const IRPosition &IRP, Attributor &A) : Base(IRP) {}

  /// Create an abstract attribute view for the position \p IRP.
  static AAUniformWorkGroupSize &createForPosition(const IRPosition &IRP,
                                                   Attributor &A);

  /// See AbstractAttribute::getName().
  const std::string getName() const override {
    return "AAUniformWorkGroupSize";
  }

  /// See AbstractAttribute::getIdAddr().
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAUniformWorkGroupSize.
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  /// Unique ID (due to the unique address)
  static const char ID;
};
const char AAUniformWorkGroupSize::ID = 0;

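/// Function-level implementation of AAUniformWorkGroupSize. Kernels seed the
/// state from their existing "uniform-work-group-size" attribute; for other
/// functions the state is clamped against all callers, so the flag only
/// survives if every known caller has it.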
struct AAUniformWorkGroupSizeFunction : public AAUniformWorkGroupSize {
  AAUniformWorkGroupSizeFunction(const IRPosition &IRP, Attributor &A)
      : AAUniformWorkGroupSize(IRP, A) {}

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();
    CallingConv::ID CC = F->getCallingConv();

    if (CC != CallingConv::AMDGPU_KERNEL)
      return;

    bool InitialValue = false;
    if (F->hasFnAttribute("uniform-work-group-size"))
      InitialValue =
          F->getFnAttribute("uniform-work-group-size").getValueAsString() ==
          "true";

    if (InitialValue)
      indicateOptimisticFixpoint();
    else
      indicatePessimisticFixpoint();
  }

  ChangeStatus updateImpl(Attributor &A) override {
    ChangeStatus Change = ChangeStatus::UNCHANGED;

    auto CheckCallSite = [&](AbstractCallSite CS) {
      Function *Caller = CS.getInstruction()->getFunction();
      LLVM_DEBUG(dbgs() << "[AAUniformWorkGroupSize] Call " << Caller->getName()
                        << "->" << getAssociatedFunction()->getName() << "\n");

      const auto *CallerInfo = A.getAAFor<AAUniformWorkGroupSize>(
          *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);
      if (!CallerInfo || !CallerInfo->isValidState())
        return false;

      Change = Change | clampStateAndIndicateChange(this->getState(),
                                                    CallerInfo->getState());

      return true;
    };

    bool AllCallSitesKnown = true;
    if (!A.checkForAllCallSites(CheckCallSite, *this, true, AllCallSitesKnown))
      return indicatePessimisticFixpoint();

    return Change;
  }

  ChangeStatus manifest(Attributor &A) override {
    SmallVector<Attribute, 8> AttrList;
    LLVMContext &Ctx = getAssociatedFunction()->getContext();

    AttrList.push_back(Attribute::get(Ctx, "uniform-work-group-size",
                                      getAssumed() ? "true" : "false"));
    return A.manifestAttrs(getIRPosition(), AttrList,
                           /* ForceReplace */ true);
  }

  bool isValidState() const override {
    // This state is always valid, even when the assumed value is false.
    return true;
  }

  const std::string getAsStr(Attributor *) const override {
    return "AMDWorkGroupSize[" + std::to_string(getAssumed()) + "]";
  }

  /// See AbstractAttribute::trackStatistics()
  void trackStatistics() const override {}
};

AAUniformWorkGroupSize &
AAUniformWorkGroupSize::createForPosition(const IRPosition &IRP,
                                          Attributor &A) {
  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
    return *new (A.Allocator) AAUniformWorkGroupSizeFunction(IRP, A);
  llvm_unreachable(
      "AAUniformWorkGroupSize is only valid for function position");
}

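/// Function-level implementation of AAAMDAttributes. It starts from the
/// optimistic assumption that no implicit argument is needed (except those
/// forced by sanitizers or pre-existing attributes) and removes assumed bits
/// as intrinsic calls, callees, address space casts, and implicit kernel
/// argument loads are discovered.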
struct AAAMDAttributesFunction : public AAAMDAttributes {
  AAAMDAttributesFunction(const IRPosition &IRP, Attributor &A)
      : AAAMDAttributes(IRP, A) {}

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();

    // If the function requires the implicit arg pointer due to sanitizers,
    // assume it's needed even if explicitly marked as not requiring it.
    const bool NeedsHostcall = funcRequiresHostcallPtr(*F);
    if (NeedsHostcall) {
      removeAssumedBits(IMPLICIT_ARG_PTR);
      removeAssumedBits(HOSTCALL_PTR);
    }

    for (auto Attr : ImplicitAttrs) {
      if (NeedsHostcall &&
          (Attr.first == IMPLICIT_ARG_PTR || Attr.first == HOSTCALL_PTR))
        continue;

      if (F->hasFnAttribute(Attr.second))
        addKnownBits(Attr.first);
    }

    if (F->isDeclaration())
      return;

    // Ignore functions with graphics calling conventions; these are currently
    // not allowed to have kernel arguments.
    if (AMDGPU::isGraphics(F->getCallingConv())) {
      indicatePessimisticFixpoint();
      return;
    }
  }

  ChangeStatus updateImpl(Attributor &A) override {
    Function *F = getAssociatedFunction();
    // The current assumed state used to determine a change.
    auto OrigAssumed = getAssumed();

    // Check for Intrinsics and propagate attributes.
    const AACallEdges *AAEdges = A.getAAFor<AACallEdges>(
        *this, this->getIRPosition(), DepClassTy::REQUIRED);
    if (!AAEdges || !AAEdges->isValidState() ||
        AAEdges->hasNonAsmUnknownCallee())
      return indicatePessimisticFixpoint();

    bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(F->getCallingConv());

    bool NeedsImplicit = false;
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
    bool HasApertureRegs = InfoCache.hasApertureRegs(*F);
    bool SupportsGetDoorbellID = InfoCache.supportsGetDoorbellID(*F);
    unsigned COV = InfoCache.getCodeObjectVersion();

    for (Function *Callee : AAEdges->getOptimisticEdges()) {
      Intrinsic::ID IID = Callee->getIntrinsicID();
      if (IID == Intrinsic::not_intrinsic) {
        const AAAMDAttributes *AAAMD = A.getAAFor<AAAMDAttributes>(
            *this, IRPosition::function(*Callee), DepClassTy::REQUIRED);
        if (!AAAMD || !AAAMD->isValidState())
          return indicatePessimisticFixpoint();
        *this &= *AAAMD;
        continue;
      }

      bool NonKernelOnly = false;
      ImplicitArgumentMask AttrMask =
          intrinsicToAttrMask(IID, NonKernelOnly, NeedsImplicit,
                              HasApertureRegs, SupportsGetDoorbellID, COV);
      if (AttrMask != NOT_IMPLICIT_INPUT) {
        if ((IsNonEntryFunc || !NonKernelOnly))
          removeAssumedBits(AttrMask);
      }
    }

    // Need implicitarg_ptr to access queue_ptr, private_base, and shared_base.
    if (NeedsImplicit)
      removeAssumedBits(IMPLICIT_ARG_PTR);

    if (isAssumed(QUEUE_PTR) && checkForQueuePtr(A)) {
      // Under V5, we need implicitarg_ptr + offsets to access private_base or
      // shared_base. We do not actually need queue_ptr.
      if (COV >= 5)
        removeAssumedBits(IMPLICIT_ARG_PTR);
      else
        removeAssumedBits(QUEUE_PTR);
    }

    if (funcRetrievesMultigridSyncArg(A, COV)) {
      assert(!isAssumed(IMPLICIT_ARG_PTR) &&
             "multigrid_sync_arg needs implicitarg_ptr");
      removeAssumedBits(MULTIGRID_SYNC_ARG);
    }

    if (funcRetrievesHostcallPtr(A, COV)) {
      assert(!isAssumed(IMPLICIT_ARG_PTR) && "hostcall needs implicitarg_ptr");
      removeAssumedBits(HOSTCALL_PTR);
    }

    if (funcRetrievesHeapPtr(A, COV)) {
      assert(!isAssumed(IMPLICIT_ARG_PTR) && "heap_ptr needs implicitarg_ptr");
      removeAssumedBits(HEAP_PTR);
    }

    if (isAssumed(QUEUE_PTR) && funcRetrievesQueuePtr(A, COV)) {
      assert(!isAssumed(IMPLICIT_ARG_PTR) && "queue_ptr needs implicitarg_ptr");
      removeAssumedBits(QUEUE_PTR);
    }

    if (isAssumed(LDS_KERNEL_ID) && funcRetrievesLDSKernelId(A)) {
      removeAssumedBits(LDS_KERNEL_ID);
    }

    if (isAssumed(DEFAULT_QUEUE) && funcRetrievesDefaultQueue(A, COV))
      removeAssumedBits(DEFAULT_QUEUE);

    if (isAssumed(COMPLETION_ACTION) && funcRetrievesCompletionAction(A, COV))
      removeAssumedBits(COMPLETION_ACTION);

    if (isAssumed(FLAT_SCRATCH_INIT) && needFlatScratchInit(A))
      removeAssumedBits(FLAT_SCRATCH_INIT);

    return getAssumed() != OrigAssumed ? ChangeStatus::CHANGED
                                       : ChangeStatus::UNCHANGED;
  }

  ChangeStatus manifest(Attributor &A) override {
    SmallVector<Attribute, 8> AttrList;
    LLVMContext &Ctx = getAssociatedFunction()->getContext();

    for (auto Attr : ImplicitAttrs) {
      if (isKnown(Attr.first))
        AttrList.push_back(Attribute::get(Ctx, Attr.second));
    }

    return A.manifestAttrs(getIRPosition(), AttrList,
                           /* ForceReplace */ true);
  }

  const std::string getAsStr(Attributor *) const override {
    std::string Str;
    raw_string_ostream OS(Str);
    OS << "AMDInfo[";
    for (auto Attr : ImplicitAttrs)
      if (isAssumed(Attr.first))
        OS << ' ' << Attr.second;
    OS << " ]";
    return OS.str();
  }

  /// See AbstractAttribute::trackStatistics()
  void trackStatistics() const override {}

private:
  bool checkForQueuePtr(Attributor &A) {
    Function *F = getAssociatedFunction();
    bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(F->getCallingConv());

    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());

    bool NeedsQueuePtr = false;

    auto CheckAddrSpaceCasts = [&](Instruction &I) {
      unsigned SrcAS = static_cast<AddrSpaceCastInst &>(I).getSrcAddressSpace();
      if (castRequiresQueuePtr(SrcAS)) {
        NeedsQueuePtr = true;
        return false;
      }
      return true;
    };

    bool HasApertureRegs = InfoCache.hasApertureRegs(*F);

    // `checkForAllInstructions` is much cheaper than manually walking every
    // instruction, so try it first.

    // The queue pointer is not needed if aperture registers are present.
    if (!HasApertureRegs) {
      bool UsedAssumedInformation = false;
      A.checkForAllInstructions(CheckAddrSpaceCasts, *this,
                                {Instruction::AddrSpaceCast},
                                UsedAssumedInformation);
    }

    // If we found that we need the queue pointer, there is nothing else to do.
    if (NeedsQueuePtr)
      return true;

    if (!IsNonEntryFunc && HasApertureRegs)
      return false;

    for (BasicBlock &BB : *F) {
      for (Instruction &I : BB) {
        for (const Use &U : I.operands()) {
          if (const auto *C = dyn_cast<Constant>(U)) {
            if (InfoCache.needsQueuePtr(C, *F))
              return true;
          }
        }
      }
    }

    return false;
  }

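  // The helpers below check whether the function reads a specific 8-byte slot
  // of the implicit kernel argument area (multigrid sync arg, hostcall
  // pointer, default queue, completion action, heap pointer, queue pointer)
  // through llvm.amdgcn.implicitarg.ptr.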
  bool funcRetrievesMultigridSyncArg(Attributor &A, unsigned COV) {
    auto Pos = llvm::AMDGPU::getMultigridSyncArgImplicitArgPosition(COV);
    AA::RangeTy Range(Pos, 8);
    return funcRetrievesImplicitKernelArg(A, Range);
  }

  bool funcRetrievesHostcallPtr(Attributor &A, unsigned COV) {
    auto Pos = llvm::AMDGPU::getHostcallImplicitArgPosition(COV);
    AA::RangeTy Range(Pos, 8);
    return funcRetrievesImplicitKernelArg(A, Range);
  }

  bool funcRetrievesDefaultQueue(Attributor &A, unsigned COV) {
    auto Pos = llvm::AMDGPU::getDefaultQueueImplicitArgPosition(COV);
    AA::RangeTy Range(Pos, 8);
    return funcRetrievesImplicitKernelArg(A, Range);
  }

  bool funcRetrievesCompletionAction(Attributor &A, unsigned COV) {
    auto Pos = llvm::AMDGPU::getCompletionActionImplicitArgPosition(COV);
    AA::RangeTy Range(Pos, 8);
    return funcRetrievesImplicitKernelArg(A, Range);
  }

  bool funcRetrievesHeapPtr(Attributor &A, unsigned COV) {
    if (COV < 5)
      return false;
    AA::RangeTy Range(AMDGPU::ImplicitArg::HEAP_PTR_OFFSET, 8);
    return funcRetrievesImplicitKernelArg(A, Range);
  }

  bool funcRetrievesQueuePtr(Attributor &A, unsigned COV) {
    if (COV < 5)
      return false;
    AA::RangeTy Range(AMDGPU::ImplicitArg::QUEUE_PTR_OFFSET, 8);
    return funcRetrievesImplicitKernelArg(A, Range);
  }

  bool funcRetrievesImplicitKernelArg(Attributor &A, AA::RangeTy Range) {
    // Check if this is a call to the implicitarg_ptr intrinsic whose result
    // is used to load the implicit kernel argument slot described by Range.
    // The argument is considered unused only if every use of the
    // implicitarg_ptr is a load that clearly does not touch any byte in
    // Range. We check this by tracing all the uses of the initial call to
    // the implicitarg_ptr intrinsic.
    auto DoesNotLeadToKernelArgLoc = [&](Instruction &I) {
      auto &Call = cast<CallBase>(I);
      if (Call.getIntrinsicID() != Intrinsic::amdgcn_implicitarg_ptr)
        return true;

      const auto *PointerInfoAA = A.getAAFor<AAPointerInfo>(
          *this, IRPosition::callsite_returned(Call), DepClassTy::REQUIRED);
      if (!PointerInfoAA || !PointerInfoAA->getState().isValidState())
        return false;

      return PointerInfoAA->forallInterferingAccesses(
          Range, [](const AAPointerInfo::Access &Acc, bool IsExact) {
            return Acc.getRemoteInst()->isDroppable();
          });
    };

    bool UsedAssumedInformation = false;
    return !A.checkForAllCallLikeInstructions(DoesNotLeadToKernelArgLoc, *this,
                                              UsedAssumedInformation);
  }

  bool funcRetrievesLDSKernelId(Attributor &A) {
    auto DoesNotRetrieve = [&](Instruction &I) {
      auto &Call = cast<CallBase>(I);
      return Call.getIntrinsicID() != Intrinsic::amdgcn_lds_kernel_id;
    };
    bool UsedAssumedInformation = false;
    return !A.checkForAllCallLikeInstructions(DoesNotRetrieve, *this,
                                              UsedAssumedInformation);
  }

  // Returns true if FlatScratchInit is needed, i.e., the no-flat-scratch-init
  // attribute should not be set.
  bool needFlatScratchInit(Attributor &A) {
    assert(isAssumed(FLAT_SCRATCH_INIT)); // only called if the bit is still set

    // Check all AddrSpaceCast instructions. FlatScratchInit is needed if
    // there is a cast from PRIVATE_ADDRESS.
    auto AddrSpaceCastNotFromPrivate = [](Instruction &I) {
      return cast<AddrSpaceCastInst>(I).getSrcAddressSpace() !=
             AMDGPUAS::PRIVATE_ADDRESS;
    };

    bool UsedAssumedInformation = false;
    if (!A.checkForAllInstructions(AddrSpaceCastNotFromPrivate, *this,
                                   {Instruction::AddrSpaceCast},
                                   UsedAssumedInformation))
      return true;

    // Check for addrSpaceCast from PRIVATE_ADDRESS in constant expressions
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());

    Function *F = getAssociatedFunction();
    for (Instruction &I : instructions(F)) {
      for (const Use &U : I.operands()) {
        if (const auto *C = dyn_cast<Constant>(U)) {
          if (InfoCache.checkConstForAddrSpaceCastFromPrivate(C))
            return true;
        }
      }
    }

    // Finally check callees.

    // This is called for each call-like instruction; returning false means
    // this function needs FlatScratchInit (no-flat-scratch-init must not be
    // set).
    auto CheckForNoFlatScratchInit = [&](Instruction &I) {
      const auto &CB = cast<CallBase>(I);
      const Function *Callee = CB.getCalledFunction();

      // Callee == 0 for inline asm or indirect call with known callees.
      // In the latter case, updateImpl() already checked the callees and we
      // know their FLAT_SCRATCH_INIT bit is set.
      // If function has indirect call with unknown callees, the bit is
      // already removed in updateImpl() and execution won't reach here.
      if (!Callee)
        return true;

      return Callee->getIntrinsicID() !=
             Intrinsic::amdgcn_addrspacecast_nonnull;
    };

    UsedAssumedInformation = false;
    // If any check returns false (i.e., FlatScratchInit is needed),
    // checkForAllCallLikeInstructions returns false, in which case this
    // function returns true.
    return !A.checkForAllCallLikeInstructions(CheckForNoFlatScratchInit, *this,
                                              UsedAssumedInformation);
  }
};

AAAMDAttributes &AAAMDAttributes::createForPosition(const IRPosition &IRP,
                                                    Attributor &A) {
  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
    return *new (A.Allocator) AAAMDAttributesFunction(IRP, A);
  llvm_unreachable("AAAMDAttributes is only valid for function position");
}

/// Base class to derive different size ranges.
struct AAAMDSizeRangeAttribute
    : public StateWrapper<IntegerRangeState, AbstractAttribute, uint32_t> {
  using Base = StateWrapper<IntegerRangeState, AbstractAttribute, uint32_t>;

  StringRef AttrName;

  AAAMDSizeRangeAttribute(const IRPosition &IRP, Attributor &A,
                          StringRef AttrName)
      : Base(IRP, 32), AttrName(AttrName) {}

  /// See AbstractAttribute::trackStatistics()
  void trackStatistics() const override {}

  template <class AttributeImpl> ChangeStatus updateImplImpl(Attributor &A) {
    ChangeStatus Change = ChangeStatus::UNCHANGED;

    auto CheckCallSite = [&](AbstractCallSite CS) {
      Function *Caller = CS.getInstruction()->getFunction();
      LLVM_DEBUG(dbgs() << '[' << getName() << "] Call " << Caller->getName()
                        << "->" << getAssociatedFunction()->getName() << '\n');

      const auto *CallerInfo = A.getAAFor<AttributeImpl>(
          *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);
      if (!CallerInfo || !CallerInfo->isValidState())
        return false;

      Change |=
          clampStateAndIndicateChange(this->getState(), CallerInfo->getState());

      return true;
    };

    bool AllCallSitesKnown = true;
    if (!A.checkForAllCallSites(CheckCallSite, *this,
                                /*RequireAllCallSites=*/true,
                                AllCallSitesKnown))
      return indicatePessimisticFixpoint();

    return Change;
  }

  /// Clamp the assumed range to the default value ([Min, Max]) and emit the
  /// attribute if it is not the same as the default.
  ChangeStatus
  emitAttributeIfNotDefaultAfterClamp(Attributor &A,
                                      std::pair<unsigned, unsigned> Default) {
    auto [Min, Max] = Default;
    unsigned Lower = getAssumed().getLower().getZExtValue();
    unsigned Upper = getAssumed().getUpper().getZExtValue();

    // Clamp the range to the default value.
    if (Lower < Min)
      Lower = Min;
    if (Upper > Max + 1)
      Upper = Max + 1;

    // Do not manifest if the range is invalid or equal to the default after
    // clamping.
    if ((Lower == Min && Upper == Max + 1) || (Upper < Lower))
      return ChangeStatus::UNCHANGED;

    Function *F = getAssociatedFunction();
    LLVMContext &Ctx = F->getContext();
    SmallString<10> Buffer;
    raw_svector_ostream OS(Buffer);
    OS << Lower << ',' << Upper - 1;
    return A.manifestAttrs(getIRPosition(),
                           {Attribute::get(Ctx, AttrName, OS.str())},
                           /*ForceReplace=*/true);
  }

  const std::string getAsStr(Attributor *) const override {
    std::string Str;
    raw_string_ostream OS(Str);
    OS << getName() << '[';
    OS << getAssumed().getLower() << ',' << getAssumed().getUpper() - 1;
    OS << ']';
    return OS.str();
  }
};

/// Propagate amdgpu-flat-work-group-size attribute.
struct AAAMDFlatWorkGroupSize : public AAAMDSizeRangeAttribute {
  AAAMDFlatWorkGroupSize(const IRPosition &IRP, Attributor &A)
      : AAAMDSizeRangeAttribute(IRP, A, "amdgpu-flat-work-group-size") {}

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());

    bool HasAttr = false;
    auto Range = InfoCache.getDefaultFlatWorkGroupSize(*F);
    auto MaxRange = InfoCache.getMaximumFlatWorkGroupRange(*F);

    if (auto Attr = InfoCache.getFlatWorkGroupSizeAttr(*F)) {
      // We only honor the attribute if it is not the maximum range, because
      // the front end unfortunately always emits the attribute, and sometimes
      // it emits the maximum range.
      if (*Attr != MaxRange) {
        Range = *Attr;
        HasAttr = true;
      }
    }

    // We don't want to directly clamp the state if it's the max range because
    // that is basically the worst state.
    if (Range == MaxRange)
      return;

    auto [Min, Max] = Range;
    ConstantRange CR(APInt(32, Min), APInt(32, Max + 1));
    IntegerRangeState IRS(CR);
    clampStateAndIndicateChange(this->getState(), IRS);

    if (HasAttr || AMDGPU::isEntryFunctionCC(F->getCallingConv()))
      indicateOptimisticFixpoint();
  }

  ChangeStatus updateImpl(Attributor &A) override {
    return updateImplImpl<AAAMDFlatWorkGroupSize>(A);
  }

  /// Create an abstract attribute view for the position \p IRP.
  static AAAMDFlatWorkGroupSize &createForPosition(const IRPosition &IRP,
                                                   Attributor &A);

  ChangeStatus manifest(Attributor &A) override {
    Function *F = getAssociatedFunction();
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
    return emitAttributeIfNotDefaultAfterClamp(
        A, InfoCache.getMaximumFlatWorkGroupRange(*F));
  }

  /// See AbstractAttribute::getName()
  const std::string getName() const override {
    return "AAAMDFlatWorkGroupSize";
  }

  /// See AbstractAttribute::getIdAddr()
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAAMDFlatWorkGroupSize
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  /// Unique ID (due to the unique address)
  static const char ID;
};

const char AAAMDFlatWorkGroupSize::ID = 0;

AAAMDFlatWorkGroupSize &
AAAMDFlatWorkGroupSize::createForPosition(const IRPosition &IRP,
                                          Attributor &A) {
  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
    return *new (A.Allocator) AAAMDFlatWorkGroupSize(IRP, A);
  llvm_unreachable(
      "AAAMDFlatWorkGroupSize is only valid for function position");
}

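/// Product of three decreasing integer states, one per workgroup dimension.
/// Used by AAAMDMaxNumWorkgroups to propagate the minimum known
/// amdgpu-max-num-workgroups bound in X, Y and Z.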
struct TupleDecIntegerRangeState : public AbstractState {
  DecIntegerState<uint32_t> X, Y, Z;

  bool isValidState() const override {
    return X.isValidState() && Y.isValidState() && Z.isValidState();
  }

  bool isAtFixpoint() const override {
    return X.isAtFixpoint() && Y.isAtFixpoint() && Z.isAtFixpoint();
  }

  ChangeStatus indicateOptimisticFixpoint() override {
    return X.indicateOptimisticFixpoint() | Y.indicateOptimisticFixpoint() |
           Z.indicateOptimisticFixpoint();
  }

  ChangeStatus indicatePessimisticFixpoint() override {
    return X.indicatePessimisticFixpoint() | Y.indicatePessimisticFixpoint() |
           Z.indicatePessimisticFixpoint();
  }

  TupleDecIntegerRangeState operator^=(const TupleDecIntegerRangeState &Other) {
    X ^= Other.X;
    Y ^= Other.Y;
    Z ^= Other.Z;
    return *this;
  }

  bool operator==(const TupleDecIntegerRangeState &Other) const {
    return X == Other.X && Y == Other.Y && Z == Other.Z;
  }

  TupleDecIntegerRangeState &getAssumed() { return *this; }
  const TupleDecIntegerRangeState &getAssumed() const { return *this; }
};

using AAAMDMaxNumWorkgroupsState =
    StateWrapper<TupleDecIntegerRangeState, AbstractAttribute, uint32_t>;

/// Propagate amdgpu-max-num-workgroups attribute.
struct AAAMDMaxNumWorkgroups
    : public StateWrapper<TupleDecIntegerRangeState, AbstractAttribute> {
  using Base = StateWrapper<TupleDecIntegerRangeState, AbstractAttribute>;

  AAAMDMaxNumWorkgroups(const IRPosition &IRP, Attributor &A) : Base(IRP) {}

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());

    SmallVector<unsigned> MaxNumWorkgroups = InfoCache.getMaxNumWorkGroups(*F);

    X.takeKnownMinimum(MaxNumWorkgroups[0]);
    Y.takeKnownMinimum(MaxNumWorkgroups[1]);
    Z.takeKnownMinimum(MaxNumWorkgroups[2]);

    if (AMDGPU::isEntryFunctionCC(F->getCallingConv()))
      indicatePessimisticFixpoint();
  }

  ChangeStatus updateImpl(Attributor &A) override {
    ChangeStatus Change = ChangeStatus::UNCHANGED;

    auto CheckCallSite = [&](AbstractCallSite CS) {
      Function *Caller = CS.getInstruction()->getFunction();
      LLVM_DEBUG(dbgs() << "[AAAMDMaxNumWorkgroups] Call " << Caller->getName()
                        << "->" << getAssociatedFunction()->getName() << '\n');

      const auto *CallerInfo = A.getAAFor<AAAMDMaxNumWorkgroups>(
          *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);
      if (!CallerInfo || !CallerInfo->isValidState())
        return false;

      Change |=
          clampStateAndIndicateChange(this->getState(), CallerInfo->getState());
      return true;
    };

    bool AllCallSitesKnown = true;
    if (!A.checkForAllCallSites(CheckCallSite, *this,
                                /*RequireAllCallSites=*/true,
                                AllCallSitesKnown))
      return indicatePessimisticFixpoint();

    return Change;
  }

  /// Create an abstract attribute view for the position \p IRP.
  static AAAMDMaxNumWorkgroups &createForPosition(const IRPosition &IRP,
                                                  Attributor &A);

  ChangeStatus manifest(Attributor &A) override {
    Function *F = getAssociatedFunction();
    LLVMContext &Ctx = F->getContext();
    SmallString<32> Buffer;
    raw_svector_ostream OS(Buffer);
    OS << X.getAssumed() << ',' << Y.getAssumed() << ',' << Z.getAssumed();

    // TODO: Should annotate loads of the group size for this to do anything
    // useful.
    return A.manifestAttrs(
        getIRPosition(),
        {Attribute::get(Ctx, "amdgpu-max-num-workgroups", OS.str())},
        /* ForceReplace= */ true);
  }

  const std::string getName() const override { return "AAAMDMaxNumWorkgroups"; }

  const std::string getAsStr(Attributor *) const override {
    std::string Buffer = "AAAMDMaxNumWorkgroupsState[";
    raw_string_ostream OS(Buffer);
    OS << X.getAssumed() << ',' << Y.getAssumed() << ',' << Z.getAssumed()
       << ']';
    return OS.str();
  }

  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAAMDMaxNumWorkgroups
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  void trackStatistics() const override {}

  /// Unique ID (due to the unique address)
  static const char ID;
};

const char AAAMDMaxNumWorkgroups::ID = 0;

AAAMDMaxNumWorkgroups &
AAAMDMaxNumWorkgroups::createForPosition(const IRPosition &IRP, Attributor &A) {
  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
    return *new (A.Allocator) AAAMDMaxNumWorkgroups(IRP, A);
  llvm_unreachable("AAAMDMaxNumWorkgroups is only valid for function position");
}

/// Propagate amdgpu-waves-per-eu attribute.
struct AAAMDWavesPerEU : public AAAMDSizeRangeAttribute {
  AAAMDWavesPerEU(const IRPosition &IRP, Attributor &A)
      : AAAMDSizeRangeAttribute(IRP, A, "amdgpu-waves-per-eu") {}

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());

    auto TakeRange = [&](std::pair<unsigned, unsigned> R) {
      auto [Min, Max] = R;
      ConstantRange Range(APInt(32, Min), APInt(32, Max + 1));
      IntegerRangeState RangeState(Range);
      clampStateAndIndicateChange(this->getState(), RangeState);
      indicateOptimisticFixpoint();
    };

    std::pair<unsigned, unsigned> MaxWavesPerEURange{
        1U, InfoCache.getMaxWavesPerEU(*F)};

    // If the attribute exists, we will honor it if it is not the default.
    if (auto Attr = InfoCache.getWavesPerEUAttr(*F)) {
      if (*Attr != MaxWavesPerEURange) {
        TakeRange(*Attr);
        return;
      }
    }

    // Unlike AAAMDFlatWorkGroupSize, this is trickier. Since the calculation
    // of waves per EU involves the flat work group size, we can't simply use
    // an assumed flat work group size as a starting point, because the flat
    // work group size is updated in the opposite direction of waves per EU.
    // However, we can still do something for an entry function. An entry
    // function is a terminal node, and the flat work group size (either from
    // the attribute or the default) will be used anyway, so we can take that
    // value and compute the waves per EU from it. This result cannot be
    // updated afterwards, but it still allows us to propagate it.
    if (AMDGPU::isEntryFunctionCC(F->getCallingConv())) {
      std::pair<unsigned, unsigned> FlatWorkGroupSize;
      if (auto Attr = InfoCache.getFlatWorkGroupSizeAttr(*F))
        FlatWorkGroupSize = *Attr;
      else
        FlatWorkGroupSize = InfoCache.getDefaultFlatWorkGroupSize(*F);
      TakeRange(InfoCache.getEffectiveWavesPerEU(*F, MaxWavesPerEURange,
                                                 FlatWorkGroupSize));
    }
  }

  ChangeStatus updateImpl(Attributor &A) override {
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
    ChangeStatus Change = ChangeStatus::UNCHANGED;

    auto CheckCallSite = [&](AbstractCallSite CS) {
      Function *Caller = CS.getInstruction()->getFunction();
      Function *Func = getAssociatedFunction();
      LLVM_DEBUG(dbgs() << '[' << getName() << "] Call " << Caller->getName()
                        << "->" << Func->getName() << '\n');

      const auto *CallerInfo = A.getAAFor<AAAMDWavesPerEU>(
          *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);
      const auto *AssumedGroupSize = A.getAAFor<AAAMDFlatWorkGroupSize>(
          *this, IRPosition::function(*Func), DepClassTy::REQUIRED);
      if (!CallerInfo || !AssumedGroupSize || !CallerInfo->isValidState() ||
          !AssumedGroupSize->isValidState())
        return false;

      unsigned Min, Max;
      std::tie(Min, Max) = InfoCache.getEffectiveWavesPerEU(
          *Caller,
          {CallerInfo->getAssumed().getLower().getZExtValue(),
           CallerInfo->getAssumed().getUpper().getZExtValue() - 1},
          {AssumedGroupSize->getAssumed().getLower().getZExtValue(),
           AssumedGroupSize->getAssumed().getUpper().getZExtValue() - 1});
      ConstantRange CallerRange(APInt(32, Min), APInt(32, Max + 1));
      IntegerRangeState CallerRangeState(CallerRange);
      Change |= clampStateAndIndicateChange(this->getState(), CallerRangeState);

      return true;
    };

    bool AllCallSitesKnown = true;
    if (!A.checkForAllCallSites(CheckCallSite, *this, true, AllCallSitesKnown))
      return indicatePessimisticFixpoint();

    return Change;
  }

  /// Create an abstract attribute view for the position \p IRP.
  static AAAMDWavesPerEU &createForPosition(const IRPosition &IRP,
                                            Attributor &A);

  ChangeStatus manifest(Attributor &A) override {
    Function *F = getAssociatedFunction();
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
    return emitAttributeIfNotDefaultAfterClamp(
        A, {1U, InfoCache.getMaxWavesPerEU(*F)});
  }

  /// See AbstractAttribute::getName()
  const std::string getName() const override { return "AAAMDWavesPerEU"; }

  /// See AbstractAttribute::getIdAddr()
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAAMDWavesPerEU
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  /// Unique ID (due to the unique address)
  static const char ID;
};

const char AAAMDWavesPerEU::ID = 0;

AAAMDWavesPerEU &AAAMDWavesPerEU::createForPosition(const IRPosition &IRP,
                                                    Attributor &A) {
  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
    return *new (A.Allocator) AAAMDWavesPerEU(IRP, A);
  llvm_unreachable("AAAMDWavesPerEU is only valid for function position");
}

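/// Conservatively check whether any constraint of the inline asm \p IA may
/// refer to an AGPR: any constraint code that starts with 'a' (optionally
/// wrapped in braces, e.g. "{a0}") counts as a potential AGPR use.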
static bool inlineAsmUsesAGPRs(const InlineAsm *IA) {
  for (const auto &CI : IA->ParseConstraints()) {
    for (StringRef Code : CI.Codes) {
      Code.consume_front("{");
      if (Code.starts_with("a"))
        return true;
    }
  }

  return false;
}

struct AAAMDGPUNoAGPR
    : public IRAttribute<Attribute::NoUnwind,
                         StateWrapper<BooleanState, AbstractAttribute>,
                         AAAMDGPUNoAGPR> {
  AAAMDGPUNoAGPR(const IRPosition &IRP, Attributor &A) : IRAttribute(IRP) {}

  static AAAMDGPUNoAGPR &createForPosition(const IRPosition &IRP,
                                           Attributor &A) {
    if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
      return *new (A.Allocator) AAAMDGPUNoAGPR(IRP, A);
    llvm_unreachable("AAAMDGPUNoAGPR is only valid for function position");
  }

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();
    if (F->hasFnAttribute("amdgpu-no-agpr"))
      indicateOptimisticFixpoint();
  }

  const std::string getAsStr(Attributor *A) const override {
    return getAssumed() ? "amdgpu-no-agpr" : "amdgpu-maybe-agpr";
  }

  void trackStatistics() const override {}

  ChangeStatus updateImpl(Attributor &A) override {
    // TODO: Use AACallEdges, but then we need a way to inspect asm edges.

    auto CheckForNoAGPRs = [&](Instruction &I) {
      const auto &CB = cast<CallBase>(I);
      const Value *CalleeOp = CB.getCalledOperand();
      const Function *Callee = dyn_cast<Function>(CalleeOp);
      if (!Callee) {
        if (const InlineAsm *IA = dyn_cast<InlineAsm>(CalleeOp))
          return !inlineAsmUsesAGPRs(IA);
        return false;
      }

      // Some intrinsics may use AGPRs, but if we have a choice, we are not
      // required to use AGPRs.
      if (Callee->isIntrinsic())
        return true;

      // TODO: Handle callsite attributes
      const auto *CalleeInfo = A.getAAFor<AAAMDGPUNoAGPR>(
          *this, IRPosition::function(*Callee), DepClassTy::REQUIRED);
      return CalleeInfo && CalleeInfo->isValidState() &&
             CalleeInfo->getAssumed();
    };

    bool UsedAssumedInformation = false;
    if (!A.checkForAllCallLikeInstructions(CheckForNoAGPRs, *this,
                                           UsedAssumedInformation))
      return indicatePessimisticFixpoint();
    return ChangeStatus::UNCHANGED;
  }

  ChangeStatus manifest(Attributor &A) override {
    if (!getAssumed())
      return ChangeStatus::UNCHANGED;
    LLVMContext &Ctx = getAssociatedFunction()->getContext();
    return A.manifestAttrs(getIRPosition(),
                           {Attribute::get(Ctx, "amdgpu-no-agpr")});
  }

  const std::string getName() const override { return "AAAMDGPUNoAGPR"; }
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAAMDGPUNoAGPR
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  static const char ID;
};

const char AAAMDGPUNoAGPR::ID = 0;

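/// Mark up to -amdgpu-kernarg-preload-count leading kernel arguments "inreg"
/// so they are preloaded into user SGPRs, stopping early at byref/nest
/// arguments or at the subtarget's user SGPR limit.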
static void addPreloadKernArgHint(Function &F, TargetMachine &TM) {
  const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
  for (unsigned I = 0;
       I < F.arg_size() &&
       I < std::min(KernargPreloadCount.getValue(), ST.getMaxNumUserSGPRs());
       ++I) {
    Argument &Arg = *F.getArg(I);
    // Check for incompatible attributes.
    if (Arg.hasByRefAttr() || Arg.hasNestAttr())
      break;

    Arg.addAttr(Attribute::InReg);
  }
}

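/// Shared driver for the legacy and new pass manager entry points: collect
/// every non-intrinsic function, restrict the Attributor to the allowed AA
/// kinds, seed the AMDGPU attributes on each function, and run the fixpoint
/// iteration. Returns true if any IR was changed.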
static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
                    AMDGPUAttributorOptions Options) {
  SetVector<Function *> Functions;
  for (Function &F : M) {
    if (!F.isIntrinsic())
      Functions.insert(&F);
  }

  CallGraphUpdater CGUpdater;
  BumpPtrAllocator Allocator;
  AMDGPUInformationCache InfoCache(M, AG, Allocator, nullptr, TM);
  DenseSet<const char *> Allowed(
      {&AAAMDAttributes::ID, &AAUniformWorkGroupSize::ID,
       &AAPotentialValues::ID, &AAAMDFlatWorkGroupSize::ID,
       &AAAMDMaxNumWorkgroups::ID, &AAAMDWavesPerEU::ID, &AAAMDGPUNoAGPR::ID,
       &AACallEdges::ID, &AAPointerInfo::ID, &AAPotentialConstantValues::ID,
       &AAUnderlyingObjects::ID, &AAAddressSpace::ID, &AAIndirectCallInfo::ID,
       &AAInstanceInfo::ID});

  AttributorConfig AC(CGUpdater);
  AC.IsClosedWorldModule = Options.IsClosedWorld;
  AC.Allowed = &Allowed;
  AC.IsModulePass = true;
  AC.DefaultInitializeLiveInternals = false;
  AC.IndirectCalleeSpecializationCallback =
      [](Attributor &A, const AbstractAttribute &AA, CallBase &CB,
         Function &Callee, unsigned NumAssumedCallees) {
        return !AMDGPU::isEntryFunctionCC(Callee.getCallingConv()) &&
               (NumAssumedCallees <= IndirectCallSpecializationThreshold);
      };
  AC.IPOAmendableCB = [](const Function &F) {
    return F.getCallingConv() == CallingConv::AMDGPU_KERNEL;
  };

  Attributor A(Functions, InfoCache, AC);

  LLVM_DEBUG(dbgs() << "[AMDGPUAttributor] Module " << M.getName() << " is "
                    << (AC.IsClosedWorldModule ? "" : "not ")
                    << "assumed to be a closed world.\n");

  for (auto *F : Functions) {
    A.getOrCreateAAFor<AAAMDAttributes>(IRPosition::function(*F));
    A.getOrCreateAAFor<AAUniformWorkGroupSize>(IRPosition::function(*F));
    A.getOrCreateAAFor<AAAMDMaxNumWorkgroups>(IRPosition::function(*F));
    A.getOrCreateAAFor<AAAMDGPUNoAGPR>(IRPosition::function(*F));
    CallingConv::ID CC = F->getCallingConv();
    if (!AMDGPU::isEntryFunctionCC(CC)) {
      A.getOrCreateAAFor<AAAMDFlatWorkGroupSize>(IRPosition::function(*F));
      A.getOrCreateAAFor<AAAMDWavesPerEU>(IRPosition::function(*F));
    } else if (CC == CallingConv::AMDGPU_KERNEL) {
      addPreloadKernArgHint(*F, TM);
    }

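    // Seed AAAddressSpace on the pointer operands of memory instructions so
    // that flat accesses can later be rewritten to a concrete address space
    // where one can be inferred.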
    for (auto &I : instructions(F)) {
      if (auto *LI = dyn_cast<LoadInst>(&I)) {
        A.getOrCreateAAFor<AAAddressSpace>(
            IRPosition::value(*LI->getPointerOperand()));
      } else if (auto *SI = dyn_cast<StoreInst>(&I)) {
        A.getOrCreateAAFor<AAAddressSpace>(
            IRPosition::value(*SI->getPointerOperand()));
      } else if (auto *RMW = dyn_cast<AtomicRMWInst>(&I)) {
        A.getOrCreateAAFor<AAAddressSpace>(
            IRPosition::value(*RMW->getPointerOperand()));
      } else if (auto *CmpX = dyn_cast<AtomicCmpXchgInst>(&I)) {
        A.getOrCreateAAFor<AAAddressSpace>(
            IRPosition::value(*CmpX->getPointerOperand()));
      }
    }
  }

  ChangeStatus Change = A.run();
  return Change == ChangeStatus::CHANGED;
}

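/// Legacy pass manager wrapper around runImpl(). The TargetMachine is
/// obtained from TargetPassConfig in doInitialization().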
class AMDGPUAttributorLegacy : public ModulePass {
public:
  AMDGPUAttributorLegacy() : ModulePass(ID) {}

  /// doInitialization - Virtual method overridden by subclasses to do
  /// any necessary initialization before any pass is run.
  bool doInitialization(Module &) override {
    auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
    if (!TPC)
      report_fatal_error("TargetMachine is required");

    TM = &TPC->getTM<TargetMachine>();
    return false;
  }

  bool runOnModule(Module &M) override {
    AnalysisGetter AG(this);
    return runImpl(M, AG, *TM, /*Options=*/{});
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<CycleInfoWrapperPass>();
  }

  StringRef getPassName() const override { return "AMDGPU Attributor"; }
  TargetMachine *TM;
  static char ID;
};
} // namespace

PreservedAnalyses llvm::AMDGPUAttributorPass::run(Module &M,
                                                  ModuleAnalysisManager &AM) {

  FunctionAnalysisManager &FAM =
      AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
  AnalysisGetter AG(FAM);

  // TODO: Probably preserves CFG
  return runImpl(M, AG, TM, Options) ? PreservedAnalyses::none()
                                     : PreservedAnalyses::all();
}

char AMDGPUAttributorLegacy::ID = 0;

Pass *llvm::createAMDGPUAttributorLegacyPass() {
  return new AMDGPUAttributorLegacy();
}
INITIALIZE_PASS_BEGIN(AMDGPUAttributorLegacy, DEBUG_TYPE, "AMDGPU Attributor",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(CycleInfoWrapperPass);
INITIALIZE_PASS_END(AMDGPUAttributorLegacy, DEBUG_TYPE, "AMDGPU Attributor",
                    false, false)