//===- AMDGPUAttributor.cpp -----------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass uses the Attributor framework to deduce AMDGPU attributes.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/Analysis/CycleAnalysis.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/IPO/Attributor.h"

#define DEBUG_TYPE "amdgpu-attributor"

namespace llvm {
void initializeCycleInfoWrapperPassPass(PassRegistry &);
} // namespace llvm

using namespace llvm;

#define AMDGPU_ATTRIBUTE(Name, Str) Name##_POS,

enum ImplicitArgumentPositions {
  #include "AMDGPUAttributes.def"
  LAST_ARG_POS
};

#define AMDGPU_ATTRIBUTE(Name, Str) Name = 1 << Name##_POS,

enum ImplicitArgumentMask {
  NOT_IMPLICIT_INPUT = 0,
  #include "AMDGPUAttributes.def"
  ALL_ARGUMENT_MASK = (1 << LAST_ARG_POS) - 1
};

#define AMDGPU_ATTRIBUTE(Name, Str) {Name, Str},
static constexpr std::pair<ImplicitArgumentMask,
                           StringLiteral> ImplicitAttrs[] = {
  #include "AMDGPUAttributes.def"
};

// We do not need to note the x workitem or workgroup id because they are
// always initialized.
//
// TODO: We should not add the attributes if the known compile time workgroup
// size is 1 for y/z.
static ImplicitArgumentMask
intrinsicToAttrMask(Intrinsic::ID ID, bool &NonKernelOnly, bool &NeedsImplicit,
                    bool HasApertureRegs, bool SupportsGetDoorBellID) {
  unsigned CodeObjectVersion = AMDGPU::getAmdhsaCodeObjectVersion();
  switch (ID) {
  case Intrinsic::amdgcn_workitem_id_x:
    NonKernelOnly = true;
    return WORKITEM_ID_X;
  case Intrinsic::amdgcn_workgroup_id_x:
    NonKernelOnly = true;
    return WORKGROUP_ID_X;
  case Intrinsic::amdgcn_workitem_id_y:
  case Intrinsic::r600_read_tidig_y:
    return WORKITEM_ID_Y;
  case Intrinsic::amdgcn_workitem_id_z:
  case Intrinsic::r600_read_tidig_z:
    return WORKITEM_ID_Z;
  case Intrinsic::amdgcn_workgroup_id_y:
  case Intrinsic::r600_read_tgid_y:
    return WORKGROUP_ID_Y;
  case Intrinsic::amdgcn_workgroup_id_z:
  case Intrinsic::r600_read_tgid_z:
    return WORKGROUP_ID_Z;
  case Intrinsic::amdgcn_lds_kernel_id:
    return LDS_KERNEL_ID;
  case Intrinsic::amdgcn_dispatch_ptr:
    return DISPATCH_PTR;
  case Intrinsic::amdgcn_dispatch_id:
    return DISPATCH_ID;
  case Intrinsic::amdgcn_implicitarg_ptr:
    return IMPLICIT_ARG_PTR;
  // Need queue_ptr anyway. But under V5, we also need implicitarg_ptr to
  // access queue_ptr.
  case Intrinsic::amdgcn_queue_ptr:
    NeedsImplicit = (CodeObjectVersion == 5);
    return QUEUE_PTR;
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private:
    if (HasApertureRegs)
      return NOT_IMPLICIT_INPUT;
    // Under V5, we need implicitarg_ptr + offsets to access private_base or
    // shared_base. Pre-V5, however, we need to access them through queue_ptr +
    // offsets.
    return CodeObjectVersion == 5 ? IMPLICIT_ARG_PTR : QUEUE_PTR;
  case Intrinsic::trap:
    if (SupportsGetDoorBellID) // GetDoorbellID support implemented since V4.
      return CodeObjectVersion >= 4 ? NOT_IMPLICIT_INPUT : QUEUE_PTR;
    NeedsImplicit = (CodeObjectVersion == 5); // Need implicitarg_ptr under V5.
    return QUEUE_PTR;
  default:
    return NOT_IMPLICIT_INPUT;
  }
}

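/// Returns true if an addrspacecast from address space \p SrcAS (local or
/// private) requires the queue pointer.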
static bool castRequiresQueuePtr(unsigned SrcAS) {
  return SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::PRIVATE_ADDRESS;
}

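/// Returns true if \p C is a global value in the local (LDS) or region (GDS)
/// address space.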
static bool isDSAddress(const Constant *C) {
  const GlobalValue *GV = dyn_cast<GlobalValue>(C);
  if (!GV)
    return false;
  unsigned AS = GV->getAddressSpace();
  return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS;
}

/// Returns true if the function requires the implicit argument be passed
/// regardless of the function contents.
static bool funcRequiresHostcallPtr(const Function &F) {
  // Sanitizers require the hostcall buffer passed in the implicit arguments.
  return F.hasFnAttribute(Attribute::SanitizeAddress) ||
         F.hasFnAttribute(Attribute::SanitizeThread) ||
         F.hasFnAttribute(Attribute::SanitizeMemory) ||
         F.hasFnAttribute(Attribute::SanitizeHWAddress) ||
         F.hasFnAttribute(Attribute::SanitizeMemTag);
}

namespace {
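/// Cache of subtarget queries and per-constant analysis results shared by the
/// AMDGPU abstract attributes below.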
class AMDGPUInformationCache : public InformationCache {
public:
  AMDGPUInformationCache(const Module &M, AnalysisGetter &AG,
                         BumpPtrAllocator &Allocator,
                         SetVector<Function *> *CGSCC, TargetMachine &TM)
      : InformationCache(M, AG, Allocator, CGSCC), TM(TM) {}

  TargetMachine &TM;

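  /// Flags recording how a constant is relevant to queue-pointer deduction:
  /// it is (or contains) a DS global, or it contains an addrspacecast from
  /// the local or private address space.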
  enum ConstantStatus { DS_GLOBAL = 1 << 0, ADDR_SPACE_CAST = 1 << 1 };

  /// Check if the subtarget has aperture regs.
  bool hasApertureRegs(Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.hasApertureRegs();
  }

  /// Check if the subtarget supports GetDoorbellID.
  bool supportsGetDoorbellID(Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.supportsGetDoorbellID();
  }

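  /// Get the flat workgroup size range for \p F.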
  std::pair<unsigned, unsigned> getFlatWorkGroupSizes(const Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.getFlatWorkGroupSizes(F);
  }

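  /// Get the widest flat workgroup size range the subtarget supports, as a
  /// (minimum, maximum) pair.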
  std::pair<unsigned, unsigned>
  getMaximumFlatWorkGroupRange(const Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return {ST.getMinFlatWorkGroupSize(), ST.getMaxFlatWorkGroupSize()};
  }

private:
  /// Check if the ConstantExpr \p CE requires the queue pointer.
  static bool visitConstExpr(const ConstantExpr *CE) {
    if (CE->getOpcode() == Instruction::AddrSpaceCast) {
      unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace();
      return castRequiresQueuePtr(SrcAS);
    }
    return false;
  }

  /// Get the constant access bitmap for \p C.
  uint8_t getConstantAccess(const Constant *C) {
    auto It = ConstantStatus.find(C);
    if (It != ConstantStatus.end())
      return It->second;

    uint8_t Result = 0;
    if (isDSAddress(C))
      Result = DS_GLOBAL;

    if (const auto *CE = dyn_cast<ConstantExpr>(C))
      if (visitConstExpr(CE))
        Result |= ADDR_SPACE_CAST;

    for (const Use &U : C->operands()) {
      const auto *OpC = dyn_cast<Constant>(U);
      if (!OpC)
        continue;

      Result |= getConstantAccess(OpC);
    }
    return Result;
  }

public:
  /// Returns true if \p Fn needs the queue pointer because of \p C.
  bool needsQueuePtr(const Constant *C, Function &Fn) {
    bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(Fn.getCallingConv());
    bool HasAperture = hasApertureRegs(Fn);

    // No need to explore the constants.
    if (!IsNonEntryFunc && HasAperture)
      return false;

    uint8_t Access = getConstantAccess(C);

    // We need to trap on DS globals in non-entry functions.
    if (IsNonEntryFunc && (Access & DS_GLOBAL))
      return true;

    return !HasAperture && (Access & ADDR_SPACE_CAST);
  }

private:
  /// Used to determine if the Constant needs the queue pointer.
  DenseMap<const Constant *, uint8_t> ConstantStatus;
};

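/// Deduces which implicit kernel arguments a function needs (implicitarg_ptr,
/// queue_ptr, hostcall buffer, workitem/workgroup IDs, ...). Each bit of the
/// state corresponds to an entry in AMDGPUAttributes.def; bits that remain
/// known at the end are manifested as the matching function attributes.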
struct AAAMDAttributes
    : public StateWrapper<BitIntegerState<uint32_t, ALL_ARGUMENT_MASK, 0>,
                          AbstractAttribute> {
  using Base = StateWrapper<BitIntegerState<uint32_t, ALL_ARGUMENT_MASK, 0>,
                            AbstractAttribute>;

  AAAMDAttributes(const IRPosition &IRP, Attributor &A) : Base(IRP) {}

  /// Create an abstract attribute view for the position \p IRP.
  static AAAMDAttributes &createForPosition(const IRPosition &IRP,
                                            Attributor &A);

  /// See AbstractAttribute::getName().
  const std::string getName() const override { return "AAAMDAttributes"; }

  /// See AbstractAttribute::getIdAddr().
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAAMDAttributes.
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  /// Unique ID (due to the unique address)
  static const char ID;
};
const char AAAMDAttributes::ID = 0;

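/// Propagates the "uniform-work-group-size" attribute from kernels to the
/// functions they (transitively) call.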
struct AAUniformWorkGroupSize
    : public StateWrapper<BooleanState, AbstractAttribute> {
  using Base = StateWrapper<BooleanState, AbstractAttribute>;
  AAUniformWorkGroupSize(const IRPosition &IRP, Attributor &A) : Base(IRP) {}

  /// Create an abstract attribute view for the position \p IRP.
  static AAUniformWorkGroupSize &createForPosition(const IRPosition &IRP,
                                                   Attributor &A);

  /// See AbstractAttribute::getName().
  const std::string getName() const override {
    return "AAUniformWorkGroupSize";
  }

  /// See AbstractAttribute::getIdAddr().
  const char *getIdAddr() const override { return &ID; }
  /// This function should return true if the type of the \p AA is
  /// AAUniformWorkGroupSize.
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  /// Unique ID (due to the unique address)
  static const char ID;
};
const char AAUniformWorkGroupSize::ID = 0;

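/// Function-position implementation of AAUniformWorkGroupSize: the state is
/// clamped against the state of every caller.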
struct AAUniformWorkGroupSizeFunction : public AAUniformWorkGroupSize {
  AAUniformWorkGroupSizeFunction(const IRPosition &IRP, Attributor &A)
      : AAUniformWorkGroupSize(IRP, A) {}

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();
    CallingConv::ID CC = F->getCallingConv();

    if (CC != CallingConv::AMDGPU_KERNEL)
      return;

    bool InitialValue = false;
    if (F->hasFnAttribute("uniform-work-group-size"))
      InitialValue = F->getFnAttribute("uniform-work-group-size")
                         .getValueAsString()
                         .equals("true");

    if (InitialValue)
      indicateOptimisticFixpoint();
    else
      indicatePessimisticFixpoint();
  }

  ChangeStatus updateImpl(Attributor &A) override {
    ChangeStatus Change = ChangeStatus::UNCHANGED;

    auto CheckCallSite = [&](AbstractCallSite CS) {
      Function *Caller = CS.getInstruction()->getFunction();
      LLVM_DEBUG(dbgs() << "[AAUniformWorkGroupSize] Call " << Caller->getName()
                        << "->" << getAssociatedFunction()->getName() << "\n");

      const auto &CallerInfo = A.getAAFor<AAUniformWorkGroupSize>(
          *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);

      Change = Change | clampStateAndIndicateChange(this->getState(),
                                                    CallerInfo.getState());

      return true;
    };

    bool AllCallSitesKnown = true;
    if (!A.checkForAllCallSites(CheckCallSite, *this, true, AllCallSitesKnown))
      return indicatePessimisticFixpoint();

    return Change;
  }

  ChangeStatus manifest(Attributor &A) override {
    SmallVector<Attribute, 8> AttrList;
    LLVMContext &Ctx = getAssociatedFunction()->getContext();

    AttrList.push_back(Attribute::get(Ctx, "uniform-work-group-size",
                                      getAssumed() ? "true" : "false"));
    return IRAttributeManifest::manifestAttrs(A, getIRPosition(), AttrList,
                                              /* ForceReplace */ true);
  }

  bool isValidState() const override {
    // This state is always valid, even when the state is false.
    return true;
  }

  const std::string getAsStr() const override {
    return "AMDWorkGroupSize[" + std::to_string(getAssumed()) + "]";
  }

  /// See AbstractAttribute::trackStatistics()
  void trackStatistics() const override {}
};

AAUniformWorkGroupSize &
AAUniformWorkGroupSize::createForPosition(const IRPosition &IRP,
                                          Attributor &A) {
  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
    return *new (A.Allocator) AAUniformWorkGroupSizeFunction(IRP, A);
  llvm_unreachable(
      "AAUniformWorkGroupSize is only valid for function position");
}

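/// Function-position implementation of AAAMDAttributes: inspects call edges,
/// intrinsic uses, and implicit kernel-argument loads to decide which implicit
/// arguments the function requires.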
struct AAAMDAttributesFunction : public AAAMDAttributes {
  AAAMDAttributesFunction(const IRPosition &IRP, Attributor &A)
      : AAAMDAttributes(IRP, A) {}

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();

    // If the function requires the implicit arg pointer due to sanitizers,
    // assume it's needed even if explicitly marked as not requiring it.
    const bool NeedsHostcall = funcRequiresHostcallPtr(*F);
    if (NeedsHostcall) {
      removeAssumedBits(IMPLICIT_ARG_PTR);
      removeAssumedBits(HOSTCALL_PTR);
    }

    for (auto Attr : ImplicitAttrs) {
      if (NeedsHostcall &&
          (Attr.first == IMPLICIT_ARG_PTR || Attr.first == HOSTCALL_PTR))
        continue;

      if (F->hasFnAttribute(Attr.second))
        addKnownBits(Attr.first);
    }

    if (F->isDeclaration())
      return;

    // Ignore functions with graphics calling conventions; these are currently
    // not allowed to have kernel arguments.
    if (AMDGPU::isGraphics(F->getCallingConv())) {
      indicatePessimisticFixpoint();
      return;
    }
  }

  ChangeStatus updateImpl(Attributor &A) override {
    Function *F = getAssociatedFunction();
    // The current assumed state used to determine a change.
    auto OrigAssumed = getAssumed();

    // Check for Intrinsics and propagate attributes.
    const AACallEdges &AAEdges = A.getAAFor<AACallEdges>(
        *this, this->getIRPosition(), DepClassTy::REQUIRED);
    if (AAEdges.hasNonAsmUnknownCallee())
      return indicatePessimisticFixpoint();

    bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(F->getCallingConv());

    bool NeedsImplicit = false;
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
    bool HasApertureRegs = InfoCache.hasApertureRegs(*F);
    bool SupportsGetDoorbellID = InfoCache.supportsGetDoorbellID(*F);

    for (Function *Callee : AAEdges.getOptimisticEdges()) {
      Intrinsic::ID IID = Callee->getIntrinsicID();
      if (IID == Intrinsic::not_intrinsic) {
        const AAAMDAttributes &AAAMD = A.getAAFor<AAAMDAttributes>(
            *this, IRPosition::function(*Callee), DepClassTy::REQUIRED);
        *this &= AAAMD;
        continue;
      }

      bool NonKernelOnly = false;
      ImplicitArgumentMask AttrMask =
          intrinsicToAttrMask(IID, NonKernelOnly, NeedsImplicit,
                              HasApertureRegs, SupportsGetDoorbellID);
      if (AttrMask != NOT_IMPLICIT_INPUT) {
        if (IsNonEntryFunc || !NonKernelOnly)
          removeAssumedBits(AttrMask);
      }
    }

    // Need implicitarg_ptr to access queue_ptr, private_base, and shared_base.
    if (NeedsImplicit)
      removeAssumedBits(IMPLICIT_ARG_PTR);

    if (isAssumed(QUEUE_PTR) && checkForQueuePtr(A)) {
      // Under V5, we need implicitarg_ptr + offsets to access private_base or
      // shared_base. We do not actually need queue_ptr.
      if (AMDGPU::getAmdhsaCodeObjectVersion() == 5)
        removeAssumedBits(IMPLICIT_ARG_PTR);
      else
        removeAssumedBits(QUEUE_PTR);
    }

    if (funcRetrievesMultigridSyncArg(A)) {
      assert(!isAssumed(IMPLICIT_ARG_PTR) &&
             "multigrid_sync_arg needs implicitarg_ptr");
      removeAssumedBits(MULTIGRID_SYNC_ARG);
    }

    if (funcRetrievesHostcallPtr(A)) {
      assert(!isAssumed(IMPLICIT_ARG_PTR) && "hostcall needs implicitarg_ptr");
      removeAssumedBits(HOSTCALL_PTR);
    }

    if (funcRetrievesHeapPtr(A)) {
      assert(!isAssumed(IMPLICIT_ARG_PTR) && "heap_ptr needs implicitarg_ptr");
      removeAssumedBits(HEAP_PTR);
    }

    if (isAssumed(QUEUE_PTR) && funcRetrievesQueuePtr(A)) {
      assert(!isAssumed(IMPLICIT_ARG_PTR) && "queue_ptr needs implicitarg_ptr");
      removeAssumedBits(QUEUE_PTR);
    }

    if (isAssumed(LDS_KERNEL_ID) && funcRetrievesLDSKernelId(A)) {
      removeAssumedBits(LDS_KERNEL_ID);
    }

    if (isAssumed(DEFAULT_QUEUE) && funcRetrievesDefaultQueue(A))
      removeAssumedBits(DEFAULT_QUEUE);

    if (isAssumed(COMPLETION_ACTION) && funcRetrievesCompletionAction(A))
      removeAssumedBits(COMPLETION_ACTION);

    return getAssumed() != OrigAssumed ? ChangeStatus::CHANGED
                                       : ChangeStatus::UNCHANGED;
  }

  ChangeStatus manifest(Attributor &A) override {
    SmallVector<Attribute, 8> AttrList;
    LLVMContext &Ctx = getAssociatedFunction()->getContext();

    for (auto Attr : ImplicitAttrs) {
      if (isKnown(Attr.first))
        AttrList.push_back(Attribute::get(Ctx, Attr.second));
    }

    return IRAttributeManifest::manifestAttrs(A, getIRPosition(), AttrList,
                                              /* ForceReplace */ true);
  }

  const std::string getAsStr() const override {
    std::string Str;
    raw_string_ostream OS(Str);
    OS << "AMDInfo[";
    for (auto Attr : ImplicitAttrs)
      OS << ' ' << Attr.second;
    OS << " ]";
    return OS.str();
  }

  /// See AbstractAttribute::trackStatistics()
  void trackStatistics() const override {}

private:
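  /// Returns true if this function needs the queue pointer, either because it
  /// contains an addrspacecast from the local or private address space or
  /// because it references a constant that requires it.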
  bool checkForQueuePtr(Attributor &A) {
    Function *F = getAssociatedFunction();
    bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(F->getCallingConv());

    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());

    bool NeedsQueuePtr = false;

    auto CheckAddrSpaceCasts = [&](Instruction &I) {
      unsigned SrcAS = static_cast<AddrSpaceCastInst &>(I).getSrcAddressSpace();
      if (castRequiresQueuePtr(SrcAS)) {
        NeedsQueuePtr = true;
        return false;
      }
      return true;
    };

    bool HasApertureRegs = InfoCache.hasApertureRegs(*F);

    // `checkForAllInstructions` is much cheaper than walking all instructions
    // ourselves, so try it first.

    // The queue pointer is not needed if aperture regs are present.
    if (!HasApertureRegs) {
      bool UsedAssumedInformation = false;
      A.checkForAllInstructions(CheckAddrSpaceCasts, *this,
                                {Instruction::AddrSpaceCast},
                                UsedAssumedInformation);
    }

    // If we found that we need the queue pointer, nothing else to do.
    if (NeedsQueuePtr)
      return true;

    if (!IsNonEntryFunc && HasApertureRegs)
      return false;

    for (BasicBlock &BB : *F) {
      for (Instruction &I : BB) {
        for (const Use &U : I.operands()) {
          if (const auto *C = dyn_cast<Constant>(U)) {
            if (InfoCache.needsQueuePtr(C, *F))
              return true;
          }
        }
      }
    }

    return false;
  }

  bool funcRetrievesMultigridSyncArg(Attributor &A) {
    auto Pos = llvm::AMDGPU::getMultigridSyncArgImplicitArgPosition();
    AA::RangeTy Range(Pos, 8);
    return funcRetrievesImplicitKernelArg(A, Range);
  }

  bool funcRetrievesHostcallPtr(Attributor &A) {
    auto Pos = llvm::AMDGPU::getHostcallImplicitArgPosition();
    AA::RangeTy Range(Pos, 8);
    return funcRetrievesImplicitKernelArg(A, Range);
  }

  bool funcRetrievesDefaultQueue(Attributor &A) {
    auto Pos = llvm::AMDGPU::getDefaultQueueImplicitArgPosition();
    AA::RangeTy Range(Pos, 8);
    return funcRetrievesImplicitKernelArg(A, Range);
  }

  bool funcRetrievesCompletionAction(Attributor &A) {
    auto Pos = llvm::AMDGPU::getCompletionActionImplicitArgPosition();
    AA::RangeTy Range(Pos, 8);
    return funcRetrievesImplicitKernelArg(A, Range);
  }

  bool funcRetrievesHeapPtr(Attributor &A) {
    if (AMDGPU::getAmdhsaCodeObjectVersion() != 5)
      return false;
    AA::RangeTy Range(AMDGPU::ImplicitArg::HEAP_PTR_OFFSET, 8);
    return funcRetrievesImplicitKernelArg(A, Range);
  }

  bool funcRetrievesQueuePtr(Attributor &A) {
    if (AMDGPU::getAmdhsaCodeObjectVersion() != 5)
      return false;
    AA::RangeTy Range(AMDGPU::ImplicitArg::QUEUE_PTR_OFFSET, 8);
    return funcRetrievesImplicitKernelArg(A, Range);
  }

  bool funcRetrievesImplicitKernelArg(Attributor &A, AA::RangeTy Range) {
    // Check if this is a call to the implicitarg_ptr intrinsic and whether it
    // is used to retrieve bytes in the given implicit argument range (e.g. the
    // hostcall pointer). The argument is unused only if every use of the
    // implicitarg_ptr is a load that clearly does not read any byte of that
    // range. We check this by tracing all the uses of the initial call to the
    // implicitarg_ptr intrinsic.
    auto DoesNotLeadToKernelArgLoc = [&](Instruction &I) {
      auto &Call = cast<CallBase>(I);
      if (Call.getIntrinsicID() != Intrinsic::amdgcn_implicitarg_ptr)
        return true;

      const auto &PointerInfoAA = A.getAAFor<AAPointerInfo>(
          *this, IRPosition::callsite_returned(Call), DepClassTy::REQUIRED);

      return PointerInfoAA.forallInterferingAccesses(
          Range, [](const AAPointerInfo::Access &Acc, bool IsExact) {
            return Acc.getRemoteInst()->isDroppable();
          });
    };

    bool UsedAssumedInformation = false;
    return !A.checkForAllCallLikeInstructions(DoesNotLeadToKernelArgLoc, *this,
                                              UsedAssumedInformation);
  }

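  /// Returns true if the function may call the amdgcn_lds_kernel_id intrinsic.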
  bool funcRetrievesLDSKernelId(Attributor &A) {
    auto DoesNotRetrieve = [&](Instruction &I) {
      auto &Call = cast<CallBase>(I);
      return Call.getIntrinsicID() != Intrinsic::amdgcn_lds_kernel_id;
    };
    bool UsedAssumedInformation = false;
    return !A.checkForAllCallLikeInstructions(DoesNotRetrieve, *this,
                                              UsedAssumedInformation);
  }
};

AAAMDAttributes &AAAMDAttributes::createForPosition(const IRPosition &IRP,
                                                    Attributor &A) {
  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
    return *new (A.Allocator) AAAMDAttributesFunction(IRP, A);
  llvm_unreachable("AAAMDAttributes is only valid for function position");
}

/// Propagate the amdgpu-flat-work-group-size attribute.
struct AAAMDFlatWorkGroupSize
    : public StateWrapper<IntegerRangeState, AbstractAttribute, uint32_t> {
  using Base = StateWrapper<IntegerRangeState, AbstractAttribute, uint32_t>;
  AAAMDFlatWorkGroupSize(const IRPosition &IRP, Attributor &A)
      : Base(IRP, 32) {}

  /// See AbstractAttribute::getState(...).
  IntegerRangeState &getState() override { return *this; }
  const IntegerRangeState &getState() const override { return *this; }

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
    unsigned MinGroupSize, MaxGroupSize;
    std::tie(MinGroupSize, MaxGroupSize) = InfoCache.getFlatWorkGroupSizes(*F);
    intersectKnown(
        ConstantRange(APInt(32, MinGroupSize), APInt(32, MaxGroupSize + 1)));

    if (AMDGPU::isEntryFunctionCC(F->getCallingConv()))
      indicatePessimisticFixpoint();
  }

  ChangeStatus updateImpl(Attributor &A) override {
    ChangeStatus Change = ChangeStatus::UNCHANGED;

    auto CheckCallSite = [&](AbstractCallSite CS) {
      Function *Caller = CS.getInstruction()->getFunction();
      LLVM_DEBUG(dbgs() << "[AAAMDFlatWorkGroupSize] Call " << Caller->getName()
                        << "->" << getAssociatedFunction()->getName() << '\n');

      const auto &CallerInfo = A.getAAFor<AAAMDFlatWorkGroupSize>(
          *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);

      Change |=
          clampStateAndIndicateChange(this->getState(), CallerInfo.getState());

      return true;
    };

    bool AllCallSitesKnown = true;
    if (!A.checkForAllCallSites(CheckCallSite, *this, true, AllCallSitesKnown))
      return indicatePessimisticFixpoint();

    return Change;
  }

  ChangeStatus manifest(Attributor &A) override {
    SmallVector<Attribute, 8> AttrList;
    Function *F = getAssociatedFunction();
    LLVMContext &Ctx = F->getContext();

    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
    unsigned Min, Max;
    std::tie(Min, Max) = InfoCache.getMaximumFlatWorkGroupRange(*F);

    // Don't add the attribute if it's the implied default.
    if (getAssumed().getLower() == Min && getAssumed().getUpper() - 1 == Max)
      return ChangeStatus::UNCHANGED;

    SmallString<10> Buffer;
    raw_svector_ostream OS(Buffer);
    OS << getAssumed().getLower() << ',' << getAssumed().getUpper() - 1;

    AttrList.push_back(
        Attribute::get(Ctx, "amdgpu-flat-work-group-size", OS.str()));
    return IRAttributeManifest::manifestAttrs(A, getIRPosition(), AttrList,
                                              /* ForceReplace */ true);
  }

  const std::string getAsStr() const override {
    std::string Str;
    raw_string_ostream OS(Str);
    OS << "AMDFlatWorkGroupSize[";
    OS << getAssumed().getLower() << ',' << getAssumed().getUpper() - 1;
    OS << ']';
    return OS.str();
  }

  /// See AbstractAttribute::trackStatistics()
  void trackStatistics() const override {}

  /// Create an abstract attribute view for the position \p IRP.
  static AAAMDFlatWorkGroupSize &createForPosition(const IRPosition &IRP,
                                                   Attributor &A);

  /// See AbstractAttribute::getName()
  const std::string getName() const override {
    return "AAAMDFlatWorkGroupSize";
  }

  /// See AbstractAttribute::getIdAddr()
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAAMDFlatWorkGroupSize.
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  /// Unique ID (due to the unique address)
  static const char ID;
};

const char AAAMDFlatWorkGroupSize::ID = 0;

AAAMDFlatWorkGroupSize &
AAAMDFlatWorkGroupSize::createForPosition(const IRPosition &IRP,
                                          Attributor &A) {
  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
    return *new (A.Allocator) AAAMDFlatWorkGroupSize(IRP, A);
  llvm_unreachable(
      "AAAMDFlatWorkGroupSize is only valid for function position");
}

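/// Legacy module pass that runs the Attributor fixpoint iteration with the
/// AMDGPU-specific abstract attributes defined above.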
class AMDGPUAttributor : public ModulePass {
public:
  AMDGPUAttributor() : ModulePass(ID) {}

  /// doInitialization - Virtual method overridden by subclasses to do
  /// any necessary initialization before any pass is run.
  bool doInitialization(Module &) override {
    auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
    if (!TPC)
      report_fatal_error("TargetMachine is required");

    TM = &TPC->getTM<TargetMachine>();
    return false;
  }

  bool runOnModule(Module &M) override {
    SetVector<Function *> Functions;
    AnalysisGetter AG(this);
    for (Function &F : M) {
      if (!F.isIntrinsic())
        Functions.insert(&F);
    }

    CallGraphUpdater CGUpdater;
    BumpPtrAllocator Allocator;
    AMDGPUInformationCache InfoCache(M, AG, Allocator, nullptr, *TM);
    DenseSet<const char *> Allowed(
        {&AAAMDAttributes::ID, &AAUniformWorkGroupSize::ID,
         &AAPotentialValues::ID, &AAAMDFlatWorkGroupSize::ID, &AACallEdges::ID,
         &AAPointerInfo::ID, &AAPotentialConstantValues::ID});

    AttributorConfig AC(CGUpdater);
    AC.Allowed = &Allowed;
    AC.IsModulePass = true;
    AC.DefaultInitializeLiveInternals = false;

    Attributor A(Functions, InfoCache, AC);

    for (Function &F : M) {
      if (!F.isIntrinsic()) {
        A.getOrCreateAAFor<AAAMDAttributes>(IRPosition::function(F));
        A.getOrCreateAAFor<AAUniformWorkGroupSize>(IRPosition::function(F));
        if (!AMDGPU::isEntryFunctionCC(F.getCallingConv())) {
          A.getOrCreateAAFor<AAAMDFlatWorkGroupSize>(IRPosition::function(F));
        }
      }
    }

    ChangeStatus Change = A.run();
    return Change == ChangeStatus::CHANGED;
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<CycleInfoWrapperPass>();
  }

  StringRef getPassName() const override { return "AMDGPU Attributor"; }
  TargetMachine *TM;
  static char ID;
};
} // namespace

char AMDGPUAttributor::ID = 0;

Pass *llvm::createAMDGPUAttributorPass() { return new AMDGPUAttributor(); }
INITIALIZE_PASS_BEGIN(AMDGPUAttributor, DEBUG_TYPE, "AMDGPU Attributor", false,
                      false)
INITIALIZE_PASS_DEPENDENCY(CycleInfoWrapperPass);
INITIALIZE_PASS_END(AMDGPUAttributor, DEBUG_TYPE, "AMDGPU Attributor", false,
                    false)