//===-- AMDGPULowerKernelAttributes.cpp ------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass attempts to make use of reqd_work_group_size metadata
/// to eliminate loads from the dispatch packet and to constant fold OpenCL
/// get_local_size-like functions.
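///
/// For example (a sketch): with
///   __attribute__((reqd_work_group_size(64, 1, 1)))
/// the OpenCL get_local_size() implementation loads the workgroup size from
/// the dispatch packet (or, on code object v5+, from the implicit kernel
/// arguments), and this pass folds those loads to the constants 64, 1 and 1.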
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Pass.h"
#include <limits>

#define DEBUG_TYPE "amdgpu-lower-kernel-attributes"

using namespace llvm;

namespace {

// Field offsets in hsa_kernel_dispatch_packet_t.
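// The workgroup sizes are 16-bit fields of the packet and the grid sizes are
// 32-bit fields, which is why the matching below expects 2-byte and 4-byte
// loads respectively.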
enum DispatchPacketOffsets {
  WORKGROUP_SIZE_X = 4,
  WORKGROUP_SIZE_Y = 6,
  WORKGROUP_SIZE_Z = 8,

  GRID_SIZE_X = 12,
  GRID_SIZE_Y = 16,
  GRID_SIZE_Z = 20
};

// Field offsets from the implicit kernel argument pointer.
enum ImplicitArgOffsets {
  HIDDEN_BLOCK_COUNT_X = 0,
  HIDDEN_BLOCK_COUNT_Y = 4,
  HIDDEN_BLOCK_COUNT_Z = 8,

  HIDDEN_GROUP_SIZE_X = 12,
  HIDDEN_GROUP_SIZE_Y = 14,
  HIDDEN_GROUP_SIZE_Z = 16,

  HIDDEN_REMAINDER_X = 18,
  HIDDEN_REMAINDER_Y = 20,
  HIDDEN_REMAINDER_Z = 22,
};

class AMDGPULowerKernelAttributes : public ModulePass {
public:
  static char ID;

  AMDGPULowerKernelAttributes() : ModulePass(ID) {}

  bool runOnModule(Module &M) override;

  StringRef getPassName() const override {
    return "AMDGPU Kernel Attributes";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesAll();
  }
};

Function *getBasePtrIntrinsic(Module &M, bool IsV5OrAbove) {
  auto IntrinsicId = IsV5OrAbove ? Intrinsic::amdgcn_implicitarg_ptr
                                 : Intrinsic::amdgcn_dispatch_ptr;
  return Intrinsic::getDeclarationIfExists(&M, IntrinsicId);
}

} // end anonymous namespace

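// Attach !range metadata to a block-count load when the kernel carries an
// "amdgpu-max-num-workgroups" bound for that dimension. For example (a
// sketch), "amdgpu-max-num-workgroups"="16,1,1" gives the X-dimension
// block-count load !range !{i32 1, i32 17}; the upper bound in range metadata
// is exclusive.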
static void annotateGridSizeLoadWithRangeMD(LoadInst *Load,
                                            uint32_t MaxNumGroups) {
  if (MaxNumGroups == 0 || MaxNumGroups == std::numeric_limits<uint32_t>::max())
    return;

  if (!Load->getType()->isIntegerTy(32))
    return;

  // TODO: If there is existing range metadata, preserve it if it is stricter.
  MDBuilder MDB(Load->getContext());
  MDNode *Range = MDB.createRange(APInt(32, 1), APInt(32, MaxNumGroups + 1));
  Load->setMetadata(LLVMContext::MD_range, Range);
}

static bool processUse(CallInst *CI, bool IsV5OrAbove) {
  Function *F = CI->getParent()->getParent();

  auto *MD = F->getMetadata("reqd_work_group_size");
  const bool HasReqdWorkGroupSize = MD && MD->getNumOperands() == 3;

  const bool HasUniformWorkGroupSize =
    F->getFnAttribute("uniform-work-group-size").getValueAsBool();

  SmallVector<unsigned> MaxNumWorkgroups =
      AMDGPU::getIntegerVecAttribute(*F, "amdgpu-max-num-workgroups", 3);

  if (!HasReqdWorkGroupSize && !HasUniformWorkGroupSize &&
      none_of(MaxNumWorkgroups, [](unsigned X) { return X != 0; }))
    return false;

  Value *BlockCounts[3] = {nullptr, nullptr, nullptr};
  Value *GroupSizes[3]  = {nullptr, nullptr, nullptr};
  Value *Remainders[3]  = {nullptr, nullptr, nullptr};
  Value *GridSizes[3]   = {nullptr, nullptr, nullptr};

  const DataLayout &DL = F->getDataLayout();

  // We expect to see several GEP users, cast to the appropriate type and
  // loaded.
  for (User *U : CI->users()) {
    if (!U->hasOneUse())
      continue;

    int64_t Offset = 0;
    auto *Load = dyn_cast<LoadInst>(U); // Load from ImplicitArgPtr/DispatchPtr?
    auto *BCI = dyn_cast<BitCastInst>(U);
    if (!Load && !BCI) {
      if (GetPointerBaseWithConstantOffset(U, Offset, DL) != CI)
        continue;
      Load = dyn_cast<LoadInst>(*U->user_begin()); // Load from GEP?
      BCI = dyn_cast<BitCastInst>(*U->user_begin());
    }

    if (BCI) {
      if (!BCI->hasOneUse())
        continue;
      Load = dyn_cast<LoadInst>(*BCI->user_begin()); // Load from BCI?
    }

    if (!Load || !Load->isSimple())
      continue;

    unsigned LoadSize = DL.getTypeStoreSize(Load->getType());

    // TODO: Handle merged loads.
    if (IsV5OrAbove) { // Base is ImplicitArgPtr.
      switch (Offset) {
      case HIDDEN_BLOCK_COUNT_X:
        if (LoadSize == 4) {
          BlockCounts[0] = Load;
          annotateGridSizeLoadWithRangeMD(Load, MaxNumWorkgroups[0]);
        }
        break;
      case HIDDEN_BLOCK_COUNT_Y:
        if (LoadSize == 4) {
          BlockCounts[1] = Load;
          annotateGridSizeLoadWithRangeMD(Load, MaxNumWorkgroups[1]);
        }
        break;
      case HIDDEN_BLOCK_COUNT_Z:
        if (LoadSize == 4) {
          BlockCounts[2] = Load;
          annotateGridSizeLoadWithRangeMD(Load, MaxNumWorkgroups[2]);
        }
        break;
      case HIDDEN_GROUP_SIZE_X:
        if (LoadSize == 2)
          GroupSizes[0] = Load;
        break;
      case HIDDEN_GROUP_SIZE_Y:
        if (LoadSize == 2)
          GroupSizes[1] = Load;
        break;
      case HIDDEN_GROUP_SIZE_Z:
        if (LoadSize == 2)
          GroupSizes[2] = Load;
        break;
      case HIDDEN_REMAINDER_X:
        if (LoadSize == 2)
          Remainders[0] = Load;
        break;
      case HIDDEN_REMAINDER_Y:
        if (LoadSize == 2)
          Remainders[1] = Load;
        break;
      case HIDDEN_REMAINDER_Z:
        if (LoadSize == 2)
          Remainders[2] = Load;
        break;
      default:
        break;
      }
    } else { // Base is DispatchPtr.
      switch (Offset) {
      case WORKGROUP_SIZE_X:
        if (LoadSize == 2)
          GroupSizes[0] = Load;
        break;
      case WORKGROUP_SIZE_Y:
        if (LoadSize == 2)
          GroupSizes[1] = Load;
        break;
      case WORKGROUP_SIZE_Z:
        if (LoadSize == 2)
          GroupSizes[2] = Load;
        break;
      case GRID_SIZE_X:
        if (LoadSize == 4)
          GridSizes[0] = Load;
        break;
      case GRID_SIZE_Y:
        if (LoadSize == 4)
          GridSizes[1] = Load;
        break;
      case GRID_SIZE_Z:
        if (LoadSize == 4)
          GridSizes[2] = Load;
        break;
      default:
        break;
      }
    }
  }

  bool MadeChange = false;
  if (IsV5OrAbove && HasUniformWorkGroupSize) {
    // Under code object v5, __ockl_get_local_size returns the value computed
    // by the expression:
    //
    //   workgroup_id < hidden_block_count ? hidden_group_size : hidden_remainder
    //
    // For functions with the attribute uniform-work-group-size=true, we can
    // evaluate workgroup_id < hidden_block_count as true, and thus
    // hidden_group_size is returned for __ockl_get_local_size.
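    //
    // Illustrative IR shape for the X dimension (a sketch; value names and
    // the exact pointer arithmetic vary):
    //
    //   %bc  = load i32, ptr addrspace(4) %implicitarg.gep ; hidden_block_count_x
    //   %id  = call i32 @llvm.amdgcn.workgroup.id.x()
    //   %cmp = icmp ult i32 %id, %bc
    //
    // With uniform-work-group-size=true we fold %cmp to true below, and the
    // hidden_remainder loads are folded to 0.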
    for (int I = 0; I < 3; ++I) {
      Value *BlockCount = BlockCounts[I];
      if (!BlockCount)
        continue;

      using namespace llvm::PatternMatch;
      auto GroupIDIntrin =
          I == 0 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_x>()
                 : (I == 1 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_y>()
                           : m_Intrinsic<Intrinsic::amdgcn_workgroup_id_z>());

      for (User *ICmp : BlockCount->users()) {
        if (match(ICmp, m_SpecificICmp(ICmpInst::ICMP_ULT, GroupIDIntrin,
                                       m_Specific(BlockCount)))) {
          ICmp->replaceAllUsesWith(llvm::ConstantInt::getTrue(ICmp->getType()));
          MadeChange = true;
        }
      }
    }

    // All remainders should be 0 with uniform work group size.
    for (Value *Remainder : Remainders) {
      if (!Remainder)
        continue;
      Remainder->replaceAllUsesWith(
          Constant::getNullValue(Remainder->getType()));
      MadeChange = true;
    }
  } else if (HasUniformWorkGroupSize) { // Pre-V5.
    // Pattern match the code used to handle partial workgroup dispatches in the
    // library implementation of get_local_size, so the entire function can be
    // constant folded with a known group size.
    //
    // uint r = grid_size - group_id * group_size;
    // get_local_size = (r < group_size) ? r : group_size;
    //
    // If we have uniform-work-group-size (which is the default in OpenCL 1.2),
    // the grid_size is required to be a multiple of group_size. In this case:
    //
    // grid_size - (group_id * group_size) < group_size
    // ->
    // grid_size < group_size + (group_id * group_size)
    //
    // (grid_size / group_size) < 1 + group_id
    //
    // grid_size / group_size is at least 1, so we can conclude the select
    // condition is false (except for group_id == 0, where the select result is
    // the same).
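    //
    // The matched IR has roughly this shape for the X dimension (a sketch;
    // value names are illustrative, and the umin may also appear as an
    // icmp/select idiom, which m_UMin matches as well):
    //
    //   %gs16 = load i16, ptr addrspace(4) %dispatch.gep4  ; workgroup_size_x
    //   %gs   = zext i16 %gs16 to i32
    //   %grid = load i32, ptr addrspace(4) %dispatch.gep12 ; grid_size_x
    //   %id   = call i32 @llvm.amdgcn.workgroup.id.x()
    //   %mul  = mul i32 %id, %gs
    //   %rem  = sub i32 %grid, %mul
    //   %lsz  = call i32 @llvm.umin.i32(i32 %rem, i32 %gs)
    //
    // %lsz is replaced with %gs, or directly with the reqd_work_group_size
    // constant when that metadata is present.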
    for (int I = 0; I < 3; ++I) {
      Value *GroupSize = GroupSizes[I];
      Value *GridSize = GridSizes[I];
      if (!GroupSize || !GridSize)
        continue;

      using namespace llvm::PatternMatch;
      auto GroupIDIntrin =
          I == 0 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_x>()
                 : (I == 1 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_y>()
                           : m_Intrinsic<Intrinsic::amdgcn_workgroup_id_z>());

      for (User *U : GroupSize->users()) {
        auto *ZextGroupSize = dyn_cast<ZExtInst>(U);
        if (!ZextGroupSize)
          continue;

        for (User *UMin : ZextGroupSize->users()) {
          if (match(UMin,
                    m_UMin(m_Sub(m_Specific(GridSize),
                                 m_Mul(GroupIDIntrin, m_Specific(ZextGroupSize))),
                           m_Specific(ZextGroupSize)))) {
            if (HasReqdWorkGroupSize) {
              ConstantInt *KnownSize =
                  mdconst::extract<ConstantInt>(MD->getOperand(I));
              UMin->replaceAllUsesWith(ConstantFoldIntegerCast(
                  KnownSize, UMin->getType(), false, DL));
            } else {
              UMin->replaceAllUsesWith(ZextGroupSize);
            }

            MadeChange = true;
          }
        }
      }
    }
  }

  // If reqd_work_group_size is set, we can replace the workgroup size loads
  // with it.
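  // For example (a sketch), with !reqd_work_group_size !{i32 64, i32 1, i32 1}
  // the i16 group-size loads fold to 64, 1 and 1 respectively.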
  if (!HasReqdWorkGroupSize)
    return MadeChange;

  for (int I = 0; I < 3; I++) {
    Value *GroupSize = GroupSizes[I];
    if (!GroupSize)
      continue;

    ConstantInt *KnownSize = mdconst::extract<ConstantInt>(MD->getOperand(I));
    GroupSize->replaceAllUsesWith(
        ConstantFoldIntegerCast(KnownSize, GroupSize->getType(), false, DL));
    MadeChange = true;
  }

  return MadeChange;
}

// TODO: Move makeLIDRangeMetadata usage into here. We don't seem to be able to
// get the TargetPassConfig for the subtarget here.
bool AMDGPULowerKernelAttributes::runOnModule(Module &M) {
  bool MadeChange = false;
  bool IsV5OrAbove =
      AMDGPU::getAMDHSACodeObjectVersion(M) >= AMDGPU::AMDHSA_COV5;
  Function *BasePtr = getBasePtrIntrinsic(M, IsV5OrAbove);

  if (!BasePtr) // ImplicitArgPtr/DispatchPtr not used.
    return false;

  SmallPtrSet<Instruction *, 4> HandledUses;
  for (auto *U : BasePtr->users()) {
    CallInst *CI = cast<CallInst>(U);
    if (HandledUses.insert(CI).second) {
      if (processUse(CI, IsV5OrAbove))
        MadeChange = true;
    }
  }

  return MadeChange;
}

INITIALIZE_PASS_BEGIN(AMDGPULowerKernelAttributes, DEBUG_TYPE,
                      "AMDGPU Kernel Attributes", false, false)
INITIALIZE_PASS_END(AMDGPULowerKernelAttributes, DEBUG_TYPE,
                    "AMDGPU Kernel Attributes", false, false)

char AMDGPULowerKernelAttributes::ID = 0;

ModulePass *llvm::createAMDGPULowerKernelAttributesPass() {
  return new AMDGPULowerKernelAttributes();
}

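// New pass manager entry point. Assuming the pass is exposed under the
// DEBUG_TYPE name in the pass registry, it can be exercised in isolation with
// something like:
//   opt -passes=amdgpu-lower-kernel-attributes -S in.ll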
PreservedAnalyses
AMDGPULowerKernelAttributesPass::run(Function &F, FunctionAnalysisManager &AM) {
  bool IsV5OrAbove =
      AMDGPU::getAMDHSACodeObjectVersion(*F.getParent()) >= AMDGPU::AMDHSA_COV5;
  Function *BasePtr = getBasePtrIntrinsic(*F.getParent(), IsV5OrAbove);

  if (!BasePtr) // ImplicitArgPtr/DispatchPtr not used.
    return PreservedAnalyses::all();

  for (Instruction &I : instructions(F)) {
    if (CallInst *CI = dyn_cast<CallInst>(&I)) {
      if (CI->getCalledFunction() == BasePtr)
        processUse(CI, IsV5OrAbove);
    }
  }

  return PreservedAnalyses::all();
}
396