//===-- AMDGPULowerModuleLDSPass.cpp ------------------------------*- C++ -*-=//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass eliminates LDS uses from non-kernel functions.
//
// The strategy is to create a new struct with a field for each LDS variable
// and allocate that struct at the same address for every kernel. Uses of the
// original LDS variables are then replaced with compile time offsets from that
// known address. AMDGPUMachineFunction allocates the LDS global.
//
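// For illustration, given a module along these lines (the variable names and
// types here are made up):
//
//   @x = internal addrspace(3) global i32 undef
//   @y = internal addrspace(3) global [8 x float] undef
//   ; @x and @y are used from a non-kernel function
//
// the variables are gathered into a single struct instance, roughly
//
//   %llvm.amdgcn.module.lds.t = type { [8 x float], i32 }
//   @llvm.amdgcn.module.lds = internal addrspace(3) global
//       %llvm.amdgcn.module.lds.t undef
//
// and each use of @x or @y is rewritten to a constant getelementptr into
// @llvm.amdgcn.module.lds. The field order is chosen by
// performOptimizedStructLayout, so the layout shown is only an example.
//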
// Local variables with constant annotation or non-undef initializer are passed
// through unchanged for simplification or error diagnostics in later passes.
//
// To reduce the memory overhead, variables that are only used by kernels are
// excluded from this transform. The analysis to determine whether a variable
// is only used by a kernel is cheap and conservative, so this may allocate
// a variable in every kernel when it was not strictly necessary to do so.
//
// A possible future refinement is to specialise the structure per-kernel, so
// that fields can be elided based on more expensive analysis.
//
// NOTE: This pass packs all eligible LDS variables (which may amount to a lot
// of LDS) into a single struct type, and an instance of that struct is
// allocated in every kernel. Hence, before running this pass, it is advisable
// to run the "amdgpu-replace-lds-use-with-pointer" pass, which replaces LDS
// uses within non-kernel functions by pointers and thereby minimizes the
// unnecessary per-kernel allocation of LDS memory.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "Utils/AMDGPULDSUtils.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/OptimizedStructLayout.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"
#include <vector>

#define DEBUG_TYPE "amdgpu-lower-module-lds"

using namespace llvm;

static cl::opt<bool> SuperAlignLDSGlobals(
    "amdgpu-super-align-lds-globals",
    cl::desc("Increase alignment of LDS if it is not on align boundary"),
    cl::init(true), cl::Hidden);

namespace {

class AMDGPULowerModuleLDS : public ModulePass {

  static void removeFromUsedList(Module &M, StringRef Name,
                                 SmallPtrSetImpl<Constant *> &ToRemove) {
    GlobalVariable *GV = M.getNamedGlobal(Name);
    if (!GV || ToRemove.empty()) {
      return;
    }

    SmallVector<Constant *, 16> Init;
    auto *CA = cast<ConstantArray>(GV->getInitializer());
    for (auto &Op : CA->operands()) {
      // ModuleUtils::appendToUsed only inserts Constants
      Constant *C = cast<Constant>(Op);
      if (!ToRemove.contains(C->stripPointerCasts())) {
        Init.push_back(C);
      }
    }

    if (Init.size() == CA->getNumOperands()) {
      return; // none to remove
    }

    GV->eraseFromParent();

    for (Constant *C : ToRemove) {
      C->removeDeadConstantUsers();
    }

    if (!Init.empty()) {
      ArrayType *ATy =
          ArrayType::get(Type::getInt8PtrTy(M.getContext()), Init.size());
      GV =
          new llvm::GlobalVariable(M, ATy, false, GlobalValue::AppendingLinkage,
                                   ConstantArray::get(ATy, Init), Name);
      GV->setSection("llvm.metadata");
    }
  }

  static void
  removeFromUsedLists(Module &M,
                      const std::vector<GlobalVariable *> &LocalVars) {
    SmallPtrSet<Constant *, 32> LocalVarsSet;
    for (size_t I = 0; I < LocalVars.size(); I++) {
      if (Constant *C = dyn_cast<Constant>(LocalVars[I]->stripPointerCasts())) {
        LocalVarsSet.insert(C);
      }
    }
    removeFromUsedList(M, "llvm.used", LocalVarsSet);
    removeFromUsedList(M, "llvm.compiler.used", LocalVarsSet);
  }

  static void markUsedByKernel(IRBuilder<> &Builder, Function *Func,
                               GlobalVariable *SGV) {
    // The llvm.amdgcn.module.lds instance is implicitly used by all kernels
    // that might call a function which accesses a field within it. This is
    // presently approximated to 'all kernels' if there are any such functions
    // in the module. This implicit use is reified as an explicit use here so
    // that later passes, specifically PromoteAlloca, account for the required
    // memory without any knowledge of this transform.

    // An operand bundle on llvm.donothing works because the call instruction
    // survives until after the last pass that needs to account for LDS. It is
    // better than inline asm as the latter survives until the end of codegen. A
    // totally robust solution would be a function with the same semantics as
    // llvm.donothing that takes a pointer to the instance and is lowered to a
    // no-op after LDS is allocated, but that is not presently necessary.
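    //
    // The call emitted below looks roughly like (illustrative only):
    //
    //   call void @llvm.donothing() [ "ExplicitUse"(<pointer to the struct
    //       instance, e.g. @llvm.amdgcn.module.lds>) ]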

    LLVMContext &Ctx = Func->getContext();

    Builder.SetInsertPoint(Func->getEntryBlock().getFirstNonPHI());

    FunctionType *FTy = FunctionType::get(Type::getVoidTy(Ctx), {});

    Function *Decl =
        Intrinsic::getDeclaration(Func->getParent(), Intrinsic::donothing, {});

    Value *UseInstance[1] = {Builder.CreateInBoundsGEP(
        SGV->getValueType(), SGV, ConstantInt::get(Type::getInt32Ty(Ctx), 0))};

    Builder.CreateCall(FTy, Decl, {},
                       {OperandBundleDefT<Value *>("ExplicitUse", UseInstance)},
                       "");
  }

private:
  SmallPtrSet<GlobalValue *, 32> UsedList;

public:
  static char ID;

  AMDGPULowerModuleLDS() : ModulePass(ID) {
    initializeAMDGPULowerModuleLDSPass(*PassRegistry::getPassRegistry());
  }

  bool runOnModule(Module &M) override {
    UsedList = AMDGPU::getUsedList(M);

    bool Changed = processUsedLDS(M);

    for (Function &F : M.functions()) {
      // Only lower compute kernels' LDS.
      if (!AMDGPU::isKernel(F.getCallingConv()))
        continue;
      Changed |= processUsedLDS(M, &F);
    }

    UsedList.clear();
    return Changed;
  }

private:
  bool processUsedLDS(Module &M, Function *F = nullptr) {
    LLVMContext &Ctx = M.getContext();
    const DataLayout &DL = M.getDataLayout();

    // Find variables to move into new struct instance
    std::vector<GlobalVariable *> FoundLocalVars =
        AMDGPU::findVariablesToLower(M, F);

    if (FoundLocalVars.empty()) {
      // No variables to rewrite, no changes made.
      return false;
    }

    // Increase the alignment of LDS globals if necessary to maximise the chance
    // that we can use aligned LDS instructions to access them.
    if (SuperAlignLDSGlobals) {
      for (auto *GV : FoundLocalVars) {
        Align Alignment = AMDGPU::getAlign(DL, GV);
        TypeSize GVSize = DL.getTypeAllocSize(GV->getValueType());

        if (GVSize > 8) {
          // We might want to use a b96 or b128 load/store
          Alignment = std::max(Alignment, Align(16));
        } else if (GVSize > 4) {
          // We might want to use a b64 load/store
          Alignment = std::max(Alignment, Align(8));
        } else if (GVSize > 2) {
          // We might want to use a b32 load/store
          Alignment = std::max(Alignment, Align(4));
        } else if (GVSize > 1) {
          // We might want to use a b16 load/store
          Alignment = std::max(Alignment, Align(2));
        }

        GV->setAlignment(Alignment);
      }
    }

    SmallVector<OptimizedStructLayoutField, 8> LayoutFields;
    LayoutFields.reserve(FoundLocalVars.size());
    for (GlobalVariable *GV : FoundLocalVars) {
      OptimizedStructLayoutField F(GV, DL.getTypeAllocSize(GV->getValueType()),
                                   AMDGPU::getAlign(DL, GV));
      LayoutFields.emplace_back(F);
    }

    performOptimizedStructLayout(LayoutFields);

    std::vector<GlobalVariable *> LocalVars;
    LocalVars.reserve(FoundLocalVars.size()); // will be at least this large
    {
      // This usually won't need to insert any padding, perhaps avoid the alloc
      uint64_t CurrentOffset = 0;
      for (size_t I = 0; I < LayoutFields.size(); I++) {
        GlobalVariable *FGV = static_cast<GlobalVariable *>(
            const_cast<void *>(LayoutFields[I].Id));
        Align DataAlign = LayoutFields[I].Alignment;

        uint64_t DataAlignV = DataAlign.value();
        if (uint64_t Rem = CurrentOffset % DataAlignV) {
          uint64_t Padding = DataAlignV - Rem;

          // Append an array of padding bytes to meet alignment requested
          // Note (o +      (a - (o % a)) ) % a == 0
          //      (offset + Padding       ) % align == 0
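          //
          // e.g. (illustrative) CurrentOffset == 6 and DataAlignV == 4 gives
          // Rem == 2 and Padding == 2, so the next field lands at offset 8.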

          Type *ATy = ArrayType::get(Type::getInt8Ty(Ctx), Padding);
          LocalVars.push_back(new GlobalVariable(
              M, ATy, false, GlobalValue::InternalLinkage, UndefValue::get(ATy),
              "", nullptr, GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS,
              false));
          CurrentOffset += Padding;
        }

        LocalVars.push_back(FGV);
        CurrentOffset += LayoutFields[I].Size;
      }
    }

    std::vector<Type *> LocalVarTypes;
    LocalVarTypes.reserve(LocalVars.size());
    std::transform(
        LocalVars.cbegin(), LocalVars.cend(), std::back_inserter(LocalVarTypes),
        [](const GlobalVariable *V) -> Type * { return V->getValueType(); });

    std::string VarName(
        F ? (Twine("llvm.amdgcn.kernel.") + F->getName() + ".lds").str()
          : "llvm.amdgcn.module.lds");
    StructType *LDSTy = StructType::create(Ctx, LocalVarTypes, VarName + ".t");

    Align StructAlign =
        AMDGPU::getAlign(DL, LocalVars[0]);

    GlobalVariable *SGV = new GlobalVariable(
        M, LDSTy, false, GlobalValue::InternalLinkage, UndefValue::get(LDSTy),
        VarName, nullptr, GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS,
        false);
    SGV->setAlignment(StructAlign);
    if (!F) {
      appendToCompilerUsed(
          M, {static_cast<GlobalValue *>(
                 ConstantExpr::getPointerBitCastOrAddrSpaceCast(
                     cast<Constant>(SGV), Type::getInt8PtrTy(Ctx)))});
    }

    // The verifier rejects used lists containing an inttoptr of a constant
    // so remove the variables from these lists before replaceAllUsesWith
    removeFromUsedLists(M, LocalVars);

    // Create alias.scope and their lists. Each field in the new structure
    // does not alias with any of the other fields.
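    //
    // Concretely (illustrative): a memory access through the GEP for field I
    // is given !alias.scope holding only scope I and !noalias holding the
    // scopes of every other field.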
    SmallVector<MDNode *> AliasScopes;
    SmallVector<Metadata *> NoAliasList;
    if (LocalVars.size() > 1) {
      MDBuilder MDB(Ctx);
      AliasScopes.reserve(LocalVars.size());
      MDNode *Domain = MDB.createAnonymousAliasScopeDomain();
      for (size_t I = 0; I < LocalVars.size(); I++) {
        MDNode *Scope = MDB.createAnonymousAliasScope(Domain);
        AliasScopes.push_back(Scope);
      }
      NoAliasList.append(&AliasScopes[1], AliasScopes.end());
    }

    // Replace uses of ith variable with a constantexpr to the ith field of the
    // instance that will be allocated by AMDGPUMachineFunction
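    //
    // Each replacement is roughly of the form (illustrative; the per-kernel
    // variant uses the llvm.amdgcn.kernel.<name>.lds names instead):
    //
    //   getelementptr (%llvm.amdgcn.module.lds.t,
    //       %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds,
    //       i32 0, i32 <field index>)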
    Type *I32 = Type::getInt32Ty(Ctx);
    for (size_t I = 0; I < LocalVars.size(); I++) {
      GlobalVariable *GV = LocalVars[I];
      Constant *GEPIdx[] = {ConstantInt::get(I32, 0), ConstantInt::get(I32, I)};
      Constant *GEP = ConstantExpr::getGetElementPtr(LDSTy, SGV, GEPIdx);
      if (F) {
        // Replace all constant uses with instructions if they belong to the
        // current kernel.
        for (User *U : make_early_inc_range(GV->users())) {
          if (ConstantExpr *C = dyn_cast<ConstantExpr>(U))
            AMDGPU::replaceConstantUsesInFunction(C, F);
        }

        GV->removeDeadConstantUsers();

        GV->replaceUsesWithIf(GEP, [F](Use &U) {
          Instruction *I = dyn_cast<Instruction>(U.getUser());
          return I && I->getFunction() == F;
        });
      } else {
        GV->replaceAllUsesWith(GEP);
      }
      if (GV->use_empty()) {
        UsedList.erase(GV);
        GV->eraseFromParent();
      }

      uint64_t Off = DL.getStructLayout(LDSTy)->getElementOffset(I);
      Align A = commonAlignment(StructAlign, Off);

      if (I)
        NoAliasList[I - 1] = AliasScopes[I - 1];
      MDNode *NoAlias =
          NoAliasList.empty() ? nullptr : MDNode::get(Ctx, NoAliasList);
      MDNode *AliasScope =
          AliasScopes.empty() ? nullptr : MDNode::get(Ctx, {AliasScopes[I]});

      refineUsesAlignmentAndAA(GEP, A, DL, AliasScope, NoAlias);
    }

    // Mark kernels with an explicit use of the address of the allocated
    // structure. This is not necessary for lowering. This lets other passes,
    // specifically PromoteAlloca, accurately calculate how much LDS will be
    // used by the kernel after lowering.
    if (!F) {
      IRBuilder<> Builder(Ctx);
      SmallPtrSet<Function *, 32> Kernels;
      for (auto &I : M.functions()) {
        Function *Func = &I;
        if (AMDGPU::isKernelCC(Func) && !Kernels.contains(Func)) {
          markUsedByKernel(Builder, Func, SGV);
          Kernels.insert(Func);
        }
      }
    }
    return true;
  }

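  // Propagate the alignment implied by the field offset, along with the
  // alias.scope/noalias metadata, to the users of Ptr. Recurses through GEPs,
  // bitcasts and addrspacecasts up to MaxDepth levels.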
  void refineUsesAlignmentAndAA(Value *Ptr, Align A, const DataLayout &DL,
                                MDNode *AliasScope, MDNode *NoAlias,
                                unsigned MaxDepth = 5) {
    if (!MaxDepth || (A == 1 && !AliasScope))
      return;

    for (User *U : Ptr->users()) {
      if (auto *I = dyn_cast<Instruction>(U)) {
        if (AliasScope && I->mayReadOrWriteMemory()) {
          MDNode *AS = I->getMetadata(LLVMContext::MD_alias_scope);
          AS = MDNode::concatenate(AS, AliasScope);
          I->setMetadata(LLVMContext::MD_alias_scope, AS);

          MDNode *NA = I->getMetadata(LLVMContext::MD_noalias);
          NA = MDNode::concatenate(NA, NoAlias);
          I->setMetadata(LLVMContext::MD_noalias, NA);
        }
      }

      if (auto *LI = dyn_cast<LoadInst>(U)) {
        LI->setAlignment(std::max(A, LI->getAlign()));
        continue;
      }
      if (auto *SI = dyn_cast<StoreInst>(U)) {
        if (SI->getPointerOperand() == Ptr)
          SI->setAlignment(std::max(A, SI->getAlign()));
        continue;
      }
      if (auto *AI = dyn_cast<AtomicRMWInst>(U)) {
        // None of the atomicrmw operations can operate on pointers, but check
        // anyway in case that changes, or we are processing a ConstantExpr.
        if (AI->getPointerOperand() == Ptr)
          AI->setAlignment(std::max(A, AI->getAlign()));
        continue;
      }
      if (auto *AI = dyn_cast<AtomicCmpXchgInst>(U)) {
        if (AI->getPointerOperand() == Ptr)
          AI->setAlignment(std::max(A, AI->getAlign()));
        continue;
      }
      if (auto *GEP = dyn_cast<GetElementPtrInst>(U)) {
        unsigned BitWidth = DL.getIndexTypeSizeInBits(GEP->getType());
        APInt Off(BitWidth, 0);
        if (GEP->getPointerOperand() == Ptr) {
          Align GA;
          if (GEP->accumulateConstantOffset(DL, Off))
            GA = commonAlignment(A, Off.getLimitedValue());
          refineUsesAlignmentAndAA(GEP, GA, DL, AliasScope, NoAlias,
                                   MaxDepth - 1);
        }
        continue;
      }
      if (auto *I = dyn_cast<Instruction>(U)) {
        if (I->getOpcode() == Instruction::BitCast ||
            I->getOpcode() == Instruction::AddrSpaceCast)
          refineUsesAlignmentAndAA(I, A, DL, AliasScope, NoAlias, MaxDepth - 1);
      }
    }
  }
};

} // namespace
char AMDGPULowerModuleLDS::ID = 0;

char &llvm::AMDGPULowerModuleLDSID = AMDGPULowerModuleLDS::ID;

INITIALIZE_PASS(AMDGPULowerModuleLDS, DEBUG_TYPE,
                "Lower uses of LDS variables from non-kernel functions", false,
                false)

ModulePass *llvm::createAMDGPULowerModuleLDSPass() {
  return new AMDGPULowerModuleLDS();
}

PreservedAnalyses AMDGPULowerModuleLDSPass::run(Module &M,
                                                ModuleAnalysisManager &) {
  return AMDGPULowerModuleLDS().runOnModule(M) ? PreservedAnalyses::none()
                                               : PreservedAnalyses::all();
}
