//===-- AMDGPULowerModuleLDSPass.cpp ------------------------------*- C++ -*-=//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass eliminates LDS uses from non-kernel functions.
//
// The strategy is to create a new struct with a field for each LDS variable
// and allocate that struct at the same address for every kernel. Uses of the
// original LDS variables are then replaced with compile time offsets from that
// known address. AMDGPUMachineFunction allocates the LDS global.
//
// Local variables with a constant annotation or a non-undef initializer are
// passed through unchanged for simplification or error diagnostics in later
// passes.
//
// To reduce the memory overhead, variables that are only used by kernels are
// excluded from this transform. The analysis to determine whether a variable
// is only used by a kernel is cheap and conservative, so this may allocate
// a variable in every kernel when it was not strictly necessary to do so.
//
// A possible future refinement is to specialise the structure per-kernel, so
// that fields can be elided based on more expensive analysis.
//
// NOTE: This pass packs all lowered LDS variables (which may be large) into a
// single struct type, so a large struct instance may be allocated within
// every kernel. Hence, before running this pass, it is advisable to run the
// "amdgpu-replace-lds-use-with-pointer" pass, which replaces LDS uses within
// non-kernel functions with pointers and thereby minimizes the unnecessary
// per-kernel allocation of LDS memory.
//
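// As an illustration of the strategy described above, a module containing the
// following (hypothetical names; sketch only):
//
//   @var0 = internal addrspace(3) global i32 undef, align 4
//   @var1 = internal addrspace(3) global [8 x i8] undef, align 1
//
// is rewritten into roughly:
//
//   %llvm.amdgcn.module.lds.t = type { i32, [8 x i8] }
//   @llvm.amdgcn.module.lds = internal addrspace(3) global
//       %llvm.amdgcn.module.lds.t undef
//
// with uses of @var0 and @var1 replaced by constant GEPs selecting field 0 or
// field 1 of @llvm.amdgcn.module.lds. The actual field order, padding and
// alignment are chosen by the struct layout logic below and may differ from
// this sketch.
//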
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "Utils/AMDGPULDSUtils.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/OptimizedStructLayout.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"
#include <vector>

#define DEBUG_TYPE "amdgpu-lower-module-lds"

using namespace llvm;

static cl::opt<bool> SuperAlignLDSGlobals(
    "amdgpu-super-align-lds-globals",
    cl::desc("Increase alignment of LDS if it is not on align boundary"),
    cl::init(true), cl::Hidden);

namespace {

class AMDGPULowerModuleLDS : public ModulePass {

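  // Remove the entries in ToRemove from the appending global named Name
  // (e.g. llvm.used or llvm.compiler.used), recreating the array without
  // them. If nothing would remain, the global is simply deleted.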
  static void removeFromUsedList(Module &M, StringRef Name,
                                 SmallPtrSetImpl<Constant *> &ToRemove) {
    GlobalVariable *GV = M.getNamedGlobal(Name);
    if (!GV || ToRemove.empty()) {
      return;
    }

    SmallVector<Constant *, 16> Init;
    auto *CA = cast<ConstantArray>(GV->getInitializer());
    for (auto &Op : CA->operands()) {
      // ModuleUtils::appendToUsed only inserts Constants
      Constant *C = cast<Constant>(Op);
      if (!ToRemove.contains(C->stripPointerCasts())) {
        Init.push_back(C);
      }
    }

    if (Init.size() == CA->getNumOperands()) {
      return; // none to remove
    }

    GV->eraseFromParent();

    for (Constant *C : ToRemove) {
      C->removeDeadConstantUsers();
    }

    if (!Init.empty()) {
      ArrayType *ATy =
          ArrayType::get(Type::getInt8PtrTy(M.getContext()), Init.size());
      GV =
          new llvm::GlobalVariable(M, ATy, false, GlobalValue::AppendingLinkage,
                                   ConstantArray::get(ATy, Init), Name);
      GV->setSection("llvm.metadata");
    }
  }

  static void
  removeFromUsedLists(Module &M,
                      const std::vector<GlobalVariable *> &LocalVars) {
    SmallPtrSet<Constant *, 32> LocalVarsSet;
    for (size_t I = 0; I < LocalVars.size(); I++) {
      if (Constant *C = dyn_cast<Constant>(LocalVars[I]->stripPointerCasts())) {
        LocalVarsSet.insert(C);
      }
    }
    removeFromUsedList(M, "llvm.used", LocalVarsSet);
    removeFromUsedList(M, "llvm.compiler.used", LocalVarsSet);
  }

  static void markUsedByKernel(IRBuilder<> &Builder, Function *Func,
                               GlobalVariable *SGV) {
    // The llvm.amdgcn.module.lds instance is implicitly used by all kernels
    // that might call a function which accesses a field within it. This is
    // presently approximated to 'all kernels' if there are any such functions
    // in the module. This implicit use is redefined as an explicit use here so
    // that later passes, specifically PromoteAlloca, account for the required
    // memory without any knowledge of this transform.

    // An operand bundle on llvm.donothing works because the call instruction
    // survives until after the last pass that needs to account for LDS. It is
    // better than inline asm as the latter survives until the end of codegen. A
    // totally robust solution would be a function with the same semantics as
    // llvm.donothing that takes a pointer to the instance and is lowered to a
    // no-op after LDS is allocated, but that is not presently necessary.
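
    // The emitted use looks roughly like the following (sketch only; the GEP
    // typically constant-folds, so the bundle operand is the variable itself):
    //   call void @llvm.donothing() [ "ExplicitUse"(
    //       %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ]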

    LLVMContext &Ctx = Func->getContext();

    Builder.SetInsertPoint(Func->getEntryBlock().getFirstNonPHI());

    FunctionType *FTy = FunctionType::get(Type::getVoidTy(Ctx), {});

    Function *Decl =
        Intrinsic::getDeclaration(Func->getParent(), Intrinsic::donothing, {});

    Value *UseInstance[1] = {Builder.CreateInBoundsGEP(
        SGV->getValueType(), SGV, ConstantInt::get(Type::getInt32Ty(Ctx), 0))};

    Builder.CreateCall(FTy, Decl, {},
                       {OperandBundleDefT<Value *>("ExplicitUse", UseInstance)},
                       "");
  }

private:
  SmallPtrSet<GlobalValue *, 32> UsedList;

public:
  static char ID;

  AMDGPULowerModuleLDS() : ModulePass(ID) {
    initializeAMDGPULowerModuleLDSPass(*PassRegistry::getPassRegistry());
  }

  bool runOnModule(Module &M) override {
    UsedList = AMDGPU::getUsedList(M);

    bool Changed = processUsedLDS(M);

    for (Function &F : M.functions()) {
      if (F.isDeclaration())
        continue;

      // Only lower compute kernels' LDS.
      if (!AMDGPU::isKernel(F.getCallingConv()))
        continue;
      Changed |= processUsedLDS(M, &F);
    }

    UsedList.clear();
    return Changed;
  }

private:
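  // Pack the LDS variables that need lowering into a struct and replace their
  // uses with addresses into it. When F is null this creates the module-scope
  // llvm.amdgcn.module.lds instance used by non-kernel functions; otherwise it
  // creates a per-kernel llvm.amdgcn.kernel.<name>.lds instance and only
  // rewrites uses inside that kernel. Returns true if the module was changed.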
  bool processUsedLDS(Module &M, Function *F = nullptr) {
    LLVMContext &Ctx = M.getContext();
    const DataLayout &DL = M.getDataLayout();

    // Find variables to move into new struct instance
    std::vector<GlobalVariable *> FoundLocalVars =
        AMDGPU::findVariablesToLower(M, F);

    if (FoundLocalVars.empty()) {
      // No variables to rewrite, no changes made.
      return false;
    }

    // Increase the alignment of LDS globals if necessary to maximise the
    // chance that we can use aligned LDS instructions to access them.
    if (SuperAlignLDSGlobals) {
      for (auto *GV : FoundLocalVars) {
        Align Alignment = AMDGPU::getAlign(DL, GV);
        TypeSize GVSize = DL.getTypeAllocSize(GV->getValueType());

        if (GVSize > 8) {
          // We might want to use a b96 or b128 load/store
          Alignment = std::max(Alignment, Align(16));
        } else if (GVSize > 4) {
          // We might want to use a b64 load/store
          Alignment = std::max(Alignment, Align(8));
        } else if (GVSize > 2) {
          // We might want to use a b32 load/store
          Alignment = std::max(Alignment, Align(4));
        } else if (GVSize > 1) {
          // We might want to use a b16 load/store
          Alignment = std::max(Alignment, Align(2));
        }

        GV->setAlignment(Alignment);
      }
    }

    SmallVector<OptimizedStructLayoutField, 8> LayoutFields;
    LayoutFields.reserve(FoundLocalVars.size());
    for (GlobalVariable *GV : FoundLocalVars) {
      OptimizedStructLayoutField F(GV, DL.getTypeAllocSize(GV->getValueType()),
                                   AMDGPU::getAlign(DL, GV));
      LayoutFields.emplace_back(F);
    }

    performOptimizedStructLayout(LayoutFields);

    std::vector<GlobalVariable *> LocalVars;
    LocalVars.reserve(FoundLocalVars.size()); // will be at least this large
    {
      // This usually won't need to insert any padding, perhaps avoid the alloc
      uint64_t CurrentOffset = 0;
      for (size_t I = 0; I < LayoutFields.size(); I++) {
        GlobalVariable *FGV = static_cast<GlobalVariable *>(
            const_cast<void *>(LayoutFields[I].Id));
        Align DataAlign = LayoutFields[I].Alignment;

        uint64_t DataAlignV = DataAlign.value();
        if (uint64_t Rem = CurrentOffset % DataAlignV) {
          uint64_t Padding = DataAlignV - Rem;

          // Append an array of padding bytes to meet alignment requested
          // Note (o +      (a - (o % a)) ) % a == 0
          //      (offset + Padding       ) % align == 0

          Type *ATy = ArrayType::get(Type::getInt8Ty(Ctx), Padding);
          LocalVars.push_back(new GlobalVariable(
              M, ATy, false, GlobalValue::InternalLinkage, UndefValue::get(ATy),
              "", nullptr, GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS,
              false));
          CurrentOffset += Padding;
        }

        LocalVars.push_back(FGV);
        CurrentOffset += LayoutFields[I].Size;
      }
    }

    std::vector<Type *> LocalVarTypes;
    LocalVarTypes.reserve(LocalVars.size());
    std::transform(
        LocalVars.cbegin(), LocalVars.cend(), std::back_inserter(LocalVarTypes),
        [](const GlobalVariable *V) -> Type * { return V->getValueType(); });

    std::string VarName(
        F ? (Twine("llvm.amdgcn.kernel.") + F->getName() + ".lds").str()
          : "llvm.amdgcn.module.lds");
    StructType *LDSTy = StructType::create(Ctx, LocalVarTypes, VarName + ".t");

    Align StructAlign = AMDGPU::getAlign(DL, LocalVars[0]);

    GlobalVariable *SGV = new GlobalVariable(
        M, LDSTy, false, GlobalValue::InternalLinkage, UndefValue::get(LDSTy),
        VarName, nullptr, GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS,
        false);
    SGV->setAlignment(StructAlign);
    if (!F) {
      appendToCompilerUsed(
          M, {static_cast<GlobalValue *>(
                 ConstantExpr::getPointerBitCastOrAddrSpaceCast(
                     cast<Constant>(SGV), Type::getInt8PtrTy(Ctx)))});
    }

    // The verifier rejects used lists containing an inttoptr of a constant
    // so remove the variables from these lists before replaceAllUsesWith
    removeFromUsedLists(M, LocalVars);

290     // does not alias with all other fields.
291     SmallVector<MDNode *> AliasScopes;
    SmallVector<MDNode *> AliasScopes;
    SmallVector<Metadata *> NoAliasList;
    if (LocalVars.size() > 1) {
      MDBuilder MDB(Ctx);
      AliasScopes.reserve(LocalVars.size());
      MDNode *Domain = MDB.createAnonymousAliasScopeDomain();
      for (size_t I = 0; I < LocalVars.size(); I++) {
        MDNode *Scope = MDB.createAnonymousAliasScope(Domain);
        AliasScopes.push_back(Scope);
      }
      NoAliasList.append(&AliasScopes[1], AliasScopes.end());
    }

    // Replace uses of the ith variable with a constantexpr to the ith field of
    // the instance that will be allocated by AMDGPUMachineFunction.
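    // For the module-scope case a rewritten use looks roughly like the
    // following (illustrative sketch only; the struct name and field index
    // depend on the module):
    //   getelementptr (%llvm.amdgcn.module.lds.t,
    //                  %llvm.amdgcn.module.lds.t addrspace(3)*
    //                      @llvm.amdgcn.module.lds, i32 0, i32 I)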
    Type *I32 = Type::getInt32Ty(Ctx);
    for (size_t I = 0; I < LocalVars.size(); I++) {
      GlobalVariable *GV = LocalVars[I];
      Constant *GEPIdx[] = {ConstantInt::get(I32, 0), ConstantInt::get(I32, I)};
      Constant *GEP = ConstantExpr::getGetElementPtr(LDSTy, SGV, GEPIdx);
      if (F) {
        // Replace all constant uses with instructions if they belong to the
        // current kernel.
        for (User *U : make_early_inc_range(GV->users())) {
          if (ConstantExpr *C = dyn_cast<ConstantExpr>(U))
            AMDGPU::replaceConstantUsesInFunction(C, F);
        }

        GV->removeDeadConstantUsers();

        GV->replaceUsesWithIf(GEP, [F](Use &U) {
          Instruction *I = dyn_cast<Instruction>(U.getUser());
          return I && I->getFunction() == F;
        });
      } else {
        GV->replaceAllUsesWith(GEP);
      }
      if (GV->use_empty()) {
        UsedList.erase(GV);
        GV->eraseFromParent();
      }

      uint64_t Off = DL.getStructLayout(LDSTy)->getElementOffset(I);
      Align A = commonAlignment(StructAlign, Off);

      if (I)
        NoAliasList[I - 1] = AliasScopes[I - 1];
      MDNode *NoAlias =
          NoAliasList.empty() ? nullptr : MDNode::get(Ctx, NoAliasList);
      MDNode *AliasScope =
          AliasScopes.empty() ? nullptr : MDNode::get(Ctx, {AliasScopes[I]});

      refineUsesAlignmentAndAA(GEP, A, DL, AliasScope, NoAlias);
    }

    // Mark kernels with a call that reads the address of the allocated
    // structure. This is not necessary for lowering. It lets other passes,
    // specifically PromoteAlloca, accurately calculate how much LDS will be
    // used by the kernel after lowering.
    if (!F) {
      IRBuilder<> Builder(Ctx);
      SmallPtrSet<Function *, 32> Kernels;
      for (Function &Func : M.functions()) {
        if (Func.isDeclaration())
          continue;

        if (AMDGPU::isKernelCC(&Func) && !Kernels.contains(&Func)) {
          markUsedByKernel(Builder, &Func, SGV);
          Kernels.insert(&Func);
        }
      }
    }
    return true;
  }

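  // Propagate the alignment known for the struct field at Ptr, together with
  // the alias.scope/noalias metadata, to the memory instructions that use it,
  // recursing through GEPs, bitcasts and addrspacecasts up to MaxDepth levels.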
  void refineUsesAlignmentAndAA(Value *Ptr, Align A, const DataLayout &DL,
                                MDNode *AliasScope, MDNode *NoAlias,
                                unsigned MaxDepth = 5) {
    if (!MaxDepth || (A == 1 && !AliasScope))
      return;

    for (User *U : Ptr->users()) {
      if (auto *I = dyn_cast<Instruction>(U)) {
        if (AliasScope && I->mayReadOrWriteMemory()) {
          MDNode *AS = I->getMetadata(LLVMContext::MD_alias_scope);
          AS = MDNode::concatenate(AS, AliasScope);
          I->setMetadata(LLVMContext::MD_alias_scope, AS);

          MDNode *NA = I->getMetadata(LLVMContext::MD_noalias);
          NA = MDNode::concatenate(NA, NoAlias);
          I->setMetadata(LLVMContext::MD_noalias, NA);
        }
      }

      if (auto *LI = dyn_cast<LoadInst>(U)) {
        LI->setAlignment(std::max(A, LI->getAlign()));
        continue;
      }
      if (auto *SI = dyn_cast<StoreInst>(U)) {
        if (SI->getPointerOperand() == Ptr)
          SI->setAlignment(std::max(A, SI->getAlign()));
        continue;
      }
      if (auto *AI = dyn_cast<AtomicRMWInst>(U)) {
        // None of the atomicrmw operations can operate on pointers, but check
        // anyway in case that changes or we end up processing a ConstantExpr.
        if (AI->getPointerOperand() == Ptr)
          AI->setAlignment(std::max(A, AI->getAlign()));
        continue;
      }
      if (auto *AI = dyn_cast<AtomicCmpXchgInst>(U)) {
        if (AI->getPointerOperand() == Ptr)
          AI->setAlignment(std::max(A, AI->getAlign()));
        continue;
      }
      if (auto *GEP = dyn_cast<GetElementPtrInst>(U)) {
        unsigned BitWidth = DL.getIndexTypeSizeInBits(GEP->getType());
        APInt Off(BitWidth, 0);
        if (GEP->getPointerOperand() == Ptr) {
          Align GA;
          if (GEP->accumulateConstantOffset(DL, Off))
            GA = commonAlignment(A, Off.getLimitedValue());
          refineUsesAlignmentAndAA(GEP, GA, DL, AliasScope, NoAlias,
                                   MaxDepth - 1);
        }
        continue;
      }
      if (auto *I = dyn_cast<Instruction>(U)) {
        if (I->getOpcode() == Instruction::BitCast ||
            I->getOpcode() == Instruction::AddrSpaceCast)
          refineUsesAlignmentAndAA(I, A, DL, AliasScope, NoAlias, MaxDepth - 1);
      }
    }
  }
};

} // namespace
char AMDGPULowerModuleLDS::ID = 0;

char &llvm::AMDGPULowerModuleLDSID = AMDGPULowerModuleLDS::ID;

INITIALIZE_PASS(AMDGPULowerModuleLDS, DEBUG_TYPE,
                "Lower uses of LDS variables from non-kernel functions", false,
                false)

ModulePass *llvm::createAMDGPULowerModuleLDSPass() {
  return new AMDGPULowerModuleLDS();
}

PreservedAnalyses AMDGPULowerModuleLDSPass::run(Module &M,
                                                ModuleAnalysisManager &) {
  return AMDGPULowerModuleLDS().runOnModule(M) ? PreservedAnalyses::none()
                                               : PreservedAnalyses::all();
}