//===-- AMDGPULowerModuleLDSPass.cpp ------------------------------*- C++ -*-=//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass eliminates LDS uses from non-kernel functions.
//
// The strategy is to create a new struct with a field for each LDS variable
// and allocate that struct at the same address for every kernel. Uses of the
// original LDS variables are then replaced with compile time offsets from that
// known address. AMDGPUMachineFunction allocates the LDS global.
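//
// For illustration only, a minimal sketch of the module-scope case: the names
// @a and @b are made up, while llvm.amdgcn.module.lds and its struct type are
// the names this pass actually creates. Given
//
//   @a = internal addrspace(3) global i32 undef
//   @b = internal addrspace(3) global i64 undef
//
// used from a non-kernel function, the pass builds roughly
//
//   %llvm.amdgcn.module.lds.t = type { i64, i32 }
//   @llvm.amdgcn.module.lds = internal addrspace(3)
//       global %llvm.amdgcn.module.lds.t undef, align 8
//
// and rewrites uses of @a and @b as constantexpr GEPs into the corresponding
// fields of @llvm.amdgcn.module.lds.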
//
// Local variables with constant annotation or non-undef initializer are passed
// through unchanged for simplification or error diagnostics in later passes.
//
// To reduce the memory overhead, variables that are only used by kernels are
// excluded from this transform. The analysis to determine whether a variable
// is only used by a kernel is cheap and conservative, so this may allocate
// a variable in every kernel when it was not strictly necessary to do so.
//
// A possible future refinement is to specialise the structure per-kernel, so
// that fields can be elided based on more expensive analysis.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "Utils/AMDGPUMemoryUtils.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/OptimizedStructLayout.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"
#include <vector>

#define DEBUG_TYPE "amdgpu-lower-module-lds"

using namespace llvm;

static cl::opt<bool> SuperAlignLDSGlobals(
    "amdgpu-super-align-lds-globals",
    cl::desc("Increase alignment of LDS if it is not on align boundary"),
    cl::init(true), cl::Hidden);

namespace {

SmallPtrSet<GlobalValue *, 32> getUsedList(Module &M) {
  SmallPtrSet<GlobalValue *, 32> UsedList;

  SmallVector<GlobalValue *, 32> TmpVec;
  collectUsedGlobalVariables(M, TmpVec, true);
  UsedList.insert(TmpVec.begin(), TmpVec.end());

  TmpVec.clear();
  collectUsedGlobalVariables(M, TmpVec, false);
  UsedList.insert(TmpVec.begin(), TmpVec.end());

  return UsedList;
}

class AMDGPULowerModuleLDS : public ModulePass {

  static void removeFromUsedList(Module &M, StringRef Name,
                                 SmallPtrSetImpl<Constant *> &ToRemove) {
    GlobalVariable *GV = M.getNamedGlobal(Name);
    if (!GV || ToRemove.empty()) {
      return;
    }

    SmallVector<Constant *, 16> Init;
    auto *CA = cast<ConstantArray>(GV->getInitializer());
    for (auto &Op : CA->operands()) {
      // ModuleUtils::appendToUsed only inserts Constants
      Constant *C = cast<Constant>(Op);
      if (!ToRemove.contains(C->stripPointerCasts())) {
        Init.push_back(C);
      }
    }

    if (Init.size() == CA->getNumOperands()) {
      return; // none to remove
    }

    GV->eraseFromParent();

    for (Constant *C : ToRemove) {
      C->removeDeadConstantUsers();
    }

    if (!Init.empty()) {
      ArrayType *ATy =
          ArrayType::get(Type::getInt8PtrTy(M.getContext()), Init.size());
      GV =
          new llvm::GlobalVariable(M, ATy, false, GlobalValue::AppendingLinkage,
                                   ConstantArray::get(ATy, Init), Name);
      GV->setSection("llvm.metadata");
    }
  }

  static void
  removeFromUsedLists(Module &M,
                      const std::vector<GlobalVariable *> &LocalVars) {
    SmallPtrSet<Constant *, 32> LocalVarsSet;
    for (GlobalVariable *LocalVar : LocalVars)
      if (Constant *C = dyn_cast<Constant>(LocalVar->stripPointerCasts()))
        LocalVarsSet.insert(C);
    removeFromUsedList(M, "llvm.used", LocalVarsSet);
    removeFromUsedList(M, "llvm.compiler.used", LocalVarsSet);
  }

  static void markUsedByKernel(IRBuilder<> &Builder, Function *Func,
                               GlobalVariable *SGV) {
    // The llvm.amdgcn.module.lds instance is implicitly used by all kernels
    // that might call a function which accesses a field within it. This is
    // presently approximated to 'all kernels' if there are any such functions
    // in the module. This implicit use is redefined as an explicit use here so
    // that later passes, specifically PromoteAlloca, account for the required
    // memory without any knowledge of this transform.

    // An operand bundle on llvm.donothing works because the call instruction
    // survives until after the last pass that needs to account for LDS. It is
    // better than inline asm as the latter survives until the end of codegen. A
    // totally robust solution would be a function with the same semantics as
    // llvm.donothing that takes a pointer to the instance and is lowered to a
    // no-op after LDS is allocated, but that is not presently necessary.
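
    // For illustration, the marker emitted below ends up looking roughly like
    // this in the kernel's entry block (the all-zero GEP typically folds back
    // to the global created by processUsedLDS):
    //
    //   call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t
    //       addrspace(3)* @llvm.amdgcn.module.lds) ]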

    LLVMContext &Ctx = Func->getContext();

    Builder.SetInsertPoint(Func->getEntryBlock().getFirstNonPHI());

    FunctionType *FTy = FunctionType::get(Type::getVoidTy(Ctx), {});

    Function *Decl =
        Intrinsic::getDeclaration(Func->getParent(), Intrinsic::donothing, {});

    Value *UseInstance[1] = {Builder.CreateInBoundsGEP(
        SGV->getValueType(), SGV, ConstantInt::get(Type::getInt32Ty(Ctx), 0))};

    Builder.CreateCall(FTy, Decl, {},
                       {OperandBundleDefT<Value *>("ExplicitUse", UseInstance)},
                       "");
  }

private:
  SmallPtrSet<GlobalValue *, 32> UsedList;

public:
  static char ID;

  AMDGPULowerModuleLDS() : ModulePass(ID) {
    initializeAMDGPULowerModuleLDSPass(*PassRegistry::getPassRegistry());
  }

  bool runOnModule(Module &M) override {
    CallGraph CG = CallGraph(M);
    UsedList = getUsedList(M);
    bool Changed = superAlignLDSGlobals(M);
    Changed |= processUsedLDS(CG, M);

    for (Function &F : M.functions()) {
      if (F.isDeclaration())
        continue;

      // Only lower compute kernels' LDS.
      if (!AMDGPU::isKernel(F.getCallingConv()))
        continue;
      Changed |= processUsedLDS(CG, M, &F);
    }

    UsedList.clear();
    return Changed;
  }

private:
  // Increase the alignment of LDS globals if necessary to maximise the chance
  // that we can use aligned LDS instructions to access them.
  static bool superAlignLDSGlobals(Module &M) {
    const DataLayout &DL = M.getDataLayout();
    bool Changed = false;
    if (!SuperAlignLDSGlobals) {
      return Changed;
    }

    for (auto &GV : M.globals()) {
      if (GV.getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) {
        // Only changing alignment of LDS variables
        continue;
      }
      if (!GV.hasInitializer()) {
        // cuda/hip extern __shared__ variable, leave alignment alone
        continue;
      }

      Align Alignment = AMDGPU::getAlign(DL, &GV);
      TypeSize GVSize = DL.getTypeAllocSize(GV.getValueType());

      if (GVSize > 8) {
        // We might want to use a b96 or b128 load/store
        Alignment = std::max(Alignment, Align(16));
      } else if (GVSize > 4) {
        // We might want to use a b64 load/store
        Alignment = std::max(Alignment, Align(8));
      } else if (GVSize > 2) {
        // We might want to use a b32 load/store
        Alignment = std::max(Alignment, Align(4));
      } else if (GVSize > 1) {
        // We might want to use a b16 load/store
        Alignment = std::max(Alignment, Align(2));
      }

      if (Alignment != AMDGPU::getAlign(DL, &GV)) {
        Changed = true;
        GV.setAlignment(Alignment);
      }
    }
    return Changed;
  }

  bool processUsedLDS(CallGraph const &CG, Module &M, Function *F = nullptr) {
    LLVMContext &Ctx = M.getContext();
    const DataLayout &DL = M.getDataLayout();

    // Find variables to move into new struct instance
    std::vector<GlobalVariable *> FoundLocalVars =
        AMDGPU::findVariablesToLower(M, F);

    if (FoundLocalVars.empty()) {
      // No variables to rewrite, no changes made.
      return false;
    }

    SmallVector<OptimizedStructLayoutField, 8> LayoutFields;
    LayoutFields.reserve(FoundLocalVars.size());
    for (GlobalVariable *GV : FoundLocalVars) {
      OptimizedStructLayoutField F(GV, DL.getTypeAllocSize(GV->getValueType()),
                                   AMDGPU::getAlign(DL, GV));
      LayoutFields.emplace_back(F);
    }

    performOptimizedStructLayout(LayoutFields);

    std::vector<GlobalVariable *> LocalVars;
    LocalVars.reserve(FoundLocalVars.size()); // will be at least this large
    {
      // This usually won't need to insert any padding, perhaps avoid the alloc
      uint64_t CurrentOffset = 0;
      for (size_t I = 0; I < LayoutFields.size(); I++) {
        GlobalVariable *FGV = static_cast<GlobalVariable *>(
            const_cast<void *>(LayoutFields[I].Id));
        Align DataAlign = LayoutFields[I].Alignment;

        uint64_t DataAlignV = DataAlign.value();
        if (uint64_t Rem = CurrentOffset % DataAlignV) {
          uint64_t Padding = DataAlignV - Rem;

          // Append an array of padding bytes to meet alignment requested
          // Note (o +      (a - (o % a)) ) % a == 0
          //      (offset + Padding       ) % align == 0

          Type *ATy = ArrayType::get(Type::getInt8Ty(Ctx), Padding);
          LocalVars.push_back(new GlobalVariable(
              M, ATy, false, GlobalValue::InternalLinkage, UndefValue::get(ATy),
              "", nullptr, GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS,
              false));
          CurrentOffset += Padding;
        }

        LocalVars.push_back(FGV);
        CurrentOffset += LayoutFields[I].Size;
      }
    }

    std::vector<Type *> LocalVarTypes;
    LocalVarTypes.reserve(LocalVars.size());
    std::transform(
        LocalVars.cbegin(), LocalVars.cend(), std::back_inserter(LocalVarTypes),
        [](const GlobalVariable *V) -> Type * { return V->getValueType(); });

    std::string VarName(
        F ? (Twine("llvm.amdgcn.kernel.") + F->getName() + ".lds").str()
          : "llvm.amdgcn.module.lds");
    StructType *LDSTy = StructType::create(Ctx, LocalVarTypes, VarName + ".t");

    Align StructAlign = AMDGPU::getAlign(DL, LocalVars[0]);

    GlobalVariable *SGV = new GlobalVariable(
        M, LDSTy, false, GlobalValue::InternalLinkage, UndefValue::get(LDSTy),
        VarName, nullptr, GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS,
        false);
    SGV->setAlignment(StructAlign);
    if (!F) {
      appendToCompilerUsed(
          M, {static_cast<GlobalValue *>(
                 ConstantExpr::getPointerBitCastOrAddrSpaceCast(
                     cast<Constant>(SGV), Type::getInt8PtrTy(Ctx)))});
    }

    // The verifier rejects used lists containing an inttoptr of a constant,
    // so remove the variables from these lists before replaceAllUsesWith.
    removeFromUsedLists(M, LocalVars);

    // Create alias.scope metadata and the corresponding noalias lists. Each
    // field in the new structure does not alias any of the other fields.
    SmallVector<MDNode *> AliasScopes;
    SmallVector<Metadata *> NoAliasList;
    if (LocalVars.size() > 1) {
      MDBuilder MDB(Ctx);
      AliasScopes.reserve(LocalVars.size());
      MDNode *Domain = MDB.createAnonymousAliasScopeDomain();
      for (size_t I = 0; I < LocalVars.size(); I++) {
        MDNode *Scope = MDB.createAnonymousAliasScope(Domain);
        AliasScopes.push_back(Scope);
      }
      NoAliasList.append(&AliasScopes[1], AliasScopes.end());
    }

    // Replace uses of the ith variable with a constantexpr GEP to the ith
    // field of the instance that will be allocated by AMDGPUMachineFunction.
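    // For example (a sketch of the module-scope case; @var stands for one of
    // the original LDS variables and <I> for its index in the struct layout):
    //
    //   ... @var ...
    // becomes
    //   ... getelementptr inbounds (%llvm.amdgcn.module.lds.t,
    //         %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds,
    //         i32 0, i32 <I>) ...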
    Type *I32 = Type::getInt32Ty(Ctx);
    for (size_t I = 0; I < LocalVars.size(); I++) {
      GlobalVariable *GV = LocalVars[I];
      Constant *GEPIdx[] = {ConstantInt::get(I32, 0), ConstantInt::get(I32, I)};
      Constant *GEP = ConstantExpr::getGetElementPtr(LDSTy, SGV, GEPIdx);
      if (F) {
        // Replace all constant uses with instructions if they belong to the
        // current kernel.
        for (User *U : make_early_inc_range(GV->users())) {
          if (ConstantExpr *C = dyn_cast<ConstantExpr>(U))
            AMDGPU::replaceConstantUsesInFunction(C, F);
        }

        GV->removeDeadConstantUsers();

        GV->replaceUsesWithIf(GEP, [F](Use &U) {
          Instruction *I = dyn_cast<Instruction>(U.getUser());
          return I && I->getFunction() == F;
        });
      } else {
        GV->replaceAllUsesWith(GEP);
      }
      if (GV->use_empty()) {
        UsedList.erase(GV);
        GV->eraseFromParent();
      }

      uint64_t Off = DL.getStructLayout(LDSTy)->getElementOffset(I);
      Align A = commonAlignment(StructAlign, Off);

      if (I)
        NoAliasList[I - 1] = AliasScopes[I - 1];
      MDNode *NoAlias =
          NoAliasList.empty() ? nullptr : MDNode::get(Ctx, NoAliasList);
      MDNode *AliasScope =
          AliasScopes.empty() ? nullptr : MDNode::get(Ctx, {AliasScopes[I]});

      refineUsesAlignmentAndAA(GEP, A, DL, AliasScope, NoAlias);
    }

    // This ensures the variable is allocated when called functions access it.
    // It also lets other passes, specifically PromoteAlloca, accurately
    // calculate how much LDS will be used by the kernel after lowering.
    if (!F) {
      IRBuilder<> Builder(Ctx);
      for (Function &Func : M.functions()) {
        if (!Func.isDeclaration() && AMDGPU::isKernelCC(&Func)) {
          const CallGraphNode *N = CG[&Func];
          const bool CalleesRequireModuleLDS = N->size() > 0;

          if (CalleesRequireModuleLDS) {
            // If a function this kernel might call requires module LDS,
            // annotate the kernel to let later passes know it will allocate
            // this structure, even if not apparent from the IR.
            markUsedByKernel(Builder, &Func, SGV);
          } else {
            // However if we are certain this kernel cannot call a function that
            // requires module LDS, annotate the kernel so the backend can elide
            // the allocation without repeating callgraph walks.
            Func.addFnAttr("amdgpu-elide-module-lds");
          }
        }
      }
    }
    return true;
  }

  void refineUsesAlignmentAndAA(Value *Ptr, Align A, const DataLayout &DL,
                                MDNode *AliasScope, MDNode *NoAlias,
                                unsigned MaxDepth = 5) {
    if (!MaxDepth || (A == 1 && !AliasScope))
      return;

    for (User *U : Ptr->users()) {
      if (auto *I = dyn_cast<Instruction>(U)) {
        if (AliasScope && I->mayReadOrWriteMemory()) {
          MDNode *AS = I->getMetadata(LLVMContext::MD_alias_scope);
          AS = (AS ? MDNode::getMostGenericAliasScope(AS, AliasScope)
                   : AliasScope);
          I->setMetadata(LLVMContext::MD_alias_scope, AS);

          MDNode *NA = I->getMetadata(LLVMContext::MD_noalias);
          NA = (NA ? MDNode::intersect(NA, NoAlias) : NoAlias);
          I->setMetadata(LLVMContext::MD_noalias, NA);
        }
      }

      if (auto *LI = dyn_cast<LoadInst>(U)) {
        LI->setAlignment(std::max(A, LI->getAlign()));
        continue;
      }
      if (auto *SI = dyn_cast<StoreInst>(U)) {
        if (SI->getPointerOperand() == Ptr)
          SI->setAlignment(std::max(A, SI->getAlign()));
        continue;
      }
      if (auto *AI = dyn_cast<AtomicRMWInst>(U)) {
        // No atomicrmw operation currently works on pointers, but check the
        // pointer operand anyway in case that changes or we end up processing
        // a ConstantExpr here.
        if (AI->getPointerOperand() == Ptr)
          AI->setAlignment(std::max(A, AI->getAlign()));
        continue;
      }
      if (auto *AI = dyn_cast<AtomicCmpXchgInst>(U)) {
        if (AI->getPointerOperand() == Ptr)
          AI->setAlignment(std::max(A, AI->getAlign()));
        continue;
      }
      if (auto *GEP = dyn_cast<GetElementPtrInst>(U)) {
        unsigned BitWidth = DL.getIndexTypeSizeInBits(GEP->getType());
        APInt Off(BitWidth, 0);
        if (GEP->getPointerOperand() == Ptr) {
          Align GA;
          if (GEP->accumulateConstantOffset(DL, Off))
            GA = commonAlignment(A, Off.getLimitedValue());
          refineUsesAlignmentAndAA(GEP, GA, DL, AliasScope, NoAlias,
                                   MaxDepth - 1);
        }
        continue;
      }
      if (auto *I = dyn_cast<Instruction>(U)) {
        if (I->getOpcode() == Instruction::BitCast ||
            I->getOpcode() == Instruction::AddrSpaceCast)
          refineUsesAlignmentAndAA(I, A, DL, AliasScope, NoAlias, MaxDepth - 1);
      }
    }
  }
};

} // namespace
char AMDGPULowerModuleLDS::ID = 0;

char &llvm::AMDGPULowerModuleLDSID = AMDGPULowerModuleLDS::ID;

INITIALIZE_PASS(AMDGPULowerModuleLDS, DEBUG_TYPE,
                "Lower uses of LDS variables from non-kernel functions", false,
                false)

ModulePass *llvm::createAMDGPULowerModuleLDSPass() {
  return new AMDGPULowerModuleLDS();
}

PreservedAnalyses AMDGPULowerModuleLDSPass::run(Module &M,
                                                ModuleAnalysisManager &) {
  return AMDGPULowerModuleLDS().runOnModule(M) ? PreservedAnalyses::none()
                                               : PreservedAnalyses::all();
}
479