1 //===-- AMDGPULowerModuleLDSPass.cpp ------------------------------*- C++ -*-=//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass eliminates local data store, LDS, uses from non-kernel functions.
10 // LDS is contiguous memory allocated per kernel execution.
11 //
12 // Background.
13 //
14 // The programming model is global variables, or equivalently function local
15 // static variables, accessible from kernels or other functions. For uses from
16 // kernels this is straightforward - record on the kernel the amount of
17 // memory required by all the variables combined and allocate them within it.
18 // For uses from functions there are performance tradeoffs to choose between.
19 //
20 // This model means the GPU runtime can specify the amount of memory allocated.
21 // If this is more than the kernel assumed, the excess can be made available
22 // using a language specific feature, which IR represents as a variable with
23 // no initializer. This feature is not yet implemented for non-kernel functions.
24 // This lowering could be extended to handle that use case, but would probably
25 // require closer integration with promoteAllocaToLDS.
26 //
27 // Consequences of this GPU feature:
28 // - memory is limited and exceeding it halts compilation
29 // - a global accessed by one kernel exists independent of other kernels
30 // - a global exists independent of simultaneous execution of the same kernel
31 // - the address of the global may differ between kernels, as the instances
32 //   do not alias, which permits allocating only the variables each one uses
33 // - if the address is allowed to differ, functions need help to find it
34 //
35 // Uses from kernels are implemented here by grouping them in a per-kernel
36 // struct instance. This duplicates the variables, accurately modelling their
37 // aliasing properties relative to a single global representation. It also
38 // permits control over alignment via padding.
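//
// As a rough illustration (names follow the scheme used below, but the exact
// layout and padding are decided per kernel), two LDS variables used by a
// kernel `k`:
//   @a = internal addrspace(3) global i32 undef
//   @b = internal addrspace(3) global [4 x float] undef
// are replaced within that kernel by a single struct instance:
//   %llvm.amdgcn.kernel.k.lds.t = type { [4 x float], i32 }
//   @llvm.amdgcn.kernel.k.lds = internal addrspace(3) global
//                               %llvm.amdgcn.kernel.k.lds.t undef, align 16
// and uses from the kernel are rewritten to constantexpr GEPs into it.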
39 //
40 // Uses from functions are more complicated and the primary purpose of this
41 // IR pass. Several different lowerings are chosen between to meet the twin
42 // requirements of avoiding LDS allocation where it is not necessary, as that
43 // impacts occupancy and may fail the compilation, and of not imposing overhead
44 // on a feature whose primary advantage over global memory is performance. The
45 // basic design goal is to avoid one kernel imposing overhead on another.
46 //
47 // Implementation.
48 //
49 // LDS variables with constant annotation or non-undef initializer are passed
50 // through unchanged for simplification or error diagnostics in later passes.
51 // Non-undef initializers are not yet implemented for LDS.
52 //
53 // LDS variables that are always allocated at the same address can be found
54 // by lookup at that address. Otherwise runtime information/cost is required.
55 //
56 // The simplest strategy possible is to group all LDS variables in a single
57 // struct and allocate that struct in every kernel such that the original
58 // variables are always at the same address. LDS is however a limited resource
59 // so this strategy is unusable in practice. It is not implemented here.
60 //
61 // Strategy | Precise allocation | Zero runtime cost | General purpose |
62 //  --------+--------------------+-------------------+-----------------+
63 //   Module |                 No |               Yes |             Yes |
64 //    Table |                Yes |                No |             Yes |
65 //   Kernel |                Yes |               Yes |              No |
66 //   Hybrid |                Yes |           Partial |             Yes |
67 //
68 // Module spends LDS memory to save cycles. Table spends cycles and global
69 // memory to save LDS. Kernel is as fast as kernel allocation but only works
70 // for variables that are known reachable from a single kernel. Hybrid picks
71 // between all three. When forced to choose between LDS and cycles it minimises
72 // LDS use.
73 //
74 // The "module" lowering implemented here finds LDS variables which are used by
75 // non-kernel functions and creates a new struct with a field for each of those
76 // LDS variables. Variables that are only used from kernels are excluded.
77 // Kernels that do not use this struct are annotated with the attribute
78 // amdgpu-elide-module-lds which allows the back end to elide the allocation.
79 //
80 // The "table" lowering implemented here has three components.
81 // First kernels are assigned a unique integer identifier which is available in
82 // functions it calls through the intrinsic amdgcn_lds_kernel_id. The integer
83 // is passed through a specific SGPR, thus works with indirect calls.
84 // Second, each kernel allocates LDS variables independent of other kernels and
85 // writes the addresses it chose for each variable into an array in consistent
86 // order. If the kernel does not allocate a given variable, it writes poison to
87 // the corresponding array location. These arrays are written to a constant
88 // table in the order matching the kernel unique integer identifier.
89 // Third, uses from non-kernel functions are replaced with a table lookup using
90 // the intrinsic function to find the address of the variable.
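//
// As a rough sketch (N kernels, M table variables; names are illustrative of
// the scheme used below rather than exact output), a use of an LDS variable
// @v from a non-kernel function becomes approximately:
//   %id     = call i32 @llvm.amdgcn.lds.kernel.id()
//   %slot   = getelementptr inbounds [N x [M x i32]], ptr addrspace(4)
//               @llvm.amdgcn.lds.offset.table, i32 0, i32 %id, i32 <idx of @v>
//   %offset = load i32, ptr addrspace(4) %slot
//   %ptr    = inttoptr i32 %offset to ptr addrspace(3)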
91 //
92 // "Kernel" lowering is only applicable for variables that are unambiguously
93 // reachable from exactly one kernel. For those cases, accesses to the variable
94 // can be lowered to ConstantExpr address of a struct instance specific to that
95 // one kernel. This is zero cost in space and in compute. It will raise a fatal
96 // error on any variable that might be reachable from multiple kernels and is
97 // thus most easily used as part of the hybrid lowering strategy.
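//
// For example (illustrative only), a variable reachable solely from kernel `k`
// is rewritten directly to a constant such as:
//   getelementptr inbounds (%llvm.amdgcn.kernel.k.lds.t,
//       ptr addrspace(3) @llvm.amdgcn.kernel.k.lds, i32 0, i32 <field>)
// with no runtime lookup involved.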
98 //
99 // Hybrid lowering is a mixture of the above. It uses the zero cost kernel
100 // lowering where it can. It lowers the variable accessed by the greatest
101 // number of kernels using the module strategy as that is free for the first
102 // variable. Any further variables that can be lowered with the module
103 // strategy without incurring extra LDS memory overhead are also lowered that
104 // way. The remaining ones are lowered via table.
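//
// A small worked example, purely illustrative: kernels k0..k3 and variables
// v0..v3, where v0 is reachable from {k0,k1,k2}, v1 from {k0,k1}, v2 from
// {k2,k3} and v3 from {k3} only. Hybrid chooses v0 as the module struct root
// since it is reachable from the most kernels, also lowers v1 via the module
// struct because its kernels are a subset of v0's and so no extra LDS is
// allocated, lowers v3 with the kernel strategy, and lowers v2 via the table.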
105 //
106 // Consequences
107 // - No heuristics or user controlled magic numbers, hybrid is the right choice
108 // - Kernels that don't use functions (or have had them all inlined) are not
109 //   affected by any lowering for kernels that do.
110 // - Kernels that don't make indirect function calls are not affected by those
111 //   that do.
112 // - Variables which are used by lots of kernels, e.g. those injected by a
113 //   language runtime in most kernels, are expected to have no overhead
114 // - Implementations that instantiate templates per-kernel where those templates
115 //   use LDS are expected to hit the "Kernel" lowering strategy
116 // - The runtime properties impose a cost in compiler implementation complexity
117 //
118 //===----------------------------------------------------------------------===//
119 
120 #include "AMDGPU.h"
121 #include "Utils/AMDGPUBaseInfo.h"
122 #include "Utils/AMDGPUMemoryUtils.h"
123 #include "llvm/ADT/BitVector.h"
124 #include "llvm/ADT/DenseMap.h"
125 #include "llvm/ADT/DenseSet.h"
126 #include "llvm/ADT/STLExtras.h"
127 #include "llvm/ADT/SetOperations.h"
128 #include "llvm/ADT/SetVector.h"
129 #include "llvm/Analysis/CallGraph.h"
130 #include "llvm/IR/Constants.h"
131 #include "llvm/IR/DerivedTypes.h"
132 #include "llvm/IR/IRBuilder.h"
133 #include "llvm/IR/InlineAsm.h"
134 #include "llvm/IR/Instructions.h"
135 #include "llvm/IR/IntrinsicsAMDGPU.h"
136 #include "llvm/IR/MDBuilder.h"
137 #include "llvm/IR/ReplaceConstant.h"
138 #include "llvm/InitializePasses.h"
139 #include "llvm/Pass.h"
140 #include "llvm/Support/CommandLine.h"
141 #include "llvm/Support/Debug.h"
142 #include "llvm/Support/OptimizedStructLayout.h"
143 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
144 #include "llvm/Transforms/Utils/ModuleUtils.h"
145 
146 #include <tuple>
147 #include <vector>
148 
149 #include <cstdio>
150 
151 #define DEBUG_TYPE "amdgpu-lower-module-lds"
152 
153 using namespace llvm;
154 
155 namespace {
156 
157 cl::opt<bool> SuperAlignLDSGlobals(
158     "amdgpu-super-align-lds-globals",
159     cl::desc("Increase alignment of LDS if it is not on align boundary"),
160     cl::init(true), cl::Hidden);
161 
162 enum class LoweringKind { module, table, kernel, hybrid };
163 cl::opt<LoweringKind> LoweringKindLoc(
164     "amdgpu-lower-module-lds-strategy",
165     cl::desc("Specify lowering strategy for function LDS access:"), cl::Hidden,
166     cl::init(LoweringKind::hybrid),
167     cl::values(
168         clEnumValN(LoweringKind::table, "table", "Lower via table lookup"),
169         clEnumValN(LoweringKind::module, "module", "Lower via module struct"),
170         clEnumValN(
171             LoweringKind::kernel, "kernel",
172             "Lower variables reachable from one kernel, otherwise abort"),
173         clEnumValN(LoweringKind::hybrid, "hybrid",
174                    "Lower via mixture of above strategies")));
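
// The strategy defaults to hybrid; the flag above can force a specific one
// when debugging, e.g. -amdgpu-lower-module-lds-strategy=table (passed
// directly to llc/opt, or via -mllvm from a clang invocation).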
175 
176 bool isKernelLDS(const Function *F) {
177   // Some weirdness here. AMDGPU::isKernelCC does not call into
178   // AMDGPU::isKernel with the calling conv; instead it calls into
179   // isModuleEntryFunction which returns true for more calling conventions
180   // than AMDGPU::isKernel does. There's a FIXME on AMDGPU::isKernel.
181   // There's also a test that checks that the LDS lowering does not hit on
182   // a graphics shader, denoted amdgpu_ps, so stay with the limited case.
183   // Putting LDS in the name of the function to draw attention to this.
184   return AMDGPU::isKernel(F->getCallingConv());
185 }
186 
187 class AMDGPULowerModuleLDS : public ModulePass {
188 
189   static void
190   removeLocalVarsFromUsedLists(Module &M,
191                                const DenseSet<GlobalVariable *> &LocalVars) {
192     // The verifier rejects used lists containing an inttoptr of a constant
193     // so remove the variables from these lists before replaceAllUsesWith
194     SmallPtrSet<Constant *, 8> LocalVarsSet;
195     for (GlobalVariable *LocalVar : LocalVars)
196       LocalVarsSet.insert(cast<Constant>(LocalVar->stripPointerCasts()));
197 
198     removeFromUsedLists(
199         M, [&LocalVarsSet](Constant *C) { return LocalVarsSet.count(C); });
200 
201     for (GlobalVariable *LocalVar : LocalVars)
202       LocalVar->removeDeadConstantUsers();
203   }
204 
205   static void markUsedByKernel(IRBuilder<> &Builder, Function *Func,
206                                GlobalVariable *SGV) {
207     // The llvm.amdgcn.module.lds instance is implicitly used by all kernels
208     // that might call a function which accesses a field within it. This is
209     // presently approximated to 'all kernels' if there are any such functions
210     // in the module. This implicit use is redefined as an explicit use here so
211     // that later passes, specifically PromoteAlloca, account for the required
212     // memory without any knowledge of this transform.
213 
214     // An operand bundle on llvm.donothing works because the call instruction
215     // survives until after the last pass that needs to account for LDS. It is
216     // better than inline asm as the latter survives until the end of codegen. A
217     // totally robust solution would be a function with the same semantics as
218     // llvm.donothing that takes a pointer to the instance and is lowered to a
219     // no-op after LDS is allocated, but that is not presently necessary.
220 
221     LLVMContext &Ctx = Func->getContext();
222 
223     Builder.SetInsertPoint(Func->getEntryBlock().getFirstNonPHI());
224 
225     FunctionType *FTy = FunctionType::get(Type::getVoidTy(Ctx), {});
226 
227     Function *Decl =
228         Intrinsic::getDeclaration(Func->getParent(), Intrinsic::donothing, {});
229 
230     Value *UseInstance[1] = {Builder.CreateInBoundsGEP(
231         SGV->getValueType(), SGV, ConstantInt::get(Type::getInt32Ty(Ctx), 0))};
232 
233     Builder.CreateCall(FTy, Decl, {},
234                        {OperandBundleDefT<Value *>("ExplicitUse", UseInstance)},
235                        "");
236   }
237 
238   static bool eliminateConstantExprUsesOfLDSFromAllInstructions(Module &M) {
239     // Constants are uniqued within LLVM. A ConstantExpr referring to a LDS
240     // global may have uses from multiple different functions as a result.
241     // This pass specialises LDS variables with respect to the kernel that
242     // allocates them.
243 
244     // This is semantically equivalent to:
245     // for (auto &F : M.functions())
246     //   for (auto &BB : F)
247     //     for (auto &I : BB)
248     //       for (Use &Op : I.operands())
249     //         if (constantExprUsesLDS(Op))
250     //           replaceConstantExprInFunction(I, Op);
251 
252     SmallVector<Constant *> LDSGlobals;
253     for (auto &GV : M.globals())
254       if (AMDGPU::isLDSVariableToLower(GV))
255         LDSGlobals.push_back(&GV);
256 
257     return convertUsersOfConstantsToInstructions(LDSGlobals);
258   }
259 
260 public:
261   static char ID;
262 
263   AMDGPULowerModuleLDS() : ModulePass(ID) {
264     initializeAMDGPULowerModuleLDSPass(*PassRegistry::getPassRegistry());
265   }
266 
267   using FunctionVariableMap = DenseMap<Function *, DenseSet<GlobalVariable *>>;
268 
269   using VariableFunctionMap = DenseMap<GlobalVariable *, DenseSet<Function *>>;
270 
271   static void getUsesOfLDSByFunction(CallGraph const &CG, Module &M,
272                                      FunctionVariableMap &kernels,
273                                      FunctionVariableMap &functions) {
274 
275     // Get uses from the current function, excluding uses by called functions
276     // Two output variables to avoid walking the globals list twice
277     for (auto &GV : M.globals()) {
278       if (!AMDGPU::isLDSVariableToLower(GV)) {
279         continue;
280       }
281 
282       SmallVector<User *, 16> Stack(GV.users());
283       for (User *V : GV.users()) {
284         if (auto *I = dyn_cast<Instruction>(V)) {
285           Function *F = I->getFunction();
286           if (isKernelLDS(F)) {
287             kernels[F].insert(&GV);
288           } else {
289             functions[F].insert(&GV);
290           }
291         }
292       }
293     }
294   }
295 
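  // For each kernel: the LDS variables accessed directly by instructions in
  // that kernel (direct_access) and those reachable through its callees
  // (indirect_access).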
296   struct LDSUsesInfoTy {
297     FunctionVariableMap direct_access;
298     FunctionVariableMap indirect_access;
299   };
300 
301   static LDSUsesInfoTy getTransitiveUsesOfLDS(CallGraph const &CG, Module &M) {
302 
303     FunctionVariableMap direct_map_kernel;
304     FunctionVariableMap direct_map_function;
305     getUsesOfLDSByFunction(CG, M, direct_map_kernel, direct_map_function);
306 
307     // Collect variables that are used by functions whose address has escaped
308     DenseSet<GlobalVariable *> VariablesReachableThroughFunctionPointer;
309     for (Function &F : M.functions()) {
310       if (!isKernelLDS(&F) &&
311           F.hasAddressTaken(nullptr,
312                             /* IgnoreCallbackUses */ false,
313                             /* IgnoreAssumeLikeCalls */ false,
314                             /* IgnoreLLVMUsed */ true,
315                             /* IgnoreArcAttachedCall */ false)) {
316         set_union(VariablesReachableThroughFunctionPointer,
317                   direct_map_function[&F]);
318       }
319     }
320 
321     auto functionMakesUnknownCall = [&](const Function *F) -> bool {
322       assert(!F->isDeclaration());
323       for (CallGraphNode::CallRecord R : *CG[F]) {
324         if (!R.second->getFunction()) {
325           return true;
326         }
327       }
328       return false;
329     };
330 
331     // Work out which variables are reachable through function calls
332     FunctionVariableMap transitive_map_function = direct_map_function;
333 
334     // If the function makes any unknown call, assume the worst case that it can
335     // access all variables accessed by functions whose address escaped
336     for (Function &F : M.functions()) {
337       if (!F.isDeclaration() && functionMakesUnknownCall(&F)) {
338         if (!isKernelLDS(&F)) {
339           set_union(transitive_map_function[&F],
340                     VariablesReachableThroughFunctionPointer);
341         }
342       }
343     }
344 
345     // Direct implementation of collecting all variables reachable from each
346     // function
347     for (Function &Func : M.functions()) {
348       if (Func.isDeclaration() || isKernelLDS(&Func))
349         continue;
350 
351       DenseSet<Function *> seen; // catches cycles
352       SmallVector<Function *, 4> wip{&Func};
353 
354       while (!wip.empty()) {
355         Function *F = wip.pop_back_val();
356 
357         // Can accelerate this by referring to transitive map for functions that
358         // have already been computed, with more care than this
359         set_union(transitive_map_function[&Func], direct_map_function[F]);
360 
361         for (CallGraphNode::CallRecord R : *CG[F]) {
362           Function *ith = R.second->getFunction();
363           if (ith) {
364             if (!seen.contains(ith)) {
365               seen.insert(ith);
366               wip.push_back(ith);
367             }
368           }
369         }
370       }
371     }
372 
373     // direct_map_kernel lists which variables are used by the kernel
374     // find the variables which are used through a function call
375     FunctionVariableMap indirect_map_kernel;
376 
377     for (Function &Func : M.functions()) {
378       if (Func.isDeclaration() || !isKernelLDS(&Func))
379         continue;
380 
381       for (CallGraphNode::CallRecord R : *CG[&Func]) {
382         Function *ith = R.second->getFunction();
383         if (ith) {
384           set_union(indirect_map_kernel[&Func], transitive_map_function[ith]);
385         } else {
386           set_union(indirect_map_kernel[&Func],
387                     VariablesReachableThroughFunctionPointer);
388         }
389       }
390     }
391 
392     return {std::move(direct_map_kernel), std::move(indirect_map_kernel)};
393   }
394 
395   struct LDSVariableReplacement {
396     GlobalVariable *SGV = nullptr;
397     DenseMap<GlobalVariable *, Constant *> LDSVarsToConstantGEP;
398   };
399 
400   // LDSVariableReplacement maps each LDS global to a constantexpr GEP of the
401   // location it has been moved to within one kernel's struct instance.
402   // The lookup table built below records, for each kernel, where that kernel
403   // placed each of the variables lowered via the table.
404 
405   static Constant *getAddressesOfVariablesInKernel(
406       LLVMContext &Ctx, ArrayRef<GlobalVariable *> Variables,
407       DenseMap<GlobalVariable *, Constant *> &LDSVarsToConstantGEP) {
408     // Create a ConstantArray containing the address of each Variable within the
409     // kernel corresponding to LDSVarsToConstantGEP, or poison if that kernel
410     // does not allocate it
411     // TODO: Drop the ptrtoint conversion
412 
413     Type *I32 = Type::getInt32Ty(Ctx);
414 
415     ArrayType *KernelOffsetsType = ArrayType::get(I32, Variables.size());
416 
417     SmallVector<Constant *> Elements;
418     for (size_t i = 0; i < Variables.size(); i++) {
419       GlobalVariable *GV = Variables[i];
420       if (LDSVarsToConstantGEP.count(GV) != 0) {
421         auto elt = ConstantExpr::getPtrToInt(LDSVarsToConstantGEP[GV], I32);
422         Elements.push_back(elt);
423       } else {
424         Elements.push_back(PoisonValue::get(I32));
425       }
426     }
427     return ConstantArray::get(KernelOffsetsType, Elements);
428   }
429 
430   static GlobalVariable *buildLookupTable(
431       Module &M, ArrayRef<GlobalVariable *> Variables,
432       ArrayRef<Function *> kernels,
433       DenseMap<Function *, LDSVariableReplacement> &KernelToReplacement) {
434     if (Variables.empty()) {
435       return nullptr;
436     }
437     LLVMContext &Ctx = M.getContext();
438 
439     const size_t NumberVariables = Variables.size();
440     const size_t NumberKernels = kernels.size();
441 
442     ArrayType *KernelOffsetsType =
443         ArrayType::get(Type::getInt32Ty(Ctx), NumberVariables);
444 
445     ArrayType *AllKernelsOffsetsType =
446         ArrayType::get(KernelOffsetsType, NumberKernels);
447 
448     std::vector<Constant *> overallConstantExprElts(NumberKernels);
449     for (size_t i = 0; i < NumberKernels; i++) {
450       LDSVariableReplacement Replacement = KernelToReplacement[kernels[i]];
451       overallConstantExprElts[i] = getAddressesOfVariablesInKernel(
452           Ctx, Variables, Replacement.LDSVarsToConstantGEP);
453     }
454 
455     Constant *init =
456         ConstantArray::get(AllKernelsOffsetsType, overallConstantExprElts);
457 
458     return new GlobalVariable(
459         M, AllKernelsOffsetsType, true, GlobalValue::InternalLinkage, init,
460         "llvm.amdgcn.lds.offset.table", nullptr, GlobalValue::NotThreadLocal,
461         AMDGPUAS::CONSTANT_ADDRESS);
462   }
463 
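  // Replace every instruction use of the given variables with an address
  // looked up in LookupTable, indexed by the enclosing function's kernel id
  // and the variable's position in TableLookupVariables.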
464   void replaceUsesInInstructionsWithTableLookup(
465       Module &M, ArrayRef<GlobalVariable *> TableLookupVariables,
466       GlobalVariable *LookupTable) {
467 
468     LLVMContext &Ctx = M.getContext();
469     IRBuilder<> Builder(Ctx);
470     Type *I32 = Type::getInt32Ty(Ctx);
471 
472     // Accesses from a function use the amdgcn_lds_kernel_id intrinsic which
473     // lowers to a read from a live in register. Emit it once in the entry
474     // block to spare deduplicating it later.
475 
476     DenseMap<Function *, Value *> tableKernelIndexCache;
477     auto getTableKernelIndex = [&](Function *F) -> Value * {
478       if (tableKernelIndexCache.count(F) == 0) {
479         LLVMContext &Ctx = M.getContext();
480         FunctionType *FTy = FunctionType::get(Type::getInt32Ty(Ctx), {});
481         Function *Decl =
482             Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_lds_kernel_id, {});
483 
484         BasicBlock::iterator it =
485             F->getEntryBlock().getFirstNonPHIOrDbgOrAlloca();
486         Instruction &i = *it;
487         Builder.SetInsertPoint(&i);
488 
489         tableKernelIndexCache[F] = Builder.CreateCall(FTy, Decl, {});
490       }
491 
492       return tableKernelIndexCache[F];
493     };
494 
495     for (size_t Index = 0; Index < TableLookupVariables.size(); Index++) {
496       auto *GV = TableLookupVariables[Index];
497 
498       for (Use &U : make_early_inc_range(GV->uses())) {
499         auto *I = dyn_cast<Instruction>(U.getUser());
500         if (!I)
501           continue;
502 
503         Value *tableKernelIndex = getTableKernelIndex(I->getFunction());
504 
505         // The replacement of a PHI operand cannot be inserted next to the
506         // PHI itself; materialise it in the corresponding incoming block.
507         if (auto *Phi = dyn_cast<PHINode>(I)) {
508           BasicBlock *BB = Phi->getIncomingBlock(U);
509           Builder.SetInsertPoint(&(*(BB->getFirstInsertionPt())));
510         } else {
511           Builder.SetInsertPoint(I);
512         }
513 
514         Value *GEPIdx[3] = {
515             ConstantInt::get(I32, 0),
516             tableKernelIndex,
517             ConstantInt::get(I32, Index),
518         };
519 
520         Value *Address = Builder.CreateInBoundsGEP(
521             LookupTable->getValueType(), LookupTable, GEPIdx, GV->getName());
522 
523         Value *loaded = Builder.CreateLoad(I32, Address);
524 
525         Value *replacement =
526             Builder.CreateIntToPtr(loaded, GV->getType(), GV->getName());
527 
528         U.set(replacement);
529       }
530     }
531   }
532 
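  // Return the set of kernels that, possibly through a chain of calls, access
  // at least one of the variables in VariableSet.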
533   static DenseSet<Function *> kernelsThatIndirectlyAccessAnyOfPassedVariables(
534       Module &M, LDSUsesInfoTy &LDSUsesInfo,
535       DenseSet<GlobalVariable *> const &VariableSet) {
536 
537     DenseSet<Function *> KernelSet;
538 
539     if (VariableSet.empty()) return KernelSet;
540 
541     for (Function &Func : M.functions()) {
542       if (Func.isDeclaration() || !isKernelLDS(&Func))
543         continue;
544       for (GlobalVariable *GV : LDSUsesInfo.indirect_access[&Func]) {
545         if (VariableSet.contains(GV)) {
546           KernelSet.insert(&Func);
547           break;
548         }
549       }
550     }
551 
552     return KernelSet;
553   }
554 
555   static GlobalVariable *
556   chooseBestVariableForModuleStrategy(const DataLayout &DL,
557                                       VariableFunctionMap &LDSVars) {
558     // Find the global variable with the most indirect uses from kernels
559 
560     struct CandidateTy {
561       GlobalVariable *GV = nullptr;
562       size_t UserCount = 0;
563       size_t Size = 0;
564 
565       CandidateTy() = default;
566 
567       CandidateTy(GlobalVariable *GV, uint64_t UserCount, uint64_t AllocSize)
568           : GV(GV), UserCount(UserCount), Size(AllocSize) {}
569 
570       bool operator<(const CandidateTy &Other) const {
571         // Fewer users makes module scope variable less attractive
572         if (UserCount < Other.UserCount) {
573           return true;
574         }
575         if (UserCount > Other.UserCount) {
576           return false;
577         }
578 
579         // Bigger makes module scope variable less attractive
580         if (Size < Other.Size) {
581           return false;
582         }
583 
584         if (Size > Other.Size) {
585           return true;
586         }
587 
588         // Arbitrary but consistent
589         return GV->getName() < Other.GV->getName();
590       }
591     };
592 
593     CandidateTy MostUsed;
594 
595     for (auto &K : LDSVars) {
596       GlobalVariable *GV = K.first;
597       if (K.second.size() <= 1) {
598         // A variable reachable by only one kernel is best lowered with kernel
599         // strategy
600         continue;
601       }
602       CandidateTy Candidate(GV, K.second.size(),
603                       DL.getTypeAllocSize(GV->getValueType()).getFixedValue());
604       if (MostUsed < Candidate)
605         MostUsed = Candidate;
606     }
607 
608     return MostUsed.GV;
609   }
610 
611   bool runOnModule(Module &M) override {
612     LLVMContext &Ctx = M.getContext();
613     CallGraph CG = CallGraph(M);
614     bool Changed = superAlignLDSGlobals(M);
615 
616     Changed |= eliminateConstantExprUsesOfLDSFromAllInstructions(M);
617 
618     Changed = true; // todo: narrow this down
619 
620     // For each kernel, what variables does it access directly or through
621     // callees
622     LDSUsesInfoTy LDSUsesInfo = getTransitiveUsesOfLDS(CG, M);
623 
624     // For each variable accessed through callees, which kernels access it
625     VariableFunctionMap LDSToKernelsThatNeedToAccessItIndirectly;
626     for (auto &K : LDSUsesInfo.indirect_access) {
627       Function *F = K.first;
628       assert(isKernelLDS(F));
629       for (GlobalVariable *GV : K.second) {
630         LDSToKernelsThatNeedToAccessItIndirectly[GV].insert(F);
631       }
632     }
633 
634     // Partition variables into the different strategies
635     DenseSet<GlobalVariable *> ModuleScopeVariables;
636     DenseSet<GlobalVariable *> TableLookupVariables;
637     DenseSet<GlobalVariable *> KernelAccessVariables;
638 
639     {
640       GlobalVariable *HybridModuleRoot =
641           LoweringKindLoc != LoweringKind::hybrid
642               ? nullptr
643               : chooseBestVariableForModuleStrategy(
644                     M.getDataLayout(),
645                     LDSToKernelsThatNeedToAccessItIndirectly);
646 
647       DenseSet<Function *> const EmptySet;
648       DenseSet<Function *> const &HybridModuleRootKernels =
649           HybridModuleRoot
650               ? LDSToKernelsThatNeedToAccessItIndirectly[HybridModuleRoot]
651               : EmptySet;
652 
653       for (auto &K : LDSToKernelsThatNeedToAccessItIndirectly) {
654         // Each iteration of this loop assigns exactly one global variable to
655         // exactly one of the implementation strategies.
656 
657         GlobalVariable *GV = K.first;
658         assert(AMDGPU::isLDSVariableToLower(*GV));
659         assert(K.second.size() != 0);
660 
661         switch (LoweringKindLoc) {
662         case LoweringKind::module:
663           ModuleScopeVariables.insert(GV);
664           break;
665 
666         case LoweringKind::table:
667           TableLookupVariables.insert(GV);
668           break;
669 
670         case LoweringKind::kernel:
671           if (K.second.size() == 1) {
672             KernelAccessVariables.insert(GV);
673           } else {
674             report_fatal_error(
675                 "cannot lower LDS '" + GV->getName() +
676                 "' to kernel access as it is reachable from multiple kernels");
677           }
678           break;
679 
680         case LoweringKind::hybrid: {
681           if (GV == HybridModuleRoot) {
682             assert(K.second.size() != 1);
683             ModuleScopeVariables.insert(GV);
684           } else if (K.second.size() == 1) {
685             KernelAccessVariables.insert(GV);
686           } else if (set_is_subset(K.second, HybridModuleRootKernels)) {
687             ModuleScopeVariables.insert(GV);
688           } else {
689             TableLookupVariables.insert(GV);
690           }
691           break;
692         }
693         }
694       }
695 
696       assert(ModuleScopeVariables.size() + TableLookupVariables.size() +
697                  KernelAccessVariables.size() ==
698              LDSToKernelsThatNeedToAccessItIndirectly.size());
699     } // Variables have now been partitioned into the three lowering strategies.
700 
701     // If the kernel accesses a variable that is going to be stored in the
702     // module instance through a call then that kernel needs to allocate the
703     // module instance
704     DenseSet<Function *> KernelsThatAllocateModuleLDS =
705         kernelsThatIndirectlyAccessAnyOfPassedVariables(M, LDSUsesInfo,
706                                                         ModuleScopeVariables);
707     DenseSet<Function *> KernelsThatAllocateTableLDS =
708         kernelsThatIndirectlyAccessAnyOfPassedVariables(M, LDSUsesInfo,
709                                                         TableLookupVariables);
710 
711     if (!ModuleScopeVariables.empty()) {
712       LDSVariableReplacement ModuleScopeReplacement =
713           createLDSVariableReplacement(M, "llvm.amdgcn.module.lds",
714                                        ModuleScopeVariables);
715 
716       appendToCompilerUsed(M,
717                            {static_cast<GlobalValue *>(
718                                ConstantExpr::getPointerBitCastOrAddrSpaceCast(
719                                    cast<Constant>(ModuleScopeReplacement.SGV),
720                                    Type::getInt8PtrTy(Ctx)))});
721 
722       // Removing from the used lists here is historic; it preserves existing codegen.
723       removeLocalVarsFromUsedLists(M, ModuleScopeVariables);
724 
725       // Replace all uses of module scope variable from non-kernel functions
726       replaceLDSVariablesWithStruct(
727           M, ModuleScopeVariables, ModuleScopeReplacement, [&](Use &U) {
728             Instruction *I = dyn_cast<Instruction>(U.getUser());
729             if (!I) {
730               return false;
731             }
732             Function *F = I->getFunction();
733             return !isKernelLDS(F);
734           });
735 
736       // Replace uses of module scope variable from kernel functions that
737       // allocate the module scope variable, otherwise leave them unchanged
738       // Record on each kernel whether the module scope global is used by it
739 
740       LLVMContext &Ctx = M.getContext();
741       IRBuilder<> Builder(Ctx);
742 
743       for (Function &Func : M.functions()) {
744         if (Func.isDeclaration() || !isKernelLDS(&Func))
745           continue;
746 
747         if (KernelsThatAllocateModuleLDS.contains(&Func)) {
748           replaceLDSVariablesWithStruct(
749               M, ModuleScopeVariables, ModuleScopeReplacement, [&](Use &U) {
750                 Instruction *I = dyn_cast<Instruction>(U.getUser());
751                 if (!I) {
752                   return false;
753                 }
754                 Function *F = I->getFunction();
755                 return F == &Func;
756               });
757 
758           markUsedByKernel(Builder, &Func, ModuleScopeReplacement.SGV);
759 
760         } else {
761           Func.addFnAttr("amdgpu-elide-module-lds");
762         }
763       }
764     }
765 
766     // Create a struct for each kernel for the non-module-scope variables
767     DenseMap<Function *, LDSVariableReplacement> KernelToReplacement;
768     for (Function &Func : M.functions()) {
769       if (Func.isDeclaration() || !isKernelLDS(&Func))
770         continue;
771 
772       DenseSet<GlobalVariable *> KernelUsedVariables;
773       for (auto &v : LDSUsesInfo.direct_access[&Func]) {
774         KernelUsedVariables.insert(v);
775       }
776       for (auto &v : LDSUsesInfo.indirect_access[&Func]) {
777         KernelUsedVariables.insert(v);
778       }
779 
780       // Variables allocated in module lds must all resolve to that struct,
781       // not to the per-kernel instance.
782       if (KernelsThatAllocateModuleLDS.contains(&Func)) {
783         for (GlobalVariable *v : ModuleScopeVariables) {
784           KernelUsedVariables.erase(v);
785         }
786       }
787 
788       if (KernelUsedVariables.empty()) {
789         // Either used no LDS, or all the LDS it used was also in module
790         continue;
791       }
792 
793       // The association between kernel function and LDS struct is done by
794       // symbol name, which only works if the function in question has a
795       // name. This is not expected to be a problem in practice as kernels
796       // are called by name making anonymous ones (which are named by the
797       // backend) difficult to use. This does mean that llvm test cases need
798       // to name the kernels.
799       if (!Func.hasName()) {
800         report_fatal_error("Anonymous kernels cannot use LDS variables");
801       }
802 
803       std::string VarName =
804           (Twine("llvm.amdgcn.kernel.") + Func.getName() + ".lds").str();
805 
806       auto Replacement =
807           createLDSVariableReplacement(M, VarName, KernelUsedVariables);
808 
809       // remove preserves existing codegen
810       removeLocalVarsFromUsedLists(M, KernelUsedVariables);
811       KernelToReplacement[&Func] = Replacement;
812 
813       // Rewrite uses within kernel to the new struct
814       replaceLDSVariablesWithStruct(
815           M, KernelUsedVariables, Replacement, [&Func](Use &U) {
816             Instruction *I = dyn_cast<Instruction>(U.getUser());
817             return I && I->getFunction() == &Func;
818           });
819     }
820 
821     // Lower zero cost accesses to the kernel instances just created
822     for (auto &GV : KernelAccessVariables) {
823       auto &funcs = LDSToKernelsThatNeedToAccessItIndirectly[GV];
824       assert(funcs.size() == 1); // Only one kernel can access it
825       LDSVariableReplacement Replacement =
826           KernelToReplacement[*(funcs.begin())];
827 
828       DenseSet<GlobalVariable *> Vec;
829       Vec.insert(GV);
830 
831       replaceLDSVariablesWithStruct(M, Vec, Replacement, [](Use &U) {
832         return isa<Instruction>(U.getUser());
833       });
834     }
835 
836     if (!KernelsThatAllocateTableLDS.empty()) {
837       // Collect the kernels that allocate table lookup LDS
838       std::vector<Function *> OrderedKernels;
839       {
840         for (Function &Func : M.functions()) {
841           if (Func.isDeclaration())
842             continue;
843           if (!isKernelLDS(&Func))
844             continue;
845 
846           if (KernelsThatAllocateTableLDS.contains(&Func)) {
847             assert(Func.hasName()); // else fatal error earlier
848             OrderedKernels.push_back(&Func);
849           }
850         }
851 
852         // Put them in an arbitrary but reproducible order
853         llvm::sort(OrderedKernels.begin(), OrderedKernels.end(),
854                    [](const Function *lhs, const Function *rhs) -> bool {
855                      return lhs->getName() < rhs->getName();
856                    });
857 
858         // Annotate the kernels with their order in this vector
859         LLVMContext &Ctx = M.getContext();
860         IRBuilder<> Builder(Ctx);
861 
862         if (OrderedKernels.size() > UINT32_MAX) {
863           // 32 bit keeps it in one SGPR. > 2**32 kernels won't fit on the GPU
864           report_fatal_error("Unimplemented LDS lowering for > 2**32 kernels");
865         }
866 
867         for (size_t i = 0; i < OrderedKernels.size(); i++) {
868           Metadata *AttrMDArgs[1] = {
869               ConstantAsMetadata::get(Builder.getInt32(i)),
870           };
871           OrderedKernels[i]->setMetadata("llvm.amdgcn.lds.kernel.id",
872                                          MDNode::get(Ctx, AttrMDArgs));
873 
874           markUsedByKernel(Builder, OrderedKernels[i],
875                            KernelToReplacement[OrderedKernels[i]].SGV);
876         }
877       }
878 
879       // The order must be consistent between lookup table and accesses to
880       // lookup table
881       std::vector<GlobalVariable *> TableLookupVariablesOrdered(
882           TableLookupVariables.begin(), TableLookupVariables.end());
883       llvm::sort(TableLookupVariablesOrdered.begin(),
884                  TableLookupVariablesOrdered.end(),
885                  [](const GlobalVariable *lhs, const GlobalVariable *rhs) {
886                    return lhs->getName() < rhs->getName();
887                  });
888 
889       GlobalVariable *LookupTable = buildLookupTable(
890           M, TableLookupVariablesOrdered, OrderedKernels, KernelToReplacement);
891       replaceUsesInInstructionsWithTableLookup(M, TableLookupVariablesOrdered,
892                                                LookupTable);
893     }
894 
895     for (auto &GV : make_early_inc_range(M.globals()))
896       if (AMDGPU::isLDSVariableToLower(GV)) {
897 
898         // probably want to remove from used lists
899         GV.removeDeadConstantUsers();
900         if (GV.use_empty())
901           GV.eraseFromParent();
902       }
903 
904     return Changed;
905   }
906 
907 private:
908   // Increase the alignment of LDS globals if necessary to maximise the chance
909   // that we can use aligned LDS instructions to access them.
910   static bool superAlignLDSGlobals(Module &M) {
911     const DataLayout &DL = M.getDataLayout();
912     bool Changed = false;
913     if (!SuperAlignLDSGlobals) {
914       return Changed;
915     }
916 
917     for (auto &GV : M.globals()) {
918       if (GV.getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) {
919         // Only changing alignment of LDS variables
920         continue;
921       }
922       if (!GV.hasInitializer()) {
923         // cuda/hip extern __shared__ variable, leave alignment alone
924         continue;
925       }
926 
927       Align Alignment = AMDGPU::getAlign(DL, &GV);
928       TypeSize GVSize = DL.getTypeAllocSize(GV.getValueType());
929 
930       if (GVSize > 8) {
931         // We might want to use a b96 or b128 load/store
932         Alignment = std::max(Alignment, Align(16));
933       } else if (GVSize > 4) {
934         // We might want to use a b64 load/store
935         Alignment = std::max(Alignment, Align(8));
936       } else if (GVSize > 2) {
937         // We might want to use a b32 load/store
938         Alignment = std::max(Alignment, Align(4));
939       } else if (GVSize > 1) {
940         // We might want to use a b16 load/store
941         Alignment = std::max(Alignment, Align(2));
942       }
943 
944       if (Alignment != AMDGPU::getAlign(DL, &GV)) {
945         Changed = true;
946         GV.setAlignment(Alignment);
947       }
948     }
949     return Changed;
950   }
951 
952   static LDSVariableReplacement createLDSVariableReplacement(
953       Module &M, std::string VarName,
954       DenseSet<GlobalVariable *> const &LDSVarsToTransform) {
955     // Create a struct instance containing LDSVarsToTransform and map from those
956     // variables to ConstantExprGEP
957     // Variables may be introduced to meet alignment requirements. No aliasing
958     // metadata is useful for these as they have no uses. Erased before return.
959 
960     LLVMContext &Ctx = M.getContext();
961     const DataLayout &DL = M.getDataLayout();
962     assert(!LDSVarsToTransform.empty());
963 
964     SmallVector<OptimizedStructLayoutField, 8> LayoutFields;
965     LayoutFields.reserve(LDSVarsToTransform.size());
966     {
967       // The order of fields in this struct depends on the order of
968       // variables in the argument, which varies when changing how they
969       // are identified, leading to spurious test breakage.
970       std::vector<GlobalVariable *> Sorted(LDSVarsToTransform.begin(),
971                                            LDSVarsToTransform.end());
972       llvm::sort(Sorted.begin(), Sorted.end(),
973                  [](const GlobalVariable *lhs, const GlobalVariable *rhs) {
974                    return lhs->getName() < rhs->getName();
975                  });
976       for (GlobalVariable *GV : Sorted) {
977         OptimizedStructLayoutField F(GV,
978                                      DL.getTypeAllocSize(GV->getValueType()),
979                                      AMDGPU::getAlign(DL, GV));
980         LayoutFields.emplace_back(F);
981       }
982     }
983 
984     performOptimizedStructLayout(LayoutFields);
985 
986     std::vector<GlobalVariable *> LocalVars;
987     BitVector IsPaddingField;
988     LocalVars.reserve(LDSVarsToTransform.size()); // will be at least this large
989     IsPaddingField.reserve(LDSVarsToTransform.size());
990     {
991       uint64_t CurrentOffset = 0;
992       for (size_t I = 0; I < LayoutFields.size(); I++) {
993         GlobalVariable *FGV = static_cast<GlobalVariable *>(
994             const_cast<void *>(LayoutFields[I].Id));
995         Align DataAlign = LayoutFields[I].Alignment;
996 
997         uint64_t DataAlignV = DataAlign.value();
998         if (uint64_t Rem = CurrentOffset % DataAlignV) {
999           uint64_t Padding = DataAlignV - Rem;
1000 
1001           // Append an array of padding bytes to meet alignment requested
1002           // Note (o +      (a - (o % a)) ) % a == 0
1003           //      (offset + Padding       ) % align == 0
1004 
1005           Type *ATy = ArrayType::get(Type::getInt8Ty(Ctx), Padding);
1006           LocalVars.push_back(new GlobalVariable(
1007               M, ATy, false, GlobalValue::InternalLinkage, UndefValue::get(ATy),
1008               "", nullptr, GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS,
1009               false));
1010           IsPaddingField.push_back(true);
1011           CurrentOffset += Padding;
1012         }
1013 
1014         LocalVars.push_back(FGV);
1015         IsPaddingField.push_back(false);
1016         CurrentOffset += LayoutFields[I].Size;
1017       }
1018     }
1019 
1020     std::vector<Type *> LocalVarTypes;
1021     LocalVarTypes.reserve(LocalVars.size());
1022     std::transform(
1023         LocalVars.cbegin(), LocalVars.cend(), std::back_inserter(LocalVarTypes),
1024         [](const GlobalVariable *V) -> Type * { return V->getValueType(); });
1025 
1026     StructType *LDSTy = StructType::create(Ctx, LocalVarTypes, VarName + ".t");
1027 
1028     Align StructAlign = AMDGPU::getAlign(DL, LocalVars[0]);
1029 
1030     GlobalVariable *SGV = new GlobalVariable(
1031         M, LDSTy, false, GlobalValue::InternalLinkage, UndefValue::get(LDSTy),
1032         VarName, nullptr, GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS,
1033         false);
1034     SGV->setAlignment(StructAlign);
1035 
1036     DenseMap<GlobalVariable *, Constant *> Map;
1037     Type *I32 = Type::getInt32Ty(Ctx);
1038     for (size_t I = 0; I < LocalVars.size(); I++) {
1039       GlobalVariable *GV = LocalVars[I];
1040       Constant *GEPIdx[] = {ConstantInt::get(I32, 0), ConstantInt::get(I32, I)};
1041       Constant *GEP = ConstantExpr::getGetElementPtr(LDSTy, SGV, GEPIdx, true);
1042       if (IsPaddingField[I]) {
1043         assert(GV->use_empty());
1044         GV->eraseFromParent();
1045       } else {
1046         Map[GV] = GEP;
1047       }
1048     }
1049     assert(Map.size() == LDSVarsToTransform.size());
1050     return {SGV, std::move(Map)};
1051   }
1052 
1053   template <typename PredicateTy>
1054   void replaceLDSVariablesWithStruct(
1055       Module &M, DenseSet<GlobalVariable *> const &LDSVarsToTransformArg,
1056       LDSVariableReplacement Replacement, PredicateTy Predicate) {
1057     LLVMContext &Ctx = M.getContext();
1058     const DataLayout &DL = M.getDataLayout();
1059 
1060     // A hack... we need to insert the aliasing info in a predictable order for
1061     // lit tests. Would like to have them in a stable order already, ideally the
1062     // same order they get allocated, which might mean an ordered set container
1063     std::vector<GlobalVariable *> LDSVarsToTransform(
1064         LDSVarsToTransformArg.begin(), LDSVarsToTransformArg.end());
1065     llvm::sort(LDSVarsToTransform.begin(), LDSVarsToTransform.end(),
1066                [](const GlobalVariable *lhs, const GlobalVariable *rhs) {
1067                  return lhs->getName() < rhs->getName();
1068                });
1069 
1070     // Create alias.scope and their lists. Each field in the new structure
1071     // does not alias with all other fields.
1072     SmallVector<MDNode *> AliasScopes;
1073     SmallVector<Metadata *> NoAliasList;
1074     const size_t NumberVars = LDSVarsToTransform.size();
1075     if (NumberVars > 1) {
1076       MDBuilder MDB(Ctx);
1077       AliasScopes.reserve(NumberVars);
1078       MDNode *Domain = MDB.createAnonymousAliasScopeDomain();
1079       for (size_t I = 0; I < NumberVars; I++) {
1080         MDNode *Scope = MDB.createAnonymousAliasScope(Domain);
1081         AliasScopes.push_back(Scope);
1082       }
1083       NoAliasList.append(&AliasScopes[1], AliasScopes.end());
1084     }
1085 
1086     // Replace uses of ith variable with a constantexpr to the corresponding
1087     // field of the instance that will be allocated by AMDGPUMachineFunction
1088     for (size_t I = 0; I < NumberVars; I++) {
1089       GlobalVariable *GV = LDSVarsToTransform[I];
1090       Constant *GEP = Replacement.LDSVarsToConstantGEP[GV];
1091 
1092       GV->replaceUsesWithIf(GEP, Predicate);
1093 
1094       APInt APOff(DL.getIndexTypeSizeInBits(GEP->getType()), 0);
1095       GEP->stripAndAccumulateInBoundsConstantOffsets(DL, APOff);
1096       uint64_t Offset = APOff.getZExtValue();
1097 
1098       Align A =
1099           commonAlignment(Replacement.SGV->getAlign().valueOrOne(), Offset);
1100 
1101       if (I)
1102         NoAliasList[I - 1] = AliasScopes[I - 1];
1103       MDNode *NoAlias =
1104           NoAliasList.empty() ? nullptr : MDNode::get(Ctx, NoAliasList);
1105       MDNode *AliasScope =
1106           AliasScopes.empty() ? nullptr : MDNode::get(Ctx, {AliasScopes[I]});
1107 
1108       refineUsesAlignmentAndAA(GEP, A, DL, AliasScope, NoAlias);
1109     }
1110   }
1111 
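  // Propagate the alignment known for the new struct field, together with the
  // alias.scope/noalias metadata computed for it, onto the memory instructions
  // that use Ptr, recursing through GEPs, bitcasts and addrspacecasts up to
  // MaxDepth.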
1112   void refineUsesAlignmentAndAA(Value *Ptr, Align A, const DataLayout &DL,
1113                                 MDNode *AliasScope, MDNode *NoAlias,
1114                                 unsigned MaxDepth = 5) {
1115     if (!MaxDepth || (A == 1 && !AliasScope))
1116       return;
1117 
1118     for (User *U : Ptr->users()) {
1119       if (auto *I = dyn_cast<Instruction>(U)) {
1120         if (AliasScope && I->mayReadOrWriteMemory()) {
1121           MDNode *AS = I->getMetadata(LLVMContext::MD_alias_scope);
1122           AS = (AS ? MDNode::getMostGenericAliasScope(AS, AliasScope)
1123                    : AliasScope);
1124           I->setMetadata(LLVMContext::MD_alias_scope, AS);
1125 
1126           MDNode *NA = I->getMetadata(LLVMContext::MD_noalias);
1127           NA = (NA ? MDNode::intersect(NA, NoAlias) : NoAlias);
1128           I->setMetadata(LLVMContext::MD_noalias, NA);
1129         }
1130       }
1131 
1132       if (auto *LI = dyn_cast<LoadInst>(U)) {
1133         LI->setAlignment(std::max(A, LI->getAlign()));
1134         continue;
1135       }
1136       if (auto *SI = dyn_cast<StoreInst>(U)) {
1137         if (SI->getPointerOperand() == Ptr)
1138           SI->setAlignment(std::max(A, SI->getAlign()));
1139         continue;
1140       }
1141       if (auto *AI = dyn_cast<AtomicRMWInst>(U)) {
1142         // None of the atomicrmw operations can operate on pointers, but
1143         // check anyway in case that changes or a ConstantExpr is processed.
1144         if (AI->getPointerOperand() == Ptr)
1145           AI->setAlignment(std::max(A, AI->getAlign()));
1146         continue;
1147       }
1148       if (auto *AI = dyn_cast<AtomicCmpXchgInst>(U)) {
1149         if (AI->getPointerOperand() == Ptr)
1150           AI->setAlignment(std::max(A, AI->getAlign()));
1151         continue;
1152       }
1153       if (auto *GEP = dyn_cast<GetElementPtrInst>(U)) {
1154         unsigned BitWidth = DL.getIndexTypeSizeInBits(GEP->getType());
1155         APInt Off(BitWidth, 0);
1156         if (GEP->getPointerOperand() == Ptr) {
1157           Align GA;
1158           if (GEP->accumulateConstantOffset(DL, Off))
1159             GA = commonAlignment(A, Off.getLimitedValue());
1160           refineUsesAlignmentAndAA(GEP, GA, DL, AliasScope, NoAlias,
1161                                    MaxDepth - 1);
1162         }
1163         continue;
1164       }
1165       if (auto *I = dyn_cast<Instruction>(U)) {
1166         if (I->getOpcode() == Instruction::BitCast ||
1167             I->getOpcode() == Instruction::AddrSpaceCast)
1168           refineUsesAlignmentAndAA(I, A, DL, AliasScope, NoAlias, MaxDepth - 1);
1169       }
1170     }
1171   }
1172 };
1173 
1174 } // namespace
1175 char AMDGPULowerModuleLDS::ID = 0;
1176 
1177 char &llvm::AMDGPULowerModuleLDSID = AMDGPULowerModuleLDS::ID;
1178 
1179 INITIALIZE_PASS(AMDGPULowerModuleLDS, DEBUG_TYPE,
1180                 "Lower uses of LDS variables from non-kernel functions", false,
1181                 false)
1182 
1183 ModulePass *llvm::createAMDGPULowerModuleLDSPass() {
1184   return new AMDGPULowerModuleLDS();
1185 }
1186 
1187 PreservedAnalyses AMDGPULowerModuleLDSPass::run(Module &M,
1188                                                 ModuleAnalysisManager &) {
1189   return AMDGPULowerModuleLDS().runOnModule(M) ? PreservedAnalyses::none()
1190                                                : PreservedAnalyses::all();
1191 }
1192