1 //===-- AMDGPULowerModuleLDSPass.cpp ------------------------------*- C++ -*-=//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass eliminates LDS (local data store) uses from non-kernel functions.
10 // LDS is contiguous memory allocated per kernel execution.
11 //
12 // Background.
13 //
14 // The programming model is global variables, or equivalently function local
15 // static variables, accessible from kernels or other functions. For uses from
16 // kernels this is straightforward - assign an integer to the kernel for the
17 // memory required by all the variables combined, allocate them within that.
18 // For uses from functions there are performance tradeoffs to choose between.
19 //
20 // This model means the GPU runtime can specify the amount of memory allocated.
21 // If this is more than the kernel assumed, the excess can be made available
22 // using a language specific feature, which IR represents as a variable with
23 // no initializer. This feature is not yet implemented for non-kernel functions.
24 // This lowering could be extended to handle that use case, but would probably
25 // require closer integration with promoteAllocaToLDS.
26 //
27 // Consequences of this GPU feature:
28 // - memory is limited and exceeding it halts compilation
29 // - a global accessed by one kernel exists independent of other kernels
30 // - a global exists independent of simultaneous execution of the same kernel
31 // - the address of the global may differ between kernels, as the per-kernel
32 //   copies do not alias, which permits allocating only the variables each uses
33 // - if the address is allowed to differ, functions need help to find it
34 //
35 // Uses from kernels are implemented here by grouping them in a per-kernel
36 // struct instance. This duplicates the variables, accurately modelling their
37 // aliasing properties relative to a single global representation. It also
38 // permits control over alignment via padding.
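//
// As an illustrative sketch (the @a, @b and @k names are hypothetical; the
// naming scheme follows the implementation below): given LDS variables
//   @a = internal addrspace(3) global i32 undef
//   @b = internal addrspace(3) global i64 undef
// a kernel @k that uses both gets a single struct instance
//   %llvm.amdgcn.kernel.k.lds.t = type { i64, i32 } ; field order set by layout
//   @llvm.amdgcn.kernel.k.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k.lds.t undef
// and uses of @a or @b inside @k are rewritten to constant GEPs into that
// instance, e.g. for @a:
//   getelementptr inbounds (%llvm.amdgcn.kernel.k.lds.t,
//                           ptr addrspace(3) @llvm.amdgcn.kernel.k.lds, i32 0, i32 1)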
39 //
40 // Uses from functions are more complicated and the primary purpose of this
41 // IR pass. Several different lowerings are chosen between to meet two
42 // requirements: avoid allocating any LDS where it is not necessary, as that
43 // impacts occupancy and may fail the compilation, and avoid imposing overhead
44 // on a feature whose primary advantage over global memory is performance. The
45 // basic design goal is to avoid one kernel imposing overhead on another.
46 //
47 // Implementation.
48 //
49 // LDS variables with constant annotation or non-undef initializer are passed
50 // through unchanged for simplification or error diagnostics in later passes.
51 // Non-undef initializers are not yet implemented for LDS.
52 //
53 // LDS variables that are always allocated at the same address can be found
54 // by lookup at that address. Otherwise runtime information/cost is required.
55 //
56 // The simplest strategy possible is to group all LDS variables in a single
57 // struct and allocate that struct in every kernel such that the original
58 // variables are always at the same address. LDS is however a limited resource
59 // so this strategy is unusable in practice. It is not implemented here.
60 //
61 // Strategy | Precise allocation | Zero runtime cost | General purpose |
62 //  --------+--------------------+-------------------+-----------------+
63 //   Module |                 No |               Yes |             Yes |
64 //    Table |                Yes |                No |             Yes |
65 //   Kernel |                Yes |               Yes |              No |
66 //   Hybrid |                Yes |           Partial |             Yes |
67 //
68 // Module spends LDS memory to save cycles. Table spends cycles and global
69 // memory to save LDS. Kernel is as fast as kernel allocation but only works
70 // for variables that are known reachable from a single kernel. Hybrid picks
71 // between all three. When forced to choose between LDS and cycles it minimises
72 // LDS use.
73 //
74 // The "module" lowering implemented here finds LDS variables which are used by
75 // non-kernel functions and creates a new struct with a field for each of those
76 // LDS variables. Variables that are only used from kernels are excluded.
77 // Kernels that do not use this struct are annotated with the attribute
78 // amdgpu-elide-module-lds which allows the back end to elide the allocation.
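//
// A minimal sketch of the result, assuming only kernel k0 reaches a function
// that uses a module scope variable:
//   @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t undef
//   define amdgpu_kernel void @k0() { ... }    ; allocates the module struct
//   define amdgpu_kernel void @k1() #0 { ... } ; does not
//   attributes #0 = { "amdgpu-elide-module-lds" }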
79 //
80 // The "table" lowering implemented here has three components.
81 // First, each kernel is assigned a unique integer identifier which is available
82 // in the functions it calls through the intrinsic amdgcn_lds_kernel_id. The
83 // integer is passed through a specific SGPR, so it also works with indirect calls.
84 // Second, each kernel allocates LDS variables independently of other kernels
85 // and writes the addresses it chose for each variable into an array in a
86 // consistent order. If the kernel does not allocate a given variable, it writes
87 // poison to the corresponding array location. These arrays are written to a
88 // constant table in the order matching the kernel's unique integer identifier.
89 // Third, uses from non-kernel functions are replaced with a table lookup using
90 // the intrinsic function to find the address of the variable.
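//
// The access sequence emitted into a non-kernel function is, in simplified
// form (N kernels, M variables, VarIndex the position of this variable):
//   %id  = call i32 @llvm.amdgcn.lds.kernel.id()
//   %gep = getelementptr inbounds [N x [M x i32]], ptr addrspace(4)
//            @llvm.amdgcn.lds.offset.table, i32 0, i32 %id, i32 VarIndex
//   %off = load i32, ptr addrspace(4) %gep
//   %ptr = inttoptr i32 %off to ptr addrspace(3)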
91 //
92 // "Kernel" lowering is only applicable for variables that are unambiguously
93 // reachable from exactly one kernel. For those cases, accesses to the variable
94 // can be lowered to ConstantExpr address of a struct instance specific to that
95 // one kernel. This is zero cost in space and in compute. It will raise a fatal
96 // error on any variable that might be reachable from multiple kernels and is
97 // thus most easily used as part of the hybrid lowering strategy.
98 //
99 // Hybrid lowering is a mixture of the above. It uses the zero cost kernel
100 // lowering where it can. It lowers the variable accessed by the greatest
101 // number of kernels using the module strategy, as that is free for the first
102 // variable. Any further variables that can be lowered with the module strategy
103 // without incurring LDS memory overhead are lowered that way. The remaining
104 // ones are lowered via table.
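//
// Worked example (hypothetical): with kernels k0..k3, a variable reachable
// from {k0,k1,k2} is chosen as the module root; a variable reachable only
// from {k3} uses the kernel strategy; one reachable from {k0,k1}, a subset of
// the root's kernels so no extra LDS is allocated, also uses the module
// strategy; one reachable from {k2,k3} falls back to the table strategy.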
105 //
106 // Consequences
107 // - No heuristics or user controlled magic numbers, hybrid is the right choice
108 // - Kernels that don't use functions (or have had them all inlined) are not
109 //   affected by any lowering for kernels that do.
110 // - Kernels that don't make indirect function calls are not affected by those
111 //   that do.
112 // - Variables which are used by lots of kernels, e.g. those injected by a
113 //   language runtime in most kernels, are expected to have no overhead
114 // - Implementations that instantiate templates per-kernel where those templates
115 //   use LDS are expected to hit the "Kernel" lowering strategy
116 // - The runtime properties impose a cost in compiler implementation complexity
117 //
118 //===----------------------------------------------------------------------===//
119 
120 #include "AMDGPU.h"
121 #include "Utils/AMDGPUBaseInfo.h"
122 #include "Utils/AMDGPUMemoryUtils.h"
123 #include "llvm/ADT/BitVector.h"
124 #include "llvm/ADT/DenseMap.h"
125 #include "llvm/ADT/DenseSet.h"
126 #include "llvm/ADT/STLExtras.h"
127 #include "llvm/ADT/SetOperations.h"
128 #include "llvm/ADT/SetVector.h"
129 #include "llvm/ADT/StringSwitch.h"
130 #include "llvm/Analysis/CallGraph.h"
131 #include "llvm/IR/Constants.h"
132 #include "llvm/IR/DerivedTypes.h"
133 #include "llvm/IR/IRBuilder.h"
134 #include "llvm/IR/InlineAsm.h"
135 #include "llvm/IR/Instructions.h"
136 #include "llvm/IR/IntrinsicsAMDGPU.h"
137 #include "llvm/IR/MDBuilder.h"
138 #include "llvm/InitializePasses.h"
139 #include "llvm/Pass.h"
140 #include "llvm/Support/CommandLine.h"
141 #include "llvm/Support/Debug.h"
142 #include "llvm/Support/OptimizedStructLayout.h"
143 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
144 #include "llvm/Transforms/Utils/ModuleUtils.h"
145 
146 #include <tuple>
147 #include <vector>
148 
149 #include <cstdio>
150 
151 #define DEBUG_TYPE "amdgpu-lower-module-lds"
152 
153 using namespace llvm;
154 
155 namespace {
156 
157 cl::opt<bool> SuperAlignLDSGlobals(
158     "amdgpu-super-align-lds-globals",
159     cl::desc("Increase alignment of LDS if it is not on align boundary"),
160     cl::init(true), cl::Hidden);
161 
162 enum class LoweringKind { module, table, kernel, hybrid };
163 cl::opt<LoweringKind> LoweringKindLoc(
164     "amdgpu-lower-module-lds-strategy",
165     cl::desc("Specify lowering strategy for function LDS access:"), cl::Hidden,
166     cl::init(LoweringKind::module),
167     cl::values(
168         clEnumValN(LoweringKind::table, "table", "Lower via table lookup"),
169         clEnumValN(LoweringKind::module, "module", "Lower via module struct"),
170         clEnumValN(
171             LoweringKind::kernel, "kernel",
172             "Lower variables reachable from one kernel, otherwise abort"),
173         clEnumValN(LoweringKind::hybrid, "hybrid",
174                    "Lower via mixture of above strategies")));
175 
176 bool isKernelLDS(const Function *F) {
177   // Some weirdness here. AMDGPU::isKernelCC does not call into
178   // AMDGPU::isKernel with the calling conv, it instead calls into
179   // isModuleEntryFunction which returns true for more calling conventions
180   // than AMDGPU::isKernel does. There's a FIXME on AMDGPU::isKernel.
181   // There's also a test that checks that the LDS lowering does not hit on
182   // a graphics shader, denoted amdgpu_ps, so stay with the limited case.
183   // Putting LDS in the name of the function to draw attention to this.
184   return AMDGPU::isKernel(F->getCallingConv());
185 }
186 
187 class AMDGPULowerModuleLDS : public ModulePass {
188 
189   static void removeFromUsedList(Module &M, StringRef Name,
190                                  SmallPtrSetImpl<Constant *> &ToRemove) {
191     GlobalVariable *GV = M.getNamedGlobal(Name);
192     if (!GV || ToRemove.empty()) {
193       return;
194     }
195 
196     SmallVector<Constant *, 16> Init;
197     auto *CA = cast<ConstantArray>(GV->getInitializer());
198     for (auto &Op : CA->operands()) {
199       // ModuleUtils::appendToUsed only inserts Constants
200       Constant *C = cast<Constant>(Op);
201       if (!ToRemove.contains(C->stripPointerCasts())) {
202         Init.push_back(C);
203       }
204     }
205 
206     if (Init.size() == CA->getNumOperands()) {
207       return; // none to remove
208     }
209 
210     GV->eraseFromParent();
211 
212     for (Constant *C : ToRemove) {
213       C->removeDeadConstantUsers();
214     }
215 
216     if (!Init.empty()) {
217       ArrayType *ATy =
218           ArrayType::get(Type::getInt8PtrTy(M.getContext()), Init.size());
219       GV =
220           new GlobalVariable(M, ATy, false, GlobalValue::AppendingLinkage,
221                                    ConstantArray::get(ATy, Init), Name);
222       GV->setSection("llvm.metadata");
223     }
224   }
225 
226   static void removeFromUsedLists(Module &M,
227                                   const DenseSet<GlobalVariable *> &LocalVars) {
228     // The verifier rejects used lists containing an inttoptr of a constant
229     // so remove the variables from these lists before replaceAllUsesWith
230 
231     SmallPtrSet<Constant *, 32> LocalVarsSet;
232     for (GlobalVariable *LocalVar : LocalVars)
233       if (Constant *C = dyn_cast<Constant>(LocalVar->stripPointerCasts()))
234         LocalVarsSet.insert(C);
235     removeFromUsedList(M, "llvm.used", LocalVarsSet);
236     removeFromUsedList(M, "llvm.compiler.used", LocalVarsSet);
237   }
238 
239   static void markUsedByKernel(IRBuilder<> &Builder, Function *Func,
240                                GlobalVariable *SGV) {
241     // The llvm.amdgcn.module.lds instance is implicitly used by all kernels
242     // that might call a function which accesses a field within it. This is
243     // presently approximated to 'all kernels' if there are any such functions
244     // in the module. This implicit use is redefined as an explicit use here so
245     // that later passes, specifically PromoteAlloca, account for the required
246     // memory without any knowledge of this transform.
247 
248     // An operand bundle on llvm.donothing works because the call instruction
249     // survives until after the last pass that needs to account for LDS. It is
250     // better than inline asm as the latter survives until the end of codegen. A
251     // totally robust solution would be a function with the same semantics as
252     // llvm.donothing that takes a pointer to the instance and is lowered to a
253     // no-op after LDS is allocated, but that is not presently necessary.
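    //
    // The marker emitted below looks, illustratively, like:
    //   call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ]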
254 
255     LLVMContext &Ctx = Func->getContext();
256 
257     Builder.SetInsertPoint(Func->getEntryBlock().getFirstNonPHI());
258 
259     FunctionType *FTy = FunctionType::get(Type::getVoidTy(Ctx), {});
260 
261     Function *Decl =
262         Intrinsic::getDeclaration(Func->getParent(), Intrinsic::donothing, {});
263 
264     Value *UseInstance[1] = {Builder.CreateInBoundsGEP(
265         SGV->getValueType(), SGV, ConstantInt::get(Type::getInt32Ty(Ctx), 0))};
266 
267     Builder.CreateCall(FTy, Decl, {},
268                        {OperandBundleDefT<Value *>("ExplicitUse", UseInstance)},
269                        "");
270   }
271 
272   static bool eliminateConstantExprUsesOfLDSFromAllInstructions(Module &M) {
273     // Constants are uniqued within LLVM. A ConstantExpr referring to a LDS
274     // global may have uses from multiple different functions as a result.
275     // This pass specialises LDS variables with respect to the kernel that
276     // allocates them.
277 
278     // This is semantically equivalent to:
279     // for (auto &F : M.functions())
280     //   for (auto &BB : F)
281     //     for (auto &I : BB)
282     //       for (Use &Op : I.operands())
283     //         if (constantExprUsesLDS(Op))
284     //           replaceConstantExprInFunction(I, Op);
285 
286     bool Changed = false;
287 
288     // Find all ConstantExpr that are direct users of an LDS global
289     SmallVector<ConstantExpr *> Stack;
290     for (auto &GV : M.globals())
291       if (AMDGPU::isLDSVariableToLower(GV))
292         for (User *U : GV.users())
293           if (ConstantExpr *C = dyn_cast<ConstantExpr>(U))
294             Stack.push_back(C);
295 
296     // Expand to include constexpr users of direct users
297     SetVector<ConstantExpr *> ConstExprUsersOfLDS;
298     while (!Stack.empty()) {
299       ConstantExpr *V = Stack.pop_back_val();
300       if (ConstExprUsersOfLDS.contains(V))
301         continue;
302 
303       ConstExprUsersOfLDS.insert(V);
304 
305       for (auto *Nested : V->users())
306         if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Nested))
307           Stack.push_back(CE);
308     }
309 
310     // Find all instructions that use any of the ConstExpr users of LDS
311     SetVector<Instruction *> InstructionWorklist;
312     for (ConstantExpr *CE : ConstExprUsersOfLDS)
313       for (User *U : CE->users())
314         if (auto *I = dyn_cast<Instruction>(U))
315           InstructionWorklist.insert(I);
316 
317     // Replace those ConstExpr operands with instructions
318     while (!InstructionWorklist.empty()) {
319       Instruction *I = InstructionWorklist.pop_back_val();
320       for (Use &U : I->operands()) {
321 
322         auto *BI = I;
323         if (auto *Phi = dyn_cast<PHINode>(I)) {
324           BasicBlock *BB = Phi->getIncomingBlock(U);
325           BasicBlock::iterator It = BB->getFirstInsertionPt();
326           assert(It != BB->end() && "Unexpected empty basic block");
327           BI = &(*(It));
328         }
329 
330         if (ConstantExpr *C = dyn_cast<ConstantExpr>(U.get())) {
331           if (ConstExprUsersOfLDS.contains(C)) {
332             Changed = true;
333             Instruction *NI = C->getAsInstruction(BI);
334             InstructionWorklist.insert(NI);
335             U.set(NI);
336             C->removeDeadConstantUsers();
337           }
338         }
339       }
340     }
341 
342     return Changed;
343   }
344 
345 public:
346   static char ID;
347 
348   AMDGPULowerModuleLDS() : ModulePass(ID) {
349     initializeAMDGPULowerModuleLDSPass(*PassRegistry::getPassRegistry());
350   }
351 
352   using FunctionVariableMap = DenseMap<Function *, DenseSet<GlobalVariable *>>;
353 
354   using VariableFunctionMap = DenseMap<GlobalVariable *, DenseSet<Function *>>;
355 
356   static void getUsesOfLDSByFunction(CallGraph const &CG, Module &M,
357                                      FunctionVariableMap &kernels,
358                                      FunctionVariableMap &functions) {
359 
360     // Get uses from the current function, excluding uses by called functions
361     // Two output variables to avoid walking the globals list twice
362     for (auto &GV : M.globals()) {
363       if (!AMDGPU::isLDSVariableToLower(GV)) {
364         continue;
365       }
366 
367       SmallVector<User *, 16> Stack(GV.users());
368       for (User *V : GV.users()) {
369         if (auto *I = dyn_cast<Instruction>(V)) {
370           Function *F = I->getFunction();
371           if (isKernelLDS(F)) {
372             kernels[F].insert(&GV);
373           } else {
374             functions[F].insert(&GV);
375           }
376         }
377       }
378     }
379   }
380 
381   struct LDSUsesInfoTy {
382     FunctionVariableMap direct_access;
383     FunctionVariableMap indirect_access;
384   };
385 
386   static LDSUsesInfoTy getTransitiveUsesOfLDS(CallGraph const &CG, Module &M) {
387 
388     FunctionVariableMap direct_map_kernel;
389     FunctionVariableMap direct_map_function;
390     getUsesOfLDSByFunction(CG, M, direct_map_kernel, direct_map_function);
391 
392     // Collect variables that are used by functions whose address has escaped
393     DenseSet<GlobalVariable *> VariablesReachableThroughFunctionPointer;
394     for (Function &F : M.functions()) {
395       if (!isKernelLDS(&F) &&
396           F.hasAddressTaken(nullptr,
397                             /* IgnoreCallbackUses */ false,
398                             /* IgnoreAssumeLikeCalls */ false,
399                             /* IgnoreLLVMUsed */ true,
400                             /* IgnoreArcAttachedCall */ false)) {
401         set_union(VariablesReachableThroughFunctionPointer,
402                   direct_map_function[&F]);
403       }
404     }
405 
406     auto functionMakesUnknownCall = [&](const Function *F) -> bool {
407       assert(!F->isDeclaration());
408       for (CallGraphNode::CallRecord R : *CG[F]) {
409         if (!R.second->getFunction()) {
410           return true;
411         }
412       }
413       return false;
414     };
415 
416     // Work out which variables are reachable through function calls
417     FunctionVariableMap transitive_map_function = direct_map_function;
418 
419     // If the function makes any unknown call, assume the worst case that it can
420     // access all variables accessed by functions whose address escaped
421     for (Function &F : M.functions()) {
422       if (!F.isDeclaration() && functionMakesUnknownCall(&F)) {
423         if (!isKernelLDS(&F)) {
424           set_union(transitive_map_function[&F],
425                     VariablesReachableThroughFunctionPointer);
426         }
427       }
428     }
429 
430     // Direct implementation of collecting all variables reachable from each
431     // function
432     for (Function &Func : M.functions()) {
433       if (Func.isDeclaration() || isKernelLDS(&Func))
434         continue;
435 
436       DenseSet<Function *> seen; // catches cycles
437       SmallVector<Function *, 4> wip{&Func};
438 
439       while (!wip.empty()) {
440         Function *F = wip.pop_back_val();
441 
442         // Can accelerate this by referring to transitive map for functions that
443         // have already been computed, with more care than this
444         set_union(transitive_map_function[&Func], direct_map_function[F]);
445 
446         for (CallGraphNode::CallRecord R : *CG[F]) {
447           Function *ith = R.second->getFunction();
448           if (ith) {
449             if (!seen.contains(ith)) {
450               seen.insert(ith);
451               wip.push_back(ith);
452             }
453           }
454         }
455       }
456     }
457 
458     // direct_map_kernel lists which variables are used directly by each kernel;
459     // now find the variables which are used through a function call.
460     FunctionVariableMap indirect_map_kernel;
461 
462     for (Function &Func : M.functions()) {
463       if (Func.isDeclaration() || !isKernelLDS(&Func))
464         continue;
465 
466       for (CallGraphNode::CallRecord R : *CG[&Func]) {
467         Function *ith = R.second->getFunction();
468         if (ith) {
469           set_union(indirect_map_kernel[&Func], transitive_map_function[ith]);
470         } else {
471           set_union(indirect_map_kernel[&Func],
472                     VariablesReachableThroughFunctionPointer);
473         }
474       }
475     }
476 
477     return {std::move(direct_map_kernel), std::move(indirect_map_kernel)};
478   }
479 
480   struct LDSVariableReplacement {
481     GlobalVariable *SGV = nullptr;
482     DenseMap<GlobalVariable *, Constant *> LDSVarsToConstantGEP;
483   };
484 
485   // LDSVariableReplacement maps each LDS variable to the constantexpr GEP into
486   // the struct instance it has been moved to, for a given kernel.
487   // The lookup table built below is an array with an element for each kernel
488   // containing where the corresponding variable was remapped to.
489 
490   static Constant *getAddressesOfVariablesInKernel(
491       LLVMContext &Ctx, ArrayRef<GlobalVariable *> Variables,
492       DenseMap<GlobalVariable *, Constant *> &LDSVarsToConstantGEP) {
493     // Create a ConstantArray containing the address of each Variable within the
494     // kernel corresponding to LDSVarsToConstantGEP, or poison if that kernel
495     // does not allocate it
496     // TODO: Drop the ptrtoint conversion
497 
498     Type *I32 = Type::getInt32Ty(Ctx);
499 
500     ArrayType *KernelOffsetsType = ArrayType::get(I32, Variables.size());
501 
502     SmallVector<Constant *> Elements;
503     for (size_t i = 0; i < Variables.size(); i++) {
504       GlobalVariable *GV = Variables[i];
505       if (LDSVarsToConstantGEP.count(GV) != 0) {
506         auto elt = ConstantExpr::getPtrToInt(LDSVarsToConstantGEP[GV], I32);
507         Elements.push_back(elt);
508       } else {
509         Elements.push_back(PoisonValue::get(I32));
510       }
511     }
512     return ConstantArray::get(KernelOffsetsType, Elements);
513   }
514 
515   static GlobalVariable *buildLookupTable(
516       Module &M, ArrayRef<GlobalVariable *> Variables,
517       ArrayRef<Function *> kernels,
518       DenseMap<Function *, LDSVariableReplacement> &KernelToReplacement) {
519     if (Variables.empty()) {
520       return nullptr;
521     }
522     LLVMContext &Ctx = M.getContext();
523 
524     const size_t NumberVariables = Variables.size();
525     const size_t NumberKernels = kernels.size();
526 
527     ArrayType *KernelOffsetsType =
528         ArrayType::get(Type::getInt32Ty(Ctx), NumberVariables);
529 
530     ArrayType *AllKernelsOffsetsType =
531         ArrayType::get(KernelOffsetsType, NumberKernels);
532 
533     std::vector<Constant *> overallConstantExprElts(NumberKernels);
534     for (size_t i = 0; i < NumberKernels; i++) {
535       LDSVariableReplacement Replacement = KernelToReplacement[kernels[i]];
536       overallConstantExprElts[i] = getAddressesOfVariablesInKernel(
537           Ctx, Variables, Replacement.LDSVarsToConstantGEP);
538     }
539 
540     Constant *init =
541         ConstantArray::get(AllKernelsOffsetsType, overallConstantExprElts);
542 
543     return new GlobalVariable(
544         M, AllKernelsOffsetsType, true, GlobalValue::InternalLinkage, init,
545         "llvm.amdgcn.lds.offset.table", nullptr, GlobalValue::NotThreadLocal,
546         AMDGPUAS::CONSTANT_ADDRESS);
547   }
548 
549   void replaceUsesInInstructionsWithTableLookup(
550       Module &M, ArrayRef<GlobalVariable *> ModuleScopeVariables,
551       GlobalVariable *LookupTable) {
552 
553     LLVMContext &Ctx = M.getContext();
554     IRBuilder<> Builder(Ctx);
555     Type *I32 = Type::getInt32Ty(Ctx);
556 
557     // Accesses from a function use the amdgcn_lds_kernel_id intrinsic which
558     // lowers to a read from a live in register. Emit it once in the entry
559     // block to spare deduplicating it later.
560 
561     DenseMap<Function *, Value *> tableKernelIndexCache;
562     auto getTableKernelIndex = [&](Function *F) -> Value * {
563       if (tableKernelIndexCache.count(F) == 0) {
564         LLVMContext &Ctx = M.getContext();
565         FunctionType *FTy = FunctionType::get(Type::getInt32Ty(Ctx), {});
566         Function *Decl =
567             Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_lds_kernel_id, {});
568 
569         BasicBlock::iterator it =
570             F->getEntryBlock().getFirstNonPHIOrDbgOrAlloca();
571         Instruction &i = *it;
572         Builder.SetInsertPoint(&i);
573 
574         tableKernelIndexCache[F] = Builder.CreateCall(FTy, Decl, {});
575       }
576 
577       return tableKernelIndexCache[F];
578     };
579 
580     for (size_t Index = 0; Index < ModuleScopeVariables.size(); Index++) {
581       auto *GV = ModuleScopeVariables[Index];
582 
583       for (Use &U : make_early_inc_range(GV->uses())) {
584         auto *I = dyn_cast<Instruction>(U.getUser());
585         if (!I)
586           continue;
587 
588         Value *tableKernelIndex = getTableKernelIndex(I->getFunction());
589 
590         // For a phi the replacement sequence must be materialised in the
591         // incoming block for this use, not immediately before the phi itself.
592         if (auto *Phi = dyn_cast<PHINode>(I)) {
593           BasicBlock *BB = Phi->getIncomingBlock(U);
594           Builder.SetInsertPoint(&(*(BB->getFirstInsertionPt())));
595         } else {
596           Builder.SetInsertPoint(I);
597         }
598 
599         Value *GEPIdx[3] = {
600             ConstantInt::get(I32, 0),
601             tableKernelIndex,
602             ConstantInt::get(I32, Index),
603         };
604 
605         Value *Address = Builder.CreateInBoundsGEP(
606             LookupTable->getValueType(), LookupTable, GEPIdx, GV->getName());
607 
608         Value *loaded = Builder.CreateLoad(I32, Address);
609 
610         Value *replacement =
611             Builder.CreateIntToPtr(loaded, GV->getType(), GV->getName());
612 
613         U.set(replacement);
614       }
615     }
616   }
617 
618   static DenseSet<Function *> kernelsThatIndirectlyAccessAnyOfPassedVariables(
619       Module &M, LDSUsesInfoTy &LDSUsesInfo,
620       DenseSet<GlobalVariable *> const &VariableSet) {
621 
622     DenseSet<Function *> KernelSet;
623 
624     if (VariableSet.empty()) return KernelSet;
625 
626     for (Function &Func : M.functions()) {
627       if (Func.isDeclaration() || !isKernelLDS(&Func))
628         continue;
629       for (GlobalVariable *GV : LDSUsesInfo.indirect_access[&Func]) {
630         if (VariableSet.contains(GV)) {
631           KernelSet.insert(&Func);
632           break;
633         }
634       }
635     }
636 
637     return KernelSet;
638   }
639 
640   static GlobalVariable *
641   chooseBestVariableForModuleStrategy(const DataLayout &DL,
642                                       VariableFunctionMap &LDSVars) {
643     // Find the global variable with the most indirect uses from kernels
644 
645     struct CandidateTy {
646       GlobalVariable *GV = nullptr;
647       size_t UserCount = 0;
648       size_t Size = 0;
649 
650       CandidateTy() = default;
651 
652       CandidateTy(GlobalVariable *GV, uint64_t UserCount, uint64_t AllocSize)
653           : GV(GV), UserCount(UserCount), Size(AllocSize) {}
654 
655       bool operator<(const CandidateTy &Other) const {
656         // Fewer users makes module scope variable less attractive
657         if (UserCount < Other.UserCount) {
658           return true;
659         }
660         if (UserCount > Other.UserCount) {
661           return false;
662         }
663 
664         // Bigger makes module scope variable less attractive
665         if (Size < Other.Size) {
666           return false;
667         }
668 
669         if (Size > Other.Size) {
670           return true;
671         }
672 
673         // Arbitrary but consistent
674         return GV->getName() < Other.GV->getName();
675       }
676     };
677 
678     CandidateTy MostUsed;
679 
680     for (auto &K : LDSVars) {
681       GlobalVariable *GV = K.first;
682       if (K.second.size() <= 1) {
683         // A variable reachable by only one kernel is best lowered with kernel
684         // strategy
685         continue;
686       }
687       CandidateTy Candidate(GV, K.second.size(),
688                       DL.getTypeAllocSize(GV->getValueType()).getFixedValue());
689       if (MostUsed < Candidate)
690         MostUsed = Candidate;
691     }
692 
693     return MostUsed.GV;
694   }
695 
696   bool runOnModule(Module &M) override {
697     LLVMContext &Ctx = M.getContext();
698     CallGraph CG = CallGraph(M);
699     bool Changed = superAlignLDSGlobals(M);
700 
701     Changed |= eliminateConstantExprUsesOfLDSFromAllInstructions(M);
702 
703     Changed = true; // todo: narrow this down
704 
705     // For each kernel, what variables does it access directly or through
706     // callees
707     LDSUsesInfoTy LDSUsesInfo = getTransitiveUsesOfLDS(CG, M);
708 
709     // For each variable accessed through callees, which kernels access it
710     VariableFunctionMap LDSToKernelsThatNeedToAccessItIndirectly;
711     for (auto &K : LDSUsesInfo.indirect_access) {
712       Function *F = K.first;
713       assert(isKernelLDS(F));
714       for (GlobalVariable *GV : K.second) {
715         LDSToKernelsThatNeedToAccessItIndirectly[GV].insert(F);
716       }
717     }
718 
719     // Partition variables into the different strategies
720     DenseSet<GlobalVariable *> ModuleScopeVariables;
721     DenseSet<GlobalVariable *> TableLookupVariables;
722     DenseSet<GlobalVariable *> KernelAccessVariables;
723 
724     {
725       GlobalVariable *HybridModuleRoot =
726           LoweringKindLoc != LoweringKind::hybrid
727               ? nullptr
728               : chooseBestVariableForModuleStrategy(
729                     M.getDataLayout(),
730                     LDSToKernelsThatNeedToAccessItIndirectly);
731 
732       DenseSet<Function *> const EmptySet;
733       DenseSet<Function *> const &HybridModuleRootKernels =
734           HybridModuleRoot
735               ? LDSToKernelsThatNeedToAccessItIndirectly[HybridModuleRoot]
736               : EmptySet;
737 
738       for (auto &K : LDSToKernelsThatNeedToAccessItIndirectly) {
739         // Each iteration of this loop assigns exactly one global variable to
740         // exactly one of the implementation strategies.
741 
742         GlobalVariable *GV = K.first;
743         assert(AMDGPU::isLDSVariableToLower(*GV));
744         assert(K.second.size() != 0);
745 
746         switch (LoweringKindLoc) {
747         case LoweringKind::module:
748           ModuleScopeVariables.insert(GV);
749           break;
750 
751         case LoweringKind::table:
752           TableLookupVariables.insert(GV);
753           break;
754 
755         case LoweringKind::kernel:
756           if (K.second.size() == 1) {
757             KernelAccessVariables.insert(GV);
758           } else {
759             report_fatal_error("Cannot lower LDS to kernel access as it is "
760                                "reachable from multiple kernels");
761           }
762           break;
763 
764         case LoweringKind::hybrid: {
765           if (GV == HybridModuleRoot) {
766             assert(K.second.size() != 1);
767             ModuleScopeVariables.insert(GV);
768           } else if (K.second.size() == 1) {
769             KernelAccessVariables.insert(GV);
770           } else if (set_is_subset(K.second, HybridModuleRootKernels)) {
771             ModuleScopeVariables.insert(GV);
772           } else {
773             TableLookupVariables.insert(GV);
774           }
775           break;
776         }
777         }
778       }
779 
780       assert(ModuleScopeVariables.size() + TableLookupVariables.size() +
781                  KernelAccessVariables.size() ==
782              LDSToKernelsThatNeedToAccessItIndirectly.size());
783     } // Variables have now been partitioned into the three lowering strategies.
784 
785     // If the kernel accesses a variable that is going to be stored in the
786     // module instance through a call then that kernel needs to allocate the
787     // module instance
788     DenseSet<Function *> KernelsThatAllocateModuleLDS =
789         kernelsThatIndirectlyAccessAnyOfPassedVariables(M, LDSUsesInfo,
790                                                         ModuleScopeVariables);
791     DenseSet<Function *> KernelsThatAllocateTableLDS =
792         kernelsThatIndirectlyAccessAnyOfPassedVariables(M, LDSUsesInfo,
793                                                         TableLookupVariables);
794 
795     if (!ModuleScopeVariables.empty()) {
796       LDSVariableReplacement ModuleScopeReplacement =
797           createLDSVariableReplacement(M, "llvm.amdgcn.module.lds",
798                                        ModuleScopeVariables);
799 
800       appendToCompilerUsed(M,
801                            {static_cast<GlobalValue *>(
802                                ConstantExpr::getPointerBitCastOrAddrSpaceCast(
803                                    cast<Constant>(ModuleScopeReplacement.SGV),
804                                    Type::getInt8PtrTy(Ctx)))});
805 
806       // historic
807       removeFromUsedLists(M, ModuleScopeVariables);
808 
809       // Replace all uses of module scope variable from non-kernel functions
810       replaceLDSVariablesWithStruct(
811           M, ModuleScopeVariables, ModuleScopeReplacement, [&](Use &U) {
812             Instruction *I = dyn_cast<Instruction>(U.getUser());
813             if (!I) {
814               return false;
815             }
816             Function *F = I->getFunction();
817             return !isKernelLDS(F);
818           });
819 
820       // Replace uses of module scope variable from kernel functions that
821       // allocate the module scope variable, otherwise leave them unchanged
822       // Record on each kernel whether the module scope global is used by it
823 
824       LLVMContext &Ctx = M.getContext();
825       IRBuilder<> Builder(Ctx);
826 
827       for (Function &Func : M.functions()) {
828         if (Func.isDeclaration() || !isKernelLDS(&Func))
829           continue;
830 
831         if (KernelsThatAllocateModuleLDS.contains(&Func)) {
832           replaceLDSVariablesWithStruct(
833               M, ModuleScopeVariables, ModuleScopeReplacement, [&](Use &U) {
834                 Instruction *I = dyn_cast<Instruction>(U.getUser());
835                 if (!I) {
836                   return false;
837                 }
838                 Function *F = I->getFunction();
839                 return F == &Func;
840               });
841 
842           markUsedByKernel(Builder, &Func, ModuleScopeReplacement.SGV);
843 
844         } else {
845           Func.addFnAttr("amdgpu-elide-module-lds");
846         }
847       }
848     }
849 
850     // Create a struct for each kernel for the non-module-scope variables
851     DenseMap<Function *, LDSVariableReplacement> KernelToReplacement;
852     for (Function &Func : M.functions()) {
853       if (Func.isDeclaration() || !isKernelLDS(&Func))
854         continue;
855 
856       DenseSet<GlobalVariable *> KernelUsedVariables;
857       for (auto &v : LDSUsesInfo.direct_access[&Func]) {
858         KernelUsedVariables.insert(v);
859       }
860       for (auto &v : LDSUsesInfo.indirect_access[&Func]) {
861         KernelUsedVariables.insert(v);
862       }
863 
864       // Variables allocated in module lds must all resolve to that struct,
865       // not to the per-kernel instance.
866       if (KernelsThatAllocateModuleLDS.contains(&Func)) {
867         for (GlobalVariable *v : ModuleScopeVariables) {
868           KernelUsedVariables.erase(v);
869         }
870       }
871 
872       if (KernelUsedVariables.empty()) {
873         // Either used no LDS, or all the LDS it used was also in module
874         continue;
875       }
876 
877       // The association between kernel function and LDS struct is done by
878       // symbol name, which only works if the function in question has a
879       // name. This is not expected to be a problem in practice as kernels
880       // are called by name, making anonymous ones (which are named by the
881       // backend) difficult to use. This does mean that llvm test cases need
882       // to name the kernels.
883       if (!Func.hasName()) {
884         report_fatal_error("Anonymous kernels cannot use LDS variables");
885       }
886 
887       std::string VarName =
888           (Twine("llvm.amdgcn.kernel.") + Func.getName() + ".lds").str();
889 
890       auto Replacement =
891           createLDSVariableReplacement(M, VarName, KernelUsedVariables);
892 
893       // remove preserves existing codegen
894       removeFromUsedLists(M, KernelUsedVariables);
895       KernelToReplacement[&Func] = Replacement;
896 
897       // Rewrite uses within kernel to the new struct
898       replaceLDSVariablesWithStruct(
899           M, KernelUsedVariables, Replacement, [&Func](Use &U) {
900             Instruction *I = dyn_cast<Instruction>(U.getUser());
901             return I && I->getFunction() == &Func;
902           });
903     }
904 
905     // Lower zero cost accesses to the kernel instances just created
906     for (auto &GV : KernelAccessVariables) {
907       auto &funcs = LDSToKernelsThatNeedToAccessItIndirectly[GV];
908       assert(funcs.size() == 1); // Only one kernel can access it
909       LDSVariableReplacement Replacement =
910           KernelToReplacement[*(funcs.begin())];
911 
912       DenseSet<GlobalVariable *> Vec;
913       Vec.insert(GV);
914 
915       replaceLDSVariablesWithStruct(M, Vec, Replacement, [](Use &U) {
916         return isa<Instruction>(U.getUser());
917       });
918     }
919 
920     if (!KernelsThatAllocateTableLDS.empty()) {
921       // Collect the kernels that allocate table lookup LDS
922       std::vector<Function *> OrderedKernels;
923       {
924         for (Function &Func : M.functions()) {
925           if (Func.isDeclaration())
926             continue;
927           if (!isKernelLDS(&Func))
928             continue;
929 
930           if (KernelsThatAllocateTableLDS.contains(&Func)) {
931             assert(Func.hasName()); // else fatal error earlier
932             OrderedKernels.push_back(&Func);
933           }
934         }
935 
936         // Put them in an arbitrary but reproducible order
937         llvm::sort(OrderedKernels.begin(), OrderedKernels.end(),
938                    [](const Function *lhs, const Function *rhs) -> bool {
939                      return lhs->getName() < rhs->getName();
940                    });
941 
942         // Annotate the kernels with their order in this vector
943         LLVMContext &Ctx = M.getContext();
944         IRBuilder<> Builder(Ctx);
945 
946         if (OrderedKernels.size() > UINT32_MAX) {
947           // 32 bit keeps it in one SGPR. > 2**32 kernels won't fit on the GPU
948           report_fatal_error("Unimplemented LDS lowering for > 2**32 kernels");
949         }
950 
951         for (size_t i = 0; i < OrderedKernels.size(); i++) {
952           Metadata *AttrMDArgs[1] = {
953               ConstantAsMetadata::get(Builder.getInt32(i)),
954           };
955           OrderedKernels[i]->setMetadata("llvm.amdgcn.lds.kernel.id",
956                                          MDNode::get(Ctx, AttrMDArgs));
957 
958           markUsedByKernel(Builder, OrderedKernels[i],
959                            KernelToReplacement[OrderedKernels[i]].SGV);
960         }
961       }
962 
963       // The order must be consistent between lookup table and accesses to
964       // lookup table
965       std::vector<GlobalVariable *> TableLookupVariablesOrdered(
966           TableLookupVariables.begin(), TableLookupVariables.end());
967       llvm::sort(TableLookupVariablesOrdered.begin(),
968                  TableLookupVariablesOrdered.end(),
969                  [](const GlobalVariable *lhs, const GlobalVariable *rhs) {
970                    return lhs->getName() < rhs->getName();
971                  });
972 
973       GlobalVariable *LookupTable = buildLookupTable(
974           M, TableLookupVariablesOrdered, OrderedKernels, KernelToReplacement);
975       replaceUsesInInstructionsWithTableLookup(M, TableLookupVariablesOrdered,
976                                                LookupTable);
977     }
978 
979     for (auto &GV : make_early_inc_range(M.globals()))
980       if (AMDGPU::isLDSVariableToLower(GV)) {
981 
982         // probably want to remove from used lists
983         GV.removeDeadConstantUsers();
984         if (GV.use_empty())
985           GV.eraseFromParent();
986       }
987 
988     return Changed;
989   }
990 
991 private:
992   // Increase the alignment of LDS globals if necessary to maximise the chance
993   // that we can use aligned LDS instructions to access them.
994   static bool superAlignLDSGlobals(Module &M) {
995     const DataLayout &DL = M.getDataLayout();
996     bool Changed = false;
997     if (!SuperAlignLDSGlobals) {
998       return Changed;
999     }
1000 
1001     for (auto &GV : M.globals()) {
1002       if (GV.getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) {
1003         // Only changing alignment of LDS variables
1004         continue;
1005       }
1006       if (!GV.hasInitializer()) {
1007         // cuda/hip extern __shared__ variable, leave alignment alone
1008         continue;
1009       }
1010 
1011       Align Alignment = AMDGPU::getAlign(DL, &GV);
1012       TypeSize GVSize = DL.getTypeAllocSize(GV.getValueType());
1013 
1014       if (GVSize > 8) {
1015         // We might want to use a b96 or b128 load/store
1016         Alignment = std::max(Alignment, Align(16));
1017       } else if (GVSize > 4) {
1018         // We might want to use a b64 load/store
1019         Alignment = std::max(Alignment, Align(8));
1020       } else if (GVSize > 2) {
1021         // We might want to use a b32 load/store
1022         Alignment = std::max(Alignment, Align(4));
1023       } else if (GVSize > 1) {
1024         // We might want to use a b16 load/store
1025         Alignment = std::max(Alignment, Align(2));
1026       }
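      // e.g. (following the branches above) a 12 byte variable with natural
      // alignment 4 is raised to align 16 so a single b96/b128 access is possible.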
1027 
1028       if (Alignment != AMDGPU::getAlign(DL, &GV)) {
1029         Changed = true;
1030         GV.setAlignment(Alignment);
1031       }
1032     }
1033     return Changed;
1034   }
1035 
1036   static LDSVariableReplacement createLDSVariableReplacement(
1037       Module &M, std::string VarName,
1038       DenseSet<GlobalVariable *> const &LDSVarsToTransform) {
1039     // Create a struct instance containing LDSVarsToTransform and map from those
1040     // variables to ConstantExprGEP
1041     // Variables may be introduced to meet alignment requirements. No aliasing
1042     // metadata is useful for these as they have no uses. Erased before return.
1043 
1044     LLVMContext &Ctx = M.getContext();
1045     const DataLayout &DL = M.getDataLayout();
1046     assert(!LDSVarsToTransform.empty());
1047 
1048     SmallVector<OptimizedStructLayoutField, 8> LayoutFields;
1049     LayoutFields.reserve(LDSVarsToTransform.size());
1050     {
1051       // The order of fields in this struct depends on the order of
1052       // variables in the argument, which varies when changing how they
1053       // are identified, leading to spurious test breakage.
1054       std::vector<GlobalVariable *> Sorted(LDSVarsToTransform.begin(),
1055                                            LDSVarsToTransform.end());
1056       llvm::sort(Sorted.begin(), Sorted.end(),
1057                  [](const GlobalVariable *lhs, const GlobalVariable *rhs) {
1058                    return lhs->getName() < rhs->getName();
1059                  });
1060       for (GlobalVariable *GV : Sorted) {
1061         OptimizedStructLayoutField F(GV,
1062                                      DL.getTypeAllocSize(GV->getValueType()),
1063                                      AMDGPU::getAlign(DL, GV));
1064         LayoutFields.emplace_back(F);
1065       }
1066     }
1067 
1068     performOptimizedStructLayout(LayoutFields);
1069 
1070     std::vector<GlobalVariable *> LocalVars;
1071     BitVector IsPaddingField;
1072     LocalVars.reserve(LDSVarsToTransform.size()); // will be at least this large
1073     IsPaddingField.reserve(LDSVarsToTransform.size());
1074     {
1075       uint64_t CurrentOffset = 0;
1076       for (size_t I = 0; I < LayoutFields.size(); I++) {
1077         GlobalVariable *FGV = static_cast<GlobalVariable *>(
1078             const_cast<void *>(LayoutFields[I].Id));
1079         Align DataAlign = LayoutFields[I].Alignment;
1080 
1081         uint64_t DataAlignV = DataAlign.value();
1082         if (uint64_t Rem = CurrentOffset % DataAlignV) {
1083           uint64_t Padding = DataAlignV - Rem;
1084 
1085           // Append an array of padding bytes to meet alignment requested
1086           // Note (o +      (a - (o % a)) ) % a == 0
1087           //      (offset + Padding       ) % align == 0
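          // e.g. CurrentOffset == 6 with DataAlignV == 4 gives Rem == 2 and
          //      Padding == 2, so the next field starts at offset 8.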
1088 
1089           Type *ATy = ArrayType::get(Type::getInt8Ty(Ctx), Padding);
1090           LocalVars.push_back(new GlobalVariable(
1091               M, ATy, false, GlobalValue::InternalLinkage, UndefValue::get(ATy),
1092               "", nullptr, GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS,
1093               false));
1094           IsPaddingField.push_back(true);
1095           CurrentOffset += Padding;
1096         }
1097 
1098         LocalVars.push_back(FGV);
1099         IsPaddingField.push_back(false);
1100         CurrentOffset += LayoutFields[I].Size;
1101       }
1102     }
1103 
1104     std::vector<Type *> LocalVarTypes;
1105     LocalVarTypes.reserve(LocalVars.size());
1106     std::transform(
1107         LocalVars.cbegin(), LocalVars.cend(), std::back_inserter(LocalVarTypes),
1108         [](const GlobalVariable *V) -> Type * { return V->getValueType(); });
1109 
1110     StructType *LDSTy = StructType::create(Ctx, LocalVarTypes, VarName + ".t");
1111 
1112     Align StructAlign = AMDGPU::getAlign(DL, LocalVars[0]);
1113 
1114     GlobalVariable *SGV = new GlobalVariable(
1115         M, LDSTy, false, GlobalValue::InternalLinkage, UndefValue::get(LDSTy),
1116         VarName, nullptr, GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS,
1117         false);
1118     SGV->setAlignment(StructAlign);
1119 
1120     DenseMap<GlobalVariable *, Constant *> Map;
1121     Type *I32 = Type::getInt32Ty(Ctx);
1122     for (size_t I = 0; I < LocalVars.size(); I++) {
1123       GlobalVariable *GV = LocalVars[I];
1124       Constant *GEPIdx[] = {ConstantInt::get(I32, 0), ConstantInt::get(I32, I)};
1125       Constant *GEP = ConstantExpr::getGetElementPtr(LDSTy, SGV, GEPIdx, true);
1126       if (IsPaddingField[I]) {
1127         assert(GV->use_empty());
1128         GV->eraseFromParent();
1129       } else {
1130         Map[GV] = GEP;
1131       }
1132     }
1133     assert(Map.size() == LDSVarsToTransform.size());
1134     return {SGV, std::move(Map)};
1135   }
1136 
1137   template <typename PredicateTy>
1138   void replaceLDSVariablesWithStruct(
1139       Module &M, DenseSet<GlobalVariable *> const &LDSVarsToTransformArg,
1140       LDSVariableReplacement Replacement, PredicateTy Predicate) {
1141     LLVMContext &Ctx = M.getContext();
1142     const DataLayout &DL = M.getDataLayout();
1143 
1144     // A hack... we need to insert the aliasing info in a predictable order for
1145     // lit tests. Would like to have them in a stable order already, ideally the
1146     // same order they get allocated, which might mean an ordered set container
1147     std::vector<GlobalVariable *> LDSVarsToTransform(
1148         LDSVarsToTransformArg.begin(), LDSVarsToTransformArg.end());
1149     llvm::sort(LDSVarsToTransform.begin(), LDSVarsToTransform.end(),
1150                [](const GlobalVariable *lhs, const GlobalVariable *rhs) {
1151                  return lhs->getName() < rhs->getName();
1152                });
1153 
1154     // Create alias.scope and their lists. Each field in the new structure
1155     // does not alias any of the other fields.
1156     SmallVector<MDNode *> AliasScopes;
1157     SmallVector<Metadata *> NoAliasList;
1158     const size_t NumberVars = LDSVarsToTransform.size();
1159     if (NumberVars > 1) {
1160       MDBuilder MDB(Ctx);
1161       AliasScopes.reserve(NumberVars);
1162       MDNode *Domain = MDB.createAnonymousAliasScopeDomain();
1163       for (size_t I = 0; I < NumberVars; I++) {
1164         MDNode *Scope = MDB.createAnonymousAliasScope(Domain);
1165         AliasScopes.push_back(Scope);
1166       }
1167       NoAliasList.append(&AliasScopes[1], AliasScopes.end());
1168     }
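    // e.g. with three variables, the accesses rewritten for variable 1 end up
    // carrying !alias.scope {scope1} and !noalias {scope0, scope2}.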
1169 
1170     // Replace uses of ith variable with a constantexpr to the corresponding
1171     // field of the instance that will be allocated by AMDGPUMachineFunction
1172     for (size_t I = 0; I < NumberVars; I++) {
1173       GlobalVariable *GV = LDSVarsToTransform[I];
1174       Constant *GEP = Replacement.LDSVarsToConstantGEP[GV];
1175 
1176       GV->replaceUsesWithIf(GEP, Predicate);
1177 
1178       APInt APOff(DL.getIndexTypeSizeInBits(GEP->getType()), 0);
1179       GEP->stripAndAccumulateInBoundsConstantOffsets(DL, APOff);
1180       uint64_t Offset = APOff.getZExtValue();
1181 
1182       Align A =
1183           commonAlignment(Replacement.SGV->getAlign().valueOrOne(), Offset);
1184 
1185       if (I)
1186         NoAliasList[I - 1] = AliasScopes[I - 1];
1187       MDNode *NoAlias =
1188           NoAliasList.empty() ? nullptr : MDNode::get(Ctx, NoAliasList);
1189       MDNode *AliasScope =
1190           AliasScopes.empty() ? nullptr : MDNode::get(Ctx, {AliasScopes[I]});
1191 
1192       refineUsesAlignmentAndAA(GEP, A, DL, AliasScope, NoAlias);
1193     }
1194   }
1195 
1196   void refineUsesAlignmentAndAA(Value *Ptr, Align A, const DataLayout &DL,
1197                                 MDNode *AliasScope, MDNode *NoAlias,
1198                                 unsigned MaxDepth = 5) {
1199     if (!MaxDepth || (A == 1 && !AliasScope))
1200       return;
1201 
1202     for (User *U : Ptr->users()) {
1203       if (auto *I = dyn_cast<Instruction>(U)) {
1204         if (AliasScope && I->mayReadOrWriteMemory()) {
1205           MDNode *AS = I->getMetadata(LLVMContext::MD_alias_scope);
1206           AS = (AS ? MDNode::getMostGenericAliasScope(AS, AliasScope)
1207                    : AliasScope);
1208           I->setMetadata(LLVMContext::MD_alias_scope, AS);
1209 
1210           MDNode *NA = I->getMetadata(LLVMContext::MD_noalias);
1211           NA = (NA ? MDNode::intersect(NA, NoAlias) : NoAlias);
1212           I->setMetadata(LLVMContext::MD_noalias, NA);
1213         }
1214       }
1215 
1216       if (auto *LI = dyn_cast<LoadInst>(U)) {
1217         LI->setAlignment(std::max(A, LI->getAlign()));
1218         continue;
1219       }
1220       if (auto *SI = dyn_cast<StoreInst>(U)) {
1221         if (SI->getPointerOperand() == Ptr)
1222           SI->setAlignment(std::max(A, SI->getAlign()));
1223         continue;
1224       }
1225       if (auto *AI = dyn_cast<AtomicRMWInst>(U)) {
1226         // None of the atomicrmw operations can work on pointers, but let's
1227         // check anyway in case that changes or we are processing a ConstantExpr.
1228         if (AI->getPointerOperand() == Ptr)
1229           AI->setAlignment(std::max(A, AI->getAlign()));
1230         continue;
1231       }
1232       if (auto *AI = dyn_cast<AtomicCmpXchgInst>(U)) {
1233         if (AI->getPointerOperand() == Ptr)
1234           AI->setAlignment(std::max(A, AI->getAlign()));
1235         continue;
1236       }
1237       if (auto *GEP = dyn_cast<GetElementPtrInst>(U)) {
1238         unsigned BitWidth = DL.getIndexTypeSizeInBits(GEP->getType());
1239         APInt Off(BitWidth, 0);
1240         if (GEP->getPointerOperand() == Ptr) {
1241           Align GA;
1242           if (GEP->accumulateConstantOffset(DL, Off))
1243             GA = commonAlignment(A, Off.getLimitedValue());
1244           refineUsesAlignmentAndAA(GEP, GA, DL, AliasScope, NoAlias,
1245                                    MaxDepth - 1);
1246         }
1247         continue;
1248       }
1249       if (auto *I = dyn_cast<Instruction>(U)) {
1250         if (I->getOpcode() == Instruction::BitCast ||
1251             I->getOpcode() == Instruction::AddrSpaceCast)
1252           refineUsesAlignmentAndAA(I, A, DL, AliasScope, NoAlias, MaxDepth - 1);
1253       }
1254     }
1255   }
1256 };
1257 
1258 } // namespace
1259 char AMDGPULowerModuleLDS::ID = 0;
1260 
1261 char &llvm::AMDGPULowerModuleLDSID = AMDGPULowerModuleLDS::ID;
1262 
1263 INITIALIZE_PASS(AMDGPULowerModuleLDS, DEBUG_TYPE,
1264                 "Lower uses of LDS variables from non-kernel functions", false,
1265                 false)
1266 
1267 ModulePass *llvm::createAMDGPULowerModuleLDSPass() {
1268   return new AMDGPULowerModuleLDS();
1269 }
1270 
1271 PreservedAnalyses AMDGPULowerModuleLDSPass::run(Module &M,
1272                                                 ModuleAnalysisManager &) {
1273   return AMDGPULowerModuleLDS().runOnModule(M) ? PreservedAnalyses::none()
1274                                                : PreservedAnalyses::all();
1275 }
1276