//===-- AMDGPUMemoryUtils.cpp - -------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "AMDGPUMemoryUtils.h"
#include "AMDGPU.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SetOperations.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/ReplaceConstant.h"

#define DEBUG_TYPE "amdgpu-memory-utils"

using namespace llvm;

namespace llvm::AMDGPU {

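// Return the alignment to use for \p GV: its explicit alignment if set,
// otherwise the ABI alignment of its value type.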
Align getAlign(const DataLayout &DL, const GlobalVariable *GV) {
  return DL.getValueOrABITypeAlignment(GV->getPointerAlignment(DL),
                                       GV->getValueType());
}

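// If \p GV is a named-barrier LDS variable, i.e. its value type is (or, via a
// chain of first struct elements, wraps) target("amdgcn.named.barrier"),
// return that target extension type; otherwise return nullptr.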
TargetExtType *isNamedBarrier(const GlobalVariable &GV) {
  // TODO: Allow arrays and structs, if all members are barriers
  // in the same scope.
  // TODO: Disallow other uses of target("amdgcn.named.barrier") including:
  // - Structs containing barriers in different scope.
  // - Structs containing a mixture of barriers and other data.
  // - Globals in other address spaces.
  // - Allocas.
  Type *Ty = GV.getValueType();
  while (true) {
    if (auto *TTy = dyn_cast<TargetExtType>(Ty))
      return TTy->getName() == "amdgcn.named.barrier" ? TTy : nullptr;
    if (auto *STy = dyn_cast<StructType>(Ty)) {
      if (STy->getNumElements() == 0)
        return nullptr;
      Ty = STy->getElementType(0);
      continue;
    }
    return nullptr;
  }
}

bool isDynamicLDS(const GlobalVariable &GV) {
  // An external, zero-size addrspace(3) variable without an initializer is
  // dynamic LDS.
  const Module *M = GV.getParent();
  const DataLayout &DL = M->getDataLayout();
  if (GV.getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
    return false;
  return DL.getTypeAllocSize(GV.getValueType()) == 0;
}

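// Return true if \p GV is an LDS (addrspace(3)) variable that the LDS lowering
// passes should process: dynamic LDS, or a non-constant variable without a
// meaningful initializer.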
bool isLDSVariableToLower(const GlobalVariable &GV) {
  if (GV.getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) {
    return false;
  }
  if (isDynamicLDS(GV)) {
    return true;
  }
  if (GV.isConstant()) {
    // A constant undef variable can't be written to, and any load is
    // undef, so it should be eliminated by the optimizer. It could be
    // dropped by the back end if not. This pass skips over it.
    return false;
  }
  if (GV.hasInitializer() && !isa<UndefValue>(GV.getInitializer())) {
    // Initializers are unimplemented for LDS address space.
    // Leave such variables in place for consistent error reporting.
    return false;
  }
  return true;
}

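// Rewrite ConstantExpr uses of LDS variables into equivalent instructions so
// that every use of an LDS global is local to a single function.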
bool eliminateConstantExprUsesOfLDSFromAllInstructions(Module &M) {
  // Constants are uniqued within LLVM. A ConstantExpr referring to an LDS
  // global may therefore have uses from multiple different functions.
  // This pass specialises LDS variables with respect to the kernel that
  // allocates them.

  // This is semantically equivalent to the following, which is not implemented
  // because it would be slow:
  // for (auto &F : M.functions())
  //   for (auto &BB : F)
  //     for (auto &I : BB)
  //       for (Use &Op : I.operands())
  //         if (constantExprUsesLDS(Op))
  //           replaceConstantExprInFunction(I, Op);

  SmallVector<Constant *> LDSGlobals;
  for (auto &GV : M.globals())
    if (AMDGPU::isLDSVariableToLower(GV))
      LDSGlobals.push_back(&GV);
  return convertUsersOfConstantsToInstructions(LDSGlobals);
}

void getUsesOfLDSByFunction(const CallGraph &CG, Module &M,
                            FunctionVariableMap &kernels,
                            FunctionVariableMap &Functions) {
  // Get uses from the current function, excluding uses by called functions.
  // Two output maps are used to avoid walking the globals list twice.
  for (auto &GV : M.globals()) {
    if (!AMDGPU::isLDSVariableToLower(GV))
      continue;
    for (User *V : GV.users()) {
      if (auto *I = dyn_cast<Instruction>(V)) {
        Function *F = I->getFunction();
        if (isKernelLDS(F))
          kernels[F].insert(&GV);
        else
          Functions[F].insert(&GV);
      }
    }
  }
}

bool isKernelLDS(const Function *F) {
  // Some weirdness here. AMDGPU::isKernelCC does not call into
  // AMDGPU::isKernel with the calling conv; it instead calls into
  // isModuleEntryFunction, which returns true for more calling conventions
  // than AMDGPU::isKernel does. There's a FIXME on AMDGPU::isKernel.
  // There's also a test that checks that the LDS lowering does not hit on
  // a graphics shader, denoted amdgpu_ps, so stay with the limited case.
  // Putting LDS in the name of the function to draw attention to this.
  return AMDGPU::isKernel(F->getCallingConv());
}

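// Build a summary of LDS usage per kernel: which LDS variables each kernel
// uses directly, and which it may reach indirectly through its (transitive)
// callees, conservatively including variables reachable through function
// pointers when unknown calls are present.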
LDSUsesInfoTy getTransitiveUsesOfLDS(const CallGraph &CG, Module &M) {

  FunctionVariableMap DirectMapKernel;
  FunctionVariableMap DirectMapFunction;
  getUsesOfLDSByFunction(CG, M, DirectMapKernel, DirectMapFunction);

  // Collect functions whose address has escaped
  DenseSet<Function *> AddressTakenFuncs;
  for (Function &F : M.functions()) {
    if (!isKernelLDS(&F))
      if (F.hasAddressTaken(nullptr,
                            /* IgnoreCallbackUses */ false,
                            /* IgnoreAssumeLikeCalls */ false,
                            /* IgnoreLLVMUsed */ true,
                            /* IgnoreArcAttachedCall */ false)) {
        AddressTakenFuncs.insert(&F);
      }
  }

  // Collect variables that are used by functions whose address has escaped
  DenseSet<GlobalVariable *> VariablesReachableThroughFunctionPointer;
  for (Function *F : AddressTakenFuncs) {
    set_union(VariablesReachableThroughFunctionPointer, DirectMapFunction[F]);
  }

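  // A call record with a null callee function is an unknown call, e.g. an
  // indirect call through a function pointer.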
  auto FunctionMakesUnknownCall = [&](const Function *F) -> bool {
    assert(!F->isDeclaration());
    for (const CallGraphNode::CallRecord &R : *CG[F]) {
      if (!R.second->getFunction())
        return true;
    }
    return false;
  };

  // Work out which variables are reachable through function calls
  FunctionVariableMap TransitiveMapFunction = DirectMapFunction;

  // If the function makes any unknown call, assume the worst case that it can
  // access all variables accessed by functions whose address escaped
  for (Function &F : M.functions()) {
    if (!F.isDeclaration() && FunctionMakesUnknownCall(&F)) {
      if (!isKernelLDS(&F)) {
        set_union(TransitiveMapFunction[&F],
                  VariablesReachableThroughFunctionPointer);
      }
    }
  }

  // Direct implementation of collecting all variables reachable from each
  // function
  for (Function &Func : M.functions()) {
    if (Func.isDeclaration() || isKernelLDS(&Func))
      continue;

    DenseSet<Function *> seen; // catches cycles
    SmallVector<Function *, 4> wip = {&Func};

    while (!wip.empty()) {
      Function *F = wip.pop_back_val();

      // This could be accelerated by consulting the transitive map for
      // functions that have already been computed, at the cost of extra
      // bookkeeping.
      set_union(TransitiveMapFunction[&Func], DirectMapFunction[F]);

      for (const CallGraphNode::CallRecord &R : *CG[F]) {
        Function *Ith = R.second->getFunction();
        if (Ith) {
          if (!seen.contains(Ith)) {
            seen.insert(Ith);
            wip.push_back(Ith);
          }
        }
      }
    }
  }

  // Collect variables that are transitively used by functions whose address
  // has escaped
  for (Function *F : AddressTakenFuncs) {
    set_union(VariablesReachableThroughFunctionPointer,
              TransitiveMapFunction[F]);
  }

  // DirectMapKernel lists which variables are used directly by each kernel;
  // now find the variables that are used through a function call.
  FunctionVariableMap IndirectMapKernel;

  for (Function &Func : M.functions()) {
    if (Func.isDeclaration() || !isKernelLDS(&Func))
      continue;

    for (const CallGraphNode::CallRecord &R : *CG[&Func]) {
      Function *Ith = R.second->getFunction();
      if (Ith) {
        set_union(IndirectMapKernel[&Func], TransitiveMapFunction[Ith]);
      }
    }

    // Check whether the kernel encounters unknown calls, either directly or
    // indirectly.
    bool SeesUnknownCalls = [&]() {
      SmallVector<Function *> WorkList = {CG[&Func]->getFunction()};
      SmallPtrSet<Function *, 8> Visited;

      while (!WorkList.empty()) {
        Function *F = WorkList.pop_back_val();

        for (const CallGraphNode::CallRecord &CallRecord : *CG[F]) {
          if (!CallRecord.second)
            continue;

          Function *Callee = CallRecord.second->getFunction();
          if (!Callee)
            return true;

          if (Visited.insert(Callee).second)
            WorkList.push_back(Callee);
        }
      }
      return false;
    }();

    if (SeesUnknownCalls) {
      set_union(IndirectMapKernel[&Func],
                VariablesReachableThroughFunctionPointer);
    }
  }

  // Verify that we fall into one of two cases:
  //    - All variables are either absolute or directly mapped dynamic LDS that
  //      is not lowered; this is a re-run of the pass, so there is nothing to
  //      do.
  //    - No variables are absolute.
  std::optional<bool> HasAbsoluteGVs;
  bool HasSpecialGVs = false;
  for (auto &Map : {DirectMapKernel, IndirectMapKernel}) {
    for (auto &[Fn, GVs] : Map) {
      for (auto *GV : GVs) {
        bool IsAbsolute = GV->isAbsoluteSymbolRef();
        bool IsDirectMapDynLDSGV =
            AMDGPU::isDynamicLDS(*GV) && DirectMapKernel.contains(Fn);
        if (IsDirectMapDynLDSGV)
          continue;
        if (isNamedBarrier(*GV)) {
          HasSpecialGVs = true;
          continue;
        }
        if (HasAbsoluteGVs.has_value()) {
          if (*HasAbsoluteGVs != IsAbsolute) {
            report_fatal_error(
                "Module cannot mix absolute and non-absolute LDS GVs");
          }
        } else
          HasAbsoluteGVs = IsAbsolute;
      }
    }
  }

  // If we only had absolute GVs, we have nothing to do; return an empty
  // result.
  if (HasAbsoluteGVs && *HasAbsoluteGVs)
    return {FunctionVariableMap(), FunctionVariableMap(), false};

  return {std::move(DirectMapKernel), std::move(IndirectMapKernel),
          HasSpecialGVs};
}

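// Strip each attribute in \p FnAttrs from \p KernelRoot and from every
// function reachable from it in the call graph. If an unknown (indirect) call
// is encountered, conservatively strip the attributes from every non-kernel
// function reachable from the external calling node, i.e. any potential
// indirect-call target.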
void removeFnAttrFromReachable(CallGraph &CG, Function *KernelRoot,
                               ArrayRef<StringRef> FnAttrs) {
  for (StringRef Attr : FnAttrs)
    KernelRoot->removeFnAttr(Attr);

  SmallVector<Function *> WorkList = {CG[KernelRoot]->getFunction()};
  SmallPtrSet<Function *, 8> Visited;
  bool SeenUnknownCall = false;

  while (!WorkList.empty()) {
    Function *F = WorkList.pop_back_val();

    for (auto &CallRecord : *CG[F]) {
      if (!CallRecord.second)
        continue;

      Function *Callee = CallRecord.second->getFunction();
      if (!Callee) {
        if (!SeenUnknownCall) {
          SeenUnknownCall = true;

          // If we see any indirect calls, assume nothing about potential
          // targets.
          // TODO: This could be refined to possible LDS global users.
          for (auto &ExternalCallRecord : *CG.getExternalCallingNode()) {
            Function *PotentialCallee =
                ExternalCallRecord.second->getFunction();
            assert(PotentialCallee);
            if (!isKernelLDS(PotentialCallee)) {
              for (StringRef Attr : FnAttrs)
                PotentialCallee->removeFnAttr(Attr);
            }
          }
        }
      } else {
        for (StringRef Attr : FnAttrs)
          Callee->removeFnAttr(Attr);
        if (Visited.insert(Callee).second)
          WorkList.push_back(Callee);
      }
    }
  }
}

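// Given a clobbering MemoryDef reported by MemorySSA, decide whether it can
// actually modify the memory at \p Ptr. Fences, AMDGPU barrier intrinsics, and
// atomics proven not to alias \p Ptr are treated as non-clobbers even though
// MemorySSA models them as universal MemoryDefs.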
bool isReallyAClobber(const Value *Ptr, MemoryDef *Def, AAResults *AA) {
  Instruction *DefInst = Def->getMemoryInst();

  if (isa<FenceInst>(DefInst))
    return false;

  if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(DefInst)) {
    switch (II->getIntrinsicID()) {
    case Intrinsic::amdgcn_s_barrier:
    case Intrinsic::amdgcn_s_barrier_signal:
    case Intrinsic::amdgcn_s_barrier_signal_var:
    case Intrinsic::amdgcn_s_barrier_signal_isfirst:
    case Intrinsic::amdgcn_s_barrier_init:
    case Intrinsic::amdgcn_s_barrier_join:
    case Intrinsic::amdgcn_s_barrier_wait:
    case Intrinsic::amdgcn_s_barrier_leave:
    case Intrinsic::amdgcn_s_get_barrier_state:
    case Intrinsic::amdgcn_wave_barrier:
    case Intrinsic::amdgcn_sched_barrier:
    case Intrinsic::amdgcn_sched_group_barrier:
      return false;
    default:
      break;
    }
  }

  // Ignore atomics that do not alias the original load; like a fence, any
  // atomic is a universal MemoryDef from MemorySSA's point of view.
  const auto checkNoAlias = [AA, Ptr](auto I) -> bool {
    return I && AA->isNoAlias(I->getPointerOperand(), Ptr);
  };

  if (checkNoAlias(dyn_cast<AtomicCmpXchgInst>(DefInst)) ||
      checkNoAlias(dyn_cast<AtomicRMWInst>(DefInst)))
    return false;

  return true;
}

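// Walk MemorySSA upwards from \p Load's clobbering access and report whether
// any real clobber (per isReallyAClobber) of the loaded location is found
// within the function.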
bool isClobberedInFunction(const LoadInst *Load, MemorySSA *MSSA,
                           AAResults *AA) {
  MemorySSAWalker *Walker = MSSA->getWalker();
  SmallVector<MemoryAccess *> WorkList{Walker->getClobberingMemoryAccess(Load)};
  SmallSet<MemoryAccess *, 8> Visited;
  MemoryLocation Loc(MemoryLocation::get(Load));

  LLVM_DEBUG(dbgs() << "Checking clobbering of: " << *Load << '\n');

  // Start with the nearest dominating clobbering access: it will be either
  // live on entry (nothing to do, the load is not clobbered), a MemoryDef, or
  // a MemoryPhi if several MemoryDefs can define this memory state. In the
  // latter case add all Defs to the WorkList and keep walking up, checking all
  // the definitions of this memory location until the root. If all the defs
  // are exhausted and we reach the entry state, there is no clobber. Along the
  // scan, ignore barriers and fences, which MemorySSA considers clobbers even
  // though they do not really write anything into memory.
  while (!WorkList.empty()) {
    MemoryAccess *MA = WorkList.pop_back_val();
    if (!Visited.insert(MA).second)
      continue;

    if (MSSA->isLiveOnEntryDef(MA))
      continue;

    if (MemoryDef *Def = dyn_cast<MemoryDef>(MA)) {
      LLVM_DEBUG(dbgs() << "  Def: " << *Def->getMemoryInst() << '\n');

      if (isReallyAClobber(Load->getPointerOperand(), Def, AA)) {
        LLVM_DEBUG(dbgs() << "      -> load is clobbered\n");
        return true;
      }

      WorkList.push_back(
          Walker->getClobberingMemoryAccess(Def->getDefiningAccess(), Loc));
      continue;
    }

    const MemoryPhi *Phi = cast<MemoryPhi>(MA);
    for (const auto &Use : Phi->incoming_values())
      WorkList.push_back(cast<MemoryAccess>(&Use));
  }

  LLVM_DEBUG(dbgs() << "      -> no clobber\n");
  return false;
}

} // end namespace llvm::AMDGPU