//===-- AMDGPULateCodeGenPrepare.cpp --------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass does misc. AMDGPU optimizations on IR *just* before instruction
/// selection.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/Utils/Local.h"
#include <numeric>

#define DEBUG_TYPE "amdgpu-late-codegenprepare"

using namespace llvm;

// Scalar load widening needs to run after the load-store-vectorizer, as that
// pass doesn't handle overlapping cases. In addition, this pass enhances the
// widening to handle cases where scalar sub-dword loads are only naturally
// aligned but not dword aligned.
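// For example, the widening below can be disabled from the command line (e.g.
// when invoking llc) with -amdgpu-late-codegenprepare-widen-constant-loads=0.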
static cl::opt<bool>
    WidenLoads("amdgpu-late-codegenprepare-widen-constant-loads",
               cl::desc("Widen sub-dword constant address space loads in "
                        "AMDGPULateCodeGenPrepare"),
               cl::ReallyHidden, cl::init(true));

namespace {

class AMDGPULateCodeGenPrepare
    : public FunctionPass,
      public InstVisitor<AMDGPULateCodeGenPrepare, bool> {
  Module *Mod = nullptr;
  const DataLayout *DL = nullptr;

  AssumptionCache *AC = nullptr;
  UniformityInfo *UA = nullptr;

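  // Instructions that become trivially dead during this pass; they are
  // deleted in bulk at the end of runOnFunction().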
  SmallVector<WeakTrackingVH, 8> DeadInsts;

public:
  static char ID;

  AMDGPULateCodeGenPrepare() : FunctionPass(ID) {}

  StringRef getPassName() const override {
    return "AMDGPU IR late optimizations";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<TargetPassConfig>();
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<UniformityInfoWrapperPass>();
    AU.setPreservesAll();
  }

  bool doInitialization(Module &M) override;
  bool runOnFunction(Function &F) override;

  bool visitInstruction(Instruction &) { return false; }

  // Check if the specified value is at least DWORD aligned.
  bool isDWORDAligned(const Value *V) const {
    KnownBits Known = computeKnownBits(V, *DL, 0, AC);
    return Known.countMinTrailingZeros() >= 2;
  }

  bool canWidenScalarExtLoad(LoadInst &LI) const;
  bool visitLoadInst(LoadInst &LI);
};

using ValueToValueMap = DenseMap<const Value *, Value *>;

class LiveRegOptimizer {
private:
  Module *Mod = nullptr;
  const DataLayout *DL = nullptr;
  const GCNSubtarget *ST;
  /// The scalar type to convert to
  Type *ConvertToScalar;
  /// The set of visited Instructions
  SmallPtrSet<Instruction *, 4> Visited;
  /// Map of Value -> Converted Value
  ValueToValueMap ValMap;
  /// Per-BB map of conversions from Optimal Type -> Original Type.
  DenseMap<BasicBlock *, ValueToValueMap> BBUseValMap;

public:
  /// Calculate and \p return the type to convert to given a problematic \p
  /// OriginalType. In some instances, we may widen the type (e.g. v2i8 -> i32).
  Type *calculateConvertType(Type *OriginalType);
  /// Convert the virtual register defined by \p V to the compatible vector of
  /// legal type.
  Value *convertToOptType(Instruction *V, BasicBlock::iterator &InstPt);
  /// Convert the virtual register defined by \p V back to the original type \p
  /// ConvertType, stripping away the MSBs in cases where there was an imperfect
  /// fit (e.g. v2i32 -> v7i8).
  Value *convertFromOptType(Type *ConvertType, Instruction *V,
                            BasicBlock::iterator &InstPt,
                            BasicBlock *InsertBlock);
  /// Check for problematic PHI nodes or cross-bb values based on the value
  /// defined by \p I, and coerce to legal types if necessary. For problematic
  /// PHI nodes, we coerce all incoming values in a single invocation.
  bool optimizeLiveType(Instruction *I,
                        SmallVectorImpl<WeakTrackingVH> &DeadInsts);

  // Whether or not the type should be replaced to avoid inefficient
  // legalization code.
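  // For example, v4i8 has an illegal i8 element type on AMDGPU and is worth
  // coercing to i32, while v2i32 already has a legal element type and is left
  // untouched.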
  bool shouldReplace(Type *ITy) {
    FixedVectorType *VTy = dyn_cast<FixedVectorType>(ITy);
    if (!VTy)
      return false;

    auto *TLI = ST->getTargetLowering();

    Type *EltTy = VTy->getElementType();
    // If the element type is wider than the scalar type we convert to, we
    // can't do any bit packing.
    if (!EltTy->isIntegerTy() ||
        EltTy->getScalarSizeInBits() > ConvertToScalar->getScalarSizeInBits())
      return false;

    // Only coerce illegal types.
    TargetLoweringBase::LegalizeKind LK =
        TLI->getTypeConversion(EltTy->getContext(), EVT::getEVT(EltTy, false));
    return LK.first != TargetLoweringBase::TypeLegal;
  }

  LiveRegOptimizer(Module *Mod, const GCNSubtarget *ST) : Mod(Mod), ST(ST) {
    DL = &Mod->getDataLayout();
    ConvertToScalar = Type::getInt32Ty(Mod->getContext());
  }
};

} // end anonymous namespace

bool AMDGPULateCodeGenPrepare::doInitialization(Module &M) {
  Mod = &M;
  DL = &Mod->getDataLayout();
  return false;
}

bool AMDGPULateCodeGenPrepare::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
  const TargetMachine &TM = TPC.getTM<TargetMachine>();
  const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);

  AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
  UA = &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();

  // "Optimize" the virtual regs that cross basic block boundaries. When
  // building the SelectionDAG, vectors of illegal types that cross basic
  // blocks will be scalarized and widened, with each scalar living in its
  // own register. To work around this, the optimization converts the
  // vectors to equivalent vectors of legal type (which are converted back
  // before uses in subsequent blocks), to pack the bits into fewer physical
  // registers (used in CopyToReg/CopyFromReg pairs).
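  // For example (illustrative IR), a <4 x i8> value defined in one block and
  // used in another:
  //   bb0:  %v = ... <4 x i8> ...
  //   bb1:  use(%v)
  // is rewritten so the live value crossing the edge is a single i32:
  //   bb0:  %v.bc = bitcast <4 x i8> %v to i32
  //   bb1:  %v.orig = bitcast i32 %v.bc to <4 x i8> ; use(%v.orig)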
  LiveRegOptimizer LRO(Mod, &ST);

  bool Changed = false;

  bool HasScalarSubwordLoads = ST.hasScalarSubwordLoads();

  for (auto &BB : reverse(F))
    for (Instruction &I : make_early_inc_range(reverse(BB))) {
      Changed |= !HasScalarSubwordLoads && visit(I);
      Changed |= LRO.optimizeLiveType(&I, DeadInsts);
    }

  RecursivelyDeleteTriviallyDeadInstructionsPermissive(DeadInsts);
  return Changed;
}

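// Examples, assuming ConvertToScalar is i32: v3i8 (24 bits) fits in a single
// i32, so the convert type is i32; v7i8 (56 bits) needs two 32-bit elements,
// so the convert type is <2 x i32>.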
Type *LiveRegOptimizer::calculateConvertType(Type *OriginalType) {
  assert(OriginalType->getScalarSizeInBits() <=
         ConvertToScalar->getScalarSizeInBits());

  FixedVectorType *VTy = cast<FixedVectorType>(OriginalType);

  TypeSize OriginalSize = DL->getTypeSizeInBits(VTy);
  TypeSize ConvertScalarSize = DL->getTypeSizeInBits(ConvertToScalar);
  unsigned ConvertEltCount =
      (OriginalSize + ConvertScalarSize - 1) / ConvertScalarSize;

  if (OriginalSize <= ConvertScalarSize)
    return IntegerType::get(Mod->getContext(), ConvertScalarSize);

  return VectorType::get(Type::getIntNTy(Mod->getContext(), ConvertScalarSize),
                         ConvertEltCount, false);
}

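// For a bitsize mismatch such as v3i8 (24 bits) -> i32, the value is first
// widened with a shufflevector to <4 x i8> (the extra lane is poison) and then
// bitcast to the convert type, e.g.:
//   %widen = shufflevector <3 x i8> %v, <3 x i8> poison,
//                          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
//   %v.bc = bitcast <4 x i8> %widen to i32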
Value *LiveRegOptimizer::convertToOptType(Instruction *V,
                                          BasicBlock::iterator &InsertPt) {
  FixedVectorType *VTy = cast<FixedVectorType>(V->getType());
  Type *NewTy = calculateConvertType(V->getType());

  TypeSize OriginalSize = DL->getTypeSizeInBits(VTy);
  TypeSize NewSize = DL->getTypeSizeInBits(NewTy);

  IRBuilder<> Builder(V->getParent(), InsertPt);
  // If there is a bitsize match, we can fit the old vector into a new vector
  // of the desired type.
  if (OriginalSize == NewSize)
    return Builder.CreateBitCast(V, NewTy, V->getName() + ".bc");

  // If there is a bitsize mismatch, we must use a wider vector.
  assert(NewSize > OriginalSize);
  uint64_t ExpandedVecElementCount = NewSize / VTy->getScalarSizeInBits();

  SmallVector<int, 8> ShuffleMask;
  uint64_t OriginalElementCount = VTy->getElementCount().getFixedValue();
  for (unsigned I = 0; I < OriginalElementCount; I++)
    ShuffleMask.push_back(I);

  for (uint64_t I = OriginalElementCount; I < ExpandedVecElementCount; I++)
    ShuffleMask.push_back(OriginalElementCount);

  Value *ExpandedVec = Builder.CreateShuffleVector(V, ShuffleMask);
  return Builder.CreateBitCast(ExpandedVec, NewTy, V->getName() + ".bc");
}

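// The inverse of the widening above: to recover e.g. v7i8 from <2 x i32>, the
// value is bitcast to <8 x i8> and the low seven lanes are extracted with a
// shufflevector; a plain i32 holding a v3i8 is truncated to i24 and bitcast.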
Value *LiveRegOptimizer::convertFromOptType(Type *ConvertType, Instruction *V,
                                            BasicBlock::iterator &InsertPt,
                                            BasicBlock *InsertBB) {
  FixedVectorType *NewVTy = cast<FixedVectorType>(ConvertType);

  TypeSize OriginalSize = DL->getTypeSizeInBits(V->getType());
  TypeSize NewSize = DL->getTypeSizeInBits(NewVTy);

  IRBuilder<> Builder(InsertBB, InsertPt);
  // If there is a bitsize match, we simply convert back to the original type.
  if (OriginalSize == NewSize)
    return Builder.CreateBitCast(V, NewVTy, V->getName() + ".bc");

  // If there is a bitsize mismatch, then we must have used a wider value to
  // hold the bits.
  assert(OriginalSize > NewSize);
  // For wide scalars, we can just truncate the value.
  if (!V->getType()->isVectorTy()) {
    Instruction *Trunc = cast<Instruction>(
        Builder.CreateTrunc(V, IntegerType::get(Mod->getContext(), NewSize)));
    return cast<Instruction>(Builder.CreateBitCast(Trunc, NewVTy));
  }

  // For wider vectors, we must strip the MSBs to convert back to the original
  // type.
  VectorType *ExpandedVT = VectorType::get(
      Type::getIntNTy(Mod->getContext(), NewVTy->getScalarSizeInBits()),
      (OriginalSize / NewVTy->getScalarSizeInBits()), false);
  Instruction *Converted =
      cast<Instruction>(Builder.CreateBitCast(V, ExpandedVT));

  unsigned NarrowElementCount = NewVTy->getElementCount().getFixedValue();
  SmallVector<int, 8> ShuffleMask(NarrowElementCount);
  std::iota(ShuffleMask.begin(), ShuffleMask.end(), 0);

  return Builder.CreateShuffleVector(Converted, ShuffleMask);
}

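// optimizeLiveType proceeds in three phases: (1) a worklist walk from I over
// its users and the incoming values of any problematic PHIs collects the
// defs, the cross-block uses, and the PHI nodes to coerce; (2) each def is
// converted to the legal type right after its definition, and each PHI is
// rebuilt over the converted incoming values; (3) each collected use has its
// operands converted back to the original type at the top of its block, with
// the conversions cached per block.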
bool LiveRegOptimizer::optimizeLiveType(
    Instruction *I, SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
  SmallVector<Instruction *, 4> Worklist;
  SmallPtrSet<PHINode *, 4> PhiNodes;
  SmallPtrSet<Instruction *, 4> Defs;
  SmallPtrSet<Instruction *, 4> Uses;

  Worklist.push_back(cast<Instruction>(I));
  while (!Worklist.empty()) {
    Instruction *II = Worklist.pop_back_val();

    if (!Visited.insert(II).second)
      continue;

    if (!shouldReplace(II->getType()))
      continue;

    if (PHINode *Phi = dyn_cast<PHINode>(II)) {
      PhiNodes.insert(Phi);
      // Collect all the incoming values of problematic PHI nodes.
      for (Value *V : Phi->incoming_values()) {
        // Repeat the collection process for newly found PHI nodes.
        if (PHINode *OpPhi = dyn_cast<PHINode>(V)) {
          if (!PhiNodes.count(OpPhi) && !Visited.count(OpPhi))
            Worklist.push_back(OpPhi);
          continue;
        }

        Instruction *IncInst = dyn_cast<Instruction>(V);
        // Other incoming value types (e.g. vector literals) are unhandled.
        if (!IncInst && !isa<ConstantAggregateZero>(V))
          return false;

        // Collect all other incoming values for coercion.
        if (IncInst)
          Defs.insert(IncInst);
      }
    }

    // Collect all relevant uses.
    for (User *V : II->users()) {
      // Repeat the collection process for problematic PHI nodes.
      if (PHINode *OpPhi = dyn_cast<PHINode>(V)) {
        if (!PhiNodes.count(OpPhi) && !Visited.count(OpPhi))
          Worklist.push_back(OpPhi);
        continue;
      }

      Instruction *UseInst = cast<Instruction>(V);
      // Collect all uses of PHINodes and any use that crosses BB boundaries.
      if (UseInst->getParent() != II->getParent() || isa<PHINode>(II)) {
        Uses.insert(UseInst);
        if (!Defs.count(II) && !isa<PHINode>(II)) {
          Defs.insert(II);
        }
      }
    }
  }

  // Coerce and track the defs.
  for (Instruction *D : Defs) {
    if (!ValMap.contains(D)) {
      BasicBlock::iterator InsertPt = std::next(D->getIterator());
      Value *ConvertVal = convertToOptType(D, InsertPt);
      assert(ConvertVal);
      ValMap[D] = ConvertVal;
    }
  }

  // Construct new-typed PHI nodes.
  for (PHINode *Phi : PhiNodes) {
    ValMap[Phi] = PHINode::Create(calculateConvertType(Phi->getType()),
                                  Phi->getNumIncomingValues(),
                                  Phi->getName() + ".tc", Phi->getIterator());
  }

  // Connect all the PHI nodes with their new incoming values.
  for (PHINode *Phi : PhiNodes) {
    PHINode *NewPhi = cast<PHINode>(ValMap[Phi]);
    bool MissingIncVal = false;
    for (int I = 0, E = Phi->getNumIncomingValues(); I < E; I++) {
      Value *IncVal = Phi->getIncomingValue(I);
      if (isa<ConstantAggregateZero>(IncVal)) {
        Type *NewType = calculateConvertType(Phi->getType());
        NewPhi->addIncoming(ConstantInt::get(NewType, 0, false),
                            Phi->getIncomingBlock(I));
      } else if (ValMap.contains(IncVal) && ValMap[IncVal])
        NewPhi->addIncoming(ValMap[IncVal], Phi->getIncomingBlock(I));
      else
        MissingIncVal = true;
    }
    if (MissingIncVal) {
      Value *DeadVal = ValMap[Phi];
      // The coercion chain of the PHI is broken. Delete the Phi
      // from the ValMap and any connected / user Phis.
      SmallVector<Value *, 4> PHIWorklist;
      SmallPtrSet<Value *, 4> VisitedPhis;
      PHIWorklist.push_back(DeadVal);
      while (!PHIWorklist.empty()) {
        Value *NextDeadValue = PHIWorklist.pop_back_val();
        VisitedPhis.insert(NextDeadValue);
        auto OriginalPhi =
            std::find_if(PhiNodes.begin(), PhiNodes.end(),
                         [this, &NextDeadValue](PHINode *CandPhi) {
                           return ValMap[CandPhi] == NextDeadValue;
                         });
        // This PHI may have already been removed from maps when
        // unwinding a previous Phi.
        if (OriginalPhi != PhiNodes.end())
          ValMap.erase(*OriginalPhi);

        DeadInsts.emplace_back(cast<Instruction>(NextDeadValue));

        for (User *U : NextDeadValue->users()) {
          if (!VisitedPhis.contains(cast<PHINode>(U)))
            PHIWorklist.push_back(U);
        }
      }
    } else {
      DeadInsts.emplace_back(cast<Instruction>(Phi));
    }
  }
  // Coerce back to the original type and replace the uses.
  for (Instruction *U : Uses) {
    // Replace all converted operands for a use.
    for (auto [OpIdx, Op] : enumerate(U->operands())) {
      if (ValMap.contains(Op) && ValMap[Op]) {
        Value *NewVal = nullptr;
        if (BBUseValMap.contains(U->getParent()) &&
            BBUseValMap[U->getParent()].contains(ValMap[Op]))
          NewVal = BBUseValMap[U->getParent()][ValMap[Op]];
        else {
          BasicBlock::iterator InsertPt = U->getParent()->getFirstNonPHIIt();
          // We may pick up ops that were previously converted for users in
          // other blocks. If there is an originally typed definition of the Op
          // already in this block, simply reuse it.
          if (isa<Instruction>(Op) && !isa<PHINode>(Op) &&
              U->getParent() == cast<Instruction>(Op)->getParent()) {
            NewVal = Op;
          } else {
            NewVal =
                convertFromOptType(Op->getType(), cast<Instruction>(ValMap[Op]),
                                   InsertPt, U->getParent());
            BBUseValMap[U->getParent()][ValMap[Op]] = NewVal;
          }
        }
        assert(NewVal);
        U->setOperand(OpIdx, NewVal);
      }
    }
  }

  return true;
}

bool AMDGPULateCodeGenPrepare::canWidenScalarExtLoad(LoadInst &LI) const {
  unsigned AS = LI.getPointerAddressSpace();
  // Skip non-constant address spaces.
  if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
      AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;
  // Skip non-simple loads.
  if (!LI.isSimple())
    return false;
  Type *Ty = LI.getType();
  // Skip aggregate types.
  if (Ty->isAggregateType())
    return false;
  unsigned TySize = DL->getTypeStoreSize(Ty);
  // Only handle sub-DWORD loads.
  if (TySize >= 4)
    return false;
  // The load must be at least naturally aligned.
  if (LI.getAlign() < DL->getABITypeAlign(Ty))
    return false;
  // It should be uniform, i.e. a scalar load.
  return UA->isUniform(&LI);
}

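// As an illustrative example, an i16 load at byte offset 2 from a DWORD
// aligned base is widened into a DWORD load of the enclosing word followed by
// a shift and truncate, roughly:
//   %wide = load i32, ptr addrspace(4) %base, align 4
//   %shifted = lshr i32 %wide, 16
//   %val = trunc i32 %shifted to i16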
bool AMDGPULateCodeGenPrepare::visitLoadInst(LoadInst &LI) {
  if (!WidenLoads)
    return false;

  // Skip if the load is already at least DWORD aligned, as that case is
  // handled in SDAG.
  if (LI.getAlign() >= 4)
    return false;

  if (!canWidenScalarExtLoad(LI))
    return false;

  int64_t Offset = 0;
  auto *Base =
      GetPointerBaseWithConstantOffset(LI.getPointerOperand(), Offset, *DL);
  // If the base is not DWORD aligned, it's not safe to perform the following
  // transforms.
  if (!isDWORDAligned(Base))
    return false;

  int64_t Adjust = Offset & 0x3;
  if (Adjust == 0) {
    // With a zero adjust, the load is known to be DWORD aligned (the base is
    // DWORD aligned and the offset is a multiple of 4), so simply promote the
    // original alignment.
    LI.setAlignment(Align(4));
    return true;
  }

  IRBuilder<> IRB(&LI);
  IRB.SetCurrentDebugLocation(LI.getDebugLoc());

  unsigned LdBits = DL->getTypeStoreSizeInBits(LI.getType());
  auto IntNTy = Type::getIntNTy(LI.getContext(), LdBits);

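  // Base may have a different pointer type than the load's pointer operand
  // (e.g. when the chain involves the 32-bit constant address space), so cast
  // it back before applying the rounded-down offset; IRBuilder folds the cast
  // away when the types already match.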
  auto *NewPtr = IRB.CreateConstGEP1_64(
      IRB.getInt8Ty(),
      IRB.CreateAddrSpaceCast(Base, LI.getPointerOperand()->getType()),
      Offset - Adjust);

  LoadInst *NewLd = IRB.CreateAlignedLoad(IRB.getInt32Ty(), NewPtr, Align(4));
  NewLd->copyMetadata(LI);
  NewLd->setMetadata(LLVMContext::MD_range, nullptr);

  unsigned ShAmt = Adjust * 8;
  auto *NewVal = IRB.CreateBitCast(
      IRB.CreateTrunc(IRB.CreateLShr(NewLd, ShAmt), IntNTy), LI.getType());
  LI.replaceAllUsesWith(NewVal);
  DeadInsts.emplace_back(&LI);

  return true;
}

INITIALIZE_PASS_BEGIN(AMDGPULateCodeGenPrepare, DEBUG_TYPE,
                      "AMDGPU IR late optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
INITIALIZE_PASS_END(AMDGPULateCodeGenPrepare, DEBUG_TYPE,
                    "AMDGPU IR late optimizations", false, false)

char AMDGPULateCodeGenPrepare::ID = 0;

FunctionPass *llvm::createAMDGPULateCodeGenPreparePass() {
  return new AMDGPULateCodeGenPrepare();
}