//===-- AMDGPULateCodeGenPrepare.cpp --------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass does misc. AMDGPU optimizations on IR *just* before instruction
/// selection. Currently it only widens naturally aligned, sub-dword scalar
/// loads from the constant address space into dword loads.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/Utils/Local.h"

#define DEBUG_TYPE "amdgpu-late-codegenprepare"

using namespace llvm;

// Scalar load widening needs to run after the load-store vectorizer, as that
// pass doesn't handle overlapping cases. In addition, this pass extends the
// widening to handle scalar sub-dword loads that are only naturally aligned
// rather than dword aligned.
static cl::opt<bool>
    WidenLoads("amdgpu-late-codegenprepare-widen-constant-loads",
               cl::desc("Widen sub-dword constant address space loads in "
                        "AMDGPULateCodeGenPrepare"),
               cl::ReallyHidden, cl::init(true));

namespace {

class AMDGPULateCodeGenPrepare
    : public FunctionPass,
      public InstVisitor<AMDGPULateCodeGenPrepare, bool> {
  Module *Mod = nullptr;
  const DataLayout *DL = nullptr;

  AssumptionCache *AC = nullptr;
  LegacyDivergenceAnalysis *DA = nullptr;

public:
  static char ID;

  AMDGPULateCodeGenPrepare() : FunctionPass(ID) {}

  StringRef getPassName() const override {
    return "AMDGPU IR late optimizations";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<LegacyDivergenceAnalysis>();
    AU.setPreservesAll();
  }

  bool doInitialization(Module &M) override;
  bool runOnFunction(Function &F) override;

  bool visitInstruction(Instruction &) { return false; }

  // Check if the specified value is at least DWORD aligned.
  bool isDWORDAligned(const Value *V) const {
    KnownBits Known = computeKnownBits(V, *DL, 0, AC);
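    // At least two known-zero low bits means the pointer value is a multiple
    // of 4, i.e. DWORD aligned.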
    return Known.countMinTrailingZeros() >= 2;
  }

  bool canWidenScalarExtLoad(LoadInst &LI) const;
  bool visitLoadInst(LoadInst &LI);
};

} // end anonymous namespace

bool AMDGPULateCodeGenPrepare::doInitialization(Module &M) {
  Mod = &M;
  DL = &Mod->getDataLayout();
  return false;
}

bool AMDGPULateCodeGenPrepare::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
  DA = &getAnalysis<LegacyDivergenceAnalysis>();

  bool Changed = false;
  for (auto &BB : F)
    for (auto BI = BB.begin(), BE = BB.end(); BI != BE; /*EMPTY*/) {
      Instruction *I = &*BI++;
      Changed |= visit(*I);
    }

  return Changed;
}

bool AMDGPULateCodeGenPrepare::canWidenScalarExtLoad(LoadInst &LI) const {
  unsigned AS = LI.getPointerAddressSpace();
  // Skip non-constant address space.
  if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
      AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;
  // Skip non-simple loads.
  if (!LI.isSimple())
    return false;
  auto *Ty = LI.getType();
  // Skip aggregate types.
  if (Ty->isAggregateType())
    return false;
  unsigned TySize = DL->getTypeStoreSize(Ty);
  // Only handle sub-DWORD loads.
  if (TySize >= 4)
    return false;
  // The load must be at least naturally aligned.
  if (LI.getAlign() < DL->getABITypeAlign(Ty))
    return false;
  // It should be uniform, i.e. a scalar load.
  return DA->isUniform(&LI);
}

bool AMDGPULateCodeGenPrepare::visitLoadInst(LoadInst &LI) {
  if (!WidenLoads)
    return false;

  // Skip if the load is already at least DWORD aligned, as that case is
  // handled in SDAG.
  if (LI.getAlign() >= 4)
    return false;

  if (!canWidenScalarExtLoad(LI))
    return false;

  int64_t Offset = 0;
  auto *Base =
      GetPointerBaseWithConstantOffset(LI.getPointerOperand(), Offset, *DL);
  // If that base is not DWORD aligned, it's not safe to perform the following
  // transforms.
  if (!isDWORDAligned(Base))
    return false;

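  // The low two bits of the constant offset give the load's byte position
  // within its containing DWORD.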
  int64_t Adjust = Offset & 0x3;
  if (Adjust == 0) {
    // With a zero adjust, the load is already DWORD aligned and its original
    // alignment can simply be promoted.
    LI.setAlignment(Align(4));
    return true;
  }

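  // Otherwise, widen the sub-dword load into a DWORD load from the aligned
  // address Base + (Offset - Adjust), then shift and truncate to recover the
  // original value. E.g. an i16 load at Base+6 becomes an i32 load at Base+4
  // followed by a logical shift right of 16 bits and a trunc to i16.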
  IRBuilder<> IRB(&LI);
  IRB.SetCurrentDebugLocation(LI.getDebugLoc());

  unsigned AS = LI.getPointerAddressSpace();
  unsigned LdBits = DL->getTypeStoreSize(LI.getType()) * 8;
  auto IntNTy = Type::getIntNTy(LI.getContext(), LdBits);

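  // Form the DWORD-aligned address as a byte-offset GEP from the base and
  // reinterpret it as an i32 pointer for the widened load.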
  PointerType *Int32PtrTy = Type::getInt32PtrTy(LI.getContext(), AS);
  PointerType *Int8PtrTy = Type::getInt8PtrTy(LI.getContext(), AS);
  auto *NewPtr = IRB.CreateBitCast(
      IRB.CreateConstGEP1_64(IRB.CreateBitCast(Base, Int8PtrTy),
                             Offset - Adjust),
      Int32PtrTy);
  LoadInst *NewLd = IRB.CreateAlignedLoad(NewPtr, Align(4));
  NewLd->copyMetadata(LI);
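  // Range metadata describes the original narrow value, so it is not valid
  // for the widened DWORD load and must be dropped.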
  NewLd->setMetadata(LLVMContext::MD_range, nullptr);

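  // Extract the original value: it sits Adjust bytes into the loaded DWORD,
  // so shift right by Adjust * 8 bits and truncate to the original type.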
  unsigned ShAmt = Adjust * 8;
  auto *NewVal = IRB.CreateBitCast(
      IRB.CreateTrunc(IRB.CreateLShr(NewLd, ShAmt), IntNTy), LI.getType());
  LI.replaceAllUsesWith(NewVal);
  RecursivelyDeleteTriviallyDeadInstructions(&LI);

  return true;
}

INITIALIZE_PASS_BEGIN(AMDGPULateCodeGenPrepare, DEBUG_TYPE,
                      "AMDGPU IR late optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
INITIALIZE_PASS_END(AMDGPULateCodeGenPrepare, DEBUG_TYPE,
                    "AMDGPU IR late optimizations", false, false)

char AMDGPULateCodeGenPrepare::ID = 0;

FunctionPass *llvm::createAMDGPULateCodeGenPreparePass() {
  return new AMDGPULateCodeGenPrepare();
}