xref: /openbsd-src/gnu/llvm/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp (revision 73471bf04ceb096474c7f0fa83b1b65c70a787a1)
1*73471bf0Spatrick //===-- AMDGPULateCodeGenPrepare.cpp --------------------------------------===//
2*73471bf0Spatrick //
3*73471bf0Spatrick // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4*73471bf0Spatrick // See https://llvm.org/LICENSE.txt for license information.
5*73471bf0Spatrick // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6*73471bf0Spatrick //
7*73471bf0Spatrick //===----------------------------------------------------------------------===//
8*73471bf0Spatrick //
9*73471bf0Spatrick /// \file
10*73471bf0Spatrick /// This pass does misc. AMDGPU optimizations on IR *just* before instruction
11*73471bf0Spatrick /// selection.
12*73471bf0Spatrick //
13*73471bf0Spatrick //===----------------------------------------------------------------------===//
14*73471bf0Spatrick 
15*73471bf0Spatrick #include "AMDGPU.h"
16*73471bf0Spatrick #include "llvm/Analysis/AssumptionCache.h"
17*73471bf0Spatrick #include "llvm/Analysis/LegacyDivergenceAnalysis.h"
18*73471bf0Spatrick #include "llvm/Analysis/ValueTracking.h"
19*73471bf0Spatrick #include "llvm/IR/IRBuilder.h"
20*73471bf0Spatrick #include "llvm/IR/InstVisitor.h"
21*73471bf0Spatrick #include "llvm/InitializePasses.h"
22*73471bf0Spatrick #include "llvm/Support/CommandLine.h"
23*73471bf0Spatrick #include "llvm/Support/KnownBits.h"
24*73471bf0Spatrick #include "llvm/Transforms/Utils/Local.h"
25*73471bf0Spatrick 
26*73471bf0Spatrick #define DEBUG_TYPE "amdgpu-late-codegenprepare"
27*73471bf0Spatrick 
28*73471bf0Spatrick using namespace llvm;
29*73471bf0Spatrick 
// Scalar load widening needs running after load-store-vectorizer as that pass
// doesn't handle overlapping cases. In addition, this pass enhances the
// widening to handle cases where scalar sub-dword loads are naturally aligned
// only but not dword aligned.
// Defaults to enabled; cl::ReallyHidden keeps it out of user-facing -help,
// so it serves mainly as a debugging escape hatch.
static cl::opt<bool>
    WidenLoads("amdgpu-late-codegenprepare-widen-constant-loads",
               cl::desc("Widen sub-dword constant address space loads in "
                        "AMDGPULateCodeGenPrepare"),
               cl::ReallyHidden, cl::init(true));
39*73471bf0Spatrick 
namespace {

/// Late IR optimizations run just before instruction selection.
///
/// The only transform currently implemented is widening naturally aligned,
/// sub-dword, uniform (scalar) loads from the constant address spaces into
/// dword loads (see visitLoadInst). Instructions are dispatched through
/// InstVisitor; each visit* returns true iff it changed the IR.
class AMDGPULateCodeGenPrepare
    : public FunctionPass,
      public InstVisitor<AMDGPULateCodeGenPrepare, bool> {
  Module *Mod = nullptr;          // Cached in doInitialization.
  const DataLayout *DL = nullptr; // Data layout of Mod.

  AssumptionCache *AC = nullptr;          // Feeds computeKnownBits queries.
  LegacyDivergenceAnalysis *DA = nullptr; // Uniformity (scalar-ness) queries.

public:
  static char ID;

  AMDGPULateCodeGenPrepare() : FunctionPass(ID) {}

  StringRef getPassName() const override {
    return "AMDGPU IR late optimizations";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<LegacyDivergenceAnalysis>();
    AU.setPreservesAll();
  }

  bool doInitialization(Module &M) override;
  bool runOnFunction(Function &F) override;

  // Default fallback: instructions without a dedicated visitor are left
  // untouched.
  bool visitInstruction(Instruction &) { return false; }

  // Check if the specified value is at least DWORD aligned.
  bool isDWORDAligned(const Value *V) const {
    // >= 2 known trailing zero bits implies 4-byte alignment.
    KnownBits Known = computeKnownBits(V, *DL, 0, AC);
    return Known.countMinTrailingZeros() >= 2;
  }

  bool canWidenScalarExtLoad(LoadInst &LI) const;
  bool visitLoadInst(LoadInst &LI);
};

} // end anonymous namespace
82*73471bf0Spatrick 
83*73471bf0Spatrick bool AMDGPULateCodeGenPrepare::doInitialization(Module &M) {
84*73471bf0Spatrick   Mod = &M;
85*73471bf0Spatrick   DL = &Mod->getDataLayout();
86*73471bf0Spatrick   return false;
87*73471bf0Spatrick }
88*73471bf0Spatrick 
89*73471bf0Spatrick bool AMDGPULateCodeGenPrepare::runOnFunction(Function &F) {
90*73471bf0Spatrick   if (skipFunction(F))
91*73471bf0Spatrick     return false;
92*73471bf0Spatrick 
93*73471bf0Spatrick   AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
94*73471bf0Spatrick   DA = &getAnalysis<LegacyDivergenceAnalysis>();
95*73471bf0Spatrick 
96*73471bf0Spatrick   bool Changed = false;
97*73471bf0Spatrick   for (auto &BB : F)
98*73471bf0Spatrick     for (auto BI = BB.begin(), BE = BB.end(); BI != BE; /*EMPTY*/) {
99*73471bf0Spatrick       Instruction *I = &*BI++;
100*73471bf0Spatrick       Changed |= visit(*I);
101*73471bf0Spatrick     }
102*73471bf0Spatrick 
103*73471bf0Spatrick   return Changed;
104*73471bf0Spatrick }
105*73471bf0Spatrick 
106*73471bf0Spatrick bool AMDGPULateCodeGenPrepare::canWidenScalarExtLoad(LoadInst &LI) const {
107*73471bf0Spatrick   unsigned AS = LI.getPointerAddressSpace();
108*73471bf0Spatrick   // Skip non-constant address space.
109*73471bf0Spatrick   if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
110*73471bf0Spatrick       AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT)
111*73471bf0Spatrick     return false;
112*73471bf0Spatrick   // Skip non-simple loads.
113*73471bf0Spatrick   if (!LI.isSimple())
114*73471bf0Spatrick     return false;
115*73471bf0Spatrick   auto *Ty = LI.getType();
116*73471bf0Spatrick   // Skip aggregate types.
117*73471bf0Spatrick   if (Ty->isAggregateType())
118*73471bf0Spatrick     return false;
119*73471bf0Spatrick   unsigned TySize = DL->getTypeStoreSize(Ty);
120*73471bf0Spatrick   // Only handle sub-DWORD loads.
121*73471bf0Spatrick   if (TySize >= 4)
122*73471bf0Spatrick     return false;
123*73471bf0Spatrick   // That load must be at least naturally aligned.
124*73471bf0Spatrick   if (LI.getAlign() < DL->getABITypeAlign(Ty))
125*73471bf0Spatrick     return false;
126*73471bf0Spatrick   // It should be uniform, i.e. a scalar load.
127*73471bf0Spatrick   return DA->isUniform(&LI);
128*73471bf0Spatrick }
129*73471bf0Spatrick 
// Widen a naturally aligned sub-dword scalar load from the constant address
// space into a dword-aligned i32 load, then shift/truncate to recover the
// original value. Returns true iff the IR was changed.
bool AMDGPULateCodeGenPrepare::visitLoadInst(LoadInst &LI) {
  if (!WidenLoads)
    return false;

  // Skip if that load is already aligned on DWORD at least as it's handled in
  // SDAG.
  if (LI.getAlign() >= 4)
    return false;

  if (!canWidenScalarExtLoad(LI))
    return false;

  // Decompose the address into a base pointer plus a constant byte offset.
  int64_t Offset = 0;
  auto *Base =
      GetPointerBaseWithConstantOffset(LI.getPointerOperand(), Offset, *DL);
  // If that base is not DWORD aligned, it's not safe to perform the following
  // transforms.
  if (!isDWORDAligned(Base))
    return false;

  // Adjust is the load's byte position within its containing dword.
  int64_t Adjust = Offset & 0x3;
  if (Adjust == 0) {
    // With a zero adjust, the original alignment could be promoted with a
    // better one.
    LI.setAlignment(Align(4));
    return true;
  }

  IRBuilder<> IRB(&LI);
  IRB.SetCurrentDebugLocation(LI.getDebugLoc());

  unsigned AS = LI.getPointerAddressSpace();
  // Width of the original load in bits, used for the final truncation.
  unsigned LdBits = DL->getTypeStoreSize(LI.getType()) * 8;
  auto IntNTy = Type::getIntNTy(LI.getContext(), LdBits);

  // Build the dword-aligned address Base + (Offset - Adjust) as an i32
  // pointer: cast to i8*, GEP by the rounded-down offset, cast to i32*.
  PointerType *Int32PtrTy = Type::getInt32PtrTy(LI.getContext(), AS);
  PointerType *Int8PtrTy = Type::getInt8PtrTy(LI.getContext(), AS);
  auto *NewPtr = IRB.CreateBitCast(
      IRB.CreateConstGEP1_64(
          IRB.getInt8Ty(),
          IRB.CreatePointerBitCastOrAddrSpaceCast(Base, Int8PtrTy),
          Offset - Adjust),
      Int32PtrTy);
  LoadInst *NewLd = IRB.CreateAlignedLoad(IRB.getInt32Ty(), NewPtr, Align(4));
  NewLd->copyMetadata(LI);
  // Range metadata described the narrow value and does not hold for the
  // widened i32 load, so drop it.
  NewLd->setMetadata(LLVMContext::MD_range, nullptr);

  // Shift the containing dword right by the byte adjustment to bring the
  // original bytes into the low bits, then truncate to the original width
  // and bitcast back to the original type.
  unsigned ShAmt = Adjust * 8;
  auto *NewVal = IRB.CreateBitCast(
      IRB.CreateTrunc(IRB.CreateLShr(NewLd, ShAmt), IntNTy), LI.getType());
  LI.replaceAllUsesWith(NewVal);
  // Delete the old load together with any address computation left dead.
  RecursivelyDeleteTriviallyDeadInstructions(&LI);

  return true;
}
185*73471bf0Spatrick 
// Legacy pass-manager registration, declaring the analyses this pass
// requires (assumption cache and divergence analysis).
INITIALIZE_PASS_BEGIN(AMDGPULateCodeGenPrepare, DEBUG_TYPE,
                      "AMDGPU IR late optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
INITIALIZE_PASS_END(AMDGPULateCodeGenPrepare, DEBUG_TYPE,
                    "AMDGPU IR late optimizations", false, false)

// The address of ID serves as the pass's unique identity.
char AMDGPULateCodeGenPrepare::ID = 0;

// Factory used by the AMDGPU target to add this pass to the codegen
// pipeline.
FunctionPass *llvm::createAMDGPULateCodeGenPreparePass() {
  return new AMDGPULateCodeGenPrepare();
}
198