//===-- AMDGPULateCodeGenPrepare.cpp --------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass performs miscellaneous AMDGPU IR optimizations just before
/// instruction selection.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/Utils/Local.h"
#include <cassert>
#include <iterator>

#define DEBUG_TYPE "amdgpu-late-codegenprepare"

using namespace llvm;

// Scalar load widening needs to run after the load-store-vectorizer, as that
// pass does not handle overlapping cases. In addition, this pass extends the
// widening to handle cases where scalar sub-DWORD loads are only naturally
// aligned but not DWORD aligned.
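// The widening can be disabled for testing with
// -amdgpu-late-codegenprepare-widen-constant-loads=0.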
static cl::opt<bool>
    WidenLoads("amdgpu-late-codegenprepare-widen-constant-loads",
               cl::desc("Widen sub-dword constant address space loads in "
                        "AMDGPULateCodeGenPrepare"),
               cl::ReallyHidden, cl::init(true));

namespace {

class AMDGPULateCodeGenPrepare
    : public FunctionPass,
      public InstVisitor<AMDGPULateCodeGenPrepare, bool> {
  Module *Mod = nullptr;
  const DataLayout *DL = nullptr;

  AssumptionCache *AC = nullptr;
  LegacyDivergenceAnalysis *DA = nullptr;

public:
  static char ID;

  AMDGPULateCodeGenPrepare() : FunctionPass(ID) {}

  StringRef getPassName() const override {
    return "AMDGPU IR late optimizations";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<LegacyDivergenceAnalysis>();
    AU.setPreservesAll();
  }

  bool doInitialization(Module &M) override;
  bool runOnFunction(Function &F) override;

  bool visitInstruction(Instruction &) { return false; }

  // Check if the specified value is at least DWORD aligned.
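  // A value whose two least significant bits are known to be zero is a
  // multiple of 4, i.e. DWORD aligned.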
  bool isDWORDAligned(const Value *V) const {
    KnownBits Known = computeKnownBits(V, *DL, 0, AC);
    return Known.countMinTrailingZeros() >= 2;
  }

  bool canWidenScalarExtLoad(LoadInst &LI) const;
  bool visitLoadInst(LoadInst &LI);
};

} // end anonymous namespace

bool AMDGPULateCodeGenPrepare::doInitialization(Module &M) {
  Mod = &M;
  DL = &Mod->getDataLayout();
  return false;
}

bool AMDGPULateCodeGenPrepare::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
  DA = &getAnalysis<LegacyDivergenceAnalysis>();

  bool Changed = false;
  for (auto &BB : F)
    for (auto BI = BB.begin(), BE = BB.end(); BI != BE; /*EMPTY*/) {
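      // Advance the iterator before visiting; visitLoadInst may erase the
      // current instruction.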
      Instruction *I = &*BI++;
      Changed |= visit(*I);
    }

  return Changed;
}

bool AMDGPULateCodeGenPrepare::canWidenScalarExtLoad(LoadInst &LI) const {
  unsigned AS = LI.getPointerAddressSpace();
  // Skip non-constant address spaces.
  if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
      AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;
  // Skip non-simple loads.
  if (!LI.isSimple())
    return false;
  auto *Ty = LI.getType();
  // Skip aggregate types.
  if (Ty->isAggregateType())
    return false;
  unsigned TySize = DL->getTypeStoreSize(Ty);
  // Only handle sub-DWORD loads.
  if (TySize >= 4)
    return false;
  // The load must be at least naturally aligned.
  if (LI.getAlign() < DL->getABITypeAlign(Ty))
    return false;
  // It must be uniform, i.e. a scalar load.
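  // Uniform loads from the constant address space are selected as scalar
  // (SMEM) loads, which is what this widening targets.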
  return DA->isUniform(&LI);
}

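// Widen an under-aligned sub-DWORD load from the constant address space into a
// DWORD-aligned i32 load of the containing DWORD, then recover the original
// value with a logical shift right and a truncate.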
bool AMDGPULateCodeGenPrepare::visitLoadInst(LoadInst &LI) {
  if (!WidenLoads)
    return false;

  // Skip if the load is already at least DWORD aligned; that case is handled
  // in SelectionDAG.
  if (LI.getAlign() >= 4)
    return false;

  if (!canWidenScalarExtLoad(LI))
    return false;

  int64_t Offset = 0;
  auto *Base =
      GetPointerBaseWithConstantOffset(LI.getPointerOperand(), Offset, *DL);
  // If the base is not DWORD aligned, it is not safe to perform the following
  // transform.
  if (!isDWORDAligned(Base))
    return false;

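  // Adjust is the byte offset of the loaded value within its containing DWORD.
  // For example (illustrative), an i16 load at Offset = 6 gives Adjust = 2: the
  // widened i32 load reads from Offset - Adjust = 4 and the value is recovered
  // with a 16-bit shift.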
  int64_t Adjust = Offset & 0x3;
  if (Adjust == 0) {
    // With a zero adjustment, the load is already DWORD aligned; just promote
    // the recorded alignment.
    LI.setAlignment(Align(4));
    return true;
  }

  IRBuilder<> IRB(&LI);
  IRB.SetCurrentDebugLocation(LI.getDebugLoc());

  unsigned AS = LI.getPointerAddressSpace();
  unsigned LdBits = DL->getTypeStoreSize(LI.getType()) * 8;
  auto IntNTy = Type::getIntNTy(LI.getContext(), LdBits);

  PointerType *Int32PtrTy = Type::getInt32PtrTy(LI.getContext(), AS);
  PointerType *Int8PtrTy = Type::getInt8PtrTy(LI.getContext(), AS);
  auto *NewPtr = IRB.CreateBitCast(
      IRB.CreateConstGEP1_64(IRB.CreateBitCast(Base, Int8PtrTy),
                             Offset - Adjust),
      Int32PtrTy);
  LoadInst *NewLd = IRB.CreateAlignedLoad(NewPtr, Align(4));
  NewLd->copyMetadata(LI);
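  // The copied !range metadata described the original narrow value and is not
  // valid for the widened i32 load, so drop it.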
  NewLd->setMetadata(LLVMContext::MD_range, nullptr);

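  // AMDGPU is little-endian, so the original bytes sit Adjust bytes above the
  // low end of the widened DWORD; shift them down before truncating.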
  unsigned ShAmt = Adjust * 8;
  auto *NewVal = IRB.CreateBitCast(
      IRB.CreateTrunc(IRB.CreateLShr(NewLd, ShAmt), IntNTy), LI.getType());
  LI.replaceAllUsesWith(NewVal);
  RecursivelyDeleteTriviallyDeadInstructions(&LI);

  return true;
}

INITIALIZE_PASS_BEGIN(AMDGPULateCodeGenPrepare, DEBUG_TYPE,
                      "AMDGPU IR late optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
INITIALIZE_PASS_END(AMDGPULateCodeGenPrepare, DEBUG_TYPE,
                    "AMDGPU IR late optimizations", false, false)

char AMDGPULateCodeGenPrepare::ID = 0;

FunctionPass *llvm::createAMDGPULateCodeGenPreparePass() {
  return new AMDGPULateCodeGenPrepare();
}