//===-- AMDGPULateCodeGenPrepare.cpp --------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass does misc. AMDGPU optimizations on IR *just* before instruction
/// selection.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/Utils/Local.h"

#define DEBUG_TYPE "amdgpu-late-codegenprepare"

using namespace llvm;

// Scalar load widening needs to run after the load-store-vectorizer, as that
// pass does not handle overlapping cases. In addition, this pass extends the
// widening to handle cases where scalar sub-dword loads are only naturally
// aligned rather than dword aligned.
static cl::opt<bool>
    WidenLoads("amdgpu-late-codegenprepare-widen-constant-loads",
               cl::desc("Widen sub-dword constant address space loads in "
                        "AMDGPULateCodeGenPrepare"),
               cl::ReallyHidden, cl::init(true));

namespace {

class AMDGPULateCodeGenPrepare
    : public FunctionPass,
      public InstVisitor<AMDGPULateCodeGenPrepare, bool> {
  Module *Mod = nullptr;
  const DataLayout *DL = nullptr;

  AssumptionCache *AC = nullptr;
  LegacyDivergenceAnalysis *DA = nullptr;

public:
  static char ID;

  AMDGPULateCodeGenPrepare() : FunctionPass(ID) {}

  StringRef getPassName() const override {
    return "AMDGPU IR late optimizations";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<LegacyDivergenceAnalysis>();
    AU.setPreservesAll();
  }

  bool doInitialization(Module &M) override;
  bool runOnFunction(Function &F) override;

  bool visitInstruction(Instruction &) { return false; }

  // Check if the specified value is at least DWORD aligned.
  bool isDWORDAligned(const Value *V) const {
    KnownBits Known = computeKnownBits(V, *DL, 0, AC);
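    // Two or more known-zero low bits imply the address is a multiple of 4.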
    return Known.countMinTrailingZeros() >= 2;
  }

  bool canWidenScalarExtLoad(LoadInst &LI) const;
  bool visitLoadInst(LoadInst &LI);
};

} // end anonymous namespace

bool AMDGPULateCodeGenPrepare::doInitialization(Module &M) {
  Mod = &M;
  DL = &Mod->getDataLayout();
  return false;
}

bool AMDGPULateCodeGenPrepare::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
  DA = &getAnalysis<LegacyDivergenceAnalysis>();

  bool Changed = false;
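  // Use an early-increment range so the visitor can erase the instruction it
  // is currently visiting.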
  for (auto &BB : F)
    for (Instruction &I : llvm::make_early_inc_range(BB))
      Changed |= visit(I);

  return Changed;
}

bool AMDGPULateCodeGenPrepare::canWidenScalarExtLoad(LoadInst &LI) const {
  unsigned AS = LI.getPointerAddressSpace();
  // Skip non-constant address space.
  if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
      AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;
  // Skip non-simple loads.
  if (!LI.isSimple())
    return false;
  auto *Ty = LI.getType();
  // Skip aggregate types.
  if (Ty->isAggregateType())
    return false;
  unsigned TySize = DL->getTypeStoreSize(Ty);
  // Only handle sub-DWORD loads.
  if (TySize >= 4)
    return false;
  // The load must be at least naturally aligned.
  if (LI.getAlign() < DL->getABITypeAlign(Ty))
    return false;
  // It should be uniform, i.e. a scalar load.
  return DA->isUniform(&LI);
}

bool AMDGPULateCodeGenPrepare::visitLoadInst(LoadInst &LI) {
  if (!WidenLoads)
    return false;

  // Skip loads that are already at least DWORD aligned; those are handled in
  // SDAG.
  if (LI.getAlign() >= 4)
    return false;

  if (!canWidenScalarExtLoad(LI))
    return false;

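  // Split the pointer operand into a base plus a constant byte offset.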
  int64_t Offset = 0;
  auto *Base =
      GetPointerBaseWithConstantOffset(LI.getPointerOperand(), Offset, *DL);
  // If that base is not DWORD aligned, it's not safe to perform the following
  // transforms.
  if (!isDWORDAligned(Base))
    return false;

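  // Adjust is the load's byte offset within its enclosing DWORD.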
  int64_t Adjust = Offset & 0x3;
  if (Adjust == 0) {
    // With a zero adjust, the access itself is DWORD aligned and the load's
    // alignment can simply be promoted.
    LI.setAlignment(Align(4));
    return true;
  }

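  // Otherwise widen the access: load the whole DWORD containing the requested
  // bytes from a DWORD-aligned address, then shift and truncate the result
  // back to the original width. E.g., an i16 load at byte offset 2 becomes an
  // i32 load at offset 0 followed by a 16-bit logical shift right and a
  // truncation (AMDGPU is little-endian).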
  IRBuilder<> IRB(&LI);
  IRB.SetCurrentDebugLocation(LI.getDebugLoc());

  unsigned AS = LI.getPointerAddressSpace();
  unsigned LdBits = DL->getTypeStoreSize(LI.getType()) * 8;
  auto IntNTy = Type::getIntNTy(LI.getContext(), LdBits);

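  // Rebase the pointer onto the DWORD boundary below the original access;
  // Offset - Adjust is a multiple of 4, so the new i32 load is DWORD aligned.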
  PointerType *Int32PtrTy = Type::getInt32PtrTy(LI.getContext(), AS);
  PointerType *Int8PtrTy = Type::getInt8PtrTy(LI.getContext(), AS);
  auto *NewPtr = IRB.CreateBitCast(
      IRB.CreateConstGEP1_64(
          IRB.getInt8Ty(),
          IRB.CreatePointerBitCastOrAddrSpaceCast(Base, Int8PtrTy),
          Offset - Adjust),
      Int32PtrTy);
  LoadInst *NewLd = IRB.CreateAlignedLoad(IRB.getInt32Ty(), NewPtr, Align(4));
  NewLd->copyMetadata(LI);
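  // The widened load reads bytes beyond the original access, so any copied
  // range metadata no longer applies and is dropped.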
  NewLd->setMetadata(LLVMContext::MD_range, nullptr);

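  // Shift the requested bytes down to bit 0, truncate to the original bit
  // width, and bitcast back to the original load type.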
  unsigned ShAmt = Adjust * 8;
  auto *NewVal = IRB.CreateBitCast(
      IRB.CreateTrunc(IRB.CreateLShr(NewLd, ShAmt), IntNTy), LI.getType());
  LI.replaceAllUsesWith(NewVal);
  RecursivelyDeleteTriviallyDeadInstructions(&LI);

  return true;
}

INITIALIZE_PASS_BEGIN(AMDGPULateCodeGenPrepare, DEBUG_TYPE,
                      "AMDGPU IR late optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
INITIALIZE_PASS_END(AMDGPULateCodeGenPrepare, DEBUG_TYPE,
                    "AMDGPU IR late optimizations", false, false)

char AMDGPULateCodeGenPrepare::ID = 0;

FunctionPass *llvm::createAMDGPULateCodeGenPreparePass() {
  return new AMDGPULateCodeGenPrepare();
}