xref: /llvm-project/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp (revision 4a77414660d9ccd5c39cecfcc2dc1bf7fa6866e5)
//===-- AMDGPULateCodeGenPrepare.cpp --------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// This pass does misc. AMDGPU optimizations on IR *just* before instruction
11 /// selection.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "AMDGPU.h"
16 #include "llvm/Analysis/AssumptionCache.h"
17 #include "llvm/Analysis/UniformityAnalysis.h"
18 #include "llvm/Analysis/ValueTracking.h"
19 #include "llvm/IR/IRBuilder.h"
20 #include "llvm/IR/InstVisitor.h"
21 #include "llvm/InitializePasses.h"
22 #include "llvm/Support/CommandLine.h"
23 #include "llvm/Support/KnownBits.h"
24 #include "llvm/Transforms/Utils/Local.h"
25 
#define DEBUG_TYPE "amdgpu-late-codegenprepare"

using namespace llvm;

// Scalar load widening needs running after load-store-vectorizer as that pass
// doesn't handle overlapping cases. In addition, this pass enhances the
// widening to handle cases where scalar sub-dword loads are naturally aligned
// only but not dword aligned.
//
// Enabled by default; -amdgpu-late-codegenprepare-widen-constant-loads=0
// disables the transform (visitLoadInst bails out immediately).
static cl::opt<bool>
    WidenLoads("amdgpu-late-codegenprepare-widen-constant-loads",
               cl::desc("Widen sub-dword constant address space loads in "
                        "AMDGPULateCodeGenPrepare"),
               cl::ReallyHidden, cl::init(true));
39 
namespace {

/// Legacy-PM function pass running IR cleanups just before instruction
/// selection. Dispatches over instructions via InstVisitor; each visit
/// callback returns true iff it changed the IR. Currently the only
/// transform is widening of sub-dword uniform constant-address loads
/// (see visitLoadInst).
class AMDGPULateCodeGenPrepare
    : public FunctionPass,
      public InstVisitor<AMDGPULateCodeGenPrepare, bool> {
  Module *Mod = nullptr;          // Current module, set in doInitialization.
  const DataLayout *DL = nullptr; // Data layout of Mod.

  AssumptionCache *AC = nullptr; // Per-function cache, feeds computeKnownBits.
  UniformityInfo *UA = nullptr;  // Used to prove a load is uniform (scalar).

public:
  static char ID;

  AMDGPULateCodeGenPrepare() : FunctionPass(ID) {}

  StringRef getPassName() const override {
    return "AMDGPU IR late optimizations";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<UniformityInfoWrapperPass>();
    // This pass only widens/realigns loads; it invalidates no analyses.
    AU.setPreservesAll();
  }

  bool doInitialization(Module &M) override;
  bool runOnFunction(Function &F) override;

  // Default visitor: instructions we don't handle are left untouched.
  bool visitInstruction(Instruction &) { return false; }

  // Check if the specified value is at least DWORD aligned, i.e. known-zero
  // in its two low bits.
  bool isDWORDAligned(const Value *V) const {
    KnownBits Known = computeKnownBits(V, *DL, 0, AC);
    return Known.countMinTrailingZeros() >= 2;
  }

  bool canWidenScalarExtLoad(LoadInst &LI) const;
  bool visitLoadInst(LoadInst &LI);
};

} // end anonymous namespace
82 
83 bool AMDGPULateCodeGenPrepare::doInitialization(Module &M) {
84   Mod = &M;
85   DL = &Mod->getDataLayout();
86   return false;
87 }
88 
89 bool AMDGPULateCodeGenPrepare::runOnFunction(Function &F) {
90   if (skipFunction(F))
91     return false;
92 
93   // TODO: Skip this on GFX12 which does have scalar sub-dword loads.
94 
95   AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
96   UA = &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
97 
98   bool Changed = false;
99   for (auto &BB : F)
100     for (Instruction &I : llvm::make_early_inc_range(BB))
101       Changed |= visit(I);
102 
103   return Changed;
104 }
105 
106 bool AMDGPULateCodeGenPrepare::canWidenScalarExtLoad(LoadInst &LI) const {
107   unsigned AS = LI.getPointerAddressSpace();
108   // Skip non-constant address space.
109   if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
110       AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT)
111     return false;
112   // Skip non-simple loads.
113   if (!LI.isSimple())
114     return false;
115   auto *Ty = LI.getType();
116   // Skip aggregate types.
117   if (Ty->isAggregateType())
118     return false;
119   unsigned TySize = DL->getTypeStoreSize(Ty);
120   // Only handle sub-DWORD loads.
121   if (TySize >= 4)
122     return false;
123   // That load must be at least naturally aligned.
124   if (LI.getAlign() < DL->getABITypeAlign(Ty))
125     return false;
126   // It should be uniform, i.e. a scalar load.
127   return UA->isUniform(&LI);
128 }
129 
/// Widen an eligible sub-dword uniform constant-address load into a
/// DWORD-aligned i32 load plus shift/trunc, or simply promote its alignment
/// when the offset is already DWORD-aligned. Returns true iff IR changed.
bool AMDGPULateCodeGenPrepare::visitLoadInst(LoadInst &LI) {
  if (!WidenLoads)
    return false;

  // Skip if that load is already aligned on DWORD at least as it's handled in
  // SDAG.
  if (LI.getAlign() >= 4)
    return false;

  if (!canWidenScalarExtLoad(LI))
    return false;

  // Decompose the address into base + constant byte offset.
  int64_t Offset = 0;
  auto *Base =
      GetPointerBaseWithConstantOffset(LI.getPointerOperand(), Offset, *DL);
  // If that base is not DWORD aligned, it's not safe to perform the following
  // transforms.
  if (!isDWORDAligned(Base))
    return false;

  // Adjust = distance of the load below the previous DWORD boundary.
  int64_t Adjust = Offset & 0x3;
  if (Adjust == 0) {
    // With a zero adjust, the original alignment could be promoted with a
    // better one.
    LI.setAlignment(Align(4));
    return true;
  }

  IRBuilder<> IRB(&LI);
  IRB.SetCurrentDebugLocation(LI.getDebugLoc());

  // Integer type with exactly the loaded value's store width, used as the
  // trunc destination before bitcasting back to the original type.
  unsigned LdBits = DL->getTypeStoreSizeInBits(LI.getType());
  auto IntNTy = Type::getIntNTy(LI.getContext(), LdBits);

  // Byte-GEP from the (DWORD-aligned) base to the rounded-down offset
  // (Offset - Adjust), i.e. the containing DWORD. The addrspacecast keeps
  // the pointer in the original load's address space.
  auto *NewPtr = IRB.CreateConstGEP1_64(
      IRB.getInt8Ty(),
      IRB.CreateAddrSpaceCast(Base, LI.getPointerOperand()->getType()),
      Offset - Adjust);

  // Load the full DWORD at alignment 4. Metadata is copied over, but the
  // range metadata must be dropped: it described the narrow value, not the
  // widened i32.
  LoadInst *NewLd = IRB.CreateAlignedLoad(IRB.getInt32Ty(), NewPtr, Align(4));
  NewLd->copyMetadata(LI);
  NewLd->setMetadata(LLVMContext::MD_range, nullptr);

  // Extract the original value: shift the containing DWORD right by the
  // byte adjustment, truncate to the original width, and bitcast back to
  // the original (possibly non-integer) type.
  unsigned ShAmt = Adjust * 8;
  auto *NewVal = IRB.CreateBitCast(
      IRB.CreateTrunc(IRB.CreateLShr(NewLd, ShAmt), IntNTy), LI.getType());
  LI.replaceAllUsesWith(NewVal);
  // Delete the original load and any address computation it solely used.
  RecursivelyDeleteTriviallyDeadInstructions(&LI);

  return true;
}
181 
// Register the legacy pass and declare its analysis dependencies so the
// legacy pass manager schedules AssumptionCacheTracker and
// UniformityInfoWrapperPass ahead of it.
INITIALIZE_PASS_BEGIN(AMDGPULateCodeGenPrepare, DEBUG_TYPE,
                      "AMDGPU IR late optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
INITIALIZE_PASS_END(AMDGPULateCodeGenPrepare, DEBUG_TYPE,
                    "AMDGPU IR late optimizations", false, false)

// Unique pass identity; its address (not value) identifies the pass.
char AMDGPULateCodeGenPrepare::ID = 0;
190 
/// Factory entry point used by the AMDGPU target to add this pass to the
/// codegen pipeline; the caller (pass manager) takes ownership.
FunctionPass *llvm::createAMDGPULateCodeGenPreparePass() {
  return new AMDGPULateCodeGenPrepare();
}
194