//===-- AMDGPULateCodeGenPrepare.cpp --------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass does misc. AMDGPU optimizations on IR *just* before instruction
/// selection.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/Utils/Local.h"

#define DEBUG_TYPE "amdgpu-late-codegenprepare"

using namespace llvm;

// Scalar load widening needs to run after the load-store-vectorizer, as that
// pass doesn't handle overlapping cases. In addition, this pass extends the
// widening to scalar sub-dword loads that are only naturally aligned, not
// DWORD aligned.
static cl::opt<bool>
    WidenLoads("amdgpu-late-codegenprepare-widen-constant-loads",
               cl::desc("Widen sub-dword constant address space loads in "
                        "AMDGPULateCodeGenPrepare"),
               cl::ReallyHidden, cl::init(true));
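
// For illustration, the core transform (performed in visitLoadInst below)
// turns a misaligned sub-dword load from a DWORD-aligned base into a
// DWORD-aligned i32 load plus a shift and truncate. A rough IR sketch,
// assuming a constant address space (4) pointer %base known to be DWORD
// aligned and a load at byte offset 6:
//
//   %p = getelementptr i8, i8 addrspace(4)* %base, i64 6
//   %v = load i16, i16 addrspace(4)* %q, align 2       ; %q = bitcast %p
//
// becomes, roughly:
//
//   %wp = getelementptr i8, i8 addrspace(4)* %base, i64 4
//   %w  = load i32, i32 addrspace(4)* %wq, align 4     ; %wq = bitcast %wp
//   %s  = lshr i32 %w, 16                              ; (6 & 3) * 8 bits
//   %v  = trunc i32 %s to i16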

namespace {

class AMDGPULateCodeGenPrepare
    : public FunctionPass,
      public InstVisitor<AMDGPULateCodeGenPrepare, bool> {
  Module *Mod = nullptr;
  const DataLayout *DL = nullptr;

  AssumptionCache *AC = nullptr;
  LegacyDivergenceAnalysis *DA = nullptr;

public:
  static char ID;

  AMDGPULateCodeGenPrepare() : FunctionPass(ID) {}

  StringRef getPassName() const override {
    return "AMDGPU IR late optimizations";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<LegacyDivergenceAnalysis>();
    AU.setPreservesAll();
  }

  bool doInitialization(Module &M) override;
  bool runOnFunction(Function &F) override;

  // Default visitor: leave unhandled instructions unchanged.
  bool visitInstruction(Instruction &) { return false; }

  // Check if the specified value is at least DWORD aligned.
  bool isDWORDAligned(const Value *V) const {
    // At least two known-zero low bits imply 4-byte (DWORD) alignment.
    KnownBits Known = computeKnownBits(V, *DL, 0, AC);
    return Known.countMinTrailingZeros() >= 2;
  }

  bool canWidenScalarExtLoad(LoadInst &LI) const;
  bool visitLoadInst(LoadInst &LI);
};

} // end anonymous namespace

bool AMDGPULateCodeGenPrepare::doInitialization(Module &M) {
  Mod = &M;
  DL = &Mod->getDataLayout();
  return false;
}

bool AMDGPULateCodeGenPrepare::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
  DA = &getAnalysis<LegacyDivergenceAnalysis>();

  bool Changed = false;
  for (auto &BB : F)
    for (Instruction &I : llvm::make_early_inc_range(BB))
      Changed |= visit(I);

  return Changed;
}

bool AMDGPULateCodeGenPrepare::canWidenScalarExtLoad(LoadInst &LI) const {
  unsigned AS = LI.getPointerAddressSpace();
  // Skip non-constant address spaces.
  if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
      AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;
  // Skip non-simple loads.
  if (!LI.isSimple())
    return false;
  auto *Ty = LI.getType();
  // Skip aggregate types.
  if (Ty->isAggregateType())
    return false;
  unsigned TySize = DL->getTypeStoreSize(Ty);
  // Only handle sub-DWORD loads.
  if (TySize >= 4)
    return false;
  // The load must be at least naturally aligned.
  if (LI.getAlign() < DL->getABITypeAlign(Ty))
    return false;
  // It must be uniform, i.e. a scalar load.
  return DA->isUniform(&LI);
}

bool AMDGPULateCodeGenPrepare::visitLoadInst(LoadInst &LI) {
  if (!WidenLoads)
    return false;

  // Skip if the load is already at least DWORD aligned, as that case is
  // handled in SDAG.
  if (LI.getAlign() >= 4)
    return false;

  if (!canWidenScalarExtLoad(LI))
    return false;

  int64_t Offset = 0;
  auto *Base =
      GetPointerBaseWithConstantOffset(LI.getPointerOperand(), Offset, *DL);
  // If the base is not DWORD aligned, it's not safe to perform the following
  // transforms.
  if (!isDWORDAligned(Base))
    return false;

  // Adjust is the byte offset of the load within its containing DWORD.
  int64_t Adjust = Offset & 0x3;
  if (Adjust == 0) {
    // With a zero adjust, the load is in fact DWORD aligned; just promote its
    // alignment.
    LI.setAlignment(Align(4));
    return true;
  }
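
  // Otherwise, load the whole DWORD containing the value and extract the
  // sub-dword part with a logical shift right and a truncate (AMDGPU is
  // little-endian, so the byte offset within the DWORD maps directly to the
  // shift amount computed below).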
  IRBuilder<> IRB(&LI);
  IRB.SetCurrentDebugLocation(LI.getDebugLoc());

  unsigned AS = LI.getPointerAddressSpace();
  unsigned LdBits = DL->getTypeStoreSize(LI.getType()) * 8;
  auto IntNTy = Type::getIntNTy(LI.getContext(), LdBits);

  // Build a DWORD-aligned i32 pointer at (Base + Offset - Adjust).
  PointerType *Int32PtrTy = Type::getInt32PtrTy(LI.getContext(), AS);
  PointerType *Int8PtrTy = Type::getInt8PtrTy(LI.getContext(), AS);
  auto *NewPtr = IRB.CreateBitCast(
      IRB.CreateConstGEP1_64(
          IRB.getInt8Ty(),
          IRB.CreatePointerBitCastOrAddrSpaceCast(Base, Int8PtrTy),
          Offset - Adjust),
      Int32PtrTy);
  LoadInst *NewLd = IRB.CreateAlignedLoad(IRB.getInt32Ty(), NewPtr, Align(4));
  NewLd->copyMetadata(LI);
  // The range metadata described the original narrow value; it does not hold
  // for the widened load.
  NewLd->setMetadata(LLVMContext::MD_range, nullptr);

  // Shift the sub-dword value down to bit 0, then truncate to the original
  // width.
  unsigned ShAmt = Adjust * 8;
  auto *NewVal = IRB.CreateBitCast(
      IRB.CreateTrunc(IRB.CreateLShr(NewLd, ShAmt), IntNTy), LI.getType());
  LI.replaceAllUsesWith(NewVal);
  RecursivelyDeleteTriviallyDeadInstructions(&LI);

  return true;
}

INITIALIZE_PASS_BEGIN(AMDGPULateCodeGenPrepare, DEBUG_TYPE,
                      "AMDGPU IR late optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
INITIALIZE_PASS_END(AMDGPULateCodeGenPrepare, DEBUG_TYPE,
                    "AMDGPU IR late optimizations", false, false)

char AMDGPULateCodeGenPrepare::ID = 0;

FunctionPass *llvm::createAMDGPULateCodeGenPreparePass() {
  return new AMDGPULateCodeGenPrepare();
}
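
// Usage sketch: the pass is instantiated via the factory above and scheduled
// by the target, e.g. (hypothetical call site, assuming the usual
// TargetPassConfig plumbing; the actual hook lives in AMDGPUTargetMachine):
//
//   addPass(createAMDGPULateCodeGenPreparePass());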