xref: /openbsd-src/gnu/llvm/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp (revision d415bd752c734aee168c4ee86ff32e8cc249eb16)
173471bf0Spatrick //===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===//
273471bf0Spatrick //
373471bf0Spatrick // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
473471bf0Spatrick // See https://llvm.org/LICENSE.txt for license information.
573471bf0Spatrick // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
673471bf0Spatrick //
773471bf0Spatrick //===----------------------------------------------------------------------===//
873471bf0Spatrick //
973471bf0Spatrick /// \file
1073471bf0Spatrick /// This pass does misc. AMDGPU optimizations on IR *just* before instruction
1173471bf0Spatrick /// selection.
1273471bf0Spatrick //
1373471bf0Spatrick //===----------------------------------------------------------------------===//
1473471bf0Spatrick 
1573471bf0Spatrick #include "AMDGPU.h"
1673471bf0Spatrick #include "llvm/Analysis/AssumptionCache.h"
1773471bf0Spatrick #include "llvm/Analysis/LegacyDivergenceAnalysis.h"
1873471bf0Spatrick #include "llvm/Analysis/ValueTracking.h"
1973471bf0Spatrick #include "llvm/IR/IRBuilder.h"
2073471bf0Spatrick #include "llvm/IR/InstVisitor.h"
2173471bf0Spatrick #include "llvm/InitializePasses.h"
2273471bf0Spatrick #include "llvm/Support/CommandLine.h"
2373471bf0Spatrick #include "llvm/Support/KnownBits.h"
2473471bf0Spatrick #include "llvm/Transforms/Utils/Local.h"
2573471bf0Spatrick 
2673471bf0Spatrick #define DEBUG_TYPE "amdgpu-late-codegenprepare"
2773471bf0Spatrick 
2873471bf0Spatrick using namespace llvm;
2973471bf0Spatrick 
3073471bf0Spatrick // Scalar load widening needs running after load-store-vectorizer as that pass
3173471bf0Spatrick // doesn't handle overlapping cases. In addition, this pass enhances the
3273471bf0Spatrick // widening to handle cases where scalar sub-dword loads are naturally aligned
3373471bf0Spatrick // only but not dword aligned.
3473471bf0Spatrick static cl::opt<bool>
3573471bf0Spatrick     WidenLoads("amdgpu-late-codegenprepare-widen-constant-loads",
3673471bf0Spatrick                cl::desc("Widen sub-dword constant address space loads in "
3773471bf0Spatrick                         "AMDGPULateCodeGenPrepare"),
3873471bf0Spatrick                cl::ReallyHidden, cl::init(true));
3973471bf0Spatrick 
4073471bf0Spatrick namespace {
4173471bf0Spatrick 
// IR pass run just before instruction selection. Its only transform today is
// widening sub-dword, uniform (scalar) loads from the constant address space
// into dword loads (see visitLoadInst); all other instructions are left
// untouched.
class AMDGPULateCodeGenPrepare
    : public FunctionPass,
      public InstVisitor<AMDGPULateCodeGenPrepare, bool> {
  Module *Mod = nullptr;        // Current module; set in doInitialization.
  const DataLayout *DL = nullptr; // Cached data layout of Mod.

  AssumptionCache *AC = nullptr;          // Fed to computeKnownBits for alignment queries.
  LegacyDivergenceAnalysis *DA = nullptr; // Used to prove a load is uniform (scalar).

public:
  static char ID;

  AMDGPULateCodeGenPrepare() : FunctionPass(ID) {}

  StringRef getPassName() const override {
    return "AMDGPU IR late optimizations";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<LegacyDivergenceAnalysis>();
    AU.setPreservesAll();
  }

  bool doInitialization(Module &M) override;
  bool runOnFunction(Function &F) override;

  // InstVisitor fallback: instructions with no dedicated visitor are not
  // modified, so report "no change".
  bool visitInstruction(Instruction &) { return false; }

  // Check if the specified value is at least DWORD aligned, i.e. its low two
  // bits are known to be zero.
  bool isDWORDAligned(const Value *V) const {
    KnownBits Known = computeKnownBits(V, *DL, 0, AC);
    return Known.countMinTrailingZeros() >= 2;
  }

  bool canWidenScalarExtLoad(LoadInst &LI) const;
  bool visitLoadInst(LoadInst &LI);
};
8073471bf0Spatrick 
8173471bf0Spatrick } // end anonymous namespace
8273471bf0Spatrick 
doInitialization(Module & M)8373471bf0Spatrick bool AMDGPULateCodeGenPrepare::doInitialization(Module &M) {
8473471bf0Spatrick   Mod = &M;
8573471bf0Spatrick   DL = &Mod->getDataLayout();
8673471bf0Spatrick   return false;
8773471bf0Spatrick }
8873471bf0Spatrick 
runOnFunction(Function & F)8973471bf0Spatrick bool AMDGPULateCodeGenPrepare::runOnFunction(Function &F) {
9073471bf0Spatrick   if (skipFunction(F))
9173471bf0Spatrick     return false;
9273471bf0Spatrick 
9373471bf0Spatrick   AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
9473471bf0Spatrick   DA = &getAnalysis<LegacyDivergenceAnalysis>();
9573471bf0Spatrick 
9673471bf0Spatrick   bool Changed = false;
9773471bf0Spatrick   for (auto &BB : F)
98*d415bd75Srobert     for (Instruction &I : llvm::make_early_inc_range(BB))
99*d415bd75Srobert       Changed |= visit(I);
10073471bf0Spatrick 
10173471bf0Spatrick   return Changed;
10273471bf0Spatrick }
10373471bf0Spatrick 
canWidenScalarExtLoad(LoadInst & LI) const10473471bf0Spatrick bool AMDGPULateCodeGenPrepare::canWidenScalarExtLoad(LoadInst &LI) const {
10573471bf0Spatrick   unsigned AS = LI.getPointerAddressSpace();
10673471bf0Spatrick   // Skip non-constant address space.
10773471bf0Spatrick   if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
10873471bf0Spatrick       AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT)
10973471bf0Spatrick     return false;
11073471bf0Spatrick   // Skip non-simple loads.
11173471bf0Spatrick   if (!LI.isSimple())
11273471bf0Spatrick     return false;
11373471bf0Spatrick   auto *Ty = LI.getType();
11473471bf0Spatrick   // Skip aggregate types.
11573471bf0Spatrick   if (Ty->isAggregateType())
11673471bf0Spatrick     return false;
11773471bf0Spatrick   unsigned TySize = DL->getTypeStoreSize(Ty);
11873471bf0Spatrick   // Only handle sub-DWORD loads.
11973471bf0Spatrick   if (TySize >= 4)
12073471bf0Spatrick     return false;
12173471bf0Spatrick   // That load must be at least naturally aligned.
12273471bf0Spatrick   if (LI.getAlign() < DL->getABITypeAlign(Ty))
12373471bf0Spatrick     return false;
12473471bf0Spatrick   // It should be uniform, i.e. a scalar load.
12573471bf0Spatrick   return DA->isUniform(&LI);
12673471bf0Spatrick }
12773471bf0Spatrick 
// Widen a sub-dword, under-aligned constant-address-space scalar load into a
// dword-aligned i32 load of the enclosing dword, then recover the original
// value with a logical shift right + truncate. Returns true if the IR changed.
bool AMDGPULateCodeGenPrepare::visitLoadInst(LoadInst &LI) {
  if (!WidenLoads)
    return false;

  // Skip if that load is already aligned on DWORD at least as it's handled in
  // SDAG.
  if (LI.getAlign() >= 4)
    return false;

  if (!canWidenScalarExtLoad(LI))
    return false;

  // Decompose the pointer into base + constant byte offset.
  int64_t Offset = 0;
  auto *Base =
      GetPointerBaseWithConstantOffset(LI.getPointerOperand(), Offset, *DL);
  // If that base is not DWORD aligned, it's not safe to perform the following
  // transforms.
  if (!isDWORDAligned(Base))
    return false;

  // Byte position of the loaded value within its enclosing dword.
  int64_t Adjust = Offset & 0x3;
  if (Adjust == 0) {
    // With a zero adjust, the original alignment could be promoted with a
    // better one.
    LI.setAlignment(Align(4));
    return true;
  }

  IRBuilder<> IRB(&LI);
  IRB.SetCurrentDebugLocation(LI.getDebugLoc());

  unsigned AS = LI.getPointerAddressSpace();
  unsigned LdBits = DL->getTypeStoreSize(LI.getType()) * 8;
  auto IntNTy = Type::getIntNTy(LI.getContext(), LdBits);

  // Build a pointer to the dword-aligned address that covers the original
  // load: (i8*)Base + (Offset - Adjust), reinterpreted as i32*.
  PointerType *Int32PtrTy = Type::getInt32PtrTy(LI.getContext(), AS);
  PointerType *Int8PtrTy = Type::getInt8PtrTy(LI.getContext(), AS);
  auto *NewPtr = IRB.CreateBitCast(
      IRB.CreateConstGEP1_64(
          IRB.getInt8Ty(),
          IRB.CreatePointerBitCastOrAddrSpaceCast(Base, Int8PtrTy),
          Offset - Adjust),
      Int32PtrTy);
  LoadInst *NewLd = IRB.CreateAlignedLoad(IRB.getInt32Ty(), NewPtr, Align(4));
  NewLd->copyMetadata(LI);
  // The range metadata described the narrow value, not the widened dword, so
  // it must be dropped after copyMetadata.
  NewLd->setMetadata(LLVMContext::MD_range, nullptr);

  // Shift the wanted bytes down to bit 0, truncate to the original bit width,
  // and bitcast back to the original (possibly non-integer) type.
  unsigned ShAmt = Adjust * 8;
  auto *NewVal = IRB.CreateBitCast(
      IRB.CreateTrunc(IRB.CreateLShr(NewLd, ShAmt), IntNTy), LI.getType());
  LI.replaceAllUsesWith(NewVal);
  // Delete the dead original load (and any chain of operands it was the only
  // user of).
  RecursivelyDeleteTriviallyDeadInstructions(&LI);

  return true;
}
18373471bf0Spatrick 
// Legacy pass-manager registration, including the analyses this pass requires.
INITIALIZE_PASS_BEGIN(AMDGPULateCodeGenPrepare, DEBUG_TYPE,
                      "AMDGPU IR late optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
INITIALIZE_PASS_END(AMDGPULateCodeGenPrepare, DEBUG_TYPE,
                    "AMDGPU IR late optimizations", false, false)

// Unique address used by the legacy pass manager to identify this pass.
char AMDGPULateCodeGenPrepare::ID = 0;

// Factory called by the AMDGPU target when building its codegen pipeline.
FunctionPass *llvm::createAMDGPULateCodeGenPreparePass() {
  return new AMDGPULateCodeGenPrepare();
}
196