//===-- AMDGPULateCodeGenPrepare.cpp ---------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass does misc. AMDGPU optimizations on IR *just* before instruction
/// selection.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/Utils/Local.h"

#define DEBUG_TYPE "amdgpu-late-codegenprepare"

using namespace llvm;

// Scalar load widening needs to run after the load-store-vectorizer, as that
// pass doesn't handle overlapping cases. In addition, this pass enhances the
// widening to handle cases where scalar sub-dword loads are naturally aligned
// but not dword aligned.
static cl::opt<bool>
    WidenLoads("amdgpu-late-codegenprepare-widen-constant-loads",
               cl::desc("Widen sub-dword constant address space loads in "
                        "AMDGPULateCodeGenPrepare"),
               cl::ReallyHidden, cl::init(true));
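
// In rough terms (a sketch, not tied to a specific test case), widening turns
// a uniform, naturally aligned sub-dword load such as
//   %v = load i8, i8 addrspace(4)* %p, align 1
// into an align-4 i32 load of the enclosing dword, followed by a logical
// shift right by the byte offset and a truncation back to the original type.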

namespace {

class AMDGPULateCodeGenPrepare
    : public FunctionPass,
      public InstVisitor<AMDGPULateCodeGenPrepare, bool> {
  Module *Mod = nullptr;
  const DataLayout *DL = nullptr;

  AssumptionCache *AC = nullptr;
  LegacyDivergenceAnalysis *DA = nullptr;

public:
  static char ID;

  AMDGPULateCodeGenPrepare() : FunctionPass(ID) {}

  StringRef getPassName() const override {
    return "AMDGPU IR late optimizations";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<LegacyDivergenceAnalysis>();
    AU.setPreservesAll();
  }

  bool doInitialization(Module &M) override;
  bool runOnFunction(Function &F) override;

  bool visitInstruction(Instruction &) { return false; }

  // Check if the specified value is at least DWORD aligned.
  bool isDWORDAligned(const Value *V) const {
    KnownBits Known = computeKnownBits(V, *DL, 0, AC);
    return Known.countMinTrailingZeros() >= 2;
  }

  bool canWidenScalarExtLoad(LoadInst &LI) const;
  bool visitLoadInst(LoadInst &LI);
};

} // end anonymous namespace

bool AMDGPULateCodeGenPrepare::doInitialization(Module &M) {
  Mod = &M;
  DL = &Mod->getDataLayout();
  return false;
}

bool AMDGPULateCodeGenPrepare::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
  DA = &getAnalysis<LegacyDivergenceAnalysis>();

  bool Changed = false;
  for (auto &BB : F)
    for (Instruction &I : llvm::make_early_inc_range(BB))
      Changed |= visit(I);

  return Changed;
}

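// Return true if it is legal to widen this load: it must be a simple,
// uniform, sub-dword scalar load from the constant address space that is at
// least naturally aligned.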
bool AMDGPULateCodeGenPrepare::canWidenScalarExtLoad(LoadInst &LI) const {
  unsigned AS = LI.getPointerAddressSpace();
  // Skip non-constant address space.
  if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
      AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;
  // Skip non-simple loads.
  if (!LI.isSimple())
    return false;
  auto *Ty = LI.getType();
  // Skip aggregate types.
  if (Ty->isAggregateType())
    return false;
  unsigned TySize = DL->getTypeStoreSize(Ty);
  // Only handle sub-DWORD loads.
  if (TySize >= 4)
    return false;
  // That load must be at least naturally aligned.
  if (LI.getAlign() < DL->getABITypeAlign(Ty))
    return false;
  // It should be uniform, i.e. a scalar load.
  return DA->isUniform(&LI);
}

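// Widen a qualifying sub-dword constant-address-space load into an align-4
// i32 load of the enclosing dword, then shift and truncate to recover the
// original value.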
bool AMDGPULateCodeGenPrepare::visitLoadInst(LoadInst &LI) {
  if (!WidenLoads)
    return false;

  // Skip if that load is already aligned on DWORD at least as it's handled in
  // SDAG.
  if (LI.getAlign() >= 4)
    return false;

  if (!canWidenScalarExtLoad(LI))
    return false;

  int64_t Offset = 0;
  auto *Base =
      GetPointerBaseWithConstantOffset(LI.getPointerOperand(), Offset, *DL);
  // If that base is not DWORD aligned, it's not safe to perform the following
  // transforms.
  if (!isDWORDAligned(Base))
    return false;

  int64_t Adjust = Offset & 0x3;
  if (Adjust == 0) {
    // With a zero adjust, the original alignment could be promoted with a
    // better one.
    LI.setAlignment(Align(4));
    return true;
  }

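  // Build the widened load: an align-4 i32 load of the dword that contains
  // the original bytes, located at (Offset - Adjust) from the aligned base.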
  IRBuilder<> IRB(&LI);
  IRB.SetCurrentDebugLocation(LI.getDebugLoc());

  unsigned AS = LI.getPointerAddressSpace();
  unsigned LdBits = DL->getTypeStoreSize(LI.getType()) * 8;
  auto IntNTy = Type::getIntNTy(LI.getContext(), LdBits);

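  // Address the enclosing dword via an i8 GEP from the known-aligned base,
  // then view it as an i32 pointer in the same address space.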
  PointerType *Int32PtrTy = Type::getInt32PtrTy(LI.getContext(), AS);
  PointerType *Int8PtrTy = Type::getInt8PtrTy(LI.getContext(), AS);
  auto *NewPtr = IRB.CreateBitCast(
      IRB.CreateConstGEP1_64(
          IRB.getInt8Ty(),
          IRB.CreatePointerBitCastOrAddrSpaceCast(Base, Int8PtrTy),
          Offset - Adjust),
      Int32PtrTy);
  LoadInst *NewLd = IRB.CreateAlignedLoad(IRB.getInt32Ty(), NewPtr, Align(4));
  NewLd->copyMetadata(LI);
  NewLd->setMetadata(LLVMContext::MD_range, nullptr);

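  // Extract the original value: shift the loaded dword right by the byte
  // offset, truncate to the original width, and bitcast to the original type.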
  unsigned ShAmt = Adjust * 8;
  auto *NewVal = IRB.CreateBitCast(
      IRB.CreateTrunc(IRB.CreateLShr(NewLd, ShAmt), IntNTy), LI.getType());
  LI.replaceAllUsesWith(NewVal);
  RecursivelyDeleteTriviallyDeadInstructions(&LI);

  return true;
}

INITIALIZE_PASS_BEGIN(AMDGPULateCodeGenPrepare, DEBUG_TYPE,
                      "AMDGPU IR late optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
INITIALIZE_PASS_END(AMDGPULateCodeGenPrepare, DEBUG_TYPE,
                    "AMDGPU IR late optimizations", false, false)

char AMDGPULateCodeGenPrepare::ID = 0;

FunctionPass *llvm::createAMDGPULateCodeGenPreparePass() {
  return new AMDGPULateCodeGenPrepare();
}