1e8d8bef9SDimitry Andric //===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===// 2e8d8bef9SDimitry Andric // 3e8d8bef9SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4e8d8bef9SDimitry Andric // See https://llvm.org/LICENSE.txt for license information. 5e8d8bef9SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6e8d8bef9SDimitry Andric // 7e8d8bef9SDimitry Andric //===----------------------------------------------------------------------===// 8e8d8bef9SDimitry Andric // 9e8d8bef9SDimitry Andric /// \file 10e8d8bef9SDimitry Andric /// This pass does misc. AMDGPU optimizations on IR *just* before instruction 11e8d8bef9SDimitry Andric /// selection. 12e8d8bef9SDimitry Andric // 13e8d8bef9SDimitry Andric //===----------------------------------------------------------------------===// 14e8d8bef9SDimitry Andric 15e8d8bef9SDimitry Andric #include "AMDGPU.h" 16e8d8bef9SDimitry Andric #include "llvm/Analysis/AssumptionCache.h" 17e8d8bef9SDimitry Andric #include "llvm/Analysis/LegacyDivergenceAnalysis.h" 18e8d8bef9SDimitry Andric #include "llvm/Analysis/ValueTracking.h" 19e8d8bef9SDimitry Andric #include "llvm/IR/IRBuilder.h" 20e8d8bef9SDimitry Andric #include "llvm/IR/InstVisitor.h" 21e8d8bef9SDimitry Andric #include "llvm/InitializePasses.h" 22e8d8bef9SDimitry Andric #include "llvm/Support/CommandLine.h" 23e8d8bef9SDimitry Andric #include "llvm/Support/KnownBits.h" 24e8d8bef9SDimitry Andric #include "llvm/Transforms/Utils/Local.h" 25e8d8bef9SDimitry Andric 26e8d8bef9SDimitry Andric #define DEBUG_TYPE "amdgpu-late-codegenprepare" 27e8d8bef9SDimitry Andric 28e8d8bef9SDimitry Andric using namespace llvm; 29e8d8bef9SDimitry Andric 30e8d8bef9SDimitry Andric // Scalar load widening needs running after load-store-vectorizer as that pass 31e8d8bef9SDimitry Andric // doesn't handle overlapping cases. In addition, this pass enhances the 32e8d8bef9SDimitry Andric // widening to handle cases where scalar sub-dword loads are naturally aligned 33e8d8bef9SDimitry Andric // only but not dword aligned. 34e8d8bef9SDimitry Andric static cl::opt<bool> 35e8d8bef9SDimitry Andric WidenLoads("amdgpu-late-codegenprepare-widen-constant-loads", 36e8d8bef9SDimitry Andric cl::desc("Widen sub-dword constant address space loads in " 37e8d8bef9SDimitry Andric "AMDGPULateCodeGenPrepare"), 38e8d8bef9SDimitry Andric cl::ReallyHidden, cl::init(true)); 39e8d8bef9SDimitry Andric 40e8d8bef9SDimitry Andric namespace { 41e8d8bef9SDimitry Andric 42e8d8bef9SDimitry Andric class AMDGPULateCodeGenPrepare 43e8d8bef9SDimitry Andric : public FunctionPass, 44e8d8bef9SDimitry Andric public InstVisitor<AMDGPULateCodeGenPrepare, bool> { 45e8d8bef9SDimitry Andric Module *Mod = nullptr; 46e8d8bef9SDimitry Andric const DataLayout *DL = nullptr; 47e8d8bef9SDimitry Andric 48e8d8bef9SDimitry Andric AssumptionCache *AC = nullptr; 49e8d8bef9SDimitry Andric LegacyDivergenceAnalysis *DA = nullptr; 50e8d8bef9SDimitry Andric 51e8d8bef9SDimitry Andric public: 52e8d8bef9SDimitry Andric static char ID; 53e8d8bef9SDimitry Andric 54e8d8bef9SDimitry Andric AMDGPULateCodeGenPrepare() : FunctionPass(ID) {} 55e8d8bef9SDimitry Andric 56e8d8bef9SDimitry Andric StringRef getPassName() const override { 57e8d8bef9SDimitry Andric return "AMDGPU IR late optimizations"; 58e8d8bef9SDimitry Andric } 59e8d8bef9SDimitry Andric 60e8d8bef9SDimitry Andric void getAnalysisUsage(AnalysisUsage &AU) const override { 61e8d8bef9SDimitry Andric AU.addRequired<AssumptionCacheTracker>(); 62e8d8bef9SDimitry Andric AU.addRequired<LegacyDivergenceAnalysis>(); 63e8d8bef9SDimitry Andric AU.setPreservesAll(); 64e8d8bef9SDimitry Andric } 65e8d8bef9SDimitry Andric 66e8d8bef9SDimitry Andric bool doInitialization(Module &M) override; 67e8d8bef9SDimitry Andric bool runOnFunction(Function &F) override; 68e8d8bef9SDimitry Andric 69e8d8bef9SDimitry Andric bool visitInstruction(Instruction &) { return false; } 70e8d8bef9SDimitry Andric 71e8d8bef9SDimitry Andric // Check if the specified value is at least DWORD aligned. 72e8d8bef9SDimitry Andric bool isDWORDAligned(const Value *V) const { 73e8d8bef9SDimitry Andric KnownBits Known = computeKnownBits(V, *DL, 0, AC); 74e8d8bef9SDimitry Andric return Known.countMinTrailingZeros() >= 2; 75e8d8bef9SDimitry Andric } 76e8d8bef9SDimitry Andric 77e8d8bef9SDimitry Andric bool canWidenScalarExtLoad(LoadInst &LI) const; 78e8d8bef9SDimitry Andric bool visitLoadInst(LoadInst &LI); 79e8d8bef9SDimitry Andric }; 80e8d8bef9SDimitry Andric 81e8d8bef9SDimitry Andric } // end anonymous namespace 82e8d8bef9SDimitry Andric 83e8d8bef9SDimitry Andric bool AMDGPULateCodeGenPrepare::doInitialization(Module &M) { 84e8d8bef9SDimitry Andric Mod = &M; 85e8d8bef9SDimitry Andric DL = &Mod->getDataLayout(); 86e8d8bef9SDimitry Andric return false; 87e8d8bef9SDimitry Andric } 88e8d8bef9SDimitry Andric 89e8d8bef9SDimitry Andric bool AMDGPULateCodeGenPrepare::runOnFunction(Function &F) { 90e8d8bef9SDimitry Andric if (skipFunction(F)) 91e8d8bef9SDimitry Andric return false; 92e8d8bef9SDimitry Andric 93e8d8bef9SDimitry Andric AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 94e8d8bef9SDimitry Andric DA = &getAnalysis<LegacyDivergenceAnalysis>(); 95e8d8bef9SDimitry Andric 96e8d8bef9SDimitry Andric bool Changed = false; 97e8d8bef9SDimitry Andric for (auto &BB : F) 98*349cc55cSDimitry Andric for (Instruction &I : llvm::make_early_inc_range(BB)) 99*349cc55cSDimitry Andric Changed |= visit(I); 100e8d8bef9SDimitry Andric 101e8d8bef9SDimitry Andric return Changed; 102e8d8bef9SDimitry Andric } 103e8d8bef9SDimitry Andric 104e8d8bef9SDimitry Andric bool AMDGPULateCodeGenPrepare::canWidenScalarExtLoad(LoadInst &LI) const { 105e8d8bef9SDimitry Andric unsigned AS = LI.getPointerAddressSpace(); 106e8d8bef9SDimitry Andric // Skip non-constant address space. 107e8d8bef9SDimitry Andric if (AS != AMDGPUAS::CONSTANT_ADDRESS && 108e8d8bef9SDimitry Andric AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT) 109e8d8bef9SDimitry Andric return false; 110e8d8bef9SDimitry Andric // Skip non-simple loads. 111e8d8bef9SDimitry Andric if (!LI.isSimple()) 112e8d8bef9SDimitry Andric return false; 113e8d8bef9SDimitry Andric auto *Ty = LI.getType(); 114e8d8bef9SDimitry Andric // Skip aggregate types. 115e8d8bef9SDimitry Andric if (Ty->isAggregateType()) 116e8d8bef9SDimitry Andric return false; 117e8d8bef9SDimitry Andric unsigned TySize = DL->getTypeStoreSize(Ty); 118e8d8bef9SDimitry Andric // Only handle sub-DWORD loads. 119e8d8bef9SDimitry Andric if (TySize >= 4) 120e8d8bef9SDimitry Andric return false; 121e8d8bef9SDimitry Andric // That load must be at least naturally aligned. 122e8d8bef9SDimitry Andric if (LI.getAlign() < DL->getABITypeAlign(Ty)) 123e8d8bef9SDimitry Andric return false; 124e8d8bef9SDimitry Andric // It should be uniform, i.e. a scalar load. 125e8d8bef9SDimitry Andric return DA->isUniform(&LI); 126e8d8bef9SDimitry Andric } 127e8d8bef9SDimitry Andric 128e8d8bef9SDimitry Andric bool AMDGPULateCodeGenPrepare::visitLoadInst(LoadInst &LI) { 129e8d8bef9SDimitry Andric if (!WidenLoads) 130e8d8bef9SDimitry Andric return false; 131e8d8bef9SDimitry Andric 132e8d8bef9SDimitry Andric // Skip if that load is already aligned on DWORD at least as it's handled in 133e8d8bef9SDimitry Andric // SDAG. 134e8d8bef9SDimitry Andric if (LI.getAlign() >= 4) 135e8d8bef9SDimitry Andric return false; 136e8d8bef9SDimitry Andric 137e8d8bef9SDimitry Andric if (!canWidenScalarExtLoad(LI)) 138e8d8bef9SDimitry Andric return false; 139e8d8bef9SDimitry Andric 140e8d8bef9SDimitry Andric int64_t Offset = 0; 141e8d8bef9SDimitry Andric auto *Base = 142e8d8bef9SDimitry Andric GetPointerBaseWithConstantOffset(LI.getPointerOperand(), Offset, *DL); 143e8d8bef9SDimitry Andric // If that base is not DWORD aligned, it's not safe to perform the following 144e8d8bef9SDimitry Andric // transforms. 145e8d8bef9SDimitry Andric if (!isDWORDAligned(Base)) 146e8d8bef9SDimitry Andric return false; 147e8d8bef9SDimitry Andric 148e8d8bef9SDimitry Andric int64_t Adjust = Offset & 0x3; 149e8d8bef9SDimitry Andric if (Adjust == 0) { 150e8d8bef9SDimitry Andric // With a zero adjust, the original alignment could be promoted with a 151e8d8bef9SDimitry Andric // better one. 152e8d8bef9SDimitry Andric LI.setAlignment(Align(4)); 153e8d8bef9SDimitry Andric return true; 154e8d8bef9SDimitry Andric } 155e8d8bef9SDimitry Andric 156e8d8bef9SDimitry Andric IRBuilder<> IRB(&LI); 157e8d8bef9SDimitry Andric IRB.SetCurrentDebugLocation(LI.getDebugLoc()); 158e8d8bef9SDimitry Andric 159e8d8bef9SDimitry Andric unsigned AS = LI.getPointerAddressSpace(); 160e8d8bef9SDimitry Andric unsigned LdBits = DL->getTypeStoreSize(LI.getType()) * 8; 161e8d8bef9SDimitry Andric auto IntNTy = Type::getIntNTy(LI.getContext(), LdBits); 162e8d8bef9SDimitry Andric 163e8d8bef9SDimitry Andric PointerType *Int32PtrTy = Type::getInt32PtrTy(LI.getContext(), AS); 164e8d8bef9SDimitry Andric PointerType *Int8PtrTy = Type::getInt8PtrTy(LI.getContext(), AS); 165e8d8bef9SDimitry Andric auto *NewPtr = IRB.CreateBitCast( 166fe6060f1SDimitry Andric IRB.CreateConstGEP1_64( 167fe6060f1SDimitry Andric IRB.getInt8Ty(), 168fe6060f1SDimitry Andric IRB.CreatePointerBitCastOrAddrSpaceCast(Base, Int8PtrTy), 169e8d8bef9SDimitry Andric Offset - Adjust), 170e8d8bef9SDimitry Andric Int32PtrTy); 171fe6060f1SDimitry Andric LoadInst *NewLd = IRB.CreateAlignedLoad(IRB.getInt32Ty(), NewPtr, Align(4)); 172e8d8bef9SDimitry Andric NewLd->copyMetadata(LI); 173e8d8bef9SDimitry Andric NewLd->setMetadata(LLVMContext::MD_range, nullptr); 174e8d8bef9SDimitry Andric 175e8d8bef9SDimitry Andric unsigned ShAmt = Adjust * 8; 176e8d8bef9SDimitry Andric auto *NewVal = IRB.CreateBitCast( 177e8d8bef9SDimitry Andric IRB.CreateTrunc(IRB.CreateLShr(NewLd, ShAmt), IntNTy), LI.getType()); 178e8d8bef9SDimitry Andric LI.replaceAllUsesWith(NewVal); 179e8d8bef9SDimitry Andric RecursivelyDeleteTriviallyDeadInstructions(&LI); 180e8d8bef9SDimitry Andric 181e8d8bef9SDimitry Andric return true; 182e8d8bef9SDimitry Andric } 183e8d8bef9SDimitry Andric 184e8d8bef9SDimitry Andric INITIALIZE_PASS_BEGIN(AMDGPULateCodeGenPrepare, DEBUG_TYPE, 185e8d8bef9SDimitry Andric "AMDGPU IR late optimizations", false, false) 186e8d8bef9SDimitry Andric INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 187e8d8bef9SDimitry Andric INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis) 188e8d8bef9SDimitry Andric INITIALIZE_PASS_END(AMDGPULateCodeGenPrepare, DEBUG_TYPE, 189e8d8bef9SDimitry Andric "AMDGPU IR late optimizations", false, false) 190e8d8bef9SDimitry Andric 191e8d8bef9SDimitry Andric char AMDGPULateCodeGenPrepare::ID = 0; 192e8d8bef9SDimitry Andric 193e8d8bef9SDimitry Andric FunctionPass *llvm::createAMDGPULateCodeGenPreparePass() { 194e8d8bef9SDimitry Andric return new AMDGPULateCodeGenPrepare(); 195e8d8bef9SDimitry Andric } 196