//===-- AMDGPULateCodeGenPrepare.cpp --------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass does misc. AMDGPU optimizations on IR *just* before instruction
/// selection.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/Utils/Local.h"

#define DEBUG_TYPE "amdgpu-late-codegenprepare"

using namespace llvm;

// Scalar load widening needs to run after the load-store-vectorizer, as that
// pass doesn't handle overlapping cases. In addition, this pass enhances the
// widening to handle cases where scalar sub-dword loads are only naturally
// aligned but not dword aligned.
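//
// As an illustrative sketch (invented values and names, not taken from any
// particular test), a uniform sub-dword load at a small constant offset from
// a dword-aligned base, e.g.
//   %v = load i16, ptr addrspace(4) %gep, align 2      ; base + 2
// is roughly rewritten by visitLoadInst below into a dword load plus a shift
// and truncate:
//   %w = load i32, ptr addrspace(4) %base, align 4
//   %s = lshr i32 %w, 16
//   %v = trunc i32 %s to i16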
static cl::opt<bool>
    WidenLoads("amdgpu-late-codegenprepare-widen-constant-loads",
               cl::desc("Widen sub-dword constant address space loads in "
                        "AMDGPULateCodeGenPrepare"),
               cl::ReallyHidden, cl::init(true));

namespace {

class AMDGPULateCodeGenPrepare
    : public FunctionPass,
      public InstVisitor<AMDGPULateCodeGenPrepare, bool> {
  Module *Mod = nullptr;
  const DataLayout *DL = nullptr;

  AssumptionCache *AC = nullptr;
  UniformityInfo *UA = nullptr;

  SmallVector<WeakTrackingVH, 8> DeadInsts;

public:
  static char ID;

  AMDGPULateCodeGenPrepare() : FunctionPass(ID) {}

  StringRef getPassName() const override {
    return "AMDGPU IR late optimizations";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<TargetPassConfig>();
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<UniformityInfoWrapperPass>();
    AU.setPreservesAll();
  }

  bool doInitialization(Module &M) override;
  bool runOnFunction(Function &F) override;

  bool visitInstruction(Instruction &) { return false; }

  // Check if the specified value is at least DWORD aligned.
  bool isDWORDAligned(const Value *V) const {
    KnownBits Known = computeKnownBits(V, *DL, 0, AC);
    return Known.countMinTrailingZeros() >= 2;
  }

  bool canWidenScalarExtLoad(LoadInst &LI) const;
  bool visitLoadInst(LoadInst &LI);
};

using ValueToValueMap = DenseMap<const Value *, Value *>;

class LiveRegOptimizer {
private:
  Module *Mod = nullptr;
  const DataLayout *DL = nullptr;
  const GCNSubtarget *ST;
  /// The scalar type to convert to.
  Type *ConvertToScalar;
  /// The set of visited Instructions.
  SmallPtrSet<Instruction *, 4> Visited;
  /// Map of Value -> Converted Value.
  ValueToValueMap ValMap;
  /// Map containing conversions from Optimal Type -> Original Type, per BB.
  DenseMap<BasicBlock *, ValueToValueMap> BBUseValMap;

public:
  /// Calculate and \p return the type to convert to, given a problematic
  /// \p OriginalType. In some instances, we may widen the type
  /// (e.g. v2i8 -> i32).
  Type *calculateConvertType(Type *OriginalType);
  /// Convert the virtual register defined by \p V to the compatible vector of
  /// legal type.
  Value *convertToOptType(Instruction *V, BasicBlock::iterator &InstPt);
  /// Convert the virtual register defined by \p V back to the original type
  /// \p ConvertType, stripping away the MSBs in cases where there was an
  /// imperfect fit (e.g. v2i32 -> v7i8).
  Value *convertFromOptType(Type *ConvertType, Instruction *V,
                            BasicBlock::iterator &InstPt,
                            BasicBlock *InsertBlock);
  /// Check for problematic PHI nodes or cross-bb values based on the value
  /// defined by \p I, and coerce to legal types if necessary. For a
  /// problematic PHI node, we coerce all incoming values in a single
  /// invocation.
  bool optimizeLiveType(Instruction *I,
                        SmallVectorImpl<WeakTrackingVH> &DeadInsts);

  // Whether or not the type should be replaced to avoid inefficient
  // legalization code.
  bool shouldReplace(Type *ITy) {
    FixedVectorType *VTy = dyn_cast<FixedVectorType>(ITy);
    if (!VTy)
      return false;

    auto TLI = ST->getTargetLowering();

    Type *EltTy = VTy->getElementType();
    // If the element size is not less than the convert-to-scalar size, then we
    // can't do any bit packing.
    if (!EltTy->isIntegerTy() ||
        EltTy->getScalarSizeInBits() > ConvertToScalar->getScalarSizeInBits())
      return false;

    // Only coerce illegal types.
    TargetLoweringBase::LegalizeKind LK =
        TLI->getTypeConversion(EltTy->getContext(), EVT::getEVT(EltTy, false));
    return LK.first != TargetLoweringBase::TypeLegal;
  }

  LiveRegOptimizer(Module *Mod, const GCNSubtarget *ST) : Mod(Mod), ST(ST) {
    DL = &Mod->getDataLayout();
    ConvertToScalar = Type::getInt32Ty(Mod->getContext());
  }
};

} // end anonymous namespace

bool AMDGPULateCodeGenPrepare::doInitialization(Module &M) {
  Mod = &M;
  DL = &Mod->getDataLayout();
  return false;
}

bool AMDGPULateCodeGenPrepare::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
  const TargetMachine &TM = TPC.getTM<TargetMachine>();
  const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);

  AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
  UA = &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();

  // "Optimize" the virtual regs that cross basic block boundaries. When
  // building the SelectionDAG, vectors of illegal types that cross basic
  // blocks will be scalarized and widened, with each scalar living in its
  // own register. To work around this, this optimization converts the
  // vectors to equivalent vectors of legal type (which are converted back
  // before uses in subsequent blocks), to pack the bits into fewer physical
  // registers (used in CopyToReg/CopyFromReg pairs).
  LiveRegOptimizer LRO(Mod, &ST);

  bool Changed = false;

  bool HasScalarSubwordLoads = ST.hasScalarSubwordLoads();

  for (auto &BB : reverse(F))
    for (Instruction &I : make_early_inc_range(reverse(BB))) {
      Changed |= !HasScalarSubwordLoads && visit(I);
      Changed |= LRO.optimizeLiveType(&I, DeadInsts);
    }

  RecursivelyDeleteTriviallyDeadInstructionsPermissive(DeadInsts);
  return Changed;
}

Type *LiveRegOptimizer::calculateConvertType(Type *OriginalType) {
  assert(OriginalType->getScalarSizeInBits() <=
         ConvertToScalar->getScalarSizeInBits());

  FixedVectorType *VTy = cast<FixedVectorType>(OriginalType);

  TypeSize OriginalSize = DL->getTypeSizeInBits(VTy);
  TypeSize ConvertScalarSize = DL->getTypeSizeInBits(ConvertToScalar);
  unsigned ConvertEltCount =
      (OriginalSize + ConvertScalarSize - 1) / ConvertScalarSize;

  if (OriginalSize <= ConvertScalarSize)
    return IntegerType::get(Mod->getContext(), ConvertScalarSize);

  return VectorType::get(Type::getIntNTy(Mod->getContext(), ConvertScalarSize),
                         ConvertEltCount, false);
}

Value *LiveRegOptimizer::convertToOptType(Instruction *V,
                                          BasicBlock::iterator &InsertPt) {
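  // Illustrative sketch (invented names, not from any test): a v3i8 def %val
  // (24 bits) does not exactly fill the i32 convert type computed by
  // calculateConvertType, so it is first padded out to v4i8 with a poison
  // lane and then bitcast:
  //   %padded = shufflevector <3 x i8> %val, <3 x i8> poison,
  //                           <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  //   %val.bc = bitcast <4 x i8> %padded to i32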
  FixedVectorType *VTy = cast<FixedVectorType>(V->getType());
  Type *NewTy = calculateConvertType(V->getType());

  TypeSize OriginalSize = DL->getTypeSizeInBits(VTy);
  TypeSize NewSize = DL->getTypeSizeInBits(NewTy);

  IRBuilder<> Builder(V->getParent(), InsertPt);
  // If there is a bitsize match, we can fit the old vector into a new vector
  // of desired type.
  if (OriginalSize == NewSize)
    return Builder.CreateBitCast(V, NewTy, V->getName() + ".bc");

  // If there is a bitsize mismatch, we must use a wider vector.
  assert(NewSize > OriginalSize);
  uint64_t ExpandedVecElementCount = NewSize / VTy->getScalarSizeInBits();

  SmallVector<int, 8> ShuffleMask;
  uint64_t OriginalElementCount = VTy->getElementCount().getFixedValue();
  for (unsigned I = 0; I < OriginalElementCount; I++)
    ShuffleMask.push_back(I);

  for (uint64_t I = OriginalElementCount; I < ExpandedVecElementCount; I++)
    ShuffleMask.push_back(OriginalElementCount);

  Value *ExpandedVec = Builder.CreateShuffleVector(V, ShuffleMask);
  return Builder.CreateBitCast(ExpandedVec, NewTy, V->getName() + ".bc");
}

Value *LiveRegOptimizer::convertFromOptType(Type *ConvertType, Instruction *V,
                                            BasicBlock::iterator &InsertPt,
                                            BasicBlock *InsertBB) {
  FixedVectorType *NewVTy = cast<FixedVectorType>(ConvertType);

  TypeSize OriginalSize = DL->getTypeSizeInBits(V->getType());
  TypeSize NewSize = DL->getTypeSizeInBits(NewVTy);

  IRBuilder<> Builder(InsertBB, InsertPt);
  // If there is a bitsize match, we simply convert back to the original type.
  if (OriginalSize == NewSize)
    return Builder.CreateBitCast(V, NewVTy, V->getName() + ".bc");

  // If there is a bitsize mismatch, then we must have used a wider value to
  // hold the bits.
  assert(OriginalSize > NewSize);
  // For wide scalars, we can just truncate the value.
  if (!V->getType()->isVectorTy()) {
    Instruction *Trunc = cast<Instruction>(
        Builder.CreateTrunc(V, IntegerType::get(Mod->getContext(), NewSize)));
    return cast<Instruction>(Builder.CreateBitCast(Trunc, NewVTy));
  }

  // For wider vectors, we must strip the MSBs to convert back to the original
  // type.
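  // Illustrative sketch (invented names): a value that was originally v7i8
  // (56 bits) but carried as v2i32 (64 bits) is bitcast to v8i8 and the low
  // seven lanes are extracted:
  //   %wide.bc = bitcast <2 x i32> %wide to <8 x i8>
  //   %orig = shufflevector <8 x i8> %wide.bc, <8 x i8> poison,
  //           <7 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6>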
  VectorType *ExpandedVT = VectorType::get(
      Type::getIntNTy(Mod->getContext(), NewVTy->getScalarSizeInBits()),
      (OriginalSize / NewVTy->getScalarSizeInBits()), false);
  Instruction *Converted =
      cast<Instruction>(Builder.CreateBitCast(V, ExpandedVT));

  unsigned NarrowElementCount = NewVTy->getElementCount().getFixedValue();
  SmallVector<int, 8> ShuffleMask(NarrowElementCount);
  std::iota(ShuffleMask.begin(), ShuffleMask.end(), 0);

  return Builder.CreateShuffleVector(Converted, ShuffleMask);
}

bool LiveRegOptimizer::optimizeLiveType(
    Instruction *I, SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
  SmallVector<Instruction *, 4> Worklist;
  SmallPtrSet<PHINode *, 4> PhiNodes;
  SmallPtrSet<Instruction *, 4> Defs;
  SmallPtrSet<Instruction *, 4> Uses;

  Worklist.push_back(cast<Instruction>(I));
  while (!Worklist.empty()) {
    Instruction *II = Worklist.pop_back_val();

    if (!Visited.insert(II).second)
      continue;

    if (!shouldReplace(II->getType()))
      continue;

    if (PHINode *Phi = dyn_cast<PHINode>(II)) {
      PhiNodes.insert(Phi);
      // Collect all the incoming values of problematic PHI nodes.
      for (Value *V : Phi->incoming_values()) {
        // Repeat the collection process for newly found PHI nodes.
        if (PHINode *OpPhi = dyn_cast<PHINode>(V)) {
          if (!PhiNodes.count(OpPhi) && !Visited.count(OpPhi))
            Worklist.push_back(OpPhi);
          continue;
        }

        Instruction *IncInst = dyn_cast<Instruction>(V);
        // Other incoming value types (e.g. vector literals) are unhandled.
        if (!IncInst && !isa<ConstantAggregateZero>(V))
          return false;

        // Collect all other incoming values for coercion.
        if (IncInst)
          Defs.insert(IncInst);
      }
    }

    // Collect all relevant uses.
    for (User *V : II->users()) {
      // Repeat the collection process for problematic PHI nodes.
      if (PHINode *OpPhi = dyn_cast<PHINode>(V)) {
        if (!PhiNodes.count(OpPhi) && !Visited.count(OpPhi))
          Worklist.push_back(OpPhi);
        continue;
      }

      Instruction *UseInst = cast<Instruction>(V);
      // Collect all uses of PHINodes and any use that crosses BB boundaries.
      if (UseInst->getParent() != II->getParent() || isa<PHINode>(II)) {
        Uses.insert(UseInst);
        if (!Defs.count(II) && !isa<PHINode>(II)) {
          Defs.insert(II);
        }
      }
    }
  }

  // Coerce and track the defs.
  for (Instruction *D : Defs) {
    if (!ValMap.contains(D)) {
      BasicBlock::iterator InsertPt = std::next(D->getIterator());
      Value *ConvertVal = convertToOptType(D, InsertPt);
      assert(ConvertVal);
      ValMap[D] = ConvertVal;
    }
  }

  // Construct new-typed PHI nodes.
  for (PHINode *Phi : PhiNodes) {
    ValMap[Phi] = PHINode::Create(calculateConvertType(Phi->getType()),
                                  Phi->getNumIncomingValues(),
                                  Phi->getName() + ".tc", Phi->getIterator());
  }

  // Connect all the PHI nodes with their new incoming values.
  for (PHINode *Phi : PhiNodes) {
    PHINode *NewPhi = cast<PHINode>(ValMap[Phi]);
    bool MissingIncVal = false;
    for (int I = 0, E = Phi->getNumIncomingValues(); I < E; I++) {
      Value *IncVal = Phi->getIncomingValue(I);
      if (isa<ConstantAggregateZero>(IncVal)) {
        Type *NewType = calculateConvertType(Phi->getType());
        NewPhi->addIncoming(ConstantInt::get(NewType, 0, false),
                            Phi->getIncomingBlock(I));
      } else if (ValMap.contains(IncVal) && ValMap[IncVal])
        NewPhi->addIncoming(ValMap[IncVal], Phi->getIncomingBlock(I));
      else
        MissingIncVal = true;
    }
    if (MissingIncVal) {
      Value *DeadVal = ValMap[Phi];
      // The coercion chain of the PHI is broken. Delete the Phi
      // from the ValMap and any connected / user Phis.
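      // Hypothetical example (not from any test): if %phi.a was given a
      // coerced twin %phi.a.tc, but one of %phi.a's incoming values is a form
      // this pass does not handle, then %phi.a.tc and any coerced phis that
      // use it are walked below and queued for deletion, leaving the original
      // phis untouched.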
      SmallVector<Value *, 4> PHIWorklist;
      SmallPtrSet<Value *, 4> VisitedPhis;
      PHIWorklist.push_back(DeadVal);
      while (!PHIWorklist.empty()) {
        Value *NextDeadValue = PHIWorklist.pop_back_val();
        VisitedPhis.insert(NextDeadValue);
        auto OriginalPhi =
            std::find_if(PhiNodes.begin(), PhiNodes.end(),
                         [this, &NextDeadValue](PHINode *CandPhi) {
                           return ValMap[CandPhi] == NextDeadValue;
                         });
        // This PHI may have already been removed from maps when
        // unwinding a previous Phi.
        if (OriginalPhi != PhiNodes.end())
          ValMap.erase(*OriginalPhi);

        DeadInsts.emplace_back(cast<Instruction>(NextDeadValue));

        for (User *U : NextDeadValue->users()) {
          if (!VisitedPhis.contains(cast<PHINode>(U)))
            PHIWorklist.push_back(U);
        }
      }
    } else {
      DeadInsts.emplace_back(cast<Instruction>(Phi));
    }
  }
  // Coerce back to the original type and replace the uses.
  for (Instruction *U : Uses) {
    // Replace all converted operands for a use.
    for (auto [OpIdx, Op] : enumerate(U->operands())) {
      if (ValMap.contains(Op) && ValMap[Op]) {
        Value *NewVal = nullptr;
        if (BBUseValMap.contains(U->getParent()) &&
            BBUseValMap[U->getParent()].contains(ValMap[Op]))
          NewVal = BBUseValMap[U->getParent()][ValMap[Op]];
        else {
          BasicBlock::iterator InsertPt = U->getParent()->getFirstNonPHIIt();
          // We may pick up ops that were previously converted for users in
          // other blocks. If there is an originally typed definition of the Op
          // already in this block, simply reuse it.
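          // Hypothetical example: a def that was coerced for a cross-block
          // user in an earlier invocation may also feed a use in its own
          // block; that same-block use keeps the original def rather than a
          // round-trip conversion.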
          if (isa<Instruction>(Op) && !isa<PHINode>(Op) &&
              U->getParent() == cast<Instruction>(Op)->getParent()) {
            NewVal = Op;
          } else {
            NewVal =
                convertFromOptType(Op->getType(), cast<Instruction>(ValMap[Op]),
                                   InsertPt, U->getParent());
            BBUseValMap[U->getParent()][ValMap[Op]] = NewVal;
          }
        }
        assert(NewVal);
        U->setOperand(OpIdx, NewVal);
      }
    }
  }

  return true;
}

bool AMDGPULateCodeGenPrepare::canWidenScalarExtLoad(LoadInst &LI) const {
  unsigned AS = LI.getPointerAddressSpace();
  // Skip non-constant address spaces.
  if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
      AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;
  // Skip non-simple loads.
  if (!LI.isSimple())
    return false;
  Type *Ty = LI.getType();
  // Skip aggregate types.
  if (Ty->isAggregateType())
    return false;
  unsigned TySize = DL->getTypeStoreSize(Ty);
  // Only handle sub-DWORD loads.
  if (TySize >= 4)
    return false;
  // The load must be at least naturally aligned.
  if (LI.getAlign() < DL->getABITypeAlign(Ty))
    return false;
  // It should be uniform, i.e. a scalar load.
  return UA->isUniform(&LI);
}

bool AMDGPULateCodeGenPrepare::visitLoadInst(LoadInst &LI) {
  if (!WidenLoads)
    return false;

  // Skip if the load is already at least DWORD aligned, as that case is
  // handled in SDAG.
  if (LI.getAlign() >= 4)
    return false;

  if (!canWidenScalarExtLoad(LI))
    return false;

  int64_t Offset = 0;
  auto *Base =
      GetPointerBaseWithConstantOffset(LI.getPointerOperand(), Offset, *DL);
  // If the base is not DWORD aligned, it's not safe to perform the following
  // transforms.
  if (!isDWORDAligned(Base))
    return false;

  int64_t Adjust = Offset & 0x3;
  if (Adjust == 0) {
    // With a zero adjust, the original alignment can simply be promoted to a
    // better one.
    LI.setAlignment(Align(4));
    return true;
  }

  IRBuilder<> IRB(&LI);
  IRB.SetCurrentDebugLocation(LI.getDebugLoc());

  unsigned LdBits = DL->getTypeStoreSizeInBits(LI.getType());
  auto IntNTy = Type::getIntNTy(LI.getContext(), LdBits);

  auto *NewPtr = IRB.CreateConstGEP1_64(
      IRB.getInt8Ty(),
      IRB.CreateAddrSpaceCast(Base, LI.getPointerOperand()->getType()),
      Offset - Adjust);

  LoadInst *NewLd = IRB.CreateAlignedLoad(IRB.getInt32Ty(), NewPtr, Align(4));
  NewLd->copyMetadata(LI);
  NewLd->setMetadata(LLVMContext::MD_range, nullptr);

  unsigned ShAmt = Adjust * 8;
  auto *NewVal = IRB.CreateBitCast(
      IRB.CreateTrunc(IRB.CreateLShr(NewLd, ShAmt), IntNTy), LI.getType());
  LI.replaceAllUsesWith(NewVal);
  DeadInsts.emplace_back(&LI);

  return true;
}

INITIALIZE_PASS_BEGIN(AMDGPULateCodeGenPrepare, DEBUG_TYPE,
                      "AMDGPU IR late optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
INITIALIZE_PASS_END(AMDGPULateCodeGenPrepare, DEBUG_TYPE,
                    "AMDGPU IR late optimizations", false, false)

char AMDGPULateCodeGenPrepare::ID = 0;

FunctionPass *llvm::createAMDGPULateCodeGenPreparePass() {
  return new AMDGPULateCodeGenPrepare();
}