//===-- AMDGPULateCodeGenPrepare.cpp --------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass does miscellaneous AMDGPU optimizations on IR *just* before
/// instruction selection.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/Utils/Local.h"

#define DEBUG_TYPE "amdgpu-late-codegenprepare"

using namespace llvm;

// Scalar load widening needs to run after the load-store vectorizer, as that
// pass doesn't handle overlapping cases. In addition, this pass enhances the
// widening to handle cases where scalar sub-dword loads are only naturally
// aligned but not DWORD aligned.
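//
// As an illustrative (not verbatim) example, a naturally aligned i16 load
// from a DWORD-aligned base plus a byte offset of 2,
//
//   %v = load i16, i16 addrspace(4)* %p, align 2
//
// is rewritten into an align-4 i32 load at the rounded-down offset, followed
// by a logical shift and a truncation back to the original width:
//
//   %w = load i32, i32 addrspace(4)* %p.dword, align 4
//   %s = lshr i32 %w, 16
//   %v = trunc i32 %s to i16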
static cl::opt<bool>
    WidenLoads("amdgpu-late-codegenprepare-widen-constant-loads",
               cl::desc("Widen sub-dword constant address space loads in "
                        "AMDGPULateCodeGenPrepare"),
               cl::ReallyHidden, cl::init(true));

namespace {

class AMDGPULateCodeGenPrepare
    : public FunctionPass,
      public InstVisitor<AMDGPULateCodeGenPrepare, bool> {
  Module *Mod = nullptr;
  const DataLayout *DL = nullptr;

  AssumptionCache *AC = nullptr;
  LegacyDivergenceAnalysis *DA = nullptr;

public:
  static char ID;

  AMDGPULateCodeGenPrepare() : FunctionPass(ID) {}

  StringRef getPassName() const override {
    return "AMDGPU IR late optimizations";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<LegacyDivergenceAnalysis>();
    AU.setPreservesAll();
  }

  bool doInitialization(Module &M) override;
  bool runOnFunction(Function &F) override;

  bool visitInstruction(Instruction &) { return false; }

  // Check if the specified value is at least DWORD aligned.
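  // A value with at least two known zero low bits is a multiple of 4. Going
  // through known-bits analysis (rather than just the IR alignment) lets this
  // handle bases whose alignment is only implied, e.g. an argument carrying
  // an align(4) attribute.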
  bool isDWORDAligned(const Value *V) const {
    KnownBits Known = computeKnownBits(V, *DL, 0, AC);
    return Known.countMinTrailingZeros() >= 2;
  }

  bool canWidenScalarExtLoad(LoadInst &LI) const;
  bool visitLoadInst(LoadInst &LI);
};

} // end anonymous namespace

bool AMDGPULateCodeGenPrepare::doInitialization(Module &M) {
  Mod = &M;
  DL = &Mod->getDataLayout();
  return false;
}

bool AMDGPULateCodeGenPrepare::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
  DA = &getAnalysis<LegacyDivergenceAnalysis>();

  bool Changed = false;
  for (auto &BB : F)
    for (auto BI = BB.begin(), BE = BB.end(); BI != BE; /*EMPTY*/) {
      Instruction *I = &*BI++;
      Changed |= visit(*I);
    }

  return Changed;
}

bool AMDGPULateCodeGenPrepare::canWidenScalarExtLoad(LoadInst &LI) const {
  unsigned AS = LI.getPointerAddressSpace();
  // Skip loads that are not from the constant address space.
  if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
      AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;
  // Skip non-simple loads.
  if (!LI.isSimple())
    return false;
  auto *Ty = LI.getType();
  // Skip aggregate types.
  if (Ty->isAggregateType())
    return false;
  unsigned TySize = DL->getTypeStoreSize(Ty);
  // Only handle sub-DWORD loads.
  if (TySize >= 4)
    return false;
  // The load must be at least naturally aligned.
  if (LI.getAlign() < DL->getABITypeAlign(Ty))
    return false;
  // It must be uniform, i.e. a scalar load.
  return DA->isUniform(&LI);
}

bool AMDGPULateCodeGenPrepare::visitLoadInst(LoadInst &LI) {
  if (!WidenLoads)
    return false;

  // Skip if the load is already at least DWORD aligned, as that case is
  // handled in SDAG.
  if (LI.getAlign() >= 4)
    return false;

  if (!canWidenScalarExtLoad(LI))
    return false;

  int64_t Offset = 0;
  auto *Base =
      GetPointerBaseWithConstantOffset(LI.getPointerOperand(), Offset, *DL);
  // If the base is not DWORD aligned, it's not safe to perform the following
  // transforms.
  if (!isDWORDAligned(Base))
    return false;

  int64_t Adjust = Offset & 0x3;
  if (Adjust == 0) {
    // With a zero adjustment, the load is already DWORD aligned; just promote
    // its alignment.
    LI.setAlignment(Align(4));
    return true;
  }

  IRBuilder<> IRB(&LI);
  IRB.SetCurrentDebugLocation(LI.getDebugLoc());

  unsigned AS = LI.getPointerAddressSpace();
  unsigned LdBits = DL->getTypeStoreSize(LI.getType()) * 8;
  auto IntNTy = Type::getIntNTy(LI.getContext(), LdBits);

  // Build the DWORD-aligned pointer Base + (Offset - Adjust) as an i32
  // pointer and load a full DWORD from it.
  PointerType *Int32PtrTy = Type::getInt32PtrTy(LI.getContext(), AS);
  PointerType *Int8PtrTy = Type::getInt8PtrTy(LI.getContext(), AS);
  auto *NewPtr = IRB.CreateBitCast(
      IRB.CreateConstGEP1_64(IRB.CreateBitCast(Base, Int8PtrTy),
                             Offset - Adjust),
      Int32PtrTy);
  LoadInst *NewLd = IRB.CreateAlignedLoad(NewPtr, Align(4));
  NewLd->copyMetadata(LI);
  // Drop range metadata: the widened load reads bytes outside the original
  // access, so the old range no longer applies.
  NewLd->setMetadata(LLVMContext::MD_range, nullptr);

  // Shift the loaded DWORD right by the byte adjustment and truncate back to
  // the original type.
  unsigned ShAmt = Adjust * 8;
  auto *NewVal = IRB.CreateBitCast(
      IRB.CreateTrunc(IRB.CreateLShr(NewLd, ShAmt), IntNTy), LI.getType());
  LI.replaceAllUsesWith(NewVal);
  RecursivelyDeleteTriviallyDeadInstructions(&LI);

  return true;
}

INITIALIZE_PASS_BEGIN(AMDGPULateCodeGenPrepare, DEBUG_TYPE,
                      "AMDGPU IR late optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
INITIALIZE_PASS_END(AMDGPULateCodeGenPrepare, DEBUG_TYPE,
                    "AMDGPU IR late optimizations", false, false)

char AMDGPULateCodeGenPrepare::ID = 0;

FunctionPass *llvm::createAMDGPULateCodeGenPreparePass() {
  return new AMDGPULateCodeGenPrepare();
}
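
// A minimal sketch (not part of this file, and hypothetical in its exact
// hook placement) of how the target would schedule this pass: shortly before
// instruction selection, after the load-store vectorizer has already run:
//
//   void GCNPassConfig::addPreISel() {
//     AMDGPUPassConfig::addPreISel();
//     if (TM->getOptLevel() > CodeGenOpt::None)
//       addPass(createAMDGPULateCodeGenPreparePass());
//     ...
//   }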