//===-- AMDGPULateCodeGenPrepare.cpp --------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass does misc. AMDGPU optimizations on IR *just* before instruction
/// selection.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/Utils/Local.h"

#define DEBUG_TYPE "amdgpu-late-codegenprepare"

using namespace llvm;

// Scalar load widening needs to run after the load-store-vectorizer, as that
// pass doesn't handle overlapping cases. In addition, this pass enhances the
// widening to handle cases where scalar sub-dword loads are only naturally
// aligned rather than dword aligned.
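//
// As a rough illustration (hypothetical IR, not taken from an actual test),
// widening a naturally aligned i16 load at byte offset 2 from a DWORD-aligned
// constant-address-space base looks roughly like:
//
//   %gep = getelementptr i8, i8 addrspace(4)* %base, i64 2
//   %ptr = bitcast i8 addrspace(4)* %gep to i16 addrspace(4)*
//   %val = load i16, i16 addrspace(4)* %ptr, align 2
// =>
//   %wptr  = bitcast i8 addrspace(4)* %base to i32 addrspace(4)*
//   %dword = load i32, i32 addrspace(4)* %wptr, align 4
//   %shift = lshr i32 %dword, 16
//   %val   = trunc i32 %shift to i16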
static cl::opt<bool>
    WidenLoads("amdgpu-late-codegenprepare-widen-constant-loads",
               cl::desc("Widen sub-dword constant address space loads in "
                        "AMDGPULateCodeGenPrepare"),
               cl::ReallyHidden, cl::init(true));

namespace {

class AMDGPULateCodeGenPrepare
    : public FunctionPass,
      public InstVisitor<AMDGPULateCodeGenPrepare, bool> {
  Module *Mod = nullptr;
  const DataLayout *DL = nullptr;

  AssumptionCache *AC = nullptr;
  LegacyDivergenceAnalysis *DA = nullptr;

public:
  static char ID;

  AMDGPULateCodeGenPrepare() : FunctionPass(ID) {}

  StringRef getPassName() const override {
    return "AMDGPU IR late optimizations";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<LegacyDivergenceAnalysis>();
    AU.setPreservesAll();
  }

  bool doInitialization(Module &M) override;
  bool runOnFunction(Function &F) override;

  bool visitInstruction(Instruction &) { return false; }

  // Check if the specified value is at least DWORD aligned.
  bool isDWORDAligned(const Value *V) const {
    KnownBits Known = computeKnownBits(V, *DL, 0, AC);
    return Known.countMinTrailingZeros() >= 2;
  }

  bool canWidenScalarExtLoad(LoadInst &LI) const;
  bool visitLoadInst(LoadInst &LI);
};

} // end anonymous namespace

bool AMDGPULateCodeGenPrepare::doInitialization(Module &M) {
  Mod = &M;
  DL = &Mod->getDataLayout();
  return false;
}

bool AMDGPULateCodeGenPrepare::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
  DA = &getAnalysis<LegacyDivergenceAnalysis>();

  bool Changed = false;
  for (auto &BB : F)
    for (auto BI = BB.begin(), BE = BB.end(); BI != BE; /*EMPTY*/) {
      Instruction *I = &*BI++;
      Changed |= visit(*I);
    }

  return Changed;
}
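// A load qualifies for widening only if it is a simple, non-aggregate,
// sub-DWORD load from the constant address space that is at least naturally
// aligned and uniform (i.e. a scalar load).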
bool AMDGPULateCodeGenPrepare::canWidenScalarExtLoad(LoadInst &LI) const {
  unsigned AS = LI.getPointerAddressSpace();
  // Skip non-constant address spaces.
  if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
      AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;
  // Skip non-simple loads.
  if (!LI.isSimple())
    return false;
  auto *Ty = LI.getType();
  // Skip aggregate types.
  if (Ty->isAggregateType())
    return false;
  unsigned TySize = DL->getTypeStoreSize(Ty);
  // Only handle sub-DWORD loads.
  if (TySize >= 4)
    return false;
  // The load must be at least naturally aligned.
  if (LI.getAlign() < DL->getABITypeAlign(Ty))
    return false;
  // It should be uniform, i.e. a scalar load.
  return DA->isUniform(&LI);
}

bool AMDGPULateCodeGenPrepare::visitLoadInst(LoadInst &LI) {
  if (!WidenLoads)
    return false;

  // Skip loads that are already at least DWORD aligned, as they are handled in
  // SDAG.
  if (LI.getAlign() >= 4)
    return false;

  if (!canWidenScalarExtLoad(LI))
    return false;

  int64_t Offset = 0;
  auto *Base =
      GetPointerBaseWithConstantOffset(LI.getPointerOperand(), Offset, *DL);
  // If the base is not DWORD aligned, it's not safe to perform the following
  // transforms.
  if (!isDWORDAligned(Base))
    return false;

  int64_t Adjust = Offset & 0x3;
  if (Adjust == 0) {
    // With a zero adjust, the load address is already DWORD aligned; simply
    // promote the recorded alignment.
    LI.setAlignment(Align(4));
    return true;
  }

  IRBuilder<> IRB(&LI);
  IRB.SetCurrentDebugLocation(LI.getDebugLoc());

  unsigned AS = LI.getPointerAddressSpace();
  unsigned LdBits = DL->getTypeStoreSize(LI.getType()) * 8;
  auto IntNTy = Type::getIntNTy(LI.getContext(), LdBits);

  // Build a DWORD-aligned pointer (Offset - Adjust) bytes from the base and
  // load a full DWORD from it.
  PointerType *Int32PtrTy = Type::getInt32PtrTy(LI.getContext(), AS);
  PointerType *Int8PtrTy = Type::getInt8PtrTy(LI.getContext(), AS);
  auto *NewPtr = IRB.CreateBitCast(
      IRB.CreateConstGEP1_64(
          IRB.getInt8Ty(),
          IRB.CreatePointerBitCastOrAddrSpaceCast(Base, Int8PtrTy),
          Offset - Adjust),
      Int32PtrTy);
  LoadInst *NewLd = IRB.CreateAlignedLoad(IRB.getInt32Ty(), NewPtr, Align(4));
  NewLd->copyMetadata(LI);
  NewLd->setMetadata(LLVMContext::MD_range, nullptr);

  // Shift the loaded DWORD right by the byte adjustment and truncate it back
  // to the original sub-dword type.
  unsigned ShAmt = Adjust * 8;
  auto *NewVal = IRB.CreateBitCast(
      IRB.CreateTrunc(IRB.CreateLShr(NewLd, ShAmt), IntNTy), LI.getType());
  LI.replaceAllUsesWith(NewVal);
  RecursivelyDeleteTriviallyDeadInstructions(&LI);

  return true;
}

INITIALIZE_PASS_BEGIN(AMDGPULateCodeGenPrepare, DEBUG_TYPE,
                      "AMDGPU IR late optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
INITIALIZE_PASS_END(AMDGPULateCodeGenPrepare, DEBUG_TYPE,
                    "AMDGPU IR late optimizations", false, false)

char AMDGPULateCodeGenPrepare::ID = 0;

FunctionPass *llvm::createAMDGPULateCodeGenPreparePass() {
  return new AMDGPULateCodeGenPrepare();
}