//===-- AMDGPULateCodeGenPrepare.cpp --------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass does misc. AMDGPU optimizations on IR *just* before instruction
/// selection.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/Utils/Local.h"
#include <cassert>
#include <iterator>

#define DEBUG_TYPE "amdgpu-late-codegenprepare"

using namespace llvm;

// Scalar load widening needs to run after the load-store-vectorizer, as that
// pass doesn't handle overlapping cases. In addition, this pass enhances the
// widening to handle cases where scalar sub-dword loads are only naturally
// aligned, not dword aligned.
static cl::opt<bool>
    WidenLoads("amdgpu-late-codegenprepare-widen-constant-loads",
               cl::desc("Widen sub-dword constant address space loads in "
                        "AMDGPULateCodeGenPrepare"),
               cl::ReallyHidden, cl::init(true));

namespace {

class AMDGPULateCodeGenPrepare
    : public FunctionPass,
      public InstVisitor<AMDGPULateCodeGenPrepare, bool> {
  Module *Mod = nullptr;
  const DataLayout *DL = nullptr;

  AssumptionCache *AC = nullptr;
  LegacyDivergenceAnalysis *DA = nullptr;

public:
  static char ID;

  AMDGPULateCodeGenPrepare() : FunctionPass(ID) {}

  StringRef getPassName() const override {
    return "AMDGPU IR late optimizations";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<LegacyDivergenceAnalysis>();
    AU.setPreservesAll();
  }

  bool doInitialization(Module &M) override;
  bool runOnFunction(Function &F) override;

  bool visitInstruction(Instruction &) { return false; }

  // Check if the specified value is at least DWORD aligned.
  bool isDWORDAligned(const Value *V) const {
    KnownBits Known = computeKnownBits(V, *DL, 0, AC);
    return Known.countMinTrailingZeros() >= 2;
  }

  bool canWidenScalarExtLoad(LoadInst &LI) const;
  bool visitLoadInst(LoadInst &LI);
};

} // end anonymous namespace

bool AMDGPULateCodeGenPrepare::doInitialization(Module &M) {
  Mod = &M;
  DL = &Mod->getDataLayout();
  return false;
}

bool AMDGPULateCodeGenPrepare::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
  DA = &getAnalysis<LegacyDivergenceAnalysis>();

  bool Changed = false;
  for (auto &BB : F)
    for (auto BI = BB.begin(), BE = BB.end(); BI != BE; /*EMPTY*/) {
      Instruction *I = &*BI++;
      Changed |= visit(*I);
    }

  return Changed;
}

bool AMDGPULateCodeGenPrepare::canWidenScalarExtLoad(LoadInst &LI) const {
  unsigned AS = LI.getPointerAddressSpace();
  // Skip non-constant address spaces.
  if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
      AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;
  // Skip non-simple loads.
  if (!LI.isSimple())
    return false;
  auto *Ty = LI.getType();
  // Skip aggregate types.
  if (Ty->isAggregateType())
    return false;
  unsigned TySize = DL->getTypeStoreSize(Ty);
  // Only handle sub-DWORD loads.
  if (TySize >= 4)
    return false;
  // That load must be at least naturally aligned.
  if (LI.getAlign() < DL->getABITypeAlign(Ty))
    return false;
  // It should be uniform, i.e. a scalar load.
  return DA->isUniform(&LI);
}

bool AMDGPULateCodeGenPrepare::visitLoadInst(LoadInst &LI) {
  if (!WidenLoads)
    return false;

  // Skip if that load is already aligned on DWORD at least as it's handled in
  // SDAG.
  if (LI.getAlign() >= 4)
    return false;

  if (!canWidenScalarExtLoad(LI))
    return false;

  int64_t Offset = 0;
  auto *Base =
      GetPointerBaseWithConstantOffset(LI.getPointerOperand(), Offset, *DL);
  // If that base is not DWORD aligned, it's not safe to perform the following
  // transforms.
  if (!isDWORDAligned(Base))
    return false;

  int64_t Adjust = Offset & 0x3;
  if (Adjust == 0) {
    // With a zero adjust, the original alignment could be promoted with a
    // better one.
    LI.setAlignment(Align(4));
    return true;
  }

  IRBuilder<> IRB(&LI);
  IRB.SetCurrentDebugLocation(LI.getDebugLoc());

  unsigned AS = LI.getPointerAddressSpace();
  unsigned LdBits = DL->getTypeStoreSize(LI.getType()) * 8;
  auto IntNTy = Type::getIntNTy(LI.getContext(), LdBits);

  PointerType *Int32PtrTy = Type::getInt32PtrTy(LI.getContext(), AS);
  PointerType *Int8PtrTy = Type::getInt8PtrTy(LI.getContext(), AS);
  auto *NewPtr = IRB.CreateBitCast(
      IRB.CreateConstGEP1_64(IRB.CreateBitCast(Base, Int8PtrTy),
                             Offset - Adjust),
      Int32PtrTy);
  LoadInst *NewLd = IRB.CreateAlignedLoad(NewPtr, Align(4));
  NewLd->copyMetadata(LI);
  NewLd->setMetadata(LLVMContext::MD_range, nullptr);

  unsigned ShAmt = Adjust * 8;
  auto *NewVal = IRB.CreateBitCast(
      IRB.CreateTrunc(IRB.CreateLShr(NewLd, ShAmt), IntNTy), LI.getType());
  LI.replaceAllUsesWith(NewVal);
  RecursivelyDeleteTriviallyDeadInstructions(&LI);

  return true;
}

INITIALIZE_PASS_BEGIN(AMDGPULateCodeGenPrepare, DEBUG_TYPE,
                      "AMDGPU IR late optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
INITIALIZE_PASS_END(AMDGPULateCodeGenPrepare, DEBUG_TYPE,
                    "AMDGPU IR late optimizations", false, false)

char AMDGPULateCodeGenPrepare::ID = 0;

FunctionPass *llvm::createAMDGPULateCodeGenPreparePass() {
  return new AMDGPULateCodeGenPrepare();
}
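
// Illustrative sketch (not part of the pass): the rewrite that visitLoadInst
// above performs for a naturally aligned i8 load from the constant address
// space at byte offset 5 from a DWORD-aligned base, i.e. Offset = 5,
// Adjust = 1, ShAmt = 8. The value names (%base, %p, %wide, ...) are made up
// for this example; the actual IR emitted by IRBuilder may differ in detail.
//
//   Before:
//     %p    = getelementptr i8, i8 addrspace(4)* %base, i64 5
//     %val  = load i8, i8 addrspace(4)* %p, align 1
//
//   After:
//     %q    = getelementptr i8, i8 addrspace(4)* %base, i64 4
//     %q32  = bitcast i8 addrspace(4)* %q to i32 addrspace(4)*
//     %wide = load i32, i32 addrspace(4)* %q32, align 4
//     %shft = lshr i32 %wide, 8
//     %val2 = trunc i32 %shft to i8
//
// The original load is then replaced by %val2 and erased, together with any
// pointer computation that becomes trivially dead.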