1 //===-------- LoopDataPrefetch.cpp - Loop Data Prefetching Pass -----------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // This file implements a Loop Data Prefetching Pass. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "llvm/Transforms/Scalar/LoopDataPrefetch.h" 15 16 #define DEBUG_TYPE "loop-data-prefetch" 17 #include "llvm/ADT/DepthFirstIterator.h" 18 #include "llvm/ADT/Statistic.h" 19 #include "llvm/Analysis/AssumptionCache.h" 20 #include "llvm/Analysis/CodeMetrics.h" 21 #include "llvm/Analysis/InstructionSimplify.h" 22 #include "llvm/Analysis/LoopInfo.h" 23 #include "llvm/Analysis/OptimizationDiagnosticInfo.h" 24 #include "llvm/Analysis/ScalarEvolution.h" 25 #include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" 26 #include "llvm/Analysis/ScalarEvolutionExpander.h" 27 #include "llvm/Analysis/ScalarEvolutionExpressions.h" 28 #include "llvm/Analysis/TargetTransformInfo.h" 29 #include "llvm/Analysis/ValueTracking.h" 30 #include "llvm/IR/CFG.h" 31 #include "llvm/IR/Dominators.h" 32 #include "llvm/IR/Function.h" 33 #include "llvm/IR/IntrinsicInst.h" 34 #include "llvm/IR/Module.h" 35 #include "llvm/Support/CommandLine.h" 36 #include "llvm/Support/Debug.h" 37 #include "llvm/Transforms/Scalar.h" 38 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 39 #include "llvm/Transforms/Utils/Local.h" 40 #include "llvm/Transforms/Utils/ValueMapper.h" 41 using namespace llvm; 42 43 // By default, we limit this to creating 16 PHIs (which is a little over half 44 // of the allocatable register set). 
// Command-line overrides for the prefetching parameters.  When a flag is not
// given on the command line, the corresponding value comes from the target's
// TargetTransformInfo (see the getters in LoopDataPrefetch below).
static cl::opt<bool>
PrefetchWrites("loop-prefetch-writes", cl::Hidden, cl::init(false),
               cl::desc("Prefetch write addresses"));

static cl::opt<unsigned>
PrefetchDistance("prefetch-distance",
                 cl::desc("Number of instructions to prefetch ahead"),
                 cl::Hidden);

static cl::opt<unsigned>
MinPrefetchStride("min-prefetch-stride",
                  cl::desc("Min stride to add prefetches"), cl::Hidden);

static cl::opt<unsigned> MaxPrefetchIterationsAhead(
    "max-prefetch-iters-ahead",
    cl::desc("Max number of iterations to prefetch ahead"), cl::Hidden);

STATISTIC(NumPrefetches, "Number of prefetches inserted");

namespace {

/// Loop prefetch implementation class.
///
/// Shared by the legacy and new pass-manager entry points: both gather the
/// required analyses and delegate to run().
class LoopDataPrefetch {
public:
  LoopDataPrefetch(AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE,
                   const TargetTransformInfo *TTI,
                   OptimizationRemarkEmitter *ORE)
      : AC(AC), LI(LI), SE(SE), TTI(TTI), ORE(ORE) {}

  /// Walk every loop in the function and insert prefetches into the
  /// inner-most ones.  Returns true if the IR was changed.
  bool run();

private:
  /// Insert prefetches into \p L; no-op unless \p L is inner-most.
  bool runOnLoop(Loop *L);

  /// \brief Check if the stride of the accesses is large enough to
  /// warrant a prefetch.
  bool isStrideLargeEnough(const SCEVAddRecExpr *AR);

  // Each getter below prefers an explicit command-line override (detected via
  // getNumOccurrences) and otherwise falls back to the target's TTI default.

  unsigned getMinPrefetchStride() {
    if (MinPrefetchStride.getNumOccurrences() > 0)
      return MinPrefetchStride;
    return TTI->getMinPrefetchStride();
  }

  unsigned getPrefetchDistance() {
    if (PrefetchDistance.getNumOccurrences() > 0)
      return PrefetchDistance;
    return TTI->getPrefetchDistance();
  }

  unsigned getMaxPrefetchIterationsAhead() {
    if (MaxPrefetchIterationsAhead.getNumOccurrences() > 0)
      return MaxPrefetchIterationsAhead;
    return TTI->getMaxPrefetchIterationsAhead();
  }

  // Analyses supplied by the pass-manager wrappers; not owned.
  AssumptionCache *AC;
  LoopInfo *LI;
  ScalarEvolution *SE;
  const TargetTransformInfo *TTI;
  OptimizationRemarkEmitter *ORE;
};

/// Legacy class for inserting loop data prefetches.
class LoopDataPrefetchLegacyPass : public FunctionPass {
public:
  static char ID; // Pass ID, replacement for typeid
  LoopDataPrefetchLegacyPass() : FunctionPass(ID) {
    initializeLoopDataPrefetchLegacyPassPass(*PassRegistry::getPassRegistry());
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AssumptionCacheTracker>();
    AU.addPreserved<DominatorTreeWrapperPass>();
    AU.addRequired<LoopInfoWrapperPass>();
    AU.addPreserved<LoopInfoWrapperPass>();
    AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
    AU.addRequired<ScalarEvolutionWrapperPass>();
    // FIXME: For some reason, preserving SE here breaks LSR (even if
    // this pass changes nothing).
    // AU.addPreserved<ScalarEvolutionWrapperPass>();
    AU.addRequired<TargetTransformInfoWrapperPass>();
  }

  bool runOnFunction(Function &F) override;
};
}

char LoopDataPrefetchLegacyPass::ID = 0;
INITIALIZE_PASS_BEGIN(LoopDataPrefetchLegacyPass, "loop-data-prefetch",
                      "Loop Data Prefetch", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_END(LoopDataPrefetchLegacyPass, "loop-data-prefetch",
                    "Loop Data Prefetch", false, false)

FunctionPass *llvm::createLoopDataPrefetchPass() {
  return new LoopDataPrefetchLegacyPass();
}

bool LoopDataPrefetch::isStrideLargeEnough(const SCEVAddRecExpr *AR) {
  unsigned TargetMinStride = getMinPrefetchStride();
  // A minimum stride of at most 1 means every stride qualifies.
  if (TargetMinStride <= 1)
    return true;

  const auto *ConstStride = dyn_cast<SCEVConstant>(AR->getStepRecurrence(*SE));
  // If MinStride is set, don't prefetch unless we can ensure that stride is
  // larger.  A non-constant step cannot be proven large enough.
  if (!ConstStride)
    return false;

  unsigned AbsStride = std::abs(ConstStride->getAPInt().getSExtValue());
  return TargetMinStride <= AbsStride;
}

/// New pass-manager entry point: gather the required analyses, run the
/// implementation, and report which analyses survive.
PreservedAnalyses LoopDataPrefetchPass::run(Function &F,
                                            FunctionAnalysisManager &AM) {
  LoopInfo *LI = &AM.getResult<LoopAnalysis>(F);
  ScalarEvolution *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
  AssumptionCache *AC = &AM.getResult<AssumptionAnalysis>(F);
  OptimizationRemarkEmitter *ORE =
      &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
  const TargetTransformInfo *TTI = &AM.getResult<TargetIRAnalysis>(F);

  LoopDataPrefetch LDP(AC, LI, SE, TTI, ORE);
  bool Changed = LDP.run();

  if (Changed) {
    PreservedAnalyses PA;
    PA.preserve<DominatorTreeAnalysis>();
    PA.preserve<LoopAnalysis>();
    return PA;
  }

  return PreservedAnalyses::all();
}

/// Legacy pass-manager entry point; mirrors LoopDataPrefetchPass::run above.
bool LoopDataPrefetchLegacyPass::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
  ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
  AssumptionCache *AC =
      &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
  OptimizationRemarkEmitter *ORE =
      &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
  const TargetTransformInfo *TTI =
      &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);

  LoopDataPrefetch LDP(AC, LI, SE, TTI, ORE);
  return LDP.run();
}

bool LoopDataPrefetch::run() {
  // If PrefetchDistance is not set, don't run the pass.  This gives an
  // opportunity for targets to run this pass for selected subtargets only
  // (whose TTI sets PrefetchDistance).
  if (getPrefetchDistance() == 0)
    return false;
  assert(TTI->getCacheLineSize() && "Cache line size is not set for target");

  bool MadeChange = false;

  // Depth-first over every loop (outer and nested); runOnLoop itself filters
  // out everything but inner-most loops.
  for (Loop *I : *LI)
    for (auto L = df_begin(I), LE = df_end(I); L != LE; ++L)
      MadeChange |= runOnLoop(*L);

  return MadeChange;
}

bool LoopDataPrefetch::runOnLoop(Loop *L) {
  bool MadeChange = false;

  // Only prefetch in the inner-most loop
  if (!L->empty())
    return MadeChange;

  SmallPtrSet<const Value *, 32> EphValues;
  CodeMetrics::collectEphemeralValues(L, AC, EphValues);

  // Calculate the number of iterations ahead to prefetch
  CodeMetrics Metrics;
  for (Loop::block_iterator I = L->block_begin(), IE = L->block_end();
       I != IE; ++I) {

    // If the loop already has prefetches, then assume that the user knows
    // what they are doing and don't add any more.
    for (BasicBlock::iterator J = (*I)->begin(), JE = (*I)->end();
         J != JE; ++J)
      if (CallInst *CI = dyn_cast<CallInst>(J))
        if (Function *F = CI->getCalledFunction())
          if (F->getIntrinsicID() == Intrinsic::prefetch)
            return MadeChange;

    Metrics.analyzeBasicBlock(*I, *TTI, EphValues);
  }
  unsigned LoopSize = Metrics.NumInsts;
  if (!LoopSize)
    LoopSize = 1;

  // Prefetch far enough ahead (in iterations) to cover the target's prefetch
  // distance in instructions, but at least one iteration ahead.
  unsigned ItersAhead = getPrefetchDistance() / LoopSize;
  if (!ItersAhead)
    ItersAhead = 1;

  // Loops so small that we'd have to prefetch implausibly far ahead are
  // skipped entirely.
  if (ItersAhead > getMaxPrefetchIterationsAhead())
    return MadeChange;

  DEBUG(dbgs() << "Prefetching " << ItersAhead
               << " iterations ahead (loop size: " << LoopSize << ") in "
               << L->getHeader()->getParent()->getName() << ": " << *L);

  // Accesses already scheduled for prefetching, used below to suppress
  // duplicate prefetches of the same cache line.
  SmallVector<std::pair<Instruction *, const SCEVAddRecExpr *>, 16> PrefLoads;
  for (Loop::block_iterator I = L->block_begin(), IE = L->block_end();
       I != IE; ++I) {
    for (BasicBlock::iterator J = (*I)->begin(), JE = (*I)->end();
         J != JE; ++J) {
      Value *PtrValue;
      Instruction *MemI;

      // Candidates are loads and, only when -loop-prefetch-writes is given,
      // stores.
      if (LoadInst *LMemI = dyn_cast<LoadInst>(J)) {
        MemI = LMemI;
        PtrValue = LMemI->getPointerOperand();
      } else if (StoreInst *SMemI = dyn_cast<StoreInst>(J)) {
        if (!PrefetchWrites) continue;
        MemI = SMemI;
        PtrValue = SMemI->getPointerOperand();
      } else continue;

      // Only prefetch in the default address space (0).
      unsigned PtrAddrSpace = PtrValue->getType()->getPointerAddressSpace();
      if (PtrAddrSpace)
        continue;

      // A loop-invariant address is touched once; prefetching it buys nothing.
      if (L->isLoopInvariant(PtrValue))
        continue;

      // The address must be an add-recurrence in this loop so we can compute
      // where it will point ItersAhead iterations from now.
      const SCEV *LSCEV = SE->getSCEV(PtrValue);
      const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
      if (!LSCEVAddRec)
        continue;

      // Check if the stride of the accesses is large enough to warrant a
      // prefetch.
      if (!isStrideLargeEnough(LSCEVAddRec))
        continue;

      // We don't want to double prefetch individual cache lines. If this load
      // is known to be within one cache line of some other load that has
      // already been prefetched, then don't prefetch this one as well.
      bool DupPref = false;
      for (const auto &PrefLoad : PrefLoads) {
        const SCEV *PtrDiff = SE->getMinusSCEV(LSCEVAddRec, PrefLoad.second);
        if (const SCEVConstant *ConstPtrDiff =
                dyn_cast<SCEVConstant>(PtrDiff)) {
          int64_t PD = std::abs(ConstPtrDiff->getValue()->getSExtValue());
          if (PD < (int64_t) TTI->getCacheLineSize()) {
            DupPref = true;
            break;
          }
        }
      }
      if (DupPref)
        continue;

      // Address to prefetch: this access's recurrence advanced by
      // ItersAhead * step.
      const SCEV *NextLSCEV = SE->getAddExpr(LSCEVAddRec, SE->getMulExpr(
        SE->getConstant(LSCEVAddRec->getType(), ItersAhead),
        LSCEVAddRec->getStepRecurrence(*SE)));
      if (!isSafeToExpand(NextLSCEV, *SE))
        continue;

      PrefLoads.push_back(std::make_pair(MemI, LSCEVAddRec));

      // Materialize the future address as an i8* immediately before the
      // memory instruction.
      Type *I8Ptr = Type::getInt8PtrTy((*I)->getContext(), PtrAddrSpace);
      SCEVExpander SCEVE(*SE, J->getModule()->getDataLayout(), "prefaddr");
      Value *PrefPtrValue = SCEVE.expandCodeFor(NextLSCEV, I8Ptr, MemI);

      IRBuilder<> Builder(MemI);
      Module *M = (*I)->getParent()->getParent();
      Type *I32 = Type::getInt32Ty((*I)->getContext());
      Value *PrefetchFunc = Intrinsic::getDeclaration(M, Intrinsic::prefetch);
      // llvm.prefetch(addr, rw, locality, cache-type): rw is 0 for a read
      // (load) and 1 for a write; locality 3 and cache-type 1 (data cache)
      // per the intrinsic's LangRef definition.
      Builder.CreateCall(
          PrefetchFunc,
          {PrefPtrValue,
           ConstantInt::get(I32, MemI->mayReadFromMemory() ? 0 : 1),
           ConstantInt::get(I32, 3), ConstantInt::get(I32, 1)});
      ++NumPrefetches;
      DEBUG(dbgs() << " Access: " << *PtrValue << ", SCEV: " << *LSCEV
                   << "\n");
      ORE->emitOptimizationRemark(DEBUG_TYPE, MemI, "prefetched memory access");

      MadeChange = true;
    }
  }

  return MadeChange;
}