//===- AMDGPUPerfHintAnalysis.cpp - analysis of functions memory traffic --===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Analyzes if a function is potentially memory bound and if a kernel
/// may benefit from limiting the number of waves to reduce cache thrashing.
///
//===----------------------------------------------------------------------===//

#include "AMDGPUPerfHintAnalysis.h"
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/CallGraphSCCPass.h"
#include "llvm/Analysis/LazyCallGraph.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;

#define DEBUG_TYPE "amdgpu-perf-hint"

static cl::opt<unsigned>
    MemBoundThresh("amdgpu-membound-threshold", cl::init(50), cl::Hidden,
                   cl::desc("Function mem bound threshold in %"));

static cl::opt<unsigned>
    LimitWaveThresh("amdgpu-limit-wave-threshold", cl::init(50), cl::Hidden,
                    cl::desc("Kernel limit wave threshold in %"));

static cl::opt<unsigned>
    IAWeight("amdgpu-indirect-access-weight", cl::init(1000), cl::Hidden,
             cl::desc("Indirect access memory instruction weight"));

static cl::opt<unsigned>
    LSWeight("amdgpu-large-stride-weight", cl::init(1000), cl::Hidden,
             cl::desc("Large stride memory access weight"));

static cl::opt<unsigned>
    LargeStrideThresh("amdgpu-large-stride-threshold", cl::init(64), cl::Hidden,
                      cl::desc("Large stride memory access threshold"));

STATISTIC(NumMemBound, "Number of functions marked as memory bound");
STATISTIC(NumLimitWave, "Number of functions marked as needing limit wave");

namespace {

struct AMDGPUPerfHint {
  friend AMDGPUPerfHintAnalysis;

public:
  AMDGPUPerfHint(AMDGPUPerfHintAnalysis::FuncInfoMap &FIM_,
                 const SITargetLowering *TLI_)
      : FIM(FIM_), TLI(TLI_) {}

  bool runOnFunction(Function &F);

private:
  struct MemAccessInfo {
    const Value *V = nullptr;
    const Value *Base = nullptr;
    int64_t Offset = 0;
    MemAccessInfo() = default;
    bool isLargeStride(MemAccessInfo &Reference) const;
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    Printable print() const {
      return Printable([this](raw_ostream &OS) {
        OS << "Value: " << *V << '\n'
           << "Base: " << *Base << " Offset: " << Offset << '\n';
      });
    }
#endif
  };

  MemAccessInfo makeMemAccessInfo(Instruction *) const;

  MemAccessInfo LastAccess; // Last memory access info

  AMDGPUPerfHintAnalysis::FuncInfoMap &FIM;

  const DataLayout *DL = nullptr;

  const SITargetLowering *TLI;

  AMDGPUPerfHintAnalysis::FuncInfo *visit(const Function &F);
  static bool isMemBound(const AMDGPUPerfHintAnalysis::FuncInfo &F);
  static bool needLimitWave(const AMDGPUPerfHintAnalysis::FuncInfo &F);

  bool isIndirectAccess(const Instruction *Inst) const;

  /// Check if the instruction is a large-stride access.
  /// The purpose is to identify memory access patterns like:
  ///   x = a[i];
  ///   y = a[i+1000];
  ///   z = a[i+2000];
  /// In the above example, the second and third memory accesses will be
  /// marked as large-stride memory accesses.
  bool isLargeStride(const Instruction *Inst);

  bool isGlobalAddr(const Value *V) const;
  bool isLocalAddr(const Value *V) const;
  bool isGlobalLoadUsedInBB(const Instruction &) const;
};

static std::pair<const Value *, const Type *>
getMemoryInstrPtrAndType(const Instruction *Inst) {
  if (const auto *LI = dyn_cast<LoadInst>(Inst))
    return {LI->getPointerOperand(), LI->getType()};
  if (const auto *SI = dyn_cast<StoreInst>(Inst))
    return {SI->getPointerOperand(), SI->getValueOperand()->getType()};
  if (const auto *AI = dyn_cast<AtomicCmpXchgInst>(Inst))
    return {AI->getPointerOperand(), AI->getCompareOperand()->getType()};
  if (const auto *AI = dyn_cast<AtomicRMWInst>(Inst))
    return {AI->getPointerOperand(), AI->getValOperand()->getType()};
  if (const auto *MI = dyn_cast<AnyMemIntrinsic>(Inst))
    return {MI->getRawDest(), Type::getInt8Ty(MI->getContext())};

  return {nullptr, nullptr};
}

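// Added descriptive comment: returns true if the pointer feeding this memory
// instruction is itself derived from a value loaded from global (or flat)
// memory, i.e. the access is indirect. The worklist walks the address
// computation back through GEPs, casts, binary operators, selects and
// extractelement instructions looking for such a load.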
bool AMDGPUPerfHint::isIndirectAccess(const Instruction *Inst) const {
  LLVM_DEBUG(dbgs() << "[isIndirectAccess] " << *Inst << '\n');
  SmallSet<const Value *, 32> WorkSet;
  SmallSet<const Value *, 32> Visited;
  if (const Value *MO = getMemoryInstrPtrAndType(Inst).first) {
    if (isGlobalAddr(MO))
      WorkSet.insert(MO);
  }

  while (!WorkSet.empty()) {
    const Value *V = *WorkSet.begin();
    WorkSet.erase(*WorkSet.begin());
    if (!Visited.insert(V).second)
      continue;
    LLVM_DEBUG(dbgs() << "  check: " << *V << '\n');

    if (const auto *LD = dyn_cast<LoadInst>(V)) {
      const auto *M = LD->getPointerOperand();
      if (isGlobalAddr(M)) {
        LLVM_DEBUG(dbgs() << "    is IA\n");
        return true;
      }
      continue;
    }

    if (const auto *GEP = dyn_cast<GetElementPtrInst>(V)) {
      const auto *P = GEP->getPointerOperand();
      WorkSet.insert(P);
      for (unsigned I = 1, E = GEP->getNumIndices() + 1; I != E; ++I)
        WorkSet.insert(GEP->getOperand(I));
      continue;
    }

    if (const auto *U = dyn_cast<UnaryInstruction>(V)) {
      WorkSet.insert(U->getOperand(0));
      continue;
    }

    if (const auto *BO = dyn_cast<BinaryOperator>(V)) {
      WorkSet.insert(BO->getOperand(0));
      WorkSet.insert(BO->getOperand(1));
      continue;
    }

    if (const auto *S = dyn_cast<SelectInst>(V)) {
      WorkSet.insert(S->getFalseValue());
      WorkSet.insert(S->getTrueValue());
      continue;
    }

    if (const auto *E = dyn_cast<ExtractElementInst>(V)) {
      WorkSet.insert(E->getVectorOperand());
      continue;
    }

    LLVM_DEBUG(dbgs() << "    dropped\n");
  }

  LLVM_DEBUG(dbgs() << "  is not IA\n");
  return false;
}

// Returns true if the global load `I` is used in its own basic block.
bool AMDGPUPerfHint::isGlobalLoadUsedInBB(const Instruction &I) const {
  const auto *Ld = dyn_cast<LoadInst>(&I);
  if (!Ld)
    return false;
  if (!isGlobalAddr(Ld->getPointerOperand()))
    return false;

  for (const User *Usr : Ld->users()) {
    if (const Instruction *UsrInst = dyn_cast<Instruction>(Usr)) {
      if (UsrInst->getParent() == I.getParent())
        return true;
    }
  }

  return false;
}

AMDGPUPerfHintAnalysis::FuncInfo *AMDGPUPerfHint::visit(const Function &F) {
  AMDGPUPerfHintAnalysis::FuncInfo &FI = FIM[&F];

  LLVM_DEBUG(dbgs() << "[AMDGPUPerfHint] process " << F.getName() << '\n');

  for (auto &B : F) {
    LastAccess = MemAccessInfo();
    unsigned UsedGlobalLoadsInBB = 0;
    for (auto &I : B) {
      if (const Type *Ty = getMemoryInstrPtrAndType(&I).second) {
        unsigned Size = divideCeil(Ty->getPrimitiveSizeInBits(), 32);
        // TODO: Check if the global load and its user are close to each other
        // instead (Or do this analysis in GCNSchedStrategy?).
        if (isGlobalLoadUsedInBB(I))
          UsedGlobalLoadsInBB += Size;
        if (isIndirectAccess(&I))
          FI.IAMInstCost += Size;
        if (isLargeStride(&I))
          FI.LSMInstCost += Size;
        FI.MemInstCost += Size;
        FI.InstCost += Size;
        continue;
      }
      if (auto *CB = dyn_cast<CallBase>(&I)) {
        Function *Callee = CB->getCalledFunction();
        if (!Callee || Callee->isDeclaration()) {
          ++FI.InstCost;
          continue;
        }
        if (&F == Callee) // Handle immediate recursion
          continue;

        auto Loc = FIM.find(Callee);
        if (Loc == FIM.end())
          continue;

        FI.MemInstCost += Loc->second.MemInstCost;
        FI.InstCost += Loc->second.InstCost;
        FI.IAMInstCost += Loc->second.IAMInstCost;
        FI.LSMInstCost += Loc->second.LSMInstCost;
      } else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
        TargetLoweringBase::AddrMode AM;
        auto *Ptr = GetPointerBaseWithConstantOffset(GEP, AM.BaseOffs, *DL);
        AM.BaseGV = dyn_cast_or_null<GlobalValue>(const_cast<Value *>(Ptr));
        AM.HasBaseReg = !AM.BaseGV;
        if (TLI->isLegalAddressingMode(*DL, AM, GEP->getResultElementType(),
                                       GEP->getPointerAddressSpace()))
          // Offset will likely be folded into load or store
          continue;
        ++FI.InstCost;
      } else {
        ++FI.InstCost;
      }
    }

    if (!FI.HasDenseGlobalMemAcc) {
      unsigned GlobalMemAccPercentage = UsedGlobalLoadsInBB * 100 / B.size();
      if (GlobalMemAccPercentage > 50) {
        LLVM_DEBUG(dbgs() << "[HasDenseGlobalMemAcc] Set to true since "
                          << B.getName() << " has " << GlobalMemAccPercentage
                          << "% global memory access\n");
        FI.HasDenseGlobalMemAcc = true;
      }
    }
  }

  return &FI;
}

bool AMDGPUPerfHint::runOnFunction(Function &F) {
  const Module &M = *F.getParent();
  DL = &M.getDataLayout();

  if (F.hasFnAttribute("amdgpu-wave-limiter") &&
      F.hasFnAttribute("amdgpu-memory-bound"))
    return false;

  const AMDGPUPerfHintAnalysis::FuncInfo *Info = visit(F);

  LLVM_DEBUG(dbgs() << F.getName() << " MemInst cost: " << Info->MemInstCost
                    << '\n'
                    << " IAMInst cost: " << Info->IAMInstCost << '\n'
                    << " LSMInst cost: " << Info->LSMInstCost << '\n'
                    << " TotalInst cost: " << Info->InstCost << '\n');

  bool Changed = false;

  if (isMemBound(*Info)) {
    LLVM_DEBUG(dbgs() << F.getName() << " is memory bound\n");
    NumMemBound++;
    F.addFnAttr("amdgpu-memory-bound", "true");
    Changed = true;
  }

  if (AMDGPU::isEntryFunctionCC(F.getCallingConv()) && needLimitWave(*Info)) {
    LLVM_DEBUG(dbgs() << F.getName() << " needs limit wave\n");
    NumLimitWave++;
    F.addFnAttr("amdgpu-wave-limiter", "true");
    Changed = true;
  }

  return Changed;
}

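// Added descriptive comment: a function is treated as memory bound if any of
// its basic blocks was flagged as having dense global memory access, or if
// its memory instruction cost exceeds MemBoundThresh percent of the total
// instruction cost. needLimitWave below applies the same percentage test but
// weights indirect (IAWeight) and large-stride (LSWeight) accesses.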
bool AMDGPUPerfHint::isMemBound(const AMDGPUPerfHintAnalysis::FuncInfo &FI) {
  // Reverting optimal scheduling in favour of occupancy with basic block(s)
  // having dense global memory access can potentially hurt performance.
  if (FI.HasDenseGlobalMemAcc)
    return true;

  return FI.MemInstCost * 100 / FI.InstCost > MemBoundThresh;
}

bool AMDGPUPerfHint::needLimitWave(const AMDGPUPerfHintAnalysis::FuncInfo &FI) {
  return ((FI.MemInstCost + FI.IAMInstCost * IAWeight +
           FI.LSMInstCost * LSWeight) * 100 / FI.InstCost) > LimitWaveThresh;
}

bool AMDGPUPerfHint::isGlobalAddr(const Value *V) const {
  if (auto *PT = dyn_cast<PointerType>(V->getType())) {
    unsigned As = PT->getAddressSpace();
    // Flat likely points to global too.
    return As == AMDGPUAS::GLOBAL_ADDRESS || As == AMDGPUAS::FLAT_ADDRESS;
  }
  return false;
}

bool AMDGPUPerfHint::isLocalAddr(const Value *V) const {
  if (auto *PT = dyn_cast<PointerType>(V->getType()))
    return PT->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
  return false;
}

bool AMDGPUPerfHint::isLargeStride(const Instruction *Inst) {
  LLVM_DEBUG(dbgs() << "[isLargeStride] " << *Inst << '\n');

  MemAccessInfo MAI = makeMemAccessInfo(const_cast<Instruction *>(Inst));
  bool IsLargeStride = MAI.isLargeStride(LastAccess);
  if (MAI.Base)
    LastAccess = std::move(MAI);

  return IsLargeStride;
}

AMDGPUPerfHint::MemAccessInfo
AMDGPUPerfHint::makeMemAccessInfo(Instruction *Inst) const {
  MemAccessInfo MAI;
  const Value *MO = getMemoryInstrPtrAndType(Inst).first;

  LLVM_DEBUG(dbgs() << "[isLargeStride] MO: " << *MO << '\n');
  // Do not treat local-addr memory access as large stride.
  if (isLocalAddr(MO))
    return MAI;

  MAI.V = MO;
  MAI.Base = GetPointerBaseWithConstantOffset(MO, MAI.Offset, *DL);
  return MAI;
}

bool AMDGPUPerfHint::MemAccessInfo::isLargeStride(
    MemAccessInfo &Reference) const {

  if (!Base || !Reference.Base || Base != Reference.Base)
    return false;

  uint64_t Diff = Offset > Reference.Offset ? Offset - Reference.Offset
                                            : Reference.Offset - Offset;
  bool Result = Diff > LargeStrideThresh;
  LLVM_DEBUG(dbgs() << "[isLargeStride compare]\n"
                    << print() << "<=>\n"
                    << Reference.print() << "Result:" << Result << '\n');
  return Result;
}

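// Added descriptive comment: legacy pass manager wrapper. CallGraphSCCPass
// visits SCCs bottom-up, so callee entries in FIM are normally populated
// before their callers are analyzed (see the FIXME about cross-SCC state
// below).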
class AMDGPUPerfHintAnalysisLegacy : public CallGraphSCCPass {
private:
  // FIXME: This is relying on maintaining state between different SCCs.
  AMDGPUPerfHintAnalysis Impl;

public:
  static char ID;

  AMDGPUPerfHintAnalysisLegacy() : CallGraphSCCPass(ID) {}

  bool runOnSCC(CallGraphSCC &SCC) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesAll();
  }
};

} // namespace

bool AMDGPUPerfHintAnalysis::isMemoryBound(const Function *F) const {
  auto FI = FIM.find(F);
  if (FI == FIM.end())
    return false;

  return AMDGPUPerfHint::isMemBound(FI->second);
}

bool AMDGPUPerfHintAnalysis::needsWaveLimiter(const Function *F) const {
  auto FI = FIM.find(F);
  if (FI == FIM.end())
    return false;

  return AMDGPUPerfHint::needLimitWave(FI->second);
}

bool AMDGPUPerfHintAnalysis::runOnSCC(const GCNTargetMachine &TM,
                                      CallGraphSCC &SCC) {
  bool Changed = false;
  for (CallGraphNode *I : SCC) {
    Function *F = I->getFunction();
    if (!F || F->isDeclaration())
      continue;

    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(*F);
    AMDGPUPerfHint Analyzer(FIM, ST.getTargetLowering());

    if (Analyzer.runOnFunction(*F))
      Changed = true;
  }

  return Changed;
}

bool AMDGPUPerfHintAnalysis::run(const GCNTargetMachine &TM,
                                 LazyCallGraph &CG) {
  bool Changed = false;

  CG.buildRefSCCs();

  for (LazyCallGraph::RefSCC &RC : CG.postorder_ref_sccs()) {
    for (LazyCallGraph::SCC &SCC : RC) {
      if (SCC.size() != 1)
        continue;
      Function &F = SCC.begin()->getFunction();
      // TODO: Skip without norecurse, or interposable?
      if (F.isDeclaration())
        continue;

      const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
      AMDGPUPerfHint Analyzer(FIM, ST.getTargetLowering());
      if (Analyzer.runOnFunction(F))
        Changed = true;
    }
  }

  return Changed;
}

char AMDGPUPerfHintAnalysisLegacy::ID = 0;
char &llvm::AMDGPUPerfHintAnalysisLegacyID = AMDGPUPerfHintAnalysisLegacy::ID;

INITIALIZE_PASS(AMDGPUPerfHintAnalysisLegacy, DEBUG_TYPE,
                "Analysis if a function is memory bound", true, true)

bool AMDGPUPerfHintAnalysisLegacy::runOnSCC(CallGraphSCC &SCC) {
  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    return false;

  const GCNTargetMachine &TM = TPC->getTM<GCNTargetMachine>();
  return Impl.runOnSCC(TM, SCC);
}

PreservedAnalyses AMDGPUPerfHintAnalysisPass::run(Module &M,
                                                  ModuleAnalysisManager &AM) {
  auto &CG = AM.getResult<LazyCallGraphAnalysis>(M);

  bool Changed = Impl->run(TM, CG);
  if (!Changed)
    return PreservedAnalyses::all();

  PreservedAnalyses PA;
  PA.preserve<LazyCallGraphAnalysis>();
  return PA;
}