//===- bolt/Passes/IndirectCallPromotion.cpp ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the IndirectCallPromotion class.
//
//===----------------------------------------------------------------------===//

#include "bolt/Passes/IndirectCallPromotion.h"
#include "bolt/Passes/BinaryFunctionCallGraph.h"
#include "bolt/Passes/DataflowInfoManager.h"
#include "bolt/Passes/Inliner.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Support/CommandLine.h"
#include <iterator>

#define DEBUG_TYPE "ICP"
// Execute the statement(s) X only when the command-line verbosity level is at
// least Level.
#define DEBUG_VERBOSE(Level, X)                                                \
  if (opts::Verbosity >= (Level)) {                                            \
    X;                                                                         \
  }

using namespace llvm;
using namespace bolt;

namespace opts {

extern cl::OptionCategory BoltOptCategory;

extern cl::opt<IndirectCallPromotionType> ICP;
extern cl::opt<unsigned> Verbosity;
extern cl::opt<unsigned> ExecutionCountThreshold;

// Percentage thresholds gating promotion of a jump-table target: measured
// against the count still unpromoted at this site, and against the total.
static cl::opt<unsigned> ICPJTRemainingPercentThreshold(
    "icp-jt-remaining-percent-threshold",
    cl::desc("The percentage threshold against remaining unpromoted indirect "
             "call count for the promotion for jump tables"),
    cl::init(30), cl::ZeroOrMore, cl::Hidden, cl::cat(BoltOptCategory));

static cl::opt<unsigned> ICPJTTotalPercentThreshold(
    "icp-jt-total-percent-threshold",
    cl::desc(
        "The percentage threshold against total count for the promotion for "
        "jump tables"),
    cl::init(5), cl::Hidden, cl::cat(BoltOptCategory));

// Analogous percentage thresholds for promoting indirect calls.
static cl::opt<unsigned> ICPCallsRemainingPercentThreshold(
    "icp-calls-remaining-percent-threshold",
    cl::desc("The percentage threshold against remaining unpromoted indirect "
             "call count for the promotion for calls"),
    cl::init(50), cl::Hidden, cl::cat(BoltOptCategory));

static cl::opt<unsigned> ICPCallsTotalPercentThreshold(
    "icp-calls-total-percent-threshold",
    cl::desc(
        "The percentage threshold against total count for the promotion for "
        "calls"),
    cl::init(30), cl::Hidden, cl::cat(BoltOptCategory));

static cl::opt<unsigned> ICPMispredictThreshold(
    "indirect-call-promotion-mispredict-threshold",
    cl::desc("misprediction threshold for skipping ICP on an "
             "indirect call"),
    cl::init(0), cl::cat(BoltOptCategory));

static cl::alias ICPMispredictThresholdAlias(
    "icp-mp-threshold",
    cl::desc("alias for --indirect-call-promotion-mispredict-threshold"),
    cl::aliasopt(ICPMispredictThreshold));

static cl::opt<bool> ICPUseMispredicts(
    "indirect-call-promotion-use-mispredicts",
    cl::desc("use misprediction frequency for determining whether or not ICP "
             "should be applied at a callsite. The "
             "-indirect-call-promotion-mispredict-threshold value will be used "
             "by this heuristic"),
    cl::cat(BoltOptCategory));

static cl::alias ICPUseMispredictsAlias(
    "icp-use-mp",
    cl::desc("alias for --indirect-call-promotion-use-mispredicts"),
    cl::aliasopt(ICPUseMispredicts));

// Caps on the number of promoted targets per site. The call- and
// jump-table-specific options fall back to the generic -topn when 0.
static cl::opt<unsigned>
    ICPTopN("indirect-call-promotion-topn",
            cl::desc("limit number of targets to consider when doing indirect "
                     "call promotion. 0 = no limit"),
            cl::init(3), cl::cat(BoltOptCategory));

static cl::alias
    ICPTopNAlias("icp-topn",
                 cl::desc("alias for --indirect-call-promotion-topn"),
                 cl::aliasopt(ICPTopN));

static cl::opt<unsigned> ICPCallsTopN(
    "indirect-call-promotion-calls-topn",
    cl::desc("limit number of targets to consider when doing indirect "
             "call promotion on calls. 0 = no limit"),
    cl::init(0), cl::cat(BoltOptCategory));

static cl::alias ICPCallsTopNAlias(
    "icp-calls-topn",
    cl::desc("alias for --indirect-call-promotion-calls-topn"),
    cl::aliasopt(ICPCallsTopN));

static cl::opt<unsigned> ICPJumpTablesTopN(
    "indirect-call-promotion-jump-tables-topn",
    cl::desc("limit number of targets to consider when doing indirect "
             "call promotion on jump tables. 0 = no limit"),
    cl::init(0), cl::cat(BoltOptCategory));

static cl::alias ICPJumpTablesTopNAlias(
    "icp-jt-topn",
    cl::desc("alias for --indirect-call-promotion-jump-tables-topn"),
    cl::aliasopt(ICPJumpTablesTopN));

static cl::opt<bool> EliminateLoads(
    "icp-eliminate-loads",
    cl::desc("enable load elimination using memory profiling data when "
             "performing ICP"),
    cl::init(true), cl::cat(BoltOptCategory));

static cl::opt<unsigned> ICPTopCallsites(
    "icp-top-callsites",
    cl::desc("optimize hottest calls until at least this percentage of all "
             "indirect calls frequency is covered. 0 = all callsites"),
    cl::init(99), cl::Hidden, cl::cat(BoltOptCategory));

static cl::list<std::string>
    ICPFuncsList("icp-funcs", cl::CommaSeparated,
                 cl::desc("list of functions to enable ICP for"),
                 cl::value_desc("func1,func2,func3,..."), cl::Hidden,
                 cl::cat(BoltOptCategory));

static cl::opt<bool>
    ICPOldCodeSequence("icp-old-code-sequence",
                       cl::desc("use old code sequence for promoted calls"),
                       cl::Hidden, cl::cat(BoltOptCategory));

static cl::opt<bool> ICPJumpTablesByTarget(
    "icp-jump-tables-targets",
    cl::desc(
        "for jump tables, optimize indirect jmp targets instead of indices"),
    cl::Hidden, cl::cat(BoltOptCategory));

static cl::alias
    ICPJumpTablesByTargetAlias("icp-jt-targets",
                               cl::desc("alias for --icp-jump-tables-targets"),
                               cl::aliasopt(ICPJumpTablesByTarget));

static cl::opt<bool> ICPPeelForInline(
    "icp-inline", cl::desc("only promote call targets eligible for inlining"),
    cl::Hidden, cl::cat(BoltOptCategory));

} // namespace opts

// Post-pass profile sanity check: every CFG edge that still carries a positive
// profiled count must connect two blocks whose own execution counts are known
// and non-zero. Warns and returns false if any function violates this.
static bool verifyProfile(std::map<uint64_t, BinaryFunction> &BFs) {
  bool IsValid = true;
  for (auto &BFI : BFs) {
    BinaryFunction &BF = BFI.second;
    if (!BF.isSimple())
      continue;
    for (const BinaryBasicBlock &BB : BF) {
      auto BI = BB.branch_info_begin();
      // Branch-info entries run parallel to the successor list; advance BI in
      // lockstep with SuccBB.
      for (BinaryBasicBlock *SuccBB : BB.successors()) {
        if (BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE && BI->Count > 0) {
          if (BB.getKnownExecutionCount() == 0 ||
              SuccBB->getKnownExecutionCount() == 0) {
            errs() << "BOLT-WARNING: profile verification failed after ICP for "
                      "function "
                   << BF << '\n';
            IsValid = false;
          }
        }
        ++BI;
      }
    }
  }
  return IsValid;
}

namespace llvm {
namespace bolt {

// Build a callsite record from one indirect-call profile entry. When the
// profile names a target symbol, it takes precedence over the raw offset.
IndirectCallPromotion::Callsite::Callsite(BinaryFunction &BF,
                                          const IndirectCallProfile &ICP)
    : From(BF.getSymbol()), To(ICP.Offset), Mispreds(ICP.Mispreds),
      Branches(ICP.Count) {
  if (ICP.Symbol) {
    To.Sym = ICP.Symbol;
    To.Addr = 0;
  }
}

// Print a human-readable summary of the promotion decision for one callsite:
// per-target branch/mispredict counts (absolute and as percentages of the
// totals), with the first N targets flagged as selected for promotion.
void IndirectCallPromotion::printDecision(
    llvm::raw_ostream &OS,
    std::vector<IndirectCallPromotion::Callsite> &Targets, unsigned N) const {
  uint64_t TotalCount = 0;
  uint64_t TotalMispreds = 0;
  for (const Callsite &S : Targets) {
    TotalCount += S.Branches;
    TotalMispreds += S.Mispreds;
  }
  // Guard the percentage divisions below against zero totals.
  if (!TotalCount)
    TotalCount = 1;
  if (!TotalMispreds)
    TotalMispreds = 1;

  OS << "BOLT-INFO: ICP decision for call site with " << Targets.size()
     << " targets, Count = " << TotalCount << ", Mispreds = " << TotalMispreds
     << "\n";

  size_t I = 0;
  for (const Callsite &S : Targets) {
    OS << "Count = " << S.Branches << ", "
       << format("%.1f", (100.0 * S.Branches) / TotalCount) << ", "
       << "Mispreds = " << S.Mispreds << ", "
       << format("%.1f", (100.0 * S.Mispreds) / TotalMispreds);
    if (I < N)
      OS << " * to be optimized *";
    if (!S.JTIndices.empty()) {
      OS << " Indices:";
      for (const uint64_t Idx : S.JTIndices)
        OS << " " << Idx;
    }
    OS << "\n";
    // A jump-table callsite occupies one slot per index it covers, so the
    // "to be optimized" cutoff advances by the number of indices.
    I += S.JTIndices.empty() ? 1 : S.JTIndices.size();
  }
}

// Get list of targets for a given call sorted by most frequently
// called first.
std::vector<IndirectCallPromotion::Callsite>
IndirectCallPromotion::getCallTargets(BinaryBasicBlock &BB,
                                      const MCInst &Inst) const {
  BinaryFunction &BF = *BB.getFunction();
  const BinaryContext &BC = BF.getBinaryContext();
  std::vector<Callsite> Targets;

  if (const JumpTable *JT = BF.getJumpTable(Inst)) {
    // Don't support PIC jump tables for now
    if (!opts::ICPJumpTablesByTarget && JT->Type == JumpTable::JTT_PIC)
      return Targets;
    const Location From(BF.getSymbol());
    const std::pair<size_t, size_t> Range =
        JT->getEntriesForAddress(BC.MIB->getJumpTable(Inst));
    assert(JT->Counts.empty() || JT->Counts.size() >= Range.second);
    JumpTable::JumpInfo DefaultJI;
    const JumpTable::JumpInfo *JI =
        JT->Counts.empty() ? &DefaultJI : &JT->Counts[Range.first];
    // With no per-entry counts available, keep JI parked on the zeroed
    // DefaultJI (stride 0); otherwise walk the parallel Counts array.
    const size_t JIAdj = JT->Counts.empty() ? 0 : 1;
    assert(JT->Type == JumpTable::JTT_PIC ||
           JT->EntrySize == BC.AsmInfo->getCodePointerSize());
    for (size_t I = Range.first; I < Range.second; ++I, JI += JIAdj) {
      MCSymbol *Entry = JT->Entries[I];
      const BinaryBasicBlock *ToBB = BF.getBasicBlockForLabel(Entry);
      assert(ToBB || Entry == BF.getFunctionEndLabel() ||
             Entry == BF.getFunctionEndLabel(FragmentNum::cold()));
      // Skip (unreachable) end-of-function entries.
      if (Entry == BF.getFunctionEndLabel() ||
          Entry == BF.getFunctionEndLabel(FragmentNum::cold()))
        continue;
      const Location To(Entry);
      const BinaryBasicBlock::BinaryBranchInfo &BI = BB.getBranchInfo(*ToBB);
      Targets.emplace_back(From, To, BI.MispredictedCount, BI.Count,
                           I - Range.first);
    }

    // Sort by symbol then addr.
    llvm::sort(Targets, [](const Callsite &A, const Callsite &B) {
      if (A.To.Sym && B.To.Sym)
        return A.To.Sym < B.To.Sym;
      else if (A.To.Sym && !B.To.Sym)
        return true;
      else if (!A.To.Sym && B.To.Sym)
        return false;
      else
        return A.To.Addr < B.To.Addr;
    });

    // Targets may contain multiple entries to the same target, but using
    // different indices. Their profile will report the same number of branches
    // for different indices if the target is the same. That's because we don't
    // profile the index value, but only the target via LBR.
    //
    // std::unique-style in-place pass: adjacent duplicates (same symbol after
    // the sort above) have their JT indices merged into the first occurrence.
    auto First = Targets.begin();
    auto Last = Targets.end();
    auto Result = First;
    while (++First != Last) {
      Callsite &A = *Result;
      const Callsite &B = *First;
      if (A.To.Sym && B.To.Sym && A.To.Sym == B.To.Sym)
        A.JTIndices.insert(A.JTIndices.end(), B.JTIndices.begin(),
                           B.JTIndices.end());
      else
        *(++Result) = *First;
    }
    ++Result;

    LLVM_DEBUG(if (Targets.end() - Result > 0) {
      dbgs() << "BOLT-INFO: ICP: " << (Targets.end() - Result)
             << " duplicate targets removed\n";
    });

    Targets.erase(Result, Targets.end());
  } else {
    // Don't try to optimize PC relative indirect calls.
    if (Inst.getOperand(0).isReg() &&
        Inst.getOperand(0).getReg() == BC.MRI->getProgramCounter())
      return Targets;

    const auto ICSP = BC.MIB->tryGetAnnotationAs<IndirectCallSiteProfile>(
        Inst, "CallProfile");
    if (ICSP) {
      for (const IndirectCallProfile &CSP : ICSP.get()) {
        Callsite Site(BF, CSP);
        if (Site.isValid())
          Targets.emplace_back(std::move(Site));
      }
    }
  }

  // Sort by target count, number of indices in case of jump table, and
  // mispredicts. We prioritize targets with high count, small number of indices
  // and high mispredicts. Break ties by selecting targets with lower addresses.
  llvm::stable_sort(Targets, [](const Callsite &A, const Callsite &B) {
    if (A.Branches != B.Branches)
      return A.Branches > B.Branches;
    if (A.JTIndices.size() != B.JTIndices.size())
      return A.JTIndices.size() < B.JTIndices.size();
    if (A.Mispreds != B.Mispreds)
      return A.Mispreds > B.Mispreds;
    return A.To.Addr < B.To.Addr;
  });

  // Remove non-symbol targets
  llvm::erase_if(Targets, [](const Callsite &CS) { return !CS.To.Sym; });

  LLVM_DEBUG(if (BF.getJumpTable(Inst)) {
    uint64_t TotalCount = 0;
    uint64_t TotalMispreds = 0;
    for (const Callsite &S : Targets) {
      TotalCount += S.Branches;
      TotalMispreds += S.Mispreds;
    }
    if (!TotalCount)
      TotalCount = 1;
    if (!TotalMispreds)
      TotalMispreds = 1;

    dbgs() << "BOLT-INFO: ICP: jump table size = " << Targets.size()
           << ", Count = " << TotalCount << ", Mispreds = " << TotalMispreds
           << "\n";

    size_t I = 0;
    for (const Callsite &S : Targets) {
      dbgs() << "Count[" << I << "] = " << S.Branches << ", "
             << format("%.1f", (100.0 * S.Branches) / TotalCount) << ", "
             << "Mispreds[" << I << "] = " << S.Mispreds << ", "
             << format("%.1f", (100.0 * S.Mispreds) / TotalMispreds) << "\n";
      ++I;
    }
  });

  return Targets;
}

// Use the memory profile attached to the jump-table load to discover which
// table indices are hot. Returns (count, index) pairs sorted hottest-first,
// or an empty vector when no usable memory profile exists. On success,
// TargetFetchInst is set to the instruction that fetches the table entry.
IndirectCallPromotion::JumpTableInfoType
IndirectCallPromotion::maybeGetHotJumpTableTargets(BinaryBasicBlock &BB,
                                                   MCInst &CallInst,
                                                   MCInst *&TargetFetchInst,
                                                   const JumpTable *JT) const {
  assert(JT && "Can't get jump table addrs for non-jump tables.");

  BinaryFunction &Function = *BB.getFunction();
  BinaryContext &BC = Function.getBinaryContext();

  if (!Function.hasMemoryProfile() || !opts::EliminateLoads)
    return JumpTableInfoType();

  JumpTableInfoType HotTargets;
  MCInst *MemLocInstr;
  MCInst *PCRelBaseOut;
  unsigned BaseReg, IndexReg;
  int64_t DispValue;
  const MCExpr *DispExpr;
  MutableArrayRef<MCInst> Insts(&BB.front(), &CallInst);
  const IndirectBranchType Type = BC.MIB->analyzeIndirectBranch(
      CallInst, Insts.begin(), Insts.end(), BC.AsmInfo->getCodePointerSize(),
      MemLocInstr, BaseReg, IndexReg, DispValue, DispExpr, PCRelBaseOut);

  assert(MemLocInstr && "There should always be a load for jump tables");
  if (!MemLocInstr)
    return JumpTableInfoType();

  LLVM_DEBUG({
    dbgs() << "BOLT-INFO: ICP attempting to find memory profiling data for "
           << "jump table in " << Function << " at @ "
           << (&CallInst - &BB.front()) << "\n"
           << "BOLT-INFO: ICP target fetch instructions:\n";
    BC.printInstruction(dbgs(), *MemLocInstr, 0, &Function);
    if (MemLocInstr != &CallInst)
      BC.printInstruction(dbgs(), CallInst, 0, &Function);
  });

  DEBUG_VERBOSE(1, {
    dbgs() << "Jmp info: Type = " << (unsigned)Type << ", "
           << "BaseReg = " << BC.MRI->getName(BaseReg) << ", "
           << "IndexReg = " << BC.MRI->getName(IndexReg) << ", "
           << "DispValue = " << Twine::utohexstr(DispValue) << ", "
           << "DispExpr = " << DispExpr << ", "
           << "MemLocInstr = ";
    BC.printInstruction(dbgs(), *MemLocInstr, 0, &Function);
    dbgs() << "\n";
  });

  ++TotalIndexBasedCandidates;

  auto ErrorOrMemAccessProfile =
      BC.MIB->tryGetAnnotationAs<MemoryAccessProfile>(*MemLocInstr,
                                                      "MemoryAccessProfile");
  if (!ErrorOrMemAccessProfile) {
    DEBUG_VERBOSE(1, dbgs()
                         << "BOLT-INFO: ICP no memory profiling data found\n");
    return JumpTableInfoType();
  }
  MemoryAccessProfile &MemAccessProfile = ErrorOrMemAccessProfile.get();

  // Resolve the runtime address of the jump-table array, either from the
  // symbolic displacement or from the literal displacement value.
  uint64_t ArrayStart;
  if (DispExpr) {
    ErrorOr<uint64_t> DispValueOrError =
        BC.getSymbolValue(*BC.MIB->getTargetSymbol(DispExpr));
    assert(DispValueOrError && "global symbol needs a value");
    ArrayStart = *DispValueOrError;
  } else {
    ArrayStart = static_cast<uint64_t>(DispValue);
  }

  // RIP-relative addressing is relative to the next instruction.
  if (BaseReg == BC.MRI->getProgramCounter())
    ArrayStart += Function.getAddress() + MemAccessProfile.NextInstrOffset;

  // This is a map of [symbol] -> [count, index] and is used to combine indices
  // into the jump table since there may be multiple addresses that all have the
  // same entry.
  std::map<MCSymbol *, std::pair<uint64_t, uint64_t>> HotTargetMap;
  const std::pair<size_t, size_t> Range = JT->getEntriesForAddress(ArrayStart);

  for (const AddressAccess &AccessInfo : MemAccessProfile.AddressAccessInfo) {
    size_t Index;
    // Mem data occasionally includes nullptrs, ignore them.
    if (!AccessInfo.MemoryObject && !AccessInfo.Offset)
      continue;

    if (AccessInfo.Offset % JT->EntrySize != 0) // ignore bogus data
      return JumpTableInfoType();

    if (AccessInfo.MemoryObject) {
      // Deal with bad/stale data
      if (!AccessInfo.MemoryObject->getName().startswith(
              "JUMP_TABLE/" + Function.getOneName().str()))
        return JumpTableInfoType();
      Index =
          (AccessInfo.Offset - (ArrayStart - JT->getAddress())) / JT->EntrySize;
    } else {
      Index = (AccessInfo.Offset - ArrayStart) / JT->EntrySize;
    }

    // If Index is out of range it probably means the memory profiling data is
    // wrong for this instruction, bail out.
    // NOTE(review): Index is relative to Range.first but compared against the
    // absolute end index Range.second; when Range.first > 0, Index +
    // Range.first could still exceed Range.second — confirm this bound is
    // intended.
    if (Index >= Range.second) {
      LLVM_DEBUG(dbgs() << "BOLT-INFO: Index out of range of " << Range.first
                        << ", " << Range.second << "\n");
      return JumpTableInfoType();
    }

    // Make sure the hot index points at a legal label corresponding to a BB,
    // e.g. not the end of function (unreachable) label.
    if (!Function.getBasicBlockForLabel(JT->Entries[Index + Range.first])) {
      LLVM_DEBUG({
        dbgs() << "BOLT-INFO: hot index " << Index << " pointing at bogus "
               << "label " << JT->Entries[Index + Range.first]->getName()
               << " in jump table:\n";
        JT->print(dbgs());
        dbgs() << "HotTargetMap:\n";
        for (std::pair<MCSymbol *const, std::pair<uint64_t, uint64_t>> &HT :
             HotTargetMap)
          dbgs() << "BOLT-INFO: " << HT.first->getName()
                 << " = (count=" << HT.second.first
                 << ", index=" << HT.second.second << ")\n";
      });
      return JumpTableInfoType();
    }

    // Accumulate counts per target symbol; the recorded index is the last one
    // seen for that symbol.
    std::pair<uint64_t, uint64_t> &HotTarget =
        HotTargetMap[JT->Entries[Index + Range.first]];
    HotTarget.first += AccessInfo.Count;
    HotTarget.second = Index;
  }

  llvm::copy(llvm::make_second_range(HotTargetMap),
             std::back_inserter(HotTargets));

  // Sort with highest counts first.
  llvm::sort(reverse(HotTargets));

  LLVM_DEBUG({
    dbgs() << "BOLT-INFO: ICP jump table hot targets:\n";
    for (const std::pair<uint64_t, uint64_t> &Target : HotTargets)
      dbgs() << "BOLT-INFO: Idx = " << Target.second << ", "
             << "Count = " << Target.first << "\n";
  });

  BC.MIB->getOrCreateAnnotationAs<uint16_t>(CallInst, "JTIndexReg") = IndexReg;

  TargetFetchInst = MemLocInstr;

  return HotTargets;
}

// Map the top-N targets of a callsite to the (symbol, JT index) pairs the
// promoted compare sequence will test against. For jump tables, memory
// profiling data (when available) refines the selection to individual hot
// indices, redistributes per-target counts accordingly, and updates N to the
// number of targets actually selected.
IndirectCallPromotion::SymTargetsType
IndirectCallPromotion::findCallTargetSymbols(std::vector<Callsite> &Targets,
                                             size_t &N, BinaryBasicBlock &BB,
                                             MCInst &CallInst,
                                             MCInst *&TargetFetchInst) const {
  const JumpTable *JT = BB.getFunction()->getJumpTable(CallInst);
  SymTargetsType SymTargets;

  if (!JT) {
    // Plain indirect call: the top N targets map 1:1 to symbols.
    for (size_t I = 0; I < N; ++I) {
      assert(Targets[I].To.Sym && "All ICP targets must be to known symbols");
      assert(Targets[I].JTIndices.empty() &&
             "Can't have jump table indices for non-jump tables");
      SymTargets.emplace_back(Targets[I].To.Sym, 0);
    }
    return SymTargets;
  }

  // Use memory profile to select hot targets.
  JumpTableInfoType HotTargets =
      maybeGetHotJumpTableTargets(BB, CallInst, TargetFetchInst, JT);

  // Locate the Targets entry that covers a given jump-table index.
  auto findTargetsIndex = [&](uint64_t JTIndex) {
    for (size_t I = 0; I < Targets.size(); ++I)
      if (llvm::is_contained(Targets[I].JTIndices, JTIndex))
        return I;
    LLVM_DEBUG(dbgs() << "BOLT-ERROR: Unable to find target index for hot jump "
                      << " table entry in " << *BB.getFunction() << "\n");
    llvm_unreachable("Hot indices must be referred to by at least one "
                     "callsite");
  };

  if (!HotTargets.empty()) {
    if (opts::Verbosity >= 1)
      for (size_t I = 0; I < HotTargets.size(); ++I)
        outs() << "BOLT-INFO: HotTarget[" << I << "] = (" << HotTargets[I].first
               << ", " << HotTargets[I].second << ")\n";

    // Recompute hottest targets, now discriminating which index is hot
    // NOTE: This is a tradeoff. On one hand, we get index information. On the
    // other hand, info coming from the memory profile is much less accurate
    // than LBRs. So we may actually end up working with more coarse
    // profile granularity in exchange for information about indices.
    std::vector<Callsite> NewTargets;
    std::map<const MCSymbol *, uint32_t> IndicesPerTarget;
    uint64_t TotalMemAccesses = 0;
    for (size_t I = 0; I < HotTargets.size(); ++I) {
      const uint64_t TargetIndex = findTargetsIndex(HotTargets[I].second);
      ++IndicesPerTarget[Targets[TargetIndex].To.Sym];
      TotalMemAccesses += HotTargets[I].first;
    }
    uint64_t RemainingMemAccesses = TotalMemAccesses;
    const size_t TopN =
        opts::ICPJumpTablesTopN ? opts::ICPJumpTablesTopN : opts::ICPTopN;
    size_t I = 0;
    // Accept hot indices until a percentage threshold or the TopN cap stops
    // the scan; each accepted index becomes its own single-index callsite.
    for (; I < HotTargets.size(); ++I) {
      const uint64_t MemAccesses = HotTargets[I].first;
      if (100 * MemAccesses <
          TotalMemAccesses * opts::ICPJTTotalPercentThreshold)
        break;
      if (100 * MemAccesses <
          RemainingMemAccesses * opts::ICPJTRemainingPercentThreshold)
        break;
      if (TopN && I >= TopN)
        break;
      RemainingMemAccesses -= MemAccesses;

      const uint64_t JTIndex = HotTargets[I].second;
      Callsite &Target = Targets[findTargetsIndex(JTIndex)];

      NewTargets.push_back(Target);
      std::vector<uint64_t>({JTIndex}).swap(NewTargets.back().JTIndices);
      llvm::erase(Target.JTIndices, JTIndex);

      // Keep fixCFG counts sane if more indices use this same target later
      assert(IndicesPerTarget[Target.To.Sym] > 0 && "wrong map");
      NewTargets.back().Branches =
          Target.Branches / IndicesPerTarget[Target.To.Sym];
      NewTargets.back().Mispreds =
          Target.Mispreds / IndicesPerTarget[Target.To.Sym];
      assert(Target.Branches >= NewTargets.back().Branches);
      assert(Target.Mispreds >= NewTargets.back().Mispreds);
      Target.Branches -= NewTargets.back().Branches;
      Target.Mispreds -= NewTargets.back().Mispreds;
    }
    // The (now reduced) original targets follow the promoted ones.
    llvm::copy(Targets, std::back_inserter(NewTargets));
    std::swap(NewTargets, Targets);
    N = I;

    if (N == 0 && opts::Verbosity >= 1) {
      outs() << "BOLT-INFO: ICP failed in " << *BB.getFunction() << " in "
             << BB.getName() << ": failed to meet thresholds after memory "
             << "profile data was loaded.\n";
      return SymTargets;
    }
  }

  // Emit one (symbol, index) pair per jump-table index covered by each of the
  // first N target slots.
  for (size_t I = 0, TgtIdx = 0; I < N; ++TgtIdx) {
    Callsite &Target = Targets[TgtIdx];
    assert(Target.To.Sym && "All ICP targets must be to known symbols");
    assert(!Target.JTIndices.empty() && "Jump tables must have indices");
    for (uint64_t Idx : Target.JTIndices) {
      SymTargets.emplace_back(Target.To.Sym, Idx);
      ++I;
    }
  }

  return SymTargets;
}
// For a promoted virtual method call, try to identify the vtable (symbol +
// addend) that each target method belongs to, using memory profiling data
// attached to the method-load instruction. Returns an empty MethodInfoType
// when load elimination is disabled or any target cannot be resolved.
IndirectCallPromotion::MethodInfoType IndirectCallPromotion::maybeGetVtableSyms(
    BinaryBasicBlock &BB, MCInst &Inst,
    const SymTargetsType &SymTargets) const {
  BinaryFunction &Function = *BB.getFunction();
  BinaryContext &BC = Function.getBinaryContext();
  std::vector<std::pair<MCSymbol *, uint64_t>> VtableSyms;
  std::vector<MCInst *> MethodFetchInsns;
  unsigned VtableReg, MethodReg;
  uint64_t MethodOffset;

  assert(!Function.getJumpTable(Inst) &&
         "Can't get vtable addrs for jump tables.");

  if (!Function.hasMemoryProfile() || !opts::EliminateLoads)
    return MethodInfoType();

  MutableArrayRef<MCInst> Insts(&BB.front(), &Inst + 1);
  if (!BC.MIB->analyzeVirtualMethodCall(Insts.begin(), Insts.end(),
                                        MethodFetchInsns, VtableReg, MethodReg,
                                        MethodOffset)) {
    DEBUG_VERBOSE(
        1, dbgs() << "BOLT-INFO: ICP unable to analyze method call in "
                  << Function << " at @ " << (&Inst - &BB.front()) << "\n");
    return MethodInfoType();
  }

  ++TotalMethodLoadEliminationCandidates;

  DEBUG_VERBOSE(1, {
    dbgs() << "BOLT-INFO: ICP found virtual method call in " << Function
           << " at @ " << (&Inst - &BB.front()) << "\n";
    dbgs() << "BOLT-INFO: ICP method fetch instructions:\n";
    for (MCInst *Inst : MethodFetchInsns)
      BC.printInstruction(dbgs(), *Inst, 0, &Function);

    if (MethodFetchInsns.back() != &Inst)
      BC.printInstruction(dbgs(), Inst, 0, &Function);
  });

  // Try to get value profiling data for the method load instruction.
  auto ErrorOrMemAccessProfile =
      BC.MIB->tryGetAnnotationAs<MemoryAccessProfile>(*MethodFetchInsns.back(),
                                                      "MemoryAccessProfile");
  if (!ErrorOrMemAccessProfile) {
    DEBUG_VERBOSE(1, dbgs()
                         << "BOLT-INFO: ICP no memory profiling data found\n");
    return MethodInfoType();
  }
  MemoryAccessProfile &MemAccessProfile = ErrorOrMemAccessProfile.get();

  // Find the vtable that each method belongs to.
  std::map<const MCSymbol *, uint64_t> MethodToVtable;

  for (const AddressAccess &AccessInfo : MemAccessProfile.AddressAccessInfo) {
    uint64_t Address = AccessInfo.Offset;
    if (AccessInfo.MemoryObject)
      Address += AccessInfo.MemoryObject->getAddress();

    // Ignore bogus data.
    if (!Address)
      continue;

    // The accessed slot is vtable base + MethodOffset.
    const uint64_t VtableBase = Address - MethodOffset;

    DEBUG_VERBOSE(1, dbgs() << "BOLT-INFO: ICP vtable = "
                            << Twine::utohexstr(VtableBase) << "+"
                            << MethodOffset << "/" << AccessInfo.Count << "\n");

    if (ErrorOr<uint64_t> MethodAddr = BC.getPointerAtAddress(Address)) {
      BinaryData *MethodBD = BC.getBinaryDataAtAddress(MethodAddr.get());
      if (!MethodBD) // skip unknown methods
        continue;
      MCSymbol *MethodSym = MethodBD->getSymbol();
      MethodToVtable[MethodSym] = VtableBase;
      DEBUG_VERBOSE(1, {
        const BinaryFunction *Method = BC.getFunctionForSymbol(MethodSym);
        dbgs() << "BOLT-INFO: ICP found method = "
               << Twine::utohexstr(MethodAddr.get()) << "/"
               << (Method ? Method->getPrintName() : "") << "\n";
      });
    }
  }

  // Find the vtable for each target symbol.
  for (size_t I = 0; I < SymTargets.size(); ++I) {
    auto Itr = MethodToVtable.find(SymTargets[I].first);
    if (Itr != MethodToVtable.end()) {
      if (BinaryData *BD = BC.getBinaryDataContainingAddress(Itr->second)) {
        const uint64_t Addend = Itr->second - BD->getAddress();
        VtableSyms.emplace_back(BD->getSymbol(), Addend);
        continue;
      }
    }
    // Give up if we can't find the vtable for a method.
    DEBUG_VERBOSE(1, dbgs() << "BOLT-INFO: ICP can't find vtable for "
                            << SymTargets[I].first->getName() << "\n");
    return MethodInfoType();
  }

  // Make sure the vtable reg is not clobbered by the argument passing code
  if (VtableReg != MethodReg) {
    for (MCInst *CurInst = MethodFetchInsns.front(); CurInst < &Inst;
         ++CurInst) {
      const MCInstrDesc &InstrInfo = BC.MII->get(CurInst->getOpcode());
      if (InstrInfo.hasDefOfPhysReg(*CurInst, VtableReg, *BC.MRI))
        return MethodInfoType();
    }
  }

  return MethodInfoType(VtableSyms, MethodFetchInsns);
}

// Replace the indirect call/jump with the promoted code in ICPcode: the first
// (symbol, insts) entry replaces the tail of IndCallBlock, and each remaining
// entry becomes a new basic block. Returns the new blocks in layout order.
std::vector<std::unique_ptr<BinaryBasicBlock>>
IndirectCallPromotion::rewriteCall(
    BinaryBasicBlock &IndCallBlock, const MCInst &CallInst,
    MCPlusBuilder::BlocksVectorTy &&ICPcode,
    const std::vector<MCInst *> &MethodFetchInsns) const {
  BinaryFunction &Function = *IndCallBlock.getFunction();
  MCPlusBuilder *MIB = Function.getBinaryContext().MIB.get();

  // Create new basic blocks with correct code in each one first.
  std::vector<std::unique_ptr<BinaryBasicBlock>> NewBBs;
  const bool IsTailCallOrJT =
      (MIB->isTailCall(CallInst) || Function.getJumpTable(CallInst));

  // If we are tracking the indirect call/jump address, propagate the address to
  // the ICP code.
  const std::optional<uint32_t> IndirectInstrOffset = MIB->getOffset(CallInst);
  if (IndirectInstrOffset) {
    for (auto &[Symbol, Instructions] : ICPcode)
      for (MCInst &Inst : Instructions)
        MIB->setOffset(Inst, *IndirectInstrOffset);
  }

  // Move instructions from the tail of the original call block
  // to the merge block.

  // Remember any pseudo instructions following a tail call. These
  // must be preserved and moved to the original block.
  InstructionListType TailInsts;
  const MCInst *TailInst = &CallInst;
  if (IsTailCallOrJT)
    while (TailInst + 1 < &(*IndCallBlock.end()) &&
           MIB->isPseudo(*(TailInst + 1)))
      TailInsts.push_back(*++TailInst);

  InstructionListType MovedInst = IndCallBlock.splitInstructions(&CallInst);
  // Link new BBs to the original input offset of the indirect call site or its
  // containing BB, so we can map samples recorded in new BBs back to the
  // original BB seen in the input binary (if using BAT).
  const uint32_t OrigOffset = IndirectInstrOffset
                                  ? *IndirectInstrOffset
                                  : IndCallBlock.getInputOffset();

  IndCallBlock.eraseInstructions(MethodFetchInsns.begin(),
                                 MethodFetchInsns.end());
  // Either append the first promoted sequence (when the block is empty or the
  // call itself was one of the erased method-fetch instructions) or replace
  // the remaining call instruction with it.
  if (IndCallBlock.empty() ||
      (!MethodFetchInsns.empty() && MethodFetchInsns.back() == &CallInst))
    IndCallBlock.addInstructions(ICPcode.front().second.begin(),
                                 ICPcode.front().second.end());
  else
    IndCallBlock.replaceInstruction(std::prev(IndCallBlock.end()),
                                    ICPcode.front().second);
  IndCallBlock.addInstructions(TailInsts.begin(), TailInsts.end());

  for (auto Itr = ICPcode.begin() + 1; Itr != ICPcode.end(); ++Itr) {
    MCSymbol *&Sym = Itr->first;
    InstructionListType &Insts = Itr->second;
    assert(Sym);
    std::unique_ptr<BinaryBasicBlock> TBB = Function.createBasicBlock(Sym);
    TBB->setOffset(OrigOffset);
    for (MCInst &Inst : Insts) // sanitize new instructions.
      if (MIB->isCall(Inst))
        MIB->removeAnnotation(Inst, "CallProfile");
    TBB->addInstructions(Insts.begin(), Insts.end());
    NewBBs.emplace_back(std::move(TBB));
  }

  // Move tail of instructions from after the original call to
  // the merge block.
  if (!IsTailCallOrJT)
    NewBBs.back()->addInstructions(MovedInst.begin(), MovedInst.end());

  return NewBBs;
}

// Wire the newly created promotion blocks into the CFG and redistribute
// execution counts from the original indirect call/jump edges onto the new
// compare-and-branch chain. Returns the merge block (non-tail calls only).
BinaryBasicBlock *
IndirectCallPromotion::fixCFG(BinaryBasicBlock &IndCallBlock,
                              const bool IsTailCall, const bool IsJumpTable,
                              IndirectCallPromotion::BasicBlocksVector &&NewBBs,
                              const std::vector<Callsite> &Targets) const {
  BinaryFunction &Function = *IndCallBlock.getFunction();
  using BinaryBranchInfo = BinaryBasicBlock::BinaryBranchInfo;
  BinaryBasicBlock *MergeBlock = nullptr;

  // Scale indirect call counts to the execution count of the original
  // basic block containing the indirect call.
  uint64_t TotalCount = IndCallBlock.getKnownExecutionCount();
  uint64_t TotalIndirectBranches = 0;
  for (const Callsite &Target : Targets)
    TotalIndirectBranches += Target.Branches;
  if (TotalIndirectBranches == 0)
    TotalIndirectBranches = 1;
  // BBI holds raw per-entry counts (split evenly across a target's indices);
  // ScaledBBI holds the same counts rescaled to the block execution count.
  BinaryBasicBlock::BranchInfoType BBI;
  BinaryBasicBlock::BranchInfoType ScaledBBI;
  for (const Callsite &Target : Targets) {
    const size_t NumEntries =
        std::max(static_cast<std::size_t>(1UL), Target.JTIndices.size());
    for (size_t I = 0; I < NumEntries; ++I) {
      BBI.push_back(
          BinaryBranchInfo{(Target.Branches + NumEntries - 1) / NumEntries,
                           (Target.Mispreds + NumEntries - 1) / NumEntries});
      ScaledBBI.push_back(
          BinaryBranchInfo{uint64_t(TotalCount * Target.Branches /
                                    (NumEntries * TotalIndirectBranches)),
                           uint64_t(TotalCount * Target.Mispreds /
                                    (NumEntries * TotalIndirectBranches))});
    }
  }

  if (IsJumpTable) {
    // The last new block keeps the (residual) indirect jump and inherits the
    // original successors.
    BinaryBasicBlock *NewIndCallBlock = NewBBs.back().get();
    IndCallBlock.moveAllSuccessorsTo(NewIndCallBlock);

    std::vector<MCSymbol *> SymTargets;
    for (const Callsite &Target : Targets) {
      const size_t NumEntries =
          std::max(static_cast<std::size_t>(1UL), Target.JTIndices.size());
      for (size_t I = 0; I < NumEntries; ++I)
        SymTargets.push_back(Target.To.Sym);
    }
    assert(SymTargets.size() > NewBBs.size() - 1 &&
           "There must be a target symbol associated with each new BB.");

    for (uint64_t I = 0; I < NewBBs.size(); ++I) {
      BinaryBasicBlock *SourceBB = I ? NewBBs[I - 1].get() : &IndCallBlock;
      SourceBB->setExecutionCount(TotalCount);

      BinaryBasicBlock *TargetBB =
          Function.getBasicBlockForLabel(SymTargets[I]);
      SourceBB->addSuccessor(TargetBB, ScaledBBI[I]); // taken

      // Whatever was not taken falls through to the next compare block.
      TotalCount -= ScaledBBI[I].Count;
      SourceBB->addSuccessor(NewBBs[I].get(), TotalCount); // fall-through

      // Update branch info for the indirect jump.
      BinaryBasicBlock::BinaryBranchInfo &BranchInfo =
          NewIndCallBlock->getBranchInfo(*TargetBB);
      if (BranchInfo.Count > BBI[I].Count)
        BranchInfo.Count -= BBI[I].Count;
      else
        BranchInfo.Count = 0;

      if (BranchInfo.MispredictedCount > BBI[I].MispredictedCount)
        BranchInfo.MispredictedCount -= BBI[I].MispredictedCount;
      else
        BranchInfo.MispredictedCount = 0;
    }
  } else {
    // Calls: blocks alternate between promoted-call blocks (even) and
    // compare blocks (odd), with an optional merge block at the end.
    assert(NewBBs.size() >= 2);
    assert(NewBBs.size() % 2 == 1 || IndCallBlock.succ_empty());
    assert(NewBBs.size() % 2 == 1 || IsTailCall);

    auto ScaledBI = ScaledBBI.begin();
    auto updateCurrentBranchInfo = [&] {
      assert(ScaledBI != ScaledBBI.end());
      TotalCount -= ScaledBI->Count;
      ++ScaledBI;
    };

    if (!IsTailCall) {
      MergeBlock = NewBBs.back().get();
      IndCallBlock.moveAllSuccessorsTo(MergeBlock);
    }

    // Fix up successors and execution counts.
    updateCurrentBranchInfo();
    IndCallBlock.addSuccessor(NewBBs[1].get(), TotalCount);
    IndCallBlock.addSuccessor(NewBBs[0].get(), ScaledBBI[0]);

    const size_t Adj = IsTailCall ? 1 : 2;
    for (size_t I = 0; I < NewBBs.size() - Adj; ++I) {
      assert(TotalCount <= IndCallBlock.getExecutionCount() ||
             TotalCount <= uint64_t(TotalIndirectBranches));
      uint64_t ExecCount = ScaledBBI[(I + 1) / 2].Count;
      if (I % 2 == 0) {
        if (MergeBlock)
          NewBBs[I]->addSuccessor(MergeBlock, ScaledBBI[(I + 1) / 2].Count);
      } else {
        assert(I + 2 < NewBBs.size());
        updateCurrentBranchInfo();
        NewBBs[I]->addSuccessor(NewBBs[I + 2].get(), TotalCount);
        NewBBs[I]->addSuccessor(NewBBs[I + 1].get(), ScaledBBI[(I + 1) / 2]);
        ExecCount += TotalCount;
      }
      NewBBs[I]->setExecutionCount(ExecCount);
    }

    if (MergeBlock) {
      // Arrange for the MergeBlock to be the fallthrough for the first
      // promoted call block.
      std::unique_ptr<BinaryBasicBlock> MBPtr;
      std::swap(MBPtr, NewBBs.back());
      NewBBs.pop_back();
      NewBBs.emplace(NewBBs.begin() + 1, std::move(MBPtr));
      // TODO: is COUNT_FALLTHROUGH_EDGE the right thing here?
      NewBBs.back()->addSuccessor(MergeBlock, TotalCount); // uncond branch
    }
  }

  // Update the execution count.
  NewBBs.back()->setExecutionCount(TotalCount);

  // Update BB and BB layout.
  Function.insertBasicBlocks(&IndCallBlock, std::move(NewBBs));
  assert(Function.validateCFG());

  return MergeBlock;
}

// Decide whether a callsite qualifies for promotion and, if so, how many of
// its top targets to promote; returns 0 when the site is rejected.
size_t IndirectCallPromotion::canPromoteCallsite(
    const BinaryBasicBlock &BB, const MCInst &Inst,
    const std::vector<Callsite> &Targets, uint64_t NumCalls) {
  BinaryFunction *BF = BB.getFunction();
  const BinaryContext &BC = BF->getBinaryContext();

  if (BB.getKnownExecutionCount() < opts::ExecutionCountThreshold)
    return 0;

  const bool IsJumpTable = BF->getJumpTable(Inst);

  auto computeStats = [&](size_t N) {
    for (size_t I = 0; I < N; ++I)
      if (IsJumpTable)
        TotalNumFrequentJmps += Targets[I].Branches;
      else
        TotalNumFrequentCalls += Targets[I].Branches;
  };

  // If we have no targets (or no calls), skip this callsite.
  if (Targets.empty() || !NumCalls) {
    if (opts::Verbosity >= 1) {
      const ptrdiff_t InstIdx = &Inst - &(*BB.begin());
      outs() << "BOLT-INFO: ICP failed in " << *BF << " @ " << InstIdx << " in "
             << BB.getName() << ", calls = " << NumCalls
             << ", targets empty or NumCalls == 0.\n";
    }
    return 0;
  }

  size_t TopN = opts::ICPTopN;
  if (IsJumpTable)
    TopN = opts::ICPJumpTablesTopN ? opts::ICPJumpTablesTopN : TopN;
  else
    TopN = opts::ICPCallsTopN ? opts::ICPCallsTopN : TopN;

  const size_t TrialN = TopN ? std::min(TopN, Targets.size()) : Targets.size();

  if (opts::ICPTopCallsites && !BC.MIB->hasAnnotation(Inst, "DoICP"))
    return 0;

  // Pick the top N targets.
  uint64_t TotalMispredictsTopN = 0;
  size_t N = 0;

  if (opts::ICPUseMispredicts &&
      (!IsJumpTable || opts::ICPJumpTablesByTarget)) {
    // Count total number of mispredictions for (at most) the top N targets.
    // We may choose a smaller N (TrialN vs. N) if the frequency threshold
    // is exceeded by fewer targets.
1002 double Threshold = double(opts::ICPMispredictThreshold); 1003 for (size_t I = 0; I < TrialN && Threshold > 0; ++I, ++N) { 1004 Threshold -= (100.0 * Targets[I].Mispreds) / NumCalls; 1005 TotalMispredictsTopN += Targets[I].Mispreds; 1006 } 1007 computeStats(N); 1008 1009 // Compute the misprediction frequency of the top N call targets. If this 1010 // frequency is greater than the threshold, we should try ICP on this 1011 // callsite. 1012 const double TopNFrequency = (100.0 * TotalMispredictsTopN) / NumCalls; 1013 if (TopNFrequency == 0 || TopNFrequency < opts::ICPMispredictThreshold) { 1014 if (opts::Verbosity >= 1) { 1015 const ptrdiff_t InstIdx = &Inst - &(*BB.begin()); 1016 outs() << "BOLT-INFO: ICP failed in " << *BF << " @ " << InstIdx 1017 << " in " << BB.getName() << ", calls = " << NumCalls 1018 << ", top N mis. frequency " << format("%.1f", TopNFrequency) 1019 << "% < " << opts::ICPMispredictThreshold << "%\n"; 1020 } 1021 return 0; 1022 } 1023 } else { 1024 size_t MaxTargets = 0; 1025 1026 // Count total number of calls for (at most) the top N targets. 1027 // We may choose a smaller N (TrialN vs. N) if the frequency threshold 1028 // is exceeded by fewer targets. 1029 const unsigned TotalThreshold = IsJumpTable 1030 ? opts::ICPJTTotalPercentThreshold 1031 : opts::ICPCallsTotalPercentThreshold; 1032 const unsigned RemainingThreshold = 1033 IsJumpTable ? opts::ICPJTRemainingPercentThreshold 1034 : opts::ICPCallsRemainingPercentThreshold; 1035 uint64_t NumRemainingCalls = NumCalls; 1036 for (size_t I = 0; I < TrialN; ++I, ++MaxTargets) { 1037 if (100 * Targets[I].Branches < NumCalls * TotalThreshold) 1038 break; 1039 if (100 * Targets[I].Branches < NumRemainingCalls * RemainingThreshold) 1040 break; 1041 if (N + (Targets[I].JTIndices.empty() ? 1 : Targets[I].JTIndices.size()) > 1042 TrialN) 1043 break; 1044 TotalMispredictsTopN += Targets[I].Mispreds; 1045 NumRemainingCalls -= Targets[I].Branches; 1046 N += Targets[I].JTIndices.empty() ? 
1 : Targets[I].JTIndices.size(); 1047 } 1048 computeStats(MaxTargets); 1049 1050 // Don't check misprediction frequency for jump tables -- we don't really 1051 // care as long as we are saving loads from the jump table. 1052 if (!IsJumpTable || opts::ICPJumpTablesByTarget) { 1053 // Compute the misprediction frequency of the top N call targets. If 1054 // this frequency is less than the threshold, we should skip ICP at 1055 // this callsite. 1056 const double TopNMispredictFrequency = 1057 (100.0 * TotalMispredictsTopN) / NumCalls; 1058 1059 if (TopNMispredictFrequency < opts::ICPMispredictThreshold) { 1060 if (opts::Verbosity >= 1) { 1061 const ptrdiff_t InstIdx = &Inst - &(*BB.begin()); 1062 outs() << "BOLT-INFO: ICP failed in " << *BF << " @ " << InstIdx 1063 << " in " << BB.getName() << ", calls = " << NumCalls 1064 << ", top N mispredict frequency " 1065 << format("%.1f", TopNMispredictFrequency) << "% < " 1066 << opts::ICPMispredictThreshold << "%\n"; 1067 } 1068 return 0; 1069 } 1070 } 1071 } 1072 1073 // Filter by inline-ability of target functions, stop at first target that 1074 // can't be inlined. 
1075 if (!IsJumpTable && opts::ICPPeelForInline) { 1076 for (size_t I = 0; I < N; ++I) { 1077 const MCSymbol *TargetSym = Targets[I].To.Sym; 1078 const BinaryFunction *TargetBF = BC.getFunctionForSymbol(TargetSym); 1079 if (!TargetBF || !BinaryFunctionPass::shouldOptimize(*TargetBF) || 1080 getInliningInfo(*TargetBF).Type == InliningType::INL_NONE) { 1081 N = I; 1082 break; 1083 } 1084 } 1085 } 1086 1087 // Filter functions that can have ICP applied (for debugging) 1088 if (!opts::ICPFuncsList.empty()) { 1089 for (std::string &Name : opts::ICPFuncsList) 1090 if (BF->hasName(Name)) 1091 return N; 1092 return 0; 1093 } 1094 1095 return N; 1096 } 1097 1098 void IndirectCallPromotion::printCallsiteInfo( 1099 const BinaryBasicBlock &BB, const MCInst &Inst, 1100 const std::vector<Callsite> &Targets, const size_t N, 1101 uint64_t NumCalls) const { 1102 BinaryContext &BC = BB.getFunction()->getBinaryContext(); 1103 const bool IsTailCall = BC.MIB->isTailCall(Inst); 1104 const bool IsJumpTable = BB.getFunction()->getJumpTable(Inst); 1105 const ptrdiff_t InstIdx = &Inst - &(*BB.begin()); 1106 1107 outs() << "BOLT-INFO: ICP candidate branch info: " << *BB.getFunction() 1108 << " @ " << InstIdx << " in " << BB.getName() 1109 << " -> calls = " << NumCalls 1110 << (IsTailCall ? " (tail)" : (IsJumpTable ? " (jump table)" : "")) 1111 << "\n"; 1112 for (size_t I = 0; I < N; I++) { 1113 const double Frequency = 100.0 * Targets[I].Branches / NumCalls; 1114 const double MisFrequency = 100.0 * Targets[I].Mispreds / NumCalls; 1115 outs() << "BOLT-INFO: "; 1116 if (Targets[I].To.Sym) 1117 outs() << Targets[I].To.Sym->getName(); 1118 else 1119 outs() << Targets[I].To.Addr; 1120 outs() << ", calls = " << Targets[I].Branches 1121 << ", mispreds = " << Targets[I].Mispreds 1122 << ", taken freq = " << format("%.1f", Frequency) << "%" 1123 << ", mis. 
freq = " << format("%.1f", MisFrequency) << "%"; 1124 bool First = true; 1125 for (uint64_t JTIndex : Targets[I].JTIndices) { 1126 outs() << (First ? ", indices = " : ", ") << JTIndex; 1127 First = false; 1128 } 1129 outs() << "\n"; 1130 } 1131 1132 LLVM_DEBUG({ 1133 dbgs() << "BOLT-INFO: ICP original call instruction:"; 1134 BC.printInstruction(dbgs(), Inst, Targets[0].From.Addr, nullptr, true); 1135 }); 1136 } 1137 1138 void IndirectCallPromotion::runOnFunctions(BinaryContext &BC) { 1139 if (opts::ICP == ICP_NONE) 1140 return; 1141 1142 auto &BFs = BC.getBinaryFunctions(); 1143 1144 const bool OptimizeCalls = (opts::ICP == ICP_CALLS || opts::ICP == ICP_ALL); 1145 const bool OptimizeJumpTables = 1146 (opts::ICP == ICP_JUMP_TABLES || opts::ICP == ICP_ALL); 1147 1148 std::unique_ptr<RegAnalysis> RA; 1149 std::unique_ptr<BinaryFunctionCallGraph> CG; 1150 if (OptimizeJumpTables) { 1151 CG.reset(new BinaryFunctionCallGraph(buildCallGraph(BC))); 1152 RA.reset(new RegAnalysis(BC, &BFs, &*CG)); 1153 } 1154 1155 // If icp-top-callsites is enabled, compute the total number of indirect 1156 // calls and then optimize the hottest callsites that contribute to that 1157 // total. 1158 SetVector<BinaryFunction *> Functions; 1159 if (opts::ICPTopCallsites == 0) { 1160 for (auto &KV : BFs) 1161 Functions.insert(&KV.second); 1162 } else { 1163 using IndirectCallsite = std::tuple<uint64_t, MCInst *, BinaryFunction *>; 1164 std::vector<IndirectCallsite> IndirectCalls; 1165 size_t TotalIndirectCalls = 0; 1166 1167 // Find all the indirect callsites. 
1168 for (auto &BFIt : BFs) { 1169 BinaryFunction &Function = BFIt.second; 1170 1171 if (!shouldOptimize(Function)) 1172 continue; 1173 1174 const bool HasLayout = !Function.getLayout().block_empty(); 1175 1176 for (BinaryBasicBlock &BB : Function) { 1177 if (HasLayout && Function.isSplit() && BB.isCold()) 1178 continue; 1179 1180 for (MCInst &Inst : BB) { 1181 const bool IsJumpTable = Function.getJumpTable(Inst); 1182 const bool HasIndirectCallProfile = 1183 BC.MIB->hasAnnotation(Inst, "CallProfile"); 1184 const bool IsDirectCall = 1185 (BC.MIB->isCall(Inst) && BC.MIB->getTargetSymbol(Inst, 0)); 1186 1187 if (!IsDirectCall && 1188 ((HasIndirectCallProfile && !IsJumpTable && OptimizeCalls) || 1189 (IsJumpTable && OptimizeJumpTables))) { 1190 uint64_t NumCalls = 0; 1191 for (const Callsite &BInfo : getCallTargets(BB, Inst)) 1192 NumCalls += BInfo.Branches; 1193 IndirectCalls.push_back( 1194 std::make_tuple(NumCalls, &Inst, &Function)); 1195 TotalIndirectCalls += NumCalls; 1196 } 1197 } 1198 } 1199 } 1200 1201 // Sort callsites by execution count. 1202 llvm::sort(reverse(IndirectCalls)); 1203 1204 // Find callsites that contribute to the top "opts::ICPTopCallsites"% 1205 // number of calls. 1206 const float TopPerc = opts::ICPTopCallsites / 100.0f; 1207 int64_t MaxCalls = TotalIndirectCalls * TopPerc; 1208 uint64_t LastFreq = std::numeric_limits<uint64_t>::max(); 1209 size_t Num = 0; 1210 for (const IndirectCallsite &IC : IndirectCalls) { 1211 const uint64_t CurFreq = std::get<0>(IC); 1212 // Once we decide to stop, include at least all branches that share the 1213 // same frequency of the last one to avoid non-deterministic behavior 1214 // (e.g. 
turning on/off ICP depending on the order of functions) 1215 if (MaxCalls <= 0 && CurFreq != LastFreq) 1216 break; 1217 MaxCalls -= CurFreq; 1218 LastFreq = CurFreq; 1219 BC.MIB->addAnnotation(*std::get<1>(IC), "DoICP", true); 1220 Functions.insert(std::get<2>(IC)); 1221 ++Num; 1222 } 1223 outs() << "BOLT-INFO: ICP Total indirect calls = " << TotalIndirectCalls 1224 << ", " << Num << " callsites cover " << opts::ICPTopCallsites 1225 << "% of all indirect calls\n"; 1226 } 1227 1228 for (BinaryFunction *FuncPtr : Functions) { 1229 BinaryFunction &Function = *FuncPtr; 1230 1231 if (!shouldOptimize(Function)) 1232 continue; 1233 1234 const bool HasLayout = !Function.getLayout().block_empty(); 1235 1236 // Total number of indirect calls issued from the current Function. 1237 // (a fraction of TotalIndirectCalls) 1238 uint64_t FuncTotalIndirectCalls = 0; 1239 uint64_t FuncTotalIndirectJmps = 0; 1240 1241 std::vector<BinaryBasicBlock *> BBs; 1242 for (BinaryBasicBlock &BB : Function) { 1243 // Skip indirect calls in cold blocks. 1244 if (!HasLayout || !Function.isSplit() || !BB.isCold()) 1245 BBs.push_back(&BB); 1246 } 1247 if (BBs.empty()) 1248 continue; 1249 1250 DataflowInfoManager Info(Function, RA.get(), nullptr); 1251 while (!BBs.empty()) { 1252 BinaryBasicBlock *BB = BBs.back(); 1253 BBs.pop_back(); 1254 1255 for (unsigned Idx = 0; Idx < BB->size(); ++Idx) { 1256 MCInst &Inst = BB->getInstructionAtIndex(Idx); 1257 const ptrdiff_t InstIdx = &Inst - &(*BB->begin()); 1258 const bool IsTailCall = BC.MIB->isTailCall(Inst); 1259 const bool HasIndirectCallProfile = 1260 BC.MIB->hasAnnotation(Inst, "CallProfile"); 1261 const bool IsJumpTable = Function.getJumpTable(Inst); 1262 1263 if (BC.MIB->isCall(Inst)) 1264 TotalCalls += BB->getKnownExecutionCount(); 1265 1266 if (IsJumpTable && !OptimizeJumpTables) 1267 continue; 1268 1269 if (!IsJumpTable && (!HasIndirectCallProfile || !OptimizeCalls)) 1270 continue; 1271 1272 // Ignore direct calls. 
1273 if (BC.MIB->isCall(Inst) && BC.MIB->getTargetSymbol(Inst, 0)) 1274 continue; 1275 1276 assert((BC.MIB->isCall(Inst) || BC.MIB->isIndirectBranch(Inst)) && 1277 "expected a call or an indirect jump instruction"); 1278 1279 if (IsJumpTable) 1280 ++TotalJumpTableCallsites; 1281 else 1282 ++TotalIndirectCallsites; 1283 1284 std::vector<Callsite> Targets = getCallTargets(*BB, Inst); 1285 1286 // Compute the total number of calls from this particular callsite. 1287 uint64_t NumCalls = 0; 1288 for (const Callsite &BInfo : Targets) 1289 NumCalls += BInfo.Branches; 1290 if (!IsJumpTable) 1291 FuncTotalIndirectCalls += NumCalls; 1292 else 1293 FuncTotalIndirectJmps += NumCalls; 1294 1295 // If FLAGS regs is alive after this jmp site, do not try 1296 // promoting because we will clobber FLAGS. 1297 if (IsJumpTable) { 1298 ErrorOr<const BitVector &> State = 1299 Info.getLivenessAnalysis().getStateBefore(Inst); 1300 if (!State || (State && (*State)[BC.MIB->getFlagsReg()])) { 1301 if (opts::Verbosity >= 1) 1302 outs() << "BOLT-INFO: ICP failed in " << Function << " @ " 1303 << InstIdx << " in " << BB->getName() 1304 << ", calls = " << NumCalls 1305 << (State ? ", cannot clobber flags reg.\n" 1306 : ", no liveness data available.\n"); 1307 continue; 1308 } 1309 } 1310 1311 // Should this callsite be optimized? Return the number of targets 1312 // to use when promoting this call. A value of zero means to skip 1313 // this callsite. 1314 size_t N = canPromoteCallsite(*BB, Inst, Targets, NumCalls); 1315 1316 // If it is a jump table and it failed to meet our initial threshold, 1317 // proceed to findCallTargetSymbols -- it may reevaluate N if 1318 // memory profile is present 1319 if (!N && !IsJumpTable) 1320 continue; 1321 1322 if (opts::Verbosity >= 1) 1323 printCallsiteInfo(*BB, Inst, Targets, N, NumCalls); 1324 1325 // Find MCSymbols or absolute addresses for each call target. 
1326 MCInst *TargetFetchInst = nullptr; 1327 const SymTargetsType SymTargets = 1328 findCallTargetSymbols(Targets, N, *BB, Inst, TargetFetchInst); 1329 1330 // findCallTargetSymbols may have changed N if mem profile is available 1331 // for jump tables 1332 if (!N) 1333 continue; 1334 1335 LLVM_DEBUG(printDecision(dbgs(), Targets, N)); 1336 1337 // If we can't resolve any of the target symbols, punt on this callsite. 1338 // TODO: can this ever happen? 1339 if (SymTargets.size() < N) { 1340 const size_t LastTarget = SymTargets.size(); 1341 if (opts::Verbosity >= 1) 1342 outs() << "BOLT-INFO: ICP failed in " << Function << " @ " 1343 << InstIdx << " in " << BB->getName() 1344 << ", calls = " << NumCalls 1345 << ", ICP failed to find target symbol for " 1346 << Targets[LastTarget].To.Sym->getName() << "\n"; 1347 continue; 1348 } 1349 1350 MethodInfoType MethodInfo; 1351 1352 if (!IsJumpTable) { 1353 MethodInfo = maybeGetVtableSyms(*BB, Inst, SymTargets); 1354 TotalMethodLoadsEliminated += MethodInfo.first.empty() ? 0 : 1; 1355 LLVM_DEBUG(dbgs() 1356 << "BOLT-INFO: ICP " 1357 << (!MethodInfo.first.empty() ? "found" : "did not find") 1358 << " vtables for all methods.\n"); 1359 } else if (TargetFetchInst) { 1360 ++TotalIndexBasedJumps; 1361 MethodInfo.second.push_back(TargetFetchInst); 1362 } 1363 1364 // Generate new promoted call code for this callsite. 1365 MCPlusBuilder::BlocksVectorTy ICPcode = 1366 (IsJumpTable && !opts::ICPJumpTablesByTarget) 1367 ? 
BC.MIB->jumpTablePromotion(Inst, SymTargets, 1368 MethodInfo.second, BC.Ctx.get()) 1369 : BC.MIB->indirectCallPromotion( 1370 Inst, SymTargets, MethodInfo.first, MethodInfo.second, 1371 opts::ICPOldCodeSequence, BC.Ctx.get()); 1372 1373 if (ICPcode.empty()) { 1374 if (opts::Verbosity >= 1) 1375 outs() << "BOLT-INFO: ICP failed in " << Function << " @ " 1376 << InstIdx << " in " << BB->getName() 1377 << ", calls = " << NumCalls 1378 << ", unable to generate promoted call code.\n"; 1379 continue; 1380 } 1381 1382 LLVM_DEBUG({ 1383 uint64_t Offset = Targets[0].From.Addr; 1384 dbgs() << "BOLT-INFO: ICP indirect call code:\n"; 1385 for (const auto &entry : ICPcode) { 1386 const MCSymbol *const &Sym = entry.first; 1387 const InstructionListType &Insts = entry.second; 1388 if (Sym) 1389 dbgs() << Sym->getName() << ":\n"; 1390 Offset = BC.printInstructions(dbgs(), Insts.begin(), Insts.end(), 1391 Offset); 1392 } 1393 dbgs() << "---------------------------------------------------\n"; 1394 }); 1395 1396 // Rewrite the CFG with the newly generated ICP code. 1397 std::vector<std::unique_ptr<BinaryBasicBlock>> NewBBs = 1398 rewriteCall(*BB, Inst, std::move(ICPcode), MethodInfo.second); 1399 1400 // Fix the CFG after inserting the new basic blocks. 1401 BinaryBasicBlock *MergeBlock = 1402 fixCFG(*BB, IsTailCall, IsJumpTable, std::move(NewBBs), Targets); 1403 1404 // Since the tail of the original block was split off and it may contain 1405 // additional indirect calls, we must add the merge block to the set of 1406 // blocks to process. 
1407 if (MergeBlock) 1408 BBs.push_back(MergeBlock); 1409 1410 if (opts::Verbosity >= 1) 1411 outs() << "BOLT-INFO: ICP succeeded in " << Function << " @ " 1412 << InstIdx << " in " << BB->getName() 1413 << " -> calls = " << NumCalls << "\n"; 1414 1415 if (IsJumpTable) 1416 ++TotalOptimizedJumpTableCallsites; 1417 else 1418 ++TotalOptimizedIndirectCallsites; 1419 1420 Modified.insert(&Function); 1421 } 1422 } 1423 TotalIndirectCalls += FuncTotalIndirectCalls; 1424 TotalIndirectJmps += FuncTotalIndirectJmps; 1425 } 1426 1427 outs() << "BOLT-INFO: ICP total indirect callsites with profile = " 1428 << TotalIndirectCallsites << "\n" 1429 << "BOLT-INFO: ICP total jump table callsites = " 1430 << TotalJumpTableCallsites << "\n" 1431 << "BOLT-INFO: ICP total number of calls = " << TotalCalls << "\n" 1432 << "BOLT-INFO: ICP percentage of calls that are indirect = " 1433 << format("%.1f", (100.0 * TotalIndirectCalls) / TotalCalls) << "%\n" 1434 << "BOLT-INFO: ICP percentage of indirect calls that can be " 1435 "optimized = " 1436 << format("%.1f", (100.0 * TotalNumFrequentCalls) / 1437 std::max<size_t>(TotalIndirectCalls, 1)) 1438 << "%\n" 1439 << "BOLT-INFO: ICP percentage of indirect callsites that are " 1440 "optimized = " 1441 << format("%.1f", (100.0 * TotalOptimizedIndirectCallsites) / 1442 std::max<uint64_t>(TotalIndirectCallsites, 1)) 1443 << "%\n" 1444 << "BOLT-INFO: ICP number of method load elimination candidates = " 1445 << TotalMethodLoadEliminationCandidates << "\n" 1446 << "BOLT-INFO: ICP percentage of method calls candidates that have " 1447 "loads eliminated = " 1448 << format("%.1f", (100.0 * TotalMethodLoadsEliminated) / 1449 std::max<uint64_t>( 1450 TotalMethodLoadEliminationCandidates, 1)) 1451 << "%\n" 1452 << "BOLT-INFO: ICP percentage of indirect branches that are " 1453 "optimized = " 1454 << format("%.1f", (100.0 * TotalNumFrequentJmps) / 1455 std::max<uint64_t>(TotalIndirectJmps, 1)) 1456 << "%\n" 1457 << "BOLT-INFO: ICP percentage of jump table 
callsites that are " 1458 << "optimized = " 1459 << format("%.1f", (100.0 * TotalOptimizedJumpTableCallsites) / 1460 std::max<uint64_t>(TotalJumpTableCallsites, 1)) 1461 << "%\n" 1462 << "BOLT-INFO: ICP number of jump table callsites that can use hot " 1463 << "indices = " << TotalIndexBasedCandidates << "\n" 1464 << "BOLT-INFO: ICP percentage of jump table callsites that use hot " 1465 "indices = " 1466 << format("%.1f", (100.0 * TotalIndexBasedJumps) / 1467 std::max<uint64_t>(TotalIndexBasedCandidates, 1)) 1468 << "%\n"; 1469 1470 (void)verifyProfile; 1471 #ifndef NDEBUG 1472 verifyProfile(BFs); 1473 #endif 1474 } 1475 1476 } // namespace bolt 1477 } // namespace llvm 1478