//===- bolt/Passes/IndirectCallPromotion.cpp ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the IndirectCallPromotion class.
//
//===----------------------------------------------------------------------===//

#include "bolt/Passes/IndirectCallPromotion.h"
#include "bolt/Passes/BinaryFunctionCallGraph.h"
#include "bolt/Passes/DataflowInfoManager.h"
#include "bolt/Passes/Inliner.h"
#include "llvm/Support/CommandLine.h"

#define DEBUG_TYPE "ICP"
// Executes X only when the user-selected verbosity is at least Level.
// Unlike LLVM_DEBUG, this is gated on the -v command-line option rather
// than -debug-only.
#define DEBUG_VERBOSE(Level, X)                                                \
  if (opts::Verbosity >= (Level)) {                                            \
    X;                                                                         \
  }

using namespace llvm;
using namespace bolt;

namespace opts {

extern cl::OptionCategory BoltOptCategory;

extern cl::opt<IndirectCallPromotionType> ICP;
extern cl::opt<unsigned> Verbosity;
extern cl::opt<unsigned> ExecutionCountThreshold;

static cl::opt<unsigned> ICPJTRemainingPercentThreshold(
    "icp-jt-remaining-percent-threshold",
    cl::desc("The percentage threshold against remaining unpromoted indirect "
             "call count for the promotion for jump tables"),
    cl::init(30), cl::ZeroOrMore, cl::Hidden, cl::cat(BoltOptCategory));

static cl::opt<unsigned> ICPJTTotalPercentThreshold(
    "icp-jt-total-percent-threshold",
    cl::desc(
        "The percentage threshold against total count for the promotion for "
        "jump tables"),
    cl::init(5), cl::Hidden, cl::cat(BoltOptCategory));

static cl::opt<unsigned> ICPCallsRemainingPercentThreshold(
    "icp-calls-remaining-percent-threshold",
    cl::desc("The percentage threshold against remaining unpromoted indirect "
             "call count for the promotion for calls"),
    cl::init(50), cl::Hidden, cl::cat(BoltOptCategory));

static cl::opt<unsigned> ICPCallsTotalPercentThreshold(
    "icp-calls-total-percent-threshold",
    cl::desc(
        "The percentage threshold against total count for the promotion for "
        "calls"),
    cl::init(30), cl::Hidden, cl::cat(BoltOptCategory));

static cl::opt<unsigned> ICPMispredictThreshold(
    "indirect-call-promotion-mispredict-threshold",
    cl::desc("misprediction threshold for skipping ICP on an "
             "indirect call"),
    cl::init(0), cl::cat(BoltOptCategory));

static cl::alias ICPMispredictThresholdAlias(
    "icp-mp-threshold",
    cl::desc("alias for --indirect-call-promotion-mispredict-threshold"),
    cl::aliasopt(ICPMispredictThreshold));

static cl::opt<bool> ICPUseMispredicts(
    "indirect-call-promotion-use-mispredicts",
    cl::desc("use misprediction frequency for determining whether or not ICP "
             "should be applied at a callsite. The "
             "-indirect-call-promotion-mispredict-threshold value will be used "
             "by this heuristic"),
    cl::cat(BoltOptCategory));

static cl::alias ICPUseMispredictsAlias(
    "icp-use-mp",
    cl::desc("alias for --indirect-call-promotion-use-mispredicts"),
    cl::aliasopt(ICPUseMispredicts));

static cl::opt<unsigned>
    ICPTopN("indirect-call-promotion-topn",
            cl::desc("limit number of targets to consider when doing indirect "
                     "call promotion. 0 = no limit"),
            cl::init(3), cl::cat(BoltOptCategory));

static cl::alias
    ICPTopNAlias("icp-topn",
                 cl::desc("alias for --indirect-call-promotion-topn"),
                 cl::aliasopt(ICPTopN));

// Calls-specific topn override; when 0, the generic ICPTopN applies.
static cl::opt<unsigned> ICPCallsTopN(
    "indirect-call-promotion-calls-topn",
    cl::desc("limit number of targets to consider when doing indirect "
             "call promotion on calls. 0 = no limit"),
    cl::init(0), cl::cat(BoltOptCategory));

static cl::alias ICPCallsTopNAlias(
    "icp-calls-topn",
    cl::desc("alias for --indirect-call-promotion-calls-topn"),
    cl::aliasopt(ICPCallsTopN));

// Jump-table-specific topn override; when 0, the generic ICPTopN applies.
static cl::opt<unsigned> ICPJumpTablesTopN(
    "indirect-call-promotion-jump-tables-topn",
    cl::desc("limit number of targets to consider when doing indirect "
             "call promotion on jump tables. 0 = no limit"),
    cl::init(0), cl::cat(BoltOptCategory));

static cl::alias ICPJumpTablesTopNAlias(
    "icp-jt-topn",
    cl::desc("alias for --indirect-call-promotion-jump-tables-topn"),
    cl::aliasopt(ICPJumpTablesTopN));

static cl::opt<bool> EliminateLoads(
    "icp-eliminate-loads",
    cl::desc("enable load elimination using memory profiling data when "
             "performing ICP"),
    cl::init(true), cl::cat(BoltOptCategory));

static cl::opt<unsigned> ICPTopCallsites(
    "icp-top-callsites",
    cl::desc("optimize hottest calls until at least this percentage of all "
             "indirect calls frequency is covered. 0 = all callsites"),
    cl::init(99), cl::Hidden, cl::cat(BoltOptCategory));

static cl::list<std::string>
    ICPFuncsList("icp-funcs", cl::CommaSeparated,
                 cl::desc("list of functions to enable ICP for"),
                 cl::value_desc("func1,func2,func3,..."), cl::Hidden,
                 cl::cat(BoltOptCategory));

static cl::opt<bool>
    ICPOldCodeSequence("icp-old-code-sequence",
                       cl::desc("use old code sequence for promoted calls"),
                       cl::Hidden, cl::cat(BoltOptCategory));

static cl::opt<bool> ICPJumpTablesByTarget(
    "icp-jump-tables-targets",
    cl::desc(
        "for jump tables, optimize indirect jmp targets instead of indices"),
    cl::Hidden, cl::cat(BoltOptCategory));

static cl::alias
    ICPJumpTablesByTargetAlias("icp-jt-targets",
                               cl::desc("alias for --icp-jump-tables-targets"),
                               cl::aliasopt(ICPJumpTablesByTarget));

static cl::opt<bool> ICPPeelForInline(
    "icp-inline", cl::desc("only promote call targets eligible for inlining"),
    cl::Hidden, cl::cat(BoltOptCategory));

} // namespace opts

// Sanity-check profile data after ICP: any successor edge with a positive
// branch count must connect two blocks that both have nonzero execution
// counts. Emits a warning for each offending function and returns false if
// any simple function fails the check.
static bool verifyProfile(std::map<uint64_t, BinaryFunction> &BFs) {
  bool IsValid = true;
  for (auto &BFI : BFs) {
    BinaryFunction &BF = BFI.second;
    if (!BF.isSimple())
      continue;
    for (BinaryBasicBlock *BB : BF.layout()) {
      auto BI = BB->branch_info_begin();
      for (BinaryBasicBlock *SuccBB : BB->successors()) {
        if (BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE && BI->Count > 0) {
          if (BB->getKnownExecutionCount() == 0 ||
              SuccBB->getKnownExecutionCount() == 0) {
            errs() << "BOLT-WARNING: profile verification failed after ICP for "
                      "function "
                   << BF << '\n';
            IsValid = false;
          }
        }
        ++BI;
      }
    }
  }
  return IsValid;
}

namespace llvm {
namespace bolt {

// Construct a Callsite from an indirect-call profile entry. When the profile
// entry carries a symbolic target, prefer it and clear the raw address.
IndirectCallPromotion::Callsite::Callsite(BinaryFunction &BF,
                                          const IndirectCallProfile &ICP)
    : From(BF.getSymbol()), To(ICP.Offset), Mispreds(ICP.Mispreds),
      Branches(ICP.Count) {
  if (ICP.Symbol) {
    To.Sym = ICP.Symbol;
    To.Addr = 0;
  }
}

// Print a summary of the promotion decision for one call site: per-target
// counts/mispredicts as percentages of the totals, marking the first N
// (expanded by jump-table indices) targets selected for optimization.
// Totals are clamped to 1 to avoid division by zero in the percentages.
void IndirectCallPromotion::printDecision(
    llvm::raw_ostream &OS,
    std::vector<IndirectCallPromotion::Callsite> &Targets, unsigned N) const {
  uint64_t TotalCount = 0;
  uint64_t TotalMispreds = 0;
  for (const Callsite &S : Targets) {
    TotalCount += S.Branches;
    TotalMispreds += S.Mispreds;
  }
  if (!TotalCount)
    TotalCount = 1;
  if (!TotalMispreds)
    TotalMispreds = 1;

  OS << "BOLT-INFO: ICP decision for call site with " << Targets.size()
     << " targets, Count = " << TotalCount << ", Mispreds = " << TotalMispreds
     << "\n";

  size_t I = 0;
  for (const Callsite &S : Targets) {
    OS << "Count = " << S.Branches << ", "
       << format("%.1f", (100.0 * S.Branches) / TotalCount) << ", "
       << "Mispreds = " << S.Mispreds << ", "
       << format("%.1f", (100.0 * S.Mispreds) / TotalMispreds);
    if (I < N)
      OS << " * to be optimized *";
    if (!S.JTIndices.empty()) {
      OS << " Indices:";
      for (const uint64_t Idx : S.JTIndices)
        OS << " " << Idx;
    }
    OS << "\n";
    // A target covering multiple jump-table indices counts once per index
    // against the N slots to be optimized.
    I += S.JTIndices.empty() ? 1 : S.JTIndices.size();
  }
}

// Get list of targets for a given call sorted by most frequently
// called first.
// Collect the profiled targets of an indirect branch: jump-table entries
// (with their branch info) when Inst uses a jump table, or the annotated
// "CallProfile" targets otherwise. The result is sorted hottest-first and
// stripped of targets without symbols.
std::vector<IndirectCallPromotion::Callsite>
IndirectCallPromotion::getCallTargets(BinaryBasicBlock &BB,
                                      const MCInst &Inst) const {
  BinaryFunction &BF = *BB.getFunction();
  const BinaryContext &BC = BF.getBinaryContext();
  std::vector<Callsite> Targets;

  if (const JumpTable *JT = BF.getJumpTable(Inst)) {
    // Don't support PIC jump tables for now
    if (!opts::ICPJumpTablesByTarget && JT->Type == JumpTable::JTT_PIC)
      return Targets;
    const Location From(BF.getSymbol());
    const std::pair<size_t, size_t> Range =
        JT->getEntriesForAddress(BC.MIB->getJumpTable(Inst));
    assert(JT->Counts.empty() || JT->Counts.size() >= Range.second);
    JumpTable::JumpInfo DefaultJI;
    const JumpTable::JumpInfo *JI =
        JT->Counts.empty() ? &DefaultJI : &JT->Counts[Range.first];
    // With no per-entry counts, keep reusing the zeroed DefaultJI.
    const size_t JIAdj = JT->Counts.empty() ? 0 : 1;
    assert(JT->Type == JumpTable::JTT_PIC ||
           JT->EntrySize == BC.AsmInfo->getCodePointerSize());
    for (size_t I = Range.first; I < Range.second; ++I, JI += JIAdj) {
      MCSymbol *Entry = JT->Entries[I];
      assert(BF.getBasicBlockForLabel(Entry) ||
             Entry == BF.getFunctionEndLabel() ||
             Entry == BF.getFunctionColdEndLabel());
      // Skip end-of-function (unreachable) labels.
      if (Entry == BF.getFunctionEndLabel() ||
          Entry == BF.getFunctionColdEndLabel())
        continue;
      const Location To(Entry);
      const BinaryBasicBlock::BinaryBranchInfo &BI = BB.getBranchInfo(Entry);
      Targets.emplace_back(From, To, BI.MispredictedCount, BI.Count,
                           I - Range.first);
    }

    // Sort by symbol then addr.
    llvm::sort(Targets, [](const Callsite &A, const Callsite &B) {
      if (A.To.Sym && B.To.Sym)
        return A.To.Sym < B.To.Sym;
      else if (A.To.Sym && !B.To.Sym)
        return true;
      else if (!A.To.Sym && B.To.Sym)
        return false;
      else
        return A.To.Addr < B.To.Addr;
    });

    // Targets may contain multiple entries to the same target, but using
    // different indices. Their profile will report the same number of branches
    // for different indices if the target is the same. That's because we don't
    // profile the index value, but only the target via LBR.
    // Merge duplicates (std::unique-style compaction), accumulating indices.
    auto First = Targets.begin();
    auto Last = Targets.end();
    auto Result = First;
    while (++First != Last) {
      Callsite &A = *Result;
      const Callsite &B = *First;
      if (A.To.Sym && B.To.Sym && A.To.Sym == B.To.Sym)
        A.JTIndices.insert(A.JTIndices.end(), B.JTIndices.begin(),
                           B.JTIndices.end());
      else
        *(++Result) = *First;
    }
    ++Result;

    LLVM_DEBUG(if (Targets.end() - Result > 0) {
      dbgs() << "BOLT-INFO: ICP: " << (Targets.end() - Result)
             << " duplicate targets removed\n";
    });

    Targets.erase(Result, Targets.end());
  } else {
    // Don't try to optimize PC relative indirect calls.
    if (Inst.getOperand(0).isReg() &&
        Inst.getOperand(0).getReg() == BC.MRI->getProgramCounter())
      return Targets;

    const auto ICSP = BC.MIB->tryGetAnnotationAs<IndirectCallSiteProfile>(
        Inst, "CallProfile");
    if (ICSP) {
      for (const IndirectCallProfile &CSP : ICSP.get()) {
        Callsite Site(BF, CSP);
        if (Site.isValid())
          Targets.emplace_back(std::move(Site));
      }
    }
  }

  // Sort by target count, number of indices in case of jump table, and
  // mispredicts. We prioritize targets with high count, small number of indices
  // and high mispredicts. Break ties by selecting targets with lower addresses.
  llvm::stable_sort(Targets, [](const Callsite &A, const Callsite &B) {
    if (A.Branches != B.Branches)
      return A.Branches > B.Branches;
    if (A.JTIndices.size() != B.JTIndices.size())
      return A.JTIndices.size() < B.JTIndices.size();
    if (A.Mispreds != B.Mispreds)
      return A.Mispreds > B.Mispreds;
    return A.To.Addr < B.To.Addr;
  });

  // Remove non-symbol targets
  llvm::erase_if(Targets, [](const Callsite &CS) { return !CS.To.Sym; });

  LLVM_DEBUG(if (BF.getJumpTable(Inst)) {
    uint64_t TotalCount = 0;
    uint64_t TotalMispreds = 0;
    for (const Callsite &S : Targets) {
      TotalCount += S.Branches;
      TotalMispreds += S.Mispreds;
    }
    // Clamp to 1 to avoid division by zero below.
    if (!TotalCount)
      TotalCount = 1;
    if (!TotalMispreds)
      TotalMispreds = 1;

    dbgs() << "BOLT-INFO: ICP: jump table size = " << Targets.size()
           << ", Count = " << TotalCount << ", Mispreds = " << TotalMispreds
           << "\n";

    size_t I = 0;
    for (const Callsite &S : Targets) {
      dbgs() << "Count[" << I << "] = " << S.Branches << ", "
             << format("%.1f", (100.0 * S.Branches) / TotalCount) << ", "
             << "Mispreds[" << I << "] = " << S.Mispreds << ", "
             << format("%.1f", (100.0 * S.Mispreds) / TotalMispreds) << "\n";
      ++I;
    }
  });

  return Targets;
}

// Using memory-access profiling on the jump-table load, return the hot
// (count, index) pairs for the table used by CallInst, sorted by descending
// count. On success, TargetFetchInst is set to the load instruction and the
// call gets a "JTIndexReg" annotation. Returns an empty result whenever the
// memory profile is missing, stale, or inconsistent with the table.
IndirectCallPromotion::JumpTableInfoType
IndirectCallPromotion::maybeGetHotJumpTableTargets(BinaryBasicBlock &BB,
                                                   MCInst &CallInst,
                                                   MCInst *&TargetFetchInst,
                                                   const JumpTable *JT) const {
  assert(JT && "Can't get jump table addrs for non-jump tables.");

  BinaryFunction &Function = *BB.getFunction();
  BinaryContext &BC = Function.getBinaryContext();

  if (!Function.hasMemoryProfile() || !opts::EliminateLoads)
    return JumpTableInfoType();

  JumpTableInfoType HotTargets;
  MCInst *MemLocInstr;
  MCInst *PCRelBaseOut;
  unsigned BaseReg, IndexReg;
  int64_t DispValue;
  const MCExpr *DispExpr;
  MutableArrayRef<MCInst> Insts(&BB.front(), &CallInst);
  const IndirectBranchType Type = BC.MIB->analyzeIndirectBranch(
      CallInst, Insts.begin(), Insts.end(), BC.AsmInfo->getCodePointerSize(),
      MemLocInstr, BaseReg, IndexReg, DispValue, DispExpr, PCRelBaseOut);

  assert(MemLocInstr && "There should always be a load for jump tables");
  if (!MemLocInstr)
    return JumpTableInfoType();

  LLVM_DEBUG({
    dbgs() << "BOLT-INFO: ICP attempting to find memory profiling data for "
           << "jump table in " << Function << " at @ "
           << (&CallInst - &BB.front()) << "\n"
           << "BOLT-INFO: ICP target fetch instructions:\n";
    BC.printInstruction(dbgs(), *MemLocInstr, 0, &Function);
    if (MemLocInstr != &CallInst)
      BC.printInstruction(dbgs(), CallInst, 0, &Function);
  });

  DEBUG_VERBOSE(1, {
    dbgs() << "Jmp info: Type = " << (unsigned)Type << ", "
           << "BaseReg = " << BC.MRI->getName(BaseReg) << ", "
           << "IndexReg = " << BC.MRI->getName(IndexReg) << ", "
           << "DispValue = " << Twine::utohexstr(DispValue) << ", "
           << "DispExpr = " << DispExpr << ", "
           << "MemLocInstr = ";
    BC.printInstruction(dbgs(), *MemLocInstr, 0, &Function);
    dbgs() << "\n";
  });

  ++TotalIndexBasedCandidates;

  auto ErrorOrMemAccesssProfile =
      BC.MIB->tryGetAnnotationAs<MemoryAccessProfile>(*MemLocInstr,
                                                      "MemoryAccessProfile");
  if (!ErrorOrMemAccesssProfile) {
    DEBUG_VERBOSE(1, dbgs()
                         << "BOLT-INFO: ICP no memory profiling data found\n");
    return JumpTableInfoType();
  }
  MemoryAccessProfile &MemAccessProfile = ErrorOrMemAccesssProfile.get();

  // Resolve the start address of the jump-table array from the load's
  // displacement (symbolic or immediate).
  uint64_t ArrayStart;
  if (DispExpr) {
    ErrorOr<uint64_t> DispValueOrError =
        BC.getSymbolValue(*BC.MIB->getTargetSymbol(DispExpr));
    assert(DispValueOrError && "global symbol needs a value");
    ArrayStart = *DispValueOrError;
  } else {
    ArrayStart = static_cast<uint64_t>(DispValue);
  }

  // RIP-relative addressing is relative to the next instruction.
  if (BaseReg == BC.MRI->getProgramCounter())
    ArrayStart += Function.getAddress() + MemAccessProfile.NextInstrOffset;

  // This is a map of [symbol] -> [count, index] and is used to combine indices
  // into the jump table since there may be multiple addresses that all have the
  // same entry.
  std::map<MCSymbol *, std::pair<uint64_t, uint64_t>> HotTargetMap;
  const std::pair<size_t, size_t> Range = JT->getEntriesForAddress(ArrayStart);

  for (const AddressAccess &AccessInfo : MemAccessProfile.AddressAccessInfo) {
    size_t Index;
    // Mem data occasionally includes nullprs, ignore them.
    if (!AccessInfo.MemoryObject && !AccessInfo.Offset)
      continue;

    if (AccessInfo.Offset % JT->EntrySize != 0) // ignore bogus data
      return JumpTableInfoType();

    if (AccessInfo.MemoryObject) {
      // Deal with bad/stale data
      if (!AccessInfo.MemoryObject->getName().startswith(
              "JUMP_TABLE/" + Function.getOneName().str()))
        return JumpTableInfoType();
      Index =
          (AccessInfo.Offset - (ArrayStart - JT->getAddress())) / JT->EntrySize;
    } else {
      Index = (AccessInfo.Offset - ArrayStart) / JT->EntrySize;
    }

    // If Index is out of range it probably means the memory profiling data is
    // wrong for this instruction, bail out.
    if (Index >= Range.second) {
      LLVM_DEBUG(dbgs() << "BOLT-INFO: Index out of range of " << Range.first
                        << ", " << Range.second << "\n");
      return JumpTableInfoType();
    }

    // Make sure the hot index points at a legal label corresponding to a BB,
    // e.g. not the end of function (unreachable) label.
    if (!Function.getBasicBlockForLabel(JT->Entries[Index + Range.first])) {
      LLVM_DEBUG({
        dbgs() << "BOLT-INFO: hot index " << Index << " pointing at bogus "
               << "label " << JT->Entries[Index + Range.first]->getName()
               << " in jump table:\n";
        JT->print(dbgs());
        dbgs() << "HotTargetMap:\n";
        for (std::pair<MCSymbol *const, std::pair<uint64_t, uint64_t>> &HT :
             HotTargetMap)
          dbgs() << "BOLT-INFO: " << HT.first->getName()
                 << " = (count=" << HT.second.first
                 << ", index=" << HT.second.second << ")\n";
      });
      return JumpTableInfoType();
    }

    // Accumulate counts per target symbol; keep the last index seen for it.
    std::pair<uint64_t, uint64_t> &HotTarget =
        HotTargetMap[JT->Entries[Index + Range.first]];
    HotTarget.first += AccessInfo.Count;
    HotTarget.second = Index;
  }

  llvm::transform(
      HotTargetMap, std::back_inserter(HotTargets),
      [](const std::pair<MCSymbol *, std::pair<uint64_t, uint64_t>> &A) {
        return A.second;
      });

  // Sort with highest counts first.
  llvm::sort(reverse(HotTargets));

  LLVM_DEBUG({
    dbgs() << "BOLT-INFO: ICP jump table hot targets:\n";
    for (const std::pair<uint64_t, uint64_t> &Target : HotTargets)
      dbgs() << "BOLT-INFO: Idx = " << Target.second << ", "
             << "Count = " << Target.first << "\n";
  });

  BC.MIB->getOrCreateAnnotationAs<uint16_t>(CallInst, "JTIndexReg") = IndexReg;

  TargetFetchInst = MemLocInstr;

  return HotTargets;
}

// Compute the (symbol, jump-table index) pairs for the first N promotion
// candidates. For jump tables, the memory profile (if available) is used to
// re-rank targets by hot index, splitting per-index counts out of merged
// targets; N is updated to the number of entries actually selected.
IndirectCallPromotion::SymTargetsType
IndirectCallPromotion::findCallTargetSymbols(std::vector<Callsite> &Targets,
                                             size_t &N, BinaryBasicBlock &BB,
                                             MCInst &CallInst,
                                             MCInst *&TargetFetchInst) const {
  const JumpTable *JT = BB.getFunction()->getJumpTable(CallInst);
  SymTargetsType SymTargets;

  if (!JT) {
    for (size_t I = 0; I < N; ++I) {
      assert(Targets[I].To.Sym && "All ICP targets must be to known symbols");
      assert(Targets[I].JTIndices.empty() &&
             "Can't have jump table indices for non-jump tables");
      SymTargets.emplace_back(Targets[I].To.Sym, 0);
    }
    return SymTargets;
  }

  // Use memory profile to select hot targets.
  JumpTableInfoType HotTargets =
      maybeGetHotJumpTableTargets(BB, CallInst, TargetFetchInst, JT);

  // Map a jump-table index back to the Targets element that covers it.
  auto findTargetsIndex = [&](uint64_t JTIndex) {
    for (size_t I = 0; I < Targets.size(); ++I)
      if (llvm::is_contained(Targets[I].JTIndices, JTIndex))
        return I;
    LLVM_DEBUG(dbgs() << "BOLT-ERROR: Unable to find target index for hot jump "
                      << " table entry in " << *BB.getFunction() << "\n");
    llvm_unreachable("Hot indices must be referred to by at least one "
                     "callsite");
  };

  if (!HotTargets.empty()) {
    if (opts::Verbosity >= 1)
      for (size_t I = 0; I < HotTargets.size(); ++I)
        outs() << "BOLT-INFO: HotTarget[" << I << "] = (" << HotTargets[I].first
               << ", " << HotTargets[I].second << ")\n";

    // Recompute hottest targets, now discriminating which index is hot
    // NOTE: This is a tradeoff. On one hand, we get index information. On the
    // other hand, info coming from the memory profile is much less accurate
    // than LBRs. So we may actually end up working with more coarse
    // profile granularity in exchange for information about indices.
    std::vector<Callsite> NewTargets;
    std::map<const MCSymbol *, uint32_t> IndicesPerTarget;
    uint64_t TotalMemAccesses = 0;
    for (size_t I = 0; I < HotTargets.size(); ++I) {
      const uint64_t TargetIndex = findTargetsIndex(HotTargets[I].second);
      ++IndicesPerTarget[Targets[TargetIndex].To.Sym];
      TotalMemAccesses += HotTargets[I].first;
    }
    uint64_t RemainingMemAccesses = TotalMemAccesses;
    const size_t TopN =
        opts::ICPJumpTablesTopN ? opts::ICPJumpTablesTopN : opts::ICPTopN;
    size_t I = 0;
    for (; I < HotTargets.size(); ++I) {
      const uint64_t MemAccesses = HotTargets[I].first;
      // Stop once a target falls below either percentage threshold or the
      // topn limit; HotTargets is sorted, so the rest are colder.
      if (100 * MemAccesses <
          TotalMemAccesses * opts::ICPJTTotalPercentThreshold)
        break;
      if (100 * MemAccesses <
          RemainingMemAccesses * opts::ICPJTRemainingPercentThreshold)
        break;
      if (TopN && I >= TopN)
        break;
      RemainingMemAccesses -= MemAccesses;

      const uint64_t JTIndex = HotTargets[I].second;
      Callsite &Target = Targets[findTargetsIndex(JTIndex)];

      // The selected copy keeps exactly this one index; remove it from the
      // original so counts are not double-attributed.
      NewTargets.push_back(Target);
      std::vector<uint64_t>({JTIndex}).swap(NewTargets.back().JTIndices);
      llvm::erase_value(Target.JTIndices, JTIndex);

      // Keep fixCFG counts sane if more indices use this same target later
      assert(IndicesPerTarget[Target.To.Sym] > 0 && "wrong map");
      NewTargets.back().Branches =
          Target.Branches / IndicesPerTarget[Target.To.Sym];
      NewTargets.back().Mispreds =
          Target.Mispreds / IndicesPerTarget[Target.To.Sym];
      assert(Target.Branches >= NewTargets.back().Branches);
      assert(Target.Mispreds >= NewTargets.back().Mispreds);
      Target.Branches -= NewTargets.back().Branches;
      Target.Mispreds -= NewTargets.back().Mispreds;
    }
    llvm::copy(Targets, std::back_inserter(NewTargets));
    std::swap(NewTargets, Targets);
    N = I;

    if (N == 0 && opts::Verbosity >= 1) {
      outs() << "BOLT-INFO: ICP failed in " << *BB.getFunction() << " in "
             << BB.getName() << ": failed to meet thresholds after memory "
             << "profile data was loaded.\n";
      return SymTargets;
    }
  }

  // Emit one (symbol, index) pair per jump-table index; I counts emitted
  // pairs while TgtIdx walks the target list.
  for (size_t I = 0, TgtIdx = 0; I < N; ++TgtIdx) {
    Callsite &Target = Targets[TgtIdx];
    assert(Target.To.Sym && "All ICP targets must be to known symbols");
    assert(!Target.JTIndices.empty() && "Jump tables must have indices");
    for (uint64_t Idx : Target.JTIndices) {
      SymTargets.emplace_back(Target.To.Sym, Idx);
      ++I;
    }
  }

  return SymTargets;
}

// For a virtual method call, use the memory profile on the method-load
// instruction to find the vtable (symbol, addend) for every target method.
// Returns an empty MethodInfoType if the call can't be analyzed, any vtable
// is unknown, or the vtable register may be clobbered before the call.
IndirectCallPromotion::MethodInfoType IndirectCallPromotion::maybeGetVtableSyms(
    BinaryBasicBlock &BB, MCInst &Inst,
    const SymTargetsType &SymTargets) const {
  BinaryFunction &Function = *BB.getFunction();
  BinaryContext &BC = Function.getBinaryContext();
  std::vector<std::pair<MCSymbol *, uint64_t>> VtableSyms;
  std::vector<MCInst *> MethodFetchInsns;
  unsigned VtableReg, MethodReg;
  uint64_t MethodOffset;

  assert(!Function.getJumpTable(Inst) &&
         "Can't get vtable addrs for jump tables.");

  if (!Function.hasMemoryProfile() || !opts::EliminateLoads)
    return MethodInfoType();

  MutableArrayRef<MCInst> Insts(&BB.front(), &Inst + 1);
  if (!BC.MIB->analyzeVirtualMethodCall(Insts.begin(), Insts.end(),
                                        MethodFetchInsns, VtableReg, MethodReg,
                                        MethodOffset)) {
    DEBUG_VERBOSE(
        1, dbgs() << "BOLT-INFO: ICP unable to analyze method call in "
                  << Function << " at @ " << (&Inst - &BB.front()) << "\n");
    return MethodInfoType();
  }

  ++TotalMethodLoadEliminationCandidates;

  DEBUG_VERBOSE(1, {
    dbgs() << "BOLT-INFO: ICP found virtual method call in " << Function
           << " at @ " << (&Inst - &BB.front()) << "\n";
    dbgs() << "BOLT-INFO: ICP method fetch instructions:\n";
    for (MCInst *Inst : MethodFetchInsns)
      BC.printInstruction(dbgs(), *Inst, 0, &Function);

    if (MethodFetchInsns.back() != &Inst)
      BC.printInstruction(dbgs(), Inst, 0, &Function);
  });

  // Try to get value profiling data for the method load instruction.
  auto ErrorOrMemAccesssProfile =
      BC.MIB->tryGetAnnotationAs<MemoryAccessProfile>(*MethodFetchInsns.back(),
                                                      "MemoryAccessProfile");
  if (!ErrorOrMemAccesssProfile) {
    DEBUG_VERBOSE(1, dbgs()
                         << "BOLT-INFO: ICP no memory profiling data found\n");
    return MethodInfoType();
  }
  MemoryAccessProfile &MemAccessProfile = ErrorOrMemAccesssProfile.get();

  // Find the vtable that each method belongs to.
  std::map<const MCSymbol *, uint64_t> MethodToVtable;

  for (const AddressAccess &AccessInfo : MemAccessProfile.AddressAccessInfo) {
    uint64_t Address = AccessInfo.Offset;
    if (AccessInfo.MemoryObject)
      Address += AccessInfo.MemoryObject->getAddress();

    // Ignore bogus data.
    if (!Address)
      continue;

    // The accessed slot is vtable base + method offset.
    const uint64_t VtableBase = Address - MethodOffset;

    DEBUG_VERBOSE(1, dbgs() << "BOLT-INFO: ICP vtable = "
                            << Twine::utohexstr(VtableBase) << "+"
                            << MethodOffset << "/" << AccessInfo.Count << "\n");

    if (ErrorOr<uint64_t> MethodAddr = BC.getPointerAtAddress(Address)) {
      BinaryData *MethodBD = BC.getBinaryDataAtAddress(MethodAddr.get());
      if (!MethodBD) // skip unknown methods
        continue;
      MCSymbol *MethodSym = MethodBD->getSymbol();
      MethodToVtable[MethodSym] = VtableBase;
      DEBUG_VERBOSE(1, {
        const BinaryFunction *Method = BC.getFunctionForSymbol(MethodSym);
        dbgs() << "BOLT-INFO: ICP found method = "
               << Twine::utohexstr(MethodAddr.get()) << "/"
               << (Method ? Method->getPrintName() : "") << "\n";
      });
    }
  }

  // Find the vtable for each target symbol.
  for (size_t I = 0; I < SymTargets.size(); ++I) {
    auto Itr = MethodToVtable.find(SymTargets[I].first);
    if (Itr != MethodToVtable.end()) {
      if (BinaryData *BD = BC.getBinaryDataContainingAddress(Itr->second)) {
        const uint64_t Addend = Itr->second - BD->getAddress();
        VtableSyms.emplace_back(BD->getSymbol(), Addend);
        continue;
      }
    }
    // Give up if we can't find the vtable for a method.
    DEBUG_VERBOSE(1, dbgs() << "BOLT-INFO: ICP can't find vtable for "
                            << SymTargets[I].first->getName() << "\n");
    return MethodInfoType();
  }

  // Make sure the vtable reg is not clobbered by the argument passing code
  if (VtableReg != MethodReg) {
    for (MCInst *CurInst = MethodFetchInsns.front(); CurInst < &Inst;
         ++CurInst) {
      const MCInstrDesc &InstrInfo = BC.MII->get(CurInst->getOpcode());
      if (InstrInfo.hasDefOfPhysReg(*CurInst, VtableReg, *BC.MRI))
        return MethodInfoType();
    }
  }

  return MethodInfoType(VtableSyms, MethodFetchInsns);
}

// Replace the indirect call/jump in IndCallBlock with the generated ICP code
// sequence: the first ICPcode fragment goes into the original block and each
// remaining (symbol, instructions) fragment becomes a new basic block.
// Returns the new blocks; the caller wires up the CFG via fixCFG.
std::vector<std::unique_ptr<BinaryBasicBlock>>
IndirectCallPromotion::rewriteCall(
    BinaryBasicBlock &IndCallBlock, const MCInst &CallInst,
    MCPlusBuilder::BlocksVectorTy &&ICPcode,
    const std::vector<MCInst *> &MethodFetchInsns) const {
  BinaryFunction &Function = *IndCallBlock.getFunction();
  MCPlusBuilder *MIB = Function.getBinaryContext().MIB.get();

  // Create new basic blocks with correct code in each one first.
  std::vector<std::unique_ptr<BinaryBasicBlock>> NewBBs;
  const bool IsTailCallOrJT =
      (MIB->isTailCall(CallInst) || Function.getJumpTable(CallInst));

  // Move instructions from the tail of the original call block
  // to the merge block.

  // Remember any pseudo instructions following a tail call. These
  // must be preserved and moved to the original block.
  InstructionListType TailInsts;
  const MCInst *TailInst = &CallInst;
  if (IsTailCallOrJT)
    while (TailInst + 1 < &(*IndCallBlock.end()) &&
           MIB->isPseudo(*(TailInst + 1)))
      TailInsts.push_back(*++TailInst);

  InstructionListType MovedInst = IndCallBlock.splitInstructions(&CallInst);
  // Link new BBs to the original input offset of the BB where the indirect
  // call site is, so we can map samples recorded in new BBs back to the
  // original BB seen in the input binary (if using BAT)
  const uint32_t OrigOffset = IndCallBlock.getInputOffset();

  IndCallBlock.eraseInstructions(MethodFetchInsns.begin(),
                                 MethodFetchInsns.end());
  // If the call itself was erased above (it was the last method-fetch insn),
  // or the block is now empty, append the first fragment; otherwise replace
  // the remaining call instruction in place.
  if (IndCallBlock.empty() ||
      (!MethodFetchInsns.empty() && MethodFetchInsns.back() == &CallInst))
    IndCallBlock.addInstructions(ICPcode.front().second.begin(),
                                 ICPcode.front().second.end());
  else
    IndCallBlock.replaceInstruction(std::prev(IndCallBlock.end()),
                                    ICPcode.front().second);
  IndCallBlock.addInstructions(TailInsts.begin(), TailInsts.end());

  for (auto Itr = ICPcode.begin() + 1; Itr != ICPcode.end(); ++Itr) {
    MCSymbol *&Sym = Itr->first;
    InstructionListType &Insts = Itr->second;
    assert(Sym);
    std::unique_ptr<BinaryBasicBlock> TBB = Function.createBasicBlock(Sym);
    TBB->setOffset(OrigOffset);
    for (MCInst &Inst : Insts) // sanitize new instructions.
      if (MIB->isCall(Inst))
        MIB->removeAnnotation(Inst, "CallProfile");
    TBB->addInstructions(Insts.begin(), Insts.end());
    NewBBs.emplace_back(std::move(TBB));
  }

  // Move tail of instructions from after the original call to
  // the merge block.
  if (!IsTailCallOrJT)
    NewBBs.back()->addInstructions(MovedInst.begin(), MovedInst.end());

  return NewBBs;
}

// Wire the new blocks produced by rewriteCall into the CFG and distribute
// execution/branch counts among them. Returns the merge block (non-tail,
// non-jump-table calls only), or nullptr.
BinaryBasicBlock *
IndirectCallPromotion::fixCFG(BinaryBasicBlock &IndCallBlock,
                              const bool IsTailCall, const bool IsJumpTable,
                              IndirectCallPromotion::BasicBlocksVector &&NewBBs,
                              const std::vector<Callsite> &Targets) const {
  BinaryFunction &Function = *IndCallBlock.getFunction();
  using BinaryBranchInfo = BinaryBasicBlock::BinaryBranchInfo;
  BinaryBasicBlock *MergeBlock = nullptr;

  // Scale indirect call counts to the execution count of the original
  // basic block containing the indirect call.
  uint64_t TotalCount = IndCallBlock.getKnownExecutionCount();
  uint64_t TotalIndirectBranches = 0;
  for (const Callsite &Target : Targets)
    TotalIndirectBranches += Target.Branches;
  // Clamp to 1 to avoid division by zero when scaling below.
  if (TotalIndirectBranches == 0)
    TotalIndirectBranches = 1;
  // BBI holds raw per-entry counts; ScaledBBI holds counts scaled to the
  // block's execution count. Targets with multiple jump-table indices have
  // their counts split evenly across the indices.
  BinaryBasicBlock::BranchInfoType BBI;
  BinaryBasicBlock::BranchInfoType ScaledBBI;
  for (const Callsite &Target : Targets) {
    const size_t NumEntries =
        std::max(static_cast<std::size_t>(1UL), Target.JTIndices.size());
    for (size_t I = 0; I < NumEntries; ++I) {
      BBI.push_back(
          BinaryBranchInfo{(Target.Branches + NumEntries - 1) / NumEntries,
                           (Target.Mispreds + NumEntries - 1) / NumEntries});
      ScaledBBI.push_back(
          BinaryBranchInfo{uint64_t(TotalCount * Target.Branches /
                                    (NumEntries * TotalIndirectBranches)),
                           uint64_t(TotalCount * Target.Mispreds /
                                    (NumEntries * TotalIndirectBranches))});
    }
  }

  if (IsJumpTable) {
    // The last new block holds the remaining (unpromoted) indirect jump and
    // inherits all of the original block's successors.
    BinaryBasicBlock *NewIndCallBlock = NewBBs.back().get();
    IndCallBlock.moveAllSuccessorsTo(NewIndCallBlock);

    std::vector<MCSymbol *> SymTargets;
    for (const Callsite &Target : Targets) {
      const size_t NumEntries =
          std::max(static_cast<std::size_t>(1UL), Target.JTIndices.size());
      for (size_t I = 0; I < NumEntries; ++I)
        SymTargets.push_back(Target.To.Sym);
    }
    assert(SymTargets.size() > NewBBs.size() - 1 &&
           "There must be a target symbol associated with each new BB.");

    // Each block compares against one target: taken edge goes to the target
    // BB, fall-through goes to the next comparison block.
    for (uint64_t I = 0; I < NewBBs.size(); ++I) {
      BinaryBasicBlock *SourceBB = I ? NewBBs[I - 1].get() : &IndCallBlock;
      SourceBB->setExecutionCount(TotalCount);

      BinaryBasicBlock *TargetBB =
          Function.getBasicBlockForLabel(SymTargets[I]);
      SourceBB->addSuccessor(TargetBB, ScaledBBI[I]); // taken

      TotalCount -= ScaledBBI[I].Count;
      SourceBB->addSuccessor(NewBBs[I].get(), TotalCount); // fall-through

      // Update branch info for the indirect jump.
      BinaryBasicBlock::BinaryBranchInfo &BranchInfo =
          NewIndCallBlock->getBranchInfo(*TargetBB);
      if (BranchInfo.Count > BBI[I].Count)
        BranchInfo.Count -= BBI[I].Count;
      else
        BranchInfo.Count = 0;

      if (BranchInfo.MispredictedCount > BBI[I].MispredictedCount)
        BranchInfo.MispredictedCount -= BBI[I].MispredictedCount;
      else
        BranchInfo.MispredictedCount = 0;
    }
  } else {
    // Calls: blocks alternate compare/call pairs; odd/even positions are
    // handled differently below.
    assert(NewBBs.size() >= 2);
    assert(NewBBs.size() % 2 == 1 || IndCallBlock.succ_empty());
    assert(NewBBs.size() % 2 == 1 || IsTailCall);

    auto ScaledBI = ScaledBBI.begin();
    auto updateCurrentBranchInfo = [&] {
      assert(ScaledBI != ScaledBBI.end());
      TotalCount -= ScaledBI->Count;
      ++ScaledBI;
    };

    if (!IsTailCall) {
      MergeBlock = NewBBs.back().get();
      IndCallBlock.moveAllSuccessorsTo(MergeBlock);
    }

    // Fix up successors and execution counts.
    updateCurrentBranchInfo();
    IndCallBlock.addSuccessor(NewBBs[1].get(), TotalCount);
    IndCallBlock.addSuccessor(NewBBs[0].get(), ScaledBBI[0]);

    // Tail calls have no merge block, so one fewer block to skip at the end.
    const size_t Adj = IsTailCall ? 1 : 2;
    for (size_t I = 0; I < NewBBs.size() - Adj; ++I) {
      assert(TotalCount <= IndCallBlock.getExecutionCount() ||
             TotalCount <= uint64_t(TotalIndirectBranches));
      uint64_t ExecCount = ScaledBBI[(I + 1) / 2].Count;
      if (I % 2 == 0) {
        // Even blocks are promoted-call blocks: they branch to the merge
        // block (if any).
        if (MergeBlock)
          NewBBs[I]->addSuccessor(MergeBlock, ScaledBBI[(I + 1) / 2].Count);
      } else {
        // Odd blocks are comparison blocks: fall through to the next
        // comparison, taken edge to the matched call block.
        assert(I + 2 < NewBBs.size());
        updateCurrentBranchInfo();
        NewBBs[I]->addSuccessor(NewBBs[I + 2].get(), TotalCount);
        NewBBs[I]->addSuccessor(NewBBs[I + 1].get(), ScaledBBI[(I + 1) / 2]);
        ExecCount += TotalCount;
      }
      NewBBs[I]->setExecutionCount(ExecCount);
    }

    if (MergeBlock) {
      // Arrange for the MergeBlock to be the fallthrough for the first
      // promoted call block.
      std::unique_ptr<BinaryBasicBlock> MBPtr;
      std::swap(MBPtr, NewBBs.back());
      NewBBs.pop_back();
      NewBBs.emplace(NewBBs.begin() + 1, std::move(MBPtr));
      // TODO: is COUNT_FALLTHROUGH_EDGE the right thing here?
      NewBBs.back()->addSuccessor(MergeBlock, TotalCount); // uncond branch
    }
  }

  // Update the execution count.
  NewBBs.back()->setExecutionCount(TotalCount);

  // Update BB and BB layout.
  Function.insertBasicBlocks(&IndCallBlock, std::move(NewBBs));
  assert(Function.validateCFG());

  return MergeBlock;
}

// Decide how many targets (if any) at this call site should be promoted,
// applying execution-count, topn, and percentage thresholds.
// NOTE: this function continues beyond the end of the visible source chunk.
size_t IndirectCallPromotion::canPromoteCallsite(
    const BinaryBasicBlock &BB, const MCInst &Inst,
    const std::vector<Callsite> &Targets, uint64_t NumCalls) {
  BinaryFunction *BF = BB.getFunction();
  const BinaryContext &BC = BF->getBinaryContext();

  if (BB.getKnownExecutionCount() < opts::ExecutionCountThreshold)
    return 0;

  const bool IsJumpTable = BF->getJumpTable(Inst);

  auto computeStats = [&](size_t N) {
    for (size_t I = 0; I < N; ++I)
      if (IsJumpTable)
        TotalNumFrequentJmps += Targets[I].Branches;
      else
        TotalNumFrequentCalls += Targets[I].Branches;
  };

  // If we have no targets (or no calls), skip this callsite.
  if (Targets.empty() || !NumCalls) {
    if (opts::Verbosity >= 1) {
      const ptrdiff_t InstIdx = &Inst - &(*BB.begin());
      outs() << "BOLT-INFO: ICP failed in " << *BF << " @ " << InstIdx << " in "
             << BB.getName() << ", calls = " << NumCalls
             << ", targets empty or NumCalls == 0.\n";
    }
    return 0;
  }

  size_t TopN = opts::ICPTopN;
  if (IsJumpTable)
    TopN = opts::ICPJumpTablesTopN ? opts::ICPJumpTablesTopN : TopN;
  else
    TopN = opts::ICPCallsTopN ? opts::ICPCallsTopN : TopN;

  const size_t TrialN = TopN ? std::min(TopN, Targets.size()) : Targets.size();

  if (opts::ICPTopCallsites > 0) {
    if (!BC.MIB->hasAnnotation(Inst, "DoICP"))
      return 0;
  }

  // Pick the top N targets.
  uint64_t TotalMispredictsTopN = 0;
  size_t N = 0;

  if (opts::ICPUseMispredicts &&
      (!IsJumpTable || opts::ICPJumpTablesByTarget)) {
    // Count total number of mispredictions for (at most) the top N targets.
    // We may choose a smaller N (TrialN vs. N) if the frequency threshold
    // is exceeded by fewer targets.
994 double Threshold = double(opts::ICPMispredictThreshold); 995 for (size_t I = 0; I < TrialN && Threshold > 0; ++I, ++N) { 996 Threshold -= (100.0 * Targets[I].Mispreds) / NumCalls; 997 TotalMispredictsTopN += Targets[I].Mispreds; 998 } 999 computeStats(N); 1000 1001 // Compute the misprediction frequency of the top N call targets. If this 1002 // frequency is greater than the threshold, we should try ICP on this 1003 // callsite. 1004 const double TopNFrequency = (100.0 * TotalMispredictsTopN) / NumCalls; 1005 if (TopNFrequency == 0 || TopNFrequency < opts::ICPMispredictThreshold) { 1006 if (opts::Verbosity >= 1) { 1007 const ptrdiff_t InstIdx = &Inst - &(*BB.begin()); 1008 outs() << "BOLT-INFO: ICP failed in " << *BF << " @ " << InstIdx 1009 << " in " << BB.getName() << ", calls = " << NumCalls 1010 << ", top N mis. frequency " << format("%.1f", TopNFrequency) 1011 << "% < " << opts::ICPMispredictThreshold << "%\n"; 1012 } 1013 return 0; 1014 } 1015 } else { 1016 size_t MaxTargets = 0; 1017 1018 // Count total number of calls for (at most) the top N targets. 1019 // We may choose a smaller N (TrialN vs. N) if the frequency threshold 1020 // is exceeded by fewer targets. 1021 const unsigned TotalThreshold = IsJumpTable 1022 ? opts::ICPJTTotalPercentThreshold 1023 : opts::ICPCallsTotalPercentThreshold; 1024 const unsigned RemainingThreshold = 1025 IsJumpTable ? opts::ICPJTRemainingPercentThreshold 1026 : opts::ICPCallsRemainingPercentThreshold; 1027 uint64_t NumRemainingCalls = NumCalls; 1028 for (size_t I = 0; I < TrialN; ++I, ++MaxTargets) { 1029 if (100 * Targets[I].Branches < NumCalls * TotalThreshold) 1030 break; 1031 if (100 * Targets[I].Branches < NumRemainingCalls * RemainingThreshold) 1032 break; 1033 if (N + (Targets[I].JTIndices.empty() ? 1 : Targets[I].JTIndices.size()) > 1034 TrialN) 1035 break; 1036 TotalMispredictsTopN += Targets[I].Mispreds; 1037 NumRemainingCalls -= Targets[I].Branches; 1038 N += Targets[I].JTIndices.empty() ? 
1 : Targets[I].JTIndices.size(); 1039 } 1040 computeStats(MaxTargets); 1041 1042 // Don't check misprediction frequency for jump tables -- we don't really 1043 // care as long as we are saving loads from the jump table. 1044 if (!IsJumpTable || opts::ICPJumpTablesByTarget) { 1045 // Compute the misprediction frequency of the top N call targets. If 1046 // this frequency is less than the threshold, we should skip ICP at 1047 // this callsite. 1048 const double TopNMispredictFrequency = 1049 (100.0 * TotalMispredictsTopN) / NumCalls; 1050 1051 if (TopNMispredictFrequency < opts::ICPMispredictThreshold) { 1052 if (opts::Verbosity >= 1) { 1053 const ptrdiff_t InstIdx = &Inst - &(*BB.begin()); 1054 outs() << "BOLT-INFO: ICP failed in " << *BF << " @ " << InstIdx 1055 << " in " << BB.getName() << ", calls = " << NumCalls 1056 << ", top N mispredict frequency " 1057 << format("%.1f", TopNMispredictFrequency) << "% < " 1058 << opts::ICPMispredictThreshold << "%\n"; 1059 } 1060 return 0; 1061 } 1062 } 1063 } 1064 1065 // Filter by inline-ability of target functions, stop at first target that 1066 // can't be inlined. 
1067 if (opts::ICPPeelForInline) { 1068 for (size_t I = 0; I < N; ++I) { 1069 const MCSymbol *TargetSym = Targets[I].To.Sym; 1070 const BinaryFunction *TargetBF = BC.getFunctionForSymbol(TargetSym); 1071 if (!BinaryFunctionPass::shouldOptimize(*TargetBF) || 1072 getInliningInfo(*TargetBF).Type == InliningType::INL_NONE) { 1073 N = I; 1074 break; 1075 } 1076 } 1077 } 1078 1079 // Filter functions that can have ICP applied (for debugging) 1080 if (!opts::ICPFuncsList.empty()) { 1081 for (std::string &Name : opts::ICPFuncsList) 1082 if (BF->hasName(Name)) 1083 return N; 1084 return 0; 1085 } 1086 1087 return N; 1088 } 1089 1090 void IndirectCallPromotion::printCallsiteInfo( 1091 const BinaryBasicBlock &BB, const MCInst &Inst, 1092 const std::vector<Callsite> &Targets, const size_t N, 1093 uint64_t NumCalls) const { 1094 BinaryContext &BC = BB.getFunction()->getBinaryContext(); 1095 const bool IsTailCall = BC.MIB->isTailCall(Inst); 1096 const bool IsJumpTable = BB.getFunction()->getJumpTable(Inst); 1097 const ptrdiff_t InstIdx = &Inst - &(*BB.begin()); 1098 1099 outs() << "BOLT-INFO: ICP candidate branch info: " << *BB.getFunction() 1100 << " @ " << InstIdx << " in " << BB.getName() 1101 << " -> calls = " << NumCalls 1102 << (IsTailCall ? " (tail)" : (IsJumpTable ? " (jump table)" : "")) 1103 << "\n"; 1104 for (size_t I = 0; I < N; I++) { 1105 const double Frequency = 100.0 * Targets[I].Branches / NumCalls; 1106 const double MisFrequency = 100.0 * Targets[I].Mispreds / NumCalls; 1107 outs() << "BOLT-INFO: "; 1108 if (Targets[I].To.Sym) 1109 outs() << Targets[I].To.Sym->getName(); 1110 else 1111 outs() << Targets[I].To.Addr; 1112 outs() << ", calls = " << Targets[I].Branches 1113 << ", mispreds = " << Targets[I].Mispreds 1114 << ", taken freq = " << format("%.1f", Frequency) << "%" 1115 << ", mis. freq = " << format("%.1f", MisFrequency) << "%"; 1116 bool First = true; 1117 for (uint64_t JTIndex : Targets[I].JTIndices) { 1118 outs() << (First ? 
", indices = " : ", ") << JTIndex; 1119 First = false; 1120 } 1121 outs() << "\n"; 1122 } 1123 1124 LLVM_DEBUG({ 1125 dbgs() << "BOLT-INFO: ICP original call instruction:"; 1126 BC.printInstruction(dbgs(), Inst, Targets[0].From.Addr, nullptr, true); 1127 }); 1128 } 1129 1130 void IndirectCallPromotion::runOnFunctions(BinaryContext &BC) { 1131 if (opts::ICP == ICP_NONE) 1132 return; 1133 1134 auto &BFs = BC.getBinaryFunctions(); 1135 1136 const bool OptimizeCalls = (opts::ICP == ICP_CALLS || opts::ICP == ICP_ALL); 1137 const bool OptimizeJumpTables = 1138 (opts::ICP == ICP_JUMP_TABLES || opts::ICP == ICP_ALL); 1139 1140 std::unique_ptr<RegAnalysis> RA; 1141 std::unique_ptr<BinaryFunctionCallGraph> CG; 1142 if (OptimizeJumpTables) { 1143 CG.reset(new BinaryFunctionCallGraph(buildCallGraph(BC))); 1144 RA.reset(new RegAnalysis(BC, &BFs, &*CG)); 1145 } 1146 1147 // If icp-top-callsites is enabled, compute the total number of indirect 1148 // calls and then optimize the hottest callsites that contribute to that 1149 // total. 1150 SetVector<BinaryFunction *> Functions; 1151 if (opts::ICPTopCallsites == 0) { 1152 for (auto &KV : BFs) 1153 Functions.insert(&KV.second); 1154 } else { 1155 using IndirectCallsite = std::tuple<uint64_t, MCInst *, BinaryFunction *>; 1156 std::vector<IndirectCallsite> IndirectCalls; 1157 size_t TotalIndirectCalls = 0; 1158 1159 // Find all the indirect callsites. 
1160 for (auto &BFIt : BFs) { 1161 BinaryFunction &Function = BFIt.second; 1162 1163 if (!Function.isSimple() || Function.isIgnored() || 1164 !Function.hasProfile()) 1165 continue; 1166 1167 const bool HasLayout = !Function.layout_empty(); 1168 1169 for (BinaryBasicBlock &BB : Function) { 1170 if (HasLayout && Function.isSplit() && BB.isCold()) 1171 continue; 1172 1173 for (MCInst &Inst : BB) { 1174 const bool IsJumpTable = Function.getJumpTable(Inst); 1175 const bool HasIndirectCallProfile = 1176 BC.MIB->hasAnnotation(Inst, "CallProfile"); 1177 const bool IsDirectCall = 1178 (BC.MIB->isCall(Inst) && BC.MIB->getTargetSymbol(Inst, 0)); 1179 1180 if (!IsDirectCall && 1181 ((HasIndirectCallProfile && !IsJumpTable && OptimizeCalls) || 1182 (IsJumpTable && OptimizeJumpTables))) { 1183 uint64_t NumCalls = 0; 1184 for (const Callsite &BInfo : getCallTargets(BB, Inst)) 1185 NumCalls += BInfo.Branches; 1186 IndirectCalls.push_back( 1187 std::make_tuple(NumCalls, &Inst, &Function)); 1188 TotalIndirectCalls += NumCalls; 1189 } 1190 } 1191 } 1192 } 1193 1194 // Sort callsites by execution count. 1195 llvm::sort(reverse(IndirectCalls)); 1196 1197 // Find callsites that contribute to the top "opts::ICPTopCallsites"% 1198 // number of calls. 1199 const float TopPerc = opts::ICPTopCallsites / 100.0f; 1200 int64_t MaxCalls = TotalIndirectCalls * TopPerc; 1201 uint64_t LastFreq = std::numeric_limits<uint64_t>::max(); 1202 size_t Num = 0; 1203 for (const IndirectCallsite &IC : IndirectCalls) { 1204 const uint64_t CurFreq = std::get<0>(IC); 1205 // Once we decide to stop, include at least all branches that share the 1206 // same frequency of the last one to avoid non-deterministic behavior 1207 // (e.g. 
turning on/off ICP depending on the order of functions) 1208 if (MaxCalls <= 0 && CurFreq != LastFreq) 1209 break; 1210 MaxCalls -= CurFreq; 1211 LastFreq = CurFreq; 1212 BC.MIB->addAnnotation(*std::get<1>(IC), "DoICP", true); 1213 Functions.insert(std::get<2>(IC)); 1214 ++Num; 1215 } 1216 outs() << "BOLT-INFO: ICP Total indirect calls = " << TotalIndirectCalls 1217 << ", " << Num << " callsites cover " << opts::ICPTopCallsites 1218 << "% of all indirect calls\n"; 1219 } 1220 1221 for (BinaryFunction *FuncPtr : Functions) { 1222 BinaryFunction &Function = *FuncPtr; 1223 1224 if (!Function.isSimple() || Function.isIgnored() || !Function.hasProfile()) 1225 continue; 1226 1227 const bool HasLayout = !Function.layout_empty(); 1228 1229 // Total number of indirect calls issued from the current Function. 1230 // (a fraction of TotalIndirectCalls) 1231 uint64_t FuncTotalIndirectCalls = 0; 1232 uint64_t FuncTotalIndirectJmps = 0; 1233 1234 std::vector<BinaryBasicBlock *> BBs; 1235 for (BinaryBasicBlock &BB : Function) { 1236 // Skip indirect calls in cold blocks. 
1237 if (!HasLayout || !Function.isSplit() || !BB.isCold()) 1238 BBs.push_back(&BB); 1239 } 1240 if (BBs.empty()) 1241 continue; 1242 1243 DataflowInfoManager Info(Function, RA.get(), nullptr); 1244 while (!BBs.empty()) { 1245 BinaryBasicBlock *BB = BBs.back(); 1246 BBs.pop_back(); 1247 1248 for (unsigned Idx = 0; Idx < BB->size(); ++Idx) { 1249 MCInst &Inst = BB->getInstructionAtIndex(Idx); 1250 const ptrdiff_t InstIdx = &Inst - &(*BB->begin()); 1251 const bool IsTailCall = BC.MIB->isTailCall(Inst); 1252 const bool HasIndirectCallProfile = 1253 BC.MIB->hasAnnotation(Inst, "CallProfile"); 1254 const bool IsJumpTable = Function.getJumpTable(Inst); 1255 1256 if (BC.MIB->isCall(Inst)) 1257 TotalCalls += BB->getKnownExecutionCount(); 1258 1259 if (IsJumpTable && !OptimizeJumpTables) 1260 continue; 1261 1262 if (!IsJumpTable && (!HasIndirectCallProfile || !OptimizeCalls)) 1263 continue; 1264 1265 // Ignore direct calls. 1266 if (BC.MIB->isCall(Inst) && BC.MIB->getTargetSymbol(Inst, 0)) 1267 continue; 1268 1269 assert((BC.MIB->isCall(Inst) || BC.MIB->isIndirectBranch(Inst)) && 1270 "expected a call or an indirect jump instruction"); 1271 1272 if (IsJumpTable) 1273 ++TotalJumpTableCallsites; 1274 else 1275 ++TotalIndirectCallsites; 1276 1277 std::vector<Callsite> Targets = getCallTargets(*BB, Inst); 1278 1279 // Compute the total number of calls from this particular callsite. 1280 uint64_t NumCalls = 0; 1281 for (const Callsite &BInfo : Targets) 1282 NumCalls += BInfo.Branches; 1283 if (!IsJumpTable) 1284 FuncTotalIndirectCalls += NumCalls; 1285 else 1286 FuncTotalIndirectJmps += NumCalls; 1287 1288 // If FLAGS regs is alive after this jmp site, do not try 1289 // promoting because we will clobber FLAGS. 
1290 if (IsJumpTable) { 1291 ErrorOr<const BitVector &> State = 1292 Info.getLivenessAnalysis().getStateBefore(Inst); 1293 if (!State || (State && (*State)[BC.MIB->getFlagsReg()])) { 1294 if (opts::Verbosity >= 1) 1295 outs() << "BOLT-INFO: ICP failed in " << Function << " @ " 1296 << InstIdx << " in " << BB->getName() 1297 << ", calls = " << NumCalls 1298 << (State ? ", cannot clobber flags reg.\n" 1299 : ", no liveness data available.\n"); 1300 continue; 1301 } 1302 } 1303 1304 // Should this callsite be optimized? Return the number of targets 1305 // to use when promoting this call. A value of zero means to skip 1306 // this callsite. 1307 size_t N = canPromoteCallsite(*BB, Inst, Targets, NumCalls); 1308 1309 // If it is a jump table and it failed to meet our initial threshold, 1310 // proceed to findCallTargetSymbols -- it may reevaluate N if 1311 // memory profile is present 1312 if (!N && !IsJumpTable) 1313 continue; 1314 1315 if (opts::Verbosity >= 1) 1316 printCallsiteInfo(*BB, Inst, Targets, N, NumCalls); 1317 1318 // Find MCSymbols or absolute addresses for each call target. 1319 MCInst *TargetFetchInst = nullptr; 1320 const SymTargetsType SymTargets = 1321 findCallTargetSymbols(Targets, N, *BB, Inst, TargetFetchInst); 1322 1323 // findCallTargetSymbols may have changed N if mem profile is available 1324 // for jump tables 1325 if (!N) 1326 continue; 1327 1328 LLVM_DEBUG(printDecision(dbgs(), Targets, N)); 1329 1330 // If we can't resolve any of the target symbols, punt on this callsite. 1331 // TODO: can this ever happen? 
1332 if (SymTargets.size() < N) { 1333 const size_t LastTarget = SymTargets.size(); 1334 if (opts::Verbosity >= 1) 1335 outs() << "BOLT-INFO: ICP failed in " << Function << " @ " 1336 << InstIdx << " in " << BB->getName() 1337 << ", calls = " << NumCalls 1338 << ", ICP failed to find target symbol for " 1339 << Targets[LastTarget].To.Sym->getName() << "\n"; 1340 continue; 1341 } 1342 1343 MethodInfoType MethodInfo; 1344 1345 if (!IsJumpTable) { 1346 MethodInfo = maybeGetVtableSyms(*BB, Inst, SymTargets); 1347 TotalMethodLoadsEliminated += MethodInfo.first.empty() ? 0 : 1; 1348 LLVM_DEBUG(dbgs() 1349 << "BOLT-INFO: ICP " 1350 << (!MethodInfo.first.empty() ? "found" : "did not find") 1351 << " vtables for all methods.\n"); 1352 } else if (TargetFetchInst) { 1353 ++TotalIndexBasedJumps; 1354 MethodInfo.second.push_back(TargetFetchInst); 1355 } 1356 1357 // Generate new promoted call code for this callsite. 1358 MCPlusBuilder::BlocksVectorTy ICPcode = 1359 (IsJumpTable && !opts::ICPJumpTablesByTarget) 1360 ? 
BC.MIB->jumpTablePromotion(Inst, SymTargets, 1361 MethodInfo.second, BC.Ctx.get()) 1362 : BC.MIB->indirectCallPromotion( 1363 Inst, SymTargets, MethodInfo.first, MethodInfo.second, 1364 opts::ICPOldCodeSequence, BC.Ctx.get()); 1365 1366 if (ICPcode.empty()) { 1367 if (opts::Verbosity >= 1) 1368 outs() << "BOLT-INFO: ICP failed in " << Function << " @ " 1369 << InstIdx << " in " << BB->getName() 1370 << ", calls = " << NumCalls 1371 << ", unable to generate promoted call code.\n"; 1372 continue; 1373 } 1374 1375 LLVM_DEBUG({ 1376 uint64_t Offset = Targets[0].From.Addr; 1377 dbgs() << "BOLT-INFO: ICP indirect call code:\n"; 1378 for (const auto &entry : ICPcode) { 1379 const MCSymbol *const &Sym = entry.first; 1380 const InstructionListType &Insts = entry.second; 1381 if (Sym) 1382 dbgs() << Sym->getName() << ":\n"; 1383 Offset = BC.printInstructions(dbgs(), Insts.begin(), Insts.end(), 1384 Offset); 1385 } 1386 dbgs() << "---------------------------------------------------\n"; 1387 }); 1388 1389 // Rewrite the CFG with the newly generated ICP code. 1390 std::vector<std::unique_ptr<BinaryBasicBlock>> NewBBs = 1391 rewriteCall(*BB, Inst, std::move(ICPcode), MethodInfo.second); 1392 1393 // Fix the CFG after inserting the new basic blocks. 1394 BinaryBasicBlock *MergeBlock = 1395 fixCFG(*BB, IsTailCall, IsJumpTable, std::move(NewBBs), Targets); 1396 1397 // Since the tail of the original block was split off and it may contain 1398 // additional indirect calls, we must add the merge block to the set of 1399 // blocks to process. 
1400 if (MergeBlock) 1401 BBs.push_back(MergeBlock); 1402 1403 if (opts::Verbosity >= 1) 1404 outs() << "BOLT-INFO: ICP succeeded in " << Function << " @ " 1405 << InstIdx << " in " << BB->getName() 1406 << " -> calls = " << NumCalls << "\n"; 1407 1408 if (IsJumpTable) 1409 ++TotalOptimizedJumpTableCallsites; 1410 else 1411 ++TotalOptimizedIndirectCallsites; 1412 1413 Modified.insert(&Function); 1414 } 1415 } 1416 TotalIndirectCalls += FuncTotalIndirectCalls; 1417 TotalIndirectJmps += FuncTotalIndirectJmps; 1418 } 1419 1420 outs() << "BOLT-INFO: ICP total indirect callsites with profile = " 1421 << TotalIndirectCallsites << "\n" 1422 << "BOLT-INFO: ICP total jump table callsites = " 1423 << TotalJumpTableCallsites << "\n" 1424 << "BOLT-INFO: ICP total number of calls = " << TotalCalls << "\n" 1425 << "BOLT-INFO: ICP percentage of calls that are indirect = " 1426 << format("%.1f", (100.0 * TotalIndirectCalls) / TotalCalls) << "%\n" 1427 << "BOLT-INFO: ICP percentage of indirect calls that can be " 1428 "optimized = " 1429 << format("%.1f", (100.0 * TotalNumFrequentCalls) / 1430 std::max<size_t>(TotalIndirectCalls, 1)) 1431 << "%\n" 1432 << "BOLT-INFO: ICP percentage of indirect callsites that are " 1433 "optimized = " 1434 << format("%.1f", (100.0 * TotalOptimizedIndirectCallsites) / 1435 std::max<uint64_t>(TotalIndirectCallsites, 1)) 1436 << "%\n" 1437 << "BOLT-INFO: ICP number of method load elimination candidates = " 1438 << TotalMethodLoadEliminationCandidates << "\n" 1439 << "BOLT-INFO: ICP percentage of method calls candidates that have " 1440 "loads eliminated = " 1441 << format("%.1f", (100.0 * TotalMethodLoadsEliminated) / 1442 std::max<uint64_t>( 1443 TotalMethodLoadEliminationCandidates, 1)) 1444 << "%\n" 1445 << "BOLT-INFO: ICP percentage of indirect branches that are " 1446 "optimized = " 1447 << format("%.1f", (100.0 * TotalNumFrequentJmps) / 1448 std::max<uint64_t>(TotalIndirectJmps, 1)) 1449 << "%\n" 1450 << "BOLT-INFO: ICP percentage of jump table 
callsites that are " 1451 << "optimized = " 1452 << format("%.1f", (100.0 * TotalOptimizedJumpTableCallsites) / 1453 std::max<uint64_t>(TotalJumpTableCallsites, 1)) 1454 << "%\n" 1455 << "BOLT-INFO: ICP number of jump table callsites that can use hot " 1456 << "indices = " << TotalIndexBasedCandidates << "\n" 1457 << "BOLT-INFO: ICP percentage of jump table callsites that use hot " 1458 "indices = " 1459 << format("%.1f", (100.0 * TotalIndexBasedJumps) / 1460 std::max<uint64_t>(TotalIndexBasedCandidates, 1)) 1461 << "%\n"; 1462 1463 (void)verifyProfile; 1464 #ifndef NDEBUG 1465 verifyProfile(BFs); 1466 #endif 1467 } 1468 1469 } // namespace bolt 1470 } // namespace llvm 1471