//===- bolt/Passes/IndirectCallPromotion.cpp ------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // This file implements the IndirectCallPromotion class. // //===----------------------------------------------------------------------===// #include "bolt/Passes/IndirectCallPromotion.h" #include "bolt/Core/BinaryFunctionCallGraph.h" #include "bolt/Passes/DataflowInfoManager.h" #include "bolt/Passes/Inliner.h" #include "llvm/ADT/STLExtras.h" #include "llvm/Support/CommandLine.h" #include #define DEBUG_TYPE "ICP" #define DEBUG_VERBOSE(Level, X) \ if (opts::Verbosity >= (Level)) { \ X; \ } using namespace llvm; using namespace bolt; namespace opts { extern cl::OptionCategory BoltOptCategory; extern cl::opt ICP; extern cl::opt Verbosity; extern cl::opt ExecutionCountThreshold; static cl::opt ICPJTRemainingPercentThreshold( "icp-jt-remaining-percent-threshold", cl::desc("The percentage threshold against remaining unpromoted indirect " "call count for the promotion for jump tables"), cl::init(30), cl::ZeroOrMore, cl::Hidden, cl::cat(BoltOptCategory)); static cl::opt ICPJTTotalPercentThreshold( "icp-jt-total-percent-threshold", cl::desc( "The percentage threshold against total count for the promotion for " "jump tables"), cl::init(5), cl::Hidden, cl::cat(BoltOptCategory)); static cl::opt ICPCallsRemainingPercentThreshold( "icp-calls-remaining-percent-threshold", cl::desc("The percentage threshold against remaining unpromoted indirect " "call count for the promotion for calls"), cl::init(50), cl::Hidden, cl::cat(BoltOptCategory)); static cl::opt ICPCallsTotalPercentThreshold( "icp-calls-total-percent-threshold", cl::desc( "The percentage threshold against total count for the promotion for " "calls"), 
cl::init(30), cl::Hidden, cl::cat(BoltOptCategory)); static cl::opt ICPMispredictThreshold( "indirect-call-promotion-mispredict-threshold", cl::desc("misprediction threshold for skipping ICP on an " "indirect call"), cl::init(0), cl::cat(BoltOptCategory)); static cl::alias ICPMispredictThresholdAlias( "icp-mp-threshold", cl::desc("alias for --indirect-call-promotion-mispredict-threshold"), cl::aliasopt(ICPMispredictThreshold)); static cl::opt ICPUseMispredicts( "indirect-call-promotion-use-mispredicts", cl::desc("use misprediction frequency for determining whether or not ICP " "should be applied at a callsite. The " "-indirect-call-promotion-mispredict-threshold value will be used " "by this heuristic"), cl::cat(BoltOptCategory)); static cl::alias ICPUseMispredictsAlias( "icp-use-mp", cl::desc("alias for --indirect-call-promotion-use-mispredicts"), cl::aliasopt(ICPUseMispredicts)); static cl::opt ICPTopN("indirect-call-promotion-topn", cl::desc("limit number of targets to consider when doing indirect " "call promotion. 0 = no limit"), cl::init(3), cl::cat(BoltOptCategory)); static cl::alias ICPTopNAlias("icp-topn", cl::desc("alias for --indirect-call-promotion-topn"), cl::aliasopt(ICPTopN)); static cl::opt ICPCallsTopN( "indirect-call-promotion-calls-topn", cl::desc("limit number of targets to consider when doing indirect " "call promotion on calls. 0 = no limit"), cl::init(0), cl::cat(BoltOptCategory)); static cl::alias ICPCallsTopNAlias( "icp-calls-topn", cl::desc("alias for --indirect-call-promotion-calls-topn"), cl::aliasopt(ICPCallsTopN)); static cl::opt ICPJumpTablesTopN( "indirect-call-promotion-jump-tables-topn", cl::desc("limit number of targets to consider when doing indirect " "call promotion on jump tables. 
0 = no limit"), cl::init(0), cl::cat(BoltOptCategory)); static cl::alias ICPJumpTablesTopNAlias( "icp-jt-topn", cl::desc("alias for --indirect-call-promotion-jump-tables-topn"), cl::aliasopt(ICPJumpTablesTopN)); static cl::opt EliminateLoads( "icp-eliminate-loads", cl::desc("enable load elimination using memory profiling data when " "performing ICP"), cl::init(true), cl::cat(BoltOptCategory)); static cl::opt ICPTopCallsites( "icp-top-callsites", cl::desc("optimize hottest calls until at least this percentage of all " "indirect calls frequency is covered. 0 = all callsites"), cl::init(99), cl::Hidden, cl::cat(BoltOptCategory)); static cl::list ICPFuncsList("icp-funcs", cl::CommaSeparated, cl::desc("list of functions to enable ICP for"), cl::value_desc("func1,func2,func3,..."), cl::Hidden, cl::cat(BoltOptCategory)); static cl::opt ICPOldCodeSequence("icp-old-code-sequence", cl::desc("use old code sequence for promoted calls"), cl::Hidden, cl::cat(BoltOptCategory)); static cl::opt ICPJumpTablesByTarget( "icp-jump-tables-targets", cl::desc( "for jump tables, optimize indirect jmp targets instead of indices"), cl::Hidden, cl::cat(BoltOptCategory)); static cl::alias ICPJumpTablesByTargetAlias("icp-jt-targets", cl::desc("alias for --icp-jump-tables-targets"), cl::aliasopt(ICPJumpTablesByTarget)); static cl::opt ICPPeelForInline( "icp-inline", cl::desc("only promote call targets eligible for inlining"), cl::Hidden, cl::cat(BoltOptCategory)); } // namespace opts #ifndef NDEBUG static bool verifyProfile(std::map &BFs) { bool IsValid = true; for (auto &BFI : BFs) { BinaryFunction &BF = BFI.second; if (!BF.isSimple()) continue; for (const BinaryBasicBlock &BB : BF) { auto BI = BB.branch_info_begin(); for (BinaryBasicBlock *SuccBB : BB.successors()) { if (BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE && BI->Count > 0) { if (BB.getKnownExecutionCount() == 0 || SuccBB->getKnownExecutionCount() == 0) { BF.getBinaryContext().errs() << "BOLT-WARNING: profile verification failed 
after ICP for " "function " << BF << '\n'; IsValid = false; } } ++BI; } } } return IsValid; } #endif namespace llvm { namespace bolt { IndirectCallPromotion::Callsite::Callsite(BinaryFunction &BF, const IndirectCallProfile &ICP) : From(BF.getSymbol()), To(ICP.Offset), Mispreds(ICP.Mispreds), Branches(ICP.Count) { if (ICP.Symbol) { To.Sym = ICP.Symbol; To.Addr = 0; } } void IndirectCallPromotion::printDecision( llvm::raw_ostream &OS, std::vector &Targets, unsigned N) const { uint64_t TotalCount = 0; uint64_t TotalMispreds = 0; for (const Callsite &S : Targets) { TotalCount += S.Branches; TotalMispreds += S.Mispreds; } if (!TotalCount) TotalCount = 1; if (!TotalMispreds) TotalMispreds = 1; OS << "BOLT-INFO: ICP decision for call site with " << Targets.size() << " targets, Count = " << TotalCount << ", Mispreds = " << TotalMispreds << "\n"; size_t I = 0; for (const Callsite &S : Targets) { OS << "Count = " << S.Branches << ", " << format("%.1f", (100.0 * S.Branches) / TotalCount) << ", " << "Mispreds = " << S.Mispreds << ", " << format("%.1f", (100.0 * S.Mispreds) / TotalMispreds); if (I < N) OS << " * to be optimized *"; if (!S.JTIndices.empty()) { OS << " Indices:"; for (const uint64_t Idx : S.JTIndices) OS << " " << Idx; } OS << "\n"; I += S.JTIndices.empty() ? 1 : S.JTIndices.size(); } } // Get list of targets for a given call sorted by most frequently // called first. 
std::vector IndirectCallPromotion::getCallTargets(BinaryBasicBlock &BB, const MCInst &Inst) const { BinaryFunction &BF = *BB.getFunction(); const BinaryContext &BC = BF.getBinaryContext(); std::vector Targets; if (const JumpTable *JT = BF.getJumpTable(Inst)) { // Don't support PIC jump tables for now if (!opts::ICPJumpTablesByTarget && JT->Type == JumpTable::JTT_PIC) return Targets; const Location From(BF.getSymbol()); const std::pair Range = JT->getEntriesForAddress(BC.MIB->getJumpTable(Inst)); assert(JT->Counts.empty() || JT->Counts.size() >= Range.second); JumpTable::JumpInfo DefaultJI; const JumpTable::JumpInfo *JI = JT->Counts.empty() ? &DefaultJI : &JT->Counts[Range.first]; const size_t JIAdj = JT->Counts.empty() ? 0 : 1; assert(JT->Type == JumpTable::JTT_PIC || JT->EntrySize == BC.AsmInfo->getCodePointerSize()); for (size_t I = Range.first; I < Range.second; ++I, JI += JIAdj) { MCSymbol *Entry = JT->Entries[I]; const BinaryBasicBlock *ToBB = BF.getBasicBlockForLabel(Entry); assert(ToBB || Entry == BF.getFunctionEndLabel() || Entry == BF.getFunctionEndLabel(FragmentNum::cold())); if (Entry == BF.getFunctionEndLabel() || Entry == BF.getFunctionEndLabel(FragmentNum::cold())) continue; const Location To(Entry); const BinaryBasicBlock::BinaryBranchInfo &BI = BB.getBranchInfo(*ToBB); Targets.emplace_back(From, To, BI.MispredictedCount, BI.Count, I - Range.first); } // Sort by symbol then addr. llvm::sort(Targets, [](const Callsite &A, const Callsite &B) { if (A.To.Sym && B.To.Sym) return A.To.Sym < B.To.Sym; else if (A.To.Sym && !B.To.Sym) return true; else if (!A.To.Sym && B.To.Sym) return false; else return A.To.Addr < B.To.Addr; }); // Targets may contain multiple entries to the same target, but using // different indices. Their profile will report the same number of branches // for different indices if the target is the same. That's because we don't // profile the index value, but only the target via LBR. 
auto First = Targets.begin(); auto Last = Targets.end(); auto Result = First; while (++First != Last) { Callsite &A = *Result; const Callsite &B = *First; if (A.To.Sym && B.To.Sym && A.To.Sym == B.To.Sym) A.JTIndices.insert(A.JTIndices.end(), B.JTIndices.begin(), B.JTIndices.end()); else *(++Result) = *First; } ++Result; LLVM_DEBUG(if (Targets.end() - Result > 0) { dbgs() << "BOLT-INFO: ICP: " << (Targets.end() - Result) << " duplicate targets removed\n"; }); Targets.erase(Result, Targets.end()); } else { // Don't try to optimize PC relative indirect calls. if (Inst.getOperand(0).isReg() && Inst.getOperand(0).getReg() == BC.MRI->getProgramCounter()) return Targets; const auto ICSP = BC.MIB->tryGetAnnotationAs( Inst, "CallProfile"); if (ICSP) { for (const IndirectCallProfile &CSP : ICSP.get()) { Callsite Site(BF, CSP); if (Site.isValid()) Targets.emplace_back(std::move(Site)); } } } // Sort by target count, number of indices in case of jump table, and // mispredicts. We prioritize targets with high count, small number of indices // and high mispredicts. Break ties by selecting targets with lower addresses. 
llvm::stable_sort(Targets, [](const Callsite &A, const Callsite &B) { if (A.Branches != B.Branches) return A.Branches > B.Branches; if (A.JTIndices.size() != B.JTIndices.size()) return A.JTIndices.size() < B.JTIndices.size(); if (A.Mispreds != B.Mispreds) return A.Mispreds > B.Mispreds; return A.To.Addr < B.To.Addr; }); // Remove non-symbol targets llvm::erase_if(Targets, [](const Callsite &CS) { return !CS.To.Sym; }); LLVM_DEBUG(if (BF.getJumpTable(Inst)) { uint64_t TotalCount = 0; uint64_t TotalMispreds = 0; for (const Callsite &S : Targets) { TotalCount += S.Branches; TotalMispreds += S.Mispreds; } if (!TotalCount) TotalCount = 1; if (!TotalMispreds) TotalMispreds = 1; dbgs() << "BOLT-INFO: ICP: jump table size = " << Targets.size() << ", Count = " << TotalCount << ", Mispreds = " << TotalMispreds << "\n"; size_t I = 0; for (const Callsite &S : Targets) { dbgs() << "Count[" << I << "] = " << S.Branches << ", " << format("%.1f", (100.0 * S.Branches) / TotalCount) << ", " << "Mispreds[" << I << "] = " << S.Mispreds << ", " << format("%.1f", (100.0 * S.Mispreds) / TotalMispreds) << "\n"; ++I; } }); return Targets; } IndirectCallPromotion::JumpTableInfoType IndirectCallPromotion::maybeGetHotJumpTableTargets(BinaryBasicBlock &BB, MCInst &CallInst, MCInst *&TargetFetchInst, const JumpTable *JT) const { assert(JT && "Can't get jump table addrs for non-jump tables."); BinaryFunction &Function = *BB.getFunction(); BinaryContext &BC = Function.getBinaryContext(); if (!Function.hasMemoryProfile() || !opts::EliminateLoads) return JumpTableInfoType(); JumpTableInfoType HotTargets; MCInst *MemLocInstr; MCInst *PCRelBaseOut; MCInst *FixedEntryLoadInstr; unsigned BaseReg, IndexReg; int64_t DispValue; const MCExpr *DispExpr; MutableArrayRef Insts(&BB.front(), &CallInst); const IndirectBranchType Type = BC.MIB->analyzeIndirectBranch( CallInst, Insts.begin(), Insts.end(), BC.AsmInfo->getCodePointerSize(), MemLocInstr, BaseReg, IndexReg, DispValue, DispExpr, PCRelBaseOut, 
FixedEntryLoadInstr); assert(MemLocInstr && "There should always be a load for jump tables"); if (!MemLocInstr) return JumpTableInfoType(); LLVM_DEBUG({ dbgs() << "BOLT-INFO: ICP attempting to find memory profiling data for " << "jump table in " << Function << " at @ " << (&CallInst - &BB.front()) << "\n" << "BOLT-INFO: ICP target fetch instructions:\n"; BC.printInstruction(dbgs(), *MemLocInstr, 0, &Function); if (MemLocInstr != &CallInst) BC.printInstruction(dbgs(), CallInst, 0, &Function); }); DEBUG_VERBOSE(1, { dbgs() << "Jmp info: Type = " << (unsigned)Type << ", " << "BaseReg = " << BC.MRI->getName(BaseReg) << ", " << "IndexReg = " << BC.MRI->getName(IndexReg) << ", " << "DispValue = " << Twine::utohexstr(DispValue) << ", " << "DispExpr = " << DispExpr << ", " << "MemLocInstr = "; BC.printInstruction(dbgs(), *MemLocInstr, 0, &Function); dbgs() << "\n"; }); ++TotalIndexBasedCandidates; auto ErrorOrMemAccessProfile = BC.MIB->tryGetAnnotationAs(*MemLocInstr, "MemoryAccessProfile"); if (!ErrorOrMemAccessProfile) { DEBUG_VERBOSE(1, dbgs() << "BOLT-INFO: ICP no memory profiling data found\n"); return JumpTableInfoType(); } MemoryAccessProfile &MemAccessProfile = ErrorOrMemAccessProfile.get(); uint64_t ArrayStart; if (DispExpr) { ErrorOr DispValueOrError = BC.getSymbolValue(*BC.MIB->getTargetSymbol(DispExpr)); assert(DispValueOrError && "global symbol needs a value"); ArrayStart = *DispValueOrError; } else { ArrayStart = static_cast(DispValue); } if (BaseReg == BC.MRI->getProgramCounter()) ArrayStart += Function.getAddress() + MemAccessProfile.NextInstrOffset; // This is a map of [symbol] -> [count, index] and is used to combine indices // into the jump table since there may be multiple addresses that all have the // same entry. std::map> HotTargetMap; const std::pair Range = JT->getEntriesForAddress(ArrayStart); for (const AddressAccess &AccessInfo : MemAccessProfile.AddressAccessInfo) { size_t Index; // Mem data occasionally includes nullprs, ignore them. 
if (!AccessInfo.MemoryObject && !AccessInfo.Offset) continue; if (AccessInfo.Offset % JT->EntrySize != 0) // ignore bogus data return JumpTableInfoType(); if (AccessInfo.MemoryObject) { // Deal with bad/stale data if (!AccessInfo.MemoryObject->getName().starts_with( "JUMP_TABLE/" + Function.getOneName().str())) return JumpTableInfoType(); Index = (AccessInfo.Offset - (ArrayStart - JT->getAddress())) / JT->EntrySize; } else { Index = (AccessInfo.Offset - ArrayStart) / JT->EntrySize; } // If Index is out of range it probably means the memory profiling data is // wrong for this instruction, bail out. if (Index >= Range.second) { LLVM_DEBUG(dbgs() << "BOLT-INFO: Index out of range of " << Range.first << ", " << Range.second << "\n"); return JumpTableInfoType(); } // Make sure the hot index points at a legal label corresponding to a BB, // e.g. not the end of function (unreachable) label. if (!Function.getBasicBlockForLabel(JT->Entries[Index + Range.first])) { LLVM_DEBUG({ dbgs() << "BOLT-INFO: hot index " << Index << " pointing at bogus " << "label " << JT->Entries[Index + Range.first]->getName() << " in jump table:\n"; JT->print(dbgs()); dbgs() << "HotTargetMap:\n"; for (std::pair> &HT : HotTargetMap) dbgs() << "BOLT-INFO: " << HT.first->getName() << " = (count=" << HT.second.first << ", index=" << HT.second.second << ")\n"; }); return JumpTableInfoType(); } std::pair &HotTarget = HotTargetMap[JT->Entries[Index + Range.first]]; HotTarget.first += AccessInfo.Count; HotTarget.second = Index; } llvm::copy(llvm::make_second_range(HotTargetMap), std::back_inserter(HotTargets)); // Sort with highest counts first. 
llvm::sort(reverse(HotTargets)); LLVM_DEBUG({ dbgs() << "BOLT-INFO: ICP jump table hot targets:\n"; for (const std::pair &Target : HotTargets) dbgs() << "BOLT-INFO: Idx = " << Target.second << ", " << "Count = " << Target.first << "\n"; }); BC.MIB->getOrCreateAnnotationAs(CallInst, "JTIndexReg") = IndexReg; TargetFetchInst = MemLocInstr; return HotTargets; } IndirectCallPromotion::SymTargetsType IndirectCallPromotion::findCallTargetSymbols(std::vector &Targets, size_t &N, BinaryBasicBlock &BB, MCInst &CallInst, MCInst *&TargetFetchInst) const { const BinaryContext &BC = BB.getFunction()->getBinaryContext(); const JumpTable *JT = BB.getFunction()->getJumpTable(CallInst); SymTargetsType SymTargets; if (!JT) { for (size_t I = 0; I < N; ++I) { assert(Targets[I].To.Sym && "All ICP targets must be to known symbols"); assert(Targets[I].JTIndices.empty() && "Can't have jump table indices for non-jump tables"); SymTargets.emplace_back(Targets[I].To.Sym, 0); } return SymTargets; } // Use memory profile to select hot targets. JumpTableInfoType HotTargets = maybeGetHotJumpTableTargets(BB, CallInst, TargetFetchInst, JT); auto findTargetsIndex = [&](uint64_t JTIndex) { for (size_t I = 0; I < Targets.size(); ++I) if (llvm::is_contained(Targets[I].JTIndices, JTIndex)) return I; LLVM_DEBUG(dbgs() << "BOLT-ERROR: Unable to find target index for hot jump " << " table entry in " << *BB.getFunction() << "\n"); llvm_unreachable("Hot indices must be referred to by at least one " "callsite"); }; if (!HotTargets.empty()) { if (opts::Verbosity >= 1) for (size_t I = 0; I < HotTargets.size(); ++I) BC.outs() << "BOLT-INFO: HotTarget[" << I << "] = (" << HotTargets[I].first << ", " << HotTargets[I].second << ")\n"; // Recompute hottest targets, now discriminating which index is hot // NOTE: This is a tradeoff. On one hand, we get index information. On the // other hand, info coming from the memory profile is much less accurate // than LBRs. 
So we may actually end up working with more coarse // profile granularity in exchange for information about indices. std::vector NewTargets; std::map IndicesPerTarget; uint64_t TotalMemAccesses = 0; for (size_t I = 0; I < HotTargets.size(); ++I) { const uint64_t TargetIndex = findTargetsIndex(HotTargets[I].second); ++IndicesPerTarget[Targets[TargetIndex].To.Sym]; TotalMemAccesses += HotTargets[I].first; } uint64_t RemainingMemAccesses = TotalMemAccesses; const size_t TopN = opts::ICPJumpTablesTopN ? opts::ICPJumpTablesTopN : opts::ICPTopN; size_t I = 0; for (; I < HotTargets.size(); ++I) { const uint64_t MemAccesses = HotTargets[I].first; if (100 * MemAccesses < TotalMemAccesses * opts::ICPJTTotalPercentThreshold) break; if (100 * MemAccesses < RemainingMemAccesses * opts::ICPJTRemainingPercentThreshold) break; if (TopN && I >= TopN) break; RemainingMemAccesses -= MemAccesses; const uint64_t JTIndex = HotTargets[I].second; Callsite &Target = Targets[findTargetsIndex(JTIndex)]; NewTargets.push_back(Target); std::vector({JTIndex}).swap(NewTargets.back().JTIndices); llvm::erase(Target.JTIndices, JTIndex); // Keep fixCFG counts sane if more indices use this same target later assert(IndicesPerTarget[Target.To.Sym] > 0 && "wrong map"); NewTargets.back().Branches = Target.Branches / IndicesPerTarget[Target.To.Sym]; NewTargets.back().Mispreds = Target.Mispreds / IndicesPerTarget[Target.To.Sym]; assert(Target.Branches >= NewTargets.back().Branches); assert(Target.Mispreds >= NewTargets.back().Mispreds); Target.Branches -= NewTargets.back().Branches; Target.Mispreds -= NewTargets.back().Mispreds; } llvm::copy(Targets, std::back_inserter(NewTargets)); std::swap(NewTargets, Targets); N = I; if (N == 0 && opts::Verbosity >= 1) { BC.outs() << "BOLT-INFO: ICP failed in " << *BB.getFunction() << " in " << BB.getName() << ": failed to meet thresholds after memory " << "profile data was loaded.\n"; return SymTargets; } } for (size_t I = 0, TgtIdx = 0; I < N; ++TgtIdx) { Callsite 
&Target = Targets[TgtIdx]; assert(Target.To.Sym && "All ICP targets must be to known symbols"); assert(!Target.JTIndices.empty() && "Jump tables must have indices"); for (uint64_t Idx : Target.JTIndices) { SymTargets.emplace_back(Target.To.Sym, Idx); ++I; } } return SymTargets; } IndirectCallPromotion::MethodInfoType IndirectCallPromotion::maybeGetVtableSyms( BinaryBasicBlock &BB, MCInst &Inst, const SymTargetsType &SymTargets) const { BinaryFunction &Function = *BB.getFunction(); BinaryContext &BC = Function.getBinaryContext(); std::vector> VtableSyms; std::vector MethodFetchInsns; unsigned VtableReg, MethodReg; uint64_t MethodOffset; assert(!Function.getJumpTable(Inst) && "Can't get vtable addrs for jump tables."); if (!Function.hasMemoryProfile() || !opts::EliminateLoads) return MethodInfoType(); MutableArrayRef Insts(&BB.front(), &Inst + 1); if (!BC.MIB->analyzeVirtualMethodCall(Insts.begin(), Insts.end(), MethodFetchInsns, VtableReg, MethodReg, MethodOffset)) { DEBUG_VERBOSE( 1, dbgs() << "BOLT-INFO: ICP unable to analyze method call in " << Function << " at @ " << (&Inst - &BB.front()) << "\n"); return MethodInfoType(); } ++TotalMethodLoadEliminationCandidates; DEBUG_VERBOSE(1, { dbgs() << "BOLT-INFO: ICP found virtual method call in " << Function << " at @ " << (&Inst - &BB.front()) << "\n"; dbgs() << "BOLT-INFO: ICP method fetch instructions:\n"; for (MCInst *Inst : MethodFetchInsns) BC.printInstruction(dbgs(), *Inst, 0, &Function); if (MethodFetchInsns.back() != &Inst) BC.printInstruction(dbgs(), Inst, 0, &Function); }); // Try to get value profiling data for the method load instruction. auto ErrorOrMemAccessProfile = BC.MIB->tryGetAnnotationAs(*MethodFetchInsns.back(), "MemoryAccessProfile"); if (!ErrorOrMemAccessProfile) { DEBUG_VERBOSE(1, dbgs() << "BOLT-INFO: ICP no memory profiling data found\n"); return MethodInfoType(); } MemoryAccessProfile &MemAccessProfile = ErrorOrMemAccessProfile.get(); // Find the vtable that each method belongs to. 
std::map MethodToVtable; for (const AddressAccess &AccessInfo : MemAccessProfile.AddressAccessInfo) { uint64_t Address = AccessInfo.Offset; if (AccessInfo.MemoryObject) Address += AccessInfo.MemoryObject->getAddress(); // Ignore bogus data. if (!Address) continue; const uint64_t VtableBase = Address - MethodOffset; DEBUG_VERBOSE(1, dbgs() << "BOLT-INFO: ICP vtable = " << Twine::utohexstr(VtableBase) << "+" << MethodOffset << "/" << AccessInfo.Count << "\n"); if (ErrorOr MethodAddr = BC.getPointerAtAddress(Address)) { BinaryData *MethodBD = BC.getBinaryDataAtAddress(MethodAddr.get()); if (!MethodBD) // skip unknown methods continue; MCSymbol *MethodSym = MethodBD->getSymbol(); MethodToVtable[MethodSym] = VtableBase; DEBUG_VERBOSE(1, { const BinaryFunction *Method = BC.getFunctionForSymbol(MethodSym); dbgs() << "BOLT-INFO: ICP found method = " << Twine::utohexstr(MethodAddr.get()) << "/" << (Method ? Method->getPrintName() : "") << "\n"; }); } } // Find the vtable for each target symbol. for (size_t I = 0; I < SymTargets.size(); ++I) { auto Itr = MethodToVtable.find(SymTargets[I].first); if (Itr != MethodToVtable.end()) { if (BinaryData *BD = BC.getBinaryDataContainingAddress(Itr->second)) { const uint64_t Addend = Itr->second - BD->getAddress(); VtableSyms.emplace_back(BD->getSymbol(), Addend); continue; } } // Give up if we can't find the vtable for a method. 
DEBUG_VERBOSE(1, dbgs() << "BOLT-INFO: ICP can't find vtable for " << SymTargets[I].first->getName() << "\n"); return MethodInfoType(); } // Make sure the vtable reg is not clobbered by the argument passing code if (VtableReg != MethodReg) { for (MCInst *CurInst = MethodFetchInsns.front(); CurInst < &Inst; ++CurInst) { const MCInstrDesc &InstrInfo = BC.MII->get(CurInst->getOpcode()); if (InstrInfo.hasDefOfPhysReg(*CurInst, VtableReg, *BC.MRI)) return MethodInfoType(); } } return MethodInfoType(VtableSyms, MethodFetchInsns); } std::vector> IndirectCallPromotion::rewriteCall( BinaryBasicBlock &IndCallBlock, const MCInst &CallInst, MCPlusBuilder::BlocksVectorTy &&ICPcode, const std::vector &MethodFetchInsns) const { BinaryFunction &Function = *IndCallBlock.getFunction(); MCPlusBuilder *MIB = Function.getBinaryContext().MIB.get(); // Create new basic blocks with correct code in each one first. std::vector> NewBBs; const bool IsTailCallOrJT = (MIB->isTailCall(CallInst) || Function.getJumpTable(CallInst)); // If we are tracking the indirect call/jump address, propagate the address to // the ICP code. const std::optional IndirectInstrOffset = MIB->getOffset(CallInst); if (IndirectInstrOffset) { for (auto &[Symbol, Instructions] : ICPcode) for (MCInst &Inst : Instructions) MIB->setOffset(Inst, *IndirectInstrOffset); } // Move instructions from the tail of the original call block // to the merge block. // Remember any pseudo instructions following a tail call. These // must be preserved and moved to the original block. 
InstructionListType TailInsts; const MCInst *TailInst = &CallInst; if (IsTailCallOrJT) while (TailInst + 1 < &(*IndCallBlock.end()) && MIB->isPseudo(*(TailInst + 1))) TailInsts.push_back(*++TailInst); InstructionListType MovedInst = IndCallBlock.splitInstructions(&CallInst); // Link new BBs to the original input offset of the indirect call site or its // containing BB, so we can map samples recorded in new BBs back to the // original BB seen in the input binary (if using BAT). const uint32_t OrigOffset = IndirectInstrOffset ? *IndirectInstrOffset : IndCallBlock.getInputOffset(); IndCallBlock.eraseInstructions(MethodFetchInsns.begin(), MethodFetchInsns.end()); if (IndCallBlock.empty() || (!MethodFetchInsns.empty() && MethodFetchInsns.back() == &CallInst)) IndCallBlock.addInstructions(ICPcode.front().second.begin(), ICPcode.front().second.end()); else IndCallBlock.replaceInstruction(std::prev(IndCallBlock.end()), ICPcode.front().second); IndCallBlock.addInstructions(TailInsts.begin(), TailInsts.end()); for (auto Itr = ICPcode.begin() + 1; Itr != ICPcode.end(); ++Itr) { MCSymbol *&Sym = Itr->first; InstructionListType &Insts = Itr->second; assert(Sym); std::unique_ptr TBB = Function.createBasicBlock(Sym); TBB->setOffset(OrigOffset); for (MCInst &Inst : Insts) // sanitize new instructions. if (MIB->isCall(Inst)) MIB->removeAnnotation(Inst, "CallProfile"); TBB->addInstructions(Insts.begin(), Insts.end()); NewBBs.emplace_back(std::move(TBB)); } // Move tail of instructions from after the original call to // the merge block. 
if (!IsTailCallOrJT) NewBBs.back()->addInstructions(MovedInst.begin(), MovedInst.end()); return NewBBs; } BinaryBasicBlock * IndirectCallPromotion::fixCFG(BinaryBasicBlock &IndCallBlock, const bool IsTailCall, const bool IsJumpTable, IndirectCallPromotion::BasicBlocksVector &&NewBBs, const std::vector &Targets) const { BinaryFunction &Function = *IndCallBlock.getFunction(); using BinaryBranchInfo = BinaryBasicBlock::BinaryBranchInfo; BinaryBasicBlock *MergeBlock = nullptr; // Scale indirect call counts to the execution count of the original // basic block containing the indirect call. uint64_t TotalCount = IndCallBlock.getKnownExecutionCount(); uint64_t TotalIndirectBranches = 0; for (const Callsite &Target : Targets) TotalIndirectBranches += Target.Branches; if (TotalIndirectBranches == 0) TotalIndirectBranches = 1; BinaryBasicBlock::BranchInfoType BBI; BinaryBasicBlock::BranchInfoType ScaledBBI; for (const Callsite &Target : Targets) { const size_t NumEntries = std::max(static_cast(1UL), Target.JTIndices.size()); for (size_t I = 0; I < NumEntries; ++I) { BBI.push_back( BinaryBranchInfo{(Target.Branches + NumEntries - 1) / NumEntries, (Target.Mispreds + NumEntries - 1) / NumEntries}); ScaledBBI.push_back( BinaryBranchInfo{uint64_t(TotalCount * Target.Branches / (NumEntries * TotalIndirectBranches)), uint64_t(TotalCount * Target.Mispreds / (NumEntries * TotalIndirectBranches))}); } } if (IsJumpTable) { BinaryBasicBlock *NewIndCallBlock = NewBBs.back().get(); IndCallBlock.moveAllSuccessorsTo(NewIndCallBlock); std::vector SymTargets; for (const Callsite &Target : Targets) { const size_t NumEntries = std::max(static_cast(1UL), Target.JTIndices.size()); for (size_t I = 0; I < NumEntries; ++I) SymTargets.push_back(Target.To.Sym); } assert(SymTargets.size() > NewBBs.size() - 1 && "There must be a target symbol associated with each new BB."); for (uint64_t I = 0; I < NewBBs.size(); ++I) { BinaryBasicBlock *SourceBB = I ? 
NewBBs[I - 1].get() : &IndCallBlock; SourceBB->setExecutionCount(TotalCount); BinaryBasicBlock *TargetBB = Function.getBasicBlockForLabel(SymTargets[I]); SourceBB->addSuccessor(TargetBB, ScaledBBI[I]); // taken TotalCount -= ScaledBBI[I].Count; SourceBB->addSuccessor(NewBBs[I].get(), TotalCount); // fall-through // Update branch info for the indirect jump. BinaryBasicBlock::BinaryBranchInfo &BranchInfo = NewIndCallBlock->getBranchInfo(*TargetBB); if (BranchInfo.Count > BBI[I].Count) BranchInfo.Count -= BBI[I].Count; else BranchInfo.Count = 0; if (BranchInfo.MispredictedCount > BBI[I].MispredictedCount) BranchInfo.MispredictedCount -= BBI[I].MispredictedCount; else BranchInfo.MispredictedCount = 0; } } else { assert(NewBBs.size() >= 2); assert(NewBBs.size() % 2 == 1 || IndCallBlock.succ_empty()); assert(NewBBs.size() % 2 == 1 || IsTailCall); auto ScaledBI = ScaledBBI.begin(); auto updateCurrentBranchInfo = [&] { assert(ScaledBI != ScaledBBI.end()); TotalCount -= ScaledBI->Count; ++ScaledBI; }; if (!IsTailCall) { MergeBlock = NewBBs.back().get(); IndCallBlock.moveAllSuccessorsTo(MergeBlock); } // Fix up successors and execution counts. updateCurrentBranchInfo(); IndCallBlock.addSuccessor(NewBBs[1].get(), TotalCount); IndCallBlock.addSuccessor(NewBBs[0].get(), ScaledBBI[0]); const size_t Adj = IsTailCall ? 
1 : 2; for (size_t I = 0; I < NewBBs.size() - Adj; ++I) { assert(TotalCount <= IndCallBlock.getExecutionCount() || TotalCount <= uint64_t(TotalIndirectBranches)); uint64_t ExecCount = ScaledBBI[(I + 1) / 2].Count; if (I % 2 == 0) { if (MergeBlock) NewBBs[I]->addSuccessor(MergeBlock, ScaledBBI[(I + 1) / 2].Count); } else { assert(I + 2 < NewBBs.size()); updateCurrentBranchInfo(); NewBBs[I]->addSuccessor(NewBBs[I + 2].get(), TotalCount); NewBBs[I]->addSuccessor(NewBBs[I + 1].get(), ScaledBBI[(I + 1) / 2]); ExecCount += TotalCount; } NewBBs[I]->setExecutionCount(ExecCount); } if (MergeBlock) { // Arrange for the MergeBlock to be the fallthrough for the first // promoted call block. std::unique_ptr MBPtr; std::swap(MBPtr, NewBBs.back()); NewBBs.pop_back(); NewBBs.emplace(NewBBs.begin() + 1, std::move(MBPtr)); // TODO: is COUNT_FALLTHROUGH_EDGE the right thing here? NewBBs.back()->addSuccessor(MergeBlock, TotalCount); // uncond branch } } // Update the execution count. NewBBs.back()->setExecutionCount(TotalCount); // Update BB and BB layout. Function.insertBasicBlocks(&IndCallBlock, std::move(NewBBs)); assert(Function.validateCFG()); return MergeBlock; } size_t IndirectCallPromotion::canPromoteCallsite( const BinaryBasicBlock &BB, const MCInst &Inst, const std::vector &Targets, uint64_t NumCalls) { BinaryFunction *BF = BB.getFunction(); const BinaryContext &BC = BF->getBinaryContext(); if (BB.getKnownExecutionCount() < opts::ExecutionCountThreshold) return 0; const bool IsJumpTable = BF->getJumpTable(Inst); auto computeStats = [&](size_t N) { for (size_t I = 0; I < N; ++I) if (IsJumpTable) TotalNumFrequentJmps += Targets[I].Branches; else TotalNumFrequentCalls += Targets[I].Branches; }; // If we have no targets (or no calls), skip this callsite. 
if (Targets.empty() || !NumCalls) {
  if (opts::Verbosity >= 1) {
    const ptrdiff_t InstIdx = &Inst - &(*BB.begin());
    BC.outs() << "BOLT-INFO: ICP failed in " << *BF << " @ " << InstIdx
              << " in " << BB.getName() << ", calls = " << NumCalls
              << ", targets empty or NumCalls == 0.\n";
  }
  return 0;
}

// Cap the number of targets considered.  Jump tables and plain calls each
// have an overriding knob; a TopN of 0 means "no limit".
size_t TopN = opts::ICPTopN;
if (IsJumpTable)
  TopN = opts::ICPJumpTablesTopN ? opts::ICPJumpTablesTopN : TopN;
else
  TopN = opts::ICPCallsTopN ? opts::ICPCallsTopN : TopN;
const size_t TrialN = TopN ? std::min(TopN, Targets.size()) : Targets.size();

// Under -icp-top-callsites only callsites pre-annotated with "DoICP" (see
// runOnFunctions) are eligible.
if (opts::ICPTopCallsites && !BC.MIB->hasAnnotation(Inst, "DoICP"))
  return 0;

// Pick the top N targets.
uint64_t TotalMispredictsTopN = 0;
size_t N = 0;

if (opts::ICPUseMispredicts &&
    (!IsJumpTable || opts::ICPJumpTablesByTarget)) {
  // Count total number of mispredictions for (at most) the top N targets.
  // We may choose a smaller N (TrialN vs. N) if the frequency threshold
  // is exceeded by fewer targets.
  double Threshold = double(opts::ICPMispredictThreshold);
  for (size_t I = 0; I < TrialN && Threshold > 0; ++I, ++N) {
    Threshold -= (100.0 * Targets[I].Mispreds) / NumCalls;
    TotalMispredictsTopN += Targets[I].Mispreds;
  }
  computeStats(N);

  // Compute the misprediction frequency of the top N call targets.  If this
  // frequency is greater than the threshold, we should try ICP on this
  // callsite.
  const double TopNFrequency = (100.0 * TotalMispredictsTopN) / NumCalls;
  if (TopNFrequency == 0 || TopNFrequency < opts::ICPMispredictThreshold) {
    if (opts::Verbosity >= 1) {
      const ptrdiff_t InstIdx = &Inst - &(*BB.begin());
      BC.outs() << "BOLT-INFO: ICP failed in " << *BF << " @ " << InstIdx
                << " in " << BB.getName() << ", calls = " << NumCalls
                << ", top N mis. frequency " << format("%.1f", TopNFrequency)
                << "% < " << opts::ICPMispredictThreshold << "%\n";
    }
    return 0;
  }
} else {
  size_t MaxTargets = 0;

  // Count total number of calls for (at most) the top N targets.
  // We may choose a smaller N (TrialN vs. N) if the frequency threshold
  // is exceeded by fewer targets.
  const unsigned TotalThreshold = IsJumpTable
                                      ? opts::ICPJTTotalPercentThreshold
                                      : opts::ICPCallsTotalPercentThreshold;
  const unsigned RemainingThreshold =
      IsJumpTable ? opts::ICPJTRemainingPercentThreshold
                  : opts::ICPCallsRemainingPercentThreshold;
  uint64_t NumRemainingCalls = NumCalls;
  for (size_t I = 0; I < TrialN; ++I, ++MaxTargets) {
    // Stop at the first target that falls below either percentage
    // threshold, or whose jump-table indices would overflow the TrialN
    // budget (each JT index counts as a separate promoted target).
    if (100 * Targets[I].Branches < NumCalls * TotalThreshold)
      break;
    if (100 * Targets[I].Branches < NumRemainingCalls * RemainingThreshold)
      break;
    if (N + (Targets[I].JTIndices.empty() ? 1 : Targets[I].JTIndices.size()) >
        TrialN)
      break;
    TotalMispredictsTopN += Targets[I].Mispreds;
    NumRemainingCalls -= Targets[I].Branches;
    N += Targets[I].JTIndices.empty() ? 1 : Targets[I].JTIndices.size();
  }
  computeStats(MaxTargets);

  // Don't check misprediction frequency for jump tables -- we don't really
  // care as long as we are saving loads from the jump table.
  if (!IsJumpTable || opts::ICPJumpTablesByTarget) {
    // Compute the misprediction frequency of the top N call targets.  If
    // this frequency is less than the threshold, we should skip ICP at
    // this callsite.
    const double TopNMispredictFrequency =
        (100.0 * TotalMispredictsTopN) / NumCalls;

    if (TopNMispredictFrequency < opts::ICPMispredictThreshold) {
      if (opts::Verbosity >= 1) {
        const ptrdiff_t InstIdx = &Inst - &(*BB.begin());
        BC.outs() << "BOLT-INFO: ICP failed in " << *BF << " @ " << InstIdx
                  << " in " << BB.getName() << ", calls = " << NumCalls
                  << ", top N mispredict frequency "
                  << format("%.1f", TopNMispredictFrequency) << "% < "
                  << opts::ICPMispredictThreshold << "%\n";
      }
      return 0;
    }
  }
}

// Filter by inline-ability of target functions, stop at first target that
// can't be inlined.
if (!IsJumpTable && opts::ICPPeelForInline) { for (size_t I = 0; I < N; ++I) { const MCSymbol *TargetSym = Targets[I].To.Sym; const BinaryFunction *TargetBF = BC.getFunctionForSymbol(TargetSym); if (!TargetBF || !BinaryFunctionPass::shouldOptimize(*TargetBF) || getInliningInfo(*TargetBF).Type == InliningType::INL_NONE) { N = I; break; } } } // Filter functions that can have ICP applied (for debugging) if (!opts::ICPFuncsList.empty()) { for (std::string &Name : opts::ICPFuncsList) if (BF->hasName(Name)) return N; return 0; } return N; } void IndirectCallPromotion::printCallsiteInfo( const BinaryBasicBlock &BB, const MCInst &Inst, const std::vector &Targets, const size_t N, uint64_t NumCalls) const { BinaryContext &BC = BB.getFunction()->getBinaryContext(); const bool IsTailCall = BC.MIB->isTailCall(Inst); const bool IsJumpTable = BB.getFunction()->getJumpTable(Inst); const ptrdiff_t InstIdx = &Inst - &(*BB.begin()); BC.outs() << "BOLT-INFO: ICP candidate branch info: " << *BB.getFunction() << " @ " << InstIdx << " in " << BB.getName() << " -> calls = " << NumCalls << (IsTailCall ? " (tail)" : (IsJumpTable ? " (jump table)" : "")) << "\n"; for (size_t I = 0; I < N; I++) { const double Frequency = 100.0 * Targets[I].Branches / NumCalls; const double MisFrequency = 100.0 * Targets[I].Mispreds / NumCalls; BC.outs() << "BOLT-INFO: "; if (Targets[I].To.Sym) BC.outs() << Targets[I].To.Sym->getName(); else BC.outs() << Targets[I].To.Addr; BC.outs() << ", calls = " << Targets[I].Branches << ", mispreds = " << Targets[I].Mispreds << ", taken freq = " << format("%.1f", Frequency) << "%" << ", mis. freq = " << format("%.1f", MisFrequency) << "%"; bool First = true; for (uint64_t JTIndex : Targets[I].JTIndices) { BC.outs() << (First ? 
", indices = " : ", ") << JTIndex; First = false; } BC.outs() << "\n"; } LLVM_DEBUG({ dbgs() << "BOLT-INFO: ICP original call instruction:"; BC.printInstruction(dbgs(), Inst, Targets[0].From.Addr, nullptr, true); }); } Error IndirectCallPromotion::runOnFunctions(BinaryContext &BC) { if (opts::ICP == ICP_NONE) return Error::success(); auto &BFs = BC.getBinaryFunctions(); const bool OptimizeCalls = (opts::ICP == ICP_CALLS || opts::ICP == ICP_ALL); const bool OptimizeJumpTables = (opts::ICP == ICP_JUMP_TABLES || opts::ICP == ICP_ALL); std::unique_ptr RA; std::unique_ptr CG; if (OptimizeJumpTables) { CG.reset(new BinaryFunctionCallGraph(buildCallGraph(BC))); RA.reset(new RegAnalysis(BC, &BFs, &*CG)); } // If icp-top-callsites is enabled, compute the total number of indirect // calls and then optimize the hottest callsites that contribute to that // total. SetVector Functions; if (opts::ICPTopCallsites == 0) { for (auto &KV : BFs) Functions.insert(&KV.second); } else { using IndirectCallsite = std::tuple; std::vector IndirectCalls; size_t TotalIndirectCalls = 0; // Find all the indirect callsites. 
for (auto &BFIt : BFs) { BinaryFunction &Function = BFIt.second; if (!shouldOptimize(Function)) continue; const bool HasLayout = !Function.getLayout().block_empty(); for (BinaryBasicBlock &BB : Function) { if (HasLayout && Function.isSplit() && BB.isCold()) continue; for (MCInst &Inst : BB) { const bool IsJumpTable = Function.getJumpTable(Inst); const bool HasIndirectCallProfile = BC.MIB->hasAnnotation(Inst, "CallProfile"); const bool IsDirectCall = (BC.MIB->isCall(Inst) && BC.MIB->getTargetSymbol(Inst, 0)); if (!IsDirectCall && ((HasIndirectCallProfile && !IsJumpTable && OptimizeCalls) || (IsJumpTable && OptimizeJumpTables))) { uint64_t NumCalls = 0; for (const Callsite &BInfo : getCallTargets(BB, Inst)) NumCalls += BInfo.Branches; IndirectCalls.push_back( std::make_tuple(NumCalls, &Inst, &Function)); TotalIndirectCalls += NumCalls; } } } } // Sort callsites by execution count. llvm::sort(reverse(IndirectCalls)); // Find callsites that contribute to the top "opts::ICPTopCallsites"% // number of calls. const float TopPerc = opts::ICPTopCallsites / 100.0f; int64_t MaxCalls = TotalIndirectCalls * TopPerc; uint64_t LastFreq = std::numeric_limits::max(); size_t Num = 0; for (const IndirectCallsite &IC : IndirectCalls) { const uint64_t CurFreq = std::get<0>(IC); // Once we decide to stop, include at least all branches that share the // same frequency of the last one to avoid non-deterministic behavior // (e.g. 
turning on/off ICP depending on the order of functions) if (MaxCalls <= 0 && CurFreq != LastFreq) break; MaxCalls -= CurFreq; LastFreq = CurFreq; BC.MIB->addAnnotation(*std::get<1>(IC), "DoICP", true); Functions.insert(std::get<2>(IC)); ++Num; } BC.outs() << "BOLT-INFO: ICP Total indirect calls = " << TotalIndirectCalls << ", " << Num << " callsites cover " << opts::ICPTopCallsites << "% of all indirect calls\n"; } for (BinaryFunction *FuncPtr : Functions) { BinaryFunction &Function = *FuncPtr; if (!shouldOptimize(Function)) continue; const bool HasLayout = !Function.getLayout().block_empty(); // Total number of indirect calls issued from the current Function. // (a fraction of TotalIndirectCalls) uint64_t FuncTotalIndirectCalls = 0; uint64_t FuncTotalIndirectJmps = 0; std::vector BBs; for (BinaryBasicBlock &BB : Function) { // Skip indirect calls in cold blocks. if (!HasLayout || !Function.isSplit() || !BB.isCold()) BBs.push_back(&BB); } if (BBs.empty()) continue; DataflowInfoManager Info(Function, RA.get(), nullptr); while (!BBs.empty()) { BinaryBasicBlock *BB = BBs.back(); BBs.pop_back(); for (unsigned Idx = 0; Idx < BB->size(); ++Idx) { MCInst &Inst = BB->getInstructionAtIndex(Idx); const ptrdiff_t InstIdx = &Inst - &(*BB->begin()); const bool IsTailCall = BC.MIB->isTailCall(Inst); const bool HasIndirectCallProfile = BC.MIB->hasAnnotation(Inst, "CallProfile"); const bool IsJumpTable = Function.getJumpTable(Inst); if (BC.MIB->isCall(Inst)) TotalCalls += BB->getKnownExecutionCount(); if (IsJumpTable && !OptimizeJumpTables) continue; if (!IsJumpTable && (!HasIndirectCallProfile || !OptimizeCalls)) continue; // Ignore direct calls. 
if (BC.MIB->isCall(Inst) && BC.MIB->getTargetSymbol(Inst, 0)) continue; assert((BC.MIB->isCall(Inst) || BC.MIB->isIndirectBranch(Inst)) && "expected a call or an indirect jump instruction"); if (IsJumpTable) ++TotalJumpTableCallsites; else ++TotalIndirectCallsites; std::vector Targets = getCallTargets(*BB, Inst); // Compute the total number of calls from this particular callsite. uint64_t NumCalls = 0; for (const Callsite &BInfo : Targets) NumCalls += BInfo.Branches; if (!IsJumpTable) FuncTotalIndirectCalls += NumCalls; else FuncTotalIndirectJmps += NumCalls; // If FLAGS regs is alive after this jmp site, do not try // promoting because we will clobber FLAGS. if (IsJumpTable) { ErrorOr State = Info.getLivenessAnalysis().getStateBefore(Inst); if (!State || (State && (*State)[BC.MIB->getFlagsReg()])) { if (opts::Verbosity >= 1) BC.outs() << "BOLT-INFO: ICP failed in " << Function << " @ " << InstIdx << " in " << BB->getName() << ", calls = " << NumCalls << (State ? ", cannot clobber flags reg.\n" : ", no liveness data available.\n"); continue; } } // Should this callsite be optimized? Return the number of targets // to use when promoting this call. A value of zero means to skip // this callsite. size_t N = canPromoteCallsite(*BB, Inst, Targets, NumCalls); // If it is a jump table and it failed to meet our initial threshold, // proceed to findCallTargetSymbols -- it may reevaluate N if // memory profile is present if (!N && !IsJumpTable) continue; if (opts::Verbosity >= 1) printCallsiteInfo(*BB, Inst, Targets, N, NumCalls); // Find MCSymbols or absolute addresses for each call target. MCInst *TargetFetchInst = nullptr; const SymTargetsType SymTargets = findCallTargetSymbols(Targets, N, *BB, Inst, TargetFetchInst); // findCallTargetSymbols may have changed N if mem profile is available // for jump tables if (!N) continue; LLVM_DEBUG(printDecision(dbgs(), Targets, N)); // If we can't resolve any of the target symbols, punt on this callsite. 
// TODO: can this ever happen? if (SymTargets.size() < N) { const size_t LastTarget = SymTargets.size(); if (opts::Verbosity >= 1) BC.outs() << "BOLT-INFO: ICP failed in " << Function << " @ " << InstIdx << " in " << BB->getName() << ", calls = " << NumCalls << ", ICP failed to find target symbol for " << Targets[LastTarget].To.Sym->getName() << "\n"; continue; } MethodInfoType MethodInfo; if (!IsJumpTable) { MethodInfo = maybeGetVtableSyms(*BB, Inst, SymTargets); TotalMethodLoadsEliminated += MethodInfo.first.empty() ? 0 : 1; LLVM_DEBUG(dbgs() << "BOLT-INFO: ICP " << (!MethodInfo.first.empty() ? "found" : "did not find") << " vtables for all methods.\n"); } else if (TargetFetchInst) { ++TotalIndexBasedJumps; MethodInfo.second.push_back(TargetFetchInst); } // Generate new promoted call code for this callsite. MCPlusBuilder::BlocksVectorTy ICPcode = (IsJumpTable && !opts::ICPJumpTablesByTarget) ? BC.MIB->jumpTablePromotion(Inst, SymTargets, MethodInfo.second, BC.Ctx.get()) : BC.MIB->indirectCallPromotion( Inst, SymTargets, MethodInfo.first, MethodInfo.second, opts::ICPOldCodeSequence, BC.Ctx.get()); if (ICPcode.empty()) { if (opts::Verbosity >= 1) BC.outs() << "BOLT-INFO: ICP failed in " << Function << " @ " << InstIdx << " in " << BB->getName() << ", calls = " << NumCalls << ", unable to generate promoted call code.\n"; continue; } LLVM_DEBUG({ uint64_t Offset = Targets[0].From.Addr; dbgs() << "BOLT-INFO: ICP indirect call code:\n"; for (const auto &entry : ICPcode) { const MCSymbol *const &Sym = entry.first; const InstructionListType &Insts = entry.second; if (Sym) dbgs() << Sym->getName() << ":\n"; Offset = BC.printInstructions(dbgs(), Insts.begin(), Insts.end(), Offset); } dbgs() << "---------------------------------------------------\n"; }); // Rewrite the CFG with the newly generated ICP code. std::vector> NewBBs = rewriteCall(*BB, Inst, std::move(ICPcode), MethodInfo.second); // Fix the CFG after inserting the new basic blocks. 
BinaryBasicBlock *MergeBlock = fixCFG(*BB, IsTailCall, IsJumpTable, std::move(NewBBs), Targets); // Since the tail of the original block was split off and it may contain // additional indirect calls, we must add the merge block to the set of // blocks to process. if (MergeBlock) BBs.push_back(MergeBlock); if (opts::Verbosity >= 1) BC.outs() << "BOLT-INFO: ICP succeeded in " << Function << " @ " << InstIdx << " in " << BB->getName() << " -> calls = " << NumCalls << "\n"; if (IsJumpTable) ++TotalOptimizedJumpTableCallsites; else ++TotalOptimizedIndirectCallsites; Modified.insert(&Function); } } TotalIndirectCalls += FuncTotalIndirectCalls; TotalIndirectJmps += FuncTotalIndirectJmps; } BC.outs() << "BOLT-INFO: ICP total indirect callsites with profile = " << TotalIndirectCallsites << "\n" << "BOLT-INFO: ICP total jump table callsites = " << TotalJumpTableCallsites << "\n" << "BOLT-INFO: ICP total number of calls = " << TotalCalls << "\n" << "BOLT-INFO: ICP percentage of calls that are indirect = " << format("%.1f", (100.0 * TotalIndirectCalls) / TotalCalls) << "%\n" << "BOLT-INFO: ICP percentage of indirect calls that can be " "optimized = " << format("%.1f", (100.0 * TotalNumFrequentCalls) / std::max(TotalIndirectCalls, 1)) << "%\n" << "BOLT-INFO: ICP percentage of indirect callsites that are " "optimized = " << format("%.1f", (100.0 * TotalOptimizedIndirectCallsites) / std::max(TotalIndirectCallsites, 1)) << "%\n" << "BOLT-INFO: ICP number of method load elimination candidates = " << TotalMethodLoadEliminationCandidates << "\n" << "BOLT-INFO: ICP percentage of method calls candidates that have " "loads eliminated = " << format("%.1f", (100.0 * TotalMethodLoadsEliminated) / std::max(TotalMethodLoadEliminationCandidates, 1)) << "%\n" << "BOLT-INFO: ICP percentage of indirect branches that are " "optimized = " << format("%.1f", (100.0 * TotalNumFrequentJmps) / std::max(TotalIndirectJmps, 1)) << "%\n" << "BOLT-INFO: ICP percentage of jump table callsites that are " << 
"optimized = " << format("%.1f", (100.0 * TotalOptimizedJumpTableCallsites) / std::max(TotalJumpTableCallsites, 1)) << "%\n" << "BOLT-INFO: ICP number of jump table callsites that can use hot " << "indices = " << TotalIndexBasedCandidates << "\n" << "BOLT-INFO: ICP percentage of jump table callsites that use hot " "indices = " << format("%.1f", (100.0 * TotalIndexBasedJumps) / std::max(TotalIndexBasedCandidates, 1)) << "%\n"; #ifndef NDEBUG verifyProfile(BFs); #endif return Error::success(); } } // namespace bolt } // namespace llvm