xref: /llvm-project/bolt/lib/Passes/IndirectCallPromotion.cpp (revision 16e67e6932ef2e82d593eba8203b36ed972c9c45)
1 //===- bolt/Passes/IndirectCallPromotion.cpp ------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements the IndirectCallPromotion class.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "bolt/Passes/IndirectCallPromotion.h"
14 #include "bolt/Passes/BinaryFunctionCallGraph.h"
15 #include "bolt/Passes/DataflowInfoManager.h"
16 #include "bolt/Passes/Inliner.h"
17 #include "llvm/ADT/STLExtras.h"
18 #include "llvm/Support/CommandLine.h"
19 #include <iterator>
20 
21 #define DEBUG_TYPE "ICP"
22 #define DEBUG_VERBOSE(Level, X)                                                \
23   if (opts::Verbosity >= (Level)) {                                            \
24     X;                                                                         \
25   }
26 
27 using namespace llvm;
28 using namespace bolt;
29 
namespace opts {

extern cl::OptionCategory BoltOptCategory;

extern cl::opt<IndirectCallPromotionType> ICP;
extern cl::opt<unsigned> Verbosity;
extern cl::opt<unsigned> ExecutionCountThreshold;

// Promote a jump-table target only if its count is at least this percent of
// the count still left unpromoted at the call site.
static cl::opt<unsigned> ICPJTRemainingPercentThreshold(
    "icp-jt-remaining-percent-threshold",
    cl::desc("The percentage threshold against remaining unpromoted indirect "
             "call count for the promotion for jump tables"),
    cl::init(30), cl::ZeroOrMore, cl::Hidden, cl::cat(BoltOptCategory));

// Promote a jump-table target only if its count is at least this percent of
// the total count at the call site.
static cl::opt<unsigned> ICPJTTotalPercentThreshold(
    "icp-jt-total-percent-threshold",
    cl::desc(
        "The percentage threshold against total count for the promotion for "
        "jump tables"),
    cl::init(5), cl::Hidden, cl::cat(BoltOptCategory));

// Analogous "remaining" threshold for plain (non-jump-table) indirect calls.
static cl::opt<unsigned> ICPCallsRemainingPercentThreshold(
    "icp-calls-remaining-percent-threshold",
    cl::desc("The percentage threshold against remaining unpromoted indirect "
             "call count for the promotion for calls"),
    cl::init(50), cl::Hidden, cl::cat(BoltOptCategory));

// Analogous "total" threshold for plain (non-jump-table) indirect calls.
static cl::opt<unsigned> ICPCallsTotalPercentThreshold(
    "icp-calls-total-percent-threshold",
    cl::desc(
        "The percentage threshold against total count for the promotion for "
        "calls"),
    cl::init(30), cl::Hidden, cl::cat(BoltOptCategory));

// Misprediction-rate threshold; consulted by the heuristic enabled via
// -indirect-call-promotion-use-mispredicts below.
static cl::opt<unsigned> ICPMispredictThreshold(
    "indirect-call-promotion-mispredict-threshold",
    cl::desc("misprediction threshold for skipping ICP on an "
             "indirect call"),
    cl::init(0), cl::cat(BoltOptCategory));

static cl::alias ICPMispredictThresholdAlias(
    "icp-mp-threshold",
    cl::desc("alias for --indirect-call-promotion-mispredict-threshold"),
    cl::aliasopt(ICPMispredictThreshold));

static cl::opt<bool> ICPUseMispredicts(
    "indirect-call-promotion-use-mispredicts",
    cl::desc("use misprediction frequency for determining whether or not ICP "
             "should be applied at a callsite.  The "
             "-indirect-call-promotion-mispredict-threshold value will be used "
             "by this heuristic"),
    cl::cat(BoltOptCategory));

static cl::alias ICPUseMispredictsAlias(
    "icp-use-mp",
    cl::desc("alias for --indirect-call-promotion-use-mispredicts"),
    cl::aliasopt(ICPUseMispredicts));

// Generic cap on promoted targets per call site. The calls/jump-table
// specific caps below take precedence when non-zero.
static cl::opt<unsigned>
    ICPTopN("indirect-call-promotion-topn",
            cl::desc("limit number of targets to consider when doing indirect "
                     "call promotion. 0 = no limit"),
            cl::init(3), cl::cat(BoltOptCategory));

static cl::alias
    ICPTopNAlias("icp-topn",
                 cl::desc("alias for --indirect-call-promotion-topn"),
                 cl::aliasopt(ICPTopN));

static cl::opt<unsigned> ICPCallsTopN(
    "indirect-call-promotion-calls-topn",
    cl::desc("limit number of targets to consider when doing indirect "
             "call promotion on calls. 0 = no limit"),
    cl::init(0), cl::cat(BoltOptCategory));

static cl::alias ICPCallsTopNAlias(
    "icp-calls-topn",
    cl::desc("alias for --indirect-call-promotion-calls-topn"),
    cl::aliasopt(ICPCallsTopN));

static cl::opt<unsigned> ICPJumpTablesTopN(
    "indirect-call-promotion-jump-tables-topn",
    cl::desc("limit number of targets to consider when doing indirect "
             "call promotion on jump tables. 0 = no limit"),
    cl::init(0), cl::cat(BoltOptCategory));

static cl::alias ICPJumpTablesTopNAlias(
    "icp-jt-topn",
    cl::desc("alias for --indirect-call-promotion-jump-tables-topn"),
    cl::aliasopt(ICPJumpTablesTopN));

// When memory profiling data is present, try to eliminate the load that
// fetches the promoted target (jump-table entry or vtable method pointer).
static cl::opt<bool> EliminateLoads(
    "icp-eliminate-loads",
    cl::desc("enable load elimination using memory profiling data when "
             "performing ICP"),
    cl::init(true), cl::cat(BoltOptCategory));

static cl::opt<unsigned> ICPTopCallsites(
    "icp-top-callsites",
    cl::desc("optimize hottest calls until at least this percentage of all "
             "indirect calls frequency is covered. 0 = all callsites"),
    cl::init(99), cl::Hidden, cl::cat(BoltOptCategory));

// Optional allow-list: when non-empty, ICP runs only on these functions.
static cl::list<std::string>
    ICPFuncsList("icp-funcs", cl::CommaSeparated,
                 cl::desc("list of functions to enable ICP for"),
                 cl::value_desc("func1,func2,func3,..."), cl::Hidden,
                 cl::cat(BoltOptCategory));

static cl::opt<bool>
    ICPOldCodeSequence("icp-old-code-sequence",
                       cl::desc("use old code sequence for promoted calls"),
                       cl::Hidden, cl::cat(BoltOptCategory));

// Compare jump-table dispatch against final targets rather than indices.
// Required for PIC jump tables (see getCallTargets).
static cl::opt<bool> ICPJumpTablesByTarget(
    "icp-jump-tables-targets",
    cl::desc(
        "for jump tables, optimize indirect jmp targets instead of indices"),
    cl::Hidden, cl::cat(BoltOptCategory));

static cl::alias
    ICPJumpTablesByTargetAlias("icp-jt-targets",
                               cl::desc("alias for --icp-jump-tables-targets"),
                               cl::aliasopt(ICPJumpTablesByTarget));

// Restrict promotion to targets the inliner would accept.
static cl::opt<bool> ICPPeelForInline(
    "icp-inline", cl::desc("only promote call targets eligible for inlining"),
    cl::Hidden, cl::cat(BoltOptCategory));

} // namespace opts
160 
161 static bool verifyProfile(std::map<uint64_t, BinaryFunction> &BFs) {
162   bool IsValid = true;
163   for (auto &BFI : BFs) {
164     BinaryFunction &BF = BFI.second;
165     if (!BF.isSimple())
166       continue;
167     for (const BinaryBasicBlock &BB : BF) {
168       auto BI = BB.branch_info_begin();
169       for (BinaryBasicBlock *SuccBB : BB.successors()) {
170         if (BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE && BI->Count > 0) {
171           if (BB.getKnownExecutionCount() == 0 ||
172               SuccBB->getKnownExecutionCount() == 0) {
173             errs() << "BOLT-WARNING: profile verification failed after ICP for "
174                       "function "
175                    << BF << '\n';
176             IsValid = false;
177           }
178         }
179         ++BI;
180       }
181     }
182   }
183   return IsValid;
184 }
185 
186 namespace llvm {
187 namespace bolt {
188 
// Build a Callsite record from one entry of an indirect-call profile.
// The source is the containing function's symbol; the destination starts as
// the raw profiled offset and is upgraded to a symbol when the profile
// resolved one.
IndirectCallPromotion::Callsite::Callsite(BinaryFunction &BF,
                                          const IndirectCallProfile &ICP)
    : From(BF.getSymbol()), To(ICP.Offset), Mispreds(ICP.Mispreds),
      Branches(ICP.Count) {
  if (ICP.Symbol) {
    // Prefer the symbolic target; the numeric address is then redundant,
    // so clear it.
    To.Sym = ICP.Symbol;
    To.Addr = 0;
  }
}
198 
199 void IndirectCallPromotion::printDecision(
200     llvm::raw_ostream &OS,
201     std::vector<IndirectCallPromotion::Callsite> &Targets, unsigned N) const {
202   uint64_t TotalCount = 0;
203   uint64_t TotalMispreds = 0;
204   for (const Callsite &S : Targets) {
205     TotalCount += S.Branches;
206     TotalMispreds += S.Mispreds;
207   }
208   if (!TotalCount)
209     TotalCount = 1;
210   if (!TotalMispreds)
211     TotalMispreds = 1;
212 
213   OS << "BOLT-INFO: ICP decision for call site with " << Targets.size()
214      << " targets, Count = " << TotalCount << ", Mispreds = " << TotalMispreds
215      << "\n";
216 
217   size_t I = 0;
218   for (const Callsite &S : Targets) {
219     OS << "Count = " << S.Branches << ", "
220        << format("%.1f", (100.0 * S.Branches) / TotalCount) << ", "
221        << "Mispreds = " << S.Mispreds << ", "
222        << format("%.1f", (100.0 * S.Mispreds) / TotalMispreds);
223     if (I < N)
224       OS << " * to be optimized *";
225     if (!S.JTIndices.empty()) {
226       OS << " Indices:";
227       for (const uint64_t Idx : S.JTIndices)
228         OS << " " << Idx;
229     }
230     OS << "\n";
231     I += S.JTIndices.empty() ? 1 : S.JTIndices.size();
232   }
233 }
234 
235 // Get list of targets for a given call sorted by most frequently
236 // called first.
237 std::vector<IndirectCallPromotion::Callsite>
238 IndirectCallPromotion::getCallTargets(BinaryBasicBlock &BB,
239                                       const MCInst &Inst) const {
240   BinaryFunction &BF = *BB.getFunction();
241   const BinaryContext &BC = BF.getBinaryContext();
242   std::vector<Callsite> Targets;
243 
244   if (const JumpTable *JT = BF.getJumpTable(Inst)) {
245     // Don't support PIC jump tables for now
246     if (!opts::ICPJumpTablesByTarget && JT->Type == JumpTable::JTT_PIC)
247       return Targets;
248     const Location From(BF.getSymbol());
249     const std::pair<size_t, size_t> Range =
250         JT->getEntriesForAddress(BC.MIB->getJumpTable(Inst));
251     assert(JT->Counts.empty() || JT->Counts.size() >= Range.second);
252     JumpTable::JumpInfo DefaultJI;
253     const JumpTable::JumpInfo *JI =
254         JT->Counts.empty() ? &DefaultJI : &JT->Counts[Range.first];
255     const size_t JIAdj = JT->Counts.empty() ? 0 : 1;
256     assert(JT->Type == JumpTable::JTT_PIC ||
257            JT->EntrySize == BC.AsmInfo->getCodePointerSize());
258     for (size_t I = Range.first; I < Range.second; ++I, JI += JIAdj) {
259       MCSymbol *Entry = JT->Entries[I];
260       const BinaryBasicBlock *ToBB = BF.getBasicBlockForLabel(Entry);
261       assert(ToBB || Entry == BF.getFunctionEndLabel() ||
262              Entry == BF.getFunctionEndLabel(FragmentNum::cold()));
263       if (Entry == BF.getFunctionEndLabel() ||
264           Entry == BF.getFunctionEndLabel(FragmentNum::cold()))
265         continue;
266       const Location To(Entry);
267       const BinaryBasicBlock::BinaryBranchInfo &BI = BB.getBranchInfo(*ToBB);
268       Targets.emplace_back(From, To, BI.MispredictedCount, BI.Count,
269                            I - Range.first);
270     }
271 
272     // Sort by symbol then addr.
273     llvm::sort(Targets, [](const Callsite &A, const Callsite &B) {
274       if (A.To.Sym && B.To.Sym)
275         return A.To.Sym < B.To.Sym;
276       else if (A.To.Sym && !B.To.Sym)
277         return true;
278       else if (!A.To.Sym && B.To.Sym)
279         return false;
280       else
281         return A.To.Addr < B.To.Addr;
282     });
283 
284     // Targets may contain multiple entries to the same target, but using
285     // different indices. Their profile will report the same number of branches
286     // for different indices if the target is the same. That's because we don't
287     // profile the index value, but only the target via LBR.
288     auto First = Targets.begin();
289     auto Last = Targets.end();
290     auto Result = First;
291     while (++First != Last) {
292       Callsite &A = *Result;
293       const Callsite &B = *First;
294       if (A.To.Sym && B.To.Sym && A.To.Sym == B.To.Sym)
295         A.JTIndices.insert(A.JTIndices.end(), B.JTIndices.begin(),
296                            B.JTIndices.end());
297       else
298         *(++Result) = *First;
299     }
300     ++Result;
301 
302     LLVM_DEBUG(if (Targets.end() - Result > 0) {
303       dbgs() << "BOLT-INFO: ICP: " << (Targets.end() - Result)
304              << " duplicate targets removed\n";
305     });
306 
307     Targets.erase(Result, Targets.end());
308   } else {
309     // Don't try to optimize PC relative indirect calls.
310     if (Inst.getOperand(0).isReg() &&
311         Inst.getOperand(0).getReg() == BC.MRI->getProgramCounter())
312       return Targets;
313 
314     const auto ICSP = BC.MIB->tryGetAnnotationAs<IndirectCallSiteProfile>(
315         Inst, "CallProfile");
316     if (ICSP) {
317       for (const IndirectCallProfile &CSP : ICSP.get()) {
318         Callsite Site(BF, CSP);
319         if (Site.isValid())
320           Targets.emplace_back(std::move(Site));
321       }
322     }
323   }
324 
325   // Sort by target count, number of indices in case of jump table, and
326   // mispredicts. We prioritize targets with high count, small number of indices
327   // and high mispredicts. Break ties by selecting targets with lower addresses.
328   llvm::stable_sort(Targets, [](const Callsite &A, const Callsite &B) {
329     if (A.Branches != B.Branches)
330       return A.Branches > B.Branches;
331     if (A.JTIndices.size() != B.JTIndices.size())
332       return A.JTIndices.size() < B.JTIndices.size();
333     if (A.Mispreds != B.Mispreds)
334       return A.Mispreds > B.Mispreds;
335     return A.To.Addr < B.To.Addr;
336   });
337 
338   // Remove non-symbol targets
339   llvm::erase_if(Targets, [](const Callsite &CS) { return !CS.To.Sym; });
340 
341   LLVM_DEBUG(if (BF.getJumpTable(Inst)) {
342     uint64_t TotalCount = 0;
343     uint64_t TotalMispreds = 0;
344     for (const Callsite &S : Targets) {
345       TotalCount += S.Branches;
346       TotalMispreds += S.Mispreds;
347     }
348     if (!TotalCount)
349       TotalCount = 1;
350     if (!TotalMispreds)
351       TotalMispreds = 1;
352 
353     dbgs() << "BOLT-INFO: ICP: jump table size = " << Targets.size()
354            << ", Count = " << TotalCount << ", Mispreds = " << TotalMispreds
355            << "\n";
356 
357     size_t I = 0;
358     for (const Callsite &S : Targets) {
359       dbgs() << "Count[" << I << "] = " << S.Branches << ", "
360              << format("%.1f", (100.0 * S.Branches) / TotalCount) << ", "
361              << "Mispreds[" << I << "] = " << S.Mispreds << ", "
362              << format("%.1f", (100.0 * S.Mispreds) / TotalMispreds) << "\n";
363       ++I;
364     }
365   });
366 
367   return Targets;
368 }
369 
// Use memory-access profile data attached to the load that feeds a jump-table
// dispatch to discover which table indices are hot. Returns (count, index)
// pairs sorted with the highest counts first, or an empty vector whenever the
// data is missing, stale, or inconsistent. On success, TargetFetchInst is set
// to the table-load instruction and the "JTIndexReg" annotation is recorded
// on CallInst for later load elimination.
IndirectCallPromotion::JumpTableInfoType
IndirectCallPromotion::maybeGetHotJumpTableTargets(BinaryBasicBlock &BB,
                                                   MCInst &CallInst,
                                                   MCInst *&TargetFetchInst,
                                                   const JumpTable *JT) const {
  assert(JT && "Can't get jump table addrs for non-jump tables.");

  BinaryFunction &Function = *BB.getFunction();
  BinaryContext &BC = Function.getBinaryContext();

  // Without memory profiling (or with load elimination disabled) there is
  // nothing to do.
  if (!Function.hasMemoryProfile() || !opts::EliminateLoads)
    return JumpTableInfoType();

  JumpTableInfoType HotTargets;
  MCInst *MemLocInstr;
  MCInst *PCRelBaseOut;
  unsigned BaseReg, IndexReg;
  int64_t DispValue;
  const MCExpr *DispExpr;
  // Re-analyze the dispatch sequence within this block to recover the load
  // instruction and its addressing components (base, index, displacement).
  MutableArrayRef<MCInst> Insts(&BB.front(), &CallInst);
  const IndirectBranchType Type = BC.MIB->analyzeIndirectBranch(
      CallInst, Insts.begin(), Insts.end(), BC.AsmInfo->getCodePointerSize(),
      MemLocInstr, BaseReg, IndexReg, DispValue, DispExpr, PCRelBaseOut);

  assert(MemLocInstr && "There should always be a load for jump tables");
  if (!MemLocInstr)
    return JumpTableInfoType();

  LLVM_DEBUG({
    dbgs() << "BOLT-INFO: ICP attempting to find memory profiling data for "
           << "jump table in " << Function << " at @ "
           << (&CallInst - &BB.front()) << "\n"
           << "BOLT-INFO: ICP target fetch instructions:\n";
    BC.printInstruction(dbgs(), *MemLocInstr, 0, &Function);
    if (MemLocInstr != &CallInst)
      BC.printInstruction(dbgs(), CallInst, 0, &Function);
  });

  DEBUG_VERBOSE(1, {
    dbgs() << "Jmp info: Type = " << (unsigned)Type << ", "
           << "BaseReg = " << BC.MRI->getName(BaseReg) << ", "
           << "IndexReg = " << BC.MRI->getName(IndexReg) << ", "
           << "DispValue = " << Twine::utohexstr(DispValue) << ", "
           << "DispExpr = " << DispExpr << ", "
           << "MemLocInstr = ";
    BC.printInstruction(dbgs(), *MemLocInstr, 0, &Function);
    dbgs() << "\n";
  });

  ++TotalIndexBasedCandidates;

  // The memory profile lives as an annotation on the load instruction.
  auto ErrorOrMemAccessProfile =
      BC.MIB->tryGetAnnotationAs<MemoryAccessProfile>(*MemLocInstr,
                                                      "MemoryAccessProfile");
  if (!ErrorOrMemAccessProfile) {
    DEBUG_VERBOSE(1, dbgs()
                         << "BOLT-INFO: ICP no memory profiling data found\n");
    return JumpTableInfoType();
  }
  MemoryAccessProfile &MemAccessProfile = ErrorOrMemAccessProfile.get();

  // Compute the runtime start address of the jump-table array from the load's
  // displacement (symbolic or immediate).
  uint64_t ArrayStart;
  if (DispExpr) {
    ErrorOr<uint64_t> DispValueOrError =
        BC.getSymbolValue(*BC.MIB->getTargetSymbol(DispExpr));
    assert(DispValueOrError && "global symbol needs a value");
    ArrayStart = *DispValueOrError;
  } else {
    ArrayStart = static_cast<uint64_t>(DispValue);
  }

  // RIP-relative addressing: the displacement is relative to the next
  // instruction's address.
  if (BaseReg == BC.MRI->getProgramCounter())
    ArrayStart += Function.getAddress() + MemAccessProfile.NextInstrOffset;

  // This is a map of [symbol] -> [count, index] and is used to combine indices
  // into the jump table since there may be multiple addresses that all have the
  // same entry.
  std::map<MCSymbol *, std::pair<uint64_t, uint64_t>> HotTargetMap;
  const std::pair<size_t, size_t> Range = JT->getEntriesForAddress(ArrayStart);

  for (const AddressAccess &AccessInfo : MemAccessProfile.AddressAccessInfo) {
    size_t Index;
    // Mem data occasionally includes nullprs, ignore them.
    if (!AccessInfo.MemoryObject && !AccessInfo.Offset)
      continue;

    if (AccessInfo.Offset % JT->EntrySize != 0) // ignore bogus data
      return JumpTableInfoType();

    if (AccessInfo.MemoryObject) {
      // Deal with bad/stale data
      if (!AccessInfo.MemoryObject->getName().startswith(
              "JUMP_TABLE/" + Function.getOneName().str()))
        return JumpTableInfoType();
      // Offset is relative to the memory object; rebase against where this
      // table's entries start within it.
      Index =
          (AccessInfo.Offset - (ArrayStart - JT->getAddress())) / JT->EntrySize;
    } else {
      // Offset is an absolute address.
      Index = (AccessInfo.Offset - ArrayStart) / JT->EntrySize;
    }

    // If Index is out of range it probably means the memory profiling data is
    // wrong for this instruction, bail out.
    if (Index >= Range.second) {
      LLVM_DEBUG(dbgs() << "BOLT-INFO: Index out of range of " << Range.first
                        << ", " << Range.second << "\n");
      return JumpTableInfoType();
    }

    // Make sure the hot index points at a legal label corresponding to a BB,
    // e.g. not the end of function (unreachable) label.
    if (!Function.getBasicBlockForLabel(JT->Entries[Index + Range.first])) {
      LLVM_DEBUG({
        dbgs() << "BOLT-INFO: hot index " << Index << " pointing at bogus "
               << "label " << JT->Entries[Index + Range.first]->getName()
               << " in jump table:\n";
        JT->print(dbgs());
        dbgs() << "HotTargetMap:\n";
        for (std::pair<MCSymbol *const, std::pair<uint64_t, uint64_t>> &HT :
             HotTargetMap)
          dbgs() << "BOLT-INFO: " << HT.first->getName()
                 << " = (count=" << HT.second.first
                 << ", index=" << HT.second.second << ")\n";
      });
      return JumpTableInfoType();
    }

    // Accumulate counts per target symbol; keep the last seen index for it.
    std::pair<uint64_t, uint64_t> &HotTarget =
        HotTargetMap[JT->Entries[Index + Range.first]];
    HotTarget.first += AccessInfo.Count;
    HotTarget.second = Index;
  }

  // Flatten the map values into the result vector of (count, index) pairs.
  llvm::copy(llvm::make_second_range(HotTargetMap),
             std::back_inserter(HotTargets));

  // Sort with highest counts first.
  llvm::sort(reverse(HotTargets));

  LLVM_DEBUG({
    dbgs() << "BOLT-INFO: ICP jump table hot targets:\n";
    for (const std::pair<uint64_t, uint64_t> &Target : HotTargets)
      dbgs() << "BOLT-INFO:  Idx = " << Target.second << ", "
             << "Count = " << Target.first << "\n";
  });

  // Record the index register for the later load-elimination rewrite.
  BC.MIB->getOrCreateAnnotationAs<uint16_t>(CallInst, "JTIndexReg") = IndexReg;

  TargetFetchInst = MemLocInstr;

  return HotTargets;
}
521 
// Translate the first N entries of Targets into (symbol, jump-table index)
// pairs for code generation. N is in/out: for jump tables with memory-profile
// data it is recomputed to the number of hot indices that pass the promotion
// thresholds, and Targets is re-partitioned so each selected hot index gets a
// dedicated entry at the front of the vector.
IndirectCallPromotion::SymTargetsType
IndirectCallPromotion::findCallTargetSymbols(std::vector<Callsite> &Targets,
                                             size_t &N, BinaryBasicBlock &BB,
                                             MCInst &CallInst,
                                             MCInst *&TargetFetchInst) const {
  const JumpTable *JT = BB.getFunction()->getJumpTable(CallInst);
  SymTargetsType SymTargets;

  if (!JT) {
    // Plain indirect call: take the top N symbols directly; index is unused.
    for (size_t I = 0; I < N; ++I) {
      assert(Targets[I].To.Sym && "All ICP targets must be to known symbols");
      assert(Targets[I].JTIndices.empty() &&
             "Can't have jump table indices for non-jump tables");
      SymTargets.emplace_back(Targets[I].To.Sym, 0);
    }
    return SymTargets;
  }

  // Use memory profile to select hot targets.
  JumpTableInfoType HotTargets =
      maybeGetHotJumpTableTargets(BB, CallInst, TargetFetchInst, JT);

  // Find the position in Targets of the callsite covering a given JT index.
  auto findTargetsIndex = [&](uint64_t JTIndex) {
    for (size_t I = 0; I < Targets.size(); ++I)
      if (llvm::is_contained(Targets[I].JTIndices, JTIndex))
        return I;
    LLVM_DEBUG(dbgs() << "BOLT-ERROR: Unable to find target index for hot jump "
                      << " table entry in " << *BB.getFunction() << "\n");
    llvm_unreachable("Hot indices must be referred to by at least one "
                     "callsite");
  };

  if (!HotTargets.empty()) {
    if (opts::Verbosity >= 1)
      for (size_t I = 0; I < HotTargets.size(); ++I)
        outs() << "BOLT-INFO: HotTarget[" << I << "] = (" << HotTargets[I].first
               << ", " << HotTargets[I].second << ")\n";

    // Recompute hottest targets, now discriminating which index is hot
    // NOTE: This is a tradeoff. On one hand, we get index information. On the
    // other hand, info coming from the memory profile is much less accurate
    // than LBRs. So we may actually end up working with more coarse
    // profile granularity in exchange for information about indices.
    std::vector<Callsite> NewTargets;
    std::map<const MCSymbol *, uint32_t> IndicesPerTarget;
    uint64_t TotalMemAccesses = 0;
    // First pass: count how many hot indices map to each target symbol and
    // total the accesses, for the threshold checks below.
    for (size_t I = 0; I < HotTargets.size(); ++I) {
      const uint64_t TargetIndex = findTargetsIndex(HotTargets[I].second);
      ++IndicesPerTarget[Targets[TargetIndex].To.Sym];
      TotalMemAccesses += HotTargets[I].first;
    }
    uint64_t RemainingMemAccesses = TotalMemAccesses;
    const size_t TopN =
        opts::ICPJumpTablesTopN ? opts::ICPJumpTablesTopN : opts::ICPTopN;
    size_t I = 0;
    // Second pass: accept hot indices (hottest first) until one fails the
    // total-percent, remaining-percent, or top-N limits.
    for (; I < HotTargets.size(); ++I) {
      const uint64_t MemAccesses = HotTargets[I].first;
      if (100 * MemAccesses <
          TotalMemAccesses * opts::ICPJTTotalPercentThreshold)
        break;
      if (100 * MemAccesses <
          RemainingMemAccesses * opts::ICPJTRemainingPercentThreshold)
        break;
      if (TopN && I >= TopN)
        break;
      RemainingMemAccesses -= MemAccesses;

      const uint64_t JTIndex = HotTargets[I].second;
      Callsite &Target = Targets[findTargetsIndex(JTIndex)];

      // Give this hot index its own Callsite holding just {JTIndex}, and
      // remove the index from the original entry.
      NewTargets.push_back(Target);
      std::vector<uint64_t>({JTIndex}).swap(NewTargets.back().JTIndices);
      llvm::erase_value(Target.JTIndices, JTIndex);

      // Keep fixCFG counts sane if more indices use this same target later
      assert(IndicesPerTarget[Target.To.Sym] > 0 && "wrong map");
      NewTargets.back().Branches =
          Target.Branches / IndicesPerTarget[Target.To.Sym];
      NewTargets.back().Mispreds =
          Target.Mispreds / IndicesPerTarget[Target.To.Sym];
      assert(Target.Branches >= NewTargets.back().Branches);
      assert(Target.Mispreds >= NewTargets.back().Mispreds);
      Target.Branches -= NewTargets.back().Branches;
      Target.Mispreds -= NewTargets.back().Mispreds;
    }
    // Selected hot-index entries go first, followed by the (adjusted)
    // original targets.
    llvm::copy(Targets, std::back_inserter(NewTargets));
    std::swap(NewTargets, Targets);
    N = I;

    if (N == 0 && opts::Verbosity >= 1) {
      outs() << "BOLT-INFO: ICP failed in " << *BB.getFunction() << " in "
             << BB.getName() << ": failed to meet thresholds after memory "
             << "profile data was loaded.\n";
      return SymTargets;
    }
  }

  // Emit one (symbol, index) pair per covered jump-table index; I counts
  // emitted pairs while TgtIdx walks the target entries.
  for (size_t I = 0, TgtIdx = 0; I < N; ++TgtIdx) {
    Callsite &Target = Targets[TgtIdx];
    assert(Target.To.Sym && "All ICP targets must be to known symbols");
    assert(!Target.JTIndices.empty() && "Jump tables must have indices");
    for (uint64_t Idx : Target.JTIndices) {
      SymTargets.emplace_back(Target.To.Sym, Idx);
      ++I;
    }
  }

  return SymTargets;
}
631 
// For a virtual method call, try to resolve the vtable (symbol + addend) of
// every promoted target method using memory profiling data attached to the
// method-load instruction. Returns the vtable list plus the method-fetch
// instructions that can be eliminated, or an empty MethodInfoType when any
// step fails (no profile, unknown method, missing vtable, clobbered vtable
// register).
IndirectCallPromotion::MethodInfoType IndirectCallPromotion::maybeGetVtableSyms(
    BinaryBasicBlock &BB, MCInst &Inst,
    const SymTargetsType &SymTargets) const {
  BinaryFunction &Function = *BB.getFunction();
  BinaryContext &BC = Function.getBinaryContext();
  std::vector<std::pair<MCSymbol *, uint64_t>> VtableSyms;
  std::vector<MCInst *> MethodFetchInsns;
  unsigned VtableReg, MethodReg;
  uint64_t MethodOffset;

  assert(!Function.getJumpTable(Inst) &&
         "Can't get vtable addrs for jump tables.");

  if (!Function.hasMemoryProfile() || !opts::EliminateLoads)
    return MethodInfoType();

  // Recognize the vtable-dispatch pattern ending at Inst; fills in the
  // fetch instructions, the registers involved, and the method's offset
  // within the vtable.
  MutableArrayRef<MCInst> Insts(&BB.front(), &Inst + 1);
  if (!BC.MIB->analyzeVirtualMethodCall(Insts.begin(), Insts.end(),
                                        MethodFetchInsns, VtableReg, MethodReg,
                                        MethodOffset)) {
    DEBUG_VERBOSE(
        1, dbgs() << "BOLT-INFO: ICP unable to analyze method call in "
                  << Function << " at @ " << (&Inst - &BB.front()) << "\n");
    return MethodInfoType();
  }

  ++TotalMethodLoadEliminationCandidates;

  DEBUG_VERBOSE(1, {
    dbgs() << "BOLT-INFO: ICP found virtual method call in " << Function
           << " at @ " << (&Inst - &BB.front()) << "\n";
    dbgs() << "BOLT-INFO: ICP method fetch instructions:\n";
    for (MCInst *Inst : MethodFetchInsns)
      BC.printInstruction(dbgs(), *Inst, 0, &Function);

    if (MethodFetchInsns.back() != &Inst)
      BC.printInstruction(dbgs(), Inst, 0, &Function);
  });

  // Try to get value profiling data for the method load instruction.
  auto ErrorOrMemAccessProfile =
      BC.MIB->tryGetAnnotationAs<MemoryAccessProfile>(*MethodFetchInsns.back(),
                                                      "MemoryAccessProfile");
  if (!ErrorOrMemAccessProfile) {
    DEBUG_VERBOSE(1, dbgs()
                         << "BOLT-INFO: ICP no memory profiling data found\n");
    return MethodInfoType();
  }
  MemoryAccessProfile &MemAccessProfile = ErrorOrMemAccessProfile.get();

  // Find the vtable that each method belongs to.
  std::map<const MCSymbol *, uint64_t> MethodToVtable;

  for (const AddressAccess &AccessInfo : MemAccessProfile.AddressAccessInfo) {
    // Reconstruct the absolute address of the accessed vtable slot.
    uint64_t Address = AccessInfo.Offset;
    if (AccessInfo.MemoryObject)
      Address += AccessInfo.MemoryObject->getAddress();

    // Ignore bogus data.
    if (!Address)
      continue;

    // The slot holds the method pointer at MethodOffset from the vtable base.
    const uint64_t VtableBase = Address - MethodOffset;

    DEBUG_VERBOSE(1, dbgs() << "BOLT-INFO: ICP vtable = "
                            << Twine::utohexstr(VtableBase) << "+"
                            << MethodOffset << "/" << AccessInfo.Count << "\n");

    // Read the method pointer stored in the slot and map method -> vtable.
    if (ErrorOr<uint64_t> MethodAddr = BC.getPointerAtAddress(Address)) {
      BinaryData *MethodBD = BC.getBinaryDataAtAddress(MethodAddr.get());
      if (!MethodBD) // skip unknown methods
        continue;
      MCSymbol *MethodSym = MethodBD->getSymbol();
      MethodToVtable[MethodSym] = VtableBase;
      DEBUG_VERBOSE(1, {
        const BinaryFunction *Method = BC.getFunctionForSymbol(MethodSym);
        dbgs() << "BOLT-INFO: ICP found method = "
               << Twine::utohexstr(MethodAddr.get()) << "/"
               << (Method ? Method->getPrintName() : "") << "\n";
      });
    }
  }

  // Find the vtable for each target symbol.
  for (size_t I = 0; I < SymTargets.size(); ++I) {
    auto Itr = MethodToVtable.find(SymTargets[I].first);
    if (Itr != MethodToVtable.end()) {
      if (BinaryData *BD = BC.getBinaryDataContainingAddress(Itr->second)) {
        // Express the vtable as symbol + addend within its binary object.
        const uint64_t Addend = Itr->second - BD->getAddress();
        VtableSyms.emplace_back(BD->getSymbol(), Addend);
        continue;
      }
    }
    // Give up if we can't find the vtable for a method.
    DEBUG_VERBOSE(1, dbgs() << "BOLT-INFO: ICP can't find vtable for "
                            << SymTargets[I].first->getName() << "\n");
    return MethodInfoType();
  }

  // Make sure the vtable reg is not clobbered by the argument passing code
  // NOTE(review): this walks instructions by raw pointer from the first fetch
  // up to Inst, which assumes they are contiguous in the block's storage —
  // appears intentional given the MutableArrayRef above, but verify.
  if (VtableReg != MethodReg) {
    for (MCInst *CurInst = MethodFetchInsns.front(); CurInst < &Inst;
         ++CurInst) {
      const MCInstrDesc &InstrInfo = BC.MII->get(CurInst->getOpcode());
      if (InstrInfo.hasDefOfPhysReg(*CurInst, VtableReg, *BC.MRI))
        return MethodInfoType();
    }
  }

  return MethodInfoType(VtableSyms, MethodFetchInsns);
}
743 
// Rewrite the code at the indirect callsite with the promoted-call sequence
// in ICPcode. The first entry of ICPcode replaces the indirect call inside
// IndCallBlock; every subsequent (symbol, insts) entry becomes a brand-new
// basic block. Returns the new blocks; the caller is responsible for wiring
// them into the CFG (see fixCFG).
std::vector<std::unique_ptr<BinaryBasicBlock>>
IndirectCallPromotion::rewriteCall(
    BinaryBasicBlock &IndCallBlock, const MCInst &CallInst,
    MCPlusBuilder::BlocksVectorTy &&ICPcode,
    const std::vector<MCInst *> &MethodFetchInsns) const {
  BinaryFunction &Function = *IndCallBlock.getFunction();
  MCPlusBuilder *MIB = Function.getBinaryContext().MIB.get();

  // Create new basic blocks with correct code in each one first.
  std::vector<std::unique_ptr<BinaryBasicBlock>> NewBBs;
  const bool IsTailCallOrJT =
      (MIB->isTailCall(CallInst) || Function.getJumpTable(CallInst));

  // Move instructions from the tail of the original call block
  // to the merge block.

  // Remember any pseudo instructions following a tail call.  These
  // must be preserved and moved to the original block.
  InstructionListType TailInsts;
  const MCInst *TailInst = &CallInst;
  // NOTE: the pointer arithmetic below assumes the block stores its
  // instructions contiguously, so `TailInst + 1` is the next instruction
  // and `&(*IndCallBlock.end())` is one-past-the-last.
  if (IsTailCallOrJT)
    while (TailInst + 1 < &(*IndCallBlock.end()) &&
           MIB->isPseudo(*(TailInst + 1)))
      TailInsts.push_back(*++TailInst);

  // Everything after the call is split off; for regular calls it is appended
  // to the merge block at the end of this function.
  InstructionListType MovedInst = IndCallBlock.splitInstructions(&CallInst);
  // Link new BBs to the original input offset of the BB where the indirect
  // call site is, so we can map samples recorded in new BBs back to the
  // original BB seen in the input binary (if using BAT)
  const uint32_t OrigOffset = IndCallBlock.getInputOffset();

  IndCallBlock.eraseInstructions(MethodFetchInsns.begin(),
                                 MethodFetchInsns.end());
  // If erasing the method-fetch instructions also removed the call itself
  // (or emptied the block), simply append the first ICP sequence; otherwise
  // replace the call (now the last instruction) with it.
  if (IndCallBlock.empty() ||
      (!MethodFetchInsns.empty() && MethodFetchInsns.back() == &CallInst))
    IndCallBlock.addInstructions(ICPcode.front().second.begin(),
                                 ICPcode.front().second.end());
  else
    IndCallBlock.replaceInstruction(std::prev(IndCallBlock.end()),
                                    ICPcode.front().second);
  // Re-attach any pseudo instructions that followed the original tail call.
  IndCallBlock.addInstructions(TailInsts.begin(), TailInsts.end());

  // Remaining ICPcode entries each become a new labeled basic block.
  for (auto Itr = ICPcode.begin() + 1; Itr != ICPcode.end(); ++Itr) {
    MCSymbol *&Sym = Itr->first;
    InstructionListType &Insts = Itr->second;
    assert(Sym);
    std::unique_ptr<BinaryBasicBlock> TBB = Function.createBasicBlock(Sym);
    TBB->setOffset(OrigOffset);
    for (MCInst &Inst : Insts) // sanitize new instructions.
      if (MIB->isCall(Inst))
        MIB->removeAnnotation(Inst, "CallProfile");
    TBB->addInstructions(Insts.begin(), Insts.end());
    NewBBs.emplace_back(std::move(TBB));
  }

  // Move tail of instructions from after the original call to
  // the merge block.
  if (!IsTailCallOrJT)
    NewBBs.back()->addInstructions(MovedInst.begin(), MovedInst.end());

  return NewBBs;
}
806 
// Splice the blocks produced by rewriteCall into the function's CFG,
// distributing the original block's execution count across the new edges.
// Returns the merge block (the block that rejoins the fall-through path)
// for regular promoted calls, or nullptr for tail calls and jump tables.
BinaryBasicBlock *
IndirectCallPromotion::fixCFG(BinaryBasicBlock &IndCallBlock,
                              const bool IsTailCall, const bool IsJumpTable,
                              IndirectCallPromotion::BasicBlocksVector &&NewBBs,
                              const std::vector<Callsite> &Targets) const {
  BinaryFunction &Function = *IndCallBlock.getFunction();
  using BinaryBranchInfo = BinaryBasicBlock::BinaryBranchInfo;
  BinaryBasicBlock *MergeBlock = nullptr;

  // Scale indirect call counts to the execution count of the original
  // basic block containing the indirect call.
  uint64_t TotalCount = IndCallBlock.getKnownExecutionCount();
  uint64_t TotalIndirectBranches = 0;
  for (const Callsite &Target : Targets)
    TotalIndirectBranches += Target.Branches;
  if (TotalIndirectBranches == 0)
    TotalIndirectBranches = 1; // avoid division by zero below
  // BBI holds raw per-entry branch counts (targets covering several jump
  // table indices get their counts ceil-divided across the entries);
  // ScaledBBI holds the same counts rescaled to the block execution count.
  BinaryBasicBlock::BranchInfoType BBI;
  BinaryBasicBlock::BranchInfoType ScaledBBI;
  for (const Callsite &Target : Targets) {
    const size_t NumEntries =
        std::max(static_cast<std::size_t>(1UL), Target.JTIndices.size());
    for (size_t I = 0; I < NumEntries; ++I) {
      BBI.push_back(
          BinaryBranchInfo{(Target.Branches + NumEntries - 1) / NumEntries,
                           (Target.Mispreds + NumEntries - 1) / NumEntries});
      ScaledBBI.push_back(
          BinaryBranchInfo{uint64_t(TotalCount * Target.Branches /
                                    (NumEntries * TotalIndirectBranches)),
                           uint64_t(TotalCount * Target.Mispreds /
                                    (NumEntries * TotalIndirectBranches))});
    }
  }

  if (IsJumpTable) {
    // The last new block keeps the (remaining) indirect jump, so it inherits
    // all of the original block's successors.
    BinaryBasicBlock *NewIndCallBlock = NewBBs.back().get();
    IndCallBlock.moveAllSuccessorsTo(NewIndCallBlock);

    // One target symbol per promoted comparison (duplicated per JT index).
    std::vector<MCSymbol *> SymTargets;
    for (const Callsite &Target : Targets) {
      const size_t NumEntries =
          std::max(static_cast<std::size_t>(1UL), Target.JTIndices.size());
      for (size_t I = 0; I < NumEntries; ++I)
        SymTargets.push_back(Target.To.Sym);
    }
    assert(SymTargets.size() > NewBBs.size() - 1 &&
           "There must be a target symbol associated with each new BB.");

    // Each block I compares against one target: taken edge goes to the
    // promoted target block, fall-through continues down the compare chain.
    for (uint64_t I = 0; I < NewBBs.size(); ++I) {
      BinaryBasicBlock *SourceBB = I ? NewBBs[I - 1].get() : &IndCallBlock;
      SourceBB->setExecutionCount(TotalCount);

      BinaryBasicBlock *TargetBB =
          Function.getBasicBlockForLabel(SymTargets[I]);
      SourceBB->addSuccessor(TargetBB, ScaledBBI[I]); // taken

      // TotalCount shrinks as each target's share peels off the chain.
      TotalCount -= ScaledBBI[I].Count;
      SourceBB->addSuccessor(NewBBs[I].get(), TotalCount); // fall-through

      // Update branch info for the indirect jump.
      BinaryBasicBlock::BinaryBranchInfo &BranchInfo =
          NewIndCallBlock->getBranchInfo(*TargetBB);
      // Saturating subtraction: never let the counts go negative.
      if (BranchInfo.Count > BBI[I].Count)
        BranchInfo.Count -= BBI[I].Count;
      else
        BranchInfo.Count = 0;

      if (BranchInfo.MispredictedCount > BBI[I].MispredictedCount)
        BranchInfo.MispredictedCount -= BBI[I].MispredictedCount;
      else
        BranchInfo.MispredictedCount = 0;
    }
  } else {
    // Non-jump-table layout (inferred from the wiring below): even-indexed
    // blocks hold promoted calls (branching to the merge block), odd-indexed
    // blocks hold comparisons; the last block is the merge block unless this
    // is a tail call.
    assert(NewBBs.size() >= 2);
    assert(NewBBs.size() % 2 == 1 || IndCallBlock.succ_empty());
    assert(NewBBs.size() % 2 == 1 || IsTailCall);

    auto ScaledBI = ScaledBBI.begin();
    auto updateCurrentBranchInfo = [&] {
      assert(ScaledBI != ScaledBBI.end());
      TotalCount -= ScaledBI->Count;
      ++ScaledBI;
    };

    if (!IsTailCall) {
      MergeBlock = NewBBs.back().get();
      IndCallBlock.moveAllSuccessorsTo(MergeBlock);
    }

    // Fix up successors and execution counts.
    updateCurrentBranchInfo();
    IndCallBlock.addSuccessor(NewBBs[1].get(), TotalCount);
    IndCallBlock.addSuccessor(NewBBs[0].get(), ScaledBBI[0]);

    const size_t Adj = IsTailCall ? 1 : 2;
    for (size_t I = 0; I < NewBBs.size() - Adj; ++I) {
      assert(TotalCount <= IndCallBlock.getExecutionCount() ||
             TotalCount <= uint64_t(TotalIndirectBranches));
      uint64_t ExecCount = ScaledBBI[(I + 1) / 2].Count;
      if (I % 2 == 0) {
        // Promoted-call block: unconditionally rejoins the merge block.
        if (MergeBlock)
          NewBBs[I]->addSuccessor(MergeBlock, ScaledBBI[(I + 1) / 2].Count);
      } else {
        // Comparison block: taken -> next call block, fall-through -> next
        // comparison block.
        assert(I + 2 < NewBBs.size());
        updateCurrentBranchInfo();
        NewBBs[I]->addSuccessor(NewBBs[I + 2].get(), TotalCount);
        NewBBs[I]->addSuccessor(NewBBs[I + 1].get(), ScaledBBI[(I + 1) / 2]);
        ExecCount += TotalCount;
      }
      NewBBs[I]->setExecutionCount(ExecCount);
    }

    if (MergeBlock) {
      // Arrange for the MergeBlock to be the fallthrough for the first
      // promoted call block.
      std::unique_ptr<BinaryBasicBlock> MBPtr;
      std::swap(MBPtr, NewBBs.back());
      NewBBs.pop_back();
      NewBBs.emplace(NewBBs.begin() + 1, std::move(MBPtr));
      // TODO: is COUNT_FALLTHROUGH_EDGE the right thing here?
      NewBBs.back()->addSuccessor(MergeBlock, TotalCount); // uncond branch
    }
  }

  // Update the execution count.
  NewBBs.back()->setExecutionCount(TotalCount);

  // Update BB and BB layout.
  Function.insertBasicBlocks(&IndCallBlock, std::move(NewBBs));
  assert(Function.validateCFG());

  return MergeBlock;
}
940 
// Decide whether the callsite at Inst is worth promoting. Returns the number
// of targets to promote (0 means skip the callsite). Also accumulates the
// pass-level statistics TotalNumFrequentCalls/TotalNumFrequentJmps as a side
// effect (via computeStats below).
size_t IndirectCallPromotion::canPromoteCallsite(
    const BinaryBasicBlock &BB, const MCInst &Inst,
    const std::vector<Callsite> &Targets, uint64_t NumCalls) {
  BinaryFunction *BF = BB.getFunction();
  const BinaryContext &BC = BF->getBinaryContext();

  // Cold blocks are never worth the extra compare-and-branch code.
  if (BB.getKnownExecutionCount() < opts::ExecutionCountThreshold)
    return 0;

  const bool IsJumpTable = BF->getJumpTable(Inst);

  // Fold the branch counts of the first N targets into the pass statistics.
  auto computeStats = [&](size_t N) {
    for (size_t I = 0; I < N; ++I)
      if (IsJumpTable)
        TotalNumFrequentJmps += Targets[I].Branches;
      else
        TotalNumFrequentCalls += Targets[I].Branches;
  };

  // If we have no targets (or no calls), skip this callsite.
  if (Targets.empty() || !NumCalls) {
    if (opts::Verbosity >= 1) {
      const ptrdiff_t InstIdx = &Inst - &(*BB.begin());
      outs() << "BOLT-INFO: ICP failed in " << *BF << " @ " << InstIdx << " in "
             << BB.getName() << ", calls = " << NumCalls
             << ", targets empty or NumCalls == 0.\n";
    }
    return 0;
  }

  // The kind-specific TopN option, when set, overrides the generic one.
  size_t TopN = opts::ICPTopN;
  if (IsJumpTable)
    TopN = opts::ICPJumpTablesTopN ? opts::ICPJumpTablesTopN : TopN;
  else
    TopN = opts::ICPCallsTopN ? opts::ICPCallsTopN : TopN;

  // TopN == 0 means "no limit": consider every target.
  const size_t TrialN = TopN ? std::min(TopN, Targets.size()) : Targets.size();

  // In top-callsites mode only callsites pre-annotated by runOnFunctions
  // (with "DoICP") are eligible.
  if (opts::ICPTopCallsites && !BC.MIB->hasAnnotation(Inst, "DoICP"))
    return 0;

  // Pick the top N targets.
  uint64_t TotalMispredictsTopN = 0;
  size_t N = 0;

  if (opts::ICPUseMispredicts &&
      (!IsJumpTable || opts::ICPJumpTablesByTarget)) {
    // Count total number of mispredictions for (at most) the top N targets.
    // We may choose a smaller N (TrialN vs. N) if the frequency threshold
    // is exceeded by fewer targets.
    double Threshold = double(opts::ICPMispredictThreshold);
    for (size_t I = 0; I < TrialN && Threshold > 0; ++I, ++N) {
      Threshold -= (100.0 * Targets[I].Mispreds) / NumCalls;
      TotalMispredictsTopN += Targets[I].Mispreds;
    }
    computeStats(N);

    // Compute the misprediction frequency of the top N call targets.  If this
    // frequency is greater than the threshold, we should try ICP on this
    // callsite.
    const double TopNFrequency = (100.0 * TotalMispredictsTopN) / NumCalls;
    if (TopNFrequency == 0 || TopNFrequency < opts::ICPMispredictThreshold) {
      if (opts::Verbosity >= 1) {
        const ptrdiff_t InstIdx = &Inst - &(*BB.begin());
        outs() << "BOLT-INFO: ICP failed in " << *BF << " @ " << InstIdx
               << " in " << BB.getName() << ", calls = " << NumCalls
               << ", top N mis. frequency " << format("%.1f", TopNFrequency)
               << "% < " << opts::ICPMispredictThreshold << "%\n";
      }
      return 0;
    }
  } else {
    size_t MaxTargets = 0;

    // Count total number of calls for (at most) the top N targets.
    // We may choose a smaller N (TrialN vs. N) if the frequency threshold
    // is exceeded by fewer targets.
    const unsigned TotalThreshold = IsJumpTable
                                        ? opts::ICPJTTotalPercentThreshold
                                        : opts::ICPCallsTotalPercentThreshold;
    const unsigned RemainingThreshold =
        IsJumpTable ? opts::ICPJTRemainingPercentThreshold
                    : opts::ICPCallsRemainingPercentThreshold;
    uint64_t NumRemainingCalls = NumCalls;
    // Targets are consumed in order; stop at the first one that falls below
    // either percentage threshold or would push N past the TrialN budget.
    // Note: each jump-table target consumes one N slot per JT index.
    for (size_t I = 0; I < TrialN; ++I, ++MaxTargets) {
      if (100 * Targets[I].Branches < NumCalls * TotalThreshold)
        break;
      if (100 * Targets[I].Branches < NumRemainingCalls * RemainingThreshold)
        break;
      if (N + (Targets[I].JTIndices.empty() ? 1 : Targets[I].JTIndices.size()) >
          TrialN)
        break;
      TotalMispredictsTopN += Targets[I].Mispreds;
      NumRemainingCalls -= Targets[I].Branches;
      N += Targets[I].JTIndices.empty() ? 1 : Targets[I].JTIndices.size();
    }
    computeStats(MaxTargets);

    // Don't check misprediction frequency for jump tables -- we don't really
    // care as long as we are saving loads from the jump table.
    if (!IsJumpTable || opts::ICPJumpTablesByTarget) {
      // Compute the misprediction frequency of the top N call targets.  If
      // this frequency is less than the threshold, we should skip ICP at
      // this callsite.
      const double TopNMispredictFrequency =
          (100.0 * TotalMispredictsTopN) / NumCalls;

      if (TopNMispredictFrequency < opts::ICPMispredictThreshold) {
        if (opts::Verbosity >= 1) {
          const ptrdiff_t InstIdx = &Inst - &(*BB.begin());
          outs() << "BOLT-INFO: ICP failed in " << *BF << " @ " << InstIdx
                 << " in " << BB.getName() << ", calls = " << NumCalls
                 << ", top N mispredict frequency "
                 << format("%.1f", TopNMispredictFrequency) << "% < "
                 << opts::ICPMispredictThreshold << "%\n";
        }
        return 0;
      }
    }
  }

  // Filter by inline-ability of target functions, stop at first target that
  // can't be inlined.
  if (!IsJumpTable && opts::ICPPeelForInline) {
    for (size_t I = 0; I < N; ++I) {
      const MCSymbol *TargetSym = Targets[I].To.Sym;
      const BinaryFunction *TargetBF = BC.getFunctionForSymbol(TargetSym);
      if (!TargetBF || !BinaryFunctionPass::shouldOptimize(*TargetBF) ||
          getInliningInfo(*TargetBF).Type == InliningType::INL_NONE) {
        N = I;
        break;
      }
    }
  }

  // Filter functions that can have ICP applied (for debugging)
  if (!opts::ICPFuncsList.empty()) {
    for (std::string &Name : opts::ICPFuncsList)
      if (BF->hasName(Name))
        return N;
    return 0;
  }

  return N;
}
1086 
1087 void IndirectCallPromotion::printCallsiteInfo(
1088     const BinaryBasicBlock &BB, const MCInst &Inst,
1089     const std::vector<Callsite> &Targets, const size_t N,
1090     uint64_t NumCalls) const {
1091   BinaryContext &BC = BB.getFunction()->getBinaryContext();
1092   const bool IsTailCall = BC.MIB->isTailCall(Inst);
1093   const bool IsJumpTable = BB.getFunction()->getJumpTable(Inst);
1094   const ptrdiff_t InstIdx = &Inst - &(*BB.begin());
1095 
1096   outs() << "BOLT-INFO: ICP candidate branch info: " << *BB.getFunction()
1097          << " @ " << InstIdx << " in " << BB.getName()
1098          << " -> calls = " << NumCalls
1099          << (IsTailCall ? " (tail)" : (IsJumpTable ? " (jump table)" : ""))
1100          << "\n";
1101   for (size_t I = 0; I < N; I++) {
1102     const double Frequency = 100.0 * Targets[I].Branches / NumCalls;
1103     const double MisFrequency = 100.0 * Targets[I].Mispreds / NumCalls;
1104     outs() << "BOLT-INFO:   ";
1105     if (Targets[I].To.Sym)
1106       outs() << Targets[I].To.Sym->getName();
1107     else
1108       outs() << Targets[I].To.Addr;
1109     outs() << ", calls = " << Targets[I].Branches
1110            << ", mispreds = " << Targets[I].Mispreds
1111            << ", taken freq = " << format("%.1f", Frequency) << "%"
1112            << ", mis. freq = " << format("%.1f", MisFrequency) << "%";
1113     bool First = true;
1114     for (uint64_t JTIndex : Targets[I].JTIndices) {
1115       outs() << (First ? ", indices = " : ", ") << JTIndex;
1116       First = false;
1117     }
1118     outs() << "\n";
1119   }
1120 
1121   LLVM_DEBUG({
1122     dbgs() << "BOLT-INFO: ICP original call instruction:";
1123     BC.printInstruction(dbgs(), Inst, Targets[0].From.Addr, nullptr, true);
1124   });
1125 }
1126 
// Pass driver: selects candidate functions (optionally only those containing
// the hottest callsites), then promotes qualifying indirect calls and jump
// tables, and finally prints aggregate statistics.
void IndirectCallPromotion::runOnFunctions(BinaryContext &BC) {
  if (opts::ICP == ICP_NONE)
    return;

  auto &BFs = BC.getBinaryFunctions();

  const bool OptimizeCalls = (opts::ICP == ICP_CALLS || opts::ICP == ICP_ALL);
  const bool OptimizeJumpTables =
      (opts::ICP == ICP_JUMP_TABLES || opts::ICP == ICP_ALL);

  // Register analysis (fed by the call graph) is only needed on the
  // jump-table path, where it backs the liveness check below.
  std::unique_ptr<RegAnalysis> RA;
  std::unique_ptr<BinaryFunctionCallGraph> CG;
  if (OptimizeJumpTables) {
    CG.reset(new BinaryFunctionCallGraph(buildCallGraph(BC)));
    RA.reset(new RegAnalysis(BC, &BFs, &*CG));
  }

  // If icp-top-callsites is enabled, compute the total number of indirect
  // calls and then optimize the hottest callsites that contribute to that
  // total.
  SetVector<BinaryFunction *> Functions;
  if (opts::ICPTopCallsites == 0) {
    for (auto &KV : BFs)
      Functions.insert(&KV.second);
  } else {
    // (count, instruction, owning function) per indirect callsite.
    using IndirectCallsite = std::tuple<uint64_t, MCInst *, BinaryFunction *>;
    std::vector<IndirectCallsite> IndirectCalls;
    size_t TotalIndirectCalls = 0;

    // Find all the indirect callsites.
    for (auto &BFIt : BFs) {
      BinaryFunction &Function = BFIt.second;

      if (!shouldOptimize(Function))
        continue;

      const bool HasLayout = !Function.getLayout().block_empty();

      for (BinaryBasicBlock &BB : Function) {
        if (HasLayout && Function.isSplit() && BB.isCold())
          continue;

        for (MCInst &Inst : BB) {
          const bool IsJumpTable = Function.getJumpTable(Inst);
          const bool HasIndirectCallProfile =
              BC.MIB->hasAnnotation(Inst, "CallProfile");
          const bool IsDirectCall =
              (BC.MIB->isCall(Inst) && BC.MIB->getTargetSymbol(Inst, 0));

          if (!IsDirectCall &&
              ((HasIndirectCallProfile && !IsJumpTable && OptimizeCalls) ||
               (IsJumpTable && OptimizeJumpTables))) {
            uint64_t NumCalls = 0;
            for (const Callsite &BInfo : getCallTargets(BB, Inst))
              NumCalls += BInfo.Branches;
            IndirectCalls.push_back(
                std::make_tuple(NumCalls, &Inst, &Function));
            TotalIndirectCalls += NumCalls;
          }
        }
      }
    }

    // Sort callsites by execution count.
    // Sorting the reversed view ascending leaves the underlying vector in
    // descending order of (count, inst, function) tuples.
    llvm::sort(reverse(IndirectCalls));

    // Find callsites that contribute to the top "opts::ICPTopCallsites"%
    // number of calls.
    const float TopPerc = opts::ICPTopCallsites / 100.0f;
    int64_t MaxCalls = TotalIndirectCalls * TopPerc;
    uint64_t LastFreq = std::numeric_limits<uint64_t>::max();
    size_t Num = 0;
    for (const IndirectCallsite &IC : IndirectCalls) {
      const uint64_t CurFreq = std::get<0>(IC);
      // Once we decide to stop, include at least all branches that share the
      // same frequency of the last one to avoid non-deterministic behavior
      // (e.g. turning on/off ICP depending on the order of functions)
      if (MaxCalls <= 0 && CurFreq != LastFreq)
        break;
      MaxCalls -= CurFreq;
      LastFreq = CurFreq;
      // Mark the callsite so canPromoteCallsite accepts it later.
      BC.MIB->addAnnotation(*std::get<1>(IC), "DoICP", true);
      Functions.insert(std::get<2>(IC));
      ++Num;
    }
    outs() << "BOLT-INFO: ICP Total indirect calls = " << TotalIndirectCalls
           << ", " << Num << " callsites cover " << opts::ICPTopCallsites
           << "% of all indirect calls\n";
  }

  for (BinaryFunction *FuncPtr : Functions) {
    BinaryFunction &Function = *FuncPtr;

    if (!shouldOptimize(Function))
      continue;

    const bool HasLayout = !Function.getLayout().block_empty();

    // Total number of indirect calls issued from the current Function.
    // (a fraction of TotalIndirectCalls)
    uint64_t FuncTotalIndirectCalls = 0;
    uint64_t FuncTotalIndirectJmps = 0;

    std::vector<BinaryBasicBlock *> BBs;
    for (BinaryBasicBlock &BB : Function) {
      // Skip indirect calls in cold blocks.
      if (!HasLayout || !Function.isSplit() || !BB.isCold())
        BBs.push_back(&BB);
    }
    if (BBs.empty())
      continue;

    // Worklist: blocks split off by a promotion (merge blocks) are pushed
    // back so their tails get scanned too.
    DataflowInfoManager Info(Function, RA.get(), nullptr);
    while (!BBs.empty()) {
      BinaryBasicBlock *BB = BBs.back();
      BBs.pop_back();

      // BB->size() is re-read every iteration on purpose: a successful
      // promotion below rewrites the block's instructions.
      for (unsigned Idx = 0; Idx < BB->size(); ++Idx) {
        MCInst &Inst = BB->getInstructionAtIndex(Idx);
        const ptrdiff_t InstIdx = &Inst - &(*BB->begin());
        const bool IsTailCall = BC.MIB->isTailCall(Inst);
        const bool HasIndirectCallProfile =
            BC.MIB->hasAnnotation(Inst, "CallProfile");
        const bool IsJumpTable = Function.getJumpTable(Inst);

        if (BC.MIB->isCall(Inst))
          TotalCalls += BB->getKnownExecutionCount();

        if (IsJumpTable && !OptimizeJumpTables)
          continue;

        if (!IsJumpTable && (!HasIndirectCallProfile || !OptimizeCalls))
          continue;

        // Ignore direct calls.
        if (BC.MIB->isCall(Inst) && BC.MIB->getTargetSymbol(Inst, 0))
          continue;

        assert((BC.MIB->isCall(Inst) || BC.MIB->isIndirectBranch(Inst)) &&
               "expected a call or an indirect jump instruction");

        if (IsJumpTable)
          ++TotalJumpTableCallsites;
        else
          ++TotalIndirectCallsites;

        std::vector<Callsite> Targets = getCallTargets(*BB, Inst);

        // Compute the total number of calls from this particular callsite.
        uint64_t NumCalls = 0;
        for (const Callsite &BInfo : Targets)
          NumCalls += BInfo.Branches;
        if (!IsJumpTable)
          FuncTotalIndirectCalls += NumCalls;
        else
          FuncTotalIndirectJmps += NumCalls;

        // If FLAGS regs is alive after this jmp site, do not try
        // promoting because we will clobber FLAGS.
        if (IsJumpTable) {
          ErrorOr<const BitVector &> State =
              Info.getLivenessAnalysis().getStateBefore(Inst);
          // Bail when liveness data is missing, or when the flags register
          // is live here. (The inner `State &&` is redundant -- it is always
          // true when `!State` is false -- but harmless.)
          if (!State || (State && (*State)[BC.MIB->getFlagsReg()])) {
            if (opts::Verbosity >= 1)
              outs() << "BOLT-INFO: ICP failed in " << Function << " @ "
                     << InstIdx << " in " << BB->getName()
                     << ", calls = " << NumCalls
                     << (State ? ", cannot clobber flags reg.\n"
                               : ", no liveness data available.\n");
            continue;
          }
        }

        // Should this callsite be optimized?  Return the number of targets
        // to use when promoting this call.  A value of zero means to skip
        // this callsite.
        size_t N = canPromoteCallsite(*BB, Inst, Targets, NumCalls);

        // If it is a jump table and it failed to meet our initial threshold,
        // proceed to findCallTargetSymbols -- it may reevaluate N if
        // memory profile is present
        if (!N && !IsJumpTable)
          continue;

        if (opts::Verbosity >= 1)
          printCallsiteInfo(*BB, Inst, Targets, N, NumCalls);

        // Find MCSymbols or absolute addresses for each call target.
        MCInst *TargetFetchInst = nullptr;
        const SymTargetsType SymTargets =
            findCallTargetSymbols(Targets, N, *BB, Inst, TargetFetchInst);

        // findCallTargetSymbols may have changed N if mem profile is available
        // for jump tables
        if (!N)
          continue;

        LLVM_DEBUG(printDecision(dbgs(), Targets, N));

        // If we can't resolve any of the target symbols, punt on this callsite.
        // TODO: can this ever happen?
        if (SymTargets.size() < N) {
          const size_t LastTarget = SymTargets.size();
          if (opts::Verbosity >= 1)
            outs() << "BOLT-INFO: ICP failed in " << Function << " @ "
                   << InstIdx << " in " << BB->getName()
                   << ", calls = " << NumCalls
                   << ", ICP failed to find target symbol for "
                   << Targets[LastTarget].To.Sym->getName() << "\n";
          continue;
        }

        MethodInfoType MethodInfo;

        if (!IsJumpTable) {
          MethodInfo = maybeGetVtableSyms(*BB, Inst, SymTargets);
          TotalMethodLoadsEliminated += MethodInfo.first.empty() ? 0 : 1;
          LLVM_DEBUG(dbgs()
                     << "BOLT-INFO: ICP "
                     << (!MethodInfo.first.empty() ? "found" : "did not find")
                     << " vtables for all methods.\n");
        } else if (TargetFetchInst) {
          ++TotalIndexBasedJumps;
          MethodInfo.second.push_back(TargetFetchInst);
        }

        // Generate new promoted call code for this callsite.
        MCPlusBuilder::BlocksVectorTy ICPcode =
            (IsJumpTable && !opts::ICPJumpTablesByTarget)
                ? BC.MIB->jumpTablePromotion(Inst, SymTargets,
                                             MethodInfo.second, BC.Ctx.get())
                : BC.MIB->indirectCallPromotion(
                      Inst, SymTargets, MethodInfo.first, MethodInfo.second,
                      opts::ICPOldCodeSequence, BC.Ctx.get());

        if (ICPcode.empty()) {
          if (opts::Verbosity >= 1)
            outs() << "BOLT-INFO: ICP failed in " << Function << " @ "
                   << InstIdx << " in " << BB->getName()
                   << ", calls = " << NumCalls
                   << ", unable to generate promoted call code.\n";
          continue;
        }

        LLVM_DEBUG({
          uint64_t Offset = Targets[0].From.Addr;
          dbgs() << "BOLT-INFO: ICP indirect call code:\n";
          for (const auto &entry : ICPcode) {
            const MCSymbol *const &Sym = entry.first;
            const InstructionListType &Insts = entry.second;
            if (Sym)
              dbgs() << Sym->getName() << ":\n";
            Offset = BC.printInstructions(dbgs(), Insts.begin(), Insts.end(),
                                          Offset);
          }
          dbgs() << "---------------------------------------------------\n";
        });

        // Rewrite the CFG with the newly generated ICP code.
        std::vector<std::unique_ptr<BinaryBasicBlock>> NewBBs =
            rewriteCall(*BB, Inst, std::move(ICPcode), MethodInfo.second);

        // Fix the CFG after inserting the new basic blocks.
        BinaryBasicBlock *MergeBlock =
            fixCFG(*BB, IsTailCall, IsJumpTable, std::move(NewBBs), Targets);

        // Since the tail of the original block was split off and it may contain
        // additional indirect calls, we must add the merge block to the set of
        // blocks to process.
        if (MergeBlock)
          BBs.push_back(MergeBlock);

        if (opts::Verbosity >= 1)
          outs() << "BOLT-INFO: ICP succeeded in " << Function << " @ "
                 << InstIdx << " in " << BB->getName()
                 << " -> calls = " << NumCalls << "\n";

        if (IsJumpTable)
          ++TotalOptimizedJumpTableCallsites;
        else
          ++TotalOptimizedIndirectCallsites;

        Modified.insert(&Function);
      }
    }
    TotalIndirectCalls += FuncTotalIndirectCalls;
    TotalIndirectJmps += FuncTotalIndirectJmps;
  }

  // Final summary. The max(1, ...) clamps guard the percentage computations
  // against division by zero.
  outs() << "BOLT-INFO: ICP total indirect callsites with profile = "
         << TotalIndirectCallsites << "\n"
         << "BOLT-INFO: ICP total jump table callsites = "
         << TotalJumpTableCallsites << "\n"
         << "BOLT-INFO: ICP total number of calls = " << TotalCalls << "\n"
         << "BOLT-INFO: ICP percentage of calls that are indirect = "
         << format("%.1f", (100.0 * TotalIndirectCalls) / TotalCalls) << "%\n"
         << "BOLT-INFO: ICP percentage of indirect calls that can be "
            "optimized = "
         << format("%.1f", (100.0 * TotalNumFrequentCalls) /
                               std::max<size_t>(TotalIndirectCalls, 1))
         << "%\n"
         << "BOLT-INFO: ICP percentage of indirect callsites that are "
            "optimized = "
         << format("%.1f", (100.0 * TotalOptimizedIndirectCallsites) /
                               std::max<uint64_t>(TotalIndirectCallsites, 1))
         << "%\n"
         << "BOLT-INFO: ICP number of method load elimination candidates = "
         << TotalMethodLoadEliminationCandidates << "\n"
         << "BOLT-INFO: ICP percentage of method calls candidates that have "
            "loads eliminated = "
         << format("%.1f", (100.0 * TotalMethodLoadsEliminated) /
                               std::max<uint64_t>(
                                   TotalMethodLoadEliminationCandidates, 1))
         << "%\n"
         << "BOLT-INFO: ICP percentage of indirect branches that are "
            "optimized = "
         << format("%.1f", (100.0 * TotalNumFrequentJmps) /
                               std::max<uint64_t>(TotalIndirectJmps, 1))
         << "%\n"
         << "BOLT-INFO: ICP percentage of jump table callsites that are "
         << "optimized = "
         << format("%.1f", (100.0 * TotalOptimizedJumpTableCallsites) /
                               std::max<uint64_t>(TotalJumpTableCallsites, 1))
         << "%\n"
         << "BOLT-INFO: ICP number of jump table callsites that can use hot "
         << "indices = " << TotalIndexBasedCandidates << "\n"
         << "BOLT-INFO: ICP percentage of jump table callsites that use hot "
            "indices = "
         << format("%.1f", (100.0 * TotalIndexBasedJumps) /
                               std::max<uint64_t>(TotalIndexBasedCandidates, 1))
         << "%\n";

  // Silence the unused warning in release builds, where the sanity check
  // below compiles out.
  (void)verifyProfile;
#ifndef NDEBUG
  verifyProfile(BFs);
#endif
}
1464 
1465 } // namespace bolt
1466 } // namespace llvm
1467