1 //===- bolt/Passes/BinaryPasses.cpp - Binary-level passes -----------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements multiple passes for binary optimization and analysis.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "bolt/Passes/BinaryPasses.h"
14 #include "bolt/Core/FunctionLayout.h"
15 #include "bolt/Core/ParallelUtilities.h"
16 #include "bolt/Passes/ReorderAlgorithm.h"
17 #include "bolt/Passes/ReorderFunctions.h"
18 #include "bolt/Utils/CommandLineOpts.h"
19 #include "llvm/Support/CommandLine.h"
20 #include <atomic>
21 #include <mutex>
22 #include <numeric>
23 #include <vector>
24 
25 #define DEBUG_TYPE "bolt-opts"
26 
27 using namespace llvm;
28 using namespace bolt;
29 
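// Illustrative note (exact strings are defined by DynoStats::Description()):
// a category described as "taken conditional branches" is exposed by the
// helper below as the option value "taken-conditional-branches", which is
// what -print-sorted-by= accepts.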
30 static const char *dynoStatsOptName(const bolt::DynoStats::Category C) {
31   assert(C > bolt::DynoStats::FIRST_DYNO_STAT &&
32          C < DynoStats::LAST_DYNO_STAT && "Unexpected dyno stat category.");
33 
34   static std::string OptNames[bolt::DynoStats::LAST_DYNO_STAT + 1];
35 
36   OptNames[C] = bolt::DynoStats::Description(C);
37 
38   std::replace(OptNames[C].begin(), OptNames[C].end(), ' ', '-');
39 
40   return OptNames[C].c_str();
41 }
42 
43 namespace opts {
44 
45 extern cl::OptionCategory BoltCategory;
46 extern cl::OptionCategory BoltOptCategory;
47 
48 extern cl::opt<unsigned> Verbosity;
49 extern cl::opt<bool> EnableBAT;
50 extern cl::opt<unsigned> ExecutionCountThreshold;
51 extern cl::opt<bool> UpdateDebugSections;
52 extern cl::opt<bolt::ReorderFunctions::ReorderType> ReorderFunctions;
53 
54 enum DynoStatsSortOrder : char {
55   Ascending,
56   Descending
57 };
58 
59 static cl::opt<DynoStatsSortOrder> DynoStatsSortOrderOpt(
60     "print-sorted-by-order",
61     cl::desc("use ascending or descending order when printing functions "
62              "ordered by dyno stats"),
63     cl::init(DynoStatsSortOrder::Descending), cl::cat(BoltOptCategory));
64 
65 cl::list<std::string>
66 HotTextMoveSections("hot-text-move-sections",
67   cl::desc("list of sections containing functions used for hugifying hot text. "
68            "BOLT makes sure these functions are not placed on the same page as "
69            "the hot text. (default=\'.stub,.mover\')."),
70   cl::value_desc("sec1,sec2,sec3,..."),
71   cl::CommaSeparated,
72   cl::ZeroOrMore,
73   cl::cat(BoltCategory));
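// Example invocation (hypothetical binary names; the section list mirrors the
// default documented above):
//   llvm-bolt a.out -o a.bolt -hot-text-move-sections=.stub,.mover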
74 
75 bool isHotTextMover(const BinaryFunction &Function) {
76   for (std::string &SectionName : opts::HotTextMoveSections) {
77     if (Function.getOriginSectionName() &&
78         *Function.getOriginSectionName() == SectionName)
79       return true;
80   }
81 
82   return false;
83 }
84 
85 static cl::opt<bool> MinBranchClusters(
86     "min-branch-clusters",
87     cl::desc("use a modified clustering algorithm geared towards minimizing "
88              "branches"),
89     cl::Hidden, cl::cat(BoltOptCategory));
90 
91 static cl::list<Peepholes::PeepholeOpts> Peepholes(
92     "peepholes", cl::CommaSeparated, cl::desc("enable peephole optimizations"),
93     cl::value_desc("opt1,opt2,opt3,..."),
94     cl::values(clEnumValN(Peepholes::PEEP_NONE, "none", "disable peepholes"),
95                clEnumValN(Peepholes::PEEP_DOUBLE_JUMPS, "double-jumps",
96                           "remove double jumps when able"),
97                clEnumValN(Peepholes::PEEP_TAILCALL_TRAPS, "tailcall-traps",
98                           "insert tail call traps"),
99                clEnumValN(Peepholes::PEEP_USELESS_BRANCHES, "useless-branches",
100                           "remove useless conditional branches"),
101                clEnumValN(Peepholes::PEEP_ALL, "all",
102                           "enable all peephole optimizations")),
103     cl::ZeroOrMore, cl::cat(BoltOptCategory));
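// Example invocation (hypothetical binary names): enable a subset of the
// peepholes listed above, e.g.
//   llvm-bolt a.out -o a.bolt -peepholes=double-jumps,useless-branches
// or all of them with -peepholes=all.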
104 
105 static cl::opt<unsigned>
106     PrintFuncStat("print-function-statistics",
107                   cl::desc("print statistics about basic block ordering"),
108                   cl::init(0), cl::cat(BoltOptCategory));
109 
110 static cl::opt<bool> PrintLargeFunctions(
111     "print-large-functions",
112     cl::desc("print functions that could not be overwritten due to excessive "
113              "size"),
114     cl::init(false), cl::cat(BoltOptCategory));
115 
116 static cl::list<bolt::DynoStats::Category>
117     PrintSortedBy("print-sorted-by", cl::CommaSeparated,
118                   cl::desc("print functions sorted by order of dyno stats"),
119                   cl::value_desc("key1,key2,key3,..."),
120                   cl::values(
121 #define D(name, description, ...)                                              \
122   clEnumValN(bolt::DynoStats::name, dynoStatsOptName(bolt::DynoStats::name),   \
123              description),
124                       REAL_DYNO_STATS
125 #undef D
126                           clEnumValN(bolt::DynoStats::LAST_DYNO_STAT, "all",
127                                      "sorted by all names")),
128                   cl::ZeroOrMore, cl::cat(BoltOptCategory));
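// Example invocation (hypothetical binary names): rank functions by every
// dyno stat with
//   llvm-bolt a.out -o a.bolt -print-sorted-by=all
// Individual keys use the dash-separated names produced by dynoStatsOptName().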
129 
130 static cl::opt<bool>
131     PrintUnknown("print-unknown",
132                  cl::desc("print names of functions with unknown control flow"),
133                  cl::cat(BoltCategory), cl::Hidden);
134 
135 static cl::opt<bool>
136     PrintUnknownCFG("print-unknown-cfg",
137                     cl::desc("dump CFG of functions with unknown control flow"),
138                     cl::cat(BoltCategory), cl::ReallyHidden);
139 
140 // Please MSVC19 with a forward declaration: otherwise it reports an error about
141 // an undeclared variable inside a callback.
142 extern cl::opt<bolt::ReorderBasicBlocks::LayoutType> ReorderBlocks;
143 cl::opt<bolt::ReorderBasicBlocks::LayoutType> ReorderBlocks(
144     "reorder-blocks", cl::desc("change layout of basic blocks in a function"),
145     cl::init(bolt::ReorderBasicBlocks::LT_NONE),
146     cl::values(
147         clEnumValN(bolt::ReorderBasicBlocks::LT_NONE, "none",
148                    "do not reorder basic blocks"),
149         clEnumValN(bolt::ReorderBasicBlocks::LT_REVERSE, "reverse",
150                    "layout blocks in reverse order"),
151         clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE, "normal",
152                    "perform optimal layout based on profile"),
153         clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_BRANCH,
154                    "branch-predictor",
155                    "perform optimal layout prioritizing branch "
156                    "predictions"),
157         clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_CACHE, "cache",
158                    "perform optimal layout prioritizing I-cache "
159                    "behavior"),
160         clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_CACHE_PLUS, "cache+",
161                    "perform layout optimizing I-cache behavior"),
162         clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_EXT_TSP, "ext-tsp",
163                    "perform layout optimizing I-cache behavior"),
164         clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_SHUFFLE,
165                    "cluster-shuffle", "perform random layout of clusters")),
166     cl::ZeroOrMore, cl::cat(BoltOptCategory),
167     cl::callback([](const bolt::ReorderBasicBlocks::LayoutType &option) {
168       if (option == bolt::ReorderBasicBlocks::LT_OPTIMIZE_CACHE_PLUS) {
169         errs() << "BOLT-WARNING: '-reorder-blocks=cache+' is deprecated, please"
170                << " use '-reorder-blocks=ext-tsp' instead\n";
171         ReorderBlocks = bolt::ReorderBasicBlocks::LT_OPTIMIZE_EXT_TSP;
172       }
173     }));
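// Example invocation (hypothetical binary names): the layout recommended by
// the deprecation warning above is ext-tsp, e.g.
//   llvm-bolt a.out -o a.bolt -reorder-blocks=ext-tsp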
174 
175 static cl::opt<unsigned> ReportBadLayout(
176     "report-bad-layout",
177     cl::desc("print top <uint> functions with suboptimal code layout on input"),
178     cl::init(0), cl::Hidden, cl::cat(BoltOptCategory));
179 
180 static cl::opt<bool>
181     ReportStaleFuncs("report-stale",
182                      cl::desc("print the list of functions with stale profile"),
183                      cl::Hidden, cl::cat(BoltOptCategory));
184 
185 enum SctcModes : char {
186   SctcAlways,
187   SctcPreserveDirection,
188   SctcHeuristic
189 };
190 
191 static cl::opt<SctcModes>
192 SctcMode("sctc-mode",
193   cl::desc("mode for simplify conditional tail calls"),
194   cl::init(SctcAlways),
195   cl::values(clEnumValN(SctcAlways, "always", "always perform sctc"),
196     clEnumValN(SctcPreserveDirection,
197       "preserve",
198       "only perform sctc when branch direction is "
199       "preserved"),
200     clEnumValN(SctcHeuristic,
201       "heuristic",
202       "use branch prediction data to control sctc")),
203   cl::ZeroOrMore,
204   cl::cat(BoltOptCategory));
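// Example invocation (hypothetical binary names; this option only selects the
// mode, the SCTC pass itself must be enabled elsewhere):
//   llvm-bolt a.out -o a.bolt -sctc-mode=preserve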
205 
206 static cl::opt<unsigned>
207 StaleThreshold("stale-threshold",
208     cl::desc(
209       "maximum percentage of stale functions to tolerate (default: 100)"),
210     cl::init(100),
211     cl::Hidden,
212     cl::cat(BoltOptCategory));
213 
214 static cl::opt<unsigned> TSPThreshold(
215     "tsp-threshold",
216     cl::desc(
217         "maximum number of hot basic blocks in a function for which to use "
218         "a precise TSP solution while re-ordering basic blocks"),
219     cl::init(10), cl::Hidden, cl::cat(BoltOptCategory));
220 
221 static cl::opt<unsigned> TopCalledLimit(
222     "top-called-limit",
223     cl::desc("maximum number of functions to print in top called "
224              "functions section"),
225     cl::init(100), cl::Hidden, cl::cat(BoltCategory));
226 
227 // Profile density options, synced with llvm-profgen/ProfileGenerator.cpp
228 static cl::opt<int> ProfileDensityCutOffHot(
229     "profile-density-cutoff-hot", cl::init(990000),
230     cl::desc("Total samples cutoff for functions used to calculate "
231              "profile density."));
232 
233 static cl::opt<double> ProfileDensityThreshold(
234     "profile-density-threshold", cl::init(60),
235     cl::desc("If the profile density is below the given threshold, it "
236              "will be suggested to increase the sampling rate."),
237     cl::Optional);
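// Example invocation (hypothetical binary/profile names): tighten the density
// check that is reported later in this file with
//   llvm-bolt a.out -o a.bolt -data=perf.fdata -profile-density-threshold=100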
238 
239 } // namespace opts
240 
241 namespace llvm {
242 namespace bolt {
243 
244 bool BinaryFunctionPass::shouldOptimize(const BinaryFunction &BF) const {
245   return BF.isSimple() && BF.getState() == BinaryFunction::State::CFG &&
246          !BF.isIgnored();
247 }
248 
249 bool BinaryFunctionPass::shouldPrint(const BinaryFunction &BF) const {
250   return BF.isSimple() && !BF.isIgnored();
251 }
252 
253 void NormalizeCFG::runOnFunction(BinaryFunction &BF) {
254   uint64_t NumRemoved = 0;
255   uint64_t NumDuplicateEdges = 0;
256   uint64_t NeedsFixBranches = 0;
257   for (BinaryBasicBlock &BB : BF) {
258     if (!BB.empty())
259       continue;
260 
261     if (BB.isEntryPoint() || BB.isLandingPad())
262       continue;
263 
264     // Handle a dangling empty block.
265     if (BB.succ_size() == 0) {
266       // If an empty dangling basic block has a predecessor, it could be a
267       // result of codegen for __builtin_unreachable. In such case, do not
268       // remove the block.
269       if (BB.pred_size() == 0) {
270         BB.markValid(false);
271         ++NumRemoved;
272       }
273       continue;
274     }
275 
276     // The block should have just one successor.
277     BinaryBasicBlock *Successor = BB.getSuccessor();
278     assert(Successor && "invalid CFG encountered");
279 
280     // Redirect all predecessors to the successor block.
281     while (!BB.pred_empty()) {
282       BinaryBasicBlock *Predecessor = *BB.pred_begin();
283       if (Predecessor->hasJumpTable())
284         break;
285 
286       if (Predecessor == Successor)
287         break;
288 
289       BinaryBasicBlock::BinaryBranchInfo &BI = Predecessor->getBranchInfo(BB);
290       Predecessor->replaceSuccessor(&BB, Successor, BI.Count,
291                                     BI.MispredictedCount);
292       // We need to fix branches even if we failed to replace all successors
293       // and remove the block.
294       NeedsFixBranches = true;
295     }
296 
297     if (BB.pred_empty()) {
298       BB.removeAllSuccessors();
299       BB.markValid(false);
300       ++NumRemoved;
301     }
302   }
303 
304   if (NumRemoved)
305     BF.eraseInvalidBBs();
306 
307   // Check for duplicate successors. Do it after the empty block elimination as
308   // we can get more duplicate successors.
309   for (BinaryBasicBlock &BB : BF)
310     if (!BB.hasJumpTable() && BB.succ_size() == 2 &&
311         BB.getConditionalSuccessor(false) == BB.getConditionalSuccessor(true))
312       ++NumDuplicateEdges;
313 
314   // fixBranches() will get rid of duplicate edges and update jump instructions.
315   if (NumDuplicateEdges || NeedsFixBranches)
316     BF.fixBranches();
317 
318   NumDuplicateEdgesMerged += NumDuplicateEdges;
319   NumBlocksRemoved += NumRemoved;
320 }
321 
322 Error NormalizeCFG::runOnFunctions(BinaryContext &BC) {
323   ParallelUtilities::runOnEachFunction(
324       BC, ParallelUtilities::SchedulingPolicy::SP_BB_LINEAR,
325       [&](BinaryFunction &BF) { runOnFunction(BF); },
326       [&](const BinaryFunction &BF) { return !shouldOptimize(BF); },
327       "NormalizeCFG");
328   if (NumBlocksRemoved)
329     BC.outs() << "BOLT-INFO: removed " << NumBlocksRemoved << " empty block"
330               << (NumBlocksRemoved == 1 ? "" : "s") << '\n';
331   if (NumDuplicateEdgesMerged)
332     BC.outs() << "BOLT-INFO: merged " << NumDuplicateEdgesMerged
333               << " duplicate CFG edge"
334               << (NumDuplicateEdgesMerged == 1 ? "" : "s") << '\n';
335   return Error::success();
336 }
337 
338 void EliminateUnreachableBlocks::runOnFunction(BinaryFunction &Function) {
339   BinaryContext &BC = Function.getBinaryContext();
340   unsigned Count;
341   uint64_t Bytes;
342   Function.markUnreachableBlocks();
343   LLVM_DEBUG({
344     for (BinaryBasicBlock &BB : Function) {
345       if (!BB.isValid()) {
346         dbgs() << "BOLT-INFO: UCE found unreachable block " << BB.getName()
347                << " in function " << Function << "\n";
348         Function.dump();
349       }
350     }
351   });
352   BinaryContext::IndependentCodeEmitter Emitter =
353       BC.createIndependentMCCodeEmitter();
354   std::tie(Count, Bytes) = Function.eraseInvalidBBs(Emitter.MCE.get());
355   DeletedBlocks += Count;
356   DeletedBytes += Bytes;
357   if (Count) {
358     auto L = BC.scopeLock();
359     Modified.insert(&Function);
360     if (opts::Verbosity > 0)
361       BC.outs() << "BOLT-INFO: removed " << Count
362                 << " dead basic block(s) accounting for " << Bytes
363                 << " bytes in function " << Function << '\n';
364   }
365 }
366 
367 Error EliminateUnreachableBlocks::runOnFunctions(BinaryContext &BC) {
368   ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
369     runOnFunction(BF);
370   };
371 
372   ParallelUtilities::PredicateTy SkipPredicate = [&](const BinaryFunction &BF) {
373     return !shouldOptimize(BF) || BF.getLayout().block_empty();
374   };
375 
376   ParallelUtilities::runOnEachFunction(
377       BC, ParallelUtilities::SchedulingPolicy::SP_CONSTANT, WorkFun,
378       SkipPredicate, "eliminate-unreachable");
379 
380   if (DeletedBlocks)
381     BC.outs() << "BOLT-INFO: UCE removed " << DeletedBlocks << " blocks and "
382               << DeletedBytes << " bytes of code\n";
383   return Error::success();
384 }
385 
386 bool ReorderBasicBlocks::shouldPrint(const BinaryFunction &BF) const {
387   return (BinaryFunctionPass::shouldPrint(BF) &&
388           opts::ReorderBlocks != ReorderBasicBlocks::LT_NONE);
389 }
390 
391 bool ReorderBasicBlocks::shouldOptimize(const BinaryFunction &BF) const {
392   // Apply execution count threshold
393   if (BF.getKnownExecutionCount() < opts::ExecutionCountThreshold)
394     return false;
395 
396   return BinaryFunctionPass::shouldOptimize(BF);
397 }
398 
399 Error ReorderBasicBlocks::runOnFunctions(BinaryContext &BC) {
400   if (opts::ReorderBlocks == ReorderBasicBlocks::LT_NONE)
401     return Error::success();
402 
403   std::atomic_uint64_t ModifiedFuncCount(0);
404   std::mutex FunctionEditDistanceMutex;
405   DenseMap<const BinaryFunction *, uint64_t> FunctionEditDistance;
406 
407   ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
408     SmallVector<const BinaryBasicBlock *, 0> OldBlockOrder;
409     if (opts::PrintFuncStat > 0)
410       llvm::copy(BF.getLayout().blocks(), std::back_inserter(OldBlockOrder));
411 
412     const bool LayoutChanged =
413         modifyFunctionLayout(BF, opts::ReorderBlocks, opts::MinBranchClusters);
414     if (LayoutChanged) {
415       ModifiedFuncCount.fetch_add(1, std::memory_order_relaxed);
416       if (opts::PrintFuncStat > 0) {
417         const uint64_t Distance = BF.getLayout().getEditDistance(OldBlockOrder);
418         std::lock_guard<std::mutex> Lock(FunctionEditDistanceMutex);
419         FunctionEditDistance[&BF] = Distance;
420       }
421     }
422   };
423 
424   ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
425     return !shouldOptimize(BF);
426   };
427 
428   ParallelUtilities::runOnEachFunction(
429       BC, ParallelUtilities::SchedulingPolicy::SP_BB_LINEAR, WorkFun, SkipFunc,
430       "ReorderBasicBlocks");
431   const size_t NumAllProfiledFunctions =
432       BC.NumProfiledFuncs + BC.NumStaleProfileFuncs;
433 
434   BC.outs() << "BOLT-INFO: basic block reordering modified layout of "
435             << format(
436                    "%zu functions (%.2lf%% of profiled, %.2lf%% of total)\n",
437                    ModifiedFuncCount.load(std::memory_order_relaxed),
438                    100.0 * ModifiedFuncCount.load(std::memory_order_relaxed) /
439                        NumAllProfiledFunctions,
440                    100.0 * ModifiedFuncCount.load(std::memory_order_relaxed) /
441                        BC.getBinaryFunctions().size());
442 
443   if (opts::PrintFuncStat > 0) {
444     raw_ostream &OS = BC.outs();
445     // Collect functions keyed by score so they can be visited in sorted order.
446     std::map<uint64_t, BinaryFunction &> ScoreMap;
447     auto &BFs = BC.getBinaryFunctions();
448     for (auto It = BFs.begin(); It != BFs.end(); ++It)
449       ScoreMap.insert(std::pair<uint64_t, BinaryFunction &>(
450           It->second.getFunctionScore(), It->second));
451 
452     OS << "\nBOLT-INFO: Printing Function Statistics:\n\n";
453     OS << "           There are " << BFs.size() << " functions in total. \n";
454     OS << "           Number of functions being modified: "
455        << ModifiedFuncCount.load(std::memory_order_relaxed) << "\n";
456     OS << "           User asks for detailed information on top "
457        << opts::PrintFuncStat << " functions. (Ranked by function score)"
458        << "\n\n";
459     uint64_t I = 0;
460     for (std::map<uint64_t, BinaryFunction &>::reverse_iterator Rit =
461              ScoreMap.rbegin();
462          Rit != ScoreMap.rend() && I < opts::PrintFuncStat; ++Rit, ++I) {
463       BinaryFunction &Function = Rit->second;
464 
465       OS << "           Information for function of top: " << (I + 1) << ": \n";
466       OS << "             Function Score is: " << Function.getFunctionScore()
467          << "\n";
468       OS << "             There are " << Function.size()
469          << " number of blocks in this function.\n";
470       OS << "             There are " << Function.getInstructionCount()
471          << " number of instructions in this function.\n";
472       OS << "             The edit distance for this function is: "
473          << FunctionEditDistance.lookup(&Function) << "\n\n";
474     }
475   }
476   return Error::success();
477 }
478 
479 bool ReorderBasicBlocks::modifyFunctionLayout(BinaryFunction &BF,
480                                               LayoutType Type,
481                                               bool MinBranchClusters) const {
482   if (BF.size() == 0 || Type == LT_NONE)
483     return false;
484 
485   BinaryFunction::BasicBlockOrderType NewLayout;
486   std::unique_ptr<ReorderAlgorithm> Algo;
487 
488   // Cannot do optimal layout without profile.
489   if (Type != LT_REVERSE && !BF.hasValidProfile())
490     return false;
491 
492   if (Type == LT_REVERSE) {
493     Algo.reset(new ReverseReorderAlgorithm());
494   } else if (BF.size() <= opts::TSPThreshold && Type != LT_OPTIMIZE_SHUFFLE) {
495     // Work on optimal solution if problem is small enough
496     LLVM_DEBUG(dbgs() << "finding optimal block layout for " << BF << "\n");
497     Algo.reset(new TSPReorderAlgorithm());
498   } else {
499     LLVM_DEBUG(dbgs() << "running block layout heuristics on " << BF << "\n");
500 
501     std::unique_ptr<ClusterAlgorithm> CAlgo;
502     if (MinBranchClusters)
503       CAlgo.reset(new MinBranchGreedyClusterAlgorithm());
504     else
505       CAlgo.reset(new PHGreedyClusterAlgorithm());
506 
507     switch (Type) {
508     case LT_OPTIMIZE:
509       Algo.reset(new OptimizeReorderAlgorithm(std::move(CAlgo)));
510       break;
511 
512     case LT_OPTIMIZE_BRANCH:
513       Algo.reset(new OptimizeBranchReorderAlgorithm(std::move(CAlgo)));
514       break;
515 
516     case LT_OPTIMIZE_CACHE:
517       Algo.reset(new OptimizeCacheReorderAlgorithm(std::move(CAlgo)));
518       break;
519 
520     case LT_OPTIMIZE_EXT_TSP:
521       Algo.reset(new ExtTSPReorderAlgorithm());
522       break;
523 
524     case LT_OPTIMIZE_SHUFFLE:
525       Algo.reset(new RandomClusterReorderAlgorithm(std::move(CAlgo)));
526       break;
527 
528     default:
529       llvm_unreachable("unexpected layout type");
530     }
531   }
532 
533   Algo->reorderBasicBlocks(BF, NewLayout);
534 
535   return BF.getLayout().update(NewLayout);
536 }
537 
538 Error FixupBranches::runOnFunctions(BinaryContext &BC) {
539   for (auto &It : BC.getBinaryFunctions()) {
540     BinaryFunction &Function = It.second;
541     if (!BC.shouldEmit(Function) || !Function.isSimple())
542       continue;
543 
544     Function.fixBranches();
545   }
546   return Error::success();
547 }
548 
549 Error FinalizeFunctions::runOnFunctions(BinaryContext &BC) {
550   std::atomic<bool> HasFatal{false};
551   ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
552     if (!BF.finalizeCFIState()) {
553       if (BC.HasRelocations) {
554         BC.errs() << "BOLT-ERROR: unable to fix CFI state for function " << BF
555                   << ". Exiting.\n";
556         HasFatal = true;
557         return;
558       }
559       BF.setSimple(false);
560       return;
561     }
562 
563     BF.setFinalized();
564 
565     // Update exception handling information.
566     BF.updateEHRanges();
567   };
568 
569   ParallelUtilities::PredicateTy SkipPredicate = [&](const BinaryFunction &BF) {
570     return !BC.shouldEmit(BF);
571   };
572 
573   ParallelUtilities::runOnEachFunction(
574       BC, ParallelUtilities::SchedulingPolicy::SP_CONSTANT, WorkFun,
575       SkipPredicate, "FinalizeFunctions");
576   if (HasFatal)
577     return createFatalBOLTError("finalize CFI state failure");
578   return Error::success();
579 }
580 
581 Error CheckLargeFunctions::runOnFunctions(BinaryContext &BC) {
582   if (BC.HasRelocations)
583     return Error::success();
584 
585   // If the function wouldn't fit, mark it as non-simple. Otherwise, we may emit
586   // incorrect metadata.
587   ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
588     uint64_t HotSize, ColdSize;
589     std::tie(HotSize, ColdSize) =
590         BC.calculateEmittedSize(BF, /*FixBranches=*/false);
591     uint64_t MainFragmentSize = HotSize;
592     if (BF.hasIslandsInfo()) {
593       MainFragmentSize +=
594           offsetToAlignment(BF.getAddress() + MainFragmentSize,
595                             Align(BF.getConstantIslandAlignment()));
596       MainFragmentSize += BF.estimateConstantIslandSize();
597     }
598     if (MainFragmentSize > BF.getMaxSize()) {
599       if (opts::PrintLargeFunctions)
600         BC.outs() << "BOLT-INFO: " << BF << " size of " << MainFragmentSize
601                   << " bytes exceeds allocated space by "
602                   << (MainFragmentSize - BF.getMaxSize()) << " bytes\n";
603       BF.setSimple(false);
604     }
605   };
606 
607   ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
608     return !shouldOptimize(BF);
609   };
610 
611   ParallelUtilities::runOnEachFunction(
612       BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, WorkFun,
613       SkipFunc, "CheckLargeFunctions");
614 
615   return Error::success();
616 }
617 
618 bool CheckLargeFunctions::shouldOptimize(const BinaryFunction &BF) const {
619   // Unlike other passes, allow functions in non-CFG state.
620   return BF.isSimple() && !BF.isIgnored();
621 }
622 
623 Error LowerAnnotations::runOnFunctions(BinaryContext &BC) {
624   // Convert GnuArgsSize annotations into CFIs.
625   for (BinaryFunction *BF : BC.getAllBinaryFunctions()) {
626     for (FunctionFragment &FF : BF->getLayout().fragments()) {
627       // Reset at the start of the new fragment.
628       int64_t CurrentGnuArgsSize = 0;
629 
630       for (BinaryBasicBlock *const BB : FF) {
631         for (auto II = BB->begin(); II != BB->end(); ++II) {
632           if (!BF->usesGnuArgsSize() || !BC.MIB->isInvoke(*II))
633             continue;
634 
635           const int64_t NewGnuArgsSize = BC.MIB->getGnuArgsSize(*II);
636           assert(NewGnuArgsSize >= 0 && "Expected non-negative GNU_args_size.");
637           if (NewGnuArgsSize == CurrentGnuArgsSize)
638             continue;
639 
640           auto InsertII = BF->addCFIInstruction(
641               BB, II,
642               MCCFIInstruction::createGnuArgsSize(nullptr, NewGnuArgsSize));
643           CurrentGnuArgsSize = NewGnuArgsSize;
644           II = std::next(InsertII);
645         }
646       }
647     }
648   }
649   return Error::success();
650 }
651 
652 // Check for dirty state in MCSymbol objects that might be a consequence
653 // of running calculateEmittedSize() in parallel, during split functions
654 // pass. If an inconsistent state is found (symbol already registered or
655 // already defined), clean it.
656 Error CleanMCState::runOnFunctions(BinaryContext &BC) {
657   MCContext &Ctx = *BC.Ctx;
658   for (const auto &SymMapEntry : Ctx.getSymbols()) {
659     const MCSymbol *S = SymMapEntry.getValue().Symbol;
660     if (!S)
661       continue;
662     if (S->isDefined()) {
663       LLVM_DEBUG(dbgs() << "BOLT-DEBUG: Symbol \"" << S->getName()
664                         << "\" is already defined\n");
665       const_cast<MCSymbol *>(S)->setUndefined();
666     }
667     if (S->isRegistered()) {
668       LLVM_DEBUG(dbgs() << "BOLT-DEBUG: Symbol \"" << S->getName()
669                         << "\" is already registered\n");
670       const_cast<MCSymbol *>(S)->setIsRegistered(false);
671     }
672     LLVM_DEBUG(if (S->isVariable()) {
673       dbgs() << "BOLT-DEBUG: Symbol \"" << S->getName() << "\" is variable\n";
674     });
675   }
676   return Error::success();
677 }
678 
679 // This peephole fixes jump instructions that jump to another basic
680 // block with a single jump instruction, e.g.
681 //
682 // B0: ...
683 //     jmp  B1   (or jcc B1)
684 //
685 // B1: jmp  B2
686 //
687 // ->
688 //
689 // B0: ...
690 //     jmp  B2   (or jcc B2)
691 //
692 static uint64_t fixDoubleJumps(BinaryFunction &Function, bool MarkInvalid) {
693   uint64_t NumDoubleJumps = 0;
694 
695   MCContext *Ctx = Function.getBinaryContext().Ctx.get();
696   MCPlusBuilder *MIB = Function.getBinaryContext().MIB.get();
697   for (BinaryBasicBlock &BB : Function) {
698     auto checkAndPatch = [&](BinaryBasicBlock *Pred, BinaryBasicBlock *Succ,
699                              const MCSymbol *SuccSym,
700                              std::optional<uint32_t> Offset) {
701       // Ignore infinite loop jumps or fallthrough tail jumps.
702       if (Pred == Succ || Succ == &BB)
703         return false;
704 
705       if (Succ) {
706         const MCSymbol *TBB = nullptr;
707         const MCSymbol *FBB = nullptr;
708         MCInst *CondBranch = nullptr;
709         MCInst *UncondBranch = nullptr;
710         bool Res = Pred->analyzeBranch(TBB, FBB, CondBranch, UncondBranch);
711         if (!Res) {
712           LLVM_DEBUG(dbgs() << "analyzeBranch failed in peepholes in block:\n";
713                      Pred->dump());
714           return false;
715         }
716         Pred->replaceSuccessor(&BB, Succ);
717 
718         // We must patch up any existing branch instructions to match up
719         // with the new successor.
720         assert((CondBranch || (!CondBranch && Pred->succ_size() == 1)) &&
721                "Predecessor block has inconsistent number of successors");
722         if (CondBranch && MIB->getTargetSymbol(*CondBranch) == BB.getLabel()) {
723           MIB->replaceBranchTarget(*CondBranch, Succ->getLabel(), Ctx);
724         } else if (UncondBranch &&
725                    MIB->getTargetSymbol(*UncondBranch) == BB.getLabel()) {
726           MIB->replaceBranchTarget(*UncondBranch, Succ->getLabel(), Ctx);
727         } else if (!UncondBranch) {
728           assert(Function.getLayout().getBasicBlockAfter(Pred, false) != Succ &&
729                  "Don't add an explicit jump to a fallthrough block.");
730           Pred->addBranchInstruction(Succ);
731         }
732       } else {
733         // Succ will be null in the tail call case.  In this case we
734         // need to explicitly add a tail call instruction.
735         MCInst *Branch = Pred->getLastNonPseudoInstr();
736         if (Branch && MIB->isUnconditionalBranch(*Branch)) {
737           assert(MIB->getTargetSymbol(*Branch) == BB.getLabel());
738           Pred->removeSuccessor(&BB);
739           Pred->eraseInstruction(Pred->findInstruction(Branch));
740           Pred->addTailCallInstruction(SuccSym);
741           if (Offset) {
742             MCInst *TailCall = Pred->getLastNonPseudoInstr();
743             assert(TailCall);
744             MIB->setOffset(*TailCall, *Offset);
745           }
746         } else {
747           return false;
748         }
749       }
750 
751       ++NumDoubleJumps;
752       LLVM_DEBUG(dbgs() << "Removed double jump in " << Function << " from "
753                         << Pred->getName() << " -> " << BB.getName() << " to "
754                         << Pred->getName() << " -> " << SuccSym->getName()
755                         << (!Succ ? " (tail)\n" : "\n"));
756 
757       return true;
758     };
759 
760     if (BB.getNumNonPseudos() != 1 || BB.isLandingPad())
761       continue;
762 
763     MCInst *Inst = BB.getFirstNonPseudoInstr();
764     const bool IsTailCall = MIB->isTailCall(*Inst);
765 
766     if (!MIB->isUnconditionalBranch(*Inst) && !IsTailCall)
767       continue;
768 
769     // If we operate after SCTC make sure it's not a conditional tail call.
770     if (IsTailCall && MIB->isConditionalBranch(*Inst))
771       continue;
772 
773     const MCSymbol *SuccSym = MIB->getTargetSymbol(*Inst);
774     BinaryBasicBlock *Succ = BB.getSuccessor();
775 
776     if (((!Succ || &BB == Succ) && !IsTailCall) || (IsTailCall && !SuccSym))
777       continue;
778 
779     std::vector<BinaryBasicBlock *> Preds = {BB.pred_begin(), BB.pred_end()};
780 
781     for (BinaryBasicBlock *Pred : Preds) {
782       if (Pred->isLandingPad())
783         continue;
784 
785       if (Pred->getSuccessor() == &BB ||
786           (Pred->getConditionalSuccessor(true) == &BB && !IsTailCall) ||
787           Pred->getConditionalSuccessor(false) == &BB)
788         if (checkAndPatch(Pred, Succ, SuccSym, MIB->getOffset(*Inst)) &&
789             MarkInvalid)
790           BB.markValid(BB.pred_size() != 0 || BB.isLandingPad() ||
791                        BB.isEntryPoint());
792     }
793   }
794 
795   return NumDoubleJumps;
796 }
797 
798 bool SimplifyConditionalTailCalls::shouldRewriteBranch(
799     const BinaryBasicBlock *PredBB, const MCInst &CondBranch,
800     const BinaryBasicBlock *BB, const bool DirectionFlag) {
801   if (BeenOptimized.count(PredBB))
802     return false;
803 
804   const bool IsForward = BinaryFunction::isForwardBranch(PredBB, BB);
805 
806   if (IsForward)
807     ++NumOrigForwardBranches;
808   else
809     ++NumOrigBackwardBranches;
810 
811   if (opts::SctcMode == opts::SctcAlways)
812     return true;
813 
814   if (opts::SctcMode == opts::SctcPreserveDirection)
815     return IsForward == DirectionFlag;
816 
817   const ErrorOr<std::pair<double, double>> Frequency =
818       PredBB->getBranchStats(BB);
819 
820   // It's ok to rewrite the conditional branch if the new target will be
821   // a backward branch.
822 
823   // If no data available for these branches, then it should be ok to
824   // do the optimization since it will reduce code size.
825   if (Frequency.getError())
826     return true;
827 
828   // TODO: should this use misprediction frequency instead?
829   const bool Result = (IsForward && Frequency.get().first >= 0.5) ||
830                       (!IsForward && Frequency.get().first <= 0.5);
831 
832   return Result == DirectionFlag;
833 }
834 
835 uint64_t SimplifyConditionalTailCalls::fixTailCalls(BinaryFunction &BF) {
836   // Need updated indices to correctly detect branch direction.
837   BF.getLayout().updateLayoutIndices();
838   BF.markUnreachableBlocks();
839 
840   MCPlusBuilder *MIB = BF.getBinaryContext().MIB.get();
841   MCContext *Ctx = BF.getBinaryContext().Ctx.get();
842   uint64_t NumLocalCTCCandidates = 0;
843   uint64_t NumLocalCTCs = 0;
844   uint64_t LocalCTCTakenCount = 0;
845   uint64_t LocalCTCExecCount = 0;
846   std::vector<std::pair<BinaryBasicBlock *, const BinaryBasicBlock *>>
847       NeedsUncondBranch;
848 
849   // Will block be deleted by UCE?
850   auto isValid = [](const BinaryBasicBlock *BB) {
851     return (BB->pred_size() != 0 || BB->isLandingPad() || BB->isEntryPoint());
852   };
853 
854   for (BinaryBasicBlock *BB : BF.getLayout().blocks()) {
855     // Locate BB with a single direct tail-call instruction.
856     if (BB->getNumNonPseudos() != 1)
857       continue;
858 
859     MCInst *Instr = BB->getFirstNonPseudoInstr();
860     if (!MIB->isTailCall(*Instr) || MIB->isConditionalBranch(*Instr))
861       continue;
862 
863     const MCSymbol *CalleeSymbol = MIB->getTargetSymbol(*Instr);
864     if (!CalleeSymbol)
865       continue;
866 
867     // Detect direction of the possible conditional tail call.
868     const bool IsForwardCTC = BF.isForwardCall(CalleeSymbol);
869 
870     // Iterate through all predecessors.
871     for (BinaryBasicBlock *PredBB : BB->predecessors()) {
872       BinaryBasicBlock *CondSucc = PredBB->getConditionalSuccessor(true);
873       if (!CondSucc)
874         continue;
875 
876       ++NumLocalCTCCandidates;
877 
878       const MCSymbol *TBB = nullptr;
879       const MCSymbol *FBB = nullptr;
880       MCInst *CondBranch = nullptr;
881       MCInst *UncondBranch = nullptr;
882       bool Result = PredBB->analyzeBranch(TBB, FBB, CondBranch, UncondBranch);
883 
884       // analyzeBranch() can fail due to unusual branch instructions, e.g. jrcxz
885       if (!Result) {
886         LLVM_DEBUG(dbgs() << "analyzeBranch failed in SCTC in block:\n";
887                    PredBB->dump());
888         continue;
889       }
890 
891       assert(Result && "internal error analyzing conditional branch");
892       assert(CondBranch && "conditional branch expected");
893 
894       // Skip dynamic branches for now.
895       if (BF.getBinaryContext().MIB->isDynamicBranch(*CondBranch))
896         continue;
897 
898       // It's possible that PredBB is also a successor to BB that may have
899       // been processed by a previous iteration of the SCTC loop, in which
900       // case it may have been marked invalid.  We should skip rewriting in
901       // this case.
902       if (!PredBB->isValid()) {
903         assert(PredBB->isSuccessor(BB) &&
904                "PredBB should be valid if it is not a successor to BB");
905         continue;
906       }
907 
908       // We don't want to reverse direction of the branch in new order
909       // without further profile analysis.
910       const bool DirectionFlag = CondSucc == BB ? IsForwardCTC : !IsForwardCTC;
911       if (!shouldRewriteBranch(PredBB, *CondBranch, BB, DirectionFlag))
912         continue;
913 
914       // Record this block so that we don't try to optimize it twice.
915       BeenOptimized.insert(PredBB);
916 
917       uint64_t Count = 0;
918       if (CondSucc != BB) {
919         // Patch the new target address into the conditional branch.
920         MIB->reverseBranchCondition(*CondBranch, CalleeSymbol, Ctx);
921         // Since we reversed the condition on the branch we need to change
922         // the target for the unconditional branch or add an unconditional
923         // branch to the old target.  This has to be done manually since
924         // fixupBranches is not called after SCTC.
925         NeedsUncondBranch.emplace_back(PredBB, CondSucc);
926         Count = PredBB->getFallthroughBranchInfo().Count;
927       } else {
928         // Change destination of the conditional branch.
929         MIB->replaceBranchTarget(*CondBranch, CalleeSymbol, Ctx);
930         Count = PredBB->getTakenBranchInfo().Count;
931       }
932       const uint64_t CTCTakenFreq =
933           Count == BinaryBasicBlock::COUNT_NO_PROFILE ? 0 : Count;
934 
935       // Annotate it, so "isCall" returns true for this jcc
936       MIB->setConditionalTailCall(*CondBranch);
937       // Add info about the conditional tail call frequency, otherwise this
938       // info will be lost when we delete the associated BranchInfo entry
939       auto &CTCAnnotation =
940           MIB->getOrCreateAnnotationAs<uint64_t>(*CondBranch, "CTCTakenCount");
941       CTCAnnotation = CTCTakenFreq;
942       // Preserve Offset annotation, used in BAT.
943       // Instr is a direct tail call instruction that was created when CTCs are
944       // first expanded, and has the original CTC offset set.
945       if (std::optional<uint32_t> Offset = MIB->getOffset(*Instr))
946         MIB->setOffset(*CondBranch, *Offset);
947 
948       // Remove the unused successor which may be eliminated later
949       // if there are no other users.
950       PredBB->removeSuccessor(BB);
951       // Update BB execution count
952       if (CTCTakenFreq && CTCTakenFreq <= BB->getKnownExecutionCount())
953         BB->setExecutionCount(BB->getExecutionCount() - CTCTakenFreq);
954       else if (CTCTakenFreq > BB->getKnownExecutionCount())
955         BB->setExecutionCount(0);
956 
957       ++NumLocalCTCs;
958       LocalCTCTakenCount += CTCTakenFreq;
959       LocalCTCExecCount += PredBB->getKnownExecutionCount();
960     }
961 
962     // Remove the block from CFG if all predecessors were removed.
963     BB->markValid(isValid(BB));
964   }
965 
966   // Add unconditional branches at the end of BBs to new successors
967   // as long as the successor is not a fallthrough.
968   for (auto &Entry : NeedsUncondBranch) {
969     BinaryBasicBlock *PredBB = Entry.first;
970     const BinaryBasicBlock *CondSucc = Entry.second;
971 
972     const MCSymbol *TBB = nullptr;
973     const MCSymbol *FBB = nullptr;
974     MCInst *CondBranch = nullptr;
975     MCInst *UncondBranch = nullptr;
976     PredBB->analyzeBranch(TBB, FBB, CondBranch, UncondBranch);
977 
978     // Find the next valid block.  Invalid blocks will be deleted
979     // so they shouldn't be considered fallthrough targets.
980     const BinaryBasicBlock *NextBlock =
981         BF.getLayout().getBasicBlockAfter(PredBB, false);
982     while (NextBlock && !isValid(NextBlock))
983       NextBlock = BF.getLayout().getBasicBlockAfter(NextBlock, false);
984 
985     // Get the unconditional successor to this block.
986     const BinaryBasicBlock *PredSucc = PredBB->getSuccessor();
987     assert(PredSucc && "The other branch should be a tail call");
988 
989     const bool HasFallthrough = (NextBlock && PredSucc == NextBlock);
990 
991     if (UncondBranch) {
992       if (HasFallthrough)
993         PredBB->eraseInstruction(PredBB->findInstruction(UncondBranch));
994       else
995         MIB->replaceBranchTarget(*UncondBranch, CondSucc->getLabel(), Ctx);
996     } else if (!HasFallthrough) {
997       MCInst Branch;
998       MIB->createUncondBranch(Branch, CondSucc->getLabel(), Ctx);
999       PredBB->addInstruction(Branch);
1000     }
1001   }
1002 
1003   if (NumLocalCTCs > 0) {
1004     NumDoubleJumps += fixDoubleJumps(BF, true);
1005     // Clean-up unreachable tail-call blocks.
1006     const std::pair<unsigned, uint64_t> Stats = BF.eraseInvalidBBs();
1007     DeletedBlocks += Stats.first;
1008     DeletedBytes += Stats.second;
1009 
1010     assert(BF.validateCFG());
1011   }
1012 
1013   LLVM_DEBUG(dbgs() << "BOLT: created " << NumLocalCTCs
1014                     << " conditional tail calls from a total of "
1015                     << NumLocalCTCCandidates << " candidates in function " << BF
1016                     << ". CTCs execution count for this function is "
1017                     << LocalCTCExecCount << " and CTC taken count is "
1018                     << LocalCTCTakenCount << "\n";);
1019 
1020   NumTailCallsPatched += NumLocalCTCs;
1021   NumCandidateTailCalls += NumLocalCTCCandidates;
1022   CTCExecCount += LocalCTCExecCount;
1023   CTCTakenCount += LocalCTCTakenCount;
1024 
1025   return NumLocalCTCs > 0;
1026 }
1027 
1028 Error SimplifyConditionalTailCalls::runOnFunctions(BinaryContext &BC) {
1029   if (!BC.isX86())
1030     return Error::success();
1031 
1032   for (auto &It : BC.getBinaryFunctions()) {
1033     BinaryFunction &Function = It.second;
1034 
1035     if (!shouldOptimize(Function))
1036       continue;
1037 
1038     if (fixTailCalls(Function)) {
1039       Modified.insert(&Function);
1040       Function.setHasCanonicalCFG(false);
1041     }
1042   }
1043 
1044   if (NumTailCallsPatched)
1045     BC.outs() << "BOLT-INFO: SCTC: patched " << NumTailCallsPatched
1046               << " tail calls (" << NumOrigForwardBranches << " forward)"
1047               << " tail calls (" << NumOrigBackwardBranches << " backward)"
1048               << " from a total of " << NumCandidateTailCalls
1049               << " while removing " << NumDoubleJumps << " double jumps"
1050               << " and removing " << DeletedBlocks << " basic blocks"
1051               << " totalling " << DeletedBytes
1052               << " bytes of code. CTCs total execution count is "
1053               << CTCExecCount << " and the number of times CTCs are taken is "
1054               << CTCTakenCount << "\n";
1055   return Error::success();
1056 }
1057 
1058 uint64_t ShortenInstructions::shortenInstructions(BinaryFunction &Function) {
1059   uint64_t Count = 0;
1060   const BinaryContext &BC = Function.getBinaryContext();
1061   for (BinaryBasicBlock &BB : Function) {
1062     for (MCInst &Inst : BB) {
1063       // Skip shortening instructions with Size annotation.
1064       if (BC.MIB->getSize(Inst))
1065         continue;
1066 
1067       MCInst OriginalInst;
1068       if (opts::Verbosity > 2)
1069         OriginalInst = Inst;
1070 
1071       if (!BC.MIB->shortenInstruction(Inst, *BC.STI))
1072         continue;
1073 
1074       if (opts::Verbosity > 2) {
1075         auto L = BC.scopeLock(); // Keep the lock alive while printing.
1076         BC.outs() << "BOLT-INFO: shortening:\nBOLT-INFO:    ";
1077         BC.printInstruction(BC.outs(), OriginalInst, 0, &Function);
1078         BC.outs() << "BOLT-INFO: to:";
1079         BC.printInstruction(BC.outs(), Inst, 0, &Function);
1080       }
1081 
1082       ++Count;
1083     }
1084   }
1085 
1086   return Count;
1087 }
1088 
1089 Error ShortenInstructions::runOnFunctions(BinaryContext &BC) {
1090   std::atomic<uint64_t> NumShortened{0};
1091   if (!BC.isX86())
1092     return Error::success();
1093 
1094   ParallelUtilities::runOnEachFunction(
1095       BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR,
1096       [&](BinaryFunction &BF) { NumShortened += shortenInstructions(BF); },
1097       nullptr, "ShortenInstructions");
1098 
1099   if (NumShortened)
1100     BC.outs() << "BOLT-INFO: " << NumShortened
1101               << " instructions were shortened\n";
1102   return Error::success();
1103 }
1104 
1105 void Peepholes::addTailcallTraps(BinaryFunction &Function) {
1106   MCPlusBuilder *MIB = Function.getBinaryContext().MIB.get();
1107   for (BinaryBasicBlock &BB : Function) {
1108     MCInst *Inst = BB.getLastNonPseudoInstr();
1109     if (Inst && MIB->isTailCall(*Inst) && MIB->isIndirectBranch(*Inst)) {
1110       MCInst Trap;
1111       MIB->createTrap(Trap);
1112       BB.addInstruction(Trap);
1113       ++TailCallTraps;
1114     }
1115   }
1116 }
1117 
1118 void Peepholes::removeUselessCondBranches(BinaryFunction &Function) {
1119   for (BinaryBasicBlock &BB : Function) {
1120     if (BB.succ_size() != 2)
1121       continue;
1122 
1123     BinaryBasicBlock *CondBB = BB.getConditionalSuccessor(true);
1124     BinaryBasicBlock *UncondBB = BB.getConditionalSuccessor(false);
1125     if (CondBB != UncondBB)
1126       continue;
1127 
1128     const MCSymbol *TBB = nullptr;
1129     const MCSymbol *FBB = nullptr;
1130     MCInst *CondBranch = nullptr;
1131     MCInst *UncondBranch = nullptr;
1132     bool Result = BB.analyzeBranch(TBB, FBB, CondBranch, UncondBranch);
1133 
1134     // analyzeBranch() can fail due to unusual branch instructions,
1135     // e.g. jrcxz, or jump tables (indirect jump).
1136     if (!Result || !CondBranch)
1137       continue;
1138 
1139     BB.removeDuplicateConditionalSuccessor(CondBranch);
1140     ++NumUselessCondBranches;
1141   }
1142 }
1143 
1144 Error Peepholes::runOnFunctions(BinaryContext &BC) {
1145   const char Opts =
1146       std::accumulate(opts::Peepholes.begin(), opts::Peepholes.end(), 0,
1147                       [](const char A, const PeepholeOpts B) { return A | B; });
1148   if (Opts == PEEP_NONE)
1149     return Error::success();
1150 
1151   for (auto &It : BC.getBinaryFunctions()) {
1152     BinaryFunction &Function = It.second;
1153     if (shouldOptimize(Function)) {
1154       if (Opts & PEEP_DOUBLE_JUMPS)
1155         NumDoubleJumps += fixDoubleJumps(Function, false);
1156       if (Opts & PEEP_TAILCALL_TRAPS)
1157         addTailcallTraps(Function);
1158       if (Opts & PEEP_USELESS_BRANCHES)
1159         removeUselessCondBranches(Function);
1160       assert(Function.validateCFG());
1161     }
1162   }
1163   BC.outs() << "BOLT-INFO: Peephole: " << NumDoubleJumps
1164             << " double jumps patched.\n"
1165             << "BOLT-INFO: Peephole: " << TailCallTraps
1166             << " tail call traps inserted.\n"
1167             << "BOLT-INFO: Peephole: " << NumUselessCondBranches
1168             << " useless conditional branches removed.\n";
1169   return Error::success();
1170 }
1171 
1172 bool SimplifyRODataLoads::simplifyRODataLoads(BinaryFunction &BF) {
1173   BinaryContext &BC = BF.getBinaryContext();
1174   MCPlusBuilder *MIB = BC.MIB.get();
1175 
1176   uint64_t NumLocalLoadsSimplified = 0;
1177   uint64_t NumDynamicLocalLoadsSimplified = 0;
1178   uint64_t NumLocalLoadsFound = 0;
1179   uint64_t NumDynamicLocalLoadsFound = 0;
1180 
1181   for (BinaryBasicBlock *BB : BF.getLayout().blocks()) {
1182     for (MCInst &Inst : *BB) {
1183       unsigned Opcode = Inst.getOpcode();
1184       const MCInstrDesc &Desc = BC.MII->get(Opcode);
1185 
1186       // Skip instructions that do not load from memory.
1187       if (!Desc.mayLoad())
1188         continue;
1189 
1190       // Try to statically evaluate the target memory address.
1191       uint64_t TargetAddress;
1192 
1193       if (MIB->hasPCRelOperand(Inst)) {
1194         // Try to find the symbol that corresponds to the PC-relative operand.
1195         MCOperand *DispOpI = MIB->getMemOperandDisp(Inst);
1196         assert(DispOpI != Inst.end() && "expected PC-relative displacement");
1197         assert(DispOpI->isExpr() &&
1198                "found PC-relative with non-symbolic displacement");
1199 
1200         // Get displacement symbol.
1201         const MCSymbol *DisplSymbol;
1202         uint64_t DisplOffset;
1203 
1204         std::tie(DisplSymbol, DisplOffset) =
1205             MIB->getTargetSymbolInfo(DispOpI->getExpr());
1206 
1207         if (!DisplSymbol)
1208           continue;
1209 
1210         // Look up the symbol address in the global symbols map of the binary
1211         // context object.
1212         BinaryData *BD = BC.getBinaryDataByName(DisplSymbol->getName());
1213         if (!BD)
1214           continue;
1215         TargetAddress = BD->getAddress() + DisplOffset;
1216       } else if (!MIB->evaluateMemOperandTarget(Inst, TargetAddress)) {
1217         continue;
1218       }
1219 
1220       // Get the contents of the section containing the target address of the
1221       // memory operand. We are only interested in read-only sections.
1222       ErrorOr<BinarySection &> DataSection =
1223           BC.getSectionForAddress(TargetAddress);
1224       if (!DataSection || DataSection->isWritable())
1225         continue;
1226 
1227       if (BC.getRelocationAt(TargetAddress) ||
1228           BC.getDynamicRelocationAt(TargetAddress))
1229         continue;
1230 
1231       uint32_t Offset = TargetAddress - DataSection->getAddress();
1232       StringRef ConstantData = DataSection->getContents();
1233 
1234       ++NumLocalLoadsFound;
1235       if (BB->hasProfile())
1236         NumDynamicLocalLoadsFound += BB->getExecutionCount();
1237 
1238       if (MIB->replaceMemOperandWithImm(Inst, ConstantData, Offset)) {
1239         ++NumLocalLoadsSimplified;
1240         if (BB->hasProfile())
1241           NumDynamicLocalLoadsSimplified += BB->getExecutionCount();
1242       }
1243     }
1244   }
1245 
1246   NumLoadsFound += NumLocalLoadsFound;
1247   NumDynamicLoadsFound += NumDynamicLocalLoadsFound;
1248   NumLoadsSimplified += NumLocalLoadsSimplified;
1249   NumDynamicLoadsSimplified += NumDynamicLocalLoadsSimplified;
1250 
1251   return NumLocalLoadsSimplified > 0;
1252 }
1253 
1254 Error SimplifyRODataLoads::runOnFunctions(BinaryContext &BC) {
1255   for (auto &It : BC.getBinaryFunctions()) {
1256     BinaryFunction &Function = It.second;
1257     if (shouldOptimize(Function) && simplifyRODataLoads(Function))
1258       Modified.insert(&Function);
1259   }
1260 
1261   BC.outs() << "BOLT-INFO: simplified " << NumLoadsSimplified << " out of "
1262             << NumLoadsFound << " loads from a statically computed address.\n"
1263             << "BOLT-INFO: dynamic loads simplified: "
1264             << NumDynamicLoadsSimplified << "\n"
1265             << "BOLT-INFO: dynamic loads found: " << NumDynamicLoadsFound
1266             << "\n";
1267   return Error::success();
1268 }
1269 
1270 Error AssignSections::runOnFunctions(BinaryContext &BC) {
1271   for (BinaryFunction *Function : BC.getInjectedBinaryFunctions()) {
1272     Function->setCodeSectionName(BC.getInjectedCodeSectionName());
1273     Function->setColdCodeSectionName(BC.getInjectedColdCodeSectionName());
1274   }
1275 
1276   // In non-relocation mode functions have pre-assigned section names.
1277   if (!BC.HasRelocations)
1278     return Error::success();
1279 
1280   const bool UseColdSection =
1281       BC.NumProfiledFuncs > 0 ||
1282       opts::ReorderFunctions == ReorderFunctions::RT_USER;
1283   for (auto &BFI : BC.getBinaryFunctions()) {
1284     BinaryFunction &Function = BFI.second;
1285     if (opts::isHotTextMover(Function)) {
1286       Function.setCodeSectionName(BC.getHotTextMoverSectionName());
1287       Function.setColdCodeSectionName(BC.getHotTextMoverSectionName());
1288       continue;
1289     }
1290 
1291     if (!UseColdSection || Function.hasValidIndex())
1292       Function.setCodeSectionName(BC.getMainCodeSectionName());
1293     else
1294       Function.setCodeSectionName(BC.getColdCodeSectionName());
1295 
1296     if (Function.isSplit())
1297       Function.setColdCodeSectionName(BC.getColdCodeSectionName());
1298   }
1299   return Error::success();
1300 }
1301 
1302 Error PrintProfileStats::runOnFunctions(BinaryContext &BC) {
1303   double FlowImbalanceMean = 0.0;
1304   size_t NumBlocksConsidered = 0;
1305   double WorstBias = 0.0;
1306   const BinaryFunction *WorstBiasFunc = nullptr;
1307 
1308   // For each function CFG, we fill an IncomingMap with the sum of the frequency
1309   // of incoming edges for each BB. Likewise for each OutgoingMap and the sum
1310   // of the frequency of outgoing edges.
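  // Worked example (illustrative numbers): a block with incoming edge counts
  // 70 and 30 and outgoing edge counts 40 and 55 gets IncomingMap[BB] = 100
  // and OutgoingMap[BB] = 95, i.e. a per-block imbalance of |95 - 100| / 100
  // = 5%.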
1311   using FlowMapTy = std::unordered_map<const BinaryBasicBlock *, uint64_t>;
1312   std::unordered_map<const BinaryFunction *, FlowMapTy> TotalIncomingMaps;
1313   std::unordered_map<const BinaryFunction *, FlowMapTy> TotalOutgoingMaps;
1314 
1315   // Compute mean
1316   for (const auto &BFI : BC.getBinaryFunctions()) {
1317     const BinaryFunction &Function = BFI.second;
1318     if (Function.empty() || !Function.isSimple())
1319       continue;
1320     FlowMapTy &IncomingMap = TotalIncomingMaps[&Function];
1321     FlowMapTy &OutgoingMap = TotalOutgoingMaps[&Function];
1322     for (const BinaryBasicBlock &BB : Function) {
1323       uint64_t TotalOutgoing = 0ULL;
1324       auto SuccBIIter = BB.branch_info_begin();
1325       for (BinaryBasicBlock *Succ : BB.successors()) {
1326         uint64_t Count = SuccBIIter->Count;
1327         if (Count == BinaryBasicBlock::COUNT_NO_PROFILE || Count == 0) {
1328           ++SuccBIIter;
1329           continue;
1330         }
1331         TotalOutgoing += Count;
1332         IncomingMap[Succ] += Count;
1333         ++SuccBIIter;
1334       }
1335       OutgoingMap[&BB] = TotalOutgoing;
1336     }
1337 
1338     size_t NumBlocks = 0;
1339     double Mean = 0.0;
1340     for (const BinaryBasicBlock &BB : Function) {
1341       // Do not compute score for low frequency blocks, entry or exit blocks
1342       if (IncomingMap[&BB] < 100 || OutgoingMap[&BB] == 0 || BB.isEntryPoint())
1343         continue;
1344       ++NumBlocks;
1345       const double Difference = (double)OutgoingMap[&BB] - IncomingMap[&BB];
1346       Mean += fabs(Difference / IncomingMap[&BB]);
1347     }
1348 
1349     FlowImbalanceMean += Mean;
1350     NumBlocksConsidered += NumBlocks;
1351     if (!NumBlocks)
1352       continue;
1353     double FuncMean = Mean / NumBlocks;
1354     if (FuncMean > WorstBias) {
1355       WorstBias = FuncMean;
1356       WorstBiasFunc = &Function;
1357     }
1358   }
1359   if (NumBlocksConsidered > 0)
1360     FlowImbalanceMean /= NumBlocksConsidered;
1361 
1362   // Compute standard deviation
1363   NumBlocksConsidered = 0;
1364   double FlowImbalanceVar = 0.0;
1365   for (const auto &BFI : BC.getBinaryFunctions()) {
1366     const BinaryFunction &Function = BFI.second;
1367     if (Function.empty() || !Function.isSimple())
1368       continue;
1369     FlowMapTy &IncomingMap = TotalIncomingMaps[&Function];
1370     FlowMapTy &OutgoingMap = TotalOutgoingMaps[&Function];
1371     for (const BinaryBasicBlock &BB : Function) {
1372       if (IncomingMap[&BB] < 100 || OutgoingMap[&BB] == 0)
1373         continue;
1374       ++NumBlocksConsidered;
1375       const double Difference = (double)OutgoingMap[&BB] - IncomingMap[&BB];
1376       FlowImbalanceVar +=
1377           pow(fabs(Difference / IncomingMap[&BB]) - FlowImbalanceMean, 2);
1378     }
1379   }
1380   if (NumBlocksConsidered) {
1381     FlowImbalanceVar /= NumBlocksConsidered;
1382     FlowImbalanceVar = sqrt(FlowImbalanceVar);
1383   }
1384 
1385   // Report to user
1386   BC.outs() << format("BOLT-INFO: Profile bias score: %.4lf%% StDev: %.4lf%%\n",
1387                       (100.0 * FlowImbalanceMean), (100.0 * FlowImbalanceVar));
1388   if (WorstBiasFunc && opts::Verbosity >= 1) {
1389     BC.outs() << "Worst average bias observed in "
1390               << WorstBiasFunc->getPrintName() << "\n";
1391     LLVM_DEBUG(WorstBiasFunc->dump());
1392   }
1393   return Error::success();
1394 }
1395 
1396 Error PrintProgramStats::runOnFunctions(BinaryContext &BC) {
1397   uint64_t NumRegularFunctions = 0;
1398   uint64_t NumStaleProfileFunctions = 0;
1399   uint64_t NumAllStaleFunctions = 0;
1400   uint64_t NumInferredFunctions = 0;
1401   uint64_t NumNonSimpleProfiledFunctions = 0;
1402   uint64_t NumUnknownControlFlowFunctions = 0;
1403   uint64_t TotalSampleCount = 0;
1404   uint64_t StaleSampleCount = 0;
1405   uint64_t InferredSampleCount = 0;
1406   std::vector<const BinaryFunction *> ProfiledFunctions;
1407   std::vector<std::pair<double, uint64_t>> FuncDensityList;
1408   const char *StaleFuncsHeader = "BOLT-INFO: Functions with stale profile:\n";
1409   for (auto &BFI : BC.getBinaryFunctions()) {
1410     const BinaryFunction &Function = BFI.second;
1411 
1412     // Ignore PLT functions for stats.
1413     if (Function.isPLTFunction())
1414       continue;
1415 
1416     // Adjustment for BAT mode: the profile for BOLT split fragments is combined,
1417     // so only count the hot fragment.
1418     const uint64_t Address = Function.getAddress();
1419     bool IsHotParentOfBOLTSplitFunction = !Function.getFragments().empty() &&
1420                                           BAT && BAT->isBATFunction(Address) &&
1421                                           !BAT->fetchParentAddress(Address);
1422 
1423     ++NumRegularFunctions;
1424 
1425     // In BOLTed binaries, split functions are non-simple (due to
1426     // non-relocation mode), but the original function is known to be simple,
1427     // and we have a valid profile for it.
1428     if (!Function.isSimple() && !IsHotParentOfBOLTSplitFunction) {
1429       if (Function.hasProfile())
1430         ++NumNonSimpleProfiledFunctions;
1431       continue;
1432     }
1433 
1434     if (Function.hasUnknownControlFlow()) {
1435       if (opts::PrintUnknownCFG)
1436         Function.dump();
1437       else if (opts::PrintUnknown)
1438         BC.errs() << "function with unknown control flow: " << Function << '\n';
1439 
1440       ++NumUnknownControlFlowFunctions;
1441     }
1442 
1443     if (!Function.hasProfile())
1444       continue;
1445 
1446     uint64_t SampleCount = Function.getRawBranchCount();
1447     TotalSampleCount += SampleCount;
1448 
1449     if (Function.hasValidProfile()) {
1450       ProfiledFunctions.push_back(&Function);
1451       if (Function.hasInferredProfile()) {
1452         ++NumInferredFunctions;
1453         InferredSampleCount += SampleCount;
1454         ++NumAllStaleFunctions;
1455       }
1456     } else {
1457       if (opts::ReportStaleFuncs) {
1458         BC.outs() << StaleFuncsHeader;
1459         StaleFuncsHeader = "";
1460         BC.outs() << "  " << Function << '\n';
1461       }
1462       ++NumStaleProfileFunctions;
1463       StaleSampleCount += SampleCount;
1464       ++NumAllStaleFunctions;
1465     }
1466 
1467     if (opts::ShowDensity) {
1468       uint64_t Size = Function.getSize();
1469       // In the case of BOLT split functions registered in BAT, executed traces
1470       // are automatically attributed to the main fragment. Add up the function
1471       // sizes of all fragments.
1472       if (IsHotParentOfBOLTSplitFunction)
1473         for (const BinaryFunction *Fragment : Function.getFragments())
1474           Size += Fragment->getSize();
1475       double Density = 1.0 * Function.getSampleCountInBytes() / Size;
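      // Illustrative arithmetic: a function that accumulated 1,000,000
      // executed bytes of samples over a 2,000-byte body (all fragments
      // included in BAT mode) has a density of 500.0.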
1476       FuncDensityList.emplace_back(Density, SampleCount);
1477       LLVM_DEBUG(BC.outs() << Function << ": executed bytes "
1478                            << Function.getSampleCountInBytes() << ", size (b) "
1479                            << Size << ", density " << Density
1480                            << ", sample count " << SampleCount << '\n');
1481     }
1482   }
1483   BC.NumProfiledFuncs = ProfiledFunctions.size();
1484   BC.NumStaleProfileFuncs = NumStaleProfileFunctions;
1485 
1486   const size_t NumAllProfiledFunctions =
1487       ProfiledFunctions.size() + NumStaleProfileFunctions;
1488   BC.outs() << "BOLT-INFO: " << NumAllProfiledFunctions << " out of "
1489             << NumRegularFunctions << " functions in the binary ("
1490             << format("%.1f", NumAllProfiledFunctions /
1491                                   (float)NumRegularFunctions * 100.0f)
1492             << "%) have non-empty execution profile\n";
1493   if (NumNonSimpleProfiledFunctions) {
1494     BC.outs() << "BOLT-INFO: " << NumNonSimpleProfiledFunctions << " function"
1495               << (NumNonSimpleProfiledFunctions == 1 ? "" : "s")
1496               << " with profile could not be optimized\n";
1497   }
1498   if (NumAllStaleFunctions) {
1499     const float PctStale =
1500         NumAllStaleFunctions / (float)NumAllProfiledFunctions * 100.0f;
1501     const float PctStaleFuncsWithEqualBlockCount =
1502         (float)BC.Stats.NumStaleFuncsWithEqualBlockCount /
1503         NumAllStaleFunctions * 100.0f;
1504     const float PctStaleBlocksWithEqualIcount =
1505         (float)BC.Stats.NumStaleBlocksWithEqualIcount /
1506         BC.Stats.NumStaleBlocks * 100.0f;
1507     auto printErrorOrWarning = [&]() {
1508       if (PctStale > opts::StaleThreshold)
1509         BC.errs() << "BOLT-ERROR: ";
1510       else
1511         BC.errs() << "BOLT-WARNING: ";
1512     };
1513     printErrorOrWarning();
1514     BC.errs() << NumAllStaleFunctions
1515               << format(" (%.1f%% of all profiled)", PctStale) << " function"
1516               << (NumAllStaleFunctions == 1 ? "" : "s")
1517               << " have invalid (possibly stale) profile."
1518                  " Use -report-stale to see the list.\n";
1519     if (TotalSampleCount > 0) {
1520       printErrorOrWarning();
1521       BC.errs() << (StaleSampleCount + InferredSampleCount) << " out of "
1522                 << TotalSampleCount << " samples in the binary ("
1523                 << format("%.1f",
1524                           ((100.0f * (StaleSampleCount + InferredSampleCount)) /
1525                            TotalSampleCount))
1526                 << "%) belong to functions with invalid"
1527                    " (possibly stale) profile.\n";
1528     }
1529     BC.outs() << "BOLT-INFO: " << BC.Stats.NumStaleFuncsWithEqualBlockCount
1530               << " stale function"
1531               << (BC.Stats.NumStaleFuncsWithEqualBlockCount == 1 ? "" : "s")
1532               << format(" (%.1f%% of all stale)",
1533                         PctStaleFuncsWithEqualBlockCount)
1534               << " have matching block count.\n";
1535     BC.outs() << "BOLT-INFO: " << BC.Stats.NumStaleBlocksWithEqualIcount
1536               << " stale block"
1537               << (BC.Stats.NumStaleBlocksWithEqualIcount == 1 ? "" : "s")
1538               << format(" (%.1f%% of all stale)", PctStaleBlocksWithEqualIcount)
1539               << " have matching icount.\n";
1540     if (PctStale > opts::StaleThreshold) {
1541       return createFatalBOLTError(
1542           Twine("BOLT-ERROR: stale functions exceed specified threshold of ") +
1543           Twine(opts::StaleThreshold.getValue()) + Twine("%. Exiting.\n"));
1544     }
1545   }
1546   if (NumInferredFunctions) {
1547     BC.outs() << format(
1548         "BOLT-INFO: inferred profile for %zu (%.2f%% of profiled, "
1549         "%.2f%% of stale) functions responsible for %.2f%% samples"
1550         " (%zu out of %zu)\n",
1551         NumInferredFunctions,
1552         100.0 * NumInferredFunctions / NumAllProfiledFunctions,
1553         100.0 * NumInferredFunctions / NumAllStaleFunctions,
1554         100.0 * InferredSampleCount / TotalSampleCount, InferredSampleCount,
1555         TotalSampleCount);
1556     BC.outs() << format(
1557         "BOLT-INFO: inference found an exact match for %.2f%% of basic blocks"
1558         " (%zu out of %zu stale) responsible for %.2f%% samples"
1559         " (%zu out of %zu stale)\n",
1560         100.0 * BC.Stats.NumExactMatchedBlocks / BC.Stats.NumStaleBlocks,
1561         BC.Stats.NumExactMatchedBlocks, BC.Stats.NumStaleBlocks,
1562         100.0 * BC.Stats.ExactMatchedSampleCount / BC.Stats.StaleSampleCount,
1563         BC.Stats.ExactMatchedSampleCount, BC.Stats.StaleSampleCount);
1564     BC.outs() << format(
1565         "BOLT-INFO: inference found an exact pseudo probe match for %.2f%% of "
1566         "basic blocks (%zu out of %zu stale) responsible for %.2f%% samples"
1567         " (%zu out of %zu stale)\n",
1568         100.0 * BC.Stats.NumPseudoProbeExactMatchedBlocks /
1569             BC.Stats.NumStaleBlocks,
1570         BC.Stats.NumPseudoProbeExactMatchedBlocks, BC.Stats.NumStaleBlocks,
1571         100.0 * BC.Stats.PseudoProbeExactMatchedSampleCount /
1572             BC.Stats.StaleSampleCount,
1573         BC.Stats.PseudoProbeExactMatchedSampleCount, BC.Stats.StaleSampleCount);
1574     BC.outs() << format(
1575         "BOLT-INFO: inference found a loose pseudo probe match for %.2f%% of "
1576         "basic blocks (%zu out of %zu stale) responsible for %.2f%% samples"
1577         " (%zu out of %zu stale)\n",
1578         100.0 * BC.Stats.NumPseudoProbeLooseMatchedBlocks /
1579             BC.Stats.NumStaleBlocks,
1580         BC.Stats.NumPseudoProbeLooseMatchedBlocks, BC.Stats.NumStaleBlocks,
1581         100.0 * BC.Stats.PseudoProbeLooseMatchedSampleCount /
1582             BC.Stats.StaleSampleCount,
1583         BC.Stats.PseudoProbeLooseMatchedSampleCount, BC.Stats.StaleSampleCount);
1584     BC.outs() << format(
1585         "BOLT-INFO: inference found a call match for %.2f%% of basic "
1586         "blocks"
1587         " (%zu out of %zu stale) responsible for %.2f%% samples"
1588         " (%zu out of %zu stale)\n",
1589         100.0 * BC.Stats.NumCallMatchedBlocks / BC.Stats.NumStaleBlocks,
1590         BC.Stats.NumCallMatchedBlocks, BC.Stats.NumStaleBlocks,
1591         100.0 * BC.Stats.CallMatchedSampleCount / BC.Stats.StaleSampleCount,
1592         BC.Stats.CallMatchedSampleCount, BC.Stats.StaleSampleCount);
1593     BC.outs() << format(
1594         "BOLT-INFO: inference found a loose match for %.2f%% of basic "
1595         "blocks"
1596         " (%zu out of %zu stale) responsible for %.2f%% samples"
1597         " (%zu out of %zu stale)\n",
1598         100.0 * BC.Stats.NumLooseMatchedBlocks / BC.Stats.NumStaleBlocks,
1599         BC.Stats.NumLooseMatchedBlocks, BC.Stats.NumStaleBlocks,
1600         100.0 * BC.Stats.LooseMatchedSampleCount / BC.Stats.StaleSampleCount,
1601         BC.Stats.LooseMatchedSampleCount, BC.Stats.StaleSampleCount);
1602   }
1603 
1604   if (const uint64_t NumUnusedObjects = BC.getNumUnusedProfiledObjects()) {
1605     BC.outs() << "BOLT-INFO: profile for " << NumUnusedObjects
1606               << " objects was ignored\n";
1607   }
1608 
1609   if (ProfiledFunctions.size() > 10) {
1610     if (opts::Verbosity >= 1) {
1611       BC.outs() << "BOLT-INFO: top called functions are:\n";
1612       llvm::sort(ProfiledFunctions,
1613                  [](const BinaryFunction *A, const BinaryFunction *B) {
1614                    return B->getExecutionCount() < A->getExecutionCount();
1615                  });
1616       auto SFI = ProfiledFunctions.begin();
1617       auto SFIend = ProfiledFunctions.end();
1618       for (unsigned I = 0u; I < opts::TopCalledLimit && SFI != SFIend;
1619            ++SFI, ++I)
1620         BC.outs() << "  " << **SFI << " : " << (*SFI)->getExecutionCount()
1621                   << '\n';
1622     }
1623   }
1624 
1625   if (!opts::PrintSortedBy.empty()) {
1626     std::vector<BinaryFunction *> Functions;
1627     std::map<const BinaryFunction *, DynoStats> Stats;
1628 
1629     for (auto &BFI : BC.getBinaryFunctions()) {
1630       BinaryFunction &BF = BFI.second;
1631       if (shouldOptimize(BF) && BF.hasValidProfile()) {
1632         Functions.push_back(&BF);
1633         Stats.emplace(&BF, getDynoStats(BF));
1634       }
1635     }
1636 
1637     const bool SortAll =
1638         llvm::is_contained(opts::PrintSortedBy, DynoStats::LAST_DYNO_STAT);
1639 
1640     const bool Ascending =
1641         opts::DynoStatsSortOrderOpt == opts::DynoStatsSortOrder::Ascending;
1642 
1643     std::function<bool(const DynoStats &, const DynoStats &)>
1644         DynoStatsComparator =
1645             SortAll ? [](const DynoStats &StatsA,
1646                          const DynoStats &StatsB) { return StatsA < StatsB; }
1647                     : [](const DynoStats &StatsA, const DynoStats &StatsB) {
1648                         return StatsA.lessThan(StatsB, opts::PrintSortedBy);
1649                       };
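    // Illustrative behavior: when the list holds a single category such as
    // "taken branches", lessThan() compares only that counter; when
    // LAST_DYNO_STAT is requested, the full DynoStats operator< ordering is
    // used instead.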
1650 
1651     llvm::stable_sort(Functions,
1652                       [Ascending, &Stats, DynoStatsComparator](
1653                           const BinaryFunction *A, const BinaryFunction *B) {
1654                         auto StatsItr = Stats.find(A);
1655                         assert(StatsItr != Stats.end());
1656                         const DynoStats &StatsA = StatsItr->second;
1657 
1658                         StatsItr = Stats.find(B);
1659                         assert(StatsItr != Stats.end());
1660                         const DynoStats &StatsB = StatsItr->second;
1661 
1662                         return Ascending ? DynoStatsComparator(StatsA, StatsB)
1663                                          : DynoStatsComparator(StatsB, StatsA);
1664                       });
1665 
1666     BC.outs() << "BOLT-INFO: top functions sorted by ";
1667     if (SortAll) {
1668       BC.outs() << "dyno stats";
1669     } else {
1670       BC.outs() << "(";
1671       bool PrintComma = false;
1672       for (const DynoStats::Category Category : opts::PrintSortedBy) {
1673         if (PrintComma)
1674           BC.outs() << ", ";
1675         BC.outs() << DynoStats::Description(Category);
1676         PrintComma = true;
1677       }
1678       BC.outs() << ")";
1679     }
1680 
1681     BC.outs() << " are:\n";
1682     auto SFI = Functions.begin();
1683     for (unsigned I = 0; I < 100 && SFI != Functions.end(); ++SFI, ++I) {
1684       const DynoStats Stats = getDynoStats(**SFI);
1685       BC.outs() << "  " << **SFI;
1686       if (!SortAll) {
1687         BC.outs() << " (";
1688         bool PrintComma = false;
1689         for (const DynoStats::Category Category : opts::PrintSortedBy) {
1690           if (PrintComma)
1691             BC.outs() << ", ";
1692           BC.outs() << dynoStatsOptName(Category) << "=" << Stats[Category];
1693           PrintComma = true;
1694         }
1695         BC.outs() << ")";
1696       }
1697       BC.outs() << "\n";
1698     }
1699   }
1700 
1701   if (!BC.TrappedFunctions.empty()) {
1702     BC.errs() << "BOLT-WARNING: " << BC.TrappedFunctions.size() << " function"
1703               << (BC.TrappedFunctions.size() > 1 ? "s" : "")
1704               << " will trap on entry. Use -trap-avx512=0 to disable"
1705                  " traps.";
1706     if (opts::Verbosity >= 1 || BC.TrappedFunctions.size() <= 5) {
1707       BC.errs() << '\n';
1708       for (const BinaryFunction *Function : BC.TrappedFunctions)
1709         BC.errs() << "  " << *Function << '\n';
1710     } else {
1711       BC.errs() << " Use -v=1 to see the list.\n";
1712     }
1713   }
1714 
1715   // Collect and print information about suboptimal code layout on input.
1716   if (opts::ReportBadLayout) {
1717     std::vector<BinaryFunction *> SuboptimalFuncs;
1718     for (auto &BFI : BC.getBinaryFunctions()) {
1719       BinaryFunction &BF = BFI.second;
1720       if (!BF.hasValidProfile())
1721         continue;
1722 
1723       const uint64_t HotThreshold =
1724           std::max<uint64_t>(BF.getKnownExecutionCount(), 1);
1725       bool HotSeen = false;
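      // Scan the layout in reverse: a block hotter than the entry count
      // followed (towards the entry) by a block that never executes means
      // cold code was laid out ahead of hot code. E.g., an input layout of
      // entry(100) -> never-executed(0) -> loop-body(10000) gets flagged.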
1726       for (const BinaryBasicBlock *BB : BF.getLayout().rblocks()) {
1727         if (!HotSeen && BB->getKnownExecutionCount() > HotThreshold) {
1728           HotSeen = true;
1729           continue;
1730         }
1731         if (HotSeen && BB->getKnownExecutionCount() == 0) {
1732           SuboptimalFuncs.push_back(&BF);
1733           break;
1734         }
1735       }
1736     }
1737 
1738     if (!SuboptimalFuncs.empty()) {
1739       llvm::sort(SuboptimalFuncs,
1740                  [](const BinaryFunction *A, const BinaryFunction *B) {
1741                    return A->getKnownExecutionCount() / A->getSize() >
1742                           B->getKnownExecutionCount() / B->getSize();
1743                  });
1744 
1745       BC.outs() << "BOLT-INFO: " << SuboptimalFuncs.size()
1746                 << " functions have "
1747                    "cold code in the middle of hot code. Top functions are:\n";
1748       for (unsigned I = 0;
1749            I < std::min(static_cast<size_t>(opts::ReportBadLayout),
1750                         SuboptimalFuncs.size());
1751            ++I)
1752         SuboptimalFuncs[I]->print(BC.outs());
1753     }
1754   }
1755 
1756   if (NumUnknownControlFlowFunctions) {
1757     BC.outs() << "BOLT-INFO: " << NumUnknownControlFlowFunctions
1758               << " functions have instructions with unknown control flow";
1759     if (!opts::PrintUnknown)
1760       BC.outs() << ". Use -print-unknown to see the list.";
1761     BC.outs() << '\n';
1762   }
1763 
1764   if (opts::ShowDensity) {
1765     double Density = 0.0;
1766     // Sorted by the density in descending order.
1767     llvm::stable_sort(FuncDensityList,
1768                       [&](const std::pair<double, uint64_t> &A,
1769                           const std::pair<double, uint64_t> &B) {
1770                         if (A.first != B.first)
1771                           return A.first > B.first;
1772                         return A.second < B.second;
1773                       });
1774 
1775     uint64_t AccumulatedSamples = 0;
1776     uint32_t I = 0;
1777     assert(opts::ProfileDensityCutOffHot <= 1000000 &&
1778            "The cutoff value is greater than 1000000 (100%)");
1779     while (AccumulatedSamples <
1780                TotalSampleCount *
1781                    static_cast<float>(opts::ProfileDensityCutOffHot) /
1782                    1000000 &&
1783            I < FuncDensityList.size()) {
1784       AccumulatedSamples += FuncDensityList[I].second;
1785       Density = FuncDensityList[I].first;
1786       I++;
1787     }
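    // Worked example: with --profile-density-cutoff-hot=990000 (99%), the
    // loop above accumulates samples from the densest functions downwards
    // until 99% of TotalSampleCount is covered; Density ends up as the
    // density of the last (least dense) function needed to reach that bar.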
1788     if (Density == 0.0) {
1789       BC.errs() << "BOLT-WARNING: the output profile is empty or the "
1790                    "--profile-density-cutoff-hot option is "
1791                    "set too low. Please check your command.\n";
1792     } else if (Density < opts::ProfileDensityThreshold) {
1793       BC.errs()
1794           << "BOLT-WARNING: BOLT is estimated to optimize better with "
1795           << format("%.1f", opts::ProfileDensityThreshold / Density)
1796           << "x more samples. Please consider increasing sampling rate or "
1797              "profiling for longer duration to get more samples.\n";
1798     }
1799 
1800     BC.outs() << "BOLT-INFO: Functions with density >= "
1801               << format("%.1f", Density) << " account for "
1802               << format("%.2f",
1803                         static_cast<double>(opts::ProfileDensityCutOffHot) /
1804                             10000)
1805               << "% total sample counts.\n";
1806   }
1807   return Error::success();
1808 }
1809 
1810 Error InstructionLowering::runOnFunctions(BinaryContext &BC) {
1811   for (auto &BFI : BC.getBinaryFunctions())
1812     for (BinaryBasicBlock &BB : BFI.second)
1813       for (MCInst &Instruction : BB)
1814         BC.MIB->lowerTailCall(Instruction);
1815   return Error::success();
1816 }
1817 
1818 Error StripRepRet::runOnFunctions(BinaryContext &BC) {
1819   if (!BC.isX86())
1820     return Error::success();
1821 
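  // The 'repz' (0xF3) prefix in front of RET has no architectural effect; it
  // was historically emitted as a branch-prediction workaround for some older
  // AMD processors, so dropping it is safe and saves one byte per return.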
1822   uint64_t NumPrefixesRemoved = 0;
1823   uint64_t NumBytesSaved = 0;
1824   for (auto &BFI : BC.getBinaryFunctions()) {
1825     for (BinaryBasicBlock &BB : BFI.second) {
1826       auto LastInstRIter = BB.getLastNonPseudo();
1827       if (LastInstRIter == BB.rend() || !BC.MIB->isReturn(*LastInstRIter) ||
1828           !BC.MIB->deleteREPPrefix(*LastInstRIter))
1829         continue;
1830 
1831       NumPrefixesRemoved += BB.getKnownExecutionCount();
1832       ++NumBytesSaved;
1833     }
1834   }
1835 
1836   if (NumBytesSaved)
1837     BC.outs() << "BOLT-INFO: removed " << NumBytesSaved
1838               << " 'repz' prefixes"
1839                  " with an estimated execution count of "
1840               << NumPrefixesRemoved << ".\n";
1841   return Error::success();
1842 }
1843 
1844 Error InlineMemcpy::runOnFunctions(BinaryContext &BC) {
1845   if (!BC.isX86())
1846     return Error::success();
1847 
1848   uint64_t NumInlined = 0;
1849   uint64_t NumInlinedDyno = 0;
1850   for (auto &BFI : BC.getBinaryFunctions()) {
1851     for (BinaryBasicBlock &BB : BFI.second) {
1852       for (auto II = BB.begin(); II != BB.end(); ++II) {
1853         MCInst &Inst = *II;
1854 
1855         if (!BC.MIB->isCall(Inst) || MCPlus::getNumPrimeOperands(Inst) != 1 ||
1856             !Inst.getOperand(0).isExpr())
1857           continue;
1858 
1859         const MCSymbol *CalleeSymbol = BC.MIB->getTargetSymbol(Inst);
1860         if (CalleeSymbol->getName() != "memcpy" &&
1861             CalleeSymbol->getName() != "memcpy@PLT" &&
1862             CalleeSymbol->getName() != "_memcpy8")
1863           continue;
1864 
1865         const bool IsMemcpy8 = (CalleeSymbol->getName() == "_memcpy8");
1866         const bool IsTailCall = BC.MIB->isTailCall(Inst);
1867 
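        // Replace the call with the inline expansion. A tail call would have
        // left the function here, so an explicit return is appended right
        // after the inlined sequence below.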
1868         const InstructionListType NewCode =
1869             BC.MIB->createInlineMemcpy(IsMemcpy8);
1870         II = BB.replaceInstruction(II, NewCode);
1871         std::advance(II, NewCode.size() - 1);
1872         if (IsTailCall) {
1873           MCInst Return;
1874           BC.MIB->createReturn(Return);
1875           II = BB.insertInstruction(std::next(II), std::move(Return));
1876         }
1877 
1878         ++NumInlined;
1879         NumInlinedDyno += BB.getKnownExecutionCount();
1880       }
1881     }
1882   }
1883 
1884   if (NumInlined) {
1885     BC.outs() << "BOLT-INFO: inlined " << NumInlined << " memcpy() calls";
1886     if (NumInlinedDyno)
1887       BC.outs() << ". The calls were executed " << NumInlinedDyno
1888                 << " times based on profile.";
1889     BC.outs() << '\n';
1890   }
1891   return Error::success();
1892 }
1893 
1894 bool SpecializeMemcpy1::shouldOptimize(const BinaryFunction &Function) const {
1895   if (!BinaryFunctionPass::shouldOptimize(Function))
1896     return false;
1897 
1898   for (const std::string &FunctionSpec : Spec) {
1899     StringRef FunctionName = StringRef(FunctionSpec).split(':').first;
1900     if (Function.hasNameRegex(FunctionName))
1901       return true;
1902   }
1903 
1904   return false;
1905 }
1906 
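// Extract the call-site IDs from the matching spec entry. As parsed below,
// each entry has the form "<function-regex>:<site1>:<site2>:..."; an empty
// site list means every qualifying call site in the function is specialized.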
1907 std::set<size_t> SpecializeMemcpy1::getCallSitesToOptimize(
1908     const BinaryFunction &Function) const {
1909   StringRef SitesString;
1910   for (const std::string &FunctionSpec : Spec) {
1911     StringRef FunctionName;
1912     std::tie(FunctionName, SitesString) = StringRef(FunctionSpec).split(':');
1913     if (Function.hasNameRegex(FunctionName))
1914       break;
1915     SitesString = "";
1916   }
1917 
1918   std::set<size_t> Sites;
1919   SmallVector<StringRef, 4> SitesVec;
1920   SitesString.split(SitesVec, ':');
1921   for (StringRef SiteString : SitesVec) {
1922     if (SiteString.empty())
1923       continue;
1924     size_t Result;
1925     if (!SiteString.getAsInteger(10, Result))
1926       Sites.emplace(Result);
1927   }
1928 
1929   return Sites;
1930 }
1931 
1932 Error SpecializeMemcpy1::runOnFunctions(BinaryContext &BC) {
1933   if (!BC.isX86())
1934     return Error::success();
1935 
1936   uint64_t NumSpecialized = 0;
1937   uint64_t NumSpecializedDyno = 0;
1938   for (auto &BFI : BC.getBinaryFunctions()) {
1939     BinaryFunction &Function = BFI.second;
1940     if (!shouldOptimize(Function))
1941       continue;
1942 
1943     std::set<size_t> CallsToOptimize = getCallSitesToOptimize(Function);
1944     auto shouldOptimize = [&](size_t N) {
1945       return CallsToOptimize.empty() || CallsToOptimize.count(N);
1946     };
1947 
1948     std::vector<BinaryBasicBlock *> Blocks(Function.pbegin(), Function.pend());
1949     size_t CallSiteID = 0;
1950     for (BinaryBasicBlock *CurBB : Blocks) {
1951       for (auto II = CurBB->begin(); II != CurBB->end(); ++II) {
1952         MCInst &Inst = *II;
1953 
1954         if (!BC.MIB->isCall(Inst) || MCPlus::getNumPrimeOperands(Inst) != 1 ||
1955             !Inst.getOperand(0).isExpr())
1956           continue;
1957 
1958         const MCSymbol *CalleeSymbol = BC.MIB->getTargetSymbol(Inst);
1959         if (CalleeSymbol->getName() != "memcpy" &&
1960             CalleeSymbol->getName() != "memcpy@PLT")
1961           continue;
1962 
1963         if (BC.MIB->isTailCall(Inst))
1964           continue;
1965 
1966         ++CallSiteID;
1967 
1968         if (!shouldOptimize(CallSiteID))
1969           continue;
1970 
1971         // Create a copy of a call to memcpy(dest, src, size).
1972         MCInst MemcpyInstr = Inst;
1973 
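        // The rewrite below reshapes the call site roughly into:
        //
        //   CurBB:            cmp <size-arg>, 1
        //                     je  OneByteMemcpyBB      ; size == 1
        //   MemcpyBB:         call memcpy              ; size != 1 (kept cold)
        //   OneByteMemcpyBB:  inlined one-byte copy
        //   NextBB:           code that followed the original call
        //
        // with both MemcpyBB and OneByteMemcpyBB falling through to NextBB.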
1974         BinaryBasicBlock *OneByteMemcpyBB = CurBB->splitAt(II);
1975 
1976         BinaryBasicBlock *NextBB = nullptr;
1977         if (OneByteMemcpyBB->getNumNonPseudos() > 1) {
1978           NextBB = OneByteMemcpyBB->splitAt(OneByteMemcpyBB->begin());
1979           NextBB->eraseInstruction(NextBB->begin());
1980         } else {
1981           NextBB = OneByteMemcpyBB->getSuccessor();
1982           OneByteMemcpyBB->eraseInstruction(OneByteMemcpyBB->begin());
1983           assert(NextBB && "unexpected call to memcpy() with no return");
1984         }
1985 
1986         BinaryBasicBlock *MemcpyBB = Function.addBasicBlock();
1987         MemcpyBB->setOffset(CurBB->getInputOffset());
1988         InstructionListType CmpJCC =
1989             BC.MIB->createCmpJE(BC.MIB->getIntArgRegister(2), 1,
1990                                 OneByteMemcpyBB->getLabel(), BC.Ctx.get());
1991         CurBB->addInstructions(CmpJCC);
1992         CurBB->addSuccessor(MemcpyBB);
1993 
1994         MemcpyBB->addInstruction(std::move(MemcpyInstr));
1995         MemcpyBB->addSuccessor(NextBB);
1996         MemcpyBB->setCFIState(NextBB->getCFIState());
1997         MemcpyBB->setExecutionCount(0);
1998 
1999         // To prevent the actual call from being moved to cold, we set its
2000         // execution count to 1.
2001         if (CurBB->getKnownExecutionCount() > 0)
2002           MemcpyBB->setExecutionCount(1);
2003 
2004         InstructionListType OneByteMemcpy = BC.MIB->createOneByteMemcpy();
2005         OneByteMemcpyBB->addInstructions(OneByteMemcpy);
2006 
2007         ++NumSpecialized;
2008         NumSpecializedDyno += CurBB->getKnownExecutionCount();
2009 
2010         CurBB = NextBB;
2011 
2012         // Note: we don't expect the next instruction to be a call to memcpy.
2013         II = CurBB->begin();
2014       }
2015     }
2016   }
2017 
2018   if (NumSpecialized) {
2019     BC.outs() << "BOLT-INFO: specialized " << NumSpecialized
2020               << " memcpy() call sites for size 1";
2021     if (NumSpecializedDyno)
2022       BC.outs() << ". The calls were executed " << NumSpecializedDyno
2023                 << " times based on profile.";
2024     BC.outs() << '\n';
2025   }
2026   return Error::success();
2027 }
2028 
2029 void RemoveNops::runOnFunction(BinaryFunction &BF) {
2030   const BinaryContext &BC = BF.getBinaryContext();
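  // Walk each block backwards so that erasing an instruction by index does
  // not shift the indices of the instructions that are yet to be visited.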
2031   for (BinaryBasicBlock &BB : BF) {
2032     for (int64_t I = BB.size() - 1; I >= 0; --I) {
2033       MCInst &Inst = BB.getInstructionAtIndex(I);
2034       if (BC.MIB->isNoop(Inst) && BC.MIB->hasAnnotation(Inst, "NOP"))
2035         BB.eraseInstructionAtIndex(I);
2036     }
2037   }
2038 }
2039 
2040 Error RemoveNops::runOnFunctions(BinaryContext &BC) {
2041   ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
2042     runOnFunction(BF);
2043   };
2044 
2045   ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
2046     return BF.shouldPreserveNops();
2047   };
2048 
2049   ParallelUtilities::runOnEachFunction(
2050       BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, WorkFun,
2051       SkipFunc, "RemoveNops");
2052   return Error::success();
2053 }
2054 
2055 } // namespace bolt
2056 } // namespace llvm
2057